// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * file.c
 *
 * File open, close, extend, truncate
 *
 * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
 */

10
#include <linux/capability.h>
11 12 13 14 15 16
#include <linux/fs.h>
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/uio.h>
17
#include <linux/sched.h>
18
#include <linux/splice.h>
T
Tiger Yang 已提交
19
#include <linux/mount.h>
20
#include <linux/writeback.h>
M
Mark Fasheh 已提交
21
#include <linux/falloc.h>
22
#include <linux/quotaops.h>
23
#include <linux/blkdev.h>
24
#include <linux/backing-dev.h>
25 26 27 28 29 30 31 32 33 34 35 36 37

#include <cluster/masklog.h>

#include "ocfs2.h"

#include "alloc.h"
#include "aops.h"
#include "dir.h"
#include "dlmglue.h"
#include "extent_map.h"
#include "file.h"
#include "sysfile.h"
#include "inode.h"
H
Herbert Poetzl 已提交
38
#include "ioctl.h"
39
#include "journal.h"
40
#include "locks.h"
41 42 43
#include "mmap.h"
#include "suballoc.h"
#include "super.h"
T
Tiger Yang 已提交
44
#include "xattr.h"
T
Tiger Yang 已提交
45
#include "acl.h"
46
#include "quota.h"
T
Tao Ma 已提交
47
#include "refcounttree.h"
48
#include "ocfs2_trace.h"
49 50 51

#include "buffer_head_io.h"

52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80
static int ocfs2_init_file_private(struct inode *inode, struct file *file)
{
	struct ocfs2_file_private *fp;

	fp = kzalloc(sizeof(struct ocfs2_file_private), GFP_KERNEL);
	if (!fp)
		return -ENOMEM;

	fp->fp_file = file;
	mutex_init(&fp->fp_mutex);
	ocfs2_file_lock_res_init(&fp->fp_flock, fp);
	file->private_data = fp;

	return 0;
}

/*
 * Tear down the private state attached to @file, if any: drop and free
 * its flock lock resource, free the structure, and clear
 * file->private_data.
 */
static void ocfs2_free_file_private(struct inode *inode, struct file *file)
{
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	struct ocfs2_file_private *priv = file->private_data;

	/* Nothing was attached to this file. */
	if (priv == NULL)
		return;

	ocfs2_simple_drop_lockres(osb, &priv->fp_flock);
	ocfs2_lock_res_free(&priv->fp_flock);
	file->private_data = NULL;
	kfree(priv);
}

81 82 83 84 85 86
static int ocfs2_file_open(struct inode *inode, struct file *file)
{
	int status;
	int mode = file->f_flags;
	struct ocfs2_inode_info *oi = OCFS2_I(inode);

87
	trace_ocfs2_file_open(inode, file, file->f_path.dentry,
P
piaojun 已提交
88
			      (unsigned long long)oi->ip_blkno,
89 90
			      file->f_path.dentry->d_name.len,
			      file->f_path.dentry->d_name.name, mode);
91

92 93 94 95 96
	if (file->f_mode & FMODE_WRITE) {
		status = dquot_initialize(inode);
		if (status)
			goto leave;
	}
97

98 99 100 101 102
	spin_lock(&oi->ip_lock);

	/* Check that the inode hasn't been wiped from disk by another
	 * node. If it hasn't then we're safe as long as we hold the
	 * spin lock until our increment of open count. */
P
piaojun 已提交
103
	if (oi->ip_flags & OCFS2_INODE_DELETED) {
104 105 106 107 108 109 110 111 112 113 114
		spin_unlock(&oi->ip_lock);

		status = -ENOENT;
		goto leave;
	}

	if (mode & O_DIRECT)
		oi->ip_flags |= OCFS2_INODE_OPEN_DIRECT;

	oi->ip_open_count++;
	spin_unlock(&oi->ip_lock);
115 116 117 118 119 120 121 122 123 124 125 126

	status = ocfs2_init_file_private(inode, file);
	if (status) {
		/*
		 * We want to set open count back if we're failing the
		 * open.
		 */
		spin_lock(&oi->ip_lock);
		oi->ip_open_count--;
		spin_unlock(&oi->ip_lock);
	}

G
Gang He 已提交
127 128
	file->f_mode |= FMODE_NOWAIT;

129 130 131 132 133 134 135 136 137 138 139
leave:
	return status;
}

static int ocfs2_file_release(struct inode *inode, struct file *file)
{
	struct ocfs2_inode_info *oi = OCFS2_I(inode);

	spin_lock(&oi->ip_lock);
	if (!--oi->ip_open_count)
		oi->ip_flags &= ~OCFS2_INODE_OPEN_DIRECT;
140 141 142 143 144 145

	trace_ocfs2_file_release(inode, file, file->f_path.dentry,
				 oi->ip_blkno,
				 file->f_path.dentry->d_name.len,
				 file->f_path.dentry->d_name.name,
				 oi->ip_open_count);
146 147
	spin_unlock(&oi->ip_lock);

148 149
	ocfs2_free_file_private(inode, file);

150 151 152
	return 0;
}

153 154 155 156 157 158 159 160 161 162 163
/*
 * Open handler for directories: only the per-open private data is set
 * up.  Returns whatever ocfs2_init_file_private() returns.
 */
static int ocfs2_dir_open(struct inode *inode, struct file *file)
{
	int status = ocfs2_init_file_private(inode, file);

	return status;
}

/*
 * Release handler for directories: frees the per-open private data.
 * Always returns 0.
 */
static int ocfs2_dir_release(struct inode *inode, struct file *file)
{
	int status = 0;

	ocfs2_free_file_private(inode, file);

	return status;
}

164 165
/*
 * fsync/fdatasync implementation.
 *
 * Writes back and waits on the page cache range [start, end], then
 * waits for the journal transaction recorded for this inode
 * (i_datasync_tid when @datasync is set, i_sync_tid otherwise).  If
 * the journal uses barriers and that transaction will not send a data
 * barrier itself, the block device is flushed explicitly.
 *
 * Returns -EROFS on a read-only filesystem, the error from
 * file_write_and_wait_range() if writeback fails, -EIO if the journal
 * commit or flush reports a negative error, and 0 otherwise.
 */
static int ocfs2_sync_file(struct file *file, loff_t start, loff_t end,
			   int datasync)
{
	int err;
	struct inode *inode = file->f_mapping->host;
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	struct ocfs2_inode_info *oi = OCFS2_I(inode);
	journal_t *journal = osb->journal->j_journal;
	int ret;
	tid_t commit_tid;
	bool needs_barrier = false;

	trace_ocfs2_sync_file(inode, file, file->f_path.dentry,
			      oi->ip_blkno,
			      file->f_path.dentry->d_name.len,
			      file->f_path.dentry->d_name.name,
			      (unsigned long long)datasync);

	if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
		return -EROFS;

	err = file_write_and_wait_range(file, start, end);
	if (err)
		return err;

	commit_tid = datasync ? oi->i_datasync_tid : oi->i_sync_tid;
	/* Decide about the flush before completing the transaction. */
	if (journal->j_flags & JBD2_BARRIER &&
	    !jbd2_trans_will_send_data_barrier(journal, commit_tid))
		needs_barrier = true;
	err = jbd2_complete_transaction(journal, commit_tid);
	if (needs_barrier) {
		ret = blkdev_issue_flush(inode->i_sb->s_bdev);
		/* A journal error takes precedence over a flush error. */
		if (!err)
			err = ret;
	}

	if (err)
		mlog_errno(err);

	return (err < 0) ? -EIO : 0;
}

T
Tiger Yang 已提交
206 207 208
int ocfs2_should_update_atime(struct inode *inode,
			      struct vfsmount *vfsmnt)
{
209
	struct timespec64 now;
T
Tiger Yang 已提交
210 211 212 213 214 215
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);

	if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
		return 0;

	if ((inode->i_flags & S_NOATIME) ||
216
	    ((inode->i_sb->s_flags & SB_NODIRATIME) && S_ISDIR(inode->i_mode)))
T
Tiger Yang 已提交
217 218
		return 0;

219 220 221 222 223 224 225 226 227 228 229
	/*
	 * We can be called with no vfsmnt structure - NFSD will
	 * sometimes do this.
	 *
	 * Note that our action here is different than touch_atime() -
	 * if we can't tell whether this is a noatime mount, then we
	 * don't know whether to trust the value of s_atime_quantum.
	 */
	if (vfsmnt == NULL)
		return 0;

T
Tiger Yang 已提交
230 231 232 233
	if ((vfsmnt->mnt_flags & MNT_NOATIME) ||
	    ((vfsmnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode)))
		return 0;

234
	if (vfsmnt->mnt_flags & MNT_RELATIME) {
235 236
		if ((timespec64_compare(&inode->i_atime, &inode->i_mtime) <= 0) ||
		    (timespec64_compare(&inode->i_atime, &inode->i_ctime) <= 0))
237 238 239 240 241
			return 1;

		return 0;
	}

242
	now = current_time(inode);
T
Tiger Yang 已提交
243 244 245 246 247 248 249 250 251 252 253 254
	if ((now.tv_sec - inode->i_atime.tv_sec <= osb->s_atime_quantum))
		return 0;
	else
		return 1;
}

int ocfs2_update_inode_atime(struct inode *inode,
			     struct buffer_head *bh)
{
	int ret;
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	handle_t *handle;
255
	struct ocfs2_dinode *di = (struct ocfs2_dinode *) bh->b_data;
T
Tiger Yang 已提交
256 257

	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
258 259
	if (IS_ERR(handle)) {
		ret = PTR_ERR(handle);
T
Tiger Yang 已提交
260 261 262 263
		mlog_errno(ret);
		goto out;
	}

264
	ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), bh,
265
				      OCFS2_JOURNAL_ACCESS_WRITE);
266 267 268 269 270 271 272
	if (ret) {
		mlog_errno(ret);
		goto out_commit;
	}

	/*
	 * Don't use ocfs2_mark_inode_dirty() here as we don't always
273
	 * have i_rwsem to guard against concurrent changes to other
274 275
	 * inode fields.
	 */
276
	inode->i_atime = current_time(inode);
277 278
	di->i_atime = cpu_to_le64(inode->i_atime.tv_sec);
	di->i_atime_nsec = cpu_to_le32(inode->i_atime.tv_nsec);
279
	ocfs2_update_inode_fsync_trans(handle, inode, 0);
280
	ocfs2_journal_dirty(handle, bh);
T
Tiger Yang 已提交
281

282
out_commit:
283
	ocfs2_commit_trans(osb, handle);
T
Tiger Yang 已提交
284 285 286 287
out:
	return ret;
}

288
/*
 * Set the in-memory size of @inode to @new_i_size, refresh i_blocks
 * and the ctime/mtime stamps, and mark the inode dirty in @handle
 * using @fe_bh.
 *
 * Returns the result of ocfs2_mark_inode_dirty().
 */
int ocfs2_set_inode_size(handle_t *handle,
			 struct inode *inode,
			 struct buffer_head *fe_bh,
			 u64 new_i_size)
{
	int status;

	i_size_write(inode, new_i_size);
	inode->i_blocks = ocfs2_inode_sector_count(inode);
	inode->i_ctime = inode->i_mtime = current_time(inode);

	status = ocfs2_mark_inode_dirty(handle, inode, fe_bh);
	if (status < 0)
		mlog_errno(status);

	return status;
}

309 310 311
/*
 * Update the size of @inode to @new_i_size in a transaction of its
 * own (OCFS2_INODE_UPDATE_CREDITS credits).
 *
 * Returns a negative errno if the transaction cannot be started,
 * otherwise the result of ocfs2_set_inode_size().  The transaction is
 * committed in either of the latter cases.
 */
int ocfs2_simple_size_update(struct inode *inode,
			     struct buffer_head *di_bh,
			     u64 new_i_size)
{
	int ret;
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	handle_t *handle;

	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
	if (IS_ERR(handle)) {
		ret = PTR_ERR(handle);
		mlog_errno(ret);
		goto out;
	}

	ret = ocfs2_set_inode_size(handle, inode, di_bh,
				   new_i_size);
	if (ret < 0)
		mlog_errno(ret);

	ocfs2_update_inode_fsync_trans(handle, inode, 0);
	ocfs2_commit_trans(osb, handle);
out:
	return ret;
}

335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361
/*
 * CoW the single cluster containing byte @offset if that cluster is
 * refcounted.
 *
 * Returns 0 without doing anything when @offset is cluster aligned or
 * the cluster is not refcounted; otherwise returns the result of the
 * extent lookup or of ocfs2_refcount_cow().
 */
static int ocfs2_cow_file_pos(struct inode *inode,
			      struct buffer_head *fe_bh,
			      u64 offset)
{
	int status;
	u32 phys, cpos = offset >> OCFS2_SB(inode->i_sb)->s_clustersize_bits;
	unsigned int num_clusters = 0;
	unsigned int ext_flags = 0;

	/*
	 * If the new offset is aligned to the range of the cluster, there is
	 * no space for ocfs2_zero_range_for_truncate to fill, so no need to
	 * CoW either.
	 */
	if ((offset & (OCFS2_SB(inode->i_sb)->s_clustersize - 1)) == 0)
		return 0;

	status = ocfs2_get_clusters(inode, cpos, &phys,
				    &num_clusters, &ext_flags);
	if (status) {
		mlog_errno(status);
		goto out;
	}

	if (!(ext_flags & OCFS2_EXT_REFCOUNTED))
		goto out;

	return ocfs2_refcount_cow(inode, fe_bh, cpos, 1, cpos+1);

out:
	return status;
}

368 369 370 371 372 373
/*
 * First stage of a truncate to @new_i_size: CoW the tail cluster if
 * needed, zero the range from @new_i_size to the end of its cluster,
 * and record the new size and ctime/mtime in both the in-memory inode
 * and the on-disk inode in @fe_bh, all inside one transaction.
 *
 * Returns 0 on success or a negative errno.
 */
static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb,
				     struct inode *inode,
				     struct buffer_head *fe_bh,
				     u64 new_i_size)
{
	int status;
	handle_t *handle;
	struct ocfs2_dinode *di;
	u64 cluster_bytes;

	/*
	 * We need to CoW the cluster that contains the offset if it is
	 * reflinked, since we will call ocfs2_zero_range_for_truncate
	 * later, which will write "0" from offset to the end of the
	 * cluster.
	 */
	status = ocfs2_cow_file_pos(inode, fe_bh, new_i_size);
	if (status) {
		mlog_errno(status);
		return status;
	}

	/* TODO: This needs to actually orphan the inode in this
	 * transaction. */

	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
	if (IS_ERR(handle)) {
		status = PTR_ERR(handle);
		mlog_errno(status);
		goto out;
	}

	status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), fe_bh,
					 OCFS2_JOURNAL_ACCESS_WRITE);
	if (status < 0) {
		mlog_errno(status);
		goto out_commit;
	}

	/*
	 * Do this before setting i_size.
	 */
	cluster_bytes = ocfs2_align_bytes_to_clusters(inode->i_sb, new_i_size);
	status = ocfs2_zero_range_for_truncate(inode, handle, new_i_size,
					       cluster_bytes);
	if (status) {
		mlog_errno(status);
		goto out_commit;
	}

	i_size_write(inode, new_i_size);
	inode->i_ctime = inode->i_mtime = current_time(inode);

	di = (struct ocfs2_dinode *) fe_bh->b_data;
	di->i_size = cpu_to_le64(new_i_size);
	di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec);
	di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
	ocfs2_update_inode_fsync_trans(handle, inode, 0);

	ocfs2_journal_dirty(handle, fe_bh);

out_commit:
	ocfs2_commit_trans(osb, handle);
out:
	return status;
}

434
/*
 * Truncate @inode down to @new_i_size.
 *
 * Fails with -EINVAL if @new_i_size is larger than the on-disk size in
 * @di_bh.  Inline-data inodes are truncated in place; otherwise the
 * new size is recorded via ocfs2_orphan_for_truncate(), the page cache
 * beyond the new size is dropped and the allocation is released with
 * ocfs2_commit_truncate().  ip_alloc_sem is held for writing around
 * the allocation change.  On success, if the inode ends up with no
 * clusters, removal of its refcount tree is attempted.
 *
 * Returns 0 on success or a negative errno.
 */
int ocfs2_truncate_file(struct inode *inode,
			struct buffer_head *di_bh,
			u64 new_i_size)
{
	int status = 0;
	struct ocfs2_dinode *fe = NULL;
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);

	/* We trust di_bh because it comes from ocfs2_inode_lock(), which
	 * already validated it */
	fe = (struct ocfs2_dinode *) di_bh->b_data;

	trace_ocfs2_truncate_file((unsigned long long)OCFS2_I(inode)->ip_blkno,
				  (unsigned long long)le64_to_cpu(fe->i_size),
				  (unsigned long long)new_i_size);

	mlog_bug_on_msg(le64_to_cpu(fe->i_size) != i_size_read(inode),
			"Inode %llu, inode i_size = %lld != di "
			"i_size = %llu, i_flags = 0x%x\n",
			(unsigned long long)OCFS2_I(inode)->ip_blkno,
			i_size_read(inode),
			(unsigned long long)le64_to_cpu(fe->i_size),
			le32_to_cpu(fe->i_flags));

	if (new_i_size > le64_to_cpu(fe->i_size)) {
		trace_ocfs2_truncate_file_error(
			(unsigned long long)le64_to_cpu(fe->i_size),
			(unsigned long long)new_i_size);
		status = -EINVAL;
		mlog_errno(status);
		goto bail;
	}

	down_write(&OCFS2_I(inode)->ip_alloc_sem);

	ocfs2_resv_discard(&osb->osb_la_resmap,
			   &OCFS2_I(inode)->ip_la_data_resv);

	/*
	 * The inode lock forced other nodes to sync and drop their
	 * pages, which (correctly) happens even if we have a truncate
	 * without allocation change - ocfs2 cluster sizes can be much
	 * greater than page size, so we have to truncate them
	 * anyway.
	 */

	if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
		unmap_mapping_range(inode->i_mapping,
				    new_i_size + PAGE_SIZE - 1, 0, 1);
		truncate_inode_pages(inode->i_mapping, new_i_size);
		status = ocfs2_truncate_inline(inode, di_bh, new_i_size,
					       i_size_read(inode), 1);
		if (status)
			mlog_errno(status);

		goto bail_unlock_sem;
	}

	/* alright, we're going to need to do a full blown alloc size
	 * change. Orphan the inode so that recovery can complete the
	 * truncate if necessary. This does the task of marking
	 * i_size. */
	status = ocfs2_orphan_for_truncate(osb, inode, di_bh, new_i_size);
	if (status < 0) {
		mlog_errno(status);
		goto bail_unlock_sem;
	}

	unmap_mapping_range(inode->i_mapping, new_i_size + PAGE_SIZE - 1, 0, 1);
	truncate_inode_pages(inode->i_mapping, new_i_size);

	status = ocfs2_commit_truncate(osb, inode, di_bh);
	if (status < 0) {
		mlog_errno(status);
		goto bail_unlock_sem;
	}

	/* TODO: orphan dir cleanup here. */
bail_unlock_sem:
	up_write(&OCFS2_I(inode)->ip_alloc_sem);

bail:
	if (!status && OCFS2_I(inode)->ip_clusters == 0)
		status = ocfs2_try_remove_refcount_tree(inode, di_bh);

	return status;
}

/*
523
 * extend file allocation only here.
524 525 526 527 528 529 530 531
 * we'll update all the disk stuff, and oip->alloc_size
 *
 * expect stuff to be locked, a transaction started and enough data /
 * metadata reservations in the contexts.
 *
 * Will return -EAGAIN, and a reason if a restart is needed.
 * If passed in, *reason will always be set, even in error.
 */
532 533 534 535 536 537 538 539 540 541
int ocfs2_add_inode_data(struct ocfs2_super *osb,
			 struct inode *inode,
			 u32 *logical_offset,
			 u32 clusters_to_add,
			 int mark_unwritten,
			 struct buffer_head *fe_bh,
			 handle_t *handle,
			 struct ocfs2_alloc_context *data_ac,
			 struct ocfs2_alloc_context *meta_ac,
			 enum ocfs2_alloc_restarted *reason_ret)
542
{
543
	struct ocfs2_extent_tree et;
544

545
	ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), fe_bh);
J
Joseph Qi 已提交
546 547 548
	return ocfs2_add_clusters_in_btree(handle, &et, logical_offset,
					   clusters_to_add, mark_unwritten,
					   data_ac, meta_ac, reason_ret);
549 550
}

551 552
/*
 * Add @clusters_to_add clusters to @inode starting at logical cluster
 * @logical_start, optionally marking them unwritten.
 *
 * Each pass locks the allocators, starts a transaction, reserves
 * quota for the clusters still wanted, and calls
 * ocfs2_add_inode_data().  Quota for clusters that were not allocated
 * in that pass is released again.  If the allocator asks for a
 * restart, either the whole pass is redone (RESTART_META) or the
 * running transaction is extended and the allocation retried
 * (RESTART_TRANS).
 *
 * Returns 0 on success or a negative errno.
 */
static int ocfs2_extend_allocation(struct inode *inode, u32 logical_start,
				   u32 clusters_to_add, int mark_unwritten)
{
	int status = 0;
	int restart_func = 0;
	int credits;
	u32 prev_clusters;
	struct buffer_head *bh = NULL;
	struct ocfs2_dinode *fe = NULL;
	handle_t *handle = NULL;
	struct ocfs2_alloc_context *data_ac = NULL;
	struct ocfs2_alloc_context *meta_ac = NULL;
	enum ocfs2_alloc_restarted why = RESTART_NONE;
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	struct ocfs2_extent_tree et;
	int did_quota = 0;

	/*
	 * Unwritten extent only exists for file systems which
	 * support holes.
	 */
	BUG_ON(mark_unwritten && !ocfs2_sparse_alloc(osb));

	status = ocfs2_read_inode_block(inode, &bh);
	if (status < 0) {
		mlog_errno(status);
		goto leave;
	}
	fe = (struct ocfs2_dinode *) bh->b_data;

restart_all:
	BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters);

	ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), bh);
	status = ocfs2_lock_allocators(inode, &et, clusters_to_add, 0,
				       &data_ac, &meta_ac);
	if (status) {
		mlog_errno(status);
		goto leave;
	}

	credits = ocfs2_calc_extend_credits(osb->sb, &fe->id2.i_list);
	handle = ocfs2_start_trans(osb, credits);
	if (IS_ERR(handle)) {
		status = PTR_ERR(handle);
		/* Keep the cleanup at leave: from committing an error pointer. */
		handle = NULL;
		mlog_errno(status);
		goto leave;
	}

restarted_transaction:
	trace_ocfs2_extend_allocation(
		(unsigned long long)OCFS2_I(inode)->ip_blkno,
		(unsigned long long)i_size_read(inode),
		le32_to_cpu(fe->i_clusters), clusters_to_add,
		why, restart_func);

	status = dquot_alloc_space_nodirty(inode,
			ocfs2_clusters_to_bytes(osb->sb, clusters_to_add));
	if (status)
		goto leave;
	did_quota = 1;

	/* reserve a write to the file entry early on - that way if we
	 * run out of credits in the allocation path, we can still
	 * update i_size. */
	status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), bh,
					 OCFS2_JOURNAL_ACCESS_WRITE);
	if (status < 0) {
		mlog_errno(status);
		goto leave;
	}

	prev_clusters = OCFS2_I(inode)->ip_clusters;

	status = ocfs2_add_inode_data(osb,
				      inode,
				      &logical_start,
				      clusters_to_add,
				      mark_unwritten,
				      bh,
				      handle,
				      data_ac,
				      meta_ac,
				      &why);
	if ((status < 0) && (status != -EAGAIN)) {
		if (status != -ENOSPC)
			mlog_errno(status);
		goto leave;
	}
	ocfs2_update_inode_fsync_trans(handle, inode, 1);
	ocfs2_journal_dirty(handle, bh);

	/* Subtract what this pass actually allocated. */
	spin_lock(&OCFS2_I(inode)->ip_lock);
	clusters_to_add -= (OCFS2_I(inode)->ip_clusters - prev_clusters);
	spin_unlock(&OCFS2_I(inode)->ip_lock);
	/* Release unused quota reservation */
	dquot_free_space(inode,
			ocfs2_clusters_to_bytes(osb->sb, clusters_to_add));
	did_quota = 0;

	if (why != RESTART_NONE && clusters_to_add) {
		if (why == RESTART_META) {
			restart_func = 1;
			status = 0;
		} else {
			BUG_ON(why != RESTART_TRANS);

			status = ocfs2_allocate_extend_trans(handle, 1);
			if (status < 0) {
				/* handle still has to be committed at
				 * this point. */
				status = -ENOMEM;
				mlog_errno(status);
				goto leave;
			}
			goto restarted_transaction;
		}
	}

	trace_ocfs2_extend_allocation_end(OCFS2_I(inode)->ip_blkno,
	     le32_to_cpu(fe->i_clusters),
	     (unsigned long long)le64_to_cpu(fe->i_size),
	     OCFS2_I(inode)->ip_clusters,
	     (unsigned long long)i_size_read(inode));

leave:
	if (status < 0 && did_quota)
		dquot_free_space(inode,
			ocfs2_clusters_to_bytes(osb->sb, clusters_to_add));
	if (handle) {
		ocfs2_commit_trans(osb, handle);
		handle = NULL;
	}
	if (data_ac) {
		ocfs2_free_alloc_context(data_ac);
		data_ac = NULL;
	}
	if (meta_ac) {
		ocfs2_free_alloc_context(meta_ac);
		meta_ac = NULL;
	}
	/* The per-pass state above is reset so restart_all starts clean. */
	if ((!status) && restart_func) {
		restart_func = 0;
		goto restart_all;
	}
	brelse(bh);
	bh = NULL;

	return status;
}

703 704 705 706
/*
 * While a write will already be ordering the data, a truncate will not.
 * Thus, we need to explicitly order the zeroed pages.
 */
707
static handle_t *ocfs2_zero_start_ordered_transaction(struct inode *inode,
708 709 710
						      struct buffer_head *di_bh,
						      loff_t start_byte,
						      loff_t length)
711 712 713 714 715 716 717 718 719 720 721 722 723 724 725
{
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	handle_t *handle = NULL;
	int ret = 0;

	if (!ocfs2_should_order_data(inode))
		goto out;

	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
	if (IS_ERR(handle)) {
		ret = -ENOMEM;
		mlog_errno(ret);
		goto out;
	}

726
	ret = ocfs2_jbd2_inode_add_write(handle, inode, start_byte, length);
727 728 729 730 731 732 733 734
	if (ret < 0) {
		mlog_errno(ret);
		goto out;
	}

	ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
				      OCFS2_JOURNAL_ACCESS_WRITE);
	if (ret)
735
		mlog_errno(ret);
736
	ocfs2_update_inode_fsync_trans(handle, inode, 1);
737 738 739 740 741 742 743 744 745 746

out:
	if (ret) {
		if (!IS_ERR(handle))
			ocfs2_commit_trans(osb, handle);
		handle = ERR_PTR(ret);
	}
	return handle;
}

747 748
/* Some parts of this taken from generic_cont_expand, which turned out
 * to be too fragile to do exactly what we need without us having to
749
 * worry about recursive locking in ->write_begin() and ->write_end(). */
750
static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from,
751
				 u64 abs_to, struct buffer_head *di_bh)
752 753 754
{
	struct address_space *mapping = inode->i_mapping;
	struct page *page;
755
	unsigned long index = abs_from >> PAGE_SHIFT;
756
	handle_t *handle;
757
	int ret = 0;
758
	unsigned zero_from, zero_to, block_start, block_end;
759
	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
760

761
	BUG_ON(abs_from >= abs_to);
762
	BUG_ON(abs_to > (((u64)index + 1) << PAGE_SHIFT));
763
	BUG_ON(abs_from & (inode->i_blkbits - 1));
764

765 766 767
	handle = ocfs2_zero_start_ordered_transaction(inode, di_bh,
						      abs_from,
						      abs_to - abs_from);
768 769 770 771 772
	if (IS_ERR(handle)) {
		ret = PTR_ERR(handle);
		goto out;
	}

773
	page = find_or_create_page(mapping, index, GFP_NOFS);
774 775 776
	if (!page) {
		ret = -ENOMEM;
		mlog_errno(ret);
777
		goto out_commit_trans;
778 779
	}

780
	/* Get the offsets within the page that we want to zero */
781 782
	zero_from = abs_from & (PAGE_SIZE - 1);
	zero_to = abs_to & (PAGE_SIZE - 1);
783
	if (!zero_to)
784
		zero_to = PAGE_SIZE;
785

786 787 788 789 790
	trace_ocfs2_write_zero_page(
			(unsigned long long)OCFS2_I(inode)->ip_blkno,
			(unsigned long long)abs_from,
			(unsigned long long)abs_to,
			index, zero_from, zero_to);
791

792 793 794
	/* We know that zero_from is block aligned */
	for (block_start = zero_from; block_start < zero_to;
	     block_start = block_end) {
F
Fabian Frederick 已提交
795
		block_end = block_start + i_blocksize(inode);
796 797

		/*
C
Christoph Hellwig 已提交
798 799
		 * block_start is block-aligned.  Bump it by one to force
		 * __block_write_begin and block_commit_write to zero the
800 801
		 * whole block.
		 */
C
Christoph Hellwig 已提交
802 803
		ret = __block_write_begin(page, block_start + 1, 0,
					  ocfs2_get_block);
804 805
		if (ret < 0) {
			mlog_errno(ret);
806 807 808
			goto out_unlock;
		}

809 810 811 812 813 814 815 816 817

		/* must not update i_size! */
		ret = block_commit_write(page, block_start + 1,
					 block_start + 1);
		if (ret < 0)
			mlog_errno(ret);
		else
			ret = 0;
	}
818

819 820 821 822 823 824 825 826
	/*
	 * fs-writeback will release the dirty pages without page lock
	 * whose offset are over inode size, the release happens at
	 * block_write_full_page().
	 */
	i_size_write(inode, abs_to);
	inode->i_blocks = ocfs2_inode_sector_count(inode);
	di->i_size = cpu_to_le64((u64)i_size_read(inode));
827
	inode->i_mtime = inode->i_ctime = current_time(inode);
828 829 830
	di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec);
	di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
	di->i_mtime_nsec = di->i_ctime_nsec;
831 832
	if (handle) {
		ocfs2_journal_dirty(handle, di_bh);
833
		ocfs2_update_inode_fsync_trans(handle, inode, 1);
834
	}
835

836 837
out_unlock:
	unlock_page(page);
838
	put_page(page);
839 840 841
out_commit_trans:
	if (handle)
		ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
842 843 844 845
out:
	return ret;
}

846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861
/*
 * Find the next range to zero.  We do this in terms of bytes because
 * that's what ocfs2_zero_extend() wants, and it is dealing with the
 * pagecache.  We may return multiple extents.
 *
 * zero_start and zero_end are ocfs2_zero_extend()s current idea of what
 * needs to be zeroed.  range_start and range_end return the next zeroing
 * range.  A subsequent call should pass the previous range_end as its
 * zero_start.  If range_end is 0, there's nothing to do.
 *
 * Unwritten extents are skipped over.  Refcounted extents are CoWd.
 *
 * Returns 0 on success or a negative errno from the extent lookup or
 * the CoW.  *range_start is only written when a range is found.
 */
static int ocfs2_zero_extend_get_range(struct inode *inode,
				       struct buffer_head *di_bh,
				       u64 zero_start, u64 zero_end,
				       u64 *range_start, u64 *range_end)
{
	int rc = 0, needs_cow = 0;
	u32 p_cpos, zero_clusters = 0;
	u32 zero_cpos =
		zero_start >> OCFS2_SB(inode->i_sb)->s_clustersize_bits;
	u32 last_cpos = ocfs2_clusters_for_bytes(inode->i_sb, zero_end);
	unsigned int num_clusters = 0;
	unsigned int ext_flags = 0;

	/* Skip holes and unwritten extents to find where zeroing starts. */
	while (zero_cpos < last_cpos) {
		rc = ocfs2_get_clusters(inode, zero_cpos, &p_cpos,
					&num_clusters, &ext_flags);
		if (rc) {
			mlog_errno(rc);
			goto out;
		}

		if (p_cpos && !(ext_flags & OCFS2_EXT_UNWRITTEN)) {
			zero_clusters = num_clusters;
			if (ext_flags & OCFS2_EXT_REFCOUNTED)
				needs_cow = 1;
			break;
		}

		zero_cpos += num_clusters;
	}
	if (!zero_clusters) {
		*range_end = 0;
		goto out;
	}

	/* Grow the range over following allocated, written extents. */
	while ((zero_cpos + zero_clusters) < last_cpos) {
		rc = ocfs2_get_clusters(inode, zero_cpos + zero_clusters,
					&p_cpos, &num_clusters,
					&ext_flags);
		if (rc) {
			mlog_errno(rc);
			goto out;
		}

		if (!p_cpos || (ext_flags & OCFS2_EXT_UNWRITTEN))
			break;
		if (ext_flags & OCFS2_EXT_REFCOUNTED)
			needs_cow = 1;
		zero_clusters += num_clusters;
	}
	/* Never report clusters beyond the requested end. */
	if ((zero_cpos + zero_clusters) > last_cpos)
		zero_clusters = last_cpos - zero_cpos;

	if (needs_cow) {
		rc = ocfs2_refcount_cow(inode, di_bh, zero_cpos,
					zero_clusters, UINT_MAX);
		if (rc) {
			mlog_errno(rc);
			goto out;
		}
	}

	*range_start = ocfs2_clusters_to_bytes(inode->i_sb, zero_cpos);
	*range_end = ocfs2_clusters_to_bytes(inode->i_sb,
					     zero_cpos + zero_clusters);

out:
	return rc;
}

/*
 * Zero one range returned from ocfs2_zero_extend_get_range().  The caller
 * has made sure that the entire range needs zeroing.
 *
 * The range [range_start, range_end) is walked one page at a time via
 * ocfs2_write_zero_page().  Returns 0 on success, or the first
 * negative errno, at which point the walk stops.
 */
static int ocfs2_zero_extend_range(struct inode *inode, u64 range_start,
				   u64 range_end, struct buffer_head *di_bh)
{
	int rc = 0;
	u64 next_pos;
	u64 zero_pos = range_start;

	trace_ocfs2_zero_extend_range(
			(unsigned long long)OCFS2_I(inode)->ip_blkno,
			(unsigned long long)range_start,
			(unsigned long long)range_end);
	BUG_ON(range_start >= range_end);

	while (zero_pos < range_end) {
		/* Stop at the next page boundary or the end of the range. */
		next_pos = (zero_pos & PAGE_MASK) + PAGE_SIZE;
		if (next_pos > range_end)
			next_pos = range_end;
		rc = ocfs2_write_zero_page(inode, zero_pos, next_pos, di_bh);
		if (rc < 0) {
			mlog_errno(rc);
			break;
		}
		zero_pos = next_pos;

		/*
		 * Very large extends have the potential to lock up
		 * the cpu for extended periods of time.
		 */
		cond_resched();
	}

	return rc;
}

/*
 * Zero the allocated, written parts of @inode between its current size
 * (rounded with ocfs2_align_bytes_to_blocks()) and @zero_to_size.
 *
 * Ranges are found with ocfs2_zero_extend_get_range(), trimmed to the
 * requested window and zeroed with ocfs2_zero_extend_range().
 *
 * Returns 0 on success or the first negative errno encountered.
 */
int ocfs2_zero_extend(struct inode *inode, struct buffer_head *di_bh,
		      loff_t zero_to_size)
{
	int ret = 0;
	u64 zero_start, range_start = 0, range_end = 0;
	struct super_block *sb = inode->i_sb;

	zero_start = ocfs2_align_bytes_to_blocks(sb, i_size_read(inode));
	trace_ocfs2_zero_extend((unsigned long long)OCFS2_I(inode)->ip_blkno,
				(unsigned long long)zero_start,
				(unsigned long long)i_size_read(inode));
	while (zero_start < zero_to_size) {
		ret = ocfs2_zero_extend_get_range(inode, di_bh, zero_start,
						  zero_to_size,
						  &range_start,
						  &range_end);
		if (ret) {
			mlog_errno(ret);
			break;
		}
		/* A zero range_end means nothing is left to zero. */
		if (!range_end)
			break;
		/* Trim the ends */
		if (range_start < zero_start)
			range_start = zero_start;
		if (range_end > zero_to_size)
			range_end = zero_to_size;

		ret = ocfs2_zero_extend_range(inode, range_start,
					      range_end, di_bh);
		if (ret) {
			mlog_errno(ret);
			break;
		}
		zero_start = range_end;
	}

	return ret;
}

1006 1007
/*
 * Extend @inode without leaving holes: allocate whatever clusters are
 * missing to cover @new_i_size, then zero up to @zero_to.
 *
 * Returns 0 on success or a negative errno from the allocation or the
 * zeroing.
 */
int ocfs2_extend_no_holes(struct inode *inode, struct buffer_head *di_bh,
			  u64 new_i_size, u64 zero_to)
{
	int ret;
	u32 clusters_to_add;
	struct ocfs2_inode_info *oi = OCFS2_I(inode);

	/*
	 * Only quota files call this without a bh, and they can't be
	 * refcounted.
	 */
	BUG_ON(!di_bh && ocfs2_is_refcount_inode(inode));
	BUG_ON(!di_bh && !(oi->ip_flags & OCFS2_INODE_SYSTEM_FILE));

	/* Clusters needed beyond what the inode already has, if any. */
	clusters_to_add = ocfs2_clusters_for_bytes(inode->i_sb, new_i_size);
	if (clusters_to_add < oi->ip_clusters)
		clusters_to_add = 0;
	else
		clusters_to_add -= oi->ip_clusters;

	if (clusters_to_add) {
		ret = ocfs2_extend_allocation(inode, oi->ip_clusters,
					      clusters_to_add, 0);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}
	}

	/*
	 * Call this even if we don't add any clusters to the tree. We
	 * still need to zero the area between the old i_size and the
	 * new i_size.
	 */
	ret = ocfs2_zero_extend(inode, di_bh, zero_to);
	if (ret < 0)
		mlog_errno(ret);

out:
	return ret;
}

1048 1049
static int ocfs2_extend_file(struct inode *inode,
			     struct buffer_head *di_bh,
1050
			     u64 new_i_size)
1051
{
M
Mark Fasheh 已提交
1052
	int ret = 0;
M
Mark Fasheh 已提交
1053
	struct ocfs2_inode_info *oi = OCFS2_I(inode);
1054

1055
	BUG_ON(!di_bh);
1056

1057 1058 1059 1060 1061
	/* setattr sometimes calls us like this. */
	if (new_i_size == 0)
		goto out;

	if (i_size_read(inode) == new_i_size)
1062
		goto out;
1063 1064
	BUG_ON(new_i_size < i_size_read(inode));

1065
	/*
1066 1067
	 * The alloc sem blocks people in read/write from reading our
	 * allocation until we're done changing it. We depend on
1068
	 * i_rwsem to block other extend/truncate calls while we're
1069 1070
	 * here.  We even have to hold it for sparse files because there
	 * might be some tail zeroing.
1071
	 */
M
Mark Fasheh 已提交
1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087
	down_write(&oi->ip_alloc_sem);

	if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
		/*
		 * We can optimize small extends by keeping the inodes
		 * inline data.
		 */
		if (ocfs2_size_fits_inline_data(di_bh, new_i_size)) {
			up_write(&oi->ip_alloc_sem);
			goto out_update_size;
		}

		ret = ocfs2_convert_inline_data_to_extents(inode, di_bh);
		if (ret) {
			up_write(&oi->ip_alloc_sem);
			mlog_errno(ret);
M
Mark Fasheh 已提交
1088
			goto out;
M
Mark Fasheh 已提交
1089 1090 1091
		}
	}

1092 1093 1094 1095 1096
	if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
		ret = ocfs2_zero_extend(inode, di_bh, new_i_size);
	else
		ret = ocfs2_extend_no_holes(inode, di_bh, new_i_size,
					    new_i_size);
M
Mark Fasheh 已提交
1097 1098

	up_write(&oi->ip_alloc_sem);
1099

1100 1101
	if (ret < 0) {
		mlog_errno(ret);
M
Mark Fasheh 已提交
1102
		goto out;
1103 1104
	}

1105
out_update_size:
1106 1107 1108
	ret = ocfs2_simple_size_update(inode, di_bh, new_i_size);
	if (ret < 0)
		mlog_errno(ret);
1109 1110 1111 1112 1113

out:
	return ret;
}

1114 1115
int ocfs2_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
		  struct iattr *attr)
1116 1117
{
	int status = 0, size_change;
1118
	int inode_locked = 0;
1119
	struct inode *inode = d_inode(dentry);
1120 1121 1122
	struct super_block *sb = inode->i_sb;
	struct ocfs2_super *osb = OCFS2_SB(sb);
	struct buffer_head *bh = NULL;
1123
	handle_t *handle = NULL;
1124
	struct dquot *transfer_to[MAXQUOTAS] = { };
1125
	int qtype;
1126 1127
	int had_lock;
	struct ocfs2_lock_holder oh;
1128

1129 1130 1131 1132
	trace_ocfs2_setattr(inode, dentry,
			    (unsigned long long)OCFS2_I(inode)->ip_blkno,
			    dentry->d_name.len, dentry->d_name.name,
			    attr->ia_valid, attr->ia_mode,
1133 1134
			    from_kuid(&init_user_ns, attr->ia_uid),
			    from_kgid(&init_user_ns, attr->ia_gid));
1135

1136 1137 1138 1139
	/* ensuring we don't even attempt to truncate a symlink */
	if (S_ISLNK(inode->i_mode))
		attr->ia_valid &= ~ATTR_SIZE;

1140 1141
#define OCFS2_VALID_ATTRS (ATTR_ATIME | ATTR_MTIME | ATTR_CTIME | ATTR_SIZE \
			   | ATTR_GID | ATTR_UID | ATTR_MODE)
1142
	if (!(attr->ia_valid & OCFS2_VALID_ATTRS))
1143 1144
		return 0;

C
Christian Brauner 已提交
1145
	status = setattr_prepare(&init_user_ns, dentry, attr);
1146 1147 1148
	if (status)
		return status;

1149
	if (is_quota_modification(mnt_userns, inode, attr)) {
1150 1151 1152 1153
		status = dquot_initialize(inode);
		if (status)
			return status;
	}
1154 1155
	size_change = S_ISREG(inode->i_mode) && attr->ia_valid & ATTR_SIZE;
	if (size_change) {
1156 1157 1158 1159 1160 1161 1162
		/*
		 * Here we should wait dio to finish before inode lock
		 * to avoid a deadlock between ocfs2_setattr() and
		 * ocfs2_dio_end_io_write()
		 */
		inode_dio_wait(inode);

1163 1164 1165 1166 1167 1168 1169
		status = ocfs2_rw_lock(inode, 1);
		if (status < 0) {
			mlog_errno(status);
			goto bail;
		}
	}

1170 1171 1172
	had_lock = ocfs2_inode_lock_tracker(inode, &bh, 1, &oh);
	if (had_lock < 0) {
		status = had_lock;
1173
		goto bail_unlock_rw;
1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193
	} else if (had_lock) {
		/*
		 * As far as we know, ocfs2_setattr() could only be the first
		 * VFS entry point in the call chain of recursive cluster
		 * locking issue.
		 *
		 * For instance:
		 * chmod_common()
		 *  notify_change()
		 *   ocfs2_setattr()
		 *    posix_acl_chmod()
		 *     ocfs2_iop_get_acl()
		 *
		 * But, we're not 100% sure if it's always true, because the
		 * ordering of the VFS entry points in the call chain is out
		 * of our control. So, we'd better dump the stack here to
		 * catch the other cases of recursive locking.
		 */
		mlog(ML_ERROR, "Another case of recursive locking:\n");
		dump_stack();
1194
	}
1195
	inode_locked = 1;
1196

1197
	if (size_change) {
1198 1199
		status = inode_newsize_ok(inode, attr->ia_size);
		if (status)
1200 1201
			goto bail_unlock;

1202
		if (i_size_read(inode) >= attr->ia_size) {
J
Joel Becker 已提交
1203 1204 1205 1206 1207 1208
			if (ocfs2_should_order_data(inode)) {
				status = ocfs2_begin_ordered_truncate(inode,
								      attr->ia_size);
				if (status)
					goto bail_unlock;
			}
1209
			status = ocfs2_truncate_file(inode, bh, attr->ia_size);
J
Joel Becker 已提交
1210
		} else
1211
			status = ocfs2_extend_file(inode, bh, attr->ia_size);
1212 1213 1214 1215 1216 1217 1218 1219
		if (status < 0) {
			if (status != -ENOSPC)
				mlog_errno(status);
			status = -ENOSPC;
			goto bail_unlock;
		}
	}

1220 1221
	if ((attr->ia_valid & ATTR_UID && !uid_eq(attr->ia_uid, inode->i_uid)) ||
	    (attr->ia_valid & ATTR_GID && !gid_eq(attr->ia_gid, inode->i_gid))) {
1222 1223 1224
		/*
		 * Gather pointers to quota structures so that allocation /
		 * freeing of quota structures happens here and not inside
1225
		 * dquot_transfer() where we have problems with lock ordering
1226
		 */
1227
		if (attr->ia_valid & ATTR_UID && !uid_eq(attr->ia_uid, inode->i_uid)
1228 1229
		    && OCFS2_HAS_RO_COMPAT_FEATURE(sb,
		    OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) {
1230
			transfer_to[USRQUOTA] = dqget(sb, make_kqid_uid(attr->ia_uid));
1231 1232
			if (IS_ERR(transfer_to[USRQUOTA])) {
				status = PTR_ERR(transfer_to[USRQUOTA]);
1233
				transfer_to[USRQUOTA] = NULL;
1234
				goto bail_unlock;
1235
			}
1236
		}
1237
		if (attr->ia_valid & ATTR_GID && !gid_eq(attr->ia_gid, inode->i_gid)
1238 1239
		    && OCFS2_HAS_RO_COMPAT_FEATURE(sb,
		    OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)) {
1240
			transfer_to[GRPQUOTA] = dqget(sb, make_kqid_gid(attr->ia_gid));
1241 1242
			if (IS_ERR(transfer_to[GRPQUOTA])) {
				status = PTR_ERR(transfer_to[GRPQUOTA]);
1243
				transfer_to[GRPQUOTA] = NULL;
1244
				goto bail_unlock;
1245
			}
1246
		}
1247
		down_write(&OCFS2_I(inode)->ip_alloc_sem);
1248 1249
		handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS +
					   2 * ocfs2_quota_trans_credits(sb));
1250 1251 1252
		if (IS_ERR(handle)) {
			status = PTR_ERR(handle);
			mlog_errno(status);
1253
			goto bail_unlock_alloc;
1254
		}
1255
		status = __dquot_transfer(inode, transfer_to);
1256 1257 1258
		if (status < 0)
			goto bail_commit;
	} else {
1259
		down_write(&OCFS2_I(inode)->ip_alloc_sem);
1260 1261 1262 1263
		handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
		if (IS_ERR(handle)) {
			status = PTR_ERR(handle);
			mlog_errno(status);
1264
			goto bail_unlock_alloc;
1265
		}
1266 1267
	}

C
Christian Brauner 已提交
1268
	setattr_copy(&init_user_ns, inode, attr);
C
Christoph Hellwig 已提交
1269 1270
	mark_inode_dirty(inode);

1271 1272 1273 1274 1275
	status = ocfs2_mark_inode_dirty(handle, inode, bh);
	if (status < 0)
		mlog_errno(status);

bail_commit:
1276
	ocfs2_commit_trans(osb, handle);
1277 1278
bail_unlock_alloc:
	up_write(&OCFS2_I(inode)->ip_alloc_sem);
1279
bail_unlock:
1280 1281
	if (status && inode_locked) {
		ocfs2_inode_unlock_tracker(inode, 1, &oh, had_lock);
1282 1283
		inode_locked = 0;
	}
1284 1285 1286 1287 1288
bail_unlock_rw:
	if (size_change)
		ocfs2_rw_unlock(inode, 1);
bail:

1289
	/* Release quota pointers in case we acquired them */
J
Jan Kara 已提交
1290
	for (qtype = 0; qtype < OCFS2_MAXQUOTAS; qtype++)
1291 1292
		dqput(transfer_to[qtype]);

T
Tiger Yang 已提交
1293
	if (!status && attr->ia_valid & ATTR_MODE) {
1294
		status = ocfs2_acl_chmod(inode, bh);
T
Tiger Yang 已提交
1295 1296 1297
		if (status < 0)
			mlog_errno(status);
	}
1298
	if (inode_locked)
1299
		ocfs2_inode_unlock_tracker(inode, 1, &oh, had_lock);
T
Tiger Yang 已提交
1300

1301
	brelse(bh);
1302 1303 1304
	return status;
}

1305 1306
/*
 * Fill *stat for the inode behind path.
 *
 * The inode is revalidated first; if that fails the error is returned
 * (and logged unless it is -ENOENT) and *stat is left untouched.
 * mnt_userns, request_mask and flags are not consulted here.
 */
int ocfs2_getattr(struct user_namespace *mnt_userns, const struct path *path,
		  struct kstat *stat, u32 request_mask, unsigned int flags)
{
	struct inode *inode = d_inode(path->dentry);
	struct super_block *sb = path->dentry->d_sb;
	struct ocfs2_super *osb = sb->s_fs_info;
	int err;

	err = ocfs2_inode_revalidate(path->dentry);
	if (err) {
		if (err != -ENOENT)
			mlog_errno(err);
		goto bail;
	}

	generic_fillattr(&init_user_ns, inode, stat);
	/*
	 * If there is inline data in the inode, the inode will normally not
	 * have data blocks allocated (it may have an external xattr block).
	 * Report at least one sector for such files, so tools like tar, rsync,
	 * others don't incorrectly think the file is completely sparse.
	 */
	if (unlikely(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL))
		stat->blocks += (stat->size + 511)>>9;

	/* We set the blksize from the cluster size for performance */
	stat->blksize = osb->s_clustersize;

bail:
	return err;
}

1337 1338
int ocfs2_permission(struct user_namespace *mnt_userns, struct inode *inode,
		     int mask)
T
Tiger Yang 已提交
1339
{
1340 1341
	int ret, had_lock;
	struct ocfs2_lock_holder oh;
T
Tiger Yang 已提交
1342

1343
	if (mask & MAY_NOT_BLOCK)
1344 1345
		return -ECHILD;

1346 1347 1348
	had_lock = ocfs2_inode_lock_tracker(inode, NULL, 0, &oh);
	if (had_lock < 0) {
		ret = had_lock;
T
Tiger Yang 已提交
1349
		goto out;
1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360
	} else if (had_lock) {
		/* See comments in ocfs2_setattr() for details.
		 * The call chain of this case could be:
		 * do_sys_open()
		 *  may_open()
		 *   inode_permission()
		 *    ocfs2_permission()
		 *     ocfs2_iop_get_acl()
		 */
		mlog(ML_ERROR, "Another case of recursive locking:\n");
		dump_stack();
T
Tiger Yang 已提交
1361 1362
	}

1363
	ret = generic_permission(&init_user_ns, inode, mask);
T
Tiger Yang 已提交
1364

1365
	ocfs2_inode_unlock_tracker(inode, 0, &oh, had_lock);
T
Tiger Yang 已提交
1366 1367 1368 1369
out:
	return ret;
}

1370 1371
static int __ocfs2_write_remove_suid(struct inode *inode,
				     struct buffer_head *bh)
1372 1373
{
	int ret;
1374
	handle_t *handle;
1375 1376 1377
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	struct ocfs2_dinode *di;

1378 1379 1380
	trace_ocfs2_write_remove_suid(
			(unsigned long long)OCFS2_I(inode)->ip_blkno,
			inode->i_mode);
1381

1382
	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
1383 1384
	if (IS_ERR(handle)) {
		ret = PTR_ERR(handle);
1385 1386 1387 1388
		mlog_errno(ret);
		goto out;
	}

1389
	ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), bh,
1390
				      OCFS2_JOURNAL_ACCESS_WRITE);
1391 1392
	if (ret < 0) {
		mlog_errno(ret);
1393
		goto out_trans;
1394 1395 1396 1397 1398 1399 1400 1401
	}

	inode->i_mode &= ~S_ISUID;
	if ((inode->i_mode & S_ISGID) && (inode->i_mode & S_IXGRP))
		inode->i_mode &= ~S_ISGID;

	di = (struct ocfs2_dinode *) bh->b_data;
	di->i_mode = cpu_to_le16(inode->i_mode);
1402
	ocfs2_update_inode_fsync_trans(handle, inode, 0);
1403

1404
	ocfs2_journal_dirty(handle, bh);
1405

1406
out_trans:
1407
	ocfs2_commit_trans(osb, handle);
1408 1409 1410 1411
out:
	return ret;
}

1412 1413 1414 1415 1416
/*
 * Read the inode block and strip the setuid/setgid bits through
 * __ocfs2_write_remove_suid(). The buffer is released before returning.
 */
static int ocfs2_write_remove_suid(struct inode *inode)
{
	int ret;
	struct buffer_head *bh = NULL;

	ret = ocfs2_read_inode_block(inode, &bh);
	if (ret < 0) {
		mlog_errno(ret);
		goto out;
	}

	ret = __ocfs2_write_remove_suid(inode, bh);
out:
	brelse(bh);
	return ret;
}

1429 1430 1431 1432 1433 1434 1435 1436 1437 1438
/*
 * Allocate enough extents to cover the region starting at byte offset
 * start for len bytes. Existing extents are skipped, any extents
 * added are marked as "unwritten".
 *
 * For an inline-data inode nothing is done when the end of the range
 * still fits inline; otherwise the inode is converted to extents first.
 *
 * Returns 0 on success or the error from the failing helper (-ENOSPC is
 * returned without being logged).
 */
static int ocfs2_allocate_unwritten_extents(struct inode *inode,
					    u64 start, u64 len)
{
	int ret;
	u32 cpos, phys_cpos, clusters, alloc_size;
	u64 end = start + len;
	struct buffer_head *di_bh = NULL;

	if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
		ret = ocfs2_read_inode_block(inode, &di_bh);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		/*
		 * Nothing to do if the requested reservation range
		 * fits within the inode.
		 */
		if (ocfs2_size_fits_inline_data(di_bh, end))
			goto out;

		ret = ocfs2_convert_inline_data_to_extents(inode, di_bh);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}
	}

	/*
	 * We consider both start and len to be inclusive.
	 */
	cpos = start >> OCFS2_SB(inode->i_sb)->s_clustersize_bits;
	clusters = ocfs2_clusters_for_bytes(inode->i_sb, end);
	clusters -= cpos;

	while (clusters) {
		ret = ocfs2_get_clusters(inode, cpos, &phys_cpos,
					 &alloc_size, NULL);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		/*
		 * Hole or existing extent len can be arbitrary, so
		 * cap it to our own allocation request.
		 */
		if (alloc_size > clusters)
			alloc_size = clusters;

		if (phys_cpos) {
			/*
			 * We already have an allocation at this
			 * region so we can safely skip it.
			 */
			goto next;
		}

		ret = ocfs2_extend_allocation(inode, cpos, alloc_size, 1);
		if (ret) {
			if (ret != -ENOSPC)
				mlog_errno(ret);
			goto out;
		}

next:
		cpos += alloc_size;
		clusters -= alloc_size;
	}

	ret = 0;
out:

	brelse(di_bh);
	return ret;
}

1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532
/*
 * Drop the page cache for a byte range, but only for the part made of
 * whole clusters. Pages that belong to a partial cluster at either edge
 * are kept so the zeroing code still has them to write to.
 */
static void ocfs2_truncate_cluster_pages(struct inode *inode, u64 byte_start,
					 u64 byte_len)
{
	struct address_space *mapping = inode->i_mapping;
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	loff_t first, last;

	/* Cluster-align the start; mask the end down to a cluster boundary. */
	first = (loff_t)ocfs2_align_bytes_to_clusters(inode->i_sb, byte_start);
	last = byte_start + byte_len;
	last = last & ~(osb->s_clustersize - 1);

	/* No whole cluster inside the range: nothing to drop. */
	if (first >= last)
		return;

	unmap_mapping_range(mapping, first, last - first, 0);
	truncate_inode_pages_range(mapping, first, last - 1);
}

J
Junxiao Bi 已提交
1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571
/*
 * Zero out, directly on disk, the blocks of a single cluster.
 *
 * start: file offset where zeroing begins; it is converted to a block
 *        number with ocfs2_blocks_for_bytes().
 * len:   length of the request; the range is cut off at the cluster
 *        boundary computed from "start" when "start + len" goes past it.
 *
 * Returns 0 when there is nothing to zero (no blocks in the range, or the
 * cluster is not allocated), the error from ocfs2_get_clusters(), or the
 * result of sb_issue_zeroout().
 */
static int ocfs2_zeroout_partial_cluster(struct inode *inode,
					u64 start, u64 len)
{
	struct super_block *sb = inode->i_sb;
	u64 zero_end = ocfs2_align_bytes_to_clusters(sb, start);
	u64 first_blk, last_blk, blk_count;
	u64 phys_blk, blk_off;
	u32 v_cluster, phys_cluster, ext_clusters;
	int status;

	if (start + len < zero_end)
		zero_end = start + len;

	first_blk = ocfs2_blocks_for_bytes(sb, start);
	last_blk = ocfs2_blocks_for_bytes(sb, zero_end);
	blk_count = last_blk - first_blk;
	if (blk_count == 0)
		return 0;

	v_cluster = ocfs2_bytes_to_clusters(sb, start);
	status = ocfs2_get_clusters(inode, v_cluster, &phys_cluster,
				    &ext_clusters, NULL);
	if (status)
		return status;

	/* A hole has no physical cluster to zero. */
	if (phys_cluster == 0)
		return 0;

	/* Translate the logical block into its physical counterpart. */
	blk_off = first_blk - ocfs2_clusters_to_blocks(sb, v_cluster);
	phys_blk = ocfs2_clusters_to_blocks(sb, phys_cluster) + blk_off;

	return sb_issue_zeroout(sb, phys_blk, blk_count, GFP_NOFS);
}

1572 1573 1574 1575
/*
 * Zero the partial clusters at the edges of the byte range
 * [start, start + len).
 *
 * Nothing is done when both edges sit on a cluster boundary. The part of
 * the range beyond i_size is zeroed directly on disk; the rest goes
 * through ocfs2_zero_range_for_truncate() inside one transaction.
 *
 * Both edges are always attempted. If either attempt fails, the first
 * error is returned, so a failure on the leading edge is not hidden by a
 * successful trailing edge.
 */
static int ocfs2_zero_partial_clusters(struct inode *inode,
				       u64 start, u64 len)
{
	int ret = 0, tail_ret;
	u64 tmpend = 0;
	u64 end = start + len;
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	unsigned int csize = osb->s_clustersize;
	handle_t *handle;
	loff_t isize = i_size_read(inode);

	/*
	 * The "start" and "end" values are NOT necessarily part of
	 * the range whose allocation is being deleted. Rather, this
	 * is what the user passed in with the request. We must zero
	 * partial clusters here. There's no need to worry about
	 * physical allocation - the zeroing code knows to skip holes.
	 */
	trace_ocfs2_zero_partial_clusters(
		(unsigned long long)OCFS2_I(inode)->ip_blkno,
		(unsigned long long)start, (unsigned long long)end);

	/*
	 * If both edges are on a cluster boundary then there's no
	 * zeroing required as the region is part of the allocation to
	 * be truncated.
	 */
	if ((start & (csize - 1)) == 0 && (end & (csize - 1)) == 0)
		goto out;

	/* No page cache for EOF blocks, issue zero out to disk. */
	if (end > isize) {
		/*
		 * zeroout eof blocks in last cluster starting from
		 * "isize" even "start" > "isize" because it is
		 * complicated to zeroout just at "start" as "start"
		 * may be not aligned with block size, buffer write
		 * would be required to do that, but out of eof buffer
		 * write is not supported.
		 */
		ret = ocfs2_zeroout_partial_cluster(inode, isize,
					end - isize);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}
		if (start >= isize)
			goto out;
		end = isize;
	}
	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
	if (IS_ERR(handle)) {
		ret = PTR_ERR(handle);
		mlog_errno(ret);
		goto out;
	}

	/*
	 * If start is on a cluster boundary and end is somewhere in another
	 * cluster, we have not COWed the cluster starting at start, unless
	 * end is also within the same cluster. So, in this case, we skip this
	 * first call to ocfs2_zero_range_for_truncate() truncate and move on
	 * to the next one.
	 */
	if ((start & (csize - 1)) != 0) {
		/*
		 * We want to get the byte offset of the end of the 1st
		 * cluster.
		 */
		tmpend = (u64)osb->s_clustersize +
			(start & ~(osb->s_clustersize - 1));
		if (tmpend > end)
			tmpend = end;

		trace_ocfs2_zero_partial_clusters_range1(
			(unsigned long long)start,
			(unsigned long long)tmpend);

		ret = ocfs2_zero_range_for_truncate(inode, handle, start,
						    tmpend);
		if (ret)
			mlog_errno(ret);
	}

	if (tmpend < end) {
		/*
		 * This may make start and end equal, but the zeroing
		 * code will skip any work in that case so there's no
		 * need to catch it up here.
		 */
		start = end & ~(osb->s_clustersize - 1);

		trace_ocfs2_zero_partial_clusters_range2(
			(unsigned long long)start, (unsigned long long)end);

		tail_ret = ocfs2_zero_range_for_truncate(inode, handle,
							 start, end);
		if (tail_ret) {
			mlog_errno(tail_ret);
			/* Do not let this result mask a leading-edge failure. */
			if (!ret)
				ret = tail_ret;
		}
	}
	ocfs2_update_inode_fsync_trans(handle, inode, 1);

	ocfs2_commit_trans(osb, handle);
out:
	return ret;
}

T
Tristan Ye 已提交
1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714
/*
 * Scan an extent list backwards, from the last used record, and return
 * the index of the first record whose e_cpos is below pos. Returns -1
 * when no record qualifies (including an empty list).
 */
static int ocfs2_find_rec(struct ocfs2_extent_list *el, u32 pos)
{
	int idx = le16_to_cpu(el->l_next_free_rec) - 1;

	while (idx >= 0 && le32_to_cpu(el->l_recs[idx].e_cpos) >= pos)
		idx--;

	return idx;
}

/*
 * Helper to calculate the punching pos and length in one run, we handle the
 * following three cases in order:
 *
 * - remove the entire record
 * - remove a partial record
 * - no record needs to be removed (hole-punching completed)
 *
 * Outputs: *trunc_cpos and *trunc_len describe the cluster range to
 * remove, *blkno is the block matching *trunc_cpos, *trunc_end is moved
 * down to the start of the removed range, and *done is set to 1 in the
 * third case (0 otherwise). In the third case the other outputs are left
 * untouched.
 */
static void ocfs2_calc_trunc_pos(struct inode *inode,
				 struct ocfs2_extent_list *el,
				 struct ocfs2_extent_rec *rec,
				 u32 trunc_start, u32 *trunc_cpos,
				 u32 *trunc_len, u32 *trunc_end,
				 u64 *blkno, int *done)
{
	int ret = 0;
	u32 coff, range;

	/* First cluster past the end of this record. */
	range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);

	if (le32_to_cpu(rec->e_cpos) >= trunc_start) {
		/*
		 * remove an entire extent record.
		 */
		*trunc_cpos = le32_to_cpu(rec->e_cpos);
		/*
		 * Skip holes if any.
		 */
		if (range < *trunc_end)
			*trunc_end = range;
		*trunc_len = *trunc_end - le32_to_cpu(rec->e_cpos);
		*blkno = le64_to_cpu(rec->e_blkno);
		*trunc_end = le32_to_cpu(rec->e_cpos);
	} else if (range > trunc_start) {
		/*
		 * remove a partial extent record, which means we're
		 * removing the last extent record.
		 */
		*trunc_cpos = trunc_start;
		/*
		 * skip hole if any.
		 */
		if (range < *trunc_end)
			*trunc_end = range;
		*trunc_len = *trunc_end - trunc_start;
		coff = trunc_start - le32_to_cpu(rec->e_cpos);
		*blkno = le64_to_cpu(rec->e_blkno) +
				ocfs2_clusters_to_blocks(inode->i_sb, coff);
		*trunc_end = trunc_start;
	} else {
		/*
		 * It may have two following possibilities:
		 *
		 * - last record has been removed
		 * - trunc_start was within a hole
		 *
		 * both two cases mean the completion of hole punching.
		 */
		ret = 1;
	}

	*done = ret;
}

1758 1759 1760
/*
 * Remove (punch) the byte range [byte_start, byte_start + byte_len) from
 * an inode.
 *
 * A zero-length request returns 0 immediately. Inline-data inodes are
 * handled by ocfs2_truncate_inline() plus a page cache purge. Otherwise
 * the partial clusters at the edges are zeroed, and the extent records
 * covering the whole clusters in between are removed from the tree,
 * walking from the end of the range towards its start. Finally the page
 * cache for the removed whole clusters is dropped.
 *
 * di_bh must hold the on-disk inode; it is dereferenced unconditionally.
 *
 * Returns 0 on success or the error from the failing helper.
 */
int ocfs2_remove_inode_range(struct inode *inode,
			     struct buffer_head *di_bh, u64 byte_start,
			     u64 byte_len)
{
	int ret = 0, flags = 0, done = 0, i;
	u32 trunc_start, trunc_len, trunc_end, trunc_cpos, phys_cpos;
	u32 cluster_in_el;
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	struct ocfs2_cached_dealloc_ctxt dealloc;
	struct address_space *mapping = inode->i_mapping;
	struct ocfs2_extent_tree et;
	struct ocfs2_path *path = NULL;
	struct ocfs2_extent_list *el = NULL;
	struct ocfs2_extent_rec *rec = NULL;
	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
	u64 blkno, refcount_loc = le64_to_cpu(di->i_refcount_loc);

	ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh);
	ocfs2_init_dealloc_ctxt(&dealloc);

	trace_ocfs2_remove_inode_range(
			(unsigned long long)OCFS2_I(inode)->ip_blkno,
			(unsigned long long)byte_start,
			(unsigned long long)byte_len);

	if (byte_len == 0)
		return 0;

	if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
		/*
		 * NOTE(review): byte_start and byte_start + byte_len are u64
		 * here; confirm that ocfs2_truncate_inline() takes them
		 * without narrowing, or bound them against the inline data
		 * capacity before the call.
		 */
		ret = ocfs2_truncate_inline(inode, di_bh, byte_start,
					    byte_start + byte_len, 0);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}
		/*
		 * There's no need to get fancy with the page cache
		 * truncate of an inline-data inode. We're talking
		 * about less than a page here, which will be cached
		 * in the dinode buffer anyway.
		 */
		unmap_mapping_range(mapping, 0, 0, 0);
		truncate_inode_pages(mapping, 0);
		goto out;
	}

	/*
	 * For reflinks, we may need to CoW 2 clusters which might be
	 * partially zero'd later, if hole's start and end offset were
	 * within one cluster(means is not exactly aligned to clustersize).
	 */
	if (ocfs2_is_refcount_inode(inode)) {
		ret = ocfs2_cow_file_pos(inode, di_bh, byte_start);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		ret = ocfs2_cow_file_pos(inode, di_bh, byte_start + byte_len);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}
	}

	/* Cluster range to remove; trunc_end is exclusive. */
	trunc_start = ocfs2_clusters_for_bytes(osb->sb, byte_start);
	trunc_end = (byte_start + byte_len) >> osb->s_clustersize_bits;
	cluster_in_el = trunc_end;

	ret = ocfs2_zero_partial_clusters(inode, byte_start, byte_len);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	path = ocfs2_new_path_from_et(&et);
	if (!path) {
		ret = -ENOMEM;
		mlog_errno(ret);
		goto out;
	}

	while (trunc_end > trunc_start) {

		ret = ocfs2_find_path(INODE_CACHE(inode), path,
				      cluster_in_el);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		el = path_leaf_el(path);

		i = ocfs2_find_rec(el, trunc_end);
		/*
		 * Need to go to previous extent block.
		 */
		if (i < 0) {
			if (path->p_tree_depth == 0)
				break;

			ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb,
							    path,
							    &cluster_in_el);
			if (ret) {
				mlog_errno(ret);
				goto out;
			}

			/*
			 * We've reached the leftmost extent block,
			 * it's safe to leave.
			 */
			if (cluster_in_el == 0)
				break;

			/*
			 * The 'pos' searched for previous extent block is
			 * always one cluster less than actual trunc_end.
			 */
			trunc_end = cluster_in_el + 1;

			ocfs2_reinit_path(path, 1);

			continue;

		} else
			rec = &el->l_recs[i];

		ocfs2_calc_trunc_pos(inode, el, rec, trunc_start, &trunc_cpos,
				     &trunc_len, &trunc_end, &blkno, &done);
		if (done)
			break;

		flags = rec->e_flags;
		phys_cpos = ocfs2_blocks_to_clusters(inode->i_sb, blkno);

		ret = ocfs2_remove_btree_range(inode, &et, trunc_cpos,
					       phys_cpos, trunc_len, flags,
					       &dealloc, refcount_loc, false);
		if (ret < 0) {
			mlog_errno(ret);
			goto out;
		}

		cluster_in_el = trunc_end;

		ocfs2_reinit_path(path, 1);
	}

	ocfs2_truncate_cluster_pages(inode, byte_start, byte_len);

out:
	ocfs2_free_path(path);
	ocfs2_schedule_truncate_log_flush(osb, 1);
	ocfs2_run_deallocs(osb, &dealloc);

	return ret;
}

1919 1920 1921
/*
 * Parts of this function taken from xfs_change_file_space()
 */
M
Mark Fasheh 已提交
1922 1923 1924 1925
static int __ocfs2_change_file_space(struct file *file, struct inode *inode,
				     loff_t f_pos, unsigned int cmd,
				     struct ocfs2_space_resv *sr,
				     int change_size)
1926 1927 1928
{
	int ret;
	s64 llen;
1929
	loff_t size, orig_isize;
1930 1931 1932
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	struct buffer_head *di_bh = NULL;
	handle_t *handle;
1933
	unsigned long long max_off = inode->i_sb->s_maxbytes;
1934 1935 1936 1937

	if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
		return -EROFS;

A
Al Viro 已提交
1938
	inode_lock(inode);
1939 1940 1941 1942 1943 1944 1945 1946 1947 1948

	/*
	 * This prevents concurrent writes on other nodes
	 */
	ret = ocfs2_rw_lock(inode, 1);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

M
Mark Fasheh 已提交
1949
	ret = ocfs2_inode_lock(inode, &di_bh, 1);
1950 1951 1952 1953 1954 1955 1956
	if (ret) {
		mlog_errno(ret);
		goto out_rw_unlock;
	}

	if (inode->i_flags & (S_IMMUTABLE|S_APPEND)) {
		ret = -EPERM;
M
Mark Fasheh 已提交
1957
		goto out_inode_unlock;
1958 1959 1960 1961 1962 1963
	}

	switch (sr->l_whence) {
	case 0: /*SEEK_SET*/
		break;
	case 1: /*SEEK_CUR*/
M
Mark Fasheh 已提交
1964
		sr->l_start += f_pos;
1965 1966
		break;
	case 2: /*SEEK_END*/
J
Junxiao Bi 已提交
1967
		sr->l_start += i_size_read(inode);
1968 1969 1970
		break;
	default:
		ret = -EINVAL;
M
Mark Fasheh 已提交
1971
		goto out_inode_unlock;
1972 1973 1974 1975 1976 1977 1978 1979 1980 1981
	}
	sr->l_whence = 0;

	llen = sr->l_len > 0 ? sr->l_len - 1 : sr->l_len;

	if (sr->l_start < 0
	    || sr->l_start > max_off
	    || (sr->l_start + llen) < 0
	    || (sr->l_start + llen) > max_off) {
		ret = -EINVAL;
M
Mark Fasheh 已提交
1982
		goto out_inode_unlock;
1983
	}
M
Mark Fasheh 已提交
1984
	size = sr->l_start + sr->l_len;
1985

1986 1987
	if (cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64 ||
	    cmd == OCFS2_IOC_UNRESVSP || cmd == OCFS2_IOC_UNRESVSP64) {
1988 1989
		if (sr->l_len <= 0) {
			ret = -EINVAL;
M
Mark Fasheh 已提交
1990
			goto out_inode_unlock;
1991 1992 1993
		}
	}

M
Mark Fasheh 已提交
1994
	if (file && should_remove_suid(file->f_path.dentry)) {
1995 1996 1997
		ret = __ocfs2_write_remove_suid(inode, di_bh);
		if (ret) {
			mlog_errno(ret);
M
Mark Fasheh 已提交
1998
			goto out_inode_unlock;
1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020
		}
	}

	down_write(&OCFS2_I(inode)->ip_alloc_sem);
	switch (cmd) {
	case OCFS2_IOC_RESVSP:
	case OCFS2_IOC_RESVSP64:
		/*
		 * This takes unsigned offsets, but the signed ones we
		 * pass have been checked against overflow above.
		 */
		ret = ocfs2_allocate_unwritten_extents(inode, sr->l_start,
						       sr->l_len);
		break;
	case OCFS2_IOC_UNRESVSP:
	case OCFS2_IOC_UNRESVSP64:
		ret = ocfs2_remove_inode_range(inode, di_bh, sr->l_start,
					       sr->l_len);
		break;
	default:
		ret = -EINVAL;
	}
2021

J
Junxiao Bi 已提交
2022
	orig_isize = i_size_read(inode);
2023 2024 2025 2026 2027 2028 2029
	/* zeroout eof blocks in the cluster. */
	if (!ret && change_size && orig_isize < size) {
		ret = ocfs2_zeroout_partial_cluster(inode, orig_isize,
					size - orig_isize);
		if (!ret)
			i_size_write(inode, size);
	}
2030 2031 2032
	up_write(&OCFS2_I(inode)->ip_alloc_sem);
	if (ret) {
		mlog_errno(ret);
M
Mark Fasheh 已提交
2033
		goto out_inode_unlock;
2034 2035 2036 2037 2038 2039 2040 2041 2042
	}

	/*
	 * We update c/mtime for these changes
	 */
	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
	if (IS_ERR(handle)) {
		ret = PTR_ERR(handle);
		mlog_errno(ret);
M
Mark Fasheh 已提交
2043
		goto out_inode_unlock;
2044 2045
	}

2046
	inode->i_ctime = inode->i_mtime = current_time(inode);
2047 2048 2049 2050
	ret = ocfs2_mark_inode_dirty(handle, inode, di_bh);
	if (ret < 0)
		mlog_errno(ret);

2051
	if (file && (file->f_flags & O_SYNC))
2052 2053
		handle->h_sync = 1;

2054 2055
	ocfs2_commit_trans(osb, handle);

M
Mark Fasheh 已提交
2056
out_inode_unlock:
2057
	brelse(di_bh);
M
Mark Fasheh 已提交
2058
	ocfs2_inode_unlock(inode, 1);
2059 2060 2061 2062
out_rw_unlock:
	ocfs2_rw_unlock(inode, 1);

out:
A
Al Viro 已提交
2063
	inode_unlock(inode);
2064 2065 2066
	return ret;
}

M
Mark Fasheh 已提交
2067 2068 2069
int ocfs2_change_file_space(struct file *file, unsigned int cmd,
			    struct ocfs2_space_resv *sr)
{
A
Al Viro 已提交
2070
	struct inode *inode = file_inode(file);
2071
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2072
	int ret;
M
Mark Fasheh 已提交
2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086

	if ((cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64) &&
	    !ocfs2_writes_unwritten_extents(osb))
		return -ENOTTY;
	else if ((cmd == OCFS2_IOC_UNRESVSP || cmd == OCFS2_IOC_UNRESVSP64) &&
		 !ocfs2_sparse_alloc(osb))
		return -ENOTTY;

	if (!S_ISREG(inode->i_mode))
		return -EINVAL;

	if (!(file->f_mode & FMODE_WRITE))
		return -EBADF;

2087 2088 2089 2090 2091 2092
	ret = mnt_want_write_file(file);
	if (ret)
		return ret;
	ret = __ocfs2_change_file_space(file, inode, file->f_pos, cmd, sr, 0);
	mnt_drop_write_file(file);
	return ret;
M
Mark Fasheh 已提交
2093 2094
}

2095
/*
 * ->fallocate() handler.
 *
 * Only FALLOC_FL_KEEP_SIZE and FALLOC_FL_PUNCH_HOLE are accepted; any other
 * mode bit, or a volume without unwritten extent support, yields
 * -EOPNOTSUPP.  The request is translated into an ocfs2_space_resv and
 * passed to __ocfs2_change_file_space().
 */
static long ocfs2_fallocate(struct file *file, int mode, loff_t offset,
			    loff_t len)
{
	struct inode *inode = file_inode(file);
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	struct ocfs2_space_resv sr;
	int change_size;
	int cmd;

	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
		return -EOPNOTSUPP;
	if (!ocfs2_writes_unwritten_extents(osb))
		return -EOPNOTSUPP;

	/* i_size may only grow when the caller did not ask to keep it. */
	change_size = (mode & FALLOC_FL_KEEP_SIZE) ? 0 : 1;

	/* Hole punching maps onto the unreserve command, the rest reserve. */
	cmd = (mode & FALLOC_FL_PUNCH_HOLE) ? OCFS2_IOC_UNRESVSP64 :
					      OCFS2_IOC_RESVSP64;

	sr.l_whence = 0;
	sr.l_start = (s64)offset;
	sr.l_len = (s64)len;

	return __ocfs2_change_file_space(NULL, inode, offset, cmd, &sr,
					 change_size);
}

T
Tao Ma 已提交
2123 2124 2125 2126 2127 2128 2129 2130 2131
/*
 * Report whether the byte range [pos, pos + count) of @inode touches an
 * allocated extent flagged OCFS2_EXT_REFCOUNTED.
 *
 * Returns 1 if such an extent is found, a negative errno if the extent
 * lookup fails, and otherwise the (non-negative) result of the last lookup.
 * Volumes without refcount trees, non-refcounted inodes and inline-data
 * inodes are answered with 0 without looking at any extent.
 */
int ocfs2_check_range_for_refcount(struct inode *inode, loff_t pos,
				   size_t count)
{
	struct super_block *sb = inode->i_sb;
	unsigned int extent_flags;
	u32 cpos, remaining, extent_len, phys_cpos;
	int status = 0;

	if (!ocfs2_refcount_tree(OCFS2_SB(sb)))
		return 0;
	if (!ocfs2_is_refcount_inode(inode))
		return 0;
	if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
		return 0;

	cpos = pos >> OCFS2_SB(sb)->s_clustersize_bits;
	remaining = ocfs2_clusters_for_bytes(sb, pos + count) - cpos;

	/* Walk the range one extent record at a time. */
	for (; remaining; remaining -= extent_len, cpos += extent_len) {
		status = ocfs2_get_clusters(inode, cpos, &phys_cpos,
					    &extent_len, &extent_flags);
		if (status < 0) {
			mlog_errno(status);
			return status;
		}

		if (phys_cpos && (extent_flags & OCFS2_EXT_REFCOUNTED))
			return 1;

		/* Do not step past the end of the requested range. */
		if (extent_len > remaining)
			extent_len = remaining;
	}

	return status;
}

M
Mark Fasheh 已提交
2162 2163 2164 2165 2166 2167 2168 2169 2170 2171
/*
 * Return 1 when either the start (@pos) or the end (@pos + @count) of an
 * I/O is not a multiple of the superblock's block size, 0 otherwise.
 */
static int ocfs2_is_io_unaligned(struct inode *inode, size_t count, loff_t pos)
{
	int blockmask = inode->i_sb->s_blocksize - 1;
	loff_t end = pos + count;

	/* Any low bit set in either offset means a partial block. */
	return ((pos | end) & blockmask) ? 1 : 0;
}

2172 2173 2174 2175 2176
static int ocfs2_inode_lock_for_extent_tree(struct inode *inode,
					    struct buffer_head **di_bh,
					    int meta_level,
					    int write_sem,
					    int wait)
T
Tao Ma 已提交
2177
{
2178
	int ret = 0;
T
Tao Ma 已提交
2179

2180
	if (wait)
2181
		ret = ocfs2_inode_lock(inode, di_bh, meta_level);
2182
	else
2183
		ret = ocfs2_try_inode_lock(inode, di_bh, meta_level);
2184
	if (ret < 0)
T
Tao Ma 已提交
2185
		goto out;
2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201

	if (wait) {
		if (write_sem)
			down_write(&OCFS2_I(inode)->ip_alloc_sem);
		else
			down_read(&OCFS2_I(inode)->ip_alloc_sem);
	} else {
		if (write_sem)
			ret = down_write_trylock(&OCFS2_I(inode)->ip_alloc_sem);
		else
			ret = down_read_trylock(&OCFS2_I(inode)->ip_alloc_sem);

		if (!ret) {
			ret = -EAGAIN;
			goto out_unlock;
		}
T
Tao Ma 已提交
2202 2203
	}

2204
	return ret;
T
Tao Ma 已提交
2205

2206 2207
out_unlock:
	brelse(*di_bh);
2208
	*di_bh = NULL;
2209
	ocfs2_inode_unlock(inode, meta_level);
T
Tao Ma 已提交
2210 2211 2212 2213
out:
	return ret;
}

2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230
/*
 * Release ip_alloc_sem (write side if @write_sem, read side otherwise),
 * drop the buffer head in @di_bh and reset it to NULL, then release the
 * cluster inode lock unless @meta_level is negative.
 */
static void ocfs2_inode_unlock_for_extent_tree(struct inode *inode,
					       struct buffer_head **di_bh,
					       int meta_level,
					       int write_sem)
{
	if (!write_sem)
		up_read(&OCFS2_I(inode)->ip_alloc_sem);
	else
		up_write(&OCFS2_I(inode)->ip_alloc_sem);

	brelse(*di_bh);
	*di_bh = NULL;

	/* A negative level means there is no inode lock to give back. */
	if (meta_level < 0)
		return;

	ocfs2_inode_unlock(inode, meta_level);
}

2231
static int ocfs2_prepare_inode_for_write(struct file *file,
G
Gang He 已提交
2232
					 loff_t pos, size_t count, int wait)
2233
{
G
Gang He 已提交
2234
	int ret = 0, meta_level = 0, overwrite_io = 0;
2235
	int write_sem = 0;
2236
	struct dentry *dentry = file->f_path.dentry;
2237
	struct inode *inode = d_inode(dentry);
G
Gang He 已提交
2238
	struct buffer_head *di_bh = NULL;
2239 2240
	u32 cpos;
	u32 clusters;
2241

2242
	/*
2243 2244
	 * We start with a read level meta lock and only jump to an ex
	 * if we need to make modifications here.
2245 2246
	 */
	for(;;) {
2247 2248 2249 2250 2251
		ret = ocfs2_inode_lock_for_extent_tree(inode,
						       &di_bh,
						       meta_level,
						       write_sem,
						       wait);
2252
		if (ret < 0) {
G
Gang He 已提交
2253 2254
			if (ret != -EAGAIN)
				mlog_errno(ret);
2255 2256 2257
			goto out;
		}

G
Gang He 已提交
2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272
		/*
		 * Check if IO will overwrite allocated blocks in case
		 * IOCB_NOWAIT flag is set.
		 */
		if (!wait && !overwrite_io) {
			overwrite_io = 1;

			ret = ocfs2_overwrite_io(inode, di_bh, pos, count);
			if (ret < 0) {
				if (ret != -EAGAIN)
					mlog_errno(ret);
				goto out_unlock;
			}
		}

2273 2274 2275 2276 2277
		/* Clear suid / sgid if necessary. We do this here
		 * instead of later in the write path because
		 * remove_suid() calls ->setattr without any hint that
		 * we may have already done our cluster locking. Since
		 * ocfs2_setattr() *must* take cluster locks to
2278
		 * proceed, this will lead us to recursively lock the
2279 2280 2281
		 * inode. There's also the dinode i_size state which
		 * can be lost via setattr during extending writes (we
		 * set inode->i_size at the end of a write. */
T
Tiger Yang 已提交
2282
		if (should_remove_suid(dentry)) {
2283
			if (meta_level == 0) {
2284 2285 2286 2287
				ocfs2_inode_unlock_for_extent_tree(inode,
								   &di_bh,
								   meta_level,
								   write_sem);
2288 2289 2290 2291 2292 2293 2294
				meta_level = 1;
				continue;
			}

			ret = ocfs2_write_remove_suid(inode);
			if (ret < 0) {
				mlog_errno(ret);
T
Tiger Yang 已提交
2295
				goto out_unlock;
2296 2297 2298
			}
		}

2299
		ret = ocfs2_check_range_for_refcount(inode, pos, count);
T
Tao Ma 已提交
2300
		if (ret == 1) {
2301 2302 2303 2304
			ocfs2_inode_unlock_for_extent_tree(inode,
							   &di_bh,
							   meta_level,
							   write_sem);
2305 2306
			meta_level = 1;
			write_sem = 1;
2307 2308 2309
			ret = ocfs2_inode_lock_for_extent_tree(inode,
							       &di_bh,
							       meta_level,
2310
							       write_sem,
2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321
							       wait);
			if (ret < 0) {
				if (ret != -EAGAIN)
					mlog_errno(ret);
				goto out;
			}

			cpos = pos >> OCFS2_SB(inode->i_sb)->s_clustersize_bits;
			clusters =
				ocfs2_clusters_for_bytes(inode->i_sb, pos + count) - cpos;
			ret = ocfs2_refcount_cow(inode, di_bh, cpos, clusters, UINT_MAX);
T
Tao Ma 已提交
2322 2323 2324
		}

		if (ret < 0) {
2325 2326
			if (ret != -EAGAIN)
				mlog_errno(ret);
T
Tao Ma 已提交
2327 2328 2329
			goto out_unlock;
		}

2330 2331 2332
		break;
	}

T
Tiger Yang 已提交
2333
out_unlock:
2334
	trace_ocfs2_prepare_inode_for_write(OCFS2_I(inode)->ip_blkno,
G
Gang He 已提交
2335 2336
					    pos, count, wait);

2337 2338 2339 2340
	ocfs2_inode_unlock_for_extent_tree(inode,
					   &di_bh,
					   meta_level,
					   write_sem);
T
Tiger Yang 已提交
2341 2342 2343 2344 2345

out:
	return ret;
}

A
Al Viro 已提交
2346 2347
static ssize_t ocfs2_file_write_iter(struct kiocb *iocb,
				    struct iov_iter *from)
T
Tiger Yang 已提交
2348
{
G
Gang He 已提交
2349
	int rw_level;
2350
	ssize_t written = 0;
2351
	ssize_t ret;
R
Ryan Ding 已提交
2352
	size_t count = iov_iter_count(from);
2353
	struct file *file = iocb->ki_filp;
A
Al Viro 已提交
2354
	struct inode *inode = file_inode(file);
M
Mark Fasheh 已提交
2355
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2356 2357
	int full_coherency = !(osb->s_mount_opt &
			       OCFS2_MOUNT_COHERENCY_BUFFERED);
2358
	void *saved_ki_complete = NULL;
2359 2360
	int append_write = ((iocb->ki_pos + count) >=
			i_size_read(inode) ? 1 : 0);
G
Gang He 已提交
2361 2362
	int direct_io = iocb->ki_flags & IOCB_DIRECT ? 1 : 0;
	int nowait = iocb->ki_flags & IOCB_NOWAIT ? 1 : 0;
2363

2364
	trace_ocfs2_file_write_iter(inode, file, file->f_path.dentry,
2365 2366 2367
		(unsigned long long)OCFS2_I(inode)->ip_blkno,
		file->f_path.dentry->d_name.len,
		file->f_path.dentry->d_name.name,
A
Al Viro 已提交
2368
		(unsigned int)from->nr_segs);	/* GRRRRR */
T
Tiger Yang 已提交
2369

G
Gang He 已提交
2370 2371 2372
	if (!direct_io && nowait)
		return -EOPNOTSUPP;

C
Christoph Hellwig 已提交
2373
	if (count == 0)
T
Tiger Yang 已提交
2374 2375
		return 0;

G
Gang He 已提交
2376 2377 2378 2379 2380
	if (nowait) {
		if (!inode_trylock(inode))
			return -EAGAIN;
	} else
		inode_lock(inode);
2381

2382 2383 2384
	/*
	 * Concurrent O_DIRECT writes are allowed with
	 * mount_option "coherency=buffered".
2385
	 * For append write, we must take rw EX.
2386
	 */
2387
	rw_level = (!direct_io || full_coherency || append_write);
2388

G
Gang He 已提交
2389 2390 2391 2392
	if (nowait)
		ret = ocfs2_try_rw_lock(inode, rw_level);
	else
		ret = ocfs2_rw_lock(inode, rw_level);
T
Tiger Yang 已提交
2393
	if (ret < 0) {
G
Gang He 已提交
2394 2395
		if (ret != -EAGAIN)
			mlog_errno(ret);
2396
		goto out_mutex;
T
Tiger Yang 已提交
2397 2398
	}

2399 2400 2401 2402 2403 2404 2405 2406 2407 2408
	/*
	 * O_DIRECT writes with "coherency=full" need to take EX cluster
	 * inode_lock to guarantee coherency.
	 */
	if (direct_io && full_coherency) {
		/*
		 * We need to take and drop the inode lock to force
		 * other nodes to drop their caches.  Buffered I/O
		 * already does this in write_begin().
		 */
G
Gang He 已提交
2409 2410 2411 2412
		if (nowait)
			ret = ocfs2_try_inode_lock(inode, NULL, 1);
		else
			ret = ocfs2_inode_lock(inode, NULL, 1);
2413
		if (ret < 0) {
G
Gang He 已提交
2414 2415
			if (ret != -EAGAIN)
				mlog_errno(ret);
2416
			goto out;
2417 2418 2419 2420 2421
		}

		ocfs2_inode_unlock(inode, 1);
	}

2422 2423 2424 2425
	ret = generic_write_checks(iocb, from);
	if (ret <= 0) {
		if (ret)
			mlog_errno(ret);
2426 2427
		goto out;
	}
2428
	count = ret;
2429

G
Gang He 已提交
2430
	ret = ocfs2_prepare_inode_for_write(file, iocb->ki_pos, count, !nowait);
T
Tiger Yang 已提交
2431
	if (ret < 0) {
G
Gang He 已提交
2432 2433
		if (ret != -EAGAIN)
			mlog_errno(ret);
T
Tiger Yang 已提交
2434 2435
		goto out;
	}
2436

2437 2438
	if (direct_io && !is_sync_kiocb(iocb) &&
	    ocfs2_is_io_unaligned(inode, count, iocb->ki_pos)) {
M
Mark Fasheh 已提交
2439
		/*
2440
		 * Make it a sync io if it's an unaligned aio.
M
Mark Fasheh 已提交
2441
		 */
2442
		saved_ki_complete = xchg(&iocb->ki_complete, NULL);
M
Mark Fasheh 已提交
2443 2444
	}

2445
	/* communicate with ocfs2_dio_end_io */
2446
	ocfs2_iocb_set_rw_locked(iocb, rw_level);
2447

A
Al Viro 已提交
2448
	written = __generic_file_write_iter(iocb, from);
2449
	/* buffered aio wouldn't have proper lock coverage today */
2450
	BUG_ON(written == -EIOCBQUEUED && !direct_io);
2451

2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464
	/*
	 * deep in g_f_a_w_n()->ocfs2_direct_IO we pass in a ocfs2_dio_end_io
	 * function pointer which is called when o_direct io completes so that
	 * it can unlock our rw lock.
	 * Unfortunately there are error cases which call end_io and others
	 * that don't.  so we don't have to unlock the rw_lock if either an
	 * async dio is going to do it in the future or an end_io after an
	 * error has already done it.
	 */
	if ((written == -EIOCBQUEUED) || (!ocfs2_iocb_is_rw_locked(iocb))) {
		rw_level = -1;
	}

A
Al Viro 已提交
2465
	if (unlikely(written <= 0))
2466
		goto out;
A
Al Viro 已提交
2467

A
Al Viro 已提交
2468
	if (((file->f_flags & O_DSYNC) && !direct_io) ||
R
Ryan Ding 已提交
2469
	    IS_SYNC(inode)) {
A
Al Viro 已提交
2470 2471 2472
		ret = filemap_fdatawrite_range(file->f_mapping,
					       iocb->ki_pos - written,
					       iocb->ki_pos - 1);
2473 2474 2475
		if (ret < 0)
			written = ret;

2476
		if (!ret) {
J
Joel Becker 已提交
2477
			ret = jbd2_journal_force_commit(osb->journal->j_journal);
M
Mark Fasheh 已提交
2478 2479 2480
			if (ret < 0)
				written = ret;
		}
2481 2482

		if (!ret)
A
Al Viro 已提交
2483 2484 2485
			ret = filemap_fdatawait_range(file->f_mapping,
						      iocb->ki_pos - written,
						      iocb->ki_pos - 1);
M
Mark Fasheh 已提交
2486 2487
	}

2488
out:
2489 2490 2491
	if (saved_ki_complete)
		xchg(&iocb->ki_complete, saved_ki_complete);

2492 2493 2494
	if (rw_level != -1)
		ocfs2_rw_unlock(inode, rw_level);

2495
out_mutex:
A
Al Viro 已提交
2496
	inode_unlock(inode);
2497

2498 2499 2500
	if (written)
		ret = written;
	return ret;
2501 2502
}

A
Al Viro 已提交
2503 2504
static ssize_t ocfs2_file_read_iter(struct kiocb *iocb,
				   struct iov_iter *to)
2505
{
2506
	int ret = 0, rw_level = -1, lock_level = 0;
2507
	struct file *filp = iocb->ki_filp;
A
Al Viro 已提交
2508
	struct inode *inode = file_inode(filp);
G
Gang He 已提交
2509 2510
	int direct_io = iocb->ki_flags & IOCB_DIRECT ? 1 : 0;
	int nowait = iocb->ki_flags & IOCB_NOWAIT ? 1 : 0;
2511

2512
	trace_ocfs2_file_read_iter(inode, filp, filp->f_path.dentry,
2513 2514
			(unsigned long long)OCFS2_I(inode)->ip_blkno,
			filp->f_path.dentry->d_name.len,
A
Al Viro 已提交
2515 2516
			filp->f_path.dentry->d_name.name,
			to->nr_segs);	/* GRRRRR */
2517

2518 2519 2520 2521 2522 2523 2524

	if (!inode) {
		ret = -EINVAL;
		mlog_errno(ret);
		goto bail;
	}

G
Gang He 已提交
2525 2526 2527
	if (!direct_io && nowait)
		return -EOPNOTSUPP;

2528
	/*
2529
	 * buffered reads protect themselves in ->read_folio().  O_DIRECT reads
2530 2531
	 * need locks to protect pending reads from racing with truncate.
	 */
G
Gang He 已提交
2532 2533 2534 2535 2536 2537
	if (direct_io) {
		if (nowait)
			ret = ocfs2_try_rw_lock(inode, 0);
		else
			ret = ocfs2_rw_lock(inode, 0);

2538
		if (ret < 0) {
G
Gang He 已提交
2539 2540
			if (ret != -EAGAIN)
				mlog_errno(ret);
2541 2542 2543 2544
			goto bail;
		}
		rw_level = 0;
		/* communicate with ocfs2_dio_end_io */
2545
		ocfs2_iocb_set_rw_locked(iocb, rw_level);
2546 2547
	}

2548 2549 2550 2551
	/*
	 * We're fine letting folks race truncates and extending
	 * writes with read across the cluster, just like they can
	 * locally. Hence no rw_lock during read.
2552
	 *
2553 2554
	 * Take and drop the meta data lock to update inode fields
	 * like i_size. This allows the checks down below
2555
	 * generic_file_read_iter() a chance of actually working.
2556
	 */
G
Gang He 已提交
2557 2558
	ret = ocfs2_inode_lock_atime(inode, filp->f_path.mnt, &lock_level,
				     !nowait);
2559
	if (ret < 0) {
G
Gang He 已提交
2560 2561
		if (ret != -EAGAIN)
			mlog_errno(ret);
2562 2563
		goto bail;
	}
M
Mark Fasheh 已提交
2564
	ocfs2_inode_unlock(inode, lock_level);
2565

A
Al Viro 已提交
2566
	ret = generic_file_read_iter(iocb, to);
2567
	trace_generic_file_read_iter_ret(ret);
2568 2569

	/* buffered aio wouldn't have proper lock coverage today */
2570
	BUG_ON(ret == -EIOCBQUEUED && !direct_io);
2571

A
Al Viro 已提交
2572
	/* see ocfs2_file_write_iter */
2573 2574 2575 2576 2577
	if (ret == -EIOCBQUEUED || !ocfs2_iocb_is_rw_locked(iocb)) {
		rw_level = -1;
	}

bail:
2578
	if (rw_level != -1)
2579 2580 2581 2582 2583
		ocfs2_rw_unlock(inode, rw_level);

	return ret;
}

S
Sunil Mushran 已提交
2584
/* Refer generic_file_llseek_unlocked() */
2585
static loff_t ocfs2_file_llseek(struct file *file, loff_t offset, int whence)
S
Sunil Mushran 已提交
2586 2587 2588 2589
{
	struct inode *inode = file->f_mapping->host;
	int ret = 0;

A
Al Viro 已提交
2590
	inode_lock(inode);
S
Sunil Mushran 已提交
2591

2592
	switch (whence) {
S
Sunil Mushran 已提交
2593 2594 2595
	case SEEK_SET:
		break;
	case SEEK_END:
2596 2597 2598 2599 2600 2601 2602 2603 2604 2605
		/* SEEK_END requires the OCFS2 inode lock for the file
		 * because it references the file's size.
		 */
		ret = ocfs2_inode_lock(inode, NULL, 0);
		if (ret < 0) {
			mlog_errno(ret);
			goto out;
		}
		offset += i_size_read(inode);
		ocfs2_inode_unlock(inode, 0);
S
Sunil Mushran 已提交
2606 2607 2608 2609 2610 2611 2612 2613 2614 2615
		break;
	case SEEK_CUR:
		if (offset == 0) {
			offset = file->f_pos;
			goto out;
		}
		offset += file->f_pos;
		break;
	case SEEK_DATA:
	case SEEK_HOLE:
2616
		ret = ocfs2_seek_data_hole_offset(file, &offset, whence);
S
Sunil Mushran 已提交
2617 2618 2619 2620 2621 2622 2623 2624
		if (ret)
			goto out;
		break;
	default:
		ret = -EINVAL;
		goto out;
	}

J
Jie Liu 已提交
2625
	offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
S
Sunil Mushran 已提交
2626 2627

out:
A
Al Viro 已提交
2628
	inode_unlock(inode);
S
Sunil Mushran 已提交
2629 2630 2631 2632 2633
	if (ret)
		return ret;
	return offset;
}

2634 2635 2636
/*
 * ->remap_file_range() handler.
 *
 * Rejects remap flags other than REMAP_FILE_DEDUP / REMAP_FILE_ADVISORY
 * (-EINVAL), volumes without refcount trees (-EOPNOTSUPP) and read-only
 * volumes (-EROFS).  Both inodes are then locked, the request is vetted by
 * generic_remap_file_range_prep(), and the blocks are remapped with
 * ocfs2_reflink_remap_blocks().
 *
 * Returns the remapped length when it is positive, otherwise the last
 * status value (0 or a negative errno).
 */
static loff_t ocfs2_remap_file_range(struct file *file_in, loff_t pos_in,
				     struct file *file_out, loff_t pos_out,
				     loff_t len, unsigned int remap_flags)
{
	struct inode *inode_in = file_inode(file_in);
	struct inode *inode_out = file_inode(file_out);
	struct ocfs2_super *osb = OCFS2_SB(inode_in->i_sb);
	struct buffer_head *in_bh = NULL, *out_bh = NULL;
	bool same_inode = (inode_in == inode_out);
	loff_t remapped = 0;
	ssize_t status;

	if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY))
		return -EINVAL;
	if (!ocfs2_refcount_tree(osb))
		return -EOPNOTSUPP;
	if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
		return -EROFS;

	/* Lock both files against IO */
	status = ocfs2_reflink_inodes_lock(inode_in, &in_bh, inode_out,
					   &out_bh);
	if (status)
		return status;

	/* System files are never eligible for block sharing. */
	if ((OCFS2_I(inode_in)->ip_flags & OCFS2_INODE_SYSTEM_FILE) ||
	    (OCFS2_I(inode_out)->ip_flags & OCFS2_INODE_SYSTEM_FILE)) {
		status = -EINVAL;
		goto out_unlock;
	}

	status = generic_remap_file_range_prep(file_in, pos_in, file_out,
					       pos_out, &len, remap_flags);
	if (status < 0 || len == 0)
		goto out_unlock;

	/* Lock out changes to the allocation maps and remap. */
	down_write(&OCFS2_I(inode_in)->ip_alloc_sem);
	if (!same_inode)
		down_write_nested(&OCFS2_I(inode_out)->ip_alloc_sem,
				  SINGLE_DEPTH_NESTING);

	/* Zap any page cache for the destination file's range. */
	truncate_inode_pages_range(&inode_out->i_data,
				   round_down(pos_out, PAGE_SIZE),
				   round_up(pos_out + len, PAGE_SIZE) - 1);

	remapped = ocfs2_reflink_remap_blocks(inode_in, in_bh, pos_in,
					      inode_out, out_bh, pos_out,
					      len);

	up_write(&OCFS2_I(inode_in)->ip_alloc_sem);
	if (!same_inode)
		up_write(&OCFS2_I(inode_out)->ip_alloc_sem);

	if (remapped < 0) {
		status = remapped;
		mlog_errno(status);
		goto out_unlock;
	}

	/*
	 * Empty the extent map so that we may get the right extent
	 * record from the disk.
	 */
	ocfs2_extent_map_trunc(inode_in, 0);
	ocfs2_extent_map_trunc(inode_out, 0);

	status = ocfs2_reflink_update_dest(inode_out, out_bh, pos_out + len);
	if (status)
		mlog_errno(status);

out_unlock:
	ocfs2_reflink_inodes_unlock(inode_in, in_bh, inode_out, out_bh);
	return remapped > 0 ? remapped : status;
}

2709
const struct inode_operations ocfs2_file_iops = {
2710 2711
	.setattr	= ocfs2_setattr,
	.getattr	= ocfs2_getattr,
T
Tiger Yang 已提交
2712
	.permission	= ocfs2_permission,
T
Tiger Yang 已提交
2713
	.listxattr	= ocfs2_listxattr,
M
Mark Fasheh 已提交
2714
	.fiemap		= ocfs2_fiemap,
2715
	.get_acl	= ocfs2_iop_get_acl,
2716
	.set_acl	= ocfs2_iop_set_acl,
M
Miklos Szeredi 已提交
2717 2718
	.fileattr_get	= ocfs2_fileattr_get,
	.fileattr_set	= ocfs2_fileattr_set,
2719 2720
};

2721
const struct inode_operations ocfs2_special_file_iops = {
2722 2723
	.setattr	= ocfs2_setattr,
	.getattr	= ocfs2_getattr,
T
Tiger Yang 已提交
2724
	.permission	= ocfs2_permission,
2725
	.get_acl	= ocfs2_iop_get_acl,
2726
	.set_acl	= ocfs2_iop_set_acl,
2727 2728
};

M
Mark Fasheh 已提交
2729 2730 2731 2732
/*
 * Other than ->lock, keep ocfs2_fops and ocfs2_dops in sync with
 * ocfs2_fops_no_plocks and ocfs2_dops_no_plocks!
 */
2733
const struct file_operations ocfs2_fops = {
S
Sunil Mushran 已提交
2734
	.llseek		= ocfs2_file_llseek,
2735 2736 2737 2738
	.mmap		= ocfs2_mmap,
	.fsync		= ocfs2_sync_file,
	.release	= ocfs2_file_release,
	.open		= ocfs2_file_open,
A
Al Viro 已提交
2739
	.read_iter	= ocfs2_file_read_iter,
A
Al Viro 已提交
2740
	.write_iter	= ocfs2_file_write_iter,
2741
	.unlocked_ioctl	= ocfs2_ioctl,
M
Mark Fasheh 已提交
2742 2743 2744
#ifdef CONFIG_COMPAT
	.compat_ioctl   = ocfs2_compat_ioctl,
#endif
M
Mark Fasheh 已提交
2745
	.lock		= ocfs2_lock,
2746
	.flock		= ocfs2_flock,
2747
	.splice_read	= generic_file_splice_read,
2748
	.splice_write	= iter_file_splice_write,
2749
	.fallocate	= ocfs2_fallocate,
2750
	.remap_file_range = ocfs2_remap_file_range,
2751 2752
};

2753
const struct file_operations ocfs2_dops = {
J
Jan Kara 已提交
2754
	.llseek		= generic_file_llseek,
2755
	.read		= generic_read_dir,
A
Al Viro 已提交
2756
	.iterate	= ocfs2_readdir,
2757
	.fsync		= ocfs2_sync_file,
2758 2759
	.release	= ocfs2_dir_release,
	.open		= ocfs2_dir_open,
2760
	.unlocked_ioctl	= ocfs2_ioctl,
M
Mark Fasheh 已提交
2761 2762
#ifdef CONFIG_COMPAT
	.compat_ioctl   = ocfs2_compat_ioctl,
M
Mark Fasheh 已提交
2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780
#endif
	.lock		= ocfs2_lock,
	.flock		= ocfs2_flock,
};

/*
 * POSIX-lockless variants of our file_operations.
 *
 * These will be used if the underlying cluster stack does not support
 * posix file locking, if the user passes the "localflocks" mount
 * option, or if we have a local-only fs.
 *
 * ocfs2_flock is in here because all stacks handle UNIX file locks,
 * so we still want it in the case of no stack support for
 * plocks. Internally, it will do the right thing when asked to ignore
 * the cluster.
 */
const struct file_operations ocfs2_fops_no_plocks = {
S
Sunil Mushran 已提交
2781
	.llseek		= ocfs2_file_llseek,
M
Mark Fasheh 已提交
2782 2783 2784 2785
	.mmap		= ocfs2_mmap,
	.fsync		= ocfs2_sync_file,
	.release	= ocfs2_file_release,
	.open		= ocfs2_file_open,
A
Al Viro 已提交
2786
	.read_iter	= ocfs2_file_read_iter,
A
Al Viro 已提交
2787
	.write_iter	= ocfs2_file_write_iter,
M
Mark Fasheh 已提交
2788 2789 2790 2791 2792
	.unlocked_ioctl	= ocfs2_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl   = ocfs2_compat_ioctl,
#endif
	.flock		= ocfs2_flock,
2793
	.splice_read	= generic_file_splice_read,
2794
	.splice_write	= iter_file_splice_write,
2795
	.fallocate	= ocfs2_fallocate,
2796
	.remap_file_range = ocfs2_remap_file_range,
M
Mark Fasheh 已提交
2797 2798 2799 2800 2801
};

const struct file_operations ocfs2_dops_no_plocks = {
	.llseek		= generic_file_llseek,
	.read		= generic_read_dir,
A
Al Viro 已提交
2802
	.iterate	= ocfs2_readdir,
M
Mark Fasheh 已提交
2803 2804 2805 2806 2807 2808
	.fsync		= ocfs2_sync_file,
	.release	= ocfs2_dir_release,
	.open		= ocfs2_dir_open,
	.unlocked_ioctl	= ocfs2_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl   = ocfs2_compat_ioctl,
M
Mark Fasheh 已提交
2809
#endif
2810
	.flock		= ocfs2_flock,
2811
};