file.c 86.7 KB
Newer Older
M
Miklos Szeredi 已提交
1 2
/*
  FUSE: Filesystem in Userspace
M
Miklos Szeredi 已提交
3
  Copyright (C) 2001-2008  Miklos Szeredi <miklos@szeredi.hu>
M
Miklos Szeredi 已提交
4 5 6 7 8 9 10 11 12 13

  This program can be distributed under the terms of the GNU GPL.
  See the file COPYING.
*/

#include "fuse_i.h"

#include <linux/pagemap.h>
#include <linux/slab.h>
#include <linux/kernel.h>
A
Alexey Dobriyan 已提交
14
#include <linux/sched.h>
15
#include <linux/sched/signal.h>
16
#include <linux/module.h>
17
#include <linux/compat.h>
18
#include <linux/swap.h>
19
#include <linux/falloc.h>
20
#include <linux/uio.h>
21
#include <linux/fs.h>
M
Miklos Szeredi 已提交
22

23 24
/*
 * Allocate a page-pointer array with a fuse_page_desc array packed
 * behind it in the same allocation; *desc points into that tail.
 *
 * Returns NULL on allocation failure or if the size computation would
 * overflow; *desc is only valid when the return value is non-NULL.
 */
static struct page **fuse_pages_alloc(unsigned int npages, gfp_t flags,
				      struct fuse_page_desc **desc)
{
	struct page **pages;
	size_t elem_size = sizeof(struct page *) +
			   sizeof(struct fuse_page_desc);

	/* Guard against integer overflow in the allocation size */
	if (npages > SIZE_MAX / elem_size)
		return NULL;

	pages = kzalloc(npages * elem_size, flags);
	if (!pages)
		return NULL;

	/* descriptors live immediately after the page pointers */
	*desc = (void *) (pages + npages);

	return pages;
}

35
/*
 * Send FUSE_OPEN or FUSE_OPENDIR to the server and collect the reply
 * into *outargp.  Returns 0 or a negative error from the request.
 */
static int fuse_send_open(struct fuse_mount *fm, u64 nodeid, struct file *file,
			  int opcode, struct fuse_open_out *outargp)
{
	struct fuse_open_in inarg;
	FUSE_ARGS(args);

	/* memset (not an initializer) so padding sent to the server is zeroed */
	memset(&inarg, 0, sizeof(inarg));
	/* Creation-time flags were already handled; don't resend them */
	inarg.flags = file->f_flags & ~(O_CREAT | O_EXCL | O_NOCTTY);
	/* Without atomic O_TRUNC support, truncation is done separately */
	if (!fm->fc->atomic_o_trunc)
		inarg.flags &= ~O_TRUNC;
	args.opcode = opcode;
	args.nodeid = nodeid;
	args.in_numargs = 1;
	args.in_args[0].size = sizeof(inarg);
	args.in_args[0].value = &inarg;
	args.out_numargs = 1;
	args.out_args[0].size = sizeof(*outargp);
	args.out_args[0].value = outargp;

	return fuse_simple_request(fm, &args);
}

57 58 59 60 61 62
/* State kept alive until the (possibly asynchronous) RELEASE completes */
struct fuse_release_args {
	struct fuse_args args;		/* request description */
	struct fuse_release_in inarg;	/* wire payload for RELEASE */
	struct inode *inode;		/* pinned via igrab() until reply */
};

63
struct fuse_file *fuse_file_alloc(struct fuse_mount *fm)
M
Miklos Szeredi 已提交
64 65
{
	struct fuse_file *ff;
T
Tejun Heo 已提交
66

67
	ff = kzalloc(sizeof(struct fuse_file), GFP_KERNEL_ACCOUNT);
T
Tejun Heo 已提交
68 69 70
	if (unlikely(!ff))
		return NULL;

71
	ff->fm = fm;
72 73
	ff->release_args = kzalloc(sizeof(*ff->release_args),
				   GFP_KERNEL_ACCOUNT);
74
	if (!ff->release_args) {
T
Tejun Heo 已提交
75 76
		kfree(ff);
		return NULL;
M
Miklos Szeredi 已提交
77
	}
T
Tejun Heo 已提交
78 79

	INIT_LIST_HEAD(&ff->write_entry);
M
Miklos Szeredi 已提交
80
	mutex_init(&ff->readdir.lock);
81
	refcount_set(&ff->count, 1);
T
Tejun Heo 已提交
82 83 84
	RB_CLEAR_NODE(&ff->polled_node);
	init_waitqueue_head(&ff->poll_wait);

85
	ff->kh = atomic64_inc_return(&fm->fc->khctr);
T
Tejun Heo 已提交
86

M
Miklos Szeredi 已提交
87 88 89 90 91
	return ff;
}

/* Free a fuse_file that was never exposed (counterpart of fuse_file_alloc) */
void fuse_file_free(struct fuse_file *ff)
{
	kfree(ff->release_args);
	mutex_destroy(&ff->readdir.lock);
	kfree(ff);
}

97
/* Take an extra reference on the file; paired with fuse_file_put() */
static struct fuse_file *fuse_file_get(struct fuse_file *ff)
{
	refcount_inc(&ff->count);
	return ff;
}

103
/*
 * Completion callback for the RELEASE request: drop the inode pinned in
 * fuse_release_common() and free the release args.  Also called directly
 * on paths where the request is skipped or sent synchronously.
 */
static void fuse_release_end(struct fuse_mount *fm, struct fuse_args *args,
			     int error)
{
	struct fuse_release_args *ra = container_of(args, typeof(*ra), args);

	iput(ra->inode);	/* iput(NULL) is a no-op */
	kfree(ra);
}

112
/*
 * Drop a reference on the file.  When the last reference goes away the
 * prepared RELEASE request is either skipped (server lacks open/opendir),
 * sent synchronously, or queued in the background.
 */
static void fuse_file_put(struct fuse_file *ff, bool sync, bool isdir)
{
	struct fuse_args *args;

	if (!refcount_dec_and_test(&ff->count))
		return;

	args = &ff->release_args->args;
	if (isdir ? ff->fm->fc->no_opendir : ff->fm->fc->no_open) {
		/* Do nothing when client does not implement 'open' */
		fuse_release_end(ff->fm, args, 0);
	} else if (sync) {
		fuse_simple_request(ff->fm, args);
		fuse_release_end(ff->fm, args, 0);
	} else {
		args->end = fuse_release_end;
		if (fuse_simple_background(ff->fm, args,
					   GFP_KERNEL | __GFP_NOFAIL))
			fuse_release_end(ff->fm, args, -ENOTCONN);
	}
	kfree(ff);
}

133
/*
 * Open a file or directory: allocate the fuse_file and, unless the
 * server previously returned ENOSYS for this opcode, send OPEN/OPENDIR.
 * On ENOSYS the connection is flagged (no_open/no_opendir) and defaults
 * are used from then on.
 */
int fuse_do_open(struct fuse_mount *fm, u64 nodeid, struct file *file,
		 bool isdir)
{
	struct fuse_conn *fc = fm->fc;
	struct fuse_file *ff;
	int opcode = isdir ? FUSE_OPENDIR : FUSE_OPEN;

	ff = fuse_file_alloc(fm);
	if (!ff)
		return -ENOMEM;

	ff->fh = 0;
	/* Default for no-open */
	ff->open_flags = FOPEN_KEEP_CACHE | (isdir ? FOPEN_CACHE_DIR : 0);
	if (isdir ? !fc->no_opendir : !fc->no_open) {
		struct fuse_open_out outarg;
		int err;

		err = fuse_send_open(fm, nodeid, file, opcode, &outarg);
		if (!err) {
			/* server-provided handle and per-open flags win */
			ff->fh = outarg.fh;
			ff->open_flags = outarg.open_flags;

		} else if (err != -ENOSYS) {
			fuse_file_free(ff);
			return err;
		} else {
			/* remember that the server doesn't implement it */
			if (isdir)
				fc->no_opendir = 1;
			else
				fc->no_open = 1;
		}
	}

	/* FOPEN_DIRECT_IO is not meaningful for directories */
	if (isdir)
		ff->open_flags &= ~FOPEN_DIRECT_IO;

	ff->nodeid = nodeid;
	file->private_data = ff;

	return 0;
}
EXPORT_SYMBOL_GPL(fuse_do_open);
176

177 178 179 180 181 182 183 184 185
/* Chain a writable file onto the inode's write_files list (at most once) */
static void fuse_link_write_file(struct file *file)
{
	struct inode *inode = file_inode(file);
	struct fuse_inode *fi = get_fuse_inode(inode);
	struct fuse_file *ff = file->private_data;
	/*
	 * file may be written through mmap, so chain it onto the
	 * inodes's write_file list
	 */
	spin_lock(&fi->lock);
	if (list_empty(&ff->write_entry))
		list_add(&ff->write_entry, &fi->write_files);
	spin_unlock(&fi->lock);
}

192
/*
 * Post-open processing: honour the server's per-open flags
 * (FOPEN_STREAM/NONSEEKABLE/KEEP_CACHE), perform the atomic O_TRUNC
 * truncation of cached state, and register writeback-cache writers.
 */
void fuse_finish_open(struct inode *inode, struct file *file)
{
	struct fuse_file *ff = file->private_data;
	struct fuse_conn *fc = get_fuse_conn(inode);

	if (ff->open_flags & FOPEN_STREAM)
		stream_open(inode, file);
	else if (ff->open_flags & FOPEN_NONSEEKABLE)
		nonseekable_open(inode, file);

	if (fc->atomic_o_trunc && (file->f_flags & O_TRUNC)) {
		struct fuse_inode *fi = get_fuse_inode(inode);

		/* size changes under fi->lock; bump attr_version first */
		spin_lock(&fi->lock);
		fi->attr_version = atomic64_inc_return(&fc->attr_version);
		i_size_write(inode, 0);
		spin_unlock(&fi->lock);
		truncate_pagecache(inode, 0);
		fuse_invalidate_attr(inode);
		if (fc->writeback_cache)
			file_update_time(file);
	} else if (!(ff->open_flags & FOPEN_KEEP_CACHE)) {
		/* server asked for the page cache to be dropped */
		invalidate_inode_pages2(inode->i_mapping);
	}

	if ((file->f_mode & FMODE_WRITE) && fc->writeback_cache)
		fuse_link_write_file(file);
}

221
/*
 * Common open path for files and directories.  For atomic O_TRUNC with
 * writeback cache or DAX, the inode is locked and writes are blocked
 * (FUSE_NOWRITE) around the actual open; DAX additionally requires
 * breaking layouts under i_mmap_sem.
 */
int fuse_open_common(struct inode *inode, struct file *file, bool isdir)
{
	struct fuse_mount *fm = get_fuse_mount(inode);
	struct fuse_conn *fc = fm->fc;
	int err;
	bool is_wb_truncate = (file->f_flags & O_TRUNC) &&
			  fc->atomic_o_trunc &&
			  fc->writeback_cache;
	bool dax_truncate = (file->f_flags & O_TRUNC) &&
			  fc->atomic_o_trunc && FUSE_IS_DAX(inode);

	if (fuse_is_bad(inode))
		return -EIO;

	err = generic_file_open(inode, file);
	if (err)
		return err;

	if (is_wb_truncate || dax_truncate) {
		inode_lock(inode);
		fuse_set_nowrite(inode);
	}

	if (dax_truncate) {
		down_write(&get_fuse_inode(inode)->i_mmap_sem);
		err = fuse_dax_break_layouts(inode, 0, 0);
		if (err)
			goto out;
	}

	err = fuse_do_open(fm, get_node_id(inode), file, isdir);
	if (!err)
		fuse_finish_open(inode, file);

out:
	if (dax_truncate)
		up_write(&get_fuse_inode(inode)->i_mmap_sem);

	/*
	 * Was "is_wb_truncate | dax_truncate": use logical || on booleans
	 * to match the lock-taking condition above and make intent clear.
	 */
	if (is_wb_truncate || dax_truncate) {
		fuse_release_nowrite(inode);
		inode_unlock(inode);
	}

	return err;
}

267 268
/*
 * Detach the file from inode/connection bookkeeping and fill in the
 * preallocated RELEASE(DIR) request; the request is sent later from
 * fuse_file_put() when the last reference is dropped.
 */
static void fuse_prepare_release(struct fuse_inode *fi, struct fuse_file *ff,
				 int flags, int opcode)
{
	struct fuse_conn *fc = ff->fm->fc;
	struct fuse_release_args *ra = ff->release_args;

	/* Inode is NULL on error path of fuse_create_open() */
	if (likely(fi)) {
		spin_lock(&fi->lock);
		list_del(&ff->write_entry);
		spin_unlock(&fi->lock);
	}
	/* remove from polled-files rb-tree if it was ever polled */
	spin_lock(&fc->lock);
	if (!RB_EMPTY_NODE(&ff->polled_node))
		rb_erase(&ff->polled_node, &fc->polled_files);
	spin_unlock(&fc->lock);

	wake_up_interruptible_all(&ff->poll_wait);

	ra->inarg.fh = ff->fh;
	ra->inarg.flags = flags;
	ra->args.in_numargs = 1;
	ra->args.in_args[0].size = sizeof(struct fuse_release_in);
	ra->args.in_args[0].value = &ra->inarg;
	ra->args.opcode = opcode;
	ra->args.nodeid = ff->nodeid;
	ra->args.force = true;
	ra->args.nocreds = true;
}

297
/*
 * Common ->release() work for files and directories: prepare the
 * RELEASE request, record flock state, pin the inode and drop the
 * file reference (which sends the request).
 */
void fuse_release_common(struct file *file, bool isdir)
{
	struct fuse_inode *fi = get_fuse_inode(file_inode(file));
	struct fuse_file *ff = file->private_data;
	struct fuse_release_args *ra = ff->release_args;
	int opcode = isdir ? FUSE_RELEASEDIR : FUSE_RELEASE;

	fuse_prepare_release(fi, ff, file->f_flags, opcode);

	if (ff->flock) {
		ra->inarg.release_flags |= FUSE_RELEASE_FLOCK_UNLOCK;
		ra->inarg.lock_owner = fuse_lock_owner_id(ff->fm->fc,
							  (fl_owner_t) file);
	}
	/* Hold inode until release is finished */
	ra->inode = igrab(file_inode(file));

	/*
	 * Normally this will send the RELEASE request, however if
	 * some asynchronous READ or WRITE requests are outstanding,
	 * the sending will be delayed.
	 *
	 * Make the release synchronous if this is a fuseblk mount,
	 * synchronous RELEASE is allowed (and desirable) in this case
	 * because the server can be trusted not to screw up.
	 */
	fuse_file_put(ff, ff->fm->fc->destroy, isdir);
}

326 327
/* ->open() for regular files */
static int fuse_open(struct inode *inode, struct file *file)
{
	return fuse_open_common(inode, file, false);
}

/* ->release() for regular files */
static int fuse_release(struct inode *inode, struct file *file)
{
	struct fuse_conn *fc = get_fuse_conn(inode);

	/* see fuse_vma_close() for !writeback_cache case */
	if (fc->writeback_cache)
		write_inode_now(inode, 1);

	fuse_release_common(file, false);

	/* return value is ignored by VFS */
	return 0;
}

345
/*
 * Synchronous release of a file that is known to have a single
 * reference (e.g. the error path of create/open).
 */
void fuse_sync_release(struct fuse_inode *fi, struct fuse_file *ff, int flags)
{
	WARN_ON(refcount_read(&ff->count) > 1);
	fuse_prepare_release(fi, ff, flags, FUSE_RELEASE);
	/*
	 * iput(NULL) is a no-op and since the refcount is 1 and everything's
	 * synchronous, we are fine with not doing igrab() here.
	 */
	fuse_file_put(ff, true, false);
}
EXPORT_SYMBOL_GPL(fuse_sync_release);
356

357
/*
 * Scramble the ID space with XTEA, so that the value of the files_struct
 * pointer is not exposed to userspace.
 */
u64 fuse_lock_owner_id(struct fuse_conn *fc, fl_owner_t id)
{
	u32 *k = fc->scramble_key;
	u64 v = (unsigned long) id;
	u32 v0 = v;
	u32 v1 = v >> 32;
	u32 sum = 0;
	int i;

	/* 32 rounds of XTEA with the per-connection scramble key */
	for (i = 0; i < 32; i++) {
		v0 += ((v1 << 4 ^ v1 >> 5) + v1) ^ (sum + k[sum & 3]);
		sum += 0x9E3779B9;
		v1 += ((v0 << 4 ^ v0 >> 5) + v0) ^ (sum + k[sum>>11 & 3]);
	}

	return (u64) v0 + ((u64) v1 << 32);
}

379 380
/* One queued writepage request plus its bookkeeping links */
struct fuse_writepage_args {
	struct fuse_io_args ia;			/* the WRITE request itself */
	struct rb_node writepages_entry;	/* node in fi->writepages */
	struct list_head queue_entry;
	struct fuse_writepage_args *next;	/* chain of overlapping writes */
	struct inode *inode;
};

/*
 * Find a queued writepage request overlapping the page-index range
 * [idx_from, idx_to] in fi->writepages, or NULL.  Caller holds fi->lock.
 */
static struct fuse_writepage_args *fuse_find_writeback(struct fuse_inode *fi,
					    pgoff_t idx_from, pgoff_t idx_to)
{
	struct rb_node *node;

	for (node = fi->writepages.rb_node; node; ) {
		struct fuse_writepage_args *wpa =
			rb_entry(node, struct fuse_writepage_args,
				 writepages_entry);
		pgoff_t first_index;

		WARN_ON(get_fuse_inode(wpa->inode) != fi);
		first_index = wpa->ia.write.in.offset >> PAGE_SHIFT;
		if (idx_from >= first_index + wpa->ia.ap.num_pages)
			node = node->rb_right;
		else if (idx_to < first_index)
			node = node->rb_left;
		else
			return wpa;
	}
	return NULL;
}

M
Miklos Szeredi 已提交
411
/*
412
 * Check if any page in a range is under writeback
M
Miklos Szeredi 已提交
413 414 415 416
 *
 * This is currently done by walking the list of writepage requests
 * for the inode, which can be pretty inefficient.
 */
417 418
static bool fuse_range_is_writeback(struct inode *inode, pgoff_t idx_from,
				   pgoff_t idx_to)
M
Miklos Szeredi 已提交
419 420
{
	struct fuse_inode *fi = get_fuse_inode(inode);
421
	bool found;
M
Miklos Szeredi 已提交
422

423
	spin_lock(&fi->lock);
424
	found = fuse_find_writeback(fi, idx_from, idx_to);
425
	spin_unlock(&fi->lock);
M
Miklos Szeredi 已提交
426 427 428 429

	return found;
}

430 431 432 433 434
/* Single-page convenience wrapper around fuse_range_is_writeback() */
static inline bool fuse_page_is_writeback(struct inode *inode, pgoff_t index)
{
	return fuse_range_is_writeback(inode, index, index);
}

M
Miklos Szeredi 已提交
435 436 437 438 439 440
/*
 * Wait for page writeback to be completed.
 *
 * Since fuse doesn't rely on the VM writeback tracking, this has to
 * use some other means.
 */
static void fuse_wait_on_page_writeback(struct inode *inode, pgoff_t index)
{
	struct fuse_inode *fi = get_fuse_inode(inode);

	wait_event(fi->page_waitq, !fuse_page_is_writeback(inode, index));
}

448 449 450 451 452 453 454 455 456 457 458 459 460 461 462
/*
 * Wait for all pending writepages on the inode to finish.
 *
 * This is currently done by blocking further writes with FUSE_NOWRITE
 * and waiting for all sent writes to complete.
 *
 * This must be called under i_mutex, otherwise the FUSE_NOWRITE usage
 * could conflict with truncation.
 */
static void fuse_sync_writes(struct inode *inode)
{
	fuse_set_nowrite(inode);
	fuse_release_nowrite(inode);
}

463
/*
 * ->flush(): write back dirty state, wait for in-flight writepages,
 * then send FUSE_FLUSH unless the server declared it unimplemented.
 */
static int fuse_flush(struct file *file, fl_owner_t id)
{
	struct inode *inode = file_inode(file);
	struct fuse_mount *fm = get_fuse_mount(inode);
	struct fuse_file *ff = file->private_data;
	struct fuse_flush_in inarg;
	FUSE_ARGS(args);
	int err;

	if (fuse_is_bad(inode))
		return -EIO;

	err = write_inode_now(inode, 1);
	if (err)
		return err;

	/* fuse_sync_writes() requires i_mutex (see its comment) */
	inode_lock(inode);
	fuse_sync_writes(inode);
	inode_unlock(inode);

	err = filemap_check_errors(file->f_mapping);
	if (err)
		return err;

	err = 0;
	if (fm->fc->no_flush)
		goto inval_attr_out;

	memset(&inarg, 0, sizeof(inarg));
	inarg.fh = ff->fh;
	inarg.lock_owner = fuse_lock_owner_id(fm->fc, id);
	args.opcode = FUSE_FLUSH;
	args.nodeid = get_node_id(inode);
	args.in_numargs = 1;
	args.in_args[0].size = sizeof(inarg);
	args.in_args[0].value = &inarg;
	args.force = true;

	err = fuse_simple_request(fm, &args);
	if (err == -ENOSYS) {
		/* remember that the server doesn't implement FLUSH */
		fm->fc->no_flush = 1;
		err = 0;
	}

inval_attr_out:
	/*
	 * In memory i_blocks is not maintained by fuse, if writeback cache is
	 * enabled, i_blocks from cached attr may not be accurate.
	 */
	if (!err && fm->fc->writeback_cache)
		fuse_invalidate_attr(inode);
	return err;
}

517
/*
 * Build and send an FSYNC or FSYNCDIR request (opcode selects which).
 * Returns the request's result.
 */
int fuse_fsync_common(struct file *file, loff_t start, loff_t end,
		      int datasync, int opcode)
{
	struct inode *inode = file->f_mapping->host;
	struct fuse_mount *fm = get_fuse_mount(inode);
	struct fuse_file *ff = file->private_data;
	FUSE_ARGS(args);
	struct fuse_fsync_in inarg;

	memset(&inarg, 0, sizeof(inarg));
	inarg.fh = ff->fh;
	inarg.fsync_flags = datasync ? FUSE_FSYNC_FDATASYNC : 0;
	args.opcode = opcode;
	args.nodeid = get_node_id(inode);
	args.in_numargs = 1;
	args.in_args[0].size = sizeof(inarg);
	args.in_args[0].value = &inarg;
	return fuse_simple_request(fm, &args);
}

/*
 * ->fsync(): flush data and metadata, then send FUSE_FSYNC unless the
 * server previously returned ENOSYS for it.
 */
static int fuse_fsync(struct file *file, loff_t start, loff_t end,
		      int datasync)
{
	struct inode *inode = file->f_mapping->host;
	struct fuse_conn *fc = get_fuse_conn(inode);
	int err;

	if (fuse_is_bad(inode))
		return -EIO;

	inode_lock(inode);

	/*
	 * Start writeback against all dirty pages of the inode, then
	 * wait for all outstanding writes, before sending the FSYNC
	 * request.
	 */
	err = file_write_and_wait_range(file, start, end);
	if (err)
		goto out;

	fuse_sync_writes(inode);

	/*
	 * Due to implementation of fuse writeback
	 * file_write_and_wait_range() does not catch errors.
	 * We have to do this directly after fuse_sync_writes()
	 */
	err = file_check_and_advance_wb_err(file);
	if (err)
		goto out;

	err = sync_inode_metadata(inode, 1);
	if (err)
		goto out;

	if (fc->no_fsync)
		goto out;

	err = fuse_fsync_common(file, start, end, datasync, FUSE_FSYNC);
	if (err == -ENOSYS) {
		fc->no_fsync = 1;
		err = 0;
	}
out:
	inode_unlock(inode);

	return err;
}

587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606
/*
 * Fill in a READ (or READDIR/READDIRPLUS via opcode) request for the
 * given file, offset and byte count.  The out arg is variable-sized,
 * bounded by count.
 */
void fuse_read_args_fill(struct fuse_io_args *ia, struct file *file, loff_t pos,
			 size_t count, int opcode)
{
	struct fuse_file *ff = file->private_data;
	struct fuse_args *args = &ia->ap.args;
	struct fuse_read_in *in = &ia->read.in;

	in->fh = ff->fh;
	in->offset = pos;
	in->size = count;
	in->flags = file->f_flags;

	args->opcode = opcode;
	args->nodeid = ff->nodeid;
	args->in_numargs = 1;
	args->in_args[0].size = sizeof(*in);
	args->in_args[0].value = in;
	args->out_argvar = true;
	args->out_numargs = 1;
	args->out_args[0].size = count;
}

607 608
/*
 * Drop the page references taken for a direct-IO request, optionally
 * marking the pages dirty first (for reads into user memory).
 */
static void fuse_release_user_pages(struct fuse_args_pages *ap,
				    bool should_dirty)
{
	unsigned int idx;

	for (idx = 0; idx < ap->num_pages; idx++) {
		struct page *page = ap->pages[idx];

		if (should_dirty)
			set_page_dirty_lock(page);
		put_page(page);
	}
}

619 620 621 622 623
/* kref release callback for struct fuse_io_priv */
static void fuse_io_release(struct kref *kref)
{
	kfree(container_of(kref, struct fuse_io_priv, refcnt));
}

624 625 626 627 628 629 630 631 632 633 634
/*
 * Compute the final result of an async IO: a recorded error wins;
 * a short write is an error; otherwise report the shortest contiguous
 * transferred length (io->bytes), or the full size if no request was
 * short.
 */
static ssize_t fuse_get_res_by_io(struct fuse_io_priv *io)
{
	if (io->err)
		return io->err;

	if (io->write && io->bytes >= 0)
		return -EIO;

	if (io->bytes < 0)
		return io->size;

	return io->bytes;
}

M
Maxim Patlasov 已提交
635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661
/**
 * In case of short read, the caller sets 'pos' to the position of
 * actual end of fuse request in IO request. Otherwise, if bytes_requested
 * == bytes_transferred or rw == WRITE, the caller sets 'pos' to -1.
 *
 * An example:
 * User requested DIO read of 64K. It was splitted into two 32K fuse requests,
 * both submitted asynchronously. The first of them was ACKed by userspace as
 * fully completed (req->out.args[0].size == 32K) resulting in pos == -1. The
 * second request was ACKed as short, e.g. only 1K was read, resulting in
 * pos == 33K.
 *
 * Thus, when all fuse requests are completed, the minimal non-negative 'pos'
 * will be equal to the length of the longest contiguous fragment of
 * transferred data starting from the beginning of IO request.
 */
static void fuse_aio_complete(struct fuse_io_priv *io, int err, ssize_t pos)
{
	int left;

	spin_lock(&io->lock);
	if (err)
		io->err = io->err ? : err;	/* keep the first error */
	else if (pos >= 0 && (io->bytes < 0 || pos < io->bytes))
		io->bytes = pos;		/* track minimal short-read end */

	left = --io->reqs;
	if (!left && io->blocking)
		complete(io->done);
	spin_unlock(&io->lock);

	/* last request of a non-blocking io: finish the iocb */
	if (!left && !io->blocking) {
		ssize_t res = fuse_get_res_by_io(io);

		if (res >= 0) {
			struct inode *inode = file_inode(io->iocb->ki_filp);
			struct fuse_conn *fc = get_fuse_conn(inode);
			struct fuse_inode *fi = get_fuse_inode(inode);

			spin_lock(&fi->lock);
			fi->attr_version = atomic64_inc_return(&fc->attr_version);
			spin_unlock(&fi->lock);
		}

		io->iocb->ki_complete(io->iocb, res, 0);
	}

	kref_put(&io->refcnt, fuse_io_release);
}

685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703
/*
 * Allocate a fuse_io_args together with its page array/descriptors.
 * Returns NULL if either allocation fails.
 */
static struct fuse_io_args *fuse_io_alloc(struct fuse_io_priv *io,
					  unsigned int npages)
{
	struct fuse_io_args *ia = kzalloc(sizeof(*ia), GFP_KERNEL);

	if (!ia)
		return NULL;

	ia->io = io;
	ia->ap.pages = fuse_pages_alloc(npages, GFP_KERNEL, &ia->ap.descs);
	if (!ia->ap.pages) {
		kfree(ia);
		return NULL;
	}

	return ia;
}

/* Counterpart of fuse_io_alloc() */
static void fuse_io_free(struct fuse_io_args *ia)
{
	kfree(ia->ap.pages);
	kfree(ia);
}

709
/*
 * Completion callback for one async READ/WRITE request: release user
 * pages, translate a short transfer into a 'pos' for fuse_aio_complete()
 * and free the request.
 */
static void fuse_aio_complete_req(struct fuse_mount *fm, struct fuse_args *args,
				  int err)
{
	struct fuse_io_args *ia = container_of(args, typeof(*ia), ap.args);
	struct fuse_io_priv *io = ia->io;
	ssize_t pos = -1;	/* -1: full transfer (see fuse_aio_complete) */

	fuse_release_user_pages(&ia->ap, io->should_dirty);

	if (err) {
		/* Nothing */
	} else if (io->write) {
		/* server must never claim to have written more than asked */
		if (ia->write.out.size > ia->write.in.size) {
			err = -EIO;
		} else if (ia->write.in.size != ia->write.out.size) {
			pos = ia->write.in.offset - io->offset +
				ia->write.out.size;
		}
	} else {
		u32 outsize = args->out_args[0].size;

		if (ia->read.in.size != outsize)
			pos = ia->read.in.offset - io->offset + outsize;
	}

	fuse_aio_complete(io, err, pos);
	fuse_io_free(ia);
}

738
/*
 * Account num_bytes on the io and queue the request in the background.
 * On submission failure the completion path is run inline with the
 * error.  Always returns num_bytes; the real result arrives via
 * fuse_aio_complete_req().
 */
static ssize_t fuse_async_req_send(struct fuse_mount *fm,
				   struct fuse_io_args *ia, size_t num_bytes)
{
	ssize_t err;
	struct fuse_io_priv *io = ia->io;

	spin_lock(&io->lock);
	kref_get(&io->refcnt);	/* dropped in fuse_aio_complete() */
	io->size += num_bytes;
	io->reqs++;
	spin_unlock(&io->lock);

	ia->ap.args.end = fuse_aio_complete_req;
	ia->ap.args.may_block = io->should_dirty;
	err = fuse_simple_background(fm, &ia->ap.args, GFP_KERNEL);
	if (err)
		fuse_aio_complete_req(fm, &ia->ap.args, err);

	return num_bytes;
}

759 760
/*
 * Send one READ request, async or sync depending on the io.  If a lock
 * owner is given, it is scrambled and attached to the request.
 */
static ssize_t fuse_send_read(struct fuse_io_args *ia, loff_t pos, size_t count,
			      fl_owner_t owner)
{
	struct file *file = ia->io->iocb->ki_filp;
	struct fuse_file *ff = file->private_data;
	struct fuse_mount *fm = ff->fm;

	fuse_read_args_fill(ia, file, pos, count, FUSE_READ);
	if (owner != NULL) {
		ia->read.in.read_flags |= FUSE_READ_LOCKOWNER;
		ia->read.in.lock_owner = fuse_lock_owner_id(fm->fc, owner);
	}

	if (ia->io->async)
		return fuse_async_req_send(fm, ia, count);

	return fuse_simple_request(fm, &ia->ap.args);
}

778 779 780 781 782 783
/*
 * Shrink the cached i_size after a short read indicated EOF, but only
 * if the attributes haven't changed since the read was issued and no
 * size-changing operation is in flight.
 */
static void fuse_read_update_size(struct inode *inode, loff_t size,
				  u64 attr_ver)
{
	struct fuse_conn *fc = get_fuse_conn(inode);
	struct fuse_inode *fi = get_fuse_inode(inode);

	spin_lock(&fi->lock);
	if (attr_ver == fi->attr_version && size < inode->i_size &&
	    !test_bit(FUSE_I_SIZE_UNSTABLE, &fi->state)) {
		fi->attr_version = atomic64_inc_return(&fc->attr_version);
		i_size_write(inode, size);
	}
	spin_unlock(&fi->lock);
}

793
/*
 * Handle a short READ reply: with writeback cache, zero the tail of the
 * pages beyond what was read; otherwise treat it as EOF and shrink the
 * cached size.
 */
static void fuse_short_read(struct inode *inode, u64 attr_ver, size_t num_read,
			    struct fuse_args_pages *ap)
{
	struct fuse_conn *fc = get_fuse_conn(inode);

	if (fc->writeback_cache) {
		/*
		 * A hole in a file. Some data after the hole are in page cache,
		 * but have not reached the client fs yet. So, the hole is not
		 * present there.
		 */
		int i;
		int start_idx = num_read >> PAGE_SHIFT;
		size_t off = num_read & (PAGE_SIZE - 1);

		for (i = start_idx; i < ap->num_pages; i++) {
			zero_user_segment(ap->pages[i], off, PAGE_SIZE);
			off = 0;
		}
	} else {
		loff_t pos = page_offset(ap->pages[0]) + num_read;

		fuse_read_update_size(inode, pos, attr_ver);
	}
}

818
/*
 * Synchronously read one page-cache page from the server and mark it
 * uptodate.  Returns 0 or a negative error.
 */
static int fuse_do_readpage(struct file *file, struct page *page)
{
	struct inode *inode = page->mapping->host;
	struct fuse_mount *fm = get_fuse_mount(inode);
	loff_t pos = page_offset(page);
	struct fuse_page_desc desc = { .length = PAGE_SIZE };
	struct fuse_io_args ia = {
		.ap.args.page_zeroing = true,
		.ap.args.out_pages = true,
		.ap.num_pages = 1,
		.ap.pages = &page,
		.ap.descs = &desc,
	};
	ssize_t res;
	u64 attr_ver;

	/*
	 * Page writeback can extend beyond the lifetime of the
	 * page-cache page, so make sure we read a properly synced
	 * page.
	 */
	fuse_wait_on_page_writeback(inode, page->index);

	/* snapshot before the read, for fuse_short_read() to compare */
	attr_ver = fuse_get_attr_version(fm->fc);

	/* Don't overflow end offset */
	if (pos + (desc.length - 1) == LLONG_MAX)
		desc.length--;

	fuse_read_args_fill(&ia, file, pos, desc.length, FUSE_READ);
	res = fuse_simple_request(fm, &ia.ap.args);
	if (res < 0)
		return res;
	/*
	 * Short read means EOF.  If file size is larger, truncate it
	 */
	if (res < desc.length)
		fuse_short_read(inode, attr_ver, res, &ia.ap);

	SetPageUptodate(page);

	return 0;
}

/* ->readpage(): read one page, always unlocking it before returning */
static int fuse_readpage(struct file *file, struct page *page)
{
	struct inode *inode = page->mapping->host;
	int err;

	if (fuse_is_bad(inode)) {
		err = -EIO;
	} else {
		err = fuse_do_readpage(file, page);
		fuse_invalidate_atime(inode);
	}
	unlock_page(page);
	return err;
}

878
/*
 * Completion of a (possibly async) readahead request: handle short
 * reads, set page state, unlock and release the pages, and drop the
 * file reference taken for the async case.
 */
static void fuse_readpages_end(struct fuse_mount *fm, struct fuse_args *args,
			       int err)
{
	int i;
	struct fuse_io_args *ia = container_of(args, typeof(*ia), ap.args);
	struct fuse_args_pages *ap = &ia->ap;
	size_t count = ia->read.in.size;
	size_t num_read = args->out_args[0].size;
	struct address_space *mapping = NULL;

	/* find any page still attached to a mapping */
	for (i = 0; mapping == NULL && i < ap->num_pages; i++)
		mapping = ap->pages[i]->mapping;

	if (mapping) {
		struct inode *inode = mapping->host;

		/*
		 * Short read means EOF. If file size is larger, truncate it
		 */
		if (!err && num_read < count)
			fuse_short_read(inode, ia->read.attr_ver, num_read, ap);

		fuse_invalidate_atime(inode);
	}

	for (i = 0; i < ap->num_pages; i++) {
		struct page *page = ap->pages[i];

		if (!err)
			SetPageUptodate(page);
		else
			SetPageError(page);
		unlock_page(page);
		put_page(page);
	}
	if (ia->ff)
		fuse_file_put(ia->ff, false, false);

	fuse_io_free(ia);
}

919
/*
 * Send one batched readahead READ request, asynchronously when the
 * connection supports async_read, synchronously otherwise.  Completion
 * handling (including errors) goes through fuse_readpages_end().
 */
static void fuse_send_readpages(struct fuse_io_args *ia, struct file *file)
{
	struct fuse_file *ff = file->private_data;
	struct fuse_mount *fm = ff->fm;
	struct fuse_args_pages *ap = &ia->ap;
	loff_t pos = page_offset(ap->pages[0]);
	size_t count = ap->num_pages << PAGE_SHIFT;
	ssize_t res;
	int err;

	ap->args.out_pages = true;
	ap->args.page_zeroing = true;
	ap->args.page_replace = true;

	/* Don't overflow end offset */
	if (pos + (count - 1) == LLONG_MAX) {
		count--;
		ap->descs[ap->num_pages - 1].length--;
	}
	WARN_ON((loff_t) (pos + count) < 0);

	fuse_read_args_fill(ia, file, pos, count, FUSE_READ);
	ia->read.attr_ver = fuse_get_attr_version(fm->fc);
	if (fm->fc->async_read) {
		/* extra file ref is dropped in fuse_readpages_end() */
		ia->ff = fuse_file_get(ff);
		ap->args.end = fuse_readpages_end;
		err = fuse_simple_background(fm, &ap->args, GFP_KERNEL);
		if (!err)
			return;
	} else {
		res = fuse_simple_request(fm, &ap->args);
		err = res < 0 ? res : 0;
	}
	fuse_readpages_end(fm, &ap->args, err);
}

955
/*
 * ->readahead(): split the readahead window into batches bounded by the
 * connection's max_pages/max_read and send each batch.
 */
static void fuse_readahead(struct readahead_control *rac)
{
	struct inode *inode = rac->mapping->host;
	struct fuse_conn *fc = get_fuse_conn(inode);
	unsigned int i, max_pages, nr_pages = 0;

	if (fuse_is_bad(inode))
		return;

	max_pages = min_t(unsigned int, fc->max_pages,
			fc->max_read / PAGE_SIZE);

	for (;;) {
		struct fuse_io_args *ia;
		struct fuse_args_pages *ap;

		/* pages remaining in the window, capped per request */
		nr_pages = readahead_count(rac) - nr_pages;
		if (nr_pages > max_pages)
			nr_pages = max_pages;
		if (nr_pages == 0)
			break;
		ia = fuse_io_alloc(NULL, nr_pages);
		if (!ia)
			return;
		ap = &ia->ap;
		nr_pages = __readahead_batch(rac, ap->pages, nr_pages);
		for (i = 0; i < nr_pages; i++) {
			fuse_wait_on_page_writeback(inode,
						    readahead_index(rac) + i);
			ap->descs[i].length = PAGE_SIZE;
		}
		ap->num_pages = nr_pages;
		fuse_send_readpages(ia, rac->file);
	}
}

991
/* Cached read path: refresh attributes when needed, then use generic code */
static ssize_t fuse_cache_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
	struct inode *inode = iocb->ki_filp->f_mapping->host;
	struct fuse_conn *fc = get_fuse_conn(inode);

	/*
	 * In auto invalidate mode, always update attributes on read.
	 * Otherwise, only update if we attempt to read past EOF (to ensure
	 * i_size is up to date).
	 */
	if (fc->auto_inval_data ||
	    (iocb->ki_pos + iov_iter_count(to) > i_size_read(inode))) {
		int err;

		err = fuse_update_attributes(inode, iocb->ki_filp);
		if (err)
			return err;
	}

	return generic_file_read_iter(iocb, to);
}

1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022
/*
 * Fill in a WRITE request for the given file handle, offset and count.
 * Older protocol minors (< 9) use the shorter compat write_in struct.
 */
static void fuse_write_args_fill(struct fuse_io_args *ia, struct fuse_file *ff,
				 loff_t pos, size_t count)
{
	struct fuse_args *args = &ia->ap.args;

	ia->write.in.fh = ff->fh;
	ia->write.in.offset = pos;
	ia->write.in.size = count;
	args->opcode = FUSE_WRITE;
	args->nodeid = ff->nodeid;
	args->in_numargs = 2;
	if (ff->fm->fc->minor < 9)
		args->in_args[0].size = FUSE_COMPAT_WRITE_IN_SIZE;
	else
		args->in_args[0].size = sizeof(ia->write.in);
	args->in_args[0].value = &ia->write.in;
	args->in_args[1].size = count;
	args->out_numargs = 1;
	args->out_args[0].size = sizeof(ia->write.out);
	args->out_args[0].value = &ia->write.out;
}

/*
 * Derive the open-flag bits to send with a write request from the file's
 * f_flags, folding in per-iocb sync requirements.
 */
static unsigned int fuse_write_flags(struct kiocb *iocb)
{
	unsigned int open_flags = iocb->ki_filp->f_flags;
	int ki_flags = iocb->ki_flags;

	if (ki_flags & IOCB_DSYNC)
		open_flags |= O_DSYNC;
	if (ki_flags & IOCB_SYNC)
		open_flags |= O_SYNC;

	return open_flags;
}

1046 1047
/*
 * Send a single FUSE_WRITE request for [pos, pos + count).
 * Attaches the lock owner when provided.  In async mode the request is
 * queued and this returns immediately; otherwise it waits and returns the
 * number of bytes the server reported written (capped-checked to 'count').
 */
static ssize_t fuse_send_write(struct fuse_io_args *ia, loff_t pos,
			       size_t count, fl_owner_t owner)
{
	struct kiocb *iocb = ia->io->iocb;
	struct file *file = iocb->ki_filp;
	struct fuse_file *ff = file->private_data;
	struct fuse_mount *fm = ff->fm;
	struct fuse_write_in *inarg = &ia->write.in;
	ssize_t err;

	fuse_write_args_fill(ia, ff, pos, count);
	inarg->flags = fuse_write_flags(iocb);
	if (owner != NULL) {
		inarg->write_flags |= FUSE_WRITE_LOCKOWNER;
		inarg->lock_owner = fuse_lock_owner_id(fm->fc, owner);
	}

	if (ia->io->async)
		return fuse_async_req_send(fm, ia, count);

	err = fuse_simple_request(fm, &ia->ap.args);
	/* Server claiming to have written more than asked is a protocol error */
	if (!err && ia->write.out.size > count)
		err = -EIO;

	return err ?: ia->write.out.size;
}

M
Maxim Patlasov 已提交
1073
/*
 * Extend the cached i_size to 'pos' if the write went past it.
 * Bumps the attribute version under fi->lock.  Returns true if i_size
 * was actually updated.
 */
bool fuse_write_update_size(struct inode *inode, loff_t pos)
{
	struct fuse_conn *fc = get_fuse_conn(inode);
	struct fuse_inode *fi = get_fuse_inode(inode);
	bool ret = false;

	spin_lock(&fi->lock);
	fi->attr_version = atomic64_inc_return(&fc->attr_version);
	if (pos > inode->i_size) {
		i_size_write(inode, pos);
		ret = true;
	}
	spin_unlock(&fi->lock);

	return ret;
}

1090 1091 1092
/*
 * Send the filled page array as one synchronous FUSE_WRITE and then
 * release the pages.  On error, or on a short write for the partially
 * written tail page, the uptodate flag is cleared so the data is re-read.
 * The last page may still be locked (ia->write.page_locked) and is
 * unlocked here.
 */
static ssize_t fuse_send_write_pages(struct fuse_io_args *ia,
				     struct kiocb *iocb, struct inode *inode,
				     loff_t pos, size_t count)
{
	struct fuse_args_pages *ap = &ia->ap;
	struct file *file = iocb->ki_filp;
	struct fuse_file *ff = file->private_data;
	struct fuse_mount *fm = ff->fm;
	unsigned int offset, i;
	bool short_write;
	int err;

	for (i = 0; i < ap->num_pages; i++)
		fuse_wait_on_page_writeback(inode, ap->pages[i]->index);

	fuse_write_args_fill(ia, ff, pos, count);
	ia->write.in.flags = fuse_write_flags(iocb);

	err = fuse_simple_request(fm, &ap->args);
	if (!err && ia->write.out.size > count)
		err = -EIO;

	short_write = ia->write.out.size < count;
	offset = ap->descs[0].offset;
	count = ia->write.out.size;
	for (i = 0; i < ap->num_pages; i++) {
		struct page *page = ap->pages[i];

		if (err) {
			ClearPageUptodate(page);
		} else {
			if (count >= PAGE_SIZE - offset)
				count -= PAGE_SIZE - offset;
			else {
				if (short_write)
					ClearPageUptodate(page);
				count = 0;
			}
			offset = 0;
		}
		if (ia->write.page_locked && (i == ap->num_pages - 1))
			unlock_page(page);
		put_page(page);
	}

	return err;
}

V
Vivek Goyal 已提交
1138
/*
 * Copy user data from the iterator into freshly grabbed page-cache pages,
 * building up the page array of a write request.  Stops at max_pages,
 * fc->max_write, a non-uptodate partial page, or (without big_writes)
 * after the first page.  Returns the number of bytes gathered, or a
 * negative error if nothing was copied.
 */
static ssize_t fuse_fill_write_pages(struct fuse_io_args *ia,
				     struct address_space *mapping,
				     struct iov_iter *ii, loff_t pos,
				     unsigned int max_pages)
{
	struct fuse_args_pages *ap = &ia->ap;
	struct fuse_conn *fc = get_fuse_conn(mapping->host);
	unsigned offset = pos & (PAGE_SIZE - 1);
	size_t count = 0;
	int err;

	ap->args.in_pages = true;
	ap->descs[0].offset = offset;

	do {
		size_t tmp;
		struct page *page;
		pgoff_t index = pos >> PAGE_SHIFT;
		size_t bytes = min_t(size_t, PAGE_SIZE - offset,
				     iov_iter_count(ii));

		bytes = min_t(size_t, bytes, fc->max_write - count);

 again:
		err = -EFAULT;
		if (iov_iter_fault_in_readable(ii, bytes))
			break;

		err = -ENOMEM;
		page = grab_cache_page_write_begin(mapping, index, 0);
		if (!page)
			break;

		if (mapping_writably_mapped(mapping))
			flush_dcache_page(page);

		tmp = iov_iter_copy_from_user_atomic(page, ii, offset, bytes);
		flush_dcache_page(page);

		iov_iter_advance(ii, tmp);
		if (!tmp) {
			/* Atomic copy faulted: retry after shrinking request */
			unlock_page(page);
			put_page(page);
			bytes = min(bytes, iov_iter_single_seg_count(ii));
			goto again;
		}

		err = 0;
		ap->pages[ap->num_pages] = page;
		ap->descs[ap->num_pages].length = tmp;
		ap->num_pages++;

		count += tmp;
		pos += tmp;
		offset += tmp;
		if (offset == PAGE_SIZE)
			offset = 0;

		/* If we copied full page, mark it uptodate */
		if (tmp == PAGE_SIZE)
			SetPageUptodate(page);

		if (PageUptodate(page)) {
			unlock_page(page);
		} else {
			/* Keep partial page locked until the write completes */
			ia->write.page_locked = true;
			break;
		}
		if (!fc->big_writes)
			break;
	} while (iov_iter_count(ii) && count < fc->max_write &&
		 ap->num_pages < max_pages && offset == 0);

	return count > 0 ? count : err;
}

1214 1215
/*
 * Number of pages needed to cover [pos, pos + len), clamped to max_pages.
 */
static inline unsigned int fuse_wr_pages(loff_t pos, size_t len,
				     unsigned int max_pages)
{
	return min_t(unsigned int,
		     ((pos + len - 1) >> PAGE_SHIFT) -
		     (pos >> PAGE_SHIFT) + 1,
		     max_pages);
}

1223
/*
 * Buffered write loop: repeatedly gather pages from the iterator and send
 * them as FUSE_WRITE requests until the iterator is drained or an error /
 * short write occurs.  Marks i_size unstable while it may be extending the
 * file, updates i_size afterwards, and invalidates cached attributes.
 * Returns bytes written, or a negative error if nothing was written.
 */
static ssize_t fuse_perform_write(struct kiocb *iocb,
				  struct address_space *mapping,
				  struct iov_iter *ii, loff_t pos)
{
	struct inode *inode = mapping->host;
	struct fuse_conn *fc = get_fuse_conn(inode);
	struct fuse_inode *fi = get_fuse_inode(inode);
	int err = 0;
	ssize_t res = 0;

	if (inode->i_size < pos + iov_iter_count(ii))
		set_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);

	do {
		ssize_t count;
		struct fuse_io_args ia = {};
		struct fuse_args_pages *ap = &ia.ap;
		unsigned int nr_pages = fuse_wr_pages(pos, iov_iter_count(ii),
						      fc->max_pages);

		ap->pages = fuse_pages_alloc(nr_pages, GFP_KERNEL, &ap->descs);
		if (!ap->pages) {
			err = -ENOMEM;
			break;
		}

		count = fuse_fill_write_pages(&ia, mapping, ii, pos, nr_pages);
		if (count <= 0) {
			err = count;
		} else {
			err = fuse_send_write_pages(&ia, iocb, inode,
						    pos, count);
			if (!err) {
				size_t num_written = ia.write.out.size;

				res += num_written;
				pos += num_written;

				/* break out of the loop on short write */
				if (num_written != count)
					err = -EIO;
			}
		}
		kfree(ap->pages);
	} while (!err && iov_iter_count(ii));

	if (res > 0)
		fuse_write_update_size(inode, pos);

	clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);
	fuse_invalidate_attr(inode);

	return res > 0 ? res : err;
}

1278
static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from)
N
Nick Piggin 已提交
1279 1280 1281 1282
{
	struct file *file = iocb->ki_filp;
	struct address_space *mapping = file->f_mapping;
	ssize_t written = 0;
A
Anand Avati 已提交
1283
	ssize_t written_buffered = 0;
N
Nick Piggin 已提交
1284 1285
	struct inode *inode = mapping->host;
	ssize_t err;
A
Anand Avati 已提交
1286
	loff_t endbyte = 0;
N
Nick Piggin 已提交
1287

P
Pavel Emelyanov 已提交
1288 1289
	if (get_fuse_conn(inode)->writeback_cache) {
		/* Update size (EOF optimization) and mode (SUID clearing) */
M
Miklos Szeredi 已提交
1290
		err = fuse_update_attributes(mapping->host, file);
P
Pavel Emelyanov 已提交
1291 1292 1293
		if (err)
			return err;

A
Al Viro 已提交
1294
		return generic_file_write_iter(iocb, from);
P
Pavel Emelyanov 已提交
1295 1296
	}

A
Al Viro 已提交
1297
	inode_lock(inode);
N
Nick Piggin 已提交
1298 1299

	/* We can write back this queue in page reclaim */
1300
	current->backing_dev_info = inode_to_bdi(inode);
N
Nick Piggin 已提交
1301

1302 1303
	err = generic_write_checks(iocb, from);
	if (err <= 0)
N
Nick Piggin 已提交
1304 1305
		goto out;

1306
	err = file_remove_privs(file);
N
Nick Piggin 已提交
1307 1308 1309
	if (err)
		goto out;

1310 1311 1312
	err = file_update_time(file);
	if (err)
		goto out;
N
Nick Piggin 已提交
1313

1314
	if (iocb->ki_flags & IOCB_DIRECT) {
1315
		loff_t pos = iocb->ki_pos;
1316
		written = generic_file_direct_write(iocb, from);
A
Al Viro 已提交
1317
		if (written < 0 || !iov_iter_count(from))
A
Anand Avati 已提交
1318 1319 1320
			goto out;

		pos += written;
N
Nick Piggin 已提交
1321

1322
		written_buffered = fuse_perform_write(iocb, mapping, from, pos);
A
Anand Avati 已提交
1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334
		if (written_buffered < 0) {
			err = written_buffered;
			goto out;
		}
		endbyte = pos + written_buffered - 1;

		err = filemap_write_and_wait_range(file->f_mapping, pos,
						   endbyte);
		if (err)
			goto out;

		invalidate_mapping_pages(file->f_mapping,
1335 1336
					 pos >> PAGE_SHIFT,
					 endbyte >> PAGE_SHIFT);
A
Anand Avati 已提交
1337 1338 1339 1340

		written += written_buffered;
		iocb->ki_pos = pos + written_buffered;
	} else {
1341
		written = fuse_perform_write(iocb, mapping, from, iocb->ki_pos);
A
Anand Avati 已提交
1342
		if (written >= 0)
1343
			iocb->ki_pos += written;
A
Anand Avati 已提交
1344
	}
N
Nick Piggin 已提交
1345 1346
out:
	current->backing_dev_info = NULL;
A
Al Viro 已提交
1347
	inode_unlock(inode);
1348 1349
	if (written > 0)
		written = generic_write_sync(iocb, written);
N
Nick Piggin 已提交
1350 1351 1352 1353

	return written ? written : err;
}

1354 1355 1356
static inline void fuse_page_descs_length_init(struct fuse_page_desc *descs,
					       unsigned int index,
					       unsigned int nr_pages)
1357 1358 1359
{
	int i;

1360
	for (i = index; i < index + nr_pages; i++)
1361
		descs[i].length = PAGE_SIZE - descs[i].offset;
1362 1363
}

1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374
/* Current user address of the iterator's head segment. */
static inline unsigned long fuse_get_user_addr(const struct iov_iter *ii)
{
	unsigned long base = (unsigned long)ii->iov->iov_base;

	return base + ii->iov_offset;
}

/* Size of the iterator's head segment, capped at max_size. */
static inline size_t fuse_get_frag_size(const struct iov_iter *ii,
					size_t max_size)
{
	size_t seg = iov_iter_single_seg_count(ii);

	return min(seg, max_size);
}

1375 1376 1377
/*
 * Pin user pages backing the iterator into the request's page array
 * (up to max_pages), filling in per-page offset/length descriptors.
 * Kernel (kvec) iterators are passed by address directly instead.
 * On return *nbytesp holds the number of bytes actually covered.
 */
static int fuse_get_user_pages(struct fuse_args_pages *ap, struct iov_iter *ii,
			       size_t *nbytesp, int write,
			       unsigned int max_pages)
{
	size_t nbytes = 0;  /* # bytes already packed in req */
	ssize_t ret = 0;

	/* Special case for kernel I/O: can copy directly into the buffer */
	if (iov_iter_is_kvec(ii)) {
		unsigned long user_addr = fuse_get_user_addr(ii);
		size_t frag_size = fuse_get_frag_size(ii, *nbytesp);

		if (write)
			ap->args.in_args[1].value = (void *) user_addr;
		else
			ap->args.out_args[0].value = (void *) user_addr;

		iov_iter_advance(ii, frag_size);
		*nbytesp = frag_size;
		return 0;
	}

	while (nbytes < *nbytesp && ap->num_pages < max_pages) {
		unsigned npages;
		size_t start;
		ret = iov_iter_get_pages(ii, &ap->pages[ap->num_pages],
					*nbytesp - nbytes,
					max_pages - ap->num_pages,
					&start);
		if (ret < 0)
			break;

		iov_iter_advance(ii, ret);
		nbytes += ret;

		ret += start;
		npages = (ret + PAGE_SIZE - 1) / PAGE_SIZE;

		ap->descs[ap->num_pages].offset = start;
		fuse_page_descs_length_init(ap->descs, ap->num_pages, npages);

		ap->num_pages += npages;
		/* Trim the final descriptor to the actual tail length */
		ap->descs[ap->num_pages - 1].length -=
			(PAGE_SIZE - ret) & (PAGE_SIZE - 1);
	}

	if (write)
		ap->args.in_pages = true;
	else
		ap->args.out_pages = true;

	*nbytesp = nbytes;

	return ret < 0 ? ret : 0;
}

A
Al Viro 已提交
1431 1432
/*
 * Core direct I/O loop (used by both the FUSE direct_io path and CUSE).
 * Splits the iterator into requests of at most max_read/max_write bytes,
 * pins user pages for each, and sends them synchronously or (io->async)
 * in the background.  Syncs with cached writeback first unless called
 * from CUSE.  Returns bytes transferred, or a negative error if nothing
 * was transferred.
 */
ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter,
		       loff_t *ppos, int flags)
{
	int write = flags & FUSE_DIO_WRITE;
	int cuse = flags & FUSE_DIO_CUSE;
	struct file *file = io->iocb->ki_filp;
	struct inode *inode = file->f_mapping->host;
	struct fuse_file *ff = file->private_data;
	struct fuse_conn *fc = ff->fm->fc;
	size_t nmax = write ? fc->max_write : fc->max_read;
	loff_t pos = *ppos;
	size_t count = iov_iter_count(iter);
	pgoff_t idx_from = pos >> PAGE_SHIFT;
	pgoff_t idx_to = (pos + count - 1) >> PAGE_SHIFT;
	ssize_t res = 0;
	int err = 0;
	struct fuse_io_args *ia;
	unsigned int max_pages;

	max_pages = iov_iter_npages(iter, fc->max_pages);
	ia = fuse_io_alloc(io, max_pages);
	if (!ia)
		return -ENOMEM;

	ia->io = io;
	if (!cuse && fuse_range_is_writeback(inode, idx_from, idx_to)) {
		if (!write)
			inode_lock(inode);
		fuse_sync_writes(inode);
		if (!write)
			inode_unlock(inode);
	}

	io->should_dirty = !write && iter_is_iovec(iter);
	while (count) {
		ssize_t nres;
		fl_owner_t owner = current->files;
		size_t nbytes = min(count, nmax);

		err = fuse_get_user_pages(&ia->ap, iter, &nbytes, write,
					  max_pages);
		if (err && !nbytes)
			break;

		if (write) {
			if (!capable(CAP_FSETID))
				ia->write.in.write_flags |= FUSE_WRITE_KILL_SUIDGID;

			nres = fuse_send_write(ia, pos, nbytes, owner);
		} else {
			nres = fuse_send_read(ia, pos, nbytes, owner);
		}

		if (!io->async || nres < 0) {
			fuse_release_user_pages(&ia->ap, io->should_dirty);
			fuse_io_free(ia);
		}
		ia = NULL;
		if (nres < 0) {
			iov_iter_revert(iter, nbytes);
			err = nres;
			break;
		}
		WARN_ON(nres > nbytes);

		count -= nres;
		res += nres;
		pos += nres;
		if (nres != nbytes) {
			/* Short transfer: give back the unconsumed bytes */
			iov_iter_revert(iter, nbytes - nres);
			break;
		}
		if (count) {
			max_pages = iov_iter_npages(iter, fc->max_pages);
			ia = fuse_io_alloc(io, max_pages);
			if (!ia)
				break;
		}
	}
	if (ia)
		fuse_io_free(ia);
	if (res > 0)
		*ppos = pos;

	return res > 0 ? res : err;
}
EXPORT_SYMBOL_GPL(fuse_direct_io);
M
Miklos Szeredi 已提交
1518

1519
/*
 * Direct read helper: run fuse_direct_io() for a read and invalidate
 * the cached atime afterwards.
 */
static ssize_t __fuse_direct_read(struct fuse_io_priv *io,
				  struct iov_iter *iter,
				  loff_t *ppos)
{
	ssize_t res;
	struct inode *inode = file_inode(io->iocb->ki_filp);

	res = fuse_direct_io(io, iter, ppos, 0);

	fuse_invalidate_atime(inode);

	return res;
}

1533 1534
static ssize_t fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter);

1535
static ssize_t fuse_direct_read_iter(struct kiocb *iocb, struct iov_iter *to)
1536
{
1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547
	ssize_t res;

	if (!is_sync_kiocb(iocb) && iocb->ki_flags & IOCB_DIRECT) {
		res = fuse_direct_IO(iocb, to);
	} else {
		struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(iocb);

		res = __fuse_direct_read(&io, to, &iocb->ki_pos);
	}

	return res;
1548 1549
}

1550
static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from)
A
Anand Avati 已提交
1551
{
1552 1553
	struct inode *inode = file_inode(iocb->ki_filp);
	struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(iocb);
1554
	ssize_t res;
A
Anand Avati 已提交
1555 1556

	/* Don't allow parallel writes to the same file */
A
Al Viro 已提交
1557
	inode_lock(inode);
1558
	res = generic_write_checks(iocb, from);
1559 1560 1561 1562 1563 1564 1565 1566
	if (res > 0) {
		if (!is_sync_kiocb(iocb) && iocb->ki_flags & IOCB_DIRECT) {
			res = fuse_direct_IO(iocb, from);
		} else {
			res = fuse_direct_io(&io, from, &iocb->ki_pos,
					     FUSE_DIO_WRITE);
		}
	}
1567
	fuse_invalidate_attr(inode);
1568
	if (res > 0)
1569
		fuse_write_update_size(inode, iocb->ki_pos);
A
Al Viro 已提交
1570
	inode_unlock(inode);
A
Anand Avati 已提交
1571 1572 1573 1574

	return res;
}

1575 1576
static ssize_t fuse_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
1577 1578
	struct file *file = iocb->ki_filp;
	struct fuse_file *ff = file->private_data;
1579
	struct inode *inode = file_inode(file);
1580

M
Miklos Szeredi 已提交
1581
	if (fuse_is_bad(inode))
1582
		return -EIO;
1583

1584 1585 1586
	if (FUSE_IS_DAX(inode))
		return fuse_dax_read_iter(iocb, to);

1587 1588 1589 1590 1591 1592 1593 1594
	if (!(ff->open_flags & FOPEN_DIRECT_IO))
		return fuse_cache_read_iter(iocb, to);
	else
		return fuse_direct_read_iter(iocb, to);
}

static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
1595 1596
	struct file *file = iocb->ki_filp;
	struct fuse_file *ff = file->private_data;
1597
	struct inode *inode = file_inode(file);
1598

M
Miklos Szeredi 已提交
1599
	if (fuse_is_bad(inode))
1600
		return -EIO;
1601

1602 1603 1604
	if (FUSE_IS_DAX(inode))
		return fuse_dax_write_iter(iocb, from);

1605 1606 1607 1608 1609 1610
	if (!(ff->open_flags & FOPEN_DIRECT_IO))
		return fuse_cache_write_iter(iocb, from);
	else
		return fuse_direct_write_iter(iocb, from);
}

1611
static void fuse_writepage_free(struct fuse_writepage_args *wpa)
M
Miklos Szeredi 已提交
1612
{
1613
	struct fuse_args_pages *ap = &wpa->ia.ap;
1614 1615
	int i;

1616 1617 1618 1619 1620
	for (i = 0; i < ap->num_pages; i++)
		__free_page(ap->pages[i]);

	if (wpa->ia.ff)
		fuse_file_put(wpa->ia.ff, false, false);
1621

1622 1623
	kfree(ap->pages);
	kfree(wpa);
M
Miklos Szeredi 已提交
1624 1625
}

1626
static void fuse_writepage_finish(struct fuse_mount *fm,
1627
				  struct fuse_writepage_args *wpa)
M
Miklos Szeredi 已提交
1628
{
1629 1630
	struct fuse_args_pages *ap = &wpa->ia.ap;
	struct inode *inode = wpa->inode;
M
Miklos Szeredi 已提交
1631
	struct fuse_inode *fi = get_fuse_inode(inode);
1632
	struct backing_dev_info *bdi = inode_to_bdi(inode);
1633
	int i;
M
Miklos Szeredi 已提交
1634

1635
	for (i = 0; i < ap->num_pages; i++) {
1636
		dec_wb_stat(&bdi->wb, WB_WRITEBACK);
1637
		dec_node_page_state(ap->pages[i], NR_WRITEBACK_TEMP);
1638
		wb_writeout_inc(&bdi->wb);
1639
	}
M
Miklos Szeredi 已提交
1640 1641 1642
	wake_up(&fi->page_waitq);
}

1643
/* Called under fi->lock, may release and reacquire it */
1644
static void fuse_send_writepage(struct fuse_mount *fm,
1645
				struct fuse_writepage_args *wpa, loff_t size)
1646 1647
__releases(fi->lock)
__acquires(fi->lock)
M
Miklos Szeredi 已提交
1648
{
1649 1650 1651 1652 1653 1654
	struct fuse_writepage_args *aux, *next;
	struct fuse_inode *fi = get_fuse_inode(wpa->inode);
	struct fuse_write_in *inarg = &wpa->ia.write.in;
	struct fuse_args *args = &wpa->ia.ap.args;
	__u64 data_size = wpa->ia.ap.num_pages * PAGE_SIZE;
	int err;
M
Miklos Szeredi 已提交
1655

1656
	fi->writectr++;
1657 1658
	if (inarg->offset + data_size <= size) {
		inarg->size = data_size;
M
Miklos Szeredi 已提交
1659
	} else if (inarg->offset < size) {
1660
		inarg->size = size - inarg->offset;
M
Miklos Szeredi 已提交
1661 1662 1663
	} else {
		/* Got truncated off completely */
		goto out_free;
M
Miklos Szeredi 已提交
1664
	}
M
Miklos Szeredi 已提交
1665

1666 1667 1668 1669
	args->in_args[1].size = inarg->size;
	args->force = true;
	args->nocreds = true;

1670
	err = fuse_simple_background(fm, args, GFP_ATOMIC);
1671 1672
	if (err == -ENOMEM) {
		spin_unlock(&fi->lock);
1673
		err = fuse_simple_background(fm, args, GFP_NOFS | __GFP_NOFAIL);
1674 1675 1676
		spin_lock(&fi->lock);
	}

1677
	/* Fails on broken connection only */
1678
	if (unlikely(err))
1679 1680
		goto out_free;

M
Miklos Szeredi 已提交
1681 1682 1683
	return;

 out_free:
1684
	fi->writectr--;
1685
	rb_erase(&wpa->writepages_entry, &fi->writepages);
1686
	fuse_writepage_finish(fm, wpa);
1687
	spin_unlock(&fi->lock);
M
Miklos Szeredi 已提交
1688 1689

	/* After fuse_writepage_finish() aux request list is private */
1690 1691 1692 1693
	for (aux = wpa->next; aux; aux = next) {
		next = aux->next;
		aux->next = NULL;
		fuse_writepage_free(aux);
M
Miklos Szeredi 已提交
1694 1695
	}

1696
	fuse_writepage_free(wpa);
1697
	spin_lock(&fi->lock);
M
Miklos Szeredi 已提交
1698 1699
}

M
Miklos Szeredi 已提交
1700 1701 1702 1703
/*
 * If fi->writectr is positive (no truncate or fsync going on) send
 * all queued writepage requests.
 *
1704
 * Called with fi->lock
M
Miklos Szeredi 已提交
1705 1706
 */
void fuse_flush_writepages(struct inode *inode)
1707 1708
__releases(fi->lock)
__acquires(fi->lock)
M
Miklos Szeredi 已提交
1709
{
1710
	struct fuse_mount *fm = get_fuse_mount(inode);
M
Miklos Szeredi 已提交
1711
	struct fuse_inode *fi = get_fuse_inode(inode);
M
Miklos Szeredi 已提交
1712
	loff_t crop = i_size_read(inode);
1713
	struct fuse_writepage_args *wpa;
M
Miklos Szeredi 已提交
1714 1715

	while (fi->writectr >= 0 && !list_empty(&fi->queued_writes)) {
1716 1717 1718
		wpa = list_entry(fi->queued_writes.next,
				 struct fuse_writepage_args, queue_entry);
		list_del_init(&wpa->queue_entry);
1719
		fuse_send_writepage(fm, wpa, crop);
M
Miklos Szeredi 已提交
1720 1721 1722
	}
}

1723 1724
static struct fuse_writepage_args *fuse_insert_writeback(struct rb_root *root,
						struct fuse_writepage_args *wpa)
1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746
{
	pgoff_t idx_from = wpa->ia.write.in.offset >> PAGE_SHIFT;
	pgoff_t idx_to = idx_from + wpa->ia.ap.num_pages - 1;
	struct rb_node **p = &root->rb_node;
	struct rb_node  *parent = NULL;

	WARN_ON(!wpa->ia.ap.num_pages);
	while (*p) {
		struct fuse_writepage_args *curr;
		pgoff_t curr_index;

		parent = *p;
		curr = rb_entry(parent, struct fuse_writepage_args,
				writepages_entry);
		WARN_ON(curr->inode != wpa->inode);
		curr_index = curr->ia.write.in.offset >> PAGE_SHIFT;

		if (idx_from >= curr_index + curr->ia.ap.num_pages)
			p = &(*p)->rb_right;
		else if (idx_to < curr_index)
			p = &(*p)->rb_left;
		else
1747
			return curr;
1748 1749 1750 1751
	}

	rb_link_node(&wpa->writepages_entry, parent, p);
	rb_insert_color(&wpa->writepages_entry, root);
1752 1753 1754 1755 1756 1757
	return NULL;
}

/* Insert into the writepages tree; overlap with an existing entry is a bug. */
static void tree_insert(struct rb_root *root, struct fuse_writepage_args *wpa)
{
	WARN_ON(fuse_insert_writeback(root, wpa));
}

1760
static void fuse_writepage_end(struct fuse_mount *fm, struct fuse_args *args,
1761
			       int error)
M
Miklos Szeredi 已提交
1762
{
1763 1764 1765
	struct fuse_writepage_args *wpa =
		container_of(args, typeof(*wpa), ia.ap.args);
	struct inode *inode = wpa->inode;
M
Miklos Szeredi 已提交
1766
	struct fuse_inode *fi = get_fuse_inode(inode);
1767
	struct fuse_conn *fc = get_fuse_conn(inode);
M
Miklos Szeredi 已提交
1768

1769
	mapping_set_error(inode->i_mapping, error);
1770 1771 1772 1773 1774 1775 1776 1777
	/*
	 * A writeback finished and this might have updated mtime/ctime on
	 * server making local mtime/ctime stale.  Hence invalidate attrs.
	 * Do this only if writeback_cache is not enabled.  If writeback_cache
	 * is enabled, we trust local ctime/mtime.
	 */
	if (!fc->writeback_cache)
		fuse_invalidate_attr(inode);
1778
	spin_lock(&fi->lock);
1779
	rb_erase(&wpa->writepages_entry, &fi->writepages);
1780
	while (wpa->next) {
1781
		struct fuse_mount *fm = get_fuse_mount(inode);
1782 1783 1784 1785 1786 1787
		struct fuse_write_in *inarg = &wpa->ia.write.in;
		struct fuse_writepage_args *next = wpa->next;

		wpa->next = next->next;
		next->next = NULL;
		next->ia.ff = fuse_file_get(wpa->ia.ff);
1788
		tree_insert(&fi->writepages, next);
1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812

		/*
		 * Skip fuse_flush_writepages() to make it easy to crop requests
		 * based on primary request size.
		 *
		 * 1st case (trivial): there are no concurrent activities using
		 * fuse_set/release_nowrite.  Then we're on safe side because
		 * fuse_flush_writepages() would call fuse_send_writepage()
		 * anyway.
		 *
		 * 2nd case: someone called fuse_set_nowrite and it is waiting
		 * now for completion of all in-flight requests.  This happens
		 * rarely and no more than once per page, so this should be
		 * okay.
		 *
		 * 3rd case: someone (e.g. fuse_do_setattr()) is in the middle
		 * of fuse_set_nowrite..fuse_release_nowrite section.  The fact
		 * that fuse_set_nowrite returned implies that all in-flight
		 * requests were completed along with all of their secondary
		 * requests.  Further primary requests are blocked by negative
		 * writectr.  Hence there cannot be any in-flight requests and
		 * no invocations of fuse_writepage_end() while we're in
		 * fuse_set_nowrite..fuse_release_nowrite section.
		 */
1813
		fuse_send_writepage(fm, next, inarg->offset + inarg->size);
1814
	}
M
Miklos Szeredi 已提交
1815
	fi->writectr--;
1816
	fuse_writepage_finish(fm, wpa);
1817
	spin_unlock(&fi->lock);
1818
	fuse_writepage_free(wpa);
M
Miklos Szeredi 已提交
1819 1820
}

M
Miklos Szeredi 已提交
1821 1822
static struct fuse_file *__fuse_write_file_get(struct fuse_conn *fc,
					       struct fuse_inode *fi)
1823
{
M
Miklos Szeredi 已提交
1824
	struct fuse_file *ff = NULL;
1825

1826
	spin_lock(&fi->lock);
M
Miklos Szeredi 已提交
1827
	if (!list_empty(&fi->write_files)) {
M
Miklos Szeredi 已提交
1828 1829 1830 1831
		ff = list_entry(fi->write_files.next, struct fuse_file,
				write_entry);
		fuse_file_get(ff);
	}
1832
	spin_unlock(&fi->lock);
1833 1834 1835 1836

	return ff;
}

M
Miklos Szeredi 已提交
1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851
/* As __fuse_write_file_get(), but a missing writable file is a bug. */
static struct fuse_file *fuse_write_file_get(struct fuse_conn *fc,
					     struct fuse_inode *fi)
{
	struct fuse_file *ff;

	ff = __fuse_write_file_get(fc, fi);
	WARN_ON(!ff);

	return ff;
}

int fuse_write_inode(struct inode *inode, struct writeback_control *wbc)
{
	struct fuse_conn *fc = get_fuse_conn(inode);
	struct fuse_inode *fi = get_fuse_inode(inode);
	struct fuse_file *ff;
	int err;

1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862
	/*
	 * Inode is always written before the last reference is dropped and
	 * hence this should not be reached from reclaim.
	 *
	 * Writing back the inode from reclaim can deadlock if the request
	 * processing itself needs an allocation.  Allocations triggering
	 * reclaim while serving a request can't be prevented, because it can
	 * involve any number of unrelated userspace processes.
	 */
	WARN_ON(wbc->for_reclaim);

M
Miklos Szeredi 已提交
1863
	ff = __fuse_write_file_get(fc, fi);
1864
	err = fuse_flush_times(inode, ff);
M
Miklos Szeredi 已提交
1865
	if (ff)
1866
		fuse_file_put(ff, false, false);
M
Miklos Szeredi 已提交
1867 1868 1869 1870

	return err;
}

1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889
/*
 * Allocate a zeroed writepage request with room for one page descriptor.
 * Returns NULL on allocation failure.
 */
static struct fuse_writepage_args *fuse_writepage_args_alloc(void)
{
	struct fuse_writepage_args *wpa;

	wpa = kzalloc(sizeof(*wpa), GFP_NOFS);
	if (!wpa)
		return NULL;

	wpa->ia.ap.num_pages = 0;
	wpa->ia.ap.pages = fuse_pages_alloc(1, GFP_NOFS, &wpa->ia.ap.descs);
	if (!wpa->ia.ap.pages) {
		kfree(wpa);
		return NULL;
	}

	return wpa;
}

M
Miklos Szeredi 已提交
1890 1891 1892 1893 1894 1895
static int fuse_writepage_locked(struct page *page)
{
	struct address_space *mapping = page->mapping;
	struct inode *inode = mapping->host;
	struct fuse_conn *fc = get_fuse_conn(inode);
	struct fuse_inode *fi = get_fuse_inode(inode);
1896 1897
	struct fuse_writepage_args *wpa;
	struct fuse_args_pages *ap;
M
Miklos Szeredi 已提交
1898
	struct page *tmp_page;
M
Miklos Szeredi 已提交
1899
	int error = -ENOMEM;
M
Miklos Szeredi 已提交
1900 1901 1902

	set_page_writeback(page);

1903 1904
	wpa = fuse_writepage_args_alloc();
	if (!wpa)
M
Miklos Szeredi 已提交
1905
		goto err;
1906
	ap = &wpa->ia.ap;
M
Miklos Szeredi 已提交
1907 1908 1909 1910 1911

	tmp_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
	if (!tmp_page)
		goto err_free;

M
Miklos Szeredi 已提交
1912
	error = -EIO;
1913 1914
	wpa->ia.ff = fuse_write_file_get(fc, fi);
	if (!wpa->ia.ff)
1915
		goto err_nofile;
M
Miklos Szeredi 已提交
1916

1917
	fuse_write_args_fill(&wpa->ia, wpa->ia.ff, page_offset(page), 0);
M
Miklos Szeredi 已提交
1918 1919

	copy_highpage(tmp_page, page);
1920 1921 1922 1923 1924 1925 1926 1927 1928
	wpa->ia.write.in.write_flags |= FUSE_WRITE_CACHE;
	wpa->next = NULL;
	ap->args.in_pages = true;
	ap->num_pages = 1;
	ap->pages[0] = tmp_page;
	ap->descs[0].offset = 0;
	ap->descs[0].length = PAGE_SIZE;
	ap->args.end = fuse_writepage_end;
	wpa->inode = inode;
M
Miklos Szeredi 已提交
1929

1930
	inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK);
1931
	inc_node_page_state(tmp_page, NR_WRITEBACK_TEMP);
M
Miklos Szeredi 已提交
1932

1933
	spin_lock(&fi->lock);
1934
	tree_insert(&fi->writepages, wpa);
1935
	list_add_tail(&wpa->queue_entry, &fi->queued_writes);
M
Miklos Szeredi 已提交
1936
	fuse_flush_writepages(inode);
1937
	spin_unlock(&fi->lock);
M
Miklos Szeredi 已提交
1938

1939 1940
	end_page_writeback(page);

M
Miklos Szeredi 已提交
1941 1942
	return 0;

1943 1944
err_nofile:
	__free_page(tmp_page);
M
Miklos Szeredi 已提交
1945
err_free:
1946
	kfree(wpa);
M
Miklos Szeredi 已提交
1947
err:
1948
	mapping_set_error(page->mapping, error);
M
Miklos Szeredi 已提交
1949
	end_page_writeback(page);
M
Miklos Szeredi 已提交
1950
	return error;
M
Miklos Szeredi 已提交
1951 1952 1953 1954 1955 1956
}

static int fuse_writepage(struct page *page, struct writeback_control *wbc)
{
	int err;

1957 1958 1959 1960 1961 1962 1963 1964 1965
	if (fuse_page_is_writeback(page->mapping->host, page->index)) {
		/*
		 * ->writepages() should be called for sync() and friends.  We
		 * should only get here on direct reclaim and then we are
		 * allowed to skip a page which is already in flight
		 */
		WARN_ON(wbc->sync_mode == WB_SYNC_ALL);

		redirty_page_for_writepage(wbc, page);
1966
		unlock_page(page);
1967 1968 1969
		return 0;
	}

M
Miklos Szeredi 已提交
1970 1971 1972 1973 1974 1975
	err = fuse_writepage_locked(page);
	unlock_page(page);

	return err;
}

1976
/* Per-call state threaded through the ->writepages() page iteration. */
struct fuse_fill_wb_data {
	struct fuse_writepage_args *wpa;	/* request being assembled */
	struct fuse_file *ff;			/* file used for the writes */
	struct inode *inode;
	struct page **orig_pages;		/* original (non-temp) pages */
	unsigned int max_pages;			/* capacity of wpa's page array */
};

1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009
/*
 * Grow the page/descriptor arrays of the request being assembled.
 *
 * The new capacity is double the current one, clamped to at least
 * FUSE_DEFAULT_MAX_PAGES_PER_REQ and at most fc->max_pages.  Existing
 * entries are carried over.  Returns false if allocation fails (the old
 * arrays remain valid in that case).
 */
static bool fuse_pages_realloc(struct fuse_fill_wb_data *data)
{
	struct fuse_args_pages *ap = &data->wpa->ia.ap;
	struct fuse_conn *fc = get_fuse_conn(data->inode);
	struct fuse_page_desc *new_descs;
	struct page **new_pages;
	unsigned int nr_alloc;

	nr_alloc = max_t(unsigned int, data->max_pages * 2,
			 FUSE_DEFAULT_MAX_PAGES_PER_REQ);
	nr_alloc = min_t(unsigned int, nr_alloc, fc->max_pages);
	WARN_ON(nr_alloc <= data->max_pages);

	new_pages = fuse_pages_alloc(nr_alloc, GFP_NOFS, &new_descs);
	if (!new_pages)
		return false;

	memcpy(new_pages, ap->pages, sizeof(struct page *) * ap->num_pages);
	memcpy(new_descs, ap->descs,
	       sizeof(struct fuse_page_desc) * ap->num_pages);
	kfree(ap->pages);
	ap->pages = new_pages;
	ap->descs = new_descs;
	data->max_pages = nr_alloc;

	return true;
}

2010 2011
static void fuse_writepages_send(struct fuse_fill_wb_data *data)
{
2012
	struct fuse_writepage_args *wpa = data->wpa;
2013 2014
	struct inode *inode = data->inode;
	struct fuse_inode *fi = get_fuse_inode(inode);
2015
	int num_pages = wpa->ia.ap.num_pages;
2016
	int i;
2017

2018
	wpa->ia.ff = fuse_file_get(data->ff);
2019
	spin_lock(&fi->lock);
2020
	list_add_tail(&wpa->queue_entry, &fi->queued_writes);
2021
	fuse_flush_writepages(inode);
2022
	spin_unlock(&fi->lock);
2023 2024 2025

	for (i = 0; i < num_pages; i++)
		end_page_writeback(data->orig_pages[i]);
2026 2027
}

2028
/*
2029 2030
 * Check under fi->lock if the page is under writeback, and insert it onto the
 * rb_tree if not. Otherwise iterate auxiliary write requests, to see if there's
2031 2032
 * one already added for a page at this offset.  If there's none, then insert
 * this new request onto the auxiliary list, otherwise reuse the existing one by
2033
 * swapping the new temp page with the old one.
2034
 */
2035 2036
static bool fuse_writepage_add(struct fuse_writepage_args *new_wpa,
			       struct page *page)
2037
{
2038 2039 2040 2041
	struct fuse_inode *fi = get_fuse_inode(new_wpa->inode);
	struct fuse_writepage_args *tmp;
	struct fuse_writepage_args *old_wpa;
	struct fuse_args_pages *new_ap = &new_wpa->ia.ap;
2042

2043
	WARN_ON(new_ap->num_pages != 0);
2044
	new_ap->num_pages = 1;
2045

2046
	spin_lock(&fi->lock);
2047
	old_wpa = fuse_insert_writeback(&fi->writepages, new_wpa);
2048
	if (!old_wpa) {
2049
		spin_unlock(&fi->lock);
2050
		return true;
2051
	}
2052

2053
	for (tmp = old_wpa->next; tmp; tmp = tmp->next) {
2054 2055
		pgoff_t curr_index;

2056 2057
		WARN_ON(tmp->inode != new_wpa->inode);
		curr_index = tmp->ia.write.in.offset >> PAGE_SHIFT;
2058
		if (curr_index == page->index) {
2059 2060
			WARN_ON(tmp->ia.ap.num_pages != 1);
			swap(tmp->ia.ap.pages[0], new_ap->pages[0]);
2061
			break;
2062 2063 2064
		}
	}

2065
	if (!tmp) {
2066 2067
		new_wpa->next = old_wpa->next;
		old_wpa->next = new_wpa;
2068
	}
2069

2070
	spin_unlock(&fi->lock);
2071 2072

	if (tmp) {
2073
		struct backing_dev_info *bdi = inode_to_bdi(new_wpa->inode);
2074

2075
		dec_wb_stat(&bdi->wb, WB_WRITEBACK);
2076
		dec_node_page_state(new_ap->pages[0], NR_WRITEBACK_TEMP);
2077
		wb_writeout_inc(&bdi->wb);
2078
		fuse_writepage_free(new_wpa);
2079
	}
2080

2081
	return false;
2082 2083
}

2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117
/*
 * Decide whether the request being assembled must be flushed before
 * @page can be appended to it.
 *
 * The conditions are evaluated in order and short-circuit; the realloc
 * check deliberately comes last because it grows the page array as a
 * side effect when more room can still be obtained.
 */
static bool fuse_writepage_need_send(struct fuse_conn *fc, struct page *page,
				     struct fuse_args_pages *ap,
				     struct fuse_fill_wb_data *data)
{
	unsigned int nr = ap->num_pages;

	WARN_ON(!nr);

	/*
	 * Being under writeback is unlikely but possible.  For example direct
	 * read to an mmaped fuse file will set the page dirty twice; once when
	 * the pages are faulted with get_user_pages(), and then after the read
	 * completed.
	 */
	return fuse_page_is_writeback(data->inode, page->index) ||
		/* reached the per-request page limit */
		nr == fc->max_pages ||
		/* reached the per-request byte limit */
		(nr + 1) * PAGE_SIZE > fc->max_write ||
		/* page is not contiguous with the previous one */
		data->orig_pages[nr - 1]->index + 1 != page->index ||
		/* array full and could not be grown */
		(nr == data->max_pages && !fuse_pages_realloc(data));
}

2118 2119 2120 2121
static int fuse_writepages_fill(struct page *page,
		struct writeback_control *wbc, void *_data)
{
	struct fuse_fill_wb_data *data = _data;
2122 2123
	struct fuse_writepage_args *wpa = data->wpa;
	struct fuse_args_pages *ap = &wpa->ia.ap;
2124
	struct inode *inode = data->inode;
2125
	struct fuse_inode *fi = get_fuse_inode(inode);
2126 2127 2128 2129 2130 2131
	struct fuse_conn *fc = get_fuse_conn(inode);
	struct page *tmp_page;
	int err;

	if (!data->ff) {
		err = -EIO;
2132
		data->ff = fuse_write_file_get(fc, fi);
2133 2134 2135 2136
		if (!data->ff)
			goto out_unlock;
	}

2137
	if (wpa && fuse_writepage_need_send(fc, page, ap, data)) {
2138
		fuse_writepages_send(data);
2139
		data->wpa = NULL;
2140
	}
M
Miklos Szeredi 已提交
2141

2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155
	err = -ENOMEM;
	tmp_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
	if (!tmp_page)
		goto out_unlock;

	/*
	 * The page must not be redirtied until the writeout is completed
	 * (i.e. userspace has sent a reply to the write request).  Otherwise
	 * there could be more than one temporary page instance for each real
	 * page.
	 *
	 * This is ensured by holding the page lock in page_mkwrite() while
	 * checking fuse_page_is_writeback().  We already hold the page lock
	 * since clear_page_dirty_for_io() and keep it held until we add the
2156
	 * request to the fi->writepages list and increment ap->num_pages.
2157 2158 2159
	 * After this fuse_page_is_writeback() will indicate that the page is
	 * under writeback, so we can release the page lock.
	 */
2160
	if (data->wpa == NULL) {
2161
		err = -ENOMEM;
2162 2163
		wpa = fuse_writepage_args_alloc();
		if (!wpa) {
2164 2165 2166
			__free_page(tmp_page);
			goto out_unlock;
		}
2167
		data->max_pages = 1;
2168

2169 2170 2171 2172 2173 2174 2175 2176
		ap = &wpa->ia.ap;
		fuse_write_args_fill(&wpa->ia, data->ff, page_offset(page), 0);
		wpa->ia.write.in.write_flags |= FUSE_WRITE_CACHE;
		wpa->next = NULL;
		ap->args.in_pages = true;
		ap->args.end = fuse_writepage_end;
		ap->num_pages = 0;
		wpa->inode = inode;
2177 2178 2179 2180
	}
	set_page_writeback(page);

	copy_highpage(tmp_page, page);
2181 2182 2183
	ap->pages[ap->num_pages] = tmp_page;
	ap->descs[ap->num_pages].offset = 0;
	ap->descs[ap->num_pages].length = PAGE_SIZE;
2184
	data->orig_pages[ap->num_pages] = page;
2185

2186
	inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK);
2187
	inc_node_page_state(tmp_page, NR_WRITEBACK_TEMP);
2188 2189

	err = 0;
2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200
	if (data->wpa) {
		/*
		 * Protected by fi->lock against concurrent access by
		 * fuse_page_is_writeback().
		 */
		spin_lock(&fi->lock);
		ap->num_pages++;
		spin_unlock(&fi->lock);
	} else if (fuse_writepage_add(wpa, page)) {
		data->wpa = wpa;
	} else {
2201 2202
		end_page_writeback(page);
	}
2203 2204 2205 2206 2207 2208 2209 2210 2211 2212
out_unlock:
	unlock_page(page);

	return err;
}

static int fuse_writepages(struct address_space *mapping,
			   struct writeback_control *wbc)
{
	struct inode *inode = mapping->host;
2213
	struct fuse_conn *fc = get_fuse_conn(inode);
2214 2215 2216 2217
	struct fuse_fill_wb_data data;
	int err;

	err = -EIO;
M
Miklos Szeredi 已提交
2218
	if (fuse_is_bad(inode))
2219 2220 2221
		goto out;

	data.inode = inode;
2222
	data.wpa = NULL;
2223 2224
	data.ff = NULL;

2225
	err = -ENOMEM;
2226
	data.orig_pages = kcalloc(fc->max_pages,
2227
				  sizeof(struct page *),
2228 2229 2230 2231
				  GFP_NOFS);
	if (!data.orig_pages)
		goto out;

2232
	err = write_cache_pages(mapping, wbc, fuse_writepages_fill, &data);
2233 2234
	if (data.wpa) {
		WARN_ON(!data.wpa->ia.ap.num_pages);
2235 2236 2237
		fuse_writepages_send(&data);
	}
	if (data.ff)
2238
		fuse_file_put(data.ff, false, false);
2239 2240

	kfree(data.orig_pages);
2241 2242 2243 2244
out:
	return err;
}

2245 2246 2247 2248 2249 2250 2251 2252
/*
 * It's worthy to make sure that space is reserved on disk for the write,
 * but how to implement it without killing performance need more thinking.
 */
static int fuse_write_begin(struct file *file, struct address_space *mapping,
		loff_t pos, unsigned len, unsigned flags,
		struct page **pagep, void **fsdata)
{
	pgoff_t index = pos >> PAGE_SHIFT;
	struct fuse_conn *fc = get_fuse_conn(file_inode(file));
	struct page *page;
	loff_t fsize;
	int err = -ENOMEM;

	WARN_ON(!fc->writeback_cache);

	page = grab_cache_page_write_begin(mapping, index, flags);
	if (!page)
		goto error;

	fuse_wait_on_page_writeback(mapping->host, page->index);

	/* Fully overwritten or already cached pages need no read-in */
	if (PageUptodate(page) || len == PAGE_SIZE)
		goto success;
	/*
	 * Check if the start this page comes after the end of file, in which
	 * case the readpage can be optimized away.
	 */
	fsize = i_size_read(mapping->host);
	if (fsize <= (pos & PAGE_MASK)) {
		size_t off = pos & ~PAGE_MASK;
		if (off)
			zero_user_segment(page, 0, off);
		goto success;
	}
	err = fuse_do_readpage(file, page);
	if (err)
		goto cleanup;
success:
	*pagep = page;
	return 0;

cleanup:
	unlock_page(page);
	put_page(page);
error:
	return err;
}

/*
 * ->write_end() for the writeback-cache path: zero the unwritten tail of
 * a partially filled page, update the cached size and mark the page dirty.
 * Returns the number of bytes actually copied.
 */
static int fuse_write_end(struct file *file, struct address_space *mapping,
		loff_t pos, unsigned len, unsigned copied,
		struct page *page, void *fsdata)
{
	struct inode *inode = page->mapping->host;

	/* Haven't copied anything?  Skip zeroing, size extending, dirtying. */
	if (!copied)
		goto unlock;

	if (!PageUptodate(page)) {
		/* Zero any unwritten bytes at the end of the page */
		size_t endoff = (pos + copied) & ~PAGE_MASK;
		if (endoff)
			zero_user_segment(page, endoff, PAGE_SIZE);
		SetPageUptodate(page);
	}

	fuse_write_update_size(inode, pos + copied);
	set_page_dirty(page);

unlock:
	unlock_page(page);
	put_page(page);

	return copied;
}

M
Miklos Szeredi 已提交
2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357
/*
 * ->launder_page() callback: synchronously write out a dirty page and
 * wait for the writeback to finish before the page is released.
 */
static int fuse_launder_page(struct page *page)
{
	struct inode *inode = page->mapping->host;
	int err;

	if (!clear_page_dirty_for_io(page))
		return 0;

	err = fuse_writepage_locked(page);
	if (err)
		return err;

	fuse_wait_on_page_writeback(inode, page->index);
	return 0;
}

/*
 * VMA ->close() hook for mmaped fuse files.
 *
 * Write back dirty pages now, because there may not be any suitable
 * open files later
 */
static void fuse_vma_close(struct vm_area_struct *vma)
{
	filemap_write_and_wait(vma->vm_file->f_mapping);
}

/*
 * Wait for writeback against this page to complete before allowing it
 * to be marked dirty again, and hence written back again, possibly
 * before the previous writepage completed.
 *
 * Block here, instead of in ->writepage(), so that the userspace fs
 * can only block processes actually operating on the filesystem.
 *
 * Otherwise unprivileged userspace fs would be able to block
 * unrelated:
 *
 * - page migration
 * - sync(2)
 * - try_to_free_pages() with order > PAGE_ALLOC_COSTLY_ORDER
 */
2358
static vm_fault_t fuse_page_mkwrite(struct vm_fault *vmf)
M
Miklos Szeredi 已提交
2359
{
2360
	struct page *page = vmf->page;
2361
	struct inode *inode = file_inode(vmf->vma->vm_file);
M
Miklos Szeredi 已提交
2362

2363
	file_update_time(vmf->vma->vm_file);
M
Miklos Szeredi 已提交
2364 2365 2366 2367 2368
	lock_page(page);
	if (page->mapping != inode->i_mapping) {
		unlock_page(page);
		return VM_FAULT_NOPAGE;
	}
M
Miklos Szeredi 已提交
2369 2370

	fuse_wait_on_page_writeback(inode, page->index);
M
Miklos Szeredi 已提交
2371
	return VM_FAULT_LOCKED;
M
Miklos Szeredi 已提交
2372 2373
}

2374
static const struct vm_operations_struct fuse_file_vm_ops = {
M
Miklos Szeredi 已提交
2375 2376
	.close		= fuse_vma_close,
	.fault		= filemap_fault,
2377
	.map_pages	= filemap_map_pages,
M
Miklos Szeredi 已提交
2378 2379 2380 2381 2382
	.page_mkwrite	= fuse_page_mkwrite,
};

static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma)
{
2383 2384
	struct fuse_file *ff = file->private_data;

S
Stefan Hajnoczi 已提交
2385 2386 2387 2388
	/* DAX mmap is superior to direct_io mmap */
	if (FUSE_IS_DAX(file_inode(file)))
		return fuse_dax_mmap(file, vma);

2389 2390 2391 2392 2393 2394 2395 2396 2397 2398
	if (ff->open_flags & FOPEN_DIRECT_IO) {
		/* Can't provide the coherency needed for MAP_SHARED */
		if (vma->vm_flags & VM_MAYSHARE)
			return -ENODEV;

		invalidate_inode_pages2(file->f_mapping);

		return generic_file_mmap(file, vma);
	}

2399 2400 2401
	if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE))
		fuse_link_write_file(file);

M
Miklos Szeredi 已提交
2402 2403
	file_accessed(file);
	vma->vm_ops = &fuse_file_vm_ops;
M
Miklos Szeredi 已提交
2404 2405 2406
	return 0;
}

2407 2408
static int convert_fuse_file_lock(struct fuse_conn *fc,
				  const struct fuse_file_lock *ffl,
2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422
				  struct file_lock *fl)
{
	switch (ffl->type) {
	case F_UNLCK:
		break;

	case F_RDLCK:
	case F_WRLCK:
		if (ffl->start > OFFSET_MAX || ffl->end > OFFSET_MAX ||
		    ffl->end < ffl->start)
			return -EIO;

		fl->fl_start = ffl->start;
		fl->fl_end = ffl->end;
2423 2424

		/*
2425 2426
		 * Convert pid into init's pid namespace.  The locks API will
		 * translate it into the caller's pid namespace.
2427 2428
		 */
		rcu_read_lock();
2429
		fl->fl_pid = pid_nr_ns(find_pid_ns(ffl->pid, fc->pid_ns), &init_pid_ns);
2430
		rcu_read_unlock();
2431 2432 2433 2434 2435 2436 2437 2438 2439
		break;

	default:
		return -EIO;
	}
	fl->fl_type = ffl->type;
	return 0;
}

2440
/*
 * Populate @inarg and @args for a FUSE_GETLK/FUSE_SETLK(W) request from a
 * kernel struct file_lock.  @pid is the lock-owner pid already translated
 * into the server's pid namespace; @flock selects BSD-flock emulation.
 */
static void fuse_lk_fill(struct fuse_args *args, struct file *file,
			 const struct file_lock *fl, int opcode, pid_t pid,
			 int flock, struct fuse_lk_in *inarg)
{
	struct inode *inode = file_inode(file);
	struct fuse_conn *fc = get_fuse_conn(inode);
	struct fuse_file *ff = file->private_data;

	memset(inarg, 0, sizeof(*inarg));
	inarg->fh = ff->fh;
	inarg->owner = fuse_lock_owner_id(fc, fl->fl_owner);
	inarg->lk.start = fl->fl_start;
	inarg->lk.end = fl->fl_end;
	inarg->lk.type = fl->fl_type;
	inarg->lk.pid = pid;
	if (flock)
		inarg->lk_flags |= FUSE_LK_FLOCK;
	args->opcode = opcode;
	args->nodeid = get_node_id(inode);
	args->in_numargs = 1;
	args->in_args[0].size = sizeof(*inarg);
	args->in_args[0].value = inarg;
}

static int fuse_getlk(struct file *file, struct file_lock *fl)
{
A
Al Viro 已提交
2466
	struct inode *inode = file_inode(file);
2467
	struct fuse_mount *fm = get_fuse_mount(inode);
2468 2469
	FUSE_ARGS(args);
	struct fuse_lk_in inarg;
2470 2471 2472
	struct fuse_lk_out outarg;
	int err;

2473
	fuse_lk_fill(&args, file, fl, FUSE_GETLK, 0, 0, &inarg);
2474 2475 2476
	args.out_numargs = 1;
	args.out_args[0].size = sizeof(outarg);
	args.out_args[0].value = &outarg;
2477
	err = fuse_simple_request(fm, &args);
2478
	if (!err)
2479
		err = convert_fuse_file_lock(fm->fc, &outarg.lk, fl);
2480 2481 2482 2483

	return err;
}

2484
static int fuse_setlk(struct file *file, struct file_lock *fl, int flock)
2485
{
A
Al Viro 已提交
2486
	struct inode *inode = file_inode(file);
2487
	struct fuse_mount *fm = get_fuse_mount(inode);
2488 2489
	FUSE_ARGS(args);
	struct fuse_lk_in inarg;
2490
	int opcode = (fl->fl_flags & FL_SLEEP) ? FUSE_SETLKW : FUSE_SETLK;
2491
	struct pid *pid = fl->fl_type != F_UNLCK ? task_tgid(current) : NULL;
2492
	pid_t pid_nr = pid_nr_ns(pid, fm->fc->pid_ns);
2493 2494
	int err;

J
J. Bruce Fields 已提交
2495
	if (fl->fl_lmops && fl->fl_lmops->lm_grant) {
M
Miklos Szeredi 已提交
2496 2497 2498 2499
		/* NLM needs asynchronous locks, which we don't support yet */
		return -ENOLCK;
	}

2500
	/* Unlock on close is handled by the flush method */
2501
	if ((fl->fl_flags & FL_CLOSE_POSIX) == FL_CLOSE_POSIX)
2502 2503
		return 0;

2504
	fuse_lk_fill(&args, file, fl, opcode, pid_nr, flock, &inarg);
2505
	err = fuse_simple_request(fm, &args);
2506

2507 2508 2509
	/* locking is restartable */
	if (err == -EINTR)
		err = -ERESTARTSYS;
2510

2511 2512 2513 2514 2515
	return err;
}

static int fuse_file_lock(struct file *file, int cmd, struct file_lock *fl)
{
A
Al Viro 已提交
2516
	struct inode *inode = file_inode(file);
2517 2518 2519
	struct fuse_conn *fc = get_fuse_conn(inode);
	int err;

M
Miklos Szeredi 已提交
2520 2521 2522
	if (cmd == F_CANCELLK) {
		err = 0;
	} else if (cmd == F_GETLK) {
2523
		if (fc->no_lock) {
2524
			posix_test_lock(file, fl);
2525 2526 2527 2528 2529
			err = 0;
		} else
			err = fuse_getlk(file, fl);
	} else {
		if (fc->no_lock)
M
Miklos Szeredi 已提交
2530
			err = posix_lock_file(file, fl, NULL);
2531
		else
2532
			err = fuse_setlk(file, fl, 0);
2533 2534 2535 2536
	}
	return err;
}

2537 2538
static int fuse_file_flock(struct file *file, int cmd, struct file_lock *fl)
{
A
Al Viro 已提交
2539
	struct inode *inode = file_inode(file);
2540 2541 2542
	struct fuse_conn *fc = get_fuse_conn(inode);
	int err;

M
Miklos Szeredi 已提交
2543
	if (fc->no_flock) {
2544
		err = locks_lock_file_wait(file, fl);
2545
	} else {
M
Miklos Szeredi 已提交
2546 2547
		struct fuse_file *ff = file->private_data;

2548
		/* emulate flock with POSIX locks */
M
Miklos Szeredi 已提交
2549
		ff->flock = true;
2550 2551 2552 2553 2554 2555
		err = fuse_setlk(file, fl, 1);
	}

	return err;
}

M
Miklos Szeredi 已提交
2556 2557 2558
/*
 * ->bmap() callback: map a file block to a device block via FUSE_BMAP.
 * Only meaningful on block-device backed mounts; returns 0 (no mapping)
 * when unsupported or on error.
 */
static sector_t fuse_bmap(struct address_space *mapping, sector_t block)
{
	struct inode *inode = mapping->host;
	struct fuse_mount *fm = get_fuse_mount(inode);
	FUSE_ARGS(args);
	struct fuse_bmap_in inarg;
	struct fuse_bmap_out outarg;
	int err;

	if (!inode->i_sb->s_bdev || fm->fc->no_bmap)
		return 0;

	memset(&inarg, 0, sizeof(inarg));
	inarg.block = block;
	inarg.blocksize = inode->i_sb->s_blocksize;
	args.opcode = FUSE_BMAP;
	args.nodeid = get_node_id(inode);
	args.in_numargs = 1;
	args.in_args[0].size = sizeof(inarg);
	args.in_args[0].value = &inarg;
	args.out_numargs = 1;
	args.out_args[0].size = sizeof(outarg);
	args.out_args[0].value = &outarg;
	err = fuse_simple_request(fm, &args);
	if (err == -ENOSYS)
		fm->fc->no_bmap = 1;	/* remember: server lacks FUSE_BMAP */

	return err ? 0 : outarg.block;
}

2586 2587 2588
/*
 * Implement SEEK_HOLE/SEEK_DATA through the server (FUSE_LSEEK), falling
 * back to attribute refresh + generic_file_llseek() when unsupported.
 */
static loff_t fuse_lseek(struct file *file, loff_t offset, int whence)
{
	struct inode *inode = file->f_mapping->host;
	struct fuse_mount *fm = get_fuse_mount(inode);
	struct fuse_file *ff = file->private_data;
	FUSE_ARGS(args);
	struct fuse_lseek_in inarg = {
		.fh = ff->fh,
		.offset = offset,
		.whence = whence
	};
	struct fuse_lseek_out outarg;
	int err;

	if (fm->fc->no_lseek)
		goto fallback;

	args.opcode = FUSE_LSEEK;
	args.nodeid = ff->nodeid;
	args.in_numargs = 1;
	args.in_args[0].size = sizeof(inarg);
	args.in_args[0].value = &inarg;
	args.out_numargs = 1;
	args.out_args[0].size = sizeof(outarg);
	args.out_args[0].value = &outarg;
	err = fuse_simple_request(fm, &args);
	if (err) {
		if (err == -ENOSYS) {
			/* server lacks FUSE_LSEEK; don't ask again */
			fm->fc->no_lseek = 1;
			goto fallback;
		}
		return err;
	}

	return vfs_setpos(file, outarg.offset, inode->i_sb->s_maxbytes);

fallback:
	err = fuse_update_attributes(inode, file);
	if (!err)
		return generic_file_llseek(file, offset, whence);
	else
		return err;
}

2630
/*
 * ->llseek() callback.  SEEK_END refreshes attributes under i_mutex so
 * the size is current; SEEK_HOLE/SEEK_DATA are delegated to the server.
 */
static loff_t fuse_file_llseek(struct file *file, loff_t offset, int whence)
{
	loff_t retval;
	struct inode *inode = file_inode(file);

	switch (whence) {
	case SEEK_SET:
	case SEEK_CUR:
		 /* No i_mutex protection necessary for SEEK_CUR and SEEK_SET */
		retval = generic_file_llseek(file, offset, whence);
		break;
	case SEEK_END:
		inode_lock(inode);
		retval = fuse_update_attributes(inode, file);
		if (!retval)
			retval = generic_file_llseek(file, offset, whence);
		inode_unlock(inode);
		break;
	case SEEK_HOLE:
	case SEEK_DATA:
		inode_lock(inode);
		retval = fuse_lseek(file, offset, whence);
		inode_unlock(inode);
		break;
	default:
		retval = -EINVAL;
	}

	return retval;
}

2661 2662 2663 2664 2665 2666
/*
 * CUSE servers compiled on 32bit broke on 64bit kernels because the
 * ABI was defined to be 'struct iovec' which is different on 32bit
 * and 64bit.  Fortunately we can determine which structure the server
 * used from the size of the reply.
 */
M
Miklos Szeredi 已提交
2667 2668 2669
static int fuse_copy_ioctl_iovec_old(struct iovec *dst, void *src,
				     size_t transferred, unsigned count,
				     bool is_compat)
2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698
{
#ifdef CONFIG_COMPAT
	if (count * sizeof(struct compat_iovec) == transferred) {
		struct compat_iovec *ciov = src;
		unsigned i;

		/*
		 * With this interface a 32bit server cannot support
		 * non-compat (i.e. ones coming from 64bit apps) ioctl
		 * requests
		 */
		if (!is_compat)
			return -EINVAL;

		for (i = 0; i < count; i++) {
			dst[i].iov_base = compat_ptr(ciov[i].iov_base);
			dst[i].iov_len = ciov[i].iov_len;
		}
		return 0;
	}
#endif

	if (count * sizeof(struct iovec) != transferred)
		return -EIO;

	memcpy(dst, src, transferred);
	return 0;
}

M
Miklos Szeredi 已提交
2699
/* Make sure iov_length() won't overflow */
2700 2701
static int fuse_verify_ioctl_iov(struct fuse_conn *fc, struct iovec *iov,
				 size_t count)
M
Miklos Szeredi 已提交
2702 2703
{
	size_t n;
2704
	u32 max = fc->max_pages << PAGE_SHIFT;
M
Miklos Szeredi 已提交
2705

2706
	for (n = 0; n < count; n++, iov++) {
M
Miklos Szeredi 已提交
2707 2708 2709 2710 2711 2712 2713
		if (iov->iov_len > (size_t) max)
			return -ENOMEM;
		max -= iov->iov_len;
	}
	return 0;
}

M
Miklos Szeredi 已提交
2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749
/*
 * Decode the iovec array returned by the server for an ioctl RETRY into
 * kernel struct iovec entries.  Servers speaking protocol minor < 16 use
 * the legacy raw-iovec layout handled by fuse_copy_ioctl_iovec_old();
 * newer ones use struct fuse_ioctl_iovec.  Returns -EIO on malformed or
 * out-of-range server data.
 */
static int fuse_copy_ioctl_iovec(struct fuse_conn *fc, struct iovec *dst,
				 void *src, size_t transferred, unsigned count,
				 bool is_compat)
{
	unsigned i;
	struct fuse_ioctl_iovec *fiov = src;

	if (fc->minor < 16) {
		return fuse_copy_ioctl_iovec_old(dst, src, transferred,
						 count, is_compat);
	}

	if (count * sizeof(struct fuse_ioctl_iovec) != transferred)
		return -EIO;

	for (i = 0; i < count; i++) {
		/* Did the server supply an inappropriate value? */
		if (fiov[i].base != (unsigned long) fiov[i].base ||
		    fiov[i].len != (unsigned long) fiov[i].len)
			return -EIO;

		dst[i].iov_base = (void __user *) (unsigned long) fiov[i].base;
		dst[i].iov_len = (size_t) fiov[i].len;

#ifdef CONFIG_COMPAT
		/* for compat callers the values must round-trip losslessly */
		if (is_compat &&
		    (ptr_to_compat(dst[i].iov_base) != fiov[i].base ||
		     (compat_size_t) dst[i].iov_len != fiov[i].len))
			return -EIO;
#endif
	}

	return 0;
}


T
Tejun Heo 已提交
2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795
/*
 * For ioctls, there is no generic way to determine how much memory
 * needs to be read and/or written.  Furthermore, ioctls are allowed
 * to dereference the passed pointer, so the parameter requires deep
 * copying but FUSE has no idea whatsoever about what to copy in or
 * out.
 *
 * This is solved by allowing FUSE server to retry ioctl with
 * necessary in/out iovecs.  Let's assume the ioctl implementation
 * needs to read in the following structure.
 *
 * struct a {
 *	char	*buf;
 *	size_t	buflen;
 * }
 *
 * On the first callout to FUSE server, inarg->in_size and
 * inarg->out_size will be NULL; then, the server completes the ioctl
 * with FUSE_IOCTL_RETRY set in out->flags, out->in_iovs set to 1 and
 * the actual iov array to
 *
 * { { .iov_base = inarg.arg,	.iov_len = sizeof(struct a) } }
 *
 * which tells FUSE to copy in the requested area and retry the ioctl.
 * On the second round, the server has access to the structure and
 * from that it can tell what to look for next, so on the invocation,
 * it sets FUSE_IOCTL_RETRY, out->in_iovs to 2 and iov array to
 *
 * { { .iov_base = inarg.arg,	.iov_len = sizeof(struct a)	},
 *   { .iov_base = a.buf,	.iov_len = a.buflen		} }
 *
 * FUSE will copy both struct a and the pointed buffer from the
 * process doing the ioctl and retry ioctl with both struct a and the
 * buffer.
 *
 * This time, FUSE server has everything it needs and completes ioctl
 * without FUSE_IOCTL_RETRY which finishes the ioctl call.
 *
 * Copying data out works the same way.
 *
 * Note that if FUSE_IOCTL_UNRESTRICTED is clear, the kernel
 * automatically initializes in and out iovs by decoding @cmd with
 * _IOC_* macros and the server is not allowed to request RETRY.  This
 * limits ioctl data transfers to well-formed ioctls and is the forced
 * behavior for all FUSE servers.
 */
2796 2797
long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
		   unsigned int flags)
T
Tejun Heo 已提交
2798 2799
{
	struct fuse_file *ff = file->private_data;
2800
	struct fuse_mount *fm = ff->fm;
T
Tejun Heo 已提交
2801 2802 2803 2804 2805 2806 2807
	struct fuse_ioctl_in inarg = {
		.fh = ff->fh,
		.cmd = cmd,
		.arg = arg,
		.flags = flags
	};
	struct fuse_ioctl_out outarg;
M
Miklos Szeredi 已提交
2808
	struct iovec *iov_page = NULL;
T
Tejun Heo 已提交
2809
	struct iovec *in_iov = NULL, *out_iov = NULL;
2810 2811 2812
	unsigned int in_iovs = 0, out_iovs = 0, max_pages;
	size_t in_size, out_size, c;
	ssize_t transferred;
2813 2814
	int err, i;
	struct iov_iter ii;
2815
	struct fuse_args_pages ap = {};
T
Tejun Heo 已提交
2816

M
Miklos Szeredi 已提交
2817 2818 2819
#if BITS_PER_LONG == 32
	inarg.flags |= FUSE_IOCTL_32BIT;
#else
2820
	if (flags & FUSE_IOCTL_COMPAT) {
M
Miklos Szeredi 已提交
2821
		inarg.flags |= FUSE_IOCTL_32BIT;
2822 2823 2824 2825 2826
#ifdef CONFIG_X86_X32
		if (in_x32_syscall())
			inarg.flags |= FUSE_IOCTL_COMPAT_X32;
#endif
	}
M
Miklos Szeredi 已提交
2827 2828
#endif

T
Tejun Heo 已提交
2829
	/* assume all the iovs returned by client always fits in a page */
M
Miklos Szeredi 已提交
2830
	BUILD_BUG_ON(sizeof(struct fuse_ioctl_iovec) * FUSE_IOCTL_MAX_IOV > PAGE_SIZE);
T
Tejun Heo 已提交
2831 2832

	err = -ENOMEM;
2833
	ap.pages = fuse_pages_alloc(fm->fc->max_pages, GFP_KERNEL, &ap.descs);
M
Miklos Szeredi 已提交
2834
	iov_page = (struct iovec *) __get_free_page(GFP_KERNEL);
2835
	if (!ap.pages || !iov_page)
T
Tejun Heo 已提交
2836 2837
		goto out;

2838
	fuse_page_descs_length_init(ap.descs, 0, fm->fc->max_pages);
2839

T
Tejun Heo 已提交
2840 2841 2842 2843 2844
	/*
	 * If restricted, initialize IO parameters as encoded in @cmd.
	 * RETRY from server is not allowed.
	 */
	if (!(flags & FUSE_IOCTL_UNRESTRICTED)) {
M
Miklos Szeredi 已提交
2845
		struct iovec *iov = iov_page;
T
Tejun Heo 已提交
2846

M
Miklos Szeredi 已提交
2847
		iov->iov_base = (void __user *)arg;
2848 2849 2850 2851 2852 2853 2854 2855 2856 2857

		switch (cmd) {
		case FS_IOC_GETFLAGS:
		case FS_IOC_SETFLAGS:
			iov->iov_len = sizeof(int);
			break;
		default:
			iov->iov_len = _IOC_SIZE(cmd);
			break;
		}
T
Tejun Heo 已提交
2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882

		if (_IOC_DIR(cmd) & _IOC_WRITE) {
			in_iov = iov;
			in_iovs = 1;
		}

		if (_IOC_DIR(cmd) & _IOC_READ) {
			out_iov = iov;
			out_iovs = 1;
		}
	}

 retry:
	inarg.in_size = in_size = iov_length(in_iov, in_iovs);
	inarg.out_size = out_size = iov_length(out_iov, out_iovs);

	/*
	 * Out data can be used either for actual out data or iovs,
	 * make sure there always is at least one page.
	 */
	out_size = max_t(size_t, out_size, PAGE_SIZE);
	max_pages = DIV_ROUND_UP(max(in_size, out_size), PAGE_SIZE);

	/* make sure there are enough buffer pages and init request with them */
	err = -ENOMEM;
2883
	if (max_pages > fm->fc->max_pages)
T
Tejun Heo 已提交
2884
		goto out;
2885 2886 2887
	while (ap.num_pages < max_pages) {
		ap.pages[ap.num_pages] = alloc_page(GFP_KERNEL | __GFP_HIGHMEM);
		if (!ap.pages[ap.num_pages])
T
Tejun Heo 已提交
2888
			goto out;
2889
		ap.num_pages++;
T
Tejun Heo 已提交
2890 2891 2892 2893
	}


	/* okay, let's send it to the client */
2894 2895 2896 2897 2898
	ap.args.opcode = FUSE_IOCTL;
	ap.args.nodeid = ff->nodeid;
	ap.args.in_numargs = 1;
	ap.args.in_args[0].size = sizeof(inarg);
	ap.args.in_args[0].value = &inarg;
T
Tejun Heo 已提交
2899
	if (in_size) {
2900 2901 2902
		ap.args.in_numargs++;
		ap.args.in_args[1].size = in_size;
		ap.args.in_pages = true;
T
Tejun Heo 已提交
2903

2904 2905
		err = -EFAULT;
		iov_iter_init(&ii, WRITE, in_iov, in_iovs, in_size);
2906 2907
		for (i = 0; iov_iter_count(&ii) && !WARN_ON(i >= ap.num_pages); i++) {
			c = copy_page_from_iter(ap.pages[i], 0, PAGE_SIZE, &ii);
2908 2909 2910
			if (c != PAGE_SIZE && iov_iter_count(&ii))
				goto out;
		}
T
Tejun Heo 已提交
2911 2912
	}

2913 2914 2915 2916 2917 2918
	ap.args.out_numargs = 2;
	ap.args.out_args[0].size = sizeof(outarg);
	ap.args.out_args[0].value = &outarg;
	ap.args.out_args[1].size = out_size;
	ap.args.out_pages = true;
	ap.args.out_argvar = true;
T
Tejun Heo 已提交
2919

2920
	transferred = fuse_simple_request(fm, &ap.args);
2921 2922
	err = transferred;
	if (transferred < 0)
T
Tejun Heo 已提交
2923 2924 2925 2926
		goto out;

	/* did it ask for retry? */
	if (outarg.flags & FUSE_IOCTL_RETRY) {
M
Miklos Szeredi 已提交
2927
		void *vaddr;
T
Tejun Heo 已提交
2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946

		/* no retry if in restricted mode */
		err = -EIO;
		if (!(flags & FUSE_IOCTL_UNRESTRICTED))
			goto out;

		in_iovs = outarg.in_iovs;
		out_iovs = outarg.out_iovs;

		/*
		 * Make sure things are in boundary, separate checks
		 * are to protect against overflow.
		 */
		err = -ENOMEM;
		if (in_iovs > FUSE_IOCTL_MAX_IOV ||
		    out_iovs > FUSE_IOCTL_MAX_IOV ||
		    in_iovs + out_iovs > FUSE_IOCTL_MAX_IOV)
			goto out;

2947
		vaddr = kmap_atomic(ap.pages[0]);
2948
		err = fuse_copy_ioctl_iovec(fm->fc, iov_page, vaddr,
2949 2950
					    transferred, in_iovs + out_iovs,
					    (flags & FUSE_IOCTL_COMPAT) != 0);
2951
		kunmap_atomic(vaddr);
2952 2953
		if (err)
			goto out;
T
Tejun Heo 已提交
2954

M
Miklos Szeredi 已提交
2955
		in_iov = iov_page;
T
Tejun Heo 已提交
2956 2957
		out_iov = in_iov + in_iovs;

2958
		err = fuse_verify_ioctl_iov(fm->fc, in_iov, in_iovs);
M
Miklos Szeredi 已提交
2959 2960 2961
		if (err)
			goto out;

2962
		err = fuse_verify_ioctl_iov(fm->fc, out_iov, out_iovs);
M
Miklos Szeredi 已提交
2963 2964 2965
		if (err)
			goto out;

T
Tejun Heo 已提交
2966 2967 2968 2969 2970 2971 2972
		goto retry;
	}

	err = -EIO;
	if (transferred > inarg.out_size)
		goto out;

2973 2974
	err = -EFAULT;
	iov_iter_init(&ii, READ, out_iov, out_iovs, transferred);
2975 2976
	for (i = 0; iov_iter_count(&ii) && !WARN_ON(i >= ap.num_pages); i++) {
		c = copy_page_to_iter(ap.pages[i], 0, PAGE_SIZE, &ii);
2977 2978 2979 2980
		if (c != PAGE_SIZE && iov_iter_count(&ii))
			goto out;
	}
	err = 0;
T
Tejun Heo 已提交
2981
 out:
M
Miklos Szeredi 已提交
2982
	free_page((unsigned long) iov_page);
2983 2984 2985
	while (ap.num_pages)
		__free_page(ap.pages[--ap.num_pages]);
	kfree(ap.pages);
T
Tejun Heo 已提交
2986 2987 2988

	return err ? err : outarg.result;
}
2989
EXPORT_SYMBOL_GPL(fuse_do_ioctl);
T
Tejun Heo 已提交
2990

2991 2992
long fuse_ioctl_common(struct file *file, unsigned int cmd,
		       unsigned long arg, unsigned int flags)
2993
{
A
Al Viro 已提交
2994
	struct inode *inode = file_inode(file);
2995 2996
	struct fuse_conn *fc = get_fuse_conn(inode);

2997
	if (!fuse_allow_current_process(fc))
2998 2999
		return -EACCES;

M
Miklos Szeredi 已提交
3000
	if (fuse_is_bad(inode))
3001 3002 3003 3004 3005
		return -EIO;

	return fuse_do_ioctl(file, cmd, arg, flags);
}

T
Tejun Heo 已提交
3006 3007 3008
/* Native (non-compat) ioctl entry point: no extra FUSE_IOCTL_* flags. */
static long fuse_file_ioctl(struct file *file, unsigned int cmd,
			    unsigned long arg)
{
	return fuse_ioctl_common(file, cmd, arg, 0);
}

static long fuse_file_compat_ioctl(struct file *file, unsigned int cmd,
				   unsigned long arg)
{
3015
	return fuse_ioctl_common(file, cmd, arg, FUSE_IOCTL_COMPAT);
T
Tejun Heo 已提交
3016 3017
}

T
Tejun Heo 已提交
3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058
/*
 * All files which have been polled are linked to RB tree
 * fuse_conn->polled_files which is indexed by kh.  Walk the tree and
 * find the matching one.
 *
 * Returns the link slot for @kh: on a hit it points at the existing
 * node; on a miss it is the empty slot where a new node would go, and
 * *@parent_out (if non-NULL) receives the would-be parent.
 */
static struct rb_node **fuse_find_polled_node(struct fuse_conn *fc, u64 kh,
					      struct rb_node **parent_out)
{
	struct rb_node **p = &fc->polled_files.rb_node;
	struct rb_node *parent = NULL;

	while (*p) {
		struct fuse_file *ff = rb_entry(*p, struct fuse_file,
						polled_node);

		parent = *p;
		if (kh < ff->kh)
			p = &parent->rb_left;
		else if (kh > ff->kh)
			p = &parent->rb_right;
		else
			return p;	/* exact match */
	}

	if (parent_out)
		*parent_out = parent;
	return p;
}

/*
 * The file is about to be polled.  Make sure it's on the polled_files
 * RB tree.  Note that files once added to the polled_files tree are
 * not removed before the file is released.  This is because a file
 * polled once is likely to be polled again.
 */
static void fuse_register_polled_file(struct fuse_conn *fc,
				      struct fuse_file *ff)
{
	struct rb_node **link, *parent;

	spin_lock(&fc->lock);
	if (!RB_EMPTY_NODE(&ff->polled_node)) {
		/* already registered */
		spin_unlock(&fc->lock);
		return;
	}

	link = fuse_find_polled_node(fc, ff->kh, &parent);
	BUG_ON(*link);		/* kh must not already be in the tree */
	rb_link_node(&ff->polled_node, parent, link);
	rb_insert_color(&ff->polled_node, &fc->polled_files);
	spin_unlock(&fc->lock);
}

A
Al Viro 已提交
3069
/*
 * ->poll() for FUSE files: forward a FUSE_POLL request to the server
 * and translate its answer.  Falls back to DEFAULT_POLLMASK once the
 * server has reported -ENOSYS.
 */
__poll_t fuse_file_poll(struct file *file, poll_table *wait)
{
	struct fuse_file *ff = file->private_data;
	struct fuse_mount *fm = ff->fm;
	struct fuse_poll_in inarg = { .fh = ff->fh, .kh = ff->kh };
	struct fuse_poll_out outarg;
	FUSE_ARGS(args);
	int err;

	if (fm->fc->no_poll)
		return DEFAULT_POLLMASK;

	poll_wait(file, &ff->poll_wait, wait);
	inarg.events = mangle_poll(poll_requested_events(wait));

	/*
	 * Ask for notification iff there's someone waiting for it.
	 * The client may ignore the flag and always notify.
	 */
	if (waitqueue_active(&ff->poll_wait)) {
		inarg.flags |= FUSE_POLL_SCHEDULE_NOTIFY;
		fuse_register_polled_file(fm->fc, ff);
	}

	args.opcode = FUSE_POLL;
	args.nodeid = ff->nodeid;
	args.in_numargs = 1;
	args.in_args[0].size = sizeof(inarg);
	args.in_args[0].value = &inarg;
	args.out_numargs = 1;
	args.out_args[0].size = sizeof(outarg);
	args.out_args[0].value = &outarg;
	err = fuse_simple_request(fm, &args);

	switch (err) {
	case 0:
		return demangle_poll(outarg.revents);
	case -ENOSYS:
		/* server doesn't implement FUSE_POLL: remember that */
		fm->fc->no_poll = 1;
		return DEFAULT_POLLMASK;
	default:
		return EPOLLERR;
	}
}
3111
EXPORT_SYMBOL_GPL(fuse_file_poll);
T
Tejun Heo 已提交
3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136

/*
 * This is called from fuse_handle_notify() on FUSE_NOTIFY_POLL and
 * wakes up the poll waiters.
 */
int fuse_notify_poll_wakeup(struct fuse_conn *fc,
			    struct fuse_notify_poll_wakeup_out *outarg)
{
	struct rb_node **link;

	spin_lock(&fc->lock);
	link = fuse_find_polled_node(fc, outarg->kh, NULL);
	if (*link) {
		struct fuse_file *ff = rb_entry(*link, struct fuse_file,
						polled_node);

		wake_up_interruptible_sync(&ff->poll_wait);
	}
	spin_unlock(&fc->lock);

	return 0;
}

3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147
static void fuse_do_truncate(struct file *file)
{
	struct inode *inode = file->f_mapping->host;
	struct iattr attr;

	attr.ia_valid = ATTR_SIZE;
	attr.ia_size = i_size_read(inode);

	attr.ia_file = file;
	attr.ia_valid |= ATTR_FILE;

3148
	fuse_do_setattr(file_dentry(file), &attr, file);
3149 3150
}

3151
/*
 * Round @off up to a multiple of the connection's maximum request
 * payload (fc->max_pages pages).
 */
static inline loff_t fuse_round_up(struct fuse_conn *fc, loff_t off)
{
	return round_up(off, fc->max_pages << PAGE_SHIFT);
}

A
Anand Avati 已提交
3156
/*
 * ->direct_IO() for FUSE: perform an O_DIRECT read or write described
 * by @iter at iocb->ki_pos.
 *
 * A fuse_io_priv tracks the overall operation; it is refcounted so it
 * can outlive this function when requests complete asynchronously.
 * Async submission is used when the connection supports it
 * (fc->async_dio), except that size-extending writes are forced to
 * behave synchronously.  Returns the number of bytes transferred, a
 * negative error, or -EIOCBQUEUED for a queued async request.
 */
static ssize_t
fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
{
	DECLARE_COMPLETION_ONSTACK(wait);
	ssize_t ret = 0;
	struct file *file = iocb->ki_filp;
	struct fuse_file *ff = file->private_data;
	loff_t pos = 0;
	struct inode *inode;
	loff_t i_size;
	size_t count = iov_iter_count(iter), shortened = 0;
	loff_t offset = iocb->ki_pos;
	struct fuse_io_priv *io;

	pos = offset;
	inode = file->f_mapping->host;
	i_size = i_size_read(inode);

	/* reading at or past EOF: nothing to do */
	if ((iov_iter_rw(iter) == READ) && (offset >= i_size))
		return 0;

	io = kmalloc(sizeof(struct fuse_io_priv), GFP_KERNEL);
	if (!io)
		return -ENOMEM;
	spin_lock_init(&io->lock);
	kref_init(&io->refcnt);
	io->reqs = 1;
	io->bytes = -1;
	io->size = 0;
	io->offset = offset;
	io->write = (iov_iter_rw(iter) == WRITE);
	io->err = 0;
	/*
	 * By default, we want to optimize all I/Os with async request
	 * submission to the client filesystem if supported.
	 */
	io->async = ff->fm->fc->async_dio;
	io->iocb = iocb;
	io->blocking = is_sync_kiocb(iocb);

	/* optimization for short read */
	if (io->async && !io->write && offset + count > i_size) {
		/* trim the iter to the rounded-up in-core EOF; re-expanded below */
		iov_iter_truncate(iter, fuse_round_up(ff->fm->fc, i_size - offset));
		shortened = count - iov_iter_count(iter);
		count -= shortened;
	}

	/*
	 * We cannot asynchronously extend the size of a file.
	 * In such case the aio will behave exactly like sync io.
	 */
	if ((offset + count > i_size) && io->write)
		io->blocking = true;

	if (io->async && io->blocking) {
		/*
		 * Additional reference to keep io around after
		 * calling fuse_aio_complete()
		 */
		kref_get(&io->refcnt);
		io->done = &wait;
	}

	if (iov_iter_rw(iter) == WRITE) {
		ret = fuse_direct_io(io, iter, &pos, FUSE_DIO_WRITE);
		fuse_invalidate_attr(inode);
	} else {
		ret = __fuse_direct_read(io, iter, &pos);
	}
	/* undo the short-read truncation so the caller sees the full iter */
	iov_iter_reexpand(iter, iov_iter_count(iter) + shortened);

	if (io->async) {
		bool blocking = io->blocking;

		fuse_aio_complete(io, ret < 0 ? ret : 0, -1);

		/* we have a non-extending, async request, so return */
		if (!blocking)
			return -EIOCBQUEUED;

		wait_for_completion(&wait);
		ret = fuse_get_res_by_io(io);
	}

	kref_put(&io->refcnt, fuse_io_release);

	if (iov_iter_rw(iter) == WRITE) {
		if (ret > 0)
			fuse_write_update_size(inode, pos);
		else if (ret < 0 && offset + count > i_size)
			/* failed extending write: resync size attribute */
			fuse_do_truncate(file);
	}

	return ret;
}

3252 3253
/*
 * Write back dirty pages and wait for both the flush and any FUSE
 * writeback requests already in flight to finish.
 *
 * NOTE(review): @end is deliberately not used — the flush covers
 * @start to EOF (-1), presumably so that size-extending writes beyond
 * @end are flushed as well; confirm against the commit history before
 * "fixing" this.
 */
static int fuse_writeback_range(struct inode *inode, loff_t start, loff_t end)
{
	int err = filemap_write_and_wait_range(inode->i_mapping, start, -1);

	if (!err)
		fuse_sync_writes(inode);

	return err;
}

3262 3263
/*
 * ->fallocate() for FUSE files.  Only plain allocation,
 * FALLOC_FL_KEEP_SIZE and FALLOC_FL_PUNCH_HOLE are supported; sends a
 * FUSE_FALLOCATE request to the server.
 *
 * Locking: the inode is locked for size-changing modes and for hole
 * punching; on DAX inodes page faults are additionally blocked via
 * fi->i_mmap_sem while layouts are broken.  FUSE_I_SIZE_UNSTABLE is
 * set around the request whenever the size may change.
 */
static long fuse_file_fallocate(struct file *file, int mode, loff_t offset,
				loff_t length)
{
	struct fuse_file *ff = file->private_data;
	struct inode *inode = file_inode(file);
	struct fuse_inode *fi = get_fuse_inode(inode);
	struct fuse_mount *fm = ff->fm;
	FUSE_ARGS(args);
	struct fuse_fallocate_in inarg = {
		.fh = ff->fh,
		.offset = offset,
		.length = length,
		.mode = mode
	};
	int err;
	/* lock when the size may change or data is being punched out */
	bool lock_inode = !(mode & FALLOC_FL_KEEP_SIZE) ||
			   (mode & FALLOC_FL_PUNCH_HOLE);

	bool block_faults = FUSE_IS_DAX(inode) && lock_inode;

	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
		return -EOPNOTSUPP;

	/* server previously answered -ENOSYS */
	if (fm->fc->no_fallocate)
		return -EOPNOTSUPP;

	if (lock_inode) {
		inode_lock(inode);
		if (block_faults) {
			down_write(&fi->i_mmap_sem);
			err = fuse_dax_break_layouts(inode, 0, 0);
			if (err)
				goto out;
		}

		if (mode & FALLOC_FL_PUNCH_HOLE) {
			loff_t endbyte = offset + length - 1;

			/* flush dirty data in the punched range first */
			err = fuse_writeback_range(inode, offset, endbyte);
			if (err)
				goto out;
		}
	}

	if (!(mode & FALLOC_FL_KEEP_SIZE) &&
	    offset + length > i_size_read(inode)) {
		err = inode_newsize_ok(inode, offset + length);
		if (err)
			goto out;
	}

	if (!(mode & FALLOC_FL_KEEP_SIZE))
		set_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);

	args.opcode = FUSE_FALLOCATE;
	args.nodeid = ff->nodeid;
	args.in_numargs = 1;
	args.in_args[0].size = sizeof(inarg);
	args.in_args[0].value = &inarg;
	err = fuse_simple_request(fm, &args);
	if (err == -ENOSYS) {
		/* remember lack of support and report EOPNOTSUPP */
		fm->fc->no_fallocate = 1;
		err = -EOPNOTSUPP;
	}
	if (err)
		goto out;

	/* we could have extended the file */
	if (!(mode & FALLOC_FL_KEEP_SIZE)) {
		bool changed = fuse_write_update_size(inode, offset + length);

		if (changed && fm->fc->writeback_cache)
			file_update_time(file);
	}

	if (mode & FALLOC_FL_PUNCH_HOLE)
		truncate_pagecache_range(inode, offset, offset + length - 1);

	fuse_invalidate_attr(inode);

out:
	if (!(mode & FALLOC_FL_KEEP_SIZE))
		clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);

	if (block_faults)
		up_write(&fi->i_mmap_sem);

	if (lock_inode)
		inode_unlock(inode);

	fuse_flush_time_update(inode);

	return err;
}

3357 3358 3359
/*
 * Send a FUSE_COPY_FILE_RANGE request asking the server to copy @len
 * bytes from @file_in/@pos_in to @file_out/@pos_out.
 *
 * Returns the number of bytes copied (outarg.size), or a negative
 * error; -EOPNOTSUPP once the server has answered -ENOSYS, -EXDEV for
 * cross-superblock copies.  Dirty pages in both ranges are written
 * back first, and the copied range of the destination page cache is
 * truncated afterwards since it would contain stale data.
 */
static ssize_t __fuse_copy_file_range(struct file *file_in, loff_t pos_in,
				      struct file *file_out, loff_t pos_out,
				      size_t len, unsigned int flags)
{
	struct fuse_file *ff_in = file_in->private_data;
	struct fuse_file *ff_out = file_out->private_data;
	struct inode *inode_in = file_inode(file_in);
	struct inode *inode_out = file_inode(file_out);
	struct fuse_inode *fi_out = get_fuse_inode(inode_out);
	struct fuse_mount *fm = ff_in->fm;
	struct fuse_conn *fc = fm->fc;
	FUSE_ARGS(args);
	struct fuse_copy_file_range_in inarg = {
		.fh_in = ff_in->fh,
		.off_in = pos_in,
		.nodeid_out = ff_out->nodeid,
		.fh_out = ff_out->fh,
		.off_out = pos_out,
		.len = len,
		.flags = flags
	};
	struct fuse_write_out outarg;
	ssize_t err;
	/* mark unstable when write-back is not used, and file_out gets
	 * extended */
	bool is_unstable = (!fc->writeback_cache) &&
			   ((pos_out + len) > inode_out->i_size);

	if (fc->no_copy_file_range)
		return -EOPNOTSUPP;

	if (file_inode(file_in)->i_sb != file_inode(file_out)->i_sb)
		return -EXDEV;

	inode_lock(inode_in);
	err = fuse_writeback_range(inode_in, pos_in, pos_in + len - 1);
	inode_unlock(inode_in);
	if (err)
		return err;

	inode_lock(inode_out);

	err = file_modified(file_out);
	if (err)
		goto out;

	/*
	 * Write out dirty pages in the destination file before sending the COPY
	 * request to userspace.  After the request is completed, truncate off
	 * pages (including partial ones) from the cache that have been copied,
	 * since these contain stale data at that point.
	 *
	 * This should be mostly correct, but if the COPY writes to partial
	 * pages (at the start or end) and the parts not covered by the COPY are
	 * written through a memory map after calling fuse_writeback_range(),
	 * then these partial page modifications will be lost on truncation.
	 *
	 * It is unlikely that someone would rely on such mixed style
	 * modifications.  Yet this does give less guarantees than if the
	 * copying was performed with write(2).
	 *
	 * To fix this a i_mmap_sem style lock could be used to prevent new
	 * faults while the copy is ongoing.
	 */
	err = fuse_writeback_range(inode_out, pos_out, pos_out + len - 1);
	if (err)
		goto out;

	if (is_unstable)
		set_bit(FUSE_I_SIZE_UNSTABLE, &fi_out->state);

	args.opcode = FUSE_COPY_FILE_RANGE;
	args.nodeid = ff_in->nodeid;
	args.in_numargs = 1;
	args.in_args[0].size = sizeof(inarg);
	args.in_args[0].value = &inarg;
	args.out_numargs = 1;
	args.out_args[0].size = sizeof(outarg);
	args.out_args[0].value = &outarg;
	err = fuse_simple_request(fm, &args);
	if (err == -ENOSYS) {
		/* remember lack of support for subsequent calls */
		fc->no_copy_file_range = 1;
		err = -EOPNOTSUPP;
	}
	if (err)
		goto out;

	/* drop the now-stale copied range from the page cache */
	truncate_inode_pages_range(inode_out->i_mapping,
				   ALIGN_DOWN(pos_out, PAGE_SIZE),
				   ALIGN(pos_out + outarg.size, PAGE_SIZE) - 1);

	if (fc->writeback_cache) {
		fuse_write_update_size(inode_out, pos_out + outarg.size);
		file_update_time(file_out);
	}

	fuse_invalidate_attr(inode_out);

	err = outarg.size;
out:
	if (is_unstable)
		clear_bit(FUSE_I_SIZE_UNSTABLE, &fi_out->state);

	inode_unlock(inode_out);
	file_accessed(file_in);

	fuse_flush_time_update(inode_out);

	return err;
}

3468 3469 3470 3471 3472 3473 3474 3475 3476
/*
 * ->copy_file_range() entry point: try the FUSE-native copy first and
 * fall back to the generic (page-cache based) implementation when the
 * server doesn't support it or the files live on different superblocks.
 */
static ssize_t fuse_copy_file_range(struct file *src_file, loff_t src_off,
				    struct file *dst_file, loff_t dst_off,
				    size_t len, unsigned int flags)
{
	ssize_t ret = __fuse_copy_file_range(src_file, src_off, dst_file,
					     dst_off, len, flags);

	if (ret == -EOPNOTSUPP || ret == -EXDEV)
		ret = generic_copy_file_range(src_file, src_off, dst_file,
					      dst_off, len, flags);

	return ret;
}

3483
/* File operations for regular FUSE files. */
static const struct file_operations fuse_file_operations = {
	.llseek		= fuse_file_llseek,
	.read_iter	= fuse_file_read_iter,
	.write_iter	= fuse_file_write_iter,
	.mmap		= fuse_file_mmap,
	.open		= fuse_open,
	.flush		= fuse_flush,
	.release	= fuse_release,
	.fsync		= fuse_fsync,
	.lock		= fuse_file_lock,
	.get_unmapped_area = thp_get_unmapped_area,
	.flock		= fuse_file_flock,
	.splice_read	= generic_file_splice_read,
	.splice_write	= iter_file_splice_write,
	.unlocked_ioctl	= fuse_file_ioctl,
	.compat_ioctl	= fuse_file_compat_ioctl,
	.poll		= fuse_file_poll,
	.fallocate	= fuse_file_fallocate,
	.copy_file_range = fuse_copy_file_range,
};

3504
/* Address space (page cache) operations for regular FUSE files. */
static const struct address_space_operations fuse_file_aops  = {
	.readpage	= fuse_readpage,
	.readahead	= fuse_readahead,
	.writepage	= fuse_writepage,
	.writepages	= fuse_writepages,
	.launder_page	= fuse_launder_page,
	.set_page_dirty	= __set_page_dirty_nobuffers,
	.bmap		= fuse_bmap,
	.direct_IO	= fuse_direct_IO,
	.write_begin	= fuse_write_begin,
	.write_end	= fuse_write_end,
};

void fuse_init_file_inode(struct inode *inode)
{
3519 3520
	struct fuse_inode *fi = get_fuse_inode(inode);

3521 3522
	inode->i_fop = &fuse_file_operations;
	inode->i_data.a_ops = &fuse_file_aops;
3523 3524 3525 3526 3527

	INIT_LIST_HEAD(&fi->write_files);
	INIT_LIST_HEAD(&fi->queued_writes);
	fi->writectr = 0;
	init_waitqueue_head(&fi->page_waitq);
3528
	fi->writepages = RB_ROOT;
3529 3530 3531

	if (IS_ENABLED(CONFIG_FUSE_DAX))
		fuse_dax_inode_init(inode);
M
Miklos Szeredi 已提交
3532
}