/*
 *	An async IO implementation for Linux
 *	Written by Benjamin LaHaise <bcrl@kvack.org>
 *
 *	Implements an efficient asynchronous io interface.
 *
 *	Copyright 2000, 2001, 2002 Red Hat, Inc.  All Rights Reserved.
 *	Copyright 2018 Christoph Hellwig.
 *
 *	See ../COPYING for licensing terms.
 */
#define pr_fmt(fmt) "%s: " fmt, __func__

#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/errno.h>
#include <linux/time.h>
#include <linux/aio_abi.h>
#include <linux/export.h>
#include <linux/syscalls.h>
#include <linux/backing-dev.h>
#include <linux/uio.h>

#include <linux/sched/signal.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/mm.h>
#include <linux/mman.h>
#include <linux/mmu_context.h>
#include <linux/percpu.h>
#include <linux/slab.h>
#include <linux/timer.h>
#include <linux/aio.h>
#include <linux/highmem.h>
#include <linux/workqueue.h>
#include <linux/security.h>
#include <linux/eventfd.h>
#include <linux/blkdev.h>
#include <linux/compat.h>
#include <linux/migrate.h>
#include <linux/ramfs.h>
#include <linux/percpu-refcount.h>
#include <linux/mount.h>

#include <asm/kmap_types.h>
#include <linux/uaccess.h>

#include "internal.h"

#define KIOCB_KEY		0

#define AIO_RING_MAGIC			0xa10a10a1
#define AIO_RING_COMPAT_FEATURES	1
#define AIO_RING_INCOMPAT_FEATURES	0
struct aio_ring {
	unsigned	id;	/* kernel internal index number */
	unsigned	nr;	/* number of io_events */
	unsigned	head;	/* Written to by userland or under ring_lock
				 * mutex by aio_read_events_ring(). */
	unsigned	tail;

	unsigned	magic;
	unsigned	compat_features;
	unsigned	incompat_features;
	unsigned	header_length;	/* size of aio_ring */


	struct io_event		io_events[0];
}; /* 128 bytes + ring size */

#define AIO_RING_PAGES	8
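/*
 * Note on the ring layout (a summary of how the fields above are used):
 * the ring is backed by pages of an anonymous aio ring file and is mapped
 * into the task's address space, so userspace may consume events directly.
 * The kernel advances ->tail in aio_complete() under ctx->completion_lock;
 * ->head is advanced either by userland or by aio_read_events_ring() under
 * ctx->ring_lock.
 */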

struct kioctx_table {
	struct rcu_head		rcu;
	unsigned		nr;
	struct kioctx __rcu	*table[];
};

struct kioctx_cpu {
	unsigned		reqs_available;
};

struct ctx_rq_wait {
	struct completion comp;
	atomic_t count;
};

struct kioctx {
	struct percpu_ref	users;
	atomic_t		dead;

	struct percpu_ref	reqs;

	unsigned long		user_id;

	struct __percpu kioctx_cpu *cpu;

	/*
	 * For percpu reqs_available, number of slots we move to/from global
	 * counter at a time:
	 */
	unsigned		req_batch;
	/*
	 * This is what userspace passed to io_setup(), it's not used for
	 * anything but counting against the global max_reqs quota.
	 *
	 * The real limit is nr_events - 1, which will be larger (see
	 * aio_setup_ring())
	 */
	unsigned		max_reqs;

	/* Size of ringbuffer, in units of struct io_event */
	unsigned		nr_events;

	unsigned long		mmap_base;
	unsigned long		mmap_size;

	struct page		**ring_pages;
	long			nr_pages;

	struct rcu_work		free_rwork;	/* see free_ioctx() */

	/*
	 * signals when all in-flight requests are done
	 */
	struct ctx_rq_wait	*rq_wait;

	struct {
		/*
		 * This counts the number of available slots in the ringbuffer,
		 * so we avoid overflowing it: it's decremented (if positive)
		 * when allocating a kiocb and incremented when the resulting
		 * io_event is pulled off the ringbuffer.
		 *
		 * We batch accesses to it with a percpu version.
		 */
		atomic_t	reqs_available;
	} ____cacheline_aligned_in_smp;

	struct {
		spinlock_t	ctx_lock;
		struct list_head active_reqs;	/* used for cancellation */
	} ____cacheline_aligned_in_smp;

	struct {
		struct mutex	ring_lock;
		wait_queue_head_t wait;
	} ____cacheline_aligned_in_smp;

	struct {
		unsigned	tail;
		unsigned	completed_events;
		spinlock_t	completion_lock;
	} ____cacheline_aligned_in_smp;

	struct page		*internal_pages[AIO_RING_PAGES];
	struct file		*aio_ring_file;

	unsigned		id;
};

struct fsync_iocb {
	struct work_struct	work;
	struct file		*file;
	bool			datasync;
};

struct poll_iocb {
	struct file		*file;
	__poll_t		events;
	struct wait_queue_head	*head;

	union {
		struct wait_queue_entry	wait;
		struct work_struct	work;
	};
};

struct aio_kiocb {
	union {
		struct kiocb		rw;
		struct fsync_iocb	fsync;
		struct poll_iocb	poll;
	};

	struct kioctx		*ki_ctx;
	kiocb_cancel_fn		*ki_cancel;

	struct iocb __user	*ki_user_iocb;	/* user's aiocb */
	__u64			ki_user_data;	/* user's data for completion */

	struct list_head	ki_list;	/* the aio core uses this
						 * for cancellation */

	/*
	 * If the aio_resfd field of the userspace iocb is not zero,
	 * this is the underlying eventfd context to deliver events to.
	 */
	struct eventfd_ctx	*ki_eventfd;
};

/*------ sysctl variables----*/
static DEFINE_SPINLOCK(aio_nr_lock);
unsigned long aio_nr;		/* current system wide number of aio requests */
unsigned long aio_max_nr = 0x10000; /* system wide maximum number of aio requests */
/*----end sysctl variables---*/

static struct kmem_cache	*kiocb_cachep;
static struct kmem_cache	*kioctx_cachep;

static struct vfsmount *aio_mnt;

static const struct file_operations aio_ring_fops;
static const struct address_space_operations aio_ctx_aops;

static struct file *aio_private_file(struct kioctx *ctx, loff_t nr_pages)
{
	struct file *file;
	struct inode *inode = alloc_anon_inode(aio_mnt->mnt_sb);
	if (IS_ERR(inode))
		return ERR_CAST(inode);

	inode->i_mapping->a_ops = &aio_ctx_aops;
	inode->i_mapping->private_data = ctx;
	inode->i_size = PAGE_SIZE * nr_pages;

	file = alloc_file_pseudo(inode, aio_mnt, "[aio]",
				O_RDWR, &aio_ring_fops);
	if (IS_ERR(file))
		iput(inode);
	return file;
}

static struct dentry *aio_mount(struct file_system_type *fs_type,
				int flags, const char *dev_name, void *data)
{
	struct dentry *root = mount_pseudo(fs_type, "aio:", NULL, NULL,
					   AIO_RING_MAGIC);

	if (!IS_ERR(root))
		root->d_sb->s_iflags |= SB_I_NOEXEC;
	return root;
}

/* aio_setup
 *	Creates the slab caches used by the aio routines, panic on
 *	failure as this is done early during the boot sequence.
 */
static int __init aio_setup(void)
{
	static struct file_system_type aio_fs = {
		.name		= "aio",
		.mount		= aio_mount,
		.kill_sb	= kill_anon_super,
	};
	aio_mnt = kern_mount(&aio_fs);
	if (IS_ERR(aio_mnt))
		panic("Failed to create aio fs mount.");

	kiocb_cachep = KMEM_CACHE(aio_kiocb, SLAB_HWCACHE_ALIGN|SLAB_PANIC);
	kioctx_cachep = KMEM_CACHE(kioctx, SLAB_HWCACHE_ALIGN|SLAB_PANIC);
	return 0;
}
__initcall(aio_setup);

static void put_aio_ring_file(struct kioctx *ctx)
{
	struct file *aio_ring_file = ctx->aio_ring_file;
	struct address_space *i_mapping;

	if (aio_ring_file) {
		truncate_setsize(file_inode(aio_ring_file), 0);

		/* Prevent further access to the kioctx from migratepages */
		i_mapping = aio_ring_file->f_mapping;
		spin_lock(&i_mapping->private_lock);
		i_mapping->private_data = NULL;
		ctx->aio_ring_file = NULL;
		spin_unlock(&i_mapping->private_lock);

		fput(aio_ring_file);
	}
}

static void aio_free_ring(struct kioctx *ctx)
{
	int i;

	/* Disconnect the kioctx from the ring file.  This prevents future
	 * accesses to the kioctx from page migration.
	 */
	put_aio_ring_file(ctx);

	for (i = 0; i < ctx->nr_pages; i++) {
		struct page *page;
		pr_debug("pid(%d) [%d] page->count=%d\n", current->pid, i,
				page_count(ctx->ring_pages[i]));
		page = ctx->ring_pages[i];
		if (!page)
			continue;
		ctx->ring_pages[i] = NULL;
		put_page(page);
	}

	if (ctx->ring_pages && ctx->ring_pages != ctx->internal_pages) {
		kfree(ctx->ring_pages);
		ctx->ring_pages = NULL;
	}
}

static int aio_ring_mremap(struct vm_area_struct *vma)
{
	struct file *file = vma->vm_file;
	struct mm_struct *mm = vma->vm_mm;
	struct kioctx_table *table;
	int i, res = -EINVAL;

	spin_lock(&mm->ioctx_lock);
	rcu_read_lock();
	table = rcu_dereference(mm->ioctx_table);
	for (i = 0; i < table->nr; i++) {
		struct kioctx *ctx;

		ctx = rcu_dereference(table->table[i]);
		if (ctx && ctx->aio_ring_file == file) {
			if (!atomic_read(&ctx->dead)) {
				ctx->user_id = ctx->mmap_base = vma->vm_start;
				res = 0;
			}
			break;
		}
	}

	rcu_read_unlock();
	spin_unlock(&mm->ioctx_lock);
	return res;
}

static const struct vm_operations_struct aio_ring_vm_ops = {
	.mremap		= aio_ring_mremap,
#if IS_ENABLED(CONFIG_MMU)
	.fault		= filemap_fault,
	.map_pages	= filemap_map_pages,
	.page_mkwrite	= filemap_page_mkwrite,
#endif
};

static int aio_ring_mmap(struct file *file, struct vm_area_struct *vma)
{
	vma->vm_flags |= VM_DONTEXPAND;
	vma->vm_ops = &aio_ring_vm_ops;
	return 0;
}

static const struct file_operations aio_ring_fops = {
	.mmap = aio_ring_mmap,
};

#if IS_ENABLED(CONFIG_MIGRATION)
static int aio_migratepage(struct address_space *mapping, struct page *new,
			struct page *old, enum migrate_mode mode)
{
	struct kioctx *ctx;
	unsigned long flags;
	pgoff_t idx;
	int rc;

	/*
	 * We cannot support the _NO_COPY case here, because copy needs to
	 * happen under the ctx->completion_lock. That does not work with the
	 * migration workflow of MIGRATE_SYNC_NO_COPY.
	 */
	if (mode == MIGRATE_SYNC_NO_COPY)
		return -EINVAL;

	rc = 0;

	/* mapping->private_lock here protects against the kioctx teardown.  */
	spin_lock(&mapping->private_lock);
	ctx = mapping->private_data;
	if (!ctx) {
		rc = -EINVAL;
		goto out;
	}

	/* The ring_lock mutex.  This prevents aio_read_events() from writing
	 * to the ring's head, and prevents page migration from mucking in
	 * a partially initialized kioctx.
	 */
	if (!mutex_trylock(&ctx->ring_lock)) {
		rc = -EAGAIN;
		goto out;
	}

	idx = old->index;
	if (idx < (pgoff_t)ctx->nr_pages) {
		/* Make sure the old page hasn't already been changed */
		if (ctx->ring_pages[idx] != old)
			rc = -EAGAIN;
	} else
		rc = -EINVAL;

	if (rc != 0)
		goto out_unlock;

	/* Writeback must be complete */
	BUG_ON(PageWriteback(old));
	get_page(new);

	rc = migrate_page_move_mapping(mapping, new, old, NULL, mode, 1);
	if (rc != MIGRATEPAGE_SUCCESS) {
		put_page(new);
		goto out_unlock;
	}

	/* Take completion_lock to prevent other writes to the ring buffer
	 * while the old page is copied to the new.  This prevents new
	 * events from being lost.
	 */
	spin_lock_irqsave(&ctx->completion_lock, flags);
	migrate_page_copy(new, old);
	BUG_ON(ctx->ring_pages[idx] != old);
	ctx->ring_pages[idx] = new;
	spin_unlock_irqrestore(&ctx->completion_lock, flags);

	/* The old page is no longer accessible. */
	put_page(old);

out_unlock:
	mutex_unlock(&ctx->ring_lock);
out:
	spin_unlock(&mapping->private_lock);
	return rc;
}
#endif

static const struct address_space_operations aio_ctx_aops = {
	.set_page_dirty = __set_page_dirty_no_writeback,
#if IS_ENABLED(CONFIG_MIGRATION)
	.migratepage	= aio_migratepage,
#endif
};

static int aio_setup_ring(struct kioctx *ctx, unsigned int nr_events)
{
	struct aio_ring *ring;
	struct mm_struct *mm = current->mm;
	unsigned long size, unused;
	int nr_pages;
	int i;
	struct file *file;

	/* Compensate for the ring buffer's head/tail overlap entry */
	nr_events += 2;	/* 1 is required, 2 for good luck */

	size = sizeof(struct aio_ring);
	size += sizeof(struct io_event) * nr_events;

	nr_pages = PFN_UP(size);
	if (nr_pages < 0)
		return -EINVAL;

	file = aio_private_file(ctx, nr_pages);
	if (IS_ERR(file)) {
		ctx->aio_ring_file = NULL;
		return -ENOMEM;
	}

	ctx->aio_ring_file = file;
	nr_events = (PAGE_SIZE * nr_pages - sizeof(struct aio_ring))
			/ sizeof(struct io_event);

	ctx->ring_pages = ctx->internal_pages;
	if (nr_pages > AIO_RING_PAGES) {
		ctx->ring_pages = kcalloc(nr_pages, sizeof(struct page *),
					  GFP_KERNEL);
		if (!ctx->ring_pages) {
			put_aio_ring_file(ctx);
			return -ENOMEM;
		}
	}

	for (i = 0; i < nr_pages; i++) {
		struct page *page;
		page = find_or_create_page(file->f_mapping,
					   i, GFP_HIGHUSER | __GFP_ZERO);
		if (!page)
			break;
		pr_debug("pid(%d) page[%d]->count=%d\n",
			 current->pid, i, page_count(page));
		SetPageUptodate(page);
		unlock_page(page);

		ctx->ring_pages[i] = page;
	}
	ctx->nr_pages = i;

	if (unlikely(i != nr_pages)) {
		aio_free_ring(ctx);
		return -ENOMEM;
	}

	ctx->mmap_size = nr_pages * PAGE_SIZE;
	pr_debug("attempting mmap of %lu bytes\n", ctx->mmap_size);

	if (down_write_killable(&mm->mmap_sem)) {
		ctx->mmap_size = 0;
		aio_free_ring(ctx);
		return -EINTR;
	}

	ctx->mmap_base = do_mmap_pgoff(ctx->aio_ring_file, 0, ctx->mmap_size,
				       PROT_READ | PROT_WRITE,
				       MAP_SHARED, 0, &unused, NULL);
	up_write(&mm->mmap_sem);
	if (IS_ERR((void *)ctx->mmap_base)) {
		ctx->mmap_size = 0;
		aio_free_ring(ctx);
		return -ENOMEM;
	}

	pr_debug("mmap address: 0x%08lx\n", ctx->mmap_base);

	ctx->user_id = ctx->mmap_base;
	ctx->nr_events = nr_events; /* trusted copy */

	ring = kmap_atomic(ctx->ring_pages[0]);
	ring->nr = nr_events;	/* user copy */
	ring->id = ~0U;
	ring->head = ring->tail = 0;
	ring->magic = AIO_RING_MAGIC;
	ring->compat_features = AIO_RING_COMPAT_FEATURES;
	ring->incompat_features = AIO_RING_INCOMPAT_FEATURES;
	ring->header_length = sizeof(struct aio_ring);
	kunmap_atomic(ring);
	flush_dcache_page(ctx->ring_pages[0]);

	return 0;
}

#define AIO_EVENTS_PER_PAGE	(PAGE_SIZE / sizeof(struct io_event))
#define AIO_EVENTS_FIRST_PAGE	((PAGE_SIZE - sizeof(struct aio_ring)) / sizeof(struct io_event))
#define AIO_EVENTS_OFFSET	(AIO_EVENTS_PER_PAGE - AIO_EVENTS_FIRST_PAGE)
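/*
 * Note on the indexing helpers above: page 0 of the ring loses its first
 * AIO_EVENTS_OFFSET event slots to the struct aio_ring header, so both
 * aio_complete() and aio_read_events_ring() compute
 * pos = index + AIO_EVENTS_OFFSET and then use
 * ring_pages[pos / AIO_EVENTS_PER_PAGE] at offset pos % AIO_EVENTS_PER_PAGE.
 */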

void kiocb_set_cancel_fn(struct kiocb *iocb, kiocb_cancel_fn *cancel)
{
	struct aio_kiocb *req = container_of(iocb, struct aio_kiocb, rw);
	struct kioctx *ctx = req->ki_ctx;
	unsigned long flags;

	if (WARN_ON_ONCE(!list_empty(&req->ki_list)))
		return;

	spin_lock_irqsave(&ctx->ctx_lock, flags);
	list_add_tail(&req->ki_list, &ctx->active_reqs);
	req->ki_cancel = cancel;
	spin_unlock_irqrestore(&ctx->ctx_lock, flags);
}
EXPORT_SYMBOL(kiocb_set_cancel_fn);

/*
 * free_ioctx() should be RCU delayed to synchronize against the RCU
 * protected lookup_ioctx() and also needs process context to call
 * aio_free_ring().  Use rcu_work.
 */
static void free_ioctx(struct work_struct *work)
{
	struct kioctx *ctx = container_of(to_rcu_work(work), struct kioctx,
					  free_rwork);
	pr_debug("freeing %p\n", ctx);

	aio_free_ring(ctx);
	free_percpu(ctx->cpu);
	percpu_ref_exit(&ctx->reqs);
	percpu_ref_exit(&ctx->users);
	kmem_cache_free(kioctx_cachep, ctx);
}

static void free_ioctx_reqs(struct percpu_ref *ref)
{
	struct kioctx *ctx = container_of(ref, struct kioctx, reqs);

	/* At this point we know that there are no in-flight requests */
	if (ctx->rq_wait && atomic_dec_and_test(&ctx->rq_wait->count))
		complete(&ctx->rq_wait->comp);

	/* Synchronize against RCU protected table->table[] dereferences */
	INIT_RCU_WORK(&ctx->free_rwork, free_ioctx);
	queue_rcu_work(system_wq, &ctx->free_rwork);
}

/*
 * When this function runs, the kioctx has been removed from the "hash table"
 * and ctx->users has dropped to 0, so we know no more kiocbs can be submitted -
 * now it's safe to cancel any that need to be.
 */
static void free_ioctx_users(struct percpu_ref *ref)
{
	struct kioctx *ctx = container_of(ref, struct kioctx, users);
	struct aio_kiocb *req;

	spin_lock_irq(&ctx->ctx_lock);

	while (!list_empty(&ctx->active_reqs)) {
		req = list_first_entry(&ctx->active_reqs,
				       struct aio_kiocb, ki_list);
		req->ki_cancel(&req->rw);
		list_del_init(&req->ki_list);
	}

	spin_unlock_irq(&ctx->ctx_lock);

	percpu_ref_kill(&ctx->reqs);
	percpu_ref_put(&ctx->reqs);
}

static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm)
{
	unsigned i, new_nr;
	struct kioctx_table *table, *old;
	struct aio_ring *ring;

	spin_lock(&mm->ioctx_lock);
	table = rcu_dereference_raw(mm->ioctx_table);

	while (1) {
		if (table)
			for (i = 0; i < table->nr; i++)
				if (!rcu_access_pointer(table->table[i])) {
					ctx->id = i;
					rcu_assign_pointer(table->table[i], ctx);
					spin_unlock(&mm->ioctx_lock);

					/* While kioctx setup is in progress,
					 * we are protected from page migration
					 * changing ring_pages by ->ring_lock.
					 */
					ring = kmap_atomic(ctx->ring_pages[0]);
					ring->id = ctx->id;
					kunmap_atomic(ring);
					return 0;
				}

		new_nr = (table ? table->nr : 1) * 4;
		spin_unlock(&mm->ioctx_lock);

		table = kzalloc(sizeof(*table) + sizeof(struct kioctx *) *
				new_nr, GFP_KERNEL);
		if (!table)
			return -ENOMEM;

		table->nr = new_nr;

		spin_lock(&mm->ioctx_lock);
		old = rcu_dereference_raw(mm->ioctx_table);

		if (!old) {
			rcu_assign_pointer(mm->ioctx_table, table);
		} else if (table->nr > old->nr) {
			memcpy(table->table, old->table,
			       old->nr * sizeof(struct kioctx *));

			rcu_assign_pointer(mm->ioctx_table, table);
			kfree_rcu(old, rcu);
		} else {
			kfree(table);
			table = old;
		}
	}
}

static void aio_nr_sub(unsigned nr)
{
	spin_lock(&aio_nr_lock);
	if (WARN_ON(aio_nr - nr > aio_nr))
		aio_nr = 0;
	else
		aio_nr -= nr;
	spin_unlock(&aio_nr_lock);
}

/* ioctx_alloc
 *	Allocates and initializes an ioctx.  Returns an ERR_PTR if it failed.
 */
static struct kioctx *ioctx_alloc(unsigned nr_events)
{
	struct mm_struct *mm = current->mm;
	struct kioctx *ctx;
	int err = -ENOMEM;

	/*
	 * Store the original nr_events -- what userspace passed to io_setup(),
	 * for counting against the global limit -- before it changes.
	 */
	unsigned int max_reqs = nr_events;

	/*
	 * We keep track of the number of available ringbuffer slots, to prevent
	 * overflow (reqs_available), and we also use percpu counters for this.
	 *
	 * So since up to half the slots might be on other cpu's percpu counters
	 * and unavailable, double nr_events so userspace sees what they
	 * expected: additionally, we move req_batch slots to/from percpu
	 * counters at a time, so make sure that isn't 0:
	 */
	nr_events = max(nr_events, num_possible_cpus() * 4);
	nr_events *= 2;

	/* Prevent overflows */
	if (nr_events > (0x10000000U / sizeof(struct io_event))) {
		pr_debug("ENOMEM: nr_events too high\n");
		return ERR_PTR(-EINVAL);
	}

	if (!nr_events || (unsigned long)max_reqs > aio_max_nr)
		return ERR_PTR(-EAGAIN);

	ctx = kmem_cache_zalloc(kioctx_cachep, GFP_KERNEL);
	if (!ctx)
		return ERR_PTR(-ENOMEM);

	ctx->max_reqs = max_reqs;

	spin_lock_init(&ctx->ctx_lock);
	spin_lock_init(&ctx->completion_lock);
	mutex_init(&ctx->ring_lock);
	/* Protect against page migration throughout kioctx setup by keeping
	 * the ring_lock mutex held until setup is complete. */
	mutex_lock(&ctx->ring_lock);
	init_waitqueue_head(&ctx->wait);

	INIT_LIST_HEAD(&ctx->active_reqs);

	if (percpu_ref_init(&ctx->users, free_ioctx_users, 0, GFP_KERNEL))
		goto err;

	if (percpu_ref_init(&ctx->reqs, free_ioctx_reqs, 0, GFP_KERNEL))
		goto err;

	ctx->cpu = alloc_percpu(struct kioctx_cpu);
	if (!ctx->cpu)
		goto err;

	err = aio_setup_ring(ctx, nr_events);
	if (err < 0)
		goto err;

	atomic_set(&ctx->reqs_available, ctx->nr_events - 1);
	ctx->req_batch = (ctx->nr_events - 1) / (num_possible_cpus() * 4);
	if (ctx->req_batch < 1)
		ctx->req_batch = 1;

	/* limit the number of system wide aios */
	spin_lock(&aio_nr_lock);
	if (aio_nr + ctx->max_reqs > aio_max_nr ||
	    aio_nr + ctx->max_reqs < aio_nr) {
		spin_unlock(&aio_nr_lock);
		err = -EAGAIN;
		goto err_ctx;
	}
	aio_nr += ctx->max_reqs;
	spin_unlock(&aio_nr_lock);

	percpu_ref_get(&ctx->users);	/* io_setup() will drop this ref */
	percpu_ref_get(&ctx->reqs);	/* free_ioctx_users() will drop this */

	err = ioctx_add_table(ctx, mm);
	if (err)
		goto err_cleanup;

	/* Release the ring_lock mutex now that all setup is complete. */
	mutex_unlock(&ctx->ring_lock);

	pr_debug("allocated ioctx %p[%ld]: mm=%p mask=0x%x\n",
		 ctx, ctx->user_id, mm, ctx->nr_events);
	return ctx;

err_cleanup:
	aio_nr_sub(ctx->max_reqs);
err_ctx:
	atomic_set(&ctx->dead, 1);
	if (ctx->mmap_size)
		vm_munmap(ctx->mmap_base, ctx->mmap_size);
	aio_free_ring(ctx);
err:
	mutex_unlock(&ctx->ring_lock);
	free_percpu(ctx->cpu);
	percpu_ref_exit(&ctx->reqs);
	percpu_ref_exit(&ctx->users);
	kmem_cache_free(kioctx_cachep, ctx);
	pr_debug("error allocating ioctx %d\n", err);
	return ERR_PTR(err);
}

/* kill_ioctx
 *	Cancels all outstanding aio requests on an aio context.  Used
 *	when the processes owning a context have all exited to encourage
 *	the rapid destruction of the kioctx.
 */
static int kill_ioctx(struct mm_struct *mm, struct kioctx *ctx,
		      struct ctx_rq_wait *wait)
{
	struct kioctx_table *table;

	spin_lock(&mm->ioctx_lock);
	if (atomic_xchg(&ctx->dead, 1)) {
		spin_unlock(&mm->ioctx_lock);
		return -EINVAL;
	}

	table = rcu_dereference_raw(mm->ioctx_table);
	WARN_ON(ctx != rcu_access_pointer(table->table[ctx->id]));
	RCU_INIT_POINTER(table->table[ctx->id], NULL);
	spin_unlock(&mm->ioctx_lock);

	/* free_ioctx_reqs() will do the necessary RCU synchronization */
	wake_up_all(&ctx->wait);

	/*
	 * It'd be more correct to do this in free_ioctx(), after all
	 * the outstanding kiocbs have finished - but by then io_destroy
	 * has already returned, so io_setup() could potentially return
	 * -EAGAIN with no ioctxs actually in use (as far as userspace
	 *  could tell).
	 */
	aio_nr_sub(ctx->max_reqs);

	if (ctx->mmap_size)
		vm_munmap(ctx->mmap_base, ctx->mmap_size);

	ctx->rq_wait = wait;
	percpu_ref_kill(&ctx->users);
	return 0;
}

/*
 * exit_aio: called when the last user of mm goes away.  At this point, there is
 * no way for any new requests to be submitted or any of the io_* syscalls to be
 * called on the context.
 *
 * There may be outstanding kiocbs, but free_ioctx() will explicitly wait on
 * them.
 */
void exit_aio(struct mm_struct *mm)
{
	struct kioctx_table *table = rcu_dereference_raw(mm->ioctx_table);
	struct ctx_rq_wait wait;
	int i, skipped;

	if (!table)
		return;

	atomic_set(&wait.count, table->nr);
	init_completion(&wait.comp);

	skipped = 0;
	for (i = 0; i < table->nr; ++i) {
		struct kioctx *ctx =
			rcu_dereference_protected(table->table[i], true);

		if (!ctx) {
			skipped++;
			continue;
		}

		/*
		 * We don't need to bother with munmap() here - exit_mmap(mm)
		 * is coming and it'll unmap everything. And we simply can't,
		 * this is not necessarily our ->mm.
		 * Since kill_ioctx() uses non-zero ->mmap_size as indicator
		 * that it needs to unmap the area, just set it to 0.
		 */
		ctx->mmap_size = 0;
		kill_ioctx(mm, ctx, &wait);
	}

	if (!atomic_sub_and_test(skipped, &wait.count)) {
		/* Wait until all IO for the context is done. */
		wait_for_completion(&wait.comp);
	}

	RCU_INIT_POINTER(mm->ioctx_table, NULL);
	kfree(table);
}

static void put_reqs_available(struct kioctx *ctx, unsigned nr)
{
	struct kioctx_cpu *kcpu;
	unsigned long flags;

	local_irq_save(flags);
	kcpu = this_cpu_ptr(ctx->cpu);
	kcpu->reqs_available += nr;

	while (kcpu->reqs_available >= ctx->req_batch * 2) {
		kcpu->reqs_available -= ctx->req_batch;
		atomic_add(ctx->req_batch, &ctx->reqs_available);
	}

	local_irq_restore(flags);
}

static bool get_reqs_available(struct kioctx *ctx)
{
	struct kioctx_cpu *kcpu;
	bool ret = false;
	unsigned long flags;

	local_irq_save(flags);
	kcpu = this_cpu_ptr(ctx->cpu);
	if (!kcpu->reqs_available) {
		int old, avail = atomic_read(&ctx->reqs_available);

		do {
			if (avail < ctx->req_batch)
				goto out;

			old = avail;
			avail = atomic_cmpxchg(&ctx->reqs_available,
					       avail, avail - ctx->req_batch);
		} while (avail != old);

		kcpu->reqs_available += ctx->req_batch;
	}

	ret = true;
	kcpu->reqs_available--;
out:
	local_irq_restore(flags);
	return ret;
}
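/*
 * Note on the batching above: each CPU takes ctx->req_batch slots from the
 * global ctx->reqs_available at a time (get_reqs_available) and only hands
 * slots back once its local count reaches req_batch * 2 (put_reqs_available).
 * For example, with req_batch == 4 a CPU grabs 4 global slots at once and
 * returns 4 of them only after it has accumulated 8 locally.
 */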

/* refill_reqs_available
 *	Updates the reqs_available reference counts used for tracking the
 *	number of free slots in the completion ring.  This can be called
 *	from aio_complete() (to optimistically update reqs_available) or
 *	from aio_get_req() (the "we're out of events" case).  It must be
 *	called holding ctx->completion_lock.
 */
static void refill_reqs_available(struct kioctx *ctx, unsigned head,
                                  unsigned tail)
{
	unsigned events_in_ring, completed;

	/* Clamp head since userland can write to it. */
	head %= ctx->nr_events;
	if (head <= tail)
		events_in_ring = tail - head;
	else
		events_in_ring = ctx->nr_events - (head - tail);

	completed = ctx->completed_events;
	if (events_in_ring < completed)
		completed -= events_in_ring;
	else
		completed = 0;

	if (!completed)
		return;

	ctx->completed_events -= completed;
	put_reqs_available(ctx, completed);
}
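/*
 * Worked example (illustrative numbers): with nr_events == 128, head == 120
 * and tail == 8, events_in_ring = 128 - (120 - 8) = 16; if completed_events
 * was 20, then 20 - 16 = 4 slots are handed back via put_reqs_available().
 */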

/* user_refill_reqs_available
 *	Called to refill reqs_available when aio_get_req() runs out of
 *	space in the completion ring.
 */
static void user_refill_reqs_available(struct kioctx *ctx)
{
	spin_lock_irq(&ctx->completion_lock);
	if (ctx->completed_events) {
		struct aio_ring *ring;
		unsigned head;

		/* Access of ring->head may race with aio_read_events_ring()
		 * here, but that's okay: whether we read the old version
		 * or the new version, either will be valid.  The important
		 * part is that head cannot pass tail since we prevent
		 * aio_complete() from updating tail by holding
		 * ctx->completion_lock.  Even if head is invalid, the check
		 * against ctx->completed_events below will make sure we do the
		 * safe/right thing.
		 */
		ring = kmap_atomic(ctx->ring_pages[0]);
		head = ring->head;
		kunmap_atomic(ring);

		refill_reqs_available(ctx, head, ctx->tail);
	}

	spin_unlock_irq(&ctx->completion_lock);
}

/* aio_get_req
 *	Allocate a slot for an aio request.
 * Returns NULL if no requests are free.
 */
static inline struct aio_kiocb *aio_get_req(struct kioctx *ctx)
{
	struct aio_kiocb *req;

	if (!get_reqs_available(ctx)) {
		user_refill_reqs_available(ctx);
		if (!get_reqs_available(ctx))
			return NULL;
	}

	req = kmem_cache_alloc(kiocb_cachep, GFP_KERNEL|__GFP_ZERO);
	if (unlikely(!req))
		goto out_put;

	percpu_ref_get(&ctx->reqs);
	INIT_LIST_HEAD(&req->ki_list);
	req->ki_ctx = ctx;
	return req;
out_put:
	put_reqs_available(ctx, 1);
	return NULL;
}

static struct kioctx *lookup_ioctx(unsigned long ctx_id)
{
	struct aio_ring __user *ring  = (void __user *)ctx_id;
	struct mm_struct *mm = current->mm;
	struct kioctx *ctx, *ret = NULL;
	struct kioctx_table *table;
	unsigned id;

	if (get_user(id, &ring->id))
		return NULL;

	rcu_read_lock();
	table = rcu_dereference(mm->ioctx_table);

	if (!table || id >= table->nr)
		goto out;

	ctx = rcu_dereference(table->table[id]);
	if (ctx && ctx->user_id == ctx_id) {
		if (percpu_ref_tryget_live(&ctx->users))
			ret = ctx;
	}
out:
	rcu_read_unlock();
	return ret;
}

/* aio_complete
 *	Called when the io request on the given iocb is complete.
 */
static void aio_complete(struct aio_kiocb *iocb, long res, long res2)
{
	struct kioctx	*ctx = iocb->ki_ctx;
	struct aio_ring	*ring;
	struct io_event	*ev_page, *event;
	unsigned tail, pos, head;
	unsigned long	flags;

	/*
	 * Add a completion event to the ring buffer. Must be done holding
	 * ctx->completion_lock to prevent other code from messing with the tail
	 * pointer since we might be called from irq context.
	 */
	spin_lock_irqsave(&ctx->completion_lock, flags);

	tail = ctx->tail;
	pos = tail + AIO_EVENTS_OFFSET;

	if (++tail >= ctx->nr_events)
		tail = 0;

	ev_page = kmap_atomic(ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]);
	event = ev_page + pos % AIO_EVENTS_PER_PAGE;

	event->obj = (u64)(unsigned long)iocb->ki_user_iocb;
	event->data = iocb->ki_user_data;
	event->res = res;
	event->res2 = res2;

	kunmap_atomic(ev_page);
	flush_dcache_page(ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]);

	pr_debug("%p[%u]: %p: %p %Lx %lx %lx\n",
		 ctx, tail, iocb, iocb->ki_user_iocb, iocb->ki_user_data,
		 res, res2);

	/* after flagging the request as done, we
	 * must never even look at it again
	 */
	smp_wmb();	/* make event visible before updating tail */

	ctx->tail = tail;

	ring = kmap_atomic(ctx->ring_pages[0]);
	head = ring->head;
	ring->tail = tail;
	kunmap_atomic(ring);
	flush_dcache_page(ctx->ring_pages[0]);

	ctx->completed_events++;
	if (ctx->completed_events > 1)
		refill_reqs_available(ctx, head, tail);
	spin_unlock_irqrestore(&ctx->completion_lock, flags);

	pr_debug("added to ring %p at [%u]\n", iocb, tail);

	/*
	 * Check if the user asked us to deliver the result through an
	 * eventfd. The eventfd_signal() function is safe to be called
	 * from IRQ context.
	 */
	if (iocb->ki_eventfd) {
		eventfd_signal(iocb->ki_eventfd, 1);
		eventfd_ctx_put(iocb->ki_eventfd);
	}

	kmem_cache_free(kiocb_cachep, iocb);

	/*
	 * We have to order our ring_info tail store above and test
	 * of the wait list below outside the wait lock.  This is
	 * like in wake_up_bit() where clearing a bit has to be
	 * ordered with the unlocked test.
	 */
	smp_mb();

	if (waitqueue_active(&ctx->wait))
		wake_up(&ctx->wait);

	percpu_ref_put(&ctx->reqs);
}

/* aio_read_events_ring
 *	Pull an event off of the ioctx's event ring.  Returns the number of
 *	events fetched
 */
static long aio_read_events_ring(struct kioctx *ctx,
				 struct io_event __user *event, long nr)
{
	struct aio_ring *ring;
	unsigned head, tail, pos;
	long ret = 0;
	int copy_ret;

	/*
	 * The mutex can block and wake us up and that will cause
	 * wait_event_interruptible_hrtimeout() to schedule without sleeping
	 * and repeat. This should be rare enough that it doesn't cause
	 * performance issues. See the comment in read_events() for more detail.
	 */
	sched_annotate_sleep();
	mutex_lock(&ctx->ring_lock);

	/* Access to ->ring_pages here is protected by ctx->ring_lock. */
	ring = kmap_atomic(ctx->ring_pages[0]);
	head = ring->head;
	tail = ring->tail;
	kunmap_atomic(ring);

	/*
	 * Ensure that once we've read the current tail pointer, we also
	 * see the events that were stored up to the tail.
	 */
	smp_rmb();

	pr_debug("h%u t%u m%u\n", head, tail, ctx->nr_events);

	if (head == tail)
		goto out;

	head %= ctx->nr_events;
	tail %= ctx->nr_events;

	while (ret < nr) {
		long avail;
		struct io_event *ev;
		struct page *page;

		avail = (head <= tail ?  tail : ctx->nr_events) - head;
		if (head == tail)
			break;

		pos = head + AIO_EVENTS_OFFSET;
		page = ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE];
		pos %= AIO_EVENTS_PER_PAGE;

		avail = min(avail, nr - ret);
		avail = min_t(long, avail, AIO_EVENTS_PER_PAGE - pos);

		ev = kmap(page);
		copy_ret = copy_to_user(event + ret, ev + pos,
					sizeof(*ev) * avail);
		kunmap(page);

		if (unlikely(copy_ret)) {
			ret = -EFAULT;
			goto out;
		}

		ret += avail;
		head += avail;
		head %= ctx->nr_events;
	}

	ring = kmap_atomic(ctx->ring_pages[0]);
	ring->head = head;
	kunmap_atomic(ring);
	flush_dcache_page(ctx->ring_pages[0]);

	pr_debug("%li  h%u t%u\n", ret, head, tail);
out:
	mutex_unlock(&ctx->ring_lock);

	return ret;
}

static bool aio_read_events(struct kioctx *ctx, long min_nr, long nr,
			    struct io_event __user *event, long *i)
{
	long ret = aio_read_events_ring(ctx, event + *i, nr - *i);

	if (ret > 0)
		*i += ret;

	if (unlikely(atomic_read(&ctx->dead)))
		ret = -EINVAL;

	if (!*i)
		*i = ret;

	return ret < 0 || *i >= min_nr;
}

static long read_events(struct kioctx *ctx, long min_nr, long nr,
			struct io_event __user *event,
			ktime_t until)
{
	long ret = 0;

	/*
	 * Note that aio_read_events() is being called as the conditional - i.e.
	 * we're calling it after prepare_to_wait() has set task state to
	 * TASK_INTERRUPTIBLE.
	 *
	 * But aio_read_events() can block, and if it blocks it's going to flip
	 * the task state back to TASK_RUNNING.
	 *
	 * This should be ok, provided it doesn't flip the state back to
	 * TASK_RUNNING and return 0 too much - that causes us to spin. That
	 * will only happen if the mutex_lock() call blocks, and we then find
	 * the ringbuffer empty. So in practice we should be ok, but it's
	 * something to be aware of when touching this code.
	 */
	if (until == 0)
		aio_read_events(ctx, min_nr, nr, event, &ret);
	else
		wait_event_interruptible_hrtimeout(ctx->wait,
				aio_read_events(ctx, min_nr, nr, event, &ret),
				until);
	return ret;
}

/* sys_io_setup:
 *	Create an aio_context capable of receiving at least nr_events.
 *	ctxp must not point to an aio_context that already exists, and
 *	must be initialized to 0 prior to the call.  On successful
 *	creation of the aio_context, *ctxp is filled in with the resulting 
 *	handle.  May fail with -EINVAL if *ctxp is not initialized,
 *	if the specified nr_events exceeds internal limits.  May fail 
 *	with -EAGAIN if the specified nr_events exceeds the user's limit 
 *	of available events.  May fail with -ENOMEM if insufficient kernel
 *	resources are available.  May fail with -EFAULT if an invalid
 *	pointer is passed for ctxp.  Will fail with -ENOSYS if not
 *	implemented.
 */
SYSCALL_DEFINE2(io_setup, unsigned, nr_events, aio_context_t __user *, ctxp)
{
	struct kioctx *ioctx = NULL;
	unsigned long ctx;
	long ret;

	ret = get_user(ctx, ctxp);
	if (unlikely(ret))
		goto out;

	ret = -EINVAL;
	if (unlikely(ctx || nr_events == 0)) {
		pr_debug("EINVAL: ctx %lu nr_events %u\n",
		         ctx, nr_events);
		goto out;
	}

	ioctx = ioctx_alloc(nr_events);
	ret = PTR_ERR(ioctx);
	if (!IS_ERR(ioctx)) {
		ret = put_user(ioctx->user_id, ctxp);
		if (ret)
			kill_ioctx(current->mm, ioctx, NULL);
		percpu_ref_put(&ioctx->users);
	}

out:
	return ret;
}

#ifdef CONFIG_COMPAT
COMPAT_SYSCALL_DEFINE2(io_setup, unsigned, nr_events, u32 __user *, ctx32p)
{
	struct kioctx *ioctx = NULL;
	unsigned long ctx;
	long ret;

	ret = get_user(ctx, ctx32p);
	if (unlikely(ret))
		goto out;

	ret = -EINVAL;
	if (unlikely(ctx || nr_events == 0)) {
		pr_debug("EINVAL: ctx %lu nr_events %u\n",
		         ctx, nr_events);
		goto out;
	}

	ioctx = ioctx_alloc(nr_events);
	ret = PTR_ERR(ioctx);
	if (!IS_ERR(ioctx)) {
		/* truncating is ok because it's a user address */
		ret = put_user((u32)ioctx->user_id, ctx32p);
		if (ret)
			kill_ioctx(current->mm, ioctx, NULL);
		percpu_ref_put(&ioctx->users);
	}

out:
	return ret;
}
#endif

/* sys_io_destroy:
 *	Destroy the aio_context specified.  May cancel any outstanding
 *	AIOs and block on completion.  Will fail with -ENOSYS if not
 *	implemented.  May fail with -EINVAL if the context pointed to
 *	is invalid.
 */
SYSCALL_DEFINE1(io_destroy, aio_context_t, ctx)
{
	struct kioctx *ioctx = lookup_ioctx(ctx);
	if (likely(NULL != ioctx)) {
		struct ctx_rq_wait wait;
		int ret;

		init_completion(&wait.comp);
		atomic_set(&wait.count, 1);

		/* Pass requests_done to kill_ioctx() where it can be set
		 * in a thread-safe way. If we try to set it here then we have
		 * a race condition if two io_destroy() calls are made
		 * simultaneously.
		 */
		ret = kill_ioctx(current->mm, ioctx, &wait);
		percpu_ref_put(&ioctx->users);

		/* Wait until all IO for the context is done. Otherwise the
		 * kernel keeps using user-space buffers even if the user
		 * thinks the context is destroyed.
		 */
		if (!ret)
			wait_for_completion(&wait.comp);

		return ret;
	}
	pr_debug("EINVAL: invalid context id\n");
	return -EINVAL;
}

static void aio_remove_iocb(struct aio_kiocb *iocb)
{
	struct kioctx *ctx = iocb->ki_ctx;
	unsigned long flags;

	spin_lock_irqsave(&ctx->ctx_lock, flags);
	list_del(&iocb->ki_list);
	spin_unlock_irqrestore(&ctx->ctx_lock, flags);
}

static void aio_complete_rw(struct kiocb *kiocb, long res, long res2)
{
	struct aio_kiocb *iocb = container_of(kiocb, struct aio_kiocb, rw);

	if (!list_empty_careful(&iocb->ki_list))
		aio_remove_iocb(iocb);

	if (kiocb->ki_flags & IOCB_WRITE) {
		struct inode *inode = file_inode(kiocb->ki_filp);

		/*
		 * Tell lockdep we inherited freeze protection from submission
		 * thread.
		 */
		if (S_ISREG(inode->i_mode))
			__sb_writers_acquired(inode->i_sb, SB_FREEZE_WRITE);
		file_end_write(kiocb->ki_filp);
	}

	fput(kiocb->ki_filp);
	aio_complete(iocb, res, res2);
}

static int aio_prep_rw(struct kiocb *req, struct iocb *iocb)
{
	int ret;

	req->ki_filp = fget(iocb->aio_fildes);
	if (unlikely(!req->ki_filp))
		return -EBADF;
	req->ki_complete = aio_complete_rw;
	req->ki_pos = iocb->aio_offset;
	req->ki_flags = iocb_flags(req->ki_filp);
	if (iocb->aio_flags & IOCB_FLAG_RESFD)
		req->ki_flags |= IOCB_EVENTFD;
	req->ki_hint = ki_hint_validate(file_write_hint(req->ki_filp));
	if (iocb->aio_flags & IOCB_FLAG_IOPRIO) {
		/*
		 * If the IOCB_FLAG_IOPRIO flag of aio_flags is set, then
		 * aio_reqprio is interpreted as an I/O scheduling
		 * class and priority.
		 */
		ret = ioprio_check_cap(iocb->aio_reqprio);
		if (ret) {
			pr_debug("aio ioprio check cap error: %d\n", ret);
			return ret;
		}

		req->ki_ioprio = iocb->aio_reqprio;
	} else
		req->ki_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_NONE, 0);

	ret = kiocb_set_rw_flags(req, iocb->aio_rw_flags);
	if (unlikely(ret))
		fput(req->ki_filp);
	return ret;
}

static int aio_setup_rw(int rw, struct iocb *iocb, struct iovec **iovec,
		bool vectored, bool compat, struct iov_iter *iter)
{
	void __user *buf = (void __user *)(uintptr_t)iocb->aio_buf;
	size_t len = iocb->aio_nbytes;

	if (!vectored) {
		ssize_t ret = import_single_range(rw, buf, len, *iovec, iter);
		*iovec = NULL;
		return ret;
	}
#ifdef CONFIG_COMPAT
	if (compat)
		return compat_import_iovec(rw, buf, len, UIO_FASTIOV, iovec,
				iter);
#endif
	return import_iovec(rw, buf, len, UIO_FASTIOV, iovec, iter);
}

static inline void aio_rw_done(struct kiocb *req, ssize_t ret)
{
	switch (ret) {
	case -EIOCBQUEUED:
		break;
	case -ERESTARTSYS:
	case -ERESTARTNOINTR:
	case -ERESTARTNOHAND:
	case -ERESTART_RESTARTBLOCK:
		/*
		 * There's no easy way to restart the syscall since other AIO's
		 * may be already running. Just fail this IO with EINTR.
		 */
		ret = -EINTR;
		/*FALLTHRU*/
	default:
		aio_complete_rw(req, ret, 0);
	}
}

static ssize_t aio_read(struct kiocb *req, struct iocb *iocb, bool vectored,
		bool compat)
{
	struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
	struct iov_iter iter;
	struct file *file;
	ssize_t ret;

	ret = aio_prep_rw(req, iocb);
	if (ret)
		return ret;
	file = req->ki_filp;

	ret = -EBADF;
	if (unlikely(!(file->f_mode & FMODE_READ)))
		goto out_fput;
	ret = -EINVAL;
	if (unlikely(!file->f_op->read_iter))
		goto out_fput;

	ret = aio_setup_rw(READ, iocb, &iovec, vectored, compat, &iter);
	if (ret)
		goto out_fput;
	ret = rw_verify_area(READ, file, &req->ki_pos, iov_iter_count(&iter));
	if (!ret)
		aio_rw_done(req, call_read_iter(file, req, &iter));
	kfree(iovec);
out_fput:
	if (unlikely(ret))
		fput(file);
	return ret;
}

static ssize_t aio_write(struct kiocb *req, struct iocb *iocb, bool vectored,
		bool compat)
{
	struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
	struct iov_iter iter;
	struct file *file;
	ssize_t ret;

	ret = aio_prep_rw(req, iocb);
	if (ret)
		return ret;
	file = req->ki_filp;

	ret = -EBADF;
	if (unlikely(!(file->f_mode & FMODE_WRITE)))
		goto out_fput;
	ret = -EINVAL;
	if (unlikely(!file->f_op->write_iter))
		goto out_fput;

	ret = aio_setup_rw(WRITE, iocb, &iovec, vectored, compat, &iter);
	if (ret)
		goto out_fput;
	ret = rw_verify_area(WRITE, file, &req->ki_pos, iov_iter_count(&iter));
	if (!ret) {
		/*
		 * Open-code file_start_write here to grab freeze protection,
		 * which will be released by another thread in
		 * aio_complete_rw().  Fool lockdep by telling it the lock got
		 * released so that it doesn't complain about the held lock when
		 * we return to userspace.
		 */
		if (S_ISREG(file_inode(file)->i_mode)) {
			__sb_start_write(file_inode(file)->i_sb, SB_FREEZE_WRITE, true);
			__sb_writers_release(file_inode(file)->i_sb, SB_FREEZE_WRITE);
		}
		req->ki_flags |= IOCB_WRITE;
		aio_rw_done(req, call_write_iter(file, req, &iter));
	}
	kfree(iovec);
out_fput:
	if (unlikely(ret))
		fput(file);
	return ret;
}

static void aio_fsync_work(struct work_struct *work)
{
	struct fsync_iocb *req = container_of(work, struct fsync_iocb, work);
	int ret;

	ret = vfs_fsync(req->file, req->datasync);
	fput(req->file);
	aio_complete(container_of(req, struct aio_kiocb, fsync), ret, 0);
}

static int aio_fsync(struct fsync_iocb *req, struct iocb *iocb, bool datasync)
{
	if (unlikely(iocb->aio_buf || iocb->aio_offset || iocb->aio_nbytes ||
			iocb->aio_rw_flags))
		return -EINVAL;
	req->file = fget(iocb->aio_fildes);
	if (unlikely(!req->file))
		return -EBADF;
	if (unlikely(!req->file->f_op->fsync)) {
		fput(req->file);
		return -EINVAL;
	}

	req->datasync = datasync;
	INIT_WORK(&req->work, aio_fsync_work);
	schedule_work(&req->work);
	return 0;
}
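/*
 * Note: IOCB_CMD_FSYNC/IOCB_CMD_FDSYNC are completed from aio_fsync_work()
 * above because vfs_fsync() may block; aio_fsync() itself only queues the
 * work and returns.
 */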

/* need to use list_del_init so we can check if item was present */
static inline bool __aio_poll_remove(struct poll_iocb *req)
{
	if (list_empty(&req->wait.entry))
		return false;
	list_del_init(&req->wait.entry);
	return true;
}

static inline void __aio_poll_complete(struct aio_kiocb *iocb, __poll_t mask)
{
	fput(iocb->poll.file);
	aio_complete(iocb, mangle_poll(mask), 0);
}

static void aio_poll_work(struct work_struct *work)
{
	struct aio_kiocb *iocb = container_of(work, struct aio_kiocb, poll.work);

	if (!list_empty_careful(&iocb->ki_list))
		aio_remove_iocb(iocb);
	__aio_poll_complete(iocb, iocb->poll.events);
}

static int aio_poll_cancel(struct kiocb *iocb)
{
	struct aio_kiocb *aiocb = container_of(iocb, struct aio_kiocb, rw);
	struct poll_iocb *req = &aiocb->poll;
	struct wait_queue_head *head = req->head;
	bool found = false;

	spin_lock(&head->lock);
	found = __aio_poll_remove(req);
	spin_unlock(&head->lock);

	if (found) {
		req->events = 0;
		INIT_WORK(&req->work, aio_poll_work);
		schedule_work(&req->work);
	}
	return 0;
}

static int aio_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
		void *key)
{
	struct poll_iocb *req = container_of(wait, struct poll_iocb, wait);
	struct aio_kiocb *iocb = container_of(req, struct aio_kiocb, poll);
	struct file *file = req->file;
	__poll_t mask = key_to_poll(key);

	assert_spin_locked(&req->head->lock);

	/* for instances that support it check for an event match first: */
	if (mask && !(mask & req->events))
		return 0;

	mask = file->f_op->poll_mask(file, req->events) & req->events;
	if (!mask)
		return 0;

	__aio_poll_remove(req);

	/*
	 * Try completing without a context switch if we can acquire ctx_lock
	 * without spinning.  Otherwise we need to defer to a workqueue to
	 * avoid a deadlock due to the lock order.
	 */
	if (spin_trylock(&iocb->ki_ctx->ctx_lock)) {
		list_del_init(&iocb->ki_list);
		spin_unlock(&iocb->ki_ctx->ctx_lock);

		__aio_poll_complete(iocb, mask);
	} else {
		req->events = mask;
		INIT_WORK(&req->work, aio_poll_work);
		schedule_work(&req->work);
	}

	return 1;
}

static ssize_t aio_poll(struct aio_kiocb *aiocb, struct iocb *iocb)
{
	struct kioctx *ctx = aiocb->ki_ctx;
	struct poll_iocb *req = &aiocb->poll;
	__poll_t mask;

	/* reject any unknown events outside the normal event mask. */
	if ((u16)iocb->aio_buf != iocb->aio_buf)
		return -EINVAL;
	/* reject fields that are not defined for poll */
	if (iocb->aio_offset || iocb->aio_nbytes || iocb->aio_rw_flags)
		return -EINVAL;

	req->events = demangle_poll(iocb->aio_buf) | EPOLLERR | EPOLLHUP;
	req->file = fget(iocb->aio_fildes);
	if (unlikely(!req->file))
		return -EBADF;
	if (!file_has_poll_mask(req->file))
		goto out_fail;

	req->head = req->file->f_op->get_poll_head(req->file, req->events);
	if (!req->head)
		goto out_fail;
	if (IS_ERR(req->head)) {
		mask = EPOLLERR;
		goto done;
	}

	init_waitqueue_func_entry(&req->wait, aio_poll_wake);
	aiocb->ki_cancel = aio_poll_cancel;

	spin_lock_irq(&ctx->ctx_lock);
	spin_lock(&req->head->lock);
	mask = req->file->f_op->poll_mask(req->file, req->events) & req->events;
	if (!mask) {
		__add_wait_queue(req->head, &req->wait);
		list_add_tail(&aiocb->ki_list, &ctx->active_reqs);
	}
	spin_unlock(&req->head->lock);
	spin_unlock_irq(&ctx->ctx_lock);
done:
	if (mask)
		__aio_poll_complete(aiocb, mask);
	return 0;
out_fail:
	fput(req->file);
	return -EINVAL; /* same as no support for IOCB_CMD_POLL */
}
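
/*
 * Illustrative userspace sketch of submitting an IOCB_CMD_POLL request
 * (not kernel code; fd and the io_submit() call are assumed from the
 * caller's context).  The requested poll events travel in aio_buf, as
 * decoded by demangle_poll() above:
 *
 *	struct iocb cb = { 0 };
 *	cb.aio_fildes     = fd;
 *	cb.aio_lio_opcode = IOCB_CMD_POLL;
 *	cb.aio_buf        = POLLIN;
 *	// io_submit() queues it; the completion event's res field reports
 *	// the signalled poll mask.
 */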

static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
			 bool compat)
{
	struct aio_kiocb *req;
	struct iocb iocb;
	ssize_t ret;

	if (unlikely(copy_from_user(&iocb, user_iocb, sizeof(iocb))))
		return -EFAULT;

	/* enforce forwards compatibility on users */
	if (unlikely(iocb.aio_reserved2)) {
		pr_debug("EINVAL: reserve field set\n");
		return -EINVAL;
	}

	/* prevent overflows */
	if (unlikely(
	    (iocb.aio_buf != (unsigned long)iocb.aio_buf) ||
	    (iocb.aio_nbytes != (size_t)iocb.aio_nbytes) ||
	    ((ssize_t)iocb.aio_nbytes < 0)
	   )) {
		pr_debug("EINVAL: overflow check\n");
		return -EINVAL;
	}

	req = aio_get_req(ctx);
	if (unlikely(!req))
		return -EAGAIN;

	if (iocb.aio_flags & IOCB_FLAG_RESFD) {
		/*
		 * If the IOCB_FLAG_RESFD flag of aio_flags is set, get an
		 * instance of the file* now. The file descriptor must be
		 * an eventfd() fd, and will be signaled for each completed
		 * event using the eventfd_signal() function.
		 */
		req->ki_eventfd = eventfd_ctx_fdget((int) iocb.aio_resfd);
		if (IS_ERR(req->ki_eventfd)) {
			ret = PTR_ERR(req->ki_eventfd);
			req->ki_eventfd = NULL;
			goto out_put_req;
		}
	}
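	/*
	 * Illustrative userspace sketch of the IOCB_FLAG_RESFD path handled
	 * above (not kernel code; only the <linux/aio_abi.h> fields used
	 * here are assumed):
	 *
	 *	int efd = eventfd(0, 0);
	 *	struct iocb cb = { 0 };
	 *	cb.aio_lio_opcode = IOCB_CMD_PREAD;
	 *	cb.aio_flags      = IOCB_FLAG_RESFD;
	 *	cb.aio_resfd      = efd;
	 *	// After io_submit(), the eventfd counter is bumped once per
	 *	// completed iocb, so a read() of efd returns how many have
	 *	// completed since the last read.
	 */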

	ret = put_user(KIOCB_KEY, &user_iocb->aio_key);
	if (unlikely(ret)) {
		pr_debug("EFAULT: aio_key\n");
		goto out_put_req;
	}

	req->ki_user_iocb = user_iocb;
	req->ki_user_data = iocb.aio_data;

	switch (iocb.aio_lio_opcode) {
	case IOCB_CMD_PREAD:
		ret = aio_read(&req->rw, &iocb, false, compat);
		break;
	case IOCB_CMD_PWRITE:
		ret = aio_write(&req->rw, &iocb, false, compat);
		break;
	case IOCB_CMD_PREADV:
		ret = aio_read(&req->rw, &iocb, true, compat);
		break;
	case IOCB_CMD_PWRITEV:
		ret = aio_write(&req->rw, &iocb, true, compat);
		break;
	case IOCB_CMD_FSYNC:
		ret = aio_fsync(&req->fsync, &iocb, false);
		break;
	case IOCB_CMD_FDSYNC:
		ret = aio_fsync(&req->fsync, &iocb, true);
		break;
	case IOCB_CMD_POLL:
		ret = aio_poll(req, &iocb);
		break;
	default:
		pr_debug("invalid aio operation %d\n", iocb.aio_lio_opcode);
		ret = -EINVAL;
		break;
	}

	/*
	 * If ret is 0, we have either done aio_complete() ourselves or have
	 * arranged for that to be done asynchronously.  Anything non-zero
	 * means that we need to destroy req ourselves.
	 */
	if (ret)
		goto out_put_req;
	return 0;
out_put_req:
	put_reqs_available(ctx, 1);
	percpu_ref_put(&ctx->reqs);
	if (req->ki_eventfd)
		eventfd_ctx_put(req->ki_eventfd);
	kmem_cache_free(kiocb_cachep, req);
	return ret;
}

/* sys_io_submit:
 *	Queue the nr iocbs pointed to by iocbpp for processing.  Returns
 *	the number of iocbs queued.  May return -EINVAL if the aio_context
 *	specified by ctx_id is invalid, if nr is < 0, if the iocb at
 *	*iocbpp[0] is not properly initialized, or if the operation specified
 *	is invalid for the file descriptor in the iocb.  May fail with
 *	-EFAULT if any of the data structures point to invalid data.  May
 *	fail with -EBADF if the file descriptor specified in the first
 *	iocb is invalid.  May fail with -EAGAIN if insufficient resources
 *	are available to queue any iocbs.  Will return 0 if nr is 0.  Will
 *	fail with -ENOSYS if not implemented.
 */
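/*
 * Illustrative userspace sketch (not kernel code; assumes raw syscall(2)
 * wrappers for __NR_io_setup/__NR_io_submit, the struct iocb layout from
 * <linux/aio_abi.h>, and a caller-provided fd and buf):
 *
 *	aio_context_t ctx = 0;
 *	struct iocb cb = { 0 }, *cbs[1] = { &cb };
 *
 *	io_setup(128, &ctx);
 *	cb.aio_fildes     = fd;
 *	cb.aio_lio_opcode = IOCB_CMD_PREAD;
 *	cb.aio_buf        = (__u64)(unsigned long)buf;
 *	cb.aio_nbytes     = 4096;	// size of buf (assumed)
 *	cb.aio_offset     = 0;
 *	io_submit(ctx, 1, cbs);		// returns 1 when the iocb is queued
 */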
SYSCALL_DEFINE3(io_submit, aio_context_t, ctx_id, long, nr,
		struct iocb __user * __user *, iocbpp)
{
	struct kioctx *ctx;
	long ret = 0;
	int i = 0;
	struct blk_plug plug;

	if (unlikely(nr < 0))
		return -EINVAL;

	ctx = lookup_ioctx(ctx_id);
	if (unlikely(!ctx)) {
		pr_debug("EINVAL: invalid context id\n");
		return -EINVAL;
	}

	if (nr > ctx->nr_events)
		nr = ctx->nr_events;

	blk_start_plug(&plug);
	for (i = 0; i < nr; i++) {
		struct iocb __user *user_iocb;

		if (unlikely(get_user(user_iocb, iocbpp + i))) {
			ret = -EFAULT;
			break;
		}

		ret = io_submit_one(ctx, user_iocb, false);
		if (ret)
			break;
	}
	blk_finish_plug(&plug);

	percpu_ref_put(&ctx->users);
	return i ? i : ret;
}

#ifdef CONFIG_COMPAT
COMPAT_SYSCALL_DEFINE3(io_submit, compat_aio_context_t, ctx_id,
		       int, nr, compat_uptr_t __user *, iocbpp)
{
	struct kioctx *ctx;
	long ret = 0;
	int i = 0;
	struct blk_plug plug;

	if (unlikely(nr < 0))
		return -EINVAL;

	ctx = lookup_ioctx(ctx_id);
	if (unlikely(!ctx)) {
		pr_debug("EINVAL: invalid context id\n");
		return -EINVAL;
	}

	if (nr > ctx->nr_events)
		nr = ctx->nr_events;

	blk_start_plug(&plug);
	for (i = 0; i < nr; i++) {
		compat_uptr_t user_iocb;

		if (unlikely(get_user(user_iocb, iocbpp + i))) {
			ret = -EFAULT;
			break;
		}

		ret = io_submit_one(ctx, compat_ptr(user_iocb), true);
		if (ret)
			break;
	}
	blk_finish_plug(&plug);

	percpu_ref_put(&ctx->users);
	return i ? i : ret;
}
#endif

/* lookup_kiocb
 *	Finds a given iocb for cancellation.
 */
static struct aio_kiocb *
lookup_kiocb(struct kioctx *ctx, struct iocb __user *iocb)
{
	struct aio_kiocb *kiocb;

	assert_spin_locked(&ctx->ctx_lock);

	/* TODO: use a hash or array, this sucks. */
	list_for_each_entry(kiocb, &ctx->active_reqs, ki_list) {
		if (kiocb->ki_user_iocb == iocb)
			return kiocb;
	}
	return NULL;
}

/* sys_io_cancel:
 *	Attempts to cancel an iocb previously passed to io_submit.  If
 *	the operation is successfully cancelled, the resulting event is
 *	copied into the memory pointed to by result without being placed
 *	into the completion queue and 0 is returned.  May fail with
 *	-EFAULT if any of the data structures pointed to are invalid.
 *	May fail with -EINVAL if aio_context specified by ctx_id is
 *	invalid.  May fail with -EAGAIN if the iocb specified was not
 *	cancelled.  Will fail with -ENOSYS if not implemented.
 */
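/*
 * Illustrative userspace sketch (not kernel code; ctx and cb as set up for
 * io_submit()): the iocb pointer passed here must be the very pointer
 * previously handed to io_submit(), since lookup_kiocb() matches requests
 * by that user-space address.
 *
 *	struct io_event ev;
 *	syscall(__NR_io_cancel, ctx, &cb, &ev);
 */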
SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb,
		struct io_event __user *, result)
{
	struct kioctx *ctx;
	struct aio_kiocb *kiocb;
	int ret = -EINVAL;
	u32 key;

	if (unlikely(get_user(key, &iocb->aio_key)))
		return -EFAULT;
	if (unlikely(key != KIOCB_KEY))
		return -EINVAL;

	ctx = lookup_ioctx(ctx_id);
	if (unlikely(!ctx))
		return -EINVAL;

	spin_lock_irq(&ctx->ctx_lock);
	kiocb = lookup_kiocb(ctx, iocb);
	if (kiocb) {
		ret = kiocb->ki_cancel(&kiocb->rw);
		list_del_init(&kiocb->ki_list);
	}
	spin_unlock_irq(&ctx->ctx_lock);

	if (!ret) {
		/*
		 * The result argument is no longer used - the io_event is
		 * always delivered via the ring buffer. -EINPROGRESS indicates
		 * that cancellation is in progress:
		 */
		ret = -EINPROGRESS;
	}

	percpu_ref_put(&ctx->users);

	return ret;
}

static long do_io_getevents(aio_context_t ctx_id,
		long min_nr,
		long nr,
		struct io_event __user *events,
		struct timespec64 *ts)
{
	ktime_t until = ts ? timespec64_to_ktime(*ts) : KTIME_MAX;
	struct kioctx *ioctx = lookup_ioctx(ctx_id);
	long ret = -EINVAL;

	if (likely(ioctx)) {
		if (likely(min_nr <= nr && min_nr >= 0))
			ret = read_events(ioctx, min_nr, nr, events, until);
		percpu_ref_put(&ioctx->users);
	}

	return ret;
}

/* io_getevents:
 *	Attempts to read at least min_nr events and up to nr events from
 *	the completion queue for the aio_context specified by ctx_id. If
 *	it succeeds, the number of read events is returned. May fail with
 *	-EINVAL if ctx_id is invalid, if min_nr is out of range, if nr is
 *	out of range, or if timeout is out of range.  May fail with -EFAULT
 *	if any of the memory specified is invalid.  May return 0 or
 *	< min_nr if the timeout specified by timeout has elapsed
 *	before sufficient events are available, where timeout == NULL
 *	specifies an infinite timeout. Note that the timeout pointed to by
 *	timeout is relative.  Will fail with -ENOSYS if not implemented.
 */
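/*
 * Illustrative userspace sketch (not kernel code; raw syscall numbers and
 * an earlier io_setup() are assumed):
 *
 *	struct io_event events[8];
 *	struct timespec ts = { 1, 0 };		// relative, one second
 *	int n = syscall(__NR_io_getevents, ctx, 1, 8, events, &ts);
 *	// n >= 0 is the number of completions reaped; each io_event
 *	// carries the submitter's aio_data cookie and the result code.
 */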
SYSCALL_DEFINE5(io_getevents, aio_context_t, ctx_id,
		long, min_nr,
		long, nr,
		struct io_event __user *, events,
		struct timespec __user *, timeout)
{
	struct timespec64	ts;
	int			ret;

	if (timeout && unlikely(get_timespec64(&ts, timeout)))
		return -EFAULT;

	ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &ts : NULL);
	if (!ret && signal_pending(current))
		ret = -EINTR;
	return ret;
}

SYSCALL_DEFINE6(io_pgetevents,
		aio_context_t, ctx_id,
		long, min_nr,
		long, nr,
		struct io_event __user *, events,
		struct timespec __user *, timeout,
		const struct __aio_sigset __user *, usig)
{
	struct __aio_sigset	ksig = { NULL, };
	sigset_t		ksigmask, sigsaved;
	struct timespec64	ts;
	int ret;

	if (timeout && unlikely(get_timespec64(&ts, timeout)))
		return -EFAULT;

	if (usig && copy_from_user(&ksig, usig, sizeof(ksig)))
		return -EFAULT;

	if (ksig.sigmask) {
		if (ksig.sigsetsize != sizeof(sigset_t))
			return -EINVAL;
		if (copy_from_user(&ksigmask, ksig.sigmask, sizeof(ksigmask)))
			return -EFAULT;
		sigdelsetmask(&ksigmask, sigmask(SIGKILL) | sigmask(SIGSTOP));
		sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
	}

	ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &ts : NULL);
	if (signal_pending(current)) {
		if (ksig.sigmask) {
			current->saved_sigmask = sigsaved;
			set_restore_sigmask();
		}

		if (!ret)
			ret = -ERESTARTNOHAND;
	} else {
		if (ksig.sigmask)
			sigprocmask(SIG_SETMASK, &sigsaved, NULL);
	}

	return ret;
}
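
/*
 * Illustrative userspace sketch (not kernel code; mask, ts, events and ctx
 * as set up earlier): like pselect(2), the caller-supplied signal mask is
 * installed only for the duration of the wait.  Note that sigsetsize must
 * be the kernel's sigset size (_NSIG / 8, i.e. 8 bytes on x86-64), not
 * glibc's sizeof(sigset_t).
 *
 *	struct __aio_sigset ss = { &mask, 8 };
 *	syscall(__NR_io_pgetevents, ctx, 1, 8, events, &ts, &ss);
 */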

#ifdef CONFIG_COMPAT
COMPAT_SYSCALL_DEFINE5(io_getevents, compat_aio_context_t, ctx_id,
		       compat_long_t, min_nr,
		       compat_long_t, nr,
		       struct io_event __user *, events,
		       struct compat_timespec __user *, timeout)
{
	struct timespec64 t;
	int ret;

	if (timeout && compat_get_timespec64(&t, timeout))
		return -EFAULT;

	ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &t : NULL);
	if (!ret && signal_pending(current))
		ret = -EINTR;
	return ret;
}

struct __compat_aio_sigset {
	compat_sigset_t __user	*sigmask;
	compat_size_t		sigsetsize;
};

COMPAT_SYSCALL_DEFINE6(io_pgetevents,
		compat_aio_context_t, ctx_id,
		compat_long_t, min_nr,
		compat_long_t, nr,
		struct io_event __user *, events,
		struct compat_timespec __user *, timeout,
		const struct __compat_aio_sigset __user *, usig)
{
	struct __compat_aio_sigset ksig = { NULL, };
	sigset_t ksigmask, sigsaved;
	struct timespec64 t;
	int ret;

	if (timeout && compat_get_timespec64(&t, timeout))
		return -EFAULT;

	if (usig && copy_from_user(&ksig, usig, sizeof(ksig)))
		return -EFAULT;

	if (ksig.sigmask) {
		if (ksig.sigsetsize != sizeof(compat_sigset_t))
			return -EINVAL;
		if (get_compat_sigset(&ksigmask, ksig.sigmask))
			return -EFAULT;
		sigdelsetmask(&ksigmask, sigmask(SIGKILL) | sigmask(SIGSTOP));
		sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
	}

	ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &t : NULL);
	if (signal_pending(current)) {
		if (ksig.sigmask) {
			current->saved_sigmask = sigsaved;
			set_restore_sigmask();
		}
		if (!ret)
			ret = -ERESTARTNOHAND;
	} else {
		if (ksig.sigmask)
			sigprocmask(SIG_SETMASK, &sigsaved, NULL);
	}

	return ret;
}
#endif