diff --git a/fs/io_uring.c b/fs/io_uring.c index 6aaff7bfe8b5c242c014c431b4fb96c1373ca177..4325068324b7d0cef66afdf9593623ed76257f35 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -179,6 +179,21 @@ struct fixed_file_table { struct file **files; }; +enum { + FFD_F_ATOMIC, +}; + +struct fixed_file_data { + struct fixed_file_table *table; + struct io_ring_ctx *ctx; + + struct percpu_ref refs; + struct llist_head put_llist; + unsigned long state; + struct work_struct ref_work; + struct completion done; +}; + struct io_ring_ctx { struct { struct percpu_ref refs; @@ -231,7 +246,7 @@ struct io_ring_ctx { * readers must ensure that ->refs is alive as long as the file* is * used. Only updated through io_uring_register(2). */ - struct fixed_file_table *file_table; + struct fixed_file_data *file_data; unsigned nr_user_files; /* if used, fixed mapped user buffers */ @@ -370,6 +385,13 @@ struct io_open { int flags; }; +struct io_files_update { + struct file *file; + u64 arg; + u32 nr_args; + u32 offset; +}; + struct io_async_connect { struct sockaddr_storage address; }; @@ -421,6 +443,7 @@ struct io_kiocb { struct io_sr_msg sr_msg; struct io_open open; struct io_close close; + struct io_files_update files_update; }; struct io_async_ctx *io; @@ -496,6 +519,9 @@ static void io_double_put_req(struct io_kiocb *req); static void __io_double_put_req(struct io_kiocb *req); static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req); static void io_queue_linked_timeout(struct io_kiocb *req); +static int __io_sqe_files_update(struct io_ring_ctx *ctx, + struct io_uring_files_update *ip, + unsigned nr_args); static struct kmem_cache *req_cachep; @@ -945,6 +971,7 @@ static void io_free_req_many(struct io_ring_ctx *ctx, void **reqs, int *nr) if (*nr) { kmem_cache_free_bulk(req_cachep, *nr, reqs); percpu_ref_put_many(&ctx->refs, *nr); + percpu_ref_put_many(&ctx->file_data->refs, *nr); *nr = 0; } } @@ -955,8 +982,12 @@ static void __io_free_req(struct io_kiocb *req) if (req->io) kfree(req->io); - if (req->file && !(req->flags & REQ_F_FIXED_FILE)) - fput(req->file); + if (req->file) { + if (req->flags & REQ_F_FIXED_FILE) + percpu_ref_put(&ctx->file_data->refs); + else + fput(req->file); + } if (req->flags & REQ_F_INFLIGHT) { unsigned long flags; @@ -3293,6 +3324,45 @@ static int io_async_cancel(struct io_kiocb *req, struct io_kiocb **nxt) return 0; } +static int io_files_update_prep(struct io_kiocb *req, + const struct io_uring_sqe *sqe) +{ + if (sqe->flags || sqe->ioprio || sqe->rw_flags) + return -EINVAL; + + req->files_update.offset = READ_ONCE(sqe->off); + req->files_update.nr_args = READ_ONCE(sqe->len); + if (!req->files_update.nr_args) + return -EINVAL; + req->files_update.arg = READ_ONCE(sqe->addr); + return 0; +} + +static int io_files_update(struct io_kiocb *req, bool force_nonblock) +{ + struct io_ring_ctx *ctx = req->ctx; + struct io_uring_files_update up; + int ret; + + if (force_nonblock) { + req->work.flags |= IO_WQ_WORK_NEEDS_FILES; + return -EAGAIN; + } + + up.offset = req->files_update.offset; + up.fds = req->files_update.arg; + + mutex_lock(&ctx->uring_lock); + ret = __io_sqe_files_update(ctx, &up, req->files_update.nr_args); + mutex_unlock(&ctx->uring_lock); + + if (ret < 0) + req_set_fail_links(req); + io_cqring_add_event(req, ret); + io_put_req(req); + return 0; +} + static int io_req_defer_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { @@ -3354,6 +3424,9 @@ static int io_req_defer_prep(struct io_kiocb *req, case IORING_OP_CLOSE: ret = io_close_prep(req, sqe); break; + case IORING_OP_FILES_UPDATE: + ret = io_files_update_prep(req, sqe); + break; default: printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n", req->opcode); @@ -3532,6 +3605,14 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, } ret = io_close(req, nxt, force_nonblock); break; + case IORING_OP_FILES_UPDATE: + if (sqe) { + ret = io_files_update_prep(req, sqe); + if (ret) + break; + } + ret = io_files_update(req, force_nonblock); + break; default: ret = -EINVAL; break; @@ -3631,8 +3712,8 @@ static inline struct file *io_file_from_index(struct io_ring_ctx *ctx, { struct fixed_file_table *table; - table = &ctx->file_table[index >> IORING_FILE_TABLE_SHIFT]; - return table->files[index & IORING_FILE_TABLE_MASK]; + table = &ctx->file_data->table[index >> IORING_FILE_TABLE_SHIFT]; + return table->files[index & IORING_FILE_TABLE_MASK];; } static int io_req_set_file(struct io_submit_state *state, struct io_kiocb *req, @@ -3653,7 +3734,7 @@ static int io_req_set_file(struct io_submit_state *state, struct io_kiocb *req, return ret; if (flags & IOSQE_FIXED_FILE) { - if (unlikely(!ctx->file_table || + if (unlikely(!ctx->file_data || (unsigned) fd >= ctx->nr_user_files)) return -EBADF; fd = array_index_nospec(fd, ctx->nr_user_files); @@ -3661,6 +3742,7 @@ static int io_req_set_file(struct io_submit_state *state, struct io_kiocb *req, if (!req->file) return -EBADF; req->flags |= REQ_F_FIXED_FILE; + percpu_ref_get(&ctx->file_data->refs); } else { if (req->needs_fixed_file) return -EBADF; @@ -4338,19 +4420,37 @@ static void __io_sqe_files_unregister(struct io_ring_ctx *ctx) #endif } +static void io_file_ref_kill(struct percpu_ref *ref) +{ + struct fixed_file_data *data; + + data = container_of(ref, struct fixed_file_data, refs); + complete(&data->done); +} + static int io_sqe_files_unregister(struct io_ring_ctx *ctx) { + struct fixed_file_data *data = ctx->file_data; unsigned nr_tables, i; - if (!ctx->file_table) + if (!data) return -ENXIO; + /* protect against inflight atomic switch, which drops the ref */ + flush_work(&data->ref_work); + percpu_ref_get(&data->refs); + percpu_ref_kill_and_confirm(&data->refs, io_file_ref_kill); + wait_for_completion(&data->done); + percpu_ref_put(&data->refs); + percpu_ref_exit(&data->refs); + __io_sqe_files_unregister(ctx); nr_tables = DIV_ROUND_UP(ctx->nr_user_files, IORING_MAX_FILES_TABLE); for (i = 0; i < nr_tables; i++) - kfree(ctx->file_table[i].files); - kfree(ctx->file_table); - ctx->file_table = NULL; + kfree(data->table[i].files); + kfree(data->table); + kfree(data); + ctx->file_data = NULL; ctx->nr_user_files = 0; return 0; } @@ -4381,16 +4481,6 @@ static void io_finish_async(struct io_ring_ctx *ctx) } #if defined(CONFIG_UNIX) -static void io_destruct_skb(struct sk_buff *skb) -{ - struct io_ring_ctx *ctx = skb->sk->sk_user_data; - - if (ctx->io_wq) - io_wq_flush(ctx->io_wq); - - unix_destruct_scm(skb); -} - /* * Ensure the UNIX gc is aware of our file set, so we are certain that * the io_uring can be safely unregistered on process exit, even if we have @@ -4438,7 +4528,7 @@ static int __io_sqe_files_scm(struct io_ring_ctx *ctx, int nr, int offset) fpl->max = SCM_MAX_FD; fpl->count = nr_files; UNIXCB(skb).fp = fpl; - skb->destructor = io_destruct_skb; + skb->destructor = unix_destruct_scm; refcount_add(skb->truesize, &sk->sk_wmem_alloc); skb_queue_head(&sk->sk_receive_queue, skb); @@ -4500,7 +4590,7 @@ static int io_sqe_alloc_file_tables(struct io_ring_ctx *ctx, unsigned nr_tables, int i; for (i = 0; i < nr_tables; i++) { - struct fixed_file_table *table = &ctx->file_table[i]; + struct fixed_file_table *table = &ctx->file_data->table[i]; unsigned this_files; this_files = min(nr_files, IORING_MAX_FILES_TABLE); @@ -4515,36 +4605,159 @@ static int io_sqe_alloc_file_tables(struct io_ring_ctx *ctx, unsigned nr_tables, return 0; for (i = 0; i < nr_tables; i++) { - struct fixed_file_table *table = &ctx->file_table[i]; + struct fixed_file_table *table = &ctx->file_data->table[i]; kfree(table->files); } return 1; } +static void io_ring_file_put(struct io_ring_ctx *ctx, struct file *file) +{ +#if defined(CONFIG_UNIX) + struct sock *sock = ctx->ring_sock->sk; + struct sk_buff_head list, *head = &sock->sk_receive_queue; + struct sk_buff *skb; + int i; + + __skb_queue_head_init(&list); + + /* + * Find the skb that holds this file in its SCM_RIGHTS. When found, + * remove this entry and rearrange the file array. + */ + skb = skb_dequeue(head); + while (skb) { + struct scm_fp_list *fp; + + fp = UNIXCB(skb).fp; + for (i = 0; i < fp->count; i++) { + int left; + + if (fp->fp[i] != file) + continue; + + unix_notinflight(fp->user, fp->fp[i]); + left = fp->count - 1 - i; + if (left) { + memmove(&fp->fp[i], &fp->fp[i + 1], + left * sizeof(struct file *)); + } + fp->count--; + if (!fp->count) { + kfree_skb(skb); + skb = NULL; + } else { + __skb_queue_tail(&list, skb); + } + fput(file); + file = NULL; + break; + } + + if (!file) + break; + + __skb_queue_tail(&list, skb); + + skb = skb_dequeue(head); + } + + if (skb_peek(&list)) { + spin_lock_irq(&head->lock); + while ((skb = __skb_dequeue(&list)) != NULL) + __skb_queue_tail(head, skb); + spin_unlock_irq(&head->lock); + } +#else + fput(file); +#endif +} + +struct io_file_put { + struct llist_node llist; + struct file *file; + struct completion *done; +}; + +static void io_ring_file_ref_switch(struct work_struct *work) +{ + struct io_file_put *pfile, *tmp; + struct fixed_file_data *data; + struct llist_node *node; + + data = container_of(work, struct fixed_file_data, ref_work); + + while ((node = llist_del_all(&data->put_llist)) != NULL) { + llist_for_each_entry_safe(pfile, tmp, node, llist) { + io_ring_file_put(data->ctx, pfile->file); + if (pfile->done) + complete(pfile->done); + else + kfree(pfile); + } + } + + percpu_ref_get(&data->refs); + percpu_ref_switch_to_percpu(&data->refs); +} + +static void io_file_data_ref_zero(struct percpu_ref *ref) +{ + struct fixed_file_data *data; + + data = container_of(ref, struct fixed_file_data, refs); + + /* we can't safely switch from inside this context, punt to wq */ + queue_work(system_wq, &data->ref_work); +} + static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args) { __s32 __user *fds = (__s32 __user *) arg; unsigned nr_tables; + struct file *file; int fd, ret = 0; unsigned i; - if (ctx->file_table) + if (ctx->file_data) return -EBUSY; if (!nr_args) return -EINVAL; if (nr_args > IORING_MAX_FIXED_FILES) return -EMFILE; + ctx->file_data = kzalloc(sizeof(*ctx->file_data), GFP_KERNEL); + if (!ctx->file_data) + return -ENOMEM; + ctx->file_data->ctx = ctx; + init_completion(&ctx->file_data->done); + nr_tables = DIV_ROUND_UP(nr_args, IORING_MAX_FILES_TABLE); - ctx->file_table = kcalloc(nr_tables, sizeof(struct fixed_file_table), + ctx->file_data->table = kcalloc(nr_tables, + sizeof(struct fixed_file_table), GFP_KERNEL); - if (!ctx->file_table) + if (!ctx->file_data->table) { + kfree(ctx->file_data); + ctx->file_data = NULL; return -ENOMEM; + } + + if (percpu_ref_init(&ctx->file_data->refs, io_file_data_ref_zero, + PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) { + kfree(ctx->file_data->table); + kfree(ctx->file_data); + ctx->file_data = NULL; + return -ENOMEM; + } + ctx->file_data->put_llist.first = NULL; + INIT_WORK(&ctx->file_data->ref_work, io_ring_file_ref_switch); if (io_sqe_alloc_file_tables(ctx, nr_tables, nr_args)) { - kfree(ctx->file_table); - ctx->file_table = NULL; + percpu_ref_exit(&ctx->file_data->refs); + kfree(ctx->file_data->table); + kfree(ctx->file_data); + ctx->file_data = NULL; return -ENOMEM; } @@ -4561,13 +4774,14 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, continue; } - table = &ctx->file_table[i >> IORING_FILE_TABLE_SHIFT]; + table = &ctx->file_data->table[i >> IORING_FILE_TABLE_SHIFT]; index = i & IORING_FILE_TABLE_MASK; - table->files[index] = fget(fd); + file = fget(fd); ret = -EBADF; - if (!table->files[index]) + if (!file) break; + /* * Don't allow io_uring instances to be registered. If UNIX * isn't enabled, then this causes a reference cycle and this @@ -4575,26 +4789,26 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, * handle it just fine, but there's still no point in allowing * a ring fd as it doesn't support regular read/write anyway. */ - if (table->files[index]->f_op == &io_uring_fops) { - fput(table->files[index]); + if (file->f_op == &io_uring_fops) { + fput(file); break; } ret = 0; + table->files[index] = file; } if (ret) { for (i = 0; i < ctx->nr_user_files; i++) { - struct file *file; - file = io_file_from_index(ctx, i); if (file) fput(file); } for (i = 0; i < nr_tables; i++) - kfree(ctx->file_table[i].files); + kfree(ctx->file_data->table[i].files); - kfree(ctx->file_table); - ctx->file_table = NULL; + kfree(ctx->file_data->table); + kfree(ctx->file_data); + ctx->file_data = NULL; ctx->nr_user_files = 0; return ret; } @@ -4606,69 +4820,6 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, return ret; } -static void io_sqe_file_unregister(struct io_ring_ctx *ctx, int index) -{ -#if defined(CONFIG_UNIX) - struct file *file = io_file_from_index(ctx, index); - struct sock *sock = ctx->ring_sock->sk; - struct sk_buff_head list, *head = &sock->sk_receive_queue; - struct sk_buff *skb; - int i; - - __skb_queue_head_init(&list); - - /* - * Find the skb that holds this file in its SCM_RIGHTS. When found, - * remove this entry and rearrange the file array. - */ - skb = skb_dequeue(head); - while (skb) { - struct scm_fp_list *fp; - - fp = UNIXCB(skb).fp; - for (i = 0; i < fp->count; i++) { - int left; - - if (fp->fp[i] != file) - continue; - - unix_notinflight(fp->user, fp->fp[i]); - left = fp->count - 1 - i; - if (left) { - memmove(&fp->fp[i], &fp->fp[i + 1], - left * sizeof(struct file *)); - } - fp->count--; - if (!fp->count) { - kfree_skb(skb); - skb = NULL; - } else { - __skb_queue_tail(&list, skb); - } - fput(file); - file = NULL; - break; - } - - if (!file) - break; - - __skb_queue_tail(&list, skb); - - skb = skb_dequeue(head); - } - - if (skb_peek(&list)) { - spin_lock_irq(&head->lock); - while ((skb = __skb_dequeue(&list)) != NULL) - __skb_queue_tail(head, skb); - spin_unlock_irq(&head->lock); - } -#else - fput(io_file_from_index(ctx, index)); -#endif -} - static int io_sqe_file_register(struct io_ring_ctx *ctx, struct file *file, int index) { @@ -4712,29 +4863,65 @@ static int io_sqe_file_register(struct io_ring_ctx *ctx, struct file *file, #endif } -static int io_sqe_files_update(struct io_ring_ctx *ctx, void __user *arg, - unsigned nr_args) +static void io_atomic_switch(struct percpu_ref *ref) { - struct io_uring_files_update up; + struct fixed_file_data *data; + + data = container_of(ref, struct fixed_file_data, refs); + clear_bit(FFD_F_ATOMIC, &data->state); +} + +static bool io_queue_file_removal(struct fixed_file_data *data, + struct file *file) +{ + struct io_file_put *pfile, pfile_stack; + DECLARE_COMPLETION_ONSTACK(done); + + /* + * If we fail allocating the struct we need for doing async reomval + * of this file, just punt to sync and wait for it. + */ + pfile = kzalloc(sizeof(*pfile), GFP_KERNEL); + if (!pfile) { + pfile = &pfile_stack; + pfile->done = &done; + } + + pfile->file = file; + llist_add(&pfile->llist, &data->put_llist); + + if (pfile == &pfile_stack) { + if (!test_and_set_bit(FFD_F_ATOMIC, &data->state)) { + percpu_ref_put(&data->refs); + percpu_ref_switch_to_atomic(&data->refs, + io_atomic_switch); + } + wait_for_completion(&done); + flush_work(&data->ref_work); + return false; + } + + return true; +} + +static int __io_sqe_files_update(struct io_ring_ctx *ctx, + struct io_uring_files_update *up, + unsigned nr_args) +{ + struct fixed_file_data *data = ctx->file_data; + bool ref_switch = false; + struct file *file; __s32 __user *fds; int fd, i, err; __u32 done; - if (!ctx->file_table) - return -ENXIO; - if (!nr_args) - return -EINVAL; - if (copy_from_user(&up, arg, sizeof(up))) - return -EFAULT; - if (up.resv) - return -EINVAL; - if (check_add_overflow(up.offset, nr_args, &done)) + if (check_add_overflow(up->offset, nr_args, &done)) return -EOVERFLOW; if (done > ctx->nr_user_files) return -EINVAL; done = 0; - fds = u64_to_user_ptr(up.fds); + fds = u64_to_user_ptr(up->fds); while (nr_args) { struct fixed_file_table *table; unsigned index; @@ -4744,16 +4931,16 @@ static int io_sqe_files_update(struct io_ring_ctx *ctx, void __user *arg, err = -EFAULT; break; } - i = array_index_nospec(up.offset, ctx->nr_user_files); - table = &ctx->file_table[i >> IORING_FILE_TABLE_SHIFT]; + i = array_index_nospec(up->offset, ctx->nr_user_files); + table = &ctx->file_data->table[i >> IORING_FILE_TABLE_SHIFT]; index = i & IORING_FILE_TABLE_MASK; if (table->files[index]) { - io_sqe_file_unregister(ctx, i); + file = io_file_from_index(ctx, index); table->files[index] = NULL; + if (io_queue_file_removal(data, file)) + ref_switch = true; } if (fd != -1) { - struct file *file; - file = fget(fd); if (!file) { err = -EBADF; @@ -4779,11 +4966,32 @@ static int io_sqe_files_update(struct io_ring_ctx *ctx, void __user *arg, } nr_args--; done++; - up.offset++; + up->offset++; + } + + if (ref_switch && !test_and_set_bit(FFD_F_ATOMIC, &data->state)) { + percpu_ref_put(&data->refs); + percpu_ref_switch_to_atomic(&data->refs, io_atomic_switch); } return done ? done : err; } +static int io_sqe_files_update(struct io_ring_ctx *ctx, void __user *arg, + unsigned nr_args) +{ + struct io_uring_files_update up; + + if (!ctx->file_data) + return -ENXIO; + if (!nr_args) + return -EINVAL; + if (copy_from_user(&up, arg, sizeof(up))) + return -EFAULT; + if (up.resv) + return -EINVAL; + + return __io_sqe_files_update(ctx, &up, nr_args); +} static void io_put_work(struct io_wq_work *work) { @@ -5546,7 +5754,6 @@ static int io_uring_get_fd(struct io_ring_ctx *ctx) #if defined(CONFIG_UNIX) ctx->ring_sock->file = file; - ctx->ring_sock->sk->sk_user_data = ctx; #endif fd_install(ret, file); return ret; @@ -5710,18 +5917,22 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, if (percpu_ref_is_dying(&ctx->refs)) return -ENXIO; - percpu_ref_kill(&ctx->refs); + if (opcode != IORING_UNREGISTER_FILES && + opcode != IORING_REGISTER_FILES_UPDATE) { + percpu_ref_kill(&ctx->refs); - /* - * Drop uring mutex before waiting for references to exit. If another - * thread is currently inside io_uring_enter() it might need to grab - * the uring_lock to make progress. If we hold it here across the drain - * wait, then we can deadlock. It's safe to drop the mutex here, since - * no new references will come in after we've killed the percpu ref. - */ - mutex_unlock(&ctx->uring_lock); - wait_for_completion(&ctx->completions[0]); - mutex_lock(&ctx->uring_lock); + /* + * Drop uring mutex before waiting for references to exit. If + * another thread is currently inside io_uring_enter() it might + * need to grab the uring_lock to make progress. If we hold it + * here across the drain wait, then we can deadlock. It's safe + * to drop the mutex here, since no new references will come in + * after we've killed the percpu ref. + */ + mutex_unlock(&ctx->uring_lock); + wait_for_completion(&ctx->completions[0]); + mutex_lock(&ctx->uring_lock); + } switch (opcode) { case IORING_REGISTER_BUFFERS: @@ -5762,9 +5973,13 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, break; } - /* bring the ctx back to life */ - reinit_completion(&ctx->completions[0]); - percpu_ref_reinit(&ctx->refs); + + if (opcode != IORING_UNREGISTER_FILES && + opcode != IORING_REGISTER_FILES_UPDATE) { + /* bring the ctx back to life */ + reinit_completion(&ctx->completions[0]); + percpu_ref_reinit(&ctx->refs); + } return ret; } diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 084dea85b83804c1f24f8d791799e81f23909abe..ca436b9d4921220cd140bf2bebfa55a044fa9c02 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -80,6 +80,7 @@ enum { IORING_OP_FALLOCATE, IORING_OP_OPENAT, IORING_OP_CLOSE, + IORING_OP_FILES_UPDATE, /* this goes last, obviously */ IORING_OP_LAST,