提交 e59cd880 编写于 作者: L Linus Torvalds

Merge tag 'for-5.7/io_uring-2020-03-29' of git://git.kernel.dk/linux-block

Pull io_uring updates from Jens Axboe:
 "Here are the io_uring changes for this merge window. Light on new
  features this time around (just splice + buffer selection), lots of
  cleanups, fixes, and improvements to existing support. In particular,
  this contains:

   - Cleanup fixed file update handling for stack fallback (Hillf)

   - Re-work of how pollable async IO is handled, we no longer require
     thread offload to handle that. Instead we rely using poll to drive
     this, with task_work execution.

   - In conjunction with the above, allow expendable buffer selection,
     so that poll+recv (for example) no longer has to be a split
     operation.

   - Make sure we honor RLIMIT_FSIZE for buffered writes

   - Add support for splice (Pavel)

   - Linked work inheritance fixes and optimizations (Pavel)

   - Async work fixes and cleanups (Pavel)

   - Improve io-wq locking (Pavel)

   - Hashed link write improvements (Pavel)

   - SETUP_IOPOLL|SETUP_SQPOLL improvements (Xiaoguang)"

* tag 'for-5.7/io_uring-2020-03-29' of git://git.kernel.dk/linux-block: (54 commits)
  io_uring: cleanup io_alloc_async_ctx()
  io_uring: fix missing 'return' in comment
  io-wq: handle hashed writes in chains
  io-uring: drop 'free_pfile' in struct io_file_put
  io-uring: drop completion when removing file
  io_uring: Fix ->data corruption on re-enqueue
  io-wq: close cancel gap for hashed linked work
  io_uring: make spdxcheck.py happy
  io_uring: honor original task RLIMIT_FSIZE
  io-wq: hash dependent work
  io-wq: split hashing and enqueueing
  io-wq: don't resched if there is no work
  io-wq: remove duplicated cancel code
  io_uring: fix truncated async read/readv and write/writev retry
  io_uring: dual license io_uring.h uapi header
  io_uring: io_uring_enter(2) don't poll while SETUP_IOPOLL|SETUP_SQPOLL enabled
  io_uring: Fix unused function warnings
  io_uring: add end-of-bits marker and build time verify it
  io_uring: provide means of removing buffers
  io_uring: add IOSQE_BUFFER_SELECT support for IORING_OP_RECVMSG
  ...
...@@ -69,6 +69,8 @@ struct io_worker { ...@@ -69,6 +69,8 @@ struct io_worker {
#define IO_WQ_HASH_ORDER 5 #define IO_WQ_HASH_ORDER 5
#endif #endif
#define IO_WQ_NR_HASH_BUCKETS (1u << IO_WQ_HASH_ORDER)
struct io_wqe_acct { struct io_wqe_acct {
unsigned nr_workers; unsigned nr_workers;
unsigned max_workers; unsigned max_workers;
...@@ -98,6 +100,7 @@ struct io_wqe { ...@@ -98,6 +100,7 @@ struct io_wqe {
struct list_head all_list; struct list_head all_list;
struct io_wq *wq; struct io_wq *wq;
struct io_wq_work *hash_tail[IO_WQ_NR_HASH_BUCKETS];
}; };
/* /*
...@@ -107,8 +110,7 @@ struct io_wq { ...@@ -107,8 +110,7 @@ struct io_wq {
struct io_wqe **wqes; struct io_wqe **wqes;
unsigned long state; unsigned long state;
get_work_fn *get_work; free_work_fn *free_work;
put_work_fn *put_work;
struct task_struct *manager; struct task_struct *manager;
struct user_struct *user; struct user_struct *user;
...@@ -376,26 +378,35 @@ static bool __io_worker_idle(struct io_wqe *wqe, struct io_worker *worker) ...@@ -376,26 +378,35 @@ static bool __io_worker_idle(struct io_wqe *wqe, struct io_worker *worker)
return __io_worker_unuse(wqe, worker); return __io_worker_unuse(wqe, worker);
} }
static struct io_wq_work *io_get_next_work(struct io_wqe *wqe, unsigned *hash) static inline unsigned int io_get_work_hash(struct io_wq_work *work)
{
return work->flags >> IO_WQ_HASH_SHIFT;
}
static struct io_wq_work *io_get_next_work(struct io_wqe *wqe)
__must_hold(wqe->lock) __must_hold(wqe->lock)
{ {
struct io_wq_work_node *node, *prev; struct io_wq_work_node *node, *prev;
struct io_wq_work *work; struct io_wq_work *work, *tail;
unsigned int hash;
wq_list_for_each(node, prev, &wqe->work_list) { wq_list_for_each(node, prev, &wqe->work_list) {
work = container_of(node, struct io_wq_work, list); work = container_of(node, struct io_wq_work, list);
/* not hashed, can run anytime */ /* not hashed, can run anytime */
if (!(work->flags & IO_WQ_WORK_HASHED)) { if (!io_wq_is_hashed(work)) {
wq_node_del(&wqe->work_list, node, prev); wq_list_del(&wqe->work_list, node, prev);
return work; return work;
} }
/* hashed, can run if not already running */ /* hashed, can run if not already running */
*hash = work->flags >> IO_WQ_HASH_SHIFT; hash = io_get_work_hash(work);
if (!(wqe->hash_map & BIT_ULL(*hash))) { if (!(wqe->hash_map & BIT(hash))) {
wqe->hash_map |= BIT_ULL(*hash); wqe->hash_map |= BIT(hash);
wq_node_del(&wqe->work_list, node, prev); /* all items with this hash lie in [work, tail] */
tail = wqe->hash_tail[hash];
wqe->hash_tail[hash] = NULL;
wq_list_cut(&wqe->work_list, &tail->list, prev);
return work; return work;
} }
} }
...@@ -440,16 +451,49 @@ static void io_wq_switch_creds(struct io_worker *worker, ...@@ -440,16 +451,49 @@ static void io_wq_switch_creds(struct io_worker *worker,
worker->saved_creds = old_creds; worker->saved_creds = old_creds;
} }
static void io_impersonate_work(struct io_worker *worker,
struct io_wq_work *work)
{
if (work->files && current->files != work->files) {
task_lock(current);
current->files = work->files;
task_unlock(current);
}
if (work->fs && current->fs != work->fs)
current->fs = work->fs;
if (work->mm != worker->mm)
io_wq_switch_mm(worker, work);
if (worker->cur_creds != work->creds)
io_wq_switch_creds(worker, work);
}
static void io_assign_current_work(struct io_worker *worker,
struct io_wq_work *work)
{
if (work) {
/* flush pending signals before assigning new work */
if (signal_pending(current))
flush_signals(current);
cond_resched();
}
spin_lock_irq(&worker->lock);
worker->cur_work = work;
spin_unlock_irq(&worker->lock);
}
static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work);
static void io_worker_handle_work(struct io_worker *worker) static void io_worker_handle_work(struct io_worker *worker)
__releases(wqe->lock) __releases(wqe->lock)
{ {
struct io_wq_work *work, *old_work = NULL, *put_work = NULL;
struct io_wqe *wqe = worker->wqe; struct io_wqe *wqe = worker->wqe;
struct io_wq *wq = wqe->wq; struct io_wq *wq = wqe->wq;
do { do {
unsigned hash = -1U; struct io_wq_work *work;
unsigned int hash;
get_next:
/* /*
* If we got some work, mark us as busy. If we didn't, but * If we got some work, mark us as busy. If we didn't, but
* the list isn't empty, it means we stalled on hashed work. * the list isn't empty, it means we stalled on hashed work.
...@@ -457,81 +501,60 @@ static void io_worker_handle_work(struct io_worker *worker) ...@@ -457,81 +501,60 @@ static void io_worker_handle_work(struct io_worker *worker)
* can't make progress, any work completion or insertion will * can't make progress, any work completion or insertion will
* clear the stalled flag. * clear the stalled flag.
*/ */
work = io_get_next_work(wqe, &hash); work = io_get_next_work(wqe);
if (work) if (work)
__io_worker_busy(wqe, worker, work); __io_worker_busy(wqe, worker, work);
else if (!wq_list_empty(&wqe->work_list)) else if (!wq_list_empty(&wqe->work_list))
wqe->flags |= IO_WQE_FLAG_STALLED; wqe->flags |= IO_WQE_FLAG_STALLED;
spin_unlock_irq(&wqe->lock); spin_unlock_irq(&wqe->lock);
if (put_work && wq->put_work)
wq->put_work(old_work);
if (!work) if (!work)
break; break;
next: io_assign_current_work(worker, work);
/* flush any pending signals before assigning new work */
if (signal_pending(current)) /* handle a whole dependent link */
flush_signals(current); do {
struct io_wq_work *old_work, *next_hashed, *linked;
cond_resched();
next_hashed = wq_next_work(work);
spin_lock_irq(&worker->lock); io_impersonate_work(worker, work);
worker->cur_work = work; /*
spin_unlock_irq(&worker->lock); * OK to set IO_WQ_WORK_CANCEL even for uncancellable
* work, the worker function will do the right thing.
if (work->flags & IO_WQ_WORK_CB) */
work->func(&work); if (test_bit(IO_WQ_BIT_CANCEL, &wq->state))
work->flags |= IO_WQ_WORK_CANCEL;
if (work->files && current->files != work->files) {
task_lock(current); hash = io_get_work_hash(work);
current->files = work->files; linked = old_work = work;
task_unlock(current); linked->func(&linked);
} linked = (old_work == linked) ? NULL : linked;
if (work->fs && current->fs != work->fs)
current->fs = work->fs; work = next_hashed;
if (work->mm != worker->mm) if (!work && linked && !io_wq_is_hashed(linked)) {
io_wq_switch_mm(worker, work); work = linked;
if (worker->cur_creds != work->creds) linked = NULL;
io_wq_switch_creds(worker, work);
/*
* OK to set IO_WQ_WORK_CANCEL even for uncancellable work,
* the worker function will do the right thing.
*/
if (test_bit(IO_WQ_BIT_CANCEL, &wq->state))
work->flags |= IO_WQ_WORK_CANCEL;
if (worker->mm)
work->flags |= IO_WQ_WORK_HAS_MM;
if (wq->get_work) {
put_work = work;
wq->get_work(work);
}
old_work = work;
work->func(&work);
spin_lock_irq(&worker->lock);
worker->cur_work = NULL;
spin_unlock_irq(&worker->lock);
spin_lock_irq(&wqe->lock);
if (hash != -1U) {
wqe->hash_map &= ~BIT_ULL(hash);
wqe->flags &= ~IO_WQE_FLAG_STALLED;
}
if (work && work != old_work) {
spin_unlock_irq(&wqe->lock);
if (put_work && wq->put_work) {
wq->put_work(put_work);
put_work = NULL;
} }
io_assign_current_work(worker, work);
wq->free_work(old_work);
if (linked)
io_wqe_enqueue(wqe, linked);
if (hash != -1U && !next_hashed) {
spin_lock_irq(&wqe->lock);
wqe->hash_map &= ~BIT_ULL(hash);
wqe->flags &= ~IO_WQE_FLAG_STALLED;
/* dependent work is not hashed */
hash = -1U;
/* skip unnecessary unlock-lock wqe->lock */
if (!work)
goto get_next;
spin_unlock_irq(&wqe->lock);
}
} while (work);
/* dependent work not hashed */ spin_lock_irq(&wqe->lock);
hash = -1U;
goto next;
}
} while (1); } while (1);
} }
...@@ -747,17 +770,40 @@ static bool io_wq_can_queue(struct io_wqe *wqe, struct io_wqe_acct *acct, ...@@ -747,17 +770,40 @@ static bool io_wq_can_queue(struct io_wqe *wqe, struct io_wqe_acct *acct,
return true; return true;
} }
static void io_run_cancel(struct io_wq_work *work) static void io_run_cancel(struct io_wq_work *work, struct io_wqe *wqe)
{ {
struct io_wq *wq = wqe->wq;
do { do {
struct io_wq_work *old_work = work; struct io_wq_work *old_work = work;
work->flags |= IO_WQ_WORK_CANCEL; work->flags |= IO_WQ_WORK_CANCEL;
work->func(&work); work->func(&work);
work = (work == old_work) ? NULL : work; work = (work == old_work) ? NULL : work;
wq->free_work(old_work);
} while (work); } while (work);
} }
static void io_wqe_insert_work(struct io_wqe *wqe, struct io_wq_work *work)
{
unsigned int hash;
struct io_wq_work *tail;
if (!io_wq_is_hashed(work)) {
append:
wq_list_add_tail(&work->list, &wqe->work_list);
return;
}
hash = io_get_work_hash(work);
tail = wqe->hash_tail[hash];
wqe->hash_tail[hash] = work;
if (!tail)
goto append;
wq_list_add_after(&work->list, &tail->list, &wqe->work_list);
}
static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work) static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work)
{ {
struct io_wqe_acct *acct = io_work_get_acct(wqe, work); struct io_wqe_acct *acct = io_work_get_acct(wqe, work);
...@@ -771,13 +817,13 @@ static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work) ...@@ -771,13 +817,13 @@ static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work)
* It's close enough to not be an issue, fork() has the same delay. * It's close enough to not be an issue, fork() has the same delay.
*/ */
if (unlikely(!io_wq_can_queue(wqe, acct, work))) { if (unlikely(!io_wq_can_queue(wqe, acct, work))) {
io_run_cancel(work); io_run_cancel(work, wqe);
return; return;
} }
work_flags = work->flags; work_flags = work->flags;
spin_lock_irqsave(&wqe->lock, flags); spin_lock_irqsave(&wqe->lock, flags);
wq_list_add_tail(&work->list, &wqe->work_list); io_wqe_insert_work(wqe, work);
wqe->flags &= ~IO_WQE_FLAG_STALLED; wqe->flags &= ~IO_WQE_FLAG_STALLED;
spin_unlock_irqrestore(&wqe->lock, flags); spin_unlock_irqrestore(&wqe->lock, flags);
...@@ -794,19 +840,15 @@ void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work) ...@@ -794,19 +840,15 @@ void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work)
} }
/* /*
* Enqueue work, hashed by some key. Work items that hash to the same value * Work items that hash to the same value will not be done in parallel.
* will not be done in parallel. Used to limit concurrent writes, generally * Used to limit concurrent writes, generally hashed by inode.
* hashed by inode.
*/ */
void io_wq_enqueue_hashed(struct io_wq *wq, struct io_wq_work *work, void *val) void io_wq_hash_work(struct io_wq_work *work, void *val)
{ {
struct io_wqe *wqe = wq->wqes[numa_node_id()]; unsigned int bit;
unsigned bit;
bit = hash_ptr(val, IO_WQ_HASH_ORDER); bit = hash_ptr(val, IO_WQ_HASH_ORDER);
work->flags |= (IO_WQ_WORK_HASHED | (bit << IO_WQ_HASH_SHIFT)); work->flags |= (IO_WQ_WORK_HASHED | (bit << IO_WQ_HASH_SHIFT));
io_wqe_enqueue(wqe, work);
} }
static bool io_wqe_worker_send_sig(struct io_worker *worker, void *data) static bool io_wqe_worker_send_sig(struct io_worker *worker, void *data)
...@@ -856,14 +898,13 @@ void io_wq_cancel_all(struct io_wq *wq) ...@@ -856,14 +898,13 @@ void io_wq_cancel_all(struct io_wq *wq)
} }
struct io_cb_cancel_data { struct io_cb_cancel_data {
struct io_wqe *wqe; work_cancel_fn *fn;
work_cancel_fn *cancel; void *data;
void *caller_data;
}; };
static bool io_work_cancel(struct io_worker *worker, void *cancel_data) static bool io_wq_worker_cancel(struct io_worker *worker, void *data)
{ {
struct io_cb_cancel_data *data = cancel_data; struct io_cb_cancel_data *match = data;
unsigned long flags; unsigned long flags;
bool ret = false; bool ret = false;
...@@ -874,83 +915,7 @@ static bool io_work_cancel(struct io_worker *worker, void *cancel_data) ...@@ -874,83 +915,7 @@ static bool io_work_cancel(struct io_worker *worker, void *cancel_data)
spin_lock_irqsave(&worker->lock, flags); spin_lock_irqsave(&worker->lock, flags);
if (worker->cur_work && if (worker->cur_work &&
!(worker->cur_work->flags & IO_WQ_WORK_NO_CANCEL) && !(worker->cur_work->flags & IO_WQ_WORK_NO_CANCEL) &&
data->cancel(worker->cur_work, data->caller_data)) { match->fn(worker->cur_work, match->data)) {
send_sig(SIGINT, worker->task, 1);
ret = true;
}
spin_unlock_irqrestore(&worker->lock, flags);
return ret;
}
static enum io_wq_cancel io_wqe_cancel_cb_work(struct io_wqe *wqe,
work_cancel_fn *cancel,
void *cancel_data)
{
struct io_cb_cancel_data data = {
.wqe = wqe,
.cancel = cancel,
.caller_data = cancel_data,
};
struct io_wq_work_node *node, *prev;
struct io_wq_work *work;
unsigned long flags;
bool found = false;
spin_lock_irqsave(&wqe->lock, flags);
wq_list_for_each(node, prev, &wqe->work_list) {
work = container_of(node, struct io_wq_work, list);
if (cancel(work, cancel_data)) {
wq_node_del(&wqe->work_list, node, prev);
found = true;
break;
}
}
spin_unlock_irqrestore(&wqe->lock, flags);
if (found) {
io_run_cancel(work);
return IO_WQ_CANCEL_OK;
}
rcu_read_lock();
found = io_wq_for_each_worker(wqe, io_work_cancel, &data);
rcu_read_unlock();
return found ? IO_WQ_CANCEL_RUNNING : IO_WQ_CANCEL_NOTFOUND;
}
enum io_wq_cancel io_wq_cancel_cb(struct io_wq *wq, work_cancel_fn *cancel,
void *data)
{
enum io_wq_cancel ret = IO_WQ_CANCEL_NOTFOUND;
int node;
for_each_node(node) {
struct io_wqe *wqe = wq->wqes[node];
ret = io_wqe_cancel_cb_work(wqe, cancel, data);
if (ret != IO_WQ_CANCEL_NOTFOUND)
break;
}
return ret;
}
struct work_match {
bool (*fn)(struct io_wq_work *, void *data);
void *data;
};
static bool io_wq_worker_cancel(struct io_worker *worker, void *data)
{
struct work_match *match = data;
unsigned long flags;
bool ret = false;
spin_lock_irqsave(&worker->lock, flags);
if (match->fn(worker->cur_work, match->data) &&
!(worker->cur_work->flags & IO_WQ_WORK_NO_CANCEL)) {
send_sig(SIGINT, worker->task, 1); send_sig(SIGINT, worker->task, 1);
ret = true; ret = true;
} }
...@@ -960,7 +925,7 @@ static bool io_wq_worker_cancel(struct io_worker *worker, void *data) ...@@ -960,7 +925,7 @@ static bool io_wq_worker_cancel(struct io_worker *worker, void *data)
} }
static enum io_wq_cancel io_wqe_cancel_work(struct io_wqe *wqe, static enum io_wq_cancel io_wqe_cancel_work(struct io_wqe *wqe,
struct work_match *match) struct io_cb_cancel_data *match)
{ {
struct io_wq_work_node *node, *prev; struct io_wq_work_node *node, *prev;
struct io_wq_work *work; struct io_wq_work *work;
...@@ -977,7 +942,7 @@ static enum io_wq_cancel io_wqe_cancel_work(struct io_wqe *wqe, ...@@ -977,7 +942,7 @@ static enum io_wq_cancel io_wqe_cancel_work(struct io_wqe *wqe,
work = container_of(node, struct io_wq_work, list); work = container_of(node, struct io_wq_work, list);
if (match->fn(work, match->data)) { if (match->fn(work, match->data)) {
wq_node_del(&wqe->work_list, node, prev); wq_list_del(&wqe->work_list, node, prev);
found = true; found = true;
break; break;
} }
...@@ -985,7 +950,7 @@ static enum io_wq_cancel io_wqe_cancel_work(struct io_wqe *wqe, ...@@ -985,7 +950,7 @@ static enum io_wq_cancel io_wqe_cancel_work(struct io_wqe *wqe,
spin_unlock_irqrestore(&wqe->lock, flags); spin_unlock_irqrestore(&wqe->lock, flags);
if (found) { if (found) {
io_run_cancel(work); io_run_cancel(work, wqe);
return IO_WQ_CANCEL_OK; return IO_WQ_CANCEL_OK;
} }
...@@ -1001,22 +966,16 @@ static enum io_wq_cancel io_wqe_cancel_work(struct io_wqe *wqe, ...@@ -1001,22 +966,16 @@ static enum io_wq_cancel io_wqe_cancel_work(struct io_wqe *wqe,
return found ? IO_WQ_CANCEL_RUNNING : IO_WQ_CANCEL_NOTFOUND; return found ? IO_WQ_CANCEL_RUNNING : IO_WQ_CANCEL_NOTFOUND;
} }
static bool io_wq_work_match(struct io_wq_work *work, void *data) enum io_wq_cancel io_wq_cancel_cb(struct io_wq *wq, work_cancel_fn *cancel,
{ void *data)
return work == data;
}
enum io_wq_cancel io_wq_cancel_work(struct io_wq *wq, struct io_wq_work *cwork)
{ {
struct work_match match = { struct io_cb_cancel_data match = {
.fn = io_wq_work_match, .fn = cancel,
.data = cwork .data = data,
}; };
enum io_wq_cancel ret = IO_WQ_CANCEL_NOTFOUND; enum io_wq_cancel ret = IO_WQ_CANCEL_NOTFOUND;
int node; int node;
cwork->flags |= IO_WQ_WORK_CANCEL;
for_each_node(node) { for_each_node(node) {
struct io_wqe *wqe = wq->wqes[node]; struct io_wqe *wqe = wq->wqes[node];
...@@ -1028,33 +987,28 @@ enum io_wq_cancel io_wq_cancel_work(struct io_wq *wq, struct io_wq_work *cwork) ...@@ -1028,33 +987,28 @@ enum io_wq_cancel io_wq_cancel_work(struct io_wq *wq, struct io_wq_work *cwork)
return ret; return ret;
} }
static bool io_wq_io_cb_cancel_data(struct io_wq_work *work, void *data)
{
return work == data;
}
enum io_wq_cancel io_wq_cancel_work(struct io_wq *wq, struct io_wq_work *cwork)
{
return io_wq_cancel_cb(wq, io_wq_io_cb_cancel_data, (void *)cwork);
}
static bool io_wq_pid_match(struct io_wq_work *work, void *data) static bool io_wq_pid_match(struct io_wq_work *work, void *data)
{ {
pid_t pid = (pid_t) (unsigned long) data; pid_t pid = (pid_t) (unsigned long) data;
if (work) return work->task_pid == pid;
return work->task_pid == pid;
return false;
} }
enum io_wq_cancel io_wq_cancel_pid(struct io_wq *wq, pid_t pid) enum io_wq_cancel io_wq_cancel_pid(struct io_wq *wq, pid_t pid)
{ {
struct work_match match = { void *data = (void *) (unsigned long) pid;
.fn = io_wq_pid_match,
.data = (void *) (unsigned long) pid
};
enum io_wq_cancel ret = IO_WQ_CANCEL_NOTFOUND;
int node;
for_each_node(node) {
struct io_wqe *wqe = wq->wqes[node];
ret = io_wqe_cancel_work(wqe, &match); return io_wq_cancel_cb(wq, io_wq_pid_match, data);
if (ret != IO_WQ_CANCEL_NOTFOUND)
break;
}
return ret;
} }
struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data) struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
...@@ -1062,6 +1016,9 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data) ...@@ -1062,6 +1016,9 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
int ret = -ENOMEM, node; int ret = -ENOMEM, node;
struct io_wq *wq; struct io_wq *wq;
if (WARN_ON_ONCE(!data->free_work))
return ERR_PTR(-EINVAL);
wq = kzalloc(sizeof(*wq), GFP_KERNEL); wq = kzalloc(sizeof(*wq), GFP_KERNEL);
if (!wq) if (!wq)
return ERR_PTR(-ENOMEM); return ERR_PTR(-ENOMEM);
...@@ -1072,8 +1029,7 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data) ...@@ -1072,8 +1029,7 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
return ERR_PTR(-ENOMEM); return ERR_PTR(-ENOMEM);
} }
wq->get_work = data->get_work; wq->free_work = data->free_work;
wq->put_work = data->put_work;
/* caller must already hold a reference to this */ /* caller must already hold a reference to this */
wq->user = data->user; wq->user = data->user;
...@@ -1130,7 +1086,7 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data) ...@@ -1130,7 +1086,7 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
bool io_wq_get(struct io_wq *wq, struct io_wq_data *data) bool io_wq_get(struct io_wq *wq, struct io_wq_data *data)
{ {
if (data->get_work != wq->get_work || data->put_work != wq->put_work) if (data->free_work != wq->free_work)
return false; return false;
return refcount_inc_not_zero(&wq->use_refs); return refcount_inc_not_zero(&wq->use_refs);
......
...@@ -5,10 +5,8 @@ struct io_wq; ...@@ -5,10 +5,8 @@ struct io_wq;
enum { enum {
IO_WQ_WORK_CANCEL = 1, IO_WQ_WORK_CANCEL = 1,
IO_WQ_WORK_HAS_MM = 2,
IO_WQ_WORK_HASHED = 4, IO_WQ_WORK_HASHED = 4,
IO_WQ_WORK_UNBOUND = 32, IO_WQ_WORK_UNBOUND = 32,
IO_WQ_WORK_CB = 128,
IO_WQ_WORK_NO_CANCEL = 256, IO_WQ_WORK_NO_CANCEL = 256,
IO_WQ_WORK_CONCURRENT = 512, IO_WQ_WORK_CONCURRENT = 512,
...@@ -30,6 +28,18 @@ struct io_wq_work_list { ...@@ -30,6 +28,18 @@ struct io_wq_work_list {
struct io_wq_work_node *last; struct io_wq_work_node *last;
}; };
static inline void wq_list_add_after(struct io_wq_work_node *node,
struct io_wq_work_node *pos,
struct io_wq_work_list *list)
{
struct io_wq_work_node *next = pos->next;
pos->next = node;
node->next = next;
if (!next)
list->last = node;
}
static inline void wq_list_add_tail(struct io_wq_work_node *node, static inline void wq_list_add_tail(struct io_wq_work_node *node,
struct io_wq_work_list *list) struct io_wq_work_list *list)
{ {
...@@ -42,17 +52,26 @@ static inline void wq_list_add_tail(struct io_wq_work_node *node, ...@@ -42,17 +52,26 @@ static inline void wq_list_add_tail(struct io_wq_work_node *node,
} }
} }
static inline void wq_node_del(struct io_wq_work_list *list, static inline void wq_list_cut(struct io_wq_work_list *list,
struct io_wq_work_node *node, struct io_wq_work_node *last,
struct io_wq_work_node *prev) struct io_wq_work_node *prev)
{ {
if (node == list->first) /* first in the list, if prev==NULL */
WRITE_ONCE(list->first, node->next); if (!prev)
if (node == list->last) WRITE_ONCE(list->first, last->next);
else
prev->next = last->next;
if (last == list->last)
list->last = prev; list->last = prev;
if (prev) last->next = NULL;
prev->next = node->next; }
node->next = NULL;
static inline void wq_list_del(struct io_wq_work_list *list,
struct io_wq_work_node *node,
struct io_wq_work_node *prev)
{
wq_list_cut(list, node, prev);
} }
#define wq_list_for_each(pos, prv, head) \ #define wq_list_for_each(pos, prv, head) \
...@@ -65,10 +84,7 @@ static inline void wq_node_del(struct io_wq_work_list *list, ...@@ -65,10 +84,7 @@ static inline void wq_node_del(struct io_wq_work_list *list,
} while (0) } while (0)
struct io_wq_work { struct io_wq_work {
union { struct io_wq_work_node list;
struct io_wq_work_node list;
void *data;
};
void (*func)(struct io_wq_work **); void (*func)(struct io_wq_work **);
struct files_struct *files; struct files_struct *files;
struct mm_struct *mm; struct mm_struct *mm;
...@@ -83,14 +99,20 @@ struct io_wq_work { ...@@ -83,14 +99,20 @@ struct io_wq_work {
*(work) = (struct io_wq_work){ .func = _func }; \ *(work) = (struct io_wq_work){ .func = _func }; \
} while (0) \ } while (0) \
typedef void (get_work_fn)(struct io_wq_work *); static inline struct io_wq_work *wq_next_work(struct io_wq_work *work)
typedef void (put_work_fn)(struct io_wq_work *); {
if (!work->list.next)
return NULL;
return container_of(work->list.next, struct io_wq_work, list);
}
typedef void (free_work_fn)(struct io_wq_work *);
struct io_wq_data { struct io_wq_data {
struct user_struct *user; struct user_struct *user;
get_work_fn *get_work; free_work_fn *free_work;
put_work_fn *put_work;
}; };
struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data); struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data);
...@@ -98,7 +120,12 @@ bool io_wq_get(struct io_wq *wq, struct io_wq_data *data); ...@@ -98,7 +120,12 @@ bool io_wq_get(struct io_wq *wq, struct io_wq_data *data);
void io_wq_destroy(struct io_wq *wq); void io_wq_destroy(struct io_wq *wq);
void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work); void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work);
void io_wq_enqueue_hashed(struct io_wq *wq, struct io_wq_work *work, void *val); void io_wq_hash_work(struct io_wq_work *work, void *val);
static inline bool io_wq_is_hashed(struct io_wq_work *work)
{
return work->flags & IO_WQ_WORK_HASHED;
}
void io_wq_cancel_all(struct io_wq *wq); void io_wq_cancel_all(struct io_wq *wq);
enum io_wq_cancel io_wq_cancel_work(struct io_wq *wq, struct io_wq_work *cwork); enum io_wq_cancel io_wq_cancel_work(struct io_wq *wq, struct io_wq_work *cwork);
......
此差异已折叠。
...@@ -1109,9 +1109,9 @@ static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe, ...@@ -1109,9 +1109,9 @@ static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe,
/* /*
* Determine where to splice to/from. * Determine where to splice to/from.
*/ */
static long do_splice(struct file *in, loff_t __user *off_in, long do_splice(struct file *in, loff_t __user *off_in,
struct file *out, loff_t __user *off_out, struct file *out, loff_t __user *off_out,
size_t len, unsigned int flags) size_t len, unsigned int flags)
{ {
struct pipe_inode_info *ipipe; struct pipe_inode_info *ipipe;
struct pipe_inode_info *opipe; struct pipe_inode_info *opipe;
......
...@@ -391,6 +391,10 @@ extern int recvmsg_copy_msghdr(struct msghdr *msg, ...@@ -391,6 +391,10 @@ extern int recvmsg_copy_msghdr(struct msghdr *msg,
struct user_msghdr __user *umsg, unsigned flags, struct user_msghdr __user *umsg, unsigned flags,
struct sockaddr __user **uaddr, struct sockaddr __user **uaddr,
struct iovec **iov); struct iovec **iov);
extern int __copy_msghdr_from_user(struct msghdr *kmsg,
struct user_msghdr __user *umsg,
struct sockaddr __user **save_addr,
struct iovec __user **uiov, size_t *nsegs);
/* helpers which do the actual work for syscalls */ /* helpers which do the actual work for syscalls */
extern int __sys_recvfrom(int fd, void __user *ubuf, size_t size, extern int __sys_recvfrom(int fd, void __user *ubuf, size_t size,
......
...@@ -78,6 +78,9 @@ extern ssize_t add_to_pipe(struct pipe_inode_info *, ...@@ -78,6 +78,9 @@ extern ssize_t add_to_pipe(struct pipe_inode_info *,
struct pipe_buffer *); struct pipe_buffer *);
extern ssize_t splice_direct_to_actor(struct file *, struct splice_desc *, extern ssize_t splice_direct_to_actor(struct file *, struct splice_desc *,
splice_direct_actor *); splice_direct_actor *);
extern long do_splice(struct file *in, loff_t __user *off_in,
struct file *out, loff_t __user *off_out,
size_t len, unsigned int flags);
/* /*
* for dynamic pipe sizing * for dynamic pipe sizing
......
...@@ -38,6 +38,9 @@ struct compat_cmsghdr { ...@@ -38,6 +38,9 @@ struct compat_cmsghdr {
#define compat_mmsghdr mmsghdr #define compat_mmsghdr mmsghdr
#endif /* defined(CONFIG_COMPAT) */ #endif /* defined(CONFIG_COMPAT) */
int __get_compat_msghdr(struct msghdr *kmsg, struct compat_msghdr __user *umsg,
struct sockaddr __user **save_addr, compat_uptr_t *ptr,
compat_size_t *len);
int get_compat_msghdr(struct msghdr *, struct compat_msghdr __user *, int get_compat_msghdr(struct msghdr *, struct compat_msghdr __user *,
struct sockaddr __user **, struct iovec **); struct sockaddr __user **, struct iovec **);
struct sock_fprog __user *get_compat_bpf_fprog(char __user *optval); struct sock_fprog __user *get_compat_bpf_fprog(char __user *optval);
......
...@@ -357,6 +357,109 @@ TRACE_EVENT(io_uring_submit_sqe, ...@@ -357,6 +357,109 @@ TRACE_EVENT(io_uring_submit_sqe,
__entry->force_nonblock, __entry->sq_thread) __entry->force_nonblock, __entry->sq_thread)
); );
TRACE_EVENT(io_uring_poll_arm,
TP_PROTO(void *ctx, u8 opcode, u64 user_data, int mask, int events),
TP_ARGS(ctx, opcode, user_data, mask, events),
TP_STRUCT__entry (
__field( void *, ctx )
__field( u8, opcode )
__field( u64, user_data )
__field( int, mask )
__field( int, events )
),
TP_fast_assign(
__entry->ctx = ctx;
__entry->opcode = opcode;
__entry->user_data = user_data;
__entry->mask = mask;
__entry->events = events;
),
TP_printk("ring %p, op %d, data 0x%llx, mask 0x%x, events 0x%x",
__entry->ctx, __entry->opcode,
(unsigned long long) __entry->user_data,
__entry->mask, __entry->events)
);
TRACE_EVENT(io_uring_poll_wake,
TP_PROTO(void *ctx, u8 opcode, u64 user_data, int mask),
TP_ARGS(ctx, opcode, user_data, mask),
TP_STRUCT__entry (
__field( void *, ctx )
__field( u8, opcode )
__field( u64, user_data )
__field( int, mask )
),
TP_fast_assign(
__entry->ctx = ctx;
__entry->opcode = opcode;
__entry->user_data = user_data;
__entry->mask = mask;
),
TP_printk("ring %p, op %d, data 0x%llx, mask 0x%x",
__entry->ctx, __entry->opcode,
(unsigned long long) __entry->user_data,
__entry->mask)
);
TRACE_EVENT(io_uring_task_add,
TP_PROTO(void *ctx, u8 opcode, u64 user_data, int mask),
TP_ARGS(ctx, opcode, user_data, mask),
TP_STRUCT__entry (
__field( void *, ctx )
__field( u8, opcode )
__field( u64, user_data )
__field( int, mask )
),
TP_fast_assign(
__entry->ctx = ctx;
__entry->opcode = opcode;
__entry->user_data = user_data;
__entry->mask = mask;
),
TP_printk("ring %p, op %d, data 0x%llx, mask %x",
__entry->ctx, __entry->opcode,
(unsigned long long) __entry->user_data,
__entry->mask)
);
TRACE_EVENT(io_uring_task_run,
TP_PROTO(void *ctx, u8 opcode, u64 user_data),
TP_ARGS(ctx, opcode, user_data),
TP_STRUCT__entry (
__field( void *, ctx )
__field( u8, opcode )
__field( u64, user_data )
),
TP_fast_assign(
__entry->ctx = ctx;
__entry->opcode = opcode;
__entry->user_data = user_data;
),
TP_printk("ring %p, op %d, data 0x%llx",
__entry->ctx, __entry->opcode,
(unsigned long long) __entry->user_data)
);
#endif /* _TRACE_IO_URING_H */ #endif /* _TRACE_IO_URING_H */
/* This part must be outside protection */ /* This part must be outside protection */
......
/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ /* SPDX-License-Identifier: (GPL-2.0 WITH Linux-syscall-note) OR MIT */
/* /*
* Header file for the io_uring interface. * Header file for the io_uring interface.
* *
...@@ -23,7 +23,10 @@ struct io_uring_sqe { ...@@ -23,7 +23,10 @@ struct io_uring_sqe {
__u64 off; /* offset into file */ __u64 off; /* offset into file */
__u64 addr2; __u64 addr2;
}; };
__u64 addr; /* pointer to buffer or iovecs */ union {
__u64 addr; /* pointer to buffer or iovecs */
__u64 splice_off_in;
};
__u32 len; /* buffer size or number of iovecs */ __u32 len; /* buffer size or number of iovecs */
union { union {
__kernel_rwf_t rw_flags; __kernel_rwf_t rw_flags;
...@@ -37,14 +40,21 @@ struct io_uring_sqe { ...@@ -37,14 +40,21 @@ struct io_uring_sqe {
__u32 open_flags; __u32 open_flags;
__u32 statx_flags; __u32 statx_flags;
__u32 fadvise_advice; __u32 fadvise_advice;
__u32 splice_flags;
}; };
__u64 user_data; /* data to be passed back at completion time */ __u64 user_data; /* data to be passed back at completion time */
union { union {
struct { struct {
/* index into fixed buffers, if used */ /* pack this to avoid bogus arm OABI complaints */
__u16 buf_index; union {
/* index into fixed buffers, if used */
__u16 buf_index;
/* for grouped buffer selection */
__u16 buf_group;
} __attribute__((packed));
/* personality to use, if used */ /* personality to use, if used */
__u16 personality; __u16 personality;
__s32 splice_fd_in;
}; };
__u64 __pad2[3]; __u64 __pad2[3];
}; };
...@@ -56,6 +66,7 @@ enum { ...@@ -56,6 +66,7 @@ enum {
IOSQE_IO_LINK_BIT, IOSQE_IO_LINK_BIT,
IOSQE_IO_HARDLINK_BIT, IOSQE_IO_HARDLINK_BIT,
IOSQE_ASYNC_BIT, IOSQE_ASYNC_BIT,
IOSQE_BUFFER_SELECT_BIT,
}; };
/* /*
...@@ -71,6 +82,8 @@ enum { ...@@ -71,6 +82,8 @@ enum {
#define IOSQE_IO_HARDLINK (1U << IOSQE_IO_HARDLINK_BIT) #define IOSQE_IO_HARDLINK (1U << IOSQE_IO_HARDLINK_BIT)
/* always go async */ /* always go async */
#define IOSQE_ASYNC (1U << IOSQE_ASYNC_BIT) #define IOSQE_ASYNC (1U << IOSQE_ASYNC_BIT)
/* select buffer from sqe->buf_group */
#define IOSQE_BUFFER_SELECT (1U << IOSQE_BUFFER_SELECT_BIT)
/* /*
* io_uring_setup() flags * io_uring_setup() flags
...@@ -113,6 +126,9 @@ enum { ...@@ -113,6 +126,9 @@ enum {
IORING_OP_RECV, IORING_OP_RECV,
IORING_OP_OPENAT2, IORING_OP_OPENAT2,
IORING_OP_EPOLL_CTL, IORING_OP_EPOLL_CTL,
IORING_OP_SPLICE,
IORING_OP_PROVIDE_BUFFERS,
IORING_OP_REMOVE_BUFFERS,
/* this goes last, obviously */ /* this goes last, obviously */
IORING_OP_LAST, IORING_OP_LAST,
...@@ -128,6 +144,12 @@ enum { ...@@ -128,6 +144,12 @@ enum {
*/ */
#define IORING_TIMEOUT_ABS (1U << 0) #define IORING_TIMEOUT_ABS (1U << 0)
/*
* sqe->splice_flags
* extends splice(2) flags
*/
#define SPLICE_F_FD_IN_FIXED (1U << 31) /* the last bit of __u32 */
/* /*
* IO completion data structure (Completion Queue Entry) * IO completion data structure (Completion Queue Entry)
*/ */
...@@ -137,6 +159,17 @@ struct io_uring_cqe { ...@@ -137,6 +159,17 @@ struct io_uring_cqe {
__u32 flags; __u32 flags;
}; };
/*
* cqe->flags
*
* IORING_CQE_F_BUFFER If set, the upper 16 bits are the buffer ID
*/
#define IORING_CQE_F_BUFFER (1U << 0)
enum {
IORING_CQE_BUFFER_SHIFT = 16,
};
/* /*
* Magic offsets for the application to mmap the data it needs * Magic offsets for the application to mmap the data it needs
*/ */
...@@ -204,6 +237,7 @@ struct io_uring_params { ...@@ -204,6 +237,7 @@ struct io_uring_params {
#define IORING_FEAT_SUBMIT_STABLE (1U << 2) #define IORING_FEAT_SUBMIT_STABLE (1U << 2)
#define IORING_FEAT_RW_CUR_POS (1U << 3) #define IORING_FEAT_RW_CUR_POS (1U << 3)
#define IORING_FEAT_CUR_PERSONALITY (1U << 4) #define IORING_FEAT_CUR_PERSONALITY (1U << 4)
#define IORING_FEAT_FAST_POLL (1U << 5)
/* /*
* io_uring_register(2) opcodes and arguments * io_uring_register(2) opcodes and arguments
......
...@@ -97,16 +97,26 @@ void task_work_run(void) ...@@ -97,16 +97,26 @@ void task_work_run(void)
* work->func() can do task_work_add(), do not set * work->func() can do task_work_add(), do not set
* work_exited unless the list is empty. * work_exited unless the list is empty.
*/ */
raw_spin_lock_irq(&task->pi_lock);
do { do {
head = NULL;
work = READ_ONCE(task->task_works); work = READ_ONCE(task->task_works);
head = !work && (task->flags & PF_EXITING) ? if (!work) {
&work_exited : NULL; if (task->flags & PF_EXITING)
head = &work_exited;
else
break;
}
} while (cmpxchg(&task->task_works, work, head) != work); } while (cmpxchg(&task->task_works, work, head) != work);
raw_spin_unlock_irq(&task->pi_lock);
if (!work) if (!work)
break; break;
/*
* Synchronize with task_work_cancel(). It can not remove
* the first entry == work, cmpxchg(task_works) must fail.
* But it can remove another entry from the ->next list.
*/
raw_spin_lock_irq(&task->pi_lock);
raw_spin_unlock_irq(&task->pi_lock);
do { do {
next = work->next; next = work->next;
......
...@@ -33,10 +33,10 @@ ...@@ -33,10 +33,10 @@
#include <linux/uaccess.h> #include <linux/uaccess.h>
#include <net/compat.h> #include <net/compat.h>
int get_compat_msghdr(struct msghdr *kmsg, int __get_compat_msghdr(struct msghdr *kmsg,
struct compat_msghdr __user *umsg, struct compat_msghdr __user *umsg,
struct sockaddr __user **save_addr, struct sockaddr __user **save_addr,
struct iovec **iov) compat_uptr_t *ptr, compat_size_t *len)
{ {
struct compat_msghdr msg; struct compat_msghdr msg;
ssize_t err; ssize_t err;
...@@ -79,10 +79,26 @@ int get_compat_msghdr(struct msghdr *kmsg, ...@@ -79,10 +79,26 @@ int get_compat_msghdr(struct msghdr *kmsg,
return -EMSGSIZE; return -EMSGSIZE;
kmsg->msg_iocb = NULL; kmsg->msg_iocb = NULL;
*ptr = msg.msg_iov;
*len = msg.msg_iovlen;
return 0;
}
int get_compat_msghdr(struct msghdr *kmsg,
struct compat_msghdr __user *umsg,
struct sockaddr __user **save_addr,
struct iovec **iov)
{
compat_uptr_t ptr;
compat_size_t len;
ssize_t err;
err = __get_compat_msghdr(kmsg, umsg, save_addr, &ptr, &len);
if (err)
return err;
err = compat_import_iovec(save_addr ? READ : WRITE, err = compat_import_iovec(save_addr ? READ : WRITE, compat_ptr(ptr),
compat_ptr(msg.msg_iov), msg.msg_iovlen, len, UIO_FASTIOV, iov, &kmsg->msg_iter);
UIO_FASTIOV, iov, &kmsg->msg_iter);
return err < 0 ? err : 0; return err < 0 ? err : 0;
} }
......
...@@ -2228,10 +2228,10 @@ struct used_address { ...@@ -2228,10 +2228,10 @@ struct used_address {
unsigned int name_len; unsigned int name_len;
}; };
static int copy_msghdr_from_user(struct msghdr *kmsg, int __copy_msghdr_from_user(struct msghdr *kmsg,
struct user_msghdr __user *umsg, struct user_msghdr __user *umsg,
struct sockaddr __user **save_addr, struct sockaddr __user **save_addr,
struct iovec **iov) struct iovec __user **uiov, size_t *nsegs)
{ {
struct user_msghdr msg; struct user_msghdr msg;
ssize_t err; ssize_t err;
...@@ -2273,6 +2273,23 @@ static int copy_msghdr_from_user(struct msghdr *kmsg, ...@@ -2273,6 +2273,23 @@ static int copy_msghdr_from_user(struct msghdr *kmsg,
return -EMSGSIZE; return -EMSGSIZE;
kmsg->msg_iocb = NULL; kmsg->msg_iocb = NULL;
*uiov = msg.msg_iov;
*nsegs = msg.msg_iovlen;
return 0;
}
static int copy_msghdr_from_user(struct msghdr *kmsg,
struct user_msghdr __user *umsg,
struct sockaddr __user **save_addr,
struct iovec **iov)
{
struct user_msghdr msg;
ssize_t err;
err = __copy_msghdr_from_user(kmsg, umsg, save_addr, &msg.msg_iov,
&msg.msg_iovlen);
if (err)
return err;
err = import_iovec(save_addr ? READ : WRITE, err = import_iovec(save_addr ? READ : WRITE,
msg.msg_iov, msg.msg_iovlen, msg.msg_iov, msg.msg_iovlen,
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册