提交 e59cd880 编写于 作者: L Linus Torvalds

Merge tag 'for-5.7/io_uring-2020-03-29' of git://git.kernel.dk/linux-block

Pull io_uring updates from Jens Axboe:
 "Here are the io_uring changes for this merge window. Light on new
  features this time around (just splice + buffer selection), lots of
  cleanups, fixes, and improvements to existing support. In particular,
  this contains:

   - Cleanup fixed file update handling for stack fallback (Hillf)

   - Re-work of how pollable async IO is handled, we no longer require
     thread offload to handle that. Instead we rely using poll to drive
     this, with task_work execution.

   - In conjunction with the above, allow expendable buffer selection,
     so that poll+recv (for example) no longer has to be a split
     operation.

   - Make sure we honor RLIMIT_FSIZE for buffered writes

   - Add support for splice (Pavel)

   - Linked work inheritance fixes and optimizations (Pavel)

   - Async work fixes and cleanups (Pavel)

   - Improve io-wq locking (Pavel)

   - Hashed link write improvements (Pavel)

   - SETUP_IOPOLL|SETUP_SQPOLL improvements (Xiaoguang)"

* tag 'for-5.7/io_uring-2020-03-29' of git://git.kernel.dk/linux-block: (54 commits)
  io_uring: cleanup io_alloc_async_ctx()
  io_uring: fix missing 'return' in comment
  io-wq: handle hashed writes in chains
  io-uring: drop 'free_pfile' in struct io_file_put
  io-uring: drop completion when removing file
  io_uring: Fix ->data corruption on re-enqueue
  io-wq: close cancel gap for hashed linked work
  io_uring: make spdxcheck.py happy
  io_uring: honor original task RLIMIT_FSIZE
  io-wq: hash dependent work
  io-wq: split hashing and enqueueing
  io-wq: don't resched if there is no work
  io-wq: remove duplicated cancel code
  io_uring: fix truncated async read/readv and write/writev retry
  io_uring: dual license io_uring.h uapi header
  io_uring: io_uring_enter(2) don't poll while SETUP_IOPOLL|SETUP_SQPOLL enabled
  io_uring: Fix unused function warnings
  io_uring: add end-of-bits marker and build time verify it
  io_uring: provide means of removing buffers
  io_uring: add IOSQE_BUFFER_SELECT support for IORING_OP_RECVMSG
  ...
......@@ -69,6 +69,8 @@ struct io_worker {
#define IO_WQ_HASH_ORDER 5
#endif
#define IO_WQ_NR_HASH_BUCKETS (1u << IO_WQ_HASH_ORDER)
struct io_wqe_acct {
unsigned nr_workers;
unsigned max_workers;
......@@ -98,6 +100,7 @@ struct io_wqe {
struct list_head all_list;
struct io_wq *wq;
struct io_wq_work *hash_tail[IO_WQ_NR_HASH_BUCKETS];
};
/*
......@@ -107,8 +110,7 @@ struct io_wq {
struct io_wqe **wqes;
unsigned long state;
get_work_fn *get_work;
put_work_fn *put_work;
free_work_fn *free_work;
struct task_struct *manager;
struct user_struct *user;
......@@ -376,26 +378,35 @@ static bool __io_worker_idle(struct io_wqe *wqe, struct io_worker *worker)
return __io_worker_unuse(wqe, worker);
}
static struct io_wq_work *io_get_next_work(struct io_wqe *wqe, unsigned *hash)
static inline unsigned int io_get_work_hash(struct io_wq_work *work)
{
return work->flags >> IO_WQ_HASH_SHIFT;
}
static struct io_wq_work *io_get_next_work(struct io_wqe *wqe)
__must_hold(wqe->lock)
{
struct io_wq_work_node *node, *prev;
struct io_wq_work *work;
struct io_wq_work *work, *tail;
unsigned int hash;
wq_list_for_each(node, prev, &wqe->work_list) {
work = container_of(node, struct io_wq_work, list);
/* not hashed, can run anytime */
if (!(work->flags & IO_WQ_WORK_HASHED)) {
wq_node_del(&wqe->work_list, node, prev);
if (!io_wq_is_hashed(work)) {
wq_list_del(&wqe->work_list, node, prev);
return work;
}
/* hashed, can run if not already running */
*hash = work->flags >> IO_WQ_HASH_SHIFT;
if (!(wqe->hash_map & BIT_ULL(*hash))) {
wqe->hash_map |= BIT_ULL(*hash);
wq_node_del(&wqe->work_list, node, prev);
hash = io_get_work_hash(work);
if (!(wqe->hash_map & BIT(hash))) {
wqe->hash_map |= BIT(hash);
/* all items with this hash lie in [work, tail] */
tail = wqe->hash_tail[hash];
wqe->hash_tail[hash] = NULL;
wq_list_cut(&wqe->work_list, &tail->list, prev);
return work;
}
}
......@@ -440,16 +451,49 @@ static void io_wq_switch_creds(struct io_worker *worker,
worker->saved_creds = old_creds;
}
static void io_impersonate_work(struct io_worker *worker,
struct io_wq_work *work)
{
if (work->files && current->files != work->files) {
task_lock(current);
current->files = work->files;
task_unlock(current);
}
if (work->fs && current->fs != work->fs)
current->fs = work->fs;
if (work->mm != worker->mm)
io_wq_switch_mm(worker, work);
if (worker->cur_creds != work->creds)
io_wq_switch_creds(worker, work);
}
static void io_assign_current_work(struct io_worker *worker,
struct io_wq_work *work)
{
if (work) {
/* flush pending signals before assigning new work */
if (signal_pending(current))
flush_signals(current);
cond_resched();
}
spin_lock_irq(&worker->lock);
worker->cur_work = work;
spin_unlock_irq(&worker->lock);
}
static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work);
static void io_worker_handle_work(struct io_worker *worker)
__releases(wqe->lock)
{
struct io_wq_work *work, *old_work = NULL, *put_work = NULL;
struct io_wqe *wqe = worker->wqe;
struct io_wq *wq = wqe->wq;
do {
unsigned hash = -1U;
struct io_wq_work *work;
unsigned int hash;
get_next:
/*
* If we got some work, mark us as busy. If we didn't, but
* the list isn't empty, it means we stalled on hashed work.
......@@ -457,81 +501,60 @@ static void io_worker_handle_work(struct io_worker *worker)
* can't make progress, any work completion or insertion will
* clear the stalled flag.
*/
work = io_get_next_work(wqe, &hash);
work = io_get_next_work(wqe);
if (work)
__io_worker_busy(wqe, worker, work);
else if (!wq_list_empty(&wqe->work_list))
wqe->flags |= IO_WQE_FLAG_STALLED;
spin_unlock_irq(&wqe->lock);
if (put_work && wq->put_work)
wq->put_work(old_work);
if (!work)
break;
next:
/* flush any pending signals before assigning new work */
if (signal_pending(current))
flush_signals(current);
cond_resched();
spin_lock_irq(&worker->lock);
worker->cur_work = work;
spin_unlock_irq(&worker->lock);
if (work->flags & IO_WQ_WORK_CB)
work->func(&work);
if (work->files && current->files != work->files) {
task_lock(current);
current->files = work->files;
task_unlock(current);
}
if (work->fs && current->fs != work->fs)
current->fs = work->fs;
if (work->mm != worker->mm)
io_wq_switch_mm(worker, work);
if (worker->cur_creds != work->creds)
io_wq_switch_creds(worker, work);
/*
* OK to set IO_WQ_WORK_CANCEL even for uncancellable work,
* the worker function will do the right thing.
*/
if (test_bit(IO_WQ_BIT_CANCEL, &wq->state))
work->flags |= IO_WQ_WORK_CANCEL;
if (worker->mm)
work->flags |= IO_WQ_WORK_HAS_MM;
if (wq->get_work) {
put_work = work;
wq->get_work(work);
}
old_work = work;
work->func(&work);
spin_lock_irq(&worker->lock);
worker->cur_work = NULL;
spin_unlock_irq(&worker->lock);
spin_lock_irq(&wqe->lock);
if (hash != -1U) {
wqe->hash_map &= ~BIT_ULL(hash);
wqe->flags &= ~IO_WQE_FLAG_STALLED;
}
if (work && work != old_work) {
spin_unlock_irq(&wqe->lock);
if (put_work && wq->put_work) {
wq->put_work(put_work);
put_work = NULL;
io_assign_current_work(worker, work);
/* handle a whole dependent link */
do {
struct io_wq_work *old_work, *next_hashed, *linked;
next_hashed = wq_next_work(work);
io_impersonate_work(worker, work);
/*
* OK to set IO_WQ_WORK_CANCEL even for uncancellable
* work, the worker function will do the right thing.
*/
if (test_bit(IO_WQ_BIT_CANCEL, &wq->state))
work->flags |= IO_WQ_WORK_CANCEL;
hash = io_get_work_hash(work);
linked = old_work = work;
linked->func(&linked);
linked = (old_work == linked) ? NULL : linked;
work = next_hashed;
if (!work && linked && !io_wq_is_hashed(linked)) {
work = linked;
linked = NULL;
}
io_assign_current_work(worker, work);
wq->free_work(old_work);
if (linked)
io_wqe_enqueue(wqe, linked);
if (hash != -1U && !next_hashed) {
spin_lock_irq(&wqe->lock);
wqe->hash_map &= ~BIT_ULL(hash);
wqe->flags &= ~IO_WQE_FLAG_STALLED;
/* dependent work is not hashed */
hash = -1U;
/* skip unnecessary unlock-lock wqe->lock */
if (!work)
goto get_next;
spin_unlock_irq(&wqe->lock);
}
} while (work);
/* dependent work not hashed */
hash = -1U;
goto next;
}
spin_lock_irq(&wqe->lock);
} while (1);
}
......@@ -747,17 +770,40 @@ static bool io_wq_can_queue(struct io_wqe *wqe, struct io_wqe_acct *acct,
return true;
}
static void io_run_cancel(struct io_wq_work *work)
static void io_run_cancel(struct io_wq_work *work, struct io_wqe *wqe)
{
struct io_wq *wq = wqe->wq;
do {
struct io_wq_work *old_work = work;
work->flags |= IO_WQ_WORK_CANCEL;
work->func(&work);
work = (work == old_work) ? NULL : work;
wq->free_work(old_work);
} while (work);
}
static void io_wqe_insert_work(struct io_wqe *wqe, struct io_wq_work *work)
{
unsigned int hash;
struct io_wq_work *tail;
if (!io_wq_is_hashed(work)) {
append:
wq_list_add_tail(&work->list, &wqe->work_list);
return;
}
hash = io_get_work_hash(work);
tail = wqe->hash_tail[hash];
wqe->hash_tail[hash] = work;
if (!tail)
goto append;
wq_list_add_after(&work->list, &tail->list, &wqe->work_list);
}
static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work)
{
struct io_wqe_acct *acct = io_work_get_acct(wqe, work);
......@@ -771,13 +817,13 @@ static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work)
* It's close enough to not be an issue, fork() has the same delay.
*/
if (unlikely(!io_wq_can_queue(wqe, acct, work))) {
io_run_cancel(work);
io_run_cancel(work, wqe);
return;
}
work_flags = work->flags;
spin_lock_irqsave(&wqe->lock, flags);
wq_list_add_tail(&work->list, &wqe->work_list);
io_wqe_insert_work(wqe, work);
wqe->flags &= ~IO_WQE_FLAG_STALLED;
spin_unlock_irqrestore(&wqe->lock, flags);
......@@ -794,19 +840,15 @@ void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work)
}
/*
* Enqueue work, hashed by some key. Work items that hash to the same value
* will not be done in parallel. Used to limit concurrent writes, generally
* hashed by inode.
* Work items that hash to the same value will not be done in parallel.
* Used to limit concurrent writes, generally hashed by inode.
*/
void io_wq_enqueue_hashed(struct io_wq *wq, struct io_wq_work *work, void *val)
void io_wq_hash_work(struct io_wq_work *work, void *val)
{
struct io_wqe *wqe = wq->wqes[numa_node_id()];
unsigned bit;
unsigned int bit;
bit = hash_ptr(val, IO_WQ_HASH_ORDER);
work->flags |= (IO_WQ_WORK_HASHED | (bit << IO_WQ_HASH_SHIFT));
io_wqe_enqueue(wqe, work);
}
static bool io_wqe_worker_send_sig(struct io_worker *worker, void *data)
......@@ -856,14 +898,13 @@ void io_wq_cancel_all(struct io_wq *wq)
}
struct io_cb_cancel_data {
struct io_wqe *wqe;
work_cancel_fn *cancel;
void *caller_data;
work_cancel_fn *fn;
void *data;
};
static bool io_work_cancel(struct io_worker *worker, void *cancel_data)
static bool io_wq_worker_cancel(struct io_worker *worker, void *data)
{
struct io_cb_cancel_data *data = cancel_data;
struct io_cb_cancel_data *match = data;
unsigned long flags;
bool ret = false;
......@@ -874,83 +915,7 @@ static bool io_work_cancel(struct io_worker *worker, void *cancel_data)
spin_lock_irqsave(&worker->lock, flags);
if (worker->cur_work &&
!(worker->cur_work->flags & IO_WQ_WORK_NO_CANCEL) &&
data->cancel(worker->cur_work, data->caller_data)) {
send_sig(SIGINT, worker->task, 1);
ret = true;
}
spin_unlock_irqrestore(&worker->lock, flags);
return ret;
}
static enum io_wq_cancel io_wqe_cancel_cb_work(struct io_wqe *wqe,
work_cancel_fn *cancel,
void *cancel_data)
{
struct io_cb_cancel_data data = {
.wqe = wqe,
.cancel = cancel,
.caller_data = cancel_data,
};
struct io_wq_work_node *node, *prev;
struct io_wq_work *work;
unsigned long flags;
bool found = false;
spin_lock_irqsave(&wqe->lock, flags);
wq_list_for_each(node, prev, &wqe->work_list) {
work = container_of(node, struct io_wq_work, list);
if (cancel(work, cancel_data)) {
wq_node_del(&wqe->work_list, node, prev);
found = true;
break;
}
}
spin_unlock_irqrestore(&wqe->lock, flags);
if (found) {
io_run_cancel(work);
return IO_WQ_CANCEL_OK;
}
rcu_read_lock();
found = io_wq_for_each_worker(wqe, io_work_cancel, &data);
rcu_read_unlock();
return found ? IO_WQ_CANCEL_RUNNING : IO_WQ_CANCEL_NOTFOUND;
}
enum io_wq_cancel io_wq_cancel_cb(struct io_wq *wq, work_cancel_fn *cancel,
void *data)
{
enum io_wq_cancel ret = IO_WQ_CANCEL_NOTFOUND;
int node;
for_each_node(node) {
struct io_wqe *wqe = wq->wqes[node];
ret = io_wqe_cancel_cb_work(wqe, cancel, data);
if (ret != IO_WQ_CANCEL_NOTFOUND)
break;
}
return ret;
}
struct work_match {
bool (*fn)(struct io_wq_work *, void *data);
void *data;
};
static bool io_wq_worker_cancel(struct io_worker *worker, void *data)
{
struct work_match *match = data;
unsigned long flags;
bool ret = false;
spin_lock_irqsave(&worker->lock, flags);
if (match->fn(worker->cur_work, match->data) &&
!(worker->cur_work->flags & IO_WQ_WORK_NO_CANCEL)) {
match->fn(worker->cur_work, match->data)) {
send_sig(SIGINT, worker->task, 1);
ret = true;
}
......@@ -960,7 +925,7 @@ static bool io_wq_worker_cancel(struct io_worker *worker, void *data)
}
static enum io_wq_cancel io_wqe_cancel_work(struct io_wqe *wqe,
struct work_match *match)
struct io_cb_cancel_data *match)
{
struct io_wq_work_node *node, *prev;
struct io_wq_work *work;
......@@ -977,7 +942,7 @@ static enum io_wq_cancel io_wqe_cancel_work(struct io_wqe *wqe,
work = container_of(node, struct io_wq_work, list);
if (match->fn(work, match->data)) {
wq_node_del(&wqe->work_list, node, prev);
wq_list_del(&wqe->work_list, node, prev);
found = true;
break;
}
......@@ -985,7 +950,7 @@ static enum io_wq_cancel io_wqe_cancel_work(struct io_wqe *wqe,
spin_unlock_irqrestore(&wqe->lock, flags);
if (found) {
io_run_cancel(work);
io_run_cancel(work, wqe);
return IO_WQ_CANCEL_OK;
}
......@@ -1001,22 +966,16 @@ static enum io_wq_cancel io_wqe_cancel_work(struct io_wqe *wqe,
return found ? IO_WQ_CANCEL_RUNNING : IO_WQ_CANCEL_NOTFOUND;
}
static bool io_wq_work_match(struct io_wq_work *work, void *data)
{
return work == data;
}
enum io_wq_cancel io_wq_cancel_work(struct io_wq *wq, struct io_wq_work *cwork)
enum io_wq_cancel io_wq_cancel_cb(struct io_wq *wq, work_cancel_fn *cancel,
void *data)
{
struct work_match match = {
.fn = io_wq_work_match,
.data = cwork
struct io_cb_cancel_data match = {
.fn = cancel,
.data = data,
};
enum io_wq_cancel ret = IO_WQ_CANCEL_NOTFOUND;
int node;
cwork->flags |= IO_WQ_WORK_CANCEL;
for_each_node(node) {
struct io_wqe *wqe = wq->wqes[node];
......@@ -1028,33 +987,28 @@ enum io_wq_cancel io_wq_cancel_work(struct io_wq *wq, struct io_wq_work *cwork)
return ret;
}
static bool io_wq_io_cb_cancel_data(struct io_wq_work *work, void *data)
{
return work == data;
}
enum io_wq_cancel io_wq_cancel_work(struct io_wq *wq, struct io_wq_work *cwork)
{
return io_wq_cancel_cb(wq, io_wq_io_cb_cancel_data, (void *)cwork);
}
static bool io_wq_pid_match(struct io_wq_work *work, void *data)
{
pid_t pid = (pid_t) (unsigned long) data;
if (work)
return work->task_pid == pid;
return false;
return work->task_pid == pid;
}
enum io_wq_cancel io_wq_cancel_pid(struct io_wq *wq, pid_t pid)
{
struct work_match match = {
.fn = io_wq_pid_match,
.data = (void *) (unsigned long) pid
};
enum io_wq_cancel ret = IO_WQ_CANCEL_NOTFOUND;
int node;
for_each_node(node) {
struct io_wqe *wqe = wq->wqes[node];
void *data = (void *) (unsigned long) pid;
ret = io_wqe_cancel_work(wqe, &match);
if (ret != IO_WQ_CANCEL_NOTFOUND)
break;
}
return ret;
return io_wq_cancel_cb(wq, io_wq_pid_match, data);
}
struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
......@@ -1062,6 +1016,9 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
int ret = -ENOMEM, node;
struct io_wq *wq;
if (WARN_ON_ONCE(!data->free_work))
return ERR_PTR(-EINVAL);
wq = kzalloc(sizeof(*wq), GFP_KERNEL);
if (!wq)
return ERR_PTR(-ENOMEM);
......@@ -1072,8 +1029,7 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
return ERR_PTR(-ENOMEM);
}
wq->get_work = data->get_work;
wq->put_work = data->put_work;
wq->free_work = data->free_work;
/* caller must already hold a reference to this */
wq->user = data->user;
......@@ -1130,7 +1086,7 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
bool io_wq_get(struct io_wq *wq, struct io_wq_data *data)
{
if (data->get_work != wq->get_work || data->put_work != wq->put_work)
if (data->free_work != wq->free_work)
return false;
return refcount_inc_not_zero(&wq->use_refs);
......
......@@ -5,10 +5,8 @@ struct io_wq;
enum {
IO_WQ_WORK_CANCEL = 1,
IO_WQ_WORK_HAS_MM = 2,
IO_WQ_WORK_HASHED = 4,
IO_WQ_WORK_UNBOUND = 32,
IO_WQ_WORK_CB = 128,
IO_WQ_WORK_NO_CANCEL = 256,
IO_WQ_WORK_CONCURRENT = 512,
......@@ -30,6 +28,18 @@ struct io_wq_work_list {
struct io_wq_work_node *last;
};
static inline void wq_list_add_after(struct io_wq_work_node *node,
struct io_wq_work_node *pos,
struct io_wq_work_list *list)
{
struct io_wq_work_node *next = pos->next;
pos->next = node;
node->next = next;
if (!next)
list->last = node;
}
static inline void wq_list_add_tail(struct io_wq_work_node *node,
struct io_wq_work_list *list)
{
......@@ -42,17 +52,26 @@ static inline void wq_list_add_tail(struct io_wq_work_node *node,
}
}
static inline void wq_node_del(struct io_wq_work_list *list,
struct io_wq_work_node *node,
static inline void wq_list_cut(struct io_wq_work_list *list,
struct io_wq_work_node *last,
struct io_wq_work_node *prev)
{
if (node == list->first)
WRITE_ONCE(list->first, node->next);
if (node == list->last)
/* first in the list, if prev==NULL */
if (!prev)
WRITE_ONCE(list->first, last->next);
else
prev->next = last->next;
if (last == list->last)
list->last = prev;
if (prev)
prev->next = node->next;
node->next = NULL;
last->next = NULL;
}
static inline void wq_list_del(struct io_wq_work_list *list,
struct io_wq_work_node *node,
struct io_wq_work_node *prev)
{
wq_list_cut(list, node, prev);
}
#define wq_list_for_each(pos, prv, head) \
......@@ -65,10 +84,7 @@ static inline void wq_node_del(struct io_wq_work_list *list,
} while (0)
struct io_wq_work {
union {
struct io_wq_work_node list;
void *data;
};
struct io_wq_work_node list;
void (*func)(struct io_wq_work **);
struct files_struct *files;
struct mm_struct *mm;
......@@ -83,14 +99,20 @@ struct io_wq_work {
*(work) = (struct io_wq_work){ .func = _func }; \
} while (0) \
typedef void (get_work_fn)(struct io_wq_work *);
typedef void (put_work_fn)(struct io_wq_work *);
static inline struct io_wq_work *wq_next_work(struct io_wq_work *work)
{
if (!work->list.next)
return NULL;
return container_of(work->list.next, struct io_wq_work, list);
}
typedef void (free_work_fn)(struct io_wq_work *);
struct io_wq_data {
struct user_struct *user;
get_work_fn *get_work;
put_work_fn *put_work;
free_work_fn *free_work;
};
struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data);
......@@ -98,7 +120,12 @@ bool io_wq_get(struct io_wq *wq, struct io_wq_data *data);
void io_wq_destroy(struct io_wq *wq);
void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work);
void io_wq_enqueue_hashed(struct io_wq *wq, struct io_wq_work *work, void *val);
void io_wq_hash_work(struct io_wq_work *work, void *val);
static inline bool io_wq_is_hashed(struct io_wq_work *work)
{
return work->flags & IO_WQ_WORK_HASHED;
}
void io_wq_cancel_all(struct io_wq *wq);
enum io_wq_cancel io_wq_cancel_work(struct io_wq *wq, struct io_wq_work *cwork);
......
此差异已折叠。
......@@ -1109,9 +1109,9 @@ static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe,
/*
* Determine where to splice to/from.
*/
static long do_splice(struct file *in, loff_t __user *off_in,
struct file *out, loff_t __user *off_out,
size_t len, unsigned int flags)
long do_splice(struct file *in, loff_t __user *off_in,
struct file *out, loff_t __user *off_out,
size_t len, unsigned int flags)
{
struct pipe_inode_info *ipipe;
struct pipe_inode_info *opipe;
......
......@@ -391,6 +391,10 @@ extern int recvmsg_copy_msghdr(struct msghdr *msg,
struct user_msghdr __user *umsg, unsigned flags,
struct sockaddr __user **uaddr,
struct iovec **iov);
extern int __copy_msghdr_from_user(struct msghdr *kmsg,
struct user_msghdr __user *umsg,
struct sockaddr __user **save_addr,
struct iovec __user **uiov, size_t *nsegs);
/* helpers which do the actual work for syscalls */
extern int __sys_recvfrom(int fd, void __user *ubuf, size_t size,
......
......@@ -78,6 +78,9 @@ extern ssize_t add_to_pipe(struct pipe_inode_info *,
struct pipe_buffer *);
extern ssize_t splice_direct_to_actor(struct file *, struct splice_desc *,
splice_direct_actor *);
extern long do_splice(struct file *in, loff_t __user *off_in,
struct file *out, loff_t __user *off_out,
size_t len, unsigned int flags);
/*
* for dynamic pipe sizing
......
......@@ -38,6 +38,9 @@ struct compat_cmsghdr {
#define compat_mmsghdr mmsghdr
#endif /* defined(CONFIG_COMPAT) */
int __get_compat_msghdr(struct msghdr *kmsg, struct compat_msghdr __user *umsg,
struct sockaddr __user **save_addr, compat_uptr_t *ptr,
compat_size_t *len);
int get_compat_msghdr(struct msghdr *, struct compat_msghdr __user *,
struct sockaddr __user **, struct iovec **);
struct sock_fprog __user *get_compat_bpf_fprog(char __user *optval);
......
......@@ -357,6 +357,109 @@ TRACE_EVENT(io_uring_submit_sqe,
__entry->force_nonblock, __entry->sq_thread)
);
TRACE_EVENT(io_uring_poll_arm,
TP_PROTO(void *ctx, u8 opcode, u64 user_data, int mask, int events),
TP_ARGS(ctx, opcode, user_data, mask, events),
TP_STRUCT__entry (
__field( void *, ctx )
__field( u8, opcode )
__field( u64, user_data )
__field( int, mask )
__field( int, events )
),
TP_fast_assign(
__entry->ctx = ctx;
__entry->opcode = opcode;
__entry->user_data = user_data;
__entry->mask = mask;
__entry->events = events;
),
TP_printk("ring %p, op %d, data 0x%llx, mask 0x%x, events 0x%x",
__entry->ctx, __entry->opcode,
(unsigned long long) __entry->user_data,
__entry->mask, __entry->events)
);
TRACE_EVENT(io_uring_poll_wake,
TP_PROTO(void *ctx, u8 opcode, u64 user_data, int mask),
TP_ARGS(ctx, opcode, user_data, mask),
TP_STRUCT__entry (
__field( void *, ctx )
__field( u8, opcode )
__field( u64, user_data )
__field( int, mask )
),
TP_fast_assign(
__entry->ctx = ctx;
__entry->opcode = opcode;
__entry->user_data = user_data;
__entry->mask = mask;
),
TP_printk("ring %p, op %d, data 0x%llx, mask 0x%x",
__entry->ctx, __entry->opcode,
(unsigned long long) __entry->user_data,
__entry->mask)
);
TRACE_EVENT(io_uring_task_add,
TP_PROTO(void *ctx, u8 opcode, u64 user_data, int mask),
TP_ARGS(ctx, opcode, user_data, mask),
TP_STRUCT__entry (
__field( void *, ctx )
__field( u8, opcode )
__field( u64, user_data )
__field( int, mask )
),
TP_fast_assign(
__entry->ctx = ctx;
__entry->opcode = opcode;
__entry->user_data = user_data;
__entry->mask = mask;
),
TP_printk("ring %p, op %d, data 0x%llx, mask %x",
__entry->ctx, __entry->opcode,
(unsigned long long) __entry->user_data,
__entry->mask)
);
TRACE_EVENT(io_uring_task_run,
TP_PROTO(void *ctx, u8 opcode, u64 user_data),
TP_ARGS(ctx, opcode, user_data),
TP_STRUCT__entry (
__field( void *, ctx )
__field( u8, opcode )
__field( u64, user_data )
),
TP_fast_assign(
__entry->ctx = ctx;
__entry->opcode = opcode;
__entry->user_data = user_data;
),
TP_printk("ring %p, op %d, data 0x%llx",
__entry->ctx, __entry->opcode,
(unsigned long long) __entry->user_data)
);
#endif /* _TRACE_IO_URING_H */
/* This part must be outside protection */
......
/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
/* SPDX-License-Identifier: (GPL-2.0 WITH Linux-syscall-note) OR MIT */
/*
* Header file for the io_uring interface.
*
......@@ -23,7 +23,10 @@ struct io_uring_sqe {
__u64 off; /* offset into file */
__u64 addr2;
};
__u64 addr; /* pointer to buffer or iovecs */
union {
__u64 addr; /* pointer to buffer or iovecs */
__u64 splice_off_in;
};
__u32 len; /* buffer size or number of iovecs */
union {
__kernel_rwf_t rw_flags;
......@@ -37,14 +40,21 @@ struct io_uring_sqe {
__u32 open_flags;
__u32 statx_flags;
__u32 fadvise_advice;
__u32 splice_flags;
};
__u64 user_data; /* data to be passed back at completion time */
union {
struct {
/* index into fixed buffers, if used */
__u16 buf_index;
/* pack this to avoid bogus arm OABI complaints */
union {
/* index into fixed buffers, if used */
__u16 buf_index;
/* for grouped buffer selection */
__u16 buf_group;
} __attribute__((packed));
/* personality to use, if used */
__u16 personality;
__s32 splice_fd_in;
};
__u64 __pad2[3];
};
......@@ -56,6 +66,7 @@ enum {
IOSQE_IO_LINK_BIT,
IOSQE_IO_HARDLINK_BIT,
IOSQE_ASYNC_BIT,
IOSQE_BUFFER_SELECT_BIT,
};
/*
......@@ -71,6 +82,8 @@ enum {
#define IOSQE_IO_HARDLINK (1U << IOSQE_IO_HARDLINK_BIT)
/* always go async */
#define IOSQE_ASYNC (1U << IOSQE_ASYNC_BIT)
/* select buffer from sqe->buf_group */
#define IOSQE_BUFFER_SELECT (1U << IOSQE_BUFFER_SELECT_BIT)
/*
* io_uring_setup() flags
......@@ -113,6 +126,9 @@ enum {
IORING_OP_RECV,
IORING_OP_OPENAT2,
IORING_OP_EPOLL_CTL,
IORING_OP_SPLICE,
IORING_OP_PROVIDE_BUFFERS,
IORING_OP_REMOVE_BUFFERS,
/* this goes last, obviously */
IORING_OP_LAST,
......@@ -128,6 +144,12 @@ enum {
*/
#define IORING_TIMEOUT_ABS (1U << 0)
/*
* sqe->splice_flags
* extends splice(2) flags
*/
#define SPLICE_F_FD_IN_FIXED (1U << 31) /* the last bit of __u32 */
/*
* IO completion data structure (Completion Queue Entry)
*/
......@@ -137,6 +159,17 @@ struct io_uring_cqe {
__u32 flags;
};
/*
* cqe->flags
*
* IORING_CQE_F_BUFFER If set, the upper 16 bits are the buffer ID
*/
#define IORING_CQE_F_BUFFER (1U << 0)
enum {
IORING_CQE_BUFFER_SHIFT = 16,
};
/*
* Magic offsets for the application to mmap the data it needs
*/
......@@ -204,6 +237,7 @@ struct io_uring_params {
#define IORING_FEAT_SUBMIT_STABLE (1U << 2)
#define IORING_FEAT_RW_CUR_POS (1U << 3)
#define IORING_FEAT_CUR_PERSONALITY (1U << 4)
#define IORING_FEAT_FAST_POLL (1U << 5)
/*
* io_uring_register(2) opcodes and arguments
......
......@@ -97,16 +97,26 @@ void task_work_run(void)
* work->func() can do task_work_add(), do not set
* work_exited unless the list is empty.
*/
raw_spin_lock_irq(&task->pi_lock);
do {
head = NULL;
work = READ_ONCE(task->task_works);
head = !work && (task->flags & PF_EXITING) ?
&work_exited : NULL;
if (!work) {
if (task->flags & PF_EXITING)
head = &work_exited;
else
break;
}
} while (cmpxchg(&task->task_works, work, head) != work);
raw_spin_unlock_irq(&task->pi_lock);
if (!work)
break;
/*
* Synchronize with task_work_cancel(). It can not remove
* the first entry == work, cmpxchg(task_works) must fail.
* But it can remove another entry from the ->next list.
*/
raw_spin_lock_irq(&task->pi_lock);
raw_spin_unlock_irq(&task->pi_lock);
do {
next = work->next;
......
......@@ -33,10 +33,10 @@
#include <linux/uaccess.h>
#include <net/compat.h>
int get_compat_msghdr(struct msghdr *kmsg,
struct compat_msghdr __user *umsg,
struct sockaddr __user **save_addr,
struct iovec **iov)
int __get_compat_msghdr(struct msghdr *kmsg,
struct compat_msghdr __user *umsg,
struct sockaddr __user **save_addr,
compat_uptr_t *ptr, compat_size_t *len)
{
struct compat_msghdr msg;
ssize_t err;
......@@ -79,10 +79,26 @@ int get_compat_msghdr(struct msghdr *kmsg,
return -EMSGSIZE;
kmsg->msg_iocb = NULL;
*ptr = msg.msg_iov;
*len = msg.msg_iovlen;
return 0;
}
int get_compat_msghdr(struct msghdr *kmsg,
struct compat_msghdr __user *umsg,
struct sockaddr __user **save_addr,
struct iovec **iov)
{
compat_uptr_t ptr;
compat_size_t len;
ssize_t err;
err = __get_compat_msghdr(kmsg, umsg, save_addr, &ptr, &len);
if (err)
return err;
err = compat_import_iovec(save_addr ? READ : WRITE,
compat_ptr(msg.msg_iov), msg.msg_iovlen,
UIO_FASTIOV, iov, &kmsg->msg_iter);
err = compat_import_iovec(save_addr ? READ : WRITE, compat_ptr(ptr),
len, UIO_FASTIOV, iov, &kmsg->msg_iter);
return err < 0 ? err : 0;
}
......
......@@ -2228,10 +2228,10 @@ struct used_address {
unsigned int name_len;
};
static int copy_msghdr_from_user(struct msghdr *kmsg,
struct user_msghdr __user *umsg,
struct sockaddr __user **save_addr,
struct iovec **iov)
int __copy_msghdr_from_user(struct msghdr *kmsg,
struct user_msghdr __user *umsg,
struct sockaddr __user **save_addr,
struct iovec __user **uiov, size_t *nsegs)
{
struct user_msghdr msg;
ssize_t err;
......@@ -2273,6 +2273,23 @@ static int copy_msghdr_from_user(struct msghdr *kmsg,
return -EMSGSIZE;
kmsg->msg_iocb = NULL;
*uiov = msg.msg_iov;
*nsegs = msg.msg_iovlen;
return 0;
}
static int copy_msghdr_from_user(struct msghdr *kmsg,
struct user_msghdr __user *umsg,
struct sockaddr __user **save_addr,
struct iovec **iov)
{
struct user_msghdr msg;
ssize_t err;
err = __copy_msghdr_from_user(kmsg, umsg, save_addr, &msg.msg_iov,
&msg.msg_iovlen);
if (err)
return err;
err = import_iovec(save_addr ? READ : WRITE,
msg.msg_iov, msg.msg_iovlen,
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册