/* SPDX-License-Identifier: (GPL-2.0 WITH Linux-syscall-note) OR MIT */
/*
 * Header file for the io_uring interface.
 *
 * Copyright (C) 2019 Jens Axboe
 * Copyright (C) 2019 Christoph Hellwig
 */
#ifndef LINUX_IO_URING_H
#define LINUX_IO_URING_H

#include <linux/fs.h>
#include <linux/types.h>

/*
 * IO submission data structure (Submission Queue Entry)
 */
struct io_uring_sqe {
	__u8	opcode;		/* type of operation for this sqe */
	__u8	flags;		/* IOSQE_ flags */
	__u16	ioprio;		/* ioprio for the request */
	__s32	fd;		/* file descriptor to do IO on */
	union {
		__u64	off;	/* offset into file */
		__u64	addr2;
		__u32	cmd_op;
	};
	union {
		__u64	addr;	/* pointer to buffer or iovecs */
		__u64	splice_off_in;
	};
	__u32	len;		/* buffer size or number of iovecs */
	union {
		__kernel_rwf_t	rw_flags;
		__u32		fsync_flags;
		__u16		poll_events;	/* compatibility */
		__u32		poll32_events;	/* word-reversed for BE */
		__u32		sync_range_flags;
		__u32		msg_flags;
		__u32		timeout_flags;
		__u32		accept_flags;
		__u32		cancel_flags;
		__u32		open_flags;
		__u32		statx_flags;
		__u32		fadvise_advice;
		__u32		splice_flags;
		__u32		rename_flags;
		__u32		unlink_flags;
		__u32		hardlink_flags;
		__u32		xattr_flags;
		__u32		close_flags;
	};
	__u64	user_data;	/* data to be passed back at completion time */
	/* pack this to avoid bogus arm OABI complaints */
	union {
		/* index into fixed buffers, if used */
		__u16	buf_index;
		/* for grouped buffer selection */
		__u16	buf_group;
	} __attribute__((packed));
	/* personality to use, if used */
	__u16	personality;
	union {
		__s32	splice_fd_in;
		__u32	file_index;
	};
	union {
		struct {
			__u64	addr3;
			__u64	__pad2[1];
		};
		/*
		 * If the ring is initialized with IORING_SETUP_SQE128, then
		 * this field is used for 80 bytes of arbitrary command data
		 */
		__u8	cmd[0];
	};
};
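
/*
 * A minimal usage sketch, not part of the UAPI: filling in an SQE for a
 * plain read. The helper name and user_data value are invented for the
 * example; "sqe" is assumed to be the next free slot in the SQ ring.
 * The #if 0 guard keeps the sketch out of compilation.
 */
#if 0
static void example_prep_read(struct io_uring_sqe *sqe, int fd, void *buf,
			      unsigned int nbytes, __u64 offset)
{
	memset(sqe, 0, sizeof(*sqe));	/* assumes <string.h> */
	sqe->opcode = IORING_OP_READ;
	sqe->fd = fd;
	sqe->off = offset;
	sqe->addr = (unsigned long) buf;
	sqe->len = nbytes;
	sqe->user_data = 0x42;	/* echoed back in cqe->user_data */
}
#endif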

/*
 * If sqe->file_index is set to this for opcodes that instantiate a new
 * direct descriptor (like openat/openat2/accept), then io_uring will allocate
 * an available direct descriptor instead of having the application pass one
 * in. The picked direct descriptor will be returned in cqe->res, or -ENFILE
 * if the space is full.
 */
#define IORING_FILE_INDEX_ALLOC		(~0U)
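
/*
 * A minimal usage sketch, not part of the UAPI (example only, compiled
 * out): an accept request that lets the kernel pick the direct descriptor
 * slot. The helper name is invented.
 */
#if 0
static void example_prep_accept_direct(struct io_uring_sqe *sqe, int listen_fd)
{
	memset(sqe, 0, sizeof(*sqe));
	sqe->opcode = IORING_OP_ACCEPT;
	sqe->fd = listen_fd;
	/* allocated slot comes back in cqe->res, or -ENFILE if full */
	sqe->file_index = IORING_FILE_INDEX_ALLOC;
}
#endif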

enum {
	IOSQE_FIXED_FILE_BIT,
	IOSQE_IO_DRAIN_BIT,
	IOSQE_IO_LINK_BIT,
	IOSQE_IO_HARDLINK_BIT,
	IOSQE_ASYNC_BIT,
	IOSQE_BUFFER_SELECT_BIT,
	IOSQE_CQE_SKIP_SUCCESS_BIT,
};

/*
 * sqe->flags
 */
/* use fixed fileset */
#define IOSQE_FIXED_FILE	(1U << IOSQE_FIXED_FILE_BIT)
/* issue after inflight IO */
#define IOSQE_IO_DRAIN		(1U << IOSQE_IO_DRAIN_BIT)
/* links next sqe */
#define IOSQE_IO_LINK		(1U << IOSQE_IO_LINK_BIT)
/* like LINK, but stronger */
#define IOSQE_IO_HARDLINK	(1U << IOSQE_IO_HARDLINK_BIT)
/* always go async */
#define IOSQE_ASYNC		(1U << IOSQE_ASYNC_BIT)
/* select buffer from sqe->buf_group */
#define IOSQE_BUFFER_SELECT	(1U << IOSQE_BUFFER_SELECT_BIT)
/* don't post CQE if request succeeded */
#define IOSQE_CQE_SKIP_SUCCESS	(1U << IOSQE_CQE_SKIP_SUCCESS_BIT)
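
/*
 * A minimal usage sketch, not part of the UAPI (example only, compiled
 * out): chaining a write to an fsync with IOSQE_IO_LINK, so the fsync is
 * only issued once the write completes. "write_sqe" and "fsync_sqe" are
 * invented names for two consecutive, already-prepared SQ ring slots.
 */
#if 0
	/* the link flag ties this SQE to the *next* one in the SQ ring */
	write_sqe->flags |= IOSQE_IO_LINK;

	fsync_sqe->opcode = IORING_OP_FSYNC;
	fsync_sqe->fd = write_sqe->fd;
#endif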

/*
 * io_uring_setup() flags
 */
#define IORING_SETUP_IOPOLL	(1U << 0)	/* io_context is polled */
#define IORING_SETUP_SQPOLL	(1U << 1)	/* SQ poll thread */
#define IORING_SETUP_SQ_AFF	(1U << 2)	/* sq_thread_cpu is valid */
#define IORING_SETUP_CQSIZE	(1U << 3)	/* app defines CQ size */
#define IORING_SETUP_CLAMP	(1U << 4)	/* clamp SQ/CQ ring sizes */
#define IORING_SETUP_ATTACH_WQ	(1U << 5)	/* attach to existing wq */
#define IORING_SETUP_R_DISABLED	(1U << 6)	/* start with ring disabled */
#define IORING_SETUP_SUBMIT_ALL	(1U << 7)	/* continue submit on error */
/*
 * Cooperative task running. When requests complete, they often require
 * forcing the submitter to transition to the kernel to complete. If this
 * flag is set, work will be done when the task transitions anyway, rather
 * than force an inter-processor interrupt reschedule. This avoids interrupting
 * a task running in userspace, and saves an IPI.
 */
#define IORING_SETUP_COOP_TASKRUN	(1U << 8)
/*
 * If COOP_TASKRUN is set, get notified if task work is available for
 * running and a kernel transition would be needed to run it. This sets
 * IORING_SQ_TASKRUN in the sq ring flags. Only valid with COOP_TASKRUN.
 */
#define IORING_SETUP_TASKRUN_FLAG	(1U << 9)

#define IORING_SETUP_SQE128		(1U << 10) /* SQEs are 128 byte */
#define IORING_SETUP_CQE32		(1U << 11) /* CQEs are 32 byte */
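
/*
 * A minimal usage sketch, not part of the UAPI (example only, compiled
 * out): combining setup flags in struct io_uring_params (defined further
 * down) before calling io_uring_setup(2).
 */
#if 0
struct io_uring_params p = {
	/* kernel-side SQ polling plus an application-sized CQ ring */
	.flags		= IORING_SETUP_SQPOLL | IORING_SETUP_CQSIZE,
	.sq_thread_idle	= 2000,	/* ms of idle before the SQ thread sleeps */
	.cq_entries	= 4096,	/* honored because CQSIZE is set */
};
#endif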

enum io_uring_op {
	IORING_OP_NOP,
	IORING_OP_READV,
	IORING_OP_WRITEV,
	IORING_OP_FSYNC,
	IORING_OP_READ_FIXED,
	IORING_OP_WRITE_FIXED,
	IORING_OP_POLL_ADD,
	IORING_OP_POLL_REMOVE,
	IORING_OP_SYNC_FILE_RANGE,
	IORING_OP_SENDMSG,
	IORING_OP_RECVMSG,
	IORING_OP_TIMEOUT,
	IORING_OP_TIMEOUT_REMOVE,
	IORING_OP_ACCEPT,
	IORING_OP_ASYNC_CANCEL,
	IORING_OP_LINK_TIMEOUT,
	IORING_OP_CONNECT,
	IORING_OP_FALLOCATE,
	IORING_OP_OPENAT,
	IORING_OP_CLOSE,
	IORING_OP_FILES_UPDATE,
	IORING_OP_STATX,
	IORING_OP_READ,
	IORING_OP_WRITE,
	IORING_OP_FADVISE,
	IORING_OP_MADVISE,
	IORING_OP_SEND,
	IORING_OP_RECV,
	IORING_OP_OPENAT2,
	IORING_OP_EPOLL_CTL,
	IORING_OP_SPLICE,
	IORING_OP_PROVIDE_BUFFERS,
	IORING_OP_REMOVE_BUFFERS,
	IORING_OP_TEE,
	IORING_OP_SHUTDOWN,
	IORING_OP_RENAMEAT,
	IORING_OP_UNLINKAT,
	IORING_OP_MKDIRAT,
	IORING_OP_SYMLINKAT,
	IORING_OP_LINKAT,
	IORING_OP_MSG_RING,
	IORING_OP_FSETXATTR,
	IORING_OP_SETXATTR,
	IORING_OP_FGETXATTR,
	IORING_OP_GETXATTR,
	IORING_OP_SOCKET,
	IORING_OP_URING_CMD,

	/* this goes last, obviously */
	IORING_OP_LAST,
};

/*
 * sqe->fsync_flags
 */
#define IORING_FSYNC_DATASYNC	(1U << 0)

/*
 * sqe->timeout_flags
 */
#define IORING_TIMEOUT_ABS		(1U << 0)
#define IORING_TIMEOUT_UPDATE		(1U << 1)
#define IORING_TIMEOUT_BOOTTIME		(1U << 2)
#define IORING_TIMEOUT_REALTIME		(1U << 3)
#define IORING_LINK_TIMEOUT_UPDATE	(1U << 4)
#define IORING_TIMEOUT_ETIME_SUCCESS	(1U << 5)
#define IORING_TIMEOUT_CLOCK_MASK	(IORING_TIMEOUT_BOOTTIME | IORING_TIMEOUT_REALTIME)
#define IORING_TIMEOUT_UPDATE_MASK	(IORING_TIMEOUT_UPDATE | IORING_LINK_TIMEOUT_UPDATE)
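
/*
 * A minimal usage sketch, not part of the UAPI (example only, compiled
 * out): arming an absolute-time timeout. sqe->addr points at a single
 * struct __kernel_timespec and sqe->len is the timespec count (always 1).
 * The helper name is invented.
 */
#if 0
static void example_prep_timeout_abs(struct io_uring_sqe *sqe,
				     struct __kernel_timespec *ts)
{
	memset(sqe, 0, sizeof(*sqe));
	sqe->opcode = IORING_OP_TIMEOUT;
	sqe->fd = -1;
	sqe->addr = (unsigned long) ts;
	sqe->len = 1;
	sqe->timeout_flags = IORING_TIMEOUT_ABS;
}
#endif
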
/*
 * sqe->splice_flags
 * extends splice(2) flags
 */
#define SPLICE_F_FD_IN_FIXED	(1U << 31) /* the last bit of __u32 */

/*
 * POLL_ADD flags. Note that since sqe->poll_events is the flag space, the
 * command flags for POLL_ADD are stored in sqe->len.
 *
 * IORING_POLL_ADD_MULTI	Multishot poll. Sets IORING_CQE_F_MORE if
 *				the poll handler will continue to report
 *				CQEs on behalf of the same SQE.
 *
 * IORING_POLL_UPDATE		Update existing poll request, matching
 *				sqe->addr as the old user_data field.
 */
#define IORING_POLL_ADD_MULTI	(1U << 0)
#define IORING_POLL_UPDATE_EVENTS	(1U << 1)
#define IORING_POLL_UPDATE_USER_DATA	(1U << 2)
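
/*
 * A minimal usage sketch, not part of the UAPI (example only, compiled
 * out): a multishot poll request. Per the comment above, the POLL_ADD
 * command flags live in sqe->len, while the poll mask itself goes in
 * sqe->poll32_events. POLLIN assumes <poll.h>; the helper name is invented.
 */
#if 0
static void example_prep_poll_multishot(struct io_uring_sqe *sqe, int fd)
{
	memset(sqe, 0, sizeof(*sqe));
	sqe->opcode = IORING_OP_POLL_ADD;
	sqe->fd = fd;
	sqe->poll32_events = POLLIN;
	sqe->len = IORING_POLL_ADD_MULTI;
	/* completions carry IORING_CQE_F_MORE while the poll stays armed */
}
#endif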

/*
 * ASYNC_CANCEL flags.
 *
 * IORING_ASYNC_CANCEL_ALL	Cancel all requests that match the given key
 * IORING_ASYNC_CANCEL_FD	Key off 'fd' for cancelation rather than the
 *				request 'user_data'
 * IORING_ASYNC_CANCEL_ANY	Match any request
 */
#define IORING_ASYNC_CANCEL_ALL	(1U << 0)
#define IORING_ASYNC_CANCEL_FD	(1U << 1)
#define IORING_ASYNC_CANCEL_ANY	(1U << 2)
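
/*
 * A minimal usage sketch, not part of the UAPI (example only, compiled
 * out): canceling all pending requests that target one file descriptor.
 * The helper name is invented.
 */
#if 0
static void example_prep_cancel_fd(struct io_uring_sqe *sqe, int victim_fd)
{
	memset(sqe, 0, sizeof(*sqe));
	sqe->opcode = IORING_OP_ASYNC_CANCEL;
	sqe->fd = victim_fd;
	sqe->cancel_flags = IORING_ASYNC_CANCEL_FD | IORING_ASYNC_CANCEL_ALL;
}
#endif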

/*
 * send/sendmsg and recv/recvmsg flags (sqe->addr2)
 *
 * IORING_RECVSEND_POLL_FIRST	If set, instead of first attempting to send
 *				or receive and arm poll if that yields an
 *				-EAGAIN result, arm poll upfront and skip
 *				the initial transfer attempt.
 */
#define IORING_RECVSEND_POLL_FIRST	(1U << 0)

/*
 * accept flags stored in sqe->ioprio
 */
#define IORING_ACCEPT_MULTISHOT	(1U << 0)

/*
 * close flags, stored in sqe->close_flags
 */
#define IORING_CLOSE_FD_AND_FILE_SLOT	(1U << 0)

/*
 * IO completion data structure (Completion Queue Entry)
 */
struct io_uring_cqe {
	__u64	user_data;	/* sqe->data submission passed back */
	__s32	res;		/* result code for this event */
	__u32	flags;

	/*
	 * If the ring is initialized with IORING_SETUP_CQE32, then this field
	 * contains 16-bytes of padding, doubling the size of the CQE.
	 */
	__u64 big_cqe[];
};

/*
 * cqe->flags
 *
 * IORING_CQE_F_BUFFER	If set, the upper 16 bits are the buffer ID
 * IORING_CQE_F_MORE	If set, parent SQE will generate more CQE entries
 * IORING_CQE_F_SOCK_NONEMPTY	If set, more data to read after socket recv
 */
#define IORING_CQE_F_BUFFER		(1U << 0)
#define IORING_CQE_F_MORE		(1U << 1)
#define IORING_CQE_F_SOCK_NONEMPTY	(1U << 2)

enum {
	IORING_CQE_BUFFER_SHIFT		= 16,
};
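
/*
 * A minimal usage sketch, not part of the UAPI (example only, compiled
 * out): recovering the provided-buffer ID from a completion when
 * IOSQE_BUFFER_SELECT was used. The helper name is invented.
 */
#if 0
static int example_cqe_buffer_id(const struct io_uring_cqe *cqe)
{
	if (!(cqe->flags & IORING_CQE_F_BUFFER))
		return -1;	/* no buffer was selected */
	return cqe->flags >> IORING_CQE_BUFFER_SHIFT;
}
#endif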

/*
 * Magic offsets for the application to mmap the data it needs
 */
#define IORING_OFF_SQ_RING		0ULL
#define IORING_OFF_CQ_RING		0x8000000ULL
#define IORING_OFF_SQES			0x10000000ULL

/*
 * Filled with the offset for mmap(2)
 */
struct io_sqring_offsets {
	__u32 head;
	__u32 tail;
	__u32 ring_mask;
	__u32 ring_entries;
	__u32 flags;
	__u32 dropped;
	__u32 array;
	__u32 resv1;
	__u64 resv2;
};
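
/*
 * A minimal usage sketch, not part of the UAPI (example only, compiled
 * out): mapping the SQ ring with the offsets io_uring_setup(2) filled in.
 * Assumes <sys/mman.h>, a populated struct io_uring_params "p", and a ring
 * fd "ring_fd"; both names are invented.
 */
#if 0
size_t sq_size = p.sq_off.array + p.sq_entries * sizeof(__u32);
void *sq_ptr = mmap(NULL, sq_size, PROT_READ | PROT_WRITE,
		    MAP_SHARED | MAP_POPULATE, ring_fd, IORING_OFF_SQ_RING);
unsigned int *sq_head = (unsigned int *)((char *)sq_ptr + p.sq_off.head);
unsigned int *sq_tail = (unsigned int *)((char *)sq_ptr + p.sq_off.tail);
#endif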

/*
 * sq_ring->flags
 */
#define IORING_SQ_NEED_WAKEUP	(1U << 0) /* needs io_uring_enter wakeup */
#define IORING_SQ_CQ_OVERFLOW	(1U << 1) /* CQ ring overflowed */
#define IORING_SQ_TASKRUN	(1U << 2) /* task should enter the kernel */
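
/*
 * A minimal usage sketch, not part of the UAPI (example only, compiled
 * out): with IORING_SETUP_SQPOLL, io_uring_enter(2) is only needed once
 * the SQ thread has gone idle. "sq_flags" and "ring_fd" are invented
 * names; a volatile read stands in for the proper atomic acquire load,
 * and syscall(2) assumes <unistd.h> and <sys/syscall.h>.
 */
#if 0
if (*(volatile __u32 *)sq_flags & IORING_SQ_NEED_WAKEUP)
	syscall(__NR_io_uring_enter, ring_fd, 0, 0,
		IORING_ENTER_SQ_WAKEUP, NULL, 0);
#endif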

struct io_cqring_offsets {
	__u32 head;
	__u32 tail;
	__u32 ring_mask;
	__u32 ring_entries;
	__u32 overflow;
	__u32 cqes;
	__u32 flags;
	__u32 resv1;
	__u64 resv2;
};

/*
 * cq_ring->flags
 */

/* disable eventfd notifications */
#define IORING_CQ_EVENTFD_DISABLED	(1U << 0)

/*
 * io_uring_enter(2) flags
 */
#define IORING_ENTER_GETEVENTS		(1U << 0)
#define IORING_ENTER_SQ_WAKEUP		(1U << 1)
#define IORING_ENTER_SQ_WAIT		(1U << 2)
#define IORING_ENTER_EXT_ARG		(1U << 3)
#define IORING_ENTER_REGISTERED_RING	(1U << 4)

/*
 * Passed in for io_uring_setup(2). Copied back with updated info on success
 */
struct io_uring_params {
	__u32 sq_entries;
	__u32 cq_entries;
	__u32 flags;
	__u32 sq_thread_cpu;
	__u32 sq_thread_idle;
	__u32 features;
	__u32 wq_fd;
	__u32 resv[3];
	struct io_sqring_offsets sq_off;
	struct io_cqring_offsets cq_off;
};
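
/*
 * A minimal usage sketch, not part of the UAPI (example only, compiled
 * out): creating a ring. glibc has no wrapper for io_uring_setup(2), so
 * syscall(2) is used directly; assumes <unistd.h> and <sys/syscall.h>.
 */
#if 0
struct io_uring_params p;

memset(&p, 0, sizeof(p));
int ring_fd = syscall(__NR_io_uring_setup, 256 /* SQ entries */, &p);
/* on success, p.sq_off and p.cq_off describe the rings to mmap */
#endif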

/*
 * io_uring_params->features flags
 */
#define IORING_FEAT_SINGLE_MMAP		(1U << 0)
#define IORING_FEAT_NODROP		(1U << 1)
#define IORING_FEAT_SUBMIT_STABLE	(1U << 2)
#define IORING_FEAT_RW_CUR_POS		(1U << 3)
#define IORING_FEAT_CUR_PERSONALITY	(1U << 4)
#define IORING_FEAT_FAST_POLL		(1U << 5)
#define IORING_FEAT_POLL_32BITS 	(1U << 6)
#define IORING_FEAT_SQPOLL_NONFIXED	(1U << 7)
#define IORING_FEAT_EXT_ARG		(1U << 8)
#define IORING_FEAT_NATIVE_WORKERS	(1U << 9)
#define IORING_FEAT_RSRC_TAGS		(1U << 10)
#define IORING_FEAT_CQE_SKIP		(1U << 11)
#define IORING_FEAT_LINKED_FILE		(1U << 12)
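
/*
 * A minimal usage sketch, not part of the UAPI (example only, compiled
 * out): checking a feature bit returned in io_uring_params.features, here
 * whether one mmap() covers both rings. "sq_size" and "cq_size" are
 * invented names for the two mapping sizes.
 */
#if 0
if (p.features & IORING_FEAT_SINGLE_MMAP) {
	/* map once at IORING_OFF_SQ_RING, sized for the larger ring */
	cq_size = sq_size = (sq_size > cq_size) ? sq_size : cq_size;
}
#endif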

/*
 * io_uring_register(2) opcodes and arguments
 */
enum {
	IORING_REGISTER_BUFFERS			= 0,
	IORING_UNREGISTER_BUFFERS		= 1,
	IORING_REGISTER_FILES			= 2,
	IORING_UNREGISTER_FILES			= 3,
	IORING_REGISTER_EVENTFD			= 4,
	IORING_UNREGISTER_EVENTFD		= 5,
	IORING_REGISTER_FILES_UPDATE		= 6,
	IORING_REGISTER_EVENTFD_ASYNC		= 7,
	IORING_REGISTER_PROBE			= 8,
	IORING_REGISTER_PERSONALITY		= 9,
	IORING_UNREGISTER_PERSONALITY		= 10,
	IORING_REGISTER_RESTRICTIONS		= 11,
	IORING_REGISTER_ENABLE_RINGS		= 12,

	/* extended with tagging */
	IORING_REGISTER_FILES2			= 13,
	IORING_REGISTER_FILES_UPDATE2		= 14,
	IORING_REGISTER_BUFFERS2		= 15,
	IORING_REGISTER_BUFFERS_UPDATE		= 16,

	/* set/clear io-wq thread affinities */
	IORING_REGISTER_IOWQ_AFF		= 17,
	IORING_UNREGISTER_IOWQ_AFF		= 18,

	/* set/get max number of io-wq workers */
	IORING_REGISTER_IOWQ_MAX_WORKERS	= 19,

	/* register/unregister io_uring fd with the ring */
	IORING_REGISTER_RING_FDS		= 20,
	IORING_UNREGISTER_RING_FDS		= 21,

	/* register ring based provide buffer group */
	IORING_REGISTER_PBUF_RING		= 22,
	IORING_UNREGISTER_PBUF_RING		= 23,

	/* this goes last */
	IORING_REGISTER_LAST
};
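
/*
 * A minimal usage sketch, not part of the UAPI (example only, compiled
 * out): registering fixed buffers via syscall(2); "iovecs" and
 * "nr_iovecs" are invented names for an application-owned iovec array.
 */
#if 0
int ret = syscall(__NR_io_uring_register, ring_fd,
		  IORING_REGISTER_BUFFERS, iovecs, nr_iovecs);
/* on success, READ_FIXED/WRITE_FIXED can reference them via sqe->buf_index */
#endif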

/* io-wq worker categories */
enum {
	IO_WQ_BOUND,
	IO_WQ_UNBOUND,
};

/* deprecated, see struct io_uring_rsrc_update */
struct io_uring_files_update {
	__u32 offset;
	__u32 resv;
	__aligned_u64 /* __s32 * */ fds;
};

/*
 * Register a fully sparse file space, rather than pass in an array of all
 * -1 file descriptors.
 */
#define IORING_RSRC_REGISTER_SPARSE	(1U << 0)

struct io_uring_rsrc_register {
	__u32 nr;
	__u32 flags;
	__u64 resv2;
	__aligned_u64 data;
	__aligned_u64 tags;
};

struct io_uring_rsrc_update {
	__u32 offset;
	__u32 resv;
	__aligned_u64 data;
};

struct io_uring_rsrc_update2 {
	__u32 offset;
	__u32 resv;
	__aligned_u64 data;
	__aligned_u64 tags;
	__u32 nr;
	__u32 resv2;
};

/* Skip updating fd indexes set to this value in the fd table */
#define IORING_REGISTER_FILES_SKIP	(-2)

#define IO_URING_OP_SUPPORTED	(1U << 0)

struct io_uring_probe_op {
	__u8 op;
	__u8 resv;
	__u16 flags;	/* IO_URING_OP_* flags */
	__u32 resv2;
};

struct io_uring_probe {
	__u8 last_op;	/* last opcode supported */
	__u8 ops_len;	/* length of ops[] array below */
	__u16 resv;
	__u32 resv2[3];
	struct io_uring_probe_op ops[0];
};
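
/*
 * A minimal usage sketch, not part of the UAPI (example only, compiled
 * out): probing whether an opcode is supported. The probe buffer needs
 * room for ops[] entries up to IORING_OP_LAST; calloc assumes <stdlib.h>
 * and the helper name is invented.
 */
#if 0
static int example_opcode_supported(int ring_fd, __u8 opcode)
{
	size_t len = sizeof(struct io_uring_probe) +
		     IORING_OP_LAST * sizeof(struct io_uring_probe_op);
	struct io_uring_probe *probe = calloc(1, len);
	int ok = 0;

	if (probe &&
	    syscall(__NR_io_uring_register, ring_fd, IORING_REGISTER_PROBE,
		    probe, IORING_OP_LAST) == 0 &&
	    opcode <= probe->last_op)
		ok = probe->ops[opcode].flags & IO_URING_OP_SUPPORTED;
	free(probe);
	return ok;
}
#endif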

struct io_uring_restriction {
	__u16 opcode;
	union {
		__u8 register_op; /* IORING_RESTRICTION_REGISTER_OP */
		__u8 sqe_op;      /* IORING_RESTRICTION_SQE_OP */
		__u8 sqe_flags;   /* IORING_RESTRICTION_SQE_FLAGS_* */
	};
	__u8 resv;
	__u32 resv2[3];
};

struct io_uring_buf {
	__u64	addr;
	__u32	len;
	__u16	bid;
	__u16	resv;
};

struct io_uring_buf_ring {
	union {
		/*
		 * To avoid spilling into more pages than we need to, the
		 * ring tail is overlaid with the io_uring_buf->resv field.
		 */
		struct {
			__u64	resv1;
			__u32	resv2;
			__u16	resv3;
			__u16	tail;
		};
		struct io_uring_buf	bufs[0];
	};
};

/* argument for IORING_(UN)REGISTER_PBUF_RING */
struct io_uring_buf_reg {
	__u64	ring_addr;
	__u32	ring_entries;
	__u16	bgid;
	__u16	pad;
	__u64	resv[3];
};
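
/*
 * A minimal usage sketch, not part of the UAPI (example only, compiled
 * out): registering a provided-buffer ring. "ring_mem" is an invented
 * name for page-aligned memory holding ring_entries io_uring_buf slots.
 */
#if 0
struct io_uring_buf_reg reg = {
	.ring_addr	= (unsigned long) ring_mem,
	.ring_entries	= 128,		/* must be a power of 2 */
	.bgid		= 7,		/* matched against sqe->buf_group */
};

int ret = syscall(__NR_io_uring_register, ring_fd,
		  IORING_REGISTER_PBUF_RING, &reg, 1);
#endif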

/*
 * io_uring_restriction->opcode values
 */
enum {
	/* Allow an io_uring_register(2) opcode */
	IORING_RESTRICTION_REGISTER_OP		= 0,

	/* Allow an sqe opcode */
	IORING_RESTRICTION_SQE_OP		= 1,

	/* Allow sqe flags */
	IORING_RESTRICTION_SQE_FLAGS_ALLOWED	= 2,

	/* Require sqe flags (these flags must be set on each submission) */
	IORING_RESTRICTION_SQE_FLAGS_REQUIRED	= 3,

	IORING_RESTRICTION_LAST
};

struct io_uring_getevents_arg {
	__u64	sigmask;
	__u32	sigmask_sz;
	__u32	pad;
	__u64	ts;
};
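
/*
 * A minimal usage sketch, not part of the UAPI (example only, compiled
 * out): waiting for a completion with a timeout via IORING_ENTER_EXT_ARG.
 * With that flag, the last syscall argument is the size of this struct
 * rather than a sigset size.
 */
#if 0
struct __kernel_timespec ts = { .tv_sec = 1 };
struct io_uring_getevents_arg arg = {
	.ts = (unsigned long) &ts,
};

syscall(__NR_io_uring_enter, ring_fd, 0, 1 /* min_complete */,
	IORING_ENTER_GETEVENTS | IORING_ENTER_EXT_ARG, &arg, sizeof(arg));
#endif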

#endif