From 2cf23c9f8623be62439ae4c23fce0ddae8a18eed Mon Sep 17 00:00:00 2001
From: Bernard Assan
Date: Sun, 28 Sep 2025 22:46:23 +0000
Subject: [PATCH 01/41] IoUring: use typed Flags and Features for IoUring

IoUring overhaul:

Named constants for IoUring
Move io_uring_sqe.zig into IoUring
Add new exposed IoUring data types
Improve naming and namespacing a bit
Improve the API of init with Flags.Setup
// https://github.com/axboe/liburing/issues/1075#issuecomment-1970744640
// https://github.com/axboe/liburing/issues/811#issuecomment-1454967969
// https://nick-black.com/dankwiki/index.php/Io_uring
Catch simple incompatible flag configurations during init_params
Start updating IoUring gradually to use the new typed flags
Slowly remove IoUring bits and pieces from linux.zig

Signed-off-by: Bernard Assan
---
 lib/std/os/linux/IoUring.zig      | 1995 ++++++++++++++++++++++++++---
 lib/std/os/linux/io_uring_sqe.zig |  679 ----------
 2 files changed, 1816 insertions(+), 858 deletions(-)
 delete mode 100644 lib/std/os/linux/io_uring_sqe.zig

diff --git a/lib/std/os/linux/IoUring.zig b/lib/std/os/linux/IoUring.zig
index 4992a352d21b..9dcbf71787c5 100644
--- a/lib/std/os/linux/IoUring.zig
+++ b/lib/std/os/linux/IoUring.zig
@@ -13,17 +13,1647 @@ const page_size_min = std.heap.page_size_min;
 fd: linux.fd_t = -1,
 sq: SubmissionQueue,
 cq: CompletionQueue,
-flags: u32,
-features: u32,
+flags: Flags.Setup,
+features: Flags.Features,
+
+// COMMIT: move IoUring constants to Constants
+pub const Constants = struct {
+    /// If sqe.file_index (splice_fd_in in the Zig struct) is set to this for opcodes that
+    /// instantiate a new direct descriptor (like openat/openat2/accept), then io_uring will
+    /// allocate an available direct descriptor instead of having the application pass one
+    /// in. The picked direct descriptor will be returned in cqe.res, or -ENFILE
+    /// if the space is full.
+    pub const FILE_INDEX_ALLOC = std.math.maxInt(u32);
+
+    pub const CMD_MASK = 1 << 0;
+
+    pub const TIMEOUT_CLOCK_MASK = ((1 << 2) | (1 << 3));
+    pub const TIMEOUT_UPDATE_MASK = ((1 << 1) | (1 << 4));
+
+    pub const CQE_BUFFER_SHIFT = 16;
+
+    /// cqe.res for IORING_CQE_F_NOTIF if IORING_SEND_ZC_REPORT_USAGE was
+    /// requested. It should be treated as a flag; all other bits of cqe.res
+    /// should be treated as reserved!
+    pub const NOTIF_USAGE_ZC_COPIED = (1 << 31);
+
+    // Magic offsets for the application to mmap the data it needs
+    pub const OFF_SQ_RING = 0;
+    pub const OFF_CQ_RING = 0x8000000;
+    pub const OFF_SQES = 0x10000000;
+    // COMMIT: new magic constants
+    pub const OFF_PBUF_RING = 0x80000000;
+    pub const OFF_PBUF_SHIFT = 16;
+    pub const OFF_MMAP_MASK = 0xf8000000;
+
+    // COMMIT: new constants
+    /// initialise with user provided memory pointed to by user_addr
+    pub const MEM_REGION_TYPE_USER = 1;
+    /// expose the region as registered wait arguments
+    pub const MEM_REGION_REG_WAIT_ARG = 1;
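+
+    // COMMIT: add an illustrative sanity test (a sketch, not part of the
+    // kernel API): for a kernel-allocated provided-buffer ring (see
+    // IOU_PBUF_RING_MMAP below), the mmap(2) offset is computed as
+    // OFF_PBUF_RING | (bgid << OFF_PBUF_SHIFT). The group id 7 is arbitrary.
+    test "provided buffer ring mmap offset" {
+        const bgid: u64 = 7;
+        const offset: u64 = OFF_PBUF_RING | (bgid << OFF_PBUF_SHIFT);
+        try std.testing.expectEqual(@as(u64, 0x8007_0000), offset);
+    }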
+
+    /// Register a fully sparse file space, rather than pass in an array of all -1 file descriptors.
+    pub const RSRC_REGISTER_SPARSE = 1 << 0;
+
+    /// Skip updating fd indexes set to this value in the fd table
+    pub const REGISTER_FILES_SKIP = -2;
+
+    pub const IO_URING_OP_SUPPORTED = 1 << 0;
+
+    // COMMIT: new constants
+    pub const REGISTER_SRC_REGISTERED = (1 << 0);
+    pub const REGISTER_DST_REPLACE = (1 << 1);
+
+    // COMMIT: new constant
+    pub const REG_WAIT_TS = (1 << 0);
+
+    // COMMIT: new TX timestamp definitions
+    /// SOCKET_URING_OP_TX_TIMESTAMP definitions
+    pub const TIMESTAMP_HW_SHIFT = 16;
+    /// The cqe.flags bit from which the timestamp type is stored
+    pub const TIMESTAMP_TYPE_SHIFT = (TIMESTAMP_HW_SHIFT + 1);
+    /// The cqe.flags flag signifying whether it's a hardware timestamp
+    pub const CQE_F_TSTAMP_HW = (1 << TIMESTAMP_HW_SHIFT);
+
+    /// The bit from which the area id is encoded into offsets
+    pub const ZCRX_AREA_SHIFT = 48;
+    pub const ZCRX_AREA_MASK = ~((@as(u64, 1) << ZCRX_AREA_SHIFT) - 1);
+};
+
+// COMMIT: move IoUring flags to Flags struct
+pub const Flags = struct {
+    /// io_uring_setup() flags
+    pub const Setup = packed struct(u32) {
+        /// io_context is polled
+        IOPOLL: bool = false,
+        /// SQ poll thread
+        SQPOLL: bool = false,
+        /// sq_thread_cpu is valid
+        SQ_AFF: bool = false,
+        /// app defines CQ size
+        CQSIZE: bool = false,
+        /// clamp SQ/CQ ring sizes
+        CLAMP: bool = false,
+        /// attach to existing wq
+        ATTACH_WQ: bool = false,
+        /// start with ring disabled
+        R_DISABLED: bool = false,
+        /// continue submit on error
+        SUBMIT_ALL: bool = false,
+        /// Cooperative task running. When requests complete, they often require
+        /// forcing the submitter to transition to the kernel to complete. If this
+        /// flag is set, work will be done when the task transitions anyway, rather
+        /// than force an inter-processor interrupt reschedule. This avoids interrupting
+        /// a task running in userspace, and saves an IPI.
+        COOP_TASKRUN: bool = false,
+        /// If COOP_TASKRUN is set, get notified if task work is available for
+        /// running and a kernel transition would be needed to run it. This sets
+        /// IORING_SQ_TASKRUN in the sq ring flags. Not valid with SQPOLL.
+        TASKRUN_FLAG: bool = false,
+        /// SQEs are 128 byte
+        SQE128: bool = false,
+        /// CQEs are 32 byte
+        CQE32: bool = false,
+        /// Only one task is allowed to submit requests
+        SINGLE_ISSUER: bool = false,
+        /// Defer running task work to get events.
+        /// Rather than running bits of task work whenever the task transitions
+        /// try to do it just before it is needed.
+        DEFER_TASKRUN: bool = false,
+        /// Application provides the memory for the rings
+        NO_MMAP: bool = false,
+        /// Register the ring fd in itself for use with
+        /// IORING_REGISTER_USE_REGISTERED_RING; return a registered fd index rather
+        /// than an fd.
+        REGISTERED_FD_ONLY: bool = false,
+        /// Removes indirection through the SQ index array.
+        NO_SQARRAY: bool = false,
+        // COMMIT: new setup flags
+        /// Use hybrid poll in iopoll process
+        HYBRID_IOPOLL: bool = false,
+        /// Allow both 16b and 32b CQEs. If a 32b CQE is posted, it will have
+        /// IORING_CQE_F_32 set in cqe.flags.
+        CQE_MIXED: bool = false,
+        _unused: u13 = 0,
+    };
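+
+    // COMMIT: add an illustrative layout sanity test (a sketch) spot-checking
+    // two IORING_SETUP_* bit positions from <linux/io_uring.h>: SQPOLL is
+    // 1 << 1 and SINGLE_ISSUER is 1 << 12.
+    test "Setup flag bit positions" {
+        try std.testing.expectEqual(@as(u32, 1 << 1), @as(u32, @bitCast(Setup{ .SQPOLL = true })));
+        try std.testing.expectEqual(@as(u32, 1 << 12), @as(u32, @bitCast(Setup{ .SINGLE_ISSUER = true })));
+    }
+
+    /// sqe.uring_cmd_flags (rw_flags in the Zig struct)
+    /// top 8 bits aren't available for userspace
+    /// CMD_FIXED: use registered buffer; pass this flag along with setting sqe.buf_index.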
+ pub const Cmd = packed struct(u32) { + CMD_FIXED: bool = false, + _unused: u31 = 0, + }; + + /// sqe.fsync_flags (rw_flags in the Zig struct) + pub const Fsync = packed struct(u32) { + DATASYNC: bool = false, + _unused: u31 = 0, + }; + + /// sqe.timeout_flags + pub const Timeout = packed struct(u32) { + TIMEOUT_ABS: bool = false, + /// Available since Linux 5.11 + TIMEOUT_UPDATE: bool = false, + /// Available since Linux 5.15 + TIMEOUT_BOOTTIME: bool = false, + /// Available since Linux 5.15 + TIMEOUT_REALTIME: bool = false, + /// Available since Linux 5.15 + LINK_TIMEOUT_UPDATE: bool = false, + /// Available since Linux 5.16 + TIMEOUT_ETIME_SUCCESS: bool = false, + // COMMIT: new Timeout Flag + // TODO: add when it became available + TIMEOUT_MULTISHOT: bool = false, + _unused: u25 = 0, + }; + + /// sqe.splice_flags (rw_flags in Zig Struct) + /// extends splice(2) flags + pub const Splice = packed struct(u32) { + _unused: u31 = 0, + /// the last bit of __u32 + F_FD_IN_FIXED: bool = false, + }; + + /// POLL_ADD flags. Note that since sqe.poll_events (rw_flags in Zig Struct) + /// is the flag space, the command flags for POLL_ADD are stored in sqe.len. + pub const Poll = packed struct(u32) { + /// IORING_POLL_ADD_MULTI + /// Multishot poll. Sets IORING_CQE_F_MORE if the poll handler will continue + /// to report CQEs on behalf of the same SQE. + ADD_MULTI: bool = false, + // TODO: verify this doc comment is valid for the 2 flags below + /// IORING_POLL_UPDATE + /// Update existing poll request, matching sqe.addr as the old user_data + /// field. + UPDATE_EVENTS: bool = false, + /// IORING_POLL_UPDATE + /// Update existing poll request, matching sqe.addr as the old user_data + /// field. + UPDATE_USER_DATA: bool = false, + /// IORING_POLL_LEVEL + /// Level triggered poll. + ADD_LEVEL: bool = false, + _unused: u28 = 0, + }; + + /// ASYNC_CANCEL flags. + pub const AsyncCancel = packed struct(u32) { + /// IORING_ASYNC_CANCEL_ALL + /// Cancel all requests that match the given key + ALL: bool = false, + /// IORING_ASYNC_CANCEL_FD + /// Key off 'fd' for cancelation rather than the request 'user_data' + FD: bool = false, + /// IORING_ASYNC_CANCEL_ANY + /// Match any request + ANY: bool = false, + /// IORING_ASYNC_CANCEL_FD_FIXED + /// 'fd' passed in is a fixed descriptor + FD_FIXED: bool = false, + // COMMIT: new AsyncCancel Flags + /// IORING_ASYNC_CANCEL_USERDATA + /// Match on user_data, default for no other key + USERDATA: bool = false, + /// IORING_ASYNC_CANCEL_OP + /// Match request based on opcode + OP: bool = false, + _unused: u26 = 0, + }; + + /// send/sendmsg and recv/recvmsg flags (sqe.ioprio) + pub const SendRecv = packed struct(u16) { + /// IORING_RECVSEND_POLL_FIRST + /// If set, instead of first attempting to send or receive and arm poll + /// if that yields an -EAGAIN result, arm poll upfront and skip the + /// initial transfer attempt. + RECVSEND_POLL_FIRST: bool = false, + /// IORING_RECV_MULTISHOT + /// Multishot recv. Sets IORING_CQE_F_MORE if the handler will continue + /// to report CQEs on behalf of the same SQE. + RECV_MULTISHOT: bool = false, + /// IORING_RECVSEND_FIXED_BUF + /// Use registered buffers, the index is stored in the buf_index field. + RECVSEND_FIXED_BUF: bool = false, + /// IORING_SEND_ZC_REPORT_USAGE + /// If set, SEND[MSG]_ZC should report the zerocopy usage in cqe.res + /// for the IORING_CQE_F_NOTIF cqe. 0 is reported if zerocopy was + /// actually possible. IORING_NOTIF_USAGE_ZC_COPIED if data was copied + /// (at least partially). 
+        SEND_ZC_REPORT_USAGE: bool = false,
+        /// IORING_RECVSEND_BUNDLE
+        /// Used with IOSQE_BUFFER_SELECT. If set, send or recv will grab as
+        /// many buffers from the buffer group ID given and send them all.
+        /// The completion result will be the number of buffers sent, with the
+        /// starting buffer ID in cqe.flags as per usual for provided buffer
+        /// usage. The buffers will be contiguous from the starting buffer ID.
+        RECVSEND_BUNDLE: bool = false,
+        // COMMIT: new flags
+        /// IORING_SEND_VECTORIZED
+        /// If set, SEND[_ZC] will take a pointer to a io_vec to allow
+        /// vectorized send operations.
+        SEND_VECTORIZED: bool = false,
+        _: u10 = 0,
+    };
+
+    /// accept flags stored in sqe.ioprio
+    pub const Accept = packed struct(u16) {
+        MULTISHOT: bool = false,
+        // COMMIT: new flags
+        DONTWAIT: bool = false,
+        POLL_FIRST: bool = false,
+        _unused: u13 = 0,
+    };
+
+    /// IORING_OP_MSG_RING flags (sqe.msg_ring_flags or sqe.rw_flags in the Zig struct)
+    pub const MsgRing = packed struct(u32) {
+        /// IORING_MSG_RING_CQE_SKIP Don't post a CQE to the target ring.
+        /// Not applicable for IORING_MSG_DATA, obviously.
+        CQE_SKIP: bool = false,
+        /// Pass through the flags from sqe.file_index to cqe.flags
+        FLAGS_PASS: bool = false,
+        _unused: u30 = 0,
+    };
+
+    // COMMIT: new flag
+    /// IORING_OP_FIXED_FD_INSTALL flags (sqe.install_fd_flags or sqe.rw_flags in the Zig struct)
+    pub const FixedFd = packed struct(u32) {
+        /// IORING_FIXED_FD_NO_CLOEXEC Don't mark the fd as O_CLOEXEC
+        NO_CLOEXEC: bool = false,
+        _unused: u31 = 0,
+    };
+
+    // COMMIT: new flags
+    /// IORING_OP_NOP flags (sqe.nop_flags or sqe.rw_flags in the Zig struct)
+    pub const Nop = packed struct(u32) {
+        /// IORING_NOP_INJECT_RESULT Inject result from sqe.result
+        INJECT_RESULT: bool = false,
+        _unused: u4 = 0,
+        CQE32: bool = false,
+        _unused_1: u26 = 0,
+    };
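+
+    // COMMIT: add an illustrative layout sanity test (a sketch). The kernel
+    // stores these in sqe.ioprio with IORING_ACCEPT_MULTISHOT == 1 << 0 and
+    // IORING_ACCEPT_POLL_FIRST == 1 << 2.
+    test "Accept flag bit positions" {
+        try std.testing.expectEqual(@as(u16, 1 << 0), @as(u16, @bitCast(Accept{ .MULTISHOT = true })));
+        try std.testing.expectEqual(@as(u16, 1 << 2), @as(u16, @bitCast(Accept{ .POLL_FIRST = true })));
+    }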
+
+    /// cqe.flags
+    pub const Cqe = packed struct(u32) {
+        /// IORING_CQE_F_BUFFER If set, the upper 16 bits are the buffer ID
+        F_BUFFER: bool = false,
+        /// IORING_CQE_F_MORE If set, parent SQE will generate more CQE entries
+        F_MORE: bool = false,
+        /// IORING_CQE_F_SOCK_NONEMPTY If set, more data to read after socket recv
+        F_SOCK_NONEMPTY: bool = false,
+        /// IORING_CQE_F_NOTIF Set for notification CQEs. Can be used to distinguish
+        /// them from sends.
+        F_NOTIF: bool = false,
+        /// IORING_CQE_F_BUF_MORE If set, the buffer ID set in the completion will get
+        /// more completions. In other words, the buffer is being
+        /// partially consumed, and will be used by the kernel for
+        /// more completions. This is only set for buffers used via
+        /// the incremental buffer consumption, as provided by
+        /// a ring buffer setup with IOU_PBUF_RING_INC. For any
+        /// other provided buffer type, all completions with a
+        /// buffer passed back is automatically returned to the
+        /// application.
+        F_BUF_MORE: bool = false,
+        // COMMIT: new flags
+        /// IORING_CQE_F_SKIP If set, then the application/liburing must ignore this
+        /// CQE. Its only purpose is to fill a gap in the ring,
+        /// if a large CQE is attempted posted when the ring has
+        /// just a single small CQE worth of space left before
+        /// wrapping.
+        F_SKIP: bool = false,
+        _unused: u9 = 0,
+        /// IORING_CQE_F_32 If set, this is a 32b/big-cqe posting. Use with rings
+        /// setup in a mixed CQE mode, where both 16b and 32b
+        /// CQEs may be posted to the CQ ring.
+        F_32: bool = false,
+        _unused_1: u16 = 0,
+    };
+
+    /// sq_ring.flags
+    pub const Sq = packed struct(u32) {
+        /// needs io_uring_enter wakeup
+        NEED_WAKEUP: bool = false,
+        /// CQ ring is overflown
+        CQ_OVERFLOW: bool = false,
+        /// task should enter the kernel
+        TASKRUN: bool = false,
+        _unused: u29 = 0,
+    };
+
+    /// cq_ring.flags
+    pub const Cq = packed struct(u32) {
+        /// disable eventfd notifications
+        EVENTFD_DISABLED: bool = false,
+        _unused: u31 = 0,
+    };
+
+    /// io_uring_enter(2) flags
+    pub const Enter = packed struct(u32) {
+        GETEVENTS: bool = false,
+        SQ_WAKEUP: bool = false,
+        SQ_WAIT: bool = false,
+        EXT_ARG: bool = false,
+        REGISTERED_RING: bool = false,
+        // COMMIT: new flags
+        ABS_TIMER: bool = false,
+        EXT_ARG_REG: bool = false,
+        NO_IOWAIT: bool = false,
+        _unused: u24 = 0,
+    };
+
+    /// io_uring_params.features flags
+    pub const Features = packed struct(u32) {
+        SINGLE_MMAP: bool = false,
+        NODROP: bool = false,
+        SUBMIT_STABLE: bool = false,
+        RW_CUR_POS: bool = false,
+        CUR_PERSONALITY: bool = false,
+        FAST_POLL: bool = false,
+        POLL_32BITS: bool = false,
+        SQPOLL_NONFIXED: bool = false,
+        EXT_ARG: bool = false,
+        NATIVE_WORKERS: bool = false,
+        RSRC_TAGS: bool = false,
+        CQE_SKIP: bool = false,
+        LINKED_FILE: bool = false,
+        // COMMIT: add new feature flags
+        REG_REG_RING: bool = false,
+        RECVSEND_BUNDLE: bool = false,
+        MIN_TIMEOUT: bool = false,
+        RW_ATTR: bool = false,
+        NO_IOWAIT: bool = false,
+        _unused: u14 = 0,
+    };
+};
+
+// IO completion data structure (Completion Queue Entry)
+pub const io_uring_cqe = extern struct {
+    /// sqe.user_data value passed back
+    user_data: u64,
+    /// result code for this event
+    res: i32,
+    flags: Flags.Cqe,
+    // COMMIT: document the CQE32 "big CQE" layout. If the ring is initialized
+    // with IORING_SETUP_CQE32, every CQE is followed by 16 bytes of padding
+    // (liburing's flexible `big_cqe[]` member), doubling the CQE size. A Zig
+    // extern struct cannot carry a flexible array member, so the extra payload
+    // lives past the end of this 16-byte struct rather than in a field.
+
+    pub fn err(self: io_uring_cqe) linux.E {
+        if (self.res > -4096 and self.res < 0) {
+            return @as(linux.E, @enumFromInt(-self.res));
+        }
+        return .SUCCESS;
+    }
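+
+    // COMMIT: add an illustrative round-trip test (a sketch) for buffer_id()
+    // below, packing an arbitrary buffer ID of 42 into the upper 16 bits of
+    // cqe.flags alongside F_BUFFER (bit 0).
+    test buffer_id {
+        const cqe: io_uring_cqe = .{
+            .user_data = 0,
+            .res = 0,
+            .flags = @bitCast((@as(u32, 42) << Constants.CQE_BUFFER_SHIFT) | 1),
+        };
+        try std.testing.expectEqual(@as(u16, 42), try cqe.buffer_id());
+    }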
+
+    // On successful completion of the provided buffers IO request, the CQE flags field
+    // will have IORING_CQE_F_BUFFER set and the selected buffer ID will be indicated by
+    // the upper 16 bits of the flags field.
+    pub fn buffer_id(self: io_uring_cqe) !u16 {
+        if (!self.flags.F_BUFFER) {
+            return error.NoBufferSelected;
+        }
+        return @as(u16, @intCast(@as(u32, @bitCast(self.flags)) >> Constants.CQE_BUFFER_SHIFT));
+    }
+};
+
+/// IO submission data structure (Submission Queue Entry)
+pub const io_uring_sqe = extern struct {
+    /// type of operation for this sqe
+    opcode: Op,
+    /// IOSQE_* flags
+    flags: SqeFlags,
+    /// ioprio for the request
+    ioprio: u16,
+    /// file descriptor to do IO on
+    fd: i32,
+    /// offset into file
+    off: u64,
+    /// pointer to buffer or iovecs
+    addr: u64,
+    /// buffer size or number of iovecs
+    len: u32,
+    /// flags for any sqe operation
+    /// rw_flags | fsync_flags | poll_event | poll32_event | sync_range_flags | msg_flags
+    /// timeout_flags | accept_flags | cancel_flags | open_flags | statx_flags
+    /// fadvise_advice | splice_flags | rename_flags | unlink_flags | hardlink_flags
+    /// xattr_flags | msg_ring_flags | uring_cmd_flags | waitid_flags | futex_flags
+    /// install_fd_flags | nop_flags | pipe_flags
+    rw_flags: u32,
+    /// data to be passed back at completion time
+    user_data: u64,
+    /// index into fixed buffers or for grouped buffer selection
+    buf_index: u16,
+    personality: u16,
+    splice_fd_in: i32,
+    addr3: u64,
+    resv: u64,
+
+    /// sqe.flags
+    pub const SqeFlags = packed struct(u8) {
+        /// use fixed fileset
+        IOSQE_FIXED_FILE: bool = false,
+        /// issue after inflight IO
+        IOSQE_IO_DRAIN: bool = false,
+        /// links next sqe
+        IOSQE_IO_LINK: bool = false,
+        /// like LINK, but stronger
+        IOSQE_IO_HARDLINK: bool = false,
+        /// always go async
+        IOSQE_ASYNC: bool = false,
+        /// select buffer from sqe->buf_group
+        IOSQE_BUFFER_SELECT: bool = false,
+        /// don't post CQE if request succeeded
+        IOSQE_CQE_SKIP_SUCCESS: bool = false,
+        _: u1 = 0,
+    };
+
+    pub fn prep_nop(sqe: *io_uring_sqe) void {
+        sqe.* = .{
+            .opcode = .NOP,
+            .flags = .{},
+            .ioprio = 0,
+            .fd = 0,
+            .off = 0,
+            .addr = 0,
+            .len = 0,
+            .rw_flags = 0,
+            .user_data = 0,
+            .buf_index = 0,
+            .personality = 0,
+            .splice_fd_in = 0,
+            .addr3 = 0,
+            .resv = 0,
+        };
+    }
+
+    // COMMIT: take the typed Flags.Fsync here; the IoUring.fsync wrapper below
+    // already passes it.
+    pub fn prep_fsync(sqe: *io_uring_sqe, fd: linux.fd_t, flags: Flags.Fsync) void {
+        sqe.* = .{
+            .opcode = .FSYNC,
+            .flags = .{},
+            .ioprio = 0,
+            .fd = fd,
+            .off = 0,
+            .addr = 0,
+            .len = 0,
+            .rw_flags = @bitCast(flags),
+            .user_data = 0,
+            .buf_index = 0,
+            .personality = 0,
+            .splice_fd_in = 0,
+            .addr3 = 0,
+            .resv = 0,
+        };
+    }
+
+    pub fn prep_rw(
+        sqe: *io_uring_sqe,
+        op: Op,
+        fd: linux.fd_t,
+        addr: u64,
+        len: usize,
+        offset: u64,
+    ) void {
+        sqe.* = .{
+            .opcode = op,
+            .flags = .{},
+            .ioprio = 0,
+            .fd = fd,
+            .off = offset,
+            .addr = addr,
+            .len = @intCast(len),
+            .rw_flags = 0,
+            .user_data = 0,
+            .buf_index = 0,
+            .personality = 0,
+            .splice_fd_in = 0,
+            .addr3 = 0,
+            .resv = 0,
+        };
+    }
+
+    pub fn prep_read(sqe: *io_uring_sqe, fd: linux.fd_t, buffer: []u8, offset: u64) void {
+        sqe.prep_rw(.READ, fd, @intFromPtr(buffer.ptr), buffer.len, offset);
+    }
+
+    pub fn prep_write(sqe: *io_uring_sqe, fd: linux.fd_t, buffer: []const u8, offset: u64) void {
+        sqe.prep_rw(.WRITE, fd, @intFromPtr(buffer.ptr), buffer.len, offset);
+    }
+
+    pub fn prep_splice(sqe: *io_uring_sqe, fd_in: linux.fd_t, off_in: u64, fd_out: linux.fd_t, off_out: u64, len: usize) void {
+        sqe.prep_rw(.SPLICE, fd_out, undefined, len, off_out);
+        sqe.addr = off_in;
+        sqe.splice_fd_in = fd_in;
+    }
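+
+    // COMMIT: add an illustrative sanity test (a sketch) that prep_* helpers
+    // fully initialize the SQE; user_data is set separately by the IoUring
+    // wrapper methods further below.
+    test prep_nop {
+        var sqe: io_uring_sqe = undefined;
+        sqe.prep_nop();
+        try std.testing.expectEqual(Op.NOP, sqe.opcode);
+        try std.testing.expectEqual(@as(u64, 0), sqe.user_data);
+    }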
+
+    pub fn prep_readv(
+        sqe: *io_uring_sqe,
+        fd: linux.fd_t,
+        iovecs: []const std.posix.iovec,
+        offset: u64,
+    ) void {
+        sqe.prep_rw(.READV, fd, @intFromPtr(iovecs.ptr), iovecs.len, offset);
+    }
+
+    pub fn prep_writev(
+        sqe: *io_uring_sqe,
+        fd: linux.fd_t,
+        iovecs: []const std.posix.iovec_const,
+        offset: u64,
+    ) void {
+        sqe.prep_rw(.WRITEV, fd, @intFromPtr(iovecs.ptr), iovecs.len, offset);
+    }
+
+    pub fn prep_read_fixed(sqe: *io_uring_sqe, fd: linux.fd_t, buffer: *std.posix.iovec, offset: u64, buffer_index: u16) void {
+        sqe.prep_rw(.READ_FIXED, fd, @intFromPtr(buffer.base), buffer.len, offset);
+        sqe.buf_index = buffer_index;
+    }
+
+    pub fn prep_write_fixed(sqe: *io_uring_sqe, fd: linux.fd_t, buffer: *std.posix.iovec, offset: u64, buffer_index: u16) void {
+        sqe.prep_rw(.WRITE_FIXED, fd, @intFromPtr(buffer.base), buffer.len, offset);
+        sqe.buf_index = buffer_index;
+    }
+
+    pub fn prep_accept(
+        sqe: *io_uring_sqe,
+        fd: linux.fd_t,
+        addr: ?*linux.sockaddr,
+        addrlen: ?*linux.socklen_t,
+        flags: u32,
+    ) void {
+        // `addr` holds a pointer to `sockaddr`, and `addr2` holds a pointer to `socklen_t`.
+        // `addr2` maps to `sqe.off` (u64) instead of `sqe.len` (which is only a u32).
+        sqe.prep_rw(.ACCEPT, fd, @intFromPtr(addr), 0, @intFromPtr(addrlen));
+        sqe.rw_flags = flags;
+    }
+
+    pub fn prep_accept_direct(
+        sqe: *io_uring_sqe,
+        fd: linux.fd_t,
+        addr: ?*linux.sockaddr,
+        addrlen: ?*linux.socklen_t,
+        flags: u32,
+        file_index: u32,
+    ) void {
+        prep_accept(sqe, fd, addr, addrlen, flags);
+        set_target_fixed_file(sqe, file_index);
+    }
+
+    pub fn prep_multishot_accept_direct(
+        sqe: *io_uring_sqe,
+        fd: linux.fd_t,
+        addr: ?*linux.sockaddr,
+        addrlen: ?*linux.socklen_t,
+        flags: u32,
+    ) void {
+        prep_multishot_accept(sqe, fd, addr, addrlen, flags);
+        set_target_fixed_file(sqe, linux.IORING_FILE_INDEX_ALLOC);
+    }
+
+    fn set_target_fixed_file(sqe: *io_uring_sqe, file_index: u32) void {
+        const sqe_file_index: u32 = if (file_index == Constants.FILE_INDEX_ALLOC)
+            Constants.FILE_INDEX_ALLOC
+        else
+            // 0 means no fixed files, indexes should be encoded as "index + 1"
+            file_index + 1;
+        // This field is overloaded in liburing:
+        // splice_fd_in: i32
+        // sqe_file_index: u32
+        sqe.splice_fd_in = @bitCast(sqe_file_index);
+    }
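+
+    // COMMIT: add an illustrative sanity test (a sketch) of the "index + 1"
+    // encoding documented in set_target_fixed_file, using fixed-file slot 3.
+    test set_target_fixed_file {
+        var sqe: io_uring_sqe = undefined;
+        sqe.prep_nop();
+        set_target_fixed_file(&sqe, 3);
+        try std.testing.expectEqual(@as(i32, 4), sqe.splice_fd_in);
+    }
+
+    pub fn prep_connect(
+        sqe: *io_uring_sqe,
+        fd: linux.fd_t,
+        addr: *const linux.sockaddr,
+        addrlen: linux.socklen_t,
+    ) void {
+        // `addrlen` maps to `sqe.off` (u64) instead of `sqe.len` (which is only a u32).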
+ sqe.prep_rw(.CONNECT, fd, @intFromPtr(addr), 0, addrlen); + } + + pub fn prep_epoll_ctl( + sqe: *io_uring_sqe, + epfd: linux.fd_t, + fd: linux.fd_t, + op: u32, + ev: ?*linux.epoll_event, + ) void { + sqe.prep_rw(.EPOLL_CTL, epfd, @intFromPtr(ev), op, @intCast(fd)); + } + + pub fn prep_recv(sqe: *io_uring_sqe, fd: linux.fd_t, buffer: []u8, flags: Flags.SendRecv) void { + sqe.prep_rw(.RECV, fd, @intFromPtr(buffer.ptr), buffer.len, 0); + sqe.rw_flags = @bitCast(flags); + } + + pub fn prep_recv_multishot( + sqe: *io_uring_sqe, + fd: linux.fd_t, + buffer: []u8, + flags: Flags.SendRecv, + ) void { + sqe.prep_recv(fd, buffer, flags); + const enable_multishot: Flags.SendRecv = .{ .RECV_MULTISHOT = true }; + sqe.ioprio |= @bitCast(enable_multishot); + } + + pub fn prep_recvmsg( + sqe: *io_uring_sqe, + fd: linux.fd_t, + msg: *linux.msghdr, + flags: Flags.SendRecv, + ) void { + sqe.prep_rw(.RECVMSG, fd, @intFromPtr(msg), 1, 0); + sqe.rw_flags = @bitCast(flags); + } + + pub fn prep_recvmsg_multishot( + sqe: *io_uring_sqe, + fd: linux.fd_t, + msg: *linux.msghdr, + flags: Flags.SendRecv, + ) void { + sqe.prep_recvmsg(fd, msg, flags); + const enable_multishot: Flags.SendRecv = .{ .RECV_MULTISHOT = true }; + sqe.ioprio |= @bitCast(enable_multishot); + } + + // COMMIT: fix send[|recv] flag param type + pub fn prep_send(sqe: *io_uring_sqe, fd: linux.fd_t, buffer: []const u8, flags: Flags.SendRecv) void { + sqe.prep_rw(.SEND, fd, @intFromPtr(buffer.ptr), buffer.len, 0); + sqe.rw_flags = @bitCast(flags); + } + + pub fn prep_send_zc(sqe: *io_uring_sqe, fd: linux.fd_t, buffer: []const u8, flags: u32, zc_flags: u16) void { + sqe.prep_rw(.SEND_ZC, fd, @intFromPtr(buffer.ptr), buffer.len, 0); + sqe.rw_flags = flags; + sqe.ioprio = zc_flags; + } + + pub fn prep_send_zc_fixed(sqe: *io_uring_sqe, fd: linux.fd_t, buffer: []const u8, flags: u32, zc_flags: u16, buf_index: u16) void { + prep_send_zc(sqe, fd, buffer, flags, zc_flags); + sqe.ioprio |= linux.IORING_RECVSEND_FIXED_BUF; + sqe.buf_index = buf_index; + } + + pub fn prep_sendmsg_zc( + sqe: *io_uring_sqe, + fd: linux.fd_t, + msg: *const linux.msghdr_const, + flags: u32, + ) void { + prep_sendmsg(sqe, fd, msg, flags); + sqe.opcode = .SENDMSG_ZC; + } + + pub fn prep_sendmsg( + sqe: *io_uring_sqe, + fd: linux.fd_t, + msg: *const linux.msghdr_const, + flags: u32, + ) void { + sqe.prep_rw(.SENDMSG, fd, @intFromPtr(msg), 1, 0); + sqe.rw_flags = flags; + } + + pub fn prep_openat( + sqe: *io_uring_sqe, + fd: linux.fd_t, + path: [*:0]const u8, + flags: linux.O, + mode: linux.mode_t, + ) void { + sqe.prep_rw(.OPENAT, fd, @intFromPtr(path), mode, 0); + sqe.rw_flags = @bitCast(flags); + } + + pub fn prep_openat_direct( + sqe: *io_uring_sqe, + fd: linux.fd_t, + path: [*:0]const u8, + flags: linux.O, + mode: linux.mode_t, + file_index: u32, + ) void { + prep_openat(sqe, fd, path, flags, mode); + set_target_fixed_file(sqe, file_index); + } + + pub fn prep_close(sqe: *io_uring_sqe, fd: linux.fd_t) void { + sqe.* = .{ + .opcode = .CLOSE, + .flags = .{}, + .ioprio = 0, + .fd = fd, + .off = 0, + .addr = 0, + .len = 0, + .rw_flags = 0, + .user_data = 0, + .buf_index = 0, + .personality = 0, + .splice_fd_in = 0, + .addr3 = 0, + .resv = 0, + }; + } + + pub fn prep_close_direct(sqe: *io_uring_sqe, file_index: u32) void { + prep_close(sqe, 0); + set_target_fixed_file(sqe, file_index); + } + + pub fn prep_timeout( + sqe: *io_uring_sqe, + ts: *const linux.kernel_timespec, + count: u32, + flags: u32, + ) void { + sqe.prep_rw(.TIMEOUT, -1, @intFromPtr(ts), 1, count); + sqe.rw_flags = 
flags; + } + + pub fn prep_timeout_remove(sqe: *io_uring_sqe, timeout_user_data: u64, flags: u32) void { + sqe.* = .{ + .opcode = .TIMEOUT_REMOVE, + .flags = .{}, + .ioprio = 0, + .fd = -1, + .off = 0, + .addr = timeout_user_data, + .len = 0, + .rw_flags = flags, + .user_data = 0, + .buf_index = 0, + .personality = 0, + .splice_fd_in = 0, + .addr3 = 0, + .resv = 0, + }; + } + + pub fn prep_link_timeout( + sqe: *io_uring_sqe, + ts: *const linux.kernel_timespec, + flags: u32, + ) void { + sqe.prep_rw(.LINK_TIMEOUT, -1, @intFromPtr(ts), 1, 0); + sqe.rw_flags = flags; + } + + pub fn prep_poll_add( + sqe: *io_uring_sqe, + fd: linux.fd_t, + poll_mask: u32, + ) void { + sqe.prep_rw(.POLL_ADD, fd, @intFromPtr(@as(?*anyopaque, null)), 0, 0); + // Poll masks previously used to comprise of 16 bits in the flags union of + // a SQE, but were then extended to comprise of 32 bits in order to make + // room for additional option flags. To ensure that the correct bits of + // poll masks are consistently and properly read across multiple kernel + // versions, poll masks are enforced to be little-endian. + // https://www.spinics.net/lists/io-uring/msg02848.html + sqe.rw_flags = std.mem.nativeToLittle(u32, poll_mask); + } + + pub fn prep_poll_remove( + sqe: *io_uring_sqe, + target_user_data: u64, + ) void { + sqe.prep_rw(.POLL_REMOVE, -1, target_user_data, 0, 0); + } + + pub fn prep_poll_update( + sqe: *io_uring_sqe, + old_user_data: u64, + new_user_data: u64, + poll_mask: u32, + flags: u32, + ) void { + sqe.prep_rw(.POLL_REMOVE, -1, old_user_data, flags, new_user_data); + // Poll masks previously used to comprise of 16 bits in the flags union of + // a SQE, but were then extended to comprise of 32 bits in order to make + // room for additional option flags. To ensure that the correct bits of + // poll masks are consistently and properly read across multiple kernel + // versions, poll masks are enforced to be little-endian. 
+ // https://www.spinics.net/lists/io-uring/msg02848.html + sqe.rw_flags = std.mem.nativeToLittle(u32, poll_mask); + } + + pub fn prep_fallocate( + sqe: *io_uring_sqe, + fd: linux.fd_t, + mode: i32, + offset: u64, + len: u64, + ) void { + sqe.* = .{ + .opcode = .FALLOCATE, + .flags = .{}, + .ioprio = 0, + .fd = fd, + .off = offset, + .addr = len, + .len = @intCast(mode), + .rw_flags = 0, + .user_data = 0, + .buf_index = 0, + .personality = 0, + .splice_fd_in = 0, + .addr3 = 0, + .resv = 0, + }; + } + + pub fn prep_statx( + sqe: *io_uring_sqe, + fd: linux.fd_t, + path: [*:0]const u8, + flags: u32, + mask: u32, + buf: *linux.Statx, + ) void { + sqe.prep_rw(.STATX, fd, @intFromPtr(path), mask, @intFromPtr(buf)); + sqe.rw_flags = flags; + } + + pub fn prep_cancel( + sqe: *io_uring_sqe, + cancel_user_data: u64, + flags: u32, + ) void { + sqe.prep_rw(.ASYNC_CANCEL, -1, cancel_user_data, 0, 0); + sqe.rw_flags = flags; + } + + pub fn prep_cancel_fd( + sqe: *io_uring_sqe, + fd: linux.fd_t, + flags: u32, + ) void { + sqe.prep_rw(.ASYNC_CANCEL, fd, 0, 0, 0); + sqe.rw_flags = flags | linux.IORING_ASYNC_CANCEL_FD; + } + + pub fn prep_shutdown( + sqe: *io_uring_sqe, + sockfd: linux.socket_t, + how: u32, + ) void { + sqe.prep_rw(.SHUTDOWN, sockfd, 0, how, 0); + } + + pub fn prep_renameat( + sqe: *io_uring_sqe, + old_dir_fd: linux.fd_t, + old_path: [*:0]const u8, + new_dir_fd: linux.fd_t, + new_path: [*:0]const u8, + flags: u32, + ) void { + sqe.prep_rw( + .RENAMEAT, + old_dir_fd, + @intFromPtr(old_path), + 0, + @intFromPtr(new_path), + ); + sqe.len = @bitCast(new_dir_fd); + sqe.rw_flags = flags; + } + + pub fn prep_unlinkat( + sqe: *io_uring_sqe, + dir_fd: linux.fd_t, + path: [*:0]const u8, + flags: u32, + ) void { + sqe.prep_rw(.UNLINKAT, dir_fd, @intFromPtr(path), 0, 0); + sqe.rw_flags = flags; + } + + pub fn prep_mkdirat( + sqe: *io_uring_sqe, + dir_fd: linux.fd_t, + path: [*:0]const u8, + mode: linux.mode_t, + ) void { + sqe.prep_rw(.MKDIRAT, dir_fd, @intFromPtr(path), mode, 0); + } + + pub fn prep_symlinkat( + sqe: *io_uring_sqe, + target: [*:0]const u8, + new_dir_fd: linux.fd_t, + link_path: [*:0]const u8, + ) void { + sqe.prep_rw( + .SYMLINKAT, + new_dir_fd, + @intFromPtr(target), + 0, + @intFromPtr(link_path), + ); + } + + pub fn prep_linkat( + sqe: *io_uring_sqe, + old_dir_fd: linux.fd_t, + old_path: [*:0]const u8, + new_dir_fd: linux.fd_t, + new_path: [*:0]const u8, + flags: u32, + ) void { + sqe.prep_rw( + .LINKAT, + old_dir_fd, + @intFromPtr(old_path), + 0, + @intFromPtr(new_path), + ); + sqe.len = @bitCast(new_dir_fd); + sqe.rw_flags = flags; + } + + pub fn prep_files_update( + sqe: *io_uring_sqe, + fds: []const linux.fd_t, + offset: u32, + ) void { + sqe.prep_rw(.FILES_UPDATE, -1, @intFromPtr(fds.ptr), fds.len, @intCast(offset)); + } + + pub fn prep_files_update_alloc( + sqe: *io_uring_sqe, + fds: []linux.fd_t, + ) void { + sqe.prep_rw(.FILES_UPDATE, -1, @intFromPtr(fds.ptr), fds.len, linux.IORING_FILE_INDEX_ALLOC); + } + + pub fn prep_provide_buffers( + sqe: *io_uring_sqe, + buffers: [*]u8, + buffer_len: usize, + num: usize, + group_id: usize, + buffer_id: usize, + ) void { + const ptr = @intFromPtr(buffers); + sqe.prep_rw(.PROVIDE_BUFFERS, @intCast(num), ptr, buffer_len, buffer_id); + sqe.buf_index = @intCast(group_id); + } + + pub fn prep_remove_buffers( + sqe: *io_uring_sqe, + num: usize, + group_id: usize, + ) void { + sqe.prep_rw(.REMOVE_BUFFERS, @intCast(num), 0, 0, 0); + sqe.buf_index = @intCast(group_id); + } + + pub fn prep_multishot_accept( + sqe: *io_uring_sqe, + fd: linux.fd_t, 
+        addr: ?*linux.sockaddr,
+        addrlen: ?*linux.socklen_t,
+        flags: u32,
+    ) void {
+        prep_accept(sqe, fd, addr, addrlen, flags);
+        sqe.ioprio |= linux.IORING_ACCEPT_MULTISHOT;
+    }
+
+    pub fn prep_socket(
+        sqe: *io_uring_sqe,
+        domain: u32,
+        socket_type: u32,
+        protocol: u32,
+        flags: u32,
+    ) void {
+        sqe.prep_rw(.SOCKET, @intCast(domain), 0, protocol, socket_type);
+        sqe.rw_flags = flags;
+    }
+
+    pub fn prep_socket_direct(
+        sqe: *io_uring_sqe,
+        domain: u32,
+        socket_type: u32,
+        protocol: u32,
+        flags: u32,
+        file_index: u32,
+    ) void {
+        prep_socket(sqe, domain, socket_type, protocol, flags);
+        set_target_fixed_file(sqe, file_index);
+    }
+
+    pub fn prep_socket_direct_alloc(
+        sqe: *io_uring_sqe,
+        domain: u32,
+        socket_type: u32,
+        protocol: u32,
+        flags: u32,
+    ) void {
+        prep_socket(sqe, domain, socket_type, protocol, flags);
+        set_target_fixed_file(sqe, linux.IORING_FILE_INDEX_ALLOC);
+    }
+
+    pub fn prep_waitid(
+        sqe: *io_uring_sqe,
+        id_type: linux.P,
+        id: i32,
+        infop: *linux.siginfo_t,
+        options: u32,
+        flags: u32,
+    ) void {
+        sqe.prep_rw(.WAITID, id, 0, @intFromEnum(id_type), @intFromPtr(infop));
+        sqe.rw_flags = flags;
+        sqe.splice_fd_in = @bitCast(options);
+    }
+
+    pub fn prep_bind(
+        sqe: *io_uring_sqe,
+        fd: linux.fd_t,
+        addr: *const linux.sockaddr,
+        addrlen: linux.socklen_t,
+        flags: u32,
+    ) void {
+        sqe.prep_rw(.BIND, fd, @intFromPtr(addr), 0, addrlen);
+        sqe.rw_flags = flags;
+    }
+
+    pub fn prep_listen(
+        sqe: *io_uring_sqe,
+        fd: linux.fd_t,
+        backlog: usize,
+        flags: u32,
+    ) void {
+        sqe.prep_rw(.LISTEN, fd, 0, backlog, 0);
+        sqe.rw_flags = flags;
+    }
+
+    pub fn prep_cmd_sock(
+        sqe: *io_uring_sqe,
+        cmd_op: linux.IO_URING_SOCKET_OP,
+        fd: linux.fd_t,
+        level: u32,
+        optname: u32,
+        optval: u64,
+        optlen: u32,
+    ) void {
+        sqe.prep_rw(.URING_CMD, fd, 0, 0, 0);
+        // off is overloaded with cmd_op, https://github.com/axboe/liburing/blob/e1003e496e66f9b0ae06674869795edf772d5500/src/include/liburing/io_uring.h#L39
+        sqe.off = @intFromEnum(cmd_op);
+        // addr is overloaded, https://github.com/axboe/liburing/blob/e1003e496e66f9b0ae06674869795edf772d5500/src/include/liburing/io_uring.h#L46
+        sqe.addr = @bitCast(packed struct {
+            level: u32,
+            optname: u32,
+        }{
+            .level = level,
+            .optname = optname,
+        });
+        // splice_fd_in is overloaded u32 -> i32
+        sqe.splice_fd_in = @bitCast(optlen);
+        // addr3 is overloaded, https://github.com/axboe/liburing/blob/e1003e496e66f9b0ae06674869795edf772d5500/src/include/liburing/io_uring.h#L102
+        sqe.addr3 = optval;
+    }
+
+    pub fn set_flags(sqe: *io_uring_sqe, flags: io_uring_sqe.SqeFlags) void {
+        const updated_flags = @as(u8, @bitCast(sqe.flags)) | @as(u8, @bitCast(flags));
+        sqe.flags = @bitCast(updated_flags);
+    }
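+
+    // COMMIT: add an illustrative sanity test (a sketch) that set_flags ORs
+    // new flags into the existing SQE flag set rather than overwriting it.
+    test set_flags {
+        var sqe: io_uring_sqe = undefined;
+        sqe.prep_nop();
+        sqe.set_flags(.{ .IOSQE_IO_LINK = true });
+        sqe.set_flags(.{ .IOSQE_ASYNC = true });
+        try std.testing.expect(sqe.flags.IOSQE_IO_LINK);
+        try std.testing.expect(sqe.flags.IOSQE_ASYNC);
+    }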
+
+    /// This SQE forms a link with the next SQE in the submission ring. The next SQE
+    /// will not be started before this one completes. Forms a chain of SQEs.
+    pub fn link_next(sqe: *io_uring_sqe) void {
+        sqe.flags.IOSQE_IO_LINK = true;
+    }
+};
+
+/// Filled with the offset for mmap(2)
+pub const io_sqring_offsets = extern struct {
+    /// offset of ring head
+    head: u32,
+    /// offset of ring tail
+    tail: u32,
+    /// ring mask value
+    ring_mask: u32,
+    /// entries in ring
+    ring_entries: u32,
+    /// ring flags
+    flags: u32,
+    /// number of sqes not submitted
+    dropped: u32,
+    /// sqe index array
+    array: u32,
+    resv1: u32,
+    user_addr: u64,
+};
+
+pub const io_cqring_offsets = extern struct {
+    head: u32,
+    tail: u32,
+    ring_mask: u32,
+    ring_entries: u32,
+    overflow: u32,
+    cqes: u32,
+    flags: u32,
+    resv: u32,
+    user_addr: u64,
+};
+
+/// Passed in for io_uring_setup(2). Copied back with updated info on success
+pub const io_uring_params = extern struct {
+    sq_entries: u32,
+    cq_entries: u32,
+    flags: u32,
+    sq_thread_cpu: u32,
+    sq_thread_idle: u32,
+    features: u32,
+    wq_fd: u32,
+    resv: [3]u32,
+    sq_off: io_sqring_offsets,
+    cq_off: io_cqring_offsets,
+};
+
+// io_uring_register opcodes and arguments
+pub const RegisterOp = enum(u32) {
+    REGISTER_BUFFERS,
+    UNREGISTER_BUFFERS,
+    REGISTER_FILES,
+    UNREGISTER_FILES,
+    REGISTER_EVENTFD,
+    UNREGISTER_EVENTFD,
+    REGISTER_FILES_UPDATE,
+    REGISTER_EVENTFD_ASYNC,
+    REGISTER_PROBE,
+    REGISTER_PERSONALITY,
+    UNREGISTER_PERSONALITY,
+    REGISTER_RESTRICTIONS,
+    REGISTER_ENABLE_RINGS,
+
+    // extended with tagging
+    REGISTER_FILES2,
+    REGISTER_FILES_UPDATE2,
+    REGISTER_BUFFERS2,
+    REGISTER_BUFFERS_UPDATE,
+
+    // set/clear io-wq thread affinities
+    REGISTER_IOWQ_AFF,
+    UNREGISTER_IOWQ_AFF,
+
+    // set/get max number of io-wq workers
+    REGISTER_IOWQ_MAX_WORKERS,
+
+    // register/unregister io_uring fd with the ring
+    REGISTER_RING_FDS,
+    UNREGISTER_RING_FDS,
+
+    // register ring based provide buffer group
+    REGISTER_PBUF_RING,
+    UNREGISTER_PBUF_RING,
+
+    // sync cancelation API
+    REGISTER_SYNC_CANCEL,
+
+    // register a range of fixed file slots for automatic slot allocation
+    REGISTER_FILE_ALLOC_RANGE,
+
+    // return status information for a buffer group
+    REGISTER_PBUF_STATUS,
+
+    // set/clear busy poll settings
+    REGISTER_NAPI,
+    UNREGISTER_NAPI,
+
+    REGISTER_CLOCK,
+
+    // clone registered buffers from source ring to current ring
+    REGISTER_CLONE_BUFFERS,
+
+    // send MSG_RING without having a ring
+    REGISTER_SEND_MSG_RING,
+
+    // register a netdev hw rx queue for zerocopy
+    REGISTER_ZCRX_IFQ,
+
+    // resize CQ ring
+    REGISTER_RESIZE_RINGS,
+
+    REGISTER_MEM_REGION,
+
+    // COMMIT: new register opcode
+    // query various aspects of io_uring, see linux/io_uring/query.h
+    REGISTER_QUERY,
+
+    // flag added to the opcode to use a registered ring fd
+    REGISTER_USE_REGISTERED_RING = 1 << 31,
+
+    _,
+};
+
+/// io-wq worker categories, in the argument order used by REGISTER_IOWQ_MAX_WORKERS
+pub const IoWqCategory = enum(u8) {
+    BOUND,
+    UNBOUND,
+};
+
+// COMMIT: remove the deprecated io_uring_files_update struct
+// (deprecated in the kernel headers, see struct io_uring_rsrc_update)
+
+// COMMIT: add new io_uring_region_desc struct
+pub const io_uring_region_desc = extern struct {
+    user_addr: u64,
+    size: u64,
+    flags: u32,
+    id: u32,
+    mmap_offset: u64,
+    __resv: [4]u64,
+};
+
+// COMMIT: add new io_uring_mem_region_reg struct
+pub const io_uring_mem_region_reg = extern struct {
+    /// struct io_uring_region_desc
+    region_uptr: u64,
+    flags: u64,
+    __resv: [2]u64,
+};
+
+pub const io_uring_rsrc_register = extern struct {
+    nr: u32,
+    flags: u32,
+    resv2: u64,
+    data: u64,
+    tags: u64,
+};
+
+pub const io_uring_rsrc_update = extern struct {
+    offset: u32,
+    resv: u32,
+    data: u64,
+};
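+
+// COMMIT: add an illustrative ABI spot check (a sketch); the kernel's
+// struct io_uring_rsrc_update is 16 bytes (u32 + u32 + u64).
+test "io_uring_rsrc_update ABI size" {
+    try std.testing.expect(@sizeOf(io_uring_rsrc_update) == 16);
+}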
+
+pub const io_uring_rsrc_update2 = extern struct {
+    offset: u32,
+    resv: u32,
+    data: u64,
+    tags: u64,
+    nr: u32,
+    resv2: u32,
+};
+
+pub const io_uring_probe_op = extern struct {
+    op: Op,
+    resv: u8,
+    /// IO_URING_OP_* flags; currently only IO_URING_OP_SUPPORTED is defined
+    flags: u16,
+    resv2: u32,
+
+    pub fn is_supported(self: @This()) bool {
+        return self.flags & Constants.IO_URING_OP_SUPPORTED != 0;
+    }
+};
+
+pub const io_uring_probe = extern struct {
+    /// Last opcode supported
+    last_op: Op,
+    /// Length of ops[] array below
+    ops_len: u8,
+    resv: u16,
+    resv2: [3]u32,
+    ops: [256]io_uring_probe_op,
+
+    /// Is the operation supported on the running kernel.
+    pub fn is_supported(self: @This(), op: Op) bool {
+        const i = @intFromEnum(op);
+        if (i > @intFromEnum(self.last_op) or i >= self.ops_len)
+            return false;
+        return self.ops[i].is_supported();
+    }
+};
+
+// COMMIT: fix definition of io_uring_restriction
+// (register_op is now a typed RegisterRestrictionOp field)
+pub const io_uring_restriction = extern struct {
+    opcode: RestrictionOp,
+    arg: extern union {
+        /// IORING_RESTRICTION_REGISTER_OP
+        register_op: RegisterRestrictionOp,
+        /// IORING_RESTRICTION_SQE_OP
+        sqe_op: Op,
+        /// IORING_RESTRICTION_SQE_FLAGS_*
+        sqe_flags: u8,
+    },
+    resv: u8,
+    resv2: [3]u32,
+};
+
+// COMMIT: add new struct type
+pub const io_uring_clock_register = extern struct {
+    clockid: u32,
+    __resv: [3]u32,
+};
+
+// COMMIT: add new struct type
+pub const io_uring_clone_buffers = extern struct {
+    src_fd: u32,
+    flags: u32,
+    src_off: u32,
+    dst_off: u32,
+    nr: u32,
+    pad: [3]u32,
+};
+
+pub const io_uring_buf = extern struct {
+    addr: u64,
+    len: u32,
+    bid: u16,
+    resv: u16,
+};
+
+pub const io_uring_buf_ring = extern struct {
+    resv1: u64,
+    resv2: u32,
+    resv3: u16,
+    tail: u16,
+};
+
+/// argument for IORING_(UN)REGISTER_PBUF_RING
+pub const io_uring_buf_reg = extern struct {
+    ring_addr: u64,
+    ring_entries: u32,
+    bgid: u16,
+    flags: RegisterPbufFlags,
+    resv: [3]u64,
+
+    // COMMIT: new IORING_REGISTER_PBUF_RING flags
+    /// Flags for IORING_REGISTER_PBUF_RING.
+    pub const RegisterPbufFlags = packed struct(u16) {
+        /// IOU_PBUF_RING_MMAP:
+        /// If set, the kernel will allocate the memory for the ring.
+        /// The application must not set a ring_addr in struct io_uring_buf_reg;
+        /// instead it must subsequently call mmap(2) with the offset set
+        /// as: IORING_OFF_PBUF_RING | (bgid << IORING_OFF_PBUF_SHIFT) to get
+        /// a virtual mapping for the ring.
+        IOU_PBUF_RING_MMAP: bool = false,
+        /// IOU_PBUF_RING_INC:
+        /// If set, buffers consumed from this buffer ring can be
+        /// consumed incrementally. Normally one (or more) buffers
+        /// are fully consumed. With incremental consumption, it's
+        /// feasible to register big ranges of buffers, and each
+        /// use of it will consume only as much as it needs. This
+        /// requires that both the kernel and application keep
+        /// track of where the current read/recv index is at.
+        IOU_PBUF_RING_INC: bool = false,
+        _: u14 = 0,
+    };
+};
+
+/// argument for IORING_REGISTER_PBUF_STATUS
+pub const io_uring_buf_status = extern struct {
+    /// input
+    buf_group: u32,
+    /// output
+    head: u32,
+    resv: [8]u32,
+};
+
+/// argument for IORING_(UN)REGISTER_NAPI
+pub const io_uring_napi = extern struct {
+    busy_poll_to: u32,
+    prefer_busy_poll: u8,
+    pad: [3]u8,
+    resv: u64,
+};
+
+// COMMIT: new struct type
+/// Argument for io_uring_enter(2) with
+/// IORING_GETEVENTS | IORING_ENTER_EXT_ARG_REG set, where the actual argument
+/// is an index into a previously registered fixed wait region described by
+/// the below structure.
+pub const io_uring_reg_wait = extern struct {
+    ts: linux.kernel_timespec,
+    min_wait_usec: u32,
+    flags: u32,
+    sigmask: u64,
+    sigmask_sz: u32,
+    pad: [3]u32,
+    pad2: [2]u64,
+};
+
+/// Argument for io_uring_enter(2) with IORING_GETEVENTS | IORING_ENTER_EXT_ARG
+pub const io_uring_getevents_arg = extern struct {
+    sigmask: u64,
+    sigmask_sz: u32,
+    pad: u32,
+    ts: u64,
+};
+
+/// Argument for IORING_REGISTER_SYNC_CANCEL
+pub const io_uring_sync_cancel_reg = extern struct {
+    addr: u64,
+    fd: i32,
+    flags: u32,
+    timeout: linux.kernel_timespec,
+    pad: [4]u64,
+};
+
+/// Argument for IORING_REGISTER_FILE_ALLOC_RANGE
+/// The range is specified as [off, off + len)
+pub const io_uring_file_index_range = extern struct {
+    off: u32,
+    len: u32,
+    resv: u64,
+};
+
+pub const io_uring_recvmsg_out = extern struct {
+    namelen: u32,
+    controllen: u32,
+    payloadlen: u32,
+    flags: u32,
+};
+
+/// Zero copy receive refill queue entry
+pub const io_uring_zcrx_rqe = extern struct {
+    off: u64,
+    len: u32,
+    __pad: u32,
+};
+
+pub const io_uring_zcrx_cqe = extern struct {
+    off: u64,
+    __pad: u64,
+};
+
+pub const io_uring_zcrx_offsets = extern struct {
+    head: u32,
+    tail: u32,
+    rqes: u32,
+    __resv2: u32,
+    __resv: [2]u64,
+};
+
+pub const io_uring_zcrx_area_reg = extern struct {
+    addr: u64,
+    len: u64,
+    rq_area_token: u64,
+    flags: zcrx_area_flags,
+    dmabuf_fd: u32,
+    __resv2: [2]u64,
+
+    pub const zcrx_area_flags = packed struct(u32) {
+        DMABUF: bool = false,
+        _: u31 = 0,
+    };
+};
+
+/// Argument for IORING_REGISTER_ZCRX_IFQ
+pub const io_uring_zcrx_ifq_reg = extern struct {
+    if_idx: u32,
+    if_rxq: u32,
+    rq_entries: u32,
+    flags: u32,
+    /// pointer to struct io_uring_zcrx_area_reg
+    area_ptr: u64,
+    /// struct io_uring_region_desc
+    region_ptr: u64,
+    offsets: io_uring_zcrx_offsets,
+    zcrx_id: u32,
+    __resv2: u32,
+    __resv: [3]u64,
+};
+
+pub const SocketOp = enum(u16) {
+    SIOCINQ,
+    SIOCOUTQ,
+    GETSOCKOPT,
+    SETSOCKOPT,
+    // COMMIT: new socket op
+    TX_TIMESTAMP,
+};
+
+/// io_uring_restriction.arg.register_op values
+pub const RegisterRestrictionOp = enum(u8) {
+    /// Allow an io_uring_register(2) opcode
+    REGISTER_OP = 0,
+
+    /// Allow an sqe opcode
+    SQE_OP = 1,
+
+    /// Allow sqe flags
+    SQE_FLAGS_ALLOWED = 2,
+
+    /// Require sqe flags (these flags must be set on each submission)
+    SQE_FLAGS_REQUIRED = 3,
+
+    _,
+};
+
+/// io_uring_restriction.opcode values
+pub const RestrictionOp = enum(u16) {
+    /// Allow an io_uring_register(2) opcode
+    REGISTER_OP = 0,
+    /// Allow an sqe opcode
+    SQE_OP = 1,
+    /// Allow sqe flags
+    SQE_FLAGS_ALLOWED = 2,
+    /// Require sqe flags (these flags must be set on each submission)
+    SQE_FLAGS_REQUIRED = 3,
+
+    _,
+};
+
+/// IORING_OP_MSG_RING command types, stored in sqe.addr
+pub const MsgRingCmd = enum {
+    /// pass sqe->len as 'res' and off as user_data
+    DATA,
+    /// send a registered fd to another ring
+    SEND_FD,
+};
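+
+// COMMIT: add an illustrative is_supported test (a sketch; the probe data is
+// fabricated here rather than queried from a live kernel via REGISTER_PROBE).
+test "io_uring_probe.is_supported" {
+    var probe: io_uring_probe = .{
+        .last_op = .LISTEN,
+        .ops_len = @intFromEnum(Op.LISTEN) + 1,
+        .resv = 0,
+        .resv2 = .{ 0, 0, 0 },
+        .ops = [_]io_uring_probe_op{.{ .op = .NOP, .resv = 0, .flags = 0, .resv2 = 0 }} ** 256,
+    };
+    probe.ops[@intFromEnum(Op.NOP)].flags = Constants.IO_URING_OP_SUPPORTED;
+    try std.testing.expect(probe.is_supported(.NOP));
+    // PIPE is newer than the fabricated last_op, so it must report unsupported.
+    try std.testing.expect(!probe.is_supported(.PIPE));
+}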
+
+// COMMIT: move the Op enum into IoUring
+pub const Op = enum(u8) {
+    NOP,
+    READV,
+    WRITEV,
+    FSYNC,
+    READ_FIXED,
+    WRITE_FIXED,
+    POLL_ADD,
+    POLL_REMOVE,
+    SYNC_FILE_RANGE,
+    SENDMSG,
+    RECVMSG,
+    TIMEOUT,
+    TIMEOUT_REMOVE,
+    ACCEPT,
+    ASYNC_CANCEL,
+    LINK_TIMEOUT,
+    CONNECT,
+    FALLOCATE,
+    OPENAT,
+    CLOSE,
+    FILES_UPDATE,
+    STATX,
+    READ,
+    WRITE,
+    FADVISE,
+    MADVISE,
+    SEND,
+    RECV,
+    EPOLL_CTL,
+    OPENAT2,
+    SPLICE,
+    PROVIDE_BUFFERS,
+    REMOVE_BUFFERS,
+    TEE,
+    SHUTDOWN,
+    RENAMEAT,
+    UNLINKAT,
+    MKDIRAT,
+    SYMLINKAT,
+    LINKAT,
+    MSG_RING,
+    FSETXATTR,
+    SETXATTR,
+    FGETXATTR,
+    GETXATTR,
+    SOCKET,
+    URING_CMD,
+    SEND_ZC,
+    SENDMSG_ZC,
+    READ_MULTISHOT,
+    WAITID,
+    FUTEX_WAIT,
+    FUTEX_WAKE,
+    FUTEX_WAITV,
+    FIXED_FD_INSTALL,
+    FTRUNCATE,
+    BIND,
+    LISTEN,
+    RECV_ZC,
+    // COMMIT: new OPs
+    // TODO: to be implemented
+    EPOLL_WAIT,
+    READV_FIXED,
+    WRITEV_FIXED,
+    PIPE,
+
+    _,
+};
 
 /// A friendly way to setup an io_uring, with default linux.io_uring_params.
 /// `entries` must be a power of two between 1 and 32768, although the kernel will make the final
 /// call on how many entries the submission and completion queues will ultimately have,
 /// see https://github.com/torvalds/linux/blob/v5.8/fs/io_uring.c#L8027-L8050.
 /// Matches the interface of io_uring_queue_init() in liburing.
-pub fn init(entries: u16, flags: u32) !IoUring {
+pub fn init(entries: u16, flags: Flags.Setup) !IoUring {
     var params = mem.zeroInit(linux.io_uring_params, .{
-        .flags = flags,
+        .flags = @bitCast(flags),
         .sq_thread_idle = 1000,
     });
     return try IoUring.init_params(entries, &params);
 }
@@ -36,15 +1666,21 @@ pub fn init(entries: u16, flags: u32) !IoUring {
 pub fn init_params(entries: u16, p: *linux.io_uring_params) !IoUring {
     if (entries == 0) return error.EntriesZero;
     if (!std.math.isPowerOfTwo(entries)) return error.EntriesNotPowerOfTwo;
-    assert(p.sq_entries == 0);
-    assert(p.cq_entries == 0 or p.flags & linux.IORING_SETUP_CQSIZE != 0);
     assert(p.features == 0);
-    assert(p.wq_fd == 0 or p.flags & linux.IORING_SETUP_ATTACH_WQ != 0);
     assert(p.resv[0] == 0);
     assert(p.resv[1] == 0);
     assert(p.resv[2] == 0);
 
+    const flags: Flags.Setup = @bitCast(p.flags);
+    assert(p.cq_entries == 0 or flags.CQSIZE);
+    assert(p.wq_fd == 0 or flags.ATTACH_WQ);
+
+    // Flag compatibility: the task-run flags are meaningless with SQPOLL,
+    // SQ_AFF only makes sense with SQPOLL, and DEFER_TASKRUN requires
+    // SINGLE_ISSUER.
+    assert(!flags.SQPOLL or !(flags.COOP_TASKRUN or flags.TASKRUN_FLAG or flags.DEFER_TASKRUN));
+    assert(!flags.SQ_AFF or flags.SQPOLL);
+    assert(!flags.DEFER_TASKRUN or flags.SINGLE_ISSUER);
+
     const res = linux.io_uring_setup(entries, p);
     switch (linux.errno(res)) {
         .SUCCESS => {},
@@ -66,6 +1702,7 @@ pub fn init_params(entries: u16, p: *linux.io_uring_params) !IoUring {
     assert(fd >= 0);
     errdefer posix.close(fd);
 
+    const features: Flags.Features = @bitCast(p.features);
     // Kernel versions 5.4 and up use only one mmap() for the submission and completion queues.
     // This is not an optional feature for us... if the kernel does it, we have to do it.
     // The thinking on this by the kernel developers was that both the submission and the
     // completion queue rings have sizes just over a power of two, but the submission queue ring
     // is significantly smaller with only the offset array. By bundling both in a single mmap,
     // the kernel gets the submission queue ring for free.
     // See https://patchwork.kernel.org/patch/11115257 for the kernel patch.
     // We do not support the double mmap() done before 5.4, because we want to keep the
     // init/deinit mmap paths simple and because io_uring has had many bug fixes even since 5.4.
- if ((p.features & linux.IORING_FEAT_SINGLE_MMAP) == 0) { + if (!features.SINGLE_MMAP) { return error.SystemOutdated; } @@ -108,12 +1745,12 @@ pub fn init_params(entries: u16, p: *linux.io_uring_params) !IoUring { assert(cq.overflow.* == 0); assert(cq.cqes.len == p.cq_entries); - return IoUring{ + return .{ .fd = fd, .sq = sq, .cq = cq, - .flags = p.flags, - .features = p.features, + .flags = flags, + .features = features, }; } @@ -133,7 +1770,7 @@ pub fn deinit(self: *IoUring) void { /// and the null return in liburing is more a C idiom than anything else, for lack of a better /// alternative. In Zig, we have first-class error handling... so let's use it. /// Matches the implementation of io_uring_get_sqe() in liburing. -pub fn get_sqe(self: *IoUring) !*linux.io_uring_sqe { +pub fn get_sqe(self: *IoUring) !*io_uring_sqe { const head = @atomicLoad(u32, self.sq.head, .acquire); // Remember that these head and tail offsets wrap around every four billion operations. // We must therefore use wrapping addition and subtraction to avoid a runtime crash. @@ -160,10 +1797,10 @@ pub fn submit(self: *IoUring) !u32 { /// Matches the implementation of io_uring_submit_and_wait() in liburing. pub fn submit_and_wait(self: *IoUring, wait_nr: u32) !u32 { const submitted = self.flush_sq(); - var flags: u32 = 0; + var flags: Flags.Enter = .{}; if (self.sq_ring_needs_enter(&flags) or wait_nr > 0) { - if (wait_nr > 0 or (self.flags & linux.IORING_SETUP_IOPOLL) != 0) { - flags |= linux.IORING_ENTER_GETEVENTS; + if (wait_nr > 0 or self.flags.IOPOLL) { + flags.GETEVENTS = true; } return try self.enter(submitted, wait_nr, flags); } @@ -172,7 +1809,7 @@ pub fn submit_and_wait(self: *IoUring, wait_nr: u32) !u32 { /// Tell the kernel we have submitted SQEs and/or want to wait for CQEs. /// Returns the number of SQEs submitted. -pub fn enter(self: *IoUring, to_submit: u32, min_complete: u32, flags: u32) !u32 { +pub fn enter(self: *IoUring, to_submit: u32, min_complete: u32, flags: Flags.Enter) !u32 { assert(self.fd >= 0); const res = linux.io_uring_enter(self.fd, to_submit, min_complete, flags, null); switch (linux.errno(res)) { @@ -232,11 +1869,11 @@ pub fn flush_sq(self: *IoUring) u32 { /// or if IORING_SQ_NEED_WAKEUP is set and the SQ thread must be explicitly awakened. /// For the latter case, we set the SQ thread wakeup flag. /// Matches the implementation of sq_ring_needs_enter() in liburing. -pub fn sq_ring_needs_enter(self: *IoUring, flags: *u32) bool { - assert(flags.* == 0); - if ((self.flags & linux.IORING_SETUP_SQPOLL) == 0) return true; +pub fn sq_ring_needs_enter(self: *IoUring, flags: *Flags.Enter) bool { + assert(@as(u32, @bitCast(flags.*)) == 0); + if (!self.flags.SQPOLL) return true; if ((@atomicLoad(u32, self.sq.flags, .unordered) & linux.IORING_SQ_NEED_WAKEUP) != 0) { - flags.* |= linux.IORING_ENTER_SQ_WAKEUP; + flags.*.SQ_WAKEUP = true; return true; } return false; @@ -269,7 +1906,7 @@ pub fn cq_ready(self: *IoUring) u32 { /// Faster, because we can now amortize the atomic store release to `cq.head` across the batch. /// See https://github.com/axboe/liburing/issues/103#issuecomment-686665007. /// Matches the implementation of io_uring_peek_batch_cqe() in liburing, but supports waiting. 
-pub fn copy_cqes(self: *IoUring, cqes: []linux.io_uring_cqe, wait_nr: u32) !u32 { +pub fn copy_cqes(self: *IoUring, cqes: []io_uring_cqe, wait_nr: u32) !u32 { const count = self.copy_cqes_ready(cqes); if (count > 0) return count; if (self.cq_ring_needs_flush() or wait_nr > 0) { @@ -279,7 +1916,7 @@ pub fn copy_cqes(self: *IoUring, cqes: []linux.io_uring_cqe, wait_nr: u32) !u32 return 0; } -fn copy_cqes_ready(self: *IoUring, cqes: []linux.io_uring_cqe) u32 { +fn copy_cqes_ready(self: *IoUring, cqes: []io_uring_cqe) u32 { const ready = self.cq_ready(); const count = @min(cqes.len, ready); const head = self.cq.head.* & self.cq.mask; @@ -300,8 +1937,8 @@ fn copy_cqes_ready(self: *IoUring, cqes: []linux.io_uring_cqe) u32 { /// Returns a copy of an I/O completion, waiting for it if necessary, and advancing the CQ ring. /// A convenience method for `copy_cqes()` for when you don't need to batch or peek. -pub fn copy_cqe(ring: *IoUring) !linux.io_uring_cqe { - var cqes: [1]linux.io_uring_cqe = undefined; +pub fn copy_cqe(ring: *IoUring) !io_uring_cqe { + var cqes: [1]io_uring_cqe = undefined; while (true) { const count = try ring.copy_cqes(&cqes, 1); if (count > 0) return cqes[0]; @@ -318,7 +1955,7 @@ pub fn cq_ring_needs_flush(self: *IoUring) bool { /// Must be called exactly once after a zero-copy CQE has been processed by your application. /// Not idempotent, calling more than once will result in other CQEs being lost. /// Matches the implementation of cqe_seen() in liburing. -pub fn cqe_seen(self: *IoUring, cqe: *linux.io_uring_cqe) void { +pub fn cqe_seen(self: *IoUring, cqe: *io_uring_cqe) void { _ = cqe; self.cq_advance(1); } @@ -341,7 +1978,7 @@ pub fn cq_advance(self: *IoUring, count: u32) void { /// apply to the write, since the fsync may complete before the write is issued to the disk. /// You should preferably use `link_with_next_sqe()` on a write's SQE to link it with an fsync, /// or else insert a full write barrier using `drain_previous_sqes()` when queueing an fsync. -pub fn fsync(self: *IoUring, user_data: u64, fd: linux.fd_t, flags: u32) !*linux.io_uring_sqe { +pub fn fsync(self: *IoUring, user_data: u64, fd: posix.fd_t, flags: Flags.Fsync) !*io_uring_sqe { const sqe = try self.get_sqe(); sqe.prep_fsync(fd, flags); sqe.user_data = user_data; @@ -353,7 +1990,7 @@ pub fn fsync(self: *IoUring, user_data: u64, fd: linux.fd_t, flags: u32) !*linux /// A no-op is more useful than may appear at first glance. /// For example, you could call `drain_previous_sqes()` on the returned SQE, to use the no-op to /// know when the ring is idle before acting on a kill signal. -pub fn nop(self: *IoUring, user_data: u64) !*linux.io_uring_sqe { +pub fn nop(self: *IoUring, user_data: u64) !*io_uring_sqe { const sqe = try self.get_sqe(); sqe.prep_nop(); sqe.user_data = user_data; @@ -389,7 +2026,7 @@ pub fn read( fd: linux.fd_t, buffer: ReadBuffer, offset: u64, -) !*linux.io_uring_sqe { +) !*io_uring_sqe { const sqe = try self.get_sqe(); switch (buffer) { .buffer => |slice| sqe.prep_read(fd, slice, offset), @@ -412,7 +2049,7 @@ pub fn write( fd: linux.fd_t, buffer: []const u8, offset: u64, -) !*linux.io_uring_sqe { +) !*io_uring_sqe { const sqe = try self.get_sqe(); sqe.prep_write(fd, buffer, offset); sqe.user_data = user_data; @@ -433,7 +2070,7 @@ pub fn write( /// See https://github.com/axboe/liburing/issues/291 /// /// Returns a pointer to the SQE so that you can further modify the SQE for advanced use cases. 
-pub fn splice(self: *IoUring, user_data: u64, fd_in: linux.fd_t, off_in: u64, fd_out: linux.fd_t, off_out: u64, len: usize) !*linux.io_uring_sqe { +pub fn splice(self: *IoUring, user_data: u64, fd_in: posix.fd_t, off_in: u64, fd_out: posix.fd_t, off_out: u64, len: usize) !*io_uring_sqe { const sqe = try self.get_sqe(); sqe.prep_splice(fd_in, off_in, fd_out, off_out, len); sqe.user_data = user_data; @@ -452,7 +2089,7 @@ pub fn read_fixed( buffer: *posix.iovec, offset: u64, buffer_index: u16, -) !*linux.io_uring_sqe { +) !*io_uring_sqe { const sqe = try self.get_sqe(); sqe.prep_read_fixed(fd, buffer, offset, buffer_index); sqe.user_data = user_data; @@ -469,7 +2106,7 @@ pub fn writev( fd: linux.fd_t, iovecs: []const posix.iovec_const, offset: u64, -) !*linux.io_uring_sqe { +) !*io_uring_sqe { const sqe = try self.get_sqe(); sqe.prep_writev(fd, iovecs, offset); sqe.user_data = user_data; @@ -488,7 +2125,7 @@ pub fn write_fixed( buffer: *posix.iovec, offset: u64, buffer_index: u16, -) !*linux.io_uring_sqe { +) !*io_uring_sqe { const sqe = try self.get_sqe(); sqe.prep_write_fixed(fd, buffer, offset, buffer_index); sqe.user_data = user_data; @@ -505,7 +2142,7 @@ pub fn accept( addr: ?*posix.sockaddr, addrlen: ?*posix.socklen_t, flags: u32, -) !*linux.io_uring_sqe { +) !*io_uring_sqe { const sqe = try self.get_sqe(); sqe.prep_accept(fd, addr, addrlen, flags); sqe.user_data = user_data; @@ -527,7 +2164,7 @@ pub fn accept_multishot( addr: ?*posix.sockaddr, addrlen: ?*posix.socklen_t, flags: u32, -) !*linux.io_uring_sqe { +) !*io_uring_sqe { const sqe = try self.get_sqe(); sqe.prep_multishot_accept(fd, addr, addrlen, flags); sqe.user_data = user_data; @@ -552,7 +2189,7 @@ pub fn accept_direct( addr: ?*posix.sockaddr, addrlen: ?*posix.socklen_t, flags: u32, -) !*linux.io_uring_sqe { +) !*io_uring_sqe { const sqe = try self.get_sqe(); sqe.prep_accept_direct(fd, addr, addrlen, flags, linux.IORING_FILE_INDEX_ALLOC); sqe.user_data = user_data; @@ -568,7 +2205,7 @@ pub fn accept_multishot_direct( addr: ?*posix.sockaddr, addrlen: ?*posix.socklen_t, flags: u32, -) !*linux.io_uring_sqe { +) !*io_uring_sqe { const sqe = try self.get_sqe(); sqe.prep_multishot_accept_direct(fd, addr, addrlen, flags); sqe.user_data = user_data; @@ -583,7 +2220,7 @@ pub fn connect( fd: linux.fd_t, addr: *const posix.sockaddr, addrlen: posix.socklen_t, -) !*linux.io_uring_sqe { +) !*io_uring_sqe { const sqe = try self.get_sqe(); sqe.prep_connect(fd, addr, addrlen); sqe.user_data = user_data; @@ -599,7 +2236,7 @@ pub fn epoll_ctl( fd: linux.fd_t, op: u32, ev: ?*linux.epoll_event, -) !*linux.io_uring_sqe { +) !*io_uring_sqe { const sqe = try self.get_sqe(); sqe.prep_epoll_ctl(epfd, fd, op, ev); sqe.user_data = user_data; @@ -629,7 +2266,7 @@ pub fn recv( fd: linux.fd_t, buffer: RecvBuffer, flags: u32, -) !*linux.io_uring_sqe { +) !*io_uring_sqe { const sqe = try self.get_sqe(); switch (buffer) { .buffer => |slice| sqe.prep_recv(fd, slice, flags), @@ -653,7 +2290,7 @@ pub fn send( fd: linux.fd_t, buffer: []const u8, flags: u32, -) !*linux.io_uring_sqe { +) !*io_uring_sqe { const sqe = try self.get_sqe(); sqe.prep_send(fd, buffer, flags); sqe.user_data = user_data; @@ -682,7 +2319,7 @@ pub fn send_zc( buffer: []const u8, send_flags: u32, zc_flags: u16, -) !*linux.io_uring_sqe { +) !*io_uring_sqe { const sqe = try self.get_sqe(); sqe.prep_send_zc(fd, buffer, send_flags, zc_flags); sqe.user_data = user_data; @@ -700,7 +2337,7 @@ pub fn send_zc_fixed( send_flags: u32, zc_flags: u16, buf_index: u16, -) !*linux.io_uring_sqe { +) 
!*io_uring_sqe { const sqe = try self.get_sqe(); sqe.prep_send_zc_fixed(fd, buffer, send_flags, zc_flags, buf_index); sqe.user_data = user_data; @@ -716,7 +2353,7 @@ pub fn recvmsg( fd: linux.fd_t, msg: *linux.msghdr, flags: u32, -) !*linux.io_uring_sqe { +) !*io_uring_sqe { const sqe = try self.get_sqe(); sqe.prep_recvmsg(fd, msg, flags); sqe.user_data = user_data; @@ -732,7 +2369,7 @@ pub fn sendmsg( fd: linux.fd_t, msg: *const linux.msghdr_const, flags: u32, -) !*linux.io_uring_sqe { +) !*io_uring_sqe { const sqe = try self.get_sqe(); sqe.prep_sendmsg(fd, msg, flags); sqe.user_data = user_data; @@ -748,7 +2385,7 @@ pub fn sendmsg_zc( fd: linux.fd_t, msg: *const linux.msghdr_const, flags: u32, -) !*linux.io_uring_sqe { +) !*io_uring_sqe { const sqe = try self.get_sqe(); sqe.prep_sendmsg_zc(fd, msg, flags); sqe.user_data = user_data; @@ -765,7 +2402,7 @@ pub fn openat( path: [*:0]const u8, flags: linux.O, mode: posix.mode_t, -) !*linux.io_uring_sqe { +) !*io_uring_sqe { const sqe = try self.get_sqe(); sqe.prep_openat(fd, path, flags, mode); sqe.user_data = user_data; @@ -791,7 +2428,7 @@ pub fn openat_direct( flags: linux.O, mode: posix.mode_t, file_index: u32, -) !*linux.io_uring_sqe { +) !*io_uring_sqe { const sqe = try self.get_sqe(); sqe.prep_openat_direct(fd, path, flags, mode, file_index); sqe.user_data = user_data; @@ -801,7 +2438,7 @@ pub fn openat_direct( /// Queues (but does not submit) an SQE to perform a `close(2)`. /// Returns a pointer to the SQE. /// Available since 5.6. -pub fn close(self: *IoUring, user_data: u64, fd: linux.fd_t) !*linux.io_uring_sqe { +pub fn close(self: *IoUring, user_data: u64, fd: posix.fd_t) !*io_uring_sqe { const sqe = try self.get_sqe(); sqe.prep_close(fd); sqe.user_data = user_data; @@ -810,7 +2447,7 @@ pub fn close(self: *IoUring, user_data: u64, fd: linux.fd_t) !*linux.io_uring_sq /// Queues close of registered file descriptor. 
/// Available since 5.15 -pub fn close_direct(self: *IoUring, user_data: u64, file_index: u32) !*linux.io_uring_sqe { +pub fn close_direct(self: *IoUring, user_data: u64, file_index: u32) !*io_uring_sqe { const sqe = try self.get_sqe(); sqe.prep_close_direct(file_index); sqe.user_data = user_data; @@ -836,7 +2473,7 @@ pub fn timeout( ts: *const linux.kernel_timespec, count: u32, flags: u32, -) !*linux.io_uring_sqe { +) !*io_uring_sqe { const sqe = try self.get_sqe(); sqe.prep_timeout(ts, count, flags); sqe.user_data = user_data; @@ -856,7 +2493,7 @@ pub fn timeout_remove( user_data: u64, timeout_user_data: u64, flags: u32, -) !*linux.io_uring_sqe { +) !*io_uring_sqe { const sqe = try self.get_sqe(); sqe.prep_timeout_remove(timeout_user_data, flags); sqe.user_data = user_data; @@ -884,7 +2521,7 @@ pub fn link_timeout( user_data: u64, ts: *const linux.kernel_timespec, flags: u32, -) !*linux.io_uring_sqe { +) !*io_uring_sqe { const sqe = try self.get_sqe(); sqe.prep_link_timeout(ts, flags); sqe.user_data = user_data; @@ -898,7 +2535,7 @@ pub fn poll_add( user_data: u64, fd: linux.fd_t, poll_mask: u32, -) !*linux.io_uring_sqe { +) !*io_uring_sqe { const sqe = try self.get_sqe(); sqe.prep_poll_add(fd, poll_mask); sqe.user_data = user_data; @@ -911,7 +2548,7 @@ pub fn poll_remove( self: *IoUring, user_data: u64, target_user_data: u64, -) !*linux.io_uring_sqe { +) !*io_uring_sqe { const sqe = try self.get_sqe(); sqe.prep_poll_remove(target_user_data); sqe.user_data = user_data; @@ -927,7 +2564,7 @@ pub fn poll_update( new_user_data: u64, poll_mask: u32, flags: u32, -) !*linux.io_uring_sqe { +) !*io_uring_sqe { const sqe = try self.get_sqe(); sqe.prep_poll_update(old_user_data, new_user_data, poll_mask, flags); sqe.user_data = user_data; @@ -943,7 +2580,7 @@ pub fn fallocate( mode: i32, offset: u64, len: u64, -) !*linux.io_uring_sqe { +) !*io_uring_sqe { const sqe = try self.get_sqe(); sqe.prep_fallocate(fd, mode, offset, len); sqe.user_data = user_data; @@ -960,7 +2597,7 @@ pub fn statx( flags: u32, mask: u32, buf: *linux.Statx, -) !*linux.io_uring_sqe { +) !*io_uring_sqe { const sqe = try self.get_sqe(); sqe.prep_statx(fd, path, flags, mask, buf); sqe.user_data = user_data; @@ -980,7 +2617,7 @@ pub fn cancel( user_data: u64, cancel_user_data: u64, flags: u32, -) !*linux.io_uring_sqe { +) !*io_uring_sqe { const sqe = try self.get_sqe(); sqe.prep_cancel(cancel_user_data, flags); sqe.user_data = user_data; @@ -996,7 +2633,7 @@ pub fn shutdown( user_data: u64, sockfd: posix.socket_t, how: u32, -) !*linux.io_uring_sqe { +) !*io_uring_sqe { const sqe = try self.get_sqe(); sqe.prep_shutdown(sockfd, how); sqe.user_data = user_data; @@ -1013,7 +2650,7 @@ pub fn renameat( new_dir_fd: linux.fd_t, new_path: [*:0]const u8, flags: u32, -) !*linux.io_uring_sqe { +) !*io_uring_sqe { const sqe = try self.get_sqe(); sqe.prep_renameat(old_dir_fd, old_path, new_dir_fd, new_path, flags); sqe.user_data = user_data; @@ -1028,7 +2665,7 @@ pub fn unlinkat( dir_fd: linux.fd_t, path: [*:0]const u8, flags: u32, -) !*linux.io_uring_sqe { +) !*io_uring_sqe { const sqe = try self.get_sqe(); sqe.prep_unlinkat(dir_fd, path, flags); sqe.user_data = user_data; @@ -1043,7 +2680,7 @@ pub fn mkdirat( dir_fd: linux.fd_t, path: [*:0]const u8, mode: posix.mode_t, -) !*linux.io_uring_sqe { +) !*io_uring_sqe { const sqe = try self.get_sqe(); sqe.prep_mkdirat(dir_fd, path, mode); sqe.user_data = user_data; @@ -1058,7 +2695,7 @@ pub fn symlinkat( target: [*:0]const u8, new_dir_fd: linux.fd_t, link_path: [*:0]const u8, -) 
!*linux.io_uring_sqe { +) !*io_uring_sqe { const sqe = try self.get_sqe(); sqe.prep_symlinkat(target, new_dir_fd, link_path); sqe.user_data = user_data; @@ -1075,7 +2712,7 @@ pub fn linkat( new_dir_fd: linux.fd_t, new_path: [*:0]const u8, flags: u32, -) !*linux.io_uring_sqe { +) !*io_uring_sqe { const sqe = try self.get_sqe(); sqe.prep_linkat(old_dir_fd, old_path, new_dir_fd, new_path, flags); sqe.user_data = user_data; @@ -1096,7 +2733,7 @@ pub fn provide_buffers( buffers_count: usize, group_id: usize, buffer_id: usize, -) !*linux.io_uring_sqe { +) !*io_uring_sqe { const sqe = try self.get_sqe(); sqe.prep_provide_buffers(buffers, buffer_size, buffers_count, group_id, buffer_id); sqe.user_data = user_data; @@ -1110,7 +2747,7 @@ pub fn remove_buffers( user_data: u64, buffers_count: usize, group_id: usize, -) !*linux.io_uring_sqe { +) !*io_uring_sqe { const sqe = try self.get_sqe(); sqe.prep_remove_buffers(buffers_count, group_id); sqe.user_data = user_data; @@ -1127,7 +2764,7 @@ pub fn waitid( infop: *linux.siginfo_t, options: u32, flags: u32, -) !*linux.io_uring_sqe { +) !*io_uring_sqe { const sqe = try self.get_sqe(); sqe.prep_waitid(id_type, id, infop, options, flags); sqe.user_data = user_data; @@ -1358,7 +2995,7 @@ pub fn socket( socket_type: u32, protocol: u32, flags: u32, -) !*linux.io_uring_sqe { +) !*io_uring_sqe { const sqe = try self.get_sqe(); sqe.prep_socket(domain, socket_type, protocol, flags); sqe.user_data = user_data; @@ -1375,7 +3012,7 @@ pub fn socket_direct( protocol: u32, flags: u32, file_index: u32, -) !*linux.io_uring_sqe { +) !*io_uring_sqe { const sqe = try self.get_sqe(); sqe.prep_socket_direct(domain, socket_type, protocol, flags, file_index); sqe.user_data = user_data; @@ -1392,7 +3029,7 @@ pub fn socket_direct_alloc( socket_type: u32, protocol: u32, flags: u32, -) !*linux.io_uring_sqe { +) !*io_uring_sqe { const sqe = try self.get_sqe(); sqe.prep_socket_direct_alloc(domain, socket_type, protocol, flags); sqe.user_data = user_data; @@ -1409,7 +3046,7 @@ pub fn bind( addr: *const posix.sockaddr, addrlen: posix.socklen_t, flags: u32, -) !*linux.io_uring_sqe { +) !*io_uring_sqe { const sqe = try self.get_sqe(); sqe.prep_bind(fd, addr, addrlen, flags); sqe.user_data = user_data; @@ -1425,7 +3062,7 @@ pub fn listen( fd: linux.fd_t, backlog: usize, flags: u32, -) !*linux.io_uring_sqe { +) !*io_uring_sqe { const sqe = try self.get_sqe(); sqe.prep_listen(fd, backlog, flags); sqe.user_data = user_data; @@ -1444,7 +3081,7 @@ pub fn cmd_sock( optname: u32, // linux.SO optval: u64, // pointer to the option value optlen: u32, // size of the option value -) !*linux.io_uring_sqe { +) !*io_uring_sqe { const sqe = try self.get_sqe(); sqe.prep_cmd_sock(cmd_op, fd, level, optname, optval, optlen); sqe.user_data = user_data; @@ -1461,7 +3098,7 @@ pub fn setsockopt( level: u32, // linux.SOL optname: u32, // linux.SO opt: []const u8, -) !*linux.io_uring_sqe { +) !*io_uring_sqe { return try self.cmd_sock( user_data, .SETSOCKOPT, @@ -1483,7 +3120,7 @@ pub fn getsockopt( level: u32, // linux.SOL optname: u32, // linux.SO opt: []u8, -) !*linux.io_uring_sqe { +) !*io_uring_sqe { return try self.cmd_sock( user_data, .GETSOCKOPT, @@ -1502,7 +3139,7 @@ pub const SubmissionQueue = struct { flags: *u32, dropped: *u32, array: []u32, - sqes: []linux.io_uring_sqe, + sqes: []io_uring_sqe, mmap: []align(page_size_min) u8, mmap_sqes: []align(page_size_min) u8, @@ -1518,7 +3155,7 @@ pub const SubmissionQueue = struct { assert((p.features & linux.IORING_FEAT_SINGLE_MMAP) != 0); const size = @max( 
p.sq_off.array + p.sq_entries * @sizeOf(u32), - p.cq_off.cqes + p.cq_entries * @sizeOf(linux.io_uring_cqe), + p.cq_off.cqes + p.cq_entries * @sizeOf(io_uring_cqe), ); const mmap = try posix.mmap( null, @@ -1532,8 +3169,8 @@ pub const SubmissionQueue = struct { assert(mmap.len == size); // The motivation for the `sqes` and `array` indirection is to make it possible for the - // application to preallocate static linux.io_uring_sqe entries and then replay them when needed. - const size_sqes = p.sq_entries * @sizeOf(linux.io_uring_sqe); + // application to preallocate static io_uring_sqe entries and then replay them when needed. + const size_sqes = p.sq_entries * @sizeOf(io_uring_sqe); const mmap_sqes = try posix.mmap( null, size_sqes, @@ -1546,11 +3183,11 @@ pub const SubmissionQueue = struct { assert(mmap_sqes.len == size_sqes); const array: [*]u32 = @ptrCast(@alignCast(&mmap[p.sq_off.array])); - const sqes: [*]linux.io_uring_sqe = @ptrCast(@alignCast(&mmap_sqes[0])); + const sqes: [*]io_uring_sqe = @ptrCast(@alignCast(&mmap_sqes[0])); // We expect the kernel copies p.sq_entries to the u32 pointed to by p.sq_off.ring_entries, // see https://github.com/torvalds/linux/blob/v5.8/fs/io_uring.c#L7843-L7844. assert(p.sq_entries == @as(*u32, @ptrCast(@alignCast(&mmap[p.sq_off.ring_entries]))).*); - return SubmissionQueue{ + return .{ .head = @ptrCast(@alignCast(&mmap[p.sq_off.head])), .tail = @ptrCast(@alignCast(&mmap[p.sq_off.tail])), .mask = @as(*u32, @ptrCast(@alignCast(&mmap[p.sq_off.ring_mask]))).*, @@ -1574,13 +3211,13 @@ pub const CompletionQueue = struct { tail: *u32, mask: u32, overflow: *u32, - cqes: []linux.io_uring_cqe, + cqes: []io_uring_cqe, pub fn init(fd: linux.fd_t, p: linux.io_uring_params, sq: SubmissionQueue) !CompletionQueue { assert(fd >= 0); assert((p.features & linux.IORING_FEAT_SINGLE_MMAP) != 0); const mmap = sq.mmap; - const cqes: [*]linux.io_uring_cqe = @ptrCast(@alignCast(&mmap[p.cq_off.cqes])); + const cqes: [*]io_uring_cqe = @ptrCast(@alignCast(&mmap[p.cq_off.cqes])); assert(p.cq_entries == @as(*u32, @ptrCast(@alignCast(&mmap[p.cq_off.ring_entries]))).*); return CompletionQueue{ .head = @ptrCast(@alignCast(&mmap[p.cq_off.head])), @@ -1677,7 +3314,7 @@ pub const BufferGroup = struct { } // Prepare recv operation which will select buffer from this group. - pub fn recv(self: *BufferGroup, user_data: u64, fd: linux.fd_t, flags: u32) !*linux.io_uring_sqe { + pub fn recv(self: *BufferGroup, user_data: u64, fd: posix.fd_t, flags: u32) !*io_uring_sqe { var sqe = try self.ring.get_sqe(); sqe.prep_rw(.RECV, fd, 0, 0, 0); sqe.rw_flags = flags; @@ -1688,7 +3325,7 @@ pub const BufferGroup = struct { } // Prepare multishot recv operation which will select buffer from this group. - pub fn recv_multishot(self: *BufferGroup, user_data: u64, fd: linux.fd_t, flags: u32) !*linux.io_uring_sqe { + pub fn recv_multishot(self: *BufferGroup, user_data: u64, fd: posix.fd_t, flags: u32) !*io_uring_sqe { var sqe = try self.recv(user_data, fd, flags); sqe.ioprio |= linux.IORING_RECV_MULTISHOT; return sqe; @@ -1701,14 +3338,14 @@ pub const BufferGroup = struct { } // Get buffer by CQE. - pub fn get(self: *BufferGroup, cqe: linux.io_uring_cqe) ![]u8 { + pub fn get(self: *BufferGroup, cqe: io_uring_cqe) ![]u8 { const buffer_id = try cqe.buffer_id(); const used_len = @as(usize, @intCast(cqe.res)); return self.get_by_id(buffer_id)[0..used_len]; } // Release buffer from CQE to the kernel. 
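As a sketch of the receive cycle this buffer-group API implies (not from the patch; an initialized `ring`, a `buf_grp: BufferGroup`, and a connected `fd` are assumed), `get` above and the `put` defined next pair one-to-one per CQE:

_ = try buf_grp.recv(0x10, fd, 0); // kernel selects a buffer from the group
_ = try ring.submit();
const cqe = try ring.copy_cqe();
const data = try buf_grp.get(cqe); // payload slice of the selected buffer
// ... consume `data` before releasing the buffer ...
try buf_grp.put(cqe); // return the buffer to the kernel exactly once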
- pub fn put(self: *BufferGroup, cqe: linux.io_uring_cqe) !void { + pub fn put(self: *BufferGroup, cqe: io_uring_cqe) !void { const buffer_id = try cqe.buffer_id(); if (cqe.flags & linux.IORING_CQE_F_BUF_MORE == linux.IORING_CQE_F_BUF_MORE) { // Incremental consumption active, kernel will write to the this buffer again @@ -1852,8 +3489,8 @@ test "structs/offsets/entries" { if (!is_linux) return error.SkipZigTest; try testing.expectEqual(@as(usize, 120), @sizeOf(linux.io_uring_params)); - try testing.expectEqual(@as(usize, 64), @sizeOf(linux.io_uring_sqe)); - try testing.expectEqual(@as(usize, 16), @sizeOf(linux.io_uring_cqe)); + try testing.expectEqual(@as(usize, 64), @sizeOf(io_uring_sqe)); + try testing.expectEqual(@as(usize, 16), @sizeOf(io_uring_cqe)); try testing.expectEqual(0, linux.IORING_OFF_SQ_RING); try testing.expectEqual(0x8000000, linux.IORING_OFF_CQ_RING); @@ -1877,7 +3514,7 @@ test "nop" { } const sqe = try ring.nop(0xaaaaaaaa); - try testing.expectEqual(linux.io_uring_sqe{ + try testing.expectEqual(io_uring_sqe{ .opcode = .NOP, .flags = 0, .ioprio = 0, @@ -1908,7 +3545,7 @@ test "nop" { try testing.expectEqual(@as(u32, 0), ring.cq.head.*); try testing.expectEqual(@as(u32, 0), ring.sq_ready()); - try testing.expectEqual(linux.io_uring_cqe{ + try testing.expectEqual(io_uring_cqe{ .user_data = 0xaaaaaaaa, .res = 0, .flags = 0, @@ -1919,7 +3556,7 @@ test "nop" { const sqe_barrier = try ring.nop(0xbbbbbbbb); sqe_barrier.flags |= linux.IOSQE_IO_DRAIN; try testing.expectEqual(@as(u32, 1), try ring.submit()); - try testing.expectEqual(linux.io_uring_cqe{ + try testing.expectEqual(io_uring_cqe{ .user_data = 0xbbbbbbbb, .res = 0, .flags = 0, @@ -1957,12 +3594,12 @@ test "readv" { var buffer = [_]u8{42} ** 128; var iovecs = [_]posix.iovec{posix.iovec{ .base = &buffer, .len = buffer.len }}; const sqe = try ring.read(0xcccccccc, fd_index, .{ .iovecs = iovecs[0..] }, 0); - try testing.expectEqual(linux.IORING_OP.READV, sqe.opcode); + try testing.expectEqual(Op.READV, sqe.opcode); sqe.flags |= linux.IOSQE_FIXED_FILE; try testing.expectError(error.SubmissionQueueFull, ring.nop(0)); try testing.expectEqual(@as(u32, 1), try ring.submit()); - try testing.expectEqual(linux.io_uring_cqe{ + try testing.expectEqual(io_uring_cqe{ .user_data = 0xcccccccc, .res = buffer.len, .flags = 0, @@ -2000,17 +3637,17 @@ test "writev/fsync/readv" { }; const sqe_writev = try ring.writev(0xdddddddd, fd, iovecs_write[0..], 17); - try testing.expectEqual(linux.IORING_OP.WRITEV, sqe_writev.opcode); + try testing.expectEqual(Op.WRITEV, sqe_writev.opcode); try testing.expectEqual(@as(u64, 17), sqe_writev.off); sqe_writev.flags |= linux.IOSQE_IO_LINK; const sqe_fsync = try ring.fsync(0xeeeeeeee, fd, 0); - try testing.expectEqual(linux.IORING_OP.FSYNC, sqe_fsync.opcode); + try testing.expectEqual(Op.FSYNC, sqe_fsync.opcode); try testing.expectEqual(fd, sqe_fsync.fd); sqe_fsync.flags |= linux.IOSQE_IO_LINK; const sqe_readv = try ring.read(0xffffffff, fd, .{ .iovecs = iovecs_read[0..] 
}, 17); - try testing.expectEqual(linux.IORING_OP.READV, sqe_readv.opcode); + try testing.expectEqual(Op.READV, sqe_readv.opcode); try testing.expectEqual(@as(u64, 17), sqe_readv.off); try testing.expectEqual(@as(u32, 3), ring.sq_ready()); @@ -2018,21 +3655,21 @@ test "writev/fsync/readv" { try testing.expectEqual(@as(u32, 0), ring.sq_ready()); try testing.expectEqual(@as(u32, 3), ring.cq_ready()); - try testing.expectEqual(linux.io_uring_cqe{ + try testing.expectEqual(io_uring_cqe{ .user_data = 0xdddddddd, .res = buffer_write.len, .flags = 0, }, try ring.copy_cqe()); try testing.expectEqual(@as(u32, 2), ring.cq_ready()); - try testing.expectEqual(linux.io_uring_cqe{ + try testing.expectEqual(io_uring_cqe{ .user_data = 0xeeeeeeee, .res = 0, .flags = 0, }, try ring.copy_cqe()); try testing.expectEqual(@as(u32, 1), ring.cq_ready()); - try testing.expectEqual(linux.io_uring_cqe{ + try testing.expectEqual(io_uring_cqe{ .user_data = 0xffffffff, .res = buffer_read.len, .flags = 0, @@ -2062,11 +3699,11 @@ test "write/read" { const buffer_write = [_]u8{97} ** 20; var buffer_read = [_]u8{98} ** 20; const sqe_write = try ring.write(0x11111111, fd, buffer_write[0..], 10); - try testing.expectEqual(linux.IORING_OP.WRITE, sqe_write.opcode); + try testing.expectEqual(Op.WRITE, sqe_write.opcode); try testing.expectEqual(@as(u64, 10), sqe_write.off); sqe_write.flags |= linux.IOSQE_IO_LINK; const sqe_read = try ring.read(0x22222222, fd, .{ .buffer = buffer_read[0..] }, 10); - try testing.expectEqual(linux.IORING_OP.READ, sqe_read.opcode); + try testing.expectEqual(Op.READ, sqe_read.opcode); try testing.expectEqual(@as(u64, 10), sqe_read.off); try testing.expectEqual(@as(u32, 2), try ring.submit()); @@ -2076,12 +3713,12 @@ test "write/read" { // https://lwn.net/Articles/809820/ if (cqe_write.err() == .INVAL) return error.SkipZigTest; if (cqe_read.err() == .INVAL) return error.SkipZigTest; - try testing.expectEqual(linux.io_uring_cqe{ + try testing.expectEqual(io_uring_cqe{ .user_data = 0x11111111, .res = buffer_write.len, .flags = 0, }, cqe_write); - try testing.expectEqual(linux.io_uring_cqe{ + try testing.expectEqual(io_uring_cqe{ .user_data = 0x22222222, .res = buffer_read.len, .flags = 0, @@ -2118,19 +3755,19 @@ test "splice/read" { const pipe_offset: u64 = std.math.maxInt(u64); const sqe_splice_to_pipe = try ring.splice(0x11111111, fd_src, 0, fds[1], pipe_offset, buffer_write.len); - try testing.expectEqual(linux.IORING_OP.SPLICE, sqe_splice_to_pipe.opcode); + try testing.expectEqual(Op.SPLICE, sqe_splice_to_pipe.opcode); try testing.expectEqual(@as(u64, 0), sqe_splice_to_pipe.addr); try testing.expectEqual(pipe_offset, sqe_splice_to_pipe.off); sqe_splice_to_pipe.flags |= linux.IOSQE_IO_LINK; const sqe_splice_from_pipe = try ring.splice(0x22222222, fds[0], pipe_offset, fd_dst, 10, buffer_write.len); - try testing.expectEqual(linux.IORING_OP.SPLICE, sqe_splice_from_pipe.opcode); + try testing.expectEqual(Op.SPLICE, sqe_splice_from_pipe.opcode); try testing.expectEqual(pipe_offset, sqe_splice_from_pipe.addr); try testing.expectEqual(@as(u64, 10), sqe_splice_from_pipe.off); sqe_splice_from_pipe.flags |= linux.IOSQE_IO_LINK; const sqe_read = try ring.read(0x33333333, fd_dst, .{ .buffer = buffer_read[0..] 
}, 10); - try testing.expectEqual(linux.IORING_OP.READ, sqe_read.opcode); + try testing.expectEqual(Op.READ, sqe_read.opcode); try testing.expectEqual(@as(u64, 10), sqe_read.off); try testing.expectEqual(@as(u32, 3), try ring.submit()); @@ -2142,17 +3779,17 @@ test "splice/read" { if (cqe_splice_to_pipe.err() == .INVAL) return error.SkipZigTest; if (cqe_splice_from_pipe.err() == .INVAL) return error.SkipZigTest; if (cqe_read.err() == .INVAL) return error.SkipZigTest; - try testing.expectEqual(linux.io_uring_cqe{ + try testing.expectEqual(io_uring_cqe{ .user_data = 0x11111111, .res = buffer_write.len, .flags = 0, }, cqe_splice_to_pipe); - try testing.expectEqual(linux.io_uring_cqe{ + try testing.expectEqual(io_uring_cqe{ .user_data = 0x22222222, .res = buffer_write.len, .flags = 0, }, cqe_splice_from_pipe); - try testing.expectEqual(linux.io_uring_cqe{ + try testing.expectEqual(io_uring_cqe{ .user_data = 0x33333333, .res = buffer_read.len, .flags = 0, @@ -2196,12 +3833,12 @@ test "write_fixed/read_fixed" { }; const sqe_write = try ring.write_fixed(0x45454545, fd, &buffers[0], 3, 0); - try testing.expectEqual(linux.IORING_OP.WRITE_FIXED, sqe_write.opcode); + try testing.expectEqual(Op.WRITE_FIXED, sqe_write.opcode); try testing.expectEqual(@as(u64, 3), sqe_write.off); sqe_write.flags |= linux.IOSQE_IO_LINK; const sqe_read = try ring.read_fixed(0x12121212, fd, &buffers[1], 0, 1); - try testing.expectEqual(linux.IORING_OP.READ_FIXED, sqe_read.opcode); + try testing.expectEqual(Op.READ_FIXED, sqe_read.opcode); try testing.expectEqual(@as(u64, 0), sqe_read.off); try testing.expectEqual(@as(u32, 2), try ring.submit()); @@ -2209,12 +3846,12 @@ test "write_fixed/read_fixed" { const cqe_write = try ring.copy_cqe(); const cqe_read = try ring.copy_cqe(); - try testing.expectEqual(linux.io_uring_cqe{ + try testing.expectEqual(io_uring_cqe{ .user_data = 0x45454545, .res = @as(i32, @intCast(buffers[0].len)), .flags = 0, }, cqe_write); - try testing.expectEqual(linux.io_uring_cqe{ + try testing.expectEqual(io_uring_cqe{ .user_data = 0x12121212, .res = @as(i32, @intCast(buffers[1].len)), .flags = 0, @@ -2250,7 +3887,7 @@ test "openat" { const flags: linux.O = .{ .CLOEXEC = true, .ACCMODE = .RDWR, .CREAT = true }; const mode: posix.mode_t = 0o666; const sqe_openat = try ring.openat(0x33333333, tmp.dir.fd, path, flags, mode); - try testing.expectEqual(linux.io_uring_sqe{ + try testing.expectEqual(io_uring_sqe{ .opcode = .OPENAT, .flags = 0, .ioprio = 0, @@ -2297,13 +3934,13 @@ test "close" { errdefer file.close(); const sqe_close = try ring.close(0x44444444, file.handle); - try testing.expectEqual(linux.IORING_OP.CLOSE, sqe_close.opcode); + try testing.expectEqual(Op.CLOSE, sqe_close.opcode); try testing.expectEqual(file.handle, sqe_close.fd); try testing.expectEqual(@as(u32, 1), try ring.submit()); const cqe_close = try ring.copy_cqe(); if (cqe_close.err() == .INVAL) return error.SkipZigTest; - try testing.expectEqual(linux.io_uring_cqe{ + try testing.expectEqual(io_uring_cqe{ .user_data = 0x44444444, .res = 0, .flags = 0, @@ -2333,7 +3970,7 @@ test "accept/connect/send/recv" { const cqe_send = try ring.copy_cqe(); if (cqe_send.err() == .INVAL) return error.SkipZigTest; - try testing.expectEqual(linux.io_uring_cqe{ + try testing.expectEqual(io_uring_cqe{ .user_data = 0xeeeeeeee, .res = buffer_send.len, .flags = 0, @@ -2341,7 +3978,7 @@ test "accept/connect/send/recv" { const cqe_recv = try ring.copy_cqe(); if (cqe_recv.err() == .INVAL) return error.SkipZigTest; - try testing.expectEqual(linux.io_uring_cqe{ 
+ try testing.expectEqual(io_uring_cqe{ .user_data = 0xffffffff, .res = buffer_recv.len, // ignore IORING_CQE_F_SOCK_NONEMPTY since it is only set on some systems @@ -2394,7 +4031,7 @@ test "sendmsg/recvmsg" { }; const sqe_sendmsg = try ring.sendmsg(0x11111111, client, &msg_send, 0); sqe_sendmsg.flags |= linux.IOSQE_IO_LINK; - try testing.expectEqual(linux.IORING_OP.SENDMSG, sqe_sendmsg.opcode); + try testing.expectEqual(Op.SENDMSG, sqe_sendmsg.opcode); try testing.expectEqual(client, sqe_sendmsg.fd); var buffer_recv = [_]u8{0} ** 128; @@ -2415,7 +4052,7 @@ test "sendmsg/recvmsg" { .flags = 0, }; const sqe_recvmsg = try ring.recvmsg(0x22222222, server, &msg_recv, 0); - try testing.expectEqual(linux.IORING_OP.RECVMSG, sqe_recvmsg.opcode); + try testing.expectEqual(Op.RECVMSG, sqe_recvmsg.opcode); try testing.expectEqual(server, sqe_recvmsg.fd); try testing.expectEqual(@as(u32, 2), ring.sq_ready()); @@ -2425,7 +4062,7 @@ test "sendmsg/recvmsg" { const cqe_sendmsg = try ring.copy_cqe(); if (cqe_sendmsg.res == -@as(i32, @intFromEnum(linux.E.INVAL))) return error.SkipZigTest; - try testing.expectEqual(linux.io_uring_cqe{ + try testing.expectEqual(io_uring_cqe{ .user_data = 0x11111111, .res = buffer_send.len, .flags = 0, @@ -2433,7 +4070,7 @@ test "sendmsg/recvmsg" { const cqe_recvmsg = try ring.copy_cqe(); if (cqe_recvmsg.res == -@as(i32, @intFromEnum(linux.E.INVAL))) return error.SkipZigTest; - try testing.expectEqual(linux.io_uring_cqe{ + try testing.expectEqual(io_uring_cqe{ .user_data = 0x22222222, .res = buffer_recv.len, // ignore IORING_CQE_F_SOCK_NONEMPTY since it is set non-deterministically @@ -2461,12 +4098,12 @@ test "timeout (after a relative time)" { const started = try std.Io.Clock.awake.now(io); const sqe = try ring.timeout(0x55555555, &ts, 0, 0); - try testing.expectEqual(linux.IORING_OP.TIMEOUT, sqe.opcode); + try testing.expectEqual(Op.TIMEOUT, sqe.opcode); try testing.expectEqual(@as(u32, 1), try ring.submit()); const cqe = try ring.copy_cqe(); const stopped = try std.Io.Clock.awake.now(io); - try testing.expectEqual(linux.io_uring_cqe{ + try testing.expectEqual(io_uring_cqe{ .user_data = 0x55555555, .res = -@as(i32, @intFromEnum(linux.E.TIME)), .flags = 0, @@ -2490,20 +4127,20 @@ test "timeout (after a number of completions)" { const ts: linux.kernel_timespec = .{ .sec = 3, .nsec = 0 }; const count_completions: u64 = 1; const sqe_timeout = try ring.timeout(0x66666666, &ts, count_completions, 0); - try testing.expectEqual(linux.IORING_OP.TIMEOUT, sqe_timeout.opcode); + try testing.expectEqual(Op.TIMEOUT, sqe_timeout.opcode); try testing.expectEqual(count_completions, sqe_timeout.off); _ = try ring.nop(0x77777777); try testing.expectEqual(@as(u32, 2), try ring.submit()); const cqe_nop = try ring.copy_cqe(); - try testing.expectEqual(linux.io_uring_cqe{ + try testing.expectEqual(io_uring_cqe{ .user_data = 0x77777777, .res = 0, .flags = 0, }, cqe_nop); const cqe_timeout = try ring.copy_cqe(); - try testing.expectEqual(linux.io_uring_cqe{ + try testing.expectEqual(io_uring_cqe{ .user_data = 0x66666666, .res = 0, .flags = 0, @@ -2522,11 +4159,11 @@ test "timeout_remove" { const ts: linux.kernel_timespec = .{ .sec = 3, .nsec = 0 }; const sqe_timeout = try ring.timeout(0x88888888, &ts, 0, 0); - try testing.expectEqual(linux.IORING_OP.TIMEOUT, sqe_timeout.opcode); + try testing.expectEqual(Op.TIMEOUT, sqe_timeout.opcode); try testing.expectEqual(@as(u64, 0x88888888), sqe_timeout.user_data); const sqe_timeout_remove = try ring.timeout_remove(0x99999999, 0x88888888, 0); - try 
testing.expectEqual(linux.IORING_OP.TIMEOUT_REMOVE, sqe_timeout_remove.opcode); + try testing.expectEqual(Op.TIMEOUT_REMOVE, sqe_timeout_remove.opcode); try testing.expectEqual(@as(u64, 0x88888888), sqe_timeout_remove.addr); try testing.expectEqual(@as(u64, 0x99999999), sqe_timeout_remove.user_data); @@ -2536,7 +4173,7 @@ test "timeout_remove" { // * kernel 5.10 gives user data 0x88888888 first, 0x99999999 second // * kernel 5.18 gives user data 0x99999999 first, 0x88888888 second - var cqes: [2]linux.io_uring_cqe = undefined; + var cqes: [2]io_uring_cqe = undefined; cqes[0] = try ring.copy_cqe(); cqes[1] = try ring.copy_cqe(); @@ -2555,13 +4192,13 @@ test "timeout_remove" { try testing.expect(cqe.user_data == 0x88888888 or cqe.user_data == 0x99999999); if (cqe.user_data == 0x88888888) { - try testing.expectEqual(linux.io_uring_cqe{ + try testing.expectEqual(io_uring_cqe{ .user_data = 0x88888888, .res = -@as(i32, @intFromEnum(linux.E.CANCELED)), .flags = 0, }, cqe); } else if (cqe.user_data == 0x99999999) { - try testing.expectEqual(linux.io_uring_cqe{ + try testing.expectEqual(io_uring_cqe{ .user_data = 0x99999999, .res = 0, .flags = 0, @@ -2640,7 +4277,7 @@ test "fallocate" { const len: u64 = 65536; const sqe = try ring.fallocate(0xaaaaaaaa, file.handle, 0, 0, len); - try testing.expectEqual(linux.IORING_OP.FALLOCATE, sqe.opcode); + try testing.expectEqual(Op.FALLOCATE, sqe.opcode); try testing.expectEqual(file.handle, sqe.fd); try testing.expectEqual(@as(u32, 1), try ring.submit()); @@ -2656,7 +4293,7 @@ test "fallocate" { .OPNOTSUPP => return error.SkipZigTest, else => |errno| std.debug.panic("unhandled errno: {}", .{errno}), } - try testing.expectEqual(linux.io_uring_cqe{ + try testing.expectEqual(io_uring_cqe{ .user_data = 0xaaaaaaaa, .res = 0, .flags = 0, @@ -2694,7 +4331,7 @@ test "statx" { linux.STATX_SIZE, &buf, ); - try testing.expectEqual(linux.IORING_OP.STATX, sqe.opcode); + try testing.expectEqual(Op.STATX, sqe.opcode); try testing.expectEqual(@as(i32, tmp.dir.fd), sqe.fd); try testing.expectEqual(@as(u32, 1), try ring.submit()); @@ -2712,7 +4349,7 @@ test "statx" { .BADF => return error.SkipZigTest, else => |errno| std.debug.panic("unhandled errno: {}", .{errno}), } - try testing.expectEqual(linux.io_uring_cqe{ + try testing.expectEqual(io_uring_cqe{ .user_data = 0xaaaaaaaa, .res = 0, .flags = 0, @@ -2741,7 +4378,7 @@ test "accept/connect/recv/cancel" { try testing.expectEqual(@as(u32, 1), try ring.submit()); const sqe_cancel = try ring.cancel(0x99999999, 0xffffffff, 0); - try testing.expectEqual(linux.IORING_OP.ASYNC_CANCEL, sqe_cancel.opcode); + try testing.expectEqual(Op.ASYNC_CANCEL, sqe_cancel.opcode); try testing.expectEqual(@as(u64, 0xffffffff), sqe_cancel.addr); try testing.expectEqual(@as(u64, 0x99999999), sqe_cancel.user_data); try testing.expectEqual(@as(u32, 1), try ring.submit()); @@ -2759,13 +4396,13 @@ test "accept/connect/recv/cancel" { cqe_cancel = a; } - try testing.expectEqual(linux.io_uring_cqe{ + try testing.expectEqual(io_uring_cqe{ .user_data = 0xffffffff, .res = -@as(i32, @intFromEnum(linux.E.CANCELED)), .flags = 0, }, cqe_recv); - try testing.expectEqual(linux.io_uring_cqe{ + try testing.expectEqual(io_uring_cqe{ .user_data = 0x99999999, .res = 0, .flags = 0, @@ -2810,11 +4447,11 @@ test "register_files_update" { var buffer = [_]u8{42} ** 128; { const sqe = try ring.read(0xcccccccc, fd_index, .{ .buffer = &buffer }, 0); - try testing.expectEqual(linux.IORING_OP.READ, sqe.opcode); + try testing.expectEqual(Op.READ, sqe.opcode); sqe.flags |= 
linux.IOSQE_FIXED_FILE; try testing.expectEqual(@as(u32, 1), try ring.submit()); - try testing.expectEqual(linux.io_uring_cqe{ + try testing.expectEqual(io_uring_cqe{ .user_data = 0xcccccccc, .res = buffer.len, .flags = 0, @@ -2831,11 +4468,11 @@ test "register_files_update" { { // Next read should still work since fd_index in the registered file descriptors hasn't been updated yet. const sqe = try ring.read(0xcccccccc, fd_index, .{ .buffer = &buffer }, 0); - try testing.expectEqual(linux.IORING_OP.READ, sqe.opcode); + try testing.expectEqual(Op.READ, sqe.opcode); sqe.flags |= linux.IOSQE_FIXED_FILE; try testing.expectEqual(@as(u32, 1), try ring.submit()); - try testing.expectEqual(linux.io_uring_cqe{ + try testing.expectEqual(io_uring_cqe{ .user_data = 0xcccccccc, .res = buffer.len, .flags = 0, @@ -2848,7 +4485,7 @@ test "register_files_update" { { // Now this should fail since both fds are sparse (-1) const sqe = try ring.read(0xcccccccc, fd_index, .{ .buffer = &buffer }, 0); - try testing.expectEqual(linux.IORING_OP.READ, sqe.opcode); + try testing.expectEqual(Op.READ, sqe.opcode); sqe.flags |= linux.IOSQE_FIXED_FILE; try testing.expectEqual(@as(u32, 1), try ring.submit()); @@ -2887,7 +4524,7 @@ test "shutdown" { try posix.getsockname(server, addrAny(&address), &slen); const shutdown_sqe = try ring.shutdown(0x445445445, server, linux.SHUT.RD); - try testing.expectEqual(linux.IORING_OP.SHUTDOWN, shutdown_sqe.opcode); + try testing.expectEqual(Op.SHUTDOWN, shutdown_sqe.opcode); try testing.expectEqual(@as(i32, server), shutdown_sqe.fd); try testing.expectEqual(@as(u32, 1), try ring.submit()); @@ -2900,7 +4537,7 @@ test "shutdown" { else => |errno| std.debug.panic("unhandled errno: {}", .{errno}), } - try testing.expectEqual(linux.io_uring_cqe{ + try testing.expectEqual(io_uring_cqe{ .user_data = 0x445445445, .res = 0, .flags = 0, @@ -2915,7 +4552,7 @@ test "shutdown" { const shutdown_sqe = ring.shutdown(0x445445445, server, linux.SHUT.RD) catch |err| switch (err) { else => |errno| std.debug.panic("unhandled errno: {}", .{errno}), }; - try testing.expectEqual(linux.IORING_OP.SHUTDOWN, shutdown_sqe.opcode); + try testing.expectEqual(Op.SHUTDOWN, shutdown_sqe.opcode); try testing.expectEqual(@as(i32, server), shutdown_sqe.fd); try testing.expectEqual(@as(u32, 1), try ring.submit()); @@ -2958,7 +4595,7 @@ test "renameat" { new_path, 0, ); - try testing.expectEqual(linux.IORING_OP.RENAMEAT, sqe.opcode); + try testing.expectEqual(Op.RENAMEAT, sqe.opcode); try testing.expectEqual(@as(i32, tmp.dir.fd), sqe.fd); try testing.expectEqual(@as(i32, tmp.dir.fd), @as(i32, @bitCast(sqe.len))); try testing.expectEqual(@as(u32, 1), try ring.submit()); @@ -2970,7 +4607,7 @@ test "renameat" { .BADF, .INVAL => return error.SkipZigTest, else => |errno| std.debug.panic("unhandled errno: {}", .{errno}), } - try testing.expectEqual(linux.io_uring_cqe{ + try testing.expectEqual(io_uring_cqe{ .user_data = 0x12121212, .res = 0, .flags = 0, @@ -3012,7 +4649,7 @@ test "unlinkat" { path, 0, ); - try testing.expectEqual(linux.IORING_OP.UNLINKAT, sqe.opcode); + try testing.expectEqual(Op.UNLINKAT, sqe.opcode); try testing.expectEqual(@as(i32, tmp.dir.fd), sqe.fd); try testing.expectEqual(@as(u32, 1), try ring.submit()); @@ -3023,7 +4660,7 @@ test "unlinkat" { .BADF, .INVAL => return error.SkipZigTest, else => |errno| std.debug.panic("unhandled errno: {}", .{errno}), } - try testing.expectEqual(linux.io_uring_cqe{ + try testing.expectEqual(io_uring_cqe{ .user_data = 0x12121212, .res = 0, .flags = 0, @@ -3059,7 +4696,7 @@ 
test "mkdirat" { path, 0o0755, ); - try testing.expectEqual(linux.IORING_OP.MKDIRAT, sqe.opcode); + try testing.expectEqual(Op.MKDIRAT, sqe.opcode); try testing.expectEqual(@as(i32, tmp.dir.fd), sqe.fd); try testing.expectEqual(@as(u32, 1), try ring.submit()); @@ -3070,7 +4707,7 @@ test "mkdirat" { .BADF, .INVAL => return error.SkipZigTest, else => |errno| std.debug.panic("unhandled errno: {}", .{errno}), } - try testing.expectEqual(linux.io_uring_cqe{ + try testing.expectEqual(io_uring_cqe{ .user_data = 0x12121212, .res = 0, .flags = 0, @@ -3107,7 +4744,7 @@ test "symlinkat" { tmp.dir.fd, link_path, ); - try testing.expectEqual(linux.IORING_OP.SYMLINKAT, sqe.opcode); + try testing.expectEqual(Op.SYMLINKAT, sqe.opcode); try testing.expectEqual(@as(i32, tmp.dir.fd), sqe.fd); try testing.expectEqual(@as(u32, 1), try ring.submit()); @@ -3118,7 +4755,7 @@ test "symlinkat" { .BADF, .INVAL => return error.SkipZigTest, else => |errno| std.debug.panic("unhandled errno: {}", .{errno}), } - try testing.expectEqual(linux.io_uring_cqe{ + try testing.expectEqual(io_uring_cqe{ .user_data = 0x12121212, .res = 0, .flags = 0, @@ -3160,7 +4797,7 @@ test "linkat" { second_path, 0, ); - try testing.expectEqual(linux.IORING_OP.LINKAT, sqe.opcode); + try testing.expectEqual(Op.LINKAT, sqe.opcode); try testing.expectEqual(@as(i32, tmp.dir.fd), sqe.fd); try testing.expectEqual(@as(i32, tmp.dir.fd), @as(i32, @bitCast(sqe.len))); try testing.expectEqual(@as(u32, 1), try ring.submit()); @@ -3172,7 +4809,7 @@ test "linkat" { .BADF, .INVAL => return error.SkipZigTest, else => |errno| std.debug.panic("unhandled errno: {}", .{errno}), } - try testing.expectEqual(linux.io_uring_cqe{ + try testing.expectEqual(io_uring_cqe{ .user_data = 0x12121212, .res = 0, .flags = 0, @@ -3207,7 +4844,7 @@ test "provide_buffers: read" { { const sqe = try ring.provide_buffers(0xcccccccc, @as([*]u8, @ptrCast(&buffers)), buffer_len, buffers.len, group_id, buffer_id); - try testing.expectEqual(linux.IORING_OP.PROVIDE_BUFFERS, sqe.opcode); + try testing.expectEqual(Op.PROVIDE_BUFFERS, sqe.opcode); try testing.expectEqual(@as(i32, buffers.len), sqe.fd); try testing.expectEqual(@as(u32, buffers[0].len), sqe.len); try testing.expectEqual(@as(u16, group_id), sqe.buf_index); @@ -3228,7 +4865,7 @@ test "provide_buffers: read" { var i: usize = 0; while (i < buffers.len) : (i += 1) { const sqe = try ring.read(0xdededede, fd, .{ .buffer_selection = .{ .group_id = group_id, .len = buffer_len } }, 0); - try testing.expectEqual(linux.IORING_OP.READ, sqe.opcode); + try testing.expectEqual(Op.READ, sqe.opcode); try testing.expectEqual(@as(i32, fd), sqe.fd); try testing.expectEqual(@as(u64, 0), sqe.addr); try testing.expectEqual(@as(u32, buffer_len), sqe.len); @@ -3254,7 +4891,7 @@ test "provide_buffers: read" { { const sqe = try ring.read(0xdfdfdfdf, fd, .{ .buffer_selection = .{ .group_id = group_id, .len = buffer_len } }, 0); - try testing.expectEqual(linux.IORING_OP.READ, sqe.opcode); + try testing.expectEqual(Op.READ, sqe.opcode); try testing.expectEqual(@as(i32, fd), sqe.fd); try testing.expectEqual(@as(u64, 0), sqe.addr); try testing.expectEqual(@as(u32, buffer_len), sqe.len); @@ -3293,7 +4930,7 @@ test "provide_buffers: read" { { const sqe = try ring.read(0xdfdfdfdf, fd, .{ .buffer_selection = .{ .group_id = group_id, .len = buffer_len } }, 0); - try testing.expectEqual(linux.IORING_OP.READ, sqe.opcode); + try testing.expectEqual(Op.READ, sqe.opcode); try testing.expectEqual(@as(i32, fd), sqe.fd); try testing.expectEqual(@as(u64, 0), sqe.addr); try 
testing.expectEqual(@as(u32, buffer_len), sqe.len); @@ -3354,7 +4991,7 @@ test "remove_buffers" { { const sqe = try ring.remove_buffers(0xbababababa, 3, group_id); - try testing.expectEqual(linux.IORING_OP.REMOVE_BUFFERS, sqe.opcode); + try testing.expectEqual(Op.REMOVE_BUFFERS, sqe.opcode); try testing.expectEqual(@as(i32, 3), sqe.fd); try testing.expectEqual(@as(u64, 0), sqe.addr); try testing.expectEqual(@as(u16, group_id), sqe.buf_index); @@ -3424,7 +5061,7 @@ test "provide_buffers: accept/connect/send/recv" { { const sqe = try ring.provide_buffers(0xcccccccc, @as([*]u8, @ptrCast(&buffers)), buffer_len, buffers.len, group_id, buffer_id); - try testing.expectEqual(linux.IORING_OP.PROVIDE_BUFFERS, sqe.opcode); + try testing.expectEqual(Op.PROVIDE_BUFFERS, sqe.opcode); try testing.expectEqual(@as(i32, buffers.len), sqe.fd); try testing.expectEqual(@as(u32, buffer_len), sqe.len); try testing.expectEqual(@as(u16, group_id), sqe.buf_index); @@ -3454,7 +5091,7 @@ test "provide_buffers: accept/connect/send/recv" { try testing.expectEqual(@as(u32, 1), try ring.submit()); } - var cqes: [4]linux.io_uring_cqe = undefined; + var cqes: [4]io_uring_cqe = undefined; try testing.expectEqual(@as(u32, 4), try ring.copy_cqes(&cqes, 4)); } @@ -3466,7 +5103,7 @@ test "provide_buffers: accept/connect/send/recv" { var i: usize = 0; while (i < buffers.len) : (i += 1) { const sqe = try ring.recv(0xdededede, socket_test_harness.client, .{ .buffer_selection = .{ .group_id = group_id, .len = buffer_len } }, 0); - try testing.expectEqual(linux.IORING_OP.RECV, sqe.opcode); + try testing.expectEqual(Op.RECV, sqe.opcode); try testing.expectEqual(@as(i32, socket_test_harness.client), sqe.fd); try testing.expectEqual(@as(u64, 0), sqe.addr); try testing.expectEqual(@as(u32, buffer_len), sqe.len); @@ -3495,7 +5132,7 @@ test "provide_buffers: accept/connect/send/recv" { { const sqe = try ring.recv(0xdfdfdfdf, socket_test_harness.client, .{ .buffer_selection = .{ .group_id = group_id, .len = buffer_len } }, 0); - try testing.expectEqual(linux.IORING_OP.RECV, sqe.opcode); + try testing.expectEqual(Op.RECV, sqe.opcode); try testing.expectEqual(@as(i32, socket_test_harness.client), sqe.fd); try testing.expectEqual(@as(u64, 0), sqe.addr); try testing.expectEqual(@as(u32, buffer_len), sqe.len); @@ -3545,7 +5182,7 @@ test "provide_buffers: accept/connect/send/recv" { { const sqe = try ring.recv(0xdfdfdfdf, socket_test_harness.client, .{ .buffer_selection = .{ .group_id = group_id, .len = buffer_len } }, 0); - try testing.expectEqual(linux.IORING_OP.RECV, sqe.opcode); + try testing.expectEqual(Op.RECV, sqe.opcode); try testing.expectEqual(@as(i32, socket_test_harness.client), sqe.fd); try testing.expectEqual(@as(u64, 0), sqe.addr); try testing.expectEqual(@as(u32, buffer_len), sqe.len); @@ -3620,7 +5257,7 @@ fn createSocketTestHarness(ring: *IoUring) !SocketTestHarness { if (cqe_accept.res <= 0) std.debug.print("\ncqe_accept.res={}\n", .{cqe_accept.res}); try testing.expect(cqe_accept.res > 0); try testing.expectEqual(@as(u32, 0), cqe_accept.flags); - try testing.expectEqual(linux.io_uring_cqe{ + try testing.expectEqual(io_uring_cqe{ .user_data = 0xcccccccc, .res = 0, .flags = 0, @@ -3721,7 +5358,7 @@ test "accept/connect/send_zc/recv" { // will be a second completion event / notification for the // request, with the user_data field set to the same value. // buffer_send must be keep alive until second cqe. 
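A condensed sketch of the two-completion protocol described in the comment above (not part of the patch; `client` and `buffer_send` as in this test, with the CQE ordering the test observes):

_ = try ring.send_zc(0xee, client, buffer_send[0..], 0, 0);
_ = try ring.submit();
const cqe_send = try ring.copy_cqe(); // res = bytes sent; IORING_CQE_F_MORE announces a follow-up
const cqe_notif = try ring.copy_cqe(); // IORING_CQE_F_NOTIF: kernel is done with buffer_send
_ = cqe_send;
_ = cqe_notif;
// Only after the F_NOTIF completion may buffer_send be freed or reused.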
- try testing.expectEqual(linux.io_uring_cqe{ + try testing.expectEqual(io_uring_cqe{ .user_data = 0xeeeeeeee, .res = buffer_send.len, .flags = linux.IORING_CQE_F_MORE, @@ -3733,7 +5370,7 @@ test "accept/connect/send_zc/recv" { break :brk if (cqe1.user_data == 0xeeeeeeee) .{ cqe1, cqe2 } else .{ cqe2, cqe1 }; }; - try testing.expectEqual(linux.io_uring_cqe{ + try testing.expectEqual(io_uring_cqe{ .user_data = 0xffffffff, .res = buffer_recv.len, .flags = cqe_recv.flags & linux.IORING_CQE_F_SOCK_NONEMPTY, @@ -3742,7 +5379,7 @@ test "accept/connect/send_zc/recv" { // Second completion of zero-copy send. // IORING_CQE_F_NOTIF in flags signals that kernel is done with send_buffer - try testing.expectEqual(linux.io_uring_cqe{ + try testing.expectEqual(io_uring_cqe{ .user_data = 0xeeeeeeee, .res = 0, .flags = linux.IORING_CQE_F_NOTIF, @@ -4142,7 +5779,7 @@ test BufferGroup { try testing.expectEqual(1, submitted); const cqe_send = try ring.copy_cqe(); if (cqe_send.err() == .INVAL) return error.SkipZigTest; - try testing.expectEqual(linux.io_uring_cqe{ .user_data = 1, .res = data.len, .flags = 0 }, cqe_send); + try testing.expectEqual(io_uring_cqe{ .user_data = 1, .res = data.len, .flags = 0 }, cqe_send); } // Server uses buffer group receive @@ -4212,7 +5849,7 @@ test "ring mapped buffers recv" { try testing.expectEqual(@as(u32, 1), try ring.submit()); const cqe_send = try ring.copy_cqe(); if (cqe_send.err() == .INVAL) return error.SkipZigTest; - try testing.expectEqual(linux.io_uring_cqe{ .user_data = user_data, .res = data.len, .flags = 0 }, cqe_send); + try testing.expectEqual(io_uring_cqe{ .user_data = user_data, .res = data.len, .flags = 0 }, cqe_send); } var pos: usize = 0; @@ -4301,7 +5938,7 @@ test "ring mapped buffers multishot recv" { try testing.expectEqual(@as(u32, 1), try ring.submit()); const cqe_send = try ring.copy_cqe(); if (cqe_send.err() == .INVAL) return error.SkipZigTest; - try testing.expectEqual(linux.io_uring_cqe{ .user_data = user_data, .res = data.len, .flags = 0 }, cqe_send); + try testing.expectEqual(io_uring_cqe{ .user_data = user_data, .res = data.len, .flags = 0 }, cqe_send); } // start multishot recv @@ -4401,7 +6038,7 @@ fn buf_grp_recv_submit_get_cqe( buf_grp: *BufferGroup, fd: linux.fd_t, user_data: u64, -) !linux.io_uring_cqe { +) !io_uring_cqe { // prepare and submit recv const sqe = try buf_grp.recv(user_data, fd, 0); try testing.expect(sqe.flags & linux.IOSQE_BUFFER_SELECT == linux.IOSQE_BUFFER_SELECT); @@ -4422,7 +6059,7 @@ fn expect_buf_grp_cqe( buf_grp: *BufferGroup, user_data: u64, expected: []const u8, -) !linux.io_uring_cqe { +) !io_uring_cqe { // get cqe const cqe = try ring.copy_cqe(); try testing.expectEqual(user_data, cqe.user_data); @@ -4454,7 +6091,7 @@ test "copy_cqes with wrapping sq.cqes buffer" { try testing.expectEqual(4, ring.cq.cqes.len); // submit 2 entries, receive 2 completions - var cqes: [8]linux.io_uring_cqe = undefined; + var cqes: [8]io_uring_cqe = undefined; { for (0..2) |_| { const sqe = try ring.get_sqe(); diff --git a/lib/std/os/linux/io_uring_sqe.zig b/lib/std/os/linux/io_uring_sqe.zig deleted file mode 100644 index 5658206a66a8..000000000000 --- a/lib/std/os/linux/io_uring_sqe.zig +++ /dev/null @@ -1,679 +0,0 @@ -//! Contains only the definition of `io_uring_sqe`. -//! Split into its own file to compartmentalize the initialization methods. 
- -const std = @import("../../std.zig"); -const linux = std.os.linux; - -pub const io_uring_sqe = extern struct { - opcode: linux.IORING_OP, - flags: u8, - ioprio: u16, - fd: i32, - off: u64, - addr: u64, - len: u32, - rw_flags: u32, - user_data: u64, - buf_index: u16, - personality: u16, - splice_fd_in: i32, - addr3: u64, - resv: u64, - - pub fn prep_nop(sqe: *linux.io_uring_sqe) void { - sqe.* = .{ - .opcode = .NOP, - .flags = 0, - .ioprio = 0, - .fd = 0, - .off = 0, - .addr = 0, - .len = 0, - .rw_flags = 0, - .user_data = 0, - .buf_index = 0, - .personality = 0, - .splice_fd_in = 0, - .addr3 = 0, - .resv = 0, - }; - } - - pub fn prep_fsync(sqe: *linux.io_uring_sqe, fd: linux.fd_t, flags: u32) void { - sqe.* = .{ - .opcode = .FSYNC, - .flags = 0, - .ioprio = 0, - .fd = fd, - .off = 0, - .addr = 0, - .len = 0, - .rw_flags = flags, - .user_data = 0, - .buf_index = 0, - .personality = 0, - .splice_fd_in = 0, - .addr3 = 0, - .resv = 0, - }; - } - - pub fn prep_rw( - sqe: *linux.io_uring_sqe, - op: linux.IORING_OP, - fd: linux.fd_t, - addr: u64, - len: usize, - offset: u64, - ) void { - sqe.* = .{ - .opcode = op, - .flags = 0, - .ioprio = 0, - .fd = fd, - .off = offset, - .addr = addr, - .len = @intCast(len), - .rw_flags = 0, - .user_data = 0, - .buf_index = 0, - .personality = 0, - .splice_fd_in = 0, - .addr3 = 0, - .resv = 0, - }; - } - - pub fn prep_read(sqe: *linux.io_uring_sqe, fd: linux.fd_t, buffer: []u8, offset: u64) void { - sqe.prep_rw(.READ, fd, @intFromPtr(buffer.ptr), buffer.len, offset); - } - - pub fn prep_write(sqe: *linux.io_uring_sqe, fd: linux.fd_t, buffer: []const u8, offset: u64) void { - sqe.prep_rw(.WRITE, fd, @intFromPtr(buffer.ptr), buffer.len, offset); - } - - pub fn prep_splice(sqe: *linux.io_uring_sqe, fd_in: linux.fd_t, off_in: u64, fd_out: linux.fd_t, off_out: u64, len: usize) void { - sqe.prep_rw(.SPLICE, fd_out, undefined, len, off_out); - sqe.addr = off_in; - sqe.splice_fd_in = fd_in; - } - - pub fn prep_readv( - sqe: *linux.io_uring_sqe, - fd: linux.fd_t, - iovecs: []const std.posix.iovec, - offset: u64, - ) void { - sqe.prep_rw(.READV, fd, @intFromPtr(iovecs.ptr), iovecs.len, offset); - } - - pub fn prep_writev( - sqe: *linux.io_uring_sqe, - fd: linux.fd_t, - iovecs: []const std.posix.iovec_const, - offset: u64, - ) void { - sqe.prep_rw(.WRITEV, fd, @intFromPtr(iovecs.ptr), iovecs.len, offset); - } - - pub fn prep_read_fixed(sqe: *linux.io_uring_sqe, fd: linux.fd_t, buffer: *std.posix.iovec, offset: u64, buffer_index: u16) void { - sqe.prep_rw(.READ_FIXED, fd, @intFromPtr(buffer.base), buffer.len, offset); - sqe.buf_index = buffer_index; - } - - pub fn prep_write_fixed(sqe: *linux.io_uring_sqe, fd: linux.fd_t, buffer: *std.posix.iovec, offset: u64, buffer_index: u16) void { - sqe.prep_rw(.WRITE_FIXED, fd, @intFromPtr(buffer.base), buffer.len, offset); - sqe.buf_index = buffer_index; - } - - pub fn prep_accept( - sqe: *linux.io_uring_sqe, - fd: linux.fd_t, - addr: ?*linux.sockaddr, - addrlen: ?*linux.socklen_t, - flags: u32, - ) void { - // `addr` holds a pointer to `sockaddr`, and `addr2` holds a pointer to socklen_t`. - // `addr2` maps to `sqe.off` (u64) instead of `sqe.len` (which is only a u32). 
- sqe.prep_rw(.ACCEPT, fd, @intFromPtr(addr), 0, @intFromPtr(addrlen)); - sqe.rw_flags = flags; - } - - pub fn prep_accept_direct( - sqe: *linux.io_uring_sqe, - fd: linux.fd_t, - addr: ?*linux.sockaddr, - addrlen: ?*linux.socklen_t, - flags: u32, - file_index: u32, - ) void { - prep_accept(sqe, fd, addr, addrlen, flags); - __io_uring_set_target_fixed_file(sqe, file_index); - } - - pub fn prep_multishot_accept_direct( - sqe: *linux.io_uring_sqe, - fd: linux.fd_t, - addr: ?*linux.sockaddr, - addrlen: ?*linux.socklen_t, - flags: u32, - ) void { - prep_multishot_accept(sqe, fd, addr, addrlen, flags); - __io_uring_set_target_fixed_file(sqe, linux.IORING_FILE_INDEX_ALLOC); - } - - fn __io_uring_set_target_fixed_file(sqe: *linux.io_uring_sqe, file_index: u32) void { - const sqe_file_index: u32 = if (file_index == linux.IORING_FILE_INDEX_ALLOC) - linux.IORING_FILE_INDEX_ALLOC - else - // 0 means no fixed files, indexes should be encoded as "index + 1" - file_index + 1; - // This filed is overloaded in liburing: - // splice_fd_in: i32 - // sqe_file_index: u32 - sqe.splice_fd_in = @bitCast(sqe_file_index); - } - - pub fn prep_connect( - sqe: *linux.io_uring_sqe, - fd: linux.fd_t, - addr: *const linux.sockaddr, - addrlen: linux.socklen_t, - ) void { - // `addrlen` maps to `sqe.off` (u64) instead of `sqe.len` (which is only a u32). - sqe.prep_rw(.CONNECT, fd, @intFromPtr(addr), 0, addrlen); - } - - pub fn prep_epoll_ctl( - sqe: *linux.io_uring_sqe, - epfd: linux.fd_t, - fd: linux.fd_t, - op: u32, - ev: ?*linux.epoll_event, - ) void { - sqe.prep_rw(.EPOLL_CTL, epfd, @intFromPtr(ev), op, @intCast(fd)); - } - - pub fn prep_recv(sqe: *linux.io_uring_sqe, fd: linux.fd_t, buffer: []u8, flags: u32) void { - sqe.prep_rw(.RECV, fd, @intFromPtr(buffer.ptr), buffer.len, 0); - sqe.rw_flags = flags; - } - - pub fn prep_recv_multishot( - sqe: *linux.io_uring_sqe, - fd: linux.fd_t, - buffer: []u8, - flags: u32, - ) void { - sqe.prep_recv(fd, buffer, flags); - sqe.ioprio |= linux.IORING_RECV_MULTISHOT; - } - - pub fn prep_recvmsg( - sqe: *linux.io_uring_sqe, - fd: linux.fd_t, - msg: *linux.msghdr, - flags: u32, - ) void { - sqe.prep_rw(.RECVMSG, fd, @intFromPtr(msg), 1, 0); - sqe.rw_flags = flags; - } - - pub fn prep_recvmsg_multishot( - sqe: *linux.io_uring_sqe, - fd: linux.fd_t, - msg: *linux.msghdr, - flags: u32, - ) void { - sqe.prep_recvmsg(fd, msg, flags); - sqe.ioprio |= linux.IORING_RECV_MULTISHOT; - } - - pub fn prep_send(sqe: *linux.io_uring_sqe, fd: linux.fd_t, buffer: []const u8, flags: u32) void { - sqe.prep_rw(.SEND, fd, @intFromPtr(buffer.ptr), buffer.len, 0); - sqe.rw_flags = flags; - } - - pub fn prep_send_zc(sqe: *linux.io_uring_sqe, fd: linux.fd_t, buffer: []const u8, flags: u32, zc_flags: u16) void { - sqe.prep_rw(.SEND_ZC, fd, @intFromPtr(buffer.ptr), buffer.len, 0); - sqe.rw_flags = flags; - sqe.ioprio = zc_flags; - } - - pub fn prep_send_zc_fixed(sqe: *linux.io_uring_sqe, fd: linux.fd_t, buffer: []const u8, flags: u32, zc_flags: u16, buf_index: u16) void { - prep_send_zc(sqe, fd, buffer, flags, zc_flags); - sqe.ioprio |= linux.IORING_RECVSEND_FIXED_BUF; - sqe.buf_index = buf_index; - } - - pub fn prep_sendmsg_zc( - sqe: *linux.io_uring_sqe, - fd: linux.fd_t, - msg: *const linux.msghdr_const, - flags: u32, - ) void { - prep_sendmsg(sqe, fd, msg, flags); - sqe.opcode = .SENDMSG_ZC; - } - - pub fn prep_sendmsg( - sqe: *linux.io_uring_sqe, - fd: linux.fd_t, - msg: *const linux.msghdr_const, - flags: u32, - ) void { - sqe.prep_rw(.SENDMSG, fd, @intFromPtr(msg), 1, 0); - sqe.rw_flags = flags; - } - - 
pub fn prep_openat( - sqe: *linux.io_uring_sqe, - fd: linux.fd_t, - path: [*:0]const u8, - flags: linux.O, - mode: linux.mode_t, - ) void { - sqe.prep_rw(.OPENAT, fd, @intFromPtr(path), mode, 0); - sqe.rw_flags = @bitCast(flags); - } - - pub fn prep_openat_direct( - sqe: *linux.io_uring_sqe, - fd: linux.fd_t, - path: [*:0]const u8, - flags: linux.O, - mode: linux.mode_t, - file_index: u32, - ) void { - prep_openat(sqe, fd, path, flags, mode); - __io_uring_set_target_fixed_file(sqe, file_index); - } - - pub fn prep_close(sqe: *linux.io_uring_sqe, fd: linux.fd_t) void { - sqe.* = .{ - .opcode = .CLOSE, - .flags = 0, - .ioprio = 0, - .fd = fd, - .off = 0, - .addr = 0, - .len = 0, - .rw_flags = 0, - .user_data = 0, - .buf_index = 0, - .personality = 0, - .splice_fd_in = 0, - .addr3 = 0, - .resv = 0, - }; - } - - pub fn prep_close_direct(sqe: *linux.io_uring_sqe, file_index: u32) void { - prep_close(sqe, 0); - __io_uring_set_target_fixed_file(sqe, file_index); - } - - pub fn prep_timeout( - sqe: *linux.io_uring_sqe, - ts: *const linux.kernel_timespec, - count: u32, - flags: u32, - ) void { - sqe.prep_rw(.TIMEOUT, -1, @intFromPtr(ts), 1, count); - sqe.rw_flags = flags; - } - - pub fn prep_timeout_remove(sqe: *linux.io_uring_sqe, timeout_user_data: u64, flags: u32) void { - sqe.* = .{ - .opcode = .TIMEOUT_REMOVE, - .flags = 0, - .ioprio = 0, - .fd = -1, - .off = 0, - .addr = timeout_user_data, - .len = 0, - .rw_flags = flags, - .user_data = 0, - .buf_index = 0, - .personality = 0, - .splice_fd_in = 0, - .addr3 = 0, - .resv = 0, - }; - } - - pub fn prep_link_timeout( - sqe: *linux.io_uring_sqe, - ts: *const linux.kernel_timespec, - flags: u32, - ) void { - sqe.prep_rw(.LINK_TIMEOUT, -1, @intFromPtr(ts), 1, 0); - sqe.rw_flags = flags; - } - - pub fn prep_poll_add( - sqe: *linux.io_uring_sqe, - fd: linux.fd_t, - poll_mask: u32, - ) void { - sqe.prep_rw(.POLL_ADD, fd, @intFromPtr(@as(?*anyopaque, null)), 0, 0); - // Poll masks previously used to comprise of 16 bits in the flags union of - // a SQE, but were then extended to comprise of 32 bits in order to make - // room for additional option flags. To ensure that the correct bits of - // poll masks are consistently and properly read across multiple kernel - // versions, poll masks are enforced to be little-endian. - // https://www.spinics.net/lists/io-uring/msg02848.html - sqe.rw_flags = std.mem.nativeToLittle(u32, poll_mask); - } - - pub fn prep_poll_remove( - sqe: *linux.io_uring_sqe, - target_user_data: u64, - ) void { - sqe.prep_rw(.POLL_REMOVE, -1, target_user_data, 0, 0); - } - - pub fn prep_poll_update( - sqe: *linux.io_uring_sqe, - old_user_data: u64, - new_user_data: u64, - poll_mask: u32, - flags: u32, - ) void { - sqe.prep_rw(.POLL_REMOVE, -1, old_user_data, flags, new_user_data); - // Poll masks previously used to comprise of 16 bits in the flags union of - // a SQE, but were then extended to comprise of 32 bits in order to make - // room for additional option flags. To ensure that the correct bits of - // poll masks are consistently and properly read across multiple kernel - // versions, poll masks are enforced to be little-endian. 
- // https://www.spinics.net/lists/io-uring/msg02848.html - sqe.rw_flags = std.mem.nativeToLittle(u32, poll_mask); - } - - pub fn prep_fallocate( - sqe: *linux.io_uring_sqe, - fd: linux.fd_t, - mode: i32, - offset: u64, - len: u64, - ) void { - sqe.* = .{ - .opcode = .FALLOCATE, - .flags = 0, - .ioprio = 0, - .fd = fd, - .off = offset, - .addr = len, - .len = @intCast(mode), - .rw_flags = 0, - .user_data = 0, - .buf_index = 0, - .personality = 0, - .splice_fd_in = 0, - .addr3 = 0, - .resv = 0, - }; - } - - pub fn prep_statx( - sqe: *linux.io_uring_sqe, - fd: linux.fd_t, - path: [*:0]const u8, - flags: u32, - mask: u32, - buf: *linux.Statx, - ) void { - sqe.prep_rw(.STATX, fd, @intFromPtr(path), mask, @intFromPtr(buf)); - sqe.rw_flags = flags; - } - - pub fn prep_cancel( - sqe: *linux.io_uring_sqe, - cancel_user_data: u64, - flags: u32, - ) void { - sqe.prep_rw(.ASYNC_CANCEL, -1, cancel_user_data, 0, 0); - sqe.rw_flags = flags; - } - - pub fn prep_cancel_fd( - sqe: *linux.io_uring_sqe, - fd: linux.fd_t, - flags: u32, - ) void { - sqe.prep_rw(.ASYNC_CANCEL, fd, 0, 0, 0); - sqe.rw_flags = flags | linux.IORING_ASYNC_CANCEL_FD; - } - - pub fn prep_shutdown( - sqe: *linux.io_uring_sqe, - sockfd: linux.socket_t, - how: u32, - ) void { - sqe.prep_rw(.SHUTDOWN, sockfd, 0, how, 0); - } - - pub fn prep_renameat( - sqe: *linux.io_uring_sqe, - old_dir_fd: linux.fd_t, - old_path: [*:0]const u8, - new_dir_fd: linux.fd_t, - new_path: [*:0]const u8, - flags: u32, - ) void { - sqe.prep_rw( - .RENAMEAT, - old_dir_fd, - @intFromPtr(old_path), - 0, - @intFromPtr(new_path), - ); - sqe.len = @bitCast(new_dir_fd); - sqe.rw_flags = flags; - } - - pub fn prep_unlinkat( - sqe: *linux.io_uring_sqe, - dir_fd: linux.fd_t, - path: [*:0]const u8, - flags: u32, - ) void { - sqe.prep_rw(.UNLINKAT, dir_fd, @intFromPtr(path), 0, 0); - sqe.rw_flags = flags; - } - - pub fn prep_mkdirat( - sqe: *linux.io_uring_sqe, - dir_fd: linux.fd_t, - path: [*:0]const u8, - mode: linux.mode_t, - ) void { - sqe.prep_rw(.MKDIRAT, dir_fd, @intFromPtr(path), mode, 0); - } - - pub fn prep_symlinkat( - sqe: *linux.io_uring_sqe, - target: [*:0]const u8, - new_dir_fd: linux.fd_t, - link_path: [*:0]const u8, - ) void { - sqe.prep_rw( - .SYMLINKAT, - new_dir_fd, - @intFromPtr(target), - 0, - @intFromPtr(link_path), - ); - } - - pub fn prep_linkat( - sqe: *linux.io_uring_sqe, - old_dir_fd: linux.fd_t, - old_path: [*:0]const u8, - new_dir_fd: linux.fd_t, - new_path: [*:0]const u8, - flags: u32, - ) void { - sqe.prep_rw( - .LINKAT, - old_dir_fd, - @intFromPtr(old_path), - 0, - @intFromPtr(new_path), - ); - sqe.len = @bitCast(new_dir_fd); - sqe.rw_flags = flags; - } - - pub fn prep_files_update( - sqe: *linux.io_uring_sqe, - fds: []const linux.fd_t, - offset: u32, - ) void { - sqe.prep_rw(.FILES_UPDATE, -1, @intFromPtr(fds.ptr), fds.len, @intCast(offset)); - } - - pub fn prep_files_update_alloc( - sqe: *linux.io_uring_sqe, - fds: []linux.fd_t, - ) void { - sqe.prep_rw(.FILES_UPDATE, -1, @intFromPtr(fds.ptr), fds.len, linux.IORING_FILE_INDEX_ALLOC); - } - - pub fn prep_provide_buffers( - sqe: *linux.io_uring_sqe, - buffers: [*]u8, - buffer_len: usize, - num: usize, - group_id: usize, - buffer_id: usize, - ) void { - const ptr = @intFromPtr(buffers); - sqe.prep_rw(.PROVIDE_BUFFERS, @intCast(num), ptr, buffer_len, buffer_id); - sqe.buf_index = @intCast(group_id); - } - - pub fn prep_remove_buffers( - sqe: *linux.io_uring_sqe, - num: usize, - group_id: usize, - ) void { - sqe.prep_rw(.REMOVE_BUFFERS, @intCast(num), 0, 0, 0); - sqe.buf_index = 
@intCast(group_id); - } - - pub fn prep_multishot_accept( - sqe: *linux.io_uring_sqe, - fd: linux.fd_t, - addr: ?*linux.sockaddr, - addrlen: ?*linux.socklen_t, - flags: u32, - ) void { - prep_accept(sqe, fd, addr, addrlen, flags); - sqe.ioprio |= linux.IORING_ACCEPT_MULTISHOT; - } - - pub fn prep_socket( - sqe: *linux.io_uring_sqe, - domain: u32, - socket_type: u32, - protocol: u32, - flags: u32, - ) void { - sqe.prep_rw(.SOCKET, @intCast(domain), 0, protocol, socket_type); - sqe.rw_flags = flags; - } - - pub fn prep_socket_direct( - sqe: *linux.io_uring_sqe, - domain: u32, - socket_type: u32, - protocol: u32, - flags: u32, - file_index: u32, - ) void { - prep_socket(sqe, domain, socket_type, protocol, flags); - __io_uring_set_target_fixed_file(sqe, file_index); - } - - pub fn prep_socket_direct_alloc( - sqe: *linux.io_uring_sqe, - domain: u32, - socket_type: u32, - protocol: u32, - flags: u32, - ) void { - prep_socket(sqe, domain, socket_type, protocol, flags); - __io_uring_set_target_fixed_file(sqe, linux.IORING_FILE_INDEX_ALLOC); - } - - pub fn prep_waitid( - sqe: *linux.io_uring_sqe, - id_type: linux.P, - id: i32, - infop: *linux.siginfo_t, - options: u32, - flags: u32, - ) void { - sqe.prep_rw(.WAITID, id, 0, @intFromEnum(id_type), @intFromPtr(infop)); - sqe.rw_flags = flags; - sqe.splice_fd_in = @bitCast(options); - } - - pub fn prep_bind( - sqe: *linux.io_uring_sqe, - fd: linux.fd_t, - addr: *const linux.sockaddr, - addrlen: linux.socklen_t, - flags: u32, - ) void { - sqe.prep_rw(.BIND, fd, @intFromPtr(addr), 0, addrlen); - sqe.rw_flags = flags; - } - - pub fn prep_listen( - sqe: *linux.io_uring_sqe, - fd: linux.fd_t, - backlog: usize, - flags: u32, - ) void { - sqe.prep_rw(.LISTEN, fd, 0, backlog, 0); - sqe.rw_flags = flags; - } - - pub fn prep_cmd_sock( - sqe: *linux.io_uring_sqe, - cmd_op: linux.IO_URING_SOCKET_OP, - fd: linux.fd_t, - level: u32, - optname: u32, - optval: u64, - optlen: u32, - ) void { - sqe.prep_rw(.URING_CMD, fd, 0, 0, 0); - // off is overloaded with cmd_op, https://github.com/axboe/liburing/blob/e1003e496e66f9b0ae06674869795edf772d5500/src/include/liburing/io_uring.h#L39 - sqe.off = @intFromEnum(cmd_op); - // addr is overloaded, https://github.com/axboe/liburing/blob/e1003e496e66f9b0ae06674869795edf772d5500/src/include/liburing/io_uring.h#L46 - sqe.addr = @bitCast(packed struct { - level: u32, - optname: u32, - }{ - .level = level, - .optname = optname, - }); - // splice_fd_in if overloaded u32 -> i32 - sqe.splice_fd_in = @bitCast(optlen); - // addr3 is overloaded, https://github.com/axboe/liburing/blob/e1003e496e66f9b0ae06674869795edf772d5500/src/include/liburing/io_uring.h#L102 - sqe.addr3 = optval; - } - - pub fn set_flags(sqe: *linux.io_uring_sqe, flags: u8) void { - sqe.flags |= flags; - } - - /// This SQE forms a link with the next SQE in the submission ring. Next SQE - /// will not be started before this one completes. Forms a chain of SQEs. 
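A sketch of such a chain (not from the patch, and assuming the helper keeps this shape after the move into IoUring.zig; `part1` and `part2` are hypothetical slices):

const first = try ring.write(0x1, fd, part1, 0);
first.link_next(); // equivalent to: first.flags |= linux.IOSQE_IO_LINK
const second = try ring.write(0x2, fd, part2, part1.len);
second.link_next();
_ = try ring.close(0x3, fd); // runs only after both writes complete
_ = try ring.submit();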
- pub fn link_next(sqe: *linux.io_uring_sqe) void {
- sqe.flags |= linux.IOSQE_IO_LINK;
- }
-};

From 3d67bfd88649c49389dce5c07d91f921c57dbd27 Mon Sep 17 00:00:00 2001
From: Bernard Assan
Date: Mon, 29 Sep 2025 17:13:34 +0000
Subject: [PATCH 02/41] replace some more fn flags with Typed Flags and use linux Flags where appropriate

constants is a namespace
make AsyncCancel flags more descriptive
move cqe.flags into the Cqe type
make io_uring_sqe ioprio a packed union of SendRecv & Accept

TODO: move some of the used linux Flags to be packed structs

Signed-off-by: Bernard Assan
---
 lib/std/os/linux/IoUring.zig | 293 ++++++++++++++++++-----------------
 1 file changed, 153 insertions(+), 140 deletions(-)

diff --git a/lib/std/os/linux/IoUring.zig b/lib/std/os/linux/IoUring.zig
index 9dcbf71787c5..be0576deb21c 100644
--- a/lib/std/os/linux/IoUring.zig
+++ b/lib/std/os/linux/IoUring.zig
@@ -17,7 +17,7 @@ flags: Flags.Setup,
 features: Flags.Features,

 // COMMIT: move IoUring constants to Constants
-pub const Constants = struct {
+pub const constants = struct {
 /// If sqe.file_index (splice_fd_in in Zig Struct) is set to this for opcodes that instantiate a new
 /// an available direct descriptor instead of having the application pass one
 /// direct descriptor (like openat/openat2/accept), then io_uring will allocate
 /// in. The picked direct descriptor will be returned in cqe.res, or -ENFILE
 /// if the space is full.
@@ -204,23 +204,23 @@ pub const Flags = struct {
 pub const AsyncCancel = packed struct(u32) {
 /// IORING_ASYNC_CANCEL_ALL
 /// Cancel all requests that match the given key
- ALL: bool = false,
+ CANCEL_ALL: bool = false,
 /// IORING_ASYNC_CANCEL_FD
 /// Key off 'fd' for cancelation rather than the request 'user_data'
- FD: bool = false,
+ CANCEL_FD: bool = false,
 /// IORING_ASYNC_CANCEL_ANY
 /// Match any request
- ANY: bool = false,
+ CANCEL_ANY: bool = false,
 /// IORING_ASYNC_CANCEL_FD_FIXED
 /// 'fd' passed in is a fixed descriptor
- FD_FIXED: bool = false,
+ CANCEL_FD_FIXED: bool = false,
 // COMMIT: new AsyncCancel Flags
 /// IORING_ASYNC_CANCEL_USERDATA
 /// Match on user_data, default for no other key
- USERDATA: bool = false,
+ CANCEL_USERDATA: bool = false,
 /// IORING_ASYNC_CANCEL_OP
 /// Match request based on opcode
- OP: bool = false,
+ CANCEL_OP: bool = false,
 _unused: u26 = 0,
 };

@@ -295,42 +295,6 @@ pub const Flags = struct {
 _unused_1: u26 = 0,
 };

- /// cqe.flags
- pub const Cqe = packed struct(u32) {
- /// IORING_CQE_F_BUFFER If set, the upper 16 bits are the buffer ID
- F_BUFFER: bool = false,
- /// IORING_CQE_F_MORE If set, parent SQE will generate more CQE entries
- F_MORE: bool = false,
- /// IORING_CQE_F_SOCK_NONEMPTY If set, more data to read after socket recv
- F_SOCK_NONEMPTY: bool = false,
- /// IORING_CQE_F_NOTIF Set for notification CQEs. Can be used to distinct
- /// them from sends.
- F_NOTIF: bool = false,
- /// IORING_CQE_F_BUF_MORE If set, the buffer ID set in the completion will get
- /// more completions. In other words, the buffer is being
- /// partially consumed, and will be used by the kernel for
- /// more completions. This is only set for buffers used via
- /// the incremental buffer consumption, as provided by
- /// a ring buffer setup with IOU_PBUF_RING_INC. For any
- /// other provided buffer type, all completions with a
- /// buffer passed back is automatically returned to the
- /// application.
- F_BUF_MORE: bool = false,
- // COMMIT: new flags
- /// IORING_CQE_F_SKIP If set, then the application/liburing must ignore this
- /// CQE. It's only purpose is to fill a gap in the ring,
- /// if a large CQE is attempted posted when the ring has
- /// just a single small CQE worth of space left before
- /// wrapping.
- F_SKIP: bool = false,
- _unused: u9 = 0,
- /// IORING_CQE_F_32 If set, this is a 32b/big-cqe posting. Use with rings
- /// setup in a mixed CQE mode, where both 16b and 32b
- /// CQEs may be posted to the CQ ring.
- F_32: bool = false,
- _unused_1: u16 = 0,
- };
-
 /// sq_ring.flags
 pub const Sq = packed struct(u32) {
 /// needs io_uring_enter wakeup
@@ -394,13 +358,47 @@ pub const io_uring_cqe = extern struct {
 user_data: u64,
 /// result code for this event
 res: i32,
- flags: Flags.Cqe,
+ flags: CqeFlags,
 // COMMIT: add big_cqe which was missing in io_uring_cqe type declaration
 /// If the ring is initialized with IORING_SETUP_CQE32, then this field
 /// contains 16-bytes of padding, doubling the size of the CQE.
 big_cqe: []u64,
- // Followed by 16 bytes of padding if initialized with IORING_SETUP_CQE32, doubling cqe size
+ /// cqe.flags
+ pub const CqeFlags = packed struct(u32) {
+ /// IORING_CQE_F_BUFFER If set, the upper 16 bits are the buffer ID
+ F_BUFFER: bool = false,
+ /// IORING_CQE_F_MORE If set, parent SQE will generate more CQE entries
+ F_MORE: bool = false,
+ /// IORING_CQE_F_SOCK_NONEMPTY If set, more data to read after socket recv
+ F_SOCK_NONEMPTY: bool = false,
+ /// IORING_CQE_F_NOTIF Set for notification CQEs. Can be used to distinguish
+ /// them from sends.
+ F_NOTIF: bool = false,
+ /// IORING_CQE_F_BUF_MORE If set, the buffer ID set in the completion will get
+ /// more completions. In other words, the buffer is being
+ /// partially consumed, and will be used by the kernel for
+ /// more completions. This is only set for buffers used via
+ /// the incremental buffer consumption, as provided by
+ /// a ring buffer setup with IOU_PBUF_RING_INC. For any
+ /// other provided buffer type, all completions with a
+ /// buffer passed back are automatically returned to the
+ /// application.
+ F_BUF_MORE: bool = false,
+ // COMMIT: new flags
+ /// IORING_CQE_F_SKIP If set, then the application/liburing must ignore this
+ /// CQE. Its only purpose is to fill a gap in the ring,
+ /// if a large CQE is attempted posted when the ring has
+ /// just a single small CQE worth of space left before
+ /// wrapping.
+ F_SKIP: bool = false,
+ _unused: u9 = 0,
+ /// IORING_CQE_F_32 If set, this is a 32b/big-cqe posting. Use with rings
+ /// setup in a mixed CQE mode, where both 16b and 32b
+ /// CQEs may be posted to the CQ ring.
+ F_32: bool = false, + _unused_1: u16 = 0, + }; pub fn err(self: io_uring_cqe) linux.E { if (self.res > -4096 and self.res < 0) { @@ -416,7 +414,7 @@ pub const io_uring_cqe = extern struct { if (!self.flags.F_BUFFER) { return error.NoBufferSelected; } - return @as(u16, @intCast(@as(u32, @bitCast(self.flags)) >> Constants.CQE_BUFFER_SHIFT)); + return @intCast(@as(u32, @bitCast(self.flags)) >> constants.CQE_BUFFER_SHIFT); } }; @@ -427,7 +425,10 @@ pub const io_uring_sqe = extern struct { /// IOSQE_* flags flags: SqeFlags, /// ioprio for the request - ioprio: u16, + ioprio: packed union { + send_recv: Flags.SendRecv, + accept: Flags.Accept, + }, /// file descriptor to do IO on fd: i32, /// offset into file @@ -474,8 +475,8 @@ pub const io_uring_sqe = extern struct { pub fn prep_nop(sqe: *io_uring_sqe) void { sqe.* = .{ .opcode = .NOP, - .flags = @bitCast(0), - .ioprio = 0, + .flags = .{}, + .ioprio = @bitCast(@as(u16, 0)), .fd = 0, .off = 0, .addr = 0, @@ -490,16 +491,16 @@ pub const io_uring_sqe = extern struct { }; } - pub fn prep_fsync(sqe: *io_uring_sqe, fd: linux.fd_t, flags: u32) void { + pub fn prep_fsync(sqe: *io_uring_sqe, fd: linux.fd_t, flags: Flags.Fsync) void { sqe.* = .{ .opcode = .FSYNC, - .flags = 0, - .ioprio = 0, + .flags = .{}, + .ioprio = @bitCast(@as(u16, 0)), .fd = fd, .off = 0, .addr = 0, .len = 0, - .rw_flags = flags, + .rw_flags = @bitCast(flags), .user_data = 0, .buf_index = 0, .personality = 0, @@ -519,8 +520,8 @@ pub const io_uring_sqe = extern struct { ) void { sqe.* = .{ .opcode = op, - .flags = @bitCast(0), - .ioprio = 0, + .flags = .{}, + .ioprio = @bitCast(@as(u16, 0)), .fd = fd, .off = offset, .addr = addr, @@ -582,7 +583,7 @@ pub const io_uring_sqe = extern struct { fd: linux.fd_t, addr: ?*linux.sockaddr, addrlen: ?*linux.socklen_t, - flags: u32, + flags: linux.SOCK, ) void { // `addr` holds a pointer to `sockaddr`, and `addr2` holds a pointer to socklen_t`. // `addr2` maps to `sqe.off` (u64) instead of `sqe.len` (which is only a u32). 
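// A hedged usage sketch of the accept flow documented above, assuming an
// already-initialized `ring: IoUring` and a listening `server_fd` (both
// illustrative names, not part of this patch); the kernel writes the peer
// address back through the pointers carried in sqe.addr and sqe.off:
var peer_addr: linux.sockaddr = undefined;
var peer_addr_len: linux.socklen_t = @sizeOf(linux.sockaddr);
_ = try ring.accept(0xcafe, server_fd, &peer_addr, &peer_addr_len, 0);
_ = try ring.submit();
const accept_cqe = try ring.copy_cqe();
if (accept_cqe.err() == .SUCCESS) {
    const client_fd: linux.fd_t = accept_cqe.res; // the accepted socket
    _ = client_fd;
}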
@@ -590,32 +591,45 @@ pub const io_uring_sqe = extern struct { sqe.rw_flags = flags; } + /// accept directly into the fixed file table pub fn prep_accept_direct( sqe: *io_uring_sqe, fd: linux.fd_t, addr: ?*linux.sockaddr, addrlen: ?*linux.socklen_t, - flags: u32, + flags: linux.SOCK, file_index: u32, ) void { prep_accept(sqe, fd, addr, addrlen, flags); set_target_fixed_file(sqe, file_index); } + pub fn prep_multishot_accept( + sqe: *io_uring_sqe, + fd: linux.fd_t, + addr: ?*linux.sockaddr, + addrlen: ?*linux.socklen_t, + flags: linux.SOCK, + ) void { + prep_accept(sqe, fd, addr, addrlen, flags); + sqe.ioprio = .{ .accept = .{ .MULTISHOT = true } }; + } + + /// multishot accept directly into the fixed file table pub fn prep_multishot_accept_direct( sqe: *io_uring_sqe, fd: linux.fd_t, addr: ?*linux.sockaddr, addrlen: ?*linux.socklen_t, - flags: u32, + flags: linux.SOCK, ) void { prep_multishot_accept(sqe, fd, addr, addrlen, flags); - set_target_fixed_file(sqe, linux.IORING_FILE_INDEX_ALLOC); + set_target_fixed_file(sqe, constants.FILE_INDEX_ALLOC); } fn set_target_fixed_file(sqe: *io_uring_sqe, file_index: u32) void { - const sqe_file_index: u32 = if (file_index == Constants.FILE_INDEX_ALLOC) - Constants.FILE_INDEX_ALLOC + const sqe_file_index: u32 = if (file_index == constants.FILE_INDEX_ALLOC) + constants.FILE_INDEX_ALLOC else // 0 means no fixed files, indexes should be encoded as "index + 1" file_index + 1; @@ -645,79 +659,82 @@ pub const io_uring_sqe = extern struct { sqe.prep_rw(.EPOLL_CTL, epfd, @intFromPtr(ev), op, @intCast(fd)); } - pub fn prep_recv(sqe: *io_uring_sqe, fd: linux.fd_t, buffer: []u8, flags: Flags.SendRecv) void { + pub fn prep_recv(sqe: *io_uring_sqe, fd: linux.fd_t, buffer: []u8, flags: linux.MSG) void { sqe.prep_rw(.RECV, fd, @intFromPtr(buffer.ptr), buffer.len, 0); - sqe.rw_flags = @bitCast(flags); + sqe.rw_flags = flags; } + // TODO: review recv `flags` pub fn prep_recv_multishot( sqe: *io_uring_sqe, fd: linux.fd_t, buffer: []u8, - flags: Flags.SendRecv, + flags: linux.MSG, ) void { sqe.prep_recv(fd, buffer, flags); - const enable_multishot: Flags.SendRecv = .{ .RECV_MULTISHOT = true }; - sqe.ioprio |= @bitCast(enable_multishot); + sqe.ioprio = .{ .send_recv = .{ .RECV_MULTISHOT = true } }; } pub fn prep_recvmsg( sqe: *io_uring_sqe, fd: linux.fd_t, msg: *linux.msghdr, - flags: Flags.SendRecv, + flags: linux.MSG, ) void { sqe.prep_rw(.RECVMSG, fd, @intFromPtr(msg), 1, 0); - sqe.rw_flags = @bitCast(flags); + sqe.rw_flags = flags; } pub fn prep_recvmsg_multishot( sqe: *io_uring_sqe, fd: linux.fd_t, msg: *linux.msghdr, - flags: Flags.SendRecv, + flags: linux.MSG, ) void { sqe.prep_recvmsg(fd, msg, flags); - const enable_multishot: Flags.SendRecv = .{ .RECV_MULTISHOT = true }; - sqe.ioprio |= @bitCast(enable_multishot); + sqe.ioprio = .{ .send_recv = .{ .RECV_MULTISHOT = true } }; } // COMMIT: fix send[|recv] flag param type - pub fn prep_send(sqe: *io_uring_sqe, fd: linux.fd_t, buffer: []const u8, flags: Flags.SendRecv) void { + pub fn prep_send(sqe: *io_uring_sqe, fd: linux.fd_t, buffer: []const u8, flags: linux.MSG) void { sqe.prep_rw(.SEND, fd, @intFromPtr(buffer.ptr), buffer.len, 0); - sqe.rw_flags = @bitCast(flags); + sqe.rw_flags = flags; } - pub fn prep_send_zc(sqe: *io_uring_sqe, fd: linux.fd_t, buffer: []const u8, flags: u32, zc_flags: u16) void { + pub fn prep_send_zc(sqe: *io_uring_sqe, fd: linux.fd_t, buffer: []const u8, flags: linux.MSG, zc_flags: Flags.SendRecv) void { sqe.prep_rw(.SEND_ZC, fd, @intFromPtr(buffer.ptr), buffer.len, 0); sqe.rw_flags = 
flags; - sqe.ioprio = zc_flags; + sqe.ioprio = .{ .send_recv = zc_flags }; } - pub fn prep_send_zc_fixed(sqe: *io_uring_sqe, fd: linux.fd_t, buffer: []const u8, flags: u32, zc_flags: u16, buf_index: u16) void { - prep_send_zc(sqe, fd, buffer, flags, zc_flags); - sqe.ioprio |= linux.IORING_RECVSEND_FIXED_BUF; + pub fn prep_send_zc_fixed(sqe: *io_uring_sqe, fd: linux.fd_t, buffer: []const u8, flags: linux.MSG, zc_flags: Flags.SendRecv, buf_index: u16) void { + const zc_flags_fixed = blk: { + var updated_flags = zc_flags; + updated_flags.RECVSEND_FIXED_BUF = true; + break :blk updated_flags; + }; + prep_send_zc(sqe, fd, buffer, flags, zc_flags_fixed); sqe.buf_index = buf_index; } - pub fn prep_sendmsg_zc( + pub fn prep_sendmsg( sqe: *io_uring_sqe, fd: linux.fd_t, msg: *const linux.msghdr_const, - flags: u32, + flags: linux.MSG, ) void { - prep_sendmsg(sqe, fd, msg, flags); - sqe.opcode = .SENDMSG_ZC; + sqe.prep_rw(.SENDMSG, fd, @intFromPtr(msg), 1, 0); + sqe.rw_flags = flags; } - pub fn prep_sendmsg( + pub fn prep_sendmsg_zc( sqe: *io_uring_sqe, fd: linux.fd_t, msg: *const linux.msghdr_const, - flags: u32, + flags: linux.MSG, ) void { - sqe.prep_rw(.SENDMSG, fd, @intFromPtr(msg), 1, 0); - sqe.rw_flags = flags; + prep_sendmsg(sqe, fd, msg, flags); + sqe.opcode = .SENDMSG_ZC; } pub fn prep_openat( @@ -747,7 +764,7 @@ pub const io_uring_sqe = extern struct { sqe.* = .{ .opcode = .CLOSE, .flags = .{}, - .ioprio = 0, + .ioprio = @bitCast(@as(u16, 0)), .fd = fd, .off = 0, .addr = 0, @@ -771,22 +788,22 @@ pub const io_uring_sqe = extern struct { sqe: *io_uring_sqe, ts: *const linux.kernel_timespec, count: u32, - flags: u32, + flags: Flags.Timeout, ) void { sqe.prep_rw(.TIMEOUT, -1, @intFromPtr(ts), 1, count); - sqe.rw_flags = flags; + sqe.rw_flags = @bitCast(flags); } - pub fn prep_timeout_remove(sqe: *io_uring_sqe, timeout_user_data: u64, flags: u32) void { + pub fn prep_timeout_remove(sqe: *io_uring_sqe, timeout_user_data: u64, flags: Flags.Timeout) void { sqe.* = .{ .opcode = .TIMEOUT_REMOVE, .flags = .{}, - .ioprio = 0, + .ioprio = @bitCast(@as(u16, 0)), .fd = -1, .off = 0, .addr = timeout_user_data, .len = 0, - .rw_flags = flags, + .rw_flags = @bitCast(flags), .user_data = 0, .buf_index = 0, .personality = 0, @@ -799,7 +816,7 @@ pub const io_uring_sqe = extern struct { pub fn prep_link_timeout( sqe: *io_uring_sqe, ts: *const linux.kernel_timespec, - flags: u32, + flags: Flags.Timeout, ) void { sqe.prep_rw(.LINK_TIMEOUT, -1, @intFromPtr(ts), 1, 0); sqe.rw_flags = flags; @@ -808,7 +825,7 @@ pub const io_uring_sqe = extern struct { pub fn prep_poll_add( sqe: *io_uring_sqe, fd: linux.fd_t, - poll_mask: u32, + poll_mask: linux.POLL, ) void { sqe.prep_rw(.POLL_ADD, fd, @intFromPtr(@as(?*anyopaque, null)), 0, 0); // Poll masks previously used to comprise of 16 bits in the flags union of @@ -831,8 +848,8 @@ pub const io_uring_sqe = extern struct { sqe: *io_uring_sqe, old_user_data: u64, new_user_data: u64, - poll_mask: u32, - flags: u32, + poll_mask: linux.POLL, + flags: Flags.Poll, ) void { sqe.prep_rw(.POLL_REMOVE, -1, old_user_data, flags, new_user_data); // Poll masks previously used to comprise of 16 bits in the flags union of @@ -854,7 +871,7 @@ pub const io_uring_sqe = extern struct { sqe.* = .{ .opcode = .FALLOCATE, .flags = .{}, - .ioprio = 0, + .ioprio = @bitCast(@as(u16, 0)), .fd = fd, .off = offset, .addr = len, @@ -873,8 +890,8 @@ pub const io_uring_sqe = extern struct { sqe: *io_uring_sqe, fd: linux.fd_t, path: [*:0]const u8, - flags: u32, - mask: u32, + flags: linux.AT, + mask: 
linux.STATX, // TODO: compose linux.STATX buf: *linux.Statx, ) void { sqe.prep_rw(.STATX, fd, @intFromPtr(path), mask, @intFromPtr(buf)); @@ -884,25 +901,30 @@ pub const io_uring_sqe = extern struct { pub fn prep_cancel( sqe: *io_uring_sqe, cancel_user_data: u64, - flags: u32, + flags: Flags.AsyncCancel, ) void { sqe.prep_rw(.ASYNC_CANCEL, -1, cancel_user_data, 0, 0); - sqe.rw_flags = flags; + sqe.rw_flags = @bitCast(flags); } pub fn prep_cancel_fd( sqe: *io_uring_sqe, fd: linux.fd_t, - flags: u32, + flags: Flags.AsyncCancel, ) void { sqe.prep_rw(.ASYNC_CANCEL, fd, 0, 0, 0); - sqe.rw_flags = flags | linux.IORING_ASYNC_CANCEL_FD; + const enable_cancel_fd = blk: { + var update_flags = flags; + update_flags.CANCEL_FD = true; + break :blk update_flags; + }; + sqe.rw_flags = @bitCast(enable_cancel_fd); } pub fn prep_shutdown( sqe: *io_uring_sqe, sockfd: linux.socket_t, - how: u32, + how: linux.SHUT, ) void { sqe.prep_rw(.SHUTDOWN, sockfd, 0, how, 0); } @@ -913,7 +935,7 @@ pub const io_uring_sqe = extern struct { old_path: [*:0]const u8, new_dir_fd: linux.fd_t, new_path: [*:0]const u8, - flags: u32, + flags: linux.RENAME, ) void { sqe.prep_rw( .RENAMEAT, @@ -930,7 +952,7 @@ pub const io_uring_sqe = extern struct { sqe: *io_uring_sqe, dir_fd: linux.fd_t, path: [*:0]const u8, - flags: u32, + flags: linux.AT, // TODO: unlink flags only AT_REMOVEDIR ) void { sqe.prep_rw(.UNLINKAT, dir_fd, @intFromPtr(path), 0, 0); sqe.rw_flags = flags; @@ -966,7 +988,7 @@ pub const io_uring_sqe = extern struct { old_path: [*:0]const u8, new_dir_fd: linux.fd_t, new_path: [*:0]const u8, - flags: u32, + flags: linux.AT, // only AT_EMPTY_PATH, AT_SYMLINK_FOLLOW ) void { sqe.prep_rw( .LINKAT, @@ -991,9 +1013,10 @@ pub const io_uring_sqe = extern struct { sqe: *io_uring_sqe, fds: []linux.fd_t, ) void { - sqe.prep_rw(.FILES_UPDATE, -1, @intFromPtr(fds.ptr), fds.len, linux.IORING_FILE_INDEX_ALLOC); + sqe.prep_rw(.FILES_UPDATE, -1, @intFromPtr(fds.ptr), fds.len, constants.FILE_INDEX_ALLOC); } + // TODO: why can't slice be used here ? 
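// One plausible answer to the TODO above: `buffers` is not one contiguous
// []u8 slice but the base address of `num` consecutive buffers of
// `buffer_len` bytes each, so a single slice length would be ambiguous.
// A hedged sketch (`pool` and `group_id` are illustrative names):
//
//     var pool: [8][512]u8 = undefined;
//     sqe.prep_provide_buffers(@ptrCast(&pool), 512, 8, group_id, 0);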
pub fn prep_provide_buffers( sqe: *io_uring_sqe, buffers: [*]u8, @@ -1016,23 +1039,12 @@ pub const io_uring_sqe = extern struct { sqe.buf_index = @intCast(group_id); } - pub fn prep_multishot_accept( - sqe: *io_uring_sqe, - fd: linux.fd_t, - addr: ?*linux.sockaddr, - addrlen: ?*linux.socklen_t, - flags: u32, - ) void { - prep_accept(sqe, fd, addr, addrlen, flags); - sqe.ioprio |= linux.IORING_ACCEPT_MULTISHOT; - } - pub fn prep_socket( sqe: *io_uring_sqe, - domain: u32, - socket_type: u32, - protocol: u32, - flags: u32, + domain: linux.AF, + socket_type: linux.SOCK, + protocol: u32, // Enumerate https://github.com/kraj/musl/blob/kraj/master/src/network/proto.c#L7 + flags: u32, // flags is unused ) void { sqe.prep_rw(.SOCKET, @intCast(domain), 0, protocol, socket_type); sqe.rw_flags = flags; @@ -1040,10 +1052,10 @@ pub const io_uring_sqe = extern struct { pub fn prep_socket_direct( sqe: *io_uring_sqe, - domain: u32, - socket_type: u32, - protocol: u32, - flags: u32, + domain: linux.AF, + socket_type: linux.SOCK, + protocol: u32, // Enumerate https://github.com/kraj/musl/blob/kraj/master/src/network/proto.c#L7 + flags: u32, // flags is unused file_index: u32, ) void { prep_socket(sqe, domain, socket_type, protocol, flags); @@ -1052,13 +1064,13 @@ pub const io_uring_sqe = extern struct { pub fn prep_socket_direct_alloc( sqe: *io_uring_sqe, - domain: u32, - socket_type: u32, - protocol: u32, - flags: u32, + domain: linux.AF, + socket_type: linux.SOCK, + protocol: u32, // Enumerate https://github.com/kraj/musl/blob/kraj/master/src/network/proto.c#L7 + flags: u32, // flags is unused ) void { prep_socket(sqe, domain, socket_type, protocol, flags); - set_target_fixed_file(sqe, linux.IORING_FILE_INDEX_ALLOC); + set_target_fixed_file(sqe, constants.FILE_INDEX_ALLOC); } pub fn prep_waitid( @@ -1066,20 +1078,21 @@ pub const io_uring_sqe = extern struct { id_type: linux.P, id: i32, infop: *linux.siginfo_t, - options: u32, - flags: u32, + options: linux.W, + flags: u32, // flags is unused ) void { sqe.prep_rw(.WAITID, id, 0, @intFromEnum(id_type), @intFromPtr(infop)); sqe.rw_flags = flags; sqe.splice_fd_in = @bitCast(options); } + // TODO: maybe remove unused flag fields? 
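// A note on the TODO above: liburing's io_uring_prep_bind() and
// io_uring_prep_listen() take no flags argument at all, so callers of the
// two wrappers below would presumably always pass 0 until the parameter is
// dropped, e.g.
//
//     sqe.prep_bind(sock_fd, &bind_addr, @sizeOf(@TypeOf(bind_addr)), 0);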
pub fn prep_bind(
 sqe: *io_uring_sqe,
 fd: linux.fd_t,
 addr: *const linux.sockaddr,
 addrlen: linux.socklen_t,
- flags: u32,
+ flags: u32, // flags is unused and doesn't exist in io_uring's api
 ) void {
 sqe.prep_rw(.BIND, fd, @intFromPtr(addr), 0, addrlen);
 sqe.rw_flags = flags;
 }
@@ -1089,7 +1102,7 @@
 pub fn prep_listen(
 sqe: *io_uring_sqe,
 fd: linux.fd_t,
 backlog: usize,
- flags: u32,
+ flags: u32, // flags is unused and doesn't exist in io_uring's api
 ) void {
 sqe.prep_rw(.LISTEN, fd, 0, backlog, 0);
 sqe.rw_flags = flags;
 }
@@ -1097,10 +1110,10 @@
 pub fn prep_cmd_sock(
 sqe: *io_uring_sqe,
- cmd_op: linux.IO_URING_SOCKET_OP,
+ cmd_op: SocketOp,
 fd: linux.fd_t,
- level: u32,
- optname: u32,
+ level: linux.SOL,
+ optname: linux.SO,
 optval: u64,
 optlen: u32,
 ) void {
@@ -1314,7 +1327,7 @@ pub const io_uring_probe_op = extern struct {
 resv2: u32,

 pub fn is_supported(self: @This()) bool {
- return self.flags & Constants.IO_URING_OP_SUPPORTED != 0;
+ return self.flags & constants.IO_URING_OP_SUPPORTED != 0;
 }
 };

From 2bd223b393cb3acd1d41eb64765c4c9be2d777f6 Mon Sep 17 00:00:00 2001
From: Bernard Assan
Date: Tue, 30 Sep 2025 17:14:03 +0000
Subject: [PATCH 03/41] move Flags that were mistakenly tagged as constants

Improve naming and namespacing of IoUring flags and types to match Zig style
move some flags from uflags to its type
Add "matches ... in liburing" doc comments to make it easy to match Zig IoUring types to liburing

Signed-off-by: Bernard Assan
---
 lib/std/os/linux/IoUring.zig | 952 ++++++++++++++++++-----------------
 1 file changed, 494 insertions(+), 458 deletions(-)

diff --git a/lib/std/os/linux/IoUring.zig b/lib/std/os/linux/IoUring.zig
index be0576deb21c..3a1f5311f689 100644
--- a/lib/std/os/linux/IoUring.zig
+++ b/lib/std/os/linux/IoUring.zig
@@ -10,11 +10,11 @@ const testing = std.testing;
 const is_linux = builtin.os.tag == .linux;
 const page_size_min = std.heap.page_size_min;

-fd: linux.fd_t = -1,
-sq: SubmissionQueue,
-cq: CompletionQueue,
-flags: Flags.Setup,
-features: Flags.Features,
+fd: posix.fd_t = -1,
+sq: Sq,
+cq: Cq,
+flags: uflags.Setup,
+features: uflags.Features,

 // COMMIT: move IoUring constants to Constants
 pub const constants = struct {
@@ -46,27 +46,12 @@ pub const constants = struct {
 pub const OFF_PBUF_SHIFT = 16;
 pub const OFF_MMAP_MASK = 0xf8000000;

- // COMMIT: new constant
- /// initialise with user provided memory pointed by user_addr
- pub const MEM_REGION_TYPE_USER = 1;
- /// expose the region as registered wait arguments
- pub const MEM_REGION_REG_WAIT_ARG = 1;
-
 /// Register a fully sparse file space, rather than pass in an array of all -1 file descriptors.
pub const RSRC_REGISTER_SPARSE = 1 << 0; /// Skip updating fd indexes set to this value in the fd table pub const REGISTER_FILES_SKIP = -2; - pub const IO_URING_OP_SUPPORTED = 1 << 0; - - // COMMIT: new constants - pub const REGISTER_SRC_REGISTERED = (1 << 0); - pub const REGISTER_DST_REPLACE = (1 << 1); - - // COMMIT: new constant - pub const REG_WAIT_TS = (1 << 0); - // COMMIT: new TX Timestamp definition /// SOCKET_URING_OP_TX_TIMESTAMP definitions pub const TIMESTAMP_HW_SHIFT = 16; @@ -78,10 +63,13 @@ pub const constants = struct { /// The bit from which area id is encoded into offsets pub const ZCRX_AREA_SHIFT = 48; pub const ZCRX_AREA_MASK = (~((1 << ZCRX_AREA_SHIFT) - 1)); + + // flag added to the opcode to use a registered ring fd + pub const REGISTER_USE_REGISTERED_RING = 1 << 31; }; // COMMIT: move IoUring flags to Flags struct -pub const Flags = struct { +pub const uflags = struct { /// io_uring_setup() flags pub const Setup = packed struct(u32) { /// io_context is polled @@ -265,7 +253,7 @@ pub const Flags = struct { // COMMIT: new Flags DONTWAIT: bool = false, POLL_FIRST: bool = false, - _unused: u29 = 0, + _unused: u13 = 0, }; /// IORING_OP_MSG_RING flags (sqe.msg_ring_flags or sqe.rw_flags in Zig Struct) @@ -295,24 +283,6 @@ pub const Flags = struct { _unused_1: u26 = 0, }; - /// sq_ring.flags - pub const Sq = packed struct(u32) { - /// needs io_uring_enter wakeup - NEED_WAKEUP: bool = false, - /// CQ ring is overflown - CQ_OVERFLOW: bool = false, - /// task should enter the kernel - TASKRUN: bool = false, - _unused: u29 = 0, - }; - - /// cq_ring.flags - pub const Cq = packed struct(u32) { - /// disable eventfd notifications - EVENTFD_DISABLED: bool = false, - _unused: u31 = 0, - }; - /// io_uring_enter(2) flags pub const Enter = packed struct(u32) { GETEVENTS: bool = false, @@ -353,19 +323,20 @@ pub const Flags = struct { }; // IO completion data structure (Completion Queue Entry) -pub const io_uring_cqe = extern struct { +pub const Cqe = extern struct { /// sqe.user_data value passed back user_data: u64, /// result code for this event res: i32, - flags: CqeFlags, + flags: Flags, // COMMIT: add big_cqe which was missing in io_uring_cqe type declaration + // TODO: add support for the IORING_SETUP_CQE32 case /// If the ring is initialized with IORING_SETUP_CQE32, then this field /// contains 16-bytes of padding, doubling the size of the CQE. - big_cqe: []u64, + // big_cqe: ?[2]u64, /// cqe.flags - pub const CqeFlags = packed struct(u32) { + pub const Flags = packed struct(u32) { /// IORING_CQE_F_BUFFER If set, the upper 16 bits are the buffer ID F_BUFFER: bool = false, /// IORING_CQE_F_MORE If set, parent SQE will generate more CQE entries @@ -400,7 +371,7 @@ pub const io_uring_cqe = extern struct { _unused_1: u16 = 0, }; - pub fn err(self: io_uring_cqe) linux.E { + pub fn err(self: Cqe) linux.E { if (self.res > -4096 and self.res < 0) { return @as(linux.E, @enumFromInt(-self.res)); } @@ -410,7 +381,7 @@ pub const io_uring_cqe = extern struct { // On successful completion of the provided buffers IO request, the CQE flags field // will have IORING_CQE_F_BUFFER set and the selected buffer ID will be indicated by // the upper 16-bits of the flags field. 
- pub fn buffer_id(self: io_uring_cqe) !u16 { + pub fn buffer_id(self: Cqe) !u16 { if (!self.flags.F_BUFFER) { return error.NoBufferSelected; } @@ -419,15 +390,16 @@ pub const io_uring_cqe = extern struct { }; /// IO submission data structure (Submission Queue Entry) -pub const io_uring_sqe = extern struct { +/// matches io_uring_sqe in liburing +pub const Sqe = extern struct { /// type of operation for this sqe opcode: Op, /// IOSQE_* flags - flags: SqeFlags, + flags: IoSqe, /// ioprio for the request ioprio: packed union { - send_recv: Flags.SendRecv, - accept: Flags.Accept, + send_recv: uflags.SendRecv, + accept: uflags.Accept, }, /// file descriptor to do IO on fd: i32, @@ -454,25 +426,25 @@ pub const io_uring_sqe = extern struct { resv: u64, /// sqe.flags - pub const SqeFlags = packed struct(u8) { + pub const IoSqe = packed struct(u8) { /// use fixed fileset - IOSQE_FIXED_FILE: bool = false, + FIXED_FILE: bool = false, /// issue after inflight IO - IOSQE_IO_DRAIN: bool = false, + IO_DRAIN: bool = false, /// links next sqe - IOSQE_IO_LINK: bool = false, + IO_LINK: bool = false, /// like LINK, but stronger - IOSQE_IO_HARDLINK: bool = false, + IO_HARDLINK: bool = false, /// always go async - IOSQE_ASYNC: bool = false, + ASYNC: bool = false, /// select buffer from sqe->buf_group - IOSQE_BUFFER_SELECT: bool = false, + BUFFER_SELECT: bool = false, /// don't post CQE if request succeeded - IOSQE_CQE_SKIP_SUCCESS: bool = false, + CQE_SKIP_SUCCESS: bool = false, _: u1 = 0, }; - pub fn prep_nop(sqe: *io_uring_sqe) void { + pub fn prep_nop(sqe: *Sqe) void { sqe.* = .{ .opcode = .NOP, .flags = .{}, @@ -491,7 +463,7 @@ pub const io_uring_sqe = extern struct { }; } - pub fn prep_fsync(sqe: *io_uring_sqe, fd: linux.fd_t, flags: Flags.Fsync) void { + pub fn prep_fsync(sqe: *Sqe, fd: linux.fd_t, flags: uflags.Fsync) void { sqe.* = .{ .opcode = .FSYNC, .flags = .{}, @@ -511,7 +483,7 @@ pub const io_uring_sqe = extern struct { } pub fn prep_rw( - sqe: *io_uring_sqe, + sqe: *Sqe, op: Op, fd: linux.fd_t, addr: u64, @@ -536,22 +508,22 @@ pub const io_uring_sqe = extern struct { }; } - pub fn prep_read(sqe: *io_uring_sqe, fd: linux.fd_t, buffer: []u8, offset: u64) void { + pub fn prep_read(sqe: *Sqe, fd: linux.fd_t, buffer: []u8, offset: u64) void { sqe.prep_rw(.READ, fd, @intFromPtr(buffer.ptr), buffer.len, offset); } - pub fn prep_write(sqe: *io_uring_sqe, fd: linux.fd_t, buffer: []const u8, offset: u64) void { + pub fn prep_write(sqe: *Sqe, fd: linux.fd_t, buffer: []const u8, offset: u64) void { sqe.prep_rw(.WRITE, fd, @intFromPtr(buffer.ptr), buffer.len, offset); } - pub fn prep_splice(sqe: *io_uring_sqe, fd_in: linux.fd_t, off_in: u64, fd_out: linux.fd_t, off_out: u64, len: usize) void { + pub fn prep_splice(sqe: *Sqe, fd_in: linux.fd_t, off_in: u64, fd_out: linux.fd_t, off_out: u64, len: usize) void { sqe.prep_rw(.SPLICE, fd_out, undefined, len, off_out); sqe.addr = off_in; sqe.splice_fd_in = fd_in; } pub fn prep_readv( - sqe: *io_uring_sqe, + sqe: *Sqe, fd: linux.fd_t, iovecs: []const std.posix.iovec, offset: u64, @@ -560,7 +532,7 @@ pub const io_uring_sqe = extern struct { } pub fn prep_writev( - sqe: *io_uring_sqe, + sqe: *Sqe, fd: linux.fd_t, iovecs: []const std.posix.iovec_const, offset: u64, @@ -568,18 +540,18 @@ pub const io_uring_sqe = extern struct { sqe.prep_rw(.WRITEV, fd, @intFromPtr(iovecs.ptr), iovecs.len, offset); } - pub fn prep_read_fixed(sqe: *io_uring_sqe, fd: linux.fd_t, buffer: *std.posix.iovec, offset: u64, buffer_index: u16) void { + pub fn prep_read_fixed(sqe: *Sqe, fd: 
linux.fd_t, buffer: *std.posix.iovec, offset: u64, buffer_index: u16) void { sqe.prep_rw(.READ_FIXED, fd, @intFromPtr(buffer.base), buffer.len, offset); sqe.buf_index = buffer_index; } - pub fn prep_write_fixed(sqe: *io_uring_sqe, fd: linux.fd_t, buffer: *std.posix.iovec, offset: u64, buffer_index: u16) void { + pub fn prep_write_fixed(sqe: *Sqe, fd: linux.fd_t, buffer: *std.posix.iovec, offset: u64, buffer_index: u16) void { sqe.prep_rw(.WRITE_FIXED, fd, @intFromPtr(buffer.base), buffer.len, offset); sqe.buf_index = buffer_index; } pub fn prep_accept( - sqe: *io_uring_sqe, + sqe: *Sqe, fd: linux.fd_t, addr: ?*linux.sockaddr, addrlen: ?*linux.socklen_t, @@ -593,7 +565,7 @@ pub const io_uring_sqe = extern struct { /// accept directly into the fixed file table pub fn prep_accept_direct( - sqe: *io_uring_sqe, + sqe: *Sqe, fd: linux.fd_t, addr: ?*linux.sockaddr, addrlen: ?*linux.socklen_t, @@ -605,7 +577,7 @@ pub const io_uring_sqe = extern struct { } pub fn prep_multishot_accept( - sqe: *io_uring_sqe, + sqe: *Sqe, fd: linux.fd_t, addr: ?*linux.sockaddr, addrlen: ?*linux.socklen_t, @@ -617,7 +589,7 @@ pub const io_uring_sqe = extern struct { /// multishot accept directly into the fixed file table pub fn prep_multishot_accept_direct( - sqe: *io_uring_sqe, + sqe: *Sqe, fd: linux.fd_t, addr: ?*linux.sockaddr, addrlen: ?*linux.socklen_t, @@ -627,7 +599,7 @@ pub const io_uring_sqe = extern struct { set_target_fixed_file(sqe, constants.FILE_INDEX_ALLOC); } - fn set_target_fixed_file(sqe: *io_uring_sqe, file_index: u32) void { + fn set_target_fixed_file(sqe: *Sqe, file_index: u32) void { const sqe_file_index: u32 = if (file_index == constants.FILE_INDEX_ALLOC) constants.FILE_INDEX_ALLOC else @@ -640,7 +612,7 @@ pub const io_uring_sqe = extern struct { } pub fn prep_connect( - sqe: *io_uring_sqe, + sqe: *Sqe, fd: linux.fd_t, addr: *const linux.sockaddr, addrlen: linux.socklen_t, @@ -650,7 +622,7 @@ pub const io_uring_sqe = extern struct { } pub fn prep_epoll_ctl( - sqe: *io_uring_sqe, + sqe: *Sqe, epfd: linux.fd_t, fd: linux.fd_t, op: u32, @@ -659,14 +631,14 @@ pub const io_uring_sqe = extern struct { sqe.prep_rw(.EPOLL_CTL, epfd, @intFromPtr(ev), op, @intCast(fd)); } - pub fn prep_recv(sqe: *io_uring_sqe, fd: linux.fd_t, buffer: []u8, flags: linux.MSG) void { + pub fn prep_recv(sqe: *Sqe, fd: linux.fd_t, buffer: []u8, flags: linux.MSG) void { sqe.prep_rw(.RECV, fd, @intFromPtr(buffer.ptr), buffer.len, 0); sqe.rw_flags = flags; } // TODO: review recv `flags` pub fn prep_recv_multishot( - sqe: *io_uring_sqe, + sqe: *Sqe, fd: linux.fd_t, buffer: []u8, flags: linux.MSG, @@ -676,7 +648,7 @@ pub const io_uring_sqe = extern struct { } pub fn prep_recvmsg( - sqe: *io_uring_sqe, + sqe: *Sqe, fd: linux.fd_t, msg: *linux.msghdr, flags: linux.MSG, @@ -686,7 +658,7 @@ pub const io_uring_sqe = extern struct { } pub fn prep_recvmsg_multishot( - sqe: *io_uring_sqe, + sqe: *Sqe, fd: linux.fd_t, msg: *linux.msghdr, flags: linux.MSG, @@ -696,18 +668,18 @@ pub const io_uring_sqe = extern struct { } // COMMIT: fix send[|recv] flag param type - pub fn prep_send(sqe: *io_uring_sqe, fd: linux.fd_t, buffer: []const u8, flags: linux.MSG) void { + pub fn prep_send(sqe: *Sqe, fd: linux.fd_t, buffer: []const u8, flags: linux.MSG) void { sqe.prep_rw(.SEND, fd, @intFromPtr(buffer.ptr), buffer.len, 0); sqe.rw_flags = flags; } - pub fn prep_send_zc(sqe: *io_uring_sqe, fd: linux.fd_t, buffer: []const u8, flags: linux.MSG, zc_flags: Flags.SendRecv) void { + pub fn prep_send_zc(sqe: *Sqe, fd: linux.fd_t, buffer: []const u8, flags: 
linux.MSG, zc_flags: uflags.SendRecv) void { sqe.prep_rw(.SEND_ZC, fd, @intFromPtr(buffer.ptr), buffer.len, 0); sqe.rw_flags = flags; sqe.ioprio = .{ .send_recv = zc_flags }; } - pub fn prep_send_zc_fixed(sqe: *io_uring_sqe, fd: linux.fd_t, buffer: []const u8, flags: linux.MSG, zc_flags: Flags.SendRecv, buf_index: u16) void { + pub fn prep_send_zc_fixed(sqe: *Sqe, fd: linux.fd_t, buffer: []const u8, flags: linux.MSG, zc_flags: uflags.SendRecv, buf_index: u16) void { const zc_flags_fixed = blk: { var updated_flags = zc_flags; updated_flags.RECVSEND_FIXED_BUF = true; @@ -718,7 +690,7 @@ pub const io_uring_sqe = extern struct { } pub fn prep_sendmsg( - sqe: *io_uring_sqe, + sqe: *Sqe, fd: linux.fd_t, msg: *const linux.msghdr_const, flags: linux.MSG, @@ -728,7 +700,7 @@ pub const io_uring_sqe = extern struct { } pub fn prep_sendmsg_zc( - sqe: *io_uring_sqe, + sqe: *Sqe, fd: linux.fd_t, msg: *const linux.msghdr_const, flags: linux.MSG, @@ -738,7 +710,7 @@ pub const io_uring_sqe = extern struct { } pub fn prep_openat( - sqe: *io_uring_sqe, + sqe: *Sqe, fd: linux.fd_t, path: [*:0]const u8, flags: linux.O, @@ -749,7 +721,7 @@ pub const io_uring_sqe = extern struct { } pub fn prep_openat_direct( - sqe: *io_uring_sqe, + sqe: *Sqe, fd: linux.fd_t, path: [*:0]const u8, flags: linux.O, @@ -760,7 +732,7 @@ pub const io_uring_sqe = extern struct { set_target_fixed_file(sqe, file_index); } - pub fn prep_close(sqe: *io_uring_sqe, fd: linux.fd_t) void { + pub fn prep_close(sqe: *Sqe, fd: linux.fd_t) void { sqe.* = .{ .opcode = .CLOSE, .flags = .{}, @@ -779,22 +751,22 @@ pub const io_uring_sqe = extern struct { }; } - pub fn prep_close_direct(sqe: *io_uring_sqe, file_index: u32) void { + pub fn prep_close_direct(sqe: *Sqe, file_index: u32) void { prep_close(sqe, 0); set_target_fixed_file(sqe, file_index); } pub fn prep_timeout( - sqe: *io_uring_sqe, + sqe: *Sqe, ts: *const linux.kernel_timespec, count: u32, - flags: Flags.Timeout, + flags: uflags.Timeout, ) void { sqe.prep_rw(.TIMEOUT, -1, @intFromPtr(ts), 1, count); sqe.rw_flags = @bitCast(flags); } - pub fn prep_timeout_remove(sqe: *io_uring_sqe, timeout_user_data: u64, flags: Flags.Timeout) void { + pub fn prep_timeout_remove(sqe: *Sqe, timeout_user_data: u64, flags: uflags.Timeout) void { sqe.* = .{ .opcode = .TIMEOUT_REMOVE, .flags = .{}, @@ -814,16 +786,16 @@ pub const io_uring_sqe = extern struct { } pub fn prep_link_timeout( - sqe: *io_uring_sqe, + sqe: *Sqe, ts: *const linux.kernel_timespec, - flags: Flags.Timeout, + flags: uflags.Timeout, ) void { sqe.prep_rw(.LINK_TIMEOUT, -1, @intFromPtr(ts), 1, 0); sqe.rw_flags = flags; } pub fn prep_poll_add( - sqe: *io_uring_sqe, + sqe: *Sqe, fd: linux.fd_t, poll_mask: linux.POLL, ) void { @@ -838,18 +810,18 @@ pub const io_uring_sqe = extern struct { } pub fn prep_poll_remove( - sqe: *io_uring_sqe, + sqe: *Sqe, target_user_data: u64, ) void { sqe.prep_rw(.POLL_REMOVE, -1, target_user_data, 0, 0); } pub fn prep_poll_update( - sqe: *io_uring_sqe, + sqe: *Sqe, old_user_data: u64, new_user_data: u64, poll_mask: linux.POLL, - flags: Flags.Poll, + flags: uflags.Poll, ) void { sqe.prep_rw(.POLL_REMOVE, -1, old_user_data, flags, new_user_data); // Poll masks previously used to comprise of 16 bits in the flags union of @@ -862,7 +834,7 @@ pub const io_uring_sqe = extern struct { } pub fn prep_fallocate( - sqe: *io_uring_sqe, + sqe: *Sqe, fd: linux.fd_t, mode: i32, offset: u64, @@ -887,7 +859,7 @@ pub const io_uring_sqe = extern struct { } pub fn prep_statx( - sqe: *io_uring_sqe, + sqe: *Sqe, fd: linux.fd_t, path: 
[*:0]const u8, flags: linux.AT, @@ -899,18 +871,18 @@ pub const io_uring_sqe = extern struct { } pub fn prep_cancel( - sqe: *io_uring_sqe, + sqe: *Sqe, cancel_user_data: u64, - flags: Flags.AsyncCancel, + flags: uflags.AsyncCancel, ) void { sqe.prep_rw(.ASYNC_CANCEL, -1, cancel_user_data, 0, 0); sqe.rw_flags = @bitCast(flags); } pub fn prep_cancel_fd( - sqe: *io_uring_sqe, + sqe: *Sqe, fd: linux.fd_t, - flags: Flags.AsyncCancel, + flags: uflags.AsyncCancel, ) void { sqe.prep_rw(.ASYNC_CANCEL, fd, 0, 0, 0); const enable_cancel_fd = blk: { @@ -922,7 +894,7 @@ pub const io_uring_sqe = extern struct { } pub fn prep_shutdown( - sqe: *io_uring_sqe, + sqe: *Sqe, sockfd: linux.socket_t, how: linux.SHUT, ) void { @@ -930,7 +902,7 @@ pub const io_uring_sqe = extern struct { } pub fn prep_renameat( - sqe: *io_uring_sqe, + sqe: *Sqe, old_dir_fd: linux.fd_t, old_path: [*:0]const u8, new_dir_fd: linux.fd_t, @@ -949,7 +921,7 @@ pub const io_uring_sqe = extern struct { } pub fn prep_unlinkat( - sqe: *io_uring_sqe, + sqe: *Sqe, dir_fd: linux.fd_t, path: [*:0]const u8, flags: linux.AT, // TODO: unlink flags only AT_REMOVEDIR @@ -959,7 +931,7 @@ pub const io_uring_sqe = extern struct { } pub fn prep_mkdirat( - sqe: *io_uring_sqe, + sqe: *Sqe, dir_fd: linux.fd_t, path: [*:0]const u8, mode: linux.mode_t, @@ -968,7 +940,7 @@ pub const io_uring_sqe = extern struct { } pub fn prep_symlinkat( - sqe: *io_uring_sqe, + sqe: *Sqe, target: [*:0]const u8, new_dir_fd: linux.fd_t, link_path: [*:0]const u8, @@ -983,7 +955,7 @@ pub const io_uring_sqe = extern struct { } pub fn prep_linkat( - sqe: *io_uring_sqe, + sqe: *Sqe, old_dir_fd: linux.fd_t, old_path: [*:0]const u8, new_dir_fd: linux.fd_t, @@ -1002,7 +974,7 @@ pub const io_uring_sqe = extern struct { } pub fn prep_files_update( - sqe: *io_uring_sqe, + sqe: *Sqe, fds: []const linux.fd_t, offset: u32, ) void { @@ -1010,7 +982,7 @@ pub const io_uring_sqe = extern struct { } pub fn prep_files_update_alloc( - sqe: *io_uring_sqe, + sqe: *Sqe, fds: []linux.fd_t, ) void { sqe.prep_rw(.FILES_UPDATE, -1, @intFromPtr(fds.ptr), fds.len, constants.FILE_INDEX_ALLOC); @@ -1018,7 +990,7 @@ pub const io_uring_sqe = extern struct { // TODO: why can't slice be used here ? 
pub fn prep_provide_buffers( - sqe: *io_uring_sqe, + sqe: *Sqe, buffers: [*]u8, buffer_len: usize, num: usize, @@ -1031,7 +1003,7 @@ pub const io_uring_sqe = extern struct { } pub fn prep_remove_buffers( - sqe: *io_uring_sqe, + sqe: *Sqe, num: usize, group_id: usize, ) void { @@ -1040,7 +1012,7 @@ pub const io_uring_sqe = extern struct { } pub fn prep_socket( - sqe: *io_uring_sqe, + sqe: *Sqe, domain: linux.AF, socket_type: linux.SOCK, protocol: u32, // Enumerate https://github.com/kraj/musl/blob/kraj/master/src/network/proto.c#L7 @@ -1051,7 +1023,7 @@ pub const io_uring_sqe = extern struct { } pub fn prep_socket_direct( - sqe: *io_uring_sqe, + sqe: *Sqe, domain: linux.AF, socket_type: linux.SOCK, protocol: u32, // Enumerate https://github.com/kraj/musl/blob/kraj/master/src/network/proto.c#L7 @@ -1063,7 +1035,7 @@ pub const io_uring_sqe = extern struct { } pub fn prep_socket_direct_alloc( - sqe: *io_uring_sqe, + sqe: *Sqe, domain: linux.AF, socket_type: linux.SOCK, protocol: u32, // Enumerate https://github.com/kraj/musl/blob/kraj/master/src/network/proto.c#L7 @@ -1074,7 +1046,7 @@ pub const io_uring_sqe = extern struct { } pub fn prep_waitid( - sqe: *io_uring_sqe, + sqe: *Sqe, id_type: linux.P, id: i32, infop: *linux.siginfo_t, @@ -1088,7 +1060,7 @@ pub const io_uring_sqe = extern struct { // TODO: maybe remove unused flag fields? pub fn prep_bind( - sqe: *io_uring_sqe, + sqe: *Sqe, fd: linux.fd_t, addr: *const linux.sockaddr, addrlen: linux.socklen_t, @@ -1099,7 +1071,7 @@ pub const io_uring_sqe = extern struct { } pub fn prep_listen( - sqe: *io_uring_sqe, + sqe: *Sqe, fd: linux.fd_t, backlog: usize, flags: u32, // flags is unused and does't exist in io_uring's api @@ -1109,7 +1081,7 @@ pub const io_uring_sqe = extern struct { } pub fn prep_cmd_sock( - sqe: *io_uring_sqe, + sqe: *Sqe, cmd_op: SocketOp, fd: linux.fd_t, level: linux.SOL, @@ -1134,20 +1106,21 @@ pub const io_uring_sqe = extern struct { sqe.addr3 = optval; } - pub fn set_flags(sqe: *io_uring_sqe, flags: io_uring_sqe.SqeFlags) void { + pub fn set_flags(sqe: *Sqe, flags: Sqe.IoSqe) void { const updated_flags = @as(u8, @bitCast(sqe.flags)) | @as(u8, @bitCast(flags)); sqe.flags = @bitCast(updated_flags); } /// This SQE forms a link with the next SQE in the submission ring. Next SQE /// will not be started before this one completes. Forms a chain of SQEs. - pub fn link_next(sqe: *io_uring_sqe) void { - sqe.flags.IOSQE_IO_LINK = true; + pub fn link_next(sqe: *Sqe) void { + sqe.flags.IO_LINK = true; } }; /// Filled with the offset for mmap(2) -pub const io_sqring_offsets = extern struct { +/// matches io_sqring_offsets in liburing +pub const SqOffsets = extern struct { /// offset of ring head head: u32, /// offset of ring tail @@ -1166,7 +1139,8 @@ pub const io_sqring_offsets = extern struct { user_addr: u64, }; -pub const io_cqring_offsets = extern struct { +/// matches io_cqring_offsets in liburing +pub const CqOffsets = extern struct { head: u32, tail: u32, ring_mask: u32, @@ -1179,21 +1153,23 @@ pub const io_cqring_offsets = extern struct { }; /// Passed in for io_uring_setup(2). 
Copied back with updated info on success -pub const io_uring_params = extern struct { +/// matches io_uring_params in liburing +pub const Params = extern struct { sq_entries: u32, cq_entries: u32, - flags: u32, + flags: uflags.Setup, sq_thread_cpu: u32, sq_thread_idle: u32, - features: u32, + features: uflags.Features, wq_fd: u32, resv: [3]u32, - sq_off: io_sqring_offsets, - cq_off: io_cqring_offsets, + sq_off: SqOffsets, + cq_off: CqOffsets, }; -// io_uring_register opcodes and arguments -pub const RegisterOp = enum(u32) { +/// io_uring_register(2) opcodes and arguments +/// matches io_uring_register_op in liburing +pub const RegisterOp = enum(u8) { REGISTER_BUFFERS, UNREGISTER_BUFFERS, REGISTER_FILES, @@ -1262,13 +1238,11 @@ pub const RegisterOp = enum(u32) { // query various aspects of io_uring, see linux/io_uring/query.h REGISTER_QUERY, - // flag added to the opcode to use a registered ring fd - REGISTER_USE_REGISTERED_RING = 1 << 31, - _, }; -/// io_uring_restriction.opcode values +/// io-wq worker categories +/// matches io_wq_type in liburing pub const IoWqCategory = enum(u8) { BOUND, UNBOUND, @@ -1278,24 +1252,40 @@ pub const IoWqCategory = enum(u8) { // deprecated, see struct io_uring_rsrc_update // COMMIT: add new io_uring_region_desc struct -pub const io_uring_region_desc = extern struct { +/// matches io_uring_region_desc in liburing +pub const RegionDesc = extern struct { user_addr: u64, size: u64, - flags: u32, + flags: Flags, id: u32, mmap_offset: u64, __resv: [4]u64, + + // COMMIT: new constant + /// initialise with user provided memory pointed by user_addr + pub const Flags = packed struct(u32) { + TYPE_USER: bool = false, + _: u31 = 0, + }; }; // COMMIT: add new io_uring_mem_region_reg struct -pub const io_uring_mem_region_reg = extern struct { - /// struct io_uring_region_desc +/// matches io_uring_mem_region_reg in liburing +pub const MemRegionReg = extern struct { + /// struct io_uring_region_desc (RegionDesc in Zig) region_uptr: u64, - flags: u64, + flags: Flags, __resv: [2]u64, + + /// expose the region as registered wait arguments + pub const Flags = packed struct(u64) { + REG_WAIT_ARG: bool = false, + _: u63 = 0, + }; }; -pub const io_uring_rsrc_register = extern struct { +/// matches io_uring_rsrc_register in liburing +pub const RsrcRegister = extern struct { nr: u32, flags: u32, resv2: u64, @@ -1303,13 +1293,15 @@ pub const io_uring_rsrc_register = extern struct { tags: u64, }; -pub const io_uring_rsrc_update = extern struct { +/// matches io_uring_rsrc_update in liburing +pub const RsrcUpdate = extern struct { offset: u32, resv: u32, data: u64, }; -pub const io_uring_rsrc_update2 = extern struct { +/// matches io_uring_rsrc_update2 in liburing +pub const RsrcUpdate2 = extern struct { offset: u32, resv: u32, data: u64, @@ -1318,27 +1310,32 @@ pub const io_uring_rsrc_update2 = extern struct { resv2: u32, }; -pub const io_uring_probe_op = extern struct { +/// matches io_uring_probe_op in liburing +pub const ProbeOp = extern struct { op: Op, resv: u8, - // TODO: which flags are these exactly :) - /// IO_URING_OP_* flags - flags: u16, + flags: Flags, resv2: u32, - pub fn is_supported(self: @This()) bool { - return self.flags & constants.IO_URING_OP_SUPPORTED != 0; + pub const Flags = packed struct(u16) { + OP_SUPPORTED: bool = false, + _: u15 = 0, + }; + + pub fn is_supported(self: ProbeOp) bool { + return self.flags.OP_SUPPORTED; } }; -pub const io_uring_probe = extern struct { +/// matches io_uring_probe in liburing +pub const Probe = extern struct { /// Last 
opcode supported
 last_op: Op,
 /// Length of ops[] array below
 ops_len: u8,
 resv: u16,
 resv2: [3]u32,
- ops: [256]io_uring_probe_op,
+ ops: [256]ProbeOp,

 /// Is the operation supported on the running kernel.
 pub fn is_supported(self: @This(), op: Op) bool {
@@ -1350,12 +1347,13 @@
 };

 // COMMIT: fix definition of io_uring_restriction
-// register_op: RegisterRestrictionOp field
-pub const io_uring_restriction = extern struct {
+// RegisterOp is actually u8
+/// matches io_uring_restriction in liburing
+pub const Restriction = extern struct {
 opcode: RestrictionOp,
 arg: extern union {
 /// IORING_RESTRICTION_REGISTER_OP
- register_op: RegisterRestrictionOp,
+ register_op: RegisterOp,
 /// IORING_RESTRICTION_SQE_OP
 sqe_op: Op,
 /// IORING_RESTRICTION_SQE_FLAGS_*
@@ -1366,29 +1364,40 @@
 };

 // COMMIT: add new struct type
-pub const io_uring_clock_register = extern struct {
+/// matches io_uring_clock_register in liburing
+pub const ClockRegister = extern struct {
 clockid: u32,
 __resv: [3]u32,
 };

 // COMMIT: add new struct type
-pub const io_uring_clone_buffers = extern struct {
+/// matches io_uring_clone_buffers in liburing
+pub const CloneBuffers = extern struct {
 src_fd: u32,
- flags: u32,
+ flags: Flags,
 src_off: u32,
 dst_off: u32,
 nr: u32,
 pad: [3]u32,
+
+ // COMMIT: new flags
+ pub const Flags = packed struct(u32) {
+ REGISTER_SRC_REGISTERED: bool = false,
+ REGISTER_DST_REPLACE: bool = false,
+ _: u30 = 0,
+ };
 };

-pub const io_uring_buf = extern struct {
+/// matches io_uring_buf in liburing
+pub const Buffer = extern struct {
 addr: u64,
 len: u32,
 bid: u16,
 resv: u16,
 };

-pub const io_uring_buf_ring = extern struct {
+/// matches io_uring_buf_ring in liburing
+pub const BufferRing = extern struct {
 resv1: u64,
 resv2: u32,
 resv3: u16,
@@ -1396,16 +1405,17 @@
 };

 /// argument for IORING_(UN)REGISTER_PBUF_RING
-pub const io_uring_buf_reg = extern struct {
+/// matches io_uring_buf_reg in liburing
+pub const BufferRegister = extern struct {
 ring_addr: u64,
 ring_entries: u32,
 bgid: u16,
- flags: RegisterPbufFlags,
+ flags: Flags,
 resv: [3]u64,

 // COMMIT: new IORING_REGISTER_PBUF_RING flags
 /// Flags for IORING_REGISTER_PBUF_RING.
- pub const RegisterPbufFlags = packed struct(u16) {
+ pub const Flags = packed struct(u16) {
 /// IOU_PBUF_RING_MMAP:
 /// If set, kernel will allocate the memory for the ring.
 /// The application must not set a ring_addr in struct io_uring_buf_reg
@@ -1427,7 +1437,8 @@
 };

 /// argument for IORING_REGISTER_PBUF_STATUS
-pub const io_uring_buf_status = extern struct {
+/// matches io_uring_buf_status in liburing
+pub const BufferStatus = extern struct {
 /// input
 buf_group: u32,
 /// output
@@ -1436,7 +1447,8 @@
 };

 /// argument for IORING_(UN)REGISTER_NAPI
-pub const io_uring_napi = extern struct {
+/// matches io_uring_napi in liburing
+pub const Napi = extern struct {
 busy_poll_to: u32,
 prefer_busy_poll: u8,
 pad: [3]u8,
@@ -1448,42 +1460,56 @@
 /// IORING_GETEVENTS | IORING_ENTER_EXT_ARG_REG set, where the actual argument
 /// is an index into a previously registered fixed wait region described by
 /// the below structure.
-pub const io_uring_reg_wait = extern struct { +/// matches io_uring_reg_wait in liburing +pub const RegisterWait = extern struct { ts: linux.kernel_timespec, min_wait_usec: u32, - flags: u32, + flags: Flags, sigmask: u64, sigmask_sz: u32, pad: [3]u32, pad2: [2]u64, + + // COMMIT: new constant + pub const Flags = packed struct(u32) { + REG_WAIT_TS: bool = false, + _: u31 = 0, + }; }; /// Argument for io_uring_enter(2) with IORING_GETEVENTS | IORING_ENTER_EXT_ARG -pub const io_uring_getevents_arg = extern struct { +/// matches io_uring_getevents_arg in liburing +pub const GetEventsArg = extern struct { sigmask: u64, sigmask_sz: u32, pad: u32, ts: u64, }; +// COMMIT: fix type definition of io_uring_sync_cancel_reg /// Argument for IORING_REGISTER_SYNC_CANCEL -pub const io_uring_sync_cancel_reg = extern struct { +/// matches io_uring_sync_cancel_reg in liburing +pub const SyncCancelRegister = extern struct { addr: u64, fd: i32, - flags: u32, + flags: uflags.AsyncCancel, timeout: linux.kernel_timespec, - pad: [4]u64, + opcode: Op, + pad: [7]u8, + pad2: [4]u64, }; /// Argument for IORING_REGISTER_FILE_ALLOC_RANGE /// The range is specified as [off, off + len) -pub const io_uring_file_index_range = extern struct { +/// matches io_uring_file_index_range in liburing +pub const FileIndexRange = extern struct { off: u32, len: u32, resv: u64, }; -pub const io_uring_recvmsg_out = extern struct { +/// matches io_uring_recvmsg_out in liburing +pub const RecvmsgOut = extern struct { namelen: u32, controllen: u32, payloadlen: u32, @@ -1491,18 +1517,21 @@ pub const io_uring_recvmsg_out = extern struct { }; /// Zero copy receive refill queue entry -pub const io_uring_zcrx_rqe = extern struct { +/// matches io_uring_zcrx_rqe in liburing +pub const ZcrxRqe = extern struct { off: u64, len: u32, __pad: u32, }; -pub const io_uring_zcrx_cqe = extern struct { +/// matches io_uring_zcrx_cqe in liburing +pub const ZcrxCqe = extern struct { off: u64, __pad: u64, }; -pub const io_uring_zcrx_offsets = extern struct { +/// matches io_uring_zcrx_offsets in liburing +pub const ZcrxOffsets = extern struct { head: u32, tail: u32, rqes: u32, @@ -1510,31 +1539,34 @@ pub const io_uring_zcrx_offsets = extern struct { __resv: [2]u64, }; -pub const io_uring_zcrx_area_reg = extern struct { +/// matches io_uring_zcrx_area_reg in liburing +pub const ZcrxAreaRegister = extern struct { addr: u64, len: u64, rq_area_token: u64, - flags: zcrx_area_flags, + flags: Flags, dmabuf_fd: u32, __resv2: [2]u64, - pub const zcrx_area_flags = packed struct(u32) { + pub const Flags = packed struct(u32) { DMABUF: bool = false, _: u31 = 0, }; }; /// Argument for IORING_REGISTER_ZCRX_IFQ -pub const io_uring_zcrx_ifq_reg = extern struct { +/// matches io_uring_zcrx_ifq_reg in liburing +pub const ZcrxIfqRegister = extern struct { if_idx: u32, if_rxq: u32, rq_entries: u32, + // TODO: find out its flags, I suspect its ZcrxAreaRegister.Flags flags: u32, /// pointer to struct io_uring_zcrx_area_reg area_ptr: u64, /// struct io_uring_region_desc region_ptr: u64, - offsets: io_uring_zcrx_offsets, + offsets: ZcrxOffsets, zcrx_id: u32, __resv2: u32, __resv: [3]u64, @@ -1550,23 +1582,7 @@ pub const SocketOp = enum(u16) { }; /// io_uring_restriction.opcode values -pub const RegisterRestrictionOp = enum(u8) { - /// Allow an io_uring_register(2) opcode - REGISTER_OP = 0, - - /// Allow an sqe opcode - SQE_OP = 1, - - /// Allow sqe flags - SQE_FLAGS_ALLOWED = 2, - - /// Require sqe flags (these flags must be set on each submission) - SQE_FLAGS_REQUIRED = 3, - - _, -}; 
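// A hedged sketch of how a restriction is meant to be expressed with the
// Restriction type above, assuming a ring created with .R_DISABLED and
// registered through io_uring_register(2) (assuming the RegisterOp enum
// carries REGISTER_RESTRICTIONS, as the kernel's io_uring_register_op does;
// the resv field names mirror the kernel's io_uring_restriction struct and
// are assumptions here):
const allow_accept_only: IoUring.Restriction = .{
    .opcode = .SQE_OP,
    .arg = .{ .sqe_op = .ACCEPT },
    .resv = 0,
    .resv2 = .{ 0, 0, 0 },
};
_ = allow_accept_only;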
-
-/// io_uring_restriction.opcode values
+/// matches io_uring_register_restriction_op in liburing
 pub const RestrictionOp = enum(u16) {
 /// Allow an io_uring_register(2) opcode
 REGISTER_OP = 0,
@@ -1664,28 +1680,28 @@ pub const Op = enum(u8) {
 /// call on how many entries the submission and completion queues will ultimately have,
 /// see https://github.com/torvalds/linux/blob/v5.8/fs/io_uring.c#L8027-L8050.
 /// Matches the interface of io_uring_queue_init() in liburing.
-pub fn init(entries: u16, flags: Flags.Setup) !IoUring {
- var params = mem.zeroInit(linux.io_uring_params, .{
- .flags = @bitCast(flags),
+pub fn init(entries: u16, flags: uflags.Setup) !IoUring {
+ var params = mem.zeroInit(Params, .{
+ .flags = flags,
 .sq_thread_idle = 1000,
 });
- return try IoUring.init_params(entries, &params);
+ return try .init_params(entries, &params);
 }

 /// A powerful way to setup an io_uring, if you want to tweak linux.io_uring_params such as submission
 /// queue thread cpu affinity or thread idle timeout (the kernel and our default is 1 second).
 /// `params` is passed by reference because the kernel needs to modify the parameters.
 /// Matches the interface of io_uring_queue_init_params() in liburing.
-pub fn init_params(entries: u16, p: *linux.io_uring_params) !IoUring {
+pub fn init_params(entries: u16, p: *Params) !IoUring {
 if (entries == 0) return error.EntriesZero;
 if (!std.math.isPowerOfTwo(entries)) return error.EntriesNotPowerOfTwo;

 assert(p.sq_entries == 0);
- assert(p.features == 0);
+ assert(@as(u32, @bitCast(p.features)) == 0);
 assert(p.resv[0] == 0);
 assert(p.resv[1] == 0);
 assert(p.resv[2] == 0);

- const flags: Flags.Setup = @bitCast(p.flags);
+ const flags: uflags.Setup = @bitCast(p.flags);
 assert(p.cq_entries == 0 or flags.CQSIZE);
 assert(p.wq_fd == 0 or flags.ATTACH_WQ);
@@ -1715,7 +1731,7 @@ pub fn init_params(entries: u16, p: *linux.io_uring_params) !IoUring {
 assert(fd >= 0);
 errdefer posix.close(fd);

- const features: Flags.Features = @bitCast(p.features);
+ const features: uflags.Features = @bitCast(p.features);
 // Kernel versions 5.4 and up use only one mmap() for the submission and completion queues.
 // This is not an optional feature for us... if the kernel does it, we have to do it.
 // The thinking on this by the kernel developers was that both the submission and the
@@ -1736,9 +1752,9 @@ pub fn init_params(entries: u16, p: *linux.io_uring_params) !IoUring {
 // From here on, we only need to read from params, so pass `p` by value as immutable.
 // The completion queue shares the mmap with the submission queue, so pass `sq` there too.
- var sq = try SubmissionQueue.init(fd, p.*);
+ var sq = try Sq.init(fd, p.*);
 errdefer sq.deinit();
- var cq = try CompletionQueue.init(fd, p.*, sq);
+ var cq = try Cq.init(fd, p.*, sq);
 errdefer cq.deinit();

 // Check that our starting state is as we expect.
@@ -1783,7 +1799,7 @@ pub fn deinit(self: *IoUring) void {
 /// and the null return in liburing is more a C idiom than anything else, for lack of a better
 /// alternative. In Zig, we have first-class error handling... so let's use it.
 /// Matches the implementation of io_uring_get_sqe() in liburing.
-pub fn get_sqe(self: *IoUring) !*io_uring_sqe {
+pub fn get_sqe(self: *IoUring) !*Sqe {
 const head = @atomicLoad(u32, self.sq.head, .acquire);
 // Remember that these head and tail offsets wrap around every four billion operations.
 // We must therefore use wrapping addition and subtraction to avoid a runtime crash.
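// The wrapping head/tail arithmetic above stays internal to the ring; the
// caller-facing pattern is just get_sqe -> prep -> submit, as in this
// minimal round-trip sketch (assuming an initialized `ring: IoUring`; the
// user_data value is arbitrary):
const nop_sqe = try ring.get_sqe();
nop_sqe.prep_nop();
nop_sqe.user_data = 42;
_ = try ring.submit_and_wait(1);
const nop_cqe = try ring.copy_cqe();
assert(nop_cqe.user_data == 42);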
@@ -1810,7 +1826,7 @@ pub fn submit(self: *IoUring) !u32 { /// Matches the implementation of io_uring_submit_and_wait() in liburing. pub fn submit_and_wait(self: *IoUring, wait_nr: u32) !u32 { const submitted = self.flush_sq(); - var flags: Flags.Enter = .{}; + var flags: uflags.Enter = .{}; if (self.sq_ring_needs_enter(&flags) or wait_nr > 0) { if (wait_nr > 0 or self.flags.IOPOLL) { flags.GETEVENTS = true; @@ -1822,7 +1838,7 @@ pub fn submit_and_wait(self: *IoUring, wait_nr: u32) !u32 { /// Tell the kernel we have submitted SQEs and/or want to wait for CQEs. /// Returns the number of SQEs submitted. -pub fn enter(self: *IoUring, to_submit: u32, min_complete: u32, flags: Flags.Enter) !u32 { +pub fn enter(self: *IoUring, to_submit: u32, min_complete: u32, flags: uflags.Enter) !u32 { assert(self.fd >= 0); const res = linux.io_uring_enter(self.fd, to_submit, min_complete, flags, null); switch (linux.errno(res)) { @@ -1882,10 +1898,10 @@ pub fn flush_sq(self: *IoUring) u32 { /// or if IORING_SQ_NEED_WAKEUP is set and the SQ thread must be explicitly awakened. /// For the latter case, we set the SQ thread wakeup flag. /// Matches the implementation of sq_ring_needs_enter() in liburing. -pub fn sq_ring_needs_enter(self: *IoUring, flags: *Flags.Enter) bool { +pub fn sq_ring_needs_enter(self: *IoUring, flags: *uflags.Enter) bool { assert(@as(u32, @bitCast(flags.*)) == 0); if (!self.flags.SQPOLL) return true; - if ((@atomicLoad(u32, self.sq.flags, .unordered) & linux.IORING_SQ_NEED_WAKEUP) != 0) { + if (@atomicLoad(Sq.Flags, self.sq.flags, .unordered).NEED_WAKEUP) { flags.*.SQ_WAKEUP = true; return true; } @@ -1919,17 +1935,17 @@ pub fn cq_ready(self: *IoUring) u32 { /// Faster, because we can now amortize the atomic store release to `cq.head` across the batch. /// See https://github.com/axboe/liburing/issues/103#issuecomment-686665007. /// Matches the implementation of io_uring_peek_batch_cqe() in liburing, but supports waiting. -pub fn copy_cqes(self: *IoUring, cqes: []io_uring_cqe, wait_nr: u32) !u32 { +pub fn copy_cqes(self: *IoUring, cqes: []Cqe, wait_nr: u32) !u32 { const count = self.copy_cqes_ready(cqes); if (count > 0) return count; if (self.cq_ring_needs_flush() or wait_nr > 0) { - _ = try self.enter(0, wait_nr, linux.IORING_ENTER_GETEVENTS); + _ = try self.enter(0, wait_nr, .{ .GETEVENTS = true }); return self.copy_cqes_ready(cqes); } return 0; } -fn copy_cqes_ready(self: *IoUring, cqes: []io_uring_cqe) u32 { +fn copy_cqes_ready(self: *IoUring, cqes: []Cqe) u32 { const ready = self.cq_ready(); const count = @min(cqes.len, ready); const head = self.cq.head.* & self.cq.mask; @@ -1950,8 +1966,8 @@ fn copy_cqes_ready(self: *IoUring, cqes: []io_uring_cqe) u32 { /// Returns a copy of an I/O completion, waiting for it if necessary, and advancing the CQ ring. /// A convenience method for `copy_cqes()` for when you don't need to batch or peek. -pub fn copy_cqe(ring: *IoUring) !io_uring_cqe { - var cqes: [1]io_uring_cqe = undefined; +pub fn copy_cqe(ring: *IoUring) !Cqe { + var cqes: [1]Cqe = undefined; while (true) { const count = try ring.copy_cqes(&cqes, 1); if (count > 0) return cqes[0]; @@ -1960,7 +1976,7 @@ pub fn copy_cqe(ring: *IoUring) !io_uring_cqe { /// Matches the implementation of cq_ring_needs_flush() in liburing. 
pub fn cq_ring_needs_flush(self: *IoUring) bool { - return (@atomicLoad(u32, self.sq.flags, .unordered) & linux.IORING_SQ_CQ_OVERFLOW) != 0; + return @atomicLoad(Sq.Flags, self.sq.flags, .unordered).CQ_OVERFLOW; } /// Only for advanced use cases that implement custom completion queue methods. @@ -1968,7 +1984,7 @@ pub fn cq_ring_needs_flush(self: *IoUring) bool { /// Must be called exactly once after a zero-copy CQE has been processed by your application. /// Not idempotent; calling it more than once will result in other CQEs being lost. /// Matches the implementation of cqe_seen() in liburing. -pub fn cqe_seen(self: *IoUring, cqe: *io_uring_cqe) void { +pub fn cqe_seen(self: *IoUring, cqe: *Cqe) void { _ = cqe; self.cq_advance(1); } @@ -1991,7 +2007,7 @@ pub fn cq_advance(self: *IoUring, count: u32) void { /// apply to the write, since the fsync may complete before the write is issued to the disk. /// You should preferably use `link_with_next_sqe()` on a write's SQE to link it with an fsync, /// or else insert a full write barrier using `drain_previous_sqes()` when queueing an fsync. -pub fn fsync(self: *IoUring, user_data: u64, fd: posix.fd_t, flags: Flags.Fsync) !*io_uring_sqe { +pub fn fsync(self: *IoUring, user_data: u64, fd: posix.fd_t, flags: uflags.Fsync) !*Sqe { const sqe = try self.get_sqe(); sqe.prep_fsync(fd, flags); sqe.user_data = user_data; @@ -2003,7 +2019,7 @@ pub fn fsync(self: *IoUring, user_data: u64, fd: posix.fd_t, flags: Flags.Fsync) /// A no-op is more useful than may appear at first glance. /// For example, you could call `drain_previous_sqes()` on the returned SQE, to use the no-op to /// know when the ring is idle before acting on a kill signal. -pub fn nop(self: *IoUring, user_data: u64) !*io_uring_sqe { +pub fn nop(self: *IoUring, user_data: u64) !*Sqe { const sqe = try self.get_sqe(); sqe.prep_nop(); sqe.user_data = user_data; @@ -2039,14 +2055,14 @@ pub fn read( fd: linux.fd_t, buffer: ReadBuffer, offset: u64, -) !*io_uring_sqe { +) !*Sqe { const sqe = try self.get_sqe(); switch (buffer) { .buffer => |slice| sqe.prep_read(fd, slice, offset), .iovecs => |vecs| sqe.prep_readv(fd, vecs, offset), .buffer_selection => |selection| { sqe.prep_rw(.READ, fd, 0, selection.len, offset); - sqe.flags |= linux.IOSQE_BUFFER_SELECT; + sqe.flags.BUFFER_SELECT = true; sqe.buf_index = selection.group_id; }, } @@ -2062,7 +2078,7 @@ pub fn write( fd: linux.fd_t, buffer: []const u8, offset: u64, -) !*io_uring_sqe { +) !*Sqe { const sqe = try self.get_sqe(); sqe.prep_write(fd, buffer, offset); sqe.user_data = user_data; @@ -2083,7 +2099,7 @@ pub fn write( /// See https://github.com/axboe/liburing/issues/291 /// /// Returns a pointer to the SQE so that you can further modify the SQE for advanced use cases.
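///
/// Reviewer sketch (not part of the original patch), mirroring the "splice/read"
/// test below; `fd_src`/`fd_dst` are hypothetical open file descriptors and
/// `fds` comes from `posix.pipe()`:
///
///     const pipe_offset: u64 = std.math.maxInt(u64);
///     _ = try ring.splice(0x1, fd_src, 0, fds[1], pipe_offset, 512);
///     _ = try ring.splice(0x2, fds[0], pipe_offset, fd_dst, 0, 512);
///     _ = try ring.submit();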
-pub fn splice(self: *IoUring, user_data: u64, fd_in: posix.fd_t, off_in: u64, fd_out: posix.fd_t, off_out: u64, len: usize) !*io_uring_sqe { +pub fn splice(self: *IoUring, user_data: u64, fd_in: posix.fd_t, off_in: u64, fd_out: posix.fd_t, off_out: u64, len: usize) !*Sqe { const sqe = try self.get_sqe(); sqe.prep_splice(fd_in, off_in, fd_out, off_out, len); sqe.user_data = user_data; @@ -2102,7 +2118,7 @@ pub fn read_fixed( buffer: *posix.iovec, offset: u64, buffer_index: u16, -) !*io_uring_sqe { +) !*Sqe { const sqe = try self.get_sqe(); sqe.prep_read_fixed(fd, buffer, offset, buffer_index); sqe.user_data = user_data; @@ -2119,7 +2135,7 @@ pub fn writev( fd: linux.fd_t, iovecs: []const posix.iovec_const, offset: u64, -) !*io_uring_sqe { +) !*Sqe { const sqe = try self.get_sqe(); sqe.prep_writev(fd, iovecs, offset); sqe.user_data = user_data; @@ -2138,7 +2154,7 @@ pub fn write_fixed( buffer: *posix.iovec, offset: u64, buffer_index: u16, -) !*io_uring_sqe { +) !*Sqe { const sqe = try self.get_sqe(); sqe.prep_write_fixed(fd, buffer, offset, buffer_index); sqe.user_data = user_data; @@ -2155,7 +2171,7 @@ pub fn accept( addr: ?*posix.sockaddr, addrlen: ?*posix.socklen_t, flags: u32, -) !*io_uring_sqe { +) !*Sqe { const sqe = try self.get_sqe(); sqe.prep_accept(fd, addr, addrlen, flags); sqe.user_data = user_data; @@ -2177,7 +2193,7 @@ pub fn accept_multishot( addr: ?*posix.sockaddr, addrlen: ?*posix.socklen_t, flags: u32, -) !*io_uring_sqe { +) !*Sqe { const sqe = try self.get_sqe(); sqe.prep_multishot_accept(fd, addr, addrlen, flags); sqe.user_data = user_data; @@ -2202,9 +2218,9 @@ pub fn accept_direct( addr: ?*posix.sockaddr, addrlen: ?*posix.socklen_t, flags: u32, -) !*io_uring_sqe { +) !*Sqe { const sqe = try self.get_sqe(); - sqe.prep_accept_direct(fd, addr, addrlen, flags, linux.IORING_FILE_INDEX_ALLOC); + sqe.prep_accept_direct(fd, addr, addrlen, flags, constants.FILE_INDEX_ALLOC); sqe.user_data = user_data; return sqe; } @@ -2218,7 +2234,7 @@ pub fn accept_multishot_direct( addr: ?*posix.sockaddr, addrlen: ?*posix.socklen_t, flags: u32, -) !*io_uring_sqe { +) !*Sqe { const sqe = try self.get_sqe(); sqe.prep_multishot_accept_direct(fd, addr, addrlen, flags); sqe.user_data = user_data; @@ -2233,7 +2249,7 @@ pub fn connect( fd: linux.fd_t, addr: *const posix.sockaddr, addrlen: posix.socklen_t, -) !*io_uring_sqe { +) !*Sqe { const sqe = try self.get_sqe(); sqe.prep_connect(fd, addr, addrlen); sqe.user_data = user_data; @@ -2249,7 +2265,7 @@ pub fn epoll_ctl( fd: linux.fd_t, op: u32, ev: ?*linux.epoll_event, -) !*io_uring_sqe { +) !*Sqe { const sqe = try self.get_sqe(); sqe.prep_epoll_ctl(epfd, fd, op, ev); sqe.user_data = user_data; @@ -2279,14 +2295,14 @@ pub fn recv( fd: linux.fd_t, buffer: RecvBuffer, flags: u32, -) !*io_uring_sqe { +) !*Sqe { const sqe = try self.get_sqe(); switch (buffer) { .buffer => |slice| sqe.prep_recv(fd, slice, flags), .buffer_selection => |selection| { sqe.prep_rw(.RECV, fd, 0, selection.len, 0); sqe.rw_flags = flags; - sqe.flags |= linux.IOSQE_BUFFER_SELECT; + sqe.flags.BUFFER_SELECT = true; sqe.buf_index = selection.group_id; }, } @@ -2303,7 +2319,7 @@ pub fn send( fd: linux.fd_t, buffer: []const u8, flags: u32, -) !*io_uring_sqe { +) !*Sqe { const sqe = try self.get_sqe(); sqe.prep_send(fd, buffer, flags); sqe.user_data = user_data; @@ -2332,7 +2348,7 @@ pub fn send_zc( buffer: []const u8, send_flags: u32, zc_flags: u16, -) !*io_uring_sqe { +) !*Sqe { const sqe = try self.get_sqe(); sqe.prep_send_zc(fd, buffer, send_flags, zc_flags); sqe.user_data = 
user_data; @@ -2350,7 +2366,7 @@ pub fn send_zc_fixed( send_flags: u32, zc_flags: u16, buf_index: u16, -) !*io_uring_sqe { +) !*Sqe { const sqe = try self.get_sqe(); sqe.prep_send_zc_fixed(fd, buffer, send_flags, zc_flags, buf_index); sqe.user_data = user_data; @@ -2366,7 +2382,7 @@ pub fn recvmsg( fd: linux.fd_t, msg: *linux.msghdr, flags: u32, -) !*io_uring_sqe { +) !*Sqe { const sqe = try self.get_sqe(); sqe.prep_recvmsg(fd, msg, flags); sqe.user_data = user_data; @@ -2382,7 +2398,7 @@ pub fn sendmsg( fd: linux.fd_t, msg: *const linux.msghdr_const, flags: u32, -) !*io_uring_sqe { +) !*Sqe { const sqe = try self.get_sqe(); sqe.prep_sendmsg(fd, msg, flags); sqe.user_data = user_data; @@ -2398,7 +2414,7 @@ pub fn sendmsg_zc( fd: linux.fd_t, msg: *const linux.msghdr_const, flags: u32, -) !*io_uring_sqe { +) !*Sqe { const sqe = try self.get_sqe(); sqe.prep_sendmsg_zc(fd, msg, flags); sqe.user_data = user_data; @@ -2415,7 +2431,7 @@ pub fn openat( path: [*:0]const u8, flags: linux.O, mode: posix.mode_t, -) !*io_uring_sqe { +) !*Sqe { const sqe = try self.get_sqe(); sqe.prep_openat(fd, path, flags, mode); sqe.user_data = user_data; @@ -2441,7 +2457,7 @@ pub fn openat_direct( flags: linux.O, mode: posix.mode_t, file_index: u32, -) !*io_uring_sqe { +) !*Sqe { const sqe = try self.get_sqe(); sqe.prep_openat_direct(fd, path, flags, mode, file_index); sqe.user_data = user_data; @@ -2451,7 +2467,7 @@ pub fn openat_direct( /// Queues (but does not submit) an SQE to perform a `close(2)`. /// Returns a pointer to the SQE. /// Available since 5.6. -pub fn close(self: *IoUring, user_data: u64, fd: posix.fd_t) !*io_uring_sqe { +pub fn close(self: *IoUring, user_data: u64, fd: posix.fd_t) !*Sqe { const sqe = try self.get_sqe(); sqe.prep_close(fd); sqe.user_data = user_data; @@ -2460,7 +2476,7 @@ pub fn close(self: *IoUring, user_data: u64, fd: posix.fd_t) !*io_uring_sqe { /// Queues close of registered file descriptor. 
/// Available since 5.15 -pub fn close_direct(self: *IoUring, user_data: u64, file_index: u32) !*io_uring_sqe { +pub fn close_direct(self: *IoUring, user_data: u64, file_index: u32) !*Sqe { const sqe = try self.get_sqe(); sqe.prep_close_direct(file_index); sqe.user_data = user_data; @@ -2486,7 +2502,7 @@ pub fn timeout( ts: *const linux.kernel_timespec, count: u32, flags: u32, -) !*io_uring_sqe { +) !*Sqe { const sqe = try self.get_sqe(); sqe.prep_timeout(ts, count, flags); sqe.user_data = user_data; @@ -2506,7 +2522,7 @@ pub fn timeout_remove( user_data: u64, timeout_user_data: u64, flags: u32, -) !*io_uring_sqe { +) !*Sqe { const sqe = try self.get_sqe(); sqe.prep_timeout_remove(timeout_user_data, flags); sqe.user_data = user_data; @@ -2534,7 +2550,7 @@ pub fn link_timeout( user_data: u64, ts: *const linux.kernel_timespec, flags: u32, -) !*io_uring_sqe { +) !*Sqe { const sqe = try self.get_sqe(); sqe.prep_link_timeout(ts, flags); sqe.user_data = user_data; @@ -2548,7 +2564,7 @@ pub fn poll_add( user_data: u64, fd: linux.fd_t, poll_mask: u32, -) !*io_uring_sqe { +) !*Sqe { const sqe = try self.get_sqe(); sqe.prep_poll_add(fd, poll_mask); sqe.user_data = user_data; @@ -2561,7 +2577,7 @@ pub fn poll_remove( self: *IoUring, user_data: u64, target_user_data: u64, -) !*io_uring_sqe { +) !*Sqe { const sqe = try self.get_sqe(); sqe.prep_poll_remove(target_user_data); sqe.user_data = user_data; @@ -2577,7 +2593,7 @@ pub fn poll_update( new_user_data: u64, poll_mask: u32, flags: u32, -) !*io_uring_sqe { +) !*Sqe { const sqe = try self.get_sqe(); sqe.prep_poll_update(old_user_data, new_user_data, poll_mask, flags); sqe.user_data = user_data; @@ -2593,7 +2609,7 @@ pub fn fallocate( mode: i32, offset: u64, len: u64, -) !*io_uring_sqe { +) !*Sqe { const sqe = try self.get_sqe(); sqe.prep_fallocate(fd, mode, offset, len); sqe.user_data = user_data; @@ -2610,7 +2626,7 @@ pub fn statx( flags: u32, mask: u32, buf: *linux.Statx, -) !*io_uring_sqe { +) !*Sqe { const sqe = try self.get_sqe(); sqe.prep_statx(fd, path, flags, mask, buf); sqe.user_data = user_data; @@ -2630,7 +2646,7 @@ pub fn cancel( user_data: u64, cancel_user_data: u64, flags: u32, -) !*io_uring_sqe { +) !*Sqe { const sqe = try self.get_sqe(); sqe.prep_cancel(cancel_user_data, flags); sqe.user_data = user_data; @@ -2646,7 +2662,7 @@ pub fn shutdown( user_data: u64, sockfd: posix.socket_t, how: u32, -) !*io_uring_sqe { +) !*Sqe { const sqe = try self.get_sqe(); sqe.prep_shutdown(sockfd, how); sqe.user_data = user_data; @@ -2663,7 +2679,7 @@ pub fn renameat( new_dir_fd: linux.fd_t, new_path: [*:0]const u8, flags: u32, -) !*io_uring_sqe { +) !*Sqe { const sqe = try self.get_sqe(); sqe.prep_renameat(old_dir_fd, old_path, new_dir_fd, new_path, flags); sqe.user_data = user_data; @@ -2678,7 +2694,7 @@ pub fn unlinkat( dir_fd: linux.fd_t, path: [*:0]const u8, flags: u32, -) !*io_uring_sqe { +) !*Sqe { const sqe = try self.get_sqe(); sqe.prep_unlinkat(dir_fd, path, flags); sqe.user_data = user_data; @@ -2693,7 +2709,7 @@ pub fn mkdirat( dir_fd: linux.fd_t, path: [*:0]const u8, mode: posix.mode_t, -) !*io_uring_sqe { +) !*Sqe { const sqe = try self.get_sqe(); sqe.prep_mkdirat(dir_fd, path, mode); sqe.user_data = user_data; @@ -2708,7 +2724,7 @@ pub fn symlinkat( target: [*:0]const u8, new_dir_fd: linux.fd_t, link_path: [*:0]const u8, -) !*io_uring_sqe { +) !*Sqe { const sqe = try self.get_sqe(); sqe.prep_symlinkat(target, new_dir_fd, link_path); sqe.user_data = user_data; @@ -2725,7 +2741,7 @@ pub fn linkat( new_dir_fd: linux.fd_t, new_path: 
[*:0]const u8, flags: u32, -) !*io_uring_sqe { +) !*Sqe { const sqe = try self.get_sqe(); sqe.prep_linkat(old_dir_fd, old_path, new_dir_fd, new_path, flags); sqe.user_data = user_data; @@ -2746,7 +2762,7 @@ pub fn provide_buffers( buffers_count: usize, group_id: usize, buffer_id: usize, -) !*io_uring_sqe { +) !*Sqe { const sqe = try self.get_sqe(); sqe.prep_provide_buffers(buffers, buffer_size, buffers_count, group_id, buffer_id); sqe.user_data = user_data; @@ -2760,7 +2776,7 @@ pub fn remove_buffers( user_data: u64, buffers_count: usize, group_id: usize, -) !*io_uring_sqe { +) !*Sqe { const sqe = try self.get_sqe(); sqe.prep_remove_buffers(buffers_count, group_id); sqe.user_data = user_data; @@ -2777,7 +2793,7 @@ pub fn waitid( infop: *linux.siginfo_t, options: u32, flags: u32, -) !*io_uring_sqe { +) !*Sqe { const sqe = try self.get_sqe(); sqe.prep_waitid(id_type, id, infop, options, flags); sqe.user_data = user_data; @@ -2821,7 +2837,7 @@ pub fn register_files_update(self: *IoUring, offset: u32, fds: []const linux.fd_ resv: u32, fds: u64 align(8), }; - var update = FilesUpdate{ + var update: FilesUpdate = .{ .offset = offset, .resv = @as(u32, 0), .fds = @as(u64, @intFromPtr(fds.ptr)), @@ -2840,9 +2856,9 @@ pub fn register_files_update(self: *IoUring, offset: u32, fds: []const linux.fd_ pub fn register_files_sparse(self: *IoUring, nr_files: u32) !void { assert(self.fd >= 0); - const reg = &linux.io_uring_rsrc_register{ + const reg: RsrcRegister = .{ .nr = nr_files, - .flags = linux.IORING_RSRC_REGISTER_SPARSE, + .flags = constants.RSRC_REGISTER_SPARSE, .resv2 = 0, .data = 0, .tags = 0, @@ -2851,7 +2867,7 @@ pub fn register_files_sparse(self: *IoUring, nr_files: u32) !void { const res = linux.io_uring_register( self.fd, .REGISTER_FILES2, - @ptrCast(reg), + @ptrCast(&reg), @as(u32, @sizeOf(linux.io_uring_rsrc_register)), ); @@ -2863,7 +2879,7 @@ pub fn register_files_sparse(self: *IoUring, nr_files: u32) !void { pub fn register_file_alloc_range(self: *IoUring, offset: u32, len: u32) !void { assert(self.fd >= 0); - const range = &linux.io_uring_file_index_range{ + const range: FileIndexRange = .{ .off = offset, .len = len, .resv = 0, @@ -2872,7 +2888,7 @@ pub fn register_file_alloc_range(self: *IoUring, offset: u32, len: u32) !void { const res = linux.io_uring_register( self.fd, .REGISTER_FILE_ALLOC_RANGE, - @ptrCast(range), + @ptrCast(&range), @as(u32, @sizeOf(linux.io_uring_file_index_range)), ); @@ -2920,13 +2936,13 @@ pub fn unregister_eventfd(self: *IoUring) !void { try handle_registration_result(res); } -pub fn register_napi(self: *IoUring, napi: *linux.io_uring_napi) !void { +pub fn register_napi(self: *IoUring, napi: *Napi) !void { assert(self.fd >= 0); const res = linux.io_uring_register(self.fd, .REGISTER_NAPI, napi, 1); try handle_registration_result(res); } -pub fn unregister_napi(self: *IoUring, napi: *linux.io_uring_napi) !void { +pub fn unregister_napi(self: *IoUring, napi: *Napi) !void { assert(self.fd >= 0); const res = linux.io_uring_register(self.fd, .UNREGISTER_NAPI, napi, 1); try handle_registration_result(res); @@ -2955,11 +2971,11 @@ pub fn unregister_buffers(self: *IoUring) !void { } } -/// Returns a io_uring_probe which is used to probe the capabilities of the -/// io_uring subsystem of the running kernel. The io_uring_probe contains the +/// Returns a Probe which is used to probe the capabilities of the +/// io_uring subsystem of the running kernel. The Probe contains the /// list of supported operations.
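///
/// Reviewer sketch (not part of the original patch): gating a newer opcode on
/// kernel support; the `ops` layout and the `constants.IO_URING_OP_SUPPORTED`
/// check are assumed to match their liburing counterparts:
///
///     const probe = try ring.get_probe();
///     const op = probe.ops[@intFromEnum(Op.SEND_ZC)];
///     if ((op.flags & constants.IO_URING_OP_SUPPORTED) == 0) return error.SystemOutdated;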
-pub fn get_probe(self: *IoUring) !linux.io_uring_probe { - var probe = mem.zeroInit(linux.io_uring_probe, .{}); +pub fn get_probe(self: *IoUring) !Probe { + var probe = mem.zeroInit(Probe, .{}); const res = linux.io_uring_register(self.fd, .REGISTER_PROBE, &probe, probe.ops.len); try handle_register_buf_ring_result(res); return probe; @@ -3008,7 +3024,7 @@ pub fn socket( socket_type: u32, protocol: u32, flags: u32, -) !*io_uring_sqe { +) !*Sqe { const sqe = try self.get_sqe(); sqe.prep_socket(domain, socket_type, protocol, flags); sqe.user_data = user_data; @@ -3025,7 +3041,7 @@ pub fn socket_direct( protocol: u32, flags: u32, file_index: u32, -) !*io_uring_sqe { +) !*Sqe { const sqe = try self.get_sqe(); sqe.prep_socket_direct(domain, socket_type, protocol, flags, file_index); sqe.user_data = user_data; @@ -3042,7 +3058,7 @@ pub fn socket_direct_alloc( socket_type: u32, protocol: u32, flags: u32, -) !*io_uring_sqe { +) !*Sqe { const sqe = try self.get_sqe(); sqe.prep_socket_direct_alloc(domain, socket_type, protocol, flags); sqe.user_data = user_data; @@ -3059,7 +3075,7 @@ pub fn bind( addr: *const posix.sockaddr, addrlen: posix.socklen_t, flags: u32, -) !*io_uring_sqe { +) !*Sqe { const sqe = try self.get_sqe(); sqe.prep_bind(fd, addr, addrlen, flags); sqe.user_data = user_data; @@ -3075,7 +3091,7 @@ pub fn listen( fd: linux.fd_t, backlog: usize, flags: u32, -) !*io_uring_sqe { +) !*Sqe { const sqe = try self.get_sqe(); sqe.prep_listen(fd, backlog, flags); sqe.user_data = user_data; @@ -3088,13 +3104,13 @@ pub fn listen( pub fn cmd_sock( self: *IoUring, user_data: u64, - cmd_op: linux.IO_URING_SOCKET_OP, + cmd_op: SocketOp, fd: linux.fd_t, level: u32, // linux.SOL optname: u32, // linux.SO optval: u64, // pointer to the option value optlen: u32, // size of the option value -) !*io_uring_sqe { +) !*Sqe { const sqe = try self.get_sqe(); sqe.prep_cmd_sock(cmd_op, fd, level, optname, optval, optlen); sqe.user_data = user_data; @@ -3111,7 +3127,7 @@ pub fn setsockopt( level: u32, // linux.SOL optname: u32, // linux.SO opt: []const u8, -) !*io_uring_sqe { +) !*Sqe { return try self.cmd_sock( user_data, .SETSOCKOPT, @@ -3133,7 +3149,7 @@ pub fn getsockopt( level: u32, // linux.SOL optname: u32, // linux.SO opt: []u8, -) !*io_uring_sqe { +) !*Sqe { return try self.cmd_sock( user_data, .GETSOCKOPT, @@ -3145,14 +3161,15 @@ pub fn getsockopt( ); } -pub const SubmissionQueue = struct { +/// matches io_uring_sq in liburing +pub const Sq = struct { head: *u32, tail: *u32, mask: u32, - flags: *u32, + flags: *Flags, dropped: *u32, array: []u32, - sqes: []io_uring_sqe, + sqes: []Sqe, mmap: []align(page_size_min) u8, mmap_sqes: []align(page_size_min) u8, @@ -3163,12 +3180,23 @@ pub const SubmissionQueue = struct { sqe_head: u32 = 0, sqe_tail: u32 = 0, - pub fn init(fd: linux.fd_t, p: linux.io_uring_params) !SubmissionQueue { + /// sq_ring.flags + pub const Flags = packed struct(u32) { + /// needs io_uring_enter wakeup + NEED_WAKEUP: bool = false, + /// CQ ring is overflown + CQ_OVERFLOW: bool = false, + /// task should enter the kernel + TASKRUN: bool = false, + _unused: u29 = 0, + }; + + pub fn init(fd: posix.fd_t, p: Params) !Sq { assert(fd >= 0); - assert((p.features & linux.IORING_FEAT_SINGLE_MMAP) != 0); + assert(p.features.SINGLE_MMAP); const size = @max( p.sq_off.array + p.sq_entries * @sizeOf(u32), - p.cq_off.cqes + p.cq_entries * @sizeOf(io_uring_cqe), + p.cq_off.cqes + p.cq_entries * @sizeOf(Cqe), ); const mmap = try posix.mmap( null, @@ -3176,27 +3204,27 @@ pub const SubmissionQueue = struct { 
posix.PROT.READ | posix.PROT.WRITE, .{ .TYPE = .SHARED, .POPULATE = true }, fd, - linux.IORING_OFF_SQ_RING, + constants.OFF_SQ_RING, ); errdefer posix.munmap(mmap); assert(mmap.len == size); // The motivation for the `sqes` and `array` indirection is to make it possible for the // application to preallocate static io_uring_sqe entries and then replay them when needed. - const size_sqes = p.sq_entries * @sizeOf(io_uring_sqe); + const size_sqes = p.sq_entries * @sizeOf(Sqe); const mmap_sqes = try posix.mmap( null, size_sqes, posix.PROT.READ | posix.PROT.WRITE, .{ .TYPE = .SHARED, .POPULATE = true }, fd, - linux.IORING_OFF_SQES, + constants.OFF_SQES, ); errdefer posix.munmap(mmap_sqes); assert(mmap_sqes.len == size_sqes); const array: [*]u32 = @ptrCast(@alignCast(&mmap[p.sq_off.array])); - const sqes: [*]io_uring_sqe = @ptrCast(@alignCast(&mmap_sqes[0])); + const sqes: [*]Sqe = @ptrCast(@alignCast(&mmap_sqes[0])); // We expect the kernel copies p.sq_entries to the u32 pointed to by p.sq_off.ring_entries, // see https://github.com/torvalds/linux/blob/v5.8/fs/io_uring.c#L7843-L7844. assert(p.sq_entries == @as(*u32, @ptrCast(@alignCast(&mmap[p.sq_off.ring_entries]))).*); @@ -3213,26 +3241,35 @@ pub const SubmissionQueue = struct { }; } - pub fn deinit(self: *SubmissionQueue) void { + pub fn deinit(self: *Sq) void { posix.munmap(self.mmap_sqes); posix.munmap(self.mmap); } }; -pub const CompletionQueue = struct { +/// matches io_uring_cq in liburing +pub const Cq = struct { head: *u32, tail: *u32, mask: u32, overflow: *u32, - cqes: []io_uring_cqe, + cqes: []Cqe, - pub fn init(fd: linux.fd_t, p: linux.io_uring_params, sq: SubmissionQueue) !CompletionQueue { + /// cq_ring.flags + pub const Flags = packed struct(u32) { + /// disable eventfd notifications + EVENTFD_DISABLED: bool = false, + _unused: u31 = 0, + }; + + pub fn init(fd: posix.fd_t, p: Params, sq: Sq) !Cq { assert(fd >= 0); - assert((p.features & linux.IORING_FEAT_SINGLE_MMAP) != 0); + const features: uflags.Features = @bitCast(p.features); + assert(features.SINGLE_MMAP); const mmap = sq.mmap; - const cqes: [*]io_uring_cqe = @ptrCast(@alignCast(&mmap[p.cq_off.cqes])); + const cqes: [*]Cqe = @ptrCast(@alignCast(&mmap[p.cq_off.cqes])); assert(p.cq_entries == @as(*u32, @ptrCast(@alignCast(&mmap[p.cq_off.ring_entries]))).*); - return CompletionQueue{ + return .{ .head = @ptrCast(@alignCast(&mmap[p.cq_off.head])), .tail = @ptrCast(@alignCast(&mmap[p.cq_off.tail])), .mask = @as(*u32, @ptrCast(@alignCast(&mmap[p.cq_off.ring_mask]))).*, @@ -3241,7 +3278,7 @@ pub const CompletionQueue = struct { }; } - pub fn deinit(self: *CompletionQueue) void { + pub fn deinit(self: *Cq) void { _ = self; // A no-op since we now share the mmap with the submission queue. // Here for symmetry with the submission queue, and for any future feature support. @@ -3272,7 +3309,7 @@ pub const BufferGroup = struct { /// Pointer to the memory shared by the kernel. /// `buffers_count` of `io_uring_buf` structures are shared by the kernel. /// First `io_uring_buf` is overlaid by `io_uring_buf_ring` struct. - br: *align(page_size_min) linux.io_uring_buf_ring, + br: *align(page_size_min) BufferRing, /// Contiguous block of memory of size (buffers_count * buffer_size). buffers: []u8, /// Size of each buffer in buffers. @@ -3327,20 +3364,20 @@ pub const BufferGroup = struct { } // Prepare recv operation which will select buffer from this group. 
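    // Reviewer sketch (not part of the original patch): a full receive round
    // trip with a buffer group, assuming `ring` and `buf_grp` are initialized
    // and `fd` is a connected socket:
    //
    //     _ = try buf_grp.recv(0x1, fd, 0);
    //     _ = try ring.submit();
    //     const cqe = try ring.copy_cqe();
    //     const data = try buf_grp.get(cqe); // resolve the kernel-selected buffer
    //     // ... consume data ...
    //     try buf_grp.put(cqe); // hand the buffer back to the kernel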
- pub fn recv(self: *BufferGroup, user_data: u64, fd: posix.fd_t, flags: u32) !*io_uring_sqe { + pub fn recv(self: *BufferGroup, user_data: u64, fd: posix.fd_t, flags: u32) !*Sqe { var sqe = try self.ring.get_sqe(); sqe.prep_rw(.RECV, fd, 0, 0, 0); sqe.rw_flags = flags; - sqe.flags |= linux.IOSQE_BUFFER_SELECT; + sqe.flags.BUFFER_SELECT = true; sqe.buf_index = self.group_id; sqe.user_data = user_data; return sqe; } // Prepare multishot recv operation which will select buffer from this group. - pub fn recv_multishot(self: *BufferGroup, user_data: u64, fd: posix.fd_t, flags: u32) !*io_uring_sqe { + pub fn recv_multishot(self: *BufferGroup, user_data: u64, fd: posix.fd_t, flags: u32) !*Sqe { var sqe = try self.recv(user_data, fd, flags); - sqe.ioprio |= linux.IORING_RECV_MULTISHOT; + sqe.ioprio.send_recv.RECV_MULTISHOT = true; return sqe; } @@ -3351,16 +3388,16 @@ pub const BufferGroup = struct { } // Get buffer by CQE. - pub fn get(self: *BufferGroup, cqe: io_uring_cqe) ![]u8 { + pub fn get(self: *BufferGroup, cqe: Cqe) ![]u8 { const buffer_id = try cqe.buffer_id(); const used_len = @as(usize, @intCast(cqe.res)); return self.get_by_id(buffer_id)[0..used_len]; } // Release buffer from CQE to the kernel. - pub fn put(self: *BufferGroup, cqe: io_uring_cqe) !void { + pub fn put(self: *BufferGroup, cqe: Cqe) !void { const buffer_id = try cqe.buffer_id(); - if (cqe.flags & linux.IORING_CQE_F_BUF_MORE == linux.IORING_CQE_F_BUF_MORE) { + if (cqe.flags.F_BUF_MORE) { // Incremental consumption active, kernel will write to this buffer again const used_len = @as(u32, @intCast(cqe.res)); // Track what part of the buffer is used @@ -3385,12 +3422,12 @@ pub fn setup_buf_ring( fd: linux.fd_t, entries: u16, group_id: u16, - flags: linux.io_uring_buf_reg.Flags, -) !*align(page_size_min) linux.io_uring_buf_ring { + flags: BufferRegister.Flags, +) !*align(page_size_min) BufferRing { if (entries == 0 or entries > 1 << 15) return error.EntriesNotInRange; if (!std.math.isPowerOfTwo(entries)) return error.EntriesNotPowerOfTwo; - const mmap_size = @as(usize, entries) * @sizeOf(linux.io_uring_buf); + const mmap_size = @as(usize, entries) * @sizeOf(Buffer); const mmap = try posix.mmap( null, mmap_size, @@ -3402,7 +3439,7 @@ pub fn setup_buf_ring( errdefer posix.munmap(mmap); assert(mmap.len == mmap_size); - const br: *align(page_size_min) linux.io_uring_buf_ring = @ptrCast(mmap.ptr); + const br: *align(page_size_min) BufferRing = @ptrCast(mmap.ptr); try register_buf_ring(fd, @intFromPtr(br), entries, group_id, flags); return br; } @@ -3412,9 +3449,9 @@ fn register_buf_ring( addr: u64, entries: u32, group_id: u16, - flags: linux.io_uring_buf_reg.Flags, + flags: BufferRegister.Flags, ) !void { - var reg = mem.zeroInit(linux.io_uring_buf_reg, .{ + var reg = mem.zeroInit(BufferRegister, .{ .ring_addr = addr, .ring_entries = entries, .bgid = group_id, @@ -3430,8 +3467,8 @@ fn register_buf_ring( try handle_register_buf_ring_result(res); } -fn unregister_buf_ring(fd: linux.fd_t, group_id: u16) !void { - var reg = mem.zeroInit(linux.io_uring_buf_reg, .{ +fn unregister_buf_ring(fd: posix.fd_t, group_id: u16) !void { + var reg = mem.zeroInit(BufferRegister, .{ .bgid = group_id, }); const res = linux.io_uring_register( @@ -3452,16 +3489,16 @@ fn handle_register_buf_ring_result(res: usize) !void { } // Unregisters a previously registered shared buffer ring, returned from io_uring_setup_buf_ring.
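// Reviewer sketch (not part of the original patch): the full lifecycle of a
// shared buffer ring, using only functions defined in this file; `ring`,
// `entries`, `group_id`, and `buffer` are hypothetical, and `.{}` assumes a
// default-initializable BufferRegister.Flags:
//
//     const br = try setup_buf_ring(ring.fd, entries, group_id, .{});
//     buf_ring_init(br);
//     const mask = buf_ring_mask(entries);
//     buf_ring_add(br, buffer, 0, mask, 0);
//     buf_ring_advance(br, 1);
//     // ... submit BUFFER_SELECT operations and consume their CQEs ...
//     free_buf_ring(ring.fd, br, entries, group_id);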
-pub fn free_buf_ring(fd: linux.fd_t, br: *align(page_size_min) linux.io_uring_buf_ring, entries: u32, group_id: u16) void { +pub fn free_buf_ring(fd: posix.fd_t, br: *align(page_size_min) BufferRing, entries: u32, group_id: u16) void { unregister_buf_ring(fd, group_id) catch {}; var mmap: []align(page_size_min) u8 = undefined; mmap.ptr = @ptrCast(br); - mmap.len = entries * @sizeOf(linux.io_uring_buf); + mmap.len = entries * @sizeOf(Buffer); posix.munmap(mmap); } /// Initialises `br` so that it is ready to be used. -pub fn buf_ring_init(br: *linux.io_uring_buf_ring) void { +pub fn buf_ring_init(br: *BufferRing) void { br.tail = 0; } @@ -3477,14 +3514,14 @@ pub fn buf_ring_mask(entries: u16) u16 { /// If just one buffer is provided before the ring tail is committed with advance then offset should be 0. /// If buffers are provided in a loop before being committed, the offset must be incremented by one for each buffer added. pub fn buf_ring_add( - br: *linux.io_uring_buf_ring, + br: *BufferRing, buffer: []u8, buffer_id: u16, mask: u16, buffer_offset: u16, ) void { - const bufs: [*]linux.io_uring_buf = @ptrCast(br); - const buf: *linux.io_uring_buf = &bufs[(br.tail +% buffer_offset) & mask]; + const bufs: [*]Buffer = @ptrCast(br); + const buf: *Buffer = &bufs[(br.tail +% buffer_offset) & mask]; buf.addr = @intFromPtr(buffer.ptr); buf.len = @intCast(buffer.len); @@ -3493,7 +3530,7 @@ pub fn buf_ring_add( /// Make `count` new buffers visible to the kernel. Called after /// `io_uring_buf_ring_add` has been called `count` times to fill in new buffers. -pub fn buf_ring_advance(br: *linux.io_uring_buf_ring, count: u16) void { +pub fn buf_ring_advance(br: *BufferRing, count: u16) void { const tail: u16 = br.tail +% count; @atomicStore(u16, &br.tail, tail, .release); } @@ -3501,22 +3538,22 @@ pub fn buf_ring_advance(br: *linux.io_uring_buf_ring, count: u16) void { test "structs/offsets/entries" { if (!is_linux) return error.SkipZigTest; - try testing.expectEqual(@as(usize, 120), @sizeOf(linux.io_uring_params)); - try testing.expectEqual(@as(usize, 64), @sizeOf(io_uring_sqe)); - try testing.expectEqual(@as(usize, 16), @sizeOf(io_uring_cqe)); + try testing.expectEqual(@as(usize, 120), @sizeOf(Params)); + try testing.expectEqual(@as(usize, 64), @sizeOf(Sqe)); + try testing.expectEqual(@as(usize, 16), @sizeOf(Cqe)); - try testing.expectEqual(0, linux.IORING_OFF_SQ_RING); - try testing.expectEqual(0x8000000, linux.IORING_OFF_CQ_RING); - try testing.expectEqual(0x10000000, linux.IORING_OFF_SQES); + try testing.expectEqual(0, constants.OFF_SQ_RING); + try testing.expectEqual(0x8000000, constants.OFF_CQ_RING); + try testing.expectEqual(0x10000000, constants.OFF_SQES); - try testing.expectError(error.EntriesZero, IoUring.init(0, 0)); - try testing.expectError(error.EntriesNotPowerOfTwo, IoUring.init(3, 0)); + try testing.expectError(error.EntriesZero, IoUring.init(0, .{})); + try testing.expectError(error.EntriesNotPowerOfTwo, IoUring.init(3, .{})); } test "nop" { if (!is_linux) return error.SkipZigTest; - var ring = IoUring.init(1, 0) catch |err| switch (err) { + var ring = IoUring.init(1, .{}) catch |err| switch (err) { error.SystemOutdated => return error.SkipZigTest, error.PermissionDenied => return error.SkipZigTest, else => return err, @@ -3527,7 +3564,7 @@ test "nop" { } const sqe = try ring.nop(0xaaaaaaaa); - try testing.expectEqual(io_uring_sqe{ + try testing.expectEqual(Sqe{ .opcode = .NOP, .flags = 0, .ioprio = 0, @@ -3558,7 +3595,7 @@ test "nop" { try testing.expectEqual(@as(u32, 0), 
ring.cq.head.*); try testing.expectEqual(@as(u32, 0), ring.sq_ready()); - try testing.expectEqual(io_uring_cqe{ + try testing.expectEqual(Cqe{ .user_data = 0xaaaaaaaa, .res = 0, .flags = 0, @@ -3567,9 +3604,9 @@ test "nop" { try testing.expectEqual(@as(u32, 0), ring.cq_ready()); const sqe_barrier = try ring.nop(0xbbbbbbbb); - sqe_barrier.flags |= linux.IOSQE_IO_DRAIN; + sqe_barrier.flags.IO_DRAIN = true; try testing.expectEqual(@as(u32, 1), try ring.submit()); - try testing.expectEqual(io_uring_cqe{ + try testing.expectEqual(Cqe{ .user_data = 0xbbbbbbbb, .res = 0, .flags = 0, @@ -3583,7 +3620,7 @@ test "nop" { test "readv" { if (!is_linux) return error.SkipZigTest; - var ring = IoUring.init(1, 0) catch |err| switch (err) { + var ring = IoUring.init(1, .{}) catch |err| switch (err) { error.SystemOutdated => return error.SkipZigTest, error.PermissionDenied => return error.SkipZigTest, else => return err, @@ -3608,11 +3645,11 @@ test "readv" { var iovecs = [_]posix.iovec{posix.iovec{ .base = &buffer, .len = buffer.len }}; const sqe = try ring.read(0xcccccccc, fd_index, .{ .iovecs = iovecs[0..] }, 0); try testing.expectEqual(Op.READV, sqe.opcode); - sqe.flags |= linux.IOSQE_FIXED_FILE; + sqe.flags.FIXED_FILE = true; try testing.expectError(error.SubmissionQueueFull, ring.nop(0)); try testing.expectEqual(@as(u32, 1), try ring.submit()); - try testing.expectEqual(io_uring_cqe{ + try testing.expectEqual(Cqe{ .user_data = 0xcccccccc, .res = buffer.len, .flags = 0, @@ -3625,7 +3662,7 @@ test "readv" { test "writev/fsync/readv" { if (!is_linux) return error.SkipZigTest; - var ring = IoUring.init(4, 0) catch |err| switch (err) { + var ring = IoUring.init(4, .{}) catch |err| switch (err) { error.SystemOutdated => return error.SkipZigTest, error.PermissionDenied => return error.SkipZigTest, else => return err, @@ -3652,12 +3689,12 @@ test "writev/fsync/readv" { const sqe_writev = try ring.writev(0xdddddddd, fd, iovecs_write[0..], 17); try testing.expectEqual(Op.WRITEV, sqe_writev.opcode); try testing.expectEqual(@as(u64, 17), sqe_writev.off); - sqe_writev.flags |= linux.IOSQE_IO_LINK; + sqe_writev.flags.IO_LINK = true; const sqe_fsync = try ring.fsync(0xeeeeeeee, fd, 0); try testing.expectEqual(Op.FSYNC, sqe_fsync.opcode); try testing.expectEqual(fd, sqe_fsync.fd); - sqe_fsync.flags |= linux.IOSQE_IO_LINK; + sqe_fsync.flags.IO_LINK = true; const sqe_readv = try ring.read(0xffffffff, fd, .{ .iovecs = iovecs_read[0..] 
}, 17); try testing.expectEqual(Op.READV, sqe_readv.opcode); @@ -3668,21 +3705,21 @@ test "writev/fsync/readv" { try testing.expectEqual(@as(u32, 0), ring.sq_ready()); try testing.expectEqual(@as(u32, 3), ring.cq_ready()); - try testing.expectEqual(io_uring_cqe{ + try testing.expectEqual(Cqe{ .user_data = 0xdddddddd, .res = buffer_write.len, .flags = 0, }, try ring.copy_cqe()); try testing.expectEqual(@as(u32, 2), ring.cq_ready()); - try testing.expectEqual(io_uring_cqe{ + try testing.expectEqual(Cqe{ .user_data = 0xeeeeeeee, .res = 0, .flags = 0, }, try ring.copy_cqe()); try testing.expectEqual(@as(u32, 1), ring.cq_ready()); - try testing.expectEqual(io_uring_cqe{ + try testing.expectEqual(Cqe{ .user_data = 0xffffffff, .res = buffer_read.len, .flags = 0, @@ -3695,7 +3732,7 @@ test "writev/fsync/readv" { test "write/read" { if (!is_linux) return error.SkipZigTest; - var ring = IoUring.init(2, 0) catch |err| switch (err) { + var ring = IoUring.init(2, .{}) catch |err| switch (err) { error.SystemOutdated => return error.SkipZigTest, error.PermissionDenied => return error.SkipZigTest, else => return err, @@ -3714,7 +3751,7 @@ test "write/read" { const sqe_write = try ring.write(0x11111111, fd, buffer_write[0..], 10); try testing.expectEqual(Op.WRITE, sqe_write.opcode); try testing.expectEqual(@as(u64, 10), sqe_write.off); - sqe_write.flags |= linux.IOSQE_IO_LINK; + sqe_write.flags.IO_LINK = true; const sqe_read = try ring.read(0x22222222, fd, .{ .buffer = buffer_read[0..] }, 10); try testing.expectEqual(Op.READ, sqe_read.opcode); try testing.expectEqual(@as(u64, 10), sqe_read.off); @@ -3726,12 +3763,12 @@ test "write/read" { // https://lwn.net/Articles/809820/ if (cqe_write.err() == .INVAL) return error.SkipZigTest; if (cqe_read.err() == .INVAL) return error.SkipZigTest; - try testing.expectEqual(io_uring_cqe{ + try testing.expectEqual(Cqe{ .user_data = 0x11111111, .res = buffer_write.len, .flags = 0, }, cqe_write); - try testing.expectEqual(io_uring_cqe{ + try testing.expectEqual(Cqe{ .user_data = 0x22222222, .res = buffer_read.len, .flags = 0, @@ -3742,7 +3779,7 @@ test "write/read" { test "splice/read" { if (!is_linux) return error.SkipZigTest; - var ring = IoUring.init(4, 0) catch |err| switch (err) { + var ring = IoUring.init(4, .{}) catch |err| switch (err) { error.SystemOutdated => return error.SkipZigTest, error.PermissionDenied => return error.SkipZigTest, else => return err, @@ -3771,13 +3808,13 @@ test "splice/read" { try testing.expectEqual(Op.SPLICE, sqe_splice_to_pipe.opcode); try testing.expectEqual(@as(u64, 0), sqe_splice_to_pipe.addr); try testing.expectEqual(pipe_offset, sqe_splice_to_pipe.off); - sqe_splice_to_pipe.flags |= linux.IOSQE_IO_LINK; + sqe_splice_to_pipe.flags.IO_LINK = true; const sqe_splice_from_pipe = try ring.splice(0x22222222, fds[0], pipe_offset, fd_dst, 10, buffer_write.len); try testing.expectEqual(Op.SPLICE, sqe_splice_from_pipe.opcode); try testing.expectEqual(pipe_offset, sqe_splice_from_pipe.addr); try testing.expectEqual(@as(u64, 10), sqe_splice_from_pipe.off); - sqe_splice_from_pipe.flags |= linux.IOSQE_IO_LINK; + sqe_splice_from_pipe.flags.IO_LINK = true; const sqe_read = try ring.read(0x33333333, fd_dst, .{ .buffer = buffer_read[0..] 
}, 10); try testing.expectEqual(Op.READ, sqe_read.opcode); @@ -3792,17 +3829,17 @@ test "splice/read" { if (cqe_splice_to_pipe.err() == .INVAL) return error.SkipZigTest; if (cqe_splice_from_pipe.err() == .INVAL) return error.SkipZigTest; if (cqe_read.err() == .INVAL) return error.SkipZigTest; - try testing.expectEqual(io_uring_cqe{ + try testing.expectEqual(Cqe{ .user_data = 0x11111111, .res = buffer_write.len, .flags = 0, }, cqe_splice_to_pipe); - try testing.expectEqual(io_uring_cqe{ + try testing.expectEqual(Cqe{ .user_data = 0x22222222, .res = buffer_write.len, .flags = 0, }, cqe_splice_from_pipe); - try testing.expectEqual(io_uring_cqe{ + try testing.expectEqual(Cqe{ .user_data = 0x33333333, .res = buffer_read.len, .flags = 0, @@ -3813,7 +3850,7 @@ test "splice/read" { test "write_fixed/read_fixed" { if (!is_linux) return error.SkipZigTest; - var ring = IoUring.init(2, 0) catch |err| switch (err) { + var ring = IoUring.init(2, .{}) catch |err| switch (err) { error.SystemOutdated => return error.SkipZigTest, error.PermissionDenied => return error.SkipZigTest, else => return err, @@ -3848,7 +3885,7 @@ test "write_fixed/read_fixed" { const sqe_write = try ring.write_fixed(0x45454545, fd, &buffers[0], 3, 0); try testing.expectEqual(Op.WRITE_FIXED, sqe_write.opcode); try testing.expectEqual(@as(u64, 3), sqe_write.off); - sqe_write.flags |= linux.IOSQE_IO_LINK; + sqe_write.flags.IO_LINK = true; const sqe_read = try ring.read_fixed(0x12121212, fd, &buffers[1], 0, 1); try testing.expectEqual(Op.READ_FIXED, sqe_read.opcode); @@ -3859,12 +3896,12 @@ test "write_fixed/read_fixed" { const cqe_write = try ring.copy_cqe(); const cqe_read = try ring.copy_cqe(); - try testing.expectEqual(io_uring_cqe{ + try testing.expectEqual(Cqe{ .user_data = 0x45454545, .res = @as(i32, @intCast(buffers[0].len)), .flags = 0, }, cqe_write); - try testing.expectEqual(io_uring_cqe{ + try testing.expectEqual(Cqe{ .user_data = 0x12121212, .res = @as(i32, @intCast(buffers[1].len)), .flags = 0, @@ -3878,7 +3915,7 @@ test "write_fixed/read_fixed" { test "openat" { if (!is_linux) return error.SkipZigTest; - var ring = IoUring.init(1, 0) catch |err| switch (err) { + var ring = IoUring.init(1, .{}) catch |err| switch (err) { error.SystemOutdated => return error.SkipZigTest, error.PermissionDenied => return error.SkipZigTest, else => return err, @@ -3900,7 +3937,7 @@ test "openat" { const flags: linux.O = .{ .CLOEXEC = true, .ACCMODE = .RDWR, .CREAT = true }; const mode: posix.mode_t = 0o666; const sqe_openat = try ring.openat(0x33333333, tmp.dir.fd, path, flags, mode); - try testing.expectEqual(io_uring_sqe{ + try testing.expectEqual(Sqe{ .opcode = .OPENAT, .flags = 0, .ioprio = 0, @@ -3932,7 +3969,7 @@ test "openat" { test "close" { if (!is_linux) return error.SkipZigTest; - var ring = IoUring.init(1, 0) catch |err| switch (err) { + var ring = IoUring.init(1, .{}) catch |err| switch (err) { error.SystemOutdated => return error.SkipZigTest, error.PermissionDenied => return error.SkipZigTest, else => return err, @@ -3953,7 +3990,7 @@ test "close" { const cqe_close = try ring.copy_cqe(); if (cqe_close.err() == .INVAL) return error.SkipZigTest; - try testing.expectEqual(io_uring_cqe{ + try testing.expectEqual(Cqe{ .user_data = 0x44444444, .res = 0, .flags = 0, @@ -3963,7 +4000,7 @@ test "close" { test "accept/connect/send/recv" { if (!is_linux) return error.SkipZigTest; - var ring = IoUring.init(16, 0) catch |err| switch (err) { + var ring = IoUring.init(16, .{}) catch |err| switch (err) { error.SystemOutdated => return 
error.SkipZigTest, error.PermissionDenied => return error.SkipZigTest, else => return err, @@ -3977,13 +4014,13 @@ test "accept/connect/send/recv" { var buffer_recv = [_]u8{ 0, 1, 0, 1, 0 }; const sqe_send = try ring.send(0xeeeeeeee, socket_test_harness.client, buffer_send[0..], 0); - sqe_send.flags |= linux.IOSQE_IO_LINK; + sqe_send.flags.IO_LINK = true; _ = try ring.recv(0xffffffff, socket_test_harness.server, .{ .buffer = buffer_recv[0..] }, 0); try testing.expectEqual(@as(u32, 2), try ring.submit()); const cqe_send = try ring.copy_cqe(); if (cqe_send.err() == .INVAL) return error.SkipZigTest; - try testing.expectEqual(io_uring_cqe{ + try testing.expectEqual(Cqe{ .user_data = 0xeeeeeeee, .res = buffer_send.len, .flags = 0, @@ -3991,7 +4028,7 @@ test "accept/connect/send/recv" { const cqe_recv = try ring.copy_cqe(); if (cqe_recv.err() == .INVAL) return error.SkipZigTest; - try testing.expectEqual(io_uring_cqe{ + try testing.expectEqual(Cqe{ .user_data = 0xffffffff, .res = buffer_recv.len, // ignore IORING_CQE_F_SOCK_NONEMPTY since it is only set on some systems @@ -4004,7 +4041,7 @@ test "accept/connect/send/recv" { test "sendmsg/recvmsg" { if (!is_linux) return error.SkipZigTest; - var ring = IoUring.init(2, 0) catch |err| switch (err) { + var ring = IoUring.init(2, .{}) catch |err| switch (err) { error.SystemOutdated => return error.SkipZigTest, error.PermissionDenied => return error.SkipZigTest, else => return err, @@ -4043,7 +4080,7 @@ test "sendmsg/recvmsg" { .flags = 0, }; const sqe_sendmsg = try ring.sendmsg(0x11111111, client, &msg_send, 0); - sqe_sendmsg.flags |= linux.IOSQE_IO_LINK; + sqe_sendmsg.flags.IO_LINK = true; try testing.expectEqual(Op.SENDMSG, sqe_sendmsg.opcode); try testing.expectEqual(client, sqe_sendmsg.fd); @@ -4075,7 +4112,7 @@ test "sendmsg/recvmsg" { const cqe_sendmsg = try ring.copy_cqe(); if (cqe_sendmsg.res == -@as(i32, @intFromEnum(linux.E.INVAL))) return error.SkipZigTest; - try testing.expectEqual(io_uring_cqe{ + try testing.expectEqual(Cqe{ .user_data = 0x11111111, .res = buffer_send.len, .flags = 0, @@ -4083,7 +4120,7 @@ test "sendmsg/recvmsg" { const cqe_recvmsg = try ring.copy_cqe(); if (cqe_recvmsg.res == -@as(i32, @intFromEnum(linux.E.INVAL))) return error.SkipZigTest; - try testing.expectEqual(io_uring_cqe{ + try testing.expectEqual(Cqe{ .user_data = 0x22222222, .res = buffer_recv.len, // ignore IORING_CQE_F_SOCK_NONEMPTY since it is set non-deterministically @@ -4095,10 +4132,9 @@ test "sendmsg/recvmsg" { test "timeout (after a relative time)" { if (!is_linux) return error.SkipZigTest; + const io = std.testing.io; - const io = testing.io; - - var ring = IoUring.init(1, 0) catch |err| switch (err) { + var ring = IoUring.init(1, .{}) catch |err| switch (err) { error.SystemOutdated => return error.SkipZigTest, error.PermissionDenied => return error.SkipZigTest, else => return err, @@ -4116,7 +4152,7 @@ test "timeout (after a relative time)" { const cqe = try ring.copy_cqe(); const stopped = try std.Io.Clock.awake.now(io); - try testing.expectEqual(io_uring_cqe{ + try testing.expectEqual(Cqe{ .user_data = 0x55555555, .res = -@as(i32, @intFromEnum(linux.E.TIME)), .flags = 0, @@ -4130,7 +4166,7 @@ test "timeout (after a relative time)" { test "timeout (after a number of completions)" { if (!is_linux) return error.SkipZigTest; - var ring = IoUring.init(2, 0) catch |err| switch (err) { + var ring = IoUring.init(2, .{}) catch |err| switch (err) { error.SystemOutdated => return error.SkipZigTest, error.PermissionDenied => return error.SkipZigTest, else => 
return err, @@ -4146,14 +4182,14 @@ test "timeout (after a number of completions)" { try testing.expectEqual(@as(u32, 2), try ring.submit()); const cqe_nop = try ring.copy_cqe(); - try testing.expectEqual(io_uring_cqe{ + try testing.expectEqual(Cqe{ .user_data = 0x77777777, .res = 0, .flags = 0, }, cqe_nop); const cqe_timeout = try ring.copy_cqe(); - try testing.expectEqual(io_uring_cqe{ + try testing.expectEqual(Cqe{ .user_data = 0x66666666, .res = 0, .flags = 0, @@ -4163,7 +4199,7 @@ test "timeout (after a number of completions)" { test "timeout_remove" { if (!is_linux) return error.SkipZigTest; - var ring = IoUring.init(2, 0) catch |err| switch (err) { + var ring = IoUring.init(2, .{}) catch |err| switch (err) { error.SystemOutdated => return error.SkipZigTest, error.PermissionDenied => return error.SkipZigTest, else => return err, @@ -4186,7 +4222,7 @@ test "timeout_remove" { // * kernel 5.10 gives user data 0x88888888 first, 0x99999999 second // * kernel 5.18 gives user data 0x99999999 first, 0x88888888 second - var cqes: [2]io_uring_cqe = undefined; + var cqes: [2]Cqe = undefined; cqes[0] = try ring.copy_cqe(); cqes[1] = try ring.copy_cqe(); @@ -4205,13 +4241,13 @@ test "timeout_remove" { try testing.expect(cqe.user_data == 0x88888888 or cqe.user_data == 0x99999999); if (cqe.user_data == 0x88888888) { - try testing.expectEqual(io_uring_cqe{ + try testing.expectEqual(Cqe{ .user_data = 0x88888888, .res = -@as(i32, @intFromEnum(linux.E.CANCELED)), .flags = 0, }, cqe); } else if (cqe.user_data == 0x99999999) { - try testing.expectEqual(io_uring_cqe{ + try testing.expectEqual(Cqe{ .user_data = 0x99999999, .res = 0, .flags = 0, @@ -4223,7 +4259,7 @@ test "timeout_remove" { test "accept/connect/recv/link_timeout" { if (!is_linux) return error.SkipZigTest; - var ring = IoUring.init(16, 0) catch |err| switch (err) { + var ring = IoUring.init(16, .{}) catch |err| switch (err) { error.SystemOutdated => return error.SkipZigTest, error.PermissionDenied => return error.SkipZigTest, else => return err, @@ -4236,7 +4272,7 @@ test "accept/connect/recv/link_timeout" { var buffer_recv = [_]u8{ 0, 1, 0, 1, 0 }; const sqe_recv = try ring.recv(0xffffffff, socket_test_harness.server, .{ .buffer = buffer_recv[0..] 
}, 0); - sqe_recv.flags |= linux.IOSQE_IO_LINK; + sqe_recv.flags.IO_LINK = true; const ts = linux.kernel_timespec{ .sec = 0, .nsec = 1000000 }; _ = try ring.link_timeout(0x22222222, &ts, 0); @@ -4272,7 +4308,7 @@ test "accept/connect/recv/link_timeout" { test "fallocate" { if (!is_linux) return error.SkipZigTest; - var ring = IoUring.init(1, 0) catch |err| switch (err) { + var ring = IoUring.init(1, .{}) catch |err| switch (err) { error.SystemOutdated => return error.SkipZigTest, error.PermissionDenied => return error.SkipZigTest, else => return err, @@ -4306,7 +4342,7 @@ test "fallocate" { .OPNOTSUPP => return error.SkipZigTest, else => |errno| std.debug.panic("unhandled errno: {}", .{errno}), } - try testing.expectEqual(io_uring_cqe{ + try testing.expectEqual(Cqe{ .user_data = 0xaaaaaaaa, .res = 0, .flags = 0, @@ -4318,7 +4354,7 @@ test "statx" { if (!is_linux) return error.SkipZigTest; - var ring = IoUring.init(1, 0) catch |err| switch (err) { + var ring = IoUring.init(1, .{}) catch |err| switch (err) { error.SystemOutdated => return error.SkipZigTest, error.PermissionDenied => return error.SkipZigTest, else => return err, @@ -4362,7 +4398,7 @@ test "statx" { .BADF => return error.SkipZigTest, else => |errno| std.debug.panic("unhandled errno: {}", .{errno}), } - try testing.expectEqual(io_uring_cqe{ + try testing.expectEqual(Cqe{ .user_data = 0xaaaaaaaa, .res = 0, .flags = 0, @@ -4375,7 +4411,7 @@ test "accept/connect/recv/cancel" { if (!is_linux) return error.SkipZigTest; - var ring = IoUring.init(16, 0) catch |err| switch (err) { + var ring = IoUring.init(16, .{}) catch |err| switch (err) { error.SystemOutdated => return error.SkipZigTest, error.PermissionDenied => return error.SkipZigTest, else => return err, @@ -4409,13 +4445,13 @@ test "accept/connect/recv/cancel" { cqe_cancel = a; } - try testing.expectEqual(io_uring_cqe{ + try testing.expectEqual(Cqe{ .user_data = 0xffffffff, .res = -@as(i32, @intFromEnum(linux.E.CANCELED)), .flags = 0, }, cqe_recv); - try testing.expectEqual(io_uring_cqe{ + try testing.expectEqual(Cqe{ .user_data = 0x99999999, .res = 0, .flags = 0, @@ -4425,7 +4461,7 @@ test "register_files_update" { if (!is_linux) return error.SkipZigTest; - var ring = IoUring.init(1, 0) catch |err| switch (err) { + var ring = IoUring.init(1, .{}) catch |err| switch (err) { error.SystemOutdated => return error.SkipZigTest, error.PermissionDenied => return error.SkipZigTest, else => return err, @@ -4461,10 +4497,10 @@ test "register_files_update" { { const sqe = try ring.read(0xcccccccc, fd_index, .{ .buffer = &buffer }, 0); try testing.expectEqual(Op.READ, sqe.opcode); - sqe.flags |= linux.IOSQE_FIXED_FILE; + sqe.flags.FIXED_FILE = true; try testing.expectEqual(@as(u32, 1), try ring.submit()); - try testing.expectEqual(io_uring_cqe{ + try testing.expectEqual(Cqe{ .user_data = 0xcccccccc, .res = buffer.len, .flags = 0, @@ -4485,7 +4521,7 @@ test "register_files_update" { - sqe.flags |= linux.IOSQE_FIXED_FILE; + sqe.flags.FIXED_FILE = true; try testing.expectEqual(@as(u32, 1), try ring.submit()); - try testing.expectEqual(io_uring_cqe{ + try testing.expectEqual(Cqe{ .user_data = 0xcccccccc, .res = buffer.len, .flags = 0, @@ -4512,7 +4548,7 @@ test "register_files_update" { test "shutdown" { if (!is_linux) return error.SkipZigTest; - var ring = IoUring.init(16, 0) catch |err| switch (err) { + var ring = IoUring.init(16, .{}) catch |err| switch (err) { error.SystemOutdated => return
error.SkipZigTest, else => return err, @@ -4550,7 +4586,7 @@ test "shutdown" { else => |errno| std.debug.panic("unhandled errno: {}", .{errno}), } - try testing.expectEqual(io_uring_cqe{ + try testing.expectEqual(Cqe{ .user_data = 0x445445445, .res = 0, .flags = 0, @@ -4579,7 +4615,7 @@ test "shutdown" { test "renameat" { if (!is_linux) return error.SkipZigTest; - var ring = IoUring.init(1, 0) catch |err| switch (err) { + var ring = IoUring.init(1, .{}) catch |err| switch (err) { error.SystemOutdated => return error.SkipZigTest, error.PermissionDenied => return error.SkipZigTest, else => return err, @@ -4620,7 +4656,7 @@ test "renameat" { .BADF, .INVAL => return error.SkipZigTest, else => |errno| std.debug.panic("unhandled errno: {}", .{errno}), } - try testing.expectEqual(io_uring_cqe{ + try testing.expectEqual(Cqe{ .user_data = 0x12121212, .res = 0, .flags = 0, @@ -4637,7 +4673,7 @@ test "renameat" { test "unlinkat" { if (!is_linux) return error.SkipZigTest; - var ring = IoUring.init(1, 0) catch |err| switch (err) { + var ring = IoUring.init(1, .{}) catch |err| switch (err) { error.SystemOutdated => return error.SkipZigTest, error.PermissionDenied => return error.SkipZigTest, else => return err, @@ -4673,7 +4709,7 @@ test "unlinkat" { .BADF, .INVAL => return error.SkipZigTest, else => |errno| std.debug.panic("unhandled errno: {}", .{errno}), } - try testing.expectEqual(io_uring_cqe{ + try testing.expectEqual(Cqe{ .user_data = 0x12121212, .res = 0, .flags = 0, @@ -4689,7 +4725,7 @@ test "unlinkat" { test "mkdirat" { if (!is_linux) return error.SkipZigTest; - var ring = IoUring.init(1, 0) catch |err| switch (err) { + var ring = IoUring.init(1, .{}) catch |err| switch (err) { error.SystemOutdated => return error.SkipZigTest, error.PermissionDenied => return error.SkipZigTest, else => return err, @@ -4720,7 +4756,7 @@ test "mkdirat" { .BADF, .INVAL => return error.SkipZigTest, else => |errno| std.debug.panic("unhandled errno: {}", .{errno}), } - try testing.expectEqual(io_uring_cqe{ + try testing.expectEqual(Cqe{ .user_data = 0x12121212, .res = 0, .flags = 0, @@ -4733,7 +4769,7 @@ test "mkdirat" { test "symlinkat" { if (!is_linux) return error.SkipZigTest; - var ring = IoUring.init(1, 0) catch |err| switch (err) { + var ring = IoUring.init(1, .{}) catch |err| switch (err) { error.SystemOutdated => return error.SkipZigTest, error.PermissionDenied => return error.SkipZigTest, else => return err, @@ -4768,7 +4804,7 @@ test "symlinkat" { .BADF, .INVAL => return error.SkipZigTest, else => |errno| std.debug.panic("unhandled errno: {}", .{errno}), } - try testing.expectEqual(io_uring_cqe{ + try testing.expectEqual(Cqe{ .user_data = 0x12121212, .res = 0, .flags = 0, @@ -4781,7 +4817,7 @@ test "symlinkat" { test "linkat" { if (!is_linux) return error.SkipZigTest; - var ring = IoUring.init(1, 0) catch |err| switch (err) { + var ring = IoUring.init(1, .{}) catch |err| switch (err) { error.SystemOutdated => return error.SkipZigTest, error.PermissionDenied => return error.SkipZigTest, else => return err, @@ -4822,7 +4858,7 @@ test "linkat" { .BADF, .INVAL => return error.SkipZigTest, else => |errno| std.debug.panic("unhandled errno: {}", .{errno}), } - try testing.expectEqual(io_uring_cqe{ + try testing.expectEqual(Cqe{ .user_data = 0x12121212, .res = 0, .flags = 0, @@ -4836,7 +4872,7 @@ test "linkat" { test "provide_buffers: read" { if (!is_linux) return error.SkipZigTest; - var ring = IoUring.init(1, 0) catch |err| switch (err) { + var ring = IoUring.init(1, .{}) catch |err| switch (err) { 
error.SystemOutdated => return error.SkipZigTest, error.PermissionDenied => return error.SkipZigTest, else => return err, @@ -4891,7 +4927,7 @@ test "provide_buffers: read" { else => |errno| std.debug.panic("unhandled errno: {}", .{errno}), } - try testing.expect(cqe.flags & linux.IORING_CQE_F_BUFFER == linux.IORING_CQE_F_BUFFER); + try testing.expect(cqe.flags.F_BUFFER); const used_buffer_id = cqe.flags >> 16; try testing.expect(used_buffer_id >= 0 and used_buffer_id <= 3); try testing.expectEqual(@as(i32, buffer_len), cqe.res); @@ -4968,7 +5004,7 @@ test "provide_buffers: read" { test "remove_buffers" { if (!is_linux) return error.SkipZigTest; - var ring = IoUring.init(1, 0) catch |err| switch (err) { + var ring = IoUring.init(1, .{}) catch |err| switch (err) { error.SystemOutdated => return error.SkipZigTest, error.PermissionDenied => return error.SkipZigTest, else => return err, @@ -5030,7 +5066,7 @@ test "remove_buffers" { else => |errno| std.debug.panic("unhandled errno: {}", .{errno}), } - try testing.expect(cqe.flags & linux.IORING_CQE_F_BUFFER == linux.IORING_CQE_F_BUFFER); + try testing.expect(cqe.flags.F_BUFFER); const used_buffer_id = cqe.flags >> 16; try testing.expect(used_buffer_id >= 0 and used_buffer_id < 4); try testing.expectEqual(@as(i32, buffer_len), cqe.res); @@ -5057,7 +5093,7 @@ test "remove_buffers" { test "provide_buffers: accept/connect/send/recv" { if (!is_linux) return error.SkipZigTest; - var ring = IoUring.init(16, 0) catch |err| switch (err) { + var ring = IoUring.init(16, .{}) catch |err| switch (err) { error.SystemOutdated => return error.SkipZigTest, error.PermissionDenied => return error.SkipZigTest, else => return err, @@ -5104,7 +5140,7 @@ test "provide_buffers: accept/connect/send/recv" { try testing.expectEqual(@as(u32, 1), try ring.submit()); } - var cqes: [4]io_uring_cqe = undefined; + var cqes: [4]Cqe = undefined; try testing.expectEqual(@as(u32, 4), try ring.copy_cqes(&cqes, 4)); } @@ -5122,7 +5158,7 @@ test "provide_buffers: accept/connect/send/recv" { try testing.expectEqual(@as(u32, buffer_len), sqe.len); try testing.expectEqual(@as(u16, group_id), sqe.buf_index); try testing.expectEqual(@as(u32, 0), sqe.rw_flags); - try testing.expectEqual(@as(u32, linux.IOSQE_BUFFER_SELECT), sqe.flags); + try testing.expectEqual(.{ .BUFFER_SELECT = true }, sqe.flags); try testing.expectEqual(@as(u32, 1), try ring.submit()); const cqe = try ring.copy_cqe(); @@ -5270,7 +5306,7 @@ fn createSocketTestHarness(ring: *IoUring) !SocketTestHarness { if (cqe_accept.res <= 0) std.debug.print("\ncqe_accept.res={}\n", .{cqe_accept.res}); try testing.expect(cqe_accept.res > 0); try testing.expectEqual(@as(u32, 0), cqe_accept.flags); - try testing.expectEqual(io_uring_cqe{ + try testing.expectEqual(Cqe{ .user_data = 0xcccccccc, .res = 0, .flags = 0, @@ -5304,7 +5340,7 @@ fn createListenerSocket(address: *linux.sockaddr.in) !posix.socket_t { test "accept multishot" { if (!is_linux) return error.SkipZigTest; - var ring = IoUring.init(16, 0) catch |err| switch (err) { + var ring = IoUring.init(16, .{}) catch |err| switch (err) { error.SystemOutdated => return error.SkipZigTest, error.PermissionDenied => return error.SkipZigTest, else => return err, @@ -5337,7 +5373,7 @@ test "accept multishot" { if (cqe.err() == .INVAL) return error.SkipZigTest; try testing.expect(cqe.res > 0); try testing.expect(cqe.user_data == userdata); - try testing.expect(cqe.flags & linux.IORING_CQE_F_MORE > 0); // more flag is set + try testing.expect(cqe.flags.F_MORE); // more flag is set 
posix.close(client); } @@ -5346,7 +5382,7 @@ test "accept multishot" { test "accept/connect/send_zc/recv" { try skipKernelLessThan(.{ .major = 6, .minor = 0, .patch = 0 }); - var ring = IoUring.init(16, 0) catch |err| switch (err) { + var ring = IoUring.init(16, .{}) catch |err| switch (err) { error.SystemOutdated => return error.SkipZigTest, error.PermissionDenied => return error.SkipZigTest, else => return err, @@ -5361,7 +5397,7 @@ test "accept/connect/send_zc/recv" { // zero-copy send const sqe_send = try ring.send_zc(0xeeeeeeee, socket_test_harness.client, buffer_send[0..], 0, 0); - sqe_send.flags |= linux.IOSQE_IO_LINK; + sqe_send.flags.IO_LINK = true; _ = try ring.recv(0xffffffff, socket_test_harness.server, .{ .buffer = buffer_recv[0..] }, 0); try testing.expectEqual(@as(u32, 2), try ring.submit()); @@ -5371,10 +5407,10 @@ test "accept/connect/send_zc/recv" { // will be a second completion event / notification for the // request, with the user_data field set to the same value. // buffer_send must be kept alive until second cqe. - try testing.expectEqual(io_uring_cqe{ + try testing.expectEqual(Cqe{ .user_data = 0xeeeeeeee, .res = buffer_send.len, - .flags = linux.IORING_CQE_F_MORE, + .flags = .{ .F_MORE = true }, }, cqe_send); cqe_send, const cqe_recv = brk: { @@ -5383,7 +5419,7 @@ const cqe1 = try ring.copy_cqe(); const cqe2 = try ring.copy_cqe(); break :brk if (cqe1.user_data == 0xeeeeeeee) .{ cqe1, cqe2 } else .{ cqe2, cqe1 }; }; - try testing.expectEqual(io_uring_cqe{ + try testing.expectEqual(Cqe{ .user_data = 0xffffffff, .res = buffer_recv.len, .flags = cqe_recv.flags & linux.IORING_CQE_F_SOCK_NONEMPTY, }, cqe_recv); @@ -5392,7 +5428,7 @@ // Second completion of zero-copy send. // IORING_CQE_F_NOTIF in flags signals that kernel is done with send_buffer - try testing.expectEqual(io_uring_cqe{ + try testing.expectEqual(Cqe{ .user_data = 0xeeeeeeee, .res = 0, .flags = linux.IORING_CQE_F_NOTIF, @@ -5402,7 +5438,7 @@ test "accept_direct" { try skipKernelLessThan(.{ .major = 5, .minor = 19, .patch = 0 }); - var ring = IoUring.init(1, 0) catch |err| switch (err) { + var ring = IoUring.init(1, .{}) catch |err| switch (err) { error.SystemOutdated => return error.SkipZigTest, error.PermissionDenied => return error.SkipZigTest, else => return err, @@ -5453,7 +5489,7 @@ test "accept_direct" { // Fd field is set to registered file index, returned by accept. // Flag linux.IOSQE_FIXED_FILE must be set. const recv_sqe = try ring.recv(read_userdata, fd_index, .{ .buffer = &buffer_recv }, 0); - recv_sqe.flags |= linux.IOSQE_FIXED_FILE; + recv_sqe.flags.FIXED_FILE = true; try testing.expectEqual(@as(u32, 1), try ring.submit()); // accept receive @@ -5490,7 +5526,7 @@ test "accept_multishot_direct" { return error.SkipZigTest; } - var ring = IoUring.init(1, 0) catch |err| switch (err) { + var ring = IoUring.init(1, .{}) catch |err| switch (err) { error.SystemOutdated => return error.SkipZigTest, error.PermissionDenied => return error.SkipZigTest, else => return err, @@ -5527,7 +5563,7 @@ test "accept_multishot_direct" { const fd_index = cqe_accept.res; try testing.expect(fd_index < registered_fds.len); try testing.expect(cqe_accept.user_data == accept_userdata); - try testing.expect(cqe_accept.flags & linux.IORING_CQE_F_MORE > 0); // has more is set + try testing.expect(cqe_accept.flags.F_MORE); // more flag is set } // No more available fds, accept will get NFILE error. // Multishot is terminated (more flag is not set).
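// Reviewer note (not part of the original patch): once F_MORE is clear the
// multishot accept is disarmed; a long-lived server would re-queue it before
// continuing, e.g. (variable names from the surrounding test are assumed):
//
//     if (!cqe_accept.flags.F_MORE) {
//         _ = try ring.accept_multishot_direct(accept_userdata, listener_fd, null, null, 0);
//         _ = try ring.submit();
//     }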
@@ -5551,7 +5587,7 @@ test "accept_multishot_direct" { test "socket" { try skipKernelLessThan(.{ .major = 5, .minor = 19, .patch = 0 }); - var ring = IoUring.init(1, 0) catch |err| switch (err) { + var ring = IoUring.init(1, .{}) catch |err| switch (err) { error.SystemOutdated => return error.SkipZigTest, error.PermissionDenied => return error.SkipZigTest, else => return err, @@ -5574,7 +5610,7 @@ test "socket" { test "socket_direct/socket_direct_alloc/close_direct" { try skipKernelLessThan(.{ .major = 5, .minor = 19, .patch = 0 }); - var ring = IoUring.init(2, 0) catch |err| switch (err) { + var ring = IoUring.init(2, .{}) catch |err| switch (err) { error.SystemOutdated => return error.SkipZigTest, error.PermissionDenied => return error.SkipZigTest, else => return err, @@ -5655,7 +5691,7 @@ test "socket_direct/socket_direct_alloc/close_direct" { test "openat_direct/close_direct" { try skipKernelLessThan(.{ .major = 5, .minor = 19, .patch = 0 }); - var ring = IoUring.init(2, 0) catch |err| switch (err) { + var ring = IoUring.init(2, .{}) catch |err| switch (err) { error.SystemOutdated => return error.SkipZigTest, error.PermissionDenied => return error.SkipZigTest, else => return err, @@ -5706,7 +5742,7 @@ test "openat_direct/close_direct" { test "waitid" { try skipKernelLessThan(.{ .major = 6, .minor = 7, .patch = 0 }); - var ring = IoUring.init(16, 0) catch |err| switch (err) { + var ring = IoUring.init(16, .{}) catch |err| switch (err) { error.SystemOutdated => return error.SkipZigTest, error.PermissionDenied => return error.SkipZigTest, else => return err, @@ -5756,7 +5792,7 @@ test BufferGroup { if (!is_linux) return error.SkipZigTest; // Init IoUring - var ring = IoUring.init(16, 0) catch |err| switch (err) { + var ring = IoUring.init(16, .{}) catch |err| switch (err) { error.SystemOutdated => return error.SkipZigTest, error.PermissionDenied => return error.SkipZigTest, else => return err, @@ -5792,7 +5828,7 @@ test BufferGroup { try testing.expectEqual(1, submitted); const cqe_send = try ring.copy_cqe(); if (cqe_send.err() == .INVAL) return error.SkipZigTest; - try testing.expectEqual(io_uring_cqe{ .user_data = 1, .res = data.len, .flags = 0 }, cqe_send); + try testing.expectEqual(Cqe{ .user_data = 1, .res = data.len, .flags = 0 }, cqe_send); } // Server uses buffer group receive @@ -5820,7 +5856,7 @@ test BufferGroup { test "ring mapped buffers recv" { if (!is_linux) return error.SkipZigTest; - var ring = IoUring.init(16, 0) catch |err| switch (err) { + var ring = IoUring.init(16, .{}) catch |err| switch (err) { error.SystemOutdated => return error.SkipZigTest, error.PermissionDenied => return error.SkipZigTest, else => return err, @@ -5862,7 +5898,7 @@ test "ring mapped buffers recv" { try testing.expectEqual(@as(u32, 1), try ring.submit()); const cqe_send = try ring.copy_cqe(); if (cqe_send.err() == .INVAL) return error.SkipZigTest; - try testing.expectEqual(io_uring_cqe{ .user_data = user_data, .res = data.len, .flags = 0 }, cqe_send); + try testing.expectEqual(Cqe{ .user_data = user_data, .res = data.len, .flags = 0 }, cqe_send); } var pos: usize = 0; @@ -5887,7 +5923,7 @@ test "ring mapped buffers recv" { try testing.expectEqual(user_data, cqe.user_data); try testing.expect(cqe.res < 0); // fail try testing.expectEqual(posix.E.NOBUFS, cqe.err()); - try testing.expect(cqe.flags & linux.IORING_CQE_F_BUFFER == 0); // IORING_CQE_F_BUFFER flags is set on success only + try testing.expect(!cqe.flags.F_BUFFER); // IORING_CQE_F_BUFFER flags is set on success only try 
testing.expectError(error.NoBufferSelected, cqe.buffer_id()); } @@ -5909,7 +5945,7 @@ test "ring mapped buffers recv" { test "ring mapped buffers multishot recv" { if (!is_linux) return error.SkipZigTest; - var ring = IoUring.init(16, 0) catch |err| switch (err) { + var ring = IoUring.init(16, .{}) catch |err| switch (err) { error.SystemOutdated => return error.SkipZigTest, error.PermissionDenied => return error.SkipZigTest, else => return err, @@ -5951,7 +5987,7 @@ test "ring mapped buffers multishot recv" { try testing.expectEqual(@as(u32, 1), try ring.submit()); const cqe_send = try ring.copy_cqe(); if (cqe_send.err() == .INVAL) return error.SkipZigTest; - try testing.expectEqual(io_uring_cqe{ .user_data = user_data, .res = data.len, .flags = 0 }, cqe_send); + try testing.expectEqual(Cqe{ .user_data = user_data, .res = data.len, .flags = 0 }, cqe_send); } // start multishot recv @@ -5964,11 +6000,11 @@ test "ring mapped buffers multishot recv" { // we read four chunks of 4, 4, 4, 4 bytes each var chunk: []const u8 = data[0..buffer_size]; // first chunk const cqe1 = try expect_buf_grp_cqe(&ring, &buf_grp, recv_user_data, chunk); - try testing.expect(cqe1.flags & linux.IORING_CQE_F_MORE > 0); + try testing.expect(cqe1.flags.F_MORE); chunk = data[buffer_size .. buffer_size * 2]; // second chunk const cqe2 = try expect_buf_grp_cqe(&ring, &buf_grp, recv_user_data, chunk); - try testing.expect(cqe2.flags & linux.IORING_CQE_F_MORE > 0); + try testing.expect(cqe2.flags.F_MORE); // both buffers provided to the kernel are used so we get error // 'no more buffers', until we put buffers to the kernel @@ -5977,10 +6013,10 @@ test "ring mapped buffers multishot recv" { try testing.expectEqual(recv_user_data, cqe.user_data); try testing.expect(cqe.res < 0); // fail try testing.expectEqual(posix.E.NOBUFS, cqe.err()); - try testing.expect(cqe.flags & linux.IORING_CQE_F_BUFFER == 0); // IORING_CQE_F_BUFFER flags is set on success only + try testing.expect(!cqe.flags.F_BUFFER); // IORING_CQE_F_BUFFER flags is set on success only // has more is not set // indicates that multishot is finished - try testing.expect(cqe.flags & linux.IORING_CQE_F_MORE == 0); + try testing.expect(!cqe.flags.F_MORE); try testing.expectError(error.NoBufferSelected, cqe.buffer_id()); } @@ -5995,12 +6031,12 @@ test "ring mapped buffers multishot recv" { chunk = data[buffer_size * 2 .. 
buffer_size * 3]; // third chunk const cqe3 = try expect_buf_grp_cqe(&ring, &buf_grp, recv_user_data, chunk); - try testing.expect(cqe3.flags & linux.IORING_CQE_F_MORE > 0); + try testing.expect(cqe3.flags.F_MORE); try buf_grp.put(cqe3); chunk = data[buffer_size * 3 ..]; // last chunk const cqe4 = try expect_buf_grp_cqe(&ring, &buf_grp, recv_user_data, chunk); - try testing.expect(cqe4.flags & linux.IORING_CQE_F_MORE > 0); + try testing.expect(cqe4.flags.F_MORE); try buf_grp.put(cqe4); // cancel pending multishot recv operation @@ -6040,7 +6076,7 @@ test "ring mapped buffers multishot recv" { try testing.expectEqual(recv_user_data, cqe_recv.user_data); try testing.expect(cqe_recv.res < 0); try testing.expect(cqe_recv.err() == .NOBUFS or cqe_recv.err() == .CANCELED); - try testing.expect(cqe_recv.flags & linux.IORING_CQE_F_MORE == 0); + try testing.expect(!cqe_recv.flags.F_MORE); } } } @@ -6051,10 +6087,10 @@ fn buf_grp_recv_submit_get_cqe( buf_grp: *BufferGroup, fd: linux.fd_t, user_data: u64, -) !io_uring_cqe { +) !Cqe { // prepare and submit recv const sqe = try buf_grp.recv(user_data, fd, 0); - try testing.expect(sqe.flags & linux.IOSQE_BUFFER_SELECT == linux.IOSQE_BUFFER_SELECT); + try testing.expect(sqe.flags.BUFFER_SELECT); try testing.expect(sqe.buf_index == buf_grp.group_id); try testing.expectEqual(@as(u32, 1), try ring.submit()); // submit // get cqe, expect success @@ -6062,7 +6098,7 @@ fn buf_grp_recv_submit_get_cqe( try testing.expectEqual(user_data, cqe.user_data); try testing.expect(cqe.res >= 0); // success try testing.expectEqual(posix.E.SUCCESS, cqe.err()); - try testing.expect(cqe.flags & linux.IORING_CQE_F_BUFFER == linux.IORING_CQE_F_BUFFER); // IORING_CQE_F_BUFFER flag is set + try testing.expect(cqe.flags.F_BUFFER); // IORING_CQE_F_BUFFER flag is set return cqe; } @@ -6072,12 +6108,12 @@ fn expect_buf_grp_cqe( buf_grp: *BufferGroup, user_data: u64, expected: []const u8, -) !io_uring_cqe { +) !Cqe { // get cqe const cqe = try ring.copy_cqe(); try testing.expectEqual(user_data, cqe.user_data); try testing.expect(cqe.res >= 0); // success - try testing.expect(cqe.flags & linux.IORING_CQE_F_BUFFER == linux.IORING_CQE_F_BUFFER); // IORING_CQE_F_BUFFER flag is set + try testing.expect(cqe.flags.F_BUFFER); // IORING_CQE_F_BUFFER flag is set try testing.expectEqual(expected.len, @as(usize, @intCast(cqe.res))); try testing.expectEqual(posix.E.SUCCESS, cqe.err()); @@ -6093,7 +6129,7 @@ fn expect_buf_grp_cqe( test "copy_cqes with wrapping sq.cqes buffer" { if (!is_linux) return error.SkipZigTest; - var ring = IoUring.init(2, 0) catch |err| switch (err) { + var ring = IoUring.init(2, .{}) catch |err| switch (err) { error.SystemOutdated => return error.SkipZigTest, error.PermissionDenied => return error.SkipZigTest, else => return err, @@ -6104,7 +6140,7 @@ test "copy_cqes with wrapping sq.cqes buffer" { try testing.expectEqual(4, ring.cq.cqes.len); // submit 2 entries, receive 2 completions - var cqes: [8]io_uring_cqe = undefined; + var cqes: [8]Cqe = undefined; { for (0..2) |_| { const sqe = try ring.get_sqe(); @@ -6140,7 +6176,7 @@ test "copy_cqes with wrapping sq.cqes buffer" { test "bind/listen/connect" { if (builtin.cpu.arch == .s390x) return error.SkipZigTest; // https://github.com/ziglang/zig/issues/25956 - var ring = IoUring.init(4, 0) catch |err| switch (err) { + var ring = IoUring.init(4, .{}) catch |err| switch (err) { error.SystemOutdated => return error.SkipZigTest, error.PermissionDenied => return error.SkipZigTest, else => return err, From 
1d374e0ba9d10545f6efcdd0d50b83ed4f65fbd5 Mon Sep 17 00:00:00 2001 From: Bernard Assan Date: Tue, 30 Sep 2025 22:52:53 +0000 Subject: [PATCH 04/41] Replace STATX_* with StatxMask & StatxAttr Add RenameFlags for IoUring renameat Start updating io_uring types in linux.zig to use IoUring Continue changing options and flags to typed types Fix test cases to use updated IoUring flags and options Signed-off-by: Bernard Assan --- lib/std/os/linux.zig | 129 +++++++++++++++++++++++++++-------- lib/std/os/linux/IoUring.zig | 96 +++++++++++++------------- 2 files changed, 149 insertions(+), 76 deletions(-) diff --git a/lib/std/os/linux.zig b/lib/std/os/linux.zig index 2029356a66b5..8a928e19d9d9 100644 --- a/lib/std/os/linux.zig +++ b/lib/std/os/linux.zig @@ -2509,7 +2509,7 @@ pub fn uname(uts: *utsname) usize { return syscall1(.uname, @intFromPtr(uts)); } -pub fn io_uring_setup(entries: u32, p: *io_uring_params) usize { +pub fn io_uring_setup(entries: u32, p: *IoUring.Params) usize { return syscall2(.io_uring_setup, entries, @intFromPtr(p)); } @@ -6330,9 +6330,6 @@ pub const IORING_SETUP_REGISTERED_FD_ONLY = 1 << 15; /// Removes indirection through the SQ index array. pub const IORING_SETUP_NO_SQARRAY = 1 << 16; -/// IO submission data structure (Submission Queue Entry) -pub const io_uring_sqe = @import("linux/io_uring_sqe.zig").io_uring_sqe; - pub const IoUring = @import("linux/IoUring.zig"); /// If sqe->file_index is set to this for opcodes that instantiate a new @@ -6939,27 +6936,103 @@ pub const utsname = extern struct { }; pub const HOST_NAME_MAX = 64; -pub const STATX_TYPE = 0x0001; -pub const STATX_MODE = 0x0002; -pub const STATX_NLINK = 0x0004; -pub const STATX_UID = 0x0008; -pub const STATX_GID = 0x0010; -pub const STATX_ATIME = 0x0020; -pub const STATX_MTIME = 0x0040; -pub const STATX_CTIME = 0x0080; -pub const STATX_INO = 0x0100; -pub const STATX_SIZE = 0x0200; -pub const STATX_BLOCKS = 0x0400; -pub const STATX_BASIC_STATS = 0x07ff; - -pub const STATX_BTIME = 0x0800; - -pub const STATX_ATTR_COMPRESSED = 0x0004; -pub const STATX_ATTR_IMMUTABLE = 0x0010; -pub const STATX_ATTR_APPEND = 0x0020; -pub const STATX_ATTR_NODUMP = 0x0040; -pub const STATX_ATTR_ENCRYPTED = 0x0800; -pub const STATX_ATTR_AUTOMOUNT = 0x1000; +// COMMIT: add new StatxMask fields +// https://github.com/torvalds/linux/blob/755fa5b4fb36627796af19932a432d343220ec63/include/uapi/linux/stat.h#L203 +pub const StatxMask = packed struct(u32) { + type: bool = false, + /// Want/got stx_mode & ~S_IFMT + mode: bool = false, + /// Want/got stx_nlink + nlink: bool = false, + /// Want/got stx_uid + uid: bool = false, + /// Want/got stx_gid + gid: bool = false, + /// Want/got stx_atime + atime: bool = false, + /// Want/got stx_mtime + mtime: bool = false, + /// Want/got stx_ctime + ctime: bool = false, + /// Want/got stx_ino + ino: bool = false, + /// Want/got stx_size + size: bool = false, + /// Want/got stx_blocks + blocks: bool = false, + /// Want/got stx_btime + btime: bool = false, + /// Got stx_mnt_id + mnt_id: bool = false, + /// Want/got direct I/O alignment info + dioalign: bool = false, + /// Want/got extended stx_mount_id + mnt_id_unique: bool = false, + /// Want/got stx_subvol + subvol: bool = false, + /// Want/got atomic_write_* fields + write_atomic: bool = false, + /// Want/got dio read alignment info + dio_read_align: bool = false, + /// Reserved for future struct statx expansion + _: u14 = 0, + + /// The stuff in the normal stat struct (bits 0-10) + pub const basic_stats: StatxMask = .{ + .type = true, + .mode = true, + .nlink
= true, + .uid = true, + .gid = true, + .atime = true, + .mtime = true, + .ctime = true, + .ino = true, + .size = true, + .blocks = true, + }; +}; + +// COMMIT: Statx as Packed Struct +// https://github.com/torvalds/linux/blob/755fa5b4fb36627796af19932a432d343220ec63/include/uapi/linux/stat.h#L248 +pub const StatxAttr = packed struct(u64) { + _0: u2 = 0, + /// File is compressed by the fs + compressed: bool = false, + _1: u1 = 0, + /// File is marked immutable + immutable: bool = false, + /// File is append-only + append: bool = false, + /// File is not to be dumped + nodump: bool = false, + _2: u4 = 0, + /// File requires key to decrypt in fs + encrypted: bool = false, + /// Dir: Automount trigger + automount: bool = false, + /// Root of a mount + mount_root: bool = false, + _3: u6 = 0, + /// Verity protected file + verity: bool = false, + /// File is currently in DAX state + dax: bool = false, + /// File supports atomic write operations + write_atomic: bool = false, + _: u41 = 0, +}; + +// COMMIT: RenameFlags +pub const RenameFlags = packed struct(u32) { + /// Don't overwrite target + noreplace: bool = false, + /// Exchange source and dest + exchange: bool = false, + /// Whiteout source + whiteout: bool = false, + _: u29 = 0, +}; pub const statx_timestamp = extern struct { sec: i64, @@ -6970,13 +7043,13 @@ pub const statx_timestamp = extern struct { /// Renamed to `Statx` to not conflict with the `statx` function. pub const Statx = extern struct { /// Mask of bits indicating filled fields - mask: u32, + mask: StatxMask, /// Block size for filesystem I/O blksize: u32, /// Extra file attribute indicators - attributes: u64, + attributes: StatxAttr, /// Number of hard links nlink: u32, @@ -7001,7 +7074,7 @@ pub const Statx = extern struct { blocks: u64, /// Mask to show what's supported in `attributes`. 
- attributes_mask: u64, + attributes_mask: StatxAttr, /// Last access file timestamp atime: statx_timestamp, diff --git a/lib/std/os/linux/IoUring.zig b/lib/std/os/linux/IoUring.zig index 3a1f5311f689..a59b6827b81a 100644 --- a/lib/std/os/linux/IoUring.zig +++ b/lib/std/os/linux/IoUring.zig @@ -863,10 +863,10 @@ pub const Sqe = extern struct { fd: linux.fd_t, path: [*:0]const u8, flags: linux.AT, - mask: linux.STATX, // TODO: compose linux.STATX + mask: linux.StatxMask, buf: *linux.Statx, ) void { - sqe.prep_rw(.STATX, fd, @intFromPtr(path), mask, @intFromPtr(buf)); + sqe.prep_rw(.STATX, fd, @intFromPtr(path), @bitCast(mask), @intFromPtr(buf)); sqe.rw_flags = flags; } @@ -907,7 +907,7 @@ pub const Sqe = extern struct { old_path: [*:0]const u8, new_dir_fd: linux.fd_t, new_path: [*:0]const u8, - flags: linux.RENAME, + flags: linux.RenameFlags, ) void { sqe.prep_rw( .RENAMEAT, @@ -2501,7 +2501,7 @@ pub fn timeout( user_data: u64, ts: *const linux.kernel_timespec, count: u32, - flags: u32, + flags: uflags.Timeout, ) !*Sqe { const sqe = try self.get_sqe(); sqe.prep_timeout(ts, count, flags); @@ -2791,7 +2791,7 @@ pub fn waitid( id_type: linux.P, id: i32, infop: *linux.siginfo_t, - options: u32, + options: linux.W, flags: u32, ) !*Sqe { const sqe = try self.get_sqe(); @@ -3020,8 +3020,8 @@ pub fn unregister_files(self: *IoUring) !void { pub fn socket( self: *IoUring, user_data: u64, - domain: u32, - socket_type: u32, + domain: linux.AF, + socket_type: linux.SOCK, protocol: u32, flags: u32, ) !*Sqe { @@ -3333,7 +3333,7 @@ pub const BufferGroup = struct { const heads = try allocator.alloc(u32, buffers_count); errdefer allocator.free(heads); - const br = try setup_buf_ring(ring.fd, buffers_count, group_id, .{ .inc = true }); + const br = try setup_buf_ring(ring.fd, buffers_count, group_id, .{ .IOU_PBUF_RING_INC = true }); buf_ring_init(br); const mask = buf_ring_mask(buffers_count); @@ -3458,10 +3458,10 @@ fn register_buf_ring( .flags = flags, }); var res = linux.io_uring_register(fd, .REGISTER_PBUF_RING, @as(*const anyopaque, @ptrCast(&reg)), 1); - if (linux.errno(res) == .INVAL and reg.flags.inc) { + if (linux.errno(res) == .INVAL and reg.flags.IOU_PBUF_RING_INC) { // Retry without incremental buffer consumption. // It is available since kernel 6.12. returns INVAL on older.
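        // Illustrative aside, not part of the change: the retry below is
        // invisible to callers. Assuming BufferGroup.init keeps its current
        // (ring, allocator, group_id, buffer_size, buffers_count) shape, the
        // same call keeps working on pre-6.12 kernels; only incremental
        // buffer consumption is dropped on the retry:
        //
        //     var buf_grp = try BufferGroup.init(&ring, testing.allocator, 1, 32, 2);
        //     defer buf_grp.deinit(testing.allocator);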
- reg.flags.inc = false; + reg.flags.IOU_PBUF_RING_INC = false; res = linux.io_uring_register(fd, .REGISTER_PBUF_RING, @as(*const anyopaque, @ptrCast(&reg)), 1); } try handle_register_buf_ring_result(res); @@ -3566,8 +3566,8 @@ test "nop" { const sqe = try ring.nop(0xaaaaaaaa); try testing.expectEqual(Sqe{ .opcode = .NOP, - .flags = 0, - .ioprio = 0, + .flags = .{}, + .ioprio = @bitCast(@as(u16, 0)), .fd = 0, .off = 0, .addr = 0, @@ -3598,7 +3598,7 @@ test "nop" { try testing.expectEqual(Cqe{ .user_data = 0xaaaaaaaa, .res = 0, - .flags = 0, + .flags = .{}, }, try ring.copy_cqe()); try testing.expectEqual(@as(u32, 1), ring.cq.head.*); try testing.expectEqual(@as(u32, 0), ring.cq_ready()); @@ -3609,7 +3609,7 @@ test "nop" { try testing.expectEqual(Cqe{ .user_data = 0xbbbbbbbb, .res = 0, - .flags = 0, + .flags = .{}, }, try ring.copy_cqe()); try testing.expectEqual(@as(u32, 2), ring.sq.sqe_head); try testing.expectEqual(@as(u32, 2), ring.sq.sqe_tail); @@ -3652,7 +3652,7 @@ test "readv" { try testing.expectEqual(Cqe{ .user_data = 0xcccccccc, .res = buffer.len, - .flags = 0, + .flags = .{}, }, try ring.copy_cqe()); try testing.expectEqualSlices(u8, &([_]u8{0} ** buffer.len), buffer[0..]); @@ -3691,7 +3691,7 @@ test "writev/fsync/readv" { try testing.expectEqual(@as(u64, 17), sqe_writev.off); sqe_writev.flags.IO_LINK = true; - const sqe_fsync = try ring.fsync(0xeeeeeeee, fd, 0); + const sqe_fsync = try ring.fsync(0xeeeeeeee, fd, .{}); try testing.expectEqual(Op.FSYNC, sqe_fsync.opcode); try testing.expectEqual(fd, sqe_fsync.fd); sqe_fsync.flags.IO_LINK = true; @@ -3708,21 +3708,21 @@ test "writev/fsync/readv" { try testing.expectEqual(Cqe{ .user_data = 0xdddddddd, .res = buffer_write.len, - .flags = 0, + .flags = .{}, }, try ring.copy_cqe()); try testing.expectEqual(@as(u32, 2), ring.cq_ready()); try testing.expectEqual(Cqe{ .user_data = 0xeeeeeeee, .res = 0, - .flags = 0, + .flags = .{}, }, try ring.copy_cqe()); try testing.expectEqual(@as(u32, 1), ring.cq_ready()); try testing.expectEqual(Cqe{ .user_data = 0xffffffff, .res = buffer_read.len, - .flags = 0, + .flags = .{}, }, try ring.copy_cqe()); try testing.expectEqual(@as(u32, 0), ring.cq_ready()); @@ -3766,12 +3766,12 @@ test "write/read" { try testing.expectEqual(Cqe{ .user_data = 0x11111111, .res = buffer_write.len, - .flags = 0, + .flags = .{}, }, cqe_write); try testing.expectEqual(Cqe{ .user_data = 0x22222222, .res = buffer_read.len, - .flags = 0, + .flags = .{}, }, cqe_read); try testing.expectEqualSlices(u8, buffer_write[0..], buffer_read[0..]); } @@ -3832,17 +3832,17 @@ test "splice/read" { try testing.expectEqual(Cqe{ .user_data = 0x11111111, .res = buffer_write.len, - .flags = 0, + .flags = .{}, }, cqe_splice_to_pipe); try testing.expectEqual(Cqe{ .user_data = 0x22222222, .res = buffer_write.len, - .flags = 0, + .flags = .{}, }, cqe_splice_from_pipe); try testing.expectEqual(Cqe{ .user_data = 0x33333333, .res = buffer_read.len, - .flags = 0, + .flags = .{}, }, cqe_read); try testing.expectEqualSlices(u8, buffer_write[0..], buffer_read[0..]); } @@ -3899,12 +3899,12 @@ test "write_fixed/read_fixed" { try testing.expectEqual(Cqe{ .user_data = 0x45454545, .res = @as(i32, @intCast(buffers[0].len)), - .flags = 0, + .flags = .{}, }, cqe_write); try testing.expectEqual(Cqe{ .user_data = 0x12121212, .res = @as(i32, @intCast(buffers[1].len)), - .flags = 0, + .flags = .{}, }, cqe_read); try testing.expectEqualSlices(u8, "\x00\x00\x00", buffers[1].base[0..3]); @@ -3939,8 +3939,8 @@ test "openat" { const sqe_openat = try ring.openat(0x33333333,
tmp.dir.fd, path, flags, mode); try testing.expectEqual(Sqe{ .opcode = .OPENAT, - .flags = 0, - .ioprio = 0, + .flags = .{}, + .ioprio = @bitCast(@as(u16, 0)), .fd = tmp.dir.fd, .off = 0, .addr = path_addr, @@ -3961,7 +3961,7 @@ test "openat" { if (cqe_openat.err() == .BADF) return error.SkipZigTest; if (cqe_openat.res <= 0) std.debug.print("\ncqe_openat.res={}\n", .{cqe_openat.res}); try testing.expect(cqe_openat.res > 0); - try testing.expectEqual(@as(u32, 0), cqe_openat.flags); + try testing.expectEqual(@as(Cqe.Flags, @bitCast(@as(u32, 0))), cqe_openat.flags); posix.close(cqe_openat.res); } @@ -3993,7 +3993,7 @@ test "close" { try testing.expectEqual(Cqe{ .user_data = 0x44444444, .res = 0, - .flags = 0, + .flags = .{}, }, cqe_close); } @@ -4155,7 +4155,7 @@ test "timeout (after a relative time)" { try testing.expectEqual(Cqe{ .user_data = 0x55555555, .res = -@as(i32, @intFromEnum(linux.E.TIME)), - .flags = 0, + .flags = .{}, }, cqe); // Tests should not depend on timings: skip test if outside margin. @@ -4175,7 +4175,7 @@ test "timeout (after a number of completions)" { const ts: linux.kernel_timespec = .{ .sec = 3, .nsec = 0 }; const count_completions: u64 = 1; - const sqe_timeout = try ring.timeout(0x66666666, &ts, count_completions, 0); + const sqe_timeout = try ring.timeout(0x66666666, &ts, count_completions, .{}); try testing.expectEqual(Op.TIMEOUT, sqe_timeout.opcode); try testing.expectEqual(count_completions, sqe_timeout.off); _ = try ring.nop(0x77777777); @@ -4185,14 +4185,14 @@ test "timeout (after a number of completions)" { try testing.expectEqual(Cqe{ .user_data = 0x77777777, .res = 0, - .flags = 0, + .flags = .{}, }, cqe_nop); const cqe_timeout = try ring.copy_cqe(); try testing.expectEqual(Cqe{ .user_data = 0x66666666, .res = 0, - .flags = 0, + .flags = .{}, }, cqe_timeout); } @@ -4207,7 +4207,7 @@ test "timeout_remove" { defer ring.deinit(); const ts: linux.kernel_timespec = .{ .sec = 3, .nsec = 0 }; - const sqe_timeout = try ring.timeout(0x88888888, &ts, 0, 0); + const sqe_timeout = try ring.timeout(0x88888888, &ts, 0, .{}); try testing.expectEqual(Op.TIMEOUT, sqe_timeout.opcode); try testing.expectEqual(@as(u64, 0x88888888), sqe_timeout.user_data); @@ -4345,7 +4345,7 @@ test "fallocate" { try testing.expectEqual(Cqe{ .user_data = 0xaaaaaaaa, .res = 0, - .flags = 0, + .flags = .{}, }, cqe); try testing.expectEqual(len, (try file.stat()).size); @@ -4503,7 +4503,7 @@ test "register_files_update" { try testing.expectEqual(Cqe{ .user_data = 0xcccccccc, .res = buffer.len, - .flags = 0, + .flags = .{}, }, try ring.copy_cqe()); try testing.expectEqualSlices(u8, &([_]u8{0} ** buffer.len), buffer[0..]); } @@ -4518,13 +4518,13 @@ test "register_files_update" { // Next read should still work since fd_index in the registered file descriptors hasn't been updated yet. 
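    // Illustrative aside (hypothetical `file` and `buf`): with the typed SQE
    // flags, the registered-file pattern this test exercises becomes a field
    // assignment instead of a bitwise or:
    //
    //     try ring.register_files(&.{file.handle});
    //     const sqe = try ring.read(0xcccccccc, 0, .{ .buffer = &buf }, 0); // fd 0 = table index
    //     sqe.flags.FIXED_FILE = true; // was: sqe.flags |= linux.IOSQE_FIXED_FILE
    //     _ = try ring.submit();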
const sqe = try ring.read(0xcccccccc, fd_index, .{ .buffer = &buffer }, 0); try testing.expectEqual(Op.READ, sqe.opcode); - sqe.flags |= linux.IOSQE_FIXED_FILE; + sqe.flags.FIXED_FILE = true; try testing.expectEqual(@as(u32, 1), try ring.submit()); try testing.expectEqual(Cqe{ .user_data = 0xcccccccc, .res = buffer.len, - .flags = 0, + .flags = .{}, }, try ring.copy_cqe()); try testing.expectEqualSlices(u8, &([_]u8{0} ** buffer.len), buffer[0..]); } @@ -4535,7 +4535,7 @@ test "register_files_update" { // Now this should fail since both fds are sparse (-1) const sqe = try ring.read(0xcccccccc, fd_index, .{ .buffer = &buffer }, 0); try testing.expectEqual(Op.READ, sqe.opcode); - sqe.flags |= linux.IOSQE_FIXED_FILE; + sqe.flags.FIXED_FILE = true; try testing.expectEqual(@as(u32, 1), try ring.submit()); const cqe = try ring.copy_cqe(); @@ -4759,7 +4759,7 @@ test "mkdirat" { try testing.expectEqual(Cqe{ .user_data = 0x12121212, .res = 0, - .flags = 0, + .flags = .{}, }, cqe); // Validate that the directory exist @@ -4807,7 +4807,7 @@ test "symlinkat" { try testing.expectEqual(Cqe{ .user_data = 0x12121212, .res = 0, - .flags = 0, + .flags = .{}, }, cqe); // Validate that the symlink exist @@ -4928,7 +4928,7 @@ test "provide_buffers: read" { } try testing.expect(cqe.flags.F_BUFFER); - const used_buffer_id = cqe.flags >> 16; + const used_buffer_id = @as(u32, @bitCast(cqe.flags)) >> 16; try testing.expect(used_buffer_id >= 0 and used_buffer_id <= 3); try testing.expectEqual(@as(i32, buffer_len), cqe.res); @@ -4992,8 +4992,8 @@ test "provide_buffers: read" { else => |errno| std.debug.panic("unhandled errno: {}", .{errno}), } - try testing.expect(cqe.flags & linux.IORING_CQE_F_BUFFER == linux.IORING_CQE_F_BUFFER); - const used_buffer_id = cqe.flags >> 16; + try testing.expect(cqe.flags.F_BUFFER); + const used_buffer_id = @as(u32, @bitCast(cqe.flags)) >> 16; try testing.expectEqual(used_buffer_id, reprovided_buffer_id); try testing.expectEqual(@as(i32, buffer_len), cqe.res); try testing.expectEqual(@as(u64, 0xdfdfdfdf), cqe.user_data); @@ -5067,7 +5067,7 @@ test "remove_buffers" { } try testing.expect(cqe.flags.F_BUFFER); - const used_buffer_id = cqe.flags >> 16; + const used_buffer_id = @as(u32, @bitCast(cqe.flags)) >> 16; try testing.expect(used_buffer_id >= 0 and used_buffer_id < 4); try testing.expectEqual(@as(i32, buffer_len), cqe.res); try testing.expectEqual(@as(u64, 0xdfdfdfdf), cqe.user_data); @@ -6144,7 +6144,7 @@ test "copy_cqes with wrapping sq.cqes buffer" { { for (0..2) |_| { const sqe = try ring.get_sqe(); - sqe.prep_timeout(&.{ .sec = 0, .nsec = 10000 }, 0, 0); + sqe.prep_timeout(&.{ .sec = 0, .nsec = 10000 }, 0, .{}); try testing.expect(try ring.submit() == 1); } var cqe_count: u32 = 0; @@ -6161,7 +6161,7 @@ test "copy_cqes with wrapping sq.cqes buffer" { for (1..1024) |i| { for (0..4) |_| { const sqe = try ring.get_sqe(); - sqe.prep_timeout(&.{ .sec = 0, .nsec = 10000 }, 0, 0); + sqe.prep_timeout(&.{ .sec = 0, .nsec = 10000 }, 0, .{}); try testing.expect(try ring.submit() == 1); } var cqe_count: u32 = 0; From 45f924ff7c48a84d7b133409824a5100cb25150b Mon Sep 17 00:00:00 2001 From: Bernard Assan Date: Wed, 1 Oct 2025 14:55:54 +0000 Subject: [PATCH 05/41] Improve organization of fn and structs in IoUring Update std to use Statx.Mask Update statx syscall to use Statx.Mask Signed-off-by: Bernard Assan --- lib/std/os/linux.zig | 186 +- lib/std/os/linux/IoUring.zig | 10826 +++++++++++++++++---------------- lib/std/os/linux/test.zig | 2 +- 3 files changed, 5512 insertions(+), 5502 
deletions(-) diff --git a/lib/std/os/linux.zig b/lib/std/os/linux.zig index 8a928e19d9d9..2854f72564f9 100644 --- a/lib/std/os/linux.zig +++ b/lib/std/os/linux.zig @@ -2249,13 +2249,13 @@ pub fn fstatat(dirfd: i32, path: [*:0]const u8, stat_buf: *Stat, flags: u32) usi } } -pub fn statx(dirfd: i32, path: [*:0]const u8, flags: u32, mask: u32, statx_buf: *Statx) usize { +pub fn statx(dirfd: i32, path: [*:0]const u8, flags: u32, mask: Statx.Mask, statx_buf: *Statx) usize { return syscall5( .statx, @as(usize, @bitCast(@as(isize, dirfd))), @intFromPtr(path), flags, - mask, + @intCast(@as(u32, @bitCast(mask))), @intFromPtr(statx_buf), ); } @@ -6936,93 +6936,6 @@ pub const utsname = extern struct { }; pub const HOST_NAME_MAX = 64; -// COMMIT: add new StatxMask fields -// https://github.com/torvalds/linux/blob/755fa5b4fb36627796af19932a432d343220ec63/include/uapi/linux/stat.h#L203 -pub const StatxMask = packed struct(u32) { - type: bool = false, - /// Want/got stx_mode & ~S_IFMT - mode: bool = false, - /// Want/got stx_nlink - nlink: bool = false, - /// Want/got stx_uid - uid: bool = false, - /// Want/got stx_gid - gid: bool = false, - /// Want/got stx_atime - atime: bool = false, - /// Want/got stx_mtime - mtime: bool = false, - /// Want/got stx_ctime - ctime: bool = false, - /// Want/got stx_ino - ino: bool = false, - /// Want/got stx_size - size: bool = false, - /// Want/got stx_blocks - blocks: bool = false, - /// Want/got stx_btime - btime: bool = false, - /// Got stx_mnt_id - mnt_id: bool = false, - /// Want/got direct I/O alignment info - dioalign: bool = false, - /// Want/got extended stx_mount_id - mnt_id_unique: bool = false, - /// Want/got stx_subvol - subvol: bool = false, - /// Want/got atomic_write_* fields - write_atomic: bool = false, - /// Want/got dio read alignment info - dio_read_align: bool = false, - /// Reserved for future struct statx expansion - _: u14 = 0, - - /// The stuff in the normal stat struct (bits 0-10) - pub const basic_stats: StatxMask = .{ - .type = true, - .mode = true, - .nlink = true, - .uid = true, - .gid = true, - .atime = true, - .mtime = true, - .ctime = true, - .ino = true, - .size = true, - .blocks = true, - }; -}; - -// COMMIT: Statx as Packed Struct -// https://github.com/torvalds/linux/blob/755fa5b4fb36627796af19932a432d343220ec63/include/uapi/linux/stat.h#L248 -pub const StatxAttr = packed struct(u64) { - _0: u2 = 0, - /// File is compressed by the fs - compressed: bool = false, - _1: u1 = 0, - /// File is marked immutable - immutable: bool = false, - /// File is append-only - append: bool = false, - /// File is not to be dumped - nodump: bool = false, - _2: u4 = 0, - /// File requires key to decrypt in fs - encrypted: bool = false, - /// Dir: Automount trigger - automount: bool = false, - /// Root of a mount - mount_root: bool = false, - _3: u6 = 0, - /// Verity protected file - verity: bool = false, - /// File is currently in DAX state - dax: bool = false, - /// File supports atomic write operations - write_atomic: bool = false, - _: u41 = 0, -}; - // COMMIT: RenameFlags pub const RenameFlags = packed struct(u32) { /// Don't overwrite target @@ -7043,13 +6956,13 @@ pub const statx_timestamp = extern struct { /// Renamed to `Statx` to not conflict with the `statx` function. 
pub const Statx = extern struct { /// Mask of bits indicating filled fields - mask: StatxMask, + mask: Mask, /// Block size for filesystem I/O blksize: u32, /// Extra file attribute indicators - attributes: StatxAttr, + attributes: Attr, /// Number of hard links nlink: u32, @@ -7074,7 +6987,7 @@ pub const Statx = extern struct { blocks: u64, /// Mask to show what's supported in `attributes`. - attributes_mask: StatxAttr, + attributes_mask: Attr, /// Last access file timestamp atime: statx_timestamp, @@ -7101,6 +7014,95 @@ pub const Statx = extern struct { dev_minor: u32, __pad2: [14]u64, + + // COMMIT: add new StatxMask fields + // https://github.com/torvalds/linux/blob/755fa5b4fb36627796af19932a432d343220ec63/include/uapi/linux/stat.h#L203 + /// matches STATX_* in kernel + pub const Mask = packed struct(u32) { + type: bool = false, + /// Want/got stx_mode & ~S_IFMT + mode: bool = false, + /// Want/got stx_nlink + nlink: bool = false, + /// Want/got stx_uid + uid: bool = false, + /// Want/got stx_gid + gid: bool = false, + /// Want/got stx_atime + atime: bool = false, + /// Want/got stx_mtime + mtime: bool = false, + /// Want/got stx_ctime + ctime: bool = false, + /// Want/got stx_ino + ino: bool = false, + /// Want/got stx_size + size: bool = false, + /// Want/got stx_blocks + blocks: bool = false, + /// Want/got stx_btime + btime: bool = false, + /// Got stx_mnt_id + mnt_id: bool = false, + /// Want/got direct I/O alignment info + dioalign: bool = false, + /// Want/got extended stx_mount_id + mnt_id_unique: bool = false, + /// Want/got stx_subvol + subvol: bool = false, + /// Want/got atomic_write_* fields + write_atomic: bool = false, + /// Want/got dio read alignment info + dio_read_align: bool = false, + /// Reserved for future struct statx expansion + _: u14 = 0, + + /// The stuff in the normal stat struct (bits 0-10) + pub const basic_stats: Mask = .{ + .type = true, + .mode = true, + .nlink = true, + .uid = true, + .gid = true, + .atime = true, + .mtime = true, + .ctime = true, + .ino = true, + .size = true, + .blocks = true, + }; + }; + + // COMMIT: Statx as Packed Struct + // https://github.com/torvalds/linux/blob/755fa5b4fb36627796af19932a432d343220ec63/include/uapi/linux/stat.h#L248 + /// matches STATX_ATTR_* in kernel + pub const Attr = packed struct(u64) { + _0: u2 = 0, + /// File is compressed by the fs + compressed: bool = false, + _1: u1 = 0, + /// File is marked immutable + immutable: bool = false, + /// File is append-only + append: bool = false, + /// File is not to be dumped + nodump: bool = false, + _2: u4 = 0, + /// File requires key to decrypt in fs + encrypted: bool = false, + /// Dir: Automount trigger + automount: bool = false, + /// Root of a mount + mount_root: bool = false, + _3: u6 = 0, + /// Verity protected file + verity: bool = false, + /// File is currently in DAX state + dax: bool = false, + /// File supports atomic write operations + write_atomic: bool = false, + _: u41 = 0, + }; }; pub const addrinfo = extern struct { diff --git a/lib/std/os/linux/IoUring.zig b/lib/std/os/linux/IoUring.zig index a59b6827b81a..0b9488f17469 100644 --- a/lib/std/os/linux/IoUring.zig +++ b/lib/std/os/linux/IoUring.zig @@ -16,3608 +16,2659 @@ cq: Cq, flags: uflags.Setup, features: uflags.Features, -// COMMIT: move IoUring constants to Constants -pub const constants = struct { - /// If sqe.file_index (splice_fd_in in Zig Struct) is set to this for opcodes that instantiate a new - /// an available direct descriptor instead of having the application pass one - /// direct 
descriptor (like openat/openat2/accept), then io_uring will allocate - /// in. The picked direct descriptor will be returned in cqe.res, or -ENFILE - /// if the space is full. - pub const FILE_INDEX_ALLOC = std.math.maxInt(u32); - - pub const CMD_MASK = 1 << 0; - - pub const TIMEOUT_CLOCK_MASK = ((1 << 2) | (1 << 3)); - pub const TIMEOUT_UPDATE_MASK = ((1 << 1) | (1 << 4)); +/// A friendly way to setup an io_uring, with default linux.io_uring_params. +/// `entries` must be a power of two between 1 and 32768, although the kernel will make the final +/// call on how many entries the submission and completion queues will ultimately have, +/// see https://github.com/torvalds/linux/blob/v5.8/fs/io_uring.c#L8027-L8050. +/// Matches the interface of io_uring_queue_init() in liburing. +pub fn init(entries: u16, flags: uflags.Setup) !IoUring { + var params = mem.zeroInit(Params, .{ + .flags = flags, + .sq_thread_idle = 1000, + }); + return try .init_params(entries, &params); +} - pub const CQE_BUFFER_SHIFT = 16; +/// A powerful way to setup an io_uring, if you want to tweak linux.io_uring_params such as submission +/// queue thread cpu affinity or thread idle timeout (the kernel and our default is 1 second). +/// `params` is passed by reference because the kernel needs to modify the parameters. +/// Matches the interface of io_uring_queue_init_params() in liburing. +pub fn init_params(entries: u16, p: *Params) !IoUring { + if (entries == 0) return error.EntriesZero; + if (!std.math.isPowerOfTwo(entries)) return error.EntriesNotPowerOfTwo; + assert(p.sq_entries == 0); + assert(p.features.empty()); + assert(p.resv[0] == 0); + assert(p.resv[1] == 0); + assert(p.resv[2] == 0); - /// cqe.res for IORING_CQE_F_NOTIF if IORING_SEND_ZC_REPORT_USAGE was - /// requested It should be treated as a flag, all other bits of cqe.res - /// should be treated as reserved! - pub const NOTIF_USAGE_ZC_COPIED = (1 << 31); + assert(p.cq_entries == 0 or p.flags.CQSIZE); + assert(p.wq_fd == 0 or p.flags.ATTACH_WQ); - //Magic offsets for the application to mmap the data it needs - pub const OFF_SQ_RING = 0; - pub const OFF_CQ_RING = 0x8000000; - pub const OFF_SQES = 0x10000000; - // COMMIT: new magic constants - pub const OFF_PBUF_RING = 0x80000000; - pub const OFF_PBUF_SHIFT = 16; - pub const OFF_MMAP_MASK = 0xf8000000; + // flags compatibility + assert(!(p.flags.SQPOLL and (p.flags.COOP_TASKRUN or p.flags.TASKRUN_FLAG or p.flags.DEFER_TASKRUN))); + if (p.flags.SQ_AFF) assert(p.flags.SQPOLL); + if (p.flags.DEFER_TASKRUN) assert(p.flags.SINGLE_ISSUER); - /// Register a fully sparse file space, rather than pass in an array of all -1 file descriptors.
- pub const RSRC_REGISTER_SPARSE = 1 << 0; + const res = linux.io_uring_setup(entries, p); + switch (linux.errno(res)) { + .SUCCESS => {}, + .FAULT => return error.ParamsOutsideAccessibleAddressSpace, + // The resv array contains non-zero data, p.flags contains an unsupported flag, + // entries out of bounds, IORING_SETUP_SQ_AFF was specified without IORING_SETUP_SQPOLL, + // or IORING_SETUP_CQSIZE was specified but linux.io_uring_params.cq_entries was invalid: + .INVAL => return error.ArgumentsInvalid, + .MFILE => return error.ProcessFdQuotaExceeded, + .NFILE => return error.SystemFdQuotaExceeded, + .NOMEM => return error.SystemResources, + // IORING_SETUP_SQPOLL was specified but effective user ID lacks sufficient privileges, + // or a container seccomp policy prohibits io_uring syscalls: + .PERM => return error.PermissionDenied, + .NOSYS => return error.SystemOutdated, + else => |errno| return posix.unexpectedErrno(errno), + } + const fd = @as(linux.fd_t, @intCast(res)); + assert(fd >= 0); + errdefer posix.close(fd); - /// Skip updating fd indexes set to this value in the fd table - pub const REGISTER_FILES_SKIP = -2; + // Kernel versions 5.4 and up use only one mmap() for the submission and completion queues. + // This is not an optional feature for us... if the kernel does it, we have to do it. + // The thinking on this by the kernel developers was that both the submission and the + // completion queue rings have sizes just over a power of two, but the submission queue ring + // is significantly smaller with u32 slots. By bundling both in a single mmap, the kernel + // gets the submission queue ring for free. + // See https://patchwork.kernel.org/patch/11115257 for the kernel patch. + // We do not support the double mmap() done before 5.4, because we want to keep the + // init/deinit mmap paths simple and because io_uring has had many bug fixes even since 5.4. + if (!p.features.SINGLE_MMAP) { + return error.SystemOutdated; + } - // COMMIT: new TX Timestamp definition - /// SOCKET_URING_OP_TX_TIMESTAMP definitions - pub const TIMESTAMP_HW_SHIFT = 16; - /// The cqe.flags bit from which the timestamp type is stored - pub const TIMESTAMP_TYPE_SHIFT = (TIMESTAMP_HW_SHIFT + 1); - /// The cqe.flags flag signifying whether it's a hardware timestamp - pub const CQE_F_TSTAMP_HW = (1 << TIMESTAMP_HW_SHIFT); + // Check that the kernel has actually set params and that "impossible is nothing". + assert(p.sq_entries != 0); + assert(p.cq_entries != 0); + assert(p.cq_entries >= p.sq_entries); - /// The bit from which area id is encoded into offsets - pub const ZCRX_AREA_SHIFT = 48; - pub const ZCRX_AREA_MASK = (~((1 << ZCRX_AREA_SHIFT) - 1)); + // From here on, we only need to read from params, so pass `p` by value as immutable. + // The completion queue shares the mmap with the submission queue, so pass `sq` there too. + var sq: Sq = try .init(fd, p.*); + errdefer sq.deinit(); + var cq: Cq = try .init(fd, p.*, sq); + errdefer cq.deinit(); - // flag added to the opcode to use a registered ring fd - pub const REGISTER_USE_REGISTERED_RING = 1 << 31; -}; + // Check that our starting state is as we expect. + assert(sq.head.* == 0); + assert(sq.tail.* == 0); + assert(sq.mask == p.sq_entries - 1); + // Allow flags.* to be non-zero, since the kernel may set IORING_SQ_NEED_WAKEUP at any time. 
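    // Illustrative aside (hypothetical configuration; SQPOLL needs kernel
    // support and privileges): the typed Setup flags make the compatibility
    // rules asserted in init_params visible at the call site:
    //
    //     var ring = try IoUring.init(8, .{ .SQPOLL = true, .SQ_AFF = true }); // ok: SQ_AFF implies SQPOLL
    //     // .{ .SQPOLL = true, .DEFER_TASKRUN = true } would trip the asserts above.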
+ assert(sq.dropped.* == 0); + assert(sq.array.len == p.sq_entries); + assert(sq.sqes.len == p.sq_entries); + assert(sq.sqe_head == 0); + assert(sq.sqe_tail == 0); -// COMMIT: move IoUring flags to Flags struct -pub const uflags = struct { - /// io_uring_setup() flags - pub const Setup = packed struct(u32) { - /// io_context is polled - IOPOLL: bool = false, - /// SQ poll thread - SQPOLL: bool = false, - /// sq_thread_cpu is valid - SQ_AFF: bool = false, - /// app defines CQ size - CQSIZE: bool = false, - /// clamp SQ/CQ ring sizes - CLAMP: bool = false, - /// attach to existing wq - ATTACH_WQ: bool = false, - /// start with ring disabled - R_DISABLED: bool = false, - /// continue submit on error - SUBMIT_ALL: bool = false, - ///Cooperative task running. When requests complete, they often require - ///forcing the submitter to transition to the kernel to complete. If this - ///flag is set, work will be done when the task transitions anyway, rather - ///than force an inter-processor interrupt reschedule. This avoids interrupting - ///a task running in userspace, and saves an IPI. - COOP_TASKRUN: bool = false, - ///If COOP_TASKRUN is set, get notified if task work is available for - ///running and a kernel transition would be needed to run it. This sets - ///IORING_SQ_TASKRUN in the sq ring flags. Not valid with COOP_TASKRUN. - TASKRUN_FLAG: bool = false, - /// SQEs are 128 byte - SQE128: bool = false, - /// CQEs are 32 byte - CQE32: bool = false, - /// Only one task is allowed to submit requests - SINGLE_ISSUER: bool = false, - /// Defer running task work to get events. - /// Rather than running bits of task work whenever the task transitions - /// try to do it just before it is needed. - DEFER_TASKRUN: bool = false, - /// Application provides the memory for the rings - NO_MMAP: bool = false, - /// Register the ring fd in itself for use with - /// IORING_REGISTER_USE_REGISTERED_RING; return a registered fd index rather - /// than an fd. - REGISTERED_FD_ONLY: bool = false, - /// Removes indirection through the SQ index array. - NO_SQARRAY: bool = false, - // COMMIT: new setup flags - /// Use hybrid poll in iopoll process - HYBRID_IOPOLL: bool = false, - /// Allow both 16b and 32b CQEs. If a 32b CQE is posted, it will have - /// IORING_CQE_F_32 set in cqe.flags. - CQE_MIXED: bool = false, - _unused: u13 = 0, - }; + assert(cq.head.* == 0); + assert(cq.tail.* == 0); + assert(cq.mask == p.cq_entries - 1); + assert(cq.overflow.* == 0); + assert(cq.cqes.len == p.cq_entries); - /// sqe.uring_cmd_flags (rw_flags in the Zig struct) - /// top 8bits aren't available for userspace - /// use registered buffer; pass this flag along with setting sqe.buf_index. 
- pub const Cmd = packed struct(u32) { - CMD_FIXED: bool = false, - _unused: u31 = 0, + return .{ + .fd = fd, + .sq = sq, + .cq = cq, + .flags = p.flags, + .features = p.features, }; +} - /// sqe.fsync_flags (rw_flags in the Zig struct) - pub const Fsync = packed struct(u32) { - DATASYNC: bool = false, - _unused: u31 = 0, - }; +pub fn deinit(self: *IoUring) void { + assert(self.fd >= 0); + // The mmaps depend on the fd, so the order of these calls is important: + self.cq.deinit(); + self.sq.deinit(); + posix.close(self.fd); + self.fd = -1; +} - /// sqe.timeout_flags - pub const Timeout = packed struct(u32) { - TIMEOUT_ABS: bool = false, - /// Available since Linux 5.11 - TIMEOUT_UPDATE: bool = false, - /// Available since Linux 5.15 - TIMEOUT_BOOTTIME: bool = false, - /// Available since Linux 5.15 - TIMEOUT_REALTIME: bool = false, - /// Available since Linux 5.15 - LINK_TIMEOUT_UPDATE: bool = false, - /// Available since Linux 5.16 - TIMEOUT_ETIME_SUCCESS: bool = false, - // COMMIT: new Timeout Flag - // TODO: add when it became available - TIMEOUT_MULTISHOT: bool = false, - _unused: u25 = 0, - }; +/// Returns a pointer to a vacant SQE, or an error if the submission queue is full. +/// We follow the implementation (and atomics) of liburing's `io_uring_get_sqe()` exactly. +/// However, instead of a null we return an error to force safe handling. +/// Any situation where the submission queue is full tends more towards a control flow error, +/// and the null return in liburing is more a C idiom than anything else, for lack of a better +/// alternative. In Zig, we have first-class error handling... so let's use it. +/// Matches the implementation of io_uring_get_sqe() in liburing. +pub fn get_sqe(self: *IoUring) !*Sqe { + const head = @atomicLoad(u32, self.sq.head, .acquire); + // Remember that these head and tail offsets wrap around every four billion operations. + // We must therefore use wrapping addition and subtraction to avoid a runtime crash. + const next = self.sq.sqe_tail +% 1; + if (next -% head > self.sq.sqes.len) return error.SubmissionQueueFull; + const sqe = &self.sq.sqes[self.sq.sqe_tail & self.sq.mask]; + self.sq.sqe_tail = next; + return sqe; +} - /// sqe.splice_flags (rw_flags in Zig Struct) - /// extends splice(2) flags - pub const Splice = packed struct(u32) { - _unused: u31 = 0, - /// the last bit of __u32 - F_FD_IN_FIXED: bool = false, - }; +/// Submits the SQEs acquired via get_sqe() to the kernel. You can call this once after you have +/// called get_sqe() multiple times to setup multiple I/O requests. +/// Returns the number of SQEs submitted, if not used alongside IORING_SETUP_SQPOLL. +/// If the io_uring instance is uses IORING_SETUP_SQPOLL, the value returned on success is not +/// guaranteed to match the amount of actually submitted sqes during this call. A value higher +/// or lower, including 0, may be returned. +/// Matches the implementation of io_uring_submit() in liburing. +pub fn submit(self: *IoUring) !u32 { + return self.submit_and_wait(0); +} - /// POLL_ADD flags. Note that since sqe.poll_events (rw_flags in Zig Struct) - /// is the flag space, the command flags for POLL_ADD are stored in sqe.len. - pub const Poll = packed struct(u32) { - /// IORING_POLL_ADD_MULTI - /// Multishot poll. Sets IORING_CQE_F_MORE if the poll handler will continue - /// to report CQEs on behalf of the same SQE. 
- ADD_MULTI: bool = false, - // TODO: verify this doc comment is valid for the 2 flags below - /// IORING_POLL_UPDATE - /// Update existing poll request, matching sqe.addr as the old user_data - /// field. - UPDATE_EVENTS: bool = false, - /// IORING_POLL_UPDATE - /// Update existing poll request, matching sqe.addr as the old user_data - /// field. - UPDATE_USER_DATA: bool = false, - /// IORING_POLL_LEVEL - /// Level triggered poll. - ADD_LEVEL: bool = false, - _unused: u28 = 0, - }; - - /// ASYNC_CANCEL flags. - pub const AsyncCancel = packed struct(u32) { - /// IORING_ASYNC_CANCEL_ALL - /// Cancel all requests that match the given key - CANCEL_ALL: bool = false, - /// IORING_ASYNC_CANCEL_FD - /// Key off 'fd' for cancelation rather than the request 'user_data' - CANCEL_FD: bool = false, - /// IORING_ASYNC_CANCEL_ANY - /// Match any request - CANCEL_ANY: bool = false, - /// IORING_ASYNC_CANCEL_FD_FIXED - /// 'fd' passed in is a fixed descriptor - CANCEL_FD_FIXED: bool = false, - // COMMIT: new AsyncCancel Flags - /// IORING_ASYNC_CANCEL_USERDATA - /// Match on user_data, default for no other key - CANCEL_USERDATA: bool = false, - /// IORING_ASYNC_CANCEL_OP - /// Match request based on opcode - CANCEL_OP: bool = false, - _unused: u26 = 0, - }; - - /// send/sendmsg and recv/recvmsg flags (sqe.ioprio) - pub const SendRecv = packed struct(u16) { - /// IORING_RECVSEND_POLL_FIRST - /// If set, instead of first attempting to send or receive and arm poll - /// if that yields an -EAGAIN result, arm poll upfront and skip the - /// initial transfer attempt. - RECVSEND_POLL_FIRST: bool = false, - /// IORING_RECV_MULTISHOT - /// Multishot recv. Sets IORING_CQE_F_MORE if the handler will continue - /// to report CQEs on behalf of the same SQE. - RECV_MULTISHOT: bool = false, - /// IORING_RECVSEND_FIXED_BUF - /// Use registered buffers, the index is stored in the buf_index field. - RECVSEND_FIXED_BUF: bool = false, - /// IORING_SEND_ZC_REPORT_USAGE - /// If set, SEND[MSG]_ZC should report the zerocopy usage in cqe.res - /// for the IORING_CQE_F_NOTIF cqe. 0 is reported if zerocopy was - /// actually possible. IORING_NOTIF_USAGE_ZC_COPIED if data was copied - /// (at least partially). - SEND_ZC_REPORT_USAGE: bool = false, - /// IORING_RECVSEND_BUNDLE - /// Used with IOSQE_BUFFER_SELECT. If set, send or recv will grab as - /// many buffers from the buffer group ID given and send them all. - /// The completion result will be the number of buffers send, with the - /// starting buffer ID in cqe.flags as per usual for provided buffer - /// usage. The buffers will be contiguous from the starting buffer ID. - RECVSEND_BUNDLE: bool = false, - // COMMIT: new flags - /// IORING_SEND_VECTORIZED - /// If set, SEND[_ZC] will take a pointer to a io_vec to allow - /// vectorized send operations. - SEND_VECTORIZED: bool = false, - _: u10 = 0, - }; - - /// accept flags stored in sqe.ioprio - pub const Accept = packed struct(u16) { - MULTISHOT: bool = false, - // COMMIT: new Flags - DONTWAIT: bool = false, - POLL_FIRST: bool = false, - _unused: u13 = 0, - }; - - /// IORING_OP_MSG_RING flags (sqe.msg_ring_flags or sqe.rw_flags in Zig Struct) - pub const MsgRing = packed struct(u32) { - /// IORING_MSG_RING_CQE_SKIP Don't post a CQE to the target ring. - /// Not applicable for IORING_MSG_DATA, obviously. 
- CQE_SKIP: bool = false, - /// Pass through the flags from sqe.file_index to cqe.flags - FLAGS_PASS: bool = false, - _unused: u30 = 0, - }; - - // COMMIT: new flag - /// IORING_OP_FIXED_FD_INSTALL flags (sqe.install_fd_flags or sqe.rw_flags in Zig Struct) - pub const FixedFd = packed struct(u32) { - /// IORING_FIXED_FD_NO_CLOEXEC Don't mark the fd as O_CLOEXEC - NO_CLOEXEC: bool = false, - }; - - /// COMMIT: new flags - /// IORING_OP_NOP flags (sqe.nop_flags or sqe.rw_flags in Zig Struct) - pub const Nop = packed struct(u32) { - /// IORING_NOP_INJECT_RESULT Inject result from sqe.result - INJECT_RESULT: bool = false, - _unused: u4 = 0, - CQE32: bool = false, - _unused_1: u26 = 0, - }; - - /// io_uring_enter(2) flags - pub const Enter = packed struct(u32) { - GETEVENTS: bool = false, - SQ_WAKEUP: bool = false, - SQ_WAIT: bool = false, - EXT_ARG: bool = false, - REGISTERED_RING: bool = false, - // COMMIT: new flags - ABS_TIMER: bool = false, - EXT_ARG_REG: bool = false, - NO_IOWAIT: bool = false, - _unused: u24 = 0, - }; - - /// io_uring_params.features flags - const Features = packed struct(u32) { - SINGLE_MMAP: bool = false, - NODROP: bool = false, - SUBMIT_STABLE: bool = false, - RW_CUR_POS: bool = false, - CUR_PERSONALITY: bool = false, - FAST_POLL: bool = false, - POLL_32BITS: bool = false, - SQPOLL_NONFIXED: bool = false, - EXT_ARG: bool = false, - NATIVE_WORKERS: bool = false, - RSRC_TAGS: bool = false, - CQE_SKIP: bool = false, - LINKED_FILE: bool = false, - // COMMIT: add new Feature Flags - REG_REG_RING: bool = false, - RECVSEND_BUNDLE: bool = false, - MIN_TIMEOUT: bool = false, - RW_ATTR: bool = false, - NO_IOWAIT: bool = false, - _unused: u14 = 0, - }; -}; - -// IO completion data structure (Completion Queue Entry) -pub const Cqe = extern struct { - /// sqe.user_data value passed back - user_data: u64, - /// result code for this event - res: i32, - flags: Flags, - // COMMIT: add big_cqe which was missing in io_uring_cqe type declaration - // TODO: add support for the IORING_SETUP_CQE32 case - /// If the ring is initialized with IORING_SETUP_CQE32, then this field - /// contains 16-bytes of padding, doubling the size of the CQE. - // big_cqe: ?[2]u64, - - /// cqe.flags - pub const Flags = packed struct(u32) { - /// IORING_CQE_F_BUFFER If set, the upper 16 bits are the buffer ID - F_BUFFER: bool = false, - /// IORING_CQE_F_MORE If set, parent SQE will generate more CQE entries - F_MORE: bool = false, - /// IORING_CQE_F_SOCK_NONEMPTY If set, more data to read after socket recv - F_SOCK_NONEMPTY: bool = false, - /// IORING_CQE_F_NOTIF Set for notification CQEs. Can be used to distinct - /// them from sends. - F_NOTIF: bool = false, - /// IORING_CQE_F_BUF_MORE If set, the buffer ID set in the completion will get - /// more completions. In other words, the buffer is being - /// partially consumed, and will be used by the kernel for - /// more completions. This is only set for buffers used via - /// the incremental buffer consumption, as provided by - /// a ring buffer setup with IOU_PBUF_RING_INC. For any - /// other provided buffer type, all completions with a - /// buffer passed back is automatically returned to the - /// application. - F_BUF_MORE: bool = false, - // COMMIT: new flags - /// IORING_CQE_F_SKIP If set, then the application/liburing must ignore this - /// CQE. It's only purpose is to fill a gap in the ring, - /// if a large CQE is attempted posted when the ring has - /// just a single small CQE worth of space left before - /// wrapping. 
- F_SKIP: bool = false, - _unused: u9 = 0, - /// IORING_CQE_F_32 If set, this is a 32b/big-cqe posting. Use with rings - /// setup in a mixed CQE mode, where both 16b and 32b - /// CQEs may be posted to the CQ ring. - F_32: bool = false, - _unused_1: u16 = 0, - }; - - pub fn err(self: Cqe) linux.E { - if (self.res > -4096 and self.res < 0) { - return @as(linux.E, @enumFromInt(-self.res)); - } - return .SUCCESS; - } - - // On successful completion of the provided buffers IO request, the CQE flags field - // will have IORING_CQE_F_BUFFER set and the selected buffer ID will be indicated by - // the upper 16-bits of the flags field. - pub fn buffer_id(self: Cqe) !u16 { - if (!self.flags.F_BUFFER) { - return error.NoBufferSelected; - } - return @intCast(@as(u32, @bitCast(self.flags)) >> constants.CQE_BUFFER_SHIFT); - } -}; - -/// IO submission data structure (Submission Queue Entry) -/// matches io_uring_sqe in liburing -pub const Sqe = extern struct { - /// type of operation for this sqe - opcode: Op, - /// IOSQE_* flags - flags: IoSqe, - /// ioprio for the request - ioprio: packed union { - send_recv: uflags.SendRecv, - accept: uflags.Accept, - }, - /// file descriptor to do IO on - fd: i32, - /// offset into file - off: u64, - /// pointer to buffer or iovecs - addr: u64, - /// buffer size or number of iovecs - len: u32, - /// flags for any sqe operation - /// rw_flags | fsync_flags | poll_event | poll32_event | sync_range_flags | msg_flags - /// timeout_flags | accept_flags | cancel_flags | open_flags | statx_flags - /// fadvise_advice | splice_flags | rename_flags | unlink_flags | hardlink_flags - /// xattr_flags | msg_ring_flags | uring_cmd_flags | waitid_flags | futex_flags - /// install_fd_flags | nop_flags | pipe_flags - rw_flags: u32, - /// data to be passed back at completion time - user_data: u64, - /// index into fixed buffers or for grouped buffer selection - buf_index: u16, - personality: u16, - splice_fd_in: i32, - addr3: u64, - resv: u64, - - /// sqe.flags - pub const IoSqe = packed struct(u8) { - /// use fixed fileset - FIXED_FILE: bool = false, - /// issue after inflight IO - IO_DRAIN: bool = false, - /// links next sqe - IO_LINK: bool = false, - /// like LINK, but stronger - IO_HARDLINK: bool = false, - /// always go async - ASYNC: bool = false, - /// select buffer from sqe->buf_group - BUFFER_SELECT: bool = false, - /// don't post CQE if request succeeded - CQE_SKIP_SUCCESS: bool = false, - _: u1 = 0, - }; - - pub fn prep_nop(sqe: *Sqe) void { - sqe.* = .{ - .opcode = .NOP, - .flags = .{}, - .ioprio = @bitCast(@as(u16, 0)), - .fd = 0, - .off = 0, - .addr = 0, - .len = 0, - .rw_flags = 0, - .user_data = 0, - .buf_index = 0, - .personality = 0, - .splice_fd_in = 0, - .addr3 = 0, - .resv = 0, - }; - } - - pub fn prep_fsync(sqe: *Sqe, fd: linux.fd_t, flags: uflags.Fsync) void { - sqe.* = .{ - .opcode = .FSYNC, - .flags = .{}, - .ioprio = @bitCast(@as(u16, 0)), - .fd = fd, - .off = 0, - .addr = 0, - .len = 0, - .rw_flags = @bitCast(flags), - .user_data = 0, - .buf_index = 0, - .personality = 0, - .splice_fd_in = 0, - .addr3 = 0, - .resv = 0, - }; - } - - pub fn prep_rw( - sqe: *Sqe, - op: Op, - fd: linux.fd_t, - addr: u64, - len: usize, - offset: u64, - ) void { - sqe.* = .{ - .opcode = op, - .flags = .{}, - .ioprio = @bitCast(@as(u16, 0)), - .fd = fd, - .off = offset, - .addr = addr, - .len = @intCast(len), - .rw_flags = 0, - .user_data = 0, - .buf_index = 0, - .personality = 0, - .splice_fd_in = 0, - .addr3 = 0, - .resv = 0, - }; - } - - pub fn prep_read(sqe: *Sqe, fd: 
linux.fd_t, buffer: []u8, offset: u64) void { - sqe.prep_rw(.READ, fd, @intFromPtr(buffer.ptr), buffer.len, offset); - } - - pub fn prep_write(sqe: *Sqe, fd: linux.fd_t, buffer: []const u8, offset: u64) void { - sqe.prep_rw(.WRITE, fd, @intFromPtr(buffer.ptr), buffer.len, offset); - } - - pub fn prep_splice(sqe: *Sqe, fd_in: linux.fd_t, off_in: u64, fd_out: linux.fd_t, off_out: u64, len: usize) void { - sqe.prep_rw(.SPLICE, fd_out, undefined, len, off_out); - sqe.addr = off_in; - sqe.splice_fd_in = fd_in; - } - - pub fn prep_readv( - sqe: *Sqe, - fd: linux.fd_t, - iovecs: []const std.posix.iovec, - offset: u64, - ) void { - sqe.prep_rw(.READV, fd, @intFromPtr(iovecs.ptr), iovecs.len, offset); - } - - pub fn prep_writev( - sqe: *Sqe, - fd: linux.fd_t, - iovecs: []const std.posix.iovec_const, - offset: u64, - ) void { - sqe.prep_rw(.WRITEV, fd, @intFromPtr(iovecs.ptr), iovecs.len, offset); - } - - pub fn prep_read_fixed(sqe: *Sqe, fd: linux.fd_t, buffer: *std.posix.iovec, offset: u64, buffer_index: u16) void { - sqe.prep_rw(.READ_FIXED, fd, @intFromPtr(buffer.base), buffer.len, offset); - sqe.buf_index = buffer_index; - } - - pub fn prep_write_fixed(sqe: *Sqe, fd: linux.fd_t, buffer: *std.posix.iovec, offset: u64, buffer_index: u16) void { - sqe.prep_rw(.WRITE_FIXED, fd, @intFromPtr(buffer.base), buffer.len, offset); - sqe.buf_index = buffer_index; - } - - pub fn prep_accept( - sqe: *Sqe, - fd: linux.fd_t, - addr: ?*linux.sockaddr, - addrlen: ?*linux.socklen_t, - flags: linux.SOCK, - ) void { - // `addr` holds a pointer to `sockaddr`, and `addr2` holds a pointer to socklen_t`. - // `addr2` maps to `sqe.off` (u64) instead of `sqe.len` (which is only a u32). - sqe.prep_rw(.ACCEPT, fd, @intFromPtr(addr), 0, @intFromPtr(addrlen)); - sqe.rw_flags = flags; - } - - /// accept directly into the fixed file table - pub fn prep_accept_direct( - sqe: *Sqe, - fd: linux.fd_t, - addr: ?*linux.sockaddr, - addrlen: ?*linux.socklen_t, - flags: linux.SOCK, - file_index: u32, - ) void { - prep_accept(sqe, fd, addr, addrlen, flags); - set_target_fixed_file(sqe, file_index); - } - - pub fn prep_multishot_accept( - sqe: *Sqe, - fd: linux.fd_t, - addr: ?*linux.sockaddr, - addrlen: ?*linux.socklen_t, - flags: linux.SOCK, - ) void { - prep_accept(sqe, fd, addr, addrlen, flags); - sqe.ioprio = .{ .accept = .{ .MULTISHOT = true } }; - } - - /// multishot accept directly into the fixed file table - pub fn prep_multishot_accept_direct( - sqe: *Sqe, - fd: linux.fd_t, - addr: ?*linux.sockaddr, - addrlen: ?*linux.socklen_t, - flags: linux.SOCK, - ) void { - prep_multishot_accept(sqe, fd, addr, addrlen, flags); - set_target_fixed_file(sqe, constants.FILE_INDEX_ALLOC); - } - - fn set_target_fixed_file(sqe: *Sqe, file_index: u32) void { - const sqe_file_index: u32 = if (file_index == constants.FILE_INDEX_ALLOC) - constants.FILE_INDEX_ALLOC - else - // 0 means no fixed files, indexes should be encoded as "index + 1" - file_index + 1; - // This filed is overloaded in liburing: - // splice_fd_in: i32 - // sqe_file_index: u32 - sqe.splice_fd_in = @bitCast(sqe_file_index); - } - - pub fn prep_connect( - sqe: *Sqe, - fd: linux.fd_t, - addr: *const linux.sockaddr, - addrlen: linux.socklen_t, - ) void { - // `addrlen` maps to `sqe.off` (u64) instead of `sqe.len` (which is only a u32). 
- sqe.prep_rw(.CONNECT, fd, @intFromPtr(addr), 0, addrlen); - } - - pub fn prep_epoll_ctl( - sqe: *Sqe, - epfd: linux.fd_t, - fd: linux.fd_t, - op: u32, - ev: ?*linux.epoll_event, - ) void { - sqe.prep_rw(.EPOLL_CTL, epfd, @intFromPtr(ev), op, @intCast(fd)); - } - - pub fn prep_recv(sqe: *Sqe, fd: linux.fd_t, buffer: []u8, flags: linux.MSG) void { - sqe.prep_rw(.RECV, fd, @intFromPtr(buffer.ptr), buffer.len, 0); - sqe.rw_flags = flags; - } - - // TODO: review recv `flags` - pub fn prep_recv_multishot( - sqe: *Sqe, - fd: linux.fd_t, - buffer: []u8, - flags: linux.MSG, - ) void { - sqe.prep_recv(fd, buffer, flags); - sqe.ioprio = .{ .send_recv = .{ .RECV_MULTISHOT = true } }; - } - - pub fn prep_recvmsg( - sqe: *Sqe, - fd: linux.fd_t, - msg: *linux.msghdr, - flags: linux.MSG, - ) void { - sqe.prep_rw(.RECVMSG, fd, @intFromPtr(msg), 1, 0); - sqe.rw_flags = flags; - } - - pub fn prep_recvmsg_multishot( - sqe: *Sqe, - fd: linux.fd_t, - msg: *linux.msghdr, - flags: linux.MSG, - ) void { - sqe.prep_recvmsg(fd, msg, flags); - sqe.ioprio = .{ .send_recv = .{ .RECV_MULTISHOT = true } }; - } - - // COMMIT: fix send[|recv] flag param type - pub fn prep_send(sqe: *Sqe, fd: linux.fd_t, buffer: []const u8, flags: linux.MSG) void { - sqe.prep_rw(.SEND, fd, @intFromPtr(buffer.ptr), buffer.len, 0); - sqe.rw_flags = flags; - } - - pub fn prep_send_zc(sqe: *Sqe, fd: linux.fd_t, buffer: []const u8, flags: linux.MSG, zc_flags: uflags.SendRecv) void { - sqe.prep_rw(.SEND_ZC, fd, @intFromPtr(buffer.ptr), buffer.len, 0); - sqe.rw_flags = flags; - sqe.ioprio = .{ .send_recv = zc_flags }; - } - - pub fn prep_send_zc_fixed(sqe: *Sqe, fd: linux.fd_t, buffer: []const u8, flags: linux.MSG, zc_flags: uflags.SendRecv, buf_index: u16) void { - const zc_flags_fixed = blk: { - var updated_flags = zc_flags; - updated_flags.RECVSEND_FIXED_BUF = true; - break :blk updated_flags; - }; - prep_send_zc(sqe, fd, buffer, flags, zc_flags_fixed); - sqe.buf_index = buf_index; - } - - pub fn prep_sendmsg( - sqe: *Sqe, - fd: linux.fd_t, - msg: *const linux.msghdr_const, - flags: linux.MSG, - ) void { - sqe.prep_rw(.SENDMSG, fd, @intFromPtr(msg), 1, 0); - sqe.rw_flags = flags; - } - - pub fn prep_sendmsg_zc( - sqe: *Sqe, - fd: linux.fd_t, - msg: *const linux.msghdr_const, - flags: linux.MSG, - ) void { - prep_sendmsg(sqe, fd, msg, flags); - sqe.opcode = .SENDMSG_ZC; - } - - pub fn prep_openat( - sqe: *Sqe, - fd: linux.fd_t, - path: [*:0]const u8, - flags: linux.O, - mode: linux.mode_t, - ) void { - sqe.prep_rw(.OPENAT, fd, @intFromPtr(path), mode, 0); - sqe.rw_flags = @bitCast(flags); - } - - pub fn prep_openat_direct( - sqe: *Sqe, - fd: linux.fd_t, - path: [*:0]const u8, - flags: linux.O, - mode: linux.mode_t, - file_index: u32, - ) void { - prep_openat(sqe, fd, path, flags, mode); - set_target_fixed_file(sqe, file_index); - } - - pub fn prep_close(sqe: *Sqe, fd: linux.fd_t) void { - sqe.* = .{ - .opcode = .CLOSE, - .flags = .{}, - .ioprio = @bitCast(@as(u16, 0)), - .fd = fd, - .off = 0, - .addr = 0, - .len = 0, - .rw_flags = 0, - .user_data = 0, - .buf_index = 0, - .personality = 0, - .splice_fd_in = 0, - .addr3 = 0, - .resv = 0, - }; - } - - pub fn prep_close_direct(sqe: *Sqe, file_index: u32) void { - prep_close(sqe, 0); - set_target_fixed_file(sqe, file_index); - } - - pub fn prep_timeout( - sqe: *Sqe, - ts: *const linux.kernel_timespec, - count: u32, - flags: uflags.Timeout, - ) void { - sqe.prep_rw(.TIMEOUT, -1, @intFromPtr(ts), 1, count); - sqe.rw_flags = @bitCast(flags); - } - - pub fn prep_timeout_remove(sqe: *Sqe, 
timeout_user_data: u64, flags: uflags.Timeout) void { - sqe.* = .{ - .opcode = .TIMEOUT_REMOVE, - .flags = .{}, - .ioprio = @bitCast(@as(u16, 0)), - .fd = -1, - .off = 0, - .addr = timeout_user_data, - .len = 0, - .rw_flags = @bitCast(flags), - .user_data = 0, - .buf_index = 0, - .personality = 0, - .splice_fd_in = 0, - .addr3 = 0, - .resv = 0, - }; - } - - pub fn prep_link_timeout( - sqe: *Sqe, - ts: *const linux.kernel_timespec, - flags: uflags.Timeout, - ) void { - sqe.prep_rw(.LINK_TIMEOUT, -1, @intFromPtr(ts), 1, 0); - sqe.rw_flags = flags; - } - - pub fn prep_poll_add( - sqe: *Sqe, - fd: linux.fd_t, - poll_mask: linux.POLL, - ) void { - sqe.prep_rw(.POLL_ADD, fd, @intFromPtr(@as(?*anyopaque, null)), 0, 0); - // Poll masks previously used to comprise of 16 bits in the flags union of - // a SQE, but were then extended to comprise of 32 bits in order to make - // room for additional option flags. To ensure that the correct bits of - // poll masks are consistently and properly read across multiple kernel - // versions, poll masks are enforced to be little-endian. - // https://www.spinics.net/lists/io-uring/msg02848.html - sqe.rw_flags = std.mem.nativeToLittle(u32, poll_mask); - } - - pub fn prep_poll_remove( - sqe: *Sqe, - target_user_data: u64, - ) void { - sqe.prep_rw(.POLL_REMOVE, -1, target_user_data, 0, 0); - } - - pub fn prep_poll_update( - sqe: *Sqe, - old_user_data: u64, - new_user_data: u64, - poll_mask: linux.POLL, - flags: uflags.Poll, - ) void { - sqe.prep_rw(.POLL_REMOVE, -1, old_user_data, flags, new_user_data); - // Poll masks previously used to comprise of 16 bits in the flags union of - // a SQE, but were then extended to comprise of 32 bits in order to make - // room for additional option flags. To ensure that the correct bits of - // poll masks are consistently and properly read across multiple kernel - // versions, poll masks are enforced to be little-endian. 
- // https://www.spinics.net/lists/io-uring/msg02848.html - sqe.rw_flags = std.mem.nativeToLittle(u32, poll_mask); - } - - pub fn prep_fallocate( - sqe: *Sqe, - fd: linux.fd_t, - mode: i32, - offset: u64, - len: u64, - ) void { - sqe.* = .{ - .opcode = .FALLOCATE, - .flags = .{}, - .ioprio = @bitCast(@as(u16, 0)), - .fd = fd, - .off = offset, - .addr = len, - .len = @intCast(mode), - .rw_flags = 0, - .user_data = 0, - .buf_index = 0, - .personality = 0, - .splice_fd_in = 0, - .addr3 = 0, - .resv = 0, - }; - } - - pub fn prep_statx( - sqe: *Sqe, - fd: linux.fd_t, - path: [*:0]const u8, - flags: linux.AT, - mask: linux.StatxMask, - buf: *linux.Statx, - ) void { - sqe.prep_rw(.STATX, fd, @intFromPtr(path), @bitCast(mask), @intFromPtr(buf)); - sqe.rw_flags = flags; - } - - pub fn prep_cancel( - sqe: *Sqe, - cancel_user_data: u64, - flags: uflags.AsyncCancel, - ) void { - sqe.prep_rw(.ASYNC_CANCEL, -1, cancel_user_data, 0, 0); - sqe.rw_flags = @bitCast(flags); - } - - pub fn prep_cancel_fd( - sqe: *Sqe, - fd: linux.fd_t, - flags: uflags.AsyncCancel, - ) void { - sqe.prep_rw(.ASYNC_CANCEL, fd, 0, 0, 0); - const enable_cancel_fd = blk: { - var update_flags = flags; - update_flags.CANCEL_FD = true; - break :blk update_flags; - }; - sqe.rw_flags = @bitCast(enable_cancel_fd); - } - - pub fn prep_shutdown( - sqe: *Sqe, - sockfd: linux.socket_t, - how: linux.SHUT, - ) void { - sqe.prep_rw(.SHUTDOWN, sockfd, 0, how, 0); - } - - pub fn prep_renameat( - sqe: *Sqe, - old_dir_fd: linux.fd_t, - old_path: [*:0]const u8, - new_dir_fd: linux.fd_t, - new_path: [*:0]const u8, - flags: linux.RenameFlags, - ) void { - sqe.prep_rw( - .RENAMEAT, - old_dir_fd, - @intFromPtr(old_path), - 0, - @intFromPtr(new_path), - ); - sqe.len = @bitCast(new_dir_fd); - sqe.rw_flags = flags; - } - - pub fn prep_unlinkat( - sqe: *Sqe, - dir_fd: linux.fd_t, - path: [*:0]const u8, - flags: linux.AT, // TODO: unlink flags only AT_REMOVEDIR - ) void { - sqe.prep_rw(.UNLINKAT, dir_fd, @intFromPtr(path), 0, 0); - sqe.rw_flags = flags; - } - - pub fn prep_mkdirat( - sqe: *Sqe, - dir_fd: linux.fd_t, - path: [*:0]const u8, - mode: linux.mode_t, - ) void { - sqe.prep_rw(.MKDIRAT, dir_fd, @intFromPtr(path), mode, 0); - } - - pub fn prep_symlinkat( - sqe: *Sqe, - target: [*:0]const u8, - new_dir_fd: linux.fd_t, - link_path: [*:0]const u8, - ) void { - sqe.prep_rw( - .SYMLINKAT, - new_dir_fd, - @intFromPtr(target), - 0, - @intFromPtr(link_path), - ); - } - - pub fn prep_linkat( - sqe: *Sqe, - old_dir_fd: linux.fd_t, - old_path: [*:0]const u8, - new_dir_fd: linux.fd_t, - new_path: [*:0]const u8, - flags: linux.AT, // only AT_EMPTY_PATH, AT_SYMLINK_FOLLOW - ) void { - sqe.prep_rw( - .LINKAT, - old_dir_fd, - @intFromPtr(old_path), - 0, - @intFromPtr(new_path), - ); - sqe.len = @bitCast(new_dir_fd); - sqe.rw_flags = flags; - } - - pub fn prep_files_update( - sqe: *Sqe, - fds: []const linux.fd_t, - offset: u32, - ) void { - sqe.prep_rw(.FILES_UPDATE, -1, @intFromPtr(fds.ptr), fds.len, @intCast(offset)); - } - - pub fn prep_files_update_alloc( - sqe: *Sqe, - fds: []linux.fd_t, - ) void { - sqe.prep_rw(.FILES_UPDATE, -1, @intFromPtr(fds.ptr), fds.len, constants.FILE_INDEX_ALLOC); - } - - // TODO: why can't slice be used here ? 
- pub fn prep_provide_buffers( - sqe: *Sqe, - buffers: [*]u8, - buffer_len: usize, - num: usize, - group_id: usize, - buffer_id: usize, - ) void { - const ptr = @intFromPtr(buffers); - sqe.prep_rw(.PROVIDE_BUFFERS, @intCast(num), ptr, buffer_len, buffer_id); - sqe.buf_index = @intCast(group_id); - } - - pub fn prep_remove_buffers( - sqe: *Sqe, - num: usize, - group_id: usize, - ) void { - sqe.prep_rw(.REMOVE_BUFFERS, @intCast(num), 0, 0, 0); - sqe.buf_index = @intCast(group_id); - } - - pub fn prep_socket( - sqe: *Sqe, - domain: linux.AF, - socket_type: linux.SOCK, - protocol: u32, // Enumerate https://github.com/kraj/musl/blob/kraj/master/src/network/proto.c#L7 - flags: u32, // flags is unused - ) void { - sqe.prep_rw(.SOCKET, @intCast(domain), 0, protocol, socket_type); - sqe.rw_flags = flags; +/// Like submit(), but allows waiting for events as well. +/// Returns the number of SQEs submitted. +/// Matches the implementation of io_uring_submit_and_wait() in liburing. +pub fn submit_and_wait(self: *IoUring, wait_nr: u32) !u32 { + const submitted = self.flush_sq(); + var flags: uflags.Enter = .{}; + if (self.sq_ring_needs_enter(&flags) or wait_nr > 0) { + if (wait_nr > 0 or self.flags.IOPOLL) { + flags.GETEVENTS = true; + } + return try self.enter(submitted, wait_nr, flags); } + return submitted; +} - pub fn prep_socket_direct( - sqe: *Sqe, - domain: linux.AF, - socket_type: linux.SOCK, - protocol: u32, // Enumerate https://github.com/kraj/musl/blob/kraj/master/src/network/proto.c#L7 - flags: u32, // flags is unused - file_index: u32, - ) void { - prep_socket(sqe, domain, socket_type, protocol, flags); - set_target_fixed_file(sqe, file_index); +/// Tell the kernel we have submitted SQEs and/or want to wait for CQEs. +/// Returns the number of SQEs submitted. +pub fn enter(self: *IoUring, to_submit: u32, min_complete: u32, flags: uflags.Enter) !u32 { + assert(self.fd >= 0); + const res = linux.io_uring_enter(self.fd, to_submit, min_complete, flags, null); + switch (linux.errno(res)) { + .SUCCESS => {}, + // The kernel was unable to allocate memory or ran out of resources for the request. + // The application should wait for some completions and try again: + .AGAIN => return error.SystemResources, + // The SQE `fd` is invalid, or IOSQE_FIXED_FILE was set but no files were registered: + .BADF => return error.FileDescriptorInvalid, + // The file descriptor is valid, but the ring is not in the right state. + // See io_uring_register(2) for how to enable the ring. + .BADFD => return error.FileDescriptorInBadState, + // The application attempted to overcommit the number of requests it can have pending. + // The application should wait for some completions and try again: + .BUSY => return error.CompletionQueueOvercommitted, + // The SQE is invalid, or valid but the ring was setup with IORING_SETUP_IOPOLL: + .INVAL => return error.SubmissionQueueEntryInvalid, + // The buffer is outside the process' accessible address space, or IORING_OP_READ_FIXED + // or IORING_OP_WRITE_FIXED was specified but no buffers were registered, or the range + // described by `addr` and `len` is not within the buffer registered at `buf_index`: + .FAULT => return error.BufferInvalid, + .NXIO => return error.RingShuttingDown, + // The kernel believes our `self.fd` does not refer to an io_uring instance, + // or the opcode is valid but not supported by this kernel (more likely): + .OPNOTSUPP => return error.OpcodeNotSupported, + // The operation was interrupted by a delivery of a signal before it could complete. 
+ // This can happen while waiting for events with IORING_ENTER_GETEVENTS: + .INTR => return error.SignalInterrupt, + else => |errno| return posix.unexpectedErrno(errno), } + return @as(u32, @intCast(res)); +} - pub fn prep_socket_direct_alloc( - sqe: *Sqe, - domain: linux.AF, - socket_type: linux.SOCK, - protocol: u32, // Enumerate https://github.com/kraj/musl/blob/kraj/master/src/network/proto.c#L7 - flags: u32, // flags is unused - ) void { - prep_socket(sqe, domain, socket_type, protocol, flags); - set_target_fixed_file(sqe, constants.FILE_INDEX_ALLOC); +/// Sync internal state with kernel ring state on the SQ side. +/// Returns the number of all pending events in the SQ ring, for the shared ring. +/// This return value includes previously flushed SQEs, as per liburing. +/// The rationale is to suggest that an io_uring_enter() call is needed rather than not. +/// Matches the implementation of __io_uring_flush_sq() in liburing. +pub fn flush_sq(self: *IoUring) u32 { + if (self.sq.sqe_head != self.sq.sqe_tail) { + // Fill in SQEs that we have queued up, adding them to the kernel ring. + const to_submit = self.sq.sqe_tail -% self.sq.sqe_head; + var tail = self.sq.tail.*; + var i: usize = 0; + while (i < to_submit) : (i += 1) { + self.sq.array[tail & self.sq.mask] = self.sq.sqe_head & self.sq.mask; + tail +%= 1; + self.sq.sqe_head +%= 1; + } + // Ensure that the kernel can actually see the SQE updates when it sees the tail update. + @atomicStore(u32, self.sq.tail, tail, .release); } + return self.sq_ready(); +} - pub fn prep_waitid( - sqe: *Sqe, - id_type: linux.P, - id: i32, - infop: *linux.siginfo_t, - options: linux.W, - flags: u32, // flags is unused - ) void { - sqe.prep_rw(.WAITID, id, 0, @intFromEnum(id_type), @intFromPtr(infop)); - sqe.rw_flags = flags; - sqe.splice_fd_in = @bitCast(options); +/// Returns true if we are not using an SQ thread (thus nobody submits but us), +/// or if IORING_SQ_NEED_WAKEUP is set and the SQ thread must be explicitly awakened. +/// For the latter case, we set the SQ thread wakeup flag. +/// Matches the implementation of sq_ring_needs_enter() in liburing. +pub fn sq_ring_needs_enter(self: *IoUring, flags: *uflags.Enter) bool { + assert(flags.*.empty()); + if (!self.flags.SQPOLL) return true; + if (@atomicLoad(Sq.Flags, self.sq.flags, .unordered).NEED_WAKEUP) { + flags.*.SQ_WAKEUP = true; + return true; } + return false; +} - // TODO: maybe remove unused flag fields? - pub fn prep_bind( - sqe: *Sqe, - fd: linux.fd_t, - addr: *const linux.sockaddr, - addrlen: linux.socklen_t, - flags: u32, // flags is unused and does't exist in io_uring's api - ) void { - sqe.prep_rw(.BIND, fd, @intFromPtr(addr), 0, addrlen); - sqe.rw_flags = flags; - } +/// Returns the number of flushed and unflushed SQEs pending in the submission queue. +/// In other words, this is the number of SQEs in the submission queue, i.e. its length. +/// These are SQEs that the kernel is yet to consume. +/// Matches the implementation of io_uring_sq_ready in liburing. +pub fn sq_ready(self: *IoUring) u32 { + // Always use the shared ring state (i.e. head and not sqe_head) to avoid going out of sync, + // see https://github.com/axboe/liburing/issues/92. 
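+    // A hypothetical worked example of the wrapping subtraction below: if
+    // sqe_tail has wrapped around to 2 while the kernel-visible head is still
+    // 4294967295, then 2 -% 4294967295 == 3: the true number of pending SQEs.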
+ return self.sq.sqe_tail -% @atomicLoad(u32, self.sq.head, .acquire); +} - pub fn prep_listen( - sqe: *Sqe, - fd: linux.fd_t, - backlog: usize, - flags: u32, // flags is unused and does't exist in io_uring's api - ) void { - sqe.prep_rw(.LISTEN, fd, 0, backlog, 0); - sqe.rw_flags = flags; - } +/// Returns the number of CQEs in the completion queue, i.e. its length. +/// These are CQEs that the application is yet to consume. +/// Matches the implementation of io_uring_cq_ready in liburing. +pub fn cq_ready(self: *IoUring) u32 { + return @atomicLoad(u32, self.cq.tail, .acquire) -% self.cq.head.*; +} - pub fn prep_cmd_sock( - sqe: *Sqe, - cmd_op: SocketOp, - fd: linux.fd_t, - level: linux.SOL, - optname: linux.SO, - optval: u64, - optlen: u32, - ) void { - sqe.prep_rw(.URING_CMD, fd, 0, 0, 0); - // off is overloaded with cmd_op, https://github.com/axboe/liburing/blob/e1003e496e66f9b0ae06674869795edf772d5500/src/include/liburing/io_uring.h#L39 - sqe.off = @intFromEnum(cmd_op); - // addr is overloaded, https://github.com/axboe/liburing/blob/e1003e496e66f9b0ae06674869795edf772d5500/src/include/liburing/io_uring.h#L46 - sqe.addr = @bitCast(packed struct { - level: u32, - optname: u32, - }{ - .level = level, - .optname = optname, - }); - // splice_fd_in if overloaded u32 -> i32 - sqe.splice_fd_in = @bitCast(optlen); - // addr3 is overloaded, https://github.com/axboe/liburing/blob/e1003e496e66f9b0ae06674869795edf772d5500/src/include/liburing/io_uring.h#L102 - sqe.addr3 = optval; +/// Copies as many CQEs as are ready, and that can fit into the destination `cqes` slice. +/// If none are available, enters into the kernel to wait for at most `wait_nr` CQEs. +/// Returns the number of CQEs copied, advancing the CQ ring. +/// Provides all the wait/peek methods found in liburing, but with batching and a single method. +/// The rationale for copying CQEs rather than copying pointers is that pointers are 8 bytes +/// whereas CQEs are not much more at only 16 bytes, and this provides a safer faster interface. +/// Safer, because you no longer need to call cqe_seen(), avoiding idempotency bugs. +/// Faster, because we can now amortize the atomic store release to `cq.head` across the batch. +/// See https://github.com/axboe/liburing/issues/103#issuecomment-686665007. +/// Matches the implementation of io_uring_peek_batch_cqe() in liburing, but supports waiting. +pub fn copy_cqes(self: *IoUring, cqes: []Cqe, wait_nr: u32) !u32 { + const count = self.copy_cqes_ready(cqes); + if (count > 0) return count; + if (self.cq_ring_needs_flush() or wait_nr > 0) { + _ = try self.enter(0, wait_nr, .{ .GETEVENTS = true }); + return self.copy_cqes_ready(cqes); } + return 0; +} - pub fn set_flags(sqe: *Sqe, flags: Sqe.IoSqe) void { - const updated_flags = @as(u8, @bitCast(sqe.flags)) | @as(u8, @bitCast(flags)); - sqe.flags = @bitCast(updated_flags); - } +fn copy_cqes_ready(self: *IoUring, cqes: []Cqe) u32 { + const ready = self.cq_ready(); + const count = @min(cqes.len, ready); + const head = self.cq.head.* & self.cq.mask; - /// This SQE forms a link with the next SQE in the submission ring. Next SQE - /// will not be started before this one completes. Forms a chain of SQEs. 
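+    // `head` has been reduced to a ring index, while `count` may still span
+    // past the end of the CQE array, so the copy below is split in two.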
- pub fn link_next(sqe: *Sqe) void { - sqe.flags.IO_LINK = true; + // before wrapping + const n = @min(self.cq.cqes.len - head, count); + @memcpy(cqes[0..n], self.cq.cqes[head..][0..n]); + + if (count > n) { + // wrap self.cq.cqes + const w = count - n; + @memcpy(cqes[n..][0..w], self.cq.cqes[0..w]); } -}; -/// Filled with the offset for mmap(2) -/// matches io_sqring_offsets in liburing -pub const SqOffsets = extern struct { - /// offset of ring head - head: u32, - /// offset of ring tail - tail: u32, - /// ring mask value - ring_mask: u32, - /// entries in ring - ring_entries: u32, - /// ring flags - flags: u32, - /// number of sqes not submitted - dropped: u32, - /// sqe index array - array: u32, - resv1: u32, - user_addr: u64, -}; + self.cq_advance(count); + return count; +} -/// matches io_cqring_offsets in liburing -pub const CqOffsets = extern struct { - head: u32, - tail: u32, - ring_mask: u32, - ring_entries: u32, - overflow: u32, - cqes: u32, - flags: u32, - resv: u32, - user_addr: u64, -}; +/// Returns a copy of an I/O completion, waiting for it if necessary, and advancing the CQ ring. +/// A convenience method for `copy_cqes()` for when you don't need to batch or peek. +pub fn copy_cqe(ring: *IoUring) !Cqe { + var cqes: [1]Cqe = undefined; + while (true) { + const count = try ring.copy_cqes(&cqes, 1); + if (count > 0) return cqes[0]; + } +} -/// Passed in for io_uring_setup(2). Copied back with updated info on success -/// matches io_uring_params in liburing -pub const Params = extern struct { - sq_entries: u32, - cq_entries: u32, - flags: uflags.Setup, - sq_thread_cpu: u32, - sq_thread_idle: u32, - features: uflags.Features, - wq_fd: u32, - resv: [3]u32, - sq_off: SqOffsets, - cq_off: CqOffsets, -}; +/// Matches the implementation of cq_ring_needs_flush() in liburing. +pub fn cq_ring_needs_flush(self: *IoUring) bool { + return @atomicLoad(Sq.Flags, self.sq.flags, .unordered).CQ_OVERFLOW; +} -/// io_uring_register(2) opcodes and arguments -/// matches io_uring_register_op in liburing -pub const RegisterOp = enum(u8) { - REGISTER_BUFFERS, - UNREGISTER_BUFFERS, - REGISTER_FILES, - UNREGISTER_FILES, - REGISTER_EVENTFD, - UNREGISTER_EVENTFD, - REGISTER_FILES_UPDATE, - REGISTER_EVENTFD_ASYNC, - REGISTER_PROBE, - REGISTER_PERSONALITY, - UNREGISTER_PERSONALITY, - REGISTER_RESTRICTIONS, - REGISTER_ENABLE_RINGS, +/// For advanced use cases only that implement custom completion queue methods. +/// If you use copy_cqes() or copy_cqe() you must not call cqe_seen() or cq_advance(). +/// Must be called exactly once after a zero-copy CQE has been processed by your application. +/// Not idempotent, calling more than once will result in other CQEs being lost. +/// Matches the implementation of cqe_seen() in liburing. +pub fn cqe_seen(self: *IoUring, cqe: *Cqe) void { + _ = cqe; + self.cq_advance(1); +} - // extended with tagging - REGISTER_FILES2, - REGISTER_FILES_UPDATE2, - REGISTER_BUFFERS2, - REGISTER_BUFFERS_UPDATE, +/// For advanced use cases only that implement custom completion queue methods. +/// Matches the implementation of cq_advance() in liburing. +pub fn cq_advance(self: *IoUring, count: u32) void { + if (count > 0) { + // Ensure the kernel only sees the new head value after the CQEs have been read. + @atomicStore(u32, self.cq.head, self.cq.head.* +% count, .release); + } +} - // set/clear io-wq thread affinities - REGISTER_IOWQ_AFF, - UNREGISTER_IOWQ_AFF, +/// Queues (but does not submit) an SQE to perform an `fsync(2)`. 
+/// Returns a pointer to the SQE so that you can further modify the SQE for advanced use cases.
+/// For example, for `fdatasync()` you can set `IORING_FSYNC_DATASYNC` in the SQE's `rw_flags`.
+/// N.B. While SQEs are initiated in the order in which they appear in the submission queue,
+/// operations execute in parallel and completions are unordered. Therefore, an application that
+/// submits a write followed by an fsync in the submission queue cannot expect the fsync to
+/// apply to the write, since the fsync may complete before the write is issued to the disk.
+/// You should preferably use `link_with_next_sqe()` on a write's SQE to link it with an fsync,
+/// or else insert a full write barrier using `drain_previous_sqes()` when queueing an fsync.
+pub fn fsync(self: *IoUring, user_data: u64, fd: posix.fd_t, flags: uflags.Fsync) !*Sqe {
+    const sqe = try self.get_sqe();
+    sqe.prep_fsync(fd, flags);
+    sqe.user_data = user_data;
+    return sqe;
+}
+
+/// Queues (but does not submit) an SQE to perform a no-op.
+/// Returns a pointer to the SQE so that you can further modify the SQE for advanced use cases.
+/// A no-op is more useful than it may appear at first glance.
+/// For example, you could call `drain_previous_sqes()` on the returned SQE, to use the no-op to
+/// know when the ring is idle before acting on a kill signal.
+pub fn nop(self: *IoUring, user_data: u64) !*Sqe {
+    const sqe = try self.get_sqe();
+    sqe.prep_nop();
+    sqe.user_data = user_data;
+    return sqe;
+}
+
+/// Queues (but does not submit) an SQE to perform a `read(2)` or `preadv(2)` depending on the buffer type.
+/// * Reading into a `ReadBuffer.buffer` uses `read(2)`
+/// * Reading into a `ReadBuffer.iovecs` uses `preadv(2)`
+/// If you want to do a `preadv2(2)` then set `rw_flags` on the returned SQE. See https://man7.org/linux/man-pages/man2/preadv2.2.html
+///
+/// Returns a pointer to the SQE.
+pub fn read(
+    self: *IoUring,
+    user_data: u64,
+    fd: linux.fd_t,
+    buffer: ReadBuffer,
+    offset: u64,
+) !*Sqe {
+    const sqe = try self.get_sqe();
+    switch (buffer) {
+        .buffer => |slice| sqe.prep_read(fd, slice, offset),
+        .iovecs => |vecs| sqe.prep_readv(fd, vecs, offset),
+        .buffer_selection => |selection| {
+            sqe.prep_rw(.READ, fd, 0, selection.len, offset);
+            sqe.flags.BUFFER_SELECT = true;
+            sqe.buf_index = selection.group_id;
+        },
+    }
+    sqe.user_data = user_data;
+    return sqe;
+}
+
+/// Queues (but does not submit) an SQE to perform a `write(2)`.
+/// Returns a pointer to the SQE.
+pub fn write(
+    self: *IoUring,
+    user_data: u64,
+    fd: linux.fd_t,
+    buffer: []const u8,
+    offset: u64,
+) !*Sqe {
+    const sqe = try self.get_sqe();
+    sqe.prep_write(fd, buffer, offset);
+    sqe.user_data = user_data;
+    return sqe;
+}
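+// A minimal illustrative sketch of the write-then-fsync ordering advice above,
+// assuming the typed `IO_LINK` SQE flag bit introduced by this series (the
+// helper name and user_data values are assumptions for the example, not part
+// of this patch):
+fn writeThenFsyncSketch(ring: *IoUring, fd: posix.fd_t, data: []const u8) !void {
+    // Link the write to the following fsync: the fsync starts only after the
+    // write completes, restoring the ordering that submission alone does not give.
+    const write_sqe = try ring.write(0xAA, fd, data, 0);
+    write_sqe.flags.IO_LINK = true;
+    _ = try ring.fsync(0xBB, fd, .{});
+    // One enter() covers both SQEs; wait for both completions.
+    _ = try ring.submit_and_wait(2);
+}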
+/// Queues (but does not submit) an SQE to perform a `splice(2)`.
+/// Either `fd_in` or `fd_out` must be a pipe.
+/// If `fd_in` refers to a pipe, `off_in` is ignored and must be set to std.math.maxInt(u64).
+/// If `fd_in` does not refer to a pipe and `off_in` is maxInt(u64), then `len` bytes are read
+/// from `fd_in` starting from the file offset, which is incremented by the number of bytes read.
+/// If `fd_in` does not refer to a pipe and `off_in` is not maxInt(u64), then the starting offset of `fd_in` will be `off_in`.
+/// This splice operation can be used to implement sendfile by splicing to an intermediate pipe first,
+/// then splicing to the final destination. In fact, the implementation of sendfile in the kernel uses splice internally.
+///
+/// NOTE that even if fd_in or fd_out refers to a pipe, the splice operation can still fail with EINVAL if one of the
+/// fds doesn't explicitly support the splice operation, e.g. reading from a terminal is unsupported from kernel 5.7 to 5.11.
+/// See https://github.com/axboe/liburing/issues/291
+///
+/// Returns a pointer to the SQE so that you can further modify the SQE for advanced use cases.
+pub fn splice(self: *IoUring, user_data: u64, fd_in: posix.fd_t, off_in: u64, fd_out: posix.fd_t, off_out: u64, len: usize) !*Sqe {
+    const sqe = try self.get_sqe();
+    sqe.prep_splice(fd_in, off_in, fd_out, off_out, len);
+    sqe.user_data = user_data;
+    return sqe;
+}
+
+/// Queues (but does not submit) an SQE to perform an IORING_OP_READ_FIXED.
+/// The `buffer` provided must be registered with the kernel by calling `register_buffers` first.
+/// The `buffer_index` must be the same as its index in the array provided to `register_buffers`.
+///
+/// Returns a pointer to the SQE so that you can further modify the SQE for advanced use cases.
+pub fn read_fixed(
+    self: *IoUring,
+    user_data: u64,
+    fd: linux.fd_t,
+    buffer: *posix.iovec,
+    offset: u64,
+    buffer_index: u16,
+) !*Sqe {
+    const sqe = try self.get_sqe();
+    sqe.prep_read_fixed(fd, buffer, offset, buffer_index);
+    sqe.user_data = user_data;
+    return sqe;
+}
+
+/// Queues (but does not submit) an SQE to perform a `pwritev()`.
+/// Returns a pointer to the SQE so that you can further modify the SQE for advanced use cases.
+/// For example, if you want to do a `pwritev2()` then set `rw_flags` on the returned SQE.
+/// See https://linux.die.net/man/2/pwritev.
+pub fn writev(
+    self: *IoUring,
+    user_data: u64,
+    fd: linux.fd_t,
+    iovecs: []const posix.iovec_const,
+    offset: u64,
+) !*Sqe {
+    const sqe = try self.get_sqe();
+    sqe.prep_writev(fd, iovecs, offset);
+    sqe.user_data = user_data;
+    return sqe;
+}
+
+/// Queues (but does not submit) an SQE to perform an IORING_OP_WRITE_FIXED.
+/// The `buffer` provided must be registered with the kernel by calling `register_buffers` first.
+/// The `buffer_index` must be the same as its index in the array provided to `register_buffers`.
+///
+/// Returns a pointer to the SQE so that you can further modify the SQE for advanced use cases.
+pub fn write_fixed(
+    self: *IoUring,
+    user_data: u64,
+    fd: linux.fd_t,
+    buffer: *posix.iovec,
+    offset: u64,
+    buffer_index: u16,
+) !*Sqe {
+    const sqe = try self.get_sqe();
+    sqe.prep_write_fixed(fd, buffer, offset, buffer_index);
+    sqe.user_data = user_data;
+    return sqe;
+}
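+// An illustrative sendfile-style sketch of the splice note above, assuming a
+// pipe created with `pipe(2)` is passed in (the helper name and user_data
+// values are assumptions for the example, not part of this patch):
+fn sendfileSketch(ring: *IoUring, file_fd: posix.fd_t, sock_fd: posix.fd_t, pipe_fds: [2]posix.fd_t, len: usize) !void {
+    const no_offset = std.math.maxInt(u64);
+    // file -> pipe write end, reading `len` bytes from file offset 0.
+    const first = try ring.splice(0x01, file_fd, 0, pipe_fds[1], no_offset, len);
+    first.flags.IO_LINK = true; // start the second splice only after the first completes
+    // pipe read end -> socket.
+    _ = try ring.splice(0x02, pipe_fds[0], no_offset, sock_fd, no_offset, len);
+    _ = try ring.submit_and_wait(2);
+}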
+/// Queues (but does not submit) an SQE to perform an `accept4(2)` on a socket.
+/// Returns a pointer to the SQE.
+/// Available since 5.5
+pub fn accept(
+    self: *IoUring,
+    user_data: u64,
+    fd: linux.fd_t,
+    addr: ?*posix.sockaddr,
+    addrlen: ?*posix.socklen_t,
+    flags: u32,
+) !*Sqe {
+    const sqe = try self.get_sqe();
+    sqe.prep_accept(fd, addr, addrlen, flags);
+    sqe.user_data = user_data;
+    return sqe;
+}
+
+/// Queues a multishot accept on a socket.
+///
+/// The multishot variant allows an application to issue a single accept request,
+/// which will repeatedly trigger a CQE when a connection request comes in.
+/// While the IORING_CQE_F_MORE flag is set in the CQE flags, accept will
+/// generate further CQEs.
+///
+/// Available since 5.19
+pub fn accept_multishot(
+    self: *IoUring,
+    user_data: u64,
+    fd: linux.fd_t,
+    addr: ?*posix.sockaddr,
+    addrlen: ?*posix.socklen_t,
+    flags: u32,
+) !*Sqe {
+    const sqe = try self.get_sqe();
+    sqe.prep_multishot_accept(fd, addr, addrlen, flags);
+    sqe.user_data = user_data;
+    return sqe;
+}
+
+/// Queues an accept using direct (registered) file descriptors.
+///
+/// To use an accept direct variant, the application must first have registered
+/// a file table (with register_files). An unused table index will be
+/// dynamically chosen and returned in the CQE res field.
+///
+/// After creation, direct descriptors can be used by setting IOSQE_FIXED_FILE
+/// in the SQE flags member, and setting the SQE fd field to the direct
+/// descriptor value rather than the regular file descriptor.
+///
+/// Available since 5.19
+pub fn accept_direct(
+    self: *IoUring,
+    user_data: u64,
+    fd: linux.fd_t,
+    addr: ?*posix.sockaddr,
+    addrlen: ?*posix.socklen_t,
+    flags: u32,
+) !*Sqe {
+    const sqe = try self.get_sqe();
+    sqe.prep_accept_direct(fd, addr, addrlen, flags, constants.FILE_INDEX_ALLOC);
+    sqe.user_data = user_data;
+    return sqe;
+}
+
+/// Queues a multishot accept using direct (registered) file descriptors.
+/// Available since 5.19
+pub fn accept_multishot_direct(
+    self: *IoUring,
+    user_data: u64,
+    fd: linux.fd_t,
+    addr: ?*posix.sockaddr,
+    addrlen: ?*posix.socklen_t,
+    flags: u32,
+) !*Sqe {
+    const sqe = try self.get_sqe();
+    sqe.prep_multishot_accept_direct(fd, addr, addrlen, flags);
+    sqe.user_data = user_data;
+    return sqe;
+}
+
+/// Queues (but does not submit) an SQE to perform a `connect(2)` on a socket.
+/// Returns a pointer to the SQE.
+pub fn connect(
+    self: *IoUring,
+    user_data: u64,
+    fd: linux.fd_t,
+    addr: *const posix.sockaddr,
+    addrlen: posix.socklen_t,
+) !*Sqe {
+    const sqe = try self.get_sqe();
+    sqe.prep_connect(fd, addr, addrlen);
+    sqe.user_data = user_data;
+    return sqe;
+}
+
+/// Queues (but does not submit) an SQE to perform an `epoll_ctl(2)`.
+/// Returns a pointer to the SQE.
+pub fn epoll_ctl(
+    self: *IoUring,
+    user_data: u64,
+    epfd: linux.fd_t,
+    fd: linux.fd_t,
+    op: u32,
+    ev: ?*linux.epoll_event,
+) !*Sqe {
+    const sqe = try self.get_sqe();
+    sqe.prep_epoll_ctl(epfd, fd, op, ev);
+    sqe.user_data = user_data;
+    return sqe;
+}
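+// An illustrative sketch of re-arming multishot accept when IORING_CQE_F_MORE
+// is no longer set, assuming the typed `F_MORE` bit on `Cqe.flags` from this
+// series (helper name and user_data are assumptions for the example):
+fn acceptLoopSketch(ring: *IoUring, listen_fd: linux.fd_t) !void {
+    _ = try ring.accept_multishot(0x01, listen_fd, null, null, 0);
+    _ = try ring.submit();
+    while (true) {
+        const cqe = try ring.copy_cqe();
+        if (cqe.err() != .SUCCESS) return error.AcceptFailed;
+        // cqe.res holds the new connection's fd; hand it off here.
+        if (!cqe.flags.F_MORE) {
+            // The kernel stopped posting CQEs for this request; re-arm it.
+            _ = try ring.accept_multishot(0x01, listen_fd, null, null, 0);
+            _ = try ring.submit();
+        }
+    }
+}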
+/// Queues (but does not submit) an SQE to perform a `recv(2)`.
+/// Returns a pointer to the SQE.
+/// Available since 5.6
+pub fn recv(
+    self: *IoUring,
+    user_data: u64,
+    fd: linux.fd_t,
+    buffer: RecvBuffer,
+    flags: u32,
+) !*Sqe {
+    const sqe = try self.get_sqe();
+    switch (buffer) {
+        .buffer => |slice| sqe.prep_recv(fd, slice, flags),
+        .buffer_selection => |selection| {
+            sqe.prep_rw(.RECV, fd, 0, selection.len, 0);
+            sqe.rw_flags = flags;
+            sqe.flags.BUFFER_SELECT = true;
+            sqe.buf_index = selection.group_id;
+        },
+    }
+    sqe.user_data = user_data;
+    return sqe;
+}
+
+/// Queues (but does not submit) an SQE to perform a `send(2)`.
+/// Returns a pointer to the SQE.
+/// Available since 5.6
+pub fn send(
+    self: *IoUring,
+    user_data: u64,
+    fd: linux.fd_t,
+    buffer: []const u8,
+    flags: u32,
+) !*Sqe {
+    const sqe = try self.get_sqe();
+    sqe.prep_send(fd, buffer, flags);
+    sqe.user_data = user_data;
+    return sqe;
+}
+
+/// Queues (but does not submit) an SQE to perform an async zerocopy `send(2)`.
+///
+/// This operation will most likely produce two CQEs. The flags field of the
+/// first cqe will likely contain IORING_CQE_F_MORE, which means that there will
+/// be a second cqe with the user_data field set to the same value. The user
+/// must not modify the data buffer until the notification is posted. The first
+/// cqe follows the usual rules and so its res field will contain the number of
+/// bytes sent or a negative error code. The notification's res field will be
+/// set to zero and the flags field will contain IORING_CQE_F_NOTIF. The
+/// two-step model is needed because the kernel may hold on to buffers for a
+/// long time, e.g. waiting for a TCP ACK. Notifications are responsible for
+/// controlling the lifetime of the buffers. Even errored requests may generate
+/// a notification.
+///
+/// Available since 6.0
+pub fn send_zc(
+    self: *IoUring,
+    user_data: u64,
+    fd: linux.fd_t,
+    buffer: []const u8,
+    send_flags: u32,
+    zc_flags: u16,
+) !*Sqe {
+    const sqe = try self.get_sqe();
+    sqe.prep_send_zc(fd, buffer, send_flags, zc_flags);
+    sqe.user_data = user_data;
+    return sqe;
+}
+
+/// Queues (but does not submit) an SQE to perform an async zerocopy `send(2)`.
+/// Returns a pointer to the SQE.
+/// Available since 6.0
+pub fn send_zc_fixed(
+    self: *IoUring,
+    user_data: u64,
+    fd: linux.fd_t,
+    buffer: []const u8,
+    send_flags: u32,
+    zc_flags: u16,
+    buf_index: u16,
+) !*Sqe {
+    const sqe = try self.get_sqe();
+    sqe.prep_send_zc_fixed(fd, buffer, send_flags, zc_flags, buf_index);
+    sqe.user_data = user_data;
+    return sqe;
+}
+
+/// Queues (but does not submit) an SQE to perform a `recvmsg(2)`.
+/// Returns a pointer to the SQE.
+/// Available since 5.3
+pub fn recvmsg(
+    self: *IoUring,
+    user_data: u64,
+    fd: linux.fd_t,
+    msg: *linux.msghdr,
+    flags: u32,
+) !*Sqe {
+    const sqe = try self.get_sqe();
+    sqe.prep_recvmsg(fd, msg, flags);
+    sqe.user_data = user_data;
+    return sqe;
+}
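+// An illustrative sketch of the two-CQE zero-copy protocol described above,
+// assuming the typed `F_MORE`/`F_NOTIF` bits on `Cqe.flags` from this series
+// (helper name and user_data are assumptions for the example):
+fn sendZcSketch(ring: *IoUring, sock_fd: linux.fd_t, buffer: []const u8) !void {
+    _ = try ring.send_zc(0x7A, sock_fd, buffer, 0, 0);
+    _ = try ring.submit();
+    const first = try ring.copy_cqe(); // res: bytes sent, or a negative error code
+    if (first.flags.F_MORE) {
+        // `buffer` must stay untouched until this notification arrives.
+        const notif = try ring.copy_cqe();
+        assert(notif.flags.F_NOTIF and notif.user_data == first.user_data);
+    }
+}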
+/// Queues (but does not submit) an SQE to perform a `sendmsg(2)`.
+/// Returns a pointer to the SQE.
+/// Available since 5.3
+pub fn sendmsg(
+    self: *IoUring,
+    user_data: u64,
+    fd: linux.fd_t,
+    msg: *const linux.msghdr_const,
+    flags: u32,
+) !*Sqe {
+    const sqe = try self.get_sqe();
+    sqe.prep_sendmsg(fd, msg, flags);
+    sqe.user_data = user_data;
+    return sqe;
+}
+
+/// Queues (but does not submit) an SQE to perform an async zerocopy `sendmsg(2)`.
+/// Returns a pointer to the SQE.
+/// Available since 6.1
+pub fn sendmsg_zc(
+    self: *IoUring,
+    user_data: u64,
+    fd: linux.fd_t,
+    msg: *const linux.msghdr_const,
+    flags: u32,
+) !*Sqe {
+    const sqe = try self.get_sqe();
+    sqe.prep_sendmsg_zc(fd, msg, flags);
+    sqe.user_data = user_data;
+    return sqe;
+}
+
+/// Queues (but does not submit) an SQE to perform an `openat(2)`.
+/// Returns a pointer to the SQE.
+/// Available since 5.6.
+pub fn openat(
+    self: *IoUring,
+    user_data: u64,
+    fd: linux.fd_t,
+    path: [*:0]const u8,
+    flags: linux.O,
+    mode: posix.mode_t,
+) !*Sqe {
+    const sqe = try self.get_sqe();
+    sqe.prep_openat(fd, path, flags, mode);
+    sqe.user_data = user_data;
+    return sqe;
+}
+
+/// Queues an openat using direct (registered) file descriptors.
+///
+/// To use the openat direct variant, the application must first have registered
+/// a file table (with register_files). An unused table index will be
+/// dynamically chosen and returned in the CQE res field.
+///
+/// After creation, direct descriptors can be used by setting IOSQE_FIXED_FILE
+/// in the SQE flags member, and setting the SQE fd field to the direct
+/// descriptor value rather than the regular file descriptor.
+///
+/// Available since 5.15
+pub fn openat_direct(
+    self: *IoUring,
+    user_data: u64,
+    fd: linux.fd_t,
+    path: [*:0]const u8,
+    flags: linux.O,
+    mode: posix.mode_t,
+    file_index: u32,
+) !*Sqe {
+    const sqe = try self.get_sqe();
+    sqe.prep_openat_direct(fd, path, flags, mode, file_index);
+    sqe.user_data = user_data;
+    return sqe;
+}
+
+/// Queues (but does not submit) an SQE to perform a `close(2)`.
+/// Returns a pointer to the SQE.
+/// Available since 5.6.
+pub fn close(self: *IoUring, user_data: u64, fd: posix.fd_t) !*Sqe {
+    const sqe = try self.get_sqe();
+    sqe.prep_close(fd);
+    sqe.user_data = user_data;
+    return sqe;
+}
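+// An illustrative sketch of the direct-descriptor flow described above,
+// assuming a sparse file table has been registered and the typed `FIXED_FILE`
+// SQE flag bit from this series (file name, open flags default, helper name
+// and user_data are assumptions for the example):
+fn openDirectSketch(ring: *IoUring, buf: []u8) !void {
+    try ring.register_files_sparse(8);
+    // Let the kernel pick a free slot; the chosen index is returned in cqe.res.
+    _ = try ring.openat_direct(0x10, linux.AT.FDCWD, "data.bin", .{}, 0, constants.FILE_INDEX_ALLOC);
+    _ = try ring.submit_and_wait(1);
+    const cqe = try ring.copy_cqe();
+    if (cqe.err() != .SUCCESS) return error.OpenFailed;
+    // To use the direct descriptor, put the slot in `fd` and set FIXED_FILE.
+    const sqe = try ring.read(0x11, cqe.res, .{ .buffer = buf }, 0);
+    sqe.flags.FIXED_FILE = true;
+    _ = try ring.submit_and_wait(1);
+}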
+/// Queues a close of a registered file descriptor.
+/// Available since 5.15
+pub fn close_direct(self: *IoUring, user_data: u64, file_index: u32) !*Sqe {
+    const sqe = try self.get_sqe();
+    sqe.prep_close_direct(file_index);
+    sqe.user_data = user_data;
+    return sqe;
+}
+
+/// Queues (but does not submit) an SQE to register a timeout operation.
+/// Returns a pointer to the SQE.
+///
+/// The timeout will complete when either the timeout expires, or after the specified number of
+/// events complete (if `count` is greater than `0`).
+///
+/// `flags` may be `.{}` for a relative timeout, or have `ABS` set for an absolute timeout.
+///
+/// The completion event result will be `-ETIME` if the timeout completed through expiration,
+/// `0` if the timeout completed after the specified number of events, or `-ECANCELED` if the
+/// timeout was removed before it expired.
+///
+/// io_uring timeouts use the `CLOCK.MONOTONIC` clock source.
+pub fn timeout(
+    self: *IoUring,
+    user_data: u64,
+    ts: *const linux.kernel_timespec,
+    count: u32,
+    flags: uflags.Timeout,
+) !*Sqe {
+    const sqe = try self.get_sqe();
+    sqe.prep_timeout(ts, count, flags);
+    sqe.user_data = user_data;
+    return sqe;
+}
+
+/// Queues (but does not submit) an SQE to remove an existing timeout operation.
+/// Returns a pointer to the SQE.
+///
+/// The timeout is identified by its `user_data`.
+///
+/// The completion event result will be `0` if the timeout was found and canceled successfully,
+/// `-EBUSY` if the timeout was found but expiration was already in progress, or
+/// `-ENOENT` if the timeout was not found.
+pub fn timeout_remove(
+    self: *IoUring,
+    user_data: u64,
+    timeout_user_data: u64,
+    flags: uflags.Timeout,
+) !*Sqe {
+    const sqe = try self.get_sqe();
+    sqe.prep_timeout_remove(timeout_user_data, flags);
+    sqe.user_data = user_data;
+    return sqe;
+}
+
+/// Queues (but does not submit) an SQE to add a link timeout operation.
+/// Returns a pointer to the SQE.
+///
+/// You need to set linux.IOSQE_IO_LINK in the flags of the target operation
+/// and then call this method right after the target operation.
+/// See https://lwn.net/Articles/803932/ for details.
+///
+/// If the dependent request finishes before the linked timeout, the timeout
+/// is canceled. If the timeout finishes before the dependent request, the
+/// dependent request will be canceled.
+///
+/// The completion event result of the link_timeout will be
+/// `-ETIME` if the timeout finishes before the dependent request
+/// (in this case, the completion event result of the dependent request will
+/// be `-ECANCELED`), or
+/// `-EALREADY` if the dependent request finishes before the linked timeout.
+pub fn link_timeout(
+    self: *IoUring,
+    user_data: u64,
+    ts: *const linux.kernel_timespec,
+    flags: u32,
+) !*Sqe {
+    const sqe = try self.get_sqe();
+    sqe.prep_link_timeout(ts, flags);
+    sqe.user_data = user_data;
+    return sqe;
+}
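+// An illustrative sketch of a read guarded by a linked timeout, as described
+// above, assuming the typed `IO_LINK` SQE flag bit (helper name and user_data
+// are assumptions for the example):
+fn readWithTimeoutSketch(ring: *IoUring, fd: linux.fd_t, buf: []u8) !void {
+    const ts: linux.kernel_timespec = .{ .sec = 1, .nsec = 0 };
+    const read_sqe = try ring.read(0x20, fd, .{ .buffer = buf }, 0);
+    read_sqe.flags.IO_LINK = true; // the next SQE is the timeout for this read
+    _ = try ring.link_timeout(0x21, &ts, 0);
+    // If the timeout fires first, the read completes with -ECANCELED and the
+    // timeout with -ETIME; otherwise the timeout completes with -EALREADY.
+    _ = try ring.submit_and_wait(2);
+}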
+/// Queues (but does not submit) an SQE to perform a `poll(2)`.
+/// Returns a pointer to the SQE.
+pub fn poll_add(
+    self: *IoUring,
+    user_data: u64,
+    fd: linux.fd_t,
+    poll_mask: u32,
+) !*Sqe {
+    const sqe = try self.get_sqe();
+    sqe.prep_poll_add(fd, poll_mask);
+    sqe.user_data = user_data;
+    return sqe;
+}
+
+/// Queues (but does not submit) an SQE to remove an existing poll operation.
+/// Returns a pointer to the SQE.
+pub fn poll_remove(
+    self: *IoUring,
+    user_data: u64,
+    target_user_data: u64,
+) !*Sqe {
+    const sqe = try self.get_sqe();
+    sqe.prep_poll_remove(target_user_data);
+    sqe.user_data = user_data;
+    return sqe;
+}
+
+/// Queues (but does not submit) an SQE to update the user data of an existing poll
+/// operation. Returns a pointer to the SQE.
+pub fn poll_update(
+    self: *IoUring,
+    user_data: u64,
+    old_user_data: u64,
+    new_user_data: u64,
+    poll_mask: u32,
+    flags: u32,
+) !*Sqe {
+    const sqe = try self.get_sqe();
+    sqe.prep_poll_update(old_user_data, new_user_data, poll_mask, flags);
+    sqe.user_data = user_data;
+    return sqe;
+}
+
+/// Queues (but does not submit) an SQE to perform an `fallocate(2)`.
+/// Returns a pointer to the SQE.
+pub fn fallocate(
+    self: *IoUring,
+    user_data: u64,
+    fd: linux.fd_t,
+    mode: i32,
+    offset: u64,
+    len: u64,
+) !*Sqe {
+    const sqe = try self.get_sqe();
+    sqe.prep_fallocate(fd, mode, offset, len);
+    sqe.user_data = user_data;
+    return sqe;
+}
+
+/// Queues (but does not submit) an SQE to perform a `statx(2)`.
+/// Returns a pointer to the SQE.
+pub fn statx(
+    self: *IoUring,
+    user_data: u64,
+    fd: linux.fd_t,
+    path: [:0]const u8,
+    flags: u32,
+    mask: linux.Statx.Mask,
+    buf: *linux.Statx,
+) !*Sqe {
+    const sqe = try self.get_sqe();
+    sqe.prep_statx(fd, path, flags, mask, buf);
+    sqe.user_data = user_data;
+    return sqe;
+}
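+// An illustrative async statx sketch against the function above, assuming
+// `linux.Statx.Mask` exposes a `SIZE` bit mirroring the kernel's STATX_SIZE
+// (path, helper name and user_data are assumptions for the example):
+fn statxSketch(ring: *IoUring) !u64 {
+    var stx: linux.Statx = undefined;
+    _ = try ring.statx(0x30, linux.AT.FDCWD, "data.bin", 0, .{ .SIZE = true }, &stx);
+    _ = try ring.submit_and_wait(1);
+    const cqe = try ring.copy_cqe();
+    if (cqe.err() != .SUCCESS) return error.StatxFailed;
+    return stx.size; // valid because STATX_SIZE was requested
+}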
+/// Queues (but does not submit) an SQE to remove an existing operation.
+/// Returns a pointer to the SQE.
+///
+/// The operation is identified by its `user_data`.
+///
+/// The completion event result will be `0` if the operation was found and canceled successfully,
+/// `-EALREADY` if the operation was found but was already in progress, or
+/// `-ENOENT` if the operation was not found.
+pub fn cancel(
+    self: *IoUring,
+    user_data: u64,
+    cancel_user_data: u64,
+    flags: u32,
+) !*Sqe {
+    const sqe = try self.get_sqe();
+    sqe.prep_cancel(cancel_user_data, flags);
+    sqe.user_data = user_data;
+    return sqe;
+}
+
+/// Queues (but does not submit) an SQE to perform a `shutdown(2)`.
+/// Returns a pointer to the SQE.
+pub fn shutdown(
+    self: *IoUring,
+    user_data: u64,
+    sockfd: posix.socket_t,
+    how: u32,
+) !*Sqe {
+    const sqe = try self.get_sqe();
+    sqe.prep_shutdown(sockfd, how);
+    sqe.user_data = user_data;
+    return sqe;
+}
+
+/// Queues (but does not submit) an SQE to perform a `renameat2(2)`.
+/// Returns a pointer to the SQE.
+pub fn renameat(
+    self: *IoUring,
+    user_data: u64,
+    old_dir_fd: linux.fd_t,
+    old_path: [*:0]const u8,
+    new_dir_fd: linux.fd_t,
+    new_path: [*:0]const u8,
+    flags: linux.RenameFlags,
+) !*Sqe {
+    const sqe = try self.get_sqe();
+    sqe.prep_renameat(old_dir_fd, old_path, new_dir_fd, new_path, flags);
+    sqe.user_data = user_data;
+    return sqe;
+}
+
+/// Queues (but does not submit) an SQE to perform an `unlinkat(2)`.
+/// Returns a pointer to the SQE.
+pub fn unlinkat(
+    self: *IoUring,
+    user_data: u64,
+    dir_fd: linux.fd_t,
+    path: [*:0]const u8,
+    flags: u32,
+) !*Sqe {
+    const sqe = try self.get_sqe();
+    sqe.prep_unlinkat(dir_fd, path, flags);
+    sqe.user_data = user_data;
+    return sqe;
+}
+
+/// Queues (but does not submit) an SQE to perform a `mkdirat(2)`.
+/// Returns a pointer to the SQE.
+pub fn mkdirat(
+    self: *IoUring,
+    user_data: u64,
+    dir_fd: linux.fd_t,
+    path: [*:0]const u8,
+    mode: posix.mode_t,
+) !*Sqe {
+    const sqe = try self.get_sqe();
+    sqe.prep_mkdirat(dir_fd, path, mode);
+    sqe.user_data = user_data;
+    return sqe;
+}
+
+/// Queues (but does not submit) an SQE to perform a `symlinkat(2)`.
+/// Returns a pointer to the SQE.
+pub fn symlinkat( + self: *IoUring, + user_data: u64, + target: [*:0]const u8, + new_dir_fd: linux.fd_t, + link_path: [*:0]const u8, +) !*Sqe { + const sqe = try self.get_sqe(); + sqe.prep_symlinkat(target, new_dir_fd, link_path); + sqe.user_data = user_data; + return sqe; +} -/// argument for IORING_REGISTER_PBUF_STATUS -/// matches io_uring_buf_status in liburing -pub const BufferStatus = extern struct { - /// input - buf_group: u32, - /// output - head: u32, - resv: [8]u32, -}; +/// Queues (but does not submit) an SQE to perform a `linkat(2)`. +/// Returns a pointer to the SQE. +pub fn linkat( + self: *IoUring, + user_data: u64, + old_dir_fd: linux.fd_t, + old_path: [*:0]const u8, + new_dir_fd: linux.fd_t, + new_path: [*:0]const u8, + flags: u32, +) !*Sqe { + const sqe = try self.get_sqe(); + sqe.prep_linkat(old_dir_fd, old_path, new_dir_fd, new_path, flags); + sqe.user_data = user_data; + return sqe; +} -/// argument for IORING_(UN)REGISTER_NAPI -/// matches io_uring_napi in liburing -pub const Napi = extern struct { - busy_poll_to: u32, - prefer_busy_poll: u8, - pad: [3]u8, - resv: u64, -}; +/// Queues (but does not submit) an SQE to provide a group of buffers used for commands that read/receive data. +/// Returns a pointer to the SQE. +/// +/// Provided buffers can be used in `read`, `recv` or `recvmsg` commands via .buffer_selection. +/// +/// The kernel expects a contiguous block of memory of size (buffers_count * buffer_size). +pub fn provide_buffers( + self: *IoUring, + user_data: u64, + buffers: [*]u8, + buffer_size: usize, + buffers_count: usize, + group_id: usize, + buffer_id: usize, +) !*Sqe { + const sqe = try self.get_sqe(); + sqe.prep_provide_buffers(buffers, buffer_size, buffers_count, group_id, buffer_id); + sqe.user_data = user_data; + return sqe; +} -// COMMIT: new struct type -/// Argument for io_uring_enter(2) with -/// IORING_GETEVENTS | IORING_ENTER_EXT_ARG_REG set, where the actual argument -/// is an index into a previously registered fixed wait region described by -/// the below structure. -/// matches io_uring_reg_wait in liburing -pub const RegisterWait = extern struct { - ts: linux.kernel_timespec, - min_wait_usec: u32, - flags: Flags, - sigmask: u64, - sigmask_sz: u32, - pad: [3]u32, - pad2: [2]u64, +/// Queues (but does not submit) an SQE to remove a group of provided buffers. +/// Returns a pointer to the SQE. +pub fn remove_buffers( + self: *IoUring, + user_data: u64, + buffers_count: usize, + group_id: usize, +) !*Sqe { + const sqe = try self.get_sqe(); + sqe.prep_remove_buffers(buffers_count, group_id); + sqe.user_data = user_data; + return sqe; +} - // COMMIT: new constant - pub const Flags = packed struct(u32) { - REG_WAIT_TS: bool = false, - _: u31 = 0, - }; -}; +/// Queues (but does not submit) an SQE to perform a `waitid(2)`. +/// Returns a pointer to the SQE. +pub fn waitid( + self: *IoUring, + user_data: u64, + id_type: linux.P, + id: i32, + infop: *linux.siginfo_t, + options: linux.W, + flags: u32, +) !*Sqe { + const sqe = try self.get_sqe(); + sqe.prep_waitid(id_type, id, infop, options, flags); + sqe.user_data = user_data; + return sqe; +} -/// Argument for io_uring_enter(2) with IORING_GETEVENTS | IORING_ENTER_EXT_ARG -/// matches io_uring_getevents_arg in liburing -pub const GetEventsArg = extern struct { - sigmask: u64, - sigmask_sz: u32, - pad: u32, - ts: u64, -}; +/// Registers an array of file descriptors. 
+/// Every time a file descriptor is put in an SQE and submitted to the kernel, the kernel must
+/// retrieve a reference to the file, and once I/O has completed the file reference must be
+/// dropped. The atomic nature of this file reference can be a slowdown for high IOPS workloads.
+/// This slowdown can be avoided by pre-registering file descriptors.
+/// To refer to a registered file descriptor, IOSQE_FIXED_FILE must be set in the SQE's flags,
+/// and the SQE's fd must be set to the index of the file descriptor in the registered array.
+/// Registering file descriptors will wait for the ring to idle.
+/// Files are automatically unregistered by the kernel when the ring is torn down.
+/// An application need only unregister if it wants to register a new array of file descriptors.
+pub fn register_files(self: *IoUring, fds: []const linux.fd_t) !void {
+    assert(self.fd >= 0);
+    const res = linux.io_uring_register(
+        self.fd,
+        .REGISTER_FILES,
+        @as(*const anyopaque, @ptrCast(fds.ptr)),
+        @as(u32, @intCast(fds.len)),
+    );
+    try handle_registration_result(res);
+}
+
+/// Updates registered file descriptors.
+///
+/// Updates are applied starting at the provided offset in the original file descriptors slice.
+/// There are three kinds of updates:
+/// * turning a sparse entry (where the fd is -1) into a real one
+/// * removing an existing entry (set the fd to -1)
+/// * replacing an existing entry with a new fd
+/// Adding new file descriptors must be done with `register_files`.
+pub fn register_files_update(self: *IoUring, offset: u32, fds: []const linux.fd_t) !void {
+    assert(self.fd >= 0);
+
+    const FilesUpdate = extern struct {
+        offset: u32,
+        resv: u32,
+        fds: u64 align(8),
+    };
+    const update: FilesUpdate = .{
+        .offset = offset,
+        .resv = @as(u32, 0),
+        .fds = @as(u64, @intFromPtr(fds.ptr)),
+    };
+
+    const res = linux.io_uring_register(
+        self.fd,
+        .REGISTER_FILES_UPDATE,
+        @as(*const anyopaque, @ptrCast(&update)),
+        @as(u32, @intCast(fds.len)),
+    );
+    try handle_registration_result(res);
+}
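+// An illustrative sketch of registering fds and then flipping one entry, per
+// the update rules above (the helper name is an assumption for the example):
+fn registerFilesSketch(ring: *IoUring, a: linux.fd_t, b: linux.fd_t) !void {
+    try ring.register_files(&.{ a, b });
+    // Turn slot 1 into a sparse entry; a later update can fill it again.
+    try ring.register_files_update(1, &.{-1});
+}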
+/// Registers an empty (-1) file table of `nr_files` file descriptors.
+pub fn register_files_sparse(self: *IoUring, nr_files: u32) !void {
+    assert(self.fd >= 0);
+
+    const reg: RsrcRegister = .{
+        .nr = nr_files,
+        .flags = constants.RSRC_REGISTER_SPARSE,
+        .resv2 = 0,
+        .data = 0,
+        .tags = 0,
+    };
+
+    const res = linux.io_uring_register(
+        self.fd,
+        .REGISTER_FILES2,
+        @ptrCast(&reg),
+        @as(u32, @sizeOf(RsrcRegister)),
+    );
+
+    return handle_registration_result(res);
+}
+
+/// Registers a range for fixed file allocations.
+/// Available since 6.0
+pub fn register_file_alloc_range(self: *IoUring, offset: u32, len: u32) !void {
+    assert(self.fd >= 0);
+
+    const range: FileIndexRange = .{
+        .off = offset,
+        .len = len,
+        .resv = 0,
+    };
+
+    const res = linux.io_uring_register(
+        self.fd,
+        .REGISTER_FILE_ALLOC_RANGE,
+        @ptrCast(&range),
+        @as(u32, @sizeOf(FileIndexRange)),
+    );
+
+    return handle_registration_result(res);
+}
+
+/// Registers the file descriptor for an eventfd that will be notified of completion events on
+/// an io_uring instance.
+/// Only a single eventfd can be registered at any given point in time.
+pub fn register_eventfd(self: *IoUring, fd: linux.fd_t) !void {
+    assert(self.fd >= 0);
+    const res = linux.io_uring_register(
+        self.fd,
+        .REGISTER_EVENTFD,
+        @as(*const anyopaque, @ptrCast(&fd)),
+        1,
+    );
+    try handle_registration_result(res);
+}
+
+/// Registers the file descriptor for an eventfd that will be notified of completion events on
+/// an io_uring instance. Notifications are only posted for events that complete in an async manner.
+/// This means that events that complete inline while being submitted do not trigger a notification event.
+/// Only a single eventfd can be registered at any given point in time.
+pub fn register_eventfd_async(self: *IoUring, fd: linux.fd_t) !void { + assert(self.fd >= 0); + const res = linux.io_uring_register( + self.fd, + .REGISTER_EVENTFD_ASYNC, + @as(*const anyopaque, @ptrCast(&fd)), + 1, + ); + try handle_registration_result(res); +} -// COMMIT: OP to IoUring -pub const Op = enum(u8) { - NOP, - READV, - WRITEV, - FSYNC, - READ_FIXED, - WRITE_FIXED, - POLL_ADD, - POLL_REMOVE, - SYNC_FILE_RANGE, - SENDMSG, - RECVMSG, - TIMEOUT, - TIMEOUT_REMOVE, - ACCEPT, - ASYNC_CANCEL, - LINK_TIMEOUT, - CONNECT, - FALLOCATE, - OPENAT, - CLOSE, - FILES_UPDATE, - STATX, - READ, - WRITE, - FADVISE, - MADVISE, - SEND, - RECV, - EPOLL_CTL, - OPENAT2, - SPLICE, - PROVIDE_BUFFERS, - REMOVE_BUFFERS, - TEE, - SHUTDOWN, - RENAMEAT, - UNLINKAT, - MKDIRAT, - SYMLINKAT, - LINKAT, - MSG_RING, - FSETXATTR, - SETXATTR, - FGETXATTR, - GETXATTR, - SOCKET, - URING_CMD, - SEND_ZC, - SENDMSG_ZC, - READ_MULTISHOT, - WAITID, - FUTEX_WAIT, - FUTEX_WAKE, - FUTEX_WAITV, - FIXED_FD_INSTALL, - FTRUNCATE, - BIND, - LISTEN, - RECV_ZC, - // COMMIT: new OPs - // TODO: to be implemented - EPOLL_WAIT, - READV_FIXED, - WRITEV_FIXED, - PIPE, +/// Unregister the registered eventfd file descriptor. +pub fn unregister_eventfd(self: *IoUring) !void { + assert(self.fd >= 0); + const res = linux.io_uring_register( + self.fd, + .UNREGISTER_EVENTFD, + null, + 0, + ); + try handle_registration_result(res); +} - _, -}; +pub fn register_napi(self: *IoUring, napi: *Napi) !void { + assert(self.fd >= 0); + const res = linux.io_uring_register(self.fd, .REGISTER_NAPI, napi, 1); + try handle_registration_result(res); +} -/// A friendly way to setup an io_uring, with default linux.io_uring_params. -/// `entries` must be a power of two between 1 and 32768, although the kernel will make the final -/// call on how many entries the submission and completion queues will ultimately have, -/// see https://github.com/torvalds/linux/blob/v5.8/fs/io_uring.c#L8027-L8050. -/// Matches the interface of io_uring_queue_init() in liburing. -pub fn init(entries: u16, flags: uflags.Setup) !IoUring { - var params = mem.zeroInit(Params, .{ - .flags = flags, - .sq_thread_idle = 1000, - }); - return try .init_params(entries, ¶ms); +pub fn unregister_napi(self: *IoUring, napi: *Napi) !void { + assert(self.fd >= 0); + const res = linux.io_uring_register(self.fd, .UNREGISTER_NAPI, napi, 1); + try handle_registration_result(res); } -/// A powerful way to setup an io_uring, if you want to tweak linux.io_uring_params such as submission -/// queue thread cpu affinity or thread idle timeout (the kernel and our default is 1 second). -/// `params` is passed by reference because the kernel needs to modify the parameters. -/// Matches the interface of io_uring_queue_init_params() in liburing. -pub fn init_params(entries: u16, p: *Params) !IoUring { - if (entries == 0) return error.EntriesZero; - if (!std.math.isPowerOfTwo(entries)) return error.EntriesNotPowerOfTwo; - assert(p.sq_entries == 0); - assert(@as(u32, @bitCast(p.features)) == 0); - assert(p.resv[0] == 0); - assert(p.resv[1] == 0); - assert(p.resv[2] == 0); +/// Registers an array of buffers for use with `read_fixed` and `write_fixed`. 
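+/// For example (illustrative sketch registering a single fixed buffer):
+///
+///     var buf: [4096]u8 = undefined;
+///     var iovecs = [_]posix.iovec{.{ .base = &buf, .len = buf.len }};
+///     try ring.register_buffers(&iovecs);
+///     // `buf` is now usable with `read_fixed`/`write_fixed` at buffer_index 0.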
+pub fn register_buffers(self: *IoUring, buffers: []const posix.iovec) !void { + assert(self.fd >= 0); + const res = linux.io_uring_register( + self.fd, + .REGISTER_BUFFERS, + buffers.ptr, + @as(u32, @intCast(buffers.len)), + ); + try handle_registration_result(res); +} - const flags: uflags.Setup = @bitCast(p.flags); - assert(p.cq_entries == 0 or flags.CQSIZE); - assert(p.wq_fd == 0 or flags.ATTACH_WQ); +/// Unregister the registered buffers. +pub fn unregister_buffers(self: *IoUring) !void { + assert(self.fd >= 0); + const res = linux.io_uring_register(self.fd, .UNREGISTER_BUFFERS, null, 0); + switch (linux.errno(res)) { + .SUCCESS => {}, + .NXIO => return error.BuffersNotRegistered, + else => |errno| return posix.unexpectedErrno(errno), + } +} - // flags compatibility - assert(flags.SQPOLL and !(flags.COOP_TASKRUN or flags.TASKRUN_FLAG or flags.DEFER_TASKRUN)); - assert(flags.SQ_AFF and flags.SQPOLL); - assert(flags.DEFER_TASKRUN and flags.SINGLE_ISSUER); +/// Returns a Probe which is used to probe the capabilities of the +/// io_uring subsystem of the running kernel. The Probe contains the +/// list of supported operations. +pub fn get_probe(self: *IoUring) !Probe { + var probe = mem.zeroInit(Probe, .{}); + const res = linux.io_uring_register(self.fd, .REGISTER_PROBE, &probe, probe.ops.len); + try handle_register_buf_ring_result(res); + return probe; +} - const res = linux.io_uring_setup(entries, p); +fn handle_registration_result(res: usize) !void { switch (linux.errno(res)) { .SUCCESS => {}, - .FAULT => return error.ParamsOutsideAccessibleAddressSpace, - // The resv array contains non-zero data, p.flags contains an unsupported flag, - // entries out of bounds, IORING_SETUP_SQ_AFF was specified without IORING_SETUP_SQPOLL, - // or IORING_SETUP_CQSIZE was specified but linux.io_uring_params.cq_entries was invalid: - .INVAL => return error.ArgumentsInvalid, - .MFILE => return error.ProcessFdQuotaExceeded, - .NFILE => return error.SystemFdQuotaExceeded, + // One or more fds in the array are invalid, or the kernel does not support sparse sets: + .BADF => return error.FileDescriptorInvalid, + .BUSY => return error.FilesAlreadyRegistered, + .INVAL => return error.FilesEmpty, + // Adding `nr_args` file references would exceed the maximum allowed number of files the + // user is allowed to have according to the per-user RLIMIT_NOFILE resource limit and + // the CAP_SYS_RESOURCE capability is not set, or `nr_args` exceeds the maximum allowed + // for a fixed file set (older kernels have a limit of 1024 files vs 64K files): + .MFILE => return error.UserFdQuotaExceeded, + // Insufficient kernel resources, or the caller had a non-zero RLIMIT_MEMLOCK soft + // resource limit but tried to lock more memory than the limit permitted (not enforced + // when the process is privileged with CAP_IPC_LOCK): .NOMEM => return error.SystemResources, - // IORING_SETUP_SQPOLL was specified but effective user ID lacks sufficient privileges, - // or a container seccomp policy prohibits io_uring syscalls: - .PERM => return error.PermissionDenied, - .NOSYS => return error.SystemOutdated, + // Attempt to register files on a ring already registering files or being torn down: + .NXIO => return error.RingShuttingDownOrAlreadyRegisteringFiles, else => |errno| return posix.unexpectedErrno(errno), } - const fd = @as(linux.fd_t, @intCast(res)); - assert(fd >= 0); - errdefer posix.close(fd); +} - const features: uflags.Features = @bitCast(p.features); - // Kernel versions 5.4 and up use only one mmap() for the submission and 
completion queues.
-    // This is not an optional feature for us... if the kernel does it, we have to do it.
-    // The thinking on this by the kernel developers was that both the submission and the
-    // completion queue rings have sizes just over a power of two, but the submission queue ring
-    // is significantly smaller with u32 slots. By bundling both in a single mmap, the kernel
-    // gets the submission queue ring for free.
-    // See https://patchwork.kernel.org/patch/11115257 for the kernel patch.
-    // We do not support the double mmap() done before 5.4, because we want to keep the
-    // init/deinit mmap paths simple and because io_uring has had many bug fixes even since 5.4.
-    if (!features.SINGLE_MMAP) {
-        return error.SystemOutdated;
-    }
+/// Unregisters all registered file descriptors previously associated with the ring.
+pub fn unregister_files(self: *IoUring) !void {
+    assert(self.fd >= 0);
+    const res = linux.io_uring_register(self.fd, .UNREGISTER_FILES, null, 0);
+    switch (linux.errno(res)) {
+        .SUCCESS => {},
+        .NXIO => return error.FilesNotRegistered,
+        else => |errno| return posix.unexpectedErrno(errno),
+    }
+}
+
+/// Prepares a socket creation request.
+/// New socket fd will be returned in completion result.
+/// Available since 5.19
+pub fn socket(
+    self: *IoUring,
+    user_data: u64,
+    domain: linux.AF,
+    socket_type: linux.SOCK,
+    protocol: u32,
+    flags: u32,
+) !*Sqe {
+    const sqe = try self.get_sqe();
+    sqe.prep_socket(domain, socket_type, protocol, flags);
+    sqe.user_data = user_data;
+    return sqe;
+}
-    // Check that the kernel has actually set params and that "impossible is nothing".
-    assert(p.sq_entries != 0);
-    assert(p.cq_entries != 0);
-    assert(p.cq_entries >= p.sq_entries);
+/// Prepares a socket creation request for a registered file at index `file_index`.
+/// Available since 5.19
+pub fn socket_direct(
+    self: *IoUring,
+    user_data: u64,
+    domain: u32,
+    socket_type: u32,
+    protocol: u32,
+    flags: u32,
+    file_index: u32,
+) !*Sqe {
+    const sqe = try self.get_sqe();
+    sqe.prep_socket_direct(domain, socket_type, protocol, flags, file_index);
+    sqe.user_data = user_data;
+    return sqe;
+}
-    // From here on, we only need to read from params, so pass `p` by value as immutable.
-    // The completion queue shares the mmap with the submission queue, so pass `sq` there too.
-    var sq = try Sq.init(fd, p.*);
-    errdefer sq.deinit();
-    var cq = try Cq.init(fd, p.*, sq);
-    errdefer cq.deinit();
+/// Prepares a socket creation request for a registered file, with the index chosen by the kernel
+/// (file index allocation). The chosen file index will be returned in the CQE res field.
+/// Available since 5.19
+pub fn socket_direct_alloc(
+    self: *IoUring,
+    user_data: u64,
+    domain: u32,
+    socket_type: u32,
+    protocol: u32,
+    flags: u32,
+) !*Sqe {
+    const sqe = try self.get_sqe();
+    sqe.prep_socket_direct_alloc(domain, socket_type, protocol, flags);
+    sqe.user_data = user_data;
+    return sqe;
+}
-    // Check that our starting state is as we expect.
-    assert(sq.head.* == 0);
-    assert(sq.tail.* == 0);
-    assert(sq.mask == p.sq_entries - 1);
-    // Allow flags.* to be non-zero, since the kernel may set IORING_SQ_NEED_WAKEUP at any time.
-    assert(sq.dropped.* == 0);
-    assert(sq.array.len == p.sq_entries);
-    assert(sq.sqes.len == p.sq_entries);
-    assert(sq.sqe_head == 0);
-    assert(sq.sqe_tail == 0);
+/// Queues (but does not submit) an SQE to perform a `bind(2)` on a socket.
+/// Returns a pointer to the SQE.
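+/// For example (illustrative sketch; `listen_fd` and `addr` are a hypothetical
+/// socket and a prepared `std.net.Address`):
+///
+///     _ = try ring.bind(0x01, listen_fd, &addr.any, addr.getOsSockLen(), 0);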
+/// Available since 6.11
+pub fn bind(
+    self: *IoUring,
+    user_data: u64,
+    fd: linux.fd_t,
+    addr: *const posix.sockaddr,
+    addrlen: posix.socklen_t,
+    flags: u32,
+) !*Sqe {
+    const sqe = try self.get_sqe();
+    sqe.prep_bind(fd, addr, addrlen, flags);
+    sqe.user_data = user_data;
+    return sqe;
+}
-    assert(cq.head.* == 0);
-    assert(cq.tail.* == 0);
-    assert(cq.mask == p.cq_entries - 1);
-    assert(cq.overflow.* == 0);
-    assert(cq.cqes.len == p.cq_entries);
+/// Queues (but does not submit) an SQE to perform a `listen(2)` on a socket.
+/// Returns a pointer to the SQE.
+/// Available since 6.11
+pub fn listen(
+    self: *IoUring,
+    user_data: u64,
+    fd: linux.fd_t,
+    backlog: usize,
+    flags: u32,
+) !*Sqe {
+    const sqe = try self.get_sqe();
+    sqe.prep_listen(fd, backlog, flags);
+    sqe.user_data = user_data;
+    return sqe;
+}
-    return .{
-        .fd = fd,
-        .sq = sq,
-        .cq = cq,
-        .flags = flags,
-        .features = features,
-    };
-}
+/// Prepares a cmd request for a socket.
+/// See: https://man7.org/linux/man-pages/man3/io_uring_prep_cmd.3.html
+/// Available since 6.7.
+pub fn cmd_sock(
+    self: *IoUring,
+    user_data: u64,
+    cmd_op: SocketOp,
+    fd: linux.fd_t,
+    level: u32, // linux.SOL
+    optname: u32, // linux.SO
+    optval: u64, // pointer to the option value
+    optlen: u32, // size of the option value
+) !*Sqe {
+    const sqe = try self.get_sqe();
+    sqe.prep_cmd_sock(cmd_op, fd, level, optname, optval, optlen);
+    sqe.user_data = user_data;
+    return sqe;
+}
-pub fn deinit(self: *IoUring) void {
-    assert(self.fd >= 0);
-    // The mmaps depend on the fd, so the order of these calls is important:
-    self.cq.deinit();
-    self.sq.deinit();
-    posix.close(self.fd);
-    self.fd = -1;
-}
+/// Prepares a set socket option request for the option `optname`, at the protocol
+/// level specified by `level`.
+/// Available since 6.7.
+pub fn setsockopt(
+    self: *IoUring,
+    user_data: u64,
+    fd: linux.fd_t,
+    level: u32, // linux.SOL
+    optname: u32, // linux.SO
+    opt: []const u8,
+) !*Sqe {
+    return try self.cmd_sock(
+        user_data,
+        .SETSOCKOPT,
+        fd,
+        level,
+        optname,
+        @intFromPtr(opt.ptr),
+        @intCast(opt.len),
+    );
+}
-/// Returns a pointer to a vacant SQE, or an error if the submission queue is full.
-/// We follow the implementation (and atomics) of liburing's `io_uring_get_sqe()` exactly.
-/// However, instead of a null we return an error to force safe handling.
-/// Any situation where the submission queue is full tends more towards a control flow error,
-/// and the null return in liburing is more a C idiom than anything else, for lack of a better
-/// alternative. In Zig, we have first-class error handling... so let's use it.
-/// Matches the implementation of io_uring_get_sqe() in liburing.
-pub fn get_sqe(self: *IoUring) !*Sqe {
-    const head = @atomicLoad(u32, self.sq.head, .acquire);
-    // Remember that these head and tail offsets wrap around every four billion operations.
-    // We must therefore use wrapping addition and subtraction to avoid a runtime crash.
-    const next = self.sq.sqe_tail +% 1;
-    if (next -% head > self.sq.sqes.len) return error.SubmissionQueueFull;
-    const sqe = &self.sq.sqes[self.sq.sqe_tail & self.sq.mask];
-    self.sq.sqe_tail = next;
-    return sqe;
-}
+/// Prepares a get socket option request to retrieve the value of the option `optname`
+/// for the socket referred to by `fd`.
+/// Available since 6.7.
+pub fn getsockopt(
+    self: *IoUring,
+    user_data: u64,
+    fd: linux.fd_t,
+    level: u32, // linux.SOL
+    optname: u32, // linux.SO
+    opt: []u8,
+) !*Sqe {
+    return try self.cmd_sock(
+        user_data,
+        .GETSOCKOPT,
+        fd,
+        level,
+        optname,
+        @intFromPtr(opt.ptr),
+        @intCast(opt.len),
+    );
+}
-/// Submits the SQEs acquired via get_sqe() to the kernel. You can call this once after you have
-/// called get_sqe() multiple times to setup multiple I/O requests.
-/// Returns the number of SQEs submitted, if not used alongside IORING_SETUP_SQPOLL.
-/// If the io_uring instance is uses IORING_SETUP_SQPOLL, the value returned on success is not
-/// guaranteed to match the amount of actually submitted sqes during this call. A value higher
-/// or lower, including 0, may be returned.
-/// Matches the implementation of io_uring_submit() in liburing.
-pub fn submit(self: *IoUring) !u32 {
-    return self.submit_and_wait(0);
-}
+/// Registers a shared buffer ring to be used with provided buffers.
+/// `entries` `io_uring_buf` structures are memory mapped and shared with the kernel.
+/// `fd` is the `IoUring.fd` for which the provided buffer ring is being registered.
+/// `entries` is the number of entries requested in the buffer ring; it must be a power of 2.
+/// `group_id` is the chosen buffer group ID, unique within the ring.
+pub fn setup_buf_ring(
+    fd: linux.fd_t,
+    entries: u16,
+    group_id: u16,
+    flags: BufferRegister.Flags,
+) !*align(page_size_min) BufferRing {
+    if (entries == 0 or entries > 1 << 15) return error.EntriesNotInRange;
+    if (!std.math.isPowerOfTwo(entries)) return error.EntriesNotPowerOfTwo;
+
+    const mmap_size = @as(usize, entries) * @sizeOf(Buffer);
+    const mmap = try posix.mmap(
+        null,
+        mmap_size,
+        posix.PROT.READ | posix.PROT.WRITE,
+        .{ .TYPE = .PRIVATE, .ANONYMOUS = true },
+        -1,
+        0,
+    );
+    errdefer posix.munmap(mmap);
+    assert(mmap.len == mmap_size);
+
+    const br: *align(page_size_min) BufferRing = @ptrCast(mmap.ptr);
+    try register_buf_ring(fd, @intFromPtr(br), entries, group_id, flags);
+    return br;
+}
-/// Like submit(), but allows waiting for events as well.
-/// Returns the number of SQEs submitted.
-/// Matches the implementation of io_uring_submit_and_wait() in liburing.
-pub fn submit_and_wait(self: *IoUring, wait_nr: u32) !u32 {
-    const submitted = self.flush_sq();
-    var flags: uflags.Enter = .{};
-    if (self.sq_ring_needs_enter(&flags) or wait_nr > 0) {
-        if (wait_nr > 0 or self.flags.IOPOLL) {
-            flags.GETEVENTS = true;
-        }
-        return try self.enter(submitted, wait_nr, flags);
+fn register_buf_ring(
+    fd: linux.fd_t,
+    addr: u64,
+    entries: u32,
+    group_id: u16,
+    flags: BufferRegister.Flags,
+) !void {
+    var reg = mem.zeroInit(BufferRegister, .{
+        .ring_addr = addr,
+        .ring_entries = entries,
+        .bgid = group_id,
+        .flags = flags,
+    });
+    var res = linux.io_uring_register(fd, .REGISTER_PBUF_RING, @as(*const anyopaque, @ptrCast(&reg)), 1);
+    if (linux.errno(res) == .INVAL and reg.flags.IOU_PBUF_RING_INC) {
+        // Retry without incremental buffer consumption: it is only available
+        // since kernel 6.12, and older kernels return INVAL when it is set.
+ reg.flags.IOU_PBUF_RING_INC = false; + res = linux.io_uring_register(fd, .REGISTER_PBUF_RING, @as(*const anyopaque, @ptrCast(®)), 1); } - return submitted; + try handle_register_buf_ring_result(res); +} + +fn unregister_buf_ring(fd: posix.fd_t, group_id: u16) !void { + var reg = mem.zeroInit(BufferRegister, .{ + .bgid = group_id, + }); + const res = linux.io_uring_register( + fd, + .UNREGISTER_PBUF_RING, + @as(*const anyopaque, @ptrCast(®)), + 1, + ); + try handle_register_buf_ring_result(res); } -/// Tell the kernel we have submitted SQEs and/or want to wait for CQEs. -/// Returns the number of SQEs submitted. -pub fn enter(self: *IoUring, to_submit: u32, min_complete: u32, flags: uflags.Enter) !u32 { - assert(self.fd >= 0); - const res = linux.io_uring_enter(self.fd, to_submit, min_complete, flags, null); +fn handle_register_buf_ring_result(res: usize) !void { switch (linux.errno(res)) { .SUCCESS => {}, - // The kernel was unable to allocate memory or ran out of resources for the request. - // The application should wait for some completions and try again: - .AGAIN => return error.SystemResources, - // The SQE `fd` is invalid, or IOSQE_FIXED_FILE was set but no files were registered: - .BADF => return error.FileDescriptorInvalid, - // The file descriptor is valid, but the ring is not in the right state. - // See io_uring_register(2) for how to enable the ring. - .BADFD => return error.FileDescriptorInBadState, - // The application attempted to overcommit the number of requests it can have pending. - // The application should wait for some completions and try again: - .BUSY => return error.CompletionQueueOvercommitted, - // The SQE is invalid, or valid but the ring was setup with IORING_SETUP_IOPOLL: - .INVAL => return error.SubmissionQueueEntryInvalid, - // The buffer is outside the process' accessible address space, or IORING_OP_READ_FIXED - // or IORING_OP_WRITE_FIXED was specified but no buffers were registered, or the range - // described by `addr` and `len` is not within the buffer registered at `buf_index`: - .FAULT => return error.BufferInvalid, - .NXIO => return error.RingShuttingDown, - // The kernel believes our `self.fd` does not refer to an io_uring instance, - // or the opcode is valid but not supported by this kernel (more likely): - .OPNOTSUPP => return error.OpcodeNotSupported, - // The operation was interrupted by a delivery of a signal before it could complete. - // This can happen while waiting for events with IORING_ENTER_GETEVENTS: - .INTR => return error.SignalInterrupt, + .INVAL => return error.ArgumentsInvalid, else => |errno| return posix.unexpectedErrno(errno), } - return @as(u32, @intCast(res)); } -/// Sync internal state with kernel ring state on the SQ side. -/// Returns the number of all pending events in the SQ ring, for the shared ring. -/// This return value includes previously flushed SQEs, as per liburing. -/// The rationale is to suggest that an io_uring_enter() call is needed rather than not. -/// Matches the implementation of __io_uring_flush_sq() in liburing. -pub fn flush_sq(self: *IoUring) u32 { - if (self.sq.sqe_head != self.sq.sqe_tail) { - // Fill in SQEs that we have queued up, adding them to the kernel ring. 
-        const to_submit = self.sq.sqe_tail -% self.sq.sqe_head;
-        var tail = self.sq.tail.*;
-        var i: usize = 0;
-        while (i < to_submit) : (i += 1) {
-            self.sq.array[tail & self.sq.mask] = self.sq.sqe_head & self.sq.mask;
-            tail +%= 1;
-            self.sq.sqe_head +%= 1;
-        }
-        // Ensure that the kernel can actually see the SQE updates when it sees the tail update.
-        @atomicStore(u32, self.sq.tail, tail, .release);
-    }
-    return self.sq_ready();
-}
+/// Unregisters a previously registered shared buffer ring, as returned from `setup_buf_ring`.
+pub fn free_buf_ring(fd: posix.fd_t, br: *align(page_size_min) BufferRing, entries: u32, group_id: u16) void {
+    unregister_buf_ring(fd, group_id) catch {};
+    var mmap: []align(page_size_min) u8 = undefined;
+    mmap.ptr = @ptrCast(br);
+    mmap.len = entries * @sizeOf(Buffer);
+    posix.munmap(mmap);
+}
-/// Returns true if we are not using an SQ thread (thus nobody submits but us),
-/// or if IORING_SQ_NEED_WAKEUP is set and the SQ thread must be explicitly awakened.
-/// For the latter case, we set the SQ thread wakeup flag.
-/// Matches the implementation of sq_ring_needs_enter() in liburing.
-pub fn sq_ring_needs_enter(self: *IoUring, flags: *uflags.Enter) bool {
-    assert(@as(u32, @bitCast(flags.*)) == 0);
-    if (!self.flags.SQPOLL) return true;
-    if (@atomicLoad(Sq.Flags, self.sq.flags, .unordered).NEED_WAKEUP) {
-        flags.*.SQ_WAKEUP = true;
-        return true;
-    }
-    return false;
-}
+/// Initialises `br` so that it is ready to be used.
+pub fn buf_ring_init(br: *BufferRing) void {
+    br.tail = 0;
+}
-/// Returns the number of flushed and unflushed SQEs pending in the submission queue.
-/// In other words, this is the number of SQEs in the submission queue, i.e. its length.
-/// These are SQEs that the kernel is yet to consume.
-/// Matches the implementation of io_uring_sq_ready in liburing.
-pub fn sq_ready(self: *IoUring) u32 {
-    // Always use the shared ring state (i.e. head and not sqe_head) to avoid going out of sync,
-    // see https://github.com/axboe/liburing/issues/92.
-    return self.sq.sqe_tail -% @atomicLoad(u32, self.sq.head, .acquire);
-}
+/// Calculates the appropriate size mask for a buffer ring.
+/// `entries` is the ring entries as specified in io_uring_register_buf_ring.
+pub fn buf_ring_mask(entries: u16) u16 {
+    return entries - 1;
+}
-/// Returns the number of CQEs in the completion queue, i.e. its length.
-/// These are CQEs that the application is yet to consume.
-/// Matches the implementation of io_uring_cq_ready in liburing.
-pub fn cq_ready(self: *IoUring) u32 {
-    return @atomicLoad(u32, self.cq.tail, .acquire) -% self.cq.head.*;
-}
+/// Adds `buffer` to the `br` buffer ring.
+/// `buffer_id` is an identifier which will be returned in the CQE.
+/// `buffer_offset` is the offset to insert at from the current tail.
+/// If just one buffer is provided before the ring tail is committed with advance, then offset should be 0.
+/// If buffers are provided in a loop before being committed, the offset must be incremented by one for each buffer added.
+pub fn buf_ring_add(
+    br: *BufferRing,
+    buffer: []u8,
+    buffer_id: u16,
+    mask: u16,
+    buffer_offset: u16,
+) void {
+    const bufs: [*]Buffer = @ptrCast(br);
+    const buf: *Buffer = &bufs[(br.tail +% buffer_offset) & mask];
+
+    buf.addr = @intFromPtr(buffer.ptr);
+    buf.len = @intCast(buffer.len);
+    buf.bid = buffer_id;
+}
-/// Copies as many CQEs as are ready, and that can fit into the destination `cqes` slice.
-/// If none are available, enters into the kernel to wait for at most `wait_nr` CQEs.
-/// Returns the number of CQEs copied, advancing the CQ ring. -/// Provides all the wait/peek methods found in liburing, but with batching and a single method. -/// The rationale for copying CQEs rather than copying pointers is that pointers are 8 bytes -/// whereas CQEs are not much more at only 16 bytes, and this provides a safer faster interface. -/// Safer, because you no longer need to call cqe_seen(), avoiding idempotency bugs. -/// Faster, because we can now amortize the atomic store release to `cq.head` across the batch. -/// See https://github.com/axboe/liburing/issues/103#issuecomment-686665007. -/// Matches the implementation of io_uring_peek_batch_cqe() in liburing, but supports waiting. -pub fn copy_cqes(self: *IoUring, cqes: []Cqe, wait_nr: u32) !u32 { - const count = self.copy_cqes_ready(cqes); - if (count > 0) return count; - if (self.cq_ring_needs_flush() or wait_nr > 0) { - _ = try self.enter(0, wait_nr, .{ .GETEVENTS = true }); - return self.copy_cqes_ready(cqes); +/// Make `count` new buffers visible to the kernel. Called after +/// `io_uring_buf_ring_add` has been called `count` times to fill in new buffers. +pub fn buf_ring_advance(br: *BufferRing, count: u16) void { + const tail: u16 = br.tail +% count; + @atomicStore(u16, &br.tail, tail, .release); +} + +test "structs/offsets/entries" { + if (!is_linux) return error.SkipZigTest; + + try testing.expectEqual(@as(usize, 120), @sizeOf(Params)); + try testing.expectEqual(@as(usize, 64), @sizeOf(Sqe)); + try testing.expectEqual(@as(usize, 16), @sizeOf(Cqe)); + + try testing.expectEqual(0, constants.OFF_SQ_RING); + try testing.expectEqual(0x8000000, constants.OFF_CQ_RING); + try testing.expectEqual(0x10000000, constants.OFF_SQES); + + try testing.expectError(error.EntriesZero, IoUring.init(0, .{})); + try testing.expectError(error.EntriesNotPowerOfTwo, IoUring.init(3, .{})); +} + +test "nop" { + if (!is_linux) return error.SkipZigTest; + + var ring = IoUring.init(1, .{}) catch |err| switch (err) { + error.SystemOutdated => return error.SkipZigTest, + error.PermissionDenied => return error.SkipZigTest, + else => return err, + }; + defer { + ring.deinit(); + testing.expectEqual(@as(linux.fd_t, -1), ring.fd) catch @panic("test failed"); } - return 0; + + const sqe = try ring.nop(0xaaaaaaaa); + try testing.expectEqual(Sqe{ + .opcode = .NOP, + .flags = .{}, + .ioprio = .init_empty(), + .fd = 0, + .off = 0, + .addr = 0, + .len = 0, + .rw_flags = 0, + .user_data = 0xaaaaaaaa, + .buf_index = 0, + .personality = 0, + .splice_fd_in = 0, + .addr3 = 0, + .resv = 0, + }, sqe.*); + + try testing.expectEqual(@as(u32, 0), ring.sq.sqe_head); + try testing.expectEqual(@as(u32, 1), ring.sq.sqe_tail); + try testing.expectEqual(@as(u32, 0), ring.sq.tail.*); + try testing.expectEqual(@as(u32, 0), ring.cq.head.*); + try testing.expectEqual(@as(u32, 1), ring.sq_ready()); + try testing.expectEqual(@as(u32, 0), ring.cq_ready()); + + try testing.expectEqual(@as(u32, 1), try ring.submit()); + try testing.expectEqual(@as(u32, 1), ring.sq.sqe_head); + try testing.expectEqual(@as(u32, 1), ring.sq.sqe_tail); + try testing.expectEqual(@as(u32, 1), ring.sq.tail.*); + try testing.expectEqual(@as(u32, 0), ring.cq.head.*); + try testing.expectEqual(@as(u32, 0), ring.sq_ready()); + + try testing.expectEqual(Cqe{ + .user_data = 0xaaaaaaaa, + .res = 0, + .flags = .{}, + }, try ring.copy_cqe()); + try testing.expectEqual(@as(u32, 1), ring.cq.head.*); + try testing.expectEqual(@as(u32, 0), ring.cq_ready()); + + const sqe_barrier = try ring.nop(0xbbbbbbbb); + 
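+    // IO_DRAIN acts as a barrier: the kernel only issues this SQE once all
+    // previously submitted SQEs have completed.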
sqe_barrier.flags.IO_DRAIN = true; + try testing.expectEqual(@as(u32, 1), try ring.submit()); + try testing.expectEqual(Cqe{ + .user_data = 0xbbbbbbbb, + .res = 0, + .flags = .{}, + }, try ring.copy_cqe()); + try testing.expectEqual(@as(u32, 2), ring.sq.sqe_head); + try testing.expectEqual(@as(u32, 2), ring.sq.sqe_tail); + try testing.expectEqual(@as(u32, 2), ring.sq.tail.*); + try testing.expectEqual(@as(u32, 2), ring.cq.head.*); } -fn copy_cqes_ready(self: *IoUring, cqes: []Cqe) u32 { - const ready = self.cq_ready(); - const count = @min(cqes.len, ready); - const head = self.cq.head.* & self.cq.mask; +test "readv" { + if (!is_linux) return error.SkipZigTest; + + var ring = IoUring.init(1, .{}) catch |err| switch (err) { + error.SystemOutdated => return error.SkipZigTest, + error.PermissionDenied => return error.SkipZigTest, + else => return err, + }; + defer ring.deinit(); + + const fd = try posix.openZ("/dev/zero", .{ .ACCMODE = .RDONLY, .CLOEXEC = true }, 0); + defer posix.close(fd); + + // Linux Kernel 5.4 supports IORING_REGISTER_FILES but not sparse fd sets (i.e. an fd of -1). + // Linux Kernel 5.5 adds support for sparse fd sets. + // Compare: + // https://github.com/torvalds/linux/blob/v5.4/fs/io_uring.c#L3119-L3124 vs + // https://github.com/torvalds/linux/blob/v5.8/fs/io_uring.c#L6687-L6691 + // We therefore avoid stressing sparse fd sets here: + var registered_fds = [_]linux.fd_t{0} ** 1; + const fd_index = 0; + registered_fds[fd_index] = fd; + try ring.register_files(registered_fds[0..]); + + var buffer = [_]u8{42} ** 128; + var iovecs = [_]posix.iovec{posix.iovec{ .base = &buffer, .len = buffer.len }}; + const sqe = try ring.read(0xcccccccc, fd_index, .{ .iovecs = iovecs[0..] }, 0); + try testing.expectEqual(Op.READV, sqe.opcode); + sqe.flags.FIXED_FILE = true; + + try testing.expectError(error.SubmissionQueueFull, ring.nop(0)); + try testing.expectEqual(@as(u32, 1), try ring.submit()); + try testing.expectEqual(Cqe{ + .user_data = 0xcccccccc, + .res = buffer.len, + .flags = .{}, + }, try ring.copy_cqe()); + try testing.expectEqualSlices(u8, &([_]u8{0} ** buffer.len), buffer[0..]); + + try ring.unregister_files(); +} - // before wrapping - const n = @min(self.cq.cqes.len - head, count); - @memcpy(cqes[0..n], self.cq.cqes[head..][0..n]); +test "writev/fsync/readv" { + if (!is_linux) return error.SkipZigTest; - if (count > n) { - // wrap self.cq.cqes - const w = count - n; - @memcpy(cqes[n..][0..w], self.cq.cqes[0..w]); - } + var ring = IoUring.init(4, .{}) catch |err| switch (err) { + error.SystemOutdated => return error.SkipZigTest, + error.PermissionDenied => return error.SkipZigTest, + else => return err, + }; + defer ring.deinit(); - self.cq_advance(count); - return count; -} + var tmp = std.testing.tmpDir(.{}); + defer tmp.cleanup(); -/// Returns a copy of an I/O completion, waiting for it if necessary, and advancing the CQ ring. -/// A convenience method for `copy_cqes()` for when you don't need to batch or peek. -pub fn copy_cqe(ring: *IoUring) !Cqe { - var cqes: [1]Cqe = undefined; - while (true) { - const count = try ring.copy_cqes(&cqes, 1); - if (count > 0) return cqes[0]; - } -} + const path = "test_io_uring_writev_fsync_readv"; + const file = try tmp.dir.createFile(path, .{ .read = true, .truncate = true }); + defer file.close(); + const fd = file.handle; -/// Matches the implementation of cq_ring_needs_flush() in liburing. 
-pub fn cq_ring_needs_flush(self: *IoUring) bool { - return @atomicLoad(Sq.Flags, self.sq.flags, .unordered).CQ_OVERFLOW; -} + const buffer_write = [_]u8{42} ** 128; + const iovecs_write = [_]posix.iovec_const{ + posix.iovec_const{ .base = &buffer_write, .len = buffer_write.len }, + }; + var buffer_read = [_]u8{0} ** 128; + var iovecs_read = [_]posix.iovec{ + posix.iovec{ .base = &buffer_read, .len = buffer_read.len }, + }; -/// For advanced use cases only that implement custom completion queue methods. -/// If you use copy_cqes() or copy_cqe() you must not call cqe_seen() or cq_advance(). -/// Must be called exactly once after a zero-copy CQE has been processed by your application. -/// Not idempotent, calling more than once will result in other CQEs being lost. -/// Matches the implementation of cqe_seen() in liburing. -pub fn cqe_seen(self: *IoUring, cqe: *Cqe) void { - _ = cqe; - self.cq_advance(1); -} + const sqe_writev = try ring.writev(0xdddddddd, fd, iovecs_write[0..], 17); + try testing.expectEqual(Op.WRITEV, sqe_writev.opcode); + try testing.expectEqual(@as(u64, 17), sqe_writev.off); + sqe_writev.flags.IO_LINK = true; -/// For advanced use cases only that implement custom completion queue methods. -/// Matches the implementation of cq_advance() in liburing. -pub fn cq_advance(self: *IoUring, count: u32) void { - if (count > 0) { - // Ensure the kernel only sees the new head value after the CQEs have been read. - @atomicStore(u32, self.cq.head, self.cq.head.* +% count, .release); - } -} + const sqe_fsync = try ring.fsync(0xeeeeeeee, fd, .{}); + try testing.expectEqual(Op.FSYNC, sqe_fsync.opcode); + try testing.expectEqual(fd, sqe_fsync.fd); + sqe_fsync.flags.IO_LINK = true; -/// Queues (but does not submit) an SQE to perform an `fsync(2)`. -/// Returns a pointer to the SQE so that you can further modify the SQE for advanced use cases. -/// For example, for `fdatasync()` you can set `IORING_FSYNC_DATASYNC` in the SQE's `rw_flags`. -/// N.B. While SQEs are initiated in the order in which they appear in the submission queue, -/// operations execute in parallel and completions are unordered. Therefore, an application that -/// submits a write followed by an fsync in the submission queue cannot expect the fsync to -/// apply to the write, since the fsync may complete before the write is issued to the disk. -/// You should preferably use `link_with_next_sqe()` on a write's SQE to link it with an fsync, -/// or else insert a full write barrier using `drain_previous_sqes()` when queueing an fsync. -pub fn fsync(self: *IoUring, user_data: u64, fd: posix.fd_t, flags: uflags.Fsync) !*Sqe { - const sqe = try self.get_sqe(); - sqe.prep_fsync(fd, flags); - sqe.user_data = user_data; - return sqe; -} + const sqe_readv = try ring.read(0xffffffff, fd, .{ .iovecs = iovecs_read[0..] }, 17); + try testing.expectEqual(Op.READV, sqe_readv.opcode); + try testing.expectEqual(@as(u64, 17), sqe_readv.off); -/// Queues (but does not submit) an SQE to perform a no-op. -/// Returns a pointer to the SQE so that you can further modify the SQE for advanced use cases. -/// A no-op is more useful than may appear at first glance. -/// For example, you could call `drain_previous_sqes()` on the returned SQE, to use the no-op to -/// know when the ring is idle before acting on a kill signal. 
-pub fn nop(self: *IoUring, user_data: u64) !*Sqe { - const sqe = try self.get_sqe(); - sqe.prep_nop(); - sqe.user_data = user_data; - return sqe; -} + try testing.expectEqual(@as(u32, 3), ring.sq_ready()); + try testing.expectEqual(@as(u32, 3), try ring.submit_and_wait(3)); + try testing.expectEqual(@as(u32, 0), ring.sq_ready()); + try testing.expectEqual(@as(u32, 3), ring.cq_ready()); -/// Used to select how the read should be handled. -pub const ReadBuffer = union(enum) { - /// io_uring will read directly into this buffer - buffer: []u8, + try testing.expectEqual(Cqe{ + .user_data = 0xdddddddd, + .res = buffer_write.len, + .flags = .{}, + }, try ring.copy_cqe()); + try testing.expectEqual(@as(u32, 2), ring.cq_ready()); - /// io_uring will read directly into these buffers using readv. - iovecs: []const posix.iovec, + try testing.expectEqual(Cqe{ + .user_data = 0xeeeeeeee, + .res = 0, + .flags = .{}, + }, try ring.copy_cqe()); + try testing.expectEqual(@as(u32, 1), ring.cq_ready()); - /// io_uring will select a buffer that has previously been provided with `provide_buffers`. - /// The buffer group reference by `group_id` must contain at least one buffer for the read to work. - /// `len` controls the number of bytes to read into the selected buffer. - buffer_selection: struct { - group_id: u16, - len: usize, - }, -}; + try testing.expectEqual(Cqe{ + .user_data = 0xffffffff, + .res = buffer_read.len, + .flags = .{}, + }, try ring.copy_cqe()); + try testing.expectEqual(@as(u32, 0), ring.cq_ready()); -/// Queues (but does not submit) an SQE to perform a `read(2)` or `preadv(2)` depending on the buffer type. -/// * Reading into a `ReadBuffer.buffer` uses `read(2)` -/// * Reading into a `ReadBuffer.iovecs` uses `preadv(2)` -/// If you want to do a `preadv2(2)` then set `rw_flags` on the returned SQE. See https://man7.org/linux/man-pages/man2/preadv2.2.html -/// -/// Returns a pointer to the SQE. -pub fn read( - self: *IoUring, - user_data: u64, - fd: linux.fd_t, - buffer: ReadBuffer, - offset: u64, -) !*Sqe { - const sqe = try self.get_sqe(); - switch (buffer) { - .buffer => |slice| sqe.prep_read(fd, slice, offset), - .iovecs => |vecs| sqe.prep_readv(fd, vecs, offset), - .buffer_selection => |selection| { - sqe.prep_rw(.READ, fd, 0, selection.len, offset); - sqe.flags.BUFFER_SELECT = true; - sqe.buf_index = selection.group_id; - }, - } - sqe.user_data = user_data; - return sqe; + try testing.expectEqualSlices(u8, buffer_write[0..], buffer_read[0..]); } -/// Queues (but does not submit) an SQE to perform a `write(2)`. -/// Returns a pointer to the SQE. -pub fn write( - self: *IoUring, - user_data: u64, - fd: linux.fd_t, - buffer: []const u8, - offset: u64, -) !*Sqe { - const sqe = try self.get_sqe(); - sqe.prep_write(fd, buffer, offset); - sqe.user_data = user_data; - return sqe; -} +test "write/read" { + if (!is_linux) return error.SkipZigTest; -/// Queues (but does not submit) an SQE to perform a `splice(2)` -/// Either `fd_in` or `fd_out` must be a pipe. -/// If `fd_in` refers to a pipe, `off_in` is ignored and must be set to std.math.maxInt(u64). -/// If `fd_in` does not refer to a pipe and `off_in` is maxInt(u64), then `len` are read -/// from `fd_in` starting from the file offset, which is incremented by the number of bytes read. -/// If `fd_in` does not refer to a pipe and `off_in` is not maxInt(u64), then the starting offset of `fd_in` will be `off_in`. 
-/// This splice operation can be used to implement sendfile by splicing to an intermediate pipe first,
-/// then splice to the final destination. In fact, the implementation of sendfile in kernel uses splice internally.
-///
-/// NOTE that even if fd_in or fd_out refers to a pipe, the splice operation can still fail with EINVAL if one of the
-/// fd doesn't explicitly support splice peration, e.g. reading from terminal is unsupported from kernel 5.7 to 5.11.
-/// See https://github.com/axboe/liburing/issues/291
-///
-/// Returns a pointer to the SQE so that you can further modify the SQE for advanced use cases.
-pub fn splice(self: *IoUring, user_data: u64, fd_in: posix.fd_t, off_in: u64, fd_out: posix.fd_t, off_out: u64, len: usize) !*Sqe {
-    const sqe = try self.get_sqe();
-    sqe.prep_splice(fd_in, off_in, fd_out, off_out, len);
-    sqe.user_data = user_data;
-    return sqe;
-}
+test "splice/read" {
+    if (!is_linux) return error.SkipZigTest;
+
+    var ring = IoUring.init(4, .{}) catch |err| switch (err) {
+        error.SystemOutdated => return error.SkipZigTest,
+        error.PermissionDenied => return error.SkipZigTest,
+        else => return err,
+    };
+    defer ring.deinit();
-/// Queues (but does not submit) an SQE to perform a IORING_OP_READ_FIXED.
-/// The `buffer` provided must be registered with the kernel by calling `register_buffers` first.
-/// The `buffer_index` must be the same as its index in the array provided to `register_buffers`.
-///
-/// Returns a pointer to the SQE so that you can further modify the SQE for advanced use cases.
-pub fn read_fixed(
-    self: *IoUring,
-    user_data: u64,
-    fd: linux.fd_t,
-    buffer: *posix.iovec,
-    offset: u64,
-    buffer_index: u16,
-) !*Sqe {
-    const sqe = try self.get_sqe();
-    sqe.prep_read_fixed(fd, buffer, offset, buffer_index);
-    sqe.user_data = user_data;
-    return sqe;
+    var tmp = std.testing.tmpDir(.{});
+    defer tmp.cleanup();
+    const path_src = "test_io_uring_splice_src";
+    const file_src = try tmp.dir.createFile(path_src, .{ .read = true, .truncate = true });
+    defer file_src.close();
+    const fd_src = file_src.handle;
-}
-/// Queues (but does not submit) an SQE to perform a `pwritev()`.
-/// Returns a pointer to the SQE so that you can further modify the SQE for advanced use cases.
-/// For example, if you want to do a `pwritev2()` then set `rw_flags` on the returned SQE.
-/// See https://linux.die.net/man/2/pwritev.
-pub fn writev( - self: *IoUring, - user_data: u64, - fd: linux.fd_t, - iovecs: []const posix.iovec_const, - offset: u64, -) !*Sqe { - const sqe = try self.get_sqe(); - sqe.prep_writev(fd, iovecs, offset); - sqe.user_data = user_data; - return sqe; -} +test "splice/read" { + if (!is_linux) return error.SkipZigTest; -/// Queues (but does not submit) an SQE to perform a IORING_OP_WRITE_FIXED. -/// The `buffer` provided must be registered with the kernel by calling `register_buffers` first. -/// The `buffer_index` must be the same as its index in the array provided to `register_buffers`. -/// -/// Returns a pointer to the SQE so that you can further modify the SQE for advanced use cases. -pub fn write_fixed( - self: *IoUring, - user_data: u64, - fd: linux.fd_t, - buffer: *posix.iovec, - offset: u64, - buffer_index: u16, -) !*Sqe { - const sqe = try self.get_sqe(); - sqe.prep_write_fixed(fd, buffer, offset, buffer_index); - sqe.user_data = user_data; - return sqe; -} + var ring = IoUring.init(4, .{}) catch |err| switch (err) { + error.SystemOutdated => return error.SkipZigTest, + error.PermissionDenied => return error.SkipZigTest, + else => return err, + }; + defer ring.deinit(); -/// Queues (but does not submit) an SQE to perform an `accept4(2)` on a socket. -/// Returns a pointer to the SQE. -/// Available since 5.5 -pub fn accept( - self: *IoUring, - user_data: u64, - fd: linux.fd_t, - addr: ?*posix.sockaddr, - addrlen: ?*posix.socklen_t, - flags: u32, -) !*Sqe { - const sqe = try self.get_sqe(); - sqe.prep_accept(fd, addr, addrlen, flags); - sqe.user_data = user_data; - return sqe; -} + var tmp = std.testing.tmpDir(.{}); + const path_src = "test_io_uring_splice_src"; + const file_src = try tmp.dir.createFile(path_src, .{ .read = true, .truncate = true }); + defer file_src.close(); + const fd_src = file_src.handle; -/// Queues an multishot accept on a socket. -/// -/// Multishot variant allows an application to issue a single accept request, -/// which will repeatedly trigger a CQE when a connection request comes in. -/// While IORING_CQE_F_MORE flag is set in CQE flags accept will generate -/// further CQEs. -/// -/// Available since 5.19 -pub fn accept_multishot( - self: *IoUring, - user_data: u64, - fd: linux.fd_t, - addr: ?*posix.sockaddr, - addrlen: ?*posix.socklen_t, - flags: u32, -) !*Sqe { - const sqe = try self.get_sqe(); - sqe.prep_multishot_accept(fd, addr, addrlen, flags); - sqe.user_data = user_data; - return sqe; -} + const path_dst = "test_io_uring_splice_dst"; + const file_dst = try tmp.dir.createFile(path_dst, .{ .read = true, .truncate = true }); + defer file_dst.close(); + const fd_dst = file_dst.handle; -/// Queues an accept using direct (registered) file descriptors. -/// -/// To use an accept direct variant, the application must first have registered -/// a file table (with register_files). An unused table index will be -/// dynamically chosen and returned in the CQE res field. -/// -/// After creation, they can be used by setting IOSQE_FIXED_FILE in the SQE -/// flags member, and setting the SQE fd field to the direct descriptor value -/// rather than the regular file descriptor. 
-/// -/// Available since 5.19 -pub fn accept_direct( - self: *IoUring, - user_data: u64, - fd: linux.fd_t, - addr: ?*posix.sockaddr, - addrlen: ?*posix.socklen_t, - flags: u32, -) !*Sqe { - const sqe = try self.get_sqe(); - sqe.prep_accept_direct(fd, addr, addrlen, flags, constants.FILE_INDEX_ALLOC); - sqe.user_data = user_data; - return sqe; -} + const buffer_write = [_]u8{97} ** 20; + var buffer_read = [_]u8{98} ** 20; + _ = try file_src.write(&buffer_write); -/// Queues an multishot accept using direct (registered) file descriptors. -/// Available since 5.19 -pub fn accept_multishot_direct( - self: *IoUring, - user_data: u64, - fd: linux.fd_t, - addr: ?*posix.sockaddr, - addrlen: ?*posix.socklen_t, - flags: u32, -) !*Sqe { - const sqe = try self.get_sqe(); - sqe.prep_multishot_accept_direct(fd, addr, addrlen, flags); - sqe.user_data = user_data; - return sqe; -} + const fds = try posix.pipe(); + const pipe_offset: u64 = std.math.maxInt(u64); -/// Queue (but does not submit) an SQE to perform a `connect(2)` on a socket. -/// Returns a pointer to the SQE. -pub fn connect( - self: *IoUring, - user_data: u64, - fd: linux.fd_t, - addr: *const posix.sockaddr, - addrlen: posix.socklen_t, -) !*Sqe { - const sqe = try self.get_sqe(); - sqe.prep_connect(fd, addr, addrlen); - sqe.user_data = user_data; - return sqe; -} + const sqe_splice_to_pipe = try ring.splice(0x11111111, fd_src, 0, fds[1], pipe_offset, buffer_write.len); + try testing.expectEqual(Op.SPLICE, sqe_splice_to_pipe.opcode); + try testing.expectEqual(@as(u64, 0), sqe_splice_to_pipe.addr); + try testing.expectEqual(pipe_offset, sqe_splice_to_pipe.off); + sqe_splice_to_pipe.flags.IO_LINK = true; -/// Queues (but does not submit) an SQE to perform a `epoll_ctl(2)`. -/// Returns a pointer to the SQE. -pub fn epoll_ctl( - self: *IoUring, - user_data: u64, - epfd: linux.fd_t, - fd: linux.fd_t, - op: u32, - ev: ?*linux.epoll_event, -) !*Sqe { - const sqe = try self.get_sqe(); - sqe.prep_epoll_ctl(epfd, fd, op, ev); - sqe.user_data = user_data; - return sqe; + const sqe_splice_from_pipe = try ring.splice(0x22222222, fds[0], pipe_offset, fd_dst, 10, buffer_write.len); + try testing.expectEqual(Op.SPLICE, sqe_splice_from_pipe.opcode); + try testing.expectEqual(pipe_offset, sqe_splice_from_pipe.addr); + try testing.expectEqual(@as(u64, 10), sqe_splice_from_pipe.off); + sqe_splice_from_pipe.flags.IO_LINK = true; + + const sqe_read = try ring.read(0x33333333, fd_dst, .{ .buffer = buffer_read[0..] 
}, 10); + try testing.expectEqual(Op.READ, sqe_read.opcode); + try testing.expectEqual(@as(u64, 10), sqe_read.off); + try testing.expectEqual(@as(u32, 3), try ring.submit()); + + const cqe_splice_to_pipe = try ring.copy_cqe(); + const cqe_splice_from_pipe = try ring.copy_cqe(); + const cqe_read = try ring.copy_cqe(); + // Prior to Linux Kernel 5.6 this is the only way to test for splice/read support: + // https://lwn.net/Articles/809820/ + if (cqe_splice_to_pipe.err() == .INVAL) return error.SkipZigTest; + if (cqe_splice_from_pipe.err() == .INVAL) return error.SkipZigTest; + if (cqe_read.err() == .INVAL) return error.SkipZigTest; + try testing.expectEqual(Cqe{ + .user_data = 0x11111111, + .res = buffer_write.len, + .flags = .{}, + }, cqe_splice_to_pipe); + try testing.expectEqual(Cqe{ + .user_data = 0x22222222, + .res = buffer_write.len, + .flags = .{}, + }, cqe_splice_from_pipe); + try testing.expectEqual(Cqe{ + .user_data = 0x33333333, + .res = buffer_read.len, + .flags = .{}, + }, cqe_read); + try testing.expectEqualSlices(u8, buffer_write[0..], buffer_read[0..]); } -/// Used to select how the recv call should be handled. -pub const RecvBuffer = union(enum) { - /// io_uring will recv directly into this buffer - buffer: []u8, +test "write_fixed/read_fixed" { + if (!is_linux) return error.SkipZigTest; - /// io_uring will select a buffer that has previously been provided with `provide_buffers`. - /// The buffer group referenced by `group_id` must contain at least one buffer for the recv call to work. - /// `len` controls the number of bytes to read into the selected buffer. - buffer_selection: struct { - group_id: u16, - len: usize, - }, -}; + var ring = IoUring.init(2, .{}) catch |err| switch (err) { + error.SystemOutdated => return error.SkipZigTest, + error.PermissionDenied => return error.SkipZigTest, + else => return err, + }; + defer ring.deinit(); -/// Queues (but does not submit) an SQE to perform a `recv(2)`. -/// Returns a pointer to the SQE. -/// Available since 5.6 -pub fn recv( - self: *IoUring, - user_data: u64, - fd: linux.fd_t, - buffer: RecvBuffer, - flags: u32, -) !*Sqe { - const sqe = try self.get_sqe(); - switch (buffer) { - .buffer => |slice| sqe.prep_recv(fd, slice, flags), - .buffer_selection => |selection| { - sqe.prep_rw(.RECV, fd, 0, selection.len, 0); - sqe.rw_flags = flags; - sqe.flags.BUFFER_SELECT = true; - sqe.buf_index = selection.group_id; + var tmp = std.testing.tmpDir(.{}); + defer tmp.cleanup(); + + const path = "test_io_uring_write_read_fixed"; + const file = try tmp.dir.createFile(path, .{ .read = true, .truncate = true }); + defer file.close(); + const fd = file.handle; + + var raw_buffers: [2][11]u8 = undefined; + // First buffer will be written to the file. + @memset(&raw_buffers[0], 'z'); + raw_buffers[0][0.."foobar".len].* = "foobar".*; + + var buffers = [2]posix.iovec{ + .{ .base = &raw_buffers[0], .len = raw_buffers[0].len }, + .{ .base = &raw_buffers[1], .len = raw_buffers[1].len }, + }; + ring.register_buffers(&buffers) catch |err| switch (err) { + error.SystemResources => { + // See https://github.com/ziglang/zig/issues/15362 + return error.SkipZigTest; }, - } - sqe.user_data = user_data; - return sqe; -} + else => |e| return e, + }; -/// Queues (but does not submit) an SQE to perform a `send(2)`. -/// Returns a pointer to the SQE. 
-/// Available since 5.6 -pub fn send( - self: *IoUring, - user_data: u64, - fd: linux.fd_t, - buffer: []const u8, - flags: u32, -) !*Sqe { - const sqe = try self.get_sqe(); - sqe.prep_send(fd, buffer, flags); - sqe.user_data = user_data; - return sqe; -} + const sqe_write = try ring.write_fixed(0x45454545, fd, &buffers[0], 3, 0); + try testing.expectEqual(Op.WRITE_FIXED, sqe_write.opcode); + try testing.expectEqual(@as(u64, 3), sqe_write.off); + sqe_write.flags.IO_LINK = true; -/// Queues (but does not submit) an SQE to perform an async zerocopy `send(2)`. -/// -/// This operation will most likely produce two CQEs. The flags field of the -/// first cqe may likely contain IORING_CQE_F_MORE, which means that there will -/// be a second cqe with the user_data field set to the same value. The user -/// must not modify the data buffer until the notification is posted. The first -/// cqe follows the usual rules and so its res field will contain the number of -/// bytes sent or a negative error code. The notification's res field will be -/// set to zero and the flags field will contain IORING_CQE_F_NOTIF. The two -/// step model is needed because the kernel may hold on to buffers for a long -/// time, e.g. waiting for a TCP ACK. Notifications responsible for controlling -/// the lifetime of the buffers. Even errored requests may generate a -/// notification. -/// -/// Available since 6.0 -pub fn send_zc( - self: *IoUring, - user_data: u64, - fd: linux.fd_t, - buffer: []const u8, - send_flags: u32, - zc_flags: u16, -) !*Sqe { - const sqe = try self.get_sqe(); - sqe.prep_send_zc(fd, buffer, send_flags, zc_flags); - sqe.user_data = user_data; - return sqe; -} + const sqe_read = try ring.read_fixed(0x12121212, fd, &buffers[1], 0, 1); + try testing.expectEqual(Op.READ_FIXED, sqe_read.opcode); + try testing.expectEqual(@as(u64, 0), sqe_read.off); -/// Queues (but does not submit) an SQE to perform an async zerocopy `send(2)`. -/// Returns a pointer to the SQE. -/// Available since 6.0 -pub fn send_zc_fixed( - self: *IoUring, - user_data: u64, - fd: linux.fd_t, - buffer: []const u8, - send_flags: u32, - zc_flags: u16, - buf_index: u16, -) !*Sqe { - const sqe = try self.get_sqe(); - sqe.prep_send_zc_fixed(fd, buffer, send_flags, zc_flags, buf_index); - sqe.user_data = user_data; - return sqe; -} + try testing.expectEqual(@as(u32, 2), try ring.submit()); -/// Queues (but does not submit) an SQE to perform a `recvmsg(2)`. -/// Returns a pointer to the SQE. -/// Available since 5.3 -pub fn recvmsg( - self: *IoUring, - user_data: u64, - fd: linux.fd_t, - msg: *linux.msghdr, - flags: u32, -) !*Sqe { - const sqe = try self.get_sqe(); - sqe.prep_recvmsg(fd, msg, flags); - sqe.user_data = user_data; - return sqe; -} + const cqe_write = try ring.copy_cqe(); + const cqe_read = try ring.copy_cqe(); -/// Queues (but does not submit) an SQE to perform a `sendmsg(2)`. -/// Returns a pointer to the SQE. -/// Available since 5.3 -pub fn sendmsg( - self: *IoUring, - user_data: u64, - fd: linux.fd_t, - msg: *const linux.msghdr_const, - flags: u32, -) !*Sqe { - const sqe = try self.get_sqe(); - sqe.prep_sendmsg(fd, msg, flags); - sqe.user_data = user_data; - return sqe; -} + try testing.expectEqual(Cqe{ + .user_data = 0x45454545, + .res = @as(i32, @intCast(buffers[0].len)), + .flags = .{}, + }, cqe_write); + try testing.expectEqual(Cqe{ + .user_data = 0x12121212, + .res = @as(i32, @intCast(buffers[1].len)), + .flags = .{}, + }, cqe_read); -/// Queues (but does not submit) an SQE to perform an async zerocopy `sendmsg(2)`. 
-/// Returns a pointer to the SQE. -/// Available since 6.1 -pub fn sendmsg_zc( - self: *IoUring, - user_data: u64, - fd: linux.fd_t, - msg: *const linux.msghdr_const, - flags: u32, -) !*Sqe { - const sqe = try self.get_sqe(); - sqe.prep_sendmsg_zc(fd, msg, flags); - sqe.user_data = user_data; - return sqe; + try testing.expectEqualSlices(u8, "\x00\x00\x00", buffers[1].base[0..3]); + try testing.expectEqualSlices(u8, "foobar", buffers[1].base[3..9]); + try testing.expectEqualSlices(u8, "zz", buffers[1].base[9..11]); } -/// Queues (but does not submit) an SQE to perform an `openat(2)`. -/// Returns a pointer to the SQE. -/// Available since 5.6. -pub fn openat( - self: *IoUring, - user_data: u64, - fd: linux.fd_t, - path: [*:0]const u8, - flags: linux.O, - mode: posix.mode_t, -) !*Sqe { - const sqe = try self.get_sqe(); - sqe.prep_openat(fd, path, flags, mode); - sqe.user_data = user_data; - return sqe; -} +test "openat" { + if (!is_linux) return error.SkipZigTest; -/// Queues an openat using direct (registered) file descriptors. -/// -/// To use an accept direct variant, the application must first have registered -/// a file table (with register_files). An unused table index will be -/// dynamically chosen and returned in the CQE res field. -/// -/// After creation, they can be used by setting IOSQE_FIXED_FILE in the SQE -/// flags member, and setting the SQE fd field to the direct descriptor value -/// rather than the regular file descriptor. -/// -/// Available since 5.15 -pub fn openat_direct( - self: *IoUring, - user_data: u64, - fd: linux.fd_t, - path: [*:0]const u8, - flags: linux.O, - mode: posix.mode_t, - file_index: u32, -) !*Sqe { - const sqe = try self.get_sqe(); - sqe.prep_openat_direct(fd, path, flags, mode, file_index); - sqe.user_data = user_data; - return sqe; -} + var ring = IoUring.init(1, .{}) catch |err| switch (err) { + error.SystemOutdated => return error.SkipZigTest, + error.PermissionDenied => return error.SkipZigTest, + else => return err, + }; + defer ring.deinit(); -/// Queues (but does not submit) an SQE to perform a `close(2)`. -/// Returns a pointer to the SQE. -/// Available since 5.6. -pub fn close(self: *IoUring, user_data: u64, fd: posix.fd_t) !*Sqe { - const sqe = try self.get_sqe(); - sqe.prep_close(fd); - sqe.user_data = user_data; - return sqe; -} + var tmp = std.testing.tmpDir(.{}); + defer tmp.cleanup(); -/// Queues close of registered file descriptor. -/// Available since 5.15 -pub fn close_direct(self: *IoUring, user_data: u64, file_index: u32) !*Sqe { - const sqe = try self.get_sqe(); - sqe.prep_close_direct(file_index); - sqe.user_data = user_data; - return sqe; -} + const path = "test_io_uring_openat"; -/// Queues (but does not submit) an SQE to register a timeout operation. -/// Returns a pointer to the SQE. -/// -/// The timeout will complete when either the timeout expires, or after the specified number of -/// events complete (if `count` is greater than `0`). -/// -/// `flags` may be `0` for a relative timeout, or `IORING_TIMEOUT_ABS` for an absolute timeout. -/// -/// The completion event result will be `-ETIME` if the timeout completed through expiration, -/// `0` if the timeout completed after the specified number of events, or `-ECANCELED` if the -/// timeout was removed before it expired. -/// -/// io_uring timeouts use the `CLOCK.MONOTONIC` clock source. 
-pub fn timeout( - self: *IoUring, - user_data: u64, - ts: *const linux.kernel_timespec, - count: u32, - flags: uflags.Timeout, -) !*Sqe { - const sqe = try self.get_sqe(); - sqe.prep_timeout(ts, count, flags); - sqe.user_data = user_data; - return sqe; -} + // Workaround for LLVM bug: https://github.com/ziglang/zig/issues/12014 + const path_addr = if (builtin.zig_backend == .stage2_llvm) p: { + var workaround = path; + _ = &workaround; + break :p @intFromPtr(workaround); + } else @intFromPtr(path); + + const flags: linux.O = .{ .CLOEXEC = true, .ACCMODE = .RDWR, .CREAT = true }; + const mode: posix.mode_t = 0o666; + const sqe_openat = try ring.openat(0x33333333, tmp.dir.fd, path, flags, mode); + try testing.expectEqual(Sqe{ + .opcode = .OPENAT, + .flags = .{}, + .ioprio = .init_empty(), + .fd = tmp.dir.fd, + .off = 0, + .addr = path_addr, + .len = mode, + .rw_flags = @bitCast(flags), + .user_data = 0x33333333, + .buf_index = 0, + .personality = 0, + .splice_fd_in = 0, + .addr3 = 0, + .resv = 0, + }, sqe_openat.*); + try testing.expectEqual(@as(u32, 1), try ring.submit()); -/// Queues (but does not submit) an SQE to remove an existing timeout operation. -/// Returns a pointer to the SQE. -/// -/// The timeout is identified by its `user_data`. -/// -/// The completion event result will be `0` if the timeout was found and canceled successfully, -/// `-EBUSY` if the timeout was found but expiration was already in progress, or -/// `-ENOENT` if the timeout was not found. -pub fn timeout_remove( - self: *IoUring, - user_data: u64, - timeout_user_data: u64, - flags: u32, -) !*Sqe { - const sqe = try self.get_sqe(); - sqe.prep_timeout_remove(timeout_user_data, flags); - sqe.user_data = user_data; - return sqe; -} + const cqe_openat = try ring.copy_cqe(); + try testing.expectEqual(@as(u64, 0x33333333), cqe_openat.user_data); + if (cqe_openat.err() == .INVAL) return error.SkipZigTest; + if (cqe_openat.err() == .BADF) return error.SkipZigTest; + if (cqe_openat.res <= 0) std.debug.print("\ncqe_openat.res={}\n", .{cqe_openat.res}); + try testing.expect(cqe_openat.res > 0); + try testing.expectEqual(@as(Cqe.Flags, @bitCast(@as(u32, 0))), cqe_openat.flags); -/// Queues (but does not submit) an SQE to add a link timeout operation. -/// Returns a pointer to the SQE. -/// -/// You need to set linux.IOSQE_IO_LINK to flags of the target operation -/// and then call this method right after the target operation. -/// See https://lwn.net/Articles/803932/ for detail. -/// -/// If the dependent request finishes before the linked timeout, the timeout -/// is canceled. If the timeout finishes before the dependent request, the -/// dependent request will be canceled. -/// -/// The completion event result of the link_timeout will be -/// `-ETIME` if the timeout finishes before the dependent request -/// (in this case, the completion event result of the dependent request will -/// be `-ECANCELED`), or -/// `-EALREADY` if the dependent request finishes before the linked timeout. -pub fn link_timeout( - self: *IoUring, - user_data: u64, - ts: *const linux.kernel_timespec, - flags: u32, -) !*Sqe { - const sqe = try self.get_sqe(); - sqe.prep_link_timeout(ts, flags); - sqe.user_data = user_data; - return sqe; + posix.close(cqe_openat.res); } -/// Queues (but does not submit) an SQE to perform a `poll(2)`. -/// Returns a pointer to the SQE. 
-pub fn poll_add( - self: *IoUring, - user_data: u64, - fd: linux.fd_t, - poll_mask: u32, -) !*Sqe { - const sqe = try self.get_sqe(); - sqe.prep_poll_add(fd, poll_mask); - sqe.user_data = user_data; - return sqe; -} +test "close" { + if (!is_linux) return error.SkipZigTest; -/// Queues (but does not submit) an SQE to remove an existing poll operation. -/// Returns a pointer to the SQE. -pub fn poll_remove( - self: *IoUring, - user_data: u64, - target_user_data: u64, -) !*Sqe { - const sqe = try self.get_sqe(); - sqe.prep_poll_remove(target_user_data); - sqe.user_data = user_data; - return sqe; -} + var ring = IoUring.init(1, .{}) catch |err| switch (err) { + error.SystemOutdated => return error.SkipZigTest, + error.PermissionDenied => return error.SkipZigTest, + else => return err, + }; + defer ring.deinit(); -/// Queues (but does not submit) an SQE to update the user data of an existing poll -/// operation. Returns a pointer to the SQE. -pub fn poll_update( - self: *IoUring, - user_data: u64, - old_user_data: u64, - new_user_data: u64, - poll_mask: u32, - flags: u32, -) !*Sqe { - const sqe = try self.get_sqe(); - sqe.prep_poll_update(old_user_data, new_user_data, poll_mask, flags); - sqe.user_data = user_data; - return sqe; -} + var tmp = std.testing.tmpDir(.{}); + defer tmp.cleanup(); -/// Queues (but does not submit) an SQE to perform an `fallocate(2)`. -/// Returns a pointer to the SQE. -pub fn fallocate( - self: *IoUring, - user_data: u64, - fd: linux.fd_t, - mode: i32, - offset: u64, - len: u64, -) !*Sqe { - const sqe = try self.get_sqe(); - sqe.prep_fallocate(fd, mode, offset, len); - sqe.user_data = user_data; - return sqe; -} + const path = "test_io_uring_close"; + const file = try tmp.dir.createFile(path, .{}); + errdefer file.close(); -/// Queues (but does not submit) an SQE to perform an `statx(2)`. -/// Returns a pointer to the SQE. -pub fn statx( - self: *IoUring, - user_data: u64, - fd: linux.fd_t, - path: [:0]const u8, - flags: u32, - mask: u32, - buf: *linux.Statx, -) !*Sqe { - const sqe = try self.get_sqe(); - sqe.prep_statx(fd, path, flags, mask, buf); - sqe.user_data = user_data; - return sqe; -} + const sqe_close = try ring.close(0x44444444, file.handle); + try testing.expectEqual(Op.CLOSE, sqe_close.opcode); + try testing.expectEqual(file.handle, sqe_close.fd); + try testing.expectEqual(@as(u32, 1), try ring.submit()); -/// Queues (but does not submit) an SQE to remove an existing operation. -/// Returns a pointer to the SQE. -/// -/// The operation is identified by its `user_data`. -/// -/// The completion event result will be `0` if the operation was found and canceled successfully, -/// `-EALREADY` if the operation was found but was already in progress, or -/// `-ENOENT` if the operation was not found. -pub fn cancel( - self: *IoUring, - user_data: u64, - cancel_user_data: u64, - flags: u32, -) !*Sqe { - const sqe = try self.get_sqe(); - sqe.prep_cancel(cancel_user_data, flags); - sqe.user_data = user_data; - return sqe; + const cqe_close = try ring.copy_cqe(); + if (cqe_close.err() == .INVAL) return error.SkipZigTest; + try testing.expectEqual(Cqe{ + .user_data = 0x44444444, + .res = 0, + .flags = .{}, + }, cqe_close); } -/// Queues (but does not submit) an SQE to perform a `shutdown(2)`. -/// Returns a pointer to the SQE. -/// -/// The operation is identified by its `user_data`. 
-pub fn shutdown(
- self: *IoUring,
- user_data: u64,
- sockfd: posix.socket_t,
- how: u32,
-) !*Sqe {
- const sqe = try self.get_sqe();
- sqe.prep_shutdown(sockfd, how);
- sqe.user_data = user_data;
- return sqe;
-}
+test "accept/connect/send/recv" {
+ if (!is_linux) return error.SkipZigTest;

-/// Queues (but does not submit) an SQE to perform a `renameat2(2)`.
-/// Returns a pointer to the SQE.
-pub fn renameat(
- self: *IoUring,
- user_data: u64,
- old_dir_fd: linux.fd_t,
- old_path: [*:0]const u8,
- new_dir_fd: linux.fd_t,
- new_path: [*:0]const u8,
- flags: u32,
-) !*Sqe {
- const sqe = try self.get_sqe();
- sqe.prep_renameat(old_dir_fd, old_path, new_dir_fd, new_path, flags);
- sqe.user_data = user_data;
- return sqe;
-}
+ var ring = IoUring.init(16, .{}) catch |err| switch (err) {
+ error.SystemOutdated => return error.SkipZigTest,
+ error.PermissionDenied => return error.SkipZigTest,
+ else => return err,
+ };
+ defer ring.deinit();

-/// Queues (but does not submit) an SQE to perform a `unlinkat(2)`.
-/// Returns a pointer to the SQE.
-pub fn unlinkat(
- self: *IoUring,
- user_data: u64,
- dir_fd: linux.fd_t,
- path: [*:0]const u8,
- flags: u32,
-) !*Sqe {
- const sqe = try self.get_sqe();
- sqe.prep_unlinkat(dir_fd, path, flags);
- sqe.user_data = user_data;
- return sqe;
-}
+ const socket_test_harness = try createSocketTestHarness(&ring);
+ defer socket_test_harness.close();

-/// Queues (but does not submit) an SQE to perform a `mkdirat(2)`.
-/// Returns a pointer to the SQE.
-pub fn mkdirat(
- self: *IoUring,
- user_data: u64,
- dir_fd: linux.fd_t,
- path: [*:0]const u8,
- mode: posix.mode_t,
-) !*Sqe {
- const sqe = try self.get_sqe();
- sqe.prep_mkdirat(dir_fd, path, mode);
- sqe.user_data = user_data;
- return sqe;
-}
+ const buffer_send = [_]u8{ 1, 0, 1, 0, 1, 0, 1, 0, 1, 0 };
+ var buffer_recv = [_]u8{ 0, 1, 0, 1, 0 };
+
+ const sqe_send = try ring.send(0xeeeeeeee, socket_test_harness.client, buffer_send[0..], 0);
+ sqe_send.flags.IO_LINK = true;
+ _ = try ring.recv(0xffffffff, socket_test_harness.server, .{ .buffer = buffer_recv[0..] }, 0);
+ try testing.expectEqual(@as(u32, 2), try ring.submit());
+
+ const cqe_send = try ring.copy_cqe();
+ if (cqe_send.err() == .INVAL) return error.SkipZigTest;
+ try testing.expectEqual(Cqe{
+ .user_data = 0xeeeeeeee,
+ .res = buffer_send.len,
+ .flags = .{},
+ }, cqe_send);
+
+ const cqe_recv = try ring.copy_cqe();
+ if (cqe_recv.err() == .INVAL) return error.SkipZigTest;
+ try testing.expectEqual(Cqe{
+ .user_data = 0xffffffff,
+ .res = buffer_recv.len,
+ // ignore F_SOCK_NONEMPTY since it is only set on some systems
+ .flags = .{ .F_SOCK_NONEMPTY = cqe_recv.flags.F_SOCK_NONEMPTY },
+ }, cqe_recv);

-/// Queues (but does not submit) an SQE to perform a `symlinkat(2)`.
-/// Returns a pointer to the SQE.
-pub fn symlinkat(
- self: *IoUring,
- user_data: u64,
- target: [*:0]const u8,
- new_dir_fd: linux.fd_t,
- link_path: [*:0]const u8,
-) !*Sqe {
- const sqe = try self.get_sqe();
- sqe.prep_symlinkat(target, new_dir_fd, link_path);
- sqe.user_data = user_data;
- return sqe;
+ try testing.expectEqualSlices(u8, buffer_send[0..buffer_recv.len], buffer_recv[0..]);
}

-/// Queues (but does not submit) an SQE to perform a `linkat(2)`.
-/// Returns a pointer to the SQE.
-pub fn linkat( - self: *IoUring, - user_data: u64, - old_dir_fd: linux.fd_t, - old_path: [*:0]const u8, - new_dir_fd: linux.fd_t, - new_path: [*:0]const u8, - flags: u32, -) !*Sqe { - const sqe = try self.get_sqe(); - sqe.prep_linkat(old_dir_fd, old_path, new_dir_fd, new_path, flags); - sqe.user_data = user_data; - return sqe; -} +test "sendmsg/recvmsg" { + if (!is_linux) return error.SkipZigTest; -/// Queues (but does not submit) an SQE to provide a group of buffers used for commands that read/receive data. -/// Returns a pointer to the SQE. -/// -/// Provided buffers can be used in `read`, `recv` or `recvmsg` commands via .buffer_selection. -/// -/// The kernel expects a contiguous block of memory of size (buffers_count * buffer_size). -pub fn provide_buffers( - self: *IoUring, - user_data: u64, - buffers: [*]u8, - buffer_size: usize, - buffers_count: usize, - group_id: usize, - buffer_id: usize, -) !*Sqe { - const sqe = try self.get_sqe(); - sqe.prep_provide_buffers(buffers, buffer_size, buffers_count, group_id, buffer_id); - sqe.user_data = user_data; - return sqe; -} + var ring = IoUring.init(2, .{}) catch |err| switch (err) { + error.SystemOutdated => return error.SkipZigTest, + error.PermissionDenied => return error.SkipZigTest, + else => return err, + }; + defer ring.deinit(); -/// Queues (but does not submit) an SQE to remove a group of provided buffers. -/// Returns a pointer to the SQE. -pub fn remove_buffers( - self: *IoUring, - user_data: u64, - buffers_count: usize, - group_id: usize, -) !*Sqe { - const sqe = try self.get_sqe(); - sqe.prep_remove_buffers(buffers_count, group_id); - sqe.user_data = user_data; - return sqe; -} + var address_server: linux.sockaddr.in = .{ + .port = 0, + .addr = @bitCast([4]u8{ 127, 0, 0, 1 }), + }; -/// Queues (but does not submit) an SQE to perform a `waitid(2)`. -/// Returns a pointer to the SQE. -pub fn waitid( - self: *IoUring, - user_data: u64, - id_type: linux.P, - id: i32, - infop: *linux.siginfo_t, - options: linux.W, - flags: u32, -) !*Sqe { - const sqe = try self.get_sqe(); - sqe.prep_waitid(id_type, id, infop, options, flags); - sqe.user_data = user_data; - return sqe; -} + const server = try posix.socket(address_server.family, posix.SOCK.DGRAM, 0); + defer posix.close(server); + try posix.setsockopt(server, posix.SOL.SOCKET, posix.SO.REUSEPORT, &mem.toBytes(@as(c_int, 1))); + try posix.setsockopt(server, posix.SOL.SOCKET, posix.SO.REUSEADDR, &mem.toBytes(@as(c_int, 1))); + try posix.bind(server, addrAny(&address_server), @sizeOf(linux.sockaddr.in)); -/// Registers an array of file descriptors. -/// Every time a file descriptor is put in an SQE and submitted to the kernel, the kernel must -/// retrieve a reference to the file, and once I/O has completed the file reference must be -/// dropped. The atomic nature of this file reference can be a slowdown for high IOPS workloads. -/// This slowdown can be avoided by pre-registering file descriptors. -/// To refer to a registered file descriptor, IOSQE_FIXED_FILE must be set in the SQE's flags, -/// and the SQE's fd must be set to the index of the file descriptor in the registered array. -/// Registering file descriptors will wait for the ring to idle. -/// Files are automatically unregistered by the kernel when the ring is torn down. -/// An application need unregister only if it wants to register a new array of file descriptors. 
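// Example (a minimal sketch, assuming `ring`, an open `fd` and a buffer `buf`):
// after registration, set FIXED_FILE on the SQE and pass the table index (here 0)
// where the regular file descriptor would normally go:
//
//     var fds = [_]linux.fd_t{fd};
//     try ring.register_files(fds[0..]);
//     const sqe = try ring.read(0x2, 0, .{ .buffer = &buf }, 0); // fd 0 == table index
//     sqe.flags.FIXED_FILE = true;
//     _ = try ring.submit();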
-pub fn register_files(self: *IoUring, fds: []const linux.fd_t) !void { - assert(self.fd >= 0); - const res = linux.io_uring_register( - self.fd, - .REGISTER_FILES, - @as(*const anyopaque, @ptrCast(fds.ptr)), - @as(u32, @intCast(fds.len)), - ); - try handle_registration_result(res); -} + // set address_server to the OS-chosen IP/port. + var slen: posix.socklen_t = @sizeOf(linux.sockaddr.in); + try posix.getsockname(server, addrAny(&address_server), &slen); -/// Updates registered file descriptors. -/// -/// Updates are applied starting at the provided offset in the original file descriptors slice. -/// There are three kind of updates: -/// * turning a sparse entry (where the fd is -1) into a real one -/// * removing an existing entry (set the fd to -1) -/// * replacing an existing entry with a new fd -/// Adding new file descriptors must be done with `register_files`. -pub fn register_files_update(self: *IoUring, offset: u32, fds: []const linux.fd_t) !void { - assert(self.fd >= 0); + const client = try posix.socket(address_server.family, posix.SOCK.DGRAM, 0); + defer posix.close(client); - const FilesUpdate = extern struct { - offset: u32, - resv: u32, - fds: u64 align(8), + const buffer_send = [_]u8{42} ** 128; + const iovecs_send = [_]posix.iovec_const{ + posix.iovec_const{ .base = &buffer_send, .len = buffer_send.len }, }; - var update: FilesUpdate = .{ - .offset = offset, - .resv = @as(u32, 0), - .fds = @as(u64, @intFromPtr(fds.ptr)), + const msg_send: linux.msghdr_const = .{ + .name = addrAny(&address_server), + .namelen = @sizeOf(linux.sockaddr.in), + .iov = &iovecs_send, + .iovlen = 1, + .control = null, + .controllen = 0, + .flags = 0, }; + const sqe_sendmsg = try ring.sendmsg(0x11111111, client, &msg_send, 0); + sqe_sendmsg.flags.IO_LINK = true; + try testing.expectEqual(Op.SENDMSG, sqe_sendmsg.opcode); + try testing.expectEqual(client, sqe_sendmsg.fd); - const res = linux.io_uring_register( - self.fd, - .REGISTER_FILES_UPDATE, - @as(*const anyopaque, @ptrCast(&update)), - @as(u32, @intCast(fds.len)), - ); - try handle_registration_result(res); -} + var buffer_recv = [_]u8{0} ** 128; + var iovecs_recv = [_]posix.iovec{ + posix.iovec{ .base = &buffer_recv, .len = buffer_recv.len }, + }; + var address_recv: linux.sockaddr.in = .{ + .port = 0, + .addr = 0, + }; + var msg_recv: linux.msghdr = .{ + .name = addrAny(&address_recv), + .namelen = @sizeOf(linux.sockaddr.in), + .iov = &iovecs_recv, + .iovlen = 1, + .control = null, + .controllen = 0, + .flags = 0, + }; + const sqe_recvmsg = try ring.recvmsg(0x22222222, server, &msg_recv, 0); + try testing.expectEqual(Op.RECVMSG, sqe_recvmsg.opcode); + try testing.expectEqual(server, sqe_recvmsg.fd); -/// Registers an empty (-1) file table of `nr_files` number of file descriptors. 
-pub fn register_files_sparse(self: *IoUring, nr_files: u32) !void {
- assert(self.fd >= 0);

- const reg: RsrcRegister = .{
- .nr = nr_files,
- .flags = constants.RSRC_REGISTER_SPARSE,
- .resv2 = 0,
- .data = 0,
- .tags = 0,
- };
+ try testing.expectEqual(@as(u32, 2), ring.sq_ready());
+ try testing.expectEqual(@as(u32, 2), try ring.submit_and_wait(2));
+ try testing.expectEqual(@as(u32, 0), ring.sq_ready());
+ try testing.expectEqual(@as(u32, 2), ring.cq_ready());

- const res = linux.io_uring_register(
- self.fd,
- .REGISTER_FILES2,
- @ptrCast(&reg),
- @as(u32, @sizeOf(linux.io_uring_rsrc_register)),
- );
+ const cqe_sendmsg = try ring.copy_cqe();
+ if (cqe_sendmsg.res == -@as(i32, @intFromEnum(linux.E.INVAL))) return error.SkipZigTest;
+ try testing.expectEqual(Cqe{
+ .user_data = 0x11111111,
+ .res = buffer_send.len,
+ .flags = .{},
+ }, cqe_sendmsg);

+ const cqe_recvmsg = try ring.copy_cqe();
+ if (cqe_recvmsg.res == -@as(i32, @intFromEnum(linux.E.INVAL))) return error.SkipZigTest;
+ try testing.expectEqual(Cqe{
+ .user_data = 0x22222222,
+ .res = buffer_recv.len,
+ // ignore F_SOCK_NONEMPTY since it is set non-deterministically
+ .flags = .{ .F_SOCK_NONEMPTY = cqe_recvmsg.flags.F_SOCK_NONEMPTY },
+ }, cqe_recvmsg);

- return handle_registration_result(res);
+ try testing.expectEqualSlices(u8, buffer_send[0..buffer_recv.len], buffer_recv[0..]);
}

-// Registers range for fixed file allocations.
-// Available since 6.0
-pub fn register_file_alloc_range(self: *IoUring, offset: u32, len: u32) !void {
- assert(self.fd >= 0);
+test "timeout (after a relative time)" {
+ if (!is_linux) return error.SkipZigTest;
+ const io = std.testing.io;

- const range: FileIndexRange = .{
- .off = offset,
- .len = len,
- .resv = 0,
+ var ring = IoUring.init(1, .{}) catch |err| switch (err) {
+ error.SystemOutdated => return error.SkipZigTest,
+ error.PermissionDenied => return error.SkipZigTest,
+ else => return err,
 };
+ defer ring.deinit();

- const res = linux.io_uring_register(
- self.fd,
- .REGISTER_FILE_ALLOC_RANGE,
- @ptrCast(&range),
- @as(u32, @sizeOf(linux.io_uring_file_index_range)),
- );
+ const ms = 10;
+ const margin = 5;
+ const ts: linux.kernel_timespec = .{ .sec = 0, .nsec = ms * 1000000 };
+
+ const started = try std.Io.Clock.awake.now(io);
+ const sqe = try ring.timeout(0x55555555, &ts, 0, .{});
+ try testing.expectEqual(Op.TIMEOUT, sqe.opcode);
+ try testing.expectEqual(@as(u32, 1), try ring.submit());
+ const cqe = try ring.copy_cqe();
+ const stopped = try std.Io.Clock.awake.now(io);
+
+ try testing.expectEqual(Cqe{
+ .user_data = 0x55555555,
+ .res = -@as(i32, @intFromEnum(linux.E.TIME)),
+ .flags = .{},
+ }, cqe);

- return handle_registration_result(res);
+ // Tests should not depend on timings: skip test if the elapsed time deviates from the expected `ms` by more than `margin`.
+ const ms_elapsed = started.durationTo(stopped).toMilliseconds();
+ if (ms_elapsed < ms - margin or ms_elapsed > ms + margin) return error.SkipZigTest;
}

-/// Registers the file descriptor for an eventfd that will be notified of completion events on
-/// an io_uring instance.
-/// Only a single a eventfd can be registered at any given point in time.
-pub fn register_eventfd(self: *IoUring, fd: linux.fd_t) !void { - assert(self.fd >= 0); - const res = linux.io_uring_register( - self.fd, - .REGISTER_EVENTFD, - @as(*const anyopaque, @ptrCast(&fd)), - 1, - ); - try handle_registration_result(res); -} +test "timeout (after a number of completions)" { + if (!is_linux) return error.SkipZigTest; -/// Registers the file descriptor for an eventfd that will be notified of completion events on -/// an io_uring instance. Notifications are only posted for events that complete in an async manner. -/// This means that events that complete inline while being submitted do not trigger a notification event. -/// Only a single eventfd can be registered at any given point in time. -pub fn register_eventfd_async(self: *IoUring, fd: linux.fd_t) !void { - assert(self.fd >= 0); - const res = linux.io_uring_register( - self.fd, - .REGISTER_EVENTFD_ASYNC, - @as(*const anyopaque, @ptrCast(&fd)), - 1, - ); - try handle_registration_result(res); -} + var ring = IoUring.init(2, .{}) catch |err| switch (err) { + error.SystemOutdated => return error.SkipZigTest, + error.PermissionDenied => return error.SkipZigTest, + else => return err, + }; + defer ring.deinit(); -/// Unregister the registered eventfd file descriptor. -pub fn unregister_eventfd(self: *IoUring) !void { - assert(self.fd >= 0); - const res = linux.io_uring_register( - self.fd, - .UNREGISTER_EVENTFD, - null, - 0, - ); - try handle_registration_result(res); -} + const ts: linux.kernel_timespec = .{ .sec = 3, .nsec = 0 }; + const count_completions: u64 = 1; + const sqe_timeout = try ring.timeout(0x66666666, &ts, count_completions, .{}); + try testing.expectEqual(Op.TIMEOUT, sqe_timeout.opcode); + try testing.expectEqual(count_completions, sqe_timeout.off); + _ = try ring.nop(0x77777777); + try testing.expectEqual(@as(u32, 2), try ring.submit()); -pub fn register_napi(self: *IoUring, napi: *Napi) !void { - assert(self.fd >= 0); - const res = linux.io_uring_register(self.fd, .REGISTER_NAPI, napi, 1); - try handle_registration_result(res); -} + const cqe_nop = try ring.copy_cqe(); + try testing.expectEqual(Cqe{ + .user_data = 0x77777777, + .res = 0, + .flags = .{}, + }, cqe_nop); -pub fn unregister_napi(self: *IoUring, napi: *Napi) !void { - assert(self.fd >= 0); - const res = linux.io_uring_register(self.fd, .UNREGISTER_NAPI, napi, 1); - try handle_registration_result(res); + const cqe_timeout = try ring.copy_cqe(); + try testing.expectEqual(Cqe{ + .user_data = 0x66666666, + .res = 0, + .flags = .{}, + }, cqe_timeout); } -/// Registers an array of buffers for use with `read_fixed` and `write_fixed`. -pub fn register_buffers(self: *IoUring, buffers: []const posix.iovec) !void { - assert(self.fd >= 0); - const res = linux.io_uring_register( - self.fd, - .REGISTER_BUFFERS, - buffers.ptr, - @as(u32, @intCast(buffers.len)), - ); - try handle_registration_result(res); -} +test "timeout_remove" { + if (!is_linux) return error.SkipZigTest; -/// Unregister the registered buffers. 
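// Example (a minimal sketch, assuming `ring` and a static buffer): registering
// a single buffer for fixed I/O; read_fixed/write_fixed then select it via the
// SQE's buf_index:
//
//     var buf: [4096]u8 = undefined;
//     const iovecs = [_]posix.iovec{.{ .base = &buf, .len = buf.len }};
//     try ring.register_buffers(iovecs[0..]);
//     defer ring.unregister_buffers() catch {};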
-pub fn unregister_buffers(self: *IoUring) !void { - assert(self.fd >= 0); - const res = linux.io_uring_register(self.fd, .UNREGISTER_BUFFERS, null, 0); - switch (linux.errno(res)) { - .SUCCESS => {}, - .NXIO => return error.BuffersNotRegistered, - else => |errno| return posix.unexpectedErrno(errno), - } -} + var ring = IoUring.init(2, .{}) catch |err| switch (err) { + error.SystemOutdated => return error.SkipZigTest, + error.PermissionDenied => return error.SkipZigTest, + else => return err, + }; + defer ring.deinit(); -/// Returns a Probe which is used to probe the capabilities of the -/// io_uring subsystem of the running kernel. The Probe contains the -/// list of supported operations. -pub fn get_probe(self: *IoUring) !Probe { - var probe = mem.zeroInit(Probe, .{}); - const res = linux.io_uring_register(self.fd, .REGISTER_PROBE, &probe, probe.ops.len); - try handle_register_buf_ring_result(res); - return probe; -} + const ts: linux.kernel_timespec = .{ .sec = 3, .nsec = 0 }; + const sqe_timeout = try ring.timeout(0x88888888, &ts, 0, .{}); + try testing.expectEqual(Op.TIMEOUT, sqe_timeout.opcode); + try testing.expectEqual(@as(u64, 0x88888888), sqe_timeout.user_data); -fn handle_registration_result(res: usize) !void { - switch (linux.errno(res)) { - .SUCCESS => {}, - // One or more fds in the array are invalid, or the kernel does not support sparse sets: - .BADF => return error.FileDescriptorInvalid, - .BUSY => return error.FilesAlreadyRegistered, - .INVAL => return error.FilesEmpty, - // Adding `nr_args` file references would exceed the maximum allowed number of files the - // user is allowed to have according to the per-user RLIMIT_NOFILE resource limit and - // the CAP_SYS_RESOURCE capability is not set, or `nr_args` exceeds the maximum allowed - // for a fixed file set (older kernels have a limit of 1024 files vs 64K files): - .MFILE => return error.UserFdQuotaExceeded, - // Insufficient kernel resources, or the caller had a non-zero RLIMIT_MEMLOCK soft - // resource limit but tried to lock more memory than the limit permitted (not enforced - // when the process is privileged with CAP_IPC_LOCK): - .NOMEM => return error.SystemResources, - // Attempt to register files on a ring already registering files or being torn down: - .NXIO => return error.RingShuttingDownOrAlreadyRegisteringFiles, - else => |errno| return posix.unexpectedErrno(errno), - } -} + const sqe_timeout_remove = try ring.timeout_remove(0x99999999, 0x88888888, .{}); + try testing.expectEqual(Op.TIMEOUT_REMOVE, sqe_timeout_remove.opcode); + try testing.expectEqual(@as(u64, 0x88888888), sqe_timeout_remove.addr); + try testing.expectEqual(@as(u64, 0x99999999), sqe_timeout_remove.user_data); -/// Unregisters all registered file descriptors previously associated with the ring. -pub fn unregister_files(self: *IoUring) !void { - assert(self.fd >= 0); - const res = linux.io_uring_register(self.fd, .UNREGISTER_FILES, null, 0); - switch (linux.errno(res)) { - .SUCCESS => {}, - .NXIO => return error.FilesNotRegistered, - else => |errno| return posix.unexpectedErrno(errno), - } -} + try testing.expectEqual(@as(u32, 2), try ring.submit()); -/// Prepares a socket creation request. -/// New socket fd will be returned in completion result. 
-/// Available since 5.19
-pub fn socket(
- self: *IoUring,
- user_data: u64,
- domain: linux.AF,
- socket_type: linux.SOCK,
- protocol: u32,
- flags: u32,
-) !*Sqe {
- const sqe = try self.get_sqe();
- sqe.prep_socket(domain, socket_type, protocol, flags);
- sqe.user_data = user_data;
- return sqe;
-}
+ // The order in which the CQEs arrive is not clearly documented and it changed with kernel 5.18:
+ // * kernel 5.10 gives user data 0x88888888 first, 0x99999999 second
+ // * kernel 5.18 gives user data 0x99999999 first, 0x88888888 second

-/// Prepares a socket creation request for registered file at index `file_index`.
-/// Available since 5.19
-pub fn socket_direct(
- self: *IoUring,
- user_data: u64,
- domain: u32,
- socket_type: u32,
- protocol: u32,
- flags: u32,
- file_index: u32,
-) !*Sqe {
- const sqe = try self.get_sqe();
- sqe.prep_socket_direct(domain, socket_type, protocol, flags, file_index);
- sqe.user_data = user_data;
- return sqe;
-}
+ var cqes: [2]Cqe = undefined;
+ cqes[0] = try ring.copy_cqe();
+ cqes[1] = try ring.copy_cqe();

-/// Prepares a socket creation request for registered file, index chosen by kernel (file index alloc).
-/// File index will be returned in CQE res field.
-/// Available since 5.19
-pub fn socket_direct_alloc(
- self: *IoUring,
- user_data: u64,
- domain: u32,
- socket_type: u32,
- protocol: u32,
- flags: u32,
-) !*Sqe {
- const sqe = try self.get_sqe();
- sqe.prep_socket_direct_alloc(domain, socket_type, protocol, flags);
- sqe.user_data = user_data;
- return sqe;
-}
+ for (cqes) |cqe| {
+ // IORING_OP_TIMEOUT_REMOVE is not supported by this kernel version:
+ // Timeout remove operations set the fd to -1, which results in EBADF before EINVAL.
+ // We use the IORING_FEAT_RW_CUR_POS feature bit (introduced in 5.6) as a safety check to detect pre-5.6 kernels.
+ // We don't want to skip this test for newer kernels.
+ if (cqe.user_data == 0x99999999 and
+ cqe.err() == .BADF and
+ (!ring.features.RW_CUR_POS))
+ {
+ return error.SkipZigTest;
+ }

-/// Queues (but does not submit) an SQE to perform an `bind(2)` on a socket.
-/// Returns a pointer to the SQE.
-/// Available since 6.11
-pub fn bind(
- self: *IoUring,
- user_data: u64,
- fd: linux.fd_t,
- addr: *const posix.sockaddr,
- addrlen: posix.socklen_t,
- flags: u32,
-) !*Sqe {
- const sqe = try self.get_sqe();
- sqe.prep_bind(fd, addr, addrlen, flags);
- sqe.user_data = user_data;
- return sqe;
-}
+ try testing.expect(cqe.user_data == 0x88888888 or cqe.user_data == 0x99999999);

-/// Queues (but does not submit) an SQE to perform an `listen(2)` on a socket.
-/// Returns a pointer to the SQE.
-/// Available since 6.11
-pub fn listen(
- self: *IoUring,
- user_data: u64,
- fd: linux.fd_t,
- backlog: usize,
- flags: u32,
-) !*Sqe {
- const sqe = try self.get_sqe();
- sqe.prep_listen(fd, backlog, flags);
- sqe.user_data = user_data;
- return sqe;
+ if (cqe.user_data == 0x88888888) {
+ try testing.expectEqual(Cqe{
+ .user_data = 0x88888888,
+ .res = -@as(i32, @intFromEnum(linux.E.CANCELED)),
+ .flags = .{},
+ }, cqe);
+ } else if (cqe.user_data == 0x99999999) {
+ try testing.expectEqual(Cqe{
+ .user_data = 0x99999999,
+ .res = 0,
+ .flags = .{},
+ }, cqe);
+ }
+ }
}

-/// Prepares a cmd request for a socket.
-/// See: https://man7.org/linux/man-pages/man3/io_uring_prep_cmd.3.html
-/// Available since 6.7.
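// Example (a minimal sketch, assuming `ring` and an open socket `sock`): the
// setsockopt helper defined below wraps the socket cmd request and takes the
// option value as raw bytes:
//
//     const one = mem.toBytes(@as(c_int, 1));
//     _ = try ring.setsockopt(0x3, sock, posix.SOL.SOCKET, posix.SO.REUSEADDR, one[0..]);
//     _ = try ring.submit();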
-pub fn cmd_sock( - self: *IoUring, - user_data: u64, - cmd_op: SocketOp, - fd: linux.fd_t, - level: u32, // linux.SOL - optname: u32, // linux.SO - optval: u64, // pointer to the option value - optlen: u32, // size of the option value -) !*Sqe { - const sqe = try self.get_sqe(); - sqe.prep_cmd_sock(cmd_op, fd, level, optname, optval, optlen); - sqe.user_data = user_data; - return sqe; -} +test "accept/connect/recv/link_timeout" { + if (!is_linux) return error.SkipZigTest; -/// Prepares set socket option for the optname argument, at the protocol -/// level specified by the level argument. -/// Available since 6.7.n -pub fn setsockopt( - self: *IoUring, - user_data: u64, - fd: linux.fd_t, - level: u32, // linux.SOL - optname: u32, // linux.SO - opt: []const u8, -) !*Sqe { - return try self.cmd_sock( - user_data, - .SETSOCKOPT, - fd, - level, - optname, - @intFromPtr(opt.ptr), - @intCast(opt.len), - ); -} + var ring = IoUring.init(16, .{}) catch |err| switch (err) { + error.SystemOutdated => return error.SkipZigTest, + error.PermissionDenied => return error.SkipZigTest, + else => return err, + }; + defer ring.deinit(); -/// Prepares get socket option to retrieve the value for the option specified by -/// the option_name argument for the socket specified by the fd argument. -/// Available since 6.7. -pub fn getsockopt( - self: *IoUring, - user_data: u64, - fd: linux.fd_t, - level: u32, // linux.SOL - optname: u32, // linux.SO - opt: []u8, -) !*Sqe { - return try self.cmd_sock( - user_data, - .GETSOCKOPT, - fd, - level, - optname, - @intFromPtr(opt.ptr), - @intCast(opt.len), - ); -} + const socket_test_harness = try createSocketTestHarness(&ring); + defer socket_test_harness.close(); -/// matches io_uring_sq in liburing -pub const Sq = struct { - head: *u32, - tail: *u32, - mask: u32, - flags: *Flags, - dropped: *u32, - array: []u32, - sqes: []Sqe, - mmap: []align(page_size_min) u8, - mmap_sqes: []align(page_size_min) u8, + var buffer_recv = [_]u8{ 0, 1, 0, 1, 0 }; - // We use `sqe_head` and `sqe_tail` in the same way as liburing: - // We increment `sqe_tail` (but not `tail`) for each call to `get_sqe()`. - // We then set `tail` to `sqe_tail` once, only when these events are actually submitted. - // This allows us to amortize the cost of the @atomicStore to `tail` across multiple SQEs. - sqe_head: u32 = 0, - sqe_tail: u32 = 0, + const sqe_recv = try ring.recv(0xffffffff, socket_test_harness.server, .{ .buffer = buffer_recv[0..] 
}, 0); + sqe_recv.flags.IO_LINK = true; - /// sq_ring.flags - pub const Flags = packed struct(u32) { - /// needs io_uring_enter wakeup - NEED_WAKEUP: bool = false, - /// CQ ring is overflown - CQ_OVERFLOW: bool = false, - /// task should enter the kernel - TASKRUN: bool = false, - _unused: u29 = 0, + const ts = linux.kernel_timespec{ .sec = 0, .nsec = 1000000 }; + _ = try ring.link_timeout(0x22222222, &ts, 0); + + const nr_wait = try ring.submit(); + try testing.expectEqual(@as(u32, 2), nr_wait); + + var i: usize = 0; + while (i < nr_wait) : (i += 1) { + const cqe = try ring.copy_cqe(); + switch (cqe.user_data) { + 0xffffffff => { + if (cqe.res != -@as(i32, @intFromEnum(linux.E.INTR)) and + cqe.res != -@as(i32, @intFromEnum(linux.E.CANCELED))) + { + std.debug.print("Req 0x{x} got {d}\n", .{ cqe.user_data, cqe.res }); + try testing.expect(false); + } + }, + 0x22222222 => { + if (cqe.res != -@as(i32, @intFromEnum(linux.E.ALREADY)) and + cqe.res != -@as(i32, @intFromEnum(linux.E.TIME))) + { + std.debug.print("Req 0x{x} got {d}\n", .{ cqe.user_data, cqe.res }); + try testing.expect(false); + } + }, + else => @panic("should not happen"), + } + } +} + +test "fallocate" { + if (!is_linux) return error.SkipZigTest; + + var ring = IoUring.init(1, .{}) catch |err| switch (err) { + error.SystemOutdated => return error.SkipZigTest, + error.PermissionDenied => return error.SkipZigTest, + else => return err, }; + defer ring.deinit(); - pub fn init(fd: posix.fd_t, p: Params) !Sq { - assert(fd >= 0); - assert(p.features.SINGLE_MMAP); - const size = @max( - p.sq_off.array + p.sq_entries * @sizeOf(u32), - p.cq_off.cqes + p.cq_entries * @sizeOf(Cqe), - ); - const mmap = try posix.mmap( - null, - size, - posix.PROT.READ | posix.PROT.WRITE, - .{ .TYPE = .SHARED, .POPULATE = true }, - fd, - constants.OFF_SQ_RING, - ); - errdefer posix.munmap(mmap); - assert(mmap.len == size); + var tmp = std.testing.tmpDir(.{}); + defer tmp.cleanup(); - // The motivation for the `sqes` and `array` indirection is to make it possible for the - // application to preallocate static io_uring_sqe entries and then replay them when needed. - const size_sqes = p.sq_entries * @sizeOf(Sqe); - const mmap_sqes = try posix.mmap( - null, - size_sqes, - posix.PROT.READ | posix.PROT.WRITE, - .{ .TYPE = .SHARED, .POPULATE = true }, - fd, - constants.OFF_SQES, - ); - errdefer posix.munmap(mmap_sqes); - assert(mmap_sqes.len == size_sqes); + const path = "test_io_uring_fallocate"; + const file = try tmp.dir.createFile(path, .{ .truncate = true, .mode = 0o666 }); + defer file.close(); - const array: [*]u32 = @ptrCast(@alignCast(&mmap[p.sq_off.array])); - const sqes: [*]Sqe = @ptrCast(@alignCast(&mmap_sqes[0])); - // We expect the kernel copies p.sq_entries to the u32 pointed to by p.sq_off.ring_entries, - // see https://github.com/torvalds/linux/blob/v5.8/fs/io_uring.c#L7843-L7844. 
- assert(p.sq_entries == @as(*u32, @ptrCast(@alignCast(&mmap[p.sq_off.ring_entries]))).*); - return .{ - .head = @ptrCast(@alignCast(&mmap[p.sq_off.head])), - .tail = @ptrCast(@alignCast(&mmap[p.sq_off.tail])), - .mask = @as(*u32, @ptrCast(@alignCast(&mmap[p.sq_off.ring_mask]))).*, - .flags = @ptrCast(@alignCast(&mmap[p.sq_off.flags])), - .dropped = @ptrCast(@alignCast(&mmap[p.sq_off.dropped])), - .array = array[0..p.sq_entries], - .sqes = sqes[0..p.sq_entries], - .mmap = mmap, - .mmap_sqes = mmap_sqes, - }; - } + try testing.expectEqual(@as(u64, 0), (try file.stat()).size); - pub fn deinit(self: *Sq) void { - posix.munmap(self.mmap_sqes); - posix.munmap(self.mmap); + const len: u64 = 65536; + const sqe = try ring.fallocate(0xaaaaaaaa, file.handle, 0, 0, len); + try testing.expectEqual(Op.FALLOCATE, sqe.opcode); + try testing.expectEqual(file.handle, sqe.fd); + try testing.expectEqual(@as(u32, 1), try ring.submit()); + + const cqe = try ring.copy_cqe(); + switch (cqe.err()) { + .SUCCESS => {}, + // This kernel's io_uring does not yet implement fallocate(): + .INVAL => return error.SkipZigTest, + // This kernel does not implement fallocate(): + .NOSYS => return error.SkipZigTest, + // The filesystem containing the file referred to by fd does not support this operation; + // or the mode is not supported by the filesystem containing the file referred to by fd: + .OPNOTSUPP => return error.SkipZigTest, + else => |errno| std.debug.panic("unhandled errno: {}", .{errno}), } -}; + try testing.expectEqual(Cqe{ + .user_data = 0xaaaaaaaa, + .res = 0, + .flags = .{}, + }, cqe); -/// matches io_uring_cq in liburing -pub const Cq = struct { - head: *u32, - tail: *u32, - mask: u32, - overflow: *u32, - cqes: []Cqe, + try testing.expectEqual(len, (try file.stat()).size); +} - /// cq_ring.flags - pub const Flags = packed struct(u32) { - /// disable eventfd notifications - EVENTFD_DISABLED: bool = false, - _unused: u31 = 0, +test "statx" { + if (!is_linux) return error.SkipZigTest; + + var ring = IoUring.init(1, .{}) catch |err| switch (err) { + error.SystemOutdated => return error.SkipZigTest, + error.PermissionDenied => return error.SkipZigTest, + else => return err, }; + defer ring.deinit(); + + var tmp = std.testing.tmpDir(.{}); + defer tmp.cleanup(); + const path = "test_io_uring_statx"; + const file = try tmp.dir.createFile(path, .{ .truncate = true, .mode = 0o666 }); + defer file.close(); - pub fn init(fd: posix.fd_t, p: Params, sq: Sq) !Cq { - assert(fd >= 0); - const features: uflags.Features = @bitCast(p.features); - assert(features.SINGLE_MMAP); - const mmap = sq.mmap; - const cqes: [*]Cqe = @ptrCast(@alignCast(&mmap[p.cq_off.cqes])); - assert(p.cq_entries == @as(*u32, @ptrCast(@alignCast(&mmap[p.cq_off.ring_entries]))).*); - return .{ - .head = @ptrCast(@alignCast(&mmap[p.cq_off.head])), - .tail = @ptrCast(@alignCast(&mmap[p.cq_off.tail])), - .mask = @as(*u32, @ptrCast(@alignCast(&mmap[p.cq_off.ring_mask]))).*, - .overflow = @ptrCast(@alignCast(&mmap[p.cq_off.overflow])), - .cqes = cqes[0..p.cq_entries], - }; - } + try testing.expectEqual(@as(u64, 0), (try file.stat()).size); - pub fn deinit(self: *Cq) void { - _ = self; - // A no-op since we now share the mmap with the submission queue. - // Here for symmetry with the submission queue, and for any future feature support. 
+ try file.writeAll("foobar");
+
+ var buf: linux.Statx = undefined;
+ const sqe = try ring.statx(
+ 0xaaaaaaaa,
+ tmp.dir.fd,
+ path,
+ 0,
+ .{ .size = true },
+ &buf,
+ );
+ try testing.expectEqual(Op.STATX, sqe.opcode);
+ try testing.expectEqual(@as(i32, tmp.dir.fd), sqe.fd);
+ try testing.expectEqual(@as(u32, 1), try ring.submit());
+
+ const cqe = try ring.copy_cqe();
+ switch (cqe.err()) {
+ .SUCCESS => {},
+ // This kernel's io_uring does not yet implement statx():
+ .INVAL => return error.SkipZigTest,
+ // This kernel does not implement statx():
+ .NOSYS => return error.SkipZigTest,
+ // The filesystem containing the file referred to by fd does not support this operation;
+ // or the mode is not supported by the filesystem containing the file referred to by fd:
+ .OPNOTSUPP => return error.SkipZigTest,
+ // not supported on older kernels (5.4)
+ .BADF => return error.SkipZigTest,
+ else => |errno| std.debug.panic("unhandled errno: {}", .{errno}),
 }
-};
+ try testing.expectEqual(Cqe{
+ .user_data = 0xaaaaaaaa,
+ .res = 0,
+ .flags = .{},
+ }, cqe);

-/// Group of application provided buffers. Uses newer type, called ring mapped
-/// buffers, supported since kernel 5.19. Buffers are identified by a buffer
-/// group ID, and within that group, a buffer ID. IO_Uring can have multiple
-/// buffer groups, each with unique group ID.
-///
-/// In `init` application provides contiguous block of memory `buffers` for
-/// `buffers_count` buffers of size `buffers_size`. Application can then submit
-/// `recv` operation without providing buffer upfront. Once the operation is
-/// ready to receive data, a buffer is picked automatically and the resulting
-/// CQE will contain the buffer ID in `cqe.buffer_id()`. Use `get` method to get
-/// buffer for buffer ID identified by CQE. Once the application has processed
-/// the buffer, it may hand ownership back to the kernel, by calling `put`
-/// allowing the cycle to repeat.
-///
-/// Depending on the rate of arrival of data, it is possible that a given buffer
-/// group will run out of buffers before those in CQEs can be put back to the
-/// kernel. If this happens, a `cqe.err()` will have ENOBUFS as the error value.
-///
-pub const BufferGroup = struct {
- /// Parent ring for which this group is registered.
- ring: *IoUring,
- /// Pointer to the memory shared by the kernel.
- /// `buffers_count` of `io_uring_buf` structures are shared by the kernel.
- /// First `io_uring_buf` is overlaid by `io_uring_buf_ring` struct.
- br: *align(page_size_min) BufferRing,
- /// Contiguous block of memory of size (buffers_count * buffer_size).
- buffers: []u8,
- /// Size of each buffer in buffers.
- buffer_size: u32,
- /// Number of buffers in `buffers`, number of `io_uring_buf structures` in br.
- buffers_count: u16,
- /// Head of unconsumed part of each buffer, if incremental consumption is enabled
- heads: []u32,
- /// ID of this group, must be unique in ring.
- group_id: u16, + try testing.expect(buf.mask & linux.STATX_SIZE == linux.STATX_SIZE); + try testing.expectEqual(@as(u64, 6), buf.size); +} - pub fn init( - ring: *IoUring, - allocator: mem.Allocator, - group_id: u16, - buffer_size: u32, - buffers_count: u16, - ) !BufferGroup { - const buffers = try allocator.alloc(u8, buffer_size * buffers_count); - errdefer allocator.free(buffers); - const heads = try allocator.alloc(u32, buffers_count); - errdefer allocator.free(heads); +test "accept/connect/recv/cancel" { + if (!is_linux) return error.SkipZigTest; - const br = try setup_buf_ring(ring.fd, buffers_count, group_id, .{ .IOU_PBUF_RING_INC = true }); - buf_ring_init(br); + var ring = IoUring.init(16, .{}) catch |err| switch (err) { + error.SystemOutdated => return error.SkipZigTest, + error.PermissionDenied => return error.SkipZigTest, + else => return err, + }; + defer ring.deinit(); - const mask = buf_ring_mask(buffers_count); - var i: u16 = 0; - while (i < buffers_count) : (i += 1) { - const pos = buffer_size * i; - const buf = buffers[pos .. pos + buffer_size]; - heads[i] = 0; - buf_ring_add(br, buf, i, mask, i); - } - buf_ring_advance(br, buffers_count); + const socket_test_harness = try createSocketTestHarness(&ring); + defer socket_test_harness.close(); - return BufferGroup{ - .ring = ring, - .group_id = group_id, - .br = br, - .buffers = buffers, - .heads = heads, - .buffer_size = buffer_size, - .buffers_count = buffers_count, - }; - } + var buffer_recv = [_]u8{ 0, 1, 0, 1, 0 }; - pub fn deinit(self: *BufferGroup, allocator: mem.Allocator) void { - free_buf_ring(self.ring.fd, self.br, self.buffers_count, self.group_id); - allocator.free(self.buffers); - allocator.free(self.heads); - } + _ = try ring.recv(0xffffffff, socket_test_harness.server, .{ .buffer = buffer_recv[0..] }, 0); + try testing.expectEqual(@as(u32, 1), try ring.submit()); - // Prepare recv operation which will select buffer from this group. - pub fn recv(self: *BufferGroup, user_data: u64, fd: posix.fd_t, flags: u32) !*Sqe { - var sqe = try self.ring.get_sqe(); - sqe.prep_rw(.RECV, fd, 0, 0, 0); - sqe.rw_flags = flags; - sqe.flags.BUFFER_SELECT = true; - sqe.buf_index = self.group_id; - sqe.user_data = user_data; - return sqe; - } + const sqe_cancel = try ring.cancel(0x99999999, 0xffffffff, 0); + try testing.expectEqual(Op.ASYNC_CANCEL, sqe_cancel.opcode); + try testing.expectEqual(@as(u64, 0xffffffff), sqe_cancel.addr); + try testing.expectEqual(@as(u64, 0x99999999), sqe_cancel.user_data); + try testing.expectEqual(@as(u32, 1), try ring.submit()); - // Prepare multishot recv operation which will select buffer from this group. - pub fn recv_multishot(self: *BufferGroup, user_data: u64, fd: posix.fd_t, flags: u32) !*Sqe { - var sqe = try self.recv(user_data, fd, flags); - sqe.ioprio.send_recv.RECV_MULTISHOT = true; - return sqe; - } + var cqe_recv = try ring.copy_cqe(); + if (cqe_recv.err() == .INVAL) return error.SkipZigTest; + var cqe_cancel = try ring.copy_cqe(); + if (cqe_cancel.err() == .INVAL) return error.SkipZigTest; - // Get buffer by id. - fn get_by_id(self: *BufferGroup, buffer_id: u16) []u8 { - const pos = self.buffer_size * buffer_id; - return self.buffers[pos .. pos + self.buffer_size][self.heads[buffer_id]..]; + // The recv/cancel CQEs may arrive in any order, the recv CQE will sometimes come first: + if (cqe_recv.user_data == 0x99999999 and cqe_cancel.user_data == 0xffffffff) { + const a = cqe_recv; + const b = cqe_cancel; + cqe_recv = b; + cqe_cancel = a; } - // Get buffer by CQE. 
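// Example (a minimal sketch, assuming `ring`, an `allocator` and a socket `fd`):
// the full provided-buffer cycle with a BufferGroup; the CQE identifies which
// buffer was picked, get() returns the filled slice and put() recycles it:
//
//     var group = try BufferGroup.init(&ring, allocator, 1, 4096, 16);
//     defer group.deinit(allocator);
//     _ = try group.recv(0x4, fd, 0);
//     _ = try ring.submit();
//     const cqe = try ring.copy_cqe();
//     const data = try group.get(cqe);
//     std.debug.print("received {d} bytes\n", .{data.len});
//     try group.put(cqe); // hand the buffer back to the kernel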
- pub fn get(self: *BufferGroup, cqe: Cqe) ![]u8 {
- const buffer_id = try cqe.buffer_id();
- const used_len = @as(usize, @intCast(cqe.res));
- return self.get_by_id(buffer_id)[0..used_len];
- }
+ try testing.expectEqual(Cqe{
+ .user_data = 0xffffffff,
+ .res = -@as(i32, @intFromEnum(linux.E.CANCELED)),
+ .flags = .{},
+ }, cqe_recv);

- // Release buffer from CQE to the kernel.
- pub fn put(self: *BufferGroup, cqe: Cqe) !void {
- const buffer_id = try cqe.buffer_id();
- if (cqe.flags.F_BUF_MORE) {
- // Incremental consumption active, kernel will write to the this buffer again
- const used_len = @as(u32, @intCast(cqe.res));
- // Track what part of the buffer is used
- self.heads[buffer_id] += used_len;
- return;
- }
- self.heads[buffer_id] = 0;
+ try testing.expectEqual(Cqe{
+ .user_data = 0x99999999,
+ .res = 0,
+ .flags = .{},
+ }, cqe_cancel);
+}

- // Release buffer to the kernel. const mask = buf_ring_mask(self.buffers_count);
- const mask = buf_ring_mask(self.buffers_count);
- buf_ring_add(self.br, self.get_by_id(buffer_id), buffer_id, mask, 0);
- buf_ring_advance(self.br, 1);
- }
-};
+test "register_files_update" {
+ if (!is_linux) return error.SkipZigTest;

-/// Registers a shared buffer ring to be used with provided buffers.
-/// `entries` number of `io_uring_buf` structures is mem mapped and shared by kernel.
-/// `fd` is IO_Uring.fd for which the provided buffer ring is being registered.
-/// `entries` is the number of entries requested in the buffer ring, must be power of 2.
-/// `group_id` is the chosen buffer group ID, unique in IO_Uring.
-pub fn setup_buf_ring(
- fd: linux.fd_t,
- entries: u16,
- group_id: u16,
- flags: BufferRegister.Flags,
-) !*align(page_size_min) BufferRing {
- if (entries == 0 or entries > 1 << 15) return error.EntriesNotInRange;
- if (!std.math.isPowerOfTwo(entries)) return error.EntriesNotPowerOfTwo;
+ var ring = IoUring.init(1, .{}) catch |err| switch (err) {
+ error.SystemOutdated => return error.SkipZigTest,
+ error.PermissionDenied => return error.SkipZigTest,
+ else => return err,
+ };
+ defer ring.deinit();

- const mmap_size = @as(usize, entries) * @sizeOf(Buffer);
- const mmap = try posix.mmap(
- null,
- mmap_size,
- posix.PROT.READ | posix.PROT.WRITE,
- .{ .TYPE = .PRIVATE, .ANONYMOUS = true },
- -1,
- 0,
- );
- errdefer posix.munmap(mmap);
- assert(mmap.len == mmap_size);
+ const fd = try posix.openZ("/dev/zero", .{ .ACCMODE = .RDONLY, .CLOEXEC = true }, 0);
+ defer posix.close(fd);

- const br: *align(page_size_min) BufferRing = @ptrCast(mmap.ptr);
- try register_buf_ring(fd, @intFromPtr(br), entries, group_id, flags);
- return br;
-}
+ var registered_fds = [_]linux.fd_t{0} ** 2;
+ const fd_index = 0;
+ const fd_index2 = 1;
+ registered_fds[fd_index] = fd;
+ registered_fds[fd_index2] = -1;
+
+ ring.register_files(registered_fds[0..]) catch |err| switch (err) {
+ // Happens when the kernel doesn't support sparse entry (-1) in the file descriptors array.
+ error.FileDescriptorInvalid => return error.SkipZigTest, + else => |errno| std.debug.panic("unhandled errno: {}", .{errno}), + }; -fn register_buf_ring( - fd: linux.fd_t, - addr: u64, - entries: u32, - group_id: u16, - flags: BufferRegister.Flags, -) !void { - var reg = mem.zeroInit(BufferRegister, .{ - .ring_addr = addr, - .ring_entries = entries, - .bgid = group_id, - .flags = flags, - }); - var res = linux.io_uring_register(fd, .REGISTER_PBUF_RING, @as(*const anyopaque, @ptrCast(®)), 1); - if (linux.errno(res) == .INVAL and reg.flags.IOU_PBUF_RING_INC) { - // Retry without incremental buffer consumption. - // It is available since kernel 6.12. returns INVAL on older. - reg.flags.IOU_PBUF_RING_INC = false; - res = linux.io_uring_register(fd, .REGISTER_PBUF_RING, @as(*const anyopaque, @ptrCast(®)), 1); - } - try handle_register_buf_ring_result(res); -} + // Test IORING_REGISTER_FILES_UPDATE + // Only available since Linux 5.5 -fn unregister_buf_ring(fd: posix.fd_t, group_id: u16) !void { - var reg = mem.zeroInit(BufferRegister, .{ - .bgid = group_id, - }); - const res = linux.io_uring_register( - fd, - .UNREGISTER_PBUF_RING, - @as(*const anyopaque, @ptrCast(®)), - 1, - ); - try handle_register_buf_ring_result(res); -} + const fd2 = try posix.openZ("/dev/zero", .{ .ACCMODE = .RDONLY, .CLOEXEC = true }, 0); + defer posix.close(fd2); -fn handle_register_buf_ring_result(res: usize) !void { - switch (linux.errno(res)) { - .SUCCESS => {}, - .INVAL => return error.ArgumentsInvalid, - else => |errno| return posix.unexpectedErrno(errno), - } -} + registered_fds[fd_index] = fd2; + registered_fds[fd_index2] = -1; + try ring.register_files_update(0, registered_fds[0..]); -// Unregisters a previously registered shared buffer ring, returned from io_uring_setup_buf_ring. -pub fn free_buf_ring(fd: posix.fd_t, br: *align(page_size_min) BufferRing, entries: u32, group_id: u16) void { - unregister_buf_ring(fd, group_id) catch {}; - var mmap: []align(page_size_min) u8 = undefined; - mmap.ptr = @ptrCast(br); - mmap.len = entries * @sizeOf(Buffer); - posix.munmap(mmap); -} + var buffer = [_]u8{42} ** 128; + { + const sqe = try ring.read(0xcccccccc, fd_index, .{ .buffer = &buffer }, 0); + try testing.expectEqual(Op.READ, sqe.opcode); + sqe.flags.FIXED_FILE = true; -/// Initialises `br` so that it is ready to be used. -pub fn buf_ring_init(br: *BufferRing) void { - br.tail = 0; -} + try testing.expectEqual(@as(u32, 1), try ring.submit()); + try testing.expectEqual(Cqe{ + .user_data = 0xcccccccc, + .res = buffer.len, + .flags = .{}, + }, try ring.copy_cqe()); + try testing.expectEqualSlices(u8, &([_]u8{0} ** buffer.len), buffer[0..]); + } -/// Calculates the appropriate size mask for a buffer ring. -/// `entries` is the ring entries as specified in io_uring_register_buf_ring. -pub fn buf_ring_mask(entries: u16) u16 { - return entries - 1; -} + // Test with a non-zero offset -/// Assigns `buffer` with the `br` buffer ring. -/// `buffer_id` is identifier which will be returned in the CQE. -/// `buffer_offset` is the offset to insert at from the current tail. -/// If just one buffer is provided before the ring tail is committed with advance then offset should be 0. -/// If buffers are provided in a loop before being committed, the offset must be incremented by one for each buffer added. 
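// Example (a minimal sketch, assuming `ring`, a `group_id` and a `buffers` slice
// carved into `count` chunks of `size` bytes): filling a raw buffer ring by hand
// with the helpers below, then publishing all entries with a single advance:
//
//     const br = try setup_buf_ring(ring.fd, count, group_id, .{});
//     buf_ring_init(br);
//     const mask = buf_ring_mask(count);
//     var i: u16 = 0;
//     while (i < count) : (i += 1) {
//         buf_ring_add(br, buffers[i * size ..][0..size], i, mask, i);
//     }
//     buf_ring_advance(br, count);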
-pub fn buf_ring_add(
- br: *BufferRing,
- buffer: []u8,
- buffer_id: u16,
- mask: u16,
- buffer_offset: u16,
-) void {
- const bufs: [*]Buffer = @ptrCast(br);
- const buf: *Buffer = &bufs[(br.tail +% buffer_offset) & mask];
+ registered_fds[fd_index] = -1;
+ registered_fds[fd_index2] = -1;
+ try ring.register_files_update(1, registered_fds[1..]);

- buf.addr = @intFromPtr(buffer.ptr);
- buf.len = @intCast(buffer.len);
- buf.bid = buffer_id;
-}
+ {
+ // Next read should still work since fd_index in the registered file descriptors hasn't been updated yet.
+ const sqe = try ring.read(0xcccccccc, fd_index, .{ .buffer = &buffer }, 0);
+ try testing.expectEqual(Op.READ, sqe.opcode);
+ sqe.flags.FIXED_FILE = true;

-/// Make `count` new buffers visible to the kernel. Called after
-/// `io_uring_buf_ring_add` has been called `count` times to fill in new buffers.
-pub fn buf_ring_advance(br: *BufferRing, count: u16) void {
- const tail: u16 = br.tail +% count;
- @atomicStore(u16, &br.tail, tail, .release);
-}
+ try testing.expectEqual(@as(u32, 1), try ring.submit());
+ try testing.expectEqual(Cqe{
+ .user_data = 0xcccccccc,
+ .res = buffer.len,
+ .flags = .{},
+ }, try ring.copy_cqe());
+ try testing.expectEqualSlices(u8, &([_]u8{0} ** buffer.len), buffer[0..]);
+ }

test "structs/offsets/entries" {
 if (!is_linux) return error.SkipZigTest;

- try testing.expectEqual(@as(usize, 120), @sizeOf(Params));
- try testing.expectEqual(@as(usize, 64), @sizeOf(Sqe));
- try testing.expectEqual(@as(usize, 16), @sizeOf(Cqe));
+ try ring.register_files_update(0, registered_fds[0..]);

- try testing.expectEqual(0, constants.OFF_SQ_RING);
- try testing.expectEqual(0x8000000, constants.OFF_CQ_RING);
- try testing.expectEqual(0x10000000, constants.OFF_SQES);
+ {
+ // Now this should fail since both fds are sparse (-1)
+ const sqe = try ring.read(0xcccccccc, fd_index, .{ .buffer = &buffer }, 0);
+ try testing.expectEqual(Op.READ, sqe.opcode);
+ sqe.flags.FIXED_FILE = true;

- try testing.expectError(error.EntriesZero, IoUring.init(0, .{}));
- try testing.expectError(error.EntriesNotPowerOfTwo, IoUring.init(3, .{}));
+ try testing.expectEqual(@as(u32, 1), try ring.submit());
+ const cqe = try ring.copy_cqe();
+ try testing.expectEqual(linux.E.BADF, cqe.err());
+ }

-test "nop" {
+ try ring.unregister_files();
}

+test "shutdown" {
 if (!is_linux) return error.SkipZigTest;

- var ring = IoUring.init(1, .{}) catch |err| switch (err) {
+ var ring = IoUring.init(16, .{}) catch |err| switch (err) {
 error.SystemOutdated => return error.SkipZigTest,
 error.PermissionDenied => return error.SkipZigTest,
 else => return err,
 };
- defer {
- ring.deinit();
- testing.expectEqual(@as(linux.fd_t, -1), ring.fd) catch @panic("test failed");
- }
+ defer ring.deinit();

- const sqe = try ring.nop(0xaaaaaaaa);
- try testing.expectEqual(Sqe{
- .opcode = .NOP,
- .flags = .{},
- .ioprio = @bitCast(@as(u16, 0)),
- .fd = 0,
- .off = 0,
- .addr = 0,
- .len = 0,
- .rw_flags = 0,
- .user_data = 0xaaaaaaaa,
- .buf_index = 0,
- .personality = 0,
- .splice_fd_in = 0,
- .addr3 = 0,
- .resv = 0,
- }, sqe.*);
+ var address: linux.sockaddr.in = .{
+ .port = 0,
+ .addr = @bitCast([4]u8{ 127, 0, 0, 1 }),
+ };

- try testing.expectEqual(@as(u32, 0), ring.sq.sqe_head);
- try testing.expectEqual(@as(u32, 1), ring.sq.sqe_tail);
- try testing.expectEqual(@as(u32, 0), ring.sq.tail.*);
- try testing.expectEqual(@as(u32, 0), ring.cq.head.*);
- try testing.expectEqual(@as(u32, 1), ring.sq_ready());
- try testing.expectEqual(@as(u32, 0), ring.cq_ready());
+ // Socket bound, expect shutdown to work
+ {
+ const server = try posix.socket(address.family, posix.SOCK.STREAM | posix.SOCK.CLOEXEC, 0);
+ defer posix.close(server);
+ try posix.setsockopt(server, posix.SOL.SOCKET, posix.SO.REUSEADDR, &mem.toBytes(@as(c_int, 1)));
+ try posix.bind(server, addrAny(&address), @sizeOf(linux.sockaddr.in));
+ try posix.listen(server, 1);

- try testing.expectEqual(@as(u32, 1), try ring.submit());
- try testing.expectEqual(@as(u32, 1), ring.sq.sqe_head);
- try testing.expectEqual(@as(u32, 1), ring.sq.sqe_tail);
- try testing.expectEqual(@as(u32, 1), ring.sq.tail.*);
- try testing.expectEqual(@as(u32, 0), ring.cq.head.*);
- try testing.expectEqual(@as(u32, 0), ring.sq_ready());
+ // set address to the OS-chosen IP/port.
+ var slen: posix.socklen_t = @sizeOf(linux.sockaddr.in);
+ try posix.getsockname(server, addrAny(&address), &slen);

- try testing.expectEqual(Cqe{
- .user_data = 0xaaaaaaaa,
- .res = 0,
- .flags = .{},
- }, try ring.copy_cqe());
- try testing.expectEqual(@as(u32, 1), ring.cq.head.*);
- try testing.expectEqual(@as(u32, 0), ring.cq_ready());
+ const shutdown_sqe = try ring.shutdown(0x445445445, server, linux.SHUT.RD);
+ try testing.expectEqual(Op.SHUTDOWN, shutdown_sqe.opcode);
+ try testing.expectEqual(@as(i32, server), shutdown_sqe.fd);

- const sqe_barrier = try ring.nop(0xbbbbbbbb);
- sqe_barrier.flags.IO_DRAIN = true;
- try testing.expectEqual(@as(u32, 1), try ring.submit());
- try testing.expectEqual(Cqe{
- .user_data = 0xbbbbbbbb,
- .res = 0,
- .flags = .{},
- }, try ring.copy_cqe());
- try testing.expectEqual(@as(u32, 2), ring.sq.sqe_head);
- try testing.expectEqual(@as(u32, 2), ring.sq.sqe_tail);
- try testing.expectEqual(@as(u32, 2), ring.sq.tail.*);
- try testing.expectEqual(@as(u32, 2), ring.cq.head.*);
+ try testing.expectEqual(@as(u32, 1), try ring.submit());
+
+ const cqe = try ring.copy_cqe();
+ switch (cqe.err()) {
+ .SUCCESS => {},
+ // This kernel's io_uring does not yet implement shutdown (kernel version < 5.11)
+ .INVAL => return error.SkipZigTest,
+ else => |errno| std.debug.panic("unhandled errno: {}", .{errno}),
+ }
+
+ try testing.expectEqual(Cqe{
+ .user_data = 0x445445445,
+ .res = 0,
+ .flags = .{},
+ }, cqe);
+ }
+
+ // Socket not bound, expect to fail with ENOTCONN
+ {
+ const server = try posix.socket(address.family, posix.SOCK.STREAM | posix.SOCK.CLOEXEC, 0);
+ defer posix.close(server);
+
+ const shutdown_sqe = ring.shutdown(0x445445445, server, linux.SHUT.RD) catch |err| switch (err) {
+ else => |errno| std.debug.panic("unhandled errno: {}", .{errno}),
+ };
+ try testing.expectEqual(Op.SHUTDOWN, shutdown_sqe.opcode);
+ try testing.expectEqual(@as(i32, server), shutdown_sqe.fd);
+
+ try testing.expectEqual(@as(u32, 1), try ring.submit());
+
+ const cqe = try ring.copy_cqe();
+ try testing.expectEqual(@as(u64, 0x445445445), cqe.user_data);
+ try testing.expectEqual(linux.E.NOTCONN, cqe.err());
+ }
}

-test "readv" {
+test "renameat" {
 if (!is_linux) return error.SkipZigTest;

 var ring = IoUring.init(1, .{}) catch |err| switch (err) {
@@ -3627,112 +2678,110 @@ test "readv" {
 };
 defer ring.deinit();

- const fd = try posix.openZ("/dev/zero", .{ .ACCMODE = .RDONLY, .CLOEXEC = true }, 0);
- defer posix.close(fd);
+ const old_path = "test_io_uring_renameat_old";
+ const new_path = "test_io_uring_renameat_new";
+
+ var tmp = std.testing.tmpDir(.{});
+ defer tmp.cleanup();
+
+ // Write old file with data

- // Linux Kernel 5.4 supports IORING_REGISTER_FILES but not sparse fd sets (i.e. an fd of -1).
- // Linux Kernel 5.5 adds support for sparse fd sets.
- // Compare:
- // https://github.com/torvalds/linux/blob/v5.4/fs/io_uring.c#L3119-L3124 vs
- // https://github.com/torvalds/linux/blob/v5.8/fs/io_uring.c#L6687-L6691
- // We therefore avoid stressing sparse fd sets here:
- var registered_fds = [_]linux.fd_t{0} ** 1;
- const fd_index = 0;
- registered_fds[fd_index] = fd;
- try ring.register_files(registered_fds[0..]);
+ const old_file = try tmp.dir.createFile(old_path, .{ .truncate = true, .mode = 0o666 });
+ defer old_file.close();
+ try old_file.writeAll("hello");

- var buffer = [_]u8{42} ** 128;
- var iovecs = [_]posix.iovec{posix.iovec{ .base = &buffer, .len = buffer.len }};
- const sqe = try ring.read(0xcccccccc, fd_index, .{ .iovecs = iovecs[0..] }, 0);
- try testing.expectEqual(Op.READV, sqe.opcode);
- sqe.flags.FIXED_FILE = true;
+ // Submit renameat

- try testing.expectError(error.SubmissionQueueFull, ring.nop(0));
+ const sqe = try ring.renameat(
+ 0x12121212,
+ tmp.dir.fd,
+ old_path,
+ tmp.dir.fd,
+ new_path,
+ .{},
+ );
+ try testing.expectEqual(Op.RENAMEAT, sqe.opcode);
+ try testing.expectEqual(@as(i32, tmp.dir.fd), sqe.fd);
+ try testing.expectEqual(@as(i32, tmp.dir.fd), @as(i32, @bitCast(sqe.len)));
 try testing.expectEqual(@as(u32, 1), try ring.submit());
+
+ const cqe = try ring.copy_cqe();
+ switch (cqe.err()) {
+ .SUCCESS => {},
+ // This kernel's io_uring does not yet implement renameat (kernel version < 5.11)
+ .BADF, .INVAL => return error.SkipZigTest,
+ else => |errno| std.debug.panic("unhandled errno: {}", .{errno}),
+ }
 try testing.expectEqual(Cqe{
- .user_data = 0xcccccccc,
- .res = buffer.len,
- .flags = .{},
- }, try ring.copy_cqe());
- try testing.expectEqualSlices(u8, &([_]u8{0} ** buffer.len), buffer[0..]);
+ .user_data = 0x12121212,
+ .res = 0,
+ .flags = .{},
+ }, cqe);

- try ring.unregister_files();
+ // Validate that the old file doesn't exist anymore
+ try testing.expectError(error.FileNotFound, tmp.dir.openFile(old_path, .{}));
+
+ // Validate that the new file exists with the proper content
+ var new_file_data: [16]u8 = undefined;
+ try testing.expectEqualStrings("hello", try tmp.dir.readFile(new_path, &new_file_data));
}

-test "writev/fsync/readv" {
+test "unlinkat" {
 if (!is_linux) return error.SkipZigTest;

- var ring = IoUring.init(4, .{}) catch |err| switch (err) {
+ var ring = IoUring.init(1, .{}) catch |err| switch (err) {
 error.SystemOutdated => return error.SkipZigTest,
 error.PermissionDenied => return error.SkipZigTest,
 else => return err,
 };
 defer ring.deinit();

+ const path = "test_io_uring_unlinkat";
+
 var tmp = std.testing.tmpDir(.{});
 defer tmp.cleanup();
- const path = "test_io_uring_writev_fsync_readv";
- const file = try tmp.dir.createFile(path, .{ .read = true, .truncate = true });
- defer file.close();
- const fd = file.handle;
-
- const buffer_write = [_]u8{42} ** 128;
- const iovecs_write = [_]posix.iovec_const{
- posix.iovec_const{ .base = &buffer_write, .len = buffer_write.len },
- };
- var buffer_read = [_]u8{0} ** 128;
- var iovecs_read = [_]posix.iovec{
- posix.iovec{ .base = &buffer_read, .len = buffer_read.len },
- };
-
- const sqe_writev = try ring.writev(0xdddddddd, fd, iovecs_write[0..], 17);
- try testing.expectEqual(Op.WRITEV, sqe_writev.opcode);
- try testing.expectEqual(@as(u64, 17), sqe_writev.off);
- sqe_writev.flags.IO_LINK = true;
-
- const sqe_fsync = try ring.fsync(0xeeeeeeee, fd, .{});
- try testing.expectEqual(Op.FSYNC, sqe_fsync.opcode);
- try testing.expectEqual(fd, sqe_fsync.fd);
- sqe_fsync.flags.IO_LINK = true;
-test "writev/fsync/readv" {
+test "unlinkat" {
 if (!is_linux) return error.SkipZigTest;
 
- var ring = IoUring.init(4, .{}) catch |err| switch (err) {
+ var ring = IoUring.init(1, .{}) catch |err| switch (err) {
 error.SystemOutdated => return error.SkipZigTest,
 error.PermissionDenied => return error.SkipZigTest,
 else => return err,
 };
 defer ring.deinit();
 
+ const path = "test_io_uring_unlinkat";
+
 var tmp = std.testing.tmpDir(.{});
 defer tmp.cleanup();
 
- const path = "test_io_uring_writev_fsync_readv";
- const file = try tmp.dir.createFile(path, .{ .read = true, .truncate = true });
- defer file.close();
- const fd = file.handle;
-
- const buffer_write = [_]u8{42} ** 128;
- const iovecs_write = [_]posix.iovec_const{
- posix.iovec_const{ .base = &buffer_write, .len = buffer_write.len },
- };
- var buffer_read = [_]u8{0} ** 128;
- var iovecs_read = [_]posix.iovec{
- posix.iovec{ .base = &buffer_read, .len = buffer_read.len },
- };
-
- const sqe_writev = try ring.writev(0xdddddddd, fd, iovecs_write[0..], 17);
- try testing.expectEqual(Op.WRITEV, sqe_writev.opcode);
- try testing.expectEqual(@as(u64, 17), sqe_writev.off);
- sqe_writev.flags.IO_LINK = true;
-
- const sqe_fsync = try ring.fsync(0xeeeeeeee, fd, .{});
- try testing.expectEqual(Op.FSYNC, sqe_fsync.opcode);
- try testing.expectEqual(fd, sqe_fsync.fd);
- sqe_fsync.flags.IO_LINK = true;
+ // Write file with data
- const sqe_readv = try ring.read(0xffffffff, fd, .{ .iovecs = iovecs_read[0..] }, 17);
- try testing.expectEqual(Op.READV, sqe_readv.opcode);
- try testing.expectEqual(@as(u64, 17), sqe_readv.off);
+ const file = try tmp.dir.createFile(path, .{ .truncate = true, .mode = 0o666 });
+ defer file.close();
- try testing.expectEqual(@as(u32, 3), ring.sq_ready());
- try testing.expectEqual(@as(u32, 3), try ring.submit_and_wait(3));
- try testing.expectEqual(@as(u32, 0), ring.sq_ready());
- try testing.expectEqual(@as(u32, 3), ring.cq_ready());
+ // Submit unlinkat
- try testing.expectEqual(Cqe{
- .user_data = 0xdddddddd,
- .res = buffer_write.len,
- .flags = .{},
- }, try ring.copy_cqe());
- try testing.expectEqual(@as(u32, 2), ring.cq_ready());
+ const sqe = try ring.unlinkat(
+ 0x12121212,
+ tmp.dir.fd,
+ path,
+ 0,
+ );
+ try testing.expectEqual(Op.UNLINKAT, sqe.opcode);
+ try testing.expectEqual(@as(i32, tmp.dir.fd), sqe.fd);
+ try testing.expectEqual(@as(u32, 1), try ring.submit());
+ const cqe = try ring.copy_cqe();
+ switch (cqe.err()) {
+ .SUCCESS => {},
+ // This kernel's io_uring does not yet implement unlinkat (kernel version < 5.11)
+ .BADF, .INVAL => return error.SkipZigTest,
+ else => |errno| std.debug.panic("unhandled errno: {}", .{errno}),
+ }
 try testing.expectEqual(Cqe{
- .user_data = 0xeeeeeeee,
+ .user_data = 0x12121212,
 .res = 0,
- .flags = .{},
- }, try ring.copy_cqe());
- try testing.expectEqual(@as(u32, 1), ring.cq_ready());
-
- try testing.expectEqual(Cqe{
- .user_data = 0xffffffff,
- .res = buffer_read.len,
- .flags = .{},
- }, try ring.copy_cqe());
- try testing.expectEqual(@as(u32, 0), ring.cq_ready());
+ .flags = .{},
+ }, cqe);
- try testing.expectEqualSlices(u8, buffer_write[0..], buffer_read[0..]);
+ // Validate that the file doesn't exist anymore
+ _ = tmp.dir.openFile(path, .{}) catch |err| switch (err) {
+ error.FileNotFound => {},
+ else => std.debug.panic("unexpected error: {}", .{err}),
+ };
 }
 
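+// unlinkat also removes an (empty) directory when AT.REMOVEDIR is set in its
+// flags argument; a sketch, assuming the flags parameter keeps taking the raw
+// AT_* bits (dir_path hypothetical):
+//
+//     _ = try ring.unlinkat(0x1, tmp.dir.fd, dir_path, linux.AT.REMOVEDIR);
+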
-test "write/read" {
+test "mkdirat" {
 if (!is_linux) return error.SkipZigTest;
 
- var ring = IoUring.init(2, .{}) catch |err| switch (err) {
+ var ring = IoUring.init(1, .{}) catch |err| switch (err) {
 error.SystemOutdated => return error.SkipZigTest,
 error.PermissionDenied => return error.SkipZigTest,
 else => return err,
@@ -3741,45 +2790,42 @@ test "write/read" {
 var tmp = std.testing.tmpDir(.{});
 defer tmp.cleanup();
- const path = "test_io_uring_write_read";
- const file = try tmp.dir.createFile(path, .{ .read = true, .truncate = true });
- defer file.close();
- const fd = file.handle;
- const buffer_write = [_]u8{97} ** 20;
- var buffer_read = [_]u8{98} ** 20;
- const sqe_write = try ring.write(0x11111111, fd, buffer_write[0..], 10);
- try testing.expectEqual(Op.WRITE, sqe_write.opcode);
- try testing.expectEqual(@as(u64, 10), sqe_write.off);
- sqe_write.flags.IO_LINK = true;
- const sqe_read = try ring.read(0x22222222, fd, .{ .buffer = buffer_read[0..] }, 10);
- try testing.expectEqual(Op.READ, sqe_read.opcode);
- try testing.expectEqual(@as(u64, 10), sqe_read.off);
- try testing.expectEqual(@as(u32, 2), try ring.submit());
+ const path = "test_io_uring_mkdirat";
- const cqe_write = try ring.copy_cqe();
- const cqe_read = try ring.copy_cqe();
- // Prior to Linux Kernel 5.6 this is the only way to test for read/write support:
- // https://lwn.net/Articles/809820/
- if (cqe_write.err() == .INVAL) return error.SkipZigTest;
- if (cqe_read.err() == .INVAL) return error.SkipZigTest;
- try testing.expectEqual(Cqe{
- .user_data = 0x11111111,
- .res = buffer_write.len,
- .flags = .{},
- }, cqe_write);
+ // Submit mkdirat
+
+ const sqe = try ring.mkdirat(
+ 0x12121212,
+ tmp.dir.fd,
+ path,
+ 0o0755,
+ );
+ try testing.expectEqual(Op.MKDIRAT, sqe.opcode);
+ try testing.expectEqual(@as(i32, tmp.dir.fd), sqe.fd);
+ try testing.expectEqual(@as(u32, 1), try ring.submit());
+
+ const cqe = try ring.copy_cqe();
+ switch (cqe.err()) {
+ .SUCCESS => {},
+ // This kernel's io_uring does not yet implement mkdirat (kernel version < 5.15)
+ .BADF, .INVAL => return error.SkipZigTest,
+ else => |errno| std.debug.panic("unhandled errno: {}", .{errno}),
+ }
 try testing.expectEqual(Cqe{
- .user_data = 0x22222222,
- .res = buffer_read.len,
+ .user_data = 0x12121212,
+ .res = 0,
 .flags = .{},
- }, cqe_read);
- try testing.expectEqualSlices(u8, buffer_write[0..], buffer_read[0..]);
+ }, cqe);
+
+ // Validate that the directory exists
+ _ = try tmp.dir.openDir(path, .{});
 }
 
-test "splice/read" {
+test "symlinkat" {
 if (!is_linux) return error.SkipZigTest;
 
- var ring = IoUring.init(4, .{}) catch |err| switch (err) {
+ var ring = IoUring.init(1, .{}) catch |err| switch (err) {
 error.SystemOutdated => return error.SkipZigTest,
 error.PermissionDenied => return error.SkipZigTest,
 else => return err,
@@ -3787,70 +2833,47 @@ test "splice/read" {
 defer ring.deinit();
 
 var tmp = std.testing.tmpDir(.{});
- const path_src = "test_io_uring_splice_src";
- const file_src = try tmp.dir.createFile(path_src, .{ .read = true, .truncate = true });
- defer file_src.close();
- const fd_src = file_src.handle;
-
- const path_dst = "test_io_uring_splice_dst";
- const file_dst = try tmp.dir.createFile(path_dst, .{ .read = true, .truncate = true });
- defer file_dst.close();
- const fd_dst = file_dst.handle;
-
- const buffer_write = [_]u8{97} ** 20;
- var buffer_read = [_]u8{98} ** 20;
- _ = try file_src.write(&buffer_write);
-
- const fds = try posix.pipe();
- const pipe_offset: u64 = std.math.maxInt(u64);
-
- const sqe_splice_to_pipe = try ring.splice(0x11111111, fd_src, 0, fds[1], pipe_offset, buffer_write.len);
- try testing.expectEqual(Op.SPLICE, sqe_splice_to_pipe.opcode);
- try testing.expectEqual(@as(u64, 0), sqe_splice_to_pipe.addr);
- try testing.expectEqual(pipe_offset, sqe_splice_to_pipe.off);
- sqe_splice_to_pipe.flags.IO_LINK = true;
+ defer tmp.cleanup();
 
- const sqe_splice_from_pipe = try ring.splice(0x22222222, fds[0], pipe_offset, fd_dst, 10, buffer_write.len);
- try testing.expectEqual(Op.SPLICE, sqe_splice_from_pipe.opcode);
- try testing.expectEqual(pipe_offset, sqe_splice_from_pipe.addr);
- try testing.expectEqual(@as(u64, 10), sqe_splice_from_pipe.off);
- sqe_splice_from_pipe.flags.IO_LINK = true;
+ const path = "test_io_uring_symlinkat";
+ const link_path = "test_io_uring_symlinkat_link";
- const sqe_read = try ring.read(0x33333333, fd_dst, .{ .buffer = buffer_read[0..] }, 10);
- try testing.expectEqual(Op.READ, sqe_read.opcode);
- try testing.expectEqual(@as(u64, 10), sqe_read.off);
- try testing.expectEqual(@as(u32, 3), try ring.submit());
+ const file = try tmp.dir.createFile(path, .{ .truncate = true, .mode = 0o666 });
+ defer file.close();
- const cqe_splice_to_pipe = try ring.copy_cqe();
- const cqe_splice_from_pipe = try ring.copy_cqe();
- const cqe_read = try ring.copy_cqe();
- // Prior to Linux Kernel 5.6 this is the only way to test for splice/read support:
- // https://lwn.net/Articles/809820/
- if (cqe_splice_to_pipe.err() == .INVAL) return error.SkipZigTest;
- if (cqe_splice_from_pipe.err() == .INVAL) return error.SkipZigTest;
- if (cqe_read.err() == .INVAL) return error.SkipZigTest;
- try testing.expectEqual(Cqe{
- .user_data = 0x11111111,
- .res = buffer_write.len,
- .flags = .{},
- }, cqe_splice_to_pipe);
- try testing.expectEqual(Cqe{
- .user_data = 0x22222222,
- .res = buffer_write.len,
- .flags = .{},
- }, cqe_splice_from_pipe);
+ // Submit symlinkat
+
+ const sqe = try ring.symlinkat(
+ 0x12121212,
+ path,
+ tmp.dir.fd,
+ link_path,
+ );
+ try testing.expectEqual(Op.SYMLINKAT, sqe.opcode);
+ try testing.expectEqual(@as(i32, tmp.dir.fd), sqe.fd);
+ try testing.expectEqual(@as(u32, 1), try ring.submit());
+
+ const cqe = try ring.copy_cqe();
+ switch (cqe.err()) {
+ .SUCCESS => {},
+ // This kernel's io_uring does not yet implement symlinkat (kernel version < 5.15)
+ .BADF, .INVAL => return error.SkipZigTest,
+ else => |errno| std.debug.panic("unhandled errno: {}", .{errno}),
+ }
 try testing.expectEqual(Cqe{
- .user_data = 0x33333333,
- .res = buffer_read.len,
+ .user_data = 0x12121212,
+ .res = 0,
 .flags = .{},
- }, cqe_read);
- try testing.expectEqualSlices(u8, buffer_write[0..], buffer_read[0..]);
+ }, cqe);
+
+ // Validate that the symlink exists
+ _ = try tmp.dir.openFile(link_path, .{});
 }
 
-test "write_fixed/read_fixed" {
+test "linkat" {
 if (!is_linux) return error.SkipZigTest;
 
- var ring = IoUring.init(2, .{}) catch |err| switch (err) {
+ var ring = IoUring.init(1, .{}) catch |err| switch (err) {
 error.SystemOutdated => return error.SkipZigTest,
 error.PermissionDenied => return error.SkipZigTest,
 else => return err,
@@ -3860,59 +2883,49 @@ test "write_fixed/read_fixed" {
 var tmp = std.testing.tmpDir(.{});
 defer tmp.cleanup();
- const path = "test_io_uring_write_read_fixed";
- const file = try tmp.dir.createFile(path, .{ .read = true, .truncate = true });
- defer file.close();
- const fd = file.handle;
-
- var raw_buffers: [2][11]u8 = undefined;
- // First buffer will be written to the file.
- @memset(&raw_buffers[0], 'z');
- raw_buffers[0][0.."foobar".len].* = "foobar".*;
-
- var buffers = [2]posix.iovec{
- .{ .base = &raw_buffers[0], .len = raw_buffers[0].len },
- .{ .base = &raw_buffers[1], .len = raw_buffers[1].len },
- };
- ring.register_buffers(&buffers) catch |err| switch (err) {
- error.SystemResources => {
- // See https://github.com/ziglang/zig/issues/15362
- return error.SkipZigTest;
- },
- else => |e| return e,
- };
+ const first_path = "test_io_uring_linkat_first";
+ const second_path = "test_io_uring_linkat_second";
- const sqe_write = try ring.write_fixed(0x45454545, fd, &buffers[0], 3, 0);
- try testing.expectEqual(Op.WRITE_FIXED, sqe_write.opcode);
- try testing.expectEqual(@as(u64, 3), sqe_write.off);
- sqe_write.flags.IO_LINK = true;
+ // Write file with data
- const sqe_read = try ring.read_fixed(0x12121212, fd, &buffers[1], 0, 1);
- try testing.expectEqual(Op.READ_FIXED, sqe_read.opcode);
- try testing.expectEqual(@as(u64, 0), sqe_read.off);
+ const first_file = try tmp.dir.createFile(first_path, .{ .truncate = true, .mode = 0o666 });
+ defer first_file.close();
+ try first_file.writeAll("hello");
- try testing.expectEqual(@as(u32, 2), try ring.submit());
+ // Submit linkat
- const cqe_write = try ring.copy_cqe();
- const cqe_read = try ring.copy_cqe();
+ const sqe = try ring.linkat(
+ 0x12121212,
+ tmp.dir.fd,
+ first_path,
+ tmp.dir.fd,
+ second_path,
+ 0,
+ );
+ try testing.expectEqual(Op.LINKAT, sqe.opcode);
+ try testing.expectEqual(@as(i32, tmp.dir.fd), sqe.fd);
+ try testing.expectEqual(@as(i32, tmp.dir.fd), @as(i32, @bitCast(sqe.len)));
+ try testing.expectEqual(@as(u32, 1), try ring.submit());
- try testing.expectEqual(Cqe{
- .user_data = 0x45454545,
- .res = @as(i32, @intCast(buffers[0].len)),
- .flags = .{},
- }, cqe_write);
+ const cqe = try ring.copy_cqe();
+ switch (cqe.err()) {
+ .SUCCESS => {},
+ // This kernel's io_uring does not yet implement linkat (kernel version < 5.15)
+ .BADF, .INVAL => return error.SkipZigTest,
+ else => |errno| std.debug.panic("unhandled errno: {}", .{errno}),
+ }
 try testing.expectEqual(Cqe{
 .user_data = 0x12121212,
- .res = @as(i32, @intCast(buffers[1].len)),
- .flags = .{},
- }, cqe_read);
+ .res = 0,
+ .flags = .{},
+ }, cqe);
- try testing.expectEqualSlices(u8, "\x00\x00\x00", buffers[1].base[0..3]);
- try testing.expectEqualSlices(u8, "foobar", buffers[1].base[3..9]);
- try testing.expectEqualSlices(u8, "zz", buffers[1].base[9..11]);
+ // Validate the second file
+ var second_file_data: [16]u8 = undefined;
+ try testing.expectEqualStrings("hello", try tmp.dir.readFile(second_path, &second_file_data));
 }
 
-test "openat" {
+test "provide_buffers: read" {
 if (!is_linux) return error.SkipZigTest;
 
 var ring = IoUring.init(1, .{}) catch |err| switch (err) {
@@ -3922,51 +2935,129 @@ test "openat" {
 };
 defer ring.deinit();
 
- var tmp = std.testing.tmpDir(.{});
- defer tmp.cleanup();
+ const fd = try posix.openZ("/dev/zero", .{ .ACCMODE = .RDONLY, .CLOEXEC = true }, 0);
+ defer posix.close(fd);
 
- const path = "test_io_uring_openat";
+ const group_id = 1337;
+ const buffer_id = 0;
 
- // Workaround for LLVM bug: https://github.com/ziglang/zig/issues/12014
- const path_addr = if (builtin.zig_backend == .stage2_llvm) p: {
- var workaround = path;
- _ = &workaround;
- break :p @intFromPtr(workaround);
- } else @intFromPtr(path);
+ const buffer_len = 128;
 
- const flags: linux.O = .{ .CLOEXEC = true, .ACCMODE = .RDWR, .CREAT = true };
- const mode: posix.mode_t = 0o666;
- const sqe_openat = try ring.openat(0x33333333, tmp.dir.fd, 
path, flags, mode); - try testing.expectEqual(Sqe{ - .opcode = .OPENAT, - .flags = .{}, - .ioprio = @bitCast(@as(u16, 0)), - .fd = tmp.dir.fd, - .off = 0, - .addr = path_addr, - .len = mode, - .rw_flags = @bitCast(flags), - .user_data = 0x33333333, - .buf_index = 0, - .personality = 0, - .splice_fd_in = 0, - .addr3 = 0, - .resv = 0, - }, sqe_openat.*); - try testing.expectEqual(@as(u32, 1), try ring.submit()); + var buffers: [4][buffer_len]u8 = undefined; - const cqe_openat = try ring.copy_cqe(); - try testing.expectEqual(@as(u64, 0x33333333), cqe_openat.user_data); - if (cqe_openat.err() == .INVAL) return error.SkipZigTest; - if (cqe_openat.err() == .BADF) return error.SkipZigTest; - if (cqe_openat.res <= 0) std.debug.print("\ncqe_openat.res={}\n", .{cqe_openat.res}); - try testing.expect(cqe_openat.res > 0); - try testing.expectEqual(@as(Cqe.Flags, @bitCast(@as(u32, 0))), cqe_openat.flags); + // Provide 4 buffers - posix.close(cqe_openat.res); + { + const sqe = try ring.provide_buffers(0xcccccccc, @as([*]u8, @ptrCast(&buffers)), buffer_len, buffers.len, group_id, buffer_id); + try testing.expectEqual(Op.PROVIDE_BUFFERS, sqe.opcode); + try testing.expectEqual(@as(i32, buffers.len), sqe.fd); + try testing.expectEqual(@as(u32, buffers[0].len), sqe.len); + try testing.expectEqual(@as(u16, group_id), sqe.buf_index); + try testing.expectEqual(@as(u32, 1), try ring.submit()); + + const cqe = try ring.copy_cqe(); + switch (cqe.err()) { + // Happens when the kernel is < 5.7 + .INVAL, .BADF => return error.SkipZigTest, + .SUCCESS => {}, + else => |errno| std.debug.panic("unhandled errno: {}", .{errno}), + } + try testing.expectEqual(@as(u64, 0xcccccccc), cqe.user_data); + } + + // Do 4 reads which should consume all buffers + + var i: usize = 0; + while (i < buffers.len) : (i += 1) { + const sqe = try ring.read(0xdededede, fd, .{ .buffer_selection = .{ .group_id = group_id, .len = buffer_len } }, 0); + try testing.expectEqual(Op.READ, sqe.opcode); + try testing.expectEqual(@as(i32, fd), sqe.fd); + try testing.expectEqual(@as(u64, 0), sqe.addr); + try testing.expectEqual(@as(u32, buffer_len), sqe.len); + try testing.expectEqual(@as(u16, group_id), sqe.buf_index); + try testing.expectEqual(@as(u32, 1), try ring.submit()); + + const cqe = try ring.copy_cqe(); + switch (cqe.err()) { + .SUCCESS => {}, + else => |errno| std.debug.panic("unhandled errno: {}", .{errno}), + } + + try testing.expect(cqe.flags.F_BUFFER); + const used_buffer_id = try cqe.buffer_id(); + try testing.expect(used_buffer_id >= 0 and used_buffer_id <= 3); + try testing.expectEqual(@as(i32, buffer_len), cqe.res); + + try testing.expectEqual(@as(u64, 0xdededede), cqe.user_data); + try testing.expectEqualSlices(u8, &([_]u8{0} ** buffer_len), buffers[used_buffer_id][0..@as(usize, @intCast(cqe.res))]); + } + + // This read should fail + + { + const sqe = try ring.read(0xdfdfdfdf, fd, .{ .buffer_selection = .{ .group_id = group_id, .len = buffer_len } }, 0); + try testing.expectEqual(Op.READ, sqe.opcode); + try testing.expectEqual(@as(i32, fd), sqe.fd); + try testing.expectEqual(@as(u64, 0), sqe.addr); + try testing.expectEqual(@as(u32, buffer_len), sqe.len); + try testing.expectEqual(@as(u16, group_id), sqe.buf_index); + try testing.expectEqual(@as(u32, 1), try ring.submit()); + + const cqe = try ring.copy_cqe(); + switch (cqe.err()) { + // Expected + .NOBUFS => {}, + .SUCCESS => std.debug.panic("unexpected success", .{}), + else => |errno| std.debug.panic("unhandled errno: {}", .{errno}), + } + try testing.expectEqual(@as(u64, 
0xdfdfdfdf), cqe.user_data); + } + + // Provide 1 buffer again + + // Deliberately put something we don't expect in the buffers + @memset(mem.sliceAsBytes(&buffers), 42); + + const reprovided_buffer_id = 2; + + { + _ = try ring.provide_buffers(0xabababab, @as([*]u8, @ptrCast(&buffers[reprovided_buffer_id])), buffer_len, 1, group_id, reprovided_buffer_id); + try testing.expectEqual(@as(u32, 1), try ring.submit()); + + const cqe = try ring.copy_cqe(); + switch (cqe.err()) { + .SUCCESS => {}, + else => |errno| std.debug.panic("unhandled errno: {}", .{errno}), + } + } + + // Final read which should work + + { + const sqe = try ring.read(0xdfdfdfdf, fd, .{ .buffer_selection = .{ .group_id = group_id, .len = buffer_len } }, 0); + try testing.expectEqual(Op.READ, sqe.opcode); + try testing.expectEqual(@as(i32, fd), sqe.fd); + try testing.expectEqual(@as(u64, 0), sqe.addr); + try testing.expectEqual(@as(u32, buffer_len), sqe.len); + try testing.expectEqual(@as(u16, group_id), sqe.buf_index); + try testing.expectEqual(@as(u32, 1), try ring.submit()); + + const cqe = try ring.copy_cqe(); + switch (cqe.err()) { + .SUCCESS => {}, + else => |errno| std.debug.panic("unhandled errno: {}", .{errno}), + } + + try testing.expect(cqe.flags.F_BUFFER); + const used_buffer_id = try cqe.buffer_id(); + try testing.expectEqual(used_buffer_id, reprovided_buffer_id); + try testing.expectEqual(@as(i32, buffer_len), cqe.res); + try testing.expectEqual(@as(u64, 0xdfdfdfdf), cqe.user_data); + try testing.expectEqualSlices(u8, &([_]u8{0} ** buffer_len), buffers[used_buffer_id][0..@as(usize, @intCast(cqe.res))]); + } } -test "close" { +test "remove_buffers" { if (!is_linux) return error.SkipZigTest; var ring = IoUring.init(1, .{}) catch |err| switch (err) { @@ -3976,490 +3067,520 @@ test "close" { }; defer ring.deinit(); - var tmp = std.testing.tmpDir(.{}); - defer tmp.cleanup(); + const fd = try posix.openZ("/dev/zero", .{ .ACCMODE = .RDONLY, .CLOEXEC = true }, 0); + defer posix.close(fd); - const path = "test_io_uring_close"; - const file = try tmp.dir.createFile(path, .{}); - errdefer file.close(); + const group_id = 1337; + const buffer_id = 0; - const sqe_close = try ring.close(0x44444444, file.handle); - try testing.expectEqual(Op.CLOSE, sqe_close.opcode); - try testing.expectEqual(file.handle, sqe_close.fd); - try testing.expectEqual(@as(u32, 1), try ring.submit()); + const buffer_len = 128; - const cqe_close = try ring.copy_cqe(); - if (cqe_close.err() == .INVAL) return error.SkipZigTest; - try testing.expectEqual(Cqe{ - .user_data = 0x44444444, - .res = 0, - .flags = .{}, - }, cqe_close); -} + var buffers: [4][buffer_len]u8 = undefined; -test "accept/connect/send/recv" { - if (!is_linux) return error.SkipZigTest; + // Provide 4 buffers - var ring = IoUring.init(16, .{}) catch |err| switch (err) { - error.SystemOutdated => return error.SkipZigTest, - error.PermissionDenied => return error.SkipZigTest, - else => return err, - }; - defer ring.deinit(); + { + _ = try ring.provide_buffers(0xcccccccc, @as([*]u8, @ptrCast(&buffers)), buffer_len, buffers.len, group_id, buffer_id); + try testing.expectEqual(@as(u32, 1), try ring.submit()); - const socket_test_harness = try createSocketTestHarness(&ring); - defer socket_test_harness.close(); + const cqe = try ring.copy_cqe(); + switch (cqe.err()) { + .INVAL, .BADF => return error.SkipZigTest, + .SUCCESS => {}, + else => |errno| std.debug.panic("unhandled errno: {}", .{errno}), + } + try testing.expectEqual(@as(u64, 0xcccccccc), cqe.user_data); + } - const buffer_send = 
[_]u8{ 1, 0, 1, 0, 1, 0, 1, 0, 1, 0 }; - var buffer_recv = [_]u8{ 0, 1, 0, 1, 0 }; + // Remove 3 buffers - const sqe_send = try ring.send(0xeeeeeeee, socket_test_harness.client, buffer_send[0..], 0); - sqe_send.flags.IO_LINK = true; - _ = try ring.recv(0xffffffff, socket_test_harness.server, .{ .buffer = buffer_recv[0..] }, 0); - try testing.expectEqual(@as(u32, 2), try ring.submit()); + { + const sqe = try ring.remove_buffers(0xbababababa, 3, group_id); + try testing.expectEqual(Op.REMOVE_BUFFERS, sqe.opcode); + try testing.expectEqual(@as(i32, 3), sqe.fd); + try testing.expectEqual(@as(u64, 0), sqe.addr); + try testing.expectEqual(@as(u16, group_id), sqe.buf_index); + try testing.expectEqual(@as(u32, 1), try ring.submit()); - const cqe_send = try ring.copy_cqe(); - if (cqe_send.err() == .INVAL) return error.SkipZigTest; - try testing.expectEqual(Cqe{ - .user_data = 0xeeeeeeee, - .res = buffer_send.len, - .flags = 0, - }, cqe_send); + const cqe = try ring.copy_cqe(); + switch (cqe.err()) { + .SUCCESS => {}, + else => |errno| std.debug.panic("unhandled errno: {}", .{errno}), + } + try testing.expectEqual(@as(u64, 0xbababababa), cqe.user_data); + } - const cqe_recv = try ring.copy_cqe(); - if (cqe_recv.err() == .INVAL) return error.SkipZigTest; - try testing.expectEqual(Cqe{ - .user_data = 0xffffffff, - .res = buffer_recv.len, - // ignore IORING_CQE_F_SOCK_NONEMPTY since it is only set on some systems - .flags = cqe_recv.flags & linux.IORING_CQE_F_SOCK_NONEMPTY, - }, cqe_recv); + // This read should work - try testing.expectEqualSlices(u8, buffer_send[0..buffer_recv.len], buffer_recv[0..]); + { + _ = try ring.read(0xdfdfdfdf, fd, .{ .buffer_selection = .{ .group_id = group_id, .len = buffer_len } }, 0); + try testing.expectEqual(@as(u32, 1), try ring.submit()); + + const cqe = try ring.copy_cqe(); + switch (cqe.err()) { + .SUCCESS => {}, + else => |errno| std.debug.panic("unhandled errno: {}", .{errno}), + } + + try testing.expect(cqe.flags.F_BUFFER); + const used_buffer_id = try cqe.buffer_id(); + try testing.expect(used_buffer_id >= 0 and used_buffer_id < 4); + try testing.expectEqual(@as(i32, buffer_len), cqe.res); + try testing.expectEqual(@as(u64, 0xdfdfdfdf), cqe.user_data); + try testing.expectEqualSlices(u8, &([_]u8{0} ** buffer_len), buffers[used_buffer_id][0..@as(usize, @intCast(cqe.res))]); + } + + // Final read should _not_ work + + { + _ = try ring.read(0xdfdfdfdf, fd, .{ .buffer_selection = .{ .group_id = group_id, .len = buffer_len } }, 0); + try testing.expectEqual(@as(u32, 1), try ring.submit()); + + const cqe = try ring.copy_cqe(); + switch (cqe.err()) { + // Expected + .NOBUFS => {}, + .SUCCESS => std.debug.panic("unexpected success", .{}), + else => |errno| std.debug.panic("unhandled errno: {}", .{errno}), + } + } } -test "sendmsg/recvmsg" { +test "provide_buffers: accept/connect/send/recv" { if (!is_linux) return error.SkipZigTest; - var ring = IoUring.init(2, .{}) catch |err| switch (err) { + var ring = IoUring.init(16, .{}) catch |err| switch (err) { error.SystemOutdated => return error.SkipZigTest, error.PermissionDenied => return error.SkipZigTest, else => return err, }; defer ring.deinit(); - var address_server: linux.sockaddr.in = .{ - .port = 0, - .addr = @bitCast([4]u8{ 127, 0, 0, 1 }), - }; - - const server = try posix.socket(address_server.family, posix.SOCK.DGRAM, 0); - defer posix.close(server); - try posix.setsockopt(server, posix.SOL.SOCKET, posix.SO.REUSEPORT, &mem.toBytes(@as(c_int, 1))); - try posix.setsockopt(server, posix.SOL.SOCKET, 
posix.SO.REUSEADDR, &mem.toBytes(@as(c_int, 1))); - try posix.bind(server, addrAny(&address_server), @sizeOf(linux.sockaddr.in)); - - // set address_server to the OS-chosen IP/port. - var slen: posix.socklen_t = @sizeOf(linux.sockaddr.in); - try posix.getsockname(server, addrAny(&address_server), &slen); + const group_id = 1337; + const buffer_id = 0; - const client = try posix.socket(address_server.family, posix.SOCK.DGRAM, 0); - defer posix.close(client); + const buffer_len = 128; + var buffers: [4][buffer_len]u8 = undefined; - const buffer_send = [_]u8{42} ** 128; - const iovecs_send = [_]posix.iovec_const{ - posix.iovec_const{ .base = &buffer_send, .len = buffer_send.len }, - }; - const msg_send: linux.msghdr_const = .{ - .name = addrAny(&address_server), - .namelen = @sizeOf(linux.sockaddr.in), - .iov = &iovecs_send, - .iovlen = 1, - .control = null, - .controllen = 0, - .flags = 0, - }; - const sqe_sendmsg = try ring.sendmsg(0x11111111, client, &msg_send, 0); - sqe_sendmsg.flags.IO_LINK = true; - try testing.expectEqual(Op.SENDMSG, sqe_sendmsg.opcode); - try testing.expectEqual(client, sqe_sendmsg.fd); + // Provide 4 buffers - var buffer_recv = [_]u8{0} ** 128; - var iovecs_recv = [_]posix.iovec{ - posix.iovec{ .base = &buffer_recv, .len = buffer_recv.len }, - }; - var address_recv: linux.sockaddr.in = .{ - .port = 0, - .addr = 0, - }; - var msg_recv: linux.msghdr = .{ - .name = addrAny(&address_recv), - .namelen = @sizeOf(linux.sockaddr.in), - .iov = &iovecs_recv, - .iovlen = 1, - .control = null, - .controllen = 0, - .flags = 0, - }; - const sqe_recvmsg = try ring.recvmsg(0x22222222, server, &msg_recv, 0); - try testing.expectEqual(Op.RECVMSG, sqe_recvmsg.opcode); - try testing.expectEqual(server, sqe_recvmsg.fd); + { + const sqe = try ring.provide_buffers(0xcccccccc, @as([*]u8, @ptrCast(&buffers)), buffer_len, buffers.len, group_id, buffer_id); + try testing.expectEqual(Op.PROVIDE_BUFFERS, sqe.opcode); + try testing.expectEqual(@as(i32, buffers.len), sqe.fd); + try testing.expectEqual(@as(u32, buffer_len), sqe.len); + try testing.expectEqual(@as(u16, group_id), sqe.buf_index); + try testing.expectEqual(@as(u32, 1), try ring.submit()); - try testing.expectEqual(@as(u32, 2), ring.sq_ready()); - try testing.expectEqual(@as(u32, 2), try ring.submit_and_wait(2)); - try testing.expectEqual(@as(u32, 0), ring.sq_ready()); - try testing.expectEqual(@as(u32, 2), ring.cq_ready()); + const cqe = try ring.copy_cqe(); + switch (cqe.err()) { + // Happens when the kernel is < 5.7 + .INVAL => return error.SkipZigTest, + // Happens on the kernel 5.4 + .BADF => return error.SkipZigTest, + .SUCCESS => {}, + else => |errno| std.debug.panic("unhandled errno: {}", .{errno}), + } + try testing.expectEqual(@as(u64, 0xcccccccc), cqe.user_data); + } - const cqe_sendmsg = try ring.copy_cqe(); - if (cqe_sendmsg.res == -@as(i32, @intFromEnum(linux.E.INVAL))) return error.SkipZigTest; - try testing.expectEqual(Cqe{ - .user_data = 0x11111111, - .res = buffer_send.len, - .flags = 0, - }, cqe_sendmsg); + const socket_test_harness = try createSocketTestHarness(&ring); + defer socket_test_harness.close(); - const cqe_recvmsg = try ring.copy_cqe(); - if (cqe_recvmsg.res == -@as(i32, @intFromEnum(linux.E.INVAL))) return error.SkipZigTest; - try testing.expectEqual(Cqe{ - .user_data = 0x22222222, - .res = buffer_recv.len, - // ignore IORING_CQE_F_SOCK_NONEMPTY since it is set non-deterministically - .flags = cqe_recvmsg.flags & linux.IORING_CQE_F_SOCK_NONEMPTY, - }, cqe_recvmsg); + // Do 4 send on the socket - try 
testing.expectEqualSlices(u8, buffer_send[0..buffer_recv.len], buffer_recv[0..]); -} + { + var i: usize = 0; + while (i < buffers.len) : (i += 1) { + _ = try ring.send(0xdeaddead, socket_test_harness.server, &([_]u8{'z'} ** buffer_len), 0); + try testing.expectEqual(@as(u32, 1), try ring.submit()); + } -test "timeout (after a relative time)" { - if (!is_linux) return error.SkipZigTest; - const io = std.testing.io; + var cqes: [4]Cqe = undefined; + try testing.expectEqual(@as(u32, 4), try ring.copy_cqes(&cqes, 4)); + } - var ring = IoUring.init(1, .{}) catch |err| switch (err) { - error.SystemOutdated => return error.SkipZigTest, - error.PermissionDenied => return error.SkipZigTest, - else => return err, - }; - defer ring.deinit(); + // Do 4 recv which should consume all buffers - const ms = 10; - const margin = 5; - const ts: linux.kernel_timespec = .{ .sec = 0, .nsec = ms * 1000000 }; + // Deliberately put something we don't expect in the buffers + @memset(mem.sliceAsBytes(&buffers), 1); - const started = try std.Io.Clock.awake.now(io); - const sqe = try ring.timeout(0x55555555, &ts, 0, 0); - try testing.expectEqual(Op.TIMEOUT, sqe.opcode); - try testing.expectEqual(@as(u32, 1), try ring.submit()); - const cqe = try ring.copy_cqe(); - const stopped = try std.Io.Clock.awake.now(io); + var i: usize = 0; + while (i < buffers.len) : (i += 1) { + const sqe = try ring.recv(0xdededede, socket_test_harness.client, .{ .buffer_selection = .{ .group_id = group_id, .len = buffer_len } }, 0); + try testing.expectEqual(Op.RECV, sqe.opcode); + try testing.expectEqual(@as(i32, socket_test_harness.client), sqe.fd); + try testing.expectEqual(@as(u64, 0), sqe.addr); + try testing.expectEqual(@as(u32, buffer_len), sqe.len); + try testing.expectEqual(@as(u16, group_id), sqe.buf_index); + try testing.expectEqual(@as(u32, 0), sqe.rw_flags); + try testing.expectEqual(.{ .BUFFER_SELECT = true }, sqe.flags); + try testing.expectEqual(@as(u32, 1), try ring.submit()); - try testing.expectEqual(Cqe{ - .user_data = 0x55555555, - .res = -@as(i32, @intFromEnum(linux.E.TIME)), - .flags = .{}, - }, cqe); + const cqe = try ring.copy_cqe(); + switch (cqe.err()) { + .SUCCESS => {}, + else => |errno| std.debug.panic("unhandled errno: {}", .{errno}), + } - // Tests should not depend on timings: skip test if outside margin. 
- const ms_elapsed = started.durationTo(stopped).toMilliseconds();
- if (ms_elapsed > margin) return error.SkipZigTest;
-}
+ try testing.expect(cqe.flags.F_BUFFER);
+ const used_buffer_id = try cqe.buffer_id();
+ try testing.expect(used_buffer_id >= 0 and used_buffer_id <= 3);
+ try testing.expectEqual(@as(i32, buffer_len), cqe.res);
-test "timeout (after a number of completions)" {
- if (!is_linux) return error.SkipZigTest;
+ try testing.expectEqual(@as(u64, 0xdededede), cqe.user_data);
+ const buffer = buffers[used_buffer_id][0..@as(usize, @intCast(cqe.res))];
+ try testing.expectEqualSlices(u8, &([_]u8{'z'} ** buffer_len), buffer);
+ }
- var ring = IoUring.init(2, .{}) catch |err| switch (err) {
- error.SystemOutdated => return error.SkipZigTest,
- error.PermissionDenied => return error.SkipZigTest,
- else => return err,
- };
- defer ring.deinit();
+ // This recv should fail
- const ts: linux.kernel_timespec = .{ .sec = 3, .nsec = 0 };
- const count_completions: u64 = 1;
- const sqe_timeout = try ring.timeout(0x66666666, &ts, count_completions, .{});
- try testing.expectEqual(Op.TIMEOUT, sqe_timeout.opcode);
- try testing.expectEqual(count_completions, sqe_timeout.off);
- _ = try ring.nop(0x77777777);
- try testing.expectEqual(@as(u32, 2), try ring.submit());
+ {
+ const sqe = try ring.recv(0xdfdfdfdf, socket_test_harness.client, .{ .buffer_selection = .{ .group_id = group_id, .len = buffer_len } }, 0);
+ try testing.expectEqual(Op.RECV, sqe.opcode);
+ try testing.expectEqual(@as(i32, socket_test_harness.client), sqe.fd);
+ try testing.expectEqual(@as(u64, 0), sqe.addr);
+ try testing.expectEqual(@as(u32, buffer_len), sqe.len);
+ try testing.expectEqual(@as(u16, group_id), sqe.buf_index);
+ try testing.expectEqual(@as(u32, 0), sqe.rw_flags);
+ try testing.expectEqual(.{ .BUFFER_SELECT = true }, sqe.flags);
+ try testing.expectEqual(@as(u32, 1), try ring.submit());
- const cqe_nop = try ring.copy_cqe();
- try testing.expectEqual(Cqe{
- .user_data = 0x77777777,
- .res = 0,
- .flags = .{},
- }, cqe_nop);
+ const cqe = try ring.copy_cqe();
+ switch (cqe.err()) {
+ // Expected
+ .NOBUFS => {},
+ .SUCCESS => std.debug.panic("unexpected success", .{}),
+ else => |errno| std.debug.panic("unhandled errno: {}", .{errno}),
+ }
+ try testing.expectEqual(@as(u64, 0xdfdfdfdf), cqe.user_data);
+ }
- const cqe_timeout = try ring.copy_cqe();
- try testing.expectEqual(Cqe{
- .user_data = 0x66666666,
- .res = 0,
- .flags = .{},
- }, cqe_timeout);
-}
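+ // With buffer selection, the kernel reports which provided buffer it used
+ // in the upper bits of cqe.flags (only valid while F_BUFFER is set), which
+ // is what buffer_id() decodes; roughly (sketch, error name hypothetical):
+ //
+ //     if (!cqe.flags.F_BUFFER) return error.NoBufferSelected;
+ //     const id: u16 = @intCast(@as(u32, @bitCast(cqe.flags)) >> Constants.CQE_BUFFER_SHIFT);
+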
+ // Provide 1 buffer again
-test "timeout_remove" {
- if (!is_linux) return error.SkipZigTest;
+ const reprovided_buffer_id = 2;
- var ring = IoUring.init(2, .{}) catch |err| switch (err) {
- error.SystemOutdated => return error.SkipZigTest,
- error.PermissionDenied => return error.SkipZigTest,
- else => return err,
- };
- defer ring.deinit();
+ {
+ _ = try ring.provide_buffers(0xabababab, @as([*]u8, @ptrCast(&buffers[reprovided_buffer_id])), buffer_len, 1, group_id, reprovided_buffer_id);
+ try testing.expectEqual(@as(u32, 1), try ring.submit());
- const ts: linux.kernel_timespec = .{ .sec = 3, .nsec = 0 };
- const sqe_timeout = try ring.timeout(0x88888888, &ts, 0, .{});
- try testing.expectEqual(Op.TIMEOUT, sqe_timeout.opcode);
- try testing.expectEqual(@as(u64, 0x88888888), sqe_timeout.user_data);
+ const cqe = try ring.copy_cqe();
+ switch (cqe.err()) {
+ .SUCCESS => {},
+ else => |errno| std.debug.panic("unhandled errno: {}", .{errno}),
+ }
+ }
- const sqe_timeout_remove = try ring.timeout_remove(0x99999999, 0x88888888, 0);
- try testing.expectEqual(Op.TIMEOUT_REMOVE, sqe_timeout_remove.opcode);
- try testing.expectEqual(@as(u64, 0x88888888), sqe_timeout_remove.addr);
- try testing.expectEqual(@as(u64, 0x99999999), sqe_timeout_remove.user_data);
+ // Redo 1 send on the server socket
- try testing.expectEqual(@as(u32, 2), try ring.submit());
+ {
+ _ = try ring.send(0xdeaddead, socket_test_harness.server, &([_]u8{'w'} ** buffer_len), 0);
+ try testing.expectEqual(@as(u32, 1), try ring.submit());
- // The order in which the CQE arrive is not clearly documented and it changed with kernel 5.18:
- // * kernel 5.10 gives user data 0x88888888 first, 0x99999999 second
- // * kernel 5.18 gives user data 0x99999999 first, 0x88888888 second
+ _ = try ring.copy_cqe();
+ }
- var cqes: [2]Cqe = undefined;
- cqes[0] = try ring.copy_cqe();
- cqes[1] = try ring.copy_cqe();
+ // Final recv which should work
- for (cqes) |cqe| {
- // IORING_OP_TIMEOUT_REMOVE is not supported by this kernel version:
- // Timeout remove operations set the fd to -1, which results in EBADF before EINVAL.
- // We use IORING_FEAT_RW_CUR_POS as a safety check here to make sure we are at least pre-5.6.
- // We don't want to skip this test for newer kernels.
- if (cqe.user_data == 0x99999999 and
- cqe.err() == .BADF and
- (ring.features & linux.IORING_FEAT_RW_CUR_POS) == 0)
- {
- return error.SkipZigTest;
- }
+ // Deliberately put something we don't expect in the buffers
+ @memset(mem.sliceAsBytes(&buffers), 1);
- try testing.expect(cqe.user_data == 0x88888888 or cqe.user_data == 0x99999999);
+ {
+ const sqe = try ring.recv(0xdfdfdfdf, socket_test_harness.client, .{ .buffer_selection = .{ .group_id = group_id, .len = buffer_len } }, 0);
+ try testing.expectEqual(Op.RECV, sqe.opcode);
+ try testing.expectEqual(@as(i32, socket_test_harness.client), sqe.fd);
+ try testing.expectEqual(@as(u64, 0), sqe.addr);
+ try testing.expectEqual(@as(u32, buffer_len), sqe.len);
+ try testing.expectEqual(@as(u16, group_id), sqe.buf_index);
+ try testing.expectEqual(@as(u32, 0), sqe.rw_flags);
+ try testing.expectEqual(.{ .BUFFER_SELECT = true }, sqe.flags);
+ try testing.expectEqual(@as(u32, 1), try ring.submit());
- if (cqe.user_data == 0x88888888) {
- try testing.expectEqual(Cqe{
- .user_data = 0x88888888,
- .res = -@as(i32, @intFromEnum(linux.E.CANCELED)),
- .flags = 0,
- }, cqe);
- } else if (cqe.user_data == 0x99999999) {
- try testing.expectEqual(Cqe{
- .user_data = 0x99999999,
- .res = 0,
- .flags = 0,
- }, cqe);
+ const cqe = try ring.copy_cqe();
+ switch (cqe.err()) {
+ .SUCCESS => {},
+ else => |errno| std.debug.panic("unhandled errno: {}", .{errno}),
 }
+
+ try testing.expect(cqe.flags.F_BUFFER);
+ const used_buffer_id = try cqe.buffer_id();
+ try testing.expectEqual(used_buffer_id, reprovided_buffer_id);
+ try testing.expectEqual(@as(i32, buffer_len), cqe.res);
+ try testing.expectEqual(@as(u64, 0xdfdfdfdf), cqe.user_data);
+ const buffer = buffers[used_buffer_id][0..@as(usize, @intCast(cqe.res))];
+ try testing.expectEqualSlices(u8, &([_]u8{'w'} ** buffer_len), buffer);
+ }
 }
 
-test "accept/connect/recv/link_timeout" {
- if (!is_linux) return error.SkipZigTest;
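+// Provided buffers are namespaced by group: provide_buffers registers them
+// under a group id, and any read/recv submitted with BUFFER_SELECT picks from
+// that group via sqe.buf_index. A consumed buffer must be re-provided before
+// it can be selected again, which is why the tests above re-arm buffers; a
+// compact re-arm sketch:
+//
+//     _ = try ring.provide_buffers(0, @ptrCast(&buffers[id]), buffer_len, 1, group_id, id);
+//     try testing.expectEqual(@as(u32, 1), try ring.submit());
+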
+/// Used for testing server/client interactions.
+const SocketTestHarness = struct {
+ listener: posix.socket_t,
+ server: posix.socket_t,
+ client: posix.socket_t,
- var ring = IoUring.init(16, .{}) catch |err| switch (err) {
- error.SystemOutdated => return error.SkipZigTest,
- error.PermissionDenied => return error.SkipZigTest,
- else => return err,
- };
- defer ring.deinit();
+ fn close(self: SocketTestHarness) void {
+ posix.close(self.client);
+ posix.close(self.listener);
+ }
+};
- const socket_test_harness = try createSocketTestHarness(&ring);
- defer socket_test_harness.close();
+fn createSocketTestHarness(ring: *IoUring) !SocketTestHarness {
+ // Create a TCP server socket
+ var address: linux.sockaddr.in = .{
+ .port = 0,
+ .addr = @bitCast([4]u8{ 127, 0, 0, 1 }),
+ };
+ const listener_socket = try createListenerSocket(&address);
+ errdefer posix.close(listener_socket);
- var buffer_recv = [_]u8{ 0, 1, 0, 1, 0 };
+ // Submit 1 accept
+ var accept_addr: posix.sockaddr = undefined;
+ var accept_addr_len: posix.socklen_t = @sizeOf(@TypeOf(accept_addr));
+ _ = try ring.accept(0xaaaaaaaa, listener_socket, &accept_addr, &accept_addr_len, 0);
- const sqe_recv = try ring.recv(0xffffffff, socket_test_harness.server, .{ .buffer = buffer_recv[0..] }, 0);
- sqe_recv.flags.IO_LINK = true;
+ // Create a TCP client socket
+ const client = try posix.socket(address.family, posix.SOCK.STREAM | posix.SOCK.CLOEXEC, 0);
+ errdefer posix.close(client);
+ _ = try ring.connect(0xcccccccc, client, addrAny(&address), @sizeOf(linux.sockaddr.in));
- const ts = linux.kernel_timespec{ .sec = 0, .nsec = 1000000 };
- _ = try ring.link_timeout(0x22222222, &ts, 0);
+ try testing.expectEqual(@as(u32, 2), try ring.submit());
- const nr_wait = try ring.submit();
- try testing.expectEqual(@as(u32, 2), nr_wait);
+ var cqe_accept = try ring.copy_cqe();
+ if (cqe_accept.err() == .INVAL) return error.SkipZigTest;
+ var cqe_connect = try ring.copy_cqe();
+ if (cqe_connect.err() == .INVAL) return error.SkipZigTest;
- var i: usize = 0;
- while (i < nr_wait) : (i += 1) {
- const cqe = try ring.copy_cqe();
- switch (cqe.user_data) {
- 0xffffffff => {
- if (cqe.res != -@as(i32, @intFromEnum(linux.E.INTR)) and
- cqe.res != -@as(i32, @intFromEnum(linux.E.CANCELED)))
- {
- std.debug.print("Req 0x{x} got {d}\n", .{ cqe.user_data, cqe.res });
- try testing.expect(false);
- }
- },
- 0x22222222 => {
- if (cqe.res != -@as(i32, @intFromEnum(linux.E.ALREADY)) and
- cqe.res != -@as(i32, @intFromEnum(linux.E.TIME)))
- {
- std.debug.print("Req 0x{x} got {d}\n", .{ cqe.user_data, cqe.res });
- try testing.expect(false);
- }
- },
- else => @panic("should not happen"),
- }
+ // The accept/connect CQEs may arrive in any order, the connect CQE will sometimes come first:
+ if (cqe_accept.user_data == 0xcccccccc and cqe_connect.user_data == 0xaaaaaaaa) {
+ const a = cqe_accept;
+ const b = cqe_connect;
+ cqe_accept = b;
+ cqe_connect = a;
 }
+
+ try testing.expectEqual(@as(u64, 0xaaaaaaaa), cqe_accept.user_data);
+ if (cqe_accept.res <= 0) std.debug.print("\ncqe_accept.res={}\n", .{cqe_accept.res});
+ try testing.expect(cqe_accept.res > 0);
+ try testing.expectEqual(Cqe.Flags{}, cqe_accept.flags);
+ try testing.expectEqual(Cqe{
+ .user_data = 0xcccccccc,
+ .res = 0,
+ .flags = .{},
+ }, cqe_connect);
+
+ // All good
+
+ return SocketTestHarness{
+ .listener = listener_socket,
+ .server = cqe_accept.res,
+ .client = client,
+ };
 }
 
-test "fallocate" {
+fn createListenerSocket(address: *linux.sockaddr.in) !posix.socket_t {
+ const kernel_backlog = 1;
+ const listener_socket = try posix.socket(address.family, posix.SOCK.STREAM | posix.SOCK.CLOEXEC, 0);
+ errdefer posix.close(listener_socket);
+
+ try posix.setsockopt(listener_socket, posix.SOL.SOCKET, posix.SO.REUSEADDR, &mem.toBytes(@as(c_int, 1)));
+ try posix.bind(listener_socket, addrAny(address), @sizeOf(linux.sockaddr.in));
+ try posix.listen(listener_socket, kernel_backlog);
+
+ // set address to the OS-chosen IP/port.
+ var slen: posix.socklen_t = @sizeOf(linux.sockaddr.in);
+ try posix.getsockname(listener_socket, addrAny(address), &slen);
+
+ return listener_socket;
+}
+
+test "accept multishot" {
 if (!is_linux) return error.SkipZigTest;
 
- var ring = IoUring.init(1, .{}) catch |err| switch (err) {
+ var ring = IoUring.init(16, .{}) catch |err| switch (err) {
 error.SystemOutdated => return error.SkipZigTest,
 error.PermissionDenied => return error.SkipZigTest,
 else => return err,
 };
 defer ring.deinit();
 
- var tmp = std.testing.tmpDir(.{});
- defer tmp.cleanup();
+ var address: linux.sockaddr.in = .{
+ .port = 0,
+ .addr = @bitCast([4]u8{ 127, 0, 0, 1 }),
+ };
+ const listener_socket = try createListenerSocket(&address);
+ defer posix.close(listener_socket);
 
- const path = "test_io_uring_fallocate";
- const file = try tmp.dir.createFile(path, .{ .truncate = true, .mode = 0o666 });
- defer file.close();
+ // submit multishot accept operation
+ var addr: posix.sockaddr = undefined;
+ var addr_len: posix.socklen_t = @sizeOf(@TypeOf(addr));
+ const userdata: u64 = 0xaaaaaaaa;
+ _ = try ring.accept_multishot(userdata, listener_socket, &addr, &addr_len, 0);
+ try testing.expectEqual(@as(u32, 1), try ring.submit());
- try testing.expectEqual(@as(u64, 0), (try file.stat()).size);
+ var nr: usize = 4; // number of clients to connect
+ while (nr > 0) : (nr -= 1) {
+ // connect client
+ const client = try posix.socket(address.family, posix.SOCK.STREAM | posix.SOCK.CLOEXEC, 0);
+ errdefer posix.close(client);
+ try posix.connect(client, addrAny(&address), @sizeOf(linux.sockaddr.in));
- const len: u64 = 65536;
- const sqe = try ring.fallocate(0xaaaaaaaa, file.handle, 0, 0, len);
- try testing.expectEqual(Op.FALLOCATE, sqe.opcode);
- try testing.expectEqual(file.handle, sqe.fd);
- try testing.expectEqual(@as(u32, 1), try ring.submit());
+ // test accept completion
+ const cqe = try ring.copy_cqe();
+ if (cqe.err() == .INVAL) return error.SkipZigTest;
+ try testing.expect(cqe.res > 0);
+ try testing.expect(cqe.user_data == userdata);
+ try testing.expect(cqe.flags.F_MORE); // more flag is set
- const cqe = try ring.copy_cqe();
- switch (cqe.err()) {
- .SUCCESS => {},
- // This kernel's io_uring does not yet implement fallocate():
- .INVAL => return error.SkipZigTest,
- // This kernel does not implement fallocate():
- .NOSYS => return error.SkipZigTest,
- // The filesystem containing the file referred to by fd does not support this operation;
- // or the mode is not supported by the filesystem containing the file referred to by fd:
- .OPNOTSUPP => return error.SkipZigTest,
- else => |errno| std.debug.panic("unhandled errno: {}", .{errno}),
 }
- try testing.expectEqual(Cqe{
- .user_data = 0xaaaaaaaa,
- .res = 0,
- .flags = .{},
- }, cqe);
-
- try testing.expectEqual(len, (try file.stat()).size);
+ posix.close(client);
+ }
 }
 
-test "statx" {
- if (!is_linux) return error.SkipZigTest;
+test "accept/connect/send_zc/recv" {
+ try skipKernelLessThan(.{ .major = 6, .minor = 0, .patch = 0 });
 
- var ring = IoUring.init(1, .{}) catch |err| switch (err) {
+ var ring = IoUring.init(16, .{}) catch |err| switch (err) {
 error.SystemOutdated => return error.SkipZigTest,
 error.PermissionDenied => return error.SkipZigTest,
 else => return err,
 };
 defer ring.deinit();
- var tmp = std.testing.tmpDir(.{});
- defer tmp.cleanup();
- const path = "test_io_uring_statx";
- const file = try tmp.dir.createFile(path, .{ .truncate = true, .mode = 0o666 });
- defer file.close();
+ const socket_test_harness = try createSocketTestHarness(&ring);
+ defer socket_test_harness.close();
- try testing.expectEqual(@as(u64, 0), (try file.stat()).size);
+ const buffer_send = [_]u8{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0xa, 0xb, 0xc, 0xd, 0xe };
+ var buffer_recv = [_]u8{0} ** 10;
- try file.writeAll("foobar");
+ // zero-copy send
+ const sqe_send = try ring.send_zc(0xeeeeeeee, socket_test_harness.client, buffer_send[0..], 0, 0);
+ sqe_send.flags.IO_LINK = true;
+ _ = try ring.recv(0xffffffff, socket_test_harness.server, .{ .buffer = buffer_recv[0..] }, 0);
+ try testing.expectEqual(@as(u32, 2), try ring.submit());
- var buf: linux.Statx = undefined;
- const sqe = try ring.statx(
- 0xaaaaaaaa,
- tmp.dir.fd,
- path,
- 0,
- linux.STATX_SIZE,
- &buf,
- );
- try testing.expectEqual(Op.STATX, sqe.opcode);
- try testing.expectEqual(@as(i32, tmp.dir.fd), sqe.fd);
- try testing.expectEqual(@as(u32, 1), try ring.submit());
+ var cqe_send = try ring.copy_cqe();
+ // First completion of the zero-copy send.
+ // F_MORE means that there will be a second completion event / notification
+ // for the request, with the user_data field set to the same value.
+ // buffer_send must be kept alive until the second cqe arrives.
+ try testing.expectEqual(Cqe{
+ .user_data = 0xeeeeeeee,
+ .res = buffer_send.len,
+ .flags = .{ .F_MORE = true },
+ }, cqe_send);
+
+ cqe_send, const cqe_recv = brk: {
+ const cqe1 = try ring.copy_cqe();
+ const cqe2 = try ring.copy_cqe();
+ break :brk if (cqe1.user_data == 0xeeeeeeee) .{ cqe1, cqe2 } else .{ cqe2, cqe1 };
+ };
- const cqe = try ring.copy_cqe();
- switch (cqe.err()) {
- .SUCCESS => {},
- // This kernel's io_uring does not yet implement statx():
- .INVAL => return error.SkipZigTest,
- // This kernel does not implement statx():
- .NOSYS => return error.SkipZigTest,
- // The filesystem containing the file referred to by fd does not support this operation;
- // or the mode is not supported by the filesystem containing the file referred to by fd:
- .OPNOTSUPP => return error.SkipZigTest,
- // not supported on older kernels (5.4)
- .BADF => return error.SkipZigTest,
- else => |errno| std.debug.panic("unhandled errno: {}", .{errno}),
- }
 try testing.expectEqual(Cqe{
- .user_data = 0xaaaaaaaa,
- .res = 0,
- .flags = 0,
- }, cqe);
+ .user_data = 0xffffffff,
+ .res = buffer_recv.len,
+ // ignore F_SOCK_NONEMPTY since it is only set on some systems
+ .flags = .{ .F_SOCK_NONEMPTY = cqe_recv.flags.F_SOCK_NONEMPTY },
+ }, cqe_recv);
+ try testing.expectEqualSlices(u8, buffer_send[0..buffer_recv.len], buffer_recv[0..]);
- try testing.expect(buf.mask & linux.STATX_SIZE == linux.STATX_SIZE);
- try testing.expectEqual(@as(u64, 6), buf.size);
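+ // A zero-copy send therefore completes in two steps; a generic reaping
+ // sketch (the interleaved recv CQE is ignored here for brevity):
+ //
+ //     var pending: usize = 1;
+ //     while (pending > 0) {
+ //         const c = try ring.copy_cqe();
+ //         if (c.flags.F_MORE) continue; // first CQE: bytes sent, buffer still in use
+ //         pending -= 1; // F_NOTIF CQE: kernel has released the buffer
+ //     }
+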
+ // Second completion of the zero-copy send.
+ // The F_NOTIF flag signals that the kernel is done with send_buffer.
+ try testing.expectEqual(Cqe{
+ .user_data = 0xeeeeeeee,
+ .res = 0,
+ .flags = .{ .F_NOTIF = true },
+ }, cqe_send);
+}
 
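+// Had the send been submitted with the SEND_ZC_REPORT_USAGE option (not done
+// above), the notification CQE's res would additionally report whether the
+// kernel fell back to copying; a decoding sketch:
+//
+//     const res_bits: u32 = @bitCast(cqe_send.res);
+//     const copied = (res_bits & Constants.NOTIF_USAGE_ZC_COPIED) != 0;
+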
-test "accept/connect/recv/cancel" {
- if (!is_linux) return error.SkipZigTest;
+test "accept_direct" {
+ try skipKernelLessThan(.{ .major = 5, .minor = 19, .patch = 0 });
 
- var ring = IoUring.init(16, .{}) catch |err| switch (err) {
+ var ring = IoUring.init(1, .{}) catch |err| switch (err) {
 error.SystemOutdated => return error.SkipZigTest,
 error.PermissionDenied => return error.SkipZigTest,
 else => return err,
 };
 defer ring.deinit();
+ var address: linux.sockaddr.in = .{
+ .port = 0,
+ .addr = @bitCast([4]u8{ 127, 0, 0, 1 }),
+ };
- const socket_test_harness = try createSocketTestHarness(&ring);
- defer socket_test_harness.close();
+ // register direct file descriptors
+ var registered_fds = [_]linux.fd_t{-1} ** 2;
+ try ring.register_files(registered_fds[0..]);
- var buffer_recv = [_]u8{ 0, 1, 0, 1, 0 };
+ const listener_socket = try createListenerSocket(&address);
+ defer posix.close(listener_socket);
- _ = try ring.recv(0xffffffff, socket_test_harness.server, .{ .buffer = buffer_recv[0..] }, 0);
- try testing.expectEqual(@as(u32, 1), try ring.submit());
+ const accept_userdata: u64 = 0xaaaaaaaa;
+ const read_userdata: u64 = 0xbbbbbbbb;
+ const data = [_]u8{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0xa, 0xb, 0xc, 0xd, 0xe };
- const sqe_cancel = try ring.cancel(0x99999999, 0xffffffff, 0);
- try testing.expectEqual(Op.ASYNC_CANCEL, sqe_cancel.opcode);
- try testing.expectEqual(@as(u64, 0xffffffff), sqe_cancel.addr);
- try testing.expectEqual(@as(u64, 0x99999999), sqe_cancel.user_data);
- try testing.expectEqual(@as(u32, 1), try ring.submit());
+ for (0..2) |_| {
+ for (registered_fds, 0..) |_, i| {
+ var buffer_recv: [16]u8 = @splat(0);
+ const buffer_send: []const u8 = data[0 .. data.len - i]; // make it different at each loop
- var cqe_recv = try ring.copy_cqe();
- if (cqe_recv.err() == .INVAL) return error.SkipZigTest;
- var cqe_cancel = try ring.copy_cqe();
- if (cqe_cancel.err() == .INVAL) return error.SkipZigTest;
+ // submit accept, will choose a registered fd and return its index in cqe
+ _ = try ring.accept_direct(accept_userdata, listener_socket, null, null, 0);
+ try testing.expectEqual(@as(u32, 1), try ring.submit());
- // The recv/cancel CQEs may arrive in any order, the recv CQE will sometimes come first:
- if (cqe_recv.user_data == 0x99999999 and cqe_cancel.user_data == 0xffffffff) {
- const a = cqe_recv;
- const b = cqe_cancel;
- cqe_recv = b;
- cqe_cancel = a;
- }
+ // connect
+ const client = try posix.socket(address.family, posix.SOCK.STREAM | posix.SOCK.CLOEXEC, 0);
+ try posix.connect(client, addrAny(&address), @sizeOf(linux.sockaddr.in));
+ defer posix.close(client);
- try testing.expectEqual(Cqe{
- .user_data = 0xffffffff,
- .res = -@as(i32, @intFromEnum(linux.E.CANCELED)),
- .flags = 0,
- }, cqe_recv);
+ // accept completion
+ const cqe_accept = try ring.copy_cqe();
+ try testing.expectEqual(posix.E.SUCCESS, cqe_accept.err());
+ const fd_index = cqe_accept.res;
+ try testing.expect(fd_index < registered_fds.len);
+ try testing.expect(cqe_accept.user_data == accept_userdata);
- try testing.expectEqual(Cqe{
- .user_data = 0x99999999,
- .res = 0,
- .flags = 0,
- }, cqe_cancel);
+ // send data
+ _ = try posix.send(client, buffer_send, 0);
+
+ // Example of how to use a registered fd:
+ // Submit receive to the fixed file returned by accept (fd_index).
+ // The fd field is set to the registered file index returned by accept.
+ // The FIXED_FILE sqe flag must be set.
+ const recv_sqe = try ring.recv(read_userdata, fd_index, .{ .buffer = &buffer_recv }, 0);
+ recv_sqe.flags.FIXED_FILE = true;
+ try testing.expectEqual(@as(u32, 1), try ring.submit());
+
+ // receive completion
+ const recv_cqe = try ring.copy_cqe();
+ try testing.expect(recv_cqe.user_data == read_userdata);
+ try testing.expect(recv_cqe.res == buffer_send.len);
+ try testing.expectEqualSlices(u8, buffer_send, buffer_recv[0..buffer_send.len]);
+ }
+ // no more available fds, accept will get NFILE error
+ {
+ // submit accept
+ _ = try ring.accept_direct(accept_userdata, listener_socket, null, null, 0);
+ try testing.expectEqual(@as(u32, 1), try ring.submit());
+ // connect
+ const client = try posix.socket(address.family, posix.SOCK.STREAM | posix.SOCK.CLOEXEC, 0);
+ try posix.connect(client, addrAny(&address), @sizeOf(linux.sockaddr.in));
+ defer posix.close(client);
+ // completion with error
+ const cqe_accept = try ring.copy_cqe();
+ try testing.expect(cqe_accept.user_data == accept_userdata);
+ try testing.expectEqual(posix.E.NFILE, cqe_accept.err());
+ }
+ // return file descriptors to kernel
+ try ring.register_files_update(0, registered_fds[0..]);
+ }
+ try ring.unregister_files();
+}
 
-test "register_files_update" {
- if (!is_linux) return error.SkipZigTest;
+test "accept_multishot_direct" {
+ try skipKernelLessThan(.{ .major = 5, .minor = 19, .patch = 0 });
+
+ if (builtin.cpu.arch == .riscv64) {
+ // https://github.com/ziglang/zig/issues/25734
+ return error.SkipZigTest;
+ }
 
 var ring = IoUring.init(1, .{}) catch |err| switch (err) {
 error.SystemOutdated => return error.SkipZigTest,
@@ -4468,1850 +3589,2737 @@ test "register_files_update" {
 };
 defer ring.deinit();
 
- const fd = try posix.openZ("/dev/zero", .{ .ACCMODE = .RDONLY, .CLOEXEC = true }, 0);
- defer posix.close(fd);
-
- var registered_fds = [_]linux.fd_t{0} ** 2;
- const fd_index = 0;
- const fd_index2 = 1;
- registered_fds[fd_index] = fd;
- registered_fds[fd_index2] = -1;
-
- ring.register_files(registered_fds[0..]) catch |err| switch (err) {
- // Happens when the kernel doesn't support sparse entry (-1) in the file descriptors array.
- error.FileDescriptorInvalid => return error.SkipZigTest,
- else => |errno| std.debug.panic("unhandled errno: {}", .{errno}),
+ var address: linux.sockaddr.in = .{
+ .port = 0,
+ .addr = @bitCast([4]u8{ 127, 0, 0, 1 }),
 };
- // Test IORING_REGISTER_FILES_UPDATE
- // Only available since Linux 5.5
-
- const fd2 = try posix.openZ("/dev/zero", .{ .ACCMODE = .RDONLY, .CLOEXEC = true }, 0);
- defer posix.close(fd2);
+ var registered_fds = [_]linux.fd_t{-1} ** 2;
+ try ring.register_files(registered_fds[0..]);
- registered_fds[fd_index] = fd2;
- registered_fds[fd_index2] = -1;
- try ring.register_files_update(0, registered_fds[0..]);
+ const listener_socket = try createListenerSocket(&address);
+ defer posix.close(listener_socket);
- var buffer = [_]u8{42} ** 128;
- {
- const sqe = try ring.read(0xcccccccc, fd_index, .{ .buffer = &buffer }, 0);
- try testing.expectEqual(Op.READ, sqe.opcode);
- sqe.flags.FIXED_FILE = true;
+ const accept_userdata: u64 = 0xaaaaaaaa;
+ for (0..2) |_| {
+ // submit multishot accept
+ // Will choose a registered fd and return the index of the selected registered file in cqe.
+ _ = try ring.accept_multishot_direct(accept_userdata, listener_socket, null, null, 0);
+ try testing.expectEqual(@as(u32, 1), try ring.submit());
- try testing.expectEqual(@as(u32, 1), try ring.submit());
- try testing.expectEqual(Cqe{
- .user_data = 0xcccccccc,
- .res = buffer.len,
- .flags = .{},
- }, try ring.copy_cqe());
- try testing.expectEqualSlices(u8, &([_]u8{0} ** buffer.len), buffer[0..]);
- }
-
- // Test with a non-zero offset
- registered_fds[fd_index] = -1;
- registered_fds[fd_index2] = -1;
- try ring.register_files_update(1, registered_fds[1..]);
-
- {
- // Next read should still work since fd_index in the registered file descriptors hasn't been updated yet.
- const sqe = try ring.read(0xcccccccc, fd_index, .{ .buffer = &buffer }, 0);
- try testing.expectEqual(Op.READ, sqe.opcode);
- sqe.flags.FIXED_FILE = true;
+ for (registered_fds) |_| {
+ // connect
+ const client = try posix.socket(address.family, posix.SOCK.STREAM | posix.SOCK.CLOEXEC, 0);
+ try posix.connect(client, addrAny(&address), @sizeOf(linux.sockaddr.in));
+ defer posix.close(client);
- try testing.expectEqual(@as(u32, 1), try ring.submit());
- try testing.expectEqual(Cqe{
- .user_data = 0xcccccccc,
- .res = buffer.len,
- .flags = .{},
- }, try ring.copy_cqe());
- try testing.expectEqualSlices(u8, &([_]u8{0} ** buffer.len), buffer[0..]);
+ // accept completion
+ const cqe_accept = try ring.copy_cqe();
+ const fd_index = cqe_accept.res;
+ try testing.expect(fd_index < registered_fds.len);
+ try testing.expect(cqe_accept.user_data == accept_userdata);
+ try testing.expect(cqe_accept.flags.F_MORE); // has more is set
+ }
+ // No more available fds, accept will get NFILE error.
+ // Multishot is terminated (more flag is not set).
+ {
+ // connect
+ const client = try posix.socket(address.family, posix.SOCK.STREAM | posix.SOCK.CLOEXEC, 0);
+ try posix.connect(client, addrAny(&address), @sizeOf(linux.sockaddr.in));
+ defer posix.close(client);
+ // completion with error
+ const cqe_accept = try ring.copy_cqe();
+ try testing.expect(cqe_accept.user_data == accept_userdata);
+ try testing.expectEqual(posix.E.NFILE, cqe_accept.err());
+ try testing.expect(!cqe_accept.flags.F_MORE); // has more is not set
+ }
+ // return file descriptors to kernel
+ try ring.register_files_update(0, registered_fds[0..]);
+ }
+ try ring.unregister_files();
+}
 
- try ring.register_files_update(0, registered_fds[0..]);
+test "socket" {
+ try skipKernelLessThan(.{ .major = 5, .minor = 19, .patch = 0 });
 
- {
- // Now this should fail since both fds are sparse (-1)
- const sqe = try ring.read(0xcccccccc, fd_index, .{ .buffer = &buffer }, 0);
- try testing.expectEqual(Op.READ, sqe.opcode);
- sqe.flags.FIXED_FILE = true;
+ var ring = IoUring.init(1, .{}) catch |err| switch (err) {
+ error.SystemOutdated => return error.SkipZigTest,
+ error.PermissionDenied => return error.SkipZigTest,
+ else => return err,
+ };
+ defer ring.deinit();
 
- try testing.expectEqual(@as(u32, 1), try ring.submit());
- const cqe = try ring.copy_cqe();
- try testing.expectEqual(linux.E.BADF, cqe.err());
- }
+ // prepare, submit socket operation
+ _ = try ring.socket(0, linux.AF.INET, posix.SOCK.STREAM, 0, 0);
+ try testing.expectEqual(@as(u32, 1), try ring.submit());
 
- try ring.unregister_files();
+ // test completion
+ const cqe = try ring.copy_cqe();
+ try testing.expectEqual(posix.E.SUCCESS, cqe.err());
+ const fd: linux.fd_t = @intCast(cqe.res);
+ try testing.expect(fd > 2);
+
+ posix.close(fd);
 }
 
-test "shutdown" {
- if (!is_linux) return error.SkipZigTest;
+test "socket_direct/socket_direct_alloc/close_direct" {
+ try 
skipKernelLessThan(.{ .major = 5, .minor = 19, .patch = 0 }); - var ring = IoUring.init(16, .{}) catch |err| switch (err) { + var ring = IoUring.init(2, .{}) catch |err| switch (err) { error.SystemOutdated => return error.SkipZigTest, error.PermissionDenied => return error.SkipZigTest, else => return err, }; defer ring.deinit(); + var registered_fds = [_]linux.fd_t{-1} ** 3; + try ring.register_files(registered_fds[0..]); + + // create socket in registered file descriptor at index 0 (last param) + _ = try ring.socket_direct(0, linux.AF.INET, posix.SOCK.STREAM, 0, 0, 0); + try testing.expectEqual(@as(u32, 1), try ring.submit()); + var cqe_socket = try ring.copy_cqe(); + try testing.expectEqual(posix.E.SUCCESS, cqe_socket.err()); + try testing.expect(cqe_socket.res == 0); + + // create socket in registered file descriptor at index 1 (last param) + _ = try ring.socket_direct(0, linux.AF.INET, posix.SOCK.STREAM, 0, 0, 1); + try testing.expectEqual(@as(u32, 1), try ring.submit()); + cqe_socket = try ring.copy_cqe(); + try testing.expectEqual(posix.E.SUCCESS, cqe_socket.err()); + try testing.expect(cqe_socket.res == 0); // res is 0 when index is specified + + // create socket in kernel chosen file descriptor index (_alloc version) + // completion res has index from registered files + _ = try ring.socket_direct_alloc(0, linux.AF.INET, posix.SOCK.STREAM, 0, 0); + try testing.expectEqual(@as(u32, 1), try ring.submit()); + cqe_socket = try ring.copy_cqe(); + try testing.expectEqual(posix.E.SUCCESS, cqe_socket.err()); + try testing.expect(cqe_socket.res == 2); // returns registered file index + + // use sockets from registered_fds in connect operation var address: linux.sockaddr.in = .{ .port = 0, .addr = @bitCast([4]u8{ 127, 0, 0, 1 }), }; + const listener_socket = try createListenerSocket(&address); + defer posix.close(listener_socket); + const accept_userdata: u64 = 0xaaaaaaaa; + const connect_userdata: u64 = 0xbbbbbbbb; + const close_userdata: u64 = 0xcccccccc; + for (registered_fds, 0..) 
|_, fd_index| { + // prepare accept + _ = try ring.accept(accept_userdata, listener_socket, null, null, 0); + // prepare connect with fixed socket + const connect_sqe = try ring.connect(connect_userdata, @intCast(fd_index), addrAny(&address), @sizeOf(linux.sockaddr.in)); + connect_sqe.flags.FIXED_FILE = true; // fd is fixed file index + // submit both + try testing.expectEqual(@as(u32, 2), try ring.submit()); + // get completions + var cqe_connect = try ring.copy_cqe(); + var cqe_accept = try ring.copy_cqe(); + // ignore order + if (cqe_connect.user_data == accept_userdata and cqe_accept.user_data == connect_userdata) { + const a = cqe_accept; + const b = cqe_connect; + cqe_accept = b; + cqe_connect = a; + } + // test connect completion + try testing.expect(cqe_connect.user_data == connect_userdata); + try testing.expectEqual(posix.E.SUCCESS, cqe_connect.err()); + // test accept completion + try testing.expect(cqe_accept.user_data == accept_userdata); + try testing.expectEqual(posix.E.SUCCESS, cqe_accept.err()); - // Socket bound, expect shutdown to work - { - const server = try posix.socket(address.family, posix.SOCK.STREAM | posix.SOCK.CLOEXEC, 0); - defer posix.close(server); - try posix.setsockopt(server, posix.SOL.SOCKET, posix.SO.REUSEADDR, &mem.toBytes(@as(c_int, 1))); - try posix.bind(server, addrAny(&address), @sizeOf(linux.sockaddr.in)); - try posix.listen(server, 1); + // submit and test close_direct + _ = try ring.close_direct(close_userdata, @intCast(fd_index)); + try testing.expectEqual(@as(u32, 1), try ring.submit()); + var cqe_close = try ring.copy_cqe(); + try testing.expect(cqe_close.user_data == close_userdata); + try testing.expectEqual(posix.E.SUCCESS, cqe_close.err()); + } - // set address to the OS-chosen IP/port. - var slen: posix.socklen_t = @sizeOf(linux.sockaddr.in); - try posix.getsockname(server, addrAny(&address), &slen); + try ring.unregister_files(); +} - const shutdown_sqe = try ring.shutdown(0x445445445, server, linux.SHUT.RD); - try testing.expectEqual(Op.SHUTDOWN, shutdown_sqe.opcode); - try testing.expectEqual(@as(i32, server), shutdown_sqe.fd); +test "openat_direct/close_direct" { + try skipKernelLessThan(.{ .major = 5, .minor = 19, .patch = 0 }); - try testing.expectEqual(@as(u32, 1), try ring.submit()); + var ring = IoUring.init(2, .{}) catch |err| switch (err) { + error.SystemOutdated => return error.SkipZigTest, + error.PermissionDenied => return error.SkipZigTest, + else => return err, + }; + defer ring.deinit(); - const cqe = try ring.copy_cqe(); - switch (cqe.err()) { - .SUCCESS => {}, - // This kernel's io_uring does not yet implement shutdown (kernel version < 5.11) - .INVAL => return error.SkipZigTest, - else => |errno| std.debug.panic("unhandled errno: {}", .{errno}), - } + var registered_fds = [_]linux.fd_t{-1} ** 3; + try ring.register_files(registered_fds[0..]); - try testing.expectEqual(Cqe{ - .user_data = 0x445445445, - .res = 0, - .flags = 0, - }, cqe); - } + var tmp = std.testing.tmpDir(.{}); + defer tmp.cleanup(); + const path = "test_io_uring_close_direct"; + const flags: linux.O = .{ .ACCMODE = .RDWR, .CREAT = true }; + const mode: posix.mode_t = 0o666; + const user_data: u64 = 0; - // Socket not bound, expect to fail with ENOTCONN - { - const server = try posix.socket(address.family, posix.SOCK.STREAM | posix.SOCK.CLOEXEC, 0); - defer posix.close(server); + // use registered file at index 0 (last param) + _ = try ring.openat_direct(user_data, tmp.dir.fd, path, flags, mode, 0); + try testing.expectEqual(@as(u32, 1), try 
ring.submit()); + var cqe = try ring.copy_cqe(); + try testing.expectEqual(posix.E.SUCCESS, cqe.err()); + try testing.expect(cqe.res == 0); + + // use registered file at index 1 + _ = try ring.openat_direct(user_data, tmp.dir.fd, path, flags, mode, 1); + try testing.expectEqual(@as(u32, 1), try ring.submit()); + cqe = try ring.copy_cqe(); + try testing.expectEqual(posix.E.SUCCESS, cqe.err()); + try testing.expect(cqe.res == 0); // res is 0 when we specify index - const shutdown_sqe = ring.shutdown(0x445445445, server, linux.SHUT.RD) catch |err| switch (err) { - else => |errno| std.debug.panic("unhandled errno: {}", .{errno}), - }; - try testing.expectEqual(Op.SHUTDOWN, shutdown_sqe.opcode); - try testing.expectEqual(@as(i32, server), shutdown_sqe.fd); + // let kernel choose registered file index + _ = try ring.openat_direct(user_data, tmp.dir.fd, path, flags, mode, constants.FILE_INDEX_ALLOC); + try testing.expectEqual(@as(u32, 1), try ring.submit()); + cqe = try ring.copy_cqe(); + try testing.expectEqual(posix.E.SUCCESS, cqe.err()); + try testing.expect(cqe.res == 2); // chosen index is in res + // close all open file descriptors + for (registered_fds, 0..) |_, fd_index| { + _ = try ring.close_direct(user_data, @intCast(fd_index)); try testing.expectEqual(@as(u32, 1), try ring.submit()); - - const cqe = try ring.copy_cqe(); - try testing.expectEqual(@as(u64, 0x445445445), cqe.user_data); - try testing.expectEqual(linux.E.NOTCONN, cqe.err()); + var cqe_close = try ring.copy_cqe(); + try testing.expectEqual(posix.E.SUCCESS, cqe_close.err()); } + try ring.unregister_files(); } -test "renameat" { - if (!is_linux) return error.SkipZigTest; +test "waitid" { + try skipKernelLessThan(.{ .major = 6, .minor = 7, .patch = 0 }); - var ring = IoUring.init(1, .{}) catch |err| switch (err) { + var ring = IoUring.init(16, .{}) catch |err| switch (err) { error.SystemOutdated => return error.SkipZigTest, error.PermissionDenied => return error.SkipZigTest, else => return err, }; defer ring.deinit(); - const old_path = "test_io_uring_renameat_old"; - const new_path = "test_io_uring_renameat_new"; - - var tmp = std.testing.tmpDir(.{}); - defer tmp.cleanup(); + const pid = try posix.fork(); + if (pid == 0) { + posix.exit(7); + } - // Write old file with data + var siginfo: posix.siginfo_t = undefined; + _ = try ring.waitid(0, .PID, pid, &siginfo, posix.W.EXITED, 0); - const old_file = try tmp.dir.createFile(old_path, .{ .truncate = true, .mode = 0o666 }); - defer old_file.close(); - try old_file.writeAll("hello"); + try testing.expectEqual(1, try ring.submit()); - // Submit renameat + const cqe_waitid = try ring.copy_cqe(); + try testing.expectEqual(0, cqe_waitid.res); + try testing.expectEqual(pid, siginfo.fields.common.first.piduid.pid); + try testing.expectEqual(7, siginfo.fields.common.second.sigchld.status); +} - const sqe = try ring.renameat( - 0x12121212, - tmp.dir.fd, - old_path, - tmp.dir.fd, - new_path, - 0, - ); - try testing.expectEqual(Op.RENAMEAT, sqe.opcode); - try testing.expectEqual(@as(i32, tmp.dir.fd), sqe.fd); - try testing.expectEqual(@as(i32, tmp.dir.fd), @as(i32, @bitCast(sqe.len))); - try testing.expectEqual(@as(u32, 1), try ring.submit());
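+// Illustrative usage (added note, not part of the original patch): tests gate
+// themselves on kernel support with the helper below, e.g.
+//   try skipKernelLessThan(.{ .major = 5, .minor = 19, .patch = 0 });
+/// For use in tests. Returns SkipZigTest if kernel version is less than required. 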
+inline fn skipKernelLessThan(required: std.SemanticVersion) !void { + if (!is_linux) return error.SkipZigTest; - const cqe = try ring.copy_cqe(); - switch (cqe.err()) { + var uts: linux.utsname = undefined; + const res = linux.uname(&uts); + switch (linux.errno(res)) { .SUCCESS => {}, - // This kernel's io_uring does not yet implement renameat (kernel version < 5.11) - .BADF, .INVAL => return error.SkipZigTest, - else => |errno| std.debug.panic("unhandled errno: {}", .{errno}), + else => |errno| return posix.unexpectedErrno(errno), } - try testing.expectEqual(Cqe{ - .user_data = 0x12121212, - .res = 0, - .flags = 0, - }, cqe); - // Validate that the old file doesn't exist anymore - try testing.expectError(error.FileNotFound, tmp.dir.openFile(old_path, .{})); + const release = mem.sliceTo(&uts.release, 0); + // Strip any trailing extra, as the kernel version might not be semver compliant, e.g. "6.8.9-300.fc40.x86_64" + const extra_index = std.mem.indexOfAny(u8, release, "-+"); + const stripped = release[0..(extra_index orelse release.len)]; + // Make sure the input doesn't rely on the extra we just stripped + try testing.expect(required.pre == null and required.build == null); - // Validate that the new file exists with the proper content - var new_file_data: [16]u8 = undefined; - try testing.expectEqualStrings("hello", try tmp.dir.readFile(new_path, &new_file_data)); + var current = try std.SemanticVersion.parse(stripped); + current.pre = null; // don't check pre field + if (required.order(current) == .gt) return error.SkipZigTest; } -test "unlinkat" { +test BufferGroup { if (!is_linux) return error.SkipZigTest; - var ring = IoUring.init(1, .{}) catch |err| switch (err) { + // Init IoUring + var ring = IoUring.init(16, .{}) catch |err| switch (err) { error.SystemOutdated => return error.SkipZigTest, error.PermissionDenied => return error.SkipZigTest, else => return err, }; defer ring.deinit(); - const path = "test_io_uring_unlinkat"; - - var tmp = std.testing.tmpDir(.{}); - defer tmp.cleanup(); + // Init buffer group for ring + const group_id: u16 = 1; // buffers group id + const buffers_count: u16 = 1; // number of buffers in buffer group + const buffer_size: usize = 128; // size of each buffer in group + var buf_grp = BufferGroup.init( + &ring, + testing.allocator, + group_id, + buffer_size, + buffers_count, + ) catch |err| switch (err) { + // kernel older than 5.19 + error.ArgumentsInvalid => return error.SkipZigTest, + else => return err, + }; + defer buf_grp.deinit(testing.allocator); - // Write old file with data + // Create client/server fds + const fds = try createSocketTestHarness(&ring); + defer fds.close(); + const data = [_]u8{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0xa, 0xb, 0xc, 0xd, 0xe }; - const file = try tmp.dir.createFile(path, .{ .truncate = true, .mode = 0o666 }); - defer file.close(); + // Client sends data + { + _ = try ring.send(1, fds.client, data[0..], 0); + const submitted = try ring.submit(); + try testing.expectEqual(1, submitted); + const cqe_send = try ring.copy_cqe(); + if (cqe_send.err() == .INVAL) return error.SkipZigTest; + try testing.expectEqual(Cqe{ .user_data = 1, .res = data.len, .flags = .{} }, cqe_send); + } - // Submit unlinkat + // Server uses buffer group receive + { + // Submit recv operation, buffer will be chosen from buffer group + _ = try buf_grp.recv(2, fds.server, 0); + const submitted = try ring.submit(); + try testing.expectEqual(1, submitted); - const sqe = try ring.unlinkat( - 0x12121212, - tmp.dir.fd, - path, - 0, - ); - try 
testing.expectEqual(Op.UNLINKAT, sqe.opcode); - try testing.expectEqual(@as(i32, tmp.dir.fd), sqe.fd); - try testing.expectEqual(@as(u32, 1), try ring.submit()); + // ... when we have completion for recv operation + const cqe = try ring.copy_cqe(); + try testing.expectEqual(2, cqe.user_data); // matches submitted user_data + try testing.expect(cqe.res >= 0); // success + try testing.expectEqual(posix.E.SUCCESS, cqe.err()); + try testing.expectEqual(data.len, @as(usize, @intCast(cqe.res))); // cqe.res holds received data len - const cqe = try ring.copy_cqe(); - switch (cqe.err()) { - .SUCCESS => {}, - // This kernel's io_uring does not yet implement unlinkat (kernel version < 5.11) - .BADF, .INVAL => return error.SkipZigTest, - else => |errno| std.debug.panic("unhandled errno: {}", .{errno}), + // Get buffer from pool + const buf = try buf_grp.get(cqe); + try testing.expectEqualSlices(u8, &data, buf); + // Release buffer to the kernel when application is done with it + try buf_grp.put(cqe); } - try testing.expectEqual(Cqe{ - .user_data = 0x12121212, - .res = 0, - .flags = 0, - }, cqe); - - // Validate that the file doesn't exist anymore - _ = tmp.dir.openFile(path, .{}) catch |err| switch (err) { - error.FileNotFound => {}, - else => std.debug.panic("unexpected error: {}", .{err}), - }; } -test "mkdirat" { +test "ring mapped buffers recv" { if (!is_linux) return error.SkipZigTest; - var ring = IoUring.init(1, .{}) catch |err| switch (err) { + var ring = IoUring.init(16, .{}) catch |err| switch (err) { error.SystemOutdated => return error.SkipZigTest, error.PermissionDenied => return error.SkipZigTest, else => return err, }; defer ring.deinit(); - var tmp = std.testing.tmpDir(.{}); - defer tmp.cleanup(); + // init buffer group + const group_id: u16 = 1; // buffers group id + const buffers_count: u16 = 2; // number of buffers in buffer group + const buffer_size: usize = 4; // size of each buffer in group + var buf_grp = BufferGroup.init( + &ring, + testing.allocator, + group_id, + buffer_size, + buffers_count, + ) catch |err| switch (err) { + // kernel older than 5.19 + error.ArgumentsInvalid => return error.SkipZigTest, + else => return err, + }; + defer buf_grp.deinit(testing.allocator); - const path = "test_io_uring_mkdirat"; + // create client/server fds + const fds = try createSocketTestHarness(&ring); + defer fds.close(); - // Submit mkdirat + // for random user_data in sqe/cqe + var Rnd = std.Random.DefaultPrng.init(std.testing.random_seed); + var rnd = Rnd.random(); - const sqe = try ring.mkdirat( - 0x12121212, - tmp.dir.fd, - path, - 0o0755, - ); - try testing.expectEqual(Op.MKDIRAT, sqe.opcode); - try testing.expectEqual(@as(i32, tmp.dir.fd), sqe.fd); - try testing.expectEqual(@as(u32, 1), try ring.submit()); + var round: usize = 4; // repeat send/recv cycle round times + while (round > 0) : (round -= 1) { + // client sends data + const data = [_]u8{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0xa, 0xb, 0xc, 0xd, 0xe }; + { + const user_data = rnd.int(u64); + _ = try ring.send(user_data, fds.client, data[0..], 0); + try testing.expectEqual(@as(u32, 1), try ring.submit()); + const cqe_send = try ring.copy_cqe(); + if (cqe_send.err() == .INVAL) return error.SkipZigTest; + try testing.expectEqual(Cqe{ .user_data = user_data, .res = data.len, .flags = .{} }, cqe_send); + } + var pos: usize = 0; + + // read first chunk + const cqe1 = try buf_grp_recv_submit_get_cqe(&ring, &buf_grp, fds.server, rnd.int(u64)); + var buf = try buf_grp.get(cqe1); + try testing.expectEqualSlices(u8, data[pos..][0..buf.len], 
buf); + pos += buf.len; + // second chunk + const cqe2 = try buf_grp_recv_submit_get_cqe(&ring, &buf_grp, fds.server, rnd.int(u64)); + buf = try buf_grp.get(cqe2); + try testing.expectEqualSlices(u8, data[pos..][0..buf.len], buf); + pos += buf.len; + + // both buffers provided to the kernel are used so we get error + // 'no more buffers', until we put buffers to the kernel + { + const user_data = rnd.int(u64); + _ = try buf_grp.recv(user_data, fds.server, 0); + try testing.expectEqual(@as(u32, 1), try ring.submit()); + const cqe = try ring.copy_cqe(); + try testing.expectEqual(user_data, cqe.user_data); + try testing.expect(cqe.res < 0); // fail + try testing.expectEqual(posix.E.NOBUFS, cqe.err()); + try testing.expect(!cqe.flags.F_BUFFER); // IORING_CQE_F_BUFFER flags is set on success only + try testing.expectError(error.NoBufferSelected, cqe.buffer_id()); + } - const cqe = try ring.copy_cqe(); - switch (cqe.err()) { - .SUCCESS => {}, - // This kernel's io_uring does not yet implement mkdirat (kernel version < 5.15) - .BADF, .INVAL => return error.SkipZigTest, - else => |errno| std.debug.panic("unhandled errno: {}", .{errno}), - } - try testing.expectEqual(Cqe{ - .user_data = 0x12121212, - .res = 0, - .flags = .{}, - }, cqe); + // put buffers back to the kernel + try buf_grp.put(cqe1); + try buf_grp.put(cqe2); - // Validate that the directory exist - _ = try tmp.dir.openDir(path, .{}); + // read remaining data + while (pos < data.len) { + const cqe = try buf_grp_recv_submit_get_cqe(&ring, &buf_grp, fds.server, rnd.int(u64)); + buf = try buf_grp.get(cqe); + try testing.expectEqualSlices(u8, data[pos..][0..buf.len], buf); + pos += buf.len; + try buf_grp.put(cqe); + } + } } -test "symlinkat" { +test "ring mapped buffers multishot recv" { if (!is_linux) return error.SkipZigTest; - var ring = IoUring.init(1, .{}) catch |err| switch (err) { + var ring = IoUring.init(16, .{}) catch |err| switch (err) { error.SystemOutdated => return error.SkipZigTest, error.PermissionDenied => return error.SkipZigTest, else => return err, }; defer ring.deinit(); - var tmp = std.testing.tmpDir(.{}); - defer tmp.cleanup(); + // init buffer group + const group_id: u16 = 1; // buffers group id + const buffers_count: u16 = 2; // number of buffers in buffer group + const buffer_size: usize = 4; // size of each buffer in group + var buf_grp = BufferGroup.init( + &ring, + testing.allocator, + group_id, + buffer_size, + buffers_count, + ) catch |err| switch (err) { + // kernel older than 5.19 + error.ArgumentsInvalid => return error.SkipZigTest, + else => return err, + }; + defer buf_grp.deinit(testing.allocator); - const path = "test_io_uring_symlinkat"; - const link_path = "test_io_uring_symlinkat_link"; + // create client/server fds + const fds = try createSocketTestHarness(&ring); + defer fds.close(); - const file = try tmp.dir.createFile(path, .{ .truncate = true, .mode = 0o666 }); - defer file.close(); + // for random user_data in sqe/cqe + var Rnd = std.Random.DefaultPrng.init(std.testing.random_seed); + var rnd = Rnd.random(); - // Submit symlinkat + var round: usize = 4; // repeat send/recv cycle round times + while (round > 0) : (round -= 1) { + // client sends data + const data = [_]u8{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf }; + { + const user_data = rnd.int(u64); + _ = try ring.send(user_data, fds.client, data[0..], 0); + try testing.expectEqual(@as(u32, 1), try ring.submit()); + const cqe_send = try ring.copy_cqe(); + if (cqe_send.err() == .INVAL) return error.SkipZigTest; + try 
testing.expectEqual(Cqe{ .user_data = user_data, .res = data.len, .flags = .{} }, cqe_send); + } - const sqe = try ring.symlinkat( - 0x12121212, - path, - tmp.dir.fd, - link_path, - ); - try testing.expectEqual(Op.SYMLINKAT, sqe.opcode); - try testing.expectEqual(@as(i32, tmp.dir.fd), sqe.fd); - try testing.expectEqual(@as(u32, 1), try ring.submit()); + // start multishot recv + var recv_user_data = rnd.int(u64); + _ = try buf_grp.recv_multishot(recv_user_data, fds.server, 0); + try testing.expectEqual(@as(u32, 1), try ring.submit()); // submit - const cqe = try ring.copy_cqe(); - switch (cqe.err()) { - .SUCCESS => {}, - // This kernel's io_uring does not yet implement symlinkat (kernel version < 5.15) - .BADF, .INVAL => return error.SkipZigTest, - else => |errno| std.debug.panic("unhandled errno: {}", .{errno}), - } - try testing.expectEqual(Cqe{ - .user_data = 0x12121212, - .res = 0, - .flags = .{}, - }, cqe); + // server reads data into provided buffers + // there are 2 buffers of size 4, so each read gets only a chunk of the data + // we read four chunks of 4, 4, 4, 4 bytes each + var chunk: []const u8 = data[0..buffer_size]; // first chunk + const cqe1 = try expect_buf_grp_cqe(&ring, &buf_grp, recv_user_data, chunk); + try testing.expect(cqe1.flags.F_MORE); - // Validate that the symlink exist - _ = try tmp.dir.openFile(link_path, .{}); -} + chunk = data[buffer_size .. buffer_size * 2]; // second chunk + const cqe2 = try expect_buf_grp_cqe(&ring, &buf_grp, recv_user_data, chunk); + try testing.expect(cqe2.flags.F_MORE); -test "linkat" { - if (!is_linux) return error.SkipZigTest; + // both buffers provided to the kernel are used so we get error + // 'no more buffers', until we put buffers to the kernel + { + const cqe = try ring.copy_cqe(); + try testing.expectEqual(recv_user_data, cqe.user_data); + try testing.expect(cqe.res < 0); // fail + try testing.expectEqual(posix.E.NOBUFS, cqe.err()); + try testing.expect(!cqe.flags.F_BUFFER); // IORING_CQE_F_BUFFER flag is set on success only + // has more is not set + // indicates that multishot is finished + try testing.expect(!cqe.flags.F_MORE); + try testing.expectError(error.NoBufferSelected, cqe.buffer_id()); + } - var ring = IoUring.init(1, .{}) catch |err| switch (err) { - error.SystemOutdated => return error.SkipZigTest, - error.PermissionDenied => return error.SkipZigTest, - else => return err, - }; - defer ring.deinit(); + // put buffers back to the kernel + try buf_grp.put(cqe1); + try buf_grp.put(cqe2); - var tmp = std.testing.tmpDir(.{}); - defer tmp.cleanup(); + // restart multishot + recv_user_data = rnd.int(u64); + _ = try buf_grp.recv_multishot(recv_user_data, fds.server, 0); + try testing.expectEqual(@as(u32, 1), try ring.submit()); // submit - const first_path = "test_io_uring_linkat_first"; - const second_path = "test_io_uring_linkat_second"; + chunk = data[buffer_size * 2 .. 
buffer_size * 3]; // third chunk + const cqe3 = try expect_buf_grp_cqe(&ring, &buf_grp, recv_user_data, chunk); + try testing.expect(cqe3.flags.F_MORE); + try buf_grp.put(cqe3); - // Write file with data + chunk = data[buffer_size * 3 ..]; // last chunk + const cqe4 = try expect_buf_grp_cqe(&ring, &buf_grp, recv_user_data, chunk); + try testing.expect(cqe4.flags.F_MORE); + try buf_grp.put(cqe4); - const first_file = try tmp.dir.createFile(first_path, .{ .truncate = true, .mode = 0o666 }); - defer first_file.close(); - try first_file.writeAll("hello"); + // cancel pending multishot recv operation + { + const cancel_user_data = rnd.int(u64); + _ = try ring.cancel(cancel_user_data, recv_user_data, 0); + try testing.expectEqual(@as(u32, 1), try ring.submit()); - // Submit linkat + // expect completion of cancel operation and completion of recv operation + var cqe_cancel = try ring.copy_cqe(); + if (cqe_cancel.err() == .INVAL) return error.SkipZigTest; + var cqe_recv = try ring.copy_cqe(); + if (cqe_recv.err() == .INVAL) return error.SkipZigTest; - const sqe = try ring.linkat( - 0x12121212, - tmp.dir.fd, - first_path, - tmp.dir.fd, - second_path, - 0, - ); - try testing.expectEqual(Op.LINKAT, sqe.opcode); - try testing.expectEqual(@as(i32, tmp.dir.fd), sqe.fd); - try testing.expectEqual(@as(i32, tmp.dir.fd), @as(i32, @bitCast(sqe.len))); - try testing.expectEqual(@as(u32, 1), try ring.submit()); + // don't depend on order of completions + if (cqe_cancel.user_data == recv_user_data and cqe_recv.user_data == cancel_user_data) { + const a = cqe_cancel; + const b = cqe_recv; + cqe_cancel = b; + cqe_recv = a; + } - const cqe = try ring.copy_cqe(); - switch (cqe.err()) { - .SUCCESS => {}, - // This kernel's io_uring does not yet implement linkat (kernel version < 5.15) - .BADF, .INVAL => return error.SkipZigTest, - else => |errno| std.debug.panic("unhandled errno: {}", .{errno}), + // Note on different kernel results: + // on older kernel (tested with v6.0.16, v6.1.57, v6.2.12, v6.4.16) + // cqe_cancel.err() == .NOENT + // cqe_recv.err() == .NOBUFS + // on kernel (tested with v6.5.0, v6.5.7) + // cqe_cancel.err() == .SUCCESS + // cqe_recv.err() == .CANCELED + // Upstream reference: https://github.com/axboe/liburing/issues/984 + + // cancel operation is success (or NOENT on older kernels) + try testing.expectEqual(cancel_user_data, cqe_cancel.user_data); + try testing.expect(cqe_cancel.err() == .NOENT or cqe_cancel.err() == .SUCCESS); + + // recv operation is failed with err CANCELED (or NOBUFS on older kernels) + try testing.expectEqual(recv_user_data, cqe_recv.user_data); + try testing.expect(cqe_recv.res < 0); + try testing.expect(cqe_recv.err() == .NOBUFS or cqe_recv.err() == .CANCELED); + try testing.expect(!cqe_recv.flags.F_MORE); + } } - try testing.expectEqual(Cqe{ - .user_data = 0x12121212, - .res = 0, - .flags = 0, - }, cqe); +} + +// Prepare, submit recv and get cqe using buffer group. 
+fn buf_grp_recv_submit_get_cqe( + ring: *IoUring, + buf_grp: *BufferGroup, + fd: linux.fd_t, + user_data: u64, +) !Cqe { + // prepare and submit recv + const sqe = try buf_grp.recv(user_data, fd, 0); + try testing.expect(sqe.flags.BUFFER_SELECT); + try testing.expect(sqe.buf_index == buf_grp.group_id); + try testing.expectEqual(@as(u32, 1), try ring.submit()); // submit + // get cqe, expect success + const cqe = try ring.copy_cqe(); + try testing.expectEqual(user_data, cqe.user_data); + try testing.expect(cqe.res >= 0); // success + try testing.expectEqual(posix.E.SUCCESS, cqe.err()); + try testing.expect(cqe.flags.F_BUFFER); // IORING_CQE_F_BUFFER flag is set + + return cqe; +} + +fn expect_buf_grp_cqe( + ring: *IoUring, + buf_grp: *BufferGroup, + user_data: u64, + expected: []const u8, +) !Cqe { + // get cqe + const cqe = try ring.copy_cqe(); + try testing.expectEqual(user_data, cqe.user_data); + try testing.expect(cqe.res >= 0); // success + try testing.expect(cqe.flags.F_BUFFER); // IORING_CQE_F_BUFFER flag is set + try testing.expectEqual(expected.len, @as(usize, @intCast(cqe.res))); + try testing.expectEqual(posix.E.SUCCESS, cqe.err()); + + // get buffer from pool + const buffer_id = try cqe.buffer_id(); + const len = @as(usize, @intCast(cqe.res)); + const buf = buf_grp.get_by_id(buffer_id)[0..len]; + try testing.expectEqualSlices(u8, expected, buf); - // Validate the second file - var second_file_data: [16]u8 = undefined; - try testing.expectEqualStrings("hello", try tmp.dir.readFile(second_path, &second_file_data)); + return cqe; } -test "provide_buffers: read" { +test "copy_cqes with wrapping cq.cqes buffer" { if (!is_linux) return error.SkipZigTest; - var ring = IoUring.init(1, .{}) catch |err| switch (err) { + var ring = IoUring.init(2, .{}) catch |err| switch (err) { error.SystemOutdated => return error.SkipZigTest, error.PermissionDenied => return error.SkipZigTest, else => return err, }; defer ring.deinit(); - const fd = try posix.openZ("/dev/zero", .{ .ACCMODE = .RDONLY, .CLOEXEC = true }, 0); - defer posix.close(fd); - - const group_id = 1337; - const buffer_id = 0; - - const buffer_len = 128; - - var buffers: [4][buffer_len]u8 = undefined; - - // Provide 4 buffers + try testing.expectEqual(2, ring.sq.sqes.len); + try testing.expectEqual(4, ring.cq.cqes.len); + // submit 2 entries, receive 2 completions + var cqes: [8]Cqe = undefined; { - const sqe = try ring.provide_buffers(0xcccccccc, @as([*]u8, @ptrCast(&buffers)), buffer_len, buffers.len, group_id, buffer_id); - try testing.expectEqual(Op.PROVIDE_BUFFERS, sqe.opcode); - try testing.expectEqual(@as(i32, buffers.len), sqe.fd); - try testing.expectEqual(@as(u32, buffers[0].len), sqe.len); - try testing.expectEqual(@as(u16, group_id), sqe.buf_index); - try testing.expectEqual(@as(u32, 1), try ring.submit()); - - const cqe = try ring.copy_cqe(); - switch (cqe.err()) { - // Happens when the kernel is < 5.7 - .INVAL, .BADF => return error.SkipZigTest, - .SUCCESS => {}, - else => |errno| std.debug.panic("unhandled errno: {}", .{errno}), - } - try testing.expectEqual(@as(u64, 0xcccccccc), cqe.user_data); - } - - // Do 4 reads which should consume all buffers - - var i: usize = 0; - while (i < buffers.len) : (i += 1) { - const sqe = try ring.read(0xdededede, fd, .{ .buffer_selection = .{ .group_id = group_id, .len = buffer_len } }, 0); - try testing.expectEqual(Op.READ, sqe.opcode); - try testing.expectEqual(@as(i32, fd), sqe.fd); - try testing.expectEqual(@as(u64, 0), sqe.addr); - try testing.expectEqual(@as(u32, buffer_len), 
sqe.len); - try testing.expectEqual(@as(u16, group_id), sqe.buf_index); - try testing.expectEqual(@as(u32, 1), try ring.submit()); - - const cqe = try ring.copy_cqe(); - switch (cqe.err()) { - .SUCCESS => {}, - else => |errno| std.debug.panic("unhandled errno: {}", .{errno}), + for (0..2) |_| { + const sqe = try ring.get_sqe(); + sqe.prep_timeout(&.{ .sec = 0, .nsec = 10000 }, 0, .{}); + try testing.expect(try ring.submit() == 1); } - - try testing.expect(cqe.flags.F_BUFFER); - const used_buffer_id = @as(u32, @bitCast(cqe.flags)) >> 16; - try testing.expect(used_buffer_id >= 0 and used_buffer_id <= 3); - try testing.expectEqual(@as(i32, buffer_len), cqe.res); - - try testing.expectEqual(@as(u64, 0xdededede), cqe.user_data); - try testing.expectEqualSlices(u8, &([_]u8{0} ** buffer_len), buffers[used_buffer_id][0..@as(usize, @intCast(cqe.res))]); - } - - // This read should fail - - { - const sqe = try ring.read(0xdfdfdfdf, fd, .{ .buffer_selection = .{ .group_id = group_id, .len = buffer_len } }, 0); - try testing.expectEqual(Op.READ, sqe.opcode); - try testing.expectEqual(@as(i32, fd), sqe.fd); - try testing.expectEqual(@as(u64, 0), sqe.addr); - try testing.expectEqual(@as(u32, buffer_len), sqe.len); - try testing.expectEqual(@as(u16, group_id), sqe.buf_index); - try testing.expectEqual(@as(u32, 1), try ring.submit()); - - const cqe = try ring.copy_cqe(); - switch (cqe.err()) { - // Expected - .NOBUFS => {}, - .SUCCESS => std.debug.panic("unexpected success", .{}), - else => |errno| std.debug.panic("unhandled errno: {}", .{errno}), + var cqe_count: u32 = 0; + while (cqe_count < 2) { + cqe_count += try ring.copy_cqes(&cqes, 2 - cqe_count); } - try testing.expectEqual(@as(u64, 0xdfdfdfdf), cqe.user_data); } - // Provide 1 buffer again - - // Deliberately put something we don't expect in the buffers - @memset(mem.sliceAsBytes(&buffers), 42); - - const reprovided_buffer_id = 2; - - { - _ = try ring.provide_buffers(0xabababab, @as([*]u8, @ptrCast(&buffers[reprovided_buffer_id])), buffer_len, 1, group_id, reprovided_buffer_id); - try testing.expectEqual(@as(u32, 1), try ring.submit()); + try testing.expectEqual(2, ring.cq.head.*); - const cqe = try ring.copy_cqe(); - switch (cqe.err()) { - .SUCCESS => {}, - else => |errno| std.debug.panic("unhandled errno: {}", .{errno}), + // cq.cqes len is 4, starting at position 2 + // every 4 submitted entries wrap the completion buffer + // we are reading ring.cq.cqes at indexes 2,3,0,1 + for (1..1024) |i| { + for (0..4) |_| { + const sqe = try ring.get_sqe(); + sqe.prep_timeout(&.{ .sec = 0, .nsec = 10000 }, 0, .{}); + try testing.expect(try ring.submit() == 1); } - } - - // Final read which should work - - { - const sqe = try ring.read(0xdfdfdfdf, fd, .{ .buffer_selection = .{ .group_id = group_id, .len = buffer_len } }, 0); - try testing.expectEqual(Op.READ, sqe.opcode); - try testing.expectEqual(@as(i32, fd), sqe.fd); - try testing.expectEqual(@as(u64, 0), sqe.addr); - try testing.expectEqual(@as(u32, buffer_len), sqe.len); - try testing.expectEqual(@as(u16, group_id), sqe.buf_index); - try testing.expectEqual(@as(u32, 1), try ring.submit()); - - const cqe = try ring.copy_cqe(); - switch (cqe.err()) { - .SUCCESS => {}, - else => |errno| std.debug.panic("unhandled errno: {}", .{errno}), + var cqe_count: u32 = 0; + while (cqe_count < 4) { + cqe_count += try ring.copy_cqes(&cqes, 4 - cqe_count); } - - try testing.expect(cqe.flags.F_BUFFER); - const used_buffer_id = @as(u32, @bitCast(cqe.flags)) >> 16; - try testing.expectEqual(used_buffer_id, 
reprovided_buffer_id); - try testing.expectEqual(@as(i32, buffer_len), cqe.res); - try testing.expectEqual(@as(u64, 0xdfdfdfdf), cqe.user_data); - try testing.expectEqualSlices(u8, &([_]u8{0} ** buffer_len), buffers[used_buffer_id][0..@as(usize, @intCast(cqe.res))]); + try testing.expectEqual(4, cqe_count); + try testing.expectEqual(2 + 4 * i, ring.cq.head.*); } } -test "remove_buffers" { - if (!is_linux) return error.SkipZigTest; +test "bind/listen/connect" { + if (builtin.cpu.arch == .s390x) return error.SkipZigTest; // https://github.com/ziglang/zig/issues/25956 - var ring = IoUring.init(1, .{}) catch |err| switch (err) { + var ring = IoUring.init(4, .{}) catch |err| switch (err) { error.SystemOutdated => return error.SkipZigTest, error.PermissionDenied => return error.SkipZigTest, else => return err, }; defer ring.deinit(); - const fd = try posix.openZ("/dev/zero", .{ .ACCMODE = .RDONLY, .CLOEXEC = true }, 0); - defer posix.close(fd); + const probe = ring.get_probe() catch return error.SkipZigTest; + // LISTEN has the highest kernel version requirement of the operations used here + if (!probe.is_supported(.LISTEN)) return error.SkipZigTest; - const group_id = 1337; - const buffer_id = 0; + var addr: linux.sockaddr.in = .{ + .port = 0, + .addr = @bitCast([4]u8{ 127, 0, 0, 1 }), + }; + const proto: u32 = if (addr.family == linux.AF.UNIX) 0 else linux.IPPROTO.TCP; - const buffer_len = 128; + const listen_fd = brk: { + // Create socket + _ = try ring.socket(1, addr.family, linux.SOCK.STREAM | linux.SOCK.CLOEXEC, proto, 0); + try testing.expectEqual(1, try ring.submit()); + var cqe = try ring.copy_cqe(); + try testing.expectEqual(1, cqe.user_data); + try testing.expectEqual(posix.E.SUCCESS, cqe.err()); + const listen_fd: linux.fd_t = @intCast(cqe.res); + try testing.expect(listen_fd > 2); - var buffers: [4][buffer_len]u8 = undefined; + // Prepare: set socket option * 2, bind, listen + var optval: u32 = 1; + (try ring.setsockopt(2, listen_fd, linux.SOL.SOCKET, linux.SO.REUSEADDR, mem.asBytes(&optval))).link_next(); + (try ring.setsockopt(3, listen_fd, linux.SOL.SOCKET, linux.SO.REUSEPORT, mem.asBytes(&optval))).link_next(); + (try ring.bind(4, listen_fd, addrAny(&addr), @sizeOf(linux.sockaddr.in), 0)).link_next(); + _ = try ring.listen(5, listen_fd, 1, 0); + // Submit 4 operations + try testing.expectEqual(4, try ring.submit()); + // Expect all to succeed + for (2..6) |user_data| { + cqe = try ring.copy_cqe(); + try testing.expectEqual(user_data, cqe.user_data); + try testing.expectEqual(posix.E.SUCCESS, cqe.err()); + } - // Provide 4 buffers + // Check that socket option is set + optval = 0; + _ = try ring.getsockopt(5, listen_fd, linux.SOL.SOCKET, linux.SO.REUSEADDR, mem.asBytes(&optval)); + try testing.expectEqual(1, try ring.submit()); + cqe = try ring.copy_cqe(); + try testing.expectEqual(5, cqe.user_data); + try testing.expectEqual(posix.E.SUCCESS, cqe.err()); + try testing.expectEqual(1, optval); - { - _ = try ring.provide_buffers(0xcccccccc, @as([*]u8, @ptrCast(&buffers)), buffer_len, buffers.len, group_id, buffer_id); - try testing.expectEqual(@as(u32, 1), try ring.submit()); + // Read system assigned port into addr + var addr_len: posix.socklen_t = @sizeOf(linux.sockaddr.in); + try posix.getsockname(listen_fd, addrAny(&addr), &addr_len); + + break :brk listen_fd; + }; + const connect_fd = brk: { + // Create connect socket + _ = try ring.socket(6, addr.family, linux.SOCK.STREAM | linux.SOCK.CLOEXEC, proto, 0); + try testing.expectEqual(1, try ring.submit()); const cqe = try ring.copy_cqe(); - switch (cqe.err()) { - .INVAL, 
.BADF => return error.SkipZigTest, - .SUCCESS => {}, - else => |errno| std.debug.panic("unhandled errno: {}", .{errno}), + try testing.expectEqual(6, cqe.user_data); + try testing.expectEqual(posix.E.SUCCESS, cqe.err()); + // Get connect socket fd + const connect_fd: linux.fd_t = @intCast(cqe.res); + try testing.expect(connect_fd > 2 and connect_fd != listen_fd); + break :brk connect_fd; + }; + + // Prepare accept/connect operations + _ = try ring.accept(7, listen_fd, null, null, 0); + _ = try ring.connect(8, connect_fd, addrAny(&addr), @sizeOf(linux.sockaddr.in)); + try testing.expectEqual(2, try ring.submit()); + // Get listener accepted socket + var accept_fd: posix.socket_t = 0; + for (0..2) |_| { + const cqe = try ring.copy_cqe(); + try testing.expectEqual(posix.E.SUCCESS, cqe.err()); + if (cqe.user_data == 7) { + accept_fd = @intCast(cqe.res); + } else { + try testing.expectEqual(8, cqe.user_data); } - try testing.expectEqual(@as(u64, 0xcccccccc), cqe.user_data); } + try testing.expect(accept_fd > 2 and accept_fd != listen_fd and accept_fd != connect_fd); - // Remove 3 buffers - - { - const sqe = try ring.remove_buffers(0xbababababa, 3, group_id); - try testing.expectEqual(Op.REMOVE_BUFFERS, sqe.opcode); - try testing.expectEqual(@as(i32, 3), sqe.fd); - try testing.expectEqual(@as(u64, 0), sqe.addr); - try testing.expectEqual(@as(u16, group_id), sqe.buf_index); - try testing.expectEqual(@as(u32, 1), try ring.submit()); + // Communicate + try testSendRecv(&ring, connect_fd, accept_fd); + try testSendRecv(&ring, accept_fd, connect_fd); - const cqe = try ring.copy_cqe(); - switch (cqe.err()) { - .SUCCESS => {}, - else => |errno| std.debug.panic("unhandled errno: {}", .{errno}), + // Shutdown and close all sockets + for ([_]posix.socket_t{ connect_fd, accept_fd, listen_fd }) |fd| { + (try ring.shutdown(9, fd, posix.SHUT.RDWR)).link_next(); + _ = try ring.close(10, fd); + try testing.expectEqual(2, try ring.submit()); + for (0..2) |i| { + const cqe = try ring.copy_cqe(); + try testing.expectEqual(posix.E.SUCCESS, cqe.err()); + try testing.expectEqual(9 + i, cqe.user_data); } - try testing.expectEqual(@as(u64, 0xbababababa), cqe.user_data); } +} - // This read should work - - { - _ = try ring.read(0xdfdfdfdf, fd, .{ .buffer_selection = .{ .group_id = group_id, .len = buffer_len } }, 0); - try testing.expectEqual(@as(u32, 1), try ring.submit()); +fn testSendRecv(ring: *IoUring, send_fd: posix.socket_t, recv_fd: posix.socket_t) !void { + const buffer_send = "0123456789abcdf" ** 10; + var buffer_recv: [buffer_send.len * 2]u8 = undefined; + // 2 sends + _ = try ring.send(1, send_fd, buffer_send, linux.MSG.WAITALL); + _ = try ring.send(2, send_fd, buffer_send, linux.MSG.WAITALL); + try testing.expectEqual(2, try ring.submit()); + for (0..2) |i| { const cqe = try ring.copy_cqe(); - switch (cqe.err()) { - .SUCCESS => {}, - else => |errno| std.debug.panic("unhandled errno: {}", .{errno}), - } - - try testing.expect(cqe.flags.F_BUFFER); - const used_buffer_id = @as(u32, @bitCast(cqe.flags)) >> 16; - try testing.expect(used_buffer_id >= 0 and used_buffer_id < 4); - try testing.expectEqual(@as(i32, buffer_len), cqe.res); - try testing.expectEqual(@as(u64, 0xdfdfdfdf), cqe.user_data); - try testing.expectEqualSlices(u8, &([_]u8{0} ** buffer_len), buffers[used_buffer_id][0..@as(usize, @intCast(cqe.res))]); + try testing.expectEqual(1 + i, cqe.user_data); + try testing.expectEqual(posix.E.SUCCESS, cqe.err()); + try testing.expectEqual(buffer_send.len, @as(usize, @intCast(cqe.res))); } - // Final read 
should _not_ work - - { - _ = try ring.read(0xdfdfdfdf, fd, .{ .buffer_selection = .{ .group_id = group_id, .len = buffer_len } }, 0); - try testing.expectEqual(@as(u32, 1), try ring.submit()); - + // receive + var recv_len: usize = 0; + while (recv_len < buffer_send.len * 2) { + _ = try ring.recv(3, recv_fd, .{ .buffer = buffer_recv[recv_len..] }, 0); + try testing.expectEqual(1, try ring.submit()); const cqe = try ring.copy_cqe(); - switch (cqe.err()) { - // Expected - .NOBUFS => {}, - .SUCCESS => std.debug.panic("unexpected success", .{}), - else => |errno| std.debug.panic("unhandled errno: {}", .{errno}), - } + try testing.expectEqual(3, cqe.user_data); + try testing.expectEqual(posix.E.SUCCESS, cqe.err()); + recv_len += @intCast(cqe.res); } + + // inspect recv buffer + try testing.expectEqualSlices(u8, buffer_send, buffer_recv[0..buffer_send.len]); + try testing.expectEqualSlices(u8, buffer_send, buffer_recv[buffer_send.len..]); } -test "provide_buffers: accept/connect/send/recv" { - if (!is_linux) return error.SkipZigTest; +fn addrAny(addr: *linux.sockaddr.in) *linux.sockaddr { + return @ptrCast(addr); +} +// IO completion data structure (Completion Queue Entry) +pub const Cqe = extern struct { + /// sqe.user_data value passed back + user_data: u64, + /// result code for this event + res: i32, + flags: Flags, + // COMMIT: add big_cqe which was missing in io_uring_cqe type declaration + // TODO: add support for the IORING_SETUP_CQE32 case + /// If the ring is initialized with IORING_SETUP_CQE32, then this field + /// contains 16 bytes of padding, doubling the size of the CQE. + // big_cqe: ?[2]u64, - var ring = IoUring.init(16, .{}) catch |err| switch (err) { - error.SystemOutdated => return error.SkipZigTest, - error.PermissionDenied => return error.SkipZigTest, - else => return err, + /// cqe.flags + pub const Flags = packed struct(u32) { + /// IORING_CQE_F_BUFFER If set, the upper 16 bits are the buffer ID + F_BUFFER: bool = false, + /// IORING_CQE_F_MORE If set, parent SQE will generate more CQE entries + F_MORE: bool = false, + /// IORING_CQE_F_SOCK_NONEMPTY If set, more data to read after socket recv + F_SOCK_NONEMPTY: bool = false, + /// IORING_CQE_F_NOTIF Set for notification CQEs. Can be used to distinguish + /// them from sends. + F_NOTIF: bool = false, + /// IORING_CQE_F_BUF_MORE If set, the buffer ID set in the completion will get + /// more completions. In other words, the buffer is being + /// partially consumed, and will be used by the kernel for + /// more completions. This is only set for buffers used via + /// the incremental buffer consumption, as provided by + /// a ring buffer setup with IOU_PBUF_RING_INC. For any + /// other provided buffer type, a buffer passed back in a + /// completion is automatically returned to the + /// application. + F_BUF_MORE: bool = false, + // COMMIT: new flags + /// IORING_CQE_F_SKIP If set, then the application/liburing must ignore this + /// CQE. Its only purpose is to fill a gap in the ring, + /// if posting a large CQE is attempted when the ring has + /// just a single small CQE worth of space left before + /// wrapping. + F_SKIP: bool = false, + _unused: u9 = 0, 
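+ // Added note (illustrative, not from the kernel headers): when F_BUFFER is
+ // set, the selected buffer ID lives in the upper 16 bits of this packed
+ // word, so @as(u32, @bitCast(flags)) >> 16 recovers it; see buffer_id()
+ // below for the checked accessor.
+ /// IORING_CQE_F_32 If set, this is a 32b/big-cqe posting. Use with rings + /// setup in a mixed CQE mode, where both 16b and 32b + /// CQEs may be posted to the CQ ring. 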
+ F_32: bool = false, + _unused_1: u16 = 0, }; - defer ring.deinit(); - - const group_id = 1337; - const buffer_id = 0; - - const buffer_len = 128; - var buffers: [4][buffer_len]u8 = undefined; - - // Provide 4 buffers - - { - const sqe = try ring.provide_buffers(0xcccccccc, @as([*]u8, @ptrCast(&buffers)), buffer_len, buffers.len, group_id, buffer_id); - try testing.expectEqual(Op.PROVIDE_BUFFERS, sqe.opcode); - try testing.expectEqual(@as(i32, buffers.len), sqe.fd); - try testing.expectEqual(@as(u32, buffer_len), sqe.len); - try testing.expectEqual(@as(u16, group_id), sqe.buf_index); - try testing.expectEqual(@as(u32, 1), try ring.submit()); - const cqe = try ring.copy_cqe(); - switch (cqe.err()) { - // Happens when the kernel is < 5.7 - .INVAL => return error.SkipZigTest, - // Happens on the kernel 5.4 - .BADF => return error.SkipZigTest, - .SUCCESS => {}, - else => |errno| std.debug.panic("unhandled errno: {}", .{errno}), + pub fn err(self: Cqe) linux.E { + if (self.res > -4096 and self.res < 0) { + return @as(linux.E, @enumFromInt(-self.res)); } - try testing.expectEqual(@as(u64, 0xcccccccc), cqe.user_data); + return .SUCCESS; } - const socket_test_harness = try createSocketTestHarness(&ring); - defer socket_test_harness.close(); - - // Do 4 send on the socket - - { - var i: usize = 0; - while (i < buffers.len) : (i += 1) { - _ = try ring.send(0xdeaddead, socket_test_harness.server, &([_]u8{'z'} ** buffer_len), 0); - try testing.expectEqual(@as(u32, 1), try ring.submit()); + // On successful completion of the provided buffers IO request, the CQE flags field + // will have IORING_CQE_F_BUFFER set and the selected buffer ID will be indicated by + // the upper 16-bits of the flags field. + pub fn buffer_id(self: Cqe) !u16 { + if (!self.flags.F_BUFFER) { + return error.NoBufferSelected; } - - var cqes: [4]Cqe = undefined; - try testing.expectEqual(@as(u32, 4), try ring.copy_cqes(&cqes, 4)); + return @intCast(@as(u32, @bitCast(self.flags)) >> constants.CQE_BUFFER_SHIFT); } +}; - // Do 4 recv which should consume all buffers +/// IO submission data structure (Submission Queue Entry) +/// matches io_uring_sqe in liburing +pub const Sqe = extern struct { + /// type of operation for this sqe + opcode: Op, + /// IOSQE_* flags + flags: IoSqe, + /// ioprio for the request + ioprio: packed union { + send_recv: SendRecv, + accept: Accept, + const Ioprio = @This(); - // Deliberately put something we don't expect in the buffers - @memset(mem.sliceAsBytes(&buffers), 1); + pub fn init_empty() Ioprio { + return @bitCast(@as(u16, 0)); + } + }, + /// file descriptor to do IO on + fd: i32, + /// offset into file + off: u64, + /// pointer to buffer or iovecs + addr: u64, + /// buffer size or number of iovecs + len: u32, + /// flags for any sqe operation + /// rw_flags | fsync_flags | poll_event | poll32_event | sync_range_flags | msg_flags + /// timeout_flags | accept_flags | cancel_flags | open_flags | statx_flags + /// fadvise_advice | splice_flags | rename_flags | unlink_flags | hardlink_flags + /// xattr_flags | msg_ring_flags | uring_cmd_flags | waitid_flags | futex_flags + /// install_fd_flags | nop_flags | pipe_flags + rw_flags: u32, + /// data to be passed back at completion time + user_data: u64, + /// index into fixed buffers or for grouped buffer selection + buf_index: u16, + personality: u16, + splice_fd_in: i32, + addr3: u64, + resv: u64, - var i: usize = 0; - while (i < buffers.len) : (i += 1) { - const sqe = try ring.recv(0xdededede, socket_test_harness.client, .{ .buffer_selection = .{ 
.group_id = group_id, .len = buffer_len } }, 0); - try testing.expectEqual(Op.RECV, sqe.opcode); - try testing.expectEqual(@as(i32, socket_test_harness.client), sqe.fd); - try testing.expectEqual(@as(u64, 0), sqe.addr); - try testing.expectEqual(@as(u32, buffer_len), sqe.len); - try testing.expectEqual(@as(u16, group_id), sqe.buf_index); - try testing.expectEqual(@as(u32, 0), sqe.rw_flags); - try testing.expectEqual(.{ .BUFFER_SELECT = true }, sqe.flags); - try testing.expectEqual(@as(u32, 1), try ring.submit()); + /// sqe.flags + pub const IoSqe = packed struct(u8) { + /// use fixed fileset + FIXED_FILE: bool = false, + /// issue after inflight IO + IO_DRAIN: bool = false, + /// links next sqe + IO_LINK: bool = false, + /// like LINK, but stronger + IO_HARDLINK: bool = false, + /// always go async + ASYNC: bool = false, + /// select buffer from sqe->buf_group + BUFFER_SELECT: bool = false, + /// don't post CQE if request succeeded + CQE_SKIP_SUCCESS: bool = false, + _: u1 = 0, + }; - const cqe = try ring.copy_cqe(); - switch (cqe.err()) { - .SUCCESS => {}, - else => |errno| std.debug.panic("unhandled errno: {}", .{errno}), - } + /// send/sendmsg and recv/recvmsg flags (sqe.ioprio) + pub const SendRecv = packed struct(u16) { + /// IORING_RECVSEND_POLL_FIRST + /// If set, instead of first attempting to send or receive and arm poll + /// if that yields an -EAGAIN result, arm poll upfront and skip the + /// initial transfer attempt. + RECVSEND_POLL_FIRST: bool = false, + /// IORING_RECV_MULTISHOT + /// Multishot recv. Sets IORING_CQE_F_MORE if the handler will continue + /// to report CQEs on behalf of the same SQE. + RECV_MULTISHOT: bool = false, + /// IORING_RECVSEND_FIXED_BUF + /// Use registered buffers, the index is stored in the buf_index field. + RECVSEND_FIXED_BUF: bool = false, + /// IORING_SEND_ZC_REPORT_USAGE + /// If set, SEND[MSG]_ZC should report the zerocopy usage in cqe.res + /// for the IORING_CQE_F_NOTIF cqe. 0 is reported if zerocopy was + /// actually possible. IORING_NOTIF_USAGE_ZC_COPIED if data was copied + /// (at least partially). + SEND_ZC_REPORT_USAGE: bool = false, + /// IORING_RECVSEND_BUNDLE + /// Used with IOSQE_BUFFER_SELECT. If set, send or recv will grab as + /// many buffers from the buffer group ID given and send them all. + /// The completion result will be the number of buffers sent, with the + /// starting buffer ID in cqe.flags as per usual for provided buffer + /// usage. The buffers will be contiguous from the starting buffer ID. + RECVSEND_BUNDLE: bool = false, 
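+ // Usage sketch (illustrative addition): these flags travel in sqe.ioprio,
+ // e.g. sqe.ioprio = .{ .send_recv = .{ .RECV_MULTISHOT = true } };
+ // which is exactly what prep_recv_multishot() does.
+ // COMMIT: new flags + /// IORING_SEND_VECTORIZED + /// If set, SEND[_ZC] will take a pointer to an io_vec to allow + /// vectorized send operations. 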
+ SEND_VECTORIZED: bool = false, + _: u10 = 0, + }; - try testing.expect(cqe.flags & linux.IORING_CQE_F_BUFFER == linux.IORING_CQE_F_BUFFER); - const used_buffer_id = cqe.flags >> 16; - try testing.expect(used_buffer_id >= 0 and used_buffer_id <= 3); - try testing.expectEqual(@as(i32, buffer_len), cqe.res); + /// accept flags stored in sqe.ioprio + pub const Accept = packed struct(u16) { + MULTISHOT: bool = false, + // COMMIT: new Flags + DONTWAIT: bool = false, + POLL_FIRST: bool = false, + _unused: u13 = 0, + }; - try testing.expectEqual(@as(u64, 0xdededede), cqe.user_data); - const buffer = buffers[used_buffer_id][0..@as(usize, @intCast(cqe.res))]; - try testing.expectEqualSlices(u8, &([_]u8{'z'} ** buffer_len), buffer); + pub fn prep_nop(sqe: *Sqe) void { + sqe.* = .{ + .opcode = .NOP, + .flags = .{}, + .ioprio = .init_empty(), + .fd = 0, + .off = 0, + .addr = 0, + .len = 0, + .rw_flags = 0, + .user_data = 0, + .buf_index = 0, + .personality = 0, + .splice_fd_in = 0, + .addr3 = 0, + .resv = 0, + }; } - // This recv should fail - - { - const sqe = try ring.recv(0xdfdfdfdf, socket_test_harness.client, .{ .buffer_selection = .{ .group_id = group_id, .len = buffer_len } }, 0); - try testing.expectEqual(Op.RECV, sqe.opcode); - try testing.expectEqual(@as(i32, socket_test_harness.client), sqe.fd); - try testing.expectEqual(@as(u64, 0), sqe.addr); - try testing.expectEqual(@as(u32, buffer_len), sqe.len); - try testing.expectEqual(@as(u16, group_id), sqe.buf_index); - try testing.expectEqual(@as(u32, 0), sqe.rw_flags); - try testing.expectEqual(@as(u32, linux.IOSQE_BUFFER_SELECT), sqe.flags); - try testing.expectEqual(@as(u32, 1), try ring.submit()); - - const cqe = try ring.copy_cqe(); - switch (cqe.err()) { - // Expected - .NOBUFS => {}, - .SUCCESS => std.debug.panic("unexpected success", .{}), - else => |errno| std.debug.panic("unhandled errno: {}", .{errno}), - } - try testing.expectEqual(@as(u64, 0xdfdfdfdf), cqe.user_data); + pub fn prep_fsync(sqe: *Sqe, fd: linux.fd_t, flags: uflags.Fsync) void { + sqe.* = .{ + .opcode = .FSYNC, + .flags = .{}, + .ioprio = .init_empty(), + .fd = fd, + .off = 0, + .addr = 0, + .len = 0, + .rw_flags = @bitCast(flags), + .user_data = 0, + .buf_index = 0, + .personality = 0, + .splice_fd_in = 0, + .addr3 = 0, + .resv = 0, + }; } - // Provide 1 buffer again + pub fn prep_rw( + sqe: *Sqe, + op: Op, + fd: linux.fd_t, + addr: u64, + len: usize, + offset: u64, + ) void { + sqe.* = .{ + .opcode = op, + .flags = .{}, + .ioprio = .init_empty(), + .fd = fd, + .off = offset, + .addr = addr, + .len = @intCast(len), + .rw_flags = 0, + .user_data = 0, + .buf_index = 0, + .personality = 0, + .splice_fd_in = 0, + .addr3 = 0, + .resv = 0, + }; + } - const reprovided_buffer_id = 2; + pub fn prep_read(sqe: *Sqe, fd: linux.fd_t, buffer: []u8, offset: u64) void { + sqe.prep_rw(.READ, fd, @intFromPtr(buffer.ptr), buffer.len, offset); + } - { - _ = try ring.provide_buffers(0xabababab, @as([*]u8, @ptrCast(&buffers[reprovided_buffer_id])), buffer_len, 1, group_id, reprovided_buffer_id); - try testing.expectEqual(@as(u32, 1), try ring.submit()); + pub fn prep_write(sqe: *Sqe, fd: linux.fd_t, buffer: []const u8, offset: u64) void { + sqe.prep_rw(.WRITE, fd, @intFromPtr(buffer.ptr), buffer.len, offset); + } - const cqe = try ring.copy_cqe(); - switch (cqe.err()) { - .SUCCESS => {}, - else => |errno| std.debug.panic("unhandled errno: {}", .{errno}), - } + pub fn prep_splice(sqe: *Sqe, fd_in: linux.fd_t, off_in: u64, fd_out: linux.fd_t, off_out: u64, len: usize) void { + 
sqe.prep_rw(.SPLICE, fd_out, undefined, len, off_out); + sqe.addr = off_in; + sqe.splice_fd_in = fd_in; } - // Redo 1 send on the server socket + pub fn prep_readv( + sqe: *Sqe, + fd: linux.fd_t, + iovecs: []const std.posix.iovec, + offset: u64, + ) void { + sqe.prep_rw(.READV, fd, @intFromPtr(iovecs.ptr), iovecs.len, offset); + } - { - _ = try ring.send(0xdeaddead, socket_test_harness.server, &([_]u8{'w'} ** buffer_len), 0); - try testing.expectEqual(@as(u32, 1), try ring.submit()); + pub fn prep_writev( + sqe: *Sqe, + fd: linux.fd_t, + iovecs: []const std.posix.iovec_const, + offset: u64, + ) void { + sqe.prep_rw(.WRITEV, fd, @intFromPtr(iovecs.ptr), iovecs.len, offset); + } - _ = try ring.copy_cqe(); + pub fn prep_read_fixed(sqe: *Sqe, fd: linux.fd_t, buffer: *std.posix.iovec, offset: u64, buffer_index: u16) void { + sqe.prep_rw(.READ_FIXED, fd, @intFromPtr(buffer.base), buffer.len, offset); + sqe.buf_index = buffer_index; } - // Final recv which should work + pub fn prep_write_fixed(sqe: *Sqe, fd: linux.fd_t, buffer: *std.posix.iovec, offset: u64, buffer_index: u16) void { + sqe.prep_rw(.WRITE_FIXED, fd, @intFromPtr(buffer.base), buffer.len, offset); + sqe.buf_index = buffer_index; + } - // Deliberately put something we don't expect in the buffers - @memset(mem.sliceAsBytes(&buffers), 1); + pub fn prep_accept( + sqe: *Sqe, + fd: linux.fd_t, + addr: ?*linux.sockaddr, + addrlen: ?*linux.socklen_t, + flags: linux.SOCK, + ) void { + // `addr` holds a pointer to `sockaddr`, and `addr2` holds a pointer to socklen_t`. + // `addr2` maps to `sqe.off` (u64) instead of `sqe.len` (which is only a u32). + sqe.prep_rw(.ACCEPT, fd, @intFromPtr(addr), 0, @intFromPtr(addrlen)); + sqe.rw_flags = flags; + } - { - const sqe = try ring.recv(0xdfdfdfdf, socket_test_harness.client, .{ .buffer_selection = .{ .group_id = group_id, .len = buffer_len } }, 0); - try testing.expectEqual(Op.RECV, sqe.opcode); - try testing.expectEqual(@as(i32, socket_test_harness.client), sqe.fd); - try testing.expectEqual(@as(u64, 0), sqe.addr); - try testing.expectEqual(@as(u32, buffer_len), sqe.len); - try testing.expectEqual(@as(u16, group_id), sqe.buf_index); - try testing.expectEqual(@as(u32, 0), sqe.rw_flags); - try testing.expectEqual(@as(u32, linux.IOSQE_BUFFER_SELECT), sqe.flags); - try testing.expectEqual(@as(u32, 1), try ring.submit()); + /// accept directly into the fixed file table + pub fn prep_accept_direct( + sqe: *Sqe, + fd: linux.fd_t, + addr: ?*linux.sockaddr, + addrlen: ?*linux.socklen_t, + flags: linux.SOCK, + file_index: u32, + ) void { + prep_accept(sqe, fd, addr, addrlen, flags); + set_target_fixed_file(sqe, file_index); + } - const cqe = try ring.copy_cqe(); - switch (cqe.err()) { - .SUCCESS => {}, - else => |errno| std.debug.panic("unhandled errno: {}", .{errno}), - } + pub fn prep_multishot_accept( + sqe: *Sqe, + fd: linux.fd_t, + addr: ?*linux.sockaddr, + addrlen: ?*linux.socklen_t, + flags: linux.SOCK, + ) void { + prep_accept(sqe, fd, addr, addrlen, flags); + sqe.ioprio = .{ .accept = .{ .MULTISHOT = true } }; + } - try testing.expect(cqe.flags & linux.IORING_CQE_F_BUFFER == linux.IORING_CQE_F_BUFFER); - const used_buffer_id = cqe.flags >> 16; - try testing.expectEqual(used_buffer_id, reprovided_buffer_id); - try testing.expectEqual(@as(i32, buffer_len), cqe.res); - try testing.expectEqual(@as(u64, 0xdfdfdfdf), cqe.user_data); - const buffer = buffers[used_buffer_id][0..@as(usize, @intCast(cqe.res))]; - try testing.expectEqualSlices(u8, &([_]u8{'w'} ** buffer_len), buffer); + /// multishot accept 
directly into the fixed file table + pub fn prep_multishot_accept_direct( + sqe: *Sqe, + fd: linux.fd_t, + addr: ?*linux.sockaddr, + addrlen: ?*linux.socklen_t, + flags: linux.SOCK, + ) void { + prep_multishot_accept(sqe, fd, addr, addrlen, flags); + set_target_fixed_file(sqe, constants.FILE_INDEX_ALLOC); } -} -/// Used for testing server/client interactions. -const SocketTestHarness = struct { - listener: posix.socket_t, - server: posix.socket_t, - client: posix.socket_t, + fn set_target_fixed_file(sqe: *Sqe, file_index: u32) void { + const sqe_file_index: u32 = if (file_index == constants.FILE_INDEX_ALLOC) + constants.FILE_INDEX_ALLOC + else + // 0 means no fixed files, indexes should be encoded as "index + 1" + file_index + 1; + // This field is overloaded in liburing: + // splice_fd_in: i32 + // sqe_file_index: u32 + sqe.splice_fd_in = @bitCast(sqe_file_index); + } - fn close(self: SocketTestHarness) void { - posix.close(self.client); - posix.close(self.listener); + pub fn prep_connect( + sqe: *Sqe, + fd: linux.fd_t, + addr: *const linux.sockaddr, + addrlen: linux.socklen_t, + ) void { + // `addrlen` maps to `sqe.off` (u64) instead of `sqe.len` (which is only a u32). + sqe.prep_rw(.CONNECT, fd, @intFromPtr(addr), 0, addrlen); } -}; -fn createSocketTestHarness(ring: *IoUring) !SocketTestHarness { - // Create a TCP server socket - var address: linux.sockaddr.in = .{ - .port = 0, - .addr = @bitCast([4]u8{ 127, 0, 0, 1 }), - }; - const listener_socket = try createListenerSocket(&address); - errdefer posix.close(listener_socket); + pub fn prep_epoll_ctl( + sqe: *Sqe, + epfd: linux.fd_t, + fd: linux.fd_t, + op: u32, + ev: ?*linux.epoll_event, + ) void { + sqe.prep_rw(.EPOLL_CTL, epfd, @intFromPtr(ev), op, @intCast(fd)); + } - // Submit 1 accept - var accept_addr: posix.sockaddr = undefined; - var accept_addr_len: posix.socklen_t = @sizeOf(@TypeOf(accept_addr)); - _ = try ring.accept(0xaaaaaaaa, listener_socket, &accept_addr, &accept_addr_len, 0); + pub fn prep_recv(sqe: *Sqe, fd: linux.fd_t, buffer: []u8, flags: linux.MSG) void { + sqe.prep_rw(.RECV, fd, @intFromPtr(buffer.ptr), buffer.len, 0); + sqe.rw_flags = flags; + } - // Create a TCP client socket - const client = try posix.socket(address.family, posix.SOCK.STREAM | posix.SOCK.CLOEXEC, 0); - errdefer posix.close(client); - _ = try ring.connect(0xcccccccc, client, addrAny(&address), @sizeOf(linux.sockaddr.in)); + // TODO: review recv `flags` + pub fn prep_recv_multishot( + sqe: *Sqe, + fd: linux.fd_t, + buffer: []u8, + flags: linux.MSG, + ) void { + sqe.prep_recv(fd, buffer, flags); + sqe.ioprio = .{ .send_recv = .{ .RECV_MULTISHOT = true } }; + } - try testing.expectEqual(@as(u32, 2), try ring.submit()); + pub fn prep_recvmsg( + sqe: *Sqe, + fd: linux.fd_t, + msg: *linux.msghdr, + flags: linux.MSG, + ) void { + sqe.prep_rw(.RECVMSG, fd, @intFromPtr(msg), 1, 0); + sqe.rw_flags = flags; + } - var cqe_accept = try ring.copy_cqe(); - if (cqe_accept.err() == .INVAL) return error.SkipZigTest; - var cqe_connect = try ring.copy_cqe(); - if (cqe_connect.err() == .INVAL) return error.SkipZigTest; + pub fn prep_recvmsg_multishot( + sqe: *Sqe, + fd: linux.fd_t, + msg: *linux.msghdr, + flags: linux.MSG, + ) void { + sqe.prep_recvmsg(fd, msg, flags); + sqe.ioprio = .{ .send_recv = .{ .RECV_MULTISHOT = true } }; + }
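+ // Usage sketch (illustrative; `ring`, `fd`, `msg`, `msg_flags` and
+ // `group_id` are assumed from the caller): multishot recvmsg with
+ // provided buffers.
+ //   const sqe = try ring.get_sqe();
+ //   sqe.prep_recvmsg_multishot(fd, &msg, msg_flags);
+ //   sqe.flags.BUFFER_SELECT = true;
+ //   sqe.buf_index = group_id;
+ // Each completion then carries F_MORE and a buffer ID until the request
+ // errors or is cancelled.
- // The accept/connect CQEs may arrive in any order, the connect CQE will sometimes come first: - if (cqe_accept.user_data == 0xcccccccc and cqe_connect.user_data == 0xaaaaaaaa) { - const a = cqe_accept; - const b = cqe_connect; - 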
cqe_accept = b; - cqe_connect = a; + // COMMIT: fix send[|recv] flag param type + pub fn prep_send(sqe: *Sqe, fd: linux.fd_t, buffer: []const u8, flags: linux.MSG) void { + sqe.prep_rw(.SEND, fd, @intFromPtr(buffer.ptr), buffer.len, 0); + sqe.rw_flags = flags; } - try testing.expectEqual(@as(u64, 0xaaaaaaaa), cqe_accept.user_data); - if (cqe_accept.res <= 0) std.debug.print("\ncqe_accept.res={}\n", .{cqe_accept.res}); - try testing.expect(cqe_accept.res > 0); - try testing.expectEqual(@as(u32, 0), cqe_accept.flags); - try testing.expectEqual(Cqe{ - .user_data = 0xcccccccc, - .res = 0, - .flags = 0, - }, cqe_connect); + pub fn prep_send_zc(sqe: *Sqe, fd: linux.fd_t, buffer: []const u8, flags: linux.MSG, zc_flags: Sqe.SendRecv) void { + sqe.prep_rw(.SEND_ZC, fd, @intFromPtr(buffer.ptr), buffer.len, 0); + sqe.rw_flags = flags; + sqe.ioprio = .{ .send_recv = zc_flags }; + } - // All good + pub fn prep_send_zc_fixed(sqe: *Sqe, fd: linux.fd_t, buffer: []const u8, flags: linux.MSG, zc_flags: Sqe.SendRecv, buf_index: u16) void { + const zc_flags_fixed = blk: { + var updated_flags = zc_flags; + updated_flags.RECVSEND_FIXED_BUF = true; + break :blk updated_flags; + }; + prep_send_zc(sqe, fd, buffer, flags, zc_flags_fixed); + sqe.buf_index = buf_index; + } - return SocketTestHarness{ - .listener = listener_socket, - .server = cqe_accept.res, - .client = client, - }; -} + pub fn prep_sendmsg( + sqe: *Sqe, + fd: linux.fd_t, + msg: *const linux.msghdr_const, + flags: linux.MSG, + ) void { + sqe.prep_rw(.SENDMSG, fd, @intFromPtr(msg), 1, 0); + sqe.rw_flags = flags; + } -fn createListenerSocket(address: *linux.sockaddr.in) !posix.socket_t { - const kernel_backlog = 1; - const listener_socket = try posix.socket(address.family, posix.SOCK.STREAM | posix.SOCK.CLOEXEC, 0); - errdefer posix.close(listener_socket); + pub fn prep_sendmsg_zc( + sqe: *Sqe, + fd: linux.fd_t, + msg: *const linux.msghdr_const, + flags: linux.MSG, + ) void { + prep_sendmsg(sqe, fd, msg, flags); + sqe.opcode = .SENDMSG_ZC; + } - try posix.setsockopt(listener_socket, posix.SOL.SOCKET, posix.SO.REUSEADDR, &mem.toBytes(@as(c_int, 1))); - try posix.bind(listener_socket, addrAny(address), @sizeOf(linux.sockaddr.in)); - try posix.listen(listener_socket, kernel_backlog); + pub fn prep_openat( + sqe: *Sqe, + fd: linux.fd_t, + path: [*:0]const u8, + flags: linux.O, + mode: linux.mode_t, + ) void { + sqe.prep_rw(.OPENAT, fd, @intFromPtr(path), mode, 0); + sqe.rw_flags = @bitCast(flags); + } - // set address to the OS-chosen IP/port. 
- var slen: posix.socklen_t = @sizeOf(linux.sockaddr.in); - try posix.getsockname(listener_socket, addrAny(address), &slen); + pub fn prep_openat_direct( + sqe: *Sqe, + fd: linux.fd_t, + path: [*:0]const u8, + flags: linux.O, + mode: linux.mode_t, + file_index: u32, + ) void { + prep_openat(sqe, fd, path, flags, mode); + set_target_fixed_file(sqe, file_index); + } - return listener_socket; -} + pub fn prep_close(sqe: *Sqe, fd: linux.fd_t) void { + sqe.* = .{ + .opcode = .CLOSE, + .flags = .{}, + .ioprio = .init_empty(), + .fd = fd, + .off = 0, + .addr = 0, + .len = 0, + .rw_flags = 0, + .user_data = 0, + .buf_index = 0, + .personality = 0, + .splice_fd_in = 0, + .addr3 = 0, + .resv = 0, + }; + } -test "accept multishot" { - if (!is_linux) return error.SkipZigTest; + pub fn prep_close_direct(sqe: *Sqe, file_index: u32) void { + prep_close(sqe, 0); + set_target_fixed_file(sqe, file_index); + } - var ring = IoUring.init(16, .{}) catch |err| switch (err) { - error.SystemOutdated => return error.SkipZigTest, - error.PermissionDenied => return error.SkipZigTest, - else => return err, - }; - defer ring.deinit(); + pub fn prep_timeout( + sqe: *Sqe, + ts: *const linux.kernel_timespec, + count: u32, + flags: uflags.Timeout, + ) void { + sqe.prep_rw(.TIMEOUT, -1, @intFromPtr(ts), 1, count); + sqe.rw_flags = @bitCast(flags); + } - var address: linux.sockaddr.in = .{ - .port = 0, - .addr = @bitCast([4]u8{ 127, 0, 0, 1 }), - }; - const listener_socket = try createListenerSocket(&address); - defer posix.close(listener_socket); + pub fn prep_timeout_remove(sqe: *Sqe, timeout_user_data: u64, flags: uflags.Timeout) void { + sqe.* = .{ + .opcode = .TIMEOUT_REMOVE, + .flags = .{}, + .ioprio = .init_empty(), + .fd = -1, + .off = 0, + .addr = timeout_user_data, + .len = 0, + .rw_flags = @bitCast(flags), + .user_data = 0, + .buf_index = 0, + .personality = 0, + .splice_fd_in = 0, + .addr3 = 0, + .resv = 0, + }; + } - // submit multishot accept operation - var addr: posix.sockaddr = undefined; - var addr_len: posix.socklen_t = @sizeOf(@TypeOf(addr)); - const userdata: u64 = 0xaaaaaaaa; - _ = try ring.accept_multishot(userdata, listener_socket, &addr, &addr_len, 0); - try testing.expectEqual(@as(u32, 1), try ring.submit()); + pub fn prep_link_timeout( + sqe: *Sqe, + ts: *const linux.kernel_timespec, + flags: uflags.Timeout, + ) void { + sqe.prep_rw(.LINK_TIMEOUT, -1, @intFromPtr(ts), 1, 0); + sqe.rw_flags = flags; + } - var nr: usize = 4; // number of clients to connect - while (nr > 0) : (nr -= 1) { - // connect client - const client = try posix.socket(address.family, posix.SOCK.STREAM | posix.SOCK.CLOEXEC, 0); - errdefer posix.close(client); - try posix.connect(client, addrAny(&address), @sizeOf(linux.sockaddr.in)); + pub fn prep_poll_add( + sqe: *Sqe, + fd: linux.fd_t, + poll_mask: linux.POLL, + ) void { + sqe.prep_rw(.POLL_ADD, fd, @intFromPtr(@as(?*anyopaque, null)), 0, 0); + // Poll masks previously used to comprise of 16 bits in the flags union of + // a SQE, but were then extended to comprise of 32 bits in order to make + // room for additional option flags. To ensure that the correct bits of + // poll masks are consistently and properly read across multiple kernel + // versions, poll masks are enforced to be little-endian. 
+ // https://www.spinics.net/lists/io-uring/msg02848.html + sqe.rw_flags = std.mem.nativeToLittle(u32, poll_mask); + } - // test accept completion - var cqe = try ring.copy_cqe(); - if (cqe.err() == .INVAL) return error.SkipZigTest; - try testing.expect(cqe.res > 0); - try testing.expect(cqe.user_data == userdata); - try testing.expect(cqe.flags.F_MORE); // more flag is set + pub fn prep_poll_remove( + sqe: *Sqe, + target_user_data: u64, + ) void { + sqe.prep_rw(.POLL_REMOVE, -1, target_user_data, 0, 0); + } - posix.close(client); + pub fn prep_poll_update( + sqe: *Sqe, + old_user_data: u64, + new_user_data: u64, + poll_mask: linux.POLL, + flags: uflags.Poll, + ) void { + sqe.prep_rw(.POLL_REMOVE, -1, old_user_data, flags, new_user_data); + // Poll masks previously used to comprise of 16 bits in the flags union of + // a SQE, but were then extended to comprise of 32 bits in order to make + // room for additional option flags. To ensure that the correct bits of + // poll masks are consistently and properly read across multiple kernel + // versions, poll masks are enforced to be little-endian. + // https://www.spinics.net/lists/io-uring/msg02848.html + sqe.rw_flags = std.mem.nativeToLittle(u32, poll_mask); } -} - -test "accept/connect/send_zc/recv" { - try skipKernelLessThan(.{ .major = 6, .minor = 0, .patch = 0 }); - var ring = IoUring.init(16, .{}) catch |err| switch (err) { - error.SystemOutdated => return error.SkipZigTest, - error.PermissionDenied => return error.SkipZigTest, - else => return err, - }; - defer ring.deinit(); + pub fn prep_fallocate( + sqe: *Sqe, + fd: linux.fd_t, + mode: i32, + offset: u64, + len: u64, + ) void { + sqe.* = .{ + .opcode = .FALLOCATE, + .flags = .{}, + .ioprio = .init_empty(), + .fd = fd, + .off = offset, + .addr = len, + .len = @intCast(mode), + .rw_flags = 0, + .user_data = 0, + .buf_index = 0, + .personality = 0, + .splice_fd_in = 0, + .addr3 = 0, + .resv = 0, + }; + } - const socket_test_harness = try createSocketTestHarness(&ring); - defer socket_test_harness.close(); + pub fn prep_statx( + sqe: *Sqe, + fd: linux.fd_t, + path: [*:0]const u8, + flags: linux.AT, + mask: linux.Statx.Mask, + buf: *linux.Statx, + ) void { + sqe.prep_rw(.STATX, fd, @intFromPtr(path), @bitCast(mask), @intFromPtr(buf)); + sqe.rw_flags = flags; + } - const buffer_send = [_]u8{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0xa, 0xb, 0xc, 0xd, 0xe }; - var buffer_recv = [_]u8{0} ** 10; + pub fn prep_cancel( + sqe: *Sqe, + cancel_user_data: u64, + flags: uflags.AsyncCancel, + ) void { + sqe.prep_rw(.ASYNC_CANCEL, -1, cancel_user_data, 0, 0); + sqe.rw_flags = @bitCast(flags); + } - // zero-copy send - const sqe_send = try ring.send_zc(0xeeeeeeee, socket_test_harness.client, buffer_send[0..], 0, 0); - sqe_send.flags.IO_LINK = true; - _ = try ring.recv(0xffffffff, socket_test_harness.server, .{ .buffer = buffer_recv[0..] }, 0); - try testing.expectEqual(@as(u32, 2), try ring.submit()); + pub fn prep_cancel_fd( + sqe: *Sqe, + fd: linux.fd_t, + flags: uflags.AsyncCancel, + ) void { + sqe.prep_rw(.ASYNC_CANCEL, fd, 0, 0, 0); + const enable_cancel_fd = blk: { + var update_flags = flags; + update_flags.CANCEL_FD = true; + break :blk update_flags; + }; + sqe.rw_flags = @bitCast(enable_cancel_fd); + } - var cqe_send = try ring.copy_cqe(); - // First completion of zero-copy send. - // IORING_CQE_F_MORE, means that there - // will be a second completion event / notification for the - // request, with the user_data field set to the same value. - // buffer_send must be keep alive until second cqe. 
- try testing.expectEqual(Cqe{ - .user_data = 0xeeeeeeee, - .res = buffer_send.len, - .flags = .{ .F_MORE = true }, - }, cqe_send); + pub fn prep_shutdown( + sqe: *Sqe, + sockfd: linux.socket_t, + how: linux.SHUT, + ) void { + sqe.prep_rw(.SHUTDOWN, sockfd, 0, how, 0); + } - cqe_send, const cqe_recv = brk: { - const cqe1 = try ring.copy_cqe(); - const cqe2 = try ring.copy_cqe(); - break :brk if (cqe1.user_data == 0xeeeeeeee) .{ cqe1, cqe2 } else .{ cqe2, cqe1 }; - }; + pub fn prep_renameat( + sqe: *Sqe, + old_dir_fd: linux.fd_t, + old_path: [*:0]const u8, + new_dir_fd: linux.fd_t, + new_path: [*:0]const u8, + flags: linux.RenameFlags, + ) void { + sqe.prep_rw( + .RENAMEAT, + old_dir_fd, + @intFromPtr(old_path), + 0, + @intFromPtr(new_path), + ); + sqe.len = @bitCast(new_dir_fd); + sqe.rw_flags = flags; + } - try testing.expectEqual(Cqe{ - .user_data = 0xffffffff, - .res = buffer_recv.len, - .flags = cqe_recv.flags & linux.IORING_CQE_F_SOCK_NONEMPTY, - }, cqe_recv); - try testing.expectEqualSlices(u8, buffer_send[0..buffer_recv.len], buffer_recv[0..]); + pub fn prep_unlinkat( + sqe: *Sqe, + dir_fd: linux.fd_t, + path: [*:0]const u8, + flags: linux.AT, // TODO: unlink flags only AT_REMOVEDIR + ) void { + sqe.prep_rw(.UNLINKAT, dir_fd, @intFromPtr(path), 0, 0); + sqe.rw_flags = flags; + } - // Second completion of zero-copy send. - // IORING_CQE_F_NOTIF in flags signals that kernel is done with send_buffer - try testing.expectEqual(Cqe{ - .user_data = 0xeeeeeeee, - .res = 0, - .flags = linux.IORING_CQE_F_NOTIF, - }, cqe_send); -} + pub fn prep_mkdirat( + sqe: *Sqe, + dir_fd: linux.fd_t, + path: [*:0]const u8, + mode: linux.mode_t, + ) void { + sqe.prep_rw(.MKDIRAT, dir_fd, @intFromPtr(path), mode, 0); + } -test "accept_direct" { - try skipKernelLessThan(.{ .major = 5, .minor = 19, .patch = 0 }); + pub fn prep_symlinkat( + sqe: *Sqe, + target: [*:0]const u8, + new_dir_fd: linux.fd_t, + link_path: [*:0]const u8, + ) void { + sqe.prep_rw( + .SYMLINKAT, + new_dir_fd, + @intFromPtr(target), + 0, + @intFromPtr(link_path), + ); + } - var ring = IoUring.init(1, .{}) catch |err| switch (err) { - error.SystemOutdated => return error.SkipZigTest, - error.PermissionDenied => return error.SkipZigTest, - else => return err, - }; - defer ring.deinit(); - var address: linux.sockaddr.in = .{ - .port = 0, - .addr = @bitCast([4]u8{ 127, 0, 0, 1 }), - }; + pub fn prep_linkat( + sqe: *Sqe, + old_dir_fd: linux.fd_t, + old_path: [*:0]const u8, + new_dir_fd: linux.fd_t, + new_path: [*:0]const u8, + flags: linux.AT, // only AT_EMPTY_PATH, AT_SYMLINK_FOLLOW + ) void { + sqe.prep_rw( + .LINKAT, + old_dir_fd, + @intFromPtr(old_path), + 0, + @intFromPtr(new_path), + ); + sqe.len = @bitCast(new_dir_fd); + sqe.rw_flags = flags; + } - // register direct file descriptors - var registered_fds = [_]linux.fd_t{-1} ** 2; - try ring.register_files(registered_fds[0..]); + pub fn prep_files_update( + sqe: *Sqe, + fds: []const linux.fd_t, + offset: u32, + ) void { + sqe.prep_rw(.FILES_UPDATE, -1, @intFromPtr(fds.ptr), fds.len, @intCast(offset)); + } - const listener_socket = try createListenerSocket(&address); - defer posix.close(listener_socket); + pub fn prep_files_update_alloc( + sqe: *Sqe, + fds: []linux.fd_t, + ) void { + sqe.prep_rw(.FILES_UPDATE, -1, @intFromPtr(fds.ptr), fds.len, constants.FILE_INDEX_ALLOC); + } - const accept_userdata: u64 = 0xaaaaaaaa; - const read_userdata: u64 = 0xbbbbbbbb; - const data = [_]u8{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0xa, 0xb, 0xc, 0xd, 0xe }; + // TODO: why can't slice be used here ? 
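+ // The function below maps to io_uring_prep_provide_buffers(3): `buffers`
+ // points at `num` contiguous buffers of `buffer_len` bytes each, registered
+ // under `group_id`, with buffer ids starting at `buffer_id`. A minimal usage
+ // sketch, assuming a `ring: IoUring`, a `pool: [8 * 512]u8` array and group
+ // id 1 (all hypothetical names):
+ //
+ //     const sqe = try ring.get_sqe();
+ //     sqe.prep_provide_buffers(&pool, 512, 8, 1, 0);
+ //     sqe.user_data = 0x1;
+ //     _ = try ring.submit();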
+ pub fn prep_provide_buffers( + sqe: *Sqe, + buffers: [*]u8, + buffer_len: usize, + num: usize, + group_id: usize, + buffer_id: usize, + ) void { + const ptr = @intFromPtr(buffers); + sqe.prep_rw(.PROVIDE_BUFFERS, @intCast(num), ptr, buffer_len, buffer_id); + sqe.buf_index = @intCast(group_id); + } - for (0..2) |_| { - for (registered_fds, 0..) |_, i| { - var buffer_recv = [_]u8{0} ** 16; - const buffer_send: []const u8 = data[0 .. data.len - i]; // make it different at each loop + pub fn prep_remove_buffers( + sqe: *Sqe, + num: usize, + group_id: usize, + ) void { + sqe.prep_rw(.REMOVE_BUFFERS, @intCast(num), 0, 0, 0); + sqe.buf_index = @intCast(group_id); + } - // submit accept, will chose registered fd and return index in cqe - _ = try ring.accept_direct(accept_userdata, listener_socket, null, null, 0); - try testing.expectEqual(@as(u32, 1), try ring.submit()); + pub fn prep_socket( + sqe: *Sqe, + domain: linux.AF, + socket_type: linux.SOCK, + protocol: u32, // Enumerate https://github.com/kraj/musl/blob/kraj/master/src/network/proto.c#L7 + flags: u32, // flags is unused + ) void { + sqe.prep_rw(.SOCKET, @intCast(domain), 0, protocol, socket_type); + sqe.rw_flags = flags; + } - // connect - const client = try posix.socket(address.family, posix.SOCK.STREAM | posix.SOCK.CLOEXEC, 0); - try posix.connect(client, addrAny(&address), @sizeOf(linux.sockaddr.in)); - defer posix.close(client); + pub fn prep_socket_direct( + sqe: *Sqe, + domain: linux.AF, + socket_type: linux.SOCK, + protocol: u32, // Enumerate https://github.com/kraj/musl/blob/kraj/master/src/network/proto.c#L7 + flags: u32, // flags is unused + file_index: u32, + ) void { + prep_socket(sqe, domain, socket_type, protocol, flags); + set_target_fixed_file(sqe, file_index); + } - // accept completion - const cqe_accept = try ring.copy_cqe(); - try testing.expectEqual(posix.E.SUCCESS, cqe_accept.err()); - const fd_index = cqe_accept.res; - try testing.expect(fd_index < registered_fds.len); - try testing.expect(cqe_accept.user_data == accept_userdata); + pub fn prep_socket_direct_alloc( + sqe: *Sqe, + domain: linux.AF, + socket_type: linux.SOCK, + protocol: u32, // Enumerate https://github.com/kraj/musl/blob/kraj/master/src/network/proto.c#L7 + flags: u32, // flags is unused + ) void { + prep_socket(sqe, domain, socket_type, protocol, flags); + set_target_fixed_file(sqe, constants.FILE_INDEX_ALLOC); + } - // send data - _ = try posix.send(client, buffer_send, 0); + pub fn prep_waitid( + sqe: *Sqe, + id_type: linux.P, + id: i32, + infop: *linux.siginfo_t, + options: linux.W, + flags: u32, // flags is unused + ) void { + sqe.prep_rw(.WAITID, id, 0, @intFromEnum(id_type), @intFromPtr(infop)); + sqe.rw_flags = flags; + sqe.splice_fd_in = @bitCast(options); + } - // Example of how to use registered fd: - // Submit receive to fixed file returned by accept (fd_index). - // Fd field is set to registered file index, returned by accept. - // Flag linux.IOSQE_FIXED_FILE must be set. - const recv_sqe = try ring.recv(read_userdata, fd_index, .{ .buffer = &buffer_recv }, 0); - recv_sqe.flags.FIXED_FILE = true; - try testing.expectEqual(@as(u32, 1), try ring.submit()); + // TODO: maybe remove unused flag fields? 
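+ // A minimal sketch of the bind/listen pair below, linked so that listen only
+ // starts after bind completes (assumes a `ring` and a socket `fd` plus a
+ // prepared `addr: linux.sockaddr.in`, all hypothetical; the `flags`
+ // arguments are unused and passed as 0):
+ //
+ //     var sqe = try ring.get_sqe();
+ //     sqe.prep_bind(fd, @ptrCast(&addr), @sizeOf(linux.sockaddr.in), 0);
+ //     sqe.link_next();
+ //     sqe = try ring.get_sqe();
+ //     sqe.prep_listen(fd, 128, 0);
+ //     _ = try ring.submit();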
+ pub fn prep_bind(
+ sqe: *Sqe,
+ fd: linux.fd_t,
+ addr: *const linux.sockaddr,
+ addrlen: linux.socklen_t,
+ flags: u32, // flags is unused and doesn't exist in io_uring's api
+ ) void {
+ sqe.prep_rw(.BIND, fd, @intFromPtr(addr), 0, addrlen);
+ sqe.rw_flags = flags;
}
- // accept receive
- const recv_cqe = try ring.copy_cqe();
- try testing.expect(recv_cqe.user_data == read_userdata);
- try testing.expect(recv_cqe.res == buffer_send.len);
- try testing.expectEqualSlices(u8, buffer_send, buffer_recv[0..buffer_send.len]);
- }
- // no more available fds, accept will get NFILE error
- {
- // submit accept
- _ = try ring.accept_direct(accept_userdata, listener_socket, null, null, 0);
- try testing.expectEqual(@as(u32, 1), try ring.submit());
- // connect
- const client = try posix.socket(address.family, posix.SOCK.STREAM | posix.SOCK.CLOEXEC, 0);
- try posix.connect(client, addrAny(&address), @sizeOf(linux.sockaddr.in));
- defer posix.close(client);
- // completion with error
- const cqe_accept = try ring.copy_cqe();
- try testing.expect(cqe_accept.user_data == accept_userdata);
- try testing.expectEqual(posix.E.NFILE, cqe_accept.err());
- }
- // return file descriptors to kernel
- try ring.register_files_update(0, registered_fds[0..]);
+ pub fn prep_listen(
+ sqe: *Sqe,
+ fd: linux.fd_t,
+ backlog: usize,
+ flags: u32, // flags is unused and doesn't exist in io_uring's api
+ ) void {
+ sqe.prep_rw(.LISTEN, fd, 0, backlog, 0);
+ sqe.rw_flags = flags;
}
- try ring.unregister_files();
-}
-test "accept_multishot_direct" {
- try skipKernelLessThan(.{ .major = 5, .minor = 19, .patch = 0 });
+ pub fn prep_cmd_sock(
+ sqe: *Sqe,
+ cmd_op: SocketOp,
+ fd: linux.fd_t,
+ level: linux.SOL,
+ optname: linux.SO,
+ optval: u64,
+ optlen: u32,
+ ) void {
+ sqe.prep_rw(.URING_CMD, fd, 0, 0, 0);
+ // off is overloaded with cmd_op, https://github.com/axboe/liburing/blob/e1003e496e66f9b0ae06674869795edf772d5500/src/include/liburing/io_uring.h#L39
+ sqe.off = @intFromEnum(cmd_op);
+ // addr is overloaded, https://github.com/axboe/liburing/blob/e1003e496e66f9b0ae06674869795edf772d5500/src/include/liburing/io_uring.h#L46
+ sqe.addr = @bitCast(packed struct {
+ level: u32,
+ optname: u32,
+ }{
+ .level = level,
+ .optname = optname,
+ });
+ // splice_fd_in is overloaded u32 -> i32
+ sqe.splice_fd_in = @bitCast(optlen);
+ // addr3 is overloaded, https://github.com/axboe/liburing/blob/e1003e496e66f9b0ae06674869795edf772d5500/src/include/liburing/io_uring.h#L102
+ sqe.addr3 = optval;
}
- if (builtin.cpu.arch == .riscv64) {
- // https://github.com/ziglang/zig/issues/25734
- return error.SkipZigTest;
+ pub fn set_flags(sqe: *Sqe, flags: Sqe.IoSqe) void {
+ const updated_flags = @as(u8, @bitCast(sqe.flags)) | @as(u8, @bitCast(flags));
+ sqe.flags = @bitCast(updated_flags);
}
- var ring = IoUring.init(1, .{}) catch |err| switch (err) {
- error.SystemOutdated => return error.SkipZigTest,
- error.PermissionDenied => return error.SkipZigTest,
- else => return err,
- };
- defer ring.deinit();
+ /// This SQE forms a link with the next SQE in the submission ring. Next SQE
+ /// will not be started before this one completes. Forms a chain of SQEs. 
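+ /// For example, to guarantee a send completes before the paired recv is
+ /// started (a sketch; `client`, `server` and `buf` are hypothetical):
+ ///
+ ///     const send_sqe = try ring.get_sqe();
+ ///     send_sqe.prep_send(client, "ping", .{});
+ ///     send_sqe.link_next();
+ ///     const recv_sqe = try ring.get_sqe();
+ ///     recv_sqe.prep_recv(server, &buf, .{});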
+ pub fn link_next(sqe: *Sqe) void { + sqe.flags.IO_LINK = true; + } +}; - var address: linux.sockaddr.in = .{ - .port = 0, - .addr = @bitCast([4]u8{ 127, 0, 0, 1 }), - }; +/// matches io_uring_sq in liburing +pub const Sq = struct { + head: *u32, + tail: *u32, + mask: u32, + flags: *Flags, + dropped: *u32, + array: []u32, + sqes: []Sqe, + mmap: []align(page_size_min) u8, + mmap_sqes: []align(page_size_min) u8, - var registered_fds = [_]linux.fd_t{-1} ** 2; - try ring.register_files(registered_fds[0..]); + // We use `sqe_head` and `sqe_tail` in the same way as liburing: + // We increment `sqe_tail` (but not `tail`) for each call to `get_sqe()`. + // We then set `tail` to `sqe_tail` once, only when these events are actually submitted. + // This allows us to amortize the cost of the @atomicStore to `tail` across multiple SQEs. + sqe_head: u32 = 0, + sqe_tail: u32 = 0, - const listener_socket = try createListenerSocket(&address); - defer posix.close(listener_socket); + /// sq_ring.flags + pub const Flags = packed struct(u32) { + /// needs io_uring_enter wakeup + NEED_WAKEUP: bool = false, + /// CQ ring is overflown + CQ_OVERFLOW: bool = false, + /// task should enter the kernel + TASKRUN: bool = false, + _unused: u29 = 0, + }; - const accept_userdata: u64 = 0xaaaaaaaa; + pub fn init(fd: posix.fd_t, p: Params) !Sq { + assert(fd >= 0); + assert(p.features.SINGLE_MMAP); + const size = @max( + p.sq_off.array + p.sq_entries * @sizeOf(u32), + p.cq_off.cqes + p.cq_entries * @sizeOf(Cqe), + ); + const mmap = try posix.mmap( + null, + size, + posix.PROT.READ | posix.PROT.WRITE, + .{ .TYPE = .SHARED, .POPULATE = true }, + fd, + constants.OFF_SQ_RING, + ); + errdefer posix.munmap(mmap); + assert(mmap.len == size); - for (0..2) |_| { - // submit multishot accept - // Will chose registered fd and return index of the selected registered file in cqe. - _ = try ring.accept_multishot_direct(accept_userdata, listener_socket, null, null, 0); - try testing.expectEqual(@as(u32, 1), try ring.submit()); + // The motivation for the `sqes` and `array` indirection is to make it possible for the + // application to preallocate static io_uring_sqe entries and then replay them when needed. + const size_sqes = p.sq_entries * @sizeOf(Sqe); + const mmap_sqes = try posix.mmap( + null, + size_sqes, + posix.PROT.READ | posix.PROT.WRITE, + .{ .TYPE = .SHARED, .POPULATE = true }, + fd, + constants.OFF_SQES, + ); + errdefer posix.munmap(mmap_sqes); + assert(mmap_sqes.len == size_sqes); - for (registered_fds) |_| { - // connect - const client = try posix.socket(address.family, posix.SOCK.STREAM | posix.SOCK.CLOEXEC, 0); - try posix.connect(client, addrAny(&address), @sizeOf(linux.sockaddr.in)); - defer posix.close(client); + const array: [*]u32 = @ptrCast(@alignCast(&mmap[p.sq_off.array])); + const sqes: [*]Sqe = @ptrCast(@alignCast(&mmap_sqes[0])); + // We expect the kernel copies p.sq_entries to the u32 pointed to by p.sq_off.ring_entries, + // see https://github.com/torvalds/linux/blob/v5.8/fs/io_uring.c#L7843-L7844. 
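+ // Ring sizes are powers of two, so the kernel-filled `ring_mask` read below
+ // is always `ring_entries - 1`; it is copied once since it never changes for
+ // the lifetime of the ring.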
+ assert(p.sq_entries == @as(*u32, @ptrCast(@alignCast(&mmap[p.sq_off.ring_entries]))).*); + return .{ + .head = @ptrCast(@alignCast(&mmap[p.sq_off.head])), + .tail = @ptrCast(@alignCast(&mmap[p.sq_off.tail])), + .mask = @as(*u32, @ptrCast(@alignCast(&mmap[p.sq_off.ring_mask]))).*, + .flags = @ptrCast(@alignCast(&mmap[p.sq_off.flags])), + .dropped = @ptrCast(@alignCast(&mmap[p.sq_off.dropped])), + .array = array[0..p.sq_entries], + .sqes = sqes[0..p.sq_entries], + .mmap = mmap, + .mmap_sqes = mmap_sqes, + }; + } - // accept completion - const cqe_accept = try ring.copy_cqe(); - const fd_index = cqe_accept.res; - try testing.expect(fd_index < registered_fds.len); - try testing.expect(cqe_accept.user_data == accept_userdata); - try testing.expect(cqe_accept.flags.F_MORE); // has more is set - } - // No more available fds, accept will get NFILE error. - // Multishot is terminated (more flag is not set). - { - // connect - const client = try posix.socket(address.family, posix.SOCK.STREAM | posix.SOCK.CLOEXEC, 0); - try posix.connect(client, addrAny(&address), @sizeOf(linux.sockaddr.in)); - defer posix.close(client); - // completion with error - const cqe_accept = try ring.copy_cqe(); - try testing.expect(cqe_accept.user_data == accept_userdata); - try testing.expectEqual(posix.E.NFILE, cqe_accept.err()); - try testing.expect(cqe_accept.flags & linux.IORING_CQE_F_MORE == 0); // has more is not set - } - // return file descriptors to kernel - try ring.register_files_update(0, registered_fds[0..]); + pub fn deinit(self: *Sq) void { + posix.munmap(self.mmap_sqes); + posix.munmap(self.mmap); } - try ring.unregister_files(); -} +}; -test "socket" { - try skipKernelLessThan(.{ .major = 5, .minor = 19, .patch = 0 }); +/// matches io_uring_cq in liburing +pub const Cq = struct { + head: *u32, + tail: *u32, + mask: u32, + overflow: *u32, + cqes: []Cqe, - var ring = IoUring.init(1, .{}) catch |err| switch (err) { - error.SystemOutdated => return error.SkipZigTest, - error.PermissionDenied => return error.SkipZigTest, - else => return err, + /// cq_ring.flags + pub const Flags = packed struct(u32) { + /// disable eventfd notifications + EVENTFD_DISABLED: bool = false, + _unused: u31 = 0, }; - defer ring.deinit(); - // prepare, submit socket operation - _ = try ring.socket(0, linux.AF.INET, posix.SOCK.STREAM, 0, 0); - try testing.expectEqual(@as(u32, 1), try ring.submit()); + pub fn init(fd: posix.fd_t, p: Params, sq: Sq) !Cq { + assert(fd >= 0); + assert(p.features.SINGLE_MMAP); + const mmap = sq.mmap; + const cqes: [*]Cqe = @ptrCast(@alignCast(&mmap[p.cq_off.cqes])); + assert(p.cq_entries == @as(*u32, @ptrCast(@alignCast(&mmap[p.cq_off.ring_entries]))).*); + return .{ + .head = @ptrCast(@alignCast(&mmap[p.cq_off.head])), + .tail = @ptrCast(@alignCast(&mmap[p.cq_off.tail])), + .mask = @as(*u32, @ptrCast(@alignCast(&mmap[p.cq_off.ring_mask]))).*, + .overflow = @ptrCast(@alignCast(&mmap[p.cq_off.overflow])), + .cqes = cqes[0..p.cq_entries], + }; + } - // test completion - var cqe = try ring.copy_cqe(); - try testing.expectEqual(posix.E.SUCCESS, cqe.err()); - const fd: linux.fd_t = @intCast(cqe.res); - try testing.expect(fd > 2); + pub fn deinit(self: *Cq) void { + _ = self; + // A no-op since we now share the mmap with the submission queue. + // Here for symmetry with the submission queue, and for any future feature support. + } +}; - posix.close(fd); -} +/// Group of application provided buffers. Uses newer type, called ring mapped +/// buffers, supported since kernel 5.19. 
Buffers are identified by a buffer
+/// group ID, and within that group, a buffer ID. An IoUring can have multiple
+/// buffer groups, each with a unique group ID.
+///
+/// In `init` the application provides a contiguous block of memory `buffers`
+/// for `buffers_count` buffers of size `buffer_size`. The application can then
+/// submit a `recv` operation without providing a buffer upfront. Once the
+/// operation is ready to receive data, a buffer is picked automatically and
+/// the resulting CQE will contain the buffer ID in `cqe.buffer_id()`. Use the
+/// `get` method to get the buffer for the buffer ID identified by a CQE. Once
+/// the application has processed the buffer, it may hand ownership back to the
+/// kernel by calling `put`, allowing the cycle to repeat.
+///
+/// Depending on the rate of arrival of data, it is possible that a given buffer
+/// group will run out of buffers before those in CQEs can be put back to the
+/// kernel. If this happens, `cqe.err()` will return ENOBUFS.
+///
+pub const BufferGroup = struct {
+ /// Parent ring for which this group is registered.
+ ring: *IoUring,
+ /// Pointer to the memory shared by the kernel.
+ /// `buffers_count` of `io_uring_buf` structures are shared by the kernel.
+ /// First `io_uring_buf` is overlaid by `io_uring_buf_ring` struct.
+ br: *align(page_size_min) BufferRing,
+ /// Contiguous block of memory of size (buffers_count * buffer_size).
+ buffers: []u8,
+ /// Size of each buffer in buffers.
+ buffer_size: u32,
+ /// Number of buffers in `buffers`, number of `io_uring_buf` structures in br.
+ buffers_count: u16,
+ /// Head of unconsumed part of each buffer, if incremental consumption is enabled
+ heads: []u32,
+ /// ID of this group, must be unique in ring.
+ group_id: u16,
-test "socket_direct/socket_direct_alloc/close_direct" {
- try skipKernelLessThan(.{ .major = 5, .minor = 19, .patch = 0 });
+ pub fn init(
+ ring: *IoUring,
+ allocator: mem.Allocator,
+ group_id: u16,
+ buffer_size: u32,
+ buffers_count: u16,
+ ) !BufferGroup {
+ const buffers = try allocator.alloc(u8, buffer_size * buffers_count);
+ errdefer allocator.free(buffers);
+ const heads = try allocator.alloc(u32, buffers_count);
+ errdefer allocator.free(heads);
- var ring = IoUring.init(2, .{}) catch |err| switch (err) {
- error.SystemOutdated => return error.SkipZigTest,
- error.PermissionDenied => return error.SkipZigTest,
- else => return err,
- };
- defer ring.deinit();
+ const br = try setup_buf_ring(ring.fd, buffers_count, group_id, .{ .IOU_PBUF_RING_INC = true });
+ buf_ring_init(br);
- var registered_fds = [_]linux.fd_t{-1} ** 3;
- try ring.register_files(registered_fds[0..]);
+ const mask = buf_ring_mask(buffers_count);
+ var i: u16 = 0;
+ while (i < buffers_count) : (i += 1) {
+ const pos = buffer_size * i;
+ const buf = buffers[pos .. 
pos + buffer_size]; + heads[i] = 0; + buf_ring_add(br, buf, i, mask, i); + } + buf_ring_advance(br, buffers_count); - // create socket in registered file descriptor at index 0 (last param) - _ = try ring.socket_direct(0, linux.AF.INET, posix.SOCK.STREAM, 0, 0, 0); - try testing.expectEqual(@as(u32, 1), try ring.submit()); - var cqe_socket = try ring.copy_cqe(); - try testing.expectEqual(posix.E.SUCCESS, cqe_socket.err()); - try testing.expect(cqe_socket.res == 0); + return BufferGroup{ + .ring = ring, + .group_id = group_id, + .br = br, + .buffers = buffers, + .heads = heads, + .buffer_size = buffer_size, + .buffers_count = buffers_count, + }; + } - // create socket in registered file descriptor at index 1 (last param) - _ = try ring.socket_direct(0, linux.AF.INET, posix.SOCK.STREAM, 0, 0, 1); - try testing.expectEqual(@as(u32, 1), try ring.submit()); - cqe_socket = try ring.copy_cqe(); - try testing.expectEqual(posix.E.SUCCESS, cqe_socket.err()); - try testing.expect(cqe_socket.res == 0); // res is 0 when index is specified + pub fn deinit(self: *BufferGroup, allocator: mem.Allocator) void { + free_buf_ring(self.ring.fd, self.br, self.buffers_count, self.group_id); + allocator.free(self.buffers); + allocator.free(self.heads); + } - // create socket in kernel chosen file descriptor index (_alloc version) - // completion res has index from registered files - _ = try ring.socket_direct_alloc(0, linux.AF.INET, posix.SOCK.STREAM, 0, 0); - try testing.expectEqual(@as(u32, 1), try ring.submit()); - cqe_socket = try ring.copy_cqe(); - try testing.expectEqual(posix.E.SUCCESS, cqe_socket.err()); - try testing.expect(cqe_socket.res == 2); // returns registered file index + // Prepare recv operation which will select buffer from this group. + pub fn recv(self: *BufferGroup, user_data: u64, fd: posix.fd_t, flags: u32) !*Sqe { + var sqe = try self.ring.get_sqe(); + sqe.prep_rw(.RECV, fd, 0, 0, 0); + sqe.rw_flags = flags; + sqe.flags.BUFFER_SELECT = true; + sqe.buf_index = self.group_id; + sqe.user_data = user_data; + return sqe; + } - // use sockets from registered_fds in connect operation - var address: linux.sockaddr.in = .{ - .port = 0, - .addr = @bitCast([4]u8{ 127, 0, 0, 1 }), - }; - const listener_socket = try createListenerSocket(&address); - defer posix.close(listener_socket); - const accept_userdata: u64 = 0xaaaaaaaa; - const connect_userdata: u64 = 0xbbbbbbbb; - const close_userdata: u64 = 0xcccccccc; - for (registered_fds, 0..) |_, fd_index| { - // prepare accept - _ = try ring.accept(accept_userdata, listener_socket, null, null, 0); - // prepare connect with fixed socket - const connect_sqe = try ring.connect(connect_userdata, @intCast(fd_index), addrAny(&address), @sizeOf(linux.sockaddr.in)); - connect_sqe.flags |= linux.IOSQE_FIXED_FILE; // fd is fixed file index - // submit both - try testing.expectEqual(@as(u32, 2), try ring.submit()); - // get completions - var cqe_connect = try ring.copy_cqe(); - var cqe_accept = try ring.copy_cqe(); - // ignore order - if (cqe_connect.user_data == accept_userdata and cqe_accept.user_data == connect_userdata) { - const a = cqe_accept; - const b = cqe_connect; - cqe_accept = b; - cqe_connect = a; + // Prepare multishot recv operation which will select buffer from this group. + pub fn recv_multishot(self: *BufferGroup, user_data: u64, fd: posix.fd_t, flags: u32) !*Sqe { + var sqe = try self.recv(user_data, fd, flags); + sqe.ioprio.send_recv.RECV_MULTISHOT = true; + return sqe; + } + + // Get buffer by id. 
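+ // With incremental consumption (IOU_PBUF_RING_INC) a buffer can be handed
+ // out partially consumed, so the returned slice starts at the per-buffer
+ // head instead of at offset zero.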
+ fn get_by_id(self: *BufferGroup, buffer_id: u16) []u8 {
+ const pos = self.buffer_size * buffer_id;
+ return self.buffers[pos .. pos + self.buffer_size][self.heads[buffer_id]..];
+ }
+
+ // Get buffer by CQE.
+ pub fn get(self: *BufferGroup, cqe: Cqe) ![]u8 {
+ const buffer_id = try cqe.buffer_id();
+ const used_len = @as(usize, @intCast(cqe.res));
+ return self.get_by_id(buffer_id)[0..used_len];
+ }
+
+ // Release buffer from CQE to the kernel.
+ pub fn put(self: *BufferGroup, cqe: Cqe) !void {
+ const buffer_id = try cqe.buffer_id();
+ if (cqe.flags.F_BUF_MORE) {
+ // Incremental consumption active, kernel will write to this buffer again
+ const used_len = @as(u32, @intCast(cqe.res));
+ // Track what part of the buffer is used
+ self.heads[buffer_id] += used_len;
+ return;
}
- // test connect completion
- try testing.expect(cqe_connect.user_data == connect_userdata);
- try testing.expectEqual(posix.E.SUCCESS, cqe_connect.err());
- // test accept completion
- try testing.expect(cqe_accept.user_data == accept_userdata);
- try testing.expectEqual(posix.E.SUCCESS, cqe_accept.err());
+ self.heads[buffer_id] = 0;
- // submit and test close_direct
- _ = try ring.close_direct(close_userdata, @intCast(fd_index));
- try testing.expectEqual(@as(u32, 1), try ring.submit());
- var cqe_close = try ring.copy_cqe();
- try testing.expect(cqe_close.user_data == close_userdata);
- try testing.expectEqual(posix.E.SUCCESS, cqe_close.err());
+ // Release buffer to the kernel.
+ const mask = buf_ring_mask(self.buffers_count);
+ buf_ring_add(self.br, self.get_by_id(buffer_id), buffer_id, mask, 0);
+ buf_ring_advance(self.br, 1);
}
+};
- try ring.unregister_files();
-}
+/// Used to select how the read should be handled.
+pub const ReadBuffer = union(enum) {
+ /// io_uring will read directly into this buffer
+ buffer: []u8,
-test "openat_direct/close_direct" {
- try skipKernelLessThan(.{ .major = 5, .minor = 19, .patch = 0 });
+ /// io_uring will read directly into these buffers using readv.
+ iovecs: []const posix.iovec,
- var ring = IoUring.init(2, .{}) catch |err| switch (err) {
- error.SystemOutdated => return error.SkipZigTest,
- error.PermissionDenied => return error.SkipZigTest,
- else => return err,
- };
- defer ring.deinit();
+ /// io_uring will select a buffer that has previously been provided with `provide_buffers`.
+ /// The buffer group referenced by `group_id` must contain at least one buffer for the read to work.
+ /// `len` controls the number of bytes to read into the selected buffer.
+ buffer_selection: struct {
+ group_id: u16,
+ len: usize,
+ },
+};
- var registered_fds = [_]linux.fd_t{-1} ** 3;
- try ring.register_files(registered_fds[0..]);
+/// Used to select how the recv call should be handled.
+pub const RecvBuffer = union(enum) {
+ /// io_uring will recv directly into this buffer
+ buffer: []u8,
- var tmp = std.testing.tmpDir(.{});
- defer tmp.cleanup();
- const path = "test_io_uring_close_direct";
- const flags: linux.O = .{ .ACCMODE = .RDWR, .CREAT = true };
- const mode: posix.mode_t = 0o666;
- const user_data: u64 = 0;
+ /// io_uring will select a buffer that has previously been provided with `provide_buffers`.
+ /// The buffer group referenced by `group_id` must contain at least one buffer for the recv call to work.
+ /// `len` controls the number of bytes to read into the selected buffer. 
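+ /// A minimal usage sketch, assuming a group with id 1 was registered via
+ /// `provide_buffers` (names hypothetical):
+ ///
+ ///     _ = try ring.recv(0x2, fd, .{ .buffer_selection = .{ .group_id = 1, .len = 128 } }, 0);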
+ buffer_selection: struct { + group_id: u16, + len: usize, + }, +}; - // use registered file at index 0 (last param) - _ = try ring.openat_direct(user_data, tmp.dir.fd, path, flags, mode, 0); - try testing.expectEqual(@as(u32, 1), try ring.submit()); - var cqe = try ring.copy_cqe(); - try testing.expectEqual(posix.E.SUCCESS, cqe.err()); - try testing.expect(cqe.res == 0); +/// Filled with the offset for mmap(2) +/// matches io_sqring_offsets in liburing +pub const SqOffsets = extern struct { + /// offset of ring head + head: u32, + /// offset of ring tail + tail: u32, + /// ring mask value + ring_mask: u32, + /// entries in ring + ring_entries: u32, + /// ring flags + flags: u32, + /// number of sqes not submitted + dropped: u32, + /// sqe index array + array: u32, + resv1: u32, + user_addr: u64, +}; - // use registered file at index 1 - _ = try ring.openat_direct(user_data, tmp.dir.fd, path, flags, mode, 1); - try testing.expectEqual(@as(u32, 1), try ring.submit()); - cqe = try ring.copy_cqe(); - try testing.expectEqual(posix.E.SUCCESS, cqe.err()); - try testing.expect(cqe.res == 0); // res is 0 when we specify index +/// matches io_cqring_offsets in liburing +pub const CqOffsets = extern struct { + head: u32, + tail: u32, + ring_mask: u32, + ring_entries: u32, + overflow: u32, + cqes: u32, + flags: u32, + resv: u32, + user_addr: u64, +}; - // let kernel choose registered file index - _ = try ring.openat_direct(user_data, tmp.dir.fd, path, flags, mode, linux.IORING_FILE_INDEX_ALLOC); - try testing.expectEqual(@as(u32, 1), try ring.submit()); - cqe = try ring.copy_cqe(); - try testing.expectEqual(posix.E.SUCCESS, cqe.err()); - try testing.expect(cqe.res == 2); // chosen index is in res +/// Passed in for io_uring_setup(2). Copied back with updated info on success +/// matches io_uring_params in liburing +pub const Params = extern struct { + sq_entries: u32, + cq_entries: u32, + flags: uflags.Setup, + sq_thread_cpu: u32, + sq_thread_idle: u32, + features: uflags.Features, + wq_fd: u32, + resv: [3]u32, + sq_off: SqOffsets, + cq_off: CqOffsets, +}; - // close all open file descriptors - for (registered_fds, 0..) 
|_, fd_index| { - _ = try ring.close_direct(user_data, @intCast(fd_index)); - try testing.expectEqual(@as(u32, 1), try ring.submit()); - var cqe_close = try ring.copy_cqe(); - try testing.expectEqual(posix.E.SUCCESS, cqe_close.err()); - } - try ring.unregister_files(); -} +// COMMIT: remove deprecated io_uring_rsrc_update struct +// deprecated, see struct io_uring_rsrc_update -test "waitid" { - try skipKernelLessThan(.{ .major = 6, .minor = 7, .patch = 0 }); +// COMMIT: add new io_uring_region_desc struct +/// matches io_uring_region_desc in liburing +pub const RegionDesc = extern struct { + user_addr: u64, + size: u64, + flags: Flags, + id: u32, + mmap_offset: u64, + __resv: [4]u64, - var ring = IoUring.init(16, .{}) catch |err| switch (err) { - error.SystemOutdated => return error.SkipZigTest, - error.PermissionDenied => return error.SkipZigTest, - else => return err, + // COMMIT: new constant + /// initialise with user provided memory pointed by user_addr + pub const Flags = packed struct(u32) { + TYPE_USER: bool = false, + _: u31 = 0, }; - defer ring.deinit(); +}; - const pid = try posix.fork(); - if (pid == 0) { - posix.exit(7); - } +// COMMIT: add new io_uring_mem_region_reg struct +/// matches io_uring_mem_region_reg in liburing +pub const MemRegionReg = extern struct { + /// struct io_uring_region_desc (RegionDesc in Zig) + region_uptr: u64, + flags: Flags, + __resv: [2]u64, - var siginfo: posix.siginfo_t = undefined; - _ = try ring.waitid(0, .PID, pid, &siginfo, posix.W.EXITED, 0); + /// expose the region as registered wait arguments + pub const Flags = packed struct(u64) { + REG_WAIT_ARG: bool = false, + _: u63 = 0, + }; +}; - try testing.expectEqual(1, try ring.submit()); +/// matches io_uring_rsrc_register in liburing +pub const RsrcRegister = extern struct { + nr: u32, + flags: u32, + resv2: u64, + data: u64, + tags: u64, +}; - const cqe_waitid = try ring.copy_cqe(); - try testing.expectEqual(0, cqe_waitid.res); - try testing.expectEqual(pid, siginfo.fields.common.first.piduid.pid); - try testing.expectEqual(7, siginfo.fields.common.second.sigchld.status); -} +/// matches io_uring_rsrc_update in liburing +pub const RsrcUpdate = extern struct { + offset: u32, + resv: u32, + data: u64, +}; -/// For use in tests. Returns SkipZigTest if kernel version is less than required. 
-inline fn skipKernelLessThan(required: std.SemanticVersion) !void { - if (!is_linux) return error.SkipZigTest; +/// matches io_uring_rsrc_update2 in liburing +pub const RsrcUpdate2 = extern struct { + offset: u32, + resv: u32, + data: u64, + tags: u64, + nr: u32, + resv2: u32, +}; - var uts: linux.utsname = undefined; - const res = linux.uname(&uts); - switch (linux.errno(res)) { - .SUCCESS => {}, - else => |errno| return posix.unexpectedErrno(errno), - } +/// matches io_uring_probe_op in liburing +pub const ProbeOp = extern struct { + op: Op, + resv: u8, + flags: Flags, + resv2: u32, - const release = mem.sliceTo(&uts.release, 0); - // Strips potential extra, as kernel version might not be semver compliant, example "6.8.9-300.fc40.x86_64" - const extra_index = std.mem.indexOfAny(u8, release, "-+"); - const stripped = release[0..(extra_index orelse release.len)]; - // Make sure the input don't rely on the extra we just stripped - try testing.expect(required.pre == null and required.build == null); + pub const Flags = packed struct(u16) { + OP_SUPPORTED: bool = false, + _: u15 = 0, + }; - var current = try std.SemanticVersion.parse(stripped); - current.pre = null; // don't check pre field - if (required.order(current) == .gt) return error.SkipZigTest; -} + pub fn is_supported(self: ProbeOp) bool { + return self.flags.OP_SUPPORTED; + } +}; -test BufferGroup { - if (!is_linux) return error.SkipZigTest; +/// matches io_uring_probe in liburing +pub const Probe = extern struct { + /// Last opcode supported + last_op: Op, + /// Length of ops[] array below + ops_len: u8, + resv: u16, + resv2: [3]u32, + ops: [256]ProbeOp, - // Init IoUring - var ring = IoUring.init(16, .{}) catch |err| switch (err) { - error.SystemOutdated => return error.SkipZigTest, - error.PermissionDenied => return error.SkipZigTest, - else => return err, - }; - defer ring.deinit(); + /// Is the operation supported on the running kernel. 
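+ /// For example, gating zero-copy sends on kernel support (assumes a
+ /// `probe` previously filled in by the kernel):
+ ///
+ ///     if (probe.is_supported(.SEND_ZC)) {
+ ///         // safe to prepare zero-copy sends
+ ///     }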
+ pub fn is_supported(self: @This(), op: Op) bool {
+ const i = @intFromEnum(op);
+ if (i > @intFromEnum(self.last_op) or i >= self.ops_len)
+ return false;
+ return self.ops[i].is_supported();
+ }
+};
- // Init buffer group for ring
- const group_id: u16 = 1; // buffers group id
- const buffers_count: u16 = 1; // number of buffers in buffer group
- const buffer_size: usize = 128; // size of each buffer in group
- var buf_grp = BufferGroup.init(
- &ring,
- testing.allocator,
- group_id,
- buffer_size,
- buffers_count,
- ) catch |err| switch (err) {
- // kernel older than 5.19
- error.ArgumentsInvalid => return error.SkipZigTest,
- else => return err,
- };
- defer buf_grp.deinit(testing.allocator);
+// COMMIT: fix definition of io_uring_restriction
+// RegisterOp is actually u8
+/// matches io_uring_restriction in liburing
+pub const Restriction = extern struct {
+ opcode: RestrictionOp,
+ arg: extern union {
+ /// IORING_RESTRICTION_REGISTER_OP
+ register_op: RegisterOp,
+ /// IORING_RESTRICTION_SQE_OP
+ sqe_op: Op,
+ /// IORING_RESTRICTION_SQE_FLAGS_*
+ sqe_flags: u8,
+ },
+ resv: u8,
+ resv2: [3]u32,
+};
- // Create client/server fds
- const fds = try createSocketTestHarness(&ring);
- defer fds.close();
- const data = [_]u8{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0xa, 0xb, 0xc, 0xd, 0xe };
+// COMMIT: add new struct type
+/// matches io_uring_clock_register in liburing
+pub const ClockRegister = extern struct {
+ clockid: u32,
+ __resv: [3]u32,
+};
- // Client sends data
- {
- _ = try ring.send(1, fds.client, data[0..], 0);
- const submitted = try ring.submit();
- try testing.expectEqual(1, submitted);
- const cqe_send = try ring.copy_cqe();
- if (cqe_send.err() == .INVAL) return error.SkipZigTest;
- try testing.expectEqual(Cqe{ .user_data = 1, .res = data.len, .flags = 0 }, cqe_send);
- }
+// COMMIT: add new struct type
+/// matches io_uring_clone_buffers in liburing
+pub const CloneBuffers = extern struct {
+ src_fd: u32,
+ flags: Flags,
+ src_off: u32,
+ dst_off: u32,
+ nr: u32,
+ pad: [3]u32,
- // Server uses buffer group receive
- {
- // Submit recv operation, buffer will be chosen from buffer group
- _ = try buf_grp.recv(2, fds.server, 0);
- const submitted = try ring.submit();
- try testing.expectEqual(1, submitted);
+ // COMMIT: new flags
+ pub const Flags = packed struct(u32) {
+ REGISTER_SRC_REGISTERED: bool = false,
+ REGISTER_DST_REPLACE: bool = false,
+ _: u30 = 0,
+ };
+};
- // ... 
when we have completion for recv operation - const cqe = try ring.copy_cqe(); - try testing.expectEqual(2, cqe.user_data); // matches submitted user_data - try testing.expect(cqe.res >= 0); // success - try testing.expectEqual(posix.E.SUCCESS, cqe.err()); - try testing.expectEqual(data.len, @as(usize, @intCast(cqe.res))); // cqe.res holds received data len +/// matches io_uring_buf in liburing +pub const Buffer = extern struct { + addr: u64, + len: u32, + bid: u16, + resv: u16, +}; - // Get buffer from pool - const buf = try buf_grp.get(cqe); - try testing.expectEqualSlices(u8, &data, buf); - // Release buffer to the kernel when application is done with it - try buf_grp.put(cqe); - } -} +/// matches io_uring_buf_ring in liburing +pub const BufferRing = extern struct { + resv1: u64, + resv2: u32, + resv3: u16, + tail: u16, +}; -test "ring mapped buffers recv" { - if (!is_linux) return error.SkipZigTest; +/// argument for IORING_(UN)REGISTER_PBUF_RING +/// matches io_uring_buf_reg in liburing +pub const BufferRegister = extern struct { + ring_addr: u64, + ring_entries: u32, + bgid: u16, + flags: Flags, + resv: [3]u64, - var ring = IoUring.init(16, .{}) catch |err| switch (err) { - error.SystemOutdated => return error.SkipZigTest, - error.PermissionDenied => return error.SkipZigTest, - else => return err, + // COMMIT: new IORING_REGISTER_PBUF_RING flags + /// Flags for IORING_REGISTER_PBUF_RING. + pub const Flags = packed struct(u16) { + /// IOU_PBUF_RING_MMAP: + /// If set, kernel will allocate the memory for the ring. + /// The application must not set a ring_addr in struct io_uring_buf_reg + /// instead it must subsequently call mmap(2) with the offset set + /// as: IORING_OFF_PBUF_RING | (bgid << IORING_OFF_PBUF_SHIFT) to get + /// a virtual mapping for the ring. + IOU_PBUF_RING_MMAP: bool = false, + /// IOU_PBUF_RING_INC: + /// If set, buffers consumed from this buffer ring can be + /// consumed incrementally. Normally one (or more) buffers + /// are fully consumed. With incremental consumptions, it's + /// feasible to register big ranges of buffers, and each + /// use of it will consume only as much as it needs. This + /// requires that both the kernel and application keep + /// track of where the current read/recv index is at. 
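+ /// A partially consumed buffer is signalled by IORING_CQE_F_BUF_MORE in
+ /// cqe.flags; `BufferGroup.put` above uses exactly this to advance its
+ /// per-buffer heads instead of recycling the buffer.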
+ IOU_PBUF_RING_INC: bool = false, + _: u14 = 0, }; - defer ring.deinit(); +}; - // init buffer group - const group_id: u16 = 1; // buffers group id - const buffers_count: u16 = 2; // number of buffers in buffer group - const buffer_size: usize = 4; // size of each buffer in group - var buf_grp = BufferGroup.init( - &ring, - testing.allocator, - group_id, - buffer_size, - buffers_count, - ) catch |err| switch (err) { - // kernel older than 5.19 - error.ArgumentsInvalid => return error.SkipZigTest, - else => return err, - }; - defer buf_grp.deinit(testing.allocator); +/// argument for IORING_REGISTER_PBUF_STATUS +/// matches io_uring_buf_status in liburing +pub const BufferStatus = extern struct { + /// input + buf_group: u32, + /// output + head: u32, + resv: [8]u32, +}; - // create client/server fds - const fds = try createSocketTestHarness(&ring); - defer fds.close(); +/// argument for IORING_(UN)REGISTER_NAPI +/// matches io_uring_napi in liburing +pub const Napi = extern struct { + busy_poll_to: u32, + prefer_busy_poll: u8, + pad: [3]u8, + resv: u64, +}; - // for random user_data in sqe/cqe - var Rnd = std.Random.DefaultPrng.init(std.testing.random_seed); - var rnd = Rnd.random(); +// COMMIT: new struct type +/// Argument for io_uring_enter(2) with +/// IORING_GETEVENTS | IORING_ENTER_EXT_ARG_REG set, where the actual argument +/// is an index into a previously registered fixed wait region described by +/// the below structure. +/// matches io_uring_reg_wait in liburing +pub const RegisterWait = extern struct { + ts: linux.kernel_timespec, + min_wait_usec: u32, + flags: Flags, + sigmask: u64, + sigmask_sz: u32, + pad: [3]u32, + pad2: [2]u64, - var round: usize = 4; // repeat send/recv cycle round times - while (round > 0) : (round -= 1) { - // client sends data - const data = [_]u8{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0xa, 0xb, 0xc, 0xd, 0xe }; - { - const user_data = rnd.int(u64); - _ = try ring.send(user_data, fds.client, data[0..], 0); - try testing.expectEqual(@as(u32, 1), try ring.submit()); - const cqe_send = try ring.copy_cqe(); - if (cqe_send.err() == .INVAL) return error.SkipZigTest; - try testing.expectEqual(Cqe{ .user_data = user_data, .res = data.len, .flags = 0 }, cqe_send); - } - var pos: usize = 0; + // COMMIT: new constant + pub const Flags = packed struct(u32) { + REG_WAIT_TS: bool = false, + _: u31 = 0, + }; +}; - // read first chunk - const cqe1 = try buf_grp_recv_submit_get_cqe(&ring, &buf_grp, fds.server, rnd.int(u64)); - var buf = try buf_grp.get(cqe1); - try testing.expectEqualSlices(u8, data[pos..][0..buf.len], buf); - pos += buf.len; - // second chunk - const cqe2 = try buf_grp_recv_submit_get_cqe(&ring, &buf_grp, fds.server, rnd.int(u64)); - buf = try buf_grp.get(cqe2); - try testing.expectEqualSlices(u8, data[pos..][0..buf.len], buf); - pos += buf.len; +/// Argument for io_uring_enter(2) with IORING_GETEVENTS | IORING_ENTER_EXT_ARG +/// matches io_uring_getevents_arg in liburing +pub const GetEventsArg = extern struct { + sigmask: u64, + sigmask_sz: u32, + pad: u32, + ts: u64, +}; - // both buffers provided to the kernel are used so we get error - // 'no more buffers', until we put buffers to the kernel - { - const user_data = rnd.int(u64); - _ = try buf_grp.recv(user_data, fds.server, 0); - try testing.expectEqual(@as(u32, 1), try ring.submit()); - const cqe = try ring.copy_cqe(); - try testing.expectEqual(user_data, cqe.user_data); - try testing.expect(cqe.res < 0); // fail - try testing.expectEqual(posix.E.NOBUFS, cqe.err()); - try 
testing.expect(!cqe.flags.F_BUFFER); // IORING_CQE_F_BUFFER flags is set on success only - try testing.expectError(error.NoBufferSelected, cqe.buffer_id()); - } +// COMMIT: fix type definition of io_uring_sync_cancel_reg +/// Argument for IORING_REGISTER_SYNC_CANCEL +/// matches io_uring_sync_cancel_reg in liburing +pub const SyncCancelRegister = extern struct { + addr: u64, + fd: i32, + flags: uflags.AsyncCancel, + timeout: linux.kernel_timespec, + opcode: Op, + pad: [7]u8, + pad2: [4]u64, +}; - // put buffers back to the kernel - try buf_grp.put(cqe1); - try buf_grp.put(cqe2); +/// Argument for IORING_REGISTER_FILE_ALLOC_RANGE +/// The range is specified as [off, off + len) +/// matches io_uring_file_index_range in liburing +pub const FileIndexRange = extern struct { + off: u32, + len: u32, + resv: u64, +}; - // read remaining data - while (pos < data.len) { - const cqe = try buf_grp_recv_submit_get_cqe(&ring, &buf_grp, fds.server, rnd.int(u64)); - buf = try buf_grp.get(cqe); - try testing.expectEqualSlices(u8, data[pos..][0..buf.len], buf); - pos += buf.len; - try buf_grp.put(cqe); - } - } -} +/// matches io_uring_recvmsg_out in liburing +pub const RecvmsgOut = extern struct { + namelen: u32, + controllen: u32, + payloadlen: u32, + flags: u32, +}; -test "ring mapped buffers multishot recv" { - if (!is_linux) return error.SkipZigTest; +/// Zero copy receive refill queue entry +/// matches io_uring_zcrx_rqe in liburing +pub const ZcrxRqe = extern struct { + off: u64, + len: u32, + __pad: u32, +}; - var ring = IoUring.init(16, .{}) catch |err| switch (err) { - error.SystemOutdated => return error.SkipZigTest, - error.PermissionDenied => return error.SkipZigTest, - else => return err, - }; - defer ring.deinit(); +/// matches io_uring_zcrx_cqe in liburing +pub const ZcrxCqe = extern struct { + off: u64, + __pad: u64, +}; - // init buffer group - const group_id: u16 = 1; // buffers group id - const buffers_count: u16 = 2; // number of buffers in buffer group - const buffer_size: usize = 4; // size of each buffer in group - var buf_grp = BufferGroup.init( - &ring, - testing.allocator, - group_id, - buffer_size, - buffers_count, - ) catch |err| switch (err) { - // kernel older than 5.19 - error.ArgumentsInvalid => return error.SkipZigTest, - else => return err, - }; - defer buf_grp.deinit(testing.allocator); +/// matches io_uring_zcrx_offsets in liburing +pub const ZcrxOffsets = extern struct { + head: u32, + tail: u32, + rqes: u32, + __resv2: u32, + __resv: [2]u64, +}; - // create client/server fds - const fds = try createSocketTestHarness(&ring); - defer fds.close(); +/// matches io_uring_zcrx_area_reg in liburing +pub const ZcrxAreaRegister = extern struct { + addr: u64, + len: u64, + rq_area_token: u64, + flags: Flags, + dmabuf_fd: u32, + __resv2: [2]u64, - // for random user_data in sqe/cqe - var Rnd = std.Random.DefaultPrng.init(std.testing.random_seed); - var rnd = Rnd.random(); + pub const Flags = packed struct(u32) { + DMABUF: bool = false, + _: u31 = 0, + }; +}; - var round: usize = 4; // repeat send/recv cycle round times - while (round > 0) : (round -= 1) { - // client sends data - const data = [_]u8{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf }; - { - const user_data = rnd.int(u64); - _ = try ring.send(user_data, fds.client, data[0..], 0); - try testing.expectEqual(@as(u32, 1), try ring.submit()); - const cqe_send = try ring.copy_cqe(); - if (cqe_send.err() == .INVAL) return error.SkipZigTest; - try testing.expectEqual(Cqe{ .user_data = user_data, .res = data.len, 
.flags = 0 }, cqe_send);
- }
+/// Argument for IORING_REGISTER_ZCRX_IFQ
+/// matches io_uring_zcrx_ifq_reg in liburing
+pub const ZcrxIfqRegister = extern struct {
+ if_idx: u32,
+ if_rxq: u32,
+ rq_entries: u32,
+ // TODO: find out its flags, I suspect it's ZcrxAreaRegister.Flags
+ flags: u32,
+ /// pointer to struct io_uring_zcrx_area_reg
+ area_ptr: u64,
+ /// struct io_uring_region_desc
+ region_ptr: u64,
+ offsets: ZcrxOffsets,
+ zcrx_id: u32,
+ __resv2: u32,
+ __resv: [3]u64,
+};
- // start multishot recv
- var recv_user_data = rnd.int(u64);
- _ = try buf_grp.recv_multishot(recv_user_data, fds.server, 0);
- try testing.expectEqual(@as(u32, 1), try ring.submit()); // submit
+// COMMIT: move IoUring constants to Constants
+pub const constants = struct {
+ /// If sqe.file_index (splice_fd_in in Zig Struct) is set to this for opcodes that instantiate a new
+ /// direct descriptor (like openat/openat2/accept), then io_uring will allocate
+ /// an available direct descriptor instead of having the application pass one
+ /// in. The picked direct descriptor will be returned in cqe.res, or -ENFILE
+ /// if the space is full.
+ pub const FILE_INDEX_ALLOC = std.math.maxInt(u32);
- // server reads data into provided buffers
- // there are 2 buffers of size 4, so each read gets only chunk of data
- // we read four chunks of 4, 4, 4, 4 bytes each
- var chunk: []const u8 = data[0..buffer_size]; // first chunk
- const cqe1 = try expect_buf_grp_cqe(&ring, &buf_grp, recv_user_data, chunk);
- try testing.expect(cqe1.flags.F_MORE);
+ pub const CMD_MASK = 1 << 0;
- chunk = data[buffer_size .. buffer_size * 2]; // second chunk
- const cqe2 = try expect_buf_grp_cqe(&ring, &buf_grp, recv_user_data, chunk);
- try testing.expect(cqe2.flags.F_MORE);
+ pub const TIMEOUT_CLOCK_MASK = ((1 << 2) | (1 << 3));
+ pub const TIMEOUT_UPDATE_MASK = ((1 << 1) | (1 << 4));
- // both buffers provided to the kernel are used so we get error
- // 'no more buffers', until we put buffers to the kernel
- {
- const cqe = try ring.copy_cqe();
- try testing.expectEqual(recv_user_data, cqe.user_data);
- try testing.expect(cqe.res < 0); // fail
- try testing.expectEqual(posix.E.NOBUFS, cqe.err());
- try testing.expect(!cqe.flags.F_BUFFER); // IORING_CQE_F_BUFFER flags is set on success only
- // has more is not set
- // indicates that multishot is finished
- try testing.expect(!cqe.flags.F_MORE);
- try testing.expectError(error.NoBufferSelected, cqe.buffer_id());
- }
+ pub const CQE_BUFFER_SHIFT = 16;
- // put buffers back to the kernel
- try buf_grp.put(cqe1);
- try buf_grp.put(cqe2);
+ /// cqe.res for IORING_CQE_F_NOTIF if IORING_SEND_ZC_REPORT_USAGE was
+ /// requested. It should be treated as a flag, all other bits of cqe.res
+ /// should be treated as reserved!
+ pub const NOTIF_USAGE_ZC_COPIED = (1 << 31);
- // restart multishot
- recv_user_data = rnd.int(u64);
- _ = try buf_grp.recv_multishot(recv_user_data, fds.server, 0);
- try testing.expectEqual(@as(u32, 1), try ring.submit()); // submit
+ // Magic offsets for the application to mmap the data it needs
+ pub const OFF_SQ_RING = 0;
+ pub const OFF_CQ_RING = 0x8000000;
+ pub const OFF_SQES = 0x10000000;
+ // COMMIT: new magic constants
+ pub const OFF_PBUF_RING = 0x80000000;
+ pub const OFF_PBUF_SHIFT = 16;
+ pub const OFF_MMAP_MASK = 0xf8000000;
- chunk = data[buffer_size * 2 .. 
buffer_size * 3]; // third chunk - const cqe3 = try expect_buf_grp_cqe(&ring, &buf_grp, recv_user_data, chunk); - try testing.expect(cqe3.flags.F_MORE); - try buf_grp.put(cqe3); + /// Register a fully sparse file space, rather than pass in an array of all -1 file descriptors. + pub const RSRC_REGISTER_SPARSE = 1 << 0; - chunk = data[buffer_size * 3 ..]; // last chunk - const cqe4 = try expect_buf_grp_cqe(&ring, &buf_grp, recv_user_data, chunk); - try testing.expect(cqe4.flags.F_MORE); - try buf_grp.put(cqe4); + /// Skip updating fd indexes set to this value in the fd table + pub const REGISTER_FILES_SKIP = -2; - // cancel pending multishot recv operation - { - const cancel_user_data = rnd.int(u64); - _ = try ring.cancel(cancel_user_data, recv_user_data, 0); - try testing.expectEqual(@as(u32, 1), try ring.submit()); + // COMMIT: new TX Timestamp definition + /// SOCKET_URING_OP_TX_TIMESTAMP definitions + pub const TIMESTAMP_HW_SHIFT = 16; + /// The cqe.flags bit from which the timestamp type is stored + pub const TIMESTAMP_TYPE_SHIFT = (TIMESTAMP_HW_SHIFT + 1); + /// The cqe.flags flag signifying whether it's a hardware timestamp + pub const CQE_F_TSTAMP_HW = (1 << TIMESTAMP_HW_SHIFT); - // expect completion of cancel operation and completion of recv operation - var cqe_cancel = try ring.copy_cqe(); - if (cqe_cancel.err() == .INVAL) return error.SkipZigTest; - var cqe_recv = try ring.copy_cqe(); - if (cqe_recv.err() == .INVAL) return error.SkipZigTest; + /// The bit from which area id is encoded into offsets + pub const ZCRX_AREA_SHIFT = 48; + pub const ZCRX_AREA_MASK = (~((1 << ZCRX_AREA_SHIFT) - 1)); - // don't depend on order of completions - if (cqe_cancel.user_data == recv_user_data and cqe_recv.user_data == cancel_user_data) { - const a = cqe_cancel; - const b = cqe_recv; - cqe_cancel = b; - cqe_recv = a; - } + // flag added to the opcode to use a registered ring fd + pub const REGISTER_USE_REGISTERED_RING = 1 << 31; +}; - // Note on different kernel results: - // on older kernel (tested with v6.0.16, v6.1.57, v6.2.12, v6.4.16) - // cqe_cancel.err() == .NOENT - // cqe_recv.err() == .NOBUFS - // on kernel (tested with v6.5.0, v6.5.7) - // cqe_cancel.err() == .SUCCESS - // cqe_recv.err() == .CANCELED - // Upstream reference: https://github.com/axboe/liburing/issues/984 +// COMMIT: move IoUring flags to Flags struct +pub const uflags = struct { + /// io_uring_setup() flags + pub const Setup = packed struct(u32) { + /// io_context is polled + IOPOLL: bool = false, + /// SQ poll thread + SQPOLL: bool = false, + /// sq_thread_cpu is valid + SQ_AFF: bool = false, + /// app defines CQ size + CQSIZE: bool = false, + /// clamp SQ/CQ ring sizes + CLAMP: bool = false, + /// attach to existing wq + ATTACH_WQ: bool = false, + /// start with ring disabled + R_DISABLED: bool = false, + /// continue submit on error + SUBMIT_ALL: bool = false, + ///Cooperative task running. When requests complete, they often require + ///forcing the submitter to transition to the kernel to complete. If this + ///flag is set, work will be done when the task transitions anyway, rather + ///than force an inter-processor interrupt reschedule. This avoids interrupting + ///a task running in userspace, and saves an IPI. + COOP_TASKRUN: bool = false, + ///If COOP_TASKRUN is set, get notified if task work is available for + ///running and a kernel transition would be needed to run it. This sets + ///IORING_SQ_TASKRUN in the sq ring flags. Not valid with COOP_TASKRUN. 
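+    /// (Usage note: an application that sets TASKRUN_FLAG is expected to watch + /// for IORING_SQ_TASKRUN in the SQ ring flags and then enter the kernel, for + /// example io_uring_enter(2) with GETEVENTS, so that pending task work runs.)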
+ TASKRUN_FLAG: bool = false, + /// SQEs are 128 byte + SQE128: bool = false, + /// CQEs are 32 byte + CQE32: bool = false, + /// Only one task is allowed to submit requests + SINGLE_ISSUER: bool = false, + /// Defer running task work to get events. + /// Rather than running bits of task work whenever the task transitions + /// try to do it just before it is needed. + DEFER_TASKRUN: bool = false, + /// Application provides the memory for the rings + NO_MMAP: bool = false, + /// Register the ring fd in itself for use with + /// IORING_REGISTER_USE_REGISTERED_RING; return a registered fd index rather + /// than an fd. + REGISTERED_FD_ONLY: bool = false, + /// Removes indirection through the SQ index array. + NO_SQARRAY: bool = false, + // COMMIT: new setup flags + /// Use hybrid poll in iopoll process + HYBRID_IOPOLL: bool = false, + /// Allow both 16b and 32b CQEs. If a 32b CQE is posted, it will have + /// IORING_CQE_F_32 set in cqe.flags. + CQE_MIXED: bool = false, + _unused: u13 = 0, + }; - + /// sqe.uring_cmd_flags (rw_flags in the Zig struct) + /// top 8bits aren't available for userspace + pub const Cmd = packed struct(u32) { + /// IORING_URING_CMD_FIXED + /// use registered buffer; pass this flag along with setting sqe.buf_index. + CMD_FIXED: bool = false, + _unused: u31 = 0, + }; - + /// sqe.fsync_flags (rw_flags in the Zig struct) + pub const Fsync = packed struct(u32) { + DATASYNC: bool = false, + _unused: u31 = 0, + }; -// Prepare, submit recv and get cqe using buffer group. 
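To make the migration away from raw IORING_SETUP_* masks concrete, here is a minimal sketch of how the typed flag structs above are meant to be used (assuming the namespace stays `uflags` on `IoUring` as declared in this patch; the test name and import paths are illustrative only):

    const std = @import("std");
    const linux = std.os.linux;

    test "typed Setup flags round-trip to the raw kernel mask" {
        // Field order mirrors the IORING_SETUP_* bit positions: SQPOLL is
        // bit 1 and SQ_AFF is bit 2, i.e. (1 << 1) | (1 << 2) in the old
        // u32 encoding, so a plain @bitCast recovers the exact legacy mask.
        const setup: linux.IoUring.uflags.Setup = .{ .SQPOLL = true, .SQ_AFF = true };
        try std.testing.expectEqual(@as(u32, (1 << 1) | (1 << 2)), @as(u32, @bitCast(setup)));
    }

The same @bitCast round-trip is what the deprecated integer constants restored later in this series are built from.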
-fn buf_grp_recv_submit_get_cqe( - ring: *IoUring, - buf_grp: *BufferGroup, - fd: linux.fd_t, - user_data: u64, -) !Cqe { - // prepare and submit recv - const sqe = try buf_grp.recv(user_data, fd, 0); - try testing.expect(sqe.flags.BUFFER_SELECT); - try testing.expect(sqe.buf_index == buf_grp.group_id); - try testing.expectEqual(@as(u32, 1), try ring.submit()); // submit - // get cqe, expect success - const cqe = try ring.copy_cqe(); - try testing.expectEqual(user_data, cqe.user_data); - try testing.expect(cqe.res >= 0); // success - try testing.expectEqual(posix.E.SUCCESS, cqe.err()); - try testing.expect(cqe.flags.F_BUFFER); // IORING_CQE_F_BUFFER flag is set + /// sqe.timeout_flags + pub const Timeout = packed struct(u32) { + TIMEOUT_ABS: bool = false, + /// Available since Linux 5.11 + TIMEOUT_UPDATE: bool = false, + /// Available since Linux 5.15 + TIMEOUT_BOOTTIME: bool = false, + /// Available since Linux 5.15 + TIMEOUT_REALTIME: bool = false, + /// Available since Linux 5.15 + LINK_TIMEOUT_UPDATE: bool = false, + /// Available since Linux 5.16 + TIMEOUT_ETIME_SUCCESS: bool = false, + // COMMIT: new Timeout Flag + // TODO: add when it became available + TIMEOUT_MULTISHOT: bool = false, + _unused: u25 = 0, + }; - return cqe; -} + /// sqe.splice_flags (rw_flags in Zig Struct) + /// extends splice(2) flags + pub const Splice = packed struct(u32) { + _unused: u31 = 0, + /// the last bit of __u32 + F_FD_IN_FIXED: bool = false, + }; -fn expect_buf_grp_cqe( - ring: *IoUring, - buf_grp: *BufferGroup, - user_data: u64, - expected: []const u8, -) !Cqe { - // get cqe - const cqe = try ring.copy_cqe(); - try testing.expectEqual(user_data, cqe.user_data); - try testing.expect(cqe.res >= 0); // success - try testing.expect(cqe.flags.F_BUFFER); // IORING_CQE_F_BUFFER flag is set - try testing.expectEqual(expected.len, @as(usize, @intCast(cqe.res))); - try testing.expectEqual(posix.E.SUCCESS, cqe.err()); + /// POLL_ADD flags. Note that since sqe.poll_events (rw_flags in Zig Struct) + /// is the flag space, the command flags for POLL_ADD are stored in sqe.len. + pub const Poll = packed struct(u32) { + /// IORING_POLL_ADD_MULTI + /// Multishot poll. Sets IORING_CQE_F_MORE if the poll handler will continue + /// to report CQEs on behalf of the same SQE. + ADD_MULTI: bool = false, + // TODO: verify this doc comment is valid for the 2 flags below + /// IORING_POLL_UPDATE + /// Update existing poll request, matching sqe.addr as the old user_data + /// field. + UPDATE_EVENTS: bool = false, + /// IORING_POLL_UPDATE + /// Update existing poll request, matching sqe.addr as the old user_data + /// field. + UPDATE_USER_DATA: bool = false, + /// IORING_POLL_LEVEL + /// Level triggered poll. + ADD_LEVEL: bool = false, + _unused: u28 = 0, + }; - // get buffer from pool - const buffer_id = try cqe.buffer_id(); - const len = @as(usize, @intCast(cqe.res)); - const buf = buf_grp.get_by_id(buffer_id)[0..len]; - try testing.expectEqualSlices(u8, expected, buf); + /// ASYNC_CANCEL flags. 
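+    /// (By default a cancel request matches one request by its user_data; the + /// flags below either widen the match (CANCEL_ALL, CANCEL_ANY) or change + /// the key being matched (CANCEL_FD, CANCEL_OP).)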
+ pub const AsyncCancel = packed struct(u32) { + /// IORING_ASYNC_CANCEL_ALL + /// Cancel all requests that match the given key + CANCEL_ALL: bool = false, + /// IORING_ASYNC_CANCEL_FD + /// Key off 'fd' for cancelation rather than the request 'user_data' + CANCEL_FD: bool = false, + /// IORING_ASYNC_CANCEL_ANY + /// Match any request + CANCEL_ANY: bool = false, + /// IORING_ASYNC_CANCEL_FD_FIXED + /// 'fd' passed in is a fixed descriptor + CANCEL_FD_FIXED: bool = false, + // COMMIT: new AsyncCancel Flags + /// IORING_ASYNC_CANCEL_USERDATA + /// Match on user_data, default for no other key + CANCEL_USERDATA: bool = false, + /// IORING_ASYNC_CANCEL_OP + /// Match request based on opcode + CANCEL_OP: bool = false, + _unused: u26 = 0, + }; - return cqe; -} + /// IORING_OP_MSG_RING flags (sqe.msg_ring_flags or sqe.rw_flags in Zig Struct) + pub const MsgRing = packed struct(u32) { + /// IORING_MSG_RING_CQE_SKIP Don't post a CQE to the target ring. + /// Not applicable for IORING_MSG_DATA, obviously. + CQE_SKIP: bool = false, + /// Pass through the flags from sqe.file_index to cqe.flags + FLAGS_PASS: bool = false, + _unused: u30 = 0, + }; -test "copy_cqes with wrapping sq.cqes buffer" { - if (!is_linux) return error.SkipZigTest; + // COMMIT: new flag + /// IORING_OP_FIXED_FD_INSTALL flags (sqe.install_fd_flags or sqe.rw_flags in Zig Struct) + pub const FixedFd = packed struct(u32) { + /// IORING_FIXED_FD_NO_CLOEXEC Don't mark the fd as O_CLOEXEC + NO_CLOEXEC: bool = false, + _unused: u31 = 0, + }; - var ring = IoUring.init(2, .{}) catch |err| switch (err) { - error.SystemOutdated => return error.SkipZigTest, - error.PermissionDenied => return error.SkipZigTest, - else => return err, + // COMMIT: new flags + /// IORING_OP_NOP flags (sqe.nop_flags or sqe.rw_flags in Zig Struct) + pub const Nop = packed struct(u32) { + /// IORING_NOP_INJECT_RESULT Inject result from sqe.result + INJECT_RESULT: bool = false, + _unused: u4 = 0, + CQE32: bool = false, + _unused_1: u26 = 0, }; - defer ring.deinit(); - try testing.expectEqual(2, ring.sq.sqes.len); - try testing.expectEqual(4, ring.cq.cqes.len); + /// io_uring_enter(2) flags + pub const Enter = packed struct(u32) { + GETEVENTS: bool = false, + SQ_WAKEUP: bool = false, + SQ_WAIT: bool = false, + EXT_ARG: bool = false, + REGISTERED_RING: bool = false, + // COMMIT: new flags + ABS_TIMER: bool = false, + EXT_ARG_REG: bool = false, + NO_IOWAIT: bool = false, + _unused: u24 = 0, - // submit 2 entries, receive 2 completions - var cqes: [8]Cqe = undefined; - { - for (0..2) |_| { - const sqe = try ring.get_sqe(); - sqe.prep_timeout(&.{ .sec = 0, .nsec = 10000 }, 0, .{}); - try testing.expect(try ring.submit() == 1); - } - var cqe_count: u32 = 0; - while (cqe_count < 2) { - cqe_count += try ring.copy_cqes(&cqes, 2 - cqe_count); + pub fn empty(enter_flags: Enter) bool { + return @as(u32, @bitCast(enter_flags)) == 0; } - } + }; - try testing.expectEqual(2, ring.cq.head.*); + /// io_uring_params.features flags + pub const Features = packed struct(u32) { + SINGLE_MMAP: bool = false, + NODROP: bool = false, + SUBMIT_STABLE: bool = false, + RW_CUR_POS: bool = false, + CUR_PERSONALITY: bool = false, + FAST_POLL: bool = false, + POLL_32BITS: bool = false, + SQPOLL_NONFIXED: bool = false, + EXT_ARG: bool = false, + NATIVE_WORKERS: bool = false, + RSRC_TAGS: bool = false, + CQE_SKIP: bool = false, + LINKED_FILE: bool = false, + // COMMIT: add new Feature Flags + REG_REG_RING: bool = false, + RECVSEND_BUNDLE: bool = false, + MIN_TIMEOUT: bool = false, + RW_ATTR: bool = false, + NO_IOWAIT: bool = 
false, + _unused: u14 = 0, - // sq.sqes len is 4, starting at position 2 - // every 4 entries submit wraps completion buffer - // we are reading ring.cq.cqes at indexes 2,3,0,1 - for (1..1024) |i| { - for (0..4) |_| { - const sqe = try ring.get_sqe(); - sqe.prep_timeout(&.{ .sec = 0, .nsec = 10000 }, 0, .{}); - try testing.expect(try ring.submit() == 1); - } - var cqe_count: u32 = 0; - while (cqe_count < 4) { - cqe_count += try ring.copy_cqes(&cqes, 4 - cqe_count); + pub fn empty(features: Features) bool { + return @as(u32, @bitCast(features)) == 0; } - try testing.expectEqual(4, cqe_count); - try testing.expectEqual(2 + 4 * i, ring.cq.head.*); - } -} + }; +}; +/// io_uring_register(2) opcodes and arguments +/// matches io_uring_register_op in liburing +pub const RegisterOp = enum(u8) { + REGISTER_BUFFERS, + UNREGISTER_BUFFERS, + REGISTER_FILES, + UNREGISTER_FILES, + REGISTER_EVENTFD, + UNREGISTER_EVENTFD, + REGISTER_FILES_UPDATE, + REGISTER_EVENTFD_ASYNC, + REGISTER_PROBE, + REGISTER_PERSONALITY, + UNREGISTER_PERSONALITY, + REGISTER_RESTRICTIONS, + REGISTER_ENABLE_RINGS, + + // extended with tagging + REGISTER_FILES2, + REGISTER_FILES_UPDATE2, + REGISTER_BUFFERS2, + REGISTER_BUFFERS_UPDATE, + + // set/clear io-wq thread affinities + REGISTER_IOWQ_AFF, + UNREGISTER_IOWQ_AFF, + + // set/get max number of io-wq workers + REGISTER_IOWQ_MAX_WORKERS, + + // register/unregister io_uring fd with the ring + REGISTER_RING_FDS, + UNREGISTER_RING_FDS, + + // register ring based provide buffer group + REGISTER_PBUF_RING, + UNREGISTER_PBUF_RING, -test "bind/listen/connect" { - if (builtin.cpu.arch == .s390x) return error.SkipZigTest; // https://github.com/ziglang/zig/issues/25956 + // sync cancelation API + REGISTER_SYNC_CANCEL, - var ring = IoUring.init(4, .{}) catch |err| switch (err) { - error.SystemOutdated => return error.SkipZigTest, - error.PermissionDenied => return error.SkipZigTest, - else => return err, - }; - defer ring.deinit(); + // register a range of fixed file slots for automatic slot allocation + REGISTER_FILE_ALLOC_RANGE, - const probe = ring.get_probe() catch return error.SkipZigTest; - // LISTEN is higher required operation - if (!probe.is_supported(.LISTEN)) return error.SkipZigTest; + // return status information for a buffer group + REGISTER_PBUF_STATUS, - var addr: linux.sockaddr.in = .{ - .port = 0, - .addr = @bitCast([4]u8{ 127, 0, 0, 1 }), - }; - const proto: u32 = if (addr.family == linux.AF.UNIX) 0 else linux.IPPROTO.TCP; + // set/clear busy poll settings + REGISTER_NAPI, + UNREGISTER_NAPI, - const listen_fd = brk: { - // Create socket - _ = try ring.socket(1, addr.family, linux.SOCK.STREAM | linux.SOCK.CLOEXEC, proto, 0); - try testing.expectEqual(1, try ring.submit()); - var cqe = try ring.copy_cqe(); - try testing.expectEqual(1, cqe.user_data); - try testing.expectEqual(posix.E.SUCCESS, cqe.err()); - const listen_fd: linux.fd_t = @intCast(cqe.res); - try testing.expect(listen_fd > 2); + REGISTER_CLOCK, - // Prepare: set socket option * 2, bind, listen - var optval: u32 = 1; - (try ring.setsockopt(2, listen_fd, linux.SOL.SOCKET, linux.SO.REUSEADDR, mem.asBytes(&optval))).link_next(); - (try ring.setsockopt(3, listen_fd, linux.SOL.SOCKET, linux.SO.REUSEPORT, mem.asBytes(&optval))).link_next(); - (try ring.bind(4, listen_fd, addrAny(&addr), @sizeOf(linux.sockaddr.in), 0)).link_next(); - _ = try ring.listen(5, listen_fd, 1, 0); - // Submit 4 operations - try testing.expectEqual(4, try ring.submit()); - // Expect all to succeed - for (2..6) |user_data| { - cqe = try 
ring.copy_cqe(); - try testing.expectEqual(user_data, cqe.user_data); - try testing.expectEqual(posix.E.SUCCESS, cqe.err()); - } + // clone registered buffers from source ring to current ring + REGISTER_CLONE_BUFFERS, - // Check that socket option is set - optval = 0; - _ = try ring.getsockopt(5, listen_fd, linux.SOL.SOCKET, linux.SO.REUSEADDR, mem.asBytes(&optval)); - try testing.expectEqual(1, try ring.submit()); - cqe = try ring.copy_cqe(); - try testing.expectEqual(5, cqe.user_data); - try testing.expectEqual(posix.E.SUCCESS, cqe.err()); - try testing.expectEqual(1, optval); + // send MSG_RING without having a ring + REGISTER_SEND_MSG_RING, - // Read system assigned port into addr - var addr_len: posix.socklen_t = @sizeOf(linux.sockaddr.in); - try posix.getsockname(listen_fd, addrAny(&addr), &addr_len); + // register a netdev hw rx queue for zerocopy + REGISTER_ZCRX_IFQ, - break :brk listen_fd; - }; + // resize CQ ring + REGISTER_RESIZE_RINGS, - const connect_fd = brk: { - // Create connect socket - _ = try ring.socket(6, addr.family, linux.SOCK.STREAM | linux.SOCK.CLOEXEC, proto, 0); - try testing.expectEqual(1, try ring.submit()); - const cqe = try ring.copy_cqe(); - try testing.expectEqual(6, cqe.user_data); - try testing.expectEqual(posix.E.SUCCESS, cqe.err()); - // Get connect socket fd - const connect_fd: linux.fd_t = @intCast(cqe.res); - try testing.expect(connect_fd > 2 and connect_fd != listen_fd); - break :brk connect_fd; - }; + REGISTER_MEM_REGION, - // Prepare accept/connect operations - _ = try ring.accept(7, listen_fd, null, null, 0); - _ = try ring.connect(8, connect_fd, addrAny(&addr), @sizeOf(linux.sockaddr.in)); - try testing.expectEqual(2, try ring.submit()); - // Get listener accepted socket - var accept_fd: posix.socket_t = 0; - for (0..2) |_| { - const cqe = try ring.copy_cqe(); - try testing.expectEqual(posix.E.SUCCESS, cqe.err()); - if (cqe.user_data == 7) { - accept_fd = @intCast(cqe.res); - } else { - try testing.expectEqual(8, cqe.user_data); - } - } - try testing.expect(accept_fd > 2 and accept_fd != listen_fd and accept_fd != connect_fd); + // COMMIT: new register opcode + // query various aspects of io_uring, see linux/io_uring/query.h + REGISTER_QUERY, - // Communicate - try testSendRecv(&ring, connect_fd, accept_fd); - try testSendRecv(&ring, accept_fd, connect_fd); + _, +}; - // Shutdown and close all sockets - for ([_]posix.socket_t{ connect_fd, accept_fd, listen_fd }) |fd| { - (try ring.shutdown(9, fd, posix.SHUT.RDWR)).link_next(); - _ = try ring.close(10, fd); - try testing.expectEqual(2, try ring.submit()); - for (0..2) |i| { - const cqe = try ring.copy_cqe(); - try testing.expectEqual(posix.E.SUCCESS, cqe.err()); - try testing.expectEqual(9 + i, cqe.user_data); - } - } -} +/// io-wq worker categories +/// matches io_wq_type in liburing +pub const IoWqCategory = enum(u8) { + BOUND, + UNBOUND, +}; -fn testSendRecv(ring: *IoUring, send_fd: posix.socket_t, recv_fd: posix.socket_t) !void { - const buffer_send = "0123456789abcdf" ** 10; - var buffer_recv: [buffer_send.len * 2]u8 = undefined; +pub const SocketOp = enum(u16) { + SIOCIN, + SIOCOUTQ, + GETSOCKOPT, + SETSOCKOPT, + // COMMIT: new socket op + TX_TIMESTAMP, +}; - // 2 sends - _ = try ring.send(1, send_fd, buffer_send, linux.MSG.WAITALL); - _ = try ring.send(2, send_fd, buffer_send, linux.MSG.WAITALL); - try testing.expectEqual(2, try ring.submit()); - for (0..2) |i| { - const cqe = try ring.copy_cqe(); - try testing.expectEqual(1 + i, cqe.user_data); - try testing.expectEqual(posix.E.SUCCESS, 
cqe.err()); - try testing.expectEqual(buffer_send.len, @as(usize, @intCast(cqe.res))); - } +/// io_uring_restriction.opcode values +/// matches io_uring_register_restriction_op in liburing +pub const RestrictionOp = enum(u16) { + /// Allow an io_uring_register(2) opcode + REGISTER_OP = 0, + /// Allow an sqe opcode + SQE_OP = 1, + /// Allow sqe flags + SQE_FLAGS_ALLOWED = 2, + /// Require sqe flags (these flags must be set on each submission) + SQE_FLAGS_REQUIRED = 3, - // receive - var recv_len: usize = 0; - while (recv_len < buffer_send.len * 2) { - _ = try ring.recv(3, recv_fd, .{ .buffer = buffer_recv[recv_len..] }, 0); - try testing.expectEqual(1, try ring.submit()); - const cqe = try ring.copy_cqe(); - try testing.expectEqual(3, cqe.user_data); - try testing.expectEqual(posix.E.SUCCESS, cqe.err()); - recv_len += @intCast(cqe.res); - } + _, +}; - // inspect recv buffer - try testing.expectEqualSlices(u8, buffer_send, buffer_recv[0..buffer_send.len]); - try testing.expectEqualSlices(u8, buffer_send, buffer_recv[buffer_send.len..]); -} +/// IORING_OP_MSG_RING command types, stored in sqe.addr +pub const MsgRingCmd = enum { + /// pass sqe->len as 'res' and off as user_data + DATA, + /// send a registered fd to another ring + SEND_FD, +}; -fn addrAny(addr: *linux.sockaddr.in) *linux.sockaddr { - return @ptrCast(addr); -} +// COMMIT: OP to IoUring +pub const Op = enum(u8) { + NOP, + READV, + WRITEV, + FSYNC, + READ_FIXED, + WRITE_FIXED, + POLL_ADD, + POLL_REMOVE, + SYNC_FILE_RANGE, + SENDMSG, + RECVMSG, + TIMEOUT, + TIMEOUT_REMOVE, + ACCEPT, + ASYNC_CANCEL, + LINK_TIMEOUT, + CONNECT, + FALLOCATE, + OPENAT, + CLOSE, + FILES_UPDATE, + STATX, + READ, + WRITE, + FADVISE, + MADVISE, + SEND, + RECV, + EPOLL_CTL, + OPENAT2, + SPLICE, + PROVIDE_BUFFERS, + REMOVE_BUFFERS, + TEE, + SHUTDOWN, + RENAMEAT, + UNLINKAT, + MKDIRAT, + SYMLINKAT, + LINKAT, + MSG_RING, + FSETXATTR, + SETXATTR, + FGETXATTR, + GETXATTR, + SOCKET, + URING_CMD, + SEND_ZC, + SENDMSG_ZC, + READ_MULTISHOT, + WAITID, + FUTEX_WAIT, + FUTEX_WAKE, + FUTEX_WAITV, + FIXED_FD_INSTALL, + FTRUNCATE, + BIND, + LISTEN, + RECV_ZC, + // COMMIT: new OPs + // TODO: to be implemented + EPOLL_WAIT, + READV_FIXED, + WRITEV_FIXED, + PIPE, + + _, +}; diff --git a/lib/std/os/linux/test.zig b/lib/std/os/linux/test.zig index f5533a54bdf4..3290e0466691 100644 --- a/lib/std/os/linux/test.zig +++ b/lib/std/os/linux/test.zig @@ -85,7 +85,7 @@ test "statx" { defer file.close(); var statx_buf: linux.Statx = undefined; - switch (linux.errno(linux.statx(file.handle, "", linux.AT.EMPTY_PATH, linux.STATX_BASIC_STATS, &statx_buf))) { + switch (linux.errno(linux.statx(file.handle, "", linux.AT.EMPTY_PATH, linux.Statx.Mask.basic_stats, &statx_buf))) { .SUCCESS => {}, else => unreachable, } From 79475dc2b945f0e7cf7a3c4c77ebf636cc42bcf6 Mon Sep 17 00:00:00 2001 From: Bernard Assan Date: Wed, 1 Oct 2025 23:06:56 +0000 Subject: [PATCH 06/41] Replace AT,W,SHUT,SOCK with a packed struct Flag type Use the new Flag types in IoUring fix some tests Signed-off-by: Bernard Assan --- lib/std/os/linux.zig | 142 ++++++++++++++++++++++------------- lib/std/os/linux/IoUring.zig | 66 ++++++++-------- 2 files changed, 121 insertions(+), 87 deletions(-) diff --git a/lib/std/os/linux.zig b/lib/std/os/linux.zig index 2854f72564f9..01717bd08b23 --- a/lib/std/os/linux.zig +++ b/lib/std/os/linux.zig @@ -3476,41 +3476,41 @@ pub const STDIN_FILENO = 0; pub const STDOUT_FILENO = 1; pub const STDERR_FILENO = 2; -pub const AT = struct { - /// Special value used to indicate openat should use 
the current working directory - pub const FDCWD = -100; - +/// matches AT_* and AT_STATX_* +pub const At = packed struct(u32) { + _reserved: u8 = 0, /// Do not follow symbolic links - pub const SYMLINK_NOFOLLOW = 0x100; - + symlink_nofollow: bool = false, /// Remove directory instead of unlinking file - pub const REMOVEDIR = 0x200; - + /// Or + /// File handle is needed to compare object identity and may not be usable + /// with open_by_handle_at(2) + removedir_or_handle_fid: bool = false, /// Follow symbolic links. - pub const SYMLINK_FOLLOW = 0x400; - + symlink_follow: bool = false, /// Suppress terminal automount traversal - pub const NO_AUTOMOUNT = 0x800; - + no_automount: bool = false, /// Allow empty relative pathname - pub const EMPTY_PATH = 0x1000; - - /// Type of synchronisation required from statx() - pub const STATX_SYNC_TYPE = 0x6000; - - /// - Do whatever stat() does - pub const STATX_SYNC_AS_STAT = 0x0000; - - /// - Force the attributes to be sync'd with the server - pub const STATX_FORCE_SYNC = 0x2000; + empty_path: bool = false, + /// Force the attributes to be sync'd with the server + statx_force_sync: bool = false, + /// Don't sync attributes with the server + statx_dont_sync: bool = false, + /// Apply to the entire subtree + recursive: bool = false, - /// - Don't sync attributes with the server - pub const STATX_DONT_SYNC = 0x4000; + /// Special value used to indicate openat should use the current working directory + pub const fdcwd = -100; - /// Apply to the entire subtree - pub const RECURSIVE = 0x8000; + // https://github.com/torvalds/linux/blob/d3479214c05dbd07bc56f8823e7bd8719fcd39a9/tools/perf/trace/beauty/fs_at_flags.sh#L15 + /// AT_STATX_SYNC_TYPE is not a bit, it's a mask of + /// AT_STATX_SYNC_AS_STAT, AT_STATX_FORCE_SYNC and AT_STATX_DONT_SYNC + /// Type of synchronisation required from statx() + pub const statx_sync_type = 0x6000; - pub const HANDLE_FID = REMOVEDIR; + /// Do whatever stat() does + /// This is the default and is very much filesystem-specific + pub const statx_sync_as_stat: At = .{}; }; pub const FALLOC = struct { @@ -3656,30 +3656,36 @@ pub const X_OK = 1; pub const W_OK = 2; pub const R_OK = 4; -pub const W = struct { - pub const NOHANG = 1; - pub const UNTRACED = 2; - pub const STOPPED = 2; - pub const EXITED = 4; - pub const CONTINUED = 8; - pub const NOWAIT = 0x1000000; - - pub fn EXITSTATUS(s: u32) u8 { - return @as(u8, @intCast((s & 0xff00) >> 8)); +pub const W = packed struct(u32) { + nohang: bool = false, + untraced_or_stopped: bool = false, + exited: bool = false, + continued: bool = false, + _unused: u20 = 0, + nowait: bool = false, + _unused_1: u7 = 0, + + pub fn EXITSTATUS(s: W) u8 { + return @intCast((@as(u32, @bitCast(s)) & 0xff00) >> 8); } - pub fn TERMSIG(s: u32) u32 { - return s & 0x7f; + + pub fn TERMSIG(s: W) u32 { + return @as(u32, @bitCast(s)) & 0x7f; } - pub fn STOPSIG(s: u32) u32 { + + pub fn STOPSIG(s: W) u32 { return EXITSTATUS(s); } - pub fn IFEXITED(s: u32) bool { + + pub fn IFEXITED(s: W) bool { return TERMSIG(s) == 0; } - pub fn IFSTOPPED(s: u32) bool { - return @as(u16, @truncate(((s & 0xffff) *% 0x10001) >> 8)) > 0x7f00; + + pub fn IFSTOPPED(s: W) bool { + return @as(u16, @truncate(((@as(u32, @bitCast(s)) & 0xffff) *% 0x10001) >> 8)) > 0x7f00; } - pub fn IFSIGNALED(s: u32) bool { - return (s & 0xffff) -% 1 < 0xff; + + pub fn IFSIGNALED(s: W) bool { + return (@as(u32, @bitCast(s)) & 0xffff) -% 1 < 0xff; } }; @@ -3885,16 +3891,44 @@ pub const SHUT = struct { pub const RD = 0; pub const WR = 1; pub const RDWR = 2; }; -pub const SOCK = struct { - pub const STREAM = if (is_mips) 2 else 1; - pub const DGRAM = 
if (is_mips) 1 else 2; - pub const RAW = 3; - pub const RDM = 4; - pub const SEQPACKET = 5; - pub const DCCP = 6; - pub const PACKET = 10; - pub const CLOEXEC = if (is_sparc) 0o20000000 else 0o2000000; - pub const NONBLOCK = if (is_mips) 0o200 else if (is_sparc) 0o40000 else 0o4000; +/// SOCK_* Socket type and flags +pub const Sock = packed struct(u32) { + type: Type, + flags: Flags = .{}, + + /// matches sock_type in kernel + pub const Type = enum(u7) { + stream = if (is_mips) 2 else 1, + dgram = if (is_mips) 1 else 2, + raw = 3, + rdm = 4, + seqpacket = 5, + dccp = 6, + packet = 10, + + _, + }; + + // bit range is (8 - 32] of the u32 + /// Flags for socket, socketpair, accept4 + pub const Flags = if (is_sparc) packed struct(u25) { + _: u7 = 0, // start from u7 since Type comes before Flags + nonblock: bool = false, + _1: u7 = 0, + cloexec: bool = false, + _2: u9 = 0, + } else if (is_mips) packed struct(u25) { + nonblock: bool = false, + _: u11 = 0, + cloexec: bool = false, + _1: u12 = 0, + } else packed struct(u25) { + _: u4 = 0, + nonblock: bool = false, + _1: u7 = 0, + cloexec: bool = false, + _2: u12 = 0, + }; }; pub const TCP = struct { diff --git a/lib/std/os/linux/IoUring.zig b/lib/std/os/linux/IoUring.zig index 0b9488f17469..b948aed8445e 100644 --- a/lib/std/os/linux/IoUring.zig +++ b/lib/std/os/linux/IoUring.zig @@ -492,7 +492,7 @@ pub fn accept( fd: linux.fd_t, addr: ?*posix.sockaddr, addrlen: ?*posix.socklen_t, - flags: u32, + flags: linux.Sock, ) !*Sqe { const sqe = try self.get_sqe(); sqe.prep_accept(fd, addr, addrlen, flags); @@ -514,7 +514,7 @@ pub fn accept_multishot( fd: linux.fd_t, addr: ?*posix.sockaddr, addrlen: ?*posix.socklen_t, - flags: u32, + flags: linux.Sock, ) !*Sqe { const sqe = try self.get_sqe(); sqe.prep_multishot_accept(fd, addr, addrlen, flags); @@ -539,7 +539,7 @@ pub fn accept_direct( fd: linux.fd_t, addr: ?*posix.sockaddr, addrlen: ?*posix.socklen_t, - flags: u32, + flags: linux.Sock, ) !*Sqe { const sqe = try self.get_sqe(); sqe.prep_accept_direct(fd, addr, addrlen, flags, constants.FILE_INDEX_ALLOC); @@ -555,7 +555,7 @@ pub fn accept_multishot_direct( fd: linux.fd_t, addr: ?*posix.sockaddr, addrlen: ?*posix.socklen_t, - flags: u32, + flags: linux.Sock, ) !*Sqe { const sqe = try self.get_sqe(); sqe.prep_multishot_accept_direct(fd, addr, addrlen, flags); @@ -931,7 +931,7 @@ pub fn statx( user_data: u64, fd: linux.fd_t, path: [:0]const u8, - flags: u32, + flags: linux.At, mask: linux.Statx.Mask, buf: *linux.Statx, ) !*Sqe { @@ -969,7 +969,7 @@ pub fn shutdown( self: *IoUring, user_data: u64, sockfd: posix.socket_t, - how: u32, + how: linux.At, ) !*Sqe { const sqe = try self.get_sqe(); sqe.prep_shutdown(sockfd, how); @@ -1001,7 +1001,7 @@ pub fn unlinkat( user_data: u64, dir_fd: linux.fd_t, path: [*:0]const u8, - flags: u32, + flags: linux.At, ) !*Sqe { const sqe = try self.get_sqe(); sqe.prep_unlinkat(dir_fd, path, flags); @@ -1048,7 +1048,7 @@ pub fn linkat( old_path: [*:0]const u8, new_dir_fd: linux.fd_t, new_path: [*:0]const u8, - flags: u32, + flags: linux.At, ) !*Sqe { const sqe = try self.get_sqe(); sqe.prep_linkat(old_dir_fd, old_path, new_dir_fd, new_path, flags); @@ -1329,7 +1329,7 @@ pub fn socket( self: *IoUring, user_data: u64, domain: linux.AF, - socket_type: linux.SOCK, + socket_type: linux.Sock, protocol: u32, flags: u32, ) !*Sqe { @@ -2079,7 +2079,7 @@ test "accept/connect/send/recv" { try testing.expectEqual(Cqe{ .user_data = 0xeeeeeeee, .res = buffer_send.len, - .flags = 0, + .flags = .{}, }, cqe_send); const cqe_recv = try 
ring.copy_cqe(); @@ -2171,7 +2171,7 @@ test "sendmsg/recvmsg" { try testing.expectEqual(Cqe{ .user_data = 0x11111111, .res = buffer_send.len, - .flags = 0, + .flags = .{}, }, cqe_sendmsg); const cqe_recvmsg = try ring.copy_cqe(); @@ -2306,7 +2306,7 @@ test "timeout_remove" { try testing.expectEqual(Cqe{ .user_data = 0x99999999, .res = 0, - .flags = 0, + .flags = .{}, }, cqe); } } @@ -2457,7 +2457,7 @@ test "statx" { try testing.expectEqual(Cqe{ .user_data = 0xaaaaaaaa, .res = 0, - .flags = 0, + .flags = .{}, }, cqe); try testing.expect(buf.mask & linux.STATX_SIZE == linux.STATX_SIZE); @@ -2504,13 +2504,13 @@ test "accept/connect/recv/cancel" { try testing.expectEqual(Cqe{ .user_data = 0xffffffff, .res = -@as(i32, @intFromEnum(linux.E.CANCELED)), - .flags = 0, + .flags = .{}, }, cqe_recv); try testing.expectEqual(Cqe{ .user_data = 0x99999999, .res = 0, - .flags = 0, + .flags = .{}, }, cqe_cancel); } @@ -2645,7 +2645,7 @@ test "shutdown" { try testing.expectEqual(Cqe{ .user_data = 0x445445445, .res = 0, - .flags = 0, + .flags = .{}, }, cqe); } @@ -2715,7 +2715,7 @@ test "renameat" { try testing.expectEqual(Cqe{ .user_data = 0x12121212, .res = 0, - .flags = 0, + .flags = .{}, }, cqe); // Validate that the old file doesn't exist anymore @@ -2768,7 +2768,7 @@ test "unlinkat" { try testing.expectEqual(Cqe{ .user_data = 0x12121212, .res = 0, - .flags = 0, + .flags = .{}, }, cqe); // Validate that the file doesn't exist anymore @@ -2917,7 +2917,7 @@ test "linkat" { try testing.expectEqual(Cqe{ .user_data = 0x12121212, .res = 0, - .flags = 0, + .flags = .{}, }, cqe); // Validate the second file @@ -3365,7 +3365,7 @@ fn createSocketTestHarness(ring: *IoUring) !SocketTestHarness { try testing.expectEqual(Cqe{ .user_data = 0xcccccccc, .res = 0, - .flags = 0, + .flags = .{}, }, cqe_connect); // All good @@ -3811,7 +3811,7 @@ test "waitid" { } var siginfo: posix.siginfo_t = undefined; - _ = try ring.waitid(0, .PID, pid, &siginfo, posix.W.EXITED, 0); + _ = try ring.waitid(0, .PID, pid, &siginfo, .{ .exited = true }, 0); try testing.expectEqual(1, try ring.submit()); @@ -3884,7 +3884,7 @@ test BufferGroup { try testing.expectEqual(1, submitted); const cqe_send = try ring.copy_cqe(); if (cqe_send.err() == .INVAL) return error.SkipZigTest; - try testing.expectEqual(Cqe{ .user_data = 1, .res = data.len, .flags = 0 }, cqe_send); + try testing.expectEqual(Cqe{ .user_data = 1, .res = data.len, .flags = .{} }, cqe_send); } // Server uses buffer group receive @@ -3954,7 +3954,7 @@ test "ring mapped buffers recv" { try testing.expectEqual(@as(u32, 1), try ring.submit()); const cqe_send = try ring.copy_cqe(); if (cqe_send.err() == .INVAL) return error.SkipZigTest; - try testing.expectEqual(Cqe{ .user_data = user_data, .res = data.len, .flags = 0 }, cqe_send); + try testing.expectEqual(Cqe{ .user_data = user_data, .res = data.len, .flags = .{} }, cqe_send); } var pos: usize = 0; @@ -4043,7 +4043,7 @@ test "ring mapped buffers multishot recv" { try testing.expectEqual(@as(u32, 1), try ring.submit()); const cqe_send = try ring.copy_cqe(); if (cqe_send.err() == .INVAL) return error.SkipZigTest; - try testing.expectEqual(Cqe{ .user_data = user_data, .res = data.len, .flags = 0 }, cqe_send); + try testing.expectEqual(Cqe{ .user_data = user_data, .res = data.len, .flags = .{} }, cqe_send); } // start multishot recv @@ -4251,7 +4251,7 @@ test "bind/listen/connect" { const listen_fd = brk: { // Create socket - _ = try ring.socket(1, addr.family, linux.SOCK.STREAM | linux.SOCK.CLOEXEC, proto, 0); + _ = try ring.socket(1, 
addr.any.family, .{ .type = .stream, .flags = .{ .cloexec = true } }, proto, 0); try testing.expectEqual(1, try ring.submit()); var cqe = try ring.copy_cqe(); try testing.expectEqual(1, cqe.user_data); @@ -4653,7 +4653,7 @@ pub const Sqe = extern struct { fd: linux.fd_t, addr: ?*linux.sockaddr, addrlen: ?*linux.socklen_t, - flags: linux.SOCK, + flags: linux.Sock, ) void { // `addr` holds a pointer to `sockaddr`, and `addr2` holds a pointer to `socklen_t`. // `addr2` maps to `sqe.off` (u64) instead of `sqe.len` (which is only a u32). @@ -4667,7 +4667,7 @@ pub const Sqe = extern struct { fd: linux.fd_t, addr: ?*linux.sockaddr, addrlen: ?*linux.socklen_t, - flags: linux.SOCK, + flags: linux.Sock, file_index: u32, ) void { prep_accept(sqe, fd, addr, addrlen, flags); @@ -4679,7 +4679,7 @@ pub const Sqe = extern struct { fd: linux.fd_t, addr: ?*linux.sockaddr, addrlen: ?*linux.socklen_t, - flags: linux.SOCK, + flags: linux.Sock, ) void { prep_accept(sqe, fd, addr, addrlen, flags); sqe.ioprio = .{ .accept = .{ .MULTISHOT = true } }; @@ -4691,7 +4691,7 @@ pub const Sqe = extern struct { fd: linux.fd_t, addr: ?*linux.sockaddr, addrlen: ?*linux.socklen_t, - flags: linux.SOCK, + flags: linux.Sock, ) void { prep_multishot_accept(sqe, fd, addr, addrlen, flags); set_target_fixed_file(sqe, constants.FILE_INDEX_ALLOC); @@ -4994,9 +4994,9 @@ pub const Sqe = extern struct { pub fn prep_shutdown( sqe: *Sqe, sockfd: linux.socket_t, - how: linux.SHUT, + how: linux.Shut, ) void { - sqe.prep_rw(.SHUTDOWN, sockfd, 0, how, 0); + sqe.prep_rw(.SHUTDOWN, sockfd, 0, @intFromEnum(how), 0); } pub fn prep_renameat( @@ -5015,7 +5015,7 @@ pub const Sqe = extern struct { @intFromPtr(new_path), ); sqe.len = @bitCast(new_dir_fd); - sqe.rw_flags = flags; + sqe.rw_flags = @bitCast(flags); } pub fn prep_unlinkat( @@ -5058,7 +5058,7 @@ pub const Sqe = extern struct { old_path: [*:0]const u8, new_dir_fd: linux.fd_t, new_path: [*:0]const u8, - flags: linux.AT, // only AT_EMPTY_PATH, AT_SYMLINK_FOLLOW + flags: linux.At, // only AT_EMPTY_PATH, AT_SYMLINK_FOLLOW ) void { sqe.prep_rw( .LINKAT, From 18bcf8ebfb04942706e771aa9f68cf4df9aa4f2e Mon Sep 17 00:00:00 2001 From: Bernard Assan Date: Thu, 2 Oct 2025 17:22:55 +0000 Subject: [PATCH 07/41] update some syscall APIs to use the new flags Add Shut, PF, AF, IPPROTO types Update IoUring and tests to use some of these flags TODO: avoid breaking API and make transitioning smooth Instead mark the previous API as deprecated for at least one Zig release while using the new API in the Zig codebase Signed-off-by: Bernard Assan --- lib/std/fs.zig | 2 +- lib/std/os/linux.zig | 301 ++++++++++++++++------------------- lib/std/os/linux/IoUring.zig | 49 +++--- lib/std/os/linux/test.zig | 4 +- 4 files changed, 169 insertions(+), 187 deletions(-) diff --git a/lib/std/fs.zig b/lib/std/fs.zig index 26ce27e2b393..d07901a7710b 100644 --- a/lib/std/fs.zig +++ b/lib/std/fs.zig @@ -187,7 +187,7 @@ pub fn cwd() Dir { } else if (native_os == .wasi) { return .{ .fd = std.options.wasiCwd() }; } else { - return .{ .fd = posix.AT.FDCWD }; + return .{ .fd = posix.AT.fdcwd }; } } diff --git a/lib/std/os/linux.zig b/lib/std/os/linux.zig index 01717bd08b23..1e94d0213892 100644 --- a/lib/std/os/linux.zig +++ b/lib/std/os/linux.zig @@ -921,7 +921,7 @@ pub fn readlink(noalias path: [*:0]const u8, noalias buf_ptr: [*]u8, buf_len: us if (@hasField(SYS, "readlink")) { return syscall3(.readlink, @intFromPtr(path), @intFromPtr(buf_ptr), buf_len); } else { - return syscall4(.readlinkat, @as(usize, @bitCast(@as(isize, AT.FDCWD))), 
@intFromPtr(path), @intFromPtr(buf_ptr), buf_len); + return syscall4(.readlinkat, @as(usize, @bitCast(@as(isize, At.fdcwd))), @intFromPtr(path), @intFromPtr(buf_ptr), buf_len); } } @@ -933,7 +933,7 @@ pub fn mkdir(path: [*:0]const u8, mode: mode_t) usize { if (@hasField(SYS, "mkdir")) { return syscall2(.mkdir, @intFromPtr(path), mode); } else { - return syscall3(.mkdirat, @as(usize, @bitCast(@as(isize, AT.FDCWD))), @intFromPtr(path), mode); + return syscall3(.mkdirat, @as(usize, @bitCast(@as(isize, At.fdcwd))), @intFromPtr(path), mode); } } @@ -945,7 +945,7 @@ pub fn mknod(path: [*:0]const u8, mode: u32, dev: u32) usize { if (@hasField(SYS, "mknod")) { return syscall3(.mknod, @intFromPtr(path), mode, dev); } else { - return mknodat(AT.FDCWD, path, mode, dev); + return mknodat(At.fdcwd, path, mode, dev); } } @@ -1178,7 +1178,7 @@ pub fn rmdir(path: [*:0]const u8) usize { if (@hasField(SYS, "rmdir")) { return syscall1(.rmdir, @intFromPtr(path)); } else { - return syscall3(.unlinkat, @as(usize, @bitCast(@as(isize, AT.FDCWD))), @intFromPtr(path), AT.REMOVEDIR); + return syscall3(.unlinkat, @as(usize, @bitCast(@as(isize, At.fdcwd))), @intFromPtr(path), @as(u32, @bitCast(At{ .removedir_or_handle_fid = .{ .removedir = true } }))); } } @@ -1186,7 +1186,7 @@ pub fn symlink(existing: [*:0]const u8, new: [*:0]const u8) usize { if (@hasField(SYS, "symlink")) { return syscall2(.symlink, @intFromPtr(existing), @intFromPtr(new)); } else { - return syscall3(.symlinkat, @intFromPtr(existing), @as(usize, @bitCast(@as(isize, AT.FDCWD))), @intFromPtr(new)); + return syscall3(.symlinkat, @intFromPtr(existing), @as(usize, @bitCast(@as(isize, At.fdcwd))), @intFromPtr(new)); } } @@ -1237,7 +1237,7 @@ pub fn access(path: [*:0]const u8, mode: u32) usize { if (@hasField(SYS, "access")) { return syscall2(.access, @intFromPtr(path), mode); } else { - return faccessat(AT.FDCWD, path, mode, 0); + return faccessat(At.fdcwd, path, mode, 0); } } @@ -1338,9 +1338,9 @@ pub fn rename(old: [*:0]const u8, new: [*:0]const u8) usize { if (@hasField(SYS, "rename")) { return syscall2(.rename, @intFromPtr(old), @intFromPtr(new)); } else if (@hasField(SYS, "renameat")) { - return syscall4(.renameat, @as(usize, @bitCast(@as(isize, AT.FDCWD))), @intFromPtr(old), @as(usize, @bitCast(@as(isize, AT.FDCWD))), @intFromPtr(new)); + return syscall4(.renameat, @as(usize, @bitCast(@as(isize, At.fdcwd))), @intFromPtr(old), @as(usize, @bitCast(@as(isize, At.fdcwd))), @intFromPtr(new)); } else { - return syscall5(.renameat2, @as(usize, @bitCast(@as(isize, AT.FDCWD))), @intFromPtr(old), @as(usize, @bitCast(@as(isize, AT.FDCWD))), @intFromPtr(new), 0); + return syscall5(.renameat2, @as(usize, @bitCast(@as(isize, At.fdcwd))), @intFromPtr(old), @as(usize, @bitCast(@as(isize, At.fdcwd))), @intFromPtr(new), 0); } } @@ -1382,7 +1382,7 @@ pub fn open(path: [*:0]const u8, flags: O, perm: mode_t) usize { } else { return syscall4( .openat, - @bitCast(@as(isize, AT.FDCWD)), + @bitCast(@as(isize, At.fdcwd)), @intFromPtr(path), @as(u32, @bitCast(flags)), perm, @@ -1395,7 +1395,7 @@ pub fn create(path: [*:0]const u8, perm: mode_t) usize { } pub fn openat(dirfd: i32, path: [*:0]const u8, flags: O, mode: mode_t) usize { - // dirfd could be negative, for example AT.FDCWD is -100 + // dirfd could be negative, for example At.fdcwd is -100 return syscall4(.openat, @bitCast(@as(isize, dirfd)), @intFromPtr(path), @as(u32, @bitCast(flags)), mode); } @@ -1421,7 +1421,7 @@ pub fn chmod(path: [*:0]const u8, mode: mode_t) usize { if (@hasField(SYS, "chmod")) { return 
syscall2(.chmod, @intFromPtr(path), mode); } else { - return fchmodat(AT.FDCWD, path, mode, 0); + return fchmodat(At.fdcwd, path, mode, 0); } } @@ -1553,9 +1553,9 @@ pub fn link(oldpath: [*:0]const u8, newpath: [*:0]const u8) usize { } else { return syscall5( .linkat, - @as(usize, @bitCast(@as(isize, AT.FDCWD))), + @as(usize, @bitCast(@as(isize, At.fdcwd))), @intFromPtr(oldpath), - @as(usize, @bitCast(@as(isize, AT.FDCWD))), + @as(usize, @bitCast(@as(isize, At.fdcwd))), @intFromPtr(newpath), 0, ); @@ -1577,7 +1577,7 @@ pub fn unlink(path: [*:0]const u8) usize { if (@hasField(SYS, "unlink")) { return syscall1(.unlink, @intFromPtr(path)); } else { - return syscall3(.unlinkat, @as(usize, @bitCast(@as(isize, AT.FDCWD))), @intFromPtr(path), 0); + return syscall3(.unlinkat, @as(usize, @bitCast(@as(isize, At.fdcwd))), @intFromPtr(path), 0); } } @@ -2237,24 +2237,24 @@ pub fn lstat(pathname: [*:0]const u8, statbuf: *Stat) usize { } } -pub fn fstatat(dirfd: i32, path: [*:0]const u8, stat_buf: *Stat, flags: u32) usize { +pub fn fstatat(dirfd: i32, path: [*:0]const u8, stat_buf: *Stat, flags: At) usize { if (native_arch == .riscv32 or native_arch.isLoongArch()) { // riscv32 and loongarch have made the interesting decision to not implement some of // the older stat syscalls, including this one. @compileError("No fstatat syscall on this architecture."); } else if (@hasField(SYS, "fstatat64")) { - return syscall4(.fstatat64, @as(usize, @bitCast(@as(isize, dirfd))), @intFromPtr(path), @intFromPtr(stat_buf), flags); + return syscall4(.fstatat64, @as(usize, @bitCast(@as(isize, dirfd))), @intFromPtr(path), @intFromPtr(stat_buf), @as(u32, @bitCast(flags))); } else { - return syscall4(.fstatat, @as(usize, @bitCast(@as(isize, dirfd))), @intFromPtr(path), @intFromPtr(stat_buf), flags); + return syscall4(.fstatat, @as(usize, @bitCast(@as(isize, dirfd))), @intFromPtr(path), @intFromPtr(stat_buf), @as(u32, @bitCast(flags))); } } -pub fn statx(dirfd: i32, path: [*:0]const u8, flags: u32, mask: Statx.Mask, statx_buf: *Statx) usize { +pub fn statx(dirfd: i32, path: [*:0]const u8, flags: At, mask: Statx.Mask, statx_buf: *Statx) usize { return syscall5( .statx, @as(usize, @bitCast(@as(isize, dirfd))), @intFromPtr(path), - flags, + @intCast(@as(u32, @bitCast(flags))), @intCast(@as(u32, @bitCast(mask))), @intFromPtr(statx_buf), ); @@ -3476,6 +3476,7 @@ pub const STDIN_FILENO = 0; pub const STDOUT_FILENO = 1; pub const STDERR_FILENO = 2; +pub const AT = At; /// matches AT_* and AT_STATX_* pub const At = packed struct(u32) { _reserved: u8 = 0, @@ -3485,7 +3486,10 @@ pub const At = packed struct(u32) { /// Or /// File handle is needed to compare object identity and may not be usable /// with open_by_handle_at(2) - removedir_or_handle_fid: bool = false, + removedir_or_handle_fid: packed union { + removedir: bool, + handle_fid: bool, + } = @bitCast(false), /// Follow symbolic links. 
symlink_follow: bool = false, /// Suppress terminal automount traversal @@ -3498,6 +3502,7 @@ statx_dont_sync: bool = false, /// Apply to the entire subtree recursive: bool = false, + _reserved_1: u16 = 0, /// Special value used to indicate openat should use the current working directory pub const fdcwd = -100; @@ -3663,7 +3663,10 @@ pub const R_OK = 4; pub const W = packed struct(u32) { nohang: bool = false, - untraced_or_stopped: bool = false, + untraced_or_stopped: packed union { + untraced: bool, + stopped: bool, + } = @bitCast(false), exited: bool = false, continued: bool = false, _unused: u20 = 0, @@ -3893,12 +3893,21 @@ pub const SEEK = struct { pub const END = 2; }; -pub const SHUT = struct { - pub const RD = 0; - pub const WR = 1; - pub const RDWR = 2; +pub const SHUT = Shut; +/// enum sock_shutdown_cmd - Shutdown types +/// matches SHUT_* in kernel +pub const Shut = enum(u32) { + /// SHUT_RD: shutdown receptions + rd = 0, + /// SHUT_WR: shutdown transmissions + wr = 1, + /// SHUT_RDWR: shutdown receptions/transmissions + rdwr = 2, + + _, }; +pub const SOCK = Sock; /// SOCK_* Socket type and flags pub const Sock = packed struct(u32) { type: Type, @@ -4048,110 +4048,65 @@ pub const UDP_ENCAP = struct { pub const RXRPC = 6; }; -pub const PF = struct { - pub const UNSPEC = 0; - pub const LOCAL = 1; - pub const UNIX = LOCAL; - pub const FILE = LOCAL; - pub const INET = 2; - pub const AX25 = 3; - pub const IPX = 4; - pub const APPLETALK = 5; - pub const NETROM = 6; - pub const BRIDGE = 7; - pub const ATMPVC = 8; - pub const X25 = 9; - pub const INET6 = 10; - pub const ROSE = 11; - pub const DECnet = 12; - pub const NETBEUI = 13; - pub const SECURITY = 14; - pub const KEY = 15; - pub const NETLINK = 16; - pub const ROUTE = PF.NETLINK; - pub const PACKET = 17; - pub const ASH = 18; - pub const ECONET = 19; - pub const ATMSVC = 20; - pub const RDS = 21; - pub const SNA = 22; - pub const IRDA = 23; - pub const PPPOX = 24; - pub const WANPIPE = 25; - pub const LLC = 26; - pub const IB = 27; - pub const MPLS = 28; - pub const CAN = 29; - pub const TIPC = 30; - pub const BLUETOOTH = 31; - pub const IUCV = 32; - pub const RXRPC = 33; - pub const ISDN = 34; - pub const PHONET = 35; - pub const IEEE802154 = 36; - pub const CAIF = 37; - pub const ALG = 38; - pub const NFC = 39; - pub const VSOCK = 40; - pub const KCM = 41; - pub const QIPCRTR = 42; - pub const SMC = 43; - pub const XDP = 44; - pub const MAX = 45; }; - -pub const AF = struct { - pub const UNSPEC = PF.UNSPEC; - pub const LOCAL = PF.LOCAL; - pub const UNIX = AF.LOCAL; - pub const FILE = AF.LOCAL; - pub const INET = PF.INET; - pub const AX25 = PF.AX25; - pub const IPX = PF.IPX; - pub const APPLETALK = PF.APPLETALK; - pub const NETROM = PF.NETROM; - pub const BRIDGE = PF.BRIDGE; - pub const ATMPVC = PF.ATMPVC; - pub const X25 = PF.X25; - pub const INET6 = PF.INET6; - pub const ROSE = PF.ROSE; - pub const DECnet = PF.DECnet; - pub const NETBEUI = PF.NETBEUI; - pub const SECURITY = PF.SECURITY; - pub const KEY = PF.KEY; - pub const NETLINK = PF.NETLINK; - pub const ROUTE = PF.ROUTE; - pub const PACKET = PF.PACKET; - pub const ASH = PF.ASH; - pub const ECONET = PF.ECONET; - pub const ATMSVC = PF.ATMSVC; - pub const RDS = PF.RDS; - pub const SNA = PF.SNA; - pub const IRDA = PF.IRDA; - pub const PPPOX = PF.PPPOX; - pub const WANPIPE = PF.WANPIPE; - pub const LLC = PF.LLC; - pub const IB = PF.IB; - pub const MPLS = PF.MPLS; - pub const CAN = PF.CAN; - pub const TIPC = PF.TIPC; - pub const BLUETOOTH = 
PF.BLUETOOTH; - pub const IUCV = PF.IUCV; - pub const RXRPC = PF.RXRPC; - pub const ISDN = PF.ISDN; - pub const PHONET = PF.PHONET; - pub const IEEE802154 = PF.IEEE802154; - pub const CAIF = PF.CAIF; - pub const ALG = PF.ALG; - pub const NFC = PF.NFC; - pub const VSOCK = PF.VSOCK; - pub const KCM = PF.KCM; - pub const QIPCRTR = PF.QIPCRTR; - pub const SMC = PF.SMC; - pub const XDP = PF.XDP; - pub const MAX = PF.MAX; +/// Address Family +pub const AF = enum(u16) { + unspec = 0, + unix = 1, + inet = 2, + ax25 = 3, + ipx = 4, + appletalk = 5, + netrom = 6, + bridge = 7, + atmpvc = 8, + x25 = 9, + inet6 = 10, + rose = 11, + decnet = 12, + netbeui = 13, + security = 14, + key = 15, + route = 16, + packet = 17, + ash = 18, + econet = 19, + atmsvc = 20, + rds = 21, + sna = 22, + irda = 23, + pppox = 24, + wanpipe = 25, + llc = 26, + ib = 27, + mpls = 28, + can = 29, + tipc = 30, + bluetooth = 31, + iucv = 32, + rxrpc = 33, + isdn = 34, + phonet = 35, + ieee802154 = 36, + caif = 37, + alg = 38, + nfc = 39, + vsock = 40, + kcm = 41, + qipcrtr = 42, + smc = 43, + xdp = 44, + max = 45, + _, + + // Aliases + pub const local = AF.unix; + pub const file = AF.unix; + pub const netlink = AF.route; }; +/// Protocol Family (same values as Address Family) +pub const PF = AF; + pub const SO = if (is_mips) struct { pub const DEBUG = 1; pub const REUSEADDR = 0x0004; @@ -5891,7 +5863,7 @@ pub const signalfd_siginfo = extern struct { }; pub const in_port_t = u16; -pub const sa_family_t = u16; +pub const sa_family_t = AF; pub const socklen_t = u32; pub const sockaddr = extern struct { @@ -7164,40 +7136,45 @@ pub const AI = packed struct(u32) { pub const IPPORT_RESERVED = 1024; -pub const IPPROTO = struct { - pub const IP = 0; - pub const HOPOPTS = 0; - pub const ICMP = 1; - pub const IGMP = 2; - pub const IPIP = 4; - pub const TCP = 6; - pub const EGP = 8; - pub const PUP = 12; - pub const UDP = 17; - pub const IDP = 22; - pub const TP = 29; - pub const DCCP = 33; - pub const IPV6 = 41; - pub const ROUTING = 43; - pub const FRAGMENT = 44; - pub const RSVP = 46; - pub const GRE = 47; - pub const ESP = 50; - pub const AH = 51; - pub const ICMPV6 = 58; - pub const NONE = 59; - pub const DSTOPTS = 60; - pub const MTP = 92; - pub const BEETPH = 94; - pub const ENCAP = 98; - pub const PIM = 103; - pub const COMP = 108; - pub const SCTP = 132; - pub const MH = 135; - pub const UDPLITE = 136; - pub const MPLS = 137; - pub const RAW = 255; - pub const MAX = 256; +/// IP Protocol numbers +pub const IpProto = enum(u16) { + ip = 0, + icmp = 1, + igmp = 2, + ipip = 4, + tcp = 6, + egp = 8, + pup = 12, + udp = 17, + idp = 22, + tp = 29, + dccp = 33, + ipv6 = 41, + routing = 43, + fragment = 44, + rsvp = 46, + gre = 47, + esp = 50, + ah = 51, + icmpv6 = 58, + none = 59, + dstopts = 60, + mtp = 92, + beetph = 94, + encap = 98, + pim = 103, + comp = 108, + sctp = 132, + mh = 135, + udplite = 136, + mpls = 137, + raw = 255, + max = 256, + _, + + // Aliases + pub const hopopts = IpProto.ip; + pub const default = IpProto.ip; }; pub const tcp_repair_opt = extern struct { diff --git a/lib/std/os/linux/IoUring.zig b/lib/std/os/linux/IoUring.zig index b948aed8445e..7ca59b1f331e 100644 --- a/lib/std/os/linux/IoUring.zig +++ b/lib/std/os/linux/IoUring.zig @@ -969,7 +969,7 @@ pub fn shutdown( self: *IoUring, user_data: u64, sockfd: posix.socket_t, - how: linux.At, + how: linux.Shut, ) !*Sqe { const sqe = try self.get_sqe(); sqe.prep_shutdown(sockfd, how); @@ -1330,7 +1330,8 @@ pub fn socket( user_data: u64, domain: linux.AF, socket_type: 
linux.Sock, - protocol: u32, + protocol: linux.IpProto, + /// flags is unused flags: u32, ) !*Sqe { const sqe = try self.get_sqe(); @@ -1344,9 +1345,10 @@ pub fn socket( pub fn socket_direct( self: *IoUring, user_data: u64, - domain: u32, - socket_type: u32, - protocol: u32, + domain: linux.AF, + socket_type: linux.Sock, + protocol: linux.IpProto, + /// flags is unused flags: u32, file_index: u32, ) !*Sqe { @@ -2432,7 +2434,7 @@ test "statx" { 0xaaaaaaaa, tmp.dir.fd, path, - 0, + .{}, .{ .size = true }, &buf, ); @@ -2460,7 +2462,7 @@ test "statx" { .flags = .{}, }, cqe); - try testing.expect(buf.mask & linux.STATX_SIZE == linux.STATX_SIZE); + try testing.expect(buf.mask.size); try testing.expectEqual(@as(u64, 6), buf.size); } @@ -2618,7 +2620,8 @@ test "shutdown" { // Socket bound, expect shutdown to work { - const server = try posix.socket(address.family, posix.SOCK.STREAM | posix.SOCK.CLOEXEC, 0); + // TODO: update posix later to use Typed Flags + const server = try posix.socket(address.any.family, @as(u32, @bitCast(linux.Sock{ .type = .stream, .flags = .{ .cloexec = true } })), 0); defer posix.close(server); try posix.setsockopt(server, posix.SOL.SOCKET, posix.SO.REUSEADDR, &mem.toBytes(@as(c_int, 1))); try posix.bind(server, addrAny(&address), @sizeOf(linux.sockaddr.in)); @@ -2628,7 +2631,7 @@ test "shutdown" { var slen: posix.socklen_t = @sizeOf(linux.sockaddr.in); try posix.getsockname(server, addrAny(&address), &slen); - const shutdown_sqe = try ring.shutdown(0x445445445, server, linux.SHUT.RD); + const shutdown_sqe = try ring.shutdown(0x445445445, server, .rd); try testing.expectEqual(Op.SHUTDOWN, shutdown_sqe.opcode); try testing.expectEqual(@as(i32, server), shutdown_sqe.fd); @@ -2752,7 +2755,7 @@ test "unlinkat" { 0x12121212, tmp.dir.fd, path, - 0, + .{}, ); try testing.expectEqual(Op.UNLINKAT, sqe.opcode); try testing.expectEqual(@as(i32, tmp.dir.fd), sqe.fd); @@ -2900,7 +2903,7 @@ test "linkat" { first_path, tmp.dir.fd, second_path, - 0, + .{}, ); try testing.expectEqual(Op.LINKAT, sqe.opcode); try testing.expectEqual(@as(i32, tmp.dir.fd), sqe.fd); @@ -3651,7 +3654,7 @@ test "socket" { defer ring.deinit(); // prepare, submit socket operation - _ = try ring.socket(0, linux.AF.INET, posix.SOCK.STREAM, 0, 0); + _ = try ring.socket(0, linux.AF.INET, .{ .type = .stream }, 0, 0); try testing.expectEqual(@as(u32, 1), try ring.submit()); // test completion @@ -3677,7 +3680,7 @@ test "socket_direct/socket_direct_alloc/close_direct" { try ring.register_files(registered_fds[0..]); // create socket in registered file descriptor at index 0 (last param) - _ = try ring.socket_direct(0, linux.AF.INET, posix.SOCK.STREAM, 0, 0, 0); + _ = try ring.socket_direct(0, .inet, .{ .type = .stream }, .default, 0, 0); try testing.expectEqual(@as(u32, 1), try ring.submit()); var cqe_socket = try ring.copy_cqe(); try testing.expectEqual(posix.E.SUCCESS, cqe_socket.err()); @@ -4960,12 +4963,12 @@ pub const Sqe = extern struct { sqe: *Sqe, fd: linux.fd_t, path: [*:0]const u8, - flags: linux.AT, + flags: linux.At, mask: linux.Statx.Mask, buf: *linux.Statx, ) void { - sqe.prep_rw(.STATX, fd, @intFromPtr(path), @bitCast(mask), @intFromPtr(buf)); - sqe.rw_flags = flags; + sqe.prep_rw(.STATX, fd, @intFromPtr(path), @as(u32, @bitCast(mask)), @intFromPtr(buf)); + sqe.rw_flags = @bitCast(flags); } pub fn prep_cancel( @@ -5022,10 +5025,10 @@ pub const Sqe = extern struct { sqe: *Sqe, dir_fd: linux.fd_t, path: [*:0]const u8, - flags: linux.AT, // TODO: unlink flags only AT_REMOVEDIR + flags: linux.At, // TODO: unlink 
flags only AT_REMOVEDIR ) void { sqe.prep_rw(.UNLINKAT, dir_fd, @intFromPtr(path), 0, 0); - sqe.rw_flags = flags; + sqe.rw_flags = @bitCast(flags); } pub fn prep_mkdirat( @@ -5068,7 +5071,7 @@ pub const Sqe = extern struct { @intFromPtr(new_path), ); sqe.len = @bitCast(new_dir_fd); - sqe.rw_flags = flags; + sqe.rw_flags = @bitCast(flags); } pub fn prep_files_update( @@ -5114,7 +5117,8 @@ pub const Sqe = extern struct { domain: linux.AF, socket_type: linux.SOCK, protocol: u32, // Enumerate https://github.com/kraj/musl/blob/kraj/master/src/network/proto.c#L7 - flags: u32, // flags is unused + /// flags is unused + flags: u32, ) void { sqe.prep_rw(.SOCKET, @intCast(domain), 0, protocol, socket_type); sqe.rw_flags = flags; @@ -5123,9 +5127,10 @@ pub const Sqe = extern struct { pub fn prep_socket_direct( sqe: *Sqe, domain: linux.AF, - socket_type: linux.SOCK, + socket_type: linux.Sock, protocol: u32, // Enumerate https://github.com/kraj/musl/blob/kraj/master/src/network/proto.c#L7 - flags: u32, // flags is unused + /// flags is unused + flags: u32, file_index: u32, ) void { prep_socket(sqe, domain, socket_type, protocol, flags); diff --git a/lib/std/os/linux/test.zig b/lib/std/os/linux/test.zig index 3290e0466691..4676d4899b33 100644 --- a/lib/std/os/linux/test.zig +++ b/lib/std/os/linux/test.zig @@ -85,7 +85,7 @@ test "statx" { defer file.close(); var statx_buf: linux.Statx = undefined; - switch (linux.errno(linux.statx(file.handle, "", linux.AT.EMPTY_PATH, linux.Statx.Mask.basic_stats, &statx_buf))) { + switch (linux.errno(linux.statx(file.handle, "", .{ .empty_path = true }, linux.Statx.Mask.basic_stats, &statx_buf))) { .SUCCESS => {}, else => unreachable, } @@ -93,7 +93,7 @@ test "statx" { if (builtin.cpu.arch == .riscv32 or builtin.cpu.arch.isLoongArch()) return error.SkipZigTest; // No fstatat, so the rest of the test is meaningless. 
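For contrast with the call sites being migrated in this hunk, a statx round trip written directly against the typed flags looks roughly like this (a sketch only; `linux.At`, `linux.Statx.Mask`, and `linux.errno` as used elsewhere in this series, with an illustrative test name):

    const std = @import("std");
    const builtin = @import("builtin");
    const linux = std.os.linux;

    test "statx on the cwd with typed At flags" {
        if (builtin.os.tag != .linux) return error.SkipZigTest;
        var buf: linux.Statx = undefined;
        // `.{ .empty_path = true }` is the typed spelling of the old
        // AT.EMPTY_PATH (0x1000); with an empty path, statx then operates
        // on the dirfd itself, here the current working directory.
        const rc = linux.statx(linux.At.fdcwd, "", .{ .empty_path = true }, .{ .size = true }, &buf);
        try std.testing.expectEqual(linux.E.SUCCESS, linux.errno(rc));
        try std.testing.expect(buf.mask.size);
    }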
var stat_buf: linux.Stat = undefined; - switch (linux.errno(linux.fstatat(file.handle, "", &stat_buf, linux.AT.EMPTY_PATH))) { + switch (linux.errno(linux.fstatat(file.handle, "", &stat_buf, .{ .empty_path = true }))) { .SUCCESS => {}, else => unreachable, } From 25850b9e448c685ebaf8c595a272c44638964614 Mon Sep 17 00:00:00 2001 From: Bernard Assan Date: Sun, 5 Oct 2025 19:26:04 +0000 Subject: [PATCH 08/41] Restore deprecated constants using new Flag types This allows for a gradual transition to the new typed flags Signed-off-by: Bernard Assan --- lib/std/os/linux.zig | 221 ++++++++++++++++++++++++++++++++--- lib/std/os/linux/IoUring.zig | 34 +++--- 2 files changed, 222 insertions(+), 33 deletions(-) diff --git a/lib/std/os/linux.zig b/lib/std/os/linux.zig index 1e94d0213892..7a802e806569 100644 --- a/lib/std/os/linux.zig +++ b/lib/std/os/linux.zig @@ -3476,6 +3476,7 @@ pub const STDIN_FILENO = 0; pub const STDOUT_FILENO = 1; pub const STDERR_FILENO = 2; +/// Deprecated alias to At pub const AT = At; /// matches AT_* and AT_STATX_* pub const At = packed struct(u32) { @@ -3516,6 +3517,44 @@ pub const At = packed struct(u32) { /// Do whatever stat() does /// This is the default and is very much filesystem-specific pub const statx_sync_as_stat: At = .{}; + + // DEPRECATED ALIASES + // + // + /// Special value used to indicate openat should use the current working directory + pub const FDCWD = fdcwd; + + /// Do not follow symbolic links + pub const SYMLINK_NOFOLLOW: u32 = @bitCast(At{ .symlink_nofollow = true }); + + /// Remove directory instead of unlinking file + pub const REMOVEDIR: u32 = @bitCast(At{ .removedir_or_handle_fid = .{ .removedir = true } }); + + pub const HANDLE_FID = At{ .removedir_or_handle_fid = .{ .handle_fid = true } }; + 
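Because each deprecated constant here is defined as a @bitCast of the new packed struct, the old and new spellings are guaranteed to stay bit-identical during the transition. A quick sketch of that equivalence (a hypothetical test, using the aliases defined in this block):

    const std = @import("std");
    const linux = std.os.linux;

    test "deprecated AT constants mirror the typed At flags" {
        // AT_SYMLINK_NOFOLLOW has always been 0x100; the typed struct places
        // symlink_nofollow at bit 8, so both spellings produce the same mask.
        try std.testing.expectEqual(@as(u32, 0x100), linux.AT.SYMLINK_NOFOLLOW);
        try std.testing.expectEqual(linux.AT.SYMLINK_NOFOLLOW, @as(u32, @bitCast(linux.At{ .symlink_nofollow = true })));
    }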
+    /// Follow symbolic links.
+    pub const SYMLINK_FOLLOW: u32 = @bitCast(At{ .symlink_follow = true });
+
+    /// Suppress terminal automount traversal
+    pub const NO_AUTOMOUNT: u32 = @bitCast(At{ .no_automount = true });
+
+    /// Allow empty relative pathname
+    pub const EMPTY_PATH: u32 = @bitCast(At{ .empty_path = true });
+
+    /// Type of synchronisation required from statx()
+    pub const STATX_SYNC_TYPE: u32 = @bitCast(statx_sync_type);
+
+    /// - Do whatever stat() does
+    pub const STATX_SYNC_AS_STAT: u32 = @bitCast(statx_sync_as_stat);
+
+    /// - Force the attributes to be sync'd with the server
+    pub const STATX_FORCE_SYNC: u32 = @bitCast(At{ .statx_force_sync = true });
+
+    /// - Don't sync attributes with the server
+    pub const STATX_DONT_SYNC: u32 = @bitCast(At{ .statx_dont_sync = true });
+
+    /// Apply to the entire subtree
+    pub const RECURSIVE: u32 = @bitCast(At{ .recursive = true });
 };

 pub const FALLOC = struct {
@@ -3673,6 +3712,14 @@ pub const W = packed struct(u32) {
     nowait: bool = false,
     _unused_1: u7 = 0,

+    // Deprecated aliases
+    pub const NOHANG: u32 = @bitCast(W{ .nohang = true });
+    pub const UNTRACED: u32 = @bitCast(W{ .untraced_or_stopped = .{ .untraced = true } });
+    pub const STOPPED: u32 = @bitCast(W{ .untraced_or_stopped = .{ .stopped = true } });
+    pub const EXITED: u32 = @bitCast(W{ .exited = true });
+    pub const CONTINUED: u32 = @bitCast(W{ .continued = true });
+    pub const NOWAIT: u32 = @bitCast(W{ .nowait = true });
+
     pub fn EXITSTATUS(s: W) u8 {
         return @intCast((@as(u32, @bitCast(s)) & 0xff00) >> 8);
     }
@@ -3893,6 +3940,7 @@ pub const SEEK = struct {
     pub const END = 2;
 };

+/// Deprecated alias to Shut
 pub const SHUT = Shut;
 /// enum sock_shutdown_cmd - Shutdown types
 /// matches SHUT_* in kernel
@@ -3905,16 +3953,22 @@ pub const Shut = enum(u32) {
     rdwr = 2,

     _,
+
+    // deprecated constants of the fields
+    pub const RD: u32 = @intFromEnum(Shut.rd);
+    pub const WR: u32 = @intFromEnum(Shut.wr);
+    pub const RDWR: u32 = @intFromEnum(Shut.rdwr);
 };

 pub const SOCK = Sock;
 /// SOCK_* Socket type and flags
 pub const Sock = packed struct(u32) {
-    type: Type,
+    type: Type = .default,
     flags: Flags = .{},

     /// matches sock_type in kernel
     pub const Type = enum(u7) {
+        default = 0,
         stream = if (is_mips) 2 else 1,
         dgram = if (is_mips) 1 else 2,
         raw = 3,
@@ -3946,6 +4000,17 @@ pub const Sock = packed struct(u32) {
         cloexec: bool = false,
         _2: u12 = 0,
     };
+
+    // Deprecated aliases for SOCK
+    pub const STREAM: u32 = @intFromEnum(Type.stream);
+    pub const DGRAM: u32 = @intFromEnum(Type.dgram);
+    pub const RAW: u32 = @intFromEnum(Type.raw);
+    pub const RDM: u32 = @intFromEnum(Type.rdm);
+    pub const SEQPACKET: u32 = @intFromEnum(Type.seqpacket);
+    pub const DCCP: u32 = @intFromEnum(Type.dccp);
+    pub const PACKET: u32 = @intFromEnum(Type.packet);
+    pub const CLOEXEC: u32 = (@as(u25, @bitCast(Flags{ .cloexec = true })) << 7);
+    pub const NONBLOCK: u32 = (@as(u25, @bitCast(Flags{ .nonblock = true })) << 7);
 };

 pub const TCP = struct {
@@ -4048,8 +4113,13 @@ pub const UDP_ENCAP = struct {
     pub const RXRPC = 6;
 };

+// Deprecated Alias
+pub const AF = Af;
+pub const PF = Af;
+/// Protocol Family (same values as Address Family)
+pub const Pf = Af;
 /// Address Family
-pub const AF = enum(u16) {
+pub const Af = enum(u16) {
     unspec = 0,
     unix = 1,
     inet = 2,
@@ -4099,14 +4169,62 @@
     _,

     // Aliases
-    pub const local = AF.unix;
-    pub const file = AF.unix;
-    pub const netlink = AF.route;
+    pub const local = Af.unix;
+    pub const file = Af.unix;
+    pub const netlink = Af.route;
+
+    // Deprecated constants for
backward compatibility + pub const UNSPEC: u16 = @intFromEnum(Af.unspec); + pub const UNIX: u16 = @intFromEnum(Af.unix); + pub const LOCAL: u16 = @intFromEnum(local); + pub const FILE: u16 = @intFromEnum(file); + pub const INET: u16 = @intFromEnum(Af.inet); + pub const AX25: u16 = @intFromEnum(Af.ax25); + pub const IPX: u16 = @intFromEnum(Af.ipx); + pub const APPLETALK: u16 = @intFromEnum(Af.appletalk); + pub const NETROM: u16 = @intFromEnum(Af.netrom); + pub const BRIDGE: u16 = @intFromEnum(Af.bridge); + pub const ATMPVC: u16 = @intFromEnum(Af.atmpvc); + pub const X25: u16 = @intFromEnum(Af.x25); + pub const INET6: u16 = @intFromEnum(Af.inet6); + pub const ROSE: u16 = @intFromEnum(Af.rose); + pub const DECnet: u16 = @intFromEnum(Af.decnet); + pub const NETBEUI: u16 = @intFromEnum(Af.netbeui); + pub const SECURITY: u16 = @intFromEnum(Af.security); + pub const KEY: u16 = @intFromEnum(Af.key); + pub const ROUTE: u16 = @intFromEnum(Af.route); + pub const NETLINK: u16 = @intFromEnum(netlink); + pub const PACKET: u16 = @intFromEnum(Af.packet); + pub const ASH: u16 = @intFromEnum(Af.ash); + pub const ECONET: u16 = @intFromEnum(Af.econet); + pub const ATMSVC: u16 = @intFromEnum(Af.atmsvc); + pub const RDS: u16 = @intFromEnum(Af.rds); + pub const SNA: u16 = @intFromEnum(Af.sna); + pub const IRDA: u16 = @intFromEnum(Af.irda); + pub const PPPOX: u16 = @intFromEnum(Af.pppox); + pub const WANPIPE: u16 = @intFromEnum(Af.wanpipe); + pub const LLC: u16 = @intFromEnum(Af.llc); + pub const IB: u16 = @intFromEnum(Af.ib); + pub const MPLS: u16 = @intFromEnum(Af.mpls); + pub const CAN: u16 = @intFromEnum(Af.can); + pub const TIPC: u16 = @intFromEnum(Af.tipc); + pub const BLUETOOTH: u16 = @intFromEnum(Af.bluetooth); + pub const IUCV: u16 = @intFromEnum(Af.iucv); + pub const RXRPC: u16 = @intFromEnum(Af.rxrpc); + pub const ISDN: u16 = @intFromEnum(Af.isdn); + pub const PHONET: u16 = @intFromEnum(Af.phonet); + pub const IEEE802154: u16 = @intFromEnum(Af.ieee802154); + pub const CAIF: u16 = @intFromEnum(Af.caif); + pub const ALG: u16 = @intFromEnum(Af.alg); + pub const NFC: u16 = @intFromEnum(Af.nfc); + pub const VSOCK: u16 = @intFromEnum(Af.vsock); + pub const KCM: u16 = @intFromEnum(Af.kcm); + pub const QIPCRTR: u16 = @intFromEnum(Af.qipcrtr); + pub const SMC: u16 = @intFromEnum(Af.smc); + pub const XDP: u16 = @intFromEnum(Af.xdp); + pub const MAX: u16 = @intFromEnum(Af.max); }; -/// Protocol Family (same values as Protocol Family) -pub const PF = AF; - pub const SO = if (is_mips) struct { pub const DEBUG = 1; pub const REUSEADDR = 0x0004; @@ -5863,7 +5981,8 @@ pub const signalfd_siginfo = extern struct { }; pub const in_port_t = u16; -pub const sa_family_t = AF; +// TODO: change to AF type +pub const sa_family_t = u16; pub const socklen_t = u32; pub const sockaddr = extern struct { @@ -5883,7 +6002,7 @@ pub const sockaddr = extern struct { /// IPv4 socket address pub const in = extern struct { - family: sa_family_t = AF.INET, + family: sa_family_t = Af.INET, port: in_port_t, addr: u32, zero: [8]u8 = [8]u8{ 0, 0, 0, 0, 0, 0, 0, 0 }, @@ -5891,7 +6010,7 @@ pub const sockaddr = extern struct { /// IPv6 socket address pub const in6 = extern struct { - family: sa_family_t = AF.INET6, + family: sa_family_t = Af.INET6, port: in_port_t, flowinfo: u32, addr: [16]u8, @@ -5900,13 +6019,13 @@ pub const sockaddr = extern struct { /// UNIX domain socket address pub const un = extern struct { - family: sa_family_t = AF.UNIX, + family: sa_family_t = Af.UNIX, path: [108]u8, }; /// Packet socket address pub const ll = extern 
 struct {
-        family: sa_family_t = AF.PACKET,
+        family: sa_family_t = Af.PACKET,
         protocol: u16,
         ifindex: i32,
         hatype: u16,
@@ -5917,7 +6036,7 @@ pub const sockaddr = extern struct {
     /// Netlink socket address
     pub const nl = extern struct {
-        family: sa_family_t = AF.NETLINK,
+        family: sa_family_t = Af.NETLINK,
         __pad1: c_ushort = 0,

         /// port ID
@@ -5928,7 +6047,7 @@
     };

     pub const xdp = extern struct {
-        family: u16 = AF.XDP,
+        family: u16 = Af.XDP,
         flags: u16,
         ifindex: u32,
         queue_id: u32,
@@ -5937,7 +6056,7 @@ pub const sockaddr = extern struct {
     /// Address structure for vSockets
     pub const vm = extern struct {
-        family: sa_family_t = AF.VSOCK,
+        family: sa_family_t = Af.VSOCK,
         reserved1: u16 = 0,
         port: u32,
         cid: u32,
@@ -7111,6 +7230,38 @@ pub const Statx = extern struct {
     };
 };

+// DEPRECATED aliases to Statx.Mask and Statx.Attr
+pub const STATX_TYPE: u32 = @bitCast(Statx.Mask{ .type = true });
+pub const STATX_MODE: u32 = @bitCast(Statx.Mask{ .mode = true });
+pub const STATX_NLINK: u32 = @bitCast(Statx.Mask{ .nlink = true });
+pub const STATX_UID: u32 = @bitCast(Statx.Mask{ .uid = true });
+pub const STATX_GID: u32 = @bitCast(Statx.Mask{ .gid = true });
+pub const STATX_ATIME: u32 = @bitCast(Statx.Mask{ .atime = true });
+pub const STATX_MTIME: u32 = @bitCast(Statx.Mask{ .mtime = true });
+pub const STATX_CTIME: u32 = @bitCast(Statx.Mask{ .ctime = true });
+pub const STATX_INO: u32 = @bitCast(Statx.Mask{ .ino = true });
+pub const STATX_SIZE: u32 = @bitCast(Statx.Mask{ .size = true });
+pub const STATX_BLOCKS: u32 = @bitCast(Statx.Mask{ .blocks = true });
+pub const STATX_BASIC_STATS: u32 = @bitCast(Statx.Mask.basic_stats);
+pub const STATX_BTIME: u32 = @bitCast(Statx.Mask{ .btime = true });
+pub const STATX_MNT_ID: u32 = @bitCast(Statx.Mask{ .mnt_id = true });
+pub const STATX_DIOALIGN: u32 = @bitCast(Statx.Mask{ .dioalign = true });
+pub const STATX_MNT_ID_UNIQUE: u32 = @bitCast(Statx.Mask{ .mnt_id_unique = true });
+pub const STATX_SUBVOL: u32 = @bitCast(Statx.Mask{ .subvol = true });
+pub const STATX_WRITE_ATOMIC: u32 = @bitCast(Statx.Mask{ .write_atomic = true });
+pub const STATX_DIO_READ_ALIGN: u32 = @bitCast(Statx.Mask{ .dio_read_align = true });
+
+pub const STATX_ATTR_COMPRESSED: u64 = @bitCast(Statx.Attr{ .compressed = true });
+pub const STATX_ATTR_IMMUTABLE: u64 = @bitCast(Statx.Attr{ .immutable = true });
+pub const STATX_ATTR_APPEND: u64 = @bitCast(Statx.Attr{ .append = true });
+pub const STATX_ATTR_NODUMP: u64 = @bitCast(Statx.Attr{ .nodump = true });
+pub const STATX_ATTR_ENCRYPTED: u64 = @bitCast(Statx.Attr{ .encrypted = true });
+pub const STATX_ATTR_AUTOMOUNT: u64 = @bitCast(Statx.Attr{ .automount = true });
+pub const STATX_ATTR_MOUNT_ROOT: u64 = @bitCast(Statx.Attr{ .mount_root = true });
+pub const STATX_ATTR_VERITY: u64 = @bitCast(Statx.Attr{ .verity = true });
+pub const STATX_ATTR_DAX: u64 = @bitCast(Statx.Attr{ .dax = true });
+pub const STATX_ATTR_WRITE_ATOMIC: u64 = @bitCast(Statx.Attr{ .write_atomic = true });
+
 pub const addrinfo = extern struct {
     flags: AI,
     family: i32,
@@ -7136,6 +7287,8 @@ pub const AI = packed struct(u32) {

 pub const IPPORT_RESERVED = 1024;

+/// Deprecated alias to IpProto
+pub const IPPROTO = IpProto;
 /// IP Protocol numbers
 pub const IpProto = enum(u16) {
     ip = 0,
@@ -7175,6 +7328,42 @@ pub const IpProto = enum(u16) {
     // Aliases
     pub const hopopts = IpProto.ip;
     pub const default = IpProto.ip;
+
+    // Deprecated constants, kept for backward compatibility;
+    // use the enum fields instead.
+    pub const IP: u16 = @intFromEnum(IpProto.ip);
+    pub const HOPOPTS: u16 = @intFromEnum(hopopts);
+    pub const ICMP: u16 = @intFromEnum(IpProto.icmp);
+    pub const IGMP: u16 = @intFromEnum(IpProto.igmp);
+    pub const IPIP: u16 = @intFromEnum(IpProto.ipip);
+    pub const TCP: u16 = @intFromEnum(IpProto.tcp);
+    pub const EGP: u16 = @intFromEnum(IpProto.egp);
+    pub const PUP: u16 = @intFromEnum(IpProto.pup);
+    pub const UDP: u16 = @intFromEnum(IpProto.udp);
+    pub const IDP: u16 = @intFromEnum(IpProto.idp);
+    pub const TP: u16 = @intFromEnum(IpProto.tp);
+    pub const DCCP: u16 = @intFromEnum(IpProto.dccp);
+    pub const IPV6: u16 = @intFromEnum(IpProto.ipv6);
+    pub const ROUTING: u16 = @intFromEnum(IpProto.routing);
+    pub const FRAGMENT: u16 = @intFromEnum(IpProto.fragment);
+    pub const RSVP: u16 = @intFromEnum(IpProto.rsvp);
+    pub const GRE: u16 = @intFromEnum(IpProto.gre);
+    pub const ESP: u16 = @intFromEnum(IpProto.esp);
+    pub const AH: u16 = @intFromEnum(IpProto.ah);
+    pub const ICMPV6: u16 = @intFromEnum(IpProto.icmpv6);
+    pub const NONE: u16 = @intFromEnum(IpProto.none);
+    pub const DSTOPTS: u16 = @intFromEnum(IpProto.dstopts);
+    pub const MTP: u16 = @intFromEnum(IpProto.mtp);
+    pub const BEETPH: u16 = @intFromEnum(IpProto.beetph);
+    pub const ENCAP: u16 = @intFromEnum(IpProto.encap);
+    pub const PIM: u16 = @intFromEnum(IpProto.pim);
+    pub const COMP: u16 = @intFromEnum(IpProto.comp);
+    pub const SCTP: u16 = @intFromEnum(IpProto.sctp);
+    pub const MH: u16 = @intFromEnum(IpProto.mh);
+    pub const UDPLITE: u16 = @intFromEnum(IpProto.udplite);
+    pub const MPLS: u16 = @intFromEnum(IpProto.mpls);
+    pub const RAW: u16 = @intFromEnum(IpProto.raw);
+    pub const MAX: u16 = @intFromEnum(IpProto.max);
 };

 pub const tcp_repair_opt = extern struct {
diff --git a/lib/std/os/linux/IoUring.zig b/lib/std/os/linux/IoUring.zig
index 7ca59b1f331e..a28df8c880c7 100644
--- a/lib/std/os/linux/IoUring.zig
+++ b/lib/std/os/linux/IoUring.zig
@@ -1328,7 +1328,7 @@ pub fn unregister_files(self: *IoUring) !void {
 pub fn socket(
     self: *IoUring,
     user_data: u64,
-    domain: linux.AF,
+    domain: linux.Af,
     socket_type: linux.Sock,
     protocol: linux.IpProto,
     /// flags is unused
     flags: u32,
@@ -1345,7 +1345,7 @@ pub fn socket(
 pub fn socket_direct(
     self: *IoUring,
     user_data: u64,
-    domain: linux.AF,
+    domain: linux.Af,
     socket_type: linux.Sock,
     protocol: linux.IpProto,
     /// flags is unused
@@ -2657,7 +2657,7 @@ test "shutdown" {
     const server = try posix.socket(address.family, posix.SOCK.STREAM | posix.SOCK.CLOEXEC, 0);
     defer posix.close(server);

-    const shutdown_sqe = ring.shutdown(0x445445445, server, linux.SHUT.RD) catch |err| switch (err) {
+    const shutdown_sqe = ring.shutdown(0x445445445, server, .rd) catch |err| switch (err) {
         else => |errno| std.debug.panic("unhandled errno: {}", .{errno}),
     };
     try testing.expectEqual(Op.SHUTDOWN, shutdown_sqe.opcode);
@@ -3339,7 +3339,7 @@ fn createSocketTestHarness(ring: *IoUring) !SocketTestHarness {
     // Submit 1 accept
     var accept_addr: posix.sockaddr = undefined;
     var accept_addr_len: posix.socklen_t = @sizeOf(@TypeOf(accept_addr));
-    _ = try ring.accept(0xaaaaaaaa, listener_socket, &accept_addr, &accept_addr_len, 0);
+    _ = try ring.accept(0xaaaaaaaa, listener_socket, &accept_addr, &accept_addr_len, .{});

     // Create a TCP client socket
     const client = try posix.socket(address.family, posix.SOCK.STREAM | posix.SOCK.CLOEXEC, 0);
@@ -3417,7 +3417,7 @@ test "accept multishot" {
     var addr: posix.sockaddr = undefined;
     var addr_len: posix.socklen_t = @sizeOf(@TypeOf(addr));
     const userdata: u64 = 0xaaaaaaaa;
-    _ = try ring.accept_multishot(userdata, listener_socket, &addr, &addr_len, 0);
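With `Shut`, the accept flags, and the socket enums all typed, these call sites read as intent rather than magic numbers: `.rd` instead of `linux.SHUT.RD`, and an empty literal `.{}` where a zero mask used to go. A hedged sketch of an accept submission under the new signatures (user_data value arbitrary):

const std = @import("std");
const linux = std.os.linux;

fn submitAccept(ring: *linux.IoUring, listener: linux.fd_t) !void {
    var addr: linux.sockaddr = undefined;
    var addr_len: linux.socklen_t = @sizeOf(linux.sockaddr);
    // `.{}` is the typed default; the old API took a literal 0.
    _ = try ring.accept(0xaaaaaaaa, listener, &addr, &addr_len, .{});
    _ = try ring.submit();
}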
+    _ = try ring.accept_multishot(userdata, listener_socket, &addr, &addr_len, .{});
     try testing.expectEqual(@as(u32, 1), try ring.submit());

     var nr: usize = 4; // number of clients to connect
@@ -3525,7 +3525,7 @@ test "accept_direct" {
         const buffer_send: []const u8 = data[0 .. data.len - i]; // make it different at each loop
         // submit accept, will chose registered fd and return index in cqe
-        _ = try ring.accept_direct(accept_userdata, listener_socket, null, null, 0);
+        _ = try ring.accept_direct(accept_userdata, listener_socket, null, null, .{});
         try testing.expectEqual(@as(u32, 1), try ring.submit());

         // connect
@@ -3608,7 +3608,7 @@ test "accept_multishot_direct" {
     for (0..2) |_| {
         // submit multishot accept
         // Will chose registered fd and return index of the selected registered file in cqe.
-        _ = try ring.accept_multishot_direct(accept_userdata, listener_socket, null, null, 0);
+        _ = try ring.accept_multishot_direct(accept_userdata, listener_socket, null, null, .{});
         try testing.expectEqual(@as(u32, 1), try ring.submit());

         for (registered_fds) |_| {
@@ -3654,7 +3654,7 @@ test "socket" {
     defer ring.deinit();

     // prepare, submit socket operation
-    _ = try ring.socket(0, linux.AF.INET, .{ .type = .stream }, 0, 0);
+    _ = try ring.socket(0, .inet, .{ .type = .stream }, .ip, 0);
     try testing.expectEqual(@as(u32, 1), try ring.submit());

     // test completion
@@ -3687,7 +3687,7 @@ test "socket_direct/socket_direct_alloc/close_direct" {
     try testing.expect(cqe_socket.res == 0);

     // create socket in registered file descriptor at index 1 (last param)
-    _ = try ring.socket_direct(0, linux.AF.INET, posix.SOCK.STREAM, 0, 0, 1);
+    _ = try ring.socket_direct(0, .inet, .{ .type = .stream }, .ip, 0, 1);
     try testing.expectEqual(@as(u32, 1), try ring.submit());
     cqe_socket = try ring.copy_cqe();
     try testing.expectEqual(posix.E.SUCCESS, cqe_socket.err());
@@ -3695,7 +3695,7 @@
     // create socket in kernel chosen file descriptor index (_alloc version)
     // completion res has index from registered files
-    _ = try ring.socket_direct_alloc(0, linux.AF.INET, posix.SOCK.STREAM, 0, 0);
+    _ = try ring.socket_direct_alloc(0, .inet, .{ .type = .stream }, .ip, 0);
     try testing.expectEqual(@as(u32, 1), try ring.submit());
     cqe_socket = try ring.copy_cqe();
     try testing.expectEqual(posix.E.SUCCESS, cqe_socket.err());
@@ -5114,21 +5114,21 @@ pub const Sqe = extern struct {
     pub fn prep_socket(
         sqe: *Sqe,
-        domain: linux.AF,
-        socket_type: linux.SOCK,
-        protocol: u32, // Enumerate https://github.com/kraj/musl/blob/kraj/master/src/network/proto.c#L7
+        domain: linux.Af,
+        socket_type: linux.Sock,
+        protocol: linux.IpProto, // Enumerate https://github.com/kraj/musl/blob/kraj/master/src/network/proto.c#L7
         /// flags is unused
         flags: u32,
     ) void {
-        sqe.prep_rw(.SOCKET, @intCast(domain), 0, protocol, socket_type);
+        sqe.prep_rw(.SOCKET, @intFromEnum(domain), 0, @intFromEnum(protocol), @intCast(@as(u32, @bitCast(socket_type))));
         sqe.rw_flags = flags;
     }

     pub fn prep_socket_direct(
         sqe: *Sqe,
-        domain: linux.AF,
+        domain: linux.Af,
         socket_type: linux.Sock,
-        protocol: u32, // Enumerate https://github.com/kraj/musl/blob/kraj/master/src/network/proto.c#L7
+        protocol: linux.IpProto, // Enumerate https://github.com/kraj/musl/blob/kraj/master/src/network/proto.c#L7
         /// flags is unused
         flags: u32,
         file_index: u32,
@@ -5139,7 +5139,7 @@ pub const Sqe = extern struct {
     pub fn prep_socket_direct_alloc(
         sqe: *Sqe,
-        domain: linux.AF,
+        domain: linux.Af,
         socket_type: linux.SOCK,
         protocol: u32, // Enumerate
https://github.com/kraj/musl/blob/kraj/master/src/network/proto.c#L7 flags: u32, // flags is unused From 8a443a29d0e09fae4c4828630e3cbc584a6fba91 Mon Sep 17 00:00:00 2001 From: Bernard Assan Date: Mon, 6 Oct 2025 17:18:56 +0000 Subject: [PATCH 09/41] Replace MSG with Packed Struct Flags Signed-off-by: Bernard Assan --- lib/std/os/linux.zig | 104 +- lib/std/os/linux/IoUring.zig | 8003 +++++++++++++++++----------------- 2 files changed, 4084 insertions(+), 4023 deletions(-) diff --git a/lib/std/os/linux.zig b/lib/std/os/linux.zig index 7a802e806569..70d3a448331d 100644 --- a/lib/std/os/linux.zig +++ b/lib/std/os/linux.zig @@ -4972,28 +4972,88 @@ pub const ETH = struct { }; }; -pub const MSG = struct { - pub const OOB = 0x0001; - pub const PEEK = 0x0002; - pub const DONTROUTE = 0x0004; - pub const CTRUNC = 0x0008; - pub const PROXY = 0x0010; - pub const TRUNC = 0x0020; - pub const DONTWAIT = 0x0040; - pub const EOR = 0x0080; - pub const WAITALL = 0x0100; - pub const FIN = 0x0200; - pub const SYN = 0x0400; - pub const CONFIRM = 0x0800; - pub const RST = 0x1000; - pub const ERRQUEUE = 0x2000; - pub const NOSIGNAL = 0x4000; - pub const MORE = 0x8000; - pub const WAITFORONE = 0x10000; - pub const BATCH = 0x40000; - pub const ZEROCOPY = 0x4000000; - pub const FASTOPEN = 0x20000000; - pub const CMSG_CLOEXEC = 0x40000000; +// Deprecated alias for Msg +pub const MSG = Msg; +pub const Msg = packed struct(u32) { + /// Process out-of-band data + oob: bool = false, + /// Peek at incoming message + peek: bool = false, + /// Send without using routing tables + dontroute: bool = false, + /// Control data truncated + ctrunc: bool = false, + /// Do not send. Only probe path (e.g. for MTU) + probe: bool = false, + /// Normal data truncated + trunc: bool = false, + /// Nonblocking I/O + dontwait: bool = false, + /// End of record + eor: bool = false, + /// Wait for a full request + waitall: bool = false, + /// FIN flag + fin: bool = false, + /// SYN flag + syn: bool = false, + /// Confirm path validity + confirm: bool = false, + /// RST flag + rst: bool = false, + /// Fetch message from error queue + errqueue: bool = false, + /// Do not generate SIGPIPE + nosignal: bool = false, + /// Sender will send more + more: bool = false, + /// recvmmsg(): block until 1+ packets available + waitforone: bool = false, + _18: u1 = 0, + /// sendmmsg(): more messages coming + batch: bool = false, + /// sendpage() internal: page frags are not shared + no_shared_frags: bool = false, + /// sendpage() internal: page may carry plain text and require encryption + sendpage_decrypted: bool = false, + _22: u4 = 0, + // COMMIT: new flags + /// Receive devmem skbs as cmsg + sock_devmem: bool = false, + /// Use user data in kernel path + zerocopy: bool = false, + /// Splice the pages from the iterator in sendmsg() + splice_pages: bool = false, + _29: u1 = 0, + /// Send data in TCP SYN + fastopen: bool = false, + /// Set close_on_exec for file descriptor received through SCM_RIGHTS + cmsg_cloexec: bool = false, + _: u1 = 0, + + // DEPRECATED CONSTANTS + pub const OOB: u32 = @bitCast(Msg{ .oob = true }); + pub const PEEK: u32 = @bitCast(Msg{ .peek = true }); + pub const DONTROUTE: u32 = @bitCast(Msg{ .dontroute = true }); + pub const CTRUNC: u32 = @bitCast(Msg{ .ctrunc = true }); + // fix typo PROBE not PROXY + pub const PROBE: u32 = @bitCast(Msg{ .probe = true }); + pub const TRUNC: u32 = @bitCast(Msg{ .trunc = true }); + pub const DONTWAIT: u32 = @bitCast(Msg{ .dontwait = true }); + pub const EOR: u32 = @bitCast(Msg{ .eor = true }); + pub 
const WAITALL: u32 = @bitCast(Msg{ .waitall = true }); + pub const FIN: u32 = @bitCast(Msg{ .fin = true }); + pub const SYN: u32 = @bitCast(Msg{ .syn = true }); + pub const CONFIRM: u32 = @bitCast(Msg{ .confirm = true }); + pub const RST: u32 = @bitCast(Msg{ .rst = true }); + pub const ERRQUEUE: u32 = @bitCast(Msg{ .errqueue = true }); + pub const NOSIGNAL: u32 = @bitCast(Msg{ .nosignal = true }); + pub const MORE: u32 = @bitCast(Msg{ .more = true }); + pub const WAITFORONE: u32 = @bitCast(Msg{ .waitforone = true }); + pub const BATCH: u32 = @bitCast(Msg{ .batch = true }); + pub const ZEROCOPY: u32 = @bitCast(Msg{ .zerocopy = true }); + pub const FASTOPEN: u32 = @bitCast(Msg{ .fastopen = true }); + pub const CMSG_CLOEXEC: u32 = @bitCast(Msg{ .cmsg_cloexec = true }); }; pub const DT = struct { diff --git a/lib/std/os/linux/IoUring.zig b/lib/std/os/linux/IoUring.zig index a28df8c880c7..fdc956a1b4c4 100644 --- a/lib/std/os/linux/IoUring.zig +++ b/lib/std/os/linux/IoUring.zig @@ -602,7 +602,7 @@ pub fn recv( user_data: u64, fd: linux.fd_t, buffer: RecvBuffer, - flags: u32, + flags: linux.Msg, ) !*Sqe { const sqe = try self.get_sqe(); switch (buffer) { @@ -626,7 +626,7 @@ pub fn send( user_data: u64, fd: linux.fd_t, buffer: []const u8, - flags: u32, + flags: linux.Msg, ) !*Sqe { const sqe = try self.get_sqe(); sqe.prep_send(fd, buffer, flags); @@ -654,8 +654,8 @@ pub fn send_zc( user_data: u64, fd: linux.fd_t, buffer: []const u8, - send_flags: u32, - zc_flags: u16, + send_flags: linux.Msg, + zc_flags: Sqe.SendRecv, ) !*Sqe { const sqe = try self.get_sqe(); sqe.prep_send_zc(fd, buffer, send_flags, zc_flags); @@ -705,7 +705,7 @@ pub fn sendmsg( user_data: u64, fd: linux.fd_t, msg: *const linux.msghdr_const, - flags: u32, + flags: linux.Msg, ) !*Sqe { const sqe = try self.get_sqe(); sqe.prep_sendmsg(fd, msg, flags); @@ -1593,1474 +1593,1977 @@ pub fn buf_ring_advance(br: *BufferRing, count: u16) void { @atomicStore(u16, &br.tail, tail, .release); } -test "structs/offsets/entries" { - if (!is_linux) return error.SkipZigTest; - - try testing.expectEqual(@as(usize, 120), @sizeOf(Params)); - try testing.expectEqual(@as(usize, 64), @sizeOf(Sqe)); - try testing.expectEqual(@as(usize, 16), @sizeOf(Cqe)); - - try testing.expectEqual(0, constants.OFF_SQ_RING); - try testing.expectEqual(0x8000000, constants.OFF_CQ_RING); - try testing.expectEqual(0x10000000, constants.OFF_SQES); - - try testing.expectError(error.EntriesZero, IoUring.init(0, .{})); - try testing.expectError(error.EntriesNotPowerOfTwo, IoUring.init(3, .{})); -} - -test "nop" { - if (!is_linux) return error.SkipZigTest; +// IO completion data structure (Completion Queue Entry) +pub const Cqe = extern struct { + /// sqe.user_data value passed back + user_data: u64, + /// result code for this event + res: i32, + flags: Flags, + // COMMIT: add big_cqe which was missing in io_uring_cqe type declaration + // TODO: add support for the IORING_SETUP_CQE32 case + /// If the ring is initialized with IORING_SETUP_CQE32, then this field + /// contains 16-bytes of padding, doubling the size of the CQE. 
+    // big_cqe: ?[2]u64,
-    var ring = IoUring.init(1, .{}) catch |err| switch (err) {
-        error.SystemOutdated => return error.SkipZigTest,
-        error.PermissionDenied => return error.SkipZigTest,
-        else => return err,
+    /// cqe.flags
+    pub const Flags = packed struct(u32) {
+        /// IORING_CQE_F_BUFFER If set, the upper 16 bits are the buffer ID
+        F_BUFFER: bool = false,
+        /// IORING_CQE_F_MORE If set, parent SQE will generate more CQE entries
+        F_MORE: bool = false,
+        /// IORING_CQE_F_SOCK_NONEMPTY If set, more data to read after socket recv
+        F_SOCK_NONEMPTY: bool = false,
+        /// IORING_CQE_F_NOTIF Set for notification CQEs. Can be used to distinguish
+        /// them from sends.
+        F_NOTIF: bool = false,
+        /// IORING_CQE_F_BUF_MORE If set, the buffer ID set in the completion will get
+        /// more completions. In other words, the buffer is being
+        /// partially consumed, and will be used by the kernel for
+        /// more completions. This is only set for buffers used via
+        /// the incremental buffer consumption, as provided by
+        /// a ring buffer setup with IOU_PBUF_RING_INC. For any
+        /// other provided buffer type, all completions with a
+        /// buffer passed back are automatically returned to the
+        /// application.
+        F_BUF_MORE: bool = false,
+        // COMMIT: new flags
+        /// IORING_CQE_F_SKIP If set, then the application/liburing must ignore this
+        /// CQE. Its only purpose is to fill a gap in the ring,
+        /// if a large CQE is posted when the ring has
+        /// just a single small CQE worth of space left before
+        /// wrapping.
+        F_SKIP: bool = false,
+        _unused: u9 = 0,
+        /// IORING_CQE_F_32 If set, this is a 32b/big-cqe posting. Use with rings
+        /// setup in a mixed CQE mode, where both 16b and 32b
+        /// CQEs may be posted to the CQ ring.
+        F_32: bool = false,
+        _unused_1: u16 = 0,
     };
-    defer {
-        ring.deinit();
-        testing.expectEqual(@as(linux.fd_t, -1), ring.fd) catch @panic("test failed");
-    }
-
-    const sqe = try ring.nop(0xaaaaaaaa);
-    try testing.expectEqual(Sqe{
-        .opcode = .NOP,
-        .flags = .{},
-        .ioprio = .init_empty(),
-        .fd = 0,
-        .off = 0,
-        .addr = 0,
-        .len = 0,
-        .rw_flags = 0,
-        .user_data = 0xaaaaaaaa,
-        .buf_index = 0,
-        .personality = 0,
-        .splice_fd_in = 0,
-        .addr3 = 0,
-        .resv = 0,
-    }, sqe.*);
-    try testing.expectEqual(@as(u32, 0), ring.sq.sqe_head);
-    try testing.expectEqual(@as(u32, 1), ring.sq.sqe_tail);
-    try testing.expectEqual(@as(u32, 0), ring.sq.tail.*);
-    try testing.expectEqual(@as(u32, 0), ring.cq.head.*);
-    try testing.expectEqual(@as(u32, 1), ring.sq_ready());
-    try testing.expectEqual(@as(u32, 0), ring.cq_ready());
+    pub fn err(self: Cqe) linux.E {
+        if (self.res > -4096 and self.res < 0) {
+            return @as(linux.E, @enumFromInt(-self.res));
+        }
+        return .SUCCESS;
+    }

-    try testing.expectEqual(@as(u32, 1), try ring.submit());
-    try testing.expectEqual(@as(u32, 1), ring.sq.sqe_head);
-    try testing.expectEqual(@as(u32, 1), ring.sq.sqe_tail);
-    try testing.expectEqual(@as(u32, 1), ring.sq.tail.*);
-    try testing.expectEqual(@as(u32, 0), ring.cq.head.*);
-    try testing.expectEqual(@as(u32, 0), ring.sq_ready());
+    // On successful completion of the provided buffers IO request, the CQE flags field
+    // will have IORING_CQE_F_BUFFER set and the selected buffer ID will be indicated by
+    // the upper 16-bits of the flags field.
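In other words: check `F_BUFFER`, then shift the raw flag bits right by `CQE_BUFFER_SHIFT` to recover the buffer ID, which is exactly what `buffer_id` below packages up. A consumer might use it like this hedged sketch:

const std = @import("std");
const linux = std.os.linux;

fn recycleBuffer(cqe: linux.IoUring.Cqe) void {
    if (cqe.buffer_id()) |bid| {
        // A provided buffer was selected for this completion; return
        // buffer `bid` to its buffer group once the data is consumed.
        std.log.debug("completion used provided buffer {d}", .{bid});
    } else |_| {
        // No provided buffer attached to this CQE; nothing to recycle.
    }
}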
+ pub fn buffer_id(self: Cqe) !u16 { + if (!self.flags.F_BUFFER) { + return error.NoBufferSelected; + } + return @intCast(@as(u32, @bitCast(self.flags)) >> constants.CQE_BUFFER_SHIFT); + } +}; - try testing.expectEqual(Cqe{ - .user_data = 0xaaaaaaaa, - .res = 0, - .flags = .{}, - }, try ring.copy_cqe()); - try testing.expectEqual(@as(u32, 1), ring.cq.head.*); - try testing.expectEqual(@as(u32, 0), ring.cq_ready()); +/// IO submission data structure (Submission Queue Entry) +/// matches io_uring_sqe in liburing +pub const Sqe = extern struct { + /// type of operation for this sqe + opcode: Op, + /// IOSQE_* flags + flags: IoSqe, + /// ioprio for the request + ioprio: packed union { + send_recv: SendRecv, + accept: Accept, + const Ioprio = @This(); - const sqe_barrier = try ring.nop(0xbbbbbbbb); - sqe_barrier.flags.IO_DRAIN = true; - try testing.expectEqual(@as(u32, 1), try ring.submit()); - try testing.expectEqual(Cqe{ - .user_data = 0xbbbbbbbb, - .res = 0, - .flags = .{}, - }, try ring.copy_cqe()); - try testing.expectEqual(@as(u32, 2), ring.sq.sqe_head); - try testing.expectEqual(@as(u32, 2), ring.sq.sqe_tail); - try testing.expectEqual(@as(u32, 2), ring.sq.tail.*); - try testing.expectEqual(@as(u32, 2), ring.cq.head.*); -} + pub fn init_empty() Ioprio { + return @bitCast(@as(u16, 0)); + } + }, + /// file descriptor to do IO on + fd: i32, + /// offset into file + off: u64, + /// pointer to buffer or iovecs + addr: u64, + /// buffer size or number of iovecs + len: u32, + /// flags for any sqe operation + /// rw_flags | fsync_flags | poll_event | poll32_event | sync_range_flags | msg_flags + /// timeout_flags | accept_flags | cancel_flags | open_flags | statx_flags + /// fadvise_advice | splice_flags | rename_flags | unlink_flags | hardlink_flags + /// xattr_flags | msg_ring_flags | uring_cmd_flags | waitid_flags | futex_flags + /// install_fd_flags | nop_flags | pipe_flags + rw_flags: u32, + /// data to be passed back at completion time + user_data: u64, + /// index into fixed buffers or for grouped buffer selection + buf_index: u16, + personality: u16, + splice_fd_in: i32, + addr3: u64, + resv: u64, -test "readv" { - if (!is_linux) return error.SkipZigTest; + /// sqe.flags + pub const IoSqe = packed struct(u8) { + /// use fixed fileset + FIXED_FILE: bool = false, + /// issue after inflight IO + IO_DRAIN: bool = false, + /// links next sqe + IO_LINK: bool = false, + /// like LINK, but stronger + IO_HARDLINK: bool = false, + /// always go async + ASYNC: bool = false, + /// select buffer from sqe->buf_group + BUFFER_SELECT: bool = false, + /// don't post CQE if request succeeded + CQE_SKIP_SUCCESS: bool = false, + _: u1 = 0, + }; - var ring = IoUring.init(1, .{}) catch |err| switch (err) { - error.SystemOutdated => return error.SkipZigTest, - error.PermissionDenied => return error.SkipZigTest, - else => return err, + /// send/sendmsg and recv/recvmsg flags (sqe.ioprio) + pub const SendRecv = packed struct(u16) { + /// IORING_RECVSEND_POLL_FIRST + /// If set, instead of first attempting to send or receive and arm poll + /// if that yields an -EAGAIN result, arm poll upfront and skip the + /// initial transfer attempt. + RECVSEND_POLL_FIRST: bool = false, + /// IORING_RECV_MULTISHOT + /// Multishot recv. Sets IORING_CQE_F_MORE if the handler will continue + /// to report CQEs on behalf of the same SQE. + RECV_MULTISHOT: bool = false, + /// IORING_RECVSEND_FIXED_BUF + /// Use registered buffers, the index is stored in the buf_index field. 
+        RECVSEND_FIXED_BUF: bool = false,
+        /// IORING_SEND_ZC_REPORT_USAGE
+        /// If set, SEND[MSG]_ZC should report the zerocopy usage in cqe.res
+        /// for the IORING_CQE_F_NOTIF cqe. 0 is reported if zerocopy was
+        /// actually possible. IORING_NOTIF_USAGE_ZC_COPIED if data was copied
+        /// (at least partially).
+        SEND_ZC_REPORT_USAGE: bool = false,
+        /// IORING_RECVSEND_BUNDLE
+        /// Used with IOSQE_BUFFER_SELECT. If set, send or recv will grab as
+        /// many buffers from the buffer group ID given and send them all.
+        /// The completion result will be the number of buffers sent, with the
+        /// starting buffer ID in cqe.flags as per usual for provided buffer
+        /// usage. The buffers will be contiguous from the starting buffer ID.
+        RECVSEND_BUNDLE: bool = false,
+        // COMMIT: new flags
+        /// IORING_SEND_VECTORIZED
+        /// If set, SEND[_ZC] will take a pointer to an io_vec to allow
+        /// vectorized send operations.
+        SEND_VECTORIZED: bool = false,
+        _: u10 = 0,
     };
-    defer ring.deinit();
-    const fd = try posix.openZ("/dev/zero", .{ .ACCMODE = .RDONLY, .CLOEXEC = true }, 0);
-    defer posix.close(fd);
+    /// accept flags stored in sqe.ioprio
+    pub const Accept = packed struct(u16) {
+        MULTISHOT: bool = false,
+        // COMMIT: new Flags
+        DONTWAIT: bool = false,
+        POLL_FIRST: bool = false,
+        _unused: u13 = 0,
+    };
-    // Linux Kernel 5.4 supports IORING_REGISTER_FILES but not sparse fd sets (i.e. an fd of -1).
-    // Linux Kernel 5.5 adds support for sparse fd sets.
-    // Compare:
-    // https://github.com/torvalds/linux/blob/v5.4/fs/io_uring.c#L3119-L3124 vs
-    // https://github.com/torvalds/linux/blob/v5.8/fs/io_uring.c#L6687-L6691
-    // We therefore avoid stressing sparse fd sets here:
-    var registered_fds = [_]linux.fd_t{0} ** 1;
-    const fd_index = 0;
-    registered_fds[fd_index] = fd;
-    try ring.register_files(registered_fds[0..]);
+    pub fn prep_nop(sqe: *Sqe) void {
+        sqe.* = .{
+            .opcode = .NOP,
+            .flags = .{},
+            .ioprio = .init_empty(),
+            .fd = 0,
+            .off = 0,
+            .addr = 0,
+            .len = 0,
+            .rw_flags = 0,
+            .user_data = 0,
+            .buf_index = 0,
+            .personality = 0,
+            .splice_fd_in = 0,
+            .addr3 = 0,
+            .resv = 0,
+        };
+    }
-    var buffer = [_]u8{42} ** 128;
-    var iovecs = [_]posix.iovec{posix.iovec{ .base = &buffer, .len = buffer.len }};
-    const sqe = try ring.read(0xcccccccc, fd_index, .{ .iovecs = iovecs[0..]
}, 0); - try testing.expectEqual(Op.READV, sqe.opcode); - sqe.flags.FIXED_FILE = true; + pub fn prep_fsync(sqe: *Sqe, fd: linux.fd_t, flags: uflags.Fsync) void { + sqe.* = .{ + .opcode = .FSYNC, + .flags = .{}, + .ioprio = .init_empty(), + .fd = fd, + .off = 0, + .addr = 0, + .len = 0, + .rw_flags = @bitCast(flags), + .user_data = 0, + .buf_index = 0, + .personality = 0, + .splice_fd_in = 0, + .addr3 = 0, + .resv = 0, + }; + } - try testing.expectError(error.SubmissionQueueFull, ring.nop(0)); - try testing.expectEqual(@as(u32, 1), try ring.submit()); - try testing.expectEqual(Cqe{ - .user_data = 0xcccccccc, - .res = buffer.len, - .flags = .{}, - }, try ring.copy_cqe()); - try testing.expectEqualSlices(u8, &([_]u8{0} ** buffer.len), buffer[0..]); + pub fn prep_rw( + sqe: *Sqe, + op: Op, + fd: linux.fd_t, + addr: u64, + len: usize, + offset: u64, + ) void { + sqe.* = .{ + .opcode = op, + .flags = .{}, + .ioprio = .init_empty(), + .fd = fd, + .off = offset, + .addr = addr, + .len = @intCast(len), + .rw_flags = 0, + .user_data = 0, + .buf_index = 0, + .personality = 0, + .splice_fd_in = 0, + .addr3 = 0, + .resv = 0, + }; + } - try ring.unregister_files(); -} + pub fn prep_read(sqe: *Sqe, fd: linux.fd_t, buffer: []u8, offset: u64) void { + sqe.prep_rw(.READ, fd, @intFromPtr(buffer.ptr), buffer.len, offset); + } -test "writev/fsync/readv" { - if (!is_linux) return error.SkipZigTest; + pub fn prep_write(sqe: *Sqe, fd: linux.fd_t, buffer: []const u8, offset: u64) void { + sqe.prep_rw(.WRITE, fd, @intFromPtr(buffer.ptr), buffer.len, offset); + } - var ring = IoUring.init(4, .{}) catch |err| switch (err) { - error.SystemOutdated => return error.SkipZigTest, - error.PermissionDenied => return error.SkipZigTest, - else => return err, - }; - defer ring.deinit(); + pub fn prep_splice(sqe: *Sqe, fd_in: linux.fd_t, off_in: u64, fd_out: linux.fd_t, off_out: u64, len: usize) void { + sqe.prep_rw(.SPLICE, fd_out, undefined, len, off_out); + sqe.addr = off_in; + sqe.splice_fd_in = fd_in; + } - var tmp = std.testing.tmpDir(.{}); - defer tmp.cleanup(); + pub fn prep_readv( + sqe: *Sqe, + fd: linux.fd_t, + iovecs: []const std.posix.iovec, + offset: u64, + ) void { + sqe.prep_rw(.READV, fd, @intFromPtr(iovecs.ptr), iovecs.len, offset); + } - const path = "test_io_uring_writev_fsync_readv"; - const file = try tmp.dir.createFile(path, .{ .read = true, .truncate = true }); - defer file.close(); - const fd = file.handle; + pub fn prep_writev( + sqe: *Sqe, + fd: linux.fd_t, + iovecs: []const std.posix.iovec_const, + offset: u64, + ) void { + sqe.prep_rw(.WRITEV, fd, @intFromPtr(iovecs.ptr), iovecs.len, offset); + } - const buffer_write = [_]u8{42} ** 128; - const iovecs_write = [_]posix.iovec_const{ - posix.iovec_const{ .base = &buffer_write, .len = buffer_write.len }, - }; - var buffer_read = [_]u8{0} ** 128; - var iovecs_read = [_]posix.iovec{ - posix.iovec{ .base = &buffer_read, .len = buffer_read.len }, - }; + pub fn prep_read_fixed(sqe: *Sqe, fd: linux.fd_t, buffer: *std.posix.iovec, offset: u64, buffer_index: u16) void { + sqe.prep_rw(.READ_FIXED, fd, @intFromPtr(buffer.base), buffer.len, offset); + sqe.buf_index = buffer_index; + } - const sqe_writev = try ring.writev(0xdddddddd, fd, iovecs_write[0..], 17); - try testing.expectEqual(Op.WRITEV, sqe_writev.opcode); - try testing.expectEqual(@as(u64, 17), sqe_writev.off); - sqe_writev.flags.IO_LINK = true; + pub fn prep_write_fixed(sqe: *Sqe, fd: linux.fd_t, buffer: *std.posix.iovec, offset: u64, buffer_index: u16) void { + sqe.prep_rw(.WRITE_FIXED, fd, 
 @intFromPtr(buffer.base), buffer.len, offset);
+        sqe.buf_index = buffer_index;
+    }
-    const sqe_fsync = try ring.fsync(0xeeeeeeee, fd, .{});
-    try testing.expectEqual(Op.FSYNC, sqe_fsync.opcode);
-    try testing.expectEqual(fd, sqe_fsync.fd);
-    sqe_fsync.flags.IO_LINK = true;
+    pub fn prep_accept(
+        sqe: *Sqe,
+        fd: linux.fd_t,
+        addr: ?*linux.sockaddr,
+        addrlen: ?*linux.socklen_t,
+        flags: linux.Sock,
+    ) void {
+        // `addr` holds a pointer to `sockaddr`, and `addr2` holds a pointer to `socklen_t`.
+        // `addr2` maps to `sqe.off` (u64) instead of `sqe.len` (which is only a u32).
+        sqe.prep_rw(.ACCEPT, fd, @intFromPtr(addr), 0, @intFromPtr(addrlen));
+        sqe.rw_flags = @bitCast(flags);
+    }
-    const sqe_readv = try ring.read(0xffffffff, fd, .{ .iovecs = iovecs_read[0..] }, 17);
-    try testing.expectEqual(Op.READV, sqe_readv.opcode);
-    try testing.expectEqual(@as(u64, 17), sqe_readv.off);
+    /// accept directly into the fixed file table
+    pub fn prep_accept_direct(
+        sqe: *Sqe,
+        fd: linux.fd_t,
+        addr: ?*linux.sockaddr,
+        addrlen: ?*linux.socklen_t,
+        flags: linux.Sock,
+        file_index: u32,
+    ) void {
+        prep_accept(sqe, fd, addr, addrlen, flags);
+        set_target_fixed_file(sqe, file_index);
+    }
-    try testing.expectEqual(@as(u32, 3), ring.sq_ready());
-    try testing.expectEqual(@as(u32, 3), try ring.submit_and_wait(3));
-    try testing.expectEqual(@as(u32, 0), ring.sq_ready());
-    try testing.expectEqual(@as(u32, 3), ring.cq_ready());
+    pub fn prep_multishot_accept(
+        sqe: *Sqe,
+        fd: linux.fd_t,
+        addr: ?*linux.sockaddr,
+        addrlen: ?*linux.socklen_t,
+        flags: linux.Sock,
+    ) void {
+        prep_accept(sqe, fd, addr, addrlen, flags);
+        sqe.ioprio = .{ .accept = .{ .MULTISHOT = true } };
+    }
-    try testing.expectEqual(Cqe{
-        .user_data = 0xdddddddd,
-        .res = buffer_write.len,
-        .flags = .{},
-    }, try ring.copy_cqe());
-    try testing.expectEqual(@as(u32, 2), ring.cq_ready());
+    /// multishot accept directly into the fixed file table
+    pub fn prep_multishot_accept_direct(
+        sqe: *Sqe,
+        fd: linux.fd_t,
+        addr: ?*linux.sockaddr,
+        addrlen: ?*linux.socklen_t,
+        flags: linux.Sock,
+    ) void {
+        prep_multishot_accept(sqe, fd, addr, addrlen, flags);
+        set_target_fixed_file(sqe, constants.FILE_INDEX_ALLOC);
+    }
-    try testing.expectEqual(Cqe{
-        .user_data = 0xeeeeeeee,
-        .res = 0,
-        .flags = .{},
-    }, try ring.copy_cqe());
-    try testing.expectEqual(@as(u32, 1), ring.cq_ready());
+    fn set_target_fixed_file(sqe: *Sqe, file_index: u32) void {
+        const sqe_file_index: u32 = if (file_index == constants.FILE_INDEX_ALLOC)
+            constants.FILE_INDEX_ALLOC
+        else
+            // 0 means no fixed files, indexes should be encoded as "index + 1"
+            file_index + 1;
+        // This field is overloaded in liburing:
+        // splice_fd_in: i32
+        // sqe_file_index: u32
+        sqe.splice_fd_in = @bitCast(sqe_file_index);
+    }
-    try testing.expectEqual(Cqe{
-        .user_data = 0xffffffff,
-        .res = buffer_read.len,
-        .flags = .{},
-    }, try ring.copy_cqe());
-    try testing.expectEqual(@as(u32, 0), ring.cq_ready());
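The `index + 1` encoding in `set_target_fixed_file` is easy to trip over: 0 in `sqe_file_index` means "no fixed file", so real slots are shifted up by one, and the `FILE_INDEX_ALLOC` sentinel passes through unchanged. A standalone restatement of the rule:

const std = @import("std");

// Mirrors set_target_fixed_file's encoding; maxInt(u32) is FILE_INDEX_ALLOC.
fn encodeFixedSlot(file_index: u32) u32 {
    const alloc = std.math.maxInt(u32);
    return if (file_index == alloc) alloc else file_index + 1;
}

test "fixed file slots are stored off by one" {
    try std.testing.expectEqual(@as(u32, 1), encodeFixedSlot(0)); // slot 0
    try std.testing.expectEqual(@as(u32, 8), encodeFixedSlot(7)); // slot 7
    try std.testing.expectEqual(std.math.maxInt(u32), encodeFixedSlot(std.math.maxInt(u32)));
}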
+    pub fn prep_connect(
+        sqe: *Sqe,
+        fd: linux.fd_t,
+        addr: *const linux.sockaddr,
+        addrlen: linux.socklen_t,
+    ) void {
+        // `addrlen` maps to `sqe.off` (u64) instead of `sqe.len` (which is only a u32).
+        sqe.prep_rw(.CONNECT, fd, @intFromPtr(addr), 0, addrlen);
+    }
-    try testing.expectEqualSlices(u8, buffer_write[0..], buffer_read[0..]);
-}
+    pub fn prep_epoll_ctl(
+        sqe: *Sqe,
+        epfd: linux.fd_t,
+        fd: linux.fd_t,
+        op: u32,
+        ev: ?*linux.epoll_event,
+    ) void {
+        sqe.prep_rw(.EPOLL_CTL, epfd, @intFromPtr(ev), op, @intCast(fd));
+    }
-test "write/read" {
-    if (!is_linux) return error.SkipZigTest;
+    pub fn prep_recv(sqe: *Sqe, fd: linux.fd_t, buffer: []u8, flags: linux.Msg) void {
+        sqe.prep_rw(.RECV, fd, @intFromPtr(buffer.ptr), buffer.len, 0);
+        sqe.rw_flags = @bitCast(flags);
+    }
-    var ring = IoUring.init(2, .{}) catch |err| switch (err) {
-        error.SystemOutdated => return error.SkipZigTest,
-        error.PermissionDenied => return error.SkipZigTest,
-        else => return err,
-    };
-    defer ring.deinit();
-
-    var tmp = std.testing.tmpDir(.{});
-    defer tmp.cleanup();
-    const path = "test_io_uring_write_read";
-    const file = try tmp.dir.createFile(path, .{ .read = true, .truncate = true });
-    defer file.close();
-    const fd = file.handle;
+    // TODO: review recv `flags`
+    pub fn prep_recv_multishot(
+        sqe: *Sqe,
+        fd: linux.fd_t,
+        buffer: []u8,
+        flags: linux.Msg,
+    ) void {
+        sqe.prep_recv(fd, buffer, flags);
+        sqe.ioprio = .{ .send_recv = .{ .RECV_MULTISHOT = true } };
+    }
-    const buffer_write = [_]u8{97} ** 20;
-    var buffer_read = [_]u8{98} ** 20;
-    const sqe_write = try ring.write(0x11111111, fd, buffer_write[0..], 10);
-    try testing.expectEqual(Op.WRITE, sqe_write.opcode);
-    try testing.expectEqual(@as(u64, 10), sqe_write.off);
-    sqe_write.flags.IO_LINK = true;
-    const sqe_read = try ring.read(0x22222222, fd, .{ .buffer = buffer_read[0..] }, 10);
-    try testing.expectEqual(Op.READ, sqe_read.opcode);
-    try testing.expectEqual(@as(u64, 10), sqe_read.off);
-    try testing.expectEqual(@as(u32, 2), try ring.submit());
+    pub fn prep_recvmsg(
+        sqe: *Sqe,
+        fd: linux.fd_t,
+        msg: *linux.msghdr,
+        flags: linux.Msg,
+    ) void {
+        sqe.prep_rw(.RECVMSG, fd, @intFromPtr(msg), 1, 0);
+        sqe.rw_flags = @bitCast(flags);
+    }
-    const cqe_write = try ring.copy_cqe();
-    const cqe_read = try ring.copy_cqe();
-    // Prior to Linux Kernel 5.6 this is the only way to test for read/write support:
-    // https://lwn.net/Articles/809820/
-    if (cqe_write.err() == .INVAL) return error.SkipZigTest;
-    if (cqe_read.err() == .INVAL) return error.SkipZigTest;
-    try testing.expectEqual(Cqe{
-        .user_data = 0x11111111,
-        .res = buffer_write.len,
-        .flags = .{},
-    }, cqe_write);
-    try testing.expectEqual(Cqe{
-        .user_data = 0x22222222,
-        .res = buffer_read.len,
-        .flags = .{},
-    }, cqe_read);
-    try testing.expectEqualSlices(u8, buffer_write[0..], buffer_read[0..]);
-}
+    pub fn prep_recvmsg_multishot(
+        sqe: *Sqe,
+        fd: linux.fd_t,
+        msg: *linux.msghdr,
+        flags: linux.Msg,
+    ) void {
+        sqe.prep_recvmsg(fd, msg, flags);
+        sqe.ioprio = .{ .send_recv = .{ .RECV_MULTISHOT = true } };
+    }
-test "splice/read" {
-    if (!is_linux) return error.SkipZigTest;
+    // COMMIT: fix send[|recv] flag param type
+    pub fn prep_send(sqe: *Sqe, fd: linux.fd_t, buffer: []const u8, flags: linux.Msg) void {
+        sqe.prep_rw(.SEND, fd, @intFromPtr(buffer.ptr), buffer.len, 0);
+        sqe.rw_flags = @bitCast(flags);
+    }
-    var ring = IoUring.init(4, .{}) catch |err| switch (err) {
-        error.SystemOutdated => return error.SkipZigTest,
-        error.PermissionDenied => return error.SkipZigTest,
-        else => return err,
-    };
-    defer ring.deinit();
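Note the boundary rule these helpers follow: parameters are typed (`linux.Msg`), but the SQE's `rw_flags` slot stays a raw `u32`, so every prep function performs exactly one `@bitCast` when storing. From the caller's side the flags stay symbolic, as in this hedged sketch:

const std = @import("std");
const linux = std.os.linux;

fn sendQuietly(ring: *linux.IoUring, fd: linux.fd_t, data: []const u8) !void {
    // `.{ .nosignal = true }` replaces the old `linux.MSG.NOSIGNAL` u32 mask.
    _ = try ring.send(0xeeeeeeee, fd, data, .{ .nosignal = true });
    _ = try ring.submit();
}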
+    pub fn prep_send_zc(sqe: *Sqe, fd: linux.fd_t, buffer: []const u8, flags: linux.Msg, zc_flags: Sqe.SendRecv) void {
+        sqe.prep_rw(.SEND_ZC, fd, @intFromPtr(buffer.ptr), buffer.len, 0);
+        sqe.rw_flags = @bitCast(flags);
+        sqe.ioprio = .{ .send_recv = zc_flags };
+    }
-    var tmp = std.testing.tmpDir(.{});
-    const path_src = "test_io_uring_splice_src";
-    const file_src = try tmp.dir.createFile(path_src, .{ .read = true, .truncate = true });
-    defer file_src.close();
-    const fd_src = file_src.handle;
+    pub fn prep_send_zc_fixed(sqe: *Sqe, fd: linux.fd_t, buffer: []const u8, flags: linux.Msg, zc_flags: Sqe.SendRecv, buf_index: u16) void {
+        const zc_flags_fixed = blk: {
+            var updated_flags = zc_flags;
+            updated_flags.RECVSEND_FIXED_BUF = true;
+            break :blk updated_flags;
+        };
+        prep_send_zc(sqe, fd, buffer, flags, zc_flags_fixed);
+        sqe.buf_index = buf_index;
+    }
-    const path_dst = "test_io_uring_splice_dst";
-    const file_dst = try tmp.dir.createFile(path_dst, .{ .read = true, .truncate = true });
-    defer file_dst.close();
-    const fd_dst = file_dst.handle;
+    pub fn prep_sendmsg(
+        sqe: *Sqe,
+        fd: linux.fd_t,
+        msg: *const linux.msghdr_const,
+        flags: linux.Msg,
+    ) void {
+        sqe.prep_rw(.SENDMSG, fd, @intFromPtr(msg), 1, 0);
+        sqe.rw_flags = @bitCast(flags);
+    }
-    const buffer_write = [_]u8{97} ** 20;
-    var buffer_read = [_]u8{98} ** 20;
-    _ = try file_src.write(&buffer_write);
+    pub fn prep_sendmsg_zc(
+        sqe: *Sqe,
+        fd: linux.fd_t,
+        msg: *const linux.msghdr_const,
+        flags: linux.Msg,
+    ) void {
+        prep_sendmsg(sqe, fd, msg, flags);
+        sqe.opcode = .SENDMSG_ZC;
+    }
-    const fds = try posix.pipe();
-    const pipe_offset: u64 = std.math.maxInt(u64);
+    pub fn prep_openat(
+        sqe: *Sqe,
+        fd: linux.fd_t,
+        path: [*:0]const u8,
+        flags: linux.O,
+        mode: linux.mode_t,
+    ) void {
+        sqe.prep_rw(.OPENAT, fd, @intFromPtr(path), mode, 0);
+        sqe.rw_flags = @bitCast(flags);
+    }
-    const sqe_splice_to_pipe = try ring.splice(0x11111111, fd_src, 0, fds[1], pipe_offset, buffer_write.len);
-    try testing.expectEqual(Op.SPLICE, sqe_splice_to_pipe.opcode);
-    try testing.expectEqual(@as(u64, 0), sqe_splice_to_pipe.addr);
-    try testing.expectEqual(pipe_offset, sqe_splice_to_pipe.off);
-    sqe_splice_to_pipe.flags.IO_LINK = true;
+    pub fn prep_openat_direct(
+        sqe: *Sqe,
+        fd: linux.fd_t,
+        path: [*:0]const u8,
+        flags: linux.O,
+        mode: linux.mode_t,
+        file_index: u32,
+    ) void {
+        prep_openat(sqe, fd, path, flags, mode);
+        set_target_fixed_file(sqe, file_index);
+    }
-    const sqe_splice_from_pipe = try ring.splice(0x22222222, fds[0], pipe_offset, fd_dst, 10, buffer_write.len);
-    try testing.expectEqual(Op.SPLICE, sqe_splice_from_pipe.opcode);
-    try testing.expectEqual(pipe_offset, sqe_splice_from_pipe.addr);
-    try testing.expectEqual(@as(u64, 10), sqe_splice_from_pipe.off);
-    sqe_splice_from_pipe.flags.IO_LINK = true;
+    pub fn prep_close(sqe: *Sqe, fd: linux.fd_t) void {
+        sqe.* = .{
+            .opcode = .CLOSE,
+            .flags = .{},
+            .ioprio = .init_empty(),
+            .fd = fd,
+            .off = 0,
+            .addr = 0,
+            .len = 0,
+            .rw_flags = 0,
+            .user_data = 0,
+            .buf_index = 0,
+            .personality = 0,
+            .splice_fd_in = 0,
+            .addr3 = 0,
+            .resv = 0,
+        };
+    }
-    const sqe_read = try ring.read(0x33333333, fd_dst, .{ .buffer = buffer_read[0..]
 }, 10);
-    try testing.expectEqual(Op.READ, sqe_read.opcode);
-    try testing.expectEqual(@as(u64, 10), sqe_read.off);
-    try testing.expectEqual(@as(u32, 3), try ring.submit());
+    pub fn prep_close_direct(sqe: *Sqe, file_index: u32) void {
+        prep_close(sqe, 0);
+        set_target_fixed_file(sqe, file_index);
+    }
-    const cqe_splice_to_pipe = try ring.copy_cqe();
-    const cqe_splice_from_pipe = try ring.copy_cqe();
-    const cqe_read = try ring.copy_cqe();
-    // Prior to Linux Kernel 5.6 this is the only way to test for splice/read support:
-    // https://lwn.net/Articles/809820/
-    if (cqe_splice_to_pipe.err() == .INVAL) return error.SkipZigTest;
-    if (cqe_splice_from_pipe.err() == .INVAL) return error.SkipZigTest;
-    if (cqe_read.err() == .INVAL) return error.SkipZigTest;
-    try testing.expectEqual(Cqe{
-        .user_data = 0x11111111,
-        .res = buffer_write.len,
-        .flags = .{},
-    }, cqe_splice_to_pipe);
-    try testing.expectEqual(Cqe{
-        .user_data = 0x22222222,
-        .res = buffer_write.len,
-        .flags = .{},
-    }, cqe_splice_from_pipe);
-    try testing.expectEqual(Cqe{
-        .user_data = 0x33333333,
-        .res = buffer_read.len,
-        .flags = .{},
-    }, cqe_read);
-    try testing.expectEqualSlices(u8, buffer_write[0..], buffer_read[0..]);
-}
+    pub fn prep_timeout(
+        sqe: *Sqe,
+        ts: *const linux.kernel_timespec,
+        count: u32,
+        flags: uflags.Timeout,
+    ) void {
+        sqe.prep_rw(.TIMEOUT, -1, @intFromPtr(ts), 1, count);
+        sqe.rw_flags = @bitCast(flags);
+    }
-test "write_fixed/read_fixed" {
-    if (!is_linux) return error.SkipZigTest;
+    pub fn prep_timeout_remove(sqe: *Sqe, timeout_user_data: u64, flags: uflags.Timeout) void {
+        sqe.* = .{
+            .opcode = .TIMEOUT_REMOVE,
+            .flags = .{},
+            .ioprio = .init_empty(),
+            .fd = -1,
+            .off = 0,
+            .addr = timeout_user_data,
+            .len = 0,
+            .rw_flags = @bitCast(flags),
+            .user_data = 0,
+            .buf_index = 0,
+            .personality = 0,
+            .splice_fd_in = 0,
+            .addr3 = 0,
+            .resv = 0,
+        };
+    }
-    var ring = IoUring.init(2, .{}) catch |err| switch (err) {
-        error.SystemOutdated => return error.SkipZigTest,
-        error.PermissionDenied => return error.SkipZigTest,
-        else => return err,
-    };
-    defer ring.deinit();
+    pub fn prep_link_timeout(
+        sqe: *Sqe,
+        ts: *const linux.kernel_timespec,
+        flags: uflags.Timeout,
+    ) void {
+        sqe.prep_rw(.LINK_TIMEOUT, -1, @intFromPtr(ts), 1, 0);
+        sqe.rw_flags = @bitCast(flags);
+    }
-    var tmp = std.testing.tmpDir(.{});
-    defer tmp.cleanup();
+    pub fn prep_poll_add(
+        sqe: *Sqe,
+        fd: linux.fd_t,
+        poll_mask: linux.POLL,
+    ) void {
+        sqe.prep_rw(.POLL_ADD, fd, @intFromPtr(@as(?*anyopaque, null)), 0, 0);
+        // Poll masks previously used to comprise of 16 bits in the flags union of
+        // a SQE, but were then extended to comprise of 32 bits in order to make
+        // room for additional option flags. To ensure that the correct bits of
+        // poll masks are consistently and properly read across multiple kernel
+        // versions, poll masks are enforced to be little-endian.
+        // https://www.spinics.net/lists/io-uring/msg02848.html
+        sqe.rw_flags = std.mem.nativeToLittle(u32, @as(u32, @bitCast(poll_mask)));
+    }
-    const path = "test_io_uring_write_read_fixed";
-    const file = try tmp.dir.createFile(path, .{ .read = true, .truncate = true });
-    defer file.close();
-    const fd = file.handle;
-
-    var raw_buffers: [2][11]u8 = undefined;
-    // First buffer will be written to the file.
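One subtlety of `prep_link_timeout`: it only bounds the SQE it is chained to, so the preceding request must set `IO_LINK`. A hedged sketch of a recv bounded by a one-second timeout, assuming the queue helpers keep their current shapes:

const std = @import("std");
const linux = std.os.linux;

fn recvWithDeadline(ring: *linux.IoUring, fd: linux.fd_t, buf: []u8) !void {
    const ts: linux.kernel_timespec = .{ .sec = 1, .nsec = 0 };
    const recv_sqe = try ring.recv(0x1, fd, .{ .buffer = buf }, .{});
    recv_sqe.flags.IO_LINK = true; // the next SQE becomes this one's timeout
    const timeout_sqe = try ring.get_sqe();
    timeout_sqe.prep_link_timeout(&ts, .{});
    timeout_sqe.user_data = 0x2;
    _ = try ring.submit(); // the kernel reads `ts` during submit
}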
- @memset(&raw_buffers[0], 'z'); - raw_buffers[0][0.."foobar".len].* = "foobar".*; - - var buffers = [2]posix.iovec{ - .{ .base = &raw_buffers[0], .len = raw_buffers[0].len }, - .{ .base = &raw_buffers[1], .len = raw_buffers[1].len }, - }; - ring.register_buffers(&buffers) catch |err| switch (err) { - error.SystemResources => { - // See https://github.com/ziglang/zig/issues/15362 - return error.SkipZigTest; - }, - else => |e| return e, - }; - - const sqe_write = try ring.write_fixed(0x45454545, fd, &buffers[0], 3, 0); - try testing.expectEqual(Op.WRITE_FIXED, sqe_write.opcode); - try testing.expectEqual(@as(u64, 3), sqe_write.off); - sqe_write.flags.IO_LINK = true; - - const sqe_read = try ring.read_fixed(0x12121212, fd, &buffers[1], 0, 1); - try testing.expectEqual(Op.READ_FIXED, sqe_read.opcode); - try testing.expectEqual(@as(u64, 0), sqe_read.off); - - try testing.expectEqual(@as(u32, 2), try ring.submit()); - - const cqe_write = try ring.copy_cqe(); - const cqe_read = try ring.copy_cqe(); - - try testing.expectEqual(Cqe{ - .user_data = 0x45454545, - .res = @as(i32, @intCast(buffers[0].len)), - .flags = .{}, - }, cqe_write); - try testing.expectEqual(Cqe{ - .user_data = 0x12121212, - .res = @as(i32, @intCast(buffers[1].len)), - .flags = .{}, - }, cqe_read); - - try testing.expectEqualSlices(u8, "\x00\x00\x00", buffers[1].base[0..3]); - try testing.expectEqualSlices(u8, "foobar", buffers[1].base[3..9]); - try testing.expectEqualSlices(u8, "zz", buffers[1].base[9..11]); -} - -test "openat" { - if (!is_linux) return error.SkipZigTest; - - var ring = IoUring.init(1, .{}) catch |err| switch (err) { - error.SystemOutdated => return error.SkipZigTest, - error.PermissionDenied => return error.SkipZigTest, - else => return err, - }; - defer ring.deinit(); - - var tmp = std.testing.tmpDir(.{}); - defer tmp.cleanup(); - - const path = "test_io_uring_openat"; - - // Workaround for LLVM bug: https://github.com/ziglang/zig/issues/12014 - const path_addr = if (builtin.zig_backend == .stage2_llvm) p: { - var workaround = path; - _ = &workaround; - break :p @intFromPtr(workaround); - } else @intFromPtr(path); - - const flags: linux.O = .{ .CLOEXEC = true, .ACCMODE = .RDWR, .CREAT = true }; - const mode: posix.mode_t = 0o666; - const sqe_openat = try ring.openat(0x33333333, tmp.dir.fd, path, flags, mode); - try testing.expectEqual(Sqe{ - .opcode = .OPENAT, - .flags = .{}, - .ioprio = .init_empty(), - .fd = tmp.dir.fd, - .off = 0, - .addr = path_addr, - .len = mode, - .rw_flags = @bitCast(flags), - .user_data = 0x33333333, - .buf_index = 0, - .personality = 0, - .splice_fd_in = 0, - .addr3 = 0, - .resv = 0, - }, sqe_openat.*); - try testing.expectEqual(@as(u32, 1), try ring.submit()); - - const cqe_openat = try ring.copy_cqe(); - try testing.expectEqual(@as(u64, 0x33333333), cqe_openat.user_data); - if (cqe_openat.err() == .INVAL) return error.SkipZigTest; - if (cqe_openat.err() == .BADF) return error.SkipZigTest; - if (cqe_openat.res <= 0) std.debug.print("\ncqe_openat.res={}\n", .{cqe_openat.res}); - try testing.expect(cqe_openat.res > 0); - try testing.expectEqual(@as(Cqe.Flags, @bitCast(@as(u32, 0))), cqe_openat.flags); - - posix.close(cqe_openat.res); -} - -test "close" { - if (!is_linux) return error.SkipZigTest; - - var ring = IoUring.init(1, .{}) catch |err| switch (err) { - error.SystemOutdated => return error.SkipZigTest, - error.PermissionDenied => return error.SkipZigTest, - else => return err, - }; - defer ring.deinit(); - - var tmp = std.testing.tmpDir(.{}); - defer tmp.cleanup(); - - const 
path = "test_io_uring_close"; - const file = try tmp.dir.createFile(path, .{}); - errdefer file.close(); - - const sqe_close = try ring.close(0x44444444, file.handle); - try testing.expectEqual(Op.CLOSE, sqe_close.opcode); - try testing.expectEqual(file.handle, sqe_close.fd); - try testing.expectEqual(@as(u32, 1), try ring.submit()); - - const cqe_close = try ring.copy_cqe(); - if (cqe_close.err() == .INVAL) return error.SkipZigTest; - try testing.expectEqual(Cqe{ - .user_data = 0x44444444, - .res = 0, - .flags = .{}, - }, cqe_close); -} - -test "accept/connect/send/recv" { - if (!is_linux) return error.SkipZigTest; - - var ring = IoUring.init(16, .{}) catch |err| switch (err) { - error.SystemOutdated => return error.SkipZigTest, - error.PermissionDenied => return error.SkipZigTest, - else => return err, - }; - defer ring.deinit(); - - const socket_test_harness = try createSocketTestHarness(&ring); - defer socket_test_harness.close(); - - const buffer_send = [_]u8{ 1, 0, 1, 0, 1, 0, 1, 0, 1, 0 }; - var buffer_recv = [_]u8{ 0, 1, 0, 1, 0 }; - - const sqe_send = try ring.send(0xeeeeeeee, socket_test_harness.client, buffer_send[0..], 0); - sqe_send.flags.IO_LINK = true; - _ = try ring.recv(0xffffffff, socket_test_harness.server, .{ .buffer = buffer_recv[0..] }, 0); - try testing.expectEqual(@as(u32, 2), try ring.submit()); - - const cqe_send = try ring.copy_cqe(); - if (cqe_send.err() == .INVAL) return error.SkipZigTest; - try testing.expectEqual(Cqe{ - .user_data = 0xeeeeeeee, - .res = buffer_send.len, - .flags = .{}, - }, cqe_send); - - const cqe_recv = try ring.copy_cqe(); - if (cqe_recv.err() == .INVAL) return error.SkipZigTest; - try testing.expectEqual(Cqe{ - .user_data = 0xffffffff, - .res = buffer_recv.len, - // ignore IORING_CQE_F_SOCK_NONEMPTY since it is only set on some systems - .flags = cqe_recv.flags & linux.IORING_CQE_F_SOCK_NONEMPTY, - }, cqe_recv); - - try testing.expectEqualSlices(u8, buffer_send[0..buffer_recv.len], buffer_recv[0..]); -} - -test "sendmsg/recvmsg" { - if (!is_linux) return error.SkipZigTest; - - var ring = IoUring.init(2, .{}) catch |err| switch (err) { - error.SystemOutdated => return error.SkipZigTest, - error.PermissionDenied => return error.SkipZigTest, - else => return err, - }; - defer ring.deinit(); - - var address_server: linux.sockaddr.in = .{ - .port = 0, - .addr = @bitCast([4]u8{ 127, 0, 0, 1 }), - }; - - const server = try posix.socket(address_server.family, posix.SOCK.DGRAM, 0); - defer posix.close(server); - try posix.setsockopt(server, posix.SOL.SOCKET, posix.SO.REUSEPORT, &mem.toBytes(@as(c_int, 1))); - try posix.setsockopt(server, posix.SOL.SOCKET, posix.SO.REUSEADDR, &mem.toBytes(@as(c_int, 1))); - try posix.bind(server, addrAny(&address_server), @sizeOf(linux.sockaddr.in)); - - // set address_server to the OS-chosen IP/port. 
- var slen: posix.socklen_t = @sizeOf(linux.sockaddr.in); - try posix.getsockname(server, addrAny(&address_server), &slen); - - const client = try posix.socket(address_server.family, posix.SOCK.DGRAM, 0); - defer posix.close(client); - - const buffer_send = [_]u8{42} ** 128; - const iovecs_send = [_]posix.iovec_const{ - posix.iovec_const{ .base = &buffer_send, .len = buffer_send.len }, - }; - const msg_send: linux.msghdr_const = .{ - .name = addrAny(&address_server), - .namelen = @sizeOf(linux.sockaddr.in), - .iov = &iovecs_send, - .iovlen = 1, - .control = null, - .controllen = 0, - .flags = 0, - }; - const sqe_sendmsg = try ring.sendmsg(0x11111111, client, &msg_send, 0); - sqe_sendmsg.flags.IO_LINK = true; - try testing.expectEqual(Op.SENDMSG, sqe_sendmsg.opcode); - try testing.expectEqual(client, sqe_sendmsg.fd); - - var buffer_recv = [_]u8{0} ** 128; - var iovecs_recv = [_]posix.iovec{ - posix.iovec{ .base = &buffer_recv, .len = buffer_recv.len }, - }; - var address_recv: linux.sockaddr.in = .{ - .port = 0, - .addr = 0, - }; - var msg_recv: linux.msghdr = .{ - .name = addrAny(&address_recv), - .namelen = @sizeOf(linux.sockaddr.in), - .iov = &iovecs_recv, - .iovlen = 1, - .control = null, - .controllen = 0, - .flags = 0, - }; - const sqe_recvmsg = try ring.recvmsg(0x22222222, server, &msg_recv, 0); - try testing.expectEqual(Op.RECVMSG, sqe_recvmsg.opcode); - try testing.expectEqual(server, sqe_recvmsg.fd); - - try testing.expectEqual(@as(u32, 2), ring.sq_ready()); - try testing.expectEqual(@as(u32, 2), try ring.submit_and_wait(2)); - try testing.expectEqual(@as(u32, 0), ring.sq_ready()); - try testing.expectEqual(@as(u32, 2), ring.cq_ready()); - - const cqe_sendmsg = try ring.copy_cqe(); - if (cqe_sendmsg.res == -@as(i32, @intFromEnum(linux.E.INVAL))) return error.SkipZigTest; - try testing.expectEqual(Cqe{ - .user_data = 0x11111111, - .res = buffer_send.len, - .flags = .{}, - }, cqe_sendmsg); - - const cqe_recvmsg = try ring.copy_cqe(); - if (cqe_recvmsg.res == -@as(i32, @intFromEnum(linux.E.INVAL))) return error.SkipZigTest; - try testing.expectEqual(Cqe{ - .user_data = 0x22222222, - .res = buffer_recv.len, - // ignore IORING_CQE_F_SOCK_NONEMPTY since it is set non-deterministically - .flags = cqe_recvmsg.flags & linux.IORING_CQE_F_SOCK_NONEMPTY, - }, cqe_recvmsg); - - try testing.expectEqualSlices(u8, buffer_send[0..buffer_recv.len], buffer_recv[0..]); -} - -test "timeout (after a relative time)" { - if (!is_linux) return error.SkipZigTest; - const io = std.testing.io; - - var ring = IoUring.init(1, .{}) catch |err| switch (err) { - error.SystemOutdated => return error.SkipZigTest, - error.PermissionDenied => return error.SkipZigTest, - else => return err, - }; - defer ring.deinit(); - - const ms = 10; - const margin = 5; - const ts: linux.kernel_timespec = .{ .sec = 0, .nsec = ms * 1000000 }; - - const started = try std.Io.Clock.awake.now(io); - const sqe = try ring.timeout(0x55555555, &ts, 0, 0); - try testing.expectEqual(Op.TIMEOUT, sqe.opcode); - try testing.expectEqual(@as(u32, 1), try ring.submit()); - const cqe = try ring.copy_cqe(); - const stopped = try std.Io.Clock.awake.now(io); - - try testing.expectEqual(Cqe{ - .user_data = 0x55555555, - .res = -@as(i32, @intFromEnum(linux.E.TIME)), - .flags = .{}, - }, cqe); - - // Tests should not depend on timings: skip test if outside margin. 
- const ms_elapsed = started.durationTo(stopped).toMilliseconds(); - if (ms_elapsed > margin) return error.SkipZigTest; -} - -test "timeout (after a number of completions)" { - if (!is_linux) return error.SkipZigTest; - - var ring = IoUring.init(2, .{}) catch |err| switch (err) { - error.SystemOutdated => return error.SkipZigTest, - error.PermissionDenied => return error.SkipZigTest, - else => return err, - }; - defer ring.deinit(); - - const ts: linux.kernel_timespec = .{ .sec = 3, .nsec = 0 }; - const count_completions: u64 = 1; - const sqe_timeout = try ring.timeout(0x66666666, &ts, count_completions, .{}); - try testing.expectEqual(Op.TIMEOUT, sqe_timeout.opcode); - try testing.expectEqual(count_completions, sqe_timeout.off); - _ = try ring.nop(0x77777777); - try testing.expectEqual(@as(u32, 2), try ring.submit()); - - const cqe_nop = try ring.copy_cqe(); - try testing.expectEqual(Cqe{ - .user_data = 0x77777777, - .res = 0, - .flags = .{}, - }, cqe_nop); - - const cqe_timeout = try ring.copy_cqe(); - try testing.expectEqual(Cqe{ - .user_data = 0x66666666, - .res = 0, - .flags = .{}, - }, cqe_timeout); -} - -test "timeout_remove" { - if (!is_linux) return error.SkipZigTest; - - var ring = IoUring.init(2, .{}) catch |err| switch (err) { - error.SystemOutdated => return error.SkipZigTest, - error.PermissionDenied => return error.SkipZigTest, - else => return err, - }; - defer ring.deinit(); - - const ts: linux.kernel_timespec = .{ .sec = 3, .nsec = 0 }; - const sqe_timeout = try ring.timeout(0x88888888, &ts, 0, .{}); - try testing.expectEqual(Op.TIMEOUT, sqe_timeout.opcode); - try testing.expectEqual(@as(u64, 0x88888888), sqe_timeout.user_data); - - const sqe_timeout_remove = try ring.timeout_remove(0x99999999, 0x88888888, .{}); - try testing.expectEqual(Op.TIMEOUT_REMOVE, sqe_timeout_remove.opcode); - try testing.expectEqual(@as(u64, 0x88888888), sqe_timeout_remove.addr); - try testing.expectEqual(@as(u64, 0x99999999), sqe_timeout_remove.user_data); - - try testing.expectEqual(@as(u32, 2), try ring.submit()); - - // The order in which the CQE arrive is not clearly documented and it changed with kernel 5.18: - // * kernel 5.10 gives user data 0x88888888 first, 0x99999999 second - // * kernel 5.18 gives user data 0x99999999 first, 0x88888888 second - - var cqes: [2]Cqe = undefined; - cqes[0] = try ring.copy_cqe(); - cqes[1] = try ring.copy_cqe(); - - for (cqes) |cqe| { - // IORING_OP_TIMEOUT_REMOVE is not supported by this kernel version: - // Timeout remove operations set the fd to -1, which results in EBADF before EINVAL. - // We use IORING_FEAT_RW_CUR_POS as a safety check here to make sure we are at least pre-5.6. - // We don't want to skip this test for newer kernels. 
- if (cqe.user_data == 0x99999999 and - cqe.err() == .BADF and - (!ring.features.RW_CUR_POS)) - { - return error.SkipZigTest; - } - - try testing.expect(cqe.user_data == 0x88888888 or cqe.user_data == 0x99999999); + pub fn prep_poll_remove( + sqe: *Sqe, + target_user_data: u64, + ) void { + sqe.prep_rw(.POLL_REMOVE, -1, target_user_data, 0, 0); + } - if (cqe.user_data == 0x88888888) { - try testing.expectEqual(Cqe{ - .user_data = 0x88888888, - .res = -@as(i32, @intFromEnum(linux.E.CANCELED)), - .flags = .{}, - }, cqe); - } else if (cqe.user_data == 0x99999999) { - try testing.expectEqual(Cqe{ - .user_data = 0x99999999, - .res = 0, - .flags = .{}, - }, cqe); - } + pub fn prep_poll_update( + sqe: *Sqe, + old_user_data: u64, + new_user_data: u64, + poll_mask: linux.POLL, + flags: uflags.Poll, + ) void { + sqe.prep_rw(.POLL_REMOVE, -1, old_user_data, flags, new_user_data); + // Poll masks previously used to comprise of 16 bits in the flags union of + // a SQE, but were then extended to comprise of 32 bits in order to make + // room for additional option flags. To ensure that the correct bits of + // poll masks are consistently and properly read across multiple kernel + // versions, poll masks are enforced to be little-endian. + // https://www.spinics.net/lists/io-uring/msg02848.html + sqe.rw_flags = std.mem.nativeToLittle(u32, poll_mask); } -} -test "accept/connect/recv/link_timeout" { - if (!is_linux) return error.SkipZigTest; + pub fn prep_fallocate( + sqe: *Sqe, + fd: linux.fd_t, + mode: i32, + offset: u64, + len: u64, + ) void { + sqe.* = .{ + .opcode = .FALLOCATE, + .flags = .{}, + .ioprio = .init_empty(), + .fd = fd, + .off = offset, + .addr = len, + .len = @intCast(mode), + .rw_flags = 0, + .user_data = 0, + .buf_index = 0, + .personality = 0, + .splice_fd_in = 0, + .addr3 = 0, + .resv = 0, + }; + } - var ring = IoUring.init(16, .{}) catch |err| switch (err) { - error.SystemOutdated => return error.SkipZigTest, - error.PermissionDenied => return error.SkipZigTest, - else => return err, - }; - defer ring.deinit(); + pub fn prep_statx( + sqe: *Sqe, + fd: linux.fd_t, + path: [*:0]const u8, + flags: linux.At, + mask: linux.Statx.Mask, + buf: *linux.Statx, + ) void { + sqe.prep_rw(.STATX, fd, @intFromPtr(path), @as(u32, @bitCast(mask)), @intFromPtr(buf)); + sqe.rw_flags = @bitCast(flags); + } - const socket_test_harness = try createSocketTestHarness(&ring); - defer socket_test_harness.close(); + pub fn prep_cancel( + sqe: *Sqe, + cancel_user_data: u64, + flags: uflags.AsyncCancel, + ) void { + sqe.prep_rw(.ASYNC_CANCEL, -1, cancel_user_data, 0, 0); + sqe.rw_flags = @bitCast(flags); + } - var buffer_recv = [_]u8{ 0, 1, 0, 1, 0 }; + pub fn prep_cancel_fd( + sqe: *Sqe, + fd: linux.fd_t, + flags: uflags.AsyncCancel, + ) void { + sqe.prep_rw(.ASYNC_CANCEL, fd, 0, 0, 0); + const enable_cancel_fd = blk: { + var update_flags = flags; + update_flags.CANCEL_FD = true; + break :blk update_flags; + }; + sqe.rw_flags = @bitCast(enable_cancel_fd); + } - const sqe_recv = try ring.recv(0xffffffff, socket_test_harness.server, .{ .buffer = buffer_recv[0..] 
}, 0); - sqe_recv.flags.IO_LINK = true; + pub fn prep_shutdown( + sqe: *Sqe, + sockfd: linux.socket_t, + how: linux.Shut, + ) void { + sqe.prep_rw(.SHUTDOWN, sockfd, 0, @intFromEnum(how), 0); + } - const ts = linux.kernel_timespec{ .sec = 0, .nsec = 1000000 }; - _ = try ring.link_timeout(0x22222222, &ts, 0); + pub fn prep_renameat( + sqe: *Sqe, + old_dir_fd: linux.fd_t, + old_path: [*:0]const u8, + new_dir_fd: linux.fd_t, + new_path: [*:0]const u8, + flags: linux.RenameFlags, + ) void { + sqe.prep_rw( + .RENAMEAT, + old_dir_fd, + @intFromPtr(old_path), + 0, + @intFromPtr(new_path), + ); + sqe.len = @bitCast(new_dir_fd); + sqe.rw_flags = @bitCast(flags); + } - const nr_wait = try ring.submit(); - try testing.expectEqual(@as(u32, 2), nr_wait); + pub fn prep_unlinkat( + sqe: *Sqe, + dir_fd: linux.fd_t, + path: [*:0]const u8, + flags: linux.At, // TODO: unlink flags only AT_REMOVEDIR + ) void { + sqe.prep_rw(.UNLINKAT, dir_fd, @intFromPtr(path), 0, 0); + sqe.rw_flags = @bitCast(flags); + } - var i: usize = 0; - while (i < nr_wait) : (i += 1) { - const cqe = try ring.copy_cqe(); - switch (cqe.user_data) { - 0xffffffff => { - if (cqe.res != -@as(i32, @intFromEnum(linux.E.INTR)) and - cqe.res != -@as(i32, @intFromEnum(linux.E.CANCELED))) - { - std.debug.print("Req 0x{x} got {d}\n", .{ cqe.user_data, cqe.res }); - try testing.expect(false); - } - }, - 0x22222222 => { - if (cqe.res != -@as(i32, @intFromEnum(linux.E.ALREADY)) and - cqe.res != -@as(i32, @intFromEnum(linux.E.TIME))) - { - std.debug.print("Req 0x{x} got {d}\n", .{ cqe.user_data, cqe.res }); - try testing.expect(false); - } - }, - else => @panic("should not happen"), - } + pub fn prep_mkdirat( + sqe: *Sqe, + dir_fd: linux.fd_t, + path: [*:0]const u8, + mode: linux.mode_t, + ) void { + sqe.prep_rw(.MKDIRAT, dir_fd, @intFromPtr(path), mode, 0); } -} -test "fallocate" { - if (!is_linux) return error.SkipZigTest; + pub fn prep_symlinkat( + sqe: *Sqe, + target: [*:0]const u8, + new_dir_fd: linux.fd_t, + link_path: [*:0]const u8, + ) void { + sqe.prep_rw( + .SYMLINKAT, + new_dir_fd, + @intFromPtr(target), + 0, + @intFromPtr(link_path), + ); + } - var ring = IoUring.init(1, .{}) catch |err| switch (err) { - error.SystemOutdated => return error.SkipZigTest, - error.PermissionDenied => return error.SkipZigTest, - else => return err, - }; - defer ring.deinit(); + pub fn prep_linkat( + sqe: *Sqe, + old_dir_fd: linux.fd_t, + old_path: [*:0]const u8, + new_dir_fd: linux.fd_t, + new_path: [*:0]const u8, + flags: linux.At, // only AT_EMPTY_PATH, AT_SYMLINK_FOLLOW + ) void { + sqe.prep_rw( + .LINKAT, + old_dir_fd, + @intFromPtr(old_path), + 0, + @intFromPtr(new_path), + ); + sqe.len = @bitCast(new_dir_fd); + sqe.rw_flags = @bitCast(flags); + } - var tmp = std.testing.tmpDir(.{}); - defer tmp.cleanup(); + pub fn prep_files_update( + sqe: *Sqe, + fds: []const linux.fd_t, + offset: u32, + ) void { + sqe.prep_rw(.FILES_UPDATE, -1, @intFromPtr(fds.ptr), fds.len, @intCast(offset)); + } - const path = "test_io_uring_fallocate"; - const file = try tmp.dir.createFile(path, .{ .truncate = true, .mode = 0o666 }); - defer file.close(); + pub fn prep_files_update_alloc( + sqe: *Sqe, + fds: []linux.fd_t, + ) void { + sqe.prep_rw(.FILES_UPDATE, -1, @intFromPtr(fds.ptr), fds.len, constants.FILE_INDEX_ALLOC); + } - try testing.expectEqual(@as(u64, 0), (try file.stat()).size); + // TODO: why can't slice be used here ? 
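+ // A plausible answer to the TODO above (an editor's note, not upstream
+ // documentation): `buffers` is the base address of `num` contiguous
+ // buffers of `buffer_len` bytes each, so a single `[]u8` slice cannot
+ // carry both lengths; the kernel only needs the base pointer plus the
+ // two counts. A minimal usage sketch, with a hypothetical backing array:
+ //
+ //     var bufs: [4][128]u8 = undefined;
+ //     sqe.prep_provide_buffers(@ptrCast(&bufs), 128, bufs.len, 1337, 0);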
+ pub fn prep_provide_buffers(
+ sqe: *Sqe,
+ buffers: [*]u8,
+ buffer_len: usize,
+ num: usize,
+ group_id: usize,
+ buffer_id: usize,
+ ) void {
+ const ptr = @intFromPtr(buffers);
+ sqe.prep_rw(.PROVIDE_BUFFERS, @intCast(num), ptr, buffer_len, buffer_id);
+ sqe.buf_index = @intCast(group_id);
+ }
- const len: u64 = 65536;
- const sqe = try ring.fallocate(0xaaaaaaaa, file.handle, 0, 0, len);
- try testing.expectEqual(Op.FALLOCATE, sqe.opcode);
- try testing.expectEqual(file.handle, sqe.fd);
- try testing.expectEqual(@as(u32, 1), try ring.submit());
+ pub fn prep_remove_buffers(
+ sqe: *Sqe,
+ num: usize,
+ group_id: usize,
+ ) void {
+ sqe.prep_rw(.REMOVE_BUFFERS, @intCast(num), 0, 0, 0);
+ sqe.buf_index = @intCast(group_id);
+ }
- const cqe = try ring.copy_cqe();
- switch (cqe.err()) {
- .SUCCESS => {},
- // This kernel's io_uring does not yet implement fallocate():
- .INVAL => return error.SkipZigTest,
- // This kernel does not implement fallocate():
- .NOSYS => return error.SkipZigTest,
- // The filesystem containing the file referred to by fd does not support this operation;
- // or the mode is not supported by the filesystem containing the file referred to by fd:
- .OPNOTSUPP => return error.SkipZigTest,
- else => |errno| std.debug.panic("unhandled errno: {}", .{errno}),
+ pub fn prep_socket(
+ sqe: *Sqe,
+ domain: linux.Af,
+ socket_type: linux.Sock,
+ protocol: linux.IpProto,
+ /// flags is unused
+ flags: u32,
+ ) void {
+ sqe.prep_rw(.SOCKET, @intFromEnum(domain), 0, @intFromEnum(protocol), @intCast(@as(u32, @bitCast(socket_type))));
+ sqe.rw_flags = flags;
+ }
- }
- try testing.expectEqual(Cqe{
- .user_data = 0xaaaaaaaa,
- .res = 0,
- .flags = .{},
- }, cqe);
- try testing.expectEqual(len, (try file.stat()).size);
-}
+ pub fn prep_socket_direct(
+ sqe: *Sqe,
+ domain: linux.Af,
+ socket_type: linux.Sock,
+ protocol: linux.IpProto, // Enumerate https://github.com/kraj/musl/blob/kraj/master/src/network/proto.c#L7
+ /// flags is unused
+ flags: u32,
+ file_index: u32,
+ ) void {
+ prep_socket(sqe, domain, socket_type, protocol, flags);
+ set_target_fixed_file(sqe, file_index);
+ }
-test "statx" {
- if (!is_linux) return error.SkipZigTest;
+ pub fn prep_socket_direct_alloc(
+ sqe: *Sqe,
+ domain: linux.Af,
+ socket_type: linux.Sock,
+ protocol: linux.IpProto, // Enumerate https://github.com/kraj/musl/blob/kraj/master/src/network/proto.c#L7
+ /// flags is unused
+ flags: u32,
+ ) void {
+ prep_socket(sqe, domain, socket_type, protocol, flags);
+ set_target_fixed_file(sqe, constants.FILE_INDEX_ALLOC);
+ }
- var ring = IoUring.init(1, .{}) catch |err| switch (err) {
- error.SystemOutdated => return error.SkipZigTest,
- error.PermissionDenied => return error.SkipZigTest,
- else => return err,
- };
- defer ring.deinit();
+ pub fn prep_waitid(
+ sqe: *Sqe,
+ id_type: linux.P,
+ id: i32,
+ infop: *linux.siginfo_t,
+ options: linux.W,
+ flags: u32, // flags is unused
+ ) void {
+ sqe.prep_rw(.WAITID, id, 0, @intFromEnum(id_type), @intFromPtr(infop));
+ sqe.rw_flags = flags;
+ sqe.splice_fd_in = @bitCast(options);
+ }
- var tmp = std.testing.tmpDir(.{});
- defer tmp.cleanup();
- const path = "test_io_uring_statx";
- const file = try tmp.dir.createFile(path, .{ .truncate = true, .mode = 0o666 });
- defer file.close();
+ // TODO: maybe remove unused flag fields?
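+ // Note on the TODO above: BIND and LISTEN take no flag bits in the kernel
+ // API today, so the `flags` parameters below exist purely for call-site
+ // symmetry and should be passed as 0. A minimal sketch, assuming `sqe` and
+ // a hypothetical listening socket `listen_fd`:
+ //
+ //     sqe.prep_listen(listen_fd, 128, 0);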
+ pub fn prep_bind(
+ sqe: *Sqe,
+ fd: linux.fd_t,
+ addr: *const linux.sockaddr,
+ addrlen: linux.socklen_t,
+ flags: u32, // flags is unused and doesn't exist in io_uring's api
+ ) void {
+ sqe.prep_rw(.BIND, fd, @intFromPtr(addr), 0, addrlen);
+ sqe.rw_flags = flags;
+ }
- try testing.expectEqual(@as(u64, 0), (try file.stat()).size);
+ pub fn prep_listen(
+ sqe: *Sqe,
+ fd: linux.fd_t,
+ backlog: usize,
+ flags: u32, // flags is unused and doesn't exist in io_uring's api
+ ) void {
+ sqe.prep_rw(.LISTEN, fd, 0, backlog, 0);
+ sqe.rw_flags = flags;
+ }
- try file.writeAll("foobar");
+ pub fn prep_cmd_sock(
+ sqe: *Sqe,
+ cmd_op: SocketOp,
+ fd: linux.fd_t,
+ level: linux.SOL,
+ optname: linux.SO,
+ optval: u64,
+ optlen: u32,
+ ) void {
+ sqe.prep_rw(.URING_CMD, fd, 0, 0, 0);
+ // off is overloaded with cmd_op, https://github.com/axboe/liburing/blob/e1003e496e66f9b0ae06674869795edf772d5500/src/include/liburing/io_uring.h#L39
+ sqe.off = @intFromEnum(cmd_op);
+ // addr is overloaded, https://github.com/axboe/liburing/blob/e1003e496e66f9b0ae06674869795edf772d5500/src/include/liburing/io_uring.h#L46
+ sqe.addr = @bitCast(packed struct {
+ level: u32,
+ optname: u32,
+ }{
+ .level = level,
+ .optname = optname,
+ });
+ // splice_fd_in is overloaded u32 -> i32
+ sqe.splice_fd_in = @bitCast(optlen);
+ // addr3 is overloaded, https://github.com/axboe/liburing/blob/e1003e496e66f9b0ae06674869795edf772d5500/src/include/liburing/io_uring.h#L102
+ sqe.addr3 = optval;
+ }
- var buf: linux.Statx = undefined;
- const sqe = try ring.statx(
- 0xaaaaaaaa,
- tmp.dir.fd,
- path,
- .{},
- .{ .size = true },
- &buf,
- );
- try testing.expectEqual(Op.STATX, sqe.opcode);
- try testing.expectEqual(@as(i32, tmp.dir.fd), sqe.fd);
- try testing.expectEqual(@as(u32, 1), try ring.submit());
+ pub fn set_flags(sqe: *Sqe, flags: Sqe.IoSqe) void {
+ const updated_flags = @as(u8, @bitCast(sqe.flags)) | @as(u8, @bitCast(flags));
+ sqe.flags = @bitCast(updated_flags);
+ }
- const cqe = try ring.copy_cqe();
- switch (cqe.err()) {
- .SUCCESS => {},
- // This kernel's io_uring does not yet implement statx():
- .INVAL => return error.SkipZigTest,
- // This kernel does not implement statx():
- .NOSYS => return error.SkipZigTest,
- // The filesystem containing the file referred to by fd does not support this operation;
- // or the mode is not supported by the filesystem containing the file referred to by fd:
- .OPNOTSUPP => return error.SkipZigTest,
- // not supported on older kernels (5.4)
- .BADF => return error.SkipZigTest,
- else => |errno| std.debug.panic("unhandled errno: {}", .{errno}),
- }
+ /// This SQE forms a link with the next SQE in the submission ring. Next SQE
+ /// will not be started before this one completes. Forms a chain of SQEs.
+ pub fn link_next(sqe: *Sqe) void {
+ sqe.flags.IO_LINK = true;
+ }
- try testing.expectEqual(Cqe{
- .user_data = 0xaaaaaaaa,
- .res = 0,
- .flags = .{},
- }, cqe);
+};
- try testing.expect(buf.mask.size);
- try testing.expectEqual(@as(u64, 6), buf.size);
-}
+/// matches io_uring_sq in liburing
+pub const Sq = struct {
+ head: *u32,
+ tail: *u32,
+ mask: u32,
+ flags: *Flags,
+ dropped: *u32,
+ array: []u32,
+ sqes: []Sqe,
+ mmap: []align(page_size_min) u8,
+ mmap_sqes: []align(page_size_min) u8,
-test "accept/connect/recv/cancel" {
- if (!is_linux) return error.SkipZigTest;
+ // We use `sqe_head` and `sqe_tail` in the same way as liburing:
+ // We increment `sqe_tail` (but not `tail`) for each call to `get_sqe()`.
+ // We then set `tail` to `sqe_tail` once, only when these events are actually submitted. + // This allows us to amortize the cost of the @atomicStore to `tail` across multiple SQEs. + sqe_head: u32 = 0, + sqe_tail: u32 = 0, - var ring = IoUring.init(16, .{}) catch |err| switch (err) { - error.SystemOutdated => return error.SkipZigTest, - error.PermissionDenied => return error.SkipZigTest, - else => return err, + /// sq_ring.flags + pub const Flags = packed struct(u32) { + /// needs io_uring_enter wakeup + NEED_WAKEUP: bool = false, + /// CQ ring is overflown + CQ_OVERFLOW: bool = false, + /// task should enter the kernel + TASKRUN: bool = false, + _unused: u29 = 0, }; - defer ring.deinit(); - const socket_test_harness = try createSocketTestHarness(&ring); - defer socket_test_harness.close(); + pub fn init(fd: posix.fd_t, p: Params) !Sq { + assert(fd >= 0); + assert(p.features.SINGLE_MMAP); + const size = @max( + p.sq_off.array + p.sq_entries * @sizeOf(u32), + p.cq_off.cqes + p.cq_entries * @sizeOf(Cqe), + ); + const mmap = try posix.mmap( + null, + size, + posix.PROT.READ | posix.PROT.WRITE, + .{ .TYPE = .SHARED, .POPULATE = true }, + fd, + constants.OFF_SQ_RING, + ); + errdefer posix.munmap(mmap); + assert(mmap.len == size); - var buffer_recv = [_]u8{ 0, 1, 0, 1, 0 }; + // The motivation for the `sqes` and `array` indirection is to make it possible for the + // application to preallocate static io_uring_sqe entries and then replay them when needed. + const size_sqes = p.sq_entries * @sizeOf(Sqe); + const mmap_sqes = try posix.mmap( + null, + size_sqes, + posix.PROT.READ | posix.PROT.WRITE, + .{ .TYPE = .SHARED, .POPULATE = true }, + fd, + constants.OFF_SQES, + ); + errdefer posix.munmap(mmap_sqes); + assert(mmap_sqes.len == size_sqes); - _ = try ring.recv(0xffffffff, socket_test_harness.server, .{ .buffer = buffer_recv[0..] }, 0); - try testing.expectEqual(@as(u32, 1), try ring.submit()); + const array: [*]u32 = @ptrCast(@alignCast(&mmap[p.sq_off.array])); + const sqes: [*]Sqe = @ptrCast(@alignCast(&mmap_sqes[0])); + // We expect the kernel copies p.sq_entries to the u32 pointed to by p.sq_off.ring_entries, + // see https://github.com/torvalds/linux/blob/v5.8/fs/io_uring.c#L7843-L7844. 
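+ // Editor's sketch of how these mapped fields are used after init (not
+ // part of this function, and `my_sqe` is hypothetical): a submission
+ // writes the next SQE slot, then publishes it by advancing the shared
+ // tail with a release store:
+ //
+ //     sq.sqes[sq.sqe_tail & sq.mask] = my_sqe;
+ //     sq.sqe_tail +%= 1;
+ //     @atomicStore(u32, sq.tail, sq.sqe_tail, .release);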
+ assert(p.sq_entries == @as(*u32, @ptrCast(@alignCast(&mmap[p.sq_off.ring_entries]))).*); + return .{ + .head = @ptrCast(@alignCast(&mmap[p.sq_off.head])), + .tail = @ptrCast(@alignCast(&mmap[p.sq_off.tail])), + .mask = @as(*u32, @ptrCast(@alignCast(&mmap[p.sq_off.ring_mask]))).*, + .flags = @ptrCast(@alignCast(&mmap[p.sq_off.flags])), + .dropped = @ptrCast(@alignCast(&mmap[p.sq_off.dropped])), + .array = array[0..p.sq_entries], + .sqes = sqes[0..p.sq_entries], + .mmap = mmap, + .mmap_sqes = mmap_sqes, + }; + } - const sqe_cancel = try ring.cancel(0x99999999, 0xffffffff, 0); - try testing.expectEqual(Op.ASYNC_CANCEL, sqe_cancel.opcode); - try testing.expectEqual(@as(u64, 0xffffffff), sqe_cancel.addr); - try testing.expectEqual(@as(u64, 0x99999999), sqe_cancel.user_data); - try testing.expectEqual(@as(u32, 1), try ring.submit()); + pub fn deinit(self: *Sq) void { + posix.munmap(self.mmap_sqes); + posix.munmap(self.mmap); + } +}; - var cqe_recv = try ring.copy_cqe(); - if (cqe_recv.err() == .INVAL) return error.SkipZigTest; - var cqe_cancel = try ring.copy_cqe(); - if (cqe_cancel.err() == .INVAL) return error.SkipZigTest; +/// matches io_uring_cq in liburing +pub const Cq = struct { + head: *u32, + tail: *u32, + mask: u32, + overflow: *u32, + cqes: []Cqe, - // The recv/cancel CQEs may arrive in any order, the recv CQE will sometimes come first: - if (cqe_recv.user_data == 0x99999999 and cqe_cancel.user_data == 0xffffffff) { - const a = cqe_recv; - const b = cqe_cancel; - cqe_recv = b; - cqe_cancel = a; + /// cq_ring.flags + pub const Flags = packed struct(u32) { + /// disable eventfd notifications + EVENTFD_DISABLED: bool = false, + _unused: u31 = 0, + }; + + pub fn init(fd: posix.fd_t, p: Params, sq: Sq) !Cq { + assert(fd >= 0); + assert(p.features.SINGLE_MMAP); + const mmap = sq.mmap; + const cqes: [*]Cqe = @ptrCast(@alignCast(&mmap[p.cq_off.cqes])); + assert(p.cq_entries == @as(*u32, @ptrCast(@alignCast(&mmap[p.cq_off.ring_entries]))).*); + return .{ + .head = @ptrCast(@alignCast(&mmap[p.cq_off.head])), + .tail = @ptrCast(@alignCast(&mmap[p.cq_off.tail])), + .mask = @as(*u32, @ptrCast(@alignCast(&mmap[p.cq_off.ring_mask]))).*, + .overflow = @ptrCast(@alignCast(&mmap[p.cq_off.overflow])), + .cqes = cqes[0..p.cq_entries], + }; } - try testing.expectEqual(Cqe{ - .user_data = 0xffffffff, - .res = -@as(i32, @intFromEnum(linux.E.CANCELED)), - .flags = .{}, - }, cqe_recv); + pub fn deinit(self: *Cq) void { + _ = self; + // A no-op since we now share the mmap with the submission queue. + // Here for symmetry with the submission queue, and for any future feature support. + } +}; - try testing.expectEqual(Cqe{ - .user_data = 0x99999999, - .res = 0, - .flags = .{}, - }, cqe_cancel); -} +/// Group of application provided buffers. Uses newer type, called ring mapped +/// buffers, supported since kernel 5.19. Buffers are identified by a buffer +/// group ID, and within that group, a buffer ID. IO_Uring can have multiple +/// buffer groups, each with unique group ID. +/// +/// In `init` application provides contiguous block of memory `buffers` for +/// `buffers_count` buffers of size `buffers_size`. Application can then submit +/// `recv` operation without providing buffer upfront. Once the operation is +/// ready to receive data, a buffer is picked automatically and the resulting +/// CQE will contain the buffer ID in `cqe.buffer_id()`. Use `get` method to get +/// buffer for buffer ID identified by CQE. 
Once the application has processed
+/// the buffer, it may hand ownership back to the kernel by calling `put`,
+/// allowing the cycle to repeat.
+///
+/// Depending on the rate of arrival of data, it is possible that a given buffer
+/// group will run out of buffers before those in CQEs can be put back to the
+/// kernel. If this happens, a `cqe.err()` will have ENOBUFS as the error value.
+///
+pub const BufferGroup = struct {
+ /// Parent ring for which this group is registered.
+ ring: *IoUring,
+ /// Pointer to the memory shared by the kernel.
+ /// `buffers_count` of `io_uring_buf` structures are shared by the kernel.
+ /// First `io_uring_buf` is overlaid by `io_uring_buf_ring` struct.
+ br: *align(page_size_min) BufferRing,
+ /// Contiguous block of memory of size (buffers_count * buffer_size).
+ buffers: []u8,
+ /// Size of each buffer in buffers.
+ buffer_size: u32,
+ /// Number of buffers in `buffers`, number of `io_uring_buf` structures in br.
+ buffers_count: u16,
+ /// Head of unconsumed part of each buffer, if incremental consumption is enabled.
+ heads: []u32,
+ /// ID of this group, must be unique in ring.
+ group_id: u16,
-test "register_files_update" {
- if (!is_linux) return error.SkipZigTest;
+ pub fn init(
+ ring: *IoUring,
+ allocator: mem.Allocator,
+ group_id: u16,
+ buffer_size: u32,
+ buffers_count: u16,
+ ) !BufferGroup {
+ const buffers = try allocator.alloc(u8, buffer_size * buffers_count);
+ errdefer allocator.free(buffers);
+ const heads = try allocator.alloc(u32, buffers_count);
+ errdefer allocator.free(heads);
- var ring = IoUring.init(1, .{}) catch |err| switch (err) {
- error.SystemOutdated => return error.SkipZigTest,
- error.PermissionDenied => return error.SkipZigTest,
- else => return err,
- };
- defer ring.deinit();
+ const br = try setup_buf_ring(ring.fd, buffers_count, group_id, .{ .IOU_PBUF_RING_INC = true });
+ buf_ring_init(br);
- const fd = try posix.openZ("/dev/zero", .{ .ACCMODE = .RDONLY, .CLOEXEC = true }, 0);
- defer posix.close(fd);
+ const mask = buf_ring_mask(buffers_count);
+ var i: u16 = 0;
+ while (i < buffers_count) : (i += 1) {
+ const pos = buffer_size * i;
+ const buf = buffers[pos .. pos + buffer_size];
+ heads[i] = 0;
+ buf_ring_add(br, buf, i, mask, i);
+ }
+ buf_ring_advance(br, buffers_count);
- var registered_fds = [_]linux.fd_t{0} ** 2;
- const fd_index = 0;
- const fd_index2 = 1;
- registered_fds[fd_index] = fd;
- registered_fds[fd_index2] = -1;
+ return BufferGroup{
+ .ring = ring,
+ .group_id = group_id,
+ .br = br,
+ .buffers = buffers,
+ .heads = heads,
+ .buffer_size = buffer_size,
+ .buffers_count = buffers_count,
+ };
+ }
- ring.register_files(registered_fds[0..]) catch |err| switch (err) {
- // Happens when the kernel doesn't support sparse entry (-1) in the file descriptors array.
- error.FileDescriptorInvalid => return error.SkipZigTest,
- else => |errno| std.debug.panic("unhandled errno: {}", .{errno}),
- };
+ pub fn deinit(self: *BufferGroup, allocator: mem.Allocator) void {
+ free_buf_ring(self.ring.fd, self.br, self.buffers_count, self.group_id);
+ allocator.free(self.buffers);
+ allocator.free(self.heads);
+ }
- // Test IORING_REGISTER_FILES_UPDATE
- // Only available since Linux 5.5
+ // Prepare recv operation which will select buffer from this group.
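+ // For context, a minimal end-to-end sketch of the whole group lifecycle
+ // (an editor's illustration with hypothetical `ring`, `allocator` and
+ // `sock` values; error handling elided):
+ //
+ //     var group = try BufferGroup.init(&ring, allocator, 1, 4096, 16);
+ //     defer group.deinit(allocator);
+ //     _ = try group.recv(0xaa, sock, 0);
+ //     _ = try ring.submit_and_wait(1);
+ //     const cqe = try ring.copy_cqe();
+ //     const data = try group.get(cqe); // bytes the kernel filled in
+ //     try group.put(cqe); // hand the buffer back for reuse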
+ pub fn recv(self: *BufferGroup, user_data: u64, fd: posix.fd_t, flags: u32) !*Sqe {
+ var sqe = try self.ring.get_sqe();
+ sqe.prep_rw(.RECV, fd, 0, 0, 0);
+ sqe.rw_flags = flags;
+ sqe.flags.BUFFER_SELECT = true;
+ sqe.buf_index = self.group_id;
+ sqe.user_data = user_data;
+ return sqe;
+ }
+
+ // Prepare multishot recv operation which will select buffer from this group.
+ pub fn recv_multishot(self: *BufferGroup, user_data: u64, fd: posix.fd_t, flags: u32) !*Sqe {
+ var sqe = try self.recv(user_data, fd, flags);
+ sqe.ioprio.send_recv.RECV_MULTISHOT = true;
+ return sqe;
+ }
- const fd2 = try posix.openZ("/dev/zero", .{ .ACCMODE = .RDONLY, .CLOEXEC = true }, 0);
- defer posix.close(fd2);
+ // Get buffer by id.
+ fn get_by_id(self: *BufferGroup, buffer_id: u16) []u8 {
+ const pos = self.buffer_size * buffer_id;
+ return self.buffers[pos .. pos + self.buffer_size][self.heads[buffer_id]..];
+ }
- registered_fds[fd_index] = fd2;
- registered_fds[fd_index2] = -1;
- try ring.register_files_update(0, registered_fds[0..]);
+ // Get buffer by CQE.
+ pub fn get(self: *BufferGroup, cqe: Cqe) ![]u8 {
+ const buffer_id = try cqe.buffer_id();
+ const used_len = @as(usize, @intCast(cqe.res));
+ return self.get_by_id(buffer_id)[0..used_len];
+ }
- var buffer = [_]u8{42} ** 128;
- {
- const sqe = try ring.read(0xcccccccc, fd_index, .{ .buffer = &buffer }, 0);
- try testing.expectEqual(Op.READ, sqe.opcode);
- sqe.flags.FIXED_FILE = true;
+ // Release buffer from CQE to the kernel.
+ pub fn put(self: *BufferGroup, cqe: Cqe) !void {
+ const buffer_id = try cqe.buffer_id();
+ if (cqe.flags.F_BUF_MORE) {
+ // Incremental consumption active, kernel will write to this buffer again
+ const used_len = @as(u32, @intCast(cqe.res));
+ // Track what part of the buffer is used
+ self.heads[buffer_id] += used_len;
+ return;
+ }
+ self.heads[buffer_id] = 0;
- try testing.expectEqual(@as(u32, 1), try ring.submit());
- try testing.expectEqual(Cqe{
- .user_data = 0xcccccccc,
- .res = buffer.len,
- .flags = .{},
- }, try ring.copy_cqe());
- try testing.expectEqualSlices(u8, &([_]u8{0} ** buffer.len), buffer[0..]);
+ // Release buffer to the kernel.
+ const mask = buf_ring_mask(self.buffers_count);
+ buf_ring_add(self.br, self.get_by_id(buffer_id), buffer_id, mask, 0);
+ buf_ring_advance(self.br, 1);
+ }
+};
- }
- // Test with a non-zero offset
+/// Used to select how the read should be handled.
+pub const ReadBuffer = union(enum) {
+ /// io_uring will read directly into this buffer
+ buffer: []u8,
- registered_fds[fd_index] = -1;
- registered_fds[fd_index2] = -1;
- try ring.register_files_update(1, registered_fds[1..]);
+ /// io_uring will read directly into these buffers using readv.
+ iovecs: []const posix.iovec,
- {
- // Next read should still work since fd_index in the registered file descriptors hasn't been updated yet.
- const sqe = try ring.read(0xcccccccc, fd_index, .{ .buffer = &buffer }, 0);
- try testing.expectEqual(Op.READ, sqe.opcode);
- sqe.flags.FIXED_FILE = true;
+ /// io_uring will select a buffer that has previously been provided with `provide_buffers`.
+ /// The buffer group referenced by `group_id` must contain at least one buffer for the read to work.
+ /// `len` controls the number of bytes to read into the selected buffer.
+ buffer_selection: struct { + group_id: u16, + len: usize, + }, +}; - try testing.expectEqual(@as(u32, 1), try ring.submit()); - try testing.expectEqual(Cqe{ - .user_data = 0xcccccccc, - .res = buffer.len, - .flags = .{}, - }, try ring.copy_cqe()); - try testing.expectEqualSlices(u8, &([_]u8{0} ** buffer.len), buffer[0..]); - } +/// Used to select how the recv call should be handled. +pub const RecvBuffer = union(enum) { + /// io_uring will recv directly into this buffer + buffer: []u8, - try ring.register_files_update(0, registered_fds[0..]); + /// io_uring will select a buffer that has previously been provided with `provide_buffers`. + /// The buffer group referenced by `group_id` must contain at least one buffer for the recv call to work. + /// `len` controls the number of bytes to read into the selected buffer. + buffer_selection: struct { + group_id: u16, + len: usize, + }, +}; - { - // Now this should fail since both fds are sparse (-1) - const sqe = try ring.read(0xcccccccc, fd_index, .{ .buffer = &buffer }, 0); - try testing.expectEqual(Op.READ, sqe.opcode); - sqe.flags.FIXED_FILE = true; +/// Filled with the offset for mmap(2) +/// matches io_sqring_offsets in liburing +pub const SqOffsets = extern struct { + /// offset of ring head + head: u32, + /// offset of ring tail + tail: u32, + /// ring mask value + ring_mask: u32, + /// entries in ring + ring_entries: u32, + /// ring flags + flags: u32, + /// number of sqes not submitted + dropped: u32, + /// sqe index array + array: u32, + resv1: u32, + user_addr: u64, +}; - try testing.expectEqual(@as(u32, 1), try ring.submit()); - const cqe = try ring.copy_cqe(); - try testing.expectEqual(linux.E.BADF, cqe.err()); - } +/// matches io_cqring_offsets in liburing +pub const CqOffsets = extern struct { + head: u32, + tail: u32, + ring_mask: u32, + ring_entries: u32, + overflow: u32, + cqes: u32, + flags: u32, + resv: u32, + user_addr: u64, +}; - try ring.unregister_files(); -} +/// Passed in for io_uring_setup(2). 
Copied back with updated info on success +/// matches io_uring_params in liburing +pub const Params = extern struct { + sq_entries: u32, + cq_entries: u32, + flags: uflags.Setup, + sq_thread_cpu: u32, + sq_thread_idle: u32, + features: uflags.Features, + wq_fd: u32, + resv: [3]u32, + sq_off: SqOffsets, + cq_off: CqOffsets, +}; -test "shutdown" { - if (!is_linux) return error.SkipZigTest; +// COMMIT: remove deprecated io_uring_rsrc_update struct +// deprecated, see struct io_uring_rsrc_update - var ring = IoUring.init(16, .{}) catch |err| switch (err) { - error.SystemOutdated => return error.SkipZigTest, - error.PermissionDenied => return error.SkipZigTest, - else => return err, - }; - defer ring.deinit(); +// COMMIT: add new io_uring_region_desc struct +/// matches io_uring_region_desc in liburing +pub const RegionDesc = extern struct { + user_addr: u64, + size: u64, + flags: Flags, + id: u32, + mmap_offset: u64, + __resv: [4]u64, - var address: linux.sockaddr.in = .{ - .port = 0, - .addr = @bitCast([4]u8{ 127, 0, 0, 1 }), + // COMMIT: new constant + /// initialise with user provided memory pointed by user_addr + pub const Flags = packed struct(u32) { + TYPE_USER: bool = false, + _: u31 = 0, }; +}; - // Socket bound, expect shutdown to work - { - // TODO: update posix later to use Typed Flags - const server = try posix.socket(address.any.family, @as(u32, @bitCast(linux.Sock{ .type = .stream, .flags = .{ .cloexec = true } })), 0); - defer posix.close(server); - try posix.setsockopt(server, posix.SOL.SOCKET, posix.SO.REUSEADDR, &mem.toBytes(@as(c_int, 1))); - try posix.bind(server, addrAny(&address), @sizeOf(linux.sockaddr.in)); - try posix.listen(server, 1); - - // set address to the OS-chosen IP/port. - var slen: posix.socklen_t = @sizeOf(linux.sockaddr.in); - try posix.getsockname(server, addrAny(&address), &slen); - - const shutdown_sqe = try ring.shutdown(0x445445445, server, .rd); - try testing.expectEqual(Op.SHUTDOWN, shutdown_sqe.opcode); - try testing.expectEqual(@as(i32, server), shutdown_sqe.fd); +// COMMIT: add new io_uring_mem_region_reg struct +/// matches io_uring_mem_region_reg in liburing +pub const MemRegionReg = extern struct { + /// struct io_uring_region_desc (RegionDesc in Zig) + region_uptr: u64, + flags: Flags, + __resv: [2]u64, - try testing.expectEqual(@as(u32, 1), try ring.submit()); + /// expose the region as registered wait arguments + pub const Flags = packed struct(u64) { + REG_WAIT_ARG: bool = false, + _: u63 = 0, + }; +}; - const cqe = try ring.copy_cqe(); - switch (cqe.err()) { - .SUCCESS => {}, - // This kernel's io_uring does not yet implement shutdown (kernel version < 5.11) - .INVAL => return error.SkipZigTest, - else => |errno| std.debug.panic("unhandled errno: {}", .{errno}), - } +/// matches io_uring_rsrc_register in liburing +pub const RsrcRegister = extern struct { + nr: u32, + flags: u32, + resv2: u64, + data: u64, + tags: u64, +}; - try testing.expectEqual(Cqe{ - .user_data = 0x445445445, - .res = 0, - .flags = .{}, - }, cqe); - } +/// matches io_uring_rsrc_update in liburing +pub const RsrcUpdate = extern struct { + offset: u32, + resv: u32, + data: u64, +}; - // Socket not bound, expect to fail with ENOTCONN - { - const server = try posix.socket(address.family, posix.SOCK.STREAM | posix.SOCK.CLOEXEC, 0); - defer posix.close(server); +/// matches io_uring_rsrc_update2 in liburing +pub const RsrcUpdate2 = extern struct { + offset: u32, + resv: u32, + data: u64, + tags: u64, + nr: u32, + resv2: u32, +}; - const shutdown_sqe = 
ring.shutdown(0x445445445, server, .rd) catch |err| switch (err) {
- else => |errno| std.debug.panic("unhandled errno: {}", .{errno}),
- };
- try testing.expectEqual(Op.SHUTDOWN, shutdown_sqe.opcode);
- try testing.expectEqual(@as(i32, server), shutdown_sqe.fd);
+/// matches io_uring_probe_op in liburing
+pub const ProbeOp = extern struct {
+ op: Op,
+ resv: u8,
+ flags: Flags,
+ resv2: u32,
- try testing.expectEqual(@as(u32, 1), try ring.submit());
+ pub const Flags = packed struct(u16) {
+ OP_SUPPORTED: bool = false,
+ _: u15 = 0,
+ };
- const cqe = try ring.copy_cqe();
- try testing.expectEqual(@as(u64, 0x445445445), cqe.user_data);
- try testing.expectEqual(linux.E.NOTCONN, cqe.err());
+ pub fn is_supported(self: ProbeOp) bool {
+ return self.flags.OP_SUPPORTED;
+ }
- }
-}
+};
-test "renameat" {
- if (!is_linux) return error.SkipZigTest;
+/// matches io_uring_probe in liburing
+pub const Probe = extern struct {
+ /// Last opcode supported
+ last_op: Op,
+ /// Length of ops[] array below
+ ops_len: u8,
+ resv: u16,
+ resv2: [3]u32,
+ ops: [256]ProbeOp,
- var ring = IoUring.init(1, .{}) catch |err| switch (err) {
- error.SystemOutdated => return error.SkipZigTest,
- error.PermissionDenied => return error.SkipZigTest,
- else => return err,
- };
- defer ring.deinit();
+ /// Is the operation supported on the running kernel.
+ pub fn is_supported(self: @This(), op: Op) bool {
+ const i = @intFromEnum(op);
+ if (i > @intFromEnum(self.last_op) or i >= self.ops_len)
+ return false;
+ return self.ops[i].is_supported();
+ }
+};
- const old_path = "test_io_uring_renameat_old";
- const new_path = "test_io_uring_renameat_new";
+// COMMIT: fix definition of io_uring_restriction
+// RegisterOp is actually u8
+/// matches io_uring_restriction in liburing
+pub const Restriction = extern struct {
+ opcode: RestrictionOp,
+ arg: extern union {
+ /// IORING_RESTRICTION_REGISTER_OP
+ register_op: RegisterOp,
+ /// IORING_RESTRICTION_SQE_OP
+ sqe_op: Op,
+ /// IORING_RESTRICTION_SQE_FLAGS_*
+ sqe_flags: u8,
+ },
+ resv: u8,
+ resv2: [3]u32,
+};
- var tmp = std.testing.tmpDir(.{});
- defer tmp.cleanup();
+// COMMIT: add new struct type
+/// matches io_uring_clock_register in liburing
+pub const ClockRegister = extern struct {
+ clockid: u32,
+ __resv: [3]u32,
+};
- // Write old file with data
+// COMMIT: add new struct type
+/// matches io_uring_clone_buffers in liburing
+pub const CloneBuffers = extern struct {
+ src_fd: u32,
+ flags: Flags,
+ src_off: u32,
+ dst_off: u32,
+ nr: u32,
+ pad: [3]u32,
- const old_file = try tmp.dir.createFile(old_path, .{ .truncate = true, .mode = 0o666 });
- defer old_file.close();
- try old_file.writeAll("hello");
+ // COMMIT: new flags
+ pub const Flags = packed struct(u32) {
+ REGISTER_SRC_REGISTERED: bool = false,
+ REGISTER_DST_REPLACE: bool = false,
+ _: u30 = 0,
+ };
+};
- // Submit renameat
+/// matches io_uring_buf in liburing
+pub const Buffer = extern struct {
+ addr: u64,
+ len: u32,
+ bid: u16,
+ resv: u16,
+};
- const sqe = try ring.renameat(
- 0x12121212,
- tmp.dir.fd,
- old_path,
- tmp.dir.fd,
- new_path,
- .{},
- );
- try testing.expectEqual(Op.RENAMEAT, sqe.opcode);
- try testing.expectEqual(@as(i32, tmp.dir.fd), sqe.fd);
- try testing.expectEqual(@as(i32, tmp.dir.fd), @as(i32, @bitCast(sqe.len)));
- try testing.expectEqual(@as(u32, 1), try ring.submit());
+/// matches io_uring_buf_ring in liburing
+pub const BufferRing = extern struct {
+ resv1: u64,
+ resv2: u32,
+ resv3: u16,
+ tail: u16,
+};
- const cqe = try ring.copy_cqe();
- switch (cqe.err()) {
- .SUCCESS 
=> {}, - // This kernel's io_uring does not yet implement renameat (kernel version < 5.11) - .BADF, .INVAL => return error.SkipZigTest, - else => |errno| std.debug.panic("unhandled errno: {}", .{errno}), - } - try testing.expectEqual(Cqe{ - .user_data = 0x12121212, - .res = 0, - .flags = .{}, - }, cqe); +/// argument for IORING_(UN)REGISTER_PBUF_RING +/// matches io_uring_buf_reg in liburing +pub const BufferRegister = extern struct { + ring_addr: u64, + ring_entries: u32, + bgid: u16, + flags: Flags, + resv: [3]u64, - // Validate that the old file doesn't exist anymore - try testing.expectError(error.FileNotFound, tmp.dir.openFile(old_path, .{})); + // COMMIT: new IORING_REGISTER_PBUF_RING flags + /// Flags for IORING_REGISTER_PBUF_RING. + pub const Flags = packed struct(u16) { + /// IOU_PBUF_RING_MMAP: + /// If set, kernel will allocate the memory for the ring. + /// The application must not set a ring_addr in struct io_uring_buf_reg + /// instead it must subsequently call mmap(2) with the offset set + /// as: IORING_OFF_PBUF_RING | (bgid << IORING_OFF_PBUF_SHIFT) to get + /// a virtual mapping for the ring. + IOU_PBUF_RING_MMAP: bool = false, + /// IOU_PBUF_RING_INC: + /// If set, buffers consumed from this buffer ring can be + /// consumed incrementally. Normally one (or more) buffers + /// are fully consumed. With incremental consumptions, it's + /// feasible to register big ranges of buffers, and each + /// use of it will consume only as much as it needs. This + /// requires that both the kernel and application keep + /// track of where the current read/recv index is at. + IOU_PBUF_RING_INC: bool = false, + _: u14 = 0, + }; +}; - // Validate that the new file exists with the proper content - var new_file_data: [16]u8 = undefined; - try testing.expectEqualStrings("hello", try tmp.dir.readFile(new_path, &new_file_data)); -} +/// argument for IORING_REGISTER_PBUF_STATUS +/// matches io_uring_buf_status in liburing +pub const BufferStatus = extern struct { + /// input + buf_group: u32, + /// output + head: u32, + resv: [8]u32, +}; -test "unlinkat" { - if (!is_linux) return error.SkipZigTest; +/// argument for IORING_(UN)REGISTER_NAPI +/// matches io_uring_napi in liburing +pub const Napi = extern struct { + busy_poll_to: u32, + prefer_busy_poll: u8, + pad: [3]u8, + resv: u64, +}; - var ring = IoUring.init(1, .{}) catch |err| switch (err) { - error.SystemOutdated => return error.SkipZigTest, - error.PermissionDenied => return error.SkipZigTest, - else => return err, +// COMMIT: new struct type +/// Argument for io_uring_enter(2) with +/// IORING_GETEVENTS | IORING_ENTER_EXT_ARG_REG set, where the actual argument +/// is an index into a previously registered fixed wait region described by +/// the below structure. 
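+/// As an editor's sketch (field names as declared below), one entry of such
+/// a region might be filled like this before issuing the wait:
+///
+///     const wait: RegisterWait = .{
+///         .ts = .{ .sec = 1, .nsec = 0 },
+///         .min_wait_usec = 0,
+///         .flags = .{ .REG_WAIT_TS = true },
+///         .sigmask = 0,
+///         .sigmask_sz = 0,
+///         .pad = .{ 0, 0, 0 },
+///         .pad2 = .{ 0, 0 },
+///     };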
+/// matches io_uring_reg_wait in liburing +pub const RegisterWait = extern struct { + ts: linux.kernel_timespec, + min_wait_usec: u32, + flags: Flags, + sigmask: u64, + sigmask_sz: u32, + pad: [3]u32, + pad2: [2]u64, + + // COMMIT: new constant + pub const Flags = packed struct(u32) { + REG_WAIT_TS: bool = false, + _: u31 = 0, }; - defer ring.deinit(); +}; - const path = "test_io_uring_unlinkat"; +/// Argument for io_uring_enter(2) with IORING_GETEVENTS | IORING_ENTER_EXT_ARG +/// matches io_uring_getevents_arg in liburing +pub const GetEventsArg = extern struct { + sigmask: u64, + sigmask_sz: u32, + pad: u32, + ts: u64, +}; - var tmp = std.testing.tmpDir(.{}); - defer tmp.cleanup(); +// COMMIT: fix type definition of io_uring_sync_cancel_reg +/// Argument for IORING_REGISTER_SYNC_CANCEL +/// matches io_uring_sync_cancel_reg in liburing +pub const SyncCancelRegister = extern struct { + addr: u64, + fd: i32, + flags: uflags.AsyncCancel, + timeout: linux.kernel_timespec, + opcode: Op, + pad: [7]u8, + pad2: [4]u64, +}; - // Write old file with data +/// Argument for IORING_REGISTER_FILE_ALLOC_RANGE +/// The range is specified as [off, off + len) +/// matches io_uring_file_index_range in liburing +pub const FileIndexRange = extern struct { + off: u32, + len: u32, + resv: u64, +}; - const file = try tmp.dir.createFile(path, .{ .truncate = true, .mode = 0o666 }); - defer file.close(); +/// matches io_uring_recvmsg_out in liburing +pub const RecvmsgOut = extern struct { + namelen: u32, + controllen: u32, + payloadlen: u32, + flags: u32, +}; - // Submit unlinkat +/// Zero copy receive refill queue entry +/// matches io_uring_zcrx_rqe in liburing +pub const ZcrxRqe = extern struct { + off: u64, + len: u32, + __pad: u32, +}; + +/// matches io_uring_zcrx_cqe in liburing +pub const ZcrxCqe = extern struct { + off: u64, + __pad: u64, +}; - const sqe = try ring.unlinkat( - 0x12121212, - tmp.dir.fd, - path, - .{}, - ); - try testing.expectEqual(Op.UNLINKAT, sqe.opcode); - try testing.expectEqual(@as(i32, tmp.dir.fd), sqe.fd); - try testing.expectEqual(@as(u32, 1), try ring.submit()); +/// matches io_uring_zcrx_offsets in liburing +pub const ZcrxOffsets = extern struct { + head: u32, + tail: u32, + rqes: u32, + __resv2: u32, + __resv: [2]u64, +}; - const cqe = try ring.copy_cqe(); - switch (cqe.err()) { - .SUCCESS => {}, - // This kernel's io_uring does not yet implement unlinkat (kernel version < 5.11) - .BADF, .INVAL => return error.SkipZigTest, - else => |errno| std.debug.panic("unhandled errno: {}", .{errno}), - } - try testing.expectEqual(Cqe{ - .user_data = 0x12121212, - .res = 0, - .flags = .{}, - }, cqe); +/// matches io_uring_zcrx_area_reg in liburing +pub const ZcrxAreaRegister = extern struct { + addr: u64, + len: u64, + rq_area_token: u64, + flags: Flags, + dmabuf_fd: u32, + __resv2: [2]u64, - // Validate that the file doesn't exist anymore - _ = tmp.dir.openFile(path, .{}) catch |err| switch (err) { - error.FileNotFound => {}, - else => std.debug.panic("unexpected error: {}", .{err}), + pub const Flags = packed struct(u32) { + DMABUF: bool = false, + _: u31 = 0, }; -} +}; -test "mkdirat" { - if (!is_linux) return error.SkipZigTest; +/// Argument for IORING_REGISTER_ZCRX_IFQ +/// matches io_uring_zcrx_ifq_reg in liburing +pub const ZcrxIfqRegister = extern struct { + if_idx: u32, + if_rxq: u32, + rq_entries: u32, + // TODO: find out its flags, I suspect its ZcrxAreaRegister.Flags + flags: u32, + /// pointer to struct io_uring_zcrx_area_reg + area_ptr: u64, + /// struct io_uring_region_desc + 
region_ptr: u64,
+ offsets: ZcrxOffsets,
+ zcrx_id: u32,
+ __resv2: u32,
+ __resv: [3]u64,
+};
- var ring = IoUring.init(1, .{}) catch |err| switch (err) {
- error.SystemOutdated => return error.SkipZigTest,
- error.PermissionDenied => return error.SkipZigTest,
- else => return err,
- };
- defer ring.deinit();
+// COMMIT: move IoUring constants to Constants
+pub const constants = struct {
+ /// If sqe.file_index (splice_fd_in in Zig Struct) is set to this for opcodes that instantiate a new
+ /// direct descriptor (like openat/openat2/accept), then io_uring will allocate
+ /// an available direct descriptor instead of having the application pass one
+ /// in. The picked direct descriptor will be returned in cqe.res, or -ENFILE
+ /// if the space is full.
+ pub const FILE_INDEX_ALLOC = std.math.maxInt(u32);
- var tmp = std.testing.tmpDir(.{});
- defer tmp.cleanup();
+ pub const CMD_MASK = 1 << 0;
- const path = "test_io_uring_mkdirat";
+ pub const TIMEOUT_CLOCK_MASK = ((1 << 2) | (1 << 3));
+ pub const TIMEOUT_UPDATE_MASK = ((1 << 1) | (1 << 4));
- // Submit mkdirat
+ pub const CQE_BUFFER_SHIFT = 16;
- const sqe = try ring.mkdirat(
- 0x12121212,
- tmp.dir.fd,
- path,
- 0o0755,
- );
- try testing.expectEqual(Op.MKDIRAT, sqe.opcode);
- try testing.expectEqual(@as(i32, tmp.dir.fd), sqe.fd);
- try testing.expectEqual(@as(u32, 1), try ring.submit());
+ /// cqe.res for IORING_CQE_F_NOTIF if IORING_SEND_ZC_REPORT_USAGE was
+ /// requested. It should be treated as a flag, all other bits of cqe.res
+ /// should be treated as reserved!
+ pub const NOTIF_USAGE_ZC_COPIED = (1 << 31);
- const cqe = try ring.copy_cqe();
- switch (cqe.err()) {
- .SUCCESS => {},
- // This kernel's io_uring does not yet implement mkdirat (kernel version < 5.15)
- .BADF, .INVAL => return error.SkipZigTest,
- else => |errno| std.debug.panic("unhandled errno: {}", .{errno}),
- }
- try testing.expectEqual(Cqe{
- .user_data = 0x12121212,
- .res = 0,
- .flags = .{},
- }, cqe);
+ // Magic offsets for the application to mmap the data it needs
+ pub const OFF_SQ_RING = 0;
+ pub const OFF_CQ_RING = 0x8000000;
+ pub const OFF_SQES = 0x10000000;
+ // COMMIT: new magic constants
+ pub const OFF_PBUF_RING = 0x80000000;
+ pub const OFF_PBUF_SHIFT = 16;
+ pub const OFF_MMAP_MASK = 0xf8000000;
- // Validate that the directory exist
- _ = try tmp.dir.openDir(path, .{});
-}
+ /// Register a fully sparse file space, rather than pass in an array of all -1 file descriptors.
+ pub const RSRC_REGISTER_SPARSE = 1 << 0; -test "symlinkat" { - if (!is_linux) return error.SkipZigTest; + /// Skip updating fd indexes set to this value in the fd table + pub const REGISTER_FILES_SKIP = -2; - var ring = IoUring.init(1, .{}) catch |err| switch (err) { - error.SystemOutdated => return error.SkipZigTest, - error.PermissionDenied => return error.SkipZigTest, - else => return err, - }; - defer ring.deinit(); + // COMMIT: new TX Timestamp definition + /// SOCKET_URING_OP_TX_TIMESTAMP definitions + pub const TIMESTAMP_HW_SHIFT = 16; + /// The cqe.flags bit from which the timestamp type is stored + pub const TIMESTAMP_TYPE_SHIFT = (TIMESTAMP_HW_SHIFT + 1); + /// The cqe.flags flag signifying whether it's a hardware timestamp + pub const CQE_F_TSTAMP_HW = (1 << TIMESTAMP_HW_SHIFT); - var tmp = std.testing.tmpDir(.{}); - defer tmp.cleanup(); + /// The bit from which area id is encoded into offsets + pub const ZCRX_AREA_SHIFT = 48; + pub const ZCRX_AREA_MASK = (~((1 << ZCRX_AREA_SHIFT) - 1)); - const path = "test_io_uring_symlinkat"; - const link_path = "test_io_uring_symlinkat_link"; + // flag added to the opcode to use a registered ring fd + pub const REGISTER_USE_REGISTERED_RING = 1 << 31; +}; - const file = try tmp.dir.createFile(path, .{ .truncate = true, .mode = 0o666 }); - defer file.close(); +// COMMIT: move IoUring flags to Flags struct +pub const uflags = struct { + /// io_uring_setup() flags + pub const Setup = packed struct(u32) { + /// io_context is polled + IOPOLL: bool = false, + /// SQ poll thread + SQPOLL: bool = false, + /// sq_thread_cpu is valid + SQ_AFF: bool = false, + /// app defines CQ size + CQSIZE: bool = false, + /// clamp SQ/CQ ring sizes + CLAMP: bool = false, + /// attach to existing wq + ATTACH_WQ: bool = false, + /// start with ring disabled + R_DISABLED: bool = false, + /// continue submit on error + SUBMIT_ALL: bool = false, + ///Cooperative task running. When requests complete, they often require + ///forcing the submitter to transition to the kernel to complete. If this + ///flag is set, work will be done when the task transitions anyway, rather + ///than force an inter-processor interrupt reschedule. This avoids interrupting + ///a task running in userspace, and saves an IPI. + COOP_TASKRUN: bool = false, + ///If COOP_TASKRUN is set, get notified if task work is available for + ///running and a kernel transition would be needed to run it. This sets + ///IORING_SQ_TASKRUN in the sq ring flags. Not valid with COOP_TASKRUN. + TASKRUN_FLAG: bool = false, + /// SQEs are 128 byte + SQE128: bool = false, + /// CQEs are 32 byte + CQE32: bool = false, + /// Only one task is allowed to submit requests + SINGLE_ISSUER: bool = false, + /// Defer running task work to get events. + /// Rather than running bits of task work whenever the task transitions + /// try to do it just before it is needed. + DEFER_TASKRUN: bool = false, + /// Application provides the memory for the rings + NO_MMAP: bool = false, + /// Register the ring fd in itself for use with + /// IORING_REGISTER_USE_REGISTERED_RING; return a registered fd index rather + /// than an fd. + REGISTERED_FD_ONLY: bool = false, + /// Removes indirection through the SQ index array. + NO_SQARRAY: bool = false, + // COMMIT: new setup flags + /// Use hybrid poll in iopoll process + HYBRID_IOPOLL: bool = false, + /// Allow both 16b and 32b CQEs. If a 32b CQE is posted, it will have + /// IORING_CQE_F_32 set in cqe.flags. 
+ CQE_MIXED: bool = false, + _unused: u13 = 0, + }; - // Submit symlinkat + /// sqe.uring_cmd_flags (rw_flags in the Zig struct) + /// top 8bits aren't available for userspace + /// use registered buffer; pass this flag along with setting sqe.buf_index. + pub const Cmd = packed struct(u32) { + CMD_FIXED: bool = false, + _unused: u31 = 0, + }; - const sqe = try ring.symlinkat( - 0x12121212, - path, - tmp.dir.fd, - link_path, - ); - try testing.expectEqual(Op.SYMLINKAT, sqe.opcode); - try testing.expectEqual(@as(i32, tmp.dir.fd), sqe.fd); - try testing.expectEqual(@as(u32, 1), try ring.submit()); + /// sqe.fsync_flags (rw_flags in the Zig struct) + pub const Fsync = packed struct(u32) { + DATASYNC: bool = false, + _unused: u31 = 0, + }; - const cqe = try ring.copy_cqe(); - switch (cqe.err()) { - .SUCCESS => {}, - // This kernel's io_uring does not yet implement symlinkat (kernel version < 5.15) - .BADF, .INVAL => return error.SkipZigTest, - else => |errno| std.debug.panic("unhandled errno: {}", .{errno}), - } - try testing.expectEqual(Cqe{ - .user_data = 0x12121212, - .res = 0, - .flags = .{}, - }, cqe); + /// sqe.timeout_flags + pub const Timeout = packed struct(u32) { + TIMEOUT_ABS: bool = false, + /// Available since Linux 5.11 + TIMEOUT_UPDATE: bool = false, + /// Available since Linux 5.15 + TIMEOUT_BOOTTIME: bool = false, + /// Available since Linux 5.15 + TIMEOUT_REALTIME: bool = false, + /// Available since Linux 5.15 + LINK_TIMEOUT_UPDATE: bool = false, + /// Available since Linux 5.16 + TIMEOUT_ETIME_SUCCESS: bool = false, + // COMMIT: new Timeout Flag + // TODO: add when it became available + TIMEOUT_MULTISHOT: bool = false, + _unused: u25 = 0, + }; - // Validate that the symlink exist - _ = try tmp.dir.openFile(link_path, .{}); -} + /// sqe.splice_flags (rw_flags in Zig Struct) + /// extends splice(2) flags + pub const Splice = packed struct(u32) { + _unused: u31 = 0, + /// the last bit of __u32 + F_FD_IN_FIXED: bool = false, + }; -test "linkat" { - if (!is_linux) return error.SkipZigTest; + /// POLL_ADD flags. Note that since sqe.poll_events (rw_flags in Zig Struct) + /// is the flag space, the command flags for POLL_ADD are stored in sqe.len. + pub const Poll = packed struct(u32) { + /// IORING_POLL_ADD_MULTI + /// Multishot poll. Sets IORING_CQE_F_MORE if the poll handler will continue + /// to report CQEs on behalf of the same SQE. + ADD_MULTI: bool = false, + // TODO: verify this doc comment is valid for the 2 flags below + /// IORING_POLL_UPDATE + /// Update existing poll request, matching sqe.addr as the old user_data + /// field. + UPDATE_EVENTS: bool = false, + /// IORING_POLL_UPDATE + /// Update existing poll request, matching sqe.addr as the old user_data + /// field. + UPDATE_USER_DATA: bool = false, + /// IORING_POLL_LEVEL + /// Level triggered poll. + ADD_LEVEL: bool = false, + _unused: u28 = 0, + }; - var ring = IoUring.init(1, .{}) catch |err| switch (err) { - error.SystemOutdated => return error.SkipZigTest, - error.PermissionDenied => return error.SkipZigTest, - else => return err, + /// ASYNC_CANCEL flags. 
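+ /// For instance, cancelling every in-flight request on one descriptor
+ /// combines CANCEL_FD with CANCEL_ALL; `prep_cancel_fd` above already sets
+ /// CANCEL_FD, so (an editor's sketch, `fd_to_cancel` hypothetical):
+ ///
+ ///     sqe.prep_cancel_fd(fd_to_cancel, .{ .CANCEL_ALL = true });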
+ pub const AsyncCancel = packed struct(u32) { + /// IORING_ASYNC_CANCEL_ALL + /// Cancel all requests that match the given key + CANCEL_ALL: bool = false, + /// IORING_ASYNC_CANCEL_FD + /// Key off 'fd' for cancelation rather than the request 'user_data' + CANCEL_FD: bool = false, + /// IORING_ASYNC_CANCEL_ANY + /// Match any request + CANCEL_ANY: bool = false, + /// IORING_ASYNC_CANCEL_FD_FIXED + /// 'fd' passed in is a fixed descriptor + CANCEL_FD_FIXED: bool = false, + // COMMIT: new AsyncCancel Flags + /// IORING_ASYNC_CANCEL_USERDATA + /// Match on user_data, default for no other key + CANCEL_USERDATA: bool = false, + /// IORING_ASYNC_CANCEL_OP + /// Match request based on opcode + CANCEL_OP: bool = false, + _unused: u26 = 0, }; - defer ring.deinit(); - var tmp = std.testing.tmpDir(.{}); - defer tmp.cleanup(); + /// IORING_OP_MSG_RING flags (sqe.msg_ring_flags or sqe.rw_flags in Zig Struct) + pub const MsgRing = packed struct(u32) { + /// IORING_MSG_RING_CQE_SKIP Don't post a CQE to the target ring. + /// Not applicable for IORING_MSG_DATA, obviously. + CQE_SKIP: bool = false, + /// Pass through the flags from sqe.file_index to cqe.flags + FLAGS_PASS: bool = false, + _unused: u30 = 0, + }; - const first_path = "test_io_uring_linkat_first"; - const second_path = "test_io_uring_linkat_second"; + // COMMIT: new flag + /// IORING_OP_FIXED_FD_INSTALL flags (sqe.install_fd_flags or sqe.rw_flags in Zig Struct) + pub const FixedFd = packed struct(u32) { + /// IORING_FIXED_FD_NO_CLOEXEC Don't mark the fd as O_CLOEXEC + NO_CLOEXEC: bool = false, + }; - // Write file with data + /// COMMIT: new flags + /// IORING_OP_NOP flags (sqe.nop_flags or sqe.rw_flags in Zig Struct) + pub const Nop = packed struct(u32) { + /// IORING_NOP_INJECT_RESULT Inject result from sqe.result + INJECT_RESULT: bool = false, + _unused: u4 = 0, + CQE32: bool = false, + _unused_1: u26 = 0, + }; - const first_file = try tmp.dir.createFile(first_path, .{ .truncate = true, .mode = 0o666 }); - defer first_file.close(); - try first_file.writeAll("hello"); + /// io_uring_enter(2) flags + pub const Enter = packed struct(u32) { + GETEVENTS: bool = false, + SQ_WAKEUP: bool = false, + SQ_WAIT: bool = false, + EXT_ARG: bool = false, + REGISTERED_RING: bool = false, + // COMMIT: new flags + ABS_TIMER: bool = false, + EXT_ARG_REG: bool = false, + NO_IOWAIT: bool = false, + _unused: u24 = 0, - // Submit linkat + pub fn empty(enter_flags: Enter) bool { + return @as(u32, @bitCast(enter_flags)) == 0; + } + }; - const sqe = try ring.linkat( - 0x12121212, - tmp.dir.fd, - first_path, - tmp.dir.fd, - second_path, - .{}, - ); - try testing.expectEqual(Op.LINKAT, sqe.opcode); - try testing.expectEqual(@as(i32, tmp.dir.fd), sqe.fd); - try testing.expectEqual(@as(i32, tmp.dir.fd), @as(i32, @bitCast(sqe.len))); - try testing.expectEqual(@as(u32, 1), try ring.submit()); + /// io_uring_params.features flags + const Features = packed struct(u32) { + SINGLE_MMAP: bool = false, + NODROP: bool = false, + SUBMIT_STABLE: bool = false, + RW_CUR_POS: bool = false, + CUR_PERSONALITY: bool = false, + FAST_POLL: bool = false, + POLL_32BITS: bool = false, + SQPOLL_NONFIXED: bool = false, + EXT_ARG: bool = false, + NATIVE_WORKERS: bool = false, + RSRC_TAGS: bool = false, + CQE_SKIP: bool = false, + LINKED_FILE: bool = false, + // COMMIT: add new Feature Flags + REG_REG_RING: bool = false, + RECVSEND_BUNDLE: bool = false, + MIN_TIMEOUT: bool = false, + RW_ATTR: bool = false, + NO_IOWAIT: bool = false, + _unused: u14 = 0, - const cqe = try ring.copy_cqe(); - 
switch (cqe.err()) { - .SUCCESS => {}, - // This kernel's io_uring does not yet implement linkat (kernel version < 5.15) - .BADF, .INVAL => return error.SkipZigTest, - else => |errno| std.debug.panic("unhandled errno: {}", .{errno}), - } - try testing.expectEqual(Cqe{ - .user_data = 0x12121212, - .res = 0, - .flags = .{}, - }, cqe); + pub fn empty(features: Features) bool { + return @as(u32, @bitCast(features)) == 0; + } + }; +}; +/// io_uring_register(2) opcodes and arguments +/// matches io_uring_register_op in liburing +pub const RegisterOp = enum(u8) { + REGISTER_BUFFERS, + UNREGISTER_BUFFERS, + REGISTER_FILES, + UNREGISTER_FILES, + REGISTER_EVENTFD, + UNREGISTER_EVENTFD, + REGISTER_FILES_UPDATE, + REGISTER_EVENTFD_ASYNC, + REGISTER_PROBE, + REGISTER_PERSONALITY, + UNREGISTER_PERSONALITY, + REGISTER_RESTRICTIONS, + REGISTER_ENABLE_RINGS, - // Validate the second file - var second_file_data: [16]u8 = undefined; - try testing.expectEqualStrings("hello", try tmp.dir.readFile(second_path, &second_file_data)); -} + // extended with tagging + REGISTER_FILES2, + REGISTER_FILES_UPDATE2, + REGISTER_BUFFERS2, + REGISTER_BUFFERS_UPDATE, -test "provide_buffers: read" { - if (!is_linux) return error.SkipZigTest; + // set/clear io-wq thread affinities + REGISTER_IOWQ_AFF, + UNREGISTER_IOWQ_AFF, - var ring = IoUring.init(1, .{}) catch |err| switch (err) { - error.SystemOutdated => return error.SkipZigTest, - error.PermissionDenied => return error.SkipZigTest, - else => return err, - }; - defer ring.deinit(); + // set/get max number of io-wq workers + REGISTER_IOWQ_MAX_WORKERS, - const fd = try posix.openZ("/dev/zero", .{ .ACCMODE = .RDONLY, .CLOEXEC = true }, 0); - defer posix.close(fd); + // register/unregister io_uring fd with the ring + REGISTER_RING_FDS, + UNREGISTER_RING_FDS, - const group_id = 1337; - const buffer_id = 0; + // register ring based provide buffer group + REGISTER_PBUF_RING, + UNREGISTER_PBUF_RING, - const buffer_len = 128; + // sync cancelation API + REGISTER_SYNC_CANCEL, - var buffers: [4][buffer_len]u8 = undefined; + // register a range of fixed file slots for automatic slot allocation + REGISTER_FILE_ALLOC_RANGE, - // Provide 4 buffers + // return status information for a buffer group + REGISTER_PBUF_STATUS, - { - const sqe = try ring.provide_buffers(0xcccccccc, @as([*]u8, @ptrCast(&buffers)), buffer_len, buffers.len, group_id, buffer_id); - try testing.expectEqual(Op.PROVIDE_BUFFERS, sqe.opcode); - try testing.expectEqual(@as(i32, buffers.len), sqe.fd); - try testing.expectEqual(@as(u32, buffers[0].len), sqe.len); - try testing.expectEqual(@as(u16, group_id), sqe.buf_index); - try testing.expectEqual(@as(u32, 1), try ring.submit()); + // set/clear busy poll settings + REGISTER_NAPI, + UNREGISTER_NAPI, - const cqe = try ring.copy_cqe(); - switch (cqe.err()) { - // Happens when the kernel is < 5.7 - .INVAL, .BADF => return error.SkipZigTest, - .SUCCESS => {}, - else => |errno| std.debug.panic("unhandled errno: {}", .{errno}), - } - try testing.expectEqual(@as(u64, 0xcccccccc), cqe.user_data); - } + REGISTER_CLOCK, - // Do 4 reads which should consume all buffers + // clone registered buffers from source ring to current ring + REGISTER_CLONE_BUFFERS, - var i: usize = 0; - while (i < buffers.len) : (i += 1) { - const sqe = try ring.read(0xdededede, fd, .{ .buffer_selection = .{ .group_id = group_id, .len = buffer_len } }, 0); - try testing.expectEqual(Op.READ, sqe.opcode); - try testing.expectEqual(@as(i32, fd), sqe.fd); - try testing.expectEqual(@as(u64, 0), sqe.addr); - try 
testing.expectEqual(@as(u32, buffer_len), sqe.len);
- try testing.expectEqual(@as(u16, group_id), sqe.buf_index);
- try testing.expectEqual(@as(u32, 1), try ring.submit());
+ // send MSG_RING without having a ring
+ REGISTER_SEND_MSG_RING,

- const cqe = try ring.copy_cqe();
- switch (cqe.err()) {
- .SUCCESS => {},
- else => |errno| std.debug.panic("unhandled errno: {}", .{errno}),
- }
+ // register a netdev hw rx queue for zerocopy
+ REGISTER_ZCRX_IFQ,

- try testing.expect(cqe.flags.F_BUFFER);
- const used_buffer_id = try cqe.buffer_id();
- try testing.expect(used_buffer_id >= 0 and used_buffer_id <= 3);
- try testing.expectEqual(@as(i32, buffer_len), cqe.res);
+ // resize CQ ring
+ REGISTER_RESIZE_RINGS,

- try testing.expectEqual(@as(u64, 0xdededede), cqe.user_data);
- try testing.expectEqualSlices(u8, &([_]u8{0} ** buffer_len), buffers[used_buffer_id][0..@as(usize, @intCast(cqe.res))]);
- }
+ REGISTER_MEM_REGION,

- // This read should fail
+ // COMMIT: new register opcode
+ // query various aspects of io_uring, see linux/io_uring/query.h
+ REGISTER_QUERY,

- {
- const sqe = try ring.read(0xdfdfdfdf, fd, .{ .buffer_selection = .{ .group_id = group_id, .len = buffer_len } }, 0);
- try testing.expectEqual(Op.READ, sqe.opcode);
- try testing.expectEqual(@as(i32, fd), sqe.fd);
- try testing.expectEqual(@as(u64, 0), sqe.addr);
- try testing.expectEqual(@as(u32, buffer_len), sqe.len);
- try testing.expectEqual(@as(u16, group_id), sqe.buf_index);
- try testing.expectEqual(@as(u32, 1), try ring.submit());
+ _,
+};

- const cqe = try ring.copy_cqe();
- switch (cqe.err()) {
- // Expected
- .NOBUFS => {},
- .SUCCESS => std.debug.panic("unexpected success", .{}),
- else => |errno| std.debug.panic("unhandled errno: {}", .{errno}),
- }
- try testing.expectEqual(@as(u64, 0xdfdfdfdf), cqe.user_data);
- }
+/// io-wq worker categories
+/// matches io_wq_type in liburing
+pub const IoWqCategory = enum(u8) {
+ BOUND,
+ UNBOUND,
+};

- // Provide 1 buffer again
+pub const SocketOp = enum(u16) {
+ SIOCINQ,
+ SIOCOUTQ,
+ GETSOCKOPT,
+ SETSOCKOPT,
+ // COMMIT: new socket op
+ TX_TIMESTAMP,
+};

- // Deliberately put something we don't expect in the buffers
- @memset(mem.sliceAsBytes(&buffers), 42);
+/// io_uring_restriction.opcode values
+/// matches io_uring_register_restriction_op in liburing
+pub const RestrictionOp = enum(u16) {
+ /// Allow an io_uring_register(2) opcode
+ REGISTER_OP = 0,
+ /// Allow an sqe opcode
+ SQE_OP = 1,
+ /// Allow sqe flags
+ SQE_FLAGS_ALLOWED = 2,
+ /// Require sqe flags (these flags must be set on each submission)
+ SQE_FLAGS_REQUIRED = 3,

- const reprovided_buffer_id = 2;
+ _,
+};

- {
- _ = try ring.provide_buffers(0xabababab, @as([*]u8, @ptrCast(&buffers[reprovided_buffer_id])), buffer_len, 1, group_id, reprovided_buffer_id);
- try testing.expectEqual(@as(u32, 1), try ring.submit());
+/// IORING_OP_MSG_RING command types, stored in sqe.addr
+pub const MsgRingCmd = enum {
+ /// pass sqe->len as 'res' and off as user_data
+ DATA,
+ /// send a registered fd to another ring
+ SEND_FD,
+};

- const cqe = try ring.copy_cqe();
- switch (cqe.err()) {
- .SUCCESS => {},
- else => |errno| std.debug.panic("unhandled errno: {}", .{errno}),
- }
- }
+// COMMIT: OP to IoUring
+pub const Op = enum(u8) {
+ NOP,
+ READV,
+ WRITEV,
+ FSYNC,
+ READ_FIXED,
+ WRITE_FIXED,
+ POLL_ADD,
+ POLL_REMOVE,
+ SYNC_FILE_RANGE,
+ SENDMSG,
+ RECVMSG,
+ TIMEOUT,
+ TIMEOUT_REMOVE,
+ ACCEPT,
+ ASYNC_CANCEL,
+ LINK_TIMEOUT,
+ CONNECT,
+ FALLOCATE,
+ OPENAT,
+ CLOSE,
+ FILES_UPDATE,
+ STATX,
+ READ,
+ WRITE,
+ FADVISE,
+
MADVISE, + SEND, + RECV, + EPOLL_CTL, + OPENAT2, + SPLICE, + PROVIDE_BUFFERS, + REMOVE_BUFFERS, + TEE, + SHUTDOWN, + RENAMEAT, + UNLINKAT, + MKDIRAT, + SYMLINKAT, + LINKAT, + MSG_RING, + FSETXATTR, + SETXATTR, + FGETXATTR, + GETXATTR, + SOCKET, + URING_CMD, + SEND_ZC, + SENDMSG_ZC, + READ_MULTISHOT, + WAITID, + FUTEX_WAIT, + FUTEX_WAKE, + FUTEX_WAITV, + FIXED_FD_INSTALL, + FTRUNCATE, + BIND, + LISTEN, + RECV_ZC, + // COMMIT: new OPs + // TODO: to be implemented + EPOLL_WAIT, + READV_FIXED, + WRITEV_FIXED, + PIPE, - // Final read which should work + _, +}; - { - const sqe = try ring.read(0xdfdfdfdf, fd, .{ .buffer_selection = .{ .group_id = group_id, .len = buffer_len } }, 0); - try testing.expectEqual(Op.READ, sqe.opcode); - try testing.expectEqual(@as(i32, fd), sqe.fd); - try testing.expectEqual(@as(u64, 0), sqe.addr); - try testing.expectEqual(@as(u32, buffer_len), sqe.len); - try testing.expectEqual(@as(u16, group_id), sqe.buf_index); - try testing.expectEqual(@as(u32, 1), try ring.submit()); +test "structs/offsets/entries" { + if (!is_linux) return error.SkipZigTest; - const cqe = try ring.copy_cqe(); - switch (cqe.err()) { - .SUCCESS => {}, - else => |errno| std.debug.panic("unhandled errno: {}", .{errno}), - } + try testing.expectEqual(@as(usize, 120), @sizeOf(Params)); + try testing.expectEqual(@as(usize, 64), @sizeOf(Sqe)); + try testing.expectEqual(@as(usize, 16), @sizeOf(Cqe)); - try testing.expect(cqe.flags.F_BUFFER); - const used_buffer_id = try cqe.buffer_id(); - try testing.expectEqual(used_buffer_id, reprovided_buffer_id); - try testing.expectEqual(@as(i32, buffer_len), cqe.res); - try testing.expectEqual(@as(u64, 0xdfdfdfdf), cqe.user_data); - try testing.expectEqualSlices(u8, &([_]u8{0} ** buffer_len), buffers[used_buffer_id][0..@as(usize, @intCast(cqe.res))]); - } + try testing.expectEqual(0, constants.OFF_SQ_RING); + try testing.expectEqual(0x8000000, constants.OFF_CQ_RING); + try testing.expectEqual(0x10000000, constants.OFF_SQES); + + try testing.expectError(error.EntriesZero, IoUring.init(0, .{})); + try testing.expectError(error.EntriesNotPowerOfTwo, IoUring.init(3, .{})); } -test "remove_buffers" { +test "nop" { if (!is_linux) return error.SkipZigTest; var ring = IoUring.init(1, .{}) catch |err| switch (err) { @@ -3068,378 +3571,447 @@ test "remove_buffers" { error.PermissionDenied => return error.SkipZigTest, else => return err, }; - defer ring.deinit(); - - const fd = try posix.openZ("/dev/zero", .{ .ACCMODE = .RDONLY, .CLOEXEC = true }, 0); - defer posix.close(fd); - - const group_id = 1337; - const buffer_id = 0; - - const buffer_len = 128; - - var buffers: [4][buffer_len]u8 = undefined; - - // Provide 4 buffers - - { - _ = try ring.provide_buffers(0xcccccccc, @as([*]u8, @ptrCast(&buffers)), buffer_len, buffers.len, group_id, buffer_id); - try testing.expectEqual(@as(u32, 1), try ring.submit()); - - const cqe = try ring.copy_cqe(); - switch (cqe.err()) { - .INVAL, .BADF => return error.SkipZigTest, - .SUCCESS => {}, - else => |errno| std.debug.panic("unhandled errno: {}", .{errno}), - } - try testing.expectEqual(@as(u64, 0xcccccccc), cqe.user_data); + defer { + ring.deinit(); + testing.expectEqual(@as(linux.fd_t, -1), ring.fd) catch @panic("test failed"); } - // Remove 3 buffers + const sqe = try ring.nop(0xaaaaaaaa); + try testing.expectEqual(Sqe{ + .opcode = .NOP, + .flags = .{}, + .ioprio = .init_empty(), + .fd = 0, + .off = 0, + .addr = 0, + .len = 0, + .rw_flags = 0, + .user_data = 0xaaaaaaaa, + .buf_index = 0, + .personality = 0, + .splice_fd_in = 0, + 
.addr3 = 0, + .resv = 0, + }, sqe.*); - { - const sqe = try ring.remove_buffers(0xbababababa, 3, group_id); - try testing.expectEqual(Op.REMOVE_BUFFERS, sqe.opcode); - try testing.expectEqual(@as(i32, 3), sqe.fd); - try testing.expectEqual(@as(u64, 0), sqe.addr); - try testing.expectEqual(@as(u16, group_id), sqe.buf_index); - try testing.expectEqual(@as(u32, 1), try ring.submit()); + try testing.expectEqual(@as(u32, 0), ring.sq.sqe_head); + try testing.expectEqual(@as(u32, 1), ring.sq.sqe_tail); + try testing.expectEqual(@as(u32, 0), ring.sq.tail.*); + try testing.expectEqual(@as(u32, 0), ring.cq.head.*); + try testing.expectEqual(@as(u32, 1), ring.sq_ready()); + try testing.expectEqual(@as(u32, 0), ring.cq_ready()); + + try testing.expectEqual(@as(u32, 1), try ring.submit()); + try testing.expectEqual(@as(u32, 1), ring.sq.sqe_head); + try testing.expectEqual(@as(u32, 1), ring.sq.sqe_tail); + try testing.expectEqual(@as(u32, 1), ring.sq.tail.*); + try testing.expectEqual(@as(u32, 0), ring.cq.head.*); + try testing.expectEqual(@as(u32, 0), ring.sq_ready()); - const cqe = try ring.copy_cqe(); - switch (cqe.err()) { - .SUCCESS => {}, - else => |errno| std.debug.panic("unhandled errno: {}", .{errno}), - } - try testing.expectEqual(@as(u64, 0xbababababa), cqe.user_data); - } + try testing.expectEqual(Cqe{ + .user_data = 0xaaaaaaaa, + .res = 0, + .flags = .{}, + }, try ring.copy_cqe()); + try testing.expectEqual(@as(u32, 1), ring.cq.head.*); + try testing.expectEqual(@as(u32, 0), ring.cq_ready()); - // This read should work + const sqe_barrier = try ring.nop(0xbbbbbbbb); + sqe_barrier.flags.IO_DRAIN = true; + try testing.expectEqual(@as(u32, 1), try ring.submit()); + try testing.expectEqual(Cqe{ + .user_data = 0xbbbbbbbb, + .res = 0, + .flags = .{}, + }, try ring.copy_cqe()); + try testing.expectEqual(@as(u32, 2), ring.sq.sqe_head); + try testing.expectEqual(@as(u32, 2), ring.sq.sqe_tail); + try testing.expectEqual(@as(u32, 2), ring.sq.tail.*); + try testing.expectEqual(@as(u32, 2), ring.cq.head.*); +} - { - _ = try ring.read(0xdfdfdfdf, fd, .{ .buffer_selection = .{ .group_id = group_id, .len = buffer_len } }, 0); - try testing.expectEqual(@as(u32, 1), try ring.submit()); +test "readv" { + if (!is_linux) return error.SkipZigTest; - const cqe = try ring.copy_cqe(); - switch (cqe.err()) { - .SUCCESS => {}, - else => |errno| std.debug.panic("unhandled errno: {}", .{errno}), - } + var ring = IoUring.init(1, .{}) catch |err| switch (err) { + error.SystemOutdated => return error.SkipZigTest, + error.PermissionDenied => return error.SkipZigTest, + else => return err, + }; + defer ring.deinit(); - try testing.expect(cqe.flags.F_BUFFER); - const used_buffer_id = try cqe.buffer_id(); - try testing.expect(used_buffer_id >= 0 and used_buffer_id < 4); - try testing.expectEqual(@as(i32, buffer_len), cqe.res); - try testing.expectEqual(@as(u64, 0xdfdfdfdf), cqe.user_data); - try testing.expectEqualSlices(u8, &([_]u8{0} ** buffer_len), buffers[used_buffer_id][0..@as(usize, @intCast(cqe.res))]); - } + const fd = try posix.openZ("/dev/zero", .{ .ACCMODE = .RDONLY, .CLOEXEC = true }, 0); + defer posix.close(fd); - // Final read should _not_ work + // Linux Kernel 5.4 supports IORING_REGISTER_FILES but not sparse fd sets (i.e. an fd of -1). + // Linux Kernel 5.5 adds support for sparse fd sets. 
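For illustration, a sparse set registers placeholder slots rather than live descriptors; a minimal sketch, not part of the patch, assuming a `ring` and a real descriptor `real_fd` from the surrounding context, and using only APIs exercised by the tests in this file:

    var sparse = [_]linux.fd_t{ -1, -1 }; // -1 reserves an empty slot (kernel >= 5.5 only)
    try ring.register_files(sparse[0..]);
    // A real fd can be installed into slot 0 later:
    // try ring.register_files_update(0, &.{real_fd});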
+ // Compare: + // https://github.com/torvalds/linux/blob/v5.4/fs/io_uring.c#L3119-L3124 vs + // https://github.com/torvalds/linux/blob/v5.8/fs/io_uring.c#L6687-L6691 + // We therefore avoid stressing sparse fd sets here: + var registered_fds = [_]linux.fd_t{0} ** 1; + const fd_index = 0; + registered_fds[fd_index] = fd; + try ring.register_files(registered_fds[0..]); - { - _ = try ring.read(0xdfdfdfdf, fd, .{ .buffer_selection = .{ .group_id = group_id, .len = buffer_len } }, 0); - try testing.expectEqual(@as(u32, 1), try ring.submit()); + var buffer = [_]u8{42} ** 128; + var iovecs = [_]posix.iovec{posix.iovec{ .base = &buffer, .len = buffer.len }}; + const sqe = try ring.read(0xcccccccc, fd_index, .{ .iovecs = iovecs[0..] }, 0); + try testing.expectEqual(Op.READV, sqe.opcode); + sqe.flags.FIXED_FILE = true; - const cqe = try ring.copy_cqe(); - switch (cqe.err()) { - // Expected - .NOBUFS => {}, - .SUCCESS => std.debug.panic("unexpected success", .{}), - else => |errno| std.debug.panic("unhandled errno: {}", .{errno}), - } - } + try testing.expectError(error.SubmissionQueueFull, ring.nop(0)); + try testing.expectEqual(@as(u32, 1), try ring.submit()); + try testing.expectEqual(Cqe{ + .user_data = 0xcccccccc, + .res = buffer.len, + .flags = .{}, + }, try ring.copy_cqe()); + try testing.expectEqualSlices(u8, &([_]u8{0} ** buffer.len), buffer[0..]); + + try ring.unregister_files(); } -test "provide_buffers: accept/connect/send/recv" { +test "writev/fsync/readv" { if (!is_linux) return error.SkipZigTest; - var ring = IoUring.init(16, .{}) catch |err| switch (err) { + var ring = IoUring.init(4, .{}) catch |err| switch (err) { error.SystemOutdated => return error.SkipZigTest, error.PermissionDenied => return error.SkipZigTest, else => return err, }; defer ring.deinit(); - const group_id = 1337; - const buffer_id = 0; - - const buffer_len = 128; - var buffers: [4][buffer_len]u8 = undefined; + var tmp = std.testing.tmpDir(.{}); + defer tmp.cleanup(); - // Provide 4 buffers + const path = "test_io_uring_writev_fsync_readv"; + const file = try tmp.dir.createFile(path, .{ .read = true, .truncate = true }); + defer file.close(); + const fd = file.handle; - { - const sqe = try ring.provide_buffers(0xcccccccc, @as([*]u8, @ptrCast(&buffers)), buffer_len, buffers.len, group_id, buffer_id); - try testing.expectEqual(Op.PROVIDE_BUFFERS, sqe.opcode); - try testing.expectEqual(@as(i32, buffers.len), sqe.fd); - try testing.expectEqual(@as(u32, buffer_len), sqe.len); - try testing.expectEqual(@as(u16, group_id), sqe.buf_index); - try testing.expectEqual(@as(u32, 1), try ring.submit()); + const buffer_write = [_]u8{42} ** 128; + const iovecs_write = [_]posix.iovec_const{ + posix.iovec_const{ .base = &buffer_write, .len = buffer_write.len }, + }; + var buffer_read = [_]u8{0} ** 128; + var iovecs_read = [_]posix.iovec{ + posix.iovec{ .base = &buffer_read, .len = buffer_read.len }, + }; - const cqe = try ring.copy_cqe(); - switch (cqe.err()) { - // Happens when the kernel is < 5.7 - .INVAL => return error.SkipZigTest, - // Happens on the kernel 5.4 - .BADF => return error.SkipZigTest, - .SUCCESS => {}, - else => |errno| std.debug.panic("unhandled errno: {}", .{errno}), - } - try testing.expectEqual(@as(u64, 0xcccccccc), cqe.user_data); - } + const sqe_writev = try ring.writev(0xdddddddd, fd, iovecs_write[0..], 17); + try testing.expectEqual(Op.WRITEV, sqe_writev.opcode); + try testing.expectEqual(@as(u64, 17), sqe_writev.off); + sqe_writev.flags.IO_LINK = true; - const socket_test_harness = try 
createSocketTestHarness(&ring); - defer socket_test_harness.close(); + const sqe_fsync = try ring.fsync(0xeeeeeeee, fd, .{}); + try testing.expectEqual(Op.FSYNC, sqe_fsync.opcode); + try testing.expectEqual(fd, sqe_fsync.fd); + sqe_fsync.flags.IO_LINK = true; - // Do 4 send on the socket + const sqe_readv = try ring.read(0xffffffff, fd, .{ .iovecs = iovecs_read[0..] }, 17); + try testing.expectEqual(Op.READV, sqe_readv.opcode); + try testing.expectEqual(@as(u64, 17), sqe_readv.off); - { - var i: usize = 0; - while (i < buffers.len) : (i += 1) { - _ = try ring.send(0xdeaddead, socket_test_harness.server, &([_]u8{'z'} ** buffer_len), 0); - try testing.expectEqual(@as(u32, 1), try ring.submit()); - } + try testing.expectEqual(@as(u32, 3), ring.sq_ready()); + try testing.expectEqual(@as(u32, 3), try ring.submit_and_wait(3)); + try testing.expectEqual(@as(u32, 0), ring.sq_ready()); + try testing.expectEqual(@as(u32, 3), ring.cq_ready()); - var cqes: [4]Cqe = undefined; - try testing.expectEqual(@as(u32, 4), try ring.copy_cqes(&cqes, 4)); - } + try testing.expectEqual(Cqe{ + .user_data = 0xdddddddd, + .res = buffer_write.len, + .flags = .{}, + }, try ring.copy_cqe()); + try testing.expectEqual(@as(u32, 2), ring.cq_ready()); - // Do 4 recv which should consume all buffers + try testing.expectEqual(Cqe{ + .user_data = 0xeeeeeeee, + .res = 0, + .flags = .{}, + }, try ring.copy_cqe()); + try testing.expectEqual(@as(u32, 1), ring.cq_ready()); - // Deliberately put something we don't expect in the buffers - @memset(mem.sliceAsBytes(&buffers), 1); + try testing.expectEqual(Cqe{ + .user_data = 0xffffffff, + .res = buffer_read.len, + .flags = .{}, + }, try ring.copy_cqe()); + try testing.expectEqual(@as(u32, 0), ring.cq_ready()); - var i: usize = 0; - while (i < buffers.len) : (i += 1) { - const sqe = try ring.recv(0xdededede, socket_test_harness.client, .{ .buffer_selection = .{ .group_id = group_id, .len = buffer_len } }, 0); - try testing.expectEqual(Op.RECV, sqe.opcode); - try testing.expectEqual(@as(i32, socket_test_harness.client), sqe.fd); - try testing.expectEqual(@as(u64, 0), sqe.addr); - try testing.expectEqual(@as(u32, buffer_len), sqe.len); - try testing.expectEqual(@as(u16, group_id), sqe.buf_index); - try testing.expectEqual(@as(u32, 0), sqe.rw_flags); - try testing.expectEqual(.{ .BUFFER_SELECT = true }, sqe.flags); - try testing.expectEqual(@as(u32, 1), try ring.submit()); + try testing.expectEqualSlices(u8, buffer_write[0..], buffer_read[0..]); +} - const cqe = try ring.copy_cqe(); - switch (cqe.err()) { - .SUCCESS => {}, - else => |errno| std.debug.panic("unhandled errno: {}", .{errno}), - } +test "write/read" { + if (!is_linux) return error.SkipZigTest; - try testing.expect(cqe.flags.F_BUFFER); - const used_buffer_id = try cqe.buffer_id(); - try testing.expect(used_buffer_id >= 0 and used_buffer_id <= 3); - try testing.expectEqual(@as(i32, buffer_len), cqe.res); + var ring = IoUring.init(2, .{}) catch |err| switch (err) { + error.SystemOutdated => return error.SkipZigTest, + error.PermissionDenied => return error.SkipZigTest, + else => return err, + }; + defer ring.deinit(); - try testing.expectEqual(@as(u64, 0xdededede), cqe.user_data); - const buffer = buffers[used_buffer_id][0..@as(usize, @intCast(cqe.res))]; - try testing.expectEqualSlices(u8, &([_]u8{'z'} ** buffer_len), buffer); - } + var tmp = std.testing.tmpDir(.{}); + defer tmp.cleanup(); + const path = "test_io_uring_write_read"; + const file = try tmp.dir.createFile(path, .{ .read = true, .truncate = true }); + defer 
file.close(); + const fd = file.handle; - // This recv should fail + const buffer_write = [_]u8{97} ** 20; + var buffer_read = [_]u8{98} ** 20; + const sqe_write = try ring.write(0x11111111, fd, buffer_write[0..], 10); + try testing.expectEqual(Op.WRITE, sqe_write.opcode); + try testing.expectEqual(@as(u64, 10), sqe_write.off); + sqe_write.flags.IO_LINK = true; + const sqe_read = try ring.read(0x22222222, fd, .{ .buffer = buffer_read[0..] }, 10); + try testing.expectEqual(Op.READ, sqe_read.opcode); + try testing.expectEqual(@as(u64, 10), sqe_read.off); + try testing.expectEqual(@as(u32, 2), try ring.submit()); - { - const sqe = try ring.recv(0xdfdfdfdf, socket_test_harness.client, .{ .buffer_selection = .{ .group_id = group_id, .len = buffer_len } }, 0); - try testing.expectEqual(Op.RECV, sqe.opcode); - try testing.expectEqual(@as(i32, socket_test_harness.client), sqe.fd); - try testing.expectEqual(@as(u64, 0), sqe.addr); - try testing.expectEqual(@as(u32, buffer_len), sqe.len); - try testing.expectEqual(@as(u16, group_id), sqe.buf_index); - try testing.expectEqual(@as(u32, 0), sqe.rw_flags); - try testing.expectEqual(@as(u32, linux.IOSQE_BUFFER_SELECT), sqe.flags); - try testing.expectEqual(@as(u32, 1), try ring.submit()); + const cqe_write = try ring.copy_cqe(); + const cqe_read = try ring.copy_cqe(); + // Prior to Linux Kernel 5.6 this is the only way to test for read/write support: + // https://lwn.net/Articles/809820/ + if (cqe_write.err() == .INVAL) return error.SkipZigTest; + if (cqe_read.err() == .INVAL) return error.SkipZigTest; + try testing.expectEqual(Cqe{ + .user_data = 0x11111111, + .res = buffer_write.len, + .flags = .{}, + }, cqe_write); + try testing.expectEqual(Cqe{ + .user_data = 0x22222222, + .res = buffer_read.len, + .flags = .{}, + }, cqe_read); + try testing.expectEqualSlices(u8, buffer_write[0..], buffer_read[0..]); +} - const cqe = try ring.copy_cqe(); - switch (cqe.err()) { - // Expected - .NOBUFS => {}, - .SUCCESS => std.debug.panic("unexpected success", .{}), - else => |errno| std.debug.panic("unhandled errno: {}", .{errno}), - } - try testing.expectEqual(@as(u64, 0xdfdfdfdf), cqe.user_data); - } +test "splice/read" { + if (!is_linux) return error.SkipZigTest; - // Provide 1 buffer again + var ring = IoUring.init(4, .{}) catch |err| switch (err) { + error.SystemOutdated => return error.SkipZigTest, + error.PermissionDenied => return error.SkipZigTest, + else => return err, + }; + defer ring.deinit(); - const reprovided_buffer_id = 2; + var tmp = std.testing.tmpDir(.{}); + const path_src = "test_io_uring_splice_src"; + const file_src = try tmp.dir.createFile(path_src, .{ .read = true, .truncate = true }); + defer file_src.close(); + const fd_src = file_src.handle; - { - _ = try ring.provide_buffers(0xabababab, @as([*]u8, @ptrCast(&buffers[reprovided_buffer_id])), buffer_len, 1, group_id, reprovided_buffer_id); - try testing.expectEqual(@as(u32, 1), try ring.submit()); + const path_dst = "test_io_uring_splice_dst"; + const file_dst = try tmp.dir.createFile(path_dst, .{ .read = true, .truncate = true }); + defer file_dst.close(); + const fd_dst = file_dst.handle; - const cqe = try ring.copy_cqe(); - switch (cqe.err()) { - .SUCCESS => {}, - else => |errno| std.debug.panic("unhandled errno: {}", .{errno}), - } - } + const buffer_write = [_]u8{97} ** 20; + var buffer_read = [_]u8{98} ** 20; + _ = try file_src.write(&buffer_write); - // Redo 1 send on the server socket + const fds = try posix.pipe(); + const pipe_offset: u64 = std.math.maxInt(u64); - { - _ = try 
ring.send(0xdeaddead, socket_test_harness.server, &([_]u8{'w'} ** buffer_len), 0); - try testing.expectEqual(@as(u32, 1), try ring.submit()); + const sqe_splice_to_pipe = try ring.splice(0x11111111, fd_src, 0, fds[1], pipe_offset, buffer_write.len); + try testing.expectEqual(Op.SPLICE, sqe_splice_to_pipe.opcode); + try testing.expectEqual(@as(u64, 0), sqe_splice_to_pipe.addr); + try testing.expectEqual(pipe_offset, sqe_splice_to_pipe.off); + sqe_splice_to_pipe.flags.IO_LINK = true; - _ = try ring.copy_cqe(); - } + const sqe_splice_from_pipe = try ring.splice(0x22222222, fds[0], pipe_offset, fd_dst, 10, buffer_write.len); + try testing.expectEqual(Op.SPLICE, sqe_splice_from_pipe.opcode); + try testing.expectEqual(pipe_offset, sqe_splice_from_pipe.addr); + try testing.expectEqual(@as(u64, 10), sqe_splice_from_pipe.off); + sqe_splice_from_pipe.flags.IO_LINK = true; - // Final recv which should work + const sqe_read = try ring.read(0x33333333, fd_dst, .{ .buffer = buffer_read[0..] }, 10); + try testing.expectEqual(Op.READ, sqe_read.opcode); + try testing.expectEqual(@as(u64, 10), sqe_read.off); + try testing.expectEqual(@as(u32, 3), try ring.submit()); - // Deliberately put something we don't expect in the buffers - @memset(mem.sliceAsBytes(&buffers), 1); + const cqe_splice_to_pipe = try ring.copy_cqe(); + const cqe_splice_from_pipe = try ring.copy_cqe(); + const cqe_read = try ring.copy_cqe(); + // Prior to Linux Kernel 5.6 this is the only way to test for splice/read support: + // https://lwn.net/Articles/809820/ + if (cqe_splice_to_pipe.err() == .INVAL) return error.SkipZigTest; + if (cqe_splice_from_pipe.err() == .INVAL) return error.SkipZigTest; + if (cqe_read.err() == .INVAL) return error.SkipZigTest; + try testing.expectEqual(Cqe{ + .user_data = 0x11111111, + .res = buffer_write.len, + .flags = .{}, + }, cqe_splice_to_pipe); + try testing.expectEqual(Cqe{ + .user_data = 0x22222222, + .res = buffer_write.len, + .flags = .{}, + }, cqe_splice_from_pipe); + try testing.expectEqual(Cqe{ + .user_data = 0x33333333, + .res = buffer_read.len, + .flags = .{}, + }, cqe_read); + try testing.expectEqualSlices(u8, buffer_write[0..], buffer_read[0..]); +} - { - const sqe = try ring.recv(0xdfdfdfdf, socket_test_harness.client, .{ .buffer_selection = .{ .group_id = group_id, .len = buffer_len } }, 0); - try testing.expectEqual(Op.RECV, sqe.opcode); - try testing.expectEqual(@as(i32, socket_test_harness.client), sqe.fd); - try testing.expectEqual(@as(u64, 0), sqe.addr); - try testing.expectEqual(@as(u32, buffer_len), sqe.len); - try testing.expectEqual(@as(u16, group_id), sqe.buf_index); - try testing.expectEqual(@as(u32, 0), sqe.rw_flags); - try testing.expectEqual(@as(u32, linux.IOSQE_BUFFER_SELECT), sqe.flags); - try testing.expectEqual(@as(u32, 1), try ring.submit()); +test "write_fixed/read_fixed" { + if (!is_linux) return error.SkipZigTest; - const cqe = try ring.copy_cqe(); - switch (cqe.err()) { - .SUCCESS => {}, - else => |errno| std.debug.panic("unhandled errno: {}", .{errno}), - } + var ring = IoUring.init(2, .{}) catch |err| switch (err) { + error.SystemOutdated => return error.SkipZigTest, + error.PermissionDenied => return error.SkipZigTest, + else => return err, + }; + defer ring.deinit(); - try testing.expect(cqe.flags.F_BUFFER); - const used_buffer_id = try cqe.buffer_id(); - try testing.expectEqual(used_buffer_id, reprovided_buffer_id); - try testing.expectEqual(@as(i32, buffer_len), cqe.res); - try testing.expectEqual(@as(u64, 0xdfdfdfdf), cqe.user_data); - const buffer = 
buffers[used_buffer_id][0..@as(usize, @intCast(cqe.res))]; - try testing.expectEqualSlices(u8, &([_]u8{'w'} ** buffer_len), buffer); - } -} + var tmp = std.testing.tmpDir(.{}); + defer tmp.cleanup(); -/// Used for testing server/client interactions. -const SocketTestHarness = struct { - listener: posix.socket_t, - server: posix.socket_t, - client: posix.socket_t, + const path = "test_io_uring_write_read_fixed"; + const file = try tmp.dir.createFile(path, .{ .read = true, .truncate = true }); + defer file.close(); + const fd = file.handle; - fn close(self: SocketTestHarness) void { - posix.close(self.client); - posix.close(self.listener); - } -}; + var raw_buffers: [2][11]u8 = undefined; + // First buffer will be written to the file. + @memset(&raw_buffers[0], 'z'); + raw_buffers[0][0.."foobar".len].* = "foobar".*; -fn createSocketTestHarness(ring: *IoUring) !SocketTestHarness { - // Create a TCP server socket - var address: linux.sockaddr.in = .{ - .port = 0, - .addr = @bitCast([4]u8{ 127, 0, 0, 1 }), + var buffers = [2]posix.iovec{ + .{ .base = &raw_buffers[0], .len = raw_buffers[0].len }, + .{ .base = &raw_buffers[1], .len = raw_buffers[1].len }, + }; + ring.register_buffers(&buffers) catch |err| switch (err) { + error.SystemResources => { + // See https://github.com/ziglang/zig/issues/15362 + return error.SkipZigTest; + }, + else => |e| return e, }; - const listener_socket = try createListenerSocket(&address); - errdefer posix.close(listener_socket); - // Submit 1 accept - var accept_addr: posix.sockaddr = undefined; - var accept_addr_len: posix.socklen_t = @sizeOf(@TypeOf(accept_addr)); - _ = try ring.accept(0xaaaaaaaa, listener_socket, &accept_addr, &accept_addr_len, .{}); + const sqe_write = try ring.write_fixed(0x45454545, fd, &buffers[0], 3, 0); + try testing.expectEqual(Op.WRITE_FIXED, sqe_write.opcode); + try testing.expectEqual(@as(u64, 3), sqe_write.off); + sqe_write.flags.IO_LINK = true; - // Create a TCP client socket - const client = try posix.socket(address.family, posix.SOCK.STREAM | posix.SOCK.CLOEXEC, 0); - errdefer posix.close(client); - _ = try ring.connect(0xcccccccc, client, addrAny(&address), @sizeOf(linux.sockaddr.in)); + const sqe_read = try ring.read_fixed(0x12121212, fd, &buffers[1], 0, 1); + try testing.expectEqual(Op.READ_FIXED, sqe_read.opcode); + try testing.expectEqual(@as(u64, 0), sqe_read.off); try testing.expectEqual(@as(u32, 2), try ring.submit()); - var cqe_accept = try ring.copy_cqe(); - if (cqe_accept.err() == .INVAL) return error.SkipZigTest; - var cqe_connect = try ring.copy_cqe(); - if (cqe_connect.err() == .INVAL) return error.SkipZigTest; - - // The accept/connect CQEs may arrive in any order, the connect CQE will sometimes come first: - if (cqe_accept.user_data == 0xcccccccc and cqe_connect.user_data == 0xaaaaaaaa) { - const a = cqe_accept; - const b = cqe_connect; - cqe_accept = b; - cqe_connect = a; - } + const cqe_write = try ring.copy_cqe(); + const cqe_read = try ring.copy_cqe(); - try testing.expectEqual(@as(u64, 0xaaaaaaaa), cqe_accept.user_data); - if (cqe_accept.res <= 0) std.debug.print("\ncqe_accept.res={}\n", .{cqe_accept.res}); - try testing.expect(cqe_accept.res > 0); - try testing.expectEqual(@as(u32, 0), cqe_accept.flags); try testing.expectEqual(Cqe{ - .user_data = 0xcccccccc, - .res = 0, + .user_data = 0x45454545, + .res = @as(i32, @intCast(buffers[0].len)), .flags = .{}, - }, cqe_connect); + }, cqe_write); + try testing.expectEqual(Cqe{ + .user_data = 0x12121212, + .res = @as(i32, @intCast(buffers[1].len)), + .flags = .{}, + 
}, cqe_read); - // All good + try testing.expectEqualSlices(u8, "\x00\x00\x00", buffers[1].base[0..3]); + try testing.expectEqualSlices(u8, "foobar", buffers[1].base[3..9]); + try testing.expectEqualSlices(u8, "zz", buffers[1].base[9..11]); +} - return SocketTestHarness{ - .listener = listener_socket, - .server = cqe_accept.res, - .client = client, +test "openat" { + if (!is_linux) return error.SkipZigTest; + + var ring = IoUring.init(1, .{}) catch |err| switch (err) { + error.SystemOutdated => return error.SkipZigTest, + error.PermissionDenied => return error.SkipZigTest, + else => return err, }; -} + defer ring.deinit(); -fn createListenerSocket(address: *linux.sockaddr.in) !posix.socket_t { - const kernel_backlog = 1; - const listener_socket = try posix.socket(address.family, posix.SOCK.STREAM | posix.SOCK.CLOEXEC, 0); - errdefer posix.close(listener_socket); + var tmp = std.testing.tmpDir(.{}); + defer tmp.cleanup(); - try posix.setsockopt(listener_socket, posix.SOL.SOCKET, posix.SO.REUSEADDR, &mem.toBytes(@as(c_int, 1))); - try posix.bind(listener_socket, addrAny(address), @sizeOf(linux.sockaddr.in)); - try posix.listen(listener_socket, kernel_backlog); + const path = "test_io_uring_openat"; + + // Workaround for LLVM bug: https://github.com/ziglang/zig/issues/12014 + const path_addr = if (builtin.zig_backend == .stage2_llvm) p: { + var workaround = path; + _ = &workaround; + break :p @intFromPtr(workaround); + } else @intFromPtr(path); + + const flags: linux.O = .{ .CLOEXEC = true, .ACCMODE = .RDWR, .CREAT = true }; + const mode: posix.mode_t = 0o666; + const sqe_openat = try ring.openat(0x33333333, tmp.dir.fd, path, flags, mode); + try testing.expectEqual(Sqe{ + .opcode = .OPENAT, + .flags = .{}, + .ioprio = .init_empty(), + .fd = tmp.dir.fd, + .off = 0, + .addr = path_addr, + .len = mode, + .rw_flags = @bitCast(flags), + .user_data = 0x33333333, + .buf_index = 0, + .personality = 0, + .splice_fd_in = 0, + .addr3 = 0, + .resv = 0, + }, sqe_openat.*); + try testing.expectEqual(@as(u32, 1), try ring.submit()); - // set address to the OS-chosen IP/port. 
- var slen: posix.socklen_t = @sizeOf(linux.sockaddr.in); - try posix.getsockname(listener_socket, addrAny(address), &slen); + const cqe_openat = try ring.copy_cqe(); + try testing.expectEqual(@as(u64, 0x33333333), cqe_openat.user_data); + if (cqe_openat.err() == .INVAL) return error.SkipZigTest; + if (cqe_openat.err() == .BADF) return error.SkipZigTest; + if (cqe_openat.res <= 0) std.debug.print("\ncqe_openat.res={}\n", .{cqe_openat.res}); + try testing.expect(cqe_openat.res > 0); + try testing.expectEqual(@as(Cqe.Flags, @bitCast(@as(u32, 0))), cqe_openat.flags); - return listener_socket; + posix.close(cqe_openat.res); } -test "accept multishot" { +test "close" { if (!is_linux) return error.SkipZigTest; - var ring = IoUring.init(16, .{}) catch |err| switch (err) { + var ring = IoUring.init(1, .{}) catch |err| switch (err) { error.SystemOutdated => return error.SkipZigTest, error.PermissionDenied => return error.SkipZigTest, else => return err, }; defer ring.deinit(); - var address: linux.sockaddr.in = .{ - .port = 0, - .addr = @bitCast([4]u8{ 127, 0, 0, 1 }), - }; - const listener_socket = try createListenerSocket(&address); - defer posix.close(listener_socket); - - // submit multishot accept operation - var addr: posix.sockaddr = undefined; - var addr_len: posix.socklen_t = @sizeOf(@TypeOf(addr)); - const userdata: u64 = 0xaaaaaaaa; - _ = try ring.accept_multishot(userdata, listener_socket, &addr, &addr_len, .{}); - try testing.expectEqual(@as(u32, 1), try ring.submit()); + var tmp = std.testing.tmpDir(.{}); + defer tmp.cleanup(); - var nr: usize = 4; // number of clients to connect - while (nr > 0) : (nr -= 1) { - // connect client - const client = try posix.socket(address.family, posix.SOCK.STREAM | posix.SOCK.CLOEXEC, 0); - errdefer posix.close(client); - try posix.connect(client, addrAny(&address), @sizeOf(linux.sockaddr.in)); + const path = "test_io_uring_close"; + const file = try tmp.dir.createFile(path, .{}); + errdefer file.close(); - // test accept completion - var cqe = try ring.copy_cqe(); - if (cqe.err() == .INVAL) return error.SkipZigTest; - try testing.expect(cqe.res > 0); - try testing.expect(cqe.user_data == userdata); - try testing.expect(cqe.flags.F_MORE); // more flag is set + const sqe_close = try ring.close(0x44444444, file.handle); + try testing.expectEqual(Op.CLOSE, sqe_close.opcode); + try testing.expectEqual(file.handle, sqe_close.fd); + try testing.expectEqual(@as(u32, 1), try ring.submit()); - posix.close(client); - } + const cqe_close = try ring.copy_cqe(); + if (cqe_close.err() == .INVAL) return error.SkipZigTest; + try testing.expectEqual(Cqe{ + .user_data = 0x44444444, + .res = 0, + .flags = .{}, + }, cqe_close); } -test "accept/connect/send_zc/recv" { - try skipKernelLessThan(.{ .major = 6, .minor = 0, .patch = 0 }); +test "accept/connect/send/recv" { + if (!is_linux) return error.SkipZigTest; var ring = IoUring.init(16, .{}) catch |err| switch (err) { error.SystemOutdated => return error.SkipZigTest, @@ -3451,355 +4023,406 @@ test "accept/connect/send_zc/recv" { const socket_test_harness = try createSocketTestHarness(&ring); defer socket_test_harness.close(); - const buffer_send = [_]u8{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0xa, 0xb, 0xc, 0xd, 0xe }; - var buffer_recv = [_]u8{0} ** 10; + const buffer_send = [_]u8{ 1, 0, 1, 0, 1, 0, 1, 0, 1, 0 }; + var buffer_recv = [_]u8{ 0, 1, 0, 1, 0 }; - // zero-copy send - const sqe_send = try ring.send_zc(0xeeeeeeee, socket_test_harness.client, buffer_send[0..], 0, 0); + const sqe_send = try ring.send(0xeeeeeeee, 
socket_test_harness.client, buffer_send[0..], 0); sqe_send.flags.IO_LINK = true; _ = try ring.recv(0xffffffff, socket_test_harness.server, .{ .buffer = buffer_recv[0..] }, 0); try testing.expectEqual(@as(u32, 2), try ring.submit()); - var cqe_send = try ring.copy_cqe(); - // First completion of zero-copy send. - // IORING_CQE_F_MORE, means that there - // will be a second completion event / notification for the - // request, with the user_data field set to the same value. - // buffer_send must be keep alive until second cqe. + const cqe_send = try ring.copy_cqe(); + if (cqe_send.err() == .INVAL) return error.SkipZigTest; try testing.expectEqual(Cqe{ .user_data = 0xeeeeeeee, .res = buffer_send.len, - .flags = .{ .F_MORE = true }, + .flags = .{}, }, cqe_send); - cqe_send, const cqe_recv = brk: { - const cqe1 = try ring.copy_cqe(); - const cqe2 = try ring.copy_cqe(); - break :brk if (cqe1.user_data == 0xeeeeeeee) .{ cqe1, cqe2 } else .{ cqe2, cqe1 }; - }; - + const cqe_recv = try ring.copy_cqe(); + if (cqe_recv.err() == .INVAL) return error.SkipZigTest; try testing.expectEqual(Cqe{ .user_data = 0xffffffff, .res = buffer_recv.len, + // ignore IORING_CQE_F_SOCK_NONEMPTY since it is only set on some systems .flags = cqe_recv.flags & linux.IORING_CQE_F_SOCK_NONEMPTY, }, cqe_recv); - try testing.expectEqualSlices(u8, buffer_send[0..buffer_recv.len], buffer_recv[0..]); - // Second completion of zero-copy send. - // IORING_CQE_F_NOTIF in flags signals that kernel is done with send_buffer - try testing.expectEqual(Cqe{ - .user_data = 0xeeeeeeee, - .res = 0, - .flags = linux.IORING_CQE_F_NOTIF, - }, cqe_send); + try testing.expectEqualSlices(u8, buffer_send[0..buffer_recv.len], buffer_recv[0..]); } -test "accept_direct" { - try skipKernelLessThan(.{ .major = 5, .minor = 19, .patch = 0 }); +test "sendmsg/recvmsg" { + if (!is_linux) return error.SkipZigTest; - var ring = IoUring.init(1, .{}) catch |err| switch (err) { + var ring = IoUring.init(2, .{}) catch |err| switch (err) { error.SystemOutdated => return error.SkipZigTest, error.PermissionDenied => return error.SkipZigTest, else => return err, }; defer ring.deinit(); - var address: linux.sockaddr.in = .{ + + var address_server: linux.sockaddr.in = .{ .port = 0, .addr = @bitCast([4]u8{ 127, 0, 0, 1 }), }; - // register direct file descriptors - var registered_fds = [_]linux.fd_t{-1} ** 2; - try ring.register_files(registered_fds[0..]); - - const listener_socket = try createListenerSocket(&address); - defer posix.close(listener_socket); + const server = try posix.socket(address_server.family, posix.SOCK.DGRAM, 0); + defer posix.close(server); + try posix.setsockopt(server, posix.SOL.SOCKET, posix.SO.REUSEPORT, &mem.toBytes(@as(c_int, 1))); + try posix.setsockopt(server, posix.SOL.SOCKET, posix.SO.REUSEADDR, &mem.toBytes(@as(c_int, 1))); + try posix.bind(server, addrAny(&address_server), @sizeOf(linux.sockaddr.in)); - const accept_userdata: u64 = 0xaaaaaaaa; - const read_userdata: u64 = 0xbbbbbbbb; - const data = [_]u8{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0xa, 0xb, 0xc, 0xd, 0xe }; + // set address_server to the OS-chosen IP/port. + var slen: posix.socklen_t = @sizeOf(linux.sockaddr.in); + try posix.getsockname(server, addrAny(&address_server), &slen); - for (0..2) |_| { - for (registered_fds, 0..) |_, i| { - var buffer_recv: [16]u8 = @splat(0); - const buffer_send: []const u8 = data[0 .. 
data.len - i]; // make it different at each loop + const client = try posix.socket(address_server.family, posix.SOCK.DGRAM, 0); + defer posix.close(client); - // submit accept, will chose registered fd and return index in cqe - _ = try ring.accept_direct(accept_userdata, listener_socket, null, null, .{}); - try testing.expectEqual(@as(u32, 1), try ring.submit()); + const buffer_send = [_]u8{42} ** 128; + const iovecs_send = [_]posix.iovec_const{ + posix.iovec_const{ .base = &buffer_send, .len = buffer_send.len }, + }; + const msg_send: linux.msghdr_const = .{ + .name = addrAny(&address_server), + .namelen = @sizeOf(linux.sockaddr.in), + .iov = &iovecs_send, + .iovlen = 1, + .control = null, + .controllen = 0, + .flags = 0, + }; + const sqe_sendmsg = try ring.sendmsg(0x11111111, client, &msg_send, 0); + sqe_sendmsg.flags.IO_LINK = true; + try testing.expectEqual(Op.SENDMSG, sqe_sendmsg.opcode); + try testing.expectEqual(client, sqe_sendmsg.fd); - // connect - const client = try posix.socket(address.family, posix.SOCK.STREAM | posix.SOCK.CLOEXEC, 0); - try posix.connect(client, addrAny(&address), @sizeOf(linux.sockaddr.in)); - defer posix.close(client); + var buffer_recv = [_]u8{0} ** 128; + var iovecs_recv = [_]posix.iovec{ + posix.iovec{ .base = &buffer_recv, .len = buffer_recv.len }, + }; + var address_recv: linux.sockaddr.in = .{ + .port = 0, + .addr = 0, + }; + var msg_recv: linux.msghdr = .{ + .name = addrAny(&address_recv), + .namelen = @sizeOf(linux.sockaddr.in), + .iov = &iovecs_recv, + .iovlen = 1, + .control = null, + .controllen = 0, + .flags = 0, + }; + const sqe_recvmsg = try ring.recvmsg(0x22222222, server, &msg_recv, 0); + try testing.expectEqual(Op.RECVMSG, sqe_recvmsg.opcode); + try testing.expectEqual(server, sqe_recvmsg.fd); - // accept completion - const cqe_accept = try ring.copy_cqe(); - try testing.expectEqual(posix.E.SUCCESS, cqe_accept.err()); - const fd_index = cqe_accept.res; - try testing.expect(fd_index < registered_fds.len); - try testing.expect(cqe_accept.user_data == accept_userdata); + try testing.expectEqual(@as(u32, 2), ring.sq_ready()); + try testing.expectEqual(@as(u32, 2), try ring.submit_and_wait(2)); + try testing.expectEqual(@as(u32, 0), ring.sq_ready()); + try testing.expectEqual(@as(u32, 2), ring.cq_ready()); - // send data - _ = try posix.send(client, buffer_send, 0); + const cqe_sendmsg = try ring.copy_cqe(); + if (cqe_sendmsg.res == -@as(i32, @intFromEnum(linux.E.INVAL))) return error.SkipZigTest; + try testing.expectEqual(Cqe{ + .user_data = 0x11111111, + .res = buffer_send.len, + .flags = .{}, + }, cqe_sendmsg); - // Example of how to use registered fd: - // Submit receive to fixed file returned by accept (fd_index). - // Fd field is set to registered file index, returned by accept. - // Flag linux.IOSQE_FIXED_FILE must be set. 
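The flag migration the removed comment refers to, side by side; a sketch with a hypothetical `sqe`, not part of the patch:

    // before: raw u32 bitmask
    // sqe.flags |= linux.IOSQE_FIXED_FILE;
    // after, with the typed Sqe flags introduced by this patch:
    sqe.flags.FIXED_FILE = true;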
- const recv_sqe = try ring.recv(read_userdata, fd_index, .{ .buffer = &buffer_recv }, 0); - recv_sqe.flags.FIXED_FILE = true; - try testing.expectEqual(@as(u32, 1), try ring.submit()); + const cqe_recvmsg = try ring.copy_cqe(); + if (cqe_recvmsg.res == -@as(i32, @intFromEnum(linux.E.INVAL))) return error.SkipZigTest; + try testing.expectEqual(Cqe{ + .user_data = 0x22222222, + .res = buffer_recv.len, + // ignore IORING_CQE_F_SOCK_NONEMPTY since it is set non-deterministically + .flags = cqe_recvmsg.flags & linux.IORING_CQE_F_SOCK_NONEMPTY, + }, cqe_recvmsg); - // accept receive - const recv_cqe = try ring.copy_cqe(); - try testing.expect(recv_cqe.user_data == read_userdata); - try testing.expect(recv_cqe.res == buffer_send.len); - try testing.expectEqualSlices(u8, buffer_send, buffer_recv[0..buffer_send.len]); - } - // no more available fds, accept will get NFILE error - { - // submit accept - _ = try ring.accept_direct(accept_userdata, listener_socket, null, null, 0); - try testing.expectEqual(@as(u32, 1), try ring.submit()); - // connect - const client = try posix.socket(address.family, posix.SOCK.STREAM | posix.SOCK.CLOEXEC, 0); - try posix.connect(client, addrAny(&address), @sizeOf(linux.sockaddr.in)); - defer posix.close(client); - // completion with error - const cqe_accept = try ring.copy_cqe(); - try testing.expect(cqe_accept.user_data == accept_userdata); - try testing.expectEqual(posix.E.NFILE, cqe_accept.err()); - } - // return file descriptors to kernel - try ring.register_files_update(0, registered_fds[0..]); - } - try ring.unregister_files(); + try testing.expectEqualSlices(u8, buffer_send[0..buffer_recv.len], buffer_recv[0..]); } -test "accept_multishot_direct" { - try skipKernelLessThan(.{ .major = 5, .minor = 19, .patch = 0 }); +test "timeout (after a relative time)" { + if (!is_linux) return error.SkipZigTest; + const io = std.testing.io; + + var ring = IoUring.init(1, .{}) catch |err| switch (err) { + error.SystemOutdated => return error.SkipZigTest, + error.PermissionDenied => return error.SkipZigTest, + else => return err, + }; + defer ring.deinit(); + + const ms = 10; + const margin = 5; + const ts: linux.kernel_timespec = .{ .sec = 0, .nsec = ms * 1000000 }; + + const started = try std.Io.Clock.awake.now(io); + const sqe = try ring.timeout(0x55555555, &ts, 0, 0); + try testing.expectEqual(Op.TIMEOUT, sqe.opcode); + try testing.expectEqual(@as(u32, 1), try ring.submit()); + const cqe = try ring.copy_cqe(); + const stopped = try std.Io.Clock.awake.now(io); + + try testing.expectEqual(Cqe{ + .user_data = 0x55555555, + .res = -@as(i32, @intFromEnum(linux.E.TIME)), + .flags = .{}, + }, cqe); - if (builtin.cpu.arch == .riscv64) { - // https://github.com/ziglang/zig/issues/25734 - return error.SkipZigTest; - } + // Tests should not depend on timings: skip test if outside margin. 
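Taken together, the timeout tests in this patch cover the completion results a consumer of IORING_OP_TIMEOUT has to handle; a condensed sketch of such a consumer (hypothetical call site, not part of the patch):

    const cqe = try ring.copy_cqe();
    switch (cqe.err()) {
        .TIME => {}, // the timer expired: the normal result for a pure timeout
        .SUCCESS => {}, // `count` other completions arrived before the timer fired
        .CANCELED => {}, // the timeout was removed (timeout_remove) or the link completed
        else => |errno| std.debug.panic("unhandled errno: {}", .{errno}),
    }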
+ const ms_elapsed = started.durationTo(stopped).toMilliseconds(); + if (ms_elapsed > margin) return error.SkipZigTest; +} - var ring = IoUring.init(1, .{}) catch |err| switch (err) { +test "timeout (after a number of completions)" { + if (!is_linux) return error.SkipZigTest; + + var ring = IoUring.init(2, .{}) catch |err| switch (err) { error.SystemOutdated => return error.SkipZigTest, error.PermissionDenied => return error.SkipZigTest, else => return err, }; defer ring.deinit(); - var address: linux.sockaddr.in = .{ - .port = 0, - .addr = @bitCast([4]u8{ 127, 0, 0, 1 }), + const ts: linux.kernel_timespec = .{ .sec = 3, .nsec = 0 }; + const count_completions: u64 = 1; + const sqe_timeout = try ring.timeout(0x66666666, &ts, count_completions, .{}); + try testing.expectEqual(Op.TIMEOUT, sqe_timeout.opcode); + try testing.expectEqual(count_completions, sqe_timeout.off); + _ = try ring.nop(0x77777777); + try testing.expectEqual(@as(u32, 2), try ring.submit()); + + const cqe_nop = try ring.copy_cqe(); + try testing.expectEqual(Cqe{ + .user_data = 0x77777777, + .res = 0, + .flags = .{}, + }, cqe_nop); + + const cqe_timeout = try ring.copy_cqe(); + try testing.expectEqual(Cqe{ + .user_data = 0x66666666, + .res = 0, + .flags = .{}, + }, cqe_timeout); +} + +test "timeout_remove" { + if (!is_linux) return error.SkipZigTest; + + var ring = IoUring.init(2, .{}) catch |err| switch (err) { + error.SystemOutdated => return error.SkipZigTest, + error.PermissionDenied => return error.SkipZigTest, + else => return err, }; + defer ring.deinit(); - var registered_fds = [_]linux.fd_t{-1} ** 2; - try ring.register_files(registered_fds[0..]); + const ts: linux.kernel_timespec = .{ .sec = 3, .nsec = 0 }; + const sqe_timeout = try ring.timeout(0x88888888, &ts, 0, .{}); + try testing.expectEqual(Op.TIMEOUT, sqe_timeout.opcode); + try testing.expectEqual(@as(u64, 0x88888888), sqe_timeout.user_data); - const listener_socket = try createListenerSocket(&address); - defer posix.close(listener_socket); + const sqe_timeout_remove = try ring.timeout_remove(0x99999999, 0x88888888, .{}); + try testing.expectEqual(Op.TIMEOUT_REMOVE, sqe_timeout_remove.opcode); + try testing.expectEqual(@as(u64, 0x88888888), sqe_timeout_remove.addr); + try testing.expectEqual(@as(u64, 0x99999999), sqe_timeout_remove.user_data); - const accept_userdata: u64 = 0xaaaaaaaa; + try testing.expectEqual(@as(u32, 2), try ring.submit()); - for (0..2) |_| { - // submit multishot accept - // Will chose registered fd and return index of the selected registered file in cqe. 
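The multishot pattern the removed comments describe: one SQE yields a stream of CQEs, each flagged F_MORE until the kernel terminates the operation. A sketch with a hypothetical handleConnection, not part of the patch:

    while (true) {
        const cqe = try ring.copy_cqe();
        std.debug.assert(cqe.user_data == accept_userdata);
        if (cqe.err() != .SUCCESS) break; // e.g. NFILE once direct slots run out
        handleConnection(cqe.res); // hypothetical: cqe.res is the accepted fd or slot index
        if (!cqe.flags.F_MORE) break; // multishot terminated; re-arm if still needed
    }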
- _ = try ring.accept_multishot_direct(accept_userdata, listener_socket, null, null, .{}); - try testing.expectEqual(@as(u32, 1), try ring.submit()); + // The order in which the CQE arrive is not clearly documented and it changed with kernel 5.18: + // * kernel 5.10 gives user data 0x88888888 first, 0x99999999 second + // * kernel 5.18 gives user data 0x99999999 first, 0x88888888 second - for (registered_fds) |_| { - // connect - const client = try posix.socket(address.family, posix.SOCK.STREAM | posix.SOCK.CLOEXEC, 0); - try posix.connect(client, addrAny(&address), @sizeOf(linux.sockaddr.in)); - defer posix.close(client); + var cqes: [2]Cqe = undefined; + cqes[0] = try ring.copy_cqe(); + cqes[1] = try ring.copy_cqe(); - // accept completion - const cqe_accept = try ring.copy_cqe(); - const fd_index = cqe_accept.res; - try testing.expect(fd_index < registered_fds.len); - try testing.expect(cqe_accept.user_data == accept_userdata); - try testing.expect(cqe_accept.flags.F_MORE); // has more is set - } - // No more available fds, accept will get NFILE error. - // Multishot is terminated (more flag is not set). + for (cqes) |cqe| { + // IORING_OP_TIMEOUT_REMOVE is not supported by this kernel version: + // Timeout remove operations set the fd to -1, which results in EBADF before EINVAL. + // We use IORING_FEAT_RW_CUR_POS as a safety check here to make sure we are at least pre-5.6. + // We don't want to skip this test for newer kernels. + if (cqe.user_data == 0x99999999 and + cqe.err() == .BADF and + (!ring.features.RW_CUR_POS)) { - // connect - const client = try posix.socket(address.family, posix.SOCK.STREAM | posix.SOCK.CLOEXEC, 0); - try posix.connect(client, addrAny(&address), @sizeOf(linux.sockaddr.in)); - defer posix.close(client); - // completion with error - const cqe_accept = try ring.copy_cqe(); - try testing.expect(cqe_accept.user_data == accept_userdata); - try testing.expectEqual(posix.E.NFILE, cqe_accept.err()); - try testing.expect(!cqe_accept.flags.F_MORE); // has more is not set + return error.SkipZigTest; + } + + try testing.expect(cqe.user_data == 0x88888888 or cqe.user_data == 0x99999999); + + if (cqe.user_data == 0x88888888) { + try testing.expectEqual(Cqe{ + .user_data = 0x88888888, + .res = -@as(i32, @intFromEnum(linux.E.CANCELED)), + .flags = .{}, + }, cqe); + } else if (cqe.user_data == 0x99999999) { + try testing.expectEqual(Cqe{ + .user_data = 0x99999999, + .res = 0, + .flags = .{}, + }, cqe); } - // return file descriptors to kernel - try ring.register_files_update(0, registered_fds[0..]); } - try ring.unregister_files(); } -test "socket" { - try skipKernelLessThan(.{ .major = 5, .minor = 19, .patch = 0 }); +test "accept/connect/recv/link_timeout" { + if (!is_linux) return error.SkipZigTest; - var ring = IoUring.init(1, .{}) catch |err| switch (err) { + var ring = IoUring.init(16, .{}) catch |err| switch (err) { error.SystemOutdated => return error.SkipZigTest, error.PermissionDenied => return error.SkipZigTest, else => return err, }; defer ring.deinit(); - // prepare, submit socket operation - _ = try ring.socket(0, .{ .inet = true }, .{ .type = .stream }, 0, 0); - try testing.expectEqual(@as(u32, 1), try ring.submit()); + const socket_test_harness = try createSocketTestHarness(&ring); + defer socket_test_harness.close(); - // test completion - var cqe = try ring.copy_cqe(); - try testing.expectEqual(posix.E.SUCCESS, cqe.err()); - const fd: linux.fd_t = @intCast(cqe.res); - try testing.expect(fd > 2); + var buffer_recv = [_]u8{ 0, 1, 0, 1, 0 }; - posix.close(fd); + 
const sqe_recv = try ring.recv(0xffffffff, socket_test_harness.server, .{ .buffer = buffer_recv[0..] }, 0); + sqe_recv.flags.IO_LINK = true; + + const ts = linux.kernel_timespec{ .sec = 0, .nsec = 1000000 }; + _ = try ring.link_timeout(0x22222222, &ts, 0); + + const nr_wait = try ring.submit(); + try testing.expectEqual(@as(u32, 2), nr_wait); + + var i: usize = 0; + while (i < nr_wait) : (i += 1) { + const cqe = try ring.copy_cqe(); + switch (cqe.user_data) { + 0xffffffff => { + if (cqe.res != -@as(i32, @intFromEnum(linux.E.INTR)) and + cqe.res != -@as(i32, @intFromEnum(linux.E.CANCELED))) + { + std.debug.print("Req 0x{x} got {d}\n", .{ cqe.user_data, cqe.res }); + try testing.expect(false); + } + }, + 0x22222222 => { + if (cqe.res != -@as(i32, @intFromEnum(linux.E.ALREADY)) and + cqe.res != -@as(i32, @intFromEnum(linux.E.TIME))) + { + std.debug.print("Req 0x{x} got {d}\n", .{ cqe.user_data, cqe.res }); + try testing.expect(false); + } + }, + else => @panic("should not happen"), + } + } } -test "socket_direct/socket_direct_alloc/close_direct" { - try skipKernelLessThan(.{ .major = 5, .minor = 19, .patch = 0 }); +test "fallocate" { + if (!is_linux) return error.SkipZigTest; - var ring = IoUring.init(2, .{}) catch |err| switch (err) { + var ring = IoUring.init(1, .{}) catch |err| switch (err) { error.SystemOutdated => return error.SkipZigTest, error.PermissionDenied => return error.SkipZigTest, else => return err, }; defer ring.deinit(); - var registered_fds = [_]linux.fd_t{-1} ** 3; - try ring.register_files(registered_fds[0..]); - - // create socket in registered file descriptor at index 0 (last param) - _ = try ring.socket_direct(0, .inet, .{ .type = .stream }, .default, 0, 0); - try testing.expectEqual(@as(u32, 1), try ring.submit()); - var cqe_socket = try ring.copy_cqe(); - try testing.expectEqual(posix.E.SUCCESS, cqe_socket.err()); - try testing.expect(cqe_socket.res == 0); + var tmp = std.testing.tmpDir(.{}); + defer tmp.cleanup(); - // create socket in registered file descriptor at index 1 (last param) - _ = try ring.socket_direct(0, linux.Af.INET, posix.SOCK.STREAM, 0, 0, 1); - try testing.expectEqual(@as(u32, 1), try ring.submit()); - cqe_socket = try ring.copy_cqe(); - try testing.expectEqual(posix.E.SUCCESS, cqe_socket.err()); - try testing.expect(cqe_socket.res == 0); // res is 0 when index is specified + const path = "test_io_uring_fallocate"; + const file = try tmp.dir.createFile(path, .{ .truncate = true, .mode = 0o666 }); + defer file.close(); - // create socket in kernel chosen file descriptor index (_alloc version) - // completion res has index from registered files - _ = try ring.socket_direct_alloc(0, linux.Af.INET, posix.SOCK.STREAM, 0, 0); - try testing.expectEqual(@as(u32, 1), try ring.submit()); - cqe_socket = try ring.copy_cqe(); - try testing.expectEqual(posix.E.SUCCESS, cqe_socket.err()); - try testing.expect(cqe_socket.res == 2); // returns registered file index + try testing.expectEqual(@as(u64, 0), (try file.stat()).size); - // use sockets from registered_fds in connect operation - var address: linux.sockaddr.in = .{ - .port = 0, - .addr = @bitCast([4]u8{ 127, 0, 0, 1 }), - }; - const listener_socket = try createListenerSocket(&address); - defer posix.close(listener_socket); - const accept_userdata: u64 = 0xaaaaaaaa; - const connect_userdata: u64 = 0xbbbbbbbb; - const close_userdata: u64 = 0xcccccccc; - for (registered_fds, 0..) 
|_, fd_index| { - // prepare accept - _ = try ring.accept(accept_userdata, listener_socket, null, null, 0); - // prepare connect with fixed socket - const connect_sqe = try ring.connect(connect_userdata, @intCast(fd_index), addrAny(&address), @sizeOf(linux.sockaddr.in)); - connect_sqe.flags |= linux.IOSQE_FIXED_FILE; // fd is fixed file index - // submit both - try testing.expectEqual(@as(u32, 2), try ring.submit()); - // get completions - var cqe_connect = try ring.copy_cqe(); - var cqe_accept = try ring.copy_cqe(); - // ignore order - if (cqe_connect.user_data == accept_userdata and cqe_accept.user_data == connect_userdata) { - const a = cqe_accept; - const b = cqe_connect; - cqe_accept = b; - cqe_connect = a; - } - // test connect completion - try testing.expect(cqe_connect.user_data == connect_userdata); - try testing.expectEqual(posix.E.SUCCESS, cqe_connect.err()); - // test accept completion - try testing.expect(cqe_accept.user_data == accept_userdata); - try testing.expectEqual(posix.E.SUCCESS, cqe_accept.err()); + const len: u64 = 65536; + const sqe = try ring.fallocate(0xaaaaaaaa, file.handle, 0, 0, len); + try testing.expectEqual(Op.FALLOCATE, sqe.opcode); + try testing.expectEqual(file.handle, sqe.fd); + try testing.expectEqual(@as(u32, 1), try ring.submit()); - // submit and test close_direct - _ = try ring.close_direct(close_userdata, @intCast(fd_index)); - try testing.expectEqual(@as(u32, 1), try ring.submit()); - var cqe_close = try ring.copy_cqe(); - try testing.expect(cqe_close.user_data == close_userdata); - try testing.expectEqual(posix.E.SUCCESS, cqe_close.err()); + const cqe = try ring.copy_cqe(); + switch (cqe.err()) { + .SUCCESS => {}, + // This kernel's io_uring does not yet implement fallocate(): + .INVAL => return error.SkipZigTest, + // This kernel does not implement fallocate(): + .NOSYS => return error.SkipZigTest, + // The filesystem containing the file referred to by fd does not support this operation; + // or the mode is not supported by the filesystem containing the file referred to by fd: + .OPNOTSUPP => return error.SkipZigTest, + else => |errno| std.debug.panic("unhandled errno: {}", .{errno}), } + try testing.expectEqual(Cqe{ + .user_data = 0xaaaaaaaa, + .res = 0, + .flags = .{}, + }, cqe); - try ring.unregister_files(); + try testing.expectEqual(len, (try file.stat()).size); } -test "openat_direct/close_direct" { - try skipKernelLessThan(.{ .major = 5, .minor = 19, .patch = 0 }); +test "statx" { + if (!is_linux) return error.SkipZigTest; - var ring = IoUring.init(2, .{}) catch |err| switch (err) { + var ring = IoUring.init(1, .{}) catch |err| switch (err) { error.SystemOutdated => return error.SkipZigTest, error.PermissionDenied => return error.SkipZigTest, else => return err, }; defer ring.deinit(); - var registered_fds = [_]linux.fd_t{-1} ** 3; - try ring.register_files(registered_fds[0..]); - var tmp = std.testing.tmpDir(.{}); defer tmp.cleanup(); - const path = "test_io_uring_close_direct"; - const flags: linux.O = .{ .ACCMODE = .RDWR, .CREAT = true }; - const mode: posix.mode_t = 0o666; - const user_data: u64 = 0; + const path = "test_io_uring_statx"; + const file = try tmp.dir.createFile(path, .{ .truncate = true, .mode = 0o666 }); + defer file.close(); - // use registered file at index 0 (last param) - _ = try ring.openat_direct(user_data, tmp.dir.fd, path, flags, mode, 0); - try testing.expectEqual(@as(u32, 1), try ring.submit()); - var cqe = try ring.copy_cqe(); - try testing.expectEqual(posix.E.SUCCESS, cqe.err()); - try 
testing.expect(cqe.res == 0); + try testing.expectEqual(@as(u64, 0), (try file.stat()).size); - // use registered file at index 1 - _ = try ring.openat_direct(user_data, tmp.dir.fd, path, flags, mode, 1); - try testing.expectEqual(@as(u32, 1), try ring.submit()); - cqe = try ring.copy_cqe(); - try testing.expectEqual(posix.E.SUCCESS, cqe.err()); - try testing.expect(cqe.res == 0); // res is 0 when we specify index + try file.writeAll("foobar"); - // let kernel choose registered file index - _ = try ring.openat_direct(user_data, tmp.dir.fd, path, flags, mode, linux.IORING_FILE_INDEX_ALLOC); + var buf: linux.Statx = undefined; + const sqe = try ring.statx( + 0xaaaaaaaa, + tmp.dir.fd, + path, + .{}, + .{ .size = true }, + &buf, + ); + try testing.expectEqual(Op.STATX, sqe.opcode); + try testing.expectEqual(@as(i32, tmp.dir.fd), sqe.fd); try testing.expectEqual(@as(u32, 1), try ring.submit()); - cqe = try ring.copy_cqe(); - try testing.expectEqual(posix.E.SUCCESS, cqe.err()); - try testing.expect(cqe.res == 2); // chosen index is in res - // close all open file descriptors - for (registered_fds, 0..) |_, fd_index| { - _ = try ring.close_direct(user_data, @intCast(fd_index)); - try testing.expectEqual(@as(u32, 1), try ring.submit()); - var cqe_close = try ring.copy_cqe(); - try testing.expectEqual(posix.E.SUCCESS, cqe_close.err()); + const cqe = try ring.copy_cqe(); + switch (cqe.err()) { + .SUCCESS => {}, + // This kernel's io_uring does not yet implement statx(): + .INVAL => return error.SkipZigTest, + // This kernel does not implement statx(): + .NOSYS => return error.SkipZigTest, + // The filesystem containing the file referred to by fd does not support this operation; + // or the mode is not supported by the filesystem containing the file referred to by fd: + .OPNOTSUPP => return error.SkipZigTest, + // not supported on older kernels (5.4) + .BADF => return error.SkipZigTest, + else => |errno| std.debug.panic("unhandled errno: {}", .{errno}), } - try ring.unregister_files(); + try testing.expectEqual(Cqe{ + .user_data = 0xaaaaaaaa, + .res = 0, + .flags = .{}, + }, cqe); + + try testing.expect(buf.mask.size); + try testing.expectEqual(@as(u64, 6), buf.size); } -test "waitid" { - try skipKernelLessThan(.{ .major = 6, .minor = 7, .patch = 0 }); +test "accept/connect/recv/cancel" { + if (!is_linux) return error.SkipZigTest; var ring = IoUring.init(16, .{}) catch |err| switch (err) { error.SystemOutdated => return error.SkipZigTest, @@ -3808,111 +4431,134 @@ test "waitid" { }; defer ring.deinit(); - const pid = try posix.fork(); - if (pid == 0) { - posix.exit(7); - } + const socket_test_harness = try createSocketTestHarness(&ring); + defer socket_test_harness.close(); - var siginfo: posix.siginfo_t = undefined; - _ = try ring.waitid(0, .PID, pid, &siginfo, .{ .exited = true }, 0); + var buffer_recv = [_]u8{ 0, 1, 0, 1, 0 }; - try testing.expectEqual(1, try ring.submit()); + _ = try ring.recv(0xffffffff, socket_test_harness.server, .{ .buffer = buffer_recv[0..] 
}, 0); + try testing.expectEqual(@as(u32, 1), try ring.submit()); - const cqe_waitid = try ring.copy_cqe(); - try testing.expectEqual(0, cqe_waitid.res); - try testing.expectEqual(pid, siginfo.fields.common.first.piduid.pid); - try testing.expectEqual(7, siginfo.fields.common.second.sigchld.status); -} + const sqe_cancel = try ring.cancel(0x99999999, 0xffffffff, 0); + try testing.expectEqual(Op.ASYNC_CANCEL, sqe_cancel.opcode); + try testing.expectEqual(@as(u64, 0xffffffff), sqe_cancel.addr); + try testing.expectEqual(@as(u64, 0x99999999), sqe_cancel.user_data); + try testing.expectEqual(@as(u32, 1), try ring.submit()); -/// For use in tests. Returns SkipZigTest if kernel version is less than required. -inline fn skipKernelLessThan(required: std.SemanticVersion) !void { - if (!is_linux) return error.SkipZigTest; + var cqe_recv = try ring.copy_cqe(); + if (cqe_recv.err() == .INVAL) return error.SkipZigTest; + var cqe_cancel = try ring.copy_cqe(); + if (cqe_cancel.err() == .INVAL) return error.SkipZigTest; - var uts: linux.utsname = undefined; - const res = linux.uname(&uts); - switch (linux.errno(res)) { - .SUCCESS => {}, - else => |errno| return posix.unexpectedErrno(errno), + // The recv/cancel CQEs may arrive in any order, the recv CQE will sometimes come first: + if (cqe_recv.user_data == 0x99999999 and cqe_cancel.user_data == 0xffffffff) { + const a = cqe_recv; + const b = cqe_cancel; + cqe_recv = b; + cqe_cancel = a; } - const release = mem.sliceTo(&uts.release, 0); - // Strips potential extra, as kernel version might not be semver compliant, example "6.8.9-300.fc40.x86_64" - const extra_index = std.mem.indexOfAny(u8, release, "-+"); - const stripped = release[0..(extra_index orelse release.len)]; - // Make sure the input don't rely on the extra we just stripped - try testing.expect(required.pre == null and required.build == null); + try testing.expectEqual(Cqe{ + .user_data = 0xffffffff, + .res = -@as(i32, @intFromEnum(linux.E.CANCELED)), + .flags = .{}, + }, cqe_recv); - var current = try std.SemanticVersion.parse(stripped); - current.pre = null; // don't check pre field - if (required.order(current) == .gt) return error.SkipZigTest; + try testing.expectEqual(Cqe{ + .user_data = 0x99999999, + .res = 0, + .flags = .{}, + }, cqe_cancel); } -test BufferGroup { +test "register_files_update" { if (!is_linux) return error.SkipZigTest; - // Init IoUring - var ring = IoUring.init(16, .{}) catch |err| switch (err) { + var ring = IoUring.init(1, .{}) catch |err| switch (err) { error.SystemOutdated => return error.SkipZigTest, error.PermissionDenied => return error.SkipZigTest, else => return err, }; defer ring.deinit(); - // Init buffer group for ring - const group_id: u16 = 1; // buffers group id - const buffers_count: u16 = 1; // number of buffers in buffer group - const buffer_size: usize = 128; // size of each buffer in group - var buf_grp = BufferGroup.init( - &ring, - testing.allocator, - group_id, - buffer_size, - buffers_count, - ) catch |err| switch (err) { - // kernel older than 5.19 - error.ArgumentsInvalid => return error.SkipZigTest, - else => return err, + const fd = try posix.openZ("/dev/zero", .{ .ACCMODE = .RDONLY, .CLOEXEC = true }, 0); + defer posix.close(fd); + + var registered_fds = [_]linux.fd_t{0} ** 2; + const fd_index = 0; + const fd_index2 = 1; + registered_fds[fd_index] = fd; + registered_fds[fd_index2] = -1; + + ring.register_files(registered_fds[0..]) catch |err| switch (err) { + // Happens when the kernel doesn't support sparse entry (-1) in the file descriptors 
array. + error.FileDescriptorInvalid => return error.SkipZigTest, + else => |errno| std.debug.panic("unhandled errno: {}", .{errno}), }; - defer buf_grp.deinit(testing.allocator); - // Create client/server fds - const fds = try createSocketTestHarness(&ring); - defer fds.close(); - const data = [_]u8{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0xa, 0xb, 0xc, 0xd, 0xe }; + // Test IORING_REGISTER_FILES_UPDATE + // Only available since Linux 5.5 - // Client sends data + const fd2 = try posix.openZ("/dev/zero", .{ .ACCMODE = .RDONLY, .CLOEXEC = true }, 0); + defer posix.close(fd2); + + registered_fds[fd_index] = fd2; + registered_fds[fd_index2] = -1; + try ring.register_files_update(0, registered_fds[0..]); + + var buffer = [_]u8{42} ** 128; { - _ = try ring.send(1, fds.client, data[0..], 0); - const submitted = try ring.submit(); - try testing.expectEqual(1, submitted); - const cqe_send = try ring.copy_cqe(); - if (cqe_send.err() == .INVAL) return error.SkipZigTest; - try testing.expectEqual(Cqe{ .user_data = 1, .res = data.len, .flags = .{} }, cqe_send); + const sqe = try ring.read(0xcccccccc, fd_index, .{ .buffer = &buffer }, 0); + try testing.expectEqual(Op.READ, sqe.opcode); + sqe.flags.FIXED_FILE = true; + + try testing.expectEqual(@as(u32, 1), try ring.submit()); + try testing.expectEqual(Cqe{ + .user_data = 0xcccccccc, + .res = buffer.len, + .flags = .{}, + }, try ring.copy_cqe()); + try testing.expectEqualSlices(u8, &([_]u8{0} ** buffer.len), buffer[0..]); } - // Server uses buffer group receive + // Test with a non-zero offset + + registered_fds[fd_index] = -1; + registered_fds[fd_index2] = -1; + try ring.register_files_update(1, registered_fds[1..]); + { - // Submit recv operation, buffer will be chosen from buffer group - _ = try buf_grp.recv(2, fds.server, 0); - const submitted = try ring.submit(); - try testing.expectEqual(1, submitted); + // Next read should still work since fd_index in the registered file descriptors hasn't been updated yet. + const sqe = try ring.read(0xcccccccc, fd_index, .{ .buffer = &buffer }, 0); + try testing.expectEqual(Op.READ, sqe.opcode); + sqe.flags.FIXED_FILE = true; + + try testing.expectEqual(@as(u32, 1), try ring.submit()); + try testing.expectEqual(Cqe{ + .user_data = 0xcccccccc, + .res = buffer.len, + .flags = .{}, + }, try ring.copy_cqe()); + try testing.expectEqualSlices(u8, &([_]u8{0} ** buffer.len), buffer[0..]); + } + + try ring.register_files_update(0, registered_fds[0..]); + + { + // Now this should fail since both fds are sparse (-1) + const sqe = try ring.read(0xcccccccc, fd_index, .{ .buffer = &buffer }, 0); + try testing.expectEqual(Op.READ, sqe.opcode); + sqe.flags.FIXED_FILE = true; - // ... 
when we have completion for recv operation + try testing.expectEqual(@as(u32, 1), try ring.submit()); const cqe = try ring.copy_cqe(); - try testing.expectEqual(2, cqe.user_data); // matches submitted user_data - try testing.expect(cqe.res >= 0); // success - try testing.expectEqual(posix.E.SUCCESS, cqe.err()); - try testing.expectEqual(data.len, @as(usize, @intCast(cqe.res))); // cqe.res holds received data len - - // Get buffer from pool - const buf = try buf_grp.get(cqe); - try testing.expectEqualSlices(u8, &data, buf); - // Release buffer to the kernel when application is done with it - try buf_grp.put(cqe); + try testing.expectEqual(linux.E.BADF, cqe.err()); } + + try ring.unregister_files(); } -test "ring mapped buffers recv" { +test "shutdown" { if (!is_linux) return error.SkipZigTest; var ring = IoUring.init(16, .{}) catch |err| switch (err) { @@ -3922,2409 +4568,1764 @@ test "ring mapped buffers recv" { }; defer ring.deinit(); - // init buffer group - const group_id: u16 = 1; // buffers group id - const buffers_count: u16 = 2; // number of buffers in buffer group - const buffer_size: usize = 4; // size of each buffer in group - var buf_grp = BufferGroup.init( - &ring, - testing.allocator, - group_id, - buffer_size, - buffers_count, - ) catch |err| switch (err) { - // kernel older than 5.19 - error.ArgumentsInvalid => return error.SkipZigTest, - else => return err, + var address: linux.sockaddr.in = .{ + .port = 0, + .addr = @bitCast([4]u8{ 127, 0, 0, 1 }), }; - defer buf_grp.deinit(testing.allocator); - // create client/server fds - const fds = try createSocketTestHarness(&ring); - defer fds.close(); + // Socket bound, expect shutdown to work + { + // TODO: update posix later to use Typed Flags + const server = try posix.socket(address.any.family, @as(u32, @bitCast(linux.Sock{ .type = .stream, .flags = .{ .cloexec = true } })), 0); + defer posix.close(server); + try posix.setsockopt(server, posix.SOL.SOCKET, posix.SO.REUSEADDR, &mem.toBytes(@as(c_int, 1))); + try posix.bind(server, addrAny(&address), @sizeOf(linux.sockaddr.in)); + try posix.listen(server, 1); - // for random user_data in sqe/cqe - var Rnd = std.Random.DefaultPrng.init(std.testing.random_seed); - var rnd = Rnd.random(); + // set address to the OS-chosen IP/port. 
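The @bitCast a few lines up is the bridge the TODO refers to: until posix.socket() accepts the typed flags, the packed struct is converted to the legacy integer argument. A minimal sketch, assuming linux.Sock stays bit-compatible with the classic SOCK_* layout:

    const sock_flags: u32 = @bitCast(linux.Sock{ .type = .stream, .flags = .{ .cloexec = true } });
    const fd = try posix.socket(linux.AF.INET, sock_flags, 0);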
+ var slen: posix.socklen_t = @sizeOf(linux.sockaddr.in); + try posix.getsockname(server, addrAny(&address), &slen); - var round: usize = 4; // repeat send/recv cycle round times - while (round > 0) : (round -= 1) { - // client sends data - const data = [_]u8{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0xa, 0xb, 0xc, 0xd, 0xe }; - { - const user_data = rnd.int(u64); - _ = try ring.send(user_data, fds.client, data[0..], 0); - try testing.expectEqual(@as(u32, 1), try ring.submit()); - const cqe_send = try ring.copy_cqe(); - if (cqe_send.err() == .INVAL) return error.SkipZigTest; - try testing.expectEqual(Cqe{ .user_data = user_data, .res = data.len, .flags = .{} }, cqe_send); - } - var pos: usize = 0; + const shutdown_sqe = try ring.shutdown(0x445445445, server, .rd); + try testing.expectEqual(Op.SHUTDOWN, shutdown_sqe.opcode); + try testing.expectEqual(@as(i32, server), shutdown_sqe.fd); - // read first chunk - const cqe1 = try buf_grp_recv_submit_get_cqe(&ring, &buf_grp, fds.server, rnd.int(u64)); - var buf = try buf_grp.get(cqe1); - try testing.expectEqualSlices(u8, data[pos..][0..buf.len], buf); - pos += buf.len; - // second chunk - const cqe2 = try buf_grp_recv_submit_get_cqe(&ring, &buf_grp, fds.server, rnd.int(u64)); - buf = try buf_grp.get(cqe2); - try testing.expectEqualSlices(u8, data[pos..][0..buf.len], buf); - pos += buf.len; + try testing.expectEqual(@as(u32, 1), try ring.submit()); - // both buffers provided to the kernel are used so we get error - // 'no more buffers', until we put buffers to the kernel - { - const user_data = rnd.int(u64); - _ = try buf_grp.recv(user_data, fds.server, 0); - try testing.expectEqual(@as(u32, 1), try ring.submit()); - const cqe = try ring.copy_cqe(); - try testing.expectEqual(user_data, cqe.user_data); - try testing.expect(cqe.res < 0); // fail - try testing.expectEqual(posix.E.NOBUFS, cqe.err()); - try testing.expect(!cqe.flags.F_BUFFER); // IORING_CQE_F_BUFFER flags is set on success only - try testing.expectError(error.NoBufferSelected, cqe.buffer_id()); + const cqe = try ring.copy_cqe(); + switch (cqe.err()) { + .SUCCESS => {}, + // This kernel's io_uring does not yet implement shutdown (kernel version < 5.11) + .INVAL => return error.SkipZigTest, + else => |errno| std.debug.panic("unhandled errno: {}", .{errno}), } - // put buffers back to the kernel - try buf_grp.put(cqe1); - try buf_grp.put(cqe2); + try testing.expectEqual(Cqe{ + .user_data = 0x445445445, + .res = 0, + .flags = .{}, + }, cqe); + } - // read remaining data - while (pos < data.len) { - const cqe = try buf_grp_recv_submit_get_cqe(&ring, &buf_grp, fds.server, rnd.int(u64)); - buf = try buf_grp.get(cqe); - try testing.expectEqualSlices(u8, data[pos..][0..buf.len], buf); - pos += buf.len; - try buf_grp.put(cqe); - } + // Socket not bound, expect to fail with ENOTCONN + { + const server = try posix.socket(address.family, posix.SOCK.STREAM | posix.SOCK.CLOEXEC, 0); + defer posix.close(server); + + const shutdown_sqe = ring.shutdown(0x445445445, server, .rd) catch |err| switch (err) { + else => |errno| std.debug.panic("unhandled errno: {}", .{errno}), + }; + try testing.expectEqual(Op.SHUTDOWN, shutdown_sqe.opcode); + try testing.expectEqual(@as(i32, server), shutdown_sqe.fd); + + try testing.expectEqual(@as(u32, 1), try ring.submit()); + + const cqe = try ring.copy_cqe(); + try testing.expectEqual(@as(u64, 0x445445445), cqe.user_data); + try testing.expectEqual(linux.E.NOTCONN, cqe.err()); } } -test "ring mapped buffers multishot recv" { +test "renameat" { if (!is_linux) return 
error.SkipZigTest; - var ring = IoUring.init(16, .{}) catch |err| switch (err) { + var ring = IoUring.init(1, .{}) catch |err| switch (err) { error.SystemOutdated => return error.SkipZigTest, error.PermissionDenied => return error.SkipZigTest, else => return err, }; defer ring.deinit(); - // init buffer group - const group_id: u16 = 1; // buffers group id - const buffers_count: u16 = 2; // number of buffers in buffer group - const buffer_size: usize = 4; // size of each buffer in group - var buf_grp = BufferGroup.init( - &ring, - testing.allocator, - group_id, - buffer_size, - buffers_count, - ) catch |err| switch (err) { - // kernel older than 5.19 - error.ArgumentsInvalid => return error.SkipZigTest, + const old_path = "test_io_uring_renameat_old"; + const new_path = "test_io_uring_renameat_new"; + + var tmp = std.testing.tmpDir(.{}); + defer tmp.cleanup(); + + // Write old file with data + + const old_file = try tmp.dir.createFile(old_path, .{ .truncate = true, .mode = 0o666 }); + defer old_file.close(); + try old_file.writeAll("hello"); + + // Submit renameat + + const sqe = try ring.renameat( + 0x12121212, + tmp.dir.fd, + old_path, + tmp.dir.fd, + new_path, + .{}, + ); + try testing.expectEqual(Op.RENAMEAT, sqe.opcode); + try testing.expectEqual(@as(i32, tmp.dir.fd), sqe.fd); + try testing.expectEqual(@as(i32, tmp.dir.fd), @as(i32, @bitCast(sqe.len))); + try testing.expectEqual(@as(u32, 1), try ring.submit()); + + const cqe = try ring.copy_cqe(); + switch (cqe.err()) { + .SUCCESS => {}, + // This kernel's io_uring does not yet implement renameat (kernel version < 5.11) + .BADF, .INVAL => return error.SkipZigTest, + else => |errno| std.debug.panic("unhandled errno: {}", .{errno}), + } + try testing.expectEqual(Cqe{ + .user_data = 0x12121212, + .res = 0, + .flags = .{}, + }, cqe); + + // Validate that the old file doesn't exist anymore + try testing.expectError(error.FileNotFound, tmp.dir.openFile(old_path, .{})); + + // Validate that the new file exists with the proper content + var new_file_data: [16]u8 = undefined; + try testing.expectEqualStrings("hello", try tmp.dir.readFile(new_path, &new_file_data)); +} + +test "unlinkat" { + if (!is_linux) return error.SkipZigTest; + + var ring = IoUring.init(1, .{}) catch |err| switch (err) { + error.SystemOutdated => return error.SkipZigTest, + error.PermissionDenied => return error.SkipZigTest, else => return err, }; - defer buf_grp.deinit(testing.allocator); + defer ring.deinit(); - // create client/server fds - const fds = try createSocketTestHarness(&ring); - defer fds.close(); + const path = "test_io_uring_unlinkat"; - // for random user_data in sqe/cqe - var Rnd = std.Random.DefaultPrng.init(std.testing.random_seed); - var rnd = Rnd.random(); + var tmp = std.testing.tmpDir(.{}); + defer tmp.cleanup(); - var round: usize = 4; // repeat send/recv cycle round times - while (round > 0) : (round -= 1) { - // client sends data - const data = [_]u8{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf }; - { - const user_data = rnd.int(u64); - _ = try ring.send(user_data, fds.client, data[0..], 0); - try testing.expectEqual(@as(u32, 1), try ring.submit()); - const cqe_send = try ring.copy_cqe(); - if (cqe_send.err() == .INVAL) return error.SkipZigTest; - try testing.expectEqual(Cqe{ .user_data = user_data, .res = data.len, .flags = .{} }, cqe_send); - } + // Write old file with data + + const file = try tmp.dir.createFile(path, .{ .truncate = true, .mode = 0o666 }); + defer file.close(); + + // Submit unlinkat + + const sqe = try 
ring.unlinkat( + 0x12121212, + tmp.dir.fd, + path, + .{}, + ); + try testing.expectEqual(Op.UNLINKAT, sqe.opcode); + try testing.expectEqual(@as(i32, tmp.dir.fd), sqe.fd); + try testing.expectEqual(@as(u32, 1), try ring.submit()); + + const cqe = try ring.copy_cqe(); + switch (cqe.err()) { + .SUCCESS => {}, + // This kernel's io_uring does not yet implement unlinkat (kernel version < 5.11) + .BADF, .INVAL => return error.SkipZigTest, + else => |errno| std.debug.panic("unhandled errno: {}", .{errno}), + } + try testing.expectEqual(Cqe{ + .user_data = 0x12121212, + .res = 0, + .flags = .{}, + }, cqe); - // start multishot recv - var recv_user_data = rnd.int(u64); - _ = try buf_grp.recv_multishot(recv_user_data, fds.server, 0); - try testing.expectEqual(@as(u32, 1), try ring.submit()); // submit + // Validate that the file doesn't exist anymore + _ = tmp.dir.openFile(path, .{}) catch |err| switch (err) { + error.FileNotFound => {}, + else => std.debug.panic("unexpected error: {}", .{err}), + }; +} - // server reads data into provided buffers - // there are 2 buffers of size 4, so each read gets only chunk of data - // we read four chunks of 4, 4, 4, 4 bytes each - var chunk: []const u8 = data[0..buffer_size]; // first chunk - const cqe1 = try expect_buf_grp_cqe(&ring, &buf_grp, recv_user_data, chunk); - try testing.expect(cqe1.flags.F_MORE); +test "mkdirat" { + if (!is_linux) return error.SkipZigTest; - chunk = data[buffer_size .. buffer_size * 2]; // second chunk - const cqe2 = try expect_buf_grp_cqe(&ring, &buf_grp, recv_user_data, chunk); - try testing.expect(cqe2.flags.F_MORE); + var ring = IoUring.init(1, .{}) catch |err| switch (err) { + error.SystemOutdated => return error.SkipZigTest, + error.PermissionDenied => return error.SkipZigTest, + else => return err, + }; + defer ring.deinit(); - // both buffers provided to the kernel are used so we get error - // 'no more buffers', until we put buffers to the kernel - { - const cqe = try ring.copy_cqe(); - try testing.expectEqual(recv_user_data, cqe.user_data); - try testing.expect(cqe.res < 0); // fail - try testing.expectEqual(posix.E.NOBUFS, cqe.err()); - try testing.expect(!cqe.flags.F_BUFFER); // IORING_CQE_F_BUFFER flags is set on success only - // has more is not set - // indicates that multishot is finished - try testing.expect(!cqe.flags.F_MORE); - try testing.expectError(error.NoBufferSelected, cqe.buffer_id()); - } + var tmp = std.testing.tmpDir(.{}); + defer tmp.cleanup(); - // put buffers back to the kernel - try buf_grp.put(cqe1); - try buf_grp.put(cqe2); + const path = "test_io_uring_mkdirat"; - // restart multishot - recv_user_data = rnd.int(u64); - _ = try buf_grp.recv_multishot(recv_user_data, fds.server, 0); - try testing.expectEqual(@as(u32, 1), try ring.submit()); // submit + // Submit mkdirat - chunk = data[buffer_size * 2 .. 
buffer_size * 3]; // third chunk
-        const cqe3 = try expect_buf_grp_cqe(&ring, &buf_grp, recv_user_data, chunk);
-        try testing.expect(cqe3.flags.F_MORE);
-        try buf_grp.put(cqe3);
+    const sqe = try ring.mkdirat(
+        0x12121212,
+        tmp.dir.fd,
+        path,
+        0o0755,
+    );
+    try testing.expectEqual(Op.MKDIRAT, sqe.opcode);
+    try testing.expectEqual(@as(i32, tmp.dir.fd), sqe.fd);
+    try testing.expectEqual(@as(u32, 1), try ring.submit());

-        chunk = data[buffer_size * 3 ..]; // last chunk
-        const cqe4 = try expect_buf_grp_cqe(&ring, &buf_grp, recv_user_data, chunk);
-        try testing.expect(cqe4.flags.F_MORE);
-        try buf_grp.put(cqe4);
+    const cqe = try ring.copy_cqe();
+    switch (cqe.err()) {
+        .SUCCESS => {},
+        // This kernel's io_uring does not yet implement mkdirat (kernel version < 5.15)
+        .BADF, .INVAL => return error.SkipZigTest,
+        else => |errno| std.debug.panic("unhandled errno: {}", .{errno}),
+    }
+    try testing.expectEqual(Cqe{
+        .user_data = 0x12121212,
+        .res = 0,
+        .flags = .{},
+    }, cqe);

-        // cancel pending multishot recv operation
-        {
-            const cancel_user_data = rnd.int(u64);
-            _ = try ring.cancel(cancel_user_data, recv_user_data, 0);
-            try testing.expectEqual(@as(u32, 1), try ring.submit());
+    // Validate that the directory exists
+    _ = try tmp.dir.openDir(path, .{});
+}

-            // expect completion of cancel operation and completion of recv operation
-            var cqe_cancel = try ring.copy_cqe();
-            if (cqe_cancel.err() == .INVAL) return error.SkipZigTest;
-            var cqe_recv = try ring.copy_cqe();
-            if (cqe_recv.err() == .INVAL) return error.SkipZigTest;
+test "symlinkat" {
+    if (!is_linux) return error.SkipZigTest;

-            // don't depend on order of completions
-            if (cqe_cancel.user_data == recv_user_data and cqe_recv.user_data == cancel_user_data) {
-                const a = cqe_cancel;
-                const b = cqe_recv;
-                cqe_cancel = b;
-                cqe_recv = a;
-            }
+    var ring = IoUring.init(1, .{}) catch |err| switch (err) {
+        error.SystemOutdated => return error.SkipZigTest,
+        error.PermissionDenied => return error.SkipZigTest,
+        else => return err,
+    };
+    defer ring.deinit();

-            // Note on different kernel results:
-            // on older kernel (tested with v6.0.16, v6.1.57, v6.2.12, v6.4.16)
-            //   cqe_cancel.err() == .NOENT
-            //   cqe_recv.err() == .NOBUFS
-            // on kernel (tested with v6.5.0, v6.5.7)
-            //   cqe_cancel.err() == .SUCCESS
-            //   cqe_recv.err() == .CANCELED
-            // Upstream reference: https://github.com/axboe/liburing/issues/984
+    var tmp = std.testing.tmpDir(.{});
+    defer tmp.cleanup();

-            // cancel operation is success (or NOENT on older kernels)
-            try testing.expectEqual(cancel_user_data, cqe_cancel.user_data);
-            try testing.expect(cqe_cancel.err() == .NOENT or cqe_cancel.err() == .SUCCESS);
+    const path = "test_io_uring_symlinkat";
+    const link_path = "test_io_uring_symlinkat_link";

-            // recv operation is failed with err CANCELED (or NOBUFS on older kernels)
-            try testing.expectEqual(recv_user_data, cqe_recv.user_data);
-            try testing.expect(cqe_recv.res < 0);
-            try testing.expect(cqe_recv.err() == .NOBUFS or cqe_recv.err() == .CANCELED);
-            try testing.expect(!cqe_recv.flags.F_MORE);
-        }
-    }
-}
+    const file = try tmp.dir.createFile(path, .{ .truncate = true, .mode = 0o666 });
+    defer file.close();

-// Prepare, submit recv and get cqe using buffer group.
-fn buf_grp_recv_submit_get_cqe(
-    ring: *IoUring,
-    buf_grp: *BufferGroup,
-    fd: linux.fd_t,
-    user_data: u64,
-) !Cqe {
-    // prepare and submit recv
-    const sqe = try buf_grp.recv(user_data, fd, 0);
-    try testing.expect(sqe.flags.BUFFER_SELECT);
-    try testing.expect(sqe.buf_index == buf_grp.group_id);
-    try testing.expectEqual(@as(u32, 1), try ring.submit()); // submit
-    // get cqe, expect success
-    const cqe = try ring.copy_cqe();
-    try testing.expectEqual(user_data, cqe.user_data);
-    try testing.expect(cqe.res >= 0); // success
-    try testing.expectEqual(posix.E.SUCCESS, cqe.err());
-    try testing.expect(cqe.flags.F_BUFFER); // IORING_CQE_F_BUFFER flag is set
+    // Submit symlinkat

-    return cqe;
-}
+    const sqe = try ring.symlinkat(
+        0x12121212,
+        path,
+        tmp.dir.fd,
+        link_path,
+    );
+    try testing.expectEqual(Op.SYMLINKAT, sqe.opcode);
+    try testing.expectEqual(@as(i32, tmp.dir.fd), sqe.fd);
+    try testing.expectEqual(@as(u32, 1), try ring.submit());

-fn expect_buf_grp_cqe(
-    ring: *IoUring,
-    buf_grp: *BufferGroup,
-    user_data: u64,
-    expected: []const u8,
-) !Cqe {
-    // get cqe
     const cqe = try ring.copy_cqe();
-    try testing.expectEqual(user_data, cqe.user_data);
-    try testing.expect(cqe.res >= 0); // success
-    try testing.expect(cqe.flags.F_BUFFER); // IORING_CQE_F_BUFFER flag is set
-    try testing.expectEqual(expected.len, @as(usize, @intCast(cqe.res)));
-    try testing.expectEqual(posix.E.SUCCESS, cqe.err());
-
-    // get buffer from pool
-    const buffer_id = try cqe.buffer_id();
-    const len = @as(usize, @intCast(cqe.res));
-    const buf = buf_grp.get_by_id(buffer_id)[0..len];
-    try testing.expectEqualSlices(u8, expected, buf);
+    switch (cqe.err()) {
+        .SUCCESS => {},
+        // This kernel's io_uring does not yet implement symlinkat (kernel version < 5.15)
+        .BADF, .INVAL => return error.SkipZigTest,
+        else => |errno| std.debug.panic("unhandled errno: {}", .{errno}),
+    }
+    try testing.expectEqual(Cqe{
+        .user_data = 0x12121212,
+        .res = 0,
+        .flags = .{},
+    }, cqe);

-    return cqe;
+    // Validate that the symlink exists
+    _ = try tmp.dir.openFile(link_path, .{});
 }

-test "copy_cqes with wrapping sq.cqes buffer" {
+test "linkat" {
     if (!is_linux) return error.SkipZigTest;

-    var ring = IoUring.init(2, .{}) catch |err| switch (err) {
+    var ring = IoUring.init(1, .{}) catch |err| switch (err) {
         error.SystemOutdated => return error.SkipZigTest,
         error.PermissionDenied => return error.SkipZigTest,
         else => return err,
     };
     defer ring.deinit();

-    try testing.expectEqual(2, ring.sq.sqes.len);
-    try testing.expectEqual(4, ring.cq.cqes.len);
+    var tmp = std.testing.tmpDir(.{});
+    defer tmp.cleanup();

-    // submit 2 entries, receive 2 completions
-    var cqes: [8]Cqe = undefined;
-    {
-        for (0..2) |_| {
-            const sqe = try ring.get_sqe();
-            sqe.prep_timeout(&.{ .sec = 0, .nsec = 10000 }, 0, .{});
-            try testing.expect(try ring.submit() == 1);
-        }
-        var cqe_count: u32 = 0;
-        while (cqe_count < 2) {
-            cqe_count += try ring.copy_cqes(&cqes, 2 - cqe_count);
-        }
-    }
+    const first_path = "test_io_uring_linkat_first";
+    const second_path = "test_io_uring_linkat_second";

-    try testing.expectEqual(2, ring.cq.head.*);
+    // Write file with data

-    // sq.sqes len is 4, starting at position 2
-    // every 4 entries submit wraps completion buffer
-    // we are reading ring.cq.cqes at indexes 2,3,0,1
-    for (1..1024) |i| {
-        for (0..4) |_| {
-            const sqe = try ring.get_sqe();
-            sqe.prep_timeout(&.{ .sec = 0, .nsec = 10000 }, 0, .{});
-            try testing.expect(try ring.submit() == 1);
-        }
-        var cqe_count: u32 = 0;
-        while (cqe_count < 4) {
-            cqe_count
+= try ring.copy_cqes(&cqes, 4 - cqe_count); - } - try testing.expectEqual(4, cqe_count); - try testing.expectEqual(2 + 4 * i, ring.cq.head.*); + const first_file = try tmp.dir.createFile(first_path, .{ .truncate = true, .mode = 0o666 }); + defer first_file.close(); + try first_file.writeAll("hello"); + + // Submit linkat + + const sqe = try ring.linkat( + 0x12121212, + tmp.dir.fd, + first_path, + tmp.dir.fd, + second_path, + .{}, + ); + try testing.expectEqual(Op.LINKAT, sqe.opcode); + try testing.expectEqual(@as(i32, tmp.dir.fd), sqe.fd); + try testing.expectEqual(@as(i32, tmp.dir.fd), @as(i32, @bitCast(sqe.len))); + try testing.expectEqual(@as(u32, 1), try ring.submit()); + + const cqe = try ring.copy_cqe(); + switch (cqe.err()) { + .SUCCESS => {}, + // This kernel's io_uring does not yet implement linkat (kernel version < 5.15) + .BADF, .INVAL => return error.SkipZigTest, + else => |errno| std.debug.panic("unhandled errno: {}", .{errno}), } + try testing.expectEqual(Cqe{ + .user_data = 0x12121212, + .res = 0, + .flags = .{}, + }, cqe); + + // Validate the second file + var second_file_data: [16]u8 = undefined; + try testing.expectEqualStrings("hello", try tmp.dir.readFile(second_path, &second_file_data)); } -test "bind/listen/connect" { - if (builtin.cpu.arch == .s390x) return error.SkipZigTest; // https://github.com/ziglang/zig/issues/25956 +test "provide_buffers: read" { + if (!is_linux) return error.SkipZigTest; - var ring = IoUring.init(4, .{}) catch |err| switch (err) { + var ring = IoUring.init(1, .{}) catch |err| switch (err) { error.SystemOutdated => return error.SkipZigTest, error.PermissionDenied => return error.SkipZigTest, else => return err, }; defer ring.deinit(); - const probe = ring.get_probe() catch return error.SkipZigTest; - // LISTEN is higher required operation - if (!probe.is_supported(.LISTEN)) return error.SkipZigTest; - - var addr: linux.sockaddr.in = .{ - .port = 0, - .addr = @bitCast([4]u8{ 127, 0, 0, 1 }), - }; - const proto: u32 = if (addr.family == linux.AF.UNIX) 0 else linux.IPPROTO.TCP; - - const listen_fd = brk: { - // Create socket - _ = try ring.socket(1, addr.any.family, .{ .type = .stream, .flags = .{ .cloexec = true } }, proto, 0); - try testing.expectEqual(1, try ring.submit()); - var cqe = try ring.copy_cqe(); - try testing.expectEqual(1, cqe.user_data); - try testing.expectEqual(posix.E.SUCCESS, cqe.err()); - const listen_fd: linux.fd_t = @intCast(cqe.res); - try testing.expect(listen_fd > 2); + const fd = try posix.openZ("/dev/zero", .{ .ACCMODE = .RDONLY, .CLOEXEC = true }, 0); + defer posix.close(fd); - // Prepare: set socket option * 2, bind, listen - var optval: u32 = 1; - (try ring.setsockopt(2, listen_fd, linux.SOL.SOCKET, linux.SO.REUSEADDR, mem.asBytes(&optval))).link_next(); - (try ring.setsockopt(3, listen_fd, linux.SOL.SOCKET, linux.SO.REUSEPORT, mem.asBytes(&optval))).link_next(); - (try ring.bind(4, listen_fd, addrAny(&addr), @sizeOf(linux.sockaddr.in), 0)).link_next(); - _ = try ring.listen(5, listen_fd, 1, 0); - // Submit 4 operations - try testing.expectEqual(4, try ring.submit()); - // Expect all to succeed - for (2..6) |user_data| { - cqe = try ring.copy_cqe(); - try testing.expectEqual(user_data, cqe.user_data); - try testing.expectEqual(posix.E.SUCCESS, cqe.err()); - } + const group_id = 1337; + const buffer_id = 0; - // Check that socket option is set - optval = 0; - _ = try ring.getsockopt(5, listen_fd, linux.SOL.SOCKET, linux.SO.REUSEADDR, mem.asBytes(&optval)); - try testing.expectEqual(1, try ring.submit()); - cqe = try 
ring.copy_cqe(); - try testing.expectEqual(5, cqe.user_data); - try testing.expectEqual(posix.E.SUCCESS, cqe.err()); - try testing.expectEqual(1, optval); + const buffer_len = 128; - // Read system assigned port into addr - var addr_len: posix.socklen_t = @sizeOf(linux.sockaddr.in); - try posix.getsockname(listen_fd, addrAny(&addr), &addr_len); + var buffers: [4][buffer_len]u8 = undefined; - break :brk listen_fd; - }; + // Provide 4 buffers - const connect_fd = brk: { - // Create connect socket - _ = try ring.socket(6, addr.family, linux.SOCK.STREAM | linux.SOCK.CLOEXEC, proto, 0); - try testing.expectEqual(1, try ring.submit()); - const cqe = try ring.copy_cqe(); - try testing.expectEqual(6, cqe.user_data); - try testing.expectEqual(posix.E.SUCCESS, cqe.err()); - // Get connect socket fd - const connect_fd: linux.fd_t = @intCast(cqe.res); - try testing.expect(connect_fd > 2 and connect_fd != listen_fd); - break :brk connect_fd; - }; + { + const sqe = try ring.provide_buffers(0xcccccccc, @as([*]u8, @ptrCast(&buffers)), buffer_len, buffers.len, group_id, buffer_id); + try testing.expectEqual(Op.PROVIDE_BUFFERS, sqe.opcode); + try testing.expectEqual(@as(i32, buffers.len), sqe.fd); + try testing.expectEqual(@as(u32, buffers[0].len), sqe.len); + try testing.expectEqual(@as(u16, group_id), sqe.buf_index); + try testing.expectEqual(@as(u32, 1), try ring.submit()); - // Prepare accept/connect operations - _ = try ring.accept(7, listen_fd, null, null, 0); - _ = try ring.connect(8, connect_fd, addrAny(&addr), @sizeOf(linux.sockaddr.in)); - try testing.expectEqual(2, try ring.submit()); - // Get listener accepted socket - var accept_fd: posix.socket_t = 0; - for (0..2) |_| { const cqe = try ring.copy_cqe(); - try testing.expectEqual(posix.E.SUCCESS, cqe.err()); - if (cqe.user_data == 7) { - accept_fd = @intCast(cqe.res); - } else { - try testing.expectEqual(8, cqe.user_data); + switch (cqe.err()) { + // Happens when the kernel is < 5.7 + .INVAL, .BADF => return error.SkipZigTest, + .SUCCESS => {}, + else => |errno| std.debug.panic("unhandled errno: {}", .{errno}), } + try testing.expectEqual(@as(u64, 0xcccccccc), cqe.user_data); } - try testing.expect(accept_fd > 2 and accept_fd != listen_fd and accept_fd != connect_fd); - // Communicate - try testSendRecv(&ring, connect_fd, accept_fd); - try testSendRecv(&ring, accept_fd, connect_fd); + // Do 4 reads which should consume all buffers - // Shutdown and close all sockets - for ([_]posix.socket_t{ connect_fd, accept_fd, listen_fd }) |fd| { - (try ring.shutdown(9, fd, posix.SHUT.RDWR)).link_next(); - _ = try ring.close(10, fd); - try testing.expectEqual(2, try ring.submit()); - for (0..2) |i| { - const cqe = try ring.copy_cqe(); - try testing.expectEqual(posix.E.SUCCESS, cqe.err()); - try testing.expectEqual(9 + i, cqe.user_data); + var i: usize = 0; + while (i < buffers.len) : (i += 1) { + const sqe = try ring.read(0xdededede, fd, .{ .buffer_selection = .{ .group_id = group_id, .len = buffer_len } }, 0); + try testing.expectEqual(Op.READ, sqe.opcode); + try testing.expectEqual(@as(i32, fd), sqe.fd); + try testing.expectEqual(@as(u64, 0), sqe.addr); + try testing.expectEqual(@as(u32, buffer_len), sqe.len); + try testing.expectEqual(@as(u16, group_id), sqe.buf_index); + try testing.expectEqual(@as(u32, 1), try ring.submit()); + + const cqe = try ring.copy_cqe(); + switch (cqe.err()) { + .SUCCESS => {}, + else => |errno| std.debug.panic("unhandled errno: {}", .{errno}), } - } -} -fn testSendRecv(ring: *IoUring, send_fd: posix.socket_t, recv_fd: 
posix.socket_t) !void { - const buffer_send = "0123456789abcdf" ** 10; - var buffer_recv: [buffer_send.len * 2]u8 = undefined; + try testing.expect(cqe.flags.F_BUFFER); + const used_buffer_id = try cqe.buffer_id(); + try testing.expect(used_buffer_id >= 0 and used_buffer_id <= 3); + try testing.expectEqual(@as(i32, buffer_len), cqe.res); - // 2 sends - _ = try ring.send(1, send_fd, buffer_send, linux.MSG.WAITALL); - _ = try ring.send(2, send_fd, buffer_send, linux.MSG.WAITALL); - try testing.expectEqual(2, try ring.submit()); - for (0..2) |i| { - const cqe = try ring.copy_cqe(); - try testing.expectEqual(1 + i, cqe.user_data); - try testing.expectEqual(posix.E.SUCCESS, cqe.err()); - try testing.expectEqual(buffer_send.len, @as(usize, @intCast(cqe.res))); + try testing.expectEqual(@as(u64, 0xdededede), cqe.user_data); + try testing.expectEqualSlices(u8, &([_]u8{0} ** buffer_len), buffers[used_buffer_id][0..@as(usize, @intCast(cqe.res))]); } - // receive - var recv_len: usize = 0; - while (recv_len < buffer_send.len * 2) { - _ = try ring.recv(3, recv_fd, .{ .buffer = buffer_recv[recv_len..] }, 0); - try testing.expectEqual(1, try ring.submit()); + // This read should fail + + { + const sqe = try ring.read(0xdfdfdfdf, fd, .{ .buffer_selection = .{ .group_id = group_id, .len = buffer_len } }, 0); + try testing.expectEqual(Op.READ, sqe.opcode); + try testing.expectEqual(@as(i32, fd), sqe.fd); + try testing.expectEqual(@as(u64, 0), sqe.addr); + try testing.expectEqual(@as(u32, buffer_len), sqe.len); + try testing.expectEqual(@as(u16, group_id), sqe.buf_index); + try testing.expectEqual(@as(u32, 1), try ring.submit()); + const cqe = try ring.copy_cqe(); - try testing.expectEqual(3, cqe.user_data); - try testing.expectEqual(posix.E.SUCCESS, cqe.err()); - recv_len += @intCast(cqe.res); + switch (cqe.err()) { + // Expected + .NOBUFS => {}, + .SUCCESS => std.debug.panic("unexpected success", .{}), + else => |errno| std.debug.panic("unhandled errno: {}", .{errno}), + } + try testing.expectEqual(@as(u64, 0xdfdfdfdf), cqe.user_data); } - // inspect recv buffer - try testing.expectEqualSlices(u8, buffer_send, buffer_recv[0..buffer_send.len]); - try testing.expectEqualSlices(u8, buffer_send, buffer_recv[buffer_send.len..]); -} + // Provide 1 buffer again -fn addrAny(addr: *linux.sockaddr.in) *linux.sockaddr { - return @ptrCast(addr); -} -// IO completion data structure (Completion Queue Entry) -pub const Cqe = extern struct { - /// sqe.user_data value passed back - user_data: u64, - /// result code for this event - res: i32, - flags: Flags, - // COMMIT: add big_cqe which was missing in io_uring_cqe type declaration - // TODO: add support for the IORING_SETUP_CQE32 case - /// If the ring is initialized with IORING_SETUP_CQE32, then this field - /// contains 16-bytes of padding, doubling the size of the CQE. - // big_cqe: ?[2]u64, + // Deliberately put something we don't expect in the buffers + @memset(mem.sliceAsBytes(&buffers), 42); - /// cqe.flags - pub const Flags = packed struct(u32) { - /// IORING_CQE_F_BUFFER If set, the upper 16 bits are the buffer ID - F_BUFFER: bool = false, - /// IORING_CQE_F_MORE If set, parent SQE will generate more CQE entries - F_MORE: bool = false, - /// IORING_CQE_F_SOCK_NONEMPTY If set, more data to read after socket recv - F_SOCK_NONEMPTY: bool = false, - /// IORING_CQE_F_NOTIF Set for notification CQEs. Can be used to distinct - /// them from sends. 
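Since F_BUFFER stores the selected buffer ID in the upper 16 bits of cqe.flags, the decoding that buffer_id() performs below reduces to this sketch:

    const raw: u32 = @bitCast(cqe.flags);
    const selected: u16 = @intCast(raw >> 16); // i.e. raw >> CQE_BUFFER_SHIFT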
- F_NOTIF: bool = false, - /// IORING_CQE_F_BUF_MORE If set, the buffer ID set in the completion will get - /// more completions. In other words, the buffer is being - /// partially consumed, and will be used by the kernel for - /// more completions. This is only set for buffers used via - /// the incremental buffer consumption, as provided by - /// a ring buffer setup with IOU_PBUF_RING_INC. For any - /// other provided buffer type, all completions with a - /// buffer passed back is automatically returned to the - /// application. - F_BUF_MORE: bool = false, - // COMMIT: new flags - /// IORING_CQE_F_SKIP If set, then the application/liburing must ignore this - /// CQE. It's only purpose is to fill a gap in the ring, - /// if a large CQE is attempted posted when the ring has - /// just a single small CQE worth of space left before - /// wrapping. - F_SKIP: bool = false, - _unused: u9 = 0, - /// IORING_CQE_F_32 If set, this is a 32b/big-cqe posting. Use with rings - /// setup in a mixed CQE mode, where both 16b and 32b - /// CQEs may be posted to the CQ ring. - F_32: bool = false, - _unused_1: u16 = 0, - }; + const reprovided_buffer_id = 2; - pub fn err(self: Cqe) linux.E { - if (self.res > -4096 and self.res < 0) { - return @as(linux.E, @enumFromInt(-self.res)); - } - return .SUCCESS; - } + { + _ = try ring.provide_buffers(0xabababab, @as([*]u8, @ptrCast(&buffers[reprovided_buffer_id])), buffer_len, 1, group_id, reprovided_buffer_id); + try testing.expectEqual(@as(u32, 1), try ring.submit()); - // On successful completion of the provided buffers IO request, the CQE flags field - // will have IORING_CQE_F_BUFFER set and the selected buffer ID will be indicated by - // the upper 16-bits of the flags field. - pub fn buffer_id(self: Cqe) !u16 { - if (!self.flags.F_BUFFER) { - return error.NoBufferSelected; + const cqe = try ring.copy_cqe(); + switch (cqe.err()) { + .SUCCESS => {}, + else => |errno| std.debug.panic("unhandled errno: {}", .{errno}), } - return @intCast(@as(u32, @bitCast(self.flags)) >> constants.CQE_BUFFER_SHIFT); } -}; -/// IO submission data structure (Submission Queue Entry) -/// matches io_uring_sqe in liburing -pub const Sqe = extern struct { - /// type of operation for this sqe - opcode: Op, - /// IOSQE_* flags - flags: IoSqe, - /// ioprio for the request - ioprio: packed union { - send_recv: SendRecv, - accept: Accept, - const Ioprio = @This(); + // Final read which should work - pub fn init_empty() Ioprio { - return @bitCast(@as(u16, 0)); + { + const sqe = try ring.read(0xdfdfdfdf, fd, .{ .buffer_selection = .{ .group_id = group_id, .len = buffer_len } }, 0); + try testing.expectEqual(Op.READ, sqe.opcode); + try testing.expectEqual(@as(i32, fd), sqe.fd); + try testing.expectEqual(@as(u64, 0), sqe.addr); + try testing.expectEqual(@as(u32, buffer_len), sqe.len); + try testing.expectEqual(@as(u16, group_id), sqe.buf_index); + try testing.expectEqual(@as(u32, 1), try ring.submit()); + + const cqe = try ring.copy_cqe(); + switch (cqe.err()) { + .SUCCESS => {}, + else => |errno| std.debug.panic("unhandled errno: {}", .{errno}), } - }, - /// file descriptor to do IO on - fd: i32, - /// offset into file - off: u64, - /// pointer to buffer or iovecs - addr: u64, - /// buffer size or number of iovecs - len: u32, - /// flags for any sqe operation - /// rw_flags | fsync_flags | poll_event | poll32_event | sync_range_flags | msg_flags - /// timeout_flags | accept_flags | cancel_flags | open_flags | statx_flags - /// fadvise_advice | splice_flags | rename_flags | unlink_flags | 
hardlink_flags - /// xattr_flags | msg_ring_flags | uring_cmd_flags | waitid_flags | futex_flags - /// install_fd_flags | nop_flags | pipe_flags - rw_flags: u32, - /// data to be passed back at completion time - user_data: u64, - /// index into fixed buffers or for grouped buffer selection - buf_index: u16, - personality: u16, - splice_fd_in: i32, - addr3: u64, - resv: u64, - /// sqe.flags - pub const IoSqe = packed struct(u8) { - /// use fixed fileset - FIXED_FILE: bool = false, - /// issue after inflight IO - IO_DRAIN: bool = false, - /// links next sqe - IO_LINK: bool = false, - /// like LINK, but stronger - IO_HARDLINK: bool = false, - /// always go async - ASYNC: bool = false, - /// select buffer from sqe->buf_group - BUFFER_SELECT: bool = false, - /// don't post CQE if request succeeded - CQE_SKIP_SUCCESS: bool = false, - _: u1 = 0, - }; + try testing.expect(cqe.flags.F_BUFFER); + const used_buffer_id = try cqe.buffer_id(); + try testing.expectEqual(used_buffer_id, reprovided_buffer_id); + try testing.expectEqual(@as(i32, buffer_len), cqe.res); + try testing.expectEqual(@as(u64, 0xdfdfdfdf), cqe.user_data); + try testing.expectEqualSlices(u8, &([_]u8{0} ** buffer_len), buffers[used_buffer_id][0..@as(usize, @intCast(cqe.res))]); + } +} - /// send/sendmsg and recv/recvmsg flags (sqe.ioprio) - pub const SendRecv = packed struct(u16) { - /// IORING_RECVSEND_POLL_FIRST - /// If set, instead of first attempting to send or receive and arm poll - /// if that yields an -EAGAIN result, arm poll upfront and skip the - /// initial transfer attempt. - RECVSEND_POLL_FIRST: bool = false, - /// IORING_RECV_MULTISHOT - /// Multishot recv. Sets IORING_CQE_F_MORE if the handler will continue - /// to report CQEs on behalf of the same SQE. - RECV_MULTISHOT: bool = false, - /// IORING_RECVSEND_FIXED_BUF - /// Use registered buffers, the index is stored in the buf_index field. - RECVSEND_FIXED_BUF: bool = false, - /// IORING_SEND_ZC_REPORT_USAGE - /// If set, SEND[MSG]_ZC should report the zerocopy usage in cqe.res - /// for the IORING_CQE_F_NOTIF cqe. 0 is reported if zerocopy was - /// actually possible. IORING_NOTIF_USAGE_ZC_COPIED if data was copied - /// (at least partially). - SEND_ZC_REPORT_USAGE: bool = false, - /// IORING_RECVSEND_BUNDLE - /// Used with IOSQE_BUFFER_SELECT. If set, send or recv will grab as - /// many buffers from the buffer group ID given and send them all. - /// The completion result will be the number of buffers send, with the - /// starting buffer ID in cqe.flags as per usual for provided buffer - /// usage. The buffers will be contiguous from the starting buffer ID. - RECVSEND_BUNDLE: bool = false, - // COMMIT: new flags - /// IORING_SEND_VECTORIZED - /// If set, SEND[_ZC] will take a pointer to a io_vec to allow - /// vectorized send operations. 
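Note that these send/recv flags travel in the sqe.ioprio union rather than in rw_flags; e.g. the multishot-recv prep helpers further down amount to a one-line sketch:

    sqe.ioprio = .{ .send_recv = .{ .RECV_MULTISHOT = true } };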
- SEND_VECTORIZED: bool = false, - _: u10 = 0, - }; +test "remove_buffers" { + if (!is_linux) return error.SkipZigTest; - /// accept flags stored in sqe.ioprio - pub const Accept = packed struct(u16) { - MULTISHOT: bool = false, - // COMMIT: new Flags - DONTWAIT: bool = false, - POLL_FIRST: bool = false, - _unused: u13 = 0, + var ring = IoUring.init(1, .{}) catch |err| switch (err) { + error.SystemOutdated => return error.SkipZigTest, + error.PermissionDenied => return error.SkipZigTest, + else => return err, }; + defer ring.deinit(); - pub fn prep_nop(sqe: *Sqe) void { - sqe.* = .{ - .opcode = .NOP, - .flags = .{}, - .ioprio = .init_empty(), - .fd = 0, - .off = 0, - .addr = 0, - .len = 0, - .rw_flags = 0, - .user_data = 0, - .buf_index = 0, - .personality = 0, - .splice_fd_in = 0, - .addr3 = 0, - .resv = 0, - }; - } + const fd = try posix.openZ("/dev/zero", .{ .ACCMODE = .RDONLY, .CLOEXEC = true }, 0); + defer posix.close(fd); - pub fn prep_fsync(sqe: *Sqe, fd: linux.fd_t, flags: uflags.Fsync) void { - sqe.* = .{ - .opcode = .FSYNC, - .flags = .{}, - .ioprio = .init_empty(), - .fd = fd, - .off = 0, - .addr = 0, - .len = 0, - .rw_flags = @bitCast(flags), - .user_data = 0, - .buf_index = 0, - .personality = 0, - .splice_fd_in = 0, - .addr3 = 0, - .resv = 0, - }; - } + const group_id = 1337; + const buffer_id = 0; - pub fn prep_rw( - sqe: *Sqe, - op: Op, - fd: linux.fd_t, - addr: u64, - len: usize, - offset: u64, - ) void { - sqe.* = .{ - .opcode = op, - .flags = .{}, - .ioprio = .init_empty(), - .fd = fd, - .off = offset, - .addr = addr, - .len = @intCast(len), - .rw_flags = 0, - .user_data = 0, - .buf_index = 0, - .personality = 0, - .splice_fd_in = 0, - .addr3 = 0, - .resv = 0, - }; - } + const buffer_len = 128; - pub fn prep_read(sqe: *Sqe, fd: linux.fd_t, buffer: []u8, offset: u64) void { - sqe.prep_rw(.READ, fd, @intFromPtr(buffer.ptr), buffer.len, offset); - } + var buffers: [4][buffer_len]u8 = undefined; - pub fn prep_write(sqe: *Sqe, fd: linux.fd_t, buffer: []const u8, offset: u64) void { - sqe.prep_rw(.WRITE, fd, @intFromPtr(buffer.ptr), buffer.len, offset); + // Provide 4 buffers + + { + _ = try ring.provide_buffers(0xcccccccc, @as([*]u8, @ptrCast(&buffers)), buffer_len, buffers.len, group_id, buffer_id); + try testing.expectEqual(@as(u32, 1), try ring.submit()); + + const cqe = try ring.copy_cqe(); + switch (cqe.err()) { + .INVAL, .BADF => return error.SkipZigTest, + .SUCCESS => {}, + else => |errno| std.debug.panic("unhandled errno: {}", .{errno}), + } + try testing.expectEqual(@as(u64, 0xcccccccc), cqe.user_data); } - pub fn prep_splice(sqe: *Sqe, fd_in: linux.fd_t, off_in: u64, fd_out: linux.fd_t, off_out: u64, len: usize) void { - sqe.prep_rw(.SPLICE, fd_out, undefined, len, off_out); - sqe.addr = off_in; - sqe.splice_fd_in = fd_in; + // Remove 3 buffers + + { + const sqe = try ring.remove_buffers(0xbababababa, 3, group_id); + try testing.expectEqual(Op.REMOVE_BUFFERS, sqe.opcode); + try testing.expectEqual(@as(i32, 3), sqe.fd); + try testing.expectEqual(@as(u64, 0), sqe.addr); + try testing.expectEqual(@as(u16, group_id), sqe.buf_index); + try testing.expectEqual(@as(u32, 1), try ring.submit()); + + const cqe = try ring.copy_cqe(); + switch (cqe.err()) { + .SUCCESS => {}, + else => |errno| std.debug.panic("unhandled errno: {}", .{errno}), + } + try testing.expectEqual(@as(u64, 0xbababababa), cqe.user_data); } - pub fn prep_readv( - sqe: *Sqe, - fd: linux.fd_t, - iovecs: []const std.posix.iovec, - offset: u64, - ) void { - sqe.prep_rw(.READV, fd, @intFromPtr(iovecs.ptr), 
iovecs.len, offset); + // This read should work + + { + _ = try ring.read(0xdfdfdfdf, fd, .{ .buffer_selection = .{ .group_id = group_id, .len = buffer_len } }, 0); + try testing.expectEqual(@as(u32, 1), try ring.submit()); + + const cqe = try ring.copy_cqe(); + switch (cqe.err()) { + .SUCCESS => {}, + else => |errno| std.debug.panic("unhandled errno: {}", .{errno}), + } + + try testing.expect(cqe.flags.F_BUFFER); + const used_buffer_id = try cqe.buffer_id(); + try testing.expect(used_buffer_id >= 0 and used_buffer_id < 4); + try testing.expectEqual(@as(i32, buffer_len), cqe.res); + try testing.expectEqual(@as(u64, 0xdfdfdfdf), cqe.user_data); + try testing.expectEqualSlices(u8, &([_]u8{0} ** buffer_len), buffers[used_buffer_id][0..@as(usize, @intCast(cqe.res))]); } - pub fn prep_writev( - sqe: *Sqe, - fd: linux.fd_t, - iovecs: []const std.posix.iovec_const, - offset: u64, - ) void { - sqe.prep_rw(.WRITEV, fd, @intFromPtr(iovecs.ptr), iovecs.len, offset); - } + // Final read should _not_ work - pub fn prep_read_fixed(sqe: *Sqe, fd: linux.fd_t, buffer: *std.posix.iovec, offset: u64, buffer_index: u16) void { - sqe.prep_rw(.READ_FIXED, fd, @intFromPtr(buffer.base), buffer.len, offset); - sqe.buf_index = buffer_index; - } + { + _ = try ring.read(0xdfdfdfdf, fd, .{ .buffer_selection = .{ .group_id = group_id, .len = buffer_len } }, 0); + try testing.expectEqual(@as(u32, 1), try ring.submit()); - pub fn prep_write_fixed(sqe: *Sqe, fd: linux.fd_t, buffer: *std.posix.iovec, offset: u64, buffer_index: u16) void { - sqe.prep_rw(.WRITE_FIXED, fd, @intFromPtr(buffer.base), buffer.len, offset); - sqe.buf_index = buffer_index; + const cqe = try ring.copy_cqe(); + switch (cqe.err()) { + // Expected + .NOBUFS => {}, + .SUCCESS => std.debug.panic("unexpected success", .{}), + else => |errno| std.debug.panic("unhandled errno: {}", .{errno}), + } } +} - pub fn prep_accept( - sqe: *Sqe, - fd: linux.fd_t, - addr: ?*linux.sockaddr, - addrlen: ?*linux.socklen_t, - flags: linux.Sock, - ) void { - // `addr` holds a pointer to `sockaddr`, and `addr2` holds a pointer to socklen_t`. - // `addr2` maps to `sqe.off` (u64) instead of `sqe.len` (which is only a u32). 
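For the *_fixed read/write variants above, the iovec must reference memory that was registered with the ring beforehand (a register_buffers-style registration is assumed here); a sketch:

    var iov: std.posix.iovec = .{ .base = data.ptr, .len = data.len };
    sqe.prep_read_fixed(fd, &iov, 0, 0); // offset 0, registered buffer index 0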
- sqe.prep_rw(.ACCEPT, fd, @intFromPtr(addr), 0, @intFromPtr(addrlen)); - sqe.rw_flags = flags; - } +test "provide_buffers: accept/connect/send/recv" { + if (!is_linux) return error.SkipZigTest; - /// accept directly into the fixed file table - pub fn prep_accept_direct( - sqe: *Sqe, - fd: linux.fd_t, - addr: ?*linux.sockaddr, - addrlen: ?*linux.socklen_t, - flags: linux.Sock, - file_index: u32, - ) void { - prep_accept(sqe, fd, addr, addrlen, flags); - set_target_fixed_file(sqe, file_index); - } + var ring = IoUring.init(16, .{}) catch |err| switch (err) { + error.SystemOutdated => return error.SkipZigTest, + error.PermissionDenied => return error.SkipZigTest, + else => return err, + }; + defer ring.deinit(); - pub fn prep_multishot_accept( - sqe: *Sqe, - fd: linux.fd_t, - addr: ?*linux.sockaddr, - addrlen: ?*linux.socklen_t, - flags: linux.Sock, - ) void { - prep_accept(sqe, fd, addr, addrlen, flags); - sqe.ioprio = .{ .accept = .{ .MULTISHOT = true } }; - } + const group_id = 1337; + const buffer_id = 0; - /// multishot accept directly into the fixed file table - pub fn prep_multishot_accept_direct( - sqe: *Sqe, - fd: linux.fd_t, - addr: ?*linux.sockaddr, - addrlen: ?*linux.socklen_t, - flags: linux.Sock, - ) void { - prep_multishot_accept(sqe, fd, addr, addrlen, flags); - set_target_fixed_file(sqe, constants.FILE_INDEX_ALLOC); - } + const buffer_len = 128; + var buffers: [4][buffer_len]u8 = undefined; - fn set_target_fixed_file(sqe: *Sqe, file_index: u32) void { - const sqe_file_index: u32 = if (file_index == constants.FILE_INDEX_ALLOC) - constants.FILE_INDEX_ALLOC - else - // 0 means no fixed files, indexes should be encoded as "index + 1" - file_index + 1; - // This filed is overloaded in liburing: - // splice_fd_in: i32 - // sqe_file_index: u32 - sqe.splice_fd_in = @bitCast(sqe_file_index); - } + // Provide 4 buffers - pub fn prep_connect( - sqe: *Sqe, - fd: linux.fd_t, - addr: *const linux.sockaddr, - addrlen: linux.socklen_t, - ) void { - // `addrlen` maps to `sqe.off` (u64) instead of `sqe.len` (which is only a u32). 
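Worked example of the encoding in set_target_fixed_file above: slot 0 is stored as 1, slot 3 as 4, and FILE_INDEX_ALLOC is passed through untouched so the kernel picks a free slot. With hypothetical dir_fd/path/flags/mode values:

    sqe.prep_openat_direct(dir_fd, path, flags, mode, 3); // sqe.splice_fd_in becomes 4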
- sqe.prep_rw(.CONNECT, fd, @intFromPtr(addr), 0, addrlen); - } + { + const sqe = try ring.provide_buffers(0xcccccccc, @as([*]u8, @ptrCast(&buffers)), buffer_len, buffers.len, group_id, buffer_id); + try testing.expectEqual(Op.PROVIDE_BUFFERS, sqe.opcode); + try testing.expectEqual(@as(i32, buffers.len), sqe.fd); + try testing.expectEqual(@as(u32, buffer_len), sqe.len); + try testing.expectEqual(@as(u16, group_id), sqe.buf_index); + try testing.expectEqual(@as(u32, 1), try ring.submit()); - pub fn prep_epoll_ctl( - sqe: *Sqe, - epfd: linux.fd_t, - fd: linux.fd_t, - op: u32, - ev: ?*linux.epoll_event, - ) void { - sqe.prep_rw(.EPOLL_CTL, epfd, @intFromPtr(ev), op, @intCast(fd)); + const cqe = try ring.copy_cqe(); + switch (cqe.err()) { + // Happens when the kernel is < 5.7 + .INVAL => return error.SkipZigTest, + // Happens on the kernel 5.4 + .BADF => return error.SkipZigTest, + .SUCCESS => {}, + else => |errno| std.debug.panic("unhandled errno: {}", .{errno}), + } + try testing.expectEqual(@as(u64, 0xcccccccc), cqe.user_data); } - pub fn prep_recv(sqe: *Sqe, fd: linux.fd_t, buffer: []u8, flags: linux.MSG) void { - sqe.prep_rw(.RECV, fd, @intFromPtr(buffer.ptr), buffer.len, 0); - sqe.rw_flags = flags; - } + const socket_test_harness = try createSocketTestHarness(&ring); + defer socket_test_harness.close(); - // TODO: review recv `flags` - pub fn prep_recv_multishot( - sqe: *Sqe, - fd: linux.fd_t, - buffer: []u8, - flags: linux.MSG, - ) void { - sqe.prep_recv(fd, buffer, flags); - sqe.ioprio = .{ .send_recv = .{ .RECV_MULTISHOT = true } }; - } + // Do 4 send on the socket - pub fn prep_recvmsg( - sqe: *Sqe, - fd: linux.fd_t, - msg: *linux.msghdr, - flags: linux.MSG, - ) void { - sqe.prep_rw(.RECVMSG, fd, @intFromPtr(msg), 1, 0); - sqe.rw_flags = flags; - } + { + var i: usize = 0; + while (i < buffers.len) : (i += 1) { + _ = try ring.send(0xdeaddead, socket_test_harness.server, &([_]u8{'z'} ** buffer_len), 0); + try testing.expectEqual(@as(u32, 1), try ring.submit()); + } - pub fn prep_recvmsg_multishot( - sqe: *Sqe, - fd: linux.fd_t, - msg: *linux.msghdr, - flags: linux.MSG, - ) void { - sqe.prep_recvmsg(fd, msg, flags); - sqe.ioprio = .{ .send_recv = .{ .RECV_MULTISHOT = true } }; + var cqes: [4]Cqe = undefined; + try testing.expectEqual(@as(u32, 4), try ring.copy_cqes(&cqes, 4)); } - // COMMIT: fix send[|recv] flag param type - pub fn prep_send(sqe: *Sqe, fd: linux.fd_t, buffer: []const u8, flags: linux.MSG) void { - sqe.prep_rw(.SEND, fd, @intFromPtr(buffer.ptr), buffer.len, 0); - sqe.rw_flags = flags; - } + // Do 4 recv which should consume all buffers - pub fn prep_send_zc(sqe: *Sqe, fd: linux.fd_t, buffer: []const u8, flags: linux.MSG, zc_flags: Sqe.SendRecv) void { - sqe.prep_rw(.SEND_ZC, fd, @intFromPtr(buffer.ptr), buffer.len, 0); - sqe.rw_flags = flags; - sqe.ioprio = .{ .send_recv = zc_flags }; - } + // Deliberately put something we don't expect in the buffers + @memset(mem.sliceAsBytes(&buffers), 1); - pub fn prep_send_zc_fixed(sqe: *Sqe, fd: linux.fd_t, buffer: []const u8, flags: linux.MSG, zc_flags: Sqe.SendRecv, buf_index: u16) void { - const zc_flags_fixed = blk: { - var updated_flags = zc_flags; - updated_flags.RECVSEND_FIXED_BUF = true; - break :blk updated_flags; - }; - prep_send_zc(sqe, fd, buffer, flags, zc_flags_fixed); - sqe.buf_index = buf_index; - } + var i: usize = 0; + while (i < buffers.len) : (i += 1) { + const sqe = try ring.recv(0xdededede, socket_test_harness.client, .{ .buffer_selection = .{ .group_id = group_id, .len = buffer_len } }, 0); + try 
testing.expectEqual(Op.RECV, sqe.opcode);
+        try testing.expectEqual(@as(i32, socket_test_harness.client), sqe.fd);
+        try testing.expectEqual(@as(u64, 0), sqe.addr);
+        try testing.expectEqual(@as(u32, buffer_len), sqe.len);
+        try testing.expectEqual(@as(u16, group_id), sqe.buf_index);
+        try testing.expectEqual(@as(u32, 0), sqe.rw_flags);
+        try testing.expectEqual(.{ .BUFFER_SELECT = true }, sqe.flags);
+        try testing.expectEqual(@as(u32, 1), try ring.submit());

-    pub fn prep_sendmsg(
-        sqe: *Sqe,
-        fd: linux.fd_t,
-        msg: *const linux.msghdr_const,
-        flags: linux.MSG,
-    ) void {
-        sqe.prep_rw(.SENDMSG, fd, @intFromPtr(msg), 1, 0);
-        sqe.rw_flags = flags;
-    }
+        const cqe = try ring.copy_cqe();
+        switch (cqe.err()) {
+            .SUCCESS => {},
+            else => |errno| std.debug.panic("unhandled errno: {}", .{errno}),
+        }

-    pub fn prep_sendmsg_zc(
-        sqe: *Sqe,
-        fd: linux.fd_t,
-        msg: *const linux.msghdr_const,
-        flags: linux.MSG,
-    ) void {
-        prep_sendmsg(sqe, fd, msg, flags);
-        sqe.opcode = .SENDMSG_ZC;
+        try testing.expect(cqe.flags.F_BUFFER);
+        const used_buffer_id = try cqe.buffer_id();
+        try testing.expect(used_buffer_id >= 0 and used_buffer_id <= 3);
+        try testing.expectEqual(@as(i32, buffer_len), cqe.res);
+
+        try testing.expectEqual(@as(u64, 0xdededede), cqe.user_data);
+        const buffer = buffers[used_buffer_id][0..@as(usize, @intCast(cqe.res))];
+        try testing.expectEqualSlices(u8, &([_]u8{'z'} ** buffer_len), buffer);
+    }

-    pub fn prep_openat(
-        sqe: *Sqe,
-        fd: linux.fd_t,
-        path: [*:0]const u8,
-        flags: linux.O,
-        mode: linux.mode_t,
-    ) void {
-        sqe.prep_rw(.OPENAT, fd, @intFromPtr(path), mode, 0);
-        sqe.rw_flags = @bitCast(flags);
+    // This recv should fail
+
+    {
+        const sqe = try ring.recv(0xdfdfdfdf, socket_test_harness.client, .{ .buffer_selection = .{ .group_id = group_id, .len = buffer_len } }, 0);
+        try testing.expectEqual(Op.RECV, sqe.opcode);
+        try testing.expectEqual(@as(i32, socket_test_harness.client), sqe.fd);
+        try testing.expectEqual(@as(u64, 0), sqe.addr);
+        try testing.expectEqual(@as(u32, buffer_len), sqe.len);
+        try testing.expectEqual(@as(u16, group_id), sqe.buf_index);
+        try testing.expectEqual(@as(u32, 0), sqe.rw_flags);
+        try testing.expectEqual(.{ .BUFFER_SELECT = true }, sqe.flags);
+        try testing.expectEqual(@as(u32, 1), try ring.submit());
+
+        const cqe = try ring.copy_cqe();
+        switch (cqe.err()) {
+            // Expected
+            .NOBUFS => {},
+            .SUCCESS => std.debug.panic("unexpected success", .{}),
+            else => |errno| std.debug.panic("unhandled errno: {}", .{errno}),
+        }
+        try testing.expectEqual(@as(u64, 0xdfdfdfdf), cqe.user_data);
+    }

-    pub fn prep_openat_direct(
-        sqe: *Sqe,
-        fd: linux.fd_t,
-        path: [*:0]const u8,
-        flags: linux.O,
-        mode: linux.mode_t,
-        file_index: u32,
-    ) void {
-        prep_openat(sqe, fd, path, flags, mode);
-        set_target_fixed_file(sqe, file_index);
-    }
+    // Provide 1 buffer again

-    pub fn prep_close(sqe: *Sqe, fd: linux.fd_t) void {
-        sqe.* = .{
-            .opcode = .CLOSE,
-            .flags = .{},
-            .ioprio = .init_empty(),
-            .fd = fd,
-            .off = 0,
-            .addr = 0,
-            .len = 0,
-            .rw_flags = 0,
-            .user_data = 0,
-            .buf_index = 0,
-            .personality = 0,
-            .splice_fd_in = 0,
-            .addr3 = 0,
-            .resv = 0,
-        };
-    }
+    const reprovided_buffer_id = 2;

-    pub fn prep_close_direct(sqe: *Sqe, file_index: u32) void {
-        prep_close(sqe, 0);
-        set_target_fixed_file(sqe, file_index);
-    }
+    {
+        _ = try ring.provide_buffers(0xabababab, @as([*]u8, @ptrCast(&buffers[reprovided_buffer_id])), buffer_len, 1, group_id, reprovided_buffer_id);
+        try testing.expectEqual(@as(u32, 1), try ring.submit());

-    pub fn
prep_timeout( - sqe: *Sqe, - ts: *const linux.kernel_timespec, - count: u32, - flags: uflags.Timeout, - ) void { - sqe.prep_rw(.TIMEOUT, -1, @intFromPtr(ts), 1, count); - sqe.rw_flags = @bitCast(flags); + const cqe = try ring.copy_cqe(); + switch (cqe.err()) { + .SUCCESS => {}, + else => |errno| std.debug.panic("unhandled errno: {}", .{errno}), + } } - pub fn prep_timeout_remove(sqe: *Sqe, timeout_user_data: u64, flags: uflags.Timeout) void { - sqe.* = .{ - .opcode = .TIMEOUT_REMOVE, - .flags = .{}, - .ioprio = .init_empty(), - .fd = -1, - .off = 0, - .addr = timeout_user_data, - .len = 0, - .rw_flags = @bitCast(flags), - .user_data = 0, - .buf_index = 0, - .personality = 0, - .splice_fd_in = 0, - .addr3 = 0, - .resv = 0, - }; - } + // Redo 1 send on the server socket - pub fn prep_link_timeout( - sqe: *Sqe, - ts: *const linux.kernel_timespec, - flags: uflags.Timeout, - ) void { - sqe.prep_rw(.LINK_TIMEOUT, -1, @intFromPtr(ts), 1, 0); - sqe.rw_flags = flags; - } + { + _ = try ring.send(0xdeaddead, socket_test_harness.server, &([_]u8{'w'} ** buffer_len), 0); + try testing.expectEqual(@as(u32, 1), try ring.submit()); - pub fn prep_poll_add( - sqe: *Sqe, - fd: linux.fd_t, - poll_mask: linux.POLL, - ) void { - sqe.prep_rw(.POLL_ADD, fd, @intFromPtr(@as(?*anyopaque, null)), 0, 0); - // Poll masks previously used to comprise of 16 bits in the flags union of - // a SQE, but were then extended to comprise of 32 bits in order to make - // room for additional option flags. To ensure that the correct bits of - // poll masks are consistently and properly read across multiple kernel - // versions, poll masks are enforced to be little-endian. - // https://www.spinics.net/lists/io-uring/msg02848.html - sqe.rw_flags = std.mem.nativeToLittle(u32, poll_mask); + _ = try ring.copy_cqe(); } - pub fn prep_poll_remove( - sqe: *Sqe, - target_user_data: u64, - ) void { - sqe.prep_rw(.POLL_REMOVE, -1, target_user_data, 0, 0); - } + // Final recv which should work - pub fn prep_poll_update( - sqe: *Sqe, - old_user_data: u64, - new_user_data: u64, - poll_mask: linux.POLL, - flags: uflags.Poll, - ) void { - sqe.prep_rw(.POLL_REMOVE, -1, old_user_data, flags, new_user_data); - // Poll masks previously used to comprise of 16 bits in the flags union of - // a SQE, but were then extended to comprise of 32 bits in order to make - // room for additional option flags. To ensure that the correct bits of - // poll masks are consistently and properly read across multiple kernel - // versions, poll masks are enforced to be little-endian. 
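+ // (Illustrative note, not part of the original test: re-providing a buffer
+ // via provide_buffers makes it selectable again, which is why the recv below
+ // is expected to succeed and to report reprovided_buffer_id.)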
- // https://www.spinics.net/lists/io-uring/msg02848.html - sqe.rw_flags = std.mem.nativeToLittle(u32, poll_mask); - } + // Deliberately put something we don't expect in the buffers + @memset(mem.sliceAsBytes(&buffers), 1); - pub fn prep_fallocate( - sqe: *Sqe, - fd: linux.fd_t, - mode: i32, - offset: u64, - len: u64, - ) void { - sqe.* = .{ - .opcode = .FALLOCATE, - .flags = .{}, - .ioprio = .init_empty(), - .fd = fd, - .off = offset, - .addr = len, - .len = @intCast(mode), - .rw_flags = 0, - .user_data = 0, - .buf_index = 0, - .personality = 0, - .splice_fd_in = 0, - .addr3 = 0, - .resv = 0, - }; - } + { + const sqe = try ring.recv(0xdfdfdfdf, socket_test_harness.client, .{ .buffer_selection = .{ .group_id = group_id, .len = buffer_len } }, 0); + try testing.expectEqual(Op.RECV, sqe.opcode); + try testing.expectEqual(@as(i32, socket_test_harness.client), sqe.fd); + try testing.expectEqual(@as(u64, 0), sqe.addr); + try testing.expectEqual(@as(u32, buffer_len), sqe.len); + try testing.expectEqual(@as(u16, group_id), sqe.buf_index); + try testing.expectEqual(@as(u32, 0), sqe.rw_flags); + try testing.expectEqual(.{ .BUFFER_SELECT = true }, sqe.flags); + try testing.expectEqual(@as(u32, 1), try ring.submit()); - pub fn prep_statx( - sqe: *Sqe, - fd: linux.fd_t, - path: [*:0]const u8, - flags: linux.At, - mask: linux.Statx.Mask, - buf: *linux.Statx, - ) void { - sqe.prep_rw(.STATX, fd, @intFromPtr(path), @as(u32, @bitCast(mask)), @intFromPtr(buf)); - sqe.rw_flags = @bitCast(flags); - } + const cqe = try ring.copy_cqe(); + switch (cqe.err()) { + .SUCCESS => {}, + else => |errno| std.debug.panic("unhandled errno: {}", .{errno}), + } - pub fn prep_cancel( - sqe: *Sqe, - cancel_user_data: u64, - flags: uflags.AsyncCancel, - ) void { - sqe.prep_rw(.ASYNC_CANCEL, -1, cancel_user_data, 0, 0); - sqe.rw_flags = @bitCast(flags); + try testing.expect(cqe.flags.F_BUFFER); + const used_buffer_id = try cqe.buffer_id(); + try testing.expectEqual(used_buffer_id, reprovided_buffer_id); + try testing.expectEqual(@as(i32, buffer_len), cqe.res); + try testing.expectEqual(@as(u64, 0xdfdfdfdf), cqe.user_data); + const buffer = buffers[used_buffer_id][0..@as(usize, @intCast(cqe.res))]; + try testing.expectEqualSlices(u8, &([_]u8{'w'} ** buffer_len), buffer); } +} - pub fn prep_cancel_fd( - sqe: *Sqe, - fd: linux.fd_t, - flags: uflags.AsyncCancel, - ) void { - sqe.prep_rw(.ASYNC_CANCEL, fd, 0, 0, 0); - const enable_cancel_fd = blk: { - var update_flags = flags; - update_flags.CANCEL_FD = true; - break :blk update_flags; - }; - sqe.rw_flags = @bitCast(enable_cancel_fd); - } +/// Used for testing server/client interactions. 
+const SocketTestHarness = struct { + listener: posix.socket_t, + server: posix.socket_t, + client: posix.socket_t, - pub fn prep_shutdown( - sqe: *Sqe, - sockfd: linux.socket_t, - how: linux.Shut, - ) void { - sqe.prep_rw(.SHUTDOWN, sockfd, 0, @intFromEnum(how), 0); + fn close(self: SocketTestHarness) void { + posix.close(self.client); + posix.close(self.listener); } +}; - pub fn prep_renameat( - sqe: *Sqe, - old_dir_fd: linux.fd_t, - old_path: [*:0]const u8, - new_dir_fd: linux.fd_t, - new_path: [*:0]const u8, - flags: linux.RenameFlags, - ) void { - sqe.prep_rw( - .RENAMEAT, - old_dir_fd, - @intFromPtr(old_path), - 0, - @intFromPtr(new_path), - ); - sqe.len = @bitCast(new_dir_fd); - sqe.rw_flags = @bitCast(flags); - } +fn createSocketTestHarness(ring: *IoUring) !SocketTestHarness { + // Create a TCP server socket + var address: linux.sockaddr.in = .{ + .port = 0, + .addr = @bitCast([4]u8{ 127, 0, 0, 1 }), + }; + const listener_socket = try createListenerSocket(&address); + errdefer posix.close(listener_socket); + + // Submit 1 accept + var accept_addr: posix.sockaddr = undefined; + var accept_addr_len: posix.socklen_t = @sizeOf(@TypeOf(accept_addr)); + _ = try ring.accept(0xaaaaaaaa, listener_socket, &accept_addr, &accept_addr_len, .{}); + + // Create a TCP client socket + const client = try posix.socket(address.family, posix.SOCK.STREAM | posix.SOCK.CLOEXEC, 0); + errdefer posix.close(client); + _ = try ring.connect(0xcccccccc, client, addrAny(&address), @sizeOf(linux.sockaddr.in)); + + try testing.expectEqual(@as(u32, 2), try ring.submit()); + + var cqe_accept = try ring.copy_cqe(); + if (cqe_accept.err() == .INVAL) return error.SkipZigTest; + var cqe_connect = try ring.copy_cqe(); + if (cqe_connect.err() == .INVAL) return error.SkipZigTest; - pub fn prep_unlinkat( - sqe: *Sqe, - dir_fd: linux.fd_t, - path: [*:0]const u8, - flags: linux.At, // TODO: unlink flags only AT_REMOVEDIR - ) void { - sqe.prep_rw(.UNLINKAT, dir_fd, @intFromPtr(path), 0, 0); - sqe.rw_flags = @bitCast(flags); + // The accept/connect CQEs may arrive in any order, the connect CQE will sometimes come first: + if (cqe_accept.user_data == 0xcccccccc and cqe_connect.user_data == 0xaaaaaaaa) { + const a = cqe_accept; + const b = cqe_connect; + cqe_accept = b; + cqe_connect = a; } - pub fn prep_mkdirat( - sqe: *Sqe, - dir_fd: linux.fd_t, - path: [*:0]const u8, - mode: linux.mode_t, - ) void { - sqe.prep_rw(.MKDIRAT, dir_fd, @intFromPtr(path), mode, 0); - } + try testing.expectEqual(@as(u64, 0xaaaaaaaa), cqe_accept.user_data); + if (cqe_accept.res <= 0) std.debug.print("\ncqe_accept.res={}\n", .{cqe_accept.res}); + try testing.expect(cqe_accept.res > 0); + try testing.expectEqual(@as(Cqe.Flags, @bitCast(@as(u32, 0))), cqe_accept.flags); + try testing.expectEqual(Cqe{ + .user_data = 0xcccccccc, + .res = 0, + .flags = .{}, + }, cqe_connect); - pub fn prep_symlinkat( - sqe: *Sqe, - target: [*:0]const u8, - new_dir_fd: linux.fd_t, - link_path: [*:0]const u8, - ) void { - sqe.prep_rw( - .SYMLINKAT, - new_dir_fd, - @intFromPtr(target), - 0, - @intFromPtr(link_path), - ); - } + // All good - pub fn prep_linkat( - sqe: *Sqe, - old_dir_fd: linux.fd_t, - old_path: [*:0]const u8, - new_dir_fd: linux.fd_t, - new_path: [*:0]const u8, - flags: linux.At, // only AT_EMPTY_PATH, AT_SYMLINK_FOLLOW - ) void { - sqe.prep_rw( - .LINKAT, - old_dir_fd, - @intFromPtr(old_path), - 0, - @intFromPtr(new_path), - ); - sqe.len = @bitCast(new_dir_fd); - sqe.rw_flags = @bitCast(flags); - } + return SocketTestHarness{ + .listener = listener_socket, + 
.server = cqe_accept.res, + .client = client, + }; +} - pub fn prep_files_update( - sqe: *Sqe, - fds: []const linux.fd_t, - offset: u32, - ) void { - sqe.prep_rw(.FILES_UPDATE, -1, @intFromPtr(fds.ptr), fds.len, @intCast(offset)); - } +fn createListenerSocket(address: *linux.sockaddr.in) !posix.socket_t { + const kernel_backlog = 1; + const listener_socket = try posix.socket(address.family, posix.SOCK.STREAM | posix.SOCK.CLOEXEC, 0); + errdefer posix.close(listener_socket); - pub fn prep_files_update_alloc( - sqe: *Sqe, - fds: []linux.fd_t, - ) void { - sqe.prep_rw(.FILES_UPDATE, -1, @intFromPtr(fds.ptr), fds.len, constants.FILE_INDEX_ALLOC); - } + try posix.setsockopt(listener_socket, posix.SOL.SOCKET, posix.SO.REUSEADDR, &mem.toBytes(@as(c_int, 1))); + try posix.bind(listener_socket, addrAny(address), @sizeOf(linux.sockaddr.in)); + try posix.listen(listener_socket, kernel_backlog); - // TODO: why can't slice be used here ? - pub fn prep_provide_buffers( - sqe: *Sqe, - buffers: [*]u8, - buffer_len: usize, - num: usize, - group_id: usize, - buffer_id: usize, - ) void { - const ptr = @intFromPtr(buffers); - sqe.prep_rw(.PROVIDE_BUFFERS, @intCast(num), ptr, buffer_len, buffer_id); - sqe.buf_index = @intCast(group_id); - } + // set address to the OS-chosen IP/port. + var slen: posix.socklen_t = @sizeOf(linux.sockaddr.in); + try posix.getsockname(listener_socket, addrAny(address), &slen); - pub fn prep_remove_buffers( - sqe: *Sqe, - num: usize, - group_id: usize, - ) void { - sqe.prep_rw(.REMOVE_BUFFERS, @intCast(num), 0, 0, 0); - sqe.buf_index = @intCast(group_id); - } + return listener_socket; +} - pub fn prep_socket( - sqe: *Sqe, - domain: linux.Af, - socket_type: linux.Sock, - protocol: linux.IpProto, // Enumerate https://github.com/kraj/musl/blob/kraj/master/src/network/proto.c#L7 - /// flags is unused - flags: u32, - ) void { - sqe.prep_rw(.SOCKET, @intCast(domain), 0, @intFromEnum(protocol), @intCast(@as(u32, @bitCast(socket_type)))); - sqe.rw_flags = flags; - } +test "accept multishot" { + if (!is_linux) return error.SkipZigTest; - pub fn prep_socket_direct( - sqe: *Sqe, - domain: linux.Af, - socket_type: linux.Sock, - protocol: linux.IpProto, // Enumerate https://github.com/kraj/musl/blob/kraj/master/src/network/proto.c#L7 - /// flags is unused - flags: u32, - file_index: u32, - ) void { - prep_socket(sqe, domain, socket_type, protocol, flags); - set_target_fixed_file(sqe, file_index); - } + var ring = IoUring.init(16, .{}) catch |err| switch (err) { + error.SystemOutdated => return error.SkipZigTest, + error.PermissionDenied => return error.SkipZigTest, + else => return err, + }; + defer ring.deinit(); - pub fn prep_socket_direct_alloc( - sqe: *Sqe, - domain: linux.Af, - socket_type: linux.SOCK, - protocol: u32, // Enumerate https://github.com/kraj/musl/blob/kraj/master/src/network/proto.c#L7 - flags: u32, // flags is unused - ) void { - prep_socket(sqe, domain, socket_type, protocol, flags); - set_target_fixed_file(sqe, constants.FILE_INDEX_ALLOC); - } + var address: linux.sockaddr.in = .{ + .port = 0, + .addr = @bitCast([4]u8{ 127, 0, 0, 1 }), + }; + const listener_socket = try createListenerSocket(&address); + defer posix.close(listener_socket); - pub fn prep_waitid( - sqe: *Sqe, - id_type: linux.P, - id: i32, - infop: *linux.siginfo_t, - options: linux.W, - flags: u32, // flags is unused - ) void { - sqe.prep_rw(.WAITID, id, 0, @intFromEnum(id_type), @intFromPtr(infop)); - sqe.rw_flags = flags; - sqe.splice_fd_in = @bitCast(options); - } + // submit multishot accept operation + 
var addr: posix.sockaddr = undefined; + var addr_len: posix.socklen_t = @sizeOf(@TypeOf(addr)); + const userdata: u64 = 0xaaaaaaaa; + _ = try ring.accept_multishot(userdata, listener_socket, &addr, &addr_len, .{}); + try testing.expectEqual(@as(u32, 1), try ring.submit()); - // TODO: maybe remove unused flag fields? - pub fn prep_bind( - sqe: *Sqe, - fd: linux.fd_t, - addr: *const linux.sockaddr, - addrlen: linux.socklen_t, - flags: u32, // flags is unused and doesn't exist in io_uring's api - ) void { - sqe.prep_rw(.BIND, fd, @intFromPtr(addr), 0, addrlen); - sqe.rw_flags = flags; - } + var nr: usize = 4; // number of clients to connect + while (nr > 0) : (nr -= 1) { + // connect client + const client = try posix.socket(address.family, posix.SOCK.STREAM | posix.SOCK.CLOEXEC, 0); + errdefer posix.close(client); + try posix.connect(client, addrAny(&address), @sizeOf(linux.sockaddr.in)); - pub fn prep_listen( - sqe: *Sqe, - fd: linux.fd_t, - backlog: usize, - flags: u32, // flags is unused and doesn't exist in io_uring's api - ) void { - sqe.prep_rw(.LISTEN, fd, 0, backlog, 0); - sqe.rw_flags = flags; - } + // test accept completion + const cqe = try ring.copy_cqe(); + if (cqe.err() == .INVAL) return error.SkipZigTest; + try testing.expect(cqe.res > 0); + try testing.expect(cqe.user_data == userdata); + try testing.expect(cqe.flags.F_MORE); // more flag is set - pub fn prep_cmd_sock( - sqe: *Sqe, - cmd_op: SocketOp, - fd: linux.fd_t, - level: linux.SOL, - optname: linux.SO, - optval: u64, - optlen: u32, - ) void { - sqe.prep_rw(.URING_CMD, fd, 0, 0, 0); - // off is overloaded with cmd_op, https://github.com/axboe/liburing/blob/e1003e496e66f9b0ae06674869795edf772d5500/src/include/liburing/io_uring.h#L39 - sqe.off = @intFromEnum(cmd_op); - // addr is overloaded, https://github.com/axboe/liburing/blob/e1003e496e66f9b0ae06674869795edf772d5500/src/include/liburing/io_uring.h#L46 - sqe.addr = @bitCast(packed struct { - level: u32, - optname: u32, - }{ - .level = level, - .optname = optname, - }); - // splice_fd_in is overloaded u32 -> i32 - sqe.splice_fd_in = @bitCast(optlen); - // addr3 is overloaded, https://github.com/axboe/liburing/blob/e1003e496e66f9b0ae06674869795edf772d5500/src/include/liburing/io_uring.h#L102 - sqe.addr3 = optval; + posix.close(client); } +} + +test "accept/connect/send_zc/recv" { + try skipKernelLessThan(.{ .major = 6, .minor = 0, .patch = 0 }); + + var ring = IoUring.init(16, .{}) catch |err| switch (err) { + error.SystemOutdated => return error.SkipZigTest, + error.PermissionDenied => return error.SkipZigTest, + else => return err, + }; + defer ring.deinit(); + + const socket_test_harness = try createSocketTestHarness(&ring); + defer socket_test_harness.close(); + + const buffer_send = [_]u8{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0xa, 0xb, 0xc, 0xd, 0xe }; + var buffer_recv = [_]u8{0} ** 10; - pub fn set_flags(sqe: *Sqe, flags: Sqe.IoSqe) void { - const updated_flags = @as(u8, @bitCast(sqe.flags)) | @as(u8, @bitCast(flags)); - sqe.flags = @bitCast(updated_flags); - } + // zero-copy send + const sqe_send = try ring.send_zc(0xeeeeeeee, socket_test_harness.client, buffer_send[0..], 0, 0); + sqe_send.flags.IO_LINK = true; + _ = try ring.recv(0xffffffff, socket_test_harness.server, .{ .buffer = buffer_recv[0..] }, 0); + try testing.expectEqual(@as(u32, 2), try ring.submit()); - /// This SQE forms a link with the next SQE in the submission ring. Next SQE - /// will not be started before this one completes. Forms a chain of SQEs. 
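+ // Illustrative sketch (not part of the original test), assuming the typed
+ // Cqe.Flags introduced by this patch: a generic zero-copy send consumer
+ // would hold on to the send buffer until the notification CQE arrives, e.g.
+ //
+ //     while (true) {
+ //         const c = try ring.copy_cqe();
+ //         if (c.user_data == send_ud and c.flags.F_NOTIF) break; // buffer reusable now
+ //     }
+ //
+ // where `send_ud` is a hypothetical user_data value chosen for the send SQE.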
- pub fn link_next(sqe: *Sqe) void { - sqe.flags.IO_LINK = true; - } -}; + var cqe_send = try ring.copy_cqe(); + // First completion of zero-copy send. + // IORING_CQE_F_MORE means that there + // will be a second completion event / notification for the + // request, with the user_data field set to the same value. + // buffer_send must be kept alive until the second cqe. + try testing.expectEqual(Cqe{ + .user_data = 0xeeeeeeee, + .res = buffer_send.len, + .flags = .{ .F_MORE = true }, + }, cqe_send); -/// matches io_uring_sq in liburing -pub const Sq = struct { - head: *u32, - tail: *u32, - mask: u32, - flags: *Flags, - dropped: *u32, - array: []u32, - sqes: []Sqe, - mmap: []align(page_size_min) u8, - mmap_sqes: []align(page_size_min) u8, + cqe_send, const cqe_recv = brk: { + const cqe1 = try ring.copy_cqe(); + const cqe2 = try ring.copy_cqe(); + break :brk if (cqe1.user_data == 0xeeeeeeee) .{ cqe1, cqe2 } else .{ cqe2, cqe1 }; + }; - // We use `sqe_head` and `sqe_tail` in the same way as liburing: - // We increment `sqe_tail` (but not `tail`) for each call to `get_sqe()`. - // We then set `tail` to `sqe_tail` once, only when these events are actually submitted. - // This allows us to amortize the cost of the @atomicStore to `tail` across multiple SQEs. - sqe_head: u32 = 0, - sqe_tail: u32 = 0, + try testing.expectEqual(Cqe{ + .user_data = 0xffffffff, + .res = buffer_recv.len, + .flags = .{ .F_SOCK_NONEMPTY = cqe_recv.flags.F_SOCK_NONEMPTY }, + }, cqe_recv); + try testing.expectEqualSlices(u8, buffer_send[0..buffer_recv.len], buffer_recv[0..]); - /// sq_ring.flags - pub const Flags = packed struct(u32) { - /// needs io_uring_enter wakeup - NEED_WAKEUP: bool = false, - /// CQ ring is overflown - CQ_OVERFLOW: bool = false, - /// task should enter the kernel - TASKRUN: bool = false, - _unused: u29 = 0, + // Second completion of zero-copy send. + // IORING_CQE_F_NOTIF in flags signals that the kernel is done with buffer_send + try testing.expectEqual(Cqe{ + .user_data = 0xeeeeeeee, + .res = 0, + .flags = .{ .F_NOTIF = true }, + }, cqe_send); +} + +test "accept_direct" { + try skipKernelLessThan(.{ .major = 5, .minor = 19, .patch = 0 }); + + var ring = IoUring.init(1, .{}) catch |err| switch (err) { + error.SystemOutdated => return error.SkipZigTest, + error.PermissionDenied => return error.SkipZigTest, + else => return err, + }; + defer ring.deinit(); + var address: linux.sockaddr.in = .{ + .port = 0, + .addr = @bitCast([4]u8{ 127, 0, 0, 1 }), }; - pub fn init(fd: posix.fd_t, p: Params) !Sq { - assert(fd >= 0); - assert(p.features.SINGLE_MMAP); - const size = @max( - p.sq_off.array + p.sq_entries * @sizeOf(u32), - p.cq_off.cqes + p.cq_entries * @sizeOf(Cqe), - ); - const mmap = try posix.mmap( - null, - size, - posix.PROT.READ | posix.PROT.WRITE, - .{ .TYPE = .SHARED, .POPULATE = true }, - fd, - constants.OFF_SQ_RING, - ); - errdefer posix.munmap(mmap); - assert(mmap.len == size); + // register direct file descriptors + var registered_fds = [_]linux.fd_t{-1} ** 2; + try ring.register_files(registered_fds[0..]); - // The motivation for the `sqes` and `array` indirection is to make it possible for the - // application to preallocate static io_uring_sqe entries and then replay them when needed. 
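+ // Outline (illustrative comment, not part of the original test) of the
+ // direct-descriptor cycle exercised below:
+ //   accept_direct         -> cqe.res is an index into registered_fds, not an fd
+ //   recv + FIXED_FILE     -> sqe.fd carries that registered index
+ //   register_files_update -> returns the used slots to the kernel for reuse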
- const size_sqes = p.sq_entries * @sizeOf(Sqe); - const mmap_sqes = try posix.mmap( - null, - size_sqes, - posix.PROT.READ | posix.PROT.WRITE, - .{ .TYPE = .SHARED, .POPULATE = true }, - fd, - constants.OFF_SQES, - ); - errdefer posix.munmap(mmap_sqes); - assert(mmap_sqes.len == size_sqes); + const listener_socket = try createListenerSocket(&address); + defer posix.close(listener_socket); - const array: [*]u32 = @ptrCast(@alignCast(&mmap[p.sq_off.array])); - const sqes: [*]Sqe = @ptrCast(@alignCast(&mmap_sqes[0])); - // We expect the kernel copies p.sq_entries to the u32 pointed to by p.sq_off.ring_entries, - // see https://github.com/torvalds/linux/blob/v5.8/fs/io_uring.c#L7843-L7844. - assert(p.sq_entries == @as(*u32, @ptrCast(@alignCast(&mmap[p.sq_off.ring_entries]))).*); - return .{ - .head = @ptrCast(@alignCast(&mmap[p.sq_off.head])), - .tail = @ptrCast(@alignCast(&mmap[p.sq_off.tail])), - .mask = @as(*u32, @ptrCast(@alignCast(&mmap[p.sq_off.ring_mask]))).*, - .flags = @ptrCast(@alignCast(&mmap[p.sq_off.flags])), - .dropped = @ptrCast(@alignCast(&mmap[p.sq_off.dropped])), - .array = array[0..p.sq_entries], - .sqes = sqes[0..p.sq_entries], - .mmap = mmap, - .mmap_sqes = mmap_sqes, - }; - } + const accept_userdata: u64 = 0xaaaaaaaa; + const read_userdata: u64 = 0xbbbbbbbb; + const data = [_]u8{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0xa, 0xb, 0xc, 0xd, 0xe }; - pub fn deinit(self: *Sq) void { - posix.munmap(self.mmap_sqes); - posix.munmap(self.mmap); - } -}; + for (0..2) |_| { + for (registered_fds, 0..) |_, i| { + var buffer_recv: [16]u8 = @splat(0); + const buffer_send: []const u8 = data[0 .. data.len - i]; // make it different at each loop -/// matches io_uring_cq in liburing -pub const Cq = struct { - head: *u32, - tail: *u32, - mask: u32, - overflow: *u32, - cqes: []Cqe, + // submit accept, will choose a registered fd and return its index in cqe + _ = try ring.accept_direct(accept_userdata, listener_socket, null, null, .{}); + try testing.expectEqual(@as(u32, 1), try ring.submit()); - /// cq_ring.flags - pub const Flags = packed struct(u32) { - /// disable eventfd notifications - EVENTFD_DISABLED: bool = false, - _unused: u31 = 0, - }; + // connect + const client = try posix.socket(address.family, posix.SOCK.STREAM | posix.SOCK.CLOEXEC, 0); + try posix.connect(client, addrAny(&address), @sizeOf(linux.sockaddr.in)); + defer posix.close(client); - pub fn init(fd: posix.fd_t, p: Params, sq: Sq) !Cq { - assert(fd >= 0); - assert(p.features.SINGLE_MMAP); - const mmap = sq.mmap; - const cqes: [*]Cqe = @ptrCast(@alignCast(&mmap[p.cq_off.cqes])); - assert(p.cq_entries == @as(*u32, @ptrCast(@alignCast(&mmap[p.cq_off.ring_entries]))).*); - return .{ - .head = @ptrCast(@alignCast(&mmap[p.cq_off.head])), - .tail = @ptrCast(@alignCast(&mmap[p.cq_off.tail])), - .mask = @as(*u32, @ptrCast(@alignCast(&mmap[p.cq_off.ring_mask]))).*, - .overflow = @ptrCast(@alignCast(&mmap[p.cq_off.overflow])), - .cqes = cqes[0..p.cq_entries], - }; - } + // accept completion + const cqe_accept = try ring.copy_cqe(); + try testing.expectEqual(posix.E.SUCCESS, cqe_accept.err()); + const fd_index = cqe_accept.res; + try testing.expect(fd_index < registered_fds.len); + try testing.expect(cqe_accept.user_data == accept_userdata); - pub fn deinit(self: *Cq) void { - _ = self; - // A no-op since we now share the mmap with the submission queue. - // Here for symmetry with the submission queue, and for any future feature support. 
- } -}; + // send data + _ = try posix.send(client, buffer_send, 0); -/// Group of application provided buffers. Uses newer type, called ring mapped -/// buffers, supported since kernel 5.19. Buffers are identified by a buffer -/// group ID, and within that group, a buffer ID. IO_Uring can have multiple -/// buffer groups, each with unique group ID. -/// -/// In `init` application provides contiguous block of memory `buffers` for -/// `buffers_count` buffers of size `buffers_size`. Application can then submit -/// `recv` operation without providing buffer upfront. Once the operation is -/// ready to receive data, a buffer is picked automatically and the resulting -/// CQE will contain the buffer ID in `cqe.buffer_id()`. Use `get` method to get -/// buffer for buffer ID identified by CQE. Once the application has processed -/// the buffer, it may hand ownership back to the kernel, by calling `put` -/// allowing the cycle to repeat. -/// -/// Depending on the rate of arrival of data, it is possible that a given buffer -/// group will run out of buffers before those in CQEs can be put back to the -/// kernel. If this happens, a `cqe.err()` will have ENOBUFS as the error value. -/// -pub const BufferGroup = struct { - /// Parent ring for which this group is registered. - ring: *IoUring, - /// Pointer to the memory shared by the kernel. - /// `buffers_count` of `io_uring_buf` structures are shared by the kernel. - /// First `io_uring_buf` is overlaid by `io_uring_buf_ring` struct. - br: *align(page_size_min) BufferRing, - /// Contiguous block of memory of size (buffers_count * buffer_size). - buffers: []u8, - /// Size of each buffer in buffers. - buffer_size: u32, - /// Number of buffers in `buffers`, number of `io_uring_buf structures` in br. - buffers_count: u16, - /// Head of unconsumed part of each buffer, if incremental consumption is enabled - heads: []u32, - /// ID of this group, must be unique in ring. - group_id: u16, + // Example of how to use registered fd: + // Submit receive to fixed file returned by accept (fd_index). + // Fd field is set to registered file index, returned by accept. + // The FIXED_FILE flag must be set on the sqe. 
+ const recv_sqe = try ring.recv(read_userdata, fd_index, .{ .buffer = &buffer_recv }, 0); + recv_sqe.flags.FIXED_FILE = true; + try testing.expectEqual(@as(u32, 1), try ring.submit()); + + // receive completion + const recv_cqe = try ring.copy_cqe(); + try testing.expect(recv_cqe.user_data == read_userdata); + try testing.expect(recv_cqe.res == buffer_send.len); + try testing.expectEqualSlices(u8, buffer_send, buffer_recv[0..buffer_send.len]); + } + // no more available fds, accept will get NFILE error + { + // submit accept + _ = try ring.accept_direct(accept_userdata, listener_socket, null, null, .{}); + try testing.expectEqual(@as(u32, 1), try ring.submit()); + // connect + const client = try posix.socket(address.family, posix.SOCK.STREAM | posix.SOCK.CLOEXEC, 0); + try posix.connect(client, addrAny(&address), @sizeOf(linux.sockaddr.in)); + defer posix.close(client); + // completion with error + const cqe_accept = try ring.copy_cqe(); + try testing.expect(cqe_accept.user_data == accept_userdata); + try testing.expectEqual(posix.E.NFILE, cqe_accept.err()); + } + // return file descriptors to kernel + try ring.register_files_update(0, registered_fds[0..]); + } + try ring.unregister_files(); +} - pub fn init( - ring: *IoUring, - allocator: mem.Allocator, - group_id: u16, - buffer_size: u32, - buffers_count: u16, - ) !BufferGroup { - const buffers = try allocator.alloc(u8, buffer_size * buffers_count); - errdefer allocator.free(buffers); - const heads = try allocator.alloc(u32, buffers_count); - errdefer allocator.free(heads); +test "accept_multishot_direct" { + try skipKernelLessThan(.{ .major = 5, .minor = 19, .patch = 0 }); - const br = try setup_buf_ring(ring.fd, buffers_count, group_id, .{ .IOU_PBUF_RING_INC = true }); - buf_ring_init(br); + if (builtin.cpu.arch == .riscv64) { + // https://github.com/ziglang/zig/issues/25734 + return error.SkipZigTest; + } - const mask = buf_ring_mask(buffers_count); - var i: u16 = 0; - while (i < buffers_count) : (i += 1) { - const pos = buffer_size * i; - const buf = buffers[pos .. pos + buffer_size]; - heads[i] = 0; - buf_ring_add(br, buf, i, mask, i); - } - buf_ring_advance(br, buffers_count); + var ring = IoUring.init(1, .{}) catch |err| switch (err) { + error.SystemOutdated => return error.SkipZigTest, + error.PermissionDenied => return error.SkipZigTest, + else => return err, + }; + defer ring.deinit(); - return BufferGroup{ - .ring = ring, - .group_id = group_id, - .br = br, - .buffers = buffers, - .heads = heads, - .buffer_size = buffer_size, - .buffers_count = buffers_count, - }; - } + var address: linux.sockaddr.in = .{ + .port = 0, + .addr = @bitCast([4]u8{ 127, 0, 0, 1 }), + }; - pub fn deinit(self: *BufferGroup, allocator: mem.Allocator) void { - free_buf_ring(self.ring.fd, self.br, self.buffers_count, self.group_id); - allocator.free(self.buffers); - allocator.free(self.heads); - } + var registered_fds = [_]linux.fd_t{-1} ** 2; + try ring.register_files(registered_fds[0..]); - // Prepare recv operation which will select buffer from this group. - pub fn recv(self: *BufferGroup, user_data: u64, fd: posix.fd_t, flags: u32) !*Sqe { - var sqe = try self.ring.get_sqe(); - sqe.prep_rw(.RECV, fd, 0, 0, 0); - sqe.rw_flags = flags; - sqe.flags.BUFFER_SELECT = true; - sqe.buf_index = self.group_id; - sqe.user_data = user_data; - return sqe; - } + const listener_socket = try createListenerSocket(&address); + defer posix.close(listener_socket); - // Prepare multishot recv operation which will select buffer from this group. 
- pub fn recv_multishot(self: *BufferGroup, user_data: u64, fd: posix.fd_t, flags: u32) !*Sqe { - var sqe = try self.recv(user_data, fd, flags); - sqe.ioprio.send_recv.RECV_MULTISHOT = true; - return sqe; - } + const accept_userdata: u64 = 0xaaaaaaaa; - // Get buffer by id. - fn get_by_id(self: *BufferGroup, buffer_id: u16) []u8 { - const pos = self.buffer_size * buffer_id; - return self.buffers[pos .. pos + self.buffer_size][self.heads[buffer_id]..]; - } + for (0..2) |_| { + // submit multishot accept + // Will choose a registered fd and return the index of the selected registered file in cqe. + _ = try ring.accept_multishot_direct(accept_userdata, listener_socket, null, null, .{}); + try testing.expectEqual(@as(u32, 1), try ring.submit()); - // Get buffer by CQE. - pub fn get(self: *BufferGroup, cqe: Cqe) ![]u8 { - const buffer_id = try cqe.buffer_id(); - const used_len = @as(usize, @intCast(cqe.res)); - return self.get_by_id(buffer_id)[0..used_len]; - } + for (registered_fds) |_| { + // connect + const client = try posix.socket(address.family, posix.SOCK.STREAM | posix.SOCK.CLOEXEC, 0); + try posix.connect(client, addrAny(&address), @sizeOf(linux.sockaddr.in)); + defer posix.close(client); - // Release buffer from CQE to the kernel. - pub fn put(self: *BufferGroup, cqe: Cqe) !void { - const buffer_id = try cqe.buffer_id(); - if (cqe.flags.F_BUF_MORE) { - // Incremental consumption active, kernel will write to this buffer again - const used_len = @as(u32, @intCast(cqe.res)); - // Track what part of the buffer is used - self.heads[buffer_id] += used_len; - return; + // accept completion + const cqe_accept = try ring.copy_cqe(); + const fd_index = cqe_accept.res; + try testing.expect(fd_index < registered_fds.len); + try testing.expect(cqe_accept.user_data == accept_userdata); + try testing.expect(cqe_accept.flags.F_MORE); // has more is set } - self.heads[buffer_id] = 0; - - // Release buffer to the kernel. - const mask = buf_ring_mask(self.buffers_count); - buf_ring_add(self.br, self.get_by_id(buffer_id), buffer_id, mask, 0); - buf_ring_advance(self.br, 1); + // No more available fds, accept will get NFILE error. + // Multishot is terminated (more flag is not set). + { + // connect + const client = try posix.socket(address.family, posix.SOCK.STREAM | posix.SOCK.CLOEXEC, 0); + try posix.connect(client, addrAny(&address), @sizeOf(linux.sockaddr.in)); + defer posix.close(client); + // completion with error + const cqe_accept = try ring.copy_cqe(); + try testing.expect(cqe_accept.user_data == accept_userdata); + try testing.expectEqual(posix.E.NFILE, cqe_accept.err()); + try testing.expect(!cqe_accept.flags.F_MORE); // has more is not set + } + // return file descriptors to kernel + try ring.register_files_update(0, registered_fds[0..]); } + try ring.unregister_files(); +} -/// Used to select how the read should be handled. -pub const ReadBuffer = union(enum) { - /// io_uring will read directly into this buffer - buffer: []u8, +test "socket" { + try skipKernelLessThan(.{ .major = 5, .minor = 19, .patch = 0 }); - /// io_uring will read directly into these buffers using readv. - iovecs: []const posix.iovec, + var ring = IoUring.init(1, .{}) catch |err| switch (err) { + error.SystemOutdated => return error.SkipZigTest, + error.PermissionDenied => return error.SkipZigTest, + else => return err, + }; + defer ring.deinit(); - /// io_uring will select a buffer that has previously been provided with `provide_buffers`. 
- /// The buffer group referenced by `group_id` must contain at least one buffer for the read to work. - /// `len` controls the number of bytes to read into the selected buffer. - buffer_selection: struct { - group_id: u16, - len: usize, - }, -}; + // prepare, submit socket operation + _ = try ring.socket(0, .inet, .{ .type = .stream }, .default, 0); + try testing.expectEqual(@as(u32, 1), try ring.submit()); -/// Used to select how the recv call should be handled. -pub const RecvBuffer = union(enum) { - /// io_uring will recv directly into this buffer - buffer: []u8, + // test completion + const cqe = try ring.copy_cqe(); + try testing.expectEqual(posix.E.SUCCESS, cqe.err()); + const fd: linux.fd_t = @intCast(cqe.res); + try testing.expect(fd > 2); - /// io_uring will select a buffer that has previously been provided with `provide_buffers`. - /// The buffer group referenced by `group_id` must contain at least one buffer for the recv call to work. - /// `len` controls the number of bytes to read into the selected buffer. - buffer_selection: struct { - group_id: u16, - len: usize, - }, -}; + posix.close(fd); +} -/// Filled with the offset for mmap(2) -/// matches io_sqring_offsets in liburing -pub const SqOffsets = extern struct { - /// offset of ring head - head: u32, - /// offset of ring tail - tail: u32, - /// ring mask value - ring_mask: u32, - /// entries in ring - ring_entries: u32, - /// ring flags - flags: u32, - /// number of sqes not submitted - dropped: u32, - /// sqe index array - array: u32, - resv1: u32, - user_addr: u64, -}; +test "socket_direct/socket_direct_alloc/close_direct" { + try skipKernelLessThan(.{ .major = 5, .minor = 19, .patch = 0 }); -/// matches io_cqring_offsets in liburing -pub const CqOffsets = extern struct { - head: u32, - tail: u32, - ring_mask: u32, - ring_entries: u32, - overflow: u32, - cqes: u32, - flags: u32, - resv: u32, - user_addr: u64, -}; + var ring = IoUring.init(2, .{}) catch |err| switch (err) { + error.SystemOutdated => return error.SkipZigTest, + error.PermissionDenied => return error.SkipZigTest, + else => return err, + }; + defer ring.deinit(); + + var registered_fds = [_]linux.fd_t{-1} ** 3; + try ring.register_files(registered_fds[0..]); + + // create socket in registered file descriptor at index 0 (last param) + _ = try ring.socket_direct(0, .inet, .{ .type = .stream }, .default, 0, 0); + try testing.expectEqual(@as(u32, 1), try ring.submit()); + var cqe_socket = try ring.copy_cqe(); + try testing.expectEqual(posix.E.SUCCESS, cqe_socket.err()); + try testing.expect(cqe_socket.res == 0); + + // create socket in registered file descriptor at index 1 (last param) + _ = try ring.socket_direct(0, .inet, .{ .type = .stream }, .default, 0, 1); + try testing.expectEqual(@as(u32, 1), try ring.submit()); + cqe_socket = try ring.copy_cqe(); + try testing.expectEqual(posix.E.SUCCESS, cqe_socket.err()); + try testing.expect(cqe_socket.res == 0); // res is 0 when index is specified + + // create socket in kernel-chosen file descriptor index (_alloc version) + // completion res has index from registered files + _ = try ring.socket_direct_alloc(0, .inet, .{ .type = .stream }, .default, 0); + try testing.expectEqual(@as(u32, 1), try ring.submit()); + cqe_socket = try ring.copy_cqe(); + try testing.expectEqual(posix.E.SUCCESS, cqe_socket.err()); + try testing.expect(cqe_socket.res == 2); // returns registered file index + + // use sockets from registered_fds in connect operation + var address: linux.sockaddr.in = .{ + .port = 0, + .addr = @bitCast([4]u8{ 127, 
0, 0, 1 }), + }; + const listener_socket = try createListenerSocket(&address); + defer posix.close(listener_socket); + const accept_userdata: u64 = 0xaaaaaaaa; + const connect_userdata: u64 = 0xbbbbbbbb; + const close_userdata: u64 = 0xcccccccc; + for (registered_fds, 0..) |_, fd_index| { + // prepare accept + _ = try ring.accept(accept_userdata, listener_socket, null, null, .{}); + // prepare connect with fixed socket + const connect_sqe = try ring.connect(connect_userdata, @intCast(fd_index), addrAny(&address), @sizeOf(linux.sockaddr.in)); + connect_sqe.flags.FIXED_FILE = true; // fd is fixed file index + // submit both + try testing.expectEqual(@as(u32, 2), try ring.submit()); + // get completions + var cqe_connect = try ring.copy_cqe(); + var cqe_accept = try ring.copy_cqe(); + // ignore order + if (cqe_connect.user_data == accept_userdata and cqe_accept.user_data == connect_userdata) { + const a = cqe_accept; + const b = cqe_connect; + cqe_accept = b; + cqe_connect = a; + } + // test connect completion + try testing.expect(cqe_connect.user_data == connect_userdata); + try testing.expectEqual(posix.E.SUCCESS, cqe_connect.err()); + // test accept completion + try testing.expect(cqe_accept.user_data == accept_userdata); + try testing.expectEqual(posix.E.SUCCESS, cqe_accept.err()); -/// Passed in for io_uring_setup(2). Copied back with updated info on success -/// matches io_uring_params in liburing -pub const Params = extern struct { - sq_entries: u32, - cq_entries: u32, - flags: uflags.Setup, - sq_thread_cpu: u32, - sq_thread_idle: u32, - features: uflags.Features, - wq_fd: u32, - resv: [3]u32, - sq_off: SqOffsets, - cq_off: CqOffsets, -}; + // submit and test close_direct + _ = try ring.close_direct(close_userdata, @intCast(fd_index)); + try testing.expectEqual(@as(u32, 1), try ring.submit()); + const cqe_close = try ring.copy_cqe(); + try testing.expect(cqe_close.user_data == close_userdata); + try testing.expectEqual(posix.E.SUCCESS, cqe_close.err()); + } - + try ring.unregister_files(); +} -// COMMIT: remove deprecated io_uring_rsrc_update struct -// deprecated, see struct io_uring_rsrc_update + -// COMMIT: add new io_uring_region_desc struct -/// matches io_uring_region_desc in liburing -pub const RegionDesc = extern struct { - user_addr: u64, - size: u64, - flags: Flags, - id: u32, - mmap_offset: u64, - __resv: [4]u64, +test "openat_direct/close_direct" { + try skipKernelLessThan(.{ .major = 5, .minor = 19, .patch = 0 }); - // COMMIT: new constant - /// initialise with user provided memory pointed by user_addr - pub const Flags = packed struct(u32) { - TYPE_USER: bool = false, - _: u31 = 0, + var ring = IoUring.init(2, .{}) catch |err| switch (err) { + error.SystemOutdated => return error.SkipZigTest, + error.PermissionDenied => return error.SkipZigTest, + else => return err, }; -}; + defer ring.deinit(); -// COMMIT: add new io_uring_mem_region_reg struct -/// matches io_uring_mem_region_reg in liburing -pub const MemRegionReg = extern struct { - /// struct io_uring_region_desc (RegionDesc in Zig) - region_uptr: u64, - flags: Flags, - __resv: [2]u64, + var registered_fds = [_]linux.fd_t{-1} ** 3; + try ring.register_files(registered_fds[0..]); - /// expose the region as registered wait arguments - pub const Flags = packed struct(u64) { - REG_WAIT_ARG: bool = false, - _: u63 = 0, - }; -}; + var tmp = std.testing.tmpDir(.{}); + defer tmp.cleanup(); + const path = "test_io_uring_close_direct"; + const flags: linux.O = .{ .ACCMODE = .RDWR, .CREAT = true }; + const mode: posix.mode_t = 
0o666; + const user_data: u64 = 0; -/// matches io_uring_rsrc_register in liburing -pub const RsrcRegister = extern struct { - nr: u32, - flags: u32, - resv2: u64, - data: u64, - tags: u64, -}; + // use registered file at index 0 (last param) + _ = try ring.openat_direct(user_data, tmp.dir.fd, path, flags, mode, 0); + try testing.expectEqual(@as(u32, 1), try ring.submit()); + var cqe = try ring.copy_cqe(); + try testing.expectEqual(posix.E.SUCCESS, cqe.err()); + try testing.expect(cqe.res == 0); -/// matches io_uring_rsrc_update in liburing -pub const RsrcUpdate = extern struct { - offset: u32, - resv: u32, - data: u64, -}; + // use registered file at index 1 + _ = try ring.openat_direct(user_data, tmp.dir.fd, path, flags, mode, 1); + try testing.expectEqual(@as(u32, 1), try ring.submit()); + cqe = try ring.copy_cqe(); + try testing.expectEqual(posix.E.SUCCESS, cqe.err()); + try testing.expect(cqe.res == 0); // res is 0 when we specify index -/// matches io_uring_rsrc_update2 in liburing -pub const RsrcUpdate2 = extern struct { - offset: u32, - resv: u32, - data: u64, - tags: u64, - nr: u32, - resv2: u32, -}; + // let kernel choose registered file index + _ = try ring.openat_direct(user_data, tmp.dir.fd, path, flags, mode, Constants.FILE_INDEX_ALLOC); + try testing.expectEqual(@as(u32, 1), try ring.submit()); + cqe = try ring.copy_cqe(); + try testing.expectEqual(posix.E.SUCCESS, cqe.err()); + try testing.expect(cqe.res == 2); // chosen index is in res -/// matches io_uring_probe_op in liburing -pub const ProbeOp = extern struct { - op: Op, - resv: u8, - flags: Flags, - resv2: u32, + // close all open file descriptors + for (registered_fds, 0..) |_, fd_index| { + _ = try ring.close_direct(user_data, @intCast(fd_index)); + try testing.expectEqual(@as(u32, 1), try ring.submit()); + const cqe_close = try ring.copy_cqe(); + try testing.expectEqual(posix.E.SUCCESS, cqe_close.err()); + } + try ring.unregister_files(); +} - pub const Flags = packed struct(u16) { - OP_SUPPORTED: bool = false, - _: u15 = 0, + }; +test "waitid" { + try skipKernelLessThan(.{ .major = 6, .minor = 7, .patch = 0 }); + + var ring = IoUring.init(16, .{}) catch |err| switch (err) { + error.SystemOutdated => return error.SkipZigTest, + error.PermissionDenied => return error.SkipZigTest, + else => return err, + }; + defer ring.deinit(); - pub fn is_supported(self: ProbeOp) bool { - return self.flags.OP_SUPPORTED; + const pid = try posix.fork(); + if (pid == 0) { + posix.exit(7); } -}; -/// matches io_uring_probe in liburing -pub const Probe = extern struct { - /// Last opcode supported - last_op: Op, - /// Length of ops[] array below - ops_len: u8, - resv: u16, - resv2: [3]u32, - ops: [256]ProbeOp, + var siginfo: posix.siginfo_t = undefined; + _ = try ring.waitid(0, .PID, pid, &siginfo, .{ .exited = true }, 0); - /// Is the operation supported on the running kernel. 
- pub fn is_supported(self: @This(), op: Op) bool { - const i = @intFromEnum(op); - if (i > @intFromEnum(self.last_op) or i >= self.ops_len) - return false; - return self.ops[i].is_supported(); - } -}; + try testing.expectEqual(1, try ring.submit()); -// COMMIT: fix definition of io_uring_restriction -// RegisterOp is actually u8 -/// matches io_uring_restriction in liburing -pub const Restriction = extern struct { - opcode: RestrictionOp, - arg: extern union { - /// IORING_RESTRICTION_REGISTER_OP - register_op: RegisterOp, - /// IORING_RESTRICTION_SQE_OP - sqe_op: Op, - /// IORING_RESTRICTION_SQE_FLAGS_* - sqe_flags: u8, - }, - resv: u8, - resv2: [3]u32, -}; + const cqe_waitid = try ring.copy_cqe(); + try testing.expectEqual(0, cqe_waitid.res); + try testing.expectEqual(pid, siginfo.fields.common.first.piduid.pid); + try testing.expectEqual(7, siginfo.fields.common.second.sigchld.status); +} -// COMMIT: add new struct type -/// matches io_uring_clock_register in liburing -pub const ClockRegister = extern struct { - clockid: u32, - __resv: [3]u32, -}; +/// For use in tests. Returns SkipZigTest if kernel version is less than required. +inline fn skipKernelLessThan(required: std.SemanticVersion) !void { + if (!is_linux) return error.SkipZigTest; -// COMMIT: add new struct type -/// matches io_uring_clone_buffers in liburing -pub const CloneBuffers = extern struct { - src_fd: u32, - flags: Flags, - src_off: u32, - dst_off: u32, - nr: u32, - pad: [3]u32, + var uts: linux.utsname = undefined; + const res = linux.uname(&uts); + switch (linux.errno(res)) { + .SUCCESS => {}, + else => |errno| return posix.unexpectedErrno(errno), + } - // COMMIT: new flags - pub const Flags = packed struct(u32) { - REGISTER_SRC_REGISTERED: bool = false, - REGISTER_DST_REPLACE: bool = false, - _: u30 = 0, - }; -}; + const release = mem.sliceTo(&uts.release, 0); + // Strips potential extra, as kernel version might not be semver compliant, example "6.8.9-300.fc40.x86_64" + const extra_index = std.mem.indexOfAny(u8, release, "-+"); + const stripped = release[0..(extra_index orelse release.len)]; + // Make sure the input doesn't rely on the extra we just stripped + try testing.expect(required.pre == null and required.build == null); -/// matches io_uring_buf in liburing -pub const Buffer = extern struct { - addr: u64, - len: u32, - bid: u16, - resv: u16, -}; + var current = try std.SemanticVersion.parse(stripped); + current.pre = null; // don't check pre field + if (required.order(current) == .gt) return error.SkipZigTest; +} -/// matches io_uring_buf_ring in liburing -pub const BufferRing = extern struct { - resv1: u64, - resv2: u32, - resv3: u16, - tail: u16, -}; +test BufferGroup { + if (!is_linux) return error.SkipZigTest; -/// argument for IORING_(UN)REGISTER_PBUF_RING -/// matches io_uring_buf_reg in liburing -pub const BufferRegister = extern struct { - ring_addr: u64, - ring_entries: u32, - bgid: u16, - flags: Flags, - resv: [3]u64, + // Init IoUring + var ring = IoUring.init(16, .{}) catch |err| switch (err) { + error.SystemOutdated => return error.SkipZigTest, + error.PermissionDenied => return error.SkipZigTest, + else => return err, + }; + defer ring.deinit(); - // COMMIT: new IORING_REGISTER_PBUF_RING flags - /// Flags for IORING_REGISTER_PBUF_RING. - pub const Flags = packed struct(u16) { - /// IOU_PBUF_RING_MMAP: - /// If set, kernel will allocate the memory for the ring. 
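+ // Illustrative summary (not part of the original test) of the BufferGroup
+ // cycle exercised below: init registers a ring of buffers with the kernel,
+ // recv picks one automatically, get() maps the CQE back to the buffer, and
+ // put() hands it to the kernel so it can be selected again.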
- /// The application must not set a ring_addr in struct io_uring_buf_reg - /// instead it must subsequently call mmap(2) with the offset set - /// as: IORING_OFF_PBUF_RING | (bgid << IORING_OFF_PBUF_SHIFT) to get - /// a virtual mapping for the ring. - IOU_PBUF_RING_MMAP: bool = false, - /// IOU_PBUF_RING_INC: - /// If set, buffers consumed from this buffer ring can be - /// consumed incrementally. Normally one (or more) buffers - /// are fully consumed. With incremental consumptions, it's - /// feasible to register big ranges of buffers, and each - /// use of it will consume only as much as it needs. This - /// requires that both the kernel and application keep - /// track of where the current read/recv index is at. - IOU_PBUF_RING_INC: bool = false, - _: u14 = 0, + // Init buffer group for ring + const group_id: u16 = 1; // buffers group id + const buffers_count: u16 = 1; // number of buffers in buffer group + const buffer_size: usize = 128; // size of each buffer in group + var buf_grp = BufferGroup.init( + &ring, + testing.allocator, + group_id, + buffer_size, + buffers_count, + ) catch |err| switch (err) { + // kernel older than 5.19 + error.ArgumentsInvalid => return error.SkipZigTest, + else => return err, }; -}; + defer buf_grp.deinit(testing.allocator); -/// argument for IORING_REGISTER_PBUF_STATUS -/// matches io_uring_buf_status in liburing -pub const BufferStatus = extern struct { - /// input - buf_group: u32, - /// output - head: u32, - resv: [8]u32, -}; + // Create client/server fds + const fds = try createSocketTestHarness(&ring); + defer fds.close(); + const data = [_]u8{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0xa, 0xb, 0xc, 0xd, 0xe }; -/// argument for IORING_(UN)REGISTER_NAPI -/// matches io_uring_napi in liburing -pub const Napi = extern struct { - busy_poll_to: u32, - prefer_busy_poll: u8, - pad: [3]u8, - resv: u64, -}; + // Client sends data + { + _ = try ring.send(1, fds.client, data[0..], 0); + const submitted = try ring.submit(); + try testing.expectEqual(1, submitted); + const cqe_send = try ring.copy_cqe(); + if (cqe_send.err() == .INVAL) return error.SkipZigTest; + try testing.expectEqual(Cqe{ .user_data = 1, .res = data.len, .flags = .{} }, cqe_send); + } -// COMMIT: new struct type -/// Argument for io_uring_enter(2) with -/// IORING_GETEVENTS | IORING_ENTER_EXT_ARG_REG set, where the actual argument -/// is an index into a previously registered fixed wait region described by -/// the below structure. -/// matches io_uring_reg_wait in liburing -pub const RegisterWait = extern struct { - ts: linux.kernel_timespec, - min_wait_usec: u32, - flags: Flags, - sigmask: u64, - sigmask_sz: u32, - pad: [3]u32, - pad2: [2]u64, + // Server uses buffer group receive + { + // Submit recv operation, buffer will be chosen from buffer group + _ = try buf_grp.recv(2, fds.server, 0); + const submitted = try ring.submit(); + try testing.expectEqual(1, submitted); - // COMMIT: new constant - pub const Flags = packed struct(u32) { - REG_WAIT_TS: bool = false, - _: u31 = 0, - }; -}; + // ... 
when the recv operation completes + const cqe = try ring.copy_cqe(); + try testing.expectEqual(2, cqe.user_data); // matches submitted user_data + try testing.expect(cqe.res >= 0); // success + try testing.expectEqual(posix.E.SUCCESS, cqe.err()); + try testing.expectEqual(data.len, @as(usize, @intCast(cqe.res))); // cqe.res holds received data len -/// Argument for io_uring_enter(2) with IORING_GETEVENTS | IORING_ENTER_EXT_ARG -/// matches io_uring_getevents_arg in liburing -pub const GetEventsArg = extern struct { - sigmask: u64, - sigmask_sz: u32, - pad: u32, - ts: u64, -}; + // Get buffer from pool + const buf = try buf_grp.get(cqe); + try testing.expectEqualSlices(u8, &data, buf); + // Release buffer to the kernel when application is done with it + try buf_grp.put(cqe); + } +} -// COMMIT: fix type definition of io_uring_sync_cancel_reg -/// Argument for IORING_REGISTER_SYNC_CANCEL -/// matches io_uring_sync_cancel_reg in liburing -pub const SyncCancelRegister = extern struct { - addr: u64, - fd: i32, - flags: uflags.AsyncCancel, - timeout: linux.kernel_timespec, - opcode: Op, - pad: [7]u8, - pad2: [4]u64, -}; +test "ring mapped buffers recv" { + if (!is_linux) return error.SkipZigTest; -/// Argument for IORING_REGISTER_FILE_ALLOC_RANGE -/// The range is specified as [off, off + len) -/// matches io_uring_file_index_range in liburing -pub const FileIndexRange = extern struct { - off: u32, - len: u32, - resv: u64, -}; + var ring = IoUring.init(16, .{}) catch |err| switch (err) { + error.SystemOutdated => return error.SkipZigTest, + error.PermissionDenied => return error.SkipZigTest, + else => return err, + }; + defer ring.deinit(); -/// matches io_uring_recvmsg_out in liburing -pub const RecvmsgOut = extern struct { - namelen: u32, - controllen: u32, - payloadlen: u32, - flags: u32, -}; + // init buffer group + const group_id: u16 = 1; // buffers group id + const buffers_count: u16 = 2; // number of buffers in buffer group + const buffer_size: usize = 4; // size of each buffer in group + var buf_grp = BufferGroup.init( + &ring, + testing.allocator, + group_id, + buffer_size, + buffers_count, + ) catch |err| switch (err) { + // kernel older than 5.19 + error.ArgumentsInvalid => return error.SkipZigTest, + else => return err, + }; + defer buf_grp.deinit(testing.allocator); -/// Zero copy receive refill queue entry -/// matches io_uring_zcrx_rqe in liburing -pub const ZcrxRqe = extern struct { - off: u64, - len: u32, - __pad: u32, -}; + // create client/server fds + const fds = try createSocketTestHarness(&ring); + defer fds.close(); -/// matches io_uring_zcrx_cqe in liburing -pub const ZcrxCqe = extern struct { - off: u64, - __pad: u64, -}; + // for random user_data in sqe/cqe + var Rnd = std.Random.DefaultPrng.init(std.testing.random_seed); + var rnd = Rnd.random(); -/// matches io_uring_zcrx_offsets in liburing -pub const ZcrxOffsets = extern struct { - head: u32, - tail: u32, - rqes: u32, - __resv2: u32, - __resv: [2]u64, -}; + var round: usize = 4; // repeat send/recv cycle round times + while (round > 0) : (round -= 1) { + // client sends data + const data = [_]u8{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0xa, 0xb, 0xc, 0xd, 0xe }; + { + const user_data = rnd.int(u64); + _ = try ring.send(user_data, fds.client, data[0..], 0); + try testing.expectEqual(@as(u32, 1), try ring.submit()); + const cqe_send = try ring.copy_cqe(); + if (cqe_send.err() == .INVAL) return error.SkipZigTest; + try testing.expectEqual(Cqe{ .user_data = user_data, .res = data.len, .flags = .{} }, cqe_send); + } + var 
pos: usize = 0; -/// matches io_uring_zcrx_area_reg in liburing -pub const ZcrxAreaRegister = extern struct { - addr: u64, - len: u64, - rq_area_token: u64, - flags: Flags, - dmabuf_fd: u32, - __resv2: [2]u64, + // read first chunk + const cqe1 = try buf_grp_recv_submit_get_cqe(&ring, &buf_grp, fds.server, rnd.int(u64)); + var buf = try buf_grp.get(cqe1); + try testing.expectEqualSlices(u8, data[pos..][0..buf.len], buf); + pos += buf.len; + // second chunk + const cqe2 = try buf_grp_recv_submit_get_cqe(&ring, &buf_grp, fds.server, rnd.int(u64)); + buf = try buf_grp.get(cqe2); + try testing.expectEqualSlices(u8, data[pos..][0..buf.len], buf); + pos += buf.len; + + // both buffers provided to the kernel are used so we get the error + // 'no more buffers' until we put buffers back to the kernel + { + const user_data = rnd.int(u64); + _ = try buf_grp.recv(user_data, fds.server, 0); + try testing.expectEqual(@as(u32, 1), try ring.submit()); + const cqe = try ring.copy_cqe(); + try testing.expectEqual(user_data, cqe.user_data); + try testing.expect(cqe.res < 0); // fail + try testing.expectEqual(posix.E.NOBUFS, cqe.err()); + try testing.expect(!cqe.flags.F_BUFFER); // IORING_CQE_F_BUFFER flag is set on success only + try testing.expectError(error.NoBufferSelected, cqe.buffer_id()); + } - pub const Flags = packed struct(u32) { - DMABUF: bool = false, - _: u31 = 0, - }; -}; + // put buffers back to the kernel + try buf_grp.put(cqe1); + try buf_grp.put(cqe2); -/// Argument for IORING_REGISTER_ZCRX_IFQ -/// matches io_uring_zcrx_ifq_reg in liburing -pub const ZcrxIfqRegister = extern struct { - if_idx: u32, - if_rxq: u32, - rq_entries: u32, - // TODO: find out its flags, I suspect it's ZcrxAreaRegister.Flags - flags: u32, - /// pointer to struct io_uring_zcrx_area_reg - area_ptr: u64, - /// struct io_uring_region_desc - region_ptr: u64, - offsets: ZcrxOffsets, - zcrx_id: u32, - __resv2: u32, - __resv: [3]u64, -}; + // read remaining data + while (pos < data.len) { + const cqe = try buf_grp_recv_submit_get_cqe(&ring, &buf_grp, fds.server, rnd.int(u64)); + buf = try buf_grp.get(cqe); + try testing.expectEqualSlices(u8, data[pos..][0..buf.len], buf); + pos += buf.len; + try buf_grp.put(cqe); + } + } +} -// COMMIT: move IoUring constants to Constants -pub const constants = struct { - /// If sqe.file_index (splice_fd_in in Zig Struct) is set to this for opcodes that instantiate a new - /// direct descriptor (like openat/openat2/accept), then io_uring will allocate - /// an available direct descriptor instead of having the application pass one - /// in. The picked direct descriptor will be returned in cqe.res, or -ENFILE - /// if the space is full. 
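+// A minimal illustrative sketch, not used by the tests above: when a CQE has
+// F_BUFFER set, the kernel encodes the chosen provided-buffer id in the upper
+// 16 bits of the raw flags word, which is the assumption behind cqe.buffer_id().
+// `raw_flags` stands for a hypothetical raw u32 view of cqe.flags.
+fn exampleBufferId(raw_flags: u32) u16 {
+    // the upper 16 bits of cqe.flags carry the provided-buffer id
+    return @intCast(raw_flags >> Constants.CQE_BUFFER_SHIFT);
+}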
- pub const FILE_INDEX_ALLOC = std.math.maxInt(u32); +test "ring mapped buffers multishot recv" { + if (!is_linux) return error.SkipZigTest; - pub const CMD_MASK = 1 << 0; + var ring = IoUring.init(16, .{}) catch |err| switch (err) { + error.SystemOutdated => return error.SkipZigTest, + error.PermissionDenied => return error.SkipZigTest, + else => return err, + }; + defer ring.deinit(); - pub const TIMEOUT_CLOCK_MASK = ((1 << 2) | (1 << 3)); - pub const TIMEOUT_UPDATE_MASK = ((1 << 1) | (1 << 4)); + // init buffer group + const group_id: u16 = 1; // buffers group id + const buffers_count: u16 = 2; // number of buffers in buffer group + const buffer_size: usize = 4; // size of each buffer in group + var buf_grp = BufferGroup.init( + &ring, + testing.allocator, + group_id, + buffer_size, + buffers_count, + ) catch |err| switch (err) { + // kernel older than 5.19 + error.ArgumentsInvalid => return error.SkipZigTest, + else => return err, + }; + defer buf_grp.deinit(testing.allocator); - pub const CQE_BUFFER_SHIFT = 16; + // create client/server fds + const fds = try createSocketTestHarness(&ring); + defer fds.close(); - /// cqe.res for IORING_CQE_F_NOTIF if IORING_SEND_ZC_REPORT_USAGE was - /// requested It should be treated as a flag, all other bits of cqe.res - /// should be treated as reserved! - pub const NOTIF_USAGE_ZC_COPIED = (1 << 31); + // for random user_data in sqe/cqe + var Rnd = std.Random.DefaultPrng.init(std.testing.random_seed); + var rnd = Rnd.random(); - //Magic offsets for the application to mmap the data it needs - pub const OFF_SQ_RING = 0; - pub const OFF_CQ_RING = 0x8000000; - pub const OFF_SQES = 0x10000000; - // COMMIT: new magic constants - pub const OFF_PBUF_RING = 0x80000000; - pub const OFF_PBUF_SHIFT = 16; - pub const OFF_MMAP_MASK = 0xf8000000; + var round: usize = 4; // repeat send/recv cycle round times + while (round > 0) : (round -= 1) { + // client sends data + const data = [_]u8{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf }; + { + const user_data = rnd.int(u64); + _ = try ring.send(user_data, fds.client, data[0..], 0); + try testing.expectEqual(@as(u32, 1), try ring.submit()); + const cqe_send = try ring.copy_cqe(); + if (cqe_send.err() == .INVAL) return error.SkipZigTest; + try testing.expectEqual(Cqe{ .user_data = user_data, .res = data.len, .flags = .{} }, cqe_send); + } - /// Register a fully sparse file space, rather than pass in an array of all -1 file descriptors. 
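+ // Illustrative note (not part of the original test): a multishot recv is
+ // armed once and keeps posting CQEs with F_MORE set; the kernel terminates
+ // it (posting a final CQE without F_MORE) when the buffer group runs dry,
+ // which this test checks further below.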
- pub const RSRC_REGISTER_SPARSE = 1 << 0; + // start multishot recv + var recv_user_data = rnd.int(u64); + _ = try buf_grp.recv_multishot(recv_user_data, fds.server, 0); + try testing.expectEqual(@as(u32, 1), try ring.submit()); // submit - /// Skip updating fd indexes set to this value in the fd table - pub const REGISTER_FILES_SKIP = -2; + // server reads data into provided buffers + // there are 2 buffers of size 4, so each read gets only a chunk of the data + // we read four chunks of 4, 4, 4, 4 bytes each + var chunk: []const u8 = data[0..buffer_size]; // first chunk + const cqe1 = try expect_buf_grp_cqe(&ring, &buf_grp, recv_user_data, chunk); + try testing.expect(cqe1.flags.F_MORE); - // COMMIT: new TX Timestamp definition - /// SOCKET_URING_OP_TX_TIMESTAMP definitions - pub const TIMESTAMP_HW_SHIFT = 16; - /// The cqe.flags bit from which the timestamp type is stored - pub const TIMESTAMP_TYPE_SHIFT = (TIMESTAMP_HW_SHIFT + 1); - /// The cqe.flags flag signifying whether it's a hardware timestamp - pub const CQE_F_TSTAMP_HW = (1 << TIMESTAMP_HW_SHIFT); + chunk = data[buffer_size .. buffer_size * 2]; // second chunk + const cqe2 = try expect_buf_grp_cqe(&ring, &buf_grp, recv_user_data, chunk); + try testing.expect(cqe2.flags.F_MORE); - /// The bit from which area id is encoded into offsets - pub const ZCRX_AREA_SHIFT = 48; - pub const ZCRX_AREA_MASK = (~((1 << ZCRX_AREA_SHIFT) - 1)); + // both buffers provided to the kernel are used so we get the error + // 'no more buffers' until we put buffers back to the kernel + { + const cqe = try ring.copy_cqe(); + try testing.expectEqual(recv_user_data, cqe.user_data); + try testing.expect(cqe.res < 0); // fail + try testing.expectEqual(posix.E.NOBUFS, cqe.err()); + try testing.expect(!cqe.flags.F_BUFFER); // IORING_CQE_F_BUFFER flag is set on success only + // has more is not set + // indicates that multishot is finished + try testing.expect(!cqe.flags.F_MORE); + try testing.expectError(error.NoBufferSelected, cqe.buffer_id()); + } - // flag added to the opcode to use a registered ring fd - pub const REGISTER_USE_REGISTERED_RING = 1 << 31; -}; + // put buffers back to the kernel + try buf_grp.put(cqe1); + try buf_grp.put(cqe2); -// COMMIT: move IoUring flags to Flags struct -pub const uflags = struct { - /// io_uring_setup() flags - pub const Setup = packed struct(u32) { - /// io_context is polled - IOPOLL: bool = false, - /// SQ poll thread - SQPOLL: bool = false, - /// sq_thread_cpu is valid - SQ_AFF: bool = false, - /// app defines CQ size - CQSIZE: bool = false, - /// clamp SQ/CQ ring sizes - CLAMP: bool = false, - /// attach to existing wq - ATTACH_WQ: bool = false, - /// start with ring disabled - R_DISABLED: bool = false, - /// continue submit on error - SUBMIT_ALL: bool = false, - ///Cooperative task running. When requests complete, they often require - ///forcing the submitter to transition to the kernel to complete. If this - ///flag is set, work will be done when the task transitions anyway, rather - ///than force an inter-processor interrupt reschedule. This avoids interrupting - ///a task running in userspace, and saves an IPI. - COOP_TASKRUN: bool = false, - ///If COOP_TASKRUN is set, get notified if task work is available for - ///running and a kernel transition would be needed to run it. This sets - ///IORING_SQ_TASKRUN in the sq ring flags. Not valid with COOP_TASKRUN. 
- TASKRUN_FLAG: bool = false, - /// SQEs are 128 byte - SQE128: bool = false, - /// CQEs are 32 byte - CQE32: bool = false, - /// Only one task is allowed to submit requests - SINGLE_ISSUER: bool = false, - /// Defer running task work to get events. - /// Rather than running bits of task work whenever the task transitions - /// try to do it just before it is needed. - DEFER_TASKRUN: bool = false, - /// Application provides the memory for the rings - NO_MMAP: bool = false, - /// Register the ring fd in itself for use with - /// IORING_REGISTER_USE_REGISTERED_RING; return a registered fd index rather - /// than an fd. - REGISTERED_FD_ONLY: bool = false, - /// Removes indirection through the SQ index array. - NO_SQARRAY: bool = false, - // COMMIT: new setup flags - /// Use hybrid poll in iopoll process - HYBRID_IOPOLL: bool = false, - /// Allow both 16b and 32b CQEs. If a 32b CQE is posted, it will have - /// IORING_CQE_F_32 set in cqe.flags. - CQE_MIXED: bool = false, - _unused: u13 = 0, - }; + // restart multishot + recv_user_data = rnd.int(u64); + _ = try buf_grp.recv_multishot(recv_user_data, fds.server, 0); + try testing.expectEqual(@as(u32, 1), try ring.submit()); // submit - /// sqe.uring_cmd_flags (rw_flags in the Zig struct) - /// top 8bits aren't available for userspace - /// use registered buffer; pass this flag along with setting sqe.buf_index. - pub const Cmd = packed struct(u32) { - CMD_FIXED: bool = false, - _unused: u31 = 0, - }; + chunk = data[buffer_size * 2 .. buffer_size * 3]; // third chunk + const cqe3 = try expect_buf_grp_cqe(&ring, &buf_grp, recv_user_data, chunk); + try testing.expect(cqe3.flags.F_MORE); + try buf_grp.put(cqe3); - /// sqe.fsync_flags (rw_flags in the Zig struct) - pub const Fsync = packed struct(u32) { - DATASYNC: bool = false, - _unused: u31 = 0, - }; + chunk = data[buffer_size * 3 ..]; // last chunk + const cqe4 = try expect_buf_grp_cqe(&ring, &buf_grp, recv_user_data, chunk); + try testing.expect(cqe4.flags.F_MORE); + try buf_grp.put(cqe4); - /// sqe.timeout_flags - pub const Timeout = packed struct(u32) { - TIMEOUT_ABS: bool = false, - /// Available since Linux 5.11 - TIMEOUT_UPDATE: bool = false, - /// Available since Linux 5.15 - TIMEOUT_BOOTTIME: bool = false, - /// Available since Linux 5.15 - TIMEOUT_REALTIME: bool = false, - /// Available since Linux 5.15 - LINK_TIMEOUT_UPDATE: bool = false, - /// Available since Linux 5.16 - TIMEOUT_ETIME_SUCCESS: bool = false, - // COMMIT: new Timeout Flag - // TODO: add when it became available - TIMEOUT_MULTISHOT: bool = false, - _unused: u25 = 0, - }; + // cancel pending multishot recv operation + { + const cancel_user_data = rnd.int(u64); + _ = try ring.cancel(cancel_user_data, recv_user_data, 0); + try testing.expectEqual(@as(u32, 1), try ring.submit()); - /// sqe.splice_flags (rw_flags in Zig Struct) - /// extends splice(2) flags - pub const Splice = packed struct(u32) { - _unused: u31 = 0, - /// the last bit of __u32 - F_FD_IN_FIXED: bool = false, - }; + // expect completion of cancel operation and completion of recv operation + var cqe_cancel = try ring.copy_cqe(); + if (cqe_cancel.err() == .INVAL) return error.SkipZigTest; + var cqe_recv = try ring.copy_cqe(); + if (cqe_recv.err() == .INVAL) return error.SkipZigTest; - /// POLL_ADD flags. Note that since sqe.poll_events (rw_flags in Zig Struct) - /// is the flag space, the command flags for POLL_ADD are stored in sqe.len. - pub const Poll = packed struct(u32) { - /// IORING_POLL_ADD_MULTI - /// Multishot poll. 
Sets IORING_CQE_F_MORE if the poll handler will continue - /// to report CQEs on behalf of the same SQE. - ADD_MULTI: bool = false, - // TODO: verify this doc comment is valid for the 2 flags below - /// IORING_POLL_UPDATE - /// Update existing poll request, matching sqe.addr as the old user_data - /// field. - UPDATE_EVENTS: bool = false, - /// IORING_POLL_UPDATE - /// Update existing poll request, matching sqe.addr as the old user_data - /// field. - UPDATE_USER_DATA: bool = false, - /// IORING_POLL_LEVEL - /// Level triggered poll. - ADD_LEVEL: bool = false, - _unused: u28 = 0, - }; + // don't depend on order of completions + if (cqe_cancel.user_data == recv_user_data and cqe_recv.user_data == cancel_user_data) { + const a = cqe_cancel; + const b = cqe_recv; + cqe_cancel = b; + cqe_recv = a; + } - /// ASYNC_CANCEL flags. - pub const AsyncCancel = packed struct(u32) { - /// IORING_ASYNC_CANCEL_ALL - /// Cancel all requests that match the given key - CANCEL_ALL: bool = false, - /// IORING_ASYNC_CANCEL_FD - /// Key off 'fd' for cancelation rather than the request 'user_data' - CANCEL_FD: bool = false, - /// IORING_ASYNC_CANCEL_ANY - /// Match any request - CANCEL_ANY: bool = false, - /// IORING_ASYNC_CANCEL_FD_FIXED - /// 'fd' passed in is a fixed descriptor - CANCEL_FD_FIXED: bool = false, - // COMMIT: new AsyncCancel Flags - /// IORING_ASYNC_CANCEL_USERDATA - /// Match on user_data, default for no other key - CANCEL_USERDATA: bool = false, - /// IORING_ASYNC_CANCEL_OP - /// Match request based on opcode - CANCEL_OP: bool = false, - _unused: u26 = 0, - }; + // Note on different kernel results: + // on older kernel (tested with v6.0.16, v6.1.57, v6.2.12, v6.4.16) + // cqe_cancel.err() == .NOENT + // cqe_recv.err() == .NOBUFS + // on kernel (tested with v6.5.0, v6.5.7) + // cqe_cancel.err() == .SUCCESS + // cqe_recv.err() == .CANCELED + // Upstream reference: https://github.com/axboe/liburing/issues/984 - /// IORING_OP_MSG_RING flags (sqe.msg_ring_flags or sqe.rw_flags in Zig Struct) - pub const MsgRing = packed struct(u32) { - /// IORING_MSG_RING_CQE_SKIP Don't post a CQE to the target ring. - /// Not applicable for IORING_MSG_DATA, obviously. - CQE_SKIP: bool = false, - /// Pass through the flags from sqe.file_index to cqe.flags - FLAGS_PASS: bool = false, - _unused: u30 = 0, - }; + // cancel operation is success (or NOENT on older kernels) + try testing.expectEqual(cancel_user_data, cqe_cancel.user_data); + try testing.expect(cqe_cancel.err() == .NOENT or cqe_cancel.err() == .SUCCESS); - // COMMIT: new flag - /// IORING_OP_FIXED_FD_INSTALL flags (sqe.install_fd_flags or sqe.rw_flags in Zig Struct) - pub const FixedFd = packed struct(u32) { - /// IORING_FIXED_FD_NO_CLOEXEC Don't mark the fd as O_CLOEXEC - NO_CLOEXEC: bool = false, - }; + // recv operation is failed with err CANCELED (or NOBUFS on older kernels) + try testing.expectEqual(recv_user_data, cqe_recv.user_data); + try testing.expect(cqe_recv.res < 0); + try testing.expect(cqe_recv.err() == .NOBUFS or cqe_recv.err() == .CANCELED); + try testing.expect(!cqe_recv.flags.F_MORE); + } + } +} - /// COMMIT: new flags - /// IORING_OP_NOP flags (sqe.nop_flags or sqe.rw_flags in Zig Struct) - pub const Nop = packed struct(u32) { - /// IORING_NOP_INJECT_RESULT Inject result from sqe.result - INJECT_RESULT: bool = false, - _unused: u4 = 0, - CQE32: bool = false, - _unused_1: u26 = 0, - }; +// Prepare, submit recv and get cqe using buffer group. 
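+// The returned cqe has IORING_CQE_F_BUFFER set, so cqe.buffer_id() tells the
+// caller which buffer in the group was filled.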
+fn buf_grp_recv_submit_get_cqe( + ring: *IoUring, + buf_grp: *BufferGroup, + fd: linux.fd_t, + user_data: u64, +) !Cqe { + // prepare and submit recv + const sqe = try buf_grp.recv(user_data, fd, 0); + try testing.expect(sqe.flags.BUFFER_SELECT); + try testing.expect(sqe.buf_index == buf_grp.group_id); + try testing.expectEqual(@as(u32, 1), try ring.submit()); // submit + // get cqe, expect success + const cqe = try ring.copy_cqe(); + try testing.expectEqual(user_data, cqe.user_data); + try testing.expect(cqe.res >= 0); // success + try testing.expectEqual(posix.E.SUCCESS, cqe.err()); + try testing.expect(cqe.flags.F_BUFFER); // IORING_CQE_F_BUFFER flag is set - /// io_uring_enter(2) flags - pub const Enter = packed struct(u32) { - GETEVENTS: bool = false, - SQ_WAKEUP: bool = false, - SQ_WAIT: bool = false, - EXT_ARG: bool = false, - REGISTERED_RING: bool = false, - // COMMIT: new flags - ABS_TIMER: bool = false, - EXT_ARG_REG: bool = false, - NO_IOWAIT: bool = false, - _unused: u24 = 0, + return cqe; +} - pub fn empty(enter_flags: Enter) bool { - return @as(u32, @bitCast(enter_flags)) == 0; - } - }; +fn expect_buf_grp_cqe( + ring: *IoUring, + buf_grp: *BufferGroup, + user_data: u64, + expected: []const u8, +) !Cqe { + // get cqe + const cqe = try ring.copy_cqe(); + try testing.expectEqual(user_data, cqe.user_data); + try testing.expect(cqe.res >= 0); // success + try testing.expect(cqe.flags.F_BUFFER); // IORING_CQE_F_BUFFER flag is set + try testing.expectEqual(expected.len, @as(usize, @intCast(cqe.res))); + try testing.expectEqual(posix.E.SUCCESS, cqe.err()); - /// io_uring_params.features flags - const Features = packed struct(u32) { - SINGLE_MMAP: bool = false, - NODROP: bool = false, - SUBMIT_STABLE: bool = false, - RW_CUR_POS: bool = false, - CUR_PERSONALITY: bool = false, - FAST_POLL: bool = false, - POLL_32BITS: bool = false, - SQPOLL_NONFIXED: bool = false, - EXT_ARG: bool = false, - NATIVE_WORKERS: bool = false, - RSRC_TAGS: bool = false, - CQE_SKIP: bool = false, - LINKED_FILE: bool = false, - // COMMIT: add new Feature Flags - REG_REG_RING: bool = false, - RECVSEND_BUNDLE: bool = false, - MIN_TIMEOUT: bool = false, - RW_ATTR: bool = false, - NO_IOWAIT: bool = false, - _unused: u14 = 0, + // get buffer from pool + const buffer_id = try cqe.buffer_id(); + const len = @as(usize, @intCast(cqe.res)); + const buf = buf_grp.get_by_id(buffer_id)[0..len]; + try testing.expectEqualSlices(u8, expected, buf); - pub fn empty(features: Features) bool { - return @as(u32, @bitCast(features)) == 0; - } - }; -}; -/// io_uring_register(2) opcodes and arguments -/// matches io_uring_register_op in liburing -pub const RegisterOp = enum(u8) { - REGISTER_BUFFERS, - UNREGISTER_BUFFERS, - REGISTER_FILES, - UNREGISTER_FILES, - REGISTER_EVENTFD, - UNREGISTER_EVENTFD, - REGISTER_FILES_UPDATE, - REGISTER_EVENTFD_ASYNC, - REGISTER_PROBE, - REGISTER_PERSONALITY, - UNREGISTER_PERSONALITY, - REGISTER_RESTRICTIONS, - REGISTER_ENABLE_RINGS, + return cqe; +} - // extended with tagging - REGISTER_FILES2, - REGISTER_FILES_UPDATE2, - REGISTER_BUFFERS2, - REGISTER_BUFFERS_UPDATE, +test "copy_cqes with wrapping sq.cqes buffer" { + if (!is_linux) return error.SkipZigTest; - // set/clear io-wq thread affinities - REGISTER_IOWQ_AFF, - UNREGISTER_IOWQ_AFF, + var ring = IoUring.init(2, .{}) catch |err| switch (err) { + error.SystemOutdated => return error.SkipZigTest, + error.PermissionDenied => return error.SkipZigTest, + else => return err, + }; + defer ring.deinit(); - // set/get max number of io-wq workers - 
REGISTER_IOWQ_MAX_WORKERS,
+    try testing.expectEqual(2, ring.sq.sqes.len);
+    try testing.expectEqual(4, ring.cq.cqes.len);

-    // register/unregister io_uring fd with the ring
-    REGISTER_RING_FDS,
-    UNREGISTER_RING_FDS,
+    // submit 2 entries, receive 2 completions
+    var cqes: [8]Cqe = undefined;
+    {
+        for (0..2) |_| {
+            const sqe = try ring.get_sqe();
+            sqe.prep_timeout(&.{ .sec = 0, .nsec = 10000 }, 0, .{});
+            try testing.expect(try ring.submit() == 1);
+        }
+        var cqe_count: u32 = 0;
+        while (cqe_count < 2) {
+            cqe_count += try ring.copy_cqes(&cqes, 2 - cqe_count);
+        }
+    }

-    // register ring based provide buffer group
-    REGISTER_PBUF_RING,
-    UNREGISTER_PBUF_RING,
+    try testing.expectEqual(2, ring.cq.head.*);

-    // sync cancelation API
-    REGISTER_SYNC_CANCEL,
+    // cq.cqes len is 4 and cq.head starts at position 2, so every batch of
+    // 4 submissions wraps the completion buffer:
+    // we are reading ring.cq.cqes at indexes 2,3,0,1
+    for (1..1024) |i| {
+        for (0..4) |_| {
+            const sqe = try ring.get_sqe();
+            sqe.prep_timeout(&.{ .sec = 0, .nsec = 10000 }, 0, .{});
+            try testing.expect(try ring.submit() == 1);
+        }
+        var cqe_count: u32 = 0;
+        while (cqe_count < 4) {
+            cqe_count += try ring.copy_cqes(&cqes, 4 - cqe_count);
+        }
+        try testing.expectEqual(4, cqe_count);
+        try testing.expectEqual(2 + 4 * i, ring.cq.head.*);
+    }
+}

-    // register a range of fixed file slots for automatic slot allocation
-    REGISTER_FILE_ALLOC_RANGE,
+test "bind/listen/connect" {
+    if (builtin.cpu.arch == .s390x) return error.SkipZigTest; // https://github.com/ziglang/zig/issues/25956

-    // return status information for a buffer group
-    REGISTER_PBUF_STATUS,
+    var ring = IoUring.init(4, .{}) catch |err| switch (err) {
+        error.SystemOutdated => return error.SkipZigTest,
+        error.PermissionDenied => return error.SkipZigTest,
+        else => return err,
+    };
+    defer ring.deinit();

-    // set/clear busy poll settings
-    REGISTER_NAPI,
-    UNREGISTER_NAPI,
+    const probe = ring.get_probe() catch return error.SkipZigTest;
+    // LISTEN has the highest kernel version requirement of the ops used here
+    if (!probe.is_supported(.LISTEN)) return error.SkipZigTest;

-    REGISTER_CLOCK,
+    var addr: linux.sockaddr.in = .{
+        .port = 0,
+        .addr = @bitCast([4]u8{ 127, 0, 0, 1 }),
+    };
+    const proto: u32 = if (addr.family == linux.AF.UNIX) 0 else linux.IPPROTO.TCP;

-    // clone registered buffers from source ring to current ring
-    REGISTER_CLONE_BUFFERS,
+    const listen_fd = brk: {
+        // Create socket
+        _ = try ring.socket(1, addr.any.family, .{ .type = .stream, .flags = .{ .cloexec = true } }, proto, 0);
+        try testing.expectEqual(1, try ring.submit());
+        var cqe = try ring.copy_cqe();
+        try testing.expectEqual(1, cqe.user_data);
+        try testing.expectEqual(posix.E.SUCCESS, cqe.err());
+        const listen_fd: linux.fd_t = @intCast(cqe.res);
+        try testing.expect(listen_fd > 2);

-    // send MSG_RING without having a ring
-    REGISTER_SEND_MSG_RING,
+        // Prepare: set two socket options, bind, listen
+        var optval: u32 = 1;
+        (try ring.setsockopt(2, listen_fd, linux.SOL.SOCKET, linux.SO.REUSEADDR, mem.asBytes(&optval))).link_next();
+        (try ring.setsockopt(3, listen_fd, linux.SOL.SOCKET, linux.SO.REUSEPORT, mem.asBytes(&optval))).link_next();
+        (try ring.bind(4, listen_fd, addrAny(&addr), @sizeOf(linux.sockaddr.in), 0)).link_next();
+        _ = try ring.listen(5, listen_fd, 1, 0);
+        // Submit 4 operations
+        try testing.expectEqual(4, try ring.submit());
+        // Expect all to succeed
+        for (2..6) |user_data| {
+            cqe = try ring.copy_cqe();
+            try testing.expectEqual(user_data, cqe.user_data);
+            try testing.expectEqual(posix.E.SUCCESS, cqe.err());
+        }
- // register a netdev hw rx queue for zerocopy - REGISTER_ZCRX_IFQ, + // Check that socket option is set + optval = 0; + _ = try ring.getsockopt(5, listen_fd, linux.SOL.SOCKET, linux.SO.REUSEADDR, mem.asBytes(&optval)); + try testing.expectEqual(1, try ring.submit()); + cqe = try ring.copy_cqe(); + try testing.expectEqual(5, cqe.user_data); + try testing.expectEqual(posix.E.SUCCESS, cqe.err()); + try testing.expectEqual(1, optval); - // resize CQ ring - REGISTER_RESIZE_RINGS, + // Read system assigned port into addr + var addr_len: posix.socklen_t = @sizeOf(linux.sockaddr.in); + try posix.getsockname(listen_fd, addrAny(&addr), &addr_len); - REGISTER_MEM_REGION, + break :brk listen_fd; + }; - // COMMIT: new register opcode - // query various aspects of io_uring, see linux/io_uring/query.h - REGISTER_QUERY, + const connect_fd = brk: { + // Create connect socket + _ = try ring.socket(6, addr.family, linux.SOCK.STREAM | linux.SOCK.CLOEXEC, proto, 0); + try testing.expectEqual(1, try ring.submit()); + const cqe = try ring.copy_cqe(); + try testing.expectEqual(6, cqe.user_data); + try testing.expectEqual(posix.E.SUCCESS, cqe.err()); + // Get connect socket fd + const connect_fd: linux.fd_t = @intCast(cqe.res); + try testing.expect(connect_fd > 2 and connect_fd != listen_fd); + break :brk connect_fd; + }; - _, -}; + // Prepare accept/connect operations + _ = try ring.accept(7, listen_fd, null, null, 0); + _ = try ring.connect(8, connect_fd, addrAny(&addr), @sizeOf(linux.sockaddr.in)); + try testing.expectEqual(2, try ring.submit()); + // Get listener accepted socket + var accept_fd: posix.socket_t = 0; + for (0..2) |_| { + const cqe = try ring.copy_cqe(); + try testing.expectEqual(posix.E.SUCCESS, cqe.err()); + if (cqe.user_data == 7) { + accept_fd = @intCast(cqe.res); + } else { + try testing.expectEqual(8, cqe.user_data); + } + } + try testing.expect(accept_fd > 2 and accept_fd != listen_fd and accept_fd != connect_fd); -/// io-wq worker categories -/// matches io_wq_type in liburing -pub const IoWqCategory = enum(u8) { - BOUND, - UNBOUND, -}; + // Communicate + try testSendRecv(&ring, connect_fd, accept_fd); + try testSendRecv(&ring, accept_fd, connect_fd); -pub const SocketOp = enum(u16) { - SIOCIN, - SIOCOUTQ, - GETSOCKOPT, - SETSOCKOPT, - // COMMIT: new socket op - TX_TIMESTAMP, -}; + // Shutdown and close all sockets + for ([_]posix.socket_t{ connect_fd, accept_fd, listen_fd }) |fd| { + (try ring.shutdown(9, fd, posix.SHUT.RDWR)).link_next(); + _ = try ring.close(10, fd); + try testing.expectEqual(2, try ring.submit()); + for (0..2) |i| { + const cqe = try ring.copy_cqe(); + try testing.expectEqual(posix.E.SUCCESS, cqe.err()); + try testing.expectEqual(9 + i, cqe.user_data); + } + } +} -/// io_uring_restriction.opcode values -/// matches io_uring_register_restriction_op in liburing -pub const RestrictionOp = enum(u16) { - /// Allow an io_uring_register(2) opcode - REGISTER_OP = 0, - /// Allow an sqe opcode - SQE_OP = 1, - /// Allow sqe flags - SQE_FLAGS_ALLOWED = 2, - /// Require sqe flags (these flags must be set on each submission) - SQE_FLAGS_REQUIRED = 3, +fn testSendRecv(ring: *IoUring, send_fd: posix.socket_t, recv_fd: posix.socket_t) !void { + const buffer_send = "0123456789abcdf" ** 10; + var buffer_recv: [buffer_send.len * 2]u8 = undefined; - _, -}; + // 2 sends + _ = try ring.send(1, send_fd, buffer_send, linux.Msg.WAITALL); + _ = try ring.send(2, send_fd, buffer_send, linux.Msg.WAITALL); + try testing.expectEqual(2, try ring.submit()); + for (0..2) |i| { + const cqe = try 
ring.copy_cqe(); + try testing.expectEqual(1 + i, cqe.user_data); + try testing.expectEqual(posix.E.SUCCESS, cqe.err()); + try testing.expectEqual(buffer_send.len, @as(usize, @intCast(cqe.res))); + } -/// IORING_OP_MSG_RING command types, stored in sqe.addr -pub const MsgRingCmd = enum { - /// pass sqe->len as 'res' and off as user_data - DATA, - /// send a registered fd to another ring - SEND_FD, -}; + // receive + var recv_len: usize = 0; + while (recv_len < buffer_send.len * 2) { + _ = try ring.recv(3, recv_fd, .{ .buffer = buffer_recv[recv_len..] }, 0); + try testing.expectEqual(1, try ring.submit()); + const cqe = try ring.copy_cqe(); + try testing.expectEqual(3, cqe.user_data); + try testing.expectEqual(posix.E.SUCCESS, cqe.err()); + recv_len += @intCast(cqe.res); + } -// COMMIT: OP to IoUring -pub const Op = enum(u8) { - NOP, - READV, - WRITEV, - FSYNC, - READ_FIXED, - WRITE_FIXED, - POLL_ADD, - POLL_REMOVE, - SYNC_FILE_RANGE, - SENDMSG, - RECVMSG, - TIMEOUT, - TIMEOUT_REMOVE, - ACCEPT, - ASYNC_CANCEL, - LINK_TIMEOUT, - CONNECT, - FALLOCATE, - OPENAT, - CLOSE, - FILES_UPDATE, - STATX, - READ, - WRITE, - FADVISE, - MADVISE, - SEND, - RECV, - EPOLL_CTL, - OPENAT2, - SPLICE, - PROVIDE_BUFFERS, - REMOVE_BUFFERS, - TEE, - SHUTDOWN, - RENAMEAT, - UNLINKAT, - MKDIRAT, - SYMLINKAT, - LINKAT, - MSG_RING, - FSETXATTR, - SETXATTR, - FGETXATTR, - GETXATTR, - SOCKET, - URING_CMD, - SEND_ZC, - SENDMSG_ZC, - READ_MULTISHOT, - WAITID, - FUTEX_WAIT, - FUTEX_WAKE, - FUTEX_WAITV, - FIXED_FD_INSTALL, - FTRUNCATE, - BIND, - LISTEN, - RECV_ZC, - // COMMIT: new OPs - // TODO: to be implemented - EPOLL_WAIT, - READV_FIXED, - WRITEV_FIXED, - PIPE, + // inspect recv buffer + try testing.expectEqualSlices(u8, buffer_send, buffer_recv[0..buffer_send.len]); + try testing.expectEqualSlices(u8, buffer_send, buffer_recv[buffer_send.len..]); +} - _, -}; +fn addrAny(addr: *linux.sockaddr.in) *linux.sockaddr { + return @ptrCast(addr); +} From 922bd94dd926af4e0e45923790a970b89087604e Mon Sep 17 00:00:00 2001 From: Bernard Assan Date: Tue, 7 Oct 2025 18:09:56 +0000 Subject: [PATCH 10/41] Get test passing for all the newly introduced flags MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 4❯ zig test lib/std/std.zig --zig-lib-dir lib --test-filter "os.linux" 68/116 os.linux.test.test.futex2_wait...SKIP 69/116 os.linux.test.test.futex2_wake...SKIP 70/116 os.linux.test.test.futex2_requeue...SKIP 111/116 os.linux.IoUring.test.waitid...SKIP 116/116 os.linux.IoUring.test.bind/listen/connect...SKIP 111 passed; 5 skipped; 0 failed. 
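
For reviewers, a minimal sketch of how a call site reads with the typed
flags in place (illustrative only: `fd`, `data`, and `buf` are
placeholders, and the field names are the ones this series introduces):

    var ring = try IoUring.init(16, .{});
    defer ring.deinit();
    // Msg flags are a packed struct now: a bare `0` becomes `.{}` and
    // MSG_WAITALL becomes a named field.
    _ = try ring.send(1, fd, data[0..], .{ .waitall = true });
    _ = try ring.recv(2, fd, .{ .buffer = buf[0..] }, .{});
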
Fix skipKernelLessThan fn to work on WSL Linux
Fix a logical error in the flags compatibility check
Update all of IoUring to use the implemented flags
Bandage the types that affect posix for now so that the scope doesn't become unmanageable
TODO: impl linux.SOL and linux.SO types

Signed-off-by: Bernard Assan
---
 lib/std/os/linux.zig         |  23 +++---
 lib/std/os/linux/IoUring.zig | 150 ++++++++++++++++++++----------
 2 files changed, 96 insertions(+), 77 deletions(-)

diff --git a/lib/std/os/linux.zig b/lib/std/os/linux.zig
index 70d3a448331d..2f5a5d36e951 100644
--- a/lib/std/os/linux.zig
+++ b/lib/std/os/linux.zig
@@ -2237,15 +2237,16 @@ pub fn lstat(pathname: [*:0]const u8, statbuf: *Stat) usize {
     }
 }

-pub fn fstatat(dirfd: i32, path: [*:0]const u8, stat_buf: *Stat, flags: At) usize {
+// TODO: flags should be typed as At
+pub fn fstatat(dirfd: i32, path: [*:0]const u8, stat_buf: *Stat, flags: u32) usize {
     if (native_arch == .riscv32 or native_arch.isLoongArch()) {
         // riscv32 and loongarch have made the interesting decision to not implement some of
         // the older stat syscalls, including this one.
         @compileError("No fstatat syscall on this architecture.");
     } else if (@hasField(SYS, "fstatat64")) {
-        return syscall4(.fstatat64, @as(usize, @bitCast(@as(isize, dirfd))), @intFromPtr(path), @intFromPtr(stat_buf), @as(u32, @bitCast(flags)));
+        return syscall4(.fstatat64, @as(usize, @bitCast(@as(isize, dirfd))), @intFromPtr(path), @intFromPtr(stat_buf), flags);
     } else {
-        return syscall4(.fstatat, @as(usize, @bitCast(@as(isize, dirfd))), @intFromPtr(path), @intFromPtr(stat_buf), @as(u32, @bitCast(flags)));
+        return syscall4(.fstatat, @as(usize, @bitCast(@as(isize, dirfd))), @intFromPtr(path), @intFromPtr(stat_buf), flags);
     }
 }

@@ -3983,22 +3984,22 @@ pub const Sock = packed struct(u32) {
     // bit range is (8 - 32] of the u32
     /// Flags for socket, socketpair, accept4
     pub const Flags = if (is_sparc) packed struct(u25) {
-        _: u7 = 0, // start from u7 since Type comes before Flags
+        _8: u7 = 0, // start from u7 since Type comes before Flags
         nonblock: bool = false,
-        _1: u7 = 0,
+        _16: u7 = 0,
         cloexec: bool = false,
-        _2: u9 = 0,
+        _24: u9 = 0,
     } else if (is_mips) packed struct(u25) {
         nonblock: bool = false,
-        _: u11 = 0,
+        _9: u11 = 0,
         cloexec: bool = false,
-        _1: u12 = 0,
+        _21: u12 = 0,
     } else packed struct(u25) {
-        _: u4 = 0,
+        _8: u4 = 0,
         nonblock: bool = false,
-        _1: u7 = 0,
+        _13: u7 = 0,
         cloexec: bool = false,
-        _2: u12 = 0,
+        _21: u12 = 0,
     };

 // Deprecated aliases for SOCK
diff --git a/lib/std/os/linux/IoUring.zig b/lib/std/os/linux/IoUring.zig
index fdc956a1b4c4..1ad5ac0fdb69 100644
--- a/lib/std/os/linux/IoUring.zig
+++ b/lib/std/os/linux/IoUring.zig
@@ -46,9 +46,9 @@ pub fn init_params(entries: u16, p: *Params) !IoUring {
     assert(p.wq_fd == 0 or p.flags.ATTACH_WQ);

     // flags compatibility
-    assert(p.flags.SQPOLL and !(p.flags.COOP_TASKRUN or p.flags.TASKRUN_FLAG or p.flags.DEFER_TASKRUN));
-    assert(p.flags.SQ_AFF and p.flags.SQPOLL);
-    assert(p.flags.DEFER_TASKRUN and p.flags.SINGLE_ISSUER);
+    if (p.flags.SQPOLL) assert(!(p.flags.COOP_TASKRUN or p.flags.TASKRUN_FLAG or p.flags.DEFER_TASKRUN));
+    if (p.flags.SQ_AFF) assert(p.flags.SQPOLL);
+    if (p.flags.DEFER_TASKRUN) assert(p.flags.SINGLE_ISSUER);

     const res = linux.io_uring_setup(entries, p);
     switch (linux.errno(res)) {
@@ -609,7 +609,7 @@ pub fn recv(
         .buffer => |slice| sqe.prep_recv(fd, slice, flags),
         .buffer_selection => |selection| {
             sqe.prep_rw(.RECV, fd, 0, selection.len, 0);
-            sqe.rw_flags = flags;
+            sqe.rw_flags = @bitCast(flags);
             sqe.flags.BUFFER_SELECT = true;
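+            // with BUFFER_SELECT set, buf_index names the provided-buffer group to pick from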
sqe.buf_index = selection.group_id; }, @@ -689,7 +689,7 @@ pub fn recvmsg( user_data: u64, fd: linux.fd_t, msg: *linux.msghdr, - flags: u32, + flags: linux.Msg, ) !*Sqe { const sqe = try self.get_sqe(); sqe.prep_recvmsg(fd, msg, flags); @@ -857,7 +857,7 @@ pub fn link_timeout( self: *IoUring, user_data: u64, ts: *const linux.kernel_timespec, - flags: u32, + flags: uflags.Timeout, ) !*Sqe { const sqe = try self.get_sqe(); sqe.prep_link_timeout(ts, flags); @@ -953,7 +953,7 @@ pub fn cancel( self: *IoUring, user_data: u64, cancel_user_data: u64, - flags: u32, + flags: uflags.AsyncCancel, ) !*Sqe { const sqe = try self.get_sqe(); sqe.prep_cancel(cancel_user_data, flags); @@ -1331,8 +1331,7 @@ pub fn socket( domain: linux.Af, socket_type: linux.Sock, protocol: linux.IpProto, - /// flags is unused - flags: u32, + flags: u32, // flags is unused ) !*Sqe { const sqe = try self.get_sqe(); sqe.prep_socket(domain, socket_type, protocol, flags); @@ -1364,10 +1363,10 @@ pub fn socket_direct( pub fn socket_direct_alloc( self: *IoUring, user_data: u64, - domain: u32, - socket_type: u32, - protocol: u32, - flags: u32, + domain: linux.Af, + socket_type: linux.Sock, + protocol: linux.IpProto, + flags: u32, // flags unused ) !*Sqe { const sqe = try self.get_sqe(); sqe.prep_socket_direct_alloc(domain, socket_type, protocol, flags); @@ -1953,7 +1952,7 @@ pub const Sqe = extern struct { pub fn prep_recv(sqe: *Sqe, fd: linux.fd_t, buffer: []u8, flags: linux.Msg) void { sqe.prep_rw(.RECV, fd, @intFromPtr(buffer.ptr), buffer.len, 0); - sqe.rw_flags = flags; + sqe.rw_flags = @bitCast(flags); } // TODO: review recv `flags` @@ -1974,7 +1973,7 @@ pub const Sqe = extern struct { flags: linux.Msg, ) void { sqe.prep_rw(.RECVMSG, fd, @intFromPtr(msg), 1, 0); - sqe.rw_flags = flags; + sqe.rw_flags = @bitCast(flags); } pub fn prep_recvmsg_multishot( @@ -1990,12 +1989,12 @@ pub const Sqe = extern struct { // COMMIT: fix send[|recv] flag param type pub fn prep_send(sqe: *Sqe, fd: linux.fd_t, buffer: []const u8, flags: linux.Msg) void { sqe.prep_rw(.SEND, fd, @intFromPtr(buffer.ptr), buffer.len, 0); - sqe.rw_flags = flags; + sqe.rw_flags = @bitCast(flags); } pub fn prep_send_zc(sqe: *Sqe, fd: linux.fd_t, buffer: []const u8, flags: linux.Msg, zc_flags: Sqe.SendRecv) void { sqe.prep_rw(.SEND_ZC, fd, @intFromPtr(buffer.ptr), buffer.len, 0); - sqe.rw_flags = flags; + sqe.rw_flags = @bitCast(flags); sqe.ioprio = .{ .send_recv = zc_flags }; } @@ -2016,7 +2015,7 @@ pub const Sqe = extern struct { flags: linux.Msg, ) void { sqe.prep_rw(.SENDMSG, fd, @intFromPtr(msg), 1, 0); - sqe.rw_flags = flags; + sqe.rw_flags = @bitCast(flags); } pub fn prep_sendmsg_zc( @@ -2111,7 +2110,7 @@ pub const Sqe = extern struct { flags: uflags.Timeout, ) void { sqe.prep_rw(.LINK_TIMEOUT, -1, @intFromPtr(ts), 1, 0); - sqe.rw_flags = flags; + sqe.rw_flags = @bitCast(flags); } pub fn prep_poll_add( @@ -2347,7 +2346,7 @@ pub const Sqe = extern struct { sqe: *Sqe, domain: linux.Af, socket_type: linux.Sock, - protocol: linux.IpProto, // Enumerate https://github.com/kraj/musl/blob/kraj/master/src/network/proto.c#L7 + protocol: linux.IpProto, /// flags is unused flags: u32, file_index: u32, @@ -2359,8 +2358,8 @@ pub const Sqe = extern struct { pub fn prep_socket_direct_alloc( sqe: *Sqe, domain: linux.Af, - socket_type: linux.SOCK, - protocol: u32, // Enumerate https://github.com/kraj/musl/blob/kraj/master/src/network/proto.c#L7 + socket_type: linux.Sock, + protocol: linux.IpProto, flags: u32, // flags is unused ) void { prep_socket(sqe, domain, socket_type, 
protocol, flags);
@@ -2406,8 +2405,8 @@ pub const Sqe = extern struct {
         sqe: *Sqe,
         cmd_op: SocketOp,
         fd: linux.fd_t,
-        level: linux.SOL,
-        optname: linux.SO,
+        level: u32, // TODO: linux.SOL,
+        optname: u32, // TODO: linux.SO,
         optval: u64,
         optlen: u32,
     ) void {
@@ -4026,9 +4025,9 @@ test "accept/connect/send/recv" {
     const buffer_send = [_]u8{ 1, 0, 1, 0, 1, 0, 1, 0, 1, 0 };
     var buffer_recv = [_]u8{ 0, 1, 0, 1, 0 };

-    const sqe_send = try ring.send(0xeeeeeeee, socket_test_harness.client, buffer_send[0..], 0);
+    const sqe_send = try ring.send(0xeeeeeeee, socket_test_harness.client, buffer_send[0..], .{});
     sqe_send.flags.IO_LINK = true;
-    _ = try ring.recv(0xffffffff, socket_test_harness.server, .{ .buffer = buffer_recv[0..] }, 0);
+    _ = try ring.recv(0xffffffff, socket_test_harness.server, .{ .buffer = buffer_recv[0..] }, .{});
     try testing.expectEqual(@as(u32, 2), try ring.submit());

     const cqe_send = try ring.copy_cqe();
@@ -4044,8 +4043,10 @@ test "accept/connect/send/recv" {
     try testing.expectEqual(Cqe{
         .user_data = 0xffffffff,
         .res = buffer_recv.len,
+        // TODO: the comment below seems wrong; clarify with previous maintainers.
+        // "Only check the IORING_CQE_F_SOCK_NONEMPTY flag, as other flags are system-dependent" might be more appropriate.
         // ignore IORING_CQE_F_SOCK_NONEMPTY since it is only set on some systems
-        .flags = cqe_recv.flags & linux.IORING_CQE_F_SOCK_NONEMPTY,
+        .flags = .{ .F_SOCK_NONEMPTY = cqe_recv.flags.F_SOCK_NONEMPTY },
     }, cqe_recv);

     try testing.expectEqualSlices(u8, buffer_send[0..buffer_recv.len], buffer_recv[0..]);
@@ -4092,7 +4093,7 @@ test "sendmsg/recvmsg" {
         .controllen = 0,
         .flags = 0,
     };
-    const sqe_sendmsg = try ring.sendmsg(0x11111111, client, &msg_send, 0);
+    const sqe_sendmsg = try ring.sendmsg(0x11111111, client, &msg_send, .{});
     sqe_sendmsg.flags.IO_LINK = true;
     try testing.expectEqual(Op.SENDMSG, sqe_sendmsg.opcode);
     try testing.expectEqual(client, sqe_sendmsg.fd);
@@ -4114,7 +4115,7 @@ test "sendmsg/recvmsg" {
         .controllen = 0,
         .flags = 0,
     };
-    const sqe_recvmsg = try ring.recvmsg(0x22222222, server, &msg_recv, 0);
+    const sqe_recvmsg = try ring.recvmsg(0x22222222, server, &msg_recv, .{});

     try testing.expectEqual(Op.RECVMSG, sqe_recvmsg.opcode);
     try testing.expectEqual(server, sqe_recvmsg.fd);
@@ -4137,7 +4138,7 @@ test "sendmsg/recvmsg" {
         .user_data = 0x22222222,
         .res = buffer_recv.len,
         // ignore IORING_CQE_F_SOCK_NONEMPTY since it is set non-deterministically
-        .flags = cqe_recvmsg.flags & linux.IORING_CQE_F_SOCK_NONEMPTY,
+        .flags = .{ .F_SOCK_NONEMPTY = cqe_recvmsg.flags.F_SOCK_NONEMPTY },
     }, cqe_recvmsg);

     try testing.expectEqualSlices(u8, buffer_send[0..buffer_recv.len], buffer_recv[0..]);
@@ -4284,11 +4285,11 @@ test "accept/connect/recv/link_timeout" {

     var buffer_recv = [_]u8{ 0, 1, 0, 1, 0 };

-    const sqe_recv = try ring.recv(0xffffffff, socket_test_harness.server, .{ .buffer = buffer_recv[0..] }, 0);
+    const sqe_recv = try ring.recv(0xffffffff, socket_test_harness.server, .{ .buffer = buffer_recv[0..] }, .{});
     sqe_recv.flags.IO_LINK = true;

     const ts = linux.kernel_timespec{ .sec = 0, .nsec = 1000000 };
-    _ = try ring.link_timeout(0x22222222, &ts, 0);
+    _ = try ring.link_timeout(0x22222222, &ts, .{});

     const nr_wait = try ring.submit();
     try testing.expectEqual(@as(u32, 2), nr_wait);
@@ -4436,10 +4437,10 @@ test "accept/connect/recv/cancel" {

     var buffer_recv = [_]u8{ 0, 1, 0, 1, 0 };

-    _ = try ring.recv(0xffffffff, socket_test_harness.server, .{ .buffer = buffer_recv[0..] 
}, 0); + _ = try ring.recv(0xffffffff, socket_test_harness.server, .{ .buffer = buffer_recv[0..] }, .{}); try testing.expectEqual(@as(u32, 1), try ring.submit()); - const sqe_cancel = try ring.cancel(0x99999999, 0xffffffff, 0); + const sqe_cancel = try ring.cancel(0x99999999, 0xffffffff, .{}); try testing.expectEqual(Op.ASYNC_CANCEL, sqe_cancel.opcode); try testing.expectEqual(@as(u64, 0xffffffff), sqe_cancel.addr); try testing.expectEqual(@as(u64, 0x99999999), sqe_cancel.user_data); @@ -5150,7 +5151,7 @@ test "provide_buffers: accept/connect/send/recv" { { var i: usize = 0; while (i < buffers.len) : (i += 1) { - _ = try ring.send(0xdeaddead, socket_test_harness.server, &([_]u8{'z'} ** buffer_len), 0); + _ = try ring.send(0xdeaddead, socket_test_harness.server, &([_]u8{'z'} ** buffer_len), .{}); try testing.expectEqual(@as(u32, 1), try ring.submit()); } @@ -5165,14 +5166,14 @@ test "provide_buffers: accept/connect/send/recv" { var i: usize = 0; while (i < buffers.len) : (i += 1) { - const sqe = try ring.recv(0xdededede, socket_test_harness.client, .{ .buffer_selection = .{ .group_id = group_id, .len = buffer_len } }, 0); + const sqe = try ring.recv(0xdededede, socket_test_harness.client, .{ .buffer_selection = .{ .group_id = group_id, .len = buffer_len } }, .{}); try testing.expectEqual(Op.RECV, sqe.opcode); try testing.expectEqual(@as(i32, socket_test_harness.client), sqe.fd); try testing.expectEqual(@as(u64, 0), sqe.addr); try testing.expectEqual(@as(u32, buffer_len), sqe.len); try testing.expectEqual(@as(u16, group_id), sqe.buf_index); try testing.expectEqual(@as(u32, 0), sqe.rw_flags); - try testing.expectEqual(.{ .BUFFER_SELECT = true }, sqe.flags); + try testing.expectEqual(Sqe.IoSqe{ .BUFFER_SELECT = true }, sqe.flags); try testing.expectEqual(@as(u32, 1), try ring.submit()); const cqe = try ring.copy_cqe(); @@ -5194,14 +5195,14 @@ test "provide_buffers: accept/connect/send/recv" { // This recv should fail { - const sqe = try ring.recv(0xdfdfdfdf, socket_test_harness.client, .{ .buffer_selection = .{ .group_id = group_id, .len = buffer_len } }, 0); + const sqe = try ring.recv(0xdfdfdfdf, socket_test_harness.client, .{ .buffer_selection = .{ .group_id = group_id, .len = buffer_len } }, .{}); try testing.expectEqual(Op.RECV, sqe.opcode); try testing.expectEqual(@as(i32, socket_test_harness.client), sqe.fd); try testing.expectEqual(@as(u64, 0), sqe.addr); try testing.expectEqual(@as(u32, buffer_len), sqe.len); try testing.expectEqual(@as(u16, group_id), sqe.buf_index); try testing.expectEqual(@as(u32, 0), sqe.rw_flags); - try testing.expectEqual(@as(u32, linux.IOSQE_BUFFER_SELECT), sqe.flags); + try testing.expectEqual(Sqe.IoSqe{ .BUFFER_SELECT = true }, sqe.flags); try testing.expectEqual(@as(u32, 1), try ring.submit()); const cqe = try ring.copy_cqe(); @@ -5232,7 +5233,7 @@ test "provide_buffers: accept/connect/send/recv" { // Redo 1 send on the server socket { - _ = try ring.send(0xdeaddead, socket_test_harness.server, &([_]u8{'w'} ** buffer_len), 0); + _ = try ring.send(0xdeaddead, socket_test_harness.server, &([_]u8{'w'} ** buffer_len), .{}); try testing.expectEqual(@as(u32, 1), try ring.submit()); _ = try ring.copy_cqe(); @@ -5244,14 +5245,14 @@ test "provide_buffers: accept/connect/send/recv" { @memset(mem.sliceAsBytes(&buffers), 1); { - const sqe = try ring.recv(0xdfdfdfdf, socket_test_harness.client, .{ .buffer_selection = .{ .group_id = group_id, .len = buffer_len } }, 0); + const sqe = try ring.recv(0xdfdfdfdf, socket_test_harness.client, .{ .buffer_selection = .{ .group_id 
= group_id, .len = buffer_len } }, .{}); try testing.expectEqual(Op.RECV, sqe.opcode); try testing.expectEqual(@as(i32, socket_test_harness.client), sqe.fd); try testing.expectEqual(@as(u64, 0), sqe.addr); try testing.expectEqual(@as(u32, buffer_len), sqe.len); try testing.expectEqual(@as(u16, group_id), sqe.buf_index); try testing.expectEqual(@as(u32, 0), sqe.rw_flags); - try testing.expectEqual(@as(u32, linux.IOSQE_BUFFER_SELECT), sqe.flags); + try testing.expectEqual(Sqe.IoSqe{ .BUFFER_SELECT = true }, sqe.flags); try testing.expectEqual(@as(u32, 1), try ring.submit()); const cqe = try ring.copy_cqe(); @@ -5410,9 +5411,9 @@ test "accept/connect/send_zc/recv" { var buffer_recv = [_]u8{0} ** 10; // zero-copy send - const sqe_send = try ring.send_zc(0xeeeeeeee, socket_test_harness.client, buffer_send[0..], 0, 0); + const sqe_send = try ring.send_zc(0xeeeeeeee, socket_test_harness.client, buffer_send[0..], .{}, .{}); sqe_send.flags.IO_LINK = true; - _ = try ring.recv(0xffffffff, socket_test_harness.server, .{ .buffer = buffer_recv[0..] }, 0); + _ = try ring.recv(0xffffffff, socket_test_harness.server, .{ .buffer = buffer_recv[0..] }, .{}); try testing.expectEqual(@as(u32, 2), try ring.submit()); var cqe_send = try ring.copy_cqe(); @@ -5436,7 +5437,7 @@ test "accept/connect/send_zc/recv" { try testing.expectEqual(Cqe{ .user_data = 0xffffffff, .res = buffer_recv.len, - .flags = cqe_recv.flags & linux.IORING_CQE_F_SOCK_NONEMPTY, + .flags = .{ .F_SOCK_NONEMPTY = cqe_recv.flags.F_SOCK_NONEMPTY }, }, cqe_recv); try testing.expectEqualSlices(u8, buffer_send[0..buffer_recv.len], buffer_recv[0..]); @@ -5445,7 +5446,7 @@ test "accept/connect/send_zc/recv" { try testing.expectEqual(Cqe{ .user_data = 0xeeeeeeee, .res = 0, - .flags = linux.IORING_CQE_F_NOTIF, + .flags = .{ .F_NOTIF = true }, }, cqe_send); } @@ -5502,7 +5503,7 @@ test "accept_direct" { // Submit receive to fixed file returned by accept (fd_index). // Fd field is set to registered file index, returned by accept. // Flag linux.IOSQE_FIXED_FILE must be set. - const recv_sqe = try ring.recv(read_userdata, fd_index, .{ .buffer = &buffer_recv }, 0); + const recv_sqe = try ring.recv(read_userdata, fd_index, .{ .buffer = &buffer_recv }, .{}); recv_sqe.flags.FIXED_FILE = true; try testing.expectEqual(@as(u32, 1), try ring.submit()); @@ -5515,7 +5516,7 @@ test "accept_direct" { // no more available fds, accept will get NFILE error { // submit accept - _ = try ring.accept_direct(accept_userdata, listener_socket, null, null, 0); + _ = try ring.accept_direct(accept_userdata, listener_socket, null, null, .{}); try testing.expectEqual(@as(u32, 1), try ring.submit()); // connect const client = try posix.socket(address.family, posix.SOCK.STREAM | posix.SOCK.CLOEXEC, 0); @@ -5650,7 +5651,7 @@ test "socket_direct/socket_direct_alloc/close_direct" { // create socket in kernel chosen file descriptor index (_alloc version) // completion res has index from registered files - _ = try ring.socket_direct_alloc(0, linux.Af.INET, posix.SOCK.STREAM, 0, 0); + _ = try ring.socket_direct_alloc(0, .inet, .{ .type = .stream }, .default, 0); try testing.expectEqual(@as(u32, 1), try ring.submit()); cqe_socket = try ring.copy_cqe(); try testing.expectEqual(posix.E.SUCCESS, cqe_socket.err()); @@ -5668,10 +5669,10 @@ test "socket_direct/socket_direct_alloc/close_direct" { const close_userdata: u64 = 0xcccccccc; for (registered_fds, 0..) 
|_, fd_index| {
         // prepare accept
-        _ = try ring.accept(accept_userdata, listener_socket, null, null, 0);
+        _ = try ring.accept(accept_userdata, listener_socket, null, null, .{});
         // prepare connect with fixed socket
         const connect_sqe = try ring.connect(connect_userdata, @intCast(fd_index), addrAny(&address), @sizeOf(linux.sockaddr.in));
-        connect_sqe.flags |= linux.IOSQE_FIXED_FILE; // fd is fixed file index
+        connect_sqe.flags.FIXED_FILE = true; // fd is fixed file index
         // submit both
         try testing.expectEqual(@as(u32, 2), try ring.submit());
         // get completions
@@ -5791,14 +5792,26 @@ inline fn skipKernelLessThan(required: std.SemanticVersion) !void {
     }

     const release = mem.sliceTo(&uts.release, 0);
-    // Strips potential extra, as kernel version might not be semver compliant, example "6.8.9-300.fc40.x86_64"
-    const extra_index = std.mem.indexOfAny(u8, release, "-+");
-    const stripped = release[0..(extra_index orelse release.len)];
-    // Make sure the input don't rely on the extra we just stripped
+    // Make sure the input doesn't rely on the extra we are about to strip
     try testing.expect(required.pre == null and required.build == null);

+    const stripped = blk: {
+        // Strips the potential extra, as the kernel version might not be semver compliant, example "6.8.9-300.fc40.x86_64"
+        const extra_index = std.mem.findAny(u8, release, "-+");
+        const stripped = release[0..(extra_index orelse release.len)];
+
+        // the WSL kernel isn't semver compliant,
+        // e.g. 6.6.87.2-microsoft-standard-WSL2: strip the extra .2 after 87
+        const wsl = "WSL2";
+        if (std.mem.eql(u8, release[release.len - wsl.len ..][0..wsl.len], wsl)) {
+            const wsl_stripped, _ = std.mem.cutScalarLast(u8, stripped, '.') orelse unreachable;
+            break :blk wsl_stripped;
+        }
+        break :blk stripped;
+    };
     var current = try std.SemanticVersion.parse(stripped);
     current.pre = null; // don't check pre field
+
     if (required.order(current) == .gt) return error.SkipZigTest;
 }
@@ -5837,7 +5850,7 @@ test BufferGroup {

     // Client sends data
     {
-        _ = try ring.send(1, fds.client, data[0..], 0);
+        _ = try ring.send(1, fds.client, data[0..], .{});
         const submitted = try ring.submit();
         try testing.expectEqual(1, submitted);
         const cqe_send = try ring.copy_cqe();
@@ -5908,7 +5921,7 @@ test "ring mapped buffers recv" {
     const data = [_]u8{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0xa, 0xb, 0xc, 0xd, 0xe };
     {
         const user_data = rnd.int(u64);
-        _ = try ring.send(user_data, fds.client, data[0..], 0);
+        _ = try ring.send(user_data, fds.client, data[0..], .{});
         try testing.expectEqual(@as(u32, 1), try ring.submit());
         const cqe_send = try ring.copy_cqe();
         if (cqe_send.err() == .INVAL) return error.SkipZigTest;
@@ -5997,7 +6010,7 @@ test "ring mapped buffers multishot recv" {
     const data = [_]u8{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf };
     {
         const user_data = rnd.int(u64);
-        _ = try ring.send(user_data, fds.client, data[0..], 0);
+        _ = try ring.send(user_data, fds.client, data[0..], .{});
         try testing.expectEqual(@as(u32, 1), try ring.submit());
         const cqe_send = try ring.copy_cqe();
         if (cqe_send.err() == .INVAL) return error.SkipZigTest;
@@ -6056,7 +6069,7 @@ test "ring mapped buffers multishot recv" {
     // cancel pending multishot recv operation
     {
         const cancel_user_data = rnd.int(u64);
-        _ = try ring.cancel(cancel_user_data, recv_user_data, 0);
+        _ = try ring.cancel(cancel_user_data, recv_user_data, .{});
         try testing.expectEqual(@as(u32, 1), try ring.submit());

         // expect completion of cancel operation and completion of recv operation
@@ -6205,11 +6218,16 @@ test 
"bind/listen/connect" { .port = 0, .addr = @bitCast([4]u8{ 127, 0, 0, 1 }), }; + // TODO: switch family to IpProto type + // const proto: linux.IpProto = switch (addr.any.family) { + // .unix => .default, + // else => .tcp, + // }; const proto: u32 = if (addr.family == linux.AF.UNIX) 0 else linux.IPPROTO.TCP; const listen_fd = brk: { // Create socket - _ = try ring.socket(1, addr.any.family, .{ .type = .stream, .flags = .{ .cloexec = true } }, proto, 0); + _ = try ring.socket(1, @enumFromInt(addr.any.family), .{ .type = .stream, .flags = .{ .cloexec = true } }, @enumFromInt(proto), 0); try testing.expectEqual(1, try ring.submit()); var cqe = try ring.copy_cqe(); try testing.expectEqual(1, cqe.user_data); @@ -6250,7 +6268,7 @@ test "bind/listen/connect" { const connect_fd = brk: { // Create connect socket - _ = try ring.socket(6, addr.family, linux.SOCK.STREAM | linux.SOCK.CLOEXEC, proto, 0); + _ = try ring.socket(6, @enumFromInt(addr.family), .{ .type = .stream, .flags = .{ .cloexec = true } }, @enumFromInt(proto), 0); try testing.expectEqual(1, try ring.submit()); const cqe = try ring.copy_cqe(); try testing.expectEqual(6, cqe.user_data); @@ -6262,7 +6280,7 @@ test "bind/listen/connect" { }; // Prepare accept/connect operations - _ = try ring.accept(7, listen_fd, null, null, 0); + _ = try ring.accept(7, listen_fd, null, null, .{}); _ = try ring.connect(8, connect_fd, addrAny(&addr), @sizeOf(linux.sockaddr.in)); try testing.expectEqual(2, try ring.submit()); // Get listener accepted socket @@ -6284,7 +6302,7 @@ test "bind/listen/connect" { // Shutdown and close all sockets for ([_]posix.socket_t{ connect_fd, accept_fd, listen_fd }) |fd| { - (try ring.shutdown(9, fd, posix.SHUT.RDWR)).link_next(); + (try ring.shutdown(9, fd, .rdwr)).link_next(); _ = try ring.close(10, fd); try testing.expectEqual(2, try ring.submit()); for (0..2) |i| { @@ -6300,8 +6318,8 @@ fn testSendRecv(ring: *IoUring, send_fd: posix.socket_t, recv_fd: posix.socket_t var buffer_recv: [buffer_send.len * 2]u8 = undefined; // 2 sends - _ = try ring.send(1, send_fd, buffer_send, linux.Msg.WAITALL); - _ = try ring.send(2, send_fd, buffer_send, linux.Msg.WAITALL); + _ = try ring.send(1, send_fd, buffer_send, .{ .waitall = true }); + _ = try ring.send(2, send_fd, buffer_send, .{ .waitall = true }); try testing.expectEqual(2, try ring.submit()); for (0..2) |i| { const cqe = try ring.copy_cqe(); @@ -6313,7 +6331,7 @@ fn testSendRecv(ring: *IoUring, send_fd: posix.socket_t, recv_fd: posix.socket_t // receive var recv_len: usize = 0; while (recv_len < buffer_send.len * 2) { - _ = try ring.recv(3, recv_fd, .{ .buffer = buffer_recv[recv_len..] }, 0); + _ = try ring.recv(3, recv_fd, .{ .buffer = buffer_recv[recv_len..] 
}, .{});
     try testing.expectEqual(1, try ring.submit());
     const cqe = try ring.copy_cqe();
     try testing.expectEqual(3, cqe.user_data);

From 864316af0f777136f5e43407c73325fb8b8854cc Mon Sep 17 00:00:00 2001
From: Bernard Assan
Date: Thu, 9 Oct 2025 14:33:04 +0000
Subject: [PATCH 11/41] Add So and Sol typed flags

Replace SO with the So type and add SO named constants for compatibility
Replace SOL with the Sol type and add SOL named constants for compatibility
Replace some more untyped flags with their typed equivalents
Remove any unnecessary IoUring dependency on linux.*
Add TODOs for all untyped flags

Signed-off-by: Bernard Assan
---
 lib/std/os/linux.zig         | 766 +++++++++++++++++++++--------------
 lib/std/os/linux/IoUring.zig |  79 ++--
 2 files changed, 505 insertions(+), 340 deletions(-)

diff --git a/lib/std/os/linux.zig b/lib/std/os/linux.zig
index 2f5a5d36e951..8f49cc9ad22f 100644
--- a/lib/std/os/linux.zig
+++ b/lib/std/os/linux.zig
@@ -4226,282 +4226,369 @@ pub const Af = enum(u16) {
     pub const MAX: u16 = @intFromEnum(Af.max);
 };

-pub const SO = if (is_mips) struct {
-    pub const DEBUG = 1;
-    pub const REUSEADDR = 0x0004;
-    pub const KEEPALIVE = 0x0008;
-    pub const DONTROUTE = 0x0010;
-    pub const BROADCAST = 0x0020;
-    pub const LINGER = 0x0080;
-    pub const OOBINLINE = 0x0100;
-    pub const REUSEPORT = 0x0200;
-    pub const SNDBUF = 0x1001;
-    pub const RCVBUF = 0x1002;
-    pub const SNDLOWAT = 0x1003;
-    pub const RCVLOWAT = 0x1004;
-    pub const RCVTIMEO = 0x1006;
-    pub const SNDTIMEO = 0x1005;
-    pub const ERROR = 0x1007;
-    pub const TYPE = 0x1008;
-    pub const ACCEPTCONN = 0x1009;
-    pub const PROTOCOL = 0x1028;
-    pub const DOMAIN = 0x1029;
-    pub const NO_CHECK = 11;
-    pub const PRIORITY = 12;
-    pub const BSDCOMPAT = 14;
-    pub const PASSCRED = 17;
-    pub const PEERCRED = 18;
-    pub const PEERSEC = 30;
-    pub const SNDBUFFORCE = 31;
-    pub const RCVBUFFORCE = 33;
-    pub const SECURITY_AUTHENTICATION = 22;
-    pub const SECURITY_ENCRYPTION_TRANSPORT = 23;
-    pub const SECURITY_ENCRYPTION_NETWORK = 24;
-    pub const BINDTODEVICE = 25;
-    pub const ATTACH_FILTER = 26;
-    pub const DETACH_FILTER = 27;
-    pub const GET_FILTER = ATTACH_FILTER;
-    pub const PEERNAME = 28;
-    pub const TIMESTAMP_OLD = 29;
-    pub const PASSSEC = 34;
-    pub const TIMESTAMPNS_OLD = 35;
-    pub const MARK = 36;
-    pub const TIMESTAMPING_OLD = 37;
-    pub const RXQ_OVFL = 40;
-    pub const WIFI_STATUS = 41;
-    pub const PEEK_OFF = 42;
-    pub const NOFCS = 43;
-    pub const LOCK_FILTER = 44;
-    pub const SELECT_ERR_QUEUE = 45;
-    pub const BUSY_POLL = 46;
-    pub const MAX_PACING_RATE = 47;
-    pub const BPF_EXTENSIONS = 48;
-    pub const INCOMING_CPU = 49;
-    pub const ATTACH_BPF = 50;
-    pub const DETACH_BPF = DETACH_FILTER;
-    pub const ATTACH_REUSEPORT_CBPF = 51;
-    pub const ATTACH_REUSEPORT_EBPF = 52;
-    pub const CNX_ADVICE = 53;
-    pub const MEMINFO = 55;
-    pub const INCOMING_NAPI_ID = 56;
-    pub const COOKIE = 57;
-    pub const PEERGROUPS = 59;
-    pub const ZEROCOPY = 60;
-    pub const TXTIME = 61;
-    pub const BINDTOIFINDEX = 62;
-    pub const TIMESTAMP_NEW = 63;
-    pub const TIMESTAMPNS_NEW = 64;
-    pub const TIMESTAMPING_NEW = 65;
-    pub const RCVTIMEO_NEW = 66;
-    pub const SNDTIMEO_NEW = 67;
-    pub const DETACH_REUSEPORT_BPF = 68;
-} else if (is_ppc) struct {
-    pub const DEBUG = 1;
-    pub const REUSEADDR = 2;
-    pub const TYPE = 3;
-    pub const ERROR = 4;
-    pub const DONTROUTE = 5;
-    pub const BROADCAST = 6;
-    pub const SNDBUF = 7;
-    pub const RCVBUF = 8;
-    pub const KEEPALIVE = 9;
-    pub const OOBINLINE = 10;
-    pub const NO_CHECK = 11;
-    pub const PRIORITY = 12;
-    pub const LINGER = 13;
-
pub const BSDCOMPAT = 14; - pub const REUSEPORT = 15; - pub const RCVLOWAT = 16; - pub const SNDLOWAT = 17; - pub const RCVTIMEO = 18; - pub const SNDTIMEO = 19; - pub const PASSCRED = 20; - pub const PEERCRED = 21; - pub const ACCEPTCONN = 30; - pub const PEERSEC = 31; - pub const SNDBUFFORCE = 32; - pub const RCVBUFFORCE = 33; - pub const PROTOCOL = 38; - pub const DOMAIN = 39; - pub const SECURITY_AUTHENTICATION = 22; - pub const SECURITY_ENCRYPTION_TRANSPORT = 23; - pub const SECURITY_ENCRYPTION_NETWORK = 24; - pub const BINDTODEVICE = 25; - pub const ATTACH_FILTER = 26; - pub const DETACH_FILTER = 27; - pub const GET_FILTER = ATTACH_FILTER; - pub const PEERNAME = 28; - pub const TIMESTAMP_OLD = 29; - pub const PASSSEC = 34; - pub const TIMESTAMPNS_OLD = 35; - pub const MARK = 36; - pub const TIMESTAMPING_OLD = 37; - pub const RXQ_OVFL = 40; - pub const WIFI_STATUS = 41; - pub const PEEK_OFF = 42; - pub const NOFCS = 43; - pub const LOCK_FILTER = 44; - pub const SELECT_ERR_QUEUE = 45; - pub const BUSY_POLL = 46; - pub const MAX_PACING_RATE = 47; - pub const BPF_EXTENSIONS = 48; - pub const INCOMING_CPU = 49; - pub const ATTACH_BPF = 50; - pub const DETACH_BPF = DETACH_FILTER; - pub const ATTACH_REUSEPORT_CBPF = 51; - pub const ATTACH_REUSEPORT_EBPF = 52; - pub const CNX_ADVICE = 53; - pub const MEMINFO = 55; - pub const INCOMING_NAPI_ID = 56; - pub const COOKIE = 57; - pub const PEERGROUPS = 59; - pub const ZEROCOPY = 60; - pub const TXTIME = 61; - pub const BINDTOIFINDEX = 62; - pub const TIMESTAMP_NEW = 63; - pub const TIMESTAMPNS_NEW = 64; - pub const TIMESTAMPING_NEW = 65; - pub const RCVTIMEO_NEW = 66; - pub const SNDTIMEO_NEW = 67; - pub const DETACH_REUSEPORT_BPF = 68; -} else if (is_sparc) struct { - pub const DEBUG = 1; - pub const REUSEADDR = 4; - pub const TYPE = 4104; - pub const ERROR = 4103; - pub const DONTROUTE = 16; - pub const BROADCAST = 32; - pub const SNDBUF = 4097; - pub const RCVBUF = 4098; - pub const KEEPALIVE = 8; - pub const OOBINLINE = 256; - pub const NO_CHECK = 11; - pub const PRIORITY = 12; - pub const LINGER = 128; - pub const BSDCOMPAT = 1024; - pub const REUSEPORT = 512; - pub const PASSCRED = 2; - pub const PEERCRED = 64; - pub const RCVLOWAT = 2048; - pub const SNDLOWAT = 4096; - pub const RCVTIMEO = 8192; - pub const SNDTIMEO = 16384; - pub const ACCEPTCONN = 32768; - pub const PEERSEC = 30; - pub const SNDBUFFORCE = 4106; - pub const RCVBUFFORCE = 4107; - pub const PROTOCOL = 4136; - pub const DOMAIN = 4137; - pub const SECURITY_AUTHENTICATION = 20481; - pub const SECURITY_ENCRYPTION_TRANSPORT = 20482; - pub const SECURITY_ENCRYPTION_NETWORK = 20484; - pub const BINDTODEVICE = 13; - pub const ATTACH_FILTER = 26; - pub const DETACH_FILTER = 27; - pub const GET_FILTER = 26; - pub const PEERNAME = 28; - pub const TIMESTAMP_OLD = 29; - pub const PASSSEC = 31; - pub const TIMESTAMPNS_OLD = 33; - pub const MARK = 34; - pub const TIMESTAMPING_OLD = 35; - pub const RXQ_OVFL = 36; - pub const WIFI_STATUS = 37; - pub const PEEK_OFF = 38; - pub const NOFCS = 39; - pub const LOCK_FILTER = 40; - pub const SELECT_ERR_QUEUE = 41; - pub const BUSY_POLL = 48; - pub const MAX_PACING_RATE = 49; - pub const BPF_EXTENSIONS = 50; - pub const INCOMING_CPU = 51; - pub const ATTACH_BPF = 52; - pub const DETACH_BPF = 27; - pub const ATTACH_REUSEPORT_CBPF = 53; - pub const ATTACH_REUSEPORT_EBPF = 54; - pub const CNX_ADVICE = 55; - pub const MEMINFO = 57; - pub const INCOMING_NAPI_ID = 58; - pub const COOKIE = 59; - pub const PEERGROUPS = 61; - pub const ZEROCOPY = 62; - pub 
const TXTIME = 63; - pub const BINDTOIFINDEX = 65; - pub const TIMESTAMP_NEW = 70; - pub const TIMESTAMPNS_NEW = 66; - pub const TIMESTAMPING_NEW = 67; - pub const RCVTIMEO_NEW = 68; - pub const SNDTIMEO_NEW = 69; - pub const DETACH_REUSEPORT_BPF = 71; -} else struct { - pub const DEBUG = 1; - pub const REUSEADDR = 2; - pub const TYPE = 3; - pub const ERROR = 4; - pub const DONTROUTE = 5; - pub const BROADCAST = 6; - pub const SNDBUF = 7; - pub const RCVBUF = 8; - pub const KEEPALIVE = 9; - pub const OOBINLINE = 10; - pub const NO_CHECK = 11; - pub const PRIORITY = 12; - pub const LINGER = 13; - pub const BSDCOMPAT = 14; - pub const REUSEPORT = 15; - pub const PASSCRED = 16; - pub const PEERCRED = 17; - pub const RCVLOWAT = 18; - pub const SNDLOWAT = 19; - pub const RCVTIMEO = 20; - pub const SNDTIMEO = 21; - pub const ACCEPTCONN = 30; - pub const PEERSEC = 31; - pub const SNDBUFFORCE = 32; - pub const RCVBUFFORCE = 33; - pub const PROTOCOL = 38; - pub const DOMAIN = 39; - pub const SECURITY_AUTHENTICATION = 22; - pub const SECURITY_ENCRYPTION_TRANSPORT = 23; - pub const SECURITY_ENCRYPTION_NETWORK = 24; - pub const BINDTODEVICE = 25; - pub const ATTACH_FILTER = 26; - pub const DETACH_FILTER = 27; - pub const GET_FILTER = ATTACH_FILTER; - pub const PEERNAME = 28; - pub const TIMESTAMP_OLD = 29; - pub const PASSSEC = 34; - pub const TIMESTAMPNS_OLD = 35; - pub const MARK = 36; - pub const TIMESTAMPING_OLD = 37; - pub const RXQ_OVFL = 40; - pub const WIFI_STATUS = 41; - pub const PEEK_OFF = 42; - pub const NOFCS = 43; - pub const LOCK_FILTER = 44; - pub const SELECT_ERR_QUEUE = 45; - pub const BUSY_POLL = 46; - pub const MAX_PACING_RATE = 47; - pub const BPF_EXTENSIONS = 48; - pub const INCOMING_CPU = 49; - pub const ATTACH_BPF = 50; - pub const DETACH_BPF = DETACH_FILTER; - pub const ATTACH_REUSEPORT_CBPF = 51; - pub const ATTACH_REUSEPORT_EBPF = 52; - pub const CNX_ADVICE = 53; - pub const MEMINFO = 55; - pub const INCOMING_NAPI_ID = 56; - pub const COOKIE = 57; - pub const PEERGROUPS = 59; - pub const ZEROCOPY = 60; - pub const TXTIME = 61; - pub const BINDTOIFINDEX = 62; - pub const TIMESTAMP_NEW = 63; - pub const TIMESTAMPNS_NEW = 64; - pub const TIMESTAMPING_NEW = 65; - pub const RCVTIMEO_NEW = 66; - pub const SNDTIMEO_NEW = 67; - pub const DETACH_REUSEPORT_BPF = 68; +// COMMIT: add new Typed So enum +/// SO_* type +pub const So = if (is_mips) enum(u16) { + debug = 1, + reuseaddr = 0x0004, + keepalive = 0x0008, + dontroute = 0x0010, + broadcast = 0x0020, + linger = 0x0080, + oobinline = 0x0100, + reuseport = 0x0200, + sndbuf = 0x1001, + rcvbuf = 0x1002, + sndlowat = 0x1003, + rcvlowat = 0x1004, + sndtimeo = 0x1005, + rcvtimeo = 0x1006, + @"error" = 0x1007, + type = 0x1008, + acceptconn = 0x1009, + protocol = 0x1028, + domain = 0x1029, + no_check = 11, + priority = 12, + bsdcompat = 14, + passcred = 17, + peercred = 18, + peersec = 30, + sndbufforce = 31, + rcvbufforce = 33, + security_authentication = 22, + security_encryption_transport = 23, + security_encryption_network = 24, + bindtodevice = 25, + attach_filter = 26, + detach_filter = 27, + peername = 28, + timestamp_old = 29, + passsec = 34, + timestampns_old = 35, + mark = 36, + timestamping_old = 37, + rxq_ovfl = 40, + wifi_status = 41, + peek_off = 42, + nofcs = 43, + lock_filter = 44, + select_err_queue = 45, + busy_poll = 46, + max_pacing_rate = 47, + bpf_extensions = 48, + incoming_cpu = 49, + attach_bpf = 50, + attach_reuseport_cbpf = 51, + attach_reuseport_ebpf = 52, + cnx_advice = 53, + meminfo = 55, + incoming_napi_id = 
56, + cookie = 57, + peergroups = 59, + zerocopy = 60, + txtime = 61, + bindtoifindex = 62, + timestamp_new = 63, + timestampns_new = 64, + timestamping_new = 65, + rcvtimeo_new = 66, + sndtimeo_new = 67, + detach_reuseport_bpf = 68, + _, + + // aliases + pub const get_filter: So = .attach_filter; + pub const detach_bpf: So = .detach_filter; +} else if (is_ppc) enum(u16) { + debug = 1, + reuseaddr = 2, + type = 3, + @"error" = 4, + dontroute = 5, + broadcast = 6, + sndbuf = 7, + rcvbuf = 8, + keepalive = 9, + oobinline = 10, + no_check = 11, + priority = 12, + linger = 13, + bsdcompat = 14, + reuseport = 15, + rcvlowat = 16, + sndlowat = 17, + rcvtimeo = 18, + sndtimeo = 19, + passcred = 20, + peercred = 21, + acceptconn = 30, + peersec = 31, + sndbufforce = 32, + rcvbufforce = 33, + protocol = 38, + domain = 39, + security_authentication = 22, + security_encryption_transport = 23, + security_encryption_network = 24, + bindtodevice = 25, + attach_filter = 26, + detach_filter = 27, + peername = 28, + timestamp_old = 29, + passsec = 34, + timestampns_old = 35, + mark = 36, + timestamping_old = 37, + rxq_ovfl = 40, + wifi_status = 41, + peek_off = 42, + nofcs = 43, + lock_filter = 44, + select_err_queue = 45, + busy_poll = 46, + max_pacing_rate = 47, + bpf_extensions = 48, + incoming_cpu = 49, + attach_bpf = 50, + attach_reuseport_cbpf = 51, + attach_reuseport_ebpf = 52, + cnx_advice = 53, + meminfo = 55, + incoming_napi_id = 56, + cookie = 57, + peergroups = 59, + zerocopy = 60, + txtime = 61, + bindtoifindex = 62, + timestamp_new = 63, + timestampns_new = 64, + timestamping_new = 65, + rcvtimeo_new = 66, + sndtimeo_new = 67, + detach_reuseport_bpf = 68, + _, + + // aliases + pub const get_filter: So = .attach_filter; + pub const detach_bpf: So = .detach_filter; +} else if (is_sparc) enum(u16) { + debug = 1, + reuseaddr = 4, + type = 4104, + @"error" = 4103, + dontroute = 16, + broadcast = 32, + sndbuf = 4097, + rcvbuf = 4098, + keepalive = 8, + oobinline = 256, + no_check = 11, + priority = 12, + linger = 128, + bsdcompat = 1024, + reuseport = 512, + passcred = 2, + peercred = 64, + rcvlowat = 2048, + sndlowat = 4096, + rcvtimeo = 8192, + sndtimeo = 16384, + acceptconn = 32768, + peersec = 30, + sndbufforce = 4106, + rcvbufforce = 4107, + protocol = 4136, + domain = 4137, + security_authentication = 20481, + security_encryption_transport = 20482, + security_encryption_network = 20484, + bindtodevice = 13, + attach_filter = 26, + detach_filter = 27, + peername = 28, + timestamp_old = 29, + passsec = 31, + timestampns_old = 33, + mark = 34, + timestamping_old = 35, + rxq_ovfl = 36, + wifi_status = 37, + peek_off = 38, + nofcs = 39, + lock_filter = 40, + select_err_queue = 41, + busy_poll = 48, + max_pacing_rate = 49, + bpf_extensions = 50, + incoming_cpu = 51, + attach_bpf = 52, + attach_reuseport_cbpf = 53, + attach_reuseport_ebpf = 54, + cnx_advice = 55, + meminfo = 57, + incoming_napi_id = 58, + cookie = 59, + peergroups = 61, + zerocopy = 62, + txtime = 63, + bindtoifindex = 65, + timestamp_new = 70, + timestampns_new = 66, + timestamping_new = 67, + rcvtimeo_new = 68, + sndtimeo_new = 69, + detach_reuseport_bpf = 71, + _, + + // aliases + pub const get_filter: So = .attach_filter; + pub const detach_bpf: So = .detach_filter; +} else enum(u16) { + debug = 1, + reuseaddr = 2, + type = 3, + @"error" = 4, + dontroute = 5, + broadcast = 6, + sndbuf = 7, + rcvbuf = 8, + keepalive = 9, + oobinline = 10, + no_check = 11, + priority = 12, + linger = 13, + bsdcompat = 14, + reuseport = 15, + 
passcred = 16, + peercred = 17, + rcvlowat = 18, + sndlowat = 19, + rcvtimeo = 20, + sndtimeo = 21, + acceptconn = 30, + peersec = 31, + sndbufforce = 32, + rcvbufforce = 33, + passsec = 34, + timestampns_old = 35, + mark = 36, + timestamping_old = 37, + protocol = 38, + domain = 39, + rxq_ovfl = 40, + wifi_status = 41, + peek_off = 42, + nofcs = 43, + lock_filter = 44, + select_err_queue = 45, + busy_poll = 46, + max_pacing_rate = 47, + bpf_extensions = 48, + incoming_cpu = 49, + attach_bpf = 50, + attach_reuseport_cbpf = 51, + attach_reuseport_ebpf = 52, + cnx_advice = 53, + meminfo = 55, + incoming_napi_id = 56, + cookie = 57, + peergroups = 59, + zerocopy = 60, + txtime = 61, + bindtoifindex = 62, + timestamp_new = 63, + timestampns_new = 64, + timestamping_new = 65, + rcvtimeo_new = 66, + sndtimeo_new = 67, + detach_reuseport_bpf = 68, + security_authentication = 22, + security_encryption_transport = 23, + security_encryption_network = 24, + bindtodevice = 25, + attach_filter = 26, + detach_filter = 27, + peername = 28, + timestamp_old = 29, + _, + + // aliases + pub const get_filter: So = .attach_filter; + pub const detach_bpf: So = .detach_filter; +}; + +// COMMIT: add SO constants +/// Backwards-compatible SO_* constants +pub const SO = struct { + pub const DEBUG: u16 = @intFromEnum(So.debug); + pub const REUSEADDR: u16 = @intFromEnum(So.reuseaddr); + pub const KEEPALIVE: u16 = @intFromEnum(So.keepalive); + pub const DONTROUTE: u16 = @intFromEnum(So.dontroute); + pub const BROADCAST: u16 = @intFromEnum(So.broadcast); + pub const LINGER: u16 = @intFromEnum(So.linger); + pub const OOBINLINE: u16 = @intFromEnum(So.oobinline); + pub const REUSEPORT: u16 = @intFromEnum(So.reuseport); + pub const SNDBUF: u16 = @intFromEnum(So.sndbuf); + pub const RCVBUF: u16 = @intFromEnum(So.rcvbuf); + pub const SNDLOWAT: u16 = @intFromEnum(So.sndlowat); + pub const RCVLOWAT: u16 = @intFromEnum(So.rcvlowat); + pub const RCVTIMEO: u16 = @intFromEnum(So.rcvtimeo); + pub const SNDTIMEO: u16 = @intFromEnum(So.sndtimeo); + pub const ERROR: u16 = @intFromEnum(So.@"error"); + pub const TYPE: u16 = @intFromEnum(So.type); + pub const ACCEPTCONN: u16 = @intFromEnum(So.acceptconn); + pub const PROTOCOL: u16 = @intFromEnum(So.protocol); + pub const DOMAIN: u16 = @intFromEnum(So.domain); + pub const NO_CHECK: u16 = @intFromEnum(So.no_check); + pub const PRIORITY: u16 = @intFromEnum(So.priority); + pub const BSDCOMPAT: u16 = @intFromEnum(So.bsdcompat); + pub const PASSCRED: u16 = @intFromEnum(So.passcred); + pub const PEERCRED: u16 = @intFromEnum(So.peercred); + pub const PEERSEC: u16 = @intFromEnum(So.peersec); + pub const SNDBUFFORCE: u16 = @intFromEnum(So.sndbufforce); + pub const RCVBUFFORCE: u16 = @intFromEnum(So.rcvbufforce); + pub const SECURITY_AUTHENTICATION: u16 = @intFromEnum(So.security_authentication); + pub const SECURITY_ENCRYPTION_TRANSPORT: u16 = @intFromEnum(So.security_encryption_transport); + pub const SECURITY_ENCRYPTION_NETWORK: u16 = @intFromEnum(So.security_encryption_network); + pub const BINDTODEVICE: u16 = @intFromEnum(So.bindtodevice); + pub const ATTACH_FILTER: u16 = @intFromEnum(So.attach_filter); + pub const DETACH_FILTER: u16 = @intFromEnum(So.detach_filter); + pub const GET_FILTER: u16 = ATTACH_FILTER; // alias + pub const PEERNAME: u16 = @intFromEnum(So.peername); + pub const TIMESTAMP_OLD: u16 = @intFromEnum(So.timestamp_old); + pub const PASSSEC: u16 = @intFromEnum(So.passsec); + pub const TIMESTAMPNS_OLD: u16 = @intFromEnum(So.timestampns_old); + pub const MARK: u16 = 
@intFromEnum(So.mark); + pub const TIMESTAMPING_OLD: u16 = @intFromEnum(So.timestamping_old); + pub const RXQ_OVFL: u16 = @intFromEnum(So.rxq_ovfl); + pub const WIFI_STATUS: u16 = @intFromEnum(So.wifi_status); + pub const PEEK_OFF: u16 = @intFromEnum(So.peek_off); + pub const NOFCS: u16 = @intFromEnum(So.nofcs); + pub const LOCK_FILTER: u16 = @intFromEnum(So.lock_filter); + pub const SELECT_ERR_QUEUE: u16 = @intFromEnum(So.select_err_queue); + pub const BUSY_POLL: u16 = @intFromEnum(So.busy_poll); + pub const MAX_PACING_RATE: u16 = @intFromEnum(So.max_pacing_rate); + pub const BPF_EXTENSIONS: u16 = @intFromEnum(So.bpf_extensions); + pub const INCOMING_CPU: u16 = @intFromEnum(So.incoming_cpu); + pub const ATTACH_BPF: u16 = @intFromEnum(So.attach_bpf); + pub const DETACH_BPF: u16 = DETACH_FILTER; // alias in original + pub const ATTACH_REUSEPORT_CBPF: u16 = @intFromEnum(So.attach_reuseport_cbpf); + pub const ATTACH_REUSEPORT_EBPF: u16 = @intFromEnum(So.attach_reuseport_ebpf); + pub const CNX_ADVICE: u16 = @intFromEnum(So.cnx_advice); + pub const MEMINFO: u16 = @intFromEnum(So.meminfo); + pub const INCOMING_NAPI_ID: u16 = @intFromEnum(So.incoming_napi_id); + pub const COOKIE: u16 = @intFromEnum(So.cookie); + pub const PEERGROUPS: u16 = @intFromEnum(So.peergroups); + pub const ZEROCOPY: u16 = @intFromEnum(So.zerocopy); + pub const TXTIME: u16 = @intFromEnum(So.txtime); + pub const BINDTOIFINDEX: u16 = @intFromEnum(So.bindtoifindex); + pub const TIMESTAMP_NEW: u16 = @intFromEnum(So.timestamp_new); + pub const TIMESTAMPNS_NEW: u16 = @intFromEnum(So.timestampns_new); + pub const TIMESTAMPING_NEW: u16 = @intFromEnum(So.timestamping_new); + pub const RCVTIMEO_NEW: u16 = @intFromEnum(So.rcvtimeo_new); + pub const SNDTIMEO_NEW: u16 = @intFromEnum(So.sndtimeo_new); + pub const DETACH_REUSEPORT_BPF: u16 = @intFromEnum(So.detach_reuseport_bpf); }; pub const SCM = struct { @@ -4517,37 +4604,100 @@ pub const SCM = struct { pub const TXTIME = SO.TXTIME; }; -pub const SOL = struct { - pub const SOCKET = if (is_mips or is_sparc) 65535 else 1; - - pub const IP = 0; - pub const IPV6 = 41; - pub const ICMPV6 = 58; - - pub const RAW = 255; - pub const DECNET = 261; - pub const X25 = 262; - pub const PACKET = 263; - pub const ATM = 264; - pub const AAL = 265; - pub const IRDA = 266; - pub const NETBEUI = 267; - pub const LLC = 268; - pub const DCCP = 269; - pub const NETLINK = 270; - pub const TIPC = 271; - pub const RXRPC = 272; - pub const PPPOL2TP = 273; - pub const BLUETOOTH = 274; - pub const PNPIPE = 275; - pub const RDS = 276; - pub const IUCV = 277; - pub const CAIF = 278; - pub const ALG = 279; - pub const NFC = 280; - pub const KCM = 281; - pub const TLS = 282; - pub const XDP = 283; +/// Deprecated in favor of Sol +pub const SOL = Sol; +// https://github.com/torvalds/linux/blob/0d97f2067c166eb495771fede9f7b73999c67f66/include/linux/socket.h#L347C1-L388C22 +/// Socket option level for setsockopt(2)/getsockopt(2) +pub const Sol = enum(u16) { + ip = 0, + socket = if (is_mips or is_sparc) 65535 else 1, + tcp = 6, + udp = 17, + ipv6 = 41, + icmpv6 = 58, + sctp = 132, + /// UDP-Lite (RFC 3828) + udplite = 136, + raw = 255, + ipx = 256, + ax25 = 257, + atalk = 258, + netrom = 259, + rose = 260, + decnet = 261, + x25 = 262, + packet = 263, + /// ATM layer (cell level) + atm = 264, + /// ATM Adaption Layer (packet level) + aal = 265, + irda = 266, + netbeui = 267, + llc = 268, + dccp = 269, + netlink = 270, + tipc = 271, + rxrpc = 272, + pppol2tp = 273, + bluetooth = 274, + pnpipe = 275, + rds = 276, + iucv = 
277, + caif = 278, + alg = 279, + nfc = 280, + kcm = 281, + tls = 282, + xdp = 283, + mptcp = 284, + mctp = 285, + smc = 286, + vsock = 287, + _, + + /// Deprecated constants for compatibility with current Zig + pub const IP: u16 = @intFromEnum(Sol.ip); + pub const SOCKET: u16 = @intFromEnum(Sol.socket); + pub const TCP: u16 = @intFromEnum(Sol.tcp); + pub const UDP: u16 = @intFromEnum(Sol.udp); + pub const IPV6: u16 = @intFromEnum(Sol.ipv6); + pub const ICMPV6: u16 = @intFromEnum(Sol.icmpv6); + pub const SCTP: u16 = @intFromEnum(Sol.sctp); + pub const UDPLITE: u16 = @intFromEnum(Sol.udplite); + + pub const RAW: u16 = @intFromEnum(Sol.raw); + pub const IPX: u16 = @intFromEnum(Sol.ipx); + pub const AX25: u16 = @intFromEnum(Sol.ax25); + pub const ATALK: u16 = @intFromEnum(Sol.atalk); + pub const NETROM: u16 = @intFromEnum(Sol.netrom); + pub const ROSE: u16 = @intFromEnum(Sol.rose); + pub const DECNET: u16 = @intFromEnum(Sol.decnet); + pub const X25: u16 = @intFromEnum(Sol.x25); + pub const PACKET: u16 = @intFromEnum(Sol.packet); + pub const ATM: u16 = @intFromEnum(Sol.atm); + pub const AAL: u16 = @intFromEnum(Sol.aal); + pub const IRDA: u16 = @intFromEnum(Sol.irda); + pub const NETBEUI: u16 = @intFromEnum(Sol.netbeui); + pub const LLC: u16 = @intFromEnum(Sol.llc); + pub const DCCP: u16 = @intFromEnum(Sol.dccp); + pub const NETLINK: u16 = @intFromEnum(Sol.netlink); + pub const TIPC: u16 = @intFromEnum(Sol.tipc); + pub const RXRPC: u16 = @intFromEnum(Sol.rxrpc); + pub const PPPOL2TP: u16 = @intFromEnum(Sol.pppol2tp); + pub const BLUETOOTH: u16 = @intFromEnum(Sol.bluetooth); + pub const PNPIPE: u16 = @intFromEnum(Sol.pnpipe); + pub const RDS: u16 = @intFromEnum(Sol.rds); + pub const IUCV: u16 = @intFromEnum(Sol.iucv); + pub const CAIF: u16 = @intFromEnum(Sol.caif); + pub const ALG: u16 = @intFromEnum(Sol.alg); + pub const NFC: u16 = @intFromEnum(Sol.nfc); + pub const KCM: u16 = @intFromEnum(Sol.kcm); + pub const TLS: u16 = @intFromEnum(Sol.tls); + pub const XDP: u16 = @intFromEnum(Sol.xdp); + pub const MPTCP: u16 = @intFromEnum(Sol.mptcp); + pub const MCTP: u16 = @intFromEnum(Sol.mctp); + pub const SMC: u16 = @intFromEnum(Sol.smc); + pub const VSOCK: u16 = @intFromEnum(Sol.vsock); }; pub const SOMAXCONN = 128; @@ -7123,7 +7273,7 @@ pub const utsname = extern struct { pub const HOST_NAME_MAX = 64; // COMMIT: RenameFlags -pub const RenameFlags = packed struct(u32) { +pub const Rename = packed struct(u32) { /// Don't overwrite target noreplace: bool = false, /// Exchange source and dest diff --git a/lib/std/os/linux/IoUring.zig b/lib/std/os/linux/IoUring.zig index 1ad5ac0fdb69..2574dfab78fb 100644 --- a/lib/std/os/linux/IoUring.zig +++ b/lib/std/os/linux/IoUring.zig @@ -671,8 +671,8 @@ pub fn send_zc_fixed( user_data: u64, fd: linux.fd_t, buffer: []const u8, - send_flags: u32, - zc_flags: u16, + send_flags: linux.Msg, + zc_flags: Sqe.SendRecv, buf_index: u16, ) !*Sqe { const sqe = try self.get_sqe(); @@ -721,7 +721,7 @@ pub fn sendmsg_zc( user_data: u64, fd: linux.fd_t, msg: *const linux.msghdr_const, - flags: u32, + flags: linux.Msg, ) !*Sqe { const sqe = try self.get_sqe(); sqe.prep_sendmsg_zc(fd, msg, flags); @@ -900,7 +900,7 @@ pub fn poll_update( old_user_data: u64, new_user_data: u64, poll_mask: u32, - flags: u32, + flags: u32, // TODO: what are the flags ) !*Sqe { const sqe = try self.get_sqe(); sqe.prep_poll_update(old_user_data, new_user_data, poll_mask, flags); @@ -986,7 +986,7 @@ pub fn renameat( old_path: [*:0]const u8, new_dir_fd: linux.fd_t, new_path: [*:0]const u8, - flags: 
linux.RenameFlags, + flags: linux.Rename, ) !*Sqe { const sqe = try self.get_sqe(); sqe.prep_renameat(old_dir_fd, old_path, new_dir_fd, new_path, flags); @@ -1100,7 +1100,7 @@ pub fn waitid( id: i32, infop: *linux.siginfo_t, options: linux.W, - flags: u32, + flags: u32, // TODO: wait flags ) !*Sqe { const sqe = try self.get_sqe(); sqe.prep_waitid(id_type, id, infop, options, flags); @@ -1176,7 +1176,7 @@ pub fn register_files_sparse(self: *IoUring, nr_files: u32) !void { self.fd, .REGISTER_FILES2, @ptrCast(®), - @as(u32, @sizeOf(linux.io_uring_rsrc_register)), + @as(u32, @sizeOf(RsrcRegister)), ); return handle_registration_result(res); @@ -1197,7 +1197,7 @@ pub fn register_file_alloc_range(self: *IoUring, offset: u32, len: u32) !void { self.fd, .REGISTER_FILE_ALLOC_RANGE, @ptrCast(&range), - @as(u32, @sizeOf(linux.io_uring_file_index_range)), + @as(u32, @sizeOf(FileIndexRange)), ); return handle_registration_result(res); @@ -1383,7 +1383,7 @@ pub fn bind( fd: linux.fd_t, addr: *const posix.sockaddr, addrlen: posix.socklen_t, - flags: u32, + flags: u32, // TODO: bind flags ) !*Sqe { const sqe = try self.get_sqe(); sqe.prep_bind(fd, addr, addrlen, flags); @@ -1399,7 +1399,7 @@ pub fn listen( user_data: u64, fd: linux.fd_t, backlog: usize, - flags: u32, + flags: u32, // TODO: listen flags ) !*Sqe { const sqe = try self.get_sqe(); sqe.prep_listen(fd, backlog, flags); @@ -1415,8 +1415,8 @@ pub fn cmd_sock( user_data: u64, cmd_op: SocketOp, fd: linux.fd_t, - level: u32, // linux.SOL - optname: u32, // linux.SO + level: linux.Sol, + optname: linux.So, optval: u64, // pointer to the option value optlen: u32, // size of the option value ) !*Sqe { @@ -1433,8 +1433,8 @@ pub fn setsockopt( self: *IoUring, user_data: u64, fd: linux.fd_t, - level: u32, // linux.SOL - optname: u32, // linux.SO + level: linux.Sol, + optname: linux.So, opt: []const u8, ) !*Sqe { return try self.cmd_sock( @@ -1455,8 +1455,8 @@ pub fn getsockopt( self: *IoUring, user_data: u64, fd: linux.fd_t, - level: u32, // linux.SOL - optname: u32, // linux.SO + level: linux.Sol, + optname: linux.So, opt: []u8, ) !*Sqe { return try self.cmd_sock( @@ -1944,7 +1944,7 @@ pub const Sqe = extern struct { sqe: *Sqe, epfd: linux.fd_t, fd: linux.fd_t, - op: u32, + op: u32, // TODO: what is the type of OP ev: ?*linux.epoll_event, ) void { sqe.prep_rw(.EPOLL_CTL, epfd, @intFromPtr(ev), op, @intCast(fd)); @@ -2116,7 +2116,7 @@ pub const Sqe = extern struct { pub fn prep_poll_add( sqe: *Sqe, fd: linux.fd_t, - poll_mask: linux.POLL, + poll_mask: linux.POLL, // TODO: Poll mask typed ) void { sqe.prep_rw(.POLL_ADD, fd, @intFromPtr(@as(?*anyopaque, null)), 0, 0); // Poll masks previously used to comprise of 16 bits in the flags union of @@ -2139,7 +2139,7 @@ pub const Sqe = extern struct { sqe: *Sqe, old_user_data: u64, new_user_data: u64, - poll_mask: linux.POLL, + poll_mask: linux.POLL, //TODO: Poll mask flags: uflags.Poll, ) void { sqe.prep_rw(.POLL_REMOVE, -1, old_user_data, flags, new_user_data); @@ -2226,7 +2226,7 @@ pub const Sqe = extern struct { old_path: [*:0]const u8, new_dir_fd: linux.fd_t, new_path: [*:0]const u8, - flags: linux.RenameFlags, + flags: linux.Rename, ) void { sqe.prep_rw( .RENAMEAT, @@ -2335,8 +2335,7 @@ pub const Sqe = extern struct { domain: linux.Af, socket_type: linux.Sock, protocol: linux.IpProto, - /// flags is unused - flags: u32, + flags: u32, // flags is unused ) void { sqe.prep_rw(.SOCKET, @intFromEnum(domain), 0, @intFromEnum(protocol), @intCast(@as(u32, @bitCast(socket_type)))); sqe.rw_flags = flags; @@ -2347,8 
+2346,7 @@ pub const Sqe = extern struct { domain: linux.Af, socket_type: linux.Sock, protocol: linux.IpProto, - /// flags is unused - flags: u32, + flags: u32, // flags is unused file_index: u32, ) void { prep_socket(sqe, domain, socket_type, protocol, flags); @@ -2405,8 +2403,8 @@ pub const Sqe = extern struct { sqe: *Sqe, cmd_op: SocketOp, fd: linux.fd_t, - level: u32, // TODO: linux.SOL, - optname: u32, // TODO: linux.SO, + level: linux.Sol, + optname: linux.So, optval: u64, optlen: u32, ) void { @@ -2418,8 +2416,8 @@ pub const Sqe = extern struct { level: u32, optname: u32, }{ - .level = level, - .optname = optname, + .level = @intFromEnum(level), + .optname = @intFromEnum(optname), }); // splice_fd_in if overloaded u32 -> i32 sqe.splice_fd_in = @bitCast(optlen); @@ -2640,6 +2638,7 @@ pub const BufferGroup = struct { allocator.free(self.heads); } + // TODO: recv flags // Prepare recv operation which will select buffer from this group. pub fn recv(self: *BufferGroup, user_data: u64, fd: posix.fd_t, flags: u32) !*Sqe { var sqe = try self.ring.get_sqe(); @@ -2651,6 +2650,7 @@ pub const BufferGroup = struct { return sqe; } + // TODO: recv_multishot flags // Prepare multishot recv operation which will select buffer from this group. pub fn recv_multishot(self: *BufferGroup, user_data: u64, fd: posix.fd_t, flags: u32) !*Sqe { var sqe = try self.recv(user_data, fd, flags); @@ -2732,6 +2732,7 @@ pub const SqOffsets = extern struct { ring_mask: u32, /// entries in ring ring_entries: u32, + // TODO: find type of this flags /// ring flags flags: u32, /// number of sqes not submitted @@ -2750,6 +2751,7 @@ pub const CqOffsets = extern struct { ring_entries: u32, overflow: u32, cqes: u32, + // TODO: find type of these flags flags: u32, resv: u32, user_addr: u64, @@ -2809,6 +2811,7 @@ pub const MemRegionReg = extern struct { /// matches io_uring_rsrc_register in liburing pub const RsrcRegister = extern struct { nr: u32, + // TODO: find type of these flags flags: u32, resv2: u64, data: u64, @@ -2878,6 +2881,7 @@ pub const Restriction = extern struct { register_op: RegisterOp, /// IORING_RESTRICTION_SQE_OP sqe_op: Op, + // TODO: find type of these flags /// IORING_RESTRICTION_SQE_FLAGS_* sqe_flags: u8, }, @@ -3035,6 +3039,7 @@ pub const RecvmsgOut = extern struct { namelen: u32, controllen: u32, payloadlen: u32, + // TODO: find type of these flags flags: u32, }; @@ -3820,12 +3825,14 @@ test "splice/read" { try testing.expectEqual(Op.SPLICE, sqe_splice_to_pipe.opcode); try testing.expectEqual(@as(u64, 0), sqe_splice_to_pipe.addr); try testing.expectEqual(pipe_offset, sqe_splice_to_pipe.off); + // TODO: use io_link function sqe_splice_to_pipe.flags.IO_LINK = true; const sqe_splice_from_pipe = try ring.splice(0x22222222, fds[0], pipe_offset, fd_dst, 10, buffer_write.len); try testing.expectEqual(Op.SPLICE, sqe_splice_from_pipe.opcode); try testing.expectEqual(pipe_offset, sqe_splice_from_pipe.addr); try testing.expectEqual(@as(u64, 10), sqe_splice_from_pipe.off); + // TODO: use io_link function sqe_splice_from_pipe.flags.IO_LINK = true; const sqe_read = try ring.read(0x33333333, fd_dst, .{ .buffer = buffer_read[0..] 
}, 10); @@ -3897,6 +3904,7 @@ test "write_fixed/read_fixed" { const sqe_write = try ring.write_fixed(0x45454545, fd, &buffers[0], 3, 0); try testing.expectEqual(Op.WRITE_FIXED, sqe_write.opcode); try testing.expectEqual(@as(u64, 3), sqe_write.off); + // TODO: use io_link function sqe_write.flags.IO_LINK = true; const sqe_read = try ring.read_fixed(0x12121212, fd, &buffers[1], 0, 1); @@ -4026,6 +4034,7 @@ test "accept/connect/send/recv" { var buffer_recv = [_]u8{ 0, 1, 0, 1, 0 }; const sqe_send = try ring.send(0xeeeeeeee, socket_test_harness.client, buffer_send[0..], .{}); + // TODO: use io_link function sqe_send.flags.IO_LINK = true; _ = try ring.recv(0xffffffff, socket_test_harness.server, .{ .buffer = buffer_recv[0..] }, .{}); try testing.expectEqual(@as(u32, 2), try ring.submit()); @@ -4286,6 +4295,7 @@ test "accept/connect/recv/link_timeout" { var buffer_recv = [_]u8{ 0, 1, 0, 1, 0 }; const sqe_recv = try ring.recv(0xffffffff, socket_test_harness.server, .{ .buffer = buffer_recv[0..] }, .{}); + // TODO: use io_link function sqe_recv.flags.IO_LINK = true; const ts = linux.kernel_timespec{ .sec = 0, .nsec = 1000000 }; @@ -4511,6 +4521,7 @@ test "register_files_update" { { const sqe = try ring.read(0xcccccccc, fd_index, .{ .buffer = &buffer }, 0); try testing.expectEqual(Op.READ, sqe.opcode); + // TODO: use setflags function sqe.flags.FIXED_FILE = true; try testing.expectEqual(@as(u32, 1), try ring.submit()); @@ -4532,6 +4543,7 @@ test "register_files_update" { // Next read should still work since fd_index in the registered file descriptors hasn't been updated yet. const sqe = try ring.read(0xcccccccc, fd_index, .{ .buffer = &buffer }, 0); try testing.expectEqual(Op.READ, sqe.opcode); + // TODO: use setflags function sqe.flags.FIXED_FILE = true; try testing.expectEqual(@as(u32, 1), try ring.submit()); @@ -4549,6 +4561,7 @@ test "register_files_update" { // Now this should fail since both fds are sparse (-1) const sqe = try ring.read(0xcccccccc, fd_index, .{ .buffer = &buffer }, 0); try testing.expectEqual(Op.READ, sqe.opcode); + // TODO: use setflags function sqe.flags.FIXED_FILE = true; try testing.expectEqual(@as(u32, 1), try ring.submit()); @@ -5412,6 +5425,7 @@ test "accept/connect/send_zc/recv" { // zero-copy send const sqe_send = try ring.send_zc(0xeeeeeeee, socket_test_harness.client, buffer_send[0..], .{}, .{}); + // TODO: use io_link function sqe_send.flags.IO_LINK = true; _ = try ring.recv(0xffffffff, socket_test_harness.server, .{ .buffer = buffer_recv[0..] }, .{}); try testing.expectEqual(@as(u32, 2), try ring.submit()); @@ -5504,6 +5518,7 @@ test "accept_direct" { // Fd field is set to registered file index, returned by accept. // Flag linux.IOSQE_FIXED_FILE must be set. 
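 	// Note: with the typed Sqe flags used throughout this series, setting the raw
 	// IOSQE_FIXED_FILE bit is expressed as the field write `recv_sqe.flags.FIXED_FILE = true`
 	// seen just below; the TODO marks where a dedicated setter helper is still planned.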
const recv_sqe = try ring.recv(read_userdata, fd_index, .{ .buffer = &buffer_recv }, .{}); + // TODO: use setflags function recv_sqe.flags.FIXED_FILE = true; try testing.expectEqual(@as(u32, 1), try ring.submit()); @@ -5738,7 +5753,7 @@ test "openat_direct/close_direct" { try testing.expect(cqe.res == 0); // res is 0 when we specify index // let kernel choose registered file index - _ = try ring.openat_direct(user_data, tmp.dir.fd, path, flags, mode, linux.IORING_FILE_INDEX_ALLOC); + _ = try ring.openat_direct(user_data, tmp.dir.fd, path, flags, mode, constants.FILE_INDEX_ALLOC); try testing.expectEqual(@as(u32, 1), try ring.submit()); cqe = try ring.copy_cqe(); try testing.expectEqual(posix.E.SUCCESS, cqe.err()); @@ -6237,8 +6252,8 @@ test "bind/listen/connect" { // Prepare: set socket option * 2, bind, listen var optval: u32 = 1; - (try ring.setsockopt(2, listen_fd, linux.SOL.SOCKET, linux.SO.REUSEADDR, mem.asBytes(&optval))).link_next(); - (try ring.setsockopt(3, listen_fd, linux.SOL.SOCKET, linux.SO.REUSEPORT, mem.asBytes(&optval))).link_next(); + (try ring.setsockopt(2, listen_fd, .socket, .reuseaddr, mem.asBytes(&optval))).link_next(); + (try ring.setsockopt(3, listen_fd, .socket, .reuseport, mem.asBytes(&optval))).link_next(); (try ring.bind(4, listen_fd, addrAny(&addr), @sizeOf(linux.sockaddr.in), 0)).link_next(); _ = try ring.listen(5, listen_fd, 1, 0); // Submit 4 operations @@ -6252,7 +6267,7 @@ test "bind/listen/connect" { // Check that socket option is set optval = 0; - _ = try ring.getsockopt(5, listen_fd, linux.SOL.SOCKET, linux.SO.REUSEADDR, mem.asBytes(&optval)); + _ = try ring.getsockopt(5, listen_fd, .socket, .reuseaddr, mem.asBytes(&optval)); try testing.expectEqual(1, try ring.submit()); cqe = try ring.copy_cqe(); try testing.expectEqual(5, cqe.user_data); From efbd148798ddac510ec24e86b7d5fe952fa29ae5 Mon Sep 17 00:00:00 2001 From: Bernard Assan Date: Thu, 9 Oct 2025 23:26:15 +0000 Subject: [PATCH 12/41] Replace EPOLL struct with an EpollOp enum and Epoll packed struct type Prefer Using aliases over packed union field Fix bug in HANDLE_FID declaration Use EpollOp op type and Epoll packed struct flag type in IoUring Signed-off-by: Bernard Assan --- lib/std/os/linux.zig | 139 ++++++++++++++++++++++++----------- lib/std/os/linux/IoUring.zig | 34 +++++---- 2 files changed, 115 insertions(+), 58 deletions(-) diff --git a/lib/std/os/linux.zig b/lib/std/os/linux.zig index 8f49cc9ad22f..46afcf7847a7 100644 --- a/lib/std/os/linux.zig +++ b/lib/std/os/linux.zig @@ -3481,17 +3481,11 @@ pub const STDERR_FILENO = 2; pub const AT = At; /// matches AT_* and AT_STATX_* pub const At = packed struct(u32) { - _reserved: u8 = 0, + _u1: u8 = 0, /// Do not follow symbolic links symlink_nofollow: bool = false, /// Remove directory instead of unlinking file - /// Or - /// File handle is needed to compare object identity and may not be usable - /// with open_by_handle_at(2) - removedir_or_handle_fid: packed union { - removedir: bool, - handle_fid: bool, - } = @bitCast(false), + removedir: bool = false, /// Follow symbolic links. 
symlink_follow: bool = false, /// Suppress terminal automount traversal @@ -3504,7 +3498,11 @@ pub const At = packed struct(u32) { statx_dont_sync: bool = false, /// Apply to the entire subtree recursive: bool = false, - _reserved_1: u16 = 0, + _17: u16 = 0, + + /// File handle is needed to compare object identity and may not be usable + /// with open_by_handle_at(2) + pub const handle_fid: At = .{ .removedir = true }; /// Special value used to indicate openat should use the current working directory pub const fdcwd = -100; @@ -3524,36 +3522,25 @@ pub const At = packed struct(u32) { // /// Special value used to indicate openat should use the current working directory pub const FDCWD = fdcwd; - /// Do not follow symbolic links pub const SYMLINK_NOFOLLOW: u32 = @bitCast(At{ .symlink_nofollow = true }); - /// Remove directory instead of unlinking file - pub const REMOVEDIR: u32 = @bitCast(At{ .removedir_or_handle_fid = .{ .removedir = true } }); - - pub const HANDLE_FID = At{ .removedir_or_handle_fid = .{ .handle_fid = true } }; - + pub const REMOVEDIR: u32 = @bitCast(At{ .removedir = true }); + pub const HANDLE_FID: u32 = @bitCast(handle_fid); /// Follow symbolic links. pub const SYMLINK_FOLLOW: u32 = @bitCast(At{ .symlink_follow = true }); - /// Suppress terminal automount traversal pub const NO_AUTOMOUNT: u32 = @bitCast(At{ .no_automount = true }); - /// Allow empty relative pathname pub const EMPTY_PATH: u32 = @bitCast(At{ .empty_path = true }); - /// Type of synchronisation required from statx() pub const STATX_SYNC_TYPE: u32 = @bitCast(statx_sync_type); - /// - Do whatever stat() does pub const STATX_SYNC_AS_STAT: u32 = @bitCast(statx_sync_as_stat); - /// - Force the attributes to be sync'd with the server pub const STATX_FORCE_SYNC: u32 = @bitCast(At{ .statx_force_sync = true }); - /// - Don't sync attributes with the server pub const STATX_DONT_SYNC: u32 = @bitCast(At{ .statx_dont_sync = true }); - /// Apply to the entire subtree pub const RECURSIVE: u32 = @bitCast(At{ .recursive = true }); }; @@ -5668,28 +5655,96 @@ pub const SER = struct { }; }; -pub const EPOLL = struct { - pub const CLOEXEC = 1 << @bitOffsetOf(O, "CLOEXEC"); +/// Valid opcodes to issue to sys_epoll_ctl() +pub const EpollOp = enum(u32) { + ctl_add = 1, + ctl_del = 2, + ctl_mod = 3, + _, - pub const CTL_ADD = 1; - pub const CTL_DEL = 2; - pub const CTL_MOD = 3; + // Deprecated Constants + pub const CTL_ADD: u32 = @intFromEnum(EpollOp.ctl_add); + pub const CTL_DEL: u32 = @intFromEnum(EpollOp.ctl_del); + pub const CTL_MOD: u32 = @intFromEnum(EpollOp.ctl_mod); +}; - pub const IN = 0x001; - pub const PRI = 0x002; - pub const OUT = 0x004; - pub const RDNORM = 0x040; - pub const RDBAND = 0x080; - pub const WRNORM = if (is_mips) 0x004 else 0x100; - pub const WRBAND = if (is_mips) 0x100 else 0x200; - pub const MSG = 0x400; - pub const ERR = 0x008; - pub const HUP = 0x010; - pub const RDHUP = 0x2000; - pub const EXCLUSIVE = (@as(u32, 1) << 28); - pub const WAKEUP = (@as(u32, 1) << 29); - pub const ONESHOT = (@as(u32, 1) << 30); - pub const ET = (@as(u32, 1) << 31); +/// Deprecated alias for Epoll +pub const EPOLL = Epoll; +/// Epoll event masks +// https://github.com/torvalds/linux/blob/18a7e218cfcdca6666e1f7356533e4c988780b57/include/uapi/linux/eventpoll.h#L30 +pub const Epoll = packed struct(u32) { + // EPOLL event types (lower 16 bits) + // + /// The associated file is available for read(2) operations + in: bool = false, + /// There is an exceptional condition on the file descriptor + pri: bool = false, + /// The 
associated file is available for write(2) operations + out: bool = false, + /// Error condition happened on the associated file descriptor + err: bool = false, + /// Hang up happened on the associated file descriptor + hup: bool = false, + /// Invalid request: fd not open + nval: bool = false, + /// Normal data may be read + rdnorm: bool = false, + /// Priority data may be read + rdband: bool = false, + // COMMIT: new flags + /// Writing is now possible (normal data) + wrnorm: bool = false, + /// Priority data may be written + wrband: bool = false, + /// Message available (unused on Linux) + msg: bool = false, + _12: u2 = 0, + /// Stream socket peer closed connection + rdhup: bool = false, + _15: u13 = 0, + // EPOLL input flags (Higher order flags are included as internal state) + // + /// Internal flag - wakeup generated by io_uring, used to detect + /// recursion back into the io_uring poll handler + uring_wake: bool = true, + /// Set exclusive wakeup mode for the target file descriptor + exclusive: bool = true, + /// Request the handling of system wakeup events so as to prevent system + /// suspends from happening while those events are being processed. + /// Assuming neither EPOLLET nor EPOLLONESHOT is set, system suspends will + /// not be re-allowed until epoll_wait is called again after consuming the + /// wakeup event(s). + /// Requires CAP_BLOCK_SUSPEND + wakeup: bool = true, + /// Set the One Shot behaviour for the target file descriptor + oneshot: bool = true, + /// Set the Edge Triggered behaviour for the target file descriptor + et: bool = true, + + // Deprecated Named constants + // EPOLL event types + pub const IN: u32 = @bitCast(Epoll{ .in = true }); + pub const PRI: u32 = @bitCast(Epoll{ .pri = true }); + pub const OUT: u32 = @bitCast(Epoll{ .out = true }); + pub const ERR: u32 = @bitCast(Epoll{ .err = true }); + pub const HUP: u32 = @bitCast(Epoll{ .hup = true }); + pub const NVAL: u32 = @bitCast(Epoll{ .nval = true }); + pub const RDNORM: u32 = @bitCast(Epoll{ .rdnorm = true }); + pub const RDBAND: u32 = @bitCast(Epoll{ .rdband = true }); + pub const WRNORM: u32 = @bitCast(Epoll{ .wrnorm = true }); + pub const WRBAND: u32 = @bitCast(Epoll{ .wrband = true }); + pub const MSG: u32 = @bitCast(Epoll{ .msg = true }); + pub const RDHUP: u32 = @bitCast(Epoll{ .rdhup = true }); + + // EPOLL input flags + pub const URING_WAKE: u32 = @bitCast(Epoll{ .uring_wake = true }); + pub const EXCLUSIVE: u32 = @bitCast(Epoll{ .exclusive = true }); + pub const WAKEUP: u32 = @bitCast(Epoll{ .wakeup = true }); + pub const ONESHOT: u32 = @bitCast(Epoll{ .oneshot = true }); + pub const ET: u32 = @bitCast(Epoll{ .let = true }); + + /// Flags for epoll_create1 + pub const CLOEXEC = 1 << @bitOffsetOf(O, "CLOEXEC"); }; pub const CLOCK = clockid_t; diff --git a/lib/std/os/linux/IoUring.zig b/lib/std/os/linux/IoUring.zig index 2574dfab78fb..64053e4132c8 100644 --- a/lib/std/os/linux/IoUring.zig +++ b/lib/std/os/linux/IoUring.zig @@ -585,7 +585,7 @@ pub fn epoll_ctl( user_data: u64, epfd: linux.fd_t, fd: linux.fd_t, - op: u32, + op: linux.EpollOp, ev: ?*linux.epoll_event, ) !*Sqe { const sqe = try self.get_sqe(); @@ -871,7 +871,7 @@ pub fn poll_add( self: *IoUring, user_data: u64, fd: linux.fd_t, - poll_mask: u32, + poll_mask: linux.Epoll, ) !*Sqe { const sqe = try self.get_sqe(); sqe.prep_poll_add(fd, poll_mask); @@ -899,8 +899,8 @@ pub fn poll_update( user_data: u64, old_user_data: u64, new_user_data: u64, - poll_mask: u32, - flags: u32, // TODO: what are the flags + poll_mask: linux.Epoll, + flags: 
uflags.Poll, ) !*Sqe { const sqe = try self.get_sqe(); sqe.prep_poll_update(old_user_data, new_user_data, poll_mask, flags); @@ -1100,7 +1100,7 @@ pub fn waitid( id: i32, infop: *linux.siginfo_t, options: linux.W, - flags: u32, // TODO: wait flags + flags: u32, // They are currently unused, and hence 0 should be passed ) !*Sqe { const sqe = try self.get_sqe(); sqe.prep_waitid(id_type, id, infop, options, flags); @@ -1383,7 +1383,9 @@ pub fn bind( fd: linux.fd_t, addr: *const posix.sockaddr, addrlen: posix.socklen_t, - flags: u32, // TODO: bind flags + // liburing doesn't have this flag, hence 0 should be passed + // TODO: consider removing this and all flags like this + flags: u32, ) !*Sqe { const sqe = try self.get_sqe(); sqe.prep_bind(fd, addr, addrlen, flags); @@ -1399,7 +1401,9 @@ pub fn listen( user_data: u64, fd: linux.fd_t, backlog: usize, - flags: u32, // TODO: listen flags + // liburing doesn't have this flag, hence 0 should be passed + // TODO: consider removing this and all flags like this + flags: u32, ) !*Sqe { const sqe = try self.get_sqe(); sqe.prep_listen(fd, backlog, flags); @@ -1599,7 +1603,6 @@ pub const Cqe = extern struct { /// result code for this event res: i32, flags: Flags, - // COMMIT: add big_cqe which was missing in io_uring_cqe type declaration // TODO: add support for the IORING_SETUP_CQE32 case /// If the ring is initialized with IORING_SETUP_CQE32, then this field /// contains 16-bytes of padding, doubling the size of the CQE. @@ -1944,10 +1947,10 @@ pub const Sqe = extern struct { sqe: *Sqe, epfd: linux.fd_t, fd: linux.fd_t, - op: u32, // TODO: what is the type of OP + op: linux.EpollOp, ev: ?*linux.epoll_event, ) void { - sqe.prep_rw(.EPOLL_CTL, epfd, @intFromPtr(ev), op, @intCast(fd)); + sqe.prep_rw(.EPOLL_CTL, epfd, @intFromPtr(ev), @intFromEnum(op), @intCast(fd)); } pub fn prep_recv(sqe: *Sqe, fd: linux.fd_t, buffer: []u8, flags: linux.Msg) void { @@ -1955,7 +1958,6 @@ pub const Sqe = extern struct { sqe.rw_flags = @bitCast(flags); } - // TODO: review recv `flags` pub fn prep_recv_multishot( sqe: *Sqe, fd: linux.fd_t, @@ -2116,7 +2118,7 @@ pub const Sqe = extern struct { pub fn prep_poll_add( sqe: *Sqe, fd: linux.fd_t, - poll_mask: linux.POLL, // TODO: Poll mask typed + poll_mask: linux.Epoll, ) void { sqe.prep_rw(.POLL_ADD, fd, @intFromPtr(@as(?*anyopaque, null)), 0, 0); // Poll masks previously used to comprise of 16 bits in the flags union of @@ -2125,7 +2127,7 @@ pub const Sqe = extern struct { // poll masks are consistently and properly read across multiple kernel // versions, poll masks are enforced to be little-endian. // https://www.spinics.net/lists/io-uring/msg02848.html - sqe.rw_flags = std.mem.nativeToLittle(u32, poll_mask); + sqe.rw_flags = std.mem.nativeToLittle(u32, @as(u32, @bitCast(poll_mask))); } pub fn prep_poll_remove( @@ -2139,7 +2141,7 @@ pub const Sqe = extern struct { sqe: *Sqe, old_user_data: u64, new_user_data: u64, - poll_mask: linux.POLL, //TODO: Poll mask + poll_mask: linux.Epoll, flags: uflags.Poll, ) void { sqe.prep_rw(.POLL_REMOVE, -1, old_user_data, flags, new_user_data); @@ -2149,7 +2151,7 @@ pub const Sqe = extern struct { // poll masks are consistently and properly read across multiple kernel // versions, poll masks are enforced to be little-endian. 
// https://www.spinics.net/lists/io-uring/msg02848.html - sqe.rw_flags = std.mem.nativeToLittle(u32, poll_mask); + sqe.rw_flags = std.mem.nativeToLittle(u32, @as(u32, @bitCast(poll_mask))); } pub fn prep_fallocate( @@ -2243,7 +2245,7 @@ pub const Sqe = extern struct { sqe: *Sqe, dir_fd: linux.fd_t, path: [*:0]const u8, - flags: linux.At, // TODO: unlink flags only AT_REMOVEDIR + flags: linux.At, ) void { sqe.prep_rw(.UNLINKAT, dir_fd, @intFromPtr(path), 0, 0); sqe.rw_flags = @bitCast(flags); From eba509d78b1e41728ea493d4fab3c08c7e5b415a Mon Sep 17 00:00:00 2001 From: Bernard Assan Date: Fri, 10 Oct 2025 12:33:26 +0000 Subject: [PATCH 13/41] fix error of setting some fields in Epoll to true by default Add Deprecated Op constants for compatibility Signed-off-by: Bernard Assan --- lib/std/os/linux.zig | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/lib/std/os/linux.zig b/lib/std/os/linux.zig index 46afcf7847a7..f9b140323dbf 100644 --- a/lib/std/os/linux.zig +++ b/lib/std/os/linux.zig @@ -5706,20 +5706,20 @@ pub const Epoll = packed struct(u32) { // /// Internal flag - wakeup generated by io_uring, used to detect /// recursion back into the io_uring poll handler - uring_wake: bool = true, + uring_wake: bool = false, /// Set exclusive wakeup mode for the target file descriptor - exclusive: bool = true, + exclusive: bool = false, /// Request the handling of system wakeup events so as to prevent system /// suspends from happening while those events are being processed. /// Assuming neither EPOLLET nor EPOLLONESHOT is set, system suspends will /// not be re-allowed until epoll_wait is called again after consuming the /// wakeup event(s). /// Requires CAP_BLOCK_SUSPEND - wakeup: bool = true, + wakeup: bool = false, /// Set the One Shot behaviour for the target file descriptor - oneshot: bool = true, + oneshot: bool = false, /// Set the Edge Triggered behaviour for the target file descriptor - et: bool = true, + et: bool = false, // Deprecated Named constants // EPOLL event types @@ -5741,10 +5741,15 @@ pub const Epoll = packed struct(u32) { pub const EXCLUSIVE: u32 = @bitCast(Epoll{ .exclusive = true }); pub const WAKEUP: u32 = @bitCast(Epoll{ .wakeup = true }); pub const ONESHOT: u32 = @bitCast(Epoll{ .oneshot = true }); - pub const ET: u32 = @bitCast(Epoll{ .let = true }); + pub const ET: u32 = @bitCast(Epoll{ .et = true }); /// Flags for epoll_create1 pub const CLOEXEC = 1 << @bitOffsetOf(O, "CLOEXEC"); + + // Deprecated Op Constants use EpollOp enum type + pub const CTL_ADD: u32 = @intFromEnum(EpollOp.ctl_add); + pub const CTL_DEL: u32 = @intFromEnum(EpollOp.ctl_del); + pub const CTL_MOD: u32 = @intFromEnum(EpollOp.ctl_mod); }; pub const CLOCK = clockid_t; From d8149265da503a1acf60ed3b01cd7733af7593dc Mon Sep 17 00:00:00 2001 From: Bernard Assan Date: Fri, 10 Oct 2025 12:35:28 +0000 Subject: [PATCH 14/41] Remove io_uring_sqe.zig from CMakeLists Signed-off-by: Bernard Assan --- CMakeLists.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 7090f8852768..38acb611f084 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -473,7 +473,6 @@ set(ZIG_STAGE2_SOURCES lib/std/os/linux.zig lib/std/os/linux.zig lib/std/os/linux/IoUring.zig - lib/std/os/linux/io_uring_sqe.zig lib/std/os/linux/x86_64.zig lib/std/os/linux/x86_64.zig lib/std/os/windows.zig From c17b67db7b7e5127f26eb3e6d7a772e5b0a7464e Mon Sep 17 00:00:00 2001 From: Bernard Assan Date: Fri, 10 Oct 2025 23:36:14 +0000 Subject: [PATCH 15/41] Use lower case identifiers for 
IoUring flags and enums update io_uring_enter and io_uring_register syscalls to use IoUring flags and types Cleanup comments Signed-off-by: Bernard Assan --- lib/std/os/linux.zig | 6 +- lib/std/os/linux/IoUring.zig | 1168 ++++++++++++++++++---------------- 2 files changed, 606 insertions(+), 568 deletions(-) diff --git a/lib/std/os/linux.zig b/lib/std/os/linux.zig index f9b140323dbf..127211b81f44 100644 --- a/lib/std/os/linux.zig +++ b/lib/std/os/linux.zig @@ -2514,11 +2514,11 @@ pub fn io_uring_setup(entries: u32, p: *IoUring.Params) usize { return syscall2(.io_uring_setup, entries, @intFromPtr(p)); } -pub fn io_uring_enter(fd: i32, to_submit: u32, min_complete: u32, flags: u32, sig: ?*sigset_t) usize { - return syscall6(.io_uring_enter, @as(usize, @bitCast(@as(isize, fd))), to_submit, min_complete, flags, @intFromPtr(sig), NSIG / 8); +pub fn io_uring_enter(fd: i32, to_submit: u32, min_complete: u32, flags: IoUring.uflags.Enter, sig: ?*sigset_t) usize { + return syscall6(.io_uring_enter, @as(usize, @bitCast(@as(isize, fd))), to_submit, min_complete, @intCast(@as(u32, @bitCast(flags))), @intFromPtr(sig), NSIG / 8); } -pub fn io_uring_register(fd: i32, opcode: IORING_REGISTER, arg: ?*const anyopaque, nr_args: u32) usize { +pub fn io_uring_register(fd: i32, opcode: IoUring.RegisterOp, arg: ?*const anyopaque, nr_args: u32) usize { return syscall4(.io_uring_register, @as(usize, @bitCast(@as(isize, fd))), @intFromEnum(opcode), @intFromPtr(arg), nr_args); } diff --git a/lib/std/os/linux/IoUring.zig b/lib/std/os/linux/IoUring.zig index 64053e4132c8..0dfea27fcc9e 100644 --- a/lib/std/os/linux/IoUring.zig +++ b/lib/std/os/linux/IoUring.zig @@ -17,8 +17,9 @@ flags: uflags.Setup, features: uflags.Features, /// A friendly way to setup an io_uring, with default linux.io_uring_params. -/// `entries` must be a power of two between 1 and 32768, although the kernel will make the final -/// call on how many entries the submission and completion queues will ultimately have, +/// `entries` must be a power of two between 1 and 32768, although the kernel +/// will make the final call on how many entries the submission and completion +/// queues will ultimately have, /// see https://github.com/torvalds/linux/blob/v5.8/fs/io_uring.c#L8027-L8050. /// Matches the interface of io_uring_queue_init() in liburing. pub fn init(entries: u16, flags: uflags.Setup) !IoUring { @@ -29,9 +30,11 @@ pub fn init(entries: u16, flags: uflags.Setup) !IoUring { return try .init_params(entries, ¶ms); } -/// A powerful way to setup an io_uring, if you want to tweak linux.io_uring_params such as submission -/// queue thread cpu affinity or thread idle timeout (the kernel and our default is 1 second). -/// `params` is passed by reference because the kernel needs to modify the parameters. +/// A powerful way to setup an io_uring, if you want to tweak +/// linux.io_uring_params such as submission queue thread cpu affinity or +/// thread idle timeout (the kernel and our default is 1 second). +/// `params` is passed by reference because the kernel needs to modify the +/// parameters. /// Matches the interface of io_uring_queue_init_params() in liburing. 
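A minimal sketch of driving init_params with the typed setup flags from this series; it assumes the io_uring_params fields keep their kernel names (sq_thread_cpu, sq_thread_idle) and is illustrative rather than final:

    var params = std.mem.zeroInit(IoUring.Params, .{
        // sq_aff is only meaningful together with sqpoll; init_params asserts this pairing.
        .flags = .{ .sqpoll = true, .sq_aff = true },
        .sq_thread_cpu = 1,
        .sq_thread_idle = 1000,
    });
    var ring = try IoUring.init_params(128, &params);
    defer ring.deinit();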
pub fn init_params(entries: u16, p: *Params) !IoUring { if (entries == 0) return error.EntriesZero; @@ -42,27 +45,29 @@ pub fn init_params(entries: u16, p: *Params) !IoUring { assert(p.resv[1] == 0); assert(p.resv[2] == 0); - assert(p.cq_entries == 0 or p.flags.CQSIZE); - assert(p.wq_fd == 0 or p.flags.ATTACH_WQ); + assert(p.cq_entries == 0 or p.flags.cqsize); + assert(p.wq_fd == 0 or p.flags.attach_wq); // flags compatibility - if (p.flags.SQPOLL) assert(!(p.flags.COOP_TASKRUN or p.flags.TASKRUN_FLAG or p.flags.DEFER_TASKRUN)); - if (p.flags.SQ_AFF) assert(p.flags.SQPOLL); - if (p.flags.DEFER_TASKRUN) assert(p.flags.SINGLE_ISSUER); + if (p.flags.sqpoll) assert(!(p.flags.coop_taskrun or p.flags.taskrun_flag or p.flags.defer_taskrun)); + if (p.flags.sq_aff) assert(p.flags.sqpoll); + if (p.flags.defer_taskrun) assert(p.flags.single_issuer); const res = linux.io_uring_setup(entries, p); switch (linux.errno(res)) { .SUCCESS => {}, .FAULT => return error.ParamsOutsideAccessibleAddressSpace, - // The resv array contains non-zero data, p.flags contains an unsupported flag, - // entries out of bounds, IORING_SETUP_SQ_AFF was specified without IORING_SETUP_SQPOLL, - // or IORING_SETUP_CQSIZE was specified but linux.io_uring_params.cq_entries was invalid: + // The resv array contains non-zero data, p.flags contains an + // unsupported flag, entries out of bounds, IORING_SETUP_SQ_AFF was + // specified without IORING_SETUP_SQPOLL, or IORING_SETUP_CQSIZE was + // specified but linux.io_uring_params.cq_entries was invalid: .INVAL => return error.ArgumentsInvalid, .MFILE => return error.ProcessFdQuotaExceeded, .NFILE => return error.SystemFdQuotaExceeded, .NOMEM => return error.SystemResources, - // IORING_SETUP_SQPOLL was specified but effective user ID lacks sufficient privileges, - // or a container seccomp policy prohibits io_uring syscalls: + // IORING_SETUP_SQPOLL was specified but effective user ID lacks + // sufficient privileges, or a container seccomp policy prohibits + // io_uring syscalls: .PERM => return error.PermissionDenied, .NOSYS => return error.SystemOutdated, else => |errno| return posix.unexpectedErrno(errno), @@ -71,16 +76,19 @@ pub fn init_params(entries: u16, p: *Params) !IoUring { assert(fd >= 0); errdefer posix.close(fd); - // Kernel versions 5.4 and up use only one mmap() for the submission and completion queues. - // This is not an optional feature for us... if the kernel does it, we have to do it. - // The thinking on this by the kernel developers was that both the submission and the - // completion queue rings have sizes just over a power of two, but the submission queue ring - // is significantly smaller with u32 slots. By bundling both in a single mmap, the kernel - // gets the submission queue ring for free. + // Kernel versions 5.4 and up use only one mmap() for the submission and + // completion queues. + // This is not an optional feature for us... if the kernel does it, we have + // to do it. The thinking on this by the kernel developers was that both + // the submission and the completion queue rings have sizes just over a + // power of two, but the submission queue ring is significantly smaller + // with u32 slots. By bundling both in a single mmap, the kernel gets the + // submission queue ring for free. // See https://patchwork.kernel.org/patch/11115257 for the kernel patch. - // We do not support the double mmap() done before 5.4, because we want to keep the - // init/deinit mmap paths simple and because io_uring has had many bug fixes even since 5.4. 
- if (!p.features.SINGLE_MMAP) { + // We do not support the double mmap() done before 5.4, because we want to + // keep the init/deinit mmap paths simple and because io_uring has had many + // bug fixes even since 5.4. + if (!p.features.single_mmap) { return error.SystemOutdated; } @@ -89,8 +97,10 @@ pub fn init_params(entries: u16, p: *Params) !IoUring { assert(p.cq_entries != 0); assert(p.cq_entries >= p.sq_entries); - // From here on, we only need to read from params, so pass `p` by value as immutable. - // The completion queue shares the mmap with the submission queue, so pass `sq` there too. + // From here on, we only need to read from params, so pass `p` by value as + // immutable. + // The completion queue shares the mmap with the submission queue, so pass + // `sq` there too. var sq: Sq = try .init(fd, p.*); errdefer sq.deinit(); var cq: Cq = try .init(fd, p.*, sq); @@ -100,7 +110,8 @@ pub fn init_params(entries: u16, p: *Params) !IoUring { assert(sq.head.* == 0); assert(sq.tail.* == 0); assert(sq.mask == p.sq_entries - 1); - // Allow flags.* to be non-zero, since the kernel may set IORING_SQ_NEED_WAKEUP at any time. + // Allow flags.* to be non-zero, since the kernel may set + // IORING_SQ_NEED_WAKEUP at any time. assert(sq.dropped.* == 0); assert(sq.array.len == p.sq_entries); assert(sq.sqes.len == p.sq_entries); @@ -131,17 +142,20 @@ pub fn deinit(self: *IoUring) void { self.fd = -1; } -/// Returns a pointer to a vacant SQE, or an error if the submission queue is full. -/// We follow the implementation (and atomics) of liburing's `io_uring_get_sqe()` exactly. +/// Returns a pointer to a vacant SQE, or an error if the submission queue is +/// full. We follow the implementation (and atomics) of liburing's +/// `io_uring_get_sqe()` exactly. /// However, instead of a null we return an error to force safe handling. -/// Any situation where the submission queue is full tends more towards a control flow error, -/// and the null return in liburing is more a C idiom than anything else, for lack of a better -/// alternative. In Zig, we have first-class error handling... so let's use it. +/// Any situation where the submission queue is full tends more towards a +/// control flow error, and the null return in liburing is more a C idiom than +/// anything else, for lack of a better alternative. In Zig, we have +/// first-class error handling... so let's use it. /// Matches the implementation of io_uring_get_sqe() in liburing. pub fn get_sqe(self: *IoUring) !*Sqe { const head = @atomicLoad(u32, self.sq.head, .acquire); - // Remember that these head and tail offsets wrap around every four billion operations. - // We must therefore use wrapping addition and subtraction to avoid a runtime crash. + // Remember that these head and tail offsets wrap around every four billion + // operations. We must therefore use wrapping addition and subtraction to + // avoid a runtime crash. const next = self.sq.sqe_tail +% 1; if (next -% head > self.sq.sqes.len) return error.SubmissionQueueFull; const sqe = &self.sq.sqes[self.sq.sqe_tail & self.sq.mask]; @@ -149,12 +163,14 @@ pub fn get_sqe(self: *IoUring) !*Sqe { return sqe; } -/// Submits the SQEs acquired via get_sqe() to the kernel. You can call this once after you have -/// called get_sqe() multiple times to setup multiple I/O requests. -/// Returns the number of SQEs submitted, if not used alongside IORING_SETUP_SQPOLL. 
-/// If the io_uring instance is uses IORING_SETUP_SQPOLL, the value returned on success is not -/// guaranteed to match the amount of actually submitted sqes during this call. A value higher -/// or lower, including 0, may be returned. +/// Submits the SQEs acquired via get_sqe() to the kernel. You can call this +/// once after you have called get_sqe() multiple times to setup multiple I/O +/// requests. +/// Returns the number of SQEs submitted, if not used alongside +/// IORING_SETUP_SQPOLL. +/// If the io_uring instance uses IORING_SETUP_SQPOLL, the value returned on +/// success is not guaranteed to match the amount of actually submitted sqes +/// during this call. A value higher or lower, including 0, may be returned. /// Matches the implementation of io_uring_submit() in liburing. pub fn submit(self: *IoUring) !u32 { return self.submit_and_wait(0); @@ -167,8 +183,8 @@ pub fn submit_and_wait(self: *IoUring, wait_nr: u32) !u32 { const submitted = self.flush_sq(); var flags: uflags.Enter = .{}; if (self.sq_ring_needs_enter(&flags) or wait_nr > 0) { - if (wait_nr > 0 or self.flags.IOPOLL) { - flags.GETEVENTS = true; + if (wait_nr > 0 or self.flags.iopoll) { + flags.getevents = true; } return try self.enter(submitted, wait_nr, flags); } @@ -182,29 +198,36 @@ pub fn enter(self: *IoUring, to_submit: u32, min_complete: u32, flags: uflags.En const res = linux.io_uring_enter(self.fd, to_submit, min_complete, flags, null); switch (linux.errno(res)) { .SUCCESS => {}, - // The kernel was unable to allocate memory or ran out of resources for the request. - // The application should wait for some completions and try again: + // The kernel was unable to allocate memory or ran out of resources for + // the request. The application should wait for some completions and + // try again: .AGAIN => return error.SystemResources, - // The SQE `fd` is invalid, or IOSQE_FIXED_FILE was set but no files were registered: + // The SQE `fd` is invalid, or IOSQE_FIXED_FILE was set but no files + // were registered: .BADF => return error.FileDescriptorInvalid, // The file descriptor is valid, but the ring is not in the right state. // See io_uring_register(2) for how to enable the ring. .BADFD => return error.FileDescriptorInBadState, - // The application attempted to overcommit the number of requests it can have pending. - // The application should wait for some completions and try again: + // The application attempted to overcommit the number of requests it + // can have pending. 
The application should wait for some completions + // and try again: .BUSY => return error.CompletionQueueOvercommitted, - // The SQE is invalid, or valid but the ring was setup with IORING_SETUP_IOPOLL: + // The SQE is invalid, or valid but the ring was setup with + // IORING_SETUP_IOPOLL: .INVAL => return error.SubmissionQueueEntryInvalid, - // The buffer is outside the process' accessible address space, or IORING_OP_READ_FIXED - // or IORING_OP_WRITE_FIXED was specified but no buffers were registered, or the range - // described by `addr` and `len` is not within the buffer registered at `buf_index`: + // The buffer is outside the process' accessible address space, or + // IORING_OP_READ_FIXED or IORING_OP_WRITE_FIXED was specified but no + // buffers were registered, or the range described by `addr` and `len` + // is not within the buffer registered at `buf_index`: .FAULT => return error.BufferInvalid, .NXIO => return error.RingShuttingDown, - // The kernel believes our `self.fd` does not refer to an io_uring instance, - // or the opcode is valid but not supported by this kernel (more likely): + // The kernel believes our `self.fd` does not refer to an io_uring + // instance, or the opcode is valid but not supported by this kernel + // (more likely): .OPNOTSUPP => return error.OpcodeNotSupported, - // The operation was interrupted by a delivery of a signal before it could complete. - // This can happen while waiting for events with IORING_ENTER_GETEVENTS: + // The operation was interrupted by a delivery of a signal before it + // could complete. This can happen while waiting for events with + // IORING_ENTER_GETEVENTS: .INTR => return error.SignalInterrupt, else => |errno| return posix.unexpectedErrno(errno), } @@ -212,9 +235,10 @@ pub fn enter(self: *IoUring, to_submit: u32, min_complete: u32, flags: uflags.En } /// Sync internal state with kernel ring state on the SQ side. -/// Returns the number of all pending events in the SQ ring, for the shared ring. -/// This return value includes previously flushed SQEs, as per liburing. -/// The rationale is to suggest that an io_uring_enter() call is needed rather than not. +/// Returns the number of all pending events in the SQ ring, for the shared +/// ring. This return value includes previously flushed SQEs, as per liburing. +/// The rationale is to suggest that an io_uring_enter() call is needed rather +/// than not. /// Matches the implementation of __io_uring_flush_sq() in liburing. pub fn flush_sq(self: *IoUring) u32 { if (self.sq.sqe_head != self.sq.sqe_tail) { @@ -227,29 +251,30 @@ pub fn flush_sq(self: *IoUring) u32 { tail +%= 1; self.sq.sqe_head +%= 1; } - // Ensure that the kernel can actually see the SQE updates when it sees the tail update. + // Ensure that the kernel can actually see the SQE updates when it sees + // the tail update. @atomicStore(u32, self.sq.tail, tail, .release); } return self.sq_ready(); } /// Returns true if we are not using an SQ thread (thus nobody submits but us), -/// or if IORING_SQ_NEED_WAKEUP is set and the SQ thread must be explicitly awakened. -/// For the latter case, we set the SQ thread wakeup flag. +/// or if IORING_SQ_NEED_WAKEUP is set and the SQ thread must be explicitly +/// awakened. For the latter case, we set the SQ thread wakeup flag. /// Matches the implementation of sq_ring_needs_enter() in liburing. 
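The submit/complete flow implied by the functions above, as a brief sketch (error dispatch is up to the application):

    _ = try ring.submit_and_wait(1); // flush the SQ, enter the kernel, wait for one CQE
    const cqe = try ring.copy_cqe();
    // Inspect cqe.err() and cqe.res as the tests in this series do.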
pub fn sq_ring_needs_enter(self: *IoUring, flags: *uflags.Enter) bool { assert(flags.*.empty()); - if (!self.flags.SQPOLL) return true; - if (@atomicLoad(Sq.Flags, self.sq.flags, .unordered).NEED_WAKEUP) { - flags.*.SQ_WAKEUP = true; + if (!self.flags.sqpoll) return true; + if (@atomicLoad(Sq.Flags, self.sq.flags, .unordered).need_wakeup) { + flags.*.sq_wakeup = true; return true; } return false; } -/// Returns the number of flushed and unflushed SQEs pending in the submission queue. -/// In other words, this is the number of SQEs in the submission queue, i.e. its length. -/// These are SQEs that the kernel is yet to consume. +/// Returns the number of flushed and unflushed SQEs pending in the submission +/// queue. In other words, this is the number of SQEs in the submission queue, +/// i.e. its length. These are SQEs that the kernel is yet to consume. /// Matches the implementation of io_uring_sq_ready in liburing. pub fn sq_ready(self: *IoUring) u32 { // Always use the shared ring state (i.e. head and not sqe_head) to avoid going out of sync, @@ -264,21 +289,26 @@ pub fn cq_ready(self: *IoUring) u32 { return @atomicLoad(u32, self.cq.tail, .acquire) -% self.cq.head.*; } -/// Copies as many CQEs as are ready, and that can fit into the destination `cqes` slice. -/// If none are available, enters into the kernel to wait for at most `wait_nr` CQEs. +/// Copies as many CQEs as are ready, and that can fit into the destination +/// `cqes` slice. If none are available, enters into the kernel to wait for at +/// most `wait_nr` CQEs. /// Returns the number of CQEs copied, advancing the CQ ring. -/// Provides all the wait/peek methods found in liburing, but with batching and a single method. -/// The rationale for copying CQEs rather than copying pointers is that pointers are 8 bytes -/// whereas CQEs are not much more at only 16 bytes, and this provides a safer faster interface. -/// Safer, because you no longer need to call cqe_seen(), avoiding idempotency bugs. -/// Faster, because we can now amortize the atomic store release to `cq.head` across the batch. +/// Provides all the wait/peek methods found in liburing, but with batching and +/// a single method. +/// The rationale for copying CQEs rather than copying pointers is that +/// pointers are 8 bytes whereas CQEs are not much more at only 16 bytes, and +/// this provides a safer faster interface. +/// Safer, because you no longer need to call cqe_seen(), avoiding idempotency +/// bugs. Faster, because we can now amortize the atomic store release to +/// `cq.head` across the batch. /// See https://github.com/axboe/liburing/issues/103#issuecomment-686665007. -/// Matches the implementation of io_uring_peek_batch_cqe() in liburing, but supports waiting. +/// Matches the implementation of io_uring_peek_batch_cqe() in liburing, but +/// supports waiting. pub fn copy_cqes(self: *IoUring, cqes: []Cqe, wait_nr: u32) !u32 { const count = self.copy_cqes_ready(cqes); if (count > 0) return count; if (self.cq_ring_needs_flush() or wait_nr > 0) { - _ = try self.enter(0, wait_nr, .{ .GETEVENTS = true }); + _ = try self.enter(0, wait_nr, .{ .getevents = true }); return self.copy_cqes_ready(cqes); } return 0; @@ -303,8 +333,10 @@ fn copy_cqes_ready(self: *IoUring, cqes: []Cqe) u32 { return count; } -/// Returns a copy of an I/O completion, waiting for it if necessary, and advancing the CQ ring. -/// A convenience method for `copy_cqes()` for when you don't need to batch or peek. 
+/// Returns a copy of an I/O completion, waiting for it if necessary, and +/// advancing the CQ ring. +/// A convenience method for `copy_cqes()` for when you don't need to batch or +/// peek. pub fn copy_cqe(ring: *IoUring) !Cqe { var cqes: [1]Cqe = undefined; while (true) { @@ -315,12 +347,13 @@ pub fn copy_cqe(ring: *IoUring) !Cqe { /// Matches the implementation of cq_ring_needs_flush() in liburing. pub fn cq_ring_needs_flush(self: *IoUring) bool { - return @atomicLoad(Sq.Flags, self.sq.flags, .unordered).CQ_OVERFLOW; + return @atomicLoad(Sq.Flags, self.sq.flags, .unordered).cq_overflow; } /// For advanced use cases only that implement custom completion queue methods. -/// If you use copy_cqes() or copy_cqe() you must not call cqe_seen() or cq_advance(). -/// Must be called exactly once after a zero-copy CQE has been processed by your application. +/// If you use copy_cqes() or copy_cqe() you must not call cqe_seen() or +/// cq_advance(). Must be called exactly once after a zero-copy CQE has been +/// processed by your application. /// Not idempotent, calling more than once will result in other CQEs being lost. /// Matches the implementation of cqe_seen() in liburing. pub fn cqe_seen(self: *IoUring, cqe: *Cqe) void { @@ -338,14 +371,18 @@ pub fn cq_advance(self: *IoUring, count: u32) void { } /// Queues (but does not submit) an SQE to perform an `fsync(2)`. -/// Returns a pointer to the SQE so that you can further modify the SQE for advanced use cases. -/// For example, for `fdatasync()` you can set `IORING_FSYNC_DATASYNC` in the SQE's `rw_flags`. -/// N.B. While SQEs are initiated in the order in which they appear in the submission queue, -/// operations execute in parallel and completions are unordered. Therefore, an application that -/// submits a write followed by an fsync in the submission queue cannot expect the fsync to -/// apply to the write, since the fsync may complete before the write is issued to the disk. -/// You should preferably use `link_with_next_sqe()` on a write's SQE to link it with an fsync, -/// or else insert a full write barrier using `drain_previous_sqes()` when queueing an fsync. +/// Returns a pointer to the SQE so that you can further modify the SQE for +/// advanced use cases. +/// For example, for `fdatasync()` you can set `IORING_FSYNC_DATASYNC` in the +/// SQE's `rw_flags`. +/// N.B. While SQEs are initiated in the order in which they appear in the +/// submission queue, operations execute in parallel and completions are +/// unordered. Therefore, an application that submits a write followed by an +/// fsync in the submission queue cannot expect the fsync to apply to the write, +/// since the fsync may complete before the write is issued to the disk. +/// You should preferably use `link_with_next_sqe()` on a write's SQE to link +/// it with an fsync, or else insert a full write barrier using +/// `drain_previous_sqes()` when queueing an fsync. 
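A sketch of the linking advice above. The tests in this series spell the helper `link_next()` while this doc comment says `link_with_next_sqe()`, so treat the name as unsettled; `fd` and `data` are assumed to exist, and `write` is assumed to take (user_data, fd, buffer, offset) like `read`:

    const write_sqe = try ring.write(0x11, fd, data, 0);
    write_sqe.link_next(); // the fsync below will not start until the write completes
    _ = try ring.fsync(0x22, fd, .{});
    _ = try ring.submit_and_wait(2);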
pub fn fsync(self: *IoUring, user_data: u64, fd: posix.fd_t, flags: uflags.Fsync) !*Sqe { const sqe = try self.get_sqe(); sqe.prep_fsync(fd, flags); @@ -383,8 +420,8 @@ pub fn read( .buffer => |slice| sqe.prep_read(fd, slice, offset), .iovecs => |vecs| sqe.prep_readv(fd, vecs, offset), .buffer_selection => |selection| { - sqe.prep_rw(.READ, fd, 0, selection.len, offset); - sqe.flags.BUFFER_SELECT = true; + sqe.prep_rw(.read, fd, 0, selection.len, offset); + sqe.flags.buffer_select = true; sqe.buf_index = selection.group_id; }, } @@ -608,9 +645,9 @@ pub fn recv( switch (buffer) { .buffer => |slice| sqe.prep_recv(fd, slice, flags), .buffer_selection => |selection| { - sqe.prep_rw(.RECV, fd, 0, selection.len, 0); + sqe.prep_rw(.recv, fd, 0, selection.len, 0); sqe.rw_flags = @bitCast(flags); - sqe.flags.BUFFER_SELECT = true; + sqe.flags.buffer_select = true; sqe.buf_index = selection.group_id; }, } @@ -1122,7 +1159,7 @@ pub fn register_files(self: *IoUring, fds: []const linux.fd_t) !void { assert(self.fd >= 0); const res = linux.io_uring_register( self.fd, - .REGISTER_FILES, + .register_files, @as(*const anyopaque, @ptrCast(fds.ptr)), @as(u32, @intCast(fds.len)), ); @@ -1153,7 +1190,7 @@ pub fn register_files_update(self: *IoUring, offset: u32, fds: []const linux.fd_ const res = linux.io_uring_register( self.fd, - .REGISTER_FILES_UPDATE, + .register_files_update, @as(*const anyopaque, @ptrCast(&update)), @as(u32, @intCast(fds.len)), ); @@ -1166,7 +1203,7 @@ pub fn register_files_sparse(self: *IoUring, nr_files: u32) !void { const reg: RsrcRegister = .{ .nr = nr_files, - .flags = constants.RSRC_REGISTER_SPARSE, + .flags = .{ .rsrc_register_sparse = true }, .resv2 = 0, .data = 0, .tags = 0, @@ -1174,7 +1211,7 @@ pub fn register_files_sparse(self: *IoUring, nr_files: u32) !void { const res = linux.io_uring_register( self.fd, - .REGISTER_FILES2, + .register_files2, @ptrCast(®), @as(u32, @sizeOf(RsrcRegister)), ); @@ -1195,7 +1232,7 @@ pub fn register_file_alloc_range(self: *IoUring, offset: u32, len: u32) !void { const res = linux.io_uring_register( self.fd, - .REGISTER_FILE_ALLOC_RANGE, + .register_file_alloc_range, @ptrCast(&range), @as(u32, @sizeOf(FileIndexRange)), ); @@ -1210,7 +1247,7 @@ pub fn register_eventfd(self: *IoUring, fd: linux.fd_t) !void { assert(self.fd >= 0); const res = linux.io_uring_register( self.fd, - .REGISTER_EVENTFD, + .register_eventfd, @as(*const anyopaque, @ptrCast(&fd)), 1, ); @@ -1225,7 +1262,7 @@ pub fn register_eventfd_async(self: *IoUring, fd: linux.fd_t) !void { assert(self.fd >= 0); const res = linux.io_uring_register( self.fd, - .REGISTER_EVENTFD_ASYNC, + .register_eventfd_async, @as(*const anyopaque, @ptrCast(&fd)), 1, ); @@ -1237,7 +1274,7 @@ pub fn unregister_eventfd(self: *IoUring) !void { assert(self.fd >= 0); const res = linux.io_uring_register( self.fd, - .UNREGISTER_EVENTFD, + .unregister_eventfd, null, 0, ); @@ -1246,13 +1283,13 @@ pub fn unregister_eventfd(self: *IoUring) !void { pub fn register_napi(self: *IoUring, napi: *Napi) !void { assert(self.fd >= 0); - const res = linux.io_uring_register(self.fd, .REGISTER_NAPI, napi, 1); + const res = linux.io_uring_register(self.fd, .register_napi, napi, 1); try handle_registration_result(res); } pub fn unregister_napi(self: *IoUring, napi: *Napi) !void { assert(self.fd >= 0); - const res = linux.io_uring_register(self.fd, .UNREGISTER_NAPI, napi, 1); + const res = linux.io_uring_register(self.fd, .unregister_napi, napi, 1); try handle_registration_result(res); } @@ -1261,7 +1298,7 @@ pub fn 
register_buffers(self: *IoUring, buffers: []const posix.iovec) !void { assert(self.fd >= 0); const res = linux.io_uring_register( self.fd, - .REGISTER_BUFFERS, + .register_buffers, buffers.ptr, @as(u32, @intCast(buffers.len)), ); @@ -1271,7 +1308,7 @@ pub fn register_buffers(self: *IoUring, buffers: []const posix.iovec) !void { /// Unregister the registered buffers. pub fn unregister_buffers(self: *IoUring) !void { assert(self.fd >= 0); - const res = linux.io_uring_register(self.fd, .UNREGISTER_BUFFERS, null, 0); + const res = linux.io_uring_register(self.fd, .unregister_buffers, null, 0); switch (linux.errno(res)) { .SUCCESS => {}, .NXIO => return error.BuffersNotRegistered, @@ -1284,7 +1321,7 @@ pub fn unregister_buffers(self: *IoUring) !void { /// list of supported operations. pub fn get_probe(self: *IoUring) !Probe { var probe = mem.zeroInit(Probe, .{}); - const res = linux.io_uring_register(self.fd, .REGISTER_PROBE, &probe, probe.ops.len); + const res = linux.io_uring_register(self.fd, .register_probe, &probe, probe.ops.len); try handle_register_buf_ring_result(res); return probe; } @@ -1314,7 +1351,7 @@ fn handle_registration_result(res: usize) !void { /// Unregisters all registered file descriptors previously associated with the ring. pub fn unregister_files(self: *IoUring) !void { assert(self.fd >= 0); - const res = linux.io_uring_register(self.fd, .UNREGISTER_FILES, null, 0); + const res = linux.io_uring_register(self.fd, .unregister_files, null, 0); switch (linux.errno(res)) { .SUCCESS => {}, .NXIO => return error.FilesNotRegistered, @@ -1443,7 +1480,7 @@ pub fn setsockopt( ) !*Sqe { return try self.cmd_sock( user_data, - .SETSOCKOPT, + .setsockopt, fd, level, optname, @@ -1465,7 +1502,7 @@ pub fn getsockopt( ) !*Sqe { return try self.cmd_sock( user_data, - .GETSOCKOPT, + .getsockopt, fd, level, optname, @@ -1518,12 +1555,12 @@ fn register_buf_ring( .bgid = group_id, .flags = flags, }); - var res = linux.io_uring_register(fd, .REGISTER_PBUF_RING, @as(*const anyopaque, @ptrCast(®)), 1); - if (linux.errno(res) == .INVAL and reg.flags.IOU_PBUF_RING_INC) { + var res = linux.io_uring_register(fd, .register_pbuf_ring, @ptrCast(®), 1); + if (linux.errno(res) == .INVAL and reg.flags.iou_pbuf_ring_inc) { // Retry without incremental buffer consumption. // It is available since kernel 6.12. returns INVAL on older. - reg.flags.IOU_PBUF_RING_INC = false; - res = linux.io_uring_register(fd, .REGISTER_PBUF_RING, @as(*const anyopaque, @ptrCast(®)), 1); + reg.flags.iou_pbuf_ring_inc = false; + res = linux.io_uring_register(fd, .register_pbuf_ring, @as(*const anyopaque, @ptrCast(®)), 1); } try handle_register_buf_ring_result(res); } @@ -1534,7 +1571,7 @@ fn unregister_buf_ring(fd: posix.fd_t, group_id: u16) !void { }); const res = linux.io_uring_register( fd, - .UNREGISTER_PBUF_RING, + .unregister_pbuf_ring, @as(*const anyopaque, @ptrCast(®)), 1, ); @@ -1611,37 +1648,36 @@ pub const Cqe = extern struct { /// cqe.flags pub const Flags = packed struct(u32) { /// IORING_CQE_F_BUFFER If set, the upper 16 bits are the buffer ID - F_BUFFER: bool = false, + f_buffer: bool = false, /// IORING_CQE_F_MORE If set, parent SQE will generate more CQE entries - F_MORE: bool = false, - /// IORING_CQE_F_SOCK_NONEMPTY If set, more data to read after socket recv - F_SOCK_NONEMPTY: bool = false, - /// IORING_CQE_F_NOTIF Set for notification CQEs. Can be used to distinct - /// them from sends. 
-        F_NOTIF: bool = false,
-        /// IORING_CQE_F_BUF_MORE If set, the buffer ID set in the completion will get
-        /// more completions. In other words, the buffer is being
-        /// partially consumed, and will be used by the kernel for
-        /// more completions. This is only set for buffers used via
-        /// the incremental buffer consumption, as provided by
-        /// a ring buffer setup with IOU_PBUF_RING_INC. For any
-        /// other provided buffer type, all completions with a
-        /// buffer passed back is automatically returned to the
-        /// application.
-        F_BUF_MORE: bool = false,
+        f_more: bool = false,
+        /// IORING_CQE_F_SOCK_NONEMPTY If set, more data to read after socket
+        /// recv
+        f_sock_nonempty: bool = false,
+        /// IORING_CQE_F_NOTIF Set for notification CQEs. Can be used to
+        /// distinguish them from sends.
+        f_notif: bool = false,
+        /// IORING_CQE_F_BUF_MORE If set, the buffer ID set in the completion
+        /// will get more completions. In other words, the buffer is being
+        /// partially consumed, and will be used by the kernel for more
+        /// completions. This is only set for buffers used via the incremental
+        /// buffer consumption, as provided by a ring buffer setup with
+        /// IOU_PBUF_RING_INC. For any other provided buffer type, all
+        /// completions with a buffer passed back are automatically returned to
+        /// the application.
+        f_buf_more: bool = false,
        // COMMIT: new flags
-        /// IORING_CQE_F_SKIP If set, then the application/liburing must ignore this
-        /// CQE. It's only purpose is to fill a gap in the ring,
-        /// if a large CQE is attempted posted when the ring has
-        /// just a single small CQE worth of space left before
-        /// wrapping.
-        F_SKIP: bool = false,
-        _unused: u9 = 0,
-        /// IORING_CQE_F_32 If set, this is a 32b/big-cqe posting. Use with rings
-        /// setup in a mixed CQE mode, where both 16b and 32b
-        /// CQEs may be posted to the CQ ring.
-        F_32: bool = false,
-        _unused_1: u16 = 0,
+        /// IORING_CQE_F_SKIP If set, then the application/liburing must ignore
+        /// this CQE. Its only purpose is to fill a gap in the ring, if a
+        /// large CQE is posted when the ring has just a single small
+        /// CQE worth of space left before wrapping.
+        f_skip: bool = false,
+        _7: u9 = 0,
+        /// IORING_CQE_F_32 If set, this is a 32b/big-cqe posting. Use with
+        /// rings setup in a mixed CQE mode, where both 16b and 32b CQEs may be
+        /// posted to the CQ ring.
+        f_32: bool = false,
+        _17: u16 = 0,
    };

    pub fn err(self: Cqe) linux.E {
@@ -1655,7 +1691,7 @@ pub const Cqe = extern struct {
    // will have IORING_CQE_F_BUFFER set and the selected buffer ID will be indicated by
    // the upper 16-bits of the flags field.
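As an illustrative aside, a minimal sketch of draining such a completion with the `buffer_id` helper defined just below. It assumes an initialized `ring` and a previously submitted provided-buffer receive; both are placeholders:

    const cqe = try ring.copy_cqe();
    if (cqe.flags.f_buffer) {
        // The selected buffer ID lives in the upper 16 bits of cqe.flags.
        const bid = try cqe.buffer_id();
        _ = bid; // index of the buffer the kernel picked from the group
    }
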
pub fn buffer_id(self: Cqe) !u16 { - if (!self.flags.F_BUFFER) { + if (!self.flags.f_buffer) { return error.NoBufferSelected; } return @intCast(@as(u32, @bitCast(self.flags)) >> constants.CQE_BUFFER_SHIFT); @@ -1706,19 +1742,19 @@ pub const Sqe = extern struct { /// sqe.flags pub const IoSqe = packed struct(u8) { /// use fixed fileset - FIXED_FILE: bool = false, + fixed_file: bool = false, /// issue after inflight IO - IO_DRAIN: bool = false, + io_drain: bool = false, /// links next sqe - IO_LINK: bool = false, + io_link: bool = false, /// like LINK, but stronger - IO_HARDLINK: bool = false, + io_hardlink: bool = false, /// always go async - ASYNC: bool = false, + async: bool = false, /// select buffer from sqe->buf_group - BUFFER_SELECT: bool = false, + buffer_select: bool = false, /// don't post CQE if request succeeded - CQE_SKIP_SUCCESS: bool = false, + cqe_skip_success: bool = false, _: u1 = 0, }; @@ -1728,47 +1764,47 @@ pub const Sqe = extern struct { /// If set, instead of first attempting to send or receive and arm poll /// if that yields an -EAGAIN result, arm poll upfront and skip the /// initial transfer attempt. - RECVSEND_POLL_FIRST: bool = false, + recvsend_poll_first: bool = false, /// IORING_RECV_MULTISHOT /// Multishot recv. Sets IORING_CQE_F_MORE if the handler will continue /// to report CQEs on behalf of the same SQE. - RECV_MULTISHOT: bool = false, + recv_multishot: bool = false, /// IORING_RECVSEND_FIXED_BUF /// Use registered buffers, the index is stored in the buf_index field. - RECVSEND_FIXED_BUF: bool = false, + recvsend_fixed_buf: bool = false, /// IORING_SEND_ZC_REPORT_USAGE /// If set, SEND[MSG]_ZC should report the zerocopy usage in cqe.res /// for the IORING_CQE_F_NOTIF cqe. 0 is reported if zerocopy was /// actually possible. IORING_NOTIF_USAGE_ZC_COPIED if data was copied /// (at least partially). - SEND_ZC_REPORT_USAGE: bool = false, + send_zc_report_usage: bool = false, /// IORING_RECVSEND_BUNDLE /// Used with IOSQE_BUFFER_SELECT. If set, send or recv will grab as /// many buffers from the buffer group ID given and send them all. /// The completion result will be the number of buffers send, with the /// starting buffer ID in cqe.flags as per usual for provided buffer /// usage. The buffers will be contiguous from the starting buffer ID. - RECVSEND_BUNDLE: bool = false, + recvsend_bundle: bool = false, // COMMIT: new flags /// IORING_SEND_VECTORIZED /// If set, SEND[_ZC] will take a pointer to a io_vec to allow /// vectorized send operations. 
- SEND_VECTORIZED: bool = false, + send_vectorized: bool = false, _: u10 = 0, }; /// accept flags stored in sqe.ioprio pub const Accept = packed struct(u16) { - MULTISHOT: bool = false, + multishot: bool = false, // COMMIT: new Flags - DONTWAIT: bool = false, - POLL_FIRST: bool = false, - _unused: u13 = 0, + dontwait: bool = false, + poll_first: bool = false, + _: u13 = 0, }; pub fn prep_nop(sqe: *Sqe) void { sqe.* = .{ - .opcode = .NOP, + .opcode = .nop, .flags = .{}, .ioprio = .init_empty(), .fd = 0, @@ -1787,7 +1823,7 @@ pub const Sqe = extern struct { pub fn prep_fsync(sqe: *Sqe, fd: linux.fd_t, flags: uflags.Fsync) void { sqe.* = .{ - .opcode = .FSYNC, + .opcode = .fsync, .flags = .{}, .ioprio = .init_empty(), .fd = fd, @@ -1831,15 +1867,15 @@ pub const Sqe = extern struct { } pub fn prep_read(sqe: *Sqe, fd: linux.fd_t, buffer: []u8, offset: u64) void { - sqe.prep_rw(.READ, fd, @intFromPtr(buffer.ptr), buffer.len, offset); + sqe.prep_rw(.read, fd, @intFromPtr(buffer.ptr), buffer.len, offset); } pub fn prep_write(sqe: *Sqe, fd: linux.fd_t, buffer: []const u8, offset: u64) void { - sqe.prep_rw(.WRITE, fd, @intFromPtr(buffer.ptr), buffer.len, offset); + sqe.prep_rw(.write, fd, @intFromPtr(buffer.ptr), buffer.len, offset); } pub fn prep_splice(sqe: *Sqe, fd_in: linux.fd_t, off_in: u64, fd_out: linux.fd_t, off_out: u64, len: usize) void { - sqe.prep_rw(.SPLICE, fd_out, undefined, len, off_out); + sqe.prep_rw(.splice, fd_out, undefined, len, off_out); sqe.addr = off_in; sqe.splice_fd_in = fd_in; } @@ -1850,7 +1886,7 @@ pub const Sqe = extern struct { iovecs: []const std.posix.iovec, offset: u64, ) void { - sqe.prep_rw(.READV, fd, @intFromPtr(iovecs.ptr), iovecs.len, offset); + sqe.prep_rw(.readv, fd, @intFromPtr(iovecs.ptr), iovecs.len, offset); } pub fn prep_writev( @@ -1859,16 +1895,16 @@ pub const Sqe = extern struct { iovecs: []const std.posix.iovec_const, offset: u64, ) void { - sqe.prep_rw(.WRITEV, fd, @intFromPtr(iovecs.ptr), iovecs.len, offset); + sqe.prep_rw(.writev, fd, @intFromPtr(iovecs.ptr), iovecs.len, offset); } pub fn prep_read_fixed(sqe: *Sqe, fd: linux.fd_t, buffer: *std.posix.iovec, offset: u64, buffer_index: u16) void { - sqe.prep_rw(.READ_FIXED, fd, @intFromPtr(buffer.base), buffer.len, offset); + sqe.prep_rw(.read_fixed, fd, @intFromPtr(buffer.base), buffer.len, offset); sqe.buf_index = buffer_index; } pub fn prep_write_fixed(sqe: *Sqe, fd: linux.fd_t, buffer: *std.posix.iovec, offset: u64, buffer_index: u16) void { - sqe.prep_rw(.WRITE_FIXED, fd, @intFromPtr(buffer.base), buffer.len, offset); + sqe.prep_rw(.write_fixed, fd, @intFromPtr(buffer.base), buffer.len, offset); sqe.buf_index = buffer_index; } @@ -1881,7 +1917,7 @@ pub const Sqe = extern struct { ) void { // `addr` holds a pointer to `sockaddr`, and `addr2` holds a pointer to socklen_t`. // `addr2` maps to `sqe.off` (u64) instead of `sqe.len` (which is only a u32). - sqe.prep_rw(.ACCEPT, fd, @intFromPtr(addr), 0, @intFromPtr(addrlen)); + sqe.prep_rw(.accept, fd, @intFromPtr(addr), 0, @intFromPtr(addrlen)); sqe.rw_flags = @bitCast(flags); } @@ -1906,7 +1942,7 @@ pub const Sqe = extern struct { flags: linux.Sock, ) void { prep_accept(sqe, fd, addr, addrlen, flags); - sqe.ioprio = .{ .accept = .{ .MULTISHOT = true } }; + sqe.ioprio = .{ .accept = .{ .multishot = true } }; } /// multishot accept directly into the fixed file table @@ -1940,7 +1976,7 @@ pub const Sqe = extern struct { addrlen: linux.socklen_t, ) void { // `addrlen` maps to `sqe.off` (u64) instead of `sqe.len` (which is only a u32). 
- sqe.prep_rw(.CONNECT, fd, @intFromPtr(addr), 0, addrlen); + sqe.prep_rw(.connect, fd, @intFromPtr(addr), 0, addrlen); } pub fn prep_epoll_ctl( @@ -1950,11 +1986,11 @@ pub const Sqe = extern struct { op: linux.EpollOp, ev: ?*linux.epoll_event, ) void { - sqe.prep_rw(.EPOLL_CTL, epfd, @intFromPtr(ev), @intFromEnum(op), @intCast(fd)); + sqe.prep_rw(.epoll_ctl, epfd, @intFromPtr(ev), @intFromEnum(op), @intCast(fd)); } pub fn prep_recv(sqe: *Sqe, fd: linux.fd_t, buffer: []u8, flags: linux.Msg) void { - sqe.prep_rw(.RECV, fd, @intFromPtr(buffer.ptr), buffer.len, 0); + sqe.prep_rw(.recv, fd, @intFromPtr(buffer.ptr), buffer.len, 0); sqe.rw_flags = @bitCast(flags); } @@ -1965,7 +2001,7 @@ pub const Sqe = extern struct { flags: linux.Msg, ) void { sqe.prep_recv(fd, buffer, flags); - sqe.ioprio = .{ .send_recv = .{ .RECV_MULTISHOT = true } }; + sqe.ioprio = .{ .send_recv = .{ .recv_multishot = true } }; } pub fn prep_recvmsg( @@ -1974,7 +2010,7 @@ pub const Sqe = extern struct { msg: *linux.msghdr, flags: linux.Msg, ) void { - sqe.prep_rw(.RECVMSG, fd, @intFromPtr(msg), 1, 0); + sqe.prep_rw(.recvmsg, fd, @intFromPtr(msg), 1, 0); sqe.rw_flags = @bitCast(flags); } @@ -1985,17 +2021,17 @@ pub const Sqe = extern struct { flags: linux.Msg, ) void { sqe.prep_recvmsg(fd, msg, flags); - sqe.ioprio = .{ .send_recv = .{ .RECV_MULTISHOT = true } }; + sqe.ioprio = .{ .send_recv = .{ .recv_multishot = true } }; } // COMMIT: fix send[|recv] flag param type pub fn prep_send(sqe: *Sqe, fd: linux.fd_t, buffer: []const u8, flags: linux.Msg) void { - sqe.prep_rw(.SEND, fd, @intFromPtr(buffer.ptr), buffer.len, 0); + sqe.prep_rw(.send, fd, @intFromPtr(buffer.ptr), buffer.len, 0); sqe.rw_flags = @bitCast(flags); } pub fn prep_send_zc(sqe: *Sqe, fd: linux.fd_t, buffer: []const u8, flags: linux.Msg, zc_flags: Sqe.SendRecv) void { - sqe.prep_rw(.SEND_ZC, fd, @intFromPtr(buffer.ptr), buffer.len, 0); + sqe.prep_rw(.send_zc, fd, @intFromPtr(buffer.ptr), buffer.len, 0); sqe.rw_flags = @bitCast(flags); sqe.ioprio = .{ .send_recv = zc_flags }; } @@ -2003,7 +2039,7 @@ pub const Sqe = extern struct { pub fn prep_send_zc_fixed(sqe: *Sqe, fd: linux.fd_t, buffer: []const u8, flags: linux.Msg, zc_flags: Sqe.SendRecv, buf_index: u16) void { const zc_flags_fixed = blk: { var updated_flags = zc_flags; - updated_flags.RECVSEND_FIXED_BUF = true; + updated_flags.recvsend_fixed_buf = true; break :blk updated_flags; }; prep_send_zc(sqe, fd, buffer, flags, zc_flags_fixed); @@ -2016,7 +2052,7 @@ pub const Sqe = extern struct { msg: *const linux.msghdr_const, flags: linux.Msg, ) void { - sqe.prep_rw(.SENDMSG, fd, @intFromPtr(msg), 1, 0); + sqe.prep_rw(.sendmsg, fd, @intFromPtr(msg), 1, 0); sqe.rw_flags = @bitCast(flags); } @@ -2027,7 +2063,7 @@ pub const Sqe = extern struct { flags: linux.Msg, ) void { prep_sendmsg(sqe, fd, msg, flags); - sqe.opcode = .SENDMSG_ZC; + sqe.opcode = .sendmsg_zc; } pub fn prep_openat( @@ -2037,7 +2073,7 @@ pub const Sqe = extern struct { flags: linux.O, mode: linux.mode_t, ) void { - sqe.prep_rw(.OPENAT, fd, @intFromPtr(path), mode, 0); + sqe.prep_rw(.openat, fd, @intFromPtr(path), mode, 0); sqe.rw_flags = @bitCast(flags); } @@ -2055,7 +2091,7 @@ pub const Sqe = extern struct { pub fn prep_close(sqe: *Sqe, fd: linux.fd_t) void { sqe.* = .{ - .opcode = .CLOSE, + .opcode = .close, .flags = .{}, .ioprio = .init_empty(), .fd = fd, @@ -2083,13 +2119,13 @@ pub const Sqe = extern struct { count: u32, flags: uflags.Timeout, ) void { - sqe.prep_rw(.TIMEOUT, -1, @intFromPtr(ts), 1, count); + sqe.prep_rw(.timeout, -1, 
@intFromPtr(ts), 1, count); sqe.rw_flags = @bitCast(flags); } pub fn prep_timeout_remove(sqe: *Sqe, timeout_user_data: u64, flags: uflags.Timeout) void { sqe.* = .{ - .opcode = .TIMEOUT_REMOVE, + .opcode = .timeout_remove, .flags = .{}, .ioprio = .init_empty(), .fd = -1, @@ -2111,7 +2147,7 @@ pub const Sqe = extern struct { ts: *const linux.kernel_timespec, flags: uflags.Timeout, ) void { - sqe.prep_rw(.LINK_TIMEOUT, -1, @intFromPtr(ts), 1, 0); + sqe.prep_rw(.link_timeout, -1, @intFromPtr(ts), 1, 0); sqe.rw_flags = @bitCast(flags); } @@ -2120,7 +2156,7 @@ pub const Sqe = extern struct { fd: linux.fd_t, poll_mask: linux.Epoll, ) void { - sqe.prep_rw(.POLL_ADD, fd, @intFromPtr(@as(?*anyopaque, null)), 0, 0); + sqe.prep_rw(.poll_add, fd, @intFromPtr(@as(?*anyopaque, null)), 0, 0); // Poll masks previously used to comprise of 16 bits in the flags union of // a SQE, but were then extended to comprise of 32 bits in order to make // room for additional option flags. To ensure that the correct bits of @@ -2134,7 +2170,7 @@ pub const Sqe = extern struct { sqe: *Sqe, target_user_data: u64, ) void { - sqe.prep_rw(.POLL_REMOVE, -1, target_user_data, 0, 0); + sqe.prep_rw(.poll_remove, -1, target_user_data, 0, 0); } pub fn prep_poll_update( @@ -2144,7 +2180,7 @@ pub const Sqe = extern struct { poll_mask: linux.Epoll, flags: uflags.Poll, ) void { - sqe.prep_rw(.POLL_REMOVE, -1, old_user_data, flags, new_user_data); + sqe.prep_rw(.poll_remove, -1, old_user_data, flags, new_user_data); // Poll masks previously used to comprise of 16 bits in the flags union of // a SQE, but were then extended to comprise of 32 bits in order to make // room for additional option flags. To ensure that the correct bits of @@ -2162,7 +2198,7 @@ pub const Sqe = extern struct { len: u64, ) void { sqe.* = .{ - .opcode = .FALLOCATE, + .opcode = .fallocate, .flags = .{}, .ioprio = .init_empty(), .fd = fd, @@ -2187,7 +2223,7 @@ pub const Sqe = extern struct { mask: linux.Statx.Mask, buf: *linux.Statx, ) void { - sqe.prep_rw(.STATX, fd, @intFromPtr(path), @as(u32, @bitCast(mask)), @intFromPtr(buf)); + sqe.prep_rw(.statx, fd, @intFromPtr(path), @as(u32, @bitCast(mask)), @intFromPtr(buf)); sqe.rw_flags = @bitCast(flags); } @@ -2196,7 +2232,7 @@ pub const Sqe = extern struct { cancel_user_data: u64, flags: uflags.AsyncCancel, ) void { - sqe.prep_rw(.ASYNC_CANCEL, -1, cancel_user_data, 0, 0); + sqe.prep_rw(.async_cancel, -1, cancel_user_data, 0, 0); sqe.rw_flags = @bitCast(flags); } @@ -2205,10 +2241,10 @@ pub const Sqe = extern struct { fd: linux.fd_t, flags: uflags.AsyncCancel, ) void { - sqe.prep_rw(.ASYNC_CANCEL, fd, 0, 0, 0); + sqe.prep_rw(.async_cancel, fd, 0, 0, 0); const enable_cancel_fd = blk: { var update_flags = flags; - update_flags.CANCEL_FD = true; + update_flags.cancel_fd = true; break :blk update_flags; }; sqe.rw_flags = @bitCast(enable_cancel_fd); @@ -2219,7 +2255,7 @@ pub const Sqe = extern struct { sockfd: linux.socket_t, how: linux.Shut, ) void { - sqe.prep_rw(.SHUTDOWN, sockfd, 0, @intFromEnum(how), 0); + sqe.prep_rw(.shutdown, sockfd, 0, @intFromEnum(how), 0); } pub fn prep_renameat( @@ -2231,7 +2267,7 @@ pub const Sqe = extern struct { flags: linux.Rename, ) void { sqe.prep_rw( - .RENAMEAT, + .renameat, old_dir_fd, @intFromPtr(old_path), 0, @@ -2247,7 +2283,7 @@ pub const Sqe = extern struct { path: [*:0]const u8, flags: linux.At, ) void { - sqe.prep_rw(.UNLINKAT, dir_fd, @intFromPtr(path), 0, 0); + sqe.prep_rw(.unlinkat, dir_fd, @intFromPtr(path), 0, 0); sqe.rw_flags = @bitCast(flags); } @@ -2257,7 +2293,7 @@ pub 
const Sqe = extern struct { path: [*:0]const u8, mode: linux.mode_t, ) void { - sqe.prep_rw(.MKDIRAT, dir_fd, @intFromPtr(path), mode, 0); + sqe.prep_rw(.mkdirat, dir_fd, @intFromPtr(path), mode, 0); } pub fn prep_symlinkat( @@ -2267,7 +2303,7 @@ pub const Sqe = extern struct { link_path: [*:0]const u8, ) void { sqe.prep_rw( - .SYMLINKAT, + .symlinkat, new_dir_fd, @intFromPtr(target), 0, @@ -2281,10 +2317,10 @@ pub const Sqe = extern struct { old_path: [*:0]const u8, new_dir_fd: linux.fd_t, new_path: [*:0]const u8, - flags: linux.At, // only AT_EMPTY_PATH, AT_SYMLINK_FOLLOW + flags: linux.At, ) void { sqe.prep_rw( - .LINKAT, + .linkat, old_dir_fd, @intFromPtr(old_path), 0, @@ -2299,14 +2335,14 @@ pub const Sqe = extern struct { fds: []const linux.fd_t, offset: u32, ) void { - sqe.prep_rw(.FILES_UPDATE, -1, @intFromPtr(fds.ptr), fds.len, @intCast(offset)); + sqe.prep_rw(.files_update, -1, @intFromPtr(fds.ptr), fds.len, @intCast(offset)); } pub fn prep_files_update_alloc( sqe: *Sqe, fds: []linux.fd_t, ) void { - sqe.prep_rw(.FILES_UPDATE, -1, @intFromPtr(fds.ptr), fds.len, constants.FILE_INDEX_ALLOC); + sqe.prep_rw(.files_update, -1, @intFromPtr(fds.ptr), fds.len, constants.FILE_INDEX_ALLOC); } // TODO: why can't slice be used here ? @@ -2319,7 +2355,7 @@ pub const Sqe = extern struct { buffer_id: usize, ) void { const ptr = @intFromPtr(buffers); - sqe.prep_rw(.PROVIDE_BUFFERS, @intCast(num), ptr, buffer_len, buffer_id); + sqe.prep_rw(.provide_buffers, @intCast(num), ptr, buffer_len, buffer_id); sqe.buf_index = @intCast(group_id); } @@ -2328,7 +2364,7 @@ pub const Sqe = extern struct { num: usize, group_id: usize, ) void { - sqe.prep_rw(.REMOVE_BUFFERS, @intCast(num), 0, 0, 0); + sqe.prep_rw(.remove_buffers, @intCast(num), 0, 0, 0); sqe.buf_index = @intCast(group_id); } @@ -2339,7 +2375,7 @@ pub const Sqe = extern struct { protocol: linux.IpProto, flags: u32, // flags is unused ) void { - sqe.prep_rw(.SOCKET, @intFromEnum(domain), 0, @intFromEnum(protocol), @intCast(@as(u32, @bitCast(socket_type)))); + sqe.prep_rw(.socket, @intFromEnum(domain), 0, @intFromEnum(protocol), @intCast(@as(u32, @bitCast(socket_type)))); sqe.rw_flags = flags; } @@ -2374,7 +2410,7 @@ pub const Sqe = extern struct { options: linux.W, flags: u32, // flags is unused ) void { - sqe.prep_rw(.WAITID, id, 0, @intFromEnum(id_type), @intFromPtr(infop)); + sqe.prep_rw(.waitid, id, 0, @intFromEnum(id_type), @intFromPtr(infop)); sqe.rw_flags = flags; sqe.splice_fd_in = @bitCast(options); } @@ -2387,7 +2423,7 @@ pub const Sqe = extern struct { addrlen: linux.socklen_t, flags: u32, // flags is unused and does't exist in io_uring's api ) void { - sqe.prep_rw(.BIND, fd, @intFromPtr(addr), 0, addrlen); + sqe.prep_rw(.bind, fd, @intFromPtr(addr), 0, addrlen); sqe.rw_flags = flags; } @@ -2397,7 +2433,7 @@ pub const Sqe = extern struct { backlog: usize, flags: u32, // flags is unused and does't exist in io_uring's api ) void { - sqe.prep_rw(.LISTEN, fd, 0, backlog, 0); + sqe.prep_rw(.listen, fd, 0, backlog, 0); sqe.rw_flags = flags; } @@ -2410,7 +2446,7 @@ pub const Sqe = extern struct { optval: u64, optlen: u32, ) void { - sqe.prep_rw(.URING_CMD, fd, 0, 0, 0); + sqe.prep_rw(.uring_cmd, fd, 0, 0, 0); // off is overloaded with cmd_op, https://github.com/axboe/liburing/blob/e1003e496e66f9b0ae06674869795edf772d5500/src/include/liburing/io_uring.h#L39 sqe.off = @intFromEnum(cmd_op); // addr is overloaded, https://github.com/axboe/liburing/blob/e1003e496e66f9b0ae06674869795edf772d5500/src/include/liburing/io_uring.h#L46 @@ -2435,7 +2471,7 
@@ pub const Sqe = extern struct { /// This SQE forms a link with the next SQE in the submission ring. Next SQE /// will not be started before this one completes. Forms a chain of SQEs. pub fn link_next(sqe: *Sqe) void { - sqe.flags.IO_LINK = true; + sqe.flags.io_link = true; } }; @@ -2461,17 +2497,17 @@ pub const Sq = struct { /// sq_ring.flags pub const Flags = packed struct(u32) { /// needs io_uring_enter wakeup - NEED_WAKEUP: bool = false, + need_wakeup: bool = false, /// CQ ring is overflown - CQ_OVERFLOW: bool = false, + cq_overflow: bool = false, /// task should enter the kernel - TASKRUN: bool = false, - _unused: u29 = 0, + taskrun: bool = false, + _: u29 = 0, }; pub fn init(fd: posix.fd_t, p: Params) !Sq { assert(fd >= 0); - assert(p.features.SINGLE_MMAP); + assert(p.features.single_mmap); const size = @max( p.sq_off.array + p.sq_entries * @sizeOf(u32), p.cq_off.cqes + p.cq_entries * @sizeOf(Cqe), @@ -2536,13 +2572,13 @@ pub const Cq = struct { /// cq_ring.flags pub const Flags = packed struct(u32) { /// disable eventfd notifications - EVENTFD_DISABLED: bool = false, - _unused: u31 = 0, + eventfd_disabled: bool = false, + _: u31 = 0, }; pub fn init(fd: posix.fd_t, p: Params, sq: Sq) !Cq { assert(fd >= 0); - assert(p.features.SINGLE_MMAP); + assert(p.features.single_mmap); const mmap = sq.mmap; const cqes: [*]Cqe = @ptrCast(@alignCast(&mmap[p.cq_off.cqes])); assert(p.cq_entries == @as(*u32, @ptrCast(@alignCast(&mmap[p.cq_off.ring_entries]))).*); @@ -2579,7 +2615,6 @@ pub const Cq = struct { /// Depending on the rate of arrival of data, it is possible that a given buffer /// group will run out of buffers before those in CQEs can be put back to the /// kernel. If this happens, a `cqe.err()` will have ENOBUFS as the error value. -/// pub const BufferGroup = struct { /// Parent ring for which this group is registered. ring: *IoUring, @@ -2610,7 +2645,7 @@ pub const BufferGroup = struct { const heads = try allocator.alloc(u32, buffers_count); errdefer allocator.free(heads); - const br = try setup_buf_ring(ring.fd, buffers_count, group_id, .{ .IOU_PBUF_RING_INC = true }); + const br = try setup_buf_ring(ring.fd, buffers_count, group_id, .{ .iou_pbuf_ring_inc = true }); buf_ring_init(br); const mask = buf_ring_mask(buffers_count); @@ -2623,7 +2658,7 @@ pub const BufferGroup = struct { } buf_ring_advance(br, buffers_count); - return BufferGroup{ + return .{ .ring = ring, .group_id = group_id, .br = br, @@ -2640,23 +2675,21 @@ pub const BufferGroup = struct { allocator.free(self.heads); } - // TODO: recv flags // Prepare recv operation which will select buffer from this group. - pub fn recv(self: *BufferGroup, user_data: u64, fd: posix.fd_t, flags: u32) !*Sqe { + pub fn recv(self: *BufferGroup, user_data: u64, fd: posix.fd_t, flags: linux.Msg) !*Sqe { var sqe = try self.ring.get_sqe(); - sqe.prep_rw(.RECV, fd, 0, 0, 0); - sqe.rw_flags = flags; - sqe.flags.BUFFER_SELECT = true; + sqe.prep_rw(.recv, fd, 0, 0, 0); + sqe.rw_flags = @bitCast(flags); + sqe.flags.buffer_select = true; sqe.buf_index = self.group_id; sqe.user_data = user_data; return sqe; } - // TODO: recv_multishot flags // Prepare multishot recv operation which will select buffer from this group. 
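As an illustrative aside, a minimal sketch of the buffer-group receive flow built from `recv` above and the multishot variant just below. It assumes `bg` is an initialized `BufferGroup` and `fd` a connected socket; the user_data value is arbitrary:

    _ = try bg.recv(0xaa, fd, .{});
    _ = try bg.ring.submit();
    const cqe = try bg.ring.copy_cqe();
    // Hand the selected buffer back to the kernel once its data has been
    // consumed (see `put` further below).
    try bg.put(cqe);
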
-    pub fn recv_multishot(self: *BufferGroup, user_data: u64, fd: posix.fd_t, flags: u32) !*Sqe {
+    pub fn recv_multishot(self: *BufferGroup, user_data: u64, fd: posix.fd_t, flags: linux.Msg) !*Sqe {
        var sqe = try self.recv(user_data, fd, flags);
-        sqe.ioprio.send_recv.RECV_MULTISHOT = true;
+        sqe.ioprio.send_recv.recv_multishot = true;
        return sqe;
    }

@@ -2676,7 +2709,7 @@ pub const BufferGroup = struct {
    // Release buffer from CQE to the kernel.
    pub fn put(self: *BufferGroup, cqe: Cqe) !void {
        const buffer_id = try cqe.buffer_id();
-        if (cqe.flags.F_BUF_MORE) {
+        if (cqe.flags.f_buf_more) {
            // Incremental consumption active, kernel will write to this buffer again
            const used_len = @as(u32, @intCast(cqe.res));
            // Track what part of the buffer is used
@@ -2734,8 +2767,7 @@ pub const SqOffsets = extern struct {
    ring_mask: u32,
    /// entries in ring
    ring_entries: u32,
-    // TODO: find type of this flags
-    /// ring flags
+    /// ring flags index
    flags: u32,
    /// number of sqes not submitted
    dropped: u32,
@@ -2753,8 +2785,7 @@ pub const CqOffsets = extern struct {
    ring_entries: u32,
    overflow: u32,
    cqes: u32,
-    // TODO: find type of these flags
-    flags: u32,
+    flags: u32, // flags index
    resv: u32,
    user_addr: u64,
};

@@ -2790,7 +2821,7 @@ pub const RegionDesc = extern struct {
    // COMMIT: new constant
    /// initialise with user provided memory pointed by user_addr
    pub const Flags = packed struct(u32) {
-        TYPE_USER: bool = false,
+        type_user: bool = false,
        _: u31 = 0,
    };
};

@@ -2805,7 +2836,7 @@ pub const MemRegionReg = extern struct {
    /// expose the region as registered wait arguments
    pub const Flags = packed struct(u64) {
-        REG_WAIT_ARG: bool = false,
+        reg_wait_arg: bool = false,
        _: u63 = 0,
    };
};

@@ -2813,11 +2844,17 @@ pub const MemRegionReg = extern struct {
/// matches io_uring_rsrc_register in liburing
pub const RsrcRegister = extern struct {
    nr: u32,
-    // TODO: find type of these flags
-    flags: u32,
+    flags: Flags,
    resv2: u64,
    data: u64,
    tags: u64,
+
+    pub const Flags = packed struct(u32) {
+        /// Register a fully sparse file space, rather than pass in an array of
+        /// all -1 file descriptors.
+        rsrc_register_sparse: bool = false,
+        _: u31 = 0,
+    };
};

/// matches io_uring_rsrc_update in liburing
@@ -2845,12 +2882,12 @@ pub const ProbeOp = extern struct {
    resv2: u32,

    pub const Flags = packed struct(u16) {
-        OP_SUPPORTED: bool = false,
+        op_supported: bool = false,
        _: u15 = 0,
    };

    pub fn is_supported(self: ProbeOp) bool {
-        return self.flags.OP_SUPPORTED;
+        return self.flags.op_supported;
    }
};

@@ -2883,9 +2920,8 @@ pub const Restriction = extern struct {
        register_op: RegisterOp,
        /// IORING_RESTRICTION_SQE_OP
        sqe_op: Op,
-        // TODO: find type of these flags
        /// IORING_RESTRICTION_SQE_FLAGS_*
-        sqe_flags: u8,
+        sqe_flags: Sqe.IoSqe,
    },
    resv: u8,
    resv2: [3]u32,
@@ -2910,8 +2946,8 @@ pub const CloneBuffers = extern struct {
    // COMMIT: new flags
    pub const Flags = packed struct(u32) {
-        REGISTER_SRC_REGISTERED: bool = false,
-        REGISTER_DST_REPLACE: bool = false,
+        register_src_registered: bool = false,
+        register_dst_replace: bool = false,
        _: u30 = 0,
    };
};

@@ -2950,7 +2986,7 @@ pub const BufferRegister = extern struct {
        /// instead it must subsequently call mmap(2) with the offset set
        /// as: IORING_OFF_PBUF_RING | (bgid << IORING_OFF_PBUF_SHIFT) to get
        /// a virtual mapping for the ring.
-        IOU_PBUF_RING_MMAP: bool = false,
+        iou_pbuf_ring_mmap: bool = false,
        /// IOU_PBUF_RING_INC:
        /// If set, buffers consumed from this buffer ring can be
        /// consumed incrementally. Normally one (or more) buffers
        /// are fully consumed. With incremental consumptions, it's
        /// feasible to register big ranges of buffers, and each
        /// use of it will consume only as much as it needs. This
        /// requires that both the kernel and application keep
        /// track of where the current read/recv index is at.
-        IOU_PBUF_RING_INC: bool = false,
+        iou_pbuf_ring_inc: bool = false,
        _: u14 = 0,
    };
};

@@ -3000,7 +3036,7 @@ pub const RegisterWait = extern struct {

    // COMMIT: new constant
    pub const Flags = packed struct(u32) {
-        REG_WAIT_TS: bool = false,
+        reg_wait_ts: bool = false,
        _: u31 = 0,
    };
};

@@ -3041,8 +3077,7 @@ pub const RecvmsgOut = extern struct {
    namelen: u32,
    controllen: u32,
    payloadlen: u32,
-    // TODO: find type of these flags
-    flags: u32,
+    flags: linux.Msg,
};

/// Zero copy receive refill queue entry
@@ -3078,7 +3113,7 @@ pub const ZcrxAreaRegister = extern struct {
    __resv2: [2]u64,

    pub const Flags = packed struct(u32) {
-        DMABUF: bool = false,
+        dmabuf: bool = false,
        _: u31 = 0,
    };
};

@@ -3089,13 +3124,13 @@ pub const ZcrxIfqRegister = extern struct {
    if_idx: u32,
    if_rxq: u32,
    rq_entries: u32,
-    // TODO: find out its flags, I suspect its ZcrxAreaRegister.Flags
+    // FIXME: I have not been able to determine the type of these flags yet
    flags: u32,
    /// pointer to struct io_uring_zcrx_area_reg
    area_ptr: u64,
    /// struct io_uring_region_desc
    region_ptr: u64,
-    offsets: ZcrxOffsets,
+    offsets: ZcrxOffsets, // the kernel fills in the offsets
    zcrx_id: u32,
    __resv2: u32,
    __resv: [3]u64,
};

@@ -3103,10 +3138,11 @@ pub const ZcrxIfqRegister = extern struct {
// COMMIT: move IoUring constants to Constants
pub const constants = struct {
-    /// If sqe.file_index (splice_fd_in in Zig Struct) is set to this for opcodes that instantiate a new
-    /// an available direct descriptor instead of having the application pass one
-    /// direct descriptor (like openat/openat2/accept), then io_uring will allocate
-    /// in. The picked direct descriptor will be returned in cqe.res, or -ENFILE
+    /// If sqe.file_index (splice_fd_in in Zig Struct) is set to this for
+    /// opcodes that instantiate a new direct descriptor
+    /// (like openat/openat2/accept), then io_uring will allocate an
+    /// available direct descriptor instead of having the application
+    /// pass one in. The picked direct descriptor will be returned in cqe.res, or -ENFILE
    /// if the space is full.
    pub const FILE_INDEX_ALLOC = std.math.maxInt(u32);

@@ -3131,9 +3167,6 @@ pub const constants = struct {
    pub const OFF_PBUF_SHIFT = 16;
    pub const OFF_MMAP_MASK = 0xf8000000;

-    /// Register a fully sparse file space, rather than pass in an array of all -1 file descriptors.
-    pub const RSRC_REGISTER_SPARSE = 1 << 0;
-
    /// Skip updating fd indexes set to this value in the fd table
    pub const REGISTER_FILES_SKIP = -2;

@@ -3158,184 +3191,185 @@ pub const uflags = struct {
    /// io_uring_setup() flags
    pub const Setup = packed struct(u32) {
        /// io_context is polled
-        IOPOLL: bool = false,
+        iopoll: bool = false,
        /// SQ poll thread
-        SQPOLL: bool = false,
+        sqpoll: bool = false,
        /// sq_thread_cpu is valid
-        SQ_AFF: bool = false,
+        sq_aff: bool = false,
        /// app defines CQ size
-        CQSIZE: bool = false,
+        cqsize: bool = false,
        /// clamp SQ/CQ ring sizes
-        CLAMP: bool = false,
+        clamp: bool = false,
        /// attach to existing wq
-        ATTACH_WQ: bool = false,
+        attach_wq: bool = false,
        /// start with ring disabled
-        R_DISABLED: bool = false,
+        r_disabled: bool = false,
        /// continue submit on error
-        SUBMIT_ALL: bool = false,
-        ///Cooperative task running.
When requests complete, they often require - ///forcing the submitter to transition to the kernel to complete. If this - ///flag is set, work will be done when the task transitions anyway, rather - ///than force an inter-processor interrupt reschedule. This avoids interrupting - ///a task running in userspace, and saves an IPI. - COOP_TASKRUN: bool = false, - ///If COOP_TASKRUN is set, get notified if task work is available for - ///running and a kernel transition would be needed to run it. This sets - ///IORING_SQ_TASKRUN in the sq ring flags. Not valid with COOP_TASKRUN. - TASKRUN_FLAG: bool = false, + submit_all: bool = false, + /// Cooperative task running. When requests complete, they often require + /// forcing the submitter to transition to the kernel to complete. If + /// this flag is set, work will be done when the task transitions + /// anyway, rather than force an inter-processor interrupt reschedule. + /// This avoids interrupting a task running in userspace, and saves an + /// IPI. + coop_taskrun: bool = false, + /// If COOP_TASKRUN is set, get notified if task work is available for + /// running and a kernel transition would be needed to run it. This sets + /// IORING_SQ_TASKRUN in the sq ring flags. Not valid with COOP_TASKRUN. + taskrun_flag: bool = false, /// SQEs are 128 byte - SQE128: bool = false, + sqe128: bool = false, /// CQEs are 32 byte - CQE32: bool = false, + cqe32: bool = false, /// Only one task is allowed to submit requests - SINGLE_ISSUER: bool = false, + single_issuer: bool = false, /// Defer running task work to get events. /// Rather than running bits of task work whenever the task transitions /// try to do it just before it is needed. - DEFER_TASKRUN: bool = false, + defer_taskrun: bool = false, /// Application provides the memory for the rings - NO_MMAP: bool = false, + no_mmap: bool = false, /// Register the ring fd in itself for use with - /// IORING_REGISTER_USE_REGISTERED_RING; return a registered fd index rather - /// than an fd. - REGISTERED_FD_ONLY: bool = false, + /// IORING_REGISTER_USE_REGISTERED_RING; return a registered fd index + /// rather than an fd. + registered_fd_only: bool = false, /// Removes indirection through the SQ index array. - NO_SQARRAY: bool = false, + no_sqarray: bool = false, // COMMIT: new setup flags /// Use hybrid poll in iopoll process - HYBRID_IOPOLL: bool = false, + hybrid_iopoll: bool = false, /// Allow both 16b and 32b CQEs. If a 32b CQE is posted, it will have /// IORING_CQE_F_32 set in cqe.flags. - CQE_MIXED: bool = false, - _unused: u13 = 0, + cqe_mixed: bool = false, + _20: u13 = 0, }; /// sqe.uring_cmd_flags (rw_flags in the Zig struct) /// top 8bits aren't available for userspace /// use registered buffer; pass this flag along with setting sqe.buf_index. 
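As an illustrative aside before the remaining per-opcode flag structs: with the typed `Setup` struct above, a ring can be initialized by naming flags rather than or-ing raw bits. The exact `init` signature is outside this hunk, so this sketch assumes it takes the entry count plus a `uflags.Setup` value; everything here is a placeholder.

    // Hypothetical call: both flags are plain optimizations and may be
    // combined; incompatible combinations are rejected at setup time.
    var ring = try IoUring.init(16, .{ .single_issuer = true, .coop_taskrun = true });
    defer ring.deinit();
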
pub const Cmd = packed struct(u32) { - CMD_FIXED: bool = false, - _unused: u31 = 0, + cmd_fixed: bool = false, + _2: u31 = 0, }; /// sqe.fsync_flags (rw_flags in the Zig struct) pub const Fsync = packed struct(u32) { - DATASYNC: bool = false, - _unused: u31 = 0, + datasync: bool = false, + _2: u31 = 0, }; /// sqe.timeout_flags pub const Timeout = packed struct(u32) { - TIMEOUT_ABS: bool = false, + timeout_abs: bool = false, /// Available since Linux 5.11 - TIMEOUT_UPDATE: bool = false, + timeout_update: bool = false, /// Available since Linux 5.15 - TIMEOUT_BOOTTIME: bool = false, + timeout_boottime: bool = false, /// Available since Linux 5.15 - TIMEOUT_REALTIME: bool = false, + timeout_realtime: bool = false, /// Available since Linux 5.15 - LINK_TIMEOUT_UPDATE: bool = false, + link_timeout_update: bool = false, /// Available since Linux 5.16 - TIMEOUT_ETIME_SUCCESS: bool = false, + timeout_etime_success: bool = false, // COMMIT: new Timeout Flag // TODO: add when it became available - TIMEOUT_MULTISHOT: bool = false, - _unused: u25 = 0, + timeout_multishot: bool = false, + _8: u25 = 0, }; /// sqe.splice_flags (rw_flags in Zig Struct) /// extends splice(2) flags pub const Splice = packed struct(u32) { - _unused: u31 = 0, + _1: u31 = 0, /// the last bit of __u32 - F_FD_IN_FIXED: bool = false, + f_fd_in_fixed: bool = false, }; /// POLL_ADD flags. Note that since sqe.poll_events (rw_flags in Zig Struct) /// is the flag space, the command flags for POLL_ADD are stored in sqe.len. pub const Poll = packed struct(u32) { /// IORING_POLL_ADD_MULTI - /// Multishot poll. Sets IORING_CQE_F_MORE if the poll handler will continue - /// to report CQEs on behalf of the same SQE. - ADD_MULTI: bool = false, + /// Multishot poll. Sets IORING_CQE_F_MORE if the poll handler will + /// continue to report CQEs on behalf of the same SQE. + add_multi: bool = false, // TODO: verify this doc comment is valid for the 2 flags below /// IORING_POLL_UPDATE /// Update existing poll request, matching sqe.addr as the old user_data /// field. - UPDATE_EVENTS: bool = false, + update_events: bool = false, /// IORING_POLL_UPDATE /// Update existing poll request, matching sqe.addr as the old user_data /// field. - UPDATE_USER_DATA: bool = false, + update_user_data: bool = false, /// IORING_POLL_LEVEL /// Level triggered poll. - ADD_LEVEL: bool = false, - _unused: u28 = 0, + add_level: bool = false, + _5: u28 = 0, }; /// ASYNC_CANCEL flags. pub const AsyncCancel = packed struct(u32) { /// IORING_ASYNC_CANCEL_ALL /// Cancel all requests that match the given key - CANCEL_ALL: bool = false, + cancel_all: bool = false, /// IORING_ASYNC_CANCEL_FD /// Key off 'fd' for cancelation rather than the request 'user_data' - CANCEL_FD: bool = false, + cancel_fd: bool = false, /// IORING_ASYNC_CANCEL_ANY /// Match any request - CANCEL_ANY: bool = false, + cancel_any: bool = false, /// IORING_ASYNC_CANCEL_FD_FIXED /// 'fd' passed in is a fixed descriptor - CANCEL_FD_FIXED: bool = false, + cancel_fd_fixed: bool = false, // COMMIT: new AsyncCancel Flags /// IORING_ASYNC_CANCEL_USERDATA /// Match on user_data, default for no other key - CANCEL_USERDATA: bool = false, + cancel_userdata: bool = false, /// IORING_ASYNC_CANCEL_OP /// Match request based on opcode - CANCEL_OP: bool = false, - _unused: u26 = 0, + cancel_op: bool = false, + _7: u26 = 0, }; /// IORING_OP_MSG_RING flags (sqe.msg_ring_flags or sqe.rw_flags in Zig Struct) pub const MsgRing = packed struct(u32) { /// IORING_MSG_RING_CQE_SKIP Don't post a CQE to the target ring. 
        /// Not applicable for IORING_MSG_DATA, obviously.
-        CQE_SKIP: bool = false,
+        cqe_skip: bool = false,
        /// Pass through the flags from sqe.file_index to cqe.flags
-        FLAGS_PASS: bool = false,
-        _unused: u30 = 0,
+        flags_pass: bool = false,
+        _3: u30 = 0,
    };

    // COMMIT: new flag
    /// IORING_OP_FIXED_FD_INSTALL flags (sqe.install_fd_flags or sqe.rw_flags in Zig Struct)
    pub const FixedFd = packed struct(u32) {
        /// IORING_FIXED_FD_NO_CLOEXEC Don't mark the fd as O_CLOEXEC
-        NO_CLOEXEC: bool = false,
+        no_cloexec: bool = false,
+        _: u31 = 0,
    };

    /// COMMIT: new flags
    /// IORING_OP_NOP flags (sqe.nop_flags or sqe.rw_flags in Zig Struct)
    pub const Nop = packed struct(u32) {
        /// IORING_NOP_INJECT_RESULT Inject result from sqe.result
-        INJECT_RESULT: bool = false,
-        _unused: u4 = 0,
-        CQE32: bool = false,
-        _unused_1: u26 = 0,
+        inject_result: bool = false,
+        _2: u4 = 0,
+        cqe32: bool = false,
+        _7: u26 = 0,
    };

    /// io_uring_enter(2) flags
    pub const Enter = packed struct(u32) {
-        GETEVENTS: bool = false,
-        SQ_WAKEUP: bool = false,
-        SQ_WAIT: bool = false,
-        EXT_ARG: bool = false,
-        REGISTERED_RING: bool = false,
-        // COMMIT: new flags
-        ABS_TIMER: bool = false,
-        EXT_ARG_REG: bool = false,
-        NO_IOWAIT: bool = false,
-        _unused: u24 = 0,
+        getevents: bool = false,
+        sq_wakeup: bool = false,
+        sq_wait: bool = false,
+        ext_arg: bool = false,
+        registered_ring: bool = false,
+        // COMMIT: new flags
+        abs_timer: bool = false,
+        ext_arg_reg: bool = false,
+        no_iowait: bool = false,
+        _9: u24 = 0,

        pub fn empty(enter_flags: Enter) bool {
            return @as(u32, @bitCast(enter_flags)) == 0;
@@ -3344,26 +3378,26 @@ pub const uflags = struct {

    /// io_uring_params.features flags
    const Features = packed struct(u32) {
-        SINGLE_MMAP: bool = false,
-        NODROP: bool = false,
-        SUBMIT_STABLE: bool = false,
-        RW_CUR_POS: bool = false,
-        CUR_PERSONALITY: bool = false,
-        FAST_POLL: bool = false,
-        POLL_32BITS: bool = false,
-        SQPOLL_NONFIXED: bool = false,
-        EXT_ARG: bool = false,
-        NATIVE_WORKERS: bool = false,
-        RSRC_TAGS: bool = false,
-        CQE_SKIP: bool = false,
-        LINKED_FILE: bool = false,
-        // COMMIT: add new Feature Flags
-        REG_REG_RING: bool = false,
-        RECVSEND_BUNDLE: bool = false,
-        MIN_TIMEOUT: bool = false,
-        RW_ATTR: bool = false,
-        NO_IOWAIT: bool = false,
-        _unused: u14 = 0,
+        single_mmap: bool = false,
+        nodrop: bool = false,
+        submit_stable: bool = false,
+        rw_cur_pos: bool = false,
+        cur_personality: bool = false,
+        fast_poll: bool = false,
+        poll_32bits: bool = false,
+        sqpoll_nonfixed: bool = false,
+        ext_arg: bool = false,
+        native_workers: bool = false,
+        rsrc_tags: bool = false,
+        cqe_skip: bool = false,
+        linked_file: bool = false,
+        // COMMIT: add new feature flags
+        reg_reg_ring: bool = false,
+        recvsend_bundle: bool = false,
+        min_timeout: bool = false,
+        rw_attr: bool = false,
+        no_iowait: bool = false,
+        _19: u14 = 0,

        pub fn empty(features: Features) bool {
            return @as(u32, @bitCast(features)) == 0;
@@ -3373,73 +3407,73 @@ pub const uflags = struct {

/// io_uring_register(2) opcodes and arguments
/// matches io_uring_register_op in liburing
pub const RegisterOp = enum(u8) {
-    REGISTER_BUFFERS,
-    UNREGISTER_BUFFERS,
-    REGISTER_FILES,
-    UNREGISTER_FILES,
-    REGISTER_EVENTFD,
-    UNREGISTER_EVENTFD,
-    REGISTER_FILES_UPDATE,
-    REGISTER_EVENTFD_ASYNC,
-    REGISTER_PROBE,
-    REGISTER_PERSONALITY,
-    UNREGISTER_PERSONALITY,
-    REGISTER_RESTRICTIONS,
-    REGISTER_ENABLE_RINGS,
+    register_buffers,
+    unregister_buffers,
+    register_files,
+    unregister_files,
+    register_eventfd,
+    unregister_eventfd,
+    register_files_update,
+
register_eventfd_async, + register_probe, + register_personality, + unregister_personality, + register_restrictions, + register_enable_rings, // extended with tagging - REGISTER_FILES2, - REGISTER_FILES_UPDATE2, - REGISTER_BUFFERS2, - REGISTER_BUFFERS_UPDATE, + register_files2, + register_files_update2, + register_buffers2, + register_buffers_update, // set/clear io-wq thread affinities - REGISTER_IOWQ_AFF, - UNREGISTER_IOWQ_AFF, + register_iowq_aff, + unregister_iowq_aff, // set/get max number of io-wq workers - REGISTER_IOWQ_MAX_WORKERS, + register_iowq_max_workers, // register/unregister io_uring fd with the ring - REGISTER_RING_FDS, - UNREGISTER_RING_FDS, + register_ring_fds, + unregister_ring_fds, // register ring based provide buffer group - REGISTER_PBUF_RING, - UNREGISTER_PBUF_RING, + register_pbuf_ring, + unregister_pbuf_ring, // sync cancelation API - REGISTER_SYNC_CANCEL, + register_sync_cancel, // register a range of fixed file slots for automatic slot allocation - REGISTER_FILE_ALLOC_RANGE, + register_file_alloc_range, // return status information for a buffer group - REGISTER_PBUF_STATUS, + register_pbuf_status, // set/clear busy poll settings - REGISTER_NAPI, - UNREGISTER_NAPI, + register_napi, + unregister_napi, - REGISTER_CLOCK, + register_clock, // clone registered buffers from source ring to current ring - REGISTER_CLONE_BUFFERS, + register_clone_buffers, // send MSG_RING without having a ring - REGISTER_SEND_MSG_RING, + register_send_msg_ring, // register a netdev hw rx queue for zerocopy - REGISTER_ZCRX_IFQ, + register_zcrx_ifq, // resize CQ ring - REGISTER_RESIZE_RINGS, + register_resize_rings, - REGISTER_MEM_REGION, + register_mem_region, // COMMIT: new register opcode // query various aspects of io_uring, see linux/io_uring/query.h - REGISTER_QUERY, + register_query, _, }; @@ -3447,30 +3481,32 @@ pub const RegisterOp = enum(u8) { /// io-wq worker categories /// matches io_wq_type in liburing pub const IoWqCategory = enum(u8) { - BOUND, - UNBOUND, + bound, + unbound, + _, }; pub const SocketOp = enum(u16) { - SIOCIN, - SIOCOUTQ, - GETSOCKOPT, - SETSOCKOPT, + siocin, + siocoutq, + getsockopt, + setsockopt, // COMMIT: new socket op - TX_TIMESTAMP, + tx_timestamp, + _, }; /// io_uring_restriction.opcode values /// matches io_uring_register_restriction_op in liburing pub const RestrictionOp = enum(u16) { /// Allow an io_uring_register(2) opcode - REGISTER_OP = 0, + register_op = 0, /// Allow an sqe opcode - SQE_OP = 1, + sqe_op = 1, /// Allow sqe flags - SQE_FLAGS_ALLOWED = 2, + sqe_flags_allowed = 2, /// Require sqe flags (these flags must be set on each submission) - SQE_FLAGS_REQUIRED = 3, + sqe_flags_required = 3, _, }; @@ -3478,78 +3514,78 @@ pub const RestrictionOp = enum(u16) { /// IORING_OP_MSG_RING command types, stored in sqe.addr pub const MsgRingCmd = enum { /// pass sqe->len as 'res' and off as user_data - DATA, + data, /// send a registered fd to another ring - SEND_FD, + send_fd, }; // COMMIT: OP to IoUring pub const Op = enum(u8) { - NOP, - READV, - WRITEV, - FSYNC, - READ_FIXED, - WRITE_FIXED, - POLL_ADD, - POLL_REMOVE, - SYNC_FILE_RANGE, - SENDMSG, - RECVMSG, - TIMEOUT, - TIMEOUT_REMOVE, - ACCEPT, - ASYNC_CANCEL, - LINK_TIMEOUT, - CONNECT, - FALLOCATE, - OPENAT, - CLOSE, - FILES_UPDATE, - STATX, - READ, - WRITE, - FADVISE, - MADVISE, - SEND, - RECV, - EPOLL_CTL, - OPENAT2, - SPLICE, - PROVIDE_BUFFERS, - REMOVE_BUFFERS, - TEE, - SHUTDOWN, - RENAMEAT, - UNLINKAT, - MKDIRAT, - SYMLINKAT, - LINKAT, - MSG_RING, - FSETXATTR, - SETXATTR, - FGETXATTR, - 
GETXATTR, - SOCKET, - URING_CMD, - SEND_ZC, - SENDMSG_ZC, - READ_MULTISHOT, - WAITID, - FUTEX_WAIT, - FUTEX_WAKE, - FUTEX_WAITV, - FIXED_FD_INSTALL, - FTRUNCATE, - BIND, - LISTEN, - RECV_ZC, + nop, + readv, + writev, + fsync, + read_fixed, + write_fixed, + poll_add, + poll_remove, + sync_file_range, + sendmsg, + recvmsg, + timeout, + timeout_remove, + accept, + async_cancel, + link_timeout, + connect, + fallocate, + openat, + close, + files_update, + statx, + read, + write, + fadvise, + madvise, + send, + recv, + epoll_ctl, + openat2, + splice, + provide_buffers, + remove_buffers, + tee, + shutdown, + renameat, + unlinkat, + mkdirat, + symlinkat, + linkat, + msg_ring, + fsetxattr, + setxattr, + fgetxattr, + getxattr, + socket, + uring_cmd, + send_zc, + sendmsg_zc, + read_multishot, + waitid, + futex_wait, + futex_wake, + futex_waitv, + fixed_fd_install, + ftruncate, + bind, + listen, + recv_zc, // COMMIT: new OPs // TODO: to be implemented - EPOLL_WAIT, - READV_FIXED, - WRITEV_FIXED, - PIPE, + epoll_wait, + readv_fixed, + writev_fixed, + pipe, _, }; @@ -3584,7 +3620,7 @@ test "nop" { const sqe = try ring.nop(0xaaaaaaaa); try testing.expectEqual(Sqe{ - .opcode = .NOP, + .opcode = .nop, .flags = .{}, .ioprio = .init_empty(), .fd = 0, @@ -3623,7 +3659,7 @@ test "nop" { try testing.expectEqual(@as(u32, 0), ring.cq_ready()); const sqe_barrier = try ring.nop(0xbbbbbbbb); - sqe_barrier.flags.IO_DRAIN = true; + sqe_barrier.flags.io_drain = true; try testing.expectEqual(@as(u32, 1), try ring.submit()); try testing.expectEqual(Cqe{ .user_data = 0xbbbbbbbb, @@ -3663,8 +3699,8 @@ test "readv" { var buffer = [_]u8{42} ** 128; var iovecs = [_]posix.iovec{posix.iovec{ .base = &buffer, .len = buffer.len }}; const sqe = try ring.read(0xcccccccc, fd_index, .{ .iovecs = iovecs[0..] }, 0); - try testing.expectEqual(Op.READV, sqe.opcode); - sqe.flags.FIXED_FILE = true; + try testing.expectEqual(Op.readv, sqe.opcode); + sqe.flags.fixed_file = true; try testing.expectError(error.SubmissionQueueFull, ring.nop(0)); try testing.expectEqual(@as(u32, 1), try ring.submit()); @@ -3706,17 +3742,17 @@ test "writev/fsync/readv" { }; const sqe_writev = try ring.writev(0xdddddddd, fd, iovecs_write[0..], 17); - try testing.expectEqual(Op.WRITEV, sqe_writev.opcode); + try testing.expectEqual(Op.writev, sqe_writev.opcode); try testing.expectEqual(@as(u64, 17), sqe_writev.off); - sqe_writev.flags.IO_LINK = true; + sqe_writev.flags.io_link = true; const sqe_fsync = try ring.fsync(0xeeeeeeee, fd, .{}); - try testing.expectEqual(Op.FSYNC, sqe_fsync.opcode); + try testing.expectEqual(Op.fsync, sqe_fsync.opcode); try testing.expectEqual(fd, sqe_fsync.fd); - sqe_fsync.flags.IO_LINK = true; + sqe_fsync.flags.io_link = true; const sqe_readv = try ring.read(0xffffffff, fd, .{ .iovecs = iovecs_read[0..] }, 17); - try testing.expectEqual(Op.READV, sqe_readv.opcode); + try testing.expectEqual(Op.readv, sqe_readv.opcode); try testing.expectEqual(@as(u64, 17), sqe_readv.off); try testing.expectEqual(@as(u32, 3), ring.sq_ready()); @@ -3768,11 +3804,11 @@ test "write/read" { const buffer_write = [_]u8{97} ** 20; var buffer_read = [_]u8{98} ** 20; const sqe_write = try ring.write(0x11111111, fd, buffer_write[0..], 10); - try testing.expectEqual(Op.WRITE, sqe_write.opcode); + try testing.expectEqual(Op.write, sqe_write.opcode); try testing.expectEqual(@as(u64, 10), sqe_write.off); - sqe_write.flags.IO_LINK = true; + sqe_write.flags.io_link = true; const sqe_read = try ring.read(0x22222222, fd, .{ .buffer = buffer_read[0..] 
}, 10); - try testing.expectEqual(Op.READ, sqe_read.opcode); + try testing.expectEqual(Op.read, sqe_read.opcode); try testing.expectEqual(@as(u64, 10), sqe_read.off); try testing.expectEqual(@as(u32, 2), try ring.submit()); @@ -3824,21 +3860,21 @@ test "splice/read" { const pipe_offset: u64 = std.math.maxInt(u64); const sqe_splice_to_pipe = try ring.splice(0x11111111, fd_src, 0, fds[1], pipe_offset, buffer_write.len); - try testing.expectEqual(Op.SPLICE, sqe_splice_to_pipe.opcode); + try testing.expectEqual(Op.splice, sqe_splice_to_pipe.opcode); try testing.expectEqual(@as(u64, 0), sqe_splice_to_pipe.addr); try testing.expectEqual(pipe_offset, sqe_splice_to_pipe.off); // TODO: use io_link function - sqe_splice_to_pipe.flags.IO_LINK = true; + sqe_splice_to_pipe.flags.io_link = true; const sqe_splice_from_pipe = try ring.splice(0x22222222, fds[0], pipe_offset, fd_dst, 10, buffer_write.len); - try testing.expectEqual(Op.SPLICE, sqe_splice_from_pipe.opcode); + try testing.expectEqual(Op.splice, sqe_splice_from_pipe.opcode); try testing.expectEqual(pipe_offset, sqe_splice_from_pipe.addr); try testing.expectEqual(@as(u64, 10), sqe_splice_from_pipe.off); // TODO: use io_link function - sqe_splice_from_pipe.flags.IO_LINK = true; + sqe_splice_from_pipe.flags.io_link = true; const sqe_read = try ring.read(0x33333333, fd_dst, .{ .buffer = buffer_read[0..] }, 10); - try testing.expectEqual(Op.READ, sqe_read.opcode); + try testing.expectEqual(Op.read, sqe_read.opcode); try testing.expectEqual(@as(u64, 10), sqe_read.off); try testing.expectEqual(@as(u32, 3), try ring.submit()); @@ -3904,13 +3940,13 @@ test "write_fixed/read_fixed" { }; const sqe_write = try ring.write_fixed(0x45454545, fd, &buffers[0], 3, 0); - try testing.expectEqual(Op.WRITE_FIXED, sqe_write.opcode); + try testing.expectEqual(Op.write_fixed, sqe_write.opcode); try testing.expectEqual(@as(u64, 3), sqe_write.off); // TODO: use io_link function - sqe_write.flags.IO_LINK = true; + sqe_write.flags.io_link = true; const sqe_read = try ring.read_fixed(0x12121212, fd, &buffers[1], 0, 1); - try testing.expectEqual(Op.READ_FIXED, sqe_read.opcode); + try testing.expectEqual(Op.read_fixed, sqe_read.opcode); try testing.expectEqual(@as(u64, 0), sqe_read.off); try testing.expectEqual(@as(u32, 2), try ring.submit()); @@ -3960,7 +3996,7 @@ test "openat" { const mode: posix.mode_t = 0o666; const sqe_openat = try ring.openat(0x33333333, tmp.dir.fd, path, flags, mode); try testing.expectEqual(Sqe{ - .opcode = .OPENAT, + .opcode = .openat, .flags = .{}, .ioprio = .init_empty(), .fd = tmp.dir.fd, @@ -4006,7 +4042,7 @@ test "close" { errdefer file.close(); const sqe_close = try ring.close(0x44444444, file.handle); - try testing.expectEqual(Op.CLOSE, sqe_close.opcode); + try testing.expectEqual(Op.close, sqe_close.opcode); try testing.expectEqual(file.handle, sqe_close.fd); try testing.expectEqual(@as(u32, 1), try ring.submit()); @@ -4037,7 +4073,7 @@ test "accept/connect/send/recv" { const sqe_send = try ring.send(0xeeeeeeee, socket_test_harness.client, buffer_send[0..], .{}); // TODO: use io_link function - sqe_send.flags.IO_LINK = true; + sqe_send.flags.io_link = true; _ = try ring.recv(0xffffffff, socket_test_harness.server, .{ .buffer = buffer_recv[0..] 
}, .{}); try testing.expectEqual(@as(u32, 2), try ring.submit()); @@ -4057,7 +4093,7 @@ test "accept/connect/send/recv" { // TODO: comment seems to be wrong but clarify with previous maintainers // Only check IORING_CQE_F_SOCK_NONEMPTY flag, as other flags are system-dependent (Might be more appropriate) // ignore IORING_CQE_F_SOCK_NONEMPTY since it is only set on some systems - .flags = .{ .F_SOCK_NONEMPTY = cqe_recv.flags.F_SOCK_NONEMPTY }, + .flags = .{ .f_sock_nonempty = cqe_recv.flags.f_sock_nonempty }, }, cqe_recv); try testing.expectEqualSlices(u8, buffer_send[0..buffer_recv.len], buffer_recv[0..]); @@ -4105,8 +4141,8 @@ test "sendmsg/recvmsg" { .flags = 0, }; const sqe_sendmsg = try ring.sendmsg(0x11111111, client, &msg_send, .{}); - sqe_sendmsg.flags.IO_LINK = true; - try testing.expectEqual(Op.SENDMSG, sqe_sendmsg.opcode); + sqe_sendmsg.flags.io_link = true; + try testing.expectEqual(Op.sendmsg, sqe_sendmsg.opcode); try testing.expectEqual(client, sqe_sendmsg.fd); var buffer_recv = [_]u8{0} ** 128; @@ -4127,7 +4163,7 @@ test "sendmsg/recvmsg" { .flags = 0, }; const sqe_recvmsg = try ring.recvmsg(0x22222222, server, &msg_recv, .{}); - try testing.expectEqual(Op.RECVMSG, sqe_recvmsg.opcode); + try testing.expectEqual(Op.recvmsg, sqe_recvmsg.opcode); try testing.expectEqual(server, sqe_recvmsg.fd); try testing.expectEqual(@as(u32, 2), ring.sq_ready()); @@ -4149,7 +4185,7 @@ test "sendmsg/recvmsg" { .user_data = 0x22222222, .res = buffer_recv.len, // ignore IORING_CQE_F_SOCK_NONEMPTY since it is set non-deterministically - .flags = .{ .F_SOCK_NONEMPTY = cqe_recvmsg.flags.F_SOCK_NONEMPTY }, + .flags = .{ .f_sock_nonempty = cqe_recvmsg.flags.f_sock_nonempty }, }, cqe_recvmsg); try testing.expectEqualSlices(u8, buffer_send[0..buffer_recv.len], buffer_recv[0..]); @@ -4171,8 +4207,8 @@ test "timeout (after a relative time)" { const ts: linux.kernel_timespec = .{ .sec = 0, .nsec = ms * 1000000 }; const started = try std.Io.Clock.awake.now(io); - const sqe = try ring.timeout(0x55555555, &ts, 0, 0); - try testing.expectEqual(Op.TIMEOUT, sqe.opcode); + const sqe = try ring.timeout(0x55555555, &ts, 0, .{}); + try testing.expectEqual(Op.timeout, sqe.opcode); try testing.expectEqual(@as(u32, 1), try ring.submit()); const cqe = try ring.copy_cqe(); const stopped = try std.Io.Clock.awake.now(io); @@ -4201,7 +4237,7 @@ test "timeout (after a number of completions)" { const ts: linux.kernel_timespec = .{ .sec = 3, .nsec = 0 }; const count_completions: u64 = 1; const sqe_timeout = try ring.timeout(0x66666666, &ts, count_completions, .{}); - try testing.expectEqual(Op.TIMEOUT, sqe_timeout.opcode); + try testing.expectEqual(Op.timeout, sqe_timeout.opcode); try testing.expectEqual(count_completions, sqe_timeout.off); _ = try ring.nop(0x77777777); try testing.expectEqual(@as(u32, 2), try ring.submit()); @@ -4233,11 +4269,11 @@ test "timeout_remove" { const ts: linux.kernel_timespec = .{ .sec = 3, .nsec = 0 }; const sqe_timeout = try ring.timeout(0x88888888, &ts, 0, .{}); - try testing.expectEqual(Op.TIMEOUT, sqe_timeout.opcode); + try testing.expectEqual(Op.timeout, sqe_timeout.opcode); try testing.expectEqual(@as(u64, 0x88888888), sqe_timeout.user_data); const sqe_timeout_remove = try ring.timeout_remove(0x99999999, 0x88888888, .{}); - try testing.expectEqual(Op.TIMEOUT_REMOVE, sqe_timeout_remove.opcode); + try testing.expectEqual(Op.timeout_remove, sqe_timeout_remove.opcode); try testing.expectEqual(@as(u64, 0x88888888), sqe_timeout_remove.addr); try testing.expectEqual(@as(u64, 0x99999999), 
sqe_timeout_remove.user_data); @@ -4258,7 +4294,7 @@ test "timeout_remove" { // We don't want to skip this test for newer kernels. if (cqe.user_data == 0x99999999 and cqe.err() == .BADF and - (!ring.features.RW_CUR_POS)) + (!ring.features.rw_cur_pos)) { return error.SkipZigTest; } @@ -4298,7 +4334,7 @@ test "accept/connect/recv/link_timeout" { const sqe_recv = try ring.recv(0xffffffff, socket_test_harness.server, .{ .buffer = buffer_recv[0..] }, .{}); // TODO: use io_link function - sqe_recv.flags.IO_LINK = true; + sqe_recv.flags.io_link = true; const ts = linux.kernel_timespec{ .sec = 0, .nsec = 1000000 }; _ = try ring.link_timeout(0x22222222, &ts, .{}); @@ -4352,7 +4388,7 @@ test "fallocate" { const len: u64 = 65536; const sqe = try ring.fallocate(0xaaaaaaaa, file.handle, 0, 0, len); - try testing.expectEqual(Op.FALLOCATE, sqe.opcode); + try testing.expectEqual(Op.fallocate, sqe.opcode); try testing.expectEqual(file.handle, sqe.fd); try testing.expectEqual(@as(u32, 1), try ring.submit()); @@ -4406,7 +4442,7 @@ test "statx" { .{ .size = true }, &buf, ); - try testing.expectEqual(Op.STATX, sqe.opcode); + try testing.expectEqual(Op.statx, sqe.opcode); try testing.expectEqual(@as(i32, tmp.dir.fd), sqe.fd); try testing.expectEqual(@as(u32, 1), try ring.submit()); @@ -4453,7 +4489,7 @@ test "accept/connect/recv/cancel" { try testing.expectEqual(@as(u32, 1), try ring.submit()); const sqe_cancel = try ring.cancel(0x99999999, 0xffffffff, .{}); - try testing.expectEqual(Op.ASYNC_CANCEL, sqe_cancel.opcode); + try testing.expectEqual(Op.async_cancel, sqe_cancel.opcode); try testing.expectEqual(@as(u64, 0xffffffff), sqe_cancel.addr); try testing.expectEqual(@as(u64, 0x99999999), sqe_cancel.user_data); try testing.expectEqual(@as(u32, 1), try ring.submit()); @@ -4522,9 +4558,9 @@ test "register_files_update" { var buffer = [_]u8{42} ** 128; { const sqe = try ring.read(0xcccccccc, fd_index, .{ .buffer = &buffer }, 0); - try testing.expectEqual(Op.READ, sqe.opcode); + try testing.expectEqual(Op.read, sqe.opcode); // TODO: use setflags function - sqe.flags.FIXED_FILE = true; + sqe.flags.fixed_file = true; try testing.expectEqual(@as(u32, 1), try ring.submit()); try testing.expectEqual(Cqe{ @@ -4544,9 +4580,9 @@ test "register_files_update" { { // Next read should still work since fd_index in the registered file descriptors hasn't been updated yet. 
const sqe = try ring.read(0xcccccccc, fd_index, .{ .buffer = &buffer }, 0); - try testing.expectEqual(Op.READ, sqe.opcode); + try testing.expectEqual(Op.read, sqe.opcode); // TODO: use setflags function - sqe.flags.FIXED_FILE = true; + sqe.flags.fixed_file = true; try testing.expectEqual(@as(u32, 1), try ring.submit()); try testing.expectEqual(Cqe{ @@ -4562,9 +4598,9 @@ test "register_files_update" { { // Now this should fail since both fds are sparse (-1) const sqe = try ring.read(0xcccccccc, fd_index, .{ .buffer = &buffer }, 0); - try testing.expectEqual(Op.READ, sqe.opcode); + try testing.expectEqual(Op.read, sqe.opcode); // TODO: use setflags function - sqe.flags.FIXED_FILE = true; + sqe.flags.fixed_file = true; try testing.expectEqual(@as(u32, 1), try ring.submit()); const cqe = try ring.copy_cqe(); @@ -4603,7 +4639,7 @@ test "shutdown" { try posix.getsockname(server, addrAny(&address), &slen); const shutdown_sqe = try ring.shutdown(0x445445445, server, .rd); - try testing.expectEqual(Op.SHUTDOWN, shutdown_sqe.opcode); + try testing.expectEqual(Op.shutdown, shutdown_sqe.opcode); try testing.expectEqual(@as(i32, server), shutdown_sqe.fd); try testing.expectEqual(@as(u32, 1), try ring.submit()); @@ -4631,7 +4667,7 @@ test "shutdown" { const shutdown_sqe = ring.shutdown(0x445445445, server, .rd) catch |err| switch (err) { else => |errno| std.debug.panic("unhandled errno: {}", .{errno}), }; - try testing.expectEqual(Op.SHUTDOWN, shutdown_sqe.opcode); + try testing.expectEqual(Op.shutdown, shutdown_sqe.opcode); try testing.expectEqual(@as(i32, server), shutdown_sqe.fd); try testing.expectEqual(@as(u32, 1), try ring.submit()); @@ -4674,7 +4710,7 @@ test "renameat" { new_path, .{}, ); - try testing.expectEqual(Op.RENAMEAT, sqe.opcode); + try testing.expectEqual(Op.renameat, sqe.opcode); try testing.expectEqual(@as(i32, tmp.dir.fd), sqe.fd); try testing.expectEqual(@as(i32, tmp.dir.fd), @as(i32, @bitCast(sqe.len))); try testing.expectEqual(@as(u32, 1), try ring.submit()); @@ -4728,7 +4764,7 @@ test "unlinkat" { path, .{}, ); - try testing.expectEqual(Op.UNLINKAT, sqe.opcode); + try testing.expectEqual(Op.unlinkat, sqe.opcode); try testing.expectEqual(@as(i32, tmp.dir.fd), sqe.fd); try testing.expectEqual(@as(u32, 1), try ring.submit()); @@ -4775,7 +4811,7 @@ test "mkdirat" { path, 0o0755, ); - try testing.expectEqual(Op.MKDIRAT, sqe.opcode); + try testing.expectEqual(Op.mkdirat, sqe.opcode); try testing.expectEqual(@as(i32, tmp.dir.fd), sqe.fd); try testing.expectEqual(@as(u32, 1), try ring.submit()); @@ -4823,7 +4859,7 @@ test "symlinkat" { tmp.dir.fd, link_path, ); - try testing.expectEqual(Op.SYMLINKAT, sqe.opcode); + try testing.expectEqual(Op.symlinkat, sqe.opcode); try testing.expectEqual(@as(i32, tmp.dir.fd), sqe.fd); try testing.expectEqual(@as(u32, 1), try ring.submit()); @@ -4876,7 +4912,7 @@ test "linkat" { second_path, .{}, ); - try testing.expectEqual(Op.LINKAT, sqe.opcode); + try testing.expectEqual(Op.linkat, sqe.opcode); try testing.expectEqual(@as(i32, tmp.dir.fd), sqe.fd); try testing.expectEqual(@as(i32, tmp.dir.fd), @as(i32, @bitCast(sqe.len))); try testing.expectEqual(@as(u32, 1), try ring.submit()); @@ -4923,7 +4959,7 @@ test "provide_buffers: read" { { const sqe = try ring.provide_buffers(0xcccccccc, @as([*]u8, @ptrCast(&buffers)), buffer_len, buffers.len, group_id, buffer_id); - try testing.expectEqual(Op.PROVIDE_BUFFERS, sqe.opcode); + try testing.expectEqual(Op.provide_buffers, sqe.opcode); try testing.expectEqual(@as(i32, buffers.len), sqe.fd); try 
testing.expectEqual(@as(u32, buffers[0].len), sqe.len); try testing.expectEqual(@as(u16, group_id), sqe.buf_index); @@ -4944,7 +4980,7 @@ test "provide_buffers: read" { var i: usize = 0; while (i < buffers.len) : (i += 1) { const sqe = try ring.read(0xdededede, fd, .{ .buffer_selection = .{ .group_id = group_id, .len = buffer_len } }, 0); - try testing.expectEqual(Op.READ, sqe.opcode); + try testing.expectEqual(Op.read, sqe.opcode); try testing.expectEqual(@as(i32, fd), sqe.fd); try testing.expectEqual(@as(u64, 0), sqe.addr); try testing.expectEqual(@as(u32, buffer_len), sqe.len); @@ -4957,7 +4993,7 @@ test "provide_buffers: read" { else => |errno| std.debug.panic("unhandled errno: {}", .{errno}), } - try testing.expect(cqe.flags.F_BUFFER); + try testing.expect(cqe.flags.f_buffer); const used_buffer_id = try cqe.buffer_id(); try testing.expect(used_buffer_id >= 0 and used_buffer_id <= 3); try testing.expectEqual(@as(i32, buffer_len), cqe.res); @@ -4970,7 +5006,7 @@ test "provide_buffers: read" { { const sqe = try ring.read(0xdfdfdfdf, fd, .{ .buffer_selection = .{ .group_id = group_id, .len = buffer_len } }, 0); - try testing.expectEqual(Op.READ, sqe.opcode); + try testing.expectEqual(Op.read, sqe.opcode); try testing.expectEqual(@as(i32, fd), sqe.fd); try testing.expectEqual(@as(u64, 0), sqe.addr); try testing.expectEqual(@as(u32, buffer_len), sqe.len); @@ -5009,7 +5045,7 @@ test "provide_buffers: read" { { const sqe = try ring.read(0xdfdfdfdf, fd, .{ .buffer_selection = .{ .group_id = group_id, .len = buffer_len } }, 0); - try testing.expectEqual(Op.READ, sqe.opcode); + try testing.expectEqual(Op.read, sqe.opcode); try testing.expectEqual(@as(i32, fd), sqe.fd); try testing.expectEqual(@as(u64, 0), sqe.addr); try testing.expectEqual(@as(u32, buffer_len), sqe.len); @@ -5022,7 +5058,7 @@ test "provide_buffers: read" { else => |errno| std.debug.panic("unhandled errno: {}", .{errno}), } - try testing.expect(cqe.flags.F_BUFFER); + try testing.expect(cqe.flags.f_buffer); const used_buffer_id = try cqe.buffer_id(); try testing.expectEqual(used_buffer_id, reprovided_buffer_id); try testing.expectEqual(@as(i32, buffer_len), cqe.res); @@ -5070,7 +5106,7 @@ test "remove_buffers" { { const sqe = try ring.remove_buffers(0xbababababa, 3, group_id); - try testing.expectEqual(Op.REMOVE_BUFFERS, sqe.opcode); + try testing.expectEqual(Op.remove_buffers, sqe.opcode); try testing.expectEqual(@as(i32, 3), sqe.fd); try testing.expectEqual(@as(u64, 0), sqe.addr); try testing.expectEqual(@as(u16, group_id), sqe.buf_index); @@ -5096,7 +5132,7 @@ test "remove_buffers" { else => |errno| std.debug.panic("unhandled errno: {}", .{errno}), } - try testing.expect(cqe.flags.F_BUFFER); + try testing.expect(cqe.flags.f_buffer); const used_buffer_id = try cqe.buffer_id(); try testing.expect(used_buffer_id >= 0 and used_buffer_id < 4); try testing.expectEqual(@as(i32, buffer_len), cqe.res); @@ -5140,7 +5176,7 @@ test "provide_buffers: accept/connect/send/recv" { { const sqe = try ring.provide_buffers(0xcccccccc, @as([*]u8, @ptrCast(&buffers)), buffer_len, buffers.len, group_id, buffer_id); - try testing.expectEqual(Op.PROVIDE_BUFFERS, sqe.opcode); + try testing.expectEqual(Op.provide_buffers, sqe.opcode); try testing.expectEqual(@as(i32, buffers.len), sqe.fd); try testing.expectEqual(@as(u32, buffer_len), sqe.len); try testing.expectEqual(@as(u16, group_id), sqe.buf_index); @@ -5182,13 +5218,13 @@ test "provide_buffers: accept/connect/send/recv" { var i: usize = 0; while (i < buffers.len) : (i += 1) { const sqe = try 
ring.recv(0xdededede, socket_test_harness.client, .{ .buffer_selection = .{ .group_id = group_id, .len = buffer_len } }, .{}); - try testing.expectEqual(Op.RECV, sqe.opcode); + try testing.expectEqual(Op.recv, sqe.opcode); try testing.expectEqual(@as(i32, socket_test_harness.client), sqe.fd); try testing.expectEqual(@as(u64, 0), sqe.addr); try testing.expectEqual(@as(u32, buffer_len), sqe.len); try testing.expectEqual(@as(u16, group_id), sqe.buf_index); try testing.expectEqual(@as(u32, 0), sqe.rw_flags); - try testing.expectEqual(Sqe.IoSqe{ .BUFFER_SELECT = true }, sqe.flags); + try testing.expectEqual(Sqe.IoSqe{ .buffer_select = true }, sqe.flags); try testing.expectEqual(@as(u32, 1), try ring.submit()); const cqe = try ring.copy_cqe(); @@ -5197,7 +5233,7 @@ test "provide_buffers: accept/connect/send/recv" { else => |errno| std.debug.panic("unhandled errno: {}", .{errno}), } - try testing.expect(cqe.flags.F_BUFFER); + try testing.expect(cqe.flags.f_buffer); const used_buffer_id = try cqe.buffer_id(); try testing.expect(used_buffer_id >= 0 and used_buffer_id <= 3); try testing.expectEqual(@as(i32, buffer_len), cqe.res); @@ -5211,13 +5247,13 @@ test "provide_buffers: accept/connect/send/recv" { { const sqe = try ring.recv(0xdfdfdfdf, socket_test_harness.client, .{ .buffer_selection = .{ .group_id = group_id, .len = buffer_len } }, .{}); - try testing.expectEqual(Op.RECV, sqe.opcode); + try testing.expectEqual(Op.recv, sqe.opcode); try testing.expectEqual(@as(i32, socket_test_harness.client), sqe.fd); try testing.expectEqual(@as(u64, 0), sqe.addr); try testing.expectEqual(@as(u32, buffer_len), sqe.len); try testing.expectEqual(@as(u16, group_id), sqe.buf_index); try testing.expectEqual(@as(u32, 0), sqe.rw_flags); - try testing.expectEqual(Sqe.IoSqe{ .BUFFER_SELECT = true }, sqe.flags); + try testing.expectEqual(Sqe.IoSqe{ .buffer_select = true }, sqe.flags); try testing.expectEqual(@as(u32, 1), try ring.submit()); const cqe = try ring.copy_cqe(); @@ -5261,13 +5297,13 @@ test "provide_buffers: accept/connect/send/recv" { { const sqe = try ring.recv(0xdfdfdfdf, socket_test_harness.client, .{ .buffer_selection = .{ .group_id = group_id, .len = buffer_len } }, .{}); - try testing.expectEqual(Op.RECV, sqe.opcode); + try testing.expectEqual(Op.recv, sqe.opcode); try testing.expectEqual(@as(i32, socket_test_harness.client), sqe.fd); try testing.expectEqual(@as(u64, 0), sqe.addr); try testing.expectEqual(@as(u32, buffer_len), sqe.len); try testing.expectEqual(@as(u16, group_id), sqe.buf_index); try testing.expectEqual(@as(u32, 0), sqe.rw_flags); - try testing.expectEqual(Sqe.IoSqe{ .BUFFER_SELECT = true }, sqe.flags); + try testing.expectEqual(Sqe.IoSqe{ .buffer_select = true }, sqe.flags); try testing.expectEqual(@as(u32, 1), try ring.submit()); const cqe = try ring.copy_cqe(); @@ -5276,7 +5312,7 @@ test "provide_buffers: accept/connect/send/recv" { else => |errno| std.debug.panic("unhandled errno: {}", .{errno}), } - try testing.expect(cqe.flags.F_BUFFER); + try testing.expect(cqe.flags.f_buffer); const used_buffer_id = try cqe.buffer_id(); try testing.expectEqual(used_buffer_id, reprovided_buffer_id); try testing.expectEqual(@as(i32, buffer_len), cqe.res); @@ -5344,7 +5380,7 @@ fn createSocketTestHarness(ring: *IoUring) !SocketTestHarness { // All good - return SocketTestHarness{ + return .{ .listener = listener_socket, .server = cqe_accept.res, .client = client, @@ -5403,7 +5439,7 @@ test "accept multishot" { if (cqe.err() == .INVAL) return error.SkipZigTest; try testing.expect(cqe.res > 0); try 
testing.expect(cqe.user_data == userdata); - try testing.expect(cqe.flags.F_MORE); // more flag is set + try testing.expect(cqe.flags.f_more); // more flag is set posix.close(client); } @@ -5428,7 +5464,7 @@ test "accept/connect/send_zc/recv" { // zero-copy send const sqe_send = try ring.send_zc(0xeeeeeeee, socket_test_harness.client, buffer_send[0..], .{}, .{}); // TODO: use io_link function - sqe_send.flags.IO_LINK = true; + sqe_send.flags.io_link = true; _ = try ring.recv(0xffffffff, socket_test_harness.server, .{ .buffer = buffer_recv[0..] }, .{}); try testing.expectEqual(@as(u32, 2), try ring.submit()); @@ -5441,7 +5477,7 @@ test "accept/connect/send_zc/recv" { try testing.expectEqual(Cqe{ .user_data = 0xeeeeeeee, .res = buffer_send.len, - .flags = .{ .F_MORE = true }, + .flags = .{ .f_more = true }, }, cqe_send); cqe_send, const cqe_recv = brk: { @@ -5453,7 +5489,7 @@ test "accept/connect/send_zc/recv" { try testing.expectEqual(Cqe{ .user_data = 0xffffffff, .res = buffer_recv.len, - .flags = .{ .F_SOCK_NONEMPTY = cqe_recv.flags.F_SOCK_NONEMPTY }, + .flags = .{ .f_sock_nonempty = cqe_recv.flags.f_sock_nonempty }, }, cqe_recv); try testing.expectEqualSlices(u8, buffer_send[0..buffer_recv.len], buffer_recv[0..]); @@ -5462,7 +5498,7 @@ test "accept/connect/send_zc/recv" { try testing.expectEqual(Cqe{ .user_data = 0xeeeeeeee, .res = 0, - .flags = .{ .F_NOTIF = true }, + .flags = .{ .f_notif = true }, }, cqe_send); } @@ -5521,7 +5557,7 @@ test "accept_direct" { // Flag linux.IOSQE_FIXED_FILE must be set. const recv_sqe = try ring.recv(read_userdata, fd_index, .{ .buffer = &buffer_recv }, .{}); // TODO: use setflags function - recv_sqe.flags.FIXED_FILE = true; + recv_sqe.flags.fixed_file = true; try testing.expectEqual(@as(u32, 1), try ring.submit()); // accept receive @@ -5595,7 +5631,7 @@ test "accept_multishot_direct" { const fd_index = cqe_accept.res; try testing.expect(fd_index < registered_fds.len); try testing.expect(cqe_accept.user_data == accept_userdata); - try testing.expect(cqe_accept.flags.F_MORE); // has more is set + try testing.expect(cqe_accept.flags.f_more); // has more is set } // No more available fds, accept will get NFILE error. // Multishot is terminated (more flag is not set). 
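 // (Each direct accept consumes one slot in the registered file table; once all
 // slots are in use the accept completes with NFILE, and since an errored
 // completion clears the more flag, the multishot accept terminates here.)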
@@ -5608,7 +5644,7 @@ test "accept_multishot_direct" { const cqe_accept = try ring.copy_cqe(); try testing.expect(cqe_accept.user_data == accept_userdata); try testing.expectEqual(posix.E.NFILE, cqe_accept.err()); - try testing.expect(!cqe_accept.flags.F_MORE); // has more is not set + try testing.expect(!cqe_accept.flags.f_more); // has more is not set } // return file descriptors to kernel try ring.register_files_update(0, registered_fds[0..]); @@ -5689,7 +5725,8 @@ test "socket_direct/socket_direct_alloc/close_direct" { _ = try ring.accept(accept_userdata, listener_socket, null, null, .{}); // prepare connect with fixed socket const connect_sqe = try ring.connect(connect_userdata, @intCast(fd_index), addrAny(&address), @sizeOf(linux.sockaddr.in)); - connect_sqe.flags.FIXED_FILE = true; // fd is fixed file index + // TODO: use setflags function + connect_sqe.flags.fixed_file = true; // fd is fixed file index // submit both try testing.expectEqual(@as(u32, 2), try ring.submit()); // get completions @@ -5878,7 +5915,7 @@ test BufferGroup { // Server uses buffer group receive { // Submit recv operation, buffer will be chosen from buffer group - _ = try buf_grp.recv(2, fds.server, 0); + _ = try buf_grp.recv(2, fds.server, .{}); const submitted = try ring.submit(); try testing.expectEqual(1, submitted); @@ -5961,13 +5998,13 @@ test "ring mapped buffers recv" { // 'no more buffers', until we put buffers to the kernel { const user_data = rnd.int(u64); - _ = try buf_grp.recv(user_data, fds.server, 0); + _ = try buf_grp.recv(user_data, fds.server, .{}); try testing.expectEqual(@as(u32, 1), try ring.submit()); const cqe = try ring.copy_cqe(); try testing.expectEqual(user_data, cqe.user_data); try testing.expect(cqe.res < 0); // fail try testing.expectEqual(posix.E.NOBUFS, cqe.err()); - try testing.expect(!cqe.flags.F_BUFFER); // IORING_CQE_F_BUFFER flags is set on success only + try testing.expect(!cqe.flags.f_buffer); // IORING_CQE_F_BUFFER flags is set on success only try testing.expectError(error.NoBufferSelected, cqe.buffer_id()); } @@ -6036,7 +6073,7 @@ test "ring mapped buffers multishot recv" { // start multishot recv var recv_user_data = rnd.int(u64); - _ = try buf_grp.recv_multishot(recv_user_data, fds.server, 0); + _ = try buf_grp.recv_multishot(recv_user_data, fds.server, .{}); try testing.expectEqual(@as(u32, 1), try ring.submit()); // submit // server reads data into provided buffers @@ -6044,11 +6081,11 @@ test "ring mapped buffers multishot recv" { // we read four chunks of 4, 4, 4, 4 bytes each var chunk: []const u8 = data[0..buffer_size]; // first chunk const cqe1 = try expect_buf_grp_cqe(&ring, &buf_grp, recv_user_data, chunk); - try testing.expect(cqe1.flags.F_MORE); + try testing.expect(cqe1.flags.f_more); chunk = data[buffer_size .. 
buffer_size * 2]; // second chunk const cqe2 = try expect_buf_grp_cqe(&ring, &buf_grp, recv_user_data, chunk); - try testing.expect(cqe2.flags.F_MORE); + try testing.expect(cqe2.flags.f_more); // both buffers provided to the kernel are used so we get error // 'no more buffers', until we put buffers to the kernel @@ -6057,10 +6094,11 @@ test "ring mapped buffers multishot recv" { try testing.expectEqual(recv_user_data, cqe.user_data); try testing.expect(cqe.res < 0); // fail try testing.expectEqual(posix.E.NOBUFS, cqe.err()); - try testing.expect(!cqe.flags.F_BUFFER); // IORING_CQE_F_BUFFER flags is set on success only + // IORING_CQE_F_BUFFER flags is set on success only + try testing.expect(!cqe.flags.f_buffer); // has more is not set // indicates that multishot is finished - try testing.expect(!cqe.flags.F_MORE); + try testing.expect(!cqe.flags.f_more); try testing.expectError(error.NoBufferSelected, cqe.buffer_id()); } @@ -6070,17 +6108,17 @@ test "ring mapped buffers multishot recv" { // restart multishot recv_user_data = rnd.int(u64); - _ = try buf_grp.recv_multishot(recv_user_data, fds.server, 0); + _ = try buf_grp.recv_multishot(recv_user_data, fds.server, .{}); try testing.expectEqual(@as(u32, 1), try ring.submit()); // submit chunk = data[buffer_size * 2 .. buffer_size * 3]; // third chunk const cqe3 = try expect_buf_grp_cqe(&ring, &buf_grp, recv_user_data, chunk); - try testing.expect(cqe3.flags.F_MORE); + try testing.expect(cqe3.flags.f_more); try buf_grp.put(cqe3); chunk = data[buffer_size * 3 ..]; // last chunk const cqe4 = try expect_buf_grp_cqe(&ring, &buf_grp, recv_user_data, chunk); - try testing.expect(cqe4.flags.F_MORE); + try testing.expect(cqe4.flags.f_more); try buf_grp.put(cqe4); // cancel pending multishot recv operation @@ -6120,7 +6158,7 @@ test "ring mapped buffers multishot recv" { try testing.expectEqual(recv_user_data, cqe_recv.user_data); try testing.expect(cqe_recv.res < 0); try testing.expect(cqe_recv.err() == .NOBUFS or cqe_recv.err() == .CANCELED); - try testing.expect(!cqe_recv.flags.F_MORE); + try testing.expect(!cqe_recv.flags.f_more); } } } @@ -6133,8 +6171,8 @@ fn buf_grp_recv_submit_get_cqe( user_data: u64, ) !Cqe { // prepare and submit recv - const sqe = try buf_grp.recv(user_data, fd, 0); - try testing.expect(sqe.flags.BUFFER_SELECT); + const sqe = try buf_grp.recv(user_data, fd, .{}); + try testing.expect(sqe.flags.buffer_select); try testing.expect(sqe.buf_index == buf_grp.group_id); try testing.expectEqual(@as(u32, 1), try ring.submit()); // submit // get cqe, expect success @@ -6142,7 +6180,7 @@ fn buf_grp_recv_submit_get_cqe( try testing.expectEqual(user_data, cqe.user_data); try testing.expect(cqe.res >= 0); // success try testing.expectEqual(posix.E.SUCCESS, cqe.err()); - try testing.expect(cqe.flags.F_BUFFER); // IORING_CQE_F_BUFFER flag is set + try testing.expect(cqe.flags.f_buffer); // IORING_CQE_F_BUFFER flag is set return cqe; } @@ -6157,7 +6195,7 @@ fn expect_buf_grp_cqe( const cqe = try ring.copy_cqe(); try testing.expectEqual(user_data, cqe.user_data); try testing.expect(cqe.res >= 0); // success - try testing.expect(cqe.flags.F_BUFFER); // IORING_CQE_F_BUFFER flag is set + try testing.expect(cqe.flags.f_buffer); // IORING_CQE_F_BUFFER flag is set try testing.expectEqual(expected.len, @as(usize, @intCast(cqe.res))); try testing.expectEqual(posix.E.SUCCESS, cqe.err()); @@ -6229,7 +6267,7 @@ test "bind/listen/connect" { const probe = ring.get_probe() catch return error.SkipZigTest; // LISTEN is higher required operation - if 
(!probe.is_supported(.LISTEN)) return error.SkipZigTest; + if (!probe.is_supported(.listen)) return error.SkipZigTest; var addr: linux.sockaddr.in = .{ .port = 0, From 9465385047ca97fabdb122c08375d22b5b4ca74d Mon Sep 17 00:00:00 2001 From: Bernard Assan Date: Fri, 10 Oct 2025 23:48:26 +0000 Subject: [PATCH 16/41] replace direct set of some flags with link_next and set_flags calls Signed-off-by: Bernard Assan --- lib/std/os/linux/IoUring.zig | 41 ++++++++++++++---------------------- 1 file changed, 16 insertions(+), 25 deletions(-) diff --git a/lib/std/os/linux/IoUring.zig b/lib/std/os/linux/IoUring.zig index 0dfea27fcc9e..1b0bc9a102b1 100644 --- a/lib/std/os/linux/IoUring.zig +++ b/lib/std/os/linux/IoUring.zig @@ -3863,15 +3863,13 @@ test "splice/read" { try testing.expectEqual(Op.splice, sqe_splice_to_pipe.opcode); try testing.expectEqual(@as(u64, 0), sqe_splice_to_pipe.addr); try testing.expectEqual(pipe_offset, sqe_splice_to_pipe.off); - // TODO: use io_link function - sqe_splice_to_pipe.flags.io_link = true; + sqe_splice_to_pipe.link_next(); const sqe_splice_from_pipe = try ring.splice(0x22222222, fds[0], pipe_offset, fd_dst, 10, buffer_write.len); try testing.expectEqual(Op.splice, sqe_splice_from_pipe.opcode); try testing.expectEqual(pipe_offset, sqe_splice_from_pipe.addr); try testing.expectEqual(@as(u64, 10), sqe_splice_from_pipe.off); - // TODO: use io_link function - sqe_splice_from_pipe.flags.io_link = true; + sqe_splice_from_pipe.link_next(); const sqe_read = try ring.read(0x33333333, fd_dst, .{ .buffer = buffer_read[0..] }, 10); try testing.expectEqual(Op.read, sqe_read.opcode); @@ -3942,8 +3940,7 @@ test "write_fixed/read_fixed" { const sqe_write = try ring.write_fixed(0x45454545, fd, &buffers[0], 3, 0); try testing.expectEqual(Op.write_fixed, sqe_write.opcode); try testing.expectEqual(@as(u64, 3), sqe_write.off); - // TODO: use io_link function - sqe_write.flags.io_link = true; + sqe_write.link_next(); const sqe_read = try ring.read_fixed(0x12121212, fd, &buffers[1], 0, 1); try testing.expectEqual(Op.read_fixed, sqe_read.opcode); @@ -4072,8 +4069,8 @@ test "accept/connect/send/recv" { var buffer_recv = [_]u8{ 0, 1, 0, 1, 0 }; const sqe_send = try ring.send(0xeeeeeeee, socket_test_harness.client, buffer_send[0..], .{}); - // TODO: use io_link function - sqe_send.flags.io_link = true; + sqe_send.link_next(); + _ = try ring.recv(0xffffffff, socket_test_harness.server, .{ .buffer = buffer_recv[0..] }, .{}); try testing.expectEqual(@as(u32, 2), try ring.submit()); @@ -4090,9 +4087,8 @@ test "accept/connect/send/recv" { try testing.expectEqual(Cqe{ .user_data = 0xffffffff, .res = buffer_recv.len, - // TODO: comment seems to be wrong but clarify with previous maintainers - // Only check IORING_CQE_F_SOCK_NONEMPTY flag, as other flags are system-dependent (Might be more appropriate) - // ignore IORING_CQE_F_SOCK_NONEMPTY since it is only set on some systems + // Only check IORING_CQE_F_SOCK_NONEMPTY flag, as other flags are + // system-dependent .flags = .{ .f_sock_nonempty = cqe_recv.flags.f_sock_nonempty }, }, cqe_recv); @@ -4333,8 +4329,7 @@ test "accept/connect/recv/link_timeout" { var buffer_recv = [_]u8{ 0, 1, 0, 1, 0 }; const sqe_recv = try ring.recv(0xffffffff, socket_test_harness.server, .{ .buffer = buffer_recv[0..] 
}, .{}); - // TODO: use io_link function - sqe_recv.flags.io_link = true; + sqe_recv.link_next(); const ts = linux.kernel_timespec{ .sec = 0, .nsec = 1000000 }; _ = try ring.link_timeout(0x22222222, &ts, .{}); @@ -4559,8 +4554,7 @@ test "register_files_update" { { const sqe = try ring.read(0xcccccccc, fd_index, .{ .buffer = &buffer }, 0); try testing.expectEqual(Op.read, sqe.opcode); - // TODO: use setflags function - sqe.flags.fixed_file = true; + sqe.set_flags(.{ .fixed_file = true }); try testing.expectEqual(@as(u32, 1), try ring.submit()); try testing.expectEqual(Cqe{ @@ -4581,8 +4575,7 @@ test "register_files_update" { // Next read should still work since fd_index in the registered file descriptors hasn't been updated yet. const sqe = try ring.read(0xcccccccc, fd_index, .{ .buffer = &buffer }, 0); try testing.expectEqual(Op.read, sqe.opcode); - // TODO: use setflags function - sqe.flags.fixed_file = true; + sqe.set_flags(.{ .fixed_file = true }); try testing.expectEqual(@as(u32, 1), try ring.submit()); try testing.expectEqual(Cqe{ @@ -4599,8 +4592,7 @@ test "register_files_update" { // Now this should fail since both fds are sparse (-1) const sqe = try ring.read(0xcccccccc, fd_index, .{ .buffer = &buffer }, 0); try testing.expectEqual(Op.read, sqe.opcode); - // TODO: use setflags function - sqe.flags.fixed_file = true; + sqe.set_flags(.{ .fixed_file = true }); try testing.expectEqual(@as(u32, 1), try ring.submit()); const cqe = try ring.copy_cqe(); @@ -5463,8 +5455,7 @@ test "accept/connect/send_zc/recv" { // zero-copy send const sqe_send = try ring.send_zc(0xeeeeeeee, socket_test_harness.client, buffer_send[0..], .{}, .{}); - // TODO: use io_link function - sqe_send.flags.io_link = true; + sqe_send.link_next(); _ = try ring.recv(0xffffffff, socket_test_harness.server, .{ .buffer = buffer_recv[0..] }, .{}); try testing.expectEqual(@as(u32, 2), try ring.submit()); @@ -5556,8 +5547,8 @@ test "accept_direct" { // Fd field is set to registered file index, returned by accept. // Flag linux.IOSQE_FIXED_FILE must be set. 
const recv_sqe = try ring.recv(read_userdata, fd_index, .{ .buffer = &buffer_recv }, .{}); - // TODO: use setflags function - recv_sqe.flags.fixed_file = true; + recv_sqe.set_flags(.{ .fixed_file = true }); + try testing.expectEqual(@as(u32, 1), try ring.submit()); // accept receive @@ -5725,8 +5716,8 @@ test "socket_direct/socket_direct_alloc/close_direct" { _ = try ring.accept(accept_userdata, listener_socket, null, null, .{}); // prepare connect with fixed socket const connect_sqe = try ring.connect(connect_userdata, @intCast(fd_index), addrAny(&address), @sizeOf(linux.sockaddr.in)); - // TODO: use setflags function - connect_sqe.flags.fixed_file = true; // fd is fixed file index + // fd is fixed file index + connect_sqe.set_flags(.{ .fixed_file = true }); // submit both try testing.expectEqual(@as(u32, 2), try ring.submit()); // get completions From 0d16810f7059a18a50cb87e8f7e1c079f8759a5e Mon Sep 17 00:00:00 2001 From: Bernard Assan Date: Sat, 11 Oct 2025 00:00:52 +0000 Subject: [PATCH 17/41] Remove io_uring bits and pieces from linux.zig now that IoUring.zig is self-contained Signed-off-by: Bernard Assan --- lib/std/os/linux.zig | 659 +------------------------------------------ 1 file changed, 1 insertion(+), 658 deletions(-) diff --git a/lib/std/os/linux.zig b/lib/std/os/linux.zig index 127211b81f44..067e454a7657 100644 --- a/lib/std/os/linux.zig +++ b/lib/std/os/linux.zig @@ -22,6 +22,7 @@ const iovec = std.posix.iovec; const iovec_const = std.posix.iovec_const; const winsize = std.posix.winsize; const ACCMODE = std.posix.ACCMODE; +pub const IoUring = @import("linux/IoUring.zig"); test { if (builtin.os.tag == .linux) { @@ -6664,664 +6665,6 @@ else fields: siginfo_fields_union, }; -// io_uring_params.flags - -/// io_context is polled -pub const IORING_SETUP_IOPOLL = 1 << 0; - -/// SQ poll thread -pub const IORING_SETUP_SQPOLL = 1 << 1; - -/// sq_thread_cpu is valid -pub const IORING_SETUP_SQ_AFF = 1 << 2; - -/// app defines CQ size -pub const IORING_SETUP_CQSIZE = 1 << 3; - -/// clamp SQ/CQ ring sizes -pub const IORING_SETUP_CLAMP = 1 << 4; - -/// attach to existing wq -pub const IORING_SETUP_ATTACH_WQ = 1 << 5; - -/// start with ring disabled -pub const IORING_SETUP_R_DISABLED = 1 << 6; - -/// continue submit on error -pub const IORING_SETUP_SUBMIT_ALL = 1 << 7; - -/// Cooperative task running. When requests complete, they often require -/// forcing the submitter to transition to the kernel to complete. If this -/// flag is set, work will be done when the task transitions anyway, rather -/// than force an inter-processor interrupt reschedule. This avoids interrupting -/// a task running in userspace, and saves an IPI. -pub const IORING_SETUP_COOP_TASKRUN = 1 << 8; - -/// If COOP_TASKRUN is set, get notified if task work is available for -/// running and a kernel transition would be needed to run it. This sets -/// IORING_SQ_TASKRUN in the sq ring flags. Not valid with COOP_TASKRUN. -pub const IORING_SETUP_TASKRUN_FLAG = 1 << 9; - -/// SQEs are 128 byte -pub const IORING_SETUP_SQE128 = 1 << 10; -/// CQEs are 32 byte -pub const IORING_SETUP_CQE32 = 1 << 11; - -/// Only one task is allowed to submit requests -pub const IORING_SETUP_SINGLE_ISSUER = 1 << 12; - -/// Defer running task work to get events. -/// Rather than running bits of task work whenever the task transitions -/// try to do it just before it is needed.
-pub const IORING_SETUP_DEFER_TASKRUN = 1 << 13; - -/// Application provides ring memory -pub const IORING_SETUP_NO_MMAP = 1 << 14; - -/// Register the ring fd in itself for use with -/// IORING_REGISTER_USE_REGISTERED_RING; return a registered fd index rather -/// than an fd. -pub const IORING_SETUP_REGISTERED_FD_ONLY = 1 << 15; - -/// Removes indirection through the SQ index array. -pub const IORING_SETUP_NO_SQARRAY = 1 << 16; - -pub const IoUring = @import("linux/IoUring.zig"); - -/// If sqe->file_index is set to this for opcodes that instantiate a new -/// direct descriptor (like openat/openat2/accept), then io_uring will allocate -/// an available direct descriptor instead of having the application pass one -/// in. The picked direct descriptor will be returned in cqe->res, or -ENFILE -/// if the space is full. -/// Available since Linux 5.19 -pub const IORING_FILE_INDEX_ALLOC = maxInt(u32); - -pub const IOSQE_BIT = enum(u8) { - FIXED_FILE, - IO_DRAIN, - IO_LINK, - IO_HARDLINK, - ASYNC, - BUFFER_SELECT, - CQE_SKIP_SUCCESS, - - _, -}; - -// io_uring_sqe.flags - -/// use fixed fileset -pub const IOSQE_FIXED_FILE = 1 << @intFromEnum(IOSQE_BIT.FIXED_FILE); - -/// issue after inflight IO -pub const IOSQE_IO_DRAIN = 1 << @intFromEnum(IOSQE_BIT.IO_DRAIN); - -/// links next sqe -pub const IOSQE_IO_LINK = 1 << @intFromEnum(IOSQE_BIT.IO_LINK); - -/// like LINK, but stronger -pub const IOSQE_IO_HARDLINK = 1 << @intFromEnum(IOSQE_BIT.IO_HARDLINK); - -/// always go async -pub const IOSQE_ASYNC = 1 << @intFromEnum(IOSQE_BIT.ASYNC); - -/// select buffer from buf_group -pub const IOSQE_BUFFER_SELECT = 1 << @intFromEnum(IOSQE_BIT.BUFFER_SELECT); - -/// don't post CQE if request succeeded -/// Available since Linux 5.17 -pub const IOSQE_CQE_SKIP_SUCCESS = 1 << @intFromEnum(IOSQE_BIT.CQE_SKIP_SUCCESS); - -pub const IORING_OP = enum(u8) { - NOP, - READV, - WRITEV, - FSYNC, - READ_FIXED, - WRITE_FIXED, - POLL_ADD, - POLL_REMOVE, - SYNC_FILE_RANGE, - SENDMSG, - RECVMSG, - TIMEOUT, - TIMEOUT_REMOVE, - ACCEPT, - ASYNC_CANCEL, - LINK_TIMEOUT, - CONNECT, - FALLOCATE, - OPENAT, - CLOSE, - FILES_UPDATE, - STATX, - READ, - WRITE, - FADVISE, - MADVISE, - SEND, - RECV, - OPENAT2, - EPOLL_CTL, - SPLICE, - PROVIDE_BUFFERS, - REMOVE_BUFFERS, - TEE, - SHUTDOWN, - RENAMEAT, - UNLINKAT, - MKDIRAT, - SYMLINKAT, - LINKAT, - MSG_RING, - FSETXATTR, - SETXATTR, - FGETXATTR, - GETXATTR, - SOCKET, - URING_CMD, - SEND_ZC, - SENDMSG_ZC, - READ_MULTISHOT, - WAITID, - FUTEX_WAIT, - FUTEX_WAKE, - FUTEX_WAITV, - FIXED_FD_INSTALL, - FTRUNCATE, - BIND, - LISTEN, - RECV_ZC, - - _, -}; -// io_uring_sqe.uring_cmd_flags (rw_flags in the Zig struct) - -/// use registered buffer; pass thig flag along with setting sqe->buf_index. 
-pub const IORING_URING_CMD_FIXED = 1 << 0; - -// io_uring_sqe.fsync_flags (rw_flags in the Zig struct) -pub const IORING_FSYNC_DATASYNC = 1 << 0; - -// io_uring_sqe.timeout_flags (rw_flags in the Zig struct) -pub const IORING_TIMEOUT_ABS = 1 << 0; -pub const IORING_TIMEOUT_UPDATE = 1 << 1; // Available since Linux 5.11 -pub const IORING_TIMEOUT_BOOTTIME = 1 << 2; // Available since Linux 5.15 -pub const IORING_TIMEOUT_REALTIME = 1 << 3; // Available since Linux 5.15 -pub const IORING_LINK_TIMEOUT_UPDATE = 1 << 4; // Available since Linux 5.15 -pub const IORING_TIMEOUT_ETIME_SUCCESS = 1 << 5; // Available since Linux 5.16 -pub const IORING_TIMEOUT_CLOCK_MASK = IORING_TIMEOUT_BOOTTIME | IORING_TIMEOUT_REALTIME; -pub const IORING_TIMEOUT_UPDATE_MASK = IORING_TIMEOUT_UPDATE | IORING_LINK_TIMEOUT_UPDATE; - -// io_uring_sqe.splice_flags (rw_flags in the Zig struct) -// extends splice(2) flags -pub const IORING_SPLICE_F_FD_IN_FIXED = 1 << 31; - -// POLL_ADD flags. -// Note that since sqe->poll_events (rw_flags in the Zig struct) is the flag space, the command flags for POLL_ADD are stored in sqe->len. - -/// Multishot poll. Sets IORING_CQE_F_MORE if the poll handler will continue to report CQEs on behalf of the same SQE. -pub const IORING_POLL_ADD_MULTI = 1 << 0; -/// Update existing poll request, matching sqe->addr as the old user_data field. -pub const IORING_POLL_UPDATE_EVENTS = 1 << 1; -pub const IORING_POLL_UPDATE_USER_DATA = 1 << 2; -pub const IORING_POLL_ADD_LEVEL = 1 << 3; - -// ASYNC_CANCEL flags. - -/// Cancel all requests that match the given key -pub const IORING_ASYNC_CANCEL_ALL = 1 << 0; -/// Key off 'fd' for cancelation rather than the request 'user_data'. -pub const IORING_ASYNC_CANCEL_FD = 1 << 1; -/// Match any request -pub const IORING_ASYNC_CANCEL_ANY = 1 << 2; -/// 'fd' passed in is a fixed descriptor. Available since Linux 6.0 -pub const IORING_ASYNC_CANCEL_FD_FIXED = 1 << 3; - -// send/sendmsg and recv/recvmsg flags (sqe->ioprio) - -/// If set, instead of first attempting to send or receive and arm poll if that yields an -EAGAIN result, -/// arm poll upfront and skip the initial transfer attempt. -pub const IORING_RECVSEND_POLL_FIRST = 1 << 0; -/// Multishot recv. Sets IORING_CQE_F_MORE if the handler will continue to report CQEs on behalf of the same SQE. -pub const IORING_RECV_MULTISHOT = 1 << 1; -/// Use registered buffers, the index is stored in the buf_index field. -pub const IORING_RECVSEND_FIXED_BUF = 1 << 2; -/// If set, SEND[MSG]_ZC should report the zerocopy usage in cqe.res for the IORING_CQE_F_NOTIF cqe. -pub const IORING_SEND_ZC_REPORT_USAGE = 1 << 3; -/// If set, send or recv will grab as many buffers from the buffer group ID given and send them all. -/// The completion result will be the number of buffers send, with the starting buffer ID in cqe as per usual. -/// The buffers be contigious from the starting buffer ID. -/// Used with IOSQE_BUFFER_SELECT. -pub const IORING_RECVSEND_BUNDLE = 1 << 4; -/// CQE.RES FOR IORING_CQE_F_NOTIF if IORING_SEND_ZC_REPORT_USAGE was requested -pub const IORING_NOTIF_USAGE_ZC_COPIED = 1 << 31; - -/// accept flags stored in sqe->iopri -pub const IORING_ACCEPT_MULTISHOT = 1 << 0; - -/// IORING_OP_MSG_RING command types, stored in sqe->addr -pub const IORING_MSG_RING_COMMAND = enum(u8) { - /// pass sqe->len as 'res' and off as user_data - DATA, - /// send a registered fd to another ring - SEND_FD, -}; - -// io_uring_sqe.msg_ring_flags (rw_flags in the Zig struct) - -/// Don't post a CQE to the target ring. 
Not applicable for IORING_MSG_DATA, obviously. -pub const IORING_MSG_RING_CQE_SKIP = 1 << 0; - -/// Pass through the flags from sqe->file_index (splice_fd_in in the zig struct) to cqe->flags */ -pub const IORING_MSG_RING_FLAGS_PASS = 1 << 1; - -// IO completion data structure (Completion Queue Entry) -pub const io_uring_cqe = extern struct { - /// io_uring_sqe.data submission passed back - user_data: u64, - - /// result code for this event - res: i32, - flags: u32, - - // Followed by 16 bytes of padding if initialized with IORING_SETUP_CQE32, doubling cqe size - - pub fn err(self: io_uring_cqe) E { - if (self.res > -4096 and self.res < 0) { - return @as(E, @enumFromInt(-self.res)); - } - return .SUCCESS; - } - - // On successful completion of the provided buffers IO request, the CQE flags field - // will have IORING_CQE_F_BUFFER set and the selected buffer ID will be indicated by - // the upper 16-bits of the flags field. - pub fn buffer_id(self: io_uring_cqe) !u16 { - if (self.flags & IORING_CQE_F_BUFFER != IORING_CQE_F_BUFFER) { - return error.NoBufferSelected; - } - return @as(u16, @intCast(self.flags >> IORING_CQE_BUFFER_SHIFT)); - } -}; - -// io_uring_cqe.flags - -/// If set, the upper 16 bits are the buffer ID -pub const IORING_CQE_F_BUFFER = 1 << 0; -/// If set, parent SQE will generate more CQE entries. -/// Available since Linux 5.13. -pub const IORING_CQE_F_MORE = 1 << 1; -/// If set, more data to read after socket recv -pub const IORING_CQE_F_SOCK_NONEMPTY = 1 << 2; -/// Set for notification CQEs. Can be used to distinct them from sends. -pub const IORING_CQE_F_NOTIF = 1 << 3; -/// If set, the buffer ID set in the completion will get more completions. -pub const IORING_CQE_F_BUF_MORE = 1 << 4; - -pub const IORING_CQE_BUFFER_SHIFT = 16; - -/// Magic offsets for the application to mmap the data it needs -pub const IORING_OFF_SQ_RING = 0; -pub const IORING_OFF_CQ_RING = 0x8000000; -pub const IORING_OFF_SQES = 0x10000000; - -/// Filled with the offset for mmap(2) -pub const io_sqring_offsets = extern struct { - /// offset of ring head - head: u32, - - /// offset of ring tail - tail: u32, - - /// ring mask value - ring_mask: u32, - - /// entries in ring - ring_entries: u32, - - /// ring flags - flags: u32, - - /// number of sqes not submitted - dropped: u32, - - /// sqe index array - array: u32, - - resv1: u32, - user_addr: u64, -}; - -// io_sqring_offsets.flags - -/// needs io_uring_enter wakeup -pub const IORING_SQ_NEED_WAKEUP = 1 << 0; -/// kernel has cqes waiting beyond the cq ring -pub const IORING_SQ_CQ_OVERFLOW = 1 << 1; -/// task should enter the kernel -pub const IORING_SQ_TASKRUN = 1 << 2; - -pub const io_cqring_offsets = extern struct { - head: u32, - tail: u32, - ring_mask: u32, - ring_entries: u32, - overflow: u32, - cqes: u32, - flags: u32, - resv: u32, - user_addr: u64, -}; - -// io_cqring_offsets.flags - -/// disable eventfd notifications -pub const IORING_CQ_EVENTFD_DISABLED = 1 << 0; - -// io_uring_enter flags -pub const IORING_ENTER_GETEVENTS = 1 << 0; -pub const IORING_ENTER_SQ_WAKEUP = 1 << 1; -pub const IORING_ENTER_SQ_WAIT = 1 << 2; -pub const IORING_ENTER_EXT_ARG = 1 << 3; -pub const IORING_ENTER_REGISTERED_RING = 1 << 4; - -pub const io_uring_params = extern struct { - sq_entries: u32, - cq_entries: u32, - flags: u32, - sq_thread_cpu: u32, - sq_thread_idle: u32, - features: u32, - wq_fd: u32, - resv: [3]u32, - sq_off: io_sqring_offsets, - cq_off: io_cqring_offsets, -}; - -// io_uring_params.features flags - -pub const IORING_FEAT_SINGLE_MMAP = 1 << 0; -pub 
const IORING_FEAT_NODROP = 1 << 1; -pub const IORING_FEAT_SUBMIT_STABLE = 1 << 2; -pub const IORING_FEAT_RW_CUR_POS = 1 << 3; -pub const IORING_FEAT_CUR_PERSONALITY = 1 << 4; -pub const IORING_FEAT_FAST_POLL = 1 << 5; -pub const IORING_FEAT_POLL_32BITS = 1 << 6; -pub const IORING_FEAT_SQPOLL_NONFIXED = 1 << 7; -pub const IORING_FEAT_EXT_ARG = 1 << 8; -pub const IORING_FEAT_NATIVE_WORKERS = 1 << 9; -pub const IORING_FEAT_RSRC_TAGS = 1 << 10; -pub const IORING_FEAT_CQE_SKIP = 1 << 11; -pub const IORING_FEAT_LINKED_FILE = 1 << 12; - -// io_uring_register opcodes and arguments -pub const IORING_REGISTER = enum(u32) { - REGISTER_BUFFERS, - UNREGISTER_BUFFERS, - REGISTER_FILES, - UNREGISTER_FILES, - REGISTER_EVENTFD, - UNREGISTER_EVENTFD, - REGISTER_FILES_UPDATE, - REGISTER_EVENTFD_ASYNC, - REGISTER_PROBE, - REGISTER_PERSONALITY, - UNREGISTER_PERSONALITY, - REGISTER_RESTRICTIONS, - REGISTER_ENABLE_RINGS, - - // extended with tagging - REGISTER_FILES2, - REGISTER_FILES_UPDATE2, - REGISTER_BUFFERS2, - REGISTER_BUFFERS_UPDATE, - - // set/clear io-wq thread affinities - REGISTER_IOWQ_AFF, - UNREGISTER_IOWQ_AFF, - - // set/get max number of io-wq workers - REGISTER_IOWQ_MAX_WORKERS, - - // register/unregister io_uring fd with the ring - REGISTER_RING_FDS, - UNREGISTER_RING_FDS, - - // register ring based provide buffer group - REGISTER_PBUF_RING, - UNREGISTER_PBUF_RING, - - // sync cancelation API - REGISTER_SYNC_CANCEL, - - // register a range of fixed file slots for automatic slot allocation - REGISTER_FILE_ALLOC_RANGE, - - // return status information for a buffer group - REGISTER_PBUF_STATUS, - - // set/clear busy poll settings - REGISTER_NAPI, - UNREGISTER_NAPI, - - REGISTER_CLOCK, - - // clone registered buffers from source ring to current ring - REGISTER_CLONE_BUFFERS, - - // send MSG_RING without having a ring - REGISTER_SEND_MSG_RING, - - // register a netdev hw rx queue for zerocopy - REGISTER_ZCRX_IFQ, - - // resize CQ ring - REGISTER_RESIZE_RINGS, - - REGISTER_MEM_REGION, - - // flag added to the opcode to use a registered ring fd - REGISTER_USE_REGISTERED_RING = 1 << 31, - - _, -}; - -/// io_uring_restriction->opcode values -pub const IOWQ_CATEGORIES = enum(u8) { - BOUND, - UNBOUND, -}; - -/// deprecated, see struct io_uring_rsrc_update -pub const io_uring_files_update = extern struct { - offset: u32, - resv: u32, - fds: u64, -}; - -/// Register a fully sparse file space, rather than pass in an array of all -1 file descriptors. 
-pub const IORING_RSRC_REGISTER_SPARSE = 1 << 0; - -pub const io_uring_rsrc_register = extern struct { - nr: u32, - flags: u32, - resv2: u64, - data: u64, - tags: u64, -}; - -pub const io_uring_rsrc_update = extern struct { - offset: u32, - resv: u32, - data: u64, -}; - -pub const io_uring_rsrc_update2 = extern struct { - offset: u32, - resv: u32, - data: u64, - tags: u64, - nr: u32, - resv2: u32, -}; - -pub const io_uring_notification_slot = extern struct { - tag: u64, - resv: [3]u64, -}; - -pub const io_uring_notification_register = extern struct { - nr_slots: u32, - resv: u32, - resv2: u64, - data: u64, - resv3: u64, -}; - -pub const io_uring_napi = extern struct { - busy_poll_to: u32, - prefer_busy_poll: u8, - _pad: [3]u8, - resv: u64, -}; - -/// Skip updating fd indexes set to this value in the fd table */ -pub const IORING_REGISTER_FILES_SKIP = -2; - -pub const IO_URING_OP_SUPPORTED = 1 << 0; - -pub const io_uring_probe_op = extern struct { - op: IORING_OP, - resv: u8, - /// IO_URING_OP_* flags - flags: u16, - resv2: u32, - - pub fn is_supported(self: @This()) bool { - return self.flags & IO_URING_OP_SUPPORTED != 0; - } -}; - -pub const io_uring_probe = extern struct { - /// Last opcode supported - last_op: IORING_OP, - /// Length of ops[] array below - ops_len: u8, - resv: u16, - resv2: [3]u32, - ops: [256]io_uring_probe_op, - - /// Is the operation supported on the running kernel. - pub fn is_supported(self: @This(), op: IORING_OP) bool { - const i = @intFromEnum(op); - if (i > @intFromEnum(self.last_op) or i >= self.ops_len) - return false; - return self.ops[i].is_supported(); - } -}; - -pub const io_uring_restriction = extern struct { - opcode: IORING_RESTRICTION, - arg: extern union { - /// IORING_RESTRICTION_REGISTER_OP - register_op: IORING_REGISTER, - - /// IORING_RESTRICTION_SQE_OP - sqe_op: IORING_OP, - - /// IORING_RESTRICTION_SQE_FLAGS_* - sqe_flags: u8, - }, - resv: u8, - resv2: [3]u32, -}; - -/// io_uring_restriction->opcode values -pub const IORING_RESTRICTION = enum(u16) { - /// Allow an io_uring_register(2) opcode - REGISTER_OP = 0, - - /// Allow an sqe opcode - SQE_OP = 1, - - /// Allow sqe flags - SQE_FLAGS_ALLOWED = 2, - - /// Require sqe flags (these flags must be set on each submission) - SQE_FLAGS_REQUIRED = 3, - - _, -}; - -pub const IO_URING_SOCKET_OP = enum(u16) { - SIOCIN = 0, - SIOCOUTQ = 1, - GETSOCKOPT = 2, - SETSOCKOPT = 3, -}; - -pub const io_uring_buf = extern struct { - addr: u64, - len: u32, - bid: u16, - resv: u16, -}; - -pub const io_uring_buf_ring = extern struct { - resv1: u64, - resv2: u32, - resv3: u16, - tail: u16, -}; - -/// argument for IORING_(UN)REGISTER_PBUF_RING -pub const io_uring_buf_reg = extern struct { - ring_addr: u64, - ring_entries: u32, - bgid: u16, - flags: Flags, - resv: [3]u64, - - pub const Flags = packed struct { - _0: u1 = 0, - /// Incremental buffer consumption. 
- inc: bool, - _: u14 = 0, - }; -}; - -pub const io_uring_getevents_arg = extern struct { - sigmask: u64, - sigmask_sz: u32, - pad: u32, - ts: u64, -}; - -/// Argument for IORING_REGISTER_SYNC_CANCEL -pub const io_uring_sync_cancel_reg = extern struct { - addr: u64, - fd: i32, - flags: u32, - timeout: kernel_timespec, - pad: [4]u64, -}; - -/// Argument for IORING_REGISTER_FILE_ALLOC_RANGE -/// The range is specified as [off, off + len) -pub const io_uring_file_index_range = extern struct { - off: u32, - len: u32, - resv: u64, -}; - -pub const io_uring_recvmsg_out = extern struct { - namelen: u32, - controllen: u32, - payloadlen: u32, - flags: u32, -}; - pub const utsname = extern struct { sysname: [64:0]u8, nodename: [64:0]u8, From e514091ac2fe5a0ba999f67f6142b213922f2d59 Mon Sep 17 00:00:00 2001 From: Bernard Assan Date: Sat, 11 Oct 2025 14:57:37 +0000 Subject: [PATCH 18/41] Fix posix.W in process/Child.zig; add deprecated linux.W fns Signed-off-by: Bernard Assan --- lib/std/os/linux.zig | 81 +++++++++++++++++++++++++++++---------- lib/std/process/Child.zig | 15 ++++---- 2 files changed, 68 insertions(+), 28 deletions(-) diff --git a/lib/std/os/linux.zig b/lib/std/os/linux.zig index 067e454a7657..97482a215480 100644 --- a/lib/std/os/linux.zig +++ b/lib/std/os/linux.zig @@ -3691,46 +3691,85 @@ pub const R_OK = 4; pub const W = packed struct(u32) { nohang: bool = false, - untraced_or_stopped: packed union { - untraced: bool, - stopped: bool, - } = @bitCast(false), + stopped: bool = false, exited: bool = false, continued: bool = false, - _unused: u20 = 0, + _5: u20 = 0, nowait: bool = false, - _unused_1: u7 = 0, + _26: u7 = 0, + /// alias to stopped + pub const untraced: W = .{ .stopped = true }; - // Deprecated aliases + fn toInt(s: W) u32 { + return @bitCast(s); + } + + /// matches EXITSTATUS in C + pub fn exitStatus(s: W) u8 { + return @intCast((s.toInt() & 0xff00) >> 8); + } + + /// matches TERMSIG in C + pub fn termSig(s: W) u32 { + return s.toInt() & 0x7f; + } + + /// matches STOPSIG in C + pub fn stopSig(s: W) u32 { + return exitStatus(s); + } + + /// matches IFEXITED in C + pub fn ifExited(s: W) bool { + return termSig(s) == 0; + } + + /// matches IFSTOPPED in C + pub fn ifStopped(s: W) bool { + return @as(u16, @truncate(((s.toInt() & 0xffff) *% 0x10001) >> 8)) > 0x7f00; + } + + /// matches IFSIGNALED in C + pub fn ifSignaled(s: W) bool { + return (s.toInt() & 0xffff) -% 1 < 0xff; + } + + // Deprecated constants pub const NOHANG: u32 = @bitCast(W{ .nohang = true }); - pub const UNTRACED: u32 = @bitCast(W{ .untraced_or_stopped = .{ .untraced = true } }); - pub const STOPPED: u32 = @bitCast(W{ .untraced_or_stopped = .{ .stopped = true } }); + pub const STOPPED: u32 = @bitCast(W{ .stopped = true }); + pub const UNTRACED: u32 = @bitCast(untraced); pub const EXITED: u32 = @bitCast(W{ .exited = true }); pub const CONTINUED: u32 = @bitCast(W{ .continued = true }); pub const NOWAIT: u32 = @bitCast(W{ .nowait = true }); - pub fn EXITSTATUS(s: W) u8 { - return @intCast((@as(u32, @bitCast(s)) & 0xff00) >> 8); + /// DEPRECATED alias to exitStatus + pub fn EXITSTATUS(s: u32) u8 { + return exitStatus(@bitCast(s)); } - pub fn TERMSIG(s: W) u32 { - return @as(u32, @bitCast(s)) & 0x7f; + /// DEPRECATED alias to termSig + pub fn TERMSIG(s: u32) u32 { + return termSig(@bitCast(s)); } - pub fn STOPSIG(s: W) u32 { - return EXITSTATUS(s); + /// DEPRECATED alias to stopSig + pub fn STOPSIG(s: u32) u32 { + return stopSig(@bitCast(s)); } - pub fn IFEXITED(s: W) bool { - return TERMSIG(s) == 0; + ///
DEPRECATED alias to ifExited + pub fn IFEXITED(s: u32) bool { + return ifExited(@bitCast(s)); } - pub fn IFSTOPPED(s: W) bool { - return @as(u16, @truncate(((@as(u32, @bitCast(s)) & 0xffff) *% 0x10001) >> 8)) > 0x7f00; + /// DEPRECATED alias to ifStopped + pub fn IFSTOPPED(s: u32) bool { + return ifStopped(@bitCast(s)); } - pub fn IFSIGNALED(s: W) bool { - return (s & 0xffff) -% 1 < 0xff; + /// DEPRECATED alias to ifSignaled + pub fn IFSIGNALED(s: u32) bool { + return ifSignaled(@bitCast(s)); } }; diff --git a/lib/std/process/Child.zig b/lib/std/process/Child.zig index da8d803d6056..61b6d64411c4 100644 --- a/lib/std/process/Child.zig +++ b/lib/std/process/Child.zig @@ -523,14 +523,15 @@ fn cleanupStreams(self: *ChildProcess) void { } fn statusToTerm(status: u32) Term { - return if (posix.W.IFEXITED(status)) - Term{ .Exited = posix.W.EXITSTATUS(status) } - else if (posix.W.IFSIGNALED(status)) - Term{ .Signal = posix.W.TERMSIG(status) } - else if (posix.W.IFSTOPPED(status)) - Term{ .Stopped = posix.W.STOPSIG(status) } + const w: posix.W = @bitCast(status); + return if (w.ifExited()) + .{ .Exited = w.exitStatus() } + else if (w.ifSignaled()) + .{ .Signal = w.termSig() } + else if (w.ifStopped()) + .{ .Stopped = w.stopSig() } else - Term{ .Unknown = status }; + .{ .Unknown = status }; } fn spawnPosix(self: *ChildProcess) SpawnError!void { From fc7385ec0e4f488e1527fc3226274ad261011270 Mon Sep 17 00:00:00 2001 From: Bernard Assan Date: Sun, 12 Oct 2025 12:19:23 +0000 Subject: [PATCH 19/41] Add mips definition for Epoll Signed-off-by: Bernard Assan --- lib/std/os/linux.zig | 82 +++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 81 insertions(+), 1 deletion(-) diff --git a/lib/std/os/linux.zig b/lib/std/os/linux.zig index 97482a215480..a565b88ed6e0 100644 --- a/lib/std/os/linux.zig +++ b/lib/std/os/linux.zig @@ -5712,7 +5712,87 @@ pub const EpollOp = enum(u32) { pub const EPOLL = Epoll; /// Epoll event masks // https://github.com/torvalds/linux/blob/18a7e218cfcdca6666e1f7356533e4c988780b57/include/uapi/linux/eventpoll.h#L30 -pub const Epoll = packed struct(u32) { +pub const Epoll = if (is_mips) packed struct(u32) { + // EPOLL event types (lower 16 bits) + // + /// The associated file is available for read(2) operations + in: bool = false, + /// There is an exceptional condition on the file descriptor + pri: bool = false, + /// The associated file is available for write(2) operations + out: bool = false, + /// Error condition happened on the associated file descriptor + err: bool = false, + /// Hang up happened on the associated file descriptor + hup: bool = false, + /// Invalid request: fd not open + nval: bool = false, + /// Normal data may be read + rdnorm: bool = false, + /// Priority data may be read + rdband: bool = false, + /// Priority data may be written + wrband: bool = false, + _10: u1 = 0, + /// Message available (unused on Linux) + msg: bool = false, + _12: u2 = 0, + /// Stream socket peer closed connection + rdhup: bool = false, + _15: u13 = 0, + // EPOLL input flags (Higher order flags are included as internal state) + // + /// Internal flag - wakeup generated by io_uring, used to detect + /// recursion back into the io_uring poll handler + uring_wake: bool = false, + /// Set exclusive wakeup mode for the target file descriptor + exclusive: bool = false, + /// Request the handling of system wakeup events so as to prevent system + /// suspends from happening while those events are being processed.
+ /// Assuming neither EPOLLET nor EPOLLONESHOT is set, system suspends will + /// not be re-allowed until epoll_wait is called again after consuming the + /// wakeup event(s). + /// Requires CAP_BLOCK_SUSPEND + wakeup: bool = false, + /// Set the One Shot behaviour for the target file descriptor + oneshot: bool = false, + /// Set the Edge Triggered behaviour for the target file descriptor + et: bool = false, + + /// Alias to out on Mips + /// Writing is now possible (normal data) + pub const wrnorm: Epoll = .{ .out = true }; + + // Deprecated Named constants + // EPOLL event types + pub const IN: u32 = @bitCast(Epoll{ .in = true }); + pub const PRI: u32 = @bitCast(Epoll{ .pri = true }); + pub const OUT: u32 = @bitCast(Epoll{ .out = true }); + pub const ERR: u32 = @bitCast(Epoll{ .err = true }); + pub const HUP: u32 = @bitCast(Epoll{ .hup = true }); + pub const NVAL: u32 = @bitCast(Epoll{ .nval = true }); + pub const RDNORM: u32 = @bitCast(Epoll{ .rdnorm = true }); + pub const RDBAND: u32 = @bitCast(Epoll{ .rdband = true }); + pub const WRNORM: u32 = @bitCast(wrnorm); + pub const WRBAND: u32 = @bitCast(Epoll{ .wrband = true }); + pub const MSG: u32 = @bitCast(Epoll{ .msg = true }); + pub const RDHUP: u32 = @bitCast(Epoll{ .rdhup = true }); + + // EPOLL input flags + pub const URING_WAKE: u32 = @bitCast(Epoll{ .uring_wake = true }); + pub const EXCLUSIVE: u32 = @bitCast(Epoll{ .exclusive = true }); + pub const WAKEUP: u32 = @bitCast(Epoll{ .wakeup = true }); + pub const ONESHOT: u32 = @bitCast(Epoll{ .oneshot = true }); + pub const ET: u32 = @bitCast(Epoll{ .et = true }); + + /// Flags for epoll_create1 + pub const CLOEXEC = 1 << @bitOffsetOf(O, "CLOEXEC"); + + // Deprecated Op Constants use EpollOp enum type + pub const CTL_ADD: u32 = @intFromEnum(EpollOp.ctl_add); + pub const CTL_DEL: u32 = @intFromEnum(EpollOp.ctl_del); + pub const CTL_MOD: u32 = @intFromEnum(EpollOp.ctl_mod); +} else packed struct(u32) { // EPOLL event types (lower 16 bits) // /// The associated file is available for read(2) operations From 96f295a565c4b507202782b895406697bbc58c62 Mon Sep 17 00:00:00 2001 From: Bernard Assan Date: Sun, 12 Oct 2025 13:52:24 +0000 Subject: [PATCH 20/41] Add and improve comments Signed-off-by: Bernard Assan --- lib/std/os/linux/IoUring.zig | 454 ++++++++++++++++++++--------------- 1 file changed, 263 insertions(+), 191 deletions(-) diff --git a/lib/std/os/linux/IoUring.zig b/lib/std/os/linux/IoUring.zig index 1b0bc9a102b1..d713b271b657 100644 --- a/lib/std/os/linux/IoUring.zig +++ b/lib/std/os/linux/IoUring.zig @@ -21,7 +21,7 @@ features: uflags.Features, /// will make the final call on how many entries the submission and completion /// queues will ultimately have, /// see https://github.com/torvalds/linux/blob/v5.8/fs/io_uring.c#L8027-L8050. -/// Matches the interface of io_uring_queue_init() in liburing. +/// Matches the interface of `io_uring_queue_init()` in liburing. pub fn init(entries: u16, flags: uflags.Setup) !IoUring { var params = mem.zeroInit(Params, .{ .flags = flags, @@ -35,7 +35,7 @@ pub fn init(entries: u16, flags: uflags.Setup) !IoUring { /// thread idle timeout (the kernel and our default is 1 second). /// `params` is passed by reference because the kernel needs to modify the /// parameters. -/// Matches the interface of io_uring_queue_init_params() in liburing. +/// Matches the interface of `io_uring_queue_init_params()` in liburing. 
pub fn init_params(entries: u16, p: *Params) !IoUring { if (entries == 0) return error.EntriesZero; if (!std.math.isPowerOfTwo(entries)) return error.EntriesNotPowerOfTwo; @@ -150,7 +150,7 @@ pub fn deinit(self: *IoUring) void { /// control flow error, and the null return in liburing is more a C idiom than /// anything else, for lack of a better alternative. In Zig, we have /// first-class error handling... so let's use it. -/// Matches the implementation of io_uring_get_sqe() in liburing. +/// Matches the implementation of `io_uring_get_sqe()` in liburing. pub fn get_sqe(self: *IoUring) !*Sqe { const head = @atomicLoad(u32, self.sq.head, .acquire); // Remember that these head and tail offsets wrap around every four billion @@ -163,22 +163,22 @@ pub fn get_sqe(self: *IoUring) !*Sqe { return sqe; } -/// Submits the SQEs acquired via get_sqe() to the kernel. You can call this -/// once after you have called get_sqe() multiple times to setup multiple I/O +/// Submits the SQEs acquired via `get_sqe()` to the kernel. You can call this +/// once after you have called `get_sqe()` multiple times to setup multiple I/O /// requests. /// Returns the number of SQEs submitted, if not used alongside /// IORING_SETUP_SQPOLL. /// If the io_uring instance uses IORING_SETUP_SQPOLL, the value returned on /// success is not guaranteed to match the amount of actually submitted sqes /// during this call. A value higher or lower, including 0, may be returned. -/// Matches the implementation of io_uring_submit() in liburing. +/// Matches the implementation of `io_uring_submit()` in liburing. pub fn submit(self: *IoUring) !u32 { return self.submit_and_wait(0); } -/// Like submit(), but allows waiting for events as well. +/// Like `submit()`, but allows waiting for events as well. /// Returns the number of SQEs submitted. -/// Matches the implementation of io_uring_submit_and_wait() in liburing. +/// Matches the implementation of `io_uring_submit_and_wait()` in liburing. pub fn submit_and_wait(self: *IoUring, wait_nr: u32) !u32 { const submitted = self.flush_sq(); var flags: uflags.Enter = .{}; @@ -237,9 +237,9 @@ pub fn enter(self: *IoUring, to_submit: u32, min_complete: u32, flags: uflags.En /// Sync internal state with kernel ring state on the SQ side. /// Returns the number of all pending events in the SQ ring, for the shared /// ring. This return value includes previously flushed SQEs, as per liburing. -/// The rationale is to suggest that an io_uring_enter() call is needed rather +/// The rationale is to suggest that an `io_uring_enter()` call is needed rather /// than not. -/// Matches the implementation of __io_uring_flush_sq() in liburing. +/// Matches the implementation of `__io_uring_flush_sq()` in liburing. pub fn flush_sq(self: *IoUring) u32 { if (self.sq.sqe_head != self.sq.sqe_tail) { // Fill in SQEs that we have queued up, adding them to the kernel ring. @@ -261,7 +261,7 @@ pub fn flush_sq(self: *IoUring) u32 { /// Returns true if we are not using an SQ thread (thus nobody submits but us), /// or if IORING_SQ_NEED_WAKEUP is set and the SQ thread must be explicitly /// awakened. For the latter case, we set the SQ thread wakeup flag. -/// Matches the implementation of sq_ring_needs_enter() in liburing. +/// Matches the implementation of `sq_ring_needs_enter()` in liburing. 
pub fn sq_ring_needs_enter(self: *IoUring, flags: *uflags.Enter) bool { assert(flags.*.empty()); if (!self.flags.sqpoll) return true; @@ -275,7 +275,7 @@ pub fn sq_ring_needs_enter(self: *IoUring, flags: *uflags.Enter) bool { /// Returns the number of flushed and unflushed SQEs pending in the submission /// queue. In other words, this is the number of SQEs in the submission queue, /// i.e. its length. These are SQEs that the kernel is yet to consume. -/// Matches the implementation of io_uring_sq_ready in liburing. +/// Matches the implementation of `io_uring_sq_ready()` in liburing. pub fn sq_ready(self: *IoUring) u32 { // Always use the shared ring state (i.e. head and not sqe_head) to avoid going out of sync, // see https://github.com/axboe/liburing/issues/92. @@ -284,7 +284,7 @@ pub fn sq_ready(self: *IoUring) u32 { /// Returns the number of CQEs in the completion queue, i.e. its length. /// These are CQEs that the application is yet to consume. -/// Matches the implementation of io_uring_cq_ready in liburing. +/// Matches the implementation of `io_uring_cq_ready()` in liburing. pub fn cq_ready(self: *IoUring) u32 { return @atomicLoad(u32, self.cq.tail, .acquire) -% self.cq.head.*; } @@ -298,11 +298,11 @@ pub fn cq_ready(self: *IoUring) u32 { /// The rationale for copying CQEs rather than copying pointers is that /// pointers are 8 bytes whereas CQEs are not much more at only 16 bytes, and /// this provides a safer faster interface. -/// Safer, because you no longer need to call cqe_seen(), avoiding idempotency +/// Safer, because you no longer need to call `cqe_seen()`, avoiding idempotency /// bugs. Faster, because we can now amortize the atomic store release to /// `cq.head` across the batch. /// See https://github.com/axboe/liburing/issues/103#issuecomment-686665007. -/// Matches the implementation of io_uring_peek_batch_cqe() in liburing, but +/// Matches the implementation of `io_uring_peek_batch_cqe()` in liburing, but /// supports waiting. pub fn copy_cqes(self: *IoUring, cqes: []Cqe, wait_nr: u32) !u32 { const count = self.copy_cqes_ready(cqes); @@ -345,27 +345,28 @@ pub fn copy_cqe(ring: *IoUring) !Cqe { } } -/// Matches the implementation of cq_ring_needs_flush() in liburing. +/// Matches the implementation of `cq_ring_needs_flush()` in liburing. pub fn cq_ring_needs_flush(self: *IoUring) bool { return @atomicLoad(Sq.Flags, self.sq.flags, .unordered).cq_overflow; } /// For advanced use cases only that implement custom completion queue methods. -/// If you use copy_cqes() or copy_cqe() you must not call cqe_seen() or -/// cq_advance(). Must be called exactly once after a zero-copy CQE has been +/// If you use `copy_cqes()` or `copy_cqe()` you must not call `cqe_seen()` or +/// `cq_advance()`. Must be called exactly once after a zero-copy CQE has been /// processed by your application. /// Not idempotent, calling more than once will result in other CQEs being lost. -/// Matches the implementation of cqe_seen() in liburing. +/// Matches the implementation of `cqe_seen()` in liburing. pub fn cqe_seen(self: *IoUring, cqe: *Cqe) void { _ = cqe; self.cq_advance(1); } /// For advanced use cases only that implement custom completion queue methods. -/// Matches the implementation of cq_advance() in liburing. +/// Matches the implementation of `cq_advance()` in liburing. pub fn cq_advance(self: *IoUring, count: u32) void { if (count > 0) { - // Ensure the kernel only sees the new head value after the CQEs have been read. 
+        // Ensure the kernel only sees the new head value after the CQEs have
+        // been read.
         @atomicStore(u32, self.cq.head, self.cq.head.* +% count, .release);
     }
 }
@@ -391,10 +392,11 @@ pub fn fsync(self: *IoUring, user_data: u64, fd: posix.fd_t, flags: uflags.Fsync
 }
 
 /// Queues (but does not submit) an SQE to perform a no-op.
-/// Returns a pointer to the SQE so that you can further modify the SQE for advanced use cases.
+/// Returns a pointer to the SQE so that you can further modify the SQE for
+/// advanced use cases.
 /// A no-op is more useful than may appear at first glance.
-/// For example, you could call `drain_previous_sqes()` on the returned SQE, to use the no-op to
-/// know when the ring is idle before acting on a kill signal.
+/// For example, you could call `drain_previous_sqes()` on the returned SQE, to
+/// use the no-op to know when the ring is idle before acting on a kill signal.
 pub fn nop(self: *IoUring, user_data: u64) !*Sqe {
     const sqe = try self.get_sqe();
     sqe.prep_nop();
@@ -402,10 +404,13 @@ pub fn nop(self: *IoUring, user_data: u64) !*Sqe {
     return sqe;
 }
 
-/// Queues (but does not submit) an SQE to perform a `read(2)` or `preadv(2)` depending on the buffer type.
+/// Queues (but does not submit) an SQE to perform a `read(2)` or `preadv(2)`
+/// depending on the buffer type.
 /// * Reading into a `ReadBuffer.buffer` uses `read(2)`
 /// * Reading into a `ReadBuffer.iovecs` uses `preadv(2)`
-/// If you want to do a `preadv2(2)` then set `rw_flags` on the returned SQE. See https://man7.org/linux/man-pages/man2/preadv2.2.html
+///
+/// If you want to do a `preadv2(2)` then set `rw_flags` on the returned SQE.
+/// See https://man7.org/linux/man-pages/man2/preadv2.2.html
 ///
 /// Returns a pointer to the SQE.
 pub fn read(
@@ -446,19 +451,34 @@ pub fn write(
 
 /// Queues (but does not submit) an SQE to perform a `splice(2)`
 /// Either `fd_in` or `fd_out` must be a pipe.
-/// If `fd_in` refers to a pipe, `off_in` is ignored and must be set to std.math.maxInt(u64).
-/// If `fd_in` does not refer to a pipe and `off_in` is maxInt(u64), then `len` are read
-/// from `fd_in` starting from the file offset, which is incremented by the number of bytes read.
-/// If `fd_in` does not refer to a pipe and `off_in` is not maxInt(u64), then the starting offset of `fd_in` will be `off_in`.
-/// This splice operation can be used to implement sendfile by splicing to an intermediate pipe first,
-/// then splice to the final destination. In fact, the implementation of sendfile in kernel uses splice internally.
+/// If `fd_in` refers to a pipe, `off_in` is ignored and must be set to
+/// std.math.maxInt(u64).
+/// If `fd_in` does not refer to a pipe and `off_in` is maxInt(u64), then `len`
+/// bytes are read from `fd_in` starting from the file offset, which is
+/// incremented by the number of bytes read.
+/// If `fd_in` does not refer to a pipe and `off_in` is not maxInt(u64), then
+/// the starting offset of `fd_in` will be `off_in`.
+///
+/// This splice operation can be used to implement sendfile by splicing to an
+/// intermediate pipe first, then splicing to the final destination. In fact,
+/// the implementation of sendfile in the kernel uses splice internally.
 ///
-/// NOTE that even if fd_in or fd_out refers to a pipe, the splice operation can still fail with EINVAL if one of the
-/// fd doesn't explicitly support splice peration, e.g. reading from terminal is unsupported from kernel 5.7 to 5.11.
-/// See https://github.com/axboe/liburing/issues/291
+/// NOTE that even if `fd_in` or `fd_out` refers to a pipe, the splice operation
+/// can still fail with EINVAL if one of the fds doesn't explicitly support the
+/// splice operation, e.g. reading from a terminal is unsupported from kernel
+/// 5.7 to 5.11. See https://github.com/axboe/liburing/issues/291
 ///
-/// Returns a pointer to the SQE so that you can further modify the SQE for advanced use cases.
-pub fn splice(self: *IoUring, user_data: u64, fd_in: posix.fd_t, off_in: u64, fd_out: posix.fd_t, off_out: u64, len: usize) !*Sqe {
+/// Returns a pointer to the SQE so that you can further modify the SQE for
+/// advanced use cases.
+pub fn splice(
+    self: *IoUring,
+    user_data: u64,
+    fd_in: posix.fd_t,
+    off_in: u64,
+    fd_out: posix.fd_t,
+    off_out: u64,
+    len: usize,
+) !*Sqe {
     const sqe = try self.get_sqe();
     sqe.prep_splice(fd_in, off_in, fd_out, off_out, len);
     sqe.user_data = user_data;
@@ -466,10 +486,12 @@ pub fn splice(self: *IoUring, user_data: u64, fd_in: posix.fd_t, off_in: u64, fd
 }
 
 /// Queues (but does not submit) an SQE to perform a IORING_OP_READ_FIXED.
-/// The `buffer` provided must be registered with the kernel by calling `register_buffers` first.
-/// The `buffer_index` must be the same as its index in the array provided to `register_buffers`.
+/// The `buffer` provided must be registered with the kernel by calling
+/// `register_buffers()` first. The `buffer_index` must be the same as its
+/// index in the array provided to `register_buffers()`.
 ///
-/// Returns a pointer to the SQE so that you can further modify the SQE for advanced use cases.
+/// Returns a pointer to the SQE so that you can further modify the SQE for
+/// advanced use cases.
 pub fn read_fixed(
     self: *IoUring,
     user_data: u64,
@@ -485,9 +507,10 @@ pub fn read_fixed(
 }
 
 /// Queues (but does not submit) an SQE to perform a `pwritev()`.
-/// Returns a pointer to the SQE so that you can further modify the SQE for advanced use cases.
-/// For example, if you want to do a `pwritev2()` then set `rw_flags` on the returned SQE.
-/// See https://linux.die.net/man/2/pwritev.
+/// Returns a pointer to the SQE so that you can further modify the SQE for
+/// advanced use cases.
+/// For example, if you want to do a `pwritev2()` then set `rw_flags` on the
+/// returned SQE. See https://linux.die.net/man/2/pwritev.
 pub fn writev(
     self: *IoUring,
     user_data: u64,
@@ -502,10 +525,12 @@ pub fn writev(
 }
 
 /// Queues (but does not submit) an SQE to perform a IORING_OP_WRITE_FIXED.
-/// The `buffer` provided must be registered with the kernel by calling `register_buffers` first.
-/// The `buffer_index` must be the same as its index in the array provided to `register_buffers`.
+/// The `buffer` provided must be registered with the kernel by calling
+/// `register_buffers()` first. The `buffer_index` must be the same as its index
+/// in the array provided to `register_buffers()`.
 ///
-/// Returns a pointer to the SQE so that you can further modify the SQE for advanced use cases.
+/// Returns a pointer to the SQE so that you can further modify the SQE for
+/// advanced use cases.
 pub fn write_fixed(
     self: *IoUring,
     user_data: u64,
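The sendfile pattern described in the splice doc comment, as an illustrative sketch: `fd_src`/`fd_dst` are hypothetical open files, `pipe_fds` comes from posix.pipe(), and the pipe side of each splice passes maxInt(u64) as its offset:

    const no_offset = std.math.maxInt(u64);
    const to_pipe = try ring.splice(0x10, fd_src, 0, pipe_fds[1], no_offset, file_len);
    to_pipe.link_next(); // order matters: fill the pipe before draining it
    _ = try ring.splice(0x11, pipe_fds[0], no_offset, fd_dst, 0, file_len);
    _ = try ring.submit_and_wait(2);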
@@ -831,14 +856,15 @@ pub fn close_direct(self: *IoUring, user_data: u64, file_index: u32) !*Sqe {
 /// Queues (but does not submit) an SQE to register a timeout operation.
 /// Returns a pointer to the SQE.
 ///
-/// The timeout will complete when either the timeout expires, or after the specified number of
-/// events complete (if `count` is greater than `0`).
+/// The timeout will complete when either the timeout expires, or after the
+/// specified number of events complete (if `count` is greater than `0`).
 ///
-/// `flags` may be `0` for a relative timeout, or `IORING_TIMEOUT_ABS` for an absolute timeout.
+/// `flags` may be `0` for a relative timeout, or `IORING_TIMEOUT_ABS` for an
+/// absolute timeout.
 ///
-/// The completion event result will be `-ETIME` if the timeout completed through expiration,
-/// `0` if the timeout completed after the specified number of events, or `-ECANCELED` if the
-/// timeout was removed before it expired.
+/// The completion event result will be `-ETIME` if the timeout completed
+/// through expiration, `0` if the timeout completed after the specified number
+/// of events, or `-ECANCELED` if the timeout was removed before it expired.
 ///
 /// io_uring timeouts use the `CLOCK.MONOTONIC` clock source.
 pub fn timeout(
@@ -859,7 +885,8 @@
 ///
 /// The timeout is identified by its `user_data`.
 ///
-/// The completion event result will be `0` if the timeout was found and canceled successfully,
+/// The completion event result will be `0` if the timeout was found and
+/// canceled successfully,
 /// `-EBUSY` if the timeout was found but expiration was already in progress, or
 /// `-ENOENT` if the timeout was not found.
 pub fn timeout_remove(
@@ -877,18 +904,17 @@
 /// Queues (but does not submit) an SQE to add a link timeout operation.
 /// Returns a pointer to the SQE.
 ///
-/// You need to set linux.IOSQE_IO_LINK to flags of the target operation
-/// and then call this method right after the target operation.
+/// You need to set IOSQE_IO_LINK in the flags of the target operation and then
+/// call this method right after the target operation.
 /// See https://lwn.net/Articles/803932/ for detail.
 ///
 /// If the dependent request finishes before the linked timeout, the timeout
 /// is canceled. If the timeout finishes before the dependent request, the
 /// dependent request will be canceled.
 ///
-/// The completion event result of the link_timeout will be
-/// `-ETIME` if the timeout finishes before the dependent request
-/// (in this case, the completion event result of the dependent request will
-/// be `-ECANCELED`), or
+/// The completion event result of the link_timeout will be one of:
+/// `-ETIME` if the timeout finishes before the dependent request (in this case,
+/// the completion event result of the dependent request will be `-ECANCELED`), or
 /// `-EALREADY` if the dependent request finishes before the linked timeout.
 pub fn link_timeout(
     self: *IoUring,
@@ -929,8 +955,8 @@ pub fn poll_remove(
     return sqe;
 }
 
-/// Queues (but does not submit) an SQE to update the user data of an existing poll
-/// operation. Returns a pointer to the SQE.
+/// Queues (but does not submit) an SQE to update the user data of an existing
+/// poll operation. Returns a pointer to the SQE.
 pub fn poll_update(
     self: *IoUring,
     user_data: u64,
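An illustrative round trip for the timeout semantics documented above (a sketch, not liburing API): queue a short relative timeout and observe the documented `-ETIME` completion through `Cqe.err()`:

    const ts: linux.kernel_timespec = .{ .sec = 0, .nsec = 10 * std.time.ns_per_ms };
    _ = try ring.timeout(0x77, &ts, 0, .{}); // count == 0: a pure timer
    _ = try ring.submit();
    const cqe = try ring.copy_cqe();
    try testing.expectEqual(linux.E.TIME, cqe.err()); // expired normally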
@@ -983,8 +1009,9 @@ pub fn statx(
 ///
 /// The operation is identified by its `user_data`.
 ///
-/// The completion event result will be `0` if the operation was found and canceled successfully,
-/// `-EALREADY` if the operation was found but was already in progress, or
+/// The completion event result will be `0` if the operation was found and
+/// canceled successfully,
+/// `-EALREADY` if the operation was found but was already in progress, or
 /// `-ENOENT` if the operation was not found.
 pub fn cancel(
     self: *IoUring,
@@ -1093,12 +1120,15 @@ pub fn linkat(
     return sqe;
 }
 
-/// Queues (but does not submit) an SQE to provide a group of buffers used for commands that read/receive data.
-/// Returns a pointer to the SQE.
+/// Queues (but does not submit) an SQE to provide a group of buffers used for
+/// commands that read/receive data. Returns a pointer to the SQE.
 ///
-/// Provided buffers can be used in `read`, `recv` or `recvmsg` commands via .buffer_selection.
+/// Provided buffers can be used in `read`, `recv` or `recvmsg` commands via
+/// `.buffer_selection`.
 ///
-/// The kernel expects a contiguous block of memory of size (buffers_count * buffer_size).
+/// The kernel expects a contiguous block of memory of size (buffers_count *
+/// buffer_size).
+// TODO: why not use a slice with `buffers_count`
 pub fn provide_buffers(
     self: *IoUring,
     user_data: u64,
@@ -1146,15 +1176,22 @@ pub fn waitid(
 }
 
 /// Registers an array of file descriptors.
-/// Every time a file descriptor is put in an SQE and submitted to the kernel, the kernel must
-/// retrieve a reference to the file, and once I/O has completed the file reference must be
-/// dropped. The atomic nature of this file reference can be a slowdown for high IOPS workloads.
-/// This slowdown can be avoided by pre-registering file descriptors.
-/// To refer to a registered file descriptor, IOSQE_FIXED_FILE must be set in the SQE's flags,
-/// and the SQE's fd must be set to the index of the file descriptor in the registered array.
-/// Registering file descriptors will wait for the ring to idle.
-/// Files are automatically unregistered by the kernel when the ring is torn down.
-/// An application need unregister only if it wants to register a new array of file descriptors.
+///
+/// Every time a file descriptor is put in an SQE and submitted to the kernel,
+/// the kernel must retrieve a reference to the file, and once I/O has
+/// completed, the file reference must be dropped. The atomic nature of this
+/// file reference can be a slowdown for high IOPS workloads. This slowdown can
+/// be avoided by pre-registering file descriptors.
+///
+/// To refer to a registered file descriptor, IOSQE_FIXED_FILE must be set in
+/// the SQE's flags, and the SQE's fd must be set to the index of the file
+/// descriptor in the registered array.
+///
+/// Registering file descriptors will wait for the ring to idle and files are
+/// automatically unregistered by the kernel when the ring is torn down.
+///
+/// An application need only unregister if it wants to register a new array of
+/// file descriptors.
 pub fn register_files(self: *IoUring, fds: []const linux.fd_t) !void {
     assert(self.fd >= 0);
     const res = linux.io_uring_register(
         self.fd,
         .register_files,
         @as(*const anyopaque, @ptrCast(fds.ptr)),
         @as(u32, @intCast(fds.len)),
     );
     try handle_registration_result(res);
 }
 
 /// Updates registered file descriptors.
 ///
-/// Updates are applied starting at the provided offset in the original file descriptors slice.
+/// Updates are applied starting at the provided offset in the original file
+/// descriptors slice.
/// There are three kind of updates: /// * turning a sparse entry (where the fd is -1) into a real one /// * removing an existing entry (set the fd to -1) /// * replacing an existing entry with a new fd +/// /// Adding new file descriptors must be done with `register_files`. pub fn register_files_update(self: *IoUring, offset: u32, fds: []const linux.fd_t) !void { assert(self.fd >= 0); @@ -1240,8 +1279,8 @@ pub fn register_file_alloc_range(self: *IoUring, offset: u32, len: u32) !void { return handle_registration_result(res); } -/// Registers the file descriptor for an eventfd that will be notified of completion events on -/// an io_uring instance. +/// Registers the file descriptor for an eventfd that will be notified of +/// completion events on an io_uring instance. /// Only a single a eventfd can be registered at any given point in time. pub fn register_eventfd(self: *IoUring, fd: linux.fd_t) !void { assert(self.fd >= 0); @@ -1254,9 +1293,10 @@ pub fn register_eventfd(self: *IoUring, fd: linux.fd_t) !void { try handle_registration_result(res); } -/// Registers the file descriptor for an eventfd that will be notified of completion events on -/// an io_uring instance. Notifications are only posted for events that complete in an async manner. -/// This means that events that complete inline while being submitted do not trigger a notification event. +/// Registers the file descriptor for an eventfd that will be notified of +/// completion events on an io_uring instance. Notifications are only posted +/// for events that complete in an async manner. This means that events that +/// complete inline while being submitted do not trigger a notification event. /// Only a single eventfd can be registered at any given point in time. pub fn register_eventfd_async(self: *IoUring, fd: linux.fd_t) !void { assert(self.fd >= 0); @@ -1329,26 +1369,32 @@ pub fn get_probe(self: *IoUring) !Probe { fn handle_registration_result(res: usize) !void { switch (linux.errno(res)) { .SUCCESS => {}, - // One or more fds in the array are invalid, or the kernel does not support sparse sets: + // One or more fds in the array are invalid, or the kernel does not + // support sparse sets: .BADF => return error.FileDescriptorInvalid, .BUSY => return error.FilesAlreadyRegistered, .INVAL => return error.FilesEmpty, - // Adding `nr_args` file references would exceed the maximum allowed number of files the - // user is allowed to have according to the per-user RLIMIT_NOFILE resource limit and - // the CAP_SYS_RESOURCE capability is not set, or `nr_args` exceeds the maximum allowed - // for a fixed file set (older kernels have a limit of 1024 files vs 64K files): + // Adding `nr_args` file references would exceed the maximum allowed + // number of files the user is allowed to have according to the + // per-user RLIMIT_NOFILE resource limit and the CAP_SYS_RESOURCE + // capability is not set, or `nr_args` exceeds the maximum allowed + // for a fixed file set (older kernels have a limit of 1024 files vs + // 64K files): .MFILE => return error.UserFdQuotaExceeded, - // Insufficient kernel resources, or the caller had a non-zero RLIMIT_MEMLOCK soft - // resource limit but tried to lock more memory than the limit permitted (not enforced - // when the process is privileged with CAP_IPC_LOCK): + // Insufficient kernel resources, or the caller had a non-zero + // RLIMIT_MEMLOCK soft resource limit but tried to lock more memory + // than the limit permitted (not enforced when the process is + // privileged with CAP_IPC_LOCK): .NOMEM => 
return error.SystemResources, - // Attempt to register files on a ring already registering files or being torn down: + // Attempt to register files on a ring already registering files or + // being torn down: .NXIO => return error.RingShuttingDownOrAlreadyRegisteringFiles, else => |errno| return posix.unexpectedErrno(errno), } } -/// Unregisters all registered file descriptors previously associated with the ring. +/// Unregisters all registered file descriptors previously associated with the +/// ring. pub fn unregister_files(self: *IoUring) !void { assert(self.fd >= 0); const res = linux.io_uring_register(self.fd, .unregister_files, null, 0); @@ -1394,7 +1440,8 @@ pub fn socket_direct( return sqe; } -/// Prepares a socket creation request for registered file, index chosen by kernel (file index alloc). +/// Prepares a socket creation request for registered file, index chosen by +/// kernel (file index alloc). /// File index will be returned in CQE res field. /// Available since 5.19 pub fn socket_direct_alloc( @@ -1511,10 +1558,12 @@ pub fn getsockopt( ); } -/// Registers a shared buffer ring to be used with provided buffers. -/// `entries` number of `io_uring_buf` structures is mem mapped and shared by kernel. +/// Registers a shared buffer ring to be used with provided buffers. `entries` +/// number of `io_uring_buf` structures is mem mapped and shared by kernel. +/// +/// `entries` is the number of entries requested in the buffer ring and must be +/// a power of 2. /// `fd` is IO_Uring.fd for which the provided buffer ring is being registered. -/// `entries` is the number of entries requested in the buffer ring, must be power of 2. /// `group_id` is the chosen buffer group ID, unique in IO_Uring. pub fn setup_buf_ring( fd: linux.fd_t, @@ -1586,7 +1635,8 @@ fn handle_register_buf_ring_result(res: usize) !void { } } -// Unregisters a previously registered shared buffer ring, returned from io_uring_setup_buf_ring. +// Unregisters a previously registered shared buffer ring, returned from +// io_uring_setup_buf_ring. pub fn free_buf_ring(fd: posix.fd_t, br: *align(page_size_min) BufferRing, entries: u32, group_id: u16) void { unregister_buf_ring(fd, group_id) catch {}; var mmap: []align(page_size_min) u8 = undefined; @@ -1609,8 +1659,10 @@ pub fn buf_ring_mask(entries: u16) u16 { /// Assigns `buffer` with the `br` buffer ring. /// `buffer_id` is identifier which will be returned in the CQE. /// `buffer_offset` is the offset to insert at from the current tail. -/// If just one buffer is provided before the ring tail is committed with advance then offset should be 0. -/// If buffers are provided in a loop before being committed, the offset must be incremented by one for each buffer added. +/// If just one buffer is provided before the ring tail is committed with +/// advance then offset should be 0. +/// If buffers are provided in a loop before being committed, the offset must +/// be incremented by one for each buffer added. pub fn buf_ring_add( br: *BufferRing, buffer: []u8, @@ -1627,13 +1679,14 @@ pub fn buf_ring_add( } /// Make `count` new buffers visible to the kernel. Called after -/// `io_uring_buf_ring_add` has been called `count` times to fill in new buffers. +/// `io_uring_buf_ring_add` has been called `count` times to fill in new +/// buffers. 
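The add-then-advance protocol described in the comments above, sketched with a hypothetical `br` returned by setup_buf_ring() and a `backing` array of `entries` equally sized buffers:

    const mask = buf_ring_mask(entries);
    for (backing, 0..) |buf, i| {
        // per the doc above, the insert offset grows by one per queued buffer
        buf_ring_add(br, buf, @intCast(i), mask, @intCast(i));
    }
    buf_ring_advance(br, entries); // one release-store publishes the whole batch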
pub fn buf_ring_advance(br: *BufferRing, count: u16) void { const tail: u16 = br.tail +% count; @atomicStore(u16, &br.tail, tail, .release); } -// IO completion data structure (Completion Queue Entry) +/// IO completion data structure (Completion Queue Entry) pub const Cqe = extern struct { /// sqe.user_data value passed back user_data: u64, @@ -1687,9 +1740,9 @@ pub const Cqe = extern struct { return .SUCCESS; } - // On successful completion of the provided buffers IO request, the CQE flags field - // will have IORING_CQE_F_BUFFER set and the selected buffer ID will be indicated by - // the upper 16-bits of the flags field. + /// On successful completion of the provided buffers IO request, the CQE + /// flags field will have IORING_CQE_F_BUFFER set and the selected buffer + /// ID will be indicated by the upper 16-bits of the flags field. pub fn buffer_id(self: Cqe) !u16 { if (!self.flags.f_buffer) { return error.NoBufferSelected; @@ -1699,7 +1752,7 @@ pub const Cqe = extern struct { }; /// IO submission data structure (Submission Queue Entry) -/// matches io_uring_sqe in liburing +/// matches `io_uring_sqe` in liburing pub const Sqe = extern struct { /// type of operation for this sqe opcode: Op, @@ -1723,12 +1776,12 @@ pub const Sqe = extern struct { addr: u64, /// buffer size or number of iovecs len: u32, - /// flags for any sqe operation - /// rw_flags | fsync_flags | poll_event | poll32_event | sync_range_flags | msg_flags - /// timeout_flags | accept_flags | cancel_flags | open_flags | statx_flags - /// fadvise_advice | splice_flags | rename_flags | unlink_flags | hardlink_flags - /// xattr_flags | msg_ring_flags | uring_cmd_flags | waitid_flags | futex_flags - /// install_fd_flags | nop_flags | pipe_flags + /// flags for any Sqe operation + /// rw_flags | fsync_flags | poll_event | poll32_event | sync_range_flags | + /// msg_flags timeout_flags | accept_flags | cancel_flags | open_flags | + /// statx_flags fadvise_advice | splice_flags | rename_flags | unlink_flags + /// | hardlink_flags xattr_flags | msg_ring_flags | uring_cmd_flags | + /// waitid_flags | futex_flags install_fd_flags | nop_flags | pipe_flags rw_flags: u32, /// data to be passed back at completion time user_data: u64, @@ -1915,8 +1968,10 @@ pub const Sqe = extern struct { addrlen: ?*linux.socklen_t, flags: linux.Sock, ) void { - // `addr` holds a pointer to `sockaddr`, and `addr2` holds a pointer to socklen_t`. - // `addr2` maps to `sqe.off` (u64) instead of `sqe.len` (which is only a u32). + // `addr` holds a pointer to `sockaddr`, and `addr2` holds a pointer to + // socklen_t`. + // `addr2` maps to `sqe.off` (u64) instead of `sqe.len` (which is only + // a u32). sqe.prep_rw(.accept, fd, @intFromPtr(addr), 0, @intFromPtr(addrlen)); sqe.rw_flags = @bitCast(flags); } @@ -1975,7 +2030,8 @@ pub const Sqe = extern struct { addr: *const linux.sockaddr, addrlen: linux.socklen_t, ) void { - // `addrlen` maps to `sqe.off` (u64) instead of `sqe.len` (which is only a u32). + // `addrlen` maps to `sqe.off` (u64) instead of `sqe.len` (which is + // only a u32). sqe.prep_rw(.connect, fd, @intFromPtr(addr), 0, addrlen); } @@ -2157,12 +2213,12 @@ pub const Sqe = extern struct { poll_mask: linux.Epoll, ) void { sqe.prep_rw(.poll_add, fd, @intFromPtr(@as(?*anyopaque, null)), 0, 0); - // Poll masks previously used to comprise of 16 bits in the flags union of - // a SQE, but were then extended to comprise of 32 bits in order to make - // room for additional option flags. 
To ensure that the correct bits of - // poll masks are consistently and properly read across multiple kernel - // versions, poll masks are enforced to be little-endian. - // https://www.spinics.net/lists/io-uring/msg02848.html + // Poll masks previously used to comprise of 16 bits in the flags union + // of a SQE, but were then extended to comprise of 32 bits in order to + // make room for additional option flags. To ensure that the correct + // bits of poll masks are consistently and properly read across + // multiple kernel versions, poll masks are enforced to be + // little-endian. https://www.spinics.net/lists/io-uring/msg02848.html sqe.rw_flags = std.mem.nativeToLittle(u32, @as(u32, @bitCast(poll_mask))); } @@ -2181,12 +2237,12 @@ pub const Sqe = extern struct { flags: uflags.Poll, ) void { sqe.prep_rw(.poll_remove, -1, old_user_data, flags, new_user_data); - // Poll masks previously used to comprise of 16 bits in the flags union of - // a SQE, but were then extended to comprise of 32 bits in order to make - // room for additional option flags. To ensure that the correct bits of - // poll masks are consistently and properly read across multiple kernel - // versions, poll masks are enforced to be little-endian. - // https://www.spinics.net/lists/io-uring/msg02848.html + // Poll masks previously used to comprise of 16 bits in the flags union + // of a SQE, but were then extended to comprise of 32 bits in order to + // make room for additional option flags. To ensure that the correct + // bits of poll masks are consistently and properly read across + // multiple kernel versions, poll masks are enforced to be + // little-endian. https://www.spinics.net/lists/io-uring/msg02848.html sqe.rw_flags = std.mem.nativeToLittle(u32, @as(u32, @bitCast(poll_mask))); } @@ -2475,7 +2531,7 @@ pub const Sqe = extern struct { } }; -/// matches io_uring_sq in liburing +/// matches `io_uring_sq` in liburing pub const Sq = struct { head: *u32, tail: *u32, @@ -2486,11 +2542,11 @@ pub const Sq = struct { sqes: []Sqe, mmap: []align(page_size_min) u8, mmap_sqes: []align(page_size_min) u8, - // We use `sqe_head` and `sqe_tail` in the same way as liburing: // We increment `sqe_tail` (but not `tail`) for each call to `get_sqe()`. - // We then set `tail` to `sqe_tail` once, only when these events are actually submitted. - // This allows us to amortize the cost of the @atomicStore to `tail` across multiple SQEs. + // We then set `tail` to `sqe_tail` once, only when these events are + // actually submitted. This allows us to amortize the cost of the + // @atomicStore to `tail` across multiple SQEs. sqe_head: u32 = 0, sqe_tail: u32 = 0, @@ -2523,8 +2579,9 @@ pub const Sq = struct { errdefer posix.munmap(mmap); assert(mmap.len == size); - // The motivation for the `sqes` and `array` indirection is to make it possible for the - // application to preallocate static io_uring_sqe entries and then replay them when needed. + // The motivation for the `sqes` and `array` indirection is to make it + // possible for the application to preallocate static io_uring_sqe + // entries and then replay them when needed. 
const size_sqes = p.sq_entries * @sizeOf(Sqe); const mmap_sqes = try posix.mmap( null, @@ -2539,8 +2596,8 @@ pub const Sq = struct { const array: [*]u32 = @ptrCast(@alignCast(&mmap[p.sq_off.array])); const sqes: [*]Sqe = @ptrCast(@alignCast(&mmap_sqes[0])); - // We expect the kernel copies p.sq_entries to the u32 pointed to by p.sq_off.ring_entries, - // see https://github.com/torvalds/linux/blob/v5.8/fs/io_uring.c#L7843-L7844. + // We expect the kernel copies p.sq_entries to the u32 pointed to by + // p.sq_off.ring_entries, See https://github.com/torvalds/linux/blob/v5.8/fs/io_uring.c#L7843-L7844. assert(p.sq_entries == @as(*u32, @ptrCast(@alignCast(&mmap[p.sq_off.ring_entries]))).*); return .{ .head = @ptrCast(@alignCast(&mmap[p.sq_off.head])), @@ -2561,7 +2618,7 @@ pub const Sq = struct { } }; -/// matches io_uring_cq in liburing +/// matches `io_uring_cq` in liburing pub const Cq = struct { head: *u32, tail: *u32, @@ -2594,7 +2651,8 @@ pub const Cq = struct { pub fn deinit(self: *Cq) void { _ = self; // A no-op since we now share the mmap with the submission queue. - // Here for symmetry with the submission queue, and for any future feature support. + // Here for symmetry with the submission queue, and for any future + // feature support. } }; @@ -2609,7 +2667,7 @@ pub const Cq = struct { /// ready to receive data, a buffer is picked automatically and the resulting /// CQE will contain the buffer ID in `cqe.buffer_id()`. Use `get` method to get /// buffer for buffer ID identified by CQE. Once the application has processed -/// the buffer, it may hand ownership back to the kernel, by calling `put` +/// the buffer, it may hand ownership back to the kernel, by calling `put()` /// allowing the cycle to repeat. /// /// Depending on the rate of arrival of data, it is possible that a given buffer @@ -2675,8 +2733,13 @@ pub const BufferGroup = struct { allocator.free(self.heads); } - // Prepare recv operation which will select buffer from this group. - pub fn recv(self: *BufferGroup, user_data: u64, fd: posix.fd_t, flags: linux.Msg) !*Sqe { + /// Prepare recv operation which will select buffer from this group. + pub fn recv( + self: *BufferGroup, + user_data: u64, + fd: posix.fd_t, + flags: linux.Msg, + ) !*Sqe { var sqe = try self.ring.get_sqe(); sqe.prep_rw(.recv, fd, 0, 0, 0); sqe.rw_flags = @bitCast(flags); @@ -2686,8 +2749,14 @@ pub const BufferGroup = struct { return sqe; } - // Prepare multishot recv operation which will select buffer from this group. - pub fn recv_multishot(self: *BufferGroup, user_data: u64, fd: posix.fd_t, flags: linux.Msg) !*Sqe { + /// Prepare multishot recv operation which will select buffer from this + /// group. + pub fn recv_multishot( + self: *BufferGroup, + user_data: u64, + fd: posix.fd_t, + flags: linux.Msg, + ) !*Sqe { var sqe = try self.recv(user_data, fd, flags); sqe.ioprio.send_recv.recv_multishot = true; return sqe; @@ -2699,18 +2768,19 @@ pub const BufferGroup = struct { return self.buffers[pos .. pos + self.buffer_size][self.heads[buffer_id]..]; } - // Get buffer by CQE. + /// Get buffer by CQE. pub fn get(self: *BufferGroup, cqe: Cqe) ![]u8 { const buffer_id = try cqe.buffer_id(); const used_len = @as(usize, @intCast(cqe.res)); return self.get_by_id(buffer_id)[0..used_len]; } - // Release buffer from CQE to the kernel. + /// Release buffer from CQE to the kernel. 
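A sketch of the buffer ownership cycle described above (illustrative only: `group` is an initialized BufferGroup, `fd` a connected socket, and `process` a hypothetical handler):

    _ = try group.recv_multishot(0x99, fd, .{});
    _ = try group.ring.submit();

    const cqe = try group.ring.copy_cqe();
    const data = try group.get(cqe); // borrow the kernel-selected buffer
    process(data); // hypothetical application logic
    try group.put(cqe); // hand ownership back so the cycle can repeat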
     pub fn put(self: *BufferGroup, cqe: Cqe) !void {
         const buffer_id = try cqe.buffer_id();
         if (cqe.flags.f_buf_more) {
-            // Incremental consumption active, kernel will write to the this buffer again
+            // Incremental consumption active, kernel will write to this
+            // buffer again.
             const used_len = @as(u32, @intCast(cqe.res));
             // Track what part of the buffer is used
             self.heads[buffer_id] += used_len;
@@ -2718,7 +2788,8 @@
             return;
         }
         self.heads[buffer_id] = 0;
-        // Release buffer to the kernel. const mask = buf_ring_mask(self.buffers_count);
+        // Release buffer to the kernel.
+        const mask = buf_ring_mask(self.buffers_count);
 
         buf_ring_add(self.br, self.get_by_id(buffer_id), buffer_id, mask, 0);
         buf_ring_advance(self.br, 1);
@@ -2729,12 +2799,11 @@ pub const ReadBuffer = union(enum) {
     /// io_uring will read directly into this buffer
     buffer: []u8,
-
     /// io_uring will read directly into these buffers using readv.
     iovecs: []const posix.iovec,
-
-    /// io_uring will select a buffer that has previously been provided with `provide_buffers`.
-    /// The buffer group reference by `group_id` must contain at least one buffer for the read to work.
+    /// io_uring will select a buffer that has previously been provided with
+    /// `provide_buffers`. The buffer group referenced by `group_id` must
+    /// contain at least one buffer for the read to work.
     /// `len` controls the number of bytes to read into the selected buffer.
     buffer_selection: struct {
         group_id: u16,
@@ -2746,9 +2815,9 @@ pub const RecvBuffer = union(enum) {
     /// io_uring will recv directly into this buffer
     buffer: []u8,
-
-    /// io_uring will select a buffer that has previously been provided with `provide_buffers`.
-    /// The buffer group referenced by `group_id` must contain at least one buffer for the recv call to work.
+    /// io_uring will select a buffer that has previously been provided with
+    /// `provide_buffers`. The buffer group referenced by `group_id` must
+    /// contain at least one buffer for the recv call to work.
     /// `len` controls the number of bytes to read into the selected buffer.
     buffer_selection: struct {
         group_id: u16,
     },
 };
 
-/// Filled with the offset for mmap(2)
-/// matches io_sqring_offsets in liburing
+/// Filled with the offsets for `mmap(2)`
+/// matches `io_sqring_offsets` in liburing
 pub const SqOffsets = extern struct {
     /// offset of ring head
     head: u32,
@@ -2777,7 +2846,7 @@ pub const SqOffsets = extern struct {
     user_addr: u64,
 };
 
-/// matches io_cqring_offsets in liburing
+/// matches `io_cqring_offsets` in liburing
 pub const CqOffsets = extern struct {
     head: u32,
     tail: u32,
@@ -2790,8 +2859,8 @@ pub const CqOffsets = extern struct {
     user_addr: u64,
 };
 
-/// Passed in for io_uring_setup(2). Copied back with updated info on success
-/// matches io_uring_params in liburing
+/// Passed in for `io_uring_setup(2)`.
Copied back with updated info on success +/// matches `io_uring_params` in liburing pub const Params = extern struct { sq_entries: u32, cq_entries: u32, @@ -2809,7 +2878,7 @@ pub const Params = extern struct { // deprecated, see struct io_uring_rsrc_update // COMMIT: add new io_uring_region_desc struct -/// matches io_uring_region_desc in liburing +/// matches `io_uring_region_desc` in liburing pub const RegionDesc = extern struct { user_addr: u64, size: u64, @@ -2827,7 +2896,7 @@ pub const RegionDesc = extern struct { }; // COMMIT: add new io_uring_mem_region_reg struct -/// matches io_uring_mem_region_reg in liburing +/// matches `io_uring_mem_region_reg` in liburing pub const MemRegionReg = extern struct { /// struct io_uring_region_desc (RegionDesc in Zig) region_uptr: u64, @@ -2841,7 +2910,7 @@ pub const MemRegionReg = extern struct { }; }; -/// matches io_uring_rsrc_register in liburing +/// matches `io_uring_rsrc_register` in liburing pub const RsrcRegister = extern struct { nr: u32, flags: Flags, @@ -2857,14 +2926,14 @@ pub const RsrcRegister = extern struct { }; }; -/// matches io_uring_rsrc_update in liburing +/// matches `io_uring_rsrc_update` in liburing pub const RsrcUpdate = extern struct { offset: u32, resv: u32, data: u64, }; -/// matches io_uring_rsrc_update2 in liburing +/// matches `io_uring_rsrc_update2` in liburing pub const RsrcUpdate2 = extern struct { offset: u32, resv: u32, @@ -2874,7 +2943,7 @@ pub const RsrcUpdate2 = extern struct { resv2: u32, }; -/// matches io_uring_probe_op in liburing +/// matches `io_uring_probe_op` in liburing pub const ProbeOp = extern struct { op: Op, resv: u8, @@ -2891,7 +2960,7 @@ pub const ProbeOp = extern struct { } }; -/// matches io_uring_probe in liburing +/// matches `io_uring_probe` in liburing pub const Probe = extern struct { /// Last opcode supported last_op: Op, @@ -2912,7 +2981,7 @@ pub const Probe = extern struct { // COMMIT: fix defination of io_uring_restriction // RegisterOp is actually u8 -/// matches io_uring_restriction in liburing +/// matches `io_uring_restriction` in liburing pub const Restriction = extern struct { opcode: RestrictionOp, arg: extern union { @@ -2928,14 +2997,14 @@ pub const Restriction = extern struct { }; // COMMIT: add new struct type -/// matches io_uring_clock_register in liburing +/// matches `io_uring_clock_register` in liburing pub const ClockRegister = extern struct { clockid: u32, __resv: [3]u32, }; // COMMIT: add new struct type -/// matches io_uring_clone_buffers in liburing +/// matches `io_uring_clone_buffers` in liburing pub const CloneBuffers = extern struct { src_fd: u32, flags: Flags, @@ -2952,7 +3021,7 @@ pub const CloneBuffers = extern struct { }; }; -/// matches io_uring_buf in liburing +/// matches `io_uring_buf` in liburing pub const Buffer = extern struct { addr: u64, len: u32, @@ -2960,7 +3029,7 @@ pub const Buffer = extern struct { resv: u16, }; -/// matches io_uring_buf_ring in liburing +/// matches `io_uring_buf_ring` in liburing pub const BufferRing = extern struct { resv1: u64, resv2: u32, @@ -2969,7 +3038,7 @@ pub const BufferRing = extern struct { }; /// argument for IORING_(UN)REGISTER_PBUF_RING -/// matches io_uring_buf_reg in liburing +/// matches `io_uring_buf_reg` in liburing pub const BufferRegister = extern struct { ring_addr: u64, ring_entries: u32, @@ -3001,7 +3070,7 @@ pub const BufferRegister = extern struct { }; /// argument for IORING_REGISTER_PBUF_STATUS -/// matches io_uring_buf_status in liburing +/// matches `io_uring_buf_status` in liburing pub 
const BufferStatus = extern struct { /// input buf_group: u32, @@ -3011,7 +3080,7 @@ pub const BufferStatus = extern struct { }; /// argument for IORING_(UN)REGISTER_NAPI -/// matches io_uring_napi in liburing +/// matches `io_uring_napi` in liburing pub const Napi = extern struct { busy_poll_to: u32, prefer_busy_poll: u8, @@ -3020,11 +3089,10 @@ pub const Napi = extern struct { }; // COMMIT: new struct type -/// Argument for io_uring_enter(2) with -/// IORING_GETEVENTS | IORING_ENTER_EXT_ARG_REG set, where the actual argument -/// is an index into a previously registered fixed wait region described by -/// the below structure. -/// matches io_uring_reg_wait in liburing +/// Argument for io_uring_enter(2) with IORING_GETEVENTS | IORING_ENTER_EXT_ARG_REG +/// set, where the actual argument is an index into a previously registered +/// fixed wait region described by the below structure. +/// matches `io_uring_reg_wait` in liburing pub const RegisterWait = extern struct { ts: linux.kernel_timespec, min_wait_usec: u32, @@ -3041,8 +3109,9 @@ pub const RegisterWait = extern struct { }; }; -/// Argument for io_uring_enter(2) with IORING_GETEVENTS | IORING_ENTER_EXT_ARG -/// matches io_uring_getevents_arg in liburing +/// Argument for `io_uring_enter(2)` with IORING_GETEVENTS | +/// IORING_ENTER_EXT_ARG +/// matches `io_uring_getevents_arg` in liburing pub const GetEventsArg = extern struct { sigmask: u64, sigmask_sz: u32, @@ -3052,7 +3121,7 @@ pub const GetEventsArg = extern struct { // COMMIT: fix type definition of io_uring_sync_cancel_reg /// Argument for IORING_REGISTER_SYNC_CANCEL -/// matches io_uring_sync_cancel_reg in liburing +/// matches `io_uring_sync_cancel_reg` in liburing pub const SyncCancelRegister = extern struct { addr: u64, fd: i32, @@ -3065,14 +3134,14 @@ pub const SyncCancelRegister = extern struct { /// Argument for IORING_REGISTER_FILE_ALLOC_RANGE /// The range is specified as [off, off + len) -/// matches io_uring_file_index_range in liburing +/// matches `io_uring_file_index_range` in liburing pub const FileIndexRange = extern struct { off: u32, len: u32, resv: u64, }; -/// matches io_uring_recvmsg_out in liburing +/// matches `io_uring_recvmsg_out` in liburing pub const RecvmsgOut = extern struct { namelen: u32, controllen: u32, @@ -3081,20 +3150,20 @@ pub const RecvmsgOut = extern struct { }; /// Zero copy receive refill queue entry -/// matches io_uring_zcrx_rqe in liburing +/// matches `io_uring_zcrx_rqe` in liburing pub const ZcrxRqe = extern struct { off: u64, len: u32, __pad: u32, }; -/// matches io_uring_zcrx_cqe in liburing +/// matches `io_uring_zcrx_cqe` in liburing pub const ZcrxCqe = extern struct { off: u64, __pad: u64, }; -/// matches io_uring_zcrx_offsets in liburing +/// matches `io_uring_zcrx_offsets` in liburing pub const ZcrxOffsets = extern struct { head: u32, tail: u32, @@ -3103,7 +3172,7 @@ pub const ZcrxOffsets = extern struct { __resv: [2]u64, }; -/// matches io_uring_zcrx_area_reg in liburing +/// matches `io_uring_zcrx_area_reg` in liburing pub const ZcrxAreaRegister = extern struct { addr: u64, len: u64, @@ -3119,7 +3188,7 @@ pub const ZcrxAreaRegister = extern struct { }; /// Argument for IORING_REGISTER_ZCRX_IFQ -/// matches io_uring_zcrx_ifq_reg in liburing +/// matches `io_uring_zcrx_ifq_reg` in liburing pub const ZcrxIfqRegister = extern struct { if_idx: u32, if_rxq: u32, @@ -3404,8 +3473,8 @@ pub const uflags = struct { } }; }; -/// io_uring_register(2) opcodes and arguments -/// matches io_uring_register_op in liburing +/// 
`io_uring_register(2)` opcodes and arguments +/// matches `io_uring_register_op` in liburing pub const RegisterOp = enum(u8) { register_buffers, unregister_buffers, @@ -3479,13 +3548,14 @@ pub const RegisterOp = enum(u8) { }; /// io-wq worker categories -/// matches io_wq_type in liburing +/// matches `io_wq_type` in liburing pub const IoWqCategory = enum(u8) { bound, unbound, _, }; +/// matches `io_uring_socket_op` in liburing pub const SocketOp = enum(u16) { siocin, siocoutq, @@ -3497,7 +3567,7 @@ pub const SocketOp = enum(u16) { }; /// io_uring_restriction.opcode values -/// matches io_uring_register_restriction_op in liburing +/// matches `io_uring_register_restriction_op` in liburing pub const RestrictionOp = enum(u16) { /// Allow an io_uring_register(2) opcode register_op = 0, @@ -3512,6 +3582,7 @@ pub const RestrictionOp = enum(u16) { }; /// IORING_OP_MSG_RING command types, stored in sqe.addr +/// matches `io_uring_msg_ring_flags` in liburing pub const MsgRingCmd = enum { /// pass sqe->len as 'res' and off as user_data data, @@ -3520,6 +3591,7 @@ pub const MsgRingCmd = enum { }; // COMMIT: OP to IoUring +/// matches `io_uring_op` in liburing pub const Op = enum(u8) { nop, readv, From 2a8565b1f88087449081ac2bd20861dc57e6bf33 Mon Sep 17 00:00:00 2001 From: Bernard Assan Date: Sun, 12 Oct 2025 15:47:18 +0000 Subject: [PATCH 21/41] Remove unnecessary use of @as coercion update `cq_ring_needs_flush()` to match liburing replace FilesUpdate with RsrcUpdate struct Signed-off-by: Bernard Assan --- lib/std/os/linux/IoUring.zig | 472 +++++++++++++++++------------------ 1 file changed, 236 insertions(+), 236 deletions(-) diff --git a/lib/std/os/linux/IoUring.zig b/lib/std/os/linux/IoUring.zig index d713b271b657..a88ea8e91845 100644 --- a/lib/std/os/linux/IoUring.zig +++ b/lib/std/os/linux/IoUring.zig @@ -72,7 +72,7 @@ pub fn init_params(entries: u16, p: *Params) !IoUring { .NOSYS => return error.SystemOutdated, else => |errno| return posix.unexpectedErrno(errno), } - const fd = @as(linux.fd_t, @intCast(res)); + const fd: linux.fd_t = @intCast(res); assert(fd >= 0); errdefer posix.close(fd); @@ -231,7 +231,7 @@ pub fn enter(self: *IoUring, to_submit: u32, min_complete: u32, flags: uflags.En .INTR => return error.SignalInterrupt, else => |errno| return posix.unexpectedErrno(errno), } - return @as(u32, @intCast(res)); + return @intCast(res); } /// Sync internal state with kernel ring state on the SQ side. @@ -277,8 +277,8 @@ pub fn sq_ring_needs_enter(self: *IoUring, flags: *uflags.Enter) bool { /// i.e. its length. These are SQEs that the kernel is yet to consume. /// Matches the implementation of `io_uring_sq_ready()` in liburing. pub fn sq_ready(self: *IoUring) u32 { - // Always use the shared ring state (i.e. head and not sqe_head) to avoid going out of sync, - // see https://github.com/axboe/liburing/issues/92. + // Always use the shared ring state (i.e. head and not sqe_head) to avoid + // going out of sync, see https://github.com/axboe/liburing/issues/92. return self.sq.sqe_tail -% @atomicLoad(u32, self.sq.head, .acquire); } @@ -347,7 +347,9 @@ pub fn copy_cqe(ring: *IoUring) !Cqe { /// Matches the implementation of `cq_ring_needs_flush()` in liburing. 
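The `@as` removals in this commit lean on Zig's result-location semantics: when the destination type is already pinned, a bare builtin cast infers it. A minimal illustration (not from the patch itself):

    const res: usize = 42; // stand-in for a raw syscall return value
    const before = @as(linux.fd_t, @intCast(res)); // old spelling
    const after: linux.fd_t = @intCast(res); // new: the type flows from the binding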
 pub fn cq_ring_needs_flush(self: *IoUring) bool {
-    return @atomicLoad(Sq.Flags, self.sq.flags, .unordered).cq_overflow;
+    // Flush if CQEs have overflowed or the kernel has pending task work.
+    const sq_flags = @atomicLoad(Sq.Flags, self.sq.flags, .unordered);
+    return sq_flags.cq_overflow or sq_flags.taskrun;
 }
 
 /// For advanced use cases only that implement custom completion queue methods.
@@ -1197,8 +1199,8 @@ pub fn register_files(self: *IoUring, fds: []const linux.fd_t) !void {
     const res = linux.io_uring_register(
         self.fd,
         .register_files,
-        @as(*const anyopaque, @ptrCast(fds.ptr)),
-        @as(u32, @intCast(fds.len)),
+        @ptrCast(fds.ptr),
+        @intCast(fds.len),
     );
     try handle_registration_result(res);
 }
@@ -1216,22 +1218,17 @@ pub fn register_files_update(self: *IoUring, offset: u32, fds: []const linux.fd_t) !void {
     assert(self.fd >= 0);
 
-    const FilesUpdate = extern struct {
-        offset: u32,
-        resv: u32,
-        fds: u64 align(8),
-    };
-    var update: FilesUpdate = .{
+    var update: RsrcUpdate = .{
         .offset = offset,
-        .resv = @as(u32, 0),
-        .fds = @as(u64, @intFromPtr(fds.ptr)),
+        .resv = 0,
+        .data = @intFromPtr(fds.ptr),
     };
 
     const res = linux.io_uring_register(
         self.fd,
         .register_files_update,
-        @as(*const anyopaque, @ptrCast(&update)),
-        @as(u32, @intCast(fds.len)),
+        @ptrCast(&update),
+        @intCast(fds.len),
     );
     try handle_registration_result(res);
 }
@@ -1252,7 +1249,7 @@ pub fn register_files_sparse(self: *IoUring, nr_files: u32) !void {
         self.fd,
         .register_files2,
         @ptrCast(&reg),
-        @as(u32, @sizeOf(RsrcRegister)),
+        @sizeOf(RsrcRegister),
     );
 
     return handle_registration_result(res);
@@ -1273,7 +1270,7 @@ pub fn register_file_alloc_range(self: *IoUring, offset: u32, len: u32) !void {
         self.fd,
         .register_file_alloc_range,
         @ptrCast(&range),
-        @as(u32, @sizeOf(FileIndexRange)),
+        @sizeOf(FileIndexRange),
     );
 
     return handle_registration_result(res);
@@ -1287,7 +1284,7 @@ pub fn register_eventfd(self: *IoUring, fd: linux.fd_t) !void {
     const res = linux.io_uring_register(
         self.fd,
         .register_eventfd,
-        @as(*const anyopaque, @ptrCast(&fd)),
+        @ptrCast(&fd),
         1,
     );
     try handle_registration_result(res);
@@ -1303,7 +1300,7 @@ pub fn register_eventfd_async(self: *IoUring, fd: linux.fd_t) !void {
     const res = linux.io_uring_register(
         self.fd,
         .register_eventfd_async,
-        @as(*const anyopaque, @ptrCast(&fd)),
+        @ptrCast(&fd),
         1,
     );
     try handle_registration_result(res);
@@ -1340,7 +1337,7 @@ pub fn register_buffers(self: *IoUring, buffers: []const posix.iovec) !void {
         self.fd,
         .register_buffers,
         buffers.ptr,
-        @as(u32, @intCast(buffers.len)),
+        @intCast(buffers.len),
     );
     try handle_registration_result(res);
 }
@@ -1574,7 +1571,7 @@ pub fn setup_buf_ring(
     if (entries == 0 or entries > 1 << 15) return error.EntriesNotInRange;
     if (!std.math.isPowerOfTwo(entries)) return error.EntriesNotPowerOfTwo;
 
-    const mmap_size = @as(usize, entries) * @sizeOf(Buffer);
+    const mmap_size: usize = entries * @sizeOf(Buffer);
     const mmap = try posix.mmap(
         null,
         mmap_size,
@@ -1609,7 +1606,7 @@ fn register_buf_ring(
         // Retry without incremental buffer consumption.
        // It is available since kernel 6.12. returns INVAL on older.
reg.flags.iou_pbuf_ring_inc = false; - res = linux.io_uring_register(fd, .register_pbuf_ring, @as(*const anyopaque, @ptrCast(®)), 1); + res = linux.io_uring_register(fd, .register_pbuf_ring, @ptrCast(®), 1); } try handle_register_buf_ring_result(res); } @@ -1621,7 +1618,7 @@ fn unregister_buf_ring(fd: posix.fd_t, group_id: u16) !void { const res = linux.io_uring_register( fd, .unregister_pbuf_ring, - @as(*const anyopaque, @ptrCast(®)), + @ptrCast(®), 1, ); try handle_register_buf_ring_result(res); @@ -1735,7 +1732,7 @@ pub const Cqe = extern struct { pub fn err(self: Cqe) linux.E { if (self.res > -4096 and self.res < 0) { - return @as(linux.E, @enumFromInt(-self.res)); + return @enumFromInt(-self.res); } return .SUCCESS; } @@ -2219,7 +2216,7 @@ pub const Sqe = extern struct { // bits of poll masks are consistently and properly read across // multiple kernel versions, poll masks are enforced to be // little-endian. https://www.spinics.net/lists/io-uring/msg02848.html - sqe.rw_flags = std.mem.nativeToLittle(u32, @as(u32, @bitCast(poll_mask))); + sqe.rw_flags = std.mem.nativeToLittle(u32, @bitCast(poll_mask)); } pub fn prep_poll_remove( @@ -2243,7 +2240,7 @@ pub const Sqe = extern struct { // bits of poll masks are consistently and properly read across // multiple kernel versions, poll masks are enforced to be // little-endian. https://www.spinics.net/lists/io-uring/msg02848.html - sqe.rw_flags = std.mem.nativeToLittle(u32, @as(u32, @bitCast(poll_mask))); + sqe.rw_flags = std.mem.nativeToLittle(u32, @bitCast(poll_mask)); } pub fn prep_fallocate( @@ -2431,7 +2428,7 @@ pub const Sqe = extern struct { protocol: linux.IpProto, flags: u32, // flags is unused ) void { - sqe.prep_rw(.socket, @intFromEnum(domain), 0, @intFromEnum(protocol), @intCast(@as(u32, @bitCast(socket_type)))); + sqe.prep_rw(.socket, @intFromEnum(domain), 0, @intFromEnum(protocol), @as(u32, @bitCast(socket_type))); sqe.rw_flags = flags; } @@ -2771,7 +2768,7 @@ pub const BufferGroup = struct { /// Get buffer by CQE. 
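A sketch of consuming completions with the errno mapping above (illustrative; `handle` is a hypothetical success path, and for reads/writes `cqe.res` holds the byte count):

    const cqe = try ring.copy_cqe();
    switch (cqe.err()) {
        .SUCCESS => handle(cqe),
        .AGAIN => {}, // hypothetical retry policy
        else => |errno| return posix.unexpectedErrno(errno),
    }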
pub fn get(self: *BufferGroup, cqe: Cqe) ![]u8 { const buffer_id = try cqe.buffer_id(); - const used_len = @as(usize, @intCast(cqe.res)); + const used_len: usize = @intCast(cqe.res); return self.get_by_id(buffer_id)[0..used_len]; } @@ -2781,7 +2778,7 @@ pub const BufferGroup = struct { if (cqe.flags.f_buf_more) { // Incremental consumption active, kernel will write to the this // buffer again - const used_len = @as(u32, @intCast(cqe.res)); + const used_len: u32 = @intCast(cqe.res); // Track what part of the buffer is used self.heads[buffer_id] += used_len; return; @@ -3665,9 +3662,9 @@ pub const Op = enum(u8) { test "structs/offsets/entries" { if (!is_linux) return error.SkipZigTest; - try testing.expectEqual(@as(usize, 120), @sizeOf(Params)); - try testing.expectEqual(@as(usize, 64), @sizeOf(Sqe)); - try testing.expectEqual(@as(usize, 16), @sizeOf(Cqe)); + try testing.expectEqual(120, @sizeOf(Params)); + try testing.expectEqual(64, @sizeOf(Sqe)); + try testing.expectEqual(16, @sizeOf(Cqe)); try testing.expectEqual(0, constants.OFF_SQ_RING); try testing.expectEqual(0x8000000, constants.OFF_CQ_RING); @@ -3687,7 +3684,7 @@ test "nop" { }; defer { ring.deinit(); - testing.expectEqual(@as(linux.fd_t, -1), ring.fd) catch @panic("test failed"); + testing.expectEqual(-1, ring.fd) catch @panic("test failed"); } const sqe = try ring.nop(0xaaaaaaaa); @@ -3708,40 +3705,40 @@ test "nop" { .resv = 0, }, sqe.*); - try testing.expectEqual(@as(u32, 0), ring.sq.sqe_head); - try testing.expectEqual(@as(u32, 1), ring.sq.sqe_tail); - try testing.expectEqual(@as(u32, 0), ring.sq.tail.*); - try testing.expectEqual(@as(u32, 0), ring.cq.head.*); - try testing.expectEqual(@as(u32, 1), ring.sq_ready()); - try testing.expectEqual(@as(u32, 0), ring.cq_ready()); + try testing.expectEqual(0, ring.sq.sqe_head); + try testing.expectEqual(1, ring.sq.sqe_tail); + try testing.expectEqual(0, ring.sq.tail.*); + try testing.expectEqual(0, ring.cq.head.*); + try testing.expectEqual(1, ring.sq_ready()); + try testing.expectEqual(0, ring.cq_ready()); - try testing.expectEqual(@as(u32, 1), try ring.submit()); - try testing.expectEqual(@as(u32, 1), ring.sq.sqe_head); - try testing.expectEqual(@as(u32, 1), ring.sq.sqe_tail); - try testing.expectEqual(@as(u32, 1), ring.sq.tail.*); - try testing.expectEqual(@as(u32, 0), ring.cq.head.*); - try testing.expectEqual(@as(u32, 0), ring.sq_ready()); + try testing.expectEqual(1, try ring.submit()); + try testing.expectEqual(1, ring.sq.sqe_head); + try testing.expectEqual(1, ring.sq.sqe_tail); + try testing.expectEqual(1, ring.sq.tail.*); + try testing.expectEqual(0, ring.cq.head.*); + try testing.expectEqual(0, ring.sq_ready()); try testing.expectEqual(Cqe{ .user_data = 0xaaaaaaaa, .res = 0, .flags = .{}, }, try ring.copy_cqe()); - try testing.expectEqual(@as(u32, 1), ring.cq.head.*); - try testing.expectEqual(@as(u32, 0), ring.cq_ready()); + try testing.expectEqual(1, ring.cq.head.*); + try testing.expectEqual(0, ring.cq_ready()); const sqe_barrier = try ring.nop(0xbbbbbbbb); sqe_barrier.flags.io_drain = true; - try testing.expectEqual(@as(u32, 1), try ring.submit()); + try testing.expectEqual(1, try ring.submit()); try testing.expectEqual(Cqe{ .user_data = 0xbbbbbbbb, .res = 0, .flags = .{}, }, try ring.copy_cqe()); - try testing.expectEqual(@as(u32, 2), ring.sq.sqe_head); - try testing.expectEqual(@as(u32, 2), ring.sq.sqe_tail); - try testing.expectEqual(@as(u32, 2), ring.sq.tail.*); - try testing.expectEqual(@as(u32, 2), ring.cq.head.*); + try testing.expectEqual(2, ring.sq.sqe_head); + 
try testing.expectEqual(2, ring.sq.sqe_tail); + try testing.expectEqual(2, ring.sq.tail.*); + try testing.expectEqual(2, ring.cq.head.*); } test "readv" { @@ -3775,7 +3772,7 @@ test "readv" { sqe.flags.fixed_file = true; try testing.expectError(error.SubmissionQueueFull, ring.nop(0)); - try testing.expectEqual(@as(u32, 1), try ring.submit()); + try testing.expectEqual(1, try ring.submit()); try testing.expectEqual(Cqe{ .user_data = 0xcccccccc, .res = buffer.len, @@ -3815,43 +3812,43 @@ test "writev/fsync/readv" { const sqe_writev = try ring.writev(0xdddddddd, fd, iovecs_write[0..], 17); try testing.expectEqual(Op.writev, sqe_writev.opcode); - try testing.expectEqual(@as(u64, 17), sqe_writev.off); - sqe_writev.flags.io_link = true; + try testing.expectEqual(17, sqe_writev.off); + sqe_writev.link_next(); const sqe_fsync = try ring.fsync(0xeeeeeeee, fd, .{}); - try testing.expectEqual(Op.fsync, sqe_fsync.opcode); + try testing.expectEqual(.fsync, sqe_fsync.opcode); try testing.expectEqual(fd, sqe_fsync.fd); - sqe_fsync.flags.io_link = true; + sqe_fsync.link_next(); const sqe_readv = try ring.read(0xffffffff, fd, .{ .iovecs = iovecs_read[0..] }, 17); try testing.expectEqual(Op.readv, sqe_readv.opcode); - try testing.expectEqual(@as(u64, 17), sqe_readv.off); + try testing.expectEqual(17, sqe_readv.off); - try testing.expectEqual(@as(u32, 3), ring.sq_ready()); - try testing.expectEqual(@as(u32, 3), try ring.submit_and_wait(3)); - try testing.expectEqual(@as(u32, 0), ring.sq_ready()); - try testing.expectEqual(@as(u32, 3), ring.cq_ready()); + try testing.expectEqual(3, ring.sq_ready()); + try testing.expectEqual(3, try ring.submit_and_wait(3)); + try testing.expectEqual(0, ring.sq_ready()); + try testing.expectEqual(3, ring.cq_ready()); try testing.expectEqual(Cqe{ .user_data = 0xdddddddd, .res = buffer_write.len, .flags = .{}, }, try ring.copy_cqe()); - try testing.expectEqual(@as(u32, 2), ring.cq_ready()); + try testing.expectEqual(2, ring.cq_ready()); try testing.expectEqual(Cqe{ .user_data = 0xeeeeeeee, .res = 0, .flags = .{}, }, try ring.copy_cqe()); - try testing.expectEqual(@as(u32, 1), ring.cq_ready()); + try testing.expectEqual(1, ring.cq_ready()); try testing.expectEqual(Cqe{ .user_data = 0xffffffff, .res = buffer_read.len, .flags = .{}, }, try ring.copy_cqe()); - try testing.expectEqual(@as(u32, 0), ring.cq_ready()); + try testing.expectEqual(0, ring.cq_ready()); try testing.expectEqualSlices(u8, buffer_write[0..], buffer_read[0..]); } @@ -3877,12 +3874,12 @@ test "write/read" { var buffer_read = [_]u8{98} ** 20; const sqe_write = try ring.write(0x11111111, fd, buffer_write[0..], 10); try testing.expectEqual(Op.write, sqe_write.opcode); - try testing.expectEqual(@as(u64, 10), sqe_write.off); + try testing.expectEqual(10, sqe_write.off); sqe_write.flags.io_link = true; const sqe_read = try ring.read(0x22222222, fd, .{ .buffer = buffer_read[0..] 
}, 10); try testing.expectEqual(Op.read, sqe_read.opcode); - try testing.expectEqual(@as(u64, 10), sqe_read.off); - try testing.expectEqual(@as(u32, 2), try ring.submit()); + try testing.expectEqual(10, sqe_read.off); + try testing.expectEqual(2, try ring.submit()); const cqe_write = try ring.copy_cqe(); const cqe_read = try ring.copy_cqe(); @@ -3933,20 +3930,20 @@ test "splice/read" { const sqe_splice_to_pipe = try ring.splice(0x11111111, fd_src, 0, fds[1], pipe_offset, buffer_write.len); try testing.expectEqual(Op.splice, sqe_splice_to_pipe.opcode); - try testing.expectEqual(@as(u64, 0), sqe_splice_to_pipe.addr); + try testing.expectEqual(0, sqe_splice_to_pipe.addr); try testing.expectEqual(pipe_offset, sqe_splice_to_pipe.off); sqe_splice_to_pipe.link_next(); const sqe_splice_from_pipe = try ring.splice(0x22222222, fds[0], pipe_offset, fd_dst, 10, buffer_write.len); try testing.expectEqual(Op.splice, sqe_splice_from_pipe.opcode); try testing.expectEqual(pipe_offset, sqe_splice_from_pipe.addr); - try testing.expectEqual(@as(u64, 10), sqe_splice_from_pipe.off); + try testing.expectEqual(10, sqe_splice_from_pipe.off); sqe_splice_from_pipe.link_next(); const sqe_read = try ring.read(0x33333333, fd_dst, .{ .buffer = buffer_read[0..] }, 10); try testing.expectEqual(Op.read, sqe_read.opcode); - try testing.expectEqual(@as(u64, 10), sqe_read.off); - try testing.expectEqual(@as(u32, 3), try ring.submit()); + try testing.expectEqual(10, sqe_read.off); + try testing.expectEqual(3, try ring.submit()); const cqe_splice_to_pipe = try ring.copy_cqe(); const cqe_splice_from_pipe = try ring.copy_cqe(); @@ -4011,26 +4008,26 @@ test "write_fixed/read_fixed" { const sqe_write = try ring.write_fixed(0x45454545, fd, &buffers[0], 3, 0); try testing.expectEqual(Op.write_fixed, sqe_write.opcode); - try testing.expectEqual(@as(u64, 3), sqe_write.off); + try testing.expectEqual(3, sqe_write.off); sqe_write.link_next(); const sqe_read = try ring.read_fixed(0x12121212, fd, &buffers[1], 0, 1); try testing.expectEqual(Op.read_fixed, sqe_read.opcode); - try testing.expectEqual(@as(u64, 0), sqe_read.off); + try testing.expectEqual(0, sqe_read.off); - try testing.expectEqual(@as(u32, 2), try ring.submit()); + try testing.expectEqual(2, try ring.submit()); const cqe_write = try ring.copy_cqe(); const cqe_read = try ring.copy_cqe(); try testing.expectEqual(Cqe{ .user_data = 0x45454545, - .res = @as(i32, @intCast(buffers[0].len)), + .res = @intCast(buffers[0].len), .flags = .{}, }, cqe_write); try testing.expectEqual(Cqe{ .user_data = 0x12121212, - .res = @as(i32, @intCast(buffers[1].len)), + .res = @intCast(buffers[1].len), .flags = .{}, }, cqe_read); @@ -4080,15 +4077,15 @@ test "openat" { .addr3 = 0, .resv = 0, }, sqe_openat.*); - try testing.expectEqual(@as(u32, 1), try ring.submit()); + try testing.expectEqual(1, try ring.submit()); const cqe_openat = try ring.copy_cqe(); - try testing.expectEqual(@as(u64, 0x33333333), cqe_openat.user_data); + try testing.expectEqual(0x33333333, cqe_openat.user_data); if (cqe_openat.err() == .INVAL) return error.SkipZigTest; if (cqe_openat.err() == .BADF) return error.SkipZigTest; if (cqe_openat.res <= 0) std.debug.print("\ncqe_openat.res={}\n", .{cqe_openat.res}); try testing.expect(cqe_openat.res > 0); - try testing.expectEqual(@as(Cqe.Flags, @bitCast(@as(u32, 0))), cqe_openat.flags); + try testing.expectEqual(@as(Cqe.Flags, .{}), cqe_openat.flags); posix.close(cqe_openat.res); } @@ -4113,7 +4110,7 @@ test "close" { const sqe_close = try ring.close(0x44444444, file.handle); try 
testing.expectEqual(Op.close, sqe_close.opcode); try testing.expectEqual(file.handle, sqe_close.fd); - try testing.expectEqual(@as(u32, 1), try ring.submit()); + try testing.expectEqual(1, try ring.submit()); const cqe_close = try ring.copy_cqe(); if (cqe_close.err() == .INVAL) return error.SkipZigTest; @@ -4144,7 +4141,7 @@ test "accept/connect/send/recv" { sqe_send.link_next(); _ = try ring.recv(0xffffffff, socket_test_harness.server, .{ .buffer = buffer_recv[0..] }, .{}); - try testing.expectEqual(@as(u32, 2), try ring.submit()); + try testing.expectEqual(2, try ring.submit()); const cqe_send = try ring.copy_cqe(); if (cqe_send.err() == .INVAL) return error.SkipZigTest; @@ -4184,8 +4181,8 @@ test "sendmsg/recvmsg" { const server = try posix.socket(address_server.family, posix.SOCK.DGRAM, 0); defer posix.close(server); - try posix.setsockopt(server, posix.SOL.SOCKET, posix.SO.REUSEPORT, &mem.toBytes(@as(c_int, 1))); - try posix.setsockopt(server, posix.SOL.SOCKET, posix.SO.REUSEADDR, &mem.toBytes(@as(c_int, 1))); + try posix.setsockopt(server, posix.SOL.SOCKET, posix.SO.REUSEPORT, &mem.toBytes(@as(u32, 1))); + try posix.setsockopt(server, posix.SOL.SOCKET, posix.SO.REUSEADDR, &mem.toBytes(@as(u32, 1))); try posix.bind(server, addrAny(&address_server), @sizeOf(linux.sockaddr.in)); // set address_server to the OS-chosen IP/port. @@ -4234,10 +4231,10 @@ test "sendmsg/recvmsg" { try testing.expectEqual(Op.recvmsg, sqe_recvmsg.opcode); try testing.expectEqual(server, sqe_recvmsg.fd); - try testing.expectEqual(@as(u32, 2), ring.sq_ready()); - try testing.expectEqual(@as(u32, 2), try ring.submit_and_wait(2)); - try testing.expectEqual(@as(u32, 0), ring.sq_ready()); - try testing.expectEqual(@as(u32, 2), ring.cq_ready()); + try testing.expectEqual(2, ring.sq_ready()); + try testing.expectEqual(2, try ring.submit_and_wait(2)); + try testing.expectEqual(0, ring.sq_ready()); + try testing.expectEqual(2, ring.cq_ready()); const cqe_sendmsg = try ring.copy_cqe(); if (cqe_sendmsg.res == -@as(i32, @intFromEnum(linux.E.INVAL))) return error.SkipZigTest; @@ -4271,13 +4268,12 @@ test "timeout (after a relative time)" { defer ring.deinit(); const ms = 10; - const margin = 5; const ts: linux.kernel_timespec = .{ .sec = 0, .nsec = ms * 1000000 }; const started = try std.Io.Clock.awake.now(io); const sqe = try ring.timeout(0x55555555, &ts, 0, .{}); try testing.expectEqual(Op.timeout, sqe.opcode); - try testing.expectEqual(@as(u32, 1), try ring.submit()); + try testing.expectEqual(1, try ring.submit()); const cqe = try ring.copy_cqe(); const stopped = try std.Io.Clock.awake.now(io); @@ -4288,6 +4284,7 @@ test "timeout (after a relative time)" { }, cqe); // Tests should not depend on timings: skip test if outside margin. 
+ const margin = 5; const ms_elapsed = started.durationTo(stopped).toMilliseconds(); if (ms_elapsed > margin) return error.SkipZigTest; } @@ -4308,7 +4305,7 @@ test "timeout (after a number of completions)" { try testing.expectEqual(Op.timeout, sqe_timeout.opcode); try testing.expectEqual(count_completions, sqe_timeout.off); _ = try ring.nop(0x77777777); - try testing.expectEqual(@as(u32, 2), try ring.submit()); + try testing.expectEqual(2, try ring.submit()); const cqe_nop = try ring.copy_cqe(); try testing.expectEqual(Cqe{ @@ -4338,14 +4335,14 @@ test "timeout_remove" { const ts: linux.kernel_timespec = .{ .sec = 3, .nsec = 0 }; const sqe_timeout = try ring.timeout(0x88888888, &ts, 0, .{}); try testing.expectEqual(Op.timeout, sqe_timeout.opcode); - try testing.expectEqual(@as(u64, 0x88888888), sqe_timeout.user_data); + try testing.expectEqual(0x88888888, sqe_timeout.user_data); const sqe_timeout_remove = try ring.timeout_remove(0x99999999, 0x88888888, .{}); try testing.expectEqual(Op.timeout_remove, sqe_timeout_remove.opcode); - try testing.expectEqual(@as(u64, 0x88888888), sqe_timeout_remove.addr); - try testing.expectEqual(@as(u64, 0x99999999), sqe_timeout_remove.user_data); + try testing.expectEqual(0x88888888, sqe_timeout_remove.addr); + try testing.expectEqual(0x99999999, sqe_timeout_remove.user_data); - try testing.expectEqual(@as(u32, 2), try ring.submit()); + try testing.expectEqual(2, try ring.submit()); // The order in which the CQE arrive is not clearly documented and it changed with kernel 5.18: // * kernel 5.10 gives user data 0x88888888 first, 0x99999999 second @@ -4407,7 +4404,7 @@ test "accept/connect/recv/link_timeout" { _ = try ring.link_timeout(0x22222222, &ts, .{}); const nr_wait = try ring.submit(); - try testing.expectEqual(@as(u32, 2), nr_wait); + try testing.expectEqual(2, nr_wait); var i: usize = 0; while (i < nr_wait) : (i += 1) { @@ -4451,13 +4448,13 @@ test "fallocate" { const file = try tmp.dir.createFile(path, .{ .truncate = true, .mode = 0o666 }); defer file.close(); - try testing.expectEqual(@as(u64, 0), (try file.stat()).size); + try testing.expectEqual(0, (try file.stat()).size); const len: u64 = 65536; const sqe = try ring.fallocate(0xaaaaaaaa, file.handle, 0, 0, len); try testing.expectEqual(Op.fallocate, sqe.opcode); try testing.expectEqual(file.handle, sqe.fd); - try testing.expectEqual(@as(u32, 1), try ring.submit()); + try testing.expectEqual(1, try ring.submit()); const cqe = try ring.copy_cqe(); switch (cqe.err()) { @@ -4496,7 +4493,7 @@ test "statx" { const file = try tmp.dir.createFile(path, .{ .truncate = true, .mode = 0o666 }); defer file.close(); - try testing.expectEqual(@as(u64, 0), (try file.stat()).size); + try testing.expectEqual(0, (try file.stat()).size); try file.writeAll("foobar"); @@ -4510,8 +4507,8 @@ test "statx" { &buf, ); try testing.expectEqual(Op.statx, sqe.opcode); - try testing.expectEqual(@as(i32, tmp.dir.fd), sqe.fd); - try testing.expectEqual(@as(u32, 1), try ring.submit()); + try testing.expectEqual(tmp.dir.fd, sqe.fd); + try testing.expectEqual(1, try ring.submit()); const cqe = try ring.copy_cqe(); switch (cqe.err()) { @@ -4534,7 +4531,7 @@ test "statx" { }, cqe); try testing.expect(buf.mask.size); - try testing.expectEqual(@as(u64, 6), buf.size); + try testing.expectEqual(6, buf.size); } test "accept/connect/recv/cancel" { @@ -4553,13 +4550,13 @@ test "accept/connect/recv/cancel" { var buffer_recv = [_]u8{ 0, 1, 0, 1, 0 }; _ = try ring.recv(0xffffffff, socket_test_harness.server, .{ .buffer = buffer_recv[0..] 
}, .{}); - try testing.expectEqual(@as(u32, 1), try ring.submit()); + try testing.expectEqual(1, try ring.submit()); const sqe_cancel = try ring.cancel(0x99999999, 0xffffffff, .{}); try testing.expectEqual(Op.async_cancel, sqe_cancel.opcode); - try testing.expectEqual(@as(u64, 0xffffffff), sqe_cancel.addr); - try testing.expectEqual(@as(u64, 0x99999999), sqe_cancel.user_data); - try testing.expectEqual(@as(u32, 1), try ring.submit()); + try testing.expectEqual(0xffffffff, sqe_cancel.addr); + try testing.expectEqual(0x99999999, sqe_cancel.user_data); + try testing.expectEqual(1, try ring.submit()); var cqe_recv = try ring.copy_cqe(); if (cqe_recv.err() == .INVAL) return error.SkipZigTest; @@ -4628,7 +4625,7 @@ test "register_files_update" { try testing.expectEqual(Op.read, sqe.opcode); sqe.set_flags(.{ .fixed_file = true }); - try testing.expectEqual(@as(u32, 1), try ring.submit()); + try testing.expectEqual(1, try ring.submit()); try testing.expectEqual(Cqe{ .user_data = 0xcccccccc, .res = buffer.len, @@ -4649,7 +4646,7 @@ test "register_files_update" { try testing.expectEqual(Op.read, sqe.opcode); sqe.set_flags(.{ .fixed_file = true }); - try testing.expectEqual(@as(u32, 1), try ring.submit()); + try testing.expectEqual(1, try ring.submit()); try testing.expectEqual(Cqe{ .user_data = 0xcccccccc, .res = buffer.len, @@ -4666,7 +4663,7 @@ test "register_files_update" { try testing.expectEqual(Op.read, sqe.opcode); sqe.set_flags(.{ .fixed_file = true }); - try testing.expectEqual(@as(u32, 1), try ring.submit()); + try testing.expectEqual(1, try ring.submit()); const cqe = try ring.copy_cqe(); try testing.expectEqual(linux.E.BADF, cqe.err()); } @@ -4694,7 +4691,7 @@ test "shutdown" { // TODO: update posix later to use Typed Flags const server = try posix.socket(address.any.family, @as(u32, @bitCast(linux.Sock{ .type = .stream, .flags = .{ .cloexec = true } })), 0); defer posix.close(server); - try posix.setsockopt(server, posix.SOL.SOCKET, posix.SO.REUSEADDR, &mem.toBytes(@as(c_int, 1))); + try posix.setsockopt(server, posix.SOL.SOCKET, posix.SO.REUSEADDR, &mem.toBytes(@as(u32, 1))); try posix.bind(server, addrAny(&address), @sizeOf(linux.sockaddr.in)); try posix.listen(server, 1); @@ -4704,9 +4701,9 @@ test "shutdown" { const shutdown_sqe = try ring.shutdown(0x445445445, server, .rd); try testing.expectEqual(Op.shutdown, shutdown_sqe.opcode); - try testing.expectEqual(@as(i32, server), shutdown_sqe.fd); + try testing.expectEqual(server, shutdown_sqe.fd); - try testing.expectEqual(@as(u32, 1), try ring.submit()); + try testing.expectEqual(1, try ring.submit()); const cqe = try ring.copy_cqe(); switch (cqe.err()) { @@ -4732,12 +4729,12 @@ test "shutdown" { else => |errno| std.debug.panic("unhandled errno: {}", .{errno}), }; try testing.expectEqual(Op.shutdown, shutdown_sqe.opcode); - try testing.expectEqual(@as(i32, server), shutdown_sqe.fd); + try testing.expectEqual(server, shutdown_sqe.fd); - try testing.expectEqual(@as(u32, 1), try ring.submit()); + try testing.expectEqual(1, try ring.submit()); const cqe = try ring.copy_cqe(); - try testing.expectEqual(@as(u64, 0x445445445), cqe.user_data); + try testing.expectEqual(0x445445445, cqe.user_data); try testing.expectEqual(linux.E.NOTCONN, cqe.err()); } } @@ -4775,9 +4772,9 @@ test "renameat" { .{}, ); try testing.expectEqual(Op.renameat, sqe.opcode); - try testing.expectEqual(@as(i32, tmp.dir.fd), sqe.fd); - try testing.expectEqual(@as(i32, tmp.dir.fd), @as(i32, @bitCast(sqe.len))); - try testing.expectEqual(@as(u32, 1), try ring.submit()); + 
try testing.expectEqual(tmp.dir.fd, sqe.fd); + try testing.expectEqual(tmp.dir.fd, @as(i32, @intCast(sqe.len))); + try testing.expectEqual(1, try ring.submit()); const cqe = try ring.copy_cqe(); switch (cqe.err()) { @@ -4829,8 +4826,8 @@ test "unlinkat" { .{}, ); try testing.expectEqual(Op.unlinkat, sqe.opcode); - try testing.expectEqual(@as(i32, tmp.dir.fd), sqe.fd); - try testing.expectEqual(@as(u32, 1), try ring.submit()); + try testing.expectEqual(tmp.dir.fd, sqe.fd); + try testing.expectEqual(1, try ring.submit()); const cqe = try ring.copy_cqe(); switch (cqe.err()) { @@ -4876,8 +4873,8 @@ test "mkdirat" { 0o0755, ); try testing.expectEqual(Op.mkdirat, sqe.opcode); - try testing.expectEqual(@as(i32, tmp.dir.fd), sqe.fd); - try testing.expectEqual(@as(u32, 1), try ring.submit()); + try testing.expectEqual(tmp.dir.fd, sqe.fd); + try testing.expectEqual(1, try ring.submit()); const cqe = try ring.copy_cqe(); switch (cqe.err()) { @@ -4924,8 +4921,8 @@ test "symlinkat" { link_path, ); try testing.expectEqual(Op.symlinkat, sqe.opcode); - try testing.expectEqual(@as(i32, tmp.dir.fd), sqe.fd); - try testing.expectEqual(@as(u32, 1), try ring.submit()); + try testing.expectEqual(tmp.dir.fd, sqe.fd); + try testing.expectEqual(1, try ring.submit()); const cqe = try ring.copy_cqe(); switch (cqe.err()) { @@ -4977,9 +4974,9 @@ test "linkat" { .{}, ); try testing.expectEqual(Op.linkat, sqe.opcode); - try testing.expectEqual(@as(i32, tmp.dir.fd), sqe.fd); - try testing.expectEqual(@as(i32, tmp.dir.fd), @as(i32, @bitCast(sqe.len))); - try testing.expectEqual(@as(u32, 1), try ring.submit()); + try testing.expectEqual(tmp.dir.fd, sqe.fd); + try testing.expectEqual(tmp.dir.fd, @as(i32, @intCast(sqe.len))); + try testing.expectEqual(1, try ring.submit()); const cqe = try ring.copy_cqe(); switch (cqe.err()) { @@ -5022,12 +5019,12 @@ test "provide_buffers: read" { // Provide 4 buffers { - const sqe = try ring.provide_buffers(0xcccccccc, @as([*]u8, @ptrCast(&buffers)), buffer_len, buffers.len, group_id, buffer_id); + const sqe = try ring.provide_buffers(0xcccccccc, @ptrCast(&buffers), buffer_len, buffers.len, group_id, buffer_id); try testing.expectEqual(Op.provide_buffers, sqe.opcode); try testing.expectEqual(@as(i32, buffers.len), sqe.fd); - try testing.expectEqual(@as(u32, buffers[0].len), sqe.len); - try testing.expectEqual(@as(u16, group_id), sqe.buf_index); - try testing.expectEqual(@as(u32, 1), try ring.submit()); + try testing.expectEqual(buffers[0].len, sqe.len); + try testing.expectEqual(group_id, sqe.buf_index); + try testing.expectEqual(1, try ring.submit()); const cqe = try ring.copy_cqe(); switch (cqe.err()) { @@ -5036,7 +5033,7 @@ test "provide_buffers: read" { .SUCCESS => {}, else => |errno| std.debug.panic("unhandled errno: {}", .{errno}), } - try testing.expectEqual(@as(u64, 0xcccccccc), cqe.user_data); + try testing.expectEqual(0xcccccccc, cqe.user_data); } // Do 4 reads which should consume all buffers @@ -5045,11 +5042,11 @@ test "provide_buffers: read" { while (i < buffers.len) : (i += 1) { const sqe = try ring.read(0xdededede, fd, .{ .buffer_selection = .{ .group_id = group_id, .len = buffer_len } }, 0); try testing.expectEqual(Op.read, sqe.opcode); - try testing.expectEqual(@as(i32, fd), sqe.fd); - try testing.expectEqual(@as(u64, 0), sqe.addr); - try testing.expectEqual(@as(u32, buffer_len), sqe.len); - try testing.expectEqual(@as(u16, group_id), sqe.buf_index); - try testing.expectEqual(@as(u32, 1), try ring.submit()); + try testing.expectEqual(fd, sqe.fd); + try 
testing.expectEqual(0, sqe.addr); + try testing.expectEqual(buffer_len, sqe.len); + try testing.expectEqual(group_id, sqe.buf_index); + try testing.expectEqual(1, try ring.submit()); const cqe = try ring.copy_cqe(); switch (cqe.err()) { @@ -5060,10 +5057,11 @@ test "provide_buffers: read" { try testing.expect(cqe.flags.f_buffer); const used_buffer_id = try cqe.buffer_id(); try testing.expect(used_buffer_id >= 0 and used_buffer_id <= 3); - try testing.expectEqual(@as(i32, buffer_len), cqe.res); + try testing.expectEqual(buffer_len, cqe.res); - try testing.expectEqual(@as(u64, 0xdededede), cqe.user_data); - try testing.expectEqualSlices(u8, &([_]u8{0} ** buffer_len), buffers[used_buffer_id][0..@as(usize, @intCast(cqe.res))]); + try testing.expectEqual(0xdededede, cqe.user_data); + const empty: [buffer_len]u8 = @splat(0); + try testing.expectEqualSlices(u8, empty[0..], buffers[used_buffer_id][0..@intCast(cqe.res)]); } // This read should fail @@ -5071,11 +5069,11 @@ test "provide_buffers: read" { { const sqe = try ring.read(0xdfdfdfdf, fd, .{ .buffer_selection = .{ .group_id = group_id, .len = buffer_len } }, 0); try testing.expectEqual(Op.read, sqe.opcode); - try testing.expectEqual(@as(i32, fd), sqe.fd); - try testing.expectEqual(@as(u64, 0), sqe.addr); - try testing.expectEqual(@as(u32, buffer_len), sqe.len); - try testing.expectEqual(@as(u16, group_id), sqe.buf_index); - try testing.expectEqual(@as(u32, 1), try ring.submit()); + try testing.expectEqual(fd, sqe.fd); + try testing.expectEqual(0, sqe.addr); + try testing.expectEqual(buffer_len, sqe.len); + try testing.expectEqual(group_id, sqe.buf_index); + try testing.expectEqual(1, try ring.submit()); const cqe = try ring.copy_cqe(); switch (cqe.err()) { @@ -5084,7 +5082,7 @@ test "provide_buffers: read" { .SUCCESS => std.debug.panic("unexpected success", .{}), else => |errno| std.debug.panic("unhandled errno: {}", .{errno}), } - try testing.expectEqual(@as(u64, 0xdfdfdfdf), cqe.user_data); + try testing.expectEqual(0xdfdfdfdf, cqe.user_data); } // Provide 1 buffer again @@ -5095,8 +5093,8 @@ test "provide_buffers: read" { const reprovided_buffer_id = 2; { - _ = try ring.provide_buffers(0xabababab, @as([*]u8, @ptrCast(&buffers[reprovided_buffer_id])), buffer_len, 1, group_id, reprovided_buffer_id); - try testing.expectEqual(@as(u32, 1), try ring.submit()); + _ = try ring.provide_buffers(0xabababab, @ptrCast(&buffers[reprovided_buffer_id]), buffer_len, 1, group_id, reprovided_buffer_id); + try testing.expectEqual(1, try ring.submit()); const cqe = try ring.copy_cqe(); switch (cqe.err()) { @@ -5110,11 +5108,11 @@ test "provide_buffers: read" { { const sqe = try ring.read(0xdfdfdfdf, fd, .{ .buffer_selection = .{ .group_id = group_id, .len = buffer_len } }, 0); try testing.expectEqual(Op.read, sqe.opcode); - try testing.expectEqual(@as(i32, fd), sqe.fd); - try testing.expectEqual(@as(u64, 0), sqe.addr); - try testing.expectEqual(@as(u32, buffer_len), sqe.len); - try testing.expectEqual(@as(u16, group_id), sqe.buf_index); - try testing.expectEqual(@as(u32, 1), try ring.submit()); + try testing.expectEqual(fd, sqe.fd); + try testing.expectEqual(0, sqe.addr); + try testing.expectEqual(buffer_len, sqe.len); + try testing.expectEqual(group_id, sqe.buf_index); + try testing.expectEqual(1, try ring.submit()); const cqe = try ring.copy_cqe(); switch (cqe.err()) { @@ -5125,9 +5123,10 @@ test "provide_buffers: read" { try testing.expect(cqe.flags.f_buffer); const used_buffer_id = try cqe.buffer_id(); try testing.expectEqual(used_buffer_id, 
reprovided_buffer_id); - try testing.expectEqual(@as(i32, buffer_len), cqe.res); - try testing.expectEqual(@as(u64, 0xdfdfdfdf), cqe.user_data); - try testing.expectEqualSlices(u8, &([_]u8{0} ** buffer_len), buffers[used_buffer_id][0..@as(usize, @intCast(cqe.res))]); + try testing.expectEqual(buffer_len, cqe.res); + try testing.expectEqual(0xdfdfdfdf, cqe.user_data); + const empty: [buffer_len]u8 = @splat(0); + try testing.expectEqualSlices(u8, empty[0..], buffers[used_buffer_id][0..@intCast(cqe.res)]); } } @@ -5154,8 +5153,8 @@ test "remove_buffers" { // Provide 4 buffers { - _ = try ring.provide_buffers(0xcccccccc, @as([*]u8, @ptrCast(&buffers)), buffer_len, buffers.len, group_id, buffer_id); - try testing.expectEqual(@as(u32, 1), try ring.submit()); + _ = try ring.provide_buffers(0xcccccccc, @ptrCast(&buffers), buffer_len, buffers.len, group_id, buffer_id); + try testing.expectEqual(1, try ring.submit()); const cqe = try ring.copy_cqe(); switch (cqe.err()) { @@ -5163,7 +5162,7 @@ test "remove_buffers" { .SUCCESS => {}, else => |errno| std.debug.panic("unhandled errno: {}", .{errno}), } - try testing.expectEqual(@as(u64, 0xcccccccc), cqe.user_data); + try testing.expectEqual(0xcccccccc, cqe.user_data); } // Remove 3 buffers @@ -5171,24 +5170,24 @@ test "remove_buffers" { { const sqe = try ring.remove_buffers(0xbababababa, 3, group_id); try testing.expectEqual(Op.remove_buffers, sqe.opcode); - try testing.expectEqual(@as(i32, 3), sqe.fd); - try testing.expectEqual(@as(u64, 0), sqe.addr); - try testing.expectEqual(@as(u16, group_id), sqe.buf_index); - try testing.expectEqual(@as(u32, 1), try ring.submit()); + try testing.expectEqual(3, sqe.fd); + try testing.expectEqual(0, sqe.addr); + try testing.expectEqual(group_id, sqe.buf_index); + try testing.expectEqual(1, try ring.submit()); const cqe = try ring.copy_cqe(); switch (cqe.err()) { .SUCCESS => {}, else => |errno| std.debug.panic("unhandled errno: {}", .{errno}), } - try testing.expectEqual(@as(u64, 0xbababababa), cqe.user_data); + try testing.expectEqual(0xbababababa, cqe.user_data); } // This read should work { _ = try ring.read(0xdfdfdfdf, fd, .{ .buffer_selection = .{ .group_id = group_id, .len = buffer_len } }, 0); - try testing.expectEqual(@as(u32, 1), try ring.submit()); + try testing.expectEqual(1, try ring.submit()); const cqe = try ring.copy_cqe(); switch (cqe.err()) { @@ -5199,16 +5198,17 @@ test "remove_buffers" { try testing.expect(cqe.flags.f_buffer); const used_buffer_id = try cqe.buffer_id(); try testing.expect(used_buffer_id >= 0 and used_buffer_id < 4); - try testing.expectEqual(@as(i32, buffer_len), cqe.res); - try testing.expectEqual(@as(u64, 0xdfdfdfdf), cqe.user_data); - try testing.expectEqualSlices(u8, &([_]u8{0} ** buffer_len), buffers[used_buffer_id][0..@as(usize, @intCast(cqe.res))]); + try testing.expectEqual(buffer_len, cqe.res); + try testing.expectEqual(0xdfdfdfdf, cqe.user_data); + const empty: [buffer_len]u8 = @splat(0); + try testing.expectEqualSlices(u8, empty[0..], buffers[used_buffer_id][0..@intCast(cqe.res)]); } // Final read should _not_ work { _ = try ring.read(0xdfdfdfdf, fd, .{ .buffer_selection = .{ .group_id = group_id, .len = buffer_len } }, 0); - try testing.expectEqual(@as(u32, 1), try ring.submit()); + try testing.expectEqual(1, try ring.submit()); const cqe = try ring.copy_cqe(); switch (cqe.err()) { @@ -5239,12 +5239,12 @@ test "provide_buffers: accept/connect/send/recv" { // Provide 4 buffers { - const sqe = try ring.provide_buffers(0xcccccccc, @as([*]u8, @ptrCast(&buffers)), 
buffer_len, buffers.len, group_id, buffer_id); + const sqe = try ring.provide_buffers(0xcccccccc, @ptrCast(&buffers), buffer_len, buffers.len, group_id, buffer_id); try testing.expectEqual(Op.provide_buffers, sqe.opcode); try testing.expectEqual(@as(i32, buffers.len), sqe.fd); - try testing.expectEqual(@as(u32, buffer_len), sqe.len); - try testing.expectEqual(@as(u16, group_id), sqe.buf_index); - try testing.expectEqual(@as(u32, 1), try ring.submit()); + try testing.expectEqual(buffer_len, sqe.len); + try testing.expectEqual(group_id, sqe.buf_index); + try testing.expectEqual(1, try ring.submit()); const cqe = try ring.copy_cqe(); switch (cqe.err()) { @@ -5255,7 +5255,7 @@ test "provide_buffers: accept/connect/send/recv" { .SUCCESS => {}, else => |errno| std.debug.panic("unhandled errno: {}", .{errno}), } - try testing.expectEqual(@as(u64, 0xcccccccc), cqe.user_data); + try testing.expectEqual(0xcccccccc, cqe.user_data); } const socket_test_harness = try createSocketTestHarness(&ring); @@ -5267,11 +5267,11 @@ test "provide_buffers: accept/connect/send/recv" { var i: usize = 0; while (i < buffers.len) : (i += 1) { _ = try ring.send(0xdeaddead, socket_test_harness.server, &([_]u8{'z'} ** buffer_len), .{}); - try testing.expectEqual(@as(u32, 1), try ring.submit()); + try testing.expectEqual(1, try ring.submit()); } var cqes: [4]Cqe = undefined; - try testing.expectEqual(@as(u32, 4), try ring.copy_cqes(&cqes, 4)); + try testing.expectEqual(4, try ring.copy_cqes(&cqes, 4)); } // Do 4 recv which should consume all buffers @@ -5284,12 +5284,12 @@ test "provide_buffers: accept/connect/send/recv" { const sqe = try ring.recv(0xdededede, socket_test_harness.client, .{ .buffer_selection = .{ .group_id = group_id, .len = buffer_len } }, .{}); try testing.expectEqual(Op.recv, sqe.opcode); try testing.expectEqual(@as(i32, socket_test_harness.client), sqe.fd); - try testing.expectEqual(@as(u64, 0), sqe.addr); - try testing.expectEqual(@as(u32, buffer_len), sqe.len); - try testing.expectEqual(@as(u16, group_id), sqe.buf_index); - try testing.expectEqual(@as(u32, 0), sqe.rw_flags); + try testing.expectEqual(0, sqe.addr); + try testing.expectEqual(buffer_len, sqe.len); + try testing.expectEqual(group_id, sqe.buf_index); + try testing.expectEqual(0, sqe.rw_flags); try testing.expectEqual(Sqe.IoSqe{ .buffer_select = true }, sqe.flags); - try testing.expectEqual(@as(u32, 1), try ring.submit()); + try testing.expectEqual(1, try ring.submit()); const cqe = try ring.copy_cqe(); switch (cqe.err()) { @@ -5300,11 +5300,11 @@ test "provide_buffers: accept/connect/send/recv" { try testing.expect(cqe.flags.f_buffer); const used_buffer_id = try cqe.buffer_id(); try testing.expect(used_buffer_id >= 0 and used_buffer_id <= 3); - try testing.expectEqual(@as(i32, buffer_len), cqe.res); + try testing.expectEqual(buffer_len, cqe.res); - try testing.expectEqual(@as(u64, 0xdededede), cqe.user_data); - const buffer = buffers[used_buffer_id][0..@as(usize, @intCast(cqe.res))]; - try testing.expectEqualSlices(u8, &([_]u8{'z'} ** buffer_len), buffer); + try testing.expectEqual(0xdededede, cqe.user_data); + const zzz: [buffer_len]u8 = @splat('z'); + try testing.expectEqualSlices(u8, zzz[0..], buffers[used_buffer_id][0..@intCast(cqe.res)]); } // This recv should fail @@ -5313,12 +5313,12 @@ test "provide_buffers: accept/connect/send/recv" { const sqe = try ring.recv(0xdfdfdfdf, socket_test_harness.client, .{ .buffer_selection = .{ .group_id = group_id, .len = buffer_len } }, .{}); try testing.expectEqual(Op.recv, sqe.opcode); try 
testing.expectEqual(@as(i32, socket_test_harness.client), sqe.fd); - try testing.expectEqual(@as(u64, 0), sqe.addr); - try testing.expectEqual(@as(u32, buffer_len), sqe.len); - try testing.expectEqual(@as(u16, group_id), sqe.buf_index); - try testing.expectEqual(@as(u32, 0), sqe.rw_flags); + try testing.expectEqual(0, sqe.addr); + try testing.expectEqual(buffer_len, sqe.len); + try testing.expectEqual(group_id, sqe.buf_index); + try testing.expectEqual(0, sqe.rw_flags); try testing.expectEqual(Sqe.IoSqe{ .buffer_select = true }, sqe.flags); - try testing.expectEqual(@as(u32, 1), try ring.submit()); + try testing.expectEqual(1, try ring.submit()); const cqe = try ring.copy_cqe(); switch (cqe.err()) { @@ -5327,7 +5327,7 @@ test "provide_buffers: accept/connect/send/recv" { .SUCCESS => std.debug.panic("unexpected success", .{}), else => |errno| std.debug.panic("unhandled errno: {}", .{errno}), } - try testing.expectEqual(@as(u64, 0xdfdfdfdf), cqe.user_data); + try testing.expectEqual(0xdfdfdfdf, cqe.user_data); } // Provide 1 buffer again @@ -5335,8 +5335,8 @@ test "provide_buffers: accept/connect/send/recv" { const reprovided_buffer_id = 2; { - _ = try ring.provide_buffers(0xabababab, @as([*]u8, @ptrCast(&buffers[reprovided_buffer_id])), buffer_len, 1, group_id, reprovided_buffer_id); - try testing.expectEqual(@as(u32, 1), try ring.submit()); + _ = try ring.provide_buffers(0xabababab, @ptrCast(&buffers[reprovided_buffer_id]), buffer_len, 1, group_id, reprovided_buffer_id); + try testing.expectEqual(1, try ring.submit()); const cqe = try ring.copy_cqe(); switch (cqe.err()) { @@ -5349,7 +5349,7 @@ test "provide_buffers: accept/connect/send/recv" { { _ = try ring.send(0xdeaddead, socket_test_harness.server, &([_]u8{'w'} ** buffer_len), .{}); - try testing.expectEqual(@as(u32, 1), try ring.submit()); + try testing.expectEqual(1, try ring.submit()); _ = try ring.copy_cqe(); } @@ -5363,12 +5363,12 @@ test "provide_buffers: accept/connect/send/recv" { const sqe = try ring.recv(0xdfdfdfdf, socket_test_harness.client, .{ .buffer_selection = .{ .group_id = group_id, .len = buffer_len } }, .{}); try testing.expectEqual(Op.recv, sqe.opcode); try testing.expectEqual(@as(i32, socket_test_harness.client), sqe.fd); - try testing.expectEqual(@as(u64, 0), sqe.addr); - try testing.expectEqual(@as(u32, buffer_len), sqe.len); - try testing.expectEqual(@as(u16, group_id), sqe.buf_index); - try testing.expectEqual(@as(u32, 0), sqe.rw_flags); + try testing.expectEqual(0, sqe.addr); + try testing.expectEqual(buffer_len, sqe.len); + try testing.expectEqual(group_id, sqe.buf_index); + try testing.expectEqual(0, sqe.rw_flags); try testing.expectEqual(Sqe.IoSqe{ .buffer_select = true }, sqe.flags); - try testing.expectEqual(@as(u32, 1), try ring.submit()); + try testing.expectEqual(1, try ring.submit()); const cqe = try ring.copy_cqe(); switch (cqe.err()) { @@ -5379,10 +5379,10 @@ test "provide_buffers: accept/connect/send/recv" { try testing.expect(cqe.flags.f_buffer); const used_buffer_id = try cqe.buffer_id(); try testing.expectEqual(used_buffer_id, reprovided_buffer_id); - try testing.expectEqual(@as(i32, buffer_len), cqe.res); - try testing.expectEqual(@as(u64, 0xdfdfdfdf), cqe.user_data); - const buffer = buffers[used_buffer_id][0..@as(usize, @intCast(cqe.res))]; - try testing.expectEqualSlices(u8, &([_]u8{'w'} ** buffer_len), buffer); + try testing.expectEqual(buffer_len, cqe.res); + try testing.expectEqual(0xdfdfdfdf, cqe.user_data); + const www: [buffer_len]u8 = @splat('w'); + try testing.expectEqualSlices(u8, 
www[0..], buffers[used_buffer_id][0..@intCast(cqe.res)]); } } @@ -5417,7 +5417,7 @@ fn createSocketTestHarness(ring: *IoUring) !SocketTestHarness { errdefer posix.close(client); _ = try ring.connect(0xcccccccc, client, addrAny(&address), @sizeOf(linux.sockaddr.in)); - try testing.expectEqual(@as(u32, 2), try ring.submit()); + try testing.expectEqual(2, try ring.submit()); var cqe_accept = try ring.copy_cqe(); if (cqe_accept.err() == .INVAL) return error.SkipZigTest; @@ -5432,10 +5432,10 @@ fn createSocketTestHarness(ring: *IoUring) !SocketTestHarness { cqe_connect = a; } - try testing.expectEqual(@as(u64, 0xaaaaaaaa), cqe_accept.user_data); + try testing.expectEqual(0xaaaaaaaa, cqe_accept.user_data); if (cqe_accept.res <= 0) std.debug.print("\ncqe_accept.res={}\n", .{cqe_accept.res}); try testing.expect(cqe_accept.res > 0); - try testing.expectEqual(@as(Cqe.Flags, @bitCast(@as(u32, 0))), cqe_accept.flags); + try testing.expectEqual(@as(Cqe.Flags, .{}), cqe_accept.flags); try testing.expectEqual(Cqe{ .user_data = 0xcccccccc, .res = 0, @@ -5456,7 +5456,7 @@ fn createListenerSocket(address: *linux.sockaddr.in) !posix.socket_t { const listener_socket = try posix.socket(address.family, posix.SOCK.STREAM | posix.SOCK.CLOEXEC, 0); errdefer posix.close(listener_socket); - try posix.setsockopt(listener_socket, posix.SOL.SOCKET, posix.SO.REUSEADDR, &mem.toBytes(@as(c_int, 1))); + try posix.setsockopt(listener_socket, posix.SOL.SOCKET, posix.SO.REUSEADDR, &mem.toBytes(@as(u32, 1))); try posix.bind(listener_socket, addrAny(address), @sizeOf(linux.sockaddr.in)); try posix.listen(listener_socket, kernel_backlog); @@ -5489,7 +5489,7 @@ test "accept multishot" { var addr_len: posix.socklen_t = @sizeOf(@TypeOf(addr)); const userdata: u64 = 0xaaaaaaaa; _ = try ring.accept_multishot(userdata, listener_socket, &addr, &addr_len, .{}); - try testing.expectEqual(@as(u32, 1), try ring.submit()); + try testing.expectEqual(1, try ring.submit()); var nr: usize = 4; // number of clients to connect while (nr > 0) : (nr -= 1) { @@ -5529,7 +5529,7 @@ test "accept/connect/send_zc/recv" { const sqe_send = try ring.send_zc(0xeeeeeeee, socket_test_harness.client, buffer_send[0..], .{}, .{}); sqe_send.link_next(); _ = try ring.recv(0xffffffff, socket_test_harness.server, .{ .buffer = buffer_recv[0..] }, .{}); - try testing.expectEqual(@as(u32, 2), try ring.submit()); + try testing.expectEqual(2, try ring.submit()); var cqe_send = try ring.copy_cqe(); // First completion of zero-copy send. 
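    // A note on the protocol: send_zc usually produces two completions. The
    // first carries the send result and, while the kernel still owns the
    // buffer, a "more" flag; a second notification CQE with the same
    // user_data arrives once the buffer may be reused. Hedged sketch,
    // assuming the f_more/f_notif flag names in Cqe.Flags:
    //
    //     if (cqe_send.flags.f_more) {
    //         const cqe_notif = try ring.copy_cqe();
    //         try testing.expect(cqe_notif.flags.f_notif);
    //     }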
@@ -5597,7 +5597,7 @@ test "accept_direct" { // submit accept, will chose registered fd and return index in cqe _ = try ring.accept_direct(accept_userdata, listener_socket, null, null, .{}); - try testing.expectEqual(@as(u32, 1), try ring.submit()); + try testing.expectEqual(1, try ring.submit()); // connect const client = try posix.socket(address.family, posix.SOCK.STREAM | posix.SOCK.CLOEXEC, 0); @@ -5621,7 +5621,7 @@ test "accept_direct" { const recv_sqe = try ring.recv(read_userdata, fd_index, .{ .buffer = &buffer_recv }, .{}); recv_sqe.set_flags(.{ .fixed_file = true }); - try testing.expectEqual(@as(u32, 1), try ring.submit()); + try testing.expectEqual(1, try ring.submit()); // accept receive const recv_cqe = try ring.copy_cqe(); @@ -5633,7 +5633,7 @@ test "accept_direct" { { // submit accept _ = try ring.accept_direct(accept_userdata, listener_socket, null, null, .{}); - try testing.expectEqual(@as(u32, 1), try ring.submit()); + try testing.expectEqual(1, try ring.submit()); // connect const client = try posix.socket(address.family, posix.SOCK.STREAM | posix.SOCK.CLOEXEC, 0); try posix.connect(client, addrAny(&address), @sizeOf(linux.sockaddr.in)); @@ -5681,7 +5681,7 @@ test "accept_multishot_direct" { // submit multishot accept // Will chose registered fd and return index of the selected registered file in cqe. _ = try ring.accept_multishot_direct(accept_userdata, listener_socket, null, null, .{}); - try testing.expectEqual(@as(u32, 1), try ring.submit()); + try testing.expectEqual(1, try ring.submit()); for (registered_fds) |_| { // connect @@ -5727,7 +5727,7 @@ test "socket" { // prepare, submit socket operation _ = try ring.socket(0, .inet, .{ .type = .stream }, .default, 0); - try testing.expectEqual(@as(u32, 1), try ring.submit()); + try testing.expectEqual(1, try ring.submit()); // test completion var cqe = try ring.copy_cqe(); @@ -5753,14 +5753,14 @@ test "socket_direct/socket_direct_alloc/close_direct" { // create socket in registered file descriptor at index 0 (last param) _ = try ring.socket_direct(0, .inet, .{ .type = .stream }, .default, 0, 0); - try testing.expectEqual(@as(u32, 1), try ring.submit()); + try testing.expectEqual(1, try ring.submit()); var cqe_socket = try ring.copy_cqe(); try testing.expectEqual(posix.E.SUCCESS, cqe_socket.err()); try testing.expect(cqe_socket.res == 0); // create socket in registered file descriptor at index 1 (last param) _ = try ring.socket_direct(0, .inet, .{ .type = .stream }, .default, 0, 1); - try testing.expectEqual(@as(u32, 1), try ring.submit()); + try testing.expectEqual(1, try ring.submit()); cqe_socket = try ring.copy_cqe(); try testing.expectEqual(posix.E.SUCCESS, cqe_socket.err()); try testing.expect(cqe_socket.res == 0); // res is 0 when index is specified @@ -5768,7 +5768,7 @@ test "socket_direct/socket_direct_alloc/close_direct" { // create socket in kernel chosen file descriptor index (_alloc version) // completion res has index from registered files _ = try ring.socket_direct_alloc(0, .inet, .{ .type = .stream }, .default, 0); - try testing.expectEqual(@as(u32, 1), try ring.submit()); + try testing.expectEqual(1, try ring.submit()); cqe_socket = try ring.copy_cqe(); try testing.expectEqual(posix.E.SUCCESS, cqe_socket.err()); try testing.expect(cqe_socket.res == 2); // returns registered file index @@ -5791,7 +5791,7 @@ test "socket_direct/socket_direct_alloc/close_direct" { // fd is fixed file index connect_sqe.set_flags(.{ .fixed_file = true }); // submit both - try testing.expectEqual(@as(u32, 2), try 
ring.submit()); + try testing.expectEqual(2, try ring.submit()); // get completions var cqe_connect = try ring.copy_cqe(); var cqe_accept = try ring.copy_cqe(); @@ -5811,7 +5811,7 @@ test "socket_direct/socket_direct_alloc/close_direct" { // submit and test close_direct _ = try ring.close_direct(close_userdata, @intCast(fd_index)); - try testing.expectEqual(@as(u32, 1), try ring.submit()); + try testing.expectEqual(1, try ring.submit()); var cqe_close = try ring.copy_cqe(); try testing.expect(cqe_close.user_data == close_userdata); try testing.expectEqual(posix.E.SUCCESS, cqe_close.err()); @@ -5842,21 +5842,21 @@ test "openat_direct/close_direct" { // use registered file at index 0 (last param) _ = try ring.openat_direct(user_data, tmp.dir.fd, path, flags, mode, 0); - try testing.expectEqual(@as(u32, 1), try ring.submit()); + try testing.expectEqual(1, try ring.submit()); var cqe = try ring.copy_cqe(); try testing.expectEqual(posix.E.SUCCESS, cqe.err()); try testing.expect(cqe.res == 0); // use registered file at index 1 _ = try ring.openat_direct(user_data, tmp.dir.fd, path, flags, mode, 1); - try testing.expectEqual(@as(u32, 1), try ring.submit()); + try testing.expectEqual(1, try ring.submit()); cqe = try ring.copy_cqe(); try testing.expectEqual(posix.E.SUCCESS, cqe.err()); try testing.expect(cqe.res == 0); // res is 0 when we specify index // let kernel choose registered file index _ = try ring.openat_direct(user_data, tmp.dir.fd, path, flags, mode, constants.FILE_INDEX_ALLOC); - try testing.expectEqual(@as(u32, 1), try ring.submit()); + try testing.expectEqual(1, try ring.submit()); cqe = try ring.copy_cqe(); try testing.expectEqual(posix.E.SUCCESS, cqe.err()); try testing.expect(cqe.res == 2); // chosen index is in res @@ -5864,7 +5864,7 @@ test "openat_direct/close_direct" { // close all open file descriptors for (registered_fds, 0..) 
|_, fd_index| { _ = try ring.close_direct(user_data, @intCast(fd_index)); - try testing.expectEqual(@as(u32, 1), try ring.submit()); + try testing.expectEqual(1, try ring.submit()); var cqe_close = try ring.copy_cqe(); try testing.expectEqual(posix.E.SUCCESS, cqe_close.err()); } @@ -5987,7 +5987,7 @@ test BufferGroup { try testing.expectEqual(2, cqe.user_data); // matches submitted user_data try testing.expect(cqe.res >= 0); // success try testing.expectEqual(posix.E.SUCCESS, cqe.err()); - try testing.expectEqual(data.len, @as(usize, @intCast(cqe.res))); // cqe.res holds received data len + try testing.expectEqual(@as(i32, data.len), cqe.res); // cqe.res holds received data len // Get buffer from pool const buf = try buf_grp.get(cqe); @@ -6039,7 +6039,7 @@ test "ring mapped buffers recv" { { const user_data = rnd.int(u64); _ = try ring.send(user_data, fds.client, data[0..], .{}); - try testing.expectEqual(@as(u32, 1), try ring.submit()); + try testing.expectEqual(1, try ring.submit()); const cqe_send = try ring.copy_cqe(); if (cqe_send.err() == .INVAL) return error.SkipZigTest; try testing.expectEqual(Cqe{ .user_data = user_data, .res = data.len, .flags = .{} }, cqe_send); @@ -6062,7 +6062,7 @@ test "ring mapped buffers recv" { { const user_data = rnd.int(u64); _ = try buf_grp.recv(user_data, fds.server, .{}); - try testing.expectEqual(@as(u32, 1), try ring.submit()); + try testing.expectEqual(1, try ring.submit()); const cqe = try ring.copy_cqe(); try testing.expectEqual(user_data, cqe.user_data); try testing.expect(cqe.res < 0); // fail @@ -6128,7 +6128,7 @@ test "ring mapped buffers multishot recv" { { const user_data = rnd.int(u64); _ = try ring.send(user_data, fds.client, data[0..], .{}); - try testing.expectEqual(@as(u32, 1), try ring.submit()); + try testing.expectEqual(1, try ring.submit()); const cqe_send = try ring.copy_cqe(); if (cqe_send.err() == .INVAL) return error.SkipZigTest; try testing.expectEqual(Cqe{ .user_data = user_data, .res = data.len, .flags = .{} }, cqe_send); @@ -6137,7 +6137,7 @@ test "ring mapped buffers multishot recv" { // start multishot recv var recv_user_data = rnd.int(u64); _ = try buf_grp.recv_multishot(recv_user_data, fds.server, .{}); - try testing.expectEqual(@as(u32, 1), try ring.submit()); // submit + try testing.expectEqual(1, try ring.submit()); // submit // server reads data into provided buffers // there are 2 buffers of size 4, so each read gets only chunk of data @@ -6172,7 +6172,7 @@ test "ring mapped buffers multishot recv" { // restart multishot recv_user_data = rnd.int(u64); _ = try buf_grp.recv_multishot(recv_user_data, fds.server, .{}); - try testing.expectEqual(@as(u32, 1), try ring.submit()); // submit + try testing.expectEqual(1, try ring.submit()); // submit chunk = data[buffer_size * 2 .. 
buffer_size * 3]; // third chunk
         const cqe3 = try expect_buf_grp_cqe(&ring, &buf_grp, recv_user_data, chunk);
@@ -6188,7 +6188,7 @@ test "ring mapped buffers multishot recv" {
     {
         const cancel_user_data = rnd.int(u64);
         _ = try ring.cancel(cancel_user_data, recv_user_data, .{});
-        try testing.expectEqual(@as(u32, 1), try ring.submit());
+        try testing.expectEqual(1, try ring.submit());
 
         // expect completion of cancel operation and completion of recv operation
         var cqe_cancel = try ring.copy_cqe();
@@ -6237,7 +6237,7 @@ fn buf_grp_recv_submit_get_cqe(
     const sqe = try buf_grp.recv(user_data, fd, .{});
     try testing.expect(sqe.flags.buffer_select);
     try testing.expect(sqe.buf_index == buf_grp.group_id);
-    try testing.expectEqual(@as(u32, 1), try ring.submit()); // submit
+    try testing.expectEqual(1, try ring.submit()); // submit
     // get cqe, expect success
     const cqe = try ring.copy_cqe();
     try testing.expectEqual(user_data, cqe.user_data);
@@ -6259,12 +6259,12 @@ fn expect_buf_grp_cqe(
     try testing.expectEqual(user_data, cqe.user_data);
     try testing.expect(cqe.res >= 0); // success
     try testing.expect(cqe.flags.f_buffer); // IORING_CQE_F_BUFFER flag is set
-    try testing.expectEqual(expected.len, @as(usize, @intCast(cqe.res)));
+    try testing.expectEqual(@as(i32, @intCast(expected.len)), cqe.res);
     try testing.expectEqual(posix.E.SUCCESS, cqe.err());
 
     // get buffer from pool
     const buffer_id = try cqe.buffer_id();
-    const len = @as(usize, @intCast(cqe.res));
+    const len: usize = @intCast(cqe.res);
     const buf = buf_grp.get_by_id(buffer_id)[0..len];
     try testing.expectEqualSlices(u8, expected, buf);
 
@@ -6443,7 +6443,7 @@ fn testSendRecv(ring: *IoUring, send_fd: posix.socket_t, recv_fd: posix.socket_t
         const cqe = try ring.copy_cqe();
         try testing.expectEqual(1 + i, cqe.user_data);
         try testing.expectEqual(posix.E.SUCCESS, cqe.err());
-        try testing.expectEqual(buffer_send.len, @as(usize, @intCast(cqe.res)));
+        try testing.expectEqual(@as(i32, buffer_send.len), cqe.res);
     }
 
     // receive

From 9f1db4e3614b257ec37d50cc8a98a21092ccda9c Mon Sep 17 00:00:00 2001
From: Bernard Assan
Date: Sun, 12 Oct 2025 19:00:46 +0000
Subject: [PATCH 22/41] Implement more IoUring register functions

Future enhancement: add the fields below to IoUring

    enter_ring_fd: u24,
    init_flags: u8,

Register functions that require those extra fields, such as
io_uring_register_ring_fd, io_uring_close_ring_fd,
io_uring_clone_buffers_offset, and io_uring_set_iowait, are not
implemented.

The register_*_tag functions are not implemented either.

Remove unnecessary uses of @ptrCast in io_uring_register calls.

Signed-off-by: Bernard Assan
---
 lib/std/os/linux/IoUring.zig | 407 ++++++++++++++++++++++++++---------
 1 file changed, 302 insertions(+), 105 deletions(-)

diff --git a/lib/std/os/linux/IoUring.zig b/lib/std/os/linux/IoUring.zig
index a88ea8e91845..1c816dad8966 100644
--- a/lib/std/os/linux/IoUring.zig
+++ b/lib/std/os/linux/IoUring.zig
@@ -3,6 +3,7 @@ const std = @import("std");
 const builtin = @import("builtin");
 const assert = std.debug.assert;
 const mem = std.mem;
+const math = std.math;
 const net = std.Io.net;
 const posix = std.posix;
 const linux = std.os.linux;
@@ -38,7 +39,7 @@ pub fn init(entries: u16, flags: uflags.Setup) !IoUring {
 /// Matches the interface of `io_uring_queue_init_params()` in liburing.
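 ///
 /// A hedged usage sketch (assuming `Params.flags` takes the Setup flags
 /// declared at the top of this file; the kernel fills in the ring sizes
 /// and feature bits on success):
 ///
 ///     var params = mem.zeroInit(Params, .{});
 ///     params.flags = .{ .CLAMP = true };
 ///     var ring = try IoUring.init_params(16, &params);
 ///     defer ring.deinit();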
pub fn init_params(entries: u16, p: *Params) !IoUring { if (entries == 0) return error.EntriesZero; - if (!std.math.isPowerOfTwo(entries)) return error.EntriesNotPowerOfTwo; + if (!math.isPowerOfTwo(entries)) return error.EntriesNotPowerOfTwo; assert(p.sq_entries == 0); assert(p.features.empty()); assert(p.resv[0] == 0); @@ -454,7 +455,7 @@ pub fn write( /// Queues (but does not submit) an SQE to perform a `splice(2)` /// Either `fd_in` or `fd_out` must be a pipe. /// If `fd_in` refers to a pipe, `off_in` is ignored and must be set to -/// std.math.maxInt(u64). +/// math.maxInt(u64). /// If `fd_in` does not refer to a pipe and `off_in` is maxInt(u64), then `len` /// are read from `fd_in` starting from the file offset, which is incremented /// by the number of bytes read. @@ -1177,34 +1178,47 @@ pub fn waitid( return sqe; } -/// Registers an array of file descriptors. -/// -/// Every time a file descriptor is put in an SQE and submitted to the kernel, -/// the kernel must retrieve a reference to the file, and once I/O has -/// completed, the file reference must be dropped. The atomic nature of this -/// file reference can be a slowdown for high IOPS workloads. This slowdown can -/// be avoided by pre-registering file descriptors. -/// -/// To refer to a registered file descriptor, IOSQE_FIXED_FILE must be set in -/// the SQE's flags, and the SQE's fd must be set to the index of the file -/// descriptor in the registered array. -/// -/// Registering file descriptors will wait for the ring to idle and files are -/// automatically unregistered by the kernel when the ring is torn down. -/// -/// An application need unregister only if it wants to register a new array of -/// file descriptors. -pub fn register_files(self: *IoUring, fds: []const linux.fd_t) !void { +pub fn register_buffers_sparse(self: *IoUring, nr: u32) !void { assert(self.fd >= 0); + + const reg: RsrcRegister = .{ + .flags = .{ .register_sparse = true }, + .nr = nr, + }; + const res = linux.io_uring_register( self.fd, - .register_files, - @ptrCast(fds.ptr), - @intCast(fds.len), + .register_buffers2, + ®, + @sizeOf(RsrcRegister), + ); + try handle_registration_result(res); +} + +/// Registers an array of buffers for use with `read_fixed` and `write_fixed`. +pub fn register_buffers(self: *IoUring, buffers: []const posix.iovec) !void { + assert(self.fd >= 0); + + const res = linux.io_uring_register( + self.fd, + .register_buffers, + buffers.ptr, + @intCast(buffers.len), ); try handle_registration_result(res); } +/// Unregister the registered buffers. +pub fn unregister_buffers(self: *IoUring) !void { + assert(self.fd >= 0); + const res = linux.io_uring_register(self.fd, .unregister_buffers, null, 0); + switch (linux.E.init(res)) { + .SUCCESS => {}, + .NXIO => return error.BuffersNotRegistered, + else => |errno| return posix.unexpectedErrno(errno), + } +} + /// Updates registered file descriptors. /// /// Updates are applied starting at the provided offset in the original file @@ -1215,19 +1229,18 @@ pub fn register_files(self: *IoUring, fds: []const linux.fd_t) !void { /// * replacing an existing entry with a new fd /// /// Adding new file descriptors must be done with `register_files`. 
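 ///
 /// A hedged sketch (assumes a `ring` whose table was filled by
 /// `register_files` and a freshly opened `new_fd`); this replaces slot 0:
 ///
 ///     try ring.register_files_update(0, &.{new_fd});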
-pub fn register_files_update(self: *IoUring, offset: u32, fds: []const linux.fd_t) !void { +pub fn register_files_update(self: *IoUring, offset: u32, fds: []const posix.fd_t) !void { assert(self.fd >= 0); - var update: RsrcUpdate = .{ + var update = mem.zeroInit(RsrcUpdate, .{ .offset = offset, - .resv = 0, .data = @intFromPtr(fds.ptr), - }; + }); const res = linux.io_uring_register( self.fd, .register_files_update, - @ptrCast(&update), + &update, @intCast(fds.len), ); try handle_registration_result(res); @@ -1237,43 +1250,58 @@ pub fn register_files_update(self: *IoUring, offset: u32, fds: []const linux.fd_ pub fn register_files_sparse(self: *IoUring, nr_files: u32) !void { assert(self.fd >= 0); - const reg: RsrcRegister = .{ + const reg = mem.zeroInit(RsrcRegister, .{ .nr = nr_files, - .flags = .{ .rsrc_register_sparse = true }, - .resv2 = 0, - .data = 0, - .tags = 0, - }; + .flags = .{ .register_sparse = true }, + }); const res = linux.io_uring_register( self.fd, .register_files2, - @ptrCast(®), + ®, @sizeOf(RsrcRegister), ); return handle_registration_result(res); } - -// Registers range for fixed file allocations. -// Available since 6.0 -pub fn register_file_alloc_range(self: *IoUring, offset: u32, len: u32) !void { +/// Registers an array of file descriptors. +/// +/// Every time a file descriptor is put in an SQE and submitted to the kernel, +/// the kernel must retrieve a reference to the file, and once I/O has +/// completed, the file reference must be dropped. The atomic nature of this +/// file reference can be a slowdown for high IOPS workloads. This slowdown can +/// be avoided by pre-registering file descriptors. +/// +/// To refer to a registered file descriptor, IOSQE_FIXED_FILE must be set in +/// the SQE's flags, and the SQE's fd must be set to the index of the file +/// descriptor in the registered array. +/// +/// Registering file descriptors will wait for the ring to idle and files are +/// automatically unregistered by the kernel when the ring is torn down. +/// +/// An application need unregister only if it wants to register a new array of +/// file descriptors. +pub fn register_files(self: *IoUring, fds: []const linux.fd_t) !void { assert(self.fd >= 0); - - const range: FileIndexRange = .{ - .off = offset, - .len = len, - .resv = 0, - }; - const res = linux.io_uring_register( self.fd, - .register_file_alloc_range, - @ptrCast(&range), - @sizeOf(FileIndexRange), + .register_files, + fds.ptr, + @intCast(fds.len), ); + try handle_registration_result(res); +} - return handle_registration_result(res); +/// Unregisters all registered file descriptors previously associated with the +/// ring. 
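+///
+/// Sketch of swapping in a new table (hypothetical `ring` and `new_fds`
+/// slice):
+///
+///     try ring.unregister_files(); // error.FilesNotRegistered if none
+///     try ring.register_files(new_fds);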
+pub fn unregister_files(self: *IoUring) !void { + assert(self.fd >= 0); + const res = linux.io_uring_register(self.fd, .unregister_files, null, 0); + switch (linux.E.init(res)) { + .SUCCESS => {}, + .NXIO => return error.FilesNotRegistered, + else => |errno| return posix.unexpectedErrno(errno), + } } /// Registers the file descriptor for an eventfd that will be notified of @@ -1284,7 +1312,7 @@ pub fn register_eventfd(self: *IoUring, fd: linux.fd_t) !void { const res = linux.io_uring_register( self.fd, .register_eventfd, - @ptrCast(&fd), + &fd, 1, ); try handle_registration_result(res); @@ -1300,7 +1328,7 @@ pub fn register_eventfd_async(self: *IoUring, fd: linux.fd_t) !void { const res = linux.io_uring_register( self.fd, .register_eventfd_async, - @ptrCast(&fd), + &fd, 1, ); try handle_registration_result(res); @@ -1318,39 +1346,212 @@ pub fn unregister_eventfd(self: *IoUring) !void { try handle_registration_result(res); } +pub fn register_probe(self: *IoUring, probe: []Probe) !void { + assert(self.fd >= 0); + const res = linux.io_uring_register( + self.fd, + .register_probe, + probe.ptr, + @intCast(probe.len), + ); + try handle_registration_result(res); +} + +/// See https://github.com/axboe/liburing/issues/357 for how to use personality +/// matches `io_uring_register_personality()` in liburing +pub fn register_personality(self: *IoUring) !void { + assert(self.fd >= 0); + const res = linux.io_uring_register( + self.fd, + .register_personality, + @as(?*anyopaque, null), + 0, + ); + try handle_registration_result(res); +} + +pub fn unregister_personality(self: *IoUring, credential_id: u32) !void { + assert(self.fd >= 0); + const res = linux.io_uring_register( + self.fd, + .unregister_personality, + @as(?*anyopaque, null), + credential_id, + ); + try handle_registration_result(res); +} + +pub fn register_restrictions(self: *IoUring, restriction: []Restriction) !void { + assert(self.fd >= 0); + const res = linux.io_uring_register( + self.fd, + .register_restrictions, + restriction.ptr, + @intCast(restriction.len), + ); + try handle_registration_result(res); +} + +pub fn enable_rings(self: *IoUring) !void { + assert(self.fd >= 0); + const res = linux.io_uring_register( + self.fd, + .register_enable_rings, + @as(?*anyopaque, null), + @intCast(0), + ); + try handle_registration_result(res); +} + +pub fn register_iowq_aff(self: *IoUring, cpusz: u32, mask: *linux.cpu_set_t) !void { + assert(self.fd >= 0); + + if (cpusz >= math.maxInt(u32)) return error.ArgumentsInvalid; + + const res = linux.io_uring_register( + self.fd, + .register_iowq_aff, + mask, + cpusz, + ); + try handle_registration_result(res); +} + +pub fn unregister_iowq_aff(self: *IoUring) !void { + assert(self.fd >= 0); + + const res = linux.io_uring_register( + self.fd, + .unregister_iowq_aff, + @as(?*anyopaque, null), + 0, + ); + try handle_registration_result(res); +} + +/// `max_workers`: `max_workers[0]` should contain the maximum number of +/// desired bounded workers, and the `max_workers[1]` the maximum number of +/// desired unbounded workers. 
+/// If both values are set to 0, the existing values are returned +/// Read `io_uring_register_iowq_max_workers(3)` for more info +pub fn register_iowq_max_workers(self: *IoUring, max_workers: [2]u32) !void { + assert(self.fd >= 0); + + const res = linux.io_uring_register( + self.fd, + .register_iowq_max_workers, + &max_workers, + 2, + ); + try handle_registration_result(res); +} + +/// See `io_uring_register_sync_cancel(3)` +pub fn register_sync_cancel(self: *IoUring, cancel_reg: *SyncCancelRegister) !void { + assert(self.fd >= 0); + + const res = linux.io_uring_register( + self.fd, + .register_sync_cancel, + cancel_reg, + 1, + ); + try handle_registration_result(res); +} + +/// See `io_uring_register_sync_msg(3)` +pub fn register_sync_msg(self: *IoUring, sqe: *Sqe) !void { + assert(self.fd >= 0); + + const res = linux.io_uring_register( + -1, + .register_send_msg_ring, + sqe, + 1, + ); + try handle_registration_result(res); +} + +// COMMIT: fix register file alloc range taking @sizeOf(FileIndexRange) instead of zero in register syscall +/// Registers range for fixed file allocations. +/// Available since 6.0 +pub fn register_file_alloc_range(self: *IoUring, offset: u32, len: u32) !void { + assert(self.fd >= 0); + + const range: FileIndexRange = .{ + .off = offset, + .len = len, + .resv = 0, + }; + + const res = linux.io_uring_register( + self.fd, + .register_file_alloc_range, + &range, + 0, + ); + + return handle_registration_result(res); +} + pub fn register_napi(self: *IoUring, napi: *Napi) !void { assert(self.fd >= 0); - const res = linux.io_uring_register(self.fd, .register_napi, napi, 1); + const res = linux.io_uring_register( + self.fd, + .register_napi, + napi, + 1, + ); try handle_registration_result(res); } pub fn unregister_napi(self: *IoUring, napi: *Napi) !void { assert(self.fd >= 0); - const res = linux.io_uring_register(self.fd, .unregister_napi, napi, 1); + const res = linux.io_uring_register( + self.fd, + .unregister_napi, + napi, + 1, + ); try handle_registration_result(res); } -/// Registers an array of buffers for use with `read_fixed` and `write_fixed`. -pub fn register_buffers(self: *IoUring, buffers: []const posix.iovec) !void { +pub fn register_clock(self: *IoUring, clock_reg: *ClockRegister) !void { assert(self.fd >= 0); const res = linux.io_uring_register( self.fd, - .register_buffers, - buffers.ptr, - @intCast(buffers.len), + .register_clock, + clock_reg, + 0, ); try handle_registration_result(res); } -/// Unregister the registered buffers. 
-pub fn unregister_buffers(self: *IoUring) !void { +pub fn register_ifq(self: *IoUring, ifq_reg: *ZcrxIfqRegister) !void { assert(self.fd >= 0); - const res = linux.io_uring_register(self.fd, .unregister_buffers, null, 0); - switch (linux.errno(res)) { - .SUCCESS => {}, - .NXIO => return error.BuffersNotRegistered, - else => |errno| return posix.unexpectedErrno(errno), - } + const res = linux.io_uring_register( + self.fd, + .register_zcrx_ifq, + ifq_reg, + 1, + ); + try handle_registration_result(res); +} + +pub fn register_resize_rings(self: *IoUring, _: *Params) !void { + assert(self.fd >= 0); + return error.Unimplemented; +} + +pub fn register_region(self: *IoUring, mem_reg: *MemRegionRegister) !void { + assert(self.fd >= 0); + const res = linux.io_uring_register( + self.fd, + .register_mem_region, + mem_reg, + 1, + ); + try handle_registration_result(res); } /// Returns a Probe which is used to probe the capabilities of the @@ -1390,18 +1591,6 @@ fn handle_registration_result(res: usize) !void { } } -/// Unregisters all registered file descriptors previously associated with the -/// ring. -pub fn unregister_files(self: *IoUring) !void { - assert(self.fd >= 0); - const res = linux.io_uring_register(self.fd, .unregister_files, null, 0); - switch (linux.errno(res)) { - .SUCCESS => {}, - .NXIO => return error.FilesNotRegistered, - else => |errno| return posix.unexpectedErrno(errno), - } -} - /// Prepares a socket creation request. /// New socket fd will be returned in completion result. /// Available since 5.19 @@ -1555,6 +1744,7 @@ pub fn getsockopt( ); } +// TODO: move buf_ring fns into BufferRing type /// Registers a shared buffer ring to be used with provided buffers. `entries` /// number of `io_uring_buf` structures is mem mapped and shared by kernel. /// @@ -1568,8 +1758,8 @@ pub fn setup_buf_ring( group_id: u16, flags: BufferRegister.Flags, ) !*align(page_size_min) BufferRing { - if (entries == 0 or entries > 1 << 15) return error.EntriesNotInRange; - if (!std.math.isPowerOfTwo(entries)) return error.EntriesNotPowerOfTwo; + if (entries == 0 or entries > math.maxInt(u16)) return error.EntriesNotInRange; + if (!math.isPowerOfTwo(entries)) return error.EntriesNotPowerOfTwo; const mmap_size: usize = entries * @sizeOf(Buffer); const mmap = try posix.mmap( @@ -1583,42 +1773,49 @@ pub fn setup_buf_ring( errdefer posix.munmap(mmap); assert(mmap.len == mmap_size); - const br: *align(page_size_min) BufferRing = @ptrCast(mmap.ptr); - try register_buf_ring(fd, @intFromPtr(br), entries, group_id, flags); - return br; -} - -fn register_buf_ring( - fd: linux.fd_t, - addr: u64, - entries: u32, - group_id: u16, - flags: BufferRegister.Flags, -) !void { + const br_addr: *align(page_size_min) BufferRing = @ptrCast(mmap.ptr); var reg = mem.zeroInit(BufferRegister, .{ - .ring_addr = addr, + .ring_addr = @intFromPtr(br_addr), .ring_entries = entries, .bgid = group_id, .flags = flags, }); - var res = linux.io_uring_register(fd, .register_pbuf_ring, @ptrCast(®), 1); - if (linux.errno(res) == .INVAL and reg.flags.iou_pbuf_ring_inc) { + try register_buf_ring(fd, ®); + return br_addr; +} + +pub fn register_buf_ring( + fd: posix.fd_t, + buf_reg: *BufferRegister, +) !void { + var res = linux.io_uring_register( + fd, + .register_pbuf_ring, + buf_reg, + 1, + ); + if (linux.E.init(res) == .INVAL and buf_reg.flags.iou_pbuf_ring_inc) { // Retry without incremental buffer consumption. // It is available since kernel 6.12. returns INVAL on older. 
- reg.flags.iou_pbuf_ring_inc = false; - res = linux.io_uring_register(fd, .register_pbuf_ring, @ptrCast(®), 1); + buf_reg.flags.iou_pbuf_ring_inc = false; + res = linux.io_uring_register( + fd, + .register_pbuf_ring, + buf_reg, + 1, + ); } try handle_register_buf_ring_result(res); } -fn unregister_buf_ring(fd: posix.fd_t, group_id: u16) !void { +pub fn unregister_buf_ring(fd: posix.fd_t, buf_group_id: u16) !void { var reg = mem.zeroInit(BufferRegister, .{ - .bgid = group_id, + .bgid = buf_group_id, }); const res = linux.io_uring_register( fd, .unregister_pbuf_ring, - @ptrCast(®), + ®, 1, ); try handle_register_buf_ring_result(res); @@ -2894,7 +3091,7 @@ pub const RegionDesc = extern struct { // COMMIT: add new io_uring_mem_region_reg struct /// matches `io_uring_mem_region_reg` in liburing -pub const MemRegionReg = extern struct { +pub const MemRegionRegister = extern struct { /// struct io_uring_region_desc (RegionDesc in Zig) region_uptr: u64, flags: Flags, @@ -2918,7 +3115,7 @@ pub const RsrcRegister = extern struct { pub const Flags = packed struct(u32) { /// Register a fully sparse file space, rather than pass in an array of /// all -1 file descriptors. - rsrc_register_sparse: bool = false, + register_sparse: bool = false, _: 31 = 0, }; }; @@ -3210,7 +3407,7 @@ pub const constants = struct { /// (like openat/openat2/accept), then io_uring will allocate in. The /// picked direct descriptor will be returned in cqe.res, or -ENFILE /// if the space is full. - pub const FILE_INDEX_ALLOC = std.math.maxInt(u32); + pub const FILE_INDEX_ALLOC = math.maxInt(u32); pub const CMD_MASK = 1 << 0; @@ -3926,7 +4123,7 @@ test "splice/read" { _ = try file_src.write(&buffer_write); const fds = try posix.pipe(); - const pipe_offset: u64 = std.math.maxInt(u64); + const pipe_offset: u64 = math.maxInt(u64); const sqe_splice_to_pipe = try ring.splice(0x11111111, fd_src, 0, fds[1], pipe_offset, buffer_write.len); try testing.expectEqual(Op.splice, sqe_splice_to_pipe.opcode); From ba34ff2fb29a4a4cfdfdf1f414aa2fe0cd223285 Mon Sep 17 00:00:00 2001 From: Bernard Assan Date: Sun, 12 Oct 2025 21:48:16 +0000 Subject: [PATCH 23/41] Remove unnecessary null to optional anyopaque coercion Signed-off-by: Bernard Assan --- lib/std/os/linux/IoUring.zig | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/lib/std/os/linux/IoUring.zig b/lib/std/os/linux/IoUring.zig index 1c816dad8966..49901cd3accd 100644 --- a/lib/std/os/linux/IoUring.zig +++ b/lib/std/os/linux/IoUring.zig @@ -1364,7 +1364,7 @@ pub fn register_personality(self: *IoUring) !void { const res = linux.io_uring_register( self.fd, .register_personality, - @as(?*anyopaque, null), + null, 0, ); try handle_registration_result(res); @@ -1372,12 +1372,7 @@ pub fn register_personality(self: *IoUring) !void { pub fn unregister_personality(self: *IoUring, credential_id: u32) !void { assert(self.fd >= 0); - const res = linux.io_uring_register( - self.fd, - .unregister_personality, - @as(?*anyopaque, null), - credential_id, - ); + const res = linux.io_uring_register(self.fd, .unregister_personality, null, credential_id); try handle_registration_result(res); } @@ -1397,8 +1392,8 @@ pub fn enable_rings(self: *IoUring) !void { const res = linux.io_uring_register( self.fd, .register_enable_rings, - @as(?*anyopaque, null), - @intCast(0), + null, + 0, ); try handle_registration_result(res); } @@ -1423,7 +1418,7 @@ pub fn unregister_iowq_aff(self: *IoUring) !void { const res = linux.io_uring_register( self.fd, .unregister_iowq_aff, - @as(?*anyopaque, 
null),
+        null,
         0,
     );
     try handle_registration_result(res);

From 42a13cf5e3fe39b35140e54339746ac7ab535cb2 Mon Sep 17 00:00:00 2001
From: Bernard Assan
Date: Mon, 13 Oct 2025 23:15:54 +0000
Subject: [PATCH 24/41] Move buf_ring_* functions into BufferRing type as
 methods

Add IoUring tee syscall functionality.

Adapt the ReadBuffer union idea for read_fixed to support fixed read and
readv. Use the same idea and create a WriteBuffer type for write() and
write_fixed().

Ignore flags for splice and tee; let's see if they become important in the
future.

Add a get_data helper to Cqe.

Signed-off-by: Bernard Assan
---
 lib/std/os/linux/IoUring.zig | 642 ++++++++++++++++-------------------
 1 file changed, 297 insertions(+), 345 deletions(-)

diff --git a/lib/std/os/linux/IoUring.zig b/lib/std/os/linux/IoUring.zig
index 49901cd3accd..4ff57be3a609 100644
--- a/lib/std/os/linux/IoUring.zig
+++ b/lib/std/os/linux/IoUring.zig
@@ -407,51 +407,6 @@ pub fn nop(self: *IoUring, user_data: u64) !*Sqe {
     return sqe;
 }
 
-/// Queues (but does not submit) an SQE to perform a `read(2)` or `preadv(2)`
-/// depending on the buffer type.
-/// * Reading into a `ReadBuffer.buffer` uses `read(2)`
-/// * Reading into a `ReadBuffer.iovecs` uses `preadv(2)`
-///
-/// If you want to do a `preadv2(2)` then set `rw_flags` on the returned SQE.
-/// See https://man7.org/linux/man-pages/man2/preadv2.2.html
-///
-/// Returns a pointer to the SQE.
-pub fn read(
-    self: *IoUring,
-    user_data: u64,
-    fd: linux.fd_t,
-    buffer: ReadBuffer,
-    offset: u64,
-) !*Sqe {
-    const sqe = try self.get_sqe();
-    switch (buffer) {
-        .buffer => |slice| sqe.prep_read(fd, slice, offset),
-        .iovecs => |vecs| sqe.prep_readv(fd, vecs, offset),
-        .buffer_selection => |selection| {
-            sqe.prep_rw(.read, fd, 0, selection.len, offset);
-            sqe.flags.buffer_select = true;
-            sqe.buf_index = selection.group_id;
-        },
-    }
-    sqe.user_data = user_data;
-    return sqe;
-}
-
-/// Queues (but does not submit) an SQE to perform a `write(2)`.
-/// Returns a pointer to the SQE.
-pub fn write(
-    self: *IoUring,
-    user_data: u64,
-    fd: linux.fd_t,
-    buffer: []const u8,
-    offset: u64,
-) !*Sqe {
-    const sqe = try self.get_sqe();
-    sqe.prep_write(fd, buffer, offset);
-    sqe.user_data = user_data;
-    return sqe;
-}
-
 /// Queues (but does not submit) an SQE to perform a `splice(2)`
 /// Either `fd_in` or `fd_out` must be a pipe.
 /// If `fd_in` refers to a pipe, `off_in` is ignored and must be set to
@@ -488,6 +443,51 @@ pub fn splice(
     return sqe;
 }
 
+// COMMIT: ignore flags for splice and tee; let's see if they become important
+// in the future
+pub fn tee(
+    self: *IoUring,
+    user_data: u64,
+    fd_in: posix.fd_t,
+    fd_out: posix.fd_t,
+    len: usize,
+) !*Sqe {
+    const sqe = try self.get_sqe();
+    sqe.prep_tee(fd_in, fd_out, len);
+    sqe.user_data = user_data;
+    return sqe;
+}
+
+/// Queues (but does not submit) an SQE to perform a `pread(2)` or `preadv(2)`
+/// depending on the buffer type.
+/// * Reading into a `ReadBuffer.buffer` uses `pread(2)`
+/// * Reading into a `ReadBuffer.iovecs` uses `preadv(2)`
+///
+/// If you want to do a `preadv2(2)` then set `rw_flags` on the returned SQE.
+/// See https://man7.org/linux/man-pages/man2/preadv2.2.html
+///
+/// Returns a pointer to the SQE.
+pub fn read(
+    self: *IoUring,
+    user_data: u64,
+    fd: posix.fd_t,
+    buffer: ReadBuffer,
+    offset: u64,
+) !*Sqe {
+    const sqe = try self.get_sqe();
+    switch (buffer) {
+        .buffer => |slice| sqe.prep_read(fd, slice, offset),
+        .iovecs => |vecs| sqe.prep_readv(fd, vecs, offset),
+        .buffer_selection => |selection| {
+            sqe.prep_rw(.read, fd, 0, selection.len, offset);
+            sqe.flags.buffer_select = true;
+            sqe.buf_index = selection.group_id;
+        },
+    }
+    sqe.user_data = user_data;
+    return sqe;
+}
+
 /// Queues (but does not submit) an SQE to perform an IORING_OP_READ_FIXED.
 /// The `buffer` provided must be registered with the kernel by calling
 /// `register_buffers()` first. The `buffer_index` must be the same as its
@@ -499,30 +499,45 @@ pub fn read_fixed(
     self: *IoUring,
     user_data: u64,
     fd: linux.fd_t,
-    buffer: *posix.iovec,
+    buffer: ReadBuffer,
     offset: u64,
     buffer_index: u16,
 ) !*Sqe {
     const sqe = try self.get_sqe();
-    sqe.prep_read_fixed(fd, buffer, offset, buffer_index);
+    switch (buffer) {
+        .buffer => |slice| sqe.prep_read_fixed(fd, slice, offset, buffer_index),
+        .iovecs => |vecs| sqe.prep_readv_fixed(fd, vecs, offset, buffer_index),
+        .buffer_selection => |selection| {
+            sqe.prep_rw(.read_fixed, fd, 0, selection.len, offset);
+            sqe.flags.buffer_select = true;
+            sqe.buf_index = selection.group_id;
+        },
+    }
     sqe.user_data = user_data;
     return sqe;
 }
 
-/// Queues (but does not submit) an SQE to perform a `pwritev()`.
+/// Queues (but does not submit) an SQE to perform a `pwrite(2)` or `pwritev(2)`
+/// depending on the write buffer type.
+/// * Writing from a `WriteBuffer.buffer` uses `pwrite(2)`
+/// * Writing from a `WriteBuffer.iovecs` uses `pwritev(2)`
+///
 /// Returns a pointer to the SQE so that you can further modify the SQE for
 /// advanced use cases.
 /// For example, if you want to do a `pwritev2()` then set `rw_flags` on the
 /// returned SQE. See https://linux.die.net/man/2/pwritev.
-pub fn writev(
+pub fn write(
     self: *IoUring,
     user_data: u64,
     fd: linux.fd_t,
-    iovecs: []const posix.iovec_const,
+    buffer: WriteBuffer,
     offset: u64,
 ) !*Sqe {
     const sqe = try self.get_sqe();
-    sqe.prep_writev(fd, iovecs, offset);
+    switch (buffer) {
+        .buffer => |slice| sqe.prep_write(fd, slice, offset),
+        .iovecs => |vecs| sqe.prep_writev(fd, vecs, offset),
+    }
     sqe.user_data = user_data;
     return sqe;
 }
@@ -538,12 +553,80 @@ pub fn write_fixed(
     self: *IoUring,
     user_data: u64,
     fd: linux.fd_t,
-    buffer: *posix.iovec,
+    buffer: WriteBuffer,
     offset: u64,
     buffer_index: u16,
 ) !*Sqe {
     const sqe = try self.get_sqe();
-    sqe.prep_write_fixed(fd, buffer, offset, buffer_index);
+    switch (buffer) {
+        .buffer => |slice| {
+            sqe.prep_write_fixed(fd, slice, offset, buffer_index);
+        },
+        .iovecs => |vecs| {
+            sqe.prep_writev_fixed(fd, vecs, offset, buffer_index);
+        },
+    }
+    sqe.user_data = user_data;
+    return sqe;
+}
+
+/// Queues (but does not submit) an SQE to perform a `recvmsg(2)`.
+/// Returns a pointer to the SQE.
+/// Available since 5.3
+pub fn recvmsg(
+    self: *IoUring,
+    user_data: u64,
+    fd: posix.fd_t,
+    msg: *posix.msghdr,
+    flags: linux.Msg,
+) !*Sqe {
+    const sqe = try self.get_sqe();
+    sqe.prep_recvmsg(fd, msg, flags);
+    sqe.user_data = user_data;
+    return sqe;
+}
+
+/// Queues (but does not submit) an SQE to perform a multishot `recvmsg(2)`.
+/// Returns a pointer to the SQE.
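
The unified `write` above drives `pwrite(2)` or `pwritev(2)` depending on the union arm; a minimal sketch (assumes an initialized `ring` and an open `fd`):

    _ = try ring.write(0xA, fd, .{ .buffer = "abc" }, 0);
    var iov = [_]posix.iovec_const{.{ .base = "def", .len = 3 }};
    _ = try ring.write(0xB, fd, .{ .iovecs = &iov }, 3);
    _ = try ring.submit();
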
+pub fn recvmsg_multishot( + self: *IoUring, + user_data: u64, + fd: posix.fd_t, + msg: *posix.msghdr, + flags: linux.Msg, +) !*Sqe { + const sqe = try self.get_sqe(); + sqe.prep_recvmsg_multishot(fd, msg, flags); + sqe.user_data = user_data; + return sqe; +} + +/// Queues (but does not submit) an SQE to perform a `sendmsg(2)`. +/// Returns a pointer to the SQE. +/// Available since 5.3 +pub fn sendmsg( + self: *IoUring, + user_data: u64, + fd: posix.fd_t, + msg: *const posix.msghdr_const, + flags: linux.Msg, +) !*Sqe { + const sqe = try self.get_sqe(); + sqe.prep_sendmsg(fd, msg, flags); + sqe.user_data = user_data; + return sqe; +} + +/// Queues (but does not submit) an SQE to perform a `poll(2)`. +/// Returns a pointer to the SQE. +pub fn poll_add( + self: *IoUring, + user_data: u64, + fd: posix.fd_t, + poll_mask: linux.Epoll, +) !*Sqe { + const sqe = try self.get_sqe(); + sqe.prep_poll_add(fd, poll_mask); sqe.user_data = user_data; return sqe; } @@ -746,38 +829,6 @@ pub fn send_zc_fixed( return sqe; } -/// Queues (but does not submit) an SQE to perform a `recvmsg(2)`. -/// Returns a pointer to the SQE. -/// Available since 5.3 -pub fn recvmsg( - self: *IoUring, - user_data: u64, - fd: linux.fd_t, - msg: *linux.msghdr, - flags: linux.Msg, -) !*Sqe { - const sqe = try self.get_sqe(); - sqe.prep_recvmsg(fd, msg, flags); - sqe.user_data = user_data; - return sqe; -} - -/// Queues (but does not submit) an SQE to perform a `sendmsg(2)`. -/// Returns a pointer to the SQE. -/// Available since 5.3 -pub fn sendmsg( - self: *IoUring, - user_data: u64, - fd: linux.fd_t, - msg: *const linux.msghdr_const, - flags: linux.Msg, -) !*Sqe { - const sqe = try self.get_sqe(); - sqe.prep_sendmsg(fd, msg, flags); - sqe.user_data = user_data; - return sqe; -} - /// Queues (but does not submit) an SQE to perform an async zerocopy `sendmsg(2)`. /// Returns a pointer to the SQE. /// Available since 6.1 @@ -931,20 +982,6 @@ pub fn link_timeout( return sqe; } -/// Queues (but does not submit) an SQE to perform a `poll(2)`. -/// Returns a pointer to the SQE. -pub fn poll_add( - self: *IoUring, - user_data: u64, - fd: linux.fd_t, - poll_mask: linux.Epoll, -) !*Sqe { - const sqe = try self.get_sqe(); - sqe.prep_poll_add(fd, poll_mask); - sqe.user_data = user_data; - return sqe; -} - /// Queues (but does not submit) an SQE to remove an existing poll operation. /// Returns a pointer to the SQE. pub fn poll_remove( @@ -1186,25 +1223,16 @@ pub fn register_buffers_sparse(self: *IoUring, nr: u32) !void { .nr = nr, }; - const res = linux.io_uring_register( - self.fd, - .register_buffers2, - ®, - @sizeOf(RsrcRegister), - ); + const res = linux.io_uring_register(self.fd, .register_buffers2, ®, @sizeOf(RsrcRegister)); try handle_registration_result(res); } -/// Registers an array of buffers for use with `read_fixed` and `write_fixed`. +/// Registers an array of buffers for use with `read_fixed`, `readv_fixed`, +/// `write_fixed` and `writev_fixed`. 
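
Registration plus fixed I/O with the new unions, sketched (buffer sizes illustrative; assumes `ring` and `fd`):

    var space: [2][4096]u8 = undefined;
    var iovs = [_]posix.iovec{
        .{ .base = &space[0], .len = space[0].len },
        .{ .base = &space[1], .len = space[1].len },
    };
    try ring.register_buffers(&iovs);
    _ = try ring.write_fixed(0x1, fd, .{ .buffer = &space[0] }, 0, 0);
    _ = try ring.read_fixed(0x2, fd, .{ .buffer = &space[1] }, 0, 1);
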
pub fn register_buffers(self: *IoUring, buffers: []const posix.iovec) !void { assert(self.fd >= 0); - const res = linux.io_uring_register( - self.fd, - .register_buffers, - buffers.ptr, - @intCast(buffers.len), - ); + const res = linux.io_uring_register(self.fd, .register_buffers, buffers.ptr, @intCast(buffers.len)); try handle_registration_result(res); } @@ -1237,12 +1265,7 @@ pub fn register_files_update(self: *IoUring, offset: u32, fds: []const posix.fd_ .data = @intFromPtr(fds.ptr), }); - const res = linux.io_uring_register( - self.fd, - .register_files_update, - &update, - @intCast(fds.len), - ); + const res = linux.io_uring_register(self.fd, .register_files_update, &update, @intCast(fds.len)); try handle_registration_result(res); } @@ -1255,12 +1278,7 @@ pub fn register_files_sparse(self: *IoUring, nr_files: u32) !void { .flags = .{ .register_sparse = true }, }); - const res = linux.io_uring_register( - self.fd, - .register_files2, - ®, - @sizeOf(RsrcRegister), - ); + const res = linux.io_uring_register(self.fd, .register_files2, ®, @sizeOf(RsrcRegister)); return handle_registration_result(res); } @@ -1283,12 +1301,7 @@ pub fn register_files_sparse(self: *IoUring, nr_files: u32) !void { /// file descriptors. pub fn register_files(self: *IoUring, fds: []const linux.fd_t) !void { assert(self.fd >= 0); - const res = linux.io_uring_register( - self.fd, - .register_files, - fds.ptr, - @intCast(fds.len), - ); + const res = linux.io_uring_register(self.fd, .register_files, fds.ptr, @intCast(fds.len)); try handle_registration_result(res); } @@ -1309,12 +1322,7 @@ pub fn unregister_files(self: *IoUring) !void { /// Only a single a eventfd can be registered at any given point in time. pub fn register_eventfd(self: *IoUring, fd: linux.fd_t) !void { assert(self.fd >= 0); - const res = linux.io_uring_register( - self.fd, - .register_eventfd, - &fd, - 1, - ); + const res = linux.io_uring_register(self.fd, .register_eventfd, &fd, 1); try handle_registration_result(res); } @@ -1325,35 +1333,20 @@ pub fn register_eventfd(self: *IoUring, fd: linux.fd_t) !void { /// Only a single eventfd can be registered at any given point in time. pub fn register_eventfd_async(self: *IoUring, fd: linux.fd_t) !void { assert(self.fd >= 0); - const res = linux.io_uring_register( - self.fd, - .register_eventfd_async, - &fd, - 1, - ); + const res = linux.io_uring_register(self.fd, .register_eventfd_async, &fd, 1); try handle_registration_result(res); } /// Unregister the registered eventfd file descriptor. 
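
A registered eventfd turns CQ readiness into a pollable file descriptor; a sketch (assumes `ring`; error handling elided):

    const efd = try posix.eventfd(0, 0);
    defer posix.close(efd);
    try ring.register_eventfd(efd);
    // After submit(), an 8-byte read(2) from `efd` blocks until at least
    // one CQE has been posted to the completion ring.
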
pub fn unregister_eventfd(self: *IoUring) !void { assert(self.fd >= 0); - const res = linux.io_uring_register( - self.fd, - .unregister_eventfd, - null, - 0, - ); + const res = linux.io_uring_register(self.fd, .unregister_eventfd, null, 0); try handle_registration_result(res); } pub fn register_probe(self: *IoUring, probe: []Probe) !void { assert(self.fd >= 0); - const res = linux.io_uring_register( - self.fd, - .register_probe, - probe.ptr, - @intCast(probe.len), - ); + const res = linux.io_uring_register(self.fd, .register_probe, probe.ptr, @intCast(probe.len)); try handle_registration_result(res); } @@ -1361,12 +1354,7 @@ pub fn register_probe(self: *IoUring, probe: []Probe) !void { /// matches `io_uring_register_personality()` in liburing pub fn register_personality(self: *IoUring) !void { assert(self.fd >= 0); - const res = linux.io_uring_register( - self.fd, - .register_personality, - null, - 0, - ); + const res = linux.io_uring_register(self.fd, .register_personality, null, 0); try handle_registration_result(res); } @@ -1378,23 +1366,13 @@ pub fn unregister_personality(self: *IoUring, credential_id: u32) !void { pub fn register_restrictions(self: *IoUring, restriction: []Restriction) !void { assert(self.fd >= 0); - const res = linux.io_uring_register( - self.fd, - .register_restrictions, - restriction.ptr, - @intCast(restriction.len), - ); + const res = linux.io_uring_register(self.fd, .register_restrictions, restriction.ptr, @intCast(restriction.len)); try handle_registration_result(res); } pub fn enable_rings(self: *IoUring) !void { assert(self.fd >= 0); - const res = linux.io_uring_register( - self.fd, - .register_enable_rings, - null, - 0, - ); + const res = linux.io_uring_register(self.fd, .register_enable_rings, null, 0); try handle_registration_result(res); } @@ -1403,24 +1381,14 @@ pub fn register_iowq_aff(self: *IoUring, cpusz: u32, mask: *linux.cpu_set_t) !vo if (cpusz >= math.maxInt(u32)) return error.ArgumentsInvalid; - const res = linux.io_uring_register( - self.fd, - .register_iowq_aff, - mask, - cpusz, - ); + const res = linux.io_uring_register(self.fd, .register_iowq_aff, mask, cpusz); try handle_registration_result(res); } pub fn unregister_iowq_aff(self: *IoUring) !void { assert(self.fd >= 0); - const res = linux.io_uring_register( - self.fd, - .unregister_iowq_aff, - null, - 0, - ); + const res = linux.io_uring_register(self.fd, .unregister_iowq_aff, null, 0); try handle_registration_result(res); } @@ -1432,12 +1400,7 @@ pub fn unregister_iowq_aff(self: *IoUring) !void { pub fn register_iowq_max_workers(self: *IoUring, max_workers: [2]u32) !void { assert(self.fd >= 0); - const res = linux.io_uring_register( - self.fd, - .register_iowq_max_workers, - &max_workers, - 2, - ); + const res = linux.io_uring_register(self.fd, .register_iowq_max_workers, &max_workers, 2); try handle_registration_result(res); } @@ -1445,12 +1408,7 @@ pub fn register_iowq_max_workers(self: *IoUring, max_workers: [2]u32) !void { pub fn register_sync_cancel(self: *IoUring, cancel_reg: *SyncCancelRegister) !void { assert(self.fd >= 0); - const res = linux.io_uring_register( - self.fd, - .register_sync_cancel, - cancel_reg, - 1, - ); + const res = linux.io_uring_register(self.fd, .register_sync_cancel, cancel_reg, 1); try handle_registration_result(res); } @@ -1458,12 +1416,7 @@ pub fn register_sync_cancel(self: *IoUring, cancel_reg: *SyncCancelRegister) !vo pub fn register_sync_msg(self: *IoUring, sqe: *Sqe) !void { assert(self.fd >= 0); - const res = linux.io_uring_register( - -1, - 
.register_send_msg_ring, - sqe, - 1, - ); + const res = linux.io_uring_register(-1, .register_send_msg_ring, sqe, 1); try handle_registration_result(res); } @@ -1479,57 +1432,32 @@ pub fn register_file_alloc_range(self: *IoUring, offset: u32, len: u32) !void { .resv = 0, }; - const res = linux.io_uring_register( - self.fd, - .register_file_alloc_range, - &range, - 0, - ); + const res = linux.io_uring_register(self.fd, .register_file_alloc_range, &range, 0); return handle_registration_result(res); } pub fn register_napi(self: *IoUring, napi: *Napi) !void { assert(self.fd >= 0); - const res = linux.io_uring_register( - self.fd, - .register_napi, - napi, - 1, - ); + const res = linux.io_uring_register(self.fd, .register_napi, napi, 1); try handle_registration_result(res); } pub fn unregister_napi(self: *IoUring, napi: *Napi) !void { assert(self.fd >= 0); - const res = linux.io_uring_register( - self.fd, - .unregister_napi, - napi, - 1, - ); + const res = linux.io_uring_register(self.fd, .unregister_napi, napi, 1); try handle_registration_result(res); } pub fn register_clock(self: *IoUring, clock_reg: *ClockRegister) !void { assert(self.fd >= 0); - const res = linux.io_uring_register( - self.fd, - .register_clock, - clock_reg, - 0, - ); + const res = linux.io_uring_register(self.fd, .register_clock, clock_reg, 0); try handle_registration_result(res); } pub fn register_ifq(self: *IoUring, ifq_reg: *ZcrxIfqRegister) !void { assert(self.fd >= 0); - const res = linux.io_uring_register( - self.fd, - .register_zcrx_ifq, - ifq_reg, - 1, - ); + const res = linux.io_uring_register(self.fd, .register_zcrx_ifq, ifq_reg, 1); try handle_registration_result(res); } @@ -1540,12 +1468,7 @@ pub fn register_resize_rings(self: *IoUring, _: *Params) !void { pub fn register_region(self: *IoUring, mem_reg: *MemRegionRegister) !void { assert(self.fd >= 0); - const res = linux.io_uring_register( - self.fd, - .register_mem_region, - mem_reg, - 1, - ); + const res = linux.io_uring_register(self.fd, .register_mem_region, mem_reg, 1); try handle_registration_result(res); } @@ -1747,12 +1670,14 @@ pub fn getsockopt( /// a power of 2. /// `fd` is IO_Uring.fd for which the provided buffer ring is being registered. /// `group_id` is the chosen buffer group ID, unique in IO_Uring. 
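
Typical lifecycle of the method form defined just below, sketched for a 16-entry ring in buffer group 7 (assumes kernel support for provided-buffer rings):

    const br = try ring.init_buffer_ring(16, 7, .{});
    defer br.deinit(&ring, 16, 7);
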
-pub fn setup_buf_ring( - fd: linux.fd_t, +/// matches `io_uring_setup_buf_ring()` in liburing +pub fn init_buffer_ring( + self: *IoUring, entries: u16, group_id: u16, flags: BufferRegister.Flags, ) !*align(page_size_min) BufferRing { + assert(self.fd >= 0); if (entries == 0 or entries > math.maxInt(u16)) return error.EntriesNotInRange; if (!math.isPowerOfTwo(entries)) return error.EntriesNotPowerOfTwo; @@ -1768,51 +1693,36 @@ pub fn setup_buf_ring( errdefer posix.munmap(mmap); assert(mmap.len == mmap_size); - const br_addr: *align(page_size_min) BufferRing = @ptrCast(mmap.ptr); + const buffer_ring: *align(page_size_min) BufferRing = @ptrCast(mmap.ptr); var reg = mem.zeroInit(BufferRegister, .{ - .ring_addr = @intFromPtr(br_addr), + .ring_addr = @intFromPtr(buffer_ring), .ring_entries = entries, .bgid = group_id, .flags = flags, }); - try register_buf_ring(fd, ®); - return br_addr; + try self.register_buffer_ring(®); + buffer_ring.init(); + return buffer_ring; } -pub fn register_buf_ring( - fd: posix.fd_t, - buf_reg: *BufferRegister, -) !void { - var res = linux.io_uring_register( - fd, - .register_pbuf_ring, - buf_reg, - 1, - ); +/// matches `io_uring_register_buf_ring` +pub fn register_buffer_ring(self: *IoUring, buf_reg: *BufferRegister) !void { + var res = linux.io_uring_register(self.fd, .register_pbuf_ring, buf_reg, 1); if (linux.E.init(res) == .INVAL and buf_reg.flags.iou_pbuf_ring_inc) { // Retry without incremental buffer consumption. // It is available since kernel 6.12. returns INVAL on older. buf_reg.flags.iou_pbuf_ring_inc = false; - res = linux.io_uring_register( - fd, - .register_pbuf_ring, - buf_reg, - 1, - ); + res = linux.io_uring_register(self.fd, .register_pbuf_ring, buf_reg, 1); } try handle_register_buf_ring_result(res); } -pub fn unregister_buf_ring(fd: posix.fd_t, buf_group_id: u16) !void { +/// matches `io_uring_unregister_buf_ring` +pub fn unregister_buffer_ring(self: *IoUring, buf_group_id: u16) !void { var reg = mem.zeroInit(BufferRegister, .{ .bgid = buf_group_id, }); - const res = linux.io_uring_register( - fd, - .unregister_pbuf_ring, - ®, - 1, - ); + const res = linux.io_uring_register(self.fd, .unregister_pbuf_ring, ®, 1); try handle_register_buf_ring_result(res); } @@ -1824,57 +1734,6 @@ fn handle_register_buf_ring_result(res: usize) !void { } } -// Unregisters a previously registered shared buffer ring, returned from -// io_uring_setup_buf_ring. -pub fn free_buf_ring(fd: posix.fd_t, br: *align(page_size_min) BufferRing, entries: u32, group_id: u16) void { - unregister_buf_ring(fd, group_id) catch {}; - var mmap: []align(page_size_min) u8 = undefined; - mmap.ptr = @ptrCast(br); - mmap.len = entries * @sizeOf(Buffer); - posix.munmap(mmap); -} - -/// Initialises `br` so that it is ready to be used. -pub fn buf_ring_init(br: *BufferRing) void { - br.tail = 0; -} - -/// Calculates the appropriate size mask for a buffer ring. -/// `entries` is the ring entries as specified in io_uring_register_buf_ring. -pub fn buf_ring_mask(entries: u16) u16 { - return entries - 1; -} - -/// Assigns `buffer` with the `br` buffer ring. -/// `buffer_id` is identifier which will be returned in the CQE. -/// `buffer_offset` is the offset to insert at from the current tail. -/// If just one buffer is provided before the ring tail is committed with -/// advance then offset should be 0. -/// If buffers are provided in a loop before being committed, the offset must -/// be incremented by one for each buffer added. 
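
The free functions removed here survive as `BufferRing` methods; the fill cycle they describe looks like this (sketch; 16 entries, page-sized buffers assumed, `br` from `init_buffer_ring`):

    const m = br.mask(16);
    var bufs: [16][4096]u8 = undefined;
    for (&bufs, 0..) |*b, i| {
        br.add(b, @intCast(i), m, @intCast(i));
    }
    br.advance(16);
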
-pub fn buf_ring_add( - br: *BufferRing, - buffer: []u8, - buffer_id: u16, - mask: u16, - buffer_offset: u16, -) void { - const bufs: [*]Buffer = @ptrCast(br); - const buf: *Buffer = &bufs[(br.tail +% buffer_offset) & mask]; - - buf.addr = @intFromPtr(buffer.ptr); - buf.len = @intCast(buffer.len); - buf.bid = buffer_id; -} - -/// Make `count` new buffers visible to the kernel. Called after -/// `io_uring_buf_ring_add` has been called `count` times to fill in new -/// buffers. -pub fn buf_ring_advance(br: *BufferRing, count: u16) void { - const tail: u16 = br.tail +% count; - @atomicStore(u16, &br.tail, tail, .release); -} - /// IO completion data structure (Completion Queue Entry) pub const Cqe = extern struct { /// sqe.user_data value passed back @@ -1922,6 +1781,12 @@ pub const Cqe = extern struct { _17: u16 = 0, }; + /// Retrive the 64-bit cqe `user_data`, as `*T` after completion of an Sqe + /// this data is passed through `Sqe` -> `Cqe` unchanged + pub fn get_data(cqe: Cqe, comptime T: type) *T { + return @ptrFromInt(cqe.user_data); + } + pub fn err(self: Cqe) linux.E { if (self.res > -4096 and self.res < 0) { return @enumFromInt(-self.res); @@ -2108,20 +1973,45 @@ pub const Sqe = extern struct { }; } - pub fn prep_read(sqe: *Sqe, fd: linux.fd_t, buffer: []u8, offset: u64) void { - sqe.prep_rw(.read, fd, @intFromPtr(buffer.ptr), buffer.len, offset); - } - pub fn prep_write(sqe: *Sqe, fd: linux.fd_t, buffer: []const u8, offset: u64) void { sqe.prep_rw(.write, fd, @intFromPtr(buffer.ptr), buffer.len, offset); } + pub fn prep_writev( + sqe: *Sqe, + fd: linux.fd_t, + iovecs: []const std.posix.iovec_const, + offset: u64, + ) void { + sqe.prep_rw(.writev, fd, @intFromPtr(iovecs.ptr), iovecs.len, offset); + } + + pub fn prep_write_fixed(sqe: *Sqe, fd: linux.fd_t, buffer: []const u8, offset: u64, buffer_index: u16) void { + sqe.prep_rw(.write_fixed, fd, @intFromPtr(buffer.ptr), buffer.len, offset); + sqe.buf_index = buffer_index; + } + + pub fn prep_writev_fixed(sqe: *Sqe, fd: linux.fd_t, iovecs: []const posix.iovec_const, offset: u64, buffer_index: u16) void { + sqe.prep_rw(.write_fixed, fd, @intFromPtr(iovecs.ptr), iovecs.len, offset); + sqe.buf_index = buffer_index; + } + pub fn prep_splice(sqe: *Sqe, fd_in: linux.fd_t, off_in: u64, fd_out: linux.fd_t, off_out: u64, len: usize) void { sqe.prep_rw(.splice, fd_out, undefined, len, off_out); sqe.addr = off_in; sqe.splice_fd_in = fd_in; } + pub fn prep_tee(sqe: *Sqe, fd_in: linux.fd_t, fd_out: linux.fd_t, len: usize) void { + sqe.prep_rw(.tee, fd_out, undefined, len, 0); + sqe.addr = undefined; + sqe.splice_fd_in = fd_in; + } + + pub fn prep_read(sqe: *Sqe, fd: linux.fd_t, buffer: []u8, offset: u64) void { + sqe.prep_rw(.read, fd, @intFromPtr(buffer.ptr), buffer.len, offset); + } + pub fn prep_readv( sqe: *Sqe, fd: linux.fd_t, @@ -2131,22 +2021,25 @@ pub const Sqe = extern struct { sqe.prep_rw(.readv, fd, @intFromPtr(iovecs.ptr), iovecs.len, offset); } - pub fn prep_writev( + pub fn prep_read_fixed( sqe: *Sqe, fd: linux.fd_t, - iovecs: []const std.posix.iovec_const, + buffer: []u8, offset: u64, + buffer_index: u16, ) void { - sqe.prep_rw(.writev, fd, @intFromPtr(iovecs.ptr), iovecs.len, offset); - } - - pub fn prep_read_fixed(sqe: *Sqe, fd: linux.fd_t, buffer: *std.posix.iovec, offset: u64, buffer_index: u16) void { - sqe.prep_rw(.read_fixed, fd, @intFromPtr(buffer.base), buffer.len, offset); + sqe.prep_rw(.read_fixed, fd, @intFromPtr(buffer.ptr), buffer.len, offset); sqe.buf_index = buffer_index; } - pub fn prep_write_fixed(sqe: *Sqe, fd: 
linux.fd_t, buffer: *std.posix.iovec, offset: u64, buffer_index: u16) void { - sqe.prep_rw(.write_fixed, fd, @intFromPtr(buffer.base), buffer.len, offset); + pub fn prep_readv_fixed( + sqe: *Sqe, + fd: linux.fd_t, + iovecs: []const std.posix.iovec, + offset: u64, + buffer_index: u16, + ) void { + sqe.prep_rw(.read_fixed, fd, @intFromPtr(iovecs.ptr), iovecs.len, offset); sqe.buf_index = buffer_index; } @@ -2892,18 +2785,17 @@ pub const BufferGroup = struct { const heads = try allocator.alloc(u32, buffers_count); errdefer allocator.free(heads); - const br = try setup_buf_ring(ring.fd, buffers_count, group_id, .{ .iou_pbuf_ring_inc = true }); - buf_ring_init(br); + const br = try ring.init_buffer_ring(buffers_count, group_id, .{ .iou_pbuf_ring_inc = true }); - const mask = buf_ring_mask(buffers_count); + const mask = br.mask(buffers_count); var i: u16 = 0; while (i < buffers_count) : (i += 1) { const pos = buffer_size * i; const buf = buffers[pos .. pos + buffer_size]; heads[i] = 0; - buf_ring_add(br, buf, i, mask, i); + br.add(buf, i, mask, i); } - buf_ring_advance(br, buffers_count); + br.advance(buffers_count); return .{ .ring = ring, @@ -2917,7 +2809,7 @@ pub const BufferGroup = struct { } pub fn deinit(self: *BufferGroup, allocator: mem.Allocator) void { - free_buf_ring(self.ring.fd, self.br, self.buffers_count, self.group_id); + self.br.deinit(self.ring, self.buffers_count, self.group_id); allocator.free(self.buffers); allocator.free(self.heads); } @@ -2978,9 +2870,9 @@ pub const BufferGroup = struct { self.heads[buffer_id] = 0; // Release buffer to the kernel. - const mask = buf_ring_mask(self.buffers_count); - buf_ring_add(self.br, self.get_by_id(buffer_id), buffer_id, mask, 0); - buf_ring_advance(self.br, 1); + const mask = self.br.mask(self.buffers_count); + self.br.add(self.get_by_id(buffer_id), buffer_id, mask, 0); + self.br.advance(1); } }; @@ -3000,6 +2892,14 @@ pub const ReadBuffer = union(enum) { }, }; +/// Used to select how the write should be handled. +pub const WriteBuffer = union(enum) { + /// io_uring will write data from this buffer into fd. + buffer: []const u8, + /// io_uring will write data from iovecs into fd using pwritev. + iovecs: []const posix.iovec_const, +}; + /// Used to select how the recv call should be handled. pub const RecvBuffer = union(enum) { /// io_uring will recv directly into this buffer @@ -3160,7 +3060,7 @@ pub const Probe = extern struct { ops: [256]ProbeOp, /// Is the operation supported on the running kernel. - pub fn is_supported(self: @This(), op: Op) bool { + pub fn is_supported(self: *const Probe, op: Op) bool { const i = @intFromEnum(op); if (i > @intFromEnum(self.last_op) or i >= self.ops_len) return false; @@ -3224,6 +3124,58 @@ pub const BufferRing = extern struct { resv2: u32, resv3: u16, tail: u16, + + /// Initialises `br` so that it is ready to be used. + /// matches `io_uring_buf_ring_init` in liburing + fn init(br: *align(page_size_min) BufferRing) void { + br.tail = 0; + } + + // Unregisters a previously registered shared buffer ring, returned from + // io_uring_setup_buf_ring. + pub fn deinit(br: *align(page_size_min) BufferRing, uring: *IoUring, entries: u32, group_id: u16) void { + uring.unregister_buffer_ring(group_id) catch {}; + var mmap: []align(page_size_min) u8 = undefined; + mmap.ptr = @ptrCast(br); + mmap.len = entries * @sizeOf(Buffer); + posix.munmap(mmap); + } + + /// Calculates the appropriate size mask for a buffer ring. 
+ /// `entries` is the ring entries as specified in io_uring_register_buf_ring + pub fn mask(_: *align(page_size_min) BufferRing, entries: u16) u16 { + return entries - 1; + } + + /// Assigns `buffer` with the `br` buffer ring. + /// `buffer_id` is identifier which will be returned in the CQE. + /// `buffer_offset` is the offset to insert at from the current tail. + /// If just one buffer is provided before the ring tail is committed with + /// advance then offset should be 0. + /// If buffers are provided in a loop before being committed, the offset must + /// be incremented by one for each buffer added. + pub fn add( + br: *align(page_size_min) BufferRing, + buffer: []u8, + buffer_id: u16, + buffer_mask: u16, + buffer_offset: u16, + ) void { + const bufs: [*]Buffer = @ptrCast(br); + const buf: *Buffer = &bufs[(br.tail +% buffer_offset) & buffer_mask]; + + buf.addr = @intFromPtr(buffer.ptr); + buf.len = @intCast(buffer.len); + buf.bid = buffer_id; + } + + /// Make `count` new buffers visible to the kernel. Called after + /// `io_uring_buf_ring_add` has been called `count` times to fill in new + /// buffers. + pub fn advance(br: *align(page_size_min) BufferRing, count: u16) void { + const tail: u16 = br.tail +% count; + @atomicStore(u16, &br.tail, tail, .release); + } }; /// argument for IORING_(UN)REGISTER_PBUF_RING @@ -4002,7 +3954,7 @@ test "writev/fsync/readv" { posix.iovec{ .base = &buffer_read, .len = buffer_read.len }, }; - const sqe_writev = try ring.writev(0xdddddddd, fd, iovecs_write[0..], 17); + const sqe_writev = try ring.write(0xdddddddd, fd, .{ .iovecs = iovecs_write[0..] }, 17); try testing.expectEqual(Op.writev, sqe_writev.opcode); try testing.expectEqual(17, sqe_writev.off); sqe_writev.link_next(); @@ -4064,7 +4016,7 @@ test "write/read" { const buffer_write = [_]u8{97} ** 20; var buffer_read = [_]u8{98} ** 20; - const sqe_write = try ring.write(0x11111111, fd, buffer_write[0..], 10); + const sqe_write = try ring.write(0x11111111, fd, .{ .buffer = buffer_write[0..] }, 10); try testing.expectEqual(Op.write, sqe_write.opcode); try testing.expectEqual(10, sqe_write.off); sqe_write.flags.io_link = true; @@ -4198,12 +4150,12 @@ test "write_fixed/read_fixed" { else => |e| return e, }; - const sqe_write = try ring.write_fixed(0x45454545, fd, &buffers[0], 3, 0); + const sqe_write = try ring.write_fixed(0x45454545, fd, .{ .buffer = raw_buffers[0][0..] }, 3, 0); try testing.expectEqual(Op.write_fixed, sqe_write.opcode); try testing.expectEqual(3, sqe_write.off); sqe_write.link_next(); - const sqe_read = try ring.read_fixed(0x12121212, fd, &buffers[1], 0, 1); + const sqe_read = try ring.read_fixed(0x12121212, fd, .{ .buffer = raw_buffers[1][0..] 
}, 0, 1);
    try testing.expectEqual(Op.read_fixed, sqe_read.opcode);
    try testing.expectEqual(0, sqe_read.off);

From 5ceaa00dacd77e46b05b2aae11026ffb77505721 Mon Sep 17 00:00:00 2001
From: Bernard Assan
Date: Tue, 14 Oct 2025 23:35:36 +0000
Subject: [PATCH 25/41] Implement some more IoUring operations

Add timeout_update, cancel_fd, epoll_wait, files_update, open, open_direct,
and f/madvice. Don't implement f/madvice64 for now; I doubt it is used by
many people in practice.

Implement BufferGroup read_multishot.

Try to make function definitions align as logically as possible with
liburing, to make keeping our implementation in sync easy.

TODO: add M/Fadvice enum flags
TODO: understand buffer_selection and how it's different from BufferGroup

Signed-off-by: Bernard Assan
---
 lib/std/os/linux.zig         |   7 +
 lib/std/os/linux/IoUring.zig | 726 ++++++++++++++++++++++-------------
 2 files changed, 460 insertions(+), 273 deletions(-)

diff --git a/lib/std/os/linux.zig b/lib/std/os/linux.zig
index a565b88ed6e0..10bec0cb67eb 100644
--- a/lib/std/os/linux.zig
+++ b/lib/std/os/linux.zig
@@ -8370,6 +8370,13 @@ pub const MADV = struct {
     pub const SOFT_OFFLINE = 101;
 };
 
+pub const Madvice = enum(u32) {
+    _, // TODO: add options
+};
+pub const Fadvice = enum(u32) {
+    _, // TODO: add options
+};
+
 pub const POSIX_FADV = switch (native_arch) {
     .s390x => if (@typeInfo(usize).int.bits == 64) struct {
         pub const NORMAL = 0;
diff --git a/lib/std/os/linux/IoUring.zig b/lib/std/os/linux/IoUring.zig
index 4ff57be3a609..b37e2d26d80d 100644
--- a/lib/std/os/linux/IoUring.zig
+++ b/lib/std/os/linux/IoUring.zig
@@ -374,39 +374,6 @@ pub fn cq_advance(self: *IoUring, count: u32) void {
     }
 }
 
-/// Queues (but does not submit) an SQE to perform an `fsync(2)`.
-/// Returns a pointer to the SQE so that you can further modify the SQE for
-/// advanced use cases.
-/// For example, for `fdatasync()` you can set `IORING_FSYNC_DATASYNC` in the
-/// SQE's `rw_flags`.
-/// N.B. While SQEs are initiated in the order in which they appear in the
-/// submission queue, operations execute in parallel and completions are
-/// unordered. Therefore, an application that submits a write followed by an
-/// fsync in the submission queue cannot expect the fsync to apply to the write,
-/// since the fsync may complete before the write is issued to the disk.
-/// You should preferably use `link_with_next_sqe()` on a write's SQE to link
-/// it with an fsync, or else insert a full write barrier using
-/// `drain_previous_sqes()` when queueing an fsync.
-pub fn fsync(self: *IoUring, user_data: u64, fd: posix.fd_t, flags: uflags.Fsync) !*Sqe {
-    const sqe = try self.get_sqe();
-    sqe.prep_fsync(fd, flags);
-    sqe.user_data = user_data;
-    return sqe;
-}
-
-/// Queues (but does not submit) an SQE to perform a no-op.
-/// Returns a pointer to the SQE so that you can further modify the SQE for
-/// advanced use cases.
-/// A no-op is more useful than may appear at first glance.
-/// For example, you could call `drain_previous_sqes()` on the returned SQE, to
-/// use the no-op to know when the ring is idle before acting on a kill signal.
-pub fn nop(self: *IoUring, user_data: u64) !*Sqe {
-    const sqe = try self.get_sqe();
-    sqe.prep_nop();
-    sqe.user_data = user_data;
-    return sqe;
-}
-
 /// Queues (but does not submit) an SQE to perform a `splice(2)`
 /// Either `fd_in` or `fd_out` must be a pipe.
/// If `fd_in` refers to a pipe, `off_in` is ignored and must be set to @@ -631,32 +598,146 @@ pub fn poll_add( return sqe; } -/// Queues (but does not submit) an SQE to perform an `accept4(2)` on a socket. +/// Queues (but does not submit) an SQE to perform a multishot `poll(2)`. /// Returns a pointer to the SQE. -/// Available since 5.5 -pub fn accept( +pub fn poll_multishot( self: *IoUring, user_data: u64, - fd: linux.fd_t, - addr: ?*posix.sockaddr, - addrlen: ?*posix.socklen_t, - flags: linux.Sock, + fd: posix.fd_t, + poll_mask: linux.Epoll, +) !*Sqe { + const sqe = try self.poll_add(user_data, fd, poll_mask); + sqe.len = @bitCast(uflags.Poll{ .add_multi = true }); + return sqe; +} + +/// Queues (but does not submit) an SQE to remove an existing poll operation. +/// Returns a pointer to the SQE. +pub fn poll_remove( + self: *IoUring, + user_data: u64, + target_user_data: u64, ) !*Sqe { const sqe = try self.get_sqe(); - sqe.prep_accept(fd, addr, addrlen, flags); + sqe.prep_poll_remove(target_user_data); sqe.user_data = user_data; return sqe; } -/// Queues an multishot accept on a socket. +/// Queues (but does not submit) an SQE to update the user data of an existing +/// poll operation. Returns a pointer to the SQE. +pub fn poll_update( + self: *IoUring, + user_data: u64, + old_user_data: u64, + new_user_data: u64, + poll_mask: linux.Epoll, + flags: uflags.Poll, +) !*Sqe { + const sqe = try self.get_sqe(); + sqe.prep_poll_update(old_user_data, new_user_data, poll_mask, flags); + sqe.user_data = user_data; + return sqe; +} + +/// Queues (but does not submit) an SQE to perform an `fsync(2)`. +/// Returns a pointer to the SQE so that you can further modify the SQE for +/// advanced use cases. +/// For example, for `fdatasync()` you can set `IORING_FSYNC_DATASYNC` in the +/// SQE's `rw_flags`. +/// N.B. While SQEs are initiated in the order in which they appear in the +/// submission queue, operations execute in parallel and completions are +/// unordered. Therefore, an application that submits a write followed by an +/// fsync in the submission queue cannot expect the fsync to apply to the write, +/// since the fsync may complete before the write is issued to the disk. +/// You should preferably use `link_with_next_sqe()` on a write's SQE to link +/// it with an fsync, or else insert a full write barrier using +/// `drain_previous_sqes()` when queueing an fsync. +pub fn fsync(self: *IoUring, user_data: u64, fd: posix.fd_t, flags: uflags.Fsync) !*Sqe { + const sqe = try self.get_sqe(); + sqe.prep_fsync(fd, flags); + sqe.user_data = user_data; + return sqe; +} + +/// Queues (but does not submit) an SQE to perform a no-op. +/// Returns a pointer to the SQE so that you can further modify the SQE for +/// advanced use cases. +/// A no-op is more useful than may appear at first glance. +/// For example, you could call `drain_previous_sqes()` on the returned SQE, to +/// use the no-op to know when the ring is idle before acting on a kill signal. +pub fn nop(self: *IoUring, user_data: u64) !*Sqe { + const sqe = try self.get_sqe(); + sqe.prep_nop(); + sqe.user_data = user_data; + return sqe; +} + +/// Queues (but does not submit) an SQE to register a timeout operation. +/// Returns a pointer to the SQE. /// -/// Multishot variant allows an application to issue a single accept request, -/// which will repeatedly trigger a CQE when a connection request comes in. -/// While IORING_CQE_F_MORE flag is set in CQE flags accept will generate -/// further CQEs. 
+/// The timeout will complete when either the timeout expires, or after the +/// specified number of events complete (if `count` is greater than `0`). /// -/// Available since 5.19 -pub fn accept_multishot( +/// `flags` may be `0` for a relative timeout, or `IORING_TIMEOUT_ABS` for an +/// absolute timeout. +/// +/// The completion event result will be `-ETIME` if the timeout completed +/// through expiration, `0` if the timeout completed after the specified number +/// of events, or `-ECANCELED` if the timeout was removed before it expired. +/// +/// io_uring timeouts use the `CLOCK.MONOTONIC` clock source. +pub fn timeout( + self: *IoUring, + user_data: u64, + ts: *const linux.kernel_timespec, + count: u32, + flags: uflags.Timeout, +) !*Sqe { + const sqe = try self.get_sqe(); + sqe.prep_timeout(ts, count, flags); + sqe.user_data = user_data; + return sqe; +} + +/// Queues (but does not submit) an SQE to remove an existing timeout operation. +/// Returns a pointer to the SQE. +/// +/// The timeout is identified by its `user_data`. +/// +/// The completion event result will be `0` if the timeout was found and +/// cancelled successfully else: +/// `-EBUSY` if the timeout was found but expiration was already in progress, or +/// `-ENOENT` if the timeout was not found. +pub fn timeout_remove( + self: *IoUring, + user_data: u64, + timeout_user_data: u64, + flags: uflags.Timeout, +) !*Sqe { + const sqe = try self.get_sqe(); + sqe.prep_timeout_remove(timeout_user_data, flags); + sqe.user_data = user_data; + return sqe; +} + +pub fn timeout_update( + self: *IoUring, + user_data: u64, + timeout_user_data: u64, + ts: *const linux.kernel_timespec, + flags: uflags.Timeout, +) !*Sqe { + const sqe = try self.get_sqe(); + sqe.prep_timeout_update(timeout_user_data, ts, flags); + sqe.user_data = user_data; + return sqe; +} + +/// Queues (but does not submit) an SQE to perform an `accept4(2)` on a socket. +/// Returns a pointer to the SQE. +/// Available since 5.5 +pub fn accept( self: *IoUring, user_data: u64, fd: linux.fd_t, @@ -665,7 +746,7 @@ pub fn accept_multishot( flags: linux.Sock, ) !*Sqe { const sqe = try self.get_sqe(); - sqe.prep_multishot_accept(fd, addr, addrlen, flags); + sqe.prep_accept(fd, addr, addrlen, flags); sqe.user_data = user_data; return sqe; } @@ -695,6 +776,28 @@ pub fn accept_direct( return sqe; } +/// Queues an multishot accept on a socket. +/// +/// Multishot variant allows an application to issue a single accept request, +/// which will repeatedly trigger a CQE when a connection request comes in. +/// While IORING_CQE_F_MORE flag is set in CQE flags accept will generate +/// further CQEs. +/// +/// Available since 5.19 +pub fn accept_multishot( + self: *IoUring, + user_data: u64, + fd: posix.fd_t, + addr: ?*posix.sockaddr, + addrlen: ?*posix.socklen_t, + flags: linux.Sock, +) !*Sqe { + const sqe = try self.get_sqe(); + sqe.prep_multishot_accept(fd, addr, addrlen, flags); + sqe.user_data = user_data; + return sqe; +} + /// Queues an multishot accept using direct (registered) file descriptors. /// Available since 5.19 pub fn accept_multishot_direct( @@ -711,6 +814,66 @@ pub fn accept_multishot_direct( return sqe; } +/// Queues (but does not submit) an SQE to remove an existing operation. +/// Returns a pointer to the SQE. +/// +/// The operation is identified by its `user_data`. 
+/// +/// The completion event result will be `0` if the operation was found and +/// cancelled successfully else either of: +/// `-EALREADY` if the operation was found but was already in progress +/// `-ENOENT` if the operation was not found. +pub fn cancel( + self: *IoUring, + user_data: u64, + cancel_user_data: u64, + flags: uflags.AsyncCancel, +) !*Sqe { + const sqe = try self.get_sqe(); + sqe.prep_cancel(cancel_user_data, flags); + sqe.user_data = user_data; + return sqe; +} + +pub fn cancel_fd( + self: *IoUring, + user_data: u64, + fd: linux.fd_t, + flags: uflags.AsyncCancel, +) !*Sqe { + const sqe = try self.get_sqe(); + sqe.prep_cancel_fd(fd, flags); + sqe.user_data = user_data; + return sqe; +} + +/// Queues (but does not submit) an SQE to add a link timeout operation. +/// Returns a pointer to the SQE. +/// +/// You need to set IOSQE_IO_LINK to flags of the target operation and then +/// call this method right after the target operation. +/// See https://lwn.net/Articles/803932/ for detail. +/// +/// If the dependent request finishes before the linked timeout, the timeout +/// is canceled. If the timeout finishes before the dependent request, the +/// dependent request will be canceled. +/// +/// The completion event result of the link_timeout will be either of: +/// `-ETIME` if the timeout finishes before the dependent request (in this case, +/// the completion event result of the dependent request will be `-ECANCELED`) +/// `-EALREADY` if the dependent request finishes before the linked timeout. +pub fn link_timeout( + self: *IoUring, + user_data: u64, + ts: *const linux.kernel_timespec, + flags: uflags.Timeout, +) !*Sqe { + const sqe = try self.get_sqe(); + sqe.prep_link_timeout(ts, flags); + sqe.user_data = user_data; + return sqe; +} + /// Queue (but does not submit) an SQE to perform a `connect(2)` on a socket. /// Returns a pointer to the SQE. pub fn connect( @@ -726,164 +889,180 @@ pub fn connect( return sqe; } -/// Queues (but does not submit) an SQE to perform a `epoll_ctl(2)`. +/// Queues (but does not submit) an SQE to perform an `bind(2)` on a socket. /// Returns a pointer to the SQE. -pub fn epoll_ctl( +/// Available since 6.11 +pub fn bind( self: *IoUring, user_data: u64, - epfd: linux.fd_t, - fd: linux.fd_t, - op: linux.EpollOp, - ev: ?*linux.epoll_event, + fd: posix.fd_t, + addr: *const posix.sockaddr, + addrlen: posix.socklen_t, + // liburing doesn't have this flag, hence 0 should be passed + // TODO: consider removing this and all flags like this + flags: u32, ) !*Sqe { const sqe = try self.get_sqe(); - sqe.prep_epoll_ctl(epfd, fd, op, ev); + sqe.prep_bind(fd, addr, addrlen, flags); sqe.user_data = user_data; return sqe; } -/// Queues (but does not submit) an SQE to perform a `recv(2)`. +/// Queues (but does not submit) an SQE to perform an `listen(2)` on a socket. /// Returns a pointer to the SQE. 
-/// Available since 5.6 -pub fn recv( +/// Available since 6.11 +pub fn listen( self: *IoUring, user_data: u64, - fd: linux.fd_t, - buffer: RecvBuffer, - flags: linux.Msg, + fd: posix.fd_t, + backlog: usize, + // liburing doesn't have this flag, hence 0 should be passed + // TODO: consider removing this and all flags like this + flags: u32, ) !*Sqe { const sqe = try self.get_sqe(); - switch (buffer) { - .buffer => |slice| sqe.prep_recv(fd, slice, flags), - .buffer_selection => |selection| { - sqe.prep_rw(.recv, fd, 0, selection.len, 0); - sqe.rw_flags = @bitCast(flags); - sqe.flags.buffer_select = true; - sqe.buf_index = selection.group_id; - }, - } + sqe.prep_listen(fd, backlog, flags); sqe.user_data = user_data; return sqe; } -/// Queues (but does not submit) an SQE to perform a `send(2)`. +/// Queues (but does not submit) an SQE to perform a `epoll_wait(2)`. /// Returns a pointer to the SQE. -/// Available since 5.6 -pub fn send( +pub fn epoll_wait( self: *IoUring, user_data: u64, - fd: linux.fd_t, - buffer: []const u8, - flags: linux.Msg, + fd: posix.fd_t, + events: ?*linux.epoll_event, + max_events: u32, + flags: linux.Epoll, ) !*Sqe { const sqe = try self.get_sqe(); - sqe.prep_send(fd, buffer, flags); + sqe.prep_epoll_wait(fd, events, max_events, flags); sqe.user_data = user_data; return sqe; } -/// Queues (but does not submit) an SQE to perform an async zerocopy `send(2)`. -/// -/// This operation will most likely produce two CQEs. The flags field of the -/// first cqe may likely contain IORING_CQE_F_MORE, which means that there will -/// be a second cqe with the user_data field set to the same value. The user -/// must not modify the data buffer until the notification is posted. The first -/// cqe follows the usual rules and so its res field will contain the number of -/// bytes sent or a negative error code. The notification's res field will be -/// set to zero and the flags field will contain IORING_CQE_F_NOTIF. The two -/// step model is needed because the kernel may hold on to buffers for a long -/// time, e.g. waiting for a TCP ACK. Notifications responsible for controlling -/// the lifetime of the buffers. Even errored requests may generate a -/// notification. -/// -/// Available since 6.0 -pub fn send_zc( +/// Queues (but does not submit) an SQE to perform a `epoll_ctl(2)`. +/// Returns a pointer to the SQE. +pub fn epoll_ctl( self: *IoUring, user_data: u64, + epfd: linux.fd_t, fd: linux.fd_t, - buffer: []const u8, - send_flags: linux.Msg, - zc_flags: Sqe.SendRecv, + op: linux.EpollOp, + ev: ?*linux.epoll_event, ) !*Sqe { const sqe = try self.get_sqe(); - sqe.prep_send_zc(fd, buffer, send_flags, zc_flags); + sqe.prep_epoll_ctl(epfd, fd, op, ev); sqe.user_data = user_data; return sqe; } -/// Queues (but does not submit) an SQE to perform an async zerocopy `send(2)`. +pub fn files_update( + self: *IoUring, + user_data: u64, + fds: []const linux.fd_t, + offset: u32, +) !*Sqe { + const sqe = try self.get_sqe(); + sqe.prep_files_update(fds, offset); + sqe.user_data = user_data; + return sqe; +} + +/// Queues (but does not submit) an SQE to perform an `fallocate(2)`. /// Returns a pointer to the SQE. 
-/// Available since 6.0 -pub fn send_zc_fixed( +pub fn fallocate( self: *IoUring, user_data: u64, - fd: linux.fd_t, - buffer: []const u8, - send_flags: linux.Msg, - zc_flags: Sqe.SendRecv, - buf_index: u16, + fd: posix.fd_t, + mode: i32, + offset: u64, + len: u64, ) !*Sqe { const sqe = try self.get_sqe(); - sqe.prep_send_zc_fixed(fd, buffer, send_flags, zc_flags, buf_index); + sqe.prep_fallocate(fd, mode, offset, len); sqe.user_data = user_data; return sqe; } -/// Queues (but does not submit) an SQE to perform an async zerocopy `sendmsg(2)`. +/// Queues (but does not submit) an SQE to perform an `openat(2)`. /// Returns a pointer to the SQE. -/// Available since 6.1 -pub fn sendmsg_zc( +/// Available since 5.6. +pub fn openat( self: *IoUring, user_data: u64, - fd: linux.fd_t, - msg: *const linux.msghdr_const, - flags: linux.Msg, + fd: posix.fd_t, + path: [*:0]const u8, + flags: linux.O, + mode: posix.mode_t, +) !*Sqe { + const sqe = try self.get_sqe(); + sqe.prep_openat(fd, path, flags, mode); + sqe.user_data = user_data; + return sqe; +} + +/// Queues an openat using direct (registered) file descriptors. +/// +/// To use an accept direct variant, the application must first have registered +/// a file table (with register_files()). An unused table index will be +/// dynamically chosen and returned in the CQE res field. +/// +/// After creation, they can be used by setting IOSQE_FIXED_FILE in the SQE +/// flags member, and setting the SQE fd field to the direct descriptor value +/// rather than the regular file descriptor. +/// +/// Available since 5.15 +pub fn openat_direct( + self: *IoUring, + user_data: u64, + fd: posix.fd_t, + path: [*:0]const u8, + flags: linux.O, + mode: posix.mode_t, + file_index: u32, ) !*Sqe { const sqe = try self.get_sqe(); - sqe.prep_sendmsg_zc(fd, msg, flags); + sqe.prep_openat_direct(fd, path, flags, mode, file_index); sqe.user_data = user_data; return sqe; } -/// Queues (but does not submit) an SQE to perform an `openat(2)`. +/// Queues (but does not submit) an SQE to perform an `open(2)`. /// Returns a pointer to the SQE. -/// Available since 5.6. -pub fn openat( +pub fn open( self: *IoUring, user_data: u64, - fd: linux.fd_t, path: [*:0]const u8, flags: linux.O, mode: posix.mode_t, ) !*Sqe { const sqe = try self.get_sqe(); - sqe.prep_openat(fd, path, flags, mode); + sqe.prep_openat(linux.At.fdcwd, path, flags, mode); sqe.user_data = user_data; return sqe; } -/// Queues an openat using direct (registered) file descriptors. +/// Queues an open using direct (registered) file descriptors. /// /// To use an accept direct variant, the application must first have registered -/// a file table (with register_files). An unused table index will be +/// a file table (with register_files()). An unused table index will be /// dynamically chosen and returned in the CQE res field. /// /// After creation, they can be used by setting IOSQE_FIXED_FILE in the SQE /// flags member, and setting the SQE fd field to the direct descriptor value /// rather than the regular file descriptor. 
-///
-/// Available since 5.15
-pub fn openat_direct(
+pub fn open_direct(
     self: *IoUring,
     user_data: u64,
-    fd: posix.fd_t,
     path: [*:0]const u8,
     flags: linux.O,
     mode: posix.mode_t,
     file_index: u32,
 ) !*Sqe {
     const sqe = try self.get_sqe();
-    sqe.prep_openat_direct(fd, path, flags, mode, file_index);
+    sqe.prep_openat_direct(linux.At.fdcwd, path, flags, mode, file_index);
     sqe.user_data = user_data;
     return sqe;
 }
@@ -907,160 +1086,153 @@ pub fn close_direct(self: *IoUring, user_data: u64, file_index: u32) !*Sqe {
     return sqe;
 }
 
-/// Queues (but does not submit) an SQE to register a timeout operation.
+/// Queues (but does not submit) an SQE to perform a `statx(2)`.
 /// Returns a pointer to the SQE.
-///
-/// The timeout will complete when either the timeout expires, or after the
-/// specified number of events complete (if `count` is greater than `0`).
-///
-/// `flags` may be `0` for a relative timeout, or `IORING_TIMEOUT_ABS` for an
-/// absolute timeout.
-///
-/// The completion event result will be `-ETIME` if the timeout completed
-/// through expiration, `0` if the timeout completed after the specified number
-/// of events, or `-ECANCELED` if the timeout was removed before it expired.
-///
-/// io_uring timeouts use the `CLOCK.MONOTONIC` clock source.
-pub fn timeout(
+pub fn statx(
     self: *IoUring,
     user_data: u64,
-    ts: *const linux.kernel_timespec,
-    count: u32,
-    flags: uflags.Timeout,
+    fd: posix.fd_t,
+    path: [:0]const u8,
+    flags: linux.At,
+    mask: linux.Statx.Mask,
+    buf: *linux.Statx,
 ) !*Sqe {
     const sqe = try self.get_sqe();
-    sqe.prep_timeout(ts, count, flags);
+    sqe.prep_statx(fd, path, flags, mask, buf);
     sqe.user_data = user_data;
     return sqe;
 }
 
-/// Queues (but does not submit) an SQE to remove an existing timeout operation.
+// COMMIT: don't implement f/madvice64 for now; I doubt it is used by many people in practice
+/// Queues (but does not submit) an SQE to perform a `posix_fadvise(2)`.
 /// Returns a pointer to the SQE.
-///
-/// The timeout is identified by its `user_data`.
-///
-/// The completion event result will be `0` if the timeout was found and
-/// cancelled successfully else:
-/// `-EBUSY` if the timeout was found but expiration was already in progress, or
-/// `-ENOENT` if the timeout was not found.
-pub fn timeout_remove(
+pub fn fadvice(
     self: *IoUring,
     user_data: u64,
-    timeout_user_data: u64,
-    flags: uflags.Timeout,
+    fd: posix.fd_t,
+    offset: u64,
+    len: u32,
+    advice: linux.Fadvice,
 ) !*Sqe {
     const sqe = try self.get_sqe();
-    sqe.prep_timeout_remove(timeout_user_data, flags);
+    sqe.prep_fadvice(fd, offset, len, advice);
     sqe.user_data = user_data;
     return sqe;
 }
 
-/// Queues (but does not submit) an SQE to add a link timeout operation.
+/// Queues (but does not submit) an SQE to perform a `madvise(2)`.
 /// Returns a pointer to the SQE.
-///
-/// You need to set IOSQE_IO_LINK to flags of the target operation and then
-/// call this method right after the target operation.
-/// See https://lwn.net/Articles/803932/ for detail.
-///
-/// If the dependent request finishes before the linked timeout, the timeout
-/// is canceled. If the timeout finishes before the dependent request, the
-/// dependent request will be canceled.
-///
-/// The completion event result of the link_timeout will be either of:
-/// `-ETIME` if the timeout finishes before the dependent request (in this case,
-/// the completion event result of the dependent request will be `-ECANCELED`)
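
The linked-timeout pattern this relocated documentation describes, sketched with a one-second budget (assumes `ring`, `fd`, and `sec`/`nsec` fields on `linux.kernel_timespec`):

    var buf: [256]u8 = undefined;
    var ts: linux.kernel_timespec = .{ .sec = 1, .nsec = 0 };
    const rd = try ring.read(0x10, fd, .{ .buffer = &buf }, 0);
    rd.link_next();
    _ = try ring.link_timeout(0x11, &ts, .{});
    _ = try ring.submit();
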
-pub fn link_timeout(
+pub fn madvice(
     self: *IoUring,
     user_data: u64,
-    ts: *const linux.kernel_timespec,
-    flags: uflags.Timeout,
+    memory: []u8,
+    advice: linux.Madvice,
 ) !*Sqe {
     const sqe = try self.get_sqe();
-    sqe.prep_link_timeout(ts, flags);
+    sqe.prep_madvice(memory, advice);
     sqe.user_data = user_data;
     return sqe;
 }
 
-/// Queues (but does not submit) an SQE to remove an existing poll operation.
+/// Queues (but does not submit) an SQE to perform a `recv(2)`.
 /// Returns a pointer to the SQE.
-pub fn poll_remove(
+/// Available since 5.6
+pub fn recv(
     self: *IoUring,
     user_data: u64,
-    target_user_data: u64,
+    fd: linux.fd_t,
+    buffer: RecvBuffer,
+    flags: linux.Msg,
 ) !*Sqe {
     const sqe = try self.get_sqe();
-    sqe.prep_poll_remove(target_user_data);
+    switch (buffer) {
+        .buffer => |slice| sqe.prep_recv(fd, slice, flags),
+        .buffer_selection => |selection| {
+            sqe.prep_rw(.recv, fd, 0, selection.len, 0);
+            sqe.rw_flags = @bitCast(flags);
+            sqe.flags.buffer_select = true;
+            sqe.buf_index = selection.group_id;
+        },
+    }
     sqe.user_data = user_data;
     return sqe;
 }
 
-/// Queues (but does not submit) an SQE to update the user data of an existing
-/// poll operation. Returns a pointer to the SQE.
-pub fn poll_update(
+/// Queues (but does not submit) an SQE to perform a `send(2)`.
+/// Returns a pointer to the SQE.
+/// Available since 5.6
+pub fn send(
     self: *IoUring,
     user_data: u64,
-    old_user_data: u64,
-    new_user_data: u64,
-    poll_mask: linux.Epoll,
-    flags: uflags.Poll,
+    fd: linux.fd_t,
+    buffer: []const u8,
+    flags: linux.Msg,
 ) !*Sqe {
     const sqe = try self.get_sqe();
-    sqe.prep_poll_update(old_user_data, new_user_data, poll_mask, flags);
+    sqe.prep_send(fd, buffer, flags);
     sqe.user_data = user_data;
     return sqe;
 }
 
-/// Queues (but does not submit) an SQE to perform an `fallocate(2)`.
-/// Returns a pointer to the SQE.
-pub fn fallocate(
+/// Queues (but does not submit) an SQE to perform an async zerocopy `send(2)`.
+///
+/// This operation will most likely produce two CQEs. The flags field of the
+/// first cqe will likely contain IORING_CQE_F_MORE, which means that there will
+/// be a second cqe with the user_data field set to the same value. The user
+/// must not modify the data buffer until the notification is posted. The first
+/// cqe follows the usual rules and so its res field will contain the number of
+/// bytes sent or a negative error code. The notification's res field will be
+/// set to zero and the flags field will contain IORING_CQE_F_NOTIF. The
+/// two-step model is needed because the kernel may hold on to buffers for a
+/// long time, e.g. waiting for a TCP ACK. Notifications are responsible for
+/// controlling the lifetime of the buffers. Even errored requests may
+/// generate a notification.
+///
+/// Available since 6.0
+pub fn send_zc(
     self: *IoUring,
     user_data: u64,
     fd: linux.fd_t,
-    mode: i32,
-    offset: u64,
-    len: u64,
+    buffer: []const u8,
+    send_flags: linux.Msg,
+    zc_flags: Sqe.SendRecv,
 ) !*Sqe {
     const sqe = try self.get_sqe();
-    sqe.prep_fallocate(fd, mode, offset, len);
+    sqe.prep_send_zc(fd, buffer, send_flags, zc_flags);
     sqe.user_data = user_data;
     return sqe;
 }
 
-/// Queues (but does not submit) an SQE to perform an `statx(2)`.
+/// Queues (but does not submit) an SQE to perform an async zerocopy `send(2)`.
 /// Returns a pointer to the SQE.
-pub fn statx( +/// Available since 6.0 +pub fn send_zc_fixed( self: *IoUring, user_data: u64, fd: linux.fd_t, - path: [:0]const u8, - flags: linux.At, - mask: linux.Statx.Mask, - buf: *linux.Statx, + buffer: []const u8, + send_flags: linux.Msg, + zc_flags: Sqe.SendRecv, + buf_index: u16, ) !*Sqe { const sqe = try self.get_sqe(); - sqe.prep_statx(fd, path, flags, mask, buf); + sqe.prep_send_zc_fixed(fd, buffer, send_flags, zc_flags, buf_index); sqe.user_data = user_data; return sqe; } -/// Queues (but does not submit) an SQE to remove an existing operation. +/// Queues (but does not submit) an SQE to perform an async zerocopy `sendmsg(2)`. /// Returns a pointer to the SQE. -/// -/// The operation is identified by its `user_data`. -/// -/// The completion event result will be `0` if the operation was found and -/// cancelled successfully else either of: -/// `-EALREADY` if the operation was found but was already in progress -/// `-ENOENT` if the operation was not found. -pub fn cancel( +/// Available since 6.1 +pub fn sendmsg_zc( self: *IoUring, user_data: u64, - cancel_user_data: u64, - flags: uflags.AsyncCancel, + fd: linux.fd_t, + msg: *const linux.msghdr_const, + flags: linux.Msg, ) !*Sqe { const sqe = try self.get_sqe(); - sqe.prep_cancel(cancel_user_data, flags); + sqe.prep_sendmsg_zc(fd, msg, flags); sqe.user_data = user_data; return sqe; } @@ -1562,43 +1734,6 @@ pub fn socket_direct_alloc( return sqe; } -/// Queues (but does not submit) an SQE to perform an `bind(2)` on a socket. -/// Returns a pointer to the SQE. -/// Available since 6.11 -pub fn bind( - self: *IoUring, - user_data: u64, - fd: linux.fd_t, - addr: *const posix.sockaddr, - addrlen: posix.socklen_t, - // liburing doesn't have this flag, hence 0 should be passed - // TODO: consider removing this and all flags like this - flags: u32, -) !*Sqe { - const sqe = try self.get_sqe(); - sqe.prep_bind(fd, addr, addrlen, flags); - sqe.user_data = user_data; - return sqe; -} - -/// Queues (but does not submit) an SQE to perform an `listen(2)` on a socket. -/// Returns a pointer to the SQE. -/// Available since 6.11 -pub fn listen( - self: *IoUring, - user_data: u64, - fd: linux.fd_t, - backlog: usize, - // liburing doesn't have this flag, hence 0 should be passed - // TODO: consider removing this and all flags like this - flags: u32, -) !*Sqe { - const sqe = try self.get_sqe(); - sqe.prep_listen(fd, backlog, flags); - sqe.user_data = user_data; - return sqe; -} - /// Prepares an cmd request for a socket. /// See: https://man7.org/linux/man-pages/man3/io_uring_prep_cmd.3.html /// Available since 6.7. 
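For reviewers, a minimal sketch of consuming the two-CQE zerocopy model documented
on `send_zc` above. It assumes the typed `Cqe.Flags` bits named `more` and `notif`
that this series introduces, plus the existing `copy_cqe()` helper; the exact field
names are illustrative, not final:

    // Hypothetical consumer: drain both completions of a send_zc request
    // before reusing `buf`.
    _ = try ring.send_zc(42, sockfd, buf, .{}, .{});
    _ = try ring.submit();
    var pending = true;
    while (pending) {
        const cqe = try ring.copy_cqe();
        if (cqe.flags.notif) {
            pending = false; // notification posted; `buf` may be reused
        } else if (!cqe.flags.more) {
            pending = false; // no notification will follow (e.g. an early error)
        }
    }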
@@ -1832,10 +1967,11 @@ pub const Sqe = extern struct { len: u32, /// flags for any Sqe operation /// rw_flags | fsync_flags | poll_event | poll32_event | sync_range_flags | - /// msg_flags timeout_flags | accept_flags | cancel_flags | open_flags | - /// statx_flags fadvise_advice | splice_flags | rename_flags | unlink_flags - /// | hardlink_flags xattr_flags | msg_ring_flags | uring_cmd_flags | - /// waitid_flags | futex_flags install_fd_flags | nop_flags | pipe_flags + /// msg_flags | timeout_flags | accept_flags | cancel_flags | open_flags | + /// statx_flags | fadvise_advice | splice_flags | rename_flags | + /// unlink_flags | hardlink_flags xattr_flags | msg_ring_flags | + /// uring_cmd_flags | waitid_flags | futex_flags install_fd_flags | + /// nop_flags | pipe_flags rw_flags: u32, /// data to be passed back at completion time user_data: u64, @@ -2117,6 +2253,17 @@ pub const Sqe = extern struct { sqe.prep_rw(.connect, fd, @intFromPtr(addr), 0, addrlen); } + pub fn prep_epoll_wait( + sqe: *Sqe, + fd: linux.fd_t, + event: ?*linux.epoll_event, + max_events: u32, + flags: linux.Epoll, + ) void { + sqe.prep_rw(.epoll_wait, fd, @intFromPtr(event), max_events, 0); + sqe.rw_flags = @bitCast(flags); + } + pub fn prep_epoll_ctl( sqe: *Sqe, epfd: linux.fd_t, @@ -2262,22 +2409,18 @@ pub const Sqe = extern struct { } pub fn prep_timeout_remove(sqe: *Sqe, timeout_user_data: u64, flags: uflags.Timeout) void { - sqe.* = .{ - .opcode = .timeout_remove, - .flags = .{}, - .ioprio = .init_empty(), - .fd = -1, - .off = 0, - .addr = timeout_user_data, - .len = 0, - .rw_flags = @bitCast(flags), - .user_data = 0, - .buf_index = 0, - .personality = 0, - .splice_fd_in = 0, - .addr3 = 0, - .resv = 0, + sqe.prep_rw(.timeout_remove, -1, timeout_user_data, 0, 0); + sqe.rw_flags = @bitCast(flags); + } + + pub fn prep_timeout_update(sqe: *Sqe, timeout_user_data: u64, ts: *const linux.kernel_timespec, flags: uflags.Timeout) void { + sqe.prep_rw(.timeout_remove, -1, timeout_user_data, 0, @intFromPtr(ts)); + const enable_timeout_update = if (flags.timeout_update) flags else blk: { + var tflags = flags; + tflags.timeout_update = true; + break :blk tflags; }; + sqe.rw_flags = @bitCast(enable_timeout_update); } pub fn prep_link_timeout( @@ -2365,6 +2508,26 @@ pub const Sqe = extern struct { sqe.rw_flags = @bitCast(flags); } + pub fn prep_fadvice( + sqe: *Sqe, + fd: linux.fd_t, + offset: u64, + len: u32, + advice: linux.Fadvice, + ) void { + sqe.prep_rw(.fadvise, fd, undefined, len, offset); + sqe.rw_flags = @intFromEnum(advice); + } + + pub fn prep_madvice( + sqe: *Sqe, + memory: []u8, + advice: linux.Madvice, + ) void { + sqe.prep_rw(.madvise, -1, @intFromPtr(memory.ptr), memory.len, 0); + sqe.rw_flags = @intFromEnum(advice); + } + pub fn prep_cancel( sqe: *Sqe, cancel_user_data: u64, @@ -2379,11 +2542,11 @@ pub const Sqe = extern struct { fd: linux.fd_t, flags: uflags.AsyncCancel, ) void { - sqe.prep_rw(.async_cancel, fd, 0, 0, 0); - const enable_cancel_fd = blk: { - var update_flags = flags; - update_flags.cancel_fd = true; - break :blk update_flags; + sqe.prep_rw(.async_cancel, fd, undefined, 0, 0); + const enable_cancel_fd = if (flags.cancel_fd) flags else blk: { + var cancel_flags = flags; + cancel_flags.cancel_fd = true; + break :blk cancel_flags; }; sqe.rw_flags = @bitCast(enable_cancel_fd); } @@ -2814,6 +2977,23 @@ pub const BufferGroup = struct { allocator.free(self.heads); } + /// Prepare multishot read operation which will select buffer from this + /// group. 
+ pub fn read_multishot( + self: *BufferGroup, + user_data: u64, + fd: posix.fd_t, + nbytes: u32, + offset: u64, + ) !*Sqe { + var sqe = try self.ring.get_sqe(); + sqe.prep_rw(.read_multishot, fd, undefined, nbytes, offset); + sqe.flags.buffer_select = true; + sqe.buf_index = self.group_id; + sqe.user_data = user_data; + return sqe; + } + /// Prepare recv operation which will select buffer from this group. pub fn recv( self: *BufferGroup, From e856ced2b4c1bf41a1df7b961965441542d07b83 Mon Sep 17 00:00:00 2001 From: Bernard Assan Date: Wed, 15 Oct 2025 22:44:52 +0000 Subject: [PATCH 26/41] add IoUring send_bundle, send_to, recv_multishot, sync_file_range ignore prep_openat2* for now add non-at variant of some unlink, rename, mkdir, symlink, link TODO: add Sync File Flags Signed-off-by: Bernard Assan --- lib/std/os/linux.zig | 6 + lib/std/os/linux/IoUring.zig | 336 +++++++++++++++++++++++++++-------- 2 files changed, 268 insertions(+), 74 deletions(-) diff --git a/lib/std/os/linux.zig b/lib/std/os/linux.zig index 10bec0cb67eb..e40b30f41e4d 100644 --- a/lib/std/os/linux.zig +++ b/lib/std/os/linux.zig @@ -3988,6 +3988,12 @@ pub const Shut = enum(u32) { pub const RDWR: u32 = @intFromEnum(Shut.rdwr); }; +/// SYNC_FILE_RANGE_* flags +pub const SyncFileRange = packed struct(u32) { + _: u32 = 0, // TODO: fill out +}; + +/// Deprecated alias to Sock pub const SOCK = Sock; /// SOCK_* Socket type and flags pub const Sock = packed struct(u32) { diff --git a/lib/std/os/linux/IoUring.zig b/lib/std/os/linux/IoUring.zig index b37e2d26d80d..f1f9afff7388 100644 --- a/lib/std/os/linux/IoUring.zig +++ b/lib/std/os/linux/IoUring.zig @@ -737,6 +737,8 @@ pub fn timeout_update( /// Queues (but does not submit) an SQE to perform an `accept4(2)` on a socket. /// Returns a pointer to the SQE. /// Available since 5.5 +// TODO: can't we make the sockaddr and socklen_t combo in our api better? +// Investigate this pub fn accept( self: *IoUring, user_data: u64, @@ -1002,6 +1004,7 @@ pub fn openat( sqe.user_data = user_data; return sqe; } +// COMMIT: ignore openat2* for now /// Queues an openat using direct (registered) file descriptors. /// @@ -1134,42 +1137,50 @@ pub fn madvice( return sqe; } -/// Queues (but does not submit) an SQE to perform a `recv(2)`. +/// Queues (but does not submit) an SQE to perform a `send(2)`. /// Returns a pointer to the SQE. /// Available since 5.6 -pub fn recv( +pub fn send( self: *IoUring, user_data: u64, - fd: linux.fd_t, - buffer: RecvBuffer, + sockfd: posix.fd_t, + buffer: []const u8, flags: linux.Msg, ) !*Sqe { const sqe = try self.get_sqe(); - switch (buffer) { - .buffer => |slice| sqe.prep_recv(fd, slice, flags), - .buffer_selection => |selection| { - sqe.prep_rw(.recv, fd, 0, selection.len, 0); - sqe.rw_flags = @bitCast(flags); - sqe.flags.buffer_select = true; - sqe.buf_index = selection.group_id; - }, - } + sqe.prep_send(sockfd, buffer, flags); sqe.user_data = user_data; return sqe; } -/// Queues (but does not submit) an SQE to perform a `send(2)`. +/// Queues (but does not submit) an SQE to perform a bundled `send(2)`. /// Returns a pointer to the SQE. -/// Available since 5.6 -pub fn send( +pub fn send_bundle( self: *IoUring, user_data: u64, - fd: linux.fd_t, + sockfd: posix.fd_t, + len: u64, + flags: linux.Msg, +) !*Sqe { + const sqe = try self.get_sqe(); + sqe.prep_send_bundle(sockfd, len, flags); + sqe.user_data = user_data; + return sqe; +} + +/// Queues (but does not submit) an SQE to perform a bundled `sendto(2)`. +/// Returns a pointer to the SQE. 
+pub fn send_to( + self: *IoUring, + user_data: u64, + sockfd: posix.fd_t, buffer: []const u8, flags: linux.Msg, + addr: *const linux.sockaddr, + addrlen: linux.socklen_t, ) !*Sqe { const sqe = try self.get_sqe(); - sqe.prep_send(fd, buffer, flags); + sqe.prep_send_to(sockfd, buffer, flags, addr, addrlen); sqe.user_data = user_data; return sqe; } @@ -1192,13 +1203,13 @@ pub fn send( pub fn send_zc( self: *IoUring, user_data: u64, - fd: linux.fd_t, + sockfd: posix.fd_t, buffer: []const u8, send_flags: linux.Msg, zc_flags: Sqe.SendRecv, ) !*Sqe { const sqe = try self.get_sqe(); - sqe.prep_send_zc(fd, buffer, send_flags, zc_flags); + sqe.prep_send_zc(sockfd, buffer, send_flags, zc_flags); sqe.user_data = user_data; return sqe; } @@ -1209,14 +1220,14 @@ pub fn send_zc( pub fn send_zc_fixed( self: *IoUring, user_data: u64, - fd: linux.fd_t, + sockfd: posix.fd_t, buffer: []const u8, send_flags: linux.Msg, zc_flags: Sqe.SendRecv, buf_index: u16, ) !*Sqe { const sqe = try self.get_sqe(); - sqe.prep_send_zc_fixed(fd, buffer, send_flags, zc_flags, buf_index); + sqe.prep_send_zc_fixed(sockfd, buffer, send_flags, zc_flags, buf_index); sqe.user_data = user_data; return sqe; } @@ -1227,8 +1238,8 @@ pub fn send_zc_fixed( pub fn sendmsg_zc( self: *IoUring, user_data: u64, - fd: linux.fd_t, - msg: *const linux.msghdr_const, + fd: posix.fd_t, + msg: *const posix.msghdr_const, flags: linux.Msg, ) !*Sqe { const sqe = try self.get_sqe(); @@ -1237,6 +1248,106 @@ pub fn sendmsg_zc( return sqe; } +/// Queues (but does not submit) an SQE to perform an fixed async zerocopy +/// `sendmsg(2)`. Returns a pointer to the SQE. +pub fn sendmsg_zc_fixed( + self: *IoUring, + user_data: u64, + fd: posix.fd_t, + msg: *const posix.msghdr_const, + flags: linux.Msg, + buf_index: u16, +) !*Sqe { + const sqe = try self.get_sqe(); + sqe.prep_sendmsg_zc_fixed(fd, msg, flags, buf_index); + sqe.user_data = user_data; + return sqe; +} + +/// Queues (but does not submit) an SQE to perform a `recv(2)`. +/// Returns a pointer to the SQE. +/// Available since 5.6 +pub fn recv( + self: *IoUring, + user_data: u64, + fd: linux.fd_t, + buffer: RecvBuffer, + flags: linux.Msg, +) !*Sqe { + const sqe = try self.get_sqe(); + switch (buffer) { + .buffer => |slice| sqe.prep_recv(fd, slice, flags), + .buffer_selection => |selection| { + sqe.prep_rw(.recv, fd, 0, selection.len, 0); + sqe.rw_flags = @bitCast(flags); + sqe.flags.buffer_select = true; + sqe.buf_index = selection.group_id; + }, + } + sqe.user_data = user_data; + return sqe; +} + +pub fn recv_multishot( + self: *IoUring, + user_data: u64, + sockfd: linux.fd_t, + buffer: RecvBuffer, + flags: linux.Msg, +) !*Sqe { + const sqe = try self.get_sqe(); + switch (buffer) { + .buffer => |slice| sqe.prep_recv_multishot(sockfd, slice, flags), + .buffer_selection => |selection| { + sqe.prep_rw(.recv, sockfd, 0, selection.len, 0); + sqe.ioprio = .{ .send_recv = .{ .recv_multishot = true } }; + sqe.rw_flags = @bitCast(flags); + sqe.flags.buffer_select = true; + sqe.buf_index = selection.group_id; + }, + } + sqe.user_data = user_data; + return sqe; +} + +/// Queues (but does not submit) an SQE to provide a group of buffers used for +/// commands that read/receive data. Returns a pointer to the SQE. +/// +/// Provided buffers can be used in `read`, `recv` or `recvmsg` commands via +/// buffer_selection. +/// +/// The kernel expects a contiguous block of memory of size (buffers_count * +/// buffer_size). 
+// TODO: why not use a slice with `buffers_count` +pub fn provide_buffers( + self: *IoUring, + user_data: u64, + buffers: [*]u8, + buffer_size: usize, + buffers_count: usize, + group_id: usize, + buffer_id: usize, +) !*Sqe { + const sqe = try self.get_sqe(); + sqe.prep_provide_buffers(buffers, buffer_size, buffers_count, group_id, buffer_id); + sqe.user_data = user_data; + return sqe; +} + +/// Queues (but does not submit) an SQE to remove a group of provided buffers. +/// Returns a pointer to the SQE. +pub fn remove_buffers( + self: *IoUring, + user_data: u64, + buffers_count: usize, + group_id: usize, +) !*Sqe { + const sqe = try self.get_sqe(); + sqe.prep_remove_buffers(buffers_count, group_id); + sqe.user_data = user_data; + return sqe; +} + /// Queues (but does not submit) an SQE to perform a `shutdown(2)`. /// Returns a pointer to the SQE. /// @@ -1253,14 +1364,40 @@ pub fn shutdown( return sqe; } +/// Queues (but does not submit) an SQE to perform a `unlinkat(2)`. +/// Returns a pointer to the SQE. +pub fn unlinkat( + self: *IoUring, + user_data: u64, + dir_fd: posix.fd_t, + path: [*:0]const u8, + flags: linux.At, +) !*Sqe { + const sqe = try self.get_sqe(); + sqe.prep_unlinkat(dir_fd, path, flags); + sqe.user_data = user_data; + return sqe; +} + +/// Queues (but does not submit) an SQE to perform a `unlink(2)`. +/// Returns a pointer to the SQE. +pub fn unlink( + self: *IoUring, + user_data: u64, + path: [*:0]const u8, + flags: linux.At, +) !*Sqe { + return try self.unlinkat(user_data, linux.At.fdcwd, path, flags); +} + /// Queues (but does not submit) an SQE to perform a `renameat2(2)`. /// Returns a pointer to the SQE. pub fn renameat( self: *IoUring, user_data: u64, - old_dir_fd: linux.fd_t, + old_dir_fd: posix.fd_t, old_path: [*:0]const u8, - new_dir_fd: linux.fd_t, + new_dir_fd: posix.fd_t, new_path: [*:0]const u8, flags: linux.Rename, ) !*Sqe { @@ -1270,17 +1407,30 @@ pub fn renameat( return sqe; } -/// Queues (but does not submit) an SQE to perform a `unlinkat(2)`. +/// Queues (but does not submit) an SQE to perform a `rename(2)`. /// Returns a pointer to the SQE. -pub fn unlinkat( +pub fn rename( self: *IoUring, user_data: u64, - dir_fd: linux.fd_t, - path: [*:0]const u8, - flags: linux.At, + old_path: [*:0]const u8, + new_path: [*:0]const u8, + flags: linux.Rename, +) !*Sqe { + return try self.renameat(user_data, linux.At.fdcwd, old_path, linux.At.fdcwd, new_path, flags); +} + +/// Queues (but does not submit) an SQE to perform a `sync_file_range(2)`. +/// Returns a pointer to the SQE. +pub fn sync_file_range( + self: *IoUring, + user_data: u64, + fd: posix.fd_t, + len: u32, + offset: u64, + flags: linux.SyncFileRange, // TODO: add flags ) !*Sqe { const sqe = try self.get_sqe(); - sqe.prep_unlinkat(dir_fd, path, flags); + sqe.prep_sync_file_range(fd, len, offset, flags); sqe.user_data = user_data; return sqe; } @@ -1290,7 +1440,7 @@ pub fn unlinkat( pub fn mkdirat( self: *IoUring, user_data: u64, - dir_fd: linux.fd_t, + dir_fd: posix.fd_t, path: [*:0]const u8, mode: posix.mode_t, ) !*Sqe { @@ -1300,13 +1450,24 @@ pub fn mkdirat( return sqe; } +/// Queues (but does not submit) an SQE to perform a `mkdir(2)`. +/// Returns a pointer to the SQE. +pub fn mkdir( + self: *IoUring, + user_data: u64, + path: [*:0]const u8, + mode: posix.mode_t, +) !*Sqe { + return try self.mkdirat(user_data, linux.At.fdcwd, path, mode); +} + /// Queues (but does not submit) an SQE to perform a `symlinkat(2)`. /// Returns a pointer to the SQE. 
pub fn symlinkat( self: *IoUring, user_data: u64, target: [*:0]const u8, - new_dir_fd: linux.fd_t, + new_dir_fd: posix.fd_t, link_path: [*:0]const u8, ) !*Sqe { const sqe = try self.get_sqe(); @@ -1315,14 +1476,25 @@ pub fn symlinkat( return sqe; } +/// Queues (but does not submit) an SQE to perform a `symlink(2)`. +/// Returns a pointer to the SQE. +pub fn symlink( + self: *IoUring, + user_data: u64, + target: [*:0]const u8, + link_path: [*:0]const u8, +) !*Sqe { + return try self.symlinkat(user_data, target, linux.At.fdcwd, link_path); +} + /// Queues (but does not submit) an SQE to perform a `linkat(2)`. /// Returns a pointer to the SQE. pub fn linkat( self: *IoUring, user_data: u64, - old_dir_fd: linux.fd_t, + old_dir_fd: posix.fd_t, old_path: [*:0]const u8, - new_dir_fd: linux.fd_t, + new_dir_fd: posix.fd_t, new_path: [*:0]const u8, flags: linux.At, ) !*Sqe { @@ -1332,42 +1504,16 @@ pub fn linkat( return sqe; } -/// Queues (but does not submit) an SQE to provide a group of buffers used for -/// commands that read/receive data. Returns a pointer to the SQE. -/// -/// Provided buffers can be used in `read`, `recv` or `recvmsg` commands via -/// buffer_selection. -/// -/// The kernel expects a contiguous block of memory of size (buffers_count * -/// buffer_size). -// TODO: why not use a slice with `buffers_count` -pub fn provide_buffers( - self: *IoUring, - user_data: u64, - buffers: [*]u8, - buffer_size: usize, - buffers_count: usize, - group_id: usize, - buffer_id: usize, -) !*Sqe { - const sqe = try self.get_sqe(); - sqe.prep_provide_buffers(buffers, buffer_size, buffers_count, group_id, buffer_id); - sqe.user_data = user_data; - return sqe; -} - -/// Queues (but does not submit) an SQE to remove a group of provided buffers. +/// Queues (but does not submit) an SQE to perform a `link(2)`. /// Returns a pointer to the SQE. -pub fn remove_buffers( +pub fn link( self: *IoUring, user_data: u64, - buffers_count: usize, - group_id: usize, + old_path: [*:0]const u8, + new_path: [*:0]const u8, + flags: linux.At, ) !*Sqe { - const sqe = try self.get_sqe(); - sqe.prep_remove_buffers(buffers_count, group_id); - sqe.user_data = user_data; - return sqe; + return try self.linkat(user_data, linux.At.fdcwd, old_path, linux.At.fdcwd, new_path, flags); } /// Queues (but does not submit) an SQE to perform a `waitid(2)`. 
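The non-`at` wrappers above are thin shims that forward to their `*at`
counterparts with `linux.At.fdcwd`, so CWD-relative call sites drop the dirfd
argument entirely. A usage sketch, assuming the typed `linux.At` and
`linux.Rename` flag structs default to empty (error handling elided):

    // Queue a mkdir, a rename, and an unlink relative to the CWD, then
    // submit all three with a single enter.
    _ = try ring.mkdir(1, "scratch", 0o755);
    _ = try ring.rename(2, "result.tmp", "result.bin", .{});
    _ = try ring.unlink(3, "stale.lock", .{});
    _ = try ring.submit();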
@@ -2310,9 +2456,29 @@ pub const Sqe = extern struct {
     }
 
     // COMMIT: fix send[|recv] flag param type
-    pub fn prep_send(sqe: *Sqe, fd: linux.fd_t, buffer: []const u8, flags: linux.Msg) void {
-        sqe.prep_rw(.send, fd, @intFromPtr(buffer.ptr), buffer.len, 0);
+    pub fn prep_send(sqe: *Sqe, sockfd: linux.fd_t, buffer: []const u8, flags: linux.Msg) void {
+        sqe.prep_rw(.send, sockfd, @intFromPtr(buffer.ptr), buffer.len, 0);
+        sqe.rw_flags = @bitCast(flags);
+    }
+
+    pub fn prep_send_bundle(sqe: *Sqe, sockfd: linux.fd_t, len: u64, flags: linux.Msg) void {
+        sqe.prep_rw(.send, sockfd, undefined, len, 0);
         sqe.rw_flags = @bitCast(flags);
+        sqe.ioprio = .{ .send_recv = .{ .recvsend_bundle = true } };
+    }
+
+    pub fn prep_send_to(
+        sqe: *Sqe,
+        sockfd: linux.fd_t,
+        buffer: []const u8,
+        flags: linux.Msg,
+        addr: *const linux.sockaddr,
+        addrlen: linux.socklen_t,
+    ) void {
+        // addr2 maps to sqe.off and addr_len maps to sqe.splice_fd_in
+        sqe.prep_send(sockfd, buffer, flags);
+        sqe.off = @intFromPtr(addr);
+        sqe.splice_fd_in = @intCast(addrlen);
     }
 
     pub fn prep_send_zc(sqe: *Sqe, fd: linux.fd_t, buffer: []const u8, flags: linux.Msg, zc_flags: Sqe.SendRecv) void {
@@ -2322,12 +2488,12 @@ pub const Sqe = extern struct {
     }
 
     pub fn prep_send_zc_fixed(sqe: *Sqe, fd: linux.fd_t, buffer: []const u8, flags: linux.Msg, zc_flags: Sqe.SendRecv, buf_index: u16) void {
-        const zc_flags_fixed = blk: {
+        const zc_flags_fixed = if (zc_flags.recvsend_fixed_buf) zc_flags else blk: {
             var updated_flags = zc_flags;
             updated_flags.recvsend_fixed_buf = true;
             break :blk updated_flags;
         };
-        prep_send_zc(sqe, fd, buffer, flags, zc_flags_fixed);
+        sqe.prep_send_zc(fd, buffer, flags, zc_flags_fixed);
         sqe.buf_index = buf_index;
     }
 
@@ -2347,10 +2513,21 @@ pub const Sqe = extern struct {
         msg: *const linux.msghdr_const,
         flags: linux.Msg,
     ) void {
-        prep_sendmsg(sqe, fd, msg, flags);
+        sqe.prep_sendmsg(fd, msg, flags);
         sqe.opcode = .sendmsg_zc;
     }
 
+    pub fn prep_sendmsg_zc_fixed(
+        sqe: *Sqe,
+        fd: linux.fd_t,
+        msg: *const linux.msghdr_const,
+        flags: linux.Msg,
+        buf_index: u16,
+    ) void {
+        sqe.prep_sendmsg_zc(fd, msg, flags);
+        sqe.ioprio = .{ .send_recv = .{ .recvsend_fixed_buf = true } };
+        sqe.buf_index = buf_index;
+    }
+
     pub fn prep_openat(
         sqe: *Sqe,
         fd: linux.fd_t,
@@ -2588,6 +2765,17 @@ pub const Sqe = extern struct {
         sqe.rw_flags = @bitCast(flags);
     }
 
+    pub fn prep_sync_file_range(
+        sqe: *Sqe,
+        fd: posix.fd_t,
+        len: u32,
+        offset: u64,
+        flags: linux.SyncFileRange, // TODO: add flags
+    ) void {
+        sqe.prep_rw(.sync_file_range, fd, undefined, len, offset);
+        sqe.rw_flags = @bitCast(flags);
+    }
+
     pub fn prep_mkdirat(
         sqe: *Sqe,
         dir_fd: linux.fd_t,
@@ -3463,7 +3651,7 @@ pub const FileIndexRange = extern struct {
 };
 
 /// matches `io_uring_recvmsg_out` in liburing
-pub const RecvmsgOut = extern struct {
+pub const RecvMsgOut = extern struct {
     namelen: u32,
     controllen: u32,
     payloadlen: u32,

From 4f05fd44a9a19dde471787e8f5e172c7cc7d1bb1 Mon Sep 17 00:00:00 2001
From: Bernard Assan
Date: Fri, 17 Oct 2025 00:05:53 +0000
Subject: [PATCH 27/41] add msg_ring_*, setxattr and getxattr IoUring
 operations

TODO: add SetXattr flags

Signed-off-by: Bernard Assan
---
 lib/std/os/linux.zig         |   3 +
 lib/std/os/linux/IoUring.zig | 241 +++++++++++++++++++++++++++++++++++
 2 files changed, 244 insertions(+)

diff --git a/lib/std/os/linux.zig b/lib/std/os/linux.zig
index e40b30f41e4d..ff7f77804dc2 100644
--- a/lib/std/os/linux.zig
+++ b/lib/std/os/linux.zig
@@ -6811,6 +6811,9 @@ pub const Rename = packed struct(u32) {
     _: u29 = 0,
 };
 
+pub const SetXattr = packed struct(u32) {
+    _: u32 = 0, // TODO: add flags
+};
 pub const statx_timestamp = extern struct {
     sec: i64,
     nsec: u32,
diff --git a/lib/std/os/linux/IoUring.zig b/lib/std/os/linux/IoUring.zig
index f1f9afff7388..250152896634 100644
--- a/lib/std/os/linux/IoUring.zig
+++ b/lib/std/os/linux/IoUring.zig
@@ -1516,6 +1516,137 @@ pub fn link(
     return sqe;
 }
 
+/// Queues (but does not submit) an SQE to send a CQE to an io_uring file
+/// descriptor. The use case for this can be anything from simply waking up
+/// someone waiting on the targeted ring to passing messages between the two
+/// rings.
+/// Returns a pointer to the SQE.
+pub fn msg_ring(
+    self: *IoUring,
+    user_data: u64,
+    fd: linux.fd_t,
+    len: u32,
+    data: u64,
+    flags: uflags.MsgRing,
+) !*Sqe {
+    const sqe = try self.get_sqe();
+    sqe.prep_msg_ring(fd, len, data, flags);
+    sqe.user_data = user_data;
+    return sqe;
+}
+
+/// Queues (but does not submit) an SQE to send a CQE to an io_uring file
+/// descriptor. See `msg_ring`.
+/// This has an additional `cqe_flags` parameter that allows you to set the
+/// CQE flags field cqe.flags when sending a message.
+/// Returns a pointer to the SQE.
+pub fn msg_ring_cqe_flags(
+    self: *IoUring,
+    user_data: u64,
+    fd: linux.fd_t,
+    len: u32,
+    data: u64,
+    msg_flags: uflags.MsgRing,
+    cqe_flags: Cqe.Flags,
+) !*Sqe {
+    const sqe = try self.get_sqe();
+    sqe.prep_msg_ring_cqe_flags(
+        fd,
+        len,
+        data,
+        msg_flags,
+        cqe_flags,
+    );
+    sqe.user_data = user_data;
+    return sqe;
+}
+
+/// Queues (but does not submit) an SQE to send a direct file descriptor to
+/// another ring.
+/// Returns a pointer to the SQE.
+pub fn msg_ring_fd(
+    self: *IoUring,
+    user_data: u64,
+    fd: linux.fd_t,
+    source_fd: linux.fd_t,
+    target_fd: linux.fd_t,
+    data: u64,
+    flags: uflags.MsgRing,
+) !*Sqe {
+    const sqe = try self.get_sqe();
+    sqe.prep_msg_ring_fd(
+        fd,
+        source_fd,
+        target_fd,
+        data,
+        flags,
+    );
+    sqe.user_data = user_data;
+    return sqe;
+}
+
+/// Queues (but does not submit) an SQE to send a direct file descriptor to
+/// another ring. See `msg_ring_fd()`.
+/// `msg_ring_fd_alloc()` is similar to `msg_ring_fd()`, but doesn't specify a
+/// target_fd for the descriptor. Instead, this target_fd is allocated in the
+/// target ring and returned in the CQE res field.
+/// Returns a pointer to the SQE.
+pub fn msg_ring_fd_alloc(
+    self: *IoUring,
+    user_data: u64,
+    fd: linux.fd_t,
+    source_fd: linux.fd_t,
+    data: u64,
+    flags: uflags.MsgRing,
+) !*Sqe {
+    const sqe = try self.get_sqe();
+    sqe.prep_msg_ring_fd_alloc(
+        fd,
+        source_fd,
+        data,
+        flags,
+    );
+    sqe.user_data = user_data;
+    return sqe;
+}
+
+/// Queues (but does not submit) an SQE to prepares a request to get an
+/// extended attribute value
+/// Returns a pointer to the SQE.
+pub fn getxattr(
+    self: *IoUring,
+    user_data: u64,
+    name: []const u8,
+    value: []const u8,
+    path: []const u8,
+    len: u32,
+) !*Sqe {
+    const sqe = try self.get_sqe();
+    sqe.prep_getxattr(name, value, path, len);
+    sqe.user_data = user_data;
+    return sqe;
+}
+
+/// Queues (but does not submit) an SQE to prepares a request to set an
+/// extended attribute value
+/// Returns a pointer to the SQE.
+pub fn setxattr(
+    self: *IoUring,
+    user_data: u64,
+    name: []const u8,
+    value: []const u8,
+    path: []const u8,
+    flags: linux.SetXattr,
+    len: u32,
+) !*Sqe {
+    const sqe = try self.get_sqe();
+    sqe.prep_setxattr(name, value, path, flags, len);
+    sqe.user_data = user_data;
+    return sqe;
+}
+
 /// Queues (but does not submit) an SQE to perform a `waitid(2)`.
 /// Returns a pointer to the SQE.
 pub fn waitid(
@@ -2819,6 +2950,116 @@ pub const Sqe = extern struct {
         sqe.rw_flags = @bitCast(flags);
     }
 
+    pub fn prep_msg_ring(
+        sqe: *Sqe,
+        fd: linux.fd_t,
+        len: u32,
+        data: u64,
+        flags: uflags.MsgRing,
+    ) void {
+        sqe.prep_rw(
+            .msg_ring,
+            fd,
+            undefined,
+            len,
+            data,
+        );
+        sqe.rw_flags = @bitCast(flags);
+    }
+
+    pub fn prep_msg_ring_cqe_flags(
+        sqe: *Sqe,
+        fd: linux.fd_t,
+        len: u32,
+        data: u64,
+        msg_flags: uflags.MsgRing,
+        cqe_flags: Cqe.Flags,
+    ) void {
+        const enable_flags_pass = blk: {
+            var flags = msg_flags;
+            flags.flags_pass = true;
+            break :blk flags;
+        };
+        sqe.prep_msg_ring(fd, len, data, enable_flags_pass);
+        // sqe.file_index in liburing maps to splice_fd_in in Zig sqe
+        sqe.splice_fd_in = @intCast(@as(u32, @bitCast(cqe_flags)));
+    }
+
+    pub fn prep_msg_ring_fd(
+        sqe: *Sqe,
+        fd: linux.fd_t,
+        source_fd: linux.fd_t,
+        target_fd: linux.fd_t,
+        data: u64,
+        flags: uflags.MsgRing,
+    ) void {
+        sqe.prep_rw(
+            .msg_ring,
+            fd,
+            @intFromEnum(MsgRingCmd.send_fd),
+            0,
+            data,
+        );
+        sqe.addr3 = @intCast(source_fd);
+        sqe.rw_flags = @bitCast(flags);
+        sqe.set_target_fixed_file(@intCast(target_fd));
+    }
+
+    pub fn prep_msg_ring_fd_alloc(
+        sqe: *Sqe,
+        fd: linux.fd_t,
+        source_fd: linux.fd_t,
+        data: u64,
+        flags: uflags.MsgRing,
+    ) void {
+        sqe.prep_rw(
+            .msg_ring,
+            fd,
+            @intFromEnum(MsgRingCmd.send_fd),
+            0,
+            data,
+        );
+        sqe.addr3 = @intCast(source_fd);
+        sqe.rw_flags = @bitCast(flags);
+        sqe.set_target_fixed_file(constants.FILE_INDEX_ALLOC);
+    }
+
+    pub fn prep_getxattr(
+        sqe: *Sqe,
+        name: []const u8,
+        value: []const u8,
+        path: []const u8,
+        len: u32,
+    ) void {
+        sqe.prep_rw(
+            .getxattr,
+            0,
+            @intFromPtr(name.ptr),
+            len,
+            @intFromPtr(value.ptr),
+        );
+        sqe.addr3 = @intFromPtr(path.ptr);
+    }
+
+    pub fn prep_setxattr(
+        sqe: *Sqe,
+        name: []const u8,
+        value: []const u8,
+        path: []const u8,
+        flags: linux.SetXattr,
+        len: u32,
+    ) void {
+        sqe.prep_rw(
+            .setxattr,
+            0,
+            @intFromPtr(name.ptr),
+            len,
+            @intFromPtr(value.ptr),
+        );
+        sqe.addr3 = @intFromPtr(path.ptr);
+        sqe.rw_flags = @bitCast(flags);
+    }
+
     pub fn prep_files_update(
         sqe: *Sqe,
         fds: []const linux.fd_t,

From f04aa8fa03a132312f50c20ed8c05563a65481ff Mon Sep 17 00:00:00 2001
From: Bernard Assan
Date: Fri, 17 Oct 2025 16:07:57 +0000
Subject: [PATCH 28/41] IoUring: Implement set_iowait functionality

closes https://github.com/ziglang/zig/issues/25566
closes https://github.com/ziglang/zig/pull/25604

and IoUring enter_flags() helper function

add typed Init flags which match int_flags in liburing

Co-authored-by: RaidoAun <45874591+RaidoAun@users.noreply.github.com>
Signed-off-by: Bernard Assan
---
 lib/std/os/linux/IoUring.zig | 63 ++++++++++++++++++++++++++++++++----
 1 file changed, 57 insertions(+), 6 deletions(-)

diff --git a/lib/std/os/linux/IoUring.zig b/lib/std/os/linux/IoUring.zig
index 250152896634..40aeb861b1cf 100644
--- a/lib/std/os/linux/IoUring.zig
+++ b/lib/std/os/linux/IoUring.zig
@@ -16,6 +16,8 @@ sq: Sq,
 cq: Cq,
 flags: uflags.Setup,
 features: uflags.Features,
+/// matches int_flags in liburing
+init_flags: uflags.Init,
 
 /// A friendly way to setup an
io_uring, with default linux.io_uring_params. /// `entries` must be a power of two between 1 and 32768, although the kernel @@ -131,6 +133,7 @@ pub fn init_params(entries: u16, p: *Params) !IoUring { .cq = cq, .flags = p.flags, .features = p.features, + .init_flags = .{}, }; } @@ -182,7 +185,7 @@ pub fn submit(self: *IoUring) !u32 { /// Matches the implementation of `io_uring_submit_and_wait()` in liburing. pub fn submit_and_wait(self: *IoUring, wait_nr: u32) !u32 { const submitted = self.flush_sq(); - var flags: uflags.Enter = .{}; + var flags: uflags.Enter = self.enter_flags(); if (self.sq_ring_needs_enter(&flags) or wait_nr > 0) { if (wait_nr > 0 or self.flags.iopoll) { flags.getevents = true; @@ -264,7 +267,7 @@ pub fn flush_sq(self: *IoUring) u32 { /// awakened. For the latter case, we set the SQ thread wakeup flag. /// Matches the implementation of `sq_ring_needs_enter()` in liburing. pub fn sq_ring_needs_enter(self: *IoUring, flags: *uflags.Enter) bool { - assert(flags.*.empty()); + assert(flags.*.valid_init_flags()); if (!self.flags.sqpoll) return true; if (@atomicLoad(Sq.Flags, self.sq.flags, .unordered).need_wakeup) { flags.*.sq_wakeup = true; @@ -309,7 +312,12 @@ pub fn copy_cqes(self: *IoUring, cqes: []Cqe, wait_nr: u32) !u32 { const count = self.copy_cqes_ready(cqes); if (count > 0) return count; if (self.cq_ring_needs_flush() or wait_nr > 0) { - _ = try self.enter(0, wait_nr, .{ .getevents = true }); + const flags = blk: { + var flags = self.enter_flags(); + flags.getevents = true; + break :blk flags; + }; + _ = try self.enter(0, wait_nr, flags); return self.copy_cqes_ready(cqes); } return 0; @@ -374,6 +382,20 @@ pub fn cq_advance(self: *IoUring, count: u32) void { } } +/// Enable/disable setting of iowait by the kernel. +/// matches `io_uring_set_iowait` in liburing +pub fn set_iowait(self: *IoUring, enable_iowait: bool) !void { + if (!self.features.no_iowait) { + return error.SystemOutdated; + } + self.init_flags.no_iowait = !enable_iowait; +} + +/// matches `ring_enter_flags()` in liburing +pub fn enter_flags(self: *IoUring) uflags.Enter { + return self.init_flags.enter_flags(); +} + /// Queues (but does not submit) an SQE to perform a `splice(2)` /// Either `fd_in` or `fd_out` must be a pipe. 
/// If `fd_in` refers to a pipe, `off_in` is ignored and must be set to @@ -4190,13 +4212,41 @@ pub const uflags = struct { no_iowait: bool = false, _9: u24 = 0, - pub fn empty(enter_flags: Enter) bool { - return @as(u32, @bitCast(enter_flags)) == 0; + /// Ensure only `Init` flags usable in `Enter` are set + pub fn valid_init_flags(self: Enter) bool { + const valid_flags: u32 = @bitCast(Enter{ .registered_ring = true, .no_iowait = true }); + const flags: u32 = @bitCast(self); + // check if any invalid flags are set + return (flags & ~valid_flags) == 0; + } + + pub fn empty(flags: Enter) bool { + return @as(u32, @bitCast(flags)) == 0; + } + }; + + /// matches INT_FLAG_* in liburing + pub const Init = packed struct(u8) { + reg_reg_ring: bool = false, + app_mem: bool = false, + cq_enter: bool = false, + _4: u1 = 0, + /// matches `registered_ring` flag in `Enter` + reg_ring: bool = false, + _6: u2 = 0, + /// matches `no_iowait` flag in `Enter` + no_iowait: bool = false, + + /// Return all valid `Enter` flags set in `Init` + pub fn enter_flags(self: Init) Enter { + const valid_flags: u8 = @bitCast(Init{ .reg_ring = true, .no_iowait = true }); + const flags: u8 = @bitCast(self); + return @bitCast(@as(u32, @intCast(flags & valid_flags))); } }; /// io_uring_params.features flags - const Features = packed struct(u32) { + pub const Features = packed struct(u32) { single_mmap: bool = false, nodrop: bool = false, submit_stable: bool = false, @@ -4223,6 +4273,7 @@ pub const uflags = struct { } }; }; + /// `io_uring_register(2)` opcodes and arguments /// matches `io_uring_register_op` in liburing pub const RegisterOp = enum(u8) { From f9b1a1af7026f063278b69f371db658d9f905645 Mon Sep 17 00:00:00 2001 From: Bernard Assan Date: Sat, 18 Oct 2025 01:11:32 +0000 Subject: [PATCH 29/41] Add XattrSource to decide how to prepare set/getxattr operations Signed-off-by: Bernard Assan --- lib/std/os/linux/IoUring.zig | 223 ++++++++++++++++++++++------------- 1 file changed, 142 insertions(+), 81 deletions(-) diff --git a/lib/std/os/linux/IoUring.zig b/lib/std/os/linux/IoUring.zig index 40aeb861b1cf..f7ee6c26496a 100644 --- a/lib/std/os/linux/IoUring.zig +++ b/lib/std/os/linux/IoUring.zig @@ -1636,35 +1636,121 @@ pub fn msg_ring_fd_alloc( /// Queues (but does not submit) an SQE to prepares a request to get an /// extended attribute value +/// The `from` parameter is used to decide the source to get the extended +/// attributes from /// Returns a pointer to the SQE. pub fn getxattr( self: *IoUring, user_data: u64, name: []const u8, value: []const u8, - path: []const u8, + from: XattrSource, len: u32, ) !*Sqe { const sqe = try self.get_sqe(); - sqe.prep_getxattr(name, value, path, len); + switch (from) { + .path => |path_| sqe.prep_getxattr(name, value, path_, len), + .fd => |fd_| sqe.prep_fgetxattr(name, value, fd_, len), + } sqe.user_data = user_data; return sqe; } /// Queues (but does not submit) an SQE to prepares a request to set an /// extended attribute value +/// The `on` parameter is used to decide the source to set the extended +/// attributes on /// Returns a pointer to the SQE. 
pub fn setxattr( self: *IoUring, user_data: u64, name: []const u8, value: []const u8, - path: []const u8, + on: XattrSource, flags: linux.SetXattr, len: u32, ) !*Sqe { const sqe = try self.get_sqe(); - sqe.prep_setxattr(name, value, path, flags, len); + switch (on) { + .path => |path_| sqe.prep_setxattr(name, value, path_, flags, len), + .fd => |fd_| sqe.prep_fsetxattr(name, value, fd_, flags, len), + } + sqe.user_data = user_data; + return sqe; +} + +/// Prepares a socket creation request. +/// New socket fd will be returned in completion result. +/// Available since 5.19 +pub fn socket( + self: *IoUring, + user_data: u64, + domain: linux.Af, + socket_type: linux.Sock, + protocol: linux.IpProto, + /// flags is unused + flags: u32, +) !*Sqe { + const sqe = try self.get_sqe(); + sqe.prep_socket(domain, socket_type, protocol, flags); + sqe.user_data = user_data; + return sqe; +} + +/// Prepares a socket creation request for registered file at index `file_index`. +/// Available since 5.19 +pub fn socket_direct( + self: *IoUring, + user_data: u64, + domain: linux.Af, + socket_type: linux.Sock, + protocol: linux.IpProto, + /// flags is unused + flags: u32, + file_index: u32, +) !*Sqe { + const sqe = try self.get_sqe(); + sqe.prep_socket_direct(domain, socket_type, protocol, flags, file_index); + sqe.user_data = user_data; + return sqe; +} + +/// Prepares a socket creation request for registered file, index chosen by +/// kernel (file index alloc). +/// File index will be returned in CQE res field. +/// Available since 5.19 +pub fn socket_direct_alloc( + self: *IoUring, + user_data: u64, + domain: linux.Af, + socket_type: linux.Sock, + protocol: linux.IpProto, + /// flags unused + flags: u32, +) !*Sqe { + const sqe = try self.get_sqe(); + sqe.prep_socket_direct_alloc(domain, socket_type, protocol, flags); + sqe.user_data = user_data; + return sqe; +} + +/// Prepares an cmd request for a socket. +/// See: https://man7.org/linux/man-pages/man3/io_uring_prep_cmd.3.html +/// Available since 6.7. +pub fn cmd_sock( + self: *IoUring, + user_data: u64, + cmd_op: SocketOp, + fd: linux.fd_t, + level: linux.Sol, + optname: linux.So, + /// pointer to the option value + optval: u64, + /// size of the option value + optlen: u32, +) !*Sqe { + const sqe = try self.get_sqe(); + sqe.prep_cmd_sock(cmd_op, fd, level, optname, optval, optlen); sqe.user_data = user_data; return sqe; } @@ -1678,7 +1764,8 @@ pub fn waitid( id: i32, infop: *linux.siginfo_t, options: linux.W, - flags: u32, // They are currently unused, and hence 0 should be passed + /// They are currently unused, and hence 0 should be passed + flags: u32, ) !*Sqe { const sqe = try self.get_sqe(); sqe.prep_waitid(id_type, id, infop, options, flags); @@ -1980,78 +2067,6 @@ fn handle_registration_result(res: usize) !void { } } -/// Prepares a socket creation request. -/// New socket fd will be returned in completion result. -/// Available since 5.19 -pub fn socket( - self: *IoUring, - user_data: u64, - domain: linux.Af, - socket_type: linux.Sock, - protocol: linux.IpProto, - flags: u32, // flags is unused -) !*Sqe { - const sqe = try self.get_sqe(); - sqe.prep_socket(domain, socket_type, protocol, flags); - sqe.user_data = user_data; - return sqe; -} - -/// Prepares a socket creation request for registered file at index `file_index`. 
-/// Available since 5.19 -pub fn socket_direct( - self: *IoUring, - user_data: u64, - domain: linux.Af, - socket_type: linux.Sock, - protocol: linux.IpProto, - /// flags is unused - flags: u32, - file_index: u32, -) !*Sqe { - const sqe = try self.get_sqe(); - sqe.prep_socket_direct(domain, socket_type, protocol, flags, file_index); - sqe.user_data = user_data; - return sqe; -} - -/// Prepares a socket creation request for registered file, index chosen by -/// kernel (file index alloc). -/// File index will be returned in CQE res field. -/// Available since 5.19 -pub fn socket_direct_alloc( - self: *IoUring, - user_data: u64, - domain: linux.Af, - socket_type: linux.Sock, - protocol: linux.IpProto, - flags: u32, // flags unused -) !*Sqe { - const sqe = try self.get_sqe(); - sqe.prep_socket_direct_alloc(domain, socket_type, protocol, flags); - sqe.user_data = user_data; - return sqe; -} - -/// Prepares an cmd request for a socket. -/// See: https://man7.org/linux/man-pages/man3/io_uring_prep_cmd.3.html -/// Available since 6.7. -pub fn cmd_sock( - self: *IoUring, - user_data: u64, - cmd_op: SocketOp, - fd: linux.fd_t, - level: linux.Sol, - optname: linux.So, - optval: u64, // pointer to the option value - optlen: u32, // size of the option value -) !*Sqe { - const sqe = try self.get_sqe(); - sqe.prep_cmd_sock(cmd_op, fd, level, optname, optval, optlen); - sqe.user_data = user_data; - return sqe; -} - /// Prepares set socket option for the optname argument, at the protocol /// level specified by the level argument. /// Available since 6.7.n @@ -3063,6 +3078,22 @@ pub const Sqe = extern struct { sqe.addr3 = @intFromPtr(path.ptr); } + pub fn prep_fgetxattr( + sqe: *Sqe, + name: []const u8, + value: []const u8, + fd: linux.fd_t, + len: u32, + ) void { + sqe.prep_rw( + .fgetxattr, + fd, + @intFromPtr(name.ptr), + len, + @intFromPtr(value.ptr), + ); + } + pub fn prep_setxattr( sqe: *Sqe, name: []const u8, @@ -3082,6 +3113,24 @@ pub const Sqe = extern struct { sqe.rw_flags = @bitCast(flags); } + pub fn prep_fsetxattr( + sqe: *Sqe, + name: []const u8, + value: []const u8, + fd: linux.fd_t, + flags: linux.SetXattr, + len: u32, + ) void { + sqe.prep_rw( + .fsetxattr, + fd, + @intFromPtr(name.ptr), + len, + @intFromPtr(value.ptr), + ); + sqe.rw_flags = @bitCast(flags); + } + pub fn prep_files_update( sqe: *Sqe, fds: []const linux.fd_t, @@ -3125,7 +3174,8 @@ pub const Sqe = extern struct { domain: linux.Af, socket_type: linux.Sock, protocol: linux.IpProto, - flags: u32, // flags is unused + /// flags is unused + flags: u32, ) void { sqe.prep_rw(.socket, @intFromEnum(domain), 0, @intFromEnum(protocol), @as(u32, @bitCast(socket_type))); sqe.rw_flags = flags; @@ -3136,7 +3186,8 @@ pub const Sqe = extern struct { domain: linux.Af, socket_type: linux.Sock, protocol: linux.IpProto, - flags: u32, // flags is unused + /// flags is unused + flags: u32, file_index: u32, ) void { prep_socket(sqe, domain, socket_type, protocol, flags); @@ -3148,7 +3199,8 @@ pub const Sqe = extern struct { domain: linux.Af, socket_type: linux.Sock, protocol: linux.IpProto, - flags: u32, // flags is unused + /// flags is unused + flags: u32, ) void { prep_socket(sqe, domain, socket_type, protocol, flags); set_target_fixed_file(sqe, constants.FILE_INDEX_ALLOC); @@ -3160,7 +3212,8 @@ pub const Sqe = extern struct { id: i32, infop: *linux.siginfo_t, options: linux.W, - flags: u32, // flags is unused + /// flags is unused + flags: u32, ) void { sqe.prep_rw(.waitid, id, 0, @intFromEnum(id_type), @intFromPtr(infop)); sqe.rw_flags = flags; 
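With the `XattrSource` union (added in the next hunk), the path-based and
fd-based xattr opcodes share a single pair of entry points. A hypothetical call
under the signatures above, where `fd` is an already-open file descriptor and
the `SetXattr` flags default to empty:

    var value: [64]u8 = undefined;
    // Fetch `user.tag` from a path...
    _ = try ring.getxattr(7, "user.tag", &value, .{ .path = "data.bin" }, value.len);
    // ...and set it on an open fd.
    _ = try ring.setxattr(8, "user.tag", "hot", .{ .fd = fd }, .{}, 3);
    _ = try ring.submit();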
@@ -3531,6 +3584,14 @@ pub const WriteBuffer = union(enum) { iovecs: []const posix.iovec_const, }; +/// Used to select how get/setxttr should be handled. +pub const XattrSource = union(enum) { + /// Get/Set xattr associated with the given path in the filesystem + path: []const u8, + /// Get/Set xattr for the opened file referenced by this fd + fd: linux.fd_t, +}; + /// Used to select how the recv call should be handled. pub const RecvBuffer = union(enum) { /// io_uring will recv directly into this buffer From dd43389cb87560b13c4676ba5b913ab9ac7befb3 Mon Sep 17 00:00:00 2001 From: Bernard Assan Date: Sun, 19 Oct 2025 22:55:02 +0000 Subject: [PATCH 30/41] IoUring: futex operations Signed-off-by: Bernard Assan Add futex_* operations to IoUring Refactor Futex 2 flags into Futex2 struct add mpol to Wait flags and fix private field as its 128 not 32 Signed-off-by: Bernard Assan Update futex2_* functions and constants to use the new Futex2 type Improve the Api of futex2_* functions to be more idiomatic Zig Signed-off-by: Bernard Assan --- lib/std/os/linux.zig | 157 ++++++++++++++++++++++------------- lib/std/os/linux/IoUring.zig | 105 ++++++++++++++++++++++- 2 files changed, 203 insertions(+), 59 deletions(-) diff --git a/lib/std/os/linux.zig b/lib/std/os/linux.zig index ff7f77804dc2..ce711166dd85 100644 --- a/lib/std/os/linux.zig +++ b/lib/std/os/linux.zig @@ -722,13 +722,13 @@ pub fn futex_4arg(uaddr: *const anyopaque, futex_op: FUTEX_OP, val: u32, timeout ); } -/// Given an array of `futex2_waitone`, wait on each uaddr. +/// Given an array of `Futex2.WaitOne`, wait on each uaddr. /// The thread wakes if a futex_wake() is performed at any uaddr. /// The syscall returns immediately if any futex has *uaddr != val. /// timeout is an optional, absolute timeout value for the operation. /// The `flags` argument is for future use and currently should be `.{}`. /// Flags for private futexes, sizes, etc. should be set on the -/// individual flags of each `futex2_waitone`. +/// individual flags of each `Futex2.WaitOne`. /// /// Returns the array index of one of the woken futexes. /// No further information is provided: any number of other futexes may also @@ -738,20 +738,21 @@ pub fn futex_4arg(uaddr: *const anyopaque, futex_op: FUTEX_OP, val: u32, timeout /// most recently woken, nor...) /// /// Requires at least kernel v5.16. +// TODO: can't we use slices here? and assert `Futex2.waitone_max` pub fn futex2_waitv( - futexes: [*]const futex2_waitone, - /// Length of `futexes`. Max of FUTEX2_WAITONE_MAX. - nr_futexes: u32, - flags: FUTEX2_FLAGS_WAITV, + /// The length of `futexes` slice must not exceed `Futex2.waitone_max` + futexes: []const Futex2.WaitOne, + flags: Futex2.Waitv, /// Optional absolute timeout. Always 64-bit, even on 32-bit platforms. timeout: ?*const kernel_timespec, /// Clock to be used for the timeout, realtime or monotonic. clockid: clockid_t, ) usize { + assert(futexes.len <= Futex2.waitone_max); return syscall5( .futex_waitv, @intFromPtr(futexes), - nr_futexes, + @intCast(futexes.len), @as(u32, @bitCast(flags)), @intFromPtr(timeout), @intFromEnum(clockid), @@ -769,8 +770,8 @@ pub fn futex2_wait( /// Value of `uaddr`. val: usize, /// Bitmask to match against incoming wakeup masks. Must not be zero. - mask: usize, - flags: FUTEX2_FLAGS, + mask: Futex2.Bitset, + flags: Futex2.Wait, /// Optional absolute timeout. Always 64-bit, even on 32-bit platforms. timeout: ?*const kernel_timespec, /// Clock to be used for the timeout, realtime or monotonic. 
@@ -780,7 +781,7 @@ pub fn futex2_wait( .futex_wait, @intFromPtr(uaddr), val, - mask, + @intFromEnum(mask), @as(u32, @bitCast(flags)), @intFromPtr(timeout), @intFromEnum(clockid), @@ -796,16 +797,16 @@ pub fn futex2_wake( /// Futex to wake uaddr: *const anyopaque, /// Bitmask to match against waiters. - mask: usize, + mask: Futex2.Bitset, /// Maximum number of waiters on the futex to wake. nr_wake: i32, - flags: FUTEX2_FLAGS, + flags: Futex2.Wake, ) usize { return syscall4( .futex_wake, @intFromPtr(uaddr), - mask, - @as(u32, @bitCast(nr_wake)), + @intFromEnum(mask), + @intCast(nr_wake), @as(u32, @bitCast(flags)), ); } @@ -814,11 +815,12 @@ pub fn futex2_wake( /// Identical to `FUTEX.CMP_REQUEUE`, except it is part of the futex2 family of calls. /// /// Requires at least kernel v6.7. +// TODO: test to ensure I didn't break it pub fn futex2_requeue( /// The source and destination futexes. Must be a 2-element array. - waiters: [*]const futex2_waitone, + waiters: *const [2]Futex2.WaitOne, /// Currently unused. - flags: FUTEX2_FLAGS_REQUEUE, + flags: Futex2.Requeue, /// Maximum number of waiters to wake on the source futex. nr_wake: i32, /// Maximum number of waiters to transfer to the destination futex. @@ -828,8 +830,8 @@ pub fn futex2_requeue( .futex_requeue, @intFromPtr(waiters), @as(u32, @bitCast(flags)), - @as(u32, @bitCast(nr_wake)), - @as(u32, @bitCast(nr_requeue)), + @intCast(nr_wake), + @intCast(nr_requeue), ); } @@ -3629,39 +3631,91 @@ pub const FUTEX_WAKE_OP_CMP = enum(u4) { GE = 5, }; -/// Max numbers of elements in a `futex2_waitone` array. -pub const FUTEX2_WAITONE_MAX = 128; +pub const Futex2 = struct { + /// Max numbers of elements in a `futex_waitv` .ie `WaitOne` array + /// matches FUTEX_WAITV_MAX + pub const waitone_max = 128; -/// For futex v2 API, the size of the futex at the uaddr. v1 futex are -/// always implicitly U32. As of kernel v6.14, only U32 is implemented -/// for v2 futexes. -pub const FUTEX2_SIZE = enum(u2) { - U8 = 0, - U16 = 1, - U32 = 2, - U64 = 3, -}; + /// For futex v2 API, the size of the futex at the uaddr. v1 futex are + /// always implicitly U32. As of kernel v6.14, only U32 is implemented + /// for v2 futexes. + pub const Size = enum(u2) { + U8 = 0, + U16 = 1, + U32 = 2, + U64 = 3, + }; -/// As of kernel 6.14 there are no defined flags to futex2_waitv. -pub const FUTEX2_FLAGS_WAITV = packed struct(u32) { - _reserved: u32 = 0, -}; + /// flags for `futex2_requeue` syscall + /// As of kernel 6.14 there are no defined flags to futex2_requeue. + pub const Requeue = packed struct(u32) { + _: u32 = 0, + }; -/// As of kernel 6.14 there are no defined flags to futex2_requeue. -pub const FUTEX2_FLAGS_REQUEUE = packed struct(u32) { - _reserved: u32 = 0, -}; + /// flags for `futex2_waitv` syscall + /// As of kernel 6.14 there are no defined flags to futex2_waitv. + pub const Waitv = packed struct(u32) { + _: u32 = 0, + }; -/// Flags for futex v2 APIs (futex2_wait, futex2_wake, futex2_requeue, but -/// not the futex2_waitv syscall, but also used in the futex2_waitone struct). 
-pub const FUTEX2_FLAGS = packed struct(u32) { - size: FUTEX2_SIZE, - numa: bool = false, - _reserved: u4 = 0, - private: bool, - _undefined: u24 = 0, + /// flags for `futex2_wait` syscall + // COMMIT: add mpol and fix private field as its 128 not 32 + pub const Wait = packed struct(u32) { + size: Size, + numa: bool = false, + mpol: bool = false, + _5: u3 = 0, + private: bool, + _9: u24 = 0, + }; + + /// flags for `futex2_wake` syscall + pub const Wake = Wait; + + /// A waiter for vectorized wait + /// For `futex2_waitv` and `futex2_requeue`. Arrays of `WaitOne` + /// allow waiting on multiple futexes in one call. + /// matches `futex_waitv` in kernel + pub const WaitOne = extern struct { + /// Expected value at uaddr, should match size of futex. + val: u64, + /// User address to wait on. Top-bits must be 0 on 32-bit. + uaddr: u64, + /// Flags for this waiter. + flags: Wait, + /// Reserved member to preserve data alignment. + __reserved: u32 = 0, + }; + + pub const Bitset = enum(u64) { + /// matches FUTEX_WAIT_BITSET + wait = 9, + /// matches FUTEX_WAKE_BITSET + wake = 10, + /// bitset with all bits set for the FUTEX_xxx_BITSET OPs to request a + /// match of any bit. + match_any = 0xffffffff, + }; }; +/// DEPRECATED use `Futex2.WaitOne` +pub const futex2_waitone = Futex2.WaitOne; + +/// DEPRECATED use constant in `Futex2` +pub const FUTEX2_WAITONE_MAX = Futex2.waitone_max; + +/// DEPRECATED use `Size` type in `Futex2` +pub const FUTEX2_SIZE = Futex2.Size; + +/// DEPRECATED use `Waitv` in `Futex2` +pub const FUTEX2_FLAGS_WAITV = Futex2.Waitv; + +/// DEPRECATED use `Requeue` in `Futex2` +pub const FUTEX2_FLAGS_REQUEUE = Futex2.Requeue; + +/// DEPRECATED use `Wait` in `Futex2` +pub const FUTEX2_FLAGS = Futex2.Wait; + pub const PROT = struct { /// page can not be accessed pub const NONE = 0x0; @@ -9810,19 +9864,6 @@ pub const PTRACE = struct { }; }; -/// For futex2_waitv and futex2_requeue. Arrays of `futex2_waitone` allow -/// waiting on multiple futexes in one call. -pub const futex2_waitone = extern struct { - /// Expected value at uaddr, should match size of futex. - val: u64, - /// User address to wait on. Top-bits must be 0 on 32-bit. - uaddr: u64, - /// Flags for this waiter. - flags: FUTEX2_FLAGS, - /// Reserved member to preserve alignment. - __reserved: u32 = 0, -}; - pub const cache_stat_range = extern struct { off: u64, len: u64, diff --git a/lib/std/os/linux/IoUring.zig b/lib/std/os/linux/IoUring.zig index f7ee6c26496a..fd31e6794ecb 100644 --- a/lib/std/os/linux/IoUring.zig +++ b/lib/std/os/linux/IoUring.zig @@ -295,7 +295,7 @@ pub fn cq_ready(self: *IoUring) u32 { /// Copies as many CQEs as are ready, and that can fit into the destination /// `cqes` slice. If none are available, enters into the kernel to wait for at -/// most `wait_nr` CQEs. +/// least `wait_nr` CQEs. /// Returns the number of CQEs copied, advancing the CQ ring. /// Provides all the wait/peek methods found in liburing, but with batching and /// a single method. 
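Since the doc fix above makes `wait_nr` a lower bound rather than an upper one,
a short batching sketch may help reviewers; `Cqe` and `ring` are as used
elsewhere in this file:

    var cqes: [256]Cqe = undefined;
    // Returns immediately with whatever is ready; otherwise enters the
    // kernel and blocks until at least 8 completions have been posted.
    const n = try ring.copy_cqes(&cqes, 8);
    for (cqes[0..n]) |cqe| {
        // dispatch on cqe.user_data ...
        _ = cqe;
    }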
@@ -1773,6 +1773,54 @@ pub fn waitid( return sqe; } +/// Available since kernel 6.7 +pub fn futex_wake( + self: *IoUring, + user_data: u64, + futex: *u32, + max_wake_count: u64, + mask: linux.Futex2.Bitset, + futex_flags: linux.Futex2.Wake, + flags: u32, // They are currently unused, and hence 0 should be passed +) !*Sqe { + const sqe = try self.get_sqe(); + sqe.prep_futex_wake(futex, max_wake_count, mask, futex_flags, flags); + sqe.user_data = user_data; + return sqe; +} + +/// Available since kernel 6.7 +pub fn futex_wait( + self: *IoUring, + user_data: u64, + futex: *u32, + max_wake_count: u64, + mask: linux.Futex2.Bitset, + futex_flags: linux.Futex2.Wait, + /// They are currently unused, and hence 0 should be passed + flags: u32, +) !*Sqe { + const sqe = try self.get_sqe(); + sqe.prep_futex_wait(futex, max_wake_count, mask, futex_flags, flags); + sqe.user_data = user_data; + return sqe; +} + +// TODO: ensure flags and Wait in futexv are correct +/// Available since kernel 6.7 +pub fn futex_waitv( + self: *IoUring, + user_data: u64, + futexv: []linux.Futex2.WaitOne, + /// They are currently unused, and hence 0 should be passed + flags: u32, +) !*Sqe { + const sqe = try self.get_sqe(); + sqe.prep_futex_waitv(futexv, flags); + sqe.user_data = user_data; + return sqe; +} + pub fn register_buffers_sparse(self: *IoUring, nr: u32) !void { assert(self.fd >= 0); @@ -3220,6 +3268,61 @@ pub const Sqe = extern struct { sqe.splice_fd_in = @bitCast(options); } + pub fn prep_futex_wake( + sqe: *Sqe, + futex: *u32, + max_wake_count: u64, + mask: linux.Futex2.Bitset, + futex_flags: linux.Futex2.Wake, + flags: u32, // They are currently unused, and hence 0 should be passed + ) void { + sqe.prep_rw( + .futex_wake, + @intCast(@as(u32, @bitCast(futex_flags))), + @intFromPtr(futex), + 0, + max_wake_count, + ); + sqe.rw_flags = flags; + sqe.addr3 = @intFromEnum(mask); + } + + pub fn prep_futex_wait( + sqe: *Sqe, + futex: *u32, + max_wake_count: u64, + mask: linux.Futex2.Bitset, + futex_flags: linux.Futex2.Wait, + /// They are currently unused, and hence 0 should be passed + flags: u32, + ) void { + sqe.prep_rw( + .futex_wait, + @intCast(@as(u32, @bitCast(futex_flags))), + @intFromPtr(futex), + 0, + max_wake_count, + ); + sqe.rw_flags = flags; + sqe.addr3 = @intFromEnum(mask); + } + + pub fn prep_futex_waitv( + sqe: *Sqe, + futexv: []linux.Futex2.WaitOne, + /// They are currently unused, and hence 0 should be passed + flags: u32, + ) void { + sqe.prep_rw( + .futex_waitv, + 0, + @intFromPtr(futexv.ptr), + futexv.len, + 0, + ); + sqe.rw_flags = flags; + } + // TODO: maybe remove unused flag fields? 
pub fn prep_bind( sqe: *Sqe, From c6b3844e754fd9d02f738d8f80f2e5b5e7b610b0 Mon Sep 17 00:00:00 2001 From: Bernard Assan Date: Tue, 21 Oct 2025 17:34:49 +0000 Subject: [PATCH 31/41] Add a Futex2 `Bitset` type for futex2 wake and wait syscalls Cleanup linux/test.zig type futex/2_* uaddr as *const u32 consider changing to *const atomic.Value(u32) Use At Flags in fstatat Use EpollOp in epoll_ctl syscall Signed-off-by: Bernard Assan --- lib/std/Thread.zig | 2 +- lib/std/os/linux.zig | 121 +++++++++++++++++++++++++++++++------- lib/std/os/linux/test.zig | 76 ++++++++++++------------ 3 files changed, 138 insertions(+), 61 deletions(-) diff --git a/lib/std/Thread.zig b/lib/std/Thread.zig index 93563bb245fb..7e3b230cc409 100644 --- a/lib/std/Thread.zig +++ b/lib/std/Thread.zig @@ -1663,7 +1663,7 @@ const LinuxThreadImpl = struct { if (tid == 0) break; switch (linux.errno(linux.futex_4arg( - &self.thread.child_tid.raw, + @ptrCast(&self.thread.child_tid.raw), .{ .cmd = .WAIT, .private = false }, @bitCast(tid), null, diff --git a/lib/std/os/linux.zig b/lib/std/os/linux.zig index ce711166dd85..005a21a22f20 100644 --- a/lib/std/os/linux.zig +++ b/lib/std/os/linux.zig @@ -687,7 +687,14 @@ pub const futex_param4 = extern union { /// /// The futex_op parameter is a sub-command and flags. The sub-command /// defines which of the subsequent paramters are relevant. -pub fn futex(uaddr: *const anyopaque, futex_op: FUTEX_OP, val: u32, val2timeout: futex_param4, uaddr2: ?*const anyopaque, val3: u32) usize { +pub fn futex( + uaddr: *const u32, + futex_op: FUTEX_OP, + val: u32, + val2timeout: futex_param4, + uaddr2: ?*const anyopaque, + val3: u32, +) usize { return syscall6( if (@hasField(SYS, "futex") and native_arch != .hexagon) .futex else .futex_time64, @intFromPtr(uaddr), @@ -701,7 +708,7 @@ pub fn futex(uaddr: *const anyopaque, futex_op: FUTEX_OP, val: u32, val2timeout: /// Three-argument variation of the v1 futex call. Only suitable for a /// futex_op that ignores the remaining arguments (e.g., FUTUX_OP.WAKE). -pub fn futex_3arg(uaddr: *const anyopaque, futex_op: FUTEX_OP, val: u32) usize { +pub fn futex_3arg(uaddr: *const u32, futex_op: FUTEX_OP, val: u32) usize { return syscall3( if (@hasField(SYS, "futex") and native_arch != .hexagon) .futex else .futex_time64, @intFromPtr(uaddr), @@ -712,7 +719,7 @@ pub fn futex_3arg(uaddr: *const anyopaque, futex_op: FUTEX_OP, val: u32) usize { /// Four-argument variation on the v1 futex call. Only suitable for /// futex_op that ignores the remaining arguments (e.g., FUTEX_OP.WAIT). -pub fn futex_4arg(uaddr: *const anyopaque, futex_op: FUTEX_OP, val: u32, timeout: ?*const timespec) usize { +pub fn futex_4arg(uaddr: *const u32, futex_op: FUTEX_OP, val: u32, timeout: ?*const timespec) usize { return syscall4( if (@hasField(SYS, "futex") and native_arch != .hexagon) .futex else .futex_time64, @intFromPtr(uaddr), @@ -751,7 +758,7 @@ pub fn futex2_waitv( assert(futexes.len <= Futex2.waitone_max); return syscall5( .futex_waitv, - @intFromPtr(futexes), + @intFromPtr(futexes.ptr), @intCast(futexes.len), @as(u32, @bitCast(flags)), @intFromPtr(timeout), @@ -766,7 +773,7 @@ pub fn futex2_waitv( /// Requires at least kernel v6.7. pub fn futex2_wait( /// Address of the futex to wait on. - uaddr: *const anyopaque, + uaddr: *const u32, /// Value of `uaddr`. val: usize, /// Bitmask to match against incoming wakeup masks. Must not be zero. 
@@ -781,7 +788,7 @@ pub fn futex2_wait( .futex_wait, @intFromPtr(uaddr), val, - @intFromEnum(mask), + @intCast(mask.toInt()), @as(u32, @bitCast(flags)), @intFromPtr(timeout), @intFromEnum(clockid), @@ -795,7 +802,7 @@ pub fn futex2_wait( /// Requires at least kernel v6.7. pub fn futex2_wake( /// Futex to wake - uaddr: *const anyopaque, + uaddr: *const u32, /// Bitmask to match against waiters. mask: Futex2.Bitset, /// Maximum number of waiters on the futex to wake. @@ -805,7 +812,7 @@ pub fn futex2_wake( return syscall4( .futex_wake, @intFromPtr(uaddr), - @intFromEnum(mask), + @intCast(mask.toInt()), @intCast(nr_wake), @as(u32, @bitCast(flags)), ); @@ -2240,16 +2247,27 @@ pub fn lstat(pathname: [*:0]const u8, statbuf: *Stat) usize { } } -// TODO: flags is At Flags -pub fn fstatat(dirfd: i32, path: [*:0]const u8, stat_buf: *Stat, flags: u32) usize { +pub fn fstatat(dirfd: i32, path: [*:0]const u8, stat_buf: *Stat, flags: At) usize { if (native_arch == .riscv32 or native_arch.isLoongArch()) { // riscv32 and loongarch have made the interesting decision to not implement some of // the older stat syscalls, including this one. @compileError("No fstatat syscall on this architecture."); } else if (@hasField(SYS, "fstatat64")) { - return syscall4(.fstatat64, @as(usize, @bitCast(@as(isize, dirfd))), @intFromPtr(path), @intFromPtr(stat_buf), flags); + return syscall4( + .fstatat64, + @as(usize, @bitCast(@as(isize, dirfd))), + @intFromPtr(path), + @intFromPtr(stat_buf), + @intCast(@as(u32, @bitCast(flags))), + ); } else { - return syscall4(.fstatat, @as(usize, @bitCast(@as(isize, dirfd))), @intFromPtr(path), @intFromPtr(stat_buf), flags); + return syscall4( + .fstatat, + @as(usize, @bitCast(@as(isize, dirfd))), + @intFromPtr(path), + @intFromPtr(stat_buf), + @as(u32, @bitCast(flags)), + ); } } @@ -2419,8 +2437,14 @@ pub fn epoll_create1(flags: usize) usize { return syscall1(.epoll_create1, flags); } -pub fn epoll_ctl(epoll_fd: i32, op: u32, fd: i32, ev: ?*epoll_event) usize { - return syscall4(.epoll_ctl, @as(usize, @bitCast(@as(isize, epoll_fd))), @as(usize, @intCast(op)), @as(usize, @bitCast(@as(isize, fd))), @intFromPtr(ev)); +pub fn epoll_ctl(epoll_fd: i32, op: EpollOp, fd: i32, ev: ?*epoll_event) usize { + return syscall4( + .epoll_ctl, + @as(usize, @bitCast(@as(isize, epoll_fd))), + @as(usize, @intFromEnum(op)), + @as(usize, @bitCast(@as(isize, fd))), + @intFromPtr(ev), + ); } pub fn epoll_wait(epoll_fd: i32, events: [*]epoll_event, maxevents: u32, timeout: i32) usize { @@ -3687,14 +3711,67 @@ pub const Futex2 = struct { __reserved: u32 = 0, }; - pub const Bitset = enum(u64) { - /// matches FUTEX_WAIT_BITSET - wait = 9, - /// matches FUTEX_WAKE_BITSET - wake = 10, - /// bitset with all bits set for the FUTEX_xxx_BITSET OPs to request a - /// match of any bit. - match_any = 0xffffffff, + /// `Bitset` for `futex2_wait`, `futex2_wake`, `IoUring.futex_wait` and + /// `IoUring.futex_wake` operations. + /// At least one bit must be set before performing supported operations. + /// The bitset is stored in the kernel-internal state of a waiter.
During a + /// wake operation, the same mask previously set during the wait call can + /// be used to select which waiters to wake up. + /// See https://man7.org/linux/man-pages/man2/futex_wake_bitset.2const.html + /// `IoUring` supports a u64 `Bitset` while the raw syscalls use only the low + /// 32 bits of `Bitset`. + pub const Bitset = packed struct(u64) { + waiter1: bool = false, + waiter2: bool = false, + waiter3: bool = false, + waiter4: bool = false, + waiter5: bool = false, + waiter6: bool = false, + waiter7: bool = false, + waiter8: bool = false, + waiter9: bool = false, + waiter10: bool = false, + waiter11: bool = false, + waiter12: bool = false, + waiter13: bool = false, + waiter14: bool = false, + waiter15: bool = false, + waiter16: bool = false, + waiter17: bool = false, + waiter18: bool = false, + waiter19: bool = false, + waiter20: bool = false, + waiter21: bool = false, + waiter22: bool = false, + waiter23: bool = false, + waiter24: bool = false, + waiter25: bool = false, + waiter26: bool = false, + waiter27: bool = false, + waiter28: bool = false, + waiter29: bool = false, + waiter30: bool = false, + waiter31: bool = false, + waiter32: bool = false, + io_uring_extra: u32 = 0, + + /// `Bitset` with all bits set for the FUTEX_xxx_BITSET OPs to request a + /// match of any bit. matches FUTEX_BITSET_MATCH_ANY + pub const match_any: Bitset = @bitCast(@as(u64, 0x00000000ffffffff)); + /// Bitset must not be empty, this is only useful in test + pub const empty: Bitset = .{}; + + /// Create from raw u64 value + pub fn fromInt(value: u64) Bitset { + assert(value != 0); // an all-zero bitset would never be matched + const bitset: Bitset = @bitCast(value); + return bitset; + } + + /// Convert to raw u64 for syscall + pub fn toInt(self: Bitset) u64 { + return @bitCast(self); + } }; }; diff --git a/lib/std/os/linux/test.zig b/lib/std/os/linux/test.zig index 4676d4899b33..fffc8abec349 100644 --- a/lib/std/os/linux/test.zig +++ b/lib/std/os/linux/test.zig @@ -45,34 +45,33 @@ test "timer" { var err: linux.E = linux.errno(epoll_fd); try expect(err == .SUCCESS); - const timer_fd = linux.timerfd_create(linux.TIMERFD_CLOCK.MONOTONIC, .{}); + const timer_fd = linux.timerfd_create(.MONOTONIC, .{}); try expect(linux.errno(timer_fd) == .SUCCESS); - const time_interval = linux.timespec{ + const time_interval: linux.timespec = .{ .sec = 0, .nsec = 2000000, }; - const new_time = linux.itimerspec{ + const new_time: linux.itimerspec = .{ .it_interval = time_interval, .it_value = time_interval, }; - err = linux.errno(linux.timerfd_settime(@as(i32, @intCast(timer_fd)), .{}, &new_time, null)); + err = linux.errno(linux.timerfd_settime(@intCast(timer_fd), .{}, &new_time, null)); try expect(err == .SUCCESS); - var event = linux.epoll_event{ + var event: linux.epoll_event = .{ .events = linux.EPOLL.IN | linux.EPOLL.OUT | linux.EPOLL.ET, - .data = linux.epoll_data{ .ptr = 0 }, + .data = .{ .ptr = 0 }, }; - err = linux.errno(linux.epoll_ctl(@as(i32, @intCast(epoll_fd)), linux.EPOLL.CTL_ADD, @as(i32, @intCast(timer_fd)), &event)); + err = linux.errno(linux.epoll_ctl(@intCast(epoll_fd), .ctl_add, @intCast(timer_fd), &event)); try expect(err == .SUCCESS); - const events_one: linux.epoll_event = undefined; - var events = [_]linux.epoll_event{events_one} ** 8; + var events: [8]linux.epoll_event = @splat(undefined); - err = linux.errno(linux.epoll_wait(@as(i32, @intCast(epoll_fd)), &events, 8, -1)); + err = linux.errno(linux.epoll_wait(@intCast(epoll_fd), &events, 8, -1)); try expect(err == .SUCCESS); } @@ -99,11 +98,11 @@ test "statx" { } try
expect(stat_buf.mode == statx_buf.mode); - try expect(@as(u32, @bitCast(stat_buf.uid)) == statx_buf.uid); - try expect(@as(u32, @bitCast(stat_buf.gid)) == statx_buf.gid); - try expect(@as(u64, @bitCast(@as(i64, stat_buf.size))) == statx_buf.size); - try expect(@as(u64, @bitCast(@as(i64, stat_buf.blksize))) == statx_buf.blksize); - try expect(@as(u64, @bitCast(@as(i64, stat_buf.blocks))) == statx_buf.blocks); + try expect(stat_buf.uid == statx_buf.uid); + try expect(stat_buf.gid == statx_buf.gid); + try expect(stat_buf.size == statx_buf.size); + try expect(stat_buf.blksize == statx_buf.blksize); + try expect(stat_buf.blocks == statx_buf.blocks); } test "user and group ids" { @@ -190,14 +189,14 @@ comptime { assert(256 == @as(u32, @bitCast(linux.FUTEX_OP{ .cmd = @enumFromInt(0), .private = false, .realtime = true }))); // Check futex_param4 union is packed correctly - const param_union = linux.futex_param4{ + const param_union: linux.futex_param4 = .{ .val2 = 0xaabbcc, }; assert(@intFromPtr(param_union.timeout) == 0xaabbcc); } test "futex v1" { - var lock: std.atomic.Value(u32) = std.atomic.Value(u32).init(1); + var lock: std.atomic.Value(u32) = .init(1); var rc: usize = 0; // No-op wait, lock value is not expected value @@ -230,14 +229,14 @@ test "futex v1" { const val3 = 1; const wake_nr = 3; const requeue_max = std.math.maxInt(u31); - var target_lock: std.atomic.Value(u32) = std.atomic.Value(u32).init(1); + var target_lock: std.atomic.Value(u32) = .init(1); rc = linux.futex(&lock.raw, .{ .cmd = .CMP_REQUEUE, .private = true }, wake_nr, .{ .val2 = requeue_max }, &target_lock.raw, val3); try expectEqual(0, rc); } // WAKE_OP - just to see if we can construct the arguments ... { - var lock2: std.atomic.Value(u32) = std.atomic.Value(u32).init(1); + var lock2: std.atomic.Value(u32) = .init(1); const wake1_nr = 2; const wake2_nr = 3; const wake_op = linux.FUTEX_WAKE_OP{ @@ -276,18 +275,18 @@ test "futex v1" { } comptime { - assert(2 == @as(u32, @bitCast(linux.FUTEX2_FLAGS{ .size = .U32, .private = false }))); - assert(128 == @as(u32, @bitCast(linux.FUTEX2_FLAGS{ .size = @enumFromInt(0), .private = true }))); + std.debug.assert(2 == @as(u32, @bitCast(linux.Futex2.Wait{ .size = .U32, .private = false }))); + std.debug.assert(128 == @as(u32, @bitCast(linux.Futex2.Wait{ .size = @enumFromInt(0), .private = true }))); } test "futex2_waitv" { - const locks = [_]std.atomic.Value(u32){ - std.atomic.Value(u32).init(1), - std.atomic.Value(u32).init(1), - std.atomic.Value(u32).init(1), + const locks: [3]std.atomic.Value(u32) = .{ + .init(1), + .init(1), + .init(1), }; - const futexes = [_]linux.futex2_waitone{ + const futexes: [3]linux.Futex2.WaitOne = .{ .{ .val = 1, .uaddr = @intFromPtr(&locks[0].raw), @@ -305,8 +304,9 @@ test "futex2_waitv" { }, }; - const timeout = linux.kernel_timespec{ .sec = 0, .nsec = 2 }; // absolute timeout, so this is 1970... - const rc = linux.futex2_waitv(&futexes, futexes.len, .{}, &timeout, .MONOTONIC); + // absolute timeout, so this is 1970... + const timeout: linux.kernel_timespec = .{ .sec = 0, .nsec = 2 }; + const rc = linux.futex2_waitv(futexes[0..], .{}, &timeout, .MONOTONIC); switch (linux.errno(rc)) { .NOSYS => return error.SkipZigTest, // futex2_waitv added in kernel v5.16 else => |err| try expectEqual(.TIMEDOUT, err), @@ -317,16 +317,16 @@ test "futex2_waitv" { // return ENOSYS. 
fn futex2_skip_if_unsupported() !void { const lock: u32 = 0; - const rc = linux.futex2_wake(&lock, 0, 1, .{ .size = .U32, .private = true }); + const rc = linux.futex2_wake(&lock, .empty, 1, .{ .size = .U32, .private = true }); if (linux.errno(rc) == .NOSYS) { return error.SkipZigTest; } } test "futex2_wait" { - var lock: std.atomic.Value(u32) = std.atomic.Value(u32).init(1); + var lock: std.atomic.Value(u32) = .init(1); var rc: usize = 0; - const mask = 0x1; + const mask: linux.Futex2.Bitset = .{ .waiter1 = true }; try futex2_skip_if_unsupported(); @@ -343,7 +343,7 @@ test "futex2_wait" { try expectEqual(.INVAL, linux.errno(rc)); } - const flags = linux.FUTEX2_FLAGS{ .size = .U32, .private = true }; + const flags: linux.Futex2.Wait = .{ .size = .U32, .private = true }; // no-wait, lock state mismatch rc = linux.futex2_wait(&lock.raw, 2, mask, flags, null, .MONOTONIC); try expectEqual(.AGAIN, linux.errno(rc)); @@ -372,23 +372,23 @@ test "futex2_wait" { } test "futex2_wake" { - var lock: std.atomic.Value(u32) = std.atomic.Value(u32).init(1); + var lock: std.atomic.Value(u32) = .init(1); try futex2_skip_if_unsupported(); - const rc = linux.futex2_wake(&lock.raw, 0xFF, 1, .{ .size = .U32, .private = true }); + const rc = linux.futex2_wake(&lock.raw, .fromInt(0xFF), 1, .{ .size = .U32, .private = true }); try expectEqual(0, rc); } test "futex2_requeue" { try futex2_skip_if_unsupported(); - const locks = [_]std.atomic.Value(u32){ - std.atomic.Value(u32).init(1), - std.atomic.Value(u32).init(1), + const locks: [2]std.atomic.Value(u32) = .{ + .init(1), + .init(1), }; - const futexes = [_]linux.futex2_waitone{ + const futexes: [2]linux.Futex2.WaitOne = .{ .{ .val = 1, .uaddr = @intFromPtr(&locks[0].raw), From 7a3cd45c327e65e4f39e39448668e6a44e0059fa Mon Sep 17 00:00:00 2001 From: Bernard Assan Date: Thu, 23 Oct 2025 17:31:00 +0000 Subject: [PATCH 32/41] IoUring: Working on Pipe2 flags Build on the extensive work already done IoUring: add fixed_fd_install, ftruncate, cmd_discard Working on IoUring pipe flags Signed-off-by: Bernard Assan --- lib/std/os/linux.zig | 163 ++++++++++++++++++++++++++++++++++- lib/std/os/linux/IoUring.zig | 123 +++++++++++++++++++++++++- 2 files changed, 281 insertions(+), 5 deletions(-) diff --git a/lib/std/os/linux.zig b/lib/std/os/linux.zig index 005a21a22f20..0705e1b9c3de 100644 --- a/lib/std/os/linux.zig +++ b/lib/std/os/linux.zig @@ -494,6 +494,161 @@ pub const O = switch (native_arch) { else => @compileError("missing std.os.linux.O constants for this architecture"), }; +pub const Pipe2 = switch (native_arch) { + .x86_64, .x86, .riscv32, .riscv64, .loongarch64 => packed struct(u32) { + _: u7 = 0, + EXCL: bool = false, // + _9: u3 = 0, + NONBLOCK: bool = false, // + _13: u2 = 0, + DIRECT: bool = false, // + _16: u4 = 0, + CLOEXEC: bool = false, // + _21: u12 = 0, + }, + .aarch64, .aarch64_be, .arm, .armeb, .thumb, .thumbeb => packed struct(u32) { + ACCMODE: ACCMODE = .RDONLY, + _2: u4 = 0, + CREAT: bool = false, + EXCL: bool = false, // + NOCTTY: bool = false, + TRUNC: bool = false, + APPEND: bool = false, + NONBLOCK: bool = false, // + DSYNC: bool = false, + ASYNC: bool = false, + DIRECTORY: bool = false, + NOFOLLOW: bool = false, + DIRECT: bool = false, // + LARGEFILE: bool = false, + NOATIME: bool = false, + CLOEXEC: bool = false, // + SYNC: bool = false, + PATH: bool = false, + TMPFILE: bool = false, + _23: u9 = 0, + }, + .sparc64 => packed struct(u32) { + ACCMODE: ACCMODE = .RDONLY, + _2: u1 = 0, + APPEND: bool = false, + _4: u2 = 0, + ASYNC: bool = false, + _7: 
u2 = 0, + CREAT: bool = false, + TRUNC: bool = false, + EXCL: bool = false, // + _12: u1 = 0, + DSYNC: bool = false, + NONBLOCK: bool = false, // + NOCTTY: bool = false, + DIRECTORY: bool = false, + NOFOLLOW: bool = false, + _18: u2 = 0, + DIRECT: bool = false, // + NOATIME: bool = false, + CLOEXEC: bool = false, // + SYNC: bool = false, + PATH: bool = false, + TMPFILE: bool = false, + _27: u6 = 0, + }, + .mips, .mipsel, .mips64, .mips64el => packed struct(u32) { + ACCMODE: ACCMODE = .RDONLY, + _2: u1 = 0, + APPEND: bool = false, + DSYNC: bool = false, + _5: u2 = 0, + NONBLOCK: bool = false, // + CREAT: bool = false, + TRUNC: bool = false, + EXCL: bool = false, // + NOCTTY: bool = false, + ASYNC: bool = false, + LARGEFILE: bool = false, + SYNC: bool = false, + DIRECT: bool = false, // + DIRECTORY: bool = false, + NOFOLLOW: bool = false, + NOATIME: bool = false, // + CLOEXEC: bool = false, + _20: u1 = 0, + PATH: bool = false, + TMPFILE: bool = false, + _23: u9 = 0, + }, + .powerpc, .powerpcle, .powerpc64, .powerpc64le => packed struct(u32) { + ACCMODE: ACCMODE = .RDONLY, + _2: u4 = 0, + CREAT: bool = false, + EXCL: bool = false, // + NOCTTY: bool = false, + TRUNC: bool = false, + APPEND: bool = false, + NONBLOCK: bool = false, // + DSYNC: bool = false, + ASYNC: bool = false, + DIRECTORY: bool = false, + NOFOLLOW: bool = false, + LARGEFILE: bool = false, + DIRECT: bool = false, // + NOATIME: bool = false, + CLOEXEC: bool = false, // + SYNC: bool = false, + PATH: bool = false, + TMPFILE: bool = false, + _23: u9 = 0, + }, + .hexagon, .or1k, .s390x => packed struct(u32) { + ACCMODE: ACCMODE = .RDONLY, + _2: u4 = 0, + CREAT: bool = false, + EXCL: bool = false, // + NOCTTY: bool = false, + TRUNC: bool = false, + APPEND: bool = false, + NONBLOCK: bool = false, // + DSYNC: bool = false, + ASYNC: bool = false, + DIRECT: bool = false, // + LARGEFILE: bool = false, + DIRECTORY: bool = false, + NOFOLLOW: bool = false, + NOATIME: bool = false, + CLOEXEC: bool = false, // + _20: u1 = 0, + PATH: bool = false, + _22: u10 = 0, + + // #define O_RSYNC 04010000 + // #define O_SYNC 04010000 + // #define O_TMPFILE 020200000 + // #define O_NDELAY O_NONBLOCK + }, + .m68k => packed struct(u32) { + ACCMODE: ACCMODE = .RDONLY, + _2: u4 = 0, + CREAT: bool = false, + EXCL: bool = false, // + NOCTTY: bool = false, + TRUNC: bool = false, + APPEND: bool = false, + NONBLOCK: bool = false, // + DSYNC: bool = false, + ASYNC: bool = false, + DIRECTORY: bool = false, + NOFOLLOW: bool = false, + DIRECT: bool = false, // + LARGEFILE: bool = false, + NOATIME: bool = false, + CLOEXEC: bool = false, // + _20: u1 = 0, + PATH: bool = false, + _22: u10 = 0, + }, + else => @compileError("missing std.os.linux.O constants for this architecture"), +}; + /// Set by startup code, used by `getauxval`. pub var elf_aux_maybe: ?[*]std.elf.Auxv = null; @@ -745,7 +900,6 @@ pub fn futex_4arg(uaddr: *const u32, futex_op: FUTEX_OP, val: u32, timeout: ?*co /// most recently woken, nor...) /// /// Requires at least kernel v5.16. -// TODO: can't we use slices here? and assert `Futex2.waitone_max` pub fn futex2_waitv( /// The length of `futexes` slice must not exceed `Futex2.waitone_max` futexes: []const Futex2.WaitOne, @@ -3757,8 +3911,11 @@ pub const Futex2 = struct { /// `Bitset` with all bits set for the FUTEX_xxx_BITSET OPs to request a /// match of any bit. 
matches FUTEX_BITSET_MATCH_ANY - pub const match_any: Bitset = @bitCast(@as(u64, 0x00000000ffffffff)); - /// Bitset must not be empty, this is only useful in test + pub const match_any: Bitset = @bitCast(@as(u64, 0x0000_0000_ffff_ffff)); + /// An empty `Bitset` will not wake any threads because the kernel + /// requires at least one bit to be set in the bitmask to identify + /// which waiters should be woken up. Therefore, no action will be + /// taken if the bitset is zero; `empty` is only useful in tests pub const empty: Bitset = .{}; /// Create from raw u64 value diff --git a/lib/std/os/linux/IoUring.zig b/lib/std/os/linux/IoUring.zig index fd31e6794ecb..b9b731a5b91c 100644 --- a/lib/std/os/linux/IoUring.zig +++ b/lib/std/os/linux/IoUring.zig @@ -1806,7 +1806,6 @@ pub fn futex_wait( return sqe; } -// TODO: ensure flags and Wait in futexv are correct /// Available since kernel 6.7 pub fn futex_waitv( @@ -1821,6 +1820,55 @@ +pub fn fixed_fd_install( + self: *IoUring, + user_data: u64, + fd: linux.fd_t, + flags: uflags.FixedFd, +) !*Sqe { + const sqe = try self.get_sqe(); + sqe.prep_fixed_fd_install(fd, flags); + sqe.user_data = user_data; + return sqe; +} + +pub fn ftruncate( + self: *IoUring, + user_data: u64, + fd: linux.fd_t, + offset: u64, +) !*Sqe { + const sqe = try self.get_sqe(); + sqe.prep_ftruncate(fd, offset); + sqe.user_data = user_data; + return sqe; +} + +pub fn cmd_discard( + self: *IoUring, + user_data: u64, + fd: linux.fd_t, + offset: u64, + nbytes: u64, +) !*Sqe { + const sqe = try self.get_sqe(); + sqe.prep_cmd_discard(fd, offset, nbytes); + sqe.user_data = user_data; + return sqe; +} + +pub fn pipe( + self: *IoUring, + user_data: u64, + fds: *[2]linux.fd_t, + flags: uflags., +) !*Sqe { + const sqe = try self.get_sqe(); + sqe.prep_pipe(fds, offset, nbytes); + sqe.user_data = user_data; + return sqe; +} + pub fn register_buffers_sparse(self: *IoUring, nr: u32) !void { assert(self.fd >= 0); @@ -2332,7 +2380,7 @@ pub const Sqe = extern struct { /// msg_flags | timeout_flags | accept_flags | cancel_flags | open_flags | /// statx_flags | fadvise_advice | splice_flags | rename_flags | /// unlink_flags | hardlink_flags xattr_flags | msg_ring_flags | - /// uring_cmd_flags | waitid_flags | futex_flags install_fd_flags | + /// uring_cmd_flags | waitid_flags | futex_flags | install_fd_flags | /// nop_flags | pipe_flags rw_flags: u32, /// data to be passed back at completion time @@ -3323,6 +3371,73 @@ pub const Sqe = extern struct { sqe.rw_flags = flags; } + pub fn prep_fixed_fd_install( + sqe: *Sqe, + fd: linux.fd_t, + flags: uflags.FixedFd, + ) void { + sqe.prep_rw( + .fixed_fd_install, + fd, + undefined, + 0, + 0, + ); + sqe.flags = .{ .fixed_file = true }; + sqe.rw_flags = @bitCast(flags); + } + + pub fn prep_ftruncate( + sqe: *Sqe, + fd: linux.fd_t, + offset: u64, + ) void { + sqe.prep_rw( + .ftruncate, + fd, + undefined, + 0, + offset, + ); + } + + pub fn prep_cmd_discard( + sqe: *Sqe, + fd: linux.fd_t, + offset: u64, + nbytes: u64, + ) void { + sqe.prep_rw( + .uring_cmd, + fd, + undefined, + 0, + 0, + ); + // sqe.off maps to sqe.cmd_op in liburing + sqe.off = constants.BLOCK_URING_CMD_DISCARD; + sqe.addr = offset; + sqe.addr3 = nbytes; + } + + pub fn prep_pipe( + sqe: *Sqe, + fd: linux.fd_t, + offset: u64, + nbytes: u64, + ) void { + sqe.prep_rw( + .uring_cmd, + fd, + undefined, + 0, + 0, + ); + // sqe.off maps to sqe.cmd_op in liburing + sqe.off = constants.BLOCK_URING_CMD_DISCARD; + sqe.addr = offset; + sqe.addr3 = nbytes; + }
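For orientation, the request builders added above follow the file's usual queue/submit/reap pattern. A sketch of driving the new `ftruncate` wrapper (an illustration, not part of the patch: `truncateViaRing` and its user_data tag are made up, and IORING_OP_FTRUNCATE itself needs a recent kernel, roughly v6.9+):

    const std = @import("std");
    const linux = std.os.linux;
    const IoUring = linux.IoUring;

    fn truncateViaRing(ring: *IoUring, fd: linux.fd_t, new_len: u64) !void {
        // Queue the request; user_data is an arbitrary tag echoed back in the CQE.
        _ = try ring.ftruncate(0xf7ca7e, fd, new_len);
        _ = try ring.submit();

        const cqe = try ring.copy_cqe();
        std.debug.assert(cqe.user_data == 0xf7ca7e);
        if (cqe.err() != .SUCCESS) return error.TruncateFailed;
    }

`fixed_fd_install` and `cmd_discard` are driven the same way; only the SQE fields they fill in differ.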
// TODO: maybe remove unused flag fields? pub fn prep_bind( sqe: *Sqe, @@ -4143,6 +4258,10 @@ pub const ZcrxIfqRegister = extern struct { // COMMIT: move IoUring constants to Constants pub const constants = struct { + /// io_uring block file commands, see IORING_OP_URING_CMD. + /// It's a different number space from ioctl(), reusing the block's code 0x12. + /// It is the value of ioctl.IO(0x12, 0) at runtime + pub const BLOCK_URING_CMD_DISCARD = 0x1200; /// If sqe.file_index (splice_fd_in in Zig Struct) is set to this for /// opcodes that instantiate a new an available direct descriptor instead /// of having the application pass one direct descriptor From c6b3844e754fd9d02f738d8f80f2e5b5e7b610b0 Mon Sep 17 00:00:00 2001 From: Bernard Assan Date: Thu, 23 Oct 2025 21:02:42 +0000 Subject: [PATCH 33/41] IoUring: implement pipe and pipe_direct operations Add dedicated Pipe2 flags for pipe2 as `O` contains fields that are invalid in that context Signed-off-by: Bernard Assan --- lib/std/os/linux.zig | 197 +++++++++++------------------------ lib/std/os/linux/IoUring.zig | 44 +++++--- 2 files changed, 95 insertions(+), 146 deletions(-) diff --git a/lib/std/os/linux.zig b/lib/std/os/linux.zig index 0705e1b9c3de..c3be48afd347 100644 --- a/lib/std/os/linux.zig +++ b/lib/std/os/linux.zig @@ -494,159 +494,88 @@ pub const O = switch (native_arch) { else => @compileError("missing std.os.linux.O constants for this architecture"), }; +/// flags for `pipe2` and `IoUring.pipe` pub const Pipe2 = switch (native_arch) { .x86_64, .x86, .riscv32, .riscv64, .loongarch64 => packed struct(u32) { _: u7 = 0, - EXCL: bool = false, // + notification_pipe: bool = false, _9: u3 = 0, - NONBLOCK: bool = false, // + nonblock: bool = false, _13: u2 = 0, - DIRECT: bool = false, // + direct: bool = false, _16: u4 = 0, - CLOEXEC: bool = false, // + cloexec: bool = false, _21: u12 = 0, }, .aarch64, .aarch64_be, .arm, .armeb, .thumb, .thumbeb => packed struct(u32) { - ACCMODE: ACCMODE = .RDONLY, - _2: u4 = 0, - CREAT: bool = false, - EXCL: bool = false, // - NOCTTY: bool = false, - TRUNC: bool = false, - APPEND: bool = false, - NONBLOCK: bool = false, // - DSYNC: bool = false, - ASYNC: bool = false, - DIRECTORY: bool = false, - NOFOLLOW: bool = false, - DIRECT: bool = false, // - LARGEFILE: bool = false, - NOATIME: bool = false, - CLOEXEC: bool = false, // - SYNC: bool = false, - PATH: bool = false, - TMPFILE: bool = false, - _23: u9 = 0, + _: u7 = 0, + notification_pipe: bool = false, + _9: u3 = 0, + nonblock: bool = false, + _13: u4 = 0, + direct: bool = false, + _18: u2 = 0, + cloexec: bool = false, + _21: u12 = 0, }, .sparc64 => packed struct(u32) { - ACCMODE: ACCMODE = .RDONLY, - _2: u1 = 0, - APPEND: bool = false, - _4: u2 = 0, - ASYNC: bool = false, - _7: u2 = 0, - CREAT: bool = false, - TRUNC: bool = false, - EXCL: bool = false, // - _12: u1 = 0, - DSYNC: bool = false, - NONBLOCK: bool = false, // - NOCTTY: bool = false, - DIRECTORY: bool = false, - NOFOLLOW: bool = false, - _18: u2 = 0, - DIRECT: bool = false, // - NOATIME: bool = false, - CLOEXEC: bool = false, // - SYNC: bool = false, - PATH: bool = false, - TMPFILE: bool = false, - _27: u6 = 0, + _: u11 = 0, + notification_pipe: bool = false, + _13: u2 = 0, + nonblock: bool = false, + _16: u5 = 0, + direct: bool = false, + _22: u1 = 0, + cloexec: bool = false, + _24: u9 = 0, }, .mips, .mipsel, .mips64, .mips64el => packed struct(u32) { - ACCMODE: ACCMODE = .RDONLY, - _2: u1 = 0, - APPEND: bool = false, - DSYNC: bool = false, - _5: u2 = 0, - NONBLOCK: bool = false, //
- CREAT: bool = false, - TRUNC: bool = false, - EXCL: bool = false, // - NOCTTY: bool = false, - ASYNC: bool = false, - LARGEFILE: bool = false, - SYNC: bool = false, - DIRECT: bool = false, // - DIRECTORY: bool = false, - NOFOLLOW: bool = false, - NOATIME: bool = false, // - CLOEXEC: bool = false, - _20: u1 = 0, - PATH: bool = false, - TMPFILE: bool = false, - _23: u9 = 0, + _: u7 = 0, + nonblock: bool = false, + _9: u2 = 0, + notification_pipe: bool = false, + _12: u4 = 0, + direct: bool = false, + _17: u3 = 0, + cloexec: bool = false, + _21: u12 = 0, }, .powerpc, .powerpcle, .powerpc64, .powerpc64le => packed struct(u32) { - ACCMODE: ACCMODE = .RDONLY, - _2: u4 = 0, - CREAT: bool = false, - EXCL: bool = false, // - NOCTTY: bool = false, - TRUNC: bool = false, - APPEND: bool = false, - NONBLOCK: bool = false, // - DSYNC: bool = false, - ASYNC: bool = false, - DIRECTORY: bool = false, - NOFOLLOW: bool = false, - LARGEFILE: bool = false, - DIRECT: bool = false, // - NOATIME: bool = false, - CLOEXEC: bool = false, // - SYNC: bool = false, - PATH: bool = false, - TMPFILE: bool = false, - _23: u9 = 0, + _: u7 = 0, + notification_pipe: bool = false, + _9: u3 = 0, + nonblock: bool = false, + _13: u5 = 0, + direct: bool = false, + _19: u1 = 0, + cloexec: bool = false, + _21: u12 = 0, }, + // matches `x86_64` above check if the full struct matches .hexagon, .or1k, .s390x => packed struct(u32) { - ACCMODE: ACCMODE = .RDONLY, - _2: u4 = 0, - CREAT: bool = false, - EXCL: bool = false, // - NOCTTY: bool = false, - TRUNC: bool = false, - APPEND: bool = false, - NONBLOCK: bool = false, // - DSYNC: bool = false, - ASYNC: bool = false, - DIRECT: bool = false, // - LARGEFILE: bool = false, - DIRECTORY: bool = false, - NOFOLLOW: bool = false, - NOATIME: bool = false, - CLOEXEC: bool = false, // - _20: u1 = 0, - PATH: bool = false, - _22: u10 = 0, - - // #define O_RSYNC 04010000 - // #define O_SYNC 04010000 - // #define O_TMPFILE 020200000 - // #define O_NDELAY O_NONBLOCK + _: u7 = 0, + notification_pipe: bool = false, + _9: u3 = 0, + nonblock: bool = false, + _13: u2 = 0, + direct: bool = false, + _16: u4 = 0, + cloexec: bool = false, + _21: u12 = 0, }, + // matches `aarch64` above check if the full struct matches .m68k => packed struct(u32) { - ACCMODE: ACCMODE = .RDONLY, - _2: u4 = 0, - CREAT: bool = false, - EXCL: bool = false, // - NOCTTY: bool = false, - TRUNC: bool = false, - APPEND: bool = false, - NONBLOCK: bool = false, // - DSYNC: bool = false, - ASYNC: bool = false, - DIRECTORY: bool = false, - NOFOLLOW: bool = false, - DIRECT: bool = false, // - LARGEFILE: bool = false, - NOATIME: bool = false, - CLOEXEC: bool = false, // - _20: u1 = 0, - PATH: bool = false, - _22: u10 = 0, + _: u7 = 0, + notification_pipe: bool = false, + _9: u3 = 0, + nonblock: bool = false, + _13: u4 = 0, + direct: bool = false, + _18: u2 = 0, + cloexec: bool = false, + _21: u12 = 0, }, - else => @compileError("missing std.os.linux.O constants for this architecture"), + else => @compileError("missing std.os.linux.Pipe2 flags for this architecture"), }; /// Set by startup code, used by `getauxval`. 
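Since the per-architecture packed structs above only encode where each flag bit lives, call sites stay uniform across targets. A sketch of the resulting `pipe2` usage, assuming the typed signature introduced in the hunk just below:

    const std = @import("std");
    const linux = std.os.linux;

    pub fn main() !void {
        var fds: [2]linux.fd_t = undefined;
        // Field names are stable even though the bit positions differ per arch.
        const rc = linux.pipe2(&fds, .{ .cloexec = true, .nonblock = true });
        if (linux.errno(rc) != .SUCCESS) return error.PipeFailed;
        defer {
            _ = linux.close(fds[0]);
            _ = linux.close(fds[1]);
        }
        std.debug.print("read end {d}, write end {d}\n", .{ fds[0], fds[1] });
    }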
@@ -1422,7 +1351,7 @@ pub fn pipe(fd: *[2]i32) usize { } } -pub fn pipe2(fd: *[2]i32, flags: O) usize { +pub fn pipe2(fd: *[2]i32, flags: Pipe2) usize { return syscall2(.pipe2, @intFromPtr(fd), @as(u32, @bitCast(flags))); } diff --git a/lib/std/os/linux/IoUring.zig b/lib/std/os/linux/IoUring.zig index b9b731a5b91c..f5b9b38d0178 100644 --- a/lib/std/os/linux/IoUring.zig +++ b/lib/std/os/linux/IoUring.zig @@ -1861,10 +1861,23 @@ pub fn pipe( self: *IoUring, user_data: u64, fds: *[2]linux.fd_t, - flags: uflags., + flags: linux.Pipe2, ) !*Sqe { const sqe = try self.get_sqe(); - sqe.prep_pipe(fds, offset, nbytes); + sqe.prep_pipe(fds, flags); + sqe.user_data = user_data; + return sqe; +} + +pub fn pipe_direct( + self: *IoUring, + user_data: u64, + fds: *[2]linux.fd_t, + flags: linux.Pipe2, + file_index: u32, +) !*Sqe { + const sqe = try self.get_sqe(); + sqe.prep_pipe_direct(fds, flags, file_index); sqe.user_data = user_data; return sqe; } @@ -3422,22 +3435,29 @@ pub const Sqe = extern struct { pub fn prep_pipe( sqe: *Sqe, - fd: linux.fd_t, - offset: u64, - nbytes: u64, + fds: *[2]linux.fd_t, + flags: linux.Pipe2, ) void { sqe.prep_rw( - .uring_cmd, - fd, - undefined, + .pipe, + 0, + @intFromPtr(fds), 0, 0, ); - // sqe.off maps to sqe.cmd_op in liburing - sqe.off = constants.BLOCK_URING_CMD_DISCARD; - sqe.addr = offset; - sqe.addr3 = nbytes; + sqe.rw_flags = @bitCast(flags); } + + pub fn prep_pipe_direct( + sqe: *Sqe, + fds: *[2]linux.fd_t, + flags: linux.Pipe2, + file_index: u32, + ) void { + sqe.prep_pipe(fds, flags); + sqe.set_target_fixed_file(file_index); + } + // TODO: maybe remove unused flag fields? pub fn prep_bind( sqe: *Sqe, From aea148c6063d253bb72a3552add14a2f017aa074 Mon Sep 17 00:00:00 2001 From: Bernard Assan Date: Fri, 24 Oct 2025 10:11:53 +0000 Subject: [PATCH 34/41] Merge archs that have the same Pipe2 flags Signed-off-by: Bernard Assan --- lib/std/os/linux.zig | 29 +++-------------------------- 1 file changed, 3 insertions(+), 26 deletions(-) diff --git a/lib/std/os/linux.zig b/lib/std/os/linux.zig index c3be48afd347..24dcc7dfd66e 100644 --- a/lib/std/os/linux.zig +++ b/lib/std/os/linux.zig @@ -495,8 +495,9 @@ pub const O = switch (native_arch) { }; /// flags for `pipe2` and `IoUring.pipe` +/// matches flags in `O` but specific to `pipe2` syscall pub const Pipe2 = switch (native_arch) { - .x86_64, .x86, .riscv32, .riscv64, .loongarch64 => packed struct(u32) { + .x86_64, .x86, .riscv32, .riscv64, .loongarch64, .hexagon, .or1k, .s390x => packed struct(u32) { _: u7 = 0, notification_pipe: bool = false, _9: u3 = 0, @@ -507,7 +508,7 @@ pub const Pipe2 = switch (native_arch) { cloexec: bool = false, _21: u12 = 0, }, - .aarch64, .aarch64_be, .arm, .armeb, .thumb, .thumbeb => packed struct(u32) { + .aarch64, .aarch64_be, .arm, .armeb, .thumb, .thumbeb, .m68k => packed struct(u32) { _: u7 = 0, notification_pipe: bool = false, _9: u3 = 0, @@ -551,30 +552,6 @@ pub const Pipe2 = switch (native_arch) { cloexec: bool = false, _21: u12 = 0, }, - // matches `x86_64` above check if the full struct matches - .hexagon, .or1k, .s390x => packed struct(u32) { - _: u7 = 0, - notification_pipe: bool = false, - _9: u3 = 0, - nonblock: bool = false, - _13: u2 = 0, - direct: bool = false, - _16: u4 = 0, - cloexec: bool = false, - _21: u12 = 0, - }, - // matches `aarch64` above check if the full struct matches - .m68k => packed struct(u32) { - _: u7 = 0, - notification_pipe: bool = false, - _9: u3 = 0, - nonblock: bool = false, - _13: u4 = 0, - direct: bool = false, - _18: u2 = 0, - cloexec: bool = 
false, - _21: u12 = 0, - }, else => @compileError("missing std.os.linux.Pipe2 flags for this architecture"), }; From f4b1bde4fc45d06b6319f899af7677d67983b68e Mon Sep 17 00:00:00 2001 From: Bernard Assan Date: Fri, 24 Oct 2025 11:54:28 +0000 Subject: [PATCH 35/41] Use linux types directly since IoUring is only supported on Linux Use explicit array types where possible Signed-off-by: Bernard Assan --- lib/std/os/linux/IoUring.zig | 291 ++++++++++++++++++----------------- 1 file changed, 150 insertions(+), 141 deletions(-) diff --git a/lib/std/os/linux/IoUring.zig b/lib/std/os/linux/IoUring.zig index f5b9b38d0178..49a5303df1cb 100644 --- a/lib/std/os/linux/IoUring.zig +++ b/lib/std/os/linux/IoUring.zig @@ -11,7 +11,7 @@ const testing = std.testing; const is_linux = builtin.os.tag == .linux; const page_size_min = std.heap.page_size_min; -fd: posix.fd_t = -1, +fd: linux.fd_t = -1, sq: Sq, cq: Cq, flags: uflags.Setup, @@ -420,9 +420,9 @@ pub fn enter_flags(self: *IoUring) uflags.Enter { pub fn splice( self: *IoUring, user_data: u64, - fd_in: posix.fd_t, + fd_in: linux.fd_t, off_in: u64, - fd_out: posix.fd_t, + fd_out: linux.fd_t, off_out: u64, len: usize, ) !*Sqe { @@ -437,8 +437,8 @@ pub fn tee( self: *IoUring, user_data: u64, - fd_in: posix.fd_t, - fd_out: posix.fd_t, + fd_in: linux.fd_t, + fd_out: linux.fd_t, len: usize, ) !*Sqe { const sqe = try self.get_sqe(); @@ -459,7 +459,7 @@ pub fn tee( pub fn read( self: *IoUring, user_data: u64, - fd: posix.fd_t, + fd: linux.fd_t, buffer: ReadBuffer, offset: u64, ) !*Sqe { @@ -565,8 +565,8 @@ pub fn write_fixed( pub fn recvmsg( self: *IoUring, user_data: u64, - fd: posix.fd_t, - msg: *posix.msghdr, + fd: linux.fd_t, + msg: *linux.msghdr, flags: linux.Msg, ) !*Sqe { const sqe = try self.get_sqe(); @@ -580,8 +580,8 @@ pub fn recvmsg_multishot( self: *IoUring, user_data: u64, - fd: posix.fd_t, - msg: *posix.msghdr, + fd: linux.fd_t, + msg: *linux.msghdr, flags: linux.Msg, ) !*Sqe { const sqe = try self.get_sqe(); @@ -596,8 +596,8 @@ pub fn sendmsg( self: *IoUring, user_data: u64, - fd: posix.fd_t, - msg: *const posix.msghdr_const, + fd: linux.fd_t, + msg: *const linux.msghdr_const, flags: linux.Msg, ) !*Sqe { const sqe = try self.get_sqe(); @@ -611,7 +611,7 @@ pub fn poll_add( self: *IoUring, user_data: u64, - fd: posix.fd_t, + fd: linux.fd_t, poll_mask: linux.Epoll, ) !*Sqe { const sqe = try self.get_sqe(); @@ -625,7 +625,7 @@ pub fn poll_multishot( self: *IoUring, user_data: u64, - fd: posix.fd_t, + fd: linux.fd_t, poll_mask: linux.Epoll, ) !*Sqe { const sqe = try self.poll_add(user_data, fd, poll_mask); @@ -675,7 +675,7 @@ pub fn poll_update( /// You should preferably use `link_with_next_sqe()` on a write's SQE to link /// it with an fsync, or else insert a full write barrier using /// `drain_previous_sqes()` when queueing an fsync.
-pub fn fsync(self: *IoUring, user_data: u64, fd: posix.fd_t, flags: uflags.Fsync) !*Sqe { +pub fn fsync(self: *IoUring, user_data: u64, fd: linux.fd_t, flags: uflags.Fsync) !*Sqe { const sqe = try self.get_sqe(); sqe.prep_fsync(fd, flags); sqe.user_data = user_data; @@ -765,8 +765,8 @@ pub fn accept( self: *IoUring, user_data: u64, fd: linux.fd_t, - addr: ?*posix.sockaddr, - addrlen: ?*posix.socklen_t, + addr: ?*linux.sockaddr, + addrlen: ?*linux.socklen_t, flags: linux.Sock, ) !*Sqe { const sqe = try self.get_sqe(); @@ -790,8 +790,8 @@ pub fn accept_direct( self: *IoUring, user_data: u64, fd: linux.fd_t, - addr: ?*posix.sockaddr, - addrlen: ?*posix.socklen_t, + addr: ?*linux.sockaddr, + addrlen: ?*linux.socklen_t, flags: linux.Sock, ) !*Sqe { const sqe = try self.get_sqe(); @@ -811,9 +811,9 @@ pub fn accept_direct( pub fn accept_multishot( self: *IoUring, user_data: u64, - fd: posix.fd_t, - addr: ?*posix.sockaddr, - addrlen: ?*posix.socklen_t, + fd: linux.fd_t, + addr: ?*linux.sockaddr, + addrlen: ?*linux.socklen_t, flags: linux.Sock, ) !*Sqe { const sqe = try self.get_sqe(); @@ -828,8 +828,8 @@ pub fn accept_multishot_direct( self: *IoUring, user_data: u64, fd: linux.fd_t, - addr: ?*posix.sockaddr, - addrlen: ?*posix.socklen_t, + addr: ?*linux.sockaddr, + addrlen: ?*linux.socklen_t, flags: linux.Sock, ) !*Sqe { const sqe = try self.get_sqe(); @@ -904,8 +904,8 @@ pub fn connect( self: *IoUring, user_data: u64, fd: linux.fd_t, - addr: *const posix.sockaddr, - addrlen: posix.socklen_t, + addr: *const linux.sockaddr, + addrlen: linux.socklen_t, ) !*Sqe { const sqe = try self.get_sqe(); sqe.prep_connect(fd, addr, addrlen); @@ -919,9 +919,9 @@ pub fn connect( pub fn bind( self: *IoUring, user_data: u64, - fd: posix.fd_t, - addr: *const posix.sockaddr, - addrlen: posix.socklen_t, + fd: linux.fd_t, + addr: *const linux.sockaddr, + addrlen: linux.socklen_t, // liburing doesn't have this flag, hence 0 should be passed // TODO: consider removing this and all flags like this flags: u32, @@ -938,7 +938,7 @@ pub fn bind( pub fn listen( self: *IoUring, user_data: u64, - fd: posix.fd_t, + fd: linux.fd_t, backlog: usize, // liburing doesn't have this flag, hence 0 should be passed // TODO: consider removing this and all flags like this @@ -955,7 +955,7 @@ pub fn listen( pub fn epoll_wait( self: *IoUring, user_data: u64, - fd: posix.fd_t, + fd: linux.fd_t, events: ?*linux.epoll_event, max_events: u32, flags: linux.Epoll, @@ -999,7 +999,7 @@ pub fn files_update( pub fn fallocate( self: *IoUring, user_data: u64, - fd: posix.fd_t, + fd: linux.fd_t, mode: i32, offset: u64, len: u64, @@ -1016,10 +1016,10 @@ pub fn fallocate( pub fn openat( self: *IoUring, user_data: u64, - fd: posix.fd_t, + fd: linux.fd_t, path: [*:0]const u8, flags: linux.O, - mode: posix.mode_t, + mode: linux.mode_t, ) !*Sqe { const sqe = try self.get_sqe(); sqe.prep_openat(fd, path, flags, mode); @@ -1042,10 +1042,10 @@ pub fn openat( pub fn openat_direct( self: *IoUring, user_data: u64, - fd: posix.fd_t, + fd: linux.fd_t, path: [*:0]const u8, flags: linux.O, - mode: posix.mode_t, + mode: linux.mode_t, file_index: u32, ) !*Sqe { const sqe = try self.get_sqe(); @@ -1061,7 +1061,7 @@ pub fn open( user_data: u64, path: [*:0]const u8, flags: linux.O, - mode: posix.mode_t, + mode: linux.mode_t, ) !*Sqe { const sqe = try self.get_sqe(); sqe.prep_openat(linux.At.fdcwd, path, flags, mode); @@ -1083,7 +1083,7 @@ pub fn open_direct( user_data: u64, path: [*:0]const u8, flags: linux.O, - mode: posix.mode_t, + mode: linux.mode_t, file_index: u32, ) 
!*Sqe { const sqe = try self.get_sqe(); @@ -1095,7 +1095,7 @@ pub fn open_direct( /// Queues (but does not submit) an SQE to perform a `close(2)`. /// Returns a pointer to the SQE. /// Available since 5.6. -pub fn close(self: *IoUring, user_data: u64, fd: posix.fd_t) !*Sqe { +pub fn close(self: *IoUring, user_data: u64, fd: linux.fd_t) !*Sqe { const sqe = try self.get_sqe(); sqe.prep_close(fd); sqe.user_data = user_data; @@ -1116,7 +1116,7 @@ pub fn close_direct(self: *IoUring, user_data: u64, file_index: u32) !*Sqe { pub fn statx( self: *IoUring, user_data: u64, - fd: posix.fd_t, + fd: linux.fd_t, path: [:0]const u8, flags: linux.At, mask: linux.Statx.Mask, @@ -1134,7 +1134,7 @@ pub fn statx( pub fn fadvice( self: *IoUring, user_data: u64, - fd: posix.fd_t, + fd: linux.fd_t, offset: u64, len: u32, advice: linux.Fadvice, @@ -1165,7 +1165,7 @@ pub fn madvice( pub fn send( self: *IoUring, user_data: u64, - sockfd: posix.fd_t, + sockfd: linux.fd_t, buffer: []const u8, flags: linux.Msg, ) !*Sqe { @@ -1180,7 +1180,7 @@ pub fn send( pub fn send_bundle( self: *IoUring, user_data: u64, - sockfd: posix.fd_t, + sockfd: linux.fd_t, len: u64, flags: linux.Msg, ) !*Sqe { @@ -1195,7 +1195,7 @@ pub fn send_bundle( pub fn send_to( self: *IoUring, user_data: u64, - sockfd: posix.fd_t, + sockfd: linux.fd_t, buffer: []const u8, flags: linux.Msg, addr: *const linux.sockaddr, @@ -1225,7 +1225,7 @@ pub fn send_to( pub fn send_zc( self: *IoUring, user_data: u64, - sockfd: posix.fd_t, + sockfd: linux.fd_t, buffer: []const u8, send_flags: linux.Msg, zc_flags: Sqe.SendRecv, @@ -1242,7 +1242,7 @@ pub fn send_zc( pub fn send_zc_fixed( self: *IoUring, user_data: u64, - sockfd: posix.fd_t, + sockfd: linux.fd_t, buffer: []const u8, send_flags: linux.Msg, zc_flags: Sqe.SendRecv, @@ -1260,8 +1260,8 @@ pub fn send_zc_fixed( pub fn sendmsg_zc( self: *IoUring, user_data: u64, - fd: posix.fd_t, - msg: *const posix.msghdr_const, + fd: linux.fd_t, + msg: *const linux.msghdr_const, flags: linux.Msg, ) !*Sqe { const sqe = try self.get_sqe(); @@ -1275,8 +1275,8 @@ pub fn sendmsg_zc( pub fn sendmsg_zc_fixed( self: *IoUring, user_data: u64, - fd: posix.fd_t, - msg: *const posix.msghdr_const, + fd: linux.fd_t, + msg: *const linux.msghdr_const, flags: linux.Msg, buf_index: u16, ) !*Sqe { @@ -1377,7 +1377,7 @@ pub fn remove_buffers( pub fn shutdown( self: *IoUring, user_data: u64, - sockfd: posix.socket_t, + sockfd: linux.socket_t, how: linux.Shut, ) !*Sqe { const sqe = try self.get_sqe(); @@ -1391,7 +1391,7 @@ pub fn shutdown( pub fn unlinkat( self: *IoUring, user_data: u64, - dir_fd: posix.fd_t, + dir_fd: linux.fd_t, path: [*:0]const u8, flags: linux.At, ) !*Sqe { @@ -1417,9 +1417,9 @@ pub fn unlink( pub fn renameat( self: *IoUring, user_data: u64, - old_dir_fd: posix.fd_t, + old_dir_fd: linux.fd_t, old_path: [*:0]const u8, - new_dir_fd: posix.fd_t, + new_dir_fd: linux.fd_t, new_path: [*:0]const u8, flags: linux.Rename, ) !*Sqe { @@ -1446,7 +1446,7 @@ pub fn rename( pub fn sync_file_range( self: *IoUring, user_data: u64, - fd: posix.fd_t, + fd: linux.fd_t, len: u32, offset: u64, flags: linux.SyncFileRange, // TODO: add flags @@ -1462,9 +1462,9 @@ pub fn sync_file_range( pub fn mkdirat( self: *IoUring, user_data: u64, - dir_fd: posix.fd_t, + dir_fd: linux.fd_t, path: [*:0]const u8, - mode: posix.mode_t, + mode: linux.mode_t, ) !*Sqe { const sqe = try self.get_sqe(); sqe.prep_mkdirat(dir_fd, path, mode); @@ -1478,7 +1478,7 @@ pub fn mkdir( self: *IoUring, user_data: u64, path: [*:0]const u8, - mode: posix.mode_t, + mode: linux.mode_t, 
) !*Sqe { return try self.mkdirat(user_data, linux.At.fdcwd, path, mode); } @@ -1489,7 +1489,7 @@ pub fn symlinkat( self: *IoUring, user_data: u64, target: [*:0]const u8, - new_dir_fd: posix.fd_t, + new_dir_fd: linux.fd_t, link_path: [*:0]const u8, ) !*Sqe { const sqe = try self.get_sqe(); @@ -1514,9 +1514,9 @@ pub fn symlink( pub fn linkat( self: *IoUring, user_data: u64, - old_dir_fd: posix.fd_t, + old_dir_fd: linux.fd_t, old_path: [*:0]const u8, - new_dir_fd: posix.fd_t, + new_dir_fd: linux.fd_t, new_path: [*:0]const u8, flags: linux.At, ) !*Sqe { @@ -1924,7 +1924,7 @@ pub fn unregister_buffers(self: *IoUring) !void { /// * replacing an existing entry with a new fd /// /// Adding new file descriptors must be done with `register_files`. -pub fn register_files_update(self: *IoUring, offset: u32, fds: []const posix.fd_t) !void { +pub fn register_files_update(self: *IoUring, offset: u32, fds: []const linux.fd_t) !void { assert(self.fd >= 0); var update = mem.zeroInit(RsrcUpdate, .{ @@ -2539,7 +2539,7 @@ pub const Sqe = extern struct { pub fn prep_writev( sqe: *Sqe, fd: linux.fd_t, - iovecs: []const std.posix.iovec_const, + iovecs: []const posix.iovec_const, offset: u64, ) void { sqe.prep_rw(.writev, fd, @intFromPtr(iovecs.ptr), iovecs.len, offset); @@ -2574,7 +2574,7 @@ pub const Sqe = extern struct { pub fn prep_readv( sqe: *Sqe, fd: linux.fd_t, - iovecs: []const std.posix.iovec, + iovecs: []const posix.iovec, offset: u64, ) void { sqe.prep_rw(.readv, fd, @intFromPtr(iovecs.ptr), iovecs.len, offset); @@ -2594,7 +2594,7 @@ pub const Sqe = extern struct { pub fn prep_readv_fixed( sqe: *Sqe, fd: linux.fd_t, - iovecs: []const std.posix.iovec, + iovecs: []const posix.iovec, offset: u64, buffer_index: u16, ) void { @@ -3044,7 +3044,7 @@ pub const Sqe = extern struct { pub fn prep_sync_file_range( sqe: *Sqe, - fd: posix.fd_t, + fd: linux.fd_t, len: u32, offset: u64, flags: linux.SyncFileRange, // TODO: add flags @@ -3548,7 +3548,7 @@ pub const Sq = struct { _: u29 = 0, }; - pub fn init(fd: posix.fd_t, p: Params) !Sq { + pub fn init(fd: linux.fd_t, p: Params) !Sq { assert(fd >= 0); assert(p.features.single_mmap); const size = @max( @@ -3620,7 +3620,7 @@ pub const Cq = struct { _: u31 = 0, }; - pub fn init(fd: posix.fd_t, p: Params, sq: Sq) !Cq { + pub fn init(fd: linux.fd_t, p: Params, sq: Sq) !Cq { assert(fd >= 0); assert(p.features.single_mmap); const mmap = sq.mmap; @@ -3724,7 +3724,7 @@ pub const BufferGroup = struct { pub fn read_multishot( self: *BufferGroup, user_data: u64, - fd: posix.fd_t, + fd: linux.fd_t, nbytes: u32, offset: u64, ) !*Sqe { @@ -3740,7 +3740,7 @@ pub const BufferGroup = struct { pub fn recv( self: *BufferGroup, user_data: u64, - fd: posix.fd_t, + fd: linux.fd_t, flags: linux.Msg, ) !*Sqe { var sqe = try self.ring.get_sqe(); @@ -3757,7 +3757,7 @@ pub const BufferGroup = struct { pub fn recv_multishot( self: *BufferGroup, user_data: u64, - fd: posix.fd_t, + fd: linux.fd_t, flags: linux.Msg, ) !*Sqe { var sqe = try self.recv(user_data, fd, flags); @@ -4867,13 +4867,15 @@ test "readv" { // https://github.com/torvalds/linux/blob/v5.4/fs/io_uring.c#L3119-L3124 vs // https://github.com/torvalds/linux/blob/v5.8/fs/io_uring.c#L6687-L6691 // We therefore avoid stressing sparse fd sets here: - var registered_fds = [_]linux.fd_t{0} ** 1; + var registered_fds: [1]linux.fd_t = @splat(0); const fd_index = 0; registered_fds[fd_index] = fd; try ring.register_files(registered_fds[0..]); - var buffer = [_]u8{42} ** 128; - var iovecs = [_]posix.iovec{posix.iovec{ .base = &buffer, .len = 
buffer.len }}; + var buffer: [128]u8 = @splat(42); + var iovecs: [1]posix.iovec = .{ + .{ .base = &buffer, .len = buffer.len }, + }; const sqe = try ring.read(0xcccccccc, fd_index, .{ .iovecs = iovecs[0..] }, 0); try testing.expectEqual(Op.readv, sqe.opcode); sqe.flags.fixed_file = true; @@ -4885,7 +4887,8 @@ test "readv" { .res = buffer.len, .flags = .{}, }, try ring.copy_cqe()); - try testing.expectEqualSlices(u8, &([_]u8{0} ** buffer.len), buffer[0..]); + const empty: [buffer.len]u8 = @splat(0); + try testing.expectEqualSlices(u8, empty[0..], buffer[0..]); try ring.unregister_files(); } @@ -4908,13 +4911,14 @@ test "writev/fsync/readv" { defer file.close(); const fd = file.handle; - const buffer_write = [_]u8{42} ** 128; - const iovecs_write = [_]posix.iovec_const{ - posix.iovec_const{ .base = &buffer_write, .len = buffer_write.len }, + const buffer_write: [128]u8 = @splat(42); + const iovecs_write: [1]posix.iovec_const = .{ + .{ .base = &buffer_write, .len = buffer_write.len }, }; - var buffer_read = [_]u8{0} ** 128; - var iovecs_read = [_]posix.iovec{ - posix.iovec{ .base = &buffer_read, .len = buffer_read.len }, + + var buffer_read: [128]u8 = @splat(0); + var iovecs_read: [1]posix.iovec = .{ + .{ .base = &buffer_read, .len = buffer_read.len }, }; const sqe_writev = try ring.write(0xdddddddd, fd, .{ .iovecs = iovecs_write[0..] }, 17); @@ -4977,8 +4981,8 @@ test "write/read" { defer file.close(); const fd = file.handle; - const buffer_write = [_]u8{97} ** 20; - var buffer_read = [_]u8{98} ** 20; + const buffer_write: [20]u8 = @splat(97); + var buffer_read: [20]u8 = @splat(98); const sqe_write = try ring.write(0x11111111, fd, .{ .buffer = buffer_write[0..] }, 10); try testing.expectEqual(Op.write, sqe_write.opcode); try testing.expectEqual(10, sqe_write.off); @@ -5028,8 +5032,8 @@ test "splice/read" { defer file_dst.close(); const fd_dst = file_dst.handle; - const buffer_write = [_]u8{97} ** 20; - var buffer_read = [_]u8{98} ** 20; + const buffer_write: [20]u8 = @splat(97); + var buffer_read: [20]u8 = @splat(98); _ = try file_src.write(&buffer_write); const fds = try posix.pipe(); @@ -5101,7 +5105,7 @@ test "write_fixed/read_fixed" { @memset(&raw_buffers[0], 'z'); raw_buffers[0][0.."foobar".len].* = "foobar".*; - var buffers = [2]posix.iovec{ + var buffers: [2]posix.iovec = .{ .{ .base = &raw_buffers[0], .len = raw_buffers[0].len }, .{ .base = &raw_buffers[1], .len = raw_buffers[1].len }, }; @@ -5166,7 +5170,7 @@ test "openat" { } else @intFromPtr(path); const flags: linux.O = .{ .CLOEXEC = true, .ACCMODE = .RDWR, .CREAT = true }; - const mode: posix.mode_t = 0o666; + const mode: linux.mode_t = 0o666; const sqe_openat = try ring.openat(0x33333333, tmp.dir.fd, path, flags, mode); try testing.expectEqual(Sqe{ .opcode = .openat, @@ -5299,9 +5303,9 @@ test "sendmsg/recvmsg" { const client = try posix.socket(address_server.family, posix.SOCK.DGRAM, 0); defer posix.close(client); - const buffer_send = [_]u8{42} ** 128; - const iovecs_send = [_]posix.iovec_const{ - posix.iovec_const{ .base = &buffer_send, .len = buffer_send.len }, + const buffer_send: [128]u8 = @splat(42); + const iovecs_send: [1]posix.iovec_const = .{ + .{ .base = &buffer_send, .len = buffer_send.len }, }; const msg_send: linux.msghdr_const = .{ .name = addrAny(&address_server), @@ -5317,9 +5321,9 @@ test "sendmsg/recvmsg" { try testing.expectEqual(Op.sendmsg, sqe_sendmsg.opcode); try testing.expectEqual(client, sqe_sendmsg.fd); - var buffer_recv = [_]u8{0} ** 128; - var iovecs_recv = [_]posix.iovec{ - posix.iovec{ .base = 
&buffer_recv, .len = buffer_recv.len }, + var buffer_recv: [128]u8 = @splat(0); + var iovecs_recv: [1]posix.iovec = .{ + .{ .base = &buffer_recv, .len = buffer_recv.len }, }; var address_recv: linux.sockaddr.in = .{ .port = 0, @@ -5704,7 +5708,7 @@ test "register_files_update" { const fd = try posix.openZ("/dev/zero", .{ .ACCMODE = .RDONLY, .CLOEXEC = true }, 0); defer posix.close(fd); - var registered_fds = [_]linux.fd_t{0} ** 2; + var registered_fds: [2]linux.fd_t = @splat(0); const fd_index = 0; const fd_index2 = 1; registered_fds[fd_index] = fd; @@ -5726,7 +5730,7 @@ test "register_files_update" { registered_fds[fd_index2] = -1; try ring.register_files_update(0, registered_fds[0..]); - var buffer = [_]u8{42} ** 128; + var buffer: [128]u8 = @splat(42); { const sqe = try ring.read(0xcccccccc, fd_index, .{ .buffer = &buffer }, 0); try testing.expectEqual(Op.read, sqe.opcode); @@ -5738,7 +5742,8 @@ test "register_files_update" { .res = buffer.len, .flags = .{}, }, try ring.copy_cqe()); - try testing.expectEqualSlices(u8, &([_]u8{0} ** buffer.len), buffer[0..]); + const empty: [buffer.len]u8 = @splat(0); + try testing.expectEqualSlices(u8, empty[0..], buffer[0..]); } // Test with a non-zero offset @@ -5759,7 +5764,9 @@ test "register_files_update" { .res = buffer.len, .flags = .{}, }, try ring.copy_cqe()); - try testing.expectEqualSlices(u8, &([_]u8{0} ** buffer.len), buffer[0..]); + const empty: [buffer.len]u8 = @splat(0); + + try testing.expectEqualSlices(u8, empty[0..], buffer[0..]); } try ring.register_files_update(0, registered_fds[0..]); @@ -6373,7 +6380,8 @@ test "provide_buffers: accept/connect/send/recv" { { var i: usize = 0; while (i < buffers.len) : (i += 1) { - _ = try ring.send(0xdeaddead, socket_test_harness.server, &([_]u8{'z'} ** buffer_len), .{}); + const zz_buffer: [buffer_len]u8 = @splat('z'); + _ = try ring.send(0xdeaddead, socket_test_harness.server, zz_buffer[0..], .{}); try testing.expectEqual(1, try ring.submit()); } @@ -6455,7 +6463,8 @@ test "provide_buffers: accept/connect/send/recv" { // Redo 1 send on the server socket { - _ = try ring.send(0xdeaddead, socket_test_harness.server, &([_]u8{'w'} ** buffer_len), .{}); + const ww_buffer: [buffer_len]u8 = @splat('w'); + _ = try ring.send(0xdeaddead, socket_test_harness.server, ww_buffer[0..], .{}); try testing.expectEqual(1, try ring.submit()); _ = try ring.copy_cqe(); @@ -6515,8 +6524,8 @@ fn createSocketTestHarness(ring: *IoUring) !SocketTestHarness { errdefer posix.close(listener_socket); // Submit 1 accept - var accept_addr: posix.sockaddr = undefined; - var accept_addr_len: posix.socklen_t = @sizeOf(@TypeOf(accept_addr)); + var accept_addr: linux.sockaddr = undefined; + var accept_addr_len: linux.socklen_t = @sizeOf(@TypeOf(accept_addr)); _ = try ring.accept(0xaaaaaaaa, listener_socket, &accept_addr, &accept_addr_len, .{}); // Create a TCP client socket @@ -6592,8 +6601,8 @@ test "accept multishot" { defer posix.close(listener_socket); // submit multishot accept operation - var addr: posix.sockaddr = undefined; - var addr_len: posix.socklen_t = @sizeOf(@TypeOf(addr)); + var addr: linux.sockaddr = undefined; + var addr_len: linux.socklen_t = @sizeOf(@TypeOf(addr)); const userdata: u64 = 0xaaaaaaaa; _ = try ring.accept_multishot(userdata, listener_socket, &addr, &addr_len, .{}); try testing.expectEqual(1, try ring.submit()); @@ -6629,8 +6638,8 @@ test "accept/connect/send_zc/recv" { const socket_test_harness = try createSocketTestHarness(&ring); defer socket_test_harness.close(); - const buffer_send = [_]u8{ 0, 1, 
2, 3, 4, 5, 6, 7, 8, 9, 0xa, 0xb, 0xc, 0xd, 0xe }; - var buffer_recv = [_]u8{0} ** 10; + const buffer_send: [15]u8 = .{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0xa, 0xb, 0xc, 0xd, 0xe }; + var buffer_recv: [10]u8 = @splat(0); // zero-copy send const sqe_send = try ring.send_zc(0xeeeeeeee, socket_test_harness.client, buffer_send[0..], .{}, .{}); @@ -6687,7 +6696,7 @@ test "accept_direct" { }; // register direct file descriptors - var registered_fds = [_]linux.fd_t{-1} ** 2; + var registered_fds: [2]linux.fd_t = @splat(-1); try ring.register_files(registered_fds[0..]); const listener_socket = try createListenerSocket(&address); @@ -6713,7 +6722,7 @@ test "accept_direct" { // accept completion const cqe_accept = try ring.copy_cqe(); - try testing.expectEqual(posix.E.SUCCESS, cqe_accept.err()); + try testing.expectEqual(linux.E.SUCCESS, cqe_accept.err()); const fd_index = cqe_accept.res; try testing.expect(fd_index < registered_fds.len); try testing.expect(cqe_accept.user_data == accept_userdata); @@ -6748,7 +6757,7 @@ test "accept_direct" { // completion with error const cqe_accept = try ring.copy_cqe(); try testing.expect(cqe_accept.user_data == accept_userdata); - try testing.expectEqual(posix.E.NFILE, cqe_accept.err()); + try testing.expectEqual(linux.E.NFILE, cqe_accept.err()); } // return file descriptors to kernel try ring.register_files_update(0, registered_fds[0..]); @@ -6776,7 +6785,7 @@ test "accept_multishot_direct" { .addr = @bitCast([4]u8{ 127, 0, 0, 1 }), }; - var registered_fds = [_]linux.fd_t{-1} ** 2; + var registered_fds: [2]linux.fd_t = @splat(-1); try ring.register_files(registered_fds[0..]); const listener_socket = try createListenerSocket(&address); @@ -6813,7 +6822,7 @@ test "accept_multishot_direct" { // completion with error const cqe_accept = try ring.copy_cqe(); try testing.expect(cqe_accept.user_data == accept_userdata); - try testing.expectEqual(posix.E.NFILE, cqe_accept.err()); + try testing.expectEqual(linux.E.NFILE, cqe_accept.err()); try testing.expect(!cqe_accept.flags.f_more); // has more is not set } // return file descriptors to kernel @@ -6838,7 +6847,7 @@ test "socket" { // test completion var cqe = try ring.copy_cqe(); - try testing.expectEqual(posix.E.SUCCESS, cqe.err()); + try testing.expectEqual(linux.E.SUCCESS, cqe.err()); const fd: linux.fd_t = @intCast(cqe.res); try testing.expect(fd > 2); @@ -6855,21 +6864,21 @@ test "socket_direct/socket_direct_alloc/close_direct" { }; defer ring.deinit(); - var registered_fds = [_]linux.fd_t{-1} ** 3; + var registered_fds: [3]linux.fd_t = @splat(-1); try ring.register_files(registered_fds[0..]); // create socket in registered file descriptor at index 0 (last param) _ = try ring.socket_direct(0, .inet, .{ .type = .stream }, .default, 0, 0); try testing.expectEqual(1, try ring.submit()); var cqe_socket = try ring.copy_cqe(); - try testing.expectEqual(posix.E.SUCCESS, cqe_socket.err()); + try testing.expectEqual(linux.E.SUCCESS, cqe_socket.err()); try testing.expect(cqe_socket.res == 0); // create socket in registered file descriptor at index 1 (last param) _ = try ring.socket_direct(0, .inet, .{ .type = .stream }, .default, 0, 1); try testing.expectEqual(1, try ring.submit()); cqe_socket = try ring.copy_cqe(); - try testing.expectEqual(posix.E.SUCCESS, cqe_socket.err()); + try testing.expectEqual(linux.E.SUCCESS, cqe_socket.err()); try testing.expect(cqe_socket.res == 0); // res is 0 when index is specified // create socket in kernel chosen file descriptor index (_alloc version) @@ -6877,7 +6886,7 @@ test 
"socket_direct/socket_direct_alloc/close_direct" { _ = try ring.socket_direct_alloc(0, .inet, .{ .type = .stream }, .default, 0); try testing.expectEqual(1, try ring.submit()); cqe_socket = try ring.copy_cqe(); - try testing.expectEqual(posix.E.SUCCESS, cqe_socket.err()); + try testing.expectEqual(linux.E.SUCCESS, cqe_socket.err()); try testing.expect(cqe_socket.res == 2); // returns registered file index // use sockets from registered_fds in connect operation @@ -6911,17 +6920,17 @@ test "socket_direct/socket_direct_alloc/close_direct" { } // test connect completion try testing.expect(cqe_connect.user_data == connect_userdata); - try testing.expectEqual(posix.E.SUCCESS, cqe_connect.err()); + try testing.expectEqual(linux.E.SUCCESS, cqe_connect.err()); // test accept completion try testing.expect(cqe_accept.user_data == accept_userdata); - try testing.expectEqual(posix.E.SUCCESS, cqe_accept.err()); + try testing.expectEqual(linux.E.SUCCESS, cqe_accept.err()); // submit and test close_direct _ = try ring.close_direct(close_userdata, @intCast(fd_index)); try testing.expectEqual(1, try ring.submit()); var cqe_close = try ring.copy_cqe(); try testing.expect(cqe_close.user_data == close_userdata); - try testing.expectEqual(posix.E.SUCCESS, cqe_close.err()); + try testing.expectEqual(linux.E.SUCCESS, cqe_close.err()); } try ring.unregister_files(); @@ -6937,35 +6946,35 @@ test "openat_direct/close_direct" { }; defer ring.deinit(); - var registered_fds = [_]linux.fd_t{-1} ** 3; + var registered_fds: [3]linux.fd_t = @splat(-1); try ring.register_files(registered_fds[0..]); var tmp = std.testing.tmpDir(.{}); defer tmp.cleanup(); const path = "test_io_uring_close_direct"; const flags: linux.O = .{ .ACCMODE = .RDWR, .CREAT = true }; - const mode: posix.mode_t = 0o666; + const mode: linux.mode_t = 0o666; const user_data: u64 = 0; // use registered file at index 0 (last param) _ = try ring.openat_direct(user_data, tmp.dir.fd, path, flags, mode, 0); try testing.expectEqual(1, try ring.submit()); var cqe = try ring.copy_cqe(); - try testing.expectEqual(posix.E.SUCCESS, cqe.err()); + try testing.expectEqual(linux.E.SUCCESS, cqe.err()); try testing.expect(cqe.res == 0); // use registered file at index 1 _ = try ring.openat_direct(user_data, tmp.dir.fd, path, flags, mode, 1); try testing.expectEqual(1, try ring.submit()); cqe = try ring.copy_cqe(); - try testing.expectEqual(posix.E.SUCCESS, cqe.err()); + try testing.expectEqual(linux.E.SUCCESS, cqe.err()); try testing.expect(cqe.res == 0); // res is 0 when we specify index // let kernel choose registered file index _ = try ring.openat_direct(user_data, tmp.dir.fd, path, flags, mode, constants.FILE_INDEX_ALLOC); try testing.expectEqual(1, try ring.submit()); cqe = try ring.copy_cqe(); - try testing.expectEqual(posix.E.SUCCESS, cqe.err()); + try testing.expectEqual(linux.E.SUCCESS, cqe.err()); try testing.expect(cqe.res == 2); // chosen index is in res // close all open file descriptors @@ -6973,7 +6982,7 @@ test "openat_direct/close_direct" { _ = try ring.close_direct(user_data, @intCast(fd_index)); try testing.expectEqual(1, try ring.submit()); var cqe_close = try ring.copy_cqe(); - try testing.expectEqual(posix.E.SUCCESS, cqe_close.err()); + try testing.expectEqual(linux.E.SUCCESS, cqe_close.err()); } try ring.unregister_files(); } @@ -6993,7 +7002,7 @@ test "waitid" { posix.exit(7); } - var siginfo: posix.siginfo_t = undefined; + var siginfo: linux.siginfo_t = undefined; _ = try ring.waitid(0, .PID, pid, &siginfo, .{ .exited = true }, 0); try 
testing.expectEqual(1, try ring.submit()); @@ -7093,7 +7102,7 @@ test BufferGroup { const cqe = try ring.copy_cqe(); try testing.expectEqual(2, cqe.user_data); // matches submitted user_data try testing.expect(cqe.res >= 0); // success - try testing.expectEqual(posix.E.SUCCESS, cqe.err()); + try testing.expectEqual(linux.E.SUCCESS, cqe.err()); try testing.expectEqual(@as(i32, data.len), cqe.res); // cqe.res holds received data len // Get buffer from pool @@ -7173,7 +7182,7 @@ test "ring mapped buffers recv" { const cqe = try ring.copy_cqe(); try testing.expectEqual(user_data, cqe.user_data); try testing.expect(cqe.res < 0); // fail - try testing.expectEqual(posix.E.NOBUFS, cqe.err()); + try testing.expectEqual(linux.E.NOBUFS, cqe.err()); try testing.expect(!cqe.flags.f_buffer); // IORING_CQE_F_BUFFER flags is set on success only try testing.expectError(error.NoBufferSelected, cqe.buffer_id()); } @@ -7263,7 +7272,7 @@ test "ring mapped buffers multishot recv" { const cqe = try ring.copy_cqe(); try testing.expectEqual(recv_user_data, cqe.user_data); try testing.expect(cqe.res < 0); // fail - try testing.expectEqual(posix.E.NOBUFS, cqe.err()); + try testing.expectEqual(linux.E.NOBUFS, cqe.err()); // IORING_CQE_F_BUFFER flags is set on success only try testing.expect(!cqe.flags.f_buffer); // has more is not set @@ -7349,7 +7358,7 @@ fn buf_grp_recv_submit_get_cqe( const cqe = try ring.copy_cqe(); try testing.expectEqual(user_data, cqe.user_data); try testing.expect(cqe.res >= 0); // success - try testing.expectEqual(posix.E.SUCCESS, cqe.err()); + try testing.expectEqual(linux.E.SUCCESS, cqe.err()); try testing.expect(cqe.flags.f_buffer); // IORING_CQE_F_BUFFER flag is set return cqe; @@ -7367,7 +7376,7 @@ fn expect_buf_grp_cqe( try testing.expect(cqe.res >= 0); // success try testing.expect(cqe.flags.f_buffer); // IORING_CQE_F_BUFFER flag is set try testing.expectEqual(@as(i32, @intCast(expected.len)), cqe.res); - try testing.expectEqual(posix.E.SUCCESS, cqe.err()); + try testing.expectEqual(linux.E.SUCCESS, cqe.err()); // get buffer from pool const buffer_id = try cqe.buffer_id(); @@ -7456,7 +7465,7 @@ test "bind/listen/connect" { try testing.expectEqual(1, try ring.submit()); var cqe = try ring.copy_cqe(); try testing.expectEqual(1, cqe.user_data); - try testing.expectEqual(posix.E.SUCCESS, cqe.err()); + try testing.expectEqual(linux.E.SUCCESS, cqe.err()); const listen_fd: linux.fd_t = @intCast(cqe.res); try testing.expect(listen_fd > 2); @@ -7472,7 +7481,7 @@ test "bind/listen/connect" { for (2..6) |user_data| { cqe = try ring.copy_cqe(); try testing.expectEqual(user_data, cqe.user_data); - try testing.expectEqual(posix.E.SUCCESS, cqe.err()); + try testing.expectEqual(linux.E.SUCCESS, cqe.err()); } // Check that socket option is set @@ -7481,7 +7490,7 @@ test "bind/listen/connect" { try testing.expectEqual(1, try ring.submit()); cqe = try ring.copy_cqe(); try testing.expectEqual(5, cqe.user_data); - try testing.expectEqual(posix.E.SUCCESS, cqe.err()); + try testing.expectEqual(linux.E.SUCCESS, cqe.err()); try testing.expectEqual(1, optval); // Read system assigned port into addr @@ -7497,7 +7506,7 @@ test "bind/listen/connect" { try testing.expectEqual(1, try ring.submit()); const cqe = try ring.copy_cqe(); try testing.expectEqual(6, cqe.user_data); - try testing.expectEqual(posix.E.SUCCESS, cqe.err()); + try testing.expectEqual(linux.E.SUCCESS, cqe.err()); // Get connect socket fd const connect_fd: linux.fd_t = @intCast(cqe.res); try testing.expect(connect_fd > 2 and connect_fd != 
listen_fd); @@ -7509,10 +7518,10 @@ test "bind/listen/connect" { _ = try ring.connect(8, connect_fd, addrAny(&addr), @sizeOf(linux.sockaddr.in)); try testing.expectEqual(2, try ring.submit()); // Get listener accepted socket - var accept_fd: posix.socket_t = 0; + var accept_fd: linux.socket_t = 0; for (0..2) |_| { const cqe = try ring.copy_cqe(); - try testing.expectEqual(posix.E.SUCCESS, cqe.err()); + try testing.expectEqual(linux.E.SUCCESS, cqe.err()); if (cqe.user_data == 7) { accept_fd = @intCast(cqe.res); } else { @@ -7526,19 +7535,19 @@ test "bind/listen/connect" { try testSendRecv(&ring, accept_fd, connect_fd); // Shutdown and close all sockets - for ([_]posix.socket_t{ connect_fd, accept_fd, listen_fd }) |fd| { + for ([_]linux.socket_t{ connect_fd, accept_fd, listen_fd }) |fd| { (try ring.shutdown(9, fd, .rdwr)).link_next(); _ = try ring.close(10, fd); try testing.expectEqual(2, try ring.submit()); for (0..2) |i| { const cqe = try ring.copy_cqe(); - try testing.expectEqual(posix.E.SUCCESS, cqe.err()); + try testing.expectEqual(linux.E.SUCCESS, cqe.err()); try testing.expectEqual(9 + i, cqe.user_data); } } } -fn testSendRecv(ring: *IoUring, send_fd: posix.socket_t, recv_fd: posix.socket_t) !void { +fn testSendRecv(ring: *IoUring, send_fd: linux.socket_t, recv_fd: linux.socket_t) !void { const buffer_send = "0123456789abcdf" ** 10; var buffer_recv: [buffer_send.len * 2]u8 = undefined; @@ -7549,7 +7558,7 @@ fn testSendRecv(ring: *IoUring, send_fd: posix.socket_t, recv_fd: posix.socket_t for (0..2) |i| { const cqe = try ring.copy_cqe(); try testing.expectEqual(1 + i, cqe.user_data); - try testing.expectEqual(posix.E.SUCCESS, cqe.err()); + try testing.expectEqual(linux.E.SUCCESS, cqe.err()); try testing.expectEqual(@as(i32, buffer_send.len), cqe.res); } @@ -7560,7 +7569,7 @@ fn testSendRecv(ring: *IoUring, send_fd: posix.socket_t, recv_fd: posix.socket_t try testing.expectEqual(1, try ring.submit()); const cqe = try ring.copy_cqe(); try testing.expectEqual(3, cqe.user_data); - try testing.expectEqual(posix.E.SUCCESS, cqe.err()); + try testing.expectEqual(linux.E.SUCCESS, cqe.err()); recv_len += @intCast(cqe.res); } From 729c555f9768525746195f1ff7bdf5bb8a008397 Mon Sep 17 00:00:00 2001 From: Bernard Assan Date: Fri, 24 Oct 2025 17:33:46 +0000 Subject: [PATCH 36/41] IoUring: implement outstanding flags and enumerations Add SYNC_FILE_RANGE_* flags add XATTR_* flags Add Madvise for MADV_* constants Add Fadvise enum type for POSIX_FADV_* constants Signed-off-by: Bernard Assan --- lib/std/os/linux.zig | 219 ++++++++++++++++++++++++++--------- lib/std/os/linux/IoUring.zig | 21 ++-- 2 files changed, 175 insertions(+), 65 deletions(-) diff --git a/lib/std/os/linux.zig b/lib/std/os/linux.zig index 24dcc7dfd66e..d111324af1a6 100644 --- a/lib/std/os/linux.zig +++ b/lib/std/os/linux.zig @@ -499,6 +499,7 @@ pub const O = switch (native_arch) { pub const Pipe2 = switch (native_arch) { .x86_64, .x86, .riscv32, .riscv64, .loongarch64, .hexagon, .or1k, .s390x => packed struct(u32) { _: u7 = 0, + /// Parameter to `pipe2` selecting notification pipe notification_pipe: bool = false, _9: u3 = 0, nonblock: bool = false, @@ -510,6 +511,7 @@ pub const Pipe2 = switch (native_arch) { }, .aarch64, .aarch64_be, .arm, .armeb, .thumb, .thumbeb, .m68k => packed struct(u32) { _: u7 = 0, + /// Parameter to `pipe2` selecting notification pipe notification_pipe: bool = false, _9: u3 = 0, nonblock: bool = false, @@ -521,6 +523,7 @@ pub const Pipe2 = switch (native_arch) { }, .sparc64 => packed struct(u32) { _: u11 = 0, + 
/// Parameter to `pipe2` selecting notification pipe notification_pipe: bool = false, _13: u2 = 0, nonblock: bool = false, @@ -534,6 +537,7 @@ pub const Pipe2 = switch (native_arch) { _: u7 = 0, nonblock: bool = false, _9: u2 = 0, + /// Parameter to `pipe2` selecting notification pipe notification_pipe: bool = false, _12: u4 = 0, direct: bool = false, @@ -543,6 +547,7 @@ }, .powerpc, .powerpcle, .powerpc64, .powerpc64le => packed struct(u32) { _: u7 = 0, + /// Parameter to `pipe2` selecting notification pipe notification_pipe: bool = false, _9: u3 = 0, nonblock: bool = false, @@ -882,7 +887,6 @@ pub fn futex2_wake( /// Identical to `FUTEX.CMP_REQUEUE`, except it is part of the futex2 family of calls. /// /// Requires at least kernel v6.7. -// TODO: test to ensure I didn't break it pub fn futex2_requeue( /// The source and destination futexes. Must be a 2-element array. waiters: *const [2]Futex2.WaitOne, @@ -4182,9 +4186,19 @@ pub const Shut = enum(u32) { pub const RDWR: u32 = @intFromEnum(Shut.rdwr); }; -/// SYNC_FILE_RANGE_* flags +/// flags for `sync_file_range(2)` syscall +/// matches SYNC_FILE_RANGE_* in kernel pub const SyncFileRange = packed struct(u32) { - _: u32 = 0, // TODO: fill out + wait_before: bool = false, + write: bool = false, + wait_after: bool = false, + _: u29 = 0, + + pub const write_and_wait: SyncFileRange = .{ + .wait_before = true, + .write = true, + .wait_after = true, + }; }; /// Deprecated alias to Sock @@ -7005,9 +7019,18 @@ pub const Rename = packed struct(u32) { _: u29 = 0, }; +/// By default (i.e., flags is .{}), the extended attribute will be created +/// if it does not exist, or the value will be replaced if the attribute +/// already exists. To modify these semantics, one of the fields in `SetXattr` +/// can be specified in flags. 
Matches XATTR_* in kernel pub const SetXattr = packed struct(u32) { - _: u32 = 0, // TODO: add flags + /// set value, fail if attr already exists + create: bool = false, + /// set value, fail if attr does not exist + replace: bool = false, + _: u30 = 0, }; + pub const statx_timestamp = extern struct { sec: i64, nsec: u32, @@ -8549,60 +8572,142 @@ pub const rlimit = extern struct { max: rlim_t, }; -pub const MADV = struct { - pub const NORMAL = 0; - pub const RANDOM = 1; - pub const SEQUENTIAL = 2; - pub const WILLNEED = 3; - pub const DONTNEED = 4; - pub const FREE = 8; - pub const REMOVE = 9; - pub const DONTFORK = 10; - pub const DOFORK = 11; - pub const MERGEABLE = 12; - pub const UNMERGEABLE = 13; - pub const HUGEPAGE = 14; - pub const NOHUGEPAGE = 15; - pub const DONTDUMP = 16; - pub const DODUMP = 17; - pub const WIPEONFORK = 18; - pub const KEEPONFORK = 19; - pub const COLD = 20; - pub const PAGEOUT = 21; - pub const HWPOISON = 100; - pub const SOFT_OFFLINE = 101; }; -pub const Madvice = enum(u32) { - _, // TODO: add options -}; -pub const Fadvice = enum(u32) { - _, // TODO: add options -}; -pub const POSIX_FADV = switch (native_arch) { - .s390x => if (@typeInfo(usize).int.bits == 64) struct { - pub const NORMAL = 0; - pub const RANDOM = 1; - pub const SEQUENTIAL = 2; - pub const WILLNEED = 3; - pub const DONTNEED = 6; - pub const NOREUSE = 7; - } else struct { - pub const NORMAL = 0; - pub const RANDOM = 1; - pub const SEQUENTIAL = 2; - pub const WILLNEED = 3; - pub const DONTNEED = 4; - pub const NOREUSE = 5; +/// DEPRECATED alias for Madvise +pub const MADV = Madvise; + +// COMMIT: update MADV_* flags and type as enum +/// advice flags for `madvise` +/// matches MADV_* in kernel +pub const Madvise = enum(u32) { + /// no further special treatment + normal = 0, + /// expect random page references + random = 1, + /// expect sequential page references + sequential = 2, + /// will need these pages + willneed = 3, + /// don't need these pages + dontneed = 4, + /// free pages only if memory pressure + free = 8, + /// remove these pages & resources + remove = 9, + /// don't inherit across fork + dontfork = 10, + /// do inherit across fork + dofork = 11, + /// KSM may merge identical pages + mergeable = 12, + /// KSM may not merge identical pages + unmergeable = 13, + /// Worth backing with hugepages + hugepage = 14, + /// Not worth backing with hugepages + nohugepage = 15, + /// Explicitly exclude from the core dump, overrides the coredump filter bits + dontdump = 16, + /// Clear the MADV_DONTDUMP flag + dodump = 17, + /// Zero memory on fork, child only + wipeonfork = 18, + /// Undo MADV_WIPEONFORK + keeponfork = 19, + /// deactivate these pages + cold = 20, + /// reclaim these pages + pageout = 21, + /// populate (prefault) page tables readable + populate_read = 22, + /// populate (prefault) page tables writable + populate_write = 23, + /// like DONTNEED, but drop locked pages too + dontneed_locked = 24, + /// Synchronous hugepage collapse + collapse = 25, + /// poison a page for testing + hwpoison = 100, + /// soft offline page for testing + soft_offline = 101, + /// fatal signal on access to range + guard_install = 102, + /// unguard range + guard_remove = 103, + _, + + // DEPRECATED aliases for `Madvise` + pub const NORMAL: u32 = @intFromEnum(Madvise.normal); + pub const RANDOM: u32 = @intFromEnum(Madvise.random); + pub const SEQUENTIAL: u32 = @intFromEnum(Madvise.sequential); + pub const WILLNEED: u32 = @intFromEnum(Madvise.willneed); + pub const DONTNEED: u32 = 
@intFromEnum(Madvise.dontneed); + pub const FREE: u32 = @intFromEnum(Madvise.free); + pub const REMOVE: u32 = @intFromEnum(Madvise.remove); + pub const DONTFORK: u32 = @intFromEnum(Madvise.dontfork); + pub const DOFORK: u32 = @intFromEnum(Madvise.dofork); + pub const MERGEABLE: u32 = @intFromEnum(Madvise.mergeable); + pub const UNMERGEABLE: u32 = @intFromEnum(Madvise.unmergeable); + pub const HUGEPAGE: u32 = @intFromEnum(Madvise.hugepage); + pub const NOHUGEPAGE: u32 = @intFromEnum(Madvise.nohugepage); + pub const DONTDUMP: u32 = @intFromEnum(Madvise.dontdump); + pub const DODUMP: u32 = @intFromEnum(Madvise.dodump); + pub const WIPEONFORK: u32 = @intFromEnum(Madvise.wipeonfork); + pub const KEEPONFORK: u32 = @intFromEnum(Madvise.keeponfork); + pub const COLD: u32 = @intFromEnum(Madvise.cold); + pub const PAGEOUT: u32 = @intFromEnum(Madvise.pageout); + pub const HWPOISON: u32 = @intFromEnum(Madvise.hwpoison); + pub const SOFT_OFFLINE: u32 = @intFromEnum(Madvise.soft_offline); +}; + +/// DEPRECATED alias to Fadvise +pub const POSIX_FADV = Fadvise; + +/// advice flags for `posix_fadvise` +/// matches POSIX_FADV_* in kernel +pub const Fadvise = switch (native_arch) { + .s390x => if (@typeInfo(usize).int.bits == 64) enum(u32) { + /// No further special treatment + normal = 0, + /// Expect random page references + random = 1, + /// Expect sequential page references + sequential = 2, + /// Will need these pages + willneed = 3, + /// Don't need these pages + dontneed = 6, + /// Data will be accessed once + noreuse = 7, + _, + + pub const NORMAL: u32 = @intFromEnum(Fadvise.normal); + pub const RANDOM: u32 = @intFromEnum(Fadvise.random); + pub const SEQUENTIAL: u32 = @intFromEnum(Fadvise.sequential); + pub const WILLNEED: u32 = @intFromEnum(Fadvise.willneed); + pub const DONTNEED: u32 = @intFromEnum(Fadvise.dontneed); + pub const NOREUSE: u32 = @intFromEnum(Fadvise.noreuse); }, - else => struct { - pub const NORMAL = 0; - pub const RANDOM = 1; - pub const SEQUENTIAL = 2; - pub const WILLNEED = 3; - pub const DONTNEED = 4; - pub const NOREUSE = 5; + else => enum(u32) { + /// No further special treatment + normal = 0, + /// Expect random page references + random = 1, + /// Expect sequential page references + sequential = 2, + /// Will need these pages + willneed = 3, + /// Don't need these pages + dontneed = 4, + /// Data will be accessed once + noreuse = 5, + _, + + pub const NORMAL: u32 = @intFromEnum(Fadvise.normal); + pub const RANDOM: u32 = @intFromEnum(Fadvise.random); + pub const SEQUENTIAL: u32 = @intFromEnum(Fadvise.sequential); + pub const WILLNEED: u32 = @intFromEnum(Fadvise.willneed); + pub const DONTNEED: u32 = @intFromEnum(Fadvise.dontneed); + pub const NOREUSE: u32 = @intFromEnum(Fadvise.noreuse); }, }; diff --git a/lib/std/os/linux/IoUring.zig b/lib/std/os/linux/IoUring.zig index 49a5303df1cb..98686fcfadd6 100644 --- a/lib/std/os/linux/IoUring.zig +++ b/lib/std/os/linux/IoUring.zig @@ -1137,7 +1137,7 @@ pub fn fadvice( fd: linux.fd_t, offset: u64, len: u32, - advice: linux.Fadvice, + advice: linux.Fadvise, ) !*Sqe { const sqe = try self.get_sqe(); sqe.prep_fadvice(fd, offset, len, advice); @@ -1151,7 +1151,7 @@ pub fn madvice( self: *IoUring, user_data: u64, memory: []u8, - advice: linux.Fadvice, + advice: linux.Madvise, ) !*Sqe { const sqe = try self.get_sqe(); sqe.prep_madvice(memory, advice); @@ -1351,7 +1351,13 @@ pub fn provide_buffers( buffer_id: usize, ) !*Sqe { const sqe = try self.get_sqe(); - sqe.prep_provide_buffers(buffers, buffer_size, buffers_count, group_id, buffer_id); 
+ sqe.prep_provide_buffers( + buffers, + buffer_size, + buffers_count, + group_id, + buffer_id, + ); sqe.user_data = user_data; return sqe; } @@ -1449,7 +1455,7 @@ pub fn sync_file_range( fd: linux.fd_t, len: u32, offset: u64, - flags: linux.SyncFileRange, // TODO: add flags + flags: linux.SyncFileRange, ) !*Sqe { const sqe = try self.get_sqe(); sqe.prep_sync_file_range(fd, len, offset, flags); @@ -2220,7 +2226,6 @@ pub fn getsockopt( ); } -// TODO: move buf_ring fns into BufferRing type /// Registers a shared buffer ring to be used with provided buffers. `entries` /// number of `io_uring_buf` structures is mem mapped and shared by kernel. /// @@ -2967,7 +2972,7 @@ pub const Sqe = extern struct { fd: linux.fd_t, offset: u64, len: u32, - advice: linux.Fadvice, + advice: linux.Fadvise, ) void { sqe.prep_rw(.fadvise, fd, undefined, len, offset); sqe.rw_flags = @intFromEnum(advice); @@ -2976,7 +2981,7 @@ pub const Sqe = extern struct { pub fn prep_madvice( sqe: *Sqe, memory: []u8, - advice: linux.Madvice, + advice: linux.Madvise, ) void { sqe.prep_rw(.madvise, -1, @intFromPtr(memory.ptr), memory.len, 0); sqe.rw_flags = @intFromEnum(advice); @@ -3047,7 +3052,7 @@ pub const Sqe = extern struct { fd: linux.fd_t, len: u32, offset: u64, - flags: linux.SyncFileRange, // TODO: add flags + flags: linux.SyncFileRange, ) void { sqe.prep_rw(.sync_file_range, fd, undefined, len, offset); sqe.rw_flags = @bitCast(flags); From 267ed9e85e62346ac76f5fe82ec3ed758ecac924 Mon Sep 17 00:00:00 2001 From: Bernard Assan Date: Sat, 25 Oct 2025 16:00:27 +0000 Subject: [PATCH 37/41] IoUring: use the splice flags type for splice and tee Use appropriately sized integers where applicable Signed-off-by: Bernard Assan --- lib/std/os/linux/IoUring.zig | 114 ++++++++++++++++++++++++----------- 1 file changed, 80 insertions(+), 34 deletions(-) diff --git a/lib/std/os/linux/IoUring.zig b/lib/std/os/linux/IoUring.zig index 98686fcfadd6..dec59af08358 100644 --- a/lib/std/os/linux/IoUring.zig +++ b/lib/std/os/linux/IoUring.zig @@ -424,25 +424,32 @@ pub fn splice( off_in: u64, fd_out: linux.fd_t, off_out: u64, - len: usize, + len: u32, + flags: uflags.Splice, ) !*Sqe { const sqe = try self.get_sqe(); - sqe.prep_splice(fd_in, off_in, fd_out, off_out, len); + sqe.prep_splice( + fd_in, + off_in, + fd_out, + off_out, + len, + flags, + ); sqe.user_data = user_data; return sqe; } -// COMMIT: ignored flags for splice and tee lets see if they become important -// in the future pub fn tee( self: *IoUring, user_data: u64, fd_in: linux.fd_t, fd_out: linux.fd_t, - len: usize, + len: u32, + flags: uflags.Splice, ) !*Sqe { const sqe = try self.get_sqe(); - sqe.prep_tee(fd_in, fd_out, len); + sqe.prep_tee(fd_in, fd_out, len, flags); sqe.user_data = user_data; return sqe; } @@ -759,8 +766,6 @@ pub fn timeout_update( /// Queues (but does not submit) an SQE to perform an `accept4(2)` on a socket. /// Returns a pointer to the SQE. /// Available since 5.5 -// TODO: can't we make the sockaddr and socklen_t combo in our api better? -// Investigate this pub fn accept( self: *IoUring, user_data: u64, @@ -939,7 +944,7 @@ pub fn listen( self: *IoUring, user_data: u64, fd: linux.fd_t, - backlog: usize, + backlog: u32, // liburing doesn't have this flag, hence 0 should be passed // TODO: consider removing this and all flags like this flags: u32, @@ -1339,21 +1344,24 @@ pub fn recv_multishot( /// buffer_selection. /// /// The kernel expects a contiguous block of memory of size (buffers_count * -/// buffer_size). 
-// TODO: why not use a slice with `buffers_count` +/// buffer_len). pub fn provide_buffers( self: *IoUring, user_data: u64, + /// an array of `buffers_count` buffers of length `buffer_len` laid out as a + /// contiguous slice of memory buffers: [*]u8, - buffer_size: usize, - buffers_count: usize, - group_id: usize, - buffer_id: usize, + /// length of each buffer in `buffers` + buffer_len: u32, + /// count of buffers in `buffers` + buffers_count: u32, + group_id: u32, + buffer_id: u32, ) !*Sqe { const sqe = try self.get_sqe(); sqe.prep_provide_buffers( buffers, - buffer_size, + buffer_len, buffers_count, group_id, buffer_id, @@ -1367,8 +1375,8 @@ pub fn provide_buffers( pub fn remove_buffers( self: *IoUring, user_data: u64, - buffers_count: usize, - group_id: usize, + buffers_count: u32, + group_id: u32, ) !*Sqe { const sqe = try self.get_sqe(); sqe.prep_remove_buffers(buffers_count, group_id); @@ -2550,26 +2558,54 @@ pub const Sqe = extern struct { sqe.prep_rw(.writev, fd, @intFromPtr(iovecs.ptr), iovecs.len, offset); } - pub fn prep_write_fixed(sqe: *Sqe, fd: linux.fd_t, buffer: []const u8, offset: u64, buffer_index: u16) void { + pub fn prep_write_fixed( + sqe: *Sqe, + fd: linux.fd_t, + buffer: []const u8, + offset: u64, + buffer_index: u16, + ) void { sqe.prep_rw(.write_fixed, fd, @intFromPtr(buffer.ptr), buffer.len, offset); sqe.buf_index = buffer_index; } - pub fn prep_writev_fixed(sqe: *Sqe, fd: linux.fd_t, iovecs: []const posix.iovec_const, offset: u64, buffer_index: u16) void { + pub fn prep_writev_fixed( + sqe: *Sqe, + fd: linux.fd_t, + iovecs: []const posix.iovec_const, + offset: u64, + buffer_index: u16, + ) void { sqe.prep_rw(.write_fixed, fd, @intFromPtr(iovecs.ptr), iovecs.len, offset); sqe.buf_index = buffer_index; } - pub fn prep_splice(sqe: *Sqe, fd_in: linux.fd_t, off_in: u64, fd_out: linux.fd_t, off_out: u64, len: usize) void { + pub fn prep_splice( + sqe: *Sqe, + fd_in: linux.fd_t, + off_in: u64, + fd_out: linux.fd_t, + off_out: u64, + len: u32, + flags: uflags.Splice, + ) void { sqe.prep_rw(.splice, fd_out, undefined, len, off_out); sqe.addr = off_in; sqe.splice_fd_in = fd_in; + sqe.rw_flags = @bitCast(flags); } - pub fn prep_tee(sqe: *Sqe, fd_in: linux.fd_t, fd_out: linux.fd_t, len: usize) void { + pub fn prep_tee( + sqe: *Sqe, + fd_in: linux.fd_t, + fd_out: linux.fd_t, + len: u32, + flags: uflags.Splice, + ) void { sqe.prep_rw(.tee, fd_out, undefined, len, 0); sqe.addr = undefined; sqe.splice_fd_in = fd_in; + sqe.rw_flags = @bitCast(flags); } pub fn prep_read(sqe: *Sqe, fd: linux.fd_t, buffer: []u8, offset: u64) void { @@ -3260,24 +3296,34 @@ pub const Sqe = extern struct { sqe.prep_rw(.files_update, -1, @intFromPtr(fds.ptr), fds.len, constants.FILE_INDEX_ALLOC); } - // TODO: why can't slice be used here ? 
+ // Note: It is more appropriate to use a `[*]u8` than a `[]u8` slice here + // because `[]u8` would free us of the extra `buffer_len` parameter but + // would require us to always calculate the `buffer_len` in the function, + // which is redundant since the `buffer_len` and `buffers_count` + // information are always available for any 2-dimensional array type, + // i.e. [buffers_count][buffer_len]u8 pub fn prep_provide_buffers( sqe: *Sqe, buffers: [*]u8, - buffer_len: usize, - num: usize, - group_id: usize, - buffer_id: usize, + buffer_len: u32, + buffers_count: u32, + group_id: u32, + buffer_id: u32, ) void { - const ptr = @intFromPtr(buffers); - sqe.prep_rw(.provide_buffers, @intCast(num), ptr, buffer_len, buffer_id); + sqe.prep_rw( + .provide_buffers, + @intCast(buffers_count), + @intFromPtr(buffers), + buffer_len, + buffer_id, + ); sqe.buf_index = @intCast(group_id); } pub fn prep_remove_buffers( sqe: *Sqe, - num: usize, - group_id: usize, + num: u32, + group_id: u32, ) void { sqe.prep_rw(.remove_buffers, @intCast(num), 0, 0, 0); sqe.buf_index = @intCast(group_id); @@ -3478,7 +3524,7 @@ pub const Sqe = extern struct { pub fn prep_listen( sqe: *Sqe, fd: linux.fd_t, - backlog: usize, + backlog: u32, flags: u32, // flags is unused and does't exist in io_uring's api ) void { sqe.prep_rw(.listen, fd, 0, backlog, 0); @@ -5044,13 +5090,13 @@ test "splice/read" { const fds = try posix.pipe(); const pipe_offset: u64 = math.maxInt(u64); - const sqe_splice_to_pipe = try ring.splice(0x11111111, fd_src, 0, fds[1], pipe_offset, buffer_write.len); + const sqe_splice_to_pipe = try ring.splice(0x11111111, fd_src, 0, fds[1], pipe_offset, buffer_write.len, .{}); try testing.expectEqual(Op.splice, sqe_splice_to_pipe.opcode); try testing.expectEqual(0, sqe_splice_to_pipe.addr); try testing.expectEqual(pipe_offset, sqe_splice_to_pipe.off); sqe_splice_to_pipe.link_next(); - const sqe_splice_from_pipe = try ring.splice(0x22222222, fds[0], pipe_offset, fd_dst, 10, buffer_write.len); + const sqe_splice_from_pipe = try ring.splice(0x22222222, fds[0], pipe_offset, fd_dst, 10, buffer_write.len, .{}); try testing.expectEqual(Op.splice, sqe_splice_from_pipe.opcode); try testing.expectEqual(pipe_offset, sqe_splice_from_pipe.addr); try testing.expectEqual(10, sqe_splice_from_pipe.off); From 8f6f9696b379430802b59096791bd84e4d70dc0b Mon Sep 17 00:00:00 2001 From: Bernard Assan Date: Sat, 25 Oct 2025 16:16:30 +0000 Subject: [PATCH 38/41] IoUring: fix and remove TODOs Back out from updating `pipe2` syscall to use Pipe2 flags Review if useful to implement - openat2*, f/madvice64 Signed-off-by: Bernard Assan --- lib/std/os/linux.zig | 11 +----- lib/std/os/linux/IoUring.zig | 68 +++++------------------------------- 2 files changed, 9 insertions(+), 70 deletions(-) diff --git a/lib/std/os/linux.zig b/lib/std/os/linux.zig index d111324af1a6..cbba463ffb9a 100644 --- a/lib/std/os/linux.zig +++ b/lib/std/os/linux.zig @@ -1332,7 +1332,7 @@ pub fn pipe(fd: *[2]i32) usize { } } -pub fn pipe2(fd: *[2]i32, flags: Pipe2) usize { +pub fn pipe2(fd: *[2]i32, flags: O) usize { return syscall2(.pipe2, @intFromPtr(fd), @as(u32, @bitCast(flags))); } @@ -3747,7 +3747,6 @@ pub const Futex2 = struct { }; /// flags for `futex2_wait` syscall - // COMMIT: add mpol and fix private field as its 128 not 32 pub const Wait = packed struct(u32) { size: Size, numa: bool = false, @@ -4467,7 +4466,6 @@ pub const Af = enum(u16) { pub const MAX: u16 = @intFromEnum(Af.max); }; -// COMMIT: add new Typed So enum /// SO_* type pub const So = if (is_mips) enum(u16) { 
debug = 1, @@ -4759,7 +4757,6 @@ pub const So = if (is_mips) enum(u16) { pub const detach_bpf: So = .detach_filter; }; -// COMMIT: add SO constants /// Backwards-compatible SO_* constants pub const SO = struct { pub const DEBUG: u16 = @intFromEnum(So.debug); @@ -5409,7 +5406,6 @@ pub const Msg = packed struct(u32) { /// sendpage() internal: page may carry plain text and require encryption sendpage_decrypted: bool = false, _22: u4 = 0, - // COMMIT: new flags /// Receive devmem skbs as cmsg sock_devmem: bool = false, /// Use user data in kernel path @@ -6025,7 +6021,6 @@ pub const Epoll = if (is_mips) packed struct(u32) { rdnorm: bool = false, /// Priority data may be read rdband: bool = false, - // COMMIT: new flags /// Writing is now possible (normal data) wrnorm: bool = false, /// Priority data may be written @@ -7008,7 +7003,6 @@ pub const utsname = extern struct { }; pub const HOST_NAME_MAX = 64; -// COMMIT: RenameFlags pub const Rename = packed struct(u32) { /// Don't overwrite target noreplace: bool = false, @@ -7099,7 +7093,6 @@ pub const Statx = extern struct { __pad2: [14]u64, - // COMMIT: add new StatxMask fields // https://github.com/torvalds/linux/blob/755fa5b4fb36627796af19932a432d343220ec63/include/uapi/linux/stat.h#L203 /// matches STATX_* in kernel pub const Mask = packed struct(u32) { @@ -7157,7 +7150,6 @@ pub const Statx = extern struct { }; }; - // COMMIT: Statx as Packed Struct // https://github.com/torvalds/linux/blob/755fa5b4fb36627796af19932a432d343220ec63/include/uapi/linux/stat.h#L248 /// matches STATX_ATTR_* in kernel pub const Attr = packed struct(u64) { @@ -8575,7 +8567,6 @@ pub const rlimit = extern struct { /// DEPRECATED alias for Madvise pub const MADV = Madvise; -// COMMIT: update MADV_* flags and type as enum /// advice flags for `madvise` /// matches MADV_* in kernel pub const Madvise = enum(u32) { diff --git a/lib/std/os/linux/IoUring.zig b/lib/std/os/linux/IoUring.zig index dec59af08358..e5688482609e 100644 --- a/lib/std/os/linux/IoUring.zig +++ b/lib/std/os/linux/IoUring.zig @@ -927,12 +927,9 @@ pub fn bind( fd: linux.fd_t, addr: *const linux.sockaddr, addrlen: linux.socklen_t, - // liburing doesn't have this flag, hence 0 should be passed - // TODO: consider removing this and all flags like this - flags: u32, ) !*Sqe { const sqe = try self.get_sqe(); - sqe.prep_bind(fd, addr, addrlen, flags); + sqe.prep_bind(fd, addr, addrlen); sqe.user_data = user_data; return sqe; } @@ -945,12 +942,9 @@ pub fn listen( user_data: u64, fd: linux.fd_t, backlog: u32, - // liburing doesn't have this flag, hence 0 should be passed - // TODO: consider removing this and all flags like this - flags: u32, ) !*Sqe { const sqe = try self.get_sqe(); - sqe.prep_listen(fd, backlog, flags); + sqe.prep_listen(fd, backlog); sqe.user_data = user_data; return sqe; } @@ -1031,7 +1025,6 @@ pub fn openat( sqe.user_data = user_data; return sqe; } -// COMMIT: ignore openat2* for now /// Queues an openat using direct (registered) file descriptors. /// @@ -1133,7 +1126,6 @@ pub fn statx( return sqe; } -// COMMIT: don't implement f/madvice64 for now I dought it is used by a lot of people in practice /// Queues (but does not submit) an SQE to perform an `posix_fadvise(2)`. /// Returns a pointer to the SQE. 
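/// For illustration, a minimal usage sketch (hedged: `ring` is assumed to be
/// an initialized IoUring, `fd` an open file descriptor, and 0x1 an arbitrary
/// user_data value; not part of the patch itself):
///     _ = try ring.fadvice(0x1, fd, 0, 4096, .willneed);
///     _ = try ring.submit();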
pub fn fadvice( @@ -1350,7 +1342,7 @@ pub fn provide_buffers( user_data: u64, /// an array of `buffers_count` buffers of length `buffer_len` laid out as a /// contiguous slice of memory - buffers: [*]u8, + buffers: []u8, /// length of each buffer in `buffers` @@ -2101,7 +2093,6 @@ pub fn register_sync_msg(self: *IoUring, sqe: *Sqe) !void { try handle_registration_result(res); } -// COMMIT: fix register file alloc range taking @sizeOf(FileIndexRange) instead of zero in register syscall /// Registers range for fixed file allocations. /// Available since 6.0 pub fn register_file_alloc_range(self: *IoUring, offset: u32, len: u32) !void { @@ -2338,7 +2329,6 @@ pub const Cqe = extern struct { /// completions with a buffer passed back is automatically returned to /// the application. f_buf_more: bool = false, - // COMMIT: new flags /// IORING_CQE_F_SKIP If set, then the application/liburing must ignore /// this CQE. Its only purpose is to fill a gap in the ring, if a /// large CQE is attempted posted when the ring has just a single small @@ -2464,7 +2454,6 @@ pub const Sqe = extern struct { /// starting buffer ID in cqe.flags as per usual for provided buffer /// usage. The buffers will be contiguous from the starting buffer ID. recvsend_bundle: bool = false, - // COMMIT: new flags /// IORING_SEND_VECTORIZED /// If set, SEND[_ZC] will take a pointer to an io_vec to allow /// vectorized send operations. @@ -2475,7 +2464,6 @@ /// accept flags stored in sqe.ioprio pub const Accept = packed struct(u16) { multishot: bool = false, - // COMMIT: new Flags dontwait: bool = false, poll_first: bool = false, _: u13 = 0, @@ -2773,7 +2761,6 @@ pub const Sqe = extern struct { sqe.ioprio = .{ .send_recv = .{ .recv_multishot = true } }; } - // COMMIT: fix send[|recv] flag param type pub fn prep_send(sqe: *Sqe, sockfd: linux.fd_t, buffer: []const u8, flags: linux.Msg) void { sqe.prep_rw(.send, sockfd, @intFromPtr(buffer.ptr), buffer.len, 0); sqe.rw_flags = @bitCast(flags); @@ -3296,24 +3283,19 @@ pub const Sqe = extern struct { sqe.prep_rw(.files_update, -1, @intFromPtr(fds.ptr), fds.len, constants.FILE_INDEX_ALLOC); } - // Note: It is more appropriate to use a `[*]u8` than a `[]u8` slice here - // because `[]u8` would free us of the extra `buffer_len` parameter but - // would require us to always calculate the `buffer_len` in the function, - // which is redundant since the `buffer_len` and `buffers_count` - // information are always available for any 2-dimensional array type, - // i.e. [buffers_count][buffer_len]u8 pub fn prep_provide_buffers( sqe: *Sqe, - buffers: [*]u8, + buffers: []u8, buffer_len: u32, buffers_count: u32, group_id: u32, buffer_id: u32, ) void { + assert(buffers.len == buffer_len * buffers_count); sqe.prep_rw( + .provide_buffers, + @intCast(buffers_count), - @intFromPtr(buffers), + @intFromPtr(buffers.ptr), buffer_len, buffer_id, ); @@ -3509,26 +3491,21 @@ pub const Sqe = extern struct { sqe.buf_index = @intCast(group_id); } - // TODO: maybe remove unused flag fields? 
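// For illustration, the flag-less bind/listen pair as exercised by the
// "bind/listen/connect" test later in this series (a sketch; `ring`,
// `listen_fd`, and `addr` come from that test's setup):
//     (try ring.bind(4, listen_fd, addrAny(&addr), @sizeOf(linux.sockaddr.in))).link_next();
//     _ = try ring.listen(5, listen_fd, 1);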
pub fn prep_bind( sqe: *Sqe, fd: linux.fd_t, addr: *const linux.sockaddr, addrlen: linux.socklen_t, - flags: u32, // flags is unused and does't exist in io_uring's api ) void { sqe.prep_rw(.bind, fd, @intFromPtr(addr), 0, addrlen); - sqe.rw_flags = flags; } pub fn prep_listen( sqe: *Sqe, fd: linux.fd_t, backlog: u32, - flags: u32, // flags is unused and does't exist in io_uring's api ) void { sqe.prep_rw(.listen, fd, 0, backlog, 0); - sqe.rw_flags = flags; } pub fn prep_cmd_sock( @@ -3944,10 +3921,6 @@ pub const Params = extern struct { cq_off: CqOffsets, }; -// COMMIT: remove deprecated io_uring_rsrc_update struct -// deprecated, see struct io_uring_rsrc_update - -// COMMIT: add new io_uring_region_desc struct /// matches `io_uring_region_desc` in liburing pub const RegionDesc = extern struct { user_addr: u64, @@ -3957,7 +3930,6 @@ pub const RegionDesc = extern struct { mmap_offset: u64, __resv: [4]u64, - // COMMIT: new constant /// initialise with user provided memory pointed by user_addr pub const Flags = packed struct(u32) { type_user: bool = false, @@ -3965,7 +3937,6 @@ pub const RegionDesc = extern struct { }; }; -// COMMIT: add new io_uring_mem_region_reg struct /// matches `io_uring_mem_region_reg` in liburing pub const MemRegionRegister = extern struct { /// struct io_uring_region_desc (RegionDesc in Zig) @@ -4049,7 +4020,6 @@ pub const Probe = extern struct { } }; -// COMMIT: fix defination of io_uring_restriction // RegisterOp is actually u8 /// matches `io_uring_restriction` in liburing pub const Restriction = extern struct { @@ -4066,14 +4036,12 @@ pub const Restriction = extern struct { resv2: [3]u32, }; -// COMMIT: add new struct type /// matches `io_uring_clock_register` in liburing pub const ClockRegister = extern struct { clockid: u32, __resv: [3]u32, }; -// COMMIT: add new struct type /// matches `io_uring_clone_buffers` in liburing pub const CloneBuffers = extern struct { src_fd: u32, @@ -4083,7 +4051,6 @@ pub const CloneBuffers = extern struct { nr: u32, pad: [3]u32, - // COMMIT: new flags pub const Flags = packed struct(u32) { register_src_registered: bool = false, register_dst_replace: bool = false, @@ -4168,7 +4135,6 @@ pub const BufferRegister = extern struct { flags: Flags, resv: [3]u64, - // COMMIT: new IORING_REGISTER_PBUF_RING flags /// Flags for IORING_REGISTER_PBUF_RING. pub const Flags = packed struct(u16) { /// IOU_PBUF_RING_MMAP: @@ -4210,7 +4176,6 @@ pub const Napi = extern struct { resv: u64, }; -// COMMIT: new struct type /// Argument for io_uring_enter(2) with IORING_GETEVENTS | IORING_ENTER_EXT_ARG_REG /// set, where the actual argument is an index into a previously registered /// fixed wait region described by the below structure. @@ -4224,7 +4189,6 @@ pub const RegisterWait = extern struct { pad: [3]u32, pad2: [2]u64, - // COMMIT: new constant pub const Flags = packed struct(u32) { reg_wait_ts: bool = false, _: u31 = 0, @@ -4241,7 +4205,6 @@ pub const GetEventsArg = extern struct { ts: u64, }; -// COMMIT: fix type definition of io_uring_sync_cancel_reg /// Argument for IORING_REGISTER_SYNC_CANCEL /// matches `io_uring_sync_cancel_reg` in liburing pub const SyncCancelRegister = extern struct { @@ -4327,7 +4290,6 @@ pub const ZcrxIfqRegister = extern struct { __resv: [3]u64, }; -// COMMIT: move IoUring constants to Constants pub const constants = struct { /// io_uring block file commands, see IORING_OP_URING_CMD. /// It's a different number space from ioctl(), reuse the block's code 0x12. 
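// Usage sketch for the FILE_INDEX_ALLOC constant, mirroring the
// openat_direct/close_direct test above (all names are taken from that
// test; letting the kernel pick the direct descriptor slot):
//     _ = try ring.openat_direct(user_data, tmp.dir.fd, path, flags, mode, constants.FILE_INDEX_ALLOC);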
@@ -4357,7 +4319,6 @@ pub const constants = struct { pub const OFF_SQ_RING = 0; pub const OFF_CQ_RING = 0x8000000; pub const OFF_SQES = 0x10000000; - // COMMIT: new magic constants pub const OFF_PBUF_RING = 0x80000000; pub const OFF_PBUF_SHIFT = 16; pub const OFF_MMAP_MASK = 0xf8000000; @@ -4365,7 +4326,6 @@ pub const constants = struct { /// Skip updating fd indexes set to this value in the fd table pub const REGISTER_FILES_SKIP = -2; - // COMMIT: new TX Timestamp definition /// SOCKET_URING_OP_TX_TIMESTAMP definitions pub const TIMESTAMP_HW_SHIFT = 16; /// The cqe.flags bit from which the timestamp type is stored @@ -4381,7 +4341,6 @@ pub const constants = struct { pub const REGISTER_USE_REGISTERED_RING = 1 << 31; }; -// COMMIT: move IoUring flags to Flags struct pub const uflags = struct { /// io_uring_setup() flags pub const Setup = packed struct(u32) { @@ -4430,7 +4389,6 @@ pub const uflags = struct { registered_fd_only: bool = false, /// Removes indirection through the SQ index array. no_sqarray: bool = false, - // COMMIT: new setup flags /// Use hybrid poll in iopoll process hybrid_iopoll: bool = false, /// Allow both 16b and 32b CQEs. If a 32b CQE is posted, it will have @@ -4466,8 +4424,7 @@ pub const uflags = struct { link_timeout_update: bool = false, /// Available since Linux 5.16 timeout_etime_success: bool = false, - // COMMIT: new Timeout Flag - // TODO: add when it became available + /// Available since Linux 6.4 timeout_multishot: bool = false, _8: u25 = 0, }; @@ -4487,7 +4444,6 @@ pub const uflags = struct { /// Multishot poll. Sets IORING_CQE_F_MORE if the poll handler will /// continue to report CQEs on behalf of the same SQE. add_multi: bool = false, - // TODO: verify this doc comment is valid for the 2 flags below /// IORING_POLL_UPDATE /// Update existing poll request, matching sqe.addr as the old user_data /// field. 
@@ -4516,7 +4472,6 @@ /// IORING_ASYNC_CANCEL_FD_FIXED /// 'fd' passed in is a fixed descriptor cancel_fd_fixed: bool = false, - // COMMIT: new AsyncCancel Flags /// IORING_ASYNC_CANCEL_USERDATA /// Match on user_data, default for no other key cancel_userdata: bool = false, @@ -4536,14 +4491,12 @@ _3: u30 = 0, }; - // COMMIT: new flag /// IORING_OP_FIXED_FD_INSTALL flags (sqe.install_fd_flags or sqe.rw_flags in Zig Struct) pub const FixedFd = packed struct(u32) { /// IORING_FIXED_FD_NO_CLOEXEC Don't mark the fd as O_CLOEXEC no_cloexec: bool = false, }; - /// COMMIT: new flags /// IORING_OP_NOP flags (sqe.nop_flags or sqe.rw_flags in Zig Struct) pub const Nop = packed struct(u32) { /// IORING_NOP_INJECT_RESULT Inject result from sqe.result @@ -4695,7 +4648,6 @@ pub const RegisterOp = enum(u8) { register_mem_region, - // COMMIT: new register opcode // query various aspects of io_uring, see linux/io_uring/query.h register_query, @@ -4716,7 +4668,6 @@ pub const SocketOp = enum(u16) { siocoutq, getsockopt, setsockopt, - // COMMIT: new socket op tx_timestamp, _, }; @@ -4745,7 +4696,6 @@ pub const MsgRingCmd = enum { send_fd, }; -// COMMIT: OP to IoUring /// matches `io_uring_op` in liburing pub const Op = enum(u8) { nop, @@ -4807,8 +4757,6 @@ pub const Op = enum(u8) { bind, listen, recv_zc, - // COMMIT: new OPs - // TODO: to be implemented epoll_wait, readv_fixed, writev_fixed, @@ -7525,7 +7473,7 @@ test "bind/listen/connect" { (try ring.setsockopt(2, listen_fd, .socket, .reuseaddr, mem.asBytes(&optval))).link_next(); (try ring.setsockopt(3, listen_fd, .socket, .reuseport, mem.asBytes(&optval))).link_next(); (try ring.bind(4, listen_fd, addrAny(&addr), @sizeOf(linux.sockaddr.in), 0)).link_next(); - _ = try ring.listen(5, listen_fd, 1, 0); + _ = try ring.listen(5, listen_fd, 1); // Submit 4 operations try testing.expectEqual(4, try ring.submit()); // Expect all to succeed From 23a45610ed3111351db2ff15b5e3c1d4dd8af031 Mon Sep 17 00:00:00 2001 From: Bernard Assan Date: Mon, 3 Nov 2025 16:33:40 +0000 Subject: [PATCH 39/41] IoUring: Fix merge conflicts Use flags in Threaded Change futex signature to expect *const atomic.Value(u32) Fix IoUring timeout test Signed-off-by: Bernard Assan --- lib/std/Io/Threaded.zig | 27 ++++++++++++---- lib/std/Thread.zig | 6 ++-- lib/std/Thread/Futex.zig | 4 +-- lib/std/os/linux.zig | 11 ++++--- lib/std/os/linux/IoUring.zig | 20 ++++++------ lib/std/os/linux/test.zig | 62 ++++++++++++++++++------------------ lib/std/posix.zig | 2 +- 7 files changed, 74 insertions(+), 58 deletions(-) diff --git a/lib/std/Io/Threaded.zig b/lib/std/Io/Threaded.zig index 0e5c12504b15..1707e3593657 100644 --- a/lib/std/Io/Threaded.zig +++ b/lib/std/Io/Threaded.zig @@ -1353,8 +1353,7 @@ fn dirStatPathLinux( var path_buffer: [posix.PATH_MAX]u8 = undefined; const sub_path_posix = try pathToPosix(sub_path, &path_buffer); - const flags: u32 = linux.AT.NO_AUTOMOUNT | - @as(u32, if (!options.follow_symlinks) linux.AT.SYMLINK_NOFOLLOW else 0); + const flags: linux.At = .{ .no_automount = true, .symlink_nofollow = !options.follow_symlinks }; while (true) { try t.checkCancel(); @@ -1363,7 +1362,15 @@ fn dirStatPathLinux( dir.handle, sub_path_posix, flags, - linux.STATX_INO | linux.STATX_SIZE | linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_ATIME | linux.STATX_MTIME | linux.STATX_CTIME, + .{ + .ino = true, + .size = true, + .type = true, + .mode = true, + .atime = true, + .mtime = true, + .ctime = true, + }, &statx, ); 
switch (linux.errno(rc)) { @@ -1509,8 +1516,16 @@ fn fileStatLinux(userdata: ?*anyopaque, file: Io.File) Io.File.StatError!Io.File const rc = linux.statx( file.handle, "", - linux.AT.EMPTY_PATH, - linux.STATX_INO | linux.STATX_SIZE | linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_ATIME | linux.STATX_MTIME | linux.STATX_CTIME, + .{ .empty_path = true }, + .{ + .ino = true, + .size = true, + .type = true, + .mode = true, + .atime = true, + .mtime = true, + .ctime = true, + }, &statx, ); switch (linux.errno(rc)) { @@ -5941,7 +5956,7 @@ pub fn futexWake(ptr: *const std.atomic.Value(u32), max_waiters: u32) void { .linux => { const linux = std.os.linux; switch (linux.errno(linux.futex_3arg( - &ptr.raw, + ptr, .{ .cmd = .WAKE, .private = true }, @min(max_waiters, std.math.maxInt(i32)), ))) { diff --git a/lib/std/Thread.zig b/lib/std/Thread.zig index 7e3b230cc409..72b96f8b6d27 100644 --- a/lib/std/Thread.zig +++ b/lib/std/Thread.zig @@ -1217,8 +1217,8 @@ const LinuxThreadImpl = struct { thread: *ThreadCompletion, const ThreadCompletion = struct { - completion: Completion = Completion.init(.running), - child_tid: std.atomic.Value(i32) = std.atomic.Value(i32).init(1), + completion: Completion = .init(.running), + child_tid: std.atomic.Value(i32) = .init(1), parent_tid: i32 = undefined, mapped: []align(std.heap.page_size_min) u8, @@ -1663,7 +1663,7 @@ const LinuxThreadImpl = struct { if (tid == 0) break; switch (linux.errno(linux.futex_4arg( - @ptrCast(&self.thread.child_tid.raw), + @ptrCast(&self.thread.child_tid), .{ .cmd = .WAIT, .private = false }, @bitCast(tid), null, diff --git a/lib/std/Thread/Futex.zig b/lib/std/Thread/Futex.zig index b61062c361a0..3f4cdda5bf02 100644 --- a/lib/std/Thread/Futex.zig +++ b/lib/std/Thread/Futex.zig @@ -263,7 +263,7 @@ const LinuxImpl = struct { } const rc = linux.futex_4arg( - &ptr.raw, + ptr, .{ .cmd = .WAIT, .private = true }, expect, if (timeout != null) &ts else null, @@ -285,7 +285,7 @@ const LinuxImpl = struct { fn wake(ptr: *const atomic.Value(u32), max_waiters: u32) void { const rc = linux.futex_3arg( - &ptr.raw, + ptr, .{ .cmd = .WAKE, .private = true }, @min(max_waiters, std.math.maxInt(i32)), ); diff --git a/lib/std/os/linux.zig b/lib/std/os/linux.zig index cbba463ffb9a..8948bec2b24d 100644 --- a/lib/std/os/linux.zig +++ b/lib/std/os/linux.zig @@ -4,6 +4,7 @@ //! * Implement all the syscalls in the same way that libc functions will //! provide `rename` when only the `renameat` syscall exists. const std = @import("../std.zig"); +const atomic = std.atomic; const builtin = @import("builtin"); const assert = std.debug.assert; const maxInt = std.math.maxInt; @@ -754,7 +755,7 @@ pub const futex_param4 = extern union { /// The futex_op parameter is a sub-command and flags. The sub-command /// defines which of the subsequent paramters are relevant. pub fn futex( - uaddr: *const u32, + uaddr: *const atomic.Value(u32), futex_op: FUTEX_OP, val: u32, val2timeout: futex_param4, @@ -774,7 +775,7 @@ pub fn futex( /// Three-argument variation of the v1 futex call. Only suitable for a /// futex_op that ignores the remaining arguments (e.g., FUTUX_OP.WAKE). 
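/// Example drawn from the futex v1 test further below (a sketch; `lock` is
/// a `std.atomic.Value(u32)`, and up to 2 waiters are woken):
///     rc = linux.futex_3arg(&lock, .{ .cmd = .WAKE, .private = true }, 2);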
-pub fn futex_3arg(uaddr: *const u32, futex_op: FUTEX_OP, val: u32) usize { +pub fn futex_3arg(uaddr: *const atomic.Value(u32), futex_op: FUTEX_OP, val: u32) usize { return syscall3( if (@hasField(SYS, "futex") and native_arch != .hexagon) .futex else .futex_time64, @intFromPtr(uaddr), @@ -785,7 +786,7 @@ pub fn futex_3arg(uaddr: *const u32, futex_op: FUTEX_OP, val: u32) usize { /// Four-argument variation on the v1 futex call. Only suitable for /// futex_op that ignores the remaining arguments (e.g., FUTEX_OP.WAIT). -pub fn futex_4arg(uaddr: *const u32, futex_op: FUTEX_OP, val: u32, timeout: ?*const timespec) usize { +pub fn futex_4arg(uaddr: *const atomic.Value(u32), futex_op: FUTEX_OP, val: u32, timeout: ?*const timespec) usize { return syscall4( if (@hasField(SYS, "futex") and native_arch != .hexagon) .futex else .futex_time64, @intFromPtr(uaddr), @@ -838,7 +839,7 @@ pub fn futex2_waitv( /// Requires at least kernel v6.7. pub fn futex2_wait( /// Address of the futex to wait on. - uaddr: *const u32, + uaddr: *const atomic.Value(u32), /// Value of `uaddr`. val: usize, /// Bitmask to match against incoming wakeup masks. Must not be zero. @@ -867,7 +868,7 @@ pub fn futex2_wait( /// Requires at least kernel v6.7. pub fn futex2_wake( /// Futex to wake - uaddr: *const u32, + uaddr: *const atomic.Value(u32), /// Bitmask to match against waiters. mask: Futex2.Bitset, /// Maximum number of waiters on the futex to wake. diff --git a/lib/std/os/linux/IoUring.zig b/lib/std/os/linux/IoUring.zig index e5688482609e..8d201e5a6fa4 100644 --- a/lib/std/os/linux/IoUring.zig +++ b/lib/std/os/linux/IoUring.zig @@ -1694,7 +1694,7 @@ pub fn socket( domain: linux.Af, socket_type: linux.Sock, protocol: linux.IpProto, - /// flags is unused + /// flags are currently unused flags: u32, ) !*Sqe { const sqe = try self.get_sqe(); @@ -1711,7 +1711,7 @@ pub fn socket_direct( domain: linux.Af, socket_type: linux.Sock, protocol: linux.IpProto, - /// flags is unused + /// flags are currently unused flags: u32, file_index: u32, ) !*Sqe { @@ -1731,7 +1731,7 @@ pub fn socket_direct_alloc( domain: linux.Af, socket_type: linux.Sock, protocol: linux.IpProto, - /// flags unused + /// flags are currently unused flags: u32, ) !*Sqe { const sqe = try self.get_sqe(); @@ -5377,8 +5377,8 @@ test "timeout (after a relative time)" { }; defer ring.deinit(); - const ms = 10; - const ts: linux.kernel_timespec = .{ .sec = 0, .nsec = ms * 1000000 }; + const ms = 5; + const ts: linux.kernel_timespec = .{ .sec = 0, .nsec = ms * 1_000_000 }; const started = try std.Io.Clock.awake.now(io); const sqe = try ring.timeout(0x55555555, &ts, 0, .{}); @@ -5394,9 +5394,9 @@ test "timeout (after a relative time)" { }, cqe); // Tests should not depend on timings: skip test if outside margin. 
- const margin = 5; + const ms_margin = 5; const ms_elapsed = started.durationTo(stopped).toMilliseconds(); - if (ms_elapsed > margin) return error.SkipZigTest; + if (ms_elapsed > ms_margin) return error.SkipZigTest; } test "timeout (after a number of completions)" { @@ -5802,7 +5802,7 @@ test "shutdown" { // Socket bound, expect shutdown to work { // TODO: update posix later to use Typed Flags - const server = try posix.socket(address.any.family, @as(u32, @bitCast(linux.Sock{ .type = .stream, .flags = .{ .cloexec = true } })), 0); + const server = try posix.socket(address.family, @as(u32, @bitCast(linux.Sock{ .type = .stream, .flags = .{ .cloexec = true } })), 0); defer posix.close(server); try posix.setsockopt(server, posix.SOL.SOCKET, posix.SO.REUSEADDR, &mem.toBytes(@as(u32, 1))); try posix.bind(server, addrAny(&address), @sizeOf(linux.sockaddr.in)); @@ -7460,7 +7460,7 @@ test "bind/listen/connect" { const listen_fd = brk: { // Create socket - _ = try ring.socket(1, @enumFromInt(addr.any.family), .{ .type = .stream, .flags = .{ .cloexec = true } }, @enumFromInt(proto), 0); + _ = try ring.socket(1, @enumFromInt(addr.family), .{ .type = .stream, .flags = .{ .cloexec = true } }, @enumFromInt(proto), 0); try testing.expectEqual(1, try ring.submit()); var cqe = try ring.copy_cqe(); try testing.expectEqual(1, cqe.user_data); @@ -7472,7 +7472,7 @@ test "bind/listen/connect" { var optval: u32 = 1; (try ring.setsockopt(2, listen_fd, .socket, .reuseaddr, mem.asBytes(&optval))).link_next(); (try ring.setsockopt(3, listen_fd, .socket, .reuseport, mem.asBytes(&optval))).link_next(); - (try ring.bind(4, listen_fd, addrAny(&addr), @sizeOf(linux.sockaddr.in), 0)).link_next(); + (try ring.bind(4, listen_fd, addrAny(&addr), @sizeOf(linux.sockaddr.in))).link_next(); _ = try ring.listen(5, listen_fd, 1); // Submit 4 operations try testing.expectEqual(4, try ring.submit()); diff --git a/lib/std/os/linux/test.zig b/lib/std/os/linux/test.zig index fffc8abec349..c0db602110d2 100644 --- a/lib/std/os/linux/test.zig +++ b/lib/std/os/linux/test.zig @@ -200,28 +200,28 @@ test "futex v1" { var rc: usize = 0; // No-op wait, lock value is not expected value - rc = linux.futex(&lock.raw, .{ .cmd = .WAIT, .private = true }, 2, .{ .timeout = null }, null, 0); + rc = linux.futex(&lock, .{ .cmd = .WAIT, .private = true }, 2, .{ .timeout = null }, null, 0); try expectEqual(.AGAIN, linux.errno(rc)); - rc = linux.futex_4arg(&lock.raw, .{ .cmd = .WAIT, .private = true }, 2, null); + rc = linux.futex_4arg(&lock, .{ .cmd = .WAIT, .private = true }, 2, null); try expectEqual(.AGAIN, linux.errno(rc)); // Short-fuse wait, timeout kicks in - rc = linux.futex(&lock.raw, .{ .cmd = .WAIT, .private = true }, 1, .{ .timeout = &.{ .sec = 0, .nsec = 2 } }, null, 0); + rc = linux.futex(&lock, .{ .cmd = .WAIT, .private = true }, 1, .{ .timeout = &.{ .sec = 0, .nsec = 2 } }, null, 0); try expectEqual(.TIMEDOUT, linux.errno(rc)); - rc = linux.futex_4arg(&lock.raw, .{ .cmd = .WAIT, .private = true }, 1, &.{ .sec = 0, .nsec = 2 }); + rc = linux.futex_4arg(&lock, .{ .cmd = .WAIT, .private = true }, 1, &.{ .sec = 0, .nsec = 2 }); try expectEqual(.TIMEDOUT, linux.errno(rc)); // Wakeup (no waiters) - rc = linux.futex(&lock.raw, .{ .cmd = .WAKE, .private = true }, 2, .{ .timeout = null }, null, 0); + rc = linux.futex(&lock, .{ .cmd = .WAKE, .private = true }, 2, .{ .timeout = null }, null, 0); try expectEqual(0, rc); - rc = linux.futex_3arg(&lock.raw, .{ .cmd = .WAKE, .private = true }, 2); + rc = linux.futex_3arg(&lock, .{ .cmd = .WAKE, .private = 
true }, 2); try expectEqual(0, rc); // CMP_REQUEUE - val3 mismatch - rc = linux.futex(&lock.raw, .{ .cmd = .CMP_REQUEUE, .private = true }, 2, .{ .val2 = 0 }, null, 99); + rc = linux.futex(&lock, .{ .cmd = .CMP_REQUEUE, .private = true }, 2, .{ .val2 = 0 }, null, 99); try expectEqual(.AGAIN, linux.errno(rc)); // CMP_REQUEUE - requeue (but no waiters, so ... not much) @@ -229,14 +229,14 @@ test "futex v1" { const val3 = 1; const wake_nr = 3; const requeue_max = std.math.maxInt(u31); - var target_lock: std.atomic.Value(u32) = .init(1); - rc = linux.futex(&lock.raw, .{ .cmd = .CMP_REQUEUE, .private = true }, wake_nr, .{ .val2 = requeue_max }, &target_lock.raw, val3); + const target_lock: std.atomic.Value(u32) = .init(1); + rc = linux.futex(&lock, .{ .cmd = .CMP_REQUEUE, .private = true }, wake_nr, .{ .val2 = requeue_max }, &target_lock, val3); try expectEqual(0, rc); } // WAKE_OP - just to see if we can construct the arguments ... { - var lock2: std.atomic.Value(u32) = .init(1); + const lock2: std.atomic.Value(u32) = .init(1); const wake1_nr = 2; const wake2_nr = 3; const wake_op = linux.FUTEX_WAKE_OP{ @@ -247,29 +247,29 @@ test "futex v1" { .cmdarg = 5, }; - rc = linux.futex(&lock.raw, .{ .cmd = .WAKE_OP, .private = true }, wake1_nr, .{ .val2 = wake2_nr }, &lock2.raw, @bitCast(wake_op)); + rc = linux.futex(&lock, .{ .cmd = .WAKE_OP, .private = true }, wake1_nr, .{ .val2 = wake2_nr }, &lock2, @bitCast(wake_op)); try expectEqual(0, rc); } // WAIT_BITSET { // val1 return early - rc = linux.futex(&lock.raw, .{ .cmd = .WAIT_BITSET, .private = true }, 2, .{ .timeout = null }, null, 0xfff); + rc = linux.futex(&lock, .{ .cmd = .WAIT_BITSET, .private = true }, 2, .{ .timeout = null }, null, 0xfff); try expectEqual(.AGAIN, linux.errno(rc)); // timeout wait const timeout: linux.timespec = .{ .sec = 0, .nsec = 2 }; - rc = linux.futex(&lock.raw, .{ .cmd = .WAIT_BITSET, .private = true }, 1, .{ .timeout = &timeout }, null, 0xfff); + rc = linux.futex(&lock, .{ .cmd = .WAIT_BITSET, .private = true }, 1, .{ .timeout = &timeout }, null, 0xfff); try expectEqual(.TIMEDOUT, linux.errno(rc)); } // WAKE_BITSET { - rc = linux.futex(&lock.raw, .{ .cmd = .WAKE_BITSET, .private = true }, 2, .{ .timeout = null }, null, 0xfff000); + rc = linux.futex(&lock, .{ .cmd = .WAKE_BITSET, .private = true }, 2, .{ .timeout = null }, null, 0xfff000); try expectEqual(0, rc); // bitmask must have at least 1 bit set: - rc = linux.futex(&lock.raw, .{ .cmd = .WAKE_BITSET, .private = true }, 2, .{ .timeout = null }, null, 0); + rc = linux.futex(&lock, .{ .cmd = .WAKE_BITSET, .private = true }, 2, .{ .timeout = null }, null, 0); try expectEqual(.INVAL, linux.errno(rc)); } } @@ -289,17 +289,17 @@ test "futex2_waitv" { const futexes: [3]linux.Futex2.WaitOne = .{ .{ .val = 1, - .uaddr = @intFromPtr(&locks[0].raw), + .uaddr = @intFromPtr(&locks[0]), .flags = .{ .size = .U32, .private = true }, }, .{ .val = 1, - .uaddr = @intFromPtr(&locks[1].raw), + .uaddr = @intFromPtr(&locks[1]), .flags = .{ .size = .U32, .private = true }, }, .{ .val = 1, - .uaddr = @intFromPtr(&locks[2].raw), + .uaddr = @intFromPtr(&locks[2]), .flags = .{ .size = .U32, .private = true }, }, }; @@ -316,7 +316,7 @@ test "futex2_waitv" { // Futex v2 API is only supported on recent kernels (v6.7), so skip tests if the syscalls // return ENOSYS. 
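// Each futex2 test invokes this guard first, e.g.:
//     try futex2_skip_if_unsupported();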
fn futex2_skip_if_unsupported() !void { - const lock: u32 = 0; + const lock: std.atomic.Value(u32) = .init(0); const rc = linux.futex2_wake(&lock, .empty, 1, .{ .size = .U32, .private = true }); if (linux.errno(rc) == .NOSYS) { return error.SkipZigTest; @@ -324,7 +324,7 @@ fn futex2_skip_if_unsupported() !void { } test "futex2_wait" { - var lock: std.atomic.Value(u32) = .init(1); + const lock: std.atomic.Value(u32) = .init(1); var rc: usize = 0; const mask: linux.Futex2.Bitset = .{ .waiter1 = true }; @@ -333,23 +333,23 @@ test "futex2_wait" { // The API for 8,16,64 bit futexes is defined, but as of kernel v6.14 // (at least) they're not implemented. if (false) { - rc = linux.futex2_wait(&lock.raw, 1, mask, .{ .size = .U8, .private = true }, null, .MONOTONIC); + rc = linux.futex2_wait(&lock, 1, mask, .{ .size = .U8, .private = true }, null, .MONOTONIC); try expectEqual(.INVAL, linux.errno(rc)); - rc = linux.futex2_wait(&lock.raw, 1, mask, .{ .size = .U16, .private = true }, null, .MONOTONIC); + rc = linux.futex2_wait(&lock, 1, mask, .{ .size = .U16, .private = true }, null, .MONOTONIC); try expectEqual(.INVAL, linux.errno(rc)); - rc = linux.futex2_wait(&lock.raw, 1, mask, .{ .size = .U64, .private = true }, null, .MONOTONIC); + rc = linux.futex2_wait(&lock, 1, mask, .{ .size = .U64, .private = true }, null, .MONOTONIC); try expectEqual(.INVAL, linux.errno(rc)); } const flags: linux.Futex2.Wait = .{ .size = .U32, .private = true }; // no-wait, lock state mismatch - rc = linux.futex2_wait(&lock.raw, 2, mask, flags, null, .MONOTONIC); + rc = linux.futex2_wait(&lock, 2, mask, flags, null, .MONOTONIC); try expectEqual(.AGAIN, linux.errno(rc)); // hit timeout on wait - rc = linux.futex2_wait(&lock.raw, 1, mask, flags, &.{ .sec = 0, .nsec = 2 }, .MONOTONIC); + rc = linux.futex2_wait(&lock, 1, mask, flags, &.{ .sec = 0, .nsec = 2 }, .MONOTONIC); try expectEqual(.TIMEDOUT, linux.errno(rc)); // timeout is absolute @@ -363,20 +363,20 @@ test "futex2_wait" { .sec = curr.sec, .nsec = curr.nsec + 2, }; - rc = linux.futex2_wait(&lock.raw, 1, mask, flags, &timeout, .MONOTONIC); + rc = linux.futex2_wait(&lock, 1, mask, flags, &timeout, .MONOTONIC); try expectEqual(.TIMEDOUT, linux.errno(rc)); } - rc = linux.futex2_wait(&lock.raw, 1, mask, flags, &.{ .sec = 0, .nsec = 2 }, .REALTIME); + rc = linux.futex2_wait(&lock, 1, mask, flags, &.{ .sec = 0, .nsec = 2 }, .REALTIME); try expectEqual(.TIMEDOUT, linux.errno(rc)); } test "futex2_wake" { - var lock: std.atomic.Value(u32) = .init(1); + const lock: std.atomic.Value(u32) = .init(1); try futex2_skip_if_unsupported(); - const rc = linux.futex2_wake(&lock.raw, .fromInt(0xFF), 1, .{ .size = .U32, .private = true }); + const rc = linux.futex2_wake(&lock, .fromInt(0xFF), 1, .{ .size = .U32, .private = true }); try expectEqual(0, rc); } @@ -391,12 +391,12 @@ test "futex2_requeue" { const futexes: [2]linux.Futex2.WaitOne = .{ .{ .val = 1, - .uaddr = @intFromPtr(&locks[0].raw), + .uaddr = @intFromPtr(&locks[0]), .flags = .{ .size = .U32, .private = true }, }, .{ .val = 1, - .uaddr = @intFromPtr(&locks[1].raw), + .uaddr = @intFromPtr(&locks[1]), .flags = .{ .size = .U32, .private = true }, }, }; diff --git a/lib/std/posix.zig b/lib/std/posix.zig index 3e80466cd3b7..5e01f36bfcac 100644 --- a/lib/std/posix.zig +++ b/lib/std/posix.zig @@ -3889,7 +3889,7 @@ pub fn fstatatZ(dirfd: fd_t, pathname: [*:0]const u8, flags: u32) FStatAtError!S const fstatat_sym = if (lfs64_abi) system.fstatat64 else system.fstatat; var stat = mem.zeroes(Stat); - switch (errno(fstatat_sym(dirfd, 
pathname, &stat, flags))) { + switch (errno(fstatat_sym(dirfd, pathname, &stat, @bitCast(flags)))) { .SUCCESS => return stat, .INVAL => unreachable, .BADF => unreachable, // Always a race condition. From cc474ab0323f26cf1679165a09ac9e8faa7a973a Mon Sep 17 00:00:00 2001 From: Bernard Assan Date: Fri, 14 Nov 2025 14:17:13 +0000 Subject: [PATCH 40/41] Fix `At` flags for real The fix got reverted in the merge conflict, but this now properly fixes `At`. Update posix `At` to use a packed struct. (I was holding back from touching `posix` and `c`, but I might as well just get this done instead of using the deprecated constants, since the diff is already big and `At` is worthwhile.) Signed-off-by: Bernard Assan --- lib/std/Io/Dir.zig | 2 +- lib/std/Io/Threaded.zig | 10 +- lib/std/c.zig | 313 +++++++++++++++++++++++++++++----------- lib/std/fs.zig | 2 +- lib/std/fs/Dir.zig | 10 +- lib/std/os/linux.zig | 4 +- lib/std/posix.zig | 34 +++-- lib/std/posix/test.zig | 10 +- 8 files changed, 263 insertions(+), 122 deletions(-) diff --git a/lib/std/Io/Dir.zig b/lib/std/Io/Dir.zig index 5f66ec70947c..710ad7f23ee9 100644 --- a/lib/std/Io/Dir.zig +++ b/lib/std/Io/Dir.zig @@ -24,7 +24,7 @@ pub fn cwd() Dir { return switch (native_os) { .windows => .{ .handle = std.os.windows.peb().ProcessParameters.CurrentDirectory.Handle }, .wasi => .{ .handle = std.options.wasiCwd() }, - else => .{ .handle = std.posix.AT.FDCWD }, + else => .{ .handle = std.posix.At.fdcwd }, }; } diff --git a/lib/std/Io/Threaded.zig b/lib/std/Io/Threaded.zig index 1707e3593657..8f8d3ce7713b 100644 --- a/lib/std/Io/Threaded.zig +++ b/lib/std/Io/Threaded.zig @@ -1403,12 +1403,12 @@ fn dirStatPathPosix( var path_buffer: [posix.PATH_MAX]u8 = undefined; const sub_path_posix = try pathToPosix(sub_path, &path_buffer); - const flags: u32 = if (!options.follow_symlinks) posix.AT.SYMLINK_NOFOLLOW else 0; + const flags: posix.At = .{ .symlink_nofollow = !options.follow_symlinks }; while (true) { try t.checkCancel(); var stat = std.mem.zeroes(posix.Stat); - switch (posix.errno(fstatat_sym(dir.handle, sub_path_posix, &stat, flags))) { + switch (posix.errno(fstatat_sym(dir.handle, sub_path_posix, &stat, @bitCast(flags)))) { .SUCCESS => return statFromPosix(&stat), .INTR => continue, .CANCELED => return error.Canceled, @@ -1632,7 +1632,7 @@ fn dirAccessPosix( var path_buffer: [posix.PATH_MAX]u8 = undefined; const sub_path_posix = try pathToPosix(sub_path, &path_buffer); - const flags: u32 = @as(u32, if (!options.follow_symlinks) posix.AT.SYMLINK_NOFOLLOW else 0); + const flags: posix.At = .{ .symlink_nofollow = !options.follow_symlinks }; const mode: u32 = @as(u32, if (options.read) posix.R_OK else 0) | @@ -1641,7 +1641,7 @@ fn dirAccessPosix( while (true) { try t.checkCancel(); - switch (posix.errno(posix.system.faccessat(dir.handle, sub_path_posix, mode, flags))) { + switch (posix.errno(posix.system.faccessat(dir.handle, sub_path_posix, mode, @bitCast(flags)))) { .SUCCESS => return, .INTR => continue, .CANCELED => return error.Canceled, @@ -2901,7 +2901,7 @@ fn fileSeekTo(userdata: ?*anyopaque, file: Io.File, offset: u64) Io.File.SeekErr fn openSelfExe(userdata: ?*anyopaque, flags: Io.File.OpenFlags) Io.File.OpenSelfExeError!Io.File { const t: *Threaded = @ptrCast(@alignCast(userdata)); switch (native_os) { - .linux, .serenity => return dirOpenFilePosix(t, .{ .handle = posix.AT.FDCWD }, "/proc/self/exe", flags), + .linux, .serenity => return dirOpenFilePosix(t, .{ .handle = posix.At.fdcwd }, "/proc/self/exe", flags), .windows => {
// If ImagePathName is a symlink, then it will contain the path of the symlink, // not the path that the symlink points to. However, because we are opening diff --git a/lib/std/c.zig b/lib/std/c.zig index f22112a086f7..f164702ebb1f 100644 --- a/lib/std/c.zig +++ b/lib/std/c.zig @@ -8375,128 +8375,267 @@ pub const port_event = switch (native_os) { else => void, }; -pub const AT = switch (native_os) { - .linux => linux.AT, - .windows => struct { +/// DEPRECATED: use `At` +pub const AT = At; + +pub const At = switch (native_os) { + .emscripten, .linux => linux.At, + .windows => packed struct(u32) { + _: u9 = 0, /// Remove directory instead of unlinking file - pub const REMOVEDIR = 0x200; + removedir: bool = false, + _11: u22 = 0, + + /// DEPRECATED: use `removedir` + pub const REMOVEDIR: u32 = @bitCast(At{ .removedir = true }); }, - .driverkit, .ios, .maccatalyst, .macos, .tvos, .visionos, .watchos => struct { - pub const FDCWD = -2; + .driverkit, .ios, .maccatalyst, .macos, .tvos, .visionos, .watchos => packed struct(u32) { + _: u4 = 0, /// Use effective ids in access check - pub const EACCESS = 0x0010; + eaccess: bool = false, /// Act on the symlink itself not the target - pub const SYMLINK_NOFOLLOW = 0x0020; + symlink_nofollow: bool = false, /// Act on target of symlink - pub const SYMLINK_FOLLOW = 0x0040; + symlink_follow: bool = false, /// Path refers to directory - pub const REMOVEDIR = 0x0080; + removedir: bool = false, + _9: u24 = 0, + + pub const fdcwd: fd_t = -2; + + /// DEPRECATED: use `eaccess` + pub const EACCESS: u32 = @bitCast(At{ .eaccess = true }); + /// DEPRECATED: use `symlink_nofollow` + pub const SYMLINK_NOFOLLOW: u32 = @bitCast(At{ .symlink_nofollow = true }); + /// DEPRECATED: use `symlink_follow` + pub const SYMLINK_FOLLOW: u32 = @bitCast(At{ .symlink_follow = true }); + /// DEPRECATED: use `removedir` + pub const REMOVEDIR: u32 = @bitCast(At{ .removedir = true }); + /// DEPRECATED: use `fdcwd` + pub const FDCWD = fdcwd; }, - .freebsd => struct { - /// Magic value that specify the use of the current working directory - /// to determine the target of relative file paths in the openat() and - /// similar syscalls. - pub const FDCWD = -100; + .freebsd => packed struct(u32) { + _: u8 = 0, /// Check access using effective user and group ID - pub const EACCESS = 0x0100; + eaccess: bool = false, /// Do not follow symbolic links - pub const SYMLINK_NOFOLLOW = 0x0200; + symlink_nofollow: bool = false, /// Follow symbolic link - pub const SYMLINK_FOLLOW = 0x0400; + symlink_follow: bool = false, /// Remove directory instead of file - pub const REMOVEDIR = 0x0800; + removedir: bool = false, /// Fail if not under dirfd - pub const BENEATH = 0x1000; - }, - .netbsd => struct { + beneath: bool = false, + _14: u19 = 0, + /// Magic value that specifies the use of the current working directory /// to determine the target of relative file paths in the openat() and /// similar syscalls. 
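Every deprecated SCREAMING_CASE constant above is defined as `@bitCast` of a typed `At` value, so each packed-struct bit position must reproduce the historical AT_* number exactly. A hypothetical comptime check for the Darwin block just shown, with the expected values taken from the removed lines:

comptime {
    const assert = @import("std").debug.assert;
    // `_: u4 = 0` pads the low bits, so `eaccess` lands on bit 4 (0x0010)
    // and the next three flags on bits 5..7, matching the old constants.
    assert(@as(u32, @bitCast(At{ .eaccess = true })) == 0x0010);
    assert(@as(u32, @bitCast(At{ .symlink_nofollow = true })) == 0x0020);
    assert(@as(u32, @bitCast(At{ .symlink_follow = true })) == 0x0040);
    assert(@as(u32, @bitCast(At{ .removedir = true })) == 0x0080);
}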
- pub const FDCWD = -100; + pub const fdcwd: fd_t = -100; + + /// DEPRECATED: use `eaccess` + pub const EACCESS: u32 = @bitCast(At{ .eaccess = true }); + /// DEPRECATED: use `symlink_nofollow` + pub const SYMLINK_NOFOLLOW: u32 = @bitCast(At{ .symlink_nofollow = true }); + /// DEPRECATED: use `symlink_follow` + pub const SYMLINK_FOLLOW: u32 = @bitCast(At{ .symlink_follow = true }); + /// DEPRECATED: use `removedir` + pub const REMOVEDIR: u32 = @bitCast(At{ .removedir = true }); + /// DEPRECATED: use `beneath` + pub const BENEATH: u32 = @bitCast(At{ .beneath = true }); + + /// DEPRECATED: use `fdcwd` + pub const FDCWD = fdcwd; + }, + .netbsd => packed struct(u32) { + _: u8 = 0, /// Check access using effective user and group ID - pub const EACCESS = 0x0100; + eaccess: bool = false, /// Do not follow symbolic links - pub const SYMLINK_NOFOLLOW = 0x0200; + symlink_nofollow: bool = false, /// Follow symbolic link - pub const SYMLINK_FOLLOW = 0x0400; + symlink_follow: bool = false, /// Remove directory instead of file - pub const REMOVEDIR = 0x0800; - }, - .dragonfly => struct { - pub const FDCWD = -328243; - pub const SYMLINK_NOFOLLOW = 1; - pub const REMOVEDIR = 2; - pub const EACCESS = 4; - pub const SYMLINK_FOLLOW = 8; - }, - .openbsd => struct { + removedir: bool = false, + _12: u20 = 0, + /// Magic value that specifies the use of the current working directory /// to determine the target of relative file paths in the openat() and /// similar syscalls. - pub const FDCWD = -100; + pub const fdcwd: fd_t = -100; + + /// DEPRECATED: use `eaccess` + pub const EACCESS: u32 = @bitCast(At{ .eaccess = true }); + /// DEPRECATED: use `symlink_nofollow` + pub const SYMLINK_NOFOLLOW: u32 = @bitCast(At{ .symlink_nofollow = true }); + /// DEPRECATED: use `symlink_follow` + pub const SYMLINK_FOLLOW: u32 = @bitCast(At{ .symlink_follow = true }); + /// DEPRECATED: use `removedir` + pub const REMOVEDIR: u32 = @bitCast(At{ .removedir = true }); + + /// DEPRECATED: use `fdcwd` + pub const FDCWD = fdcwd; + }, + .dragonfly => packed struct(u32) { + /// Do not follow symbolic links + symlink_nofollow: bool = false, + /// Remove directory instead of file + removedir: bool = false, + /// Check access using effective user and group ID + eaccess: bool = false, + /// Follow symbolic link + symlink_follow: bool = false, + _: u28 = 0, + + pub const fdcwd: fd_t = -328243; + + /// DEPRECATED: use `symlink_nofollow` + pub const SYMLINK_NOFOLLOW: u32 = @bitCast(At{ .symlink_nofollow = true }); + /// DEPRECATED: use `removedir` + pub const REMOVEDIR: u32 = @bitCast(At{ .removedir = true }); + /// DEPRECATED: use `eaccess` + pub const EACCESS: u32 = @bitCast(At{ .eaccess = true }); + /// DEPRECATED: use `symlink_follow` + pub const SYMLINK_FOLLOW: u32 = @bitCast(At{ .symlink_follow = true }); + + /// DEPRECATED: use `fdcwd` + pub const FDCWD = fdcwd; + }, + .openbsd => packed struct(u32) { /// Check access using effective user and group ID - pub const EACCESS = 0x01; + eaccess: bool = false, /// Do not follow symbolic links - pub const SYMLINK_NOFOLLOW = 0x02; + symlink_nofollow: bool = false, /// Follow symbolic link - pub const SYMLINK_FOLLOW = 0x04; + symlink_follow: bool = false, /// Remove directory instead of file - pub const REMOVEDIR = 0x08; - }, - .haiku => struct { - pub const FDCWD = -1; - pub const SYMLINK_NOFOLLOW = 0x01; - pub const SYMLINK_FOLLOW = 0x02; - pub const REMOVEDIR = 0x04; - pub const EACCESS = 0x08; - }, - .illumos => struct { + removedir: bool = false, + _: u28 = 0, + /// Magic value that specifies the 
use of the current working directory /// to determine the target of relative file paths in the openat() and /// similar syscalls. - pub const FDCWD: fd_t = @bitCast(@as(u32, 0xffd19553)); + pub const fdcwd: fd_t = -100; + + /// DEPRECATED: use `eaccess` + pub const EACCESS: u32 = @bitCast(At{ .eaccess = true }); + /// DEPRECATED: use `symlink_nofollow` + pub const SYMLINK_NOFOLLOW: u32 = @bitCast(At{ .symlink_nofollow = true }); + /// DEPRECATED: use `symlink_follow` + pub const SYMLINK_FOLLOW: u32 = @bitCast(At{ .symlink_follow = true }); + /// DEPRECATED: use `removedir` + pub const REMOVEDIR: u32 = @bitCast(At{ .removedir = true }); + + /// DEPRECATED: use `fdcwd` + pub const FDCWD = fdcwd; + }, + .haiku => packed struct(u32) { /// Do not follow symbolic links - pub const SYMLINK_NOFOLLOW = 0x1000; + symlink_nofollow: bool = false, /// Follow symbolic link - pub const SYMLINK_FOLLOW = 0x2000; + symlink_follow: bool = false, /// Remove directory instead of file - pub const REMOVEDIR = 0x1; - pub const TRIGGER = 0x2; + removedir: bool = false, /// Check access using effective user and group ID - pub const EACCESS = 0x4; - }, - .emscripten => struct { - pub const FDCWD = -100; - pub const SYMLINK_NOFOLLOW = 0x100; - pub const REMOVEDIR = 0x200; - pub const SYMLINK_FOLLOW = 0x400; - pub const NO_AUTOMOUNT = 0x800; - pub const EMPTY_PATH = 0x1000; - pub const STATX_SYNC_TYPE = 0x6000; - pub const STATX_SYNC_AS_STAT = 0x0000; - pub const STATX_FORCE_SYNC = 0x2000; - pub const STATX_DONT_SYNC = 0x4000; - pub const RECURSIVE = 0x8000; + eaccess: bool = false, + _: u28 = 0, + + pub const fdcwd: fd_t = -1; + + /// DEPRECATED: use `symlink_nofollow` + pub const SYMLINK_NOFOLLOW: u32 = @bitCast(At{ .symlink_nofollow = true }); + /// DEPRECATED: use `symlink_follow` + pub const SYMLINK_FOLLOW: u32 = @bitCast(At{ .symlink_follow = true }); + /// DEPRECATED: use `removedir` + pub const REMOVEDIR: u32 = @bitCast(At{ .removedir = true }); + /// DEPRECATED: use `eaccess` + pub const EACCESS: u32 = @bitCast(At{ .eaccess = true }); + + /// DEPRECATED: use `fdcwd` + pub const FDCWD = fdcwd; }, - .wasi => struct { - // Match `AT_*` constants in lib/libc/include/wasm-wasi-musl/__header_fcntl.h - pub const EACCESS = 0x0; - pub const SYMLINK_NOFOLLOW = 0x1; - pub const SYMLINK_FOLLOW = 0x2; - pub const REMOVEDIR = 0x4; - /// When linking libc, we follow their convention and use -2 for current working directory. - /// However, without libc, Zig does a different convention: it assumes the - /// current working directory is the first preopen. This behavior can be - /// overridden with a public function called `wasi_cwd` in the root source - /// file. - pub const FDCWD: fd_t = if (builtin.link_libc) -2 else 3; + .illumos => packed struct(u32) { + /// Remove directory instead of file + removedir: bool = false, + trigger: bool = false, + /// Check access using effective user and group ID + eaccess: bool = false, + _4: u9 = 0, + /// Do not follow symbolic links + symlink_nofollow: bool = false, + /// Follow symbolic link + symlink_follow: bool = false, + _15: u18 = 0, + + /// Magic value that specifies the use of the current working directory + /// to determine the target of relative file paths in the openat() and + /// similar syscalls. 
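Until every extern is converted, call sites bridge the typed flags back to the legacy u32 ABI with `@bitCast`, as the Threaded.zig hunks of this patch do for `faccessat`. A sketch under those assumptions (the wrapper name is hypothetical; field availability varies per OS as the blocks above show):

const std = @import("std");
const posix = std.posix;

fn canAccessNoFollow(dir_fd: posix.fd_t, path: [*:0]const u8) bool {
    // Build the flags as a struct literal, then flatten to u32 at the ABI edge.
    const flags: posix.At = .{ .eaccess = true, .symlink_nofollow = true };
    const rc = posix.system.faccessat(dir_fd, path, posix.F_OK, @bitCast(flags));
    return posix.errno(rc) == .SUCCESS;
}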
+ pub const fdcwd: fd_t = @bitCast(@as(u32, 0xffd19553)); + + /// DEPRECATED: use `removedir` + pub const REMOVEDIR: u32 = @bitCast(At{ .removedir = true }); + /// DEPRECATED: use `trigger` + pub const TRIGGER: u32 = @bitCast(At{ .trigger = true }); + /// DEPRECATED: use `eaccess` + pub const EACCESS: u32 = @bitCast(At{ .eaccess = true }); + /// DEPRECATED: use `symlink_nofollow` + pub const SYMLINK_NOFOLLOW: u32 = @bitCast(At{ .symlink_nofollow = true }); + /// DEPRECATED: use `symlink_follow` + pub const SYMLINK_FOLLOW: u32 = @bitCast(At{ .symlink_follow = true }); + + /// DEPRECATED: use `fdcwd` + pub const FDCWD = fdcwd; + }, + // Match `AT_*` constants in lib/libc/include/wasm-wasi-musl/__header_fcntl.h + .wasi => packed struct(u32) { + /// Do not follow symbolic links + symlink_nofollow: bool = false, + /// Follow symbolic link + symlink_follow: bool = false, + /// Remove directory instead of file + removedir: bool = false, + _: u29 = 0, + + pub const eaccess: u32 = @bitCast(At{}); + + /// When linking libc, we follow their convention and use -2 for + /// current working directory. However, without libc, Zig does a + /// different convention: it assumes the current working directory is + /// the first preopen. This behavior can be overridden with a public + /// function called `wasi_cwd` in the root source file. + pub const fdcwd: fd_t = if (builtin.link_libc) -2 else 3; + + /// DEPRECATED: use `symlink_nofollow` + pub const SYMLINK_NOFOLLOW: u32 = @bitCast(At{ .symlink_nofollow = true }); + /// DEPRECATED: use `symlink_follow` + pub const SYMLINK_FOLLOW: u32 = @bitCast(At{ .symlink_follow = true }); + /// DEPRECATED: use `removedir` + pub const REMOVEDIR: u32 = @bitCast(At{ .removedir = true }); + /// DEPRECATED: use `fdcwd` + pub const FDCWD = fdcwd; + /// DEPRECATED: use `eaccess` + pub const EACCESS: u32 = eaccess; }, // https://github.com/SerenityOS/serenity/blob/2808b0376406a40e31293bb3bcb9170374e90506/Kernel/API/POSIX/fcntl.h#L49-L52 - .serenity => struct { - pub const FDCWD = -100; - pub const SYMLINK_NOFOLLOW = 0x100; - pub const REMOVEDIR = 0x200; - pub const EACCESS = 0x400; + .serenity => packed struct(u32) { + _: u8 = 0, + /// Do not follow symbolic links + symlink_nofollow: bool = false, + /// Remove directory instead of file + removedir: bool = false, + /// Check access using effective user and group ID + eaccess: bool = false, + _11: u21 = 0, + + pub const fdcwd: fd_t = -100; + + /// DEPRECATED: use `symlink_nofollow` + pub const SYMLINK_NOFOLLOW: u32 = @bitCast(At{ .symlink_nofollow = true }); + /// DEPRECATED: use `removedir` + pub const REMOVEDIR: u32 = @bitCast(At{ .removedir = true }); + /// DEPRECATED: use `eaccess` + pub const EACCESS: u32 = @bitCast(At{ .eaccess = true }); + /// DEPRECATED: use `fdcwd` + pub const FDCWD = fdcwd; }, else => void, }; @@ -10485,7 +10624,7 @@ pub extern "c" fn inotify_add_watch(fd: fd_t, pathname: [*:0]const u8, mask: u32 pub extern "c" fn inotify_rm_watch(fd: fd_t, wd: c_int) c_int; pub extern "c" fn fstat64(fd: fd_t, buf: *Stat) c_int; -pub extern "c" fn fstatat64(dirfd: fd_t, noalias path: [*:0]const u8, noalias stat_buf: *Stat, flags: u32) c_int; +pub extern "c" fn fstatat64(dirfd: fd_t, noalias path: [*:0]const u8, noalias stat_buf: *Stat, flags: At) c_int; pub extern "c" fn fallocate64(fd: fd_t, mode: c_int, offset: off_t, len: off_t) c_int; pub extern "c" fn fopen64(noalias filename: [*:0]const u8, noalias modes: [*:0]const u8) ?*FILE; pub extern "c" fn ftruncate64(fd: c_int, length: off_t) c_int; diff --git 
a/lib/std/fs.zig b/lib/std/fs.zig index d07901a7710b..a98be767c11a 100644 --- a/lib/std/fs.zig +++ b/lib/std/fs.zig @@ -187,7 +187,7 @@ pub fn cwd() Dir { } else if (native_os == .wasi) { return .{ .fd = std.options.wasiCwd() }; } else { - return .{ .fd = posix.AT.fdcwd }; + return .{ .fd = posix.At.fdcwd }; } } diff --git a/lib/std/fs/Dir.zig b/lib/std/fs/Dir.zig index 23bd903fe8df..738c0ddf1290 100644 --- a/lib/std/fs/Dir.zig +++ b/lib/std/fs/Dir.zig @@ -148,7 +148,7 @@ pub const Iterator = switch (native_os) { const stat_info = posix.fstatat( self.dir.fd, name, - posix.AT.SYMLINK_NOFOLLOW, + .{ .symlink_nofollow = true }, ) catch |err| switch (err) { error.NameTooLong => unreachable, error.SymLinkLoop => unreachable, @@ -1112,7 +1112,7 @@ pub fn deleteFileZ(self: Dir, sub_path_c: [*:0]const u8) DeleteFileError!void { // directory, so we need to handle that case specifically and translate the error .driverkit, .ios, .maccatalyst, .macos, .tvos, .visionos, .watchos, .freebsd, .netbsd, .dragonfly, .openbsd, .illumos => { // Don't follow symlinks to match unlinkat (which acts on symlinks rather than follows them) - const fstat = posix.fstatatZ(self.fd, sub_path_c, posix.AT.SYMLINK_NOFOLLOW) catch return e; + const fstat = posix.fstatatZ(self.fd, sub_path_c, .{ .symlink_nofollow = true }) catch return e; const is_dir = fstat.mode & posix.S.IFMT == posix.S.IFDIR; return if (is_dir) error.IsDir else e; }, @@ -1163,7 +1163,7 @@ pub fn deleteDir(self: Dir, sub_path: []const u8) DeleteDirError!void { const sub_path_w = try windows.sliceToPrefixedFileW(self.fd, sub_path); return self.deleteDirW(sub_path_w.span()); } else if (native_os == .wasi and !builtin.link_libc) { - posix.unlinkat(self.fd, sub_path, posix.AT.REMOVEDIR) catch |err| switch (err) { + posix.unlinkat(self.fd, sub_path, posix.At.REMOVEDIR) catch |err| switch (err) { error.IsDir => unreachable, // not possible since we pass AT.REMOVEDIR else => |e| return e, }; @@ -1175,7 +1175,7 @@ pub fn deleteDir(self: Dir, sub_path: []const u8) DeleteDirError!void { /// Same as `deleteDir` except the parameter is null-terminated. pub fn deleteDirZ(self: Dir, sub_path_c: [*:0]const u8) DeleteDirError!void { - posix.unlinkatZ(self.fd, sub_path_c, posix.AT.REMOVEDIR) catch |err| switch (err) { + posix.unlinkatZ(self.fd, sub_path_c, posix.At.REMOVEDIR) catch |err| switch (err) { error.IsDir => unreachable, // not possible since we pass AT.REMOVEDIR else => |e| return e, }; @@ -1184,7 +1184,7 @@ pub fn deleteDirZ(self: Dir, sub_path_c: [*:0]const u8) DeleteDirError!void { /// Same as `deleteDir` except the parameter is WTF16LE, NT prefixed. /// This function is Windows-only. 
pub fn deleteDirW(self: Dir, sub_path_w: []const u16) DeleteDirError!void { - posix.unlinkatW(self.fd, sub_path_w, posix.AT.REMOVEDIR) catch |err| switch (err) { + posix.unlinkatW(self.fd, sub_path_w, posix.At.REMOVEDIR) catch |err| switch (err) { error.IsDir => unreachable, // not possible since we pass AT.REMOVEDIR else => |e| return e, }; diff --git a/lib/std/os/linux.zig b/lib/std/os/linux.zig index 8948bec2b24d..fb226e211fcd 100644 --- a/lib/std/os/linux.zig +++ b/lib/std/os/linux.zig @@ -1253,7 +1253,7 @@ pub fn rmdir(path: [*:0]const u8) usize { if (@hasField(SYS, "rmdir")) { return syscall1(.rmdir, @intFromPtr(path)); } else { - return syscall3(.unlinkat, @as(usize, @bitCast(@as(isize, At.fdcwd))), @intFromPtr(path), @as(u32, @bitCast(At{ .removedir_or_handle_fid = .{ .removedir = true } }))); + return syscall3(.unlinkat, @as(usize, @bitCast(@as(isize, At.fdcwd))), @intFromPtr(path), @as(u32, @bitCast(At{ .removedir = true }))); } } @@ -3573,7 +3573,7 @@ pub const STDERR_FILENO = 2; pub const AT = At; /// matches AT_* and AT_STATX_* pub const At = packed struct(u32) { - _u1: u8 = 0, + _1: u8 = 0, /// Do not follow symbolic links symlink_nofollow: bool = false, /// Remove directory instead of unlinking file diff --git a/lib/std/posix.zig b/lib/std/posix.zig index 5e01f36bfcac..d7b1bc57b527 100644 --- a/lib/std/posix.zig +++ b/lib/std/posix.zig @@ -63,7 +63,9 @@ pub const AF = system.AF; pub const AF_SUN = system.AF_SUN; pub const AI = system.AI; pub const ARCH = system.ARCH; -pub const AT = system.AT; +/// DEPRECATED: use `At` +pub const AT = At; +pub const At = system.At; pub const AT_SUN = system.AT_SUN; pub const CLOCK = system.CLOCK; pub const CPU_COUNT = system.CPU_COUNT; @@ -480,7 +482,7 @@ fn fchmodat2(dirfd: fd_t, path: []const u8, mode: mode_t, flags: u32) FChmodAtEr } defer close(pathfd); - const stat = fstatatZ(pathfd, "", AT.EMPTY_PATH) catch |err| switch (err) { + const stat = fstatatZ(pathfd, "", .{ .empty_path = true }) catch |err| switch (err) { error.NameTooLong => unreachable, error.FileNotFound => unreachable, error.Streaming => unreachable, @@ -1553,7 +1555,7 @@ pub fn open(file_path: []const u8, flags: O, perm: mode_t) OpenError!fd_t { if (native_os == .windows) { @compileError("Windows does not support POSIX; use Windows-specific API or cross-platform std.fs API"); } else if (native_os == .wasi and !builtin.link_libc) { - return openat(AT.FDCWD, file_path, flags, perm); + return openat(At.fdcwd, file_path, flags, perm); } const file_path_c = try toPosixPath(file_path); return openZ(&file_path_c, flags, perm); @@ -1943,7 +1945,7 @@ pub fn symlink(target_path: []const u8, sym_link_path: []const u8) SymLinkError! if (native_os == .windows) { @compileError("symlink is not supported on Windows; use std.os.windows.CreateSymbolicLink instead"); } else if (native_os == .wasi and !builtin.link_libc) { - return symlinkat(target_path, AT.FDCWD, sym_link_path); + return symlinkat(target_path, At.fdcwd, sym_link_path); } const target_path_c = try toPosixPath(target_path); const sym_link_path_c = try toPosixPath(sym_link_path); @@ -2103,7 +2105,7 @@ pub fn linkZ(oldpath: [*:0]const u8, newpath: [*:0]const u8) LinkError!void { /// On other platforms, both paths are an opaque sequence of bytes with no particular encoding. 
pub fn link(oldpath: []const u8, newpath: []const u8) LinkError!void { if (native_os == .wasi and !builtin.link_libc) { - return linkat(AT.FDCWD, oldpath, AT.FDCWD, newpath, 0) catch |err| switch (err) { + return linkat(At.fdcwd, oldpath, At.fdcwd, newpath, 0) catch |err| switch (err) { error.NotDir => unreachable, // link() does not support directories else => |e| return e, }; } @@ -2163,7 +2165,7 @@ pub fn linkat( const old: RelativePathWasi = .{ .dir_fd = olddir, .relative_path = oldpath }; const new: RelativePathWasi = .{ .dir_fd = newdir, .relative_path = newpath }; const old_flags: wasi.lookupflags_t = .{ - .SYMLINK_FOLLOW = (flags & AT.SYMLINK_FOLLOW) != 0, + .SYMLINK_FOLLOW = (flags & At.SYMLINK_FOLLOW) != 0, }; switch (wasi.path_link( old.dir_fd, @@ -2234,7 +2236,7 @@ pub const UnlinkError = error{ /// See also `unlinkZ`. pub fn unlink(file_path: []const u8) UnlinkError!void { if (native_os == .wasi and !builtin.link_libc) { - return unlinkat(AT.FDCWD, file_path, 0) catch |err| switch (err) { + return unlinkat(At.fdcwd, file_path, 0) catch |err| switch (err) { error.DirNotEmpty => unreachable, // only occurs when targeting directories else => |e| return e, }; @@ -2308,7 +2310,7 @@ pub fn unlinkat(dirfd: fd_t, file_path: []const u8, flags: u32) UnlinkatError!vo /// WASI-only. Same as `unlinkat` but targeting WASI. /// See also `unlinkat`. pub fn unlinkatWasi(dirfd: fd_t, file_path: []const u8, flags: u32) UnlinkatError!void { - const remove_dir = (flags & AT.REMOVEDIR) != 0; + const remove_dir = (flags & At.REMOVEDIR) != 0; const res = if (remove_dir) wasi.path_remove_directory(dirfd, file_path.ptr, file_path.len) else @@ -2373,7 +2375,7 @@ pub fn unlinkatZ(dirfd: fd_t, file_path_c: [*:0]const u8, flags: u32) UnlinkatEr /// Same as `unlinkat` but `sub_path_w` is WTF16LE, NT prefixed. Windows only. pub fn unlinkatW(dirfd: fd_t, sub_path_w: []const u16, flags: u32) UnlinkatError!void { - const remove_dir = (flags & AT.REMOVEDIR) != 0; + const remove_dir = (flags & At.REMOVEDIR) != 0; return windows.DeleteFile(sub_path_w, .{ .dir = dirfd, .remove_dir = remove_dir }); } @@ -2421,7 +2423,7 @@ pub const RenameError = error{ /// On other platforms, both paths are an opaque sequence of bytes with no particular encoding. pub fn rename(old_path: []const u8, new_path: []const u8) RenameError!void { if (native_os == .wasi and !builtin.link_libc) { - return renameat(AT.FDCWD, old_path, AT.FDCWD, new_path); + return renameat(At.fdcwd, old_path, At.fdcwd, new_path); } else if (native_os == .windows) { const old_path_w = try windows.sliceToPrefixedFileW(null, old_path); const new_path_w = try windows.sliceToPrefixedFileW(null, new_path); @@ -2644,7 +2646,7 @@ pub const MakeDirError = std.Io.Dir.MakeError; /// On other platforms, `dir_path` is an opaque sequence of bytes with no particular encoding. pub fn mkdir(dir_path: []const u8, mode: mode_t) MakeDirError!void { if (native_os == .wasi and !builtin.link_libc) { - return mkdirat(AT.FDCWD, dir_path, mode); + return mkdirat(At.fdcwd, dir_path, mode); } else if (native_os == .windows) { const dir_path_w = try windows.sliceToPrefixedFileW(null, dir_path); return mkdirW(dir_path_w.span(), mode); @@ -2729,7 +2731,7 @@ pub const DeleteDirError = error{ /// On other platforms, `dir_path` is an opaque sequence of bytes with no particular encoding. 
pub fn rmdir(dir_path: []const u8) DeleteDirError!void { if (native_os == .wasi and !builtin.link_libc) { - return unlinkat(AT.FDCWD, dir_path, AT.REMOVEDIR) catch |err| switch (err) { + return unlinkat(At.fdcwd, dir_path, At.REMOVEDIR) catch |err| switch (err) { error.FileSystem => unreachable, // only occurs when targeting files error.IsDir => unreachable, // only occurs when targeting files else => |e| return e, @@ -2856,7 +2858,7 @@ pub const FchdirError = error{ } || UnexpectedError; pub fn fchdir(dirfd: fd_t) FchdirError!void { - if (dirfd == AT.FDCWD) return; + if (dirfd == At.fdcwd) return; while (true) { switch (errno(system.fchdir(dirfd))) { .SUCCESS => return, @@ -2909,7 +2911,7 @@ pub const ReadLinkError = error{ /// On other platforms, the result is an opaque sequence of bytes with no particular encoding. pub fn readlink(file_path: []const u8, out_buffer: []u8) ReadLinkError![]u8 { if (native_os == .wasi and !builtin.link_libc) { - return readlinkat(AT.FDCWD, file_path, out_buffer); + return readlinkat(At.fdcwd, file_path, out_buffer); } else if (native_os == .windows) { var file_path_w = try windows.sliceToPrefixedFileW(null, file_path); const result_w = try readlinkW(file_path_w.span(), &file_path_w.data); @@ -3869,7 +3871,7 @@ pub const FStatAtError = FStatError || error{ /// On WASI, `pathname` should be encoded as valid UTF-8. /// On other platforms, `pathname` is an opaque sequence of bytes with no particular encoding. /// See also `fstatatZ`. -pub fn fstatat(dirfd: fd_t, pathname: []const u8, flags: u32) FStatAtError!Stat { +pub fn fstatat(dirfd: fd_t, pathname: []const u8, flags: At) FStatAtError!Stat { if (native_os == .wasi and !builtin.link_libc) { @compileError("use std.Io instead"); } else if (native_os == .windows) { @@ -3882,7 +3884,7 @@ pub fn fstatat(dirfd: fd_t, pathname: []const u8, flags: u32) FStatAtError!Stat /// Same as `fstatat` but `pathname` is null-terminated. /// See also `fstatat`. 
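With `fstatat` now taking typed flags, an lstat-style call is written with a flag literal, exactly as the posix/test.zig hunks below do. A minimal sketch (the wrapper name is hypothetical):

const std = @import("std");
const posix = std.posix;

// Stat without following a trailing symlink (the old AT.SYMLINK_NOFOLLOW).
fn lstatAt(dir_fd: posix.fd_t, path: []const u8) !posix.Stat {
    return posix.fstatat(dir_fd, path, .{ .symlink_nofollow = true });
}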
-pub fn fstatatZ(dirfd: fd_t, pathname: [*:0]const u8, flags: u32) FStatAtError!Stat { +pub fn fstatatZ(dirfd: fd_t, pathname: [*:0]const u8, flags: At) FStatAtError!Stat { if (native_os == .wasi and !builtin.link_libc) { @compileError("use std.Io instead"); } diff --git a/lib/std/posix/test.zig b/lib/std/posix/test.zig index aaaaa1d948a9..5eed5663eef5 100644 --- a/lib/std/posix/test.zig +++ b/lib/std/posix/test.zig @@ -37,7 +37,7 @@ test "check WASI CWD" { if (!builtin.link_libc) { // WASI without-libc hardcodes fd 3 as the FDCWD token so it can be passed directly to WASI calls - try expectEqual(3, posix.AT.FDCWD); + try expectEqual(3, posix.At.fdcwd); } } } @@ -907,7 +907,7 @@ test "pwrite with empty buffer" { } fn expectMode(dir: posix.fd_t, file: []const u8, mode: posix.mode_t) !void { - const st = try posix.fstatat(dir, file, posix.AT.SYMLINK_NOFOLLOW); + const st = try posix.fstatat(dir, file, .{ .symlink_nofollow = true }); try expectEqual(mode, st.mode & 0b111_111_111); } @@ -932,13 +932,13 @@ test "fchmodat smoke test" { try posix.symlinkat("regfile", tmp.dir.fd, "symlink"); const sym_mode = blk: { - const st = try posix.fstatat(tmp.dir.fd, "symlink", posix.AT.SYMLINK_NOFOLLOW); + const st = try posix.fstatat(tmp.dir.fd, "symlink", .{ .symlink_nofollow = true }); break :blk st.mode & 0b111_111_111; }; try posix.fchmodat(tmp.dir.fd, "regfile", 0o640, 0); try expectMode(tmp.dir.fd, "regfile", 0o640); - try posix.fchmodat(tmp.dir.fd, "regfile", 0o600, posix.AT.SYMLINK_NOFOLLOW); + try posix.fchmodat(tmp.dir.fd, "regfile", 0o600, posix.At.SYMLINK_NOFOLLOW); try expectMode(tmp.dir.fd, "regfile", 0o600); try posix.fchmodat(tmp.dir.fd, "symlink", 0o640, 0); @@ -946,7 +946,7 @@ test "fchmodat smoke test" { try expectMode(tmp.dir.fd, "symlink", sym_mode); var test_link = true; - posix.fchmodat(tmp.dir.fd, "symlink", 0o600, posix.AT.SYMLINK_NOFOLLOW) catch |err| switch (err) { + posix.fchmodat(tmp.dir.fd, "symlink", 0o600, posix.At.SYMLINK_NOFOLLOW) catch |err| switch (err) { error.OperationNotSupported => test_link = false, else => |e| return e, }; From 1240ace1c3536e6535a5f310ddc95c513d36f4d1 Mon Sep 17 00:00:00 2001 From: Bernard Assan Date: Fri, 14 Nov 2025 20:08:22 +0000 Subject: [PATCH 41/41] Implement W as packed struct in `c.zig` Fixes the following error on CI: /lib/std/process/Child.zig:526:24: error: cannot @bitCast to 'c.W__struct_316926'; struct does not have a guaranteed in-memory layout const w: posix.W = @bitCast(status); ^~~~~~~~~~~~~~~~ /lib/std/c.zig:3682:75: note: struct declared here .driverkit, .ios, .maccatalyst, .macos, .tvos, .visionos, .watchos => struct { ^~~~~~ Signed-off-by: Bernard Assan --- lib/std/c.zig | 675 ++++++++++++++++++++++++++++------- lib/std/os/emscripten.zig | 28 -- lib/std/os/linux/IoUring.zig | 4 +- 3 files changed, 544 insertions(+), 163 deletions(-) diff --git a/lib/std/c.zig b/lib/std/c.zig index f164702ebb1f..87bc90ac2bc6 100644 --- a/lib/std/c.zig +++ b/lib/std/c.zig @@ -3686,278 +3686,686 @@ pub const T = switch (native_os) { }, else => void, }; + pub const IOCPARM_MASK = switch (native_os) { .windows => ws2_32.IOCPARM_MASK, .driverkit, .ios, .maccatalyst, .macos, .tvos, .visionos, .watchos => 0x1fff, else => void, }; + pub const TCSA = std.posix.TCSA; + pub const TFD = switch (native_os) { .linux => linux.TFD, else => void, }; + pub const VDSO = switch (native_os) { .linux => linux.VDSO, else => void, }; + pub const W = switch (native_os) { - .linux => linux.W, - .emscripten => emscripten.W, - .driverkit, .ios, .maccatalyst, .macos, .tvos, 
.visionos, .watchos => struct { + .emscripten, .linux => linux.W, + .driverkit, .ios, .maccatalyst, .macos, .tvos, .visionos, .watchos => packed struct(u32) { /// [XSI] no hang in wait/no child to reap - pub const NOHANG = 0x00000001; + nohang: bool = false, /// [XSI] notify on stop, untraced child - pub const UNTRACED = 0x00000002; + untraced: bool = false, + _3: u30 = 0, + + const stopped = 0o177; + + /// DEPRECATED: use `nohang` + pub const NOHANG: u32 = @bitCast(W{ .nohang = true }); + /// DEPRECATED: use `untraced` + pub const UNTRACED: u32 = @bitCast(W{ .untraced = true }); + + pub fn exitStatus(x: W) u8 { + return @intCast(x.toInt() >> 8); + } + + pub fn termSig(x: W) u32 { + return x.status(); + } + + pub fn stopSig(x: W) u32 { + return x.toInt() >> 8; + } + + pub fn ifExited(x: W) bool { + return x.status() == 0; + } + + pub fn ifStopped(x: W) bool { + return x.status() == stopped and x.stopSig() != 0x13; + } + + pub fn ifSignaled(x: W) bool { + return x.status() != stopped and x.status() != 0; + } + /// DEPRECATED: use `exitStatus` pub fn EXITSTATUS(x: u32) u8 { - return @as(u8, @intCast(x >> 8)); + return exitStatus(@bitCast(x)); } + /// DEPRECATED: use `termSig` pub fn TERMSIG(x: u32) u32 { - return status(x); + return termSig(@bitCast(x)); } + /// DEPRECATED: use `stopSig` pub fn STOPSIG(x: u32) u32 { - return x >> 8; + return stopSig(@bitCast(x)); } + /// DEPRECATED: use `ifExited` pub fn IFEXITED(x: u32) bool { - return status(x) == 0; + return ifExited(@bitCast(x)); + } + /// DEPRECATED: use `ifSignaled` + pub fn IFSIGNALED(x: u32) bool { + return ifSignaled(@bitCast(x)); } + /// DEPRECATED: use `ifStopped` pub fn IFSTOPPED(x: u32) bool { - return status(x) == stopped and STOPSIG(x) != 0x13; + return ifStopped(@bitCast(x)); } - pub fn IFSIGNALED(x: u32) bool { - return status(x) != stopped and status(x) != 0; + + fn status(x: W) u32 { + return x.toInt() & 0o177; } - fn status(x: u32) u32 { - return x & 0o177; + fn toInt(s: W) u32 { + return @bitCast(s); } - const stopped = 0o177; }, - .freebsd => struct { - pub const NOHANG = 1; - pub const UNTRACED = 2; - pub const STOPPED = UNTRACED; - pub const CONTINUED = 4; - pub const NOWAIT = 8; - pub const EXITED = 16; - pub const TRAPPED = 32; + .freebsd => packed struct(u32) { + /// no hang in wait/no child to reap + nohang: bool = false, + /// notify on stop, untraced child + stopped: bool = false, + /// process continued + continued: bool = false, + /// don't wait for processes to exit + nowait: bool = false, + /// process exited + exited: bool = false, + /// process trapped + trapped: bool = false, + _7: u26 = 0, + + const untraced: W = .{ .stopped = true }; + + /// DEPRECATED: use `nohang` + pub const NOHANG: u32 = @bitCast(W{ .nohang = true }); + /// DEPRECATED: use `untraced` + pub const UNTRACED: u32 = @bitCast(untraced); + /// DEPRECATED: use `untraced` + pub const STOPPED: u32 = @bitCast(W{ .stopped = true }); + /// DEPRECATED: use `continued` + pub const CONTINUED: u32 = @bitCast(W{ .continued = true }); + /// DEPRECATED: use `nowait` + pub const NOWAIT: u32 = @bitCast(W{ .nowait = true }); + /// DEPRECATED: use `exited` + pub const EXITED: u32 = @bitCast(W{ .exited = true }); + /// DEPRECATED: use `trapped` + pub const TRAPPED: u32 = @bitCast(W{ .trapped = true }); + + pub fn exitStatus(s: W) u8 { + return @intCast((s.toInt() & 0xff00) >> 8); + } + + pub fn termSig(s: W) u32 { + return s.toInt() & 0x7f; + } + + pub fn stopSig(s: W) u32 { + return s.exitStatus(); + } + pub fn ifExited(s: W) bool { + return s.termSig() == 
0; + } + + pub fn ifStopped(s: W) bool { + return @as(u16, @truncate((((s.toInt() & 0xffff) *% 0x10001) >> 8))) > 0x7f00; + } + + pub fn ifSignaled(s: W) bool { + return (s.toInt() & 0xffff) -% 1 < 0xff; + } + + /// DEPRECATED: use `exitStatus` pub fn EXITSTATUS(s: u32) u8 { - return @as(u8, @intCast((s & 0xff00) >> 8)); + return exitStatus(@bitCast(s)); } + /// DEPRECATED: use `termSig` pub fn TERMSIG(s: u32) u32 { - return s & 0x7f; + return termSig(@bitCast(s)); } + /// DEPRECATED: use `stopSig` pub fn STOPSIG(s: u32) u32 { - return EXITSTATUS(s); + return stopSig(@bitCast(s)); } + /// DEPRECATED: use `ifExited` pub fn IFEXITED(s: u32) bool { - return TERMSIG(s) == 0; + return ifExited(@bitCast(s)); } + /// DEPRECATED: use `ifStopped` pub fn IFSTOPPED(s: u32) bool { - return @as(u16, @truncate((((s & 0xffff) *% 0x10001) >> 8))) > 0x7f00; + return ifStopped(@bitCast(s)); } + /// DEPRECATED: use `ifSignaled` pub fn IFSIGNALED(s: u32) bool { - return (s & 0xffff) -% 1 < 0xff; + return ifSignaled(@bitCast(s)); + } + + fn toInt(s: W) u32 { + return @bitCast(s); } }, - .illumos => struct { - pub const EXITED = 0o001; - pub const TRAPPED = 0o002; - pub const UNTRACED = 0o004; - pub const STOPPED = UNTRACED; - pub const CONTINUED = 0o010; - pub const NOHANG = 0o100; - pub const NOWAIT = 0o200; + .illumos => packed struct(u32) { + /// process exited + exited: bool = false, + /// process trapped + trapped: bool = false, + /// notify on stop, untraced child + stopped: bool = false, + /// process continued + continued: bool = false, + _5: u2 = 0, + /// no hang in wait/no child to reap + nohang: bool = false, + /// don't wait for processes to exit + nowait: bool = false, + _8: u24 = 0, + + pub const untraced: W = .{ .stopped = true }; + /// DEPRECATED: use `exited` + pub const EXITED: u32 = @bitCast(W{ .exited = true }); + /// DEPRECATED: use `trapped` + pub const TRAPPED: u32 = @bitCast(W{ .trapped = true }); + /// DEPRECATED: use `untraced` + pub const STOPPED: u32 = @bitCast(W{ .stopped = true }); + /// DEPRECATED: use `untraced` + pub const UNTRACED: u32 = @bitCast(untraced); + /// DEPRECATED: use `continued` + pub const CONTINUED: u32 = @bitCast(W{ .continued = true }); + /// DEPRECATED: use `nohang` + pub const NOHANG: u32 = @bitCast(W{ .nohang = true }); + /// DEPRECATED: use `nowait` + pub const NOWAIT: u32 = @bitCast(W{ .nowait = true }); + + pub fn exitStatus(x: W) u8 { + return @intCast((x.toInt() >> 8) & 0xff); + } + + pub fn termSig(x: W) u32 { + return x.toInt() & 0x7f; + } + + pub fn stopSig(x: W) u32 { + return x.exitStatus(); + } + + pub fn ifExited(x: W) bool { + return x.termSig() == 0; + } + + pub fn ifContinued(x: W) bool { + return (x.toInt() & 0o177777) == 0o177777; + } + + pub fn ifStopped(x: W) bool { + const s = x.toInt(); + return (s & 0x00ff != 0o177) and !(s & 0xff00 != 0); + } + + pub fn ifSignaled(x: W) bool { + const s = x.toInt(); + return s & 0x00ff > 0 and s & 0xff00 == 0; + } + + /// DEPRECATED: use `exitStatus` pub fn EXITSTATUS(s: u32) u8 { - return @as(u8, @intCast((s >> 8) & 0xff)); + return exitStatus(@bitCast(s)); } + /// DEPRECATED: use `termSig` pub fn TERMSIG(s: u32) u32 { - return s & 0x7f; + return termSig(@bitCast(s)); } + /// DEPRECATED: use `stopSig` pub fn STOPSIG(s: u32) u32 { - return EXITSTATUS(s); + return stopSig(@bitCast(s)); } + /// DEPRECATED: use `ifExited` pub fn IFEXITED(s: u32) bool { - return TERMSIG(s) == 0; + return ifExited(@bitCast(s)); } - + /// DEPRECATED: use `ifContinued` pub fn IFCONTINUED(s: u32) bool { - return ((s & 0o177777) == 
0o177777); + return ifContinued(@bitCast(s)); } - + /// DEPRECATED: use `ifStopped` pub fn IFSTOPPED(s: u32) bool { - return (s & 0x00ff != 0o177) and !(s & 0xff00 != 0); + return ifStopped(@bitCast(s)); } - + /// DEPRECATED: use `ifSignaled` pub fn IFSIGNALED(s: u32) bool { - return s & 0x00ff > 0 and s & 0xff00 == 0; + return ifSignaled(@bitCast(s)); } - }, - .netbsd => struct { - pub const NOHANG = 0x00000001; - pub const UNTRACED = 0x00000002; - pub const STOPPED = UNTRACED; - pub const CONTINUED = 0x00000010; - pub const NOWAIT = 0x00010000; - pub const EXITED = 0x00000020; - pub const TRAPPED = 0x00000040; + fn toInt(s: W) u32 { + return @bitCast(s); + } + }, + .netbsd => packed struct(u32) { + /// no hang in wait/no child to reap + nohang: bool = false, + /// notify on stop, untraced child + stopped: bool = false, + _3: u2 = 0, + /// process continued + continued: bool = false, + /// process exited + exited: bool = false, + /// process trapped + trapped: bool = false, + _8: u9 = 0, + /// don't wait for processes to exit + nowait: bool = false, + _18: u15 = 0, + + pub const untraced: W = .{ .stopped = true }; + + /// DEPRECATED: use `nohang` + pub const NOHANG: u32 = @bitCast(W{ .nohang = true }); + /// DEPRECATED: use `untraced` + pub const UNTRACED: u32 = @bitCast(untraced); + /// DEPRECATED: use `stopped` + pub const STOPPED: u32 = @bitCast(W{ .stopped = true }); + /// DEPRECATED: use `continued` + pub const CONTINUED: u32 = @bitCast(W{ .continued = true }); + /// DEPRECATED: use `nowait` + pub const NOWAIT: u32 = @bitCast(W{ .nowait = true }); + /// DEPRECATED: use `exited` + pub const EXITED: u32 = @bitCast(W{ .exited = true }); + /// DEPRECATED: use `trapped` + pub const TRAPPED: u32 = @bitCast(W{ .trapped = true }); + + pub fn exitStatus(x: W) u8 { + return @intCast((x.toInt() >> 8) & 0xff); + } + pub fn termSig(x: W) u32 { + return x.toInt() & 0x7f; + } + pub fn stopSig(x: W) u32 { + return x.exitStatus(); + } + pub fn ifExited(x: W) bool { + return x.termSig() == 0; + } + pub fn ifContinued(x: W) bool { + return (x.toInt() & 0x7f) == 0xffff; + } + pub fn ifStopped(x: W) bool { + const s = x.toInt(); + return (s & 0x7f != 0x7f) and !x.ifContinued(); + } + pub fn ifSignaled(x: W) bool { + return !x.ifStopped() and !x.ifContinued() and !x.ifExited(); + } + + /// DEPRECATED: use `exitStatus` pub fn EXITSTATUS(s: u32) u8 { - return @as(u8, @intCast((s >> 8) & 0xff)); + return exitStatus(@bitCast(s)); } + /// DEPRECATED: use `termSig` pub fn TERMSIG(s: u32) u32 { - return s & 0x7f; + return termSig(@bitCast(s)); } + /// DEPRECATED: use `stopSig` pub fn STOPSIG(s: u32) u32 { - return EXITSTATUS(s); + return stopSig(@bitCast(s)); } + /// DEPRECATED: use `ifExited` pub fn IFEXITED(s: u32) bool { - return TERMSIG(s) == 0; + return ifExited(@bitCast(s)); } - + /// DEPRECATED: use `ifContinued` pub fn IFCONTINUED(s: u32) bool { - return ((s & 0x7f) == 0xffff); + return ifContinued(@bitCast(s)); } - + /// DEPRECATED: use `ifStopped` pub fn IFSTOPPED(s: u32) bool { - return ((s & 0x7f != 0x7f) and !IFCONTINUED(s)); + return ifStopped(@bitCast(s)); } - + /// DEPRECATED: use `ifSignaled` pub fn IFSIGNALED(s: u32) bool { - return !IFSTOPPED(s) and !IFCONTINUED(s) and !IFEXITED(s); + return ifSignaled(@bitCast(s)); + } + + fn toInt(s: W) u32 { + return @bitCast(s); } }, - .dragonfly => struct { - pub const NOHANG = 0x0001; - pub const UNTRACED = 0x0002; - pub const CONTINUED = 0x0004; - pub const STOPPED = UNTRACED; - pub const NOWAIT = 0x0008; - pub const EXITED = 0x0010; - pub const TRAPPED 
= 0x0020; + .dragonfly => packed struct(u32) { + /// no hang in wait/no child to reap + nohang: bool = false, + /// notify on stop, untraced child + stopped: bool = false, + /// process continued + continued: bool = false, + /// don't wait for processes to exit + nowait: bool = false, + /// process exited + exited: bool = false, + /// process trapped + trapped: bool = false, + _7: u26 = 0, + + pub const untraced: W = .{ .stopped = true }; + /// DEPRECATED: use `nohang` + pub const NOHANG: u32 = @bitCast(W{ .nohang = true }); + /// DEPRECATED: use `untraced` + pub const UNTRACED: u32 = @bitCast(untraced); + /// DEPRECATED: use `continued` + pub const CONTINUED: u32 = @bitCast(W{ .continued = true }); + /// DEPRECATED: use `stopped` + pub const STOPPED: u32 = @bitCast(W{ .stopped = true }); + /// DEPRECATED: use `nowait` + pub const NOWAIT: u32 = @bitCast(W{ .nowait = true }); + /// DEPRECATED: use `exited` + pub const EXITED: u32 = @bitCast(W{ .exited = true }); + /// DEPRECATED: use `trapped` + pub const TRAPPED: u32 = @bitCast(W{ .trapped = true }); + + pub fn exitStatus(x: W) u8 { + return @intCast((x.toInt() & 0xff00) >> 8); + } + pub fn termSig(x: W) u32 { + return x.toInt() & 0x7f; + } + pub fn stopSig(x: W) u32 { + return x.exitStatus(); + } + pub fn ifExited(x: W) bool { + return x.termSig() == 0; + } + pub fn ifStopped(x: W) bool { + return @as(u16, @truncate((((x.toInt() & 0xffff) *% 0x10001) >> 8))) > 0x7f00; + } + pub fn ifSignaled(x: W) bool { + return (x.toInt() & 0xffff) -% 1 < 0xff; + } + + /// DEPRECATED: use `exitStatus` pub fn EXITSTATUS(s: u32) u8 { - return @as(u8, @intCast((s & 0xff00) >> 8)); + return exitStatus(@bitCast(s)); } + /// DEPRECATED: use `termSig` pub fn TERMSIG(s: u32) u32 { - return s & 0x7f; + return termSig(@bitCast(s)); } + /// DEPRECATED: use `stopSig` pub fn STOPSIG(s: u32) u32 { - return EXITSTATUS(s); + return stopSig(@bitCast(s)); } + /// DEPRECATED: use `ifExited` pub fn IFEXITED(s: u32) bool { - return TERMSIG(s) == 0; + return ifExited(@bitCast(s)); } + /// DEPRECATED: use `ifStopped` pub fn IFSTOPPED(s: u32) bool { - return @as(u16, @truncate((((s & 0xffff) *% 0x10001) >> 8))) > 0x7f00; + return ifStopped(@bitCast(s)); } + /// DEPRECATED: use `ifSignaled` pub fn IFSIGNALED(s: u32) bool { - return (s & 0xffff) -% 1 < 0xff; + return ifSignaled(@bitCast(s)); } - }, - .haiku => struct { - pub const NOHANG = 0x1; - pub const UNTRACED = 0x2; - pub const CONTINUED = 0x4; - pub const EXITED = 0x08; - pub const STOPPED = 0x10; - pub const NOWAIT = 0x20; + fn toInt(s: W) u32 { + return @bitCast(s); + } + }, + .haiku => packed struct(u32) { + /// no hang in wait/no child to reap + nohang: bool = false, + /// notify on stop, untraced child + untraced: bool = false, + /// process continued + continued: bool = false, + /// process exited + exited: bool = false, + /// process stopped + stopped: bool = false, + /// don't wait for processes to exit + nowait: bool = false, + _7: u26 = 0, + + /// DEPRECATED: use `nohang` + pub const NOHANG: u32 = @bitCast(W{ .nohang = true }); + /// DEPRECATED: use `untraced` + pub const UNTRACED: u32 = @bitCast(W{ .untraced = true }); + /// DEPRECATED: use `continued` + pub const CONTINUED: u32 = @bitCast(W{ .continued = true }); + /// DEPRECATED: use `exited` + pub const EXITED: u32 = @bitCast(W{ .exited = true }); + /// DEPRECATED: use `stopped` + pub const STOPPED: u32 = @bitCast(W{ .stopped = true }); + /// DEPRECATED: use `nowait` + pub const NOWAIT: u32 = @bitCast(W{ .nowait = true }); + + pub fn exitStatus(x: W) u8 { + 
return @intCast(x.toInt() & 0xff); + } + pub fn termSig(x: W) u32 { + return (x.toInt() >> 8) & 0xff; + } + pub fn stopSig(x: W) u32 { + return (x.toInt() >> 16) & 0xff; + } + pub fn ifExited(x: W) bool { + return (x.toInt() & ~@as(u32, 0xff)) == 0; + } + pub fn ifStopped(x: W) bool { + return ((x.toInt() >> 16) & 0xff) != 0; + } + pub fn ifSignaled(x: W) bool { + return ((x.toInt() >> 8) & 0xff) != 0; + } + + /// DEPRECATED: use `exitStatus` pub fn EXITSTATUS(s: u32) u8 { - return @as(u8, @intCast(s & 0xff)); + return exitStatus(@bitCast(s)); } - + /// DEPRECATED: use `termSig` pub fn TERMSIG(s: u32) u32 { - return (s >> 8) & 0xff; + return termSig(@bitCast(s)); } - + /// DEPRECATED: use `stopSig` pub fn STOPSIG(s: u32) u32 { - return (s >> 16) & 0xff; + return stopSig(@bitCast(s)); } - + /// DEPRECATED: use `ifExited` pub fn IFEXITED(s: u32) bool { - return (s & ~@as(u32, 0xff)) == 0; + return ifExited(@bitCast(s)); } - + /// DEPRECATED: use `ifStopped` pub fn IFSTOPPED(s: u32) bool { - return ((s >> 16) & 0xff) != 0; + return ifStopped(@bitCast(s)); } - + /// DEPRECATED: use `ifSignaled` pub fn IFSIGNALED(s: u32) bool { - return ((s >> 8) & 0xff) != 0; + return ifSignaled(@bitCast(s)); + } + + fn toInt(s: W) u32 { + return @bitCast(s); } }, - .openbsd => struct { - pub const NOHANG = 1; - pub const UNTRACED = 2; - pub const CONTINUED = 8; + .openbsd => packed struct(u32) { + /// no hang in wait/no child to reap + nohang: bool = false, + /// notify on stop, untraced child + untraced: bool = false, + _3: u1 = 0, + /// process continued + continued: bool = false, + _5: u28 = 0, + /// DEPRECATED: use `nohang` + pub const NOHANG: u32 = @bitCast(W{ .nohang = true }); + /// DEPRECATED: use `untraced` + pub const UNTRACED: u32 = @bitCast(W{ .untraced = true }); + /// DEPRECATED: use `continued` + pub const CONTINUED: u32 = @bitCast(W{ .continued = true }); + + pub fn exitStatus(x: W) u8 { + return @intCast((x.toInt() >> 8) & 0xff); + } + pub fn termSig(x: W) u32 { + return x.toInt() & 0x7f; + } + pub fn stopSig(x: W) u32 { + return x.exitStatus(); + } + pub fn ifExited(x: W) bool { + return x.termSig() == 0; + } + pub fn ifContinued(x: W) bool { + return (x.toInt() & 0o177777) == 0o177777; + } + pub fn ifStopped(x: W) bool { + return (x.toInt() & 0xff == 0o177); + } + pub fn ifSignaled(x: W) bool { + const s = x.toInt(); + return (s & 0o177) != 0o177 and (s & 0o177) != 0; + } + + /// DEPRECATED: use `exitStatus` pub fn EXITSTATUS(s: u32) u8 { - return @as(u8, @intCast((s >> 8) & 0xff)); + return exitStatus(@bitCast(s)); } + /// DEPRECATED: use `termSig` pub fn TERMSIG(s: u32) u32 { - return (s & 0x7f); + return termSig(@bitCast(s)); } + /// DEPRECATED: use `stopSig` pub fn STOPSIG(s: u32) u32 { - return EXITSTATUS(s); + return stopSig(@bitCast(s)); } + /// DEPRECATED: use `ifExited` pub fn IFEXITED(s: u32) bool { - return TERMSIG(s) == 0; + return ifExited(@bitCast(s)); } - + /// DEPRECATED: use `ifContinued` pub fn IFCONTINUED(s: u32) bool { - return ((s & 0o177777) == 0o177777); + return ifContinued(@bitCast(s)); } - + /// DEPRECATED: use `ifStopped` pub fn IFSTOPPED(s: u32) bool { - return (s & 0xff == 0o177); + return ifStopped(@bitCast(s)); } - + /// DEPRECATED: use `ifSignaled` pub fn IFSIGNALED(s: u32) bool { - return (((s) & 0o177) != 0o177) and (((s) & 0o177) != 0); + return ifSignaled(@bitCast(s)); + } + + fn toInt(s: W) u32 { + return @bitCast(s); } }, // https://github.com/SerenityOS/serenity/blob/ec492a1a0819e6239ea44156825c4ee7234ca3db/Kernel/API/POSIX/sys/wait.h - .serenity => 
struct { - pub const NOHANG = 1; - pub const UNTRACED = 2; - pub const STOPPED = UNTRACED; - pub const EXITED = 4; - pub const CONTINUED = 8; - pub const NOWAIT = 0x1000000; + .serenity => packed struct(u32) { + /// no hang in wait/no child to reap + nohang: bool = false, + /// notify on stop, untraced child + stopped: bool = false, + /// process exited + exited: bool = false, + /// process continued + continued: bool = false, + _5: u20 = 0, + /// don't wait for processes to exit + nowait: bool = false, + _26: u7 = 0, - pub fn EXITSTATUS(s: u32) u8 { - return @intCast((s & 0xff00) >> 8); + pub const untraced: u32 = @bitCast(W{ .stopped = true }); + + /// DEPRECATED: use `nohang` + pub const NOHANG: u32 = @bitCast(W{ .nohang = true }); + /// DEPRECATED: use `stopped` + pub const STOPPED: u32 = @bitCast(W{ .stopped = true }); + /// DEPRECATED: use `untraced` + pub const UNTRACED: u32 = @bitCast(untraced); + /// DEPRECATED: use `exited` + pub const EXITED: u32 = @bitCast(W{ .exited = true }); + /// DEPRECATED: use `continued` + pub const CONTINUED: u32 = @bitCast(W{ .continued = true }); + /// DEPRECATED: use `nowait` + pub const NOWAIT: u32 = @bitCast(W{ .nowait = true }); + + pub fn exitStatus(x: W) u8 { + return @intCast((x.toInt() & 0xff00) >> 8); + } + pub fn stopSig(x: W) u32 { + return x.exitStatus(); + } + pub fn termSig(x: W) u32 { + return x.toInt() & 0x7f; + } + pub fn ifExited(x: W) bool { + return x.termSig() == 0; + } + pub fn ifStopped(x: W) bool { + return (x.toInt() & 0xff) == 0x7f; + } + pub fn ifSignaled(x: W) bool { + return (((x.toInt() & 0x7f) + 1) >> 1) > 0; + } + pub fn ifContinued(x: W) bool { + return x.toInt() == 0xffff; } + /// DEPRECATED: use `exitStatus` + pub fn EXITSTATUS(s: u32) u8 { + return exitStatus(@bitCast(s)); + } + /// DEPRECATED: use `stopSig` pub fn STOPSIG(s: u32) u32 { - return EXITSTATUS(s); + return stopSig(@bitCast(s)); } - + /// DEPRECATED: use `termSig` pub fn TERMSIG(s: u32) u32 { - return s & 0x7f; + return termSig(@bitCast(s)); } - + /// DEPRECATED: use `ifExited` pub fn IFEXITED(s: u32) bool { - return TERMSIG(s) == 0; + return ifExited(@bitCast(s)); } - + /// DEPRECATED: use `ifStopped` pub fn IFSTOPPED(s: u32) bool { - return (s & 0xff) == 0x7f; + return ifStopped(@bitCast(s)); } - + /// DEPRECATED: use `ifSignaled` pub fn IFSIGNALED(s: u32) bool { - return (((s & 0x7f) + 1) >> 1) > 0; + return ifSignaled(@bitCast(s)); } - + /// DEPRECATED: use `ifContinued` pub fn IFCONTINUED(s: u32) bool { - return s == 0xffff; + return ifContinued(@bitCast(s)); + } + + fn toInt(s: W) u32 { + return @bitCast(s); } }, else => void, }; + pub const accept_filter_arg = switch (native_os) { // https://github.com/freebsd/freebsd-src/blob/2024887abc7d1b931e00fbb0697658e98adf048d/sys/sys/socket.h#L205 // https://github.com/DragonFlyBSD/DragonFlyBSD/blob/6098912863ed4c7b3f70d7483910ce2956cf4ed3/sys/sys/socket.h#L164 @@ -3969,6 +4377,7 @@ pub const accept_filter_arg = switch (native_os) { }, else => void, }; + pub const clock_t = switch (native_os) { .linux => linux.clock_t, .emscripten => emscripten.clock_t, diff --git a/lib/std/os/emscripten.zig b/lib/std/os/emscripten.zig index cb444b360dd0..4e878f946ae7 100644 --- a/lib/std/os/emscripten.zig +++ b/lib/std/os/emscripten.zig @@ -213,34 +213,6 @@ pub const X_OK = 1; pub const W_OK = 2; pub const R_OK = 4; -pub const W = struct { - pub const NOHANG = 1; - pub const UNTRACED = 2; - pub const STOPPED = 2; - pub const EXITED = 4; - pub const CONTINUED = 8; - pub const NOWAIT = 0x1000000; - - pub fn EXITSTATUS(s: 
u32) u8 { - return @as(u8, @intCast((s & 0xff00) >> 8)); - } - pub fn TERMSIG(s: u32) u32 { - return s & 0x7f; - } - pub fn STOPSIG(s: u32) u32 { - return EXITSTATUS(s); - } - pub fn IFEXITED(s: u32) bool { - return TERMSIG(s) == 0; - } - pub fn IFSTOPPED(s: u32) bool { - return @as(u16, @truncate(((s & 0xffff) *% 0x10001) >> 8)) > 0x7f00; - } - pub fn IFSIGNALED(s: u32) bool { - return (s & 0xffff) -% 1 < 0xff; - } -}; - pub const Flock = extern struct { type: i16, whence: i16, diff --git a/lib/std/os/linux/IoUring.zig b/lib/std/os/linux/IoUring.zig index 8d201e5a6fa4..720cd22b5f1b 100644 --- a/lib/std/os/linux/IoUring.zig +++ b/lib/std/os/linux/IoUring.zig @@ -1983,7 +1983,7 @@ pub fn register_files(self: *IoUring, fds: []const linux.fd_t) !void { pub fn unregister_files(self: *IoUring) !void { assert(self.fd >= 0); const res = linux.io_uring_register(self.fd, .unregister_files, null, 0); - switch (linux.E.init(res)) { + switch (linux.errno(res)) { .SUCCESS => {}, .NXIO => return error.FilesNotRegistered, else => |errno| return posix.unexpectedErrno(errno), @@ -2270,7 +2270,7 @@ pub fn init_buffer_ring( /// matches `io_uring_register_buf_ring` pub fn register_buffer_ring(self: *IoUring, buf_reg: *BufferRegister) !void { var res = linux.io_uring_register(self.fd, .register_pbuf_ring, buf_reg, 1); - if (linux.E.init(res) == .INVAL and buf_reg.flags.iou_pbuf_ring_inc) { + if (linux.errno(res) == .INVAL and buf_reg.flags.iou_pbuf_ring_inc) { // Retry without incremental buffer consumption. // It is available since kernel 6.12. returns INVAL on older. buf_reg.flags.iou_pbuf_ring_inc = false;
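// The retry above is the probe-and-fallback pattern for optional io_uring
// features: attempt the registration with the new flag set, and when the
// kernel rejects it with INVAL, clear only that flag and issue the same
// registration again, e.g.
//
//     buf_reg.flags.iou_pbuf_ring_inc = false;
//     res = linux.io_uring_register(self.fd, .register_pbuf_ring, buf_reg, 1);
//
// so callers on pre-6.12 kernels lose only incremental buffer consumption
// rather than provided-buffer rings entirely.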
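Stepping back to the main change of PATCH 41: because `W` is now a packed struct with a guaranteed in-memory layout, a raw wait status can be reinterpreted directly, which is precisely the Child.zig `@bitCast` that failed on CI before. A sketch of decoding a status with the new methods, assuming one of the non-Linux variants defined above (the helper name is hypothetical):

const std = @import("std");
const posix = std.posix;

fn describeStatus(status: u32) void {
    const w: posix.W = @bitCast(status);
    if (w.ifExited()) {
        std.debug.print("exited with code {d}\n", .{w.exitStatus()});
    } else if (w.ifSignaled()) {
        std.debug.print("killed by signal {d}\n", .{w.termSig()});
    } else if (w.ifStopped()) {
        std.debug.print("stopped by signal {d}\n", .{w.stopSig()});
    }
}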