Efficient IO with io_uring¶

0x00 引言¶

io uring 是 Linux 上面新的异步 IO 机制。在 io uring 之前，Linux 下面的异步 IO 机制是 aio，不过 aio 存在不少的问题。io uring 一出现就受到了比较大的关注，这也可能多亏了 aio 衬托得好吧 ¯\_(ツ)_/¯。io uring 的核心是三个系统调用，直接使用这三个系统调用的话会比较麻烦。所以作者还额外添加了一个库 liburing，方便使用。

0x01 liburing¶

liburing 是为了简化 io uring 的使用的一个库，估计后面使用 io uring 的话，使用到这个库的机会还是很大的。liburing 中，一个核心的结构是 struct io_uring，

/*
 * Top-level liburing handle for one io_uring instance. It bundles the
 * user-space view of the submission queue (sq) and completion queue (cq)
 * together with a flags word and the ring file descriptor.
 */
struct io_uring {
    /* User-space state of the submission queue. */
    struct io_uring_sq sq;
    /* User-space state of the completion queue. */
    struct io_uring_cq cq;
    /* NOTE(review): presumably the setup flags the ring was created with — confirm in setup.c */
    unsigned flags;
    /* NOTE(review): presumably the fd returned by io_uring_setup() — confirm in io_uring_queue_mmap() */
    int ring_fd;
};
// The two nested structs used by struct io_uring are defined as follows.

/*
 * User-space view of the submission queue.
 *
 * The pointer members carry the same names as the fields of
 * struct io_sqring_offsets (head, tail, ring_mask, ring_entries, flags,
 * dropped, array).
 * NOTE(review): presumably each k* pointer addresses that field inside the
 * mmap'ed SQ ring — confirm in io_uring_queue_mmap().
 */
struct io_uring_sq {
    unsigned *khead;
    unsigned *ktail;
    unsigned *kring_mask;
    unsigned *kring_entries;
    unsigned *kflags;
    unsigned *kdropped;
    unsigned *array;
    /* Array of submission queue entries (a separate mapping from the ring). */
    struct io_uring_sqe *sqes;

    /* NOTE(review): presumably library-local head/tail cursors into sqes — confirm */
    unsigned sqe_head;
    unsigned sqe_tail;

    /* NOTE(review): presumably the size and base address of the ring mapping — confirm */
    size_t ring_sz;
    void *ring_ptr;
};

/*
 * User-space view of the completion queue.
 *
 * The first four members and the last two mirror struct io_uring_sq; the
 * remaining members carry the same names as the overflow and cqes fields
 * of struct io_cqring_offsets.
 * NOTE(review): presumably each pointer addresses that field inside the
 * mmap'ed CQ ring — confirm in io_uring_queue_mmap().
 */
struct io_uring_cq {
    unsigned *khead;
    unsigned *ktail;
    unsigned *kring_mask;
    unsigned *kring_entries;
    unsigned *koverflow;
    /* Array of completion queue entries. */
    struct io_uring_cqe *cqes;

    /* NOTE(review): presumably the size and base address of the ring mapping — confirm */
    size_t ring_sz;
    void *ring_ptr;
};

可以看出后面两个结构的前面四个字段和后面两个字段是相同的。初始化 struct io_uring 的时候，使用下面的第一个函数。这个函数的作用就如名字表现的一样，是 io uring 相关队列的初始化。实际上在这个函数的实现中，会调用多个 mmap 来初始化一些内存。在后面会分析。在初始化完成之后，为了提交 IO 请求，需要获取里面 queue 的一个项。使用下面的第二个函数。在获取到了空闲的项之后，使用下面的第三、四个函数初始化读、写请求。这两个函数除了第一个参数是 struct io_uring_sqe 之外，和 preadv、pwritev 是差不多的。在准备完成之后，使用下面的第五个函数提交请求。在提交了 IO 请求之后，可以通过下面的第六、七个函数获取请求完成的情况，这两个函数的区别是一个非阻塞和一个阻塞的区别。默认情况下，完成的 IO 请求还会存在内部的队列中，需要通过 io_uring_cqe_seen 来标记完成操作。使用完成之后要通过 io_uring_queue_exit 来完成资源清理的工作。在 http://git.kernel.dk/cgit/liburing/tree/examples/io_uring-test.c 这里有一个简单的例子。

/* Initialise the io_uring queues behind `ring`. */
extern int io_uring_queue_init(unsigned entries, struct io_uring *ring, unsigned flags);
/* Obtain a free submission queue entry to fill in. */
extern struct io_uring_sqe *io_uring_get_sqe(struct io_uring *ring);
/*
 * Prepare `sqe` as a vectored read. Apart from the leading sqe argument,
 * the parameters are analogous to preadv(2).
 */
void io_uring_prep_readv(struct io_uring_sqe *sqe, int fd,
                       const struct iovec *iovecs, unsigned nr_vecs, off_t offset);
/*
 * Prepare `sqe` as a vectored write. Apart from the leading sqe argument,
 * the parameters are analogous to pwritev(2).
 */
void io_uring_prep_writev(struct io_uring_sqe *sqe, int fd,
                    const struct iovec *iovecs, unsigned nr_vecs, off_t offset);
/* Submit the prepared requests. */
extern int io_uring_submit(struct io_uring *ring);
/* Fetch a completion without blocking. */
extern int io_uring_peek_cqe(struct io_uring *ring, struct io_uring_cqe **cqe_ptr);
/* Fetch a completion, blocking until one is available. */
extern int io_uring_wait_cqe(struct io_uring *ring, struct io_uring_cqe **cqe_ptr);
/*
 * Must be called after io_uring_{peek,wait}_cqe() after the cqe has
 * been processed by the application.
 */
static inline void io_uring_cqe_seen(struct io_uring *ring, struct io_uring_cqe *cqe);
/* Release the resources held by `ring` once it is no longer needed. */
extern void io_uring_queue_exit(struct io_uring *ring);

0x02 系统调用¶

io uring 相关的三个 syscall 以及几个相关的 struct 结构如下。前面的 io_uring_queue_init 中的操作主要就是 io_uring_setup，然后再初始化 struct io_uring_sq sq 和 struct io_uring_cq cq 的内存，另外还会分配一个 struct io_uring_sqe 的数组。这里 mmap 的时候，使用了 IORING_OFF_SQ_RING、IORING_OFF_SQES 和 IORING_OFF_CQ_RING 这几个预先定义的 offset，用于告诉系统要分配的是 io uring 相关的一些内存。这些内存在使用 io_uring_setup 的时候内核会分配好，但是在用户空间要使用这些内存的话，需要将这些内存映射到用户空间，这里就利用 mmap，并添加了几个 offset 用于实现 io uring 要求的映射。这些 offset 值定义了这三个结构保存的位置。

/*
 * The three io_uring system calls.
 *
 * io_uring_setup:    create an io_uring instance; the kernel fills in *p and
 *                    the return value is used as the ring file descriptor.
 * io_uring_enter:    submit up to to_submit requests and, when
 *                    IORING_ENTER_GETEVENTS is set in flags, wait until at
 *                    least min_complete requests have completed.
 * io_uring_register: NOTE(review): presumably registers fixed files/buffers
 *                    with the ring — confirm against the man page.
 */
int io_uring_setup(unsigned entries, struct io_uring_params *p);
int io_uring_enter(unsigned fd, unsigned to_submit, unsigned min_complete, unsigned flags, sigset_t *sig);
int io_uring_register(int fd, unsigned int opcode, const void *arg, unsigned int nr_args);

/*
 * Predefined mmap(2) offsets. Passing one of these as the offset when
 * mapping the ring fd selects which kernel-allocated region is mapped
 * into user space: the SQ ring, the CQ ring, or the array of SQEs.
 */
#define IORING_OFF_SQ_RING      0ULL
#define IORING_OFF_CQ_RING      0x8000000ULL
#define IORING_OFF_SQES         0x10000000ULL

/*
 * Parameter block passed to io_uring_setup(). Some fields are supplied by
 * the caller (e.g. flags); many others are filled in by the kernel and
 * handed back so user space can mmap the rings.
 */
struct io_uring_params {
    __u32 sq_entries;
    __u32 cq_entries;
    /* Setup flags chosen by the caller (e.g. IORING_SETUP_SQPOLL). */
    __u32 flags;
    /* With SQ polling: CPU the kernel polling thread should run on. */
    __u32 sq_thread_cpu;
    /* With SQ polling: idle-time setting for the polling thread. */
    __u32 sq_thread_idle;
    __u32 resv[5];
    /* Offsets of the SQ ring fields, for use after mmap(2). */
    struct io_sqring_offsets sq_off;
    /* Offsets of the CQ ring fields, for use after mmap(2). */
    struct io_cqring_offsets cq_off;
};

/*
 * Filled with the offset for mmap(2)
 *
 * Each member names a field of the kernel's SQ ring (struct io_sq_ring);
 * NOTE(review): presumably the value is that field's byte offset inside
 * the IORING_OFF_SQ_RING mapping — confirm.
 */
struct io_sqring_offsets {
    __u32 head;
    __u32 tail;
    __u32 ring_mask;
    __u32 ring_entries;
    __u32 flags;
    __u32 dropped;
    __u32 array;
    /* Reserved. */
    __u32 resv1;
    __u64 resv2;
};
/*
 * Offsets to the member fields of the kernel's CQ ring
 * (struct io_cq_ring), published when calling io_uring_setup.
 */
struct io_cqring_offsets {
    __u32 head;
    __u32 tail;
    __u32 ring_mask;
    __u32 ring_entries;
    __u32 overflow;
    __u32 cqes;
    /* Reserved. */
    __u64 resv[2];
};

/*
 * IO submission data structure (Submission Queue Entry)
 *
 * Written by the application and read by the kernel.
 */
struct io_uring_sqe {
    __u8    opcode;     /* type of operation for this sqe */
    __u8    flags;      /* IOSQE_ flags */
    __u16   ioprio;     /* ioprio for the request */
    __s32   fd;     /* file descriptor to do IO on */
    __u64   off;        /* offset into file */
    __u64   addr;       /* pointer to buffer or iovecs */
    __u32   len;        /* buffer size or number of iovecs */
    /* NOTE(review): presumably the active member depends on opcode — confirm */
    union {
        __kernel_rwf_t  rw_flags;
        __u32       fsync_flags;
        __u16       poll_events;
        __u32       sync_range_flags;
        __u32       msg_flags;
    };
    __u64   user_data;  /* data to be passed back at completion time */
    union {
        __u16   buf_index;  /* index into fixed buffers, if used */
        __u64   __pad2[3];  /* padding */
    };
};
/*
 * IO completion data structure (Completion Queue Entry)
 *
 * Written by the kernel and read by the application.
 */
struct io_uring_cqe {
    __u64   user_data;  /* sqe->user_data of the submission, passed back */
    __s32   res;        /* result code for this event */
    __u32   flags;
};

/*
 * io_uring_enter() flag: wait until at least min_complete requests have
 * completed before returning.
 */
#define IORING_ENTER_GETEVENTS  (1U << 0)

在初始化完成之后，用户空间就可以使用这些队列来添加 IO 请求。为了提交这些请求到内核执行，这里就要使用 int io_uring_enter(unsigned fd, unsigned to_submit, unsigned min_complete, unsigned flags, sigset_t *sig) syscall。这里有两个相关的 flags，如果设置了 IORING_ENTER_GETEVENTS 的 flags，这个调用就会一直等到至少有 min_complete 个请求完成才会返回。如果在 io_uring_setup 的时候设置了 IORING_SETUP_SQPOLL 的 flag，则系统会使用额外的一个线程来执行轮询的操作，这个线程可以运行在指定的 CPU 核心上面，io_uring_params 中的 sq_thread_cpu 和 sq_thread_idle 两个参数可以对轮询的 CPU 和时间进行一些设置。由于内核和应用直接使用了同一块内存来处理，这里实际上可以看作一种特定的共享内存，所以在 SQ 中提交的新的 IO 请求内核里面可以直接看到，不需要经过系统调用。另外 sq ring 的 flags 中的 IORING_SQ_NEED_WAKEUP 用于提示应用轮询线程已经休眠，这时应用可以在调用 io_uring_enter 时设置 IORING_ENTER_SQ_WAKEUP 来唤醒休眠中的轮询线程。如果在 io_uring_setup 的时候设置了 IORING_SETUP_SQPOLL 的 flag，则执行完成的请求可以直接通过访问相关的队列获取到，不需要经过系统调用。另外 io uring 还包含了其它的一些高级特性。

0x03 内部实现¶

Kernel 中表示一个 io uring 实例的结构是 io_ring_ctx。其中两个主要的部分是 sq ring 和 cq ring，分别代表了提交的请求的 ring 和已经完成的请求返回结构的 ring。io_sq_ring 和 io_cq_ring 的前面是一些控制字段和一些其它的数据，尾部是一个分配的时候变长的一块内存。io_sq_ring 和 io_cq_ring 不同的一个地方就是 io_cq_ring 后面保存的直接是一个 struct io_uring_cqe 的数组，前面的 struct io_uring 保存了其中被使用的 entry 的范围（即 head 和 tail）。而 io_sq_ring 保存的是一个 offset index 的数组，里面保存的信息才最终定位到对应的 struct io_uring_sqe，这些 struct io_uring_sqe 被保存到另外的一块内存中。从直觉上来说，sq 和 cq 这里的结构会是相同的。但是这里用了不同的方式，sq 加了一层的间接，从网上能找到一些信息，但不是很理解。个人猜测是不同的请求完成的时间不同，而这样进行一段时间之后，sqe 里面可用的 entry 就是不连续的，通过一个 index 数组间接的方式又可以当作是连续的使用。这两个数据在前面使用 mmap 分配内存的时候，对应到了不同的 offset，即前面 IORING_OFF_SQ_RING、IORING_OFF_CQ_RING 和 IORING_OFF_SQES 的预定义的值。

/*
 * Kernel-side layout of the submission ring: control fields followed by a
 * variable-length array of indices into the separately stored
 * io_uring_sqe array.
 *
 * Note: the struct io_uring embedded here is the kernel's head/tail pair,
 * not the liburing handle of the same name.
 */
struct io_sq_ring {
    /*
     * Head and tail offsets into the ring; the offsets need to be
     * masked to get valid indices.
     *
     * The kernel controls head and the application controls tail.
     */
    struct io_uring     r;
    /*
     * Bitmask to apply to head and tail offsets (constant, equals
     * ring_entries - 1)
     */
    u32         ring_mask;
    /* Ring size (constant, power of 2) */
    u32         ring_entries;
    /*
     * Number of invalid entries dropped by the kernel due to
     * invalid index stored in array
     *
     * Written by the kernel, shouldn't be modified by the
     * application (i.e. get number of "new events" by comparing to
     * cached value).
     *
     * After a new SQ head value was read by the application this
     * counter includes all submissions that were dropped reaching
     * the new SQ head (and possibly more).
     */
    u32         dropped;
    /*
     * Runtime flags
     *
     * Written by the kernel, shouldn't be modified by the
     * application.
     *
     * The application needs a full memory barrier before checking
     * for IORING_SQ_NEED_WAKEUP after updating the sq tail.
     */
    u32         flags;
    /*
     * Ring buffer of indices into array of io_uring_sqe, which is
     * mmapped by the application using the IORING_OFF_SQES offset.
     *
     * This indirection could e.g. be used to assign fixed
     * io_uring_sqe entries to operations and only submit them to
     * the queue when needed.
     *
     * The kernel modifies neither the indices array nor the entries
     * array.
     */
    u32         array[];
};

/*
 * This data is shared with the application through the mmap at offset
 * IORING_OFF_CQ_RING.
 *
 * The offsets to the member fields are published through struct
 * io_cqring_offsets when calling io_uring_setup.
 */
struct io_cq_ring {
    /*
     * Head and tail offsets into the ring; the offsets need to be
     * masked to get valid indices.
     *
     * The application controls head and the kernel tail.
     */
    struct io_uring     r;
    /*
     * Bitmask to apply to head and tail offsets (constant, equals
     * ring_entries - 1)
     */
    u32         ring_mask;
    /* Ring size (constant, power of 2) */
    u32         ring_entries;
    /*
     * Number of completion events lost because the queue was full;
     * this should be avoided by the application by making sure
     * there are not more requests pending than there is space in
     * the completion queue.
     *
     * Written by the kernel, shouldn't be modified by the
     * application (i.e. get number of "new events" by comparing to
     * cached value).
     *
     * As completion events come in out of order this counter is not
     * ordered with any other data.
     */
    u32         overflow;
    /*
     * Ring buffer of completion events.
     *
     * The kernel writes completion events fresh every time they are
     * produced, so the application is allowed to modify pending
     * entries.
     */
    struct io_uring_cqe cqes[];
};


/*
 * Kernel-side context representing one io_uring instance. Members are
 * grouped into cacheline-aligned sections: reference counting, submission
 * side, SQ-thread offload, completion side, fixed files/buffers, locking
 * and polling lists.
 */
struct io_ring_ctx {
    /* Reference count for the context. */
    struct {
        struct percpu_ref   refs;
    } ____cacheline_aligned_in_smp;

    /* Submission side. */
    struct {
        /* IORING_SETUP_* flags (tested in io_uring_enter). */
        unsigned int        flags;
        bool            compat;
        bool            account_mem;

        /* SQ ring */
        struct io_sq_ring   *sq_ring;
        unsigned        cached_sq_head;
        unsigned        sq_entries;
        unsigned        sq_mask;
        unsigned        sq_thread_idle;
        /* Array of SQEs, stored separately from the SQ ring itself. */
        struct io_uring_sqe *sq_sqes;

        struct list_head    defer_list;
    } ____cacheline_aligned_in_smp;

    /* IO offload */
    struct workqueue_struct *sqo_wq;
    struct task_struct  *sqo_thread;    /* if using sq thread polling */
    struct mm_struct    *sqo_mm;
    /* Wait queue woken by io_uring_enter on IORING_ENTER_SQ_WAKEUP. */
    wait_queue_head_t   sqo_wait;
    struct completion   sqo_thread_started;

    /* Completion side. */
    struct {
        /* CQ ring */
        struct io_cq_ring   *cq_ring;
        /* Kernel-private tail cursor, advanced by io_get_cqring(). */
        unsigned        cached_cq_tail;
        unsigned        cq_entries;
        unsigned        cq_mask;
        struct wait_queue_head  cq_wait;
        struct fasync_struct    *cq_fasync;
        struct eventfd_ctx  *cq_ev_fd;
    } ____cacheline_aligned_in_smp;

    /*
     * If used, fixed file set. Writers must ensure that ->refs is dead,
     * readers must ensure that ->refs is alive as long as the file* is
     * used. Only updated through io_uring_register(2).
     */
    struct file     **user_files;
    unsigned        nr_user_files;

    /* if used, fixed mapped user buffers */
    unsigned        nr_user_bufs;
    struct io_mapped_ubuf   *user_bufs;

    struct user_struct  *user;

    struct completion   ctx_done;

    struct {
        /* Held around io_ring_submit() in io_uring_enter. */
        struct mutex        uring_lock;
        wait_queue_head_t   wait;
    } ____cacheline_aligned_in_smp;

    struct {
        spinlock_t      completion_lock;
        bool            poll_multi_file;
        /*
         * ->poll_list is protected by the ctx->uring_lock for
         * io_uring instances that don't use IORING_SETUP_SQPOLL.
         * For SQPOLL, only the single threaded io_sq_thread() will
         * manipulate the list, hence no extra locking is needed there.
         */
        struct list_head    poll_list;
        struct list_head    cancel_list;
    } ____cacheline_aligned_in_smp;

    struct async_list   pending_async[2];

#if defined(CONFIG_UNIX)
    struct socket       *ring_sock;
#endif
};

cqe 是内核写应用读的部分，而 sqe 是应用写内核读的部分。io_uring_setup 的系统调用就是初始化相关数据结构的操作。如果省略很多的错误检查、权限检查和资源配额检查等等的部分，整体的操作还是比较简单的。liburing 中初始化的一个函数如下，调用 io_uring_setup 函数的时候，传入的 io_uring_params 参数中，一部分是用户设置传入的，还有很大的一部分都是在内核中 io_uring_create 操作中设置的，之后设置好的 struct io_uring_params 会被传到用户空间，用户空间根据这些数据来使用 mmap 分配内存，初始化一些数据结构，具体可以参看 http://git.kernel.dk/cgit/liburing/tree/src/setup.c 这里：

int io_uring_queue_init(unsigned entries, struct io_uring *ring, unsigned flags) {
    struct io_uring_params p;
    int fd, ret;

    memset(&p, 0, sizeof(p));
    p.flags = flags;

    fd = io_uring_setup(entries, &p);
    if (fd < 0)
        return -errno;

    ret = io_uring_queue_mmap(fd, &p, ring);
    if (ret)
        close(fd);

    return ret;
}

内核中 io_uring_enter 的部分的代码如下，这里省略了一些检查的代码。这里整体的思路比较清晰。第一种情况，flags 设置了 IORING_SETUP_SQPOLL 参数，这样的话，内核会使用一个特定的线程去轮询操作。这里使用的轮询结构会最终对应到 struct file_operations 中的 iopoll 操作，这个操作作为一个新的接口在最近才添加到这里，Linux native aio 的新功能也使用了这个 iopoll。这里 io uring 实际上只有 vfs 层的改动，其它的都是使用已经存在的东西，而且几个核心的东西和 aio 使用的相同/类似。这一步没有设置的时候，会走到 io_ring_submit，进行提交请求的操作。另外如前面所提到的，在设置了 flags IORING_ENTER_GETEVENTS 的情况，会等到 min_complete 个请求完成才会返回，这里会根据是否使用了 IORING_SETUP_IOPOLL 走入到两个分支。

/*
 * io_uring_enter(2) entry point (excerpt; validation code is elided).
 *
 * fd:           the io_uring file descriptor.
 * to_submit:    number of SQEs the caller asks to submit.
 * min_complete: with IORING_ENTER_GETEVENTS, number of completions to
 *               wait for (clamped to the CQ size).
 * flags:        IORING_ENTER_* flags.
 * sig, sigsz:   signal mask and its size, passed to io_cqring_wait().
 *
 * Returns the number of submitted requests if that is non-zero,
 * otherwise the result of the wait/poll step (or the initial -EBADF).
 */
SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
        u32, min_complete, u32, flags, const sigset_t __user *, sig,
        size_t, sigsz)
{
    struct io_ring_ctx *ctx;
    long ret = -EBADF;
    int submitted = 0;
    struct fd f;
  // ... (checks elided in this excerpt; NOTE(review): presumably f and ctx are set up here)
    /*
     * For SQ polling, the thread will do all submissions and completions.
     * Just return the requested submit count, and wake the thread if
     * we were asked to.
     */
    if (ctx->flags & IORING_SETUP_SQPOLL) {
        if (flags & IORING_ENTER_SQ_WAKEUP)
            wake_up(&ctx->sqo_wait);
        submitted = to_submit;
        goto out_ctx;
    }

    ret = 0;
    if (to_submit) {
        /* Never try to submit more than the SQ can hold. */
        to_submit = min(to_submit, ctx->sq_entries);
        mutex_lock(&ctx->uring_lock);
        submitted = io_ring_submit(ctx, to_submit);
        mutex_unlock(&ctx->uring_lock);
    }

    if (flags & IORING_ENTER_GETEVENTS) {
        unsigned nr_events = 0;
        /* Waiting for more completions than the CQ holds is clamped. */
        min_complete = min(min_complete, ctx->cq_entries);
        if (ctx->flags & IORING_SETUP_IOPOLL) {
            /* Polled IO: poll for completions. */
            ret = io_iopoll_check(ctx, &nr_events, min_complete);
        } else {
            /* Otherwise wait on the CQ ring. */
            ret = io_cqring_wait(ctx, min_complete, sig, sigsz);
        }
    }

out_ctx:
    io_ring_drop_ctx_refs(ctx, 1);
out_fput:
    fdput(f);
    /* A non-zero submit count takes precedence over ret. */
    return submitted ? submitted : ret;
}

以 io_iopoll_check 为例，正常情况下执行路线是 io_iopoll_check -> io_iopoll_getevents -> io_do_iopoll -> (kiocb->ki_filp->f_op->iopoll)。在完成请求的操作之后，会调用下面这个函数提交结果到 cqe 数组中，这样应用就能看到结果了。这里的 io_cqring_fill_event 就是获取一个目前可以写入的 cqe，写入数据。这里最终调用的会是 io_get_cqring，可以看到就是返回目前 tail 所指向的那一项（并将缓存的 tail 后移一位）。

/*
 * Post completions for every request on the `done` list.
 *
 * For each request a CQE is filled from its user_data and result, and
 * *nr_events is incremented. Requests whose last reference was dropped
 * are freed: those with REQ_F_FIXED_FILE set and REQ_F_LINK clear are
 * collected and released in batches of IO_IOPOLL_BATCH, all others are
 * freed individually. The CQ ring is committed once the list is drained.
 */
static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
                   struct list_head *done)
{
    void *batch[IO_IOPOLL_BATCH];
    int batched = 0;

    while (!list_empty(done)) {
        struct io_kiocb *req = list_first_entry(done, struct io_kiocb, list);

        list_del(&req->list);

        io_cqring_fill_event(ctx, req->user_data, req->result);
        (*nr_events)++;

        /* Someone else still holds a reference; nothing to free yet. */
        if (!refcount_dec_and_test(&req->refs))
            continue;

        /*
         * Without fixed files the completion has to be paired with
         * the file put, so those requests (and linked ones) take the
         * regular free path. Only fixed-file, non-linked requests
         * are batched.
         */
        if ((req->flags & (REQ_F_FIXED_FILE|REQ_F_LINK)) !=
            REQ_F_FIXED_FILE) {
            io_free_req(req);
            continue;
        }

        batch[batched++] = req;
        /* NOTE(review): presumably io_free_req_many() resets the count — confirm */
        if (batched == ARRAY_SIZE(batch))
            io_free_req_many(ctx, batch, &batched);
    }

    io_commit_cqring(ctx);
    io_free_req_many(ctx, batch, &batched);
}

/*
 * Reserve the next completion queue entry.
 *
 * Returns a pointer to the CQE slot at the cached tail and advances the
 * cached tail by one, or NULL when the distance between the cached tail
 * and the ring head equals ring_entries (the ring is full).
 */
static struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx)
{
    struct io_cq_ring *cq = ctx->cq_ring;
    unsigned slot = ctx->cached_cq_tail;

    /*
     * The head must be read before the cq entry is written; the
     * control dependency suffices because the entry is filled with
     * WRITE_ONCE.
     */
    if (slot - READ_ONCE(cq->r.head) == cq->ring_entries)
        return NULL;

    ctx->cached_cq_tail++;
    /* Mask the free-running counter down to a valid array index. */
    return &cq->cqes[slot & ctx->cq_mask];
}

static int io_sq_thread(void *data) 为内核轮询线程的逻辑，在设置了对应的 flags 的时候会启动。

0x04 评估¶

这里的一些信息可以参看 [1]。另外网上可以找到不少的关于 io uring 性能的数据。在一些情况下，io uring 可以达到甚至超过 spdk 的性能。io uring 的表现很惊艳。

... For peak performance, io_uring helps us get to 1.7M 4k IOPS with polling. aio reaches a performance cliff much lower than that, at 608K. The comparison here isn't quite fair, since aio doesn't support polled IO. If we disable polling, io_uring is able to drive about 1.2M IOPS for the (otherwise) same test case.

参考¶

Efficient IO with io_uring, http://kernel.dk/io_uring.pdf

原文链接：https://nan01ab.github.io/2019/05/io_uring.html