/*
 * Linux native AIO support.
 *
 * Copyright (C) 2009 IBM, Corp.
 * Copyright (C) 2009 Red Hat, Inc.
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 */
#include "qemu-common.h"
#include "qemu-aio.h"
#include "qemu-queue.h"
#include "block/raw-aio.h"
#include "event_notifier.h"

#include <libaio.h>

/*
 * Queue size (per-device).
 *
 * Used both as the event capacity passed to io_setup() and as the size of
 * the on-stack io_event batch fetched by the completion handler.
 *
 * XXX: eventually we need to communicate this to the guest and/or make it
 *      tunable by the guest.  If we get more outstanding requests at a time
 *      than this we will get EAGAIN from io_submit which is communicated to
 *      the guest as an I/O error.
 */
#define MAX_EVENTS 128

struct qemu_laiocb {
    BlockDriverAIOCB common;
    struct qemu_laio_state *ctx;
    struct iocb iocb;
    ssize_t ret;
    size_t nbytes;
34 35
    QEMUIOVector *qiov;
    bool is_read;
K
Kevin Wolf 已提交
36
    QLIST_ENTRY(qemu_laiocb) node;
37 38 39 40
};

struct qemu_laio_state {
    io_context_t ctx;
P
Paolo Bonzini 已提交
41
    EventNotifier e;
42 43 44 45 46 47 48 49
    int count;
};

/*
 * Build the signed result of a completion event: res2 shifted left by
 * 32 bits, OR-ed with res.
 */
static inline ssize_t io_event_ret(struct io_event *ev)
{
    uint64_t upper = (uint64_t)ev->res2 << 32;

    return (ssize_t)(upper | ev->res);
}

K
Kevin Wolf 已提交
50 51 52 53 54 55 56 57 58 59 60 61
/*
 * Completes an AIO request (calls the callback and frees the ACB).
 */
static void qemu_laio_process_completion(struct qemu_laio_state *s,
    struct qemu_laiocb *laiocb)
{
    int ret;

    s->count--;

    ret = laiocb->ret;
    if (ret != -ECANCELED) {
62
        if (ret == laiocb->nbytes) {
K
Kevin Wolf 已提交
63
            ret = 0;
64 65 66
        } else if (ret >= 0) {
            /* Short reads mean EOF, pad with zeros. */
            if (laiocb->is_read) {
67 68
                qemu_iovec_memset(laiocb->qiov, ret, 0,
                    laiocb->qiov->size - ret);
69 70 71 72
            } else {
                ret = -EINVAL;
            }
        }
K
Kevin Wolf 已提交
73 74 75 76 77 78 79

        laiocb->common.cb(laiocb->common.opaque, ret);
    }

    qemu_aio_release(laiocb);
}

P
Paolo Bonzini 已提交
80
static void qemu_laio_completion_cb(EventNotifier *e)
81
{
P
Paolo Bonzini 已提交
82
    struct qemu_laio_state *s = container_of(e, struct qemu_laio_state, e);
83

P
Paolo Bonzini 已提交
84
    while (event_notifier_test_and_clear(&s->e)) {
85 86 87 88 89
        struct io_event events[MAX_EVENTS];
        struct timespec ts = { 0 };
        int nevents, i;

        do {
P
Paolo Bonzini 已提交
90
            nevents = io_getevents(s->ctx, MAX_EVENTS, MAX_EVENTS, events, &ts);
91 92 93 94 95 96 97
        } while (nevents == -EINTR);

        for (i = 0; i < nevents; i++) {
            struct iocb *iocb = events[i].obj;
            struct qemu_laiocb *laiocb =
                    container_of(iocb, struct qemu_laiocb, iocb);

K
Kevin Wolf 已提交
98
            laiocb->ret = io_event_ret(&events[i]);
K
Kevin Wolf 已提交
99
            qemu_laio_process_completion(s, laiocb);
100 101 102 103
        }
    }
}

P
Paolo Bonzini 已提交
104
static int qemu_laio_flush_cb(EventNotifier *e)
105
{
P
Paolo Bonzini 已提交
106
    struct qemu_laio_state *s = container_of(e, struct qemu_laio_state, e);
107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137

    return (s->count > 0) ? 1 : 0;
}

/*
 * Cancel a request: try io_cancel(), and if that fails poll the AIO
 * context until the request has finished.
 */
static void laio_cancel(BlockDriverAIOCB *blockacb)
{
    struct qemu_laiocb *laiocb = (struct qemu_laiocb *)blockacb;
    struct io_event event;
    int ret;

    /* Already completed or cancelled: nothing to do. */
    if (laiocb->ret != -EINPROGRESS) {
        return;
    }

    /*
     * Note that as of Linux 2.6.31 neither the block device code nor any
     * filesystem implements cancellation of AIO request.
     * Thus the polling loop below is the normal code path.
     */
    ret = io_cancel(laiocb->ctx->ctx, &laiocb->iocb, &event);
    if (ret == 0) {
        /*
         * NOTE(review): on this path the in-flight count is not decremented
         * and the ACB is not released here; confirm whether a completion
         * event is still delivered for a successfully cancelled iocb.
         */
        laiocb->ret = -ECANCELED;
        return;
    }

    /*
     * We have to wait for the iocb to finish.
     *
     * The only way to get the iocb status update is by polling the io context.
     * We might be able to do this slightly more optimal by removing the
     * O_NONBLOCK flag.
     *
     * NOTE(review): the completion path calls qemu_aio_release() on the
     * request before this loop re-reads laiocb->ret; verify that the ACB
     * memory is still valid at that point.
     */
    while (laiocb->ret == -EINPROGRESS) {
        qemu_laio_completion_cb(&laiocb->ctx->e);
    }
}

S
Stefan Hajnoczi 已提交
143
static const AIOCBInfo laio_aiocb_info = {
144 145 146 147 148 149 150 151 152 153 154 155 156
    .aiocb_size         = sizeof(struct qemu_laiocb),
    .cancel             = laio_cancel,
};

/*
 * Submit one vectored read or write through Linux native AIO.
 *
 * @bs:         block driver state the request belongs to
 * @aio_ctx:    state returned by laio_init()
 * @fd:         file descriptor to operate on
 * @sector_num: first sector; the byte offset is sector_num * 512
 * @qiov:       I/O vector describing the buffers
 * @nb_sectors: number of sectors; the expected length is nb_sectors * 512
 * @cb:         completion callback
 * @opaque:     argument passed to @cb
 * @type:       QEMU_AIO_READ or QEMU_AIO_WRITE
 *
 * Returns the new AIOCB, or NULL if @type is not supported or io_submit()
 * fails (the AIOCB is released in both cases).
 */
BlockDriverAIOCB *laio_submit(BlockDriverState *bs, void *aio_ctx, int fd,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque, int type)
{
    struct qemu_laio_state *s = aio_ctx;
    struct qemu_laiocb *laiocb;
    struct iocb *iocbs;
    off_t offset = sector_num * 512;

    laiocb = qemu_aio_get(&laio_aiocb_info, bs, cb, opaque);
    /* Widen before multiplying so the product cannot overflow int. */
    laiocb->nbytes = (size_t)nb_sectors * 512;
    laiocb->ctx = s;
    laiocb->ret = -EINPROGRESS;
    laiocb->is_read = (type == QEMU_AIO_READ);
    laiocb->qiov = qiov;

    iocbs = &laiocb->iocb;

    switch (type) {
    case QEMU_AIO_WRITE:
        io_prep_pwritev(iocbs, fd, qiov->iov, qiov->niov, offset);
        break;
    case QEMU_AIO_READ:
        io_prep_preadv(iocbs, fd, qiov->iov, qiov->niov, offset);
        break;
    /* Currently Linux kernel does not support other operations */
    default:
        fprintf(stderr, "%s: invalid AIO request type 0x%x.\n",
                        __func__, type);
        goto out_free_aiocb;
    }
    /* Attach the notifier's fd to the iocb for completion signalling. */
    io_set_eventfd(&laiocb->iocb, event_notifier_get_fd(&s->e));
    /* Count the request before submitting; undone below on failure. */
    s->count++;

    if (io_submit(s->ctx, 1, &iocbs) < 0) {
        goto out_dec_count;
    }
    return &laiocb->common;

out_dec_count:
    s->count--;
out_free_aiocb:
    qemu_aio_release(laiocb);
    return NULL;
}

void *laio_init(void)
{
    struct qemu_laio_state *s;

197
    s = g_malloc0(sizeof(*s));
P
Paolo Bonzini 已提交
198
    if (event_notifier_init(&s->e, false) < 0) {
199
        goto out_free_state;
P
Paolo Bonzini 已提交
200
    }
201

P
Paolo Bonzini 已提交
202
    if (io_setup(MAX_EVENTS, &s->ctx) != 0) {
203
        goto out_close_efd;
P
Paolo Bonzini 已提交
204
    }
205

P
Paolo Bonzini 已提交
206 207
    qemu_aio_set_event_notifier(&s->e, qemu_laio_completion_cb,
                                qemu_laio_flush_cb);
208 209 210 211

    return s;

out_close_efd:
P
Paolo Bonzini 已提交
212
    event_notifier_cleanup(&s->e);
213
out_free_state:
214
    g_free(s);
215 216
    return NULL;
}