/*
 * QEMU posix-aio emulation
 *
 * Copyright IBM, Corp. 2008
 *
 * Authors:
 *  Anthony Liguori   <aliguori@us.ibm.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 *
 * Contributions after 2012-01-13 are licensed under the terms of the
 * GNU GPL, version 2 or (at your option) any later version.
 */

16
#include <sys/ioctl.h>
#include <sys/types.h>
#include <pthread.h>
#include <unistd.h>
#include <errno.h>
#include <signal.h>
#include <time.h>
#include <string.h>
#include <stdlib.h>
#include <stdio.h>

#include "qemu-queue.h"
#include "osdep.h"
#include "sysemu.h"
#include "qemu-common.h"
#include "trace.h"
#include "block_int.h"
#include "iov.h"

#include "block/raw-posix-aio.h"

36
/* Forward declaration: aio_thread() calls this before its definition. */
static void do_spawn_thread(void);
37 38 39 40 41 42

struct qemu_paiocb {
    BlockDriverAIOCB common;
    int aio_fildes;
    union {
        struct iovec *aio_iov;
43
        void *aio_ioctl_buf;
44 45 46 47 48 49
    };
    int aio_niov;
    size_t aio_nbytes;
#define aio_ioctl_cmd   aio_nbytes /* for QEMU_AIO_IOCTL */
    off_t aio_offset;

B
Blue Swirl 已提交
50
    QTAILQ_ENTRY(qemu_paiocb) node;
51 52 53 54 55 56 57 58 59 60
    int aio_type;
    ssize_t ret;
    int active;
    struct qemu_paiocb *next;
};

/*
 * Completion-side state.  rfd and wfd are the read and write ends of the
 * notification pipe created in paio_init(); first_aio heads the singly
 * linked list (through qemu_paiocb.next) of requests that have been
 * submitted and not yet released.
 */
typedef struct PosixAioState {
    int rfd, wfd;
    struct qemu_paiocb *first_aio;
} PosixAioState;
61 62 63 64 65


static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
static pthread_t thread_id;
M
malc 已提交
66
static pthread_attr_t attr;
67 68 69
static int max_threads = 64;
static int cur_threads = 0;
static int idle_threads = 0;
70 71 72
static int new_threads = 0;     /* backlog of threads we need to create */
static int pending_threads = 0; /* threads created but not running yet */
static QEMUBH *new_thread_bh;
B
Blue Swirl 已提交
73
static QTAILQ_HEAD(, qemu_paiocb) request_list;
74

75
/*
 * Non-zero while preadv()/pwritev() are worth trying.  Starts from the
 * build-time setting and is cleared at run time by handle_aiocb_rw().
 */
#ifdef CONFIG_PREADV
static int preadv_present = 1;
#else
static int preadv_present = 0;
#endif

M
malc 已提交
81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111
/*
 * Print "<what> failed: <strerror(err)>" to stderr and abort the process.
 * Does not return.
 */
static void die2(int err, const char *what)
{
    const char *reason = strerror(err);

    fprintf(stderr, "%s failed: %s\n", what, reason);
    abort();
}

/* Like die2(), but takes the error number from the current errno. */
static void die(const char *what)
{
    int saved_errno = errno;

    die2(saved_errno, what);
}

/* Lock 'mutex'; any pthread error is fatal. */
static void mutex_lock(pthread_mutex_t *mutex)
{
    int err = pthread_mutex_lock(mutex);

    if (err != 0) {
        die2(err, "pthread_mutex_lock");
    }
}

/* Unlock 'mutex'; any pthread error is fatal. */
static void mutex_unlock(pthread_mutex_t *mutex)
{
    int err = pthread_mutex_unlock(mutex);

    if (err != 0) {
        die2(err, "pthread_mutex_unlock");
    }
}

/*
 * Wait on 'cond' until signalled or until the absolute time 'ts'.
 *
 * Returns 0 or ETIMEDOUT; every other pthread error is fatal.
 */
static int cond_timedwait(pthread_cond_t *cond, pthread_mutex_t *mutex,
                           struct timespec *ts)
{
    int err = pthread_cond_timedwait(cond, mutex, ts);

    if (err != 0 && err != ETIMEDOUT) {
        die2(err, "pthread_cond_timedwait");
    }
    return err;
}

M
malc 已提交
112
/* Signal 'cond'; any pthread error is fatal. */
static void cond_signal(pthread_cond_t *cond)
{
    int ret = pthread_cond_signal(cond);

    if (ret) {
        die2(ret, "pthread_cond_signal");
    }
}

/* Start a thread running start_routine(arg); any pthread error is fatal. */
static void thread_create(pthread_t *thread, pthread_attr_t *attr,
                          void *(*start_routine)(void*), void *arg)
{
    int err = pthread_create(thread, attr, start_routine, arg);

    if (err != 0) {
        die2(err, "pthread_create");
    }
}

K
Kevin Wolf 已提交
125
static ssize_t handle_aiocb_ioctl(struct qemu_paiocb *aiocb)
126
{
127 128 129 130 131 132 133
    int ret;

    ret = ioctl(aiocb->aio_fildes, aiocb->aio_ioctl_cmd, aiocb->aio_ioctl_buf);
    if (ret == -1)
        return -errno;

    /*
134 135
     * This looks weird, but the aio code only considers a request
     * successful if it has written the full number of bytes.
136 137 138 139 140 141
     *
     * Now we overload aio_nbytes as aio_ioctl_cmd for the ioctl command,
     * so in fact we return the ioctl command here to make posix_aio_read()
     * happy..
     */
    return aiocb->aio_nbytes;
142 143
}

K
Kevin Wolf 已提交
144
static ssize_t handle_aiocb_flush(struct qemu_paiocb *aiocb)
145 146 147
{
    int ret;

148
    ret = qemu_fdatasync(aiocb->aio_fildes);
149 150 151 152 153
    if (ret == -1)
        return -errno;
    return 0;
}

154
/*
 * Thin wrappers around preadv()/pwritev().  When the build has no
 * CONFIG_PREADV they return -ENOSYS directly (errno is not set), which
 * handle_aiocb_rw() treats as "not supported".
 */
#ifdef CONFIG_PREADV

static ssize_t
qemu_preadv(int fd, const struct iovec *iov, int nr_iov, off_t offset)
{
    return preadv(fd, iov, nr_iov, offset);
}

static ssize_t
qemu_pwritev(int fd, const struct iovec *iov, int nr_iov, off_t offset)
{
    return pwritev(fd, iov, nr_iov, offset);
}

#else

static ssize_t
qemu_preadv(int fd, const struct iovec *iov, int nr_iov, off_t offset)
{
    return -ENOSYS;
}

static ssize_t
qemu_pwritev(int fd, const struct iovec *iov, int nr_iov, off_t offset)
{
    return -ENOSYS;
}

#endif

K
Kevin Wolf 已提交
184
static ssize_t handle_aiocb_rw_vector(struct qemu_paiocb *aiocb)
185 186 187 188
{
    ssize_t len;

    do {
189
        if (aiocb->aio_type & QEMU_AIO_WRITE)
190 191 192
            len = qemu_pwritev(aiocb->aio_fildes,
                               aiocb->aio_iov,
                               aiocb->aio_niov,
193
                               aiocb->aio_offset);
194 195 196 197
         else
            len = qemu_preadv(aiocb->aio_fildes,
                              aiocb->aio_iov,
                              aiocb->aio_niov,
198
                              aiocb->aio_offset);
199 200 201 202 203 204 205
    } while (len == -1 && errno == EINTR);

    if (len == -1)
        return -errno;
    return len;
}

206 207 208 209 210 211
/*
 * Read/writes the data to/from a given linear buffer.
 *
 * Returns the number of bytes handles or -errno in case of an error. Short
 * reads are only returned if the end of the file is reached.
 */
K
Kevin Wolf 已提交
212
static ssize_t handle_aiocb_rw_linear(struct qemu_paiocb *aiocb, char *buf)
213
{
K
Kevin Wolf 已提交
214 215
    ssize_t offset = 0;
    ssize_t len;
216 217

    while (offset < aiocb->aio_nbytes) {
218
         if (aiocb->aio_type & QEMU_AIO_WRITE)
219 220 221 222 223 224 225
             len = pwrite(aiocb->aio_fildes,
                          (const char *)buf + offset,
                          aiocb->aio_nbytes - offset,
                          aiocb->aio_offset + offset);
         else
             len = pread(aiocb->aio_fildes,
                         buf + offset,
226 227 228
                         aiocb->aio_nbytes - offset,
                         aiocb->aio_offset + offset);

229 230 231 232 233 234 235 236 237
         if (len == -1 && errno == EINTR)
             continue;
         else if (len == -1) {
             offset = -errno;
             break;
         } else if (len == 0)
             break;

         offset += len;
238 239 240 241 242
    }

    return offset;
}

K
Kevin Wolf 已提交
243
static ssize_t handle_aiocb_rw(struct qemu_paiocb *aiocb)
244
{
K
Kevin Wolf 已提交
245
    ssize_t nbytes;
246 247
    char *buf;

248
    if (!(aiocb->aio_type & QEMU_AIO_MISALIGNED)) {
249 250 251 252
        /*
         * If there is just a single buffer, and it is properly aligned
         * we can just use plain pread/pwrite without any problems.
         */
253 254 255 256 257 258 259 260 261
        if (aiocb->aio_niov == 1)
             return handle_aiocb_rw_linear(aiocb, aiocb->aio_iov->iov_base);

        /*
         * We have more than one iovec, and all are properly aligned.
         *
         * Try preadv/pwritev first and fall back to linearizing the
         * buffer if it's not supported.
         */
262
        if (preadv_present) {
263 264
            nbytes = handle_aiocb_rw_vector(aiocb);
            if (nbytes == aiocb->aio_nbytes)
265
                return nbytes;
266 267 268 269 270 271 272 273 274 275
            if (nbytes < 0 && nbytes != -ENOSYS)
                return nbytes;
            preadv_present = 0;
        }

        /*
         * XXX(hch): short read/write.  no easy way to handle the reminder
         * using these interfaces.  For now retry using plain
         * pread/pwrite?
         */
276
    }
277

278 279 280 281
    /*
     * Ok, we have to do it the hard way, copy all segments into
     * a single aligned buffer.
     */
282
    buf = qemu_blockalign(aiocb->common.bs, aiocb->aio_nbytes);
283
    if (aiocb->aio_type & QEMU_AIO_WRITE) {
284 285 286 287 288 289 290 291 292 293
        char *p = buf;
        int i;

        for (i = 0; i < aiocb->aio_niov; ++i) {
            memcpy(p, aiocb->aio_iov[i].iov_base, aiocb->aio_iov[i].iov_len);
            p += aiocb->aio_iov[i].iov_len;
        }
    }

    nbytes = handle_aiocb_rw_linear(aiocb, buf);
294
    if (!(aiocb->aio_type & QEMU_AIO_WRITE)) {
295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310
        char *p = buf;
        size_t count = aiocb->aio_nbytes, copy;
        int i;

        for (i = 0; i < aiocb->aio_niov && count; ++i) {
            copy = count;
            if (copy > aiocb->aio_iov[i].iov_len)
                copy = aiocb->aio_iov[i].iov_len;
            memcpy(aiocb->aio_iov[i].iov_base, p, copy);
            p     += copy;
            count -= copy;
        }
    }
    qemu_vfree(buf);

    return nbytes;
311 312
}

F
Frediano Ziglio 已提交
313 314
/* Forward declaration: aio_thread() calls this before its definition. */
static void posix_aio_notify_event(void);

315 316
static void *aio_thread(void *unused)
{
317 318 319 320 321
    mutex_lock(&lock);
    pending_threads--;
    mutex_unlock(&lock);
    do_spawn_thread();

322 323
    while (1) {
        struct qemu_paiocb *aiocb;
K
Kevin Wolf 已提交
324
        ssize_t ret = 0;
325 326 327 328 329 330
        qemu_timeval tv;
        struct timespec ts;

        qemu_gettimeofday(&tv);
        ts.tv_sec = tv.tv_sec + 10;
        ts.tv_nsec = 0;
331

M
malc 已提交
332
        mutex_lock(&lock);
333

B
Blue Swirl 已提交
334
        while (QTAILQ_EMPTY(&request_list) &&
335
               !(ret == ETIMEDOUT)) {
336
            idle_threads++;
M
malc 已提交
337
            ret = cond_timedwait(&cond, &lock, &ts);
338
            idle_threads--;
339 340
        }

B
Blue Swirl 已提交
341
        if (QTAILQ_EMPTY(&request_list))
342 343
            break;

B
Blue Swirl 已提交
344 345
        aiocb = QTAILQ_FIRST(&request_list);
        QTAILQ_REMOVE(&request_list, aiocb, node);
346
        aiocb->active = 1;
M
malc 已提交
347
        mutex_unlock(&lock);
348

349 350
        switch (aiocb->aio_type & QEMU_AIO_TYPE_MASK) {
        case QEMU_AIO_READ:
351 352 353 354
            ret = handle_aiocb_rw(aiocb);
            if (ret >= 0 && ret < aiocb->aio_nbytes && aiocb->common.bs->growable) {
                /* A short read means that we have reached EOF. Pad the buffer
                 * with zeros for bytes after EOF. */
355 356
                iov_memset(aiocb->aio_iov, aiocb->aio_niov, ret,
                           0, aiocb->aio_nbytes - ret);
357 358 359 360

                ret = aiocb->aio_nbytes;
            }
            break;
361
        case QEMU_AIO_WRITE:
362 363
            ret = handle_aiocb_rw(aiocb);
            break;
364
        case QEMU_AIO_FLUSH:
365 366
            ret = handle_aiocb_flush(aiocb);
            break;
367
        case QEMU_AIO_IOCTL:
368 369 370 371 372 373 374
            ret = handle_aiocb_ioctl(aiocb);
            break;
        default:
            fprintf(stderr, "invalid aio request (0x%x)\n", aiocb->aio_type);
            ret = -EINVAL;
            break;
        }
375

M
malc 已提交
376
        mutex_lock(&lock);
377
        aiocb->ret = ret;
M
malc 已提交
378
        mutex_unlock(&lock);
379

F
Frediano Ziglio 已提交
380
        posix_aio_notify_event();
381 382 383
    }

    cur_threads--;
M
malc 已提交
384
    mutex_unlock(&lock);
385 386 387 388

    return NULL;
}

389
static void do_spawn_thread(void)
390
{
391 392
    sigset_t set, oldset;

393 394 395 396 397 398 399 400 401 402
    mutex_lock(&lock);
    if (!new_threads) {
        mutex_unlock(&lock);
        return;
    }

    new_threads--;
    pending_threads++;

    mutex_unlock(&lock);
403 404 405 406 407

    /* block all signals */
    if (sigfillset(&set)) die("sigfillset");
    if (sigprocmask(SIG_SETMASK, &set, &oldset)) die("sigprocmask");

M
malc 已提交
408
    thread_create(&thread_id, &attr, aio_thread, NULL);
409 410

    if (sigprocmask(SIG_SETMASK, &oldset, NULL)) die("sigprocmask restore");
411 412
}

413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433
/* Bottom-half callback registered in paio_init(); 'opaque' is not used. */
static void spawn_thread_bh_fn(void *opaque)
{
    (void)opaque;

    do_spawn_thread();
}

/*
 * Account for one more worker and arrange for it to be created.
 *
 * The thread is not created here.  While another thread is still starting
 * up (pending_threads != 0) that thread picks up the backlog itself, so
 * we neither create many threads in a loop while the mutex is held nor
 * starve the current vcpu.  Otherwise the bottom half is scheduled so the
 * main thread creates the worker, which then inherits the correct
 * affinity instead of the vcpu affinity.
 */
static void spawn_thread(void)
{
    new_threads++;
    cur_threads++;

    if (pending_threads == 0) {
        qemu_bh_schedule(new_thread_bh);
    }
}

434
static void qemu_paio_submit(struct qemu_paiocb *aiocb)
435 436 437
{
    aiocb->ret = -EINPROGRESS;
    aiocb->active = 0;
M
malc 已提交
438
    mutex_lock(&lock);
439 440
    if (idle_threads == 0 && cur_threads < max_threads)
        spawn_thread();
B
Blue Swirl 已提交
441
    QTAILQ_INSERT_TAIL(&request_list, aiocb, node);
M
malc 已提交
442
    mutex_unlock(&lock);
M
malc 已提交
443
    cond_signal(&cond);
444 445
}

446
static ssize_t qemu_paio_return(struct qemu_paiocb *aiocb)
447 448 449
{
    ssize_t ret;

M
malc 已提交
450
    mutex_lock(&lock);
451
    ret = aiocb->ret;
M
malc 已提交
452
    mutex_unlock(&lock);
453 454 455 456

    return ret;
}

457
/*
 * Return the request's status as a positive errno value (for example
 * EINPROGRESS or ECANCELED), or 0 when the stored result is not negative.
 */
static int qemu_paio_error(struct qemu_paiocb *aiocb)
{
    ssize_t ret = qemu_paio_return(aiocb);

    if (ret < 0) {
        ret = -ret;
    } else {
        ret = 0;
    }

    return ret;
}

469
static void posix_aio_read(void *opaque)
470
{
471 472
    PosixAioState *s = opaque;
    struct qemu_paiocb *acb, **pacb;
473
    int ret;
474 475 476 477 478 479 480 481 482 483 484 485 486
    ssize_t len;

    /* read all bytes from signal pipe */
    for (;;) {
        char bytes[16];

        len = read(s->rfd, bytes, sizeof(bytes));
        if (len == -1 && errno == EINTR)
            continue; /* try again */
        if (len == sizeof(bytes))
            continue; /* more to read */
        break;
    }
487 488 489 490 491 492

    for(;;) {
        pacb = &s->first_aio;
        for(;;) {
            acb = *pacb;
            if (!acb)
493
                return;
K
Kevin Wolf 已提交
494

495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510
            ret = qemu_paio_error(acb);
            if (ret == ECANCELED) {
                /* remove the request */
                *pacb = acb->next;
                qemu_aio_release(acb);
            } else if (ret != EINPROGRESS) {
                /* end of aio */
                if (ret == 0) {
                    ret = qemu_paio_return(acb);
                    if (ret == acb->aio_nbytes)
                        ret = 0;
                    else
                        ret = -EINVAL;
                } else {
                    ret = -ret;
                }
511 512 513

                trace_paio_complete(acb, acb->common.opaque, ret);

514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534
                /* remove the request */
                *pacb = acb->next;
                /* call the callback */
                acb->common.cb(acb->common.opaque, ret);
                qemu_aio_release(acb);
                break;
            } else {
                pacb = &acb->next;
            }
        }
    }
}

/* Return 1 while the first_aio list of 'opaque' is non-empty, else 0. */
static int posix_aio_flush(void *opaque)
{
    PosixAioState *state = opaque;

    return state->first_aio != NULL;
}

/* Single instance of the completion state; assigned at the end of paio_init(). */
static PosixAioState *posix_aio_state;

F
Frediano Ziglio 已提交
535
static void posix_aio_notify_event(void)
536
{
F
Frediano Ziglio 已提交
537 538
    char byte = 0;
    ssize_t ret;
539

F
Frediano Ziglio 已提交
540 541 542
    ret = write(posix_aio_state->wfd, &byte, sizeof(byte));
    if (ret < 0 && errno != EAGAIN)
        die("write()");
543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567
}

/*
 * Unlink 'acb' from the first_aio list and release it.  If the request is
 * not on the list, a message is printed and nothing is released.
 */
static void paio_remove(struct qemu_paiocb *acb)
{
    struct qemu_paiocb **link = &posix_aio_state->first_aio;

    /* advance to the link that points at acb, or to the end of the list */
    while (*link != NULL && *link != acb) {
        link = &(*link)->next;
    }

    if (*link == NULL) {
        fprintf(stderr, "paio_remove: aio request not found!\n");
        return;
    }

    *link = acb->next;
    qemu_aio_release(acb);
}

/*
 * Cancel callback of raw_aio_pool.
 *
 * A request no worker has picked up yet is taken off request_list and
 * marked -ECANCELED.  A request that is being processed cannot be
 * interrupted, so this spins until its result is available.  In both
 * cases the request is then unlinked and released by paio_remove().
 */
static void paio_cancel(BlockDriverAIOCB *blockacb)
{
    struct qemu_paiocb *acb = (struct qemu_paiocb *)blockacb;
    int active = 0;

    trace_paio_cancel(acb, acb->common.opaque);

    mutex_lock(&lock);
    if (!acb->active) {
        QTAILQ_REMOVE(&request_list, acb, node);
        acb->ret = -ECANCELED;
    } else if (acb->ret == -EINPROGRESS) {
        active = 1;
    }
    mutex_unlock(&lock);

    if (active) {
        /* fail safe: if the aio could not be canceled, we wait for
           it */
        while (qemu_paio_error(acb) == EINPROGRESS) {
            ;
        }
    }

    paio_remove(acb);
}

/*
 * Pool passed to qemu_aio_get() by paio_submit() and paio_ioctl(); its
 * entries are sized for struct qemu_paiocb and cancelled via paio_cancel().
 */
static AIOPool raw_aio_pool = {
    .aiocb_size         = sizeof(struct qemu_paiocb),
    .cancel             = paio_cancel,
};

K
Kevin Wolf 已提交
595
/*
 * Submit an asynchronous request of the given QEMU_AIO_* 'type' on 'fd'.
 *
 * @bs, @cb, @opaque: passed to qemu_aio_get() for the new request
 * @sector_num:       start of the request in 512-byte sectors
 * @qiov:             scatter/gather list, or NULL
 * @nb_sectors:       length of the request in 512-byte sectors
 *
 * The request is put on the first_aio list and handed to the worker pool.
 * Returns the common part of the new request.
 */
BlockDriverAIOCB *paio_submit(BlockDriverState *bs, int fd,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque, int type)
{
    struct qemu_paiocb *acb;

    acb = qemu_aio_get(&raw_aio_pool, bs, cb, opaque);
    acb->aio_type = type;
    acb->aio_fildes = fd;

    if (qiov) {
        acb->aio_iov = qiov->iov;
        acb->aio_niov = qiov->niov;
    } else {
        /* do not leave stale values from a previous user of the object */
        acb->aio_iov = NULL;
        acb->aio_niov = 0;
    }
    /* widen before multiplying so the byte count cannot overflow int */
    acb->aio_nbytes = (size_t)nb_sectors * 512;
    acb->aio_offset = sector_num * 512;

    acb->next = posix_aio_state->first_aio;
    posix_aio_state->first_aio = acb;

    trace_paio_submit(acb, opaque, sector_num, nb_sectors, type);
    qemu_paio_submit(acb);
    return &acb->common;
}

/*
 * Submit an asynchronous ioctl 'req' with argument 'buf' on 'fd'.
 *
 * The command number is stored in aio_ioctl_cmd, which aliases aio_nbytes.
 * The request is put on the first_aio list and handed to the worker pool.
 * Returns the common part of the new request.
 */
BlockDriverAIOCB *paio_ioctl(BlockDriverState *bs, int fd,
        unsigned long int req, void *buf,
        BlockDriverCompletionFunc *cb, void *opaque)
{
    struct qemu_paiocb *aiocb = qemu_aio_get(&raw_aio_pool, bs, cb, opaque);

    /* describe the request */
    aiocb->aio_fildes = fd;
    aiocb->aio_type = QEMU_AIO_IOCTL;
    aiocb->aio_ioctl_cmd = req;
    aiocb->aio_ioctl_buf = buf;
    aiocb->aio_offset = 0;

    /* push it onto the head of the list of outstanding requests */
    aiocb->next = posix_aio_state->first_aio;
    posix_aio_state->first_aio = aiocb;

    qemu_paio_submit(aiocb);
    return &aiocb->common;
}

K
Kevin Wolf 已提交
640
int paio_init(void)
641 642 643 644 645 646
{
    PosixAioState *s;
    int fds[2];
    int ret;

    if (posix_aio_state)
K
Kevin Wolf 已提交
647
        return 0;
648

649
    s = g_malloc(sizeof(PosixAioState));
650 651

    s->first_aio = NULL;
K
Kevin Wolf 已提交
652
    if (qemu_pipe(fds) == -1) {
653
        fprintf(stderr, "failed to create pipe\n");
654
        g_free(s);
K
Kevin Wolf 已提交
655
        return -1;
656 657 658 659 660 661 662 663
    }

    s->rfd = fds[0];
    s->wfd = fds[1];

    fcntl(s->rfd, F_SETFL, O_NONBLOCK);
    fcntl(s->wfd, F_SETFL, O_NONBLOCK);

664
    qemu_aio_set_fd_handler(s->rfd, posix_aio_read, NULL, posix_aio_flush, s);
665 666 667 668 669 670 671 672 673

    ret = pthread_attr_init(&attr);
    if (ret)
        die2(ret, "pthread_attr_init");

    ret = pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED);
    if (ret)
        die2(ret, "pthread_attr_setdetachstate");

B
Blue Swirl 已提交
674
    QTAILQ_INIT(&request_list);
675
    new_thread_bh = qemu_bh_new(spawn_thread_bh_fn, NULL);
676 677

    posix_aio_state = s;
K
Kevin Wolf 已提交
678
    return 0;
679
}