posix-aio-compat.c 16.7 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11
/*
 * QEMU posix-aio emulation
 *
 * Copyright IBM, Corp. 2008
 *
 * Authors:
 *  Anthony Liguori   <aliguori@us.ibm.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 *
12 13
 * Contributions after 2012-01-13 are licensed under the terms of the
 * GNU GPL, version 2 or (at your option) any later version.
14 15
 */

16
#include <sys/ioctl.h>
17
#include <sys/types.h>
18 19 20
#include <pthread.h>
#include <unistd.h>
#include <errno.h>
21
#include <time.h>
M
malc 已提交
22 23 24
#include <string.h>
#include <stdlib.h>
#include <stdio.h>
25

B
Blue Swirl 已提交
26
#include "qemu-queue.h"
27
#include "osdep.h"
28
#include "sysemu.h"
29
#include "qemu-common.h"
30
#include "trace.h"
31 32 33 34
#include "block_int.h"

#include "block/raw-posix-aio.h"

35
static void do_spawn_thread(void);
36 37 38 39 40 41

struct qemu_paiocb {
    BlockDriverAIOCB common;
    int aio_fildes;
    union {
        struct iovec *aio_iov;
42
        void *aio_ioctl_buf;
43 44 45 46 47 48
    };
    int aio_niov;
    size_t aio_nbytes;
#define aio_ioctl_cmd   aio_nbytes /* for QEMU_AIO_IOCTL */
    off_t aio_offset;

B
Blue Swirl 已提交
49
    QTAILQ_ENTRY(qemu_paiocb) node;
50 51 52 53 54 55 56 57 58 59
    int aio_type;
    ssize_t ret;
    int active;
    struct qemu_paiocb *next;
};

/*
 * Global state of the AIO emulation: the two ends of the completion
 * notification pipe and the singly linked list of outstanding requests.
 */
typedef struct PosixAioState {
    int rfd;                        /* read end, drained by posix_aio_read() */
    int wfd;                        /* write end, poked by worker threads */
    struct qemu_paiocb *first_aio;  /* head of the outstanding request list */
} PosixAioState;
60 61 62 63 64


static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
static pthread_t thread_id;
M
malc 已提交
65
static pthread_attr_t attr;
66 67 68
static int max_threads = 64;
static int cur_threads = 0;
static int idle_threads = 0;
69 70 71
static int new_threads = 0;     /* backlog of threads we need to create */
static int pending_threads = 0; /* threads created but not running yet */
static QEMUBH *new_thread_bh;
B
Blue Swirl 已提交
72
static QTAILQ_HEAD(, qemu_paiocb) request_list;
73

74
#ifdef CONFIG_PREADV
75 76 77 78 79
static int preadv_present = 1;
#else
static int preadv_present = 0;
#endif

M
malc 已提交
80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110
/*
 * Print "<what> failed: <error text>" for the errno-style code err on
 * stderr and abort the process.  Does not return.
 */
static void die2(int err, const char *what)
{
    const char *reason = strerror(err);

    fprintf(stderr, "%s failed: %s\n", what, reason);
    abort();
}

/* Like die2(), but takes the error code from the current errno. */
static void die(const char *what)
{
    int saved_errno = errno;

    die2(saved_errno, what);
}

/* Lock mutex; any pthread error is fatal. */
static void mutex_lock(pthread_mutex_t *mutex)
{
    int err = pthread_mutex_lock(mutex);

    if (err != 0) {
        die2(err, "pthread_mutex_lock");
    }
}

/* Unlock mutex; any pthread error is fatal. */
static void mutex_unlock(pthread_mutex_t *mutex)
{
    int err = pthread_mutex_unlock(mutex);

    if (err != 0) {
        die2(err, "pthread_mutex_unlock");
    }
}

/*
 * Wait on cond until it is signalled or the absolute deadline ts passes.
 *
 * Returns 0 on wakeup or ETIMEDOUT on timeout; every other pthread error
 * is fatal.
 */
static int cond_timedwait(pthread_cond_t *cond, pthread_mutex_t *mutex,
                           struct timespec *ts)
{
    int err = pthread_cond_timedwait(cond, mutex, ts);

    switch (err) {
    case 0:
    case ETIMEDOUT:
        break;
    default:
        die2(err, "pthread_cond_timedwait");
        break;
    }

    return err;
}

M
malc 已提交
111
/* Wake one waiter on cond; any pthread error is fatal. */
static void cond_signal(pthread_cond_t *cond)
{
    int err = pthread_cond_signal(cond);

    if (err != 0) {
        die2(err, "pthread_cond_signal");
    }
}

/* Start a thread running start_routine(arg); any pthread error is fatal. */
static void thread_create(pthread_t *thread, pthread_attr_t *attr,
                          void *(*start_routine)(void*), void *arg)
{
    int err = pthread_create(thread, attr, start_routine, arg);

    if (err != 0) {
        die2(err, "pthread_create");
    }
}

K
Kevin Wolf 已提交
124
static ssize_t handle_aiocb_ioctl(struct qemu_paiocb *aiocb)
125
{
126 127 128 129 130 131 132
    int ret;

    ret = ioctl(aiocb->aio_fildes, aiocb->aio_ioctl_cmd, aiocb->aio_ioctl_buf);
    if (ret == -1)
        return -errno;

    /*
133 134
     * This looks weird, but the aio code only considers a request
     * successful if it has written the full number of bytes.
135 136 137 138 139 140
     *
     * Now we overload aio_nbytes as aio_ioctl_cmd for the ioctl command,
     * so in fact we return the ioctl command here to make posix_aio_read()
     * happy..
     */
    return aiocb->aio_nbytes;
141 142
}

K
Kevin Wolf 已提交
143
static ssize_t handle_aiocb_flush(struct qemu_paiocb *aiocb)
144 145 146
{
    int ret;

147
    ret = qemu_fdatasync(aiocb->aio_fildes);
148 149 150 151 152
    if (ret == -1)
        return -errno;
    return 0;
}

153
#ifdef CONFIG_PREADV
154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182

/* Thin wrapper around preadv(); same arguments and return convention. */
static ssize_t
qemu_preadv(int fd, const struct iovec *iov, int nr_iov, off_t offset)
{
    ssize_t got = preadv(fd, iov, nr_iov, offset);

    return got;
}

/* Thin wrapper around pwritev(); same arguments and return convention. */
static ssize_t
qemu_pwritev(int fd, const struct iovec *iov, int nr_iov, off_t offset)
{
    ssize_t put = pwritev(fd, iov, nr_iov, offset);

    return put;
}

#else

/*
 * Fallback used when CONFIG_PREADV is not defined: always reports
 * -ENOSYS (returned directly, errno is not touched).
 */
static ssize_t
qemu_preadv(int fd, const struct iovec *iov, int nr_iov, off_t offset)
{
    (void)fd;
    (void)iov;
    (void)nr_iov;
    (void)offset;

    return -ENOSYS;
}

/*
 * Fallback used when CONFIG_PREADV is not defined: always reports
 * -ENOSYS (returned directly, errno is not touched).
 */
static ssize_t
qemu_pwritev(int fd, const struct iovec *iov, int nr_iov, off_t offset)
{
    (void)fd;
    (void)iov;
    (void)nr_iov;
    (void)offset;

    return -ENOSYS;
}

#endif

K
Kevin Wolf 已提交
183
static ssize_t handle_aiocb_rw_vector(struct qemu_paiocb *aiocb)
184 185 186 187
{
    ssize_t len;

    do {
188
        if (aiocb->aio_type & QEMU_AIO_WRITE)
189 190 191
            len = qemu_pwritev(aiocb->aio_fildes,
                               aiocb->aio_iov,
                               aiocb->aio_niov,
192
                               aiocb->aio_offset);
193 194 195 196
         else
            len = qemu_preadv(aiocb->aio_fildes,
                              aiocb->aio_iov,
                              aiocb->aio_niov,
197
                              aiocb->aio_offset);
198 199 200 201 202 203 204
    } while (len == -1 && errno == EINTR);

    if (len == -1)
        return -errno;
    return len;
}

205 206 207 208 209 210
/*
 * Read/writes the data to/from a given linear buffer.
 *
 * Returns the number of bytes handles or -errno in case of an error. Short
 * reads are only returned if the end of the file is reached.
 */
K
Kevin Wolf 已提交
211
static ssize_t handle_aiocb_rw_linear(struct qemu_paiocb *aiocb, char *buf)
212
{
K
Kevin Wolf 已提交
213 214
    ssize_t offset = 0;
    ssize_t len;
215 216

    while (offset < aiocb->aio_nbytes) {
217
         if (aiocb->aio_type & QEMU_AIO_WRITE)
218 219 220 221 222 223 224
             len = pwrite(aiocb->aio_fildes,
                          (const char *)buf + offset,
                          aiocb->aio_nbytes - offset,
                          aiocb->aio_offset + offset);
         else
             len = pread(aiocb->aio_fildes,
                         buf + offset,
225 226 227
                         aiocb->aio_nbytes - offset,
                         aiocb->aio_offset + offset);

228 229 230 231 232 233 234 235 236
         if (len == -1 && errno == EINTR)
             continue;
         else if (len == -1) {
             offset = -errno;
             break;
         } else if (len == 0)
             break;

         offset += len;
237 238 239 240 241
    }

    return offset;
}

K
Kevin Wolf 已提交
242
static ssize_t handle_aiocb_rw(struct qemu_paiocb *aiocb)
243
{
K
Kevin Wolf 已提交
244
    ssize_t nbytes;
245 246
    char *buf;

247
    if (!(aiocb->aio_type & QEMU_AIO_MISALIGNED)) {
248 249 250 251
        /*
         * If there is just a single buffer, and it is properly aligned
         * we can just use plain pread/pwrite without any problems.
         */
252 253 254 255 256 257 258 259 260
        if (aiocb->aio_niov == 1)
             return handle_aiocb_rw_linear(aiocb, aiocb->aio_iov->iov_base);

        /*
         * We have more than one iovec, and all are properly aligned.
         *
         * Try preadv/pwritev first and fall back to linearizing the
         * buffer if it's not supported.
         */
261
        if (preadv_present) {
262 263
            nbytes = handle_aiocb_rw_vector(aiocb);
            if (nbytes == aiocb->aio_nbytes)
264
                return nbytes;
265 266 267 268 269 270 271 272 273 274
            if (nbytes < 0 && nbytes != -ENOSYS)
                return nbytes;
            preadv_present = 0;
        }

        /*
         * XXX(hch): short read/write.  no easy way to handle the reminder
         * using these interfaces.  For now retry using plain
         * pread/pwrite?
         */
275
    }
276

277 278 279 280
    /*
     * Ok, we have to do it the hard way, copy all segments into
     * a single aligned buffer.
     */
281
    buf = qemu_blockalign(aiocb->common.bs, aiocb->aio_nbytes);
282
    if (aiocb->aio_type & QEMU_AIO_WRITE) {
283 284 285 286 287 288 289 290 291 292
        char *p = buf;
        int i;

        for (i = 0; i < aiocb->aio_niov; ++i) {
            memcpy(p, aiocb->aio_iov[i].iov_base, aiocb->aio_iov[i].iov_len);
            p += aiocb->aio_iov[i].iov_len;
        }
    }

    nbytes = handle_aiocb_rw_linear(aiocb, buf);
293
    if (!(aiocb->aio_type & QEMU_AIO_WRITE)) {
294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309
        char *p = buf;
        size_t count = aiocb->aio_nbytes, copy;
        int i;

        for (i = 0; i < aiocb->aio_niov && count; ++i) {
            copy = count;
            if (copy > aiocb->aio_iov[i].iov_len)
                copy = aiocb->aio_iov[i].iov_len;
            memcpy(aiocb->aio_iov[i].iov_base, p, copy);
            p     += copy;
            count -= copy;
        }
    }
    qemu_vfree(buf);

    return nbytes;
310 311
}

F
Frediano Ziglio 已提交
312 313
static void posix_aio_notify_event(void);

314 315
static void *aio_thread(void *unused)
{
316 317 318 319 320
    mutex_lock(&lock);
    pending_threads--;
    mutex_unlock(&lock);
    do_spawn_thread();

321 322
    while (1) {
        struct qemu_paiocb *aiocb;
K
Kevin Wolf 已提交
323
        ssize_t ret = 0;
324 325 326 327 328 329
        qemu_timeval tv;
        struct timespec ts;

        qemu_gettimeofday(&tv);
        ts.tv_sec = tv.tv_sec + 10;
        ts.tv_nsec = 0;
330

M
malc 已提交
331
        mutex_lock(&lock);
332

B
Blue Swirl 已提交
333
        while (QTAILQ_EMPTY(&request_list) &&
334
               !(ret == ETIMEDOUT)) {
335
            idle_threads++;
M
malc 已提交
336
            ret = cond_timedwait(&cond, &lock, &ts);
337
            idle_threads--;
338 339
        }

B
Blue Swirl 已提交
340
        if (QTAILQ_EMPTY(&request_list))
341 342
            break;

B
Blue Swirl 已提交
343 344
        aiocb = QTAILQ_FIRST(&request_list);
        QTAILQ_REMOVE(&request_list, aiocb, node);
345
        aiocb->active = 1;
M
malc 已提交
346
        mutex_unlock(&lock);
347

348 349
        switch (aiocb->aio_type & QEMU_AIO_TYPE_MASK) {
        case QEMU_AIO_READ:
350 351 352 353 354 355 356 357 358 359 360 361 362
            ret = handle_aiocb_rw(aiocb);
            if (ret >= 0 && ret < aiocb->aio_nbytes && aiocb->common.bs->growable) {
                /* A short read means that we have reached EOF. Pad the buffer
                 * with zeros for bytes after EOF. */
                QEMUIOVector qiov;

                qemu_iovec_init_external(&qiov, aiocb->aio_iov,
                                         aiocb->aio_niov);
                qemu_iovec_memset_skip(&qiov, 0, aiocb->aio_nbytes - ret, ret);

                ret = aiocb->aio_nbytes;
            }
            break;
363
        case QEMU_AIO_WRITE:
364 365
            ret = handle_aiocb_rw(aiocb);
            break;
366
        case QEMU_AIO_FLUSH:
367 368
            ret = handle_aiocb_flush(aiocb);
            break;
369
        case QEMU_AIO_IOCTL:
370 371 372 373 374 375 376
            ret = handle_aiocb_ioctl(aiocb);
            break;
        default:
            fprintf(stderr, "invalid aio request (0x%x)\n", aiocb->aio_type);
            ret = -EINVAL;
            break;
        }
377

M
malc 已提交
378
        mutex_lock(&lock);
379
        aiocb->ret = ret;
M
malc 已提交
380
        mutex_unlock(&lock);
381

F
Frediano Ziglio 已提交
382
        posix_aio_notify_event();
383 384 385
    }

    cur_threads--;
M
malc 已提交
386
    mutex_unlock(&lock);
387 388 389 390

    return NULL;
}

391
static void do_spawn_thread(void)
392
{
393 394
    sigset_t set, oldset;

395 396 397 398 399 400 401 402 403 404
    mutex_lock(&lock);
    if (!new_threads) {
        mutex_unlock(&lock);
        return;
    }

    new_threads--;
    pending_threads++;

    mutex_unlock(&lock);
405 406 407 408 409

    /* block all signals */
    if (sigfillset(&set)) die("sigfillset");
    if (sigprocmask(SIG_SETMASK, &set, &oldset)) die("sigprocmask");

M
malc 已提交
410
    thread_create(&thread_id, &attr, aio_thread, NULL);
411 412

    if (sigprocmask(SIG_SETMASK, &oldset, NULL)) die("sigprocmask restore");
413 414
}

415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435
/* Bottom-half callback: create a worker thread from the thread running the
 * bottom half.  The opaque argument is not used. */
static void spawn_thread_bh_fn(void *opaque)
{
    (void)opaque;

    do_spawn_thread();
}

/*
 * Account for one more worker thread and arrange for it to be created.
 * The only visible caller, qemu_paio_submit(), holds the lock here.
 */
static void spawn_thread(void)
{
    cur_threads++;
    new_threads++;

    /* If there are threads being created, they will spawn new workers, so
     * we don't spend time creating many threads in a loop holding a mutex or
     * starving the current vcpu.
     */
    if (pending_threads) {
        return;
    }

    /* Otherwise ask the main thread to create one, so we inherit the
     * correct affinity instead of the vcpu affinity.
     */
    qemu_bh_schedule(new_thread_bh);
}

436
static void qemu_paio_submit(struct qemu_paiocb *aiocb)
437 438 439
{
    aiocb->ret = -EINPROGRESS;
    aiocb->active = 0;
M
malc 已提交
440
    mutex_lock(&lock);
441 442
    if (idle_threads == 0 && cur_threads < max_threads)
        spawn_thread();
B
Blue Swirl 已提交
443
    QTAILQ_INSERT_TAIL(&request_list, aiocb, node);
M
malc 已提交
444
    mutex_unlock(&lock);
M
malc 已提交
445
    cond_signal(&cond);
446 447
}

448
static ssize_t qemu_paio_return(struct qemu_paiocb *aiocb)
449 450 451
{
    ssize_t ret;

M
malc 已提交
452
    mutex_lock(&lock);
453
    ret = aiocb->ret;
M
malc 已提交
454
    mutex_unlock(&lock);
455 456 457 458

    return ret;
}

459
/*
 * Return the request's error status as a positive errno value
 * (e.g. EINPROGRESS, ECANCELED), or 0 if its result is not negative.
 */
static int qemu_paio_error(struct qemu_paiocb *aiocb)
{
    ssize_t ret = qemu_paio_return(aiocb);

    return ret < 0 ? -ret : 0;
}

471
static void posix_aio_read(void *opaque)
472
{
473 474
    PosixAioState *s = opaque;
    struct qemu_paiocb *acb, **pacb;
475
    int ret;
476 477 478 479 480 481 482 483 484 485 486 487 488
    ssize_t len;

    /* read all bytes from signal pipe */
    for (;;) {
        char bytes[16];

        len = read(s->rfd, bytes, sizeof(bytes));
        if (len == -1 && errno == EINTR)
            continue; /* try again */
        if (len == sizeof(bytes))
            continue; /* more to read */
        break;
    }
489 490 491 492 493 494

    for(;;) {
        pacb = &s->first_aio;
        for(;;) {
            acb = *pacb;
            if (!acb)
495
                return;
K
Kevin Wolf 已提交
496

497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512
            ret = qemu_paio_error(acb);
            if (ret == ECANCELED) {
                /* remove the request */
                *pacb = acb->next;
                qemu_aio_release(acb);
            } else if (ret != EINPROGRESS) {
                /* end of aio */
                if (ret == 0) {
                    ret = qemu_paio_return(acb);
                    if (ret == acb->aio_nbytes)
                        ret = 0;
                    else
                        ret = -EINVAL;
                } else {
                    ret = -ret;
                }
513 514 515

                trace_paio_complete(acb, acb->common.opaque, ret);

516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536
                /* remove the request */
                *pacb = acb->next;
                /* call the callback */
                acb->common.cb(acb->common.opaque, ret);
                qemu_aio_release(acb);
                break;
            } else {
                pacb = &acb->next;
            }
        }
    }
}

/* Flush callback for the fd handler: returns 1 while any request is still
 * on the outstanding list, 0 otherwise. */
static int posix_aio_flush(void *opaque)
{
    const PosixAioState *state = opaque;

    return state->first_aio != NULL;
}

static PosixAioState *posix_aio_state;

F
Frediano Ziglio 已提交
537
static void posix_aio_notify_event(void)
538
{
F
Frediano Ziglio 已提交
539 540
    char byte = 0;
    ssize_t ret;
541

F
Frediano Ziglio 已提交
542 543 544
    ret = write(posix_aio_state->wfd, &byte, sizeof(byte));
    if (ret < 0 && errno != EAGAIN)
        die("write()");
545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569
}

/*
 * Unlink acb from the outstanding request list and release it.
 * If acb is not on the list a diagnostic is printed and nothing is
 * released.
 */
static void paio_remove(struct qemu_paiocb *acb)
{
    struct qemu_paiocb **link;

    /* remove the callback from the queue */
    for (link = &posix_aio_state->first_aio; *link != NULL;
         link = &(*link)->next) {
        if (*link == acb) {
            *link = acb->next;
            qemu_aio_release(acb);
            return;
        }
    }

    fprintf(stderr, "paio_remove: aio request not found!\n");
}

/*
 * Cancel callback of raw_aio_pool.
 *
 * A request no worker has picked up yet is taken off the pending queue and
 * marked -ECANCELED.  A request that is already being processed cannot be
 * aborted, so this function polls until the worker has stored a result.
 * Either way the request is then unlinked from the outstanding list and
 * released; its completion callback is not invoked here.
 */
static void paio_cancel(BlockDriverAIOCB *blockacb)
{
    struct qemu_paiocb *acb = (struct qemu_paiocb *)blockacb;
    int must_wait = 0;

    trace_paio_cancel(acb, acb->common.opaque);

    mutex_lock(&lock);
    if (acb->active) {
        if (acb->ret == -EINPROGRESS) {
            must_wait = 1;
        }
    } else {
        QTAILQ_REMOVE(&request_list, acb, node);
        acb->ret = -ECANCELED;
    }
    mutex_unlock(&lock);

    if (must_wait) {
        /* fail safe: if the aio could not be canceled, we wait for
           it */
        while (qemu_paio_error(acb) == EINPROGRESS) {
            /* busy-wait; qemu_paio_error() takes the lock on each poll */
        }
    }

    paio_remove(acb);
}

/*
 * AIOCB pool used by paio_submit() and paio_ioctl(): control blocks are
 * sized for struct qemu_paiocb and cancelled through paio_cancel().
 */
static AIOPool raw_aio_pool = {
    .aiocb_size         = sizeof(struct qemu_paiocb),
    .cancel             = paio_cancel,
};

K
Kevin Wolf 已提交
597
/*
 * Submit an asynchronous request of the given QEMU_AIO_* type on fd.
 *
 * sector_num and nb_sectors are in units of 512 bytes.  qiov may be NULL
 * for requests that carry no data.  cb(opaque, ret) is invoked from
 * posix_aio_read() once the request has completed.
 *
 * Returns the generic AIOCB embedded in the new request.
 */
BlockDriverAIOCB *paio_submit(BlockDriverState *bs, int fd,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque, int type)
{
    struct qemu_paiocb *acb;

    acb = qemu_aio_get(&raw_aio_pool, bs, cb, opaque);
    acb->aio_type = type;
    acb->aio_fildes = fd;

    if (qiov) {
        acb->aio_iov = qiov->iov;
        acb->aio_niov = qiov->niov;
    } else {
        /* Don't leave whatever the control block held before in place
         * (NOTE(review): qemu_aio_get may hand back recycled memory). */
        acb->aio_iov = NULL;
        acb->aio_niov = 0;
    }
    /* Widen before multiplying so large requests cannot overflow int. */
    acb->aio_nbytes = (size_t)nb_sectors * 512;
    acb->aio_offset = sector_num * 512;

    /* Push onto the outstanding list scanned by posix_aio_read(). */
    acb->next = posix_aio_state->first_aio;
    posix_aio_state->first_aio = acb;

    trace_paio_submit(acb, opaque, sector_num, nb_sectors, type);
    qemu_paio_submit(acb);
    return &acb->common;
}

/*
 * Submit an asynchronous ioctl(fd, req, buf).
 *
 * The command is stored in aio_ioctl_cmd, which aliases aio_nbytes.
 * cb(opaque, ret) is invoked from posix_aio_read() once the request has
 * completed.  Returns the generic AIOCB embedded in the new request.
 */
BlockDriverAIOCB *paio_ioctl(BlockDriverState *bs, int fd,
        unsigned long int req, void *buf,
        BlockDriverCompletionFunc *cb, void *opaque)
{
    struct qemu_paiocb *request = qemu_aio_get(&raw_aio_pool, bs, cb, opaque);

    request->aio_type = QEMU_AIO_IOCTL;
    request->aio_fildes = fd;
    request->aio_ioctl_cmd = req;
    request->aio_ioctl_buf = buf;
    request->aio_offset = 0;

    /* Push onto the outstanding list scanned by posix_aio_read(). */
    request->next = posix_aio_state->first_aio;
    posix_aio_state->first_aio = request;

    qemu_paio_submit(request);
    return &request->common;
}

K
Kevin Wolf 已提交
642
int paio_init(void)
643 644 645 646 647 648
{
    PosixAioState *s;
    int fds[2];
    int ret;

    if (posix_aio_state)
K
Kevin Wolf 已提交
649
        return 0;
650

651
    s = g_malloc(sizeof(PosixAioState));
652 653

    s->first_aio = NULL;
K
Kevin Wolf 已提交
654
    if (qemu_pipe(fds) == -1) {
655
        fprintf(stderr, "failed to create pipe\n");
656
        g_free(s);
K
Kevin Wolf 已提交
657
        return -1;
658 659 660 661 662 663 664 665
    }

    s->rfd = fds[0];
    s->wfd = fds[1];

    fcntl(s->rfd, F_SETFL, O_NONBLOCK);
    fcntl(s->wfd, F_SETFL, O_NONBLOCK);

666
    qemu_aio_set_fd_handler(s->rfd, posix_aio_read, NULL, posix_aio_flush, s);
667 668 669 670 671 672 673 674 675

    ret = pthread_attr_init(&attr);
    if (ret)
        die2(ret, "pthread_attr_init");

    ret = pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED);
    if (ret)
        die2(ret, "pthread_attr_setdetachstate");

B
Blue Swirl 已提交
676
    QTAILQ_INIT(&request_list);
677
    new_thread_bh = qemu_bh_new(spawn_thread_bh_fn, NULL);
678 679

    posix_aio_state = s;
K
Kevin Wolf 已提交
680
    return 0;
681
}