/*
 * Inter-VM Shared Memory PCI device.
 *
 * Author:
 *      Cam Macdonell <cam@cs.ualberta.ca>
 *
 * Based On: cirrus_vga.c
 *          Copyright (c) 2004 Fabrice Bellard
 *          Copyright (c) 2004 Makoto Suzuki (suzu)
 *
 *      and rtl8139.c
 *          Copyright (c) 2006 Igor Kovalenko
 *
 * This code is licensed under the GNU GPL v2.
 *
 * Contributions after 2012-01-13 are licensed under the terms of the
 * GNU GPL, version 2 or (at your option) any later version.
 */
P
Peter Maydell 已提交
19
#include "qemu/osdep.h"
20
#include "hw/hw.h"
P
Paolo Bonzini 已提交
21
#include "hw/i386/pc.h"
22
#include "hw/pci/pci.h"
23
#include "hw/pci/msi.h"
24
#include "hw/pci/msix.h"
25
#include "sysemu/kvm.h"
26
#include "migration/migration.h"
27
#include "qemu/error-report.h"
28
#include "qemu/event_notifier.h"
29
#include "sysemu/char.h"
M
Marc-André Lureau 已提交
30 31
#include "sysemu/hostmem.h"
#include "qapi/visitor.h"
32
#include "exec/ram_addr.h"
33

34 35
#include "hw/misc/ivshmem.h"

36 37
#include <sys/mman.h>

38 39 40
#define PCI_VENDOR_ID_IVSHMEM   PCI_VENDOR_ID_REDHAT_QUMRANET
#define PCI_DEVICE_ID_IVSHMEM   0x1110

41
#define IVSHMEM_MAX_PEERS UINT16_MAX
42 43 44 45 46 47 48 49
#define IVSHMEM_IOEVENTFD   0
#define IVSHMEM_MSI     1

#define IVSHMEM_PEER    0
#define IVSHMEM_MASTER  1

#define IVSHMEM_REG_BAR_SIZE 0x100

50 51 52 53 54 55 56
#define IVSHMEM_DEBUG 0
#define IVSHMEM_DPRINTF(fmt, ...)                       \
    do {                                                \
        if (IVSHMEM_DEBUG) {                            \
            printf("IVSHMEM: " fmt, ## __VA_ARGS__);    \
        }                                               \
    } while (0)
57

58 59 60 61
#define TYPE_IVSHMEM "ivshmem"
#define IVSHMEM(obj) \
    OBJECT_CHECK(IVShmemState, (obj), TYPE_IVSHMEM)

62 63
/* Per-peer state: one eventfd per interrupt vector, signalled via the
 * doorbell register. */
typedef struct Peer {
    int nb_eventfds;            /* number of valid entries in @eventfds */
    EventNotifier *eventfds;    /* one notifier per vector */
} Peer;

/* Bookkeeping for one MSI-X vector routed through KVM. */
typedef struct MSIVector {
    PCIDevice *pdev;            /* owning device; NULL while vector unused */
    int virq;                   /* KVM irq-routing entry for this vector */
} MSIVector;

typedef struct IVShmemState {
    /*< private >*/
    PCIDevice parent_obj;
    /*< public >*/

    HostMemoryBackend *hostmem; /* optional backing memory (x-memdev) */
    uint32_t intrmask;          /* INTRMASK register */
    uint32_t intrstatus;        /* INTRSTATUS register */

    CharDriverState *server_chr; /* chardev connection to ivshmem-server */
    MemoryRegion ivshmem_mmio;  /* BAR 0: device registers */

    /* We might need to register the BAR before we actually have the memory.
     * So prepare a container MemoryRegion for the BAR immediately and
     * add a subregion when we have the memory.
     */
    MemoryRegion bar;
    MemoryRegion ivshmem;
    uint64_t ivshmem_size; /* size of shared memory region */
    uint32_t ivshmem_64bit; /* nonzero: expose BAR 2 as a 64-bit BAR */

    Peer *peers;
    int nb_peers;               /* space in @peers[] */

    int vm_id;                  /* our own peer ID, assigned by the server */
    uint32_t vectors;           /* number of interrupt vectors */
    uint32_t features;          /* IVSHMEM_IOEVENTFD / IVSHMEM_MSI bits */
    MSIVector *msi_vectors;
    uint64_t msg_buf;           /* buffer for receiving server messages */
    int msg_buffered_bytes;     /* #bytes in @msg_buf */

    Error *migration_blocker;   /* set when role is 'peer' (not migratable) */

    char * shmobj;
    char * sizearg;
    char * role;
    int role_val;   /* scalar to avoid multiple string comparisons */
} IVShmemState;

/* registers for the Inter-VM shared memory device */
enum ivshmem_registers {
    INTRMASK = 0,
    INTRSTATUS = 4,
    IVPOSITION = 8,
    DOORBELL = 12,
};

/* Return nonzero iff feature bit @feature is set in the device's
 * feature mask. */
static inline uint32_t ivshmem_has_feature(IVShmemState *ivs,
                                                    unsigned int feature) {
    uint32_t bit = 1u << feature;

    return ivs->features & bit;
}

124
/* Recompute the INTx line from the status and mask registers. */
static void ivshmem_update_irq(IVShmemState *s)
{
    PCIDevice *d = PCI_DEVICE(s);
    uint32_t pending = s->intrstatus & s->intrmask;

    /* No INTx with msi=on, whether the guest enabled MSI-X or not */
    if (ivshmem_has_feature(s, IVSHMEM_MSI)) {
        return;
    }

    /* don't print ISR resets */
    if (pending) {
        IVSHMEM_DPRINTF("Set IRQ to %d (%04x %04x)\n",
                        pending ? 1 : 0, s->intrstatus, s->intrmask);
    }

    pci_set_irq(d, pending != 0);
}

/* Guest write to the interrupt mask register (BAR 0, offset INTRMASK). */
static void ivshmem_IntrMask_write(IVShmemState *s, uint32_t val)
{
    IVSHMEM_DPRINTF("IntrMask write(w) val = 0x%04x\n", val);

    s->intrmask = val;
    /* a mask change may raise or lower the INTx line */
    ivshmem_update_irq(s);
}

/* Guest read of the interrupt mask register. */
static uint32_t ivshmem_IntrMask_read(IVShmemState *s)
{
    uint32_t ret = s->intrmask;

    IVSHMEM_DPRINTF("intrmask read(w) val = 0x%04x\n", ret);
    return ret;
}

/* Guest write to the interrupt status register (BAR 0, offset INTRSTATUS). */
static void ivshmem_IntrStatus_write(IVShmemState *s, uint32_t val)
{
    IVSHMEM_DPRINTF("IntrStatus write(w) val = 0x%04x\n", val);

    s->intrstatus = val;
    ivshmem_update_irq(s);
}

/* Guest read of the interrupt status register; read-to-clear semantics. */
static uint32_t ivshmem_IntrStatus_read(IVShmemState *s)
{
    uint32_t ret = s->intrstatus;

    /* reading ISR clears all interrupts */
    s->intrstatus = 0;
    ivshmem_update_irq(s);
    return ret;
}

A
Avi Kivity 已提交
177
/*
 * MMIO write handler for BAR 0.  A DOORBELL write encodes the target
 * peer ID in bits 31:16 and the vector number in the low byte.
 */
static void ivshmem_io_write(void *opaque, hwaddr addr,
                             uint64_t val, unsigned size)
{
    IVShmemState *s = opaque;
    uint16_t peer_id = val >> 16;
    uint16_t vec = val & 0xff;

    addr &= 0xfc;

    IVSHMEM_DPRINTF("writing to addr " TARGET_FMT_plx "\n", addr);

    switch (addr) {
    case INTRMASK:
        ivshmem_IntrMask_write(s, val);
        break;

    case INTRSTATUS:
        ivshmem_IntrStatus_write(s, val);
        break;

    case DOORBELL:
        /* check that dest VM ID is reasonable */
        if (peer_id >= s->nb_peers) {
            IVSHMEM_DPRINTF("Invalid destination VM ID (%d)\n", peer_id);
            break;
        }

        /* check doorbell range */
        if (vec < s->peers[peer_id].nb_eventfds) {
            IVSHMEM_DPRINTF("Notifying VM %d on vector %d\n", peer_id, vec);
            event_notifier_set(&s->peers[peer_id].eventfds[vec]);
        } else {
            IVSHMEM_DPRINTF("Invalid destination vector %d on VM %d\n",
                            vec, peer_id);
        }
        break;

    default:
        IVSHMEM_DPRINTF("Unhandled write " TARGET_FMT_plx "\n", addr);
    }
}

A
Avi Kivity 已提交
219
/* MMIO read handler for BAR 0: mask, status (read-to-clear) and our
 * own peer ID; all other offsets read as zero. */
static uint64_t ivshmem_io_read(void *opaque, hwaddr addr,
                                unsigned size)
{
    IVShmemState *s = opaque;

    switch (addr) {
    case INTRMASK:
        return ivshmem_IntrMask_read(s);

    case INTRSTATUS:
        return ivshmem_IntrStatus_read(s);

    case IVPOSITION:
        return s->vm_id;

    default:
        IVSHMEM_DPRINTF("why are we reading " TARGET_FMT_plx "\n", addr);
        return 0;
    }
}

A
Avi Kivity 已提交
248 249 250 251 252 253 254 255
/* Register BAR access ops; the device registers are strictly 32-bit wide. */
static const MemoryRegionOps ivshmem_mmio_ops = {
    .read = ivshmem_io_read,
    .write = ivshmem_io_write,
    .endianness = DEVICE_NATIVE_ENDIAN,
    .impl = {
        .min_access_size = 4,
        .max_access_size = 4,
    },
};

258 259
static void ivshmem_vector_notify(void *opaque)
{
260
    MSIVector *entry = opaque;
261
    PCIDevice *pdev = entry->pdev;
262
    IVShmemState *s = IVSHMEM(pdev);
263
    int vector = entry - s->msi_vectors;
264 265 266 267 268
    EventNotifier *n = &s->peers[s->vm_id].eventfds[vector];

    if (!event_notifier_test_and_clear(n)) {
        return;
    }
269

270
    IVSHMEM_DPRINTF("interrupt on vector %p %d\n", pdev, vector);
271
    if (ivshmem_has_feature(s, IVSHMEM_MSI)) {
272 273 274
        if (msix_enabled(pdev)) {
            msix_notify(pdev, vector);
        }
275 276 277
    } else {
        ivshmem_IntrStatus_write(s, 1);
    }
278 279
}

280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336
/* MSI-X unmask notifier: update the KVM MSI route with the (possibly
 * changed) message, then re-attach the vector's irqfd to that route.
 * Returns 0 on success, negative errno on failure. */
static int ivshmem_vector_unmask(PCIDevice *dev, unsigned vector,
                                 MSIMessage msg)
{
    IVShmemState *s = IVSHMEM(dev);
    EventNotifier *n = &s->peers[s->vm_id].eventfds[vector];
    MSIVector *v = &s->msi_vectors[vector];
    int ret;

    IVSHMEM_DPRINTF("vector unmask %p %d\n", dev, vector);

    /* route must be updated before the irqfd is armed again */
    ret = kvm_irqchip_update_msi_route(kvm_state, v->virq, msg, dev);
    if (ret < 0) {
        return ret;
    }

    return kvm_irqchip_add_irqfd_notifier_gsi(kvm_state, n, NULL, v->virq);
}

/* MSI-X mask notifier: detach the vector's irqfd so events accumulate
 * in the eventfd instead of being injected directly. */
static void ivshmem_vector_mask(PCIDevice *dev, unsigned vector)
{
    IVShmemState *s = IVSHMEM(dev);
    EventNotifier *n = &s->peers[s->vm_id].eventfds[vector];
    int ret;

    IVSHMEM_DPRINTF("vector mask %p %d\n", dev, vector);

    ret = kvm_irqchip_remove_irqfd_notifier_gsi(kvm_state, n,
                                                s->msi_vectors[vector].virq);
    if (ret != 0) {
        error_report("remove_irqfd_notifier_gsi failed");
    }
}

/* MSI-X poll notifier: for each masked vector in [vector_start,
 * vector_end), transfer any pending eventfd state into the MSI-X
 * pending bit so the interrupt fires on unmask. */
static void ivshmem_vector_poll(PCIDevice *dev,
                                unsigned int vector_start,
                                unsigned int vector_end)
{
    IVShmemState *s = IVSHMEM(dev);
    unsigned int vector;

    IVSHMEM_DPRINTF("vector poll %p %d-%d\n", dev, vector_start, vector_end);

    vector_end = MIN(vector_end, s->vectors);

    for (vector = vector_start; vector < vector_end; vector++) {
        EventNotifier *notifier = &s->peers[s->vm_id].eventfds[vector];

        /* unmasked vectors are delivered via irqfd, not polled */
        if (!msix_is_masked(dev, vector)) {
            continue;
        }

        if (event_notifier_test_and_clear(notifier)) {
            msix_set_pending(dev, vector);
        }
    }
}

337 338
/* Install an fd handler so that events on @n (our own vector @vector)
 * are delivered to the guest via ivshmem_vector_notify(). */
static void watch_vector_notifier(IVShmemState *s, EventNotifier *n,
                                 int vector)
{
    int fd = event_notifier_get_fd(n);

    /* the vector must not already be claimed */
    assert(!s->msi_vectors[vector].pdev);
    s->msi_vectors[vector].pdev = PCI_DEVICE(s);

    qemu_set_fd_handler(fd, ivshmem_vector_notify,
                        NULL, &s->msi_vectors[vector]);
}

M
Marc-André Lureau 已提交
349 350
/* Verify that the object behind @fd is at least as large as the memory
 * the guest will map.  Returns 0 if it fits, -1 (with *errp set) if not. */
static int check_shm_size(IVShmemState *s, int fd, Error **errp)
{
    struct stat buf;

    if (fstat(fd, &buf) < 0) {
        error_setg(errp, "exiting: fstat on fd %d failed: %s",
                   fd, strerror(errno));
        return -1;
    }

    if (s->ivshmem_size > buf.st_size) {
        error_setg(errp, "Requested memory size greater"
                   " than shared object size (%" PRIu64 " > %" PRIu64")",
                   s->ivshmem_size, (uint64_t)buf.st_size);
        return -1;
    }

    return 0;
}

/* create the shared memory BAR when we are not using the server, so we can
 * create the BAR and map the memory immediately */
M
Marc-André Lureau 已提交
374 375 376
static int create_shared_memory_BAR(IVShmemState *s, int fd, uint8_t attr,
                                    Error **errp)
{
377 378 379
    void * ptr;

    ptr = mmap(0, s->ivshmem_size, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
M
Marc-André Lureau 已提交
380 381 382 383 384
    if (ptr == MAP_FAILED) {
        error_setg_errno(errp, errno, "Failed to mmap shared memory");
        return -1;
    }

385
    memory_region_init_ram_ptr(&s->ivshmem, OBJECT(s), "ivshmem.bar2",
A
Avi Kivity 已提交
386
                               s->ivshmem_size, ptr);
F
Fam Zheng 已提交
387
    qemu_set_ram_fd(memory_region_get_ram_addr(&s->ivshmem), fd);
388
    vmstate_register_ram(&s->ivshmem, DEVICE(s));
A
Avi Kivity 已提交
389
    memory_region_add_subregion(&s->bar, 0, &s->ivshmem);
390 391

    /* region for shared memory */
392
    pci_register_bar(PCI_DEVICE(s), 2, attr, &s->bar);
M
Marc-André Lureau 已提交
393 394

    return 0;
395 396
}

397 398 399 400 401 402 403
/* Wire peer @posn's vector @i eventfd to matching guest DOORBELL writes. */
static void ivshmem_add_eventfd(IVShmemState *s, int posn, int i)
{
    memory_region_add_eventfd(&s->ivshmem_mmio, DOORBELL, 4, true,
                              (posn << 16) | i,
                              &s->peers[posn].eventfds[i]);
}

/* Undo ivshmem_add_eventfd() for peer @posn, vector @i. */
static void ivshmem_del_eventfd(IVShmemState *s, int posn, int i)
{
    memory_region_del_eventfd(&s->ivshmem_mmio, DOORBELL, 4, true,
                              (posn << 16) | i,
                              &s->peers[posn].eventfds[i]);
}

417
/* Release all eventfds of peer @posn and detach them from the doorbell. */
static void close_peer_eventfds(IVShmemState *s, int posn)
{
    Peer *peer;
    int i, nfds;

    assert(posn >= 0 && posn < s->nb_peers);
    peer = &s->peers[posn];
    nfds = peer->nb_eventfds;

    if (ivshmem_has_feature(s, IVSHMEM_IOEVENTFD)) {
        /* batch the removals into a single memory-region transaction */
        memory_region_transaction_begin();
        for (i = 0; i < nfds; i++) {
            ivshmem_del_eventfd(s, posn, i);
        }
        memory_region_transaction_commit();
    }

    for (i = 0; i < nfds; i++) {
        event_notifier_cleanup(&peer->eventfds[i]);
    }

    g_free(peer->eventfds);
    peer->nb_eventfds = 0;
}

440
/* Grow the peer table to @nb_peers slots; new slots start empty. */
static void resize_peers(IVShmemState *s, int nb_peers)
{
    int old = s->nb_peers;
    int i;

    /* the table only ever grows */
    assert(nb_peers > old);
    IVSHMEM_DPRINTF("bumping storage to %d peers\n", nb_peers);

    s->peers = g_realloc(s->peers, nb_peers * sizeof(Peer));
    s->nb_peers = nb_peers;

    for (i = old; i < nb_peers; i++) {
        s->peers[i].eventfds = g_new0(EventNotifier, s->vectors);
        s->peers[i].nb_eventfds = 0;
    }
}

457 458
/* Allocate a KVM MSI route for @vector and record it in msi_vectors[]. */
static void ivshmem_add_kvm_msi_virq(IVShmemState *s, int vector,
                                     Error **errp)
{
    PCIDevice *pdev = PCI_DEVICE(s);
    MSIMessage msg = msix_get_message(pdev, vector);
    int virq;

    IVSHMEM_DPRINTF("ivshmem_add_kvm_msi_virq vector:%d\n", vector);
    /* the vector must not already have a route */
    assert(!s->msi_vectors[vector].pdev);

    virq = kvm_irqchip_add_msi_route(kvm_state, msg, pdev);
    if (virq < 0) {
        error_setg(errp, "kvm_irqchip_add_msi_route failed");
        return;
    }

    s->msi_vectors[vector].virq = virq;
    s->msi_vectors[vector].pdev = pdev;
}

477
/* Hook up interrupt delivery for our own @vector: plain fd handler when
 * irqfd is unavailable, KVM irqfd when MSI-X is already enabled, or
 * nothing yet (deferred to write_config) otherwise. */
static void setup_interrupt(IVShmemState *s, int vector, Error **errp)
{
    EventNotifier *n = &s->peers[s->vm_id].eventfds[vector];
    bool with_irqfd = kvm_msi_via_irqfd_enabled() &&
        ivshmem_has_feature(s, IVSHMEM_MSI);
    PCIDevice *pdev = PCI_DEVICE(s);
    Error *err = NULL;

    IVSHMEM_DPRINTF("setting up interrupt for vector: %d\n", vector);

    if (!with_irqfd) {
        IVSHMEM_DPRINTF("with eventfd\n");
        watch_vector_notifier(s, n, vector);
        return;
    }

    if (!msix_enabled(pdev)) {
        /* it will be delayed until msix is enabled, in write_config */
        IVSHMEM_DPRINTF("with irqfd, delayed until msix enabled\n");
        return;
    }

    IVSHMEM_DPRINTF("with irqfd\n");
    ivshmem_add_kvm_msi_virq(s, vector, &err);
    if (err) {
        error_propagate(errp, err);
        return;
    }

    if (!msix_is_masked(pdev, vector)) {
        kvm_irqchip_add_irqfd_notifier_gsi(kvm_state, n, NULL,
                                           s->msi_vectors[vector].virq);
        /* TODO handle error */
    }
}

509
/* Handle the server's shared-memory message: validate @fd, mmap it and
 * publish it as the BAR 2 subregion.  @fd is closed on every error path. */
static void process_msg_shmem(IVShmemState *s, int fd, Error **errp)
{
    Error *local_err = NULL;
    void *map;

    /* the server may send the shared memory only once */
    if (memory_region_is_mapped(&s->ivshmem)) {
        error_setg(errp, "server sent unexpected shared memory message");
        close(fd);
        return;
    }

    if (check_shm_size(s, fd, &local_err) == -1) {
        error_propagate(errp, local_err);
        close(fd);
        return;
    }

    /* mmap the region and map into the BAR2 */
    map = mmap(0, s->ivshmem_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
    if (map == MAP_FAILED) {
        error_setg_errno(errp, errno, "Failed to mmap shared memory");
        close(fd);
        return;
    }

    memory_region_init_ram_ptr(&s->ivshmem, OBJECT(s),
                               "ivshmem.bar2", s->ivshmem_size, map);
    qemu_set_ram_fd(memory_region_get_ram_addr(&s->ivshmem), fd);
    vmstate_register_ram(&s->ivshmem, DEVICE(s));
    memory_region_add_subregion(&s->bar, 0, &s->ivshmem);
}

540 541
/* Handle a peer-disconnect message: tear down that peer's eventfds. */
static void process_msg_disconnect(IVShmemState *s, uint16_t posn,
                                   Error **errp)
{
    IVSHMEM_DPRINTF("posn %d has gone away\n", posn);

    /* a peer cannot disconnect itself, and must be a known slot */
    if (posn == s->vm_id || posn >= s->nb_peers) {
        error_setg(errp, "invalid peer %d", posn);
        return;
    }

    close_peer_eventfds(s, posn);
}
550

551 552
/* Handle a connect message for peer @posn carrying eventfd @fd.
 * Vector numbers are implicit in message order (see comment below);
 * @fd is closed if the peer already has all its vectors. */
static void process_msg_connect(IVShmemState *s, uint16_t posn, int fd,
                                Error **errp)
{
    Peer *peer = &s->peers[posn];
    int vector;

    /*
     * The N-th connect message for this peer comes with the file
     * descriptor for vector N-1.  Count messages to find the vector.
     */
    if (peer->nb_eventfds >= s->vectors) {
        error_setg(errp, "Too many eventfd received, device has %d vectors",
                   s->vectors);
        close(fd);
        return;
    }
    vector = peer->nb_eventfds++;

    IVSHMEM_DPRINTF("eventfds[%d][%d] = %d\n", posn, vector, fd);
    event_notifier_init_fd(&peer->eventfds[vector], fd);
    fcntl_setfl(fd, O_NONBLOCK); /* msix/irqfd poll non block */

    /* an eventfd for ourselves doubles as our interrupt source */
    if (posn == s->vm_id) {
        setup_interrupt(s, vector, errp);
        /* TODO do we need to handle the error? */
    }

    if (ivshmem_has_feature(s, IVSHMEM_IOEVENTFD)) {
        ivshmem_add_eventfd(s, posn, vector);
    }
}
582

583
/* Dispatch one server message: -1 announces the shared memory fd, any
 * other value is a peer ID connecting (fd >= 0) or disconnecting. */
static void process_msg(IVShmemState *s, int64_t msg, int fd, Error **errp)
{
    IVSHMEM_DPRINTF("posn is %" PRId64 ", fd is %d\n", msg, fd);

    if (msg > IVSHMEM_MAX_PEERS || msg < -1) {
        error_setg(errp, "server sent invalid message %" PRId64, msg);
        close(fd);
        return;
    }

    if (msg == -1) {
        process_msg_shmem(s, fd, errp);
        return;
    }

    /* grow the peer table on demand */
    if (msg >= s->nb_peers) {
        resize_peers(s, msg + 1);
    }

    if (fd < 0) {
        process_msg_disconnect(s, msg, errp);
    } else {
        process_msg_connect(s, msg, fd, errp);
    }
}
608

609 610 611 612 613 614 615 616
/* Chardev backend callback: how many bytes we can accept — whatever is
 * still missing from the current 8-byte message. */
static int ivshmem_can_receive(void *opaque)
{
    IVShmemState *s = opaque;
    size_t missing = sizeof(s->msg_buf) - s->msg_buffered_bytes;

    assert(s->msg_buffered_bytes < sizeof(s->msg_buf));
    return missing;
}

617 618 619
static void ivshmem_read(void *opaque, const uint8_t *buf, int size)
{
    IVShmemState *s = opaque;
620
    Error *err = NULL;
621 622 623
    int fd;
    int64_t msg;

624 625 626 627
    assert(size >= 0 && s->msg_buffered_bytes + size <= sizeof(s->msg_buf));
    memcpy((unsigned char *)&s->msg_buf + s->msg_buffered_bytes, buf, size);
    s->msg_buffered_bytes += size;
    if (s->msg_buffered_bytes < sizeof(s->msg_buf)) {
628
        return;
629
    }
630 631
    msg = le64_to_cpu(s->msg_buf);
    s->msg_buffered_bytes = 0;
632 633 634 635

    fd = qemu_chr_fe_get_msgfd(s->server_chr);
    IVSHMEM_DPRINTF("posn is %" PRId64 ", fd is %d\n", msg, fd);

636 637 638 639
    process_msg(s, msg, fd, &err);
    if (err) {
        error_report_err(err);
    }
640 641
}

642
/*
 * Synchronously read one 8-byte message from the server.  On success,
 * store the accompanying file descriptor (or -1 if none) in *pfd and
 * return the message.  On read failure, set *errp and return INT64_MIN
 * (which is never a valid message).
 */
static int64_t ivshmem_recv_msg(IVShmemState *s, int *pfd, Error **errp)
{
    int64_t msg;
    int n, ret;

    n = 0;
    do {
        ret = qemu_chr_fe_read_all(s->server_chr, (uint8_t *)&msg + n,
                                 sizeof(msg) - n);
        if (ret < 0) {
            if (ret == -EINTR) {
                /* Interrupted; just retry.  Previously -EINTR fell
                 * through to "n += ret", corrupting the byte count. */
                continue;
            }
            error_setg_errno(errp, -ret, "read from server failed");
            return INT64_MIN;
        }
        n += ret;
    } while (n < sizeof(msg));

    *pfd = qemu_chr_fe_get_msgfd(s->server_chr);
    return msg;
}
661

662
/* Perform the synchronous initial handshake with ivshmem-server:
 * protocol version, our own peer ID, then messages up to and including
 * the shared memory fd.  On return without error, the shared memory is
 * guaranteed to be mapped. */
static void ivshmem_recv_setup(IVShmemState *s, Error **errp)
{
    Error *err = NULL;
    int64_t msg;
    int fd;

    /* first message: protocol version, no fd attached */
    msg = ivshmem_recv_msg(s, &fd, &err);
    if (err) {
        error_propagate(errp, err);
        return;
    }
    if (msg != IVSHMEM_PROTOCOL_VERSION) {
        error_setg(errp, "server sent version %" PRId64 ", expecting %d",
                   msg, IVSHMEM_PROTOCOL_VERSION);
        return;
    }
    if (fd != -1) {
        error_setg(errp, "server sent invalid version message");
        return;
    }

    /*
     * ivshmem-server sends the remaining initial messages in a fixed
     * order, but the device has always accepted them in any order.
     * Stay as compatible as practical, just in case people use
     * servers that behave differently.
     */

    /*
     * ivshmem_device_spec.txt has always required the ID message
     * right here, and ivshmem-server has always complied.  However,
     * older versions of the device accepted it out of order, but
     * broke when an interrupt setup message arrived before it.
     */
    msg = ivshmem_recv_msg(s, &fd, &err);
    if (err) {
        error_propagate(errp, err);
        return;
    }
    if (fd != -1 || msg < 0 || msg > IVSHMEM_MAX_PEERS) {
        error_setg(errp, "server sent invalid ID message");
        return;
    }
    s->vm_id = msg;

    /*
     * Receive more messages until we got shared memory.
     */
    do {
        msg = ivshmem_recv_msg(s, &fd, &err);
        if (err) {
            error_propagate(errp, err);
            return;
        }
        process_msg(s, msg, fd, &err);
        if (err) {
            error_propagate(errp, err);
            return;
        }
    } while (msg != -1);

    /*
     * This function must either map the shared memory or fail.  The
     * loop above ensures that: it terminates normally only after it
     * successfully processed the server's shared memory message.
     * Assert that actually mapped the shared memory:
     */
    assert(memory_region_is_mapped(&s->ivshmem));
}

732 733 734
/* Select the MSI-X vectors used by device.
 * ivshmem maps events to vectors statically, so
 * we just enable all vectors on init and after reset. */
735
static void ivshmem_msix_vector_use(IVShmemState *s)
736
{
737
    PCIDevice *d = PCI_DEVICE(s);
738 739 740
    int i;

    for (i = 0; i < s->vectors; i++) {
741
        msix_vector_use(d, i);
742 743 744
    }
}

745 746
/* Device reset: clear the interrupt registers and re-enable all
 * MSI-X vectors (they are statically mapped). */
static void ivshmem_reset(DeviceState *d)
{
    IVShmemState *s = IVSHMEM(d);

    s->intrmask = 0;
    s->intrstatus = 0;

    if (ivshmem_has_feature(s, IVSHMEM_MSI)) {
        ivshmem_msix_vector_use(s);
    }
}

756
/* Allocate per-vector callback state and, with msi=on, initialize the
 * MSI-X BAR.  Returns 0 on success, -1 if MSI-X init fails. */
static int ivshmem_setup_interrupts(IVShmemState *s)
{
    /* allocate QEMU callback data for receiving interrupts */
    s->msi_vectors = g_malloc0(s->vectors * sizeof(MSIVector));

    if (!ivshmem_has_feature(s, IVSHMEM_MSI)) {
        return 0;
    }

    if (msix_init_exclusive_bar(PCI_DEVICE(s), s->vectors, 1)) {
        return -1;
    }

    IVSHMEM_DPRINTF("msix initialized (%d vectors)\n", s->vectors);
    ivshmem_msix_vector_use(s);

    return 0;
}

773 774 775 776 777 778
/* Guest enabled MSI-X: route every vector through KVM and install the
 * mask/unmask/poll notifiers. */
static void ivshmem_enable_irqfd(IVShmemState *s)
{
    PCIDevice *pdev = PCI_DEVICE(s);
    int i, nfds = s->peers[s->vm_id].nb_eventfds;

    for (i = 0; i < nfds; i++) {
        Error *err = NULL;

        ivshmem_add_kvm_msi_virq(s, i, &err);
        if (err) {
            error_report_err(err);
            /* TODO do we need to handle the error? */
        }
    }

    if (msix_set_vector_notifiers(pdev,
                                  ivshmem_vector_unmask,
                                  ivshmem_vector_mask,
                                  ivshmem_vector_poll)) {
        error_report("ivshmem: msix_set_vector_notifiers failed");
    }
}

/* Release @vector's KVM MSI route, if one was ever allocated. */
static void ivshmem_remove_kvm_msi_virq(IVShmemState *s, int vector)
{
    MSIVector *v = &s->msi_vectors[vector];

    IVSHMEM_DPRINTF("ivshmem_remove_kvm_msi_virq vector:%d\n", vector);

    if (v->pdev == NULL) {
        return;
    }

    /* it was cleaned when masked in the frontend. */
    kvm_irqchip_release_virq(kvm_state, v->virq);

    v->pdev = NULL;
}

/* Guest disabled MSI-X: tear down every vector's KVM route, then drop
 * the MSI-X notifiers. */
static void ivshmem_disable_irqfd(IVShmemState *s)
{
    PCIDevice *pdev = PCI_DEVICE(s);
    int vec;

    for (vec = 0; vec < s->peers[s->vm_id].nb_eventfds; vec++) {
        ivshmem_remove_kvm_msi_virq(s, vec);
    }

    msix_unset_vector_notifiers(pdev);
}

/* Config-space write hook: track MSI-X enable transitions so irqfd
 * routing can be switched on or off at the right moment. */
static void ivshmem_write_config(PCIDevice *pdev, uint32_t address,
                                 uint32_t val, int len)
{
    IVShmemState *s = IVSHMEM(pdev);
    int was_enabled = msix_enabled(pdev);
    int is_enabled;

    pci_default_write_config(pdev, address, val, len);
    is_enabled = msix_enabled(pdev);

    if (kvm_msi_via_irqfd_enabled() && is_enabled != was_enabled) {
        if (is_enabled) {
            ivshmem_enable_irqfd(s);
        } else {
            ivshmem_disable_irqfd(s);
        }
    }
}

M
Marc-André Lureau 已提交
840
/*
 * Device realize: validate properties, size the shared memory region,
 * set up BARs, then attach the backing store — a memory backend
 * (x-memdev), the ivshmem-server chardev, or a POSIX shm object.
 *
 * Fix: the original tested the shm_open() result with "> 0", which
 * treats fd 0 (a perfectly valid descriptor) as failure and leaks it;
 * use ">= 0" instead.
 */
static void pci_ivshmem_realize(PCIDevice *dev, Error **errp)
{
    IVShmemState *s = IVSHMEM(dev);
    Error *err = NULL;
    uint8_t *pci_conf;
    uint8_t attr = PCI_BASE_ADDRESS_SPACE_MEMORY |
        PCI_BASE_ADDRESS_MEM_PREFETCH;

    /* exactly one backing store must be configured */
    if (!!s->server_chr + !!s->shmobj + !!s->hostmem != 1) {
        error_setg(errp,
                   "You must specify either 'shm', 'chardev' or 'x-memdev'");
        return;
    }

    /* determine the shared memory size */
    if (s->hostmem) {
        MemoryRegion *mr;

        if (s->sizearg) {
            g_warning("size argument ignored with hostmem");
        }

        mr = host_memory_backend_get_memory(s->hostmem, &error_abort);
        s->ivshmem_size = memory_region_size(mr);
    } else if (s->sizearg == NULL) {
        s->ivshmem_size = 4 << 20; /* 4 MB default */
    } else {
        char *end;
        int64_t size = qemu_strtosz(s->sizearg, &end);
        /* size must parse fully and be a power of two */
        if (size < 0 || *end != '\0' || !is_power_of_2(size)) {
            error_setg(errp, "Invalid size %s", s->sizearg);
            return;
        }
        s->ivshmem_size = size;
    }

    /* IRQFD requires MSI */
    if (ivshmem_has_feature(s, IVSHMEM_IOEVENTFD) &&
        !ivshmem_has_feature(s, IVSHMEM_MSI)) {
        error_setg(errp, "ioeventfd/irqfd requires MSI");
        return;
    }

    /* check that role is reasonable */
    if (s->role) {
        if (strncmp(s->role, "peer", 5) == 0) {
            s->role_val = IVSHMEM_PEER;
        } else if (strncmp(s->role, "master", 7) == 0) {
            s->role_val = IVSHMEM_MASTER;
        } else {
            error_setg(errp, "'role' must be 'peer' or 'master'");
            return;
        }
    } else {
        s->role_val = IVSHMEM_MASTER; /* default */
    }

    pci_conf = dev->config;
    pci_conf[PCI_COMMAND] = PCI_COMMAND_IO | PCI_COMMAND_MEMORY;

    /*
     * Note: we don't use INTx with IVSHMEM_MSI at all, so this is a
     * bald-faced lie then.  But it's a backwards compatible lie.
     */
    pci_config_set_interrupt_pin(pci_conf, 1);

    memory_region_init_io(&s->ivshmem_mmio, OBJECT(s), &ivshmem_mmio_ops, s,
                          "ivshmem-mmio", IVSHMEM_REG_BAR_SIZE);

    /* region for registers*/
    pci_register_bar(dev, 0, PCI_BASE_ADDRESS_SPACE_MEMORY,
                     &s->ivshmem_mmio);

    memory_region_init(&s->bar, OBJECT(s), "ivshmem-bar2-container",
                       s->ivshmem_size);
    if (s->ivshmem_64bit) {
        attr |= PCI_BASE_ADDRESS_MEM_TYPE_64;
    }

    if (s->hostmem != NULL) {
        MemoryRegion *mr;

        IVSHMEM_DPRINTF("using hostmem\n");

        mr = host_memory_backend_get_memory(MEMORY_BACKEND(s->hostmem),
                                            &error_abort);
        vmstate_register_ram(mr, DEVICE(s));
        memory_region_add_subregion(&s->bar, 0, mr);
        pci_register_bar(PCI_DEVICE(s), 2, attr, &s->bar);
    } else if (s->server_chr != NULL) {
        IVSHMEM_DPRINTF("using shared memory server (socket = %s)\n",
                        s->server_chr->filename);

        /* we allocate enough space for 16 peers and grow as needed */
        resize_peers(s, 16);

        pci_register_bar(dev, 2, attr, &s->bar);

        /*
         * Receive setup messages from server synchronously.
         * Older versions did it asynchronously, but that creates a
         * number of entertaining race conditions.
         */
        ivshmem_recv_setup(s, &err);
        if (err) {
            error_propagate(errp, err);
            return;
        }

        qemu_chr_add_handlers(s->server_chr, ivshmem_can_receive,
                              ivshmem_read, NULL, s);

        if (ivshmem_setup_interrupts(s) < 0) {
            error_setg(errp, "failed to initialize interrupts");
            return;
        }
    } else {
        /* just map the file immediately, we're not using a server */
        int fd;

        IVSHMEM_DPRINTF("using shm_open (shm object = %s)\n", s->shmobj);

        /* try opening with O_EXCL and if it succeeds zero the memory
         * by truncating to 0 */
        fd = shm_open(s->shmobj, O_CREAT | O_RDWR | O_EXCL,
                      S_IRWXU | S_IRWXG | S_IRWXO);
        if (fd >= 0) {  /* was "> 0": fd 0 is valid and must not be leaked */
            /* truncate file to length PCI device's memory */
            if (ftruncate(fd, s->ivshmem_size) != 0) {
                error_report("could not truncate shared file");
            }
        } else {
            fd = shm_open(s->shmobj, O_CREAT | O_RDWR,
                          S_IRWXU | S_IRWXG | S_IRWXO);
            if (fd < 0) {
                error_setg(errp, "could not open shared file");
                return;
            }
        }

        if (check_shm_size(s, fd, errp) == -1) {
            return;
        }

        create_shared_memory_BAR(s, fd, attr, &err);
        if (err) {
            error_propagate(errp, err);
            return;
        }
    }

    if (s->role_val == IVSHMEM_PEER) {
        error_setg(&s->migration_blocker,
                   "Migration is disabled when using feature 'peer mode' in device 'ivshmem'");
        migrate_add_blocker(s->migration_blocker);
    }
}

/*
 * Device teardown: undo everything pci_ivshmem_realize() set up.
 * Order matters: the shared mapping is released before the memory
 * region is unregistered and removed from the BAR container.
 */
static void pci_ivshmem_exit(PCIDevice *dev)
{
    IVShmemState *s = IVSHMEM(dev);
    int i;

    /* Drop the migration blocker installed for 'peer' role devices. */
    if (s->migration_blocker) {
        migrate_del_blocker(s->migration_blocker);
        error_free(s->migration_blocker);
    }

    if (memory_region_is_mapped(&s->ivshmem)) {
        /* Only unmap/close ourselves when we own the mapping; with a
         * memdev backend (s->hostmem) the backend owns the memory. */
        if (!s->hostmem) {
            void *addr = memory_region_get_ram_ptr(&s->ivshmem);
            int fd;

            if (munmap(addr, s->ivshmem_size) == -1) {
                error_report("Failed to munmap shared memory %s",
                             strerror(errno));
            }

            /* Close the fd backing the RAM block, if there is one. */
            fd = qemu_get_ram_fd(memory_region_get_ram_addr(&s->ivshmem));
            if (fd != -1) {
                close(fd);
            }
        }

        vmstate_unregister_ram(&s->ivshmem, DEVICE(dev));
        memory_region_del_subregion(&s->bar, &s->ivshmem);
    }

    /* Release per-peer eventfd arrays and the peer table itself. */
    if (s->peers) {
        for (i = 0; i < s->nb_peers; i++) {
            close_peer_eventfds(s, i);
        }
        g_free(s->peers);
    }

    if (ivshmem_has_feature(s, IVSHMEM_MSI)) {
        msix_uninit_exclusive_bar(dev);
    }

    g_free(s->msi_vectors);
}

1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065
static bool test_msix(void *opaque, int version_id)
{
    IVShmemState *s = opaque;

    return ivshmem_has_feature(s, IVSHMEM_MSI);
}

/*
 * Complement of test_msix(): selects the legacy intrstatus/intrmask
 * fields for devices that do not use MSI-X.
 */
static bool test_no_msix(void *opaque, int version_id)
{
    bool msix_in_use = test_msix(opaque, version_id);

    return !msix_in_use;
}

/*
 * Incoming-migration gate: refuse to load state into a 'peer' role
 * device, since peer devices are declared non-migratable.
 */
static int ivshmem_pre_load(void *opaque)
{
    IVShmemState *s = opaque;

    if (s->role_val != IVSHMEM_PEER) {
        return 0;
    }

    error_report("'peer' devices are not migratable");
    return -EINVAL;
}

/*
 * After loading state, re-attach the MSI-X vectors (if MSI-X is in
 * use) so interrupts fire on the destination.
 */
static int ivshmem_post_load(void *opaque, int version_id)
{
    IVShmemState *s = opaque;

    if (!ivshmem_has_feature(s, IVSHMEM_MSI)) {
        return 0;
    }

    ivshmem_msix_vector_use(s);
    return 0;
}

/*
 * Loader for the legacy (version 0) migration stream, reached via
 * ivshmem_vmsd.load_state_old.  The read order here defines the old
 * wire format and must not change: PCI config, then either MSI-X
 * state or the raw intrstatus/intrmask registers.
 */
static int ivshmem_load_old(QEMUFile *f, void *opaque, int version_id)
{
    IVShmemState *s = opaque;
    PCIDevice *pdev = PCI_DEVICE(s);
    int ret;

    IVSHMEM_DPRINTF("ivshmem_load_old\n");

    /* Only stream version 0 is handled by this legacy path. */
    if (version_id != 0) {
        return -EINVAL;
    }

    /* Mirrors ivshmem_pre_load(): peer devices cannot migrate. */
    if (s->role_val == IVSHMEM_PEER) {
        error_report("'peer' devices are not migratable");
        return -EINVAL;
    }

    ret = pci_device_load(pdev, f);
    if (ret) {
        return ret;
    }

    if (ivshmem_has_feature(s, IVSHMEM_MSI)) {
        msix_load(pdev, f);
        /* Re-attach vectors, as ivshmem_post_load() does for v1. */
        ivshmem_msix_vector_use(s);
    } else {
        /* Non-MSI devices carry the two interrupt registers inline. */
        s->intrstatus = qemu_get_be32(f);
        s->intrmask = qemu_get_be32(f);
    }

    return 0;
}

/*
 * Migration description (current version 1).  MSI-X state and the
 * intrstatus/intrmask pair are mutually exclusive, selected by the
 * test_msix/test_no_msix predicates.  Version 0 streams are handled
 * by the legacy ivshmem_load_old() hook.
 */
static const VMStateDescription ivshmem_vmsd = {
    .name = "ivshmem",
    .version_id = 1,
    .minimum_version_id = 1,
    .pre_load = ivshmem_pre_load,
    .post_load = ivshmem_post_load,
    .fields = (VMStateField[]) {
        VMSTATE_PCI_DEVICE(parent_obj, IVShmemState),

        VMSTATE_MSIX_TEST(parent_obj, IVShmemState, test_msix),
        VMSTATE_UINT32_TEST(intrstatus, IVShmemState, test_no_msix),
        VMSTATE_UINT32_TEST(intrmask, IVShmemState, test_no_msix),

        VMSTATE_END_OF_LIST()
    },
    .load_state_old = ivshmem_load_old,
    .minimum_version_id_old = 0
};

/*
 * User-visible qdev properties.  "chardev" (server connection) and
 * "shm" (direct shm_open object) select the two setup paths used in
 * pci_ivshmem_realize(); "msi" defaults to on, "ioeventfd" to off.
 */
static Property ivshmem_properties[] = {
    DEFINE_PROP_CHR("chardev", IVShmemState, server_chr),
    DEFINE_PROP_STRING("size", IVShmemState, sizearg),
    DEFINE_PROP_UINT32("vectors", IVShmemState, vectors, 1),
    DEFINE_PROP_BIT("ioeventfd", IVShmemState, features, IVSHMEM_IOEVENTFD, false),
    DEFINE_PROP_BIT("msi", IVShmemState, features, IVSHMEM_MSI, true),
    DEFINE_PROP_STRING("shm", IVShmemState, shmobj),
    DEFINE_PROP_STRING("role", IVShmemState, role),
    /* When non-zero the shared-memory BAR is exposed as 64-bit. */
    DEFINE_PROP_UINT32("use64", IVShmemState, ivshmem_64bit, 1),
    DEFINE_PROP_END_OF_LIST(),
};

/* QOM class initializer: wire up PCI callbacks, IDs and qdev hooks. */
static void ivshmem_class_init(ObjectClass *klass, void *data)
{
    DeviceClass *dc = DEVICE_CLASS(klass);
    PCIDeviceClass *k = PCI_DEVICE_CLASS(klass);

    k->realize = pci_ivshmem_realize;
    k->exit = pci_ivshmem_exit;
    k->config_write = ivshmem_write_config;
    k->vendor_id = PCI_VENDOR_ID_IVSHMEM;
    k->device_id = PCI_DEVICE_ID_IVSHMEM;
    k->class_id = PCI_CLASS_MEMORY_RAM;
    dc->reset = ivshmem_reset;
    dc->props = ivshmem_properties;
    dc->vmsd = &ivshmem_vmsd;
    set_bit(DEVICE_CATEGORY_MISC, dc->categories);
    dc->desc = "Inter-VM shared memory";
}

/*
 * Link-property check callback for "x-memdev": reject a memory
 * backend whose region is already mapped elsewhere, otherwise fall
 * through to the standard before-realize link check.
 */
static void ivshmem_check_memdev_is_busy(Object *obj, const char *name,
                                         Object *val, Error **errp)
{
    MemoryRegion *mr;

    mr = host_memory_backend_get_memory(MEMORY_BACKEND(val), &error_abort);
    if (memory_region_is_mapped(mr)) {
        char *path = object_get_canonical_path_component(val);
        error_setg(errp, "can't use already busy memdev: %s", path);
        g_free(path);
    } else {
        qdev_prop_allow_set_link_before_realize(obj, name, val, errp);
    }
}

/*
 * Instance initializer: expose the optional "x-memdev" link so a
 * host memory backend can supply the shared region (checked by
 * ivshmem_check_memdev_is_busy before the link is set).
 */
static void ivshmem_init(Object *obj)
{
    IVShmemState *s = IVSHMEM(obj);

    object_property_add_link(obj, "x-memdev", TYPE_MEMORY_BACKEND,
                             (Object **)&s->hostmem,
                             ivshmem_check_memdev_is_busy,
                             OBJ_PROP_LINK_UNREF_ON_RELEASE,
                             &error_abort);
}

/* QOM type registration record for the "ivshmem" PCI device. */
static const TypeInfo ivshmem_info = {
    .name          = TYPE_IVSHMEM,
    .parent        = TYPE_PCI_DEVICE,
    .instance_size = sizeof(IVShmemState),
    .instance_init = ivshmem_init,
    .class_init    = ivshmem_class_init,
};

/* Register the ivshmem type with QOM at module-init time. */
static void ivshmem_register_types(void)
{
    type_register_static(&ivshmem_info);
}

type_init(ivshmem_register_types)