/*
 * Inter-VM Shared Memory PCI device.
 *
 * Author:
 *      Cam Macdonell <cam@cs.ualberta.ca>
 *
 * Based On: cirrus_vga.c
 *          Copyright (c) 2004 Fabrice Bellard
 *          Copyright (c) 2004 Makoto Suzuki (suzu)
 *
 *      and rtl8139.c
 *          Copyright (c) 2006 Igor Kovalenko
 *
 * This code is licensed under the GNU GPL v2.
 *
 * Contributions after 2012-01-13 are licensed under the terms of the
 * GNU GPL, version 2 or (at your option) any later version.
 */
#include "qemu/osdep.h"
#include "hw/hw.h"
#include "hw/i386/pc.h"
#include "hw/pci/pci.h"
#include "hw/pci/msi.h"
#include "hw/pci/msix.h"
#include "sysemu/kvm.h"
#include "migration/migration.h"
#include "qemu/error-report.h"
#include "qemu/event_notifier.h"
#include "qemu/fifo8.h"
#include "sysemu/char.h"
#include "sysemu/hostmem.h"
#include "qapi/visitor.h"
#include "exec/ram_addr.h"

#include "hw/misc/ivshmem.h"

#include <sys/mman.h>

#define PCI_VENDOR_ID_IVSHMEM   PCI_VENDOR_ID_REDHAT_QUMRANET
#define PCI_DEVICE_ID_IVSHMEM   0x1110

#define IVSHMEM_MAX_PEERS G_MAXUINT16
#define IVSHMEM_IOEVENTFD   0
#define IVSHMEM_MSI     1

#define IVSHMEM_PEER    0
#define IVSHMEM_MASTER  1

#define IVSHMEM_REG_BAR_SIZE 0x100

#define IVSHMEM_DEBUG 0
#define IVSHMEM_DPRINTF(fmt, ...)                       \
    do {                                                \
        if (IVSHMEM_DEBUG) {                            \
            printf("IVSHMEM: " fmt, ## __VA_ARGS__);    \
        }                                               \
    } while (0)

#define TYPE_IVSHMEM "ivshmem"
#define IVSHMEM(obj) \
    OBJECT_CHECK(IVShmemState, (obj), TYPE_IVSHMEM)

typedef struct Peer {
    int nb_eventfds;
    EventNotifier *eventfds;
} Peer;

typedef struct MSIVector {
    PCIDevice *pdev;
    int virq;
} MSIVector;

typedef struct IVShmemState {
    /*< private >*/
    PCIDevice parent_obj;
    /*< public >*/

    HostMemoryBackend *hostmem;
    uint32_t intrmask;
    uint32_t intrstatus;

    CharDriverState *server_chr;
    Fifo8 incoming_fifo;
    MemoryRegion ivshmem_mmio;

    /* We might need to register the BAR before we actually have the memory.
     * So prepare a container MemoryRegion for the BAR immediately and
     * add a subregion when we have the memory.
     */
    MemoryRegion bar;
    MemoryRegion ivshmem;
    uint64_t ivshmem_size; /* size of shared memory region */
    uint32_t ivshmem_64bit;

    Peer *peers;
    int nb_peers; /* how many peers we have space for */

    int vm_id;
    uint32_t vectors;
    uint32_t features;
    MSIVector *msi_vectors;

    Error *migration_blocker;

    char * shmobj;
    char * sizearg;
    char * role;
    int role_val;   /* scalar to avoid multiple string comparisons */
} IVShmemState;

/* registers for the Inter-VM shared memory device */
enum ivshmem_registers {
    INTRMASK = 0,
    INTRSTATUS = 4,
    IVPOSITION = 8,
    DOORBELL = 12,
};

static inline uint32_t ivshmem_has_feature(IVShmemState *ivs,
                                                    unsigned int feature) {
    return (ivs->features & (1 << feature));
}

/* accessing registers - based on rtl8139 */
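/* drive the PCI interrupt line from the masked interrupt status */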
static void ivshmem_update_irq(IVShmemState *s)
{
    PCIDevice *d = PCI_DEVICE(s);
    int isr;
    isr = (s->intrstatus & s->intrmask) & 0xffffffff;

    /* don't print ISR resets */
    if (isr) {
        IVSHMEM_DPRINTF("Set IRQ to %d (%04x %04x)\n",
                        isr ? 1 : 0, s->intrstatus, s->intrmask);
    }

    pci_set_irq(d, (isr != 0));
}

static void ivshmem_IntrMask_write(IVShmemState *s, uint32_t val)
{
    IVSHMEM_DPRINTF("IntrMask write(w) val = 0x%04x\n", val);

    s->intrmask = val;

    ivshmem_update_irq(s);
}

static uint32_t ivshmem_IntrMask_read(IVShmemState *s)
{
    uint32_t ret = s->intrmask;

    IVSHMEM_DPRINTF("intrmask read(w) val = 0x%04x\n", ret);

    return ret;
}

static void ivshmem_IntrStatus_write(IVShmemState *s, uint32_t val)
{
    IVSHMEM_DPRINTF("IntrStatus write(w) val = 0x%04x\n", val);

    s->intrstatus = val;

    ivshmem_update_irq(s);
}

static uint32_t ivshmem_IntrStatus_read(IVShmemState *s)
{
    uint32_t ret = s->intrstatus;

    /* reading ISR clears all interrupts */
    s->intrstatus = 0;

    ivshmem_update_irq(s);

    return ret;
}

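/* handle writes to the register BAR; a DOORBELL write carries the
 * destination peer ID in bits 31:16 and the vector number in the low byte */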
static void ivshmem_io_write(void *opaque, hwaddr addr,
                             uint64_t val, unsigned size)
{
    IVShmemState *s = opaque;

    uint16_t dest = val >> 16;
    uint16_t vector = val & 0xff;

    addr &= 0xfc;

    IVSHMEM_DPRINTF("writing to addr " TARGET_FMT_plx "\n", addr);
    switch (addr)
    {
        case INTRMASK:
            ivshmem_IntrMask_write(s, val);
            break;

        case INTRSTATUS:
            ivshmem_IntrStatus_write(s, val);
            break;

        case DOORBELL:
            /* check that dest VM ID is reasonable */
            if (dest >= s->nb_peers) {
                IVSHMEM_DPRINTF("Invalid destination VM ID (%d)\n", dest);
                break;
            }

            /* check doorbell range */
            if (vector < s->peers[dest].nb_eventfds) {
                IVSHMEM_DPRINTF("Notifying VM %d on vector %d\n", dest, vector);
                event_notifier_set(&s->peers[dest].eventfds[vector]);
            } else {
                IVSHMEM_DPRINTF("Invalid destination vector %d on VM %d\n",
                                vector, dest);
            }
            break;
        default:
            IVSHMEM_DPRINTF("Unhandled write " TARGET_FMT_plx "\n", addr);
    }
}

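/* handle reads from the register BAR; reading INTRSTATUS clears it, and
 * IVPOSITION returns our peer ID once the shared memory is mapped */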
static uint64_t ivshmem_io_read(void *opaque, hwaddr addr,
                                unsigned size)
{

    IVShmemState *s = opaque;
    uint32_t ret;

    switch (addr)
    {
        case INTRMASK:
            ret = ivshmem_IntrMask_read(s);
            break;

        case INTRSTATUS:
            ret = ivshmem_IntrStatus_read(s);
            break;

        case IVPOSITION:
            /* return my VM ID if the memory is mapped */
            if (memory_region_is_mapped(&s->ivshmem)) {
                ret = s->vm_id;
            } else {
                ret = -1;
            }
            break;

        default:
            IVSHMEM_DPRINTF("why are we reading " TARGET_FMT_plx "\n", addr);
            ret = 0;
    }

    return ret;
}

static const MemoryRegionOps ivshmem_mmio_ops = {
    .read = ivshmem_io_read,
    .write = ivshmem_io_write,
    .endianness = DEVICE_NATIVE_ENDIAN,
    .impl = {
        .min_access_size = 4,
        .max_access_size = 4,
    },
263 264 265 266
};

static int ivshmem_can_receive(void * opaque)
{
    return sizeof(int64_t);
}

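/* called from the main loop when a peer eventfd fires without irqfd: clear
 * the event and deliver it via MSI-X or the interrupt status register */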
static void ivshmem_vector_notify(void *opaque)
{
    MSIVector *entry = opaque;
    PCIDevice *pdev = entry->pdev;
    IVShmemState *s = IVSHMEM(pdev);
    int vector = entry - s->msi_vectors;
    EventNotifier *n = &s->peers[s->vm_id].eventfds[vector];

    if (!event_notifier_test_and_clear(n)) {
        return;
    }

    IVSHMEM_DPRINTF("interrupt on vector %p %d\n", pdev, vector);
    if (ivshmem_has_feature(s, IVSHMEM_MSI)) {
        msix_notify(pdev, vector);
    } else {
        ivshmem_IntrStatus_write(s, 1);
    }
}

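/* MSI-X mask/unmask notifiers for the irqfd case: unmasking attaches the
 * vector's eventfd to its KVM irq route, masking detaches it, and the poll
 * callback catches events that arrived while the vector was masked */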
static int ivshmem_vector_unmask(PCIDevice *dev, unsigned vector,
                                 MSIMessage msg)
{
    IVShmemState *s = IVSHMEM(dev);
    EventNotifier *n = &s->peers[s->vm_id].eventfds[vector];
    MSIVector *v = &s->msi_vectors[vector];
    int ret;

    IVSHMEM_DPRINTF("vector unmask %p %d\n", dev, vector);

    ret = kvm_irqchip_update_msi_route(kvm_state, v->virq, msg, dev);
    if (ret < 0) {
        return ret;
    }

    return kvm_irqchip_add_irqfd_notifier_gsi(kvm_state, n, NULL, v->virq);
}

static void ivshmem_vector_mask(PCIDevice *dev, unsigned vector)
{
    IVShmemState *s = IVSHMEM(dev);
    EventNotifier *n = &s->peers[s->vm_id].eventfds[vector];
    int ret;

    IVSHMEM_DPRINTF("vector mask %p %d\n", dev, vector);

    ret = kvm_irqchip_remove_irqfd_notifier_gsi(kvm_state, n,
                                                s->msi_vectors[vector].virq);
    if (ret != 0) {
        error_report("remove_irqfd_notifier_gsi failed");
    }
}

static void ivshmem_vector_poll(PCIDevice *dev,
                                unsigned int vector_start,
                                unsigned int vector_end)
{
    IVShmemState *s = IVSHMEM(dev);
    unsigned int vector;

    IVSHMEM_DPRINTF("vector poll %p %d-%d\n", dev, vector_start, vector_end);

    vector_end = MIN(vector_end, s->vectors);

    for (vector = vector_start; vector < vector_end; vector++) {
        EventNotifier *notifier = &s->peers[s->vm_id].eventfds[vector];

        if (!msix_is_masked(dev, vector)) {
            continue;
        }

        if (event_notifier_test_and_clear(notifier)) {
            msix_set_pending(dev, vector);
        }
    }
}

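/* deliver a vector's eventfd through the main loop instead of a KVM irqfd */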
static void watch_vector_notifier(IVShmemState *s, EventNotifier *n,
                                 int vector)
{
    int eventfd = event_notifier_get_fd(n);

    /* if MSI is supported we need multiple interrupts */
    s->msi_vectors[vector].pdev = PCI_DEVICE(s);

    qemu_set_fd_handler(eventfd, ivshmem_vector_notify,
                        NULL, &s->msi_vectors[vector]);
}

static int check_shm_size(IVShmemState *s, int fd, Error **errp)
{
    /* check that the guest isn't going to try and map more memory than
     * the object has allocated; return -1 to indicate error */

    struct stat buf;

    if (fstat(fd, &buf) < 0) {
        error_setg(errp, "exiting: fstat on fd %d failed: %s",
                   fd, strerror(errno));
        return -1;
    }

    if (s->ivshmem_size > buf.st_size) {
        error_setg(errp, "Requested memory size greater"
                   " than shared object size (%" PRIu64 " > %" PRIu64")",
                   s->ivshmem_size, (uint64_t)buf.st_size);
        return -1;
    } else {
        return 0;
    }
}

/* create the shared memory BAR when we are not using the server, so we can
 * create the BAR and map the memory immediately */
static int create_shared_memory_BAR(IVShmemState *s, int fd, uint8_t attr,
                                    Error **errp)
{
    void * ptr;

    ptr = mmap(0, s->ivshmem_size, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
    if (ptr == MAP_FAILED) {
        error_setg_errno(errp, errno, "Failed to mmap shared memory");
        return -1;
    }

    memory_region_init_ram_ptr(&s->ivshmem, OBJECT(s), "ivshmem.bar2",
                               s->ivshmem_size, ptr);
    qemu_set_ram_fd(memory_region_get_ram_addr(&s->ivshmem), fd);
    vmstate_register_ram(&s->ivshmem, DEVICE(s));
    memory_region_add_subregion(&s->bar, 0, &s->ivshmem);

    /* region for shared memory */
    pci_register_bar(PCI_DEVICE(s), 2, attr, &s->bar);

    return 0;
}

static void ivshmem_add_eventfd(IVShmemState *s, int posn, int i)
{
    memory_region_add_eventfd(&s->ivshmem_mmio,
                              DOORBELL,
                              4,
                              true,
                              (posn << 16) | i,
                              &s->peers[posn].eventfds[i]);
}

static void ivshmem_del_eventfd(IVShmemState *s, int posn, int i)
{
    memory_region_del_eventfd(&s->ivshmem_mmio,
                              DOORBELL,
                              4,
                              true,
                              (posn << 16) | i,
                              &s->peers[posn].eventfds[i]);
}

static void close_peer_eventfds(IVShmemState *s, int posn)
{
    int i, n;

    if (!ivshmem_has_feature(s, IVSHMEM_IOEVENTFD)) {
        return;
    }
    if (posn < 0 || posn >= s->nb_peers) {
        error_report("invalid peer %d", posn);
        return;
    }

    n = s->peers[posn].nb_eventfds;

    memory_region_transaction_begin();
    for (i = 0; i < n; i++) {
        ivshmem_del_eventfd(s, posn, i);
    }
    memory_region_transaction_commit();
    for (i = 0; i < n; i++) {
        event_notifier_cleanup(&s->peers[posn].eventfds[i]);
    }

    g_free(s->peers[posn].eventfds);
    s->peers[posn].nb_eventfds = 0;
}

/* this function increases the dynamic storage needed to store data about
 * other peers */
static int resize_peers(IVShmemState *s, int new_min_size)
{

    int j, old_size;

    /* limit number of max peers */
    if (new_min_size <= 0 || new_min_size > IVSHMEM_MAX_PEERS) {
        return -1;
    }
    if (new_min_size <= s->nb_peers) {
        return 0;
    }

    old_size = s->nb_peers;
    s->nb_peers = new_min_size;

    IVSHMEM_DPRINTF("bumping storage to %d peers\n", s->nb_peers);

    s->peers = g_realloc(s->peers, s->nb_peers * sizeof(Peer));

    for (j = old_size; j < s->nb_peers; j++) {
        s->peers[j].eventfds = g_new0(EventNotifier, s->vectors);
        s->peers[j].nb_eventfds = 0;
    }

    return 0;
}

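/* reassemble a fixed-size message that the chardev may deliver in pieces,
 * buffering partial data in incoming_fifo until 'len' bytes are available */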
static bool fifo_update_and_get(IVShmemState *s, const uint8_t *buf, int size,
                                void *data, size_t len)
{
    const uint8_t *p;
    uint32_t num;

    assert(len <= sizeof(int64_t)); /* limitation of the fifo */
    if (fifo8_is_empty(&s->incoming_fifo) && size == len) {
        memcpy(data, buf, size);
        return true;
    }

    IVSHMEM_DPRINTF("short read of %d bytes\n", size);

    num = MIN(size, sizeof(int64_t) - fifo8_num_used(&s->incoming_fifo));
    fifo8_push_all(&s->incoming_fifo, buf, num);

    if (fifo8_num_used(&s->incoming_fifo) < len) {
        assert(num == 0);
        return false;
    }

    size -= num;
    buf += num;
    p = fifo8_pop_buf(&s->incoming_fifo, len, &num);
    assert(num == len);

    memcpy(data, p, len);

    if (size > 0) {
        fifo8_push_all(&s->incoming_fifo, buf, size);
    }

    return true;
}

static bool fifo_update_and_get_i64(IVShmemState *s,
                                    const uint8_t *buf, int size, int64_t *i64)
{
    if (fifo_update_and_get(s, buf, size, i64, sizeof(*i64))) {
        *i64 = GINT64_FROM_LE(*i64);
        return true;
    }

    return false;
}

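/* allocate a KVM MSI route for 'vector' so its eventfd can be injected
 * directly as an irqfd */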
static int ivshmem_add_kvm_msi_virq(IVShmemState *s, int vector)
{
    PCIDevice *pdev = PCI_DEVICE(s);
    MSIMessage msg = msix_get_message(pdev, vector);
    int ret;

    IVSHMEM_DPRINTF("ivshmem_add_kvm_msi_virq vector:%d\n", vector);

    if (s->msi_vectors[vector].pdev != NULL) {
        return 0;
    }

    ret = kvm_irqchip_add_msi_route(kvm_state, msg, pdev);
    if (ret < 0) {
        error_report("ivshmem: kvm_irqchip_add_msi_route failed");
        return -1;
    }

    s->msi_vectors[vector].virq = ret;
    s->msi_vectors[vector].pdev = pdev;

    return 0;
}

static void setup_interrupt(IVShmemState *s, int vector)
{
    EventNotifier *n = &s->peers[s->vm_id].eventfds[vector];
    bool with_irqfd = kvm_msi_via_irqfd_enabled() &&
        ivshmem_has_feature(s, IVSHMEM_MSI);
    PCIDevice *pdev = PCI_DEVICE(s);

    IVSHMEM_DPRINTF("setting up interrupt for vector: %d\n", vector);

    if (!with_irqfd) {
        IVSHMEM_DPRINTF("with eventfd\n");
        watch_vector_notifier(s, n, vector);
    } else if (msix_enabled(pdev)) {
        IVSHMEM_DPRINTF("with irqfd\n");
        if (ivshmem_add_kvm_msi_virq(s, vector) < 0) {
            return;
        }

        if (!msix_is_masked(pdev, vector)) {
            kvm_irqchip_add_irqfd_notifier_gsi(kvm_state, n, NULL,
                                               s->msi_vectors[vector].virq);
        }
    } else {
        /* it will be delayed until msix is enabled, in write_config */
        IVSHMEM_DPRINTF("with irqfd, delayed until msix enabled\n");
    }
}

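/* handler for messages from the ivshmem server: each message is a
 * little-endian int64 peer position, optionally accompanied by a file
 * descriptor (the shared memory fd when the position is -1, otherwise an
 * eventfd for that peer) */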
static void ivshmem_read(void *opaque, const uint8_t *buf, int size)
{
    IVShmemState *s = opaque;
    int incoming_fd;
    int new_eventfd;
    int64_t incoming_posn;
    Error *err = NULL;
    Peer *peer;

    if (!fifo_update_and_get_i64(s, buf, size, &incoming_posn)) {
        return;
    }

    if (incoming_posn < -1) {
        IVSHMEM_DPRINTF("invalid incoming_posn %" PRId64 "\n", incoming_posn);
        return;
    }

    /* pick off s->server_chr->msgfd and store it, posn should accompany msg */
    incoming_fd = qemu_chr_fe_get_msgfd(s->server_chr);
    IVSHMEM_DPRINTF("posn is %" PRId64 ", fd is %d\n",
                    incoming_posn, incoming_fd);

    /* make sure we have enough space for this peer */
    if (incoming_posn >= s->nb_peers) {
        if (resize_peers(s, incoming_posn + 1) < 0) {
            error_report("failed to resize peers array");
            if (incoming_fd != -1) {
                close(incoming_fd);
            }
            return;
        }
    }

    peer = &s->peers[incoming_posn];

    if (incoming_fd == -1) {
        /* if posn is positive and unseen before then this is our posn */
        if (incoming_posn >= 0 && s->vm_id == -1) {
            /* receive our posn */
            s->vm_id = incoming_posn;
        } else {
            /* otherwise an fd == -1 means an existing peer has gone away */
            IVSHMEM_DPRINTF("posn %" PRId64 " has gone away\n", incoming_posn);
            close_peer_eventfds(s, incoming_posn);
        }
        return;
    }

    /* if the position is -1, then it's the shared memory region fd */
    if (incoming_posn == -1) {
        void * map_ptr;

        if (memory_region_is_mapped(&s->ivshmem)) {
            error_report("shm already initialized");
            close(incoming_fd);
            return;
        }

        if (check_shm_size(s, incoming_fd, &err) == -1) {
            error_report_err(err);
            close(incoming_fd);
            return;
646 647 648 649 650
        }

        /* mmap the region and map into the BAR2 */
        map_ptr = mmap(0, s->ivshmem_size, PROT_READ|PROT_WRITE, MAP_SHARED,
                                                            incoming_fd, 0);
        if (map_ptr == MAP_FAILED) {
            error_report("Failed to mmap shared memory %s", strerror(errno));
            close(incoming_fd);
            return;
        }
        memory_region_init_ram_ptr(&s->ivshmem, OBJECT(s),
                                   "ivshmem.bar2", s->ivshmem_size, map_ptr);
        qemu_set_ram_fd(memory_region_get_ram_addr(&s->ivshmem),
                        incoming_fd);
        vmstate_register_ram(&s->ivshmem, DEVICE(s));

        IVSHMEM_DPRINTF("guest h/w addr = %p, size = %" PRIu64 "\n",
                        map_ptr, s->ivshmem_size);

        memory_region_add_subregion(&s->bar, 0, &s->ivshmem);

        return;
    }

    /* each peer has an associated array of eventfds, and we keep
     * track of how many eventfds we have received so far */
    /* get a new eventfd: */
    if (peer->nb_eventfds >= s->vectors) {
        error_report("Too many eventfd received, device has %d vectors",
                     s->vectors);
        close(incoming_fd);
        return;
    }

    new_eventfd = peer->nb_eventfds++;

    /* this is an eventfd for a particular peer VM */
    IVSHMEM_DPRINTF("eventfds[%" PRId64 "][%d] = %d\n", incoming_posn,
                    new_eventfd, incoming_fd);
    event_notifier_init_fd(&peer->eventfds[new_eventfd], incoming_fd);
    fcntl_setfl(incoming_fd, O_NONBLOCK); /* msix/irqfd poll non block */

    if (incoming_posn == s->vm_id) {
        setup_interrupt(s, new_eventfd);
    }

    if (ivshmem_has_feature(s, IVSHMEM_IOEVENTFD)) {
        ivshmem_add_eventfd(s, incoming_posn, new_eventfd);
    }
}

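/* first message from the server must be the protocol version and must not
 * carry a file descriptor */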
static void ivshmem_check_version(void *opaque, const uint8_t * buf, int size)
{
    IVShmemState *s = opaque;
    int tmp;
    int64_t version;

    if (!fifo_update_and_get_i64(s, buf, size, &version)) {
        return;
    }

    tmp = qemu_chr_fe_get_msgfd(s->server_chr);
    if (tmp != -1 || version != IVSHMEM_PROTOCOL_VERSION) {
        fprintf(stderr, "incompatible version, you are connecting to an ivshmem-"
                "server using a different protocol; please check your setup\n");
        qemu_chr_add_handlers(s->server_chr, NULL, NULL, NULL, s);
        return;
    }

    IVSHMEM_DPRINTF("version check ok, switch to real chardev handler\n");
    qemu_chr_add_handlers(s->server_chr, ivshmem_can_receive, ivshmem_read,
                          NULL, s);
}

/* Select the MSI-X vectors used by the device.
 * ivshmem maps events to vectors statically, so
 * we just enable all vectors on init and after reset. */
static void ivshmem_use_msix(IVShmemState * s)
{
    PCIDevice *d = PCI_DEVICE(s);
    int i;

    IVSHMEM_DPRINTF("%s, msix present: %d\n", __func__, msix_present(d));
    if (!msix_present(d)) {
        return;
    }

    for (i = 0; i < s->vectors; i++) {
        msix_vector_use(d, i);
    }
}

static void ivshmem_reset(DeviceState *d)
{
    IVShmemState *s = IVSHMEM(d);

    s->intrstatus = 0;
    s->intrmask = 0;
    ivshmem_use_msix(s);
}

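/* allocate per-vector state and, when MSI is enabled, the exclusive
 * MSI-X BAR */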
static int ivshmem_setup_interrupts(IVShmemState *s)
{
    /* allocate QEMU callback data for receiving interrupts */
    s->msi_vectors = g_malloc0(s->vectors * sizeof(MSIVector));

    if (ivshmem_has_feature(s, IVSHMEM_MSI)) {
        if (msix_init_exclusive_bar(PCI_DEVICE(s), s->vectors, 1)) {
            return -1;
        }

        IVSHMEM_DPRINTF("msix initialized (%d vectors)\n", s->vectors);
        ivshmem_use_msix(s);
    }

    return 0;
}

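/* once the guest enables MSI-X, hand our own eventfds to KVM as irqfds and
 * install the mask/unmask notifiers */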
static void ivshmem_enable_irqfd(IVShmemState *s)
{
    PCIDevice *pdev = PCI_DEVICE(s);
    int i;

    for (i = 0; i < s->peers[s->vm_id].nb_eventfds; i++) {
        ivshmem_add_kvm_msi_virq(s, i);
    }

    if (msix_set_vector_notifiers(pdev,
                                  ivshmem_vector_unmask,
                                  ivshmem_vector_mask,
                                  ivshmem_vector_poll)) {
        error_report("ivshmem: msix_set_vector_notifiers failed");
    }
}

static void ivshmem_remove_kvm_msi_virq(IVShmemState *s, int vector)
{
    IVSHMEM_DPRINTF("ivshmem_remove_kvm_msi_virq vector:%d\n", vector);

    if (s->msi_vectors[vector].pdev == NULL) {
        return;
    }

    /* it was cleaned when masked in the frontend. */
    kvm_irqchip_release_virq(kvm_state, s->msi_vectors[vector].virq);

    s->msi_vectors[vector].pdev = NULL;
}

static void ivshmem_disable_irqfd(IVShmemState *s)
{
    PCIDevice *pdev = PCI_DEVICE(s);
    int i;

    for (i = 0; i < s->peers[s->vm_id].nb_eventfds; i++) {
        ivshmem_remove_kvm_msi_virq(s, i);
    }

    msix_unset_vector_notifiers(pdev);
}

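/* watch the guest toggling MSI-X enable so irqfds can be set up or torn
 * down at the right time */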
static void ivshmem_write_config(PCIDevice *pdev, uint32_t address,
                                 uint32_t val, int len)
{
    IVShmemState *s = IVSHMEM(pdev);
    int is_enabled, was_enabled = msix_enabled(pdev);

    pci_default_write_config(pdev, address, val, len);
    is_enabled = msix_enabled(pdev);

    if (kvm_msi_via_irqfd_enabled() && s->vm_id != -1) {
        if (!was_enabled && is_enabled) {
            ivshmem_enable_irqfd(s);
        } else if (was_enabled && !is_enabled) {
            ivshmem_disable_irqfd(s);
        }
    }
}

static void pci_ivshmem_realize(PCIDevice *dev, Error **errp)
{
    IVShmemState *s = IVSHMEM(dev);
    uint8_t *pci_conf;
    uint8_t attr = PCI_BASE_ADDRESS_SPACE_MEMORY |
        PCI_BASE_ADDRESS_MEM_PREFETCH;

    if (!!s->server_chr + !!s->shmobj + !!s->hostmem != 1) {
        error_setg(errp,
                   "You must specify either 'shm', 'chardev' or 'x-memdev'");
        return;
    }

    if (s->hostmem) {
        MemoryRegion *mr;

        if (s->sizearg) {
            g_warning("size argument ignored with hostmem");
        }

        mr = host_memory_backend_get_memory(s->hostmem, errp);
        s->ivshmem_size = memory_region_size(mr);
    } else if (s->sizearg == NULL) {
        s->ivshmem_size = 4 << 20; /* 4 MB default */
    } else {
        char *end;
        int64_t size = qemu_strtosz(s->sizearg, &end);
        if (size < 0 || *end != '\0' || !is_power_of_2(size)) {
            error_setg(errp, "Invalid size %s", s->sizearg);
            return;
        }
        s->ivshmem_size = size;
    }

    fifo8_create(&s->incoming_fifo, sizeof(int64_t));

    /* IRQFD requires MSI */
    if (ivshmem_has_feature(s, IVSHMEM_IOEVENTFD) &&
        !ivshmem_has_feature(s, IVSHMEM_MSI)) {
        error_setg(errp, "ioeventfd/irqfd requires MSI");
        return;
    }

    /* check that role is reasonable */
    if (s->role) {
        if (strncmp(s->role, "peer", 5) == 0) {
            s->role_val = IVSHMEM_PEER;
        } else if (strncmp(s->role, "master", 7) == 0) {
            s->role_val = IVSHMEM_MASTER;
        } else {
            error_setg(errp, "'role' must be 'peer' or 'master'");
            return;
        }
    } else {
        s->role_val = IVSHMEM_MASTER; /* default */
    }

    if (s->role_val == IVSHMEM_PEER) {
        error_setg(&s->migration_blocker,
                   "Migration is disabled when using feature 'peer mode' in device 'ivshmem'");
        migrate_add_blocker(s->migration_blocker);
    }

    pci_conf = dev->config;
    pci_conf[PCI_COMMAND] = PCI_COMMAND_IO | PCI_COMMAND_MEMORY;

    pci_config_set_interrupt_pin(pci_conf, 1);

    memory_region_init_io(&s->ivshmem_mmio, OBJECT(s), &ivshmem_mmio_ops, s,
                          "ivshmem-mmio", IVSHMEM_REG_BAR_SIZE);

    /* region for registers */
    pci_register_bar(dev, 0, PCI_BASE_ADDRESS_SPACE_MEMORY,
                     &s->ivshmem_mmio);

    memory_region_init(&s->bar, OBJECT(s), "ivshmem-bar2-container", s->ivshmem_size);
    if (s->ivshmem_64bit) {
        attr |= PCI_BASE_ADDRESS_MEM_TYPE_64;
    }

    if (s->hostmem != NULL) {
        MemoryRegion *mr;

        IVSHMEM_DPRINTF("using hostmem\n");

        mr = host_memory_backend_get_memory(MEMORY_BACKEND(s->hostmem), errp);
        vmstate_register_ram(mr, DEVICE(s));
        memory_region_add_subregion(&s->bar, 0, mr);
        pci_register_bar(PCI_DEVICE(s), 2, attr, &s->bar);
    } else if (s->server_chr != NULL) {
        /* FIXME do not rely on what chr drivers put into filename */
        if (strncmp(s->server_chr->filename, "unix:", 5)) {
            error_setg(errp, "chardev is not a unix client socket");
            return;
        }

        /* if we get a UNIX socket as the parameter we will talk
         * to the ivshmem server to receive the memory region */

        IVSHMEM_DPRINTF("using shared memory server (socket = %s)\n",
                        s->server_chr->filename);

        if (ivshmem_setup_interrupts(s) < 0) {
            error_setg(errp, "failed to initialize interrupts");
            return;
        }

        /* we allocate enough space for 16 peers and grow as needed */
        resize_peers(s, 16);
        s->vm_id = -1;

        pci_register_bar(dev, 2, attr, &s->bar);

        qemu_chr_add_handlers(s->server_chr, ivshmem_can_receive,
                              ivshmem_check_version, NULL, s);
    } else {
        /* just map the file immediately, we're not using a server */
        int fd;

        IVSHMEM_DPRINTF("using shm_open (shm object = %s)\n", s->shmobj);

        /* try opening with O_EXCL and if it succeeds zero the memory
         * by truncating to 0 */
        if ((fd = shm_open(s->shmobj, O_CREAT|O_RDWR|O_EXCL,
                        S_IRWXU|S_IRWXG|S_IRWXO)) > 0) {
           /* truncate file to length PCI device's memory */
            if (ftruncate(fd, s->ivshmem_size) != 0) {
                error_report("could not truncate shared file");
            }

        } else if ((fd = shm_open(s->shmobj, O_CREAT|O_RDWR,
                        S_IRWXU|S_IRWXG|S_IRWXO)) < 0) {
            error_setg(errp, "could not open shared file");
            return;
        }

        if (check_shm_size(s, fd, errp) == -1) {
            return;
        }

        create_shared_memory_BAR(s, fd, attr, errp);
    }
}

static void pci_ivshmem_exit(PCIDevice *dev)
{
    IVShmemState *s = IVSHMEM(dev);
    int i;

    fifo8_destroy(&s->incoming_fifo);

    if (s->migration_blocker) {
        migrate_del_blocker(s->migration_blocker);
        error_free(s->migration_blocker);
    }

    if (memory_region_is_mapped(&s->ivshmem)) {
        if (!s->hostmem) {
            void *addr = memory_region_get_ram_ptr(&s->ivshmem);
            int fd;

            if (munmap(addr, s->ivshmem_size) == -1) {
                error_report("Failed to munmap shared memory %s",
                             strerror(errno));
            }

            fd = qemu_get_ram_fd(memory_region_get_ram_addr(&s->ivshmem));
            if (fd != -1) {
                close(fd);
            }
        }

        vmstate_unregister_ram(&s->ivshmem, DEVICE(dev));
        memory_region_del_subregion(&s->bar, &s->ivshmem);
    }

    if (s->peers) {
        for (i = 0; i < s->nb_peers; i++) {
            close_peer_eventfds(s, i);
        }
        g_free(s->peers);
    }

    if (ivshmem_has_feature(s, IVSHMEM_MSI)) {
        msix_uninit_exclusive_bar(dev);
    }

    g_free(s->msi_vectors);
}

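/* migration support: 'peer' role devices are not migratable, and version 0
 * streams are handled by ivshmem_load_old() */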
static bool test_msix(void *opaque, int version_id)
{
    IVShmemState *s = opaque;

    return ivshmem_has_feature(s, IVSHMEM_MSI);
}

static bool test_no_msix(void *opaque, int version_id)
{
    return !test_msix(opaque, version_id);
}

static int ivshmem_pre_load(void *opaque)
{
    IVShmemState *s = opaque;

    if (s->role_val == IVSHMEM_PEER) {
        error_report("'peer' devices are not migratable");
        return -EINVAL;
    }

    return 0;
}

static int ivshmem_post_load(void *opaque, int version_id)
{
    IVShmemState *s = opaque;

    if (ivshmem_has_feature(s, IVSHMEM_MSI)) {
        ivshmem_use_msix(s);
    }

    return 0;
}

static int ivshmem_load_old(QEMUFile *f, void *opaque, int version_id)
{
    IVShmemState *s = opaque;
    PCIDevice *pdev = PCI_DEVICE(s);
    int ret;

    IVSHMEM_DPRINTF("ivshmem_load_old\n");

    if (version_id != 0) {
        return -EINVAL;
    }

    if (s->role_val == IVSHMEM_PEER) {
        error_report("'peer' devices are not migratable");
        return -EINVAL;
    }

    ret = pci_device_load(pdev, f);
    if (ret) {
        return ret;
    }

    if (ivshmem_has_feature(s, IVSHMEM_MSI)) {
        msix_load(pdev, f);
        ivshmem_use_msix(s);
    } else {
        s->intrstatus = qemu_get_be32(f);
        s->intrmask = qemu_get_be32(f);
    }

    return 0;
}

static const VMStateDescription ivshmem_vmsd = {
    .name = "ivshmem",
    .version_id = 1,
    .minimum_version_id = 1,
    .pre_load = ivshmem_pre_load,
    .post_load = ivshmem_post_load,
    .fields = (VMStateField[]) {
        VMSTATE_PCI_DEVICE(parent_obj, IVShmemState),

        VMSTATE_MSIX_TEST(parent_obj, IVShmemState, test_msix),
        VMSTATE_UINT32_TEST(intrstatus, IVShmemState, test_no_msix),
        VMSTATE_UINT32_TEST(intrmask, IVShmemState, test_no_msix),

        VMSTATE_END_OF_LIST()
    },
    .load_state_old = ivshmem_load_old,
    .minimum_version_id_old = 0
};

static Property ivshmem_properties[] = {
    DEFINE_PROP_CHR("chardev", IVShmemState, server_chr),
    DEFINE_PROP_STRING("size", IVShmemState, sizearg),
    DEFINE_PROP_UINT32("vectors", IVShmemState, vectors, 1),
    DEFINE_PROP_BIT("ioeventfd", IVShmemState, features, IVSHMEM_IOEVENTFD, false),
    DEFINE_PROP_BIT("msi", IVShmemState, features, IVSHMEM_MSI, true),
    DEFINE_PROP_STRING("shm", IVShmemState, shmobj),
    DEFINE_PROP_STRING("role", IVShmemState, role),
    DEFINE_PROP_UINT32("use64", IVShmemState, ivshmem_64bit, 1),
    DEFINE_PROP_END_OF_LIST(),
};

static void ivshmem_class_init(ObjectClass *klass, void *data)
{
    DeviceClass *dc = DEVICE_CLASS(klass);
    PCIDeviceClass *k = PCI_DEVICE_CLASS(klass);

    k->realize = pci_ivshmem_realize;
    k->exit = pci_ivshmem_exit;
    k->config_write = ivshmem_write_config;
    k->vendor_id = PCI_VENDOR_ID_IVSHMEM;
    k->device_id = PCI_DEVICE_ID_IVSHMEM;
    k->class_id = PCI_CLASS_MEMORY_RAM;
    dc->reset = ivshmem_reset;
    dc->props = ivshmem_properties;
    dc->vmsd = &ivshmem_vmsd;
    set_bit(DEVICE_CATEGORY_MISC, dc->categories);
    dc->desc = "Inter-VM shared memory";
}

static void ivshmem_check_memdev_is_busy(Object *obj, const char *name,
                                         Object *val, Error **errp)
{
    MemoryRegion *mr;

    mr = host_memory_backend_get_memory(MEMORY_BACKEND(val), errp);
    if (memory_region_is_mapped(mr)) {
        char *path = object_get_canonical_path_component(val);
        error_setg(errp, "can't use already busy memdev: %s", path);
        g_free(path);
    } else {
        qdev_prop_allow_set_link_before_realize(obj, name, val, errp);
    }
}

static void ivshmem_init(Object *obj)
{
    IVShmemState *s = IVSHMEM(obj);

    object_property_add_link(obj, "x-memdev", TYPE_MEMORY_BACKEND,
                             (Object **)&s->hostmem,
                             ivshmem_check_memdev_is_busy,
                             OBJ_PROP_LINK_UNREF_ON_RELEASE,
                             &error_abort);
}

static const TypeInfo ivshmem_info = {
    .name          = TYPE_IVSHMEM,
    .parent        = TYPE_PCI_DEVICE,
    .instance_size = sizeof(IVShmemState),
    .instance_init = ivshmem_init,
    .class_init    = ivshmem_class_init,
};

static void ivshmem_register_types(void)
{
    type_register_static(&ivshmem_info);
}

type_init(ivshmem_register_types)