ivshmem.c 32.7 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14
/*
 * Inter-VM Shared Memory PCI device.
 *
 * Author:
 *      Cam Macdonell <cam@cs.ualberta.ca>
 *
 * Based On: cirrus_vga.c
 *          Copyright (c) 2004 Fabrice Bellard
 *          Copyright (c) 2004 Makoto Suzuki (suzu)
 *
 *      and rtl8139.c
 *          Copyright (c) 2006 Igor Kovalenko
 *
 * This code is licensed under the GNU GPL v2.
 *
 * Contributions after 2012-01-13 are licensed under the terms of the
 * GNU GPL, version 2 or (at your option) any later version.
 */
19
#include "hw/hw.h"
P
Paolo Bonzini 已提交
20
#include "hw/i386/pc.h"
21
#include "hw/pci/pci.h"
22
#include "hw/pci/msi.h"
23
#include "hw/pci/msix.h"
24
#include "sysemu/kvm.h"
25
#include "migration/migration.h"
26
#include "qemu/error-report.h"
27
#include "qemu/event_notifier.h"
28
#include "qemu/fifo8.h"
29
#include "sysemu/char.h"
M
Marc-André Lureau 已提交
30 31
#include "sysemu/hostmem.h"
#include "qapi/visitor.h"
32

33 34
#include "hw/misc/ivshmem.h"

35 36
#include <sys/mman.h>
#include <sys/types.h>
37
#include <limits.h>
38

39 40 41
#define PCI_VENDOR_ID_IVSHMEM   PCI_VENDOR_ID_REDHAT_QUMRANET
#define PCI_DEVICE_ID_IVSHMEM   0x1110

/* Upper bound on the number of peer slots (see resize_peers()) */
#define IVSHMEM_MAX_PEERS G_MAXUINT16

/* Bit positions inside IVShmemState.features (see ivshmem_has_feature()) */
#define IVSHMEM_IOEVENTFD   0
#define IVSHMEM_MSI     1

/* Values stored in IVShmemState.role_val */
#define IVSHMEM_PEER    0
#define IVSHMEM_MASTER  1

/* Size in bytes of the register MMIO region */
#define IVSHMEM_REG_BAR_SIZE 0x100

/* Uncomment the next line to get debug traces on stdout */
//#define DEBUG_IVSHMEM
#ifdef DEBUG_IVSHMEM
#define IVSHMEM_DPRINTF(fmt, ...)        \
    do {printf("IVSHMEM: " fmt, ## __VA_ARGS__); } while (0)
#else
/* Expands to a real (empty) statement so it is safe in any context */
#define IVSHMEM_DPRINTF(fmt, ...)        \
    do { } while (0)
#endif

#define TYPE_IVSHMEM "ivshmem"
#define IVSHMEM(obj) \
    OBJECT_CHECK(IVShmemState, (obj), TYPE_IVSHMEM)

#define IVSHMEM_MEMDEV_PROP "memdev"

65 66
typedef struct Peer {
    int nb_eventfds;
67
    EventNotifier *eventfds;
68 69
} Peer;

70
typedef struct MSIVector {
71
    PCIDevice *pdev;
72
    int virq;
73
} MSIVector;
74 75

typedef struct IVShmemState {
76 77 78 79
    /*< private >*/
    PCIDevice parent_obj;
    /*< public >*/

M
Marc-André Lureau 已提交
80
    HostMemoryBackend *hostmem;
81 82 83 84 85
    uint32_t intrmask;
    uint32_t intrstatus;

    CharDriverState **eventfd_chr;
    CharDriverState *server_chr;
86
    Fifo8 incoming_fifo;
A
Avi Kivity 已提交
87
    MemoryRegion ivshmem_mmio;
88

A
Avi Kivity 已提交
89 90 91 92 93 94
    /* We might need to register the BAR before we actually have the memory.
     * So prepare a container MemoryRegion for the BAR immediately and
     * add a subregion when we have the memory.
     */
    MemoryRegion bar;
    MemoryRegion ivshmem;
95
    uint64_t ivshmem_size; /* size of shared memory region */
G
Gerd Hoffmann 已提交
96
    uint32_t ivshmem_64bit;
97 98

    Peer *peers;
99
    int nb_peers; /* how many peers we have space for */
100 101 102 103

    int vm_id;
    uint32_t vectors;
    uint32_t features;
104
    MSIVector *msi_vectors;
105

106 107
    Error *migration_blocker;

108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127
    char * shmobj;
    char * sizearg;
    char * role;
    int role_val;   /* scalar to avoid multiple string comparisons */
} IVShmemState;

/* Byte offsets of the device registers inside the register MMIO region */
enum ivshmem_registers {
    INTRMASK   = 0x0,   /* interrupt mask */
    INTRSTATUS = 0x4,   /* interrupt status, cleared by reading */
    IVPOSITION = 0x8,   /* our own peer ID */
    DOORBELL   = 0xc,   /* write (peer << 16 | vector) to notify a peer */
};

/*
 * Test one feature bit.
 *
 * @feature is a bit position (IVSHMEM_IOEVENTFD, IVSHMEM_MSI).
 * Returns the masked bit: non-zero when the feature is set, 0 otherwise.
 */
static inline uint32_t ivshmem_has_feature(IVShmemState *ivs,
                                           unsigned int feature)
{
    uint32_t mask = 1 << feature;

    return ivs->features & mask;
}

/* accessing registers - based on rtl8139 */
128
/*
 * Recompute the level of the PCI interrupt line: it is asserted while at
 * least one bit is set in both the status and the mask register.
 */
static void ivshmem_update_irq(IVShmemState *s)
{
    PCIDevice *d = PCI_DEVICE(s);
    /* unsigned, so a set bit 31 never turns into a negative value */
    uint32_t isr = s->intrstatus & s->intrmask;

    /* don't print ISR resets */
    if (isr) {
        IVSHMEM_DPRINTF("Set IRQ to %d (%04x %04x)\n",
                        isr ? 1 : 0, s->intrstatus, s->intrmask);
    }

    pci_set_irq(d, (isr != 0));
}

/* Store a new interrupt mask and re-evaluate the interrupt line. */
static void ivshmem_IntrMask_write(IVShmemState *s, uint32_t val)
{
    IVSHMEM_DPRINTF("IntrMask write(w) val = 0x%04x\n", val);

    s->intrmask = val;

    ivshmem_update_irq(s);
}

/* Return the current interrupt mask; reading has no side effect. */
static uint32_t ivshmem_IntrMask_read(IVShmemState *s)
{
    IVSHMEM_DPRINTF("intrmask read(w) val = 0x%04x\n", s->intrmask);

    return s->intrmask;
}

/* Overwrite the interrupt status and re-evaluate the interrupt line. */
static void ivshmem_IntrStatus_write(IVShmemState *s, uint32_t val)
{
    IVSHMEM_DPRINTF("IntrStatus write(w) val = 0x%04x\n", val);

    s->intrstatus = val;

    ivshmem_update_irq(s);
}

/*
 * Read-to-clear access to the interrupt status.
 *
 * Returns the status as it was before the read; the register is then
 * zeroed and the interrupt line updated accordingly.
 */
static uint32_t ivshmem_IntrStatus_read(IVShmemState *s)
{
    uint32_t ret = s->intrstatus;

    /* reading ISR clears all interrupts */
    s->intrstatus = 0;

    ivshmem_update_irq(s);

    return ret;
}

A
Avi Kivity 已提交
182
/*
 * MMIO write handler for the register region.
 *
 * For DOORBELL, bits 31..16 of @val select the destination peer and the
 * low 8 bits select the vector; the matching event notifier is signalled
 * if both are in range.  Writes to other offsets than INTRMASK,
 * INTRSTATUS and DOORBELL are ignored.
 */
static void ivshmem_io_write(void *opaque, hwaddr addr,
                             uint64_t val, unsigned size)
{
    IVShmemState *s = opaque;

    uint16_t dest = val >> 16;
    uint16_t vector = val & 0xff;

    /* only the register offset within the region is of interest */
    addr &= 0xfc;

    IVSHMEM_DPRINTF("writing to addr " TARGET_FMT_plx "\n", addr);
    switch (addr)
    {
        case INTRMASK:
            ivshmem_IntrMask_write(s, val);
            break;

        case INTRSTATUS:
            ivshmem_IntrStatus_write(s, val);
            break;

        case DOORBELL:
            /* check that dest VM ID is reasonable */
            if (dest >= s->nb_peers) {
                IVSHMEM_DPRINTF("Invalid destination VM ID (%d)\n", dest);
                break;
            }

            /* check doorbell range */
            if (vector < s->peers[dest].nb_eventfds) {
                IVSHMEM_DPRINTF("Notifying VM %d on vector %d\n", dest, vector);
                event_notifier_set(&s->peers[dest].eventfds[vector]);
            } else {
                IVSHMEM_DPRINTF("Invalid destination vector %d on VM %d\n",
                                vector, dest);
            }
            break;
        default:
            IVSHMEM_DPRINTF("Unhandled write " TARGET_FMT_plx "\n", addr);
            break;
    }
}

A
Avi Kivity 已提交
224
/*
 * MMIO read handler for the register region.
 *
 * IVPOSITION yields our peer ID once the shared memory is mapped and -1
 * (as a 32-bit value) before that.  Unknown offsets read as 0.
 */
static uint64_t ivshmem_io_read(void *opaque, hwaddr addr,
                                unsigned size)
{

    IVShmemState *s = opaque;
    uint32_t ret;

    switch (addr)
    {
        case INTRMASK:
            ret = ivshmem_IntrMask_read(s);
            break;

        case INTRSTATUS:
            ret = ivshmem_IntrStatus_read(s);
            break;

        case IVPOSITION:
            /* return my VM ID if the memory is mapped */
            if (memory_region_is_mapped(&s->ivshmem)) {
                ret = s->vm_id;
            } else {
                ret = -1;
            }
            break;

        default:
            IVSHMEM_DPRINTF("why are we reading " TARGET_FMT_plx "\n", addr);
            ret = 0;
    }

    return ret;
}

A
Avi Kivity 已提交
258 259 260 261 262 263 264 265
/* Access callbacks of the register region; implemented as 4-byte accesses */
static const MemoryRegionOps ivshmem_mmio_ops = {
    .read = ivshmem_io_read,
    .write = ivshmem_io_write,
    .endianness = DEVICE_NATIVE_ENDIAN,
    .impl = {
        .min_access_size = 4,
        .max_access_size = 4,
    },
};

/*
 * Chardev read handler used when MSI is off: the first received byte
 * becomes the new interrupt status, which may raise the interrupt line.
 */
static void ivshmem_receive(void *opaque, const uint8_t *buf, int size)
{
    IVShmemState *s = opaque;

    IVSHMEM_DPRINTF("ivshmem_receive 0x%02x size: %d\n", *buf, size);

    ivshmem_IntrStatus_write(s, *buf);
}

/*
 * Chardev "can receive" callback: always accept up to sizeof(long) bytes,
 * which is the size of one message and of the incoming FIFO.
 */
static int ivshmem_can_receive(void * opaque)
{
    return sizeof(long);
}

/* Chardev event callback: events are only traced, never acted upon. */
static void ivshmem_event(void *opaque, int event)
{
    (void)opaque;   /* no device state is needed here */

    IVSHMEM_DPRINTF("ivshmem_event %d\n", event);
}

static void fake_irqfd(void *opaque, const uint8_t *buf, int size) {

289
    MSIVector *entry = opaque;
290
    PCIDevice *pdev = entry->pdev;
291
    IVShmemState *s = IVSHMEM(pdev);
292
    int vector = entry - s->msi_vectors;
293

294 295
    IVSHMEM_DPRINTF("interrupt on vector %p %d\n", pdev, vector);
    msix_notify(pdev, vector);
296 297
}

298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354
/*
 * MSI-X "vector unmasked" notifier: refresh the KVM route of @vector with
 * @msg, then attach our own eventfd for that vector as irqfd.
 *
 * Returns a negative value if either KVM call fails.
 */
static int ivshmem_vector_unmask(PCIDevice *dev, unsigned vector,
                                 MSIMessage msg)
{
    IVShmemState *s = IVSHMEM(dev);
    int virq = s->msi_vectors[vector].virq;
    int rc;

    IVSHMEM_DPRINTF("vector unmask %p %d\n", dev, vector);

    rc = kvm_irqchip_update_msi_route(kvm_state, virq, msg, dev);
    if (rc >= 0) {
        rc = kvm_irqchip_add_irqfd_notifier_gsi(
                 kvm_state, &s->peers[s->vm_id].eventfds[vector], NULL, virq);
    }

    return rc;
}

/*
 * MSI-X "vector masked" notifier: detach our eventfd for @vector from its
 * KVM irqfd.  A failure is reported but otherwise ignored.
 */
static void ivshmem_vector_mask(PCIDevice *dev, unsigned vector)
{
    IVShmemState *s = IVSHMEM(dev);
    EventNotifier *own = &s->peers[s->vm_id].eventfds[vector];

    IVSHMEM_DPRINTF("vector mask %p %d\n", dev, vector);

    if (kvm_irqchip_remove_irqfd_notifier_gsi(kvm_state, own,
                                              s->msi_vectors[vector].virq)
        != 0) {
        error_report("remove_irqfd_notifier_gsi failed");
    }
}

/*
 * MSI-X poll notifier: for every masked vector in
 * [vector_start, vector_end), capped at the device's vector count, turn a
 * signalled eventfd into a pending bit.
 */
static void ivshmem_vector_poll(PCIDevice *dev,
                                unsigned int vector_start,
                                unsigned int vector_end)
{
    IVShmemState *s = IVSHMEM(dev);
    unsigned int v, stop;

    IVSHMEM_DPRINTF("vector poll %p %d-%d\n", dev, vector_start, vector_end);

    stop = MIN(vector_end, s->vectors);

    for (v = vector_start; v < stop; v++) {
        /* unmasked vectors are delivered directly, nothing to latch */
        if (msix_is_masked(dev, v) &&
            event_notifier_test_and_clear(&s->peers[s->vm_id].eventfds[v])) {
            msix_set_pending(dev, v);
        }
    }
}

355 356
/*
 * Create a character device on top of the eventfd behind @n and install
 * the read handler that turns eventfd activity into a guest interrupt.
 *
 * @opaque is the IVShmemState, @vector the vector the eventfd stands for.
 * Returns the new chardev, or NULL if it could not be created.
 */
static CharDriverState* create_eventfd_chr_device(void * opaque, EventNotifier *n,
                                                  int vector)
{
    IVShmemState *s = opaque;
    int eventfd = event_notifier_get_fd(n);
    CharDriverState *chr;

    chr = qemu_chr_open_eventfd(eventfd);

    if (chr == NULL) {
        error_report("creating chardriver for eventfd %d failed", eventfd);
        return NULL;
    }
    qemu_chr_fe_claim_no_fail(chr);

    /* if MSI is supported we need multiple interrupts */
    if (ivshmem_has_feature(s, IVSHMEM_MSI)) {
        /*
         * s->msi_vectors is only allocated by ivshmem_setup_msi(), i.e. when
         * MSI is on, so it must not be touched outside this branch.
         */
        s->msi_vectors[vector].pdev = PCI_DEVICE(s);

        qemu_chr_add_handlers(chr, ivshmem_can_receive, fake_irqfd,
                      ivshmem_event, &s->msi_vectors[vector]);
    } else {
        qemu_chr_add_handlers(chr, ivshmem_can_receive, ivshmem_receive,
                      ivshmem_event, s);
    }

    return chr;

}

M
Marc-André Lureau 已提交
389 390
/*
 * Check that the object behind @fd is at least as large as the shared
 * memory region the guest is going to map.
 *
 * Returns 0 if it is, -1 (with @errp set) if fstat() fails or the object
 * is smaller than s->ivshmem_size.
 */
static int check_shm_size(IVShmemState *s, int fd, Error **errp)
{
    struct stat buf;

    if (fstat(fd, &buf) < 0) {
        error_setg(errp, "exiting: fstat on fd %d failed: %s",
                   fd, strerror(errno));
        return -1;
    }

    if (s->ivshmem_size > buf.st_size) {
        error_setg(errp, "Requested memory size greater"
                   " than shared object size (%" PRIu64 " > %" PRIu64")",
                   s->ivshmem_size, (uint64_t)buf.st_size);
        return -1;
    } else {
        return 0;
    }
}

/* create the shared memory BAR when we are not using the server, so we can
 * create the BAR and map the memory immediately */
M
Marc-André Lureau 已提交
414 415 416
static int create_shared_memory_BAR(IVShmemState *s, int fd, uint8_t attr,
                                    Error **errp)
{
417 418 419
    void * ptr;

    ptr = mmap(0, s->ivshmem_size, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
M
Marc-André Lureau 已提交
420 421 422 423 424
    if (ptr == MAP_FAILED) {
        error_setg_errno(errp, errno, "Failed to mmap shared memory");
        return -1;
    }

425
    memory_region_init_ram_ptr(&s->ivshmem, OBJECT(s), "ivshmem.bar2",
A
Avi Kivity 已提交
426
                               s->ivshmem_size, ptr);
427
    vmstate_register_ram(&s->ivshmem, DEVICE(s));
A
Avi Kivity 已提交
428
    memory_region_add_subregion(&s->bar, 0, &s->ivshmem);
429 430

    /* region for shared memory */
431
    pci_register_bar(PCI_DEVICE(s), 2, attr, &s->bar);
M
Marc-André Lureau 已提交
432 433

    return 0;
434 435
}

436 437 438 439 440 441 442
/*
 * Register eventfd @i of peer @posn as ioeventfd on the DOORBELL register,
 * matching 4-byte writes of the value (posn << 16) | i.
 */
static void ivshmem_add_eventfd(IVShmemState *s, int posn, int i)
{
    memory_region_add_eventfd(&s->ivshmem_mmio,
                              DOORBELL,
                              4,
                              true,
                              (posn << 16) | i,
                              &s->peers[posn].eventfds[i]);
}

/*
 * Undo ivshmem_add_eventfd() for eventfd @i of peer @posn; the arguments
 * mirror the ones used for registration.
 */
static void ivshmem_del_eventfd(IVShmemState *s, int posn, int i)
{
    memory_region_del_eventfd(&s->ivshmem_mmio,
                              DOORBELL,
                              4,
                              true,
                              (posn << 16) | i,
                              &s->peers[posn].eventfds[i]);
}

456
/*
 * Release everything held for peer @posn: its ioeventfd registrations (if
 * the ioeventfd feature is on), its event notifiers and the notifier array.
 *
 * The notifiers and the array are released whether or not ioeventfd is in
 * use, otherwise they would leak.  The array pointer is reset afterwards so
 * that calling this again for the same peer (e.g. once when the peer goes
 * away and once more at device teardown) does not free it twice.
 *
 * An out-of-range @posn is reported and otherwise ignored.
 */
static void close_peer_eventfds(IVShmemState *s, int posn)
{
    int i, n;

    if (posn < 0 || posn >= s->nb_peers) {
        error_report("invalid peer %d", posn);
        return;
    }

    n = s->peers[posn].nb_eventfds;

    /* ioeventfds were only registered when the feature is enabled */
    if (ivshmem_has_feature(s, IVSHMEM_IOEVENTFD)) {
        memory_region_transaction_begin();
        for (i = 0; i < n; i++) {
            ivshmem_del_eventfd(s, posn, i);
        }
        memory_region_transaction_commit();
    }

    for (i = 0; i < n; i++) {
        event_notifier_cleanup(&s->peers[posn].eventfds[i]);
    }

    g_free(s->peers[posn].eventfds);
    s->peers[posn].eventfds = NULL;     /* no dangling pointer */
    s->peers[posn].nb_eventfds = 0;
}

/* this function increase the dynamic storage need to store data about other
484
 * peers */
485
static int resize_peers(IVShmemState *s, int new_min_size)
486
{
487

488
    int j, old_size;
489

490 491
    /* limit number of max peers */
    if (new_min_size <= 0 || new_min_size > IVSHMEM_MAX_PEERS) {
492 493
        return -1;
    }
494
    if (new_min_size <= s->nb_peers) {
495 496
        return 0;
    }
497

498 499 500
    old_size = s->nb_peers;
    s->nb_peers = new_min_size;

501
    IVSHMEM_DPRINTF("bumping storage to %d peers\n", s->nb_peers);
502

503
    s->peers = g_realloc(s->peers, s->nb_peers * sizeof(Peer));
504

505
    for (j = old_size; j < s->nb_peers; j++) {
506
        s->peers[j].eventfds = g_new0(EventNotifier, s->vectors);
507 508
        s->peers[j].nb_eventfds = 0;
    }
509 510

    return 0;
511 512
}

513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548
/*
 * Assemble one @len byte message from the byte stream of the server.
 *
 * @buf/@size is the chunk just received.  If it is exactly one message and
 * nothing is buffered, it is copied to @data directly.  Otherwise the chunk
 * is appended to s->incoming_fifo, and once @len bytes are available they
 * are popped into @data; bytes beyond that stay buffered for the next call.
 *
 * Returns true if @data was filled, false if more input is needed.
 */
static bool fifo_update_and_get(IVShmemState *s, const uint8_t *buf, int size,
                                void *data, size_t len)
{
    const uint8_t *p;
    uint32_t num;

    assert(len <= sizeof(long)); /* limitation of the fifo */
    if (fifo8_is_empty(&s->incoming_fifo) && size == len) {
        memcpy(data, buf, size);
        return true;
    }

    IVSHMEM_DPRINTF("short read of %d bytes\n", size);

    num = MIN(size, sizeof(long) - fifo8_num_used(&s->incoming_fifo));
    fifo8_push_all(&s->incoming_fifo, buf, num);

    if (fifo8_num_used(&s->incoming_fifo) < len) {
        /*
         * Still incomplete: since the fifo did not even reach @len bytes,
         * it had room for the whole chunk, so nothing may be left over.
         */
        assert(num == (uint32_t)size);
        return false;
    }

    size -= num;
    buf += num;
    p = fifo8_pop_buf(&s->incoming_fifo, len, &num);
    assert(num == len);

    memcpy(data, p, len);

    /* keep the start of the next message for the following call */
    if (size > 0) {
        fifo8_push_all(&s->incoming_fifo, buf, size);
    }

    return true;
}

549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600
/*
 * Make sure @vector has a KVM MSI route.
 *
 * A vector whose pdev is already set is considered done.  Otherwise a
 * route is added for the vector's current MSI-X message and remembered in
 * s->msi_vectors[vector].
 *
 * Returns 0 on success or if nothing had to be done, -1 if KVM refused
 * the route.
 */
static int ivshmem_add_kvm_msi_virq(IVShmemState *s, int vector)
{
    PCIDevice *pdev = PCI_DEVICE(s);
    MSIMessage msg = msix_get_message(pdev, vector);
    MSIVector *entry = &s->msi_vectors[vector];
    int virq;

    IVSHMEM_DPRINTF("ivshmem_add_kvm_msi_virq vector:%d\n", vector);

    if (entry->pdev == NULL) {
        virq = kvm_irqchip_add_msi_route(kvm_state, msg, pdev);
        if (virq < 0) {
            error_report("ivshmem: kvm_irqchip_add_msi_route failed");
            return -1;
        }

        entry->virq = virq;
        entry->pdev = pdev;
    }

    return 0;
}

/*
 * Hook up our own eventfd for @vector so that it interrupts the guest.
 *
 * Without irqfd support (or without MSI) a chardev is created on the
 * eventfd.  With irqfd, the KVM route is set up right away if MSI-X is
 * already enabled; otherwise this is left to the config-space write
 * handler.
 */
static void setup_interrupt(IVShmemState *s, int vector)
{
    EventNotifier *own = &s->peers[s->vm_id].eventfds[vector];
    bool with_irqfd = kvm_msi_via_irqfd_enabled() &&
        ivshmem_has_feature(s, IVSHMEM_MSI);
    PCIDevice *pdev = PCI_DEVICE(s);

    IVSHMEM_DPRINTF("setting up interrupt for vector: %d\n", vector);

    if (!with_irqfd) {
        IVSHMEM_DPRINTF("with eventfd");
        s->eventfd_chr[vector] = create_eventfd_chr_device(s, own, vector);
        return;
    }

    if (!msix_enabled(pdev)) {
        /* it will be delayed until msix is enabled, in write_config */
        IVSHMEM_DPRINTF("with irqfd, delayed until msix enabled");
        return;
    }

    IVSHMEM_DPRINTF("with irqfd");
    if (ivshmem_add_kvm_msi_virq(s, vector) < 0) {
        return;
    }

    /* a masked vector gets its irqfd when it is unmasked */
    if (!msix_is_masked(pdev, vector)) {
        kvm_irqchip_add_irqfd_notifier_gsi(kvm_state, own, NULL,
                                           s->msi_vectors[vector].virq);
    }
}

601
static void ivshmem_read(void *opaque, const uint8_t *buf, int size)
602 603
{
    IVShmemState *s = opaque;
604
    int incoming_fd;
605
    int new_eventfd;
606
    long incoming_posn;
M
Marc-André Lureau 已提交
607
    Error *err = NULL;
608
    Peer *peer;
609

610 611 612
    if (!fifo_update_and_get(s, buf, size,
                             &incoming_posn, sizeof(incoming_posn))) {
        return;
613 614
    }

615 616 617 618 619
    if (incoming_posn < -1) {
        IVSHMEM_DPRINTF("invalid incoming_posn %ld\n", incoming_posn);
        return;
    }

620
    /* pick off s->server_chr->msgfd and store it, posn should accompany msg */
621 622
    incoming_fd = qemu_chr_fe_get_msgfd(s->server_chr);
    IVSHMEM_DPRINTF("posn is %ld, fd is %d\n", incoming_posn, incoming_fd);
623

624
    /* make sure we have enough space for this peer */
625
    if (incoming_posn >= s->nb_peers) {
626 627
        if (resize_peers(s, incoming_posn + 1) < 0) {
            error_report("failed to resize peers array");
628 629
            if (incoming_fd != -1) {
                close(incoming_fd);
630 631 632
            }
            return;
        }
633 634
    }

635 636
    peer = &s->peers[incoming_posn];

637
    if (incoming_fd == -1) {
638
        /* if posn is positive and unseen before then this is our posn*/
639
        if (incoming_posn >= 0 && s->vm_id == -1) {
640 641 642
            /* receive our posn */
            s->vm_id = incoming_posn;
        } else {
643
            /* otherwise an fd == -1 means an existing peer has gone away */
644
            IVSHMEM_DPRINTF("posn %ld has gone away\n", incoming_posn);
645
            close_peer_eventfds(s, incoming_posn);
646
        }
M
Marc-André Lureau 已提交
647
        return;
648 649 650 651 652 653
    }

    /* if the position is -1, then it's shared memory region fd */
    if (incoming_posn == -1) {
        void * map_ptr;

654
        if (memory_region_is_mapped(&s->ivshmem)) {
655 656 657 658 659
            error_report("shm already initialized");
            close(incoming_fd);
            return;
        }

M
Marc-André Lureau 已提交
660 661 662 663
        if (check_shm_size(s, incoming_fd, &err) == -1) {
            error_report_err(err);
            close(incoming_fd);
            return;
664 665 666 667 668
        }

        /* mmap the region and map into the BAR2 */
        map_ptr = mmap(0, s->ivshmem_size, PROT_READ|PROT_WRITE, MAP_SHARED,
                                                            incoming_fd, 0);
M
Marc-André Lureau 已提交
669 670 671 672 673
        if (map_ptr == MAP_FAILED) {
            error_report("Failed to mmap shared memory %s", strerror(errno));
            close(incoming_fd);
            return;
        }
674
        memory_region_init_ram_ptr(&s->ivshmem, OBJECT(s),
A
Avi Kivity 已提交
675
                                   "ivshmem.bar2", s->ivshmem_size, map_ptr);
676
        vmstate_register_ram(&s->ivshmem, DEVICE(s));
677

678
        IVSHMEM_DPRINTF("guest h/w addr = %p, size = %" PRIu64 "\n",
A
Andrew Jones 已提交
679
                        map_ptr, s->ivshmem_size);
680

A
Avi Kivity 已提交
681
        memory_region_add_subregion(&s->bar, 0, &s->ivshmem);
682

683
        close(incoming_fd);
684 685 686
        return;
    }

687 688 689
    /* each peer has an associated array of eventfds, and we keep
     * track of how many eventfds received so far */
    /* get a new eventfd: */
690 691 692 693 694 695 696
    if (peer->nb_eventfds >= s->vectors) {
        error_report("Too many eventfd received, device has %d vectors",
                     s->vectors);
        close(incoming_fd);
        return;
    }

697
    new_eventfd = peer->nb_eventfds++;
698

699
    /* this is an eventfd for a particular peer VM */
700
    IVSHMEM_DPRINTF("eventfds[%ld][%d] = %d\n", incoming_posn,
701 702
                    new_eventfd, incoming_fd);
    event_notifier_init_fd(&peer->eventfds[new_eventfd], incoming_fd);
703
    fcntl_setfl(incoming_fd, O_NONBLOCK); /* msix/irqfd poll non block */
704 705

    if (incoming_posn == s->vm_id) {
706
        setup_interrupt(s, new_eventfd);
707 708 709
    }

    if (ivshmem_has_feature(s, IVSHMEM_IOEVENTFD)) {
710
        ivshmem_add_eventfd(s, incoming_posn, new_eventfd);
711 712 713
    }
}

714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738
/*
 * Initial chardev read handler: the first message of the server must be
 * the protocol version, without any file descriptor attached.
 *
 * On a match the handlers are switched to ivshmem_read(); otherwise the
 * server chardev is deleted.
 */
static void ivshmem_check_version(void *opaque, const uint8_t * buf, int size)
{
    IVShmemState *s = opaque;
    int tmp;
    long version;

    if (!fifo_update_and_get(s, buf, size,
                             &version, sizeof(version))) {
        return;
    }

    tmp = qemu_chr_fe_get_msgfd(s->server_chr);
    if (tmp != -1 || version != IVSHMEM_PROTOCOL_VERSION) {
        /* a descriptor handed to us is ours to close, don't leak it */
        if (tmp != -1) {
            close(tmp);
        }
        fprintf(stderr, "incompatible version, you are connecting to a ivshmem-"
                "server using a different protocol please check your setup\n");
        qemu_chr_delete(s->server_chr);
        s->server_chr = NULL;
        return;
    }

    IVSHMEM_DPRINTF("version check ok, switch to real chardev handler\n");
    qemu_chr_add_handlers(s->server_chr, ivshmem_can_receive, ivshmem_read,
                          ivshmem_event, s);
}

739 740 741 742 743
/* Select the MSI-X vectors used by device.
 * ivshmem maps events to vectors statically, so
 * we just enable all vectors on init and after reset.
 * Does nothing if the device has no MSI-X capability. */
static void ivshmem_use_msix(IVShmemState * s)
{
    PCIDevice *d = PCI_DEVICE(s);
    int i;

    IVSHMEM_DPRINTF("%s, msix present: %d\n", __func__, msix_present(d));
    if (!msix_present(d)) {
        return;
    }

    for (i = 0; i < s->vectors; i++) {
        msix_vector_use(d, i);
    }
}

757 758
/*
 * Device reset: clear the interrupt status and mask registers and mark
 * all MSI-X vectors as used again.
 */
static void ivshmem_reset(DeviceState *d)
{
    IVShmemState *s = IVSHMEM(d);

    s->intrstatus = 0;
    s->intrmask = 0;
    ivshmem_use_msix(s);
}

M
Marc-André Lureau 已提交
766
/*
 * Set up MSI-X with s->vectors vectors in an exclusive BAR (BAR 1) and
 * allocate the per-vector state.
 *
 * Returns 0 on success, -1 if MSI-X initialization failed.
 */
static int ivshmem_setup_msi(IVShmemState * s)
{
    if (msix_init_exclusive_bar(PCI_DEVICE(s), s->vectors, 1)) {
        return -1;
    }

    IVSHMEM_DPRINTF("msix initialized (%d vectors)\n", s->vectors);

    /* allocate the zero-initialized per-vector MSI state */
    s->msi_vectors = g_malloc0(s->vectors * sizeof(MSIVector));

    ivshmem_use_msix(s);
    return 0;
}

781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824
/*
 * Called when the guest enables MSI-X: give every eventfd received for
 * ourselves a KVM route and install the mask/unmask/poll notifiers.
 */
static void ivshmem_enable_irqfd(IVShmemState *s)
{
    PCIDevice *pdev = PCI_DEVICE(s);
    int count = s->peers[s->vm_id].nb_eventfds;
    int vec;

    for (vec = 0; vec < count; vec++) {
        ivshmem_add_kvm_msi_virq(s, vec);
    }

    if (msix_set_vector_notifiers(pdev,
                                  ivshmem_vector_unmask,
                                  ivshmem_vector_mask,
                                  ivshmem_vector_poll) != 0) {
        error_report("ivshmem: msix_set_vector_notifiers failed");
    }
}

/*
 * Release the KVM route of @vector, if it has one, and mark the vector as
 * not set up.
 */
static void ivshmem_remove_kvm_msi_virq(IVShmemState *s, int vector)
{
    MSIVector *entry = &s->msi_vectors[vector];

    IVSHMEM_DPRINTF("ivshmem_remove_kvm_msi_virq vector:%d\n", vector);

    if (entry->pdev != NULL) {
        /* it was cleaned when masked in the frontend. */
        kvm_irqchip_release_virq(kvm_state, entry->virq);

        entry->pdev = NULL;
    }
}

/*
 * Called when the guest disables MSI-X: drop the KVM routes of all our
 * own vectors and remove the vector notifiers.
 */
static void ivshmem_disable_irqfd(IVShmemState *s)
{
    PCIDevice *pdev = PCI_DEVICE(s);
    int count = s->peers[s->vm_id].nb_eventfds;
    int vec;

    for (vec = 0; vec < count; vec++) {
        ivshmem_remove_kvm_msi_virq(s, vec);
    }

    msix_unset_vector_notifiers(pdev);
}

/*
 * PCI config-space write hook.
 *
 * After the default handling, an MSI-X enable or disable transition
 * switches the irqfd setup on or off, provided KVM delivers MSIs via
 * irqfd and we already know our own peer ID.
 */
static void ivshmem_write_config(PCIDevice *pdev, uint32_t address,
                                 uint32_t val, int len)
{
    IVShmemState *s = IVSHMEM(pdev);
    int is_enabled, was_enabled = msix_enabled(pdev);

    pci_default_write_config(pdev, address, val, len);
    is_enabled = msix_enabled(pdev);

    if (kvm_msi_via_irqfd_enabled() && s->vm_id != -1) {
        if (!was_enabled && is_enabled) {
            ivshmem_enable_irqfd(s);
        } else if (was_enabled && !is_enabled) {
            ivshmem_disable_irqfd(s);
        }
    }
}

M
Marc-André Lureau 已提交
842
/*
 * Realize the device.
 *
 * Exactly one source of shared memory must be configured: a host memory
 * backend, a chardev connected to an ivshmem server, or the name of a
 * POSIX shared memory object.  The register region becomes BAR 0; BAR 2
 * is a container that gets the shared memory as subregion, either right
 * away or, in server mode, when the server hands over the descriptor.
 *
 * Errors are reported through @errp.
 */
static void pci_ivshmem_realize(PCIDevice *dev, Error **errp)
{
    IVShmemState *s = IVSHMEM(dev);
    uint8_t *pci_conf;
    uint8_t attr = PCI_BASE_ADDRESS_SPACE_MEMORY |
        PCI_BASE_ADDRESS_MEM_PREFETCH;

    if (!!s->server_chr + !!s->shmobj + !!s->hostmem != 1) {
        error_setg(errp, "You must specify either a shmobj, a chardev"
                   " or a hostmem");
        return;
    }

    if (s->hostmem) {
        MemoryRegion *mr;

        if (s->sizearg) {
            g_warning("size argument ignored with hostmem");
        }

        mr = host_memory_backend_get_memory(s->hostmem, errp);
        s->ivshmem_size = memory_region_size(mr);
    } else if (s->sizearg == NULL) {
        s->ivshmem_size = 4 << 20; /* 4 MB default */
    } else {
        char *end;
        int64_t size = qemu_strtosz(s->sizearg, &end);
        if (size < 0 || *end != '\0' || !is_power_of_2(size)) {
            error_setg(errp, "Invalid size %s", s->sizearg);
            return;
        }
        s->ivshmem_size = size;
    }

    /* one message from the server is sizeof(long) bytes */
    fifo8_create(&s->incoming_fifo, sizeof(long));

    /* IRQFD requires MSI */
    if (ivshmem_has_feature(s, IVSHMEM_IOEVENTFD) &&
        !ivshmem_has_feature(s, IVSHMEM_MSI)) {
        error_setg(errp, "ioeventfd/irqfd requires MSI");
        return;
    }

    /* check that role is reasonable */
    if (s->role) {
        if (strncmp(s->role, "peer", 5) == 0) {
            s->role_val = IVSHMEM_PEER;
        } else if (strncmp(s->role, "master", 7) == 0) {
            s->role_val = IVSHMEM_MASTER;
        } else {
            error_setg(errp, "'role' must be 'peer' or 'master'");
            return;
        }
    } else {
        s->role_val = IVSHMEM_MASTER; /* default */
    }

    if (s->role_val == IVSHMEM_PEER) {
        error_setg(&s->migration_blocker,
                   "Migration is disabled when using feature 'peer mode' in device 'ivshmem'");
        migrate_add_blocker(s->migration_blocker);
    }

    pci_conf = dev->config;
    pci_conf[PCI_COMMAND] = PCI_COMMAND_IO | PCI_COMMAND_MEMORY;

    pci_config_set_interrupt_pin(pci_conf, 1);

    memory_region_init_io(&s->ivshmem_mmio, OBJECT(s), &ivshmem_mmio_ops, s,
                          "ivshmem-mmio", IVSHMEM_REG_BAR_SIZE);

    /* region for registers*/
    pci_register_bar(dev, 0, PCI_BASE_ADDRESS_SPACE_MEMORY,
                     &s->ivshmem_mmio);

    memory_region_init(&s->bar, OBJECT(s), "ivshmem-bar2-container", s->ivshmem_size);
    if (s->ivshmem_64bit) {
        attr |= PCI_BASE_ADDRESS_MEM_TYPE_64;
    }

    if (s->hostmem != NULL) {
        MemoryRegion *mr;

        IVSHMEM_DPRINTF("using hostmem\n");

        mr = host_memory_backend_get_memory(MEMORY_BACKEND(s->hostmem), errp);
        vmstate_register_ram(mr, DEVICE(s));
        memory_region_add_subregion(&s->bar, 0, mr);
        pci_register_bar(PCI_DEVICE(s), 2, attr, &s->bar);
    } else if (s->server_chr != NULL) {
        if (strncmp(s->server_chr->filename, "unix:", 5)) {
            error_setg(errp, "chardev is not a unix client socket");
            return;
        }

        /* if we get a UNIX socket as the parameter we will talk
         * to the ivshmem server to receive the memory region */

        IVSHMEM_DPRINTF("using shared memory server (socket = %s)\n",
                        s->server_chr->filename);

        if (ivshmem_has_feature(s, IVSHMEM_MSI) &&
            ivshmem_setup_msi(s)) {
            error_setg(errp, "msix initialization failed");
            return;
        }

        /* we allocate enough space for 16 peers and grow as needed */
        resize_peers(s, 16);
        s->vm_id = -1;

        pci_register_bar(dev, 2, attr, &s->bar);

        s->eventfd_chr = g_malloc0(s->vectors * sizeof(CharDriverState *));

        /* the first message must be the protocol version */
        qemu_chr_add_handlers(s->server_chr, ivshmem_can_receive,
                              ivshmem_check_version, ivshmem_event, s);
    } else {
        /* just map the file immediately, we're not using a server */
        int fd;

        IVSHMEM_DPRINTF("using shm_open (shm object = %s)\n", s->shmobj);

        /* try creating the object with O_EXCL; if that succeeds it is new
         * and has to be sized to the length of the PCI device's memory.
         * Note that 0 is a valid descriptor. */
        fd = shm_open(s->shmobj, O_CREAT|O_RDWR|O_EXCL,
                      S_IRWXU|S_IRWXG|S_IRWXO);
        if (fd >= 0) {
            if (ftruncate(fd, s->ivshmem_size) != 0) {
                error_report("could not truncate shared file");
            }
        } else {
            /* the object may exist already: open it as it is */
            fd = shm_open(s->shmobj, O_CREAT|O_RDWR,
                          S_IRWXU|S_IRWXG|S_IRWXO);
            if (fd < 0) {
                error_setg(errp, "could not open shared file");
                return;
            }
        }

        if (check_shm_size(s, fd, errp) == -1) {
            close(fd);  /* don't leak the descriptor on failure */
            return;
        }

        create_shared_memory_BAR(s, fd, attr, errp);
        close(fd);
    }
}

M
Marc-André Lureau 已提交
989
/*
 * PCI exit callback: release everything the device holds.
 *
 * In order, this destroys the incoming FIFO, removes and frees the
 * migration blocker (if one was installed), tears down the shared memory
 * region, frees the per-vector eventfd character devices, closes every
 * peer's eventfds, uninitializes MSI-X when the MSI feature is enabled,
 * and finally frees the MSI vector array.
 *
 * @dev: the PCI device being removed; it is converted to IVShmemState
 *       with IVSHMEM().
 */
static void pci_ivshmem_exit(PCIDevice *dev)
{
    IVShmemState *s = IVSHMEM(dev);
    int i;

    fifo8_destroy(&s->incoming_fifo);

    if (s->migration_blocker) {
        migrate_del_blocker(s->migration_blocker);
        error_free(s->migration_blocker);
    }

    if (memory_region_is_mapped(&s->ivshmem)) {
        /*
         * Only unmap when no hostmem backend is attached.
         * NOTE(review): presumably the backend owns its own memory in the
         * hostmem case -- confirm against the realize path.
         */
        if (!s->hostmem) {
            void *addr = memory_region_get_ram_ptr(&s->ivshmem);

            /* A failed munmap is reported but does not stop the teardown. */
            if (munmap(addr, s->ivshmem_size) == -1) {
                error_report("Failed to munmap shared memory %s",
                             strerror(errno));
            }
        }

        vmstate_unregister_ram(&s->ivshmem, DEVICE(dev));
        memory_region_del_subregion(&s->bar, &s->ivshmem);
    }

    if (s->eventfd_chr) {
        /* The array has s->vectors slots; unused slots are NULL. */
        for (i = 0; i < s->vectors; i++) {
            if (s->eventfd_chr[i]) {
                qemu_chr_free(s->eventfd_chr[i]);
            }
        }
        g_free(s->eventfd_chr);
    }

    if (s->peers) {
        for (i = 0; i < s->nb_peers; i++) {
            close_peer_eventfds(s, i);
        }
        g_free(s->peers);
    }

    if (ivshmem_has_feature(s, IVSHMEM_MSI)) {
        msix_uninit_exclusive_bar(dev);
    }

    g_free(s->msi_vectors);
}

1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124
/*
 * Field-test predicate used by ivshmem_vmsd: reports whether the device
 * has the IVSHMEM_MSI feature.  version_id is not consulted.
 */
static bool test_msix(void *opaque, int version_id)
{
    IVShmemState *state = opaque;
    bool msi_enabled = ivshmem_has_feature(state, IVSHMEM_MSI);

    return msi_enabled;
}

/*
 * Field-test predicate used by ivshmem_vmsd: the logical negation of
 * test_msix() for the same arguments.
 */
static bool test_no_msix(void *opaque, int version_id)
{
    bool msi_enabled = test_msix(opaque, version_id);

    return !msi_enabled;
}

/*
 * pre_load hook of ivshmem_vmsd.
 *
 * Rejects incoming state when the device role is IVSHMEM_PEER.
 *
 * Returns 0 when loading may proceed, -EINVAL (after reporting an error)
 * for a peer device.
 */
static int ivshmem_pre_load(void *opaque)
{
    IVShmemState *state = opaque;

    if (state->role_val != IVSHMEM_PEER) {
        return 0;
    }

    error_report("'peer' devices are not migratable");
    return -EINVAL;
}

/*
 * post_load hook of ivshmem_vmsd.
 *
 * Calls ivshmem_use_msix() when the device has the IVSHMEM_MSI feature;
 * otherwise does nothing.  version_id is not consulted.  Always returns 0.
 */
static int ivshmem_post_load(void *opaque, int version_id)
{
    IVShmemState *state = opaque;

    if (!ivshmem_has_feature(state, IVSHMEM_MSI)) {
        return 0;
    }

    ivshmem_use_msix(state);
    return 0;
}

/*
 * load_state_old hook of ivshmem_vmsd: reads the version 0 stream layout.
 *
 * The stream holds the PCI device state, followed by either the MSI-X
 * state (when the device has the IVSHMEM_MSI feature) or two big-endian
 * 32-bit words that become intrstatus and intrmask.
 *
 * Returns 0 on success, -EINVAL for any version other than 0 or for a
 * device whose role is IVSHMEM_PEER, or the non-zero result of
 * pci_device_load() if that call fails.
 */
static int ivshmem_load_old(QEMUFile *f, void *opaque, int version_id)
{
    IVShmemState *state = opaque;
    PCIDevice *pci_dev = PCI_DEVICE(state);
    int rc;

    IVSHMEM_DPRINTF("ivshmem_load_old\n");

    /* Only the version 0 layout is handled here. */
    if (version_id != 0) {
        return -EINVAL;
    }

    if (state->role_val == IVSHMEM_PEER) {
        error_report("'peer' devices are not migratable");
        return -EINVAL;
    }

    rc = pci_device_load(pci_dev, f);
    if (rc != 0) {
        return rc;
    }

    if (!ivshmem_has_feature(state, IVSHMEM_MSI)) {
        /* Without MSI the interrupt registers travel in the stream. */
        state->intrstatus = qemu_get_be32(f);
        state->intrmask = qemu_get_be32(f);
        return 0;
    }

    msix_load(pci_dev, f);
    ivshmem_use_msix(state);
    return 0;
}

/*
 * Migration state description for the ivshmem device.
 *
 * The current layout is version 1.  Version 0 streams are routed to
 * ivshmem_load_old() through load_state_old / minimum_version_id_old.
 * ivshmem_pre_load() refuses the load for devices whose role is
 * IVSHMEM_PEER, and ivshmem_post_load() calls ivshmem_use_msix() when the
 * MSI feature is enabled.
 */
static const VMStateDescription ivshmem_vmsd = {
    .name = "ivshmem",
    .version_id = 1,
    .minimum_version_id = 1,
    .pre_load = ivshmem_pre_load,
    .post_load = ivshmem_post_load,
    .fields = (VMStateField[]) {
        VMSTATE_PCI_DEVICE(parent_obj, IVShmemState),

        /*
         * The MSI-X entry is guarded by test_msix; the two interrupt
         * registers are guarded by test_no_msix, its negation.
         * NOTE(review): presumably a guarded field is only part of the
         * stream when its test returns true -- confirm against the
         * VMSTATE_*_TEST macro definitions.
         */
        VMSTATE_MSIX_TEST(parent_obj, IVShmemState, test_msix),
        VMSTATE_UINT32_TEST(intrstatus, IVShmemState, test_no_msix),
        VMSTATE_UINT32_TEST(intrmask, IVShmemState, test_no_msix),

        VMSTATE_END_OF_LIST()
    },
    .load_state_old = ivshmem_load_old,
    .minimum_version_id_old = 0
};

1125 1126 1127 1128 1129 1130 1131 1132
static Property ivshmem_properties[] = {
    DEFINE_PROP_CHR("chardev", IVShmemState, server_chr),
    DEFINE_PROP_STRING("size", IVShmemState, sizearg),
    DEFINE_PROP_UINT32("vectors", IVShmemState, vectors, 1),
    DEFINE_PROP_BIT("ioeventfd", IVShmemState, features, IVSHMEM_IOEVENTFD, false),
    DEFINE_PROP_BIT("msi", IVShmemState, features, IVSHMEM_MSI, true),
    DEFINE_PROP_STRING("shm", IVShmemState, shmobj),
    DEFINE_PROP_STRING("role", IVShmemState, role),
G
Gerd Hoffmann 已提交
1133
    DEFINE_PROP_UINT32("use64", IVShmemState, ivshmem_64bit, 1),
1134 1135 1136 1137 1138
    DEFINE_PROP_END_OF_LIST(),
};

/*
 * Class initializer for the ivshmem type.
 *
 * Fills in the PCI-level callbacks and identification (realize, exit,
 * config_write, vendor/device/class IDs) and the generic device hooks
 * (reset, properties, migration description, category, description).
 *
 * @klass: the class being initialized.
 * @data:  unused.
 */
static void ivshmem_class_init(ObjectClass *klass, void *data)
{
    DeviceClass *dc = DEVICE_CLASS(klass);
    PCIDeviceClass *k = PCI_DEVICE_CLASS(klass);

    /* PCI device callbacks and identification */
    k->realize = pci_ivshmem_realize;
    k->exit = pci_ivshmem_exit;
    k->config_write = ivshmem_write_config;
    k->vendor_id = PCI_VENDOR_ID_IVSHMEM;
    k->device_id = PCI_DEVICE_ID_IVSHMEM;
    k->class_id = PCI_CLASS_MEMORY_RAM;

    /* Generic device hooks */
    dc->reset = ivshmem_reset;
    dc->props = ivshmem_properties;
    dc->vmsd = &ivshmem_vmsd;
    set_bit(DEVICE_CATEGORY_MISC, dc->categories);
    dc->desc = "Inter-VM shared memory";
}

M
Marc-André Lureau 已提交
1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180
/*
 * Check callback for the memdev link property (installed by
 * ivshmem_init()).
 *
 * If the memory region of the candidate backend is already mapped, an
 * error naming the backend is stored in errp.  Otherwise the decision is
 * delegated to qdev_prop_allow_set_link_before_realize().
 *
 * @obj:  object owning the link property.
 * @name: property name.
 * @val:  candidate backend object.
 * @errp: where an error is stored.
 */
static void ivshmem_check_memdev_is_busy(Object *obj, const char *name,
                                         Object *val, Error **errp)
{
    MemoryRegion *backend_mr;
    char *memdev_id;

    backend_mr = host_memory_backend_get_memory(MEMORY_BACKEND(val), errp);
    if (!memory_region_is_mapped(backend_mr)) {
        qdev_prop_allow_set_link_before_realize(obj, name, val, errp);
        return;
    }

    memdev_id = object_get_canonical_path_component(val);
    error_setg(errp, "can't use already busy memdev: %s", memdev_id);
    g_free(memdev_id);
}

/*
 * Instance initializer: adds the IVSHMEM_MEMDEV_PROP link property of
 * type TYPE_MEMORY_BACKEND, stored in the hostmem field and checked by
 * ivshmem_check_memdev_is_busy().  Failure to add the property aborts
 * (error_abort).
 */
static void ivshmem_init(Object *obj)
{
    IVShmemState *state = IVSHMEM(obj);
    Object **memdev_link = (Object **)&state->hostmem;

    object_property_add_link(obj, IVSHMEM_MEMDEV_PROP, TYPE_MEMORY_BACKEND,
                             memdev_link,
                             ivshmem_check_memdev_is_busy,
                             OBJ_PROP_LINK_UNREF_ON_RELEASE,
                             &error_abort);
}

1181
static const TypeInfo ivshmem_info = {
1182
    .name          = TYPE_IVSHMEM,
1183 1184
    .parent        = TYPE_PCI_DEVICE,
    .instance_size = sizeof(IVShmemState),
M
Marc-André Lureau 已提交
1185
    .instance_init = ivshmem_init,
1186
    .class_init    = ivshmem_class_init,
1187 1188
};

A
Andreas Färber 已提交
1189
static void ivshmem_register_types(void)
1190
{
1191
    type_register_static(&ivshmem_info);
1192 1193
}

A
Andreas Färber 已提交
1194
type_init(ivshmem_register_types)