ivshmem.c 32.7 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14
/*
 * Inter-VM Shared Memory PCI device.
 *
 * Author:
 *      Cam Macdonell <cam@cs.ualberta.ca>
 *
 * Based On: cirrus_vga.c
 *          Copyright (c) 2004 Fabrice Bellard
 *          Copyright (c) 2004 Makoto Suzuki (suzu)
 *
 *      and rtl8139.c
 *          Copyright (c) 2006 Igor Kovalenko
 *
 * This code is licensed under the GNU GPL v2.
 *
 * Contributions after 2012-01-13 are licensed under the terms of the
 * GNU GPL, version 2 or (at your option) any later version.
 */
P
Peter Maydell 已提交
19
#include "qemu/osdep.h"
20
#include "hw/hw.h"
P
Paolo Bonzini 已提交
21
#include "hw/i386/pc.h"
22
#include "hw/pci/pci.h"
23
#include "hw/pci/msi.h"
24
#include "hw/pci/msix.h"
25
#include "sysemu/kvm.h"
26
#include "migration/migration.h"
27
#include "qemu/error-report.h"
28
#include "qemu/event_notifier.h"
29
#include "qemu/fifo8.h"
30
#include "sysemu/char.h"
M
Marc-André Lureau 已提交
31 32
#include "sysemu/hostmem.h"
#include "qapi/visitor.h"
33
#include "exec/ram_addr.h"
34

35 36
#include "hw/misc/ivshmem.h"

37 38
#include <sys/mman.h>

39 40 41
#define PCI_VENDOR_ID_IVSHMEM   PCI_VENDOR_ID_REDHAT_QUMRANET
#define PCI_DEVICE_ID_IVSHMEM   0x1110

42
#define IVSHMEM_MAX_PEERS G_MAXUINT16
43 44 45 46 47 48 49 50
#define IVSHMEM_IOEVENTFD   0
#define IVSHMEM_MSI     1

#define IVSHMEM_PEER    0
#define IVSHMEM_MASTER  1

#define IVSHMEM_REG_BAR_SIZE 0x100

51 52 53 54 55 56 57
/* Debug tracing: set IVSHMEM_DEBUG to 1 to print "IVSHMEM: "-prefixed
 * messages with printf().  The call is guarded by a plain `if` rather than
 * an #ifdef, so the format string and its arguments are still compiled
 * when tracing is switched off. */
#define IVSHMEM_DEBUG 0
#define IVSHMEM_DPRINTF(fmt, ...)                       \
    do {                                                \
        if (IVSHMEM_DEBUG) {                            \
            printf("IVSHMEM: " fmt, ## __VA_ARGS__);    \
        }                                               \
    } while (0)
58

59 60 61 62
/* Type name of the device, and the OBJECT_CHECK()-based cast from an
 * object pointer to IVShmemState */
#define TYPE_IVSHMEM "ivshmem"
#define IVSHMEM(obj) \
    OBJECT_CHECK(IVShmemState, (obj), TYPE_IVSHMEM)

63 64
typedef struct Peer {
    int nb_eventfds;
65
    EventNotifier *eventfds;
66 67
} Peer;

68
typedef struct MSIVector {
69
    PCIDevice *pdev;
70
    int virq;
71
} MSIVector;
72 73

typedef struct IVShmemState {
74 75 76 77
    /*< private >*/
    PCIDevice parent_obj;
    /*< public >*/

M
Marc-André Lureau 已提交
78
    HostMemoryBackend *hostmem;
79 80 81 82
    uint32_t intrmask;
    uint32_t intrstatus;

    CharDriverState *server_chr;
83
    Fifo8 incoming_fifo;
A
Avi Kivity 已提交
84
    MemoryRegion ivshmem_mmio;
85

A
Avi Kivity 已提交
86 87 88 89 90 91
    /* We might need to register the BAR before we actually have the memory.
     * So prepare a container MemoryRegion for the BAR immediately and
     * add a subregion when we have the memory.
     */
    MemoryRegion bar;
    MemoryRegion ivshmem;
92
    uint64_t ivshmem_size; /* size of shared memory region */
G
Gerd Hoffmann 已提交
93
    uint32_t ivshmem_64bit;
94 95

    Peer *peers;
96
    int nb_peers; /* how many peers we have space for */
97 98 99 100

    int vm_id;
    uint32_t vectors;
    uint32_t features;
101
    MSIVector *msi_vectors;
102

103 104
    Error *migration_blocker;

105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123
    char * shmobj;
    char * sizearg;
    char * role;
    int role_val;   /* scalar to avoid multiple string comparisons */
} IVShmemState;

/* Byte offsets of the registers of the Inter-VM shared memory device
 * within the register region */
enum ivshmem_registers {
    INTRMASK   = 0x00,
    INTRSTATUS = 0x04,
    IVPOSITION = 0x08,
    DOORBELL   = 0x0c,
};

/* Test one feature bit.
 *
 * @feature is a bit position (IVSHMEM_IOEVENTFD, IVSHMEM_MSI).
 * Returns non-zero when that bit is set in ivs->features, 0 otherwise. */
static inline uint32_t ivshmem_has_feature(IVShmemState *ivs,
                                           unsigned int feature)
{
    const uint32_t bit = 1 << feature;

    return ivs->features & bit;
}

124
/* Recompute the INTx level from the unmasked interrupt status bits.
 * Does nothing when the MSI feature is on. */
static void ivshmem_update_irq(IVShmemState *s)
{
    uint32_t pending;

    /* No INTx with msi=on, whether the guest enabled MSI-X or not */
    if (ivshmem_has_feature(s, IVSHMEM_MSI)) {
        return;
    }

    pending = s->intrstatus & s->intrmask;

    /* don't print ISR resets */
    if (pending != 0) {
        IVSHMEM_DPRINTF("Set IRQ to %d (%04x %04x)\n",
                        pending ? 1 : 0, s->intrstatus, s->intrmask);
    }

    pci_set_irq(PCI_DEVICE(s), pending != 0);
}

/* Store a new interrupt mask and re-evaluate the interrupt line. */
static void ivshmem_IntrMask_write(IVShmemState *s, uint32_t val)
{
    IVSHMEM_DPRINTF("IntrMask write(w) val = 0x%04x\n", val);

    s->intrmask = val;

    /* the set of unmasked pending bits may have changed */
    ivshmem_update_irq(s);
}

/* Return the current interrupt mask; reading has no side effect. */
static uint32_t ivshmem_IntrMask_read(IVShmemState *s)
{
    IVSHMEM_DPRINTF("intrmask read(w) val = 0x%04x\n", s->intrmask);

    return s->intrmask;
}

/* Overwrite the interrupt status and re-evaluate the interrupt line. */
static void ivshmem_IntrStatus_write(IVShmemState *s, uint32_t val)
{
    IVSHMEM_DPRINTF("IntrStatus write(w) val = 0x%04x\n", val);

    s->intrstatus = val;

    /* the set of unmasked pending bits may have changed */
    ivshmem_update_irq(s);
}

/* Read-to-clear access to the interrupt status.
 * Returns the value held before the read; the register is zeroed and the
 * interrupt line re-evaluated. */
static uint32_t ivshmem_IntrStatus_read(IVShmemState *s)
{
    const uint32_t before = s->intrstatus;

    /* reading ISR clears all interrupts */
    s->intrstatus = 0;
    ivshmem_update_irq(s);

    return before;
}

A
Avi Kivity 已提交
177
/* Write handler of the register region.
 *
 * INTRMASK and INTRSTATUS store @val.  DOORBELL takes the destination
 * peer from bits 16..31 of @val and the vector from bits 0..7, and signals
 * the matching event notifier.  Out-of-range destinations or vectors are
 * ignored.  Writes to any other offset are ignored. */
static void ivshmem_io_write(void *opaque, hwaddr addr,
                             uint64_t val, unsigned size)
{
    IVShmemState *s = opaque;
    uint16_t peer_id = val >> 16;
    uint16_t vec = val & 0xff;

    addr &= 0xfc;

    IVSHMEM_DPRINTF("writing to addr " TARGET_FMT_plx "\n", addr);

    if (addr == INTRMASK) {
        ivshmem_IntrMask_write(s, val);
        return;
    }

    if (addr == INTRSTATUS) {
        ivshmem_IntrStatus_write(s, val);
        return;
    }

    if (addr != DOORBELL) {
        IVSHMEM_DPRINTF("Unhandled write " TARGET_FMT_plx "\n", addr);
        return;
    }

    /* check that dest VM ID is reasonable */
    if (peer_id >= s->nb_peers) {
        IVSHMEM_DPRINTF("Invalid destination VM ID (%d)\n", peer_id);
        return;
    }

    /* check doorbell range */
    if (vec >= s->peers[peer_id].nb_eventfds) {
        IVSHMEM_DPRINTF("Invalid destination vector %d on VM %d\n",
                        vec, peer_id);
        return;
    }

    IVSHMEM_DPRINTF("Notifying VM %d on vector %d\n", peer_id, vec);
    event_notifier_set(&s->peers[peer_id].eventfds[vec]);
}

A
Avi Kivity 已提交
219
/* Read handler of the register region.
 *
 * INTRMASK returns the mask, INTRSTATUS returns and clears the status,
 * IVPOSITION returns our VM ID once the shared memory is mapped and
 * 0xffffffff before that.  Any other offset reads as 0. */
static uint64_t ivshmem_io_read(void *opaque, hwaddr addr,
                                unsigned size)
{
    IVShmemState *s = opaque;
    uint32_t value = 0;

    if (addr == INTRMASK) {
        value = ivshmem_IntrMask_read(s);
    } else if (addr == INTRSTATUS) {
        value = ivshmem_IntrStatus_read(s);
    } else if (addr == IVPOSITION) {
        /* return my VM ID if the memory is mapped */
        value = memory_region_is_mapped(&s->ivshmem) ? s->vm_id : -1;
    } else {
        IVSHMEM_DPRINTF("why are we reading " TARGET_FMT_plx "\n", addr);
    }

    return value;
}

A
Avi Kivity 已提交
253 254 255 256 257 258 259 260
/* Access callbacks of the register region; the handlers are implemented
 * for 4-byte accesses only. */
static const MemoryRegionOps ivshmem_mmio_ops = {
    .endianness = DEVICE_NATIVE_ENDIAN,
    .impl.min_access_size = 4,
    .impl.max_access_size = 4,
    .read = ivshmem_io_read,
    .write = ivshmem_io_write,
};

/* Chardev "can receive" callback: always accept up to one 64-bit message,
 * the unit in which the read handlers consume server data. */
static int ivshmem_can_receive(void *opaque)
{
    const int message_bytes = sizeof(int64_t);

    return message_bytes;
}

268 269
static void ivshmem_vector_notify(void *opaque)
{
270
    MSIVector *entry = opaque;
271
    PCIDevice *pdev = entry->pdev;
272
    IVShmemState *s = IVSHMEM(pdev);
273
    int vector = entry - s->msi_vectors;
274 275 276 277 278
    EventNotifier *n = &s->peers[s->vm_id].eventfds[vector];

    if (!event_notifier_test_and_clear(n)) {
        return;
    }
279

280
    IVSHMEM_DPRINTF("interrupt on vector %p %d\n", pdev, vector);
281
    if (ivshmem_has_feature(s, IVSHMEM_MSI)) {
282 283 284
        if (msix_enabled(pdev)) {
            msix_notify(pdev, vector);
        }
285 286 287
    } else {
        ivshmem_IntrStatus_write(s, 1);
    }
288 289
}

290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346
/* MSI-X "vector unmasked" notifier: refresh the KVM route of @vector with
 * @msg, then attach our event notifier for that vector as an irqfd.
 * Returns a negative value if either step fails. */
static int ivshmem_vector_unmask(PCIDevice *dev, unsigned vector,
                                 MSIMessage msg)
{
    IVShmemState *s = IVSHMEM(dev);
    MSIVector *route = &s->msi_vectors[vector];
    EventNotifier *notifier = &s->peers[s->vm_id].eventfds[vector];
    int rc;

    IVSHMEM_DPRINTF("vector unmask %p %d\n", dev, vector);

    rc = kvm_irqchip_update_msi_route(kvm_state, route->virq, msg, dev);
    if (rc >= 0) {
        rc = kvm_irqchip_add_irqfd_notifier_gsi(kvm_state, notifier, NULL,
                                                route->virq);
    }

    return rc;
}

/* MSI-X "vector masked" notifier: detach the irqfd of @vector.
 * A failure is reported but otherwise ignored. */
static void ivshmem_vector_mask(PCIDevice *dev, unsigned vector)
{
    IVShmemState *s = IVSHMEM(dev);
    EventNotifier *notifier = &s->peers[s->vm_id].eventfds[vector];
    int virq = s->msi_vectors[vector].virq;

    IVSHMEM_DPRINTF("vector mask %p %d\n", dev, vector);

    if (kvm_irqchip_remove_irqfd_notifier_gsi(kvm_state, notifier, virq) != 0) {
        error_report("remove_irqfd_notifier_gsi failed");
    }
}

/* MSI-X poll notifier: for every masked vector in
 * [vector_start, vector_end) whose event notifier fired, mark the vector
 * pending.  The range is clipped to the number of device vectors. */
static void ivshmem_vector_poll(PCIDevice *dev,
                                unsigned int vector_start,
                                unsigned int vector_end)
{
    IVShmemState *s = IVSHMEM(dev);
    unsigned int v = vector_start;

    IVSHMEM_DPRINTF("vector poll %p %d-%d\n", dev, vector_start, vector_end);

    vector_end = MIN(vector_end, s->vectors);

    while (v < vector_end) {
        EventNotifier *notifier = &s->peers[s->vm_id].eventfds[v];

        /* only masked vectors are polled; the notifier is left untouched
         * for unmasked ones */
        if (msix_is_masked(dev, v) &&
            event_notifier_test_and_clear(notifier)) {
            msix_set_pending(dev, v);
        }
        v++;
    }
}

347 348
/* Claim the msi_vectors entry of @vector for this device and install
 * ivshmem_vector_notify() as read handler on the fd of notifier @n.
 * The entry must not be in use yet. */
static void watch_vector_notifier(IVShmemState *s, EventNotifier *n,
                                 int vector)
{
    MSIVector *slot = &s->msi_vectors[vector];
    int fd = event_notifier_get_fd(n);

    assert(!slot->pdev);
    slot->pdev = PCI_DEVICE(s);

    qemu_set_fd_handler(fd, ivshmem_vector_notify, NULL, slot);
}

M
Marc-André Lureau 已提交
359 360
/* Check that the object behind @fd is at least as large as the shared
 * memory region the guest is going to map.
 *
 * Returns 0 when the object is large enough.  Returns -1 and sets @errp
 * when fstat() fails or the object is too small. */
static int check_shm_size(IVShmemState *s, int fd, Error **errp)
{
    struct stat st;

    if (fstat(fd, &st) < 0) {
        error_setg(errp, "exiting: fstat on fd %d failed: %s",
                   fd, strerror(errno));
        return -1;
    }

    if (s->ivshmem_size <= st.st_size) {
        return 0;
    }

    error_setg(errp, "Requested memory size greater"
               " than shared object size (%" PRIu64 " > %" PRIu64")",
               s->ivshmem_size, (uint64_t)st.st_size);
    return -1;
}

/* create the shared memory BAR when we are not using the server, so we can
 * create the BAR and map the memory immediately */
M
Marc-André Lureau 已提交
384 385 386
static int create_shared_memory_BAR(IVShmemState *s, int fd, uint8_t attr,
                                    Error **errp)
{
387 388 389
    void * ptr;

    ptr = mmap(0, s->ivshmem_size, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
M
Marc-André Lureau 已提交
390 391 392 393 394
    if (ptr == MAP_FAILED) {
        error_setg_errno(errp, errno, "Failed to mmap shared memory");
        return -1;
    }

395
    memory_region_init_ram_ptr(&s->ivshmem, OBJECT(s), "ivshmem.bar2",
A
Avi Kivity 已提交
396
                               s->ivshmem_size, ptr);
F
Fam Zheng 已提交
397
    qemu_set_ram_fd(memory_region_get_ram_addr(&s->ivshmem), fd);
398
    vmstate_register_ram(&s->ivshmem, DEVICE(s));
A
Avi Kivity 已提交
399
    memory_region_add_subregion(&s->bar, 0, &s->ivshmem);
400 401

    /* region for shared memory */
402
    pci_register_bar(PCI_DEVICE(s), 2, attr, &s->bar);
M
Marc-André Lureau 已提交
403 404

    return 0;
405 406
}

407 408 409 410 411 412 413
/* Register notifier @i of peer @posn as ioeventfd: a 4-byte write of
 * (posn << 16 | i) to the DOORBELL register signals it. */
static void ivshmem_add_eventfd(IVShmemState *s, int posn, int i)
{
    EventNotifier *notifier = &s->peers[posn].eventfds[i];
    const int doorbell_value = (posn << 16) | i;

    memory_region_add_eventfd(&s->ivshmem_mmio, DOORBELL, 4, true,
                              doorbell_value, notifier);
}

/* Remove the ioeventfd registration made by ivshmem_add_eventfd() for
 * notifier @i of peer @posn. */
static void ivshmem_del_eventfd(IVShmemState *s, int posn, int i)
{
    EventNotifier *notifier = &s->peers[posn].eventfds[i];
    const int doorbell_value = (posn << 16) | i;

    memory_region_del_eventfd(&s->ivshmem_mmio, DOORBELL, 4, true,
                              doorbell_value, notifier);
}

427
/* Release everything held for peer @posn: drop its ioeventfd registrations
 * (only made when the ioeventfd feature is on), clean up its event
 * notifiers and free the notifier array.
 *
 * The function may be called more than once for the same peer: the array
 * pointer is cleared after it is freed, so a later call (for instance at
 * device teardown after the peer already went away) neither touches freed
 * memory nor frees it twice.
 *
 * An out-of-range @posn is reported and otherwise ignored. */
static void close_peer_eventfds(IVShmemState *s, int posn)
{
    Peer *peer;
    int i, n;

    if (posn < 0 || posn >= s->nb_peers) {
        error_report("invalid peer %d", posn);
        return;
    }

    peer = &s->peers[posn];
    n = peer->nb_eventfds;

    /* ioeventfds exist only with the feature on; the notifiers themselves
     * are received either way and must always be cleaned up */
    if (ivshmem_has_feature(s, IVSHMEM_IOEVENTFD)) {
        memory_region_transaction_begin();
        for (i = 0; i < n; i++) {
            ivshmem_del_eventfd(s, posn, i);
        }
        memory_region_transaction_commit();
    }

    for (i = 0; i < n; i++) {
        event_notifier_cleanup(&peer->eventfds[i]);
    }

    g_free(peer->eventfds);
    peer->eventfds = NULL;      /* no dangling pointer, no double free */
    peer->nb_eventfds = 0;
}

/* this function increase the dynamic storage need to store data about other
455
 * peers */
456
static int resize_peers(IVShmemState *s, int new_min_size)
457
{
458

459
    int j, old_size;
460

461 462
    /* limit number of max peers */
    if (new_min_size <= 0 || new_min_size > IVSHMEM_MAX_PEERS) {
463 464
        return -1;
    }
465
    if (new_min_size <= s->nb_peers) {
466 467
        return 0;
    }
468

469 470 471
    old_size = s->nb_peers;
    s->nb_peers = new_min_size;

472
    IVSHMEM_DPRINTF("bumping storage to %d peers\n", s->nb_peers);
473

474
    s->peers = g_realloc(s->peers, s->nb_peers * sizeof(Peer));
475

476
    for (j = old_size; j < s->nb_peers; j++) {
477
        s->peers[j].eventfds = g_new0(EventNotifier, s->vectors);
478 479
        s->peers[j].nb_eventfds = 0;
    }
480 481

    return 0;
482 483
}

484 485 486 487 488 489
/* Reassemble fixed-size messages from a byte stream that may arrive in
 * pieces.
 *
 * @buf/@size is the data just received, @data/@len the message to fill in
 * (@len must not exceed sizeof(int64_t), the capacity used for the fifo).
 *
 * Returns true when a complete message was copied to @data; bytes beyond
 * it stay in the fifo for the next call.  Returns false when the message
 * is still incomplete; all of @buf has been stored in the fifo then. */
static bool fifo_update_and_get(IVShmemState *s, const uint8_t *buf, int size,
                                void *data, size_t len)
{
    const uint8_t *p;
    uint32_t num;

    assert(len <= sizeof(int64_t)); /* limitation of the fifo */

    /* fast path: nothing buffered and exactly one message received */
    if (fifo8_is_empty(&s->incoming_fifo) && size == len) {
        memcpy(data, buf, size);
        return true;
    }

    IVSHMEM_DPRINTF("short read of %d bytes\n", size);

    /* buffer as much of the input as the fifo has room for */
    num = MIN(size, sizeof(int64_t) - fifo8_num_used(&s->incoming_fifo));
    fifo8_push_all(&s->incoming_fifo, buf, num);
    size -= num;
    buf += num;

    if (fifo8_num_used(&s->incoming_fifo) < len) {
        /* The fifo is not full, so every input byte fit into it.  (The
         * check used to be "num == 0", which aborted on any short read.) */
        assert(size == 0);
        return false;
    }

    p = fifo8_pop_buf(&s->incoming_fifo, len, &num);
    assert(num == len);

    memcpy(data, p, len);

    /* keep the start of the next message for later */
    if (size > 0) {
        fifo8_push_all(&s->incoming_fifo, buf, size);
    }

    return true;
}

520 521 522 523 524 525 526 527 528 529 530
/* Like fifo_update_and_get() for one 64-bit message; on success *@i64 is
 * converted from little endian to host byte order. */
static bool fifo_update_and_get_i64(IVShmemState *s,
                                    const uint8_t *buf, int size, int64_t *i64)
{
    bool complete = fifo_update_and_get(s, buf, size, i64, sizeof(*i64));

    if (complete) {
        *i64 = GINT64_FROM_LE(*i64);
    }

    return complete;
}

531 532 533 534 535 536 537
/* Create a KVM MSI route for the current MSI-X message of @vector and
 * record it in the (so far unused) msi_vectors entry.
 * Returns 0 on success, -1 if the route could not be added. */
static int ivshmem_add_kvm_msi_virq(IVShmemState *s, int vector)
{
    PCIDevice *pdev = PCI_DEVICE(s);
    MSIVector *slot = &s->msi_vectors[vector];
    MSIMessage msg = msix_get_message(pdev, vector);
    int virq;

    IVSHMEM_DPRINTF("ivshmem_add_kvm_msi_virq vector:%d\n", vector);
    assert(!slot->pdev);

    virq = kvm_irqchip_add_msi_route(kvm_state, msg, pdev);
    if (virq < 0) {
        error_report("ivshmem: kvm_irqchip_add_msi_route failed");
        return -1;
    }

    slot->virq = virq;
    slot->pdev = pdev;

    return 0;
}

/* Hook up interrupt delivery for one of our own vectors.
 *
 * Without irqfd support (or without the MSI feature) the notifier is
 * watched by QEMU itself.  With irqfd, a KVM route is set up right away if
 * the guest already enabled MSI-X; otherwise nothing is done here. */
static void setup_interrupt(IVShmemState *s, int vector)
{
    EventNotifier *n = &s->peers[s->vm_id].eventfds[vector];
    bool with_irqfd = kvm_msi_via_irqfd_enabled() &&
        ivshmem_has_feature(s, IVSHMEM_MSI);
    PCIDevice *pdev = PCI_DEVICE(s);

    IVSHMEM_DPRINTF("setting up interrupt for vector: %d\n", vector);

    if (!with_irqfd) {
        IVSHMEM_DPRINTF("with eventfd\n");
        watch_vector_notifier(s, n, vector);
        return;
    }

    if (!msix_enabled(pdev)) {
        /* it will be delayed until msix is enabled, in write_config */
        IVSHMEM_DPRINTF("with irqfd, delayed until msix enabled\n");
        return;
    }

    IVSHMEM_DPRINTF("with irqfd\n");
    if (ivshmem_add_kvm_msi_virq(s, vector) < 0) {
        return;
    }

    /* a masked vector gets its irqfd when it is unmasked */
    if (!msix_is_masked(pdev, vector)) {
        kvm_irqchip_add_irqfd_notifier_gsi(kvm_state, n, NULL,
                                           s->msi_vectors[vector].virq);
    }
}

580
static void ivshmem_read(void *opaque, const uint8_t *buf, int size)
581 582
{
    IVShmemState *s = opaque;
583
    int incoming_fd;
584
    int new_eventfd;
585
    int64_t incoming_posn;
M
Marc-André Lureau 已提交
586
    Error *err = NULL;
587
    Peer *peer;
588

589
    if (!fifo_update_and_get_i64(s, buf, size, &incoming_posn)) {
590
        return;
591 592
    }

593
    if (incoming_posn < -1) {
594
        IVSHMEM_DPRINTF("invalid incoming_posn %" PRId64 "\n", incoming_posn);
595 596 597
        return;
    }

598
    /* pick off s->server_chr->msgfd and store it, posn should accompany msg */
599
    incoming_fd = qemu_chr_fe_get_msgfd(s->server_chr);
600 601
    IVSHMEM_DPRINTF("posn is %" PRId64 ", fd is %d\n",
                    incoming_posn, incoming_fd);
602

603
    /* make sure we have enough space for this peer */
604
    if (incoming_posn >= s->nb_peers) {
605 606
        if (resize_peers(s, incoming_posn + 1) < 0) {
            error_report("failed to resize peers array");
607 608
            if (incoming_fd != -1) {
                close(incoming_fd);
609 610 611
            }
            return;
        }
612 613
    }

614 615
    peer = &s->peers[incoming_posn];

616
    if (incoming_fd == -1) {
617
        /* if posn is positive and unseen before then this is our posn*/
618
        if (incoming_posn >= 0 && s->vm_id == -1) {
619 620 621
            /* receive our posn */
            s->vm_id = incoming_posn;
        } else {
622
            /* otherwise an fd == -1 means an existing peer has gone away */
623
            IVSHMEM_DPRINTF("posn %" PRId64 " has gone away\n", incoming_posn);
624
            close_peer_eventfds(s, incoming_posn);
625
        }
M
Marc-André Lureau 已提交
626
        return;
627 628 629 630 631 632
    }

    /* if the position is -1, then it's shared memory region fd */
    if (incoming_posn == -1) {
        void * map_ptr;

633
        if (memory_region_is_mapped(&s->ivshmem)) {
634 635 636 637 638
            error_report("shm already initialized");
            close(incoming_fd);
            return;
        }

M
Marc-André Lureau 已提交
639 640 641 642
        if (check_shm_size(s, incoming_fd, &err) == -1) {
            error_report_err(err);
            close(incoming_fd);
            return;
643 644 645 646 647
        }

        /* mmap the region and map into the BAR2 */
        map_ptr = mmap(0, s->ivshmem_size, PROT_READ|PROT_WRITE, MAP_SHARED,
                                                            incoming_fd, 0);
M
Marc-André Lureau 已提交
648 649 650 651 652
        if (map_ptr == MAP_FAILED) {
            error_report("Failed to mmap shared memory %s", strerror(errno));
            close(incoming_fd);
            return;
        }
653
        memory_region_init_ram_ptr(&s->ivshmem, OBJECT(s),
A
Avi Kivity 已提交
654
                                   "ivshmem.bar2", s->ivshmem_size, map_ptr);
F
Fam Zheng 已提交
655 656
        qemu_set_ram_fd(memory_region_get_ram_addr(&s->ivshmem),
                        incoming_fd);
657
        vmstate_register_ram(&s->ivshmem, DEVICE(s));
658

659
        IVSHMEM_DPRINTF("guest h/w addr = %p, size = %" PRIu64 "\n",
A
Andrew Jones 已提交
660
                        map_ptr, s->ivshmem_size);
661

A
Avi Kivity 已提交
662
        memory_region_add_subregion(&s->bar, 0, &s->ivshmem);
663 664 665 666

        return;
    }

667 668 669
    /* each peer has an associated array of eventfds, and we keep
     * track of how many eventfds received so far */
    /* get a new eventfd: */
670 671 672 673 674 675 676
    if (peer->nb_eventfds >= s->vectors) {
        error_report("Too many eventfd received, device has %d vectors",
                     s->vectors);
        close(incoming_fd);
        return;
    }

677
    new_eventfd = peer->nb_eventfds++;
678

679
    /* this is an eventfd for a particular peer VM */
680
    IVSHMEM_DPRINTF("eventfds[%" PRId64 "][%d] = %d\n", incoming_posn,
681 682
                    new_eventfd, incoming_fd);
    event_notifier_init_fd(&peer->eventfds[new_eventfd], incoming_fd);
683
    fcntl_setfl(incoming_fd, O_NONBLOCK); /* msix/irqfd poll non block */
684 685

    if (incoming_posn == s->vm_id) {
686
        setup_interrupt(s, new_eventfd);
687 688 689
    }

    if (ivshmem_has_feature(s, IVSHMEM_IOEVENTFD)) {
690
        ivshmem_add_eventfd(s, incoming_posn, new_eventfd);
691 692 693
    }
}

694 695 696 697
/* Initial chardev read handler: the first message from the server must be
 * the protocol version, without an accompanying file descriptor.
 *
 * On a match the real handler (ivshmem_read) is installed.  On a mismatch
 * a diagnostic is printed, any descriptor that came with the message is
 * closed, and the handlers are removed so that nothing further is read. */
static void ivshmem_check_version(void *opaque, const uint8_t * buf, int size)
{
    IVShmemState *s = opaque;
    int fd;
    int64_t version;

    if (!fifo_update_and_get_i64(s, buf, size, &version)) {
        return;
    }

    fd = qemu_chr_fe_get_msgfd(s->server_chr);
    if (fd != -1 || version != IVSHMEM_PROTOCOL_VERSION) {
        fprintf(stderr, "incompatible version, you are connecting to a ivshmem-"
                "server using a different protocol please check your setup\n");
        /* we own a descriptor once it has been picked off the chardev */
        if (fd != -1) {
            close(fd);
        }
        qemu_chr_add_handlers(s->server_chr, NULL, NULL, NULL, s);
        return;
    }

    IVSHMEM_DPRINTF("version check ok, switch to real chardev handler\n");
    qemu_chr_add_handlers(s->server_chr, ivshmem_can_receive, ivshmem_read,
                          NULL, s);
}

717 718 719
/* Select the MSI-X vectors used by device.
 * ivshmem maps events to vectors statically, so
 * we just enable all vectors on init and after reset. */
720
static void ivshmem_msix_vector_use(IVShmemState *s)
721
{
722
    PCIDevice *d = PCI_DEVICE(s);
723 724 725
    int i;

    for (i = 0; i < s->vectors; i++) {
726
        msix_vector_use(d, i);
727 728 729
    }
}

730 731
/* Device reset: clear the interrupt mask and status registers and, with
 * the MSI feature on, mark all MSI-X vectors as used again. */
static void ivshmem_reset(DeviceState *d)
{
    IVShmemState *s = IVSHMEM(d);

    s->intrstatus = 0;
    s->intrmask = 0;

    if (!ivshmem_has_feature(s, IVSHMEM_MSI)) {
        return;
    }

    ivshmem_msix_vector_use(s);
}

741
/* Allocate the per-vector callback data and, with the MSI feature on,
 * initialize MSI-X with s->vectors vectors in BAR 1.
 *
 * Returns 0 on success.  Returns -1 if MSI-X could not be initialized; the
 * callback data is released again in that case and s->msi_vectors is left
 * NULL, so nothing leaks when the caller gives up. */
static int ivshmem_setup_interrupts(IVShmemState *s)
{
    /* allocate QEMU callback data for receiving interrupts */
    s->msi_vectors = g_new0(MSIVector, s->vectors);

    if (ivshmem_has_feature(s, IVSHMEM_MSI)) {
        if (msix_init_exclusive_bar(PCI_DEVICE(s), s->vectors, 1)) {
            g_free(s->msi_vectors);
            s->msi_vectors = NULL;
            return -1;
        }

        IVSHMEM_DPRINTF("msix initialized (%d vectors)\n", s->vectors);
        ivshmem_msix_vector_use(s);
    }

    return 0;
}

758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801
/* Called when the guest enables MSI-X: add a KVM MSI route for each of our
 * own eventfds received so far, then install the mask/unmask/poll
 * notifiers.  A failure to install the notifiers is only reported. */
static void ivshmem_enable_irqfd(IVShmemState *s)
{
    PCIDevice *pdev = PCI_DEVICE(s);
    int vector;

    for (vector = 0; vector < s->peers[s->vm_id].nb_eventfds; vector++) {
        ivshmem_add_kvm_msi_virq(s, vector);
    }

    if (msix_set_vector_notifiers(pdev, ivshmem_vector_unmask,
                                  ivshmem_vector_mask, ivshmem_vector_poll)) {
        error_report("ivshmem: msix_set_vector_notifiers failed");
    }
}

/* Release the KVM route recorded for @vector and mark the msi_vectors
 * entry unused.  Does nothing if the entry is not in use. */
static void ivshmem_remove_kvm_msi_virq(IVShmemState *s, int vector)
{
    MSIVector *slot = &s->msi_vectors[vector];

    IVSHMEM_DPRINTF("ivshmem_remove_kvm_msi_virq vector:%d\n", vector);

    if (slot->pdev != NULL) {
        /* it was cleaned when masked in the frontend. */
        kvm_irqchip_release_virq(kvm_state, slot->virq);

        slot->pdev = NULL;
    }
}

/* Called when the guest disables MSI-X: release the KVM routes of our own
 * vectors and remove the mask/unmask/poll notifiers. */
static void ivshmem_disable_irqfd(IVShmemState *s)
{
    PCIDevice *pdev = PCI_DEVICE(s);
    int vector = 0;

    while (vector < s->peers[s->vm_id].nb_eventfds) {
        ivshmem_remove_kvm_msi_virq(s, vector);
        vector++;
    }

    msix_unset_vector_notifiers(pdev);
}

/* PCI config space write hook: perform the default write, then react to
 * the guest switching MSI-X on or off.  The irqfd setup is only touched
 * when KVM delivers MSIs via irqfd and our ID is already known. */
static void ivshmem_write_config(PCIDevice *pdev, uint32_t address,
                                 uint32_t val, int len)
{
    IVShmemState *s = IVSHMEM(pdev);
    int before = msix_enabled(pdev);
    int after;

    pci_default_write_config(pdev, address, val, len);
    after = msix_enabled(pdev);

    if (!kvm_msi_via_irqfd_enabled() || s->vm_id == -1) {
        return;
    }

    if (!before && after) {
        ivshmem_enable_irqfd(s);
    } else if (before && !after) {
        ivshmem_disable_irqfd(s);
    }
}

M
Marc-André Lureau 已提交
819
/* Realize the device.
 *
 * Exactly one memory source must be configured: a memory backend, a
 * chardev connected to an ivshmem server, or a POSIX shared memory object.
 * The function validates the properties, sets up the register BAR and the
 * BAR 2 container, and then
 *   - memory backend: maps the backend's region into BAR 2;
 *   - chardev: sets up interrupts and peer storage and starts listening to
 *     the server, which delivers the shared memory later;
 *   - shm object: opens (creating it if needed) and maps the object.
 *
 * Errors are reported through @errp.  A shared memory descriptor opened
 * here is closed again on every failure path. */
static void pci_ivshmem_realize(PCIDevice *dev, Error **errp)
{
    IVShmemState *s = IVSHMEM(dev);
    Error *err = NULL;
    uint8_t *pci_conf;
    uint8_t attr = PCI_BASE_ADDRESS_SPACE_MEMORY |
        PCI_BASE_ADDRESS_MEM_PREFETCH;

    if (!!s->server_chr + !!s->shmobj + !!s->hostmem != 1) {
        error_setg(errp,
                   "You must specify either 'shm', 'chardev' or 'x-memdev'");
        return;
    }

    if (s->hostmem) {
        MemoryRegion *mr;

        if (s->sizearg) {
            g_warning("size argument ignored with hostmem");
        }

        mr = host_memory_backend_get_memory(s->hostmem, &error_abort);
        s->ivshmem_size = memory_region_size(mr);
    } else if (s->sizearg == NULL) {
        s->ivshmem_size = 4 << 20; /* 4 MB default */
    } else {
        char *end;
        int64_t size = qemu_strtosz(s->sizearg, &end);
        if (size < 0 || *end != '\0' || !is_power_of_2(size)) {
            error_setg(errp, "Invalid size %s", s->sizearg);
            return;
        }
        s->ivshmem_size = size;
    }

    /* IRQFD requires MSI */
    if (ivshmem_has_feature(s, IVSHMEM_IOEVENTFD) &&
        !ivshmem_has_feature(s, IVSHMEM_MSI)) {
        error_setg(errp, "ioeventfd/irqfd requires MSI");
        return;
    }

    /* check that role is reasonable */
    if (s->role) {
        if (strncmp(s->role, "peer", 5) == 0) {
            s->role_val = IVSHMEM_PEER;
        } else if (strncmp(s->role, "master", 7) == 0) {
            s->role_val = IVSHMEM_MASTER;
        } else {
            error_setg(errp, "'role' must be 'peer' or 'master'");
            return;
        }
    } else {
        s->role_val = IVSHMEM_MASTER; /* default */
    }

    pci_conf = dev->config;
    pci_conf[PCI_COMMAND] = PCI_COMMAND_IO | PCI_COMMAND_MEMORY;

    /*
     * Note: we don't use INTx with IVSHMEM_MSI at all, so this is a
     * bald-faced lie then.  But it's a backwards compatible lie.
     */
    pci_config_set_interrupt_pin(pci_conf, 1);

    memory_region_init_io(&s->ivshmem_mmio, OBJECT(s), &ivshmem_mmio_ops, s,
                          "ivshmem-mmio", IVSHMEM_REG_BAR_SIZE);

    /* region for registers*/
    pci_register_bar(dev, 0, PCI_BASE_ADDRESS_SPACE_MEMORY,
                     &s->ivshmem_mmio);

    memory_region_init(&s->bar, OBJECT(s), "ivshmem-bar2-container",
                       s->ivshmem_size);
    if (s->ivshmem_64bit) {
        attr |= PCI_BASE_ADDRESS_MEM_TYPE_64;
    }

    if (s->hostmem != NULL) {
        MemoryRegion *mr;

        IVSHMEM_DPRINTF("using hostmem\n");

        mr = host_memory_backend_get_memory(MEMORY_BACKEND(s->hostmem),
                                            &error_abort);
        vmstate_register_ram(mr, DEVICE(s));
        memory_region_add_subregion(&s->bar, 0, mr);
        pci_register_bar(PCI_DEVICE(s), 2, attr, &s->bar);
    } else if (s->server_chr != NULL) {
        /* FIXME do not rely on what chr drivers put into filename */
        if (strncmp(s->server_chr->filename, "unix:", 5)) {
            error_setg(errp, "chardev is not a unix client socket");
            return;
        }

        /* if we get a UNIX socket as the parameter we will talk
         * to the ivshmem server to receive the memory region */

        IVSHMEM_DPRINTF("using shared memory server (socket = %s)\n",
                        s->server_chr->filename);

        if (ivshmem_setup_interrupts(s) < 0) {
            error_setg(errp, "failed to initialize interrupts");
            return;
        }

        /* we allocate enough space for 16 peers and grow as needed */
        resize_peers(s, 16);
        s->vm_id = -1;

        pci_register_bar(dev, 2, attr, &s->bar);

        qemu_chr_add_handlers(s->server_chr, ivshmem_can_receive,
                              ivshmem_check_version, NULL, s);
    } else {
        /* just map the file immediately, we're not using a server */
        int fd;

        IVSHMEM_DPRINTF("using shm_open (shm object = %s)\n", s->shmobj);

        /* try opening with O_EXCL and if it succeeds zero the memory
         * by truncating to 0 */
        fd = shm_open(s->shmobj, O_CREAT|O_RDWR|O_EXCL,
                      S_IRWXU|S_IRWXG|S_IRWXO);
        if (fd >= 0) {
            /* Any non-negative value, 0 included, is a valid descriptor. */
            /* truncate file to length PCI device's memory */
            if (ftruncate(fd, s->ivshmem_size) != 0) {
                error_report("could not truncate shared file");
            }
        } else {
            /* the object probably exists already: open it as it is */
            fd = shm_open(s->shmobj, O_CREAT|O_RDWR,
                          S_IRWXU|S_IRWXG|S_IRWXO);
            if (fd < 0) {
                error_setg_errno(errp, errno, "could not open shared file");
                return;
            }
        }

        if (check_shm_size(s, fd, errp) == -1) {
            close(fd);
            return;
        }

        create_shared_memory_BAR(s, fd, attr, &err);
        if (err) {
            error_propagate(errp, err);
            /* the mapping failed, so nothing took over the descriptor */
            close(fd);
            return;
        }
    }

    fifo8_create(&s->incoming_fifo, sizeof(int64_t));

    if (s->role_val == IVSHMEM_PEER) {
        error_setg(&s->migration_blocker,
                   "Migration is disabled when using feature 'peer mode' in device 'ivshmem'");
        migrate_add_blocker(s->migration_blocker);
    }
}

M
Marc-André Lureau 已提交
973
/*
 * PCI exit hook for the ivshmem device (installed as k->exit).
 *
 * Releases, in this order: the incoming FIFO, the migration blocker (if
 * one is set), the shared-memory region, the per-peer eventfds together
 * with the peers array, the MSI-X exclusive BAR (only when the MSI
 * feature is enabled) and the MSI vector array.
 *
 * Pointers that are freed here are reset to NULL so the device state does
 * not keep dangling references after teardown.
 */
static void pci_ivshmem_exit(PCIDevice *dev)
{
    IVShmemState *s = IVSHMEM(dev);
    int i;

    fifo8_destroy(&s->incoming_fifo);

    if (s->migration_blocker) {
        migrate_del_blocker(s->migration_blocker);
        error_free(s->migration_blocker);
        s->migration_blocker = NULL;
    }

    if (memory_region_is_mapped(&s->ivshmem)) {
        /*
         * Only unmap and close here when no hostmem backend is set.
         * NOTE(review): presumably the backend manages its own mapping
         * in the hostmem case -- confirm.
         */
        if (!s->hostmem) {
            void *addr = memory_region_get_ram_ptr(&s->ivshmem);
            int fd;

            /* A failed munmap is reported but does not stop the teardown. */
            if (munmap(addr, s->ivshmem_size) == -1) {
                error_report("Failed to munmap shared memory %s",
                             strerror(errno));
            }

            /* Close the fd behind the RAM block, if one is reported. */
            fd = qemu_get_ram_fd(memory_region_get_ram_addr(&s->ivshmem));
            if (fd != -1) {
                close(fd);
            }
        }

        vmstate_unregister_ram(&s->ivshmem, DEVICE(dev));
        memory_region_del_subregion(&s->bar, &s->ivshmem);
    }

    if (s->peers) {
        for (i = 0; i < s->nb_peers; i++) {
            close_peer_eventfds(s, i);
        }
        g_free(s->peers);
        s->peers = NULL;
        s->nb_peers = 0;
    }

    if (ivshmem_has_feature(s, IVSHMEM_MSI)) {
        msix_uninit_exclusive_bar(dev);
    }

    g_free(s->msi_vectors);
    s->msi_vectors = NULL;
}

1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047
/*
 * VMState field predicate (used by ivshmem_vmsd): true when the device
 * has the IVSHMEM_MSI feature. version_id is not consulted.
 */
static bool test_msix(void *opaque, int version_id)
{
    IVShmemState *dev_state = opaque;
    bool has_msi = ivshmem_has_feature(dev_state, IVSHMEM_MSI);

    return has_msi;
}

/*
 * VMState field predicate (used by ivshmem_vmsd): the logical negation of
 * test_msix() for the same arguments.
 */
static bool test_no_msix(void *opaque, int version_id)
{
    bool msix_in_use = test_msix(opaque, version_id);

    return !msix_in_use;
}

/*
 * VMState pre_load hook (installed in ivshmem_vmsd).
 *
 * Returns 0 unless the device role is IVSHMEM_PEER, in which case an
 * error is reported and -EINVAL is returned.
 */
static int ivshmem_pre_load(void *opaque)
{
    IVShmemState *dev_state = opaque;

    if (dev_state->role_val != IVSHMEM_PEER) {
        return 0;
    }

    error_report("'peer' devices are not migratable");
    return -EINVAL;
}

/*
 * VMState post_load hook (installed in ivshmem_vmsd).
 *
 * When the IVSHMEM_MSI feature is enabled, calls
 * ivshmem_msix_vector_use() on the device. Always returns 0;
 * version_id is not consulted.
 */
static int ivshmem_post_load(void *opaque, int version_id)
{
    IVShmemState *s = opaque;

    if (ivshmem_has_feature(s, IVSHMEM_MSI)) {
        ivshmem_msix_vector_use(s);
    }
    return 0;
}

/*
 * Loader for old-format migration streams (installed as
 * .load_state_old in ivshmem_vmsd).
 *
 * Only version_id 0 is accepted, and devices in the IVSHMEM_PEER role are
 * refused; both cases return -EINVAL. Otherwise the PCI device state is
 * loaded first and any error from pci_device_load() is returned as-is.
 * After that the stream holds either MSI-X state (MSI feature enabled) or
 * the intrstatus and intrmask registers, read with qemu_get_be32() in
 * that order.
 *
 * Returns 0 on success.
 */
static int ivshmem_load_old(QEMUFile *f, void *opaque, int version_id)
{
    IVShmemState *s = opaque;
    PCIDevice *pdev = PCI_DEVICE(s);
    int ret;

    IVSHMEM_DPRINTF("ivshmem_load_old\n");

    if (version_id != 0) {
        return -EINVAL;
    }

    if (s->role_val == IVSHMEM_PEER) {
        error_report("'peer' devices are not migratable");
        return -EINVAL;
    }

    ret = pci_device_load(pdev, f);
    if (ret) {
        return ret;
    }

    if (ivshmem_has_feature(s, IVSHMEM_MSI)) {
        msix_load(pdev, f);
        ivshmem_msix_vector_use(s);
    } else {
        s->intrstatus = qemu_get_be32(f);
        s->intrmask = qemu_get_be32(f);
    }

    return 0;
}

static const VMStateDescription ivshmem_vmsd = {
    .name = "ivshmem",
    .version_id = 1,
    .minimum_version_id = 1,
    .pre_load = ivshmem_pre_load,
    .post_load = ivshmem_post_load,
    .fields = (VMStateField[]) {
        VMSTATE_PCI_DEVICE(parent_obj, IVShmemState),

        VMSTATE_MSIX_TEST(parent_obj, IVShmemState, test_msix),
        VMSTATE_UINT32_TEST(intrstatus, IVShmemState, test_no_msix),
        VMSTATE_UINT32_TEST(intrmask, IVShmemState, test_no_msix),

        VMSTATE_END_OF_LIST()
    },
    .load_state_old = ivshmem_load_old,
    .minimum_version_id_old = 0
};

1105 1106 1107 1108 1109 1110 1111 1112
static Property ivshmem_properties[] = {
    DEFINE_PROP_CHR("chardev", IVShmemState, server_chr),
    DEFINE_PROP_STRING("size", IVShmemState, sizearg),
    DEFINE_PROP_UINT32("vectors", IVShmemState, vectors, 1),
    DEFINE_PROP_BIT("ioeventfd", IVShmemState, features, IVSHMEM_IOEVENTFD, false),
    DEFINE_PROP_BIT("msi", IVShmemState, features, IVSHMEM_MSI, true),
    DEFINE_PROP_STRING("shm", IVShmemState, shmobj),
    DEFINE_PROP_STRING("role", IVShmemState, role),
G
Gerd Hoffmann 已提交
1113
    DEFINE_PROP_UINT32("use64", IVShmemState, ivshmem_64bit, 1),
1114 1115 1116 1117 1118
    DEFINE_PROP_END_OF_LIST(),
};

/*
 * Class initializer (installed as .class_init in ivshmem_info).
 *
 * Fills in the PCI class hooks and identifiers and the generic device
 * class fields: reset handler, property list, migration description,
 * category bit and description string. The 'data' argument is unused.
 */
static void ivshmem_class_init(ObjectClass *klass, void *data)
{
    DeviceClass *dc = DEVICE_CLASS(klass);
    PCIDeviceClass *k = PCI_DEVICE_CLASS(klass);

    /* PCI-level callbacks and identification */
    k->realize = pci_ivshmem_realize;
    k->exit = pci_ivshmem_exit;
    k->config_write = ivshmem_write_config;
    k->vendor_id = PCI_VENDOR_ID_IVSHMEM;
    k->device_id = PCI_DEVICE_ID_IVSHMEM;
    k->class_id = PCI_CLASS_MEMORY_RAM;

    /* generic device-class settings */
    dc->reset = ivshmem_reset;
    dc->props = ivshmem_properties;
    dc->vmsd = &ivshmem_vmsd;
    set_bit(DEVICE_CATEGORY_MISC, dc->categories);
    dc->desc = "Inter-VM shared memory";
}

M
Marc-André Lureau 已提交
1135 1136 1137 1138 1139
/*
 * Check callback for the "x-memdev" link property (passed to
 * object_property_add_link() in ivshmem_init()).
 *
 * Looks up the MemoryRegion of the candidate memory backend 'val'. If
 * that region is already mapped, the link is refused by setting an error
 * in 'errp' that names the backend; otherwise the decision is delegated
 * to qdev_prop_allow_set_link_before_realize().
 */
static void ivshmem_check_memdev_is_busy(Object *obj, const char *name,
                                         Object *val, Error **errp)
{
    MemoryRegion *mr;

    mr = host_memory_backend_get_memory(MEMORY_BACKEND(val), &error_abort);
    if (memory_region_is_mapped(mr)) {
        /* path is only needed for the message and is freed right after */
        char *path = object_get_canonical_path_component(val);
        error_setg(errp, "can't use already busy memdev: %s", path);
        g_free(path);
    } else {
        qdev_prop_allow_set_link_before_realize(obj, name, val, errp);
    }
}

/*
 * Instance initializer (installed as .instance_init in ivshmem_info).
 *
 * Adds the "x-memdev" link property, which stores a TYPE_MEMORY_BACKEND
 * object in s->hostmem. Candidate targets are checked by
 * ivshmem_check_memdev_is_busy(). A failure to add the property is
 * routed to error_abort.
 */
static void ivshmem_init(Object *obj)
{
    IVShmemState *s = IVSHMEM(obj);

    object_property_add_link(obj, "x-memdev", TYPE_MEMORY_BACKEND,
                             (Object **)&s->hostmem,
                             ivshmem_check_memdev_is_busy,
                             OBJ_PROP_LINK_UNREF_ON_RELEASE,
                             &error_abort);
}

1161
static const TypeInfo ivshmem_info = {
1162
    .name          = TYPE_IVSHMEM,
1163 1164
    .parent        = TYPE_PCI_DEVICE,
    .instance_size = sizeof(IVShmemState),
M
Marc-André Lureau 已提交
1165
    .instance_init = ivshmem_init,
1166
    .class_init    = ivshmem_class_init,
1167 1168
};

A
Andreas Färber 已提交
1169
static void ivshmem_register_types(void)
1170
{
1171
    type_register_static(&ivshmem_info);
1172 1173
}

A
Andreas Färber 已提交
1174
/*
 * Hand ivshmem_register_types to the type_init() macro (defined outside
 * this file; presumably it arranges for the function to run during QEMU
 * module initialization -- confirm against the macro's definition).
 */
type_init(ivshmem_register_types)