/*
 * vhost support
 *
 * Copyright Red Hat, Inc. 2010
 *
 * Authors:
 *  Michael S. Tsirkin <mst@redhat.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 *
 * Contributions after 2012-01-13 are licensed under the terms of the
 * GNU GPL, version 2 or (at your option) any later version.
 */

P
Paolo Bonzini 已提交
16
#include "hw/virtio/vhost.h"
M
Michael S. Tsirkin 已提交
17
#include "hw/hw.h"
18
#include "qemu/atomic.h"
19
#include "qemu/range.h"
20
#include <linux/vhost.h>
21
#include "exec/address-spaces.h"
K
KONRAD Frederic 已提交
22
#include "hw/virtio/virtio-bus.h"
M
Michael S. Tsirkin 已提交
23 24

static void vhost_dev_sync_region(struct vhost_dev *dev,
25
                                  MemoryRegionSection *section,
M
Michael S. Tsirkin 已提交
26 27 28 29 30 31 32 33 34 35 36 37
                                  uint64_t mfirst, uint64_t mlast,
                                  uint64_t rfirst, uint64_t rlast)
{
    uint64_t start = MAX(mfirst, rfirst);
    uint64_t end = MIN(mlast, rlast);
    vhost_log_chunk_t *from = dev->log + start / VHOST_LOG_CHUNK;
    vhost_log_chunk_t *to = dev->log + end / VHOST_LOG_CHUNK + 1;
    uint64_t addr = (start / VHOST_LOG_CHUNK) * VHOST_LOG_CHUNK;

    if (end < start) {
        return;
    }
38
    assert(end / VHOST_LOG_CHUNK < dev->log_size);
39
    assert(start / VHOST_LOG_CHUNK < dev->log_size);
40

M
Michael S. Tsirkin 已提交
41 42 43 44 45
    for (;from < to; ++from) {
        vhost_log_chunk_t log;
        /* We first check with non-atomic: much cheaper,
         * and we expect non-dirty to be the common case. */
        if (!*from) {
46
            addr += VHOST_LOG_CHUNK;
M
Michael S. Tsirkin 已提交
47 48
            continue;
        }
49 50 51
        /* Data must be read atomically. We don't really need barrier semantics
         * but it's easier to use atomic_* than roll our own. */
        log = atomic_xchg(from, 0);
N
Natanael Copa 已提交
52 53
        while (log) {
            int bit = ctzl(log);
M
Michael S. Tsirkin 已提交
54 55 56 57 58 59 60
            hwaddr page_addr;
            hwaddr section_offset;
            hwaddr mr_offset;
            page_addr = addr + bit * VHOST_LOG_PAGE;
            section_offset = page_addr - section->offset_within_address_space;
            mr_offset = section_offset + section->offset_within_region;
            memory_region_set_dirty(section->mr, mr_offset, VHOST_LOG_PAGE);
M
Michael S. Tsirkin 已提交
61 62 63 64 65 66
            log &= ~(0x1ull << bit);
        }
        addr += VHOST_LOG_CHUNK;
    }
}

A
Avi Kivity 已提交
67
static int vhost_sync_dirty_bitmap(struct vhost_dev *dev,
68
                                   MemoryRegionSection *section,
M
Michael S. Tsirkin 已提交
69 70
                                   hwaddr first,
                                   hwaddr last)
M
Michael S. Tsirkin 已提交
71 72
{
    int i;
M
Michael S. Tsirkin 已提交
73 74
    hwaddr start_addr;
    hwaddr end_addr;
A
Avi Kivity 已提交
75

M
Michael S. Tsirkin 已提交
76 77 78
    if (!dev->log_enabled || !dev->started) {
        return 0;
    }
M
Michael S. Tsirkin 已提交
79
    start_addr = section->offset_within_address_space;
80
    end_addr = range_get_last(start_addr, int128_get64(section->size));
M
Michael S. Tsirkin 已提交
81 82 83
    start_addr = MAX(first, start_addr);
    end_addr = MIN(last, end_addr);

M
Michael S. Tsirkin 已提交
84 85
    for (i = 0; i < dev->mem->nregions; ++i) {
        struct vhost_memory_region *reg = dev->mem->regions + i;
86
        vhost_dev_sync_region(dev, section, start_addr, end_addr,
M
Michael S. Tsirkin 已提交
87 88 89 90 91 92
                              reg->guest_phys_addr,
                              range_get_last(reg->guest_phys_addr,
                                             reg->memory_size));
    }
    for (i = 0; i < dev->nvqs; ++i) {
        struct vhost_virtqueue *vq = dev->vqs + i;
93
        vhost_dev_sync_region(dev, section, start_addr, end_addr, vq->used_phys,
M
Michael S. Tsirkin 已提交
94 95 96 97 98
                              range_get_last(vq->used_phys, vq->used_size));
    }
    return 0;
}

A
Avi Kivity 已提交
99 100 101 102 103
/* MemoryListener callback: sync the full address range of one section. */
static void vhost_log_sync(MemoryListener *listener,
                           MemoryRegionSection *section)
{
    struct vhost_dev *dev = container_of(listener, struct vhost_dev,
                                         memory_listener);

    vhost_sync_dirty_bitmap(dev, section, 0x0, ~0x0ULL);
}
A
Avi Kivity 已提交
106

M
Michael S. Tsirkin 已提交
107 108 109 110 111 112 113 114 115
/*
 * Sync [first, last] against every section we track.
 * FIXME: this is N^2 in number of sections.
 */
static void vhost_log_sync_range(struct vhost_dev *dev,
                                 hwaddr first, hwaddr last)
{
    int i;

    for (i = 0; i < dev->n_mem_sections; ++i) {
        vhost_sync_dirty_bitmap(dev, &dev->mem_sections[i], first, last);
    }
}

M
Michael S. Tsirkin 已提交
118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286
/* Assign/unassign. Keep an unsorted array of non-overlapping
 * memory regions in dev->mem.
 *
 * Remove the guest-physical range [start_addr, start_addr + size) from the
 * region table.  A region may be dropped entirely, shrunk at either end,
 * or split in two when the removed range falls strictly inside it.
 * Caller must ensure dev->mem has room for one extra region (the split
 * case appends one at index n). */
static void vhost_dev_unassign_memory(struct vhost_dev *dev,
                                      uint64_t start_addr,
                                      uint64_t size)
{
    int from, to, n = dev->mem->nregions;
    /* Track overlapping/split regions for sanity checking. */
    int overlap_start = 0, overlap_end = 0, overlap_middle = 0, split = 0;

    /* Compact surviving regions from index 'from' down to index 'to'. */
    for (from = 0, to = 0; from < n; ++from, ++to) {
        struct vhost_memory_region *reg = dev->mem->regions + to;
        uint64_t reglast;
        uint64_t memlast;
        uint64_t change;

        /* clone old region */
        if (to != from) {
            memcpy(reg, dev->mem->regions + from, sizeof *reg);
        }

        /* No overlap is simple */
        if (!ranges_overlap(reg->guest_phys_addr, reg->memory_size,
                            start_addr, size)) {
            continue;
        }

        /* Split only happens if supplied region
         * is in the middle of an existing one. Thus it can not
         * overlap with any other existing region. */
        assert(!split);

        reglast = range_get_last(reg->guest_phys_addr, reg->memory_size);
        memlast = range_get_last(start_addr, size);

        /* Remove whole region */
        if (start_addr <= reg->guest_phys_addr && memlast >= reglast) {
            --dev->mem->nregions;
            --to;
            ++overlap_middle;
            continue;
        }

        /* Shrink region */
        if (memlast >= reglast) {
            /* Removed range covers the tail: keep only the head. */
            reg->memory_size = start_addr - reg->guest_phys_addr;
            assert(reg->memory_size);
            assert(!overlap_end);
            ++overlap_end;
            continue;
        }

        /* Shift region */
        if (start_addr <= reg->guest_phys_addr) {
            /* Removed range covers the head: advance the start by the
             * overlap and keep guest/userspace addresses in lockstep. */
            change = memlast + 1 - reg->guest_phys_addr;
            reg->memory_size -= change;
            reg->guest_phys_addr += change;
            reg->userspace_addr += change;
            assert(reg->memory_size);
            assert(!overlap_start);
            ++overlap_start;
            continue;
        }

        /* This only happens if supplied region
         * is in the middle of an existing one. Thus it can not
         * overlap with any other existing region. */
        assert(!overlap_start);
        assert(!overlap_end);
        assert(!overlap_middle);
        /* Split region: shrink first part, shift second part. */
        memcpy(dev->mem->regions + n, reg, sizeof *reg);
        reg->memory_size = start_addr - reg->guest_phys_addr;
        assert(reg->memory_size);
        change = memlast + 1 - reg->guest_phys_addr;
        reg = dev->mem->regions + n;
        reg->memory_size -= change;
        assert(reg->memory_size);
        reg->guest_phys_addr += change;
        reg->userspace_addr += change;
        /* Never add more than 1 region */
        assert(dev->mem->nregions == n);
        ++dev->mem->nregions;
        ++split;
    }
}

/* Called after unassign, so no regions overlap the given range.
 *
 * Insert the mapping guest [start_addr, start_addr + size) -> host uaddr
 * into dev->mem, merging with existing regions that are adjacent (or, per
 * the caller's contract, never overlapping) in BOTH the guest-physical and
 * the userspace address spaces.  Several regions may collapse into one.
 * Caller must ensure dev->mem has room for one extra region. */
static void vhost_dev_assign_memory(struct vhost_dev *dev,
                                    uint64_t start_addr,
                                    uint64_t size,
                                    uint64_t uaddr)
{
    int from, to;
    struct vhost_memory_region *merged = NULL;  /* region absorbing the range */
    for (from = 0, to = 0; from < dev->mem->nregions; ++from, ++to) {
        struct vhost_memory_region *reg = dev->mem->regions + to;
        uint64_t prlast, urlast;  /* last byte of reg (phys / userspace) */
        uint64_t pmlast, umlast;  /* last byte of new range (phys / userspace) */
        uint64_t s, e, u;         /* merged start, end, userspace start */

        /* clone old region */
        if (to != from) {
            memcpy(reg, dev->mem->regions + from, sizeof *reg);
        }
        prlast = range_get_last(reg->guest_phys_addr, reg->memory_size);
        pmlast = range_get_last(start_addr, size);
        urlast = range_get_last(reg->userspace_addr, reg->memory_size);
        umlast = range_get_last(uaddr, size);

        /* check for overlapping regions: should never happen. */
        assert(prlast < start_addr || pmlast < reg->guest_phys_addr);
        /* Not an adjacent or overlapping region - do not merge. */
        if ((prlast + 1 != start_addr || urlast + 1 != uaddr) &&
            (pmlast + 1 != reg->guest_phys_addr ||
             umlast + 1 != reg->userspace_addr)) {
            continue;
        }

        if (merged) {
            /* Second (or later) mergeable region: drop it from the array;
             * its extent is folded into 'merged' below. */
            --to;
            assert(to >= 0);
        } else {
            merged = reg;
        }
        /* Grow the merged region (and the working range) to the union. */
        u = MIN(uaddr, reg->userspace_addr);
        s = MIN(start_addr, reg->guest_phys_addr);
        e = MAX(pmlast, prlast);
        uaddr = merged->userspace_addr = u;
        start_addr = merged->guest_phys_addr = s;
        size = merged->memory_size = e - s + 1;
        assert(merged->memory_size);
    }

    if (!merged) {
        /* Nothing adjacent: append the range as a fresh region. */
        struct vhost_memory_region *reg = dev->mem->regions + to;
        memset(reg, 0, sizeof *reg);
        reg->memory_size = size;
        assert(reg->memory_size);
        reg->guest_phys_addr = start_addr;
        reg->userspace_addr = uaddr;
        ++to;
    }
    assert(to <= dev->mem->nregions + 1);
    dev->mem->nregions = to;
}

/*
 * Number of log chunks needed to cover every tracked guest memory region
 * and every used ring of this device.
 */
static uint64_t vhost_get_log_size(struct vhost_dev *dev)
{
    uint64_t log_size = 0;
    int i;

    for (i = 0; i < dev->mem->nregions; ++i) {
        struct vhost_memory_region *reg = dev->mem->regions + i;
        uint64_t last = range_get_last(reg->guest_phys_addr,
                                       reg->memory_size);

        if (log_size < last / VHOST_LOG_CHUNK + 1) {
            log_size = last / VHOST_LOG_CHUNK + 1;
        }
    }
    for (i = 0; i < dev->nvqs; ++i) {
        struct vhost_virtqueue *vq = dev->vqs + i;
        uint64_t last = vq->used_phys + vq->used_size - 1;

        if (log_size < last / VHOST_LOG_CHUNK + 1) {
            log_size = last / VHOST_LOG_CHUNK + 1;
        }
    }
    return log_size;
}

static inline void vhost_dev_log_resize(struct vhost_dev* dev, uint64_t size)
{
    vhost_log_chunk_t *log;
    uint64_t log_base;
M
Michael S. Tsirkin 已提交
287
    int r;
288 289

    log = g_malloc0(size * sizeof *log);
M
Michael S. Tsirkin 已提交
290
    log_base = (uint64_t)(unsigned long)log;
291
    r = dev->vhost_ops->vhost_call(dev, VHOST_SET_LOG_BASE, &log_base);
M
Michael S. Tsirkin 已提交
292
    assert(r >= 0);
M
Michael S. Tsirkin 已提交
293 294 295
    /* Sync only the range covered by the old log */
    if (dev->log_size) {
        vhost_log_sync_range(dev, 0, dev->log_size * VHOST_LOG_CHUNK - 1);
296
    }
297
    g_free(dev->log);
M
Michael S. Tsirkin 已提交
298 299 300 301 302 303 304 305 306 307 308
    dev->log = log;
    dev->log_size = size;
}

static int vhost_verify_ring_mappings(struct vhost_dev *dev,
                                      uint64_t start_addr,
                                      uint64_t size)
{
    int i;
    for (i = 0; i < dev->nvqs; ++i) {
        struct vhost_virtqueue *vq = dev->vqs + i;
A
Avi Kivity 已提交
309
        hwaddr l;
M
Michael S. Tsirkin 已提交
310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329
        void *p;

        if (!ranges_overlap(start_addr, size, vq->ring_phys, vq->ring_size)) {
            continue;
        }
        l = vq->ring_size;
        p = cpu_physical_memory_map(vq->ring_phys, &l, 1);
        if (!p || l != vq->ring_size) {
            fprintf(stderr, "Unable to map ring buffer for ring %d\n", i);
            return -ENOMEM;
        }
        if (p != vq->ring) {
            fprintf(stderr, "Ring buffer relocated for ring %d\n", i);
            return -EBUSY;
        }
        cpu_physical_memory_unmap(p, l, 0, 0);
    }
    return 0;
}

330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368
/*
 * Return the first tracked region overlapping guest-physical range
 * [start_addr, start_addr + size), or NULL if none does.
 */
static struct vhost_memory_region *vhost_dev_find_reg(struct vhost_dev *dev,
                                                      uint64_t start_addr,
                                                      uint64_t size)
{
    int i;

    for (i = 0; i < dev->mem->nregions; ++i) {
        struct vhost_memory_region *reg = dev->mem->regions + i;

        if (ranges_overlap(reg->guest_phys_addr, reg->memory_size,
                           start_addr, size)) {
            return reg;
        }
    }
    return NULL;
}

static bool vhost_dev_cmp_memory(struct vhost_dev *dev,
                                 uint64_t start_addr,
                                 uint64_t size,
                                 uint64_t uaddr)
{
    struct vhost_memory_region *reg = vhost_dev_find_reg(dev, start_addr, size);
    uint64_t reglast;
    uint64_t memlast;

    if (!reg) {
        return true;
    }

    reglast = range_get_last(reg->guest_phys_addr, reg->memory_size);
    memlast = range_get_last(start_addr, size);

    /* Need to extend region? */
    if (start_addr < reg->guest_phys_addr || memlast > reglast) {
        return true;
    }
    /* userspace_addr changed? */
    return uaddr != reg->userspace_addr + start_addr - reg->guest_phys_addr;
}

A
Avi Kivity 已提交
369 370 371
/*
 * Add or remove one RAM section from the device's region table and widen
 * the changed-address window; vhost_commit() pushes the result to the
 * kernel at transaction end.
 *
 * Fix vs. original: the !add branch called vhost_dev_unassign_memory() a
 * second time immediately after the unconditional call below — a redundant
 * no-op, now removed.
 */
static void vhost_set_memory(MemoryListener *listener,
                             MemoryRegionSection *section,
                             bool add)
{
    struct vhost_dev *dev = container_of(listener, struct vhost_dev,
                                         memory_listener);
    hwaddr start_addr = section->offset_within_address_space;
    ram_addr_t size = int128_get64(section->size);
    bool log_dirty = memory_region_is_logging(section->mr);
    int s = offsetof(struct vhost_memory, regions) +
        (dev->mem->nregions + 1) * sizeof dev->mem->regions[0];
    void *ram;

    /* Make room for one more region in case we need to add it. */
    dev->mem = g_realloc(dev->mem, s);

    if (log_dirty) {
        add = false;
    }

    assert(size);

    /* Optimize no-change case. At least cirrus_vga does this a lot at this time. */
    ram = memory_region_get_ram_ptr(section->mr) + section->offset_within_region;
    if (add) {
        if (!vhost_dev_cmp_memory(dev, start_addr, size, (uintptr_t)ram)) {
            /* Region exists with same address. Nothing to do. */
            return;
        }
    } else {
        if (!vhost_dev_find_reg(dev, start_addr, size)) {
            /* Removing region that we don't access. Nothing to do. */
            return;
        }
    }

    /* Remove old mapping for this memory, if any. */
    vhost_dev_unassign_memory(dev, start_addr, size);
    if (add) {
        /* Add given mapping, merging adjacent regions if any */
        vhost_dev_assign_memory(dev, start_addr, size, (uintptr_t)ram);
    }
    dev->mem_changed_start_addr = MIN(dev->mem_changed_start_addr, start_addr);
    dev->mem_changed_end_addr = MAX(dev->mem_changed_end_addr, start_addr + size - 1);
    dev->memory_changed = true;
}

/* vhost only tracks RAM-backed sections. */
static bool vhost_section(MemoryRegionSection *section)
{
    return memory_region_is_ram(section->mr);
}

/* Start of a memory transaction: reset the changed-address window. */
static void vhost_begin(MemoryListener *listener)
{
    struct vhost_dev *dev = container_of(listener, struct vhost_dev,
                                         memory_listener);

    /* start > end encodes an empty window until vhost_set_memory()
     * widens it. */
    dev->mem_changed_start_addr = -1;
    dev->mem_changed_end_addr = 0;
}
M
Michael S. Tsirkin 已提交
429

430 431 432 433 434 435 436 437 438 439 440 441
/*
 * End of a memory transaction: verify ring mappings and push the updated
 * region table to the kernel, growing the dirty log first when logging.
 *
 * Fix vs. original: the `if (dev->started)` guard around the ring
 * verification was dead code — the function returns early above when
 * !dev->started, so the condition was always true.  Removed.
 */
static void vhost_commit(MemoryListener *listener)
{
    struct vhost_dev *dev = container_of(listener, struct vhost_dev,
                                         memory_listener);
    hwaddr start_addr;
    ram_addr_t size;
    uint64_t log_size;
    int r;

    if (!dev->memory_changed) {
        return;
    }
    if (!dev->started) {
        return;
    }
    if (dev->mem_changed_start_addr > dev->mem_changed_end_addr) {
        /* Empty change window: nothing actually changed. */
        return;
    }

    /* Device is running (checked above): the rings must not have moved. */
    start_addr = dev->mem_changed_start_addr;
    size = dev->mem_changed_end_addr - dev->mem_changed_start_addr + 1;
    r = vhost_verify_ring_mappings(dev, start_addr, size);
    assert(r >= 0);

    if (!dev->log_enabled) {
        r = dev->vhost_ops->vhost_call(dev, VHOST_SET_MEM_TABLE, dev->mem);
        assert(r >= 0);
        dev->memory_changed = false;
        return;
    }
    log_size = vhost_get_log_size(dev);
    /* We allocate an extra 4K bytes of log,
     * to reduce the number of reallocations. */
#define VHOST_LOG_BUFFER (0x1000 / sizeof *dev->log)
    /* To log more, must increase log size before table update. */
    if (dev->log_size < log_size) {
        vhost_dev_log_resize(dev, log_size + VHOST_LOG_BUFFER);
    }
    r = dev->vhost_ops->vhost_call(dev, VHOST_SET_MEM_TABLE, dev->mem);
    assert(r >= 0);
    /* To log less, can only decrease log size after table update. */
    if (dev->log_size > log_size + VHOST_LOG_BUFFER) {
        vhost_dev_log_resize(dev, log_size);
    }
    dev->memory_changed = false;
}

A
Avi Kivity 已提交
480 481 482
/* MemoryListener callback: a RAM section appeared in the address space. */
static void vhost_region_add(MemoryListener *listener,
                             MemoryRegionSection *section)
{
    struct vhost_dev *dev = container_of(listener, struct vhost_dev,
                                         memory_listener);

    if (!vhost_section(section)) {
        return;
    }

    /* Remember the section so vhost_log_sync_range() can walk it later. */
    ++dev->n_mem_sections;
    dev->mem_sections = g_renew(MemoryRegionSection, dev->mem_sections,
                                dev->n_mem_sections);
    dev->mem_sections[dev->n_mem_sections - 1] = *section;
    /* Keep the region alive while we reference its RAM. */
    memory_region_ref(section->mr);
    vhost_set_memory(listener, section, true);
}

/* MemoryListener callback: a RAM section left the address space. */
static void vhost_region_del(MemoryListener *listener,
                             MemoryRegionSection *section)
{
    struct vhost_dev *dev = container_of(listener, struct vhost_dev,
                                         memory_listener);
    int i;

    if (!vhost_section(section)) {
        return;
    }

    vhost_set_memory(listener, section, false);
    memory_region_unref(section->mr);
    /* Drop the matching entry (by address-space offset) from our list. */
    for (i = 0; i < dev->n_mem_sections; ++i) {
        if (dev->mem_sections[i].offset_within_address_space
            == section->offset_within_address_space) {
            --dev->n_mem_sections;
            memmove(&dev->mem_sections[i], &dev->mem_sections[i + 1],
                    (dev->n_mem_sections - i) * sizeof(*dev->mem_sections));
            break;
        }
    }
}

522 523 524 525 526
/* Section unchanged across the transaction: nothing to do. */
static void vhost_region_nop(MemoryListener *listener,
                             MemoryRegionSection *section)
{
}

M
Michael S. Tsirkin 已提交
527 528 529 530 531 532
static int vhost_virtqueue_set_addr(struct vhost_dev *dev,
                                    struct vhost_virtqueue *vq,
                                    unsigned idx, bool enable_log)
{
    struct vhost_vring_addr addr = {
        .index = idx,
533 534 535
        .desc_user_addr = (uint64_t)(unsigned long)vq->desc,
        .avail_user_addr = (uint64_t)(unsigned long)vq->avail,
        .used_user_addr = (uint64_t)(unsigned long)vq->used,
M
Michael S. Tsirkin 已提交
536 537 538
        .log_guest_addr = vq->used_phys,
        .flags = enable_log ? (1 << VHOST_VRING_F_LOG) : 0,
    };
539
    int r = dev->vhost_ops->vhost_call(dev, VHOST_SET_VRING_ADDR, &addr);
M
Michael S. Tsirkin 已提交
540 541 542 543 544 545 546 547 548 549 550 551 552
    if (r < 0) {
        return -errno;
    }
    return 0;
}

static int vhost_dev_set_features(struct vhost_dev *dev, bool enable_log)
{
    uint64_t features = dev->acked_features;
    int r;
    if (enable_log) {
        features |= 0x1 << VHOST_F_LOG_ALL;
    }
553
    r = dev->vhost_ops->vhost_call(dev, VHOST_SET_FEATURES, &features);
M
Michael S. Tsirkin 已提交
554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583
    return r < 0 ? -errno : 0;
}

/*
 * Switch dirty logging on/off: features first, then each ring's address.
 * On failure, roll every ring (and the features) back to the current
 * dev->log_enabled setting and return the original error.
 */
static int vhost_dev_set_log(struct vhost_dev *dev, bool enable_log)
{
    int i, r, t;

    r = vhost_dev_set_features(dev, enable_log);
    if (r < 0) {
        goto err_features;
    }
    for (i = 0; i < dev->nvqs; ++i) {
        r = vhost_virtqueue_set_addr(dev, dev->vqs + i, i, enable_log);
        if (r < 0) {
            goto err_vq;
        }
    }
    return 0;

err_vq:
    /* Restore the old logging setting on every ring touched so far
     * (including the one that just failed). */
    for (; i >= 0; --i) {
        t = vhost_virtqueue_set_addr(dev, dev->vqs + i, i, dev->log_enabled);
        assert(t >= 0);
    }
    t = vhost_dev_set_features(dev, dev->log_enabled);
    assert(t >= 0);
err_features:
    return r;
}

A
Avi Kivity 已提交
584
/*
 * Enable or disable migration dirty logging.  If the device is not
 * started, just record the desired state; otherwise reconfigure the
 * kernel side and allocate/release the log buffer accordingly.
 * Returns 0 or a negative error from vhost_dev_set_log().
 */
static int vhost_migration_log(MemoryListener *listener, int enable)
{
    struct vhost_dev *dev = container_of(listener, struct vhost_dev,
                                         memory_listener);
    int r;

    if (!!enable == dev->log_enabled) {
        return 0;
    }
    if (!dev->started) {
        dev->log_enabled = enable;
        return 0;
    }
    if (enable) {
        /* The log must exist (and be large enough) before the kernel
         * starts writing to it. */
        vhost_dev_log_resize(dev, vhost_get_log_size(dev));
        r = vhost_dev_set_log(dev, true);
        if (r < 0) {
            return r;
        }
    } else {
        r = vhost_dev_set_log(dev, false);
        if (r < 0) {
            return r;
        }
        g_free(dev->log);
        dev->log = NULL;
        dev->log_size = 0;
    }
    dev->log_enabled = enable;
    return 0;
}

A
Avi Kivity 已提交
615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646
/* Migration started: dirty logging is mandatory, so failure is fatal. */
static void vhost_log_global_start(MemoryListener *listener)
{
    if (vhost_migration_log(listener, true) < 0) {
        abort();
    }
}

/* Migration finished/cancelled: failing to stop logging is also fatal. */
static void vhost_log_global_stop(MemoryListener *listener)
{
    if (vhost_migration_log(listener, false) < 0) {
        abort();
    }
}

/* Per-section log start. FIXME: implement */
static void vhost_log_start(MemoryListener *listener,
                            MemoryRegionSection *section)
{
}

/* Per-section log stop. FIXME: implement */
static void vhost_log_stop(MemoryListener *listener,
                           MemoryRegionSection *section)
{
}

647
static int vhost_virtqueue_start(struct vhost_dev *dev,
M
Michael S. Tsirkin 已提交
648 649 650 651
                                struct VirtIODevice *vdev,
                                struct vhost_virtqueue *vq,
                                unsigned idx)
{
A
Avi Kivity 已提交
652
    hwaddr s, l, a;
M
Michael S. Tsirkin 已提交
653
    int r;
J
Jason Wang 已提交
654
    int vhost_vq_index = idx - dev->vq_index;
M
Michael S. Tsirkin 已提交
655
    struct vhost_vring_file file = {
J
Jason Wang 已提交
656
        .index = vhost_vq_index
M
Michael S. Tsirkin 已提交
657 658
    };
    struct vhost_vring_state state = {
J
Jason Wang 已提交
659
        .index = vhost_vq_index
M
Michael S. Tsirkin 已提交
660 661 662
    };
    struct VirtQueue *vvq = virtio_get_queue(vdev, idx);

J
Jason Wang 已提交
663 664
    assert(idx >= dev->vq_index && idx < dev->vq_index + dev->nvqs);

M
Michael S. Tsirkin 已提交
665
    vq->num = state.num = virtio_queue_get_num(vdev, idx);
666
    r = dev->vhost_ops->vhost_call(dev, VHOST_SET_VRING_NUM, &state);
M
Michael S. Tsirkin 已提交
667 668 669 670 671
    if (r) {
        return -errno;
    }

    state.num = virtio_queue_get_last_avail_idx(vdev, idx);
672
    r = dev->vhost_ops->vhost_call(dev, VHOST_SET_VRING_BASE, &state);
M
Michael S. Tsirkin 已提交
673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706
    if (r) {
        return -errno;
    }

    s = l = virtio_queue_get_desc_size(vdev, idx);
    a = virtio_queue_get_desc_addr(vdev, idx);
    vq->desc = cpu_physical_memory_map(a, &l, 0);
    if (!vq->desc || l != s) {
        r = -ENOMEM;
        goto fail_alloc_desc;
    }
    s = l = virtio_queue_get_avail_size(vdev, idx);
    a = virtio_queue_get_avail_addr(vdev, idx);
    vq->avail = cpu_physical_memory_map(a, &l, 0);
    if (!vq->avail || l != s) {
        r = -ENOMEM;
        goto fail_alloc_avail;
    }
    vq->used_size = s = l = virtio_queue_get_used_size(vdev, idx);
    vq->used_phys = a = virtio_queue_get_used_addr(vdev, idx);
    vq->used = cpu_physical_memory_map(a, &l, 1);
    if (!vq->used || l != s) {
        r = -ENOMEM;
        goto fail_alloc_used;
    }

    vq->ring_size = s = l = virtio_queue_get_ring_size(vdev, idx);
    vq->ring_phys = a = virtio_queue_get_ring_addr(vdev, idx);
    vq->ring = cpu_physical_memory_map(a, &l, 1);
    if (!vq->ring || l != s) {
        r = -ENOMEM;
        goto fail_alloc_ring;
    }

J
Jason Wang 已提交
707
    r = vhost_virtqueue_set_addr(dev, vq, vhost_vq_index, dev->log_enabled);
M
Michael S. Tsirkin 已提交
708 709 710 711
    if (r < 0) {
        r = -errno;
        goto fail_alloc;
    }
J
Jason Wang 已提交
712

M
Michael S. Tsirkin 已提交
713
    file.fd = event_notifier_get_fd(virtio_queue_get_host_notifier(vvq));
714
    r = dev->vhost_ops->vhost_call(dev, VHOST_SET_VRING_KICK, &file);
M
Michael S. Tsirkin 已提交
715
    if (r) {
M
Michael S. Tsirkin 已提交
716
        r = -errno;
M
Michael S. Tsirkin 已提交
717 718 719
        goto fail_kick;
    }

720 721
    /* Clear and discard previous events if any. */
    event_notifier_test_and_clear(&vq->masked_notifier);
M
Michael S. Tsirkin 已提交
722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741

    return 0;

fail_kick:
fail_alloc:
    cpu_physical_memory_unmap(vq->ring, virtio_queue_get_ring_size(vdev, idx),
                              0, 0);
fail_alloc_ring:
    cpu_physical_memory_unmap(vq->used, virtio_queue_get_used_size(vdev, idx),
                              0, 0);
fail_alloc_used:
    cpu_physical_memory_unmap(vq->avail, virtio_queue_get_avail_size(vdev, idx),
                              0, 0);
fail_alloc_avail:
    cpu_physical_memory_unmap(vq->desc, virtio_queue_get_desc_size(vdev, idx),
                              0, 0);
fail_alloc_desc:
    return r;
}

742
static void vhost_virtqueue_stop(struct vhost_dev *dev,
M
Michael S. Tsirkin 已提交
743 744 745 746 747
                                    struct VirtIODevice *vdev,
                                    struct vhost_virtqueue *vq,
                                    unsigned idx)
{
    struct vhost_vring_state state = {
J
Jason Wang 已提交
748
        .index = idx - dev->vq_index
M
Michael S. Tsirkin 已提交
749 750
    };
    int r;
J
Jason Wang 已提交
751
    assert(idx >= dev->vq_index && idx < dev->vq_index + dev->nvqs);
752
    r = dev->vhost_ops->vhost_call(dev, VHOST_GET_VRING_BASE, &state);
M
Michael S. Tsirkin 已提交
753 754 755 756 757
    if (r < 0) {
        fprintf(stderr, "vhost VQ %d ring restore failed: %d\n", idx, r);
        fflush(stderr);
    }
    virtio_queue_set_last_avail_idx(vdev, idx, state.num);
758
    virtio_queue_invalidate_signalled_used(vdev, idx);
M
Michael S. Tsirkin 已提交
759 760 761 762 763 764 765 766 767 768 769
    assert (r >= 0);
    cpu_physical_memory_unmap(vq->ring, virtio_queue_get_ring_size(vdev, idx),
                              0, virtio_queue_get_ring_size(vdev, idx));
    cpu_physical_memory_unmap(vq->used, virtio_queue_get_used_size(vdev, idx),
                              1, virtio_queue_get_used_size(vdev, idx));
    cpu_physical_memory_unmap(vq->avail, virtio_queue_get_avail_size(vdev, idx),
                              0, virtio_queue_get_avail_size(vdev, idx));
    cpu_physical_memory_unmap(vq->desc, virtio_queue_get_desc_size(vdev, idx),
                              0, virtio_queue_get_desc_size(vdev, idx));
}

770 771
/* ioeventfds are managed by virtio-bus, not by vhost itself: no-op. */
static void vhost_eventfd_add(MemoryListener *listener,
                              MemoryRegionSection *section,
                              bool match_data, uint64_t data, EventNotifier *e)
{
}

/* ioeventfds are managed by virtio-bus, not by vhost itself: no-op. */
static void vhost_eventfd_del(MemoryListener *listener,
                              MemoryRegionSection *section,
                              bool match_data, uint64_t data, EventNotifier *e)
{
}

782 783 784 785 786 787 788 789 790 791 792 793
static int vhost_virtqueue_init(struct vhost_dev *dev,
                                struct vhost_virtqueue *vq, int n)
{
    struct vhost_vring_file file = {
        .index = n,
    };
    int r = event_notifier_init(&vq->masked_notifier, 0);
    if (r < 0) {
        return r;
    }

    file.fd = event_notifier_get_fd(&vq->masked_notifier);
794
    r = dev->vhost_ops->vhost_call(dev, VHOST_SET_VRING_CALL, &file);
795 796 797 798 799 800 801 802 803 804 805 806 807 808 809
    if (r) {
        r = -errno;
        goto fail_call;
    }
    return 0;
fail_call:
    event_notifier_cleanup(&vq->masked_notifier);
    return r;
}

/* Counterpart of vhost_virtqueue_init(): release the masked notifier. */
static void vhost_virtqueue_cleanup(struct vhost_virtqueue *vq)
{
    event_notifier_cleanup(&vq->masked_notifier);
}

810
int vhost_dev_init(struct vhost_dev *hdev, void *opaque,
811
                   VhostBackendType backend_type, bool force)
M
Michael S. Tsirkin 已提交
812 813
{
    uint64_t features;
814
    int i, r;
815

816 817 818 819
    if (vhost_set_backend_type(hdev, backend_type) < 0) {
        return -1;
    }

820 821 822 823 824
    if (hdev->vhost_ops->vhost_backend_init(hdev, opaque) < 0) {
        return -errno;
    }

    r = hdev->vhost_ops->vhost_call(hdev, VHOST_SET_OWNER, NULL);
M
Michael S. Tsirkin 已提交
825 826 827 828
    if (r < 0) {
        goto fail;
    }

829
    r = hdev->vhost_ops->vhost_call(hdev, VHOST_GET_FEATURES, &features);
M
Michael S. Tsirkin 已提交
830 831 832
    if (r < 0) {
        goto fail;
    }
833 834 835 836 837 838 839

    for (i = 0; i < hdev->nvqs; ++i) {
        r = vhost_virtqueue_init(hdev, hdev->vqs + i, i);
        if (r < 0) {
            goto fail_vq;
        }
    }
M
Michael S. Tsirkin 已提交
840 841
    hdev->features = features;

A
Avi Kivity 已提交
842
    hdev->memory_listener = (MemoryListener) {
843 844
        .begin = vhost_begin,
        .commit = vhost_commit,
A
Avi Kivity 已提交
845 846
        .region_add = vhost_region_add,
        .region_del = vhost_region_del,
847
        .region_nop = vhost_region_nop,
A
Avi Kivity 已提交
848 849 850 851 852
        .log_start = vhost_log_start,
        .log_stop = vhost_log_stop,
        .log_sync = vhost_log_sync,
        .log_global_start = vhost_log_global_start,
        .log_global_stop = vhost_log_global_stop,
853 854
        .eventfd_add = vhost_eventfd_add,
        .eventfd_del = vhost_eventfd_del,
855
        .priority = 10
A
Avi Kivity 已提交
856
    };
857
    hdev->mem = g_malloc0(offsetof(struct vhost_memory, regions));
858 859
    hdev->n_mem_sections = 0;
    hdev->mem_sections = NULL;
M
Michael S. Tsirkin 已提交
860 861 862 863
    hdev->log = NULL;
    hdev->log_size = 0;
    hdev->log_enabled = false;
    hdev->started = false;
864
    hdev->memory_changed = false;
865
    memory_listener_register(&hdev->memory_listener, &address_space_memory);
866
    hdev->force = force;
M
Michael S. Tsirkin 已提交
867
    return 0;
868 869 870 871
fail_vq:
    while (--i >= 0) {
        vhost_virtqueue_cleanup(hdev->vqs + i);
    }
M
Michael S. Tsirkin 已提交
872 873
fail:
    r = -errno;
874
    hdev->vhost_ops->vhost_backend_cleanup(hdev);
M
Michael S. Tsirkin 已提交
875 876 877 878 879
    return r;
}

/* Undo vhost_dev_init(): drop per-queue notifiers, detach the memory
 * listener, free the region tables and shut down the backend. */
void vhost_dev_cleanup(struct vhost_dev *hdev)
{
    int idx;

    for (idx = 0; idx < hdev->nvqs; ++idx) {
        vhost_virtqueue_cleanup(hdev->vqs + idx);
    }
    memory_listener_unregister(&hdev->memory_listener);
    g_free(hdev->mem);
    g_free(hdev->mem_sections);
    hdev->vhost_ops->vhost_backend_cleanup(hdev);
}

/* Report whether vhost should drive @vdev: true when the transport cannot
 * report guest notifier usage, when guest notifiers are in use, or when the
 * device was configured with force=true. */
bool vhost_dev_query(struct vhost_dev *hdev, VirtIODevice *vdev)
{
    BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev)));
    VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(VIRTIO_BUS(qbus));

    if (!k->query_guest_notifiers) {
        return true;
    }
    return k->query_guest_notifiers(qbus->parent) || hdev->force;
}

/* Stop processing guest IO notifications in qemu.
 * Start processing them in vhost in kernel.
 *
 * Returns 0 on success, a negative error code on failure; on failure every
 * notifier that was already switched over is switched back.
 */
int vhost_dev_enable_notifiers(struct vhost_dev *hdev, VirtIODevice *vdev)
{
    BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev)));
    VirtioBusState *vbus = VIRTIO_BUS(qbus);
    VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(vbus);
    int i, r, e;

    if (!k->set_host_notifier) {
        fprintf(stderr, "binding does not support host notifiers\n");
        r = -ENOSYS;
        goto fail;
    }

    for (i = 0; i < hdev->nvqs; ++i) {
        r = k->set_host_notifier(qbus->parent, hdev->vq_index + i, true);
        if (r < 0) {
            fprintf(stderr, "vhost VQ %d notifier binding failed: %d\n", i, -r);
            goto fail_vq;
        }
    }

    return 0;
fail_vq:
    while (--i >= 0) {
        /* Roll back into a separate variable: reusing r would let a
         * successful rollback overwrite the error with 0, making the
         * function report success to the caller despite the failure. */
        e = k->set_host_notifier(qbus->parent, hdev->vq_index + i, false);
        if (e < 0) {
            fprintf(stderr, "vhost VQ %d notifier cleanup error: %d\n", i, -e);
            fflush(stderr);
        }
        assert (e >= 0);
    }
fail:
    return r;
}

/* Stop processing guest IO notifications in vhost.
 * Start processing them in qemu.
 * This might actually run the qemu handlers right away,
 * so virtio in qemu must be completely setup when this is called.
 */
void vhost_dev_disable_notifiers(struct vhost_dev *hdev, VirtIODevice *vdev)
{
    BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev)));
    VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(VIRTIO_BUS(qbus));
    int i;

    for (i = 0; i < hdev->nvqs; ++i) {
        int r = k->set_host_notifier(qbus->parent, hdev->vq_index + i, false);
        if (r < 0) {
            /* Print a diagnostic before the assert below aborts. */
            fprintf(stderr, "vhost VQ %d notifier cleanup failed: %d\n", i, -r);
            fflush(stderr);
        }
        assert(r >= 0);
    }
}

/* Test and clear event pending status.
 * Should be called after unmask to avoid losing events.
 */
bool vhost_virtqueue_pending(struct vhost_dev *hdev, int n)
{
J
Jason Wang 已提交
965
    struct vhost_virtqueue *vq = hdev->vqs + n - hdev->vq_index;
966
    assert(hdev->started);
J
Jason Wang 已提交
967
    assert(n >= hdev->vq_index && n < hdev->vq_index + hdev->nvqs);
968 969 970 971 972 973 974 975
    return event_notifier_test_and_clear(&vq->masked_notifier);
}

/* Mask/unmask events from this vq.
 * While masked, call events are routed to the queue's internal notifier;
 * while unmasked, they go straight to the guest notifier.
 */
void vhost_virtqueue_mask(struct vhost_dev *hdev, VirtIODevice *vdev, int n,
                         bool mask)
{
    struct VirtQueue *vvq = virtio_get_queue(vdev, n);
    int index = n - hdev->vq_index;
    int r;

    assert(hdev->started);
    assert(n >= hdev->vq_index && n < hdev->vq_index + hdev->nvqs);

    struct vhost_vring_file file = {
        .index = index,
        .fd = mask
            ? event_notifier_get_fd(&hdev->vqs[index].masked_notifier)
            : event_notifier_get_fd(virtio_queue_get_guest_notifier(vvq)),
    };
    r = hdev->vhost_ops->vhost_call(hdev, VHOST_SET_VRING_CALL, &file);
    assert(r >= 0);
}

/* Filter @features: clear every bit listed in @feature_bits that the vhost
 * backend does not support.  @feature_bits is terminated by
 * VHOST_INVALID_FEATURE_BIT.  Returns the filtered feature set. */
unsigned vhost_get_features(struct vhost_dev *hdev, const int *feature_bits,
        unsigned features)
{
    const int *p;

    for (p = feature_bits; *p != VHOST_INVALID_FEATURE_BIT; p++) {
        unsigned mask = 1 << *p;
        if (!(hdev->features & mask)) {
            features &= ~mask;
        }
    }
    return features;
}

/* Record in hdev->acked_features every bit listed in @feature_bits that the
 * guest acknowledged in @features.  @feature_bits is terminated by
 * VHOST_INVALID_FEATURE_BIT. */
void vhost_ack_features(struct vhost_dev *hdev, const int *feature_bits,
        unsigned features)
{
    const int *p;

    for (p = feature_bits; *p != VHOST_INVALID_FEATURE_BIT; p++) {
        unsigned mask = 1 << *p;
        if (features & mask) {
            hdev->acked_features |= mask;
        }
    }
}

/* Host notifiers must be enabled at this point. */
M
Michael S. Tsirkin 已提交
1021 1022 1023
int vhost_dev_start(struct vhost_dev *hdev, VirtIODevice *vdev)
{
    int i, r;
1024 1025 1026

    hdev->started = true;

M
Michael S. Tsirkin 已提交
1027 1028
    r = vhost_dev_set_features(hdev, hdev->log_enabled);
    if (r < 0) {
1029
        goto fail_features;
M
Michael S. Tsirkin 已提交
1030
    }
1031
    r = hdev->vhost_ops->vhost_call(hdev, VHOST_SET_MEM_TABLE, hdev->mem);
M
Michael S. Tsirkin 已提交
1032 1033
    if (r < 0) {
        r = -errno;
1034
        goto fail_mem;
M
Michael S. Tsirkin 已提交
1035
    }
1036
    for (i = 0; i < hdev->nvqs; ++i) {
1037
        r = vhost_virtqueue_start(hdev,
J
Jason Wang 已提交
1038 1039 1040
                                  vdev,
                                  hdev->vqs + i,
                                  hdev->vq_index + i);
1041 1042 1043 1044 1045
        if (r < 0) {
            goto fail_vq;
        }
    }

M
Michael S. Tsirkin 已提交
1046 1047 1048
    if (hdev->log_enabled) {
        hdev->log_size = vhost_get_log_size(hdev);
        hdev->log = hdev->log_size ?
1049
            g_malloc0(hdev->log_size * sizeof *hdev->log) : NULL;
1050
        r = hdev->vhost_ops->vhost_call(hdev, VHOST_SET_LOG_BASE, hdev->log);
M
Michael S. Tsirkin 已提交
1051 1052
        if (r < 0) {
            r = -errno;
1053
            goto fail_log;
M
Michael S. Tsirkin 已提交
1054 1055
        }
    }
1056

M
Michael S. Tsirkin 已提交
1057
    return 0;
1058
fail_log:
M
Michael S. Tsirkin 已提交
1059 1060
fail_vq:
    while (--i >= 0) {
1061
        vhost_virtqueue_stop(hdev,
J
Jason Wang 已提交
1062 1063 1064
                             vdev,
                             hdev->vqs + i,
                             hdev->vq_index + i);
M
Michael S. Tsirkin 已提交
1065
    }
J
Jason Wang 已提交
1066
    i = hdev->nvqs;
1067 1068
fail_mem:
fail_features:
1069 1070

    hdev->started = false;
M
Michael S. Tsirkin 已提交
1071 1072 1073
    return r;
}

/* Host notifiers must be enabled at this point. */
M
Michael S. Tsirkin 已提交
1075 1076
void vhost_dev_stop(struct vhost_dev *hdev, VirtIODevice *vdev)
{
J
Jason Wang 已提交
1077
    int i;
1078

M
Michael S. Tsirkin 已提交
1079
    for (i = 0; i < hdev->nvqs; ++i) {
1080
        vhost_virtqueue_stop(hdev,
J
Jason Wang 已提交
1081 1082 1083
                             vdev,
                             hdev->vqs + i,
                             hdev->vq_index + i);
M
Michael S. Tsirkin 已提交
1084
    }
M
Michael S. Tsirkin 已提交
1085
    vhost_log_sync_range(hdev, 0, ~0x0ull);
1086

M
Michael S. Tsirkin 已提交
1087
    hdev->started = false;
1088
    g_free(hdev->log);
1089
    hdev->log = NULL;
M
Michael S. Tsirkin 已提交
1090 1091
    hdev->log_size = 0;
}