/*
 * Copyright (C) 2010       Citrix Ltd.
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 *
 * Contributions after 2012-01-13 are licensed under the terms of the
 * GNU GPL, version 2 or (at your option) any later version.
 */

#include "qemu/osdep.h"

#include "cpu.h"
#include "hw/pci/pci.h"
#include "hw/pci/pci_host.h"
#include "hw/i386/pc.h"
#include "hw/i386/apic-msidef.h"
#include "hw/xen/xen_common.h"
#include "hw/xen/xen_backend.h"
#include "qapi/error.h"
#include "qapi/qapi-commands-misc.h"
#include "qemu/error-report.h"
#include "qemu/range.h"
#include "sysemu/xen-mapcache.h"
#include "trace.h"
#include "exec/address-spaces.h"

#include <xen/hvm/ioreq.h>
#include <xen/hvm/params.h>
#include <xen/hvm/e820.h>

//#define DEBUG_XEN_HVM

#ifdef DEBUG_XEN_HVM
#define DPRINTF(fmt, ...) \
    do { fprintf(stderr, "xen: " fmt, ## __VA_ARGS__); } while (0)
#else
#define DPRINTF(fmt, ...) \
    do { } while (0)
#endif

static MemoryRegion ram_memory, ram_640k, ram_lo, ram_hi;
static MemoryRegion *framebuffer;
static bool xen_in_migration;

/* Compatibility with older versions */

/* This allows QEMU to build on a system that has Xen 4.5 or earlier
 * installed.  It is here (not in hw/xen/xen_common.h) because
 * xen/hvm/ioreq.h needs to be included before this block, and
 * hw/xen/xen_common.h needs to be included before xen/hvm/ioreq.h.
 */
#ifndef IOREQ_TYPE_VMWARE_PORT
#define IOREQ_TYPE_VMWARE_PORT  3
struct vmware_regs {
    uint32_t esi;
    uint32_t edi;
    uint32_t ebx;
    uint32_t ecx;
    uint32_t edx;
};
typedef struct vmware_regs vmware_regs_t;

struct shared_vmport_iopage {
    struct vmware_regs vcpu_vmport_regs[1];
};
typedef struct shared_vmport_iopage shared_vmport_iopage_t;
#endif

static inline uint32_t xen_vcpu_eport(shared_iopage_t *shared_page, int i)
{
    return shared_page->vcpu_ioreq[i].vp_eport;
}
static inline ioreq_t *xen_vcpu_ioreq(shared_iopage_t *shared_page, int vcpu)
{
    return &shared_page->vcpu_ioreq[vcpu];
}

#define BUFFER_IO_MAX_DELAY  100

typedef struct XenPhysmap {
    hwaddr start_addr;
    ram_addr_t size;
    const char *name;
    hwaddr phys_offset;

    QLIST_ENTRY(XenPhysmap) list;
} XenPhysmap;

static QLIST_HEAD(, XenPhysmap) xen_physmap;

typedef struct XenPciDevice {
    PCIDevice *pci_dev;
    uint32_t sbdf;
    QLIST_ENTRY(XenPciDevice) entry;
} XenPciDevice;

typedef struct XenIOState {
    ioservid_t ioservid;
    shared_iopage_t *shared_page;
    shared_vmport_iopage_t *shared_vmport_page;
    buffered_iopage_t *buffered_io_page;
    QEMUTimer *buffered_io_timer;
    CPUState **cpu_by_vcpu_id;
    /* the evtchn port for polling the notification */
    evtchn_port_t *ioreq_local_port;
    /* evtchn remote and local ports for buffered io */
    evtchn_port_t bufioreq_remote_port;
    evtchn_port_t bufioreq_local_port;
    /* the evtchn fd for polling */
    xenevtchn_handle *xce_handle;
    /* which vcpu we are serving */
    int send_vcpu;

    struct xs_handle *xenstore;
    MemoryListener memory_listener;
    MemoryListener io_listener;
    QLIST_HEAD(, XenPciDevice) dev_list;
    DeviceListener device_listener;
    hwaddr free_phys_offset;
    const XenPhysmap *log_for_dirtybit;

    Notifier exit;
    Notifier suspend;
    Notifier wakeup;
} XenIOState;

/* Xen-specific functions for the PIIX PCI bridge */

int xen_pci_slot_get_pirq(PCIDevice *pci_dev, int irq_num)
{
    return irq_num + ((pci_dev->devfn >> 3) << 2);
}

void xen_piix3_set_irq(void *opaque, int irq_num, int level)
{
    xen_set_pci_intx_level(xen_domid, 0, 0, irq_num >> 2,
                           irq_num & 3, level);
}

void xen_piix_pci_write_config_client(uint32_t address, uint32_t val, int len)
{
    int i;

    /* Scan for updates to PCI link routes (0x60-0x63). */
    for (i = 0; i < len; i++) {
        uint8_t v = (val >> (8 * i)) & 0xff;
        if (v & 0x80) {
            v = 0;
        }
        v &= 0xf;
        if (((address + i) >= 0x60) && ((address + i) <= 0x63)) {
            xen_set_pci_link_route(xen_domid, address + i - 0x60, v);
        }
    }
}
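
/*
 * On the PIIX3, the PIRQA-D route control registers live at config
 * offsets 0x60-0x63: bit 7 set means the link is disabled, and bits 3:0
 * select the ISA IRQ.  That is why the scan above clears disabled
 * entries to 0 before handing the route to Xen.
 */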

int xen_is_pirq_msi(uint32_t msi_data)
{
    /* If vector is 0, the msi is remapped into a pirq, passed as
     * dest_id.
     */
    return ((msi_data & MSI_DATA_VECTOR_MASK) >> MSI_DATA_VECTOR_SHIFT) == 0;
}

void xen_hvm_inject_msi(uint64_t addr, uint32_t data)
{
    xen_inject_msi(xen_domid, addr, data);
}

static void xen_suspend_notifier(Notifier *notifier, void *data)
{
    xc_set_hvm_param(xen_xc, xen_domid, HVM_PARAM_ACPI_S_STATE, 3);
}

/* Xen Interrupt Controller */

static void xen_set_irq(void *opaque, int irq, int level)
{
    xen_set_isa_irq_level(xen_domid, irq, level);
}

qemu_irq *xen_interrupt_controller_init(void)
{
    return qemu_allocate_irqs(xen_set_irq, NULL, 16);
}

/* Memory Ops */

static void xen_ram_init(PCMachineState *pcms,
                         ram_addr_t ram_size, MemoryRegion **ram_memory_p)
{
    MemoryRegion *sysmem = get_system_memory();
    ram_addr_t block_len;
    uint64_t user_lowmem = object_property_get_uint(qdev_get_machine(),
                                                    PC_MACHINE_MAX_RAM_BELOW_4G,
                                                    &error_abort);

    /* Handle the machine option max-ram-below-4g.  It is basically doing
     * min(xen limit, user limit).
     */
    if (!user_lowmem) {
        user_lowmem = HVM_BELOW_4G_RAM_END; /* default */
    }
    if (HVM_BELOW_4G_RAM_END <= user_lowmem) {
        user_lowmem = HVM_BELOW_4G_RAM_END;
    }

    if (ram_size >= user_lowmem) {
        pcms->above_4g_mem_size = ram_size - user_lowmem;
        pcms->below_4g_mem_size = user_lowmem;
    } else {
        pcms->above_4g_mem_size = 0;
        pcms->below_4g_mem_size = ram_size;
    }
    if (!pcms->above_4g_mem_size) {
        block_len = ram_size;
    } else {
        /*
         * Xen does not allocate the memory contiguously; it keeps a
         * hole of the size computed above or passed in.
         */
        block_len = (1ULL << 32) + pcms->above_4g_mem_size;
    }
    memory_region_init_ram(&ram_memory, NULL, "xen.ram", block_len,
                           &error_fatal);
    *ram_memory_p = &ram_memory;

    memory_region_init_alias(&ram_640k, NULL, "xen.ram.640k",
                             &ram_memory, 0, 0xa0000);
    memory_region_add_subregion(sysmem, 0, &ram_640k);
    /* Skip the VGA I/O memory space; it will be registered later by the
     * emulated VGA device.
     *
     * The area between 0xc0000 and 0x100000 will be used by SeaBIOS to
     * load option ROMs, so it is registered here as RAM.
     */
    memory_region_init_alias(&ram_lo, NULL, "xen.ram.lo",
                             &ram_memory, 0xc0000,
                             pcms->below_4g_mem_size - 0xc0000);
    memory_region_add_subregion(sysmem, 0xc0000, &ram_lo);
    if (pcms->above_4g_mem_size > 0) {
        memory_region_init_alias(&ram_hi, NULL, "xen.ram.hi",
                                 &ram_memory, 0x100000000ULL,
                                 pcms->above_4g_mem_size);
        memory_region_add_subregion(sysmem, 0x100000000ULL, &ram_hi);
    }
}

void xen_ram_alloc(ram_addr_t ram_addr, ram_addr_t size, MemoryRegion *mr,
                   Error **errp)
{
    unsigned long nr_pfn;
    xen_pfn_t *pfn_list;
    int i;

    if (runstate_check(RUN_STATE_INMIGRATE)) {
        /* RAM already populated in Xen */
        fprintf(stderr, "%s: do not alloc "RAM_ADDR_FMT
                " bytes of ram at "RAM_ADDR_FMT" when runstate is INMIGRATE\n",
                __func__, size, ram_addr);
        return;
    }

    if (mr == &ram_memory) {
        return;
    }

    trace_xen_ram_alloc(ram_addr, size);

    nr_pfn = size >> TARGET_PAGE_BITS;
    pfn_list = g_malloc(sizeof (*pfn_list) * nr_pfn);

    for (i = 0; i < nr_pfn; i++) {
        pfn_list[i] = (ram_addr >> TARGET_PAGE_BITS) + i;
    }

    if (xc_domain_populate_physmap_exact(xen_xc, xen_domid, nr_pfn, 0, 0, pfn_list)) {
        error_setg(errp, "xen: failed to populate ram at " RAM_ADDR_FMT,
                   ram_addr);
    }

    g_free(pfn_list);
}

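/*
 * A XenPhysmap entry records where a QEMU RAM block (identified by the
 * offset of its backing RAM, phys_offset) is currently mapped in guest
 * physical address space; the helpers below translate between the two
 * address spaces.
 */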
static XenPhysmap *get_physmapping(hwaddr start_addr, ram_addr_t size)
{
    XenPhysmap *physmap = NULL;

    start_addr &= TARGET_PAGE_MASK;

    QLIST_FOREACH(physmap, &xen_physmap, list) {
        if (range_covers_byte(physmap->start_addr, physmap->size, start_addr)) {
            return physmap;
        }
    }
    return NULL;
}

static hwaddr xen_phys_offset_to_gaddr(hwaddr phys_offset, ram_addr_t size)
{
    hwaddr addr = phys_offset & TARGET_PAGE_MASK;
    XenPhysmap *physmap = NULL;

    QLIST_FOREACH(physmap, &xen_physmap, list) {
        if (range_covers_byte(physmap->phys_offset, physmap->size, addr)) {
            return physmap->start_addr + (phys_offset - physmap->phys_offset);
        }
    }

    return phys_offset;
}

#ifdef XEN_COMPAT_PHYSMAP
static int xen_save_physmap(XenIOState *state, XenPhysmap *physmap)
{
    char path[80], value[17];

    snprintf(path, sizeof(path),
            "/local/domain/0/device-model/%d/physmap/%"PRIx64"/start_addr",
            xen_domid, (uint64_t)physmap->phys_offset);
    snprintf(value, sizeof(value), "%"PRIx64, (uint64_t)physmap->start_addr);
    if (!xs_write(state->xenstore, 0, path, value, strlen(value))) {
        return -1;
    }
    snprintf(path, sizeof(path),
            "/local/domain/0/device-model/%d/physmap/%"PRIx64"/size",
            xen_domid, (uint64_t)physmap->phys_offset);
    snprintf(value, sizeof(value), "%"PRIx64, (uint64_t)physmap->size);
    if (!xs_write(state->xenstore, 0, path, value, strlen(value))) {
        return -1;
    }
    if (physmap->name) {
        snprintf(path, sizeof(path),
                "/local/domain/0/device-model/%d/physmap/%"PRIx64"/name",
                xen_domid, (uint64_t)physmap->phys_offset);
        if (!xs_write(state->xenstore, 0, path,
                      physmap->name, strlen(physmap->name))) {
            return -1;
        }
    }
    return 0;
}
#else
static int xen_save_physmap(XenIOState *state, XenPhysmap *physmap)
{
    return 0;
}
#endif
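
/*
 * The physmap entries saved above are persisted under
 * /local/domain/0/device-model/<domid>/physmap in xenstore, so that an
 * incoming QEMU (e.g. after migration) can rebuild the mapping via
 * xen_read_physmap() below.
 */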

static int xen_add_to_physmap(XenIOState *state,
                              hwaddr start_addr,
                              ram_addr_t size,
                              MemoryRegion *mr,
                              hwaddr offset_within_region)
{
    unsigned long nr_pages;
    int rc = 0;
    XenPhysmap *physmap = NULL;
    hwaddr pfn, start_gpfn;
    hwaddr phys_offset = memory_region_get_ram_addr(mr);
    const char *mr_name;

    if (get_physmapping(start_addr, size)) {
        return 0;
    }
    if (size <= 0) {
        return -1;
    }

    /* Xen can only handle a single dirty log region for now and we want
     * the linear framebuffer to be that region.
     * Avoid tracking any region that is not videoram and avoid tracking
     * the legacy VGA region. */
    if (mr == framebuffer && start_addr > 0xbffff) {
        goto go_physmap;
    }
    return -1;

go_physmap:
    DPRINTF("mapping vram to %"HWADDR_PRIx" - %"HWADDR_PRIx"\n",
            start_addr, start_addr + size);

    mr_name = memory_region_name(mr);

    physmap = g_malloc(sizeof(XenPhysmap));

    physmap->start_addr = start_addr;
    physmap->size = size;
    physmap->name = mr_name;
    physmap->phys_offset = phys_offset;

    QLIST_INSERT_HEAD(&xen_physmap, physmap, list);

    if (runstate_check(RUN_STATE_INMIGRATE)) {
        /* Now that we have a physmap entry we can replace the dummy mapping
         * with a real one of guest foreign memory. */
        uint8_t *p = xen_replace_cache_entry(phys_offset, start_addr, size);
        assert(p && p == memory_region_get_ram_ptr(mr));

        return 0;
    }

    pfn = phys_offset >> TARGET_PAGE_BITS;
    start_gpfn = start_addr >> TARGET_PAGE_BITS;
    nr_pages = size >> TARGET_PAGE_BITS;
    rc = xendevicemodel_relocate_memory(xen_dmod, xen_domid, nr_pages, pfn,
                                        start_gpfn);
    if (rc) {
        int saved_errno = errno;

        error_report("relocate_memory %lu pages from GFN %"HWADDR_PRIx
                     " to GFN %"HWADDR_PRIx" failed: %s",
                     nr_pages, pfn, start_gpfn, strerror(saved_errno));
        errno = saved_errno;
        return -1;
    }

    rc = xendevicemodel_pin_memory_cacheattr(xen_dmod, xen_domid,
                                   start_addr >> TARGET_PAGE_BITS,
                                   (start_addr + size - 1) >> TARGET_PAGE_BITS,
                                   XEN_DOMCTL_MEM_CACHEATTR_WB);
    if (rc) {
        error_report("pin_memory_cacheattr failed: %s", strerror(errno));
    }
    return xen_save_physmap(state, physmap);
}

static int xen_remove_from_physmap(XenIOState *state,
                                   hwaddr start_addr,
                                   ram_addr_t size)
{
    int rc = 0;
    XenPhysmap *physmap = NULL;
    hwaddr phys_offset = 0;

    physmap = get_physmapping(start_addr, size);
    if (physmap == NULL) {
        return -1;
    }

    phys_offset = physmap->phys_offset;
    size = physmap->size;

    DPRINTF("unmapping vram to %"HWADDR_PRIx" - %"HWADDR_PRIx", at "
            "%"HWADDR_PRIx"\n", start_addr, start_addr + size, phys_offset);

    size >>= TARGET_PAGE_BITS;
    start_addr >>= TARGET_PAGE_BITS;
    phys_offset >>= TARGET_PAGE_BITS;
    rc = xendevicemodel_relocate_memory(xen_dmod, xen_domid, size, start_addr,
                                        phys_offset);
    if (rc) {
        int saved_errno = errno;

        error_report("relocate_memory "RAM_ADDR_FMT" pages"
                     " from GFN %"HWADDR_PRIx
                     " to GFN %"HWADDR_PRIx" failed: %s",
                     size, start_addr, phys_offset, strerror(saved_errno));
        errno = saved_errno;
        return -1;
    }

    QLIST_REMOVE(physmap, list);
    if (state->log_for_dirtybit == physmap) {
        state->log_for_dirtybit = NULL;
    }
    g_free(physmap);

    return 0;
}

static void xen_set_memory(struct MemoryListener *listener,
                           MemoryRegionSection *section,
                           bool add)
{
    XenIOState *state = container_of(listener, XenIOState, memory_listener);
    hwaddr start_addr = section->offset_within_address_space;
    ram_addr_t size = int128_get64(section->size);
    bool log_dirty = memory_region_is_logging(section->mr, DIRTY_MEMORY_VGA);
    hvmmem_type_t mem_type;

    if (section->mr == &ram_memory) {
        return;
    } else {
        if (add) {
            xen_map_memory_section(xen_domid, state->ioservid,
                                   section);
        } else {
            xen_unmap_memory_section(xen_domid, state->ioservid,
                                     section);
        }
    }

    if (!memory_region_is_ram(section->mr)) {
        return;
    }

    if (log_dirty != add) {
        return;
    }

    trace_xen_client_set_memory(start_addr, size, log_dirty);

    start_addr &= TARGET_PAGE_MASK;
    size = TARGET_PAGE_ALIGN(size);

    if (add) {
        if (!memory_region_is_rom(section->mr)) {
            xen_add_to_physmap(state, start_addr, size,
                               section->mr, section->offset_within_region);
        } else {
            mem_type = HVMMEM_ram_ro;
            if (xen_set_mem_type(xen_domid, mem_type,
                                 start_addr >> TARGET_PAGE_BITS,
                                 size >> TARGET_PAGE_BITS)) {
                DPRINTF("xen_set_mem_type error, addr: "TARGET_FMT_plx"\n",
                        start_addr);
            }
        }
    } else {
        if (xen_remove_from_physmap(state, start_addr, size) < 0) {
            DPRINTF("physmapping does not exist at "TARGET_FMT_plx"\n", start_addr);
        }
    }
}

static void xen_region_add(MemoryListener *listener,
                           MemoryRegionSection *section)
{
    memory_region_ref(section->mr);
    xen_set_memory(listener, section, true);
}

static void xen_region_del(MemoryListener *listener,
                           MemoryRegionSection *section)
{
    xen_set_memory(listener, section, false);
    memory_region_unref(section->mr);
}

static void xen_io_add(MemoryListener *listener,
                       MemoryRegionSection *section)
{
    XenIOState *state = container_of(listener, XenIOState, io_listener);
    MemoryRegion *mr = section->mr;

    if (mr->ops == &unassigned_io_ops) {
        return;
    }

    memory_region_ref(mr);

    xen_map_io_section(xen_domid, state->ioservid, section);
}

static void xen_io_del(MemoryListener *listener,
                       MemoryRegionSection *section)
{
    XenIOState *state = container_of(listener, XenIOState, io_listener);
    MemoryRegion *mr = section->mr;

    if (mr->ops == &unassigned_io_ops) {
        return;
    }

    xen_unmap_io_section(xen_domid, state->ioservid, section);

    memory_region_unref(mr);
}

static void xen_device_realize(DeviceListener *listener,
                               DeviceState *dev)
{
    XenIOState *state = container_of(listener, XenIOState, device_listener);

    if (object_dynamic_cast(OBJECT(dev), TYPE_PCI_DEVICE)) {
        PCIDevice *pci_dev = PCI_DEVICE(dev);
        XenPciDevice *xendev = g_new(XenPciDevice, 1);

        xendev->pci_dev = pci_dev;
        xendev->sbdf = PCI_BUILD_BDF(pci_dev_bus_num(pci_dev),
                                     pci_dev->devfn);
        QLIST_INSERT_HEAD(&state->dev_list, xendev, entry);

        xen_map_pcidev(xen_domid, state->ioservid, pci_dev);
    }
}

static void xen_device_unrealize(DeviceListener *listener,
                                 DeviceState *dev)
{
    XenIOState *state = container_of(listener, XenIOState, device_listener);

    if (object_dynamic_cast(OBJECT(dev), TYPE_PCI_DEVICE)) {
        PCIDevice *pci_dev = PCI_DEVICE(dev);
        XenPciDevice *xendev, *next;

        xen_unmap_pcidev(xen_domid, state->ioservid, pci_dev);

        QLIST_FOREACH_SAFE(xendev, &state->dev_list, entry, next) {
            if (xendev->pci_dev == pci_dev) {
                QLIST_REMOVE(xendev, entry);
                g_free(xendev);
                break;
            }
        }
    }
}

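/*
 * Ask Xen which pages of the tracked VRAM range were written since the
 * last call and forward them to QEMU's dirty memory API: each set bit
 * in the returned bitmap marks one dirtied TARGET_PAGE_SIZE page of
 * the framebuffer.
 */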
static void xen_sync_dirty_bitmap(XenIOState *state,
                                  hwaddr start_addr,
                                  ram_addr_t size)
{
    hwaddr npages = size >> TARGET_PAGE_BITS;
    const int width = sizeof(unsigned long) * 8;
    unsigned long bitmap[DIV_ROUND_UP(npages, width)];
    int rc, i, j;
    const XenPhysmap *physmap = NULL;

    physmap = get_physmapping(start_addr, size);
    if (physmap == NULL) {
        /* not handled */
        return;
    }

    if (state->log_for_dirtybit == NULL) {
        state->log_for_dirtybit = physmap;
    } else if (state->log_for_dirtybit != physmap) {
        /* Only one range for dirty bitmap can be tracked. */
        return;
    }

    rc = xen_track_dirty_vram(xen_domid, start_addr >> TARGET_PAGE_BITS,
                              npages, bitmap);
    if (rc < 0) {
#ifndef ENODATA
#define ENODATA  ENOENT
#endif
        if (errno == ENODATA) {
            memory_region_set_dirty(framebuffer, 0, size);
            DPRINTF("xen: track_dirty_vram failed (0x" TARGET_FMT_plx
                    ", 0x" TARGET_FMT_plx "): %s\n",
                    start_addr, start_addr + size, strerror(errno));
        }
        return;
    }

    for (i = 0; i < ARRAY_SIZE(bitmap); i++) {
        unsigned long map = bitmap[i];
        while (map != 0) {
            j = ctzl(map);
            map &= ~(1ul << j);
            memory_region_set_dirty(framebuffer,
                                    (i * width + j) * TARGET_PAGE_SIZE,
                                    TARGET_PAGE_SIZE);
        }
    }
}

static void xen_log_start(MemoryListener *listener,
                          MemoryRegionSection *section,
                          int old, int new)
{
    XenIOState *state = container_of(listener, XenIOState, memory_listener);

    if (new & ~old & (1 << DIRTY_MEMORY_VGA)) {
        xen_sync_dirty_bitmap(state, section->offset_within_address_space,
                              int128_get64(section->size));
    }
}

static void xen_log_stop(MemoryListener *listener, MemoryRegionSection *section,
                         int old, int new)
{
    XenIOState *state = container_of(listener, XenIOState, memory_listener);

    if (old & ~new & (1 << DIRTY_MEMORY_VGA)) {
        state->log_for_dirtybit = NULL;
        /* Disable dirty bit tracking */
        xen_track_dirty_vram(xen_domid, 0, 0, NULL);
    }
}

static void xen_log_sync(MemoryListener *listener, MemoryRegionSection *section)
{
    XenIOState *state = container_of(listener, XenIOState, memory_listener);

    xen_sync_dirty_bitmap(state, section->offset_within_address_space,
                          int128_get64(section->size));
}

static void xen_log_global_start(MemoryListener *listener)
{
    if (xen_enabled()) {
        xen_in_migration = true;
    }
}

static void xen_log_global_stop(MemoryListener *listener)
{
    xen_in_migration = false;
}

static MemoryListener xen_memory_listener = {
    .region_add = xen_region_add,
    .region_del = xen_region_del,
    .log_start = xen_log_start,
    .log_stop = xen_log_stop,
    .log_sync = xen_log_sync,
    .log_global_start = xen_log_global_start,
    .log_global_stop = xen_log_global_stop,
    .priority = 10,
};

static MemoryListener xen_io_listener = {
    .region_add = xen_io_add,
    .region_del = xen_io_del,
    .priority = 10,
};

static DeviceListener xen_device_listener = {
    .realize = xen_device_realize,
    .unrealize = xen_device_unrealize,
};
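
/*
 * Listener priority note: for MemoryListeners, "add" callbacks run in
 * ascending priority order and "del" callbacks in descending order, so
 * priority 10 places these alongside the other accelerators' listeners.
 */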

/* get the ioreq packets from shared memory */
static ioreq_t *cpu_get_ioreq_from_shared_memory(XenIOState *state, int vcpu)
{
    ioreq_t *req = xen_vcpu_ioreq(state->shared_page, vcpu);

    if (req->state != STATE_IOREQ_READY) {
        DPRINTF("I/O request not ready: "
                "%x, ptr: %x, port: %"PRIx64", "
                "data: %"PRIx64", count: %u, size: %u\n",
                req->state, req->data_is_ptr, req->addr,
                req->data, req->count, req->size);
        return NULL;
    }

    xen_rmb(); /* see IOREQ_READY /then/ read contents of ioreq */

    req->state = STATE_IOREQ_INPROCESS;
    return req;
}

/*
 * Use poll to get the port notification, then fetch the pending ioreq
 * from shared memory.  Returns the ioreq to service, or NULL if there
 * is none (or if it was a buffered-io notification, which is handled
 * by a timer instead).
 */
static ioreq_t *cpu_get_ioreq(XenIOState *state)
{
    int i;
    evtchn_port_t port;

    port = xenevtchn_pending(state->xce_handle);
    if (port == state->bufioreq_local_port) {
        timer_mod(state->buffered_io_timer,
                BUFFER_IO_MAX_DELAY + qemu_clock_get_ms(QEMU_CLOCK_REALTIME));
        return NULL;
    }

    if (port != -1) {
        for (i = 0; i < max_cpus; i++) {
            if (state->ioreq_local_port[i] == port) {
                break;
            }
        }

        if (i == max_cpus) {
            hw_error("Fatal error while trying to get io event!\n");
        }

        /* unmask the wanted port again */
        xenevtchn_unmask(state->xce_handle, port);

        /* get the io packet from shared memory */
        state->send_vcpu = i;
        return cpu_get_ioreq_from_shared_memory(state, i);
    }

    /* read error or read nothing */
    return NULL;
}

static uint32_t do_inp(uint32_t addr, unsigned long size)
{
    switch (size) {
        case 1:
            return cpu_inb(addr);
        case 2:
            return cpu_inw(addr);
        case 4:
            return cpu_inl(addr);
        default:
            hw_error("inp: bad size: %04x %lx", addr, size);
    }
}

static void do_outp(uint32_t addr,
        unsigned long size, uint32_t val)
{
    switch (size) {
        case 1:
            return cpu_outb(addr, val);
        case 2:
            return cpu_outw(addr, val);
        case 4:
            return cpu_outl(addr, val);
        default:
            hw_error("outp: bad size: %04x %lx", addr, size);
    }
}

/*
 * Helper functions which read/write an object from/to physical guest
 * memory, as part of the implementation of an ioreq.
 *
 * Equivalent to
 *   cpu_physical_memory_rw(addr + (req->df ? -1 : +1) * req->size * i,
 *                          val, req->size, 0/1)
 * except without the integer overflow problems.
 */
static void rw_phys_req_item(hwaddr addr,
                             ioreq_t *req, uint32_t i, void *val, int rw)
{
    /* Do everything unsigned so overflow just results in a truncated result
     * and accesses to undesired parts of guest memory, which is up
     * to the guest */
    hwaddr offset = (hwaddr)req->size * i;
    if (req->df) {
        addr -= offset;
    } else {
        addr += offset;
    }
    cpu_physical_memory_rw(addr, val, req->size, rw);
}

static inline void read_phys_req_item(hwaddr addr,
                                      ioreq_t *req, uint32_t i, void *val)
{
    rw_phys_req_item(addr, req, i, val, 0);
}
static inline void write_phys_req_item(hwaddr addr,
                                       ioreq_t *req, uint32_t i, void *val)
{
    rw_phys_req_item(addr, req, i, val, 1);
}

static void cpu_ioreq_pio(ioreq_t *req)
{
    uint32_t i;

    trace_cpu_ioreq_pio(req, req->dir, req->df, req->data_is_ptr, req->addr,
                         req->data, req->count, req->size);

    if (req->size > sizeof(uint32_t)) {
        hw_error("PIO: bad size (%u)", req->size);
    }

    if (req->dir == IOREQ_READ) {
        if (!req->data_is_ptr) {
            req->data = do_inp(req->addr, req->size);
            trace_cpu_ioreq_pio_read_reg(req, req->data, req->addr,
                                         req->size);
        } else {
            uint32_t tmp;

            for (i = 0; i < req->count; i++) {
                tmp = do_inp(req->addr, req->size);
                write_phys_req_item(req->data, req, i, &tmp);
            }
        }
    } else if (req->dir == IOREQ_WRITE) {
        if (!req->data_is_ptr) {
            trace_cpu_ioreq_pio_write_reg(req, req->data, req->addr,
                                          req->size);
            do_outp(req->addr, req->size, req->data);
        } else {
            for (i = 0; i < req->count; i++) {
                uint32_t tmp = 0;

                read_phys_req_item(req->data, req, i, &tmp);
                do_outp(req->addr, req->size, tmp);
            }
        }
    }
}

static void cpu_ioreq_move(ioreq_t *req)
{
    uint32_t i;

    trace_cpu_ioreq_move(req, req->dir, req->df, req->data_is_ptr, req->addr,
                         req->data, req->count, req->size);

    if (req->size > sizeof(req->data)) {
        hw_error("MMIO: bad size (%u)", req->size);
    }

    if (!req->data_is_ptr) {
        if (req->dir == IOREQ_READ) {
            for (i = 0; i < req->count; i++) {
                read_phys_req_item(req->addr, req, i, &req->data);
            }
        } else if (req->dir == IOREQ_WRITE) {
            for (i = 0; i < req->count; i++) {
                write_phys_req_item(req->addr, req, i, &req->data);
            }
        }
    } else {
        uint64_t tmp;

        if (req->dir == IOREQ_READ) {
            for (i = 0; i < req->count; i++) {
                read_phys_req_item(req->addr, req, i, &tmp);
                write_phys_req_item(req->data, req, i, &tmp);
            }
        } else if (req->dir == IOREQ_WRITE) {
            for (i = 0; i < req->count; i++) {
                read_phys_req_item(req->data, req, i, &tmp);
                write_phys_req_item(req->addr, req, i, &tmp);
            }
        }
    }
}
static void cpu_ioreq_config(XenIOState *state, ioreq_t *req)
{
    uint32_t sbdf = req->addr >> 32;
    uint32_t reg = req->addr;
    XenPciDevice *xendev;

    if (req->size != sizeof(uint8_t) && req->size != sizeof(uint16_t) &&
        req->size != sizeof(uint32_t)) {
        hw_error("PCI config access: bad size (%u)", req->size);
    }

    if (req->count != 1) {
        hw_error("PCI config access: bad count (%u)", req->count);
    }

    QLIST_FOREACH(xendev, &state->dev_list, entry) {
        if (xendev->sbdf != sbdf) {
            continue;
        }

        if (!req->data_is_ptr) {
            if (req->dir == IOREQ_READ) {
                req->data = pci_host_config_read_common(
                    xendev->pci_dev, reg, PCI_CONFIG_SPACE_SIZE,
                    req->size);
                trace_cpu_ioreq_config_read(req, xendev->sbdf, reg,
                                            req->size, req->data);
            } else if (req->dir == IOREQ_WRITE) {
                trace_cpu_ioreq_config_write(req, xendev->sbdf, reg,
                                             req->size, req->data);
                pci_host_config_write_common(
                    xendev->pci_dev, reg, PCI_CONFIG_SPACE_SIZE,
                    req->data, req->size);
            }
        } else {
            uint32_t tmp;

            if (req->dir == IOREQ_READ) {
                tmp = pci_host_config_read_common(
                    xendev->pci_dev, reg, PCI_CONFIG_SPACE_SIZE,
                    req->size);
                trace_cpu_ioreq_config_read(req, xendev->sbdf, reg,
                                            req->size, tmp);
                write_phys_req_item(req->data, req, 0, &tmp);
            } else if (req->dir == IOREQ_WRITE) {
                read_phys_req_item(req->data, req, 0, &tmp);
                trace_cpu_ioreq_config_write(req, xendev->sbdf, reg,
                                             req->size, tmp);
                pci_host_config_write_common(
                    xendev->pci_dev, reg, PCI_CONFIG_SPACE_SIZE,
                    tmp, req->size);
            }
        }
    }
}

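/*
 * VMware-port (vmport) requests pass their arguments in the guest's
 * general purpose registers, so a vmware_regs_t is shipped alongside
 * the ioreq; the helpers below copy those values into and out of the
 * vCPU register file around the actual port access.
 */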
static void regs_to_cpu(vmware_regs_t *vmport_regs, ioreq_t *req)
{
    X86CPU *cpu;
    CPUX86State *env;

    cpu = X86_CPU(current_cpu);
    env = &cpu->env;
    env->regs[R_EAX] = req->data;
    env->regs[R_EBX] = vmport_regs->ebx;
    env->regs[R_ECX] = vmport_regs->ecx;
    env->regs[R_EDX] = vmport_regs->edx;
    env->regs[R_ESI] = vmport_regs->esi;
    env->regs[R_EDI] = vmport_regs->edi;
}

static void regs_from_cpu(vmware_regs_t *vmport_regs)
{
    X86CPU *cpu = X86_CPU(current_cpu);
    CPUX86State *env = &cpu->env;

    vmport_regs->ebx = env->regs[R_EBX];
    vmport_regs->ecx = env->regs[R_ECX];
    vmport_regs->edx = env->regs[R_EDX];
    vmport_regs->esi = env->regs[R_ESI];
    vmport_regs->edi = env->regs[R_EDI];
}

static void handle_vmport_ioreq(XenIOState *state, ioreq_t *req)
{
    vmware_regs_t *vmport_regs;

    assert(state->shared_vmport_page);
    vmport_regs =
        &state->shared_vmport_page->vcpu_vmport_regs[state->send_vcpu];
    QEMU_BUILD_BUG_ON(sizeof(*req) < sizeof(*vmport_regs));

    current_cpu = state->cpu_by_vcpu_id[state->send_vcpu];
    regs_to_cpu(vmport_regs, req);
    cpu_ioreq_pio(req);
    regs_from_cpu(vmport_regs);
    current_cpu = NULL;
}

static void handle_ioreq(XenIOState *state, ioreq_t *req)
{
    trace_handle_ioreq(req, req->type, req->dir, req->df, req->data_is_ptr,
                       req->addr, req->data, req->count, req->size);

    if (!req->data_is_ptr && (req->dir == IOREQ_WRITE) &&
            (req->size < sizeof (target_ulong))) {
        req->data &= ((target_ulong) 1 << (8 * req->size)) - 1;
    }

    if (req->dir == IOREQ_WRITE) {
        trace_handle_ioreq_write(req, req->type, req->df, req->data_is_ptr,
                                 req->addr, req->data, req->count, req->size);
    }

    switch (req->type) {
        case IOREQ_TYPE_PIO:
            cpu_ioreq_pio(req);
            break;
        case IOREQ_TYPE_COPY:
            cpu_ioreq_move(req);
            break;
        case IOREQ_TYPE_VMWARE_PORT:
            handle_vmport_ioreq(state, req);
            break;
        case IOREQ_TYPE_TIMEOFFSET:
            break;
        case IOREQ_TYPE_INVALIDATE:
            xen_invalidate_map_cache();
            break;
        case IOREQ_TYPE_PCI_CONFIG:
            cpu_ioreq_config(state, req);
            break;
        default:
            hw_error("Invalid ioreq type 0x%x\n", req->type);
    }
    if (req->dir == IOREQ_READ) {
        trace_handle_ioreq_read(req, req->type, req->df, req->data_is_ptr,
                                req->addr, req->data, req->count, req->size);
    }
}

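/*
 * Drain the buffered ioreq ring: requests are consumed from
 * read_pointer up to write_pointer, with the barriers below ordering
 * the slot reads against the pointer reads.  A quad-word request
 * occupies two consecutive slots, the second one carrying the high 32
 * bits of data.
 */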
static int handle_buffered_iopage(XenIOState *state)
{
    buffered_iopage_t *buf_page = state->buffered_io_page;
    buf_ioreq_t *buf_req = NULL;
    ioreq_t req;
    int qw;

    if (!buf_page) {
        return 0;
    }

    memset(&req, 0x00, sizeof(req));
    req.state = STATE_IOREQ_READY;
    req.count = 1;
    req.dir = IOREQ_WRITE;

    for (;;) {
        uint32_t rdptr = buf_page->read_pointer, wrptr;

        xen_rmb();
        wrptr = buf_page->write_pointer;
        xen_rmb();
        if (rdptr != buf_page->read_pointer) {
            continue;
        }
        if (rdptr == wrptr) {
            break;
        }
        buf_req = &buf_page->buf_ioreq[rdptr % IOREQ_BUFFER_SLOT_NUM];
        req.size = 1U << buf_req->size;
        req.addr = buf_req->addr;
        req.data = buf_req->data;
        req.type = buf_req->type;
        xen_rmb();
        qw = (req.size == 8);
        if (qw) {
            if (rdptr + 1 == wrptr) {
                hw_error("Incomplete quad word buffered ioreq");
            }
            buf_req = &buf_page->buf_ioreq[(rdptr + 1) %
                                           IOREQ_BUFFER_SLOT_NUM];
            req.data |= ((uint64_t)buf_req->data) << 32;
            xen_rmb();
        }

        handle_ioreq(state, &req);

        /* Only req.data may get updated by handle_ioreq(), albeit even that
         * should not happen as such data would never make it to the guest (we
         * can only usefully see writes here after all).
         */
        assert(req.state == STATE_IOREQ_READY);
        assert(req.count == 1);
        assert(req.dir == IOREQ_WRITE);
        assert(!req.data_is_ptr);

        atomic_add(&buf_page->read_pointer, qw + 1);
    }

    return req.count;
}

static void handle_buffered_io(void *opaque)
{
    XenIOState *state = opaque;

    if (handle_buffered_iopage(state)) {
        timer_mod(state->buffered_io_timer,
                BUFFER_IO_MAX_DELAY + qemu_clock_get_ms(QEMU_CLOCK_REALTIME));
    } else {
        timer_del(state->buffered_io_timer);
        xenevtchn_unmask(state->xce_handle, state->bufioreq_local_port);
    }
}

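/*
 * Note the defensive copy below: the shared ioreq page stays mapped
 * and writable while the request is being serviced, so we work on a
 * local snapshot and write back only req->data.
 */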
static void cpu_handle_ioreq(void *opaque)
{
    XenIOState *state = opaque;
    ioreq_t *req = cpu_get_ioreq(state);

    handle_buffered_iopage(state);
    if (req) {
        ioreq_t copy = *req;

        xen_rmb();
        handle_ioreq(state, &copy);
        req->data = copy.data;

        if (req->state != STATE_IOREQ_INPROCESS) {
            fprintf(stderr, "Badness in I/O request ... not in service?!: "
                    "%x, ptr: %x, port: %"PRIx64", "
                    "data: %"PRIx64", count: %u, size: %u, type: %u\n",
                    req->state, req->data_is_ptr, req->addr,
                    req->data, req->count, req->size, req->type);
            destroy_hvm_domain(false);
            return;
        }

        xen_wmb(); /* Update ioreq contents /then/ update state. */

        /*
         * We do this before we send the response so that the tools
         * have the opportunity to pick up on the reset before the
         * guest resumes and does a hlt with interrupts disabled which
         * causes Xen to powerdown the domain.
         */
        if (runstate_is_running()) {
            ShutdownCause request;

            if (qemu_shutdown_requested_get()) {
                destroy_hvm_domain(false);
            }
            request = qemu_reset_requested_get();
            if (request) {
                qemu_system_reset(request);
                destroy_hvm_domain(true);
            }
        }

        req->state = STATE_IORESP_READY;
        xenevtchn_notify(state->xce_handle,
                         state->ioreq_local_port[state->send_vcpu]);
    }
}

static void xen_main_loop_prepare(XenIOState *state)
{
    int evtchn_fd = -1;

    if (state->xce_handle != NULL) {
        evtchn_fd = xenevtchn_fd(state->xce_handle);
    }

    state->buffered_io_timer = timer_new_ms(QEMU_CLOCK_REALTIME, handle_buffered_io,
                                                 state);

    if (evtchn_fd != -1) {
        CPUState *cpu_state;

        DPRINTF("%s: Init cpu_by_vcpu_id\n", __func__);
        CPU_FOREACH(cpu_state) {
            DPRINTF("%s: cpu_by_vcpu_id[%d]=%p\n",
                    __func__, cpu_state->cpu_index, cpu_state);
            state->cpu_by_vcpu_id[cpu_state->cpu_index] = cpu_state;
        }
        qemu_set_fd_handler(evtchn_fd, cpu_handle_ioreq, NULL, state);
    }
}


static void xen_hvm_change_state_handler(void *opaque, int running,
                                         RunState rstate)
{
    XenIOState *state = opaque;

    if (running) {
        xen_main_loop_prepare(state);
    }

    xen_set_ioreq_server_state(xen_domid,
                               state->ioservid,
                               (rstate == RUN_STATE_RUNNING));
}

static void xen_exit_notifier(Notifier *n, void *data)
{
    XenIOState *state = container_of(n, XenIOState, exit);

    xenevtchn_close(state->xce_handle);
    xs_daemon_close(state->xenstore);
}

#ifdef XEN_COMPAT_PHYSMAP
static void xen_read_physmap(XenIOState *state)
{
    XenPhysmap *physmap = NULL;
    unsigned int len, num, i;
    char path[80], *value = NULL;
    char **entries = NULL;

    snprintf(path, sizeof(path),
            "/local/domain/0/device-model/%d/physmap", xen_domid);
    entries = xs_directory(state->xenstore, 0, path, &num);
    if (entries == NULL) {
        return;
    }

    for (i = 0; i < num; i++) {
        physmap = g_malloc(sizeof (XenPhysmap));
        physmap->phys_offset = strtoull(entries[i], NULL, 16);
        snprintf(path, sizeof(path),
                "/local/domain/0/device-model/%d/physmap/%s/start_addr",
                xen_domid, entries[i]);
        value = xs_read(state->xenstore, 0, path, &len);
        if (value == NULL) {
            g_free(physmap);
            continue;
        }
        physmap->start_addr = strtoull(value, NULL, 16);
        free(value);

        snprintf(path, sizeof(path),
                "/local/domain/0/device-model/%d/physmap/%s/size",
                xen_domid, entries[i]);
        value = xs_read(state->xenstore, 0, path, &len);
        if (value == NULL) {
            g_free(physmap);
            continue;
        }
        physmap->size = strtoull(value, NULL, 16);
        free(value);

        snprintf(path, sizeof(path),
                "/local/domain/0/device-model/%d/physmap/%s/name",
                xen_domid, entries[i]);
        physmap->name = xs_read(state->xenstore, 0, path, &len);

        QLIST_INSERT_HEAD(&xen_physmap, physmap, list);
    }
    free(entries);
}
#else
static void xen_read_physmap(XenIOState *state)
{
}
#endif

static void xen_wakeup_notifier(Notifier *notifier, void *data)
{
    xc_set_hvm_param(xen_xc, xen_domid, HVM_PARAM_ACPI_S_STATE, 0);
}

static int xen_map_ioreq_server(XenIOState *state)
{
    void *addr = NULL;
    xenforeignmemory_resource_handle *fres;
    xen_pfn_t ioreq_pfn;
    xen_pfn_t bufioreq_pfn;
    evtchn_port_t bufioreq_evtchn;
    int rc;

    /*
     * Attempt to map using the resource API and fall back to normal
     * foreign mapping if this is not supported.
     */
    QEMU_BUILD_BUG_ON(XENMEM_resource_ioreq_server_frame_bufioreq != 0);
    QEMU_BUILD_BUG_ON(XENMEM_resource_ioreq_server_frame_ioreq(0) != 1);
    fres = xenforeignmemory_map_resource(xen_fmem, xen_domid,
                                         XENMEM_resource_ioreq_server,
                                         state->ioservid, 0, 2,
                                         &addr,
                                         PROT_READ | PROT_WRITE, 0);
    if (fres != NULL) {
        trace_xen_map_resource_ioreq(state->ioservid, addr);
        state->buffered_io_page = addr;
        state->shared_page = addr + TARGET_PAGE_SIZE;
    } else if (errno != EOPNOTSUPP) {
        error_report("failed to map ioreq server resources: error %d handle=%p",
                     errno, xen_xc);
        return -1;
    }

    rc = xen_get_ioreq_server_info(xen_domid, state->ioservid,
                                   (state->shared_page == NULL) ?
                                   &ioreq_pfn : NULL,
                                   (state->buffered_io_page == NULL) ?
                                   &bufioreq_pfn : NULL,
                                   &bufioreq_evtchn);
    if (rc < 0) {
        error_report("failed to get ioreq server info: error %d handle=%p",
                     errno, xen_xc);
        return rc;
    }

    if (state->shared_page == NULL) {
        DPRINTF("shared page at pfn %lx\n", ioreq_pfn);

        state->shared_page = xenforeignmemory_map(xen_fmem, xen_domid,
                                                  PROT_READ | PROT_WRITE,
                                                  1, &ioreq_pfn, NULL);
        if (state->shared_page == NULL) {
            error_report("map shared IO page returned error %d handle=%p",
                         errno, xen_xc);
        }
    }

    if (state->buffered_io_page == NULL) {
        DPRINTF("buffered io page at pfn %lx\n", bufioreq_pfn);

        state->buffered_io_page = xenforeignmemory_map(xen_fmem, xen_domid,
                                                       PROT_READ | PROT_WRITE,
                                                       1, &bufioreq_pfn,
                                                       NULL);
        if (state->buffered_io_page == NULL) {
            error_report("map buffered IO page returned error %d", errno);
            return -1;
        }
    }

    if (state->shared_page == NULL || state->buffered_io_page == NULL) {
        return -1;
    }

    DPRINTF("buffered io evtchn is %x\n", bufioreq_evtchn);

    state->bufioreq_remote_port = bufioreq_evtchn;

    return 0;
}

void xen_hvm_init(PCMachineState *pcms, MemoryRegion **ram_memory)
{
    int i, rc;
    xen_pfn_t ioreq_pfn;
    XenIOState *state;

    state = g_malloc0(sizeof (XenIOState));

    state->xce_handle = xenevtchn_open(NULL, 0);
    if (state->xce_handle == NULL) {
        perror("xen: event channel open");
        goto err;
    }

    state->xenstore = xs_daemon_open();
    if (state->xenstore == NULL) {
        perror("xen: xenstore open");
        goto err;
    }

    xen_create_ioreq_server(xen_domid, &state->ioservid);

    state->exit.notify = xen_exit_notifier;
    qemu_add_exit_notifier(&state->exit);

    state->suspend.notify = xen_suspend_notifier;
    qemu_register_suspend_notifier(&state->suspend);

    state->wakeup.notify = xen_wakeup_notifier;
    qemu_register_wakeup_notifier(&state->wakeup);

    /*
     * Register wake-up support in QMP query-current-machine API
     */
    qemu_register_wakeup_support();

    rc = xen_map_ioreq_server(state);
    if (rc < 0) {
        goto err;
    }

    rc = xen_get_vmport_regs_pfn(xen_xc, xen_domid, &ioreq_pfn);
    if (!rc) {
        DPRINTF("shared vmport page at pfn %lx\n", ioreq_pfn);
        state->shared_vmport_page =
            xenforeignmemory_map(xen_fmem, xen_domid, PROT_READ|PROT_WRITE,
                                 1, &ioreq_pfn, NULL);
        if (state->shared_vmport_page == NULL) {
            error_report("map shared vmport IO page returned error %d handle=%p",
                         errno, xen_xc);
            goto err;
        }
    } else if (rc != -ENOSYS) {
        error_report("get vmport regs pfn returned error %d, rc=%d",
                     errno, rc);
        goto err;
    }

    /* Note: cpus is empty at this point in init */
    state->cpu_by_vcpu_id = g_malloc0(max_cpus * sizeof(CPUState *));

    rc = xen_set_ioreq_server_state(xen_domid, state->ioservid, true);
    if (rc < 0) {
        error_report("failed to enable ioreq server info: error %d handle=%p",
                     errno, xen_xc);
        goto err;
    }

    state->ioreq_local_port = g_malloc0(max_cpus * sizeof (evtchn_port_t));

    /* FIXME: how about if we overflow the page here? */
    for (i = 0; i < max_cpus; i++) {
        rc = xenevtchn_bind_interdomain(state->xce_handle, xen_domid,
                                        xen_vcpu_eport(state->shared_page, i));
        if (rc == -1) {
            error_report("shared evtchn %d bind error %d", i, errno);
            goto err;
        }
        state->ioreq_local_port[i] = rc;
    }

    rc = xenevtchn_bind_interdomain(state->xce_handle, xen_domid,
                                    state->bufioreq_remote_port);
    if (rc == -1) {
        error_report("buffered evtchn bind error %d", errno);
        goto err;
    }
    state->bufioreq_local_port = rc;

    /* Init RAM management */
#ifdef XEN_COMPAT_PHYSMAP
    xen_map_cache_init(xen_phys_offset_to_gaddr, state);
#else
    xen_map_cache_init(NULL, state);
#endif
    xen_ram_init(pcms, ram_size, ram_memory);

    qemu_add_vm_change_state_handler(xen_hvm_change_state_handler, state);

    state->memory_listener = xen_memory_listener;
    memory_listener_register(&state->memory_listener, &address_space_memory);
    state->log_for_dirtybit = NULL;

    state->io_listener = xen_io_listener;
    memory_listener_register(&state->io_listener, &address_space_io);

    state->device_listener = xen_device_listener;
    QLIST_INIT(&state->dev_list);
    device_listener_register(&state->device_listener);

    /* Initialize backend core & drivers */
    if (xen_be_init() != 0) {
        error_report("xen backend core setup failed");
        goto err;
    }
    xen_be_register_common();

    QLIST_INIT(&xen_physmap);
    xen_read_physmap(state);

    /* Disable ACPI build because Xen handles it */
    pcms->acpi_build_enabled = false;

    return;

err:
    error_report("xen hardware virtual machine initialisation failed");
    exit(1);
}

void destroy_hvm_domain(bool reboot)
{
    xc_interface *xc_handle;
    int sts;
    int rc;

    unsigned int reason = reboot ? SHUTDOWN_reboot : SHUTDOWN_poweroff;

    if (xen_dmod) {
        rc = xendevicemodel_shutdown(xen_dmod, xen_domid, reason);
        if (!rc) {
            return;
        }
        if (errno != ENOTTY /* old Xen */) {
            perror("xendevicemodel_shutdown failed");
        }
        /* well, try the old thing then */
    }

    xc_handle = xc_interface_open(0, 0, 0);
    if (xc_handle == NULL) {
        fprintf(stderr, "Cannot acquire xenctrl handle\n");
    } else {
        sts = xc_domain_shutdown(xc_handle, xen_domid, reason);
        if (sts != 0) {
            fprintf(stderr, "xc_domain_shutdown failed to issue %s, "
                    "sts %d, %s\n", reboot ? "reboot" : "poweroff",
                    sts, strerror(errno));
        } else {
            fprintf(stderr, "Issued domain %d %s\n", xen_domid,
                    reboot ? "reboot" : "poweroff");
        }
        xc_interface_close(xc_handle);
    }
}

void xen_register_framebuffer(MemoryRegion *mr)
{
    framebuffer = mr;
}

void xen_shutdown_fatal_error(const char *fmt, ...)
{
    va_list ap;

    va_start(ap, fmt);
    vfprintf(stderr, fmt, ap);
    va_end(ap);
    fprintf(stderr, "Will destroy the domain.\n");
    /* destroy the domain */
    qemu_system_shutdown_request(SHUTDOWN_CAUSE_HOST_ERROR);
}

void xen_hvm_modified_memory(ram_addr_t start, ram_addr_t length)
{
    if (unlikely(xen_in_migration)) {
        int rc;
        ram_addr_t start_pfn, nb_pages;

        start = xen_phys_offset_to_gaddr(start, length);

        if (length == 0) {
            length = TARGET_PAGE_SIZE;
        }
        start_pfn = start >> TARGET_PAGE_BITS;
        nb_pages = ((start + length + TARGET_PAGE_SIZE - 1) >> TARGET_PAGE_BITS)
            - start_pfn;
        rc = xen_modified_memory(xen_domid, start_pfn, nb_pages);
        if (rc) {
            fprintf(stderr,
                    "%s failed for "RAM_ADDR_FMT" ("RAM_ADDR_FMT"): %i, %s\n",
                    __func__, start, nb_pages, errno, strerror(errno));
        }
    }
}

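/*
 * Handler for the 'xen-set-global-dirty-log' QMP command; starting and
 * stopping global dirty logging drives the log_global_start/stop hooks
 * of the memory listener above.
 */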
void qmp_xen_set_global_dirty_log(bool enable, Error **errp)
{
    if (enable) {
        memory_global_dirty_log_start();
    } else {
        memory_global_dirty_log_stop();
    }
}