spapr.c 52.0 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26
/*
 * QEMU PowerPC pSeries Logical Partition (aka sPAPR) hardware System Emulator
 *
 * Copyright (c) 2004-2007 Fabrice Bellard
 * Copyright (c) 2007 Jocelyn Mayer
 * Copyright (c) 2010 David Gibson, IBM Corporation.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 *
 */
27
#include "sysemu/sysemu.h"
28
#include "hw/hw.h"
29
#include "hw/fw-path-provider.h"
30
#include "elf.h"
P
Paolo Bonzini 已提交
31
#include "net/net.h"
32
#include "sysemu/block-backend.h"
33 34
#include "sysemu/cpus.h"
#include "sysemu/kvm.h"
35
#include "kvm_ppc.h"
36
#include "mmu-hash64.h"
37
#include "qom/cpu.h"
38 39

#include "hw/boards.h"
P
Paolo Bonzini 已提交
40
#include "hw/ppc/ppc.h"
41 42
#include "hw/loader.h"

P
Paolo Bonzini 已提交
43 44 45 46
#include "hw/ppc/spapr.h"
#include "hw/ppc/spapr_vio.h"
#include "hw/pci-host/spapr.h"
#include "hw/ppc/xics.h"
47
#include "hw/pci/msi.h"
48

49
#include "hw/pci/pci.h"
50 51
#include "hw/scsi/scsi.h"
#include "hw/virtio/virtio-scsi.h"
52

53
#include "exec/address-spaces.h"
54
#include "hw/usb.h"
55
#include "qemu/config-file.h"
56
#include "qemu/error-report.h"
57
#include "trace.h"
58
#include "hw/nmi.h"
A
Avi Kivity 已提交
59

60 61
#include "hw/compat.h"

62 63
#include <libfdt.h>

64 65 66 67 68 69 70 71 72 73
/* SLOF memory layout:
 *
 * SLOF raw image loaded at 0, copies its romfs right below the flat
 * device-tree, then positions SLOF itself 31M below that
 *
 * So we set FW_OVERHEAD to 40MB which should account for all of that
 * and more
 *
 * We load our kernel at 4M, leaving space for SLOF initial image
 */
/* Size of the buffers the flattened device tree is built in */
#define FDT_MAX_SIZE            0x40000
/* Space reserved for the RTAS blob when placing it below the RMA limit */
#define RTAS_MAX_SIZE           0x10000
#define RTAS_MAX_ADDR           0x80000000 /* RTAS must stay below that */
#define FW_MAX_SIZE             0x400000
#define FW_FILE_NAME            "slof.bin"
#define FW_OVERHEAD             0x2800000
/* The kernel is loaded right after the area kept for the firmware image */
#define KERNEL_LOAD_ADDR        FW_MAX_SIZE

/* NOTE(review): presumably in MiB - confirm against the user of this macro */
#define MIN_RMA_SLOF            128UL

/* Timebase frequency advertised to the guest when KVM is not in use */
#define TIMEBASE_FREQ           512000000ULL

#define MAX_CPUS                255

/* phandle given to the interrupt-controller node of the device tree */
#define PHANDLE_XICP            0x00001111

/* Size in bytes of the hash page table: 2^htab_shift */
#define HTAB_SIZE(spapr)        (1ULL << ((spapr)->htab_shift))

92
typedef struct sPAPRMachineState sPAPRMachineState;
93

94
#define TYPE_SPAPR_MACHINE      "spapr-machine"
95
#define SPAPR_MACHINE(obj) \
96
    OBJECT_CHECK(sPAPRMachineState, (obj), TYPE_SPAPR_MACHINE)
97 98

/**
99
 * sPAPRMachineState:
100
 */
101
struct sPAPRMachineState {
102 103
    /*< private >*/
    MachineState parent_obj;
E
Eduardo Habkost 已提交
104 105 106

    /*< public >*/
    char *kvm_type;
107 108
};

109 110
sPAPREnvironment *spapr;

111 112 113 114 115 116 117 118 119 120 121 122
/*
 * Create an interrupt controller device of QOM type @type, configure its
 * "nr_servers" and "nr_irqs" properties and initialise it.
 *
 * Returns the device cast to XICSState, or NULL if qdev_init() fails.
 */
static XICSState *try_create_xics(const char *type, int nr_servers,
                                  int nr_irqs)
{
    DeviceState *dev;

    dev = qdev_create(NULL, type);
    qdev_prop_set_uint32(dev, "nr_servers", nr_servers);
    qdev_prop_set_uint32(dev, "nr_irqs", nr_irqs);
    if (qdev_init(dev) < 0) {
        return NULL;
    }

    return XICS_COMMON(dev);
}

static XICSState *xics_system_init(int nr_servers, int nr_irqs)
{
    XICSState *icp = NULL;

130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149
    if (kvm_enabled()) {
        QemuOpts *machine_opts = qemu_get_machine_opts();
        bool irqchip_allowed = qemu_opt_get_bool(machine_opts,
                                                "kernel_irqchip", true);
        bool irqchip_required = qemu_opt_get_bool(machine_opts,
                                                  "kernel_irqchip", false);
        if (irqchip_allowed) {
            icp = try_create_xics(TYPE_KVM_XICS, nr_servers, nr_irqs);
        }

        if (irqchip_required && !icp) {
            perror("Failed to create in-kernel XICS\n");
            abort();
        }
    }

    if (!icp) {
        icp = try_create_xics(TYPE_XICS, nr_servers, nr_irqs);
    }

150 151 152 153 154 155 156 157
    if (!icp) {
        perror("Failed to create XICS\n");
        abort();
    }

    return icp;
}

158 159 160 161 162 163 164 165
/*
 * Set the per-thread interrupt server properties of the CPU node at @offset
 * in @fdt, and "cpu-version" when the CPU has a non-zero cpu_version.
 *
 * @smt_threads is the number of threads to describe; server numbers start
 * at the device-tree id of @cpu and increase by one per thread.
 *
 * Returns 0 (or a non-negative libfdt result) on success, a negative
 * libfdt error code on failure.
 */
static int spapr_fixup_cpu_smt_dt(void *fdt, int offset, PowerPCCPU *cpu,
                                  int smt_threads)
{
    int i, ret = 0;
    uint32_t servers_prop[smt_threads];
    uint32_t gservers_prop[smt_threads * 2];
    int index = ppc_get_vcpu_dt_id(cpu);

    if (cpu->cpu_version) {
        ret = fdt_setprop_cell(fdt, offset, "cpu-version", cpu->cpu_version);
        if (ret < 0) {
            return ret;
        }
    }

    /* Build interrupt servers and gservers properties */
    for (i = 0; i < smt_threads; i++) {
        servers_prop[i] = cpu_to_be32(index + i);
        /* Hack, direct the group queues back to cpu 0 */
        gservers_prop[i*2] = cpu_to_be32(index + i);
        gservers_prop[i*2 + 1] = 0;
    }
    ret = fdt_setprop(fdt, offset, "ibm,ppc-interrupt-server#s",
                      servers_prop, sizeof(servers_prop));
    if (ret < 0) {
        return ret;
    }
    ret = fdt_setprop(fdt, offset, "ibm,ppc-interrupt-gserver#s",
                      gservers_prop, sizeof(gservers_prop));

    return ret;
}

191
/*
 * Add or update the per-CPU nodes under /cpus in @fdt.
 *
 * Only CPUs whose device-tree id is a multiple of the SMT thread count get
 * a node. Each node receives "ibm,pft-size", the interrupt server
 * properties and, when more than one NUMA node is configured,
 * "ibm,associativity". Missing /cpus or CPU nodes are created.
 *
 * Returns a non-negative value on success, a negative libfdt error code on
 * failure.
 */
static int spapr_fixup_cpu_dt(void *fdt, sPAPREnvironment *spapr)
{
    int ret = 0, offset, cpus_offset;
    CPUState *cs;
    char cpu_model[32];
    int smt = kvmppc_smt_threads();
    uint32_t pft_size_prop[] = {0, cpu_to_be32(spapr->htab_shift)};

    CPU_FOREACH(cs) {
        PowerPCCPU *cpu = POWERPC_CPU(cs);
        DeviceClass *dc = DEVICE_GET_CLASS(cs);
        int index = ppc_get_vcpu_dt_id(cpu);
        uint32_t associativity[] = {cpu_to_be32(0x5),
                                    cpu_to_be32(0x0),
                                    cpu_to_be32(0x0),
                                    cpu_to_be32(0x0),
                                    cpu_to_be32(cs->numa_node),
                                    cpu_to_be32(index)};

        /* Secondary threads share the node of their first thread */
        if ((index % smt) != 0) {
            continue;
        }

        snprintf(cpu_model, sizeof(cpu_model), "%s@%x", dc->fw_name, index);

        /* Looked up on every iteration: adding nodes moves offsets around */
        cpus_offset = fdt_path_offset(fdt, "/cpus");
        if (cpus_offset < 0) {
            cpus_offset = fdt_add_subnode(fdt, fdt_path_offset(fdt, "/"),
                                          "cpus");
            if (cpus_offset < 0) {
                return cpus_offset;
            }
        }
        offset = fdt_subnode_offset(fdt, cpus_offset, cpu_model);
        if (offset < 0) {
            offset = fdt_add_subnode(fdt, cpus_offset, cpu_model);
            if (offset < 0) {
                return offset;
            }
        }

        if (nb_numa_nodes > 1) {
            ret = fdt_setprop(fdt, offset, "ibm,associativity", associativity,
                              sizeof(associativity));
            if (ret < 0) {
                return ret;
            }
        }

        ret = fdt_setprop(fdt, offset, "ibm,pft-size",
                          pft_size_prop, sizeof(pft_size_prop));
        if (ret < 0) {
            return ret;
        }

        ret = spapr_fixup_cpu_smt_dt(fdt, offset, cpu,
                                     ppc_get_compat_smt_threads(cpu));
        if (ret < 0) {
            return ret;
        }
    }
    return ret;
}

255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288

/*
 * Encode the segment/page size table of @env into @prop as big-endian
 * 32-bit cells: for each segment page size, its shift, its SLB encoding,
 * the number of page encodings, then a (page shift, PTE encoding) pair per
 * page encoding. Tables end at the first entry with a zero page shift.
 *
 * @maxsize is the capacity of @prop in bytes. Encoding stops before an
 * entry that would not fit.
 *
 * Returns the number of bytes written to @prop.
 */
static size_t create_page_sizes_prop(CPUPPCState *env, uint32_t *prop,
                                     size_t maxsize)
{
    size_t maxcells = maxsize / sizeof(uint32_t);
    int i, j, count;
    uint32_t *p = prop;

    for (i = 0; i < PPC_PAGE_SIZES_MAX_SZ; i++) {
        struct ppc_one_seg_page_size *sps = &env->sps.sps[i];

        if (!sps->page_shift) {
            break;
        }
        for (count = 0; count < PPC_PAGE_SIZES_MAX_SZ; count++) {
            if (sps->enc[count].page_shift == 0) {
                break;
            }
        }
        /*
         * Compare by addition rather than "maxcells - 3 - count * 2":
         * the subtraction wraps around in unsigned arithmetic when the
         * buffer is smaller than one entry, which would defeat the check.
         */
        if ((size_t)(p - prop) + 3 + (size_t)count * 2 >= maxcells) {
            break;
        }
        *(p++) = cpu_to_be32(sps->page_shift);
        *(p++) = cpu_to_be32(sps->slb_enc);
        *(p++) = cpu_to_be32(count);
        for (j = 0; j < count; j++) {
            *(p++) = cpu_to_be32(sps->enc[j].page_shift);
            *(p++) = cpu_to_be32(sps->enc[j].pte_enc);
        }
    }

    return (p - prop) * sizeof(uint32_t);
}

289 290 291 292 293 294 295 296 297 298 299 300 301
/*
 * Size of the first NUMA node that has memory, rounded down to a power of
 * two and capped at ram_size. Falls back to ram_size when no NUMA node has
 * any memory (including the case where NUMA is not configured).
 */
static hwaddr spapr_node0_size(void)
{
    int node;

    for (node = 0; node < nb_numa_nodes; node++) {
        if (!numa_info[node].node_mem) {
            continue;
        }
        return MIN(pow2floor(numa_info[node].node_mem), ram_size);
    }

    return ram_size;
}

302 303 304 305 306 307 308 309 310 311
/*
 * Evaluate the libfdt expression @exp once; if it yields a negative value,
 * report the failing expression with its libfdt error string and exit(1).
 *
 * The temporary is deliberately not called "ret" so that it cannot shadow
 * a caller's variable of that name.
 */
#define _FDT(exp) \
    do { \
        int fdt_ret_ = (exp);                                      \
        if (fdt_ret_ < 0) {                                        \
            fprintf(stderr, "qemu: error creating device tree: %s: %s\n", \
                    #exp, fdt_strerror(fdt_ret_));                 \
            exit(1);                                               \
        }                                                          \
    } while (0)

312 313 314 315
/*
 * Append @s1 to @s together with its terminating NUL byte, so that
 * successive calls build a list of NUL-separated strings.
 */
static void add_str(GString *s, const gchar *s1)
{
    size_t len_with_nul = strlen(s1) + 1;

    g_string_append_len(s, s1, len_with_nul);
}
316

317
/*
 * Build the skeleton flattened device tree for the machine.
 *
 * The tree contains the root properties, /chosen, one /cpus subnode per
 * CPU whose device-tree id is a multiple of the SMT thread count, /rtas,
 * the interrupt controller, /vdevice, the event sources and, under KVM,
 * /hypervisor.
 *
 * @initrd_base, @initrd_size: location of the initrd; reserved in the
 *     memory reservation map when @initrd_size is non-zero.
 * @kernel_size: size of the kernel loaded at KERNEL_LOAD_ADDR; 0 if none.
 * @little_endian: adds "qemu,boot-kernel-le" when a kernel is present.
 * @boot_device: value for "qemu,boot-device", may be NULL.
 * @kernel_cmdline: value for "bootargs".
 * @epow_irq: passed on to spapr_events_fdt_skel().
 *
 * Returns a g_malloc0()ed buffer of FDT_MAX_SIZE bytes holding the tree.
 * Any libfdt failure terminates QEMU through _FDT().
 */
static void *spapr_create_fdt_skel(hwaddr initrd_base,
                                   hwaddr initrd_size,
                                   hwaddr kernel_size,
                                   bool little_endian,
                                   const char *boot_device,
                                   const char *kernel_cmdline,
                                   uint32_t epow_irq)
{
    void *fdt;
    CPUState *cs;
    uint32_t start_prop = cpu_to_be32(initrd_base);
    uint32_t end_prop = cpu_to_be32(initrd_base + initrd_size);
    GString *hypertas = g_string_sized_new(256);
    GString *qemu_hypertas = g_string_sized_new(256);
    uint32_t refpoints[] = {cpu_to_be32(0x4), cpu_to_be32(0x4)};
    uint32_t interrupt_server_ranges_prop[] = {0, cpu_to_be32(smp_cpus)};
    int smt = kvmppc_smt_threads();
    unsigned char vec5[] = {0x0, 0x0, 0x0, 0x0, 0x0, 0x80};
    QemuOpts *opts = qemu_opts_find(qemu_find_opts("smp-opts"), NULL);
    unsigned sockets = opts ? qemu_opt_get_number(opts, "sockets", 0) : 0;
    uint32_t cpus_per_socket = sockets ? (smp_cpus / sockets) : 1;
    char *buf;

    /*
     * More sockets than CPUs makes the integer division above yield 0;
     * clamp so the "ibm,chip-id" computation below cannot divide by zero.
     */
    if (cpus_per_socket == 0) {
        cpus_per_socket = 1;
    }

    add_str(hypertas, "hcall-pft");
    add_str(hypertas, "hcall-term");
    add_str(hypertas, "hcall-dabr");
    add_str(hypertas, "hcall-interrupt");
    add_str(hypertas, "hcall-tce");
    add_str(hypertas, "hcall-vio");
    add_str(hypertas, "hcall-splpar");
    add_str(hypertas, "hcall-bulk");
    add_str(hypertas, "hcall-set-mode");
    add_str(qemu_hypertas, "hcall-memop1");

    fdt = g_malloc0(FDT_MAX_SIZE);
    _FDT((fdt_create(fdt, FDT_MAX_SIZE)));

    if (kernel_size) {
        _FDT((fdt_add_reservemap_entry(fdt, KERNEL_LOAD_ADDR, kernel_size)));
    }
    if (initrd_size) {
        _FDT((fdt_add_reservemap_entry(fdt, initrd_base, initrd_size)));
    }
    _FDT((fdt_finish_reservemap(fdt)));

    /* Root node */
    _FDT((fdt_begin_node(fdt, "")));
    _FDT((fdt_property_string(fdt, "device_type", "chrp")));
    _FDT((fdt_property_string(fdt, "model", "IBM pSeries (emulated by qemu)")));
    _FDT((fdt_property_string(fdt, "compatible", "qemu,pseries")));

    /*
     * Add info to guest to identify which host it is being run on
     * and what the uuid of the guest is
     */
    if (kvmppc_get_host_model(&buf)) {
        _FDT((fdt_property_string(fdt, "host-model", buf)));
        g_free(buf);
    }
    if (kvmppc_get_host_serial(&buf)) {
        _FDT((fdt_property_string(fdt, "host-serial", buf)));
        g_free(buf);
    }

    buf = g_strdup_printf(UUID_FMT, qemu_uuid[0], qemu_uuid[1],
                          qemu_uuid[2], qemu_uuid[3], qemu_uuid[4],
                          qemu_uuid[5], qemu_uuid[6], qemu_uuid[7],
                          qemu_uuid[8], qemu_uuid[9], qemu_uuid[10],
                          qemu_uuid[11], qemu_uuid[12], qemu_uuid[13],
                          qemu_uuid[14], qemu_uuid[15]);

    _FDT((fdt_property_string(fdt, "vm,uuid", buf)));
    g_free(buf);

    _FDT((fdt_property_cell(fdt, "#address-cells", 0x2)));
    _FDT((fdt_property_cell(fdt, "#size-cells", 0x2)));

    /* /chosen */
    _FDT((fdt_begin_node(fdt, "chosen")));

    /* Set Form1_affinity */
    _FDT((fdt_property(fdt, "ibm,architecture-vec-5", vec5, sizeof(vec5))));

    _FDT((fdt_property_string(fdt, "bootargs", kernel_cmdline)));
    _FDT((fdt_property(fdt, "linux,initrd-start",
                       &start_prop, sizeof(start_prop))));
    _FDT((fdt_property(fdt, "linux,initrd-end",
                       &end_prop, sizeof(end_prop))));
    if (kernel_size) {
        uint64_t kprop[2] = { cpu_to_be64(KERNEL_LOAD_ADDR),
                              cpu_to_be64(kernel_size) };

        _FDT((fdt_property(fdt, "qemu,boot-kernel", &kprop, sizeof(kprop))));
        if (little_endian) {
            _FDT((fdt_property(fdt, "qemu,boot-kernel-le", NULL, 0)));
        }
    }
    if (boot_device) {
        _FDT((fdt_property_string(fdt, "qemu,boot-device", boot_device)));
    }
    if (boot_menu) {
        _FDT((fdt_property_cell(fdt, "qemu,boot-menu", boot_menu)));
    }
    _FDT((fdt_property_cell(fdt, "qemu,graphic-width", graphic_width)));
    _FDT((fdt_property_cell(fdt, "qemu,graphic-height", graphic_height)));
    _FDT((fdt_property_cell(fdt, "qemu,graphic-depth", graphic_depth)));

    _FDT((fdt_end_node(fdt)));

    /* cpus */
    _FDT((fdt_begin_node(fdt, "cpus")));

    _FDT((fdt_property_cell(fdt, "#address-cells", 0x1)));
    _FDT((fdt_property_cell(fdt, "#size-cells", 0x0)));

    CPU_FOREACH(cs) {
        PowerPCCPU *cpu = POWERPC_CPU(cs);
        CPUPPCState *env = &cpu->env;
        DeviceClass *dc = DEVICE_GET_CLASS(cs);
        PowerPCCPUClass *pcc = POWERPC_CPU_GET_CLASS(cs);
        int index = ppc_get_vcpu_dt_id(cpu);
        char *nodename;
        uint32_t segs[] = {cpu_to_be32(28), cpu_to_be32(40),
                           0xffffffff, 0xffffffff};
        uint32_t tbfreq = kvm_enabled() ? kvmppc_get_tbfreq() : TIMEBASE_FREQ;
        uint32_t cpufreq = kvm_enabled() ? kvmppc_get_clockfreq() : 1000000000;
        uint32_t page_sizes_prop[64];
        size_t page_sizes_prop_size;

        /* Secondary threads do not get a node of their own */
        if ((index % smt) != 0) {
            continue;
        }

        nodename = g_strdup_printf("%s@%x", dc->fw_name, index);

        _FDT((fdt_begin_node(fdt, nodename)));

        g_free(nodename);

        _FDT((fdt_property_cell(fdt, "reg", index)));
        _FDT((fdt_property_string(fdt, "device_type", "cpu")));

        _FDT((fdt_property_cell(fdt, "cpu-version", env->spr[SPR_PVR])));
        _FDT((fdt_property_cell(fdt, "d-cache-block-size",
                                env->dcache_line_size)));
        _FDT((fdt_property_cell(fdt, "d-cache-line-size",
                                env->dcache_line_size)));
        _FDT((fdt_property_cell(fdt, "i-cache-block-size",
                                env->icache_line_size)));
        _FDT((fdt_property_cell(fdt, "i-cache-line-size",
                                env->icache_line_size)));

        if (pcc->l1_dcache_size) {
            _FDT((fdt_property_cell(fdt, "d-cache-size", pcc->l1_dcache_size)));
        } else {
            fprintf(stderr, "Warning: Unknown L1 dcache size for cpu\n");
        }
        if (pcc->l1_icache_size) {
            _FDT((fdt_property_cell(fdt, "i-cache-size", pcc->l1_icache_size)));
        } else {
            fprintf(stderr, "Warning: Unknown L1 icache size for cpu\n");
        }

        _FDT((fdt_property_cell(fdt, "timebase-frequency", tbfreq)));
        _FDT((fdt_property_cell(fdt, "clock-frequency", cpufreq)));
        _FDT((fdt_property_cell(fdt, "ibm,slb-size", env->slb_nr)));
        _FDT((fdt_property_string(fdt, "status", "okay")));
        _FDT((fdt_property(fdt, "64-bit", NULL, 0)));

        if (env->spr_cb[SPR_PURR].oea_read) {
            _FDT((fdt_property(fdt, "ibm,purr", NULL, 0)));
        }

        if (env->mmu_model & POWERPC_MMU_1TSEG) {
            _FDT((fdt_property(fdt, "ibm,processor-segment-sizes",
                               segs, sizeof(segs))));
        }

        /* Advertise VMX/VSX (vector extensions) if available
         *   0 / no property == no vector extensions
         *   1               == VMX / Altivec available
         *   2               == VSX available */
        if (env->insns_flags & PPC_ALTIVEC) {
            uint32_t vmx = (env->insns_flags2 & PPC2_VSX) ? 2 : 1;

            _FDT((fdt_property_cell(fdt, "ibm,vmx", vmx)));
        }

        /* Advertise DFP (Decimal Floating Point) if available
         *   0 / no property == no DFP
         *   1               == DFP available */
        if (env->insns_flags2 & PPC2_DFP) {
            _FDT((fdt_property_cell(fdt, "ibm,dfp", 1)));
        }

        page_sizes_prop_size = create_page_sizes_prop(env, page_sizes_prop,
                                                      sizeof(page_sizes_prop));
        if (page_sizes_prop_size) {
            _FDT((fdt_property(fdt, "ibm,segment-page-sizes",
                               page_sizes_prop, page_sizes_prop_size)));
        }

        _FDT((fdt_property_cell(fdt, "ibm,chip-id",
                                cs->cpu_index / cpus_per_socket)));

        _FDT((fdt_end_node(fdt)));
    }

    _FDT((fdt_end_node(fdt)));

    /* RTAS */
    _FDT((fdt_begin_node(fdt, "rtas")));

    if (!kvm_enabled() || kvmppc_spapr_use_multitce()) {
        add_str(hypertas, "hcall-multi-tce");
    }
    _FDT((fdt_property(fdt, "ibm,hypertas-functions", hypertas->str,
                       hypertas->len)));
    g_string_free(hypertas, TRUE);
    _FDT((fdt_property(fdt, "qemu,hypertas-functions", qemu_hypertas->str,
                       qemu_hypertas->len)));
    g_string_free(qemu_hypertas, TRUE);

    _FDT((fdt_property(fdt, "ibm,associativity-reference-points",
        refpoints, sizeof(refpoints))));

    _FDT((fdt_property_cell(fdt, "rtas-error-log-max", RTAS_ERROR_LOG_MAX)));

    /*
     * According to PAPR, rtas ibm,os-term does not guarantee a return
     * back to the guest cpu.
     *
     * While an additional ibm,extended-os-term property indicates that
     * rtas call return will always occur. Set this property.
     */
    _FDT((fdt_property(fdt, "ibm,extended-os-term", NULL, 0)));

    _FDT((fdt_end_node(fdt)));

    /* interrupt controller */
    _FDT((fdt_begin_node(fdt, "interrupt-controller")));

    _FDT((fdt_property_string(fdt, "device_type",
                              "PowerPC-External-Interrupt-Presentation")));
    _FDT((fdt_property_string(fdt, "compatible", "IBM,ppc-xicp")));
    _FDT((fdt_property(fdt, "interrupt-controller", NULL, 0)));
    _FDT((fdt_property(fdt, "ibm,interrupt-server-ranges",
                       interrupt_server_ranges_prop,
                       sizeof(interrupt_server_ranges_prop))));
    _FDT((fdt_property_cell(fdt, "#interrupt-cells", 2)));
    _FDT((fdt_property_cell(fdt, "linux,phandle", PHANDLE_XICP)));
    _FDT((fdt_property_cell(fdt, "phandle", PHANDLE_XICP)));

    _FDT((fdt_end_node(fdt)));

    /* vdevice */
    _FDT((fdt_begin_node(fdt, "vdevice")));

    _FDT((fdt_property_string(fdt, "device_type", "vdevice")));
    _FDT((fdt_property_string(fdt, "compatible", "IBM,vdevice")));
    _FDT((fdt_property_cell(fdt, "#address-cells", 0x1)));
    _FDT((fdt_property_cell(fdt, "#size-cells", 0x0)));
    _FDT((fdt_property_cell(fdt, "#interrupt-cells", 0x2)));
    _FDT((fdt_property(fdt, "interrupt-controller", NULL, 0)));

    _FDT((fdt_end_node(fdt)));

    /* event-sources */
    spapr_events_fdt_skel(fdt, epow_irq);

    /* /hypervisor node */
    if (kvm_enabled()) {
        uint8_t hypercall[16];

        /* indicate KVM hypercall interface */
        _FDT((fdt_begin_node(fdt, "hypervisor")));
        _FDT((fdt_property_string(fdt, "compatible", "linux,kvm")));
        if (kvmppc_has_cap_fixup_hcalls()) {
            /*
             * Older KVM versions with older guest kernels were broken with the
             * magic page, don't allow the guest to map it.
             */
            kvmppc_get_hypercall(first_cpu->env_ptr, hypercall,
                                 sizeof(hypercall));
            _FDT((fdt_property(fdt, "hcall-instructions", hypercall,
                              sizeof(hypercall))));
        }
        _FDT((fdt_end_node(fdt)));
    }

    _FDT((fdt_end_node(fdt))); /* close root node */
    _FDT((fdt_finish(fdt)));

    return fdt;
}

613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629
/*
 * Build the device-tree update and write it to guest memory at @addr,
 * preceded by an sPAPRDeviceTreeUpdateHeader.
 *
 * @size is the size in bytes of the guest buffer at @addr, header included.
 *
 * Returns 0 on success, -1 if the buffer is too small. A libfdt failure
 * while building the tree terminates QEMU through _FDT().
 */
int spapr_h_cas_compose_response(target_ulong addr, target_ulong size)
{
    void *fdt, *fdt_skel;
    sPAPRDeviceTreeUpdateHeader hdr = { .version_id = 1 };

    /* Without this check the subtraction below would wrap around */
    if (size <= sizeof(hdr)) {
        trace_spapr_cas_failed(size);
        return -1;
    }
    size -= sizeof(hdr);

    /* Create skeleton */
    fdt_skel = g_malloc0(size);
    _FDT((fdt_create(fdt_skel, size)));
    _FDT((fdt_begin_node(fdt_skel, "")));
    _FDT((fdt_end_node(fdt_skel)));
    _FDT((fdt_finish(fdt_skel)));
    fdt = g_malloc0(size);
    _FDT((fdt_open_into(fdt_skel, fdt, size)));
    g_free(fdt_skel);

    /* Fix skeleton up */
    _FDT((spapr_fixup_cpu_dt(fdt, spapr)));

    /* Pack resulting tree */
    _FDT((fdt_pack(fdt)));

    if (fdt_totalsize(fdt) + sizeof(hdr) > size) {
        trace_spapr_cas_failed(size);
        g_free(fdt);
        return -1;
    }

    cpu_physical_memory_write(addr, &hdr, sizeof(hdr));
    cpu_physical_memory_write(addr + sizeof(hdr), fdt, fdt_totalsize(fdt));
    trace_spapr_cas_continue(fdt_totalsize(fdt) + sizeof(hdr));
    g_free(fdt);

    return 0;
}

649 650 651 652 653 654
/*
 * Add a "memory@<start>" node under the root of @fdt describing @size bytes
 * of memory at @start, tagged with NUMA node @nodeid in its
 * "ibm,associativity" property. Terminates QEMU through _FDT() on any
 * libfdt failure.
 */
static void spapr_populate_memory_node(void *fdt, int nodeid, hwaddr start,
                                       hwaddr size)
{
    uint32_t associativity[] = {
        cpu_to_be32(0x4), /* length */
        cpu_to_be32(0x0), cpu_to_be32(0x0),
        cpu_to_be32(0x0), cpu_to_be32(nodeid)
    };
    char mem_name[32];
    uint64_t mem_reg_property[2];
    int off;

    mem_reg_property[0] = cpu_to_be64(start);
    mem_reg_property[1] = cpu_to_be64(size);

    /* Bounded formatting: never write past the end of mem_name */
    snprintf(mem_name, sizeof(mem_name), "memory@" TARGET_FMT_lx, start);
    off = fdt_add_subnode(fdt, 0, mem_name);
    _FDT(off);
    _FDT((fdt_setprop_string(fdt, off, "device_type", "memory")));
    _FDT((fdt_setprop(fdt, off, "reg", mem_reg_property,
                      sizeof(mem_reg_property))));
    _FDT((fdt_setprop(fdt, off, "ibm,associativity", associativity,
                      sizeof(associativity))));
}

674 675
/*
 * Describe guest RAM in @fdt as a series of memory nodes.
 *
 * The first node covers the RMA (spapr->rma_size bytes at address 0). The
 * remainder of each NUMA node is split into power-of-two sized chunks, each
 * no larger than the alignment of its start address. Without NUMA
 * configuration all of ram_size is treated as a single node.
 *
 * Always returns 0; libfdt failures terminate QEMU inside
 * spapr_populate_memory_node().
 */
static int spapr_populate_memory(sPAPREnvironment *spapr, void *fdt)
{
    hwaddr mem_start, node_size;
    int i, nb_nodes = nb_numa_nodes;
    NodeInfo *nodes = numa_info;
    NodeInfo ramnode;

    /* No NUMA nodes, assume there is just one node with whole RAM */
    if (!nb_numa_nodes) {
        nb_nodes = 1;
        ramnode.node_mem = ram_size;
        nodes = &ramnode;
    }

    for (i = 0, mem_start = 0; i < nb_nodes; ++i) {
        if (!nodes[i].node_mem) {
            continue;
        }
        if (mem_start >= ram_size) {
            node_size = 0;
        } else {
            node_size = nodes[i].node_mem;
            /* Never describe more memory than the machine actually has */
            if (node_size > ram_size - mem_start) {
                node_size = ram_size - mem_start;
            }
        }
        if (!mem_start) {
            /* ppc_spapr_init() checks for rma_size <= node0_size already */
            spapr_populate_memory_node(fdt, i, 0, spapr->rma_size);
            mem_start += spapr->rma_size;
            node_size -= spapr->rma_size;
        }
        for ( ; node_size; ) {
            hwaddr sizetmp = pow2floor(node_size);

            /*
             * mem_start != 0 here.
             * Use the 64-bit count: hwaddr values would be truncated by
             * ctzl() on hosts where long is 32 bits wide.
             */
            if (ctz64(mem_start) < ctz64(sizetmp)) {
                sizetmp = 1ULL << ctz64(mem_start);
            }

            spapr_populate_memory_node(fdt, i, mem_start, sizetmp);
            node_size -= sizetmp;
            mem_start += sizetmp;
        }
    }

    return 0;
}

723
/*
 * Complete the skeleton device tree and copy it into guest memory.
 *
 * A working copy of spapr->fdt_skel is extended with the memory, VIO and
 * PCI nodes, the RTAS properties (using @rtas_addr and @rtas_size), the
 * per-CPU fixups and the boot device list, then packed and written to guest
 * physical address @fdt_addr.
 *
 * Failures to populate memory, VIO or PCI nodes are fatal. RTAS, CPU fixup
 * and boot-list failures are only reported on stderr.
 */
static void spapr_finalize_fdt(sPAPREnvironment *spapr,
                               hwaddr fdt_addr,
                               hwaddr rtas_addr,
                               hwaddr rtas_size)
{
    int ret;
    size_t i;
    size_t cb = 0;
    char *bootlist;
    void *fdt;
    sPAPRPHBState *phb;

    fdt = g_malloc(FDT_MAX_SIZE);

    /* open out the base tree into a temp buffer for the final tweaks */
    _FDT((fdt_open_into(spapr->fdt_skel, fdt, FDT_MAX_SIZE)));

    ret = spapr_populate_memory(spapr, fdt);
    if (ret < 0) {
        fprintf(stderr, "couldn't setup memory nodes in fdt\n");
        exit(1);
    }

    ret = spapr_populate_vdevice(spapr->vio_bus, fdt);
    if (ret < 0) {
        fprintf(stderr, "couldn't setup vio devices in fdt\n");
        exit(1);
    }

    QLIST_FOREACH(phb, &spapr->phbs, list) {
        ret = spapr_populate_pci_dt(phb, PHANDLE_XICP, fdt);
        /* Check every host bridge, not only the last one in the list */
        if (ret < 0) {
            fprintf(stderr, "couldn't setup PCI devices in fdt\n");
            exit(1);
        }
    }

    /* RTAS */
    ret = spapr_rtas_device_tree_setup(fdt, rtas_addr, rtas_size);
    if (ret < 0) {
        fprintf(stderr, "Couldn't set up RTAS device tree properties\n");
    }

    /* Advertise NUMA via ibm,associativity */
    ret = spapr_fixup_cpu_dt(fdt, spapr);
    if (ret < 0) {
        fprintf(stderr, "Couldn't finalize CPU device tree properties\n");
    }

    bootlist = get_boot_devices_list(&cb, true);
    if (cb && bootlist) {
        int offset = fdt_path_offset(fdt, "/chosen");
        if (offset < 0) {
            fprintf(stderr, "couldn't find /chosen node in fdt\n");
            exit(1);
        }
        /* The property is a single line: turn newlines into spaces */
        for (i = 0; i < cb; i++) {
            if (bootlist[i] == '\n') {
                bootlist[i] = ' ';
            }
        }
        ret = fdt_setprop_string(fdt, offset, "qemu,boot-list", bootlist);
        if (ret < 0) {
            fprintf(stderr, "Couldn't set up boot list in fdt\n");
        }
    }

    if (!spapr->has_graphics) {
        spapr_populate_chosen_stdout(fdt, spapr->vio_bus);
    }

    _FDT((fdt_pack(fdt)));

    if (fdt_totalsize(fdt) > FDT_MAX_SIZE) {
        hw_error("FDT too big ! 0x%x bytes (max is 0x%x)\n",
                 fdt_totalsize(fdt), FDT_MAX_SIZE);
        exit(1);
    }

    cpu_physical_memory_write(fdt_addr, fdt, fdt_totalsize(fdt));

    g_free(bootlist);
    g_free(fdt);
}

/*
 * Map a kernel image address to its load address: keep the low 28 bits of
 * @addr and rebase them onto KERNEL_LOAD_ADDR. @opaque is unused.
 */
static uint64_t translate_kernel_address(void *opaque, uint64_t addr)
{
    const uint64_t low_bits = addr & 0x0fffffff;

    return low_bits + KERNEL_LOAD_ADDR;
}

810
/*
 * Handle a hypercall issued by @cpu.
 *
 * The hypercall number is taken from GPR3 and the arguments start at GPR4;
 * the result is stored back into GPR3. A call made with MSR[PR]=1 is
 * rejected with H_PRIVILEGE.
 */
static void emulate_spapr_hypercall(PowerPCCPU *cpu)
{
    CPUPPCState *env = &cpu->env;

    if (msr_pr) {
        hcall_dprintf("Hypercall made with MSR[PR]=1\n");
        env->gpr[3] = H_PRIVILEGE;
    } else {
        env->gpr[3] = spapr_hypercall(cpu, env->gpr[3], &env->gpr[4]);
    }
}

822 823 824 825 826 827 828 829 830 831 832 833 834
/*
 * Reset the guest hash page table and, if requested, recompute the RMA size.
 *
 * If kvmppc_reset_htab() returns a positive shift, that shift is recorded
 * and no table is allocated here. Otherwise a table of HTAB_SIZE(spapr)
 * bytes is allocated on first use and cleared on every call.
 */
static void spapr_reset_htab(sPAPREnvironment *spapr)
{
    long shift;

    /* allocate hash page table.  For now we always make this 16mb,
     * later we should probably make it scale to the size of guest
     * RAM */

    shift = kvmppc_reset_htab(spapr->htab_shift);

    if (shift > 0) {
        /* Kernel handles htab, we don't need to allocate one */
        spapr->htab_shift = shift;
        kvmppc_kern_htab = true;
    } else {
        if (!spapr->htab) {
            /* Allocate an htab if we don't yet have one */
            spapr->htab = qemu_memalign(HTAB_SIZE(spapr), HTAB_SIZE(spapr));
        }

        /* And clear it */
        memset(spapr->htab, 0, HTAB_SIZE(spapr));
    }

    /* Update the RMA size if necessary */
    if (spapr->vrma_adjust) {
        spapr->rma_size = kvmppc_rma_size(spapr_node0_size(),
                                          spapr->htab_shift);
    }
}

853
static void ppc_spapr_reset(void)
854
{
855
    PowerPCCPU *first_ppc_cpu;
856
    uint32_t rtas_limit;
857

858 859
    /* Reset the hash table & recalc the RMA */
    spapr_reset_htab(spapr);
860

861
    qemu_devices_reset();
862

863 864 865 866 867 868 869 870 871
    /*
     * We place the device tree and RTAS just below either the top of the RMA,
     * or just below 2GB, whichever is lowere, so that it can be
     * processed with 32-bit real mode code if necessary
     */
    rtas_limit = MIN(spapr->rma_size, RTAS_MAX_ADDR);
    spapr->rtas_addr = rtas_limit - RTAS_MAX_SIZE;
    spapr->fdt_addr = spapr->rtas_addr - FDT_MAX_SIZE;

872 873 874 875
    /* Load the fdt */
    spapr_finalize_fdt(spapr, spapr->fdt_addr, spapr->rtas_addr,
                       spapr->rtas_size);

876 877 878 879
    /* Copy RTAS over */
    cpu_physical_memory_write(spapr->rtas_addr, spapr->rtas_blob,
                              spapr->rtas_size);

880
    /* Set up the entry state */
881 882 883 884 885
    first_ppc_cpu = POWERPC_CPU(first_cpu);
    first_ppc_cpu->env.gpr[3] = spapr->fdt_addr;
    first_ppc_cpu->env.gpr[5] = 0;
    first_cpu->halted = 0;
    first_ppc_cpu->env.nip = spapr->entry_point;
886 887 888

}

889 890
static void spapr_cpu_reset(void *opaque)
{
891
    PowerPCCPU *cpu = opaque;
892
    CPUState *cs = CPU(cpu);
893
    CPUPPCState *env = &cpu->env;
894

895
    cpu_reset(cs);
896 897 898 899

    /* All CPUs start halted.  CPU0 is unhalted from the machine level
     * reset code and the rest are explicitly started up by the guest
     * using an RTAS call */
900
    cs->halted = 1;
901 902

    env->spr[SPR_HIOR] = 0;
903

904
    env->external_htab = (uint8_t *)spapr->htab;
905 906 907 908 909 910 911
    if (kvm_enabled() && !env->external_htab) {
        /*
         * HV KVM, set external_htab to 1 so our ppc_hash64_load_hpte*
         * functions do the right thing.
         */
        env->external_htab = (void *)1;
    }
912
    env->htab_base = -1;
913 914 915 916 917 918 919
    /*
     * htab_mask is the mask used to normalize hash value to PTEG index.
     * htab_shift is log2 of hash table size.
     * We have 8 hpte per group, and each hpte is 16 bytes.
     * ie have 128 bytes per hpte entry.
     */
    env->htab_mask = (1ULL << ((spapr)->htab_shift - 7)) - 1;
920
    env->spr[SPR_SDR1] = (target_ulong)(uintptr_t)spapr->htab |
921
        (spapr->htab_shift - 18);
922 923
}

D
David Gibson 已提交
924 925
/*
 * Create the "spapr-nvram" device on the VIO bus and remember it in
 * spapr->nvram.  If a pflash drive (bus 0, unit 0) was configured it is
 * attached as the device's backing "drive".
 */
static void spapr_create_nvram(sPAPREnvironment *spapr)
{
    DeviceState *nvram_dev = qdev_create(&spapr->vio_bus->bus, "spapr-nvram");
    DriveInfo *backing = drive_get(IF_PFLASH, 0, 0);

    if (backing != NULL) {
        qdev_prop_set_drive_nofail(nvram_dev, "drive",
                                   blk_by_legacy_dinfo(backing));
    }

    qdev_init_nofail(nvram_dev);

    spapr->nvram = (struct sPAPRNVRAM *)nvram_dev;
}

938
/* Returns whether we want to use VGA or not */
939 940
static int spapr_vga_init(PCIBus *pci_bus)
{
941 942
    switch (vga_interface_type) {
    case VGA_NONE:
943 944 945
        return false;
    case VGA_DEVICE:
        return true;
946 947
    case VGA_STD:
        return pci_vga_init(pci_bus) != NULL;
948
    default:
949 950
        fprintf(stderr, "This vga model is not supported,"
                "currently it only supports -vga std\n");
951
        exit(0);
952 953 954
    }
}

955 956
/*
 * Migration description of the machine-wide sPAPR state.
 *
 * Stream version 2 added the timebase (tb) field; version 1 streams are
 * still accepted.
 */
static const VMStateDescription vmstate_spapr = {
    .name = "spapr",
    .version_id = 2,
    .minimum_version_id = 1,
    .fields = (VMStateField[]) {
        /* Four bytes kept for stream compatibility (used to be @next_irq) */
        VMSTATE_UNUSED(4),

        /* RTC offset */
        VMSTATE_UINT64(rtc_offset, sPAPREnvironment),

        /* Timebase, only present from version 2 onwards */
        VMSTATE_PPC_TIMEBASE_V(tb, sPAPREnvironment, 2),
        VMSTATE_END_OF_LIST()
    },
};

/*
 * Helpers for walking the hash page table as an array of entries made of
 * two uint64_t words (16 bytes) each.
 *
 * HPTE(_table, _i)  - pointer to entry _i of _table
 * HPTE_VALID(_hpte) - non-zero if HPTE64_V_VALID is set in the first word
 * HPTE_DIRTY(_hpte) - non-zero if HPTE64_V_HPTE_DIRTY is set in the first word
 * CLEAN_HPTE(_hpte) - clear HPTE64_V_HPTE_DIRTY in the first word, in place
 *
 * The first word is passed through tswap64() before testing (and the mask is
 * passed through tswap64() before clearing) -- presumably to account for the
 * table being stored in target byte order; confirm against tswap64().
 */
#define HPTE(_table, _i)   (void *)(((uint64_t *)(_table)) + ((_i) * 2))
#define HPTE_VALID(_hpte)  (tswap64(*((uint64_t *)(_hpte))) & HPTE64_V_VALID)
#define HPTE_DIRTY(_hpte)  (tswap64(*((uint64_t *)(_hpte))) & HPTE64_V_HPTE_DIRTY)
#define CLEAN_HPTE(_hpte)  ((*(uint64_t *)(_hpte)) &= tswap64(~HPTE64_V_HPTE_DIRTY))

/*
 * save_live_setup handler for the hash table.
 *
 * Writes the non-zero section header (the hash table shift) and prepares the
 * iteration state: either the walk position over the userspace table, or an
 * fd for reading the table from KVM when there is no userspace table.
 *
 * Returns 0 on success, -1 if the KVM fd cannot be opened.
 */
static int htab_save_setup(QEMUFile *f, void *opaque)
{
    sPAPREnvironment *spapr = opaque;

    /* "Iteration" header */
    qemu_put_be32(f, spapr->htab_shift);

    if (spapr->htab) {
        /* Userspace table: start a fresh first pass from slot 0 */
        spapr->htab_save_index = 0;
        spapr->htab_first_pass = true;
        return 0;
    }

    /* No userspace table: it must be managed by KVM */
    assert(kvm_enabled());

    spapr->htab_fd = kvmppc_get_htab_fd(false);
    if (spapr->htab_fd < 0) {
        fprintf(stderr, "Unable to open fd for reading hash table from KVM: %s\n",
                strerror(errno));
        return -1;
    }

    return 0;
}

/*
 * First migration pass over the userspace hash table.
 *
 * Walks the table from spapr->htab_save_index, clearing the dirty bit of
 * every entry it passes and sending each run of valid entries as one chunk
 * (index, n_valid, n_invalid = 0, followed by the raw entries).
 *
 * @f:      migration stream to write to
 * @spapr:  machine state holding the table and the walk position
 * @max_ns: time budget; checked after each chunk that is sent
 *
 * The walk also stops when the stream's rate limit is hit.  When the end of
 * the table is reached the first pass is marked complete and the saved index
 * wraps to 0; otherwise the position is saved for the next call.
 */
static void htab_save_first_pass(QEMUFile *f, sPAPREnvironment *spapr,
                                 int64_t max_ns)
{
    int htabslots = HTAB_SIZE(spapr) / HASH_PTE_SIZE_64;
    int index = spapr->htab_save_index;
    int64_t starttime = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);

    assert(spapr->htab_first_pass);

    do {
        int chunkstart;

        /*
         * Consume invalid HPTEs.  The entry must be cleaned BEFORE index is
         * advanced: cleaning after the increment would clear the dirty bit
         * of the following entry instead, leave this one dirty, and touch
         * one entry past the end of the table on the last slot.
         */
        while ((index < htabslots)
               && !HPTE_VALID(HPTE(spapr->htab, index))) {
            CLEAN_HPTE(HPTE(spapr->htab, index));
            index++;
        }

        /* Consume valid HPTEs (same clean-then-advance ordering) */
        chunkstart = index;
        while ((index < htabslots)
               && HPTE_VALID(HPTE(spapr->htab, index))) {
            CLEAN_HPTE(HPTE(spapr->htab, index));
            index++;
        }

        if (index > chunkstart) {
            int n_valid = index - chunkstart;

            qemu_put_be32(f, chunkstart);
            qemu_put_be16(f, n_valid);
            qemu_put_be16(f, 0);
            qemu_put_buffer(f, HPTE(spapr->htab, chunkstart),
                            HASH_PTE_SIZE_64 * n_valid);

            if ((qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - starttime) > max_ns) {
                break;
            }
        }
    } while ((index < htabslots) && !qemu_file_rate_limit(f));

    if (index >= htabslots) {
        assert(index == htabslots);
        index = 0;
        spapr->htab_first_pass = false;
    }
    spapr->htab_save_index = index;
}

1049 1050
/*
 * Second and later migration passes over the userspace hash table.
 *
 * Starting at spapr->htab_save_index, repeatedly skips clean entries and
 * then sends one chunk made of a run of dirty valid entries followed by a
 * run of dirty invalid entries (only the valid ones carry data).  The dirty
 * bit of every entry that is sent is cleared.
 *
 * @f:      migration stream to write to
 * @spapr:  machine state holding the table and the walk position
 * @max_ns: time budget checked after each chunk; a negative value marks the
 *          final pass, which ignores both the budget and the rate limit
 *
 * Returns 1 when the whole table was examined and nothing had to be sent,
 * 0 otherwise.
 */
static int htab_save_later_pass(QEMUFile *f, sPAPREnvironment *spapr,
                                int64_t max_ns)
{
    bool final = max_ns < 0;
    int nslots = HTAB_SIZE(spapr) / HASH_PTE_SIZE_64;
    int scanned = 0, emitted = 0;
    int slot = spapr->htab_save_index;
    int64_t t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);

    assert(!spapr->htab_first_pass);

    do {
        int run_start, invalid_start;

        /* Skip entries that are not dirty */
        for (; (slot < nslots)
               && !HPTE_DIRTY(HPTE(spapr->htab, slot)); slot++, scanned++) {
            /* nothing to do */
        }

        /* Run of dirty entries that are valid */
        run_start = slot;
        for (; (slot < nslots)
               && HPTE_DIRTY(HPTE(spapr->htab, slot))
               && HPTE_VALID(HPTE(spapr->htab, slot)); slot++, scanned++) {
            CLEAN_HPTE(HPTE(spapr->htab, slot));
        }

        /* Run of dirty entries that are invalid */
        invalid_start = slot;
        for (; (slot < nslots)
               && HPTE_DIRTY(HPTE(spapr->htab, slot))
               && !HPTE_VALID(HPTE(spapr->htab, slot)); slot++, scanned++) {
            CLEAN_HPTE(HPTE(spapr->htab, slot));
        }

        if (slot > run_start) {
            int n_valid = invalid_start - run_start;
            int n_invalid = slot - invalid_start;

            qemu_put_be32(f, run_start);
            qemu_put_be16(f, n_valid);
            qemu_put_be16(f, n_invalid);
            qemu_put_buffer(f, HPTE(spapr->htab, run_start),
                            HASH_PTE_SIZE_64 * n_valid);
            emitted += slot - run_start;

            /* The clock is only consulted on non-final passes */
            if (!final && (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) > max_ns) {
                break;
            }
        }

        /* A full lap of the table has been examined */
        if (scanned >= nslots) {
            break;
        }

        /* Wrap around and continue from the start of the table */
        if (slot >= nslots) {
            assert(slot == nslots);
            slot = 0;
        }
    } while ((scanned < nslots) && (!qemu_file_rate_limit(f) || final));

    if (slot >= nslots) {
        assert(slot == nslots);
        slot = 0;
    }

    spapr->htab_save_index = slot;

    if ((scanned >= nslots) && (emitted == 0)) {
        return 1;
    }
    return 0;
}

1126 1127 1128
/* Time budget passed to each hash table save pass by htab_save_iterate() */
#define MAX_ITERATION_NS    5000000 /* 5 ms */
/*
 * Third argument of kvmppc_save_htab(); presumably the size in bytes of the
 * buffer used to read from the KVM htab fd -- confirm against its definition.
 */
#define MAX_KVM_BUF_SIZE    2048

1129 1130 1131
/*
 * save_live_iterate handler for the hash table.
 *
 * Emits a zero section header, one time-bounded batch of hash table chunks
 * and the all-zero end marker.
 *
 * Returns a negative value if reading the table from KVM failed (in which
 * case no end marker is written), otherwise the result of the pass that ran;
 * the first pass over a userspace table always yields 0.
 */
static int htab_save_iterate(QEMUFile *f, void *opaque)
{
    sPAPREnvironment *spapr = opaque;
    int rc = 0;

    /* Iteration header */
    qemu_put_be32(f, 0);

    if (spapr->htab) {
        /* Userspace table */
        if (spapr->htab_first_pass) {
            htab_save_first_pass(f, spapr, MAX_ITERATION_NS);
        } else {
            rc = htab_save_later_pass(f, spapr, MAX_ITERATION_NS);
        }
    } else {
        /* Table lives in KVM: read it through the fd opened at setup */
        assert(kvm_enabled());

        rc = kvmppc_save_htab(f, spapr->htab_fd,
                              MAX_KVM_BUF_SIZE, MAX_ITERATION_NS);
        if (rc < 0) {
            return rc;
        }
    }

    /* End marker */
    qemu_put_be32(f, 0);
    qemu_put_be16(f, 0);
    qemu_put_be16(f, 0);

    return rc;
}

/*
 * save_live_complete handler for the hash table.
 *
 * Emits a zero section header, everything that is still outstanding (a
 * final, unbounded pass: the time budget is passed as -1) and the all-zero
 * end marker.
 *
 * When the table lives in KVM, the fd opened by htab_save_setup() is closed
 * here whether or not the final read succeeded, so that a failed migration
 * does not leak it.
 *
 * Returns 0 on success or the negative value from kvmppc_save_htab().
 */
static int htab_save_complete(QEMUFile *f, void *opaque)
{
    sPAPREnvironment *spapr = opaque;

    /* Iteration header */
    qemu_put_be32(f, 0);

    if (!spapr->htab) {
        int rc;

        assert(kvm_enabled());

        rc = kvmppc_save_htab(f, spapr->htab_fd, MAX_KVM_BUF_SIZE, -1);

        /* This is the last use of the fd on both the success and error path */
        close(spapr->htab_fd);
        spapr->htab_fd = -1;

        if (rc < 0) {
            return rc;
        }
    } else {
        htab_save_later_pass(f, spapr, -1);
    }

    /* End marker */
    qemu_put_be32(f, 0);
    qemu_put_be16(f, 0);
    qemu_put_be16(f, 0);

    return 0;
}

/*
 * load_state handler for the hash table.
 *
 * @f:          incoming migration stream
 * @opaque:     the sPAPREnvironment
 * @version_id: stream version; only version 1 is accepted
 *
 * A non-zero section header is the setup section and carries the source's
 * hash table shift, which must match ours.  A zero header is followed by
 * (index, n_valid, n_invalid) chunks, terminated by an all-zero triple.
 * Chunks are written into the userspace table, or pushed to KVM through an
 * htab fd when there is no userspace table.
 *
 * Returns 0 on success and a negative value on a bad version, a mismatching
 * shift, a chunk outside the table, failure to open the KVM fd or failure to
 * load a chunk into KVM.  The KVM fd is closed on every exit path.
 */
static int htab_load(QEMUFile *f, void *opaque, int version_id)
{
    sPAPREnvironment *spapr = opaque;
    uint32_t section_hdr;
    int fd = -1;
    int ret = 0;

    if (version_id < 1 || version_id > 1) {
        fprintf(stderr, "htab_load() bad version\n");
        return -EINVAL;
    }

    section_hdr = qemu_get_be32(f);

    if (section_hdr) {
        /* First section, just the hash shift */
        if (spapr->htab_shift != section_hdr) {
            return -EINVAL;
        }
        return 0;
    }

    if (!spapr->htab) {
        assert(kvm_enabled());

        fd = kvmppc_get_htab_fd(true);
        if (fd < 0) {
            fprintf(stderr, "Unable to open fd to restore KVM hash table: %s\n",
                    strerror(errno));
            /* Fail the load instead of tripping an assertion further down */
            return -1;
        }
    }

    while (true) {
        uint32_t index;
        uint16_t n_valid, n_invalid;

        index = qemu_get_be32(f);
        n_valid = qemu_get_be16(f);
        n_invalid = qemu_get_be16(f);

        if ((index == 0) && (n_valid == 0) && (n_invalid == 0)) {
            /* End of Stream */
            break;
        }

        /*
         * The values come straight from the stream: do the sum in 64 bits
         * so that a large index cannot wrap around and slip past the check.
         */
        if (((uint64_t)index + n_valid + n_invalid) >
            (HTAB_SIZE(spapr) / HASH_PTE_SIZE_64)) {
            /* Bad index in stream */
            fprintf(stderr, "htab_load() bad index %" PRIu32
                    " (%hu+%hu entries) "
                    "in htab stream (htab_shift=%d)\n", index, n_valid, n_invalid,
                    spapr->htab_shift);
            ret = -EINVAL;
            break;
        }

        if (spapr->htab) {
            if (n_valid) {
                qemu_get_buffer(f, HPTE(spapr->htab, index),
                                HASH_PTE_SIZE_64 * n_valid);
            }
            if (n_invalid) {
                memset(HPTE(spapr->htab, index + n_valid), 0,
                       HASH_PTE_SIZE_64 * n_invalid);
            }
        } else {
            assert(fd >= 0);

            ret = kvmppc_load_htab_chunk(f, fd, index, n_valid, n_invalid);
            if (ret < 0) {
                break;
            }
            ret = 0;
        }
    }

    /* fd is only ever opened on the KVM path */
    if (fd >= 0) {
        close(fd);
    }

    return ret;
}

/*
 * Live migration callbacks for the hash page table.  ppc_spapr_init()
 * registers them under the name "spapr/htab".
 */
static SaveVMHandlers savevm_htab_handlers = {
    .save_live_setup = htab_save_setup,
    .save_live_iterate = htab_save_iterate,
    .save_live_complete = htab_save_complete,
    .load_state = htab_load,
};

1278
/* pSeries LPAR / sPAPR hardware init */
1279
static void ppc_spapr_init(MachineState *machine)
1280
{
1281 1282 1283 1284 1285 1286
    ram_addr_t ram_size = machine->ram_size;
    const char *cpu_model = machine->cpu_model;
    const char *kernel_filename = machine->kernel_filename;
    const char *kernel_cmdline = machine->kernel_cmdline;
    const char *initrd_filename = machine->initrd_filename;
    const char *boot_device = machine->boot_order;
1287
    PowerPCCPU *cpu;
A
Andreas Färber 已提交
1288
    CPUPPCState *env;
1289
    PCIHostState *phb;
1290
    int i;
A
Avi Kivity 已提交
1291 1292
    MemoryRegion *sysmem = get_system_memory();
    MemoryRegion *ram = g_new(MemoryRegion, 1);
1293 1294
    MemoryRegion *rma_region;
    void *rma = NULL;
A
Avi Kivity 已提交
1295
    hwaddr rma_alloc_size;
1296
    hwaddr node0_size = spapr_node0_size();
1297 1298
    uint32_t initrd_base = 0;
    long kernel_size = 0, initrd_size = 0;
1299
    long load_limit, fw_size;
1300
    bool kernel_le = false;
1301
    char *filename;
1302

1303 1304
    msi_supported = true;

1305 1306 1307
    spapr = g_malloc0(sizeof(*spapr));
    QLIST_INIT(&spapr->phbs);

1308 1309
    cpu_ppc_hypercall = emulate_spapr_hypercall;

1310
    /* Allocate RMA if necessary */
1311
    rma_alloc_size = kvmppc_alloc_rma(&rma);
1312 1313 1314 1315 1316

    if (rma_alloc_size == -1) {
        hw_error("qemu: Unable to create RMA\n");
        exit(1);
    }
1317

1318
    if (rma_alloc_size && (rma_alloc_size < node0_size)) {
1319
        spapr->rma_size = rma_alloc_size;
1320
    } else {
1321
        spapr->rma_size = node0_size;
1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335

        /* With KVM, we don't actually know whether KVM supports an
         * unbounded RMA (PR KVM) or is limited by the hash table size
         * (HV KVM using VRMA), so we always assume the latter
         *
         * In that case, we also limit the initial allocations for RTAS
         * etc... to 256M since we have no way to know what the VRMA size
         * is going to be as it depends on the size of the hash table
         * isn't determined yet.
         */
        if (kvm_enabled()) {
            spapr->vrma_adjust = 1;
            spapr->rma_size = MIN(spapr->rma_size, 0x10000000);
        }
1336 1337
    }

1338 1339 1340 1341 1342 1343
    if (spapr->rma_size > node0_size) {
        fprintf(stderr, "Error: Numa node 0 has to span the RMA (%#08"HWADDR_PRIx")\n",
                spapr->rma_size);
        exit(1);
    }

1344 1345
    /* Setup a load limit for the ramdisk leaving room for SLOF and FDT */
    load_limit = MIN(spapr->rma_size, RTAS_MAX_ADDR) - FW_OVERHEAD;
1346

1347 1348 1349 1350 1351 1352 1353 1354 1355 1356
    /* We aim for a hash table of size 1/128 the size of RAM.  The
     * normal rule of thumb is 1/64 the size of RAM, but that's much
     * more than needed for the Linux guests we support. */
    spapr->htab_shift = 18; /* Minimum architected size */
    while (spapr->htab_shift <= 46) {
        if ((1ULL << (spapr->htab_shift + 7)) >= ram_size) {
            break;
        }
        spapr->htab_shift++;
    }
1357

1358 1359 1360 1361
    /* Set up Interrupt Controller before we create the VCPUs */
    spapr->icp = xics_system_init(smp_cpus * kvmppc_smt_threads() / smp_threads,
                                  XICS_IRQS);

1362 1363
    /* init CPUs */
    if (cpu_model == NULL) {
1364
        cpu_model = kvm_enabled() ? "host" : "POWER7";
1365 1366
    }
    for (i = 0; i < smp_cpus; i++) {
1367 1368
        cpu = cpu_ppc_init(cpu_model);
        if (cpu == NULL) {
1369 1370 1371
            fprintf(stderr, "Unable to find PowerPC CPU definition\n");
            exit(1);
        }
1372 1373
        env = &cpu->env;

1374 1375 1376
        /* Set time-base frequency to 512 MHz */
        cpu_ppc_tb_init(env, TIMEBASE_FREQ);

1377 1378 1379 1380
        /* PAPR always has exception vectors in RAM not ROM. To ensure this,
         * MSR[IP] should never be set.
         */
        env->msr_mask &= ~(1 << 6);
1381 1382 1383

        /* Tell KVM that we're in PAPR mode */
        if (kvm_enabled()) {
1384
            kvmppc_set_papr(cpu);
1385 1386
        }

1387 1388 1389 1390 1391 1392
        if (cpu->max_compat) {
            if (ppc_set_compat(cpu, cpu->max_compat) < 0) {
                exit(1);
            }
        }

1393 1394
        xics_cpu_setup(spapr->icp, cpu);

1395
        qemu_register_reset(spapr_cpu_reset, cpu);
1396 1397 1398
    }

    /* allocate RAM */
1399
    spapr->ram_limit = ram_size;
1400 1401 1402
    memory_region_allocate_system_memory(ram, NULL, "ppc_spapr.ram",
                                         spapr->ram_limit);
    memory_region_add_subregion(sysmem, 0, ram);
1403

1404 1405 1406 1407 1408 1409 1410 1411
    if (rma_alloc_size && rma) {
        rma_region = g_new(MemoryRegion, 1);
        memory_region_init_ram_ptr(rma_region, NULL, "ppc_spapr.rma",
                                   rma_alloc_size, rma);
        vmstate_register_ram_global(rma_region);
        memory_region_add_subregion(sysmem, 0, rma_region);
    }

1412
    filename = qemu_find_file(QEMU_FILE_TYPE_BIOS, "spapr-rtas.bin");
1413 1414 1415
    spapr->rtas_size = get_image_size(filename);
    spapr->rtas_blob = g_malloc(spapr->rtas_size);
    if (load_image_size(filename, spapr->rtas_blob, spapr->rtas_size) < 0) {
1416 1417 1418
        hw_error("qemu: could not load LPAR rtas '%s'\n", filename);
        exit(1);
    }
1419
    if (spapr->rtas_size > RTAS_MAX_SIZE) {
1420
        hw_error("RTAS too big ! 0x%zx bytes (max is 0x%x)\n",
1421 1422 1423
                 spapr->rtas_size, RTAS_MAX_SIZE);
        exit(1);
    }
1424
    g_free(filename);
1425

1426 1427 1428
    /* Set up EPOW events infrastructure */
    spapr_events_init(spapr);

1429
    /* Set up VIO bus */
1430 1431
    spapr->vio_bus = spapr_vio_bus_init();

P
Paolo Bonzini 已提交
1432
    for (i = 0; i < MAX_SERIAL_PORTS; i++) {
1433
        if (serial_hds[i]) {
1434
            spapr_vty_create(spapr->vio_bus, serial_hds[i]);
1435 1436
        }
    }
1437

D
David Gibson 已提交
1438 1439 1440
    /* We always have at least the nvram device on VIO */
    spapr_create_nvram(spapr);

1441
    /* Set up PCI */
1442 1443
    spapr_pci_rtas_init();

1444
    phb = spapr_create_phb(spapr, 0);
1445

P
Paolo Bonzini 已提交
1446
    for (i = 0; i < nb_nics; i++) {
1447 1448 1449
        NICInfo *nd = &nd_table[i];

        if (!nd->model) {
1450
            nd->model = g_strdup("ibmveth");
1451 1452 1453
        }

        if (strcmp(nd->model, "ibmveth") == 0) {
1454
            spapr_vlan_create(spapr->vio_bus, nd);
1455
        } else {
1456
            pci_nic_init_nofail(&nd_table[i], phb->bus, nd->model, NULL);
1457 1458 1459
        }
    }

1460
    for (i = 0; i <= drive_get_max_bus(IF_SCSI); i++) {
1461
        spapr_vscsi_create(spapr->vio_bus);
1462 1463
    }

1464
    /* Graphics */
1465
    if (spapr_vga_init(phb->bus)) {
1466
        spapr->has_graphics = true;
1467 1468
    }

1469
    if (usb_enabled(spapr->has_graphics)) {
1470
        pci_create_simple(phb->bus, -1, "pci-ohci");
1471 1472 1473 1474 1475 1476
        if (spapr->has_graphics) {
            usbdevice_create("keyboard");
            usbdevice_create("mouse");
        }
    }

1477
    if (spapr->rma_size < (MIN_RMA_SLOF << 20)) {
1478 1479 1480 1481 1482
        fprintf(stderr, "qemu: pSeries SLOF firmware requires >= "
                "%ldM guest RMA (Real Mode Area memory)\n", MIN_RMA_SLOF);
        exit(1);
    }

1483 1484 1485 1486 1487
    if (kernel_filename) {
        uint64_t lowaddr = 0;

        kernel_size = load_elf(kernel_filename, translate_kernel_address, NULL,
                               NULL, &lowaddr, NULL, 1, ELF_MACHINE, 0);
1488
        if (kernel_size == ELF_LOAD_WRONG_ENDIAN) {
1489 1490 1491 1492 1493
            kernel_size = load_elf(kernel_filename,
                                   translate_kernel_address, NULL,
                                   NULL, &lowaddr, NULL, 0, ELF_MACHINE, 0);
            kernel_le = kernel_size > 0;
        }
1494
        if (kernel_size < 0) {
1495 1496
            fprintf(stderr, "qemu: error loading %s: %s\n",
                    kernel_filename, load_elf_strerror(kernel_size));
1497 1498 1499 1500 1501
            exit(1);
        }

        /* load initrd */
        if (initrd_filename) {
1502 1503 1504 1505
            /* Try to locate the initrd in the gap between the kernel
             * and the firmware. Add a bit of space just in case
             */
            initrd_base = (KERNEL_LOAD_ADDR + kernel_size + 0x1ffff) & ~0xffff;
1506
            initrd_size = load_image_targphys(initrd_filename, initrd_base,
1507
                                              load_limit - initrd_base);
1508 1509 1510 1511 1512 1513 1514 1515 1516
            if (initrd_size < 0) {
                fprintf(stderr, "qemu: could not load initial ram disk '%s'\n",
                        initrd_filename);
                exit(1);
            }
        } else {
            initrd_base = 0;
            initrd_size = 0;
        }
1517
    }
1518

1519 1520 1521 1522
    if (bios_name == NULL) {
        bios_name = FW_FILE_NAME;
    }
    filename = qemu_find_file(QEMU_FILE_TYPE_BIOS, bios_name);
1523 1524 1525 1526 1527 1528 1529 1530 1531
    fw_size = load_image_targphys(filename, 0, FW_MAX_SIZE);
    if (fw_size < 0) {
        hw_error("qemu: could not load LPAR rtas '%s'\n", filename);
        exit(1);
    }
    g_free(filename);

    spapr->entry_point = 0x100;

1532 1533 1534 1535
    vmstate_register(NULL, 0, &vmstate_spapr, spapr);
    register_savevm_live(NULL, "spapr/htab", -1, 1,
                         &savevm_htab_handlers, spapr);

1536
    /* Prepare the device tree */
1537
    spapr->fdt_skel = spapr_create_fdt_skel(initrd_base, initrd_size,
1538
                                            kernel_size, kernel_le,
1539 1540
                                            boot_device, kernel_cmdline,
                                            spapr->epow_irq);
1541
    assert(spapr->fdt_skel != NULL);
1542 1543
}

1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561
/*
 * Translate the "kvm-type" machine option into the numeric KVM VM type.
 *
 * Returns 0 when no type was given, 1 for "HV" and 2 for "PR".  Any other
 * string is reported as an error and terminates QEMU.
 */
static int spapr_kvm_type(const char *vm_type)
{
    if (vm_type == NULL) {
        return 0;
    }

    if (strcmp(vm_type, "HV") == 0) {
        return 1;
    }

    if (strcmp(vm_type, "PR") == 0) {
        return 2;
    }

    error_report("Unknown kvm-type specified '%s'", vm_type);
    exit(1);
}

1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618
/*
 * FWPathProvider hook: adjusts the firmware device path used for the
 * bootindex property handling.
 *
 * Returns a newly allocated path component for SCSI disks sitting behind
 * spapr-vscsi, virtio-scsi or a USB device, and for sPAPR PCI host bridges.
 * Returns NULL for everything else.
 */
static char *spapr_get_fw_dev_path(FWPathProvider *p, BusState *bus,
                                   DeviceState *dev)
{
#define CAST(type, obj, name) \
    ((type *)object_dynamic_cast(OBJECT(obj), (name)))
    SCSIDevice *d = CAST(SCSIDevice,  dev, TYPE_SCSI_DEVICE);
    sPAPRPHBState *phb = CAST(sPAPRPHBState, dev, TYPE_SPAPR_PCI_HOST_BRIDGE);

    if (d != NULL) {
        void *spapr = CAST(void, bus->parent, "spapr-vscsi");
        VirtIOSCSI *virtio = CAST(VirtIOSCSI, bus->parent, TYPE_VIRTIO_SCSI);
        USBDevice *usb = CAST(USBDevice, bus->parent, TYPE_USB_DEVICE);
        unsigned srp_lun;

        if (spapr != NULL) {
            /*
             * Replace "channel@0/disk@0,0" with "disk@8000000000000000":
             * We use SRP luns of the form 8000 | (bus << 8) | (id << 5) | lun
             * in the top 16 bits of the 64-bit LUN
             */
            srp_lun = 0x8000 | (d->id << 8) | d->lun;
            return g_strdup_printf("%s@%"PRIX64, qdev_fw_name(dev),
                                   (uint64_t)srp_lun << 48);
        }

        if (virtio != NULL) {
            /*
             * We use SRP luns of the form 01000000 | (target << 8) | lun
             * in the top 32 bits of the 64-bit LUN
             * Note: the quote above is from SLOF and it is wrong,
             * the actual binding is:
             * swap 0100 or 10 << or 20 << ( target lun-id -- srplun )
             */
            srp_lun = 0x1000000 | (d->id << 16) | d->lun;
            return g_strdup_printf("%s@%"PRIX64, qdev_fw_name(dev),
                                   (uint64_t)srp_lun << 32);
        }

        if (usb != NULL) {
            /*
             * We use SRP luns of the form 01000000 | (usb-port << 16) | lun
             * in the top 32 bits of the 64-bit LUN
             */
            unsigned usb_port = atoi(usb->port->path);

            srp_lun = 0x1000000 | (usb_port << 16) | d->lun;
            return g_strdup_printf("%s@%"PRIX64, qdev_fw_name(dev),
                                   (uint64_t)srp_lun << 32);
        }
    }

    if (phb != NULL) {
        /* Replace "pci" with "pci@800000020000000" */
        return g_strdup_printf("pci@%"PRIX64, phb->buid);
    }

    return NULL;
}

E
Eduardo Habkost 已提交
1619 1620
/*
 * Getter for the "kvm-type" machine property: returns a newly allocated copy
 * of the stored string.  errp is not used.
 */
static char *spapr_get_kvm_type(Object *obj, Error **errp)
{
    return g_strdup(SPAPR_MACHINE(obj)->kvm_type);
}

/*
 * Setter for the "kvm-type" machine property: frees the previous string and
 * stores a private copy of value.  errp is not used.
 */
static void spapr_set_kvm_type(Object *obj, const char *value, Error **errp)
{
    sPAPRMachineState *machine_state = SPAPR_MACHINE(obj);

    /* Release the old value first, then take our own copy of the new one */
    g_free(machine_state->kvm_type);
    machine_state->kvm_type = g_strdup(value);
}

/* Instance init: expose "kvm-type" as a string property of the machine */
static void spapr_machine_initfn(Object *obj)
{
    object_property_add_str(obj,
                            "kvm-type",
                            spapr_get_kvm_type,
                            spapr_set_kvm_type,
                            NULL);
}

1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656
/*
 * Work item scheduled on a CPU by spapr_nmi(): synchronizes the CPU state
 * and then delivers a system reset to that CPU.
 *
 * @arg: the CPUState to act on.
 */
static void ppc_cpu_do_nmi_on_cpu(void *arg)
{
    CPUState *target = arg;

    cpu_synchronize_state(target);
    ppc_cpu_do_system_reset(target);
}

/*
 * NMI monitor handler: queues ppc_cpu_do_nmi_on_cpu() on every CPU.
 * Neither cpu_index nor errp is used.
 */
static void spapr_nmi(NMIState *n, int cpu_index, Error **errp)
{
    CPUState *each_cpu;

    CPU_FOREACH(each_cpu) {
        async_run_on_cpu(each_cpu, ppc_cpu_do_nmi_on_cpu, each_cpu);
    }
}

1657 1658 1659
/*
 * Class init for the base pSeries machine: fills in the MachineClass
 * description and callbacks and wires up the firmware path provider and
 * NMI interfaces.
 */
static void spapr_machine_class_init(ObjectClass *oc, void *data)
{
    MachineClass *mc = MACHINE_CLASS(oc);
    FWPathProviderClass *fwc = FW_PATH_PROVIDER_CLASS(oc);
    NMIClass *nc = NMI_CLASS(oc);

    /* Identification */
    mc->name = "pseries";
    mc->desc = "pSeries Logical Partition (PAPR compliant)";
    mc->is_default = 1;

    /* Callbacks */
    mc->init = ppc_spapr_init;
    mc->reset = ppc_spapr_reset;
    mc->kvm_type = spapr_kvm_type;

    /* Machine defaults and limits */
    mc->block_default_type = IF_SCSI;
    mc->max_cpus = MAX_CPUS;
    mc->no_parallel = 1;
    mc->default_boot_order = NULL;

    /* Interfaces implemented by the machine */
    fwc->get_dev_path = spapr_get_fw_dev_path;
    nc->nmi_monitor_handler = spapr_nmi;
}

static const TypeInfo spapr_machine_info = {
    .name          = TYPE_SPAPR_MACHINE,
    .parent        = TYPE_MACHINE,
1681
    .instance_size = sizeof(sPAPRMachineState),
E
Eduardo Habkost 已提交
1682
    .instance_init = spapr_machine_initfn,
1683
    .class_init    = spapr_machine_class_init,
1684 1685
    .interfaces = (InterfaceInfo[]) {
        { TYPE_FW_PATH_PROVIDER },
1686
        { TYPE_NMI },
1687 1688
        { }
    },
1689 1690
};

1691 1692 1693
/*
 * Class init for the versioned "pseries-2.1" machine: overrides name and
 * description, is not the default machine, and applies the 2.1 hardware
 * compatibility properties.
 */
static void spapr_machine_2_1_class_init(ObjectClass *oc, void *data)
{
    static GlobalProperty compat_props[] = {
        HW_COMPAT_2_1,
        { /* end of list */ }
    };
    MachineClass *mc = MACHINE_CLASS(oc);

    mc->name = "pseries-2.1";
    mc->desc = "pSeries Logical Partition (PAPR compliant) v2.1";
    mc->compat_props = compat_props;
    mc->is_default = 0;
}

/*
 * QOM type of the versioned 2.1 machine; it derives from the base pSeries
 * machine type and only supplies its own class_init.
 */
static const TypeInfo spapr_machine_2_1_info = {
    .name          = TYPE_SPAPR_MACHINE "2.1",
    .parent        = TYPE_SPAPR_MACHINE,
    .class_init    = spapr_machine_2_1_class_init,
};

1711
static void spapr_machine_register_types(void)
1712
{
1713
    type_register_static(&spapr_machine_info);
1714
    type_register_static(&spapr_machine_2_1_info);
1715 1716
}

1717
type_init(spapr_machine_register_types)