spapr.c 50.6 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26
/*
 * QEMU PowerPC pSeries Logical Partition (aka sPAPR) hardware System Emulator
 *
 * Copyright (c) 2004-2007 Fabrice Bellard
 * Copyright (c) 2007 Jocelyn Mayer
 * Copyright (c) 2010 David Gibson, IBM Corporation.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 *
 */
27
#include "sysemu/sysemu.h"
28
#include "hw/hw.h"
29
#include "hw/fw-path-provider.h"
30
#include "elf.h"
P
Paolo Bonzini 已提交
31
#include "net/net.h"
32 33 34
#include "sysemu/blockdev.h"
#include "sysemu/cpus.h"
#include "sysemu/kvm.h"
35
#include "kvm_ppc.h"
36
#include "mmu-hash64.h"
37
#include "qom/cpu.h"
38 39

#include "hw/boards.h"
P
Paolo Bonzini 已提交
40
#include "hw/ppc/ppc.h"
41 42
#include "hw/loader.h"

P
Paolo Bonzini 已提交
43 44 45 46
#include "hw/ppc/spapr.h"
#include "hw/ppc/spapr_vio.h"
#include "hw/pci-host/spapr.h"
#include "hw/ppc/xics.h"
47
#include "hw/pci/msi.h"
48

49
#include "hw/pci/pci.h"
50 51
#include "hw/scsi/scsi.h"
#include "hw/virtio/virtio-scsi.h"
52

53
#include "exec/address-spaces.h"
54
#include "hw/usb.h"
55
#include "qemu/config-file.h"
56
#include "qemu/error-report.h"
57
#include "trace.h"
58
#include "hw/nmi.h"
A
Avi Kivity 已提交
59

60 61
#include <libfdt.h>

62 63 64 65 66 67 68 69 70 71
/*
 * SLOF memory layout:
 *
 * The raw SLOF image is loaded at 0.  SLOF copies its romfs right below
 * the flat device tree and then positions itself 31M below that.
 *
 * FW_OVERHEAD is therefore set to 40MB, which should account for all of
 * that and more.
 *
 * The kernel is loaded at 4M, leaving space for the initial SLOF image.
 */

/* Buffer sizes */
#define FDT_MAX_SIZE            0x40000     /* device tree buffer, bytes */
#define RTAS_MAX_SIZE           0x10000

/* Firmware image */
#define FW_MAX_SIZE             0x400000    /* 4M */
#define FW_FILE_NAME            "slof.bin"
#define FW_OVERHEAD             0x2800000   /* 40M, see layout note above */

/* The kernel is placed directly after the space reserved for firmware */
#define KERNEL_LOAD_ADDR        FW_MAX_SIZE

#define MIN_RMA_SLOF            128UL

/* Timebase frequency advertised when KVM does not supply one */
#define TIMEBASE_FREQ           512000000ULL

#define MAX_CPUS                256

/* phandle given to the XICP interrupt-controller node */
#define PHANDLE_XICP            0x00001111

/* Size in bytes of the hash page table described by (spapr)->htab_shift */
#define HTAB_SIZE(spapr)        (1ULL << ((spapr)->htab_shift))

89
typedef struct sPAPRMachineState sPAPRMachineState;
90

91
#define TYPE_SPAPR_MACHINE      "spapr-machine"
92
#define SPAPR_MACHINE(obj) \
93
    OBJECT_CHECK(sPAPRMachineState, (obj), TYPE_SPAPR_MACHINE)
94 95

/**
96
 * sPAPRMachineState:
97
 */
98
struct sPAPRMachineState {
99 100
    /*< private >*/
    MachineState parent_obj;
E
Eduardo Habkost 已提交
101 102 103

    /*< public >*/
    char *kvm_type;
104 105
};

106 107
/*
 * File-scope pointer to the sPAPR environment.  It is read by the machine
 * and CPU reset handlers and by spapr_h_cas_compose_response() below; the
 * place where it is assigned is not visible in this part of the file.
 */
sPAPREnvironment *spapr;

108 109 110 111 112 113 114 115 116 117 118 119
/*
 * Create and initialise one XICS device of QOM type @type.
 *
 * @nr_servers and @nr_irqs are passed through as the "nr_servers" and
 * "nr_irqs" device properties.
 *
 * Returns the device as an XICSState, or NULL if qdev_init() fails.
 */
static XICSState *try_create_xics(const char *type, int nr_servers,
                                  int nr_irqs)
{
    DeviceState *xics_dev = qdev_create(NULL, type);

    qdev_prop_set_uint32(xics_dev, "nr_servers", nr_servers);
    qdev_prop_set_uint32(xics_dev, "nr_irqs", nr_irqs);

    return qdev_init(xics_dev) < 0 ? NULL : XICS_COMMON(xics_dev);
}

static XICSState *xics_system_init(int nr_servers, int nr_irqs)
{
    XICSState *icp = NULL;

127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146
    if (kvm_enabled()) {
        QemuOpts *machine_opts = qemu_get_machine_opts();
        bool irqchip_allowed = qemu_opt_get_bool(machine_opts,
                                                "kernel_irqchip", true);
        bool irqchip_required = qemu_opt_get_bool(machine_opts,
                                                  "kernel_irqchip", false);
        if (irqchip_allowed) {
            icp = try_create_xics(TYPE_KVM_XICS, nr_servers, nr_irqs);
        }

        if (irqchip_required && !icp) {
            perror("Failed to create in-kernel XICS\n");
            abort();
        }
    }

    if (!icp) {
        icp = try_create_xics(TYPE_XICS, nr_servers, nr_irqs);
    }

147 148 149 150 151 152 153 154
    if (!icp) {
        perror("Failed to create XICS\n");
        abort();
    }

    return icp;
}

155 156 157 158 159 160 161 162
/*
 * Fill in the per-thread interrupt server properties of one CPU node.
 *
 * @fdt:         device tree being edited
 * @offset:      offset of the CPU node inside @fdt
 * @cpu:         CPU the node describes
 * @smt_threads: number of threads to describe for this CPU
 *
 * Also sets "cpu-version" when cpu->cpu_version is non-zero.
 *
 * Returns the first negative libfdt error encountered, otherwise the
 * result of the last fdt_setprop() call.
 */
static int spapr_fixup_cpu_smt_dt(void *fdt, int offset, PowerPCCPU *cpu,
                                  int smt_threads)
{
    uint32_t servers_prop[smt_threads];
    uint32_t gservers_prop[smt_threads * 2];
    const int first_server = ppc_get_vcpu_dt_id(cpu);
    int thread;
    int rc;

    if (cpu->cpu_version) {
        rc = fdt_setprop_cell(fdt, offset, "cpu-version", cpu->cpu_version);
        if (rc < 0) {
            return rc;
        }
    }

    /* One server number per thread, counted up from the CPU's dt id */
    for (thread = 0; thread < smt_threads; thread++) {
        const uint32_t server = cpu_to_be32(first_server + thread);

        servers_prop[thread] = server;
        /* Hack: every group queue is directed back to cpu 0 */
        gservers_prop[2 * thread] = server;
        gservers_prop[2 * thread + 1] = 0;
    }

    rc = fdt_setprop(fdt, offset, "ibm,ppc-interrupt-server#s",
                     servers_prop, sizeof(servers_prop));
    if (rc < 0) {
        return rc;
    }

    return fdt_setprop(fdt, offset, "ibm,ppc-interrupt-gserver#s",
                       gservers_prop, sizeof(gservers_prop));
}

188
/*
 * Add or refresh the per-CPU properties under /cpus in @fdt.
 *
 * Only CPUs whose device-tree id is a multiple of kvmppc_smt_threads()
 * get a node.  Missing "/cpus" and "<fw_name>@<id>" nodes are created.
 * Each node receives "ibm,pft-size", the interrupt server properties and,
 * when more than one NUMA node is configured, "ibm,associativity".
 *
 * Returns a negative libfdt error on the first failure, otherwise the
 * result of the last property update (0 if no CPU was visited).
 */
static int spapr_fixup_cpu_dt(void *fdt, sPAPREnvironment *spapr)
{
    const int threads_per_core = kvmppc_smt_threads();
    uint32_t pft_size_prop[] = {0, cpu_to_be32(spapr->htab_shift)};
    char node_name[32];
    CPUState *cs;
    int rc = 0;

    CPU_FOREACH(cs) {
        PowerPCCPU *cpu = POWERPC_CPU(cs);
        DeviceClass *dc = DEVICE_GET_CLASS(cs);
        int dt_id = ppc_get_vcpu_dt_id(cpu);
        int cpus_node;
        int cpu_node;
        uint32_t associativity[] = {cpu_to_be32(0x5),
                                    cpu_to_be32(0x0),
                                    cpu_to_be32(0x0),
                                    cpu_to_be32(0x0),
                                    cpu_to_be32(cs->numa_node),
                                    cpu_to_be32(dt_id)};

        /* Secondary threads do not get a node of their own */
        if ((dt_id % threads_per_core) != 0) {
            continue;
        }

        snprintf(node_name, 32, "%s@%x", dc->fw_name, dt_id);

        /* Offsets are looked up again on every pass through the loop */
        cpus_node = fdt_path_offset(fdt, "/cpus");
        if (cpus_node < 0) {
            cpus_node = fdt_add_subnode(fdt, fdt_path_offset(fdt, "/"),
                                        "cpus");
            if (cpus_node < 0) {
                return cpus_node;
            }
        }

        cpu_node = fdt_subnode_offset(fdt, cpus_node, node_name);
        if (cpu_node < 0) {
            cpu_node = fdt_add_subnode(fdt, cpus_node, node_name);
            if (cpu_node < 0) {
                return cpu_node;
            }
        }

        if (nb_numa_nodes > 1) {
            rc = fdt_setprop(fdt, cpu_node, "ibm,associativity",
                             associativity, sizeof(associativity));
            if (rc < 0) {
                return rc;
            }
        }

        rc = fdt_setprop(fdt, cpu_node, "ibm,pft-size",
                         pft_size_prop, sizeof(pft_size_prop));
        if (rc < 0) {
            return rc;
        }

        rc = spapr_fixup_cpu_smt_dt(fdt, cpu_node, cpu,
                                    ppc_get_compat_smt_threads(cpu));
        if (rc < 0) {
            return rc;
        }
    }

    return rc;
}

252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285

/*
 * Encode the segment/page size table of @env into @prop as big-endian
 * 32-bit cells.
 *
 * For every segment page size with a non-zero page_shift the output is:
 *   page_shift, slb_enc, count, then count pairs of (page_shift, pte_enc).
 * Encoding stops at the first entry with page_shift == 0, or as soon as
 * the next entry would not fit.
 *
 * @env:     CPU state holding the sps table
 * @prop:    output buffer
 * @maxsize: size of @prop in bytes
 *
 * Returns the number of bytes written to @prop.
 */
static size_t create_page_sizes_prop(CPUPPCState *env, uint32_t *prop,
                                     size_t maxsize)
{
    size_t maxcells = maxsize / sizeof(uint32_t);
    int i, j, count;
    uint32_t *p = prop;

    for (i = 0; i < PPC_PAGE_SIZES_MAX_SZ; i++) {
        struct ppc_one_seg_page_size *sps = &env->sps.sps[i];
        size_t used, needed;

        if (!sps->page_shift) {
            break;
        }
        for (count = 0; count < PPC_PAGE_SIZES_MAX_SZ; count++) {
            if (sps->enc[count].page_shift == 0) {
                break;
            }
        }

        /*
         * Same limit as "used >= maxcells - needed", but with the
         * subtraction guarded: if the buffer is smaller than one entry
         * the unsigned difference would wrap and the check would never
         * fire.
         */
        used = (size_t)(p - prop);
        needed = 3 + (size_t)count * 2;
        if (needed >= maxcells || used >= maxcells - needed) {
            break;
        }

        *(p++) = cpu_to_be32(sps->page_shift);
        *(p++) = cpu_to_be32(sps->slb_enc);
        *(p++) = cpu_to_be32(count);
        for (j = 0; j < count; j++) {
            *(p++) = cpu_to_be32(sps->enc[j].page_shift);
            *(p++) = cpu_to_be32(sps->enc[j].pte_enc);
        }
    }

    return (p - prop) * sizeof(uint32_t);
}

286 287 288 289 290 291 292 293 294 295
/*
 * Evaluate a libfdt-style expression once and treat a negative result as
 * fatal: print the expression text and fdt_strerror() of the result to
 * stderr, then exit(1).
 *
 * The result is held in a block-local variable named "ret", which shadows
 * any "ret" in the caller for the duration of the check; do not pass an
 * expression that itself refers to a variable called "ret".
 */
#define _FDT(exp) \
    do { \
        int ret = (exp);                                           \
        if (ret < 0) {                                             \
            fprintf(stderr, "qemu: error creating device tree: %s: %s\n", \
                    #exp, fdt_strerror(ret));                      \
            exit(1);                                               \
        }                                                          \
    } while (0)

296 297 298 299
/*
 * Append @s1 to @s including its terminating NUL byte, so that @s ends up
 * holding a sequence of NUL-separated strings.
 */
static void add_str(GString *s, const gchar *s1)
{
    const size_t len_with_nul = strlen(s1) + 1;

    g_string_append_len(s, s1, len_with_nul);
}
300

301
/*
 * Build the skeleton flattened device tree for the machine.
 *
 * The tree is written sequentially into a freshly allocated
 * FDT_MAX_SIZE buffer and contains the root properties, /chosen, /cpus
 * (one node per CPU whose dt id is a multiple of the SMT thread count),
 * /rtas, /interrupt-controller, /vdevice, the event sources and, under
 * KVM, /hypervisor.
 *
 * @initrd_base:    guest address of the initrd
 * @initrd_size:    initrd size in bytes; 0 skips the reserve-map entry
 * @kernel_size:    kernel size in bytes; 0 skips the kernel properties
 * @little_endian:  adds "qemu,boot-kernel-le" when a kernel is present
 * @boot_device:    optional value for "qemu,boot-device"
 * @kernel_cmdline: value for "bootargs"
 * @epow_irq:       passed to spapr_events_fdt_skel()
 *
 * Any libfdt failure is fatal (see _FDT).  Returns the finished tree;
 * the caller owns the buffer.
 */
static void *spapr_create_fdt_skel(hwaddr initrd_base,
                                   hwaddr initrd_size,
                                   hwaddr kernel_size,
                                   bool little_endian,
                                   const char *boot_device,
                                   const char *kernel_cmdline,
                                   uint32_t epow_irq)
{
    void *fdt;
    CPUState *cs;
    uint32_t start_prop = cpu_to_be32(initrd_base);
    uint32_t end_prop = cpu_to_be32(initrd_base + initrd_size);
    GString *hypertas = g_string_sized_new(256);
    GString *qemu_hypertas = g_string_sized_new(256);
    uint32_t refpoints[] = {cpu_to_be32(0x4), cpu_to_be32(0x4)};
    uint32_t interrupt_server_ranges_prop[] = {0, cpu_to_be32(smp_cpus)};
    int smt = kvmppc_smt_threads();
    unsigned char vec5[] = {0x0, 0x0, 0x0, 0x0, 0x0, 0x80};
    QemuOpts *opts = qemu_opts_find(qemu_find_opts("smp-opts"), NULL);
    unsigned sockets = opts ? qemu_opt_get_number(opts, "sockets", 0) : 0;
    uint32_t cpus_per_socket = sockets ? (smp_cpus / sockets) : 1;

    /*
     * More sockets than CPUs makes the integer division above yield 0,
     * which would then be used as a divisor for "ibm,chip-id".  Treat
     * that case as one CPU per socket.
     */
    if (cpus_per_socket == 0) {
        cpus_per_socket = 1;
    }

    add_str(hypertas, "hcall-pft");
    add_str(hypertas, "hcall-term");
    add_str(hypertas, "hcall-dabr");
    add_str(hypertas, "hcall-interrupt");
    add_str(hypertas, "hcall-tce");
    add_str(hypertas, "hcall-vio");
    add_str(hypertas, "hcall-splpar");
    add_str(hypertas, "hcall-bulk");
    add_str(hypertas, "hcall-set-mode");
    add_str(qemu_hypertas, "hcall-memop1");

    fdt = g_malloc0(FDT_MAX_SIZE);
    _FDT((fdt_create(fdt, FDT_MAX_SIZE)));

    /* Reserve the kernel and initrd ranges */
    if (kernel_size) {
        _FDT((fdt_add_reservemap_entry(fdt, KERNEL_LOAD_ADDR, kernel_size)));
    }
    if (initrd_size) {
        _FDT((fdt_add_reservemap_entry(fdt, initrd_base, initrd_size)));
    }
    _FDT((fdt_finish_reservemap(fdt)));

    /* Root node */
    _FDT((fdt_begin_node(fdt, "")));
    _FDT((fdt_property_string(fdt, "device_type", "chrp")));
    _FDT((fdt_property_string(fdt, "model", "IBM pSeries (emulated by qemu)")));
    _FDT((fdt_property_string(fdt, "compatible", "qemu,pseries")));

    _FDT((fdt_property_cell(fdt, "#address-cells", 0x2)));
    _FDT((fdt_property_cell(fdt, "#size-cells", 0x2)));

    /* /chosen */
    _FDT((fdt_begin_node(fdt, "chosen")));

    /* Set Form1_affinity */
    _FDT((fdt_property(fdt, "ibm,architecture-vec-5", vec5, sizeof(vec5))));

    _FDT((fdt_property_string(fdt, "bootargs", kernel_cmdline)));
    _FDT((fdt_property(fdt, "linux,initrd-start",
                       &start_prop, sizeof(start_prop))));
    _FDT((fdt_property(fdt, "linux,initrd-end",
                       &end_prop, sizeof(end_prop))));
    if (kernel_size) {
        uint64_t kprop[2] = { cpu_to_be64(KERNEL_LOAD_ADDR),
                              cpu_to_be64(kernel_size) };

        _FDT((fdt_property(fdt, "qemu,boot-kernel", &kprop, sizeof(kprop))));
        if (little_endian) {
            _FDT((fdt_property(fdt, "qemu,boot-kernel-le", NULL, 0)));
        }
    }
    if (boot_device) {
        _FDT((fdt_property_string(fdt, "qemu,boot-device", boot_device)));
    }
    if (boot_menu) {
        _FDT((fdt_property_cell(fdt, "qemu,boot-menu", boot_menu)));
    }
    _FDT((fdt_property_cell(fdt, "qemu,graphic-width", graphic_width)));
    _FDT((fdt_property_cell(fdt, "qemu,graphic-height", graphic_height)));
    _FDT((fdt_property_cell(fdt, "qemu,graphic-depth", graphic_depth)));

    _FDT((fdt_end_node(fdt)));

    /* cpus */
    _FDT((fdt_begin_node(fdt, "cpus")));

    _FDT((fdt_property_cell(fdt, "#address-cells", 0x1)));
    _FDT((fdt_property_cell(fdt, "#size-cells", 0x0)));

    CPU_FOREACH(cs) {
        PowerPCCPU *cpu = POWERPC_CPU(cs);
        CPUPPCState *env = &cpu->env;
        DeviceClass *dc = DEVICE_GET_CLASS(cs);
        PowerPCCPUClass *pcc = POWERPC_CPU_GET_CLASS(cs);
        int index = ppc_get_vcpu_dt_id(cpu);
        char *nodename;
        uint32_t segs[] = {cpu_to_be32(28), cpu_to_be32(40),
                           0xffffffff, 0xffffffff};
        uint32_t tbfreq = kvm_enabled() ? kvmppc_get_tbfreq() : TIMEBASE_FREQ;
        uint32_t cpufreq = kvm_enabled() ? kvmppc_get_clockfreq() : 1000000000;
        uint32_t page_sizes_prop[64];
        size_t page_sizes_prop_size;

        /* Only the first thread of each core gets a node */
        if ((index % smt) != 0) {
            continue;
        }

        nodename = g_strdup_printf("%s@%x", dc->fw_name, index);

        _FDT((fdt_begin_node(fdt, nodename)));

        g_free(nodename);

        _FDT((fdt_property_cell(fdt, "reg", index)));
        _FDT((fdt_property_string(fdt, "device_type", "cpu")));

        _FDT((fdt_property_cell(fdt, "cpu-version", env->spr[SPR_PVR])));
        _FDT((fdt_property_cell(fdt, "d-cache-block-size",
                                env->dcache_line_size)));
        _FDT((fdt_property_cell(fdt, "d-cache-line-size",
                                env->dcache_line_size)));
        _FDT((fdt_property_cell(fdt, "i-cache-block-size",
                                env->icache_line_size)));
        _FDT((fdt_property_cell(fdt, "i-cache-line-size",
                                env->icache_line_size)));

        if (pcc->l1_dcache_size) {
            _FDT((fdt_property_cell(fdt, "d-cache-size", pcc->l1_dcache_size)));
        } else {
            fprintf(stderr, "Warning: Unknown L1 dcache size for cpu\n");
        }
        if (pcc->l1_icache_size) {
            _FDT((fdt_property_cell(fdt, "i-cache-size", pcc->l1_icache_size)));
        } else {
            fprintf(stderr, "Warning: Unknown L1 icache size for cpu\n");
        }

        _FDT((fdt_property_cell(fdt, "timebase-frequency", tbfreq)));
        _FDT((fdt_property_cell(fdt, "clock-frequency", cpufreq)));
        _FDT((fdt_property_cell(fdt, "ibm,slb-size", env->slb_nr)));
        _FDT((fdt_property_string(fdt, "status", "okay")));
        _FDT((fdt_property(fdt, "64-bit", NULL, 0)));

        if (env->spr_cb[SPR_PURR].oea_read) {
            _FDT((fdt_property(fdt, "ibm,purr", NULL, 0)));
        }

        if (env->mmu_model & POWERPC_MMU_1TSEG) {
            _FDT((fdt_property(fdt, "ibm,processor-segment-sizes",
                               segs, sizeof(segs))));
        }

        /* Advertise VMX/VSX (vector extensions) if available
         *   0 / no property == no vector extensions
         *   1               == VMX / Altivec available
         *   2               == VSX available */
        if (env->insns_flags & PPC_ALTIVEC) {
            uint32_t vmx = (env->insns_flags2 & PPC2_VSX) ? 2 : 1;

            _FDT((fdt_property_cell(fdt, "ibm,vmx", vmx)));
        }

        /* Advertise DFP (Decimal Floating Point) if available
         *   0 / no property == no DFP
         *   1               == DFP available */
        if (env->insns_flags2 & PPC2_DFP) {
            _FDT((fdt_property_cell(fdt, "ibm,dfp", 1)));
        }

        page_sizes_prop_size = create_page_sizes_prop(env, page_sizes_prop,
                                                      sizeof(page_sizes_prop));
        if (page_sizes_prop_size) {
            _FDT((fdt_property(fdt, "ibm,segment-page-sizes",
                               page_sizes_prop, page_sizes_prop_size)));
        }

        _FDT((fdt_property_cell(fdt, "ibm,chip-id",
                                cs->cpu_index / cpus_per_socket)));

        _FDT((fdt_end_node(fdt)));
    }

    _FDT((fdt_end_node(fdt)));

    /* RTAS */
    _FDT((fdt_begin_node(fdt, "rtas")));

    if (!kvm_enabled() || kvmppc_spapr_use_multitce()) {
        add_str(hypertas, "hcall-multi-tce");
    }
    _FDT((fdt_property(fdt, "ibm,hypertas-functions", hypertas->str,
                       hypertas->len)));
    g_string_free(hypertas, TRUE);
    _FDT((fdt_property(fdt, "qemu,hypertas-functions", qemu_hypertas->str,
                       qemu_hypertas->len)));
    g_string_free(qemu_hypertas, TRUE);

    _FDT((fdt_property(fdt, "ibm,associativity-reference-points",
        refpoints, sizeof(refpoints))));

    _FDT((fdt_property_cell(fdt, "rtas-error-log-max", RTAS_ERROR_LOG_MAX)));

    _FDT((fdt_end_node(fdt)));

    /* interrupt controller */
    _FDT((fdt_begin_node(fdt, "interrupt-controller")));

    _FDT((fdt_property_string(fdt, "device_type",
                              "PowerPC-External-Interrupt-Presentation")));
    _FDT((fdt_property_string(fdt, "compatible", "IBM,ppc-xicp")));
    _FDT((fdt_property(fdt, "interrupt-controller", NULL, 0)));
    _FDT((fdt_property(fdt, "ibm,interrupt-server-ranges",
                       interrupt_server_ranges_prop,
                       sizeof(interrupt_server_ranges_prop))));
    _FDT((fdt_property_cell(fdt, "#interrupt-cells", 2)));
    _FDT((fdt_property_cell(fdt, "linux,phandle", PHANDLE_XICP)));
    _FDT((fdt_property_cell(fdt, "phandle", PHANDLE_XICP)));

    _FDT((fdt_end_node(fdt)));

    /* vdevice */
    _FDT((fdt_begin_node(fdt, "vdevice")));

    _FDT((fdt_property_string(fdt, "device_type", "vdevice")));
    _FDT((fdt_property_string(fdt, "compatible", "IBM,vdevice")));
    _FDT((fdt_property_cell(fdt, "#address-cells", 0x1)));
    _FDT((fdt_property_cell(fdt, "#size-cells", 0x0)));
    _FDT((fdt_property_cell(fdt, "#interrupt-cells", 0x2)));
    _FDT((fdt_property(fdt, "interrupt-controller", NULL, 0)));

    _FDT((fdt_end_node(fdt)));

    /* event-sources */
    spapr_events_fdt_skel(fdt, epow_irq);

    /* /hypervisor node */
    if (kvm_enabled()) {
        uint8_t hypercall[16];

        /* indicate KVM hypercall interface */
        _FDT((fdt_begin_node(fdt, "hypervisor")));
        _FDT((fdt_property_string(fdt, "compatible", "linux,kvm")));
        if (kvmppc_has_cap_fixup_hcalls()) {
            /*
             * Older KVM versions with older guest kernels were broken with the
             * magic page, don't allow the guest to map it.
             */
            kvmppc_get_hypercall(first_cpu->env_ptr, hypercall,
                                 sizeof(hypercall));
            _FDT((fdt_property(fdt, "hcall-instructions", hypercall,
                              sizeof(hypercall))));
        }
        _FDT((fdt_end_node(fdt)));
    }

    _FDT((fdt_end_node(fdt))); /* close root node */
    _FDT((fdt_finish(fdt)));

    return fdt;
}

564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580
/*
 * Compose the device tree update for the guest and write it to guest
 * memory.
 *
 * @addr: guest physical address of the response buffer
 * @size: size of that buffer in bytes
 *
 * The response is an sPAPRDeviceTreeUpdateHeader followed by a packed
 * device tree holding the CPU fix-ups from spapr_fixup_cpu_dt().
 *
 * Returns 0 on success, or -1 if the buffer is too small for the header
 * or for the resulting tree.  libfdt failures are fatal (see _FDT).
 */
int spapr_h_cas_compose_response(target_ulong addr, target_ulong size)
{
    void *fdt, *fdt_skel;
    sPAPRDeviceTreeUpdateHeader hdr = { .version_id = 1 };
    int rc = -1;

    /* Without this the unsigned subtraction below would wrap around */
    if (size < sizeof(hdr)) {
        trace_spapr_cas_failed(size);
        return -1;
    }
    size -= sizeof(hdr);

    /* Create skeleton */
    fdt_skel = g_malloc0(size);
    _FDT((fdt_create(fdt_skel, size)));
    _FDT((fdt_begin_node(fdt_skel, "")));
    _FDT((fdt_end_node(fdt_skel)));
    _FDT((fdt_finish(fdt_skel)));
    fdt = g_malloc0(size);
    _FDT((fdt_open_into(fdt_skel, fdt, size)));
    g_free(fdt_skel);

    /* Fix skeleton up */
    _FDT((spapr_fixup_cpu_dt(fdt, spapr)));

    /* Pack resulting tree */
    _FDT((fdt_pack(fdt)));

    if (fdt_totalsize(fdt) + sizeof(hdr) > size) {
        trace_spapr_cas_failed(size);
        goto out;
    }

    cpu_physical_memory_write(addr, &hdr, sizeof(hdr));
    cpu_physical_memory_write(addr + sizeof(hdr), fdt, fdt_totalsize(fdt));
    trace_spapr_cas_continue(fdt_totalsize(fdt) + sizeof(hdr));
    rc = 0;

out:
    /* Shared exit so the tree buffer is released on failure as well */
    g_free(fdt);

    return rc;
}

600 601 602 603 604 605
/*
 * Add the memory nodes to @fdt.
 *
 * Emits, in order:
 *   - "memory@0" covering [0, rma_size);
 *   - a second node for the rest of NUMA node 0, if node 0 is larger than
 *     the RMA;
 *   - one node per additional NUMA node, each clipped so that the total
 *     never exceeds ram_size (a node that starts at or beyond ram_size
 *     gets size 0).
 *
 * Every node carries "device_type", "reg" and "ibm,associativity".
 * libfdt failures are fatal (see _FDT).  Always returns 0.
 */
static int spapr_populate_memory(sPAPREnvironment *spapr, void *fdt)
{
    uint32_t associativity[] = {cpu_to_be32(0x4), cpu_to_be32(0x0),
                                cpu_to_be32(0x0), cpu_to_be32(0x0),
                                cpu_to_be32(0x0)};
    char mem_name[32];
    hwaddr node0_size, mem_start, node_size;
    uint64_t mem_reg_property[2];
    int i, off;

    /* memory node(s) */
    if (nb_numa_nodes > 1 && numa_info[0].node_mem < ram_size) {
        node0_size = numa_info[0].node_mem;
    } else {
        node0_size = ram_size;
    }

    /* RMA */
    mem_reg_property[0] = 0;
    mem_reg_property[1] = cpu_to_be64(spapr->rma_size);
    off = fdt_add_subnode(fdt, 0, "memory@0");
    _FDT(off);
    _FDT((fdt_setprop_string(fdt, off, "device_type", "memory")));
    _FDT((fdt_setprop(fdt, off, "reg", mem_reg_property,
                      sizeof(mem_reg_property))));
    _FDT((fdt_setprop(fdt, off, "ibm,associativity", associativity,
                      sizeof(associativity))));

    /* RAM: Node 0 */
    if (node0_size > spapr->rma_size) {
        mem_reg_property[0] = cpu_to_be64(spapr->rma_size);
        mem_reg_property[1] = cpu_to_be64(node0_size - spapr->rma_size);

        /* Bounded by the buffer size rather than trusting the format */
        snprintf(mem_name, sizeof(mem_name), "memory@" TARGET_FMT_lx,
                 spapr->rma_size);
        off = fdt_add_subnode(fdt, 0, mem_name);
        _FDT(off);
        _FDT((fdt_setprop_string(fdt, off, "device_type", "memory")));
        _FDT((fdt_setprop(fdt, off, "reg", mem_reg_property,
                          sizeof(mem_reg_property))));
        _FDT((fdt_setprop(fdt, off, "ibm,associativity", associativity,
                          sizeof(associativity))));
    }

    /* RAM: Node 1 and beyond */
    mem_start = node0_size;
    for (i = 1; i < nb_numa_nodes; i++) {
        mem_reg_property[0] = cpu_to_be64(mem_start);
        if (mem_start >= ram_size) {
            node_size = 0;
        } else {
            node_size = numa_info[i].node_mem;
            if (node_size > ram_size - mem_start) {
                node_size = ram_size - mem_start;
            }
        }
        mem_reg_property[1] = cpu_to_be64(node_size);
        /* The NUMA node number goes into the last two associativity cells */
        associativity[3] = associativity[4] = cpu_to_be32(i);
        snprintf(mem_name, sizeof(mem_name), "memory@" TARGET_FMT_lx,
                 mem_start);
        off = fdt_add_subnode(fdt, 0, mem_name);
        _FDT(off);
        _FDT((fdt_setprop_string(fdt, off, "device_type", "memory")));
        _FDT((fdt_setprop(fdt, off, "reg", mem_reg_property,
                          sizeof(mem_reg_property))));
        _FDT((fdt_setprop(fdt, off, "ibm,associativity", associativity,
                          sizeof(associativity))));
        mem_start += node_size;
    }

    return 0;
}

671
/*
 * Produce the final device tree and copy it into guest memory.
 *
 * The skeleton in spapr->fdt_skel is opened into a temporary
 * FDT_MAX_SIZE buffer, then memory, VIO, PCI, RTAS, CPU and boot-list
 * information is added.  The result is packed and written to @fdt_addr.
 *
 * @spapr:     machine environment
 * @fdt_addr:  guest physical address that receives the tree
 * @rtas_addr: passed to spapr_rtas_device_tree_setup()
 * @rtas_size: passed to spapr_rtas_device_tree_setup()
 *
 * Failures in the memory, VIO and PCI steps terminate the process.
 * Failures in the RTAS, CPU and boot-list steps are only reported.
 */
static void spapr_finalize_fdt(sPAPREnvironment *spapr,
                               hwaddr fdt_addr,
                               hwaddr rtas_addr,
                               hwaddr rtas_size)
{
    int ret;
    size_t i;
    size_t cb = 0;
    char *bootlist;
    void *fdt;
    sPAPRPHBState *phb;

    fdt = g_malloc(FDT_MAX_SIZE);

    /* open out the base tree into a temp buffer for the final tweaks */
    _FDT((fdt_open_into(spapr->fdt_skel, fdt, FDT_MAX_SIZE)));

    ret = spapr_populate_memory(spapr, fdt);
    if (ret < 0) {
        fprintf(stderr, "couldn't setup memory nodes in fdt\n");
        exit(1);
    }

    ret = spapr_populate_vdevice(spapr->vio_bus, fdt);
    if (ret < 0) {
        fprintf(stderr, "couldn't setup vio devices in fdt\n");
        exit(1);
    }

    QLIST_FOREACH(phb, &spapr->phbs, list) {
        ret = spapr_populate_pci_dt(phb, PHANDLE_XICP, fdt);
        /*
         * Checked per host bridge: a check after the loop would only see
         * the result for the last bridge.
         */
        if (ret < 0) {
            fprintf(stderr, "couldn't setup PCI devices in fdt\n");
            exit(1);
        }
    }

    /* RTAS */
    ret = spapr_rtas_device_tree_setup(fdt, rtas_addr, rtas_size);
    if (ret < 0) {
        fprintf(stderr, "Couldn't set up RTAS device tree properties\n");
    }

    /* Advertise NUMA via ibm,associativity */
    ret = spapr_fixup_cpu_dt(fdt, spapr);
    if (ret < 0) {
        fprintf(stderr, "Couldn't finalize CPU device tree properties\n");
    }

    bootlist = get_boot_devices_list(&cb, true);
    if (cb && bootlist) {
        int offset = fdt_path_offset(fdt, "/chosen");
        if (offset < 0) {
            exit(1);
        }
        /* Turn the newline-separated list into a space-separated one */
        for (i = 0; i < cb; i++) {
            if (bootlist[i] == '\n') {
                bootlist[i] = ' ';
            }
        }
        ret = fdt_setprop_string(fdt, offset, "qemu,boot-list", bootlist);
        if (ret < 0) {
            fprintf(stderr, "Couldn't set up boot list property\n");
        }
    }
    /*
     * NOTE(review): assumes the list returned by get_boot_devices_list()
     * is heap-allocated and owned by the caller - confirm against its
     * definition.
     */
    g_free(bootlist);

    if (!spapr->has_graphics) {
        spapr_populate_chosen_stdout(fdt, spapr->vio_bus);
    }

    _FDT((fdt_pack(fdt)));

    if (fdt_totalsize(fdt) > FDT_MAX_SIZE) {
        hw_error("FDT too big ! 0x%x bytes (max is 0x%x)\n",
                 fdt_totalsize(fdt), FDT_MAX_SIZE);
        exit(1);
    }

    cpu_physical_memory_write(fdt_addr, fdt, fdt_totalsize(fdt));

    g_free(fdt);
}

/*
 * Map a kernel image address to its load address: keep the low 28 bits
 * of @addr and rebase them on KERNEL_LOAD_ADDR.  @opaque is unused.
 */
static uint64_t translate_kernel_address(void *opaque, uint64_t addr)
{
    const uint64_t low_bits = addr & 0x0fffffff;

    return low_bits + KERNEL_LOAD_ADDR;
}

757
/*
 * Handle a hypercall issued by @cpu.
 *
 * If the msr_pr test is true the call is refused with H_PRIVILEGE.
 * Otherwise GPR3 selects the hypercall, the registers from GPR4 onward
 * are passed as its arguments, and the result is stored back in GPR3.
 */
static void emulate_spapr_hypercall(PowerPCCPU *cpu)
{
    /* The msr_pr test below relies on a local named "env" */
    CPUPPCState *env = &cpu->env;

    if (!msr_pr) {
        env->gpr[3] = spapr_hypercall(cpu, env->gpr[3], &env->gpr[4]);
        return;
    }

    hcall_dprintf("Hypercall made with MSR[PR]=1\n");
    env->gpr[3] = H_PRIVILEGE;
}

769 770 771 772 773 774 775 776 777 778 779 780 781
/*
 * Reset the guest hash page table and, if requested, recompute the RMA
 * size.
 *
 * kvmppc_reset_htab() is asked first.  A positive result is taken as the
 * table shift of a kernel-managed table.  Otherwise a table of
 * HTAB_SIZE(spapr) bytes is allocated on first use and cleared.
 */
static void spapr_reset_htab(sPAPREnvironment *spapr)
{
    long kernel_shift = kvmppc_reset_htab(spapr->htab_shift);

    if (kernel_shift > 0) {
        /* Kernel handles htab, we don't need to allocate one */
        spapr->htab_shift = kernel_shift;
        kvmppc_kern_htab = true;
    } else {
        /* Allocate an htab if we don't yet have one */
        if (!spapr->htab) {
            spapr->htab = qemu_memalign(HTAB_SIZE(spapr), HTAB_SIZE(spapr));
        }

        /* And clear it */
        memset(spapr->htab, 0, HTAB_SIZE(spapr));
    }

    /* Update the RMA size if necessary */
    if (spapr->vrma_adjust) {
        hwaddr node0_size;

        if (nb_numa_nodes > 1) {
            node0_size = numa_info[0].node_mem;
        } else {
            node0_size = ram_size;
        }
        spapr->rma_size = kvmppc_rma_size(node0_size, spapr->htab_shift);
    }
}

801
static void ppc_spapr_reset(void)
802
{
803
    PowerPCCPU *first_ppc_cpu;
804

805 806
    /* Reset the hash table & recalc the RMA */
    spapr_reset_htab(spapr);
807

808
    qemu_devices_reset();
809 810 811 812 813 814

    /* Load the fdt */
    spapr_finalize_fdt(spapr, spapr->fdt_addr, spapr->rtas_addr,
                       spapr->rtas_size);

    /* Set up the entry state */
815 816 817 818 819
    first_ppc_cpu = POWERPC_CPU(first_cpu);
    first_ppc_cpu->env.gpr[3] = spapr->fdt_addr;
    first_ppc_cpu->env.gpr[5] = 0;
    first_cpu->halted = 0;
    first_ppc_cpu->env.nip = spapr->entry_point;
820 821 822

}

823 824
static void spapr_cpu_reset(void *opaque)
{
825
    PowerPCCPU *cpu = opaque;
826
    CPUState *cs = CPU(cpu);
827
    CPUPPCState *env = &cpu->env;
828

829
    cpu_reset(cs);
830 831 832 833

    /* All CPUs start halted.  CPU0 is unhalted from the machine level
     * reset code and the rest are explicitly started up by the guest
     * using an RTAS call */
834
    cs->halted = 1;
835 836

    env->spr[SPR_HIOR] = 0;
837

838
    env->external_htab = (uint8_t *)spapr->htab;
839 840 841 842 843 844 845
    if (kvm_enabled() && !env->external_htab) {
        /*
         * HV KVM, set external_htab to 1 so our ppc_hash64_load_hpte*
         * functions do the right thing.
         */
        env->external_htab = (void *)1;
    }
846
    env->htab_base = -1;
847 848 849 850 851 852 853
    /*
     * htab_mask is the mask used to normalize hash value to PTEG index.
     * htab_shift is log2 of hash table size.
     * We have 8 hpte per group, and each hpte is 16 bytes.
     * ie have 128 bytes per hpte entry.
     */
    env->htab_mask = (1ULL << ((spapr)->htab_shift - 7)) - 1;
854
    env->spr[SPR_SDR1] = (target_ulong)(uintptr_t)spapr->htab |
855
        (spapr->htab_shift - 18);
856 857
}

D
David Gibson 已提交
858 859
/*
 * Create the "spapr-nvram" device on the VIO bus and record it in
 * spapr->nvram.  If a pflash drive exists at bus 0, unit 0 it is attached
 * as the device's "drive" property.
 */
static void spapr_create_nvram(sPAPREnvironment *spapr)
{
    DeviceState *nvram_dev = qdev_create(&spapr->vio_bus->bus, "spapr-nvram");
    DriveInfo *backing = drive_get(IF_PFLASH, 0, 0);

    if (backing != NULL) {
        qdev_prop_set_drive_nofail(nvram_dev, "drive", backing->bdrv);
    }

    qdev_init_nofail(nvram_dev);

    spapr->nvram = (struct sPAPRNVRAM *)nvram_dev;
}

872
/* Returns whether we want to use VGA or not */
873 874
static int spapr_vga_init(PCIBus *pci_bus)
{
875 876
    switch (vga_interface_type) {
    case VGA_NONE:
877 878 879
        return false;
    case VGA_DEVICE:
        return true;
880 881
    case VGA_STD:
        return pci_vga_init(pci_bus) != NULL;
882
    default:
883 884
        fprintf(stderr, "This vga model is not supported,"
                "currently it only supports -vga std\n");
885
        exit(0);
886 887 888
    }
}

889 890
/*
 * Migration description for the machine-level sPAPR state.
 * Version 2 added the timebase field; version 1 streams are still accepted.
 */
static const VMStateDescription vmstate_spapr = {
    .name = "spapr",
    .version_id = 2,
    .minimum_version_id = 1,
    .fields = (VMStateField[]) {
        VMSTATE_UNUSED(4), /* used to be @next_irq */

        /* RTC offset */
        VMSTATE_UINT64(rtc_offset, sPAPREnvironment),
        VMSTATE_PPC_TIMEBASE_V(tb, sPAPREnvironment, 2),
        VMSTATE_END_OF_LIST()
    },
};

/*
 * Helpers for walking the hash page table as an array of HPTEs.
 * Each HPTE is two 64-bit words; the first word carries the VALID bit and
 * the software DIRTY bit tested and cleared below.
 */
/* Address of HPTE number _i in _table (fully parenthesised expansion) */
#define HPTE(_table, _i)   ((void *)(((uint64_t *)(_table)) + ((_i) * 2)))
/* Non-zero if the HPTE's first word has HPTE64_V_VALID set */
#define HPTE_VALID(_hpte)  (tswap64(*((uint64_t *)(_hpte))) & HPTE64_V_VALID)
/* Non-zero if the HPTE's first word has HPTE64_V_HPTE_DIRTY set */
#define HPTE_DIRTY(_hpte)  (tswap64(*((uint64_t *)(_hpte))) & HPTE64_V_HPTE_DIRTY)
/* Clear HPTE64_V_HPTE_DIRTY in the HPTE's first word, in place */
#define CLEAN_HPTE(_hpte)  ((*(uint64_t *)(_hpte)) &= tswap64(~HPTE64_V_HPTE_DIRTY))

/*
 * save_live_setup handler for the "spapr/htab" section.
 *
 * Writes the hash table shift as the first section header, then prepares
 * for the iterative passes: with a userspace hash table the save cursor
 * is rewound and the first pass armed; otherwise (KVM-owned table) an fd
 * for reading the table from KVM is opened.
 *
 * Returns 0 on success, -1 if the KVM fd cannot be opened.
 */
static int htab_save_setup(QEMUFile *f, void *opaque)
{
    sPAPREnvironment *spapr = opaque;

    /* "Iteration" header */
    qemu_put_be32(f, spapr->htab_shift);

    if (spapr->htab) {
        spapr->htab_save_index = 0;
        spapr->htab_first_pass = true;
    } else {
        assert(kvm_enabled());

        spapr->htab_fd = kvmppc_get_htab_fd(false);
        if (spapr->htab_fd < 0) {
            fprintf(stderr, "Unable to open fd for reading hash table from KVM: %s\n",
                    strerror(errno));
            return -1;
        }
    }

    return 0;
}

/*
 * First migration pass over a userspace hash table.
 *
 * Starting at spapr->htab_save_index, skips runs of invalid HPTEs and
 * sends each run of valid HPTEs as one chunk (index, n_valid, n_invalid=0,
 * data).  Every entry walked over has its dirty bit cleared so that later
 * passes only see changes made after this point.
 *
 * @f:      migration stream
 * @spapr:  machine state owning the hash table
 * @max_ns: time budget; the pass stops after a chunk once it is exceeded
 *
 * The pass also stops when the stream's rate limit is hit.  When the end
 * of the table is reached the cursor wraps to 0 and htab_first_pass is
 * cleared.
 */
static void htab_save_first_pass(QEMUFile *f, sPAPREnvironment *spapr,
                                 int64_t max_ns)
{
    int htabslots = HTAB_SIZE(spapr) / HASH_PTE_SIZE_64;
    int index = spapr->htab_save_index;
    int64_t starttime = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);

    assert(spapr->htab_first_pass);

    do {
        int chunkstart;

        /*
         * Consume invalid HPTEs.  The entry must be cleaned before the
         * cursor advances: cleaning after the increment touches the wrong
         * entry and, on the last slot, writes past the end of the table.
         */
        while ((index < htabslots)
               && !HPTE_VALID(HPTE(spapr->htab, index))) {
            CLEAN_HPTE(HPTE(spapr->htab, index));
            index++;
        }

        /* Consume valid HPTEs */
        chunkstart = index;
        while ((index < htabslots)
               && HPTE_VALID(HPTE(spapr->htab, index))) {
            CLEAN_HPTE(HPTE(spapr->htab, index));
            index++;
        }

        if (index > chunkstart) {
            int n_valid = index - chunkstart;

            qemu_put_be32(f, chunkstart);
            qemu_put_be16(f, n_valid);
            qemu_put_be16(f, 0);
            qemu_put_buffer(f, HPTE(spapr->htab, chunkstart),
                            HASH_PTE_SIZE_64 * n_valid);

            if ((qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - starttime) > max_ns) {
                break;
            }
        }
    } while ((index < htabslots) && !qemu_file_rate_limit(f));

    if (index >= htabslots) {
        assert(index == htabslots);
        index = 0;
        spapr->htab_first_pass = false;
    }
    spapr->htab_save_index = index;
}

983 984
/*
 * Second and subsequent migration passes over a userspace hash table.
 *
 * Starting at spapr->htab_save_index, looks for runs of dirty HPTEs.  Each
 * chunk sent consists of a run of valid dirty entries (whose contents are
 * transmitted) followed by a run of invalid dirty entries (only counted,
 * so the destination can clear them).  Dirty bits are cleared as entries
 * are consumed.  The cursor wraps around the table, and at most one full
 * table's worth of entries is examined per call.
 *
 * @f:      migration stream
 * @spapr:  machine state owning the hash table
 * @max_ns: time budget in ns; a negative value marks the final pass, which
 *          ignores both the time budget and the stream rate limit
 *
 * Returns 1 if the whole table was examined and nothing needed sending,
 * 0 otherwise.
 */
static int htab_save_later_pass(QEMUFile *f, sPAPREnvironment *spapr,
                                int64_t max_ns)
{
    bool final = max_ns < 0;
    int htabslots = HTAB_SIZE(spapr) / HASH_PTE_SIZE_64;
    int examined = 0, sent = 0;
    int index = spapr->htab_save_index;
    int64_t starttime = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);

    assert(!spapr->htab_first_pass);

    do {
        int chunkstart, invalidstart;

        /* Consume non-dirty HPTEs */
        while ((index < htabslots)
               && !HPTE_DIRTY(HPTE(spapr->htab, index))) {
            index++;
            examined++;
        }

        chunkstart = index;
        /* Consume valid dirty HPTEs */
        while ((index < htabslots)
               && HPTE_DIRTY(HPTE(spapr->htab, index))
               && HPTE_VALID(HPTE(spapr->htab, index))) {
            CLEAN_HPTE(HPTE(spapr->htab, index));
            index++;
            examined++;
        }

        invalidstart = index;
        /* Consume invalid dirty HPTEs */
        while ((index < htabslots)
               && HPTE_DIRTY(HPTE(spapr->htab, index))
               && !HPTE_VALID(HPTE(spapr->htab, index))) {
            CLEAN_HPTE(HPTE(spapr->htab, index));
            index++;
            examined++;
        }

        if (index > chunkstart) {
            int n_valid = invalidstart - chunkstart;
            int n_invalid = index - invalidstart;

            qemu_put_be32(f, chunkstart);
            qemu_put_be16(f, n_valid);
            qemu_put_be16(f, n_invalid);
            qemu_put_buffer(f, HPTE(spapr->htab, chunkstart),
                            HASH_PTE_SIZE_64 * n_valid);
            sent += index - chunkstart;

            if (!final
                && (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - starttime) > max_ns) {
                break;
            }
        }

        if (examined >= htabslots) {
            break;
        }

        /* Wrap the cursor so the scan continues from the table start */
        if (index >= htabslots) {
            assert(index == htabslots);
            index = 0;
        }
    } while ((examined < htabslots) && (!qemu_file_rate_limit(f) || final));

    if (index >= htabslots) {
        assert(index == htabslots);
        index = 0;
    }

    spapr->htab_save_index = index;

    return (examined >= htabslots) && (sent == 0) ? 1 : 0;
}

1060 1061 1062
/* Time budget handed to each iterative hash table save pass: 5 ms */
#define MAX_ITERATION_NS    (5 * 1000 * 1000)
/* Buffer size passed to kvmppc_save_htab() */
#define MAX_KVM_BUF_SIZE    (2 * 1024)

1063 1064 1065
/*
 * save_live_iterate handler for the "spapr/htab" section.
 *
 * Emits a zero section header, one bounded pass over the hash table
 * (read via KVM when the table is not in userspace, otherwise the first
 * or a later userspace pass) and a zeroed end-of-stream marker.
 *
 * Returns a negative value if the KVM save fails (no end marker is written
 * in that case); otherwise the result of the pass that ran, which is 0 for
 * the first userspace pass.
 */
static int htab_save_iterate(QEMUFile *f, void *opaque)
{
    sPAPREnvironment *spapr = opaque;
    int rc = 0;

    /* Iteration header */
    qemu_put_be32(f, 0);

    if (!spapr->htab) {
        assert(kvm_enabled());

        rc = kvmppc_save_htab(f, spapr->htab_fd,
                              MAX_KVM_BUF_SIZE, MAX_ITERATION_NS);
        if (rc < 0) {
            return rc;
        }
    } else if (spapr->htab_first_pass) {
        htab_save_first_pass(f, spapr, MAX_ITERATION_NS);
    } else {
        rc = htab_save_later_pass(f, spapr, MAX_ITERATION_NS);
    }

    /* End marker */
    qemu_put_be32(f, 0);
    qemu_put_be16(f, 0);
    qemu_put_be16(f, 0);

    return rc;
}

/*
 * save_live_complete handler for the "spapr/htab" section.
 *
 * Runs the final, unbounded pass (max_ns of -1) over the hash table and
 * writes the end-of-stream marker.  For a KVM-owned table the fd opened
 * at setup time is closed here, on success and on failure alike.
 *
 * Returns 0 on success or the negative value from kvmppc_save_htab().
 */
static int htab_save_complete(QEMUFile *f, void *opaque)
{
    sPAPREnvironment *spapr = opaque;

    /* Iteration header */
    qemu_put_be32(f, 0);

    if (!spapr->htab) {
        int rc;

        assert(kvm_enabled());

        rc = kvmppc_save_htab(f, spapr->htab_fd, MAX_KVM_BUF_SIZE, -1);

        /* This is the last user of the fd; release it before any return
         * so a failed final pass does not leak it */
        close(spapr->htab_fd);
        spapr->htab_fd = -1;

        if (rc < 0) {
            return rc;
        }
    } else {
        htab_save_later_pass(f, spapr, -1);
    }

    /* End marker */
    qemu_put_be32(f, 0);
    qemu_put_be16(f, 0);
    qemu_put_be16(f, 0);

    return 0;
}

/*
 * load_state handler for the "spapr/htab" section.
 *
 * A non-zero section header is the setup section and carries only the
 * hash table shift, which must match the local one.  A zero header is
 * followed by chunks of (index, n_valid, n_invalid[, HPTE data]) ending
 * with an all-zero chunk.  Chunks are written into the userspace hash
 * table, or passed to KVM through an fd when the table is KVM-owned.
 *
 * @f:          migration stream
 * @opaque:     the sPAPREnvironment
 * @version_id: stream version; only 1 is accepted
 *
 * Returns 0 on success, -EINVAL for a bad version, mismatched shift or
 * out-of-range chunk, and a negative value if the KVM fd cannot be opened
 * or a chunk cannot be loaded into KVM.
 */
static int htab_load(QEMUFile *f, void *opaque, int version_id)
{
    sPAPREnvironment *spapr = opaque;
    uint32_t section_hdr;
    int fd = -1;
    int ret = 0;

    if (version_id < 1 || version_id > 1) {
        fprintf(stderr, "htab_load() bad version\n");
        return -EINVAL;
    }

    section_hdr = qemu_get_be32(f);

    if (section_hdr) {
        /* First section, just the hash shift */
        if (spapr->htab_shift != section_hdr) {
            return -EINVAL;
        }
        return 0;
    }

    if (!spapr->htab) {
        assert(kvm_enabled());

        fd = kvmppc_get_htab_fd(true);
        if (fd < 0) {
            fprintf(stderr, "Unable to open fd to restore KVM hash table: %s\n",
                    strerror(errno));
            /* Nothing can be restored without the fd; fail the load
             * rather than tripping the assertion further down */
            return fd;
        }
    }

    while (true) {
        uint32_t index;
        uint16_t n_valid, n_invalid;

        index = qemu_get_be32(f);
        n_valid = qemu_get_be16(f);
        n_invalid = qemu_get_be16(f);

        if ((index == 0) && (n_valid == 0) && (n_invalid == 0)) {
            /* End of Stream */
            break;
        }

        /* The values come from the stream: sum in 64 bits so that a huge
         * index cannot wrap around and slip past the bounds check */
        if (((uint64_t)index + n_valid + n_invalid) >
            (HTAB_SIZE(spapr) / HASH_PTE_SIZE_64)) {
            /* Bad index in stream */
            fprintf(stderr, "htab_load() bad index %" PRIu32
                    " (%hu+%hu entries) "
                    "in htab stream (htab_shift=%d)\n", index, n_valid, n_invalid,
                    spapr->htab_shift);
            ret = -EINVAL;
            break;
        }

        if (spapr->htab) {
            if (n_valid) {
                qemu_get_buffer(f, HPTE(spapr->htab, index),
                                HASH_PTE_SIZE_64 * n_valid);
            }
            if (n_invalid) {
                memset(HPTE(spapr->htab, index + n_valid), 0,
                       HASH_PTE_SIZE_64 * n_invalid);
            }
        } else {
            int rc;

            assert(fd >= 0);

            rc = kvmppc_load_htab_chunk(f, fd, index, n_valid, n_invalid);
            if (rc < 0) {
                ret = rc;
                break;
            }
        }
    }

    /* Single exit point so the KVM fd is released on error paths too */
    if (!spapr->htab) {
        assert(fd >= 0);
        close(fd);
    }

    return ret;
}

/* Live-migration callbacks for the hash page table ("spapr/htab") */
static SaveVMHandlers savevm_htab_handlers = {
    /* incoming side */
    .load_state         = htab_load,
    /* outgoing side: setup, bounded iterations, then the final pass */
    .save_live_setup    = htab_save_setup,
    .save_live_iterate  = htab_save_iterate,
    .save_live_complete = htab_save_complete,
};

1212
/* pSeries LPAR / sPAPR hardware init */
1213
static void ppc_spapr_init(MachineState *machine)
1214
{
1215 1216 1217 1218 1219 1220
    ram_addr_t ram_size = machine->ram_size;
    const char *cpu_model = machine->cpu_model;
    const char *kernel_filename = machine->kernel_filename;
    const char *kernel_cmdline = machine->kernel_cmdline;
    const char *initrd_filename = machine->initrd_filename;
    const char *boot_device = machine->boot_order;
1221
    PowerPCCPU *cpu;
A
Andreas Färber 已提交
1222
    CPUPPCState *env;
1223
    PCIHostState *phb;
1224
    int i;
A
Avi Kivity 已提交
1225 1226
    MemoryRegion *sysmem = get_system_memory();
    MemoryRegion *ram = g_new(MemoryRegion, 1);
1227 1228
    MemoryRegion *rma_region;
    void *rma = NULL;
A
Avi Kivity 已提交
1229
    hwaddr rma_alloc_size;
1230
    hwaddr node0_size = (nb_numa_nodes > 1) ? numa_info[0].node_mem : ram_size;
1231 1232 1233
    uint32_t initrd_base = 0;
    long kernel_size = 0, initrd_size = 0;
    long load_limit, rtas_limit, fw_size;
1234
    bool kernel_le = false;
1235
    char *filename;
1236

1237 1238
    msi_supported = true;

1239 1240 1241
    spapr = g_malloc0(sizeof(*spapr));
    QLIST_INIT(&spapr->phbs);

1242 1243
    cpu_ppc_hypercall = emulate_spapr_hypercall;

1244
    /* Allocate RMA if necessary */
1245
    rma_alloc_size = kvmppc_alloc_rma(&rma);
1246 1247 1248 1249 1250

    if (rma_alloc_size == -1) {
        hw_error("qemu: Unable to create RMA\n");
        exit(1);
    }
1251

1252
    if (rma_alloc_size && (rma_alloc_size < node0_size)) {
1253
        spapr->rma_size = rma_alloc_size;
1254
    } else {
1255
        spapr->rma_size = node0_size;
1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269

        /* With KVM, we don't actually know whether KVM supports an
         * unbounded RMA (PR KVM) or is limited by the hash table size
         * (HV KVM using VRMA), so we always assume the latter
         *
         * In that case, we also limit the initial allocations for RTAS
         * etc... to 256M since we have no way to know what the VRMA size
         * is going to be as it depends on the size of the hash table
         * isn't determined yet.
         */
        if (kvm_enabled()) {
            spapr->vrma_adjust = 1;
            spapr->rma_size = MIN(spapr->rma_size, 0x10000000);
        }
1270 1271
    }

1272 1273 1274 1275 1276 1277
    if (spapr->rma_size > node0_size) {
        fprintf(stderr, "Error: Numa node 0 has to span the RMA (%#08"HWADDR_PRIx")\n",
                spapr->rma_size);
        exit(1);
    }

1278
    /* We place the device tree and RTAS just below either the top of the RMA,
1279 1280
     * or just below 2GB, whichever is lowere, so that it can be
     * processed with 32-bit real mode code if necessary */
1281
    rtas_limit = MIN(spapr->rma_size, 0x80000000);
1282 1283 1284
    spapr->rtas_addr = rtas_limit - RTAS_MAX_SIZE;
    spapr->fdt_addr = spapr->rtas_addr - FDT_MAX_SIZE;
    load_limit = spapr->fdt_addr - FW_OVERHEAD;
1285

1286 1287 1288 1289 1290 1291 1292 1293 1294 1295
    /* We aim for a hash table of size 1/128 the size of RAM.  The
     * normal rule of thumb is 1/64 the size of RAM, but that's much
     * more than needed for the Linux guests we support. */
    spapr->htab_shift = 18; /* Minimum architected size */
    while (spapr->htab_shift <= 46) {
        if ((1ULL << (spapr->htab_shift + 7)) >= ram_size) {
            break;
        }
        spapr->htab_shift++;
    }
1296

1297 1298 1299 1300
    /* Set up Interrupt Controller before we create the VCPUs */
    spapr->icp = xics_system_init(smp_cpus * kvmppc_smt_threads() / smp_threads,
                                  XICS_IRQS);

1301 1302
    /* init CPUs */
    if (cpu_model == NULL) {
1303
        cpu_model = kvm_enabled() ? "host" : "POWER7";
1304 1305
    }
    for (i = 0; i < smp_cpus; i++) {
1306 1307
        cpu = cpu_ppc_init(cpu_model);
        if (cpu == NULL) {
1308 1309 1310
            fprintf(stderr, "Unable to find PowerPC CPU definition\n");
            exit(1);
        }
1311 1312
        env = &cpu->env;

1313 1314 1315
        /* Set time-base frequency to 512 MHz */
        cpu_ppc_tb_init(env, TIMEBASE_FREQ);

1316 1317 1318 1319
        /* PAPR always has exception vectors in RAM not ROM. To ensure this,
         * MSR[IP] should never be set.
         */
        env->msr_mask &= ~(1 << 6);
1320 1321 1322

        /* Tell KVM that we're in PAPR mode */
        if (kvm_enabled()) {
1323
            kvmppc_set_papr(cpu);
1324 1325
        }

1326 1327 1328 1329 1330 1331
        if (cpu->max_compat) {
            if (ppc_set_compat(cpu, cpu->max_compat) < 0) {
                exit(1);
            }
        }

1332 1333
        xics_cpu_setup(spapr->icp, cpu);

1334
        qemu_register_reset(spapr_cpu_reset, cpu);
1335 1336 1337
    }

    /* allocate RAM */
1338
    spapr->ram_limit = ram_size;
1339 1340 1341
    memory_region_allocate_system_memory(ram, NULL, "ppc_spapr.ram",
                                         spapr->ram_limit);
    memory_region_add_subregion(sysmem, 0, ram);
1342

1343 1344 1345 1346 1347 1348 1349 1350
    if (rma_alloc_size && rma) {
        rma_region = g_new(MemoryRegion, 1);
        memory_region_init_ram_ptr(rma_region, NULL, "ppc_spapr.rma",
                                   rma_alloc_size, rma);
        vmstate_register_ram_global(rma_region);
        memory_region_add_subregion(sysmem, 0, rma_region);
    }

1351
    filename = qemu_find_file(QEMU_FILE_TYPE_BIOS, "spapr-rtas.bin");
1352
    spapr->rtas_size = load_image_targphys(filename, spapr->rtas_addr,
1353
                                           rtas_limit - spapr->rtas_addr);
1354
    if (spapr->rtas_size < 0) {
1355 1356 1357
        hw_error("qemu: could not load LPAR rtas '%s'\n", filename);
        exit(1);
    }
1358 1359 1360 1361 1362
    if (spapr->rtas_size > RTAS_MAX_SIZE) {
        hw_error("RTAS too big ! 0x%lx bytes (max is 0x%x)\n",
                 spapr->rtas_size, RTAS_MAX_SIZE);
        exit(1);
    }
1363
    g_free(filename);
1364

1365 1366 1367
    /* Set up EPOW events infrastructure */
    spapr_events_init(spapr);

1368
    /* Set up VIO bus */
1369 1370
    spapr->vio_bus = spapr_vio_bus_init();

P
Paolo Bonzini 已提交
1371
    for (i = 0; i < MAX_SERIAL_PORTS; i++) {
1372
        if (serial_hds[i]) {
1373
            spapr_vty_create(spapr->vio_bus, serial_hds[i]);
1374 1375
        }
    }
1376

D
David Gibson 已提交
1377 1378 1379
    /* We always have at least the nvram device on VIO */
    spapr_create_nvram(spapr);

1380
    /* Set up PCI */
1381
    spapr_pci_msi_init(spapr, SPAPR_PCI_MSI_WINDOW);
1382 1383
    spapr_pci_rtas_init();

1384
    phb = spapr_create_phb(spapr, 0);
1385

P
Paolo Bonzini 已提交
1386
    for (i = 0; i < nb_nics; i++) {
1387 1388 1389
        NICInfo *nd = &nd_table[i];

        if (!nd->model) {
1390
            nd->model = g_strdup("ibmveth");
1391 1392 1393
        }

        if (strcmp(nd->model, "ibmveth") == 0) {
1394
            spapr_vlan_create(spapr->vio_bus, nd);
1395
        } else {
1396
            pci_nic_init_nofail(&nd_table[i], phb->bus, nd->model, NULL);
1397 1398 1399
        }
    }

1400
    for (i = 0; i <= drive_get_max_bus(IF_SCSI); i++) {
1401
        spapr_vscsi_create(spapr->vio_bus);
1402 1403
    }

1404
    /* Graphics */
1405
    if (spapr_vga_init(phb->bus)) {
1406
        spapr->has_graphics = true;
1407 1408
    }

1409
    if (usb_enabled(spapr->has_graphics)) {
1410
        pci_create_simple(phb->bus, -1, "pci-ohci");
1411 1412 1413 1414 1415 1416
        if (spapr->has_graphics) {
            usbdevice_create("keyboard");
            usbdevice_create("mouse");
        }
    }

1417
    if (spapr->rma_size < (MIN_RMA_SLOF << 20)) {
1418 1419 1420 1421 1422
        fprintf(stderr, "qemu: pSeries SLOF firmware requires >= "
                "%ldM guest RMA (Real Mode Area memory)\n", MIN_RMA_SLOF);
        exit(1);
    }

1423 1424 1425 1426 1427
    if (kernel_filename) {
        uint64_t lowaddr = 0;

        kernel_size = load_elf(kernel_filename, translate_kernel_address, NULL,
                               NULL, &lowaddr, NULL, 1, ELF_MACHINE, 0);
1428
        if (kernel_size == ELF_LOAD_WRONG_ENDIAN) {
1429 1430 1431 1432 1433
            kernel_size = load_elf(kernel_filename,
                                   translate_kernel_address, NULL,
                                   NULL, &lowaddr, NULL, 0, ELF_MACHINE, 0);
            kernel_le = kernel_size > 0;
        }
1434
        if (kernel_size < 0) {
1435 1436
            fprintf(stderr, "qemu: error loading %s: %s\n",
                    kernel_filename, load_elf_strerror(kernel_size));
1437 1438 1439 1440 1441
            exit(1);
        }

        /* load initrd */
        if (initrd_filename) {
1442 1443 1444 1445
            /* Try to locate the initrd in the gap between the kernel
             * and the firmware. Add a bit of space just in case
             */
            initrd_base = (KERNEL_LOAD_ADDR + kernel_size + 0x1ffff) & ~0xffff;
1446
            initrd_size = load_image_targphys(initrd_filename, initrd_base,
1447
                                              load_limit - initrd_base);
1448 1449 1450 1451 1452 1453 1454 1455 1456
            if (initrd_size < 0) {
                fprintf(stderr, "qemu: could not load initial ram disk '%s'\n",
                        initrd_filename);
                exit(1);
            }
        } else {
            initrd_base = 0;
            initrd_size = 0;
        }
1457
    }
1458

1459 1460 1461 1462
    if (bios_name == NULL) {
        bios_name = FW_FILE_NAME;
    }
    filename = qemu_find_file(QEMU_FILE_TYPE_BIOS, bios_name);
1463 1464 1465 1466 1467 1468 1469 1470 1471
    fw_size = load_image_targphys(filename, 0, FW_MAX_SIZE);
    if (fw_size < 0) {
        hw_error("qemu: could not load LPAR rtas '%s'\n", filename);
        exit(1);
    }
    g_free(filename);

    spapr->entry_point = 0x100;

1472 1473 1474 1475
    vmstate_register(NULL, 0, &vmstate_spapr, spapr);
    register_savevm_live(NULL, "spapr/htab", -1, 1,
                         &savevm_htab_handlers, spapr);

1476
    /* Prepare the device tree */
1477
    spapr->fdt_skel = spapr_create_fdt_skel(initrd_base, initrd_size,
1478
                                            kernel_size, kernel_le,
1479 1480
                                            boot_device, kernel_cmdline,
                                            spapr->epow_irq);
1481
    assert(spapr->fdt_skel != NULL);
1482 1483
}

1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501
/*
 * Translate the "kvm-type" machine option into the numeric KVM VM type.
 *
 * @vm_type: option string, or NULL when the option was not given.
 *
 * Returns 0 for NULL, 1 for "HV" and 2 for "PR".  Any other string is
 * reported as an error and terminates the process.
 */
static int spapr_kvm_type(const char *vm_type)
{
    if (vm_type == NULL) {
        return 0;
    } else if (strcmp(vm_type, "HV") == 0) {
        return 1;
    } else if (strcmp(vm_type, "PR") == 0) {
        return 2;
    }

    error_report("Unknown kvm-type specified '%s'", vm_type);
    exit(1);
}

1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558
/*
 * Implementation of an interface to adjust firmware path
 * for the bootindex property handling.
 *
 * @p:   the firmware path provider (unused here)
 * @bus: bus the device sits on; its parent decides the SCSI LUN encoding
 * @dev: device whose firmware path component is wanted
 *
 * Returns a newly allocated path component for SCSI devices behind
 * spapr-vscsi, virtio-scsi or a USB device, and for sPAPR PCI host
 * bridges; NULL for anything else.
 */
static char *spapr_get_fw_dev_path(FWPathProvider *p, BusState *bus,
                                   DeviceState *dev)
{
#define CAST(type, obj, name) \
    ((type *)object_dynamic_cast(OBJECT(obj), (name)))
    SCSIDevice *d = CAST(SCSIDevice,  dev, TYPE_SCSI_DEVICE);
    sPAPRPHBState *phb = CAST(sPAPRPHBState, dev, TYPE_SPAPR_PCI_HOST_BRIDGE);

    if (d) {
        void *spapr = CAST(void, bus->parent, "spapr-vscsi");
        VirtIOSCSI *virtio = CAST(VirtIOSCSI, bus->parent, TYPE_VIRTIO_SCSI);
        USBDevice *usb = CAST(USBDevice, bus->parent, TYPE_USB_DEVICE);

        if (spapr) {
            /*
             * Replace "channel@0/disk@0,0" with "disk@8000000000000000":
             * We use SRP luns of the form 8000 | (bus << 8) | (id << 5) | lun
             * in the top 16 bits of the 64-bit LUN
             */
            unsigned id = 0x8000 | (d->id << 8) | d->lun;
            return g_strdup_printf("%s@%"PRIX64, qdev_fw_name(dev),
                                   (uint64_t)id << 48);
        } else if (virtio) {
            /*
             * We use SRP luns of the form 01000000 | (target << 8) | lun
             * in the top 32 bits of the 64-bit LUN
             * Note: the quote above is from SLOF and it is wrong,
             * the actual binding is:
             * swap 0100 or 10 << or 20 << ( target lun-id -- srplun )
             */
            unsigned id = 0x1000000 | (d->id << 16) | d->lun;
            return g_strdup_printf("%s@%"PRIX64, qdev_fw_name(dev),
                                   (uint64_t)id << 32);
        } else if (usb) {
            /*
             * We use SRP luns of the form 01000000 | (usb-port << 16) | lun
             * in the top 32 bits of the 64-bit LUN
             */
            unsigned usb_port = atoi(usb->port->path);
            unsigned id = 0x1000000 | (usb_port << 16) | d->lun;
            return g_strdup_printf("%s@%"PRIX64, qdev_fw_name(dev),
                                   (uint64_t)id << 32);
        }
    }

    if (phb) {
        /* Replace "pci" with "pci@800000020000000" */
        return g_strdup_printf("pci@%"PRIX64, phb->buid);
    }

    return NULL;
}
/* CAST is a helper local to the function above; keep it out of the rest
 * of the file */
#undef CAST

E
Eduardo Habkost 已提交
1559 1560
/*
 * Getter for the "kvm-type" string property.
 * Returns a newly allocated copy of the machine's kvm_type string.
 */
static char *spapr_get_kvm_type(Object *obj, Error **errp)
{
    sPAPRMachineState *sm = SPAPR_MACHINE(obj);

    return g_strdup(sm->kvm_type);
}

/*
 * Setter for the "kvm-type" string property.
 * Frees any previously stored value and keeps a private copy of @value.
 */
static void spapr_set_kvm_type(Object *obj, const char *value, Error **errp)
{
    sPAPRMachineState *sm = SPAPR_MACHINE(obj);

    g_free(sm->kvm_type);
    sm->kvm_type = g_strdup(value);
}

/* Instance init: register the "kvm-type" string property on the machine */
static void spapr_machine_initfn(Object *obj)
{
    object_property_add_str(obj,
                            "kvm-type",
                            spapr_get_kvm_type,
                            spapr_set_kvm_type,
                            NULL);
}

1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596
/*
 * Work item run on one CPU: synchronise its state, then perform a
 * system reset on it.
 *
 * @arg: the CPUState to act on.
 */
static void ppc_cpu_do_nmi_on_cpu(void *arg)
{
    CPUState *target = arg;

    cpu_synchronize_state(target);
    ppc_cpu_do_system_reset(target);
}

/*
 * NMI monitor handler: queue ppc_cpu_do_nmi_on_cpu() on every CPU.
 * The cpu_index and errp arguments are not used.
 */
static void spapr_nmi(NMIState *n, int cpu_index, Error **errp)
{
    CPUState *target;

    CPU_FOREACH(target) {
        async_run_on_cpu(target, ppc_cpu_do_nmi_on_cpu, target);
    }
}

1597 1598 1599
/*
 * Class init for the base pSeries machine type: fills in the machine
 * description and hooks, and wires up the firmware-path and NMI
 * interface callbacks.
 */
static void spapr_machine_class_init(ObjectClass *oc, void *data)
{
    MachineClass *mc = MACHINE_CLASS(oc);
    FWPathProviderClass *fwc = FW_PATH_PROVIDER_CLASS(oc);
    NMIClass *nc = NMI_CLASS(oc);

    mc->name = "pseries";
    mc->desc = "pSeries Logical Partition (PAPR compliant)";
    mc->is_default = 1;
    mc->init = ppc_spapr_init;
    mc->reset = ppc_spapr_reset;
    mc->block_default_type = IF_SCSI;
    mc->max_cpus = MAX_CPUS;
    mc->no_parallel = 1;
    mc->default_boot_order = NULL;
    mc->kvm_type = spapr_kvm_type;

    fwc->get_dev_path = spapr_get_fw_dev_path;
    nc->nmi_monitor_handler = spapr_nmi;
}

static const TypeInfo spapr_machine_info = {
    .name          = TYPE_SPAPR_MACHINE,
    .parent        = TYPE_MACHINE,
1621
    .instance_size = sizeof(sPAPRMachineState),
E
Eduardo Habkost 已提交
1622
    .instance_init = spapr_machine_initfn,
1623
    .class_init    = spapr_machine_class_init,
1624 1625
    .interfaces = (InterfaceInfo[]) {
        { TYPE_FW_PATH_PROVIDER },
1626
        { TYPE_NMI },
1627 1628
        { }
    },
1629 1630
};

1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645
/*
 * Class init for the versioned "pseries-2.1" machine: overrides the name
 * and description set by the parent class and marks it non-default.
 */
static void spapr_machine_2_1_class_init(ObjectClass *oc, void *data)
{
    MachineClass *machine_class = MACHINE_CLASS(oc);

    machine_class->is_default = 0;
    machine_class->name = "pseries-2.1";
    machine_class->desc = "pSeries Logical Partition (PAPR compliant) v2.1";
}

/* QOM type for the 2.1 machine variant, derived from the base machine */
static const TypeInfo spapr_machine_2_1_info = {
    .parent        = TYPE_SPAPR_MACHINE,
    .name          = TYPE_SPAPR_MACHINE "2.1",
    .class_init    = spapr_machine_2_1_class_init,
};

1646
static void spapr_machine_register_types(void)
1647
{
1648
    type_register_static(&spapr_machine_info);
1649
    type_register_static(&spapr_machine_2_1_info);
1650 1651
}

1652
type_init(spapr_machine_register_types)