spapr.c 52.0 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26
/*
 * QEMU PowerPC pSeries Logical Partition (aka sPAPR) hardware System Emulator
 *
 * Copyright (c) 2004-2007 Fabrice Bellard
 * Copyright (c) 2007 Jocelyn Mayer
 * Copyright (c) 2010 David Gibson, IBM Corporation.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 *
 */
27
#include "sysemu/sysemu.h"
28
#include "hw/hw.h"
29
#include "hw/fw-path-provider.h"
30
#include "elf.h"
P
Paolo Bonzini 已提交
31
#include "net/net.h"
32
#include "sysemu/block-backend.h"
33 34 35
#include "sysemu/blockdev.h"
#include "sysemu/cpus.h"
#include "sysemu/kvm.h"
36
#include "kvm_ppc.h"
37
#include "mmu-hash64.h"
38
#include "qom/cpu.h"
39 40

#include "hw/boards.h"
P
Paolo Bonzini 已提交
41
#include "hw/ppc/ppc.h"
42 43
#include "hw/loader.h"

P
Paolo Bonzini 已提交
44 45 46 47
#include "hw/ppc/spapr.h"
#include "hw/ppc/spapr_vio.h"
#include "hw/pci-host/spapr.h"
#include "hw/ppc/xics.h"
48
#include "hw/pci/msi.h"
49

50
#include "hw/pci/pci.h"
51 52
#include "hw/scsi/scsi.h"
#include "hw/virtio/virtio-scsi.h"
53

54
#include "exec/address-spaces.h"
55
#include "hw/usb.h"
56
#include "qemu/config-file.h"
57
#include "qemu/error-report.h"
58
#include "trace.h"
59
#include "hw/nmi.h"
A
Avi Kivity 已提交
60

61 62
#include <libfdt.h>

63 64 65 66 67 68 69 70 71 72
/* SLOF memory layout:
 *
 * The SLOF raw image is loaded at 0; it copies its romfs right below the
 * flat device-tree, then positions SLOF itself 31M below that
 *
 * So we set FW_OVERHEAD to 40MB which should account for all of that
 * and more
 *
 * We load our kernel at 4M, leaving space for SLOF initial image
 */
73
/* Size of the buffers in which the flattened device tree is built */
#define FDT_MAX_SIZE            0x40000
/* Space reserved for the RTAS blob, placed directly above the FDT at reset */
#define RTAS_MAX_SIZE           0x10000
#define RTAS_MAX_ADDR           0x80000000 /* RTAS must stay below that */
/* Firmware image limit (4M) and the name of the firmware file */
#define FW_MAX_SIZE             0x400000
#define FW_FILE_NAME            "slof.bin"
/* 40MB of firmware overhead; the kernel is loaded right after the FW image */
#define FW_OVERHEAD             0x2800000
#define KERNEL_LOAD_ADDR        FW_MAX_SIZE

/* NOTE(review): units are not visible in this chunk (presumably MB) */
#define MIN_RMA_SLOF            128UL

/* Timebase frequency advertised to the guest when KVM is not in use */
#define TIMEBASE_FREQ           512000000ULL

#define MAX_CPUS                255

/* phandle assigned to the interrupt-controller node of the device tree */
#define PHANDLE_XICP            0x00001111

/* Hash page table size in bytes, derived from the log2 size htab_shift */
#define HTAB_SIZE(spapr)        (1ULL << ((spapr)->htab_shift))

91
typedef struct sPAPRMachineState sPAPRMachineState;
92

93
#define TYPE_SPAPR_MACHINE      "spapr-machine"
94
#define SPAPR_MACHINE(obj) \
95
    OBJECT_CHECK(sPAPRMachineState, (obj), TYPE_SPAPR_MACHINE)
96 97

/**
98
 * sPAPRMachineState:
99
 */
100
struct sPAPRMachineState {
101 102
    /*< private >*/
    MachineState parent_obj;
E
Eduardo Habkost 已提交
103 104 105

    /*< public >*/
    char *kvm_type;
106 107
};

108 109
sPAPREnvironment *spapr;

110 111 112 113 114 115 116 117 118 119 120 121
/*
 * Create and initialise an XICS interrupt controller device.
 *
 * @type:       QOM type name of the device to create
 * @nr_servers: value for the device's "nr_servers" property
 * @nr_irqs:    value for the device's "nr_irqs" property
 *
 * Returns the device as an XICSState, or NULL when qdev_init() reports
 * failure.
 */
static XICSState *try_create_xics(const char *type, int nr_servers,
                                  int nr_irqs)
{
    DeviceState *xics_dev = qdev_create(NULL, type);

    qdev_prop_set_uint32(xics_dev, "nr_servers", nr_servers);
    qdev_prop_set_uint32(xics_dev, "nr_irqs", nr_irqs);

    return qdev_init(xics_dev) < 0 ? NULL : XICS_COMMON(xics_dev);
}

static XICSState *xics_system_init(int nr_servers, int nr_irqs)
{
    XICSState *icp = NULL;

129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148
    if (kvm_enabled()) {
        QemuOpts *machine_opts = qemu_get_machine_opts();
        bool irqchip_allowed = qemu_opt_get_bool(machine_opts,
                                                "kernel_irqchip", true);
        bool irqchip_required = qemu_opt_get_bool(machine_opts,
                                                  "kernel_irqchip", false);
        if (irqchip_allowed) {
            icp = try_create_xics(TYPE_KVM_XICS, nr_servers, nr_irqs);
        }

        if (irqchip_required && !icp) {
            perror("Failed to create in-kernel XICS\n");
            abort();
        }
    }

    if (!icp) {
        icp = try_create_xics(TYPE_XICS, nr_servers, nr_irqs);
    }

149 150 151 152 153 154 155 156
    if (!icp) {
        perror("Failed to create XICS\n");
        abort();
    }

    return icp;
}

157 158 159 160 161 162 163 164
/*
 * Update the SMT related properties of one CPU node.
 *
 * @fdt:         device tree blob being modified
 * @offset:      offset of the CPU node inside @fdt
 * @cpu:         CPU providing the device tree id and the cpu_version value
 * @smt_threads: number of threads to describe
 *
 * Sets "cpu-version" when cpu->cpu_version is non-zero, then writes the
 * "ibm,ppc-interrupt-server#s" and "ibm,ppc-interrupt-gserver#s" properties.
 *
 * Returns the result of the last libfdt call, or the first negative libfdt
 * error encountered.
 */
static int spapr_fixup_cpu_smt_dt(void *fdt, int offset, PowerPCCPU *cpu,
                                  int smt_threads)
{
    uint32_t servers_prop[smt_threads];
    uint32_t gservers_prop[smt_threads * 2];
    int first_id = ppc_get_vcpu_dt_id(cpu);
    int thread;
    int rc;

    if (cpu->cpu_version) {
        rc = fdt_setprop_cell(fdt, offset, "cpu-version", cpu->cpu_version);
        if (rc < 0) {
            return rc;
        }
    }

    /* One server id per thread; each gserver entry is an (id, queue) pair */
    for (thread = 0; thread < smt_threads; thread++) {
        uint32_t server_id = cpu_to_be32(first_id + thread);

        servers_prop[thread] = server_id;
        /* Hack, direct the group queues back to cpu 0 */
        gservers_prop[2 * thread] = server_id;
        gservers_prop[2 * thread + 1] = 0;
    }

    rc = fdt_setprop(fdt, offset, "ibm,ppc-interrupt-server#s",
                     servers_prop, sizeof(servers_prop));
    if (rc < 0) {
        return rc;
    }

    return fdt_setprop(fdt, offset, "ibm,ppc-interrupt-gserver#s",
                       gservers_prop, sizeof(gservers_prop));
}

190
/*
 * Add or refresh the per-CPU properties in an existing device tree.
 *
 * Only CPUs whose device tree id is a multiple of the host SMT thread count
 * get a node.  For each of them the "/cpus" node and the "<fw_name>@<id>"
 * subnode are looked up (and created when missing), then "ibm,associativity"
 * (only with more than one NUMA node), "ibm,pft-size" and the SMT properties
 * are written.
 *
 * @fdt:   device tree blob to modify
 * @spapr: environment providing htab_shift
 *
 * Returns a non-negative value on success or the first negative libfdt error.
 */
static int spapr_fixup_cpu_dt(void *fdt, sPAPREnvironment *spapr)
{
    CPUState *cs;
    char node_name[32];
    int rc = 0;
    int host_smt = kvmppc_smt_threads();
    uint32_t pft_size_prop[] = {0, cpu_to_be32(spapr->htab_shift)};

    CPU_FOREACH(cs) {
        PowerPCCPU *cpu = POWERPC_CPU(cs);
        DeviceClass *dc = DEVICE_GET_CLASS(cs);
        int dt_id = ppc_get_vcpu_dt_id(cpu);
        uint32_t associativity[] = {cpu_to_be32(0x5),
                                    cpu_to_be32(0x0),
                                    cpu_to_be32(0x0),
                                    cpu_to_be32(0x0),
                                    cpu_to_be32(cs->numa_node),
                                    cpu_to_be32(dt_id)};
        int cpus_node;
        int cpu_node;

        /* Secondary threads do not get a node of their own */
        if (dt_id % host_smt) {
            continue;
        }

        snprintf(node_name, sizeof(node_name), "%s@%x", dc->fw_name, dt_id);

        cpus_node = fdt_path_offset(fdt, "/cpus");
        if (cpus_node < 0) {
            cpus_node = fdt_add_subnode(fdt, fdt_path_offset(fdt, "/"),
                                        "cpus");
            if (cpus_node < 0) {
                return cpus_node;
            }
        }

        cpu_node = fdt_subnode_offset(fdt, cpus_node, node_name);
        if (cpu_node < 0) {
            cpu_node = fdt_add_subnode(fdt, cpus_node, node_name);
            if (cpu_node < 0) {
                return cpu_node;
            }
        }

        if (nb_numa_nodes > 1) {
            rc = fdt_setprop(fdt, cpu_node, "ibm,associativity",
                             associativity, sizeof(associativity));
            if (rc < 0) {
                return rc;
            }
        }

        rc = fdt_setprop(fdt, cpu_node, "ibm,pft-size",
                         pft_size_prop, sizeof(pft_size_prop));
        if (rc < 0) {
            return rc;
        }

        rc = spapr_fixup_cpu_smt_dt(fdt, cpu_node, cpu,
                                    ppc_get_compat_smt_threads(cpu));
        if (rc < 0) {
            return rc;
        }
    }

    return rc;
}

254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287

/*
 * Build the cell array for the "ibm,segment-page-sizes" property.
 *
 * For every segment page size in env->sps (terminated by a zero page_shift)
 * one record is emitted: page shift, SLB encoding, number of encodings, then
 * a (page shift, PTE encoding) pair per encoding.  All cells are big-endian.
 * Emission stops at the first record that does not fit.
 *
 * @env:     CPU state providing the sps table
 * @prop:    output buffer
 * @maxsize: size of @prop in bytes
 *
 * Returns the number of bytes written to @prop.
 */
static size_t create_page_sizes_prop(CPUPPCState *env, uint32_t *prop,
                                     size_t maxsize)
{
    size_t maxcells = maxsize / sizeof(uint32_t);
    int i, j, count;
    uint32_t *p = prop;

    for (i = 0; i < PPC_PAGE_SIZES_MAX_SZ; i++) {
        struct ppc_one_seg_page_size *sps = &env->sps.sps[i];
        size_t used, needed;

        if (!sps->page_shift) {
            break;
        }
        for (count = 0; count < PPC_PAGE_SIZES_MAX_SZ; count++) {
            if (sps->enc[count].page_shift == 0) {
                break;
            }
        }

        /*
         * Same limit as "used >= maxcells - needed", but written as an
         * addition so that a buffer smaller than one record cannot make the
         * unsigned subtraction wrap and disable the check.
         */
        used = (size_t)(p - prop);
        needed = 3 + (size_t)count * 2;
        if (used + needed >= maxcells) {
            break;
        }

        *(p++) = cpu_to_be32(sps->page_shift);
        *(p++) = cpu_to_be32(sps->slb_enc);
        *(p++) = cpu_to_be32(count);
        for (j = 0; j < count; j++) {
            *(p++) = cpu_to_be32(sps->enc[j].page_shift);
            *(p++) = cpu_to_be32(sps->enc[j].pte_enc);
        }
    }

    return (p - prop) * sizeof(uint32_t);
}

288 289 290 291 292 293 294 295 296 297 298 299 300
/*
 * Return the usable size of the first NUMA node that has memory: its size
 * rounded down to a power of two and capped at ram_size.  Without NUMA
 * nodes, or when no node has memory, the whole of ram_size is returned.
 */
static hwaddr spapr_node0_size(void)
{
    int node;

    /* The loop body never runs when no NUMA nodes are configured */
    for (node = 0; node < nb_numa_nodes; node++) {
        if (!numa_info[node].node_mem) {
            continue;
        }
        return MIN(pow2floor(numa_info[node].node_mem), ram_size);
    }

    return ram_size;
}

301 302 303 304 305 306 307 308 309 310
/*
 * Evaluate a libfdt expression once and terminate on failure.
 *
 * If @exp yields a negative value, the stringified expression and the
 * fdt_strerror() text for that value are printed to stderr and the process
 * exits with status 1.  The macro declares its own local "ret", so @exp must
 * not refer to a caller variable of that name.
 */
#define _FDT(exp) \
    do { \
        int ret = (exp);                                           \
        if (ret < 0) {                                             \
            fprintf(stderr, "qemu: error creating device tree: %s: %s\n", \
                    #exp, fdt_strerror(ret));                      \
            exit(1);                                               \
        }                                                          \
    } while (0)

311 312 313 314
/*
 * Append @s1 to @s including its terminating NUL byte, so that @s ends up
 * holding a sequence of NUL-separated strings.
 */
static void add_str(GString *s, const gchar *s1)
{
    size_t len_with_nul = strlen(s1) + 1;

    g_string_append_len(s, s1, len_with_nul);
}
315

316
/*
 * Build the skeleton flattened device tree for the machine.
 *
 * The tree is created sequentially and contains the root properties and the
 * /chosen, /cpus, /rtas, /interrupt-controller and /vdevice nodes, the event
 * sources and, under KVM, the /hypervisor node.
 *
 * @initrd_base:    guest address of the initrd
 * @initrd_size:    initrd size; a reserve map entry is added when non-zero
 * @kernel_size:    kernel size; when non-zero a reserve map entry and the
 *                  "qemu,boot-kernel" property are added
 * @little_endian:  adds "qemu,boot-kernel-le" next to "qemu,boot-kernel"
 * @boot_device:    optional value for "qemu,boot-device" (may be NULL)
 * @kernel_cmdline: value for the "bootargs" property
 * @epow_irq:       interrupt number handed to spapr_events_fdt_skel()
 *
 * Returns the g_malloc0()-allocated buffer of FDT_MAX_SIZE bytes holding the
 * finished tree.  Any libfdt failure terminates the process through _FDT().
 */
static void *spapr_create_fdt_skel(hwaddr initrd_base,
                                   hwaddr initrd_size,
                                   hwaddr kernel_size,
                                   bool little_endian,
                                   const char *boot_device,
                                   const char *kernel_cmdline,
                                   uint32_t epow_irq)
{
    void *fdt;
    CPUState *cs;
    uint32_t start_prop = cpu_to_be32(initrd_base);
    uint32_t end_prop = cpu_to_be32(initrd_base + initrd_size);
    GString *hypertas = g_string_sized_new(256);
    GString *qemu_hypertas = g_string_sized_new(256);
    uint32_t refpoints[] = {cpu_to_be32(0x4), cpu_to_be32(0x4)};
    uint32_t interrupt_server_ranges_prop[] = {0, cpu_to_be32(smp_cpus)};
    int smt = kvmppc_smt_threads();
    unsigned char vec5[] = {0x0, 0x0, 0x0, 0x0, 0x0, 0x80};
    QemuOpts *opts = qemu_opts_find(qemu_find_opts("smp-opts"), NULL);
    unsigned sockets = opts ? qemu_opt_get_number(opts, "sockets", 0) : 0;
    uint32_t cpus_per_socket = sockets ? (smp_cpus / sockets) : 1;
    char *buf;

    /*
     * More sockets than CPUs makes the integer division above yield 0, which
     * would later divide by zero when computing "ibm,chip-id".  Treat that
     * case as one CPU per socket.
     */
    if (!cpus_per_socket) {
        cpus_per_socket = 1;
    }

    add_str(hypertas, "hcall-pft");
    add_str(hypertas, "hcall-term");
    add_str(hypertas, "hcall-dabr");
    add_str(hypertas, "hcall-interrupt");
    add_str(hypertas, "hcall-tce");
    add_str(hypertas, "hcall-vio");
    add_str(hypertas, "hcall-splpar");
    add_str(hypertas, "hcall-bulk");
    add_str(hypertas, "hcall-set-mode");
    add_str(qemu_hypertas, "hcall-memop1");

    fdt = g_malloc0(FDT_MAX_SIZE);
    _FDT((fdt_create(fdt, FDT_MAX_SIZE)));

    if (kernel_size) {
        _FDT((fdt_add_reservemap_entry(fdt, KERNEL_LOAD_ADDR, kernel_size)));
    }
    if (initrd_size) {
        _FDT((fdt_add_reservemap_entry(fdt, initrd_base, initrd_size)));
    }
    _FDT((fdt_finish_reservemap(fdt)));

    /* Root node */
    _FDT((fdt_begin_node(fdt, "")));
    _FDT((fdt_property_string(fdt, "device_type", "chrp")));
    _FDT((fdt_property_string(fdt, "model", "IBM pSeries (emulated by qemu)")));
    _FDT((fdt_property_string(fdt, "compatible", "qemu,pseries")));

    /*
     * Add info to guest to identify which host it is being run on
     * and what the uuid of the guest is
     */
    if (kvmppc_get_host_model(&buf)) {
        _FDT((fdt_property_string(fdt, "host-model", buf)));
        g_free(buf);
    }
    if (kvmppc_get_host_serial(&buf)) {
        _FDT((fdt_property_string(fdt, "host-serial", buf)));
        g_free(buf);
    }

    buf = g_strdup_printf(UUID_FMT, qemu_uuid[0], qemu_uuid[1],
                          qemu_uuid[2], qemu_uuid[3], qemu_uuid[4],
                          qemu_uuid[5], qemu_uuid[6], qemu_uuid[7],
                          qemu_uuid[8], qemu_uuid[9], qemu_uuid[10],
                          qemu_uuid[11], qemu_uuid[12], qemu_uuid[13],
                          qemu_uuid[14], qemu_uuid[15]);

    _FDT((fdt_property_string(fdt, "vm,uuid", buf)));
    g_free(buf);

    _FDT((fdt_property_cell(fdt, "#address-cells", 0x2)));
    _FDT((fdt_property_cell(fdt, "#size-cells", 0x2)));

    /* /chosen */
    _FDT((fdt_begin_node(fdt, "chosen")));

    /* Set Form1_affinity */
    _FDT((fdt_property(fdt, "ibm,architecture-vec-5", vec5, sizeof(vec5))));

    _FDT((fdt_property_string(fdt, "bootargs", kernel_cmdline)));
    _FDT((fdt_property(fdt, "linux,initrd-start",
                       &start_prop, sizeof(start_prop))));
    _FDT((fdt_property(fdt, "linux,initrd-end",
                       &end_prop, sizeof(end_prop))));
    if (kernel_size) {
        uint64_t kprop[2] = { cpu_to_be64(KERNEL_LOAD_ADDR),
                              cpu_to_be64(kernel_size) };

        _FDT((fdt_property(fdt, "qemu,boot-kernel", &kprop, sizeof(kprop))));
        if (little_endian) {
            _FDT((fdt_property(fdt, "qemu,boot-kernel-le", NULL, 0)));
        }
    }
    if (boot_device) {
        _FDT((fdt_property_string(fdt, "qemu,boot-device", boot_device)));
    }
    if (boot_menu) {
        _FDT((fdt_property_cell(fdt, "qemu,boot-menu", boot_menu)));
    }
    _FDT((fdt_property_cell(fdt, "qemu,graphic-width", graphic_width)));
    _FDT((fdt_property_cell(fdt, "qemu,graphic-height", graphic_height)));
    _FDT((fdt_property_cell(fdt, "qemu,graphic-depth", graphic_depth)));

    _FDT((fdt_end_node(fdt)));

    /* cpus */
    _FDT((fdt_begin_node(fdt, "cpus")));

    _FDT((fdt_property_cell(fdt, "#address-cells", 0x1)));
    _FDT((fdt_property_cell(fdt, "#size-cells", 0x0)));

    CPU_FOREACH(cs) {
        PowerPCCPU *cpu = POWERPC_CPU(cs);
        CPUPPCState *env = &cpu->env;
        DeviceClass *dc = DEVICE_GET_CLASS(cs);
        PowerPCCPUClass *pcc = POWERPC_CPU_GET_CLASS(cs);
        int index = ppc_get_vcpu_dt_id(cpu);
        char *nodename;
        uint32_t segs[] = {cpu_to_be32(28), cpu_to_be32(40),
                           0xffffffff, 0xffffffff};
        uint32_t tbfreq = kvm_enabled() ? kvmppc_get_tbfreq() : TIMEBASE_FREQ;
        uint32_t cpufreq = kvm_enabled() ? kvmppc_get_clockfreq() : 1000000000;
        uint32_t page_sizes_prop[64];
        size_t page_sizes_prop_size;

        /* Only the first thread of each core gets a node */
        if ((index % smt) != 0) {
            continue;
        }

        nodename = g_strdup_printf("%s@%x", dc->fw_name, index);

        _FDT((fdt_begin_node(fdt, nodename)));

        g_free(nodename);

        _FDT((fdt_property_cell(fdt, "reg", index)));
        _FDT((fdt_property_string(fdt, "device_type", "cpu")));

        _FDT((fdt_property_cell(fdt, "cpu-version", env->spr[SPR_PVR])));
        _FDT((fdt_property_cell(fdt, "d-cache-block-size",
                                env->dcache_line_size)));
        _FDT((fdt_property_cell(fdt, "d-cache-line-size",
                                env->dcache_line_size)));
        _FDT((fdt_property_cell(fdt, "i-cache-block-size",
                                env->icache_line_size)));
        _FDT((fdt_property_cell(fdt, "i-cache-line-size",
                                env->icache_line_size)));

        if (pcc->l1_dcache_size) {
            _FDT((fdt_property_cell(fdt, "d-cache-size", pcc->l1_dcache_size)));
        } else {
            fprintf(stderr, "Warning: Unknown L1 dcache size for cpu\n");
        }
        if (pcc->l1_icache_size) {
            _FDT((fdt_property_cell(fdt, "i-cache-size", pcc->l1_icache_size)));
        } else {
            fprintf(stderr, "Warning: Unknown L1 icache size for cpu\n");
        }

        _FDT((fdt_property_cell(fdt, "timebase-frequency", tbfreq)));
        _FDT((fdt_property_cell(fdt, "clock-frequency", cpufreq)));
        _FDT((fdt_property_cell(fdt, "ibm,slb-size", env->slb_nr)));
        _FDT((fdt_property_string(fdt, "status", "okay")));
        _FDT((fdt_property(fdt, "64-bit", NULL, 0)));

        if (env->spr_cb[SPR_PURR].oea_read) {
            _FDT((fdt_property(fdt, "ibm,purr", NULL, 0)));
        }

        if (env->mmu_model & POWERPC_MMU_1TSEG) {
            _FDT((fdt_property(fdt, "ibm,processor-segment-sizes",
                               segs, sizeof(segs))));
        }

        /* Advertise VMX/VSX (vector extensions) if available
         *   0 / no property == no vector extensions
         *   1               == VMX / Altivec available
         *   2               == VSX available */
        if (env->insns_flags & PPC_ALTIVEC) {
            uint32_t vmx = (env->insns_flags2 & PPC2_VSX) ? 2 : 1;

            _FDT((fdt_property_cell(fdt, "ibm,vmx", vmx)));
        }

        /* Advertise DFP (Decimal Floating Point) if available
         *   0 / no property == no DFP
         *   1               == DFP available */
        if (env->insns_flags2 & PPC2_DFP) {
            _FDT((fdt_property_cell(fdt, "ibm,dfp", 1)));
        }

        page_sizes_prop_size = create_page_sizes_prop(env, page_sizes_prop,
                                                      sizeof(page_sizes_prop));
        if (page_sizes_prop_size) {
            _FDT((fdt_property(fdt, "ibm,segment-page-sizes",
                               page_sizes_prop, page_sizes_prop_size)));
        }

        _FDT((fdt_property_cell(fdt, "ibm,chip-id",
                                cs->cpu_index / cpus_per_socket)));

        _FDT((fdt_end_node(fdt)));
    }

    _FDT((fdt_end_node(fdt)));

    /* RTAS */
    _FDT((fdt_begin_node(fdt, "rtas")));

    if (!kvm_enabled() || kvmppc_spapr_use_multitce()) {
        add_str(hypertas, "hcall-multi-tce");
    }
    _FDT((fdt_property(fdt, "ibm,hypertas-functions", hypertas->str,
                       hypertas->len)));
    g_string_free(hypertas, TRUE);
    _FDT((fdt_property(fdt, "qemu,hypertas-functions", qemu_hypertas->str,
                       qemu_hypertas->len)));
    g_string_free(qemu_hypertas, TRUE);

    _FDT((fdt_property(fdt, "ibm,associativity-reference-points",
        refpoints, sizeof(refpoints))));

    _FDT((fdt_property_cell(fdt, "rtas-error-log-max", RTAS_ERROR_LOG_MAX)));

    /*
     * According to PAPR, rtas ibm,os-term does not guarantee a return
     * back to the guest cpu.
     *
     * While an additional ibm,extended-os-term property indicates that
     * rtas call return will always occur. Set this property.
     */
    _FDT((fdt_property(fdt, "ibm,extended-os-term", NULL, 0)));

    _FDT((fdt_end_node(fdt)));

    /* interrupt controller */
    _FDT((fdt_begin_node(fdt, "interrupt-controller")));

    _FDT((fdt_property_string(fdt, "device_type",
                              "PowerPC-External-Interrupt-Presentation")));
    _FDT((fdt_property_string(fdt, "compatible", "IBM,ppc-xicp")));
    _FDT((fdt_property(fdt, "interrupt-controller", NULL, 0)));
    _FDT((fdt_property(fdt, "ibm,interrupt-server-ranges",
                       interrupt_server_ranges_prop,
                       sizeof(interrupt_server_ranges_prop))));
    _FDT((fdt_property_cell(fdt, "#interrupt-cells", 2)));
    _FDT((fdt_property_cell(fdt, "linux,phandle", PHANDLE_XICP)));
    _FDT((fdt_property_cell(fdt, "phandle", PHANDLE_XICP)));

    _FDT((fdt_end_node(fdt)));

    /* vdevice */
    _FDT((fdt_begin_node(fdt, "vdevice")));

    _FDT((fdt_property_string(fdt, "device_type", "vdevice")));
    _FDT((fdt_property_string(fdt, "compatible", "IBM,vdevice")));
    _FDT((fdt_property_cell(fdt, "#address-cells", 0x1)));
    _FDT((fdt_property_cell(fdt, "#size-cells", 0x0)));
    _FDT((fdt_property_cell(fdt, "#interrupt-cells", 0x2)));
    _FDT((fdt_property(fdt, "interrupt-controller", NULL, 0)));

    _FDT((fdt_end_node(fdt)));

    /* event-sources */
    spapr_events_fdt_skel(fdt, epow_irq);

    /* /hypervisor node */
    if (kvm_enabled()) {
        uint8_t hypercall[16];

        /* indicate KVM hypercall interface */
        _FDT((fdt_begin_node(fdt, "hypervisor")));
        _FDT((fdt_property_string(fdt, "compatible", "linux,kvm")));
        if (kvmppc_has_cap_fixup_hcalls()) {
            /*
             * Older KVM versions with older guest kernels were broken with the
             * magic page, don't allow the guest to map it.
             */
            kvmppc_get_hypercall(first_cpu->env_ptr, hypercall,
                                 sizeof(hypercall));
            _FDT((fdt_property(fdt, "hcall-instructions", hypercall,
                              sizeof(hypercall))));
        }
        _FDT((fdt_end_node(fdt)));
    }

    _FDT((fdt_end_node(fdt))); /* close root node */
    _FDT((fdt_finish(fdt)));

    return fdt;
}

612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628
/*
 * Compose the device tree update sent back to the guest and write it to
 * guest memory: a sPAPRDeviceTreeUpdateHeader followed by a packed device
 * tree holding the CPU nodes produced by spapr_fixup_cpu_dt().
 *
 * @addr: guest physical address of the response buffer
 * @size: size of that buffer in bytes
 *
 * Returns 0 on success, -1 when the buffer cannot hold the response.  libfdt
 * errors terminate the process through _FDT().
 */
int spapr_h_cas_compose_response(target_ulong addr, target_ulong size)
{
    void *fdt, *fdt_skel;
    sPAPRDeviceTreeUpdateHeader hdr = { .version_id = 1 };

    /* Without this the subtraction below wraps around to a huge size */
    if (size < sizeof(hdr)) {
        trace_spapr_cas_failed(size);
        return -1;
    }
    size -= sizeof(hdr);

    /* Create skeleton */
    fdt_skel = g_malloc0(size);
    _FDT((fdt_create(fdt_skel, size)));
    _FDT((fdt_begin_node(fdt_skel, "")));
    _FDT((fdt_end_node(fdt_skel)));
    _FDT((fdt_finish(fdt_skel)));
    fdt = g_malloc0(size);
    _FDT((fdt_open_into(fdt_skel, fdt, size)));
    g_free(fdt_skel);

    /* Fix skeleton up */
    _FDT((spapr_fixup_cpu_dt(fdt, spapr)));

    /* Pack resulting tree */
    _FDT((fdt_pack(fdt)));

    if (fdt_totalsize(fdt) + sizeof(hdr) > size) {
        trace_spapr_cas_failed(size);
        g_free(fdt); /* do not leak the tree on the failure path */
        return -1;
    }

    cpu_physical_memory_write(addr, &hdr, sizeof(hdr));
    cpu_physical_memory_write(addr + sizeof(hdr), fdt, fdt_totalsize(fdt));
    trace_spapr_cas_continue(fdt_totalsize(fdt) + sizeof(hdr));
    g_free(fdt);

    return 0;
}

648 649 650 651 652 653
/*
 * Add one "memory@<start>" node below the root of the device tree.
 *
 * @fdt:    device tree blob to modify
 * @nodeid: node id stored in the last "ibm,associativity" cell
 * @start:  start address of the region, also used for the node name
 * @size:   size of the region
 *
 * The node gets "device_type", "reg" (start and size as big-endian 64-bit
 * values) and "ibm,associativity".  libfdt errors terminate the process
 * through _FDT().
 */
static void spapr_populate_memory_node(void *fdt, int nodeid, hwaddr start,
                                       hwaddr size)
{
    uint32_t associativity[] = {
        cpu_to_be32(0x4), /* length */
        cpu_to_be32(0x0), cpu_to_be32(0x0),
        cpu_to_be32(0x0), cpu_to_be32(nodeid)
    };
    char mem_name[32];
    uint64_t mem_reg_property[2];
    int off;

    mem_reg_property[0] = cpu_to_be64(start);
    mem_reg_property[1] = cpu_to_be64(size);

    /* Bounded formatting: never write past the end of mem_name */
    snprintf(mem_name, sizeof(mem_name), "memory@" TARGET_FMT_lx, start);
    off = fdt_add_subnode(fdt, 0, mem_name);
    _FDT(off);
    _FDT((fdt_setprop_string(fdt, off, "device_type", "memory")));
    _FDT((fdt_setprop(fdt, off, "reg", mem_reg_property,
                      sizeof(mem_reg_property))));
    _FDT((fdt_setprop(fdt, off, "ibm,associativity", associativity,
                      sizeof(associativity))));
}

673 674
/*
 * Describe guest RAM in the device tree as a series of memory nodes.
 *
 * Nodes without memory are skipped.  Each remaining node's memory is clipped
 * to what is left of ram_size.  The first region always starts at address 0
 * and is sized by spapr->rma_size.  The rest of every node is split into
 * power-of-two sized chunks that are naturally aligned to their start
 * address.  Without NUMA configuration, all of RAM is treated as one node.
 *
 * @spapr: environment providing rma_size
 * @fdt:   device tree blob to modify
 *
 * Always returns 0.
 */
static int spapr_populate_memory(sPAPREnvironment *spapr, void *fdt)
{
    NodeInfo whole_ram;
    NodeInfo *nodes = numa_info;
    int node_count = nb_numa_nodes;
    hwaddr next_addr = 0;
    int n;

    /* No NUMA nodes, assume there is just one node with whole RAM */
    if (!nb_numa_nodes) {
        whole_ram.node_mem = ram_size;
        nodes = &whole_ram;
        node_count = 1;
    }

    for (n = 0; n < node_count; ++n) {
        hwaddr remaining = 0;

        if (!nodes[n].node_mem) {
            continue;
        }

        /* Clip the node to the RAM that has not been described yet */
        if (next_addr < ram_size) {
            hwaddr room = ram_size - next_addr;

            remaining = nodes[n].node_mem;
            if (remaining > room) {
                remaining = room;
            }
        }

        if (!next_addr) {
            /* ppc_spapr_init() checks for rma_size <= node0_size already */
            spapr_populate_memory_node(fdt, n, 0, spapr->rma_size);
            next_addr += spapr->rma_size;
            remaining -= spapr->rma_size;
        }

        while (remaining) {
            hwaddr chunk = pow2floor(remaining);

            /* next_addr != 0 here; limit the chunk to its alignment */
            if (ctzl(next_addr) < ctzl(chunk)) {
                chunk = 1ULL << ctzl(next_addr);
            }

            spapr_populate_memory_node(fdt, n, next_addr, chunk);
            remaining -= chunk;
            next_addr += chunk;
        }
    }

    return 0;
}

722
/*
 * Complete the device tree from the skeleton and copy it into guest memory.
 *
 * The skeleton in spapr->fdt_skel is opened into a temporary buffer, then
 * memory, VIO device, PCI host bridge, RTAS and CPU information, the boot
 * device list and (without graphics) the stdout selection are added.  The
 * packed result is written to guest physical memory at @fdt_addr.
 *
 * @spapr:     environment describing the machine
 * @fdt_addr:  guest physical address receiving the tree
 * @rtas_addr: guest physical address of RTAS, recorded in the tree
 * @rtas_size: size of the RTAS blob, recorded in the tree
 *
 * Failures while adding memory, VIO or PCI nodes are fatal; RTAS, CPU and
 * boot list failures are only reported.
 */
static void spapr_finalize_fdt(sPAPREnvironment *spapr,
                               hwaddr fdt_addr,
                               hwaddr rtas_addr,
                               hwaddr rtas_size)
{
    int ret;
    size_t i;
    size_t cb = 0;
    char *bootlist;
    void *fdt;
    sPAPRPHBState *phb;

    fdt = g_malloc(FDT_MAX_SIZE);

    /* open out the base tree into a temp buffer for the final tweaks */
    _FDT((fdt_open_into(spapr->fdt_skel, fdt, FDT_MAX_SIZE)));

    ret = spapr_populate_memory(spapr, fdt);
    if (ret < 0) {
        fprintf(stderr, "couldn't setup memory nodes in fdt\n");
        exit(1);
    }

    ret = spapr_populate_vdevice(spapr->vio_bus, fdt);
    if (ret < 0) {
        fprintf(stderr, "couldn't setup vio devices in fdt\n");
        exit(1);
    }

    QLIST_FOREACH(phb, &spapr->phbs, list) {
        ret = spapr_populate_pci_dt(phb, PHANDLE_XICP, fdt);
        /* Check every PHB, not only the result of the last one */
        if (ret < 0) {
            fprintf(stderr, "couldn't setup PCI devices in fdt\n");
            exit(1);
        }
    }

    /* RTAS */
    ret = spapr_rtas_device_tree_setup(fdt, rtas_addr, rtas_size);
    if (ret < 0) {
        fprintf(stderr, "Couldn't set up RTAS device tree properties\n");
    }

    /* Advertise NUMA via ibm,associativity */
    ret = spapr_fixup_cpu_dt(fdt, spapr);
    if (ret < 0) {
        fprintf(stderr, "Couldn't finalize CPU device tree properties\n");
    }

    bootlist = get_boot_devices_list(&cb, true);
    if (cb && bootlist) {
        int offset = fdt_path_offset(fdt, "/chosen");
        if (offset < 0) {
            exit(1);
        }
        /* The property is a single space-separated string */
        for (i = 0; i < cb; i++) {
            if (bootlist[i] == '\n') {
                bootlist[i] = ' ';
            }
        }
        ret = fdt_setprop_string(fdt, offset, "qemu,boot-list", bootlist);
        if (ret < 0) {
            fprintf(stderr, "Couldn't set up qemu,boot-list property\n");
        }
    }

    if (!spapr->has_graphics) {
        spapr_populate_chosen_stdout(fdt, spapr->vio_bus);
    }

    _FDT((fdt_pack(fdt)));

    if (fdt_totalsize(fdt) > FDT_MAX_SIZE) {
        hw_error("FDT too big ! 0x%x bytes (max is 0x%x)\n",
                 fdt_totalsize(fdt), FDT_MAX_SIZE);
        exit(1);
    }

    cpu_physical_memory_write(fdt_addr, fdt, fdt_totalsize(fdt));

    g_free(bootlist);
    g_free(fdt);
}

/*
 * Map a kernel address to its load address: keep the low 28 bits of @addr
 * and offset them by KERNEL_LOAD_ADDR.  @opaque is unused.
 * NOTE(review): presumably used as the address translation callback of the
 * kernel loader - confirm at the call site.
 */
static uint64_t translate_kernel_address(void *opaque, uint64_t addr)
{
    uint64_t image_offset = addr & 0x0fffffff;

    return image_offset + KERNEL_LOAD_ADDR;
}

809
/*
 * Handle a hypercall issued by @cpu.
 *
 * A call made with MSR[PR] set is rejected with H_PRIVILEGE.  Otherwise it is
 * dispatched through spapr_hypercall() with the opcode taken from r3 and the
 * arguments starting at r4.  In both cases the result is stored in r3.
 */
static void emulate_spapr_hypercall(PowerPCCPU *cpu)
{
    /* The local must be called "env": msr_pr presumably expands to it */
    CPUPPCState *env = &cpu->env;

    if (!(msr_pr)) {
        env->gpr[3] = spapr_hypercall(cpu, env->gpr[3], &env->gpr[4]);
        return;
    }

    hcall_dprintf("Hypercall made with MSR[PR]=1\n");
    env->gpr[3] = H_PRIVILEGE;
}

821 822 823 824 825 826 827 828 829 830 831 832 833
/*
 * Reset the guest hash page table.
 *
 * kvmppc_reset_htab() is asked first.  A positive result is taken as the
 * table shift of a kernel-managed table and recorded.  Otherwise a table of
 * HTAB_SIZE(spapr) bytes is allocated on first use and cleared.  When
 * spapr->vrma_adjust is set, the RMA size is recomputed afterwards.
 */
static void spapr_reset_htab(sPAPREnvironment *spapr)
{
    long kernel_shift = kvmppc_reset_htab(spapr->htab_shift);

    if (kernel_shift > 0) {
        /* Kernel handles htab, we don't need to allocate one */
        spapr->htab_shift = kernel_shift;
        kvmppc_kern_htab = true;
    } else {
        unsigned long long htab_bytes = HTAB_SIZE(spapr);

        /* Allocate an htab if we don't yet have one */
        if (!spapr->htab) {
            spapr->htab = qemu_memalign(htab_bytes, htab_bytes);
        }

        /* And clear it */
        memset(spapr->htab, 0, htab_bytes);
    }

    /* Update the RMA size if necessary */
    if (spapr->vrma_adjust) {
        spapr->rma_size = kvmppc_rma_size(spapr_node0_size(),
                                          spapr->htab_shift);
    }
}

852
static void ppc_spapr_reset(void)
853
{
854
    PowerPCCPU *first_ppc_cpu;
855
    uint32_t rtas_limit;
856

857 858
    /* Reset the hash table & recalc the RMA */
    spapr_reset_htab(spapr);
859

860
    qemu_devices_reset();
861

862 863 864 865 866 867 868 869 870
    /*
     * We place the device tree and RTAS just below either the top of the RMA,
     * or just below 2GB, whichever is lowere, so that it can be
     * processed with 32-bit real mode code if necessary
     */
    rtas_limit = MIN(spapr->rma_size, RTAS_MAX_ADDR);
    spapr->rtas_addr = rtas_limit - RTAS_MAX_SIZE;
    spapr->fdt_addr = spapr->rtas_addr - FDT_MAX_SIZE;

871 872 873 874
    /* Load the fdt */
    spapr_finalize_fdt(spapr, spapr->fdt_addr, spapr->rtas_addr,
                       spapr->rtas_size);

875 876 877 878
    /* Copy RTAS over */
    cpu_physical_memory_write(spapr->rtas_addr, spapr->rtas_blob,
                              spapr->rtas_size);

879
    /* Set up the entry state */
880 881 882 883 884
    first_ppc_cpu = POWERPC_CPU(first_cpu);
    first_ppc_cpu->env.gpr[3] = spapr->fdt_addr;
    first_ppc_cpu->env.gpr[5] = 0;
    first_cpu->halted = 0;
    first_ppc_cpu->env.nip = spapr->entry_point;
885 886 887

}

888 889
static void spapr_cpu_reset(void *opaque)
{
890
    PowerPCCPU *cpu = opaque;
891
    CPUState *cs = CPU(cpu);
892
    CPUPPCState *env = &cpu->env;
893

894
    cpu_reset(cs);
895 896 897 898

    /* All CPUs start halted.  CPU0 is unhalted from the machine level
     * reset code and the rest are explicitly started up by the guest
     * using an RTAS call */
899
    cs->halted = 1;
900 901

    env->spr[SPR_HIOR] = 0;
902

903
    env->external_htab = (uint8_t *)spapr->htab;
904 905 906 907 908 909 910
    if (kvm_enabled() && !env->external_htab) {
        /*
         * HV KVM, set external_htab to 1 so our ppc_hash64_load_hpte*
         * functions do the right thing.
         */
        env->external_htab = (void *)1;
    }
911
    env->htab_base = -1;
912 913 914 915 916 917 918
    /*
     * htab_mask is the mask used to normalize hash value to PTEG index.
     * htab_shift is log2 of hash table size.
     * We have 8 hpte per group, and each hpte is 16 bytes.
     * ie have 128 bytes per hpte entry.
     */
    env->htab_mask = (1ULL << ((spapr)->htab_shift - 7)) - 1;
919
    env->spr[SPR_SDR1] = (target_ulong)(uintptr_t)spapr->htab |
920
        (spapr->htab_shift - 18);
921 922
}

D
David Gibson 已提交
923 924
static void spapr_create_nvram(sPAPREnvironment *spapr)
{
    /* Create the "spapr-nvram" device on the VIO bus */
    DeviceState *nvram_dev = qdev_create(&spapr->vio_bus->bus, "spapr-nvram");
    DriveInfo *pflash = drive_get(IF_PFLASH, 0, 0);

    /* Attach the first pflash drive as the "drive" property, if any */
    if (pflash != NULL) {
        qdev_prop_set_drive_nofail(nvram_dev, "drive",
                                   blk_bs(blk_by_legacy_dinfo(pflash)));
    }

    qdev_init_nofail(nvram_dev);

    spapr->nvram = (struct sPAPRNVRAM *)nvram_dev;
}

938
/* Returns whether we want to use VGA or not */
939 940
static int spapr_vga_init(PCIBus *pci_bus)
{
    /*
     * Decide from the global vga_interface_type whether the machine has
     * graphics.  Returns non-zero when VGA is in use, zero otherwise.
     * An unsupported model is a fatal configuration error.
     */
    switch (vga_interface_type) {
    case VGA_NONE:
        return false;
    case VGA_DEVICE:
        return true;
    case VGA_STD:
        return pci_vga_init(pci_bus) != NULL;
    default:
        fprintf(stderr, "This vga model is not supported, "
                "currently it only supports -vga std\n");
        /* This is a failure, so report it with a non-zero exit status */
        exit(1);
    }
}

955 956
/* Migration description of the machine-level sPAPR state */
static const VMStateDescription vmstate_spapr = {
    .name               = "spapr",
    .version_id         = 2,
    .minimum_version_id = 1,
    .fields             = (VMStateField[]) {
        /* 4 bytes kept for stream compatibility; used to be @next_irq */
        VMSTATE_UNUSED(4),

        /* RTC offset */
        VMSTATE_UINT64(rtc_offset, sPAPREnvironment),

        /* Timebase state, present from version 2 on */
        VMSTATE_PPC_TIMEBASE_V(tb, sPAPREnvironment, 2),

        VMSTATE_END_OF_LIST()
    },
};

/*
 * Helpers for walking the hash page table as an array of entries, each
 * entry being two consecutive uint64_t words.
 */
/* Pointer to entry number _i of the table starting at _table */
#define HPTE(_table, _i)   (void *)(((uint64_t *)(_table)) + ((_i) * 2))
/* Non-zero if the first word of the entry has HPTE64_V_VALID set */
#define HPTE_VALID(_hpte)  (tswap64(*((uint64_t *)(_hpte))) & HPTE64_V_VALID)
/* Non-zero if the first word of the entry has HPTE64_V_HPTE_DIRTY set */
#define HPTE_DIRTY(_hpte)  (tswap64(*((uint64_t *)(_hpte))) & HPTE64_V_HPTE_DIRTY)
/* Clear HPTE64_V_HPTE_DIRTY in the first word of the entry, in place */
#define CLEAN_HPTE(_hpte)  ((*(uint64_t *)(_hpte)) &= tswap64(~HPTE64_V_HPTE_DIRTY))

/*
 * Start of hash table migration: emit the stream header and prepare
 * either the in-QEMU table walk or the KVM file descriptor.
 * Returns 0 on success, -1 if the KVM htab fd cannot be opened.
 */
static int htab_save_setup(QEMUFile *f, void *opaque)
{
    sPAPREnvironment *s = opaque;

    /* "Iteration" header: the (non-zero) hash table shift */
    qemu_put_be32(f, s->htab_shift);

    if (s->htab) {
        /* Table lives in QEMU: start a first pass from slot 0 */
        s->htab_save_index = 0;
        s->htab_first_pass = true;
        return 0;
    }

    /* No table in QEMU, so it must be managed by KVM */
    assert(kvm_enabled());

    s->htab_fd = kvmppc_get_htab_fd(false);
    if (s->htab_fd < 0) {
        fprintf(stderr, "Unable to open fd for reading hash table from KVM: %s\n",
                strerror(errno));
        return -1;
    }

    return 0;
}

/*
 * First migration pass over the in-QEMU hash table: send every valid
 * HPTE once, clearing the dirty bit of each entry that is examined.
 *
 * @f:      migration stream
 * @spapr:  machine state; htab_save_index is the resume position and is
 *          updated on return, htab_first_pass is cleared once the whole
 *          table has been walked
 * @max_ns: time budget; the pass stops after the first chunk that is
 *          written once the budget is exceeded
 */
static void htab_save_first_pass(QEMUFile *f, sPAPREnvironment *spapr,
                                 int64_t max_ns)
{
    int htabslots = HTAB_SIZE(spapr) / HASH_PTE_SIZE_64;
    int index = spapr->htab_save_index;
    int64_t starttime = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);

    assert(spapr->htab_first_pass);

    do {
        int chunkstart;

        /*
         * Consume invalid HPTEs.  The entry must be cleaned before the
         * index is advanced, otherwise the wrong entry is cleaned and
         * the final iteration writes one slot past the end of the table.
         */
        while ((index < htabslots)
               && !HPTE_VALID(HPTE(spapr->htab, index))) {
            CLEAN_HPTE(HPTE(spapr->htab, index));
            index++;
        }

        /*
         * Consume valid HPTEs.  The chunk length travels as a 16-bit
         * field, so a run is capped at UINT16_MAX entries; a longer run
         * is simply continued by the next chunk.
         */
        chunkstart = index;
        while ((index < htabslots) && (index - chunkstart < UINT16_MAX)
               && HPTE_VALID(HPTE(spapr->htab, index))) {
            CLEAN_HPTE(HPTE(spapr->htab, index));
            index++;
        }

        if (index > chunkstart) {
            int n_valid = index - chunkstart;

            /* Chunk header: start index, n_valid, n_invalid (always 0) */
            qemu_put_be32(f, chunkstart);
            qemu_put_be16(f, n_valid);
            qemu_put_be16(f, 0);
            qemu_put_buffer(f, HPTE(spapr->htab, chunkstart),
                            HASH_PTE_SIZE_64 * n_valid);

            if ((qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - starttime) > max_ns) {
                break;
            }
        }
    } while ((index < htabslots) && !qemu_file_rate_limit(f));

    if (index >= htabslots) {
        /* Whole table walked: later passes only send dirty entries */
        assert(index == htabslots);
        index = 0;
        spapr->htab_first_pass = false;
    }
    spapr->htab_save_index = index;
}

1049 1050
/*
 * Subsequent migration pass over the in-QEMU hash table: send only
 * entries whose dirty bit is set, clearing the bit as they are sent.
 *
 * @f:      migration stream
 * @spapr:  machine state; htab_save_index is the resume position and is
 *          updated on return
 * @max_ns: time budget; a negative value marks the final pass, which
 *          ignores both the time budget and the stream rate limit
 *
 * Returns 1 when every slot was examined and nothing had to be sent,
 * 0 otherwise.
 */
static int htab_save_later_pass(QEMUFile *f, sPAPREnvironment *spapr,
                                int64_t max_ns)
{
    bool final = max_ns < 0;
    int htabslots = HTAB_SIZE(spapr) / HASH_PTE_SIZE_64;
    int examined = 0, sent = 0;
    int index = spapr->htab_save_index;
    int64_t starttime = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);

    assert(!spapr->htab_first_pass);

    do {
        int chunkstart, invalidstart;

        /* Consume non-dirty HPTEs */
        while ((index < htabslots)
               && !HPTE_DIRTY(HPTE(spapr->htab, index))) {
            index++;
            examined++;
        }

        chunkstart = index;
        /*
         * Consume valid dirty HPTEs.  The count is sent as a 16-bit
         * field, so cap the run at UINT16_MAX entries; the remainder is
         * picked up by the next chunk.  Without the cap a long run is
         * truncated on the wire and a chunk at index 0 could even be
         * mistaken for the end marker.
         */
        while ((index < htabslots) && (index - chunkstart < UINT16_MAX)
               && HPTE_DIRTY(HPTE(spapr->htab, index))
               && HPTE_VALID(HPTE(spapr->htab, index))) {
            CLEAN_HPTE(HPTE(spapr->htab, index));
            index++;
            examined++;
        }

        invalidstart = index;
        /* Consume invalid dirty HPTEs, with the same 16-bit cap */
        while ((index < htabslots) && (index - invalidstart < UINT16_MAX)
               && HPTE_DIRTY(HPTE(spapr->htab, index))
               && !HPTE_VALID(HPTE(spapr->htab, index))) {
            CLEAN_HPTE(HPTE(spapr->htab, index));
            index++;
            examined++;
        }

        if (index > chunkstart) {
            int n_valid = invalidstart - chunkstart;
            int n_invalid = index - invalidstart;

            /* Chunk header, then the data of the valid entries only */
            qemu_put_be32(f, chunkstart);
            qemu_put_be16(f, n_valid);
            qemu_put_be16(f, n_invalid);
            qemu_put_buffer(f, HPTE(spapr->htab, chunkstart),
                            HASH_PTE_SIZE_64 * n_valid);
            sent += index - chunkstart;

            if (!final && (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - starttime) > max_ns) {
                break;
            }
        }

        if (examined >= htabslots) {
            break;
        }

        if (index >= htabslots) {
            /* Wrap around and keep scanning from the first slot */
            assert(index == htabslots);
            index = 0;
        }
    } while ((examined < htabslots) && (!qemu_file_rate_limit(f) || final));

    if (index >= htabslots) {
        assert(index == htabslots);
        index = 0;
    }

    spapr->htab_save_index = index;

    return (examined >= htabslots) && (sent == 0) ? 1 : 0;
}

1126 1127 1128
/* Time budget handed to the save helpers by htab_save_iterate() */
#define MAX_ITERATION_NS    5000000 /* 5 ms */
/* Buffer size argument passed to kvmppc_save_htab() */
#define MAX_KVM_BUF_SIZE    2048

1129 1130 1131
/*
 * One iterative step of hash table migration.  Returns a negative value
 * if saving the KVM-managed table failed, otherwise the result of the
 * pass that was run (0 for a first pass).
 */
static int htab_save_iterate(QEMUFile *f, void *opaque)
{
    sPAPREnvironment *s = opaque;
    int ret = 0;

    /* Iteration header */
    qemu_put_be32(f, 0);

    if (s->htab) {
        /* Table lives in QEMU: full first pass, then dirty-only passes */
        if (s->htab_first_pass) {
            htab_save_first_pass(f, s, MAX_ITERATION_NS);
        } else {
            ret = htab_save_later_pass(f, s, MAX_ITERATION_NS);
        }
    } else {
        /* Table is managed by KVM: stream it through the htab fd */
        assert(kvm_enabled());

        ret = kvmppc_save_htab(f, s->htab_fd,
                               MAX_KVM_BUF_SIZE, MAX_ITERATION_NS);
        if (ret < 0) {
            return ret;
        }
    }

    /* End marker */
    qemu_put_be32(f, 0);
    qemu_put_be16(f, 0);
    qemu_put_be16(f, 0);

    return ret;
}

/*
 * Final step of hash table migration: send everything that is still
 * outstanding, without time or rate limits.
 * Returns 0 on success or the negative result of kvmppc_save_htab().
 */
static int htab_save_complete(QEMUFile *f, void *opaque)
{
    sPAPREnvironment *spapr = opaque;

    /* Iteration header */
    qemu_put_be32(f, 0);

    if (!spapr->htab) {
        int rc;

        assert(kvm_enabled());

        rc = kvmppc_save_htab(f, spapr->htab_fd, MAX_KVM_BUF_SIZE, -1);

        /*
         * The fd is no longer needed whether or not the save worked;
         * closing it on the failure path too avoids leaking it, since a
         * later htab_save_setup() opens a fresh one.
         */
        close(spapr->htab_fd);
        spapr->htab_fd = -1;

        if (rc < 0) {
            return rc;
        }
    } else {
        htab_save_later_pass(f, spapr, -1);
    }

    /* End marker */
    qemu_put_be32(f, 0);
    qemu_put_be16(f, 0);
    qemu_put_be16(f, 0);

    return 0;
}

/*
 * Load one section of the hash table migration stream.
 *
 * A section starting with a non-zero word is the setup section and only
 * carries the hash table shift, which must match the local one.  Any
 * other section is a list of chunks (index, n_valid, n_invalid, data of
 * the valid entries) terminated by an all-zero chunk header.
 *
 * Returns 0 on success and a negative value on a bad version, a shift
 * mismatch, a chunk outside the table, or a KVM failure.
 */
static int htab_load(QEMUFile *f, void *opaque, int version_id)
{
    sPAPREnvironment *spapr = opaque;
    uint32_t section_hdr;
    int fd = -1;
    int ret = 0;

    if (version_id < 1 || version_id > 1) {
        fprintf(stderr, "htab_load() bad version\n");
        return -EINVAL;
    }

    section_hdr = qemu_get_be32(f);

    if (section_hdr) {
        /* First section, just the hash shift */
        if (spapr->htab_shift != section_hdr) {
            return -EINVAL;
        }
        return 0;
    }

    if (!spapr->htab) {
        assert(kvm_enabled());

        fd = kvmppc_get_htab_fd(true);
        if (fd < 0) {
            fprintf(stderr, "Unable to open fd to restore KVM hash table: %s\n",
                    strerror(errno));
            /* Fail the load instead of tripping the assert further down */
            return fd;
        }
    }

    while (true) {
        uint32_t index;
        uint16_t n_valid, n_invalid;

        index = qemu_get_be32(f);
        n_valid = qemu_get_be16(f);
        n_invalid = qemu_get_be16(f);

        if ((index == 0) && (n_valid == 0) && (n_invalid == 0)) {
            /* End of Stream */
            break;
        }

        /*
         * Do the bounds check in 64 bits: the values come from the
         * stream, and a 32-bit sum could wrap and let an out-of-range
         * index through.
         */
        if (((uint64_t)index + n_valid + n_invalid) >
            (HTAB_SIZE(spapr) / HASH_PTE_SIZE_64)) {
            /* Bad index in stream */
            fprintf(stderr, "htab_load() bad index %" PRIu32
                    " (%hu+%hu entries) "
                    "in htab stream (htab_shift=%d)\n", index, n_valid, n_invalid,
                    spapr->htab_shift);
            ret = -EINVAL;
            break;
        }

        if (spapr->htab) {
            if (n_valid) {
                qemu_get_buffer(f, HPTE(spapr->htab, index),
                                HASH_PTE_SIZE_64 * n_valid);
            }
            if (n_invalid) {
                /* Invalid entries carry no data; they are zeroed locally */
                memset(HPTE(spapr->htab, index + n_valid), 0,
                       HASH_PTE_SIZE_64 * n_invalid);
            }
        } else {
            int rc;

            assert(fd >= 0);

            rc = kvmppc_load_htab_chunk(f, fd, index, n_valid, n_invalid);
            if (rc < 0) {
                ret = rc;
                break;
            }
        }
    }

    /* Single exit point so the KVM fd is released on error paths too */
    if (fd >= 0) {
        close(fd);
    }

    return ret;
}

/* Live-migration callbacks for the hash page table */
static SaveVMHandlers savevm_htab_handlers = {
    .save_live_setup = htab_save_setup,
    .save_live_iterate = htab_save_iterate,
    .save_live_complete = htab_save_complete,
    .load_state = htab_load,
};

1278
/* pSeries LPAR / sPAPR hardware init */
1279
/*
 * Machine init for the pSeries LPAR: allocates the global sPAPR state,
 * sizes the RMA and hash table, creates the interrupt controller, CPUs,
 * RAM, VIO and PCI devices, loads RTAS, the optional kernel/initrd and
 * the firmware, registers migration state and builds the device tree
 * skeleton.  Any failure is fatal and terminates QEMU.
 */
static void ppc_spapr_init(MachineState *machine)
{
    ram_addr_t ram_size = machine->ram_size;
    const char *cpu_model = machine->cpu_model;
    const char *kernel_filename = machine->kernel_filename;
    const char *kernel_cmdline = machine->kernel_cmdline;
    const char *initrd_filename = machine->initrd_filename;
    const char *boot_device = machine->boot_order;
    PowerPCCPU *cpu;
    CPUPPCState *env;
    PCIHostState *phb;
    int i;
    MemoryRegion *sysmem = get_system_memory();
    MemoryRegion *ram = g_new(MemoryRegion, 1);
    MemoryRegion *rma_region;
    void *rma = NULL;
    hwaddr rma_alloc_size;
    hwaddr node0_size = spapr_node0_size();
    uint32_t initrd_base = 0;
    long kernel_size = 0, initrd_size = 0;
    long load_limit, fw_size;
    long rtas_image_size;
    bool kernel_le = false;
    char *filename;

    msi_supported = true;

    spapr = g_malloc0(sizeof(*spapr));
    QLIST_INIT(&spapr->phbs);

    cpu_ppc_hypercall = emulate_spapr_hypercall;

    /* Allocate RMA if necessary */
    rma_alloc_size = kvmppc_alloc_rma(&rma);

    if (rma_alloc_size == -1) {
        hw_error("qemu: Unable to create RMA\n");
        exit(1);
    }

    if (rma_alloc_size && (rma_alloc_size < node0_size)) {
        spapr->rma_size = rma_alloc_size;
    } else {
        spapr->rma_size = node0_size;

        /* With KVM, we don't actually know whether KVM supports an
         * unbounded RMA (PR KVM) or is limited by the hash table size
         * (HV KVM using VRMA), so we always assume the latter
         *
         * In that case, we also limit the initial allocations for RTAS
         * etc... to 256M since we have no way to know what the VRMA size
         * is going to be, as it depends on the size of the hash table,
         * which isn't determined yet.
         */
        if (kvm_enabled()) {
            spapr->vrma_adjust = 1;
            spapr->rma_size = MIN(spapr->rma_size, 0x10000000);
        }
    }

    if (spapr->rma_size > node0_size) {
        fprintf(stderr, "Error: Numa node 0 has to span the RMA (%#08"HWADDR_PRIx")\n",
                spapr->rma_size);
        exit(1);
    }

    /* Setup a load limit for the ramdisk leaving room for SLOF and FDT */
    load_limit = MIN(spapr->rma_size, RTAS_MAX_ADDR) - FW_OVERHEAD;

    /* We aim for a hash table of size 1/128 the size of RAM.  The
     * normal rule of thumb is 1/64 the size of RAM, but that's much
     * more than needed for the Linux guests we support. */
    spapr->htab_shift = 18; /* Minimum architected size */
    while (spapr->htab_shift <= 46) {
        if ((1ULL << (spapr->htab_shift + 7)) >= ram_size) {
            break;
        }
        spapr->htab_shift++;
    }

    /* Set up Interrupt Controller before we create the VCPUs */
    spapr->icp = xics_system_init(smp_cpus * kvmppc_smt_threads() / smp_threads,
                                  XICS_IRQS);

    /* init CPUs */
    if (cpu_model == NULL) {
        cpu_model = kvm_enabled() ? "host" : "POWER7";
    }
    for (i = 0; i < smp_cpus; i++) {
        cpu = cpu_ppc_init(cpu_model);
        if (cpu == NULL) {
            fprintf(stderr, "Unable to find PowerPC CPU definition\n");
            exit(1);
        }
        env = &cpu->env;

        /* Set time-base frequency to 512 MHz */
        cpu_ppc_tb_init(env, TIMEBASE_FREQ);

        /* PAPR always has exception vectors in RAM not ROM. To ensure this,
         * MSR[IP] should never be set.
         */
        env->msr_mask &= ~(1 << 6);

        /* Tell KVM that we're in PAPR mode */
        if (kvm_enabled()) {
            kvmppc_set_papr(cpu);
        }

        if (cpu->max_compat) {
            if (ppc_set_compat(cpu, cpu->max_compat) < 0) {
                exit(1);
            }
        }

        xics_cpu_setup(spapr->icp, cpu);

        qemu_register_reset(spapr_cpu_reset, cpu);
    }

    /* allocate RAM */
    spapr->ram_limit = ram_size;
    memory_region_allocate_system_memory(ram, NULL, "ppc_spapr.ram",
                                         spapr->ram_limit);
    memory_region_add_subregion(sysmem, 0, ram);

    if (rma_alloc_size && rma) {
        rma_region = g_new(MemoryRegion, 1);
        memory_region_init_ram_ptr(rma_region, NULL, "ppc_spapr.rma",
                                   rma_alloc_size, rma);
        vmstate_register_ram_global(rma_region);
        memory_region_add_subregion(sysmem, 0, rma_region);
    }

    /* Load the RTAS blob; a missing or unreadable image is fatal */
    filename = qemu_find_file(QEMU_FILE_TYPE_BIOS, "spapr-rtas.bin");
    if (filename == NULL) {
        fprintf(stderr, "qemu: could not find LPAR rtas '%s'\n",
                "spapr-rtas.bin");
        exit(1);
    }
    rtas_image_size = get_image_size(filename);
    if (rtas_image_size < 0) {
        /* Do not feed a negative size to g_malloc() */
        hw_error("qemu: could not load LPAR rtas '%s'\n", filename);
        exit(1);
    }
    spapr->rtas_size = rtas_image_size;
    spapr->rtas_blob = g_malloc(spapr->rtas_size);
    if (load_image_size(filename, spapr->rtas_blob, spapr->rtas_size) < 0) {
        hw_error("qemu: could not load LPAR rtas '%s'\n", filename);
        exit(1);
    }
    if (spapr->rtas_size > RTAS_MAX_SIZE) {
        hw_error("RTAS too big ! 0x%zx bytes (max is 0x%x)\n",
                 spapr->rtas_size, RTAS_MAX_SIZE);
        exit(1);
    }
    g_free(filename);

    /* Set up EPOW events infrastructure */
    spapr_events_init(spapr);

    /* Set up VIO bus */
    spapr->vio_bus = spapr_vio_bus_init();

    for (i = 0; i < MAX_SERIAL_PORTS; i++) {
        if (serial_hds[i]) {
            spapr_vty_create(spapr->vio_bus, serial_hds[i]);
        }
    }

    /* We always have at least the nvram device on VIO */
    spapr_create_nvram(spapr);

    /* Set up PCI */
    spapr_pci_rtas_init();

    phb = spapr_create_phb(spapr, 0);

    for (i = 0; i < nb_nics; i++) {
        NICInfo *nd = &nd_table[i];

        if (!nd->model) {
            nd->model = g_strdup("ibmveth");
        }

        if (strcmp(nd->model, "ibmveth") == 0) {
            spapr_vlan_create(spapr->vio_bus, nd);
        } else {
            pci_nic_init_nofail(&nd_table[i], phb->bus, nd->model, NULL);
        }
    }

    for (i = 0; i <= drive_get_max_bus(IF_SCSI); i++) {
        spapr_vscsi_create(spapr->vio_bus);
    }

    /* Graphics */
    if (spapr_vga_init(phb->bus)) {
        spapr->has_graphics = true;
    }

    if (usb_enabled(spapr->has_graphics)) {
        pci_create_simple(phb->bus, -1, "pci-ohci");
        if (spapr->has_graphics) {
            usbdevice_create("keyboard");
            usbdevice_create("mouse");
        }
    }

    if (spapr->rma_size < (MIN_RMA_SLOF << 20)) {
        fprintf(stderr, "qemu: pSeries SLOF firmware requires >= "
                "%ldM guest RMA (Real Mode Area memory)\n", MIN_RMA_SLOF);
        exit(1);
    }

    if (kernel_filename) {
        uint64_t lowaddr = 0;

        /* Try big-endian first, then retry as little-endian */
        kernel_size = load_elf(kernel_filename, translate_kernel_address, NULL,
                               NULL, &lowaddr, NULL, 1, ELF_MACHINE, 0);
        if (kernel_size == ELF_LOAD_WRONG_ENDIAN) {
            kernel_size = load_elf(kernel_filename,
                                   translate_kernel_address, NULL,
                                   NULL, &lowaddr, NULL, 0, ELF_MACHINE, 0);
            kernel_le = kernel_size > 0;
        }
        if (kernel_size < 0) {
            fprintf(stderr, "qemu: error loading %s: %s\n",
                    kernel_filename, load_elf_strerror(kernel_size));
            exit(1);
        }

        /* load initrd */
        if (initrd_filename) {
            /* Try to locate the initrd in the gap between the kernel
             * and the firmware. Add a bit of space just in case
             */
            initrd_base = (KERNEL_LOAD_ADDR + kernel_size + 0x1ffff) & ~0xffff;
            initrd_size = load_image_targphys(initrd_filename, initrd_base,
                                              load_limit - initrd_base);
            if (initrd_size < 0) {
                fprintf(stderr, "qemu: could not load initial ram disk '%s'\n",
                        initrd_filename);
                exit(1);
            }
        } else {
            initrd_base = 0;
            initrd_size = 0;
        }
    }

    /* Load the firmware image at guest address 0 */
    if (bios_name == NULL) {
        bios_name = FW_FILE_NAME;
    }
    filename = qemu_find_file(QEMU_FILE_TYPE_BIOS, bios_name);
    if (filename == NULL) {
        fprintf(stderr, "qemu: could not find LPAR firmware '%s'\n",
                bios_name);
        exit(1);
    }
    fw_size = load_image_targphys(filename, 0, FW_MAX_SIZE);
    if (fw_size < 0) {
        hw_error("qemu: could not load LPAR firmware '%s'\n", filename);
        exit(1);
    }
    g_free(filename);

    spapr->entry_point = 0x100;

    vmstate_register(NULL, 0, &vmstate_spapr, spapr);
    register_savevm_live(NULL, "spapr/htab", -1, 1,
                         &savevm_htab_handlers, spapr);

    /* Prepare the device tree */
    spapr->fdt_skel = spapr_create_fdt_skel(initrd_base, initrd_size,
                                            kernel_size, kernel_le,
                                            boot_device, kernel_cmdline,
                                            spapr->epow_irq);
    assert(spapr->fdt_skel != NULL);
}

1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561
/*
 * Translate the "kvm-type" machine option into a numeric KVM VM type:
 * unset -> 0, "HV" -> 1, "PR" -> 2.  Any other string is a fatal error.
 */
static int spapr_kvm_type(const char *vm_type)
{
    if (vm_type == NULL) {
        return 0;
    }

    if (strcmp(vm_type, "HV") == 0) {
        return 1;
    } else if (strcmp(vm_type, "PR") == 0) {
        return 2;
    }

    error_report("Unknown kvm-type specified '%s'", vm_type);
    exit(1);
}

1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618
/*
 * Implementation of the interface that adjusts the firmware path
 * used for bootindex property handling.
 *
 * Returns a newly allocated path component for SCSI devices behind a
 * spapr-vscsi, virtio-scsi or USB parent and for sPAPR PCI host bridges,
 * or NULL when the device is not one of those.
 */
static char *spapr_get_fw_dev_path(FWPathProvider *p, BusState *bus,
                                   DeviceState *dev)
{
#define CAST(type, obj, name) \
    ((type *)object_dynamic_cast(OBJECT(obj), (name)))
    SCSIDevice *scsi = CAST(SCSIDevice,  dev, TYPE_SCSI_DEVICE);
    sPAPRPHBState *bridge = CAST(sPAPRPHBState, dev,
                                 TYPE_SPAPR_PCI_HOST_BRIDGE);

    if (scsi != NULL) {
        void *vscsi_parent = CAST(void, bus->parent, "spapr-vscsi");
        VirtIOSCSI *virtio_parent = CAST(VirtIOSCSI, bus->parent,
                                         TYPE_VIRTIO_SCSI);
        USBDevice *usb_parent = CAST(USBDevice, bus->parent, TYPE_USB_DEVICE);

        if (vscsi_parent != NULL) {
            /*
             * Replace "channel@0/disk@0,0" with "disk@8000000000000000":
             * We use SRP luns of the form 8000 | (bus << 8) | (id << 5) | lun
             * in the top 16 bits of the 64-bit LUN
             */
            unsigned srp = 0x8000 | (scsi->id << 8) | scsi->lun;

            return g_strdup_printf("%s@%"PRIX64, qdev_fw_name(dev),
                                   (uint64_t)srp << 48);
        }

        if (virtio_parent != NULL) {
            /*
             * We use SRP luns of the form 01000000 | (target << 8) | lun
             * in the top 32 bits of the 64-bit LUN
             * Note: the quote above is from SLOF and it is wrong,
             * the actual binding is:
             * swap 0100 or 10 << or 20 << ( target lun-id -- srplun )
             */
            unsigned srp = 0x1000000 | (scsi->id << 16) | scsi->lun;

            return g_strdup_printf("%s@%"PRIX64, qdev_fw_name(dev),
                                   (uint64_t)srp << 32);
        }

        if (usb_parent != NULL) {
            /*
             * We use SRP luns of the form 01000000 | (usb-port << 16) | lun
             * in the top 32 bits of the 64-bit LUN
             */
            unsigned port_no = atoi(usb_parent->port->path);
            unsigned srp = 0x1000000 | (port_no << 16) | scsi->lun;

            return g_strdup_printf("%s@%"PRIX64, qdev_fw_name(dev),
                                   (uint64_t)srp << 32);
        }
    }

    if (bridge != NULL) {
        /* Replace "pci" with "pci@800000020000000" */
        return g_strdup_printf("pci@%"PRIX64, bridge->buid);
    }

    return NULL;
}

E
Eduardo Habkost 已提交
1619 1620
/* "kvm-type" property getter: returns a copy of the stored string */
static char *spapr_get_kvm_type(Object *obj, Error **errp)
{
    sPAPRMachineState *machine_state = SPAPR_MACHINE(obj);
    char *copy = g_strdup(machine_state->kvm_type);

    return copy;
}

/* "kvm-type" property setter: replaces the stored string with a copy */
static void spapr_set_kvm_type(Object *obj, const char *value, Error **errp)
{
    sPAPRMachineState *machine_state = SPAPR_MACHINE(obj);

    /* Drop the previous value before storing the new copy */
    g_free(machine_state->kvm_type);
    machine_state->kvm_type = g_strdup(value);
}

/* Instance init: expose the "kvm-type" string property on the machine */
static void spapr_machine_initfn(Object *obj)
{
    object_property_add_str(obj, "kvm-type",
                            spapr_get_kvm_type, spapr_set_kvm_type, NULL);
}

1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656
/*
 * Per-CPU NMI work item; arg is the CPUState it was queued for.
 * Synchronizes the CPU state and then performs a system reset on it.
 */
static void ppc_cpu_do_nmi_on_cpu(void *arg)
{
    CPUState *cs = arg;

    cpu_synchronize_state(cs);
    ppc_cpu_do_system_reset(cs);
}

/*
 * NMI monitor handler: queues ppc_cpu_do_nmi_on_cpu() on every CPU.
 * The cpu_index and errp arguments are not used here.
 */
static void spapr_nmi(NMIState *n, int cpu_index, Error **errp)
{
    CPUState *cs;

    CPU_FOREACH(cs) {
        async_run_on_cpu(cs, ppc_cpu_do_nmi_on_cpu, cs);
    }
}

1657 1658 1659
/* Class init for the default "pseries" machine type */
static void spapr_machine_class_init(ObjectClass *oc, void *data)
{
    MachineClass *machine_class = MACHINE_CLASS(oc);
    FWPathProviderClass *fw_class = FW_PATH_PROVIDER_CLASS(oc);
    NMIClass *nmi_class = NMI_CLASS(oc);

    /* Identification */
    machine_class->name = "pseries";
    machine_class->desc = "pSeries Logical Partition (PAPR compliant)";
    machine_class->is_default = 1;

    /* Lifecycle hooks */
    machine_class->init = ppc_spapr_init;
    machine_class->reset = ppc_spapr_reset;
    machine_class->kvm_type = spapr_kvm_type;

    /* Defaults and limits */
    machine_class->block_default_type = IF_SCSI;
    machine_class->max_cpus = MAX_CPUS;
    machine_class->no_parallel = 1;
    machine_class->default_boot_order = NULL;

    /* Interface callbacks */
    fw_class->get_dev_path = spapr_get_fw_dev_path;
    nmi_class->nmi_monitor_handler = spapr_nmi;
}

static const TypeInfo spapr_machine_info = {
    .name          = TYPE_SPAPR_MACHINE,
    .parent        = TYPE_MACHINE,
1681
    .instance_size = sizeof(sPAPRMachineState),
E
Eduardo Habkost 已提交
1682
    .instance_init = spapr_machine_initfn,
1683
    .class_init    = spapr_machine_class_init,
1684 1685
    .interfaces = (InterfaceInfo[]) {
        { TYPE_FW_PATH_PROVIDER },
1686
        { TYPE_NMI },
1687 1688
        { }
    },
1689 1690
};

1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705
/*
 * Class init for the "pseries-2.1" machine: overrides only the name,
 * the description and the default flag.
 */
static void spapr_machine_2_1_class_init(ObjectClass *oc, void *data)
{
    MachineClass *mc = MACHINE_CLASS(oc);

    mc->name = "pseries-2.1";
    mc->desc = "pSeries Logical Partition (PAPR compliant) v2.1";
    mc->is_default = 0;
}

/* QOM type of the "pseries-2.1" machine, a child of TYPE_SPAPR_MACHINE */
static const TypeInfo spapr_machine_2_1_info = {
    .name          = TYPE_SPAPR_MACHINE "2.1",
    .parent        = TYPE_SPAPR_MACHINE,
    .class_init    = spapr_machine_2_1_class_init,
};

1706
static void spapr_machine_register_types(void)
1707
{
1708
    type_register_static(&spapr_machine_info);
1709
    type_register_static(&spapr_machine_2_1_info);
1710 1711
}

1712
type_init(spapr_machine_register_types)