spapr.c 51.9 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26
/*
 * QEMU PowerPC pSeries Logical Partition (aka sPAPR) hardware System Emulator
 *
 * Copyright (c) 2004-2007 Fabrice Bellard
 * Copyright (c) 2007 Jocelyn Mayer
 * Copyright (c) 2010 David Gibson, IBM Corporation.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 *
 */
27
#include "sysemu/sysemu.h"
28
#include "hw/hw.h"
29
#include "hw/fw-path-provider.h"
30
#include "elf.h"
P
Paolo Bonzini 已提交
31
#include "net/net.h"
32 33 34
#include "sysemu/blockdev.h"
#include "sysemu/cpus.h"
#include "sysemu/kvm.h"
35
#include "kvm_ppc.h"
36
#include "mmu-hash64.h"
37
#include "qom/cpu.h"
38 39

#include "hw/boards.h"
P
Paolo Bonzini 已提交
40
#include "hw/ppc/ppc.h"
41 42
#include "hw/loader.h"

P
Paolo Bonzini 已提交
43 44 45 46
#include "hw/ppc/spapr.h"
#include "hw/ppc/spapr_vio.h"
#include "hw/pci-host/spapr.h"
#include "hw/ppc/xics.h"
47
#include "hw/pci/msi.h"
48

49
#include "hw/pci/pci.h"
50 51
#include "hw/scsi/scsi.h"
#include "hw/virtio/virtio-scsi.h"
52

53
#include "exec/address-spaces.h"
54
#include "hw/usb.h"
55
#include "qemu/config-file.h"
56
#include "qemu/error-report.h"
57
#include "trace.h"
58
#include "hw/nmi.h"
A
Avi Kivity 已提交
59

60 61
#include <libfdt.h>

62 63 64 65 66 67 68 69 70 71
/*
 * SLOF memory layout:
 *
 * The SLOF raw image is loaded at 0.  It copies its romfs right below the
 * flat device-tree, then positions SLOF itself 31M below that.
 *
 * So we set FW_OVERHEAD to 40MB, which should account for all of that
 * and more.
 *
 * We load our kernel at 4M, leaving space for the SLOF initial image.
 */
#define FDT_MAX_SIZE            0x40000     /* size of the FDT buffers */
#define RTAS_MAX_SIZE           0x10000
#define RTAS_MAX_ADDR           0x80000000 /* RTAS must stay below that */
#define FW_MAX_SIZE             0x400000
#define FW_FILE_NAME            "slof.bin"
#define FW_OVERHEAD             0x2800000   /* 40MB, see above */
#define KERNEL_LOAD_ADDR        FW_MAX_SIZE /* 4M, see above */

#define MIN_RMA_SLOF            128UL

#define TIMEBASE_FREQ           512000000ULL

#define MAX_CPUS                255

#define PHANDLE_XICP            0x00001111

/* Size in bytes of the hash page table: htab_shift is its log2 */
#define HTAB_SIZE(spapr)        (1ULL << ((spapr)->htab_shift))

90
typedef struct sPAPRMachineState sPAPRMachineState;
91

92
#define TYPE_SPAPR_MACHINE      "spapr-machine"
93
#define SPAPR_MACHINE(obj) \
94
    OBJECT_CHECK(sPAPRMachineState, (obj), TYPE_SPAPR_MACHINE)
95 96

/**
97
 * sPAPRMachineState:
98
 */
99
struct sPAPRMachineState {
100 101
    /*< private >*/
    MachineState parent_obj;
E
Eduardo Habkost 已提交
102 103 104

    /*< public >*/
    char *kvm_type;
105 106
};

107 108
/*
 * File-wide sPAPR environment.  Used directly by the reset and CAS code in
 * this file; several helpers take a parameter of the same name that shadows
 * this global.
 */
sPAPREnvironment *spapr;

109 110 111 112 113 114 115 116 117 118 119 120
/*
 * Create and initialise an XICS device of QOM type @type, configured with
 * @nr_servers interrupt servers and @nr_irqs interrupt sources.
 *
 * Returns the device as an XICSState, or NULL if qdev_init() fails.
 */
static XICSState *try_create_xics(const char *type, int nr_servers,
                                  int nr_irqs)
{
    DeviceState *xics_dev = qdev_create(NULL, type);

    qdev_prop_set_uint32(xics_dev, "nr_servers", nr_servers);
    qdev_prop_set_uint32(xics_dev, "nr_irqs", nr_irqs);

    return qdev_init(xics_dev) < 0 ? NULL : XICS_COMMON(xics_dev);
}

static XICSState *xics_system_init(int nr_servers, int nr_irqs)
{
    XICSState *icp = NULL;

128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147
    if (kvm_enabled()) {
        QemuOpts *machine_opts = qemu_get_machine_opts();
        bool irqchip_allowed = qemu_opt_get_bool(machine_opts,
                                                "kernel_irqchip", true);
        bool irqchip_required = qemu_opt_get_bool(machine_opts,
                                                  "kernel_irqchip", false);
        if (irqchip_allowed) {
            icp = try_create_xics(TYPE_KVM_XICS, nr_servers, nr_irqs);
        }

        if (irqchip_required && !icp) {
            perror("Failed to create in-kernel XICS\n");
            abort();
        }
    }

    if (!icp) {
        icp = try_create_xics(TYPE_XICS, nr_servers, nr_irqs);
    }

148 149 150 151 152 153 154 155
    if (!icp) {
        perror("Failed to create XICS\n");
        abort();
    }

    return icp;
}

156 157 158 159 160 161 162 163
/*
 * Fill in the per-core SMT related properties of the CPU node at @offset.
 *
 * Sets "cpu-version" when cpu->cpu_version is non-zero, then the
 * "ibm,ppc-interrupt-server#s" and "ibm,ppc-interrupt-gserver#s" properties
 * covering @smt_threads threads starting at the CPU's device tree id.
 *
 * Returns the result of the last libfdt call, or the negative value of the
 * first libfdt call that failed.
 */
static int spapr_fixup_cpu_smt_dt(void *fdt, int offset, PowerPCCPU *cpu,
                                  int smt_threads)
{
    uint32_t servers_prop[smt_threads];
    uint32_t gservers_prop[smt_threads * 2];
    int first_id = ppc_get_vcpu_dt_id(cpu);
    int thread;
    int rc;

    if (cpu->cpu_version) {
        rc = fdt_setprop_cell(fdt, offset, "cpu-version", cpu->cpu_version);
        if (rc < 0) {
            return rc;
        }
    }

    /* Build interrupt servers and gservers properties */
    for (thread = 0; thread < smt_threads; thread++) {
        uint32_t server = cpu_to_be32(first_id + thread);

        servers_prop[thread] = server;
        /* Hack, direct the group queues back to cpu 0 */
        gservers_prop[2 * thread] = server;
        gservers_prop[2 * thread + 1] = 0;
    }

    rc = fdt_setprop(fdt, offset, "ibm,ppc-interrupt-server#s",
                     servers_prop, sizeof(servers_prop));
    if (rc < 0) {
        return rc;
    }

    return fdt_setprop(fdt, offset, "ibm,ppc-interrupt-gserver#s",
                       gservers_prop, sizeof(gservers_prop));
}

189
/*
 * Add or update the per-core properties under /cpus in @fdt.
 *
 * For every CPU whose device tree id is a multiple of the SMT thread count,
 * the "<fw_name>@<id>" node is looked up (and created, together with /cpus,
 * when missing) and given "ibm,associativity" (only with more than one NUMA
 * node), "ibm,pft-size" and the SMT properties.
 *
 * Returns the last libfdt result, or the negative value of the first call
 * that failed.
 */
static int spapr_fixup_cpu_dt(void *fdt, sPAPREnvironment *spapr)
{
    int ret = 0;
    CPUState *cs;
    char cpu_model[32];
    int smt = kvmppc_smt_threads();
    uint32_t pft_size_prop[] = {0, cpu_to_be32(spapr->htab_shift)};

    CPU_FOREACH(cs) {
        PowerPCCPU *cpu = POWERPC_CPU(cs);
        DeviceClass *dc = DEVICE_GET_CLASS(cs);
        int index = ppc_get_vcpu_dt_id(cpu);
        uint32_t associativity[] = {cpu_to_be32(0x5),
                                    cpu_to_be32(0x0),
                                    cpu_to_be32(0x0),
                                    cpu_to_be32(0x0),
                                    cpu_to_be32(cs->numa_node),
                                    cpu_to_be32(index)};
        int cpus_node;
        int cpu_node;

        /* Only the first thread of each core gets a node */
        if ((index % smt) != 0) {
            continue;
        }

        snprintf(cpu_model, sizeof(cpu_model), "%s@%x", dc->fw_name, index);

        /* Looked up on every iteration: adding nodes moves offsets around */
        cpus_node = fdt_path_offset(fdt, "/cpus");
        if (cpus_node < 0) {
            cpus_node = fdt_add_subnode(fdt, fdt_path_offset(fdt, "/"),
                                        "cpus");
            if (cpus_node < 0) {
                return cpus_node;
            }
        }

        cpu_node = fdt_subnode_offset(fdt, cpus_node, cpu_model);
        if (cpu_node < 0) {
            cpu_node = fdt_add_subnode(fdt, cpus_node, cpu_model);
            if (cpu_node < 0) {
                return cpu_node;
            }
        }

        if (nb_numa_nodes > 1) {
            ret = fdt_setprop(fdt, cpu_node, "ibm,associativity",
                              associativity, sizeof(associativity));
            if (ret < 0) {
                return ret;
            }
        }

        ret = fdt_setprop(fdt, cpu_node, "ibm,pft-size",
                          pft_size_prop, sizeof(pft_size_prop));
        if (ret < 0) {
            return ret;
        }

        ret = spapr_fixup_cpu_smt_dt(fdt, cpu_node, cpu,
                                     ppc_get_compat_smt_threads(cpu));
        if (ret < 0) {
            return ret;
        }
    }

    return ret;
}

253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286

/*
 * Encode the segment/page size table of @env into @prop as big-endian cells.
 *
 * Each segment size contributes three cells (page shift, SLB encoding,
 * number of page encodings) followed by two cells per page encoding.
 * Encoding stops at the first empty entry or as soon as the next record
 * would not leave room in the buffer.
 *
 * @prop:    output buffer
 * @maxsize: size of @prop in bytes
 *
 * Returns the number of bytes written to @prop.
 */
static size_t create_page_sizes_prop(CPUPPCState *env, uint32_t *prop,
                                     size_t maxsize)
{
    size_t maxcells = maxsize / sizeof(uint32_t);
    int i, j, count;
    uint32_t *p = prop;

    for (i = 0; i < PPC_PAGE_SIZES_MAX_SZ; i++) {
        struct ppc_one_seg_page_size *sps = &env->sps.sps[i];
        size_t used, needed;

        if (!sps->page_shift) {
            break;
        }
        for (count = 0; count < PPC_PAGE_SIZES_MAX_SZ; count++) {
            if (sps->enc[count].page_shift == 0) {
                break;
            }
        }

        used = p - prop;
        needed = 3 + (size_t)count * 2;
        /*
         * Check "needed > maxcells" first: the subtraction below is
         * unsigned and would otherwise wrap for a tiny buffer, letting
         * the writes run past the end of @prop.
         */
        if (needed > maxcells || used >= maxcells - needed) {
            break;
        }

        *(p++) = cpu_to_be32(sps->page_shift);
        *(p++) = cpu_to_be32(sps->slb_enc);
        *(p++) = cpu_to_be32(count);
        for (j = 0; j < count; j++) {
            *(p++) = cpu_to_be32(sps->enc[j].page_shift);
            *(p++) = cpu_to_be32(sps->enc[j].pte_enc);
        }
    }

    return (p - prop) * sizeof(uint32_t);
}

287 288 289 290 291 292 293 294 295 296 297 298 299
/*
 * Return the memory size to assume for the first populated NUMA node:
 * the node's memory rounded down to a power of two and capped at ram_size.
 * When no NUMA node has memory (or NUMA is not configured), ram_size is
 * returned.
 */
static hwaddr spapr_node0_size(void)
{
    int node;

    for (node = 0; node < nb_numa_nodes; ++node) {
        if (numa_info[node].node_mem) {
            return MIN(pow2floor(numa_info[node].node_mem), ram_size);
        }
    }

    return ram_size;
}

300 301 302 303 304 305 306 307 308 309
/*
 * Evaluate the libfdt expression @exp once; if it yields a negative value,
 * print the expression text with fdt_strerror() and exit(1).
 *
 * The temporary is deliberately not called "ret" so that it cannot shadow
 * a caller's "ret" that appears inside @exp.
 */
#define _FDT(exp) \
    do { \
        int fdt_ret_ = (exp);                                      \
        if (fdt_ret_ < 0) {                                        \
            fprintf(stderr, "qemu: error creating device tree: %s: %s\n", \
                    #exp, fdt_strerror(fdt_ret_));                 \
            exit(1);                                               \
        }                                                          \
    } while (0)

310 311 312 313
/*
 * Append @s1 to @s including its terminating NUL byte, so that @s
 * accumulates a list of NUL-separated strings.
 */
static void add_str(GString *s, const gchar *s1)
{
    size_t len_with_nul = strlen(s1) + 1;

    g_string_append_len(s, s1, len_with_nul);
}
314

315
/*
 * Build the skeleton flattened device tree for the machine.
 *
 * @initrd_base:    start of the initrd; reserved and advertised in /chosen
 * @initrd_size:    size of the initrd (0 means no reservation is added)
 * @kernel_size:    size of the kernel loaded at KERNEL_LOAD_ADDR (0 if none)
 * @little_endian:  adds "qemu,boot-kernel-le" when a kernel is present
 * @boot_device:    optional value for "qemu,boot-device"
 * @kernel_cmdline: value for "bootargs"
 * @epow_irq:       passed through to spapr_events_fdt_skel()
 *
 * Creates the root, /chosen, /cpus, /rtas, /interrupt-controller, /vdevice,
 * event source and (under KVM) /hypervisor nodes.  Any libfdt failure
 * terminates QEMU through _FDT().
 *
 * Returns a g_malloc0()ed buffer of FDT_MAX_SIZE bytes holding the tree.
 */
static void *spapr_create_fdt_skel(hwaddr initrd_base,
                                   hwaddr initrd_size,
                                   hwaddr kernel_size,
                                   bool little_endian,
                                   const char *boot_device,
                                   const char *kernel_cmdline,
                                   uint32_t epow_irq)
{
    void *fdt;
    CPUState *cs;
    uint32_t start_prop = cpu_to_be32(initrd_base);
    uint32_t end_prop = cpu_to_be32(initrd_base + initrd_size);
    GString *hypertas = g_string_sized_new(256);
    GString *qemu_hypertas = g_string_sized_new(256);
    uint32_t refpoints[] = {cpu_to_be32(0x4), cpu_to_be32(0x4)};
    uint32_t interrupt_server_ranges_prop[] = {0, cpu_to_be32(smp_cpus)};
    int smt = kvmppc_smt_threads();
    unsigned char vec5[] = {0x0, 0x0, 0x0, 0x0, 0x0, 0x80};
    QemuOpts *opts = qemu_opts_find(qemu_find_opts("smp-opts"), NULL);
    unsigned sockets = opts ? qemu_opt_get_number(opts, "sockets", 0) : 0;
    uint32_t cpus_per_socket = sockets ? (smp_cpus / sockets) : 1;
    char *buf;

    /*
     * With more sockets than CPUs the division above yields 0, which would
     * make the "ibm,chip-id" computation below divide by zero.  Treat each
     * CPU as its own socket in that case.
     */
    if (cpus_per_socket == 0) {
        cpus_per_socket = 1;
    }

    add_str(hypertas, "hcall-pft");
    add_str(hypertas, "hcall-term");
    add_str(hypertas, "hcall-dabr");
    add_str(hypertas, "hcall-interrupt");
    add_str(hypertas, "hcall-tce");
    add_str(hypertas, "hcall-vio");
    add_str(hypertas, "hcall-splpar");
    add_str(hypertas, "hcall-bulk");
    add_str(hypertas, "hcall-set-mode");
    add_str(qemu_hypertas, "hcall-memop1");

    fdt = g_malloc0(FDT_MAX_SIZE);
    _FDT((fdt_create(fdt, FDT_MAX_SIZE)));

    if (kernel_size) {
        _FDT((fdt_add_reservemap_entry(fdt, KERNEL_LOAD_ADDR, kernel_size)));
    }
    if (initrd_size) {
        _FDT((fdt_add_reservemap_entry(fdt, initrd_base, initrd_size)));
    }
    _FDT((fdt_finish_reservemap(fdt)));

    /* Root node */
    _FDT((fdt_begin_node(fdt, "")));
    _FDT((fdt_property_string(fdt, "device_type", "chrp")));
    _FDT((fdt_property_string(fdt, "model", "IBM pSeries (emulated by qemu)")));
    _FDT((fdt_property_string(fdt, "compatible", "qemu,pseries")));

    /*
     * Add info to guest to identify which host it is being run on
     * and what the uuid of the guest is
     */
    if (kvmppc_get_host_model(&buf)) {
        _FDT((fdt_property_string(fdt, "host-model", buf)));
        g_free(buf);
    }
    if (kvmppc_get_host_serial(&buf)) {
        _FDT((fdt_property_string(fdt, "host-serial", buf)));
        g_free(buf);
    }

    buf = g_strdup_printf(UUID_FMT, qemu_uuid[0], qemu_uuid[1],
                          qemu_uuid[2], qemu_uuid[3], qemu_uuid[4],
                          qemu_uuid[5], qemu_uuid[6], qemu_uuid[7],
                          qemu_uuid[8], qemu_uuid[9], qemu_uuid[10],
                          qemu_uuid[11], qemu_uuid[12], qemu_uuid[13],
                          qemu_uuid[14], qemu_uuid[15]);

    _FDT((fdt_property_string(fdt, "vm,uuid", buf)));
    g_free(buf);

    _FDT((fdt_property_cell(fdt, "#address-cells", 0x2)));
    _FDT((fdt_property_cell(fdt, "#size-cells", 0x2)));

    /* /chosen */
    _FDT((fdt_begin_node(fdt, "chosen")));

    /* Set Form1_affinity */
    _FDT((fdt_property(fdt, "ibm,architecture-vec-5", vec5, sizeof(vec5))));

    _FDT((fdt_property_string(fdt, "bootargs", kernel_cmdline)));
    _FDT((fdt_property(fdt, "linux,initrd-start",
                       &start_prop, sizeof(start_prop))));
    _FDT((fdt_property(fdt, "linux,initrd-end",
                       &end_prop, sizeof(end_prop))));
    if (kernel_size) {
        uint64_t kprop[2] = { cpu_to_be64(KERNEL_LOAD_ADDR),
                              cpu_to_be64(kernel_size) };

        _FDT((fdt_property(fdt, "qemu,boot-kernel", &kprop, sizeof(kprop))));
        if (little_endian) {
            _FDT((fdt_property(fdt, "qemu,boot-kernel-le", NULL, 0)));
        }
    }
    if (boot_device) {
        _FDT((fdt_property_string(fdt, "qemu,boot-device", boot_device)));
    }
    if (boot_menu) {
        _FDT((fdt_property_cell(fdt, "qemu,boot-menu", boot_menu)));
    }
    _FDT((fdt_property_cell(fdt, "qemu,graphic-width", graphic_width)));
    _FDT((fdt_property_cell(fdt, "qemu,graphic-height", graphic_height)));
    _FDT((fdt_property_cell(fdt, "qemu,graphic-depth", graphic_depth)));

    _FDT((fdt_end_node(fdt)));

    /* cpus */
    _FDT((fdt_begin_node(fdt, "cpus")));

    _FDT((fdt_property_cell(fdt, "#address-cells", 0x1)));
    _FDT((fdt_property_cell(fdt, "#size-cells", 0x0)));

    CPU_FOREACH(cs) {
        PowerPCCPU *cpu = POWERPC_CPU(cs);
        CPUPPCState *env = &cpu->env;
        DeviceClass *dc = DEVICE_GET_CLASS(cs);
        PowerPCCPUClass *pcc = POWERPC_CPU_GET_CLASS(cs);
        int index = ppc_get_vcpu_dt_id(cpu);
        char *nodename;
        uint32_t segs[] = {cpu_to_be32(28), cpu_to_be32(40),
                           0xffffffff, 0xffffffff};
        uint32_t tbfreq = kvm_enabled() ? kvmppc_get_tbfreq() : TIMEBASE_FREQ;
        uint32_t cpufreq = kvm_enabled() ? kvmppc_get_clockfreq() : 1000000000;
        uint32_t page_sizes_prop[64];
        size_t page_sizes_prop_size;

        /* Only the first thread of each core gets a node */
        if ((index % smt) != 0) {
            continue;
        }

        nodename = g_strdup_printf("%s@%x", dc->fw_name, index);

        _FDT((fdt_begin_node(fdt, nodename)));

        g_free(nodename);

        _FDT((fdt_property_cell(fdt, "reg", index)));
        _FDT((fdt_property_string(fdt, "device_type", "cpu")));

        _FDT((fdt_property_cell(fdt, "cpu-version", env->spr[SPR_PVR])));
        _FDT((fdt_property_cell(fdt, "d-cache-block-size",
                                env->dcache_line_size)));
        _FDT((fdt_property_cell(fdt, "d-cache-line-size",
                                env->dcache_line_size)));
        _FDT((fdt_property_cell(fdt, "i-cache-block-size",
                                env->icache_line_size)));
        _FDT((fdt_property_cell(fdt, "i-cache-line-size",
                                env->icache_line_size)));

        if (pcc->l1_dcache_size) {
            _FDT((fdt_property_cell(fdt, "d-cache-size", pcc->l1_dcache_size)));
        } else {
            fprintf(stderr, "Warning: Unknown L1 dcache size for cpu\n");
        }
        if (pcc->l1_icache_size) {
            _FDT((fdt_property_cell(fdt, "i-cache-size", pcc->l1_icache_size)));
        } else {
            fprintf(stderr, "Warning: Unknown L1 icache size for cpu\n");
        }

        _FDT((fdt_property_cell(fdt, "timebase-frequency", tbfreq)));
        _FDT((fdt_property_cell(fdt, "clock-frequency", cpufreq)));
        _FDT((fdt_property_cell(fdt, "ibm,slb-size", env->slb_nr)));
        _FDT((fdt_property_string(fdt, "status", "okay")));
        _FDT((fdt_property(fdt, "64-bit", NULL, 0)));

        if (env->spr_cb[SPR_PURR].oea_read) {
            _FDT((fdt_property(fdt, "ibm,purr", NULL, 0)));
        }

        if (env->mmu_model & POWERPC_MMU_1TSEG) {
            _FDT((fdt_property(fdt, "ibm,processor-segment-sizes",
                               segs, sizeof(segs))));
        }

        /* Advertise VMX/VSX (vector extensions) if available
         *   0 / no property == no vector extensions
         *   1               == VMX / Altivec available
         *   2               == VSX available */
        if (env->insns_flags & PPC_ALTIVEC) {
            uint32_t vmx = (env->insns_flags2 & PPC2_VSX) ? 2 : 1;

            _FDT((fdt_property_cell(fdt, "ibm,vmx", vmx)));
        }

        /* Advertise DFP (Decimal Floating Point) if available
         *   0 / no property == no DFP
         *   1               == DFP available */
        if (env->insns_flags2 & PPC2_DFP) {
            _FDT((fdt_property_cell(fdt, "ibm,dfp", 1)));
        }

        page_sizes_prop_size = create_page_sizes_prop(env, page_sizes_prop,
                                                      sizeof(page_sizes_prop));
        if (page_sizes_prop_size) {
            _FDT((fdt_property(fdt, "ibm,segment-page-sizes",
                               page_sizes_prop, page_sizes_prop_size)));
        }

        _FDT((fdt_property_cell(fdt, "ibm,chip-id",
                                cs->cpu_index / cpus_per_socket)));

        _FDT((fdt_end_node(fdt)));
    }

    _FDT((fdt_end_node(fdt)));

    /* RTAS */
    _FDT((fdt_begin_node(fdt, "rtas")));

    if (!kvm_enabled() || kvmppc_spapr_use_multitce()) {
        add_str(hypertas, "hcall-multi-tce");
    }
    _FDT((fdt_property(fdt, "ibm,hypertas-functions", hypertas->str,
                       hypertas->len)));
    g_string_free(hypertas, TRUE);
    _FDT((fdt_property(fdt, "qemu,hypertas-functions", qemu_hypertas->str,
                       qemu_hypertas->len)));
    g_string_free(qemu_hypertas, TRUE);

    _FDT((fdt_property(fdt, "ibm,associativity-reference-points",
        refpoints, sizeof(refpoints))));

    _FDT((fdt_property_cell(fdt, "rtas-error-log-max", RTAS_ERROR_LOG_MAX)));

    /*
     * According to PAPR, rtas ibm,os-term does not guarantee a return
     * back to the guest cpu.
     *
     * While an additional ibm,extended-os-term property indicates that
     * rtas call return will always occur. Set this property.
     */
    _FDT((fdt_property(fdt, "ibm,extended-os-term", NULL, 0)));

    _FDT((fdt_end_node(fdt)));

    /* interrupt controller */
    _FDT((fdt_begin_node(fdt, "interrupt-controller")));

    _FDT((fdt_property_string(fdt, "device_type",
                              "PowerPC-External-Interrupt-Presentation")));
    _FDT((fdt_property_string(fdt, "compatible", "IBM,ppc-xicp")));
    _FDT((fdt_property(fdt, "interrupt-controller", NULL, 0)));
    _FDT((fdt_property(fdt, "ibm,interrupt-server-ranges",
                       interrupt_server_ranges_prop,
                       sizeof(interrupt_server_ranges_prop))));
    _FDT((fdt_property_cell(fdt, "#interrupt-cells", 2)));
    _FDT((fdt_property_cell(fdt, "linux,phandle", PHANDLE_XICP)));
    _FDT((fdt_property_cell(fdt, "phandle", PHANDLE_XICP)));

    _FDT((fdt_end_node(fdt)));

    /* vdevice */
    _FDT((fdt_begin_node(fdt, "vdevice")));

    _FDT((fdt_property_string(fdt, "device_type", "vdevice")));
    _FDT((fdt_property_string(fdt, "compatible", "IBM,vdevice")));
    _FDT((fdt_property_cell(fdt, "#address-cells", 0x1)));
    _FDT((fdt_property_cell(fdt, "#size-cells", 0x0)));
    _FDT((fdt_property_cell(fdt, "#interrupt-cells", 0x2)));
    _FDT((fdt_property(fdt, "interrupt-controller", NULL, 0)));

    _FDT((fdt_end_node(fdt)));

    /* event-sources */
    spapr_events_fdt_skel(fdt, epow_irq);

    /* /hypervisor node */
    if (kvm_enabled()) {
        uint8_t hypercall[16];

        /* indicate KVM hypercall interface */
        _FDT((fdt_begin_node(fdt, "hypervisor")));
        _FDT((fdt_property_string(fdt, "compatible", "linux,kvm")));
        if (kvmppc_has_cap_fixup_hcalls()) {
            /*
             * Older KVM versions with older guest kernels were broken with the
             * magic page, don't allow the guest to map it.
             */
            kvmppc_get_hypercall(first_cpu->env_ptr, hypercall,
                                 sizeof(hypercall));
            _FDT((fdt_property(fdt, "hcall-instructions", hypercall,
                              sizeof(hypercall))));
        }
        _FDT((fdt_end_node(fdt)));
    }

    _FDT((fdt_end_node(fdt))); /* close root node */
    _FDT((fdt_finish(fdt)));

    return fdt;
}

611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627
/*
 * Compose the device tree update returned to the guest by client
 * architecture support negotiation and write it to guest memory.
 *
 * @addr: guest physical address of the response buffer
 * @size: size of that buffer in bytes
 *
 * The response is an sPAPRDeviceTreeUpdateHeader followed by a packed FDT
 * that contains the CPU fixups from spapr_fixup_cpu_dt().
 *
 * Returns 0 on success, -1 if the buffer is too small for the response.
 * libfdt failures terminate QEMU through _FDT().
 */
int spapr_h_cas_compose_response(target_ulong addr, target_ulong size)
{
    void *fdt, *fdt_skel;
    sPAPRDeviceTreeUpdateHeader hdr = { .version_id = 1 };

    /* Guard the unsigned subtraction below against a tiny guest buffer */
    if (size < sizeof(hdr)) {
        trace_spapr_cas_failed(size);
        return -1;
    }
    size -= sizeof(hdr);

    /* Create skeleton */
    fdt_skel = g_malloc0(size);
    _FDT((fdt_create(fdt_skel, size)));
    _FDT((fdt_begin_node(fdt_skel, "")));
    _FDT((fdt_end_node(fdt_skel)));
    _FDT((fdt_finish(fdt_skel)));
    fdt = g_malloc0(size);
    _FDT((fdt_open_into(fdt_skel, fdt, size)));
    g_free(fdt_skel);

    /* Fix skeleton up */
    _FDT((spapr_fixup_cpu_dt(fdt, spapr)));

    /* Pack resulting tree */
    _FDT((fdt_pack(fdt)));

    if (fdt_totalsize(fdt) + sizeof(hdr) > size) {
        trace_spapr_cas_failed(size);
        g_free(fdt); /* do not leak the tree on the failure path */
        return -1;
    }

    cpu_physical_memory_write(addr, &hdr, sizeof(hdr));
    cpu_physical_memory_write(addr + sizeof(hdr), fdt, fdt_totalsize(fdt));
    trace_spapr_cas_continue(fdt_totalsize(fdt) + sizeof(hdr));
    g_free(fdt);

    return 0;
}

647 648 649 650 651 652
/*
 * Add a "memory@<start>" node directly under the root of @fdt.
 *
 * @nodeid: NUMA node id recorded in the last cell of "ibm,associativity"
 * @start:  start address of the region, also used in the node name
 * @size:   size of the region
 *
 * Any libfdt failure terminates QEMU through _FDT().
 */
static void spapr_populate_memory_node(void *fdt, int nodeid, hwaddr start,
                                       hwaddr size)
{
    uint32_t associativity[] = {
        cpu_to_be32(0x4), /* length */
        cpu_to_be32(0x0), cpu_to_be32(0x0),
        cpu_to_be32(0x0), cpu_to_be32(nodeid)
    };
    char mem_name[32];
    uint64_t mem_reg_property[2];
    int off;

    mem_reg_property[0] = cpu_to_be64(start);
    mem_reg_property[1] = cpu_to_be64(size);

    /* Bounded formatting, as done for the CPU node names in this file */
    snprintf(mem_name, sizeof(mem_name), "memory@" TARGET_FMT_lx, start);
    off = fdt_add_subnode(fdt, 0, mem_name);
    _FDT(off);
    _FDT((fdt_setprop_string(fdt, off, "device_type", "memory")));
    _FDT((fdt_setprop(fdt, off, "reg", mem_reg_property,
                      sizeof(mem_reg_property))));
    _FDT((fdt_setprop(fdt, off, "ibm,associativity", associativity,
                      sizeof(associativity))));
}

672 673
/*
 * Create the memory nodes of @fdt.
 *
 * Walks the NUMA nodes (or a single pseudo node covering all RAM when NUMA
 * is not configured) and lays their memory out contiguously from address 0,
 * never going beyond ram_size.  The first region is the RMA; the remainder
 * of each node is split into power-of-two sized chunks that are naturally
 * aligned to their start address.
 *
 * Always returns 0.
 */
static int spapr_populate_memory(sPAPREnvironment *spapr, void *fdt)
{
    hwaddr mem_start = 0;
    int i, nb_nodes = nb_numa_nodes;
    NodeInfo *nodes = numa_info;
    NodeInfo ramnode;

    /* No NUMA nodes, assume there is just one node with whole RAM */
    if (!nb_numa_nodes) {
        nb_nodes = 1;
        ramnode.node_mem = ram_size;
        nodes = &ramnode;
    }

    for (i = 0; i < nb_nodes; ++i) {
        hwaddr node_size = 0;

        if (!nodes[i].node_mem) {
            continue;
        }

        /* Clamp the node to whatever RAM is left */
        if (mem_start < ram_size) {
            hwaddr remaining = ram_size - mem_start;

            node_size = nodes[i].node_mem;
            if (node_size > remaining) {
                node_size = remaining;
            }
        }

        if (!mem_start) {
            /* ppc_spapr_init() checks for rma_size <= node0_size already */
            spapr_populate_memory_node(fdt, i, 0, spapr->rma_size);
            mem_start += spapr->rma_size;
            node_size -= spapr->rma_size;
        }

        while (node_size) {
            hwaddr chunk = pow2floor(node_size);

            /* mem_start != 0 here */
            if (ctzl(mem_start) < ctzl(chunk)) {
                chunk = 1ULL << ctzl(mem_start);
            }

            spapr_populate_memory_node(fdt, i, mem_start, chunk);
            node_size -= chunk;
            mem_start += chunk;
        }
    }

    return 0;
}

721
/*
 * Complete the skeleton device tree and copy it into guest memory.
 *
 * @fdt_addr:  guest physical address the packed tree is written to
 * @rtas_addr: RTAS address passed to spapr_rtas_device_tree_setup()
 * @rtas_size: RTAS size passed to spapr_rtas_device_tree_setup()
 *
 * Opens spapr->fdt_skel into a temporary buffer, adds the memory, VIO, PCI,
 * RTAS and CPU information plus the boot list, packs the result and writes
 * it to @fdt_addr.  Failures to populate memory, VIO or PCI nodes terminate
 * QEMU; RTAS, CPU fixup and boot list failures are only reported.
 */
static void spapr_finalize_fdt(sPAPREnvironment *spapr,
                               hwaddr fdt_addr,
                               hwaddr rtas_addr,
                               hwaddr rtas_size)
{
    int ret;
    size_t i;
    size_t cb = 0;
    char *bootlist;
    void *fdt;
    sPAPRPHBState *phb;

    fdt = g_malloc(FDT_MAX_SIZE);

    /* open out the base tree into a temp buffer for the final tweaks */
    _FDT((fdt_open_into(spapr->fdt_skel, fdt, FDT_MAX_SIZE)));

    ret = spapr_populate_memory(spapr, fdt);
    if (ret < 0) {
        fprintf(stderr, "couldn't setup memory nodes in fdt\n");
        exit(1);
    }

    ret = spapr_populate_vdevice(spapr->vio_bus, fdt);
    if (ret < 0) {
        fprintf(stderr, "couldn't setup vio devices in fdt\n");
        exit(1);
    }

    QLIST_FOREACH(phb, &spapr->phbs, list) {
        ret = spapr_populate_pci_dt(phb, PHANDLE_XICP, fdt);
        /* Check every PHB, not just the last one visited */
        if (ret < 0) {
            fprintf(stderr, "couldn't setup PCI devices in fdt\n");
            exit(1);
        }
    }

    /* RTAS */
    ret = spapr_rtas_device_tree_setup(fdt, rtas_addr, rtas_size);
    if (ret < 0) {
        fprintf(stderr, "Couldn't set up RTAS device tree properties\n");
    }

    /* Advertise NUMA via ibm,associativity */
    ret = spapr_fixup_cpu_dt(fdt, spapr);
    if (ret < 0) {
        fprintf(stderr, "Couldn't finalize CPU device tree properties\n");
    }

    bootlist = get_boot_devices_list(&cb, true);
    if (cb && bootlist) {
        int offset = fdt_path_offset(fdt, "/chosen");
        if (offset < 0) {
            fprintf(stderr, "couldn't find /chosen in fdt\n");
            exit(1);
        }
        /* The property is a single space-separated string */
        for (i = 0; i < cb; i++) {
            if (bootlist[i] == '\n') {
                bootlist[i] = ' ';
            }
        }
        ret = fdt_setprop_string(fdt, offset, "qemu,boot-list", bootlist);
        if (ret < 0) {
            fprintf(stderr, "Couldn't set up boot list device tree property\n");
        }
    }

    if (!spapr->has_graphics) {
        spapr_populate_chosen_stdout(fdt, spapr->vio_bus);
    }

    _FDT((fdt_pack(fdt)));

    if (fdt_totalsize(fdt) > FDT_MAX_SIZE) {
        hw_error("FDT too big ! 0x%x bytes (max is 0x%x)\n",
                 fdt_totalsize(fdt), FDT_MAX_SIZE);
        exit(1);
    }

    cpu_physical_memory_write(fdt_addr, fdt, fdt_totalsize(fdt));

    g_free(bootlist);
    g_free(fdt);
}

/*
 * Map @addr to its load address: keep the low 28 bits and rebase them on
 * KERNEL_LOAD_ADDR.  @opaque is unused.
 */
static uint64_t translate_kernel_address(void *opaque, uint64_t addr)
{
    const uint64_t offset = addr & 0x0fffffff;

    return offset + KERNEL_LOAD_ADDR;
}

808
/*
 * Handle a hypercall made by @cpu.
 *
 * A call made with MSR[PR] set is rejected with H_PRIVILEGE; otherwise
 * it is dispatched to spapr_hypercall() with the opcode from r3 and the
 * arguments starting at r4.  The result is stored in r3 either way.
 */
static void emulate_spapr_hypercall(PowerPCCPU *cpu)
{
    /* Must be called "env": the msr_pr test below refers to it */
    CPUPPCState *env = &cpu->env;

    if (msr_pr) {
        hcall_dprintf("Hypercall made with MSR[PR]=1\n");
        env->gpr[3] = H_PRIVILEGE;
        return;
    }

    env->gpr[3] = spapr_hypercall(cpu, env->gpr[3], &env->gpr[4]);
}

820 821 822 823 824 825 826 827 828 829 830 831 832
/*
 * Reset the guest hash page table and, if required, recompute the RMA size.
 *
 * kvmppc_reset_htab() is asked first; a positive result means the kernel
 * manages the table and reports its shift.  Otherwise a table of
 * HTAB_SIZE(spapr) bytes is allocated on first use and cleared.
 */
static void spapr_reset_htab(sPAPREnvironment *spapr)
{
    long kvm_shift = kvmppc_reset_htab(spapr->htab_shift);

    if (kvm_shift > 0) {
        /* Kernel handles htab, we don't need to allocate one */
        spapr->htab_shift = kvm_shift;
        kvmppc_kern_htab = true;
    } else {
        /* Allocate an htab if we don't yet have one */
        if (!spapr->htab) {
            spapr->htab = qemu_memalign(HTAB_SIZE(spapr), HTAB_SIZE(spapr));
        }

        /* And clear it */
        memset(spapr->htab, 0, HTAB_SIZE(spapr));
    }

    /* Update the RMA size if necessary */
    if (spapr->vrma_adjust) {
        spapr->rma_size = kvmppc_rma_size(spapr_node0_size(),
                                          spapr->htab_shift);
    }
}

851
static void ppc_spapr_reset(void)
852
{
853
    PowerPCCPU *first_ppc_cpu;
854
    uint32_t rtas_limit;
855

856 857
    /* Reset the hash table & recalc the RMA */
    spapr_reset_htab(spapr);
858

859
    qemu_devices_reset();
860

861 862 863 864 865 866 867 868 869
    /*
     * We place the device tree and RTAS just below either the top of the RMA,
     * or just below 2GB, whichever is lowere, so that it can be
     * processed with 32-bit real mode code if necessary
     */
    rtas_limit = MIN(spapr->rma_size, RTAS_MAX_ADDR);
    spapr->rtas_addr = rtas_limit - RTAS_MAX_SIZE;
    spapr->fdt_addr = spapr->rtas_addr - FDT_MAX_SIZE;

870 871 872 873
    /* Load the fdt */
    spapr_finalize_fdt(spapr, spapr->fdt_addr, spapr->rtas_addr,
                       spapr->rtas_size);

874 875 876 877
    /* Copy RTAS over */
    cpu_physical_memory_write(spapr->rtas_addr, spapr->rtas_blob,
                              spapr->rtas_size);

878
    /* Set up the entry state */
879 880 881 882 883
    first_ppc_cpu = POWERPC_CPU(first_cpu);
    first_ppc_cpu->env.gpr[3] = spapr->fdt_addr;
    first_ppc_cpu->env.gpr[5] = 0;
    first_cpu->halted = 0;
    first_ppc_cpu->env.nip = spapr->entry_point;
884 885 886

}

887 888
static void spapr_cpu_reset(void *opaque)
{
889
    PowerPCCPU *cpu = opaque;
890
    CPUState *cs = CPU(cpu);
891
    CPUPPCState *env = &cpu->env;
892

893
    cpu_reset(cs);
894 895 896 897

    /* All CPUs start halted.  CPU0 is unhalted from the machine level
     * reset code and the rest are explicitly started up by the guest
     * using an RTAS call */
898
    cs->halted = 1;
899 900

    env->spr[SPR_HIOR] = 0;
901

902
    env->external_htab = (uint8_t *)spapr->htab;
903 904 905 906 907 908 909
    if (kvm_enabled() && !env->external_htab) {
        /*
         * HV KVM, set external_htab to 1 so our ppc_hash64_load_hpte*
         * functions do the right thing.
         */
        env->external_htab = (void *)1;
    }
910
    env->htab_base = -1;
911 912 913 914 915 916 917
    /*
     * htab_mask is the mask used to normalize hash value to PTEG index.
     * htab_shift is log2 of hash table size.
     * We have 8 hpte per group, and each hpte is 16 bytes.
     * ie have 128 bytes per hpte entry.
     */
    env->htab_mask = (1ULL << ((spapr)->htab_shift - 7)) - 1;
918
    env->spr[SPR_SDR1] = (target_ulong)(uintptr_t)spapr->htab |
919
        (spapr->htab_shift - 18);
920 921
}

D
David Gibson 已提交
922 923
/*
 * Create the "spapr-nvram" device on the VIO bus and record it in
 * @spapr->nvram.  If a pflash drive (bus 0, unit 0) was configured it is
 * attached as the device's "drive" property before the device is realized.
 */
static void spapr_create_nvram(sPAPREnvironment *spapr)
{
    DeviceState *dev = qdev_create(&spapr->vio_bus->bus, "spapr-nvram");
    DriveInfo *dinfo = drive_get(IF_PFLASH, 0, 0);

    if (dinfo) {
        qdev_prop_set_drive_nofail(dev, "drive", dinfo->bdrv);
    }

    qdev_init_nofail(dev);

    spapr->nvram = (struct sPAPRNVRAM *)dev;
}

936
/* Returns whether we want to use VGA or not */
937 938
static int spapr_vga_init(PCIBus *pci_bus)
{
939 940
    switch (vga_interface_type) {
    case VGA_NONE:
941 942 943
        return false;
    case VGA_DEVICE:
        return true;
944 945
    case VGA_STD:
        return pci_vga_init(pci_bus) != NULL;
946
    default:
947 948
        fprintf(stderr, "This vga model is not supported,"
                "currently it only supports -vga std\n");
949
        exit(0);
950 951 952
    }
}

953 954
/*
 * Migration description of the machine-wide sPAPR state.  The timebase
 * field is only present from version 2 of the section onwards.
 */
static const VMStateDescription vmstate_spapr = {
    .name = "spapr",
    .version_id = 2,
    .minimum_version_id = 1,
    .fields = (VMStateField[]) {
        VMSTATE_UNUSED(4), /* used to be @next_irq */

        /* RTC offset */
        VMSTATE_UINT64(rtc_offset, sPAPREnvironment),
        VMSTATE_PPC_TIMEBASE_V(tb, sPAPREnvironment, 2),
        VMSTATE_END_OF_LIST()
    },
};

/*
 * Helpers for walking the hash page table while saving/loading it.
 *
 * HPTE(table, i)  - pointer to entry i, each entry being two 64-bit words
 * HPTE_VALID(p)   - non-zero if HPTE64_V_VALID is set in the first word
 * HPTE_DIRTY(p)   - non-zero if HPTE64_V_HPTE_DIRTY is set in the first word
 * CLEAN_HPTE(p)   - clears HPTE64_V_HPTE_DIRTY in the first word, in place
 *
 * The first word is passed through tswap64() before testing, and the mask is
 * passed through tswap64() before clearing (presumably because the table is
 * kept in target byte order -- confirm against tswap64()).
 */
#define HPTE(_table, _i)   (void *)(((uint64_t *)(_table)) + ((_i) * 2))
#define HPTE_VALID(_hpte)  (tswap64(*((uint64_t *)(_hpte))) & HPTE64_V_VALID)
#define HPTE_DIRTY(_hpte)  (tswap64(*((uint64_t *)(_hpte))) & HPTE64_V_HPTE_DIRTY)
#define CLEAN_HPTE(_hpte)  ((*(uint64_t *)(_hpte)) &= tswap64(~HPTE64_V_HPTE_DIRTY))

/*
 * save_live_setup handler for the "spapr/htab" section.
 *
 * Emits the section header (the hash table shift) and prepares for the
 * iterative phase: with a QEMU-allocated table the save cursor is rewound
 * and the first pass is armed; otherwise (table owned by KVM) a file
 * descriptor for reading the table from the kernel is opened.
 *
 * Returns 0 on success, -1 if the KVM htab fd cannot be opened.
 */
static int htab_save_setup(QEMUFile *f, void *opaque)
{
    sPAPREnvironment *spapr = opaque;

    /* "Iteration" header */
    qemu_put_be32(f, spapr->htab_shift);

    if (spapr->htab) {
        spapr->htab_save_index = 0;
        spapr->htab_first_pass = true;
    } else {
        assert(kvm_enabled());

        spapr->htab_fd = kvmppc_get_htab_fd(false);
        if (spapr->htab_fd < 0) {
            fprintf(stderr, "Unable to open fd for reading hash table from KVM: %s\n",
                    strerror(errno));
            return -1;
        }
    }

    return 0;
}

/*
 * First pass of the iterative hash table save: stream every valid HPTE,
 * starting at spapr->htab_save_index, and clear the dirty bit of each entry
 * that is walked over.
 *
 * Each chunk on the wire is: be32 start index, be16 number of valid
 * entries, be16 number of invalid entries (always 0 here), followed by the
 * valid entries themselves.
 *
 * The pass stops when the table end is reached, when the file's rate limit
 * kicks in, or when more than @max_ns nanoseconds have elapsed after a
 * chunk was sent.  The cursor is stored back in spapr->htab_save_index; on
 * reaching the end of the table the cursor wraps to 0 and
 * spapr->htab_first_pass is cleared.
 */
static void htab_save_first_pass(QEMUFile *f, sPAPREnvironment *spapr,
                                 int64_t max_ns)
{
    int htabslots = HTAB_SIZE(spapr) / HASH_PTE_SIZE_64;
    int index = spapr->htab_save_index;
    int64_t starttime = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);

    assert(spapr->htab_first_pass);

    do {
        int chunkstart;

        /*
         * Consume invalid HPTEs.  The entry must be cleaned *before* the
         * index is advanced: doing it afterwards skips the entry that was
         * just examined and touches one slot past the end of the table.
         */
        while ((index < htabslots)
               && !HPTE_VALID(HPTE(spapr->htab, index))) {
            CLEAN_HPTE(HPTE(spapr->htab, index));
            index++;
        }

        /*
         * Consume valid HPTEs.  The run length is sent as a 16-bit value,
         * so cap it; a longer run simply continues in the next chunk.
         */
        chunkstart = index;
        while ((index < htabslots) && (index - chunkstart < UINT16_MAX)
               && HPTE_VALID(HPTE(spapr->htab, index))) {
            CLEAN_HPTE(HPTE(spapr->htab, index));
            index++;
        }

        if (index > chunkstart) {
            int n_valid = index - chunkstart;

            qemu_put_be32(f, chunkstart);
            qemu_put_be16(f, n_valid);
            qemu_put_be16(f, 0);
            qemu_put_buffer(f, HPTE(spapr->htab, chunkstart),
                            HASH_PTE_SIZE_64 * n_valid);

            if ((qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - starttime) > max_ns) {
                break;
            }
        }
    } while ((index < htabslots) && !qemu_file_rate_limit(f));

    if (index >= htabslots) {
        assert(index == htabslots);
        index = 0;
        spapr->htab_first_pass = false;
    }
    spapr->htab_save_index = index;
}

1047 1048
/*
 * Subsequent passes of the iterative hash table save: stream only the
 * HPTEs whose dirty bit is set, clearing that bit as they are consumed.
 *
 * Each chunk is: be32 start index, be16 count of valid dirty entries,
 * be16 count of invalid dirty entries that follow them, then the contents
 * of the valid entries only.
 *
 * A negative @max_ns selects the "final" mode, in which neither the time
 * budget nor the file rate limit stops the scan.  Otherwise the scan stops
 * once more than @max_ns nanoseconds have elapsed after a chunk, when the
 * rate limit is hit, or when every slot has been examined once.
 *
 * Returns 1 if the whole table was examined and nothing had to be sent,
 * 0 otherwise.
 */
static int htab_save_later_pass(QEMUFile *f, sPAPREnvironment *spapr,
                                int64_t max_ns)
{
    bool final = max_ns < 0;
    int htabslots = HTAB_SIZE(spapr) / HASH_PTE_SIZE_64;
    int examined = 0, sent = 0;
    int index = spapr->htab_save_index;
    int64_t starttime = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);

    assert(!spapr->htab_first_pass);

    do {
        int chunkstart, invalidstart;

        /* Consume non-dirty HPTEs */
        while ((index < htabslots)
               && !HPTE_DIRTY(HPTE(spapr->htab, index))) {
            index++;
            examined++;
        }

        chunkstart = index;
        /*
         * Consume valid dirty HPTEs.  Run lengths travel as 16-bit values,
         * so cap each run; the remainder is picked up by the next chunk.
         */
        while ((index < htabslots) && (index - chunkstart < UINT16_MAX)
               && HPTE_DIRTY(HPTE(spapr->htab, index))
               && HPTE_VALID(HPTE(spapr->htab, index))) {
            CLEAN_HPTE(HPTE(spapr->htab, index));
            index++;
            examined++;
        }

        invalidstart = index;
        /* Consume invalid dirty HPTEs (same 16-bit cap) */
        while ((index < htabslots) && (index - invalidstart < UINT16_MAX)
               && HPTE_DIRTY(HPTE(spapr->htab, index))
               && !HPTE_VALID(HPTE(spapr->htab, index))) {
            CLEAN_HPTE(HPTE(spapr->htab, index));
            index++;
            examined++;
        }

        if (index > chunkstart) {
            int n_valid = invalidstart - chunkstart;
            int n_invalid = index - invalidstart;

            qemu_put_be32(f, chunkstart);
            qemu_put_be16(f, n_valid);
            qemu_put_be16(f, n_invalid);
            qemu_put_buffer(f, HPTE(spapr->htab, chunkstart),
                            HASH_PTE_SIZE_64 * n_valid);
            sent += index - chunkstart;

            if (!final && (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - starttime) > max_ns) {
                break;
            }
        }

        if (examined >= htabslots) {
            break;
        }

        /* Wrap around so that a pass may start mid-table */
        if (index >= htabslots) {
            assert(index == htabslots);
            index = 0;
        }
    } while ((examined < htabslots) && (!qemu_file_rate_limit(f) || final));

    if (index >= htabslots) {
        assert(index == htabslots);
        index = 0;
    }

    spapr->htab_save_index = index;

    return (examined >= htabslots) && (sent == 0) ? 1 : 0;
}

1124 1125 1126
/* Time budget handed to each iterative hash table save pass */
#define MAX_ITERATION_NS    5000000 /* 5 ms */
/* Buffer size argument handed to kvmppc_save_htab() */
#define MAX_KVM_BUF_SIZE    2048

1127 1128 1129
/*
 * save_live_iterate handler for the "spapr/htab" section.
 *
 * Writes an iteration header (be32 zero), one time-bounded slice of hash
 * table data, and the end-of-stream marker (all-zero chunk header).  The
 * data comes from KVM when QEMU holds no table itself, otherwise from the
 * first or a later pass over the QEMU-allocated table.
 *
 * Returns a negative value if reading from KVM failed (no end marker is
 * written in that case); otherwise the result of the KVM save or of the
 * later pass, or 0 after a first-pass slice.
 */
static int htab_save_iterate(QEMUFile *f, void *opaque)
{
    sPAPREnvironment *spapr = opaque;
    int rc = 0;

    /* Iteration header */
    qemu_put_be32(f, 0);

    if (!spapr->htab) {
        assert(kvm_enabled());

        rc = kvmppc_save_htab(f, spapr->htab_fd,
                              MAX_KVM_BUF_SIZE, MAX_ITERATION_NS);
        if (rc < 0) {
            return rc;
        }
    } else if (spapr->htab_first_pass) {
        htab_save_first_pass(f, spapr, MAX_ITERATION_NS);
    } else {
        rc = htab_save_later_pass(f, spapr, MAX_ITERATION_NS);
    }

    /* End marker */
    qemu_put_be32(f, 0);
    qemu_put_be16(f, 0);
    qemu_put_be16(f, 0);

    return rc;
}

/*
 * save_live_complete handler for the "spapr/htab" section.
 *
 * Writes an iteration header, the remaining hash table contents without
 * any time limit (max_ns = -1), and the end-of-stream marker.  When the
 * table lives in KVM, the fd opened during setup is closed here whether or
 * not the final read succeeded, so a failed save does not leak it.
 *
 * Returns 0 on success or the negative result of kvmppc_save_htab().
 */
static int htab_save_complete(QEMUFile *f, void *opaque)
{
    sPAPREnvironment *spapr = opaque;

    /* Iteration header */
    qemu_put_be32(f, 0);

    if (!spapr->htab) {
        int rc;

        assert(kvm_enabled());

        rc = kvmppc_save_htab(f, spapr->htab_fd, MAX_KVM_BUF_SIZE, -1);

        /* This is the last user of the fd on both success and failure */
        close(spapr->htab_fd);
        spapr->htab_fd = -1;

        if (rc < 0) {
            return rc;
        }
    } else {
        htab_save_later_pass(f, spapr, -1);
    }

    /* End marker */
    qemu_put_be32(f, 0);
    qemu_put_be16(f, 0);
    qemu_put_be16(f, 0);

    return 0;
}

/*
 * load_state handler for the "spapr/htab" section.
 *
 * A section starts with a be32 header.  A non-zero header is the setup
 * section and carries the sender's hash table shift, which must match
 * ours.  A zero header introduces a sequence of chunks (be32 index,
 * be16 n_valid, be16 n_invalid, then n_valid HPTEs) terminated by an
 * all-zero chunk header.  Valid entries are copied in; invalid entries are
 * zeroed.  When QEMU holds no table itself the chunks are forwarded to KVM
 * through a write fd.
 *
 * Returns 0 on success, -EINVAL on a bad version, shift mismatch or
 * out-of-range chunk, -1 if the KVM fd cannot be opened, or the negative
 * result of kvmppc_load_htab_chunk().
 */
static int htab_load(QEMUFile *f, void *opaque, int version_id)
{
    sPAPREnvironment *spapr = opaque;
    uint32_t section_hdr;
    int fd = -1;
    int ret = 0;

    if (version_id < 1 || version_id > 1) {
        fprintf(stderr, "htab_load() bad version\n");
        return -EINVAL;
    }

    section_hdr = qemu_get_be32(f);

    if (section_hdr) {
        /* First section, just the hash shift */
        if (spapr->htab_shift != section_hdr) {
            return -EINVAL;
        }
        return 0;
    }

    if (!spapr->htab) {
        assert(kvm_enabled());

        fd = kvmppc_get_htab_fd(true);
        if (fd < 0) {
            fprintf(stderr, "Unable to open fd to restore KVM hash table: %s\n",
                    strerror(errno));
            /* Fail the load cleanly instead of tripping an assert later */
            return -1;
        }
    }

    while (true) {
        uint32_t index;
        uint16_t n_valid, n_invalid;

        index = qemu_get_be32(f);
        n_valid = qemu_get_be16(f);
        n_invalid = qemu_get_be16(f);

        if ((index == 0) && (n_valid == 0) && (n_invalid == 0)) {
            /* End of Stream */
            break;
        }

        /*
         * The stream is untrusted: do the range check in 64 bits so that a
         * huge index cannot wrap around and slip past it.
         */
        if (((uint64_t)index + n_valid + n_invalid) >
            (HTAB_SIZE(spapr) / HASH_PTE_SIZE_64)) {
            /* Bad index in stream */
            fprintf(stderr, "htab_load() bad index %u (%u+%u entries) "
                    "in htab stream (htab_shift=%d)\n", index,
                    (unsigned)n_valid, (unsigned)n_invalid,
                    spapr->htab_shift);
            ret = -EINVAL;
            break;
        }

        if (spapr->htab) {
            if (n_valid) {
                qemu_get_buffer(f, HPTE(spapr->htab, index),
                                HASH_PTE_SIZE_64 * n_valid);
            }
            if (n_invalid) {
                memset(HPTE(spapr->htab, index + n_valid), 0,
                       HASH_PTE_SIZE_64 * n_invalid);
            }
        } else {
            int rc;

            assert(fd >= 0);

            rc = kvmppc_load_htab_chunk(f, fd, index, n_valid, n_invalid);
            if (rc < 0) {
                ret = rc;
                break;
            }
        }
    }

    /* Single exit point so the KVM fd is released on error paths too */
    if (fd >= 0) {
        close(fd);
    }

    return ret;
}

/* Live-migration callbacks for the hash page table ("spapr/htab") */
static SaveVMHandlers savevm_htab_handlers = {
    /* incoming side */
    .load_state         = htab_load,
    /* outgoing side: setup, bounded iterations, final flush */
    .save_live_setup    = htab_save_setup,
    .save_live_iterate  = htab_save_iterate,
    .save_live_complete = htab_save_complete,
};

1276
/* pSeries LPAR / sPAPR hardware init */
1277
static void ppc_spapr_init(MachineState *machine)
1278
{
1279 1280 1281 1282 1283 1284
    ram_addr_t ram_size = machine->ram_size;
    const char *cpu_model = machine->cpu_model;
    const char *kernel_filename = machine->kernel_filename;
    const char *kernel_cmdline = machine->kernel_cmdline;
    const char *initrd_filename = machine->initrd_filename;
    const char *boot_device = machine->boot_order;
1285
    PowerPCCPU *cpu;
A
Andreas Färber 已提交
1286
    CPUPPCState *env;
1287
    PCIHostState *phb;
1288
    int i;
A
Avi Kivity 已提交
1289 1290
    MemoryRegion *sysmem = get_system_memory();
    MemoryRegion *ram = g_new(MemoryRegion, 1);
1291 1292
    MemoryRegion *rma_region;
    void *rma = NULL;
A
Avi Kivity 已提交
1293
    hwaddr rma_alloc_size;
1294
    hwaddr node0_size = spapr_node0_size();
1295 1296
    uint32_t initrd_base = 0;
    long kernel_size = 0, initrd_size = 0;
1297
    long load_limit, fw_size;
1298
    bool kernel_le = false;
1299
    char *filename;
1300

1301 1302
    msi_supported = true;

1303 1304 1305
    spapr = g_malloc0(sizeof(*spapr));
    QLIST_INIT(&spapr->phbs);

1306 1307
    cpu_ppc_hypercall = emulate_spapr_hypercall;

1308
    /* Allocate RMA if necessary */
1309
    rma_alloc_size = kvmppc_alloc_rma(&rma);
1310 1311 1312 1313 1314

    if (rma_alloc_size == -1) {
        hw_error("qemu: Unable to create RMA\n");
        exit(1);
    }
1315

1316
    if (rma_alloc_size && (rma_alloc_size < node0_size)) {
1317
        spapr->rma_size = rma_alloc_size;
1318
    } else {
1319
        spapr->rma_size = node0_size;
1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333

        /* With KVM, we don't actually know whether KVM supports an
         * unbounded RMA (PR KVM) or is limited by the hash table size
         * (HV KVM using VRMA), so we always assume the latter
         *
         * In that case, we also limit the initial allocations for RTAS
         * etc... to 256M since we have no way to know what the VRMA size
         * is going to be as it depends on the size of the hash table
         * isn't determined yet.
         */
        if (kvm_enabled()) {
            spapr->vrma_adjust = 1;
            spapr->rma_size = MIN(spapr->rma_size, 0x10000000);
        }
1334 1335
    }

1336 1337 1338 1339 1340 1341
    if (spapr->rma_size > node0_size) {
        fprintf(stderr, "Error: Numa node 0 has to span the RMA (%#08"HWADDR_PRIx")\n",
                spapr->rma_size);
        exit(1);
    }

1342 1343
    /* Setup a load limit for the ramdisk leaving room for SLOF and FDT */
    load_limit = MIN(spapr->rma_size, RTAS_MAX_ADDR) - FW_OVERHEAD;
1344

1345 1346 1347 1348 1349 1350 1351 1352 1353 1354
    /* We aim for a hash table of size 1/128 the size of RAM.  The
     * normal rule of thumb is 1/64 the size of RAM, but that's much
     * more than needed for the Linux guests we support. */
    spapr->htab_shift = 18; /* Minimum architected size */
    while (spapr->htab_shift <= 46) {
        if ((1ULL << (spapr->htab_shift + 7)) >= ram_size) {
            break;
        }
        spapr->htab_shift++;
    }
1355

1356 1357 1358 1359
    /* Set up Interrupt Controller before we create the VCPUs */
    spapr->icp = xics_system_init(smp_cpus * kvmppc_smt_threads() / smp_threads,
                                  XICS_IRQS);

1360 1361
    /* init CPUs */
    if (cpu_model == NULL) {
1362
        cpu_model = kvm_enabled() ? "host" : "POWER7";
1363 1364
    }
    for (i = 0; i < smp_cpus; i++) {
1365 1366
        cpu = cpu_ppc_init(cpu_model);
        if (cpu == NULL) {
1367 1368 1369
            fprintf(stderr, "Unable to find PowerPC CPU definition\n");
            exit(1);
        }
1370 1371
        env = &cpu->env;

1372 1373 1374
        /* Set time-base frequency to 512 MHz */
        cpu_ppc_tb_init(env, TIMEBASE_FREQ);

1375 1376 1377 1378
        /* PAPR always has exception vectors in RAM not ROM. To ensure this,
         * MSR[IP] should never be set.
         */
        env->msr_mask &= ~(1 << 6);
1379 1380 1381

        /* Tell KVM that we're in PAPR mode */
        if (kvm_enabled()) {
1382
            kvmppc_set_papr(cpu);
1383 1384
        }

1385 1386 1387 1388 1389 1390
        if (cpu->max_compat) {
            if (ppc_set_compat(cpu, cpu->max_compat) < 0) {
                exit(1);
            }
        }

1391 1392
        xics_cpu_setup(spapr->icp, cpu);

1393
        qemu_register_reset(spapr_cpu_reset, cpu);
1394 1395 1396
    }

    /* allocate RAM */
1397
    spapr->ram_limit = ram_size;
1398 1399 1400
    memory_region_allocate_system_memory(ram, NULL, "ppc_spapr.ram",
                                         spapr->ram_limit);
    memory_region_add_subregion(sysmem, 0, ram);
1401

1402 1403 1404 1405 1406 1407 1408 1409
    if (rma_alloc_size && rma) {
        rma_region = g_new(MemoryRegion, 1);
        memory_region_init_ram_ptr(rma_region, NULL, "ppc_spapr.rma",
                                   rma_alloc_size, rma);
        vmstate_register_ram_global(rma_region);
        memory_region_add_subregion(sysmem, 0, rma_region);
    }

1410
    filename = qemu_find_file(QEMU_FILE_TYPE_BIOS, "spapr-rtas.bin");
1411 1412 1413
    spapr->rtas_size = get_image_size(filename);
    spapr->rtas_blob = g_malloc(spapr->rtas_size);
    if (load_image_size(filename, spapr->rtas_blob, spapr->rtas_size) < 0) {
1414 1415 1416
        hw_error("qemu: could not load LPAR rtas '%s'\n", filename);
        exit(1);
    }
1417
    if (spapr->rtas_size > RTAS_MAX_SIZE) {
1418
        hw_error("RTAS too big ! 0x%zx bytes (max is 0x%x)\n",
1419 1420 1421
                 spapr->rtas_size, RTAS_MAX_SIZE);
        exit(1);
    }
1422
    g_free(filename);
1423

1424 1425 1426
    /* Set up EPOW events infrastructure */
    spapr_events_init(spapr);

1427
    /* Set up VIO bus */
1428 1429
    spapr->vio_bus = spapr_vio_bus_init();

P
Paolo Bonzini 已提交
1430
    for (i = 0; i < MAX_SERIAL_PORTS; i++) {
1431
        if (serial_hds[i]) {
1432
            spapr_vty_create(spapr->vio_bus, serial_hds[i]);
1433 1434
        }
    }
1435

D
David Gibson 已提交
1436 1437 1438
    /* We always have at least the nvram device on VIO */
    spapr_create_nvram(spapr);

1439
    /* Set up PCI */
1440 1441
    spapr_pci_rtas_init();

1442
    phb = spapr_create_phb(spapr, 0);
1443

P
Paolo Bonzini 已提交
1444
    for (i = 0; i < nb_nics; i++) {
1445 1446 1447
        NICInfo *nd = &nd_table[i];

        if (!nd->model) {
1448
            nd->model = g_strdup("ibmveth");
1449 1450 1451
        }

        if (strcmp(nd->model, "ibmveth") == 0) {
1452
            spapr_vlan_create(spapr->vio_bus, nd);
1453
        } else {
1454
            pci_nic_init_nofail(&nd_table[i], phb->bus, nd->model, NULL);
1455 1456 1457
        }
    }

1458
    for (i = 0; i <= drive_get_max_bus(IF_SCSI); i++) {
1459
        spapr_vscsi_create(spapr->vio_bus);
1460 1461
    }

1462
    /* Graphics */
1463
    if (spapr_vga_init(phb->bus)) {
1464
        spapr->has_graphics = true;
1465 1466
    }

1467
    if (usb_enabled(spapr->has_graphics)) {
1468
        pci_create_simple(phb->bus, -1, "pci-ohci");
1469 1470 1471 1472 1473 1474
        if (spapr->has_graphics) {
            usbdevice_create("keyboard");
            usbdevice_create("mouse");
        }
    }

1475
    if (spapr->rma_size < (MIN_RMA_SLOF << 20)) {
1476 1477 1478 1479 1480
        fprintf(stderr, "qemu: pSeries SLOF firmware requires >= "
                "%ldM guest RMA (Real Mode Area memory)\n", MIN_RMA_SLOF);
        exit(1);
    }

1481 1482 1483 1484 1485
    if (kernel_filename) {
        uint64_t lowaddr = 0;

        kernel_size = load_elf(kernel_filename, translate_kernel_address, NULL,
                               NULL, &lowaddr, NULL, 1, ELF_MACHINE, 0);
1486
        if (kernel_size == ELF_LOAD_WRONG_ENDIAN) {
1487 1488 1489 1490 1491
            kernel_size = load_elf(kernel_filename,
                                   translate_kernel_address, NULL,
                                   NULL, &lowaddr, NULL, 0, ELF_MACHINE, 0);
            kernel_le = kernel_size > 0;
        }
1492
        if (kernel_size < 0) {
1493 1494
            fprintf(stderr, "qemu: error loading %s: %s\n",
                    kernel_filename, load_elf_strerror(kernel_size));
1495 1496 1497 1498 1499
            exit(1);
        }

        /* load initrd */
        if (initrd_filename) {
1500 1501 1502 1503
            /* Try to locate the initrd in the gap between the kernel
             * and the firmware. Add a bit of space just in case
             */
            initrd_base = (KERNEL_LOAD_ADDR + kernel_size + 0x1ffff) & ~0xffff;
1504
            initrd_size = load_image_targphys(initrd_filename, initrd_base,
1505
                                              load_limit - initrd_base);
1506 1507 1508 1509 1510 1511 1512 1513 1514
            if (initrd_size < 0) {
                fprintf(stderr, "qemu: could not load initial ram disk '%s'\n",
                        initrd_filename);
                exit(1);
            }
        } else {
            initrd_base = 0;
            initrd_size = 0;
        }
1515
    }
1516

1517 1518 1519 1520
    if (bios_name == NULL) {
        bios_name = FW_FILE_NAME;
    }
    filename = qemu_find_file(QEMU_FILE_TYPE_BIOS, bios_name);
1521 1522 1523 1524 1525 1526 1527 1528 1529
    fw_size = load_image_targphys(filename, 0, FW_MAX_SIZE);
    if (fw_size < 0) {
        hw_error("qemu: could not load LPAR rtas '%s'\n", filename);
        exit(1);
    }
    g_free(filename);

    spapr->entry_point = 0x100;

1530 1531 1532 1533
    vmstate_register(NULL, 0, &vmstate_spapr, spapr);
    register_savevm_live(NULL, "spapr/htab", -1, 1,
                         &savevm_htab_handlers, spapr);

1534
    /* Prepare the device tree */
1535
    spapr->fdt_skel = spapr_create_fdt_skel(initrd_base, initrd_size,
1536
                                            kernel_size, kernel_le,
1537 1538
                                            boot_device, kernel_cmdline,
                                            spapr->epow_irq);
1539
    assert(spapr->fdt_skel != NULL);
1540 1541
}

1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559
/*
 * Translate the "kvm-type" machine option into a numeric KVM VM type.
 *
 * No option (NULL) maps to 0, "HV" to 1 and "PR" to 2.  Any other string
 * is reported as an error and terminates the process.
 */
static int spapr_kvm_type(const char *vm_type)
{
    static const struct {
        const char *label;
        int type;
    } known_types[] = {
        { "HV", 1 },
        { "PR", 2 },
    };
    size_t k;

    if (vm_type == NULL) {
        return 0;
    }

    for (k = 0; k < sizeof(known_types) / sizeof(known_types[0]); k++) {
        if (strcmp(vm_type, known_types[k].label) == 0) {
            return known_types[k].type;
        }
    }

    error_report("Unknown kvm-type specified '%s'", vm_type);
    exit(1);
}

1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616
/*
 * Implementation of an interface to adjust firmware path
 * for the bootindex property handling.
 *
 * For a SCSI device the returned path component is
 * "<fw name>@<64-bit SRP LUN in hex>", the LUN layout depending on whether
 * the parent of @bus is a spapr-vscsi, virtio-scsi or USB device.  For a
 * sPAPR PCI host bridge it is "pci@<buid in hex>".  Returns NULL when none
 * of these apply; otherwise a string built with g_strdup_printf().
 */
static char *spapr_get_fw_dev_path(FWPathProvider *p, BusState *bus,
                                   DeviceState *dev)
{
#define CAST(type, obj, name) \
    ((type *)object_dynamic_cast(OBJECT(obj), (name)))
    SCSIDevice *d = CAST(SCSIDevice,  dev, TYPE_SCSI_DEVICE);
    sPAPRPHBState *phb = CAST(sPAPRPHBState, dev, TYPE_SPAPR_PCI_HOST_BRIDGE);

    if (d) {
        void *vscsi = CAST(void, bus->parent, "spapr-vscsi");
        VirtIOSCSI *virtio = CAST(VirtIOSCSI, bus->parent, TYPE_VIRTIO_SCSI);
        USBDevice *usb = CAST(USBDevice, bus->parent, TYPE_USB_DEVICE);
        bool have_lun = true;
        uint64_t srp_lun = 0;

        if (vscsi) {
            /*
             * Replace "channel@0/disk@0,0" with "disk@8000000000000000":
             * We use SRP luns of the form 8000 | (bus << 8) | (id << 5) | lun
             * in the top 16 bits of the 64-bit LUN
             */
            unsigned id = 0x8000 | (d->id << 8) | d->lun;

            srp_lun = (uint64_t)id << 48;
        } else if (virtio) {
            /*
             * We use SRP luns of the form 01000000 | (target << 8) | lun
             * in the top 32 bits of the 64-bit LUN
             * Note: the quote above is from SLOF and it is wrong,
             * the actual binding is:
             * swap 0100 or 10 << or 20 << ( target lun-id -- srplun )
             */
            unsigned id = 0x1000000 | (d->id << 16) | d->lun;

            srp_lun = (uint64_t)id << 32;
        } else if (usb) {
            /*
             * We use SRP luns of the form 01000000 | (usb-port << 16) | lun
             * in the top 32 bits of the 64-bit LUN
             */
            unsigned usb_port = atoi(usb->port->path);
            unsigned id = 0x1000000 | (usb_port << 16) | d->lun;

            srp_lun = (uint64_t)id << 32;
        } else {
            /* SCSI device on some other kind of controller */
            have_lun = false;
        }

        if (have_lun) {
            return g_strdup_printf("%s@%"PRIX64, qdev_fw_name(dev), srp_lun);
        }
    }

    if (phb) {
        /* Replace "pci" with "pci@800000020000000" */
        return g_strdup_printf("pci@%"PRIX64, phb->buid);
    }

    return NULL;
}

E
Eduardo Habkost 已提交
1617 1618
/*
 * Getter for the "kvm-type" machine property: returns a g_strdup() copy of
 * the stored string.  @errp is not used.
 */
static char *spapr_get_kvm_type(Object *obj, Error **errp)
{
    sPAPRMachineState *sm = SPAPR_MACHINE(obj);

    return g_strdup(sm->kvm_type);
}

/*
 * Setter for the "kvm-type" machine property: frees the previously stored
 * string and keeps a private copy of @value.  @errp is not used.
 */
static void spapr_set_kvm_type(Object *obj, const char *value, Error **errp)
{
    sPAPRMachineState *sm = SPAPR_MACHINE(obj);

    g_free(sm->kvm_type);
    sm->kvm_type = g_strdup(value);
}

/* Instance init: expose the string property "kvm-type" on the machine */
static void spapr_machine_initfn(Object *obj)
{
    object_property_add_str(obj,
                            "kvm-type",
                            spapr_get_kvm_type,
                            spapr_set_kvm_type,
                            NULL);
}

1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654
/*
 * Per-CPU NMI work item; @arg is the CPUState to act on.  Synchronizes the
 * CPU state and then delivers a system reset to that CPU.
 */
static void ppc_cpu_do_nmi_on_cpu(void *arg)
{
    CPUState *target = arg;

    cpu_synchronize_state(target);
    ppc_cpu_do_system_reset(target);
}

/*
 * NMI monitor handler: queues ppc_cpu_do_nmi_on_cpu() on every CPU.
 * @cpu_index and @errp are not used.
 */
static void spapr_nmi(NMIState *n, int cpu_index, Error **errp)
{
    CPUState *each;

    CPU_FOREACH(each) {
        async_run_on_cpu(each, ppc_cpu_do_nmi_on_cpu, each);
    }
}

1655 1656 1657
/*
 * Class init for the "pseries" machine: fills in the machine description
 * and callbacks, and hooks up the firmware-path and NMI interfaces.
 */
static void spapr_machine_class_init(ObjectClass *oc, void *data)
{
    MachineClass *mc = MACHINE_CLASS(oc);
    FWPathProviderClass *fwc = FW_PATH_PROVIDER_CLASS(oc);
    NMIClass *nc = NMI_CLASS(oc);

    mc->name = "pseries";
    mc->desc = "pSeries Logical Partition (PAPR compliant)";
    mc->is_default = 1;
    mc->init = ppc_spapr_init;
    mc->reset = ppc_spapr_reset;
    mc->block_default_type = IF_SCSI;
    mc->max_cpus = MAX_CPUS;
    mc->no_parallel = 1;
    mc->default_boot_order = NULL;
    mc->kvm_type = spapr_kvm_type;

    fwc->get_dev_path = spapr_get_fw_dev_path;
    nc->nmi_monitor_handler = spapr_nmi;
}

static const TypeInfo spapr_machine_info = {
    .name          = TYPE_SPAPR_MACHINE,
    .parent        = TYPE_MACHINE,
1679
    .instance_size = sizeof(sPAPRMachineState),
E
Eduardo Habkost 已提交
1680
    .instance_init = spapr_machine_initfn,
1681
    .class_init    = spapr_machine_class_init,
1682 1683
    .interfaces = (InterfaceInfo[]) {
        { TYPE_FW_PATH_PROVIDER },
1684
        { TYPE_NMI },
1685 1686
        { }
    },
1687 1688
};

1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703
/*
 * Class init for the versioned "pseries-2.1" machine: overrides the name
 * and description and marks it as not being the default machine.
 */
static void spapr_machine_2_1_class_init(ObjectClass *oc, void *data)
{
    MachineClass *machine_class = MACHINE_CLASS(oc);

    machine_class->is_default = 0;
    machine_class->name = "pseries-2.1";
    machine_class->desc = "pSeries Logical Partition (PAPR compliant) v2.1";
}

/* QOM type of the "pseries-2.1" machine, derived from the sPAPR machine */
static const TypeInfo spapr_machine_2_1_info = {
    .name       = TYPE_SPAPR_MACHINE "2.1",
    .class_init = spapr_machine_2_1_class_init,
    .parent     = TYPE_SPAPR_MACHINE,
};

1704
static void spapr_machine_register_types(void)
1705
{
1706
    type_register_static(&spapr_machine_info);
1707
    type_register_static(&spapr_machine_2_1_info);
1708 1709
}

1710
type_init(spapr_machine_register_types)