spapr.c 42.3 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26
/*
 * QEMU PowerPC pSeries Logical Partition (aka sPAPR) hardware System Emulator
 *
 * Copyright (c) 2004-2007 Fabrice Bellard
 * Copyright (c) 2007 Jocelyn Mayer
 * Copyright (c) 2010 David Gibson, IBM Corporation.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 *
 */
27
#include "sysemu/sysemu.h"
28
#include "hw/hw.h"
29
#include "elf.h"
P
Paolo Bonzini 已提交
30
#include "net/net.h"
31 32 33
#include "sysemu/blockdev.h"
#include "sysemu/cpus.h"
#include "sysemu/kvm.h"
34
#include "kvm_ppc.h"
35
#include "mmu-hash64.h"
36 37

#include "hw/boards.h"
P
Paolo Bonzini 已提交
38
#include "hw/ppc/ppc.h"
39 40
#include "hw/loader.h"

P
Paolo Bonzini 已提交
41 42 43 44
#include "hw/ppc/spapr.h"
#include "hw/ppc/spapr_vio.h"
#include "hw/pci-host/spapr.h"
#include "hw/ppc/xics.h"
45
#include "hw/pci/msi.h"
46

47
#include "hw/pci/pci.h"
48

49
#include "exec/address-spaces.h"
50
#include "hw/usb.h"
51
#include "qemu/config-file.h"
A
Avi Kivity 已提交
52

53 54
#include <libfdt.h>

55 56 57 58 59 60 61 62 63 64
/* SLOF memory layout:
 *
 * SLOF raw image loaded at 0, copies its romfs right below the flat
 * device-tree, then positions SLOF itself 31M below that
 *
 * So we set FW_OVERHEAD to 40MB which should account for all of that
 * and more
 *
 * We load our kernel at 4M, leaving space for SLOF initial image
 */
65
#define FDT_MAX_SIZE            0x40000
66
#define RTAS_MAX_SIZE           0x10000
67 68
#define FW_MAX_SIZE             0x400000
#define FW_FILE_NAME            "slof.bin"
69 70
#define FW_OVERHEAD             0x2800000
#define KERNEL_LOAD_ADDR        FW_MAX_SIZE
71

72
#define MIN_RMA_SLOF            128UL
73 74 75

#define TIMEBASE_FREQ           512000000ULL

76
#define MAX_CPUS                256
77
#define XICS_IRQS               1024
78

79 80
#define PHANDLE_XICP            0x00001111

81 82
#define HTAB_SIZE(spapr)        (1ULL << ((spapr)->htab_shift))

83 84
sPAPREnvironment *spapr;

85
int spapr_allocate_irq(int hint, bool lsi)
86
{
87
    int irq;
88 89 90

    if (hint) {
        irq = hint;
91 92 93
        if (hint >= spapr->next_irq) {
            spapr->next_irq = hint + 1;
        }
94 95 96 97 98
        /* FIXME: we should probably check for collisions somehow */
    } else {
        irq = spapr->next_irq++;
    }

99 100 101
    /* Configure irq type */
    if (!xics_get_qirq(spapr->icp, irq)) {
        return 0;
102 103
    }

104
    xics_set_irq_type(spapr->icp, irq, lsi);
105

106
    return irq;
107 108
}

109 110 111 112 113
/*
 * Allocate block of consequtive IRQs, returns a number of the first.
 * If msi==true, aligns the first IRQ number to num.
 */
int spapr_allocate_irq_block(int num, bool lsi, bool msi)
114 115
{
    int first = -1;
116 117 118 119 120 121 122 123 124 125 126 127 128 129
    int i, hint = 0;

    /*
     * MSIMesage::data is used for storing VIRQ so
     * it has to be aligned to num to support multiple
     * MSI vectors. MSI-X is not affected by this.
     * The hint is used for the first IRQ, the rest should
     * be allocated continously.
     */
    if (msi) {
        assert((num == 1) || (num == 2) || (num == 4) ||
               (num == 8) || (num == 16) || (num == 32));
        hint = (spapr->next_irq + num - 1) & ~(num - 1);
    }
130 131 132 133

    for (i = 0; i < num; ++i) {
        int irq;

134
        irq = spapr_allocate_irq(hint, lsi);
135 136 137 138 139 140
        if (!irq) {
            return -1;
        }

        if (0 == i) {
            first = irq;
141
            hint = 0;
142 143 144 145 146 147 148 149 150 151
        }

        /* If the above doesn't create a consecutive block then that's
         * an internal bug */
        assert(irq == (first + i));
    }

    return first;
}

152 153 154 155 156 157 158 159 160 161 162 163
/*
 * Create and initialise an XICS device of the given QOM @type with the
 * requested number of servers and IRQs.
 *
 * Returns the device as an XICSState, or NULL if qdev_init() fails.
 */
static XICSState *try_create_xics(const char *type, int nr_servers,
                                  int nr_irqs)
{
    DeviceState *xics_dev = qdev_create(NULL, type);

    qdev_prop_set_uint32(xics_dev, "nr_servers", nr_servers);
    qdev_prop_set_uint32(xics_dev, "nr_irqs", nr_irqs);

    return (qdev_init(xics_dev) < 0) ? NULL : XICS_COMMON(xics_dev);
}

static XICSState *xics_system_init(int nr_servers, int nr_irqs)
{
    XICSState *icp = NULL;

171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190
    if (kvm_enabled()) {
        QemuOpts *machine_opts = qemu_get_machine_opts();
        bool irqchip_allowed = qemu_opt_get_bool(machine_opts,
                                                "kernel_irqchip", true);
        bool irqchip_required = qemu_opt_get_bool(machine_opts,
                                                  "kernel_irqchip", false);
        if (irqchip_allowed) {
            icp = try_create_xics(TYPE_KVM_XICS, nr_servers, nr_irqs);
        }

        if (irqchip_required && !icp) {
            perror("Failed to create in-kernel XICS\n");
            abort();
        }
    }

    if (!icp) {
        icp = try_create_xics(TYPE_XICS, nr_servers, nr_irqs);
    }

191 192 193 194 195 196 197 198
    if (!icp) {
        perror("Failed to create XICS\n");
        abort();
    }

    return icp;
}

199
/*
 * Patch the per-CPU nodes of an already built device tree.
 *
 * For every CPU whose index is a multiple of the SMT thread count, the
 * node "/cpus/<model>@<index>" is looked up and gets "ibm,pft-size" set,
 * plus "ibm,associativity" when more than one NUMA node is configured.
 *
 * Returns 0 on success or the negative libfdt error of the first
 * failing call.
 */
static int spapr_fixup_cpu_dt(void *fdt, sPAPREnvironment *spapr)
{
    const int threads_per_core = kvmppc_smt_threads();
    uint32_t pft_size_prop[] = {0, cpu_to_be32(spapr->htab_shift)};
    char node_path[32];
    CPUState *cs;
    int rc = 0;

    assert(spapr->cpu_model);

    CPU_FOREACH(cs) {
        uint32_t associativity[] = {cpu_to_be32(0x5),
                                    cpu_to_be32(0x0),
                                    cpu_to_be32(0x0),
                                    cpu_to_be32(0x0),
                                    cpu_to_be32(cs->numa_node),
                                    cpu_to_be32(cs->cpu_index)};
        int node_off;

        /* Only the first thread of each core has a node */
        if ((cs->cpu_index % threads_per_core) != 0) {
            continue;
        }

        snprintf(node_path, sizeof(node_path), "/cpus/%s@%x",
                 spapr->cpu_model, cs->cpu_index);

        node_off = fdt_path_offset(fdt, node_path);
        if (node_off < 0) {
            return node_off;
        }

        if (nb_numa_nodes > 1) {
            rc = fdt_setprop(fdt, node_off, "ibm,associativity",
                             associativity, sizeof(associativity));
            if (rc < 0) {
                return rc;
            }
        }

        rc = fdt_setprop(fdt, node_off, "ibm,pft-size",
                         pft_size_prop, sizeof(pft_size_prop));
        if (rc < 0) {
            return rc;
        }
    }

    return rc;
}

246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279

/*
 * Serialise the CPU's segment/page size table into @prop as big-endian
 * 32-bit cells.
 *
 * Each record is: page_shift, slb_enc, count, followed by @count pairs
 * of (page_shift, pte_enc).  Walking stops at the first segment entry
 * with a zero page_shift, or as soon as the next record would not fit.
 *
 * @env:     CPU state providing the sps table
 * @prop:    output buffer
 * @maxsize: size of @prop in bytes
 *
 * Returns the number of bytes written to @prop.
 */
static size_t create_page_sizes_prop(CPUPPCState *env, uint32_t *prop,
                                     size_t maxsize)
{
    size_t maxcells = maxsize / sizeof(uint32_t);
    int i, j, count;
    uint32_t *p = prop;

    for (i = 0; i < PPC_PAGE_SIZES_MAX_SZ; i++) {
        struct ppc_one_seg_page_size *sps = &env->sps.sps[i];
        size_t used = p - prop;
        size_t needed;

        if (!sps->page_shift) {
            break;
        }
        for (count = 0; count < PPC_PAGE_SIZES_MAX_SZ; count++) {
            if (sps->enc[count].page_shift == 0) {
                break;
            }
        }

        /*
         * Expressed as an addition so that a buffer smaller than one
         * record cannot make the unsigned arithmetic wrap around and
         * disable the bound.
         */
        needed = 3 + (size_t)count * 2;
        if (used + needed >= maxcells) {
            break;
        }

        *(p++) = cpu_to_be32(sps->page_shift);
        *(p++) = cpu_to_be32(sps->slb_enc);
        *(p++) = cpu_to_be32(count);
        for (j = 0; j < count; j++) {
            *(p++) = cpu_to_be32(sps->enc[j].page_shift);
            *(p++) = cpu_to_be32(sps->enc[j].pte_enc);
        }
    }

    return (p - prop) * sizeof(uint32_t);
}

280 281 282 283 284 285 286 287 288 289 290
/*
 * Evaluate a libfdt-style expression once; if its result is negative,
 * print the expression text together with fdt_strerror() of the result
 * to stderr and terminate the process with exit status 1.
 *
 * Note that the expression is stringified into the message, and that the
 * macro declares a local named 'ret', so @exp must not itself refer to a
 * variable called 'ret'.
 */
#define _FDT(exp) \
    do { \
        int ret = (exp);                                           \
        if (ret < 0) {                                             \
            fprintf(stderr, "qemu: error creating device tree: %s: %s\n", \
                    #exp, fdt_strerror(ret));                      \
            exit(1);                                               \
        }                                                          \
    } while (0)


291
/*
 * Build the skeleton flattened device tree for the machine.
 *
 * The tree contains the root properties, /chosen, one /cpus child per
 * core (CPUs whose index is a multiple of the SMT thread count), /rtas,
 * the interrupt controller, /vdevice and the event sources.
 *
 * @cpu_model:      CPU model name; its upper-cased form names the CPU
 *                  nodes and is also stored in spapr->cpu_model
 * @initrd_base:    load address of the initrd
 * @initrd_size:    size of the initrd, 0 if none
 * @kernel_size:    size of the kernel image, 0 if none
 * @little_endian:  adds "qemu,boot-kernel-le" when a kernel is present
 * @boot_device:    optional "qemu,boot-device" string, may be NULL
 * @kernel_cmdline: value for "bootargs"
 * @epow_irq:       passed to spapr_events_fdt_skel()
 *
 * Returns a g_malloc0()ed buffer of FDT_MAX_SIZE bytes holding the
 * finished tree.  Any libfdt failure terminates the process via _FDT().
 */
static void *spapr_create_fdt_skel(const char *cpu_model,
                                   hwaddr initrd_base,
                                   hwaddr initrd_size,
                                   hwaddr kernel_size,
                                   bool little_endian,
                                   const char *boot_device,
                                   const char *kernel_cmdline,
                                   uint32_t epow_irq)
{
    void *fdt;
    CPUState *cs;
    uint32_t start_prop = cpu_to_be32(initrd_base);
    uint32_t end_prop = cpu_to_be32(initrd_base + initrd_size);
    char hypertas_prop[] = "hcall-pft\0hcall-term\0hcall-dabr\0hcall-interrupt"
        "\0hcall-tce\0hcall-vio\0hcall-splpar\0hcall-bulk\0hcall-set-mode";
    char qemu_hypertas_prop[] = "hcall-memop1";
    uint32_t refpoints[] = {cpu_to_be32(0x4), cpu_to_be32(0x4)};
    uint32_t interrupt_server_ranges_prop[] = {0, cpu_to_be32(smp_cpus)};
    char *modelname;
    char *p;
    int i, smt = kvmppc_smt_threads();
    unsigned char vec5[] = {0x0, 0x0, 0x0, 0x0, 0x0, 0x80};

    fdt = g_malloc0(FDT_MAX_SIZE);
    _FDT((fdt_create(fdt, FDT_MAX_SIZE)));

    if (kernel_size) {
        _FDT((fdt_add_reservemap_entry(fdt, KERNEL_LOAD_ADDR, kernel_size)));
    }
    if (initrd_size) {
        _FDT((fdt_add_reservemap_entry(fdt, initrd_base, initrd_size)));
    }
    _FDT((fdt_finish_reservemap(fdt)));

    /* Root node */
    _FDT((fdt_begin_node(fdt, "")));
    _FDT((fdt_property_string(fdt, "device_type", "chrp")));
    _FDT((fdt_property_string(fdt, "model", "IBM pSeries (emulated by qemu)")));
    _FDT((fdt_property_string(fdt, "compatible", "qemu,pseries")));

    _FDT((fdt_property_cell(fdt, "#address-cells", 0x2)));
    _FDT((fdt_property_cell(fdt, "#size-cells", 0x2)));

    /* /chosen */
    _FDT((fdt_begin_node(fdt, "chosen")));

    /* Set Form1_affinity */
    _FDT((fdt_property(fdt, "ibm,architecture-vec-5", vec5, sizeof(vec5))));

    _FDT((fdt_property_string(fdt, "bootargs", kernel_cmdline)));
    _FDT((fdt_property(fdt, "linux,initrd-start",
                       &start_prop, sizeof(start_prop))));
    _FDT((fdt_property(fdt, "linux,initrd-end",
                       &end_prop, sizeof(end_prop))));
    if (kernel_size) {
        uint64_t kprop[2] = { cpu_to_be64(KERNEL_LOAD_ADDR),
                              cpu_to_be64(kernel_size) };

        _FDT((fdt_property(fdt, "qemu,boot-kernel", &kprop, sizeof(kprop))));
        if (little_endian) {
            _FDT((fdt_property(fdt, "qemu,boot-kernel-le", NULL, 0)));
        }
    }
    if (boot_device) {
        _FDT((fdt_property_string(fdt, "qemu,boot-device", boot_device)));
    }
    _FDT((fdt_property_cell(fdt, "qemu,graphic-width", graphic_width)));
    _FDT((fdt_property_cell(fdt, "qemu,graphic-height", graphic_height)));
    _FDT((fdt_property_cell(fdt, "qemu,graphic-depth", graphic_depth)));

    _FDT((fdt_end_node(fdt)));

    /* cpus */
    _FDT((fdt_begin_node(fdt, "cpus")));

    _FDT((fdt_property_cell(fdt, "#address-cells", 0x1)));
    _FDT((fdt_property_cell(fdt, "#size-cells", 0x0)));

    modelname = g_strdup(cpu_model);

    /* Upper-case in place.  toupper() needs an unsigned char value:
     * passing a negative plain char is undefined behaviour. */
    for (p = modelname; *p; p++) {
        *p = toupper((unsigned char)*p);
    }

    /* This is needed during FDT finalization */
    spapr->cpu_model = g_strdup(modelname);

    CPU_FOREACH(cs) {
        PowerPCCPU *cpu = POWERPC_CPU(cs);
        CPUPPCState *env = &cpu->env;
        PowerPCCPUClass *pcc = POWERPC_CPU_GET_CLASS(cs);
        int index = cs->cpu_index;
        uint32_t servers_prop[smp_threads];
        uint32_t gservers_prop[smp_threads * 2];
        char *nodename;
        uint32_t segs[] = {cpu_to_be32(28), cpu_to_be32(40),
                           0xffffffff, 0xffffffff};
        uint32_t tbfreq = kvm_enabled() ? kvmppc_get_tbfreq() : TIMEBASE_FREQ;
        uint32_t cpufreq = kvm_enabled() ? kvmppc_get_clockfreq() : 1000000000;
        uint32_t page_sizes_prop[64];
        size_t page_sizes_prop_size;

        /* One node per core: skip everything but the first thread */
        if ((index % smt) != 0) {
            continue;
        }

        nodename = g_strdup_printf("%s@%x", modelname, index);

        _FDT((fdt_begin_node(fdt, nodename)));

        g_free(nodename);

        _FDT((fdt_property_cell(fdt, "reg", index)));
        _FDT((fdt_property_string(fdt, "device_type", "cpu")));

        _FDT((fdt_property_cell(fdt, "cpu-version", env->spr[SPR_PVR])));
        _FDT((fdt_property_cell(fdt, "d-cache-block-size",
                                env->dcache_line_size)));
        _FDT((fdt_property_cell(fdt, "d-cache-line-size",
                                env->dcache_line_size)));
        _FDT((fdt_property_cell(fdt, "i-cache-block-size",
                                env->icache_line_size)));
        _FDT((fdt_property_cell(fdt, "i-cache-line-size",
                                env->icache_line_size)));

        if (pcc->l1_dcache_size) {
            _FDT((fdt_property_cell(fdt, "d-cache-size", pcc->l1_dcache_size)));
        } else {
            fprintf(stderr, "Warning: Unknown L1 dcache size for cpu\n");
        }
        if (pcc->l1_icache_size) {
            _FDT((fdt_property_cell(fdt, "i-cache-size", pcc->l1_icache_size)));
        } else {
            fprintf(stderr, "Warning: Unknown L1 icache size for cpu\n");
        }

        _FDT((fdt_property_cell(fdt, "timebase-frequency", tbfreq)));
        _FDT((fdt_property_cell(fdt, "clock-frequency", cpufreq)));
        _FDT((fdt_property_cell(fdt, "ibm,slb-size", env->slb_nr)));
        _FDT((fdt_property_string(fdt, "status", "okay")));
        _FDT((fdt_property(fdt, "64-bit", NULL, 0)));

        /* Build interrupt servers and gservers properties */
        for (i = 0; i < smp_threads; i++) {
            servers_prop[i] = cpu_to_be32(index + i);
            /* Hack, direct the group queues back to cpu 0 */
            gservers_prop[i*2] = cpu_to_be32(index + i);
            gservers_prop[i*2 + 1] = 0;
        }
        _FDT((fdt_property(fdt, "ibm,ppc-interrupt-server#s",
                           servers_prop, sizeof(servers_prop))));
        _FDT((fdt_property(fdt, "ibm,ppc-interrupt-gserver#s",
                           gservers_prop, sizeof(gservers_prop))));

        if (env->spr_cb[SPR_PURR].oea_read) {
            _FDT((fdt_property(fdt, "ibm,purr", NULL, 0)));
        }

        if (env->mmu_model & POWERPC_MMU_1TSEG) {
            _FDT((fdt_property(fdt, "ibm,processor-segment-sizes",
                               segs, sizeof(segs))));
        }

        /* Advertise VMX/VSX (vector extensions) if available
         *   0 / no property == no vector extensions
         *   1               == VMX / Altivec available
         *   2               == VSX available */
        if (env->insns_flags & PPC_ALTIVEC) {
            uint32_t vmx = (env->insns_flags2 & PPC2_VSX) ? 2 : 1;

            _FDT((fdt_property_cell(fdt, "ibm,vmx", vmx)));
        }

        /* Advertise DFP (Decimal Floating Point) if available
         *   0 / no property == no DFP
         *   1               == DFP available */
        if (env->insns_flags2 & PPC2_DFP) {
            _FDT((fdt_property_cell(fdt, "ibm,dfp", 1)));
        }

        page_sizes_prop_size = create_page_sizes_prop(env, page_sizes_prop,
                                                      sizeof(page_sizes_prop));
        if (page_sizes_prop_size) {
            _FDT((fdt_property(fdt, "ibm,segment-page-sizes",
                               page_sizes_prop, page_sizes_prop_size)));
        }

        _FDT((fdt_end_node(fdt)));
    }

    g_free(modelname);

    _FDT((fdt_end_node(fdt)));

    /* RTAS */
    _FDT((fdt_begin_node(fdt, "rtas")));

    _FDT((fdt_property(fdt, "ibm,hypertas-functions", hypertas_prop,
                       sizeof(hypertas_prop))));
    _FDT((fdt_property(fdt, "qemu,hypertas-functions", qemu_hypertas_prop,
                       sizeof(qemu_hypertas_prop))));

    _FDT((fdt_property(fdt, "ibm,associativity-reference-points",
        refpoints, sizeof(refpoints))));

    _FDT((fdt_property_cell(fdt, "rtas-error-log-max", RTAS_ERROR_LOG_MAX)));

    _FDT((fdt_end_node(fdt)));

    /* interrupt controller */
    _FDT((fdt_begin_node(fdt, "interrupt-controller")));

    _FDT((fdt_property_string(fdt, "device_type",
                              "PowerPC-External-Interrupt-Presentation")));
    _FDT((fdt_property_string(fdt, "compatible", "IBM,ppc-xicp")));
    _FDT((fdt_property(fdt, "interrupt-controller", NULL, 0)));
    _FDT((fdt_property(fdt, "ibm,interrupt-server-ranges",
                       interrupt_server_ranges_prop,
                       sizeof(interrupt_server_ranges_prop))));
    _FDT((fdt_property_cell(fdt, "#interrupt-cells", 2)));
    _FDT((fdt_property_cell(fdt, "linux,phandle", PHANDLE_XICP)));
    _FDT((fdt_property_cell(fdt, "phandle", PHANDLE_XICP)));

    _FDT((fdt_end_node(fdt)));

    /* vdevice */
    _FDT((fdt_begin_node(fdt, "vdevice")));

    _FDT((fdt_property_string(fdt, "device_type", "vdevice")));
    _FDT((fdt_property_string(fdt, "compatible", "IBM,vdevice")));
    _FDT((fdt_property_cell(fdt, "#address-cells", 0x1)));
    _FDT((fdt_property_cell(fdt, "#size-cells", 0x0)));
    _FDT((fdt_property_cell(fdt, "#interrupt-cells", 0x2)));
    _FDT((fdt_property(fdt, "interrupt-controller", NULL, 0)));

    _FDT((fdt_end_node(fdt)));

    /* event-sources */
    spapr_events_fdt_skel(fdt, epow_irq);

    _FDT((fdt_end_node(fdt))); /* close root node */
    _FDT((fdt_finish(fdt)));

    return fdt;
}

536 537 538 539 540 541
/*
 * Add the memory nodes to the device tree.
 *
 * The RMA is clamped to the size of node 0 and described by "memory@0".
 * The remainder of node 0, if any, and every further NUMA node each get
 * their own "memory@<start>" node with matching "ibm,associativity".
 *
 * Always returns 0; libfdt failures terminate the process via _FDT().
 */
static int spapr_populate_memory(sPAPREnvironment *spapr, void *fdt)
{
    uint32_t associativity[] = {cpu_to_be32(0x4), cpu_to_be32(0x0),
                                cpu_to_be32(0x0), cpu_to_be32(0x0),
                                cpu_to_be32(0x0)};
    char mem_name[32];
    hwaddr node0_size, mem_start;
    uint64_t mem_reg_property[2];
    int i, off;

    /* memory node(s) */
    node0_size = (nb_numa_nodes > 1) ? node_mem[0] : ram_size;
    if (spapr->rma_size > node0_size) {
        spapr->rma_size = node0_size;
    }

    /* RMA */
    mem_reg_property[0] = 0;
    mem_reg_property[1] = cpu_to_be64(spapr->rma_size);
    off = fdt_add_subnode(fdt, 0, "memory@0");
    _FDT(off);
    _FDT((fdt_setprop_string(fdt, off, "device_type", "memory")));
    _FDT((fdt_setprop(fdt, off, "reg", mem_reg_property,
                      sizeof(mem_reg_property))));
    _FDT((fdt_setprop(fdt, off, "ibm,associativity", associativity,
                      sizeof(associativity))));

    /* RAM: Node 0 */
    if (node0_size > spapr->rma_size) {
        mem_reg_property[0] = cpu_to_be64(spapr->rma_size);
        mem_reg_property[1] = cpu_to_be64(node0_size - spapr->rma_size);

        /* Bounded formatting: mem_name is a fixed-size stack buffer */
        snprintf(mem_name, sizeof(mem_name), "memory@" TARGET_FMT_lx,
                 spapr->rma_size);
        off = fdt_add_subnode(fdt, 0, mem_name);
        _FDT(off);
        _FDT((fdt_setprop_string(fdt, off, "device_type", "memory")));
        _FDT((fdt_setprop(fdt, off, "reg", mem_reg_property,
                          sizeof(mem_reg_property))));
        _FDT((fdt_setprop(fdt, off, "ibm,associativity", associativity,
                          sizeof(associativity))));
    }

    /* RAM: Node 1 and beyond */
    mem_start = node0_size;
    for (i = 1; i < nb_numa_nodes; i++) {
        mem_reg_property[0] = cpu_to_be64(mem_start);
        mem_reg_property[1] = cpu_to_be64(node_mem[i]);
        associativity[3] = associativity[4] = cpu_to_be32(i);
        snprintf(mem_name, sizeof(mem_name), "memory@" TARGET_FMT_lx,
                 mem_start);
        off = fdt_add_subnode(fdt, 0, mem_name);
        _FDT(off);
        _FDT((fdt_setprop_string(fdt, off, "device_type", "memory")));
        _FDT((fdt_setprop(fdt, off, "reg", mem_reg_property,
                          sizeof(mem_reg_property))));
        _FDT((fdt_setprop(fdt, off, "ibm,associativity", associativity,
                          sizeof(associativity))));
        mem_start += node_mem[i];
    }

    return 0;
}

598
/*
 * Produce the final device tree and copy it into guest memory.
 *
 * The skeleton tree is opened into a temporary buffer, completed with
 * the memory, VIO, PCI, RTAS and per-CPU information, packed, and then
 * written to guest physical address @fdt_addr.
 *
 * Failures while populating memory, VIO or PCI nodes are fatal; RTAS
 * and CPU fix-up failures are only reported.
 *
 * @spapr:     machine environment (source of the skeleton tree)
 * @fdt_addr:  guest physical address the tree is written to
 * @rtas_addr: passed to spapr_rtas_device_tree_setup()
 * @rtas_size: passed to spapr_rtas_device_tree_setup()
 */
static void spapr_finalize_fdt(sPAPREnvironment *spapr,
                               hwaddr fdt_addr,
                               hwaddr rtas_addr,
                               hwaddr rtas_size)
{
    int ret;
    void *fdt;
    sPAPRPHBState *phb;

    fdt = g_malloc(FDT_MAX_SIZE);

    /* open out the base tree into a temp buffer for the final tweaks */
    _FDT((fdt_open_into(spapr->fdt_skel, fdt, FDT_MAX_SIZE)));

    ret = spapr_populate_memory(spapr, fdt);
    if (ret < 0) {
        fprintf(stderr, "couldn't setup memory nodes in fdt\n");
        exit(1);
    }

    ret = spapr_populate_vdevice(spapr->vio_bus, fdt);
    if (ret < 0) {
        fprintf(stderr, "couldn't setup vio devices in fdt\n");
        exit(1);
    }

    QLIST_FOREACH(phb, &spapr->phbs, list) {
        ret = spapr_populate_pci_dt(phb, PHANDLE_XICP, fdt);
        /* Check every PHB: testing only after the loop would let a
         * later success hide an earlier failure */
        if (ret < 0) {
            fprintf(stderr, "couldn't setup PCI devices in fdt\n");
            exit(1);
        }
    }

    /* RTAS */
    ret = spapr_rtas_device_tree_setup(fdt, rtas_addr, rtas_size);
    if (ret < 0) {
        fprintf(stderr, "Couldn't set up RTAS device tree properties\n");
    }

    /* Advertise NUMA via ibm,associativity */
    ret = spapr_fixup_cpu_dt(fdt, spapr);
    if (ret < 0) {
        fprintf(stderr, "Couldn't finalize CPU device tree properties\n");
    }

    if (!spapr->has_graphics) {
        spapr_populate_chosen_stdout(fdt, spapr->vio_bus);
    }

    _FDT((fdt_pack(fdt)));

    if (fdt_totalsize(fdt) > FDT_MAX_SIZE) {
        hw_error("FDT too big ! 0x%x bytes (max is 0x%x)\n",
                 fdt_totalsize(fdt), FDT_MAX_SIZE);
        exit(1);
    }

    cpu_physical_memory_write(fdt_addr, fdt, fdt_totalsize(fdt));

    g_free(fdt);
}

/*
 * Address translation callback: keep the low 28 bits of @addr and
 * rebase them onto KERNEL_LOAD_ADDR.  @opaque is unused.
 */
static uint64_t translate_kernel_address(void *opaque, uint64_t addr)
{
    const uint64_t low_bits = addr & 0x0fffffff;

    return low_bits + KERNEL_LOAD_ADDR;
}

667
/*
 * Handle a hypercall issued by @cpu.
 *
 * The hypercall number is taken from GPR3 and the arguments start at
 * GPR4; the result is stored back into GPR3.  A call made while msr_pr
 * is set is rejected with H_PRIVILEGE.
 */
static void emulate_spapr_hypercall(PowerPCCPU *cpu)
{
    /* NOTE(review): msr_pr appears to be a macro that reads 'env' */
    CPUPPCState *env = &cpu->env;

    if (!msr_pr) {
        env->gpr[3] = spapr_hypercall(cpu, env->gpr[3], &env->gpr[4]);
        return;
    }

    hcall_dprintf("Hypercall made with MSR[PR]=1\n");
    env->gpr[3] = H_PRIVILEGE;
}

679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705
/*
 * Reset the guest hash page table.
 *
 * kvmppc_reset_htab() is asked first; a positive result is taken as the
 * new htab shift and no table is allocated here.  Otherwise a table of
 * HTAB_SIZE bytes is allocated on first use and zeroed.  Finally the RMA
 * size is recomputed when vrma_adjust is set.
 */
static void spapr_reset_htab(sPAPREnvironment *spapr)
{
    /* allocate hash page table.  For now we always make this 16mb,
     * later we should probably make it scale to the size of guest
     * RAM */
    long kvm_shift = kvmppc_reset_htab(spapr->htab_shift);

    if (kvm_shift > 0) {
        /* Kernel handles htab, we don't need to allocate one */
        spapr->htab_shift = kvm_shift;
    } else {
        /* Allocate an htab if we don't yet have one */
        if (spapr->htab == NULL) {
            spapr->htab = qemu_memalign(HTAB_SIZE(spapr), HTAB_SIZE(spapr));
        }

        /* And clear it */
        memset(spapr->htab, 0, HTAB_SIZE(spapr));
    }

    /* Update the RMA size if necessary */
    if (spapr->vrma_adjust) {
        spapr->rma_size = kvmppc_rma_size(ram_size, spapr->htab_shift);
    }
}

708
static void ppc_spapr_reset(void)
709
{
710
    PowerPCCPU *first_ppc_cpu;
711

712 713
    /* Reset the hash table & recalc the RMA */
    spapr_reset_htab(spapr);
714

715
    qemu_devices_reset();
716 717 718 719 720 721

    /* Load the fdt */
    spapr_finalize_fdt(spapr, spapr->fdt_addr, spapr->rtas_addr,
                       spapr->rtas_size);

    /* Set up the entry state */
722 723 724 725 726
    first_ppc_cpu = POWERPC_CPU(first_cpu);
    first_ppc_cpu->env.gpr[3] = spapr->fdt_addr;
    first_ppc_cpu->env.gpr[5] = 0;
    first_cpu->halted = 0;
    first_ppc_cpu->env.nip = spapr->entry_point;
727 728 729

}

730 731
static void spapr_cpu_reset(void *opaque)
{
732
    PowerPCCPU *cpu = opaque;
733
    CPUState *cs = CPU(cpu);
734
    CPUPPCState *env = &cpu->env;
735

736
    cpu_reset(cs);
737 738 739 740

    /* All CPUs start halted.  CPU0 is unhalted from the machine level
     * reset code and the rest are explicitly started up by the guest
     * using an RTAS call */
741
    cs->halted = 1;
742 743

    env->spr[SPR_HIOR] = 0;
744

745
    env->external_htab = (uint8_t *)spapr->htab;
746 747
    env->htab_base = -1;
    env->htab_mask = HTAB_SIZE(spapr) - 1;
748
    env->spr[SPR_SDR1] = (target_ulong)(uintptr_t)spapr->htab |
749
        (spapr->htab_shift - 18);
750 751
}

D
David Gibson 已提交
752 753
/*
 * Create the "spapr-nvram" device on the VIO bus and record it in
 * spapr->nvram.
 *
 * If the "nvram" machine option names a drive, that block device is
 * attached as the device's "drive" property; an unknown drive name is a
 * fatal error.
 */
static void spapr_create_nvram(sPAPREnvironment *spapr)
{
    DeviceState *nvram_dev = qdev_create(&spapr->vio_bus->bus, "spapr-nvram");
    const char *drivename = qemu_opt_get(qemu_get_machine_opts(), "nvram");

    if (drivename != NULL) {
        BlockDriverState *backing = bdrv_find(drivename);

        if (backing == NULL) {
            fprintf(stderr, "No such block device \"%s\" for nvram\n",
                    drivename);
            exit(1);
        }
        qdev_prop_set_drive_nofail(nvram_dev, "drive", backing);
    }

    qdev_init_nofail(nvram_dev);

    spapr->nvram = (struct sPAPRNVRAM *)nvram_dev;
}

774
/* Returns whether we want to use VGA or not */
775 776
static int spapr_vga_init(PCIBus *pci_bus)
{
777 778
    switch (vga_interface_type) {
    case VGA_NONE:
779 780
    case VGA_STD:
        return pci_vga_init(pci_bus) != NULL;
781
    default:
782 783
        fprintf(stderr, "This vga model is not supported,"
                "currently it only supports -vga std\n");
784 785
        exit(0);
        break;
786 787 788
    }
}

789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815
/*
 * Migration description for the machine-level sPAPR state.
 * The fields below are taken from sPAPREnvironment; their order and
 * types define the saved format, so they must not be rearranged.
 */
static const VMStateDescription vmstate_spapr = {
    .name = "spapr",
    .version_id = 1,
    .minimum_version_id = 1,
    .minimum_version_id_old = 1,
    .fields      = (VMStateField []) {
        /* IRQ allocator cursor, see spapr_allocate_irq() */
        VMSTATE_UINT32(next_irq, sPAPREnvironment),

        /* RTC offset */
        VMSTATE_UINT64(rtc_offset, sPAPREnvironment),

        VMSTATE_END_OF_LIST()
    },
};

/*
 * Helpers for walking the hash page table as an array of entries made
 * of two 64-bit words each.
 *
 * HPTE(table, i)   - pointer to the i-th entry of the table
 * HPTE_VALID(hpte) - non-zero if HPTE64_V_VALID is set in the entry's
 *                    first word (read through tswap64())
 * HPTE_DIRTY(hpte) - non-zero if HPTE64_V_HPTE_DIRTY is set there
 * CLEAN_HPTE(hpte) - clear HPTE64_V_HPTE_DIRTY in the entry's first word
 */
#define HPTE(_table, _i)   (void *)(((uint64_t *)(_table)) + ((_i) * 2))
#define HPTE_VALID(_hpte)  (tswap64(*((uint64_t *)(_hpte))) & HPTE64_V_VALID)
#define HPTE_DIRTY(_hpte)  (tswap64(*((uint64_t *)(_hpte))) & HPTE64_V_HPTE_DIRTY)
#define CLEAN_HPTE(_hpte)  ((*(uint64_t *)(_hpte)) &= tswap64(~HPTE64_V_HPTE_DIRTY))

/*
 * Start saving the hash page table to migration stream @f.
 *
 * Writes the htab shift as a header.  When the table lives in QEMU
 * memory the first-pass state is initialised; otherwise KVM must be in
 * use and a file descriptor for reading the table is obtained from it.
 *
 * @opaque is the sPAPREnvironment.  Returns 0 on success, -1 if the
 * KVM file descriptor could not be opened.
 */
static int htab_save_setup(QEMUFile *f, void *opaque)
{
    sPAPREnvironment *s = opaque;

    /* "Iteration" header */
    qemu_put_be32(f, s->htab_shift);

    if (s->htab) {
        s->htab_save_index = 0;
        s->htab_first_pass = true;
        return 0;
    }

    assert(kvm_enabled());

    s->htab_fd = kvmppc_get_htab_fd(false);
    if (s->htab_fd < 0) {
        fprintf(stderr, "Unable to open fd for reading hash table from KVM: %s\n",
                strerror(errno));
        return -1;
    }

    return 0;
}

/*
 * First migration pass over the hash page table: send every valid HPTE
 * and clear the dirty bit of each entry visited.
 *
 * Starting at spapr->htab_save_index, runs of valid entries are written
 * to @f as chunks of (start index, number of valid entries, 0 invalid
 * entries, raw entry data).  The pass stops when the table end is
 * reached, when @max_ns nanoseconds have elapsed after a chunk, or when
 * the stream's rate limit kicks in; the position is saved so the next
 * call resumes there.  Reaching the end switches to the later passes.
 */
static void htab_save_first_pass(QEMUFile *f, sPAPREnvironment *spapr,
                                 int64_t max_ns)
{
    int htabslots = HTAB_SIZE(spapr) / HASH_PTE_SIZE_64;
    int index = spapr->htab_save_index;
    int64_t starttime = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);

    assert(spapr->htab_first_pass);

    do {
        int chunkstart;

        /* Consume invalid HPTEs */
        while ((index < htabslots)
               && !HPTE_VALID(HPTE(spapr->htab, index))) {
            /* Clean the entry just examined, then advance: doing it the
             * other way round touches the wrong entry and writes past
             * the end of the table on the last slot */
            CLEAN_HPTE(HPTE(spapr->htab, index));
            index++;
        }

        /* Consume valid HPTEs; the chunk length is sent as a 16-bit
         * value, so never let a single chunk grow beyond that */
        chunkstart = index;
        while ((index < htabslots) && (index - chunkstart < UINT16_MAX)
               && HPTE_VALID(HPTE(spapr->htab, index))) {
            CLEAN_HPTE(HPTE(spapr->htab, index));
            index++;
        }

        if (index > chunkstart) {
            int n_valid = index - chunkstart;

            qemu_put_be32(f, chunkstart);
            qemu_put_be16(f, n_valid);
            qemu_put_be16(f, 0);
            qemu_put_buffer(f, HPTE(spapr->htab, chunkstart),
                            HASH_PTE_SIZE_64 * n_valid);

            if ((qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - starttime) > max_ns) {
                break;
            }
        }
    } while ((index < htabslots) && !qemu_file_rate_limit(f));

    if (index >= htabslots) {
        assert(index == htabslots);
        index = 0;
        spapr->htab_first_pass = false;
    }
    spapr->htab_save_index = index;
}

884 885
/*
 * Subsequent pass of the live hash table save: walk the table from the
 * saved position and emit only the HPTEs flagged dirty, as chunks made of a
 * run of valid dirty entries followed by a run of invalid dirty entries.
 *
 * @f:      migration stream the chunks are written to
 * @spapr:  machine state; htab_save_index is updated with the resume position
 * @max_ns: time budget in QEMU_CLOCK_REALTIME nanoseconds; a negative value
 *          marks the final pass, which ignores both the time budget and the
 *          stream rate limit
 *
 * Returns 1 when every slot was examined and nothing had to be sent,
 * 0 otherwise.
 */
static int htab_save_later_pass(QEMUFile *f, sPAPREnvironment *spapr,
                                int64_t max_ns)
{
    bool final = max_ns < 0;
    int htabslots = HTAB_SIZE(spapr) / HASH_PTE_SIZE_64;
    int examined = 0, sent = 0;
    int index = spapr->htab_save_index;
    int64_t starttime = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);

    assert(!spapr->htab_first_pass);

    do {
        int chunkstart, invalidstart;

        /* Consume non-dirty HPTEs */
        while ((index < htabslots)
               && !HPTE_DIRTY(HPTE(spapr->htab, index))) {
            index++;
            examined++;
        }

        chunkstart = index;
        /* Consume valid dirty HPTEs.  The count is transmitted in 16 bits,
         * so cap the run; a longer run continues in the next chunk. */
        while ((index < htabslots) && (index - chunkstart < UINT16_MAX)
               && HPTE_DIRTY(HPTE(spapr->htab, index))
               && HPTE_VALID(HPTE(spapr->htab, index))) {
            CLEAN_HPTE(HPTE(spapr->htab, index));
            index++;
            examined++;
        }

        invalidstart = index;
        /* Consume invalid dirty HPTEs, with the same 16-bit cap */
        while ((index < htabslots) && (index - invalidstart < UINT16_MAX)
               && HPTE_DIRTY(HPTE(spapr->htab, index))
               && !HPTE_VALID(HPTE(spapr->htab, index))) {
            CLEAN_HPTE(HPTE(spapr->htab, index));
            index++;
            examined++;
        }

        if (index > chunkstart) {
            int n_valid = invalidstart - chunkstart;
            int n_invalid = index - invalidstart;

            /* Only the valid entries carry a payload; the invalid ones are
             * described by their count alone. */
            qemu_put_be32(f, chunkstart);
            qemu_put_be16(f, n_valid);
            qemu_put_be16(f, n_invalid);
            qemu_put_buffer(f, HPTE(spapr->htab, chunkstart),
                            HASH_PTE_SIZE_64 * n_valid);
            sent += index - chunkstart;

            if (!final && (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - starttime) > max_ns) {
                break;
            }
        }

        if (examined >= htabslots) {
            break;
        }

        /* Wrap around so that slots before the starting position get
         * examined too. */
        if (index >= htabslots) {
            assert(index == htabslots);
            index = 0;
        }
    } while ((examined < htabslots) && (!qemu_file_rate_limit(f) || final));

    if (index >= htabslots) {
        assert(index == htabslots);
        index = 0;
    }

    spapr->htab_save_index = index;

    return (examined >= htabslots) && (sent == 0) ? 1 : 0;
}

961 962 963
/* Time budget handed to each live hash table save iteration */
#define MAX_ITERATION_NS    5000000 /* 5 ms */
/* Buffer size argument passed to kvmppc_save_htab().
 * NOTE(review): presumably a byte count -- confirm against kvm_ppc.h. */
#define MAX_KVM_BUF_SIZE    2048

964 965 966
/*
 * Live-save iteration callback for the hash table section.
 *
 * Writes a zero iteration header, one batch of table data and the all-zero
 * end marker.  When QEMU holds no table of its own (spapr->htab is NULL) the
 * data is pulled through kvmppc_save_htab(); otherwise the first or a later
 * pass over the in-QEMU table is run.
 *
 * Returns a negative value if kvmppc_save_htab() fails (no end marker is
 * written in that case); otherwise the result of kvmppc_save_htab() or
 * htab_save_later_pass(), or 0 after a first pass.
 */
static int htab_save_iterate(QEMUFile *f, void *opaque)
{
    sPAPREnvironment *spapr = opaque;
    int rc = 0;

    /* Iteration header */
    qemu_put_be32(f, 0);

    if (spapr->htab) {
        if (spapr->htab_first_pass) {
            htab_save_first_pass(f, spapr, MAX_ITERATION_NS);
        } else {
            rc = htab_save_later_pass(f, spapr, MAX_ITERATION_NS);
        }
    } else {
        assert(kvm_enabled());

        rc = kvmppc_save_htab(f, spapr->htab_fd,
                              MAX_KVM_BUF_SIZE, MAX_ITERATION_NS);
        if (rc < 0) {
            return rc;
        }
    }

    /* End marker */
    qemu_put_be32(f, 0);
    qemu_put_be16(f, 0);
    qemu_put_be16(f, 0);

    return rc;
}

/*
 * Live-save completion callback for the hash table section.
 *
 * Writes a zero iteration header, the remaining table data (with no time
 * or rate limit) and the all-zero end marker.
 *
 * Returns 0 on success, or the negative value from kvmppc_save_htab() on
 * failure.  In the KVM case the table fd is closed on both outcomes.
 */
static int htab_save_complete(QEMUFile *f, void *opaque)
{
    sPAPREnvironment *spapr = opaque;

    /* Iteration header */
    qemu_put_be32(f, 0);

    if (!spapr->htab) {
        int rc;

        assert(kvm_enabled());

        rc = kvmppc_save_htab(f, spapr->htab_fd, MAX_KVM_BUF_SIZE, -1);

        /* This is the last use of the fd whether or not the save worked;
         * close it on the failure path too so it is not leaked. */
        close(spapr->htab_fd);
        spapr->htab_fd = -1;

        if (rc < 0) {
            return rc;
        }
    } else {
        /* A negative time budget selects the final, unbounded pass */
        htab_save_later_pass(f, spapr, -1);
    }

    /* End marker */
    qemu_put_be32(f, 0);
    qemu_put_be16(f, 0);
    qemu_put_be16(f, 0);

    return 0;
}

/*
 * Load callback for the hash table section.
 *
 * @f:          incoming migration stream
 * @opaque:     the sPAPREnvironment the table belongs to
 * @version_id: section version; only version 1 is accepted
 *
 * A non-zero leading word is the setup section and carries only the hash
 * table shift, which must match the local one.  A zero leading word starts
 * a sequence of chunks (be32 index, be16 valid count, be16 invalid count,
 * then the valid HPTEs) terminated by an all-zero chunk header.
 *
 * Returns 0 on success, -EINVAL for a bad version, shift mismatch or
 * out-of-range chunk, -1 if the KVM table fd cannot be opened, or the
 * negative value reported by kvmppc_load_htab_chunk().
 */
static int htab_load(QEMUFile *f, void *opaque, int version_id)
{
    sPAPREnvironment *spapr = opaque;
    uint32_t section_hdr;
    int fd = -1;
    int ret = 0;

    if (version_id < 1 || version_id > 1) {
        fprintf(stderr, "htab_load() bad version\n");
        return -EINVAL;
    }

    section_hdr = qemu_get_be32(f);

    if (section_hdr) {
        /* First section, just the hash shift */
        if (spapr->htab_shift != section_hdr) {
            return -EINVAL;
        }
        return 0;
    }

    if (!spapr->htab) {
        assert(kvm_enabled());

        fd = kvmppc_get_htab_fd(true);
        if (fd < 0) {
            fprintf(stderr, "Unable to open fd to restore KVM hash table: %s\n",
                    strerror(errno));
            /* Fail the load instead of tripping the assert further down */
            return -1;
        }
    }

    while (true) {
        uint32_t index;
        uint16_t n_valid, n_invalid;

        index = qemu_get_be32(f);
        n_valid = qemu_get_be16(f);
        n_invalid = qemu_get_be16(f);

        if ((index == 0) && (n_valid == 0) && (n_invalid == 0)) {
            /* End of Stream */
            break;
        }

        /* The stream is external input: do the range check in 64 bits so
         * that a huge index cannot wrap around and pass the test. */
        if (((uint64_t)index + n_valid + n_invalid) >
            (HTAB_SIZE(spapr) / HASH_PTE_SIZE_64)) {
            /* Bad index in stream */
            fprintf(stderr, "htab_load() bad index %u (%hu+%hu entries) "
                    "in htab stream (htab_shift=%d)\n", index, n_valid, n_invalid,
                    spapr->htab_shift);
            ret = -EINVAL;
            break;
        }

        if (spapr->htab) {
            if (n_valid) {
                qemu_get_buffer(f, HPTE(spapr->htab, index),
                                HASH_PTE_SIZE_64 * n_valid);
            }
            if (n_invalid) {
                /* Invalid entries carry no payload; just zero them */
                memset(HPTE(spapr->htab, index + n_valid), 0,
                       HASH_PTE_SIZE_64 * n_invalid);
            }
        } else {
            assert(fd >= 0);

            ret = kvmppc_load_htab_chunk(f, fd, index, n_valid, n_invalid);
            if (ret < 0) {
                break;
            }
            ret = 0;
        }
    }

    /* Single exit point for the chunk loop so the KVM fd is released on
     * the error paths as well. */
    if (fd >= 0) {
        close(fd);
    }

    return ret;
}

/*
 * Save/load callbacks for the hash table; ppc_spapr_init() registers this
 * table with register_savevm_live() under the "spapr/htab" name.
 */
static SaveVMHandlers savevm_htab_handlers = {
    .save_live_setup = htab_save_setup,
    .save_live_iterate = htab_save_iterate,
    .save_live_complete = htab_save_complete,
    .load_state = htab_load,
};

1113
/* pSeries LPAR / sPAPR hardware init */
1114
static void ppc_spapr_init(QEMUMachineInitArgs *args)
1115
{
1116 1117 1118 1119 1120
    ram_addr_t ram_size = args->ram_size;
    const char *cpu_model = args->cpu_model;
    const char *kernel_filename = args->kernel_filename;
    const char *kernel_cmdline = args->kernel_cmdline;
    const char *initrd_filename = args->initrd_filename;
1121
    const char *boot_device = args->boot_order;
1122
    PowerPCCPU *cpu;
A
Andreas Färber 已提交
1123
    CPUPPCState *env;
1124
    PCIHostState *phb;
1125
    int i;
A
Avi Kivity 已提交
1126 1127
    MemoryRegion *sysmem = get_system_memory();
    MemoryRegion *ram = g_new(MemoryRegion, 1);
A
Avi Kivity 已提交
1128
    hwaddr rma_alloc_size;
1129 1130 1131
    uint32_t initrd_base = 0;
    long kernel_size = 0, initrd_size = 0;
    long load_limit, rtas_limit, fw_size;
1132
    bool kernel_le = false;
1133
    char *filename;
1134

1135 1136
    msi_supported = true;

1137 1138 1139
    spapr = g_malloc0(sizeof(*spapr));
    QLIST_INIT(&spapr->phbs);

1140 1141
    cpu_ppc_hypercall = emulate_spapr_hypercall;

1142 1143 1144 1145 1146 1147 1148
    /* Allocate RMA if necessary */
    rma_alloc_size = kvmppc_alloc_rma("ppc_spapr.rma", sysmem);

    if (rma_alloc_size == -1) {
        hw_error("qemu: Unable to create RMA\n");
        exit(1);
    }
1149

1150
    if (rma_alloc_size && (rma_alloc_size < ram_size)) {
1151
        spapr->rma_size = rma_alloc_size;
1152
    } else {
1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167
        spapr->rma_size = ram_size;

        /* With KVM, we don't actually know whether KVM supports an
         * unbounded RMA (PR KVM) or is limited by the hash table size
         * (HV KVM using VRMA), so we always assume the latter
         *
         * In that case, we also limit the initial allocations for RTAS
         * etc... to 256M since we have no way to know what the VRMA size
         * is going to be as it depends on the size of the hash table
         * isn't determined yet.
         */
        if (kvm_enabled()) {
            spapr->vrma_adjust = 1;
            spapr->rma_size = MIN(spapr->rma_size, 0x10000000);
        }
1168 1169
    }

1170
    /* We place the device tree and RTAS just below either the top of the RMA,
1171 1172
     * or just below 2GB, whichever is lowere, so that it can be
     * processed with 32-bit real mode code if necessary */
1173
    rtas_limit = MIN(spapr->rma_size, 0x80000000);
1174 1175 1176
    spapr->rtas_addr = rtas_limit - RTAS_MAX_SIZE;
    spapr->fdt_addr = spapr->rtas_addr - FDT_MAX_SIZE;
    load_limit = spapr->fdt_addr - FW_OVERHEAD;
1177

1178 1179 1180 1181 1182 1183 1184 1185 1186 1187
    /* We aim for a hash table of size 1/128 the size of RAM.  The
     * normal rule of thumb is 1/64 the size of RAM, but that's much
     * more than needed for the Linux guests we support. */
    spapr->htab_shift = 18; /* Minimum architected size */
    while (spapr->htab_shift <= 46) {
        if ((1ULL << (spapr->htab_shift + 7)) >= ram_size) {
            break;
        }
        spapr->htab_shift++;
    }
1188

1189 1190 1191 1192 1193
    /* Set up Interrupt Controller before we create the VCPUs */
    spapr->icp = xics_system_init(smp_cpus * kvmppc_smt_threads() / smp_threads,
                                  XICS_IRQS);
    spapr->next_irq = XICS_IRQ_BASE;

1194 1195
    /* init CPUs */
    if (cpu_model == NULL) {
1196
        cpu_model = kvm_enabled() ? "host" : "POWER7";
1197 1198
    }
    for (i = 0; i < smp_cpus; i++) {
1199 1200
        cpu = cpu_ppc_init(cpu_model);
        if (cpu == NULL) {
1201 1202 1203
            fprintf(stderr, "Unable to find PowerPC CPU definition\n");
            exit(1);
        }
1204 1205
        env = &cpu->env;

1206 1207 1208
        /* Set time-base frequency to 512 MHz */
        cpu_ppc_tb_init(env, TIMEBASE_FREQ);

1209 1210 1211 1212
        /* PAPR always has exception vectors in RAM not ROM. To ensure this,
         * MSR[IP] should never be set.
         */
        env->msr_mask &= ~(1 << 6);
1213 1214 1215

        /* Tell KVM that we're in PAPR mode */
        if (kvm_enabled()) {
1216
            kvmppc_set_papr(cpu);
1217 1218
        }

1219 1220
        xics_cpu_setup(spapr->icp, cpu);

1221
        qemu_register_reset(spapr_cpu_reset, cpu);
1222 1223 1224
    }

    /* allocate RAM */
1225
    spapr->ram_limit = ram_size;
1226 1227 1228 1229
    if (spapr->ram_limit > rma_alloc_size) {
        ram_addr_t nonrma_base = rma_alloc_size;
        ram_addr_t nonrma_size = spapr->ram_limit - rma_alloc_size;

1230
        memory_region_init_ram(ram, NULL, "ppc_spapr.ram", nonrma_size);
1231
        vmstate_register_ram_global(ram);
1232 1233
        memory_region_add_subregion(sysmem, nonrma_base, ram);
    }
1234

1235
    filename = qemu_find_file(QEMU_FILE_TYPE_BIOS, "spapr-rtas.bin");
1236
    spapr->rtas_size = load_image_targphys(filename, spapr->rtas_addr,
1237
                                           rtas_limit - spapr->rtas_addr);
1238
    if (spapr->rtas_size < 0) {
1239 1240 1241
        hw_error("qemu: could not load LPAR rtas '%s'\n", filename);
        exit(1);
    }
1242 1243 1244 1245 1246
    if (spapr->rtas_size > RTAS_MAX_SIZE) {
        hw_error("RTAS too big ! 0x%lx bytes (max is 0x%x)\n",
                 spapr->rtas_size, RTAS_MAX_SIZE);
        exit(1);
    }
1247
    g_free(filename);
1248

1249 1250 1251
    /* Set up EPOW events infrastructure */
    spapr_events_init(spapr);

1252
    /* Set up VIO bus */
1253 1254
    spapr->vio_bus = spapr_vio_bus_init();

P
Paolo Bonzini 已提交
1255
    for (i = 0; i < MAX_SERIAL_PORTS; i++) {
1256
        if (serial_hds[i]) {
1257
            spapr_vty_create(spapr->vio_bus, serial_hds[i]);
1258 1259
        }
    }
1260

D
David Gibson 已提交
1261 1262 1263
    /* We always have at least the nvram device on VIO */
    spapr_create_nvram(spapr);

1264
    /* Set up PCI */
1265
    spapr_pci_msi_init(spapr, SPAPR_PCI_MSI_WINDOW);
1266 1267
    spapr_pci_rtas_init();

1268
    phb = spapr_create_phb(spapr, 0);
1269

P
Paolo Bonzini 已提交
1270
    for (i = 0; i < nb_nics; i++) {
1271 1272 1273
        NICInfo *nd = &nd_table[i];

        if (!nd->model) {
1274
            nd->model = g_strdup("ibmveth");
1275 1276 1277
        }

        if (strcmp(nd->model, "ibmveth") == 0) {
1278
            spapr_vlan_create(spapr->vio_bus, nd);
1279
        } else {
1280
            pci_nic_init_nofail(&nd_table[i], phb->bus, nd->model, NULL);
1281 1282 1283
        }
    }

1284
    for (i = 0; i <= drive_get_max_bus(IF_SCSI); i++) {
1285
        spapr_vscsi_create(spapr->vio_bus);
1286 1287
    }

1288
    /* Graphics */
1289
    if (spapr_vga_init(phb->bus)) {
1290
        spapr->has_graphics = true;
1291 1292
    }

1293
    if (usb_enabled(spapr->has_graphics)) {
1294
        pci_create_simple(phb->bus, -1, "pci-ohci");
1295 1296 1297 1298 1299 1300
        if (spapr->has_graphics) {
            usbdevice_create("keyboard");
            usbdevice_create("mouse");
        }
    }

1301
    if (spapr->rma_size < (MIN_RMA_SLOF << 20)) {
1302 1303 1304 1305 1306
        fprintf(stderr, "qemu: pSeries SLOF firmware requires >= "
                "%ldM guest RMA (Real Mode Area memory)\n", MIN_RMA_SLOF);
        exit(1);
    }

1307 1308 1309 1310 1311
    if (kernel_filename) {
        uint64_t lowaddr = 0;

        kernel_size = load_elf(kernel_filename, translate_kernel_address, NULL,
                               NULL, &lowaddr, NULL, 1, ELF_MACHINE, 0);
1312 1313 1314 1315 1316 1317
        if (kernel_size < 0) {
            kernel_size = load_elf(kernel_filename,
                                   translate_kernel_address, NULL,
                                   NULL, &lowaddr, NULL, 0, ELF_MACHINE, 0);
            kernel_le = kernel_size > 0;
        }
1318
        if (kernel_size < 0) {
1319 1320
            kernel_size = load_image_targphys(kernel_filename,
                                              KERNEL_LOAD_ADDR,
1321
                                              load_limit - KERNEL_LOAD_ADDR);
1322 1323 1324 1325 1326 1327 1328 1329 1330
        }
        if (kernel_size < 0) {
            fprintf(stderr, "qemu: could not load kernel '%s'\n",
                    kernel_filename);
            exit(1);
        }

        /* load initrd */
        if (initrd_filename) {
1331 1332 1333 1334
            /* Try to locate the initrd in the gap between the kernel
             * and the firmware. Add a bit of space just in case
             */
            initrd_base = (KERNEL_LOAD_ADDR + kernel_size + 0x1ffff) & ~0xffff;
1335
            initrd_size = load_image_targphys(initrd_filename, initrd_base,
1336
                                              load_limit - initrd_base);
1337 1338 1339 1340 1341 1342 1343 1344 1345
            if (initrd_size < 0) {
                fprintf(stderr, "qemu: could not load initial ram disk '%s'\n",
                        initrd_filename);
                exit(1);
            }
        } else {
            initrd_base = 0;
            initrd_size = 0;
        }
1346
    }
1347

1348 1349 1350 1351
    if (bios_name == NULL) {
        bios_name = FW_FILE_NAME;
    }
    filename = qemu_find_file(QEMU_FILE_TYPE_BIOS, bios_name);
1352 1353 1354 1355 1356 1357 1358 1359 1360
    fw_size = load_image_targphys(filename, 0, FW_MAX_SIZE);
    if (fw_size < 0) {
        hw_error("qemu: could not load LPAR rtas '%s'\n", filename);
        exit(1);
    }
    g_free(filename);

    spapr->entry_point = 0x100;

1361 1362 1363 1364
    vmstate_register(NULL, 0, &vmstate_spapr, spapr);
    register_savevm_live(NULL, "spapr/htab", -1, 1,
                         &savevm_htab_handlers, spapr);

1365
    /* Prepare the device tree */
1366
    spapr->fdt_skel = spapr_create_fdt_skel(cpu_model,
1367
                                            initrd_base, initrd_size,
1368
                                            kernel_size, kernel_le,
1369 1370
                                            boot_device, kernel_cmdline,
                                            spapr->epow_irq);
1371
    assert(spapr->fdt_skel != NULL);
1372 1373 1374 1375 1376
}

static QEMUMachine spapr_machine = {
    .name = "pseries",
    .desc = "pSeries Logical Partition (PAPR compliant)",
1377
    .is_default = 1,
1378
    .init = ppc_spapr_init,
1379
    .reset = ppc_spapr_reset,
1380
    .block_default_type = IF_SCSI,
1381 1382
    .max_cpus = MAX_CPUS,
    .no_parallel = 1,
1383
    .default_boot_order = NULL,
1384 1385 1386 1387 1388 1389 1390 1391
};

/* Register the "pseries" machine description with QEMU. */
static void spapr_machine_init(void)
{
    qemu_register_machine(&spapr_machine);
}

/* NOTE(review): machine_init() is defined outside this file; presumably it
 * arranges for spapr_machine_init() to be run during startup -- confirm. */
machine_init(spapr_machine_init);