ppc64: Rudimentary Support for extra page sizes on server CPUs

More recent Power server chips (i.e. based on the 64 bit hash MMU) support more than just the traditional 4k and 16M page sizes. This can get quite complicated, because which page sizes are supported, which combinations are supported within an MMU segment and how these page sizes are encoded both in the SLB entry and the hash PTE can vary depending on the CPU model (they are not specified by the architecture). In addition the firmware or hypervisor may not permit use of certain page sizes, for various reasons. Whether various page sizes are supported on KVM, for example, depends on whether the PR or HV variant of KVM is in use, and on the page size of the memory backing the guest's RAM. This patch adds information to the CPUState and cpu defs to describe the supported page sizes and encodings. Since TCG does not yet support any extended page sizes, we just set this to NULL in the static CPU definitions, expanding this to the default 4k and 16M page sizes when we initialize the cpu state. When using KVM, however, we instead determine available page sizes using the new KVM_PPC_GET_SMMU_INFO call. For old kernels without that call, we use some defaults, with some guesswork which should do the right thing for existing HV and PR implementations. The fallback might not be correct for future versions, but that's ok, because they'll have KVM_PPC_GET_SMMU_INFO. Signed-off-by: N Benjamin Herrenschmidt <benh@kernel.crashing.org> Signed-off-by: N David Gibson <david@gibson.dropbear.id.au> Signed-off-by: N Alexander Graf <agraf@suse.de>

ppc64: Rudimentary Support for extra page sizes on server CPUs
More recent Power server chips (i.e. based on the 64 bit hash MMU) support more than just the traditional 4k and 16M page sizes. This can get quite complicated, because which page sizes are supported, which combinations are supported within an MMU segment and how these page sizes are encoded both in the SLB entry and the hash PTE can vary depending on the CPU model (they are not specified by the architecture). In addition the firmware or hypervisor may not permit use of certain page sizes, for various reasons. Whether various page sizes are supported on KVM, for example, depends on whether the PR or HV variant of KVM is in use, and on the page size of the memory backing the guest's RAM. This patch adds information to the CPUState and cpu defs to describe the supported page sizes and encodings. Since TCG does not yet support any extended page sizes, we just set this to NULL in the static CPU definitions, expanding this to the default 4k and 16M page sizes when we initialize the cpu state. When using KVM, however, we instead determine available page sizes using the new KVM_PPC_GET_SMMU_INFO call. For old kernels without that call, we use some defaults, with some guesswork which should do the right thing for existing HV and PR implementations. The fallback might not be correct for future versions, but that's ok, because they'll have KVM_PPC_GET_SMMU_INFO. Signed-off-by: N Benjamin Herrenschmidt <benh@kernel.crashing.org> Signed-off-by: N David Gibson <david@gibson.dropbear.id.au> Signed-off-by: N Alexander Graf <agraf@suse.de>
4656e1f0 · Benjamin Herrenschmidt · Alexander Graf · 77c2cf33 · 4656e1f0 · 4656e1f0
5 changed file
--- a/target-ppc/cpu.h
+++ b/target-ppc/cpu.h
@@ -119,6 +119,8 @@ enum powerpc_mmu_t {
    POWERPC_MMU_620        = POWERPC_MMU_64 | 0x00000002,
    /* Architecture 2.06 variant                               */
    POWERPC_MMU_2_06       = POWERPC_MMU_64 | POWERPC_MMU_1TSEG | 0x00000003,
+    /* Architecture 2.06 "degraded" (no 1T segments)           */
+    POWERPC_MMU_2_06d      = POWERPC_MMU_64 | 0x00000003,
 #endif /* defined(TARGET_PPC64) */
 };

@@ -873,6 +875,29 @@ enum {
 #define DBELL_LPIDTAG_MASK             (0xfff << DBELL_LPIDTAG_SHIFT)
 #define DBELL_PIRTAG_MASK              0x3fff

+/*****************************************************************************/
+/* Segment page size information, used by recent hash MMUs
+ * The format of this structure mirrors kvm_ppc_smmu_info
+ */
+
+#define PPC_PAGE_SIZES_MAX_SZ   8
+
+struct ppc_one_page_size {
+    uint32_t page_shift;  /* Page shift (or 0) */
+    uint32_t pte_enc;     /* Encoding in the HPTE (>>12) */
+};
+
+struct ppc_one_seg_page_size {
+    uint32_t page_shift;  /* Base page shift of segment (or 0) */
+    uint32_t slb_enc;     /* SLB encoding for BookS */
+    struct ppc_one_page_size enc[PPC_PAGE_SIZES_MAX_SZ];
+};
+
+struct ppc_segment_page_sizes {
+    struct ppc_one_seg_page_size sps[PPC_PAGE_SIZES_MAX_SZ];
+};
+
+
 /*****************************************************************************/
 /* The whole PowerPC CPU context */
 #define NB_MMU_MODES 3
@@ -889,6 +914,9 @@ struct ppc_def_t {
    powerpc_input_t bus_model;
    uint32_t flags;
    int bfd_mach;
+#if defined(TARGET_PPC64)
+    const struct ppc_segment_page_sizes *sps;
+#endif
    void (*init_proc)(CPUPPCState *env);
    int  (*check_pow)(CPUPPCState *env);
 };
@@ -1012,6 +1040,9 @@ struct CPUPPCState {
    uint32_t flags;
    uint64_t insns_flags;
    uint64_t insns_flags2;
+#if defined(TARGET_PPC64)
+    struct ppc_segment_page_sizes sps;
+#endif

 #if defined(TARGET_PPC64) && !defined(CONFIG_USER_ONLY)
    target_phys_addr_t vpa;

--- a/target-ppc/kvm.c
+++ b/target-ppc/kvm.c
@@ -18,6 +18,7 @@
 #include <sys/types.h>
 #include <sys/ioctl.h>
 #include <sys/mman.h>
+#include <sys/vfs.h>

 #include <linux/kvm.h>

@@ -167,10 +168,217 @@ static int kvm_booke206_tlb_init(CPUPPCState *env)
    return 0;
 }

+
+#if defined(TARGET_PPC64)
+static void kvm_get_fallback_smmu_info(CPUPPCState *env,
+                                       struct kvm_ppc_smmu_info *info)
+{
+    memset(info, 0, sizeof(*info));
+
+    /* We don't have the new KVM_PPC_GET_SMMU_INFO ioctl, so
+     * need to "guess" what the supported page sizes are.
+     *
+     * For that to work we make a few assumptions:
+     *
+     * - If KVM_CAP_PPC_GET_PVINFO is supported we are running "PR"
+     *   KVM which only supports 4K and 16M pages, but supports them
+     *   regardless of the backing store characteritics. We also don't
+     *   support 1T segments.
+     *
+     *   This is safe as if HV KVM ever supports that capability or PR
+     *   KVM grows supports for more page/segment sizes, those versions
+     *   will have implemented KVM_CAP_PPC_GET_SMMU_INFO and thus we
+     *   will not hit this fallback
+     *
+     * - Else we are running HV KVM. This means we only support page
+     *   sizes that fit in the backing store. Additionally we only
+     *   advertize 64K pages if the processor is ARCH 2.06 and we assume
+     *   P7 encodings for the SLB and hash table. Here too, we assume
+     *   support for any newer processor will mean a kernel that
+     *   implements KVM_CAP_PPC_GET_SMMU_INFO and thus doesn't hit
+     *   this fallback.
+     */
+    if (kvm_check_extension(env->kvm_state, KVM_CAP_PPC_GET_PVINFO)) {
+        /* No flags */
+        info->flags = 0;
+        info->slb_size = 64;
+
+        /* Standard 4k base page size segment */
+        info->sps[0].page_shift = 12;
+        info->sps[0].slb_enc = 0;
+        info->sps[0].enc[0].page_shift = 12;
+        info->sps[0].enc[0].pte_enc = 0;
+
+        /* Standard 16M large page size segment */
+        info->sps[1].page_shift = 24;
+        info->sps[1].slb_enc = SLB_VSID_L;
+        info->sps[1].enc[0].page_shift = 24;
+        info->sps[1].enc[0].pte_enc = 0;
+    } else {
+        int i = 0;
+
+        /* HV KVM has backing store size restrictions */
+        info->flags = KVM_PPC_PAGE_SIZES_REAL;
+
+        if (env->mmu_model & POWERPC_MMU_1TSEG) {
+            info->flags |= KVM_PPC_1T_SEGMENTS;
+        }
+
+        if (env->mmu_model == POWERPC_MMU_2_06) {
+            info->slb_size = 32;
+        } else {
+            info->slb_size = 64;
+        }
+
+        /* Standard 4k base page size segment */
+        info->sps[i].page_shift = 12;
+        info->sps[i].slb_enc = 0;
+        info->sps[i].enc[0].page_shift = 12;
+        info->sps[i].enc[0].pte_enc = 0;
+        i++;
+
+        /* 64K on MMU 2.06 */
+        if (env->mmu_model == POWERPC_MMU_2_06) {
+            info->sps[i].page_shift = 16;
+            info->sps[i].slb_enc = 0x110;
+            info->sps[i].enc[0].page_shift = 16;
+            info->sps[i].enc[0].pte_enc = 1;
+            i++;
+        }
+
+        /* Standard 16M large page size segment */
+        info->sps[i].page_shift = 24;
+        info->sps[i].slb_enc = SLB_VSID_L;
+        info->sps[i].enc[0].page_shift = 24;
+        info->sps[i].enc[0].pte_enc = 0;
+    }
+}
+
+static void kvm_get_smmu_info(CPUPPCState *env, struct kvm_ppc_smmu_info *info)
+{
+    int ret;
+
+    if (kvm_check_extension(env->kvm_state, KVM_CAP_PPC_GET_SMMU_INFO)) {
+        ret = kvm_vm_ioctl(env->kvm_state, KVM_PPC_GET_SMMU_INFO, info);
+        if (ret == 0) {
+            return;
+        }
+    }
+
+    kvm_get_fallback_smmu_info(env, info);
+}
+
+static long getrampagesize(void)
+{
+    struct statfs fs;
+    int ret;
+
+    if (!mem_path) {
+        /* guest RAM is backed by normal anonymous pages */
+        return getpagesize();
+    }
+
+    do {
+        ret = statfs(mem_path, &fs);
+    } while (ret != 0 && errno == EINTR);
+
+    if (ret != 0) {
+        fprintf(stderr, "Couldn't statfs() memory path: %s\n",
+                strerror(errno));
+        exit(1);
+    }
+
+#define HUGETLBFS_MAGIC       0x958458f6
+
+    if (fs.f_type != HUGETLBFS_MAGIC) {
+        /* Explicit mempath, but it's ordinary pages */
+        return getpagesize();
+    }
+
+    /* It's hugepage, return the huge page size */
+    return fs.f_bsize;
+}
+
+static bool kvm_valid_page_size(uint32_t flags, long rampgsize, uint32_t shift)
+{
+    if (!(flags & KVM_PPC_PAGE_SIZES_REAL)) {
+        return true;
+    }
+
+    return (1ul << shift) <= rampgsize;
+}
+
+static void kvm_fixup_page_sizes(CPUPPCState *env)
+{
+    static struct kvm_ppc_smmu_info smmu_info;
+    static bool has_smmu_info;
+    long rampagesize;
+    int iq, ik, jq, jk;
+
+    /* We only handle page sizes for 64-bit server guests for now */
+    if (!(env->mmu_model & POWERPC_MMU_64)) {
+        return;
+    }
+
+    /* Collect MMU info from kernel if not already */
+    if (!has_smmu_info) {
+        kvm_get_smmu_info(env, &smmu_info);
+        has_smmu_info = true;
+    }
+
+    rampagesize = getrampagesize();
+
+    /* Convert to QEMU form */
+    memset(&env->sps, 0, sizeof(env->sps));
+
+    for (ik = iq = 0; ik < KVM_PPC_PAGE_SIZES_MAX_SZ; ik++) {
+        struct ppc_one_seg_page_size *qsps = &env->sps.sps[iq];
+        struct kvm_ppc_one_seg_page_size *ksps = &smmu_info.sps[ik];
+
+        if (!kvm_valid_page_size(smmu_info.flags, rampagesize,
+                                 ksps->page_shift)) {
+            continue;
+        }
+        qsps->page_shift = ksps->page_shift;
+        qsps->slb_enc = ksps->slb_enc;
+        for (jk = jq = 0; jk < KVM_PPC_PAGE_SIZES_MAX_SZ; jk++) {
+            if (!kvm_valid_page_size(smmu_info.flags, rampagesize,
+                                     ksps->enc[jk].page_shift)) {
+                continue;
+            }
+            qsps->enc[jq].page_shift = ksps->enc[jk].page_shift;
+            qsps->enc[jq].pte_enc = ksps->enc[jk].pte_enc;
+            if (++jq >= PPC_PAGE_SIZES_MAX_SZ) {
+                break;
+            }
+        }
+        if (++iq >= PPC_PAGE_SIZES_MAX_SZ) {
+            break;
+        }
+    }
+    env->slb_nr = smmu_info.slb_size;
+    if (smmu_info.flags & KVM_PPC_1T_SEGMENTS) {
+        env->mmu_model |= POWERPC_MMU_1TSEG;
+    } else {
+        env->mmu_model &= ~POWERPC_MMU_1TSEG;
+    }
+}
+#else /* defined (TARGET_PPC64) */
+
+static inline void kvm_fixup_page_sizes(CPUPPCState *env)
+{
+}
+
+#endif /* !defined (TARGET_PPC64) */
+
 int kvm_arch_init_vcpu(CPUPPCState *cenv)
 {
    int ret;

+    /* Gather server mmu info from KVM and update the CPU state */
+    kvm_fixup_page_sizes(cenv);
+
+    /* Synchronize sregs with kvm */
    ret = kvm_arch_sync_sregs(cenv);
    if (ret) {
        return ret;

--- a/target-ppc/kvm_ppc.h
+++ b/target-ppc/kvm_ppc.h
@@ -58,6 +58,11 @@ static inline int kvmppc_get_hypercall(CPUPPCState *env, uint8_t *buf, int buf_l
    return -1;
 }

+static inline int kvmppc_read_segment_page_sizes(uint32_t *prop, int maxcells)
+{
+    return -1;
+}
+
 static inline int kvmppc_set_interrupt(CPUPPCState *env, int irq, int level)
 {
    return -1;

--- a/target-ppc/mmu_helper.c
+++ b/target-ppc/mmu_helper.c
@@ -1634,6 +1634,7 @@ void dump_mmu(FILE *f, fprintf_function cpu_fprintf, CPUPPCState *env)
 #if defined(TARGET_PPC64)
    case POWERPC_MMU_64B:
    case POWERPC_MMU_2_06:
+    case POWERPC_MMU_2_06d:
        mmubooks_dump_mmu(f, cpu_fprintf, env);
        break;
 #endif
@@ -1664,6 +1665,7 @@ static inline int check_physical(CPUPPCState *env, mmu_ctx_t *ctx,
    case POWERPC_MMU_620:
    case POWERPC_MMU_64B:
    case POWERPC_MMU_2_06:
+    case POWERPC_MMU_2_06d:
        /* Real address are 60 bits long */
        ctx->raddr &= 0x0FFFFFFFFFFFFFFFULL;
        ctx->prot |= PAGE_WRITE;
@@ -1745,6 +1747,7 @@ int get_physical_address(CPUPPCState *env, mmu_ctx_t *ctx, target_ulong eaddr,
        case POWERPC_MMU_620:
        case POWERPC_MMU_64B:
        case POWERPC_MMU_2_06:
+        case POWERPC_MMU_2_06d:
 #endif
            if (ret < 0) {
                /* We didn't match any BAT entry or don't have BATs */
@@ -1886,6 +1889,7 @@ int cpu_ppc_handle_mmu_fault(CPUPPCState *env, target_ulong address, int rw,
                case POWERPC_MMU_620:
                case POWERPC_MMU_64B:
                case POWERPC_MMU_2_06:
+                case POWERPC_MMU_2_06d:
 #endif
                    env->exception_index = POWERPC_EXCP_ISI;
                    env->error_code = 0x40000000;
@@ -1997,6 +2001,7 @@ int cpu_ppc_handle_mmu_fault(CPUPPCState *env, target_ulong address, int rw,
                case POWERPC_MMU_620:
                case POWERPC_MMU_64B:
                case POWERPC_MMU_2_06:
+                case POWERPC_MMU_2_06d:
 #endif
                    env->exception_index = POWERPC_EXCP_DSI;
                    env->error_code = 0;
@@ -2326,6 +2331,7 @@ void ppc_tlb_invalidate_all(CPUPPCState *env)
    case POWERPC_MMU_620:
    case POWERPC_MMU_64B:
    case POWERPC_MMU_2_06:
+    case POWERPC_MMU_2_06d:
 #endif /* defined(TARGET_PPC64) */
        tlb_flush(env, 1);
        break;
@@ -2395,6 +2401,7 @@ void ppc_tlb_invalidate_one(CPUPPCState *env, target_ulong addr)
    case POWERPC_MMU_620:
    case POWERPC_MMU_64B:
    case POWERPC_MMU_2_06:
+    case POWERPC_MMU_2_06d:
        /* tlbie invalidate TLBs for all segments */
        /* XXX: given the fact that there are too many segments to invalidate,
         *      and we still don't have a tlb_flush_mask(env, n, mask) in QEMU,

--- a/target-ppc/translate_init.c
+++ b/target-ppc/translate_init.c
@@ -9928,6 +9928,27 @@ int cpu_ppc_register_internal (CPUPPCState *env, const ppc_def_t *def)
    env->bfd_mach = def->bfd_mach;
    env->check_pow = def->check_pow;

+#if defined(TARGET_PPC64)
+    if (def->sps)
+        env->sps = *def->sps;
+    else if (env->mmu_model & POWERPC_MMU_64) {
+        /* Use default sets of page sizes */
+        static const struct ppc_segment_page_sizes defsps = {
+            .sps = {
+                { .page_shift = 12, /* 4K */
+                  .slb_enc = 0,
+                  .enc = { { .page_shift = 12, .pte_enc = 0 } }
+                },
+                { .page_shift = 24, /* 16M */
+                  .slb_enc = 0x100,
+                  .enc = { { .page_shift = 24, .pte_enc = 0 } }
+                },
+            },
+        };
+        env->sps = defsps;
+    }
+#endif /* defined(TARGET_PPC64) */
+
    if (kvm_enabled()) {
        if (kvmppc_fixup_cpu(env) != 0) {
            fprintf(stderr, "Unable to virtualize selected CPU with KVM\n");