提交 4ffe713b 编写于 作者: A Aneesh Kumar K.V 提交者: Michael Ellerman

powerpc/mm: Increase the max addressable memory to 2PB

Currently we limit the max addressable memory to 128TB. This patch increase the
limit to 2PB. We can have devices like nvdimm which adds memory above 512TB
limit.

We still don't support regular system ram above 512TB. One of the challenge with
that is the percpu allocator, that allocates per node memory and use the max
distance between them as the percpu offsets. This means with large gap in
address space ( system ram above 1PB) we will run out of vmalloc space to map
the percpu allocation.

In order to support addressable memory above 512TB, kernel should be able to
linear map this range. To do that with hash translation we now add 4 context
to kernel linear map region. Our per context addressable range is 512TB. We
still keep VMALLOC and VMEMMAP region to old size. SLB miss handlers is updated
to validate these limit.

We also limit this update to SPARSEMEM_VMEMMAP and SPARSEMEM_EXTREME
Signed-off-by: NAneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Signed-off-by: NMichael Ellerman <mpe@ellerman.id.au>
上级 c9f80734
...@@ -521,13 +521,9 @@ extern void slb_set_size(u16 size); ...@@ -521,13 +521,9 @@ extern void slb_set_size(u16 size);
* from mmu context id and effective segment id of the address. * from mmu context id and effective segment id of the address.
* *
* For user processes max context id is limited to MAX_USER_CONTEXT. * For user processes max context id is limited to MAX_USER_CONTEXT.
* more details in get_user_context
* For kernel space, we use context ids 1-4 to map addresses as below: *
* NOTE: each context only support 64TB now. * For kernel space get_kernel_context
* 0x00001 - [ 0xc000000000000000 - 0xc0003fffffffffff ]
* 0x00002 - [ 0xd000000000000000 - 0xd0003fffffffffff ]
* 0x00003 - [ 0xe000000000000000 - 0xe0003fffffffffff ]
* 0x00004 - [ 0xf000000000000000 - 0xf0003fffffffffff ]
* *
* The proto-VSIDs are then scrambled into real VSIDs with the * The proto-VSIDs are then scrambled into real VSIDs with the
* multiplicative hash: * multiplicative hash:
...@@ -567,6 +563,21 @@ extern void slb_set_size(u16 size); ...@@ -567,6 +563,21 @@ extern void slb_set_size(u16 size);
#define ESID_BITS_MASK ((1 << ESID_BITS) - 1) #define ESID_BITS_MASK ((1 << ESID_BITS) - 1)
#define ESID_BITS_1T_MASK ((1 << ESID_BITS_1T) - 1) #define ESID_BITS_1T_MASK ((1 << ESID_BITS_1T) - 1)
/*
* Now certain config support MAX_PHYSMEM more than 512TB. Hence we will need
* to use more than one context for linear mapping the kernel.
* For vmalloc and memmap, we use just one context with 512TB. With 64 byte
* struct page size, we need ony 32 TB in memmap for 2PB (51 bits (MAX_PHYSMEM_BITS)).
*/
#if (MAX_PHYSMEM_BITS > MAX_EA_BITS_PER_CONTEXT)
#define MAX_KERNEL_CTX_CNT (1UL << (MAX_PHYSMEM_BITS - MAX_EA_BITS_PER_CONTEXT))
#else
#define MAX_KERNEL_CTX_CNT 1
#endif
#define MAX_VMALLOC_CTX_CNT 1
#define MAX_MEMMAP_CTX_CNT 1
/* /*
* 256MB segment * 256MB segment
* The proto-VSID space has 2^(CONTEX_BITS + ESID_BITS) - 1 segments * The proto-VSID space has 2^(CONTEX_BITS + ESID_BITS) - 1 segments
...@@ -577,12 +588,13 @@ extern void slb_set_size(u16 size); ...@@ -577,12 +588,13 @@ extern void slb_set_size(u16 size);
* We also need to avoid the last segment of the last context, because that * We also need to avoid the last segment of the last context, because that
* would give a protovsid of 0x1fffffffff. That will result in a VSID 0 * would give a protovsid of 0x1fffffffff. That will result in a VSID 0
* because of the modulo operation in vsid scramble. * because of the modulo operation in vsid scramble.
*
* We add one extra context to MIN_USER_CONTEXT so that we can map kernel
* context easily. The +1 is to map the unused 0xe region mapping.
*/ */
#define MAX_USER_CONTEXT ((ASM_CONST(1) << CONTEXT_BITS) - 2) #define MAX_USER_CONTEXT ((ASM_CONST(1) << CONTEXT_BITS) - 2)
#define MIN_USER_CONTEXT (5) #define MIN_USER_CONTEXT (MAX_KERNEL_CTX_CNT + MAX_VMALLOC_CTX_CNT + \
MAX_MEMMAP_CTX_CNT + 2)
/* Would be nice to use KERNEL_REGION_ID here */
#define KERNEL_REGION_CONTEXT_OFFSET (0xc - 1)
/* /*
* For platforms that support on 65bit VA we limit the context bits * For platforms that support on 65bit VA we limit the context bits
...@@ -742,6 +754,39 @@ static inline unsigned long get_vsid(unsigned long context, unsigned long ea, ...@@ -742,6 +754,39 @@ static inline unsigned long get_vsid(unsigned long context, unsigned long ea,
return vsid_scramble(protovsid, VSID_MULTIPLIER_1T, vsid_bits); return vsid_scramble(protovsid, VSID_MULTIPLIER_1T, vsid_bits);
} }
/*
* For kernel space, we use context ids as below
* below. Range is 512TB per context.
*
* 0x00001 - [ 0xc000000000000000 - 0xc001ffffffffffff]
* 0x00002 - [ 0xc002000000000000 - 0xc003ffffffffffff]
* 0x00003 - [ 0xc004000000000000 - 0xc005ffffffffffff]
* 0x00004 - [ 0xc006000000000000 - 0xc007ffffffffffff]
* 0x00005 - [ 0xd000000000000000 - 0xd001ffffffffffff ]
* 0x00006 - Not used - Can map 0xe000000000000000 range.
* 0x00007 - [ 0xf000000000000000 - 0xf001ffffffffffff ]
*
* So we can compute the context from the region (top nibble) by
* subtracting 11, or 0xc - 1.
*/
static inline unsigned long get_kernel_context(unsigned long ea)
{
unsigned long region_id = REGION_ID(ea);
unsigned long ctx;
/*
* For linear mapping we do support multiple context
*/
if (region_id == KERNEL_REGION_ID) {
/*
* We already verified ea to be not beyond the addr limit.
*/
ctx = 1 + ((ea & ~REGION_MASK) >> MAX_EA_BITS_PER_CONTEXT);
} else
ctx = (region_id - 0xc) + MAX_KERNEL_CTX_CNT;
return ctx;
}
/* /*
* This is only valid for addresses >= PAGE_OFFSET * This is only valid for addresses >= PAGE_OFFSET
*/ */
...@@ -752,20 +797,7 @@ static inline unsigned long get_kernel_vsid(unsigned long ea, int ssize) ...@@ -752,20 +797,7 @@ static inline unsigned long get_kernel_vsid(unsigned long ea, int ssize)
if (!is_kernel_addr(ea)) if (!is_kernel_addr(ea))
return 0; return 0;
/* context = get_kernel_context(ea);
* For kernel space, we use context ids 1-4 to map the address space as
* below:
*
* 0x00001 - [ 0xc000000000000000 - 0xc0003fffffffffff ]
* 0x00002 - [ 0xd000000000000000 - 0xd0003fffffffffff ]
* 0x00003 - [ 0xe000000000000000 - 0xe0003fffffffffff ]
* 0x00004 - [ 0xf000000000000000 - 0xf0003fffffffffff ]
*
* So we can compute the context from the region (top nibble) by
* subtracting 11, or 0xc - 1.
*/
context = (ea >> 60) - KERNEL_REGION_CONTEXT_OFFSET;
return get_vsid(context, ea, ssize); return get_vsid(context, ea, ssize);
} }
......
...@@ -309,6 +309,21 @@ static inline u16 get_mm_addr_key(struct mm_struct *mm, unsigned long address) ...@@ -309,6 +309,21 @@ static inline u16 get_mm_addr_key(struct mm_struct *mm, unsigned long address)
*/ */
#define MMU_PAGE_COUNT 16 #define MMU_PAGE_COUNT 16
/*
* If we store section details in page->flags we can't increase the MAX_PHYSMEM_BITS
* if we increase SECTIONS_WIDTH we will not store node details in page->flags and
* page_to_nid does a page->section->node lookup
* Hence only increase for VMEMMAP. Further depending on SPARSEMEM_EXTREME reduce
* memory requirements with large number of sections.
* 51 bits is the max physical real address on POWER9
*/
#if defined(CONFIG_SPARSEMEM_VMEMMAP) && defined(CONFIG_SPARSEMEM_EXTREME) && \
defined (CONFIG_PPC_64K_PAGES)
#define MAX_PHYSMEM_BITS 51
#else
#define MAX_PHYSMEM_BITS 46
#endif
#ifdef CONFIG_PPC_BOOK3S_64 #ifdef CONFIG_PPC_BOOK3S_64
#include <asm/book3s/64/mmu.h> #include <asm/book3s/64/mmu.h>
#else /* CONFIG_PPC_BOOK3S_64 */ #else /* CONFIG_PPC_BOOK3S_64 */
......
...@@ -9,17 +9,6 @@ ...@@ -9,17 +9,6 @@
* MAX_PHYSMEM_BITS 2^N: how much memory we can have in that space * MAX_PHYSMEM_BITS 2^N: how much memory we can have in that space
*/ */
#define SECTION_SIZE_BITS 24 #define SECTION_SIZE_BITS 24
/*
* If we store section details in page->flags we can't increase the MAX_PHYSMEM_BITS
* if we increase SECTIONS_WIDTH we will not store node details in page->flags and
* page_to_nid does a page->section->node lookup
* Hence only increase for VMEMMAP.
*/
#ifdef CONFIG_SPARSEMEM_VMEMMAP
#define MAX_PHYSMEM_BITS 47
#else
#define MAX_PHYSMEM_BITS 46
#endif
#endif /* CONFIG_SPARSEMEM */ #endif /* CONFIG_SPARSEMEM */
......
...@@ -693,16 +693,27 @@ static long slb_allocate_kernel(unsigned long ea, unsigned long id) ...@@ -693,16 +693,27 @@ static long slb_allocate_kernel(unsigned long ea, unsigned long id)
unsigned long flags; unsigned long flags;
int ssize; int ssize;
if ((ea & ~REGION_MASK) >= (1ULL << MAX_EA_BITS_PER_CONTEXT)) if (id == KERNEL_REGION_ID) {
/* We only support upto MAX_PHYSMEM_BITS */
if ((ea & ~REGION_MASK) > (1UL << MAX_PHYSMEM_BITS))
return -EFAULT; return -EFAULT;
if (id == KERNEL_REGION_ID) {
flags = SLB_VSID_KERNEL | mmu_psize_defs[mmu_linear_psize].sllp; flags = SLB_VSID_KERNEL | mmu_psize_defs[mmu_linear_psize].sllp;
#ifdef CONFIG_SPARSEMEM_VMEMMAP #ifdef CONFIG_SPARSEMEM_VMEMMAP
} else if (id == VMEMMAP_REGION_ID) { } else if (id == VMEMMAP_REGION_ID) {
if ((ea & ~REGION_MASK) >= (1ULL << MAX_EA_BITS_PER_CONTEXT))
return -EFAULT;
flags = SLB_VSID_KERNEL | mmu_psize_defs[mmu_vmemmap_psize].sllp; flags = SLB_VSID_KERNEL | mmu_psize_defs[mmu_vmemmap_psize].sllp;
#endif #endif
} else if (id == VMALLOC_REGION_ID) { } else if (id == VMALLOC_REGION_ID) {
if ((ea & ~REGION_MASK) >= (1ULL << MAX_EA_BITS_PER_CONTEXT))
return -EFAULT;
if (ea < H_VMALLOC_END) if (ea < H_VMALLOC_END)
flags = get_paca()->vmalloc_sllp; flags = get_paca()->vmalloc_sllp;
else else
...@@ -715,8 +726,7 @@ static long slb_allocate_kernel(unsigned long ea, unsigned long id) ...@@ -715,8 +726,7 @@ static long slb_allocate_kernel(unsigned long ea, unsigned long id)
if (!mmu_has_feature(MMU_FTR_1T_SEGMENT)) if (!mmu_has_feature(MMU_FTR_1T_SEGMENT))
ssize = MMU_SEGSIZE_256M; ssize = MMU_SEGSIZE_256M;
context = id - KERNEL_REGION_CONTEXT_OFFSET; context = get_kernel_context(ea);
return slb_insert_entry(ea, context, flags, ssize, true); return slb_insert_entry(ea, context, flags, ssize, true);
} }
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册