提交 14bc84ce 编写于 作者: L Linus Torvalds

Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/s390/linux

Pull s390 updates from Martin Schwidefsky:
 "One additional new feature for 4.1, a new PRNG based on SHA-512 for
  the zcrypt driver.

  Two memory management related changes, the page table reallocation for
  KVM is removed, and with file ptes gone the encoding of page table
  entries is improved.

  And three bug fixes"

* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/s390/linux:
  s390/zcrypt: Introduce new SHA-512 based Pseudo Random Generator.
  s390/mm: change swap pte encoding and pgtable cleanup
  s390/mm: correct transfer of dirty & young bits in __pmd_to_pte
  s390/bpf: add dependency to z196 features
  s390/3215: free memory in error path
  s390/kvm: remove delayed reallocation of page tables for KVM
  kexec: allocate the kexec control page with KEXEC_CONTROL_MEMORY_GFP
......@@ -115,7 +115,7 @@ config S390
select HAVE_ARCH_SECCOMP_FILTER
select HAVE_ARCH_TRACEHOOK
select HAVE_ARCH_TRANSPARENT_HUGEPAGE
select HAVE_BPF_JIT if PACK_STACK && HAVE_MARCH_Z9_109_FEATURES
select HAVE_BPF_JIT if PACK_STACK && HAVE_MARCH_Z196_FEATURES
select HAVE_CMPXCHG_DOUBLE
select HAVE_CMPXCHG_LOCAL
select HAVE_DEBUG_KMEMLEAK
......
......@@ -3,9 +3,10 @@
*
* Support for s390 cryptographic instructions.
*
* Copyright IBM Corp. 2003, 2007
* Copyright IBM Corp. 2003, 2015
* Author(s): Thomas Spatzier
* Jan Glauber (jan.glauber@de.ibm.com)
* Harald Freudenberger (freude@de.ibm.com)
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License as published by the Free
......@@ -28,6 +29,7 @@
#define CRYPT_S390_MSA 0x1
#define CRYPT_S390_MSA3 0x2
#define CRYPT_S390_MSA4 0x4
#define CRYPT_S390_MSA5 0x8
/* s390 cryptographic operations */
enum crypt_s390_operations {
......@@ -36,7 +38,8 @@ enum crypt_s390_operations {
CRYPT_S390_KIMD = 0x0300,
CRYPT_S390_KLMD = 0x0400,
CRYPT_S390_KMAC = 0x0500,
CRYPT_S390_KMCTR = 0x0600
CRYPT_S390_KMCTR = 0x0600,
CRYPT_S390_PPNO = 0x0700
};
/*
......@@ -138,6 +141,16 @@ enum crypt_s390_kmac_func {
KMAC_TDEA_192 = CRYPT_S390_KMAC | 3
};
/*
* function codes for PPNO (PERFORM PSEUDORANDOM NUMBER
* OPERATION) instruction
*/
enum crypt_s390_ppno_func {
PPNO_QUERY = CRYPT_S390_PPNO | 0,
PPNO_SHA512_DRNG_GEN = CRYPT_S390_PPNO | 3,
PPNO_SHA512_DRNG_SEED = CRYPT_S390_PPNO | 0x83
};
/**
* crypt_s390_km:
* @func: the function code passed to KM; see crypt_s390_km_func
......@@ -162,11 +175,11 @@ static inline int crypt_s390_km(long func, void *param,
int ret;
asm volatile(
"0: .insn rre,0xb92e0000,%3,%1 \n" /* KM opcode */
"1: brc 1,0b \n" /* handle partial completion */
"0: .insn rre,0xb92e0000,%3,%1\n" /* KM opcode */
"1: brc 1,0b\n" /* handle partial completion */
" la %0,0\n"
"2:\n"
EX_TABLE(0b,2b) EX_TABLE(1b,2b)
EX_TABLE(0b, 2b) EX_TABLE(1b, 2b)
: "=d" (ret), "+a" (__src), "+d" (__src_len), "+a" (__dest)
: "d" (__func), "a" (__param), "0" (-1) : "cc", "memory");
if (ret < 0)
......@@ -198,11 +211,11 @@ static inline int crypt_s390_kmc(long func, void *param,
int ret;
asm volatile(
"0: .insn rre,0xb92f0000,%3,%1 \n" /* KMC opcode */
"1: brc 1,0b \n" /* handle partial completion */
"0: .insn rre,0xb92f0000,%3,%1\n" /* KMC opcode */
"1: brc 1,0b\n" /* handle partial completion */
" la %0,0\n"
"2:\n"
EX_TABLE(0b,2b) EX_TABLE(1b,2b)
EX_TABLE(0b, 2b) EX_TABLE(1b, 2b)
: "=d" (ret), "+a" (__src), "+d" (__src_len), "+a" (__dest)
: "d" (__func), "a" (__param), "0" (-1) : "cc", "memory");
if (ret < 0)
......@@ -233,11 +246,11 @@ static inline int crypt_s390_kimd(long func, void *param,
int ret;
asm volatile(
"0: .insn rre,0xb93e0000,%1,%1 \n" /* KIMD opcode */
"1: brc 1,0b \n" /* handle partial completion */
"0: .insn rre,0xb93e0000,%1,%1\n" /* KIMD opcode */
"1: brc 1,0b\n" /* handle partial completion */
" la %0,0\n"
"2:\n"
EX_TABLE(0b,2b) EX_TABLE(1b,2b)
EX_TABLE(0b, 2b) EX_TABLE(1b, 2b)
: "=d" (ret), "+a" (__src), "+d" (__src_len)
: "d" (__func), "a" (__param), "0" (-1) : "cc", "memory");
if (ret < 0)
......@@ -267,11 +280,11 @@ static inline int crypt_s390_klmd(long func, void *param,
int ret;
asm volatile(
"0: .insn rre,0xb93f0000,%1,%1 \n" /* KLMD opcode */
"1: brc 1,0b \n" /* handle partial completion */
"0: .insn rre,0xb93f0000,%1,%1\n" /* KLMD opcode */
"1: brc 1,0b\n" /* handle partial completion */
" la %0,0\n"
"2:\n"
EX_TABLE(0b,2b) EX_TABLE(1b,2b)
EX_TABLE(0b, 2b) EX_TABLE(1b, 2b)
: "=d" (ret), "+a" (__src), "+d" (__src_len)
: "d" (__func), "a" (__param), "0" (-1) : "cc", "memory");
if (ret < 0)
......@@ -302,11 +315,11 @@ static inline int crypt_s390_kmac(long func, void *param,
int ret;
asm volatile(
"0: .insn rre,0xb91e0000,%1,%1 \n" /* KLAC opcode */
"1: brc 1,0b \n" /* handle partial completion */
"0: .insn rre,0xb91e0000,%1,%1\n" /* KLAC opcode */
"1: brc 1,0b\n" /* handle partial completion */
" la %0,0\n"
"2:\n"
EX_TABLE(0b,2b) EX_TABLE(1b,2b)
EX_TABLE(0b, 2b) EX_TABLE(1b, 2b)
: "=d" (ret), "+a" (__src), "+d" (__src_len)
: "d" (__func), "a" (__param), "0" (-1) : "cc", "memory");
if (ret < 0)
......@@ -340,11 +353,11 @@ static inline int crypt_s390_kmctr(long func, void *param, u8 *dest,
int ret = -1;
asm volatile(
"0: .insn rrf,0xb92d0000,%3,%1,%4,0 \n" /* KMCTR opcode */
"1: brc 1,0b \n" /* handle partial completion */
"0: .insn rrf,0xb92d0000,%3,%1,%4,0\n" /* KMCTR opcode */
"1: brc 1,0b\n" /* handle partial completion */
" la %0,0\n"
"2:\n"
EX_TABLE(0b,2b) EX_TABLE(1b,2b)
EX_TABLE(0b, 2b) EX_TABLE(1b, 2b)
: "+d" (ret), "+a" (__src), "+d" (__src_len), "+a" (__dest),
"+a" (__ctr)
: "d" (__func), "a" (__param) : "cc", "memory");
......@@ -353,6 +366,47 @@ static inline int crypt_s390_kmctr(long func, void *param, u8 *dest,
return (func & CRYPT_S390_FUNC_MASK) ? src_len - __src_len : __src_len;
}
/**
* crypt_s390_ppno:
* @func: the function code passed to PPNO; see crypt_s390_ppno_func
* @param: address of parameter block; see POP for details on each func
* @dest: address of destination memory area
* @dest_len: size of destination memory area in bytes
* @seed: address of seed data
* @seed_len: size of seed data in bytes
*
* Executes the PPNO (PERFORM PSEUDORANDOM NUMBER OPERATION)
* operation of the CPU.
*
* Returns -1 for failure, 0 for the query func, number of random
* bytes stored in dest buffer for generate function
*/
static inline int crypt_s390_ppno(long func, void *param,
u8 *dest, long dest_len,
const u8 *seed, long seed_len)
{
register long __func asm("0") = func & CRYPT_S390_FUNC_MASK;
register void *__param asm("1") = param; /* param block (240 bytes) */
register u8 *__dest asm("2") = dest; /* buf for recv random bytes */
register long __dest_len asm("3") = dest_len; /* requested random bytes */
register const u8 *__seed asm("4") = seed; /* buf with seed data */
register long __seed_len asm("5") = seed_len; /* bytes in seed buf */
int ret = -1;
asm volatile (
"0: .insn rre,0xb93c0000,%1,%5\n" /* PPNO opcode */
"1: brc 1,0b\n" /* handle partial completion */
" la %0,0\n"
"2:\n"
EX_TABLE(0b, 2b) EX_TABLE(1b, 2b)
: "+d" (ret), "+a"(__dest), "+d"(__dest_len)
: "d"(__func), "a"(__param), "a"(__seed), "d"(__seed_len)
: "cc", "memory");
if (ret < 0)
return ret;
return (func & CRYPT_S390_FUNC_MASK) ? dest_len - __dest_len : 0;
}
/**
* crypt_s390_func_available:
* @func: the function code of the specific function; 0 if op in general
......@@ -373,6 +427,9 @@ static inline int crypt_s390_func_available(int func,
return 0;
if (facility_mask & CRYPT_S390_MSA4 && !test_facility(77))
return 0;
if (facility_mask & CRYPT_S390_MSA5 && !test_facility(57))
return 0;
switch (func & CRYPT_S390_OP_MASK) {
case CRYPT_S390_KM:
ret = crypt_s390_km(KM_QUERY, &status, NULL, NULL, 0);
......@@ -390,8 +447,12 @@ static inline int crypt_s390_func_available(int func,
ret = crypt_s390_kmac(KMAC_QUERY, &status, NULL, 0);
break;
case CRYPT_S390_KMCTR:
ret = crypt_s390_kmctr(KMCTR_QUERY, &status, NULL, NULL, 0,
NULL);
ret = crypt_s390_kmctr(KMCTR_QUERY, &status,
NULL, NULL, 0, NULL);
break;
case CRYPT_S390_PPNO:
ret = crypt_s390_ppno(PPNO_QUERY, &status,
NULL, 0, NULL, 0);
break;
default:
return 0;
......@@ -419,15 +480,14 @@ static inline int crypt_s390_pcc(long func, void *param)
int ret = -1;
asm volatile(
"0: .insn rre,0xb92c0000,0,0 \n" /* PCC opcode */
"1: brc 1,0b \n" /* handle partial completion */
"0: .insn rre,0xb92c0000,0,0\n" /* PCC opcode */
"1: brc 1,0b\n" /* handle partial completion */
" la %0,0\n"
"2:\n"
EX_TABLE(0b,2b) EX_TABLE(1b,2b)
EX_TABLE(0b, 2b) EX_TABLE(1b, 2b)
: "+d" (ret)
: "d" (__func), "a" (__param) : "cc", "memory");
return ret;
}
#endif /* _CRYPTO_ARCH_S390_CRYPT_S390_H */
此差异已折叠。
......@@ -26,6 +26,9 @@
/* Not more than 2GB */
#define KEXEC_CONTROL_MEMORY_LIMIT (1UL<<31)
/* Allocate control page with GFP_DMA */
#define KEXEC_CONTROL_MEMORY_GFP GFP_DMA
/* Maximum address we can use for the crash control pages */
#define KEXEC_CRASH_CONTROL_MEMORY_LIMIT (-1UL)
......
......@@ -14,7 +14,9 @@ typedef struct {
unsigned long asce_bits;
unsigned long asce_limit;
unsigned long vdso_base;
/* The mmu context has extended page tables. */
/* The mmu context allocates 4K page tables. */
unsigned int alloc_pgste:1;
/* The mmu context uses extended page tables. */
unsigned int has_pgste:1;
/* The mmu context uses storage keys. */
unsigned int use_skey:1;
......
......@@ -20,8 +20,11 @@ static inline int init_new_context(struct task_struct *tsk,
mm->context.flush_mm = 0;
mm->context.asce_bits = _ASCE_TABLE_LENGTH | _ASCE_USER_BITS;
mm->context.asce_bits |= _ASCE_TYPE_REGION3;
#ifdef CONFIG_PGSTE
mm->context.alloc_pgste = page_table_allocate_pgste;
mm->context.has_pgste = 0;
mm->context.use_skey = 0;
#endif
mm->context.asce_limit = STACK_TOP_MAX;
crst_table_init((unsigned long *) mm->pgd, pgd_entry_type(mm));
return 0;
......
......@@ -21,6 +21,7 @@ void crst_table_free(struct mm_struct *, unsigned long *);
unsigned long *page_table_alloc(struct mm_struct *);
void page_table_free(struct mm_struct *, unsigned long *);
void page_table_free_rcu(struct mmu_gather *, unsigned long *, unsigned long);
extern int page_table_allocate_pgste;
int set_guest_storage_key(struct mm_struct *mm, unsigned long addr,
unsigned long key, bool nq);
......
......@@ -12,12 +12,9 @@
#define _ASM_S390_PGTABLE_H
/*
* The Linux memory management assumes a three-level page table setup. For
* s390 31 bit we "fold" the mid level into the top-level page table, so
* that we physically have the same two-level page table as the s390 mmu
* expects in 31 bit mode. For s390 64 bit we use three of the five levels
* the hardware provides (region first and region second tables are not
* used).
* The Linux memory management assumes a three-level page table setup.
* For s390 64 bit we use up to four of the five levels the hardware
* provides (region first tables are not used).
*
* The "pgd_xxx()" functions are trivial for a folded two-level
* setup: the pgd is never bad, and a pmd always exists (as it's folded
......@@ -101,8 +98,8 @@ extern unsigned long zero_page_mask;
#ifndef __ASSEMBLY__
/*
* The vmalloc and module area will always be on the topmost area of the kernel
* mapping. We reserve 96MB (31bit) / 128GB (64bit) for vmalloc and modules.
* The vmalloc and module area will always be on the topmost area of the
* kernel mapping. We reserve 128GB (64bit) for vmalloc and modules.
* On 64 bit kernels we have a 2GB area at the top of the vmalloc area where
* modules will reside. That makes sure that inter module branches always
* happen without trampolines and in addition the placement within a 2GB frame
......@@ -131,38 +128,6 @@ static inline int is_module_addr(void *addr)
}
/*
* A 31 bit pagetable entry of S390 has following format:
* | PFRA | | OS |
* 0 0IP0
* 00000000001111111111222222222233
* 01234567890123456789012345678901
*
* I Page-Invalid Bit: Page is not available for address-translation
* P Page-Protection Bit: Store access not possible for page
*
* A 31 bit segmenttable entry of S390 has following format:
* | P-table origin | |PTL
* 0 IC
* 00000000001111111111222222222233
* 01234567890123456789012345678901
*
* I Segment-Invalid Bit: Segment is not available for address-translation
* C Common-Segment Bit: Segment is not private (PoP 3-30)
* PTL Page-Table-Length: Page-table length (PTL+1*16 entries -> up to 256)
*
* The 31 bit segmenttable origin of S390 has following format:
*
* |S-table origin | | STL |
* X **GPS
* 00000000001111111111222222222233
* 01234567890123456789012345678901
*
* X Space-Switch event:
* G Segment-Invalid Bit: *
* P Private-Space Bit: Segment is not private (PoP 3-30)
* S Storage-Alteration:
* STL Segment-Table-Length: Segment-table length (STL+1*16 entries -> up to 2048)
*
* A 64 bit pagetable entry of S390 has following format:
* | PFRA |0IPC| OS |
* 0000000000111111111122222222223333333333444444444455555555556666
......@@ -220,7 +185,6 @@ static inline int is_module_addr(void *addr)
/* Software bits in the page table entry */
#define _PAGE_PRESENT 0x001 /* SW pte present bit */
#define _PAGE_TYPE 0x002 /* SW pte type bit */
#define _PAGE_YOUNG 0x004 /* SW pte young bit */
#define _PAGE_DIRTY 0x008 /* SW pte dirty bit */
#define _PAGE_READ 0x010 /* SW pte read bit */
......@@ -240,31 +204,34 @@ static inline int is_module_addr(void *addr)
* table lock held.
*
* The following table gives the different possible bit combinations for
* the pte hardware and software bits in the last 12 bits of a pte:
* the pte hardware and software bits in the last 12 bits of a pte
* (. unassigned bit, x don't care, t swap type):
*
* 842100000000
* 000084210000
* 000000008421
* .IR...wrdytp
* empty .10...000000
* swap .10...xxxx10
* file .11...xxxxx0
* prot-none, clean, old .11...000001
* prot-none, clean, young .11...000101
* prot-none, dirty, old .10...001001
* prot-none, dirty, young .10...001101
* read-only, clean, old .11...010001
* read-only, clean, young .01...010101
* read-only, dirty, old .11...011001
* read-only, dirty, young .01...011101
* read-write, clean, old .11...110001
* read-write, clean, young .01...110101
* read-write, dirty, old .10...111001
* read-write, dirty, young .00...111101
* .IR.uswrdy.p
* empty .10.00000000
* swap .11..ttttt.0
* prot-none, clean, old .11.xx0000.1
* prot-none, clean, young .11.xx0001.1
* prot-none, dirty, old .10.xx0010.1
* prot-none, dirty, young .10.xx0011.1
* read-only, clean, old .11.xx0100.1
* read-only, clean, young .01.xx0101.1
* read-only, dirty, old .11.xx0110.1
* read-only, dirty, young .01.xx0111.1
* read-write, clean, old .11.xx1100.1
* read-write, clean, young .01.xx1101.1
* read-write, dirty, old .10.xx1110.1
* read-write, dirty, young .00.xx1111.1
* HW-bits: R read-only, I invalid
* SW-bits: p present, y young, d dirty, r read, w write, s special,
* u unused, l large
*
* pte_present is true for the bit pattern .xx...xxxxx1, (pte & 0x001) == 0x001
* pte_none is true for the bit pattern .10...xxxx00, (pte & 0x603) == 0x400
* pte_swap is true for the bit pattern .10...xxxx10, (pte & 0x603) == 0x402
* pte_none is true for the bit pattern .10.00000000, pte == 0x400
* pte_swap is true for the bit pattern .11..ooooo.0, (pte & 0x201) == 0x200
* pte_present is true for the bit pattern .xx.xxxxxx.1, (pte & 0x001) == 0x001
*/
/* Bits in the segment/region table address-space-control-element */
......@@ -335,6 +302,8 @@ static inline int is_module_addr(void *addr)
* read-write, dirty, young 11..0...0...11
* The segment table origin is used to distinguish empty (origin==0) from
* read-write, old segment table entries (origin!=0)
* HW-bits: R read-only, I invalid
* SW-bits: y young, d dirty, r read, w write
*/
#define _SEGMENT_ENTRY_SPLIT_BIT 11 /* THP splitting bit number */
......@@ -423,6 +392,15 @@ static inline int mm_has_pgste(struct mm_struct *mm)
return 0;
}
static inline int mm_alloc_pgste(struct mm_struct *mm)
{
#ifdef CONFIG_PGSTE
if (unlikely(mm->context.alloc_pgste))
return 1;
#endif
return 0;
}
/*
* In the case that a guest uses storage keys
* faults should no longer be backed by zero pages
......@@ -582,10 +560,9 @@ static inline int pte_none(pte_t pte)
static inline int pte_swap(pte_t pte)
{
/* Bit pattern: (pte & 0x603) == 0x402 */
return (pte_val(pte) & (_PAGE_INVALID | _PAGE_PROTECT |
_PAGE_TYPE | _PAGE_PRESENT))
== (_PAGE_INVALID | _PAGE_TYPE);
/* Bit pattern: (pte & 0x201) == 0x200 */
return (pte_val(pte) & (_PAGE_PROTECT | _PAGE_PRESENT))
== _PAGE_PROTECT;
}
static inline int pte_special(pte_t pte)
......@@ -1586,51 +1563,51 @@ static inline int has_transparent_hugepage(void)
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
/*
* 31 bit swap entry format:
* A page-table entry has some bits we have to treat in a special way.
* Bits 0, 20 and bit 23 have to be zero, otherwise an specification
* exception will occur instead of a page translation exception. The
* specifiation exception has the bad habit not to store necessary
* information in the lowcore.
* Bits 21, 22, 30 and 31 are used to indicate the page type.
* A swap pte is indicated by bit pattern (pte & 0x603) == 0x402
* This leaves the bits 1-19 and bits 24-29 to store type and offset.
* We use the 5 bits from 25-29 for the type and the 20 bits from 1-19
* plus 24 for the offset.
* 0| offset |0110|o|type |00|
* 0 0000000001111111111 2222 2 22222 33
* 0 1234567890123456789 0123 4 56789 01
*
* 64 bit swap entry format:
* A page-table entry has some bits we have to treat in a special way.
* Bits 52 and bit 55 have to be zero, otherwise an specification
* exception will occur instead of a page translation exception. The
* specifiation exception has the bad habit not to store necessary
* information in the lowcore.
* Bits 53, 54, 62 and 63 are used to indicate the page type.
* A swap pte is indicated by bit pattern (pte & 0x603) == 0x402
* This leaves the bits 0-51 and bits 56-61 to store type and offset.
* We use the 5 bits from 57-61 for the type and the 53 bits from 0-51
* plus 56 for the offset.
* | offset |0110|o|type |00|
* 0000000000111111111122222222223333333333444444444455 5555 5 55566 66
* 0123456789012345678901234567890123456789012345678901 2345 6 78901 23
* Bits 54 and 63 are used to indicate the page type.
* A swap pte is indicated by bit pattern (pte & 0x201) == 0x200
* This leaves the bits 0-51 and bits 56-62 to store type and offset.
* We use the 5 bits from 57-61 for the type and the 52 bits from 0-51
* for the offset.
* | offset |01100|type |00|
* |0000000000111111111122222222223333333333444444444455|55555|55566|66|
* |0123456789012345678901234567890123456789012345678901|23456|78901|23|
*/
#define __SWP_OFFSET_MASK (~0UL >> 11)
#define __SWP_OFFSET_MASK ((1UL << 52) - 1)
#define __SWP_OFFSET_SHIFT 12
#define __SWP_TYPE_MASK ((1UL << 5) - 1)
#define __SWP_TYPE_SHIFT 2
static inline pte_t mk_swap_pte(unsigned long type, unsigned long offset)
{
pte_t pte;
offset &= __SWP_OFFSET_MASK;
pte_val(pte) = _PAGE_INVALID | _PAGE_TYPE | ((type & 0x1f) << 2) |
((offset & 1UL) << 7) | ((offset & ~1UL) << 11);
pte_val(pte) = _PAGE_INVALID | _PAGE_PROTECT;
pte_val(pte) |= (offset & __SWP_OFFSET_MASK) << __SWP_OFFSET_SHIFT;
pte_val(pte) |= (type & __SWP_TYPE_MASK) << __SWP_TYPE_SHIFT;
return pte;
}
#define __swp_type(entry) (((entry).val >> 2) & 0x1f)
#define __swp_offset(entry) (((entry).val >> 11) | (((entry).val >> 7) & 1))
#define __swp_entry(type,offset) ((swp_entry_t) { pte_val(mk_swap_pte((type),(offset))) })
static inline unsigned long __swp_type(swp_entry_t entry)
{
return (entry.val >> __SWP_TYPE_SHIFT) & __SWP_TYPE_MASK;
}
static inline unsigned long __swp_offset(swp_entry_t entry)
{
return (entry.val >> __SWP_OFFSET_SHIFT) & __SWP_OFFSET_MASK;
}
static inline swp_entry_t __swp_entry(unsigned long type, unsigned long offset)
{
return (swp_entry_t) { pte_val(mk_swap_pte(type, offset)) };
}
#define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) })
#define __swp_entry_to_pte(x) ((pte_t) { (x).val })
......
......@@ -14,20 +14,23 @@ static inline pmd_t __pte_to_pmd(pte_t pte)
/*
* Convert encoding pte bits pmd bits
* .IR...wrdytp dy..R...I...wr
* empty .10...000000 -> 00..0...1...00
* prot-none, clean, old .11...000001 -> 00..1...1...00
* prot-none, clean, young .11...000101 -> 01..1...1...00
* prot-none, dirty, old .10...001001 -> 10..1...1...00
* prot-none, dirty, young .10...001101 -> 11..1...1...00
* read-only, clean, old .11...010001 -> 00..1...1...01
* read-only, clean, young .01...010101 -> 01..1...0...01
* read-only, dirty, old .11...011001 -> 10..1...1...01
* read-only, dirty, young .01...011101 -> 11..1...0...01
* read-write, clean, old .11...110001 -> 00..0...1...11
* read-write, clean, young .01...110101 -> 01..0...0...11
* read-write, dirty, old .10...111001 -> 10..0...1...11
* read-write, dirty, young .00...111101 -> 11..0...0...11
* lIR.uswrdy.p dy..R...I...wr
* empty 010.000000.0 -> 00..0...1...00
* prot-none, clean, old 111.000000.1 -> 00..1...1...00
* prot-none, clean, young 111.000001.1 -> 01..1...1...00
* prot-none, dirty, old 111.000010.1 -> 10..1...1...00
* prot-none, dirty, young 111.000011.1 -> 11..1...1...00
* read-only, clean, old 111.000100.1 -> 00..1...1...01
* read-only, clean, young 101.000101.1 -> 01..1...0...01
* read-only, dirty, old 111.000110.1 -> 10..1...1...01
* read-only, dirty, young 101.000111.1 -> 11..1...0...01
* read-write, clean, old 111.001100.1 -> 00..1...1...11
* read-write, clean, young 101.001101.1 -> 01..1...0...11
* read-write, dirty, old 110.001110.1 -> 10..0...1...11
* read-write, dirty, young 100.001111.1 -> 11..0...0...11
* HW-bits: R read-only, I invalid
* SW-bits: p present, y young, d dirty, r read, w write, s special,
* u unused, l large
*/
if (pte_present(pte)) {
pmd_val(pmd) = pte_val(pte) & PAGE_MASK;
......@@ -48,20 +51,23 @@ static inline pte_t __pmd_to_pte(pmd_t pmd)
/*
* Convert encoding pmd bits pte bits
* dy..R...I...wr .IR...wrdytp
* empty 00..0...1...00 -> .10...001100
* prot-none, clean, old 00..0...1...00 -> .10...000001
* prot-none, clean, young 01..0...1...00 -> .10...000101
* prot-none, dirty, old 10..0...1...00 -> .10...001001
* prot-none, dirty, young 11..0...1...00 -> .10...001101
* read-only, clean, old 00..1...1...01 -> .11...010001
* read-only, clean, young 01..1...1...01 -> .11...010101
* read-only, dirty, old 10..1...1...01 -> .11...011001
* read-only, dirty, young 11..1...1...01 -> .11...011101
* read-write, clean, old 00..0...1...11 -> .10...110001
* read-write, clean, young 01..0...1...11 -> .10...110101
* read-write, dirty, old 10..0...1...11 -> .10...111001
* read-write, dirty, young 11..0...1...11 -> .10...111101
* dy..R...I...wr lIR.uswrdy.p
* empty 00..0...1...00 -> 010.000000.0
* prot-none, clean, old 00..1...1...00 -> 111.000000.1
* prot-none, clean, young 01..1...1...00 -> 111.000001.1
* prot-none, dirty, old 10..1...1...00 -> 111.000010.1
* prot-none, dirty, young 11..1...1...00 -> 111.000011.1
* read-only, clean, old 00..1...1...01 -> 111.000100.1
* read-only, clean, young 01..1...0...01 -> 101.000101.1
* read-only, dirty, old 10..1...1...01 -> 111.000110.1
* read-only, dirty, young 11..1...0...01 -> 101.000111.1
* read-write, clean, old 00..1...1...11 -> 111.001100.1
* read-write, clean, young 01..1...0...11 -> 101.001101.1
* read-write, dirty, old 10..0...1...11 -> 110.001110.1
* read-write, dirty, young 11..0...0...11 -> 100.001111.1
* HW-bits: R read-only, I invalid
* SW-bits: p present, y young, d dirty, r read, w write, s special,
* u unused, l large
*/
if (pmd_present(pmd)) {
pte_val(pte) = pmd_val(pmd) & _SEGMENT_ENTRY_ORIGIN_LARGE;
......@@ -70,8 +76,8 @@ static inline pte_t __pmd_to_pte(pmd_t pmd)
pte_val(pte) |= (pmd_val(pmd) & _SEGMENT_ENTRY_WRITE) << 4;
pte_val(pte) |= (pmd_val(pmd) & _SEGMENT_ENTRY_INVALID) << 5;
pte_val(pte) |= (pmd_val(pmd) & _SEGMENT_ENTRY_PROTECT);
pmd_val(pmd) |= (pte_val(pte) & _PAGE_DIRTY) << 10;
pmd_val(pmd) |= (pte_val(pte) & _PAGE_YOUNG) << 10;
pte_val(pte) |= (pmd_val(pmd) & _SEGMENT_ENTRY_DIRTY) >> 10;
pte_val(pte) |= (pmd_val(pmd) & _SEGMENT_ENTRY_YOUNG) >> 10;
} else
pte_val(pte) = _PAGE_INVALID;
return pte;
......
......@@ -18,6 +18,7 @@
#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/swapops.h>
#include <linux/sysctl.h>
#include <linux/ksm.h>
#include <linux/mman.h>
......@@ -920,6 +921,40 @@ unsigned long get_guest_storage_key(struct mm_struct *mm, unsigned long addr)
}
EXPORT_SYMBOL(get_guest_storage_key);
static int page_table_allocate_pgste_min = 0;
static int page_table_allocate_pgste_max = 1;
int page_table_allocate_pgste = 0;
EXPORT_SYMBOL(page_table_allocate_pgste);
static struct ctl_table page_table_sysctl[] = {
{
.procname = "allocate_pgste",
.data = &page_table_allocate_pgste,
.maxlen = sizeof(int),
.mode = S_IRUGO | S_IWUSR,
.proc_handler = proc_dointvec,
.extra1 = &page_table_allocate_pgste_min,
.extra2 = &page_table_allocate_pgste_max,
},
{ }
};
static struct ctl_table page_table_sysctl_dir[] = {
{
.procname = "vm",
.maxlen = 0,
.mode = 0555,
.child = page_table_sysctl,
},
{ }
};
static int __init page_table_register_sysctl(void)
{
return register_sysctl_table(page_table_sysctl_dir) ? 0 : -ENOMEM;
}
__initcall(page_table_register_sysctl);
#else /* CONFIG_PGSTE */
static inline int page_table_with_pgste(struct page *page)
......@@ -963,7 +998,7 @@ unsigned long *page_table_alloc(struct mm_struct *mm)
struct page *uninitialized_var(page);
unsigned int mask, bit;
if (mm_has_pgste(mm))
if (mm_alloc_pgste(mm))
return page_table_alloc_pgste(mm);
/* Allocate fragments of a 4K page as 1K/2K page table */
spin_lock_bh(&mm->context.list_lock);
......@@ -1165,116 +1200,25 @@ static inline void thp_split_mm(struct mm_struct *mm)
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
static unsigned long page_table_realloc_pmd(struct mmu_gather *tlb,
struct mm_struct *mm, pud_t *pud,
unsigned long addr, unsigned long end)
{
unsigned long next, *table, *new;
struct page *page;
spinlock_t *ptl;
pmd_t *pmd;
pmd = pmd_offset(pud, addr);
do {
next = pmd_addr_end(addr, end);
again:
if (pmd_none_or_clear_bad(pmd))
continue;
table = (unsigned long *) pmd_deref(*pmd);
page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
if (page_table_with_pgste(page))
continue;
/* Allocate new page table with pgstes */
new = page_table_alloc_pgste(mm);
if (!new)
return -ENOMEM;
ptl = pmd_lock(mm, pmd);
if (likely((unsigned long *) pmd_deref(*pmd) == table)) {
/* Nuke pmd entry pointing to the "short" page table */
pmdp_flush_lazy(mm, addr, pmd);
pmd_clear(pmd);
/* Copy ptes from old table to new table */
memcpy(new, table, PAGE_SIZE/2);
clear_table(table, _PAGE_INVALID, PAGE_SIZE/2);
/* Establish new table */
pmd_populate(mm, pmd, (pte_t *) new);
/* Free old table with rcu, there might be a walker! */
page_table_free_rcu(tlb, table, addr);
new = NULL;
}
spin_unlock(ptl);
if (new) {
page_table_free_pgste(new);
goto again;
}
} while (pmd++, addr = next, addr != end);
return addr;
}
static unsigned long page_table_realloc_pud(struct mmu_gather *tlb,
struct mm_struct *mm, pgd_t *pgd,
unsigned long addr, unsigned long end)
{
unsigned long next;
pud_t *pud;
pud = pud_offset(pgd, addr);
do {
next = pud_addr_end(addr, end);
if (pud_none_or_clear_bad(pud))
continue;
next = page_table_realloc_pmd(tlb, mm, pud, addr, next);
if (unlikely(IS_ERR_VALUE(next)))
return next;
} while (pud++, addr = next, addr != end);
return addr;
}
static unsigned long page_table_realloc(struct mmu_gather *tlb, struct mm_struct *mm,
unsigned long addr, unsigned long end)
{
unsigned long next;
pgd_t *pgd;
pgd = pgd_offset(mm, addr);
do {
next = pgd_addr_end(addr, end);
if (pgd_none_or_clear_bad(pgd))
continue;
next = page_table_realloc_pud(tlb, mm, pgd, addr, next);
if (unlikely(IS_ERR_VALUE(next)))
return next;
} while (pgd++, addr = next, addr != end);
return 0;
}
/*
* switch on pgstes for its userspace process (for kvm)
*/
int s390_enable_sie(void)
{
struct task_struct *tsk = current;
struct mm_struct *mm = tsk->mm;
struct mmu_gather tlb;
struct mm_struct *mm = current->mm;
/* Do we have pgstes? if yes, we are done */
if (mm_has_pgste(tsk->mm))
if (mm_has_pgste(mm))
return 0;
/* Fail if the page tables are 2K */
if (!mm_alloc_pgste(mm))
return -EINVAL;
down_write(&mm->mmap_sem);
mm->context.has_pgste = 1;
/* split thp mappings and disable thp for future mappings */
thp_split_mm(mm);
/* Reallocate the page tables with pgstes */
tlb_gather_mmu(&tlb, mm, 0, TASK_SIZE);
if (!page_table_realloc(&tlb, mm, 0, TASK_SIZE))
mm->context.has_pgste = 1;
tlb_finish_mmu(&tlb, 0, TASK_SIZE);
up_write(&mm->mmap_sem);
return mm->context.has_pgste ? 0 : -ENOMEM;
return 0;
}
EXPORT_SYMBOL_GPL(s390_enable_sie);
......
......@@ -667,6 +667,8 @@ static struct raw3215_info *raw3215_alloc_info(void)
info->buffer = kzalloc(RAW3215_BUFFER_SIZE, GFP_KERNEL | GFP_DMA);
info->inbuf = kzalloc(RAW3215_INBUF_SIZE, GFP_KERNEL | GFP_DMA);
if (!info->buffer || !info->inbuf) {
kfree(info->inbuf);
kfree(info->buffer);
kfree(info);
return NULL;
}
......
......@@ -40,6 +40,10 @@
#error KEXEC_CONTROL_MEMORY_LIMIT not defined
#endif
#ifndef KEXEC_CONTROL_MEMORY_GFP
#define KEXEC_CONTROL_MEMORY_GFP GFP_KERNEL
#endif
#ifndef KEXEC_CONTROL_PAGE_SIZE
#error KEXEC_CONTROL_PAGE_SIZE not defined
#endif
......
......@@ -707,7 +707,7 @@ static struct page *kimage_alloc_normal_control_pages(struct kimage *image,
do {
unsigned long pfn, epfn, addr, eaddr;
pages = kimage_alloc_pages(GFP_KERNEL, order);
pages = kimage_alloc_pages(KEXEC_CONTROL_MEMORY_GFP, order);
if (!pages)
break;
pfn = page_to_pfn(pages);
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册