diff --git a/arch/s390/include/asm/gmap.h b/arch/s390/include/asm/gmap.h index d054c1b07a3c3988d2d0c1bf195f7b404ded5bef..741ddba0bf11b547a791fc41a092e426c4f9ffef 100644 --- a/arch/s390/include/asm/gmap.h +++ b/arch/s390/include/asm/gmap.h @@ -10,14 +10,25 @@ /** * struct gmap_struct - guest address space + * @list: list head for the mm->context gmap list * @crst_list: list of all crst tables used in the guest address space * @mm: pointer to the parent mm_struct * @guest_to_host: radix tree with guest to host address translation * @host_to_guest: radix tree with pointer to segment table entries * @guest_table_lock: spinlock to protect all entries in the guest page table + * @ref_count: reference counter for the gmap structure * @table: pointer to the page directory * @asce: address space control element for gmap page table * @pfault_enabled: defines if pfaults are applicable for the guest + * @host_to_rmap: radix tree with gmap_rmap lists + * @children: list of shadow gmap structures + * @pt_list: list of all page tables used in the shadow guest address space + * @shadow_lock: spinlock to protect the shadow gmap list + * @parent: pointer to the parent gmap for shadow guest address spaces + * @orig_asce: ASCE for which the shadow page table has been created + * @edat_level: edat level to be used for the shadow translation + * @removed: flag to indicate if a shadow guest address space has been removed + * @initialized: flag to indicate if a shadow guest address space can be used */ struct gmap { struct list_head list; @@ -26,26 +37,64 @@ struct gmap { struct radix_tree_root guest_to_host; struct radix_tree_root host_to_guest; spinlock_t guest_table_lock; + atomic_t ref_count; unsigned long *table; unsigned long asce; unsigned long asce_end; void *private; bool pfault_enabled; + /* Additional data for shadow guest address spaces */ + struct radix_tree_root host_to_rmap; + struct list_head children; + struct list_head pt_list; + spinlock_t shadow_lock; + struct gmap *parent; + unsigned long orig_asce; + int edat_level; + bool removed; + bool initialized; }; +/** + * struct gmap_rmap - reverse mapping for shadow page table entries + * @next: pointer to next rmap in the list + * @raddr: virtual rmap address in the shadow guest address space + */ +struct gmap_rmap { + struct gmap_rmap *next; + unsigned long raddr; +}; + +#define gmap_for_each_rmap(pos, head) \ + for (pos = (head); pos; pos = pos->next) + +#define gmap_for_each_rmap_safe(pos, n, head) \ + for (pos = (head); n = pos ? pos->next : NULL, pos; pos = n) + /** * struct gmap_notifier - notify function block for page invalidation * @notifier_call: address of callback function */ struct gmap_notifier { struct list_head list; - void (*notifier_call)(struct gmap *gmap, unsigned long gaddr); + struct rcu_head rcu; + void (*notifier_call)(struct gmap *gmap, unsigned long start, + unsigned long end); }; -struct gmap *gmap_alloc(struct mm_struct *mm, unsigned long limit); -void gmap_free(struct gmap *gmap); +static inline int gmap_is_shadow(struct gmap *gmap) +{ + return !!gmap->parent; +} + +struct gmap *gmap_create(struct mm_struct *mm, unsigned long limit); +void gmap_remove(struct gmap *gmap); +struct gmap *gmap_get(struct gmap *gmap); +void gmap_put(struct gmap *gmap); + void gmap_enable(struct gmap *gmap); void gmap_disable(struct gmap *gmap); +struct gmap *gmap_get_enabled(void); int gmap_map_segment(struct gmap *gmap, unsigned long from, unsigned long to, unsigned long len); int gmap_unmap_segment(struct gmap *gmap, unsigned long to, unsigned long len); @@ -57,8 +106,29 @@ void gmap_discard(struct gmap *, unsigned long from, unsigned long to); void __gmap_zap(struct gmap *, unsigned long gaddr); void gmap_unlink(struct mm_struct *, unsigned long *table, unsigned long vmaddr); -void gmap_register_ipte_notifier(struct gmap_notifier *); -void gmap_unregister_ipte_notifier(struct gmap_notifier *); -int gmap_ipte_notify(struct gmap *, unsigned long start, unsigned long len); +int gmap_read_table(struct gmap *gmap, unsigned long gaddr, unsigned long *val); + +struct gmap *gmap_shadow(struct gmap *parent, unsigned long asce, + int edat_level); +int gmap_shadow_valid(struct gmap *sg, unsigned long asce, int edat_level); +int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t, + int fake); +int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t, + int fake); +int gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt, + int fake); +int gmap_shadow_pgt(struct gmap *sg, unsigned long saddr, unsigned long pgt, + int fake); +int gmap_shadow_pgt_lookup(struct gmap *sg, unsigned long saddr, + unsigned long *pgt, int *dat_protection, int *fake); +int gmap_shadow_page(struct gmap *sg, unsigned long saddr, pte_t pte); + +void gmap_register_pte_notifier(struct gmap_notifier *); +void gmap_unregister_pte_notifier(struct gmap_notifier *); +void gmap_pte_notify(struct mm_struct *, unsigned long addr, pte_t *, + unsigned long bits); + +int gmap_mprotect_notify(struct gmap *, unsigned long start, + unsigned long len, int prot); #endif /* _ASM_S390_GMAP_H */ diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index 9eed5c18a61c390c2494fae189426661ed11b014..946fc86202fdac371a6e32028c16b11967feb668 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -145,7 +145,7 @@ struct kvm_s390_sie_block { __u64 cputm; /* 0x0028 */ __u64 ckc; /* 0x0030 */ __u64 epoch; /* 0x0038 */ - __u8 reserved40[4]; /* 0x0040 */ + __u32 svcc; /* 0x0040 */ #define LCTL_CR0 0x8000 #define LCTL_CR6 0x0200 #define LCTL_CR9 0x0040 @@ -167,6 +167,9 @@ struct kvm_s390_sie_block { #define ICPT_INST 0x04 #define ICPT_PROGI 0x08 #define ICPT_INSTPROGI 0x0C +#define ICPT_EXTINT 0x14 +#define ICPT_VALIDITY 0x20 +#define ICPT_STOP 0x28 #define ICPT_OPEREXC 0x2C #define ICPT_PARTEXEC 0x38 #define ICPT_IOINST 0x40 @@ -226,7 +229,7 @@ struct kvm_s390_sie_block { __u8 reserved1e6[2]; /* 0x01e6 */ __u64 itdba; /* 0x01e8 */ __u64 riccbd; /* 0x01f0 */ - __u8 reserved1f8[8]; /* 0x01f8 */ + __u64 gvrd; /* 0x01f8 */ } __attribute__((packed)); struct kvm_s390_itdb { @@ -281,6 +284,7 @@ struct kvm_vcpu_stat { u32 instruction_stsi; u32 instruction_stfl; u32 instruction_tprot; + u32 instruction_sie; u32 instruction_essa; u32 instruction_sthyi; u32 instruction_sigp_sense; @@ -545,12 +549,16 @@ struct kvm_guestdbg_info_arch { struct kvm_vcpu_arch { struct kvm_s390_sie_block *sie_block; + /* if vsie is active, currently executed shadow sie control block */ + struct kvm_s390_sie_block *vsie_block; unsigned int host_acrs[NUM_ACRS]; struct fpu host_fpregs; struct kvm_s390_local_interrupt local_int; struct hrtimer ckc_timer; struct kvm_s390_pgm_info pgm; struct gmap *gmap; + /* backup location for the currently enabled gmap when scheduled out */ + struct gmap *enabled_gmap; struct kvm_guestdbg_info_arch guestdbg; unsigned long pfault_token; unsigned long pfault_select; @@ -635,6 +643,14 @@ struct sie_page2 { u8 reserved900[0x1000 - 0x900]; /* 0x0900 */ } __packed; +struct kvm_s390_vsie { + struct mutex mutex; + struct radix_tree_root addr_to_page; + int page_count; + int next; + struct page *pages[KVM_MAX_VCPUS]; +}; + struct kvm_arch{ void *sca; int use_esca; @@ -659,6 +675,7 @@ struct kvm_arch{ struct sie_page2 *sie_page2; struct kvm_s390_cpu_model model; struct kvm_s390_crypto crypto; + struct kvm_s390_vsie vsie; u64 epoch; /* subset of available cpu features enabled by user space */ DECLARE_BITMAP(cpu_feat, KVM_S390_VM_CPU_FEAT_NR_BITS); diff --git a/arch/s390/include/asm/mmu.h b/arch/s390/include/asm/mmu.h index 081b2ad99d737780a9d7d31006a51228896172ba..b941528cc49e69ef2259978284f71fdcf3eadd5d 100644 --- a/arch/s390/include/asm/mmu.h +++ b/arch/s390/include/asm/mmu.h @@ -8,8 +8,9 @@ typedef struct { cpumask_t cpu_attach_mask; atomic_t attach_count; unsigned int flush_mm; - spinlock_t list_lock; + spinlock_t pgtable_lock; struct list_head pgtable_list; + spinlock_t gmap_lock; struct list_head gmap_list; unsigned long asce; unsigned long asce_limit; @@ -22,9 +23,11 @@ typedef struct { unsigned int use_skey:1; } mm_context_t; -#define INIT_MM_CONTEXT(name) \ - .context.list_lock = __SPIN_LOCK_UNLOCKED(name.context.list_lock), \ - .context.pgtable_list = LIST_HEAD_INIT(name.context.pgtable_list), \ +#define INIT_MM_CONTEXT(name) \ + .context.pgtable_lock = \ + __SPIN_LOCK_UNLOCKED(name.context.pgtable_lock), \ + .context.pgtable_list = LIST_HEAD_INIT(name.context.pgtable_list), \ + .context.gmap_lock = __SPIN_LOCK_UNLOCKED(name.context.gmap_lock), \ .context.gmap_list = LIST_HEAD_INIT(name.context.gmap_list), static inline int tprot(unsigned long addr) diff --git a/arch/s390/include/asm/mmu_context.h b/arch/s390/include/asm/mmu_context.h index c837b79b455dc8615f55957e3475c00c35f56a17..3ce3854b7a41b8225c5c2b875cdfe2fe5684e8d9 100644 --- a/arch/s390/include/asm/mmu_context.h +++ b/arch/s390/include/asm/mmu_context.h @@ -15,8 +15,9 @@ static inline int init_new_context(struct task_struct *tsk, struct mm_struct *mm) { - spin_lock_init(&mm->context.list_lock); + spin_lock_init(&mm->context.pgtable_lock); INIT_LIST_HEAD(&mm->context.pgtable_list); + spin_lock_init(&mm->context.gmap_lock); INIT_LIST_HEAD(&mm->context.gmap_list); cpumask_clear(&mm->context.cpu_attach_mask); atomic_set(&mm->context.attach_count, 0); diff --git a/arch/s390/include/asm/page.h b/arch/s390/include/asm/page.h index f874e7d51c191948c21f51481380c39c05a674a9..b5edff32e547b630ae50ecc8eeb51b5c0b25b1b8 100644 --- a/arch/s390/include/asm/page.h +++ b/arch/s390/include/asm/page.h @@ -147,6 +147,8 @@ static inline int devmem_is_allowed(unsigned long pfn) #define virt_to_page(kaddr) pfn_to_page(__pa(kaddr) >> PAGE_SHIFT) #define page_to_phys(page) (page_to_pfn(page) << PAGE_SHIFT) #define virt_addr_valid(kaddr) pfn_valid(__pa(kaddr) >> PAGE_SHIFT) +#define pfn_to_virt(pfn) __va((pfn) << PAGE_SHIFT) +#define page_to_virt(page) pfn_to_virt(page_to_pfn(page)) #define VM_DATA_DEFAULT_FLAGS (VM_READ | VM_WRITE | \ VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) diff --git a/arch/s390/include/asm/pgalloc.h b/arch/s390/include/asm/pgalloc.h index da34cb6b1f3b0347f8faef06a91d5a050b2140b9..f4eb9843eed4a6bfc399bacfd2ab5f325b9b743f 100644 --- a/arch/s390/include/asm/pgalloc.h +++ b/arch/s390/include/asm/pgalloc.h @@ -19,8 +19,10 @@ unsigned long *crst_table_alloc(struct mm_struct *); void crst_table_free(struct mm_struct *, unsigned long *); unsigned long *page_table_alloc(struct mm_struct *); +struct page *page_table_alloc_pgste(struct mm_struct *mm); void page_table_free(struct mm_struct *, unsigned long *); void page_table_free_rcu(struct mmu_gather *, unsigned long *, unsigned long); +void page_table_free_pgste(struct page *page); extern int page_table_allocate_pgste; static inline void clear_table(unsigned long *s, unsigned long val, size_t n) diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index 9951e7e597563239d19e469de062543fb6c4301a..c7ebba483f097f07aa0c03a5a678ac5390ead75b 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -256,6 +256,7 @@ static inline int is_module_addr(void *addr) /* Bits in the region table entry */ #define _REGION_ENTRY_ORIGIN ~0xfffUL/* region/segment table origin */ #define _REGION_ENTRY_PROTECT 0x200 /* region protection bit */ +#define _REGION_ENTRY_OFFSET 0xc0 /* region table offset */ #define _REGION_ENTRY_INVALID 0x20 /* invalid region table entry */ #define _REGION_ENTRY_TYPE_MASK 0x0c /* region/segment table type mask */ #define _REGION_ENTRY_TYPE_R1 0x0c /* region first table type */ @@ -327,6 +328,7 @@ static inline int is_module_addr(void *addr) #define PGSTE_GC_BIT 0x0002000000000000UL #define PGSTE_UC_BIT 0x0000800000000000UL /* user dirty (migration) */ #define PGSTE_IN_BIT 0x0000400000000000UL /* IPTE notify bit */ +#define PGSTE_VSIE_BIT 0x0000200000000000UL /* ref'd in a shadow table */ /* Guest Page State used for virtualization */ #define _PGSTE_GPS_ZERO 0x0000000080000000UL @@ -885,10 +887,16 @@ static inline int ptep_set_access_flags(struct vm_area_struct *vma, void ptep_set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t entry); void ptep_set_notify(struct mm_struct *mm, unsigned long addr, pte_t *ptep); -void ptep_notify(struct mm_struct *mm, unsigned long addr, pte_t *ptep); +void ptep_notify(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, unsigned long bits); +int ptep_force_prot(struct mm_struct *mm, unsigned long gaddr, + pte_t *ptep, int prot, unsigned long bit); void ptep_zap_unused(struct mm_struct *mm, unsigned long addr, pte_t *ptep , int reset); void ptep_zap_key(struct mm_struct *mm, unsigned long addr, pte_t *ptep); +int ptep_shadow_pte(struct mm_struct *mm, unsigned long saddr, + pte_t *sptep, pte_t *tptep, pte_t pte); +void ptep_unshadow_pte(struct mm_struct *mm, unsigned long saddr, pte_t *ptep); bool test_and_clear_guest_dirty(struct mm_struct *mm, unsigned long address); int set_guest_storage_key(struct mm_struct *mm, unsigned long addr, diff --git a/arch/s390/include/asm/processor.h b/arch/s390/include/asm/processor.h index 9d4d311d7e522dbcfba9d4bd01ebe67122ddf72c..8c2922f540f9e1687fc7f256af9f6001b4315871 100644 --- a/arch/s390/include/asm/processor.h +++ b/arch/s390/include/asm/processor.h @@ -109,6 +109,8 @@ struct thread_struct { unsigned long ksp; /* kernel stack pointer */ mm_segment_t mm_segment; unsigned long gmap_addr; /* address of last gmap fault. */ + unsigned int gmap_write_flag; /* gmap fault write indication */ + unsigned int gmap_int_code; /* int code of last gmap fault */ unsigned int gmap_pfault; /* signal of a pending guest pfault */ struct per_regs per_user; /* User specified PER registers */ struct per_event per_event; /* Cause of the last PER trap */ diff --git a/arch/s390/include/uapi/asm/kvm.h b/arch/s390/include/uapi/asm/kvm.h index f0818d70d73dc88155de53079aafac93ef4d1dff..a2ffec4139ad1cb8cebe816a9f0b3e261cd97d42 100644 --- a/arch/s390/include/uapi/asm/kvm.h +++ b/arch/s390/include/uapi/asm/kvm.h @@ -98,6 +98,18 @@ struct kvm_s390_vm_cpu_machine { #define KVM_S390_VM_CPU_FEAT_NR_BITS 1024 #define KVM_S390_VM_CPU_FEAT_ESOP 0 +#define KVM_S390_VM_CPU_FEAT_SIEF2 1 +#define KVM_S390_VM_CPU_FEAT_64BSCAO 2 +#define KVM_S390_VM_CPU_FEAT_SIIF 3 +#define KVM_S390_VM_CPU_FEAT_GPERE 4 +#define KVM_S390_VM_CPU_FEAT_GSLS 5 +#define KVM_S390_VM_CPU_FEAT_IB 6 +#define KVM_S390_VM_CPU_FEAT_CEI 7 +#define KVM_S390_VM_CPU_FEAT_IBS 8 +#define KVM_S390_VM_CPU_FEAT_SKEY 9 +#define KVM_S390_VM_CPU_FEAT_CMMA 10 +#define KVM_S390_VM_CPU_FEAT_PFMFI 11 +#define KVM_S390_VM_CPU_FEAT_SIGPIF 12 struct kvm_s390_vm_cpu_feat { __u64 feat[16]; }; diff --git a/arch/s390/kvm/Makefile b/arch/s390/kvm/Makefile index 82e73e2b953d172079edda559e9eae5b37ab2ba1..09a9e6dfc09f66c9370031360c418f8a226179c0 100644 --- a/arch/s390/kvm/Makefile +++ b/arch/s390/kvm/Makefile @@ -12,6 +12,6 @@ common-objs = $(KVM)/kvm_main.o $(KVM)/eventfd.o $(KVM)/async_pf.o $(KVM)/irqch ccflags-y := -Ivirt/kvm -Iarch/s390/kvm kvm-objs := $(common-objs) kvm-s390.o intercept.o interrupt.o priv.o sigp.o -kvm-objs += diag.o gaccess.o guestdbg.o sthyi.o +kvm-objs += diag.o gaccess.o guestdbg.o sthyi.o vsie.o obj-$(CONFIG_KVM) += kvm.o diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c index 8e245e764c210ad908dfb33270414b70186c6692..54200208bf24fe73f0fdb8c7b1aac639e6160e86 100644 --- a/arch/s390/kvm/gaccess.c +++ b/arch/s390/kvm/gaccess.c @@ -8,6 +8,7 @@ #include #include #include +#include #include "kvm-s390.h" #include "gaccess.h" #include @@ -946,3 +947,241 @@ int kvm_s390_check_low_addr_prot_real(struct kvm_vcpu *vcpu, unsigned long gra) return 0; return trans_exc(vcpu, PGM_PROTECTION, gra, 0, GACC_STORE, PROT_TYPE_LA); } + +/** + * kvm_s390_shadow_tables - walk the guest page table and create shadow tables + * @sg: pointer to the shadow guest address space structure + * @saddr: faulting address in the shadow gmap + * @pgt: pointer to the page table address result + * @fake: pgt references contiguous guest memory block, not a pgtable + */ +static int kvm_s390_shadow_tables(struct gmap *sg, unsigned long saddr, + unsigned long *pgt, int *dat_protection, + int *fake) +{ + struct gmap *parent; + union asce asce; + union vaddress vaddr; + unsigned long ptr; + int rc; + + *fake = 0; + *dat_protection = 0; + parent = sg->parent; + vaddr.addr = saddr; + asce.val = sg->orig_asce; + ptr = asce.origin * 4096; + if (asce.r) { + *fake = 1; + asce.dt = ASCE_TYPE_REGION1; + } + switch (asce.dt) { + case ASCE_TYPE_REGION1: + if (vaddr.rfx01 > asce.tl && !asce.r) + return PGM_REGION_FIRST_TRANS; + break; + case ASCE_TYPE_REGION2: + if (vaddr.rfx) + return PGM_ASCE_TYPE; + if (vaddr.rsx01 > asce.tl) + return PGM_REGION_SECOND_TRANS; + break; + case ASCE_TYPE_REGION3: + if (vaddr.rfx || vaddr.rsx) + return PGM_ASCE_TYPE; + if (vaddr.rtx01 > asce.tl) + return PGM_REGION_THIRD_TRANS; + break; + case ASCE_TYPE_SEGMENT: + if (vaddr.rfx || vaddr.rsx || vaddr.rtx) + return PGM_ASCE_TYPE; + if (vaddr.sx01 > asce.tl) + return PGM_SEGMENT_TRANSLATION; + break; + } + + switch (asce.dt) { + case ASCE_TYPE_REGION1: { + union region1_table_entry rfte; + + if (*fake) { + /* offset in 16EB guest memory block */ + ptr = ptr + ((unsigned long) vaddr.rsx << 53UL); + rfte.val = ptr; + goto shadow_r2t; + } + rc = gmap_read_table(parent, ptr + vaddr.rfx * 8, &rfte.val); + if (rc) + return rc; + if (rfte.i) + return PGM_REGION_FIRST_TRANS; + if (rfte.tt != TABLE_TYPE_REGION1) + return PGM_TRANSLATION_SPEC; + if (vaddr.rsx01 < rfte.tf || vaddr.rsx01 > rfte.tl) + return PGM_REGION_SECOND_TRANS; + if (sg->edat_level >= 1) + *dat_protection |= rfte.p; + ptr = rfte.rto << 12UL; +shadow_r2t: + rc = gmap_shadow_r2t(sg, saddr, rfte.val, *fake); + if (rc) + return rc; + /* fallthrough */ + } + case ASCE_TYPE_REGION2: { + union region2_table_entry rste; + + if (*fake) { + /* offset in 8PB guest memory block */ + ptr = ptr + ((unsigned long) vaddr.rtx << 42UL); + rste.val = ptr; + goto shadow_r3t; + } + rc = gmap_read_table(parent, ptr + vaddr.rsx * 8, &rste.val); + if (rc) + return rc; + if (rste.i) + return PGM_REGION_SECOND_TRANS; + if (rste.tt != TABLE_TYPE_REGION2) + return PGM_TRANSLATION_SPEC; + if (vaddr.rtx01 < rste.tf || vaddr.rtx01 > rste.tl) + return PGM_REGION_THIRD_TRANS; + if (sg->edat_level >= 1) + *dat_protection |= rste.p; + ptr = rste.rto << 12UL; +shadow_r3t: + rste.p |= *dat_protection; + rc = gmap_shadow_r3t(sg, saddr, rste.val, *fake); + if (rc) + return rc; + /* fallthrough */ + } + case ASCE_TYPE_REGION3: { + union region3_table_entry rtte; + + if (*fake) { + /* offset in 4TB guest memory block */ + ptr = ptr + ((unsigned long) vaddr.sx << 31UL); + rtte.val = ptr; + goto shadow_sgt; + } + rc = gmap_read_table(parent, ptr + vaddr.rtx * 8, &rtte.val); + if (rc) + return rc; + if (rtte.i) + return PGM_REGION_THIRD_TRANS; + if (rtte.tt != TABLE_TYPE_REGION3) + return PGM_TRANSLATION_SPEC; + if (rtte.cr && asce.p && sg->edat_level >= 2) + return PGM_TRANSLATION_SPEC; + if (rtte.fc && sg->edat_level >= 2) { + *dat_protection |= rtte.fc0.p; + *fake = 1; + ptr = rtte.fc1.rfaa << 31UL; + rtte.val = ptr; + goto shadow_sgt; + } + if (vaddr.sx01 < rtte.fc0.tf || vaddr.sx01 > rtte.fc0.tl) + return PGM_SEGMENT_TRANSLATION; + if (sg->edat_level >= 1) + *dat_protection |= rtte.fc0.p; + ptr = rtte.fc0.sto << 12UL; +shadow_sgt: + rtte.fc0.p |= *dat_protection; + rc = gmap_shadow_sgt(sg, saddr, rtte.val, *fake); + if (rc) + return rc; + /* fallthrough */ + } + case ASCE_TYPE_SEGMENT: { + union segment_table_entry ste; + + if (*fake) { + /* offset in 2G guest memory block */ + ptr = ptr + ((unsigned long) vaddr.sx << 20UL); + ste.val = ptr; + goto shadow_pgt; + } + rc = gmap_read_table(parent, ptr + vaddr.sx * 8, &ste.val); + if (rc) + return rc; + if (ste.i) + return PGM_SEGMENT_TRANSLATION; + if (ste.tt != TABLE_TYPE_SEGMENT) + return PGM_TRANSLATION_SPEC; + if (ste.cs && asce.p) + return PGM_TRANSLATION_SPEC; + *dat_protection |= ste.fc0.p; + if (ste.fc && sg->edat_level >= 1) { + *fake = 1; + ptr = ste.fc1.sfaa << 20UL; + ste.val = ptr; + goto shadow_pgt; + } + ptr = ste.fc0.pto << 11UL; +shadow_pgt: + ste.fc0.p |= *dat_protection; + rc = gmap_shadow_pgt(sg, saddr, ste.val, *fake); + if (rc) + return rc; + } + } + /* Return the parent address of the page table */ + *pgt = ptr; + return 0; +} + +/** + * kvm_s390_shadow_fault - handle fault on a shadow page table + * @vcpu: virtual cpu + * @sg: pointer to the shadow guest address space structure + * @saddr: faulting address in the shadow gmap + * + * Returns: - 0 if the shadow fault was successfully resolved + * - > 0 (pgm exception code) on exceptions while faulting + * - -EAGAIN if the caller can retry immediately + * - -EFAULT when accessing invalid guest addresses + * - -ENOMEM if out of memory + */ +int kvm_s390_shadow_fault(struct kvm_vcpu *vcpu, struct gmap *sg, + unsigned long saddr) +{ + union vaddress vaddr; + union page_table_entry pte; + unsigned long pgt; + int dat_protection, fake; + int rc; + + down_read(&sg->mm->mmap_sem); + /* + * We don't want any guest-2 tables to change - so the parent + * tables/pointers we read stay valid - unshadowing is however + * always possible - only guest_table_lock protects us. + */ + ipte_lock(vcpu); + + rc = gmap_shadow_pgt_lookup(sg, saddr, &pgt, &dat_protection, &fake); + if (rc) + rc = kvm_s390_shadow_tables(sg, saddr, &pgt, &dat_protection, + &fake); + + vaddr.addr = saddr; + if (fake) { + /* offset in 1MB guest memory block */ + pte.val = pgt + ((unsigned long) vaddr.px << 12UL); + goto shadow_page; + } + if (!rc) + rc = gmap_read_table(sg->parent, pgt + vaddr.px * 8, &pte.val); + if (!rc && pte.i) + rc = PGM_PAGE_TRANSLATION; + if (!rc && (pte.z || (pte.co && sg->edat_level < 1))) + rc = PGM_TRANSLATION_SPEC; +shadow_page: + pte.p |= dat_protection; + if (!rc) + rc = gmap_shadow_page(sg, saddr, __pte(pte.val)); + ipte_unlock(vcpu); + up_read(&sg->mm->mmap_sem); + return rc; +} diff --git a/arch/s390/kvm/gaccess.h b/arch/s390/kvm/gaccess.h index df0a79dd81595f43f815a8b671cb8dc6502e81e2..8756569ad9381fa0cddca0b0cddddec690968ac4 100644 --- a/arch/s390/kvm/gaccess.h +++ b/arch/s390/kvm/gaccess.h @@ -361,4 +361,7 @@ void ipte_unlock(struct kvm_vcpu *vcpu); int ipte_lock_held(struct kvm_vcpu *vcpu); int kvm_s390_check_low_addr_prot_real(struct kvm_vcpu *vcpu, unsigned long gra); +int kvm_s390_shadow_fault(struct kvm_vcpu *vcpu, struct gmap *shadow, + unsigned long saddr); + #endif /* __KVM_S390_GACCESS_H */ diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index d72c4a877622af43a37680e99d52052e6d8a6b6e..ca19627779db09756beeb18afe59ee60d8f64ac8 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -995,6 +995,11 @@ void kvm_s390_vcpu_wakeup(struct kvm_vcpu *vcpu) swake_up(&vcpu->wq); vcpu->stat.halt_wakeup++; } + /* + * The VCPU might not be sleeping but is executing the VSIE. Let's + * kick it, so it leaves the SIE to process the request. + */ + kvm_s390_vsie_kick(vcpu); } enum hrtimer_restart kvm_s390_idle_wakeup(struct hrtimer *timer) diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 0dcf9b8fc12c2e67e72307e80a1f3abd019dbfb0..03eeeb0ded2470a3d9ea0f6df82f6308266a3c83 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -98,6 +99,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { { "instruction_stfl", VCPU_STAT(instruction_stfl) }, { "instruction_tprot", VCPU_STAT(instruction_tprot) }, { "instruction_sthyi", VCPU_STAT(instruction_sthyi) }, + { "instruction_sie", VCPU_STAT(instruction_sie) }, { "instruction_sigp_sense", VCPU_STAT(instruction_sigp_sense) }, { "instruction_sigp_sense_running", VCPU_STAT(instruction_sigp_sense_running) }, { "instruction_sigp_external_call", VCPU_STAT(instruction_sigp_external_call) }, @@ -123,6 +125,11 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { { NULL } }; +/* allow nested virtualization in KVM (if enabled by user space) */ +static int nested; +module_param(nested, int, S_IRUGO); +MODULE_PARM_DESC(nested, "Nested virtualization support"); + /* upper facilities limit for kvm */ unsigned long kvm_s390_fac_list_mask[16] = { 0xffe6000000000000UL, @@ -141,6 +148,7 @@ static DECLARE_BITMAP(kvm_s390_available_cpu_feat, KVM_S390_VM_CPU_FEAT_NR_BITS) static struct kvm_s390_vm_cpu_subfunc kvm_s390_available_subfunc; static struct gmap_notifier gmap_notifier; +static struct gmap_notifier vsie_gmap_notifier; debug_info_t *kvm_s390_dbf; /* Section: not file related */ @@ -150,7 +158,8 @@ int kvm_arch_hardware_enable(void) return 0; } -static void kvm_gmap_notifier(struct gmap *gmap, unsigned long address); +static void kvm_gmap_notifier(struct gmap *gmap, unsigned long start, + unsigned long end); /* * This callback is executed during stop_machine(). All CPUs are therefore @@ -172,6 +181,8 @@ static int kvm_clock_sync(struct notifier_block *notifier, unsigned long val, vcpu->arch.sie_block->epoch -= *delta; if (vcpu->arch.cputm_enabled) vcpu->arch.cputm_start += *delta; + if (vcpu->arch.vsie_block) + vcpu->arch.vsie_block->epoch -= *delta; } } return NOTIFY_OK; @@ -184,7 +195,9 @@ static struct notifier_block kvm_clock_notifier = { int kvm_arch_hardware_setup(void) { gmap_notifier.notifier_call = kvm_gmap_notifier; - gmap_register_ipte_notifier(&gmap_notifier); + gmap_register_pte_notifier(&gmap_notifier); + vsie_gmap_notifier.notifier_call = kvm_s390_vsie_gmap_notifier; + gmap_register_pte_notifier(&vsie_gmap_notifier); atomic_notifier_chain_register(&s390_epoch_delta_notifier, &kvm_clock_notifier); return 0; @@ -192,7 +205,8 @@ int kvm_arch_hardware_setup(void) void kvm_arch_hardware_unsetup(void) { - gmap_unregister_ipte_notifier(&gmap_notifier); + gmap_unregister_pte_notifier(&gmap_notifier); + gmap_unregister_pte_notifier(&vsie_gmap_notifier); atomic_notifier_chain_unregister(&s390_epoch_delta_notifier, &kvm_clock_notifier); } @@ -250,6 +264,46 @@ static void kvm_s390_cpu_feat_init(void) if (MACHINE_HAS_ESOP) allow_cpu_feat(KVM_S390_VM_CPU_FEAT_ESOP); + /* + * We need SIE support, ESOP (PROT_READ protection for gmap_shadow), + * 64bit SCAO (SCA passthrough) and IDTE (for gmap_shadow unshadowing). + */ + if (!sclp.has_sief2 || !MACHINE_HAS_ESOP || !sclp.has_64bscao || + !test_facility(3) || !nested) + return; + allow_cpu_feat(KVM_S390_VM_CPU_FEAT_SIEF2); + if (sclp.has_64bscao) + allow_cpu_feat(KVM_S390_VM_CPU_FEAT_64BSCAO); + if (sclp.has_siif) + allow_cpu_feat(KVM_S390_VM_CPU_FEAT_SIIF); + if (sclp.has_gpere) + allow_cpu_feat(KVM_S390_VM_CPU_FEAT_GPERE); + if (sclp.has_gsls) + allow_cpu_feat(KVM_S390_VM_CPU_FEAT_GSLS); + if (sclp.has_ib) + allow_cpu_feat(KVM_S390_VM_CPU_FEAT_IB); + if (sclp.has_cei) + allow_cpu_feat(KVM_S390_VM_CPU_FEAT_CEI); + if (sclp.has_ibs) + allow_cpu_feat(KVM_S390_VM_CPU_FEAT_IBS); + /* + * KVM_S390_VM_CPU_FEAT_SKEY: Wrong shadow of PTE.I bits will make + * all skey handling functions read/set the skey from the PGSTE + * instead of the real storage key. + * + * KVM_S390_VM_CPU_FEAT_CMMA: Wrong shadow of PTE.I bits will make + * pages being detected as preserved although they are resident. + * + * KVM_S390_VM_CPU_FEAT_PFMFI: Wrong shadow of PTE.I bits will + * have the same effect as for KVM_S390_VM_CPU_FEAT_SKEY. + * + * For KVM_S390_VM_CPU_FEAT_SKEY, KVM_S390_VM_CPU_FEAT_CMMA and + * KVM_S390_VM_CPU_FEAT_PFMFI, all PTE.I and PGSTE bits have to be + * correctly shadowed. We can do that for the PGSTE but not for PTE.I. + * + * KVM_S390_VM_CPU_FEAT_SIGPIF: Wrong SCB addresses in the SCA. We + * cannot easily shadow the SCA because of the ipte lock. + */ } int kvm_arch_init(void *opaque) @@ -530,20 +584,20 @@ static int kvm_s390_set_mem_control(struct kvm *kvm, struct kvm_device_attr *att if (!new_limit) return -EINVAL; - /* gmap_alloc takes last usable address */ + /* gmap_create takes last usable address */ if (new_limit != KVM_S390_NO_MEM_LIMIT) new_limit -= 1; ret = -EBUSY; mutex_lock(&kvm->lock); if (!kvm->created_vcpus) { - /* gmap_alloc will round the limit up */ - struct gmap *new = gmap_alloc(current->mm, new_limit); + /* gmap_create will round the limit up */ + struct gmap *new = gmap_create(current->mm, new_limit); if (!new) { ret = -ENOMEM; } else { - gmap_free(kvm->arch.gmap); + gmap_remove(kvm->arch.gmap); new->private = kvm; kvm->arch.gmap = new; ret = 0; @@ -1392,7 +1446,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) else kvm->arch.mem_limit = min_t(unsigned long, TASK_MAX_SIZE, sclp.hamax + 1); - kvm->arch.gmap = gmap_alloc(current->mm, kvm->arch.mem_limit - 1); + kvm->arch.gmap = gmap_create(current->mm, kvm->arch.mem_limit - 1); if (!kvm->arch.gmap) goto out_err; kvm->arch.gmap->private = kvm; @@ -1404,6 +1458,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) kvm->arch.epoch = 0; spin_lock_init(&kvm->arch.start_stop_lock); + kvm_s390_vsie_init(kvm); KVM_EVENT(3, "vm 0x%pK created by pid %u", kvm, current->pid); return 0; @@ -1425,7 +1480,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) sca_del_vcpu(vcpu); if (kvm_is_ucontrol(vcpu->kvm)) - gmap_free(vcpu->arch.gmap); + gmap_remove(vcpu->arch.gmap); if (vcpu->kvm->arch.use_cmma) kvm_s390_vcpu_unsetup_cmma(vcpu); @@ -1458,16 +1513,17 @@ void kvm_arch_destroy_vm(struct kvm *kvm) debug_unregister(kvm->arch.dbf); free_page((unsigned long)kvm->arch.sie_page2); if (!kvm_is_ucontrol(kvm)) - gmap_free(kvm->arch.gmap); + gmap_remove(kvm->arch.gmap); kvm_s390_destroy_adapters(kvm); kvm_s390_clear_float_irqs(kvm); + kvm_s390_vsie_destroy(kvm); KVM_EVENT(3, "vm 0x%pK destroyed", kvm); } /* Section: vcpu related */ static int __kvm_ucontrol_vcpu_init(struct kvm_vcpu *vcpu) { - vcpu->arch.gmap = gmap_alloc(current->mm, -1UL); + vcpu->arch.gmap = gmap_create(current->mm, -1UL); if (!vcpu->arch.gmap) return -ENOMEM; vcpu->arch.gmap->private = vcpu->kvm; @@ -1717,7 +1773,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) save_access_regs(vcpu->arch.host_acrs); restore_access_regs(vcpu->run->s.regs.acrs); - gmap_enable(vcpu->arch.gmap); + gmap_enable(vcpu->arch.enabled_gmap); atomic_or(CPUSTAT_RUNNING, &vcpu->arch.sie_block->cpuflags); if (vcpu->arch.cputm_enabled && !is_vcpu_idle(vcpu)) __start_cpu_timer_accounting(vcpu); @@ -1730,7 +1786,8 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) if (vcpu->arch.cputm_enabled && !is_vcpu_idle(vcpu)) __stop_cpu_timer_accounting(vcpu); atomic_andnot(CPUSTAT_RUNNING, &vcpu->arch.sie_block->cpuflags); - gmap_disable(vcpu->arch.gmap); + vcpu->arch.enabled_gmap = gmap_get_enabled(); + gmap_disable(vcpu->arch.enabled_gmap); /* Save guest register state */ save_fpu_regs(); @@ -1779,7 +1836,8 @@ void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) vcpu->arch.gmap = vcpu->kvm->arch.gmap; sca_add_vcpu(vcpu); } - + /* make vcpu_load load the right gmap on the first trigger */ + vcpu->arch.enabled_gmap = vcpu->arch.gmap; } static void kvm_s390_vcpu_crypto_setup(struct kvm_vcpu *vcpu) @@ -1976,16 +2034,25 @@ void kvm_s390_sync_request(int req, struct kvm_vcpu *vcpu) kvm_s390_vcpu_request(vcpu); } -static void kvm_gmap_notifier(struct gmap *gmap, unsigned long address) +static void kvm_gmap_notifier(struct gmap *gmap, unsigned long start, + unsigned long end) { - int i; struct kvm *kvm = gmap->private; struct kvm_vcpu *vcpu; + unsigned long prefix; + int i; + if (gmap_is_shadow(gmap)) + return; + if (start >= 1UL << 31) + /* We are only interested in prefix pages */ + return; kvm_for_each_vcpu(i, vcpu, kvm) { /* match against both prefix pages */ - if (kvm_s390_get_prefix(vcpu) == (address & ~0x1000UL)) { - VCPU_EVENT(vcpu, 2, "gmap notifier for %lx", address); + prefix = kvm_s390_get_prefix(vcpu); + if (prefix <= end && start <= prefix + 2*PAGE_SIZE - 1) { + VCPU_EVENT(vcpu, 2, "gmap notifier for %lx-%lx", + start, end); kvm_s390_sync_request(KVM_REQ_MMU_RELOAD, vcpu); } } @@ -2264,16 +2331,16 @@ static int kvm_s390_handle_requests(struct kvm_vcpu *vcpu) return 0; /* * We use MMU_RELOAD just to re-arm the ipte notifier for the - * guest prefix page. gmap_ipte_notify will wait on the ptl lock. + * guest prefix page. gmap_mprotect_notify will wait on the ptl lock. * This ensures that the ipte instruction for this request has * already finished. We might race against a second unmapper that * wants to set the blocking bit. Lets just retry the request loop. */ if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu)) { int rc; - rc = gmap_ipte_notify(vcpu->arch.gmap, - kvm_s390_get_prefix(vcpu), - PAGE_SIZE * 2); + rc = gmap_mprotect_notify(vcpu->arch.gmap, + kvm_s390_get_prefix(vcpu), + PAGE_SIZE * 2, PROT_WRITE); if (rc) return rc; goto retry; diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h index 52aa47e112d801e7aee840578dd10c18bbea1981..031f451bb2cf79a2550b14eb6bd2701a0e29d765 100644 --- a/arch/s390/kvm/kvm-s390.h +++ b/arch/s390/kvm/kvm-s390.h @@ -56,7 +56,7 @@ static inline int is_vcpu_stopped(struct kvm_vcpu *vcpu) static inline int is_vcpu_idle(struct kvm_vcpu *vcpu) { - return atomic_read(&vcpu->arch.sie_block->cpuflags) & CPUSTAT_WAIT; + return test_bit(vcpu->vcpu_id, vcpu->arch.local_int.float_int->idle_mask); } static inline int kvm_is_ucontrol(struct kvm *kvm) @@ -252,6 +252,14 @@ int kvm_s390_handle_stctl(struct kvm_vcpu *vcpu); int kvm_s390_handle_lctl(struct kvm_vcpu *vcpu); int kvm_s390_handle_eb(struct kvm_vcpu *vcpu); +/* implemented in vsie.c */ +int kvm_s390_handle_vsie(struct kvm_vcpu *vcpu); +void kvm_s390_vsie_kick(struct kvm_vcpu *vcpu); +void kvm_s390_vsie_gmap_notifier(struct gmap *gmap, unsigned long start, + unsigned long end); +void kvm_s390_vsie_init(struct kvm *kvm); +void kvm_s390_vsie_destroy(struct kvm *kvm); + /* implemented in sigp.c */ int kvm_s390_handle_sigp(struct kvm_vcpu *vcpu); int kvm_s390_handle_sigp_pei(struct kvm_vcpu *vcpu); diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c index 3db3be1399929922fa559fba32702ece198e9c94..c77ad2dc334ff7f3b2ca05b104df41d84edf0a6b 100644 --- a/arch/s390/kvm/priv.c +++ b/arch/s390/kvm/priv.c @@ -719,6 +719,7 @@ static const intercept_handler_t b2_handlers[256] = { [0x10] = handle_set_prefix, [0x11] = handle_store_prefix, [0x12] = handle_store_cpu_address, + [0x14] = kvm_s390_handle_vsie, [0x21] = handle_ipte_interlock, [0x29] = handle_iske, [0x2a] = handle_rrbe, diff --git a/arch/s390/kvm/sigp.c b/arch/s390/kvm/sigp.c index 28ea0cab1f1b50c3f3b12ce63055f4c30ebc5293..1a252f5370818e5febbe5d05f96432be93f33488 100644 --- a/arch/s390/kvm/sigp.c +++ b/arch/s390/kvm/sigp.c @@ -77,18 +77,18 @@ static int __sigp_conditional_emergency(struct kvm_vcpu *vcpu, const u64 psw_int_mask = PSW_MASK_IO | PSW_MASK_EXT; u16 p_asn, s_asn; psw_t *psw; - u32 flags; + bool idle; - flags = atomic_read(&dst_vcpu->arch.sie_block->cpuflags); + idle = is_vcpu_idle(vcpu); psw = &dst_vcpu->arch.sie_block->gpsw; p_asn = dst_vcpu->arch.sie_block->gcr[4] & 0xffff; /* Primary ASN */ s_asn = dst_vcpu->arch.sie_block->gcr[3] & 0xffff; /* Secondary ASN */ /* Inject the emergency signal? */ - if (!(flags & CPUSTAT_STOPPED) + if (!is_vcpu_stopped(vcpu) || (psw->mask & psw_int_mask) != psw_int_mask - || ((flags & CPUSTAT_WAIT) && psw->addr != 0) - || (!(flags & CPUSTAT_WAIT) && (asn == p_asn || asn == s_asn))) { + || (idle && psw->addr != 0) + || (!idle && (asn == p_asn || asn == s_asn))) { return __inject_sigp_emergency(vcpu, dst_vcpu); } else { *reg &= 0xffffffff00000000UL; diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c new file mode 100644 index 0000000000000000000000000000000000000000..6895e7b3be123767d4829072ff3ba39b53350cac --- /dev/null +++ b/arch/s390/kvm/vsie.c @@ -0,0 +1,1091 @@ +/* + * kvm nested virtualization support for s390x + * + * Copyright IBM Corp. 2016 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License (version 2 only) + * as published by the Free Software Foundation. + * + * Author(s): David Hildenbrand + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "kvm-s390.h" +#include "gaccess.h" + +struct vsie_page { + struct kvm_s390_sie_block scb_s; /* 0x0000 */ + /* the pinned originial scb */ + struct kvm_s390_sie_block *scb_o; /* 0x0200 */ + /* the shadow gmap in use by the vsie_page */ + struct gmap *gmap; /* 0x0208 */ + /* address of the last reported fault to guest2 */ + unsigned long fault_addr; /* 0x0210 */ + __u8 reserved[0x0700 - 0x0218]; /* 0x0218 */ + struct kvm_s390_crypto_cb crycb; /* 0x0700 */ + __u8 fac[S390_ARCH_FAC_LIST_SIZE_BYTE]; /* 0x0800 */ +} __packed; + +/* trigger a validity icpt for the given scb */ +static int set_validity_icpt(struct kvm_s390_sie_block *scb, + __u16 reason_code) +{ + scb->ipa = 0x1000; + scb->ipb = ((__u32) reason_code) << 16; + scb->icptcode = ICPT_VALIDITY; + return 1; +} + +/* mark the prefix as unmapped, this will block the VSIE */ +static void prefix_unmapped(struct vsie_page *vsie_page) +{ + atomic_or(PROG_REQUEST, &vsie_page->scb_s.prog20); +} + +/* mark the prefix as unmapped and wait until the VSIE has been left */ +static void prefix_unmapped_sync(struct vsie_page *vsie_page) +{ + prefix_unmapped(vsie_page); + if (vsie_page->scb_s.prog0c & PROG_IN_SIE) + atomic_or(CPUSTAT_STOP_INT, &vsie_page->scb_s.cpuflags); + while (vsie_page->scb_s.prog0c & PROG_IN_SIE) + cpu_relax(); +} + +/* mark the prefix as mapped, this will allow the VSIE to run */ +static void prefix_mapped(struct vsie_page *vsie_page) +{ + atomic_andnot(PROG_REQUEST, &vsie_page->scb_s.prog20); +} + +/* test if the prefix is mapped into the gmap shadow */ +static int prefix_is_mapped(struct vsie_page *vsie_page) +{ + return !(atomic_read(&vsie_page->scb_s.prog20) & PROG_REQUEST); +} + +/* copy the updated intervention request bits into the shadow scb */ +static void update_intervention_requests(struct vsie_page *vsie_page) +{ + const int bits = CPUSTAT_STOP_INT | CPUSTAT_IO_INT | CPUSTAT_EXT_INT; + int cpuflags; + + cpuflags = atomic_read(&vsie_page->scb_o->cpuflags); + atomic_andnot(bits, &vsie_page->scb_s.cpuflags); + atomic_or(cpuflags & bits, &vsie_page->scb_s.cpuflags); +} + +/* shadow (filter and validate) the cpuflags */ +static int prepare_cpuflags(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) +{ + struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s; + struct kvm_s390_sie_block *scb_o = vsie_page->scb_o; + int newflags, cpuflags = atomic_read(&scb_o->cpuflags); + + /* we don't allow ESA/390 guests */ + if (!(cpuflags & CPUSTAT_ZARCH)) + return set_validity_icpt(scb_s, 0x0001U); + + if (cpuflags & (CPUSTAT_RRF | CPUSTAT_MCDS)) + return set_validity_icpt(scb_s, 0x0001U); + else if (cpuflags & (CPUSTAT_SLSV | CPUSTAT_SLSR)) + return set_validity_icpt(scb_s, 0x0007U); + + /* intervention requests will be set later */ + newflags = CPUSTAT_ZARCH; + if (cpuflags & CPUSTAT_GED && test_kvm_facility(vcpu->kvm, 8)) + newflags |= CPUSTAT_GED; + if (cpuflags & CPUSTAT_GED2 && test_kvm_facility(vcpu->kvm, 78)) { + if (cpuflags & CPUSTAT_GED) + return set_validity_icpt(scb_s, 0x0001U); + newflags |= CPUSTAT_GED2; + } + if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_GPERE)) + newflags |= cpuflags & CPUSTAT_P; + if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_GSLS)) + newflags |= cpuflags & CPUSTAT_SM; + if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_IBS)) + newflags |= cpuflags & CPUSTAT_IBS; + + atomic_set(&scb_s->cpuflags, newflags); + return 0; +} + +/* + * Create a shadow copy of the crycb block and setup key wrapping, if + * requested for guest 3 and enabled for guest 2. + * + * We only accept format-1 (no AP in g2), but convert it into format-2 + * There is nothing to do for format-0. + * + * Returns: - 0 if shadowed or nothing to do + * - > 0 if control has to be given to guest 2 + */ +static int shadow_crycb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) +{ + struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s; + struct kvm_s390_sie_block *scb_o = vsie_page->scb_o; + u32 crycb_addr = scb_o->crycbd & 0x7ffffff8U; + unsigned long *b1, *b2; + u8 ecb3_flags; + + scb_s->crycbd = 0; + if (!(scb_o->crycbd & vcpu->arch.sie_block->crycbd & CRYCB_FORMAT1)) + return 0; + /* format-1 is supported with message-security-assist extension 3 */ + if (!test_kvm_facility(vcpu->kvm, 76)) + return 0; + /* we may only allow it if enabled for guest 2 */ + ecb3_flags = scb_o->ecb3 & vcpu->arch.sie_block->ecb3 & + (ECB3_AES | ECB3_DEA); + if (!ecb3_flags) + return 0; + + if ((crycb_addr & PAGE_MASK) != ((crycb_addr + 128) & PAGE_MASK)) + return set_validity_icpt(scb_s, 0x003CU); + else if (!crycb_addr) + return set_validity_icpt(scb_s, 0x0039U); + + /* copy only the wrapping keys */ + if (read_guest_real(vcpu, crycb_addr + 72, &vsie_page->crycb, 56)) + return set_validity_icpt(scb_s, 0x0035U); + + scb_s->ecb3 |= ecb3_flags; + scb_s->crycbd = ((__u32)(__u64) &vsie_page->crycb) | CRYCB_FORMAT1 | + CRYCB_FORMAT2; + + /* xor both blocks in one run */ + b1 = (unsigned long *) vsie_page->crycb.dea_wrapping_key_mask; + b2 = (unsigned long *) + vcpu->kvm->arch.crypto.crycb->dea_wrapping_key_mask; + /* as 56%8 == 0, bitmap_xor won't overwrite any data */ + bitmap_xor(b1, b1, b2, BITS_PER_BYTE * 56); + return 0; +} + +/* shadow (round up/down) the ibc to avoid validity icpt */ +static void prepare_ibc(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) +{ + struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s; + struct kvm_s390_sie_block *scb_o = vsie_page->scb_o; + __u64 min_ibc = (sclp.ibc >> 16) & 0x0fffU; + + scb_s->ibc = 0; + /* ibc installed in g2 and requested for g3 */ + if (vcpu->kvm->arch.model.ibc && (scb_o->ibc & 0x0fffU)) { + scb_s->ibc = scb_o->ibc & 0x0fffU; + /* takte care of the minimum ibc level of the machine */ + if (scb_s->ibc < min_ibc) + scb_s->ibc = min_ibc; + /* take care of the maximum ibc level set for the guest */ + if (scb_s->ibc > vcpu->kvm->arch.model.ibc) + scb_s->ibc = vcpu->kvm->arch.model.ibc; + } +} + +/* unshadow the scb, copying parameters back to the real scb */ +static void unshadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) +{ + struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s; + struct kvm_s390_sie_block *scb_o = vsie_page->scb_o; + + /* interception */ + scb_o->icptcode = scb_s->icptcode; + scb_o->icptstatus = scb_s->icptstatus; + scb_o->ipa = scb_s->ipa; + scb_o->ipb = scb_s->ipb; + scb_o->gbea = scb_s->gbea; + + /* timer */ + scb_o->cputm = scb_s->cputm; + scb_o->ckc = scb_s->ckc; + scb_o->todpr = scb_s->todpr; + + /* guest state */ + scb_o->gpsw = scb_s->gpsw; + scb_o->gg14 = scb_s->gg14; + scb_o->gg15 = scb_s->gg15; + memcpy(scb_o->gcr, scb_s->gcr, 128); + scb_o->pp = scb_s->pp; + + /* interrupt intercept */ + switch (scb_s->icptcode) { + case ICPT_PROGI: + case ICPT_INSTPROGI: + case ICPT_EXTINT: + memcpy((void *)((u64)scb_o + 0xc0), + (void *)((u64)scb_s + 0xc0), 0xf0 - 0xc0); + break; + case ICPT_PARTEXEC: + /* MVPG only */ + memcpy((void *)((u64)scb_o + 0xc0), + (void *)((u64)scb_s + 0xc0), 0xd0 - 0xc0); + break; + } + + if (scb_s->ihcpu != 0xffffU) + scb_o->ihcpu = scb_s->ihcpu; +} + +/* + * Setup the shadow scb by copying and checking the relevant parts of the g2 + * provided scb. + * + * Returns: - 0 if the scb has been shadowed + * - > 0 if control has to be given to guest 2 + */ +static int shadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) +{ + struct kvm_s390_sie_block *scb_o = vsie_page->scb_o; + struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s; + bool had_tx = scb_s->ecb & 0x10U; + unsigned long new_mso = 0; + int rc; + + /* make sure we don't have any leftovers when reusing the scb */ + scb_s->icptcode = 0; + scb_s->eca = 0; + scb_s->ecb = 0; + scb_s->ecb2 = 0; + scb_s->ecb3 = 0; + scb_s->ecd = 0; + scb_s->fac = 0; + + rc = prepare_cpuflags(vcpu, vsie_page); + if (rc) + goto out; + + /* timer */ + scb_s->cputm = scb_o->cputm; + scb_s->ckc = scb_o->ckc; + scb_s->todpr = scb_o->todpr; + scb_s->epoch = scb_o->epoch; + + /* guest state */ + scb_s->gpsw = scb_o->gpsw; + scb_s->gg14 = scb_o->gg14; + scb_s->gg15 = scb_o->gg15; + memcpy(scb_s->gcr, scb_o->gcr, 128); + scb_s->pp = scb_o->pp; + + /* interception / execution handling */ + scb_s->gbea = scb_o->gbea; + scb_s->lctl = scb_o->lctl; + scb_s->svcc = scb_o->svcc; + scb_s->ictl = scb_o->ictl; + /* + * SKEY handling functions can't deal with false setting of PTE invalid + * bits. Therefore we cannot provide interpretation and would later + * have to provide own emulation handlers. + */ + scb_s->ictl |= ICTL_ISKE | ICTL_SSKE | ICTL_RRBE; + scb_s->icpua = scb_o->icpua; + + if (!(atomic_read(&scb_s->cpuflags) & CPUSTAT_SM)) + new_mso = scb_o->mso & 0xfffffffffff00000UL; + /* if the hva of the prefix changes, we have to remap the prefix */ + if (scb_s->mso != new_mso || scb_s->prefix != scb_o->prefix) + prefix_unmapped(vsie_page); + /* SIE will do mso/msl validity and exception checks for us */ + scb_s->msl = scb_o->msl & 0xfffffffffff00000UL; + scb_s->mso = new_mso; + scb_s->prefix = scb_o->prefix; + + /* We have to definetly flush the tlb if this scb never ran */ + if (scb_s->ihcpu != 0xffffU) + scb_s->ihcpu = scb_o->ihcpu; + + /* MVPG and Protection Exception Interpretation are always available */ + scb_s->eca |= scb_o->eca & 0x01002000U; + /* Host-protection-interruption introduced with ESOP */ + if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_ESOP)) + scb_s->ecb |= scb_o->ecb & 0x02U; + /* transactional execution */ + if (test_kvm_facility(vcpu->kvm, 73)) { + /* remap the prefix is tx is toggled on */ + if ((scb_o->ecb & 0x10U) && !had_tx) + prefix_unmapped(vsie_page); + scb_s->ecb |= scb_o->ecb & 0x10U; + } + /* SIMD */ + if (test_kvm_facility(vcpu->kvm, 129)) { + scb_s->eca |= scb_o->eca & 0x00020000U; + scb_s->ecd |= scb_o->ecd & 0x20000000U; + } + /* Run-time-Instrumentation */ + if (test_kvm_facility(vcpu->kvm, 64)) + scb_s->ecb3 |= scb_o->ecb3 & 0x01U; + if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_SIIF)) + scb_s->eca |= scb_o->eca & 0x00000001U; + if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_IB)) + scb_s->eca |= scb_o->eca & 0x40000000U; + if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_CEI)) + scb_s->eca |= scb_o->eca & 0x80000000U; + + prepare_ibc(vcpu, vsie_page); + rc = shadow_crycb(vcpu, vsie_page); +out: + if (rc) + unshadow_scb(vcpu, vsie_page); + return rc; +} + +void kvm_s390_vsie_gmap_notifier(struct gmap *gmap, unsigned long start, + unsigned long end) +{ + struct kvm *kvm = gmap->private; + struct vsie_page *cur; + unsigned long prefix; + struct page *page; + int i; + + if (!gmap_is_shadow(gmap)) + return; + if (start >= 1UL << 31) + /* We are only interested in prefix pages */ + return; + + /* + * Only new shadow blocks are added to the list during runtime, + * therefore we can safely reference them all the time. + */ + for (i = 0; i < kvm->arch.vsie.page_count; i++) { + page = READ_ONCE(kvm->arch.vsie.pages[i]); + if (!page) + continue; + cur = page_to_virt(page); + if (READ_ONCE(cur->gmap) != gmap) + continue; + prefix = cur->scb_s.prefix << GUEST_PREFIX_SHIFT; + /* with mso/msl, the prefix lies at an offset */ + prefix += cur->scb_s.mso; + if (prefix <= end && start <= prefix + 2 * PAGE_SIZE - 1) + prefix_unmapped_sync(cur); + } +} + +/* + * Map the first prefix page and if tx is enabled also the second prefix page. + * + * The prefix will be protected, a gmap notifier will inform about unmaps. + * The shadow scb must not be executed until the prefix is remapped, this is + * guaranteed by properly handling PROG_REQUEST. + * + * Returns: - 0 on if successfully mapped or already mapped + * - > 0 if control has to be given to guest 2 + * - -EAGAIN if the caller can retry immediately + * - -ENOMEM if out of memory + */ +static int map_prefix(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) +{ + struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s; + u64 prefix = scb_s->prefix << GUEST_PREFIX_SHIFT; + int rc; + + if (prefix_is_mapped(vsie_page)) + return 0; + + /* mark it as mapped so we can catch any concurrent unmappers */ + prefix_mapped(vsie_page); + + /* with mso/msl, the prefix lies at offset *mso* */ + prefix += scb_s->mso; + + rc = kvm_s390_shadow_fault(vcpu, vsie_page->gmap, prefix); + if (!rc && (scb_s->ecb & 0x10U)) + rc = kvm_s390_shadow_fault(vcpu, vsie_page->gmap, + prefix + PAGE_SIZE); + /* + * We don't have to mprotect, we will be called for all unshadows. + * SIE will detect if protection applies and trigger a validity. + */ + if (rc) + prefix_unmapped(vsie_page); + if (rc > 0 || rc == -EFAULT) + rc = set_validity_icpt(scb_s, 0x0037U); + return rc; +} + +/* + * Pin the guest page given by gpa and set hpa to the pinned host address. + * Will always be pinned writable. + * + * Returns: - 0 on success + * - -EINVAL if the gpa is not valid guest storage + * - -ENOMEM if out of memory + */ +static int pin_guest_page(struct kvm *kvm, gpa_t gpa, hpa_t *hpa) +{ + struct page *page; + hva_t hva; + int rc; + + hva = gfn_to_hva(kvm, gpa_to_gfn(gpa)); + if (kvm_is_error_hva(hva)) + return -EINVAL; + rc = get_user_pages_fast(hva, 1, 1, &page); + if (rc < 0) + return rc; + else if (rc != 1) + return -ENOMEM; + *hpa = (hpa_t) page_to_virt(page) + (gpa & ~PAGE_MASK); + return 0; +} + +/* Unpins a page previously pinned via pin_guest_page, marking it as dirty. */ +static void unpin_guest_page(struct kvm *kvm, gpa_t gpa, hpa_t hpa) +{ + struct page *page; + + page = virt_to_page(hpa); + set_page_dirty_lock(page); + put_page(page); + /* mark the page always as dirty for migration */ + mark_page_dirty(kvm, gpa_to_gfn(gpa)); +} + +/* unpin all blocks previously pinned by pin_blocks(), marking them dirty */ +static void unpin_blocks(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) +{ + struct kvm_s390_sie_block *scb_o = vsie_page->scb_o; + struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s; + hpa_t hpa; + gpa_t gpa; + + hpa = (u64) scb_s->scaoh << 32 | scb_s->scaol; + if (hpa) { + gpa = scb_o->scaol & ~0xfUL; + if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_64BSCAO)) + gpa |= (u64) scb_o->scaoh << 32; + unpin_guest_page(vcpu->kvm, gpa, hpa); + scb_s->scaol = 0; + scb_s->scaoh = 0; + } + + hpa = scb_s->itdba; + if (hpa) { + gpa = scb_o->itdba & ~0xffUL; + unpin_guest_page(vcpu->kvm, gpa, hpa); + scb_s->itdba = 0; + } + + hpa = scb_s->gvrd; + if (hpa) { + gpa = scb_o->gvrd & ~0x1ffUL; + unpin_guest_page(vcpu->kvm, gpa, hpa); + scb_s->gvrd = 0; + } + + hpa = scb_s->riccbd; + if (hpa) { + gpa = scb_o->riccbd & ~0x3fUL; + unpin_guest_page(vcpu->kvm, gpa, hpa); + scb_s->riccbd = 0; + } +} + +/* + * Instead of shadowing some blocks, we can simply forward them because the + * addresses in the scb are 64 bit long. + * + * This works as long as the data lies in one page. If blocks ever exceed one + * page, we have to fall back to shadowing. + * + * As we reuse the sca, the vcpu pointers contained in it are invalid. We must + * therefore not enable any facilities that access these pointers (e.g. SIGPIF). + * + * Returns: - 0 if all blocks were pinned. + * - > 0 if control has to be given to guest 2 + * - -ENOMEM if out of memory + */ +static int pin_blocks(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) +{ + struct kvm_s390_sie_block *scb_o = vsie_page->scb_o; + struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s; + hpa_t hpa; + gpa_t gpa; + int rc = 0; + + gpa = scb_o->scaol & ~0xfUL; + if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_64BSCAO)) + gpa |= (u64) scb_o->scaoh << 32; + if (gpa) { + if (!(gpa & ~0x1fffUL)) + rc = set_validity_icpt(scb_s, 0x0038U); + else if ((gpa & ~0x1fffUL) == kvm_s390_get_prefix(vcpu)) + rc = set_validity_icpt(scb_s, 0x0011U); + else if ((gpa & PAGE_MASK) != + ((gpa + sizeof(struct bsca_block) - 1) & PAGE_MASK)) + rc = set_validity_icpt(scb_s, 0x003bU); + if (!rc) { + rc = pin_guest_page(vcpu->kvm, gpa, &hpa); + if (rc == -EINVAL) + rc = set_validity_icpt(scb_s, 0x0034U); + } + if (rc) + goto unpin; + scb_s->scaoh = (u32)((u64)hpa >> 32); + scb_s->scaol = (u32)(u64)hpa; + } + + gpa = scb_o->itdba & ~0xffUL; + if (gpa && (scb_s->ecb & 0x10U)) { + if (!(gpa & ~0x1fffU)) { + rc = set_validity_icpt(scb_s, 0x0080U); + goto unpin; + } + /* 256 bytes cannot cross page boundaries */ + rc = pin_guest_page(vcpu->kvm, gpa, &hpa); + if (rc == -EINVAL) + rc = set_validity_icpt(scb_s, 0x0080U); + if (rc) + goto unpin; + scb_s->itdba = hpa; + } + + gpa = scb_o->gvrd & ~0x1ffUL; + if (gpa && (scb_s->eca & 0x00020000U) && + !(scb_s->ecd & 0x20000000U)) { + if (!(gpa & ~0x1fffUL)) { + rc = set_validity_icpt(scb_s, 0x1310U); + goto unpin; + } + /* + * 512 bytes vector registers cannot cross page boundaries + * if this block gets bigger, we have to shadow it. + */ + rc = pin_guest_page(vcpu->kvm, gpa, &hpa); + if (rc == -EINVAL) + rc = set_validity_icpt(scb_s, 0x1310U); + if (rc) + goto unpin; + scb_s->gvrd = hpa; + } + + gpa = scb_o->riccbd & ~0x3fUL; + if (gpa && (scb_s->ecb3 & 0x01U)) { + if (!(gpa & ~0x1fffUL)) { + rc = set_validity_icpt(scb_s, 0x0043U); + goto unpin; + } + /* 64 bytes cannot cross page boundaries */ + rc = pin_guest_page(vcpu->kvm, gpa, &hpa); + if (rc == -EINVAL) + rc = set_validity_icpt(scb_s, 0x0043U); + /* Validity 0x0044 will be checked by SIE */ + if (rc) + goto unpin; + scb_s->gvrd = hpa; + } + return 0; +unpin: + unpin_blocks(vcpu, vsie_page); + return rc; +} + +/* unpin the scb provided by guest 2, marking it as dirty */ +static void unpin_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page, + gpa_t gpa) +{ + hpa_t hpa = (hpa_t) vsie_page->scb_o; + + if (hpa) + unpin_guest_page(vcpu->kvm, gpa, hpa); + vsie_page->scb_o = NULL; +} + +/* + * Pin the scb at gpa provided by guest 2 at vsie_page->scb_o. + * + * Returns: - 0 if the scb was pinned. + * - > 0 if control has to be given to guest 2 + * - -ENOMEM if out of memory + */ +static int pin_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page, + gpa_t gpa) +{ + hpa_t hpa; + int rc; + + rc = pin_guest_page(vcpu->kvm, gpa, &hpa); + if (rc == -EINVAL) { + rc = kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); + if (!rc) + rc = 1; + } + if (!rc) + vsie_page->scb_o = (struct kvm_s390_sie_block *) hpa; + return rc; +} + +/* + * Inject a fault into guest 2. + * + * Returns: - > 0 if control has to be given to guest 2 + * < 0 if an error occurred during injection. + */ +static int inject_fault(struct kvm_vcpu *vcpu, __u16 code, __u64 vaddr, + bool write_flag) +{ + struct kvm_s390_pgm_info pgm = { + .code = code, + .trans_exc_code = + /* 0-51: virtual address */ + (vaddr & 0xfffffffffffff000UL) | + /* 52-53: store / fetch */ + (((unsigned int) !write_flag) + 1) << 10, + /* 62-63: asce id (alway primary == 0) */ + .exc_access_id = 0, /* always primary */ + .op_access_id = 0, /* not MVPG */ + }; + int rc; + + if (code == PGM_PROTECTION) + pgm.trans_exc_code |= 0x4UL; + + rc = kvm_s390_inject_prog_irq(vcpu, &pgm); + return rc ? rc : 1; +} + +/* + * Handle a fault during vsie execution on a gmap shadow. + * + * Returns: - 0 if the fault was resolved + * - > 0 if control has to be given to guest 2 + * - < 0 if an error occurred + */ +static int handle_fault(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) +{ + int rc; + + if (current->thread.gmap_int_code == PGM_PROTECTION) + /* we can directly forward all protection exceptions */ + return inject_fault(vcpu, PGM_PROTECTION, + current->thread.gmap_addr, 1); + + rc = kvm_s390_shadow_fault(vcpu, vsie_page->gmap, + current->thread.gmap_addr); + if (rc > 0) { + rc = inject_fault(vcpu, rc, + current->thread.gmap_addr, + current->thread.gmap_write_flag); + if (rc >= 0) + vsie_page->fault_addr = current->thread.gmap_addr; + } + return rc; +} + +/* + * Retry the previous fault that required guest 2 intervention. This avoids + * one superfluous SIE re-entry and direct exit. + * + * Will ignore any errors. The next SIE fault will do proper fault handling. + */ +static void handle_last_fault(struct kvm_vcpu *vcpu, + struct vsie_page *vsie_page) +{ + if (vsie_page->fault_addr) + kvm_s390_shadow_fault(vcpu, vsie_page->gmap, + vsie_page->fault_addr); + vsie_page->fault_addr = 0; +} + +static inline void clear_vsie_icpt(struct vsie_page *vsie_page) +{ + vsie_page->scb_s.icptcode = 0; +} + +/* rewind the psw and clear the vsie icpt, so we can retry execution */ +static void retry_vsie_icpt(struct vsie_page *vsie_page) +{ + struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s; + int ilen = insn_length(scb_s->ipa >> 8); + + /* take care of EXECUTE instructions */ + if (scb_s->icptstatus & 1) { + ilen = (scb_s->icptstatus >> 4) & 0x6; + if (!ilen) + ilen = 4; + } + scb_s->gpsw.addr = __rewind_psw(scb_s->gpsw, ilen); + clear_vsie_icpt(vsie_page); +} + +/* + * Try to shadow + enable the guest 2 provided facility list. + * Retry instruction execution if enabled for and provided by guest 2. + * + * Returns: - 0 if handled (retry or guest 2 icpt) + * - > 0 if control has to be given to guest 2 + */ +static int handle_stfle(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) +{ + struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s; + __u32 fac = vsie_page->scb_o->fac & 0x7ffffff8U; + + if (fac && test_kvm_facility(vcpu->kvm, 7)) { + retry_vsie_icpt(vsie_page); + if (read_guest_real(vcpu, fac, &vsie_page->fac, + sizeof(vsie_page->fac))) + return set_validity_icpt(scb_s, 0x1090U); + scb_s->fac = (__u32)(__u64) &vsie_page->fac; + } + return 0; +} + +/* + * Run the vsie on a shadow scb and a shadow gmap, without any further + * sanity checks, handling SIE faults. + * + * Returns: - 0 everything went fine + * - > 0 if control has to be given to guest 2 + * - < 0 if an error occurred + */ +static int do_vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) +{ + struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s; + struct kvm_s390_sie_block *scb_o = vsie_page->scb_o; + int rc; + + handle_last_fault(vcpu, vsie_page); + + if (need_resched()) + schedule(); + if (test_cpu_flag(CIF_MCCK_PENDING)) + s390_handle_mcck(); + + srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); + local_irq_disable(); + kvm_guest_enter(); + local_irq_enable(); + + rc = sie64a(scb_s, vcpu->run->s.regs.gprs); + + local_irq_disable(); + kvm_guest_exit(); + local_irq_enable(); + vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); + + if (rc > 0) + rc = 0; /* we could still have an icpt */ + else if (rc == -EFAULT) + return handle_fault(vcpu, vsie_page); + + switch (scb_s->icptcode) { + case ICPT_INST: + if (scb_s->ipa == 0xb2b0) + rc = handle_stfle(vcpu, vsie_page); + break; + case ICPT_STOP: + /* stop not requested by g2 - must have been a kick */ + if (!(atomic_read(&scb_o->cpuflags) & CPUSTAT_STOP_INT)) + clear_vsie_icpt(vsie_page); + break; + case ICPT_VALIDITY: + if ((scb_s->ipa & 0xf000) != 0xf000) + scb_s->ipa += 0x1000; + break; + } + return rc; +} + +static void release_gmap_shadow(struct vsie_page *vsie_page) +{ + if (vsie_page->gmap) + gmap_put(vsie_page->gmap); + WRITE_ONCE(vsie_page->gmap, NULL); + prefix_unmapped(vsie_page); +} + +static int acquire_gmap_shadow(struct kvm_vcpu *vcpu, + struct vsie_page *vsie_page) +{ + unsigned long asce; + union ctlreg0 cr0; + struct gmap *gmap; + int edat; + + asce = vcpu->arch.sie_block->gcr[1]; + cr0.val = vcpu->arch.sie_block->gcr[0]; + edat = cr0.edat && test_kvm_facility(vcpu->kvm, 8); + edat += edat && test_kvm_facility(vcpu->kvm, 78); + + /* + * ASCE or EDAT could have changed since last icpt, or the gmap + * we're holding has been unshadowed. If the gmap is still valid, + * we can safely reuse it. + */ + if (vsie_page->gmap && gmap_shadow_valid(vsie_page->gmap, asce, edat)) + return 0; + + /* release the old shadow - if any, and mark the prefix as unmapped */ + release_gmap_shadow(vsie_page); + gmap = gmap_shadow(vcpu->arch.gmap, asce, edat); + if (IS_ERR(gmap)) + return PTR_ERR(gmap); + gmap->private = vcpu->kvm; + WRITE_ONCE(vsie_page->gmap, gmap); + return 0; +} + +/* + * Register the shadow scb at the VCPU, e.g. for kicking out of vsie. + */ +static void register_shadow_scb(struct kvm_vcpu *vcpu, + struct vsie_page *vsie_page) +{ + struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s; + + WRITE_ONCE(vcpu->arch.vsie_block, &vsie_page->scb_s); + /* + * External calls have to lead to a kick of the vcpu and + * therefore the vsie -> Simulate Wait state. + */ + atomic_or(CPUSTAT_WAIT, &vcpu->arch.sie_block->cpuflags); + /* + * We have to adjust the g3 epoch by the g2 epoch. The epoch will + * automatically be adjusted on tod clock changes via kvm_sync_clock. + */ + preempt_disable(); + scb_s->epoch += vcpu->kvm->arch.epoch; + preempt_enable(); +} + +/* + * Unregister a shadow scb from a VCPU. + */ +static void unregister_shadow_scb(struct kvm_vcpu *vcpu) +{ + atomic_andnot(CPUSTAT_WAIT, &vcpu->arch.sie_block->cpuflags); + WRITE_ONCE(vcpu->arch.vsie_block, NULL); +} + +/* + * Run the vsie on a shadowed scb, managing the gmap shadow, handling + * prefix pages and faults. + * + * Returns: - 0 if no errors occurred + * - > 0 if control has to be given to guest 2 + * - -ENOMEM if out of memory + */ +static int vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) +{ + struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s; + int rc = 0; + + while (1) { + rc = acquire_gmap_shadow(vcpu, vsie_page); + if (!rc) + rc = map_prefix(vcpu, vsie_page); + if (!rc) { + gmap_enable(vsie_page->gmap); + update_intervention_requests(vsie_page); + rc = do_vsie_run(vcpu, vsie_page); + gmap_enable(vcpu->arch.gmap); + } + atomic_andnot(PROG_BLOCK_SIE, &scb_s->prog20); + + if (rc == -EAGAIN) + rc = 0; + if (rc || scb_s->icptcode || signal_pending(current) || + kvm_s390_vcpu_has_irq(vcpu, 0)) + break; + }; + + if (rc == -EFAULT) { + /* + * Addressing exceptions are always presentes as intercepts. + * As addressing exceptions are suppressing and our guest 3 PSW + * points at the responsible instruction, we have to + * forward the PSW and set the ilc. If we can't read guest 3 + * instruction, we can use an arbitrary ilc. Let's always use + * ilen = 4 for now, so we can avoid reading in guest 3 virtual + * memory. (we could also fake the shadow so the hardware + * handles it). + */ + scb_s->icptcode = ICPT_PROGI; + scb_s->iprcc = PGM_ADDRESSING; + scb_s->pgmilc = 4; + scb_s->gpsw.addr = __rewind_psw(scb_s->gpsw, 4); + } + return rc; +} + +/* + * Get or create a vsie page for a scb address. + * + * Returns: - address of a vsie page (cached or new one) + * - NULL if the same scb address is already used by another VCPU + * - ERR_PTR(-ENOMEM) if out of memory + */ +static struct vsie_page *get_vsie_page(struct kvm *kvm, unsigned long addr) +{ + struct vsie_page *vsie_page; + struct page *page; + int nr_vcpus; + + rcu_read_lock(); + page = radix_tree_lookup(&kvm->arch.vsie.addr_to_page, addr >> 9); + rcu_read_unlock(); + if (page) { + if (page_ref_inc_return(page) == 2) + return page_to_virt(page); + page_ref_dec(page); + } + + /* + * We want at least #online_vcpus shadows, so every VCPU can execute + * the VSIE in parallel. + */ + nr_vcpus = atomic_read(&kvm->online_vcpus); + + mutex_lock(&kvm->arch.vsie.mutex); + if (kvm->arch.vsie.page_count < nr_vcpus) { + page = alloc_page(GFP_KERNEL | __GFP_ZERO | GFP_DMA); + if (!page) { + mutex_unlock(&kvm->arch.vsie.mutex); + return ERR_PTR(-ENOMEM); + } + page_ref_inc(page); + kvm->arch.vsie.pages[kvm->arch.vsie.page_count] = page; + kvm->arch.vsie.page_count++; + } else { + /* reuse an existing entry that belongs to nobody */ + while (true) { + page = kvm->arch.vsie.pages[kvm->arch.vsie.next]; + if (page_ref_inc_return(page) == 2) + break; + page_ref_dec(page); + kvm->arch.vsie.next++; + kvm->arch.vsie.next %= nr_vcpus; + } + radix_tree_delete(&kvm->arch.vsie.addr_to_page, page->index >> 9); + } + page->index = addr; + /* double use of the same address */ + if (radix_tree_insert(&kvm->arch.vsie.addr_to_page, addr >> 9, page)) { + page_ref_dec(page); + mutex_unlock(&kvm->arch.vsie.mutex); + return NULL; + } + mutex_unlock(&kvm->arch.vsie.mutex); + + vsie_page = page_to_virt(page); + memset(&vsie_page->scb_s, 0, sizeof(struct kvm_s390_sie_block)); + release_gmap_shadow(vsie_page); + vsie_page->fault_addr = 0; + vsie_page->scb_s.ihcpu = 0xffffU; + return vsie_page; +} + +/* put a vsie page acquired via get_vsie_page */ +static void put_vsie_page(struct kvm *kvm, struct vsie_page *vsie_page) +{ + struct page *page = pfn_to_page(__pa(vsie_page) >> PAGE_SHIFT); + + page_ref_dec(page); +} + +int kvm_s390_handle_vsie(struct kvm_vcpu *vcpu) +{ + struct vsie_page *vsie_page; + unsigned long scb_addr; + int rc; + + vcpu->stat.instruction_sie++; + if (!test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_SIEF2)) + return -EOPNOTSUPP; + if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE) + return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP); + + BUILD_BUG_ON(sizeof(struct vsie_page) != 4096); + scb_addr = kvm_s390_get_base_disp_s(vcpu, NULL); + + /* 512 byte alignment */ + if (unlikely(scb_addr & 0x1ffUL)) + return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); + + if (signal_pending(current) || kvm_s390_vcpu_has_irq(vcpu, 0)) + return 0; + + vsie_page = get_vsie_page(vcpu->kvm, scb_addr); + if (IS_ERR(vsie_page)) + return PTR_ERR(vsie_page); + else if (!vsie_page) + /* double use of sie control block - simply do nothing */ + return 0; + + rc = pin_scb(vcpu, vsie_page, scb_addr); + if (rc) + goto out_put; + rc = shadow_scb(vcpu, vsie_page); + if (rc) + goto out_unpin_scb; + rc = pin_blocks(vcpu, vsie_page); + if (rc) + goto out_unshadow; + register_shadow_scb(vcpu, vsie_page); + rc = vsie_run(vcpu, vsie_page); + unregister_shadow_scb(vcpu); + unpin_blocks(vcpu, vsie_page); +out_unshadow: + unshadow_scb(vcpu, vsie_page); +out_unpin_scb: + unpin_scb(vcpu, vsie_page, scb_addr); +out_put: + put_vsie_page(vcpu->kvm, vsie_page); + + return rc < 0 ? rc : 0; +} + +/* Init the vsie data structures. To be called when a vm is initialized. */ +void kvm_s390_vsie_init(struct kvm *kvm) +{ + mutex_init(&kvm->arch.vsie.mutex); + INIT_RADIX_TREE(&kvm->arch.vsie.addr_to_page, GFP_KERNEL); +} + +/* Destroy the vsie data structures. To be called when a vm is destroyed. */ +void kvm_s390_vsie_destroy(struct kvm *kvm) +{ + struct vsie_page *vsie_page; + struct page *page; + int i; + + mutex_lock(&kvm->arch.vsie.mutex); + for (i = 0; i < kvm->arch.vsie.page_count; i++) { + page = kvm->arch.vsie.pages[i]; + kvm->arch.vsie.pages[i] = NULL; + vsie_page = page_to_virt(page); + release_gmap_shadow(vsie_page); + /* free the radix tree entry */ + radix_tree_delete(&kvm->arch.vsie.addr_to_page, page->index >> 9); + __free_page(page); + } + kvm->arch.vsie.page_count = 0; + mutex_unlock(&kvm->arch.vsie.mutex); +} + +void kvm_s390_vsie_kick(struct kvm_vcpu *vcpu) +{ + struct kvm_s390_sie_block *scb = READ_ONCE(vcpu->arch.vsie_block); + + /* + * Even if the VCPU lets go of the shadow sie block reference, it is + * still valid in the cache. So we can safely kick it. + */ + if (scb) { + atomic_or(PROG_BLOCK_SIE, &scb->prog20); + if (scb->prog0c & PROG_IN_SIE) + atomic_or(CPUSTAT_STOP_INT, &scb->cpuflags); + } +} diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c index 19288c1b36d32bfed0013e6a90385d8d4c82d08c..730e0d3aa840dde54ca6215454ef3818345a1fce 100644 --- a/arch/s390/mm/fault.c +++ b/arch/s390/mm/fault.c @@ -418,6 +418,8 @@ static inline int do_exception(struct pt_regs *regs, int access) (struct gmap *) S390_lowcore.gmap : NULL; if (gmap) { current->thread.gmap_addr = address; + current->thread.gmap_write_flag = !!(flags & FAULT_FLAG_WRITE); + current->thread.gmap_int_code = regs->int_code & 0xffff; address = __gmap_translate(gmap, address); if (address == -EFAULT) { fault = VM_FAULT_BADMAP; diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c index cace818d86eb95bae2cd6f36038f177a5548a926..af0ae6d7ac59fd8c5df92bfba714f3456b16c880 100644 --- a/arch/s390/mm/gmap.c +++ b/arch/s390/mm/gmap.c @@ -20,14 +20,16 @@ #include #include +#define GMAP_SHADOW_FAKE_TABLE 1ULL + /** - * gmap_alloc - allocate a guest address space + * gmap_alloc - allocate and initialize a guest address space * @mm: pointer to the parent mm_struct * @limit: maximum address of the gmap address space * * Returns a guest address space structure. */ -struct gmap *gmap_alloc(struct mm_struct *mm, unsigned long limit) +static struct gmap *gmap_alloc(unsigned long limit) { struct gmap *gmap; struct page *page; @@ -55,10 +57,14 @@ struct gmap *gmap_alloc(struct mm_struct *mm, unsigned long limit) if (!gmap) goto out; INIT_LIST_HEAD(&gmap->crst_list); + INIT_LIST_HEAD(&gmap->children); + INIT_LIST_HEAD(&gmap->pt_list); INIT_RADIX_TREE(&gmap->guest_to_host, GFP_KERNEL); INIT_RADIX_TREE(&gmap->host_to_guest, GFP_ATOMIC); + INIT_RADIX_TREE(&gmap->host_to_rmap, GFP_ATOMIC); spin_lock_init(&gmap->guest_table_lock); - gmap->mm = mm; + spin_lock_init(&gmap->shadow_lock); + atomic_set(&gmap->ref_count, 1); page = alloc_pages(GFP_KERNEL, 2); if (!page) goto out_free; @@ -70,9 +76,6 @@ struct gmap *gmap_alloc(struct mm_struct *mm, unsigned long limit) gmap->asce = atype | _ASCE_TABLE_LENGTH | _ASCE_USER_BITS | __pa(table); gmap->asce_end = limit; - down_write(&mm->mmap_sem); - list_add(&gmap->list, &mm->context.gmap_list); - up_write(&mm->mmap_sem); return gmap; out_free: @@ -80,7 +83,28 @@ struct gmap *gmap_alloc(struct mm_struct *mm, unsigned long limit) out: return NULL; } -EXPORT_SYMBOL_GPL(gmap_alloc); + +/** + * gmap_create - create a guest address space + * @mm: pointer to the parent mm_struct + * @limit: maximum size of the gmap address space + * + * Returns a guest address space structure. + */ +struct gmap *gmap_create(struct mm_struct *mm, unsigned long limit) +{ + struct gmap *gmap; + + gmap = gmap_alloc(limit); + if (!gmap) + return NULL; + gmap->mm = mm; + spin_lock(&mm->context.gmap_lock); + list_add_rcu(&gmap->list, &mm->context.gmap_list); + spin_unlock(&mm->context.gmap_lock); + return gmap; +} +EXPORT_SYMBOL_GPL(gmap_create); static void gmap_flush_tlb(struct gmap *gmap) { @@ -114,31 +138,117 @@ static void gmap_radix_tree_free(struct radix_tree_root *root) } while (nr > 0); } +static void gmap_rmap_radix_tree_free(struct radix_tree_root *root) +{ + struct gmap_rmap *rmap, *rnext, *head; + struct radix_tree_iter iter; + unsigned long indices[16]; + unsigned long index; + void **slot; + int i, nr; + + /* A radix tree is freed by deleting all of its entries */ + index = 0; + do { + nr = 0; + radix_tree_for_each_slot(slot, root, &iter, index) { + indices[nr] = iter.index; + if (++nr == 16) + break; + } + for (i = 0; i < nr; i++) { + index = indices[i]; + head = radix_tree_delete(root, index); + gmap_for_each_rmap_safe(rmap, rnext, head) + kfree(rmap); + } + } while (nr > 0); +} + /** * gmap_free - free a guest address space * @gmap: pointer to the guest address space structure + * + * No locks required. There are no references to this gmap anymore. */ -void gmap_free(struct gmap *gmap) +static void gmap_free(struct gmap *gmap) { struct page *page, *next; - /* Flush tlb. */ - if (MACHINE_HAS_IDTE) - __tlb_flush_asce(gmap->mm, gmap->asce); - else - __tlb_flush_global(); - + /* Flush tlb of all gmaps (if not already done for shadows) */ + if (!(gmap_is_shadow(gmap) && gmap->removed)) + gmap_flush_tlb(gmap); /* Free all segment & region tables. */ list_for_each_entry_safe(page, next, &gmap->crst_list, lru) __free_pages(page, 2); gmap_radix_tree_free(&gmap->guest_to_host); gmap_radix_tree_free(&gmap->host_to_guest); - down_write(&gmap->mm->mmap_sem); - list_del(&gmap->list); - up_write(&gmap->mm->mmap_sem); + + /* Free additional data for a shadow gmap */ + if (gmap_is_shadow(gmap)) { + /* Free all page tables. */ + list_for_each_entry_safe(page, next, &gmap->pt_list, lru) + page_table_free_pgste(page); + gmap_rmap_radix_tree_free(&gmap->host_to_rmap); + /* Release reference to the parent */ + gmap_put(gmap->parent); + } + kfree(gmap); } -EXPORT_SYMBOL_GPL(gmap_free); + +/** + * gmap_get - increase reference counter for guest address space + * @gmap: pointer to the guest address space structure + * + * Returns the gmap pointer + */ +struct gmap *gmap_get(struct gmap *gmap) +{ + atomic_inc(&gmap->ref_count); + return gmap; +} +EXPORT_SYMBOL_GPL(gmap_get); + +/** + * gmap_put - decrease reference counter for guest address space + * @gmap: pointer to the guest address space structure + * + * If the reference counter reaches zero the guest address space is freed. + */ +void gmap_put(struct gmap *gmap) +{ + if (atomic_dec_return(&gmap->ref_count) == 0) + gmap_free(gmap); +} +EXPORT_SYMBOL_GPL(gmap_put); + +/** + * gmap_remove - remove a guest address space but do not free it yet + * @gmap: pointer to the guest address space structure + */ +void gmap_remove(struct gmap *gmap) +{ + struct gmap *sg, *next; + + /* Remove all shadow gmaps linked to this gmap */ + if (!list_empty(&gmap->children)) { + spin_lock(&gmap->shadow_lock); + list_for_each_entry_safe(sg, next, &gmap->children, list) { + list_del(&sg->list); + gmap_put(sg); + } + spin_unlock(&gmap->shadow_lock); + } + /* Remove gmap from the pre-mm list */ + spin_lock(&gmap->mm->context.gmap_lock); + list_del_rcu(&gmap->list); + spin_unlock(&gmap->mm->context.gmap_lock); + synchronize_rcu(); + /* Put reference */ + gmap_put(gmap); +} +EXPORT_SYMBOL_GPL(gmap_remove); /** * gmap_enable - switch primary space to the guest address space @@ -160,6 +270,17 @@ void gmap_disable(struct gmap *gmap) } EXPORT_SYMBOL_GPL(gmap_disable); +/** + * gmap_get_enabled - get a pointer to the currently enabled gmap + * + * Returns a pointer to the currently enabled gmap. 0 if none is enabled. + */ +struct gmap *gmap_get_enabled(void) +{ + return (struct gmap *) S390_lowcore.gmap; +} +EXPORT_SYMBOL_GPL(gmap_get_enabled); + /* * gmap_alloc_table is assumed to be called with mmap_sem held */ @@ -175,7 +296,7 @@ static int gmap_alloc_table(struct gmap *gmap, unsigned long *table, return -ENOMEM; new = (unsigned long *) page_to_phys(page); crst_table_init(new, init); - spin_lock(&gmap->mm->page_table_lock); + spin_lock(&gmap->guest_table_lock); if (*table & _REGION_ENTRY_INVALID) { list_add(&page->lru, &gmap->crst_list); *table = (unsigned long) new | _REGION_ENTRY_LENGTH | @@ -183,7 +304,7 @@ static int gmap_alloc_table(struct gmap *gmap, unsigned long *table, page->index = gaddr; page = NULL; } - spin_unlock(&gmap->mm->page_table_lock); + spin_unlock(&gmap->guest_table_lock); if (page) __free_pages(page, 2); return 0; @@ -219,6 +340,7 @@ static int __gmap_unlink_by_vmaddr(struct gmap *gmap, unsigned long vmaddr) unsigned long *entry; int flush = 0; + BUG_ON(gmap_is_shadow(gmap)); spin_lock(&gmap->guest_table_lock); entry = radix_tree_delete(&gmap->host_to_guest, vmaddr >> PMD_SHIFT); if (entry) { @@ -258,6 +380,7 @@ int gmap_unmap_segment(struct gmap *gmap, unsigned long to, unsigned long len) unsigned long off; int flush; + BUG_ON(gmap_is_shadow(gmap)); if ((to | len) & (PMD_SIZE - 1)) return -EINVAL; if (len == 0 || to + len < to) @@ -289,6 +412,7 @@ int gmap_map_segment(struct gmap *gmap, unsigned long from, unsigned long off; int flush; + BUG_ON(gmap_is_shadow(gmap)); if ((from | to | len) & (PMD_SIZE - 1)) return -EINVAL; if (len == 0 || from + len < from || to + len < to || @@ -326,6 +450,8 @@ EXPORT_SYMBOL_GPL(gmap_map_segment); * This function does not establish potentially missing page table entries. * The mmap_sem of the mm that belongs to the address space must be held * when this function gets called. + * + * Note: Can also be called for shadow gmaps. */ unsigned long __gmap_translate(struct gmap *gmap, unsigned long gaddr) { @@ -333,6 +459,7 @@ unsigned long __gmap_translate(struct gmap *gmap, unsigned long gaddr) vmaddr = (unsigned long) radix_tree_lookup(&gmap->guest_to_host, gaddr >> PMD_SHIFT); + /* Note: guest_to_host is empty for a shadow gmap */ return vmaddr ? (vmaddr | (gaddr & ~PMD_MASK)) : -EFAULT; } EXPORT_SYMBOL_GPL(__gmap_translate); @@ -369,11 +496,13 @@ void gmap_unlink(struct mm_struct *mm, unsigned long *table, struct gmap *gmap; int flush; - list_for_each_entry(gmap, &mm->context.gmap_list, list) { + rcu_read_lock(); + list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) { flush = __gmap_unlink_by_vmaddr(gmap, vmaddr); if (flush) gmap_flush_tlb(gmap); } + rcu_read_unlock(); } /** @@ -397,6 +526,7 @@ int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr) pmd_t *pmd; int rc; + BUG_ON(gmap_is_shadow(gmap)); /* Create higher level tables in the gmap page table */ table = gmap->table; if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION1) { @@ -549,116 +679,1412 @@ static LIST_HEAD(gmap_notifier_list); static DEFINE_SPINLOCK(gmap_notifier_lock); /** - * gmap_register_ipte_notifier - register a pte invalidation callback + * gmap_register_pte_notifier - register a pte invalidation callback * @nb: pointer to the gmap notifier block */ -void gmap_register_ipte_notifier(struct gmap_notifier *nb) +void gmap_register_pte_notifier(struct gmap_notifier *nb) { spin_lock(&gmap_notifier_lock); - list_add(&nb->list, &gmap_notifier_list); + list_add_rcu(&nb->list, &gmap_notifier_list); spin_unlock(&gmap_notifier_lock); } -EXPORT_SYMBOL_GPL(gmap_register_ipte_notifier); +EXPORT_SYMBOL_GPL(gmap_register_pte_notifier); /** - * gmap_unregister_ipte_notifier - remove a pte invalidation callback + * gmap_unregister_pte_notifier - remove a pte invalidation callback * @nb: pointer to the gmap notifier block */ -void gmap_unregister_ipte_notifier(struct gmap_notifier *nb) +void gmap_unregister_pte_notifier(struct gmap_notifier *nb) { spin_lock(&gmap_notifier_lock); - list_del_init(&nb->list); + list_del_rcu(&nb->list); spin_unlock(&gmap_notifier_lock); + synchronize_rcu(); +} +EXPORT_SYMBOL_GPL(gmap_unregister_pte_notifier); + +/** + * gmap_call_notifier - call all registered invalidation callbacks + * @gmap: pointer to guest mapping meta data structure + * @start: start virtual address in the guest address space + * @end: end virtual address in the guest address space + */ +static void gmap_call_notifier(struct gmap *gmap, unsigned long start, + unsigned long end) +{ + struct gmap_notifier *nb; + + list_for_each_entry(nb, &gmap_notifier_list, list) + nb->notifier_call(gmap, start, end); +} + +/** + * gmap_table_walk - walk the gmap page tables + * @gmap: pointer to guest mapping meta data structure + * @gaddr: virtual address in the guest address space + * @level: page table level to stop at + * + * Returns a table entry pointer for the given guest address and @level + * @level=0 : returns a pointer to a page table table entry (or NULL) + * @level=1 : returns a pointer to a segment table entry (or NULL) + * @level=2 : returns a pointer to a region-3 table entry (or NULL) + * @level=3 : returns a pointer to a region-2 table entry (or NULL) + * @level=4 : returns a pointer to a region-1 table entry (or NULL) + * + * Returns NULL if the gmap page tables could not be walked to the + * requested level. + * + * Note: Can also be called for shadow gmaps. + */ +static inline unsigned long *gmap_table_walk(struct gmap *gmap, + unsigned long gaddr, int level) +{ + unsigned long *table; + + if ((gmap->asce & _ASCE_TYPE_MASK) + 4 < (level * 4)) + return NULL; + if (gmap_is_shadow(gmap) && gmap->removed) + return NULL; + if (gaddr & (-1UL << (31 + ((gmap->asce & _ASCE_TYPE_MASK) >> 2)*11))) + return NULL; + table = gmap->table; + switch (gmap->asce & _ASCE_TYPE_MASK) { + case _ASCE_TYPE_REGION1: + table += (gaddr >> 53) & 0x7ff; + if (level == 4) + break; + if (*table & _REGION_ENTRY_INVALID) + return NULL; + table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); + /* Fallthrough */ + case _ASCE_TYPE_REGION2: + table += (gaddr >> 42) & 0x7ff; + if (level == 3) + break; + if (*table & _REGION_ENTRY_INVALID) + return NULL; + table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); + /* Fallthrough */ + case _ASCE_TYPE_REGION3: + table += (gaddr >> 31) & 0x7ff; + if (level == 2) + break; + if (*table & _REGION_ENTRY_INVALID) + return NULL; + table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); + /* Fallthrough */ + case _ASCE_TYPE_SEGMENT: + table += (gaddr >> 20) & 0x7ff; + if (level == 1) + break; + if (*table & _REGION_ENTRY_INVALID) + return NULL; + table = (unsigned long *)(*table & _SEGMENT_ENTRY_ORIGIN); + table += (gaddr >> 12) & 0xff; + } + return table; +} + +/** + * gmap_pte_op_walk - walk the gmap page table, get the page table lock + * and return the pte pointer + * @gmap: pointer to guest mapping meta data structure + * @gaddr: virtual address in the guest address space + * @ptl: pointer to the spinlock pointer + * + * Returns a pointer to the locked pte for a guest address, or NULL + * + * Note: Can also be called for shadow gmaps. + */ +static pte_t *gmap_pte_op_walk(struct gmap *gmap, unsigned long gaddr, + spinlock_t **ptl) +{ + unsigned long *table; + + if (gmap_is_shadow(gmap)) + spin_lock(&gmap->guest_table_lock); + /* Walk the gmap page table, lock and get pte pointer */ + table = gmap_table_walk(gmap, gaddr, 1); /* get segment pointer */ + if (!table || *table & _SEGMENT_ENTRY_INVALID) { + if (gmap_is_shadow(gmap)) + spin_unlock(&gmap->guest_table_lock); + return NULL; + } + if (gmap_is_shadow(gmap)) { + *ptl = &gmap->guest_table_lock; + return pte_offset_map((pmd_t *) table, gaddr); + } + return pte_alloc_map_lock(gmap->mm, (pmd_t *) table, gaddr, ptl); +} + +/** + * gmap_pte_op_fixup - force a page in and connect the gmap page table + * @gmap: pointer to guest mapping meta data structure + * @gaddr: virtual address in the guest address space + * @vmaddr: address in the host process address space + * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE + * + * Returns 0 if the caller can retry __gmap_translate (might fail again), + * -ENOMEM if out of memory and -EFAULT if anything goes wrong while fixing + * up or connecting the gmap page table. + */ +static int gmap_pte_op_fixup(struct gmap *gmap, unsigned long gaddr, + unsigned long vmaddr, int prot) +{ + struct mm_struct *mm = gmap->mm; + unsigned int fault_flags; + bool unlocked = false; + + BUG_ON(gmap_is_shadow(gmap)); + fault_flags = (prot == PROT_WRITE) ? FAULT_FLAG_WRITE : 0; + if (fixup_user_fault(current, mm, vmaddr, fault_flags, &unlocked)) + return -EFAULT; + if (unlocked) + /* lost mmap_sem, caller has to retry __gmap_translate */ + return 0; + /* Connect the page tables */ + return __gmap_link(gmap, gaddr, vmaddr); } -EXPORT_SYMBOL_GPL(gmap_unregister_ipte_notifier); /** - * gmap_ipte_notify - mark a range of ptes for invalidation notification + * gmap_pte_op_end - release the page table lock + * @ptl: pointer to the spinlock pointer + */ +static void gmap_pte_op_end(spinlock_t *ptl) +{ + spin_unlock(ptl); +} + +/* + * gmap_protect_range - remove access rights to memory and set pgste bits * @gmap: pointer to guest mapping meta data structure * @gaddr: virtual address in the guest address space * @len: size of area + * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE + * @bits: pgste notification bits to set * - * Returns 0 if for each page in the given range a gmap mapping exists and - * the invalidation notification could be set. If the gmap mapping is missing - * for one or more pages -EFAULT is returned. If no memory could be allocated - * -ENOMEM is returned. This function establishes missing page table entries. + * Returns 0 if successfully protected, -ENOMEM if out of memory and + * -EFAULT if gaddr is invalid (or mapping for shadows is missing). + * + * Called with sg->mm->mmap_sem in read. + * + * Note: Can also be called for shadow gmaps. */ -int gmap_ipte_notify(struct gmap *gmap, unsigned long gaddr, unsigned long len) +static int gmap_protect_range(struct gmap *gmap, unsigned long gaddr, + unsigned long len, int prot, unsigned long bits) { - unsigned long addr; + unsigned long vmaddr; spinlock_t *ptl; pte_t *ptep; - bool unlocked; - int rc = 0; + int rc; + + while (len) { + rc = -EAGAIN; + ptep = gmap_pte_op_walk(gmap, gaddr, &ptl); + if (ptep) { + rc = ptep_force_prot(gmap->mm, gaddr, ptep, prot, bits); + gmap_pte_op_end(ptl); + } + if (rc) { + vmaddr = __gmap_translate(gmap, gaddr); + if (IS_ERR_VALUE(vmaddr)) + return vmaddr; + rc = gmap_pte_op_fixup(gmap, gaddr, vmaddr, prot); + if (rc) + return rc; + continue; + } + gaddr += PAGE_SIZE; + len -= PAGE_SIZE; + } + return 0; +} + +/** + * gmap_mprotect_notify - change access rights for a range of ptes and + * call the notifier if any pte changes again + * @gmap: pointer to guest mapping meta data structure + * @gaddr: virtual address in the guest address space + * @len: size of area + * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE + * + * Returns 0 if for each page in the given range a gmap mapping exists, + * the new access rights could be set and the notifier could be armed. + * If the gmap mapping is missing for one or more pages -EFAULT is + * returned. If no memory could be allocated -ENOMEM is returned. + * This function establishes missing page table entries. + */ +int gmap_mprotect_notify(struct gmap *gmap, unsigned long gaddr, + unsigned long len, int prot) +{ + int rc; - if ((gaddr & ~PAGE_MASK) || (len & ~PAGE_MASK)) + if ((gaddr & ~PAGE_MASK) || (len & ~PAGE_MASK) || gmap_is_shadow(gmap)) + return -EINVAL; + if (!MACHINE_HAS_ESOP && prot == PROT_READ) return -EINVAL; down_read(&gmap->mm->mmap_sem); - while (len) { - unlocked = false; - /* Convert gmap address and connect the page tables */ - addr = __gmap_translate(gmap, gaddr); - if (IS_ERR_VALUE(addr)) { - rc = addr; - break; + rc = gmap_protect_range(gmap, gaddr, len, prot, PGSTE_IN_BIT); + up_read(&gmap->mm->mmap_sem); + return rc; +} +EXPORT_SYMBOL_GPL(gmap_mprotect_notify); + +/** + * gmap_read_table - get an unsigned long value from a guest page table using + * absolute addressing, without marking the page referenced. + * @gmap: pointer to guest mapping meta data structure + * @gaddr: virtual address in the guest address space + * @val: pointer to the unsigned long value to return + * + * Returns 0 if the value was read, -ENOMEM if out of memory and -EFAULT + * if reading using the virtual address failed. + * + * Called with gmap->mm->mmap_sem in read. + */ +int gmap_read_table(struct gmap *gmap, unsigned long gaddr, unsigned long *val) +{ + unsigned long address, vmaddr; + spinlock_t *ptl; + pte_t *ptep, pte; + int rc; + + while (1) { + rc = -EAGAIN; + ptep = gmap_pte_op_walk(gmap, gaddr, &ptl); + if (ptep) { + pte = *ptep; + if (pte_present(pte) && (pte_val(pte) & _PAGE_READ)) { + address = pte_val(pte) & PAGE_MASK; + address += gaddr & ~PAGE_MASK; + *val = *(unsigned long *) address; + pte_val(*ptep) |= _PAGE_YOUNG; + /* Do *NOT* clear the _PAGE_INVALID bit! */ + rc = 0; + } + gmap_pte_op_end(ptl); } - /* Get the page mapped */ - if (fixup_user_fault(current, gmap->mm, addr, FAULT_FLAG_WRITE, - &unlocked)) { - rc = -EFAULT; + if (!rc) + break; + vmaddr = __gmap_translate(gmap, gaddr); + if (IS_ERR_VALUE(vmaddr)) { + rc = vmaddr; break; } - /* While trying to map mmap_sem got unlocked. Let us retry */ - if (unlocked) - continue; - rc = __gmap_link(gmap, gaddr, addr); + rc = gmap_pte_op_fixup(gmap, gaddr, vmaddr, PROT_READ); if (rc) break; - /* Walk the process page table, lock and get pte pointer */ - ptep = get_locked_pte(gmap->mm, addr, &ptl); - VM_BUG_ON(!ptep); - /* Set notification bit in the pgste of the pte */ - if ((pte_val(*ptep) & (_PAGE_INVALID | _PAGE_PROTECT)) == 0) { - ptep_set_notify(gmap->mm, addr, ptep); - gaddr += PAGE_SIZE; - len -= PAGE_SIZE; - } - pte_unmap_unlock(ptep, ptl); } - up_read(&gmap->mm->mmap_sem); return rc; } -EXPORT_SYMBOL_GPL(gmap_ipte_notify); +EXPORT_SYMBOL_GPL(gmap_read_table); /** - * ptep_notify - call all invalidation callbacks for a specific pte. - * @mm: pointer to the process mm_struct - * @addr: virtual address in the process address space - * @pte: pointer to the page table entry + * gmap_insert_rmap - add a rmap to the host_to_rmap radix tree + * @sg: pointer to the shadow guest address space structure + * @vmaddr: vm address associated with the rmap + * @rmap: pointer to the rmap structure * - * This function is assumed to be called with the page table lock held - * for the pte to notify. + * Called with the sg->guest_table_lock */ -void ptep_notify(struct mm_struct *mm, unsigned long vmaddr, pte_t *pte) +static inline void gmap_insert_rmap(struct gmap *sg, unsigned long vmaddr, + struct gmap_rmap *rmap) { - unsigned long offset, gaddr; - unsigned long *table; - struct gmap_notifier *nb; - struct gmap *gmap; + void **slot; - offset = ((unsigned long) pte) & (255 * sizeof(pte_t)); - offset = offset * (4096 / sizeof(pte_t)); - spin_lock(&gmap_notifier_lock); - list_for_each_entry(gmap, &mm->context.gmap_list, list) { - table = radix_tree_lookup(&gmap->host_to_guest, - vmaddr >> PMD_SHIFT); - if (!table) + BUG_ON(!gmap_is_shadow(sg)); + slot = radix_tree_lookup_slot(&sg->host_to_rmap, vmaddr >> PAGE_SHIFT); + if (slot) { + rmap->next = radix_tree_deref_slot_protected(slot, + &sg->guest_table_lock); + radix_tree_replace_slot(slot, rmap); + } else { + rmap->next = NULL; + radix_tree_insert(&sg->host_to_rmap, vmaddr >> PAGE_SHIFT, + rmap); + } +} + +/** + * gmap_protect_rmap - modify access rights to memory and create an rmap + * @sg: pointer to the shadow guest address space structure + * @raddr: rmap address in the shadow gmap + * @paddr: address in the parent guest address space + * @len: length of the memory area to protect + * @prot: indicates access rights: none, read-only or read-write + * + * Returns 0 if successfully protected and the rmap was created, -ENOMEM + * if out of memory and -EFAULT if paddr is invalid. + */ +static int gmap_protect_rmap(struct gmap *sg, unsigned long raddr, + unsigned long paddr, unsigned long len, int prot) +{ + struct gmap *parent; + struct gmap_rmap *rmap; + unsigned long vmaddr; + spinlock_t *ptl; + pte_t *ptep; + int rc; + + BUG_ON(!gmap_is_shadow(sg)); + parent = sg->parent; + while (len) { + vmaddr = __gmap_translate(parent, paddr); + if (IS_ERR_VALUE(vmaddr)) + return vmaddr; + rmap = kzalloc(sizeof(*rmap), GFP_KERNEL); + if (!rmap) + return -ENOMEM; + rmap->raddr = raddr; + rc = radix_tree_preload(GFP_KERNEL); + if (rc) { + kfree(rmap); + return rc; + } + rc = -EAGAIN; + ptep = gmap_pte_op_walk(parent, paddr, &ptl); + if (ptep) { + spin_lock(&sg->guest_table_lock); + rc = ptep_force_prot(parent->mm, paddr, ptep, prot, + PGSTE_VSIE_BIT); + if (!rc) + gmap_insert_rmap(sg, vmaddr, rmap); + spin_unlock(&sg->guest_table_lock); + gmap_pte_op_end(ptl); + } + radix_tree_preload_end(); + if (rc) { + kfree(rmap); + rc = gmap_pte_op_fixup(parent, paddr, vmaddr, prot); + if (rc) + return rc; continue; - gaddr = __gmap_segment_gaddr(table) + offset; - list_for_each_entry(nb, &gmap_notifier_list, list) - nb->notifier_call(gmap, gaddr); + } + paddr += PAGE_SIZE; + len -= PAGE_SIZE; } - spin_unlock(&gmap_notifier_lock); + return 0; +} + +#define _SHADOW_RMAP_MASK 0x7 +#define _SHADOW_RMAP_REGION1 0x5 +#define _SHADOW_RMAP_REGION2 0x4 +#define _SHADOW_RMAP_REGION3 0x3 +#define _SHADOW_RMAP_SEGMENT 0x2 +#define _SHADOW_RMAP_PGTABLE 0x1 + +/** + * gmap_idte_one - invalidate a single region or segment table entry + * @asce: region or segment table *origin* + table-type bits + * @vaddr: virtual address to identify the table entry to flush + * + * The invalid bit of a single region or segment table entry is set + * and the associated TLB entries depending on the entry are flushed. + * The table-type of the @asce identifies the portion of the @vaddr + * that is used as the invalidation index. + */ +static inline void gmap_idte_one(unsigned long asce, unsigned long vaddr) +{ + asm volatile( + " .insn rrf,0xb98e0000,%0,%1,0,0" + : : "a" (asce), "a" (vaddr) : "cc", "memory"); +} + +/** + * gmap_unshadow_page - remove a page from a shadow page table + * @sg: pointer to the shadow guest address space structure + * @raddr: rmap address in the shadow guest address space + * + * Called with the sg->guest_table_lock + */ +static void gmap_unshadow_page(struct gmap *sg, unsigned long raddr) +{ + unsigned long *table; + + BUG_ON(!gmap_is_shadow(sg)); + table = gmap_table_walk(sg, raddr, 0); /* get page table pointer */ + if (!table || *table & _PAGE_INVALID) + return; + gmap_call_notifier(sg, raddr, raddr + (1UL << 12) - 1); + ptep_unshadow_pte(sg->mm, raddr, (pte_t *) table); +} + +/** + * __gmap_unshadow_pgt - remove all entries from a shadow page table + * @sg: pointer to the shadow guest address space structure + * @raddr: rmap address in the shadow guest address space + * @pgt: pointer to the start of a shadow page table + * + * Called with the sg->guest_table_lock + */ +static void __gmap_unshadow_pgt(struct gmap *sg, unsigned long raddr, + unsigned long *pgt) +{ + int i; + + BUG_ON(!gmap_is_shadow(sg)); + for (i = 0; i < 256; i++, raddr += 1UL << 12) + pgt[i] = _PAGE_INVALID; +} + +/** + * gmap_unshadow_pgt - remove a shadow page table from a segment entry + * @sg: pointer to the shadow guest address space structure + * @raddr: address in the shadow guest address space + * + * Called with the sg->guest_table_lock + */ +static void gmap_unshadow_pgt(struct gmap *sg, unsigned long raddr) +{ + unsigned long sto, *ste, *pgt; + struct page *page; + + BUG_ON(!gmap_is_shadow(sg)); + ste = gmap_table_walk(sg, raddr, 1); /* get segment pointer */ + if (!ste || !(*ste & _SEGMENT_ENTRY_ORIGIN)) + return; + gmap_call_notifier(sg, raddr, raddr + (1UL << 20) - 1); + sto = (unsigned long) (ste - ((raddr >> 20) & 0x7ff)); + gmap_idte_one(sto | _ASCE_TYPE_SEGMENT, raddr); + pgt = (unsigned long *)(*ste & _SEGMENT_ENTRY_ORIGIN); + *ste = _SEGMENT_ENTRY_EMPTY; + __gmap_unshadow_pgt(sg, raddr, pgt); + /* Free page table */ + page = pfn_to_page(__pa(pgt) >> PAGE_SHIFT); + list_del(&page->lru); + page_table_free_pgste(page); +} + +/** + * __gmap_unshadow_sgt - remove all entries from a shadow segment table + * @sg: pointer to the shadow guest address space structure + * @raddr: rmap address in the shadow guest address space + * @sgt: pointer to the start of a shadow segment table + * + * Called with the sg->guest_table_lock + */ +static void __gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr, + unsigned long *sgt) +{ + unsigned long asce, *pgt; + struct page *page; + int i; + + BUG_ON(!gmap_is_shadow(sg)); + asce = (unsigned long) sgt | _ASCE_TYPE_SEGMENT; + for (i = 0; i < 2048; i++, raddr += 1UL << 20) { + if (!(sgt[i] & _SEGMENT_ENTRY_ORIGIN)) + continue; + pgt = (unsigned long *)(sgt[i] & _REGION_ENTRY_ORIGIN); + sgt[i] = _SEGMENT_ENTRY_EMPTY; + __gmap_unshadow_pgt(sg, raddr, pgt); + /* Free page table */ + page = pfn_to_page(__pa(pgt) >> PAGE_SHIFT); + list_del(&page->lru); + page_table_free_pgste(page); + } +} + +/** + * gmap_unshadow_sgt - remove a shadow segment table from a region-3 entry + * @sg: pointer to the shadow guest address space structure + * @raddr: rmap address in the shadow guest address space + * + * Called with the shadow->guest_table_lock + */ +static void gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr) +{ + unsigned long r3o, *r3e, *sgt; + struct page *page; + + BUG_ON(!gmap_is_shadow(sg)); + r3e = gmap_table_walk(sg, raddr, 2); /* get region-3 pointer */ + if (!r3e || !(*r3e & _REGION_ENTRY_ORIGIN)) + return; + gmap_call_notifier(sg, raddr, raddr + (1UL << 31) - 1); + r3o = (unsigned long) (r3e - ((raddr >> 31) & 0x7ff)); + gmap_idte_one(r3o | _ASCE_TYPE_REGION3, raddr); + sgt = (unsigned long *)(*r3e & _REGION_ENTRY_ORIGIN); + *r3e = _REGION3_ENTRY_EMPTY; + __gmap_unshadow_sgt(sg, raddr, sgt); + /* Free segment table */ + page = pfn_to_page(__pa(sgt) >> PAGE_SHIFT); + list_del(&page->lru); + __free_pages(page, 2); +} + +/** + * __gmap_unshadow_r3t - remove all entries from a shadow region-3 table + * @sg: pointer to the shadow guest address space structure + * @raddr: address in the shadow guest address space + * @r3t: pointer to the start of a shadow region-3 table + * + * Called with the sg->guest_table_lock + */ +static void __gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr, + unsigned long *r3t) +{ + unsigned long asce, *sgt; + struct page *page; + int i; + + BUG_ON(!gmap_is_shadow(sg)); + asce = (unsigned long) r3t | _ASCE_TYPE_REGION3; + for (i = 0; i < 2048; i++, raddr += 1UL << 31) { + if (!(r3t[i] & _REGION_ENTRY_ORIGIN)) + continue; + sgt = (unsigned long *)(r3t[i] & _REGION_ENTRY_ORIGIN); + r3t[i] = _REGION3_ENTRY_EMPTY; + __gmap_unshadow_sgt(sg, raddr, sgt); + /* Free segment table */ + page = pfn_to_page(__pa(sgt) >> PAGE_SHIFT); + list_del(&page->lru); + __free_pages(page, 2); + } +} + +/** + * gmap_unshadow_r3t - remove a shadow region-3 table from a region-2 entry + * @sg: pointer to the shadow guest address space structure + * @raddr: rmap address in the shadow guest address space + * + * Called with the sg->guest_table_lock + */ +static void gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr) +{ + unsigned long r2o, *r2e, *r3t; + struct page *page; + + BUG_ON(!gmap_is_shadow(sg)); + r2e = gmap_table_walk(sg, raddr, 3); /* get region-2 pointer */ + if (!r2e || !(*r2e & _REGION_ENTRY_ORIGIN)) + return; + gmap_call_notifier(sg, raddr, raddr + (1UL << 42) - 1); + r2o = (unsigned long) (r2e - ((raddr >> 42) & 0x7ff)); + gmap_idte_one(r2o | _ASCE_TYPE_REGION2, raddr); + r3t = (unsigned long *)(*r2e & _REGION_ENTRY_ORIGIN); + *r2e = _REGION2_ENTRY_EMPTY; + __gmap_unshadow_r3t(sg, raddr, r3t); + /* Free region 3 table */ + page = pfn_to_page(__pa(r3t) >> PAGE_SHIFT); + list_del(&page->lru); + __free_pages(page, 2); +} + +/** + * __gmap_unshadow_r2t - remove all entries from a shadow region-2 table + * @sg: pointer to the shadow guest address space structure + * @raddr: rmap address in the shadow guest address space + * @r2t: pointer to the start of a shadow region-2 table + * + * Called with the sg->guest_table_lock + */ +static void __gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr, + unsigned long *r2t) +{ + unsigned long asce, *r3t; + struct page *page; + int i; + + BUG_ON(!gmap_is_shadow(sg)); + asce = (unsigned long) r2t | _ASCE_TYPE_REGION2; + for (i = 0; i < 2048; i++, raddr += 1UL << 42) { + if (!(r2t[i] & _REGION_ENTRY_ORIGIN)) + continue; + r3t = (unsigned long *)(r2t[i] & _REGION_ENTRY_ORIGIN); + r2t[i] = _REGION2_ENTRY_EMPTY; + __gmap_unshadow_r3t(sg, raddr, r3t); + /* Free region 3 table */ + page = pfn_to_page(__pa(r3t) >> PAGE_SHIFT); + list_del(&page->lru); + __free_pages(page, 2); + } +} + +/** + * gmap_unshadow_r2t - remove a shadow region-2 table from a region-1 entry + * @sg: pointer to the shadow guest address space structure + * @raddr: rmap address in the shadow guest address space + * + * Called with the sg->guest_table_lock + */ +static void gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr) +{ + unsigned long r1o, *r1e, *r2t; + struct page *page; + + BUG_ON(!gmap_is_shadow(sg)); + r1e = gmap_table_walk(sg, raddr, 4); /* get region-1 pointer */ + if (!r1e || !(*r1e & _REGION_ENTRY_ORIGIN)) + return; + gmap_call_notifier(sg, raddr, raddr + (1UL << 53) - 1); + r1o = (unsigned long) (r1e - ((raddr >> 53) & 0x7ff)); + gmap_idte_one(r1o | _ASCE_TYPE_REGION1, raddr); + r2t = (unsigned long *)(*r1e & _REGION_ENTRY_ORIGIN); + *r1e = _REGION1_ENTRY_EMPTY; + __gmap_unshadow_r2t(sg, raddr, r2t); + /* Free region 2 table */ + page = pfn_to_page(__pa(r2t) >> PAGE_SHIFT); + list_del(&page->lru); + __free_pages(page, 2); +} + +/** + * __gmap_unshadow_r1t - remove all entries from a shadow region-1 table + * @sg: pointer to the shadow guest address space structure + * @raddr: rmap address in the shadow guest address space + * @r1t: pointer to the start of a shadow region-1 table + * + * Called with the shadow->guest_table_lock + */ +static void __gmap_unshadow_r1t(struct gmap *sg, unsigned long raddr, + unsigned long *r1t) +{ + unsigned long asce, *r2t; + struct page *page; + int i; + + BUG_ON(!gmap_is_shadow(sg)); + asce = (unsigned long) r1t | _ASCE_TYPE_REGION1; + for (i = 0; i < 2048; i++, raddr += 1UL << 53) { + if (!(r1t[i] & _REGION_ENTRY_ORIGIN)) + continue; + r2t = (unsigned long *)(r1t[i] & _REGION_ENTRY_ORIGIN); + __gmap_unshadow_r2t(sg, raddr, r2t); + /* Clear entry and flush translation r1t -> r2t */ + gmap_idte_one(asce, raddr); + r1t[i] = _REGION1_ENTRY_EMPTY; + /* Free region 2 table */ + page = pfn_to_page(__pa(r2t) >> PAGE_SHIFT); + list_del(&page->lru); + __free_pages(page, 2); + } +} + +/** + * gmap_unshadow - remove a shadow page table completely + * @sg: pointer to the shadow guest address space structure + * + * Called with sg->guest_table_lock + */ +static void gmap_unshadow(struct gmap *sg) +{ + unsigned long *table; + + BUG_ON(!gmap_is_shadow(sg)); + if (sg->removed) + return; + sg->removed = 1; + gmap_call_notifier(sg, 0, -1UL); + gmap_flush_tlb(sg); + table = (unsigned long *)(sg->asce & _ASCE_ORIGIN); + switch (sg->asce & _ASCE_TYPE_MASK) { + case _ASCE_TYPE_REGION1: + __gmap_unshadow_r1t(sg, 0, table); + break; + case _ASCE_TYPE_REGION2: + __gmap_unshadow_r2t(sg, 0, table); + break; + case _ASCE_TYPE_REGION3: + __gmap_unshadow_r3t(sg, 0, table); + break; + case _ASCE_TYPE_SEGMENT: + __gmap_unshadow_sgt(sg, 0, table); + break; + } +} + +/** + * gmap_find_shadow - find a specific asce in the list of shadow tables + * @parent: pointer to the parent gmap + * @asce: ASCE for which the shadow table is created + * @edat_level: edat level to be used for the shadow translation + * + * Returns the pointer to a gmap if a shadow table with the given asce is + * already available, ERR_PTR(-EAGAIN) if another one is just being created, + * otherwise NULL + */ +static struct gmap *gmap_find_shadow(struct gmap *parent, unsigned long asce, + int edat_level) +{ + struct gmap *sg; + + list_for_each_entry(sg, &parent->children, list) { + if (sg->orig_asce != asce || sg->edat_level != edat_level || + sg->removed) + continue; + if (!sg->initialized) + return ERR_PTR(-EAGAIN); + atomic_inc(&sg->ref_count); + return sg; + } + return NULL; +} + +/** + * gmap_shadow_valid - check if a shadow guest address space matches the + * given properties and is still valid + * @sg: pointer to the shadow guest address space structure + * @asce: ASCE for which the shadow table is requested + * @edat_level: edat level to be used for the shadow translation + * + * Returns 1 if the gmap shadow is still valid and matches the given + * properties, the caller can continue using it. Returns 0 otherwise, the + * caller has to request a new shadow gmap in this case. + * + */ +int gmap_shadow_valid(struct gmap *sg, unsigned long asce, int edat_level) +{ + if (sg->removed) + return 0; + return sg->orig_asce == asce && sg->edat_level == edat_level; +} +EXPORT_SYMBOL_GPL(gmap_shadow_valid); + +/** + * gmap_shadow - create/find a shadow guest address space + * @parent: pointer to the parent gmap + * @asce: ASCE for which the shadow table is created + * @edat_level: edat level to be used for the shadow translation + * + * The pages of the top level page table referred by the asce parameter + * will be set to read-only and marked in the PGSTEs of the kvm process. + * The shadow table will be removed automatically on any change to the + * PTE mapping for the source table. + * + * Returns a guest address space structure, ERR_PTR(-ENOMEM) if out of memory, + * ERR_PTR(-EAGAIN) if the caller has to retry and ERR_PTR(-EFAULT) if the + * parent gmap table could not be protected. + */ +struct gmap *gmap_shadow(struct gmap *parent, unsigned long asce, + int edat_level) +{ + struct gmap *sg, *new; + unsigned long limit; + int rc; + + BUG_ON(gmap_is_shadow(parent)); + spin_lock(&parent->shadow_lock); + sg = gmap_find_shadow(parent, asce, edat_level); + spin_unlock(&parent->shadow_lock); + if (sg) + return sg; + /* Create a new shadow gmap */ + limit = -1UL >> (33 - (((asce & _ASCE_TYPE_MASK) >> 2) * 11)); + if (asce & _ASCE_REAL_SPACE) + limit = -1UL; + new = gmap_alloc(limit); + if (!new) + return ERR_PTR(-ENOMEM); + new->mm = parent->mm; + new->parent = gmap_get(parent); + new->orig_asce = asce; + new->edat_level = edat_level; + new->initialized = false; + spin_lock(&parent->shadow_lock); + /* Recheck if another CPU created the same shadow */ + sg = gmap_find_shadow(parent, asce, edat_level); + if (sg) { + spin_unlock(&parent->shadow_lock); + gmap_free(new); + return sg; + } + if (asce & _ASCE_REAL_SPACE) { + /* only allow one real-space gmap shadow */ + list_for_each_entry(sg, &parent->children, list) { + if (sg->orig_asce & _ASCE_REAL_SPACE) { + spin_lock(&sg->guest_table_lock); + gmap_unshadow(sg); + spin_unlock(&sg->guest_table_lock); + list_del(&sg->list); + gmap_put(sg); + break; + } + } + } + atomic_set(&new->ref_count, 2); + list_add(&new->list, &parent->children); + if (asce & _ASCE_REAL_SPACE) { + /* nothing to protect, return right away */ + new->initialized = true; + spin_unlock(&parent->shadow_lock); + return new; + } + spin_unlock(&parent->shadow_lock); + /* protect after insertion, so it will get properly invalidated */ + down_read(&parent->mm->mmap_sem); + rc = gmap_protect_range(parent, asce & _ASCE_ORIGIN, + ((asce & _ASCE_TABLE_LENGTH) + 1) * 4096, + PROT_READ, PGSTE_VSIE_BIT); + up_read(&parent->mm->mmap_sem); + spin_lock(&parent->shadow_lock); + new->initialized = true; + if (rc) { + list_del(&new->list); + gmap_free(new); + new = ERR_PTR(rc); + } + spin_unlock(&parent->shadow_lock); + return new; +} +EXPORT_SYMBOL_GPL(gmap_shadow); + +/** + * gmap_shadow_r2t - create an empty shadow region 2 table + * @sg: pointer to the shadow guest address space structure + * @saddr: faulting address in the shadow gmap + * @r2t: parent gmap address of the region 2 table to get shadowed + * @fake: r2t references contiguous guest memory block, not a r2t + * + * The r2t parameter specifies the address of the source table. The + * four pages of the source table are made read-only in the parent gmap + * address space. A write to the source table area @r2t will automatically + * remove the shadow r2 table and all of its decendents. + * + * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the + * shadow table structure is incomplete, -ENOMEM if out of memory and + * -EFAULT if an address in the parent gmap could not be resolved. + * + * Called with sg->mm->mmap_sem in read. + */ +int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t, + int fake) +{ + unsigned long raddr, origin, offset, len; + unsigned long *s_r2t, *table; + struct page *page; + int rc; + + BUG_ON(!gmap_is_shadow(sg)); + /* Allocate a shadow region second table */ + page = alloc_pages(GFP_KERNEL, 2); + if (!page) + return -ENOMEM; + page->index = r2t & _REGION_ENTRY_ORIGIN; + if (fake) + page->index |= GMAP_SHADOW_FAKE_TABLE; + s_r2t = (unsigned long *) page_to_phys(page); + /* Install shadow region second table */ + spin_lock(&sg->guest_table_lock); + table = gmap_table_walk(sg, saddr, 4); /* get region-1 pointer */ + if (!table) { + rc = -EAGAIN; /* Race with unshadow */ + goto out_free; + } + if (!(*table & _REGION_ENTRY_INVALID)) { + rc = 0; /* Already established */ + goto out_free; + } else if (*table & _REGION_ENTRY_ORIGIN) { + rc = -EAGAIN; /* Race with shadow */ + goto out_free; + } + crst_table_init(s_r2t, _REGION2_ENTRY_EMPTY); + /* mark as invalid as long as the parent table is not protected */ + *table = (unsigned long) s_r2t | _REGION_ENTRY_LENGTH | + _REGION_ENTRY_TYPE_R1 | _REGION_ENTRY_INVALID; + if (sg->edat_level >= 1) + *table |= (r2t & _REGION_ENTRY_PROTECT); + list_add(&page->lru, &sg->crst_list); + if (fake) { + /* nothing to protect for fake tables */ + *table &= ~_REGION_ENTRY_INVALID; + spin_unlock(&sg->guest_table_lock); + return 0; + } + spin_unlock(&sg->guest_table_lock); + /* Make r2t read-only in parent gmap page table */ + raddr = (saddr & 0xffe0000000000000UL) | _SHADOW_RMAP_REGION1; + origin = r2t & _REGION_ENTRY_ORIGIN; + offset = ((r2t & _REGION_ENTRY_OFFSET) >> 6) * 4096; + len = ((r2t & _REGION_ENTRY_LENGTH) + 1) * 4096 - offset; + rc = gmap_protect_rmap(sg, raddr, origin + offset, len, PROT_READ); + spin_lock(&sg->guest_table_lock); + if (!rc) { + table = gmap_table_walk(sg, saddr, 4); + if (!table || (*table & _REGION_ENTRY_ORIGIN) != + (unsigned long) s_r2t) + rc = -EAGAIN; /* Race with unshadow */ + else + *table &= ~_REGION_ENTRY_INVALID; + } else { + gmap_unshadow_r2t(sg, raddr); + } + spin_unlock(&sg->guest_table_lock); + return rc; +out_free: + spin_unlock(&sg->guest_table_lock); + __free_pages(page, 2); + return rc; +} +EXPORT_SYMBOL_GPL(gmap_shadow_r2t); + +/** + * gmap_shadow_r3t - create a shadow region 3 table + * @sg: pointer to the shadow guest address space structure + * @saddr: faulting address in the shadow gmap + * @r3t: parent gmap address of the region 3 table to get shadowed + * @fake: r3t references contiguous guest memory block, not a r3t + * + * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the + * shadow table structure is incomplete, -ENOMEM if out of memory and + * -EFAULT if an address in the parent gmap could not be resolved. + * + * Called with sg->mm->mmap_sem in read. + */ +int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t, + int fake) +{ + unsigned long raddr, origin, offset, len; + unsigned long *s_r3t, *table; + struct page *page; + int rc; + + BUG_ON(!gmap_is_shadow(sg)); + /* Allocate a shadow region second table */ + page = alloc_pages(GFP_KERNEL, 2); + if (!page) + return -ENOMEM; + page->index = r3t & _REGION_ENTRY_ORIGIN; + if (fake) + page->index |= GMAP_SHADOW_FAKE_TABLE; + s_r3t = (unsigned long *) page_to_phys(page); + /* Install shadow region second table */ + spin_lock(&sg->guest_table_lock); + table = gmap_table_walk(sg, saddr, 3); /* get region-2 pointer */ + if (!table) { + rc = -EAGAIN; /* Race with unshadow */ + goto out_free; + } + if (!(*table & _REGION_ENTRY_INVALID)) { + rc = 0; /* Already established */ + goto out_free; + } else if (*table & _REGION_ENTRY_ORIGIN) { + rc = -EAGAIN; /* Race with shadow */ + } + crst_table_init(s_r3t, _REGION3_ENTRY_EMPTY); + /* mark as invalid as long as the parent table is not protected */ + *table = (unsigned long) s_r3t | _REGION_ENTRY_LENGTH | + _REGION_ENTRY_TYPE_R2 | _REGION_ENTRY_INVALID; + if (sg->edat_level >= 1) + *table |= (r3t & _REGION_ENTRY_PROTECT); + list_add(&page->lru, &sg->crst_list); + if (fake) { + /* nothing to protect for fake tables */ + *table &= ~_REGION_ENTRY_INVALID; + spin_unlock(&sg->guest_table_lock); + return 0; + } + spin_unlock(&sg->guest_table_lock); + /* Make r3t read-only in parent gmap page table */ + raddr = (saddr & 0xfffffc0000000000UL) | _SHADOW_RMAP_REGION2; + origin = r3t & _REGION_ENTRY_ORIGIN; + offset = ((r3t & _REGION_ENTRY_OFFSET) >> 6) * 4096; + len = ((r3t & _REGION_ENTRY_LENGTH) + 1) * 4096 - offset; + rc = gmap_protect_rmap(sg, raddr, origin + offset, len, PROT_READ); + spin_lock(&sg->guest_table_lock); + if (!rc) { + table = gmap_table_walk(sg, saddr, 3); + if (!table || (*table & _REGION_ENTRY_ORIGIN) != + (unsigned long) s_r3t) + rc = -EAGAIN; /* Race with unshadow */ + else + *table &= ~_REGION_ENTRY_INVALID; + } else { + gmap_unshadow_r3t(sg, raddr); + } + spin_unlock(&sg->guest_table_lock); + return rc; +out_free: + spin_unlock(&sg->guest_table_lock); + __free_pages(page, 2); + return rc; +} +EXPORT_SYMBOL_GPL(gmap_shadow_r3t); + +/** + * gmap_shadow_sgt - create a shadow segment table + * @sg: pointer to the shadow guest address space structure + * @saddr: faulting address in the shadow gmap + * @sgt: parent gmap address of the segment table to get shadowed + * @fake: sgt references contiguous guest memory block, not a sgt + * + * Returns: 0 if successfully shadowed or already shadowed, -EAGAIN if the + * shadow table structure is incomplete, -ENOMEM if out of memory and + * -EFAULT if an address in the parent gmap could not be resolved. + * + * Called with sg->mm->mmap_sem in read. + */ +int gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt, + int fake) +{ + unsigned long raddr, origin, offset, len; + unsigned long *s_sgt, *table; + struct page *page; + int rc; + + BUG_ON(!gmap_is_shadow(sg) || (sgt & _REGION3_ENTRY_LARGE)); + /* Allocate a shadow segment table */ + page = alloc_pages(GFP_KERNEL, 2); + if (!page) + return -ENOMEM; + page->index = sgt & _REGION_ENTRY_ORIGIN; + if (fake) + page->index |= GMAP_SHADOW_FAKE_TABLE; + s_sgt = (unsigned long *) page_to_phys(page); + /* Install shadow region second table */ + spin_lock(&sg->guest_table_lock); + table = gmap_table_walk(sg, saddr, 2); /* get region-3 pointer */ + if (!table) { + rc = -EAGAIN; /* Race with unshadow */ + goto out_free; + } + if (!(*table & _REGION_ENTRY_INVALID)) { + rc = 0; /* Already established */ + goto out_free; + } else if (*table & _REGION_ENTRY_ORIGIN) { + rc = -EAGAIN; /* Race with shadow */ + goto out_free; + } + crst_table_init(s_sgt, _SEGMENT_ENTRY_EMPTY); + /* mark as invalid as long as the parent table is not protected */ + *table = (unsigned long) s_sgt | _REGION_ENTRY_LENGTH | + _REGION_ENTRY_TYPE_R3 | _REGION_ENTRY_INVALID; + if (sg->edat_level >= 1) + *table |= sgt & _REGION_ENTRY_PROTECT; + list_add(&page->lru, &sg->crst_list); + if (fake) { + /* nothing to protect for fake tables */ + *table &= ~_REGION_ENTRY_INVALID; + spin_unlock(&sg->guest_table_lock); + return 0; + } + spin_unlock(&sg->guest_table_lock); + /* Make sgt read-only in parent gmap page table */ + raddr = (saddr & 0xffffffff80000000UL) | _SHADOW_RMAP_REGION3; + origin = sgt & _REGION_ENTRY_ORIGIN; + offset = ((sgt & _REGION_ENTRY_OFFSET) >> 6) * 4096; + len = ((sgt & _REGION_ENTRY_LENGTH) + 1) * 4096 - offset; + rc = gmap_protect_rmap(sg, raddr, origin + offset, len, PROT_READ); + spin_lock(&sg->guest_table_lock); + if (!rc) { + table = gmap_table_walk(sg, saddr, 2); + if (!table || (*table & _REGION_ENTRY_ORIGIN) != + (unsigned long) s_sgt) + rc = -EAGAIN; /* Race with unshadow */ + else + *table &= ~_REGION_ENTRY_INVALID; + } else { + gmap_unshadow_sgt(sg, raddr); + } + spin_unlock(&sg->guest_table_lock); + return rc; +out_free: + spin_unlock(&sg->guest_table_lock); + __free_pages(page, 2); + return rc; +} +EXPORT_SYMBOL_GPL(gmap_shadow_sgt); + +/** + * gmap_shadow_lookup_pgtable - find a shadow page table + * @sg: pointer to the shadow guest address space structure + * @saddr: the address in the shadow aguest address space + * @pgt: parent gmap address of the page table to get shadowed + * @dat_protection: if the pgtable is marked as protected by dat + * @fake: pgt references contiguous guest memory block, not a pgtable + * + * Returns 0 if the shadow page table was found and -EAGAIN if the page + * table was not found. + * + * Called with sg->mm->mmap_sem in read. + */ +int gmap_shadow_pgt_lookup(struct gmap *sg, unsigned long saddr, + unsigned long *pgt, int *dat_protection, + int *fake) +{ + unsigned long *table; + struct page *page; + int rc; + + BUG_ON(!gmap_is_shadow(sg)); + spin_lock(&sg->guest_table_lock); + table = gmap_table_walk(sg, saddr, 1); /* get segment pointer */ + if (table && !(*table & _SEGMENT_ENTRY_INVALID)) { + /* Shadow page tables are full pages (pte+pgste) */ + page = pfn_to_page(*table >> PAGE_SHIFT); + *pgt = page->index & ~GMAP_SHADOW_FAKE_TABLE; + *dat_protection = !!(*table & _SEGMENT_ENTRY_PROTECT); + *fake = !!(page->index & GMAP_SHADOW_FAKE_TABLE); + rc = 0; + } else { + rc = -EAGAIN; + } + spin_unlock(&sg->guest_table_lock); + return rc; + +} +EXPORT_SYMBOL_GPL(gmap_shadow_pgt_lookup); + +/** + * gmap_shadow_pgt - instantiate a shadow page table + * @sg: pointer to the shadow guest address space structure + * @saddr: faulting address in the shadow gmap + * @pgt: parent gmap address of the page table to get shadowed + * @fake: pgt references contiguous guest memory block, not a pgtable + * + * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the + * shadow table structure is incomplete, -ENOMEM if out of memory, + * -EFAULT if an address in the parent gmap could not be resolved and + * + * Called with gmap->mm->mmap_sem in read + */ +int gmap_shadow_pgt(struct gmap *sg, unsigned long saddr, unsigned long pgt, + int fake) +{ + unsigned long raddr, origin; + unsigned long *s_pgt, *table; + struct page *page; + int rc; + + BUG_ON(!gmap_is_shadow(sg) || (pgt & _SEGMENT_ENTRY_LARGE)); + /* Allocate a shadow page table */ + page = page_table_alloc_pgste(sg->mm); + if (!page) + return -ENOMEM; + page->index = pgt & _SEGMENT_ENTRY_ORIGIN; + if (fake) + page->index |= GMAP_SHADOW_FAKE_TABLE; + s_pgt = (unsigned long *) page_to_phys(page); + /* Install shadow page table */ + spin_lock(&sg->guest_table_lock); + table = gmap_table_walk(sg, saddr, 1); /* get segment pointer */ + if (!table) { + rc = -EAGAIN; /* Race with unshadow */ + goto out_free; + } + if (!(*table & _SEGMENT_ENTRY_INVALID)) { + rc = 0; /* Already established */ + goto out_free; + } else if (*table & _SEGMENT_ENTRY_ORIGIN) { + rc = -EAGAIN; /* Race with shadow */ + goto out_free; + } + /* mark as invalid as long as the parent table is not protected */ + *table = (unsigned long) s_pgt | _SEGMENT_ENTRY | + (pgt & _SEGMENT_ENTRY_PROTECT) | _SEGMENT_ENTRY_INVALID; + list_add(&page->lru, &sg->pt_list); + if (fake) { + /* nothing to protect for fake tables */ + *table &= ~_SEGMENT_ENTRY_INVALID; + spin_unlock(&sg->guest_table_lock); + return 0; + } + spin_unlock(&sg->guest_table_lock); + /* Make pgt read-only in parent gmap page table (not the pgste) */ + raddr = (saddr & 0xfffffffffff00000UL) | _SHADOW_RMAP_SEGMENT; + origin = pgt & _SEGMENT_ENTRY_ORIGIN & PAGE_MASK; + rc = gmap_protect_rmap(sg, raddr, origin, PAGE_SIZE, PROT_READ); + spin_lock(&sg->guest_table_lock); + if (!rc) { + table = gmap_table_walk(sg, saddr, 1); + if (!table || (*table & _SEGMENT_ENTRY_ORIGIN) != + (unsigned long) s_pgt) + rc = -EAGAIN; /* Race with unshadow */ + else + *table &= ~_SEGMENT_ENTRY_INVALID; + } else { + gmap_unshadow_pgt(sg, raddr); + } + spin_unlock(&sg->guest_table_lock); + return rc; +out_free: + spin_unlock(&sg->guest_table_lock); + page_table_free_pgste(page); + return rc; + +} +EXPORT_SYMBOL_GPL(gmap_shadow_pgt); + +/** + * gmap_shadow_page - create a shadow page mapping + * @sg: pointer to the shadow guest address space structure + * @saddr: faulting address in the shadow gmap + * @pte: pte in parent gmap address space to get shadowed + * + * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the + * shadow table structure is incomplete, -ENOMEM if out of memory and + * -EFAULT if an address in the parent gmap could not be resolved. + * + * Called with sg->mm->mmap_sem in read. + */ +int gmap_shadow_page(struct gmap *sg, unsigned long saddr, pte_t pte) +{ + struct gmap *parent; + struct gmap_rmap *rmap; + unsigned long vmaddr, paddr; + spinlock_t *ptl; + pte_t *sptep, *tptep; + int prot; + int rc; + + BUG_ON(!gmap_is_shadow(sg)); + parent = sg->parent; + prot = (pte_val(pte) & _PAGE_PROTECT) ? PROT_READ : PROT_WRITE; + + rmap = kzalloc(sizeof(*rmap), GFP_KERNEL); + if (!rmap) + return -ENOMEM; + rmap->raddr = (saddr & PAGE_MASK) | _SHADOW_RMAP_PGTABLE; + + while (1) { + paddr = pte_val(pte) & PAGE_MASK; + vmaddr = __gmap_translate(parent, paddr); + if (IS_ERR_VALUE(vmaddr)) { + rc = vmaddr; + break; + } + rc = radix_tree_preload(GFP_KERNEL); + if (rc) + break; + rc = -EAGAIN; + sptep = gmap_pte_op_walk(parent, paddr, &ptl); + if (sptep) { + spin_lock(&sg->guest_table_lock); + /* Get page table pointer */ + tptep = (pte_t *) gmap_table_walk(sg, saddr, 0); + if (!tptep) { + spin_unlock(&sg->guest_table_lock); + gmap_pte_op_end(ptl); + radix_tree_preload_end(); + break; + } + rc = ptep_shadow_pte(sg->mm, saddr, sptep, tptep, pte); + if (rc > 0) { + /* Success and a new mapping */ + gmap_insert_rmap(sg, vmaddr, rmap); + rmap = NULL; + rc = 0; + } + gmap_pte_op_end(ptl); + spin_unlock(&sg->guest_table_lock); + } + radix_tree_preload_end(); + if (!rc) + break; + rc = gmap_pte_op_fixup(parent, paddr, vmaddr, prot); + if (rc) + break; + } + kfree(rmap); + return rc; +} +EXPORT_SYMBOL_GPL(gmap_shadow_page); + +/** + * gmap_shadow_notify - handle notifications for shadow gmap + * + * Called with sg->parent->shadow_lock. + */ +static void gmap_shadow_notify(struct gmap *sg, unsigned long vmaddr, + unsigned long offset, pte_t *pte) +{ + struct gmap_rmap *rmap, *rnext, *head; + unsigned long gaddr, start, end, bits, raddr; + unsigned long *table; + + BUG_ON(!gmap_is_shadow(sg)); + spin_lock(&sg->parent->guest_table_lock); + table = radix_tree_lookup(&sg->parent->host_to_guest, + vmaddr >> PMD_SHIFT); + gaddr = table ? __gmap_segment_gaddr(table) + offset : 0; + spin_unlock(&sg->parent->guest_table_lock); + if (!table) + return; + + spin_lock(&sg->guest_table_lock); + if (sg->removed) { + spin_unlock(&sg->guest_table_lock); + return; + } + /* Check for top level table */ + start = sg->orig_asce & _ASCE_ORIGIN; + end = start + ((sg->orig_asce & _ASCE_TABLE_LENGTH) + 1) * 4096; + if (!(sg->orig_asce & _ASCE_REAL_SPACE) && gaddr >= start && + gaddr < end) { + /* The complete shadow table has to go */ + gmap_unshadow(sg); + spin_unlock(&sg->guest_table_lock); + list_del(&sg->list); + gmap_put(sg); + return; + } + /* Remove the page table tree from on specific entry */ + head = radix_tree_delete(&sg->host_to_rmap, vmaddr >> 12); + gmap_for_each_rmap_safe(rmap, rnext, head) { + bits = rmap->raddr & _SHADOW_RMAP_MASK; + raddr = rmap->raddr ^ bits; + switch (bits) { + case _SHADOW_RMAP_REGION1: + gmap_unshadow_r2t(sg, raddr); + break; + case _SHADOW_RMAP_REGION2: + gmap_unshadow_r3t(sg, raddr); + break; + case _SHADOW_RMAP_REGION3: + gmap_unshadow_sgt(sg, raddr); + break; + case _SHADOW_RMAP_SEGMENT: + gmap_unshadow_pgt(sg, raddr); + break; + case _SHADOW_RMAP_PGTABLE: + gmap_unshadow_page(sg, raddr); + break; + } + kfree(rmap); + } + spin_unlock(&sg->guest_table_lock); +} + +/** + * ptep_notify - call all invalidation callbacks for a specific pte. + * @mm: pointer to the process mm_struct + * @addr: virtual address in the process address space + * @pte: pointer to the page table entry + * @bits: bits from the pgste that caused the notify call + * + * This function is assumed to be called with the page table lock held + * for the pte to notify. + */ +void ptep_notify(struct mm_struct *mm, unsigned long vmaddr, + pte_t *pte, unsigned long bits) +{ + unsigned long offset, gaddr; + unsigned long *table; + struct gmap *gmap, *sg, *next; + + offset = ((unsigned long) pte) & (255 * sizeof(pte_t)); + offset = offset * (4096 / sizeof(pte_t)); + rcu_read_lock(); + list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) { + if (!list_empty(&gmap->children) && (bits & PGSTE_VSIE_BIT)) { + spin_lock(&gmap->shadow_lock); + list_for_each_entry_safe(sg, next, + &gmap->children, list) + gmap_shadow_notify(sg, vmaddr, offset, pte); + spin_unlock(&gmap->shadow_lock); + } + if (!(bits & PGSTE_IN_BIT)) + continue; + spin_lock(&gmap->guest_table_lock); + table = radix_tree_lookup(&gmap->host_to_guest, + vmaddr >> PMD_SHIFT); + if (table) + gaddr = __gmap_segment_gaddr(table) + offset; + spin_unlock(&gmap->guest_table_lock); + if (table) + gmap_call_notifier(gmap, gaddr, gaddr + PAGE_SIZE - 1); + } + rcu_read_unlock(); } EXPORT_SYMBOL_GPL(ptep_notify); diff --git a/arch/s390/mm/pgalloc.c b/arch/s390/mm/pgalloc.c index e8b5962ac12ab8035797829f6940b9c1c7cfe227..9c57a295a045b985bf81e2f374e1b750d5986ae1 100644 --- a/arch/s390/mm/pgalloc.c +++ b/arch/s390/mm/pgalloc.c @@ -137,6 +137,29 @@ static inline unsigned int atomic_xor_bits(atomic_t *v, unsigned int bits) return new; } +#ifdef CONFIG_PGSTE + +struct page *page_table_alloc_pgste(struct mm_struct *mm) +{ + struct page *page; + unsigned long *table; + + page = alloc_page(GFP_KERNEL|__GFP_REPEAT); + if (page) { + table = (unsigned long *) page_to_phys(page); + clear_table(table, _PAGE_INVALID, PAGE_SIZE/2); + clear_table(table + PTRS_PER_PTE, 0, PAGE_SIZE/2); + } + return page; +} + +void page_table_free_pgste(struct page *page) +{ + __free_page(page); +} + +#endif /* CONFIG_PGSTE */ + /* * page table entry allocation/free routines. */ @@ -149,7 +172,7 @@ unsigned long *page_table_alloc(struct mm_struct *mm) /* Try to get a fragment of a 4K page as a 2K page table */ if (!mm_alloc_pgste(mm)) { table = NULL; - spin_lock_bh(&mm->context.list_lock); + spin_lock_bh(&mm->context.pgtable_lock); if (!list_empty(&mm->context.pgtable_list)) { page = list_first_entry(&mm->context.pgtable_list, struct page, lru); @@ -164,7 +187,7 @@ unsigned long *page_table_alloc(struct mm_struct *mm) list_del(&page->lru); } } - spin_unlock_bh(&mm->context.list_lock); + spin_unlock_bh(&mm->context.pgtable_lock); if (table) return table; } @@ -187,9 +210,9 @@ unsigned long *page_table_alloc(struct mm_struct *mm) /* Return the first 2K fragment of the page */ atomic_set(&page->_mapcount, 1); clear_table(table, _PAGE_INVALID, PAGE_SIZE); - spin_lock_bh(&mm->context.list_lock); + spin_lock_bh(&mm->context.pgtable_lock); list_add(&page->lru, &mm->context.pgtable_list); - spin_unlock_bh(&mm->context.list_lock); + spin_unlock_bh(&mm->context.pgtable_lock); } return table; } @@ -203,13 +226,13 @@ void page_table_free(struct mm_struct *mm, unsigned long *table) if (!mm_alloc_pgste(mm)) { /* Free 2K page table fragment of a 4K page */ bit = (__pa(table) & ~PAGE_MASK)/(PTRS_PER_PTE*sizeof(pte_t)); - spin_lock_bh(&mm->context.list_lock); + spin_lock_bh(&mm->context.pgtable_lock); mask = atomic_xor_bits(&page->_mapcount, 1U << bit); if (mask & 3) list_add(&page->lru, &mm->context.pgtable_list); else list_del(&page->lru); - spin_unlock_bh(&mm->context.list_lock); + spin_unlock_bh(&mm->context.pgtable_lock); if (mask != 0) return; } @@ -235,13 +258,13 @@ void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table, return; } bit = (__pa(table) & ~PAGE_MASK) / (PTRS_PER_PTE*sizeof(pte_t)); - spin_lock_bh(&mm->context.list_lock); + spin_lock_bh(&mm->context.pgtable_lock); mask = atomic_xor_bits(&page->_mapcount, 0x11U << bit); if (mask & 3) list_add_tail(&page->lru, &mm->context.pgtable_list); else list_del(&page->lru); - spin_unlock_bh(&mm->context.list_lock); + spin_unlock_bh(&mm->context.pgtable_lock); table = (unsigned long *) (__pa(table) | (1U << bit)); tlb_remove_table(tlb, table); } diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c index fa286d0c0f2da3ba2648543246b8a469bfdc458f..293130b5aee763342f209ae4be2121328f841836 100644 --- a/arch/s390/mm/pgtable.c +++ b/arch/s390/mm/pgtable.c @@ -179,14 +179,17 @@ static inline pgste_t pgste_set_pte(pte_t *ptep, pgste_t pgste, pte_t entry) return pgste; } -static inline pgste_t pgste_ipte_notify(struct mm_struct *mm, - unsigned long addr, - pte_t *ptep, pgste_t pgste) +static inline pgste_t pgste_pte_notify(struct mm_struct *mm, + unsigned long addr, + pte_t *ptep, pgste_t pgste) { #ifdef CONFIG_PGSTE - if (pgste_val(pgste) & PGSTE_IN_BIT) { - pgste_val(pgste) &= ~PGSTE_IN_BIT; - ptep_notify(mm, addr, ptep); + unsigned long bits; + + bits = pgste_val(pgste) & (PGSTE_IN_BIT | PGSTE_VSIE_BIT); + if (bits) { + pgste_val(pgste) ^= bits; + ptep_notify(mm, addr, ptep, bits); } #endif return pgste; @@ -199,7 +202,7 @@ static inline pgste_t ptep_xchg_start(struct mm_struct *mm, if (mm_has_pgste(mm)) { pgste = pgste_get_lock(ptep); - pgste = pgste_ipte_notify(mm, addr, ptep, pgste); + pgste = pgste_pte_notify(mm, addr, ptep, pgste); } return pgste; } @@ -414,6 +417,90 @@ void ptep_set_notify(struct mm_struct *mm, unsigned long addr, pte_t *ptep) pgste_set_unlock(ptep, pgste); } +/** + * ptep_force_prot - change access rights of a locked pte + * @mm: pointer to the process mm_struct + * @addr: virtual address in the guest address space + * @ptep: pointer to the page table entry + * @prot: indicates guest access rights: PROT_NONE, PROT_READ or PROT_WRITE + * @bit: pgste bit to set (e.g. for notification) + * + * Returns 0 if the access rights were changed and -EAGAIN if the current + * and requested access rights are incompatible. + */ +int ptep_force_prot(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, int prot, unsigned long bit) +{ + pte_t entry; + pgste_t pgste; + int pte_i, pte_p; + + pgste = pgste_get_lock(ptep); + entry = *ptep; + /* Check pte entry after all locks have been acquired */ + pte_i = pte_val(entry) & _PAGE_INVALID; + pte_p = pte_val(entry) & _PAGE_PROTECT; + if ((pte_i && (prot != PROT_NONE)) || + (pte_p && (prot & PROT_WRITE))) { + pgste_set_unlock(ptep, pgste); + return -EAGAIN; + } + /* Change access rights and set pgste bit */ + if (prot == PROT_NONE && !pte_i) { + ptep_flush_direct(mm, addr, ptep); + pgste = pgste_update_all(entry, pgste, mm); + pte_val(entry) |= _PAGE_INVALID; + } + if (prot == PROT_READ && !pte_p) { + ptep_flush_direct(mm, addr, ptep); + pte_val(entry) &= ~_PAGE_INVALID; + pte_val(entry) |= _PAGE_PROTECT; + } + pgste_val(pgste) |= bit; + pgste = pgste_set_pte(ptep, pgste, entry); + pgste_set_unlock(ptep, pgste); + return 0; +} + +int ptep_shadow_pte(struct mm_struct *mm, unsigned long saddr, + pte_t *sptep, pte_t *tptep, pte_t pte) +{ + pgste_t spgste, tpgste; + pte_t spte, tpte; + int rc = -EAGAIN; + + if (!(pte_val(*tptep) & _PAGE_INVALID)) + return 0; /* already shadowed */ + spgste = pgste_get_lock(sptep); + spte = *sptep; + if (!(pte_val(spte) & _PAGE_INVALID) && + !((pte_val(spte) & _PAGE_PROTECT) && + !(pte_val(pte) & _PAGE_PROTECT))) { + pgste_val(spgste) |= PGSTE_VSIE_BIT; + tpgste = pgste_get_lock(tptep); + pte_val(tpte) = (pte_val(spte) & PAGE_MASK) | + (pte_val(pte) & _PAGE_PROTECT); + /* don't touch the storage key - it belongs to parent pgste */ + tpgste = pgste_set_pte(tptep, tpgste, tpte); + pgste_set_unlock(tptep, tpgste); + rc = 1; + } + pgste_set_unlock(sptep, spgste); + return rc; +} + +void ptep_unshadow_pte(struct mm_struct *mm, unsigned long saddr, pte_t *ptep) +{ + pgste_t pgste; + + pgste = pgste_get_lock(ptep); + /* notifier is called by the caller */ + ptep_flush_direct(mm, saddr, ptep); + /* don't touch the storage key - it belongs to parent pgste */ + pgste = pgste_set_pte(ptep, pgste, __pte(_PAGE_INVALID)); + pgste_set_unlock(ptep, pgste); +} + static void ptep_zap_swap_entry(struct mm_struct *mm, swp_entry_t entry) { if (!non_swap_entry(entry)) @@ -483,7 +570,7 @@ bool test_and_clear_guest_dirty(struct mm_struct *mm, unsigned long addr) pgste_val(pgste) &= ~PGSTE_UC_BIT; pte = *ptep; if (dirty && (pte_val(pte) & _PAGE_PRESENT)) { - pgste = pgste_ipte_notify(mm, addr, ptep, pgste); + pgste = pgste_pte_notify(mm, addr, ptep, pgste); __ptep_ipte(addr, ptep); if (MACHINE_HAS_ESOP || !(pte_val(pte) & _PAGE_WRITE)) pte_val(pte) |= _PAGE_PROTECT; diff --git a/include/linux/page_ref.h b/include/linux/page_ref.h index 8b5e0a9f2431a7b833bdb836600fe512dfbe0651..610e1327191849bf6ef5564ec1d2ddef690a0123 100644 --- a/include/linux/page_ref.h +++ b/include/linux/page_ref.h @@ -124,6 +124,15 @@ static inline int page_ref_sub_and_test(struct page *page, int nr) return ret; } +static inline int page_ref_inc_return(struct page *page) +{ + int ret = atomic_inc_return(&page->_refcount); + + if (page_ref_tracepoint_active(__tracepoint_page_ref_mod_and_return)) + __page_ref_mod_and_return(page, 1, ret); + return ret; +} + static inline int page_ref_dec_and_test(struct page *page) { int ret = atomic_dec_and_test(&page->_refcount);