diff --git a/arch/s390/include/asm/gmap.h b/arch/s390/include/asm/gmap.h new file mode 100644 index 0000000000000000000000000000000000000000..d054c1b07a3c3988d2d0c1bf195f7b404ded5bef --- /dev/null +++ b/arch/s390/include/asm/gmap.h @@ -0,0 +1,64 @@ +/* + * KVM guest address space mapping code + * + * Copyright IBM Corp. 2007, 2016 + * Author(s): Martin Schwidefsky + */ + +#ifndef _ASM_S390_GMAP_H +#define _ASM_S390_GMAP_H + +/** + * struct gmap_struct - guest address space + * @crst_list: list of all crst tables used in the guest address space + * @mm: pointer to the parent mm_struct + * @guest_to_host: radix tree with guest to host address translation + * @host_to_guest: radix tree with pointer to segment table entries + * @guest_table_lock: spinlock to protect all entries in the guest page table + * @table: pointer to the page directory + * @asce: address space control element for gmap page table + * @pfault_enabled: defines if pfaults are applicable for the guest + */ +struct gmap { + struct list_head list; + struct list_head crst_list; + struct mm_struct *mm; + struct radix_tree_root guest_to_host; + struct radix_tree_root host_to_guest; + spinlock_t guest_table_lock; + unsigned long *table; + unsigned long asce; + unsigned long asce_end; + void *private; + bool pfault_enabled; +}; + +/** + * struct gmap_notifier - notify function block for page invalidation + * @notifier_call: address of callback function + */ +struct gmap_notifier { + struct list_head list; + void (*notifier_call)(struct gmap *gmap, unsigned long gaddr); +}; + +struct gmap *gmap_alloc(struct mm_struct *mm, unsigned long limit); +void gmap_free(struct gmap *gmap); +void gmap_enable(struct gmap *gmap); +void gmap_disable(struct gmap *gmap); +int gmap_map_segment(struct gmap *gmap, unsigned long from, + unsigned long to, unsigned long len); +int gmap_unmap_segment(struct gmap *gmap, unsigned long to, unsigned long len); +unsigned long __gmap_translate(struct gmap *, unsigned long gaddr); +unsigned long gmap_translate(struct gmap *, unsigned long gaddr); +int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr); +int gmap_fault(struct gmap *, unsigned long gaddr, unsigned int fault_flags); +void gmap_discard(struct gmap *, unsigned long from, unsigned long to); +void __gmap_zap(struct gmap *, unsigned long gaddr); +void gmap_unlink(struct mm_struct *, unsigned long *table, unsigned long vmaddr); + +void gmap_register_ipte_notifier(struct gmap_notifier *); +void gmap_unregister_ipte_notifier(struct gmap_notifier *); +int gmap_ipte_notify(struct gmap *, unsigned long start, unsigned long len); + +#endif /* _ASM_S390_GMAP_H */ diff --git a/arch/s390/include/asm/pgalloc.h b/arch/s390/include/asm/pgalloc.h index 7b7858f158b4574b549ab99648f977a3e249068c..92487193706ccd036f03f86632b05ef7e1e91f19 100644 --- a/arch/s390/include/asm/pgalloc.h +++ b/arch/s390/include/asm/pgalloc.h @@ -23,10 +23,6 @@ void page_table_free(struct mm_struct *, unsigned long *); void page_table_free_rcu(struct mmu_gather *, unsigned long *, unsigned long); extern int page_table_allocate_pgste; -int set_guest_storage_key(struct mm_struct *mm, unsigned long addr, - unsigned long key, bool nq); -unsigned long get_guest_storage_key(struct mm_struct *mm, unsigned long addr); - static inline void clear_table(unsigned long *s, unsigned long val, size_t n) { typedef struct { char _[n]; } addrtype; diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index 572001c8913d4d37a1487fb8816dc3eea12fa374..2f66645587a2a134d9d2a304ae6508fedfc1fd8a 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -622,111 +622,6 @@ static inline pmd_t pmd_clear_soft_dirty(pmd_t pmd) return pmd; } -static inline pgste_t pgste_get_lock(pte_t *ptep) -{ - unsigned long new = 0; -#ifdef CONFIG_PGSTE - unsigned long old; - - preempt_disable(); - asm( - " lg %0,%2\n" - "0: lgr %1,%0\n" - " nihh %0,0xff7f\n" /* clear PCL bit in old */ - " oihh %1,0x0080\n" /* set PCL bit in new */ - " csg %0,%1,%2\n" - " jl 0b\n" - : "=&d" (old), "=&d" (new), "=Q" (ptep[PTRS_PER_PTE]) - : "Q" (ptep[PTRS_PER_PTE]) : "cc", "memory"); -#endif - return __pgste(new); -} - -static inline void pgste_set_unlock(pte_t *ptep, pgste_t pgste) -{ -#ifdef CONFIG_PGSTE - asm( - " nihh %1,0xff7f\n" /* clear PCL bit */ - " stg %1,%0\n" - : "=Q" (ptep[PTRS_PER_PTE]) - : "d" (pgste_val(pgste)), "Q" (ptep[PTRS_PER_PTE]) - : "cc", "memory"); - preempt_enable(); -#endif -} - -static inline pgste_t pgste_get(pte_t *ptep) -{ - unsigned long pgste = 0; -#ifdef CONFIG_PGSTE - pgste = *(unsigned long *)(ptep + PTRS_PER_PTE); -#endif - return __pgste(pgste); -} - -static inline void pgste_set(pte_t *ptep, pgste_t pgste) -{ -#ifdef CONFIG_PGSTE - *(pgste_t *)(ptep + PTRS_PER_PTE) = pgste; -#endif -} - -bool pgste_test_and_clear_dirty(struct mm_struct *, unsigned long address); -void ptep_ipte_notify(struct mm_struct *, unsigned long addr, pte_t *); - -/** - * struct gmap_struct - guest address space - * @crst_list: list of all crst tables used in the guest address space - * @mm: pointer to the parent mm_struct - * @guest_to_host: radix tree with guest to host address translation - * @host_to_guest: radix tree with pointer to segment table entries - * @guest_table_lock: spinlock to protect all entries in the guest page table - * @table: pointer to the page directory - * @asce: address space control element for gmap page table - * @pfault_enabled: defines if pfaults are applicable for the guest - */ -struct gmap { - struct list_head list; - struct list_head crst_list; - struct mm_struct *mm; - struct radix_tree_root guest_to_host; - struct radix_tree_root host_to_guest; - spinlock_t guest_table_lock; - unsigned long *table; - unsigned long asce; - unsigned long asce_end; - void *private; - bool pfault_enabled; -}; - -/** - * struct gmap_notifier - notify function block for page invalidation - * @notifier_call: address of callback function - */ -struct gmap_notifier { - struct list_head list; - void (*notifier_call)(struct gmap *gmap, unsigned long gaddr); -}; - -struct gmap *gmap_alloc(struct mm_struct *mm, unsigned long limit); -void gmap_free(struct gmap *gmap); -void gmap_enable(struct gmap *gmap); -void gmap_disable(struct gmap *gmap); -int gmap_map_segment(struct gmap *gmap, unsigned long from, - unsigned long to, unsigned long len); -int gmap_unmap_segment(struct gmap *gmap, unsigned long to, unsigned long len); -unsigned long __gmap_translate(struct gmap *, unsigned long gaddr); -unsigned long gmap_translate(struct gmap *, unsigned long gaddr); -int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr); -int gmap_fault(struct gmap *, unsigned long gaddr, unsigned int fault_flags); -void gmap_discard(struct gmap *, unsigned long from, unsigned long to); -void __gmap_zap(struct gmap *, unsigned long gaddr); - - -void gmap_register_ipte_notifier(struct gmap_notifier *); -void gmap_unregister_ipte_notifier(struct gmap_notifier *); -int gmap_ipte_notify(struct gmap *, unsigned long start, unsigned long len); - /* * query functions pte_write/pte_dirty/pte_young only work if * pte_present() is true. Undefined behaviour if not.. @@ -984,7 +879,21 @@ static inline int ptep_set_access_flags(struct vm_area_struct *vma, return 1; } -void set_pte_pgste_at(struct mm_struct *, unsigned long, pte_t *, pte_t); +/* + * Additional functions to handle KVM guest page tables + */ +void ptep_set_pte_at(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t entry); +void ptep_set_notify(struct mm_struct *mm, unsigned long addr, pte_t *ptep); +void ptep_notify(struct mm_struct *mm, unsigned long addr, pte_t *ptep); +void ptep_zap_unused(struct mm_struct *mm, unsigned long addr, + pte_t *ptep , int reset); +void ptep_zap_key(struct mm_struct *mm, unsigned long addr, pte_t *ptep); + +bool test_and_clear_guest_dirty(struct mm_struct *mm, unsigned long address); +int set_guest_storage_key(struct mm_struct *mm, unsigned long addr, + unsigned char key, bool nq); +unsigned char get_guest_storage_key(struct mm_struct *mm, unsigned long addr); /* * Certain architectures need to do special things when PTEs @@ -995,7 +904,7 @@ static inline void set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t entry) { if (mm_has_pgste(mm)) - set_pte_pgste_at(mm, addr, ptep, entry); + ptep_set_pte_at(mm, addr, ptep, entry); else *ptep = entry; } diff --git a/arch/s390/kernel/asm-offsets.c b/arch/s390/kernel/asm-offsets.c index 53bbc9e8b281fa9ae60f94ea81ce111d3249482d..1f95cc1faeb7624031b32c282953cc05cc001c3c 100644 --- a/arch/s390/kernel/asm-offsets.c +++ b/arch/s390/kernel/asm-offsets.c @@ -12,6 +12,7 @@ #include #include #include +#include /* * Make sure that the compiler is new enough. We want a compiler that diff --git a/arch/s390/kvm/diag.c b/arch/s390/kvm/diag.c index 05f7de9869a9f1786a8b996edd10fcdb280bd7c0..1ea4095b67d729fb67c01b4be81ce8bb7efe4bcb 100644 --- a/arch/s390/kvm/diag.c +++ b/arch/s390/kvm/diag.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include "kvm-s390.h" #include "trace.h" diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index f88ca72c3a77a52e05e65cfa79206cd8e41aa786..e5e8739dcde3d64dcbebca95e7e2d29cd2752a69 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -23,6 +23,7 @@ #include #include #include +#include #include "kvm-s390.h" #include "gaccess.h" #include "trace-s390.h" diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 616e0a16ee886e83a7a16b86179f933a24cd0241..be1f0288443edae18396a79eb2353ce9c99854d9 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include #include @@ -280,7 +281,7 @@ static void kvm_s390_sync_dirty_log(struct kvm *kvm, for (cur_gfn = memslot->base_gfn; cur_gfn <= last_gfn; cur_gfn++) { address = gfn_to_hva_memslot(memslot, cur_gfn); - if (pgste_test_and_clear_dirty(gmap->mm, address)) + if (test_and_clear_guest_dirty(gmap->mm, address)) mark_page_dirty(kvm, cur_gfn); } up_read(&gmap->mm->mmap_sem); diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c index ed74e86d9b9eb60c88b2f91e671c9d9b9c56f316..b632d8dda9adb791f6133fc2a88400b8ae92ea12 100644 --- a/arch/s390/kvm/priv.c +++ b/arch/s390/kvm/priv.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/s390/mm/Makefile b/arch/s390/mm/Makefile index 839592ca265cdb620ddbe1c0505e9d9bbe4d428c..2ae54cad2b6ab2734f999694bbb968b940416d00 100644 --- a/arch/s390/mm/Makefile +++ b/arch/s390/mm/Makefile @@ -2,9 +2,11 @@ # Makefile for the linux s390-specific parts of the memory manager. # -obj-y := init.o fault.o extmem.o mmap.o vmem.o pgtable.o maccess.o +obj-y := init.o fault.o extmem.o mmap.o vmem.o maccess.o obj-y += page-states.o gup.o extable.o pageattr.o mem_detect.o +obj-y += pgtable.o pgalloc.o obj-$(CONFIG_CMM) += cmm.o obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o obj-$(CONFIG_S390_PTDUMP) += dump_pagetables.o +obj-$(CONFIG_PGSTE) += gmap.o diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c index 64b3ad1d657593feffe0b91bcfe1f2526a66e4bf..cce577feab1e9998f5e1b29e6011e70e8db075fe 100644 --- a/arch/s390/mm/fault.c +++ b/arch/s390/mm/fault.c @@ -32,6 +32,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c new file mode 100644 index 0000000000000000000000000000000000000000..69247b4dcc43ab94a3ea7427c5f3998c7696ca86 --- /dev/null +++ b/arch/s390/mm/gmap.c @@ -0,0 +1,774 @@ +/* + * KVM guest address space mapping code + * + * Copyright IBM Corp. 2007, 2016 + * Author(s): Martin Schwidefsky + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +/** + * gmap_alloc - allocate a guest address space + * @mm: pointer to the parent mm_struct + * @limit: maximum size of the gmap address space + * + * Returns a guest address space structure. + */ +struct gmap *gmap_alloc(struct mm_struct *mm, unsigned long limit) +{ + struct gmap *gmap; + struct page *page; + unsigned long *table; + unsigned long etype, atype; + + if (limit < (1UL << 31)) { + limit = (1UL << 31) - 1; + atype = _ASCE_TYPE_SEGMENT; + etype = _SEGMENT_ENTRY_EMPTY; + } else if (limit < (1UL << 42)) { + limit = (1UL << 42) - 1; + atype = _ASCE_TYPE_REGION3; + etype = _REGION3_ENTRY_EMPTY; + } else if (limit < (1UL << 53)) { + limit = (1UL << 53) - 1; + atype = _ASCE_TYPE_REGION2; + etype = _REGION2_ENTRY_EMPTY; + } else { + limit = -1UL; + atype = _ASCE_TYPE_REGION1; + etype = _REGION1_ENTRY_EMPTY; + } + gmap = kzalloc(sizeof(struct gmap), GFP_KERNEL); + if (!gmap) + goto out; + INIT_LIST_HEAD(&gmap->crst_list); + INIT_RADIX_TREE(&gmap->guest_to_host, GFP_KERNEL); + INIT_RADIX_TREE(&gmap->host_to_guest, GFP_ATOMIC); + spin_lock_init(&gmap->guest_table_lock); + gmap->mm = mm; + page = alloc_pages(GFP_KERNEL, 2); + if (!page) + goto out_free; + page->index = 0; + list_add(&page->lru, &gmap->crst_list); + table = (unsigned long *) page_to_phys(page); + crst_table_init(table, etype); + gmap->table = table; + gmap->asce = atype | _ASCE_TABLE_LENGTH | + _ASCE_USER_BITS | __pa(table); + gmap->asce_end = limit; + down_write(&mm->mmap_sem); + list_add(&gmap->list, &mm->context.gmap_list); + up_write(&mm->mmap_sem); + return gmap; + +out_free: + kfree(gmap); +out: + return NULL; +} +EXPORT_SYMBOL_GPL(gmap_alloc); + +static void gmap_flush_tlb(struct gmap *gmap) +{ + if (MACHINE_HAS_IDTE) + __tlb_flush_asce(gmap->mm, gmap->asce); + else + __tlb_flush_global(); +} + +static void gmap_radix_tree_free(struct radix_tree_root *root) +{ + struct radix_tree_iter iter; + unsigned long indices[16]; + unsigned long index; + void **slot; + int i, nr; + + /* A radix tree is freed by deleting all of its entries */ + index = 0; + do { + nr = 0; + radix_tree_for_each_slot(slot, root, &iter, index) { + indices[nr] = iter.index; + if (++nr == 16) + break; + } + for (i = 0; i < nr; i++) { + index = indices[i]; + radix_tree_delete(root, index); + } + } while (nr > 0); +} + +/** + * gmap_free - free a guest address space + * @gmap: pointer to the guest address space structure + */ +void gmap_free(struct gmap *gmap) +{ + struct page *page, *next; + + /* Flush tlb. */ + if (MACHINE_HAS_IDTE) + __tlb_flush_asce(gmap->mm, gmap->asce); + else + __tlb_flush_global(); + + /* Free all segment & region tables. */ + list_for_each_entry_safe(page, next, &gmap->crst_list, lru) + __free_pages(page, 2); + gmap_radix_tree_free(&gmap->guest_to_host); + gmap_radix_tree_free(&gmap->host_to_guest); + down_write(&gmap->mm->mmap_sem); + list_del(&gmap->list); + up_write(&gmap->mm->mmap_sem); + kfree(gmap); +} +EXPORT_SYMBOL_GPL(gmap_free); + +/** + * gmap_enable - switch primary space to the guest address space + * @gmap: pointer to the guest address space structure + */ +void gmap_enable(struct gmap *gmap) +{ + S390_lowcore.gmap = (unsigned long) gmap; +} +EXPORT_SYMBOL_GPL(gmap_enable); + +/** + * gmap_disable - switch back to the standard primary address space + * @gmap: pointer to the guest address space structure + */ +void gmap_disable(struct gmap *gmap) +{ + S390_lowcore.gmap = 0UL; +} +EXPORT_SYMBOL_GPL(gmap_disable); + +/* + * gmap_alloc_table is assumed to be called with mmap_sem held + */ +static int gmap_alloc_table(struct gmap *gmap, unsigned long *table, + unsigned long init, unsigned long gaddr) +{ + struct page *page; + unsigned long *new; + + /* since we dont free the gmap table until gmap_free we can unlock */ + page = alloc_pages(GFP_KERNEL, 2); + if (!page) + return -ENOMEM; + new = (unsigned long *) page_to_phys(page); + crst_table_init(new, init); + spin_lock(&gmap->mm->page_table_lock); + if (*table & _REGION_ENTRY_INVALID) { + list_add(&page->lru, &gmap->crst_list); + *table = (unsigned long) new | _REGION_ENTRY_LENGTH | + (*table & _REGION_ENTRY_TYPE_MASK); + page->index = gaddr; + page = NULL; + } + spin_unlock(&gmap->mm->page_table_lock); + if (page) + __free_pages(page, 2); + return 0; +} + +/** + * __gmap_segment_gaddr - find virtual address from segment pointer + * @entry: pointer to a segment table entry in the guest address space + * + * Returns the virtual address in the guest address space for the segment + */ +static unsigned long __gmap_segment_gaddr(unsigned long *entry) +{ + struct page *page; + unsigned long offset, mask; + + offset = (unsigned long) entry / sizeof(unsigned long); + offset = (offset & (PTRS_PER_PMD - 1)) * PMD_SIZE; + mask = ~(PTRS_PER_PMD * sizeof(pmd_t) - 1); + page = virt_to_page((void *)((unsigned long) entry & mask)); + return page->index + offset; +} + +/** + * __gmap_unlink_by_vmaddr - unlink a single segment via a host address + * @gmap: pointer to the guest address space structure + * @vmaddr: address in the host process address space + * + * Returns 1 if a TLB flush is required + */ +static int __gmap_unlink_by_vmaddr(struct gmap *gmap, unsigned long vmaddr) +{ + unsigned long *entry; + int flush = 0; + + spin_lock(&gmap->guest_table_lock); + entry = radix_tree_delete(&gmap->host_to_guest, vmaddr >> PMD_SHIFT); + if (entry) { + flush = (*entry != _SEGMENT_ENTRY_INVALID); + *entry = _SEGMENT_ENTRY_INVALID; + } + spin_unlock(&gmap->guest_table_lock); + return flush; +} + +/** + * __gmap_unmap_by_gaddr - unmap a single segment via a guest address + * @gmap: pointer to the guest address space structure + * @gaddr: address in the guest address space + * + * Returns 1 if a TLB flush is required + */ +static int __gmap_unmap_by_gaddr(struct gmap *gmap, unsigned long gaddr) +{ + unsigned long vmaddr; + + vmaddr = (unsigned long) radix_tree_delete(&gmap->guest_to_host, + gaddr >> PMD_SHIFT); + return vmaddr ? __gmap_unlink_by_vmaddr(gmap, vmaddr) : 0; +} + +/** + * gmap_unmap_segment - unmap segment from the guest address space + * @gmap: pointer to the guest address space structure + * @to: address in the guest address space + * @len: length of the memory area to unmap + * + * Returns 0 if the unmap succeeded, -EINVAL if not. + */ +int gmap_unmap_segment(struct gmap *gmap, unsigned long to, unsigned long len) +{ + unsigned long off; + int flush; + + if ((to | len) & (PMD_SIZE - 1)) + return -EINVAL; + if (len == 0 || to + len < to) + return -EINVAL; + + flush = 0; + down_write(&gmap->mm->mmap_sem); + for (off = 0; off < len; off += PMD_SIZE) + flush |= __gmap_unmap_by_gaddr(gmap, to + off); + up_write(&gmap->mm->mmap_sem); + if (flush) + gmap_flush_tlb(gmap); + return 0; +} +EXPORT_SYMBOL_GPL(gmap_unmap_segment); + +/** + * gmap_map_segment - map a segment to the guest address space + * @gmap: pointer to the guest address space structure + * @from: source address in the parent address space + * @to: target address in the guest address space + * @len: length of the memory area to map + * + * Returns 0 if the mmap succeeded, -EINVAL or -ENOMEM if not. + */ +int gmap_map_segment(struct gmap *gmap, unsigned long from, + unsigned long to, unsigned long len) +{ + unsigned long off; + int flush; + + if ((from | to | len) & (PMD_SIZE - 1)) + return -EINVAL; + if (len == 0 || from + len < from || to + len < to || + from + len > TASK_MAX_SIZE || to + len > gmap->asce_end) + return -EINVAL; + + flush = 0; + down_write(&gmap->mm->mmap_sem); + for (off = 0; off < len; off += PMD_SIZE) { + /* Remove old translation */ + flush |= __gmap_unmap_by_gaddr(gmap, to + off); + /* Store new translation */ + if (radix_tree_insert(&gmap->guest_to_host, + (to + off) >> PMD_SHIFT, + (void *) from + off)) + break; + } + up_write(&gmap->mm->mmap_sem); + if (flush) + gmap_flush_tlb(gmap); + if (off >= len) + return 0; + gmap_unmap_segment(gmap, to, len); + return -ENOMEM; +} +EXPORT_SYMBOL_GPL(gmap_map_segment); + +/** + * __gmap_translate - translate a guest address to a user space address + * @gmap: pointer to guest mapping meta data structure + * @gaddr: guest address + * + * Returns user space address which corresponds to the guest address or + * -EFAULT if no such mapping exists. + * This function does not establish potentially missing page table entries. + * The mmap_sem of the mm that belongs to the address space must be held + * when this function gets called. + */ +unsigned long __gmap_translate(struct gmap *gmap, unsigned long gaddr) +{ + unsigned long vmaddr; + + vmaddr = (unsigned long) + radix_tree_lookup(&gmap->guest_to_host, gaddr >> PMD_SHIFT); + return vmaddr ? (vmaddr | (gaddr & ~PMD_MASK)) : -EFAULT; +} +EXPORT_SYMBOL_GPL(__gmap_translate); + +/** + * gmap_translate - translate a guest address to a user space address + * @gmap: pointer to guest mapping meta data structure + * @gaddr: guest address + * + * Returns user space address which corresponds to the guest address or + * -EFAULT if no such mapping exists. + * This function does not establish potentially missing page table entries. + */ +unsigned long gmap_translate(struct gmap *gmap, unsigned long gaddr) +{ + unsigned long rc; + + down_read(&gmap->mm->mmap_sem); + rc = __gmap_translate(gmap, gaddr); + up_read(&gmap->mm->mmap_sem); + return rc; +} +EXPORT_SYMBOL_GPL(gmap_translate); + +/** + * gmap_unlink - disconnect a page table from the gmap shadow tables + * @gmap: pointer to guest mapping meta data structure + * @table: pointer to the host page table + * @vmaddr: vm address associated with the host page table + */ +void gmap_unlink(struct mm_struct *mm, unsigned long *table, + unsigned long vmaddr) +{ + struct gmap *gmap; + int flush; + + list_for_each_entry(gmap, &mm->context.gmap_list, list) { + flush = __gmap_unlink_by_vmaddr(gmap, vmaddr); + if (flush) + gmap_flush_tlb(gmap); + } +} + +/** + * gmap_link - set up shadow page tables to connect a host to a guest address + * @gmap: pointer to guest mapping meta data structure + * @gaddr: guest address + * @vmaddr: vm address + * + * Returns 0 on success, -ENOMEM for out of memory conditions, and -EFAULT + * if the vm address is already mapped to a different guest segment. + * The mmap_sem of the mm that belongs to the address space must be held + * when this function gets called. + */ +int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr) +{ + struct mm_struct *mm; + unsigned long *table; + spinlock_t *ptl; + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + int rc; + + /* Create higher level tables in the gmap page table */ + table = gmap->table; + if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION1) { + table += (gaddr >> 53) & 0x7ff; + if ((*table & _REGION_ENTRY_INVALID) && + gmap_alloc_table(gmap, table, _REGION2_ENTRY_EMPTY, + gaddr & 0xffe0000000000000UL)) + return -ENOMEM; + table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); + } + if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION2) { + table += (gaddr >> 42) & 0x7ff; + if ((*table & _REGION_ENTRY_INVALID) && + gmap_alloc_table(gmap, table, _REGION3_ENTRY_EMPTY, + gaddr & 0xfffffc0000000000UL)) + return -ENOMEM; + table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); + } + if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION3) { + table += (gaddr >> 31) & 0x7ff; + if ((*table & _REGION_ENTRY_INVALID) && + gmap_alloc_table(gmap, table, _SEGMENT_ENTRY_EMPTY, + gaddr & 0xffffffff80000000UL)) + return -ENOMEM; + table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); + } + table += (gaddr >> 20) & 0x7ff; + /* Walk the parent mm page table */ + mm = gmap->mm; + pgd = pgd_offset(mm, vmaddr); + VM_BUG_ON(pgd_none(*pgd)); + pud = pud_offset(pgd, vmaddr); + VM_BUG_ON(pud_none(*pud)); + pmd = pmd_offset(pud, vmaddr); + VM_BUG_ON(pmd_none(*pmd)); + /* large pmds cannot yet be handled */ + if (pmd_large(*pmd)) + return -EFAULT; + /* Link gmap segment table entry location to page table. */ + rc = radix_tree_preload(GFP_KERNEL); + if (rc) + return rc; + ptl = pmd_lock(mm, pmd); + spin_lock(&gmap->guest_table_lock); + if (*table == _SEGMENT_ENTRY_INVALID) { + rc = radix_tree_insert(&gmap->host_to_guest, + vmaddr >> PMD_SHIFT, table); + if (!rc) + *table = pmd_val(*pmd); + } else + rc = 0; + spin_unlock(&gmap->guest_table_lock); + spin_unlock(ptl); + radix_tree_preload_end(); + return rc; +} + +/** + * gmap_fault - resolve a fault on a guest address + * @gmap: pointer to guest mapping meta data structure + * @gaddr: guest address + * @fault_flags: flags to pass down to handle_mm_fault() + * + * Returns 0 on success, -ENOMEM for out of memory conditions, and -EFAULT + * if the vm address is already mapped to a different guest segment. + */ +int gmap_fault(struct gmap *gmap, unsigned long gaddr, + unsigned int fault_flags) +{ + unsigned long vmaddr; + int rc; + bool unlocked; + + down_read(&gmap->mm->mmap_sem); + +retry: + unlocked = false; + vmaddr = __gmap_translate(gmap, gaddr); + if (IS_ERR_VALUE(vmaddr)) { + rc = vmaddr; + goto out_up; + } + if (fixup_user_fault(current, gmap->mm, vmaddr, fault_flags, + &unlocked)) { + rc = -EFAULT; + goto out_up; + } + /* + * In the case that fixup_user_fault unlocked the mmap_sem during + * faultin redo __gmap_translate to not race with a map/unmap_segment. + */ + if (unlocked) + goto retry; + + rc = __gmap_link(gmap, gaddr, vmaddr); +out_up: + up_read(&gmap->mm->mmap_sem); + return rc; +} +EXPORT_SYMBOL_GPL(gmap_fault); + +/* + * this function is assumed to be called with mmap_sem held + */ +void __gmap_zap(struct gmap *gmap, unsigned long gaddr) +{ + unsigned long vmaddr; + spinlock_t *ptl; + pte_t *ptep; + + /* Find the vm address for the guest address */ + vmaddr = (unsigned long) radix_tree_lookup(&gmap->guest_to_host, + gaddr >> PMD_SHIFT); + if (vmaddr) { + vmaddr |= gaddr & ~PMD_MASK; + /* Get pointer to the page table entry */ + ptep = get_locked_pte(gmap->mm, vmaddr, &ptl); + if (likely(ptep)) + ptep_zap_unused(gmap->mm, vmaddr, ptep, 0); + pte_unmap_unlock(ptep, ptl); + } +} +EXPORT_SYMBOL_GPL(__gmap_zap); + +void gmap_discard(struct gmap *gmap, unsigned long from, unsigned long to) +{ + unsigned long gaddr, vmaddr, size; + struct vm_area_struct *vma; + + down_read(&gmap->mm->mmap_sem); + for (gaddr = from; gaddr < to; + gaddr = (gaddr + PMD_SIZE) & PMD_MASK) { + /* Find the vm address for the guest address */ + vmaddr = (unsigned long) + radix_tree_lookup(&gmap->guest_to_host, + gaddr >> PMD_SHIFT); + if (!vmaddr) + continue; + vmaddr |= gaddr & ~PMD_MASK; + /* Find vma in the parent mm */ + vma = find_vma(gmap->mm, vmaddr); + size = min(to - gaddr, PMD_SIZE - (gaddr & ~PMD_MASK)); + zap_page_range(vma, vmaddr, size, NULL); + } + up_read(&gmap->mm->mmap_sem); +} +EXPORT_SYMBOL_GPL(gmap_discard); + +static LIST_HEAD(gmap_notifier_list); +static DEFINE_SPINLOCK(gmap_notifier_lock); + +/** + * gmap_register_ipte_notifier - register a pte invalidation callback + * @nb: pointer to the gmap notifier block + */ +void gmap_register_ipte_notifier(struct gmap_notifier *nb) +{ + spin_lock(&gmap_notifier_lock); + list_add(&nb->list, &gmap_notifier_list); + spin_unlock(&gmap_notifier_lock); +} +EXPORT_SYMBOL_GPL(gmap_register_ipte_notifier); + +/** + * gmap_unregister_ipte_notifier - remove a pte invalidation callback + * @nb: pointer to the gmap notifier block + */ +void gmap_unregister_ipte_notifier(struct gmap_notifier *nb) +{ + spin_lock(&gmap_notifier_lock); + list_del_init(&nb->list); + spin_unlock(&gmap_notifier_lock); +} +EXPORT_SYMBOL_GPL(gmap_unregister_ipte_notifier); + +/** + * gmap_ipte_notify - mark a range of ptes for invalidation notification + * @gmap: pointer to guest mapping meta data structure + * @gaddr: virtual address in the guest address space + * @len: size of area + * + * Returns 0 if for each page in the given range a gmap mapping exists and + * the invalidation notification could be set. If the gmap mapping is missing + * for one or more pages -EFAULT is returned. If no memory could be allocated + * -ENOMEM is returned. This function establishes missing page table entries. + */ +int gmap_ipte_notify(struct gmap *gmap, unsigned long gaddr, unsigned long len) +{ + unsigned long addr; + spinlock_t *ptl; + pte_t *ptep; + bool unlocked; + int rc = 0; + + if ((gaddr & ~PAGE_MASK) || (len & ~PAGE_MASK)) + return -EINVAL; + down_read(&gmap->mm->mmap_sem); + while (len) { + unlocked = false; + /* Convert gmap address and connect the page tables */ + addr = __gmap_translate(gmap, gaddr); + if (IS_ERR_VALUE(addr)) { + rc = addr; + break; + } + /* Get the page mapped */ + if (fixup_user_fault(current, gmap->mm, addr, FAULT_FLAG_WRITE, + &unlocked)) { + rc = -EFAULT; + break; + } + /* While trying to map mmap_sem got unlocked. Let us retry */ + if (unlocked) + continue; + rc = __gmap_link(gmap, gaddr, addr); + if (rc) + break; + /* Walk the process page table, lock and get pte pointer */ + ptep = get_locked_pte(gmap->mm, addr, &ptl); + VM_BUG_ON(!ptep); + /* Set notification bit in the pgste of the pte */ + if ((pte_val(*ptep) & (_PAGE_INVALID | _PAGE_PROTECT)) == 0) { + ptep_set_notify(gmap->mm, addr, ptep); + gaddr += PAGE_SIZE; + len -= PAGE_SIZE; + } + pte_unmap_unlock(ptep, ptl); + } + up_read(&gmap->mm->mmap_sem); + return rc; +} +EXPORT_SYMBOL_GPL(gmap_ipte_notify); + +/** + * ptep_notify - call all invalidation callbacks for a specific pte. + * @mm: pointer to the process mm_struct + * @addr: virtual address in the process address space + * @pte: pointer to the page table entry + * + * This function is assumed to be called with the page table lock held + * for the pte to notify. + */ +void ptep_notify(struct mm_struct *mm, unsigned long vmaddr, pte_t *pte) +{ + unsigned long offset, gaddr; + unsigned long *table; + struct gmap_notifier *nb; + struct gmap *gmap; + + offset = ((unsigned long) pte) & (255 * sizeof(pte_t)); + offset = offset * (4096 / sizeof(pte_t)); + spin_lock(&gmap_notifier_lock); + list_for_each_entry(gmap, &mm->context.gmap_list, list) { + table = radix_tree_lookup(&gmap->host_to_guest, + vmaddr >> PMD_SHIFT); + if (!table) + continue; + gaddr = __gmap_segment_gaddr(table) + offset; + list_for_each_entry(nb, &gmap_notifier_list, list) + nb->notifier_call(gmap, gaddr); + } + spin_unlock(&gmap_notifier_lock); +} +EXPORT_SYMBOL_GPL(ptep_notify); + +static inline void thp_split_mm(struct mm_struct *mm) +{ +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + struct vm_area_struct *vma; + unsigned long addr; + + for (vma = mm->mmap; vma != NULL; vma = vma->vm_next) { + for (addr = vma->vm_start; + addr < vma->vm_end; + addr += PAGE_SIZE) + follow_page(vma, addr, FOLL_SPLIT); + vma->vm_flags &= ~VM_HUGEPAGE; + vma->vm_flags |= VM_NOHUGEPAGE; + } + mm->def_flags |= VM_NOHUGEPAGE; +#endif +} + +/* + * switch on pgstes for its userspace process (for kvm) + */ +int s390_enable_sie(void) +{ + struct mm_struct *mm = current->mm; + + /* Do we have pgstes? if yes, we are done */ + if (mm_has_pgste(mm)) + return 0; + /* Fail if the page tables are 2K */ + if (!mm_alloc_pgste(mm)) + return -EINVAL; + down_write(&mm->mmap_sem); + mm->context.has_pgste = 1; + /* split thp mappings and disable thp for future mappings */ + thp_split_mm(mm); + up_write(&mm->mmap_sem); + return 0; +} +EXPORT_SYMBOL_GPL(s390_enable_sie); + +/* + * Enable storage key handling from now on and initialize the storage + * keys with the default key. + */ +static int __s390_enable_skey(pte_t *pte, unsigned long addr, + unsigned long next, struct mm_walk *walk) +{ + /* + * Remove all zero page mappings, + * after establishing a policy to forbid zero page mappings + * following faults for that page will get fresh anonymous pages + */ + if (is_zero_pfn(pte_pfn(*pte))) + ptep_xchg_direct(walk->mm, addr, pte, __pte(_PAGE_INVALID)); + /* Clear storage key */ + ptep_zap_key(walk->mm, addr, pte); + return 0; +} + +int s390_enable_skey(void) +{ + struct mm_walk walk = { .pte_entry = __s390_enable_skey }; + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma; + int rc = 0; + + down_write(&mm->mmap_sem); + if (mm_use_skey(mm)) + goto out_up; + + mm->context.use_skey = 1; + for (vma = mm->mmap; vma; vma = vma->vm_next) { + if (ksm_madvise(vma, vma->vm_start, vma->vm_end, + MADV_UNMERGEABLE, &vma->vm_flags)) { + mm->context.use_skey = 0; + rc = -ENOMEM; + goto out_up; + } + } + mm->def_flags &= ~VM_MERGEABLE; + + walk.mm = mm; + walk_page_range(0, TASK_SIZE, &walk); + +out_up: + up_write(&mm->mmap_sem); + return rc; +} +EXPORT_SYMBOL_GPL(s390_enable_skey); + +/* + * Reset CMMA state, make all pages stable again. + */ +static int __s390_reset_cmma(pte_t *pte, unsigned long addr, + unsigned long next, struct mm_walk *walk) +{ + ptep_zap_unused(walk->mm, addr, pte, 1); + return 0; +} + +void s390_reset_cmma(struct mm_struct *mm) +{ + struct mm_walk walk = { .pte_entry = __s390_reset_cmma }; + + down_write(&mm->mmap_sem); + walk.mm = mm; + walk_page_range(0, TASK_SIZE, &walk); + up_write(&mm->mmap_sem); +} +EXPORT_SYMBOL_GPL(s390_reset_cmma); diff --git a/arch/s390/mm/pgalloc.c b/arch/s390/mm/pgalloc.c new file mode 100644 index 0000000000000000000000000000000000000000..f6c3de26cda85629350350c4d33069c53e9713c4 --- /dev/null +++ b/arch/s390/mm/pgalloc.c @@ -0,0 +1,360 @@ +/* + * Page table allocation functions + * + * Copyright IBM Corp. 2016 + * Author(s): Martin Schwidefsky + */ + +#include +#include +#include +#include +#include +#include +#include + +#ifdef CONFIG_PGSTE + +static int page_table_allocate_pgste_min = 0; +static int page_table_allocate_pgste_max = 1; +int page_table_allocate_pgste = 0; +EXPORT_SYMBOL(page_table_allocate_pgste); + +static struct ctl_table page_table_sysctl[] = { + { + .procname = "allocate_pgste", + .data = &page_table_allocate_pgste, + .maxlen = sizeof(int), + .mode = S_IRUGO | S_IWUSR, + .proc_handler = proc_dointvec, + .extra1 = &page_table_allocate_pgste_min, + .extra2 = &page_table_allocate_pgste_max, + }, + { } +}; + +static struct ctl_table page_table_sysctl_dir[] = { + { + .procname = "vm", + .maxlen = 0, + .mode = 0555, + .child = page_table_sysctl, + }, + { } +}; + +static int __init page_table_register_sysctl(void) +{ + return register_sysctl_table(page_table_sysctl_dir) ? 0 : -ENOMEM; +} +__initcall(page_table_register_sysctl); + +#endif /* CONFIG_PGSTE */ + +unsigned long *crst_table_alloc(struct mm_struct *mm) +{ + struct page *page = alloc_pages(GFP_KERNEL, 2); + + if (!page) + return NULL; + return (unsigned long *) page_to_phys(page); +} + +void crst_table_free(struct mm_struct *mm, unsigned long *table) +{ + free_pages((unsigned long) table, 2); +} + +static void __crst_table_upgrade(void *arg) +{ + struct mm_struct *mm = arg; + + if (current->active_mm == mm) { + clear_user_asce(); + set_user_asce(mm); + } + __tlb_flush_local(); +} + +int crst_table_upgrade(struct mm_struct *mm, unsigned long limit) +{ + unsigned long *table, *pgd; + unsigned long entry; + int flush; + + BUG_ON(limit > TASK_MAX_SIZE); + flush = 0; +repeat: + table = crst_table_alloc(mm); + if (!table) + return -ENOMEM; + spin_lock_bh(&mm->page_table_lock); + if (mm->context.asce_limit < limit) { + pgd = (unsigned long *) mm->pgd; + if (mm->context.asce_limit <= (1UL << 31)) { + entry = _REGION3_ENTRY_EMPTY; + mm->context.asce_limit = 1UL << 42; + mm->context.asce_bits = _ASCE_TABLE_LENGTH | + _ASCE_USER_BITS | + _ASCE_TYPE_REGION3; + } else { + entry = _REGION2_ENTRY_EMPTY; + mm->context.asce_limit = 1UL << 53; + mm->context.asce_bits = _ASCE_TABLE_LENGTH | + _ASCE_USER_BITS | + _ASCE_TYPE_REGION2; + } + crst_table_init(table, entry); + pgd_populate(mm, (pgd_t *) table, (pud_t *) pgd); + mm->pgd = (pgd_t *) table; + mm->task_size = mm->context.asce_limit; + table = NULL; + flush = 1; + } + spin_unlock_bh(&mm->page_table_lock); + if (table) + crst_table_free(mm, table); + if (mm->context.asce_limit < limit) + goto repeat; + if (flush) + on_each_cpu(__crst_table_upgrade, mm, 0); + return 0; +} + +void crst_table_downgrade(struct mm_struct *mm, unsigned long limit) +{ + pgd_t *pgd; + + if (current->active_mm == mm) { + clear_user_asce(); + __tlb_flush_mm(mm); + } + while (mm->context.asce_limit > limit) { + pgd = mm->pgd; + switch (pgd_val(*pgd) & _REGION_ENTRY_TYPE_MASK) { + case _REGION_ENTRY_TYPE_R2: + mm->context.asce_limit = 1UL << 42; + mm->context.asce_bits = _ASCE_TABLE_LENGTH | + _ASCE_USER_BITS | + _ASCE_TYPE_REGION3; + break; + case _REGION_ENTRY_TYPE_R3: + mm->context.asce_limit = 1UL << 31; + mm->context.asce_bits = _ASCE_TABLE_LENGTH | + _ASCE_USER_BITS | + _ASCE_TYPE_SEGMENT; + break; + default: + BUG(); + } + mm->pgd = (pgd_t *) (pgd_val(*pgd) & _REGION_ENTRY_ORIGIN); + mm->task_size = mm->context.asce_limit; + crst_table_free(mm, (unsigned long *) pgd); + } + if (current->active_mm == mm) + set_user_asce(mm); +} + +static inline unsigned int atomic_xor_bits(atomic_t *v, unsigned int bits) +{ + unsigned int old, new; + + do { + old = atomic_read(v); + new = old ^ bits; + } while (atomic_cmpxchg(v, old, new) != old); + return new; +} + +/* + * page table entry allocation/free routines. + */ +unsigned long *page_table_alloc(struct mm_struct *mm) +{ + unsigned long *table; + struct page *page; + unsigned int mask, bit; + + /* Try to get a fragment of a 4K page as a 2K page table */ + if (!mm_alloc_pgste(mm)) { + table = NULL; + spin_lock_bh(&mm->context.list_lock); + if (!list_empty(&mm->context.pgtable_list)) { + page = list_first_entry(&mm->context.pgtable_list, + struct page, lru); + mask = atomic_read(&page->_mapcount); + mask = (mask | (mask >> 4)) & 3; + if (mask != 3) { + table = (unsigned long *) page_to_phys(page); + bit = mask & 1; /* =1 -> second 2K */ + if (bit) + table += PTRS_PER_PTE; + atomic_xor_bits(&page->_mapcount, 1U << bit); + list_del(&page->lru); + } + } + spin_unlock_bh(&mm->context.list_lock); + if (table) + return table; + } + /* Allocate a fresh page */ + page = alloc_page(GFP_KERNEL|__GFP_REPEAT); + if (!page) + return NULL; + if (!pgtable_page_ctor(page)) { + __free_page(page); + return NULL; + } + /* Initialize page table */ + table = (unsigned long *) page_to_phys(page); + if (mm_alloc_pgste(mm)) { + /* Return 4K page table with PGSTEs */ + atomic_set(&page->_mapcount, 3); + clear_table(table, _PAGE_INVALID, PAGE_SIZE/2); + clear_table(table + PTRS_PER_PTE, 0, PAGE_SIZE/2); + } else { + /* Return the first 2K fragment of the page */ + atomic_set(&page->_mapcount, 1); + clear_table(table, _PAGE_INVALID, PAGE_SIZE); + spin_lock_bh(&mm->context.list_lock); + list_add(&page->lru, &mm->context.pgtable_list); + spin_unlock_bh(&mm->context.list_lock); + } + return table; +} + +void page_table_free(struct mm_struct *mm, unsigned long *table) +{ + struct page *page; + unsigned int bit, mask; + + page = pfn_to_page(__pa(table) >> PAGE_SHIFT); + if (!mm_alloc_pgste(mm)) { + /* Free 2K page table fragment of a 4K page */ + bit = (__pa(table) & ~PAGE_MASK)/(PTRS_PER_PTE*sizeof(pte_t)); + spin_lock_bh(&mm->context.list_lock); + mask = atomic_xor_bits(&page->_mapcount, 1U << bit); + if (mask & 3) + list_add(&page->lru, &mm->context.pgtable_list); + else + list_del(&page->lru); + spin_unlock_bh(&mm->context.list_lock); + if (mask != 0) + return; + } + + pgtable_page_dtor(page); + atomic_set(&page->_mapcount, -1); + __free_page(page); +} + +void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table, + unsigned long vmaddr) +{ + struct mm_struct *mm; + struct page *page; + unsigned int bit, mask; + + mm = tlb->mm; + page = pfn_to_page(__pa(table) >> PAGE_SHIFT); + if (mm_alloc_pgste(mm)) { + gmap_unlink(mm, table, vmaddr); + table = (unsigned long *) (__pa(table) | 3); + tlb_remove_table(tlb, table); + return; + } + bit = (__pa(table) & ~PAGE_MASK) / (PTRS_PER_PTE*sizeof(pte_t)); + spin_lock_bh(&mm->context.list_lock); + mask = atomic_xor_bits(&page->_mapcount, 0x11U << bit); + if (mask & 3) + list_add_tail(&page->lru, &mm->context.pgtable_list); + else + list_del(&page->lru); + spin_unlock_bh(&mm->context.list_lock); + table = (unsigned long *) (__pa(table) | (1U << bit)); + tlb_remove_table(tlb, table); +} + +static void __tlb_remove_table(void *_table) +{ + unsigned int mask = (unsigned long) _table & 3; + void *table = (void *)((unsigned long) _table ^ mask); + struct page *page = pfn_to_page(__pa(table) >> PAGE_SHIFT); + + switch (mask) { + case 0: /* pmd or pud */ + free_pages((unsigned long) table, 2); + break; + case 1: /* lower 2K of a 4K page table */ + case 2: /* higher 2K of a 4K page table */ + if (atomic_xor_bits(&page->_mapcount, mask << 4) != 0) + break; + /* fallthrough */ + case 3: /* 4K page table with pgstes */ + pgtable_page_dtor(page); + atomic_set(&page->_mapcount, -1); + __free_page(page); + break; + } +} + +static void tlb_remove_table_smp_sync(void *arg) +{ + /* Simply deliver the interrupt */ +} + +static void tlb_remove_table_one(void *table) +{ + /* + * This isn't an RCU grace period and hence the page-tables cannot be + * assumed to be actually RCU-freed. + * + * It is however sufficient for software page-table walkers that rely + * on IRQ disabling. See the comment near struct mmu_table_batch. + */ + smp_call_function(tlb_remove_table_smp_sync, NULL, 1); + __tlb_remove_table(table); +} + +static void tlb_remove_table_rcu(struct rcu_head *head) +{ + struct mmu_table_batch *batch; + int i; + + batch = container_of(head, struct mmu_table_batch, rcu); + + for (i = 0; i < batch->nr; i++) + __tlb_remove_table(batch->tables[i]); + + free_page((unsigned long)batch); +} + +void tlb_table_flush(struct mmu_gather *tlb) +{ + struct mmu_table_batch **batch = &tlb->batch; + + if (*batch) { + call_rcu_sched(&(*batch)->rcu, tlb_remove_table_rcu); + *batch = NULL; + } +} + +void tlb_remove_table(struct mmu_gather *tlb, void *table) +{ + struct mmu_table_batch **batch = &tlb->batch; + + tlb->mm->context.flush_mm = 1; + if (*batch == NULL) { + *batch = (struct mmu_table_batch *) + __get_free_page(GFP_NOWAIT | __GFP_NOWARN); + if (*batch == NULL) { + __tlb_flush_mm_lazy(tlb->mm); + tlb_remove_table_one(table); + return; + } + (*batch)->nr = 0; + } + (*batch)->tables[(*batch)->nr++] = table; + if ((*batch)->nr == MAX_TABLE_BATCH) + tlb_flush_mmu(tlb); +} diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c index e24126208614ba063b46e234227bc9e2577ad6a8..4324b87f93982f08552ecb86bb9c893431507427 100644 --- a/arch/s390/mm/pgtable.c +++ b/arch/s390/mm/pgtable.c @@ -24,1140 +24,6 @@ #include #include -unsigned long *crst_table_alloc(struct mm_struct *mm) -{ - struct page *page = alloc_pages(GFP_KERNEL, 2); - - if (!page) - return NULL; - return (unsigned long *) page_to_phys(page); -} - -void crst_table_free(struct mm_struct *mm, unsigned long *table) -{ - free_pages((unsigned long) table, 2); -} - -static void __crst_table_upgrade(void *arg) -{ - struct mm_struct *mm = arg; - - if (current->active_mm == mm) { - clear_user_asce(); - set_user_asce(mm); - } - __tlb_flush_local(); -} - -int crst_table_upgrade(struct mm_struct *mm, unsigned long limit) -{ - unsigned long *table, *pgd; - unsigned long entry; - int flush; - - BUG_ON(limit > TASK_MAX_SIZE); - flush = 0; -repeat: - table = crst_table_alloc(mm); - if (!table) - return -ENOMEM; - spin_lock_bh(&mm->page_table_lock); - if (mm->context.asce_limit < limit) { - pgd = (unsigned long *) mm->pgd; - if (mm->context.asce_limit <= (1UL << 31)) { - entry = _REGION3_ENTRY_EMPTY; - mm->context.asce_limit = 1UL << 42; - mm->context.asce_bits = _ASCE_TABLE_LENGTH | - _ASCE_USER_BITS | - _ASCE_TYPE_REGION3; - } else { - entry = _REGION2_ENTRY_EMPTY; - mm->context.asce_limit = 1UL << 53; - mm->context.asce_bits = _ASCE_TABLE_LENGTH | - _ASCE_USER_BITS | - _ASCE_TYPE_REGION2; - } - crst_table_init(table, entry); - pgd_populate(mm, (pgd_t *) table, (pud_t *) pgd); - mm->pgd = (pgd_t *) table; - mm->task_size = mm->context.asce_limit; - table = NULL; - flush = 1; - } - spin_unlock_bh(&mm->page_table_lock); - if (table) - crst_table_free(mm, table); - if (mm->context.asce_limit < limit) - goto repeat; - if (flush) - on_each_cpu(__crst_table_upgrade, mm, 0); - return 0; -} - -void crst_table_downgrade(struct mm_struct *mm, unsigned long limit) -{ - pgd_t *pgd; - - if (current->active_mm == mm) { - clear_user_asce(); - __tlb_flush_mm(mm); - } - while (mm->context.asce_limit > limit) { - pgd = mm->pgd; - switch (pgd_val(*pgd) & _REGION_ENTRY_TYPE_MASK) { - case _REGION_ENTRY_TYPE_R2: - mm->context.asce_limit = 1UL << 42; - mm->context.asce_bits = _ASCE_TABLE_LENGTH | - _ASCE_USER_BITS | - _ASCE_TYPE_REGION3; - break; - case _REGION_ENTRY_TYPE_R3: - mm->context.asce_limit = 1UL << 31; - mm->context.asce_bits = _ASCE_TABLE_LENGTH | - _ASCE_USER_BITS | - _ASCE_TYPE_SEGMENT; - break; - default: - BUG(); - } - mm->pgd = (pgd_t *) (pgd_val(*pgd) & _REGION_ENTRY_ORIGIN); - mm->task_size = mm->context.asce_limit; - crst_table_free(mm, (unsigned long *) pgd); - } - if (current->active_mm == mm) - set_user_asce(mm); -} - -#ifdef CONFIG_PGSTE - -/** - * gmap_alloc - allocate a guest address space - * @mm: pointer to the parent mm_struct - * @limit: maximum address of the gmap address space - * - * Returns a guest address space structure. - */ -struct gmap *gmap_alloc(struct mm_struct *mm, unsigned long limit) -{ - struct gmap *gmap; - struct page *page; - unsigned long *table; - unsigned long etype, atype; - - if (limit < (1UL << 31)) { - limit = (1UL << 31) - 1; - atype = _ASCE_TYPE_SEGMENT; - etype = _SEGMENT_ENTRY_EMPTY; - } else if (limit < (1UL << 42)) { - limit = (1UL << 42) - 1; - atype = _ASCE_TYPE_REGION3; - etype = _REGION3_ENTRY_EMPTY; - } else if (limit < (1UL << 53)) { - limit = (1UL << 53) - 1; - atype = _ASCE_TYPE_REGION2; - etype = _REGION2_ENTRY_EMPTY; - } else { - limit = -1UL; - atype = _ASCE_TYPE_REGION1; - etype = _REGION1_ENTRY_EMPTY; - } - gmap = kzalloc(sizeof(struct gmap), GFP_KERNEL); - if (!gmap) - goto out; - INIT_LIST_HEAD(&gmap->crst_list); - INIT_RADIX_TREE(&gmap->guest_to_host, GFP_KERNEL); - INIT_RADIX_TREE(&gmap->host_to_guest, GFP_ATOMIC); - spin_lock_init(&gmap->guest_table_lock); - gmap->mm = mm; - page = alloc_pages(GFP_KERNEL, 2); - if (!page) - goto out_free; - page->index = 0; - list_add(&page->lru, &gmap->crst_list); - table = (unsigned long *) page_to_phys(page); - crst_table_init(table, etype); - gmap->table = table; - gmap->asce = atype | _ASCE_TABLE_LENGTH | - _ASCE_USER_BITS | __pa(table); - gmap->asce_end = limit; - down_write(&mm->mmap_sem); - list_add(&gmap->list, &mm->context.gmap_list); - up_write(&mm->mmap_sem); - return gmap; - -out_free: - kfree(gmap); -out: - return NULL; -} -EXPORT_SYMBOL_GPL(gmap_alloc); - -static void gmap_flush_tlb(struct gmap *gmap) -{ - if (MACHINE_HAS_IDTE) - __tlb_flush_asce(gmap->mm, gmap->asce); - else - __tlb_flush_global(); -} - -static void gmap_radix_tree_free(struct radix_tree_root *root) -{ - struct radix_tree_iter iter; - unsigned long indices[16]; - unsigned long index; - void **slot; - int i, nr; - - /* A radix tree is freed by deleting all of its entries */ - index = 0; - do { - nr = 0; - radix_tree_for_each_slot(slot, root, &iter, index) { - indices[nr] = iter.index; - if (++nr == 16) - break; - } - for (i = 0; i < nr; i++) { - index = indices[i]; - radix_tree_delete(root, index); - } - } while (nr > 0); -} - -/** - * gmap_free - free a guest address space - * @gmap: pointer to the guest address space structure - */ -void gmap_free(struct gmap *gmap) -{ - struct page *page, *next; - - /* Flush tlb. */ - if (MACHINE_HAS_IDTE) - __tlb_flush_asce(gmap->mm, gmap->asce); - else - __tlb_flush_global(); - - /* Free all segment & region tables. */ - list_for_each_entry_safe(page, next, &gmap->crst_list, lru) - __free_pages(page, 2); - gmap_radix_tree_free(&gmap->guest_to_host); - gmap_radix_tree_free(&gmap->host_to_guest); - down_write(&gmap->mm->mmap_sem); - list_del(&gmap->list); - up_write(&gmap->mm->mmap_sem); - kfree(gmap); -} -EXPORT_SYMBOL_GPL(gmap_free); - -/** - * gmap_enable - switch primary space to the guest address space - * @gmap: pointer to the guest address space structure - */ -void gmap_enable(struct gmap *gmap) -{ - S390_lowcore.gmap = (unsigned long) gmap; -} -EXPORT_SYMBOL_GPL(gmap_enable); - -/** - * gmap_disable - switch back to the standard primary address space - * @gmap: pointer to the guest address space structure - */ -void gmap_disable(struct gmap *gmap) -{ - S390_lowcore.gmap = 0UL; -} -EXPORT_SYMBOL_GPL(gmap_disable); - -/* - * gmap_alloc_table is assumed to be called with mmap_sem held - */ -static int gmap_alloc_table(struct gmap *gmap, unsigned long *table, - unsigned long init, unsigned long gaddr) -{ - struct page *page; - unsigned long *new; - - /* since we dont free the gmap table until gmap_free we can unlock */ - page = alloc_pages(GFP_KERNEL, 2); - if (!page) - return -ENOMEM; - new = (unsigned long *) page_to_phys(page); - crst_table_init(new, init); - spin_lock(&gmap->mm->page_table_lock); - if (*table & _REGION_ENTRY_INVALID) { - list_add(&page->lru, &gmap->crst_list); - *table = (unsigned long) new | _REGION_ENTRY_LENGTH | - (*table & _REGION_ENTRY_TYPE_MASK); - page->index = gaddr; - page = NULL; - } - spin_unlock(&gmap->mm->page_table_lock); - if (page) - __free_pages(page, 2); - return 0; -} - -/** - * __gmap_segment_gaddr - find virtual address from segment pointer - * @entry: pointer to a segment table entry in the guest address space - * - * Returns the virtual address in the guest address space for the segment - */ -static unsigned long __gmap_segment_gaddr(unsigned long *entry) -{ - struct page *page; - unsigned long offset, mask; - - offset = (unsigned long) entry / sizeof(unsigned long); - offset = (offset & (PTRS_PER_PMD - 1)) * PMD_SIZE; - mask = ~(PTRS_PER_PMD * sizeof(pmd_t) - 1); - page = virt_to_page((void *)((unsigned long) entry & mask)); - return page->index + offset; -} - -/** - * __gmap_unlink_by_vmaddr - unlink a single segment via a host address - * @gmap: pointer to the guest address space structure - * @vmaddr: address in the host process address space - * - * Returns 1 if a TLB flush is required - */ -static int __gmap_unlink_by_vmaddr(struct gmap *gmap, unsigned long vmaddr) -{ - unsigned long *entry; - int flush = 0; - - spin_lock(&gmap->guest_table_lock); - entry = radix_tree_delete(&gmap->host_to_guest, vmaddr >> PMD_SHIFT); - if (entry) { - flush = (*entry != _SEGMENT_ENTRY_INVALID); - *entry = _SEGMENT_ENTRY_INVALID; - } - spin_unlock(&gmap->guest_table_lock); - return flush; -} - -/** - * __gmap_unmap_by_gaddr - unmap a single segment via a guest address - * @gmap: pointer to the guest address space structure - * @gaddr: address in the guest address space - * - * Returns 1 if a TLB flush is required - */ -static int __gmap_unmap_by_gaddr(struct gmap *gmap, unsigned long gaddr) -{ - unsigned long vmaddr; - - vmaddr = (unsigned long) radix_tree_delete(&gmap->guest_to_host, - gaddr >> PMD_SHIFT); - return vmaddr ? __gmap_unlink_by_vmaddr(gmap, vmaddr) : 0; -} - -/** - * gmap_unmap_segment - unmap segment from the guest address space - * @gmap: pointer to the guest address space structure - * @to: address in the guest address space - * @len: length of the memory area to unmap - * - * Returns 0 if the unmap succeeded, -EINVAL if not. - */ -int gmap_unmap_segment(struct gmap *gmap, unsigned long to, unsigned long len) -{ - unsigned long off; - int flush; - - if ((to | len) & (PMD_SIZE - 1)) - return -EINVAL; - if (len == 0 || to + len < to) - return -EINVAL; - - flush = 0; - down_write(&gmap->mm->mmap_sem); - for (off = 0; off < len; off += PMD_SIZE) - flush |= __gmap_unmap_by_gaddr(gmap, to + off); - up_write(&gmap->mm->mmap_sem); - if (flush) - gmap_flush_tlb(gmap); - return 0; -} -EXPORT_SYMBOL_GPL(gmap_unmap_segment); - -/** - * gmap_mmap_segment - map a segment to the guest address space - * @gmap: pointer to the guest address space structure - * @from: source address in the parent address space - * @to: target address in the guest address space - * @len: length of the memory area to map - * - * Returns 0 if the mmap succeeded, -EINVAL or -ENOMEM if not. - */ -int gmap_map_segment(struct gmap *gmap, unsigned long from, - unsigned long to, unsigned long len) -{ - unsigned long off; - int flush; - - if ((from | to | len) & (PMD_SIZE - 1)) - return -EINVAL; - if (len == 0 || from + len < from || to + len < to || - from + len - 1 > TASK_MAX_SIZE || to + len - 1 > gmap->asce_end) - return -EINVAL; - - flush = 0; - down_write(&gmap->mm->mmap_sem); - for (off = 0; off < len; off += PMD_SIZE) { - /* Remove old translation */ - flush |= __gmap_unmap_by_gaddr(gmap, to + off); - /* Store new translation */ - if (radix_tree_insert(&gmap->guest_to_host, - (to + off) >> PMD_SHIFT, - (void *) from + off)) - break; - } - up_write(&gmap->mm->mmap_sem); - if (flush) - gmap_flush_tlb(gmap); - if (off >= len) - return 0; - gmap_unmap_segment(gmap, to, len); - return -ENOMEM; -} -EXPORT_SYMBOL_GPL(gmap_map_segment); - -/** - * __gmap_translate - translate a guest address to a user space address - * @gmap: pointer to guest mapping meta data structure - * @gaddr: guest address - * - * Returns user space address which corresponds to the guest address or - * -EFAULT if no such mapping exists. - * This function does not establish potentially missing page table entries. - * The mmap_sem of the mm that belongs to the address space must be held - * when this function gets called. - */ -unsigned long __gmap_translate(struct gmap *gmap, unsigned long gaddr) -{ - unsigned long vmaddr; - - vmaddr = (unsigned long) - radix_tree_lookup(&gmap->guest_to_host, gaddr >> PMD_SHIFT); - return vmaddr ? (vmaddr | (gaddr & ~PMD_MASK)) : -EFAULT; -} -EXPORT_SYMBOL_GPL(__gmap_translate); - -/** - * gmap_translate - translate a guest address to a user space address - * @gmap: pointer to guest mapping meta data structure - * @gaddr: guest address - * - * Returns user space address which corresponds to the guest address or - * -EFAULT if no such mapping exists. - * This function does not establish potentially missing page table entries. - */ -unsigned long gmap_translate(struct gmap *gmap, unsigned long gaddr) -{ - unsigned long rc; - - down_read(&gmap->mm->mmap_sem); - rc = __gmap_translate(gmap, gaddr); - up_read(&gmap->mm->mmap_sem); - return rc; -} -EXPORT_SYMBOL_GPL(gmap_translate); - -/** - * gmap_unlink - disconnect a page table from the gmap shadow tables - * @gmap: pointer to guest mapping meta data structure - * @table: pointer to the host page table - * @vmaddr: vm address associated with the host page table - */ -static void gmap_unlink(struct mm_struct *mm, unsigned long *table, - unsigned long vmaddr) -{ - struct gmap *gmap; - int flush; - - list_for_each_entry(gmap, &mm->context.gmap_list, list) { - flush = __gmap_unlink_by_vmaddr(gmap, vmaddr); - if (flush) - gmap_flush_tlb(gmap); - } -} - -/** - * gmap_link - set up shadow page tables to connect a host to a guest address - * @gmap: pointer to guest mapping meta data structure - * @gaddr: guest address - * @vmaddr: vm address - * - * Returns 0 on success, -ENOMEM for out of memory conditions, and -EFAULT - * if the vm address is already mapped to a different guest segment. - * The mmap_sem of the mm that belongs to the address space must be held - * when this function gets called. - */ -int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr) -{ - struct mm_struct *mm; - unsigned long *table; - spinlock_t *ptl; - pgd_t *pgd; - pud_t *pud; - pmd_t *pmd; - int rc; - - /* Create higher level tables in the gmap page table */ - table = gmap->table; - if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION1) { - table += (gaddr >> 53) & 0x7ff; - if ((*table & _REGION_ENTRY_INVALID) && - gmap_alloc_table(gmap, table, _REGION2_ENTRY_EMPTY, - gaddr & 0xffe0000000000000UL)) - return -ENOMEM; - table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); - } - if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION2) { - table += (gaddr >> 42) & 0x7ff; - if ((*table & _REGION_ENTRY_INVALID) && - gmap_alloc_table(gmap, table, _REGION3_ENTRY_EMPTY, - gaddr & 0xfffffc0000000000UL)) - return -ENOMEM; - table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); - } - if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION3) { - table += (gaddr >> 31) & 0x7ff; - if ((*table & _REGION_ENTRY_INVALID) && - gmap_alloc_table(gmap, table, _SEGMENT_ENTRY_EMPTY, - gaddr & 0xffffffff80000000UL)) - return -ENOMEM; - table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); - } - table += (gaddr >> 20) & 0x7ff; - /* Walk the parent mm page table */ - mm = gmap->mm; - pgd = pgd_offset(mm, vmaddr); - VM_BUG_ON(pgd_none(*pgd)); - pud = pud_offset(pgd, vmaddr); - VM_BUG_ON(pud_none(*pud)); - pmd = pmd_offset(pud, vmaddr); - VM_BUG_ON(pmd_none(*pmd)); - /* large pmds cannot yet be handled */ - if (pmd_large(*pmd)) - return -EFAULT; - /* Link gmap segment table entry location to page table. */ - rc = radix_tree_preload(GFP_KERNEL); - if (rc) - return rc; - ptl = pmd_lock(mm, pmd); - spin_lock(&gmap->guest_table_lock); - if (*table == _SEGMENT_ENTRY_INVALID) { - rc = radix_tree_insert(&gmap->host_to_guest, - vmaddr >> PMD_SHIFT, table); - if (!rc) - *table = pmd_val(*pmd); - } else - rc = 0; - spin_unlock(&gmap->guest_table_lock); - spin_unlock(ptl); - radix_tree_preload_end(); - return rc; -} - -/** - * gmap_fault - resolve a fault on a guest address - * @gmap: pointer to guest mapping meta data structure - * @gaddr: guest address - * @fault_flags: flags to pass down to handle_mm_fault() - * - * Returns 0 on success, -ENOMEM for out of memory conditions, and -EFAULT - * if the vm address is already mapped to a different guest segment. - */ -int gmap_fault(struct gmap *gmap, unsigned long gaddr, - unsigned int fault_flags) -{ - unsigned long vmaddr; - int rc; - bool unlocked; - - down_read(&gmap->mm->mmap_sem); - -retry: - unlocked = false; - vmaddr = __gmap_translate(gmap, gaddr); - if (IS_ERR_VALUE(vmaddr)) { - rc = vmaddr; - goto out_up; - } - if (fixup_user_fault(current, gmap->mm, vmaddr, fault_flags, - &unlocked)) { - rc = -EFAULT; - goto out_up; - } - /* - * In the case that fixup_user_fault unlocked the mmap_sem during - * faultin redo __gmap_translate to not race with a map/unmap_segment. - */ - if (unlocked) - goto retry; - - rc = __gmap_link(gmap, gaddr, vmaddr); -out_up: - up_read(&gmap->mm->mmap_sem); - return rc; -} -EXPORT_SYMBOL_GPL(gmap_fault); - -static void gmap_zap_swap_entry(swp_entry_t entry, struct mm_struct *mm) -{ - if (!non_swap_entry(entry)) - dec_mm_counter(mm, MM_SWAPENTS); - else if (is_migration_entry(entry)) { - struct page *page = migration_entry_to_page(entry); - - dec_mm_counter(mm, mm_counter(page)); - } - free_swap_and_cache(entry); -} - -/* - * this function is assumed to be called with mmap_sem held - */ -void __gmap_zap(struct gmap *gmap, unsigned long gaddr) -{ - unsigned long vmaddr, ptev, pgstev; - pte_t *ptep, pte; - spinlock_t *ptl; - pgste_t pgste; - - /* Find the vm address for the guest address */ - vmaddr = (unsigned long) radix_tree_lookup(&gmap->guest_to_host, - gaddr >> PMD_SHIFT); - if (!vmaddr) - return; - vmaddr |= gaddr & ~PMD_MASK; - /* Get pointer to the page table entry */ - ptep = get_locked_pte(gmap->mm, vmaddr, &ptl); - if (unlikely(!ptep)) - return; - pte = *ptep; - if (!pte_swap(pte)) - goto out_pte; - /* Zap unused and logically-zero pages */ - pgste = pgste_get_lock(ptep); - pgstev = pgste_val(pgste); - ptev = pte_val(pte); - if (((pgstev & _PGSTE_GPS_USAGE_MASK) == _PGSTE_GPS_USAGE_UNUSED) || - ((pgstev & _PGSTE_GPS_ZERO) && (ptev & _PAGE_INVALID))) { - gmap_zap_swap_entry(pte_to_swp_entry(pte), gmap->mm); - pte_clear(gmap->mm, vmaddr, ptep); - } - pgste_set_unlock(ptep, pgste); -out_pte: - pte_unmap_unlock(ptep, ptl); -} -EXPORT_SYMBOL_GPL(__gmap_zap); - -void gmap_discard(struct gmap *gmap, unsigned long from, unsigned long to) -{ - unsigned long gaddr, vmaddr, size; - struct vm_area_struct *vma; - - down_read(&gmap->mm->mmap_sem); - for (gaddr = from; gaddr < to; - gaddr = (gaddr + PMD_SIZE) & PMD_MASK) { - /* Find the vm address for the guest address */ - vmaddr = (unsigned long) - radix_tree_lookup(&gmap->guest_to_host, - gaddr >> PMD_SHIFT); - if (!vmaddr) - continue; - vmaddr |= gaddr & ~PMD_MASK; - /* Find vma in the parent mm */ - vma = find_vma(gmap->mm, vmaddr); - size = min(to - gaddr, PMD_SIZE - (gaddr & ~PMD_MASK)); - zap_page_range(vma, vmaddr, size, NULL); - } - up_read(&gmap->mm->mmap_sem); -} -EXPORT_SYMBOL_GPL(gmap_discard); - -static LIST_HEAD(gmap_notifier_list); -static DEFINE_SPINLOCK(gmap_notifier_lock); - -/** - * gmap_register_ipte_notifier - register a pte invalidation callback - * @nb: pointer to the gmap notifier block - */ -void gmap_register_ipte_notifier(struct gmap_notifier *nb) -{ - spin_lock(&gmap_notifier_lock); - list_add(&nb->list, &gmap_notifier_list); - spin_unlock(&gmap_notifier_lock); -} -EXPORT_SYMBOL_GPL(gmap_register_ipte_notifier); - -/** - * gmap_unregister_ipte_notifier - remove a pte invalidation callback - * @nb: pointer to the gmap notifier block - */ -void gmap_unregister_ipte_notifier(struct gmap_notifier *nb) -{ - spin_lock(&gmap_notifier_lock); - list_del_init(&nb->list); - spin_unlock(&gmap_notifier_lock); -} -EXPORT_SYMBOL_GPL(gmap_unregister_ipte_notifier); - -/** - * gmap_ipte_notify - mark a range of ptes for invalidation notification - * @gmap: pointer to guest mapping meta data structure - * @gaddr: virtual address in the guest address space - * @len: size of area - * - * Returns 0 if for each page in the given range a gmap mapping exists and - * the invalidation notification could be set. If the gmap mapping is missing - * for one or more pages -EFAULT is returned. If no memory could be allocated - * -ENOMEM is returned. This function establishes missing page table entries. - */ -int gmap_ipte_notify(struct gmap *gmap, unsigned long gaddr, unsigned long len) -{ - unsigned long addr; - spinlock_t *ptl; - pte_t *ptep, entry; - pgste_t pgste; - bool unlocked; - int rc = 0; - - if ((gaddr & ~PAGE_MASK) || (len & ~PAGE_MASK)) - return -EINVAL; - down_read(&gmap->mm->mmap_sem); - while (len) { - unlocked = false; - /* Convert gmap address and connect the page tables */ - addr = __gmap_translate(gmap, gaddr); - if (IS_ERR_VALUE(addr)) { - rc = addr; - break; - } - /* Get the page mapped */ - if (fixup_user_fault(current, gmap->mm, addr, FAULT_FLAG_WRITE, - &unlocked)) { - rc = -EFAULT; - break; - } - /* While trying to map mmap_sem got unlocked. Let us retry */ - if (unlocked) - continue; - rc = __gmap_link(gmap, gaddr, addr); - if (rc) - break; - /* Walk the process page table, lock and get pte pointer */ - ptep = get_locked_pte(gmap->mm, addr, &ptl); - VM_BUG_ON(!ptep); - /* Set notification bit in the pgste of the pte */ - entry = *ptep; - if ((pte_val(entry) & (_PAGE_INVALID | _PAGE_PROTECT)) == 0) { - pgste = pgste_get_lock(ptep); - pgste_val(pgste) |= PGSTE_IN_BIT; - pgste_set_unlock(ptep, pgste); - gaddr += PAGE_SIZE; - len -= PAGE_SIZE; - } - pte_unmap_unlock(ptep, ptl); - } - up_read(&gmap->mm->mmap_sem); - return rc; -} -EXPORT_SYMBOL_GPL(gmap_ipte_notify); - -/** - * ptep_ipte_notify - call all invalidation callbacks for a specific pte. - * @mm: pointer to the process mm_struct - * @addr: virtual address in the process address space - * @pte: pointer to the page table entry - * - * This function is assumed to be called with the page table lock held - * for the pte to notify. - */ -void ptep_ipte_notify(struct mm_struct *mm, unsigned long vmaddr, pte_t *pte) -{ - unsigned long offset, gaddr; - unsigned long *table; - struct gmap_notifier *nb; - struct gmap *gmap; - - offset = ((unsigned long) pte) & (255 * sizeof(pte_t)); - offset = offset * (4096 / sizeof(pte_t)); - spin_lock(&gmap_notifier_lock); - list_for_each_entry(gmap, &mm->context.gmap_list, list) { - table = radix_tree_lookup(&gmap->host_to_guest, - vmaddr >> PMD_SHIFT); - if (!table) - continue; - gaddr = __gmap_segment_gaddr(table) + offset; - list_for_each_entry(nb, &gmap_notifier_list, list) - nb->notifier_call(gmap, gaddr); - } - spin_unlock(&gmap_notifier_lock); -} -EXPORT_SYMBOL_GPL(ptep_ipte_notify); - -int set_guest_storage_key(struct mm_struct *mm, unsigned long addr, - unsigned long key, bool nq) -{ - spinlock_t *ptl; - pgste_t old, new; - pte_t *ptep; - - down_read(&mm->mmap_sem); - ptep = get_locked_pte(mm, addr, &ptl); - if (unlikely(!ptep)) { - up_read(&mm->mmap_sem); - return -EFAULT; - } - - new = old = pgste_get_lock(ptep); - pgste_val(new) &= ~(PGSTE_GR_BIT | PGSTE_GC_BIT | - PGSTE_ACC_BITS | PGSTE_FP_BIT); - pgste_val(new) |= (key & (_PAGE_CHANGED | _PAGE_REFERENCED)) << 48; - pgste_val(new) |= (key & (_PAGE_ACC_BITS | _PAGE_FP_BIT)) << 56; - if (!(pte_val(*ptep) & _PAGE_INVALID)) { - unsigned long address, bits, skey; - - address = pte_val(*ptep) & PAGE_MASK; - skey = (unsigned long) page_get_storage_key(address); - bits = skey & (_PAGE_CHANGED | _PAGE_REFERENCED); - skey = key & (_PAGE_ACC_BITS | _PAGE_FP_BIT); - /* Set storage key ACC and FP */ - page_set_storage_key(address, skey, !nq); - /* Merge host changed & referenced into pgste */ - pgste_val(new) |= bits << 52; - } - /* changing the guest storage key is considered a change of the page */ - if ((pgste_val(new) ^ pgste_val(old)) & - (PGSTE_ACC_BITS | PGSTE_FP_BIT | PGSTE_GR_BIT | PGSTE_GC_BIT)) - pgste_val(new) |= PGSTE_UC_BIT; - - pgste_set_unlock(ptep, new); - pte_unmap_unlock(ptep, ptl); - up_read(&mm->mmap_sem); - return 0; -} -EXPORT_SYMBOL(set_guest_storage_key); - -unsigned long get_guest_storage_key(struct mm_struct *mm, unsigned long addr) -{ - spinlock_t *ptl; - pgste_t pgste; - pte_t *ptep; - uint64_t physaddr; - unsigned long key = 0; - - down_read(&mm->mmap_sem); - ptep = get_locked_pte(mm, addr, &ptl); - if (unlikely(!ptep)) { - up_read(&mm->mmap_sem); - return -EFAULT; - } - pgste = pgste_get_lock(ptep); - - if (pte_val(*ptep) & _PAGE_INVALID) { - key |= (pgste_val(pgste) & PGSTE_ACC_BITS) >> 56; - key |= (pgste_val(pgste) & PGSTE_FP_BIT) >> 56; - key |= (pgste_val(pgste) & PGSTE_GR_BIT) >> 48; - key |= (pgste_val(pgste) & PGSTE_GC_BIT) >> 48; - } else { - physaddr = pte_val(*ptep) & PAGE_MASK; - key = page_get_storage_key(physaddr); - - /* Reflect guest's logical view, not physical */ - if (pgste_val(pgste) & PGSTE_GR_BIT) - key |= _PAGE_REFERENCED; - if (pgste_val(pgste) & PGSTE_GC_BIT) - key |= _PAGE_CHANGED; - } - - pgste_set_unlock(ptep, pgste); - pte_unmap_unlock(ptep, ptl); - up_read(&mm->mmap_sem); - return key; -} -EXPORT_SYMBOL(get_guest_storage_key); - -static int page_table_allocate_pgste_min = 0; -static int page_table_allocate_pgste_max = 1; -int page_table_allocate_pgste = 0; -EXPORT_SYMBOL(page_table_allocate_pgste); - -static struct ctl_table page_table_sysctl[] = { - { - .procname = "allocate_pgste", - .data = &page_table_allocate_pgste, - .maxlen = sizeof(int), - .mode = S_IRUGO | S_IWUSR, - .proc_handler = proc_dointvec, - .extra1 = &page_table_allocate_pgste_min, - .extra2 = &page_table_allocate_pgste_max, - }, - { } -}; - -static struct ctl_table page_table_sysctl_dir[] = { - { - .procname = "vm", - .maxlen = 0, - .mode = 0555, - .child = page_table_sysctl, - }, - { } -}; - -static int __init page_table_register_sysctl(void) -{ - return register_sysctl_table(page_table_sysctl_dir) ? 0 : -ENOMEM; -} -__initcall(page_table_register_sysctl); - -#else /* CONFIG_PGSTE */ - -static inline void gmap_unlink(struct mm_struct *mm, unsigned long *table, - unsigned long vmaddr) -{ -} - -#endif /* CONFIG_PGSTE */ - -static inline unsigned int atomic_xor_bits(atomic_t *v, unsigned int bits) -{ - unsigned int old, new; - - do { - old = atomic_read(v); - new = old ^ bits; - } while (atomic_cmpxchg(v, old, new) != old); - return new; -} - -/* - * page table entry allocation/free routines. - */ -unsigned long *page_table_alloc(struct mm_struct *mm) -{ - unsigned long *table; - struct page *page; - unsigned int mask, bit; - - /* Try to get a fragment of a 4K page as a 2K page table */ - if (!mm_alloc_pgste(mm)) { - table = NULL; - spin_lock_bh(&mm->context.list_lock); - if (!list_empty(&mm->context.pgtable_list)) { - page = list_first_entry(&mm->context.pgtable_list, - struct page, lru); - mask = atomic_read(&page->_mapcount); - mask = (mask | (mask >> 4)) & 3; - if (mask != 3) { - table = (unsigned long *) page_to_phys(page); - bit = mask & 1; /* =1 -> second 2K */ - if (bit) - table += PTRS_PER_PTE; - atomic_xor_bits(&page->_mapcount, 1U << bit); - list_del(&page->lru); - } - } - spin_unlock_bh(&mm->context.list_lock); - if (table) - return table; - } - /* Allocate a fresh page */ - page = alloc_page(GFP_KERNEL|__GFP_REPEAT); - if (!page) - return NULL; - if (!pgtable_page_ctor(page)) { - __free_page(page); - return NULL; - } - /* Initialize page table */ - table = (unsigned long *) page_to_phys(page); - if (mm_alloc_pgste(mm)) { - /* Return 4K page table with PGSTEs */ - atomic_set(&page->_mapcount, 3); - clear_table(table, _PAGE_INVALID, PAGE_SIZE/2); - clear_table(table + PTRS_PER_PTE, 0, PAGE_SIZE/2); - } else { - /* Return the first 2K fragment of the page */ - atomic_set(&page->_mapcount, 1); - clear_table(table, _PAGE_INVALID, PAGE_SIZE); - spin_lock_bh(&mm->context.list_lock); - list_add(&page->lru, &mm->context.pgtable_list); - spin_unlock_bh(&mm->context.list_lock); - } - return table; -} - -void page_table_free(struct mm_struct *mm, unsigned long *table) -{ - struct page *page; - unsigned int bit, mask; - - page = pfn_to_page(__pa(table) >> PAGE_SHIFT); - if (!mm_alloc_pgste(mm)) { - /* Free 2K page table fragment of a 4K page */ - bit = (__pa(table) & ~PAGE_MASK)/(PTRS_PER_PTE*sizeof(pte_t)); - spin_lock_bh(&mm->context.list_lock); - mask = atomic_xor_bits(&page->_mapcount, 1U << bit); - if (mask & 3) - list_add(&page->lru, &mm->context.pgtable_list); - else - list_del(&page->lru); - spin_unlock_bh(&mm->context.list_lock); - if (mask != 0) - return; - } - - pgtable_page_dtor(page); - atomic_set(&page->_mapcount, -1); - __free_page(page); -} - -void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table, - unsigned long vmaddr) -{ - struct mm_struct *mm; - struct page *page; - unsigned int bit, mask; - - mm = tlb->mm; - page = pfn_to_page(__pa(table) >> PAGE_SHIFT); - if (mm_alloc_pgste(mm)) { - gmap_unlink(mm, table, vmaddr); - table = (unsigned long *) (__pa(table) | 3); - tlb_remove_table(tlb, table); - return; - } - bit = (__pa(table) & ~PAGE_MASK) / (PTRS_PER_PTE*sizeof(pte_t)); - spin_lock_bh(&mm->context.list_lock); - mask = atomic_xor_bits(&page->_mapcount, 0x11U << bit); - if (mask & 3) - list_add_tail(&page->lru, &mm->context.pgtable_list); - else - list_del(&page->lru); - spin_unlock_bh(&mm->context.list_lock); - table = (unsigned long *) (__pa(table) | (1U << bit)); - tlb_remove_table(tlb, table); -} - -static void __tlb_remove_table(void *_table) -{ - unsigned int mask = (unsigned long) _table & 3; - void *table = (void *)((unsigned long) _table ^ mask); - struct page *page = pfn_to_page(__pa(table) >> PAGE_SHIFT); - - switch (mask) { - case 0: /* pmd or pud */ - free_pages((unsigned long) table, 2); - break; - case 1: /* lower 2K of a 4K page table */ - case 2: /* higher 2K of a 4K page table */ - if (atomic_xor_bits(&page->_mapcount, mask << 4) != 0) - break; - /* fallthrough */ - case 3: /* 4K page table with pgstes */ - pgtable_page_dtor(page); - atomic_set(&page->_mapcount, -1); - __free_page(page); - break; - } -} - -static void tlb_remove_table_smp_sync(void *arg) -{ - /* Simply deliver the interrupt */ -} - -static void tlb_remove_table_one(void *table) -{ - /* - * This isn't an RCU grace period and hence the page-tables cannot be - * assumed to be actually RCU-freed. - * - * It is however sufficient for software page-table walkers that rely - * on IRQ disabling. See the comment near struct mmu_table_batch. - */ - smp_call_function(tlb_remove_table_smp_sync, NULL, 1); - __tlb_remove_table(table); -} - -static void tlb_remove_table_rcu(struct rcu_head *head) -{ - struct mmu_table_batch *batch; - int i; - - batch = container_of(head, struct mmu_table_batch, rcu); - - for (i = 0; i < batch->nr; i++) - __tlb_remove_table(batch->tables[i]); - - free_page((unsigned long)batch); -} - -void tlb_table_flush(struct mmu_gather *tlb) -{ - struct mmu_table_batch **batch = &tlb->batch; - - if (*batch) { - call_rcu_sched(&(*batch)->rcu, tlb_remove_table_rcu); - *batch = NULL; - } -} - -void tlb_remove_table(struct mmu_gather *tlb, void *table) -{ - struct mmu_table_batch **batch = &tlb->batch; - - tlb->mm->context.flush_mm = 1; - if (*batch == NULL) { - *batch = (struct mmu_table_batch *) - __get_free_page(GFP_NOWAIT | __GFP_NOWARN); - if (*batch == NULL) { - __tlb_flush_mm_lazy(tlb->mm); - tlb_remove_table_one(table); - return; - } - (*batch)->nr = 0; - } - (*batch)->tables[(*batch)->nr++] = table; - if ((*batch)->nr == MAX_TABLE_BATCH) - tlb_flush_mmu(tlb); -} - -#ifdef CONFIG_TRANSPARENT_HUGEPAGE -static inline void thp_split_vma(struct vm_area_struct *vma) -{ - unsigned long addr; - - for (addr = vma->vm_start; addr < vma->vm_end; addr += PAGE_SIZE) - follow_page(vma, addr, FOLL_SPLIT); -} - -static inline void thp_split_mm(struct mm_struct *mm) -{ - struct vm_area_struct *vma; - - for (vma = mm->mmap; vma != NULL; vma = vma->vm_next) { - thp_split_vma(vma); - vma->vm_flags &= ~VM_HUGEPAGE; - vma->vm_flags |= VM_NOHUGEPAGE; - } - mm->def_flags |= VM_NOHUGEPAGE; -} -#else -static inline void thp_split_mm(struct mm_struct *mm) -{ -} -#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ - static inline pte_t ptep_flush_direct(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { @@ -1198,6 +64,55 @@ static inline pte_t ptep_flush_lazy(struct mm_struct *mm, return old; } +static inline pgste_t pgste_get_lock(pte_t *ptep) +{ + unsigned long new = 0; +#ifdef CONFIG_PGSTE + unsigned long old; + + preempt_disable(); + asm( + " lg %0,%2\n" + "0: lgr %1,%0\n" + " nihh %0,0xff7f\n" /* clear PCL bit in old */ + " oihh %1,0x0080\n" /* set PCL bit in new */ + " csg %0,%1,%2\n" + " jl 0b\n" + : "=&d" (old), "=&d" (new), "=Q" (ptep[PTRS_PER_PTE]) + : "Q" (ptep[PTRS_PER_PTE]) : "cc", "memory"); +#endif + return __pgste(new); +} + +static inline void pgste_set_unlock(pte_t *ptep, pgste_t pgste) +{ +#ifdef CONFIG_PGSTE + asm( + " nihh %1,0xff7f\n" /* clear PCL bit */ + " stg %1,%0\n" + : "=Q" (ptep[PTRS_PER_PTE]) + : "d" (pgste_val(pgste)), "Q" (ptep[PTRS_PER_PTE]) + : "cc", "memory"); + preempt_enable(); +#endif +} + +static inline pgste_t pgste_get(pte_t *ptep) +{ + unsigned long pgste = 0; +#ifdef CONFIG_PGSTE + pgste = *(unsigned long *)(ptep + PTRS_PER_PTE); +#endif + return __pgste(pgste); +} + +static inline void pgste_set(pte_t *ptep, pgste_t pgste) +{ +#ifdef CONFIG_PGSTE + *(pgste_t *)(ptep + PTRS_PER_PTE) = pgste; +#endif +} + static inline pgste_t pgste_update_all(pte_t pte, pgste_t pgste, struct mm_struct *mm) { @@ -1271,63 +186,12 @@ static inline pgste_t pgste_ipte_notify(struct mm_struct *mm, #ifdef CONFIG_PGSTE if (pgste_val(pgste) & PGSTE_IN_BIT) { pgste_val(pgste) &= ~PGSTE_IN_BIT; - ptep_ipte_notify(mm, addr, ptep); + ptep_notify(mm, addr, ptep); } #endif return pgste; } -#ifdef CONFIG_PGSTE -/* - * Test and reset if a guest page is dirty - */ -bool pgste_test_and_clear_dirty(struct mm_struct *mm, unsigned long addr) -{ - spinlock_t *ptl; - pgste_t pgste; - pte_t *ptep; - pte_t pte; - bool dirty; - - ptep = get_locked_pte(mm, addr, &ptl); - if (unlikely(!ptep)) - return false; - - pgste = pgste_get_lock(ptep); - dirty = !!(pgste_val(pgste) & PGSTE_UC_BIT); - pgste_val(pgste) &= ~PGSTE_UC_BIT; - pte = *ptep; - if (dirty && (pte_val(pte) & _PAGE_PRESENT)) { - pgste = pgste_ipte_notify(mm, addr, ptep, pgste); - __ptep_ipte(addr, ptep); - if (MACHINE_HAS_ESOP || !(pte_val(pte) & _PAGE_WRITE)) - pte_val(pte) |= _PAGE_PROTECT; - else - pte_val(pte) |= _PAGE_INVALID; - *ptep = pte; - } - pgste_set_unlock(ptep, pgste); - - spin_unlock(ptl); - return dirty; -} -EXPORT_SYMBOL_GPL(pgste_test_and_clear_dirty); - -void set_pte_pgste_at(struct mm_struct *mm, unsigned long addr, - pte_t *ptep, pte_t entry) -{ - pgste_t pgste; - - /* the mm_has_pgste() check is done in set_pte_at() */ - pgste = pgste_get_lock(ptep); - pgste_val(pgste) &= ~_PGSTE_GPS_ZERO; - pgste_set_key(ptep, pgste, entry, mm); - pgste = pgste_set_pte(ptep, pgste, entry); - pgste_set_unlock(ptep, pgste); -} -EXPORT_SYMBOL(set_pte_pgste_at); -#endif - static inline pgste_t ptep_xchg_start(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { @@ -1486,112 +350,6 @@ pmd_t pmdp_xchg_lazy(struct mm_struct *mm, unsigned long addr, } EXPORT_SYMBOL(pmdp_xchg_lazy); -/* - * switch on pgstes for its userspace process (for kvm) - */ -int s390_enable_sie(void) -{ - struct mm_struct *mm = current->mm; - - /* Do we have pgstes? if yes, we are done */ - if (mm_has_pgste(mm)) - return 0; - /* Fail if the page tables are 2K */ - if (!mm_alloc_pgste(mm)) - return -EINVAL; - down_write(&mm->mmap_sem); - mm->context.has_pgste = 1; - /* split thp mappings and disable thp for future mappings */ - thp_split_mm(mm); - up_write(&mm->mmap_sem); - return 0; -} -EXPORT_SYMBOL_GPL(s390_enable_sie); - -/* - * Enable storage key handling from now on and initialize the storage - * keys with the default key. - */ -static int __s390_enable_skey(pte_t *pte, unsigned long addr, - unsigned long next, struct mm_walk *walk) -{ - unsigned long ptev; - pgste_t pgste; - - /* - * Remove all zero page mappings, - * after establishing a policy to forbid zero page mappings - * following faults for that page will get fresh anonymous pages - */ - if (is_zero_pfn(pte_pfn(*pte))) - ptep_xchg_direct(walk->mm, addr, pte, __pte(_PAGE_INVALID)); - /* Clear storage key */ - pgste = pgste_get_lock(pte); - pgste_val(pgste) &= ~(PGSTE_ACC_BITS | PGSTE_FP_BIT | - PGSTE_GR_BIT | PGSTE_GC_BIT); - ptev = pte_val(*pte); - if (!(ptev & _PAGE_INVALID) && (ptev & _PAGE_WRITE)) - page_set_storage_key(ptev & PAGE_MASK, PAGE_DEFAULT_KEY, 1); - pgste_set_unlock(pte, pgste); - return 0; -} - -int s390_enable_skey(void) -{ - struct mm_walk walk = { .pte_entry = __s390_enable_skey }; - struct mm_struct *mm = current->mm; - struct vm_area_struct *vma; - int rc = 0; - - down_write(&mm->mmap_sem); - if (mm_use_skey(mm)) - goto out_up; - - mm->context.use_skey = 1; - for (vma = mm->mmap; vma; vma = vma->vm_next) { - if (ksm_madvise(vma, vma->vm_start, vma->vm_end, - MADV_UNMERGEABLE, &vma->vm_flags)) { - mm->context.use_skey = 0; - rc = -ENOMEM; - goto out_up; - } - } - mm->def_flags &= ~VM_MERGEABLE; - - walk.mm = mm; - walk_page_range(0, TASK_SIZE, &walk); - -out_up: - up_write(&mm->mmap_sem); - return rc; -} -EXPORT_SYMBOL_GPL(s390_enable_skey); - -/* - * Reset CMMA state, make all pages stable again. - */ -static int __s390_reset_cmma(pte_t *pte, unsigned long addr, - unsigned long next, struct mm_walk *walk) -{ - pgste_t pgste; - - pgste = pgste_get_lock(pte); - pgste_val(pgste) &= ~_PGSTE_GPS_USAGE_MASK; - pgste_set_unlock(pte, pgste); - return 0; -} - -void s390_reset_cmma(struct mm_struct *mm) -{ - struct mm_walk walk = { .pte_entry = __s390_reset_cmma }; - - down_write(&mm->mmap_sem); - walk.mm = mm; - walk_page_range(0, TASK_SIZE, &walk); - up_write(&mm->mmap_sem); -} -EXPORT_SYMBOL_GPL(s390_reset_cmma); - #ifdef CONFIG_TRANSPARENT_HUGEPAGE void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, pgtable_t pgtable) @@ -1632,3 +390,193 @@ pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp) return pgtable; } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + +#ifdef CONFIG_PGSTE +void ptep_set_pte_at(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t entry) +{ + pgste_t pgste; + + /* the mm_has_pgste() check is done in set_pte_at() */ + pgste = pgste_get_lock(ptep); + pgste_val(pgste) &= ~_PGSTE_GPS_ZERO; + pgste_set_key(ptep, pgste, entry, mm); + pgste = pgste_set_pte(ptep, pgste, entry); + pgste_set_unlock(ptep, pgste); +} + +void ptep_set_notify(struct mm_struct *mm, unsigned long addr, pte_t *ptep) +{ + pgste_t pgste; + + pgste = pgste_get_lock(ptep); + pgste_val(pgste) |= PGSTE_IN_BIT; + pgste_set_unlock(ptep, pgste); +} + +static void ptep_zap_swap_entry(struct mm_struct *mm, swp_entry_t entry) +{ + if (!non_swap_entry(entry)) + dec_mm_counter(mm, MM_SWAPENTS); + else if (is_migration_entry(entry)) { + struct page *page = migration_entry_to_page(entry); + + dec_mm_counter(mm, mm_counter(page)); + } + free_swap_and_cache(entry); +} + +void ptep_zap_unused(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, int reset) +{ + unsigned long pgstev; + pgste_t pgste; + pte_t pte; + + /* Zap unused and logically-zero pages */ + pgste = pgste_get_lock(ptep); + pgstev = pgste_val(pgste); + pte = *ptep; + if (pte_swap(pte) && + ((pgstev & _PGSTE_GPS_USAGE_MASK) == _PGSTE_GPS_USAGE_UNUSED || + (pgstev & _PGSTE_GPS_ZERO))) { + ptep_zap_swap_entry(mm, pte_to_swp_entry(pte)); + pte_clear(mm, addr, ptep); + } + if (reset) + pgste_val(pgste) &= ~_PGSTE_GPS_USAGE_MASK; + pgste_set_unlock(ptep, pgste); +} + +void ptep_zap_key(struct mm_struct *mm, unsigned long addr, pte_t *ptep) +{ + unsigned long ptev; + pgste_t pgste; + + /* Clear storage key */ + pgste = pgste_get_lock(ptep); + pgste_val(pgste) &= ~(PGSTE_ACC_BITS | PGSTE_FP_BIT | + PGSTE_GR_BIT | PGSTE_GC_BIT); + ptev = pte_val(*ptep); + if (!(ptev & _PAGE_INVALID) && (ptev & _PAGE_WRITE)) + page_set_storage_key(ptev & PAGE_MASK, PAGE_DEFAULT_KEY, 1); + pgste_set_unlock(ptep, pgste); +} + +/* + * Test and reset if a guest page is dirty + */ +bool test_and_clear_guest_dirty(struct mm_struct *mm, unsigned long addr) +{ + spinlock_t *ptl; + pgste_t pgste; + pte_t *ptep; + pte_t pte; + bool dirty; + + ptep = get_locked_pte(mm, addr, &ptl); + if (unlikely(!ptep)) + return false; + + pgste = pgste_get_lock(ptep); + dirty = !!(pgste_val(pgste) & PGSTE_UC_BIT); + pgste_val(pgste) &= ~PGSTE_UC_BIT; + pte = *ptep; + if (dirty && (pte_val(pte) & _PAGE_PRESENT)) { + pgste = pgste_ipte_notify(mm, addr, ptep, pgste); + __ptep_ipte(addr, ptep); + if (MACHINE_HAS_ESOP || !(pte_val(pte) & _PAGE_WRITE)) + pte_val(pte) |= _PAGE_PROTECT; + else + pte_val(pte) |= _PAGE_INVALID; + *ptep = pte; + } + pgste_set_unlock(ptep, pgste); + + spin_unlock(ptl); + return dirty; +} +EXPORT_SYMBOL_GPL(test_and_clear_guest_dirty); + +int set_guest_storage_key(struct mm_struct *mm, unsigned long addr, + unsigned char key, bool nq) +{ + unsigned long keyul; + spinlock_t *ptl; + pgste_t old, new; + pte_t *ptep; + + down_read(&mm->mmap_sem); + ptep = get_locked_pte(mm, addr, &ptl); + if (unlikely(!ptep)) { + up_read(&mm->mmap_sem); + return -EFAULT; + } + + new = old = pgste_get_lock(ptep); + pgste_val(new) &= ~(PGSTE_GR_BIT | PGSTE_GC_BIT | + PGSTE_ACC_BITS | PGSTE_FP_BIT); + keyul = (unsigned long) key; + pgste_val(new) |= (keyul & (_PAGE_CHANGED | _PAGE_REFERENCED)) << 48; + pgste_val(new) |= (keyul & (_PAGE_ACC_BITS | _PAGE_FP_BIT)) << 56; + if (!(pte_val(*ptep) & _PAGE_INVALID)) { + unsigned long address, bits, skey; + + address = pte_val(*ptep) & PAGE_MASK; + skey = (unsigned long) page_get_storage_key(address); + bits = skey & (_PAGE_CHANGED | _PAGE_REFERENCED); + skey = key & (_PAGE_ACC_BITS | _PAGE_FP_BIT); + /* Set storage key ACC and FP */ + page_set_storage_key(address, skey, !nq); + /* Merge host changed & referenced into pgste */ + pgste_val(new) |= bits << 52; + } + /* changing the guest storage key is considered a change of the page */ + if ((pgste_val(new) ^ pgste_val(old)) & + (PGSTE_ACC_BITS | PGSTE_FP_BIT | PGSTE_GR_BIT | PGSTE_GC_BIT)) + pgste_val(new) |= PGSTE_UC_BIT; + + pgste_set_unlock(ptep, new); + pte_unmap_unlock(ptep, ptl); + up_read(&mm->mmap_sem); + return 0; +} +EXPORT_SYMBOL(set_guest_storage_key); + +unsigned char get_guest_storage_key(struct mm_struct *mm, unsigned long addr) +{ + unsigned char key; + spinlock_t *ptl; + pgste_t pgste; + pte_t *ptep; + + down_read(&mm->mmap_sem); + ptep = get_locked_pte(mm, addr, &ptl); + if (unlikely(!ptep)) { + up_read(&mm->mmap_sem); + return -EFAULT; + } + pgste = pgste_get_lock(ptep); + + if (pte_val(*ptep) & _PAGE_INVALID) { + key = (pgste_val(pgste) & PGSTE_ACC_BITS) >> 56; + key |= (pgste_val(pgste) & PGSTE_FP_BIT) >> 56; + key |= (pgste_val(pgste) & PGSTE_GR_BIT) >> 48; + key |= (pgste_val(pgste) & PGSTE_GC_BIT) >> 48; + } else { + key = page_get_storage_key(pte_val(*ptep) & PAGE_MASK); + + /* Reflect guest's logical view, not physical */ + if (pgste_val(pgste) & PGSTE_GR_BIT) + key |= _PAGE_REFERENCED; + if (pgste_val(pgste) & PGSTE_GC_BIT) + key |= _PAGE_CHANGED; + } + + pgste_set_unlock(ptep, pgste); + pte_unmap_unlock(ptep, ptl); + up_read(&mm->mmap_sem); + return key; +} +EXPORT_SYMBOL(get_guest_storage_key); +#endif