From 1e133ab296f3ff8d9e58a5e758291ed39ba72ad7 Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Tue, 8 Mar 2016 11:49:57 +0100 Subject: [PATCH] s390/mm: split arch/s390/mm/pgtable.c The pgtable.c file is quite big, before it grows any larger split it into pgtable.c, pgalloc.c and gmap.c. In addition move the gmap related header definitions into the new gmap.h header and all of the pgste helpers from pgtable.h to pgtable.c. Signed-off-by: Martin Schwidefsky --- arch/s390/include/asm/gmap.h | 64 ++ arch/s390/include/asm/pgalloc.h | 4 - arch/s390/include/asm/pgtable.h | 123 +-- arch/s390/kernel/asm-offsets.c | 1 + arch/s390/kvm/diag.c | 1 + arch/s390/kvm/interrupt.c | 1 + arch/s390/kvm/kvm-s390.c | 3 +- arch/s390/kvm/priv.c | 1 + arch/s390/mm/Makefile | 4 +- arch/s390/mm/fault.c | 1 + arch/s390/mm/gmap.c | 774 ++++++++++++++++ arch/s390/mm/pgalloc.c | 360 ++++++++ arch/s390/mm/pgtable.c | 1532 +++++-------------------------- 13 files changed, 1464 insertions(+), 1405 deletions(-) create mode 100644 arch/s390/include/asm/gmap.h create mode 100644 arch/s390/mm/gmap.c create mode 100644 arch/s390/mm/pgalloc.c diff --git a/arch/s390/include/asm/gmap.h b/arch/s390/include/asm/gmap.h new file mode 100644 index 000000000000..d054c1b07a3c --- /dev/null +++ b/arch/s390/include/asm/gmap.h @@ -0,0 +1,64 @@ +/* + * KVM guest address space mapping code + * + * Copyright IBM Corp. 2007, 2016 + * Author(s): Martin Schwidefsky + */ + +#ifndef _ASM_S390_GMAP_H +#define _ASM_S390_GMAP_H + +/** + * struct gmap_struct - guest address space + * @crst_list: list of all crst tables used in the guest address space + * @mm: pointer to the parent mm_struct + * @guest_to_host: radix tree with guest to host address translation + * @host_to_guest: radix tree with pointer to segment table entries + * @guest_table_lock: spinlock to protect all entries in the guest page table + * @table: pointer to the page directory + * @asce: address space control element for gmap page table + * @pfault_enabled: defines if pfaults are applicable for the guest + */ +struct gmap { + struct list_head list; + struct list_head crst_list; + struct mm_struct *mm; + struct radix_tree_root guest_to_host; + struct radix_tree_root host_to_guest; + spinlock_t guest_table_lock; + unsigned long *table; + unsigned long asce; + unsigned long asce_end; + void *private; + bool pfault_enabled; +}; + +/** + * struct gmap_notifier - notify function block for page invalidation + * @notifier_call: address of callback function + */ +struct gmap_notifier { + struct list_head list; + void (*notifier_call)(struct gmap *gmap, unsigned long gaddr); +}; + +struct gmap *gmap_alloc(struct mm_struct *mm, unsigned long limit); +void gmap_free(struct gmap *gmap); +void gmap_enable(struct gmap *gmap); +void gmap_disable(struct gmap *gmap); +int gmap_map_segment(struct gmap *gmap, unsigned long from, + unsigned long to, unsigned long len); +int gmap_unmap_segment(struct gmap *gmap, unsigned long to, unsigned long len); +unsigned long __gmap_translate(struct gmap *, unsigned long gaddr); +unsigned long gmap_translate(struct gmap *, unsigned long gaddr); +int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr); +int gmap_fault(struct gmap *, unsigned long gaddr, unsigned int fault_flags); +void gmap_discard(struct gmap *, unsigned long from, unsigned long to); +void __gmap_zap(struct gmap *, unsigned long gaddr); +void gmap_unlink(struct mm_struct *, unsigned long *table, unsigned long vmaddr); + +void gmap_register_ipte_notifier(struct gmap_notifier *); +void gmap_unregister_ipte_notifier(struct gmap_notifier *); +int gmap_ipte_notify(struct gmap *, unsigned long start, unsigned long len); + +#endif /* _ASM_S390_GMAP_H */ diff --git a/arch/s390/include/asm/pgalloc.h b/arch/s390/include/asm/pgalloc.h index 7b7858f158b4..92487193706c 100644 --- a/arch/s390/include/asm/pgalloc.h +++ b/arch/s390/include/asm/pgalloc.h @@ -23,10 +23,6 @@ void page_table_free(struct mm_struct *, unsigned long *); void page_table_free_rcu(struct mmu_gather *, unsigned long *, unsigned long); extern int page_table_allocate_pgste; -int set_guest_storage_key(struct mm_struct *mm, unsigned long addr, - unsigned long key, bool nq); -unsigned long get_guest_storage_key(struct mm_struct *mm, unsigned long addr); - static inline void clear_table(unsigned long *s, unsigned long val, size_t n) { typedef struct { char _[n]; } addrtype; diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index 572001c8913d..2f66645587a2 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -622,111 +622,6 @@ static inline pmd_t pmd_clear_soft_dirty(pmd_t pmd) return pmd; } -static inline pgste_t pgste_get_lock(pte_t *ptep) -{ - unsigned long new = 0; -#ifdef CONFIG_PGSTE - unsigned long old; - - preempt_disable(); - asm( - " lg %0,%2\n" - "0: lgr %1,%0\n" - " nihh %0,0xff7f\n" /* clear PCL bit in old */ - " oihh %1,0x0080\n" /* set PCL bit in new */ - " csg %0,%1,%2\n" - " jl 0b\n" - : "=&d" (old), "=&d" (new), "=Q" (ptep[PTRS_PER_PTE]) - : "Q" (ptep[PTRS_PER_PTE]) : "cc", "memory"); -#endif - return __pgste(new); -} - -static inline void pgste_set_unlock(pte_t *ptep, pgste_t pgste) -{ -#ifdef CONFIG_PGSTE - asm( - " nihh %1,0xff7f\n" /* clear PCL bit */ - " stg %1,%0\n" - : "=Q" (ptep[PTRS_PER_PTE]) - : "d" (pgste_val(pgste)), "Q" (ptep[PTRS_PER_PTE]) - : "cc", "memory"); - preempt_enable(); -#endif -} - -static inline pgste_t pgste_get(pte_t *ptep) -{ - unsigned long pgste = 0; -#ifdef CONFIG_PGSTE - pgste = *(unsigned long *)(ptep + PTRS_PER_PTE); -#endif - return __pgste(pgste); -} - -static inline void pgste_set(pte_t *ptep, pgste_t pgste) -{ -#ifdef CONFIG_PGSTE - *(pgste_t *)(ptep + PTRS_PER_PTE) = pgste; -#endif -} - -bool pgste_test_and_clear_dirty(struct mm_struct *, unsigned long address); -void ptep_ipte_notify(struct mm_struct *, unsigned long addr, pte_t *); - -/** - * struct gmap_struct - guest address space - * @crst_list: list of all crst tables used in the guest address space - * @mm: pointer to the parent mm_struct - * @guest_to_host: radix tree with guest to host address translation - * @host_to_guest: radix tree with pointer to segment table entries - * @guest_table_lock: spinlock to protect all entries in the guest page table - * @table: pointer to the page directory - * @asce: address space control element for gmap page table - * @pfault_enabled: defines if pfaults are applicable for the guest - */ -struct gmap { - struct list_head list; - struct list_head crst_list; - struct mm_struct *mm; - struct radix_tree_root guest_to_host; - struct radix_tree_root host_to_guest; - spinlock_t guest_table_lock; - unsigned long *table; - unsigned long asce; - unsigned long asce_end; - void *private; - bool pfault_enabled; -}; - -/** - * struct gmap_notifier - notify function block for page invalidation - * @notifier_call: address of callback function - */ -struct gmap_notifier { - struct list_head list; - void (*notifier_call)(struct gmap *gmap, unsigned long gaddr); -}; - -struct gmap *gmap_alloc(struct mm_struct *mm, unsigned long limit); -void gmap_free(struct gmap *gmap); -void gmap_enable(struct gmap *gmap); -void gmap_disable(struct gmap *gmap); -int gmap_map_segment(struct gmap *gmap, unsigned long from, - unsigned long to, unsigned long len); -int gmap_unmap_segment(struct gmap *gmap, unsigned long to, unsigned long len); -unsigned long __gmap_translate(struct gmap *, unsigned long gaddr); -unsigned long gmap_translate(struct gmap *, unsigned long gaddr); -int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr); -int gmap_fault(struct gmap *, unsigned long gaddr, unsigned int fault_flags); -void gmap_discard(struct gmap *, unsigned long from, unsigned long to); -void __gmap_zap(struct gmap *, unsigned long gaddr); - - -void gmap_register_ipte_notifier(struct gmap_notifier *); -void gmap_unregister_ipte_notifier(struct gmap_notifier *); -int gmap_ipte_notify(struct gmap *, unsigned long start, unsigned long len); - /* * query functions pte_write/pte_dirty/pte_young only work if * pte_present() is true. Undefined behaviour if not.. @@ -984,7 +879,21 @@ static inline int ptep_set_access_flags(struct vm_area_struct *vma, return 1; } -void set_pte_pgste_at(struct mm_struct *, unsigned long, pte_t *, pte_t); +/* + * Additional functions to handle KVM guest page tables + */ +void ptep_set_pte_at(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t entry); +void ptep_set_notify(struct mm_struct *mm, unsigned long addr, pte_t *ptep); +void ptep_notify(struct mm_struct *mm, unsigned long addr, pte_t *ptep); +void ptep_zap_unused(struct mm_struct *mm, unsigned long addr, + pte_t *ptep , int reset); +void ptep_zap_key(struct mm_struct *mm, unsigned long addr, pte_t *ptep); + +bool test_and_clear_guest_dirty(struct mm_struct *mm, unsigned long address); +int set_guest_storage_key(struct mm_struct *mm, unsigned long addr, + unsigned char key, bool nq); +unsigned char get_guest_storage_key(struct mm_struct *mm, unsigned long addr); /* * Certain architectures need to do special things when PTEs @@ -995,7 +904,7 @@ static inline void set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t entry) { if (mm_has_pgste(mm)) - set_pte_pgste_at(mm, addr, ptep, entry); + ptep_set_pte_at(mm, addr, ptep, entry); else *ptep = entry; } diff --git a/arch/s390/kernel/asm-offsets.c b/arch/s390/kernel/asm-offsets.c index 53bbc9e8b281..1f95cc1faeb7 100644 --- a/arch/s390/kernel/asm-offsets.c +++ b/arch/s390/kernel/asm-offsets.c @@ -12,6 +12,7 @@ #include #include #include +#include /* * Make sure that the compiler is new enough. We want a compiler that diff --git a/arch/s390/kvm/diag.c b/arch/s390/kvm/diag.c index 05f7de9869a9..1ea4095b67d7 100644 --- a/arch/s390/kvm/diag.c +++ b/arch/s390/kvm/diag.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include "kvm-s390.h" #include "trace.h" diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index f88ca72c3a77..e5e8739dcde3 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -23,6 +23,7 @@ #include #include #include +#include #include "kvm-s390.h" #include "gaccess.h" #include "trace-s390.h" diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 616e0a16ee88..be1f0288443e 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include #include @@ -280,7 +281,7 @@ static void kvm_s390_sync_dirty_log(struct kvm *kvm, for (cur_gfn = memslot->base_gfn; cur_gfn <= last_gfn; cur_gfn++) { address = gfn_to_hva_memslot(memslot, cur_gfn); - if (pgste_test_and_clear_dirty(gmap->mm, address)) + if (test_and_clear_guest_dirty(gmap->mm, address)) mark_page_dirty(kvm, cur_gfn); } up_read(&gmap->mm->mmap_sem); diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c index ed74e86d9b9e..b632d8dda9ad 100644 --- a/arch/s390/kvm/priv.c +++ b/arch/s390/kvm/priv.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/s390/mm/Makefile b/arch/s390/mm/Makefile index 839592ca265c..2ae54cad2b6a 100644 --- a/arch/s390/mm/Makefile +++ b/arch/s390/mm/Makefile @@ -2,9 +2,11 @@ # Makefile for the linux s390-specific parts of the memory manager. # -obj-y := init.o fault.o extmem.o mmap.o vmem.o pgtable.o maccess.o +obj-y := init.o fault.o extmem.o mmap.o vmem.o maccess.o obj-y += page-states.o gup.o extable.o pageattr.o mem_detect.o +obj-y += pgtable.o pgalloc.o obj-$(CONFIG_CMM) += cmm.o obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o obj-$(CONFIG_S390_PTDUMP) += dump_pagetables.o +obj-$(CONFIG_PGSTE) += gmap.o diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c index 64b3ad1d6575..cce577feab1e 100644 --- a/arch/s390/mm/fault.c +++ b/arch/s390/mm/fault.c @@ -32,6 +32,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c new file mode 100644 index 000000000000..69247b4dcc43 --- /dev/null +++ b/arch/s390/mm/gmap.c @@ -0,0 +1,774 @@ +/* + * KVM guest address space mapping code + * + * Copyright IBM Corp. 2007, 2016 + * Author(s): Martin Schwidefsky + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +/** + * gmap_alloc - allocate a guest address space + * @mm: pointer to the parent mm_struct + * @limit: maximum size of the gmap address space + * + * Returns a guest address space structure. + */ +struct gmap *gmap_alloc(struct mm_struct *mm, unsigned long limit) +{ + struct gmap *gmap; + struct page *page; + unsigned long *table; + unsigned long etype, atype; + + if (limit < (1UL << 31)) { + limit = (1UL << 31) - 1; + atype = _ASCE_TYPE_SEGMENT; + etype = _SEGMENT_ENTRY_EMPTY; + } else if (limit < (1UL << 42)) { + limit = (1UL << 42) - 1; + atype = _ASCE_TYPE_REGION3; + etype = _REGION3_ENTRY_EMPTY; + } else if (limit < (1UL << 53)) { + limit = (1UL << 53) - 1; + atype = _ASCE_TYPE_REGION2; + etype = _REGION2_ENTRY_EMPTY; + } else { + limit = -1UL; + atype = _ASCE_TYPE_REGION1; + etype = _REGION1_ENTRY_EMPTY; + } + gmap = kzalloc(sizeof(struct gmap), GFP_KERNEL); + if (!gmap) + goto out; + INIT_LIST_HEAD(&gmap->crst_list); + INIT_RADIX_TREE(&gmap->guest_to_host, GFP_KERNEL); + INIT_RADIX_TREE(&gmap->host_to_guest, GFP_ATOMIC); + spin_lock_init(&gmap->guest_table_lock); + gmap->mm = mm; + page = alloc_pages(GFP_KERNEL, 2); + if (!page) + goto out_free; + page->index = 0; + list_add(&page->lru, &gmap->crst_list); + table = (unsigned long *) page_to_phys(page); + crst_table_init(table, etype); + gmap->table = table; + gmap->asce = atype | _ASCE_TABLE_LENGTH | + _ASCE_USER_BITS | __pa(table); + gmap->asce_end = limit; + down_write(&mm->mmap_sem); + list_add(&gmap->list, &mm->context.gmap_list); + up_write(&mm->mmap_sem); + return gmap; + +out_free: + kfree(gmap); +out: + return NULL; +} +EXPORT_SYMBOL_GPL(gmap_alloc); + +static void gmap_flush_tlb(struct gmap *gmap) +{ + if (MACHINE_HAS_IDTE) + __tlb_flush_asce(gmap->mm, gmap->asce); + else + __tlb_flush_global(); +} + +static void gmap_radix_tree_free(struct radix_tree_root *root) +{ + struct radix_tree_iter iter; + unsigned long indices[16]; + unsigned long index; + void **slot; + int i, nr; + + /* A radix tree is freed by deleting all of its entries */ + index = 0; + do { + nr = 0; + radix_tree_for_each_slot(slot, root, &iter, index) { + indices[nr] = iter.index; + if (++nr == 16) + break; + } + for (i = 0; i < nr; i++) { + index = indices[i]; + radix_tree_delete(root, index); + } + } while (nr > 0); +} + +/** + * gmap_free - free a guest address space + * @gmap: pointer to the guest address space structure + */ +void gmap_free(struct gmap *gmap) +{ + struct page *page, *next; + + /* Flush tlb. */ + if (MACHINE_HAS_IDTE) + __tlb_flush_asce(gmap->mm, gmap->asce); + else + __tlb_flush_global(); + + /* Free all segment & region tables. */ + list_for_each_entry_safe(page, next, &gmap->crst_list, lru) + __free_pages(page, 2); + gmap_radix_tree_free(&gmap->guest_to_host); + gmap_radix_tree_free(&gmap->host_to_guest); + down_write(&gmap->mm->mmap_sem); + list_del(&gmap->list); + up_write(&gmap->mm->mmap_sem); + kfree(gmap); +} +EXPORT_SYMBOL_GPL(gmap_free); + +/** + * gmap_enable - switch primary space to the guest address space + * @gmap: pointer to the guest address space structure + */ +void gmap_enable(struct gmap *gmap) +{ + S390_lowcore.gmap = (unsigned long) gmap; +} +EXPORT_SYMBOL_GPL(gmap_enable); + +/** + * gmap_disable - switch back to the standard primary address space + * @gmap: pointer to the guest address space structure + */ +void gmap_disable(struct gmap *gmap) +{ + S390_lowcore.gmap = 0UL; +} +EXPORT_SYMBOL_GPL(gmap_disable); + +/* + * gmap_alloc_table is assumed to be called with mmap_sem held + */ +static int gmap_alloc_table(struct gmap *gmap, unsigned long *table, + unsigned long init, unsigned long gaddr) +{ + struct page *page; + unsigned long *new; + + /* since we dont free the gmap table until gmap_free we can unlock */ + page = alloc_pages(GFP_KERNEL, 2); + if (!page) + return -ENOMEM; + new = (unsigned long *) page_to_phys(page); + crst_table_init(new, init); + spin_lock(&gmap->mm->page_table_lock); + if (*table & _REGION_ENTRY_INVALID) { + list_add(&page->lru, &gmap->crst_list); + *table = (unsigned long) new | _REGION_ENTRY_LENGTH | + (*table & _REGION_ENTRY_TYPE_MASK); + page->index = gaddr; + page = NULL; + } + spin_unlock(&gmap->mm->page_table_lock); + if (page) + __free_pages(page, 2); + return 0; +} + +/** + * __gmap_segment_gaddr - find virtual address from segment pointer + * @entry: pointer to a segment table entry in the guest address space + * + * Returns the virtual address in the guest address space for the segment + */ +static unsigned long __gmap_segment_gaddr(unsigned long *entry) +{ + struct page *page; + unsigned long offset, mask; + + offset = (unsigned long) entry / sizeof(unsigned long); + offset = (offset & (PTRS_PER_PMD - 1)) * PMD_SIZE; + mask = ~(PTRS_PER_PMD * sizeof(pmd_t) - 1); + page = virt_to_page((void *)((unsigned long) entry & mask)); + return page->index + offset; +} + +/** + * __gmap_unlink_by_vmaddr - unlink a single segment via a host address + * @gmap: pointer to the guest address space structure + * @vmaddr: address in the host process address space + * + * Returns 1 if a TLB flush is required + */ +static int __gmap_unlink_by_vmaddr(struct gmap *gmap, unsigned long vmaddr) +{ + unsigned long *entry; + int flush = 0; + + spin_lock(&gmap->guest_table_lock); + entry = radix_tree_delete(&gmap->host_to_guest, vmaddr >> PMD_SHIFT); + if (entry) { + flush = (*entry != _SEGMENT_ENTRY_INVALID); + *entry = _SEGMENT_ENTRY_INVALID; + } + spin_unlock(&gmap->guest_table_lock); + return flush; +} + +/** + * __gmap_unmap_by_gaddr - unmap a single segment via a guest address + * @gmap: pointer to the guest address space structure + * @gaddr: address in the guest address space + * + * Returns 1 if a TLB flush is required + */ +static int __gmap_unmap_by_gaddr(struct gmap *gmap, unsigned long gaddr) +{ + unsigned long vmaddr; + + vmaddr = (unsigned long) radix_tree_delete(&gmap->guest_to_host, + gaddr >> PMD_SHIFT); + return vmaddr ? __gmap_unlink_by_vmaddr(gmap, vmaddr) : 0; +} + +/** + * gmap_unmap_segment - unmap segment from the guest address space + * @gmap: pointer to the guest address space structure + * @to: address in the guest address space + * @len: length of the memory area to unmap + * + * Returns 0 if the unmap succeeded, -EINVAL if not. + */ +int gmap_unmap_segment(struct gmap *gmap, unsigned long to, unsigned long len) +{ + unsigned long off; + int flush; + + if ((to | len) & (PMD_SIZE - 1)) + return -EINVAL; + if (len == 0 || to + len < to) + return -EINVAL; + + flush = 0; + down_write(&gmap->mm->mmap_sem); + for (off = 0; off < len; off += PMD_SIZE) + flush |= __gmap_unmap_by_gaddr(gmap, to + off); + up_write(&gmap->mm->mmap_sem); + if (flush) + gmap_flush_tlb(gmap); + return 0; +} +EXPORT_SYMBOL_GPL(gmap_unmap_segment); + +/** + * gmap_map_segment - map a segment to the guest address space + * @gmap: pointer to the guest address space structure + * @from: source address in the parent address space + * @to: target address in the guest address space + * @len: length of the memory area to map + * + * Returns 0 if the mmap succeeded, -EINVAL or -ENOMEM if not. + */ +int gmap_map_segment(struct gmap *gmap, unsigned long from, + unsigned long to, unsigned long len) +{ + unsigned long off; + int flush; + + if ((from | to | len) & (PMD_SIZE - 1)) + return -EINVAL; + if (len == 0 || from + len < from || to + len < to || + from + len > TASK_MAX_SIZE || to + len > gmap->asce_end) + return -EINVAL; + + flush = 0; + down_write(&gmap->mm->mmap_sem); + for (off = 0; off < len; off += PMD_SIZE) { + /* Remove old translation */ + flush |= __gmap_unmap_by_gaddr(gmap, to + off); + /* Store new translation */ + if (radix_tree_insert(&gmap->guest_to_host, + (to + off) >> PMD_SHIFT, + (void *) from + off)) + break; + } + up_write(&gmap->mm->mmap_sem); + if (flush) + gmap_flush_tlb(gmap); + if (off >= len) + return 0; + gmap_unmap_segment(gmap, to, len); + return -ENOMEM; +} +EXPORT_SYMBOL_GPL(gmap_map_segment); + +/** + * __gmap_translate - translate a guest address to a user space address + * @gmap: pointer to guest mapping meta data structure + * @gaddr: guest address + * + * Returns user space address which corresponds to the guest address or + * -EFAULT if no such mapping exists. + * This function does not establish potentially missing page table entries. + * The mmap_sem of the mm that belongs to the address space must be held + * when this function gets called. + */ +unsigned long __gmap_translate(struct gmap *gmap, unsigned long gaddr) +{ + unsigned long vmaddr; + + vmaddr = (unsigned long) + radix_tree_lookup(&gmap->guest_to_host, gaddr >> PMD_SHIFT); + return vmaddr ? (vmaddr | (gaddr & ~PMD_MASK)) : -EFAULT; +} +EXPORT_SYMBOL_GPL(__gmap_translate); + +/** + * gmap_translate - translate a guest address to a user space address + * @gmap: pointer to guest mapping meta data structure + * @gaddr: guest address + * + * Returns user space address which corresponds to the guest address or + * -EFAULT if no such mapping exists. + * This function does not establish potentially missing page table entries. + */ +unsigned long gmap_translate(struct gmap *gmap, unsigned long gaddr) +{ + unsigned long rc; + + down_read(&gmap->mm->mmap_sem); + rc = __gmap_translate(gmap, gaddr); + up_read(&gmap->mm->mmap_sem); + return rc; +} +EXPORT_SYMBOL_GPL(gmap_translate); + +/** + * gmap_unlink - disconnect a page table from the gmap shadow tables + * @gmap: pointer to guest mapping meta data structure + * @table: pointer to the host page table + * @vmaddr: vm address associated with the host page table + */ +void gmap_unlink(struct mm_struct *mm, unsigned long *table, + unsigned long vmaddr) +{ + struct gmap *gmap; + int flush; + + list_for_each_entry(gmap, &mm->context.gmap_list, list) { + flush = __gmap_unlink_by_vmaddr(gmap, vmaddr); + if (flush) + gmap_flush_tlb(gmap); + } +} + +/** + * gmap_link - set up shadow page tables to connect a host to a guest address + * @gmap: pointer to guest mapping meta data structure + * @gaddr: guest address + * @vmaddr: vm address + * + * Returns 0 on success, -ENOMEM for out of memory conditions, and -EFAULT + * if the vm address is already mapped to a different guest segment. + * The mmap_sem of the mm that belongs to the address space must be held + * when this function gets called. + */ +int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr) +{ + struct mm_struct *mm; + unsigned long *table; + spinlock_t *ptl; + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + int rc; + + /* Create higher level tables in the gmap page table */ + table = gmap->table; + if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION1) { + table += (gaddr >> 53) & 0x7ff; + if ((*table & _REGION_ENTRY_INVALID) && + gmap_alloc_table(gmap, table, _REGION2_ENTRY_EMPTY, + gaddr & 0xffe0000000000000UL)) + return -ENOMEM; + table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); + } + if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION2) { + table += (gaddr >> 42) & 0x7ff; + if ((*table & _REGION_ENTRY_INVALID) && + gmap_alloc_table(gmap, table, _REGION3_ENTRY_EMPTY, + gaddr & 0xfffffc0000000000UL)) + return -ENOMEM; + table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); + } + if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION3) { + table += (gaddr >> 31) & 0x7ff; + if ((*table & _REGION_ENTRY_INVALID) && + gmap_alloc_table(gmap, table, _SEGMENT_ENTRY_EMPTY, + gaddr & 0xffffffff80000000UL)) + return -ENOMEM; + table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); + } + table += (gaddr >> 20) & 0x7ff; + /* Walk the parent mm page table */ + mm = gmap->mm; + pgd = pgd_offset(mm, vmaddr); + VM_BUG_ON(pgd_none(*pgd)); + pud = pud_offset(pgd, vmaddr); + VM_BUG_ON(pud_none(*pud)); + pmd = pmd_offset(pud, vmaddr); + VM_BUG_ON(pmd_none(*pmd)); + /* large pmds cannot yet be handled */ + if (pmd_large(*pmd)) + return -EFAULT; + /* Link gmap segment table entry location to page table. */ + rc = radix_tree_preload(GFP_KERNEL); + if (rc) + return rc; + ptl = pmd_lock(mm, pmd); + spin_lock(&gmap->guest_table_lock); + if (*table == _SEGMENT_ENTRY_INVALID) { + rc = radix_tree_insert(&gmap->host_to_guest, + vmaddr >> PMD_SHIFT, table); + if (!rc) + *table = pmd_val(*pmd); + } else + rc = 0; + spin_unlock(&gmap->guest_table_lock); + spin_unlock(ptl); + radix_tree_preload_end(); + return rc; +} + +/** + * gmap_fault - resolve a fault on a guest address + * @gmap: pointer to guest mapping meta data structure + * @gaddr: guest address + * @fault_flags: flags to pass down to handle_mm_fault() + * + * Returns 0 on success, -ENOMEM for out of memory conditions, and -EFAULT + * if the vm address is already mapped to a different guest segment. + */ +int gmap_fault(struct gmap *gmap, unsigned long gaddr, + unsigned int fault_flags) +{ + unsigned long vmaddr; + int rc; + bool unlocked; + + down_read(&gmap->mm->mmap_sem); + +retry: + unlocked = false; + vmaddr = __gmap_translate(gmap, gaddr); + if (IS_ERR_VALUE(vmaddr)) { + rc = vmaddr; + goto out_up; + } + if (fixup_user_fault(current, gmap->mm, vmaddr, fault_flags, + &unlocked)) { + rc = -EFAULT; + goto out_up; + } + /* + * In the case that fixup_user_fault unlocked the mmap_sem during + * faultin redo __gmap_translate to not race with a map/unmap_segment. + */ + if (unlocked) + goto retry; + + rc = __gmap_link(gmap, gaddr, vmaddr); +out_up: + up_read(&gmap->mm->mmap_sem); + return rc; +} +EXPORT_SYMBOL_GPL(gmap_fault); + +/* + * this function is assumed to be called with mmap_sem held + */ +void __gmap_zap(struct gmap *gmap, unsigned long gaddr) +{ + unsigned long vmaddr; + spinlock_t *ptl; + pte_t *ptep; + + /* Find the vm address for the guest address */ + vmaddr = (unsigned long) radix_tree_lookup(&gmap->guest_to_host, + gaddr >> PMD_SHIFT); + if (vmaddr) { + vmaddr |= gaddr & ~PMD_MASK; + /* Get pointer to the page table entry */ + ptep = get_locked_pte(gmap->mm, vmaddr, &ptl); + if (likely(ptep)) + ptep_zap_unused(gmap->mm, vmaddr, ptep, 0); + pte_unmap_unlock(ptep, ptl); + } +} +EXPORT_SYMBOL_GPL(__gmap_zap); + +void gmap_discard(struct gmap *gmap, unsigned long from, unsigned long to) +{ + unsigned long gaddr, vmaddr, size; + struct vm_area_struct *vma; + + down_read(&gmap->mm->mmap_sem); + for (gaddr = from; gaddr < to; + gaddr = (gaddr + PMD_SIZE) & PMD_MASK) { + /* Find the vm address for the guest address */ + vmaddr = (unsigned long) + radix_tree_lookup(&gmap->guest_to_host, + gaddr >> PMD_SHIFT); + if (!vmaddr) + continue; + vmaddr |= gaddr & ~PMD_MASK; + /* Find vma in the parent mm */ + vma = find_vma(gmap->mm, vmaddr); + size = min(to - gaddr, PMD_SIZE - (gaddr & ~PMD_MASK)); + zap_page_range(vma, vmaddr, size, NULL); + } + up_read(&gmap->mm->mmap_sem); +} +EXPORT_SYMBOL_GPL(gmap_discard); + +static LIST_HEAD(gmap_notifier_list); +static DEFINE_SPINLOCK(gmap_notifier_lock); + +/** + * gmap_register_ipte_notifier - register a pte invalidation callback + * @nb: pointer to the gmap notifier block + */ +void gmap_register_ipte_notifier(struct gmap_notifier *nb) +{ + spin_lock(&gmap_notifier_lock); + list_add(&nb->list, &gmap_notifier_list); + spin_unlock(&gmap_notifier_lock); +} +EXPORT_SYMBOL_GPL(gmap_register_ipte_notifier); + +/** + * gmap_unregister_ipte_notifier - remove a pte invalidation callback + * @nb: pointer to the gmap notifier block + */ +void gmap_unregister_ipte_notifier(struct gmap_notifier *nb) +{ + spin_lock(&gmap_notifier_lock); + list_del_init(&nb->list); + spin_unlock(&gmap_notifier_lock); +} +EXPORT_SYMBOL_GPL(gmap_unregister_ipte_notifier); + +/** + * gmap_ipte_notify - mark a range of ptes for invalidation notification + * @gmap: pointer to guest mapping meta data structure + * @gaddr: virtual address in the guest address space + * @len: size of area + * + * Returns 0 if for each page in the given range a gmap mapping exists and + * the invalidation notification could be set. If the gmap mapping is missing + * for one or more pages -EFAULT is returned. If no memory could be allocated + * -ENOMEM is returned. This function establishes missing page table entries. + */ +int gmap_ipte_notify(struct gmap *gmap, unsigned long gaddr, unsigned long len) +{ + unsigned long addr; + spinlock_t *ptl; + pte_t *ptep; + bool unlocked; + int rc = 0; + + if ((gaddr & ~PAGE_MASK) || (len & ~PAGE_MASK)) + return -EINVAL; + down_read(&gmap->mm->mmap_sem); + while (len) { + unlocked = false; + /* Convert gmap address and connect the page tables */ + addr = __gmap_translate(gmap, gaddr); + if (IS_ERR_VALUE(addr)) { + rc = addr; + break; + } + /* Get the page mapped */ + if (fixup_user_fault(current, gmap->mm, addr, FAULT_FLAG_WRITE, + &unlocked)) { + rc = -EFAULT; + break; + } + /* While trying to map mmap_sem got unlocked. Let us retry */ + if (unlocked) + continue; + rc = __gmap_link(gmap, gaddr, addr); + if (rc) + break; + /* Walk the process page table, lock and get pte pointer */ + ptep = get_locked_pte(gmap->mm, addr, &ptl); + VM_BUG_ON(!ptep); + /* Set notification bit in the pgste of the pte */ + if ((pte_val(*ptep) & (_PAGE_INVALID | _PAGE_PROTECT)) == 0) { + ptep_set_notify(gmap->mm, addr, ptep); + gaddr += PAGE_SIZE; + len -= PAGE_SIZE; + } + pte_unmap_unlock(ptep, ptl); + } + up_read(&gmap->mm->mmap_sem); + return rc; +} +EXPORT_SYMBOL_GPL(gmap_ipte_notify); + +/** + * ptep_notify - call all invalidation callbacks for a specific pte. + * @mm: pointer to the process mm_struct + * @addr: virtual address in the process address space + * @pte: pointer to the page table entry + * + * This function is assumed to be called with the page table lock held + * for the pte to notify. + */ +void ptep_notify(struct mm_struct *mm, unsigned long vmaddr, pte_t *pte) +{ + unsigned long offset, gaddr; + unsigned long *table; + struct gmap_notifier *nb; + struct gmap *gmap; + + offset = ((unsigned long) pte) & (255 * sizeof(pte_t)); + offset = offset * (4096 / sizeof(pte_t)); + spin_lock(&gmap_notifier_lock); + list_for_each_entry(gmap, &mm->context.gmap_list, list) { + table = radix_tree_lookup(&gmap->host_to_guest, + vmaddr >> PMD_SHIFT); + if (!table) + continue; + gaddr = __gmap_segment_gaddr(table) + offset; + list_for_each_entry(nb, &gmap_notifier_list, list) + nb->notifier_call(gmap, gaddr); + } + spin_unlock(&gmap_notifier_lock); +} +EXPORT_SYMBOL_GPL(ptep_notify); + +static inline void thp_split_mm(struct mm_struct *mm) +{ +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + struct vm_area_struct *vma; + unsigned long addr; + + for (vma = mm->mmap; vma != NULL; vma = vma->vm_next) { + for (addr = vma->vm_start; + addr < vma->vm_end; + addr += PAGE_SIZE) + follow_page(vma, addr, FOLL_SPLIT); + vma->vm_flags &= ~VM_HUGEPAGE; + vma->vm_flags |= VM_NOHUGEPAGE; + } + mm->def_flags |= VM_NOHUGEPAGE; +#endif +} + +/* + * switch on pgstes for its userspace process (for kvm) + */ +int s390_enable_sie(void) +{ + struct mm_struct *mm = current->mm; + + /* Do we have pgstes? if yes, we are done */ + if (mm_has_pgste(mm)) + return 0; + /* Fail if the page tables are 2K */ + if (!mm_alloc_pgste(mm)) + return -EINVAL; + down_write(&mm->mmap_sem); + mm->context.has_pgste = 1; + /* split thp mappings and disable thp for future mappings */ + thp_split_mm(mm); + up_write(&mm->mmap_sem); + return 0; +} +EXPORT_SYMBOL_GPL(s390_enable_sie); + +/* + * Enable storage key handling from now on and initialize the storage + * keys with the default key. + */ +static int __s390_enable_skey(pte_t *pte, unsigned long addr, + unsigned long next, struct mm_walk *walk) +{ + /* + * Remove all zero page mappings, + * after establishing a policy to forbid zero page mappings + * following faults for that page will get fresh anonymous pages + */ + if (is_zero_pfn(pte_pfn(*pte))) + ptep_xchg_direct(walk->mm, addr, pte, __pte(_PAGE_INVALID)); + /* Clear storage key */ + ptep_zap_key(walk->mm, addr, pte); + return 0; +} + +int s390_enable_skey(void) +{ + struct mm_walk walk = { .pte_entry = __s390_enable_skey }; + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma; + int rc = 0; + + down_write(&mm->mmap_sem); + if (mm_use_skey(mm)) + goto out_up; + + mm->context.use_skey = 1; + for (vma = mm->mmap; vma; vma = vma->vm_next) { + if (ksm_madvise(vma, vma->vm_start, vma->vm_end, + MADV_UNMERGEABLE, &vma->vm_flags)) { + mm->context.use_skey = 0; + rc = -ENOMEM; + goto out_up; + } + } + mm->def_flags &= ~VM_MERGEABLE; + + walk.mm = mm; + walk_page_range(0, TASK_SIZE, &walk); + +out_up: + up_write(&mm->mmap_sem); + return rc; +} +EXPORT_SYMBOL_GPL(s390_enable_skey); + +/* + * Reset CMMA state, make all pages stable again. + */ +static int __s390_reset_cmma(pte_t *pte, unsigned long addr, + unsigned long next, struct mm_walk *walk) +{ + ptep_zap_unused(walk->mm, addr, pte, 1); + return 0; +} + +void s390_reset_cmma(struct mm_struct *mm) +{ + struct mm_walk walk = { .pte_entry = __s390_reset_cmma }; + + down_write(&mm->mmap_sem); + walk.mm = mm; + walk_page_range(0, TASK_SIZE, &walk); + up_write(&mm->mmap_sem); +} +EXPORT_SYMBOL_GPL(s390_reset_cmma); diff --git a/arch/s390/mm/pgalloc.c b/arch/s390/mm/pgalloc.c new file mode 100644 index 000000000000..f6c3de26cda8 --- /dev/null +++ b/arch/s390/mm/pgalloc.c @@ -0,0 +1,360 @@ +/* + * Page table allocation functions + * + * Copyright IBM Corp. 2016 + * Author(s): Martin Schwidefsky + */ + +#include +#include +#include +#include +#include +#include +#include + +#ifdef CONFIG_PGSTE + +static int page_table_allocate_pgste_min = 0; +static int page_table_allocate_pgste_max = 1; +int page_table_allocate_pgste = 0; +EXPORT_SYMBOL(page_table_allocate_pgste); + +static struct ctl_table page_table_sysctl[] = { + { + .procname = "allocate_pgste", + .data = &page_table_allocate_pgste, + .maxlen = sizeof(int), + .mode = S_IRUGO | S_IWUSR, + .proc_handler = proc_dointvec, + .extra1 = &page_table_allocate_pgste_min, + .extra2 = &page_table_allocate_pgste_max, + }, + { } +}; + +static struct ctl_table page_table_sysctl_dir[] = { + { + .procname = "vm", + .maxlen = 0, + .mode = 0555, + .child = page_table_sysctl, + }, + { } +}; + +static int __init page_table_register_sysctl(void) +{ + return register_sysctl_table(page_table_sysctl_dir) ? 0 : -ENOMEM; +} +__initcall(page_table_register_sysctl); + +#endif /* CONFIG_PGSTE */ + +unsigned long *crst_table_alloc(struct mm_struct *mm) +{ + struct page *page = alloc_pages(GFP_KERNEL, 2); + + if (!page) + return NULL; + return (unsigned long *) page_to_phys(page); +} + +void crst_table_free(struct mm_struct *mm, unsigned long *table) +{ + free_pages((unsigned long) table, 2); +} + +static void __crst_table_upgrade(void *arg) +{ + struct mm_struct *mm = arg; + + if (current->active_mm == mm) { + clear_user_asce(); + set_user_asce(mm); + } + __tlb_flush_local(); +} + +int crst_table_upgrade(struct mm_struct *mm, unsigned long limit) +{ + unsigned long *table, *pgd; + unsigned long entry; + int flush; + + BUG_ON(limit > TASK_MAX_SIZE); + flush = 0; +repeat: + table = crst_table_alloc(mm); + if (!table) + return -ENOMEM; + spin_lock_bh(&mm->page_table_lock); + if (mm->context.asce_limit < limit) { + pgd = (unsigned long *) mm->pgd; + if (mm->context.asce_limit <= (1UL << 31)) { + entry = _REGION3_ENTRY_EMPTY; + mm->context.asce_limit = 1UL << 42; + mm->context.asce_bits = _ASCE_TABLE_LENGTH | + _ASCE_USER_BITS | + _ASCE_TYPE_REGION3; + } else { + entry = _REGION2_ENTRY_EMPTY; + mm->context.asce_limit = 1UL << 53; + mm->context.asce_bits = _ASCE_TABLE_LENGTH | + _ASCE_USER_BITS | + _ASCE_TYPE_REGION2; + } + crst_table_init(table, entry); + pgd_populate(mm, (pgd_t *) table, (pud_t *) pgd); + mm->pgd = (pgd_t *) table; + mm->task_size = mm->context.asce_limit; + table = NULL; + flush = 1; + } + spin_unlock_bh(&mm->page_table_lock); + if (table) + crst_table_free(mm, table); + if (mm->context.asce_limit < limit) + goto repeat; + if (flush) + on_each_cpu(__crst_table_upgrade, mm, 0); + return 0; +} + +void crst_table_downgrade(struct mm_struct *mm, unsigned long limit) +{ + pgd_t *pgd; + + if (current->active_mm == mm) { + clear_user_asce(); + __tlb_flush_mm(mm); + } + while (mm->context.asce_limit > limit) { + pgd = mm->pgd; + switch (pgd_val(*pgd) & _REGION_ENTRY_TYPE_MASK) { + case _REGION_ENTRY_TYPE_R2: + mm->context.asce_limit = 1UL << 42; + mm->context.asce_bits = _ASCE_TABLE_LENGTH | + _ASCE_USER_BITS | + _ASCE_TYPE_REGION3; + break; + case _REGION_ENTRY_TYPE_R3: + mm->context.asce_limit = 1UL << 31; + mm->context.asce_bits = _ASCE_TABLE_LENGTH | + _ASCE_USER_BITS | + _ASCE_TYPE_SEGMENT; + break; + default: + BUG(); + } + mm->pgd = (pgd_t *) (pgd_val(*pgd) & _REGION_ENTRY_ORIGIN); + mm->task_size = mm->context.asce_limit; + crst_table_free(mm, (unsigned long *) pgd); + } + if (current->active_mm == mm) + set_user_asce(mm); +} + +static inline unsigned int atomic_xor_bits(atomic_t *v, unsigned int bits) +{ + unsigned int old, new; + + do { + old = atomic_read(v); + new = old ^ bits; + } while (atomic_cmpxchg(v, old, new) != old); + return new; +} + +/* + * page table entry allocation/free routines. + */ +unsigned long *page_table_alloc(struct mm_struct *mm) +{ + unsigned long *table; + struct page *page; + unsigned int mask, bit; + + /* Try to get a fragment of a 4K page as a 2K page table */ + if (!mm_alloc_pgste(mm)) { + table = NULL; + spin_lock_bh(&mm->context.list_lock); + if (!list_empty(&mm->context.pgtable_list)) { + page = list_first_entry(&mm->context.pgtable_list, + struct page, lru); + mask = atomic_read(&page->_mapcount); + mask = (mask | (mask >> 4)) & 3; + if (mask != 3) { + table = (unsigned long *) page_to_phys(page); + bit = mask & 1; /* =1 -> second 2K */ + if (bit) + table += PTRS_PER_PTE; + atomic_xor_bits(&page->_mapcount, 1U << bit); + list_del(&page->lru); + } + } + spin_unlock_bh(&mm->context.list_lock); + if (table) + return table; + } + /* Allocate a fresh page */ + page = alloc_page(GFP_KERNEL|__GFP_REPEAT); + if (!page) + return NULL; + if (!pgtable_page_ctor(page)) { + __free_page(page); + return NULL; + } + /* Initialize page table */ + table = (unsigned long *) page_to_phys(page); + if (mm_alloc_pgste(mm)) { + /* Return 4K page table with PGSTEs */ + atomic_set(&page->_mapcount, 3); + clear_table(table, _PAGE_INVALID, PAGE_SIZE/2); + clear_table(table + PTRS_PER_PTE, 0, PAGE_SIZE/2); + } else { + /* Return the first 2K fragment of the page */ + atomic_set(&page->_mapcount, 1); + clear_table(table, _PAGE_INVALID, PAGE_SIZE); + spin_lock_bh(&mm->context.list_lock); + list_add(&page->lru, &mm->context.pgtable_list); + spin_unlock_bh(&mm->context.list_lock); + } + return table; +} + +void page_table_free(struct mm_struct *mm, unsigned long *table) +{ + struct page *page; + unsigned int bit, mask; + + page = pfn_to_page(__pa(table) >> PAGE_SHIFT); + if (!mm_alloc_pgste(mm)) { + /* Free 2K page table fragment of a 4K page */ + bit = (__pa(table) & ~PAGE_MASK)/(PTRS_PER_PTE*sizeof(pte_t)); + spin_lock_bh(&mm->context.list_lock); + mask = atomic_xor_bits(&page->_mapcount, 1U << bit); + if (mask & 3) + list_add(&page->lru, &mm->context.pgtable_list); + else + list_del(&page->lru); + spin_unlock_bh(&mm->context.list_lock); + if (mask != 0) + return; + } + + pgtable_page_dtor(page); + atomic_set(&page->_mapcount, -1); + __free_page(page); +} + +void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table, + unsigned long vmaddr) +{ + struct mm_struct *mm; + struct page *page; + unsigned int bit, mask; + + mm = tlb->mm; + page = pfn_to_page(__pa(table) >> PAGE_SHIFT); + if (mm_alloc_pgste(mm)) { + gmap_unlink(mm, table, vmaddr); + table = (unsigned long *) (__pa(table) | 3); + tlb_remove_table(tlb, table); + return; + } + bit = (__pa(table) & ~PAGE_MASK) / (PTRS_PER_PTE*sizeof(pte_t)); + spin_lock_bh(&mm->context.list_lock); + mask = atomic_xor_bits(&page->_mapcount, 0x11U << bit); + if (mask & 3) + list_add_tail(&page->lru, &mm->context.pgtable_list); + else + list_del(&page->lru); + spin_unlock_bh(&mm->context.list_lock); + table = (unsigned long *) (__pa(table) | (1U << bit)); + tlb_remove_table(tlb, table); +} + +static void __tlb_remove_table(void *_table) +{ + unsigned int mask = (unsigned long) _table & 3; + void *table = (void *)((unsigned long) _table ^ mask); + struct page *page = pfn_to_page(__pa(table) >> PAGE_SHIFT); + + switch (mask) { + case 0: /* pmd or pud */ + free_pages((unsigned long) table, 2); + break; + case 1: /* lower 2K of a 4K page table */ + case 2: /* higher 2K of a 4K page table */ + if (atomic_xor_bits(&page->_mapcount, mask << 4) != 0) + break; + /* fallthrough */ + case 3: /* 4K page table with pgstes */ + pgtable_page_dtor(page); + atomic_set(&page->_mapcount, -1); + __free_page(page); + break; + } +} + +static void tlb_remove_table_smp_sync(void *arg) +{ + /* Simply deliver the interrupt */ +} + +static void tlb_remove_table_one(void *table) +{ + /* + * This isn't an RCU grace period and hence the page-tables cannot be + * assumed to be actually RCU-freed. + * + * It is however sufficient for software page-table walkers that rely + * on IRQ disabling. See the comment near struct mmu_table_batch. + */ + smp_call_function(tlb_remove_table_smp_sync, NULL, 1); + __tlb_remove_table(table); +} + +static void tlb_remove_table_rcu(struct rcu_head *head) +{ + struct mmu_table_batch *batch; + int i; + + batch = container_of(head, struct mmu_table_batch, rcu); + + for (i = 0; i < batch->nr; i++) + __tlb_remove_table(batch->tables[i]); + + free_page((unsigned long)batch); +} + +void tlb_table_flush(struct mmu_gather *tlb) +{ + struct mmu_table_batch **batch = &tlb->batch; + + if (*batch) { + call_rcu_sched(&(*batch)->rcu, tlb_remove_table_rcu); + *batch = NULL; + } +} + +void tlb_remove_table(struct mmu_gather *tlb, void *table) +{ + struct mmu_table_batch **batch = &tlb->batch; + + tlb->mm->context.flush_mm = 1; + if (*batch == NULL) { + *batch = (struct mmu_table_batch *) + __get_free_page(GFP_NOWAIT | __GFP_NOWARN); + if (*batch == NULL) { + __tlb_flush_mm_lazy(tlb->mm); + tlb_remove_table_one(table); + return; + } + (*batch)->nr = 0; + } + (*batch)->tables[(*batch)->nr++] = table; + if ((*batch)->nr == MAX_TABLE_BATCH) + tlb_flush_mmu(tlb); +} diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c index e24126208614..4324b87f9398 100644 --- a/arch/s390/mm/pgtable.c +++ b/arch/s390/mm/pgtable.c @@ -24,1140 +24,6 @@ #include #include -unsigned long *crst_table_alloc(struct mm_struct *mm) -{ - struct page *page = alloc_pages(GFP_KERNEL, 2); - - if (!page) - return NULL; - return (unsigned long *) page_to_phys(page); -} - -void crst_table_free(struct mm_struct *mm, unsigned long *table) -{ - free_pages((unsigned long) table, 2); -} - -static void __crst_table_upgrade(void *arg) -{ - struct mm_struct *mm = arg; - - if (current->active_mm == mm) { - clear_user_asce(); - set_user_asce(mm); - } - __tlb_flush_local(); -} - -int crst_table_upgrade(struct mm_struct *mm, unsigned long limit) -{ - unsigned long *table, *pgd; - unsigned long entry; - int flush; - - BUG_ON(limit > TASK_MAX_SIZE); - flush = 0; -repeat: - table = crst_table_alloc(mm); - if (!table) - return -ENOMEM; - spin_lock_bh(&mm->page_table_lock); - if (mm->context.asce_limit < limit) { - pgd = (unsigned long *) mm->pgd; - if (mm->context.asce_limit <= (1UL << 31)) { - entry = _REGION3_ENTRY_EMPTY; - mm->context.asce_limit = 1UL << 42; - mm->context.asce_bits = _ASCE_TABLE_LENGTH | - _ASCE_USER_BITS | - _ASCE_TYPE_REGION3; - } else { - entry = _REGION2_ENTRY_EMPTY; - mm->context.asce_limit = 1UL << 53; - mm->context.asce_bits = _ASCE_TABLE_LENGTH | - _ASCE_USER_BITS | - _ASCE_TYPE_REGION2; - } - crst_table_init(table, entry); - pgd_populate(mm, (pgd_t *) table, (pud_t *) pgd); - mm->pgd = (pgd_t *) table; - mm->task_size = mm->context.asce_limit; - table = NULL; - flush = 1; - } - spin_unlock_bh(&mm->page_table_lock); - if (table) - crst_table_free(mm, table); - if (mm->context.asce_limit < limit) - goto repeat; - if (flush) - on_each_cpu(__crst_table_upgrade, mm, 0); - return 0; -} - -void crst_table_downgrade(struct mm_struct *mm, unsigned long limit) -{ - pgd_t *pgd; - - if (current->active_mm == mm) { - clear_user_asce(); - __tlb_flush_mm(mm); - } - while (mm->context.asce_limit > limit) { - pgd = mm->pgd; - switch (pgd_val(*pgd) & _REGION_ENTRY_TYPE_MASK) { - case _REGION_ENTRY_TYPE_R2: - mm->context.asce_limit = 1UL << 42; - mm->context.asce_bits = _ASCE_TABLE_LENGTH | - _ASCE_USER_BITS | - _ASCE_TYPE_REGION3; - break; - case _REGION_ENTRY_TYPE_R3: - mm->context.asce_limit = 1UL << 31; - mm->context.asce_bits = _ASCE_TABLE_LENGTH | - _ASCE_USER_BITS | - _ASCE_TYPE_SEGMENT; - break; - default: - BUG(); - } - mm->pgd = (pgd_t *) (pgd_val(*pgd) & _REGION_ENTRY_ORIGIN); - mm->task_size = mm->context.asce_limit; - crst_table_free(mm, (unsigned long *) pgd); - } - if (current->active_mm == mm) - set_user_asce(mm); -} - -#ifdef CONFIG_PGSTE - -/** - * gmap_alloc - allocate a guest address space - * @mm: pointer to the parent mm_struct - * @limit: maximum address of the gmap address space - * - * Returns a guest address space structure. - */ -struct gmap *gmap_alloc(struct mm_struct *mm, unsigned long limit) -{ - struct gmap *gmap; - struct page *page; - unsigned long *table; - unsigned long etype, atype; - - if (limit < (1UL << 31)) { - limit = (1UL << 31) - 1; - atype = _ASCE_TYPE_SEGMENT; - etype = _SEGMENT_ENTRY_EMPTY; - } else if (limit < (1UL << 42)) { - limit = (1UL << 42) - 1; - atype = _ASCE_TYPE_REGION3; - etype = _REGION3_ENTRY_EMPTY; - } else if (limit < (1UL << 53)) { - limit = (1UL << 53) - 1; - atype = _ASCE_TYPE_REGION2; - etype = _REGION2_ENTRY_EMPTY; - } else { - limit = -1UL; - atype = _ASCE_TYPE_REGION1; - etype = _REGION1_ENTRY_EMPTY; - } - gmap = kzalloc(sizeof(struct gmap), GFP_KERNEL); - if (!gmap) - goto out; - INIT_LIST_HEAD(&gmap->crst_list); - INIT_RADIX_TREE(&gmap->guest_to_host, GFP_KERNEL); - INIT_RADIX_TREE(&gmap->host_to_guest, GFP_ATOMIC); - spin_lock_init(&gmap->guest_table_lock); - gmap->mm = mm; - page = alloc_pages(GFP_KERNEL, 2); - if (!page) - goto out_free; - page->index = 0; - list_add(&page->lru, &gmap->crst_list); - table = (unsigned long *) page_to_phys(page); - crst_table_init(table, etype); - gmap->table = table; - gmap->asce = atype | _ASCE_TABLE_LENGTH | - _ASCE_USER_BITS | __pa(table); - gmap->asce_end = limit; - down_write(&mm->mmap_sem); - list_add(&gmap->list, &mm->context.gmap_list); - up_write(&mm->mmap_sem); - return gmap; - -out_free: - kfree(gmap); -out: - return NULL; -} -EXPORT_SYMBOL_GPL(gmap_alloc); - -static void gmap_flush_tlb(struct gmap *gmap) -{ - if (MACHINE_HAS_IDTE) - __tlb_flush_asce(gmap->mm, gmap->asce); - else - __tlb_flush_global(); -} - -static void gmap_radix_tree_free(struct radix_tree_root *root) -{ - struct radix_tree_iter iter; - unsigned long indices[16]; - unsigned long index; - void **slot; - int i, nr; - - /* A radix tree is freed by deleting all of its entries */ - index = 0; - do { - nr = 0; - radix_tree_for_each_slot(slot, root, &iter, index) { - indices[nr] = iter.index; - if (++nr == 16) - break; - } - for (i = 0; i < nr; i++) { - index = indices[i]; - radix_tree_delete(root, index); - } - } while (nr > 0); -} - -/** - * gmap_free - free a guest address space - * @gmap: pointer to the guest address space structure - */ -void gmap_free(struct gmap *gmap) -{ - struct page *page, *next; - - /* Flush tlb. */ - if (MACHINE_HAS_IDTE) - __tlb_flush_asce(gmap->mm, gmap->asce); - else - __tlb_flush_global(); - - /* Free all segment & region tables. */ - list_for_each_entry_safe(page, next, &gmap->crst_list, lru) - __free_pages(page, 2); - gmap_radix_tree_free(&gmap->guest_to_host); - gmap_radix_tree_free(&gmap->host_to_guest); - down_write(&gmap->mm->mmap_sem); - list_del(&gmap->list); - up_write(&gmap->mm->mmap_sem); - kfree(gmap); -} -EXPORT_SYMBOL_GPL(gmap_free); - -/** - * gmap_enable - switch primary space to the guest address space - * @gmap: pointer to the guest address space structure - */ -void gmap_enable(struct gmap *gmap) -{ - S390_lowcore.gmap = (unsigned long) gmap; -} -EXPORT_SYMBOL_GPL(gmap_enable); - -/** - * gmap_disable - switch back to the standard primary address space - * @gmap: pointer to the guest address space structure - */ -void gmap_disable(struct gmap *gmap) -{ - S390_lowcore.gmap = 0UL; -} -EXPORT_SYMBOL_GPL(gmap_disable); - -/* - * gmap_alloc_table is assumed to be called with mmap_sem held - */ -static int gmap_alloc_table(struct gmap *gmap, unsigned long *table, - unsigned long init, unsigned long gaddr) -{ - struct page *page; - unsigned long *new; - - /* since we dont free the gmap table until gmap_free we can unlock */ - page = alloc_pages(GFP_KERNEL, 2); - if (!page) - return -ENOMEM; - new = (unsigned long *) page_to_phys(page); - crst_table_init(new, init); - spin_lock(&gmap->mm->page_table_lock); - if (*table & _REGION_ENTRY_INVALID) { - list_add(&page->lru, &gmap->crst_list); - *table = (unsigned long) new | _REGION_ENTRY_LENGTH | - (*table & _REGION_ENTRY_TYPE_MASK); - page->index = gaddr; - page = NULL; - } - spin_unlock(&gmap->mm->page_table_lock); - if (page) - __free_pages(page, 2); - return 0; -} - -/** - * __gmap_segment_gaddr - find virtual address from segment pointer - * @entry: pointer to a segment table entry in the guest address space - * - * Returns the virtual address in the guest address space for the segment - */ -static unsigned long __gmap_segment_gaddr(unsigned long *entry) -{ - struct page *page; - unsigned long offset, mask; - - offset = (unsigned long) entry / sizeof(unsigned long); - offset = (offset & (PTRS_PER_PMD - 1)) * PMD_SIZE; - mask = ~(PTRS_PER_PMD * sizeof(pmd_t) - 1); - page = virt_to_page((void *)((unsigned long) entry & mask)); - return page->index + offset; -} - -/** - * __gmap_unlink_by_vmaddr - unlink a single segment via a host address - * @gmap: pointer to the guest address space structure - * @vmaddr: address in the host process address space - * - * Returns 1 if a TLB flush is required - */ -static int __gmap_unlink_by_vmaddr(struct gmap *gmap, unsigned long vmaddr) -{ - unsigned long *entry; - int flush = 0; - - spin_lock(&gmap->guest_table_lock); - entry = radix_tree_delete(&gmap->host_to_guest, vmaddr >> PMD_SHIFT); - if (entry) { - flush = (*entry != _SEGMENT_ENTRY_INVALID); - *entry = _SEGMENT_ENTRY_INVALID; - } - spin_unlock(&gmap->guest_table_lock); - return flush; -} - -/** - * __gmap_unmap_by_gaddr - unmap a single segment via a guest address - * @gmap: pointer to the guest address space structure - * @gaddr: address in the guest address space - * - * Returns 1 if a TLB flush is required - */ -static int __gmap_unmap_by_gaddr(struct gmap *gmap, unsigned long gaddr) -{ - unsigned long vmaddr; - - vmaddr = (unsigned long) radix_tree_delete(&gmap->guest_to_host, - gaddr >> PMD_SHIFT); - return vmaddr ? __gmap_unlink_by_vmaddr(gmap, vmaddr) : 0; -} - -/** - * gmap_unmap_segment - unmap segment from the guest address space - * @gmap: pointer to the guest address space structure - * @to: address in the guest address space - * @len: length of the memory area to unmap - * - * Returns 0 if the unmap succeeded, -EINVAL if not. - */ -int gmap_unmap_segment(struct gmap *gmap, unsigned long to, unsigned long len) -{ - unsigned long off; - int flush; - - if ((to | len) & (PMD_SIZE - 1)) - return -EINVAL; - if (len == 0 || to + len < to) - return -EINVAL; - - flush = 0; - down_write(&gmap->mm->mmap_sem); - for (off = 0; off < len; off += PMD_SIZE) - flush |= __gmap_unmap_by_gaddr(gmap, to + off); - up_write(&gmap->mm->mmap_sem); - if (flush) - gmap_flush_tlb(gmap); - return 0; -} -EXPORT_SYMBOL_GPL(gmap_unmap_segment); - -/** - * gmap_mmap_segment - map a segment to the guest address space - * @gmap: pointer to the guest address space structure - * @from: source address in the parent address space - * @to: target address in the guest address space - * @len: length of the memory area to map - * - * Returns 0 if the mmap succeeded, -EINVAL or -ENOMEM if not. - */ -int gmap_map_segment(struct gmap *gmap, unsigned long from, - unsigned long to, unsigned long len) -{ - unsigned long off; - int flush; - - if ((from | to | len) & (PMD_SIZE - 1)) - return -EINVAL; - if (len == 0 || from + len < from || to + len < to || - from + len - 1 > TASK_MAX_SIZE || to + len - 1 > gmap->asce_end) - return -EINVAL; - - flush = 0; - down_write(&gmap->mm->mmap_sem); - for (off = 0; off < len; off += PMD_SIZE) { - /* Remove old translation */ - flush |= __gmap_unmap_by_gaddr(gmap, to + off); - /* Store new translation */ - if (radix_tree_insert(&gmap->guest_to_host, - (to + off) >> PMD_SHIFT, - (void *) from + off)) - break; - } - up_write(&gmap->mm->mmap_sem); - if (flush) - gmap_flush_tlb(gmap); - if (off >= len) - return 0; - gmap_unmap_segment(gmap, to, len); - return -ENOMEM; -} -EXPORT_SYMBOL_GPL(gmap_map_segment); - -/** - * __gmap_translate - translate a guest address to a user space address - * @gmap: pointer to guest mapping meta data structure - * @gaddr: guest address - * - * Returns user space address which corresponds to the guest address or - * -EFAULT if no such mapping exists. - * This function does not establish potentially missing page table entries. - * The mmap_sem of the mm that belongs to the address space must be held - * when this function gets called. - */ -unsigned long __gmap_translate(struct gmap *gmap, unsigned long gaddr) -{ - unsigned long vmaddr; - - vmaddr = (unsigned long) - radix_tree_lookup(&gmap->guest_to_host, gaddr >> PMD_SHIFT); - return vmaddr ? (vmaddr | (gaddr & ~PMD_MASK)) : -EFAULT; -} -EXPORT_SYMBOL_GPL(__gmap_translate); - -/** - * gmap_translate - translate a guest address to a user space address - * @gmap: pointer to guest mapping meta data structure - * @gaddr: guest address - * - * Returns user space address which corresponds to the guest address or - * -EFAULT if no such mapping exists. - * This function does not establish potentially missing page table entries. - */ -unsigned long gmap_translate(struct gmap *gmap, unsigned long gaddr) -{ - unsigned long rc; - - down_read(&gmap->mm->mmap_sem); - rc = __gmap_translate(gmap, gaddr); - up_read(&gmap->mm->mmap_sem); - return rc; -} -EXPORT_SYMBOL_GPL(gmap_translate); - -/** - * gmap_unlink - disconnect a page table from the gmap shadow tables - * @gmap: pointer to guest mapping meta data structure - * @table: pointer to the host page table - * @vmaddr: vm address associated with the host page table - */ -static void gmap_unlink(struct mm_struct *mm, unsigned long *table, - unsigned long vmaddr) -{ - struct gmap *gmap; - int flush; - - list_for_each_entry(gmap, &mm->context.gmap_list, list) { - flush = __gmap_unlink_by_vmaddr(gmap, vmaddr); - if (flush) - gmap_flush_tlb(gmap); - } -} - -/** - * gmap_link - set up shadow page tables to connect a host to a guest address - * @gmap: pointer to guest mapping meta data structure - * @gaddr: guest address - * @vmaddr: vm address - * - * Returns 0 on success, -ENOMEM for out of memory conditions, and -EFAULT - * if the vm address is already mapped to a different guest segment. - * The mmap_sem of the mm that belongs to the address space must be held - * when this function gets called. - */ -int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr) -{ - struct mm_struct *mm; - unsigned long *table; - spinlock_t *ptl; - pgd_t *pgd; - pud_t *pud; - pmd_t *pmd; - int rc; - - /* Create higher level tables in the gmap page table */ - table = gmap->table; - if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION1) { - table += (gaddr >> 53) & 0x7ff; - if ((*table & _REGION_ENTRY_INVALID) && - gmap_alloc_table(gmap, table, _REGION2_ENTRY_EMPTY, - gaddr & 0xffe0000000000000UL)) - return -ENOMEM; - table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); - } - if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION2) { - table += (gaddr >> 42) & 0x7ff; - if ((*table & _REGION_ENTRY_INVALID) && - gmap_alloc_table(gmap, table, _REGION3_ENTRY_EMPTY, - gaddr & 0xfffffc0000000000UL)) - return -ENOMEM; - table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); - } - if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION3) { - table += (gaddr >> 31) & 0x7ff; - if ((*table & _REGION_ENTRY_INVALID) && - gmap_alloc_table(gmap, table, _SEGMENT_ENTRY_EMPTY, - gaddr & 0xffffffff80000000UL)) - return -ENOMEM; - table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); - } - table += (gaddr >> 20) & 0x7ff; - /* Walk the parent mm page table */ - mm = gmap->mm; - pgd = pgd_offset(mm, vmaddr); - VM_BUG_ON(pgd_none(*pgd)); - pud = pud_offset(pgd, vmaddr); - VM_BUG_ON(pud_none(*pud)); - pmd = pmd_offset(pud, vmaddr); - VM_BUG_ON(pmd_none(*pmd)); - /* large pmds cannot yet be handled */ - if (pmd_large(*pmd)) - return -EFAULT; - /* Link gmap segment table entry location to page table. */ - rc = radix_tree_preload(GFP_KERNEL); - if (rc) - return rc; - ptl = pmd_lock(mm, pmd); - spin_lock(&gmap->guest_table_lock); - if (*table == _SEGMENT_ENTRY_INVALID) { - rc = radix_tree_insert(&gmap->host_to_guest, - vmaddr >> PMD_SHIFT, table); - if (!rc) - *table = pmd_val(*pmd); - } else - rc = 0; - spin_unlock(&gmap->guest_table_lock); - spin_unlock(ptl); - radix_tree_preload_end(); - return rc; -} - -/** - * gmap_fault - resolve a fault on a guest address - * @gmap: pointer to guest mapping meta data structure - * @gaddr: guest address - * @fault_flags: flags to pass down to handle_mm_fault() - * - * Returns 0 on success, -ENOMEM for out of memory conditions, and -EFAULT - * if the vm address is already mapped to a different guest segment. - */ -int gmap_fault(struct gmap *gmap, unsigned long gaddr, - unsigned int fault_flags) -{ - unsigned long vmaddr; - int rc; - bool unlocked; - - down_read(&gmap->mm->mmap_sem); - -retry: - unlocked = false; - vmaddr = __gmap_translate(gmap, gaddr); - if (IS_ERR_VALUE(vmaddr)) { - rc = vmaddr; - goto out_up; - } - if (fixup_user_fault(current, gmap->mm, vmaddr, fault_flags, - &unlocked)) { - rc = -EFAULT; - goto out_up; - } - /* - * In the case that fixup_user_fault unlocked the mmap_sem during - * faultin redo __gmap_translate to not race with a map/unmap_segment. - */ - if (unlocked) - goto retry; - - rc = __gmap_link(gmap, gaddr, vmaddr); -out_up: - up_read(&gmap->mm->mmap_sem); - return rc; -} -EXPORT_SYMBOL_GPL(gmap_fault); - -static void gmap_zap_swap_entry(swp_entry_t entry, struct mm_struct *mm) -{ - if (!non_swap_entry(entry)) - dec_mm_counter(mm, MM_SWAPENTS); - else if (is_migration_entry(entry)) { - struct page *page = migration_entry_to_page(entry); - - dec_mm_counter(mm, mm_counter(page)); - } - free_swap_and_cache(entry); -} - -/* - * this function is assumed to be called with mmap_sem held - */ -void __gmap_zap(struct gmap *gmap, unsigned long gaddr) -{ - unsigned long vmaddr, ptev, pgstev; - pte_t *ptep, pte; - spinlock_t *ptl; - pgste_t pgste; - - /* Find the vm address for the guest address */ - vmaddr = (unsigned long) radix_tree_lookup(&gmap->guest_to_host, - gaddr >> PMD_SHIFT); - if (!vmaddr) - return; - vmaddr |= gaddr & ~PMD_MASK; - /* Get pointer to the page table entry */ - ptep = get_locked_pte(gmap->mm, vmaddr, &ptl); - if (unlikely(!ptep)) - return; - pte = *ptep; - if (!pte_swap(pte)) - goto out_pte; - /* Zap unused and logically-zero pages */ - pgste = pgste_get_lock(ptep); - pgstev = pgste_val(pgste); - ptev = pte_val(pte); - if (((pgstev & _PGSTE_GPS_USAGE_MASK) == _PGSTE_GPS_USAGE_UNUSED) || - ((pgstev & _PGSTE_GPS_ZERO) && (ptev & _PAGE_INVALID))) { - gmap_zap_swap_entry(pte_to_swp_entry(pte), gmap->mm); - pte_clear(gmap->mm, vmaddr, ptep); - } - pgste_set_unlock(ptep, pgste); -out_pte: - pte_unmap_unlock(ptep, ptl); -} -EXPORT_SYMBOL_GPL(__gmap_zap); - -void gmap_discard(struct gmap *gmap, unsigned long from, unsigned long to) -{ - unsigned long gaddr, vmaddr, size; - struct vm_area_struct *vma; - - down_read(&gmap->mm->mmap_sem); - for (gaddr = from; gaddr < to; - gaddr = (gaddr + PMD_SIZE) & PMD_MASK) { - /* Find the vm address for the guest address */ - vmaddr = (unsigned long) - radix_tree_lookup(&gmap->guest_to_host, - gaddr >> PMD_SHIFT); - if (!vmaddr) - continue; - vmaddr |= gaddr & ~PMD_MASK; - /* Find vma in the parent mm */ - vma = find_vma(gmap->mm, vmaddr); - size = min(to - gaddr, PMD_SIZE - (gaddr & ~PMD_MASK)); - zap_page_range(vma, vmaddr, size, NULL); - } - up_read(&gmap->mm->mmap_sem); -} -EXPORT_SYMBOL_GPL(gmap_discard); - -static LIST_HEAD(gmap_notifier_list); -static DEFINE_SPINLOCK(gmap_notifier_lock); - -/** - * gmap_register_ipte_notifier - register a pte invalidation callback - * @nb: pointer to the gmap notifier block - */ -void gmap_register_ipte_notifier(struct gmap_notifier *nb) -{ - spin_lock(&gmap_notifier_lock); - list_add(&nb->list, &gmap_notifier_list); - spin_unlock(&gmap_notifier_lock); -} -EXPORT_SYMBOL_GPL(gmap_register_ipte_notifier); - -/** - * gmap_unregister_ipte_notifier - remove a pte invalidation callback - * @nb: pointer to the gmap notifier block - */ -void gmap_unregister_ipte_notifier(struct gmap_notifier *nb) -{ - spin_lock(&gmap_notifier_lock); - list_del_init(&nb->list); - spin_unlock(&gmap_notifier_lock); -} -EXPORT_SYMBOL_GPL(gmap_unregister_ipte_notifier); - -/** - * gmap_ipte_notify - mark a range of ptes for invalidation notification - * @gmap: pointer to guest mapping meta data structure - * @gaddr: virtual address in the guest address space - * @len: size of area - * - * Returns 0 if for each page in the given range a gmap mapping exists and - * the invalidation notification could be set. If the gmap mapping is missing - * for one or more pages -EFAULT is returned. If no memory could be allocated - * -ENOMEM is returned. This function establishes missing page table entries. - */ -int gmap_ipte_notify(struct gmap *gmap, unsigned long gaddr, unsigned long len) -{ - unsigned long addr; - spinlock_t *ptl; - pte_t *ptep, entry; - pgste_t pgste; - bool unlocked; - int rc = 0; - - if ((gaddr & ~PAGE_MASK) || (len & ~PAGE_MASK)) - return -EINVAL; - down_read(&gmap->mm->mmap_sem); - while (len) { - unlocked = false; - /* Convert gmap address and connect the page tables */ - addr = __gmap_translate(gmap, gaddr); - if (IS_ERR_VALUE(addr)) { - rc = addr; - break; - } - /* Get the page mapped */ - if (fixup_user_fault(current, gmap->mm, addr, FAULT_FLAG_WRITE, - &unlocked)) { - rc = -EFAULT; - break; - } - /* While trying to map mmap_sem got unlocked. Let us retry */ - if (unlocked) - continue; - rc = __gmap_link(gmap, gaddr, addr); - if (rc) - break; - /* Walk the process page table, lock and get pte pointer */ - ptep = get_locked_pte(gmap->mm, addr, &ptl); - VM_BUG_ON(!ptep); - /* Set notification bit in the pgste of the pte */ - entry = *ptep; - if ((pte_val(entry) & (_PAGE_INVALID | _PAGE_PROTECT)) == 0) { - pgste = pgste_get_lock(ptep); - pgste_val(pgste) |= PGSTE_IN_BIT; - pgste_set_unlock(ptep, pgste); - gaddr += PAGE_SIZE; - len -= PAGE_SIZE; - } - pte_unmap_unlock(ptep, ptl); - } - up_read(&gmap->mm->mmap_sem); - return rc; -} -EXPORT_SYMBOL_GPL(gmap_ipte_notify); - -/** - * ptep_ipte_notify - call all invalidation callbacks for a specific pte. - * @mm: pointer to the process mm_struct - * @addr: virtual address in the process address space - * @pte: pointer to the page table entry - * - * This function is assumed to be called with the page table lock held - * for the pte to notify. - */ -void ptep_ipte_notify(struct mm_struct *mm, unsigned long vmaddr, pte_t *pte) -{ - unsigned long offset, gaddr; - unsigned long *table; - struct gmap_notifier *nb; - struct gmap *gmap; - - offset = ((unsigned long) pte) & (255 * sizeof(pte_t)); - offset = offset * (4096 / sizeof(pte_t)); - spin_lock(&gmap_notifier_lock); - list_for_each_entry(gmap, &mm->context.gmap_list, list) { - table = radix_tree_lookup(&gmap->host_to_guest, - vmaddr >> PMD_SHIFT); - if (!table) - continue; - gaddr = __gmap_segment_gaddr(table) + offset; - list_for_each_entry(nb, &gmap_notifier_list, list) - nb->notifier_call(gmap, gaddr); - } - spin_unlock(&gmap_notifier_lock); -} -EXPORT_SYMBOL_GPL(ptep_ipte_notify); - -int set_guest_storage_key(struct mm_struct *mm, unsigned long addr, - unsigned long key, bool nq) -{ - spinlock_t *ptl; - pgste_t old, new; - pte_t *ptep; - - down_read(&mm->mmap_sem); - ptep = get_locked_pte(mm, addr, &ptl); - if (unlikely(!ptep)) { - up_read(&mm->mmap_sem); - return -EFAULT; - } - - new = old = pgste_get_lock(ptep); - pgste_val(new) &= ~(PGSTE_GR_BIT | PGSTE_GC_BIT | - PGSTE_ACC_BITS | PGSTE_FP_BIT); - pgste_val(new) |= (key & (_PAGE_CHANGED | _PAGE_REFERENCED)) << 48; - pgste_val(new) |= (key & (_PAGE_ACC_BITS | _PAGE_FP_BIT)) << 56; - if (!(pte_val(*ptep) & _PAGE_INVALID)) { - unsigned long address, bits, skey; - - address = pte_val(*ptep) & PAGE_MASK; - skey = (unsigned long) page_get_storage_key(address); - bits = skey & (_PAGE_CHANGED | _PAGE_REFERENCED); - skey = key & (_PAGE_ACC_BITS | _PAGE_FP_BIT); - /* Set storage key ACC and FP */ - page_set_storage_key(address, skey, !nq); - /* Merge host changed & referenced into pgste */ - pgste_val(new) |= bits << 52; - } - /* changing the guest storage key is considered a change of the page */ - if ((pgste_val(new) ^ pgste_val(old)) & - (PGSTE_ACC_BITS | PGSTE_FP_BIT | PGSTE_GR_BIT | PGSTE_GC_BIT)) - pgste_val(new) |= PGSTE_UC_BIT; - - pgste_set_unlock(ptep, new); - pte_unmap_unlock(ptep, ptl); - up_read(&mm->mmap_sem); - return 0; -} -EXPORT_SYMBOL(set_guest_storage_key); - -unsigned long get_guest_storage_key(struct mm_struct *mm, unsigned long addr) -{ - spinlock_t *ptl; - pgste_t pgste; - pte_t *ptep; - uint64_t physaddr; - unsigned long key = 0; - - down_read(&mm->mmap_sem); - ptep = get_locked_pte(mm, addr, &ptl); - if (unlikely(!ptep)) { - up_read(&mm->mmap_sem); - return -EFAULT; - } - pgste = pgste_get_lock(ptep); - - if (pte_val(*ptep) & _PAGE_INVALID) { - key |= (pgste_val(pgste) & PGSTE_ACC_BITS) >> 56; - key |= (pgste_val(pgste) & PGSTE_FP_BIT) >> 56; - key |= (pgste_val(pgste) & PGSTE_GR_BIT) >> 48; - key |= (pgste_val(pgste) & PGSTE_GC_BIT) >> 48; - } else { - physaddr = pte_val(*ptep) & PAGE_MASK; - key = page_get_storage_key(physaddr); - - /* Reflect guest's logical view, not physical */ - if (pgste_val(pgste) & PGSTE_GR_BIT) - key |= _PAGE_REFERENCED; - if (pgste_val(pgste) & PGSTE_GC_BIT) - key |= _PAGE_CHANGED; - } - - pgste_set_unlock(ptep, pgste); - pte_unmap_unlock(ptep, ptl); - up_read(&mm->mmap_sem); - return key; -} -EXPORT_SYMBOL(get_guest_storage_key); - -static int page_table_allocate_pgste_min = 0; -static int page_table_allocate_pgste_max = 1; -int page_table_allocate_pgste = 0; -EXPORT_SYMBOL(page_table_allocate_pgste); - -static struct ctl_table page_table_sysctl[] = { - { - .procname = "allocate_pgste", - .data = &page_table_allocate_pgste, - .maxlen = sizeof(int), - .mode = S_IRUGO | S_IWUSR, - .proc_handler = proc_dointvec, - .extra1 = &page_table_allocate_pgste_min, - .extra2 = &page_table_allocate_pgste_max, - }, - { } -}; - -static struct ctl_table page_table_sysctl_dir[] = { - { - .procname = "vm", - .maxlen = 0, - .mode = 0555, - .child = page_table_sysctl, - }, - { } -}; - -static int __init page_table_register_sysctl(void) -{ - return register_sysctl_table(page_table_sysctl_dir) ? 0 : -ENOMEM; -} -__initcall(page_table_register_sysctl); - -#else /* CONFIG_PGSTE */ - -static inline void gmap_unlink(struct mm_struct *mm, unsigned long *table, - unsigned long vmaddr) -{ -} - -#endif /* CONFIG_PGSTE */ - -static inline unsigned int atomic_xor_bits(atomic_t *v, unsigned int bits) -{ - unsigned int old, new; - - do { - old = atomic_read(v); - new = old ^ bits; - } while (atomic_cmpxchg(v, old, new) != old); - return new; -} - -/* - * page table entry allocation/free routines. - */ -unsigned long *page_table_alloc(struct mm_struct *mm) -{ - unsigned long *table; - struct page *page; - unsigned int mask, bit; - - /* Try to get a fragment of a 4K page as a 2K page table */ - if (!mm_alloc_pgste(mm)) { - table = NULL; - spin_lock_bh(&mm->context.list_lock); - if (!list_empty(&mm->context.pgtable_list)) { - page = list_first_entry(&mm->context.pgtable_list, - struct page, lru); - mask = atomic_read(&page->_mapcount); - mask = (mask | (mask >> 4)) & 3; - if (mask != 3) { - table = (unsigned long *) page_to_phys(page); - bit = mask & 1; /* =1 -> second 2K */ - if (bit) - table += PTRS_PER_PTE; - atomic_xor_bits(&page->_mapcount, 1U << bit); - list_del(&page->lru); - } - } - spin_unlock_bh(&mm->context.list_lock); - if (table) - return table; - } - /* Allocate a fresh page */ - page = alloc_page(GFP_KERNEL|__GFP_REPEAT); - if (!page) - return NULL; - if (!pgtable_page_ctor(page)) { - __free_page(page); - return NULL; - } - /* Initialize page table */ - table = (unsigned long *) page_to_phys(page); - if (mm_alloc_pgste(mm)) { - /* Return 4K page table with PGSTEs */ - atomic_set(&page->_mapcount, 3); - clear_table(table, _PAGE_INVALID, PAGE_SIZE/2); - clear_table(table + PTRS_PER_PTE, 0, PAGE_SIZE/2); - } else { - /* Return the first 2K fragment of the page */ - atomic_set(&page->_mapcount, 1); - clear_table(table, _PAGE_INVALID, PAGE_SIZE); - spin_lock_bh(&mm->context.list_lock); - list_add(&page->lru, &mm->context.pgtable_list); - spin_unlock_bh(&mm->context.list_lock); - } - return table; -} - -void page_table_free(struct mm_struct *mm, unsigned long *table) -{ - struct page *page; - unsigned int bit, mask; - - page = pfn_to_page(__pa(table) >> PAGE_SHIFT); - if (!mm_alloc_pgste(mm)) { - /* Free 2K page table fragment of a 4K page */ - bit = (__pa(table) & ~PAGE_MASK)/(PTRS_PER_PTE*sizeof(pte_t)); - spin_lock_bh(&mm->context.list_lock); - mask = atomic_xor_bits(&page->_mapcount, 1U << bit); - if (mask & 3) - list_add(&page->lru, &mm->context.pgtable_list); - else - list_del(&page->lru); - spin_unlock_bh(&mm->context.list_lock); - if (mask != 0) - return; - } - - pgtable_page_dtor(page); - atomic_set(&page->_mapcount, -1); - __free_page(page); -} - -void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table, - unsigned long vmaddr) -{ - struct mm_struct *mm; - struct page *page; - unsigned int bit, mask; - - mm = tlb->mm; - page = pfn_to_page(__pa(table) >> PAGE_SHIFT); - if (mm_alloc_pgste(mm)) { - gmap_unlink(mm, table, vmaddr); - table = (unsigned long *) (__pa(table) | 3); - tlb_remove_table(tlb, table); - return; - } - bit = (__pa(table) & ~PAGE_MASK) / (PTRS_PER_PTE*sizeof(pte_t)); - spin_lock_bh(&mm->context.list_lock); - mask = atomic_xor_bits(&page->_mapcount, 0x11U << bit); - if (mask & 3) - list_add_tail(&page->lru, &mm->context.pgtable_list); - else - list_del(&page->lru); - spin_unlock_bh(&mm->context.list_lock); - table = (unsigned long *) (__pa(table) | (1U << bit)); - tlb_remove_table(tlb, table); -} - -static void __tlb_remove_table(void *_table) -{ - unsigned int mask = (unsigned long) _table & 3; - void *table = (void *)((unsigned long) _table ^ mask); - struct page *page = pfn_to_page(__pa(table) >> PAGE_SHIFT); - - switch (mask) { - case 0: /* pmd or pud */ - free_pages((unsigned long) table, 2); - break; - case 1: /* lower 2K of a 4K page table */ - case 2: /* higher 2K of a 4K page table */ - if (atomic_xor_bits(&page->_mapcount, mask << 4) != 0) - break; - /* fallthrough */ - case 3: /* 4K page table with pgstes */ - pgtable_page_dtor(page); - atomic_set(&page->_mapcount, -1); - __free_page(page); - break; - } -} - -static void tlb_remove_table_smp_sync(void *arg) -{ - /* Simply deliver the interrupt */ -} - -static void tlb_remove_table_one(void *table) -{ - /* - * This isn't an RCU grace period and hence the page-tables cannot be - * assumed to be actually RCU-freed. - * - * It is however sufficient for software page-table walkers that rely - * on IRQ disabling. See the comment near struct mmu_table_batch. - */ - smp_call_function(tlb_remove_table_smp_sync, NULL, 1); - __tlb_remove_table(table); -} - -static void tlb_remove_table_rcu(struct rcu_head *head) -{ - struct mmu_table_batch *batch; - int i; - - batch = container_of(head, struct mmu_table_batch, rcu); - - for (i = 0; i < batch->nr; i++) - __tlb_remove_table(batch->tables[i]); - - free_page((unsigned long)batch); -} - -void tlb_table_flush(struct mmu_gather *tlb) -{ - struct mmu_table_batch **batch = &tlb->batch; - - if (*batch) { - call_rcu_sched(&(*batch)->rcu, tlb_remove_table_rcu); - *batch = NULL; - } -} - -void tlb_remove_table(struct mmu_gather *tlb, void *table) -{ - struct mmu_table_batch **batch = &tlb->batch; - - tlb->mm->context.flush_mm = 1; - if (*batch == NULL) { - *batch = (struct mmu_table_batch *) - __get_free_page(GFP_NOWAIT | __GFP_NOWARN); - if (*batch == NULL) { - __tlb_flush_mm_lazy(tlb->mm); - tlb_remove_table_one(table); - return; - } - (*batch)->nr = 0; - } - (*batch)->tables[(*batch)->nr++] = table; - if ((*batch)->nr == MAX_TABLE_BATCH) - tlb_flush_mmu(tlb); -} - -#ifdef CONFIG_TRANSPARENT_HUGEPAGE -static inline void thp_split_vma(struct vm_area_struct *vma) -{ - unsigned long addr; - - for (addr = vma->vm_start; addr < vma->vm_end; addr += PAGE_SIZE) - follow_page(vma, addr, FOLL_SPLIT); -} - -static inline void thp_split_mm(struct mm_struct *mm) -{ - struct vm_area_struct *vma; - - for (vma = mm->mmap; vma != NULL; vma = vma->vm_next) { - thp_split_vma(vma); - vma->vm_flags &= ~VM_HUGEPAGE; - vma->vm_flags |= VM_NOHUGEPAGE; - } - mm->def_flags |= VM_NOHUGEPAGE; -} -#else -static inline void thp_split_mm(struct mm_struct *mm) -{ -} -#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ - static inline pte_t ptep_flush_direct(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { @@ -1198,6 +64,55 @@ static inline pte_t ptep_flush_lazy(struct mm_struct *mm, return old; } +static inline pgste_t pgste_get_lock(pte_t *ptep) +{ + unsigned long new = 0; +#ifdef CONFIG_PGSTE + unsigned long old; + + preempt_disable(); + asm( + " lg %0,%2\n" + "0: lgr %1,%0\n" + " nihh %0,0xff7f\n" /* clear PCL bit in old */ + " oihh %1,0x0080\n" /* set PCL bit in new */ + " csg %0,%1,%2\n" + " jl 0b\n" + : "=&d" (old), "=&d" (new), "=Q" (ptep[PTRS_PER_PTE]) + : "Q" (ptep[PTRS_PER_PTE]) : "cc", "memory"); +#endif + return __pgste(new); +} + +static inline void pgste_set_unlock(pte_t *ptep, pgste_t pgste) +{ +#ifdef CONFIG_PGSTE + asm( + " nihh %1,0xff7f\n" /* clear PCL bit */ + " stg %1,%0\n" + : "=Q" (ptep[PTRS_PER_PTE]) + : "d" (pgste_val(pgste)), "Q" (ptep[PTRS_PER_PTE]) + : "cc", "memory"); + preempt_enable(); +#endif +} + +static inline pgste_t pgste_get(pte_t *ptep) +{ + unsigned long pgste = 0; +#ifdef CONFIG_PGSTE + pgste = *(unsigned long *)(ptep + PTRS_PER_PTE); +#endif + return __pgste(pgste); +} + +static inline void pgste_set(pte_t *ptep, pgste_t pgste) +{ +#ifdef CONFIG_PGSTE + *(pgste_t *)(ptep + PTRS_PER_PTE) = pgste; +#endif +} + static inline pgste_t pgste_update_all(pte_t pte, pgste_t pgste, struct mm_struct *mm) { @@ -1271,63 +186,12 @@ static inline pgste_t pgste_ipte_notify(struct mm_struct *mm, #ifdef CONFIG_PGSTE if (pgste_val(pgste) & PGSTE_IN_BIT) { pgste_val(pgste) &= ~PGSTE_IN_BIT; - ptep_ipte_notify(mm, addr, ptep); + ptep_notify(mm, addr, ptep); } #endif return pgste; } -#ifdef CONFIG_PGSTE -/* - * Test and reset if a guest page is dirty - */ -bool pgste_test_and_clear_dirty(struct mm_struct *mm, unsigned long addr) -{ - spinlock_t *ptl; - pgste_t pgste; - pte_t *ptep; - pte_t pte; - bool dirty; - - ptep = get_locked_pte(mm, addr, &ptl); - if (unlikely(!ptep)) - return false; - - pgste = pgste_get_lock(ptep); - dirty = !!(pgste_val(pgste) & PGSTE_UC_BIT); - pgste_val(pgste) &= ~PGSTE_UC_BIT; - pte = *ptep; - if (dirty && (pte_val(pte) & _PAGE_PRESENT)) { - pgste = pgste_ipte_notify(mm, addr, ptep, pgste); - __ptep_ipte(addr, ptep); - if (MACHINE_HAS_ESOP || !(pte_val(pte) & _PAGE_WRITE)) - pte_val(pte) |= _PAGE_PROTECT; - else - pte_val(pte) |= _PAGE_INVALID; - *ptep = pte; - } - pgste_set_unlock(ptep, pgste); - - spin_unlock(ptl); - return dirty; -} -EXPORT_SYMBOL_GPL(pgste_test_and_clear_dirty); - -void set_pte_pgste_at(struct mm_struct *mm, unsigned long addr, - pte_t *ptep, pte_t entry) -{ - pgste_t pgste; - - /* the mm_has_pgste() check is done in set_pte_at() */ - pgste = pgste_get_lock(ptep); - pgste_val(pgste) &= ~_PGSTE_GPS_ZERO; - pgste_set_key(ptep, pgste, entry, mm); - pgste = pgste_set_pte(ptep, pgste, entry); - pgste_set_unlock(ptep, pgste); -} -EXPORT_SYMBOL(set_pte_pgste_at); -#endif - static inline pgste_t ptep_xchg_start(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { @@ -1486,112 +350,6 @@ pmd_t pmdp_xchg_lazy(struct mm_struct *mm, unsigned long addr, } EXPORT_SYMBOL(pmdp_xchg_lazy); -/* - * switch on pgstes for its userspace process (for kvm) - */ -int s390_enable_sie(void) -{ - struct mm_struct *mm = current->mm; - - /* Do we have pgstes? if yes, we are done */ - if (mm_has_pgste(mm)) - return 0; - /* Fail if the page tables are 2K */ - if (!mm_alloc_pgste(mm)) - return -EINVAL; - down_write(&mm->mmap_sem); - mm->context.has_pgste = 1; - /* split thp mappings and disable thp for future mappings */ - thp_split_mm(mm); - up_write(&mm->mmap_sem); - return 0; -} -EXPORT_SYMBOL_GPL(s390_enable_sie); - -/* - * Enable storage key handling from now on and initialize the storage - * keys with the default key. - */ -static int __s390_enable_skey(pte_t *pte, unsigned long addr, - unsigned long next, struct mm_walk *walk) -{ - unsigned long ptev; - pgste_t pgste; - - /* - * Remove all zero page mappings, - * after establishing a policy to forbid zero page mappings - * following faults for that page will get fresh anonymous pages - */ - if (is_zero_pfn(pte_pfn(*pte))) - ptep_xchg_direct(walk->mm, addr, pte, __pte(_PAGE_INVALID)); - /* Clear storage key */ - pgste = pgste_get_lock(pte); - pgste_val(pgste) &= ~(PGSTE_ACC_BITS | PGSTE_FP_BIT | - PGSTE_GR_BIT | PGSTE_GC_BIT); - ptev = pte_val(*pte); - if (!(ptev & _PAGE_INVALID) && (ptev & _PAGE_WRITE)) - page_set_storage_key(ptev & PAGE_MASK, PAGE_DEFAULT_KEY, 1); - pgste_set_unlock(pte, pgste); - return 0; -} - -int s390_enable_skey(void) -{ - struct mm_walk walk = { .pte_entry = __s390_enable_skey }; - struct mm_struct *mm = current->mm; - struct vm_area_struct *vma; - int rc = 0; - - down_write(&mm->mmap_sem); - if (mm_use_skey(mm)) - goto out_up; - - mm->context.use_skey = 1; - for (vma = mm->mmap; vma; vma = vma->vm_next) { - if (ksm_madvise(vma, vma->vm_start, vma->vm_end, - MADV_UNMERGEABLE, &vma->vm_flags)) { - mm->context.use_skey = 0; - rc = -ENOMEM; - goto out_up; - } - } - mm->def_flags &= ~VM_MERGEABLE; - - walk.mm = mm; - walk_page_range(0, TASK_SIZE, &walk); - -out_up: - up_write(&mm->mmap_sem); - return rc; -} -EXPORT_SYMBOL_GPL(s390_enable_skey); - -/* - * Reset CMMA state, make all pages stable again. - */ -static int __s390_reset_cmma(pte_t *pte, unsigned long addr, - unsigned long next, struct mm_walk *walk) -{ - pgste_t pgste; - - pgste = pgste_get_lock(pte); - pgste_val(pgste) &= ~_PGSTE_GPS_USAGE_MASK; - pgste_set_unlock(pte, pgste); - return 0; -} - -void s390_reset_cmma(struct mm_struct *mm) -{ - struct mm_walk walk = { .pte_entry = __s390_reset_cmma }; - - down_write(&mm->mmap_sem); - walk.mm = mm; - walk_page_range(0, TASK_SIZE, &walk); - up_write(&mm->mmap_sem); -} -EXPORT_SYMBOL_GPL(s390_reset_cmma); - #ifdef CONFIG_TRANSPARENT_HUGEPAGE void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, pgtable_t pgtable) @@ -1632,3 +390,193 @@ pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp) return pgtable; } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + +#ifdef CONFIG_PGSTE +void ptep_set_pte_at(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t entry) +{ + pgste_t pgste; + + /* the mm_has_pgste() check is done in set_pte_at() */ + pgste = pgste_get_lock(ptep); + pgste_val(pgste) &= ~_PGSTE_GPS_ZERO; + pgste_set_key(ptep, pgste, entry, mm); + pgste = pgste_set_pte(ptep, pgste, entry); + pgste_set_unlock(ptep, pgste); +} + +void ptep_set_notify(struct mm_struct *mm, unsigned long addr, pte_t *ptep) +{ + pgste_t pgste; + + pgste = pgste_get_lock(ptep); + pgste_val(pgste) |= PGSTE_IN_BIT; + pgste_set_unlock(ptep, pgste); +} + +static void ptep_zap_swap_entry(struct mm_struct *mm, swp_entry_t entry) +{ + if (!non_swap_entry(entry)) + dec_mm_counter(mm, MM_SWAPENTS); + else if (is_migration_entry(entry)) { + struct page *page = migration_entry_to_page(entry); + + dec_mm_counter(mm, mm_counter(page)); + } + free_swap_and_cache(entry); +} + +void ptep_zap_unused(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, int reset) +{ + unsigned long pgstev; + pgste_t pgste; + pte_t pte; + + /* Zap unused and logically-zero pages */ + pgste = pgste_get_lock(ptep); + pgstev = pgste_val(pgste); + pte = *ptep; + if (pte_swap(pte) && + ((pgstev & _PGSTE_GPS_USAGE_MASK) == _PGSTE_GPS_USAGE_UNUSED || + (pgstev & _PGSTE_GPS_ZERO))) { + ptep_zap_swap_entry(mm, pte_to_swp_entry(pte)); + pte_clear(mm, addr, ptep); + } + if (reset) + pgste_val(pgste) &= ~_PGSTE_GPS_USAGE_MASK; + pgste_set_unlock(ptep, pgste); +} + +void ptep_zap_key(struct mm_struct *mm, unsigned long addr, pte_t *ptep) +{ + unsigned long ptev; + pgste_t pgste; + + /* Clear storage key */ + pgste = pgste_get_lock(ptep); + pgste_val(pgste) &= ~(PGSTE_ACC_BITS | PGSTE_FP_BIT | + PGSTE_GR_BIT | PGSTE_GC_BIT); + ptev = pte_val(*ptep); + if (!(ptev & _PAGE_INVALID) && (ptev & _PAGE_WRITE)) + page_set_storage_key(ptev & PAGE_MASK, PAGE_DEFAULT_KEY, 1); + pgste_set_unlock(ptep, pgste); +} + +/* + * Test and reset if a guest page is dirty + */ +bool test_and_clear_guest_dirty(struct mm_struct *mm, unsigned long addr) +{ + spinlock_t *ptl; + pgste_t pgste; + pte_t *ptep; + pte_t pte; + bool dirty; + + ptep = get_locked_pte(mm, addr, &ptl); + if (unlikely(!ptep)) + return false; + + pgste = pgste_get_lock(ptep); + dirty = !!(pgste_val(pgste) & PGSTE_UC_BIT); + pgste_val(pgste) &= ~PGSTE_UC_BIT; + pte = *ptep; + if (dirty && (pte_val(pte) & _PAGE_PRESENT)) { + pgste = pgste_ipte_notify(mm, addr, ptep, pgste); + __ptep_ipte(addr, ptep); + if (MACHINE_HAS_ESOP || !(pte_val(pte) & _PAGE_WRITE)) + pte_val(pte) |= _PAGE_PROTECT; + else + pte_val(pte) |= _PAGE_INVALID; + *ptep = pte; + } + pgste_set_unlock(ptep, pgste); + + spin_unlock(ptl); + return dirty; +} +EXPORT_SYMBOL_GPL(test_and_clear_guest_dirty); + +int set_guest_storage_key(struct mm_struct *mm, unsigned long addr, + unsigned char key, bool nq) +{ + unsigned long keyul; + spinlock_t *ptl; + pgste_t old, new; + pte_t *ptep; + + down_read(&mm->mmap_sem); + ptep = get_locked_pte(mm, addr, &ptl); + if (unlikely(!ptep)) { + up_read(&mm->mmap_sem); + return -EFAULT; + } + + new = old = pgste_get_lock(ptep); + pgste_val(new) &= ~(PGSTE_GR_BIT | PGSTE_GC_BIT | + PGSTE_ACC_BITS | PGSTE_FP_BIT); + keyul = (unsigned long) key; + pgste_val(new) |= (keyul & (_PAGE_CHANGED | _PAGE_REFERENCED)) << 48; + pgste_val(new) |= (keyul & (_PAGE_ACC_BITS | _PAGE_FP_BIT)) << 56; + if (!(pte_val(*ptep) & _PAGE_INVALID)) { + unsigned long address, bits, skey; + + address = pte_val(*ptep) & PAGE_MASK; + skey = (unsigned long) page_get_storage_key(address); + bits = skey & (_PAGE_CHANGED | _PAGE_REFERENCED); + skey = key & (_PAGE_ACC_BITS | _PAGE_FP_BIT); + /* Set storage key ACC and FP */ + page_set_storage_key(address, skey, !nq); + /* Merge host changed & referenced into pgste */ + pgste_val(new) |= bits << 52; + } + /* changing the guest storage key is considered a change of the page */ + if ((pgste_val(new) ^ pgste_val(old)) & + (PGSTE_ACC_BITS | PGSTE_FP_BIT | PGSTE_GR_BIT | PGSTE_GC_BIT)) + pgste_val(new) |= PGSTE_UC_BIT; + + pgste_set_unlock(ptep, new); + pte_unmap_unlock(ptep, ptl); + up_read(&mm->mmap_sem); + return 0; +} +EXPORT_SYMBOL(set_guest_storage_key); + +unsigned char get_guest_storage_key(struct mm_struct *mm, unsigned long addr) +{ + unsigned char key; + spinlock_t *ptl; + pgste_t pgste; + pte_t *ptep; + + down_read(&mm->mmap_sem); + ptep = get_locked_pte(mm, addr, &ptl); + if (unlikely(!ptep)) { + up_read(&mm->mmap_sem); + return -EFAULT; + } + pgste = pgste_get_lock(ptep); + + if (pte_val(*ptep) & _PAGE_INVALID) { + key = (pgste_val(pgste) & PGSTE_ACC_BITS) >> 56; + key |= (pgste_val(pgste) & PGSTE_FP_BIT) >> 56; + key |= (pgste_val(pgste) & PGSTE_GR_BIT) >> 48; + key |= (pgste_val(pgste) & PGSTE_GC_BIT) >> 48; + } else { + key = page_get_storage_key(pte_val(*ptep) & PAGE_MASK); + + /* Reflect guest's logical view, not physical */ + if (pgste_val(pgste) & PGSTE_GR_BIT) + key |= _PAGE_REFERENCED; + if (pgste_val(pgste) & PGSTE_GC_BIT) + key |= _PAGE_CHANGED; + } + + pgste_set_unlock(ptep, pgste); + pte_unmap_unlock(ptep, ptl); + up_read(&mm->mmap_sem); + return key; +} +EXPORT_SYMBOL(get_guest_storage_key); +#endif -- GitLab