提交 85eae57b 编写于 作者: P Paolo Bonzini

Merge tag 'kvm-s390-next-4.19-1' of...

Merge tag 'kvm-s390-next-4.19-1' of git://git.kernel.org/pub/scm/linux/kernel/git/kvms390/linux into HEAD

KVM: s390: Features for 4.19

- initial version for host large page support. Must be enabled with
  module parameter hpage=1 and will conflict with the nested=1
  parameter.
- enable etoken facility for guests
- Fixes
......@@ -4391,6 +4391,22 @@ all such vmexits.
Do not enable KVM_FEATURE_PV_UNHALT if you disable HLT exits.
7.14 KVM_CAP_S390_HPAGE_1M
Architectures: s390
Parameters: none
Returns: 0 on success, -EINVAL if the hpage module parameter was not set
or cmma is enabled, -EBUSY if VCPUs have already been created
With this capability, KVM support for backing guest memory with 1M pages
through hugetlbfs can be enabled for a VM. After the capability is
enabled, cmma can't be enabled anymore, and pfmfi and storage key
interpretation are disabled. If cmma has already been enabled or the
hpage module parameter is not set to 1, -EINVAL is returned.
While it is generally possible to create a huge page backed VM without
this capability, the VM will not be able to run.
8. Other capabilities.
----------------------
......
......@@ -9,6 +9,14 @@
#ifndef _ASM_S390_GMAP_H
#define _ASM_S390_GMAP_H
/* Generic bits for GMAP notification on DAT table entry changes. */
#define GMAP_NOTIFY_SHADOW 0x2
#define GMAP_NOTIFY_MPROT 0x1
/* Status bits only for huge segment entries */
#define _SEGMENT_ENTRY_GMAP_IN 0x8000 /* invalidation notify bit */
#define _SEGMENT_ENTRY_GMAP_UC 0x4000 /* dirty (migration) */
/**
* struct gmap_struct - guest address space
* @list: list head for the mm->context gmap list
......@@ -132,4 +140,6 @@ void gmap_pte_notify(struct mm_struct *, unsigned long addr, pte_t *,
int gmap_mprotect_notify(struct gmap *, unsigned long start,
unsigned long len, int prot);
void gmap_sync_dirty_log_pmd(struct gmap *gmap, unsigned long dirty_bitmap[4],
unsigned long gaddr, unsigned long vmaddr);
#endif /* _ASM_S390_GMAP_H */
......@@ -37,7 +37,10 @@ static inline int prepare_hugepage_range(struct file *file,
return 0;
}
#define arch_clear_hugepage_flags(page) do { } while (0)
static inline void arch_clear_hugepage_flags(struct page *page)
{
clear_bit(PG_arch_1, &page->flags);
}
static inline void huge_pte_clear(struct mm_struct *mm, unsigned long addr,
pte_t *ptep, unsigned long sz)
......
......@@ -269,6 +269,7 @@ struct kvm_s390_sie_block {
__u8 reserved1c0[8]; /* 0x01c0 */
#define ECD_HOSTREGMGMT 0x20000000
#define ECD_MEF 0x08000000
#define ECD_ETOKENF 0x02000000
__u32 ecd; /* 0x01c8 */
__u8 reserved1cc[18]; /* 0x01cc */
__u64 pp; /* 0x01de */
......@@ -655,6 +656,7 @@ struct kvm_vcpu_arch {
seqcount_t cputm_seqcount;
__u64 cputm_start;
bool gs_enabled;
bool skey_enabled;
};
struct kvm_vm_stat {
......@@ -793,12 +795,6 @@ struct kvm_s390_vsie {
struct page *pages[KVM_MAX_VCPUS];
};
struct kvm_s390_migration_state {
unsigned long bitmap_size; /* in bits (number of guest pages) */
atomic64_t dirty_pages; /* number of dirty pages */
unsigned long *pgste_bitmap;
};
struct kvm_arch{
void *sca;
int use_esca;
......@@ -828,7 +824,8 @@ struct kvm_arch{
struct kvm_s390_vsie vsie;
u8 epdx;
u64 epoch;
struct kvm_s390_migration_state *migration_state;
int migration_mode;
atomic64_t cmma_dirty_pages;
/* subset of available cpu features enabled by user space */
DECLARE_BITMAP(cpu_feat, KVM_S390_VM_CPU_FEAT_NR_BITS);
struct kvm_s390_gisa *gisa;
......
......@@ -24,6 +24,8 @@ typedef struct {
unsigned int uses_skeys:1;
/* The mmu context uses CMM. */
unsigned int uses_cmm:1;
/* The gmaps associated with this context are allowed to use huge pages. */
unsigned int allow_gmap_hpage_1m:1;
} mm_context_t;
#define INIT_MM_CONTEXT(name) \
......
......@@ -32,6 +32,7 @@ static inline int init_new_context(struct task_struct *tsk,
mm->context.has_pgste = 0;
mm->context.uses_skeys = 0;
mm->context.uses_cmm = 0;
mm->context.allow_gmap_hpage_1m = 0;
#endif
switch (mm->context.asce_limit) {
case _REGION2_SIZE:
......
......@@ -268,8 +268,10 @@ static inline int is_module_addr(void *addr)
#define _REGION_ENTRY_BITS_LARGE 0xffffffff8000fe2fUL
/* Bits in the segment table entry */
#define _SEGMENT_ENTRY_BITS 0xfffffffffffffe33UL
#define _SEGMENT_ENTRY_BITS_LARGE 0xfffffffffff0ff33UL
#define _SEGMENT_ENTRY_BITS 0xfffffffffffffe33UL
#define _SEGMENT_ENTRY_BITS_LARGE 0xfffffffffff0ff33UL
#define _SEGMENT_ENTRY_HARDWARE_BITS 0xfffffffffffffe30UL
#define _SEGMENT_ENTRY_HARDWARE_BITS_LARGE 0xfffffffffff00730UL
#define _SEGMENT_ENTRY_ORIGIN_LARGE ~0xfffffUL /* large page address */
#define _SEGMENT_ENTRY_ORIGIN ~0x7ffUL/* page table origin */
#define _SEGMENT_ENTRY_PROTECT 0x200 /* segment protection bit */
......@@ -1101,7 +1103,8 @@ int ptep_shadow_pte(struct mm_struct *mm, unsigned long saddr,
pte_t *sptep, pte_t *tptep, pte_t pte);
void ptep_unshadow_pte(struct mm_struct *mm, unsigned long saddr, pte_t *ptep);
bool test_and_clear_guest_dirty(struct mm_struct *mm, unsigned long address);
bool ptep_test_and_clear_uc(struct mm_struct *mm, unsigned long address,
pte_t *ptep);
int set_guest_storage_key(struct mm_struct *mm, unsigned long addr,
unsigned char key, bool nq);
int cond_set_guest_storage_key(struct mm_struct *mm, unsigned long addr,
......@@ -1116,6 +1119,10 @@ int set_pgste_bits(struct mm_struct *mm, unsigned long addr,
int get_pgste(struct mm_struct *mm, unsigned long hva, unsigned long *pgstep);
int pgste_perform_essa(struct mm_struct *mm, unsigned long hva, int orc,
unsigned long *oldpte, unsigned long *oldpgste);
void gmap_pmdp_csp(struct mm_struct *mm, unsigned long vmaddr);
void gmap_pmdp_invalidate(struct mm_struct *mm, unsigned long vmaddr);
void gmap_pmdp_idte_local(struct mm_struct *mm, unsigned long vmaddr);
void gmap_pmdp_idte_global(struct mm_struct *mm, unsigned long vmaddr);
/*
* Certain architectures need to do special things when PTEs
......
......@@ -4,7 +4,7 @@
/*
* KVM s390 specific structures and definitions
*
* Copyright IBM Corp. 2008
* Copyright IBM Corp. 2008, 2018
*
* Author(s): Carsten Otte <cotte@de.ibm.com>
* Christian Borntraeger <borntraeger@de.ibm.com>
......@@ -225,6 +225,7 @@ struct kvm_guest_debug_arch {
#define KVM_SYNC_FPRS (1UL << 8)
#define KVM_SYNC_GSCB (1UL << 9)
#define KVM_SYNC_BPBC (1UL << 10)
#define KVM_SYNC_ETOKEN (1UL << 11)
/* length and alignment of the sdnx as a power of two */
#define SDNXC 8
#define SDNXL (1UL << SDNXC)
......@@ -258,6 +259,8 @@ struct kvm_sync_regs {
struct {
__u64 reserved1[2];
__u64 gscb[4];
__u64 etoken;
__u64 etoken_extension;
};
};
};
......
......@@ -172,6 +172,10 @@ static int nested;
module_param(nested, int, S_IRUGO);
MODULE_PARM_DESC(nested, "Nested virtualization support");
/* allow 1m huge page guest backing, if !nested */
static int hpage;
module_param(hpage, int, 0444);
MODULE_PARM_DESC(hpage, "1m huge page backing support");
/*
* For now we handle at most 16 double words as this is what the s390 base
......@@ -475,6 +479,11 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_S390_AIS_MIGRATION:
r = 1;
break;
case KVM_CAP_S390_HPAGE_1M:
r = 0;
if (hpage)
r = 1;
break;
case KVM_CAP_S390_MEM_OP:
r = MEM_OP_MAX_SIZE;
break;
......@@ -511,19 +520,30 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
}
static void kvm_s390_sync_dirty_log(struct kvm *kvm,
struct kvm_memory_slot *memslot)
struct kvm_memory_slot *memslot)
{
int i;
gfn_t cur_gfn, last_gfn;
unsigned long address;
unsigned long gaddr, vmaddr;
struct gmap *gmap = kvm->arch.gmap;
DECLARE_BITMAP(bitmap, _PAGE_ENTRIES);
/* Loop over all guest pages */
/* Loop over all guest segments */
cur_gfn = memslot->base_gfn;
last_gfn = memslot->base_gfn + memslot->npages;
for (cur_gfn = memslot->base_gfn; cur_gfn <= last_gfn; cur_gfn++) {
address = gfn_to_hva_memslot(memslot, cur_gfn);
for (; cur_gfn <= last_gfn; cur_gfn += _PAGE_ENTRIES) {
gaddr = gfn_to_gpa(cur_gfn);
vmaddr = gfn_to_hva_memslot(memslot, cur_gfn);
if (kvm_is_error_hva(vmaddr))
continue;
bitmap_zero(bitmap, _PAGE_ENTRIES);
gmap_sync_dirty_log_pmd(gmap, bitmap, gaddr, vmaddr);
for (i = 0; i < _PAGE_ENTRIES; i++) {
if (test_bit(i, bitmap))
mark_page_dirty(kvm, cur_gfn + i);
}
if (test_and_clear_guest_dirty(gmap->mm, address))
mark_page_dirty(kvm, cur_gfn);
if (fatal_signal_pending(current))
return;
cond_resched();
......@@ -667,6 +687,27 @@ static int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap)
VM_EVENT(kvm, 3, "ENABLE: CAP_S390_GS %s",
r ? "(not available)" : "(success)");
break;
case KVM_CAP_S390_HPAGE_1M:
mutex_lock(&kvm->lock);
if (kvm->created_vcpus)
r = -EBUSY;
else if (!hpage || kvm->arch.use_cmma)
r = -EINVAL;
else {
r = 0;
kvm->mm->context.allow_gmap_hpage_1m = 1;
/*
* We might have to create fake 4k page
* tables. To avoid that the hardware works on
* stale PGSTEs, we emulate these instructions.
*/
kvm->arch.use_skf = 0;
kvm->arch.use_pfmfi = 0;
}
mutex_unlock(&kvm->lock);
VM_EVENT(kvm, 3, "ENABLE: CAP_S390_HPAGE %s",
r ? "(not available)" : "(success)");
break;
case KVM_CAP_S390_USER_STSI:
VM_EVENT(kvm, 3, "%s", "ENABLE: CAP_S390_USER_STSI");
kvm->arch.user_stsi = 1;
......@@ -714,10 +755,13 @@ static int kvm_s390_set_mem_control(struct kvm *kvm, struct kvm_device_attr *att
if (!sclp.has_cmma)
break;
ret = -EBUSY;
VM_EVENT(kvm, 3, "%s", "ENABLE: CMMA support");
mutex_lock(&kvm->lock);
if (!kvm->created_vcpus) {
if (kvm->created_vcpus)
ret = -EBUSY;
else if (kvm->mm->context.allow_gmap_hpage_1m)
ret = -EINVAL;
else {
kvm->arch.use_cmma = 1;
/* Not compatible with cmma. */
kvm->arch.use_pfmfi = 0;
......@@ -862,54 +906,37 @@ static void kvm_s390_sync_request_broadcast(struct kvm *kvm, int req)
*/
static int kvm_s390_vm_start_migration(struct kvm *kvm)
{
struct kvm_s390_migration_state *mgs;
struct kvm_memory_slot *ms;
/* should be the only one */
struct kvm_memslots *slots;
unsigned long ram_pages;
unsigned long ram_pages = 0;
int slotnr;
/* migration mode already enabled */
if (kvm->arch.migration_state)
if (kvm->arch.migration_mode)
return 0;
slots = kvm_memslots(kvm);
if (!slots || !slots->used_slots)
return -EINVAL;
mgs = kzalloc(sizeof(*mgs), GFP_KERNEL);
if (!mgs)
return -ENOMEM;
kvm->arch.migration_state = mgs;
if (kvm->arch.use_cmma) {
if (!kvm->arch.use_cmma) {
kvm->arch.migration_mode = 1;
return 0;
}
/* mark all the pages in active slots as dirty */
for (slotnr = 0; slotnr < slots->used_slots; slotnr++) {
ms = slots->memslots + slotnr;
/*
* Get the first slot. They are reverse sorted by base_gfn, so
* the first slot is also the one at the end of the address
* space. We have verified above that at least one slot is
* present.
* The second half of the bitmap is only used on x86,
* and would be wasted otherwise, so we put it to good
* use here to keep track of the state of the storage
* attributes.
*/
ms = slots->memslots;
/* round up so we only use full longs */
ram_pages = roundup(ms->base_gfn + ms->npages, BITS_PER_LONG);
/* allocate enough bytes to store all the bits */
mgs->pgste_bitmap = vmalloc(ram_pages / 8);
if (!mgs->pgste_bitmap) {
kfree(mgs);
kvm->arch.migration_state = NULL;
return -ENOMEM;
}
mgs->bitmap_size = ram_pages;
atomic64_set(&mgs->dirty_pages, ram_pages);
/* mark all the pages in active slots as dirty */
for (slotnr = 0; slotnr < slots->used_slots; slotnr++) {
ms = slots->memslots + slotnr;
bitmap_set(mgs->pgste_bitmap, ms->base_gfn, ms->npages);
}
kvm_s390_sync_request_broadcast(kvm, KVM_REQ_START_MIGRATION);
memset(kvm_second_dirty_bitmap(ms), 0xff, kvm_dirty_bitmap_bytes(ms));
ram_pages += ms->npages;
}
atomic64_set(&kvm->arch.cmma_dirty_pages, ram_pages);
kvm->arch.migration_mode = 1;
kvm_s390_sync_request_broadcast(kvm, KVM_REQ_START_MIGRATION);
return 0;
}
......@@ -919,21 +946,12 @@ static int kvm_s390_vm_start_migration(struct kvm *kvm)
*/
static int kvm_s390_vm_stop_migration(struct kvm *kvm)
{
struct kvm_s390_migration_state *mgs;
/* migration mode already disabled */
if (!kvm->arch.migration_state)
if (!kvm->arch.migration_mode)
return 0;
mgs = kvm->arch.migration_state;
kvm->arch.migration_state = NULL;
if (kvm->arch.use_cmma) {
kvm->arch.migration_mode = 0;
if (kvm->arch.use_cmma)
kvm_s390_sync_request_broadcast(kvm, KVM_REQ_STOP_MIGRATION);
/* We have to wait for the essa emulation to finish */
synchronize_srcu(&kvm->srcu);
vfree(mgs->pgste_bitmap);
}
kfree(mgs);
return 0;
}
......@@ -961,7 +979,7 @@ static int kvm_s390_vm_set_migration(struct kvm *kvm,
static int kvm_s390_vm_get_migration(struct kvm *kvm,
struct kvm_device_attr *attr)
{
u64 mig = (kvm->arch.migration_state != NULL);
u64 mig = kvm->arch.migration_mode;
if (attr->attr != KVM_S390_VM_MIGRATION_STATUS)
return -ENXIO;
......@@ -1540,6 +1558,7 @@ static long kvm_s390_set_skeys(struct kvm *kvm, struct kvm_s390_skeys *args)
uint8_t *keys;
uint64_t hva;
int srcu_idx, i, r = 0;
bool unlocked;
if (args->flags != 0)
return -EINVAL;
......@@ -1564,9 +1583,11 @@ static long kvm_s390_set_skeys(struct kvm *kvm, struct kvm_s390_skeys *args)
if (r)
goto out;
i = 0;
down_read(&current->mm->mmap_sem);
srcu_idx = srcu_read_lock(&kvm->srcu);
for (i = 0; i < args->count; i++) {
while (i < args->count) {
unlocked = false;
hva = gfn_to_hva(kvm, args->start_gfn + i);
if (kvm_is_error_hva(hva)) {
r = -EFAULT;
......@@ -1580,8 +1601,14 @@ static long kvm_s390_set_skeys(struct kvm *kvm, struct kvm_s390_skeys *args)
}
r = set_guest_storage_key(current->mm, hva, keys[i], 0);
if (r)
break;
if (r) {
r = fixup_user_fault(current, current->mm, hva,
FAULT_FLAG_WRITE, &unlocked);
if (r)
break;
}
if (!r)
i++;
}
srcu_read_unlock(&kvm->srcu, srcu_idx);
up_read(&current->mm->mmap_sem);
......@@ -1599,6 +1626,134 @@ static long kvm_s390_set_skeys(struct kvm *kvm, struct kvm_s390_skeys *args)
/* for consistency */
#define KVM_S390_CMMA_SIZE_MAX ((u32)KVM_S390_SKEYS_MAX)
/*
 * Similar to gfn_to_memslot, but returns the index of a memslot also when the
 * address falls in a hole. In that case the index of one of the memslots
 * bordering the hole is returned.
 *
 * The binary search relies on the used entries of slots->memslots being
 * ordered by descending base_gfn. The returned index is always below
 * slots->used_slots (0 if no slot is in use). For a gfn below the lowest
 * slot the lowest slot is returned, so callers must be prepared for
 * gfn < base_gfn of the returned slot.
 */
static int gfn_to_memslot_approx(struct kvm_memslots *slots, gfn_t gfn)
{
	int start = 0, end = slots->used_slots;
	int slot = atomic_read(&slots->lru_slot);
	struct kvm_memory_slot *memslots = slots->memslots;

	/* Fast path: the cached slot contains the gfn. */
	if (gfn >= memslots[slot].base_gfn &&
	    gfn < memslots[slot].base_gfn + memslots[slot].npages)
		return slot;

	/* Find the first slot whose base_gfn is not above gfn. */
	while (start < end) {
		slot = start + (end - start) / 2;

		if (gfn >= memslots[slot].base_gfn)
			end = slot;
		else
			start = slot + 1;
	}

	/*
	 * gfn lies below every used slot: do not inspect or return an entry
	 * past the used part of the array, report the lowest slot instead.
	 */
	if (start >= slots->used_slots)
		return slots->used_slots ? slots->used_slots - 1 : 0;

	/* Only cache the slot when the gfn is really inside it. */
	if (gfn >= memslots[start].base_gfn &&
	    gfn < memslots[start].base_gfn + memslots[start].npages) {
		atomic_set(&slots->lru_slot, start);
	}

	return start;
}

/*
 * Copy the CMMA values of up to @bufsize consecutive guest pages, starting at
 * args->start_gfn, into @res without touching any dirty tracking.
 *
 * args->count is reset and then counts the values stored in @res.
 *
 * Returns 0 on success, or -EFAULT if not even the first gfn could be
 * translated to a host address.
 */
static int kvm_s390_peek_cmma(struct kvm *kvm, struct kvm_s390_cmma_log *args,
			      u8 *res, unsigned long bufsize)
{
	unsigned long pgstev, hva, cur_gfn = args->start_gfn;

	args->count = 0;
	while (args->count < bufsize) {
		hva = gfn_to_hva(kvm, cur_gfn);
		/*
		 * We return an error if the first value was invalid, but we
		 * return successfully if at least one value was copied.
		 */
		if (kvm_is_error_hva(hva))
			return args->count ? 0 : -EFAULT;
		/* A page whose PGSTE cannot be read is reported as 0. */
		if (get_pgste(kvm->mm, hva, &pgstev) < 0)
			pgstev = 0;
		res[args->count++] = (pgstev >> 24) & 0x43;
		cur_gfn++;
	}

	return 0;
}

/*
 * Find the next gfn, starting the search at @cur_gfn, whose bit is set in the
 * second dirty bitmap of a memslot.
 *
 * Slots are walked towards index 0. If no set bit is found, the search ends
 * in slot 0 and the returned value is base_gfn plus the bit offset reported
 * by find_next_bit() for that slot.
 *
 * At least one memslot must be in use when this is called.
 */
static unsigned long kvm_s390_next_dirty_cmma(struct kvm_memslots *slots,
					      unsigned long cur_gfn)
{
	int slotidx = gfn_to_memslot_approx(slots, cur_gfn);
	struct kvm_memory_slot *ms = slots->memslots + slotidx;
	unsigned long ofs = cur_gfn - ms->base_gfn;

	if (ms->base_gfn + ms->npages <= cur_gfn) {
		slotidx--;
		/* If we are above the highest slot, wrap around */
		if (slotidx < 0)
			slotidx = slots->used_slots - 1;

		ms = slots->memslots + slotidx;
		ofs = 0;
	}

	/*
	 * cur_gfn is below the chosen slot, so the subtraction above wrapped
	 * around; start searching at the beginning of the slot instead of
	 * skipping all of its bits.
	 */
	if (cur_gfn < ms->base_gfn)
		ofs = 0;

	ofs = find_next_bit(kvm_second_dirty_bitmap(ms), ms->npages, ofs);
	while ((slotidx > 0) && (ofs >= ms->npages)) {
		slotidx--;
		ms = slots->memslots + slotidx;
		ofs = find_next_bit(kvm_second_dirty_bitmap(ms), ms->npages, 0);
	}
	return ms->base_gfn + ofs;
}
/*
 * Collect CMMA values for migration, starting at the first dirty page found
 * by searching from args->start_gfn, and clear the dirty bits of the pages
 * that are reported.
 *
 * args->start_gfn is updated to the gfn of the first reported page and
 * args->count to the number of values stored in @res (at most @bufsize).
 * Collection stops early when a gfn cannot be translated, when the next dirty
 * page is more than KVM_S390_MAX_BIT_DISTANCE pages away, or when the end of
 * memory or of a memslot without successor is reached.
 *
 * Always returns 0.
 */
static int kvm_s390_get_cmma(struct kvm *kvm, struct kvm_s390_cmma_log *args,
			     u8 *res, unsigned long bufsize)
{
	unsigned long mem_end, cur_gfn, next_gfn, hva, pgstev;
	struct kvm_memslots *slots = kvm_memslots(kvm);
	struct kvm_memory_slot *ms;

	/*
	 * Without any memslot there is nothing to report, and the dirty page
	 * search below would index before the start of the memslot array.
	 */
	if (!slots->used_slots) {
		args->count = 0;
		return 0;
	}

	cur_gfn = kvm_s390_next_dirty_cmma(slots, args->start_gfn);
	ms = gfn_to_memslot(kvm, cur_gfn);
	args->count = 0;
	args->start_gfn = cur_gfn;
	if (!ms)
		return 0;
	next_gfn = kvm_s390_next_dirty_cmma(slots, cur_gfn + 1);
	/* Slot 0 is used as the upper end of guest memory. */
	mem_end = slots->memslots[0].base_gfn + slots->memslots[0].npages;

	while (args->count < bufsize) {
		hva = gfn_to_hva(kvm, cur_gfn);
		if (kvm_is_error_hva(hva))
			return 0;
		/* Decrement only if we actually flipped the bit to 0 */
		if (test_and_clear_bit(cur_gfn - ms->base_gfn, kvm_second_dirty_bitmap(ms)))
			atomic64_dec(&kvm->arch.cmma_dirty_pages);
		/* A page whose PGSTE cannot be read is reported as 0. */
		if (get_pgste(kvm->mm, hva, &pgstev) < 0)
			pgstev = 0;
		/* Save the value */
		res[args->count++] = (pgstev >> 24) & 0x43;
		/* If the next bit is too far away, stop. */
		if (next_gfn > cur_gfn + KVM_S390_MAX_BIT_DISTANCE)
			return 0;
		/* If we reached the previous "next", find the next one */
		if (cur_gfn == next_gfn)
			next_gfn = kvm_s390_next_dirty_cmma(slots, cur_gfn + 1);
		/* Reached the end of memory or of the buffer, stop */
		if ((next_gfn >= mem_end) ||
		    (next_gfn - args->start_gfn >= bufsize))
			return 0;

		cur_gfn++;
		/* Reached the end of the current memslot, take the next one. */
		if (cur_gfn - ms->base_gfn >= ms->npages) {
			ms = gfn_to_memslot(kvm, cur_gfn);
			if (!ms)
				return 0;
		}
	}
	return 0;
}
/*
* This function searches for the next page with dirty CMMA attributes, and
* saves the attributes in the buffer up to either the end of the buffer or
......@@ -1610,22 +1765,18 @@ static long kvm_s390_set_skeys(struct kvm *kvm, struct kvm_s390_skeys *args)
static int kvm_s390_get_cmma_bits(struct kvm *kvm,
struct kvm_s390_cmma_log *args)
{
struct kvm_s390_migration_state *s = kvm->arch.migration_state;
unsigned long bufsize, hva, pgstev, i, next, cur;
int srcu_idx, peek, r = 0, rr;
u8 *res;
cur = args->start_gfn;
i = next = pgstev = 0;
unsigned long bufsize;
int srcu_idx, peek, ret;
u8 *values;
if (unlikely(!kvm->arch.use_cmma))
if (!kvm->arch.use_cmma)
return -ENXIO;
/* Invalid/unsupported flags were specified */
if (args->flags & ~KVM_S390_CMMA_PEEK)
return -EINVAL;
/* Migration mode query, and we are not doing a migration */
peek = !!(args->flags & KVM_S390_CMMA_PEEK);
if (!peek && !s)
if (!peek && !kvm->arch.migration_mode)
return -EINVAL;
/* CMMA is disabled or was not used, or the buffer has length zero */
bufsize = min(args->count, KVM_S390_CMMA_SIZE_MAX);
......@@ -1633,74 +1784,35 @@ static int kvm_s390_get_cmma_bits(struct kvm *kvm,
memset(args, 0, sizeof(*args));
return 0;
}
if (!peek) {
/* We are not peeking, and there are no dirty pages */
if (!atomic64_read(&s->dirty_pages)) {
memset(args, 0, sizeof(*args));
return 0;
}
cur = find_next_bit(s->pgste_bitmap, s->bitmap_size,
args->start_gfn);
if (cur >= s->bitmap_size) /* nothing found, loop back */
cur = find_next_bit(s->pgste_bitmap, s->bitmap_size, 0);
if (cur >= s->bitmap_size) { /* again! (very unlikely) */
memset(args, 0, sizeof(*args));
return 0;
}
next = find_next_bit(s->pgste_bitmap, s->bitmap_size, cur + 1);
/* We are not peeking, and there are no dirty pages */
if (!peek && !atomic64_read(&kvm->arch.cmma_dirty_pages)) {
memset(args, 0, sizeof(*args));
return 0;
}
res = vmalloc(bufsize);
if (!res)
values = vmalloc(bufsize);
if (!values)
return -ENOMEM;
args->start_gfn = cur;
down_read(&kvm->mm->mmap_sem);
srcu_idx = srcu_read_lock(&kvm->srcu);
while (i < bufsize) {
hva = gfn_to_hva(kvm, cur);
if (kvm_is_error_hva(hva)) {
r = -EFAULT;
break;
}
/* decrement only if we actually flipped the bit to 0 */
if (!peek && test_and_clear_bit(cur, s->pgste_bitmap))
atomic64_dec(&s->dirty_pages);
r = get_pgste(kvm->mm, hva, &pgstev);
if (r < 0)
pgstev = 0;
/* save the value */
res[i++] = (pgstev >> 24) & 0x43;
/*
* if the next bit is too far away, stop.
* if we reached the previous "next", find the next one
*/
if (!peek) {
if (next > cur + KVM_S390_MAX_BIT_DISTANCE)
break;
if (cur == next)
next = find_next_bit(s->pgste_bitmap,
s->bitmap_size, cur + 1);
/* reached the end of the bitmap or of the buffer, stop */
if ((next >= s->bitmap_size) ||
(next >= args->start_gfn + bufsize))
break;
}
cur++;
}
if (peek)
ret = kvm_s390_peek_cmma(kvm, args, values, bufsize);
else
ret = kvm_s390_get_cmma(kvm, args, values, bufsize);
srcu_read_unlock(&kvm->srcu, srcu_idx);
up_read(&kvm->mm->mmap_sem);
args->count = i;
args->remaining = s ? atomic64_read(&s->dirty_pages) : 0;
rr = copy_to_user((void __user *)args->values, res, args->count);
if (rr)
r = -EFAULT;
if (kvm->arch.migration_mode)
args->remaining = atomic64_read(&kvm->arch.cmma_dirty_pages);
else
args->remaining = 0;
vfree(res);
return r;
if (copy_to_user((void __user *)args->values, values, args->count))
ret = -EFAULT;
vfree(values);
return ret;
}
/*
......@@ -2139,10 +2251,6 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
kvm_s390_destroy_adapters(kvm);
kvm_s390_clear_float_irqs(kvm);
kvm_s390_vsie_destroy(kvm);
if (kvm->arch.migration_state) {
vfree(kvm->arch.migration_state->pgste_bitmap);
kfree(kvm->arch.migration_state);
}
KVM_EVENT(3, "vm 0x%pK destroyed", kvm);
}
......@@ -2300,6 +2408,8 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
vcpu->run->kvm_valid_regs |= KVM_SYNC_BPBC;
if (test_kvm_facility(vcpu->kvm, 133))
vcpu->run->kvm_valid_regs |= KVM_SYNC_GSCB;
if (test_kvm_facility(vcpu->kvm, 156))
vcpu->run->kvm_valid_regs |= KVM_SYNC_ETOKEN;
/* fprs can be synchronized via vrs, even if the guest has no vx. With
* MACHINE_HAS_VX, (load|store)_fpu_regs() will work with vrs format.
*/
......@@ -2549,7 +2659,8 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
}
if (test_kvm_facility(vcpu->kvm, 139))
vcpu->arch.sie_block->ecd |= ECD_MEF;
if (test_kvm_facility(vcpu->kvm, 156))
vcpu->arch.sie_block->ecd |= ECD_ETOKENF;
if (vcpu->arch.sie_block->gd) {
vcpu->arch.sie_block->eca |= ECA_AIV;
VCPU_EVENT(vcpu, 3, "AIV gisa format-%u enabled for cpu %03u",
......@@ -3467,6 +3578,7 @@ static void sync_regs(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
}
preempt_enable();
}
/* SIE will load etoken directly from SDNX and therefore kvm_run */
kvm_run->kvm_dirty_regs = 0;
}
......@@ -3506,7 +3618,7 @@ static void store_regs(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
__ctl_clear_bit(2, 4);
vcpu->arch.host_gscb = NULL;
}
/* SIE will save etoken directly into SDNX and therefore kvm_run */
}
int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
......@@ -4082,6 +4194,11 @@ static int __init kvm_s390_init(void)
return -ENODEV;
}
if (nested && hpage) {
pr_info("nested (vSIE) and hpage (huge page backing) can currently not be activated concurrently");
return -EINVAL;
}
for (i = 0; i < 16; i++)
kvm_s390_fac_base[i] |=
S390_lowcore.stfle_fac_list[i] & nonhyp_mask(i);
......
......@@ -205,13 +205,10 @@ static int handle_store_cpu_address(struct kvm_vcpu *vcpu)
int kvm_s390_skey_check_enable(struct kvm_vcpu *vcpu)
{
int rc;
struct kvm_s390_sie_block *sie_block = vcpu->arch.sie_block;
trace_kvm_s390_skey_related_inst(vcpu);
/* Already enabled? */
if (vcpu->kvm->arch.use_skf &&
!(sie_block->ictl & (ICTL_ISKE | ICTL_SSKE | ICTL_RRBE)) &&
!kvm_s390_test_cpuflags(vcpu, CPUSTAT_KSS))
if (vcpu->arch.skey_enabled)
return 0;
rc = s390_enable_skey();
......@@ -222,9 +219,10 @@ int kvm_s390_skey_check_enable(struct kvm_vcpu *vcpu)
if (kvm_s390_test_cpuflags(vcpu, CPUSTAT_KSS))
kvm_s390_clear_cpuflags(vcpu, CPUSTAT_KSS);
if (!vcpu->kvm->arch.use_skf)
sie_block->ictl |= ICTL_ISKE | ICTL_SSKE | ICTL_RRBE;
vcpu->arch.sie_block->ictl |= ICTL_ISKE | ICTL_SSKE | ICTL_RRBE;
else
sie_block->ictl &= ~(ICTL_ISKE | ICTL_SSKE | ICTL_RRBE);
vcpu->arch.sie_block->ictl &= ~(ICTL_ISKE | ICTL_SSKE | ICTL_RRBE);
vcpu->arch.skey_enabled = true;
return 0;
}
......@@ -246,9 +244,10 @@ static int try_handle_skey(struct kvm_vcpu *vcpu)
static int handle_iske(struct kvm_vcpu *vcpu)
{
unsigned long addr;
unsigned long gaddr, vmaddr;
unsigned char key;
int reg1, reg2;
bool unlocked;
int rc;
vcpu->stat.instruction_iske++;
......@@ -262,18 +261,28 @@ static int handle_iske(struct kvm_vcpu *vcpu)
kvm_s390_get_regs_rre(vcpu, &reg1, &reg2);
addr = vcpu->run->s.regs.gprs[reg2] & PAGE_MASK;
addr = kvm_s390_logical_to_effective(vcpu, addr);
addr = kvm_s390_real_to_abs(vcpu, addr);
addr = gfn_to_hva(vcpu->kvm, gpa_to_gfn(addr));
if (kvm_is_error_hva(addr))
gaddr = vcpu->run->s.regs.gprs[reg2] & PAGE_MASK;
gaddr = kvm_s390_logical_to_effective(vcpu, gaddr);
gaddr = kvm_s390_real_to_abs(vcpu, gaddr);
vmaddr = gfn_to_hva(vcpu->kvm, gpa_to_gfn(gaddr));
if (kvm_is_error_hva(vmaddr))
return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
retry:
unlocked = false;
down_read(&current->mm->mmap_sem);
rc = get_guest_storage_key(current->mm, addr, &key);
up_read(&current->mm->mmap_sem);
rc = get_guest_storage_key(current->mm, vmaddr, &key);
if (rc) {
rc = fixup_user_fault(current, current->mm, vmaddr,
FAULT_FLAG_WRITE, &unlocked);
if (!rc) {
up_read(&current->mm->mmap_sem);
goto retry;
}
}
if (rc)
return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
up_read(&current->mm->mmap_sem);
vcpu->run->s.regs.gprs[reg1] &= ~0xff;
vcpu->run->s.regs.gprs[reg1] |= key;
return 0;
......@@ -281,8 +290,9 @@ static int handle_iske(struct kvm_vcpu *vcpu)
static int handle_rrbe(struct kvm_vcpu *vcpu)
{
unsigned long addr;
unsigned long vmaddr, gaddr;
int reg1, reg2;
bool unlocked;
int rc;
vcpu->stat.instruction_rrbe++;
......@@ -296,19 +306,27 @@ static int handle_rrbe(struct kvm_vcpu *vcpu)
kvm_s390_get_regs_rre(vcpu, &reg1, &reg2);
addr = vcpu->run->s.regs.gprs[reg2] & PAGE_MASK;
addr = kvm_s390_logical_to_effective(vcpu, addr);
addr = kvm_s390_real_to_abs(vcpu, addr);
addr = gfn_to_hva(vcpu->kvm, gpa_to_gfn(addr));
if (kvm_is_error_hva(addr))
gaddr = vcpu->run->s.regs.gprs[reg2] & PAGE_MASK;
gaddr = kvm_s390_logical_to_effective(vcpu, gaddr);
gaddr = kvm_s390_real_to_abs(vcpu, gaddr);
vmaddr = gfn_to_hva(vcpu->kvm, gpa_to_gfn(gaddr));
if (kvm_is_error_hva(vmaddr))
return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
retry:
unlocked = false;
down_read(&current->mm->mmap_sem);
rc = reset_guest_reference_bit(current->mm, addr);
up_read(&current->mm->mmap_sem);
rc = reset_guest_reference_bit(current->mm, vmaddr);
if (rc < 0) {
rc = fixup_user_fault(current, current->mm, vmaddr,
FAULT_FLAG_WRITE, &unlocked);
if (!rc) {
up_read(&current->mm->mmap_sem);
goto retry;
}
}
if (rc < 0)
return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
up_read(&current->mm->mmap_sem);
kvm_s390_set_psw_cc(vcpu, rc);
return 0;
}
......@@ -323,6 +341,7 @@ static int handle_sske(struct kvm_vcpu *vcpu)
unsigned long start, end;
unsigned char key, oldkey;
int reg1, reg2;
bool unlocked;
int rc;
vcpu->stat.instruction_sske++;
......@@ -355,19 +374,28 @@ static int handle_sske(struct kvm_vcpu *vcpu)
}
while (start != end) {
unsigned long addr = gfn_to_hva(vcpu->kvm, gpa_to_gfn(start));
unsigned long vmaddr = gfn_to_hva(vcpu->kvm, gpa_to_gfn(start));
unlocked = false;
if (kvm_is_error_hva(addr))
if (kvm_is_error_hva(vmaddr))
return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
down_read(&current->mm->mmap_sem);
rc = cond_set_guest_storage_key(current->mm, addr, key, &oldkey,
rc = cond_set_guest_storage_key(current->mm, vmaddr, key, &oldkey,
m3 & SSKE_NQ, m3 & SSKE_MR,
m3 & SSKE_MC);
up_read(&current->mm->mmap_sem);
if (rc < 0)
if (rc < 0) {
rc = fixup_user_fault(current, current->mm, vmaddr,
FAULT_FLAG_WRITE, &unlocked);
rc = !rc ? -EAGAIN : rc;
}
if (rc == -EFAULT)
return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
start += PAGE_SIZE;
up_read(&current->mm->mmap_sem);
if (rc >= 0)
start += PAGE_SIZE;
}
if (m3 & (SSKE_MC | SSKE_MR)) {
......@@ -948,15 +976,16 @@ static int handle_pfmf(struct kvm_vcpu *vcpu)
}
while (start != end) {
unsigned long useraddr;
unsigned long vmaddr;
bool unlocked = false;
/* Translate guest address to host address */
useraddr = gfn_to_hva(vcpu->kvm, gpa_to_gfn(start));
if (kvm_is_error_hva(useraddr))
vmaddr = gfn_to_hva(vcpu->kvm, gpa_to_gfn(start));
if (kvm_is_error_hva(vmaddr))
return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
if (vcpu->run->s.regs.gprs[reg1] & PFMF_CF) {
if (clear_user((void __user *)useraddr, PAGE_SIZE))
if (kvm_clear_guest(vcpu->kvm, start, PAGE_SIZE))
return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
}
......@@ -966,14 +995,20 @@ static int handle_pfmf(struct kvm_vcpu *vcpu)
if (rc)
return rc;
down_read(&current->mm->mmap_sem);
rc = cond_set_guest_storage_key(current->mm, useraddr,
rc = cond_set_guest_storage_key(current->mm, vmaddr,
key, NULL, nq, mr, mc);
up_read(&current->mm->mmap_sem);
if (rc < 0)
if (rc < 0) {
rc = fixup_user_fault(current, current->mm, vmaddr,
FAULT_FLAG_WRITE, &unlocked);
rc = !rc ? -EAGAIN : rc;
}
if (rc == -EFAULT)
return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
}
start += PAGE_SIZE;
up_read(&current->mm->mmap_sem);
if (rc >= 0)
start += PAGE_SIZE;
}
}
if (vcpu->run->s.regs.gprs[reg1] & PFMF_FSC) {
if (psw_bits(vcpu->arch.sie_block->gpsw).eaba == PSW_BITS_AMODE_64BIT) {
......@@ -987,9 +1022,11 @@ static int handle_pfmf(struct kvm_vcpu *vcpu)
return 0;
}
static inline int do_essa(struct kvm_vcpu *vcpu, const int orc)
/*
* Must be called with relevant read locks held (kvm->mm->mmap_sem, kvm->srcu)
*/
static inline int __do_essa(struct kvm_vcpu *vcpu, const int orc)
{
struct kvm_s390_migration_state *ms = vcpu->kvm->arch.migration_state;
int r1, r2, nappended, entries;
unsigned long gfn, hva, res, pgstev, ptev;
unsigned long *cbrlo;
......@@ -1039,10 +1076,12 @@ static inline int do_essa(struct kvm_vcpu *vcpu, const int orc)
cbrlo[entries] = gfn << PAGE_SHIFT;
}
if (orc && gfn < ms->bitmap_size) {
/* increment only if we are really flipping the bit to 1 */
if (!test_and_set_bit(gfn, ms->pgste_bitmap))
atomic64_inc(&ms->dirty_pages);
if (orc) {
struct kvm_memory_slot *ms = gfn_to_memslot(vcpu->kvm, gfn);
/* Increment only if we are really flipping the bit */
if (ms && !test_and_set_bit(gfn - ms->base_gfn, kvm_second_dirty_bitmap(ms)))
atomic64_inc(&vcpu->kvm->arch.cmma_dirty_pages);
}
return nappended;
......@@ -1071,7 +1110,7 @@ static int handle_essa(struct kvm_vcpu *vcpu)
: ESSA_SET_STABLE_IF_RESIDENT))
return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
if (likely(!vcpu->kvm->arch.migration_state)) {
if (!vcpu->kvm->arch.migration_mode) {
/*
* CMMA is enabled in the KVM settings, but is disabled in
* the SIE block and in the mm_context, and we are not doing
......@@ -1099,10 +1138,16 @@ static int handle_essa(struct kvm_vcpu *vcpu)
/* Retry the ESSA instruction */
kvm_s390_retry_instr(vcpu);
} else {
/* Account for the possible extra cbrl entry */
i = do_essa(vcpu, orc);
int srcu_idx;
down_read(&vcpu->kvm->mm->mmap_sem);
srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
i = __do_essa(vcpu, orc);
srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx);
up_read(&vcpu->kvm->mm->mmap_sem);
if (i < 0)
return i;
/* Account for the possible extra cbrl entry */
entries += i;
}
vcpu->arch.sie_block->cbrlo &= PAGE_MASK; /* reset nceo */
......
......@@ -2,7 +2,7 @@
/*
* kvm nested virtualization support for s390x
*
* Copyright IBM Corp. 2016
* Copyright IBM Corp. 2016, 2018
*
* Author(s): David Hildenbrand <dahi@linux.vnet.ibm.com>
*/
......@@ -378,6 +378,10 @@ static int shadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
if (test_kvm_facility(vcpu->kvm, 139))
scb_s->ecd |= scb_o->ecd & ECD_MEF;
/* etoken */
if (test_kvm_facility(vcpu->kvm, 156))
scb_s->ecd |= scb_o->ecd & ECD_ETOKENF;
prepare_ibc(vcpu, vsie_page);
rc = shadow_crycb(vcpu, vsie_page);
out:
......@@ -627,7 +631,8 @@ static int pin_blocks(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
vsie_page->riccbd_gpa = gpa;
scb_s->riccbd = hpa;
}
if ((scb_s->ecb & ECB_GS) && !(scb_s->ecd & ECD_HOSTREGMGMT)) {
if (((scb_s->ecb & ECB_GS) && !(scb_s->ecd & ECD_HOSTREGMGMT)) ||
(scb_s->ecd & ECD_ETOKENF)) {
unsigned long sdnxc;
gpa = READ_ONCE(scb_o->sdnxo) & ~0xfUL;
......@@ -818,6 +823,8 @@ static int handle_stfle(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
* - < 0 if an error occurred
*/
static int do_vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
__releases(vcpu->kvm->srcu)
__acquires(vcpu->kvm->srcu)
{
struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
struct kvm_s390_sie_block *scb_o = vsie_page->scb_o;
......
......@@ -2,8 +2,10 @@
/*
* KVM guest address space mapping code
*
* Copyright IBM Corp. 2007, 2016
* Copyright IBM Corp. 2007, 2016, 2018
* Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>
* David Hildenbrand <david@redhat.com>
* Janosch Frank <frankja@linux.vnet.ibm.com>
*/
#include <linux/kernel.h>
......@@ -521,6 +523,9 @@ void gmap_unlink(struct mm_struct *mm, unsigned long *table,
rcu_read_unlock();
}
static void gmap_pmdp_xchg(struct gmap *gmap, pmd_t *old, pmd_t new,
unsigned long gaddr);
/**
* gmap_link - set up shadow page tables to connect a host to a guest address
* @gmap: pointer to guest mapping meta data structure
......@@ -541,6 +546,7 @@ int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr)
p4d_t *p4d;
pud_t *pud;
pmd_t *pmd;
u64 unprot;
int rc;
BUG_ON(gmap_is_shadow(gmap));
......@@ -584,8 +590,8 @@ int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr)
return -EFAULT;
pmd = pmd_offset(pud, vmaddr);
VM_BUG_ON(pmd_none(*pmd));
/* large pmds cannot yet be handled */
if (pmd_large(*pmd))
/* Are we allowed to use huge pages? */
if (pmd_large(*pmd) && !gmap->mm->context.allow_gmap_hpage_1m)
return -EFAULT;
/* Link gmap segment table entry location to page table. */
rc = radix_tree_preload(GFP_KERNEL);
......@@ -596,10 +602,22 @@ int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr)
if (*table == _SEGMENT_ENTRY_EMPTY) {
rc = radix_tree_insert(&gmap->host_to_guest,
vmaddr >> PMD_SHIFT, table);
if (!rc)
*table = pmd_val(*pmd);
} else
rc = 0;
if (!rc) {
if (pmd_large(*pmd)) {
*table = (pmd_val(*pmd) &
_SEGMENT_ENTRY_HARDWARE_BITS_LARGE)
| _SEGMENT_ENTRY_GMAP_UC;
} else
*table = pmd_val(*pmd) &
_SEGMENT_ENTRY_HARDWARE_BITS;
}
} else if (*table & _SEGMENT_ENTRY_PROTECT &&
!(pmd_val(*pmd) & _SEGMENT_ENTRY_PROTECT)) {
unprot = (u64)*table;
unprot &= ~_SEGMENT_ENTRY_PROTECT;
unprot |= _SEGMENT_ENTRY_GMAP_UC;
gmap_pmdp_xchg(gmap, (pmd_t *)table, __pmd(unprot), gaddr);
}
spin_unlock(&gmap->guest_table_lock);
spin_unlock(ptl);
radix_tree_preload_end();
......@@ -690,6 +708,12 @@ void gmap_discard(struct gmap *gmap, unsigned long from, unsigned long to)
vmaddr |= gaddr & ~PMD_MASK;
/* Find vma in the parent mm */
vma = find_vma(gmap->mm, vmaddr);
/*
* We do not discard pages that are backed by
* hugetlbfs, so we don't have to refault them.
*/
if (vma && is_vm_hugetlb_page(vma))
continue;
size = min(to - gaddr, PMD_SIZE - (gaddr & ~PMD_MASK));
zap_page_range(vma, vmaddr, size);
}
......@@ -864,7 +888,128 @@ static int gmap_pte_op_fixup(struct gmap *gmap, unsigned long gaddr,
*/
static void gmap_pte_op_end(spinlock_t *ptl)
{
spin_unlock(ptl);
if (ptl)
spin_unlock(ptl);
}
/**
 * gmap_pmd_op_walk - look up the gmap pmd that covers a guest address
 * @gmap: pointer to guest mapping meta data structure
 * @gaddr: virtual address in the guest address space
 *
 * The guest_table_lock is taken for the table walk. On return it is still
 * held only if the pmd that was found is a large one; when no usable entry
 * exists, or the pmd is not large, the lock has already been dropped.
 *
 * Returns a pointer to the pmd for a guest address, or NULL if the walk
 * yields no entry or an empty (pmd_none) one.
 */
static inline pmd_t *gmap_pmd_op_walk(struct gmap *gmap, unsigned long gaddr)
{
	pmd_t *entry;
	int large = 0;

	BUG_ON(gmap_is_shadow(gmap));

	spin_lock(&gmap->guest_table_lock);
	entry = (pmd_t *) gmap_table_walk(gmap, gaddr, 1);
	if (entry && pmd_none(*entry))
		entry = NULL;
	if (entry)
		large = pmd_large(*entry);

	/* 4k page table entries are locked via the pte (pte_alloc_map_lock). */
	if (!large)
		spin_unlock(&gmap->guest_table_lock);

	return entry;
}
/**
 * gmap_pmd_op_end - drop the guest_table_lock again if the pmd is large
 * @gmap: pointer to the guest mapping meta data structure
 * @pmdp: pointer to the pmd
 *
 * Counterpart to gmap_pmd_op_walk(), which leaves the guest_table_lock
 * held only for large pmds.
 */
static inline void gmap_pmd_op_end(struct gmap *gmap, pmd_t *pmdp)
{
	/* Nothing to release for pmds that are not large. */
	if (!pmd_large(*pmdp))
		return;

	spin_unlock(&gmap->guest_table_lock);
}
/*
 * gmap_protect_pmd - remove access rights to memory and set pmd notification bits
 * @gmap: pointer to guest mapping meta data structure
 * @gaddr: virtual address in the guest address space
 * @pmdp: pointer to the pmd to be protected
 * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE
 * @bits: notification bits to set (GMAP_NOTIFY_MPROT, GMAP_NOTIFY_SHADOW)
 *
 * Returns:
 * 0 if successfully protected
 * -EAGAIN if a fixup is needed
 * -EINVAL if unsupported notifier bits have been specified
 *
 * Note that -EINVAL for GMAP_NOTIFY_SHADOW is only returned after the
 * protection change and the GMAP_NOTIFY_MPROT handling below have run.
 *
 * Expected to be called with gmap->mm->mmap_sem in read and
 * guest_table_lock held.
 */
static int gmap_protect_pmd(struct gmap *gmap, unsigned long gaddr,
pmd_t *pmdp, int prot, unsigned long bits)
{
/* Current invalid and protect state of the entry */
int pmd_i = pmd_val(*pmdp) & _SEGMENT_ENTRY_INVALID;
int pmd_p = pmd_val(*pmdp) & _SEGMENT_ENTRY_PROTECT;
pmd_t new = *pmdp;
/*
 * Fixup needed: an invalid entry can only satisfy PROT_NONE, and a
 * protected entry cannot satisfy PROT_WRITE.
 */
if ((pmd_i && (prot != PROT_NONE)) || (pmd_p && (prot == PROT_WRITE)))
return -EAGAIN;
/* PROT_NONE on a still valid entry: exchange it for an invalid one */
if (prot == PROT_NONE && !pmd_i) {
pmd_val(new) |= _SEGMENT_ENTRY_INVALID;
gmap_pmdp_xchg(gmap, pmdp, new, gaddr);
}
/* PROT_READ on a not yet protected entry: exchange it for a protected one */
if (prot == PROT_READ && !pmd_p) {
pmd_val(new) &= ~_SEGMENT_ENTRY_INVALID;
pmd_val(new) |= _SEGMENT_ENTRY_PROTECT;
gmap_pmdp_xchg(gmap, pmdp, new, gaddr);
}
/* Set the notification bit directly in the (possibly just exchanged) entry */
if (bits & GMAP_NOTIFY_MPROT)
pmd_val(*pmdp) |= _SEGMENT_ENTRY_GMAP_IN;
/* Shadow GMAP protection needs split PMDs */
if (bits & GMAP_NOTIFY_SHADOW)
return -EINVAL;
return 0;
}
/*
 * gmap_protect_pte - remove access rights to memory and set pgste bits
 * @gmap: pointer to guest mapping meta data structure
 * @gaddr: virtual address in the guest address space
 * @pmdp: pointer to the pmd associated with the pte
 * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE
 * @bits: notification bits to set (GMAP_NOTIFY_MPROT, GMAP_NOTIFY_SHADOW)
 *
 * Returns -EAGAIN if the pmd is invalid and a fixup is needed, -ENOMEM if
 * the pte could not be mapped, and otherwise the result of
 * ptep_force_prot().
 *
 * Expected to be called with gmap->mm->mmap_sem in read
 */
static int gmap_protect_pte(struct gmap *gmap, unsigned long gaddr,
			    pmd_t *pmdp, int prot, unsigned long bits)
{
	unsigned long pgste_bits = 0;
	spinlock_t *lock = NULL;
	pte_t *pte;
	int ret;

	if (pmd_val(*pmdp) & _SEGMENT_ENTRY_INVALID)
		return -EAGAIN;

	pte = pte_alloc_map_lock(gmap->mm, pmdp, gaddr, &lock);
	if (!pte)
		return -ENOMEM;

	/* Translate the generic notification bits into their pgste bits. */
	if (bits & GMAP_NOTIFY_MPROT)
		pgste_bits |= PGSTE_IN_BIT;
	if (bits & GMAP_NOTIFY_SHADOW)
		pgste_bits |= PGSTE_VSIE_BIT;

	/* Protect and unlock. */
	ret = ptep_force_prot(gmap->mm, gaddr, pte, prot, pgste_bits);
	gmap_pte_op_end(lock);

	return ret;
}
/*
......@@ -883,30 +1028,45 @@ static void gmap_pte_op_end(spinlock_t *ptl)
static int gmap_protect_range(struct gmap *gmap, unsigned long gaddr,
unsigned long len, int prot, unsigned long bits)
{
unsigned long vmaddr;
spinlock_t *ptl;
pte_t *ptep;
unsigned long vmaddr, dist;
pmd_t *pmdp;
int rc;
BUG_ON(gmap_is_shadow(gmap));
while (len) {
rc = -EAGAIN;
ptep = gmap_pte_op_walk(gmap, gaddr, &ptl);
if (ptep) {
rc = ptep_force_prot(gmap->mm, gaddr, ptep, prot, bits);
gmap_pte_op_end(ptl);
pmdp = gmap_pmd_op_walk(gmap, gaddr);
if (pmdp) {
if (!pmd_large(*pmdp)) {
rc = gmap_protect_pte(gmap, gaddr, pmdp, prot,
bits);
if (!rc) {
len -= PAGE_SIZE;
gaddr += PAGE_SIZE;
}
} else {
rc = gmap_protect_pmd(gmap, gaddr, pmdp, prot,
bits);
if (!rc) {
dist = HPAGE_SIZE - (gaddr & ~HPAGE_MASK);
len = len < dist ? 0 : len - dist;
gaddr = (gaddr & HPAGE_MASK) + HPAGE_SIZE;
}
}
gmap_pmd_op_end(gmap, pmdp);
}
if (rc) {
if (rc == -EINVAL)
return rc;
/* -EAGAIN, fixup of userspace mm and gmap */
vmaddr = __gmap_translate(gmap, gaddr);
if (IS_ERR_VALUE(vmaddr))
return vmaddr;
rc = gmap_pte_op_fixup(gmap, gaddr, vmaddr, prot);
if (rc)
return rc;
continue;
}
gaddr += PAGE_SIZE;
len -= PAGE_SIZE;
}
return 0;
}
......@@ -935,7 +1095,7 @@ int gmap_mprotect_notify(struct gmap *gmap, unsigned long gaddr,
if (!MACHINE_HAS_ESOP && prot == PROT_READ)
return -EINVAL;
down_read(&gmap->mm->mmap_sem);
rc = gmap_protect_range(gmap, gaddr, len, prot, PGSTE_IN_BIT);
rc = gmap_protect_range(gmap, gaddr, len, prot, GMAP_NOTIFY_MPROT);
up_read(&gmap->mm->mmap_sem);
return rc;
}
......@@ -1474,6 +1634,7 @@ struct gmap *gmap_shadow(struct gmap *parent, unsigned long asce,
unsigned long limit;
int rc;
BUG_ON(parent->mm->context.allow_gmap_hpage_1m);
BUG_ON(gmap_is_shadow(parent));
spin_lock(&parent->shadow_lock);
sg = gmap_find_shadow(parent, asce, edat_level);
......@@ -1526,7 +1687,7 @@ struct gmap *gmap_shadow(struct gmap *parent, unsigned long asce,
down_read(&parent->mm->mmap_sem);
rc = gmap_protect_range(parent, asce & _ASCE_ORIGIN,
((asce & _ASCE_TABLE_LENGTH) + 1) * PAGE_SIZE,
PROT_READ, PGSTE_VSIE_BIT);
PROT_READ, GMAP_NOTIFY_SHADOW);
up_read(&parent->mm->mmap_sem);
spin_lock(&parent->shadow_lock);
new->initialized = true;
......@@ -2092,6 +2253,225 @@ void ptep_notify(struct mm_struct *mm, unsigned long vmaddr,
}
EXPORT_SYMBOL_GPL(ptep_notify);
/*
 * pmdp_notify_gmap - clear the notification bit of a pmd and call the
 *		      gmap notifiers for its range
 * @gmap: pointer to the guest address space structure
 * @pmdp: pointer to the pmd entry
 * @gaddr: guest address at which the notified range starts
 *
 * The notifiers are called unconditionally for the HPAGE_SIZE bytes that
 * start at @gaddr.
 */
static void pmdp_notify_gmap(struct gmap *gmap, pmd_t *pmdp,
			     unsigned long gaddr)
{
	unsigned long last = gaddr + HPAGE_SIZE - 1;

	pmd_val(*pmdp) &= ~_SEGMENT_ENTRY_GMAP_IN;
	gmap_call_notifier(gmap, gaddr, last);
}
/**
 * gmap_pmdp_xchg - exchange a gmap pmd with another
 * @gmap: pointer to the guest address space structure
 * @pmdp: pointer to the pmd entry
 * @new: replacement entry
 * @gaddr: the affected guest address
 *
 * The notifiers are called for the segment, the old entry is passed to
 * __pmdp_idte() or __pmdp_csp() depending on the machine facilities, and
 * @new (with the notification bit removed) is then stored in @pmdp.
 *
 * This function is assumed to be called with the guest_table_lock
 * held.
 */
static void gmap_pmdp_xchg(struct gmap *gmap, pmd_t *pmdp, pmd_t new,
unsigned long gaddr)
{
/* Work on the start address of the segment */
gaddr &= HPAGE_MASK;
pmdp_notify_gmap(gmap, pmdp, gaddr);
/* The replacement entry never carries the notification bit */
pmd_val(new) &= ~_SEGMENT_ENTRY_GMAP_IN;
/* Deal with the old entry before the new one is stored */
if (MACHINE_HAS_TLB_GUEST)
__pmdp_idte(gaddr, (pmd_t *)pmdp, IDTE_GUEST_ASCE, gmap->asce,
IDTE_GLOBAL);
else if (MACHINE_HAS_IDTE)
__pmdp_idte(gaddr, (pmd_t *)pmdp, 0, 0, IDTE_GLOBAL);
else
__pmdp_csp(pmdp);
*pmdp = new;
}
/*
 * gmap_pmdp_clear - remove the guest pmd entries that belong to a host address
 * @mm: pointer to the process mm_struct
 * @vmaddr: virtual address in the process address space
 * @purge: if non-zero, __pmdp_csp() is called on an entry before it is
 *	   cleared
 *
 * For each gmap on mm->context.gmap_list the host_to_guest radix tree entry
 * at index vmaddr >> PMD_SHIFT is deleted under the guest_table_lock. If
 * there was one, the notifiers are called for the guest segment and the
 * guest segment table entry is set to _SEGMENT_ENTRY_EMPTY.
 */
static void gmap_pmdp_clear(struct mm_struct *mm, unsigned long vmaddr,
int purge)
{
pmd_t *pmdp;
struct gmap *gmap;
unsigned long gaddr;
rcu_read_lock();
list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) {
spin_lock(&gmap->guest_table_lock);
/* The radix tree stores the pointer to the guest segment table entry */
pmdp = (pmd_t *)radix_tree_delete(&gmap->host_to_guest,
vmaddr >> PMD_SHIFT);
if (pmdp) {
gaddr = __gmap_segment_gaddr((unsigned long *)pmdp);
pmdp_notify_gmap(gmap, pmdp, gaddr);
/* Only large-segment hardware bits and the UC bit are expected here */
WARN_ON(pmd_val(*pmdp) & ~(_SEGMENT_ENTRY_HARDWARE_BITS_LARGE |
_SEGMENT_ENTRY_GMAP_UC));
if (purge)
__pmdp_csp(pmdp);
pmd_val(*pmdp) = _SEGMENT_ENTRY_EMPTY;
}
spin_unlock(&gmap->guest_table_lock);
}
rcu_read_unlock();
}
/**
 * gmap_pmdp_invalidate - invalidate all affected guest pmd entries without
 *                        flushing
 * @mm: pointer to the process mm_struct
 * @vmaddr: virtual address in the process address space
 *
 * Calls gmap_pmdp_clear() with purge disabled, so __pmdp_csp() is not
 * called on the entries that get cleared.
 */
void gmap_pmdp_invalidate(struct mm_struct *mm, unsigned long vmaddr)
{
gmap_pmdp_clear(mm, vmaddr, 0);
}
EXPORT_SYMBOL_GPL(gmap_pmdp_invalidate);
/**
 * gmap_pmdp_csp - csp all affected guest pmd entries
 * @mm: pointer to the process mm_struct
 * @vmaddr: virtual address in the process address space
 *
 * Calls gmap_pmdp_clear() with purge enabled, so __pmdp_csp() is called on
 * each entry before it is cleared.
 */
void gmap_pmdp_csp(struct mm_struct *mm, unsigned long vmaddr)
{
gmap_pmdp_clear(mm, vmaddr, 1);
}
EXPORT_SYMBOL_GPL(gmap_pmdp_csp);
/**
 * gmap_pmdp_idte_local - invalidate and clear a guest pmd entry
 * @mm: pointer to the process mm_struct
 * @vmaddr: virtual address in the process address space
 *
 * For each gmap on mm->context.gmap_list the host_to_guest radix tree entry
 * at index vmaddr >> PMD_SHIFT is deleted under the guest_table_lock. If
 * there was one, the notifiers are called, __pmdp_idte() is issued with
 * IDTE_LOCAL and the guest segment table entry is set to
 * _SEGMENT_ENTRY_EMPTY.
 */
void gmap_pmdp_idte_local(struct mm_struct *mm, unsigned long vmaddr)
{
unsigned long *entry, gaddr;
struct gmap *gmap;
pmd_t *pmdp;
rcu_read_lock();
list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) {
spin_lock(&gmap->guest_table_lock);
/* The radix tree stores the pointer to the guest segment table entry */
entry = radix_tree_delete(&gmap->host_to_guest,
vmaddr >> PMD_SHIFT);
if (entry) {
pmdp = (pmd_t *)entry;
gaddr = __gmap_segment_gaddr(entry);
pmdp_notify_gmap(gmap, pmdp, gaddr);
/* Only large-segment hardware bits and the UC bit are expected here */
WARN_ON(*entry & ~(_SEGMENT_ENTRY_HARDWARE_BITS_LARGE |
_SEGMENT_ENTRY_GMAP_UC));
/*
 * NOTE(review): there is no fallback when neither facility is
 * available; presumably callers only get here on machines with
 * IDTE - confirm against the callers.
 */
if (MACHINE_HAS_TLB_GUEST)
__pmdp_idte(gaddr, pmdp, IDTE_GUEST_ASCE,
gmap->asce, IDTE_LOCAL);
else if (MACHINE_HAS_IDTE)
__pmdp_idte(gaddr, pmdp, 0, 0, IDTE_LOCAL);
*entry = _SEGMENT_ENTRY_EMPTY;
}
spin_unlock(&gmap->guest_table_lock);
}
rcu_read_unlock();
}
EXPORT_SYMBOL_GPL(gmap_pmdp_idte_local);
/**
 * gmap_pmdp_idte_global - invalidate and clear a guest pmd entry
 * @mm: pointer to the process mm_struct
 * @vmaddr: virtual address in the process address space
 *
 * For each gmap on mm->context.gmap_list the host_to_guest radix tree entry
 * at index vmaddr >> PMD_SHIFT is deleted under the guest_table_lock. If
 * there was one, the notifiers are called, __pmdp_idte() with IDTE_GLOBAL
 * (or __pmdp_csp() when no IDTE facility is available) is issued and the
 * guest segment table entry is set to _SEGMENT_ENTRY_EMPTY.
 */
void gmap_pmdp_idte_global(struct mm_struct *mm, unsigned long vmaddr)
{
unsigned long *entry, gaddr;
struct gmap *gmap;
pmd_t *pmdp;
rcu_read_lock();
list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) {
spin_lock(&gmap->guest_table_lock);
/* The radix tree stores the pointer to the guest segment table entry */
entry = radix_tree_delete(&gmap->host_to_guest,
vmaddr >> PMD_SHIFT);
if (entry) {
pmdp = (pmd_t *)entry;
gaddr = __gmap_segment_gaddr(entry);
pmdp_notify_gmap(gmap, pmdp, gaddr);
/* Only large-segment hardware bits and the UC bit are expected here */
WARN_ON(*entry & ~(_SEGMENT_ENTRY_HARDWARE_BITS_LARGE |
_SEGMENT_ENTRY_GMAP_UC));
/* Pick the invalidation primitive the machine offers */
if (MACHINE_HAS_TLB_GUEST)
__pmdp_idte(gaddr, pmdp, IDTE_GUEST_ASCE,
gmap->asce, IDTE_GLOBAL);
else if (MACHINE_HAS_IDTE)
__pmdp_idte(gaddr, pmdp, 0, 0, IDTE_GLOBAL);
else
__pmdp_csp(pmdp);
*entry = _SEGMENT_ENTRY_EMPTY;
}
spin_unlock(&gmap->guest_table_lock);
}
rcu_read_unlock();
}
EXPORT_SYMBOL_GPL(gmap_pmdp_idte_global);
/**
 * gmap_test_and_clear_dirty_pmd - test and reset segment dirty status
 * @gmap: pointer to guest address space
 * @pmdp: pointer to the pmd to be tested
 * @gaddr: virtual address in the guest address space
 *
 * Returns false for an invalid segment and for a protected segment whose
 * UC bit is not set. Otherwise the UC bit is cleared, read protection is
 * requested via gmap_protect_pmd() (its result is not checked) and true is
 * returned.
 *
 * This function is assumed to be called with the guest_table_lock
 * held.
 */
bool gmap_test_and_clear_dirty_pmd(struct gmap *gmap, pmd_t *pmdp,
				   unsigned long gaddr)
{
	unsigned long entry = pmd_val(*pmdp);

	if (entry & _SEGMENT_ENTRY_INVALID)
		return false;

	/* Already protected memory, which did not change is clean */
	if ((entry & _SEGMENT_ENTRY_PROTECT) &&
	    !(entry & _SEGMENT_ENTRY_GMAP_UC))
		return false;

	/* Clear UC indication and reset protection */
	pmd_val(*pmdp) &= ~_SEGMENT_ENTRY_GMAP_UC;
	gmap_protect_pmd(gmap, gaddr, pmdp, PROT_READ, 0);

	return true;
}
/**
 * gmap_sync_dirty_log_pmd - set bitmap based on dirty status of segment
 * @gmap: pointer to guest address space
 * @bitmap: dirty bitmap for this pmd, one bit per page table entry
 * @gaddr: virtual address in the guest address space
 * @vmaddr: virtual address in the host address space
 *
 * For a large pmd all _PAGE_ENTRIES bits are set if the segment is dirty;
 * otherwise each pte of the segment is tested individually and its bit is
 * set if it was dirty.
 *
 * The guest_table_lock is taken by gmap_pmd_op_walk() inside this function,
 * so the caller must not already hold it.
 */
void gmap_sync_dirty_log_pmd(struct gmap *gmap, unsigned long bitmap[4],
unsigned long gaddr, unsigned long vmaddr)
{
int i;
pmd_t *pmdp;
pte_t *ptep;
spinlock_t *ptl;
pmdp = gmap_pmd_op_walk(gmap, gaddr);
/* No gmap entry for this guest address: nothing to report */
if (!pmdp)
return;
if (pmd_large(*pmdp)) {
/* Dirty state is tracked per segment, so mark every page of it */
if (gmap_test_and_clear_dirty_pmd(gmap, pmdp, gaddr))
bitmap_fill(bitmap, _PAGE_ENTRIES);
} else {
/* Walk the host addresses of the segment page by page */
for (i = 0; i < _PAGE_ENTRIES; i++, vmaddr += PAGE_SIZE) {
ptep = pte_alloc_map_lock(gmap->mm, pmdp, vmaddr, &ptl);
/* A pte that cannot be mapped is skipped and stays clean in the bitmap */
if (!ptep)
continue;
if (ptep_test_and_clear_uc(gmap->mm, vmaddr, ptep))
set_bit(i, bitmap);
spin_unlock(ptl);
}
}
gmap_pmd_op_end(gmap, pmdp);
}
EXPORT_SYMBOL_GPL(gmap_sync_dirty_log_pmd);
static inline void thp_split_mm(struct mm_struct *mm)
{
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
......@@ -2168,17 +2548,45 @@ EXPORT_SYMBOL_GPL(s390_enable_sie);
* Enable storage key handling from now on and initialize the storage
* keys with the default key.
*/
static int __s390_enable_skey(pte_t *pte, unsigned long addr,
unsigned long next, struct mm_walk *walk)
static int __s390_enable_skey_pte(pte_t *pte, unsigned long addr,
unsigned long next, struct mm_walk *walk)
{
/* Clear storage key */
ptep_zap_key(walk->mm, addr, pte);
return 0;
}
/*
 * __s390_enable_skey_hugetlb - initialize the storage keys of a huge page
 * @pte: huge page table entry, interpreted as a pmd
 * @addr: unused
 * @hmask: unused
 * @next: unused
 * @walk: unused
 *
 * hugetlb_entry callback of the page table walk set up in
 * s390_enable_skey(). Valid, writable segments get their storage keys
 * initialized and PG_arch_1 set on their page; all other entries are left
 * alone.
 *
 * Always returns 0.
 */
static int __s390_enable_skey_hugetlb(pte_t *pte, unsigned long addr,
				      unsigned long hmask, unsigned long next,
				      struct mm_walk *walk)
{
	pmd_t *pmd = (pmd_t *)pte;
	unsigned long start, end;
	struct page *page = pmd_page(*pmd);

	/*
	 * The write check makes sure we do not set a key on shared
	 * memory. This is needed as the walker does not differentiate
	 * between actual guest memory and the process executable or
	 * shared libraries.
	 */
	if (pmd_val(*pmd) & _SEGMENT_ENTRY_INVALID ||
	    !(pmd_val(*pmd) & _SEGMENT_ENTRY_WRITE))
		return 0;

	start = pmd_val(*pmd) & HPAGE_MASK;
	/*
	 * __storage_key_init_range() loops while start < end, i.e. it takes
	 * an exclusive end address. Pass the first byte behind the huge page
	 * so that the range is exactly HPAGE_SIZE long; an inclusive end
	 * still covers every page, but keeps the range one byte short of a
	 * full 1MB frame and thereby defeats the whole-frame key setting.
	 */
	end = start + HPAGE_SIZE;
	__storage_key_init_range(start, end);
	set_bit(PG_arch_1, &page->flags);
	return 0;
}
int s390_enable_skey(void)
{
struct mm_walk walk = { .pte_entry = __s390_enable_skey };
struct mm_walk walk = {
.hugetlb_entry = __s390_enable_skey_hugetlb,
.pte_entry = __s390_enable_skey_pte,
};
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma;
int rc = 0;
......
......@@ -123,6 +123,29 @@ static inline pte_t __rste_to_pte(unsigned long rste)
return pte;
}
/*
 * clear_huge_pte_skeys - initialize the storage keys of a huge page once
 * @mm: mm the entry is set up for
 * @rste: region or segment table entry describing the huge page
 *
 * Does nothing unless @mm uses storage keys and @rste is valid. Otherwise
 * the keys of the whole huge page (PUD_SIZE for a region-third entry,
 * PMD_SIZE for a segment entry) are initialized, unless PG_arch_1 was
 * already set on the page, in which case the keys are left untouched.
 */
static void clear_huge_pte_skeys(struct mm_struct *mm, unsigned long rste)
{
	struct page *page;
	unsigned long size, paddr;

	if (!mm_uses_skeys(mm) ||
	    rste & _SEGMENT_ENTRY_INVALID)
		return;

	if ((rste & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R3) {
		page = pud_page(__pud(rste));
		size = PUD_SIZE;
		paddr = rste & PUD_MASK;
	} else {
		page = pmd_page(__pmd(rste));
		size = PMD_SIZE;
		paddr = rste & PMD_MASK;
	}

	/*
	 * __storage_key_init_range() loops while start < end, i.e. it takes
	 * an exclusive end address, so pass the first byte behind the huge
	 * page. An inclusive end still covers every page, but leaves the
	 * last 1MB frame of the range one byte short and thereby defeats
	 * the whole-frame key setting for it.
	 */
	if (!test_and_set_bit(PG_arch_1, &page->flags))
		__storage_key_init_range(paddr, paddr + size);
}
void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
pte_t *ptep, pte_t pte)
{
......@@ -137,6 +160,7 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
rste |= _REGION_ENTRY_TYPE_R3 | _REGION3_ENTRY_LARGE;
else
rste |= _SEGMENT_ENTRY_LARGE;
clear_huge_pte_skeys(mm, rste);
pte_val(*ptep) = rste;
}
......
......@@ -14,7 +14,7 @@
static inline unsigned long sske_frame(unsigned long addr, unsigned char skey)
{
asm volatile(".insn rrf,0xb22b0000,%[skey],%[addr],9,0"
asm volatile(".insn rrf,0xb22b0000,%[skey],%[addr],1,0"
: [addr] "+a" (addr) : [skey] "d" (skey));
return addr;
}
......@@ -23,8 +23,6 @@ void __storage_key_init_range(unsigned long start, unsigned long end)
{
unsigned long boundary, size;
if (!PAGE_DEFAULT_KEY)
return;
while (start < end) {
if (MACHINE_HAS_EDAT1) {
/* set storage keys for a 1MB frame */
......@@ -37,7 +35,7 @@ void __storage_key_init_range(unsigned long start, unsigned long end)
continue;
}
}
page_set_storage_key(start, PAGE_DEFAULT_KEY, 0);
page_set_storage_key(start, PAGE_DEFAULT_KEY, 1);
start += PAGE_SIZE;
}
}
......
......@@ -347,18 +347,27 @@ static inline void pmdp_idte_local(struct mm_struct *mm,
mm->context.asce, IDTE_LOCAL);
else
__pmdp_idte(addr, pmdp, 0, 0, IDTE_LOCAL);
if (mm_has_pgste(mm) && mm->context.allow_gmap_hpage_1m)
gmap_pmdp_idte_local(mm, addr);
}
static inline void pmdp_idte_global(struct mm_struct *mm,
unsigned long addr, pmd_t *pmdp)
{
if (MACHINE_HAS_TLB_GUEST)
if (MACHINE_HAS_TLB_GUEST) {
__pmdp_idte(addr, pmdp, IDTE_NODAT | IDTE_GUEST_ASCE,
mm->context.asce, IDTE_GLOBAL);
else if (MACHINE_HAS_IDTE)
if (mm_has_pgste(mm) && mm->context.allow_gmap_hpage_1m)
gmap_pmdp_idte_global(mm, addr);
} else if (MACHINE_HAS_IDTE) {
__pmdp_idte(addr, pmdp, 0, 0, IDTE_GLOBAL);
else
if (mm_has_pgste(mm) && mm->context.allow_gmap_hpage_1m)
gmap_pmdp_idte_global(mm, addr);
} else {
__pmdp_csp(pmdp);
if (mm_has_pgste(mm) && mm->context.allow_gmap_hpage_1m)
gmap_pmdp_csp(mm, addr);
}
}
static inline pmd_t pmdp_flush_direct(struct mm_struct *mm,
......@@ -392,6 +401,8 @@ static inline pmd_t pmdp_flush_lazy(struct mm_struct *mm,
cpumask_of(smp_processor_id()))) {
pmd_val(*pmdp) |= _SEGMENT_ENTRY_INVALID;
mm->context.flush_mm = 1;
if (mm_has_pgste(mm))
gmap_pmdp_invalidate(mm, addr);
} else {
pmdp_idte_global(mm, addr, pmdp);
}
......@@ -399,6 +410,24 @@ static inline pmd_t pmdp_flush_lazy(struct mm_struct *mm,
return old;
}
/*
 * pmd_alloc_map - get the pmd for an address via p4d_alloc(), pud_alloc()
 *		   and pmd_alloc()
 * @mm: mm whose page tables are walked
 * @addr: virtual address to look up
 *
 * Returns NULL if p4d_alloc() or pud_alloc() returns NULL, otherwise the
 * result of pmd_alloc() (which may itself be NULL).
 */
static pmd_t *pmd_alloc_map(struct mm_struct *mm, unsigned long addr)
{
	p4d_t *p4dp;
	pud_t *pudp;

	p4dp = p4d_alloc(mm, pgd_offset(mm, addr), addr);
	if (!p4dp)
		return NULL;

	pudp = pud_alloc(mm, p4dp, addr);
	if (!pudp)
		return NULL;

	return pmd_alloc(mm, pudp, addr);
}
pmd_t pmdp_xchg_direct(struct mm_struct *mm, unsigned long addr,
pmd_t *pmdp, pmd_t new)
{
......@@ -693,40 +722,14 @@ void ptep_zap_key(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
/*
* Test and reset if a guest page is dirty
*/
bool test_and_clear_guest_dirty(struct mm_struct *mm, unsigned long addr)
bool ptep_test_and_clear_uc(struct mm_struct *mm, unsigned long addr,
pte_t *ptep)
{
spinlock_t *ptl;
pgd_t *pgd;
p4d_t *p4d;
pud_t *pud;
pmd_t *pmd;
pgste_t pgste;
pte_t *ptep;
pte_t pte;
bool dirty;
int nodat;
pgd = pgd_offset(mm, addr);
p4d = p4d_alloc(mm, pgd, addr);
if (!p4d)
return false;
pud = pud_alloc(mm, p4d, addr);
if (!pud)
return false;
pmd = pmd_alloc(mm, pud, addr);
if (!pmd)
return false;
/* We can't run guests backed by huge pages, but userspace can
* still set them up and then try to migrate them without any
* migration support.
*/
if (pmd_large(*pmd))
return true;
ptep = pte_alloc_map_lock(mm, pmd, addr, &ptl);
if (unlikely(!ptep))
return false;
pgste = pgste_get_lock(ptep);
dirty = !!(pgste_val(pgste) & PGSTE_UC_BIT);
pgste_val(pgste) &= ~PGSTE_UC_BIT;
......@@ -742,21 +745,43 @@ bool test_and_clear_guest_dirty(struct mm_struct *mm, unsigned long addr)
*ptep = pte;
}
pgste_set_unlock(ptep, pgste);
spin_unlock(ptl);
return dirty;
}
EXPORT_SYMBOL_GPL(test_and_clear_guest_dirty);
EXPORT_SYMBOL_GPL(ptep_test_and_clear_uc);
int set_guest_storage_key(struct mm_struct *mm, unsigned long addr,
unsigned char key, bool nq)
{
unsigned long keyul;
unsigned long keyul, paddr;
spinlock_t *ptl;
pgste_t old, new;
pmd_t *pmdp;
pte_t *ptep;
ptep = get_locked_pte(mm, addr, &ptl);
pmdp = pmd_alloc_map(mm, addr);
if (unlikely(!pmdp))
return -EFAULT;
ptl = pmd_lock(mm, pmdp);
if (!pmd_present(*pmdp)) {
spin_unlock(ptl);
return -EFAULT;
}
if (pmd_large(*pmdp)) {
paddr = pmd_val(*pmdp) & HPAGE_MASK;
paddr |= addr & ~HPAGE_MASK;
/*
* Huge pmds need quiescing operations, they are
* always mapped.
*/
page_set_storage_key(paddr, key, 1);
spin_unlock(ptl);
return 0;
}
spin_unlock(ptl);
ptep = pte_alloc_map_lock(mm, pmdp, addr, &ptl);
if (unlikely(!ptep))
return -EFAULT;
......@@ -767,14 +792,14 @@ int set_guest_storage_key(struct mm_struct *mm, unsigned long addr,
pgste_val(new) |= (keyul & (_PAGE_CHANGED | _PAGE_REFERENCED)) << 48;
pgste_val(new) |= (keyul & (_PAGE_ACC_BITS | _PAGE_FP_BIT)) << 56;
if (!(pte_val(*ptep) & _PAGE_INVALID)) {
unsigned long address, bits, skey;
unsigned long bits, skey;
address = pte_val(*ptep) & PAGE_MASK;
skey = (unsigned long) page_get_storage_key(address);
paddr = pte_val(*ptep) & PAGE_MASK;
skey = (unsigned long) page_get_storage_key(paddr);
bits = skey & (_PAGE_CHANGED | _PAGE_REFERENCED);
skey = key & (_PAGE_ACC_BITS | _PAGE_FP_BIT);
/* Set storage key ACC and FP */
page_set_storage_key(address, skey, !nq);
page_set_storage_key(paddr, skey, !nq);
/* Merge host changed & referenced into pgste */
pgste_val(new) |= bits << 52;
}
......@@ -830,11 +855,32 @@ EXPORT_SYMBOL(cond_set_guest_storage_key);
int reset_guest_reference_bit(struct mm_struct *mm, unsigned long addr)
{
spinlock_t *ptl;
unsigned long paddr;
pgste_t old, new;
pmd_t *pmdp;
pte_t *ptep;
int cc = 0;
ptep = get_locked_pte(mm, addr, &ptl);
pmdp = pmd_alloc_map(mm, addr);
if (unlikely(!pmdp))
return -EFAULT;
ptl = pmd_lock(mm, pmdp);
if (!pmd_present(*pmdp)) {
spin_unlock(ptl);
return -EFAULT;
}
if (pmd_large(*pmdp)) {
paddr = pmd_val(*pmdp) & HPAGE_MASK;
paddr |= addr & ~HPAGE_MASK;
cc = page_reset_referenced(paddr);
spin_unlock(ptl);
return cc;
}
spin_unlock(ptl);
ptep = pte_alloc_map_lock(mm, pmdp, addr, &ptl);
if (unlikely(!ptep))
return -EFAULT;
......@@ -843,7 +889,8 @@ int reset_guest_reference_bit(struct mm_struct *mm, unsigned long addr)
pgste_val(new) &= ~PGSTE_GR_BIT;
if (!(pte_val(*ptep) & _PAGE_INVALID)) {
cc = page_reset_referenced(pte_val(*ptep) & PAGE_MASK);
paddr = pte_val(*ptep) & PAGE_MASK;
cc = page_reset_referenced(paddr);
/* Merge real referenced bit into host-set */
pgste_val(new) |= ((unsigned long) cc << 53) & PGSTE_HR_BIT;
}
......@@ -862,18 +909,42 @@ EXPORT_SYMBOL(reset_guest_reference_bit);
int get_guest_storage_key(struct mm_struct *mm, unsigned long addr,
unsigned char *key)
{
unsigned long paddr;
spinlock_t *ptl;
pgste_t pgste;
pmd_t *pmdp;
pte_t *ptep;
ptep = get_locked_pte(mm, addr, &ptl);
pmdp = pmd_alloc_map(mm, addr);
if (unlikely(!pmdp))
return -EFAULT;
ptl = pmd_lock(mm, pmdp);
if (!pmd_present(*pmdp)) {
/* Not yet mapped memory has a zero key */
spin_unlock(ptl);
*key = 0;
return 0;
}
if (pmd_large(*pmdp)) {
paddr = pmd_val(*pmdp) & HPAGE_MASK;
paddr |= addr & ~HPAGE_MASK;
*key = page_get_storage_key(paddr);
spin_unlock(ptl);
return 0;
}
spin_unlock(ptl);
ptep = pte_alloc_map_lock(mm, pmdp, addr, &ptl);
if (unlikely(!ptep))
return -EFAULT;
pgste = pgste_get_lock(ptep);
*key = (pgste_val(pgste) & (PGSTE_ACC_BITS | PGSTE_FP_BIT)) >> 56;
paddr = pte_val(*ptep) & PAGE_MASK;
if (!(pte_val(*ptep) & _PAGE_INVALID))
*key = page_get_storage_key(pte_val(*ptep) & PAGE_MASK);
*key = page_get_storage_key(paddr);
/* Reflect guest's logical view, not physical */
*key |= (pgste_val(pgste) & (PGSTE_GR_BIT | PGSTE_GC_BIT)) >> 48;
pgste_set_unlock(ptep, pgste);
......
......@@ -4,7 +4,7 @@
* numbering scheme from the Principles of Operation: most significant bit
* has bit number 0.
*
* Copyright IBM Corp. 2015
* Copyright IBM Corp. 2015, 2018
*
*/
......@@ -106,6 +106,7 @@ static struct facility_def facility_defs[] = {
.name = "FACILITIES_KVM_CPUMODEL",
.bits = (int[]){
156, /* etoken facility */
-1 /* END */
}
},
......
......@@ -309,6 +309,13 @@ static inline unsigned long kvm_dirty_bitmap_bytes(struct kvm_memory_slot *memsl
return ALIGN(memslot->npages, BITS_PER_LONG) / 8;
}
/*
 * Return the address that lies kvm_dirty_bitmap_bytes(memslot) bytes behind
 * the start of memslot->dirty_bitmap, i.e. the position directly after the
 * first dirty bitmap of the slot.
 */
static inline unsigned long *kvm_second_dirty_bitmap(struct kvm_memory_slot *memslot)
{
	unsigned long nr_longs;

	nr_longs = kvm_dirty_bitmap_bytes(memslot) / sizeof(*memslot->dirty_bitmap);

	return &memslot->dirty_bitmap[nr_longs];
}
struct kvm_s390_adapter_int {
u64 ind_addr;
u64 summary_addr;
......
......@@ -949,6 +949,7 @@ struct kvm_ppc_resize_hpt {
#define KVM_CAP_GET_MSR_FEATURES 153
#define KVM_CAP_HYPERV_EVENTFD 154
#define KVM_CAP_HYPERV_TLBFLUSH 155
#define KVM_CAP_S390_HPAGE_1M 156
#ifdef KVM_CAP_IRQ_ROUTING
......
......@@ -1169,7 +1169,7 @@ int kvm_get_dirty_log_protect(struct kvm *kvm,
n = kvm_dirty_bitmap_bytes(memslot);
dirty_bitmap_buffer = dirty_bitmap + n / sizeof(long);
dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot);
memset(dirty_bitmap_buffer, 0, n);
spin_lock(&kvm->mmu_lock);
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册