提交 91562cf9 编写于 作者: L Linus Torvalds

Merge tag 'powerpc-6.1-3' of git://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux

Pull powerpc fixes from Michael Ellerman:

 - Fix a case of rescheduling with user access unlocked, when preempt is
   enabled.

 - A follow-up fix for a recent fix, which could lead to IRQ state
   assertions firing incorrectly.

 - Two fixes for lockdep warnings seen when using kfence with the Hash
   MMU.

 - Two fixes for preempt warnings seen when using the Hash MMU.

 - Two fixes for the VAS coprocessor mechanism used on pseries.

 - Prevent building some of our older KVM backends when
   CONTEXT_TRACKING_USER is enabled, as it's known to cause crashes.

 - A couple of fixes for issues seen with PMU NMIs.

Thanks to Nicholas Piggin, Guenter Roeck, Frederic Barrat, Haren Myneni,
Sachin Sant, and Samuel Holland.

* tag 'powerpc-6.1-3' of git://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux:
  powerpc/64s/interrupt: Fix clear of PACA_IRQS_HARD_DIS when returning to soft-masked context
  powerpc/64s/interrupt: Perf NMI should not take normal exit path
  powerpc/64/interrupt: Prevent NMI PMI causing a dangerous warning
  KVM: PPC: BookS PR-KVM and BookE do not support context tracking
  powerpc: Fix reschedule bug in KUAP-unlocked user copy
  powerpc/64s: Fix hash__change_memory_range preemption warning
  powerpc/64s: Disable preemption in hash lazy mmu mode
  powerpc/64s: make linear_map_hash_lock a raw spinlock
  powerpc/64s: make HPTE lock and native_tlbie_lock irq-safe
  powerpc/64s: Add lockdep for HPTE lock
  powerpc/pseries: Use lparcfg to reconfig VAS windows for DLPAR CPU
  powerpc/pseries/vas: Add VAS IRQ primary handler
......@@ -32,6 +32,11 @@ static inline void arch_enter_lazy_mmu_mode(void)
if (radix_enabled())
return;
/*
* apply_to_page_range can call us with preemption enabled when
* operating on kernel page tables.
*/
preempt_disable();
batch = this_cpu_ptr(&ppc64_tlb_batch);
batch->active = 1;
}
......@@ -47,6 +52,7 @@ static inline void arch_leave_lazy_mmu_mode(void)
if (batch->index)
__flush_tlb_pending(batch);
batch->active = 0;
preempt_enable();
}
#define arch_flush_lazy_mmu_mode() do {} while (0)
......
......@@ -813,6 +813,13 @@ kernel_dbg_exc:
EXCEPTION_COMMON(0x260)
CHECK_NAPPING()
addi r3,r1,STACK_FRAME_OVERHEAD
/*
* XXX: Returning from performance_monitor_exception taken as a
* soft-NMI (Linux irqs disabled) may be risky to use interrupt_return
* and could cause bugs in return or elsewhere. That case should just
* restore registers and return. There is a workaround for one known
* problem in interrupt_exit_kernel_prepare().
*/
bl performance_monitor_exception
b interrupt_return
......
......@@ -2357,9 +2357,21 @@ EXC_VIRT_END(performance_monitor, 0x4f00, 0x20)
EXC_COMMON_BEGIN(performance_monitor_common)
GEN_COMMON performance_monitor
addi r3,r1,STACK_FRAME_OVERHEAD
bl performance_monitor_exception
lbz r4,PACAIRQSOFTMASK(r13)
cmpdi r4,IRQS_ENABLED
bne 1f
bl performance_monitor_exception_async
b interrupt_return_srr
1:
bl performance_monitor_exception_nmi
/* Clear MSR_RI before setting SRR0 and SRR1. */
li r9,0
mtmsrd r9,1
kuap_kernel_restore r9, r10
EXCEPTION_RESTORE_REGS hsrr=0
RFI_TO_KERNEL
/**
* Interrupt 0xf20 - Vector Unavailable Interrupt.
......
......@@ -374,10 +374,18 @@ notrace unsigned long interrupt_exit_kernel_prepare(struct pt_regs *regs)
if (regs_is_unrecoverable(regs))
unrecoverable_exception(regs);
/*
* CT_WARN_ON comes here via program_check_exception,
* so avoid recursion.
* CT_WARN_ON comes here via program_check_exception, so avoid
* recursion.
*
* Skip the assertion on PMIs on 64e to work around a problem caused
* by NMI PMIs incorrectly taking this interrupt return path: it is
* possible for this to hit after interrupt exit to user switches
* context to user. See also the comment in the performance monitor
* handler in exceptions-64e.S.
*/
if (TRAP(regs) != INTERRUPT_PROGRAM)
if (!IS_ENABLED(CONFIG_PPC_BOOK3E_64) &&
TRAP(regs) != INTERRUPT_PROGRAM &&
TRAP(regs) != INTERRUPT_PERFMON)
CT_WARN_ON(ct_state() == CONTEXT_USER);
kuap = kuap_get_and_assert_locked();
......
......@@ -532,15 +532,24 @@ _ASM_NOKPROBE_SYMBOL(interrupt_return_\srr\()_kernel)
* Returning to soft-disabled context.
* Check if a MUST_HARD_MASK interrupt has become pending, in which
* case we need to disable MSR[EE] in the return context.
*
* The MSR[EE] check catches among other things the short incoherency
* in hard_irq_disable() between clearing MSR[EE] and setting
* PACA_IRQ_HARD_DIS.
*/
ld r12,_MSR(r1)
andi. r10,r12,MSR_EE
beq .Lfast_kernel_interrupt_return_\srr\() // EE already disabled
lbz r11,PACAIRQHAPPENED(r13)
andi. r10,r11,PACA_IRQ_MUST_HARD_MASK
beq .Lfast_kernel_interrupt_return_\srr\() // No HARD_MASK pending
bne 1f // HARD_MASK is pending
// No HARD_MASK pending, clear possible HARD_DIS set by interrupt
andi. r11,r11,(~PACA_IRQ_HARD_DIS)@l
stb r11,PACAIRQHAPPENED(r13)
b .Lfast_kernel_interrupt_return_\srr\()
/* Must clear MSR_EE from _MSR */
1: /* Must clear MSR_EE from _MSR */
#ifdef CONFIG_PPC_BOOK3S
li r10,0
/* Clear valid before changing _MSR */
......
......@@ -51,6 +51,7 @@ config KVM_BOOK3S_HV_POSSIBLE
config KVM_BOOK3S_32
tristate "KVM support for PowerPC book3s_32 processors"
depends on PPC_BOOK3S_32 && !SMP && !PTE_64BIT
depends on !CONTEXT_TRACKING_USER
select KVM
select KVM_BOOK3S_32_HANDLER
select KVM_BOOK3S_PR_POSSIBLE
......@@ -105,6 +106,7 @@ config KVM_BOOK3S_64_HV
config KVM_BOOK3S_64_PR
tristate "KVM support without using hypervisor mode in host"
depends on KVM_BOOK3S_64
depends on !CONTEXT_TRACKING_USER
select KVM_BOOK3S_PR_POSSIBLE
help
Support running guest kernels in virtual machines on processors
......@@ -190,6 +192,7 @@ config KVM_EXIT_TIMING
config KVM_E500V2
bool "KVM support for PowerPC E500v2 processors"
depends on PPC_E500 && !PPC_E500MC
depends on !CONTEXT_TRACKING_USER
select KVM
select KVM_MMIO
select MMU_NOTIFIER
......@@ -205,6 +208,7 @@ config KVM_E500V2
config KVM_E500MC
bool "KVM support for PowerPC E500MC/E5500/E6500 processors"
depends on PPC_E500MC
depends on !CONTEXT_TRACKING_USER
select KVM
select KVM_MMIO
select KVM_BOOKE_HV
......
......@@ -36,7 +36,17 @@ int exit_vmx_usercopy(void)
{
disable_kernel_altivec();
pagefault_enable();
preempt_enable();
preempt_enable_no_resched();
/*
* Must never explicitly call schedule (including preempt_enable())
* while in a kuap-unlocked user copy, because the AMR register will
* not be saved and restored across context switch. However preempt
* kernels need to be preempted as soon as possible if need_resched is
* set and we are preemptible. The hack here is to schedule a
* decrementer to fire here and reschedule for us if necessary.
*/
if (IS_ENABLED(CONFIG_PREEMPT) && need_resched())
set_dec(1);
return 0;
}
......
......@@ -43,6 +43,29 @@
static DEFINE_RAW_SPINLOCK(native_tlbie_lock);
/*
 * Lockdep annotations for the HPTE lock.
 *
 * With CONFIG_LOCKDEP the helpers below report acquire/release events on a
 * single static lockdep map named "hpte_lock"; without it they compile to
 * empty functions.
 */
#ifdef CONFIG_LOCKDEP
/* One lockdep map shared by every caller of the helpers below. */
static struct lockdep_map hpte_lock_map =
STATIC_LOCKDEP_MAP_INIT("hpte_lock", &hpte_lock_map);
/* Tell lockdep that the HPTE lock is being acquired. */
static void acquire_hpte_lock(void)
{
lock_map_acquire(&hpte_lock_map);
}
/* Tell lockdep that the HPTE lock is being released. */
static void release_hpte_lock(void)
{
lock_map_release(&hpte_lock_map);
}
#else
/* Lockdep disabled: nothing to record on acquire. */
static void acquire_hpte_lock(void)
{
}
/* Lockdep disabled: nothing to record on release. */
static void release_hpte_lock(void)
{
}
#endif
static inline unsigned long ___tlbie(unsigned long vpn, int psize,
int apsize, int ssize)
{
......@@ -220,6 +243,7 @@ static inline void native_lock_hpte(struct hash_pte *hptep)
{
unsigned long *word = (unsigned long *)&hptep->v;
acquire_hpte_lock();
while (1) {
if (!test_and_set_bit_lock(HPTE_LOCK_BIT, word))
break;
......@@ -234,6 +258,7 @@ static inline void native_unlock_hpte(struct hash_pte *hptep)
{
unsigned long *word = (unsigned long *)&hptep->v;
release_hpte_lock();
clear_bit_unlock(HPTE_LOCK_BIT, word);
}
......@@ -243,8 +268,11 @@ static long native_hpte_insert(unsigned long hpte_group, unsigned long vpn,
{
struct hash_pte *hptep = htab_address + hpte_group;
unsigned long hpte_v, hpte_r;
unsigned long flags;
int i;
local_irq_save(flags);
if (!(vflags & HPTE_V_BOLTED)) {
DBG_LOW(" insert(group=%lx, vpn=%016lx, pa=%016lx,"
" rflags=%lx, vflags=%lx, psize=%d)\n",
......@@ -263,8 +291,10 @@ static long native_hpte_insert(unsigned long hpte_group, unsigned long vpn,
hptep++;
}
if (i == HPTES_PER_GROUP)
if (i == HPTES_PER_GROUP) {
local_irq_restore(flags);
return -1;
}
hpte_v = hpte_encode_v(vpn, psize, apsize, ssize) | vflags | HPTE_V_VALID;
hpte_r = hpte_encode_r(pa, psize, apsize) | rflags;
......@@ -286,10 +316,13 @@ static long native_hpte_insert(unsigned long hpte_group, unsigned long vpn,
* Now set the first dword including the valid bit
* NOTE: this also unlocks the hpte
*/
release_hpte_lock();
hptep->v = cpu_to_be64(hpte_v);
__asm__ __volatile__ ("ptesync" : : : "memory");
local_irq_restore(flags);
return i | (!!(vflags & HPTE_V_SECONDARY) << 3);
}
......@@ -327,6 +360,7 @@ static long native_hpte_remove(unsigned long hpte_group)
return -1;
/* Invalidate the hpte. NOTE: this also unlocks it */
release_hpte_lock();
hptep->v = 0;
return i;
......@@ -339,6 +373,9 @@ static long native_hpte_updatepp(unsigned long slot, unsigned long newpp,
struct hash_pte *hptep = htab_address + slot;
unsigned long hpte_v, want_v;
int ret = 0, local = 0;
unsigned long irqflags;
local_irq_save(irqflags);
want_v = hpte_encode_avpn(vpn, bpsize, ssize);
......@@ -382,6 +419,8 @@ static long native_hpte_updatepp(unsigned long slot, unsigned long newpp,
if (!(flags & HPTE_NOHPTE_UPDATE))
tlbie(vpn, bpsize, apsize, ssize, local);
local_irq_restore(irqflags);
return ret;
}
......@@ -445,6 +484,9 @@ static void native_hpte_updateboltedpp(unsigned long newpp, unsigned long ea,
unsigned long vsid;
long slot;
struct hash_pte *hptep;
unsigned long flags;
local_irq_save(flags);
vsid = get_kernel_vsid(ea, ssize);
vpn = hpt_vpn(ea, vsid, ssize);
......@@ -463,6 +505,8 @@ static void native_hpte_updateboltedpp(unsigned long newpp, unsigned long ea,
* actual page size will be same.
*/
tlbie(vpn, psize, psize, ssize, 0);
local_irq_restore(flags);
}
/*
......@@ -476,6 +520,9 @@ static int native_hpte_removebolted(unsigned long ea, int psize, int ssize)
unsigned long vsid;
long slot;
struct hash_pte *hptep;
unsigned long flags;
local_irq_save(flags);
vsid = get_kernel_vsid(ea, ssize);
vpn = hpt_vpn(ea, vsid, ssize);
......@@ -493,6 +540,9 @@ static int native_hpte_removebolted(unsigned long ea, int psize, int ssize)
/* Invalidate the TLB */
tlbie(vpn, psize, psize, ssize, 0);
local_irq_restore(flags);
return 0;
}
......@@ -517,10 +567,11 @@ static void native_hpte_invalidate(unsigned long slot, unsigned long vpn,
/* recheck with locks held */
hpte_v = hpte_get_old_v(hptep);
if (HPTE_V_COMPARE(hpte_v, want_v) && (hpte_v & HPTE_V_VALID))
if (HPTE_V_COMPARE(hpte_v, want_v) && (hpte_v & HPTE_V_VALID)) {
/* Invalidate the hpte. NOTE: this also unlocks it */
release_hpte_lock();
hptep->v = 0;
else
} else
native_unlock_hpte(hptep);
}
/*
......@@ -580,10 +631,8 @@ static void native_hugepage_invalidate(unsigned long vsid,
hpte_v = hpte_get_old_v(hptep);
if (HPTE_V_COMPARE(hpte_v, want_v) && (hpte_v & HPTE_V_VALID)) {
/*
* Invalidate the hpte. NOTE: this also unlocks it
*/
/* Invalidate the hpte. NOTE: this also unlocks it */
release_hpte_lock();
hptep->v = 0;
} else
native_unlock_hpte(hptep);
......@@ -765,8 +814,10 @@ static void native_flush_hash_range(unsigned long number, int local)
if (!HPTE_V_COMPARE(hpte_v, want_v) || !(hpte_v & HPTE_V_VALID))
native_unlock_hpte(hptep);
else
else {
release_hpte_lock();
hptep->v = 0;
}
} pte_iterate_hashed_end();
}
......
......@@ -404,7 +404,8 @@ EXPORT_SYMBOL_GPL(hash__has_transparent_hugepage);
struct change_memory_parms {
unsigned long start, end, newpp;
unsigned int step, nr_cpus, master_cpu;
unsigned int step, nr_cpus;
atomic_t master_cpu;
atomic_t cpu_counter;
};
......@@ -478,7 +479,8 @@ static int change_memory_range_fn(void *data)
{
struct change_memory_parms *parms = data;
if (parms->master_cpu != smp_processor_id())
// First CPU goes through, all others wait.
if (atomic_xchg(&parms->master_cpu, 1) == 1)
return chmem_secondary_loop(parms);
// Wait for all but one CPU (this one) to call-in
......@@ -516,7 +518,7 @@ static bool hash__change_memory_range(unsigned long start, unsigned long end,
chmem_parms.end = end;
chmem_parms.step = step;
chmem_parms.newpp = newpp;
chmem_parms.master_cpu = smp_processor_id();
atomic_set(&chmem_parms.master_cpu, 0);
cpus_read_lock();
......
......@@ -1981,7 +1981,7 @@ long hpte_insert_repeating(unsigned long hash, unsigned long vpn,
}
#if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_KFENCE)
static DEFINE_SPINLOCK(linear_map_hash_lock);
static DEFINE_RAW_SPINLOCK(linear_map_hash_lock);
static void kernel_map_linear_page(unsigned long vaddr, unsigned long lmi)
{
......@@ -2005,10 +2005,10 @@ static void kernel_map_linear_page(unsigned long vaddr, unsigned long lmi)
mmu_linear_psize, mmu_kernel_ssize);
BUG_ON (ret < 0);
spin_lock(&linear_map_hash_lock);
raw_spin_lock(&linear_map_hash_lock);
BUG_ON(linear_map_hash_slots[lmi] & 0x80);
linear_map_hash_slots[lmi] = ret | 0x80;
spin_unlock(&linear_map_hash_lock);
raw_spin_unlock(&linear_map_hash_lock);
}
static void kernel_unmap_linear_page(unsigned long vaddr, unsigned long lmi)
......@@ -2018,14 +2018,14 @@ static void kernel_unmap_linear_page(unsigned long vaddr, unsigned long lmi)
unsigned long vpn = hpt_vpn(vaddr, vsid, mmu_kernel_ssize);
hash = hpt_hash(vpn, PAGE_SHIFT, mmu_kernel_ssize);
spin_lock(&linear_map_hash_lock);
raw_spin_lock(&linear_map_hash_lock);
if (!(linear_map_hash_slots[lmi] & 0x80)) {
spin_unlock(&linear_map_hash_lock);
raw_spin_unlock(&linear_map_hash_lock);
return;
}
hidx = linear_map_hash_slots[lmi] & 0x7f;
linear_map_hash_slots[lmi] = 0;
spin_unlock(&linear_map_hash_lock);
raw_spin_unlock(&linear_map_hash_lock);
if (hidx & _PTEIDX_SECONDARY)
hash = ~hash;
slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
......
......@@ -35,6 +35,7 @@
#include <asm/drmem.h>
#include "pseries.h"
#include "vas.h" /* pseries_vas_dlpar_cpu() */
/*
* This isn't a module but we expose that to userspace
......@@ -748,6 +749,16 @@ static ssize_t lparcfg_write(struct file *file, const char __user * buf,
return -EINVAL;
retval = update_ppp(new_entitled_ptr, NULL);
if (retval == H_SUCCESS || retval == H_CONSTRAINED) {
/*
* The hypervisor assigns VAS resources based
* on entitled capacity for shared mode.
* Reconfig VAS windows based on DLPAR CPU events.
*/
if (pseries_vas_dlpar_cpu() != 0)
retval = H_HARDWARE;
}
} else if (!strcmp(kbuf, "capacity_weight")) {
char *endp;
*new_weight_ptr = (u8) simple_strtoul(tmp, &endp, 10);
......
......@@ -200,16 +200,41 @@ static irqreturn_t pseries_vas_fault_thread_fn(int irq, void *data)
struct vas_user_win_ref *tsk_ref;
int rc;
rc = h_get_nx_fault(txwin->vas_win.winid, (u64)virt_to_phys(&crb));
if (!rc) {
tsk_ref = &txwin->vas_win.task_ref;
vas_dump_crb(&crb);
vas_update_csb(&crb, tsk_ref);
while (atomic_read(&txwin->pending_faults)) {
rc = h_get_nx_fault(txwin->vas_win.winid, (u64)virt_to_phys(&crb));
if (!rc) {
tsk_ref = &txwin->vas_win.task_ref;
vas_dump_crb(&crb);
vas_update_csb(&crb, tsk_ref);
}
atomic_dec(&txwin->pending_faults);
}
return IRQ_HANDLED;
}
/*
 * Primary (hard IRQ) handler for the VAS fault interrupt.
 *
 * irq_default_primary_handler() can only be used together with
 * IRQF_ONESHOT, which disables the IRQ before the thread handler runs
 * and re-enables it afterwards. Disabling the interrupt puts the VAS
 * IRQ into the OFF state in the hypervisor, and if NX raises a fault
 * interrupt during that window the hypervisor will not deliver it to
 * the LPAR. A VAS-specific primary handler is therefore used instead
 * of the default one.
 */
static irqreturn_t pseries_vas_irq_handler(int irq, void *data)
{
	struct pseries_vas_window *win = data;

	/*
	 * Record the fault; if the thread handler is already running it
	 * will process this interrupt as well.
	 */
	atomic_inc(&win->pending_faults);

	return IRQ_WAKE_THREAD;
}
/*
* Allocate window and setup IRQ mapping.
*/
......@@ -240,8 +265,9 @@ static int allocate_setup_window(struct pseries_vas_window *txwin,
goto out_irq;
}
rc = request_threaded_irq(txwin->fault_virq, NULL,
pseries_vas_fault_thread_fn, IRQF_ONESHOT,
rc = request_threaded_irq(txwin->fault_virq,
pseries_vas_irq_handler,
pseries_vas_fault_thread_fn, 0,
txwin->name, txwin);
if (rc) {
pr_err("VAS-Window[%d]: Request IRQ(%u) failed with %d\n",
......@@ -826,6 +852,25 @@ int vas_reconfig_capabilties(u8 type, int new_nr_creds)
mutex_unlock(&vas_pseries_mutex);
return rc;
}
/*
 * Query the hypervisor for the default GZIP feature capabilities and
 * pass the reported target_lpar_creds value to
 * vas_reconfig_capabilties().
 *
 * Returns 0 when both steps succeed; otherwise logs an error and
 * returns the non-zero status of the step that failed.
 */
int pseries_vas_dlpar_cpu(void)
{
	int status;

	status = h_query_vas_capabilities(H_QUERY_VAS_CAPABILITIES,
					  vascaps[VAS_GZIP_DEF_FEAT_TYPE].feat,
					  (u64)virt_to_phys(&hv_cop_caps));
	if (status)
		goto failed;

	status = vas_reconfig_capabilties(VAS_GZIP_DEF_FEAT_TYPE,
				be16_to_cpu(hv_cop_caps.target_lpar_creds));
	if (status)
		goto failed;

	return 0;

failed:
	pr_err("Failed reconfig VAS capabilities with DLPAR\n");
	return status;
}
/*
* Total number of default credits available (target_credits)
* in LPAR depends on number of cores configured. It varies based on
......@@ -840,7 +885,15 @@ static int pseries_vas_notifier(struct notifier_block *nb,
struct of_reconfig_data *rd = data;
struct device_node *dn = rd->dn;
const __be32 *intserv = NULL;
int new_nr_creds, len, rc = 0;
int len;
/*
* For shared CPU partition, the hypervisor assigns total credits
* based on entitled core capacity. So updating VAS windows will
* be called from lparcfg_write().
*/
if (is_shared_processor())
return NOTIFY_OK;
if ((action == OF_RECONFIG_ATTACH_NODE) ||
(action == OF_RECONFIG_DETACH_NODE))
......@@ -852,19 +905,7 @@ static int pseries_vas_notifier(struct notifier_block *nb,
if (!intserv)
return NOTIFY_OK;
rc = h_query_vas_capabilities(H_QUERY_VAS_CAPABILITIES,
vascaps[VAS_GZIP_DEF_FEAT_TYPE].feat,
(u64)virt_to_phys(&hv_cop_caps));
if (!rc) {
new_nr_creds = be16_to_cpu(hv_cop_caps.target_lpar_creds);
rc = vas_reconfig_capabilties(VAS_GZIP_DEF_FEAT_TYPE,
new_nr_creds);
}
if (rc)
pr_err("Failed reconfig VAS capabilities with DLPAR\n");
return rc;
return pseries_vas_dlpar_cpu();
}
static struct notifier_block pseries_vas_nb = {
......
......@@ -132,6 +132,7 @@ struct pseries_vas_window {
u64 flags;
char *name;
int fault_virq;
atomic_t pending_faults; /* Number of pending faults */
};
int sysfs_add_vas_caps(struct vas_cop_feat_caps *caps);
......@@ -140,10 +141,15 @@ int __init sysfs_pseries_vas_init(struct vas_all_caps *vas_caps);
/*
 * VAS entry points used by other pseries code. When CONFIG_PPC_VAS is
 * not set, callers get inline stubs that do nothing and return 0.
 */
#ifdef CONFIG_PPC_VAS
int vas_migration_handler(int action);
int pseries_vas_dlpar_cpu(void);
#else
/* Stub: VAS not configured, always returns 0. */
static inline int vas_migration_handler(int action)
{
return 0;
}
/* Stub: VAS not configured, always returns 0. */
static inline int pseries_vas_dlpar_cpu(void)
{
return 0;
}
#endif
#endif /* _VAS_H */
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册