提交 545ac138 编写于 作者: J Jeremy Fitzhardinge 提交者: H. Peter Anvin

x86, spinlock: Replace pv spinlocks with pv ticketlocks

Rather than outright replacing the entire spinlock implementation in
order to paravirtualize it, keep the ticket lock implementation but add
a couple of pvops hooks on the slow patch (long spin on lock, unlocking
a contended lock).

Ticket locks have a number of nice properties, but they also have some
surprising behaviours in virtual environments.  They enforce a strict
FIFO ordering on cpus trying to take a lock; however, if the hypervisor
scheduler does not schedule the cpus in the correct order, the system can
waste a huge amount of time spinning until the next cpu can take the lock.

(See Thomas Friebel's talk "Prevent Guests from Spinning Around"
http://www.xen.org/files/xensummitboston08/LHP.pdf for more details.)

To address this, we add two hooks:
 - __ticket_spin_lock which is called after the cpu has been
   spinning on the lock for a significant number of iterations but has
   failed to take the lock (presumably because the cpu holding the lock
   has been descheduled).  The lock_spinning pvop is expected to block
   the cpu until it has been kicked by the current lock holder.
 - __ticket_spin_unlock, which on releasing a contended lock
   (there are more cpus with tail tickets), it looks to see if the next
   cpu is blocked and wakes it if so.

When compiled with CONFIG_PARAVIRT_SPINLOCKS disabled, a set of stub
functions causes all the extra code to go away.

Results:
=======
setup: 32 core machine with 32 vcpu KVM guest (HT off)  with 8GB RAM
base = 3.11-rc
patched = base + pvspinlock V12

+-----------------+----------------+--------+
 dbench (Throughput in MB/sec. Higher is better)
+-----------------+----------------+--------+
|   base (stdev %)|patched(stdev%) | %gain  |
+-----------------+----------------+--------+
| 15035.3   (0.3) |15150.0   (0.6) |   0.8  |
|  1470.0   (2.2) | 1713.7   (1.9) |  16.6  |
|   848.6   (4.3) |  967.8   (4.3) |  14.0  |
|   652.9   (3.5) |  685.3   (3.7) |   5.0  |
+-----------------+----------------+--------+

pvspinlock shows benefits for overcommit ratio > 1 for PLE enabled cases,
and undercommits results are flat
Signed-off-by: NJeremy Fitzhardinge <jeremy@goop.org>
Link: http://lkml.kernel.org/r/1376058122-8248-2-git-send-email-raghavendra.kt@linux.vnet.ibm.comReviewed-by: NKonrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Tested-by: NAttilio Rao <attilio.rao@citrix.com>
[ Raghavendra: Changed SPIN_THRESHOLD, fixed redefinition of arch_spinlock_t]
Signed-off-by: NRaghavendra K T <raghavendra.kt@linux.vnet.ibm.com>
Acked-by: NIngo Molnar <mingo@kernel.org>
Signed-off-by: NH. Peter Anvin <hpa@linux.intel.com>
上级 c095ba72
...@@ -712,36 +712,16 @@ static inline void __set_fixmap(unsigned /* enum fixed_addresses */ idx, ...@@ -712,36 +712,16 @@ static inline void __set_fixmap(unsigned /* enum fixed_addresses */ idx,
#if defined(CONFIG_SMP) && defined(CONFIG_PARAVIRT_SPINLOCKS) #if defined(CONFIG_SMP) && defined(CONFIG_PARAVIRT_SPINLOCKS)
static inline int arch_spin_is_locked(struct arch_spinlock *lock) static __always_inline void __ticket_lock_spinning(struct arch_spinlock *lock,
__ticket_t ticket)
{ {
return PVOP_CALL1(int, pv_lock_ops.spin_is_locked, lock); PVOP_VCALL2(pv_lock_ops.lock_spinning, lock, ticket);
} }
static inline int arch_spin_is_contended(struct arch_spinlock *lock) static __always_inline void ____ticket_unlock_kick(struct arch_spinlock *lock,
__ticket_t ticket)
{ {
return PVOP_CALL1(int, pv_lock_ops.spin_is_contended, lock); PVOP_VCALL2(pv_lock_ops.unlock_kick, lock, ticket);
}
#define arch_spin_is_contended arch_spin_is_contended
static __always_inline void arch_spin_lock(struct arch_spinlock *lock)
{
PVOP_VCALL1(pv_lock_ops.spin_lock, lock);
}
static __always_inline void arch_spin_lock_flags(struct arch_spinlock *lock,
unsigned long flags)
{
PVOP_VCALL2(pv_lock_ops.spin_lock_flags, lock, flags);
}
static __always_inline int arch_spin_trylock(struct arch_spinlock *lock)
{
return PVOP_CALL1(int, pv_lock_ops.spin_trylock, lock);
}
static __always_inline void arch_spin_unlock(struct arch_spinlock *lock)
{
PVOP_VCALL1(pv_lock_ops.spin_unlock, lock);
} }
#endif #endif
......
...@@ -327,13 +327,15 @@ struct pv_mmu_ops { ...@@ -327,13 +327,15 @@ struct pv_mmu_ops {
}; };
struct arch_spinlock; struct arch_spinlock;
#ifdef CONFIG_SMP
#include <asm/spinlock_types.h>
#else
typedef u16 __ticket_t;
#endif
struct pv_lock_ops { struct pv_lock_ops {
int (*spin_is_locked)(struct arch_spinlock *lock); void (*lock_spinning)(struct arch_spinlock *lock, __ticket_t ticket);
int (*spin_is_contended)(struct arch_spinlock *lock); void (*unlock_kick)(struct arch_spinlock *lock, __ticket_t ticket);
void (*spin_lock)(struct arch_spinlock *lock);
void (*spin_lock_flags)(struct arch_spinlock *lock, unsigned long flags);
int (*spin_trylock)(struct arch_spinlock *lock);
void (*spin_unlock)(struct arch_spinlock *lock);
}; };
/* This contains all the paravirt structures: we get a convenient /* This contains all the paravirt structures: we get a convenient
......
...@@ -34,6 +34,35 @@ ...@@ -34,6 +34,35 @@
# define UNLOCK_LOCK_PREFIX # define UNLOCK_LOCK_PREFIX
#endif #endif
/* How long a lock should spin before we consider blocking */
#define SPIN_THRESHOLD (1 << 15)
#ifndef CONFIG_PARAVIRT_SPINLOCKS
static __always_inline void __ticket_lock_spinning(struct arch_spinlock *lock,
__ticket_t ticket)
{
}
static __always_inline void ____ticket_unlock_kick(struct arch_spinlock *lock,
__ticket_t ticket)
{
}
#endif /* CONFIG_PARAVIRT_SPINLOCKS */
/*
* If a spinlock has someone waiting on it, then kick the appropriate
* waiting cpu.
*/
static __always_inline void __ticket_unlock_kick(struct arch_spinlock *lock,
__ticket_t next)
{
if (unlikely(lock->tickets.tail != next))
____ticket_unlock_kick(lock, next);
}
/* /*
* Ticket locks are conceptually two parts, one indicating the current head of * Ticket locks are conceptually two parts, one indicating the current head of
* the queue, and the other indicating the current tail. The lock is acquired * the queue, and the other indicating the current tail. The lock is acquired
...@@ -47,19 +76,24 @@ ...@@ -47,19 +76,24 @@
* in the high part, because a wide xadd increment of the low part would carry * in the high part, because a wide xadd increment of the low part would carry
* up and contaminate the high part. * up and contaminate the high part.
*/ */
static __always_inline void __ticket_spin_lock(arch_spinlock_t *lock) static __always_inline void __ticket_spin_lock(struct arch_spinlock *lock)
{ {
register struct __raw_tickets inc = { .tail = 1 }; register struct __raw_tickets inc = { .tail = 1 };
inc = xadd(&lock->tickets, inc); inc = xadd(&lock->tickets, inc);
for (;;) { for (;;) {
if (inc.head == inc.tail) unsigned count = SPIN_THRESHOLD;
break;
cpu_relax(); do {
inc.head = ACCESS_ONCE(lock->tickets.head); if (inc.head == inc.tail)
goto out;
cpu_relax();
inc.head = ACCESS_ONCE(lock->tickets.head);
} while (--count);
__ticket_lock_spinning(lock, inc.tail);
} }
barrier(); /* make sure nothing creeps before the lock is taken */ out: barrier(); /* make sure nothing creeps before the lock is taken */
} }
static __always_inline int __ticket_spin_trylock(arch_spinlock_t *lock) static __always_inline int __ticket_spin_trylock(arch_spinlock_t *lock)
...@@ -78,7 +112,10 @@ static __always_inline int __ticket_spin_trylock(arch_spinlock_t *lock) ...@@ -78,7 +112,10 @@ static __always_inline int __ticket_spin_trylock(arch_spinlock_t *lock)
static __always_inline void __ticket_spin_unlock(arch_spinlock_t *lock) static __always_inline void __ticket_spin_unlock(arch_spinlock_t *lock)
{ {
__ticket_t next = lock->tickets.head + 1;
__add(&lock->tickets.head, 1, UNLOCK_LOCK_PREFIX); __add(&lock->tickets.head, 1, UNLOCK_LOCK_PREFIX);
__ticket_unlock_kick(lock, next);
} }
static inline int __ticket_spin_is_locked(arch_spinlock_t *lock) static inline int __ticket_spin_is_locked(arch_spinlock_t *lock)
...@@ -95,8 +132,6 @@ static inline int __ticket_spin_is_contended(arch_spinlock_t *lock) ...@@ -95,8 +132,6 @@ static inline int __ticket_spin_is_contended(arch_spinlock_t *lock)
return (__ticket_t)(tmp.tail - tmp.head) > 1; return (__ticket_t)(tmp.tail - tmp.head) > 1;
} }
#ifndef CONFIG_PARAVIRT_SPINLOCKS
static inline int arch_spin_is_locked(arch_spinlock_t *lock) static inline int arch_spin_is_locked(arch_spinlock_t *lock)
{ {
return __ticket_spin_is_locked(lock); return __ticket_spin_is_locked(lock);
...@@ -129,8 +164,6 @@ static __always_inline void arch_spin_lock_flags(arch_spinlock_t *lock, ...@@ -129,8 +164,6 @@ static __always_inline void arch_spin_lock_flags(arch_spinlock_t *lock,
arch_spin_lock(lock); arch_spin_lock(lock);
} }
#endif /* CONFIG_PARAVIRT_SPINLOCKS */
static inline void arch_spin_unlock_wait(arch_spinlock_t *lock) static inline void arch_spin_unlock_wait(arch_spinlock_t *lock)
{ {
while (arch_spin_is_locked(lock)) while (arch_spin_is_locked(lock))
......
#ifndef _ASM_X86_SPINLOCK_TYPES_H #ifndef _ASM_X86_SPINLOCK_TYPES_H
#define _ASM_X86_SPINLOCK_TYPES_H #define _ASM_X86_SPINLOCK_TYPES_H
#ifndef __LINUX_SPINLOCK_TYPES_H
# error "please don't include this file directly"
#endif
#include <linux/types.h> #include <linux/types.h>
#if (CONFIG_NR_CPUS < 256) #if (CONFIG_NR_CPUS < 256)
......
...@@ -7,21 +7,10 @@ ...@@ -7,21 +7,10 @@
#include <asm/paravirt.h> #include <asm/paravirt.h>
static inline void
default_spin_lock_flags(arch_spinlock_t *lock, unsigned long flags)
{
arch_spin_lock(lock);
}
struct pv_lock_ops pv_lock_ops = { struct pv_lock_ops pv_lock_ops = {
#ifdef CONFIG_SMP #ifdef CONFIG_SMP
.spin_is_locked = __ticket_spin_is_locked, .lock_spinning = paravirt_nop,
.spin_is_contended = __ticket_spin_is_contended, .unlock_kick = paravirt_nop,
.spin_lock = __ticket_spin_lock,
.spin_lock_flags = default_spin_lock_flags,
.spin_trylock = __ticket_spin_trylock,
.spin_unlock = __ticket_spin_unlock,
#endif #endif
}; };
EXPORT_SYMBOL(pv_lock_ops); EXPORT_SYMBOL(pv_lock_ops);
......
...@@ -139,6 +139,9 @@ struct xen_spinlock { ...@@ -139,6 +139,9 @@ struct xen_spinlock {
xen_spinners_t spinners; /* count of waiting cpus */ xen_spinners_t spinners; /* count of waiting cpus */
}; };
static DEFINE_PER_CPU(int, lock_kicker_irq) = -1;
#if 0
static int xen_spin_is_locked(struct arch_spinlock *lock) static int xen_spin_is_locked(struct arch_spinlock *lock)
{ {
struct xen_spinlock *xl = (struct xen_spinlock *)lock; struct xen_spinlock *xl = (struct xen_spinlock *)lock;
...@@ -167,7 +170,6 @@ static int xen_spin_trylock(struct arch_spinlock *lock) ...@@ -167,7 +170,6 @@ static int xen_spin_trylock(struct arch_spinlock *lock)
} }
static DEFINE_PER_CPU(char *, irq_name); static DEFINE_PER_CPU(char *, irq_name);
static DEFINE_PER_CPU(int, lock_kicker_irq) = -1;
static DEFINE_PER_CPU(struct xen_spinlock *, lock_spinners); static DEFINE_PER_CPU(struct xen_spinlock *, lock_spinners);
/* /*
...@@ -354,6 +356,7 @@ static void xen_spin_unlock(struct arch_spinlock *lock) ...@@ -354,6 +356,7 @@ static void xen_spin_unlock(struct arch_spinlock *lock)
if (unlikely(xl->spinners)) if (unlikely(xl->spinners))
xen_spin_unlock_slow(xl); xen_spin_unlock_slow(xl);
} }
#endif
static irqreturn_t dummy_handler(int irq, void *dev_id) static irqreturn_t dummy_handler(int irq, void *dev_id)
{ {
...@@ -418,13 +421,14 @@ void __init xen_init_spinlocks(void) ...@@ -418,13 +421,14 @@ void __init xen_init_spinlocks(void)
return; return;
BUILD_BUG_ON(sizeof(struct xen_spinlock) > sizeof(arch_spinlock_t)); BUILD_BUG_ON(sizeof(struct xen_spinlock) > sizeof(arch_spinlock_t));
#if 0
pv_lock_ops.spin_is_locked = xen_spin_is_locked; pv_lock_ops.spin_is_locked = xen_spin_is_locked;
pv_lock_ops.spin_is_contended = xen_spin_is_contended; pv_lock_ops.spin_is_contended = xen_spin_is_contended;
pv_lock_ops.spin_lock = xen_spin_lock; pv_lock_ops.spin_lock = xen_spin_lock;
pv_lock_ops.spin_lock_flags = xen_spin_lock_flags; pv_lock_ops.spin_lock_flags = xen_spin_lock_flags;
pv_lock_ops.spin_trylock = xen_spin_trylock; pv_lock_ops.spin_trylock = xen_spin_trylock;
pv_lock_ops.spin_unlock = xen_spin_unlock; pv_lock_ops.spin_unlock = xen_spin_unlock;
#endif
} }
#ifdef CONFIG_XEN_DEBUG_FS #ifdef CONFIG_XEN_DEBUG_FS
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册