Commit 2636acee authored by Alex Kogan, committed by Xie XiuQi

locking/qspinlock: Introduce CNA into the slow path of qspinlock

hulk inclusion
category: feature
bugzilla: 13227
CVE: NA

-------------------------------------------------

In CNA, spinning threads are organized in two queues, a main queue for
threads running on the same node as the current lock holder, and a
secondary queue for threads running on other nodes. At the unlock time,
the lock holder scans the main queue looking for a thread running on
the same node. If found (call it thread T), all threads in the main queue
between the current lock holder and T are moved to the end of the
secondary queue, and the lock is passed to T. If such T is not found, the
lock is passed to the first node in the secondary queue. Finally, if the
secondary queue is empty, the lock is passed to the next thread in the
main queue. For more details, see https://arxiv.org/abs/1810.05600.

Note that this variant of CNA may introduce starvation by continuously
passing the lock to threads running on the same node. This issue
will be addressed later in the series.

Enabling CNA is controlled via a new configuration option
(NUMA_AWARE_SPINLOCKS). The CNA variant is patched in at boot time
only if we run on a multi-node machine and the new config option is
enabled. For the time being, the patching requires
CONFIG_PARAVIRT_SPINLOCKS to be enabled as well.
However, this should be resolved once static_call() is available.
Signed-off-by: Alex Kogan <alex.kogan@oracle.com>
Reviewed-by: Steve Sistare <steven.sistare@oracle.com>
Signed-off-by: Wei Li <liwei391@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
Parent 6ebce3eb
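To make the hand-off described above concrete, here is a standalone, simplified C sketch of the queue-splitting step. The names (struct waiter, pick_successor, nid) are illustrative and not part of the patch; the actual kernel implementation is find_successor() in qspinlock_cna.h, shown in the diff below.

#include <stddef.h>
#include <stdio.h>

/* Simplified waiter node: a singly linked main queue plus a detached
 * secondary queue that collects remote-node waiters (illustrative only). */
struct waiter {
    struct waiter *next;
    int nid;            /* NUMA node of the spinning thread */
    const char *name;
};

/* Scan the main queue behind the lock holder for the first waiter on the
 * holder's node; splice every waiter skipped over onto the secondary queue. */
static struct waiter *pick_successor(struct waiter *holder,
                                     struct waiter **sec_head,
                                     struct waiter **sec_tail)
{
    struct waiter *head = holder->next;
    struct waiter *prev = NULL, *cur = head;

    /* Look for the first waiter running on the lock holder's node. */
    while (cur && cur->nid != holder->nid) {
        prev = cur;
        cur = cur->next;
    }
    if (!cur || !prev)
        return cur;    /* immediate successor is local, or no local waiter */

    /* Move the skipped remote waiters [head .. prev] to the secondary queue. */
    if (*sec_tail)
        (*sec_tail)->next = head;
    else
        *sec_head = head;
    *sec_tail = prev;
    prev->next = NULL;
    return cur;
}

int main(void)
{
    /* Main queue behind the lock holder A (node 0): B(1) -> C(1) -> D(0). */
    struct waiter d = { NULL, 0, "D" };
    struct waiter c = { &d,   1, "C" };
    struct waiter b = { &c,   1, "B" };
    struct waiter a = { &b,   0, "A" };
    struct waiter *sec_head = NULL, *sec_tail = NULL;

    struct waiter *succ = pick_successor(&a, &sec_head, &sec_tail);

    printf("lock is passed to %s\n", succ ? succ->name : "the secondary queue head");
    for (struct waiter *w = sec_head; w; w = w->next)
        printf("moved to the secondary queue: %s\n", w->name);
    return 0;
}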
@@ -1529,6 +1529,24 @@ config NUMA
Otherwise, you should say N.
config NUMA_AWARE_SPINLOCKS
bool "Numa-aware spinlocks"
depends on NUMA && QUEUED_SPINLOCKS
# For now, we depend on PARAVIRT_SPINLOCKS to make the patching work.
# This is awkward, but hopefully would be resolved once static_call()
# is available.
depends on PARAVIRT_SPINLOCKS
default y
help
Introduce NUMA (Non Uniform Memory Access) awareness into
the slow path of spinlocks.
The kernel will try to keep the lock on the same node,
thus reducing the number of remote cache misses, while
trading some of the short-term fairness for better performance.
Say N if you want absolute first-come, first-served fairness.
config AMD_NUMA
def_bool y
prompt "Old style AMD Opteron NUMA detection"
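For reference, a .config fragment that satisfies the dependencies stated above might look as follows. This is a sketch only; the full dependency closure (for example CONFIG_PARAVIRT for CONFIG_PARAVIRT_SPINLOCKS) must also be met.

CONFIG_NUMA=y
CONFIG_QUEUED_SPINLOCKS=y
# needed for now so the slow path can be switched through pv_lock_ops
CONFIG_PARAVIRT_SPINLOCKS=y
CONFIG_NUMA_AWARE_SPINLOCKS=y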
@@ -30,6 +30,10 @@ static __always_inline u32 queued_fetch_set_pending_acquire(struct qspinlock *lo
return val;
}
#ifdef CONFIG_NUMA_AWARE_SPINLOCKS
extern void __cna_queued_spin_lock_slowpath(struct qspinlock *lock, u32 val);
#endif
#ifdef CONFIG_PARAVIRT_SPINLOCKS
extern void native_queued_spin_lock_slowpath(struct qspinlock *lock, u32 val);
extern void __pv_init_lock_hash(void);
@@ -643,6 +643,18 @@ void __init alternative_instructions(void)
(unsigned long)__smp_locks_end);
#endif
#if defined(CONFIG_NUMA_AWARE_SPINLOCKS)
/*
* If we have multiple NUMA nodes, switch from native
* to the NUMA-friendly slow path for spin locks.
*/
if (nr_node_ids > 1 && pv_lock_ops.queued_spin_lock_slowpath ==
native_queued_spin_lock_slowpath) {
pv_lock_ops.queued_spin_lock_slowpath =
__cna_queued_spin_lock_slowpath;
}
#endif
apply_paravirt(__parainstructions, __parainstructions_end);
restart_nmi();
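The hunk above switches pv_lock_ops.queued_spin_lock_slowpath to the CNA variant once, at boot, when more than one NUMA node is present. The following standalone sketch (illustrative names only, not kernel code) shows the same init-time selection pattern through a single function pointer, which is also why CONFIG_PARAVIRT_SPINLOCKS is required until static_call() is available:

#include <stdio.h>

static void native_slowpath(void) { puts("native slow path"); }
static void cna_slowpath(void)    { puts("CNA slow path"); }

/* stands in for pv_lock_ops.queued_spin_lock_slowpath */
static void (*slowpath_op)(void) = native_slowpath;

static void init_spinlock_slowpath(int nr_nodes)
{
    /* switch only once, and only on multi-node machines */
    if (nr_nodes > 1 && slowpath_op == native_slowpath)
        slowpath_op = cna_slowpath;
}

int main(void)
{
    init_spinlock_slowpath(2);   /* pretend we booted on a 2-node machine */
    slowpath_op();               /* callers are unaffected by the switch */
    return 0;
}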
@@ -17,7 +17,7 @@
struct mcs_spinlock {
struct mcs_spinlock *next;
-	int locked; /* 1 if lock acquired */
+	u64 locked; /* 1 if lock acquired */
int count; /* nesting count, see qspinlock.c */
};
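The widening of @locked from int to u64 above is needed because the CNA code in qspinlock_cna.h (below) overloads this field: 0 still means "waiting", 1 means "lock granted with an empty secondary queue", and any larger value is a pointer to the head of the secondary queue (see the node->locked <= 1 check and the CNA_NODE(node->locked) uses). A small illustrative sketch of that encoding, with hypothetical names (demo_node, secondary_queue_empty, secondary_queue_head):

#include <stdint.h>
#include <stdbool.h>

struct demo_node {
    uint64_t locked;   /* 0, 1, or a pointer to the secondary queue head */
};

static bool secondary_queue_empty(const struct demo_node *n)
{
    return n->locked <= 1;
}

static struct demo_node *secondary_queue_head(const struct demo_node *n)
{
    return n->locked > 1 ? (struct demo_node *)(uintptr_t)n->locked : NULL;
}

int main(void)
{
    struct demo_node waiter = { .locked = 0 };   /* still spinning */
    struct demo_node holder = { .locked = 1 };   /* lock granted, no remote waiters */
    struct demo_node remote = { .locked = 0 };

    holder.locked = (uintptr_t)&remote;          /* secondary queue: [remote] */

    return (secondary_queue_empty(&waiter) &&
            !secondary_queue_empty(&holder) &&
            secondary_queue_head(&holder) == &remote) ? 0 : 1;
}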
@@ -20,7 +20,7 @@
* Peter Zijlstra <peterz@infradead.org>
*/
-#ifndef _GEN_PV_LOCK_SLOWPATH
+#if !defined(_GEN_PV_LOCK_SLOWPATH) && !defined(_GEN_CNA_LOCK_SLOWPATH)
#include <linux/smp.h>
#include <linux/bug.h>
@@ -77,18 +77,14 @@
#define MAX_NODES 4
/*
- * On 64-bit architectures, the mcs_spinlock structure will be 16 bytes in
- * size and four of them will fit nicely in one 64-byte cacheline. For
- * pvqspinlock, however, we need more space for extra data. To accommodate
- * that, we insert two more long words to pad it up to 32 bytes. IOW, only
- * two of them can fit in a cacheline in this case. That is OK as it is rare
- * to have more than 2 levels of slowpath nesting in actual use. We don't
- * want to penalize pvqspinlocks to optimize for a rare case in native
- * qspinlocks.
+ * On 64-bit architectures, the mcs_spinlock structure will be 20 bytes in
+ * size. For pvqspinlock or the NUMA-aware variant, however, we need more
+ * space for extra data. To accommodate that, we insert two more long words
+ * to pad it up to 36 bytes.
*/
struct qnode {
struct mcs_spinlock mcs;
-#ifdef CONFIG_PARAVIRT_SPINLOCKS
+#if defined(CONFIG_PARAVIRT_SPINLOCKS) || defined(CONFIG_NUMA_AWARE_SPINLOCKS)
long reserved[2];
#endif
};
@@ -327,7 +323,7 @@ static __always_inline void __pass_mcs_lock(struct mcs_spinlock *node,
#define set_locked_empty_mcs __set_locked_empty_mcs
#define pass_mcs_lock __pass_mcs_lock
-#endif /* _GEN_PV_LOCK_SLOWPATH */
+#endif /* _GEN_PV_LOCK_SLOWPATH && _GEN_CNA_LOCK_SLOWPATH */
/**
* queued_spin_lock_slowpath - acquire the queued spinlock
@@ -584,6 +580,29 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
}
EXPORT_SYMBOL(queued_spin_lock_slowpath);
/*
* Generate the code for NUMA-aware spin locks
*/
#if !defined(_GEN_CNA_LOCK_SLOWPATH) && defined(CONFIG_NUMA_AWARE_SPINLOCKS)
#define _GEN_CNA_LOCK_SLOWPATH
#undef pv_init_node
#define pv_init_node cna_init_node
#undef set_locked_empty_mcs
#define set_locked_empty_mcs cna_set_locked_empty_mcs
#undef pass_mcs_lock
#define pass_mcs_lock cna_pass_mcs_lock
#undef queued_spin_lock_slowpath
#define queued_spin_lock_slowpath __cna_queued_spin_lock_slowpath
#include "qspinlock_cna.h"
#include "qspinlock.c"
#endif
/*
* Generate the paravirt code for queued_spin_unlock_slowpath().
*/
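The _GEN_CNA_LOCK_SLOWPATH block above generates __cna_queued_spin_lock_slowpath() by redefining the hook macros and re-including qspinlock.c itself, the same trick already used for the paravirt variant. The following standalone sketch (hypothetical file name gen_demo.c and macro names, not kernel code) shows the mechanism in isolation:

/* gen_demo.c: the same function body is compiled twice, under different
 * names and with different hook macros, by re-including the current file. */
#ifndef GEN_SECOND_PASS
#include <stdio.h>

#define hook()    puts("native hook")
#define slowpath  native_slowpath
#endif

static void slowpath(void)
{
    hook();                 /* expands differently on each inclusion pass */
}

#ifndef GEN_SECOND_PASS
#define GEN_SECOND_PASS

#undef hook
#define hook()    puts("cna hook")
#undef slowpath
#define slowpath  cna_slowpath
#include "gen_demo.c"       /* second pass emits cna_slowpath() */

int main(void)
{
    native_slowpath();
    cna_slowpath();
    return 0;
}
#endif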
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _GEN_CNA_LOCK_SLOWPATH
#error "do not include this file"
#endif
#include <linux/topology.h>
/*
* Implement a NUMA-aware version of MCS (aka CNA, or compact NUMA-aware lock).
*
* In CNA, spinning threads are organized in two queues, a main queue for
* threads running on the same node as the current lock holder, and a
* secondary queue for threads running on other nodes. At the unlock time,
* the lock holder scans the main queue looking for a thread running on
* the same node. If found (call it thread T), all threads in the main queue
* between the current lock holder and T are moved to the end of the
* secondary queue, and the lock is passed to T. If such T is not found, the
* lock is passed to the first node in the secondary queue. Finally, if the
* secondary queue is empty, the lock is passed to the next thread in the
* main queue. To avoid starvation of threads in the secondary queue,
* those threads are moved back to the head of the main queue
* after a certain expected number of intra-node lock hand-offs.
*
* For more details, see https://arxiv.org/abs/1810.05600.
*
* Authors: Alex Kogan <alex.kogan@oracle.com>
* Dave Dice <dave.dice@oracle.com>
*/
struct cna_node {
struct mcs_spinlock mcs;
u32 numa_node;
u32 encoded_tail;
struct cna_node *tail; /* points to the secondary queue tail */
};
#define CNA_NODE(ptr) ((struct cna_node *)(ptr))
static void cna_init_node(struct mcs_spinlock *node)
{
struct cna_node *cn = CNA_NODE(node);
struct mcs_spinlock *base_node;
int cpuid;
BUILD_BUG_ON(sizeof(struct cna_node) > sizeof(struct qnode));
/* we store a pointer in the node's @locked field */
BUILD_BUG_ON(sizeof(uintptr_t) > sizeof_field(struct mcs_spinlock, locked));
cpuid = smp_processor_id();
cn->numa_node = cpu_to_node(cpuid);
base_node = this_cpu_ptr(&qnodes[0].mcs);
cn->encoded_tail = encode_tail(cpuid, base_node->count - 1);
}
/**
* find_successor - Scan the main waiting queue looking for the first
* thread running on the same node as the lock holder. If found (call it
* thread T), move all threads in the main queue between the lock holder
* and T to the end of the secondary queue and return T; otherwise, return NULL.
*/
static struct cna_node *find_successor(struct mcs_spinlock *me)
{
struct cna_node *me_cna = CNA_NODE(me);
struct cna_node *head_other, *tail_other, *cur;
struct cna_node *next = CNA_NODE(READ_ONCE(me->next));
int my_node;
/* @next should be set, else we would not be calling this function. */
WARN_ON_ONCE(next == NULL);
my_node = me_cna->numa_node;
/*
* Fast path - check whether the immediate successor runs on
* the same node.
*/
if (next->numa_node == my_node)
return next;
head_other = next;
tail_other = next;
/*
* Traverse the main waiting queue starting from the successor of my
* successor, and look for a thread running on the same node.
*/
cur = CNA_NODE(READ_ONCE(next->mcs.next));
while (cur) {
if (cur->numa_node == my_node) {
/*
* Found a thread on the same node. Move threads
* between me and that node into the secondary queue.
*/
if (me->locked > 1)
CNA_NODE(me->locked)->tail->mcs.next =
(struct mcs_spinlock *)head_other;
else
me->locked = (uintptr_t)head_other;
tail_other->mcs.next = NULL;
CNA_NODE(me->locked)->tail = tail_other;
return cur;
}
tail_other = cur;
cur = CNA_NODE(READ_ONCE(cur->mcs.next));
}
return NULL;
}
static inline bool cna_set_locked_empty_mcs(struct qspinlock *lock, u32 val,
struct mcs_spinlock *node)
{
/* Check whether the secondary queue is empty. */
if (node->locked <= 1) {
if (atomic_try_cmpxchg_relaxed(&lock->val, &val,
_Q_LOCKED_VAL))
return true; /* No contention */
} else {
/*
* Pass the lock to the first thread in the secondary
* queue, but first try to update the queue's tail to
* point to the last node in the secondary queue.
*/
struct cna_node *succ = CNA_NODE(node->locked);
u32 new = succ->tail->encoded_tail + _Q_LOCKED_VAL;
if (atomic_try_cmpxchg_relaxed(&lock->val, &val, new)) {
arch_mcs_spin_unlock_contended(&succ->mcs.locked, 1);
return true;
}
}
return false;
}
static inline void cna_pass_mcs_lock(struct mcs_spinlock *node,
struct mcs_spinlock *next)
{
struct cna_node *succ = NULL;
u64 *var = &next->locked;
u64 val = 1;
succ = find_successor(node);
if (succ) {
var = &succ->mcs.locked;
/*
* We unlock a successor by passing a non-zero value,
* so set @val to 1 iff @locked is 0, which will happen
* if we acquired the MCS lock when its queue was empty
*/
val = node->locked + (node->locked == 0);
} else if (node->locked > 1) { /* if the secondary queue is not empty */
/* pass the lock to the first node in that queue */
succ = CNA_NODE(node->locked);
succ->tail->mcs.next = next;
var = &succ->mcs.locked;
} /*
* Otherwise, pass the lock to the immediate successor
* in the main queue.
*/
arch_mcs_spin_unlock_contended(var, val);
}