diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 5bb2459d71758d6550975c8324cd9a05463a171f..4bc3d336d0b7d05f19b7b7e4c49abf350308d174 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1529,6 +1529,24 @@ config NUMA
 	  Otherwise, you should say N.
 
+config NUMA_AWARE_SPINLOCKS
+	bool "Numa-aware spinlocks"
+	depends on NUMA && QUEUED_SPINLOCKS
+	# For now, we depend on PARAVIRT_SPINLOCKS to make the patching work.
+	# This is awkward, but hopefully would be resolved once static_call()
+	# is available.
+	depends on PARAVIRT_SPINLOCKS
+	default y
+	help
+	  Introduce NUMA (Non Uniform Memory Access) awareness into
+	  the slow path of spinlocks.
+
+	  The kernel will try to keep the lock on the same node,
+	  thus reducing the number of remote cache misses, while
+	  trading some of the short term fairness for better performance.
+
+	  Say N if you want absolute first come first serve fairness.
+
 config AMD_NUMA
 	def_bool y
 	prompt "Old style AMD Opteron NUMA detection"
diff --git a/arch/x86/include/asm/qspinlock.h b/arch/x86/include/asm/qspinlock.h
index 055c60a057567ff655be0157c4756e4dd36dbccc..47aafe9897e8e58bda65f126f8965d9fcb21d7d7 100644
--- a/arch/x86/include/asm/qspinlock.h
+++ b/arch/x86/include/asm/qspinlock.h
@@ -30,6 +30,10 @@ static __always_inline u32 queued_fetch_set_pending_acquire(struct qspinlock *lo
 	return val;
 }
 
+#ifdef CONFIG_NUMA_AWARE_SPINLOCKS
+extern void __cna_queued_spin_lock_slowpath(struct qspinlock *lock, u32 val);
+#endif
+
 #ifdef CONFIG_PARAVIRT_SPINLOCKS
 extern void native_queued_spin_lock_slowpath(struct qspinlock *lock, u32 val);
 extern void __pv_init_lock_hash(void);
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 40ad1702c871b1d80209d9eb22a29072b9246a4a..c0c17c82dc35b589435c289bf4d0fc4521effa13 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -643,6 +643,18 @@ void __init alternative_instructions(void)
 			    (unsigned long)__smp_locks_end);
 #endif
 
+#if defined(CONFIG_NUMA_AWARE_SPINLOCKS)
+	/*
+	 * If we have multiple NUMA nodes, switch from native
+	 * to the NUMA-friendly slow path for spin locks.
+	 */
+	if (nr_node_ids > 1 && pv_lock_ops.queued_spin_lock_slowpath ==
+			native_queued_spin_lock_slowpath) {
+		pv_lock_ops.queued_spin_lock_slowpath =
+			__cna_queued_spin_lock_slowpath;
+	}
+#endif
+
 	apply_paravirt(__parainstructions, __parainstructions_end);
 
 	restart_nmi();
diff --git a/kernel/locking/mcs_spinlock.h b/kernel/locking/mcs_spinlock.h
index bc6d3244e1afa3cf118bf2f392b1d6689d42b29f..36b802babc8840471ceb18c4c258ea55d6391537 100644
--- a/kernel/locking/mcs_spinlock.h
+++ b/kernel/locking/mcs_spinlock.h
@@ -17,7 +17,7 @@
 
 struct mcs_spinlock {
 	struct mcs_spinlock *next;
-	int locked; /* 1 if lock acquired */
+	u64 locked; /* 1 if lock acquired */
 	int count;  /* nesting count, see qspinlock.c */
 };
 
diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
index 39180000aa77147eb1c5aba159d6d11dbfed1727..6138ec2b6b2d0ea5158646244573a3c2fcfb46b3 100644
--- a/kernel/locking/qspinlock.c
+++ b/kernel/locking/qspinlock.c
@@ -20,7 +20,7 @@
  *          Peter Zijlstra
  */
 
-#ifndef _GEN_PV_LOCK_SLOWPATH
+#if !defined(_GEN_PV_LOCK_SLOWPATH) && !defined(_GEN_CNA_LOCK_SLOWPATH)
 
 #include <linux/smp.h>
 #include <linux/bug.h>
@@ -77,18 +77,14 @@
 #define MAX_NODES	4
 
 /*
- * On 64-bit architectures, the mcs_spinlock structure will be 16 bytes in
- * size and four of them will fit nicely in one 64-byte cacheline. For
- * pvqspinlock, however, we need more space for extra data. To accommodate
- * that, we insert two more long words to pad it up to 32 bytes. IOW, only
- * two of them can fit in a cacheline in this case. That is OK as it is rare
- * to have more than 2 levels of slowpath nesting in actual use. We don't
- * want to penalize pvqspinlocks to optimize for a rare case in native
- * qspinlocks.
+ * On 64-bit architectures, the mcs_spinlock structure will be 20 bytes in
+ * size. For pvqspinlock or the NUMA-aware variant, however, we need more
+ * space for extra data. To accommodate that, we insert two more long words
+ * to pad it up to 36 bytes.
  */
 struct qnode {
 	struct mcs_spinlock mcs;
-#ifdef CONFIG_PARAVIRT_SPINLOCKS
+#if defined(CONFIG_PARAVIRT_SPINLOCKS) || defined(CONFIG_NUMA_AWARE_SPINLOCKS)
 	long reserved[2];
 #endif
 };
@@ -327,7 +323,7 @@ static __always_inline void __pass_mcs_lock(struct mcs_spinlock *node,
 #define set_locked_empty_mcs	__set_locked_empty_mcs
 #define pass_mcs_lock		__pass_mcs_lock
 
-#endif /* _GEN_PV_LOCK_SLOWPATH */
+#endif /* _GEN_PV_LOCK_SLOWPATH && _GEN_CNA_LOCK_SLOWPATH */
 
 /**
  * queued_spin_lock_slowpath - acquire the queued spinlock
@@ -584,6 +580,29 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
 }
 EXPORT_SYMBOL(queued_spin_lock_slowpath);
 
+/*
+ * Generate the code for NUMA-aware spin locks
+ */
+#if !defined(_GEN_CNA_LOCK_SLOWPATH) && defined(CONFIG_NUMA_AWARE_SPINLOCKS)
+#define _GEN_CNA_LOCK_SLOWPATH
+
+#undef pv_init_node
+#define pv_init_node			cna_init_node
+
+#undef set_locked_empty_mcs
+#define set_locked_empty_mcs		cna_set_locked_empty_mcs
+
+#undef pass_mcs_lock
+#define pass_mcs_lock			cna_pass_mcs_lock
+
+#undef queued_spin_lock_slowpath
+#define queued_spin_lock_slowpath	__cna_queued_spin_lock_slowpath
+
+#include "qspinlock_cna.h"
+#include "qspinlock.c"
+
+#endif
+
 /*
  * Generate the paravirt code for queued_spin_unlock_slowpath().
  */
diff --git a/kernel/locking/qspinlock_cna.h b/kernel/locking/qspinlock_cna.h
new file mode 100644
index 0000000000000000000000000000000000000000..efb9b12b2f9bed988eb9357b6ca8a91cf98926a6
--- /dev/null
+++ b/kernel/locking/qspinlock_cna.h
@@ -0,0 +1,164 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _GEN_CNA_LOCK_SLOWPATH
+#error "do not include this file"
+#endif
+
+#include <linux/topology.h>
+
+/*
+ * Implement a NUMA-aware version of MCS (aka CNA, or compact NUMA-aware lock).
+ *
+ * In CNA, spinning threads are organized in two queues, a main queue for
+ * threads running on the same node as the current lock holder, and a
+ * secondary queue for threads running on other nodes. At the unlock time,
+ * the lock holder scans the main queue looking for a thread running on
+ * the same node. If found (call it thread T), all threads in the main queue
+ * between the current lock holder and T are moved to the end of the
+ * secondary queue, and the lock is passed to T. If such T is not found, the
+ * lock is passed to the first node in the secondary queue. Finally, if the
+ * secondary queue is empty, the lock is passed to the next thread in the
+ * main queue. To avoid starvation of threads in the secondary queue,
+ * those threads are moved back to the head of the main queue
+ * after a certain expected number of intra-node lock hand-offs.
+ *
+ * For more details, see https://arxiv.org/abs/1810.05600.
+ *
+ * Authors: Alex Kogan
+ *          Dave Dice
+ */
+
+struct cna_node {
+	struct mcs_spinlock mcs;
+	u32 numa_node;
+	u32 encoded_tail;
+	struct cna_node *tail;	/* points to the secondary queue tail */
+};
+
+#define CNA_NODE(ptr) ((struct cna_node *)(ptr))
+
+static void cna_init_node(struct mcs_spinlock *node)
+{
+	struct cna_node *cn = CNA_NODE(node);
+	struct mcs_spinlock *base_node;
+	int cpuid;
+
+	BUILD_BUG_ON(sizeof(struct cna_node) > sizeof(struct qnode));
+	/* we store a pointer in the node's @locked field */
+	BUILD_BUG_ON(sizeof(uintptr_t) > sizeof_field(struct mcs_spinlock, locked));
+
+	cpuid = smp_processor_id();
+	cn->numa_node = cpu_to_node(cpuid);
+
+	base_node = this_cpu_ptr(&qnodes[0].mcs);
+	cn->encoded_tail = encode_tail(cpuid, base_node->count - 1);
+}
+
+/**
+ * find_successor - Scan the main waiting queue looking for the first
+ * thread running on the same node as the lock holder. If found (call it
+ * thread T), move all threads in the main queue between the lock holder
+ * and T to the end of the secondary queue and return T; otherwise, return NULL.
+ */
+static struct cna_node *find_successor(struct mcs_spinlock *me)
+{
+	struct cna_node *me_cna = CNA_NODE(me);
+	struct cna_node *head_other, *tail_other, *cur;
+	struct cna_node *next = CNA_NODE(READ_ONCE(me->next));
+	int my_node;
+
+	/* @next should be set, else we would not be calling this function. */
+	WARN_ON_ONCE(next == NULL);
+
+	my_node = me_cna->numa_node;
+
+	/*
+	 * Fast path - check whether the immediate successor runs on
+	 * the same node.
+	 */
+	if (next->numa_node == my_node)
+		return next;
+
+	head_other = next;
+	tail_other = next;
+
+	/*
+	 * Traverse the main waiting queue starting from the successor of my
+	 * successor, and look for a thread running on the same node.
+	 */
+	cur = CNA_NODE(READ_ONCE(next->mcs.next));
+	while (cur) {
+		if (cur->numa_node == my_node) {
+			/*
+			 * Found a thread on the same node. Move threads
+			 * between me and that node into the secondary queue.
+			 */
+			if (me->locked > 1)
+				CNA_NODE(me->locked)->tail->mcs.next =
+					(struct mcs_spinlock *)head_other;
+			else
+				me->locked = (uintptr_t)head_other;
+			tail_other->mcs.next = NULL;
+			CNA_NODE(me->locked)->tail = tail_other;
+			return cur;
+		}
+		tail_other = cur;
+		cur = CNA_NODE(READ_ONCE(cur->mcs.next));
+	}
+	return NULL;
+}
+
+static inline bool cna_set_locked_empty_mcs(struct qspinlock *lock, u32 val,
+					struct mcs_spinlock *node)
+{
+	/* Check whether the secondary queue is empty. */
+	if (node->locked <= 1) {
+		if (atomic_try_cmpxchg_relaxed(&lock->val, &val,
+				_Q_LOCKED_VAL))
+			return true; /* No contention */
+	} else {
+		/*
+		 * Pass the lock to the first thread in the secondary
+		 * queue, but first try to update the queue's tail to
+		 * point to the last node in the secondary queue.
+		 */
+		struct cna_node *succ = CNA_NODE(node->locked);
+		u32 new = succ->tail->encoded_tail + _Q_LOCKED_VAL;
+
+		if (atomic_try_cmpxchg_relaxed(&lock->val, &val, new)) {
+			arch_mcs_spin_unlock_contended(&succ->mcs.locked, 1);
+			return true;
+		}
+	}
+
+	return false;
+}
+
+static inline void cna_pass_mcs_lock(struct mcs_spinlock *node,
+				     struct mcs_spinlock *next)
+{
+	struct cna_node *succ = NULL;
+	u64 *var = &next->locked;
+	u64 val = 1;
+
+	succ = find_successor(node);
+
+	if (succ) {
+		var = &succ->mcs.locked;
+		/*
+		 * We unlock a successor by passing a non-zero value,
+		 * so set @val to 1 iff @locked is 0, which will happen
+		 * if we acquired the MCS lock when its queue was empty
+		 */
+		val = node->locked + (node->locked == 0);
+	} else if (node->locked > 1) { /* if the secondary queue is not empty */
+		/* pass the lock to the first node in that queue */
+		succ = CNA_NODE(node->locked);
+		succ->tail->mcs.next = next;
+		var = &succ->mcs.locked;
+	}	/*
+		 * Otherwise, pass the lock to the immediate successor
+		 * in the main queue.
+		 */
+
+	arch_mcs_spin_unlock_contended(var, val);
+}
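Note (not part of the patch): the hand-off policy implemented by find_successor() and cna_pass_mcs_lock() is easier to follow in isolation. The sketch below is a minimal user-space model of that policy under simplifying assumptions: no atomics or concurrency, a plain node_id field standing in for numa_node, and explicit sec_head/sec_tail pointers standing in for the secondary-queue pointer that the patch packs into mcs_spinlock.locked. The names toy_node and pick_next are invented for this illustration, and the starvation-avoidance rotation described in the CNA comment block is not modeled.

#include <stdio.h>

struct toy_node {
	struct toy_node *next;	/* main-queue link, like mcs_spinlock.next */
	int node_id;		/* stands in for cna_node.numa_node */
};

/*
 * Secondary queue, modeled as two plain pointers instead of the pointer
 * the patch stores in the lock holder's @locked field.
 */
static struct toy_node *sec_head, *sec_tail;

/*
 * Return the next lock holder: the first main-queue waiter running on
 * @my_node, splicing every waiter skipped over onto the secondary queue.
 * If no such waiter exists, prefer the secondary queue head, and fall
 * back to the immediate successor.
 */
static struct toy_node *pick_next(struct toy_node *holder, int my_node)
{
	struct toy_node *cur = holder->next;
	struct toy_node *skip_head = NULL, *skip_tail = NULL;

	while (cur) {
		if (cur->node_id == my_node) {
			if (skip_head) {
				/* splice skipped waiters onto the secondary queue */
				if (sec_tail)
					sec_tail->next = skip_head;
				else
					sec_head = skip_head;
				sec_tail = skip_tail;
				skip_tail->next = NULL;
			}
			return cur;
		}
		if (!skip_head)
			skip_head = cur;
		skip_tail = cur;
		cur = cur->next;
	}
	return sec_head ? sec_head : holder->next;
}

int main(void)
{
	/* main queue: holder(node 0) -> a(node 1) -> b(node 1) -> c(node 0) */
	struct toy_node c = { NULL, 0 };
	struct toy_node b = { &c, 1 };
	struct toy_node a = { &b, 1 };
	struct toy_node holder = { &a, 0 };
	struct toy_node *next = pick_next(&holder, holder.node_id);

	printf("lock passes to a node %d waiter\n", next->node_id);
	printf("secondary queue head is on node %d\n",
	       sec_head ? sec_head->node_id : -1);
	return 0;
}

Running this prints that the lock skips the two node-1 waiters and goes to the node-0 waiter, with the skipped waiters forming the secondary queue, which mirrors the splice performed by find_successor() and the fallback order used by cna_pass_mcs_lock().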