IB/hfi1: Add global structure for affinity assignments

When HFI units get initialized, they each use their own mask copy for affinity assignments. On a multi-HFI system, affinity assignments overbook CPU cores as each HFI doesn't have knowledge of affinity assignments for other HFI units. Therefore, some CPU cores are never used for interrupt handlers in systems with high number of CPU cores per NUMA node. For multi-HFI systems, SDMA engine interrupt assignments start all over from the first CPU in the local NUMA node after the first HFI initialization. This change allows assignments to continue where the last HFI unit left off. Add global structure for affinity assignments for multiple HFIs to share affinity mask. Reviewed-by: N Jianxin Xiong <jianxin.xiong@intel.com> Reviewed-by: N Jubin John <jubin.john@intel.com> Reviewed-by: N Mike Marciniszyn <mike.marciniszyn@intel.com> Signed-off-by: N Sebastian Sanchez <sebastian.sanchez@intel.com> Signed-off-by: N Doug Ledford <dledford@redhat.com>

IB/hfi1: Add global structure for affinity assignments
When HFI units get initialized, they each use their own mask copy for affinity assignments. On a multi-HFI system, affinity assignments overbook CPU cores as each HFI doesn't have knowledge of affinity assignments for other HFI units. Therefore, some CPU cores are never used for interrupt handlers in systems with high number of CPU cores per NUMA node. For multi-HFI systems, SDMA engine interrupt assignments start all over from the first CPU in the local NUMA node after the first HFI initialization. This change allows assignments to continue where the last HFI unit left off. Add global structure for affinity assignments for multiple HFIs to share affinity mask. Reviewed-by: N Jianxin Xiong <jianxin.xiong@intel.com> Reviewed-by: N Jubin John <jubin.john@intel.com> Reviewed-by: N Mike Marciniszyn <mike.marciniszyn@intel.com> Signed-off-by: N Sebastian Sanchez <sebastian.sanchez@intel.com> Signed-off-by: N Doug Ledford <dledford@redhat.com>
4197344b · Dennis Dalessandro · Doug Ledford · 2b719046 · 4197344b · 4197344b
4 changed file
--- a/drivers/infiniband/hw/hfi1/affinity.c
+++ b/drivers/infiniband/hw/hfi1/affinity.c
@@ -53,6 +53,11 @@
 #include "sdma.h"
 #include "trace.h"

+struct hfi1_affinity_node_list node_affinity = {
+	.list = LIST_HEAD_INIT(node_affinity.list),
+	.lock = __SPIN_LOCK_UNLOCKED(&node_affinity.lock),
+};
+
 /* Name of IRQ types, indexed by enum irq_type */
 static const char * const irq_type_names[] = {
 	"SDMA",
@@ -69,45 +74,100 @@ static inline void init_cpu_mask_set(struct cpu_mask_set *set)
 }

 /* Initialize non-HT cpu cores mask */
-int init_real_cpu_mask(struct hfi1_devdata *dd)
+void init_real_cpu_mask(void)
 {
-	struct hfi1_affinity *info;
 	int possible, curr_cpu, i, ht;

-	info = kzalloc(sizeof(*info), GFP_KERNEL);
-	if (!info)
-		return -ENOMEM;
-
-	cpumask_clear(&info->real_cpu_mask);
+	cpumask_clear(&node_affinity.real_cpu_mask);

 	/* Start with cpu online mask as the real cpu mask */
-	cpumask_copy(&info->real_cpu_mask, cpu_online_mask);
+	cpumask_copy(&node_affinity.real_cpu_mask, cpu_online_mask);

 	/*
 	 * Remove HT cores from the real cpu mask.  Do this in two steps below.
 	 */
-	possible = cpumask_weight(&info->real_cpu_mask);
+	possible = cpumask_weight(&node_affinity.real_cpu_mask);
 	ht = cpumask_weight(topology_sibling_cpumask(
-					cpumask_first(&info->real_cpu_mask)));
+				cpumask_first(&node_affinity.real_cpu_mask)));
 	/*
 	 * Step 1.  Skip over the first N HT siblings and use them as the
 	 * "real" cores.  Assumes that HT cores are not enumerated in
 	 * succession (except in the single core case).
 	 */
-	curr_cpu = cpumask_first(&info->real_cpu_mask);
+	curr_cpu = cpumask_first(&node_affinity.real_cpu_mask);
 	for (i = 0; i < possible / ht; i++)
-		curr_cpu = cpumask_next(curr_cpu, &info->real_cpu_mask);
+		curr_cpu = cpumask_next(curr_cpu, &node_affinity.real_cpu_mask);
 	/*
 	 * Step 2.  Remove the remaining HT siblings.  Use cpumask_next() to
 	 * skip any gaps.
 	 */
 	for (; i < possible; i++) {
-		cpumask_clear_cpu(curr_cpu, &info->real_cpu_mask);
-		curr_cpu = cpumask_next(curr_cpu, &info->real_cpu_mask);
+		cpumask_clear_cpu(curr_cpu, &node_affinity.real_cpu_mask);
+		curr_cpu = cpumask_next(curr_cpu, &node_affinity.real_cpu_mask);
 	}
+}

-	dd->affinity = info;
-	return 0;
+void node_affinity_init(void)
+{
+	cpumask_copy(&node_affinity.proc.mask, cpu_online_mask);
+	/*
+	 * The real cpu mask is part of the affinity struct but it has to be
+	 * initialized early. It is needed to calculate the number of user
+	 * contexts in set_up_context_variables().
+	 */
+	init_real_cpu_mask();
+}
+
+void node_affinity_destroy(void)
+{
+	struct list_head *pos, *q;
+	struct hfi1_affinity_node *entry;
+
+	spin_lock(&node_affinity.lock);
+	list_for_each_safe(pos, q, &node_affinity.list) {
+		entry = list_entry(pos, struct hfi1_affinity_node,
+				   list);
+		list_del(pos);
+		kfree(entry);
+	}
+	spin_unlock(&node_affinity.lock);
+}
+
+static struct hfi1_affinity_node *node_affinity_allocate(int node)
+{
+	struct hfi1_affinity_node *entry;
+
+	entry = kzalloc(sizeof(*entry), GFP_KERNEL);
+	if (!entry)
+		return NULL;
+	entry->node = node;
+	INIT_LIST_HEAD(&entry->list);
+
+	return entry;
+}
+
+/*
+ * It appends an entry to the list.
+ * It *must* be called with node_affinity.lock held.
+ */
+static void node_affinity_add_tail(struct hfi1_affinity_node *entry)
+{
+	list_add_tail(&entry->list, &node_affinity.list);
+}
+
+/* It must be called with node_affinity.lock held */
+static struct hfi1_affinity_node *node_affinity_lookup(int node)
+{
+	struct list_head *pos;
+	struct hfi1_affinity_node *entry;
+
+	list_for_each(pos, &node_affinity.list) {
+		entry = list_entry(pos, struct hfi1_affinity_node, list);
+		if (entry->node == node)
+			return entry;
+	}
+
+	return NULL;
 }

 /*
@@ -121,10 +181,10 @@ int init_real_cpu_mask(struct hfi1_devdata *dd)
 * to the node relative 1 as necessary.
 *
 */
-void hfi1_dev_affinity_init(struct hfi1_devdata *dd)
+int hfi1_dev_affinity_init(struct hfi1_devdata *dd)
 {
 	int node = pcibus_to_node(dd->pcidev->bus);
-	struct hfi1_affinity *info = dd->affinity;
+	struct hfi1_affinity_node *entry;
 	const struct cpumask *local_mask;
 	int curr_cpu, possible, i;

@@ -132,55 +192,75 @@ void hfi1_dev_affinity_init(struct hfi1_devdata *dd)
 		node = numa_node_id();
 	dd->node = node;

-	spin_lock_init(&info->lock);
-
-	init_cpu_mask_set(&info->def_intr);
-	init_cpu_mask_set(&info->rcv_intr);
-	init_cpu_mask_set(&info->proc);
-
 	local_mask = cpumask_of_node(dd->node);
 	if (cpumask_first(local_mask) >= nr_cpu_ids)
 		local_mask = topology_core_cpumask(0);
-	/* Use the "real" cpu mask of this node as the default */
-	cpumask_and(&info->def_intr.mask, &info->real_cpu_mask, local_mask);
-
-	/*  fill in the receive list */
-	possible = cpumask_weight(&info->def_intr.mask);
-	curr_cpu = cpumask_first(&info->def_intr.mask);
-	if (possible == 1) {
-		/*  only one CPU, everyone will use it */
-		cpumask_set_cpu(curr_cpu, &info->rcv_intr.mask);
-	} else {
-		/*
-		 * Retain the first CPU in the default list for the control
-		 * context.
-		 */
-		curr_cpu = cpumask_next(curr_cpu, &info->def_intr.mask);
-		/*
-		 * Remove the remaining kernel receive queues from
-		 * the default list and add them to the receive list.
-		 */
-		for (i = 0; i < dd->n_krcv_queues - 1; i++) {
-			cpumask_clear_cpu(curr_cpu, &info->def_intr.mask);
-			cpumask_set_cpu(curr_cpu, &info->rcv_intr.mask);
-			curr_cpu = cpumask_next(curr_cpu, &info->def_intr.mask);
-			if (curr_cpu >= nr_cpu_ids)
-				break;
+
+	spin_lock(&node_affinity.lock);
+	entry = node_affinity_lookup(dd->node);
+	spin_unlock(&node_affinity.lock);
+
+	/*
+	 * If this is the first time this NUMA node's affinity is used,
+	 * create an entry in the global affinity structure and initialize it.
+	 */
+	if (!entry) {
+		entry = node_affinity_allocate(node);
+		if (!entry) {
+			dd_dev_err(dd,
+				   "Unable to allocate global affinity node\n");
+			return -ENOMEM;
 		}
-	}
+		init_cpu_mask_set(&entry->def_intr);
+		init_cpu_mask_set(&entry->rcv_intr);
+		/* Use the "real" cpu mask of this node as the default */
+		cpumask_and(&entry->def_intr.mask, &node_affinity.real_cpu_mask,
+			    local_mask);
+
+		/* fill in the receive list */
+		possible = cpumask_weight(&entry->def_intr.mask);
+		curr_cpu = cpumask_first(&entry->def_intr.mask);
+
+		if (possible == 1) {
+			/* only one CPU, everyone will use it */
+			cpumask_set_cpu(curr_cpu, &entry->rcv_intr.mask);
+		} else {
+			/*
+			 * Retain the first CPU in the default list for the
+			 * control context.
+			 */
+			curr_cpu = cpumask_next(curr_cpu,
+						&entry->def_intr.mask);

-	cpumask_copy(&info->proc.mask, cpu_online_mask);
-}
+			/*
+			 * Remove the remaining kernel receive queues from
+			 * the default list and add them to the receive list.
+			 */
+			for (i = 0; i < dd->n_krcv_queues - 1; i++) {
+				cpumask_clear_cpu(curr_cpu,
+						  &entry->def_intr.mask);
+				cpumask_set_cpu(curr_cpu,
+						&entry->rcv_intr.mask);
+				curr_cpu = cpumask_next(curr_cpu,
+							&entry->def_intr.mask);
+				if (curr_cpu >= nr_cpu_ids)
+					break;
+			}
+		}

-void hfi1_dev_affinity_free(struct hfi1_devdata *dd)
-{
-	kfree(dd->affinity);
+		spin_lock(&node_affinity.lock);
+		node_affinity_add_tail(entry);
+		spin_unlock(&node_affinity.lock);
+	}
+
+	return 0;
 }

 int hfi1_get_irq_affinity(struct hfi1_devdata *dd, struct hfi1_msix_entry *msix)
 {
 	int ret;
 	cpumask_var_t diff;
+	struct hfi1_affinity_node *entry;
 	struct cpu_mask_set *set;
 	struct sdma_engine *sde = NULL;
 	struct hfi1_ctxtdata *rcd = NULL;
@@ -194,21 +274,25 @@ int hfi1_get_irq_affinity(struct hfi1_devdata *dd, struct hfi1_msix_entry *msix)
 	if (!ret)
 		return -ENOMEM;

+	spin_lock(&node_affinity.lock);
+	entry = node_affinity_lookup(dd->node);
+	spin_unlock(&node_affinity.lock);
+
 	switch (msix->type) {
 	case IRQ_SDMA:
 		sde = (struct sdma_engine *)msix->arg;
 		scnprintf(extra, 64, "engine %u", sde->this_idx);
 		/* fall through */
 	case IRQ_GENERAL:
-		set = &dd->affinity->def_intr;
+		set = &entry->def_intr;
 		break;
 	case IRQ_RCVCTXT:
 		rcd = (struct hfi1_ctxtdata *)msix->arg;
 		if (rcd->ctxt == HFI1_CTRL_CTXT) {
-			set = &dd->affinity->def_intr;
+			set = &entry->def_intr;
 			cpu = cpumask_first(&set->mask);
 		} else {
-			set = &dd->affinity->rcv_intr;
+			set = &entry->rcv_intr;
 		}
 		scnprintf(extra, 64, "ctxt %u", rcd->ctxt);
 		break;
@@ -222,8 +306,8 @@ int hfi1_get_irq_affinity(struct hfi1_devdata *dd, struct hfi1_msix_entry *msix)
 	 * is set above.  Skip accounting for it.  Everything else finds its
 	 * CPU here.
 	 */
-	if (cpu == -1) {
-		spin_lock(&dd->affinity->lock);
+	if (cpu == -1 && set) {
+		spin_lock(&node_affinity.lock);
 		if (cpumask_equal(&set->mask, &set->used)) {
 			/*
 			 * We've used up all the CPUs, bump up the generation
@@ -235,7 +319,7 @@ int hfi1_get_irq_affinity(struct hfi1_devdata *dd, struct hfi1_msix_entry *msix)
 		cpumask_andnot(diff, &set->mask, &set->used);
 		cpu = cpumask_first(diff);
 		cpumask_set_cpu(cpu, &set->used);
-		spin_unlock(&dd->affinity->lock);
+		spin_unlock(&node_affinity.lock);
 	}

 	switch (msix->type) {
@@ -263,30 +347,35 @@ void hfi1_put_irq_affinity(struct hfi1_devdata *dd,
 {
 	struct cpu_mask_set *set = NULL;
 	struct hfi1_ctxtdata *rcd;
+	struct hfi1_affinity_node *entry;
+
+	spin_lock(&node_affinity.lock);
+	entry = node_affinity_lookup(dd->node);
+	spin_unlock(&node_affinity.lock);

 	switch (msix->type) {
 	case IRQ_SDMA:
 	case IRQ_GENERAL:
-		set = &dd->affinity->def_intr;
+		set = &entry->def_intr;
 		break;
 	case IRQ_RCVCTXT:
 		rcd = (struct hfi1_ctxtdata *)msix->arg;
 		/* only do accounting for non control contexts */
 		if (rcd->ctxt != HFI1_CTRL_CTXT)
-			set = &dd->affinity->rcv_intr;
+			set = &entry->rcv_intr;
 		break;
 	default:
 		return;
 	}

 	if (set) {
-		spin_lock(&dd->affinity->lock);
+		spin_lock(&node_affinity.lock);
 		cpumask_andnot(&set->used, &set->used, &msix->mask);
 		if (cpumask_empty(&set->used) && set->gen) {
 			set->gen--;
 			cpumask_copy(&set->used, &set->mask);
 		}
-		spin_unlock(&dd->affinity->lock);
+		spin_unlock(&node_affinity.lock);
 	}

 	irq_set_affinity_hint(msix->msix.vector, NULL);
@@ -297,9 +386,10 @@ int hfi1_get_proc_affinity(struct hfi1_devdata *dd, int node)
 {
 	int cpu = -1, ret;
 	cpumask_var_t diff, mask, intrs;
+	struct hfi1_affinity_node *entry;
 	const struct cpumask *node_mask,
 		*proc_mask = tsk_cpus_allowed(current);
-	struct cpu_mask_set *set = &dd->affinity->proc;
+	struct cpu_mask_set *set = &node_affinity.proc;

 	/*
 	 * check whether process/context affinity has already
@@ -338,7 +428,7 @@ int hfi1_get_proc_affinity(struct hfi1_devdata *dd, int node)
 	if (!ret)
 		goto free_mask;

-	spin_lock(&dd->affinity->lock);
+	spin_lock(&node_affinity.lock);
 	/*
 	 * If we've used all available CPUs, clear the mask and start
 	 * overloading.
@@ -348,13 +438,14 @@ int hfi1_get_proc_affinity(struct hfi1_devdata *dd, int node)
 		cpumask_clear(&set->used);
 	}

+	entry = node_affinity_lookup(dd->node);
 	/* CPUs used by interrupt handlers */
-	cpumask_copy(intrs, (dd->affinity->def_intr.gen ?
-			     &dd->affinity->def_intr.mask :
-			     &dd->affinity->def_intr.used));
-	cpumask_or(intrs, intrs, (dd->affinity->rcv_intr.gen ?
-				  &dd->affinity->rcv_intr.mask :
-				  &dd->affinity->rcv_intr.used));
+	cpumask_copy(intrs, (entry->def_intr.gen ?
+			     &entry->def_intr.mask :
+			     &entry->def_intr.used));
+	cpumask_or(intrs, intrs, (entry->rcv_intr.gen ?
+				  &entry->rcv_intr.mask :
+				  &entry->rcv_intr.used));
 	hfi1_cdbg(PROC, "CPUs used by interrupts: %*pbl",
 		  cpumask_pr_args(intrs));

@@ -400,7 +491,7 @@ int hfi1_get_proc_affinity(struct hfi1_devdata *dd, int node)
 		cpu = -1;
 	else
 		cpumask_set_cpu(cpu, &set->used);
-	spin_unlock(&dd->affinity->lock);
+	spin_unlock(&node_affinity.lock);

 	free_cpumask_var(intrs);
 free_mask:
@@ -413,16 +504,16 @@ int hfi1_get_proc_affinity(struct hfi1_devdata *dd, int node)

 void hfi1_put_proc_affinity(struct hfi1_devdata *dd, int cpu)
 {
-	struct cpu_mask_set *set = &dd->affinity->proc;
+	struct cpu_mask_set *set = &node_affinity.proc;

 	if (cpu < 0)
 		return;
-	spin_lock(&dd->affinity->lock);
+	spin_lock(&node_affinity.lock);
 	cpumask_clear_cpu(cpu, &set->used);
 	if (cpumask_empty(&set->used) && set->gen) {
 		set->gen--;
 		cpumask_copy(&set->used, &set->mask);
 	}
-	spin_unlock(&dd->affinity->lock);
+	spin_unlock(&node_affinity.lock);
 }

--- a/drivers/infiniband/hw/hfi1/affinity.h
+++ b/drivers/infiniband/hw/hfi1/affinity.h
@@ -82,11 +82,9 @@ struct hfi1_affinity {
 struct hfi1_msix_entry;

 /* Initialize non-HT cpu cores mask */
-int init_real_cpu_mask(struct hfi1_devdata *);
+void init_real_cpu_mask(void);
 /* Initialize driver affinity data */
-void hfi1_dev_affinity_init(struct hfi1_devdata *);
-/* Free driver affinity data */
-void hfi1_dev_affinity_free(struct hfi1_devdata *);
+int hfi1_dev_affinity_init(struct hfi1_devdata *);
 /*
 * Set IRQ affinity to a CPU. The function will determine the
 * CPU and set the affinity to it.
@@ -105,4 +103,23 @@ int hfi1_get_proc_affinity(struct hfi1_devdata *, int);
 /* Release a CPU used by a user process. */
 void hfi1_put_proc_affinity(struct hfi1_devdata *, int);

+struct hfi1_affinity_node {
+	int node;
+	struct cpu_mask_set def_intr;
+	struct cpu_mask_set rcv_intr;
+	struct list_head list;
+};
+
+struct hfi1_affinity_node_list {
+	struct list_head list;
+	struct cpumask real_cpu_mask;
+	struct cpu_mask_set proc;
+	/* protect affinity node list */
+	spinlock_t lock;
+};
+
+void node_affinity_init(void);
+void node_affinity_destroy(void);
+extern struct hfi1_affinity_node_list node_affinity;
+
 #endif /* _HFI1_AFFINITY_H */
--- a/drivers/infiniband/hw/hfi1/chip.c
+++ b/drivers/infiniband/hw/hfi1/chip.c
@@ -63,6 +63,7 @@
 #include "efivar.h"
 #include "platform.h"
 #include "aspm.h"
+#include "affinity.h"

 #define NUM_IB_PORTS 1

@@ -12838,7 +12839,7 @@ static int set_up_context_variables(struct hfi1_devdata *dd)
 	 */
 	if (num_user_contexts < 0)
 		num_user_contexts =
-			cpumask_weight(&dd->affinity->real_cpu_mask);
+			cpumask_weight(&node_affinity.real_cpu_mask);

 	total_contexts = num_kernel_contexts + num_user_contexts;

@@ -14468,19 +14469,6 @@ struct hfi1_devdata *hfi1_init_dd(struct pci_dev *pdev,
 		 (dd->revision >> CCE_REVISION_SW_SHIFT)
 		    & CCE_REVISION_SW_MASK);

-	/*
-	 * The real cpu mask is part of the affinity struct but has to be
-	 * initialized earlier than the rest of the affinity struct because it
-	 * is needed to calculate the number of user contexts in
-	 * set_up_context_variables(). However, hfi1_dev_affinity_init(),
-	 * which initializes the rest of the affinity struct members,
-	 * depends on set_up_context_variables() for the number of kernel
-	 * contexts, so it cannot be called before set_up_context_variables().
-	 */
-	ret = init_real_cpu_mask(dd);
-	if (ret)
-		goto bail_cleanup;
-
 	ret = set_up_context_variables(dd);
 	if (ret)
 		goto bail_cleanup;
@@ -14494,7 +14482,9 @@ struct hfi1_devdata *hfi1_init_dd(struct pci_dev *pdev,
 	/* set up KDETH QP prefix in both RX and TX CSRs */
 	init_kdeth_qp(dd);

-	hfi1_dev_affinity_init(dd);
+	ret = hfi1_dev_affinity_init(dd);
+	if (ret)
+		goto bail_cleanup;

 	/* send contexts must be set up before receive contexts */
 	ret = init_send_contexts(dd);

--- a/drivers/infiniband/hw/hfi1/init.c
+++ b/drivers/infiniband/hw/hfi1/init.c
@@ -64,6 +64,7 @@
 #include "debugfs.h"
 #include "verbs.h"
 #include "aspm.h"
+#include "affinity.h"

 #undef pr_fmt
 #define pr_fmt(fmt) DRIVER_NAME ": " fmt
@@ -1004,7 +1005,6 @@ static void __hfi1_free_devdata(struct kobject *kobj)
 	rcu_barrier(); /* wait for rcu callbacks to complete */
 	free_percpu(dd->int_counter);
 	free_percpu(dd->rcv_limit);
-	hfi1_dev_affinity_free(dd);
 	free_percpu(dd->send_schedule);
 	rvt_dealloc_device(&dd->verbs_dev.rdi);
 }
@@ -1198,6 +1198,8 @@ static int __init hfi1_mod_init(void)
 	if (ret)
 		goto bail;

+	node_affinity_init();
+
 	/* validate max MTU before any devices start */
 	if (!valid_opa_max_mtu(hfi1_max_mtu)) {
 		pr_err("Invalid max_mtu 0x%x, using 0x%x instead\n",
@@ -1278,6 +1280,7 @@ module_init(hfi1_mod_init);
 static void __exit hfi1_mod_cleanup(void)
 {
 	pci_unregister_driver(&hfi1_pci_driver);
+	node_affinity_destroy();
 	hfi1_wss_exit();
 	hfi1_dbg_exit();
 	hfi1_cpulist_count = 0;