cpumask_set_cpu_local_first => cpumask_local_spread, lament

da91309e (cpumask: Utility function to set n'th cpu...) created a genuinely weird function. I never saw it before, it went through DaveM. (He only does this to make us other maintainers feel better about our own mistakes.) cpumask_set_cpu_local_first's purpose is to say "I need to spread things across N online cpus, choose the ones on this numa node first"; you call it in a loop. It can fail. One of the two callers ignores this, the other aborts and fails the device open. It can fail in two ways: allocating the off-stack cpumask, or through a convoluted codepath which AFAICT can only occur if cpu_online_mask changes. Which shouldn't happen, because if cpu_online_mask can change while you call this, it could return a now-offline cpu anyway. It contains a nonsensical test "!cpumask_of_node(numa_node)". This was drawn to my attention by Geert, who said this causes a warning on Sparc. It sets a single bit in a cpumask instead of returning a cpu number, because that's what the callers want. It could be made more efficient by passing the previous cpu rather than an index, but that would be more invasive to the callers. Fixes: da91309e Signed-off-by: Rusty Russell <rusty@rustcorp.com.au> (then rebased) Tested-by: Amir Vadai <amirv@mellanox.com> Acked-by: Amir Vadai <amirv@mellanox.com> Acked-by: David S. Miller <davem@davemloft.net>

cpumask_set_cpu_local_first => cpumask_local_spread, lament
da91309e (cpumask: Utility function to set n'th cpu...) created a genuinely weird function. I never saw it before, it went through DaveM. (He only does this to make us other maintainers feel better about our own mistakes.) cpumask_set_cpu_local_first's purpose is to say "I need to spread things across N online cpus, choose the ones on this numa node first"; you call it in a loop. It can fail. One of the two callers ignores this, the other aborts and fails the device open. It can fail in two ways: allocating the off-stack cpumask, or through a convoluted codepath which AFAICT can only occur if cpu_online_mask changes. Which shouldn't happen, because if cpu_online_mask can change while you call this, it could return a now-offline cpu anyway. It contains a nonsensical test "!cpumask_of_node(numa_node)". This was drawn to my attention by Geert, who said this causes a warning on Sparc. It sets a single bit in a cpumask instead of returning a cpu number, because that's what the callers want. It could be made more efficient by passing the previous cpu rather than an index, but that would be more invasive to the callers. Fixes: da91309e Signed-off-by: Rusty Russell <rusty@rustcorp.com.au> (then rebased) Tested-by: Amir Vadai <amirv@mellanox.com> Acked-by: Amir Vadai <amirv@mellanox.com> Acked-by: David S. Miller <davem@davemloft.net>
f36963c9 · Rusty Russell · 37815bf8 · f36963c9 · f36963c9 · f36963c9
5 changed files
--- a/drivers/net/ethernet/emulex/benet/be_main.c
+++ b/drivers/net/ethernet/emulex/benet/be_main.c
@@ -2358,11 +2358,11 @@ static int be_evt_queues_create(struct be_adapter *adapter)
 				    adapter->cfg_num_qs);
 	for_all_evt_queues(adapter, eqo, i) {
+		int numa_node = dev_to_node(&adapter->pdev->dev);
 		if (!zalloc_cpumask_var(&eqo->affinity_mask, GFP_KERNEL))
 			return -ENOMEM;
-		cpumask_set_cpu_local_first(i, dev_to_node(&adapter->pdev->dev),
+		cpumask_set_cpu(cpumask_local_spread(i, numa_node),
-					    eqo->affinity_mask);
+				eqo->affinity_mask);
 		netif_napi_add(adapter->netdev, &eqo->napi, be_poll,
 			       BE_NAPI_WEIGHT);
 		napi_hash_add(&eqo->napi);

--- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
@@ -1501,17 +1501,13 @@ static int mlx4_en_init_affinity_hint(struct mlx4_en_priv *priv, int ring_idx)
 {
 	struct mlx4_en_rx_ring *ring = priv->rx_ring[ring_idx];
 	int numa_node = priv->mdev->dev->numa_node;
-	int ret = 0;
 	if (!zalloc_cpumask_var(&ring->affinity_mask, GFP_KERNEL))
 		return -ENOMEM;
-	ret = cpumask_set_cpu_local_first(ring_idx, numa_node,
+	cpumask_set_cpu(cpumask_local_spread(ring_idx, numa_node),
-					  ring->affinity_mask);
+			ring->affinity_mask);
-	if (ret)
+	return 0;
-		free_cpumask_var(ring->affinity_mask);
-	return ret;
 }
 static void mlx4_en_free_affinity_hint(struct mlx4_en_priv *priv, int ring_idx)

--- a/drivers/net/ethernet/mellanox/mlx4/en_tx.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_tx.c
@@ -144,9 +144,9 @@ int mlx4_en_create_tx_ring(struct mlx4_en_priv *priv,
 	ring->queue_index = queue_index;
 	if (queue_index < priv->num_tx_rings_p_up)
-		cpumask_set_cpu_local_first(queue_index,
+		cpumask_set_cpu(cpumask_local_spread(queue_index,
-					    priv->mdev->dev->numa_node,
+						     priv->mdev->dev->numa_node),
-					    &ring->affinity_mask);
+				&ring->affinity_mask);
 	*pring = ring;
 	return 0;

--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -151,10 +151,8 @@ static inline unsigned int cpumask_any_but(const struct cpumask *mask,
 	return 1;
 }
-static inline int cpumask_set_cpu_local_first(int i, int numa_node, cpumask_t *dstp)
+static inline unsigned int cpumask_local_spread(unsigned int i, int node)
 {
-	set_bit(0, cpumask_bits(dstp));
 	return 0;
 }
@@ -208,7 +206,7 @@ static inline unsigned int cpumask_next_zero(int n, const struct cpumask *srcp)
 int cpumask_next_and(int n, const struct cpumask *, const struct cpumask *);
 int cpumask_any_but(const struct cpumask *mask, unsigned int cpu);
-int cpumask_set_cpu_local_first(int i, int numa_node, cpumask_t *dstp);
+unsigned int cpumask_local_spread(unsigned int i, int node);
 /**
 * for_each_cpu - iterate over every cpu in a mask

--- a/lib/cpumask.c
+++ b/lib/cpumask.c
@@ -139,64 +139,42 @@ void __init free_bootmem_cpumask_var(cpumask_var_t mask)
 #endif
 /**
- * cpumask_set_cpu_local_first - set i'th cpu with local numa cpu's first
+ * cpumask_local_spread - select the i'th cpu with local numa cpu's first
- *
 * @i: index number
- * @numa_node: local numa_node
+ * @node: local numa_node
- * @dstp: cpumask with the relevant cpu bit set according to the policy
 *
- * This function sets the cpumask according to a numa aware policy.
+ * This function selects an online CPU according to a numa aware policy;
- * cpumask could be used as an affinity hint for the IRQ related to a
+ * local cpus are returned first, followed by non-local ones, then it
- * queue. When the policy is to spread queues across cores - local cores
+ * wraps around.
- * first.
 *
- * Returns 0 on success, -ENOMEM for no memory, and -EAGAIN when failed to set
+ * It's not very efficient, but useful for setup.
- * the cpu bit and need to re-call the function.
 */
-int cpumask_set_cpu_local_first(int i, int numa_node, cpumask_t *dstp)
+unsigned int cpumask_local_spread(unsigned int i, int node)
 {
-	cpumask_var_t mask;
 	int cpu;
-	int ret = 0;
-	if (!zalloc_cpumask_var(&mask, GFP_KERNEL))
-		return -ENOMEM;
+	/* Wrap: we always want a cpu. */
 	i %= num_online_cpus();
-	if (numa_node == -1 || !cpumask_of_node(numa_node)) {
+	if (node == -1) {
-		/* Use all online cpu's for non numa aware system */
+		for_each_cpu(cpu, cpu_online_mask)
-		cpumask_copy(mask, cpu_online_mask);
+			if (i-- == 0)
+				return cpu;
 	} else {
-		int n;
+		/* NUMA first. */
+		for_each_cpu_and(cpu, cpumask_of_node(node), cpu_online_mask)
-		cpumask_and(mask,
+			if (i-- == 0)
-			    cpumask_of_node(numa_node), cpu_online_mask);
+				return cpu;
-		n = cpumask_weight(mask);
+		for_each_cpu(cpu, cpu_online_mask) {
-		if (i >= n) {
+			/* Skip NUMA nodes, done above. */
-			i -= n;
+			if (cpumask_test_cpu(cpu, cpumask_of_node(node)))
+				continue;
-			/* If index > number of local cpu's, mask out local
-			 * cpu's
+			if (i-- == 0)
-			 */
+				return cpu;
-			cpumask_andnot(mask, cpu_online_mask, mask);
 		}
 	}
+	BUG();
-	for_each_cpu(cpu, mask) {
-		if (--i < 0)
-			goto out;
-	}
-	ret = -EAGAIN;
-out:
-	free_cpumask_var(mask);
-	if (!ret)
-		cpumask_set_cpu(cpu, dstp);
-	return ret;
 }
-EXPORT_SYMBOL(cpumask_set_cpu_local_first);
+EXPORT_SYMBOL(cpumask_local_spread);