Commit 34d101dd authored by Eric Dumazet, committed by David S. Miller

neigh: speedup neigh_hh_init()

When a new dst is used to send a frame, neigh_resolve_output() tries to
associate a struct hh_cache with this dst, calling neigh_hh_init() with
the neigh rwlock write-locked.

Most of the time, hh_cache is already known and linked into neighbour,
so we find it and increment its refcount.

This patch changes the logic so that we call neigh_hh_init() with the
neighbour lock read-locked only, so that the fast path can run in
parallel on concurrent cpus.
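As an aside, here is a minimal user-space sketch of the fast-path pattern this
relies on: take a reference on an existing cache entry under the read lock,
then publish it into the dst with a compare-and-swap, dropping the reference
again if another CPU won the race. This is only an illustration; C11 atomics
stand in for the kernel's atomic_t and cmpxchg(), and the type names
(cache_entry, fake_dst) are made up for the example, not the kernel types.

    /* Sketch: refcounted entry published into a per-dst slot exactly once. */
    #include <stdatomic.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct cache_entry {                  /* plays the role of struct hh_cache */
        atomic_int refcnt;
        int        type;
    };

    struct fake_dst {                     /* plays the role of struct dst_entry */
        _Atomic(struct cache_entry *) hh;
    };

    static void entry_put(struct cache_entry *e)
    {
        if (atomic_fetch_sub(&e->refcnt, 1) == 1)
            free(e);                      /* last reference gone */
    }

    /* Fast path: the entry already exists; grab a reference and try to
     * publish it into dst->hh.  If another thread already installed one,
     * drop the reference we just took.  Safe to run concurrently.
     */
    static void publish(struct fake_dst *dst, struct cache_entry *e)
    {
        struct cache_entry *expected = NULL;

        atomic_fetch_add(&e->refcnt, 1);
        if (!atomic_compare_exchange_strong(&dst->hh, &expected, e))
            entry_put(e);                 /* lost the race to another CPU */
    }

    int main(void)
    {
        struct cache_entry *e = calloc(1, sizeof(*e));
        struct fake_dst dst = { .hh = NULL };

        if (!e)
            return 1;
        atomic_init(&e->refcnt, 1);       /* reference held by the "neighbour" */
        e->type = 0x0800;

        publish(&dst, e);                 /* first publish wins */
        publish(&dst, e);                 /* second one only drops its extra ref */

        printf("refcnt = %d\n", atomic_load(&e->refcnt)); /* 2: neighbour + dst */

        entry_put(atomic_load(&dst.hh));  /* dst drops its reference */
        entry_put(e);                     /* neighbour drops its reference; freed */
        return 0;
    }

Because the pointer is installed at most once, the common case never needs the
write lock, which is exactly what lets the fast path run under the read lock.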

This brings part of the speedup we got with commit c7d4426a
(introduce DST_NOCACHE flag) for non-cached dsts to cached ones as well,
removing one of the contention points that routers hit on
multiqueue-enabled machines.

Further improvements would need to use a seqlock instead of an rwlock to
protect neigh->ha[], so as not to dirty neigh too often and to remove two
atomic ops.
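As a rough illustration of that suggestion, the sketch below shows the classic
seqlock read pattern in user-space C11: the writer brackets the update of the
hardware address with two counter increments, and lockless readers retry
whenever they observe an odd counter or a counter that changed under them. In
the kernel this would be a seqlock_t used via read_seqbegin()/read_seqretry();
the names here (fake_neigh, neigh_update_ha, neigh_read_ha) are invented for
the example, and the memory ordering is simplified compared to a real seqlock.

    /* Sketch: seqcount-protected copy of a small address, no reader lock. */
    #include <stdatomic.h>
    #include <string.h>

    #define ETH_ALEN 6

    struct fake_neigh {
        atomic_uint   seq;            /* even: stable, odd: write in progress */
        unsigned char ha[ETH_ALEN];   /* address protected by seq */
    };

    /* Writer side: bracket the update with two increments of seq. */
    static void neigh_update_ha(struct fake_neigh *n, const unsigned char *new_ha)
    {
        atomic_fetch_add_explicit(&n->seq, 1, memory_order_release); /* -> odd */
        memcpy(n->ha, new_ha, ETH_ALEN);
        atomic_fetch_add_explicit(&n->seq, 1, memory_order_release); /* -> even */
    }

    /* Reader side: lock-free copy, retried if a writer interfered. */
    static void neigh_read_ha(struct fake_neigh *n, unsigned char *out)
    {
        unsigned int start;

        for (;;) {
            start = atomic_load_explicit(&n->seq, memory_order_acquire);
            if (start & 1)
                continue;             /* writer in progress, try again */
            memcpy(out, n->ha, ETH_ALEN);
            if (atomic_load_explicit(&n->seq, memory_order_acquire) == start)
                return;               /* no writer interfered, copy is good */
        }
    }

    int main(void)
    {
        struct fake_neigh n = { 0 };
        unsigned char mac[ETH_ALEN] = { 0x00, 0x11, 0x22, 0x33, 0x44, 0x55 };
        unsigned char out[ETH_ALEN];

        neigh_update_ha(&n, mac);
        neigh_read_ha(&n, out);
        return memcmp(out, mac, ETH_ALEN) != 0;   /* 0 on success */
    }

The reader never writes shared state, which is what would remove the two atomic
ops mentioned above (presumably the read_lock/read_unlock pair) and avoid
dirtying the neighbour on every transmitted frame.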
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Parent 37f9fc45
@@ -281,6 +281,12 @@ struct hh_cache {
     unsigned long hh_data[HH_DATA_ALIGN(LL_MAX_HEADER) / sizeof(long)];
 };
 
+static inline void hh_cache_put(struct hh_cache *hh)
+{
+    if (atomic_dec_and_test(&hh->hh_refcnt))
+        kfree(hh);
+}
+
 /* Reserve HH_DATA_MOD byte aligned hard_header_len, but at least that much.
  * Alternative is:
  *   dev->hard_header_len ? (dev->hard_header_len +
...
@@ -228,8 +228,8 @@ struct dst_entry *dst_destroy(struct dst_entry * dst)
     child = dst->child;
     dst->hh = NULL;
-    if (hh && atomic_dec_and_test(&hh->hh_refcnt))
-        kfree(hh);
+    if (hh)
+        hh_cache_put(hh);
 
     if (neigh) {
         dst->neighbour = NULL;
...
@@ -709,8 +709,7 @@ void neigh_destroy(struct neighbour *neigh)
         write_seqlock_bh(&hh->hh_lock);
         hh->hh_output = neigh_blackhole;
         write_sequnlock_bh(&hh->hh_lock);
-        if (atomic_dec_and_test(&hh->hh_refcnt))
-            kfree(hh);
+        hh_cache_put(hh);
     }
 
     skb_queue_purge(&neigh->arp_queue);
@@ -1210,39 +1209,67 @@ struct neighbour *neigh_event_ns(struct neigh_table *tbl,
 }
 EXPORT_SYMBOL(neigh_event_ns);
 
+static inline bool neigh_hh_lookup(struct neighbour *n, struct dst_entry *dst,
+                                   __be16 protocol)
+{
+    struct hh_cache *hh;
+
+    for (hh = n->hh; hh; hh = hh->hh_next) {
+        if (hh->hh_type == protocol) {
+            atomic_inc(&hh->hh_refcnt);
+            if (unlikely(cmpxchg(&dst->hh, NULL, hh) != NULL))
+                hh_cache_put(hh);
+            return true;
+        }
+    }
+    return false;
+}
+
+/* called with read_lock_bh(&n->lock); */
 static void neigh_hh_init(struct neighbour *n, struct dst_entry *dst,
                           __be16 protocol)
 {
     struct hh_cache *hh;
     struct net_device *dev = dst->dev;
 
-    for (hh = n->hh; hh; hh = hh->hh_next)
-        if (hh->hh_type == protocol)
-            break;
+    if (likely(neigh_hh_lookup(n, dst, protocol)))
+        return;
 
-    if (!hh && (hh = kzalloc(sizeof(*hh), GFP_ATOMIC)) != NULL) {
-        seqlock_init(&hh->hh_lock);
-        hh->hh_type = protocol;
-        atomic_set(&hh->hh_refcnt, 0);
-        hh->hh_next = NULL;
+    /* slow path */
+    hh = kzalloc(sizeof(*hh), GFP_ATOMIC);
+    if (!hh)
+        return;
 
-        if (dev->header_ops->cache(n, hh)) {
-            kfree(hh);
-            hh = NULL;
-        } else {
-            atomic_inc(&hh->hh_refcnt);
-            hh->hh_next = n->hh;
-            n->hh = hh;
-            if (n->nud_state & NUD_CONNECTED)
-                hh->hh_output = n->ops->hh_output;
-            else
-                hh->hh_output = n->ops->output;
-        }
+    seqlock_init(&hh->hh_lock);
+    hh->hh_type = protocol;
+    atomic_set(&hh->hh_refcnt, 2);
+
+    if (dev->header_ops->cache(n, hh)) {
+        kfree(hh);
+        return;
     }
-    if (hh) {
-        atomic_inc(&hh->hh_refcnt);
-        dst->hh = hh;
+
+    read_unlock(&n->lock);
+    write_lock(&n->lock);
+
+    /* must check if another thread already did the insert */
+    if (neigh_hh_lookup(n, dst, protocol)) {
+        kfree(hh);
+        goto end;
     }
+
+    if (n->nud_state & NUD_CONNECTED)
+        hh->hh_output = n->ops->hh_output;
+    else
+        hh->hh_output = n->ops->output;
+
+    hh->hh_next = n->hh;
+    n->hh = hh;
+
+    if (unlikely(cmpxchg(&dst->hh, NULL, hh) != NULL))
+        hh_cache_put(hh);
+end:
+    write_unlock(&n->lock);
+    read_lock(&n->lock);
 }
 
 /* This function can be used in contexts, where only old dev_queue_xmit
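The slow path above has to give up the read lock and re-acquire the neighbour
lock as a writer, and therefore must repeat the lookup before inserting:
another CPU may have created the same entry during the window where no lock
was held. A hedged user-space sketch of that release / re-acquire / re-check
pattern, with pthread rwlocks standing in for the kernel's rwlock_t (the names
cached and init_entry() are placeholders for the example):

    #include <pthread.h>
    #include <stdbool.h>
    #include <stdio.h>

    static pthread_rwlock_t lock = PTHREAD_RWLOCK_INITIALIZER;
    static bool cached;               /* stands in for the per-protocol hh list */

    /* Called with the read lock held; returns with the read lock held,
     * mirroring the contract of neigh_hh_init() in the hunk above.
     */
    static void init_entry(void)
    {
        if (cached)                   /* fast path under the read lock */
            return;

        pthread_rwlock_unlock(&lock); /* rwlocks cannot be upgraded in place, */
        pthread_rwlock_wrlock(&lock); /* so release and re-acquire as a writer */

        /* Re-check: another thread may have inserted the entry while the
         * lock was dropped between the unlock and the wrlock above.
         */
        if (!cached)
            cached = true;

        pthread_rwlock_unlock(&lock); /* go back to the caller's read lock */
        pthread_rwlock_rdlock(&lock);
    }

    int main(void)
    {
        pthread_rwlock_rdlock(&lock);
        init_entry();
        pthread_rwlock_unlock(&lock);
        printf("cached = %d\n", cached);
        return 0;
    }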
@@ -1281,21 +1308,17 @@ int neigh_resolve_output(struct sk_buff *skb)
         if (!neigh_event_send(neigh, skb)) {
             int err;
             struct net_device *dev = neigh->dev;
+
+            read_lock_bh(&neigh->lock);
             if (dev->header_ops->cache &&
                 !dst->hh &&
-                !(dst->flags & DST_NOCACHE)) {
-                write_lock_bh(&neigh->lock);
-                if (!dst->hh)
-                    neigh_hh_init(neigh, dst, dst->ops->protocol);
-                err = dev_hard_header(skb, dev, ntohs(skb->protocol),
-                                      neigh->ha, NULL, skb->len);
-                write_unlock_bh(&neigh->lock);
-            } else {
-                read_lock_bh(&neigh->lock);
-                err = dev_hard_header(skb, dev, ntohs(skb->protocol),
-                                      neigh->ha, NULL, skb->len);
-                read_unlock_bh(&neigh->lock);
-            }
+                !(dst->flags & DST_NOCACHE))
+                neigh_hh_init(neigh, dst, dst->ops->protocol);
+
+            err = dev_hard_header(skb, dev, ntohs(skb->protocol),
+                                  neigh->ha, NULL, skb->len);
+            read_unlock_bh(&neigh->lock);
+
             if (err >= 0)
                 rc = neigh->ops->queue_xmit(skb);
             else
...