net: add additional lock to qdisc to increase throughput

When many cpus compete for sending frames on a given qdisc, the qdisc spinlock suffers from very high contention. The cpu owning __QDISC_STATE_RUNNING bit has same priority to acquire the lock, and cannot dequeue packets fast enough, since it must wait for this lock for each dequeued packet. One solution to this problem is to force all cpus spinning on a second lock before trying to get the main lock, when/if they see __QDISC_STATE_RUNNING already set. The owning cpu then compete with at most one other cpu for the main lock, allowing for higher dequeueing rate. Based on a previous patch from Alexander Duyck. I added the heuristic to avoid the atomic in fast path, and put the new lock far away from the cache line used by the dequeue worker. Also try to release the busylock lock as late as possible. Tests with following script gave a boost from ~50.000 pps to ~600.000 pps on a dual quad core machine (E5450 @3.00GHz), tg3 driver. (A single netperf flow can reach ~800.000 pps on this platform) for j in `seq 0 3`; do for i in `seq 0 7`; do netperf -H 192.168.0.1 -t UDP_STREAM -l 60 -N -T $i -- -m 6 & done done Signed-off-by: N Eric Dumazet <eric.dumazet@gmail.com> Acked-by: N Alexander Duyck <alexander.h.duyck@intel.com> Signed-off-by: N David S. Miller <davem@davemloft.net>

net: add additional lock to qdisc to increase throughput
When many cpus compete for sending frames on a given qdisc, the qdisc spinlock suffers from very high contention. The cpu owning __QDISC_STATE_RUNNING bit has same priority to acquire the lock, and cannot dequeue packets fast enough, since it must wait for this lock for each dequeued packet. One solution to this problem is to force all cpus spinning on a second lock before trying to get the main lock, when/if they see __QDISC_STATE_RUNNING already set. The owning cpu then compete with at most one other cpu for the main lock, allowing for higher dequeueing rate. Based on a previous patch from Alexander Duyck. I added the heuristic to avoid the atomic in fast path, and put the new lock far away from the cache line used by the dequeue worker. Also try to release the busylock lock as late as possible. Tests with following script gave a boost from ~50.000 pps to ~600.000 pps on a dual quad core machine (E5450 @3.00GHz), tg3 driver. (A single netperf flow can reach ~800.000 pps on this platform) for j in `seq 0 3`; do for i in `seq 0 7`; do netperf -H 192.168.0.1 -t UDP_STREAM -l 60 -N -T $i -- -m 6 & done done Signed-off-by: N Eric Dumazet <eric.dumazet@gmail.com> Acked-by: N Alexander Duyck <alexander.h.duyck@intel.com> Signed-off-by: N David S. Miller <davem@davemloft.net>
79640a4c · Eric Dumazet · David S. Miller · 097811bb · 79640a4c · 79640a4c
显示空白变更内容
内联并排

Showing with 28 addition and 5 deletion

include/net/sch_generic.h include/net/sch_generic.h +2 -1

net/core/dev.c net/core/dev.c +25 -4

net/sched/sch_generic.c net/sched/sch_generic.c +1 -0

未找到文件。
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -81,6 +81,7 @@ struct Qdisc {
 	unsigned long		__state;
 	struct gnet_stats_queue	qstats;
 	struct rcu_head		rcu_head;
+	spinlock_t		busylock;
 };
 static inline bool qdisc_is_running(struct Qdisc *qdisc)

--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2040,8 +2040,18 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
 				 struct netdev_queue *txq)
 {
 	spinlock_t *root_lock = qdisc_lock(q);
+	bool contended = qdisc_is_running(q);
 	int rc;
+	/*
+	 * Heuristic to force contended enqueues to serialize on a
+	 * separate lock before trying to get qdisc main lock.
+	 * This permits __QDISC_STATE_RUNNING owner to get the lock more often
+	 * and dequeue packets faster.
+	 */
+	if (unlikely(contended))
+		spin_lock(&q->busylock);
 	spin_lock(root_lock);
 	if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
 		kfree_skb(skb);
@@ -2056,19 +2066,30 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
 		if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
 			skb_dst_force(skb);
 		__qdisc_update_bstats(q, skb->len);
-		if (sch_direct_xmit(skb, q, dev, txq, root_lock))
+		if (sch_direct_xmit(skb, q, dev, txq, root_lock)) {
+			if (unlikely(contended)) {
+				spin_unlock(&q->busylock);
+				contended = false;
+			}
 			__qdisc_run(q);
-		else
+		} else
 			qdisc_run_end(q);
 		rc = NET_XMIT_SUCCESS;
 	} else {
 		skb_dst_force(skb);
 		rc = qdisc_enqueue_root(skb, q);
-		qdisc_run(q);
+		if (qdisc_run_begin(q)) {
+			if (unlikely(contended)) {
+				spin_unlock(&q->busylock);
+				contended = false;
+			}
+			__qdisc_run(q);
+		}
 	}
 	spin_unlock(root_lock);
+	if (unlikely(contended))
+		spin_unlock(&q->busylock);
 	return rc;
 }

--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -561,6 +561,7 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue,
 	INIT_LIST_HEAD(&sch->list);
 	skb_queue_head_init(&sch->q);
+	spin_lock_init(&sch->busylock);
 	sch->ops = ops;
 	sch->enqueue = ops->enqueue;
 	sch->dequeue = ops->dequeue;