xps: fix xps for stacked devices

A typical qdisc setup is the following : bond0 : bonding device, using HTB hierarchy eth1/eth2 : slaves, multiqueue NIC, using MQ + FQ qdisc XPS allows to spread packets on specific tx queues, based on the cpu doing the send. Problem is that dequeues from bond0 qdisc can happen on random cpus, due to the fact that qdisc_run() can dequeue a batch of packets. CPUA -> queue packet P1 on bond0 qdisc, P1->ooo_okay=1 CPUA -> queue packet P2 on bond0 qdisc, P2->ooo_okay=0 CPUB -> dequeue packet P1 from bond0 enqueue packet on eth1/eth2 CPUC -> dequeue packet P2 from bond0 enqueue packet on eth1/eth2 using sk cache (ooo_okay is 0) get_xps_queue() then might select wrong queue for P1, since current cpu might be different than CPUA. P2 might be sent on the old queue (stored in sk->sk_tx_queue_mapping), if CPUC runs a bit faster (or CPUB spins a bit on qdisc lock) Effect of this bug is TCP reorders, and more generally not optimal TX queue placement. (A victim bulk flow can be migrated to the wrong TX queue for a while) To fix this, we have to record sender cpu number the first time dev_queue_xmit() is called for one tx skb. We can union napi_id (used on receive path) and sender_cpu, granted we clear sender_cpu in skb_scrub_packet() (credit to Willem for this union idea) Signed-off-by: N Eric Dumazet <edumazet@google.com> Cc: Willem de Bruijn <willemb@google.com> Cc: Nandita Dukkipati <nanditad@google.com> Cc: Yuchung Cheng <ycheng@google.com> Signed-off-by: N David S. Miller <davem@davemloft.net>

xps: fix xps for stacked devices
A typical qdisc setup is the following : bond0 : bonding device, using HTB hierarchy eth1/eth2 : slaves, multiqueue NIC, using MQ + FQ qdisc XPS allows to spread packets on specific tx queues, based on the cpu doing the send. Problem is that dequeues from bond0 qdisc can happen on random cpus, due to the fact that qdisc_run() can dequeue a batch of packets. CPUA -> queue packet P1 on bond0 qdisc, P1->ooo_okay=1 CPUA -> queue packet P2 on bond0 qdisc, P2->ooo_okay=0 CPUB -> dequeue packet P1 from bond0 enqueue packet on eth1/eth2 CPUC -> dequeue packet P2 from bond0 enqueue packet on eth1/eth2 using sk cache (ooo_okay is 0) get_xps_queue() then might select wrong queue for P1, since current cpu might be different than CPUA. P2 might be sent on the old queue (stored in sk->sk_tx_queue_mapping), if CPUC runs a bit faster (or CPUB spins a bit on qdisc lock) Effect of this bug is TCP reorders, and more generally not optimal TX queue placement. (A victim bulk flow can be migrated to the wrong TX queue for a while) To fix this, we have to record sender cpu number the first time dev_queue_xmit() is called for one tx skb. We can union napi_id (used on receive path) and sender_cpu, granted we clear sender_cpu in skb_scrub_packet() (credit to Willem for this union idea) Signed-off-by: N Eric Dumazet <edumazet@google.com> Cc: Willem de Bruijn <willemb@google.com> Cc: Nandita Dukkipati <nanditad@google.com> Cc: Yuchung Cheng <ycheng@google.com> Signed-off-by: N David S. Miller <davem@davemloft.net>
2bd82484 · Eric Dumazet · David S. Miller · 7e8acbb6 · 2bd82484 · 2bd82484
隐藏空白更改
内联并排

Showing with 15 addition and 3 deletion

include/linux/skbuff.h include/linux/skbuff.h +5 -2

net/core/flow_dissector.c net/core/flow_dissector.c +6 -1

net/core/skbuff.c net/core/skbuff.c +4 -0

未找到文件。
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -626,8 +626,11 @@ struct sk_buff {
 	__u32			hash;
 	__be16			vlan_proto;
 	__u16			vlan_tci;
-#ifdef CONFIG_NET_RX_BUSY_POLL
-	unsigned int	napi_id;
+#if defined(CONFIG_NET_RX_BUSY_POLL) || defined(CONFIG_XPS)
+	union {
+		unsigned int	napi_id;
+		unsigned int	sender_cpu;
+	};
 #endif
 #ifdef CONFIG_NETWORK_SECMARK
 	__u32			secmark;

--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -422,7 +422,7 @@ static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
 	dev_maps = rcu_dereference(dev->xps_maps);
 	if (dev_maps) {
 		map = rcu_dereference(
-		    dev_maps->cpu_map[raw_smp_processor_id()]);
+		    dev_maps->cpu_map[skb->sender_cpu - 1]);
 		if (map) {
 			if (map->len == 1)
 				queue_index = map->queues[0];
@@ -468,6 +468,11 @@ struct netdev_queue *netdev_pick_tx(struct net_device *dev,
 {
 	int queue_index = 0;

+#ifdef CONFIG_XPS
+	if (skb->sender_cpu == 0)
+		skb->sender_cpu = raw_smp_processor_id() + 1;
+#endif
+
 	if (dev->real_num_tx_queues != 1) {
 		const struct net_device_ops *ops = dev->netdev_ops;
 		if (ops->ndo_select_queue)

--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -825,6 +825,9 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
 #ifdef CONFIG_NET_RX_BUSY_POLL
 	CHECK_SKB_FIELD(napi_id);
 #endif
+#ifdef CONFIG_XPS
+	CHECK_SKB_FIELD(sender_cpu);
+#endif
 #ifdef CONFIG_NET_SCHED
 	CHECK_SKB_FIELD(tc_index);
 #ifdef CONFIG_NET_CLS_ACT
@@ -4169,6 +4172,7 @@ void skb_scrub_packet(struct sk_buff *skb, bool xnet)
 	skb->ignore_df = 0;
 	skb_dst_drop(skb);
 	skb->mark = 0;
+	skb->sender_cpu = 0;
 	skb_init_secmark(skb);
 	secpath_reset(skb);
 	nf_reset(skb);