net: Consistent skb timestamping

With RPS inclusion, skb timestamping is not consistent in RX path. If netif_receive_skb() is used, its deferred after RPS dispatch. If netif_rx() is used, its done before RPS dispatch. This can give strange tcpdump timestamps results. I think timestamping should be done as soon as possible in the receive path, to get meaningful values (ie timestamps taken at the time packet was delivered by NIC driver to our stack), even if NAPI already can defer timestamping a bit (RPS can help to reduce the gap) Tom Herbert prefer to sample timestamps after RPS dispatch. In case sampling is expensive (HPET/acpi_pm on x86), this makes sense. Let admins switch from one mode to another, using a new sysctl, /proc/sys/net/core/netdev_tstamp_prequeue Its default value (1), means timestamps are taken as soon as possible, before backlog queueing, giving accurate timestamps. Setting a 0 value permits to sample timestamps when processing backlog, after RPS dispatch, to lower the load of the pre-RPS cpu. Signed-off-by: N Eric Dumazet <eric.dumazet@gmail.com> Signed-off-by: N David S. Miller <davem@davemloft.net>

net: Consistent skb timestamping
With RPS inclusion, skb timestamping is not consistent in RX path. If netif_receive_skb() is used, its deferred after RPS dispatch. If netif_rx() is used, its done before RPS dispatch. This can give strange tcpdump timestamps results. I think timestamping should be done as soon as possible in the receive path, to get meaningful values (ie timestamps taken at the time packet was delivered by NIC driver to our stack), even if NAPI already can defer timestamping a bit (RPS can help to reduce the gap) Tom Herbert prefer to sample timestamps after RPS dispatch. In case sampling is expensive (HPET/acpi_pm on x86), this makes sense. Let admins switch from one mode to another, using a new sysctl, /proc/sys/net/core/netdev_tstamp_prequeue Its default value (1), means timestamps are taken as soon as possible, before backlog queueing, giving accurate timestamps. Setting a 0 value permits to sample timestamps when processing backlog, after RPS dispatch, to lower the load of the pre-RPS cpu. Signed-off-by: N Eric Dumazet <eric.dumazet@gmail.com> Signed-off-by: N David S. Miller <davem@davemloft.net>
3b098e2d · Eric Dumazet · David S. Miller · a1aa3483 · 3b098e2d · 3b098e2d
4 changed file
--- a/Documentation/sysctl/net.txt
+++ b/Documentation/sysctl/net.txt
@@ -84,6 +84,16 @@ netdev_max_backlog
 Maximum number  of  packets,  queued  on  the  INPUT  side, when the interface
 receives packets faster than kernel can process them.

+netdev_tstamp_prequeue
+----------------------
+
+If set to 0, RX packet timestamps can be sampled after RPS processing, when
+the target CPU processes packets. It might give some delay on timestamps, but
+permit to distribute the load on several cpus.
+
+If set to 1 (default), timestamps are sampled as soon as possible, before
+queueing.
+
 optmem_max
 ----------


--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2100,6 +2100,7 @@ extern const struct net_device_stats *dev_get_stats(struct net_device *dev);
 extern void		dev_txq_stats_fold(const struct net_device *dev, struct net_device_stats *stats);

 extern int		netdev_max_backlog;
+extern int		netdev_tstamp_prequeue;
 extern int		weight_p;
 extern int		netdev_set_master(struct net_device *dev, struct net_device *master);
 extern int skb_checksum_help(struct sk_buff *skb);

--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1454,7 +1454,7 @@ void net_disable_timestamp(void)
 }
 EXPORT_SYMBOL(net_disable_timestamp);

-static inline void net_timestamp(struct sk_buff *skb)
+static inline void net_timestamp_set(struct sk_buff *skb)
 {
 	if (atomic_read(&netstamp_needed))
 		__net_timestamp(skb);
@@ -1462,6 +1462,12 @@ static inline void net_timestamp(struct sk_buff *skb)
 		skb->tstamp.tv64 = 0;
 }

+static inline void net_timestamp_check(struct sk_buff *skb)
+{
+	if (!skb->tstamp.tv64 && atomic_read(&netstamp_needed))
+		__net_timestamp(skb);
+}
+
 /**
 * dev_forward_skb - loopback an skb to another netif
 *
@@ -1508,9 +1514,9 @@ static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)

 #ifdef CONFIG_NET_CLS_ACT
 	if (!(skb->tstamp.tv64 && (G_TC_FROM(skb->tc_verd) & AT_INGRESS)))
-		net_timestamp(skb);
+		net_timestamp_set(skb);
 #else
-	net_timestamp(skb);
+	net_timestamp_set(skb);
 #endif

 	rcu_read_lock();
@@ -2201,6 +2207,7 @@ EXPORT_SYMBOL(dev_queue_xmit);
  =======================================================================*/

 int netdev_max_backlog __read_mostly = 1000;
+int netdev_tstamp_prequeue __read_mostly = 1;
 int netdev_budget __read_mostly = 300;
 int weight_p __read_mostly = 64;            /* old backlog weight */

@@ -2465,8 +2472,8 @@ int netif_rx(struct sk_buff *skb)
 	if (netpoll_rx(skb))
 		return NET_RX_DROP;

-	if (!skb->tstamp.tv64)
-		net_timestamp(skb);
+	if (netdev_tstamp_prequeue)
+		net_timestamp_check(skb);

 #ifdef CONFIG_RPS
 	{
@@ -2791,8 +2798,8 @@ static int __netif_receive_skb(struct sk_buff *skb)
 	int ret = NET_RX_DROP;
 	__be16 type;

-	if (!skb->tstamp.tv64)
-		net_timestamp(skb);
+	if (!netdev_tstamp_prequeue)
+		net_timestamp_check(skb);

 	if (vlan_tx_tag_present(skb) && vlan_hwaccel_do_receive(skb))
 		return NET_RX_SUCCESS;
@@ -2910,23 +2917,28 @@ static int __netif_receive_skb(struct sk_buff *skb)
 */
 int netif_receive_skb(struct sk_buff *skb)
 {
+	if (netdev_tstamp_prequeue)
+		net_timestamp_check(skb);
+
 #ifdef CONFIG_RPS
-	struct rps_dev_flow voidflow, *rflow = &voidflow;
-	int cpu, ret;
+	{
+		struct rps_dev_flow voidflow, *rflow = &voidflow;
+		int cpu, ret;

-	rcu_read_lock();
+		rcu_read_lock();
+
+		cpu = get_rps_cpu(skb->dev, skb, &rflow);

-	cpu = get_rps_cpu(skb->dev, skb, &rflow);
+		if (cpu >= 0) {
+			ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
+			rcu_read_unlock();
+		} else {
+			rcu_read_unlock();
+			ret = __netif_receive_skb(skb);
+		}

-	if (cpu >= 0) {
-		ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
-		rcu_read_unlock();
-	} else {
-		rcu_read_unlock();
-		ret = __netif_receive_skb(skb);
+		return ret;
 	}
-
-	return ret;
 #else
 	return __netif_receive_skb(skb);
 #endif

--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -121,6 +121,13 @@ static struct ctl_table net_core_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec
 	},
+	{
+		.procname	= "netdev_tstamp_prequeue",
+		.data		= &netdev_tstamp_prequeue,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec
+	},
 	{
 		.procname	= "message_cost",
 		.data		= &net_ratelimit_state.interval,