Merge branch 'vrf-pktinfo'

David Ahern says: ==================== net: vrf: Fixup PKTINFO to return enslaved device index Applications such as OSPF and BFD need the original ingress device not the VRF device; the latter can be derived from the former. To that end move the packet intercept from an rx handler that is invoked by __netif_receive_skb_core to the ipv4 and ipv6 receive processing. IPv6 already saves the skb_iif to the control buffer in ipv6_rcv. Since the skb->dev has not been switched the cb has the enslaved device. Make the same happen for IPv4 by adding the skb_iif to inet_skb_parm and set it in ipv4 code after clearing the skb control buffer similar to IPv6. From there the pktinfo can just pull it from cb with the PKTINFO_SKB_CB cast. ==================== Signed-off-by: N David S. Miller <davem@davemloft.net>

Merge branch 'vrf-pktinfo'
David Ahern says: ==================== net: vrf: Fixup PKTINFO to return enslaved device index Applications such as OSPF and BFD need the original ingress device not the VRF device; the latter can be derived from the former. To that end move the packet intercept from an rx handler that is invoked by __netif_receive_skb_core to the ipv4 and ipv6 receive processing. IPv6 already saves the skb_iif to the control buffer in ipv6_rcv. Since the skb->dev has not been switched the cb has the enslaved device. Make the same happen for IPv4 by adding the skb_iif to inet_skb_parm and set it in ipv4 code after clearing the skb control buffer similar to IPv6. From there the pktinfo can just pull it from cb with the PKTINFO_SKB_CB cast. ==================== Signed-off-by: N David S. Miller <davem@davemloft.net>
c3f1010b · David S. Miller · ca4aa976 · 0b922b7a · c3f1010b · c3f1010b
10 changed file
--- a/drivers/net/vrf.c
+++ b/drivers/net/vrf.c
@@ -42,9 +42,6 @@
 #define DRV_NAME	"vrf"
 #define DRV_VERSION	"1.0"

-#define vrf_master_get_rcu(dev) \
-	((struct net_device *)rcu_dereference(dev->rx_handler_data))
-
 struct net_vrf {
 	struct rtable           *rth;
 	struct rt6_info		*rt6;
@@ -60,90 +57,12 @@ struct pcpu_dstats {
 	struct u64_stats_sync	syncp;
 };

-/* neighbor handling is done with actual device; do not want
- * to flip skb->dev for those ndisc packets. This really fails
- * for multiple next protocols (e.g., NEXTHDR_HOP). But it is
- * a start.
- */
-#if IS_ENABLED(CONFIG_IPV6)
-static bool check_ipv6_frame(const struct sk_buff *skb)
-{
-	const struct ipv6hdr *ipv6h;
-	struct ipv6hdr _ipv6h;
-	bool rc = true;
-
-	ipv6h = skb_header_pointer(skb, 0, sizeof(_ipv6h), &_ipv6h);
-	if (!ipv6h)
-		goto out;
-
-	if (ipv6h->nexthdr == NEXTHDR_ICMP) {
-		const struct icmp6hdr *icmph;
-		struct icmp6hdr _icmph;
-
-		icmph = skb_header_pointer(skb, sizeof(_ipv6h),
-					   sizeof(_icmph), &_icmph);
-		if (!icmph)
-			goto out;
-
-		switch (icmph->icmp6_type) {
-		case NDISC_ROUTER_SOLICITATION:
-		case NDISC_ROUTER_ADVERTISEMENT:
-		case NDISC_NEIGHBOUR_SOLICITATION:
-		case NDISC_NEIGHBOUR_ADVERTISEMENT:
-		case NDISC_REDIRECT:
-			rc = false;
-			break;
-		}
-	}
-
-out:
-	return rc;
-}
-#else
-static bool check_ipv6_frame(const struct sk_buff *skb)
-{
-	return false;
-}
-#endif
-
-static bool is_ip_rx_frame(struct sk_buff *skb)
-{
-	switch (skb->protocol) {
-	case htons(ETH_P_IP):
-		return true;
-	case htons(ETH_P_IPV6):
-		return check_ipv6_frame(skb);
-	}
-	return false;
-}
-
 static void vrf_tx_error(struct net_device *vrf_dev, struct sk_buff *skb)
 {
 	vrf_dev->stats.tx_errors++;
 	kfree_skb(skb);
 }

-/* note: already called with rcu_read_lock */
-static rx_handler_result_t vrf_handle_frame(struct sk_buff **pskb)
-{
-	struct sk_buff *skb = *pskb;
-
-	if (is_ip_rx_frame(skb)) {
-		struct net_device *dev = vrf_master_get_rcu(skb->dev);
-		struct pcpu_dstats *dstats = this_cpu_ptr(dev->dstats);
-
-		u64_stats_update_begin(&dstats->syncp);
-		dstats->rx_pkts++;
-		dstats->rx_bytes += skb->len;
-		u64_stats_update_end(&dstats->syncp);
-
-		skb->dev = dev;
-
-		return RX_HANDLER_ANOTHER;
-	}
-	return RX_HANDLER_PASS;
-}
-
 static struct rtnl_link_stats64 *vrf_get_stats64(struct net_device *dev,
 						 struct rtnl_link_stats64 *stats)
 {
@@ -506,28 +425,14 @@ static int do_vrf_add_slave(struct net_device *dev, struct net_device *port_dev)
 {
 	int ret;

-	/* register the packet handler for slave ports */
-	ret = netdev_rx_handler_register(port_dev, vrf_handle_frame, dev);
-	if (ret) {
-		netdev_err(port_dev,
-			   "Device %s failed to register rx_handler\n",
-			   port_dev->name);
-		goto out_fail;
-	}
-
 	ret = netdev_master_upper_dev_link(port_dev, dev, NULL, NULL);
 	if (ret < 0)
-		goto out_unregister;
+		return ret;

 	port_dev->priv_flags |= IFF_L3MDEV_SLAVE;
 	cycle_netdev(port_dev);

 	return 0;
-
-out_unregister:
-	netdev_rx_handler_unregister(port_dev);
-out_fail:
-	return ret;
 }

 static int vrf_add_slave(struct net_device *dev, struct net_device *port_dev)
@@ -544,8 +449,6 @@ static int do_vrf_del_slave(struct net_device *dev, struct net_device *port_dev)
 	netdev_upper_dev_unlink(port_dev, dev);
 	port_dev->priv_flags &= ~IFF_L3MDEV_SLAVE;

-	netdev_rx_handler_unregister(port_dev);
-
 	cycle_netdev(port_dev);

 	return 0;
@@ -669,6 +572,95 @@ static int vrf_get_saddr(struct net_device *dev, struct flowi4 *fl4)
 	return rc;
 }

+#if IS_ENABLED(CONFIG_IPV6)
+/* neighbor handling is done with actual device; do not want
+ * to flip skb->dev for those ndisc packets. This really fails
+ * for multiple next protocols (e.g., NEXTHDR_HOP). But it is
+ * a start.
+ */
+static bool ipv6_ndisc_frame(const struct sk_buff *skb)
+{
+	const struct ipv6hdr *iph = ipv6_hdr(skb);
+	bool rc = false;
+
+	if (iph->nexthdr == NEXTHDR_ICMP) {
+		const struct icmp6hdr *icmph;
+		struct icmp6hdr _icmph;
+
+		icmph = skb_header_pointer(skb, sizeof(*iph),
+					   sizeof(_icmph), &_icmph);
+		if (!icmph)
+			goto out;
+
+		switch (icmph->icmp6_type) {
+		case NDISC_ROUTER_SOLICITATION:
+		case NDISC_ROUTER_ADVERTISEMENT:
+		case NDISC_NEIGHBOUR_SOLICITATION:
+		case NDISC_NEIGHBOUR_ADVERTISEMENT:
+		case NDISC_REDIRECT:
+			rc = true;
+			break;
+		}
+	}
+
+out:
+	return rc;
+}
+
+static struct sk_buff *vrf_ip6_rcv(struct net_device *vrf_dev,
+				   struct sk_buff *skb)
+{
+	/* if packet is NDISC keep the ingress interface */
+	if (!ipv6_ndisc_frame(skb)) {
+		skb->dev = vrf_dev;
+		skb->skb_iif = vrf_dev->ifindex;
+
+		skb_push(skb, skb->mac_len);
+		dev_queue_xmit_nit(skb, vrf_dev);
+		skb_pull(skb, skb->mac_len);
+
+		IP6CB(skb)->flags |= IP6SKB_L3SLAVE;
+	}
+
+	return skb;
+}
+
+#else
+static struct sk_buff *vrf_ip6_rcv(struct net_device *vrf_dev,
+				   struct sk_buff *skb)
+{
+	return skb;
+}
+#endif
+
+static struct sk_buff *vrf_ip_rcv(struct net_device *vrf_dev,
+				  struct sk_buff *skb)
+{
+	skb->dev = vrf_dev;
+	skb->skb_iif = vrf_dev->ifindex;
+
+	skb_push(skb, skb->mac_len);
+	dev_queue_xmit_nit(skb, vrf_dev);
+	skb_pull(skb, skb->mac_len);
+
+	return skb;
+}
+
+/* called with rcu lock held */
+static struct sk_buff *vrf_l3_rcv(struct net_device *vrf_dev,
+				  struct sk_buff *skb,
+				  u16 proto)
+{
+	switch (proto) {
+	case AF_INET:
+		return vrf_ip_rcv(vrf_dev, skb);
+	case AF_INET6:
+		return vrf_ip6_rcv(vrf_dev, skb);
+	}
+
+	return skb;
+}
+
 #if IS_ENABLED(CONFIG_IPV6)
 static struct dst_entry *vrf_get_rt6_dst(const struct net_device *dev,
 					 const struct flowi6 *fl6)
@@ -690,6 +682,7 @@ static const struct l3mdev_ops vrf_l3mdev_ops = {
 	.l3mdev_fib_table	= vrf_fib_table,
 	.l3mdev_get_rtable	= vrf_get_rtable,
 	.l3mdev_get_saddr	= vrf_get_saddr,
+	.l3mdev_l3_rcv		= vrf_l3_rcv,
 #if IS_ENABLED(CONFIG_IPV6)
 	.l3mdev_get_rt6_dst	= vrf_get_rt6_dst,
 #endif

--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -118,14 +118,29 @@ struct inet6_skb_parm {
 #define IP6SKB_ROUTERALERT	8
 #define IP6SKB_FRAGMENTED      16
 #define IP6SKB_HOPBYHOP        32
+#define IP6SKB_L3SLAVE         64
 };

+#if defined(CONFIG_NET_L3_MASTER_DEV)
+static inline bool skb_l3mdev_slave(__u16 flags)
+{
+	return flags & IP6SKB_L3SLAVE;
+}
+#else
+static inline bool skb_l3mdev_slave(__u16 flags)
+{
+	return false;
+}
+#endif
+
 #define IP6CB(skb)	((struct inet6_skb_parm*)((skb)->cb))
 #define IP6CBMTU(skb)	((struct ip6_mtuinfo *)((skb)->cb))

 static inline int inet6_iif(const struct sk_buff *skb)
 {
-	return IP6CB(skb)->iif;
+	bool l3_slave = skb_l3mdev_slave(IP6CB(skb)->flags);
+
+	return l3_slave ? skb->skb_iif : IP6CB(skb)->iif;
 }

 struct tcp6_request_sock {

--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3258,6 +3258,8 @@ int dev_forward_skb(struct net_device *dev, struct sk_buff *skb);
 bool is_skb_forwardable(const struct net_device *dev,
 			const struct sk_buff *skb);

+void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev);
+
 extern int		netdev_budget;

 /* Called by rtnetlink.c:rtnl_unlock() */

--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -36,6 +36,7 @@
 struct sock;

 struct inet_skb_parm {
+	int			iif;
 	struct ip_options	opt;		/* Compiled IP options		*/
 	unsigned char		flags;


--- a/include/net/l3mdev.h
+++ b/include/net/l3mdev.h
@@ -25,6 +25,8 @@

 struct l3mdev_ops {
 	u32		(*l3mdev_fib_table)(const struct net_device *dev);
+	struct sk_buff * (*l3mdev_l3_rcv)(struct net_device *dev,
+					  struct sk_buff *skb, u16 proto);

 	/* IPv4 ops */
 	struct rtable *	(*l3mdev_get_rtable)(const struct net_device *dev,
@@ -134,6 +136,34 @@ int l3mdev_get_saddr(struct net *net, int ifindex, struct flowi4 *fl4);

 struct dst_entry *l3mdev_get_rt6_dst(struct net *net, const struct flowi6 *fl6);

+static inline
+struct sk_buff *l3mdev_l3_rcv(struct sk_buff *skb, u16 proto)
+{
+	struct net_device *master = NULL;
+
+	if (netif_is_l3_slave(skb->dev))
+		master = netdev_master_upper_dev_get_rcu(skb->dev);
+	else if (netif_is_l3_master(skb->dev))
+		master = skb->dev;
+
+	if (master && master->l3mdev_ops->l3mdev_l3_rcv)
+		skb = master->l3mdev_ops->l3mdev_l3_rcv(master, skb, proto);
+
+	return skb;
+}
+
+static inline
+struct sk_buff *l3mdev_ip_rcv(struct sk_buff *skb)
+{
+	return l3mdev_l3_rcv(skb, AF_INET);
+}
+
+static inline
+struct sk_buff *l3mdev_ip6_rcv(struct sk_buff *skb)
+{
+	return l3mdev_l3_rcv(skb, AF_INET6);
+}
+
 #else

 static inline int l3mdev_master_ifindex_rcu(const struct net_device *dev)
@@ -194,6 +224,18 @@ struct dst_entry *l3mdev_get_rt6_dst(struct net *net, const struct flowi6 *fl6)
 {
 	return NULL;
 }
+
+static inline
+struct sk_buff *l3mdev_ip_rcv(struct sk_buff *skb)
+{
+	return skb;
+}
+
+static inline
+struct sk_buff *l3mdev_ip6_rcv(struct sk_buff *skb)
+{
+	return skb;
+}
 #endif

 #endif /* _NET_L3MDEV_H_ */
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -786,7 +786,9 @@ struct tcp_skb_cb {
 */
 static inline int tcp_v6_iif(const struct sk_buff *skb)
 {
-	return TCP_SKB_CB(skb)->header.h6.iif;
+	bool l3_slave = skb_l3mdev_slave(TCP_SKB_CB(skb)->header.h6.flags);
+
+	return l3_slave ? skb->skb_iif : TCP_SKB_CB(skb)->header.h6.iif;
 }
 #endif


--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1850,7 +1850,7 @@ static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
 *	taps currently in use.
 */

-static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
+void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
 {
 	struct packet_type *ptype;
 	struct sk_buff *skb2 = NULL;
@@ -1907,6 +1907,7 @@ static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
 		pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
 	rcu_read_unlock();
 }
+EXPORT_SYMBOL_GPL(dev_queue_xmit_nit);

 /**
 * netif_setup_tc - Handle tc mappings on real_num_tx_queues change

--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -313,6 +313,13 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
 	const struct iphdr *iph = ip_hdr(skb);
 	struct rtable *rt;

+	/* if ingress device is enslaved to an L3 master device pass the
+	 * skb to its handler for processing
+	 */
+	skb = l3mdev_ip_rcv(skb);
+	if (!skb)
+		return NET_RX_SUCCESS;
+
 	if (net->ipv4.sysctl_ip_early_demux &&
 	    !skb_dst(skb) &&
 	    !skb->sk &&
@@ -471,6 +478,7 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt,

 	/* Remove any debris in the socket control block */
 	memset(IPCB(skb), 0, sizeof(struct inet_skb_parm));
+	IPCB(skb)->iif = skb->skb_iif;

 	/* Must drop socket now because of tproxy. */
 	skb_orphan(skb);

--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -1193,7 +1193,12 @@ void ipv4_pktinfo_prepare(const struct sock *sk, struct sk_buff *skb)
 		       ipv6_sk_rxinfo(sk);

 	if (prepare && skb_rtable(skb)) {
-		pktinfo->ipi_ifindex = inet_iif(skb);
+		/* skb->cb is overloaded: prior to this point it is IP{6}CB
+		 * which has interface index (iif) as the first member of the
+		 * underlying inet{6}_skb_parm struct. This code then overlays
+		 * PKTINFO_SKB_CB and in_pktinfo also has iif as the first
+		 * element so the iif is picked up from the prior IPCB
+		 */
 		pktinfo->ipi_spec_dst.s_addr = fib_compute_spec_dst(skb);
 	} else {
 		pktinfo->ipi_ifindex = 0;

--- a/net/ipv6/ip6_input.c
+++ b/net/ipv6/ip6_input.c
@@ -49,6 +49,13 @@

 int ip6_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
 {
+	/* if ingress device is enslaved to an L3 master device pass the
+	 * skb to its handler for processing
+	 */
+	skb = l3mdev_ip6_rcv(skb);
+	if (!skb)
+		return NET_RX_SUCCESS;
+
 	if (net->ipv4.sysctl_ip_early_demux && !skb_dst(skb) && skb->sk == NULL) {
 		const struct inet6_protocol *ipprot;