Merge branch 'vrf-ipv6'

David Ahern says:

====================
net: VRF support in IPv6 stack

Initial support for VRF in IPv6 stack. Makes IPv6 functionality on par with IPv4 -- ping, tcp client/server and udp client/server all work fine. tcpdump on vrf device and external tap (e.g., host side tap device) shows all packets with proper addresses.

IPv6 does not need the source address operation like IPv4. Verified vti6 works properly in my setup as does use of an IPv6 address on the VRF device.

v3
- re-based to top of net-next (updates per net namespace changes by Eric)
- fixed dst_entry typecasts as requested by Dave
- added flags to inet6_rtm_getroute (IPv6 version of deaa0a6a)

v2
- fixed CONFIG_IPV6 dependency as questioned by Cong
- if IPV6 is a module, kbuild ensures VRF is a module
- if IPV6 is disabled IPV6 functionality is compiled out of VRF module
- addressed comments from Nik over IRC
- removed duplicate call to netif_is_l3_master in l3mdev_rt6_dst_by_oif
- changed allocation flag from GFP_ATOMIC to GFP_KERNEL since it is init time
- added free of rt6i_pcpu
- check_ipv6_frame returns false only if packet is NDISC type
====================

Signed-off-by: David S. Miller <davem@davemloft.net>

Merge branch 'vrf-ipv6'
David Ahern says:

====================
net: VRF support in IPv6 stack

Initial support for VRF in IPv6 stack. Makes IPv6 functionality on par with IPv4 -- ping, tcp client/server and udp client/server all work fine. tcpdump on vrf device and external tap (e.g., host side tap device) shows all packets with proper addresses.

IPv6 does not need the source address operation like IPv4. Verified vti6 works properly in my setup as does use of an IPv6 address on the VRF device.

v3
- re-based to top of net-next (updates per net namespace changes by Eric)
- fixed dst_entry typecasts as requested by Dave
- added flags to inet6_rtm_getroute (IPv6 version of deaa0a6a)

v2
- fixed CONFIG_IPV6 dependency as questioned by Cong
- if IPV6 is a module, kbuild ensures VRF is a module
- if IPV6 is disabled IPV6 functionality is compiled out of VRF module
- addressed comments from Nik over IRC
- removed duplicate call to netif_is_l3_master in l3mdev_rt6_dst_by_oif
- changed allocation flag from GFP_ATOMIC to GFP_KERNEL since it is init time
- added free of rt6i_pcpu
- check_ipv6_frame returns false only if packet is NDISC type
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
4b918163 · David S. Miller · af379392 · ca254490 · 4b918163 · 4b918163
9 changed files
--- a/drivers/net/Kconfig
+++ b/drivers/net/Kconfig
@@ -298,8 +298,10 @@ config NLMON

 config NET_VRF
 	tristate "Virtual Routing and Forwarding (Lite)"
-	depends on IP_MULTIPLE_TABLES && IPV6_MULTIPLE_TABLES
+	depends on IP_MULTIPLE_TABLES
 	depends on NET_L3_MASTER_DEV
+	depends on IPV6 || IPV6=n
+	depends on IPV6_MULTIPLE_TABLES || IPV6=n
 	---help---
 	  This option enables the support for mapping interfaces into VRF's. The
 	  support enables VRF devices.

--- a/drivers/net/vrf.c
+++ b/drivers/net/vrf.c
@@ -30,6 +30,7 @@
 #include <net/arp.h>
 #include <net/ip.h>
 #include <net/ip_fib.h>
+#include <net/ip6_fib.h>
 #include <net/ip6_route.h>
 #include <net/rtnetlink.h>
 #include <net/route.h>
@@ -57,6 +58,7 @@ struct slave_queue {
 struct net_vrf {
 	struct slave_queue      queue;
 	struct rtable           *rth;
+	struct rt6_info		*rt6;
 	u32                     tb_id;
 };

@@ -104,12 +106,56 @@ static struct dst_ops vrf_dst_ops = {
 	.default_advmss	= vrf_default_advmss,
 };

+/* Neighbour handling is done on the actual device, so skb->dev must not
+ * be flipped for NDISC packets: with IPv6 enabled this returns false for
+ * those and true otherwise. Only the first next-header is checked, so
+ * NDISC behind an extension header (e.g., NEXTHDR_HOP) is missed.
+ */
+#if IS_ENABLED(CONFIG_IPV6)
+static bool check_ipv6_frame(const struct sk_buff *skb)
+{
+	const struct ipv6hdr *ipv6h = (struct ipv6hdr *)skb->data;
+	size_t hlen = sizeof(*ipv6h);
+	bool rc = true; /* cleared only for the NDISC types below */
+
+	if (skb->len < hlen) /* shorter than an IPv6 header: rc stays true */
+		goto out;
+
+	if (ipv6h->nexthdr == NEXTHDR_ICMP) { /* ICMPv6 right after the header */
+		const struct icmp6hdr *icmph;
+
+		if (skb->len < hlen + sizeof(*icmph))
+			goto out;
+
+		icmph = (struct icmp6hdr *)(skb->data + sizeof(*ipv6h));
+		switch (icmph->icmp6_type) {
+		case NDISC_ROUTER_SOLICITATION:
+		case NDISC_ROUTER_ADVERTISEMENT:
+		case NDISC_NEIGHBOUR_SOLICITATION:
+		case NDISC_NEIGHBOUR_ADVERTISEMENT:
+		case NDISC_REDIRECT:
+			rc = false;
+			break;
+		}
+	}
+
+out:
+	return rc;
+}
+#else /* IPv6 compiled out: every ETH_P_IPV6 frame yields false */
+static bool check_ipv6_frame(const struct sk_buff *skb)
+{
+	return false;
+}
+#endif
+
 static bool is_ip_rx_frame(struct sk_buff *skb)
 {
 	switch (skb->protocol) {
 	case htons(ETH_P_IP):
-	case htons(ETH_P_IPV6):
 		return true;
+	case htons(ETH_P_IPV6):
+		return check_ipv6_frame(skb);
 	}
 	return false;
 }
@@ -169,12 +215,53 @@ static struct rtnl_link_stats64 *vrf_get_stats64(struct net_device *dev,
 	return stats;
 }

+#if IS_ENABLED(CONFIG_IPV6)
 static netdev_tx_t vrf_process_v6_outbound(struct sk_buff *skb,
 					   struct net_device *dev)
 {
+	const struct ipv6hdr *iph = ipv6_hdr(skb);
+	struct net *net = dev_net(skb->dev);
+	struct flowi6 fl6 = {
+		/* needed to match OIF rule */
+		.flowi6_oif = dev->ifindex,
+		.flowi6_iif = LOOPBACK_IFINDEX,
+		.daddr = iph->daddr,
+		.saddr = iph->saddr,
+		.flowlabel = ip6_flowinfo(iph),
+		.flowi6_mark = skb->mark,
+		.flowi6_proto = iph->nexthdr,
+		.flowi6_flags = FLOWI_FLAG_L3MDEV_SRC | FLOWI_FLAG_SKIP_NH_OIF,
+	};
+	int ret = NET_XMIT_DROP;
+	struct dst_entry *dst;
+	struct dst_entry *dst_null = &net->ipv6.ip6_null_entry->dst;
+
+	dst = ip6_route_output(net, NULL, &fl6);
+	if (dst == dst_null) /* lookup returned this netns' ip6_null_entry */
+		goto err;
+
+	skb_dst_drop(skb); /* swap whatever dst the skb carried for the new one */
+	skb_dst_set(skb, dst);
+
+	ret = ip6_local_out(net, skb->sk, skb);
+	if (unlikely(net_xmit_eval(ret)))
+		dev->stats.tx_errors++;
+	else
+		ret = NET_XMIT_SUCCESS; /* any non-error result reads as success */
+
+	return ret;
+err:
 	vrf_tx_error(dev, skb);
 	return NET_XMIT_DROP;
 }
+#else
+static netdev_tx_t vrf_process_v6_outbound(struct sk_buff *skb,
+					   struct net_device *dev)
+{
+	vrf_tx_error(dev, skb);
+	return NET_XMIT_DROP;
+}
+#endif

 static int vrf_send_v4_prep(struct sk_buff *skb, struct flowi4 *fl4,
 			    struct net_device *vrf_dev)
@@ -269,6 +356,157 @@ static netdev_tx_t vrf_xmit(struct sk_buff *skb, struct net_device *dev)
 	return ret;
 }

+#if IS_ENABLED(CONFIG_IPV6)
+static struct dst_entry *vrf_ip6_check(struct dst_entry *dst, u32 cookie)
+{
+	return dst;
+}
+
+static struct dst_ops vrf_dst_ops6 = {
+	.family		= AF_INET6,
+	.local_out	= ip6_local_out,
+	.check		= vrf_ip6_check,
+	.mtu		= vrf_v4_mtu,
+	.destroy	= vrf_dst_destroy,
+	.default_advmss	= vrf_default_advmss,
+};
+
+static int init_dst_ops6_kmem_cachep(void)
+{
+	vrf_dst_ops6.kmem_cachep = kmem_cache_create("vrf_ip6_dst_cache",
+						     sizeof(struct rt6_info),
+						     0,
+						     SLAB_HWCACHE_ALIGN,
+						     NULL);
+
+	if (!vrf_dst_ops6.kmem_cachep)
+		return -ENOMEM;
+
+	return 0;
+}
+
+static void free_dst_ops6_kmem_cachep(void)
+{
+	kmem_cache_destroy(vrf_dst_ops6.kmem_cachep);
+}
+
+static int vrf_input6(struct sk_buff *skb)
+{
+	skb->dev->stats.rx_errors++;
+	kfree_skb(skb);
+	return 0;
+}
+
+/* modelled after ip6_finish_output2: resolve the neighbour and transmit */
+static int vrf_finish_output6(struct net *net, struct sock *sk,
+			      struct sk_buff *skb)
+{
+	struct dst_entry *dst = skb_dst(skb);
+	struct net_device *dev = dst->dev;
+	struct neighbour *neigh;
+	struct in6_addr *nexthop;
+	int ret;
+
+	skb->protocol = htons(ETH_P_IPV6);
+	skb->dev = dev; /* send via the device the attached dst points at */
+
+	rcu_read_lock_bh();
+	nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr);
+	neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
+	if (unlikely(!neigh)) /* lookup found nothing: create one in nd_tbl */
+		neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
+	if (!IS_ERR(neigh)) {
+		ret = dst_neigh_output(dst, neigh, skb);
+		rcu_read_unlock_bh();
+		return ret;
+	}
+	rcu_read_unlock_bh();
+
+	IP6_INC_STATS(dev_net(dst->dev),
+		      ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
+	kfree_skb(skb); /* neigh is an ERR_PTR here: count it and drop */
+	return -EINVAL;
+}
+
+/* modelled after ip6_output */
+static int vrf_output6(struct net *net, struct sock *sk, struct sk_buff *skb)
+{
+	return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
+			    net, sk, skb, NULL, skb_dst(skb)->dev,
+			    vrf_finish_output6,
+			    !(IP6CB(skb)->flags & IP6SKB_REROUTED));
+}
+
+static void vrf_rt6_destroy(struct net_vrf *vrf)
+{
+	free_percpu(vrf->rt6->rt6i_pcpu); /* before dst_destroy() frees rt6 */
+	dst_destroy(&vrf->rt6->dst);
+	vrf->rt6 = NULL;
+}
+
+static int vrf_rt6_create(struct net_device *dev)
+{
+	struct net_vrf *vrf = netdev_priv(dev);
+	struct dst_entry *dst;
+	struct rt6_info *rt6;
+	int cpu;
+	int rc = -ENOMEM; /* returned if either allocation below fails */
+
+	rt6 = dst_alloc(&vrf_dst_ops6, dev, 0,
+			DST_OBSOLETE_NONE,
+			(DST_HOST | DST_NOPOLICY | DST_NOXFRM));
+	if (!rt6)
+		goto out;
+
+	dst = &rt6->dst;
+	/* zero the rt6_info tail before rt6i_pcpu is set, not after it */
+	memset(dst + 1, 0, sizeof(*rt6) - sizeof(*dst));
+
+	rt6->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, GFP_KERNEL);
+	if (!rt6->rt6i_pcpu) {
+		dst_destroy(dst);
+		goto out;
+	}
+	for_each_possible_cpu(cpu) {
+		struct rt6_info **p = per_cpu_ptr(rt6->rt6i_pcpu, cpu);
+		*p =  NULL;
+	}
+
+	INIT_LIST_HEAD(&rt6->rt6i_siblings);
+	INIT_LIST_HEAD(&rt6->rt6i_uncached);
+
+	rt6->dst.input	= vrf_input6;
+	rt6->dst.output	= vrf_output6;
+
+	rt6->rt6i_table = fib6_get_table(dev_net(dev), vrf->tb_id);
+
+	atomic_set(&rt6->dst.__refcnt, 2);
+
+	vrf->rt6 = rt6; /* handed out later by vrf_get_rt6_dst() */
+	rc = 0;
+out:
+	return rc;
+}
+#else
+static int init_dst_ops6_kmem_cachep(void)
+{
+	return 0;
+}
+
+static void free_dst_ops6_kmem_cachep(void)
+{
+}
+
+static void vrf_rt6_destroy(struct net_vrf *vrf)
+{
+}
+
+static int vrf_rt6_create(struct net_device *dev)
+{
+	return 0;
+}
+#endif
+
 /* modelled after ip_finish_output2 */
 static int vrf_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
 {
@@ -490,6 +728,7 @@ static void vrf_dev_uninit(struct net_device *dev)
 	struct slave *slave, *next;

 	vrf_rtable_destroy(vrf);
+	vrf_rt6_destroy(vrf);

 	list_for_each_entry_safe(slave, next, head, list)
 		vrf_del_slave(dev, slave->dev);
@@ -513,10 +752,15 @@ static int vrf_dev_init(struct net_device *dev)
 	if (!vrf->rth)
 		goto out_stats;

+	if (vrf_rt6_create(dev) != 0)
+		goto out_rth;
+
 	dev->flags = IFF_MASTER | IFF_NOARP;

 	return 0;

+out_rth:
+	vrf_rtable_destroy(vrf);
 out_stats:
 	free_percpu(dev->dstats);
 	dev->dstats = NULL;
@@ -586,10 +830,30 @@ static void vrf_get_saddr(struct net_device *dev, struct flowi4 *fl4)
 	fl4->flowi4_scope = scope;
 }

+#if IS_ENABLED(CONFIG_IPV6)
+static struct dst_entry *vrf_get_rt6_dst(const struct net_device *dev,
+					 const struct flowi6 *fl6)
+{
+	struct rt6_info *rt = NULL; /* stays NULL if FLOWI_FLAG_L3MDEV_SRC set */
+
+	if (!(fl6->flowi6_flags & FLOWI_FLAG_L3MDEV_SRC)) {
+		struct net_vrf *vrf = netdev_priv(dev);
+
+		rt = vrf->rt6;
+		atomic_inc(&rt->dst.__refcnt); /* ref on the cached dst returned */
+	}
+
+	return (struct dst_entry *)rt;
+}
+#endif
+
 static const struct l3mdev_ops vrf_l3mdev_ops = {
 	.l3mdev_fib_table	= vrf_fib_table,
 	.l3mdev_get_rtable	= vrf_get_rtable,
 	.l3mdev_get_saddr	= vrf_get_saddr,
+#if IS_ENABLED(CONFIG_IPV6)
+	.l3mdev_get_rt6_dst	= vrf_get_rt6_dst,
+#endif
 };

 static void vrf_get_drvinfo(struct net_device *dev,
@@ -731,6 +995,10 @@ static int __init vrf_init_module(void)
 	if (!vrf_dst_ops.kmem_cachep)
 		return -ENOMEM;

+	rc = init_dst_ops6_kmem_cachep();
+	if (rc != 0)
+		goto error2;
+
 	register_netdevice_notifier(&vrf_notifier_block);

 	rc = rtnl_link_register(&vrf_link_ops);
@@ -741,6 +1009,8 @@ static int __init vrf_init_module(void)

 error:
 	unregister_netdevice_notifier(&vrf_notifier_block);
+	free_dst_ops6_kmem_cachep();
+error2:
 	kmem_cache_destroy(vrf_dst_ops.kmem_cachep);
 	return rc;
 }
@@ -750,6 +1020,7 @@ static void __exit vrf_cleanup_module(void)
 	rtnl_link_unregister(&vrf_link_ops);
 	unregister_netdevice_notifier(&vrf_notifier_block);
 	kmem_cache_destroy(vrf_dst_ops.kmem_cachep);
+	free_dst_ops6_kmem_cachep();
 }

 module_init(vrf_init_module);

--- a/include/net/l3mdev.h
+++ b/include/net/l3mdev.h
@@ -19,14 +19,22 @@
 * @l3mdev_get_rtable: Get cached IPv4 rtable (dst_entry) for device
 *
 * @l3mdev_get_saddr: Get source address for a flow
+ *
+ * @l3mdev_get_rt6_dst: Get cached IPv6 rt6_info (dst_entry) for device
 */

 struct l3mdev_ops {
 	u32		(*l3mdev_fib_table)(const struct net_device *dev);
+
+	/* IPv4 ops */
 	struct rtable *	(*l3mdev_get_rtable)(const struct net_device *dev,
 					     const struct flowi4 *fl4);
 	void		(*l3mdev_get_saddr)(struct net_device *dev,
 					    struct flowi4 *fl4);
+
+	/* IPv6 ops */
+	struct dst_entry * (*l3mdev_get_rt6_dst)(const struct net_device *dev,
+						 const struct flowi6 *fl6);
 };

 #ifdef CONFIG_NET_L3_MASTER_DEV
@@ -123,6 +131,31 @@ static inline void l3mdev_get_saddr(struct net *net, int ifindex,
 	}
 }

+static inline struct dst_entry *l3mdev_get_rt6_dst(const struct net_device *dev,
+						   const struct flowi6 *fl6)
+{
+	if (netif_is_l3_master(dev) && dev->l3mdev_ops->l3mdev_get_rt6_dst)
+		return dev->l3mdev_ops->l3mdev_get_rt6_dst(dev, fl6);
+
+	return NULL;
+}
+
+static inline
+struct dst_entry *l3mdev_rt6_dst_by_oif(struct net *net,
+					const struct flowi6 *fl6)
+{
+	struct dst_entry *dst = NULL; /* NULL if oif has no device or no dst */
+	struct net_device *dev;
+
+	dev = dev_get_by_index(net, fl6->flowi6_oif);
+	if (dev) {
+		dst = l3mdev_get_rt6_dst(dev, fl6);
+		dev_put(dev); /* pairs with the dev_get_by_index() above */
+	}
+
+	return dst;
+}
+
 #else

 static inline int l3mdev_master_ifindex_rcu(struct net_device *dev)
@@ -171,6 +204,19 @@ static inline void l3mdev_get_saddr(struct net *net, int ifindex,
 				    struct flowi4 *fl4)
 {
 }
+
+static inline
+struct dst_entry *l3mdev_get_rt6_dst(const struct net_device *dev,
+				     const struct flowi6 *fl6)
+{
+	return NULL;
+}
+static inline
+struct dst_entry *l3mdev_rt6_dst_by_oif(struct net *net,
+					const struct flowi6 *fl6)
+{
+	return NULL;
+}
 #endif

 #endif /* _NET_L3MDEV_H_ */
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -81,6 +81,7 @@
 #include <net/ip.h>
 #include <net/netlink.h>
 #include <net/pkt_sched.h>
+#include <net/l3mdev.h>
 #include <linux/if_tunnel.h>
 #include <linux/rtnetlink.h>
 #include <linux/netconf.h>
@@ -2146,7 +2147,7 @@ addrconf_prefix_route(struct in6_addr *pfx, int plen, struct net_device *dev,
 		      unsigned long expires, u32 flags)
 {
 	struct fib6_config cfg = {
-		.fc_table = RT6_TABLE_PREFIX,
+		.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_PREFIX,
 		.fc_metric = IP6_RT_PRIO_ADDRCONF,
 		.fc_ifindex = dev->ifindex,
 		.fc_expires = expires,
@@ -2179,8 +2180,9 @@ static struct rt6_info *addrconf_get_prefix_route(const struct in6_addr *pfx,
 	struct fib6_node *fn;
 	struct rt6_info *rt = NULL;
 	struct fib6_table *table;
+	u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_PREFIX;

-	table = fib6_get_table(dev_net(dev), RT6_TABLE_PREFIX);
+	table = fib6_get_table(dev_net(dev), tb_id);
 	if (!table)
 		return NULL;

@@ -2211,7 +2213,7 @@ static struct rt6_info *addrconf_get_prefix_route(const struct in6_addr *pfx,
 static void addrconf_add_mroute(struct net_device *dev)
 {
 	struct fib6_config cfg = {
-		.fc_table = RT6_TABLE_LOCAL,
+		.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_LOCAL,
 		.fc_metric = IP6_RT_PRIO_ADDRCONF,
 		.fc_ifindex = dev->ifindex,
 		.fc_dst_len = 8,
@@ -3029,6 +3031,10 @@ static void addrconf_addr_gen(struct inet6_dev *idev, bool prefix_route)
 {
 	struct in6_addr addr;

+	/* no link local addresses on L3 master devices */
+	if (netif_is_l3_master(idev->dev))
+		return;
+
 	ipv6_addr_set(&addr, htonl(0xFE800000), 0, 0, 0);

 	if (idev->addr_gen_mode == IN6_ADDR_GEN_MODE_STABLE_PRIVACY) {

--- a/net/ipv6/icmp.c
+++ b/net/ipv6/icmp.c
@@ -68,6 +68,7 @@
 #include <net/xfrm.h>
 #include <net/inet_common.h>
 #include <net/dsfield.h>
+#include <net/l3mdev.h>

 #include <asm/uaccess.h>

@@ -496,6 +497,9 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info)
 	else if (!fl6.flowi6_oif)
 		fl6.flowi6_oif = np->ucast_oif;

+	if (!fl6.flowi6_oif)
+		fl6.flowi6_oif = l3mdev_master_ifindex(skb->dev);
+
 	dst = icmpv6_route_lookup(net, skb, sk, &fl6);
 	if (IS_ERR(dst))
 		goto out;
@@ -575,7 +579,7 @@ static void icmpv6_echo_reply(struct sk_buff *skb)
 	fl6.daddr = ipv6_hdr(skb)->saddr;
 	if (saddr)
 		fl6.saddr = *saddr;
-	fl6.flowi6_oif = skb->dev->ifindex;
+	fl6.flowi6_oif = l3mdev_fib_oif(skb->dev);
 	fl6.fl6_icmp_type = ICMPV6_ECHO_REPLY;
 	fl6.flowi6_mark = mark;
 	security_skb_classify_flow(skb, flowi6_to_flowi(&fl6));

--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -264,6 +264,7 @@ struct fib6_table *fib6_get_table(struct net *net, u32 id)

 	return NULL;
 }
+EXPORT_SYMBOL_GPL(fib6_get_table);

 static void __net_init fib6_tables_init(struct net *net)
 {

--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -55,6 +55,7 @@
 #include <net/xfrm.h>
 #include <net/checksum.h>
 #include <linux/mroute6.h>
+#include <net/l3mdev.h>

 static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
 {
@@ -885,7 +886,8 @@ static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
 #ifdef CONFIG_IPV6_SUBTREES
 	    ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
 #endif
-	    (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex)) {
+	   (!(fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) &&
+	      (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex))) {
 		dst_release(dst);
 		dst = NULL;
 	}
@@ -1037,7 +1039,7 @@ struct dst_entry *ip6_dst_lookup_flow(const struct sock *sk, struct flowi6 *fl6,
 	if (final_dst)
 		fl6->daddr = *final_dst;
 	if (!fl6->flowi6_oif)
-		fl6->flowi6_oif = dst->dev->ifindex;
+		fl6->flowi6_oif = l3mdev_fib_oif(dst->dev);

 	return xfrm_lookup_route(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
 }

--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -67,6 +67,7 @@
 #include <net/flow.h>
 #include <net/ip6_checksum.h>
 #include <net/inet_common.h>
+#include <net/l3mdev.h>
 #include <linux/proc_fs.h>

 #include <linux/netfilter.h>
@@ -147,6 +148,7 @@ struct neigh_table nd_tbl = {
 	.gc_thresh2 =	 512,
 	.gc_thresh3 =	1024,
 };
+EXPORT_SYMBOL_GPL(nd_tbl);

 static void ndisc_fill_addr_option(struct sk_buff *skb, int type, void *data)
 {
@@ -441,8 +443,11 @@ static void ndisc_send_skb(struct sk_buff *skb,

 	if (!dst) {
 		struct flowi6 fl6;
+		int oif = l3mdev_fib_oif(skb->dev);

-		icmpv6_flow_init(sk, &fl6, type, saddr, daddr, skb->dev->ifindex);
+		icmpv6_flow_init(sk, &fl6, type, saddr, daddr, oif);
+		if (oif != skb->dev->ifindex)
+			fl6.flowi6_flags |= FLOWI_FLAG_L3MDEV_SRC;
 		dst = icmp6_dst_alloc(skb->dev, &fl6);
 		if (IS_ERR(dst)) {
 			kfree_skb(skb);
@@ -766,7 +771,7 @@ static void ndisc_recv_ns(struct sk_buff *skb)

 	ifp = ipv6_get_ifaddr(dev_net(dev), &msg->target, dev, 1);
 	if (ifp) {
-
+have_ifp:
 		if (ifp->flags & (IFA_F_TENTATIVE|IFA_F_OPTIMISTIC)) {
 			if (dad) {
 				/*
@@ -792,6 +797,18 @@ static void ndisc_recv_ns(struct sk_buff *skb)
 	} else {
 		struct net *net = dev_net(dev);

+		/* perhaps an address on the master device */
+		if (netif_is_l3_slave(dev)) {
+			struct net_device *mdev;
+
+			mdev = netdev_master_upper_dev_get_rcu(dev);
+			if (mdev) {
+				ifp = ipv6_get_ifaddr(net, &msg->target, mdev, 1);
+				if (ifp)
+					goto have_ifp;
+			}
+		}
+
 		idev = in6_dev_get(dev);
 		if (!idev) {
 			/* XXX: count this drop? */
@@ -1483,6 +1500,7 @@ void ndisc_send_redirect(struct sk_buff *skb, const struct in6_addr *target)
 	struct flowi6 fl6;
 	int rd_len;
 	u8 ha_buf[MAX_ADDR_LEN], *ha = NULL;
+	int oif = l3mdev_fib_oif(dev);
 	bool ret;

 	if (ipv6_get_lladdr(dev, &saddr_buf, IFA_F_TENTATIVE)) {
@@ -1499,7 +1517,10 @@ void ndisc_send_redirect(struct sk_buff *skb, const struct in6_addr *target)
 	}

 	icmpv6_flow_init(sk, &fl6, NDISC_REDIRECT,
-			 &saddr_buf, &ipv6_hdr(skb)->saddr, dev->ifindex);
+			 &saddr_buf, &ipv6_hdr(skb)->saddr, oif);
+
+	if (oif != skb->dev->ifindex)
+		fl6.flowi6_flags |= FLOWI_FLAG_L3MDEV_SRC;

 	dst = ip6_route_output(net, NULL, &fl6);
 	if (dst->error) {

--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -61,6 +61,7 @@
 #include <net/nexthop.h>
 #include <net/lwtunnel.h>
 #include <net/ip_tunnels.h>
+#include <net/l3mdev.h>

 #include <asm/uaccess.h>

@@ -1044,6 +1045,9 @@ static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
 	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
 	saved_fn = fn;

+	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
+		oif = 0;
+
 redo_rt6_select:
 	rt = rt6_select(fn, oif, strict);
 	if (rt->rt6i_nsiblings)
@@ -1141,7 +1145,7 @@ void ip6_route_input(struct sk_buff *skb)
 	int flags = RT6_LOOKUP_F_HAS_SADDR;
 	struct ip_tunnel_info *tun_info;
 	struct flowi6 fl6 = {
-		.flowi6_iif = skb->dev->ifindex,
+		.flowi6_iif = l3mdev_fib_oif(skb->dev),
 		.daddr = iph->daddr,
 		.saddr = iph->saddr,
 		.flowlabel = ip6_flowinfo(iph),
@@ -1165,8 +1169,13 @@ static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table
 struct dst_entry *ip6_route_output(struct net *net, const struct sock *sk,
 				    struct flowi6 *fl6)
 {
+	struct dst_entry *dst;
 	int flags = 0;

+	dst = l3mdev_rt6_dst_by_oif(net, fl6);
+	if (dst)
+		return dst;
+
 	fl6->flowi6_iif = LOOPBACK_IFINDEX;

 	if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
@@ -2263,7 +2272,6 @@ static struct rt6_info *rt6_add_route_info(struct net *net,
 					   unsigned int pref)
 {
 	struct fib6_config cfg = {
-		.fc_table	= RT6_TABLE_INFO,
 		.fc_metric	= IP6_RT_PRIO_USER,
 		.fc_ifindex	= ifindex,
 		.fc_dst_len	= prefixlen,
@@ -2274,6 +2282,7 @@ static struct rt6_info *rt6_add_route_info(struct net *net,
 		.fc_nlinfo.nl_net = net,
 	};

+	cfg.fc_table = l3mdev_fib_table_by_index(net, ifindex) ? : RT6_TABLE_INFO;
 	cfg.fc_dst = *prefix;
 	cfg.fc_gateway = *gwaddr;

@@ -2314,7 +2323,7 @@ struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
 				     unsigned int pref)
 {
 	struct fib6_config cfg = {
-		.fc_table	= RT6_TABLE_DFLT,
+		.fc_table	= l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
 		.fc_metric	= IP6_RT_PRIO_USER,
 		.fc_ifindex	= dev->ifindex,
 		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
@@ -2361,7 +2370,8 @@ static void rtmsg_to_fib6_config(struct net *net,
 {
 	memset(cfg, 0, sizeof(*cfg));

-	cfg->fc_table = RT6_TABLE_MAIN;
+	cfg->fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
+			 : RT6_TABLE_MAIN;
 	cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
 	cfg->fc_metric = rtmsg->rtmsg_metric;
 	cfg->fc_expires = rtmsg->rtmsg_info;
@@ -2470,6 +2480,7 @@ struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
 				    const struct in6_addr *addr,
 				    bool anycast)
 {
+	u32 tb_id;
 	struct net *net = dev_net(idev->dev);
 	struct rt6_info *rt = ip6_dst_alloc(net, net->loopback_dev,
 					    DST_NOCOUNT);
@@ -2492,7 +2503,8 @@ struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
 	rt->rt6i_gateway  = *addr;
 	rt->rt6i_dst.addr = *addr;
 	rt->rt6i_dst.plen = 128;
-	rt->rt6i_table = fib6_get_table(net, RT6_TABLE_LOCAL);
+	tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL;
+	rt->rt6i_table = fib6_get_table(net, tb_id);
 	rt->dst.flags |= DST_NOCACHE;

 	atomic_set(&rt->dst.__refcnt, 1);
@@ -3254,6 +3266,11 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
 	} else {
 		fl6.flowi6_oif = oif;

+		if (netif_index_is_l3_master(net, oif)) {
+			fl6.flowi6_flags = FLOWI_FLAG_L3MDEV_SRC |
+					   FLOWI_FLAG_SKIP_NH_OIF;
+		}
+
 		rt = (struct rt6_info *)ip6_route_output(net, NULL, &fl6);
 	}