diff --git a/Documentation/networking/ipvlan.txt b/Documentation/networking/ipvlan.txt index 14422f8fcdc474f5f32cde7868803cf67fbb0805..24196cef7c9104dc0aebe48b03bf91495f7f1176 100644 --- a/Documentation/networking/ipvlan.txt +++ b/Documentation/networking/ipvlan.txt @@ -22,7 +22,7 @@ The driver can be built into the kernel (CONFIG_IPVLAN=y) or as a module There are no module parameters for this driver and it can be configured using IProute2/ip utility. - ip link add link type ipvlan mode { l2 | L3 } + ip link add link type ipvlan mode { l2 | l3 | l3s } e.g. ip link add link ipvl0 eth0 type ipvlan mode l2 @@ -48,6 +48,11 @@ master device for the L2 processing and routing from that instance will be used before packets are queued on the outbound device. In this mode the slaves will not receive nor can send multicast / broadcast traffic. +4.3 L3S mode: + This is very similar to the L3 mode except that iptables (conn-tracking) +works in this mode and hence it is L3-symmetric (L3s). This will have slightly less +performance but that shouldn't matter since you are choosing this mode over plain-L3 +mode to make conn-tracking work. 5. What to choose (macvlan vs. ipvlan)? These two devices are very similar in many regards and the specific use diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig index 0c5415b05ea945f39da2482ca8b34fad91ff3c31..8768a625350df32f984cbb8a5026ab3c7f5d871a 100644 --- a/drivers/net/Kconfig +++ b/drivers/net/Kconfig @@ -149,6 +149,7 @@ config IPVLAN tristate "IP-VLAN support" depends on INET depends on IPV6 + depends on NET_L3_MASTER_DEV ---help--- This allows one to create virtual devices off of a main interface and packets will be delivered based on the dest L3 (IPv6/IPv4 addr) diff --git a/drivers/net/ipvlan/ipvlan.h b/drivers/net/ipvlan/ipvlan.h index 695a5dc9ace36c01d50233f6bd5e62e42581d39a..7e0732f5ea0747dd4df3d53b7c92f3864f6cd0cb 100644 --- a/drivers/net/ipvlan/ipvlan.h +++ b/drivers/net/ipvlan/ipvlan.h @@ -23,11 +23,13 @@ #include #include #include +#include #include #include #include #include #include +#include #define IPVLAN_DRV "ipvlan" #define IPV_DRV_VER "0.1" @@ -124,4 +126,8 @@ struct ipvl_addr *ipvlan_find_addr(const struct ipvl_dev *ipvlan, const void *iaddr, bool is_v6); bool ipvlan_addr_busy(struct ipvl_port *port, void *iaddr, bool is_v6); void ipvlan_ht_addr_del(struct ipvl_addr *addr); +struct sk_buff *ipvlan_l3_rcv(struct net_device *dev, struct sk_buff *skb, + u16 proto); +unsigned int ipvlan_nf_input(void *priv, struct sk_buff *skb, + const struct nf_hook_state *state); #endif /* __IPVLAN_H */ diff --git a/drivers/net/ipvlan/ipvlan_core.c b/drivers/net/ipvlan/ipvlan_core.c index b5f9511d819efd3c4464cab22d9bfdc2a94269c4..b4e990743e1da0cb3a946f5473d02cce7447bd1a 100644 --- a/drivers/net/ipvlan/ipvlan_core.c +++ b/drivers/net/ipvlan/ipvlan_core.c @@ -560,6 +560,7 @@ int ipvlan_queue_xmit(struct sk_buff *skb, struct net_device *dev) case IPVLAN_MODE_L2: return ipvlan_xmit_mode_l2(skb, dev); case IPVLAN_MODE_L3: + case IPVLAN_MODE_L3S: return ipvlan_xmit_mode_l3(skb, dev); } @@ -664,6 +665,8 @@ rx_handler_result_t ipvlan_handle_frame(struct sk_buff **pskb) return ipvlan_handle_mode_l2(pskb, port); case IPVLAN_MODE_L3: return ipvlan_handle_mode_l3(pskb, port); + case IPVLAN_MODE_L3S: + return RX_HANDLER_PASS; } /* Should not reach here */ @@ -672,3 +675,94 @@ rx_handler_result_t ipvlan_handle_frame(struct sk_buff **pskb) kfree_skb(skb); return RX_HANDLER_CONSUMED; } + +static struct ipvl_addr *ipvlan_skb_to_addr(struct sk_buff *skb, + struct net_device *dev) +{ + struct ipvl_addr *addr = NULL; + struct ipvl_port *port; + void *lyr3h; + int addr_type; + + if (!dev || !netif_is_ipvlan_port(dev)) + goto out; + + port = ipvlan_port_get_rcu(dev); + if (!port || port->mode != IPVLAN_MODE_L3S) + goto out; + + lyr3h = ipvlan_get_L3_hdr(skb, &addr_type); + if (!lyr3h) + goto out; + + addr = ipvlan_addr_lookup(port, lyr3h, addr_type, true); +out: + return addr; +} + +struct sk_buff *ipvlan_l3_rcv(struct net_device *dev, struct sk_buff *skb, + u16 proto) +{ + struct ipvl_addr *addr; + struct net_device *sdev; + + addr = ipvlan_skb_to_addr(skb, dev); + if (!addr) + goto out; + + sdev = addr->master->dev; + switch (proto) { + case AF_INET: + { + int err; + struct iphdr *ip4h = ip_hdr(skb); + + err = ip_route_input_noref(skb, ip4h->daddr, ip4h->saddr, + ip4h->tos, sdev); + if (unlikely(err)) + goto out; + break; + } + case AF_INET6: + { + struct dst_entry *dst; + struct ipv6hdr *ip6h = ipv6_hdr(skb); + int flags = RT6_LOOKUP_F_HAS_SADDR; + struct flowi6 fl6 = { + .flowi6_iif = sdev->ifindex, + .daddr = ip6h->daddr, + .saddr = ip6h->saddr, + .flowlabel = ip6_flowinfo(ip6h), + .flowi6_mark = skb->mark, + .flowi6_proto = ip6h->nexthdr, + }; + + skb_dst_drop(skb); + dst = ip6_route_input_lookup(dev_net(sdev), sdev, &fl6, flags); + skb_dst_set(skb, dst); + break; + } + default: + break; + } + +out: + return skb; +} + +unsigned int ipvlan_nf_input(void *priv, struct sk_buff *skb, + const struct nf_hook_state *state) +{ + struct ipvl_addr *addr; + unsigned int len; + + addr = ipvlan_skb_to_addr(skb, skb->dev); + if (!addr) + goto out; + + skb->dev = addr->master->dev; + len = skb->len + ETH_HLEN; + ipvlan_count_rx(addr->master, len, true, false); +out: + return NF_ACCEPT; +} diff --git a/drivers/net/ipvlan/ipvlan_main.c b/drivers/net/ipvlan/ipvlan_main.c index 18b4e8c7f68aa1824838f583f6d03fd19705f8a1..f442eb366863e034055c8cf773c9b4b0e287ae87 100644 --- a/drivers/net/ipvlan/ipvlan_main.c +++ b/drivers/net/ipvlan/ipvlan_main.c @@ -9,24 +9,87 @@ #include "ipvlan.h" +static u32 ipvl_nf_hook_refcnt = 0; + +static struct nf_hook_ops ipvl_nfops[] __read_mostly = { + { + .hook = ipvlan_nf_input, + .pf = NFPROTO_IPV4, + .hooknum = NF_INET_LOCAL_IN, + .priority = INT_MAX, + }, + { + .hook = ipvlan_nf_input, + .pf = NFPROTO_IPV6, + .hooknum = NF_INET_LOCAL_IN, + .priority = INT_MAX, + }, +}; + +static struct l3mdev_ops ipvl_l3mdev_ops __read_mostly = { + .l3mdev_l3_rcv = ipvlan_l3_rcv, +}; + static void ipvlan_adjust_mtu(struct ipvl_dev *ipvlan, struct net_device *dev) { ipvlan->dev->mtu = dev->mtu - ipvlan->mtu_adj; } -static void ipvlan_set_port_mode(struct ipvl_port *port, u16 nval) +static int ipvlan_register_nf_hook(void) +{ + int err = 0; + + if (!ipvl_nf_hook_refcnt) { + err = _nf_register_hooks(ipvl_nfops, ARRAY_SIZE(ipvl_nfops)); + if (!err) + ipvl_nf_hook_refcnt = 1; + } else { + ipvl_nf_hook_refcnt++; + } + + return err; +} + +static void ipvlan_unregister_nf_hook(void) +{ + WARN_ON(!ipvl_nf_hook_refcnt); + + ipvl_nf_hook_refcnt--; + if (!ipvl_nf_hook_refcnt) + _nf_unregister_hooks(ipvl_nfops, ARRAY_SIZE(ipvl_nfops)); +} + +static int ipvlan_set_port_mode(struct ipvl_port *port, u16 nval) { struct ipvl_dev *ipvlan; + struct net_device *mdev = port->dev; + int err = 0; + ASSERT_RTNL(); if (port->mode != nval) { + if (nval == IPVLAN_MODE_L3S) { + /* New mode is L3S */ + err = ipvlan_register_nf_hook(); + if (!err) { + mdev->l3mdev_ops = &ipvl_l3mdev_ops; + mdev->priv_flags |= IFF_L3MDEV_MASTER; + } else + return err; + } else if (port->mode == IPVLAN_MODE_L3S) { + /* Old mode was L3S */ + mdev->priv_flags &= ~IFF_L3MDEV_MASTER; + ipvlan_unregister_nf_hook(); + mdev->l3mdev_ops = NULL; + } list_for_each_entry(ipvlan, &port->ipvlans, pnode) { - if (nval == IPVLAN_MODE_L3) + if (nval == IPVLAN_MODE_L3 || nval == IPVLAN_MODE_L3S) ipvlan->dev->flags |= IFF_NOARP; else ipvlan->dev->flags &= ~IFF_NOARP; } port->mode = nval; } + return err; } static int ipvlan_port_create(struct net_device *dev) @@ -74,6 +137,11 @@ static void ipvlan_port_destroy(struct net_device *dev) struct ipvl_port *port = ipvlan_port_get_rtnl(dev); dev->priv_flags &= ~IFF_IPVLAN_MASTER; + if (port->mode == IPVLAN_MODE_L3S) { + dev->priv_flags &= ~IFF_L3MDEV_MASTER; + ipvlan_unregister_nf_hook(); + dev->l3mdev_ops = NULL; + } netdev_rx_handler_unregister(dev); cancel_work_sync(&port->wq); __skb_queue_purge(&port->backlog); @@ -132,7 +200,8 @@ static int ipvlan_open(struct net_device *dev) struct net_device *phy_dev = ipvlan->phy_dev; struct ipvl_addr *addr; - if (ipvlan->port->mode == IPVLAN_MODE_L3) + if (ipvlan->port->mode == IPVLAN_MODE_L3 || + ipvlan->port->mode == IPVLAN_MODE_L3S) dev->flags |= IFF_NOARP; else dev->flags &= ~IFF_NOARP; @@ -372,13 +441,14 @@ static int ipvlan_nl_changelink(struct net_device *dev, { struct ipvl_dev *ipvlan = netdev_priv(dev); struct ipvl_port *port = ipvlan_port_get_rtnl(ipvlan->phy_dev); + int err = 0; if (data && data[IFLA_IPVLAN_MODE]) { u16 nmode = nla_get_u16(data[IFLA_IPVLAN_MODE]); - ipvlan_set_port_mode(port, nmode); + err = ipvlan_set_port_mode(port, nmode); } - return 0; + return err; } static size_t ipvlan_nl_getsize(const struct net_device *dev) @@ -473,10 +543,13 @@ static int ipvlan_link_new(struct net *src_net, struct net_device *dev, unregister_netdevice(dev); return err; } + err = ipvlan_set_port_mode(port, mode); + if (err) { + unregister_netdevice(dev); + return err; + } list_add_tail_rcu(&ipvlan->pnode, &port->ipvlans); - ipvlan_set_port_mode(port, mode); - netif_stacked_transfer_operstate(phy_dev, dev); return 0; } diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h index 9230f9aee896facb508fcf23df9e48e842a976c9..e82b76781bf65fe503c8493621bd094e5af1381c 100644 --- a/include/linux/netfilter.h +++ b/include/linux/netfilter.h @@ -133,6 +133,8 @@ int nf_register_hook(struct nf_hook_ops *reg); void nf_unregister_hook(struct nf_hook_ops *reg); int nf_register_hooks(struct nf_hook_ops *reg, unsigned int n); void nf_unregister_hooks(struct nf_hook_ops *reg, unsigned int n); +int _nf_register_hooks(struct nf_hook_ops *reg, unsigned int n); +void _nf_unregister_hooks(struct nf_hook_ops *reg, unsigned int n); /* Functions to register get/setsockopt ranges (non-inclusive). You need to check permissions yourself! */ diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index d97305d0e71f1a4b0b175c50fba2aabadc5c12f3..e0cd318d5103fb9a9a7be9ddfda40f8696925fcc 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -64,6 +64,9 @@ static inline bool rt6_need_strict(const struct in6_addr *daddr) } void ip6_route_input(struct sk_buff *skb); +struct dst_entry *ip6_route_input_lookup(struct net *net, + struct net_device *dev, + struct flowi6 *fl6, int flags); struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk, struct flowi6 *fl6, int flags); diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 2351776a724f91fba17868eabf390541238756a9..7ec9e99d5491e9a9fe710ef631bc14b1fe4b2756 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -464,6 +464,7 @@ enum { enum ipvlan_mode { IPVLAN_MODE_L2 = 0, IPVLAN_MODE_L3, + IPVLAN_MODE_L3S, IPVLAN_MODE_MAX }; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index ad4a7ff301fccf8f194a2bb8a64d8ba27efe937b..4dab585f7642658f62a8a4dbe356bc6a4cf926cb 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1147,15 +1147,16 @@ static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table * return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags); } -static struct dst_entry *ip6_route_input_lookup(struct net *net, - struct net_device *dev, - struct flowi6 *fl6, int flags) +struct dst_entry *ip6_route_input_lookup(struct net *net, + struct net_device *dev, + struct flowi6 *fl6, int flags) { if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG) flags |= RT6_LOOKUP_F_IFACE; return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_input); } +EXPORT_SYMBOL_GPL(ip6_route_input_lookup); void ip6_route_input(struct sk_buff *skb) { diff --git a/net/netfilter/core.c b/net/netfilter/core.c index f39276d1c2d76788dfc018f2ebdc07d2855f977d..2c5327e43a88a3906e8f9653ebcb72e1882cb7f9 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -188,19 +188,17 @@ EXPORT_SYMBOL(nf_unregister_net_hooks); static LIST_HEAD(nf_hook_list); -int nf_register_hook(struct nf_hook_ops *reg) +static int _nf_register_hook(struct nf_hook_ops *reg) { struct net *net, *last; int ret; - rtnl_lock(); for_each_net(net) { ret = nf_register_net_hook(net, reg); if (ret && ret != -ENOENT) goto rollback; } list_add_tail(®->list, &nf_hook_list); - rtnl_unlock(); return 0; rollback: @@ -210,19 +208,34 @@ int nf_register_hook(struct nf_hook_ops *reg) break; nf_unregister_net_hook(net, reg); } + return ret; +} + +int nf_register_hook(struct nf_hook_ops *reg) +{ + int ret; + + rtnl_lock(); + ret = _nf_register_hook(reg); rtnl_unlock(); + return ret; } EXPORT_SYMBOL(nf_register_hook); -void nf_unregister_hook(struct nf_hook_ops *reg) +static void _nf_unregister_hook(struct nf_hook_ops *reg) { struct net *net; - rtnl_lock(); list_del(®->list); for_each_net(net) nf_unregister_net_hook(net, reg); +} + +void nf_unregister_hook(struct nf_hook_ops *reg) +{ + rtnl_lock(); + _nf_unregister_hook(reg); rtnl_unlock(); } EXPORT_SYMBOL(nf_unregister_hook); @@ -246,6 +259,26 @@ int nf_register_hooks(struct nf_hook_ops *reg, unsigned int n) } EXPORT_SYMBOL(nf_register_hooks); +/* Caller MUST take rtnl_lock() */ +int _nf_register_hooks(struct nf_hook_ops *reg, unsigned int n) +{ + unsigned int i; + int err = 0; + + for (i = 0; i < n; i++) { + err = _nf_register_hook(®[i]); + if (err) + goto err; + } + return err; + +err: + if (i > 0) + _nf_unregister_hooks(reg, i); + return err; +} +EXPORT_SYMBOL(_nf_register_hooks); + void nf_unregister_hooks(struct nf_hook_ops *reg, unsigned int n) { while (n-- > 0) @@ -253,6 +286,14 @@ void nf_unregister_hooks(struct nf_hook_ops *reg, unsigned int n) } EXPORT_SYMBOL(nf_unregister_hooks); +/* Caller MUST take rtnl_lock */ +void _nf_unregister_hooks(struct nf_hook_ops *reg, unsigned int n) +{ + while (n-- > 0) + _nf_unregister_hook(®[n]); +} +EXPORT_SYMBOL(_nf_unregister_hooks); + unsigned int nf_iterate(struct list_head *head, struct sk_buff *skb, struct nf_hook_state *state,