diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index 1c0fa364323e280f2ed25c86bdcb8f3508a26c34..51cccddfe403e55214ac5c1809bfdac2cef72f53 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -1192,6 +1192,45 @@ static void vxlan_parse_gbp_hdr(struct vxlanhdr *unparsed, unparsed->vx_flags &= ~VXLAN_GBP_USED_BITS; } +static bool vxlan_parse_gpe_hdr(struct vxlanhdr *unparsed, + __be32 *protocol, + struct sk_buff *skb, u32 vxflags) +{ + struct vxlanhdr_gpe *gpe = (struct vxlanhdr_gpe *)unparsed; + + /* Need to have Next Protocol set for interfaces in GPE mode. */ + if (!gpe->np_applied) + return false; + /* "The initial version is 0. If a receiver does not support the + * version indicated it MUST drop the packet. + */ + if (gpe->version != 0) + return false; + /* "When the O bit is set to 1, the packet is an OAM packet and OAM + * processing MUST occur." However, we don't implement OAM + * processing, thus drop the packet. + */ + if (gpe->oam_flag) + return false; + + switch (gpe->next_protocol) { + case VXLAN_GPE_NP_IPV4: + *protocol = htons(ETH_P_IP); + break; + case VXLAN_GPE_NP_IPV6: + *protocol = htons(ETH_P_IPV6); + break; + case VXLAN_GPE_NP_ETHERNET: + *protocol = htons(ETH_P_TEB); + break; + default: + return false; + } + + unparsed->vx_flags &= ~VXLAN_GPE_USED_BITS; + return true; +} + static bool vxlan_set_mac(struct vxlan_dev *vxlan, struct vxlan_sock *vs, struct sk_buff *skb) @@ -1257,9 +1296,11 @@ static int vxlan_rcv(struct sock *sk, struct sk_buff *skb) struct vxlanhdr unparsed; struct vxlan_metadata _md; struct vxlan_metadata *md = &_md; + __be32 protocol = htons(ETH_P_TEB); + bool raw_proto = false; void *oiph; - /* Need Vxlan and inner Ethernet header to be present */ + /* Need UDP and VXLAN header to be present */ if (!pskb_may_pull(skb, VXLAN_HLEN)) return 1; @@ -1283,9 +1324,18 @@ static int vxlan_rcv(struct sock *sk, struct sk_buff *skb) if (!vxlan) goto drop; - if (iptunnel_pull_header(skb, VXLAN_HLEN, htons(ETH_P_TEB), - !net_eq(vxlan->net, dev_net(vxlan->dev)))) - goto drop; + /* For backwards compatibility, only allow reserved fields to be + * used by VXLAN extensions if explicitly requested. + */ + if (vs->flags & VXLAN_F_GPE) { + if (!vxlan_parse_gpe_hdr(&unparsed, &protocol, skb, vs->flags)) + goto drop; + raw_proto = true; + } + + if (__iptunnel_pull_header(skb, VXLAN_HLEN, protocol, raw_proto, + !net_eq(vxlan->net, dev_net(vxlan->dev)))) + goto drop; if (vxlan_collect_metadata(vs)) { __be32 vni = vxlan_vni(vxlan_hdr(skb)->vx_vni); @@ -1304,14 +1354,14 @@ static int vxlan_rcv(struct sock *sk, struct sk_buff *skb) memset(md, 0, sizeof(*md)); } - /* For backwards compatibility, only allow reserved fields to be - * used by VXLAN extensions if explicitly requested. - */ if (vs->flags & VXLAN_F_REMCSUM_RX) if (!vxlan_remcsum(&unparsed, skb, vs->flags)) goto drop; if (vs->flags & VXLAN_F_GBP) vxlan_parse_gbp_hdr(&unparsed, skb, vs->flags, md); + /* Note that GBP and GPE can never be active together. This is + * ensured in vxlan_dev_configure. + */ if (unparsed.vx_flags || unparsed.vx_vni) { /* If there are any unprocessed flags remaining treat @@ -1325,8 +1375,13 @@ static int vxlan_rcv(struct sock *sk, struct sk_buff *skb) goto drop; } - if (!vxlan_set_mac(vxlan, vs, skb)) - goto drop; + if (!raw_proto) { + if (!vxlan_set_mac(vxlan, vs, skb)) + goto drop; + } else { + skb->dev = vxlan->dev; + skb->pkt_type = PACKET_HOST; + } oiph = skb_network_header(skb); skb_reset_network_header(skb); @@ -1685,6 +1740,27 @@ static void vxlan_build_gbp_hdr(struct vxlanhdr *vxh, u32 vxflags, gbp->policy_id = htons(md->gbp & VXLAN_GBP_ID_MASK); } +static int vxlan_build_gpe_hdr(struct vxlanhdr *vxh, u32 vxflags, + __be16 protocol) +{ + struct vxlanhdr_gpe *gpe = (struct vxlanhdr_gpe *)vxh; + + gpe->np_applied = 1; + + switch (protocol) { + case htons(ETH_P_IP): + gpe->next_protocol = VXLAN_GPE_NP_IPV4; + return 0; + case htons(ETH_P_IPV6): + gpe->next_protocol = VXLAN_GPE_NP_IPV6; + return 0; + case htons(ETH_P_TEB): + gpe->next_protocol = VXLAN_GPE_NP_ETHERNET; + return 0; + } + return -EPFNOSUPPORT; +} + static int vxlan_build_skb(struct sk_buff *skb, struct dst_entry *dst, int iphdr_len, __be32 vni, struct vxlan_metadata *md, u32 vxflags, @@ -1694,6 +1770,7 @@ static int vxlan_build_skb(struct sk_buff *skb, struct dst_entry *dst, int min_headroom; int err; int type = udp_sum ? SKB_GSO_UDP_TUNNEL_CSUM : SKB_GSO_UDP_TUNNEL; + __be16 inner_protocol = htons(ETH_P_TEB); if ((vxflags & VXLAN_F_REMCSUM_TX) && skb->ip_summed == CHECKSUM_PARTIAL) { @@ -1712,10 +1789,8 @@ static int vxlan_build_skb(struct sk_buff *skb, struct dst_entry *dst, /* Need space for new headers (invalidates iph ptr) */ err = skb_cow_head(skb, min_headroom); - if (unlikely(err)) { - kfree_skb(skb); - return err; - } + if (unlikely(err)) + goto out_free; skb = vlan_hwaccel_push_inside(skb); if (WARN_ON(!skb)) @@ -1744,9 +1819,19 @@ static int vxlan_build_skb(struct sk_buff *skb, struct dst_entry *dst, if (vxflags & VXLAN_F_GBP) vxlan_build_gbp_hdr(vxh, vxflags, md); + if (vxflags & VXLAN_F_GPE) { + err = vxlan_build_gpe_hdr(vxh, vxflags, skb->protocol); + if (err < 0) + goto out_free; + inner_protocol = skb->protocol; + } - skb_set_inner_protocol(skb, htons(ETH_P_TEB)); + skb_set_inner_protocol(skb, inner_protocol); return 0; + +out_free: + kfree_skb(skb); + return err; } static struct rtable *vxlan_get_route(struct vxlan_dev *vxlan, @@ -2106,9 +2191,17 @@ static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev) info = skb_tunnel_info(skb); skb_reset_mac_header(skb); - eth = eth_hdr(skb); - if ((vxlan->flags & VXLAN_F_PROXY)) { + if (vxlan->flags & VXLAN_F_COLLECT_METADATA) { + if (info && info->mode & IP_TUNNEL_INFO_TX) + vxlan_xmit_one(skb, dev, NULL, false); + else + kfree_skb(skb); + return NETDEV_TX_OK; + } + + if (vxlan->flags & VXLAN_F_PROXY) { + eth = eth_hdr(skb); if (ntohs(eth->h_proto) == ETH_P_ARP) return arp_reduce(dev, skb); #if IS_ENABLED(CONFIG_IPV6) @@ -2123,18 +2216,10 @@ static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev) msg->icmph.icmp6_type == NDISC_NEIGHBOUR_SOLICITATION) return neigh_reduce(dev, skb); } - eth = eth_hdr(skb); #endif } - if (vxlan->flags & VXLAN_F_COLLECT_METADATA) { - if (info && info->mode & IP_TUNNEL_INFO_TX) - vxlan_xmit_one(skb, dev, NULL, false); - else - kfree_skb(skb); - return NETDEV_TX_OK; - } - + eth = eth_hdr(skb); f = vxlan_find_mac(vxlan, eth->h_dest); did_rsc = false; @@ -2404,7 +2489,7 @@ static int vxlan_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb) return 0; } -static const struct net_device_ops vxlan_netdev_ops = { +static const struct net_device_ops vxlan_netdev_ether_ops = { .ndo_init = vxlan_init, .ndo_uninit = vxlan_uninit, .ndo_open = vxlan_open, @@ -2421,6 +2506,17 @@ static const struct net_device_ops vxlan_netdev_ops = { .ndo_fill_metadata_dst = vxlan_fill_metadata_dst, }; +static const struct net_device_ops vxlan_netdev_raw_ops = { + .ndo_init = vxlan_init, + .ndo_uninit = vxlan_uninit, + .ndo_open = vxlan_open, + .ndo_stop = vxlan_stop, + .ndo_start_xmit = vxlan_xmit, + .ndo_get_stats64 = ip_tunnel_get_stats64, + .ndo_change_mtu = vxlan_change_mtu, + .ndo_fill_metadata_dst = vxlan_fill_metadata_dst, +}; + /* Info for udev, that this is a virtual tunnel endpoint */ static struct device_type vxlan_type = { .name = "vxlan", @@ -2458,10 +2554,6 @@ static void vxlan_setup(struct net_device *dev) struct vxlan_dev *vxlan = netdev_priv(dev); unsigned int h; - eth_hw_addr_random(dev); - ether_setup(dev); - - dev->netdev_ops = &vxlan_netdev_ops; dev->destructor = free_netdev; SET_NETDEV_DEVTYPE(dev, &vxlan_type); @@ -2476,8 +2568,7 @@ static void vxlan_setup(struct net_device *dev) dev->hw_features |= NETIF_F_GSO_SOFTWARE; dev->hw_features |= NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_STAG_TX; netif_keep_dst(dev); - dev->priv_flags &= ~IFF_TX_SKB_SHARING; - dev->priv_flags |= IFF_LIVE_ADDR_CHANGE | IFF_NO_QUEUE; + dev->priv_flags |= IFF_NO_QUEUE; INIT_LIST_HEAD(&vxlan->next); spin_lock_init(&vxlan->hash_lock); @@ -2496,6 +2587,26 @@ static void vxlan_setup(struct net_device *dev) INIT_HLIST_HEAD(&vxlan->fdb_head[h]); } +static void vxlan_ether_setup(struct net_device *dev) +{ + eth_hw_addr_random(dev); + ether_setup(dev); + dev->priv_flags &= ~IFF_TX_SKB_SHARING; + dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; + dev->netdev_ops = &vxlan_netdev_ether_ops; +} + +static void vxlan_raw_setup(struct net_device *dev) +{ + dev->type = ARPHRD_NONE; + dev->hard_header_len = 0; + dev->addr_len = 0; + dev->mtu = ETH_DATA_LEN; + dev->tx_queue_len = 1000; + dev->flags = IFF_POINTOPOINT | IFF_NOARP | IFF_MULTICAST; + dev->netdev_ops = &vxlan_netdev_raw_ops; +} + static const struct nla_policy vxlan_policy[IFLA_VXLAN_MAX + 1] = { [IFLA_VXLAN_ID] = { .type = NLA_U32 }, [IFLA_VXLAN_GROUP] = { .len = FIELD_SIZEOF(struct iphdr, daddr) }, @@ -2522,6 +2633,7 @@ static const struct nla_policy vxlan_policy[IFLA_VXLAN_MAX + 1] = { [IFLA_VXLAN_REMCSUM_TX] = { .type = NLA_U8 }, [IFLA_VXLAN_REMCSUM_RX] = { .type = NLA_U8 }, [IFLA_VXLAN_GBP] = { .type = NLA_FLAG, }, + [IFLA_VXLAN_GPE] = { .type = NLA_FLAG, }, [IFLA_VXLAN_REMCSUM_NOPARTIAL] = { .type = NLA_FLAG }, }; @@ -2722,6 +2834,21 @@ static int vxlan_dev_configure(struct net *src_net, struct net_device *dev, __be16 default_port = vxlan->cfg.dst_port; struct net_device *lowerdev = NULL; + if (conf->flags & VXLAN_F_GPE) { + if (conf->flags & ~VXLAN_F_ALLOWED_GPE) + return -EINVAL; + /* For now, allow GPE only together with COLLECT_METADATA. + * This can be relaxed later; in such case, the other side + * of the PtP link will have to be provided. + */ + if (!(conf->flags & VXLAN_F_COLLECT_METADATA)) + return -EINVAL; + + vxlan_raw_setup(dev); + } else { + vxlan_ether_setup(dev); + } + vxlan->net = src_net; dst->remote_vni = conf->vni; @@ -2783,8 +2910,12 @@ static int vxlan_dev_configure(struct net *src_net, struct net_device *dev, dev->needed_headroom = needed_headroom; memcpy(&vxlan->cfg, conf, sizeof(*conf)); - if (!vxlan->cfg.dst_port) - vxlan->cfg.dst_port = default_port; + if (!vxlan->cfg.dst_port) { + if (conf->flags & VXLAN_F_GPE) + vxlan->cfg.dst_port = 4790; /* IANA assigned VXLAN-GPE port */ + else + vxlan->cfg.dst_port = default_port; + } vxlan->flags |= conf->flags; if (!vxlan->cfg.age_interval) @@ -2955,6 +3086,9 @@ static int vxlan_newlink(struct net *src_net, struct net_device *dev, if (data[IFLA_VXLAN_GBP]) conf.flags |= VXLAN_F_GBP; + if (data[IFLA_VXLAN_GPE]) + conf.flags |= VXLAN_F_GPE; + if (data[IFLA_VXLAN_REMCSUM_NOPARTIAL]) conf.flags |= VXLAN_F_REMCSUM_NOPARTIAL; @@ -2971,6 +3105,10 @@ static int vxlan_newlink(struct net *src_net, struct net_device *dev, case -EEXIST: pr_info("duplicate VNI %u\n", be32_to_cpu(conf.vni)); break; + + case -EINVAL: + pr_info("unsupported combination of extensions\n"); + break; } return err; @@ -3098,6 +3236,10 @@ static int vxlan_fill_info(struct sk_buff *skb, const struct net_device *dev) nla_put_flag(skb, IFLA_VXLAN_GBP)) goto nla_put_failure; + if (vxlan->flags & VXLAN_F_GPE && + nla_put_flag(skb, IFLA_VXLAN_GPE)) + goto nla_put_failure; + if (vxlan->flags & VXLAN_F_REMCSUM_NOPARTIAL && nla_put_flag(skb, IFLA_VXLAN_REMCSUM_NOPARTIAL)) goto nla_put_failure; diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index 56050f913339e075f1c22c29bdf5c1975900baed..16435d8b1f935180d42dad87e962f64575de1b48 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -295,8 +295,15 @@ static inline u8 ip_tunnel_ecn_encap(u8 tos, const struct iphdr *iph, return INET_ECN_encapsulate(tos, inner); } -int iptunnel_pull_header(struct sk_buff *skb, int hdr_len, __be16 inner_proto, - bool xnet); +int __iptunnel_pull_header(struct sk_buff *skb, int hdr_len, + __be16 inner_proto, bool raw_proto, bool xnet); + +static inline int iptunnel_pull_header(struct sk_buff *skb, int hdr_len, + __be16 inner_proto, bool xnet) +{ + return __iptunnel_pull_header(skb, hdr_len, inner_proto, false, xnet); +} + void iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb, __be32 src, __be32 dst, u8 proto, u8 tos, u8 ttl, __be16 df, bool xnet); diff --git a/include/net/vxlan.h b/include/net/vxlan.h index 73ed2e951c020d28c21457d907c55764ea7b3d88..dcc6f40571154977ba628e35fd190d640560e63a 100644 --- a/include/net/vxlan.h +++ b/include/net/vxlan.h @@ -119,6 +119,64 @@ struct vxlanhdr_gbp { #define VXLAN_GBP_POLICY_APPLIED (BIT(3) << 16) #define VXLAN_GBP_ID_MASK (0xFFFF) +/* + * VXLAN Generic Protocol Extension (VXLAN_F_GPE): + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * |R|R|Ver|I|P|R|O| Reserved |Next Protocol | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | VXLAN Network Identifier (VNI) | Reserved | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * + * Ver = Version. Indicates VXLAN GPE protocol version. + * + * P = Next Protocol Bit. The P bit is set to indicate that the + * Next Protocol field is present. + * + * O = OAM Flag Bit. The O bit is set to indicate that the packet + * is an OAM packet. + * + * Next Protocol = This 8 bit field indicates the protocol header + * immediately following the VXLAN GPE header. + * + * https://tools.ietf.org/html/draft-ietf-nvo3-vxlan-gpe-01 + */ + +struct vxlanhdr_gpe { +#if defined(__LITTLE_ENDIAN_BITFIELD) + u8 oam_flag:1, + reserved_flags1:1, + np_applied:1, + instance_applied:1, + version:2, +reserved_flags2:2; +#elif defined(__BIG_ENDIAN_BITFIELD) + u8 reserved_flags2:2, + version:2, + instance_applied:1, + np_applied:1, + reserved_flags1:1, + oam_flag:1; +#endif + u8 reserved_flags3; + u8 reserved_flags4; + u8 next_protocol; + __be32 vx_vni; +}; + +/* VXLAN-GPE header flags. */ +#define VXLAN_HF_VER cpu_to_be32(BIT(29) | BIT(28)) +#define VXLAN_HF_NP cpu_to_be32(BIT(26)) +#define VXLAN_HF_OAM cpu_to_be32(BIT(24)) + +#define VXLAN_GPE_USED_BITS (VXLAN_HF_VER | VXLAN_HF_NP | VXLAN_HF_OAM | \ + cpu_to_be32(0xff)) + +/* VXLAN-GPE header Next Protocol. */ +#define VXLAN_GPE_NP_IPV4 0x01 +#define VXLAN_GPE_NP_IPV6 0x02 +#define VXLAN_GPE_NP_ETHERNET 0x03 +#define VXLAN_GPE_NP_NSH 0x04 + struct vxlan_metadata { u32 gbp; }; @@ -206,16 +264,26 @@ struct vxlan_dev { #define VXLAN_F_GBP 0x800 #define VXLAN_F_REMCSUM_NOPARTIAL 0x1000 #define VXLAN_F_COLLECT_METADATA 0x2000 +#define VXLAN_F_GPE 0x4000 /* Flags that are used in the receive path. These flags must match in * order for a socket to be shareable */ #define VXLAN_F_RCV_FLAGS (VXLAN_F_GBP | \ + VXLAN_F_GPE | \ VXLAN_F_UDP_ZERO_CSUM6_RX | \ VXLAN_F_REMCSUM_RX | \ VXLAN_F_REMCSUM_NOPARTIAL | \ VXLAN_F_COLLECT_METADATA) +/* Flags that can be set together with VXLAN_F_GPE. */ +#define VXLAN_F_ALLOWED_GPE (VXLAN_F_GPE | \ + VXLAN_F_IPV6 | \ + VXLAN_F_UDP_ZERO_CSUM_TX | \ + VXLAN_F_UDP_ZERO_CSUM6_TX | \ + VXLAN_F_UDP_ZERO_CSUM6_RX | \ + VXLAN_F_COLLECT_METADATA) + struct net_device *vxlan_dev_create(struct net *net, const char *name, u8 name_assign_type, struct vxlan_config *conf); diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index c488066fb53a685b81aa80b6c306a6b53e837bfc..9427f17d06d6d2c0fa6ccf6b4ddc85598cc1c2ea 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -488,6 +488,7 @@ enum { IFLA_VXLAN_REMCSUM_NOPARTIAL, IFLA_VXLAN_COLLECT_METADATA, IFLA_VXLAN_LABEL, + IFLA_VXLAN_GPE, __IFLA_VXLAN_MAX }; #define IFLA_VXLAN_MAX (__IFLA_VXLAN_MAX - 1) diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c index b3ab1205dfdf5f58ffe3d83baababab92da6e590..43445df61efd984a56f3746f598d8f24ff04c718 100644 --- a/net/ipv4/ip_tunnel_core.c +++ b/net/ipv4/ip_tunnel_core.c @@ -86,15 +86,15 @@ void iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb, } EXPORT_SYMBOL_GPL(iptunnel_xmit); -int iptunnel_pull_header(struct sk_buff *skb, int hdr_len, __be16 inner_proto, - bool xnet) +int __iptunnel_pull_header(struct sk_buff *skb, int hdr_len, + __be16 inner_proto, bool raw_proto, bool xnet) { if (unlikely(!pskb_may_pull(skb, hdr_len))) return -ENOMEM; skb_pull_rcsum(skb, hdr_len); - if (inner_proto == htons(ETH_P_TEB)) { + if (!raw_proto && inner_proto == htons(ETH_P_TEB)) { struct ethhdr *eh; if (unlikely(!pskb_may_pull(skb, ETH_HLEN))) @@ -117,7 +117,7 @@ int iptunnel_pull_header(struct sk_buff *skb, int hdr_len, __be16 inner_proto, return iptunnel_pull_offloads(skb); } -EXPORT_SYMBOL_GPL(iptunnel_pull_header); +EXPORT_SYMBOL_GPL(__iptunnel_pull_header); struct metadata_dst *iptunnel_metadata_reply(struct metadata_dst *md, gfp_t flags)