提交 79dfab43 编写于 作者: D David S. Miller

Merge branch 'VXLAN-underlay-VRF'

Alexis Bauvin says:

====================
net: Add VRF support for VXLAN underlay

v6 -> v7:
- proper locking for device in udp_tunnel following Sabrina Dubroca's advice

v5 -> v6:
- remove automatic rebinding patch following Roopa Prabhu's advice

v4 -> v5:
- move test script to its own patch (6/6)
- add schematic for test script
- apply David Ahern comments to the test script

v3 -> v4:
- rename vxlan_is_in_l3mdev_chain to netdev_is_upper master
- move it to net/core/dev.c
- make it return bool instead of int
- check if remote_ifindex is zero before resolving the l3mdev
- add testing script

v2 -> v3:
- fix build when CONFIG_NET_IPV6 is off
- fix build "unused l3mdev_master_upper_ifindex_by_index" build error with some
  configs

v1 -> v2:
- move vxlan_get_l3mdev from vxlan driver to l3mdev driver as
  l3mdev_master_upper_ifindex_by_index
- vxlan: rename variables named l3mdev_ifindex to ifindex

v0 -> v1:
- fix typos

We are trying to isolate the VXLAN traffic from different VMs with VRF as shown
in the schemas below:

+-------------------------+   +----------------------------+
| +----------+            |   |     +------------+         |
| |          |            |   |     |            |         |
| | tap-red  |            |   |     |  tap-blue  |         |
| |          |            |   |     |            |         |
| +----+-----+            |   |     +-----+------+         |
|      |                  |   |           |                |
|      |                  |   |           |                |
| +----+---+              |   |      +----+----+           |
| |        |              |   |      |         |           |
| | br-red |              |   |      | br-blue |           |
| |        |              |   |      |         |           |
| +----+---+              |   |      +----+----+           |
|      |                  |   |           |                |
|      |                  |   |           |                |
|      |                  |   |           |                |
| +----+--------+         |   |     +--------------+       |
| |             |         |   |     |              |       |
| |  vxlan-red  |         |   |     |  vxlan-blue  |       |
| |             |         |   |     |              |       |
| +------+------+         |   |     +-------+------+       |
|        |                |   |             |              |
|        |           VRF  |   |             |          VRF |
|        |           red  |   |             |         blue |
+-------------------------+   +----------------------------+
         |                                  |
         |                                  |
 +---------------------------------------------------------+
 |       |                                  |              |
 |       |                                  |              |
 |       |         +--------------+         |              |
 |       |         |              |         |              |
 |       +---------+  eth0.2030   +---------+              |
 |                 |  10.0.0.1/24 |                        |
 |                 +-----+--------+                    VRF |
 |                       |                            green|
 +---------------------------------------------------------+
                         |
                         |
                    +----+---+
                    |        |
                    |  eth0  |
                    |        |
                    +--------+

iproute2 commands to reproduce the setup:

ip link add green type vrf table 1
ip link set green up
ip link add eth0.2030 link eth0 type vlan id 2030
ip link set eth0.2030 master green
ip addr add 10.0.0.1/24 dev eth0.2030
ip link set eth0.2030 up

ip link add blue type vrf table 2
ip link set blue up
ip link add br-blue type bridge
ip link set br-blue master blue
ip link set br-blue up
ip link add vxlan-blue type vxlan id 2 local 10.0.0.1 dev eth0.2030 \
 port 4789
ip link set vxlan-blue master br-blue
ip link set vxlan-blue up
ip link set tap-blue master br-blue
ip link set tap-blue up

ip link add red type vrf table 3
ip link set red up
ip link add br-red type bridge
ip link set br-red master red
ip link set br-red up
ip link add vxlan-red type vxlan id 3 local 10.0.0.1 dev eth0.2030 \
 port 4789
ip link set vxlan-red master br-red
ip link set vxlan-red up
ip link set tap-red master br-red
ip link set tap-red up

We faced some issue in the datapath, here are the details:

* Egress traffic:
The vxlan packets are sent directly to the default VRF because it's where the
socket is bound, therefore the traffic has a default route via eth0. the
workaround is to force this traffic to VRF green with ip rules.

* Ingress traffic:
When receiving the traffic on eth0.2030 the vxlan socket is unreachable from
VRF green. The workaround is to enable *udp_l3mdev_accept* sysctl, but
this breaks isolation between overlay and underlay: packets sent from
blue or red by e.g. a guest VM will be accepted by the socket, allowing
injection of VXLAN packets from the overlay.

This patch series fixes the issues describe above by allowing VXLAN socket to be
bound to a specific VRF device therefore looking up in the correct table.
====================
Signed-off-by: NDavid S. Miller <davem@davemloft.net>
......@@ -188,7 +188,7 @@ static inline struct vxlan_rdst *first_remote_rtnl(struct vxlan_fdb *fdb)
* and enabled unshareable flags.
*/
static struct vxlan_sock *vxlan_find_sock(struct net *net, sa_family_t family,
__be16 port, u32 flags)
__be16 port, u32 flags, int ifindex)
{
struct vxlan_sock *vs;
......@@ -197,7 +197,8 @@ static struct vxlan_sock *vxlan_find_sock(struct net *net, sa_family_t family,
hlist_for_each_entry_rcu(vs, vs_head(net, port), hlist) {
if (inet_sk(vs->sock->sk)->inet_sport == port &&
vxlan_get_sk_family(vs) == family &&
vs->flags == flags)
vs->flags == flags &&
vs->sock->sk->sk_bound_dev_if == ifindex)
return vs;
}
return NULL;
......@@ -237,7 +238,7 @@ static struct vxlan_dev *vxlan_find_vni(struct net *net, int ifindex,
{
struct vxlan_sock *vs;
vs = vxlan_find_sock(net, family, port, flags);
vs = vxlan_find_sock(net, family, port, flags, ifindex);
if (!vs)
return NULL;
......@@ -2288,6 +2289,9 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
struct rtable *rt;
__be16 df = 0;
if (!ifindex)
ifindex = sock4->sock->sk->sk_bound_dev_if;
rt = vxlan_get_route(vxlan, dev, sock4, skb, ifindex, tos,
dst->sin.sin_addr.s_addr,
&local_ip.sin.sin_addr.s_addr,
......@@ -2337,6 +2341,9 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
} else {
struct vxlan_sock *sock6 = rcu_dereference(vxlan->vn6_sock);
if (!ifindex)
ifindex = sock6->sock->sk->sk_bound_dev_if;
ndst = vxlan6_get_route(vxlan, dev, sock6, skb, ifindex, tos,
label, &dst->sin6.sin6_addr,
&local_ip.sin6.sin6_addr,
......@@ -2951,7 +2958,7 @@ static const struct ethtool_ops vxlan_ethtool_ops = {
};
static struct socket *vxlan_create_sock(struct net *net, bool ipv6,
__be16 port, u32 flags)
__be16 port, u32 flags, int ifindex)
{
struct socket *sock;
struct udp_port_cfg udp_conf;
......@@ -2969,6 +2976,7 @@ static struct socket *vxlan_create_sock(struct net *net, bool ipv6,
}
udp_conf.local_udp_port = port;
udp_conf.bind_ifindex = ifindex;
/* Open UDP socket */
err = udp_sock_create(net, &udp_conf, &sock);
......@@ -2980,7 +2988,8 @@ static struct socket *vxlan_create_sock(struct net *net, bool ipv6,
/* Create new listen socket if needed */
static struct vxlan_sock *vxlan_socket_create(struct net *net, bool ipv6,
__be16 port, u32 flags)
__be16 port, u32 flags,
int ifindex)
{
struct vxlan_net *vn = net_generic(net, vxlan_net_id);
struct vxlan_sock *vs;
......@@ -2995,7 +3004,7 @@ static struct vxlan_sock *vxlan_socket_create(struct net *net, bool ipv6,
for (h = 0; h < VNI_HASH_SIZE; ++h)
INIT_HLIST_HEAD(&vs->vni_list[h]);
sock = vxlan_create_sock(net, ipv6, port, flags);
sock = vxlan_create_sock(net, ipv6, port, flags, ifindex);
if (IS_ERR(sock)) {
kfree(vs);
return ERR_CAST(sock);
......@@ -3033,11 +3042,17 @@ static int __vxlan_sock_add(struct vxlan_dev *vxlan, bool ipv6)
struct vxlan_net *vn = net_generic(vxlan->net, vxlan_net_id);
struct vxlan_sock *vs = NULL;
struct vxlan_dev_node *node;
int l3mdev_index = 0;
if (vxlan->cfg.remote_ifindex)
l3mdev_index = l3mdev_master_upper_ifindex_by_index(
vxlan->net, vxlan->cfg.remote_ifindex);
if (!vxlan->cfg.no_share) {
spin_lock(&vn->sock_lock);
vs = vxlan_find_sock(vxlan->net, ipv6 ? AF_INET6 : AF_INET,
vxlan->cfg.dst_port, vxlan->cfg.flags);
vxlan->cfg.dst_port, vxlan->cfg.flags,
l3mdev_index);
if (vs && !refcount_inc_not_zero(&vs->refcnt)) {
spin_unlock(&vn->sock_lock);
return -EBUSY;
......@@ -3046,7 +3061,8 @@ static int __vxlan_sock_add(struct vxlan_dev *vxlan, bool ipv6)
}
if (!vs)
vs = vxlan_socket_create(vxlan->net, ipv6,
vxlan->cfg.dst_port, vxlan->cfg.flags);
vxlan->cfg.dst_port, vxlan->cfg.flags,
l3mdev_index);
if (IS_ERR(vs))
return PTR_ERR(vs);
#if IS_ENABLED(CONFIG_IPV6)
......
......@@ -101,6 +101,17 @@ struct net_device *l3mdev_master_dev_rcu(const struct net_device *_dev)
return master;
}
int l3mdev_master_upper_ifindex_by_index_rcu(struct net *net, int ifindex);
static inline
int l3mdev_master_upper_ifindex_by_index(struct net *net, int ifindex)
{
rcu_read_lock();
ifindex = l3mdev_master_upper_ifindex_by_index_rcu(net, ifindex);
rcu_read_unlock();
return ifindex;
}
u32 l3mdev_fib_table_rcu(const struct net_device *dev);
u32 l3mdev_fib_table_by_index(struct net *net, int ifindex);
static inline u32 l3mdev_fib_table(const struct net_device *dev)
......@@ -207,6 +218,17 @@ static inline int l3mdev_master_ifindex_by_index(struct net *net, int ifindex)
return 0;
}
static inline
int l3mdev_master_upper_ifindex_by_index_rcu(struct net *net, int ifindex)
{
return 0;
}
static inline
int l3mdev_master_upper_ifindex_by_index(struct net *net, int ifindex)
{
return 0;
}
static inline
struct net_device *l3mdev_master_dev_rcu(const struct net_device *dev)
{
......
......@@ -30,6 +30,7 @@ struct udp_port_cfg {
__be16 local_udp_port;
__be16 peer_udp_port;
int bind_ifindex;
unsigned int use_udp_checksums:1,
use_udp6_tx_checksums:1,
use_udp6_rx_checksums:1,
......
......@@ -20,6 +20,23 @@ int udp_sock_create4(struct net *net, struct udp_port_cfg *cfg,
if (err < 0)
goto error;
if (cfg->bind_ifindex) {
struct net_device *dev;
dev = dev_get_by_index(net, cfg->bind_ifindex);
if (!dev) {
err = -ENODEV;
goto error;
}
err = kernel_setsockopt(sock, SOL_SOCKET, SO_BINDTODEVICE,
dev->name, strlen(dev->name) + 1);
dev_put(dev);
if (err < 0)
goto error;
}
udp_addr.sin_family = AF_INET;
udp_addr.sin_addr = cfg->local_ip;
udp_addr.sin_port = cfg->local_udp_port;
......
......@@ -31,6 +31,22 @@ int udp_sock_create6(struct net *net, struct udp_port_cfg *cfg,
if (err < 0)
goto error;
}
if (cfg->bind_ifindex) {
struct net_device *dev;
dev = dev_get_by_index(net, cfg->bind_ifindex);
if (!dev) {
err = -ENODEV;
goto error;
}
err = kernel_setsockopt(sock, SOL_SOCKET, SO_BINDTODEVICE,
dev->name, strlen(dev->name) + 1);
dev_put(dev);
if (err < 0)
goto error;
}
udp6_addr.sin6_family = AF_INET6;
memcpy(&udp6_addr.sin6_addr, &cfg->local_ip6,
......
......@@ -46,6 +46,24 @@ int l3mdev_master_ifindex_rcu(const struct net_device *dev)
}
EXPORT_SYMBOL_GPL(l3mdev_master_ifindex_rcu);
/**
* l3mdev_master_upper_ifindex_by_index - get index of upper l3 master
* device
* @net: network namespace for device index lookup
* @ifindex: targeted interface
*/
int l3mdev_master_upper_ifindex_by_index_rcu(struct net *net, int ifindex)
{
struct net_device *dev;
dev = dev_get_by_index_rcu(net, ifindex);
while (dev && !netif_is_l3_master(dev))
dev = netdev_master_upper_dev_get(dev);
return dev ? dev->ifindex : 0;
}
EXPORT_SYMBOL_GPL(l3mdev_master_upper_ifindex_by_index_rcu);
/**
* l3mdev_fib_table - get FIB table id associated with an L3
* master interface
......
......@@ -7,7 +7,7 @@ CFLAGS += -I../../../../usr/include/
TEST_PROGS := run_netsocktests run_afpackettests test_bpf.sh netdevice.sh rtnetlink.sh
TEST_PROGS += fib_tests.sh fib-onlink-tests.sh pmtu.sh udpgso.sh ip_defrag.sh
TEST_PROGS += udpgso_bench.sh fib_rule_tests.sh msg_zerocopy.sh psock_snd.sh
TEST_PROGS += udpgro_bench.sh udpgro.sh
TEST_PROGS += udpgro_bench.sh udpgro.sh test_vxlan_under_vrf.sh
TEST_PROGS_EXTENDED := in_netns.sh
TEST_GEN_FILES = socket
TEST_GEN_FILES += psock_fanout psock_tpacket msg_zerocopy
......
#!/bin/bash
# SPDX-License-Identifier: GPL-2.0
# This test is for checking VXLAN underlay in a non-default VRF.
#
# It simulates two hypervisors running a VM each using four network namespaces:
# two for the HVs, two for the VMs.
# A small VXLAN tunnel is made between the two hypervisors to have the two vms
# in the same virtual L2:
#
# +-------------------+ +-------------------+
# | | | |
# | vm-1 netns | | vm-2 netns |
# | | | |
# | +-------------+ | | +-------------+ |
# | | veth-hv | | | | veth-hv | |
# | | 10.0.0.1/24 | | | | 10.0.0.2/24 | |
# | +-------------+ | | +-------------+ |
# | . | | . |
# +-------------------+ +-------------------+
# . .
# . .
# . .
# +-----------------------------------+ +------------------------------------+
# | . | | . |
# | +----------+ | | +----------+ |
# | | veth-tap | | | | veth-tap | |
# | +----+-----+ | | +----+-----+ |
# | | | | | |
# | +--+--+ +--------------+ | | +--------------+ +--+--+ |
# | | br0 | | vrf-underlay | | | | vrf-underlay | | br0 | |
# | +--+--+ +-------+------+ | | +------+-------+ +--+--+ |
# | | | | | | | |
# | +---+----+ +-------+-------+ | | +-------+-------+ +---+----+ |
# | | vxlan0 |....| veth0 |.|...|.| veth0 |....| vxlan0 | |
# | +--------+ | 172.16.0.1/24 | | | | 172.16.0.2/24 | +--------+ |
# | +---------------+ | | +---------------+ |
# | | | |
# | hv-1 netns | | hv-2 netns |
# | | | |
# +-----------------------------------+ +------------------------------------+
#
# This tests both the connectivity between vm-1 and vm-2, and that the underlay
# can be moved in and out of the vrf by unsetting and setting veth0's master.
set -e
cleanup() {
ip link del veth-hv-1 2>/dev/null || true
ip link del veth-tap 2>/dev/null || true
for ns in hv-1 hv-2 vm-1 vm-2; do
ip netns del $ns || true
done
}
# Clean start
cleanup &> /dev/null
[[ $1 == "clean" ]] && exit 0
trap cleanup EXIT
# Setup "Hypervisors" simulated with netns
ip link add veth-hv-1 type veth peer name veth-hv-2
setup-hv-networking() {
hv=$1
ip netns add hv-$hv
ip link set veth-hv-$hv netns hv-$hv
ip -netns hv-$hv link set veth-hv-$hv name veth0
ip -netns hv-$hv link add vrf-underlay type vrf table 1
ip -netns hv-$hv link set vrf-underlay up
ip -netns hv-$hv addr add 172.16.0.$hv/24 dev veth0
ip -netns hv-$hv link set veth0 up
ip -netns hv-$hv link add br0 type bridge
ip -netns hv-$hv link set br0 up
ip -netns hv-$hv link add vxlan0 type vxlan id 10 local 172.16.0.$hv dev veth0 dstport 4789
ip -netns hv-$hv link set vxlan0 master br0
ip -netns hv-$hv link set vxlan0 up
}
setup-hv-networking 1
setup-hv-networking 2
# Check connectivity between HVs by pinging hv-2 from hv-1
echo -n "Checking HV connectivity "
ip netns exec hv-1 ping -c 1 -W 1 172.16.0.2 &> /dev/null || (echo "[FAIL]"; false)
echo "[ OK ]"
# Setups a "VM" simulated by a netns an a veth pair
setup-vm() {
id=$1
ip netns add vm-$id
ip link add veth-tap type veth peer name veth-hv
ip link set veth-tap netns hv-$id
ip -netns hv-$id link set veth-tap master br0
ip -netns hv-$id link set veth-tap up
ip link set veth-hv netns vm-$id
ip -netns vm-$id addr add 10.0.0.$id/24 dev veth-hv
ip -netns vm-$id link set veth-hv up
}
setup-vm 1
setup-vm 2
# Setup VTEP routes to make ARP work
bridge -netns hv-1 fdb add 00:00:00:00:00:00 dev vxlan0 dst 172.16.0.2 self permanent
bridge -netns hv-2 fdb add 00:00:00:00:00:00 dev vxlan0 dst 172.16.0.1 self permanent
echo -n "Check VM connectivity through VXLAN (underlay in the default VRF) "
ip netns exec vm-1 ping -c 1 -W 1 10.0.0.2 &> /dev/null || (echo "[FAIL]"; false)
echo "[ OK ]"
# Move the underlay to a non-default VRF
ip -netns hv-1 link set veth0 vrf vrf-underlay
ip -netns hv-1 link set veth0 down
ip -netns hv-1 link set veth0 up
ip -netns hv-2 link set veth0 vrf vrf-underlay
ip -netns hv-2 link set veth0 down
ip -netns hv-2 link set veth0 up
echo -n "Check VM connectivity through VXLAN (underlay in a VRF) "
ip netns exec vm-1 ping -c 1 -W 1 10.0.0.2 &> /dev/null || (echo "[FAIL]"; false)
echo "[ OK ]"
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册