提交 38428d68 编写于 作者: R Roopa Prabhu 提交者: David S. Miller

nexthop: support for fdb ecmp nexthops

This patch introduces ecmp nexthops and nexthop groups
for mac fdb entries. In subsequent patches this is used
by the vxlan driver fdb entries. The use case is
E-VPN multihoming [1,2,3] which requires bridged vxlan traffic
to be load balanced to remote switches (vteps) belonging to
the same multi-homed ethernet segment (This is analogous to
a multi-homed LAG but over vxlan).

Changes include new nexthop flag NHA_FDB for nexthops
referenced by fdb entries. These nexthops only have ip.
This patch includes appropriate checks to avoid routes
referencing such nexthops.

example:
$ip nexthop add id 12 via 172.16.1.2 fdb
$ip nexthop add id 13 via 172.16.1.3 fdb
$ip nexthop add id 102 group 12/13 fdb

$bridge fdb add 02:02:00:00:00:13 dev vxlan1000 nhid 101 self

[1] E-VPN https://tools.ietf.org/html/rfc7432
[2] E-VPN VxLAN: https://tools.ietf.org/html/rfc8365
[3] LPC talk with mention of nexthop groups for L2 ecmp
http://vger.kernel.org/lpc_net2018_talks/scaling_bridge_fdb_database_slidesV3.pdf

v4 - fixed uninitialized variable reported by kernel test robot
Reported-by: Nkernel test robot <rong.a.chen@intel.com>
Signed-off-by: NRoopa Prabhu <roopa@cumulusnetworks.com>
Reviewed-by: NDavid Ahern <dsahern@gmail.com>
Signed-off-by: NDavid S. Miller <davem@davemloft.net>
上级 7b1b843a
......@@ -65,6 +65,7 @@ struct fib6_config {
struct nl_info fc_nlinfo;
struct nlattr *fc_encap;
u16 fc_encap_type;
bool fc_is_fdb;
};
struct fib6_node {
......
......@@ -26,6 +26,7 @@ struct nh_config {
u8 nh_family;
u8 nh_protocol;
u8 nh_blackhole;
u8 nh_fdb;
u32 nh_flags;
int nh_ifindex;
......@@ -52,6 +53,7 @@ struct nh_info {
u8 family;
bool reject_nh;
bool fdb_nh;
union {
struct fib_nh_common fib_nhc;
......@@ -80,6 +82,7 @@ struct nexthop {
struct rb_node rb_node; /* entry on netns rbtree */
struct list_head fi_list; /* v4 entries using nh */
struct list_head f6i_list; /* v6 entries using nh */
struct list_head fdb_list; /* fdb entries using this nh */
struct list_head grp_list; /* nh group entries using this nh */
struct net *net;
......@@ -88,6 +91,7 @@ struct nexthop {
u8 protocol; /* app managing this nh */
u8 nh_flags;
bool is_group;
bool is_fdb_nh;
refcount_t refcnt;
struct rcu_head rcu;
......@@ -304,4 +308,32 @@ static inline void nexthop_path_fib6_result(struct fib6_result *res, int hash)
int nexthop_for_each_fib6_nh(struct nexthop *nh,
int (*cb)(struct fib6_nh *nh, void *arg),
void *arg);
static inline int nexthop_get_family(struct nexthop *nh)
{
struct nh_info *nhi = rcu_dereference_rtnl(nh->nh_info);
return nhi->family;
}
static inline
struct fib_nh_common *nexthop_fdb_nhc(struct nexthop *nh)
{
struct nh_info *nhi = rcu_dereference_rtnl(nh->nh_info);
return &nhi->fib_nhc;
}
static inline struct fib_nh_common *nexthop_path_fdb_result(struct nexthop *nh,
int hash)
{
struct nh_info *nhi;
struct nexthop *nhp;
nhp = nexthop_select_path(nh, hash);
if (unlikely(!nhp))
return NULL;
nhi = rcu_dereference(nhp->nh_info);
return &nhi->fib_nhc;
}
#endif
......@@ -49,6 +49,9 @@ enum {
NHA_GROUPS, /* flag; only return nexthop groups in dump */
NHA_MASTER, /* u32; only return nexthops with given master dev */
NHA_FDB, /* flag; nexthop belongs to a bridge fdb */
/* if NHA_FDB is added, OIF, BLACKHOLE, ENCAP cannot be set */
__NHA_MAX,
};
......
......@@ -33,6 +33,7 @@ static const struct nla_policy rtm_nh_policy[NHA_MAX + 1] = {
[NHA_ENCAP] = { .type = NLA_NESTED },
[NHA_GROUPS] = { .type = NLA_FLAG },
[NHA_MASTER] = { .type = NLA_U32 },
[NHA_FDB] = { .type = NLA_FLAG },
};
static unsigned int nh_dev_hashfn(unsigned int val)
......@@ -107,6 +108,7 @@ static struct nexthop *nexthop_alloc(void)
INIT_LIST_HEAD(&nh->fi_list);
INIT_LIST_HEAD(&nh->f6i_list);
INIT_LIST_HEAD(&nh->grp_list);
INIT_LIST_HEAD(&nh->fdb_list);
}
return nh;
}
......@@ -227,6 +229,9 @@ static int nh_fill_node(struct sk_buff *skb, struct nexthop *nh,
if (nla_put_u32(skb, NHA_ID, nh->id))
goto nla_put_failure;
if (nh->is_fdb_nh && nla_put_flag(skb, NHA_FDB))
goto nla_put_failure;
if (nh->is_group) {
struct nh_group *nhg = rtnl_dereference(nh->nh_grp);
......@@ -241,7 +246,7 @@ static int nh_fill_node(struct sk_buff *skb, struct nexthop *nh,
if (nla_put_flag(skb, NHA_BLACKHOLE))
goto nla_put_failure;
goto out;
} else {
} else if (!nh->is_fdb_nh) {
const struct net_device *dev;
dev = nhi->fib_nhc.nhc_dev;
......@@ -387,12 +392,35 @@ static bool valid_group_nh(struct nexthop *nh, unsigned int npaths,
return true;
}
static int nh_check_attr_fdb_group(struct nexthop *nh, u8 *nh_family,
struct netlink_ext_ack *extack)
{
struct nh_info *nhi;
if (!nh->is_fdb_nh) {
NL_SET_ERR_MSG(extack, "FDB nexthop group can only have fdb nexthops");
return -EINVAL;
}
nhi = rtnl_dereference(nh->nh_info);
if (*nh_family == AF_UNSPEC) {
*nh_family = nhi->family;
} else if (*nh_family != nhi->family) {
NL_SET_ERR_MSG(extack, "FDB nexthop group cannot have mixed family nexthops");
return -EINVAL;
}
return 0;
}
static int nh_check_attr_group(struct net *net, struct nlattr *tb[],
struct netlink_ext_ack *extack)
{
unsigned int len = nla_len(tb[NHA_GROUP]);
u8 nh_family = AF_UNSPEC;
struct nexthop_grp *nhg;
unsigned int i, j;
u8 nhg_fdb = 0;
if (len & (sizeof(struct nexthop_grp) - 1)) {
NL_SET_ERR_MSG(extack,
......@@ -421,6 +449,8 @@ static int nh_check_attr_group(struct net *net, struct nlattr *tb[],
}
}
if (tb[NHA_FDB])
nhg_fdb = 1;
nhg = nla_data(tb[NHA_GROUP]);
for (i = 0; i < len; ++i) {
struct nexthop *nh;
......@@ -432,11 +462,20 @@ static int nh_check_attr_group(struct net *net, struct nlattr *tb[],
}
if (!valid_group_nh(nh, len, extack))
return -EINVAL;
if (nhg_fdb && nh_check_attr_fdb_group(nh, &nh_family, extack))
return -EINVAL;
if (!nhg_fdb && nh->is_fdb_nh) {
NL_SET_ERR_MSG(extack, "Non FDB nexthop group cannot have fdb nexthops");
return -EINVAL;
}
}
for (i = NHA_GROUP + 1; i < __NHA_MAX; ++i) {
if (!tb[i])
continue;
if (tb[NHA_FDB])
continue;
NL_SET_ERR_MSG(extack,
"No other attributes can be set in nexthop groups");
return -EINVAL;
......@@ -495,6 +534,9 @@ struct nexthop *nexthop_select_path(struct nexthop *nh, int hash)
if (hash > atomic_read(&nhge->upper_bound))
continue;
if (nhge->nh->is_fdb_nh)
return nhge->nh;
/* nexthops always check if it is good and does
* not rely on a sysctl for this behavior
*/
......@@ -564,6 +606,11 @@ int fib6_check_nexthop(struct nexthop *nh, struct fib6_config *cfg,
{
struct nh_info *nhi;
if (nh->is_fdb_nh) {
NL_SET_ERR_MSG(extack, "Route cannot point to a fdb nexthop");
return -EINVAL;
}
/* fib6_src is unique to a fib6_info and limits the ability to cache
* routes in fib6_nh within a nexthop that is potentially shared
* across multiple fib entries. If the config wants to use source
......@@ -640,6 +687,12 @@ int fib_check_nexthop(struct nexthop *nh, u8 scope,
{
int err = 0;
if (nh->is_fdb_nh) {
NL_SET_ERR_MSG(extack, "Route cannot point to a fdb nexthop");
err = -EINVAL;
goto out;
}
if (nh->is_group) {
struct nh_group *nhg;
......@@ -1125,6 +1178,9 @@ static struct nexthop *nexthop_create_group(struct net *net,
nh_group_rebalance(nhg);
}
if (cfg->nh_fdb)
nh->is_fdb_nh = 1;
rcu_assign_pointer(nh->nh_grp, nhg);
return nh;
......@@ -1152,7 +1208,7 @@ static int nh_create_ipv4(struct net *net, struct nexthop *nh,
.fc_encap = cfg->nh_encap,
.fc_encap_type = cfg->nh_encap_type,
};
u32 tb_id = l3mdev_fib_table(cfg->dev);
u32 tb_id = (cfg->dev ? l3mdev_fib_table(cfg->dev) : RT_TABLE_MAIN);
int err;
err = fib_nh_init(net, fib_nh, &fib_cfg, 1, extack);
......@@ -1161,6 +1217,9 @@ static int nh_create_ipv4(struct net *net, struct nexthop *nh,
goto out;
}
if (nh->is_fdb_nh)
goto out;
/* sets nh_dev if successful */
err = fib_check_nh(net, fib_nh, tb_id, 0, extack);
if (!err) {
......@@ -1186,6 +1245,7 @@ static int nh_create_ipv6(struct net *net, struct nexthop *nh,
.fc_flags = cfg->nh_flags,
.fc_encap = cfg->nh_encap,
.fc_encap_type = cfg->nh_encap_type,
.fc_is_fdb = cfg->nh_fdb,
};
int err;
......@@ -1227,6 +1287,9 @@ static struct nexthop *nexthop_create(struct net *net, struct nh_config *cfg,
nhi->family = cfg->nh_family;
nhi->fib_nhc.nhc_scope = RT_SCOPE_LINK;
if (cfg->nh_fdb)
nh->is_fdb_nh = 1;
if (cfg->nh_blackhole) {
nhi->reject_nh = 1;
cfg->nh_ifindex = net->loopback_dev->ifindex;
......@@ -1248,7 +1311,8 @@ static struct nexthop *nexthop_create(struct net *net, struct nh_config *cfg,
}
/* add the entry to the device based hash */
nexthop_devhash_add(net, nhi);
if (!nh->is_fdb_nh)
nexthop_devhash_add(net, nhi);
rcu_assign_pointer(nh->nh_info, nhi);
......@@ -1352,6 +1416,19 @@ static int rtm_to_nh_config(struct net *net, struct sk_buff *skb,
if (tb[NHA_ID])
cfg->nh_id = nla_get_u32(tb[NHA_ID]);
if (tb[NHA_FDB]) {
if (tb[NHA_OIF] || tb[NHA_BLACKHOLE] ||
tb[NHA_ENCAP] || tb[NHA_ENCAP_TYPE]) {
NL_SET_ERR_MSG(extack, "Fdb attribute can not be used with encap, oif or blackhole");
goto out;
}
if (nhm->nh_flags) {
NL_SET_ERR_MSG(extack, "Unsupported nexthop flags in ancillary header");
goto out;
}
cfg->nh_fdb = nla_get_flag(tb[NHA_FDB]);
}
if (tb[NHA_GROUP]) {
if (nhm->nh_family != AF_UNSPEC) {
NL_SET_ERR_MSG(extack, "Invalid family for group");
......@@ -1375,8 +1452,8 @@ static int rtm_to_nh_config(struct net *net, struct sk_buff *skb,
if (tb[NHA_BLACKHOLE]) {
if (tb[NHA_GATEWAY] || tb[NHA_OIF] ||
tb[NHA_ENCAP] || tb[NHA_ENCAP_TYPE]) {
NL_SET_ERR_MSG(extack, "Blackhole attribute can not be used with gateway or oif");
tb[NHA_ENCAP] || tb[NHA_ENCAP_TYPE] || tb[NHA_FDB]) {
NL_SET_ERR_MSG(extack, "Blackhole attribute can not be used with gateway, oif, encap or fdb");
goto out;
}
......@@ -1385,26 +1462,28 @@ static int rtm_to_nh_config(struct net *net, struct sk_buff *skb,
goto out;
}
if (!tb[NHA_OIF]) {
NL_SET_ERR_MSG(extack, "Device attribute required for non-blackhole nexthops");
if (!cfg->nh_fdb && !tb[NHA_OIF]) {
NL_SET_ERR_MSG(extack, "Device attribute required for non-blackhole and non-fdb nexthops");
goto out;
}
cfg->nh_ifindex = nla_get_u32(tb[NHA_OIF]);
if (cfg->nh_ifindex)
cfg->dev = __dev_get_by_index(net, cfg->nh_ifindex);
if (!cfg->nh_fdb && tb[NHA_OIF]) {
cfg->nh_ifindex = nla_get_u32(tb[NHA_OIF]);
if (cfg->nh_ifindex)
cfg->dev = __dev_get_by_index(net, cfg->nh_ifindex);
if (!cfg->dev) {
NL_SET_ERR_MSG(extack, "Invalid device index");
goto out;
} else if (!(cfg->dev->flags & IFF_UP)) {
NL_SET_ERR_MSG(extack, "Nexthop device is not up");
err = -ENETDOWN;
goto out;
} else if (!netif_carrier_ok(cfg->dev)) {
NL_SET_ERR_MSG(extack, "Carrier for nexthop device is down");
err = -ENETDOWN;
goto out;
if (!cfg->dev) {
NL_SET_ERR_MSG(extack, "Invalid device index");
goto out;
} else if (!(cfg->dev->flags & IFF_UP)) {
NL_SET_ERR_MSG(extack, "Nexthop device is not up");
err = -ENETDOWN;
goto out;
} else if (!netif_carrier_ok(cfg->dev)) {
NL_SET_ERR_MSG(extack, "Carrier for nexthop device is down");
err = -ENETDOWN;
goto out;
}
}
err = -EINVAL;
......@@ -1633,7 +1712,7 @@ static bool nh_dump_filtered(struct nexthop *nh, int dev_idx, int master_idx,
static int nh_valid_dump_req(const struct nlmsghdr *nlh, int *dev_idx,
int *master_idx, bool *group_filter,
struct netlink_callback *cb)
bool *fdb_filter, struct netlink_callback *cb)
{
struct netlink_ext_ack *extack = cb->extack;
struct nlattr *tb[NHA_MAX + 1];
......@@ -1670,6 +1749,9 @@ static int nh_valid_dump_req(const struct nlmsghdr *nlh, int *dev_idx,
case NHA_GROUPS:
*group_filter = true;
break;
case NHA_FDB:
*fdb_filter = true;
break;
default:
NL_SET_ERR_MSG(extack, "Unsupported attribute in dump request");
return -EINVAL;
......@@ -1688,17 +1770,17 @@ static int nh_valid_dump_req(const struct nlmsghdr *nlh, int *dev_idx,
/* rtnl */
static int rtm_dump_nexthop(struct sk_buff *skb, struct netlink_callback *cb)
{
bool group_filter = false, fdb_filter = false;
struct nhmsg *nhm = nlmsg_data(cb->nlh);
int dev_filter_idx = 0, master_idx = 0;
struct net *net = sock_net(skb->sk);
struct rb_root *root = &net->nexthop.rb_root;
bool group_filter = false;
struct rb_node *node;
int idx = 0, s_idx;
int err;
err = nh_valid_dump_req(cb->nlh, &dev_filter_idx, &master_idx,
&group_filter, cb);
&group_filter, &fdb_filter, cb);
if (err < 0)
return err;
......
......@@ -3421,6 +3421,11 @@ int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh,
#ifdef CONFIG_IPV6_ROUTER_PREF
fib6_nh->last_probe = jiffies;
#endif
if (cfg->fc_is_fdb) {
fib6_nh->fib_nh_gw6 = cfg->fc_gateway;
fib6_nh->fib_nh_gw_family = AF_INET6;
return 0;
}
err = -ENODEV;
if (cfg->fc_ifindex) {
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册