提交 7f9091f0 编写于 作者: D David S. Miller

Merge branch 'link_netns'

Merge branch 'link_netns'

Nicolas Dichtel says:

====================
netns: allow to identify peer netns

The goal of this serie is to be able to multicast netlink messages with an
attribute that identify a peer netns.
This is needed by the userland to interpret some information contained in
netlink messages (like IFLA_LINK value, but also some other attributes in case
of x-netns netdevice (see also
http://thread.gmane.org/gmane.linux.network/315933/focus=316064 and
http://thread.gmane.org/gmane.linux.kernel.containers/28301/focus=4239)).

Ids of peer netns can be set by userland via a new rtnl cmd RTM_NEWNSID. When
the kernel needs an id for a peer (for example when advertising a new x-netns
interface via netlink), if the user didn't allocate an id, one will be
automatically allocated.
These ids are stored per netns and are local (ie only valid in the netns where
they are set). To avoid allocating an int for each peer netns, I use
idr_for_each() to retrieve the id of a peer netns. Note that it will be possible
to add a table (struct net -> id) later to optimize this lookup if needed.

Patch 1/4 introduces the rtnetlink API mechanism to set and get these ids.
Patch 2/4 and 3/4 implements an example of how to use these ids when advertising
information about a x-netns interface.
And patch 4/4 shows that the netlink messages can be symetric between a GET and
a SET.

iproute2 patches are available, I can send them on demand.

Here is a small screenshot to show how it can be used by userland.

$ ip netns add foo
$ ip netns del foo
$ ip netns
$ touch /var/run/netns/init_net
$ mount --bind /proc/1/ns/net /var/run/netns/init_net
$ ip netns add foo
$ ip -n foo netns
foo
init_net
$ ip -n foo netns set init_net 0
$ ip -n foo netns set foo 1

$ ip netns
foo
init_net
$ ip -n foo netns
foo (id: 1)
init_net (id: 0)

$ ip -n foo link add ipip1 link-netnsid 0 type ipip remote 10.16.0.121 local 10.16.0.249
$ ip -n foo link ls ipip1
6: ipip1@NONE: <POINTOPOINT,NOARP> mtu 1480 qdisc noop state DOWN mode DEFAULT group default
    link/ipip 10.16.0.249 peer 10.16.0.121 link-netnsid 0

$ ip netns
foo
init_net
$ ip -n foo link add ipip2 type ipip remote 10.16.0.121 local 10.16.0.249
$ ip -n foo link set ipip2 netns init_net
$ ip link ls ipip2
7: ipip2@NONE: <POINTOPOINT,NOARP> mtu 1480 qdisc noop state DOWN mode DEFAULT group default
    link/ipip 10.16.0.249 peer 10.16.0.121 link-netnsid 0
$ ip netns
foo (id: 0)
init_net

v4 -> v5:
  use rtnetlink instead of genetlink
  allocate automatically an id if user didn't assign one
  rename include/uapi/linux/netns.h to include/uapi/linux/net_namespace.h
  add vxlan in patch #3

RFCv3 -> v4:
  rebase on net-next
  add copyright text in the new netns.h file

RFCv2 -> RFCv3:
  ids are now defined by userland (via netlink). Ids are stored in each netns
  (and they are local to this netns).
  add get_link_net support for ip6 tunnels
  netnsid is now a s32 instead of a u32

RFCv1 -> RFCv2:
  remove useless ()
  ids are now stored in the user ns. It's possible to get an id for a peer netns
  only if the current netns and the peer netns have the same user ns parent.
====================
Signed-off-by: NDavid S. Miller <davem@davemloft.net>
......@@ -6578,6 +6578,7 @@ F: include/linux/netdevice.h
F: include/uapi/linux/in.h
F: include/uapi/linux/net.h
F: include/uapi/linux/netdevice.h
F: include/uapi/linux/net_namespace.h
F: tools/net/
F: tools/testing/selftests/net/
F: lib/random32.c
......
......@@ -2923,6 +2923,13 @@ static int vxlan_fill_info(struct sk_buff *skb, const struct net_device *dev)
return -EMSGSIZE;
}
static struct net *vxlan_get_link_net(const struct net_device *dev)
{
struct vxlan_dev *vxlan = netdev_priv(dev);
return vxlan->net;
}
static struct rtnl_link_ops vxlan_link_ops __read_mostly = {
.kind = "vxlan",
.maxtype = IFLA_VXLAN_MAX,
......@@ -2934,6 +2941,7 @@ static struct rtnl_link_ops vxlan_link_ops __read_mostly = {
.dellink = vxlan_dellink,
.get_size = vxlan_get_size,
.fill_info = vxlan_fill_info,
.get_link_net = vxlan_get_link_net,
};
static void vxlan_handle_lowerdev_unregister(struct vxlan_net *vn,
......
......@@ -70,6 +70,7 @@ int ip6_tnl_xmit_ctl(struct ip6_tnl *t, const struct in6_addr *laddr,
__u16 ip6_tnl_parse_tlv_enc_lim(struct sk_buff *skb, __u8 *raw);
__u32 ip6_tnl_get_cap(struct ip6_tnl *t, const struct in6_addr *laddr,
const struct in6_addr *raddr);
struct net *ip6_tnl_get_link_net(const struct net_device *dev);
static inline void ip6tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
{
......
......@@ -141,6 +141,7 @@ int ip_tunnel_encap_del_ops(const struct ip_tunnel_encap_ops *op,
int ip_tunnel_init(struct net_device *dev);
void ip_tunnel_uninit(struct net_device *dev);
void ip_tunnel_dellink(struct net_device *dev, struct list_head *head);
struct net *ip_tunnel_get_link_net(const struct net_device *dev);
int ip_tunnel_init_net(struct net *net, int ip_tnl_net_id,
struct rtnl_link_ops *ops, char *devname);
......
......@@ -60,6 +60,7 @@ struct net {
struct list_head exit_list; /* Use only net_mutex */
struct user_namespace *user_ns; /* Owning user namespace */
struct idr netns_ids;
struct ns_common ns;
......@@ -290,6 +291,9 @@ static inline struct net *read_pnet(struct net * const *pnet)
#define __net_initconst __initconst
#endif
int peernet2id(struct net *net, struct net *peer);
struct net *get_net_ns_by_id(struct net *net, int id);
struct pernet_operations {
struct list_head list;
int (*init)(struct net *net);
......
......@@ -46,6 +46,7 @@ static inline int rtnl_msg_family(const struct nlmsghdr *nlh)
* to create when creating a new device.
* @get_num_rx_queues: Function to determine number of receive queues
* to create when creating a new device.
* @get_link_net: Function to get the i/o netns of the device
*/
struct rtnl_link_ops {
struct list_head list;
......@@ -93,6 +94,7 @@ struct rtnl_link_ops {
int (*fill_slave_info)(struct sk_buff *skb,
const struct net_device *dev,
const struct net_device *slave_dev);
struct net *(*get_link_net)(const struct net_device *dev);
};
int __rtnl_link_register(struct rtnl_link_ops *ops);
......
......@@ -283,6 +283,7 @@ header-y += net.h
header-y += netlink_diag.h
header-y += netlink.h
header-y += netrom.h
header-y += net_namespace.h
header-y += net_tstamp.h
header-y += nfc.h
header-y += nfs2.h
......
......@@ -146,6 +146,7 @@ enum {
IFLA_PHYS_PORT_ID,
IFLA_CARRIER_CHANGES,
IFLA_PHYS_SWITCH_ID,
IFLA_LINK_NETNSID,
__IFLA_MAX
};
......
/* Copyright (c) 2015 6WIND S.A.
* Author: Nicolas Dichtel <nicolas.dichtel@6wind.com>
*
* This program is free software; you can redistribute it and/or modify it
* under the terms and conditions of the GNU General Public License,
* version 2, as published by the Free Software Foundation.
*/
#ifndef _UAPI_LINUX_NET_NAMESPACE_H_
#define _UAPI_LINUX_NET_NAMESPACE_H_
/* Attributes of RTM_NEWNSID/RTM_GETNSID messages */
enum {
NETNSA_NONE,
#define NETNSA_NSID_NOT_ASSIGNED -1
NETNSA_NSID,
NETNSA_PID,
NETNSA_FD,
__NETNSA_MAX,
};
#define NETNSA_MAX (__NETNSA_MAX - 1)
#endif /* _UAPI_LINUX_NET_NAMESPACE_H_ */
......@@ -132,6 +132,11 @@ enum {
RTM_GETMDB = 86,
#define RTM_GETMDB RTM_GETMDB
RTM_NEWNSID = 88,
#define RTM_NEWNSID RTM_NEWNSID
RTM_GETNSID = 90,
#define RTM_GETNSID RTM_GETNSID
__RTM_MAX,
#define RTM_MAX (((__RTM_MAX + 3) & ~3) - 1)
};
......
......@@ -15,6 +15,10 @@
#include <linux/file.h>
#include <linux/export.h>
#include <linux/user_namespace.h>
#include <linux/net_namespace.h>
#include <linux/rtnetlink.h>
#include <net/sock.h>
#include <net/netlink.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
......@@ -144,6 +148,77 @@ static void ops_free_list(const struct pernet_operations *ops,
}
}
static int alloc_netid(struct net *net, struct net *peer, int reqid)
{
int min = 0, max = 0;
ASSERT_RTNL();
if (reqid >= 0) {
min = reqid;
max = reqid + 1;
}
return idr_alloc(&net->netns_ids, peer, min, max, GFP_KERNEL);
}
/* This function is used by idr_for_each(). If net is equal to peer, the
* function returns the id so that idr_for_each() stops. Because we cannot
* returns the id 0 (idr_for_each() will not stop), we return the magic value
* NET_ID_ZERO (-1) for it.
*/
#define NET_ID_ZERO -1
static int net_eq_idr(int id, void *net, void *peer)
{
if (net_eq(net, peer))
return id ? : NET_ID_ZERO;
return 0;
}
static int __peernet2id(struct net *net, struct net *peer, bool alloc)
{
int id = idr_for_each(&net->netns_ids, net_eq_idr, peer);
ASSERT_RTNL();
/* Magic value for id 0. */
if (id == NET_ID_ZERO)
return 0;
if (id > 0)
return id;
if (alloc)
return alloc_netid(net, peer, -1);
return -ENOENT;
}
/* This function returns the id of a peer netns. If no id is assigned, one will
* be allocated and returned.
*/
int peernet2id(struct net *net, struct net *peer)
{
int id = __peernet2id(net, peer, true);
return id >= 0 ? id : NETNSA_NSID_NOT_ASSIGNED;
}
struct net *get_net_ns_by_id(struct net *net, int id)
{
struct net *peer;
if (id < 0)
return NULL;
rcu_read_lock();
peer = idr_find(&net->netns_ids, id);
if (peer)
get_net(peer);
rcu_read_unlock();
return peer;
}
/*
* setup_net runs the initializers for the network namespace object.
*/
......@@ -158,6 +233,7 @@ static __net_init int setup_net(struct net *net, struct user_namespace *user_ns)
atomic_set(&net->passive, 1);
net->dev_base_seq = 1;
net->user_ns = user_ns;
idr_init(&net->netns_ids);
#ifdef NETNS_REFCNT_DEBUG
atomic_set(&net->use_count, 0);
......@@ -288,6 +364,14 @@ static void cleanup_net(struct work_struct *work)
list_for_each_entry(net, &net_kill_list, cleanup_list) {
list_del_rcu(&net->list);
list_add_tail(&net->exit_list, &net_exit_list);
for_each_net(tmp) {
int id = __peernet2id(tmp, net, false);
if (id >= 0)
idr_remove(&tmp->netns_ids, id);
}
idr_destroy(&net->netns_ids);
}
rtnl_unlock();
......@@ -402,6 +486,130 @@ static struct pernet_operations __net_initdata net_ns_ops = {
.exit = net_ns_net_exit,
};
static struct nla_policy rtnl_net_policy[NETNSA_MAX + 1] = {
[NETNSA_NONE] = { .type = NLA_UNSPEC },
[NETNSA_NSID] = { .type = NLA_S32 },
[NETNSA_PID] = { .type = NLA_U32 },
[NETNSA_FD] = { .type = NLA_U32 },
};
static int rtnl_net_newid(struct sk_buff *skb, struct nlmsghdr *nlh)
{
struct net *net = sock_net(skb->sk);
struct nlattr *tb[NETNSA_MAX + 1];
struct net *peer;
int nsid, err;
err = nlmsg_parse(nlh, sizeof(struct rtgenmsg), tb, NETNSA_MAX,
rtnl_net_policy);
if (err < 0)
return err;
if (!tb[NETNSA_NSID])
return -EINVAL;
nsid = nla_get_s32(tb[NETNSA_NSID]);
if (tb[NETNSA_PID])
peer = get_net_ns_by_pid(nla_get_u32(tb[NETNSA_PID]));
else if (tb[NETNSA_FD])
peer = get_net_ns_by_fd(nla_get_u32(tb[NETNSA_FD]));
else
return -EINVAL;
if (IS_ERR(peer))
return PTR_ERR(peer);
if (__peernet2id(net, peer, false) >= 0) {
err = -EEXIST;
goto out;
}
err = alloc_netid(net, peer, nsid);
if (err > 0)
err = 0;
out:
put_net(peer);
return err;
}
static int rtnl_net_get_size(void)
{
return NLMSG_ALIGN(sizeof(struct rtgenmsg))
+ nla_total_size(sizeof(s32)) /* NETNSA_NSID */
;
}
static int rtnl_net_fill(struct sk_buff *skb, u32 portid, u32 seq, int flags,
int cmd, struct net *net, struct net *peer)
{
struct nlmsghdr *nlh;
struct rtgenmsg *rth;
int id;
ASSERT_RTNL();
nlh = nlmsg_put(skb, portid, seq, cmd, sizeof(*rth), flags);
if (!nlh)
return -EMSGSIZE;
rth = nlmsg_data(nlh);
rth->rtgen_family = AF_UNSPEC;
id = __peernet2id(net, peer, false);
if (id < 0)
id = NETNSA_NSID_NOT_ASSIGNED;
if (nla_put_s32(skb, NETNSA_NSID, id))
goto nla_put_failure;
nlmsg_end(skb, nlh);
return 0;
nla_put_failure:
nlmsg_cancel(skb, nlh);
return -EMSGSIZE;
}
static int rtnl_net_getid(struct sk_buff *skb, struct nlmsghdr *nlh)
{
struct net *net = sock_net(skb->sk);
struct nlattr *tb[NETNSA_MAX + 1];
struct sk_buff *msg;
int err = -ENOBUFS;
struct net *peer;
err = nlmsg_parse(nlh, sizeof(struct rtgenmsg), tb, NETNSA_MAX,
rtnl_net_policy);
if (err < 0)
return err;
if (tb[NETNSA_PID])
peer = get_net_ns_by_pid(nla_get_u32(tb[NETNSA_PID]));
else if (tb[NETNSA_FD])
peer = get_net_ns_by_fd(nla_get_u32(tb[NETNSA_FD]));
else
return -EINVAL;
if (IS_ERR(peer))
return PTR_ERR(peer);
msg = nlmsg_new(rtnl_net_get_size(), GFP_KERNEL);
if (!msg) {
err = -ENOMEM;
goto out;
}
err = rtnl_net_fill(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0,
RTM_GETNSID, net, peer);
if (err < 0)
goto err_out;
err = rtnl_unicast(msg, net, NETLINK_CB(skb).portid);
goto out;
err_out:
nlmsg_free(msg);
out:
put_net(peer);
return err;
}
static int __init net_ns_init(void)
{
struct net_generic *ng;
......@@ -435,6 +643,9 @@ static int __init net_ns_init(void)
register_pernet_subsys(&net_ns_ops);
rtnl_register(PF_UNSPEC, RTM_NEWNSID, rtnl_net_newid, NULL, NULL);
rtnl_register(PF_UNSPEC, RTM_GETNSID, rtnl_net_getid, NULL, NULL);
return 0;
}
......
......@@ -875,6 +875,7 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev,
+ nla_total_size(1) /* IFLA_OPERSTATE */
+ nla_total_size(1) /* IFLA_LINKMODE */
+ nla_total_size(4) /* IFLA_CARRIER_CHANGES */
+ nla_total_size(4) /* IFLA_LINK_NETNSID */
+ nla_total_size(ext_filter_mask
& RTEXT_FILTER_VF ? 4 : 0) /* IFLA_NUM_VF */
+ rtnl_vfinfo_size(dev, ext_filter_mask) /* IFLA_VFINFO_LIST */
......@@ -1169,6 +1170,18 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,
goto nla_put_failure;
}
if (dev->rtnl_link_ops &&
dev->rtnl_link_ops->get_link_net) {
struct net *link_net = dev->rtnl_link_ops->get_link_net(dev);
if (!net_eq(dev_net(dev), link_net)) {
int id = peernet2id(dev_net(dev), link_net);
if (nla_put_s32(skb, IFLA_LINK_NETNSID, id))
goto nla_put_failure;
}
}
if (!(af_spec = nla_nest_start(skb, IFLA_AF_SPEC)))
goto nla_put_failure;
......@@ -1235,6 +1248,7 @@ static const struct nla_policy ifla_policy[IFLA_MAX+1] = {
[IFLA_PHYS_PORT_ID] = { .type = NLA_BINARY, .len = MAX_PHYS_ITEM_ID_LEN },
[IFLA_CARRIER_CHANGES] = { .type = NLA_U32 }, /* ignored */
[IFLA_PHYS_SWITCH_ID] = { .type = NLA_BINARY, .len = MAX_PHYS_ITEM_ID_LEN },
[IFLA_LINK_NETNSID] = { .type = NLA_S32 },
};
static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = {
......@@ -2008,7 +2022,7 @@ static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh)
struct nlattr *slave_attr[m_ops ? m_ops->slave_maxtype + 1 : 0];
struct nlattr **data = NULL;
struct nlattr **slave_data = NULL;
struct net *dest_net;
struct net *dest_net, *link_net = NULL;
if (ops) {
if (ops->maxtype && linkinfo[IFLA_INFO_DATA]) {
......@@ -2114,7 +2128,18 @@ static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh)
if (IS_ERR(dest_net))
return PTR_ERR(dest_net);
dev = rtnl_create_link(dest_net, ifname, name_assign_type, ops, tb);
if (tb[IFLA_LINK_NETNSID]) {
int id = nla_get_s32(tb[IFLA_LINK_NETNSID]);
link_net = get_net_ns_by_id(dest_net, id);
if (!link_net) {
err = -EINVAL;
goto out;
}
}
dev = rtnl_create_link(link_net ? : dest_net, ifname,
name_assign_type, ops, tb);
if (IS_ERR(dev)) {
err = PTR_ERR(dev);
goto out;
......@@ -2142,9 +2167,16 @@ static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh)
}
}
err = rtnl_configure_link(dev, ifm);
if (err < 0)
if (err < 0) {
unregister_netdevice(dev);
goto out;
}
if (link_net)
err = dev_change_net_namespace(dev, dest_net, ifname);
out:
if (link_net)
put_net(link_net);
put_net(dest_net);
return err;
}
......
......@@ -829,6 +829,7 @@ static struct rtnl_link_ops ipgre_link_ops __read_mostly = {
.dellink = ip_tunnel_dellink,
.get_size = ipgre_get_size,
.fill_info = ipgre_fill_info,
.get_link_net = ip_tunnel_get_link_net,
};
static struct rtnl_link_ops ipgre_tap_ops __read_mostly = {
......@@ -843,6 +844,7 @@ static struct rtnl_link_ops ipgre_tap_ops __read_mostly = {
.dellink = ip_tunnel_dellink,
.get_size = ipgre_get_size,
.fill_info = ipgre_fill_info,
.get_link_net = ip_tunnel_get_link_net,
};
static int __net_init ipgre_tap_init_net(struct net *net)
......
......@@ -972,6 +972,14 @@ void ip_tunnel_dellink(struct net_device *dev, struct list_head *head)
}
EXPORT_SYMBOL_GPL(ip_tunnel_dellink);
struct net *ip_tunnel_get_link_net(const struct net_device *dev)
{
struct ip_tunnel *tunnel = netdev_priv(dev);
return tunnel->net;
}
EXPORT_SYMBOL(ip_tunnel_get_link_net);
int ip_tunnel_init_net(struct net *net, int ip_tnl_net_id,
struct rtnl_link_ops *ops, char *devname)
{
......
......@@ -531,6 +531,7 @@ static struct rtnl_link_ops vti_link_ops __read_mostly = {
.dellink = ip_tunnel_dellink,
.get_size = vti_get_size,
.fill_info = vti_fill_info,
.get_link_net = ip_tunnel_get_link_net,
};
static int __init vti_init(void)
......
......@@ -498,6 +498,7 @@ static struct rtnl_link_ops ipip_link_ops __read_mostly = {
.dellink = ip_tunnel_dellink,
.get_size = ipip_get_size,
.fill_info = ipip_fill_info,
.get_link_net = ip_tunnel_get_link_net,
};
static struct xfrm_tunnel ipip_handler __read_mostly = {
......
......@@ -1662,6 +1662,7 @@ static struct rtnl_link_ops ip6gre_link_ops __read_mostly = {
.dellink = ip6gre_dellink,
.get_size = ip6gre_get_size,
.fill_info = ip6gre_fill_info,
.get_link_net = ip6_tnl_get_link_net,
};
static struct rtnl_link_ops ip6gre_tap_ops __read_mostly = {
......
......@@ -1760,6 +1760,14 @@ static int ip6_tnl_fill_info(struct sk_buff *skb, const struct net_device *dev)
return -EMSGSIZE;
}
struct net *ip6_tnl_get_link_net(const struct net_device *dev)
{
struct ip6_tnl *tunnel = netdev_priv(dev);
return tunnel->net;
}
EXPORT_SYMBOL(ip6_tnl_get_link_net);
static const struct nla_policy ip6_tnl_policy[IFLA_IPTUN_MAX + 1] = {
[IFLA_IPTUN_LINK] = { .type = NLA_U32 },
[IFLA_IPTUN_LOCAL] = { .len = sizeof(struct in6_addr) },
......@@ -1783,6 +1791,7 @@ static struct rtnl_link_ops ip6_link_ops __read_mostly = {
.dellink = ip6_tnl_dellink,
.get_size = ip6_tnl_get_size,
.fill_info = ip6_tnl_fill_info,
.get_link_net = ip6_tnl_get_link_net,
};
static struct xfrm6_tunnel ip4ip6_handler __read_mostly = {
......
......@@ -1016,6 +1016,7 @@ static struct rtnl_link_ops vti6_link_ops __read_mostly = {
.changelink = vti6_changelink,
.get_size = vti6_get_size,
.fill_info = vti6_fill_info,
.get_link_net = ip6_tnl_get_link_net,
};
static void __net_exit vti6_destroy_tunnels(struct vti6_net *ip6n)
......
......@@ -1763,6 +1763,7 @@ static struct rtnl_link_ops sit_link_ops __read_mostly = {
.get_size = ipip6_get_size,
.fill_info = ipip6_fill_info,
.dellink = ipip6_dellink,
.get_link_net = ip_tunnel_get_link_net,
};
static struct xfrm_tunnel sit_handler __read_mostly = {
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册