提交 4d95b72f 编写于 作者: D David S. Miller

Merge branch 'netns-scalability'

Nicolas Dichtel says:

====================
netns: ease netlink use with a lot of netns

This idea was informally discussed in Ottawa / netdev0.1. The goal is to
ease the use/scalability of netns, from a userland point of view.
Today, users need to open one netlink socket per family and per netns.
Thus, when the number of netns increases (for example 5K or more), the
number of sockets needed to manage them grows a lot.

The goal of this series is to be able to monitor netlink events, for a
specified family, for a set of netns, with only one netlink socket. For
this purpose, a netlink socket option is added: NETLINK_LISTEN_ALL_NSID.
When this option is set on a netlink socket, this socket will receive
netlink notifications from all netns that have a nsid assigned into the
netns where the socket has been opened.
The nsid is sent to userland via ancillary data.

Here is an example with a patched iproute2. vxlan10 is created in the
current netns (netns0, nsid 0) and then moved to another netns (netns1,
nsid 1):

$ ip netns exec netns0 ip monitor all-nsid label
[nsid 0][NSID]nsid 1 (iproute2 netns name: netns1)
[nsid 0][NEIGH]??? lladdr 00:00:00:00:00:00 REACHABLE,PERMANENT
[nsid 0][LINK]5: vxlan10@NONE: <BROADCAST,MULTICAST> mtu 1450 qdisc noop state DOWN group default
    link/ether 92:33:17:e6:e7:1d brd ff:ff:ff:ff:ff:ff
[nsid 0][LINK]Deleted 5: vxlan10@NONE: <BROADCAST,MULTICAST> mtu 1450 qdisc noop state DOWN group default
    link/ether 92:33:17:e6:e7:1d brd ff:ff:ff:ff:ff:ff
[nsid 1][NSID]nsid 0 (iproute2 netns name: netns0)
[nsid 1][LINK]5: vxlan10@NONE: <BROADCAST,MULTICAST> mtu 1450 qdisc noop state DOWN group default
    link/ether 92:33:17:e6:e7:1d brd ff:ff:ff:ff:ff:ff link-netnsid 0
[nsid 1][ADDR]5: vxlan10    inet 192.168.0.249/24 brd 192.168.0.255 scope global vxlan10
       valid_lft forever preferred_lft forever
[nsid 1][ROUTE]local 192.168.0.249 dev vxlan10  table local  proto kernel  scope host  src 192.168.0.249
[nsid 1][ROUTE]ff00::/8 dev vxlan10  table local  metric 256  pref medium
[nsid 1][ROUTE]2001:123::/64 dev vxlan10  proto kernel  metric 256  pref medium
[nsid 1][LINK]5: vxlan10@NONE: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1450 qdisc noqueue state UNKNOWN group default
    link/ether 92:33:17:e6:e7:1d brd ff:ff:ff:ff:ff:ff link-netnsid 0
[nsid 1][ROUTE]broadcast 192.168.0.255 dev vxlan10  table local  proto kernel  scope link  src 192.168.0.249
[nsid 1][ROUTE]192.168.0.0/24 dev vxlan10  proto kernel  scope link  src 192.168.0.249
[nsid 1][ROUTE]broadcast 192.168.0.0 dev vxlan10  table local  proto kernel  scope link  src 192.168.0.249
[nsid 1][ROUTE]fe80::/64 dev vxlan10  proto kernel  metric 256  pref medium
====================
Signed-off-by: David S. Miller <davem@davemloft.net>
...@@ -336,7 +336,7 @@ static int vxlan_fdb_info(struct sk_buff *skb, struct vxlan_dev *vxlan, ...@@ -336,7 +336,7 @@ static int vxlan_fdb_info(struct sk_buff *skb, struct vxlan_dev *vxlan,
if (!net_eq(dev_net(vxlan->dev), vxlan->net) && if (!net_eq(dev_net(vxlan->dev), vxlan->net) &&
nla_put_s32(skb, NDA_LINK_NETNSID, nla_put_s32(skb, NDA_LINK_NETNSID,
peernet2id(dev_net(vxlan->dev), vxlan->net))) peernet2id_alloc(dev_net(vxlan->dev), vxlan->net)))
goto nla_put_failure; goto nla_put_failure;
if (send_eth && nla_put(skb, NDA_LLADDR, ETH_ALEN, &fdb->eth_addr)) if (send_eth && nla_put(skb, NDA_LLADDR, ETH_ALEN, &fdb->eth_addr))
......
...@@ -28,6 +28,8 @@ struct netlink_skb_parms { ...@@ -28,6 +28,8 @@ struct netlink_skb_parms {
__u32 dst_group; __u32 dst_group;
__u32 flags; __u32 flags;
struct sock *sk; struct sock *sk;
bool nsid_is_set;
int nsid;
}; };
#define NETLINK_CB(skb) (*(struct netlink_skb_parms*)&((skb)->cb)) #define NETLINK_CB(skb) (*(struct netlink_skb_parms*)&((skb)->cb))
......
...@@ -271,7 +271,9 @@ static inline struct net *read_pnet(const possible_net_t *pnet) ...@@ -271,7 +271,9 @@ static inline struct net *read_pnet(const possible_net_t *pnet)
#define __net_initconst __initconst #define __net_initconst __initconst
#endif #endif
int peernet2id_alloc(struct net *net, struct net *peer);
int peernet2id(struct net *net, struct net *peer); int peernet2id(struct net *net, struct net *peer);
bool peernet_has_id(struct net *net, struct net *peer);
struct net *get_net_ns_by_id(struct net *net, int id); struct net *get_net_ns_by_id(struct net *net, int id);
struct pernet_operations { struct pernet_operations {
......
...@@ -108,6 +108,7 @@ struct nlmsgerr { ...@@ -108,6 +108,7 @@ struct nlmsgerr {
#define NETLINK_NO_ENOBUFS 5 #define NETLINK_NO_ENOBUFS 5
#define NETLINK_RX_RING 6 #define NETLINK_RX_RING 6
#define NETLINK_TX_RING 7 #define NETLINK_TX_RING 7
#define NETLINK_LISTEN_ALL_NSID 8
struct nl_pktinfo { struct nl_pktinfo {
__u32 group; __u32 group;
......
...@@ -28,6 +28,7 @@ ...@@ -28,6 +28,7 @@
static LIST_HEAD(pernet_list); static LIST_HEAD(pernet_list);
static struct list_head *first_device = &pernet_list; static struct list_head *first_device = &pernet_list;
DEFINE_MUTEX(net_mutex); DEFINE_MUTEX(net_mutex);
static DEFINE_SPINLOCK(nsid_lock);
LIST_HEAD(net_namespace_list); LIST_HEAD(net_namespace_list);
EXPORT_SYMBOL_GPL(net_namespace_list); EXPORT_SYMBOL_GPL(net_namespace_list);
...@@ -147,24 +148,17 @@ static void ops_free_list(const struct pernet_operations *ops, ...@@ -147,24 +148,17 @@ static void ops_free_list(const struct pernet_operations *ops,
} }
} }
static void rtnl_net_notifyid(struct net *net, struct net *peer, int cmd, /* should be called with nsid_lock held */
int id);
static int alloc_netid(struct net *net, struct net *peer, int reqid) static int alloc_netid(struct net *net, struct net *peer, int reqid)
{ {
int min = 0, max = 0, id; int min = 0, max = 0;
ASSERT_RTNL();
if (reqid >= 0) { if (reqid >= 0) {
min = reqid; min = reqid;
max = reqid + 1; max = reqid + 1;
} }
id = idr_alloc(&net->netns_ids, peer, min, max, GFP_KERNEL); return idr_alloc(&net->netns_ids, peer, min, max, GFP_ATOMIC);
if (id >= 0)
rtnl_net_notifyid(net, peer, RTM_NEWNSID, id);
return id;
} }
/* This function is used by idr_for_each(). If net is equal to peer, the /* This function is used by idr_for_each(). If net is equal to peer, the
...@@ -180,11 +174,16 @@ static int net_eq_idr(int id, void *net, void *peer) ...@@ -180,11 +174,16 @@ static int net_eq_idr(int id, void *net, void *peer)
return 0; return 0;
} }
static int __peernet2id(struct net *net, struct net *peer, bool alloc) /* Should be called with nsid_lock held. If a new id is assigned, the bool alloc
* is set to true, thus the caller knows that the new id must be notified via
* rtnl.
*/
static int __peernet2id_alloc(struct net *net, struct net *peer, bool *alloc)
{ {
int id = idr_for_each(&net->netns_ids, net_eq_idr, peer); int id = idr_for_each(&net->netns_ids, net_eq_idr, peer);
bool alloc_it = *alloc;
ASSERT_RTNL(); *alloc = false;
/* Magic value for id 0. */ /* Magic value for id 0. */
if (id == NET_ID_ZERO) if (id == NET_ID_ZERO)
...@@ -192,36 +191,77 @@ static int __peernet2id(struct net *net, struct net *peer, bool alloc) ...@@ -192,36 +191,77 @@ static int __peernet2id(struct net *net, struct net *peer, bool alloc)
if (id > 0) if (id > 0)
return id; return id;
if (alloc) if (alloc_it) {
return alloc_netid(net, peer, -1); id = alloc_netid(net, peer, -1);
*alloc = true;
return id >= 0 ? id : NETNSA_NSID_NOT_ASSIGNED;
}
return -ENOENT; return NETNSA_NSID_NOT_ASSIGNED;
} }
/* should be called with nsid_lock held */
static int __peernet2id(struct net *net, struct net *peer)
{
bool no = false;
return __peernet2id_alloc(net, peer, &no);
}
static void rtnl_net_notifyid(struct net *net, int cmd, int id);
/* This function returns the id of a peer netns. If no id is assigned, one will /* This function returns the id of a peer netns. If no id is assigned, one will
* be allocated and returned. * be allocated and returned.
*/ */
int peernet2id_alloc(struct net *net, struct net *peer)
{
unsigned long flags;
bool alloc;
int id;
spin_lock_irqsave(&nsid_lock, flags);
alloc = atomic_read(&peer->count) == 0 ? false : true;
id = __peernet2id_alloc(net, peer, &alloc);
spin_unlock_irqrestore(&nsid_lock, flags);
if (alloc && id >= 0)
rtnl_net_notifyid(net, RTM_NEWNSID, id);
return id;
}
EXPORT_SYMBOL(peernet2id_alloc);
/* This function returns, if assigned, the id of a peer netns. */
int peernet2id(struct net *net, struct net *peer) int peernet2id(struct net *net, struct net *peer)
{ {
bool alloc = atomic_read(&peer->count) == 0 ? false : true; unsigned long flags;
int id; int id;
id = __peernet2id(net, peer, alloc); spin_lock_irqsave(&nsid_lock, flags);
return id >= 0 ? id : NETNSA_NSID_NOT_ASSIGNED; id = __peernet2id(net, peer);
spin_unlock_irqrestore(&nsid_lock, flags);
return id;
}
/* This function returns true if the peer netns has an id assigned into the
* current netns.
*/
bool peernet_has_id(struct net *net, struct net *peer)
{
return peernet2id(net, peer) >= 0;
} }
EXPORT_SYMBOL(peernet2id);
struct net *get_net_ns_by_id(struct net *net, int id) struct net *get_net_ns_by_id(struct net *net, int id)
{ {
unsigned long flags;
struct net *peer; struct net *peer;
if (id < 0) if (id < 0)
return NULL; return NULL;
rcu_read_lock(); rcu_read_lock();
spin_lock_irqsave(&nsid_lock, flags);
peer = idr_find(&net->netns_ids, id); peer = idr_find(&net->netns_ids, id);
if (peer) if (peer)
get_net(peer); get_net(peer);
spin_unlock_irqrestore(&nsid_lock, flags);
rcu_read_unlock(); rcu_read_unlock();
return peer; return peer;
...@@ -362,14 +402,19 @@ static void cleanup_net(struct work_struct *work) ...@@ -362,14 +402,19 @@ static void cleanup_net(struct work_struct *work)
list_del_rcu(&net->list); list_del_rcu(&net->list);
list_add_tail(&net->exit_list, &net_exit_list); list_add_tail(&net->exit_list, &net_exit_list);
for_each_net(tmp) { for_each_net(tmp) {
int id = __peernet2id(tmp, net, false); int id;
if (id >= 0) { spin_lock_irq(&nsid_lock);
rtnl_net_notifyid(tmp, net, RTM_DELNSID, id); id = __peernet2id(tmp, net);
if (id >= 0)
idr_remove(&tmp->netns_ids, id); idr_remove(&tmp->netns_ids, id);
spin_unlock_irq(&nsid_lock);
if (id >= 0)
rtnl_net_notifyid(tmp, RTM_DELNSID, id);
} }
} spin_lock_irq(&nsid_lock);
idr_destroy(&net->netns_ids); idr_destroy(&net->netns_ids);
spin_unlock_irq(&nsid_lock);
} }
rtnl_unlock(); rtnl_unlock();
...@@ -497,6 +542,7 @@ static int rtnl_net_newid(struct sk_buff *skb, struct nlmsghdr *nlh) ...@@ -497,6 +542,7 @@ static int rtnl_net_newid(struct sk_buff *skb, struct nlmsghdr *nlh)
{ {
struct net *net = sock_net(skb->sk); struct net *net = sock_net(skb->sk);
struct nlattr *tb[NETNSA_MAX + 1]; struct nlattr *tb[NETNSA_MAX + 1];
unsigned long flags;
struct net *peer; struct net *peer;
int nsid, err; int nsid, err;
...@@ -517,14 +563,18 @@ static int rtnl_net_newid(struct sk_buff *skb, struct nlmsghdr *nlh) ...@@ -517,14 +563,18 @@ static int rtnl_net_newid(struct sk_buff *skb, struct nlmsghdr *nlh)
if (IS_ERR(peer)) if (IS_ERR(peer))
return PTR_ERR(peer); return PTR_ERR(peer);
if (__peernet2id(net, peer, false) >= 0) { spin_lock_irqsave(&nsid_lock, flags);
if (__peernet2id(net, peer) >= 0) {
err = -EEXIST; err = -EEXIST;
goto out; goto out;
} }
err = alloc_netid(net, peer, nsid); err = alloc_netid(net, peer, nsid);
if (err > 0) spin_unlock_irqrestore(&nsid_lock, flags);
if (err >= 0) {
rtnl_net_notifyid(net, RTM_NEWNSID, err);
err = 0; err = 0;
}
out: out:
put_net(peer); put_net(peer);
return err; return err;
...@@ -538,14 +588,10 @@ static int rtnl_net_get_size(void) ...@@ -538,14 +588,10 @@ static int rtnl_net_get_size(void)
} }
static int rtnl_net_fill(struct sk_buff *skb, u32 portid, u32 seq, int flags, static int rtnl_net_fill(struct sk_buff *skb, u32 portid, u32 seq, int flags,
int cmd, struct net *net, struct net *peer, int cmd, struct net *net, int nsid)
int nsid)
{ {
struct nlmsghdr *nlh; struct nlmsghdr *nlh;
struct rtgenmsg *rth; struct rtgenmsg *rth;
int id;
ASSERT_RTNL();
nlh = nlmsg_put(skb, portid, seq, cmd, sizeof(*rth), flags); nlh = nlmsg_put(skb, portid, seq, cmd, sizeof(*rth), flags);
if (!nlh) if (!nlh)
...@@ -554,14 +600,7 @@ static int rtnl_net_fill(struct sk_buff *skb, u32 portid, u32 seq, int flags, ...@@ -554,14 +600,7 @@ static int rtnl_net_fill(struct sk_buff *skb, u32 portid, u32 seq, int flags,
rth = nlmsg_data(nlh); rth = nlmsg_data(nlh);
rth->rtgen_family = AF_UNSPEC; rth->rtgen_family = AF_UNSPEC;
if (nsid >= 0) { if (nla_put_s32(skb, NETNSA_NSID, nsid))
id = nsid;
} else {
id = __peernet2id(net, peer, false);
if (id < 0)
id = NETNSA_NSID_NOT_ASSIGNED;
}
if (nla_put_s32(skb, NETNSA_NSID, id))
goto nla_put_failure; goto nla_put_failure;
nlmsg_end(skb, nlh); nlmsg_end(skb, nlh);
...@@ -578,7 +617,7 @@ static int rtnl_net_getid(struct sk_buff *skb, struct nlmsghdr *nlh) ...@@ -578,7 +617,7 @@ static int rtnl_net_getid(struct sk_buff *skb, struct nlmsghdr *nlh)
struct nlattr *tb[NETNSA_MAX + 1]; struct nlattr *tb[NETNSA_MAX + 1];
struct sk_buff *msg; struct sk_buff *msg;
struct net *peer; struct net *peer;
int err; int err, id;
err = nlmsg_parse(nlh, sizeof(struct rtgenmsg), tb, NETNSA_MAX, err = nlmsg_parse(nlh, sizeof(struct rtgenmsg), tb, NETNSA_MAX,
rtnl_net_policy); rtnl_net_policy);
...@@ -600,8 +639,9 @@ static int rtnl_net_getid(struct sk_buff *skb, struct nlmsghdr *nlh) ...@@ -600,8 +639,9 @@ static int rtnl_net_getid(struct sk_buff *skb, struct nlmsghdr *nlh)
goto out; goto out;
} }
id = peernet2id(net, peer);
err = rtnl_net_fill(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0, err = rtnl_net_fill(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0,
RTM_GETNSID, net, peer, -1); RTM_GETNSID, net, id);
if (err < 0) if (err < 0)
goto err_out; goto err_out;
...@@ -633,7 +673,7 @@ static int rtnl_net_dumpid_one(int id, void *peer, void *data) ...@@ -633,7 +673,7 @@ static int rtnl_net_dumpid_one(int id, void *peer, void *data)
ret = rtnl_net_fill(net_cb->skb, NETLINK_CB(net_cb->cb->skb).portid, ret = rtnl_net_fill(net_cb->skb, NETLINK_CB(net_cb->cb->skb).portid,
net_cb->cb->nlh->nlmsg_seq, NLM_F_MULTI, net_cb->cb->nlh->nlmsg_seq, NLM_F_MULTI,
RTM_NEWNSID, net_cb->net, peer, id); RTM_NEWNSID, net_cb->net, id);
if (ret < 0) if (ret < 0)
return ret; return ret;
...@@ -652,17 +692,17 @@ static int rtnl_net_dumpid(struct sk_buff *skb, struct netlink_callback *cb) ...@@ -652,17 +692,17 @@ static int rtnl_net_dumpid(struct sk_buff *skb, struct netlink_callback *cb)
.idx = 0, .idx = 0,
.s_idx = cb->args[0], .s_idx = cb->args[0],
}; };
unsigned long flags;
ASSERT_RTNL(); spin_lock_irqsave(&nsid_lock, flags);
idr_for_each(&net->netns_ids, rtnl_net_dumpid_one, &net_cb); idr_for_each(&net->netns_ids, rtnl_net_dumpid_one, &net_cb);
spin_unlock_irqrestore(&nsid_lock, flags);
cb->args[0] = net_cb.idx; cb->args[0] = net_cb.idx;
return skb->len; return skb->len;
} }
static void rtnl_net_notifyid(struct net *net, struct net *peer, int cmd, static void rtnl_net_notifyid(struct net *net, int cmd, int id)
int id)
{ {
struct sk_buff *msg; struct sk_buff *msg;
int err = -ENOMEM; int err = -ENOMEM;
...@@ -671,7 +711,7 @@ static void rtnl_net_notifyid(struct net *net, struct net *peer, int cmd, ...@@ -671,7 +711,7 @@ static void rtnl_net_notifyid(struct net *net, struct net *peer, int cmd,
if (!msg) if (!msg)
goto out; goto out;
err = rtnl_net_fill(msg, 0, 0, 0, cmd, net, peer, id); err = rtnl_net_fill(msg, 0, 0, 0, cmd, net, id);
if (err < 0) if (err < 0)
goto err_out; goto err_out;
......
...@@ -1204,7 +1204,7 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, ...@@ -1204,7 +1204,7 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,
struct net *link_net = dev->rtnl_link_ops->get_link_net(dev); struct net *link_net = dev->rtnl_link_ops->get_link_net(dev);
if (!net_eq(dev_net(dev), link_net)) { if (!net_eq(dev_net(dev), link_net)) {
int id = peernet2id(dev_net(dev), link_net); int id = peernet2id_alloc(dev_net(dev), link_net);
if (nla_put_s32(skb, IFLA_LINK_NETNSID, id)) if (nla_put_s32(skb, IFLA_LINK_NETNSID, id))
goto nla_put_failure; goto nla_put_failure;
......
...@@ -76,17 +76,18 @@ struct listeners { ...@@ -76,17 +76,18 @@ struct listeners {
}; };
/* state bits */ /* state bits */
#define NETLINK_CONGESTED 0x0 #define NETLINK_S_CONGESTED 0x0
/* flags */ /* flags */
#define NETLINK_KERNEL_SOCKET 0x1 #define NETLINK_F_KERNEL_SOCKET 0x1
#define NETLINK_RECV_PKTINFO 0x2 #define NETLINK_F_RECV_PKTINFO 0x2
#define NETLINK_BROADCAST_SEND_ERROR 0x4 #define NETLINK_F_BROADCAST_SEND_ERROR 0x4
#define NETLINK_RECV_NO_ENOBUFS 0x8 #define NETLINK_F_RECV_NO_ENOBUFS 0x8
#define NETLINK_F_LISTEN_ALL_NSID 0x10
static inline int netlink_is_kernel(struct sock *sk) static inline int netlink_is_kernel(struct sock *sk)
{ {
return nlk_sk(sk)->flags & NETLINK_KERNEL_SOCKET; return nlk_sk(sk)->flags & NETLINK_F_KERNEL_SOCKET;
} }
struct netlink_table *nl_table; struct netlink_table *nl_table;
...@@ -256,8 +257,9 @@ static void netlink_overrun(struct sock *sk) ...@@ -256,8 +257,9 @@ static void netlink_overrun(struct sock *sk)
{ {
struct netlink_sock *nlk = nlk_sk(sk); struct netlink_sock *nlk = nlk_sk(sk);
if (!(nlk->flags & NETLINK_RECV_NO_ENOBUFS)) { if (!(nlk->flags & NETLINK_F_RECV_NO_ENOBUFS)) {
if (!test_and_set_bit(NETLINK_CONGESTED, &nlk_sk(sk)->state)) { if (!test_and_set_bit(NETLINK_S_CONGESTED,
&nlk_sk(sk)->state)) {
sk->sk_err = ENOBUFS; sk->sk_err = ENOBUFS;
sk->sk_error_report(sk); sk->sk_error_report(sk);
} }
...@@ -270,8 +272,8 @@ static void netlink_rcv_wake(struct sock *sk) ...@@ -270,8 +272,8 @@ static void netlink_rcv_wake(struct sock *sk)
struct netlink_sock *nlk = nlk_sk(sk); struct netlink_sock *nlk = nlk_sk(sk);
if (skb_queue_empty(&sk->sk_receive_queue)) if (skb_queue_empty(&sk->sk_receive_queue))
clear_bit(NETLINK_CONGESTED, &nlk->state); clear_bit(NETLINK_S_CONGESTED, &nlk->state);
if (!test_bit(NETLINK_CONGESTED, &nlk->state)) if (!test_bit(NETLINK_S_CONGESTED, &nlk->state))
wake_up_interruptible(&nlk->wait); wake_up_interruptible(&nlk->wait);
} }
...@@ -1656,7 +1658,7 @@ int netlink_attachskb(struct sock *sk, struct sk_buff *skb, ...@@ -1656,7 +1658,7 @@ int netlink_attachskb(struct sock *sk, struct sk_buff *skb,
nlk = nlk_sk(sk); nlk = nlk_sk(sk);
if ((atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf || if ((atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf ||
test_bit(NETLINK_CONGESTED, &nlk->state)) && test_bit(NETLINK_S_CONGESTED, &nlk->state)) &&
!netlink_skb_is_mmaped(skb)) { !netlink_skb_is_mmaped(skb)) {
DECLARE_WAITQUEUE(wait, current); DECLARE_WAITQUEUE(wait, current);
if (!*timeo) { if (!*timeo) {
...@@ -1671,7 +1673,7 @@ int netlink_attachskb(struct sock *sk, struct sk_buff *skb, ...@@ -1671,7 +1673,7 @@ int netlink_attachskb(struct sock *sk, struct sk_buff *skb,
add_wait_queue(&nlk->wait, &wait); add_wait_queue(&nlk->wait, &wait);
if ((atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf || if ((atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf ||
test_bit(NETLINK_CONGESTED, &nlk->state)) && test_bit(NETLINK_S_CONGESTED, &nlk->state)) &&
!sock_flag(sk, SOCK_DEAD)) !sock_flag(sk, SOCK_DEAD))
*timeo = schedule_timeout(*timeo); *timeo = schedule_timeout(*timeo);
...@@ -1895,7 +1897,7 @@ static int netlink_broadcast_deliver(struct sock *sk, struct sk_buff *skb) ...@@ -1895,7 +1897,7 @@ static int netlink_broadcast_deliver(struct sock *sk, struct sk_buff *skb)
struct netlink_sock *nlk = nlk_sk(sk); struct netlink_sock *nlk = nlk_sk(sk);
if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf && if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf &&
!test_bit(NETLINK_CONGESTED, &nlk->state)) { !test_bit(NETLINK_S_CONGESTED, &nlk->state)) {
netlink_skb_set_owner_r(skb, sk); netlink_skb_set_owner_r(skb, sk);
__netlink_sendskb(sk, skb); __netlink_sendskb(sk, skb);
return atomic_read(&sk->sk_rmem_alloc) > (sk->sk_rcvbuf >> 1); return atomic_read(&sk->sk_rmem_alloc) > (sk->sk_rcvbuf >> 1);
...@@ -1931,9 +1933,18 @@ static void do_one_broadcast(struct sock *sk, ...@@ -1931,9 +1933,18 @@ static void do_one_broadcast(struct sock *sk,
!test_bit(p->group - 1, nlk->groups)) !test_bit(p->group - 1, nlk->groups))
return; return;
if (!net_eq(sock_net(sk), p->net)) if (!net_eq(sock_net(sk), p->net)) {
if (!(nlk->flags & NETLINK_F_LISTEN_ALL_NSID))
return; return;
if (!peernet_has_id(sock_net(sk), p->net))
return;
if (!file_ns_capable(sk->sk_socket->file, p->net->user_ns,
CAP_NET_BROADCAST))
return;
}
if (p->failure) { if (p->failure) {
netlink_overrun(sk); netlink_overrun(sk);
return; return;
...@@ -1956,23 +1967,33 @@ static void do_one_broadcast(struct sock *sk, ...@@ -1956,23 +1967,33 @@ static void do_one_broadcast(struct sock *sk,
netlink_overrun(sk); netlink_overrun(sk);
/* Clone failed. Notify ALL listeners. */ /* Clone failed. Notify ALL listeners. */
p->failure = 1; p->failure = 1;
if (nlk->flags & NETLINK_BROADCAST_SEND_ERROR) if (nlk->flags & NETLINK_F_BROADCAST_SEND_ERROR)
p->delivery_failure = 1; p->delivery_failure = 1;
} else if (p->tx_filter && p->tx_filter(sk, p->skb2, p->tx_data)) { goto out;
}
if (p->tx_filter && p->tx_filter(sk, p->skb2, p->tx_data)) {
kfree_skb(p->skb2); kfree_skb(p->skb2);
p->skb2 = NULL; p->skb2 = NULL;
} else if (sk_filter(sk, p->skb2)) { goto out;
}
if (sk_filter(sk, p->skb2)) {
kfree_skb(p->skb2); kfree_skb(p->skb2);
p->skb2 = NULL; p->skb2 = NULL;
} else if ((val = netlink_broadcast_deliver(sk, p->skb2)) < 0) { goto out;
}
NETLINK_CB(p->skb2).nsid = peernet2id(sock_net(sk), p->net);
NETLINK_CB(p->skb2).nsid_is_set = true;
val = netlink_broadcast_deliver(sk, p->skb2);
if (val < 0) {
netlink_overrun(sk); netlink_overrun(sk);
if (nlk->flags & NETLINK_BROADCAST_SEND_ERROR) if (nlk->flags & NETLINK_F_BROADCAST_SEND_ERROR)
p->delivery_failure = 1; p->delivery_failure = 1;
} else { } else {
p->congested |= val; p->congested |= val;
p->delivered = 1; p->delivered = 1;
p->skb2 = NULL; p->skb2 = NULL;
} }
out:
sock_put(sk); sock_put(sk);
} }
...@@ -2057,7 +2078,7 @@ static int do_one_set_err(struct sock *sk, struct netlink_set_err_data *p) ...@@ -2057,7 +2078,7 @@ static int do_one_set_err(struct sock *sk, struct netlink_set_err_data *p)
!test_bit(p->group - 1, nlk->groups)) !test_bit(p->group - 1, nlk->groups))
goto out; goto out;
if (p->code == ENOBUFS && nlk->flags & NETLINK_RECV_NO_ENOBUFS) { if (p->code == ENOBUFS && nlk->flags & NETLINK_F_RECV_NO_ENOBUFS) {
ret = 1; ret = 1;
goto out; goto out;
} }
...@@ -2076,7 +2097,7 @@ static int do_one_set_err(struct sock *sk, struct netlink_set_err_data *p) ...@@ -2076,7 +2097,7 @@ static int do_one_set_err(struct sock *sk, struct netlink_set_err_data *p)
* @code: error code, must be negative (as usual in kernelspace) * @code: error code, must be negative (as usual in kernelspace)
* *
* This function returns the number of broadcast listeners that have set the * This function returns the number of broadcast listeners that have set the
* NETLINK_RECV_NO_ENOBUFS socket option. * NETLINK_NO_ENOBUFS socket option.
*/ */
int netlink_set_err(struct sock *ssk, u32 portid, u32 group, int code) int netlink_set_err(struct sock *ssk, u32 portid, u32 group, int code)
{ {
...@@ -2136,9 +2157,9 @@ static int netlink_setsockopt(struct socket *sock, int level, int optname, ...@@ -2136,9 +2157,9 @@ static int netlink_setsockopt(struct socket *sock, int level, int optname,
switch (optname) { switch (optname) {
case NETLINK_PKTINFO: case NETLINK_PKTINFO:
if (val) if (val)
nlk->flags |= NETLINK_RECV_PKTINFO; nlk->flags |= NETLINK_F_RECV_PKTINFO;
else else
nlk->flags &= ~NETLINK_RECV_PKTINFO; nlk->flags &= ~NETLINK_F_RECV_PKTINFO;
err = 0; err = 0;
break; break;
case NETLINK_ADD_MEMBERSHIP: case NETLINK_ADD_MEMBERSHIP:
...@@ -2167,18 +2188,18 @@ static int netlink_setsockopt(struct socket *sock, int level, int optname, ...@@ -2167,18 +2188,18 @@ static int netlink_setsockopt(struct socket *sock, int level, int optname,
} }
case NETLINK_BROADCAST_ERROR: case NETLINK_BROADCAST_ERROR:
if (val) if (val)
nlk->flags |= NETLINK_BROADCAST_SEND_ERROR; nlk->flags |= NETLINK_F_BROADCAST_SEND_ERROR;
else else
nlk->flags &= ~NETLINK_BROADCAST_SEND_ERROR; nlk->flags &= ~NETLINK_F_BROADCAST_SEND_ERROR;
err = 0; err = 0;
break; break;
case NETLINK_NO_ENOBUFS: case NETLINK_NO_ENOBUFS:
if (val) { if (val) {
nlk->flags |= NETLINK_RECV_NO_ENOBUFS; nlk->flags |= NETLINK_F_RECV_NO_ENOBUFS;
clear_bit(NETLINK_CONGESTED, &nlk->state); clear_bit(NETLINK_S_CONGESTED, &nlk->state);
wake_up_interruptible(&nlk->wait); wake_up_interruptible(&nlk->wait);
} else { } else {
nlk->flags &= ~NETLINK_RECV_NO_ENOBUFS; nlk->flags &= ~NETLINK_F_RECV_NO_ENOBUFS;
} }
err = 0; err = 0;
break; break;
...@@ -2201,6 +2222,16 @@ static int netlink_setsockopt(struct socket *sock, int level, int optname, ...@@ -2201,6 +2222,16 @@ static int netlink_setsockopt(struct socket *sock, int level, int optname,
break; break;
} }
#endif /* CONFIG_NETLINK_MMAP */ #endif /* CONFIG_NETLINK_MMAP */
case NETLINK_LISTEN_ALL_NSID:
if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_BROADCAST))
return -EPERM;
if (val)
nlk->flags |= NETLINK_F_LISTEN_ALL_NSID;
else
nlk->flags &= ~NETLINK_F_LISTEN_ALL_NSID;
err = 0;
break;
default: default:
err = -ENOPROTOOPT; err = -ENOPROTOOPT;
} }
...@@ -2227,7 +2258,7 @@ static int netlink_getsockopt(struct socket *sock, int level, int optname, ...@@ -2227,7 +2258,7 @@ static int netlink_getsockopt(struct socket *sock, int level, int optname,
if (len < sizeof(int)) if (len < sizeof(int))
return -EINVAL; return -EINVAL;
len = sizeof(int); len = sizeof(int);
val = nlk->flags & NETLINK_RECV_PKTINFO ? 1 : 0; val = nlk->flags & NETLINK_F_RECV_PKTINFO ? 1 : 0;
if (put_user(len, optlen) || if (put_user(len, optlen) ||
put_user(val, optval)) put_user(val, optval))
return -EFAULT; return -EFAULT;
...@@ -2237,7 +2268,7 @@ static int netlink_getsockopt(struct socket *sock, int level, int optname, ...@@ -2237,7 +2268,7 @@ static int netlink_getsockopt(struct socket *sock, int level, int optname,
if (len < sizeof(int)) if (len < sizeof(int))
return -EINVAL; return -EINVAL;
len = sizeof(int); len = sizeof(int);
val = nlk->flags & NETLINK_BROADCAST_SEND_ERROR ? 1 : 0; val = nlk->flags & NETLINK_F_BROADCAST_SEND_ERROR ? 1 : 0;
if (put_user(len, optlen) || if (put_user(len, optlen) ||
put_user(val, optval)) put_user(val, optval))
return -EFAULT; return -EFAULT;
...@@ -2247,7 +2278,7 @@ static int netlink_getsockopt(struct socket *sock, int level, int optname, ...@@ -2247,7 +2278,7 @@ static int netlink_getsockopt(struct socket *sock, int level, int optname,
if (len < sizeof(int)) if (len < sizeof(int))
return -EINVAL; return -EINVAL;
len = sizeof(int); len = sizeof(int);
val = nlk->flags & NETLINK_RECV_NO_ENOBUFS ? 1 : 0; val = nlk->flags & NETLINK_F_RECV_NO_ENOBUFS ? 1 : 0;
if (put_user(len, optlen) || if (put_user(len, optlen) ||
put_user(val, optval)) put_user(val, optval))
return -EFAULT; return -EFAULT;
...@@ -2267,6 +2298,16 @@ static void netlink_cmsg_recv_pktinfo(struct msghdr *msg, struct sk_buff *skb) ...@@ -2267,6 +2298,16 @@ static void netlink_cmsg_recv_pktinfo(struct msghdr *msg, struct sk_buff *skb)
put_cmsg(msg, SOL_NETLINK, NETLINK_PKTINFO, sizeof(info), &info); put_cmsg(msg, SOL_NETLINK, NETLINK_PKTINFO, sizeof(info), &info);
} }
static void netlink_cmsg_listen_all_nsid(struct sock *sk, struct msghdr *msg,
struct sk_buff *skb)
{
if (!NETLINK_CB(skb).nsid_is_set)
return;
put_cmsg(msg, SOL_NETLINK, NETLINK_LISTEN_ALL_NSID, sizeof(int),
&NETLINK_CB(skb).nsid);
}
static int netlink_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) static int netlink_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
{ {
struct sock *sk = sock->sk; struct sock *sk = sock->sk;
...@@ -2418,8 +2459,10 @@ static int netlink_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, ...@@ -2418,8 +2459,10 @@ static int netlink_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
msg->msg_namelen = sizeof(*addr); msg->msg_namelen = sizeof(*addr);
} }
if (nlk->flags & NETLINK_RECV_PKTINFO) if (nlk->flags & NETLINK_F_RECV_PKTINFO)
netlink_cmsg_recv_pktinfo(msg, skb); netlink_cmsg_recv_pktinfo(msg, skb);
if (nlk->flags & NETLINK_F_LISTEN_ALL_NSID)
netlink_cmsg_listen_all_nsid(sk, msg, skb);
memset(&scm, 0, sizeof(scm)); memset(&scm, 0, sizeof(scm));
scm.creds = *NETLINK_CREDS(skb); scm.creds = *NETLINK_CREDS(skb);
...@@ -2502,7 +2545,7 @@ __netlink_kernel_create(struct net *net, int unit, struct module *module, ...@@ -2502,7 +2545,7 @@ __netlink_kernel_create(struct net *net, int unit, struct module *module,
goto out_sock_release; goto out_sock_release;
nlk = nlk_sk(sk); nlk = nlk_sk(sk);
nlk->flags |= NETLINK_KERNEL_SOCKET; nlk->flags |= NETLINK_F_KERNEL_SOCKET;
netlink_table_grab(); netlink_table_grab();
if (!nl_table[unit].registered) { if (!nl_table[unit].registered) {
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册