af_netlink.c 64.6 KB
Newer Older
L
Linus Torvalds 已提交
1 2 3
/*
 * NETLINK      Kernel-user communication protocol.
 *
4
 * 		Authors:	Alan Cox <alan@lxorguk.ukuu.org.uk>
L
Linus Torvalds 已提交
5
 * 				Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
6
 * 				Patrick McHardy <kaber@trash.net>
L
Linus Torvalds 已提交
7 8 9 10 11
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
12
 *
L
Linus Torvalds 已提交
13 14 15 16
 * Tue Jun 26 14:36:48 MEST 2001 Herbert "herp" Rosmanith
 *                               added netlink_proto_exit
 * Tue Jan 22 18:32:44 BRST 2002 Arnaldo C. de Melo <acme@conectiva.com.br>
 * 				 use nlk_sk, as sk->protinfo is on a diet 8)
17 18 19 20 21 22
 * Fri Jul 22 19:51:12 MEST 2005 Harald Welte <laforge@gnumonks.org>
 * 				 - inc module use count of module that owns
 * 				   the kernel socket in case userspace opens
 * 				   socket of same protocol
 * 				 - remove all module support, since netlink is
 * 				   mandatory if CONFIG_NET=y these days
L
Linus Torvalds 已提交
23 24 25 26
 */

#include <linux/module.h>

27
#include <linux/capability.h>
L
Linus Torvalds 已提交
28 29 30 31 32 33 34 35 36 37 38 39 40 41 42
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/stat.h>
#include <linux/socket.h>
#include <linux/un.h>
#include <linux/fcntl.h>
#include <linux/termios.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/fs.h>
#include <linux/slab.h>
43
#include <linux/uaccess.h>
L
Linus Torvalds 已提交
44 45 46 47 48 49 50 51 52 53 54 55 56
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/rtnetlink.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/notifier.h>
#include <linux/security.h>
#include <linux/jhash.h>
#include <linux/jiffies.h>
#include <linux/random.h>
#include <linux/bitops.h>
#include <linux/mm.h>
#include <linux/types.h>
A
Andrew Morton 已提交
57
#include <linux/audit.h>
58
#include <linux/mutex.h>
59
#include <linux/vmalloc.h>
60
#include <linux/if_arp.h>
61
#include <linux/rhashtable.h>
62
#include <asm/cacheflush.h>
63
#include <linux/hash.h>
64
#include <linux/genetlink.h>
65
#include <linux/net_namespace.h>
A
Andrew Morton 已提交
66

67
#include <net/net_namespace.h>
C
Cong Wang 已提交
68
#include <net/netns/generic.h>
L
Linus Torvalds 已提交
69 70
#include <net/sock.h>
#include <net/scm.h>
71
#include <net/netlink.h>
L
Linus Torvalds 已提交
72

73
#include "af_netlink.h"
L
Linus Torvalds 已提交
74

75 76 77
struct listeners {
	struct rcu_head		rcu;
	unsigned long		masks[0];
78 79
};

80
/* state bits */
81
#define NETLINK_S_CONGESTED		0x0
82

83
static inline int netlink_is_kernel(struct sock *sk)
84
{
85
	return nlk_sk(sk)->flags & NETLINK_F_KERNEL_SOCKET;
86 87
}

88
struct netlink_table *nl_table __read_mostly;
89
EXPORT_SYMBOL_GPL(nl_table);
L
Linus Torvalds 已提交
90 91 92

static DECLARE_WAIT_QUEUE_HEAD(nl_table_wait);

93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130
static struct lock_class_key nlk_cb_mutex_keys[MAX_LINKS];

/*
 * Lockdep class names for the default per-socket callback mutex, indexed by
 * netlink protocol number (see __netlink_create()).  Protocol numbers without
 * a symbolic name use their decimal value; the final entry corresponds to
 * index MAX_LINKS itself.
 */
static const char *const nlk_cb_mutex_key_strings[MAX_LINKS + 1] = {
	"nlk_cb_mutex-ROUTE",
	"nlk_cb_mutex-1",
	"nlk_cb_mutex-USERSOCK",
	"nlk_cb_mutex-FIREWALL",
	"nlk_cb_mutex-SOCK_DIAG",
	"nlk_cb_mutex-NFLOG",
	"nlk_cb_mutex-XFRM",
	"nlk_cb_mutex-SELINUX",
	"nlk_cb_mutex-ISCSI",
	"nlk_cb_mutex-AUDIT",
	"nlk_cb_mutex-FIB_LOOKUP",
	"nlk_cb_mutex-CONNECTOR",
	"nlk_cb_mutex-NETFILTER",
	"nlk_cb_mutex-IP6_FW",
	"nlk_cb_mutex-DNRTMSG",
	"nlk_cb_mutex-KOBJECT_UEVENT",
	"nlk_cb_mutex-GENERIC",
	"nlk_cb_mutex-17",
	"nlk_cb_mutex-SCSITRANSPORT",
	"nlk_cb_mutex-ECRYPTFS",
	"nlk_cb_mutex-RDMA",
	"nlk_cb_mutex-CRYPTO",
	"nlk_cb_mutex-SMC",
	"nlk_cb_mutex-23",
	"nlk_cb_mutex-24",
	"nlk_cb_mutex-25",
	"nlk_cb_mutex-26",
	"nlk_cb_mutex-27",
	"nlk_cb_mutex-28",
	"nlk_cb_mutex-29",
	"nlk_cb_mutex-30",
	"nlk_cb_mutex-31",
	"nlk_cb_mutex-MAX_LINKS"
};

L
Linus Torvalds 已提交
131 132
static int netlink_dump(struct sock *sk);

133
/* nl_table locking explained:
134
 * Lookup and traversal are protected with an RCU read-side lock. Insertion
Y
Ying Xue 已提交
135
 * and removal are protected with per bucket lock while using RCU list
136 137 138 139
 * modification primitives and may run in parallel to RCU protected lookups.
 * Destruction of the Netlink socket may only occur *after* nl_table_lock has
 * been acquired, either during or after the socket has been removed from
 * the list and after an RCU grace period.
140
 */
141 142
DEFINE_RWLOCK(nl_table_lock);
EXPORT_SYMBOL_GPL(nl_table_lock);
L
Linus Torvalds 已提交
143 144
static atomic_t nl_table_users = ATOMIC_INIT(0);

145 146
/*
 * Dereference an RCU-managed pointer on the update side; lockdep checks that
 * nl_table_lock is held.  The expansion deliberately carries no trailing
 * semicolon so the macro behaves like an ordinary expression.
 */
#define nl_deref_protected(X) rcu_dereference_protected(X, lockdep_is_held(&nl_table_lock))

W
WANG Cong 已提交
147
static BLOCKING_NOTIFIER_HEAD(netlink_chain);
L
Linus Torvalds 已提交
148

149

150 151
static const struct rhashtable_params netlink_rhashtable_params;

152
/*
 * Convert a 1-based multicast group number into its bit in a 32-bit mask.
 *
 * Returns 0 for group 0 ("no group") and for group numbers above 32, which
 * have no representation in a u32 mask.  The bound check keeps the shift
 * count below the width of the type, and the unsigned constant avoids
 * shifting into the sign bit for group 32.
 */
static inline u32 netlink_group_mask(u32 group)
{
	if (group == 0 || group > 32)
		return 0;

	return (u32)1 << (group - 1);
}

157 158 159 160 161 162 163 164 165 166 167 168 169 170
/*
 * Allocate a new skb and copy skb_end_offset(@skb) bytes starting at
 * @skb->data into it, together with the netlink control-block fields
 * portid, dst_group and creds.
 *
 * Returns the new skb, or NULL if the allocation fails.
 */
static struct sk_buff *netlink_to_full_skb(const struct sk_buff *skb,
					   gfp_t gfp_mask)
{
	const unsigned int size = skb_end_offset(skb);
	struct sk_buff *copy = alloc_skb(size, gfp_mask);

	if (!copy)
		return NULL;

	NETLINK_CB(copy).portid = NETLINK_CB(skb).portid;
	NETLINK_CB(copy).dst_group = NETLINK_CB(skb).dst_group;
	NETLINK_CB(copy).creds = NETLINK_CB(skb).creds;

	skb_put_data(copy, skb->data, size);
	return copy;
}

C
Cong Wang 已提交
175 176 177 178 179 180 181
/* Id passed to net_generic() to find a namespace's struct netlink_tap_net. */
static unsigned int netlink_tap_net_id;

/* Per network namespace list of registered netlink taps. */
struct netlink_tap_net {
	/* Registered taps; modified with RCU list primitives. */
	struct list_head netlink_tap_all;
	/* Held while adding to or removing from netlink_tap_all. */
	spinlock_t netlink_tap_lock;
};

182 183
int netlink_add_tap(struct netlink_tap *nt)
{
C
Cong Wang 已提交
184 185 186
	struct net *net = dev_net(nt->dev);
	struct netlink_tap_net *nn = net_generic(net, netlink_tap_net_id);

187 188 189
	if (unlikely(nt->dev->type != ARPHRD_NETLINK))
		return -EINVAL;

C
Cong Wang 已提交
190 191 192
	spin_lock(&nn->netlink_tap_lock);
	list_add_rcu(&nt->list, &nn->netlink_tap_all);
	spin_unlock(&nn->netlink_tap_lock);
193

194
	__module_get(nt->module);
195 196 197 198 199

	return 0;
}
EXPORT_SYMBOL_GPL(netlink_add_tap);

200
static int __netlink_remove_tap(struct netlink_tap *nt)
201
{
C
Cong Wang 已提交
202 203
	struct net *net = dev_net(nt->dev);
	struct netlink_tap_net *nn = net_generic(net, netlink_tap_net_id);
204 205 206
	bool found = false;
	struct netlink_tap *tmp;

C
Cong Wang 已提交
207
	spin_lock(&nn->netlink_tap_lock);
208

C
Cong Wang 已提交
209
	list_for_each_entry(tmp, &nn->netlink_tap_all, list) {
210 211 212 213 214 215 216 217 218
		if (nt == tmp) {
			list_del_rcu(&nt->list);
			found = true;
			goto out;
		}
	}

	pr_warn("__netlink_remove_tap: %p not found\n", nt);
out:
C
Cong Wang 已提交
219
	spin_unlock(&nn->netlink_tap_lock);
220

221
	if (found)
222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237
		module_put(nt->module);

	return found ? 0 : -ENODEV;
}

/*
 * Unregister @nt and then call synchronize_net() before returning, whether
 * or not the tap was found.
 *
 * Returns the result of __netlink_remove_tap().
 */
int netlink_remove_tap(struct netlink_tap *nt)
{
	const int err = __netlink_remove_tap(nt);

	synchronize_net();
	return err;
}
EXPORT_SYMBOL_GPL(netlink_remove_tap);

C
Cong Wang 已提交
238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257
/* Per-namespace init: set up the tap lock and an empty tap list. Always 0. */
static __net_init int netlink_tap_init_net(struct net *net)
{
	struct netlink_tap_net *tap_net;

	tap_net = net_generic(net, netlink_tap_net_id);
	spin_lock_init(&tap_net->netlink_tap_lock);
	INIT_LIST_HEAD(&tap_net->netlink_tap_all);

	return 0;
}

/* Per-namespace exit hook; intentionally does nothing. */
static void __net_exit netlink_tap_exit_net(struct net *net)
{
}

/*
 * Per network namespace operations for the tap state.  The .id/.size pair
 * describes the struct netlink_tap_net that the rest of this file fetches
 * with net_generic(net, netlink_tap_net_id).
 */
static struct pernet_operations netlink_tap_net_ops = {
	.init = netlink_tap_init_net,
	.exit = netlink_tap_exit_net,
	.id   = &netlink_tap_net_id,
	.size = sizeof(struct netlink_tap_net),
};

258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273
static bool netlink_filter_tap(const struct sk_buff *skb)
{
	struct sock *sk = skb->sk;

	/* We take the more conservative approach and
	 * whitelist socket protocols that may pass.
	 */
	switch (sk->sk_protocol) {
	case NETLINK_ROUTE:
	case NETLINK_USERSOCK:
	case NETLINK_SOCK_DIAG:
	case NETLINK_NFLOG:
	case NETLINK_XFRM:
	case NETLINK_FIB_LOOKUP:
	case NETLINK_NETFILTER:
	case NETLINK_GENERIC:
V
Varka Bhadram 已提交
274
		return true;
275 276
	}

V
Varka Bhadram 已提交
277
	return false;
278 279
}

280 281 282 283
static int __netlink_deliver_tap_skb(struct sk_buff *skb,
				     struct net_device *dev)
{
	struct sk_buff *nskb;
284
	struct sock *sk = skb->sk;
285 286 287
	int ret = -ENOMEM;

	dev_hold(dev);
288

289
	if (is_vmalloc_addr(skb->head))
290 291 292
		nskb = netlink_to_full_skb(skb, GFP_ATOMIC);
	else
		nskb = skb_clone(skb, GFP_ATOMIC);
293 294
	if (nskb) {
		nskb->dev = dev;
295
		nskb->protocol = htons((u16) sk->sk_protocol);
296 297
		nskb->pkt_type = netlink_is_kernel(sk) ?
				 PACKET_KERNEL : PACKET_USER;
298
		skb_reset_network_header(nskb);
299 300 301 302 303 304 305 306 307
		ret = dev_queue_xmit(nskb);
		if (unlikely(ret > 0))
			ret = net_xmit_errno(ret);
	}

	dev_put(dev);
	return ret;
}

C
Cong Wang 已提交
308
static void __netlink_deliver_tap(struct sk_buff *skb, struct netlink_tap_net *nn)
309 310 311 312
{
	int ret;
	struct netlink_tap *tmp;

313 314 315
	if (!netlink_filter_tap(skb))
		return;

C
Cong Wang 已提交
316
	list_for_each_entry_rcu(tmp, &nn->netlink_tap_all, list) {
317 318 319 320 321 322
		ret = __netlink_deliver_tap_skb(skb, tmp->dev);
		if (unlikely(ret))
			break;
	}
}

C
Cong Wang 已提交
323
static void netlink_deliver_tap(struct net *net, struct sk_buff *skb)
324
{
C
Cong Wang 已提交
325 326
	struct netlink_tap_net *nn = net_generic(net, netlink_tap_net_id);

327 328
	rcu_read_lock();

C
Cong Wang 已提交
329 330
	if (unlikely(!list_empty(&nn->netlink_tap_all)))
		__netlink_deliver_tap(skb, nn);
331 332 333 334

	rcu_read_unlock();
}

335 336 337 338
/*
 * Mirror @skb to the taps of @dst's namespace unless both endpoints are
 * kernel sockets.
 */
static void netlink_deliver_tap_kernel(struct sock *dst, struct sock *src,
				       struct sk_buff *skb)
{
	if (!netlink_is_kernel(dst) || !netlink_is_kernel(src))
		netlink_deliver_tap(sock_net(dst), skb);
}

342 343 344 345
static void netlink_overrun(struct sock *sk)
{
	struct netlink_sock *nlk = nlk_sk(sk);

346 347 348
	if (!(nlk->flags & NETLINK_F_RECV_NO_ENOBUFS)) {
		if (!test_and_set_bit(NETLINK_S_CONGESTED,
				      &nlk_sk(sk)->state)) {
349 350 351 352 353 354 355 356 357 358 359 360
			sk->sk_err = ENOBUFS;
			sk->sk_error_report(sk);
		}
	}
	atomic_inc(&sk->sk_drops);
}

/*
 * Clear the congestion bit once the receive queue is empty, and wake up
 * waiters on the socket's wait queue whenever the socket is not congested.
 */
static void netlink_rcv_wake(struct sock *sk)
{
	struct netlink_sock *nlk = nlk_sk(sk);

	if (skb_queue_empty(&sk->sk_receive_queue))
		clear_bit(NETLINK_S_CONGESTED, &nlk->state);

	if (test_bit(NETLINK_S_CONGESTED, &nlk->state))
		return;

	wake_up_interruptible(&nlk->wait);
}

366 367
static void netlink_skb_destructor(struct sk_buff *skb)
{
368
	if (is_vmalloc_addr(skb->head)) {
369 370 371 372
		if (!skb->cloned ||
		    !atomic_dec_return(&(skb_shinfo(skb)->dataref)))
			vfree(skb->head);

373 374
		skb->head = NULL;
	}
375 376
	if (skb->sk != NULL)
		sock_rfree(skb);
377 378 379 380 381 382 383 384 385 386 387
}

/*
 * Make @sk the owner of @skb on the receive side: install the netlink
 * destructor and charge the skb's truesize to the socket's receive memory.
 * Warns if the skb already has an owner.
 */
static void netlink_skb_set_owner_r(struct sk_buff *skb, struct sock *sk)
{
	WARN_ON(skb->sk != NULL);
	skb->sk = sk;
	skb->destructor = netlink_skb_destructor;
	/* Account the buffer against the receiver before it is queued. */
	atomic_add(skb->truesize, &sk->sk_rmem_alloc);
	sk_mem_charge(sk, skb->truesize);
}

388
static void netlink_sock_destruct(struct sock *sk)
L
Linus Torvalds 已提交
389
{
390 391
	struct netlink_sock *nlk = nlk_sk(sk);

392
	if (nlk->cb_running) {
393 394
		if (nlk->cb.done)
			nlk->cb.done(&nlk->cb);
395 396
		module_put(nlk->cb.module);
		kfree_skb(nlk->cb.skb);
397 398
	}

L
Linus Torvalds 已提交
399 400 401
	skb_queue_purge(&sk->sk_receive_queue);

	if (!sock_flag(sk, SOCK_DEAD)) {
402
		printk(KERN_ERR "Freeing alive netlink socket %p\n", sk);
L
Linus Torvalds 已提交
403 404
		return;
	}
405 406

	WARN_ON(atomic_read(&sk->sk_rmem_alloc));
407
	WARN_ON(refcount_read(&sk->sk_wmem_alloc));
408
	WARN_ON(nlk_sk(sk)->groups);
L
Linus Torvalds 已提交
409 410
}

411 412 413 414 415
static void netlink_sock_destruct_work(struct work_struct *work)
{
	struct netlink_sock *nlk = container_of(work, struct netlink_sock,
						work);

416
	sk_free(&nlk->sk);
417 418
}

419 420
/* This lock without WQ_FLAG_EXCLUSIVE is good on UP and it is _very_ bad on
 * SMP. Look, when several writers sleep and reader wakes them up, all but one
L
Linus Torvalds 已提交
421 422 423 424
 * immediately hit write lock and grab all the cpus. Exclusive sleep solves
 * this, _but_ remember, it adds useless work on UP machines.
 */

425
void netlink_table_grab(void)
426
	__acquires(nl_table_lock)
L
Linus Torvalds 已提交
427
{
428 429
	might_sleep();

430
	write_lock_irq(&nl_table_lock);
L
Linus Torvalds 已提交
431 432 433 434 435

	if (atomic_read(&nl_table_users)) {
		DECLARE_WAITQUEUE(wait, current);

		add_wait_queue_exclusive(&nl_table_wait, &wait);
436
		for (;;) {
L
Linus Torvalds 已提交
437 438 439
			set_current_state(TASK_UNINTERRUPTIBLE);
			if (atomic_read(&nl_table_users) == 0)
				break;
440
			write_unlock_irq(&nl_table_lock);
L
Linus Torvalds 已提交
441
			schedule();
442
			write_lock_irq(&nl_table_lock);
L
Linus Torvalds 已提交
443 444 445 446 447 448 449
		}

		__set_current_state(TASK_RUNNING);
		remove_wait_queue(&nl_table_wait, &wait);
	}
}

450
void netlink_table_ungrab(void)
451
	__releases(nl_table_lock)
L
Linus Torvalds 已提交
452
{
453
	write_unlock_irq(&nl_table_lock);
L
Linus Torvalds 已提交
454 455 456
	wake_up(&nl_table_wait);
}

457
static inline void
L
Linus Torvalds 已提交
458 459 460 461 462 463 464 465 466
netlink_lock_table(void)
{
	/* read_lock() synchronizes us to netlink_table_grab */

	read_lock(&nl_table_lock);
	atomic_inc(&nl_table_users);
	read_unlock(&nl_table_lock);
}

467
static inline void
L
Linus Torvalds 已提交
468 469 470 471 472 473
netlink_unlock_table(void)
{
	if (atomic_dec_and_test(&nl_table_users))
		wake_up(&nl_table_wait);
}

474
struct netlink_compare_arg
L
Linus Torvalds 已提交
475
{
476
	possible_net_t pnet;
477 478
	u32 portid;
};
L
Linus Torvalds 已提交
479

480 481 482
/*
 * Length of the meaningful part of struct netlink_compare_arg: everything up
 * to and including portid.  Doing sizeof directly may yield 4 extra bytes of
 * trailing padding on 64-bit.
 */
#define netlink_compare_arg_len \
	(offsetof(struct netlink_compare_arg, portid) + sizeof(u32))
483 484 485

static inline int netlink_compare(struct rhashtable_compare_arg *arg,
				  const void *ptr)
L
Linus Torvalds 已提交
486
{
487 488
	const struct netlink_compare_arg *x = arg->key;
	const struct netlink_sock *nlk = ptr;
L
Linus Torvalds 已提交
489

490
	return nlk->portid != x->portid ||
491 492 493 494 495 496 497 498 499
	       !net_eq(sock_net(&nlk->sk), read_pnet(&x->pnet));
}

/*
 * Fill in a lookup key for (@net, @portid).  The whole structure is zeroed
 * first so that no byte of the key is left uninitialised.
 */
static void netlink_compare_arg_init(struct netlink_compare_arg *arg,
				     struct net *net, u32 portid)
{
	memset(arg, 0, sizeof(struct netlink_compare_arg));
	arg->portid = portid;
	write_pnet(&arg->pnet, net);
}

502 503
/*
 * Look up the socket bound to @portid in namespace @net in @table's hash.
 * Returns the socket without taking a reference, or NULL if none is bound.
 * NOTE(review): callers in this file wrap the call in rcu_read_lock().
 */
static struct sock *__netlink_lookup(struct netlink_table *table, u32 portid,
				     struct net *net)
{
	struct netlink_compare_arg key;
	struct sock *sk;

	netlink_compare_arg_init(&key, net, portid);
	sk = rhashtable_lookup_fast(&table->hash, &key,
				    netlink_rhashtable_params);
	return sk;
}

512
static int __netlink_insert(struct netlink_table *table, struct sock *sk)
Y
Ying Xue 已提交
513
{
514
	struct netlink_compare_arg arg;
Y
Ying Xue 已提交
515

516
	netlink_compare_arg_init(&arg, sock_net(sk), nlk_sk(sk)->portid);
517 518 519
	return rhashtable_lookup_insert_key(&table->hash, &arg,
					    &nlk_sk(sk)->node,
					    netlink_rhashtable_params);
Y
Ying Xue 已提交
520 521
}

522
/*
 * Find the socket bound to @portid for @protocol in namespace @net.
 * On success a reference is taken on the socket (sock_hold()) before the RCU
 * read-side section ends.  Returns NULL if nothing is bound.
 */
static struct sock *netlink_lookup(struct net *net, int protocol, u32 portid)
{
	struct netlink_table *table = &nl_table[protocol];
	struct sock *found;

	rcu_read_lock();
	found = __netlink_lookup(table, portid, net);
	if (found != NULL)
		sock_hold(found);
	rcu_read_unlock();

	return found;
}

536
static const struct proto_ops netlink_ops;
L
Linus Torvalds 已提交
537

538 539 540 541 542 543
static void
netlink_update_listeners(struct sock *sk)
{
	struct netlink_table *tbl = &nl_table[sk->sk_protocol];
	unsigned long mask;
	unsigned int i;
544 545 546 547 548
	struct listeners *listeners;

	listeners = nl_deref_protected(tbl->listeners);
	if (!listeners)
		return;
549

550
	for (i = 0; i < NLGRPLONGS(tbl->groups); i++) {
551
		mask = 0;
552
		sk_for_each_bound(sk, &tbl->mc_list) {
553 554 555
			if (i < NLGRPLONGS(nlk_sk(sk)->ngroups))
				mask |= nlk_sk(sk)->groups[i];
		}
556
		listeners->masks[i] = mask;
557 558 559 560 561
	}
	/* this function is only called with the netlink table "grabbed", which
	 * makes sure updates are visible before bind or setsockopt return. */
}

562
/*
 * Bind @sk to @portid and publish it in the protocol's hash table.
 *
 * Returns 0 on success; for an already bound socket 0 if @portid matches the
 * bound port id and -EBUSY otherwise; -ENOMEM when the table element count
 * has reached UINT_MAX; -EADDRINUSE when the port id is taken; -EOVERFLOW
 * when the hash table backend reports -EBUSY; or another error from the
 * hash table insert.
 */
static int netlink_insert(struct sock *sk, u32 portid)
{
	struct netlink_table *table = &nl_table[sk->sk_protocol];
	struct netlink_sock *nlk = nlk_sk(sk);
	int err;

	lock_sock(sk);

	err = nlk->portid == portid ? 0 : -EBUSY;
	if (nlk->bound)
		goto out;

	err = -ENOMEM;
	if (BITS_PER_LONG > 32 &&
	    unlikely(atomic_read(&table->hash.nelems) >= UINT_MAX))
		goto out;

	nlk->portid = portid;
	sock_hold(sk);

	err = __netlink_insert(table, sk);
	if (err) {
		switch (err) {
		case -EBUSY:
			/* In case the hashtable backend returns with -EBUSY
			 * from here, it must not escape to the caller.
			 */
			err = -EOVERFLOW;
			break;
		case -EEXIST:
			err = -EADDRINUSE;
			break;
		default:
			break;
		}
		sock_put(sk);
		goto out;
	}

	/* We need to ensure that the socket is hashed and visible. */
	smp_wmb();
	nlk->bound = portid;

out:
	release_sock(sk);
	return err;
}

static void netlink_remove(struct sock *sk)
{
605 606 607
	struct netlink_table *table;

	table = &nl_table[sk->sk_protocol];
608 609
	if (!rhashtable_remove_fast(&table->hash, &nlk_sk(sk)->node,
				    netlink_rhashtable_params)) {
610
		WARN_ON(refcount_read(&sk->sk_refcnt) == 1);
611 612 613
		__sock_put(sk);
	}

L
Linus Torvalds 已提交
614
	netlink_table_grab();
615
	if (nlk_sk(sk)->subscriptions) {
L
Linus Torvalds 已提交
616
		__sk_del_bind_node(sk);
617 618
		netlink_update_listeners(sk);
	}
619 620
	if (sk->sk_protocol == NETLINK_GENERIC)
		atomic_inc(&genl_sk_destructing_cnt);
L
Linus Torvalds 已提交
621 622 623 624 625 626 627 628 629
	netlink_table_ungrab();
}

/*
 * Protocol descriptor for netlink sockets; handed to sk_alloc() and used for
 * the per-namespace in-use accounting in this file.
 */
static struct proto netlink_proto = {
	.name	  = "NETLINK",
	.owner	  = THIS_MODULE,
	.obj_size = sizeof(struct netlink_sock),
};

630
static int __netlink_create(struct net *net, struct socket *sock,
631 632
			    struct mutex *cb_mutex, int protocol,
			    int kern)
L
Linus Torvalds 已提交
633 634 635
{
	struct sock *sk;
	struct netlink_sock *nlk;
636 637 638

	sock->ops = &netlink_ops;

639
	sk = sk_alloc(net, PF_NETLINK, GFP_KERNEL, &netlink_proto, kern);
640 641 642 643 644 645
	if (!sk)
		return -ENOMEM;

	sock_init_data(sock, sk);

	nlk = nlk_sk(sk);
E
Eric Dumazet 已提交
646
	if (cb_mutex) {
647
		nlk->cb_mutex = cb_mutex;
E
Eric Dumazet 已提交
648
	} else {
649 650
		nlk->cb_mutex = &nlk->cb_def_mutex;
		mutex_init(nlk->cb_mutex);
651 652 653
		lockdep_set_class_and_name(nlk->cb_mutex,
					   nlk_cb_mutex_keys + protocol,
					   nlk_cb_mutex_key_strings[protocol]);
654
	}
655 656 657 658 659 660 661
	init_waitqueue_head(&nlk->wait);

	sk->sk_destruct = netlink_sock_destruct;
	sk->sk_protocol = protocol;
	return 0;
}

662 663
static int netlink_create(struct net *net, struct socket *sock, int protocol,
			  int kern)
664 665
{
	struct module *module = NULL;
666
	struct mutex *cb_mutex;
667
	struct netlink_sock *nlk;
668 669
	int (*bind)(struct net *net, int group);
	void (*unbind)(struct net *net, int group);
670
	int err = 0;
L
Linus Torvalds 已提交
671 672 673 674 675 676

	sock->state = SS_UNCONNECTED;

	if (sock->type != SOCK_RAW && sock->type != SOCK_DGRAM)
		return -ESOCKTNOSUPPORT;

677
	if (protocol < 0 || protocol >= MAX_LINKS)
L
Linus Torvalds 已提交
678 679
		return -EPROTONOSUPPORT;

680
	netlink_lock_table();
681
#ifdef CONFIG_MODULES
682
	if (!nl_table[protocol].registered) {
683
		netlink_unlock_table();
684
		request_module("net-pf-%d-proto-%d", PF_NETLINK, protocol);
685
		netlink_lock_table();
686
	}
687 688 689 690
#endif
	if (nl_table[protocol].registered &&
	    try_module_get(nl_table[protocol].module))
		module = nl_table[protocol].module;
691 692
	else
		err = -EPROTONOSUPPORT;
693
	cb_mutex = nl_table[protocol].cb_mutex;
694
	bind = nl_table[protocol].bind;
695
	unbind = nl_table[protocol].unbind;
696
	netlink_unlock_table();
697

698 699 700
	if (err < 0)
		goto out;

701
	err = __netlink_create(net, sock, cb_mutex, protocol, kern);
702
	if (err < 0)
703 704
		goto out_module;

705
	local_bh_disable();
706
	sock_prot_inuse_add(net, &netlink_proto, 1);
707 708
	local_bh_enable();

709 710
	nlk = nlk_sk(sock->sk);
	nlk->module = module;
711
	nlk->netlink_bind = bind;
712
	nlk->netlink_unbind = unbind;
713 714
out:
	return err;
L
Linus Torvalds 已提交
715

716 717 718
out_module:
	module_put(module);
	goto out;
L
Linus Torvalds 已提交
719 720
}

721 722 723
static void deferred_put_nlk_sk(struct rcu_head *head)
{
	struct netlink_sock *nlk = container_of(head, struct netlink_sock, rcu);
724 725
	struct sock *sk = &nlk->sk;

726 727 728
	kfree(nlk->groups);
	nlk->groups = NULL;

729
	if (!refcount_dec_and_test(&sk->sk_refcnt))
730 731 732 733 734 735 736
		return;

	if (nlk->cb_running && nlk->cb.done) {
		INIT_WORK(&nlk->work, netlink_sock_destruct_work);
		schedule_work(&nlk->work);
		return;
	}
737

738
	sk_free(sk);
739 740
}

L
Linus Torvalds 已提交
741 742 743 744 745 746 747 748 749
static int netlink_release(struct socket *sock)
{
	struct sock *sk = sock->sk;
	struct netlink_sock *nlk;

	if (!sk)
		return 0;

	netlink_remove(sk);
750
	sock_orphan(sk);
L
Linus Torvalds 已提交
751 752
	nlk = nlk_sk(sk);

753 754 755 756
	/*
	 * OK. Socket is unlinked, any packets that arrive now
	 * will be purged.
	 */
L
Linus Torvalds 已提交
757

758 759 760 761 762 763 764 765 766 767 768 769 770 771
	/* must not acquire netlink_table_lock in any way again before unbind
	 * and notifying genetlink is done as otherwise it might deadlock
	 */
	if (nlk->netlink_unbind) {
		int i;

		for (i = 0; i < nlk->ngroups; i++)
			if (test_bit(i, nlk->groups))
				nlk->netlink_unbind(sock_net(sk), i + 1);
	}
	if (sk->sk_protocol == NETLINK_GENERIC &&
	    atomic_dec_return(&genl_sk_destructing_cnt) == 0)
		wake_up(&genl_sk_destructing_waitq);

L
Linus Torvalds 已提交
772 773 774 775 776
	sock->sk = NULL;
	wake_up_interruptible_all(&nlk->wait);

	skb_queue_purge(&sk->sk_write_queue);

777
	if (nlk->portid && nlk->bound) {
L
Linus Torvalds 已提交
778
		struct netlink_notify n = {
779
						.net = sock_net(sk),
L
Linus Torvalds 已提交
780
						.protocol = sk->sk_protocol,
781
						.portid = nlk->portid,
L
Linus Torvalds 已提交
782
					  };
W
WANG Cong 已提交
783
		blocking_notifier_call_chain(&netlink_chain,
784
				NETLINK_URELEASE, &n);
785
	}
786

787
	module_put(nlk->module);
788

789
	if (netlink_is_kernel(sk)) {
790
		netlink_table_grab();
791 792
		BUG_ON(nl_table[sk->sk_protocol].registered == 0);
		if (--nl_table[sk->sk_protocol].registered == 0) {
793 794 795 796 797
			struct listeners *old;

			old = nl_deref_protected(nl_table[sk->sk_protocol].listeners);
			RCU_INIT_POINTER(nl_table[sk->sk_protocol].listeners, NULL);
			kfree_rcu(old, rcu);
798
			nl_table[sk->sk_protocol].module = NULL;
799
			nl_table[sk->sk_protocol].bind = NULL;
800
			nl_table[sk->sk_protocol].unbind = NULL;
801
			nl_table[sk->sk_protocol].flags = 0;
802 803
			nl_table[sk->sk_protocol].registered = 0;
		}
804
		netlink_table_ungrab();
E
Eric Dumazet 已提交
805
	}
806

807
	local_bh_disable();
808
	sock_prot_inuse_add(sock_net(sk), &netlink_proto, -1);
809
	local_bh_enable();
810
	call_rcu(&nlk->rcu, deferred_put_nlk_sk);
L
Linus Torvalds 已提交
811 812 813 814 815 816
	return 0;
}

static int netlink_autobind(struct socket *sock)
{
	struct sock *sk = sock->sk;
817
	struct net *net = sock_net(sk);
818
	struct netlink_table *table = &nl_table[sk->sk_protocol];
819
	s32 portid = task_tgid_vnr(current);
L
Linus Torvalds 已提交
820
	int err;
H
Herbert Xu 已提交
821 822
	s32 rover = -4096;
	bool ok;
L
Linus Torvalds 已提交
823 824 825

retry:
	cond_resched();
826
	rcu_read_lock();
H
Herbert Xu 已提交
827 828 829
	ok = !__netlink_lookup(table, portid, net);
	rcu_read_unlock();
	if (!ok) {
830
		/* Bind collision, search negative portid values. */
H
Herbert Xu 已提交
831 832 833 834
		if (rover == -4096)
			/* rover will be in range [S32_MIN, -4097] */
			rover = S32_MIN + prandom_u32_max(-4096 - S32_MIN);
		else if (rover >= -4096)
835
			rover = -4097;
H
Herbert Xu 已提交
836
		portid = rover--;
837
		goto retry;
L
Linus Torvalds 已提交
838 839
	}

840
	err = netlink_insert(sk, portid);
L
Linus Torvalds 已提交
841 842
	if (err == -EADDRINUSE)
		goto retry;
843 844 845 846 847 848

	/* If 2 threads race to autobind, that is fine.  */
	if (err == -EBUSY)
		err = 0;

	return err;
L
Linus Torvalds 已提交
849 850
}

851 852 853 854 855 856 857 858 859 860 861 862 863
/**
 * __netlink_ns_capable - General netlink message capability test
 * @nsp: NETLINK_CB of the socket buffer holding a netlink command from userspace.
 * @user_ns: The user namespace of the capability to use
 * @cap: The capability to use
 *
 * Test to see if the opener of the socket we received the message
 * from had when the netlink socket was created and the sender of the
 * message has has the capability @cap in the user namespace @user_ns.
 */
bool __netlink_ns_capable(const struct netlink_skb_parms *nsp,
			struct user_namespace *user_ns, int cap)
{
864 865 866
	return ((nsp->flags & NETLINK_SKB_DST) ||
		file_ns_capable(nsp->sk->sk_socket->file, user_ns, cap)) &&
		ns_capable(user_ns, cap);
867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917
}
EXPORT_SYMBOL(__netlink_ns_capable);

/**
 * netlink_ns_capable - General netlink message capability test
 * @skb: socket buffer holding a netlink command from userspace
 * @user_ns: The user namespace of the capability to use
 * @cap: The capability to use
 *
 * Test whether both the opener of the socket the message was received from
 * and the sender of the message have the capability @cap in the user
 * namespace @user_ns.
 */
bool netlink_ns_capable(const struct sk_buff *skb,
			struct user_namespace *user_ns, int cap)
{
	const struct netlink_skb_parms *nsp = &NETLINK_CB(skb);

	return __netlink_ns_capable(nsp, user_ns, cap);
}
EXPORT_SYMBOL(netlink_ns_capable);

/**
 * netlink_capable - Netlink global message capability test
 * @skb: socket buffer holding a netlink command from userspace
 * @cap: The capability to use
 *
 * Test whether both the opener of the socket the message was received from
 * and the sender of the message have the capability @cap in the initial
 * user namespace.
 */
bool netlink_capable(const struct sk_buff *skb, int cap)
{
	struct user_namespace *init_ns = &init_user_ns;

	return netlink_ns_capable(skb, init_ns, cap);
}
EXPORT_SYMBOL(netlink_capable);

/**
 * netlink_net_capable - Netlink network namespace message capability test
 * @skb: socket buffer holding a netlink command from userspace
 * @cap: The capability to use
 *
 * Test whether both the opener of the socket the message was received from
 * and the sender of the message have the capability @cap in the user
 * namespace that owns the network namespace of that socket.
 */
bool netlink_net_capable(const struct sk_buff *skb, int cap)
{
	struct user_namespace *owner_ns = sock_net(skb->sk)->user_ns;

	return netlink_ns_capable(skb, owner_ns, cap);
}
EXPORT_SYMBOL(netlink_net_capable);

918
static inline int netlink_allowed(const struct socket *sock, unsigned int flag)
919
{
920
	return (nl_table[sock->sk->sk_protocol].flags & flag) ||
921
		ns_capable(sock_net(sock->sk)->user_ns, CAP_NET_ADMIN);
922
}
L
Linus Torvalds 已提交
923

924 925 926 927 928 929 930 931 932 933 934 935
static void
netlink_update_subscriptions(struct sock *sk, unsigned int subscriptions)
{
	struct netlink_sock *nlk = nlk_sk(sk);

	if (nlk->subscriptions && !subscriptions)
		__sk_del_bind_node(sk);
	else if (!nlk->subscriptions && subscriptions)
		sk_add_bind_node(sk, &nl_table[sk->sk_protocol].mc_list);
	nlk->subscriptions = subscriptions;
}

936
static int netlink_realloc_groups(struct sock *sk)
937 938 939
{
	struct netlink_sock *nlk = nlk_sk(sk);
	unsigned int groups;
940
	unsigned long *new_groups;
941 942
	int err = 0;

943 944
	netlink_table_grab();

945
	groups = nl_table[sk->sk_protocol].groups;
946
	if (!nl_table[sk->sk_protocol].registered) {
947
		err = -ENOENT;
948 949
		goto out_unlock;
	}
950

951 952
	if (nlk->ngroups >= groups)
		goto out_unlock;
953

954 955 956 957 958
	new_groups = krealloc(nlk->groups, NLGRPSZ(groups), GFP_ATOMIC);
	if (new_groups == NULL) {
		err = -ENOMEM;
		goto out_unlock;
	}
959
	memset((char *)new_groups + NLGRPSZ(nlk->ngroups), 0,
960 961 962
	       NLGRPSZ(groups) - NLGRPSZ(nlk->ngroups));

	nlk->groups = new_groups;
963
	nlk->ngroups = groups;
964 965 966
 out_unlock:
	netlink_table_ungrab();
	return err;
967 968
}

969
static void netlink_undo_bind(int group, long unsigned int groups,
970
			      struct sock *sk)
971
{
972
	struct netlink_sock *nlk = nlk_sk(sk);
973 974 975 976 977 978
	int undo;

	if (!nlk->netlink_unbind)
		return;

	for (undo = 0; undo < group; undo++)
979
		if (test_bit(undo, &groups))
980
			nlk->netlink_unbind(sock_net(sk), undo + 1);
981 982
}

983 984
static int netlink_bind(struct socket *sock, struct sockaddr *addr,
			int addr_len)
L
Linus Torvalds 已提交
985 986
{
	struct sock *sk = sock->sk;
987
	struct net *net = sock_net(sk);
L
Linus Torvalds 已提交
988 989
	struct netlink_sock *nlk = nlk_sk(sk);
	struct sockaddr_nl *nladdr = (struct sockaddr_nl *)addr;
990
	int err = 0;
991
	long unsigned int groups = nladdr->nl_groups;
992
	bool bound;
993

994 995 996
	if (addr_len < sizeof(struct sockaddr_nl))
		return -EINVAL;

L
Linus Torvalds 已提交
997 998 999 1000
	if (nladdr->nl_family != AF_NETLINK)
		return -EINVAL;

	/* Only superuser is allowed to listen multicasts */
1001
	if (groups) {
1002
		if (!netlink_allowed(sock, NL_CFG_F_NONROOT_RECV))
1003
			return -EPERM;
1004 1005 1006
		err = netlink_realloc_groups(sk);
		if (err)
			return err;
1007
	}
L
Linus Torvalds 已提交
1008

1009 1010 1011 1012 1013
	bound = nlk->bound;
	if (bound) {
		/* Ensure nlk->portid is up-to-date. */
		smp_rmb();

1014
		if (nladdr->nl_pid != nlk->portid)
L
Linus Torvalds 已提交
1015
			return -EINVAL;
1016
	}
1017

1018
	netlink_lock_table();
1019 1020 1021 1022 1023 1024
	if (nlk->netlink_bind && groups) {
		int group;

		for (group = 0; group < nlk->ngroups; group++) {
			if (!test_bit(group, &groups))
				continue;
1025
			err = nlk->netlink_bind(net, group + 1);
1026 1027
			if (!err)
				continue;
1028
			netlink_undo_bind(group, groups, sk);
1029
			goto unlock;
1030 1031 1032
		}
	}

1033 1034 1035 1036
	/* No need for barriers here as we return to user-space without
	 * using any of the bound attributes.
	 */
	if (!bound) {
L
Linus Torvalds 已提交
1037
		err = nladdr->nl_pid ?
1038
			netlink_insert(sk, nladdr->nl_pid) :
L
Linus Torvalds 已提交
1039
			netlink_autobind(sock);
1040
		if (err) {
1041
			netlink_undo_bind(nlk->ngroups, groups, sk);
1042
			goto unlock;
1043
		}
L
Linus Torvalds 已提交
1044 1045
	}

1046
	if (!groups && (nlk->groups == NULL || !(u32)nlk->groups[0]))
1047 1048
		goto unlock;
	netlink_unlock_table();
L
Linus Torvalds 已提交
1049 1050

	netlink_table_grab();
1051
	netlink_update_subscriptions(sk, nlk->subscriptions +
1052
					 hweight32(groups) -
1053
					 hweight32(nlk->groups[0]));
1054
	nlk->groups[0] = (nlk->groups[0] & ~0xffffffffUL) | groups;
1055
	netlink_update_listeners(sk);
L
Linus Torvalds 已提交
1056 1057 1058
	netlink_table_ungrab();

	return 0;
1059 1060 1061 1062

unlock:
	netlink_unlock_table();
	return err;
L
Linus Torvalds 已提交
1063 1064 1065 1066 1067 1068 1069 1070
}

static int netlink_connect(struct socket *sock, struct sockaddr *addr,
			   int alen, int flags)
{
	int err = 0;
	struct sock *sk = sock->sk;
	struct netlink_sock *nlk = nlk_sk(sk);
1071
	struct sockaddr_nl *nladdr = (struct sockaddr_nl *)addr;
L
Linus Torvalds 已提交
1072

1073 1074 1075
	if (alen < sizeof(addr->sa_family))
		return -EINVAL;

L
Linus Torvalds 已提交
1076 1077
	if (addr->sa_family == AF_UNSPEC) {
		sk->sk_state	= NETLINK_UNCONNECTED;
1078
		nlk->dst_portid	= 0;
1079
		nlk->dst_group  = 0;
L
Linus Torvalds 已提交
1080 1081 1082 1083 1084
		return 0;
	}
	if (addr->sa_family != AF_NETLINK)
		return -EINVAL;

1085
	if ((nladdr->nl_groups || nladdr->nl_pid) &&
1086
	    !netlink_allowed(sock, NL_CFG_F_NONROOT_SEND))
L
Linus Torvalds 已提交
1087 1088
		return -EPERM;

1089 1090 1091 1092
	/* No need for barriers here as we return to user-space without
	 * using any of the bound attributes.
	 */
	if (!nlk->bound)
L
Linus Torvalds 已提交
1093 1094 1095 1096
		err = netlink_autobind(sock);

	if (err == 0) {
		sk->sk_state	= NETLINK_CONNECTED;
1097
		nlk->dst_portid = nladdr->nl_pid;
1098
		nlk->dst_group  = ffs(nladdr->nl_groups);
L
Linus Torvalds 已提交
1099 1100 1101 1102 1103
	}

	return err;
}

1104 1105
static int netlink_getname(struct socket *sock, struct sockaddr *addr,
			   int *addr_len, int peer)
L
Linus Torvalds 已提交
1106 1107 1108
{
	struct sock *sk = sock->sk;
	struct netlink_sock *nlk = nlk_sk(sk);
1109
	DECLARE_SOCKADDR(struct sockaddr_nl *, nladdr, addr);
1110

L
Linus Torvalds 已提交
1111 1112 1113 1114 1115
	nladdr->nl_family = AF_NETLINK;
	nladdr->nl_pad = 0;
	*addr_len = sizeof(*nladdr);

	if (peer) {
1116
		nladdr->nl_pid = nlk->dst_portid;
1117
		nladdr->nl_groups = netlink_group_mask(nlk->dst_group);
L
Linus Torvalds 已提交
1118
	} else {
1119
		nladdr->nl_pid = nlk->portid;
1120
		netlink_lock_table();
1121
		nladdr->nl_groups = nlk->groups ? nlk->groups[0] : 0;
1122
		netlink_unlock_table();
L
Linus Torvalds 已提交
1123 1124 1125 1126
	}
	return 0;
}

1127 1128 1129 1130 1131 1132 1133 1134
/*
 * No ioctl is handled at the netlink socket level: every command is
 * answered with -ENOIOCTLCMD (try to hand this ioctl down to the NIC
 * drivers).
 */
static int netlink_ioctl(struct socket *sock, unsigned int cmd,
			 unsigned long arg)
{
	return -ENOIOCTLCMD;
}

1135
/*
 * Look up the socket bound to @portid in the sender's namespace and protocol.
 *
 * Returns the socket found by netlink_lookup(), or ERR_PTR(-ECONNREFUSED)
 * when no such socket exists or when the destination is connected to a
 * portid other than the sender's.  In the latter case the socket returned by
 * the lookup is released with sock_put() before the error is returned.
 */
static struct sock *netlink_getsockbyportid(struct sock *ssk, u32 portid)
{
	struct sock *dst;

	dst = netlink_lookup(sock_net(ssk), ssk->sk_protocol, portid);
	if (!dst)
		return ERR_PTR(-ECONNREFUSED);

	/* A connected destination only accepts messages from its peer. */
	if (dst->sk_state == NETLINK_CONNECTED &&
	    nlk_sk(dst)->dst_portid != nlk_sk(ssk)->portid) {
		sock_put(dst);
		return ERR_PTR(-ECONNREFUSED);
	}

	return dst;
}

struct sock *netlink_getsockbyfilp(struct file *filp)
{
A
Al Viro 已提交
1156
	struct inode *inode = file_inode(filp);
L
Linus Torvalds 已提交
1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169
	struct sock *sock;

	if (!S_ISSOCK(inode->i_mode))
		return ERR_PTR(-ENOTSOCK);

	sock = SOCKET_I(inode)->sk;
	if (sock->sk_family != AF_NETLINK)
		return ERR_PTR(-EINVAL);

	sock_hold(sock);
	return sock;
}

1170 1171
static struct sk_buff *netlink_alloc_large_skb(unsigned int size,
					       int broadcast)
1172 1173 1174 1175
{
	struct sk_buff *skb;
	void *data;

1176
	if (size <= NLMSG_GOODSIZE || broadcast)
1177 1178
		return alloc_skb(size, GFP_KERNEL);

1179 1180
	size = SKB_DATA_ALIGN(size) +
	       SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
1181 1182 1183

	data = vmalloc(size);
	if (data == NULL)
1184
		return NULL;
1185

E
Eric Dumazet 已提交
1186
	skb = __build_skb(data, size);
1187 1188
	if (skb == NULL)
		vfree(data);
E
Eric Dumazet 已提交
1189
	else
1190
		skb->destructor = netlink_skb_destructor;
1191 1192 1193 1194

	return skb;
}

L
Linus Torvalds 已提交
1195 1196 1197 1198 1199 1200 1201 1202 1203 1204
/*
 * Attach a skb to a netlink socket.
 * The caller must hold a reference to the destination socket. On error, the
 * reference is dropped. The skb is not send to the destination, just all
 * all error checks are performed and memory in the queue is reserved.
 * Return values:
 * < 0: error. skb freed, reference to sock dropped.
 * 0: continue
 * 1: repeat lookup - reference dropped while waiting for socket memory.
 */
1205
int netlink_attachskb(struct sock *sk, struct sk_buff *skb,
P
Patrick McHardy 已提交
1206
		      long *timeo, struct sock *ssk)
L
Linus Torvalds 已提交
1207 1208 1209 1210 1211
{
	struct netlink_sock *nlk;

	nlk = nlk_sk(sk);

1212
	if ((atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf ||
1213
	     test_bit(NETLINK_S_CONGESTED, &nlk->state))) {
L
Linus Torvalds 已提交
1214
		DECLARE_WAITQUEUE(wait, current);
P
Patrick McHardy 已提交
1215
		if (!*timeo) {
1216
			if (!ssk || netlink_is_kernel(ssk))
L
Linus Torvalds 已提交
1217 1218 1219 1220 1221 1222 1223 1224 1225 1226
				netlink_overrun(sk);
			sock_put(sk);
			kfree_skb(skb);
			return -EAGAIN;
		}

		__set_current_state(TASK_INTERRUPTIBLE);
		add_wait_queue(&nlk->wait, &wait);

		if ((atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf ||
1227
		     test_bit(NETLINK_S_CONGESTED, &nlk->state)) &&
L
Linus Torvalds 已提交
1228
		    !sock_flag(sk, SOCK_DEAD))
P
Patrick McHardy 已提交
1229
			*timeo = schedule_timeout(*timeo);
L
Linus Torvalds 已提交
1230 1231 1232 1233 1234 1235 1236

		__set_current_state(TASK_RUNNING);
		remove_wait_queue(&nlk->wait, &wait);
		sock_put(sk);

		if (signal_pending(current)) {
			kfree_skb(skb);
P
Patrick McHardy 已提交
1237
			return sock_intr_errno(*timeo);
L
Linus Torvalds 已提交
1238 1239 1240
		}
		return 1;
	}
1241
	netlink_skb_set_owner_r(skb, sk);
L
Linus Torvalds 已提交
1242 1243 1244
	return 0;
}

1245
static int __netlink_sendskb(struct sock *sk, struct sk_buff *skb)
L
Linus Torvalds 已提交
1246 1247 1248
{
	int len = skb->len;

C
Cong Wang 已提交
1249
	netlink_deliver_tap(sock_net(sk), skb);
1250

1251
	skb_queue_tail(&sk->sk_receive_queue, skb);
1252
	sk->sk_data_ready(sk);
1253 1254 1255 1256 1257 1258 1259
	return len;
}

/*
 * Queue @skb on @sk via __netlink_sendskb(), then drop one reference on @sk.
 * Returns the length reported by __netlink_sendskb().
 */
int netlink_sendskb(struct sock *sk, struct sk_buff *skb)
{
	int queued;

	queued = __netlink_sendskb(sk, skb);
	sock_put(sk);

	return queued;
}

/* Free @skb and drop one reference on @sk. */
void netlink_detachskb(struct sock *sk, struct sk_buff *skb)
{
	kfree_skb(skb);

	sock_put(sk);
}

1270
/*
 * Try to shrink the unused tail room of @skb before it is queued.
 *
 * Nothing is done for vmalloc-backed heads or when the unused tail is less
 * than half of the skb's truesize.  A shared skb is cloned first; if the
 * clone fails the original skb is returned unchanged.  The reallocation is
 * best effort: the pskb_expand_head() result is intentionally not checked.
 * Returns the skb the caller must use from now on.
 */
static struct sk_buff *netlink_trim(struct sk_buff *skb, gfp_t allocation)
{
	struct sk_buff *clone;
	gfp_t gfp;
	int slack;

	WARN_ON(skb->sk != NULL);

	slack = skb->end - skb->tail;
	if (is_vmalloc_addr(skb->head) || slack * 2 < skb->truesize)
		return skb;

	if (skb_shared(skb)) {
		clone = skb_clone(skb, allocation);
		if (!clone)
			return skb;
		consume_skb(skb);
		skb = clone;
	}

	/* Opportunistic only: no direct reclaim, no retries, no warnings. */
	gfp = (allocation & ~__GFP_DIRECT_RECLAIM) |
	      __GFP_NOWARN | __GFP_NORETRY;
	pskb_expand_head(skb, 0, -slack, gfp);

	return skb;
}

1293 1294
/*
 * Deliver @skb from @ssk to the kernel-side socket @sk.
 *
 * If @sk has a netlink_rcv callback the skb is charged to @sk, tagged with
 * the sending socket, shown to the taps, passed to the callback and then
 * consumed; the skb length is returned.  Without a callback the skb is freed
 * and -ECONNREFUSED is returned.  One reference on @sk is dropped either way.
 */
static int netlink_unicast_kernel(struct sock *sk, struct sk_buff *skb,
				  struct sock *ssk)
{
	struct netlink_sock *nlk = nlk_sk(sk);
	int ret;

	if (nlk->netlink_rcv == NULL) {
		ret = -ECONNREFUSED;
		kfree_skb(skb);
	} else {
		/* Remember the length now; the skb is consumed below. */
		ret = skb->len;
		netlink_skb_set_owner_r(skb, sk);
		NETLINK_CB(skb).sk = ssk;
		netlink_deliver_tap_kernel(sk, ssk, skb);
		nlk->netlink_rcv(skb);
		consume_skb(skb);
	}

	sock_put(sk);
	return ret;
}

int netlink_unicast(struct sock *ssk, struct sk_buff *skb,
1315
		    u32 portid, int nonblock)
L
Linus Torvalds 已提交
1316 1317 1318 1319 1320 1321 1322 1323 1324
{
	struct sock *sk;
	int err;
	long timeo;

	skb = netlink_trim(skb, gfp_any());

	timeo = sock_sndtimeo(ssk, nonblock);
retry:
1325
	sk = netlink_getsockbyportid(ssk, portid);
L
Linus Torvalds 已提交
1326 1327 1328 1329
	if (IS_ERR(sk)) {
		kfree_skb(skb);
		return PTR_ERR(sk);
	}
1330
	if (netlink_is_kernel(sk))
1331
		return netlink_unicast_kernel(sk, skb, ssk);
1332

1333
	if (sk_filter(sk, skb)) {
W
Wang Chen 已提交
1334
		err = skb->len;
1335 1336 1337 1338 1339
		kfree_skb(skb);
		sock_put(sk);
		return err;
	}

1340
	err = netlink_attachskb(sk, skb, &timeo, ssk);
L
Linus Torvalds 已提交
1341 1342 1343 1344 1345
	if (err == 1)
		goto retry;
	if (err)
		return err;

1346
	return netlink_sendskb(sk, skb);
L
Linus Torvalds 已提交
1347
}
1348
EXPORT_SYMBOL(netlink_unicast);
L
Linus Torvalds 已提交
1349

1350 1351 1352
int netlink_has_listeners(struct sock *sk, unsigned int group)
{
	int res = 0;
1353
	struct listeners *listeners;
1354

1355
	BUG_ON(!netlink_is_kernel(sk));
1356 1357 1358 1359

	rcu_read_lock();
	listeners = rcu_dereference(nl_table[sk->sk_protocol].listeners);

1360
	if (listeners && group - 1 < nl_table[sk->sk_protocol].groups)
1361
		res = test_bit(group - 1, listeners->masks);
1362 1363 1364

	rcu_read_unlock();

1365 1366 1367 1368
	return res;
}
EXPORT_SYMBOL_GPL(netlink_has_listeners);

1369
static int netlink_broadcast_deliver(struct sock *sk, struct sk_buff *skb)
L
Linus Torvalds 已提交
1370 1371 1372 1373
{
	struct netlink_sock *nlk = nlk_sk(sk);

	if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf &&
1374
	    !test_bit(NETLINK_S_CONGESTED, &nlk->state)) {
1375
		netlink_skb_set_owner_r(skb, sk);
1376
		__netlink_sendskb(sk, skb);
1377
		return atomic_read(&sk->sk_rmem_alloc) > (sk->sk_rcvbuf >> 1);
L
Linus Torvalds 已提交
1378 1379 1380 1381 1382 1383
	}
	return -1;
}

struct netlink_broadcast_data {
	struct sock *exclude_sk;
1384
	struct net *net;
1385
	u32 portid;
L
Linus Torvalds 已提交
1386 1387
	u32 group;
	int failure;
1388
	int delivery_failure;
L
Linus Torvalds 已提交
1389 1390
	int congested;
	int delivered;
A
Al Viro 已提交
1391
	gfp_t allocation;
L
Linus Torvalds 已提交
1392
	struct sk_buff *skb, *skb2;
1393 1394
	int (*tx_filter)(struct sock *dsk, struct sk_buff *skb, void *data);
	void *tx_data;
L
Linus Torvalds 已提交
1395 1396
};

1397 1398
static void do_one_broadcast(struct sock *sk,
				    struct netlink_broadcast_data *p)
L
Linus Torvalds 已提交
1399 1400 1401 1402 1403
{
	struct netlink_sock *nlk = nlk_sk(sk);
	int val;

	if (p->exclude_sk == sk)
1404
		return;
L
Linus Torvalds 已提交
1405

1406
	if (nlk->portid == p->portid || p->group - 1 >= nlk->ngroups ||
1407
	    !test_bit(p->group - 1, nlk->groups))
1408
		return;
L
Linus Torvalds 已提交
1409

1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420
	if (!net_eq(sock_net(sk), p->net)) {
		if (!(nlk->flags & NETLINK_F_LISTEN_ALL_NSID))
			return;

		if (!peernet_has_id(sock_net(sk), p->net))
			return;

		if (!file_ns_capable(sk->sk_socket->file, p->net->user_ns,
				     CAP_NET_BROADCAST))
			return;
	}
1421

L
Linus Torvalds 已提交
1422 1423
	if (p->failure) {
		netlink_overrun(sk);
1424
		return;
L
Linus Torvalds 已提交
1425 1426 1427 1428
	}

	sock_hold(sk);
	if (p->skb2 == NULL) {
1429
		if (skb_shared(p->skb)) {
L
Linus Torvalds 已提交
1430 1431
			p->skb2 = skb_clone(p->skb, p->allocation);
		} else {
1432 1433 1434 1435 1436 1437
			p->skb2 = skb_get(p->skb);
			/*
			 * skb ownership may have been set when
			 * delivered to a previous socket.
			 */
			skb_orphan(p->skb2);
L
Linus Torvalds 已提交
1438 1439 1440 1441 1442 1443
		}
	}
	if (p->skb2 == NULL) {
		netlink_overrun(sk);
		/* Clone failed. Notify ALL listeners. */
		p->failure = 1;
1444
		if (nlk->flags & NETLINK_F_BROADCAST_SEND_ERROR)
1445
			p->delivery_failure = 1;
1446 1447 1448
		goto out;
	}
	if (p->tx_filter && p->tx_filter(sk, p->skb2, p->tx_data)) {
1449 1450
		kfree_skb(p->skb2);
		p->skb2 = NULL;
1451 1452 1453
		goto out;
	}
	if (sk_filter(sk, p->skb2)) {
1454 1455
		kfree_skb(p->skb2);
		p->skb2 = NULL;
1456 1457 1458
		goto out;
	}
	NETLINK_CB(p->skb2).nsid = peernet2id(sock_net(sk), p->net);
1459 1460
	if (NETLINK_CB(p->skb2).nsid != NETNSA_NSID_NOT_ASSIGNED)
		NETLINK_CB(p->skb2).nsid_is_set = true;
1461 1462
	val = netlink_broadcast_deliver(sk, p->skb2);
	if (val < 0) {
L
Linus Torvalds 已提交
1463
		netlink_overrun(sk);
1464
		if (nlk->flags & NETLINK_F_BROADCAST_SEND_ERROR)
1465
			p->delivery_failure = 1;
L
Linus Torvalds 已提交
1466 1467 1468 1469 1470
	} else {
		p->congested |= val;
		p->delivered = 1;
		p->skb2 = NULL;
	}
1471
out:
L
Linus Torvalds 已提交
1472 1473 1474
	sock_put(sk);
}

1475
/*
 * Broadcast @skb to the sockets on @ssk's protocol multicast list that are
 * subscribed to @group, optionally consulting @filter for each destination.
 *
 * @ssk itself and any socket bound to @portid are skipped.  The skb is
 * always consumed.  Returns 0 if at least one socket received the message,
 * -ENOBUFS if delivery failed for a socket that asked for broadcast errors,
 * and -ESRCH if nobody received it.
 */
int netlink_broadcast_filtered(struct sock *ssk, struct sk_buff *skb, u32 portid,
	u32 group, gfp_t allocation,
	int (*filter)(struct sock *dsk, struct sk_buff *skb, void *data),
	void *filter_data)
{
	struct netlink_broadcast_data info;
	struct sock *sk;

	skb = netlink_trim(skb, allocation);

	/* Members not named here (status flags, skb2) start out as zero/NULL. */
	info = (struct netlink_broadcast_data) {
		.exclude_sk	= ssk,
		.net		= sock_net(ssk),
		.portid		= portid,
		.group		= group,
		.allocation	= allocation,
		.skb		= skb,
		.tx_filter	= filter,
		.tx_data	= filter_data,
	};

	/* While we sleep in clone, do not allow to change socket list */
	netlink_lock_table();

	sk_for_each_bound(sk, &nl_table[ssk->sk_protocol].mc_list)
		do_one_broadcast(sk, &info);

	consume_skb(skb);

	netlink_unlock_table();

	/* info.skb2 may still hold a copy nobody took; release it either way. */
	if (info.delivery_failure) {
		kfree_skb(info.skb2);
		return -ENOBUFS;
	}
	consume_skb(info.skb2);

	if (!info.delivered)
		return -ESRCH;

	if (info.congested && gfpflags_allow_blocking(allocation))
		yield();

	return 0;
}
1524 1525
EXPORT_SYMBOL(netlink_broadcast_filtered);

1526
/*
 * Unfiltered broadcast: netlink_broadcast_filtered() with no filter callback
 * and no filter data.  Same return values.
 */
int netlink_broadcast(struct sock *ssk, struct sk_buff *skb, u32 portid,
		      u32 group, gfp_t allocation)
{
	return netlink_broadcast_filtered(ssk, skb, portid, group,
					  allocation, NULL, NULL);
}
1532
EXPORT_SYMBOL(netlink_broadcast);
L
Linus Torvalds 已提交
1533 1534 1535

struct netlink_set_err_data {
	struct sock *exclude_sk;
1536
	u32 portid;
L
Linus Torvalds 已提交
1537 1538 1539 1540
	u32 group;
	int code;
};

1541
static int do_one_set_err(struct sock *sk, struct netlink_set_err_data *p)
L
Linus Torvalds 已提交
1542 1543
{
	struct netlink_sock *nlk = nlk_sk(sk);
1544
	int ret = 0;
L
Linus Torvalds 已提交
1545 1546 1547 1548

	if (sk == p->exclude_sk)
		goto out;

O
Octavian Purdila 已提交
1549
	if (!net_eq(sock_net(sk), sock_net(p->exclude_sk)))
1550 1551
		goto out;

1552
	if (nlk->portid == p->portid || p->group - 1 >= nlk->ngroups ||
1553
	    !test_bit(p->group - 1, nlk->groups))
L
Linus Torvalds 已提交
1554 1555
		goto out;

1556
	if (p->code == ENOBUFS && nlk->flags & NETLINK_F_RECV_NO_ENOBUFS) {
1557 1558 1559 1560
		ret = 1;
		goto out;
	}

L
Linus Torvalds 已提交
1561 1562 1563
	sk->sk_err = p->code;
	sk->sk_error_report(sk);
out:
1564
	return ret;
L
Linus Torvalds 已提交
1565 1566
}

1567 1568 1569
/**
 * netlink_set_err - report error to broadcast listeners
 * @ssk: the kernel netlink socket, as returned by netlink_kernel_create()
1570
 * @portid: the PORTID of a process that we want to skip (if any)
1571
 * @group: the broadcast group that will notice the error
1572
 * @code: error code, must be negative (as usual in kernelspace)
1573 1574
 *
 * This function returns the number of broadcast listeners that have set the
1575
 * NETLINK_NO_ENOBUFS socket option.
1576
 */
1577
int netlink_set_err(struct sock *ssk, u32 portid, u32 group, int code)
L
Linus Torvalds 已提交
1578 1579 1580
{
	struct netlink_set_err_data info;
	struct sock *sk;
1581
	int ret = 0;
L
Linus Torvalds 已提交
1582 1583

	info.exclude_sk = ssk;
1584
	info.portid = portid;
L
Linus Torvalds 已提交
1585
	info.group = group;
1586 1587
	/* sk->sk_err wants a positive error value */
	info.code = -code;
L
Linus Torvalds 已提交
1588 1589 1590

	read_lock(&nl_table_lock);

1591
	sk_for_each_bound(sk, &nl_table[ssk->sk_protocol].mc_list)
1592
		ret += do_one_set_err(sk, &info);
L
Linus Torvalds 已提交
1593 1594

	read_unlock(&nl_table_lock);
1595
	return ret;
L
Linus Torvalds 已提交
1596
}
1597
EXPORT_SYMBOL(netlink_set_err);
L
Linus Torvalds 已提交
1598

1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615
/*
 * Subscribe (@is_new != 0) or unsubscribe (@is_new == 0) @nlk to/from the
 * 1-based multicast @group, then refresh the socket's subscription count
 * and the listeners bookkeeping.
 *
 * Must be called with netlink table grabbed.
 */
static void netlink_update_socket_mc(struct netlink_sock *nlk,
				     unsigned int group,
				     int is_new)
{
	int now = !!is_new;
	int before;
	int count;

	before = test_bit(group - 1, nlk->groups);
	count = nlk->subscriptions - before + now;

	if (now)
		__set_bit(group - 1, nlk->groups);
	else
		__clear_bit(group - 1, nlk->groups);

	netlink_update_subscriptions(&nlk->sk, count);
	netlink_update_listeners(&nlk->sk);
}

1616
static int netlink_setsockopt(struct socket *sock, int level, int optname,
1617
			      char __user *optval, unsigned int optlen)
1618 1619 1620
{
	struct sock *sk = sock->sk;
	struct netlink_sock *nlk = nlk_sk(sk);
1621 1622
	unsigned int val = 0;
	int err;
1623 1624 1625 1626

	if (level != SOL_NETLINK)
		return -ENOPROTOOPT;

1627
	if (optlen >= sizeof(int) &&
1628
	    get_user(val, (unsigned int __user *)optval))
1629 1630 1631 1632 1633
		return -EFAULT;

	switch (optname) {
	case NETLINK_PKTINFO:
		if (val)
1634
			nlk->flags |= NETLINK_F_RECV_PKTINFO;
1635
		else
1636
			nlk->flags &= ~NETLINK_F_RECV_PKTINFO;
1637 1638 1639 1640
		err = 0;
		break;
	case NETLINK_ADD_MEMBERSHIP:
	case NETLINK_DROP_MEMBERSHIP: {
1641
		if (!netlink_allowed(sock, NL_CFG_F_NONROOT_RECV))
1642
			return -EPERM;
1643 1644 1645
		err = netlink_realloc_groups(sk);
		if (err)
			return err;
1646 1647
		if (!val || val - 1 >= nlk->ngroups)
			return -EINVAL;
1648
		if (optname == NETLINK_ADD_MEMBERSHIP && nlk->netlink_bind) {
1649
			err = nlk->netlink_bind(sock_net(sk), val);
1650 1651 1652
			if (err)
				return err;
		}
1653
		netlink_table_grab();
1654 1655
		netlink_update_socket_mc(nlk, val,
					 optname == NETLINK_ADD_MEMBERSHIP);
1656
		netlink_table_ungrab();
1657
		if (optname == NETLINK_DROP_MEMBERSHIP && nlk->netlink_unbind)
1658
			nlk->netlink_unbind(sock_net(sk), val);
1659

1660 1661 1662
		err = 0;
		break;
	}
1663 1664
	case NETLINK_BROADCAST_ERROR:
		if (val)
1665
			nlk->flags |= NETLINK_F_BROADCAST_SEND_ERROR;
1666
		else
1667
			nlk->flags &= ~NETLINK_F_BROADCAST_SEND_ERROR;
1668 1669
		err = 0;
		break;
1670 1671
	case NETLINK_NO_ENOBUFS:
		if (val) {
1672 1673
			nlk->flags |= NETLINK_F_RECV_NO_ENOBUFS;
			clear_bit(NETLINK_S_CONGESTED, &nlk->state);
1674
			wake_up_interruptible(&nlk->wait);
E
Eric Dumazet 已提交
1675
		} else {
1676
			nlk->flags &= ~NETLINK_F_RECV_NO_ENOBUFS;
E
Eric Dumazet 已提交
1677
		}
1678 1679
		err = 0;
		break;
1680 1681 1682 1683 1684 1685 1686 1687 1688 1689
	case NETLINK_LISTEN_ALL_NSID:
		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_BROADCAST))
			return -EPERM;

		if (val)
			nlk->flags |= NETLINK_F_LISTEN_ALL_NSID;
		else
			nlk->flags &= ~NETLINK_F_LISTEN_ALL_NSID;
		err = 0;
		break;
1690 1691 1692 1693 1694 1695 1696
	case NETLINK_CAP_ACK:
		if (val)
			nlk->flags |= NETLINK_F_CAP_ACK;
		else
			nlk->flags &= ~NETLINK_F_CAP_ACK;
		err = 0;
		break;
J
Johannes Berg 已提交
1697 1698 1699 1700 1701 1702 1703
	case NETLINK_EXT_ACK:
		if (val)
			nlk->flags |= NETLINK_F_EXT_ACK;
		else
			nlk->flags &= ~NETLINK_F_EXT_ACK;
		err = 0;
		break;
1704 1705 1706 1707 1708 1709 1710
	default:
		err = -ENOPROTOOPT;
	}
	return err;
}

/*
 * SOL_NETLINK getsockopt handler.
 *
 * Flag-style options (NETLINK_PKTINFO, NETLINK_BROADCAST_ERROR,
 * NETLINK_NO_ENOBUFS, NETLINK_CAP_ACK, NETLINK_EXT_ACK) require a buffer of
 * at least sizeof(int) and return 0 or 1 as a full int, with *optlen set to
 * sizeof(int).
 *
 * NETLINK_LIST_MEMBERSHIPS copies the group bitmap out as an array of u32
 * words, as many as fit in the supplied length, and always stores in
 * *optlen the number of bytes needed for the whole bitmap.
 *
 * Returns 0 on success, -ENOPROTOOPT for another level or an unknown option,
 * -EINVAL for a negative or too short length, -EFAULT on a failed user
 * access.
 */
static int netlink_getsockopt(struct socket *sock, int level, int optname,
			      char __user *optval, int __user *optlen)
{
	struct sock *sk = sock->sk;
	struct netlink_sock *nlk = nlk_sk(sk);
	unsigned int flag;
	int len, val, err;

	if (level != SOL_NETLINK)
		return -ENOPROTOOPT;

	if (get_user(len, optlen))
		return -EFAULT;
	if (len < 0)
		return -EINVAL;

	switch (optname) {
	case NETLINK_PKTINFO:
		flag = NETLINK_F_RECV_PKTINFO;
		break;
	case NETLINK_BROADCAST_ERROR:
		flag = NETLINK_F_BROADCAST_SEND_ERROR;
		break;
	case NETLINK_NO_ENOBUFS:
		flag = NETLINK_F_RECV_NO_ENOBUFS;
		break;
	case NETLINK_CAP_ACK:
		flag = NETLINK_F_CAP_ACK;
		break;
	case NETLINK_EXT_ACK:
		flag = NETLINK_F_EXT_ACK;
		break;
	case NETLINK_LIST_MEMBERSHIPS: {
		int pos, idx, shift;

		err = 0;
		netlink_lock_table();
		/* pos is a byte offset into the bitmap, advanced one u32 at a time. */
		for (pos = 0; pos * 8 < nlk->ngroups; pos += sizeof(u32)) {
			if (len - pos < sizeof(u32))
				break;

			idx = pos / sizeof(unsigned long);
			shift = (pos % sizeof(unsigned long)) * 8;
			if (put_user((u32)(nlk->groups[idx] >> shift),
				     (u32 __user *)(optval + pos))) {
				err = -EFAULT;
				break;
			}
		}
		/*
		 * Round the bit count up to whole bytes before aligning, so
		 * that a group count that is not a multiple of 8 does not
		 * under-report the required buffer size.
		 */
		if (put_user(ALIGN(DIV_ROUND_UP(nlk->ngroups, 8), sizeof(u32)),
			     optlen))
			err = -EFAULT;
		netlink_unlock_table();
		return err;
	}
	default:
		return -ENOPROTOOPT;
	}

	/* Common tail for the flag-style options. */
	if (len < sizeof(int))
		return -EINVAL;
	len = sizeof(int);
	val = nlk->flags & flag ? 1 : 0;
	/*
	 * put_user() sizes the store from the pointer type; optval is a
	 * char pointer, so cast it to write the whole int announced in *optlen.
	 */
	if (put_user(len, optlen) ||
	    put_user(val, (int __user *)optval))
		return -EFAULT;

	return 0;
}

static void netlink_cmsg_recv_pktinfo(struct msghdr *msg, struct sk_buff *skb)
{
	struct nl_pktinfo info;

	info.group = NETLINK_CB(skb).dst_group;
	put_cmsg(msg, SOL_NETLINK, NETLINK_PKTINFO, sizeof(info), &info);
}

1811 1812 1813 1814 1815 1816 1817 1818 1819 1820
/*
 * Attach a NETLINK_LISTEN_ALL_NSID control message with the namespace id
 * stored in @skb's control block, but only when that id has been set.
 */
static void netlink_cmsg_listen_all_nsid(struct sock *sk, struct msghdr *msg,
					 struct sk_buff *skb)
{
	if (NETLINK_CB(skb).nsid_is_set)
		put_cmsg(msg, SOL_NETLINK, NETLINK_LISTEN_ALL_NSID,
			 sizeof(int), &NETLINK_CB(skb).nsid);
}

1821
/*
 * sendmsg() handler for netlink sockets.
 *
 * The destination is taken from msg->msg_name when an address is supplied,
 * otherwise from the destination stored on the socket.  A supplied address
 * must be at least a full struct sockaddr_nl of family AF_NETLINK (-EINVAL
 * otherwise), since nl_family, nl_pid and nl_groups are all read from it.
 * A non-zero destination pid or group taken from the address requires
 * NL_CFG_F_NONROOT_SEND permission (-EPERM).
 *
 * A socket that is not yet bound is autobound.  The payload is copied into
 * a freshly allocated skb; when a destination group is set the message is
 * broadcast to that group first, and it is then always unicast to the
 * destination portid.
 *
 * Returns the netlink_unicast() result on success, or a negative error:
 * -EOPNOTSUPP (MSG_OOB), -EMSGSIZE (len exceeds sk_sndbuf - 32), -ENOBUFS
 * (skb allocation failed), -EFAULT (copy from the message failed), or an
 * error from scm_send(), netlink_autobind() or security_netlink_send().
 */
static int netlink_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
{
	struct sock *sk = sock->sk;
	struct netlink_sock *nlk = nlk_sk(sk);
	DECLARE_SOCKADDR(struct sockaddr_nl *, addr, msg->msg_name);
	u32 dst_portid;
	u32 dst_group;
	struct sk_buff *skb;
	int err;
	struct scm_cookie scm;
	u32 netlink_skb_flags = 0;

	if (msg->msg_flags&MSG_OOB)
		return -EOPNOTSUPP;

	err = scm_send(sock, msg, &scm, true);
	if (err < 0)
		return err;

	if (msg->msg_namelen) {
		err = -EINVAL;
		/* Do not read address fields the caller did not supply. */
		if (msg->msg_namelen < sizeof(struct sockaddr_nl))
			goto out;
		if (addr->nl_family != AF_NETLINK)
			goto out;
		dst_portid = addr->nl_pid;
		/* Only the lowest set bit of the mask is used, as a group number. */
		dst_group = ffs(addr->nl_groups);
		err =  -EPERM;
		if ((dst_group || dst_portid) &&
		    !netlink_allowed(sock, NL_CFG_F_NONROOT_SEND))
			goto out;
		netlink_skb_flags |= NETLINK_SKB_DST;
	} else {
		dst_portid = nlk->dst_portid;
		dst_group = nlk->dst_group;
	}

	if (!nlk->bound) {
		err = netlink_autobind(sock);
		if (err)
			goto out;
	} else {
		/* Ensure nlk is hashed and visible. */
		smp_rmb();
	}

	err = -EMSGSIZE;
	if (len > sk->sk_sndbuf - 32)
		goto out;
	err = -ENOBUFS;
	skb = netlink_alloc_large_skb(len, dst_group);
	if (skb == NULL)
		goto out;

	NETLINK_CB(skb).portid	= nlk->portid;
	NETLINK_CB(skb).dst_group = dst_group;
	NETLINK_CB(skb).creds	= scm.creds;
	NETLINK_CB(skb).flags	= netlink_skb_flags;

	err = -EFAULT;
	if (memcpy_from_msg(skb_put(skb, len), msg, len)) {
		kfree_skb(skb);
		goto out;
	}

	err = security_netlink_send(sk, skb);
	if (err) {
		kfree_skb(skb);
		goto out;
	}

	if (dst_group) {
		/* The broadcast consumes one reference; keep ours for unicast. */
		refcount_inc(&skb->users);
		netlink_broadcast(sk, skb, dst_portid, dst_group, GFP_KERNEL);
	}
	err = netlink_unicast(sk, skb, dst_portid, msg->msg_flags&MSG_DONTWAIT);

out:
	scm_destroy(&scm);
	return err;
}

1901
/*
 * recvmsg() handler for netlink sockets.
 *
 * Dequeues one datagram, copies up to @len bytes of it into @msg (setting
 * MSG_TRUNC in msg_flags when the datagram is longer), fills in the sender
 * address and the optional PKTINFO / LISTEN_ALL_NSID control messages, and
 * passes the sender credentials on through scm_recv().  When a dump is
 * running and the receive buffer has drained to half of sk_rcvbuf or less,
 * the dump is continued.
 *
 * Returns the number of bytes copied (the full datagram length when the
 * caller passed MSG_TRUNC), or a negative error.
 */
static int netlink_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
			   int flags)
{
	struct sock *sk = sock->sk;
	struct netlink_sock *nlk = nlk_sk(sk);
	struct sk_buff *skb, *payload;
	struct scm_cookie scm;
	int noblock = flags & MSG_DONTWAIT;
	size_t copied = 0;
	int err, ret;

	if (flags & MSG_OOB)
		return -EOPNOTSUPP;

	skb = skb_recv_datagram(sk, flags, noblock, &err);
	if (!skb)
		goto out;

	payload = skb;

#ifdef CONFIG_COMPAT_NETLINK_MESSAGES
	if (unlikely(skb_shinfo(skb)->frag_list)) {
		/*
		 * A frag_list here means the frag_list skb carries the data
		 * for compat tasks while the regular skb carries the data
		 * for normal (non-compat) tasks.
		 *
		 * For a compat receiver, copy from the frag_list skb; 'skb'
		 * is still used for everything else, including freeing both
		 * later.
		 */
		if (flags & MSG_CMSG_COMPAT)
			payload = skb_shinfo(skb)->frag_list;
	}
#endif

	/* Record the max length of recvmsg() calls for future allocations */
	nlk->max_recvmsg_len = max(nlk->max_recvmsg_len, len);
	nlk->max_recvmsg_len = min_t(size_t, nlk->max_recvmsg_len,
				     SKB_WITH_OVERHEAD(32768));

	copied = payload->len;
	if (len < copied) {
		msg->msg_flags |= MSG_TRUNC;
		copied = len;
	}

	skb_reset_transport_header(payload);
	err = skb_copy_datagram_msg(payload, 0, msg, copied);

	if (msg->msg_name) {
		DECLARE_SOCKADDR(struct sockaddr_nl *, addr, msg->msg_name);

		addr->nl_family	= AF_NETLINK;
		addr->nl_pad	= 0;
		addr->nl_pid	= NETLINK_CB(skb).portid;
		addr->nl_groups	= netlink_group_mask(NETLINK_CB(skb).dst_group);
		msg->msg_namelen = sizeof(*addr);
	}

	if (nlk->flags & NETLINK_F_RECV_PKTINFO)
		netlink_cmsg_recv_pktinfo(msg, skb);
	if (nlk->flags & NETLINK_F_LISTEN_ALL_NSID)
		netlink_cmsg_listen_all_nsid(sk, msg, skb);

	memset(&scm, 0, sizeof(scm));
	scm.creds = *NETLINK_CREDS(skb);

	/* With MSG_TRUNC the caller is told the real datagram length. */
	if (flags & MSG_TRUNC)
		copied = payload->len;

	skb_free_datagram(sk, skb);

	if (nlk->cb_running &&
	    atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf / 2) {
		ret = netlink_dump(sk);
		if (ret) {
			sk->sk_err = -ret;
			sk->sk_error_report(sk);
		}
	}

	scm_recv(sock, msg, &scm, flags);

out:
	netlink_rcv_wake(sk);
	if (err)
		return err;
	return copied;
}

1990
/*
 * sk_data_ready callback installed on kernel sockets by
 * __netlink_kernel_create(); reaching it is treated as a bug.
 */
static void netlink_data_ready(struct sock *sk)
{
	BUG();
}

/*
1996
 *	We export these functions to other modules. They provide a
L
Linus Torvalds 已提交
1997 1998 1999 2000 2001
 *	complete set of kernel non-blocking support for message
 *	queueing.
 */

struct sock *
2002 2003
__netlink_kernel_create(struct net *net, int unit, struct module *module,
			struct netlink_kernel_cfg *cfg)
L
Linus Torvalds 已提交
2004 2005 2006
{
	struct socket *sock;
	struct sock *sk;
2007
	struct netlink_sock *nlk;
2008
	struct listeners *listeners = NULL;
2009 2010
	struct mutex *cb_mutex = cfg ? cfg->cb_mutex : NULL;
	unsigned int groups;
L
Linus Torvalds 已提交
2011

2012
	BUG_ON(!nl_table);
L
Linus Torvalds 已提交
2013

2014
	if (unit < 0 || unit >= MAX_LINKS)
L
Linus Torvalds 已提交
2015 2016 2017 2018
		return NULL;

	if (sock_create_lite(PF_NETLINK, SOCK_DGRAM, unit, &sock))
		return NULL;
2019 2020

	if (__netlink_create(net, sock, cb_mutex, unit, 1) < 0)
2021 2022 2023
		goto out_sock_release_nosk;

	sk = sock->sk;
2024

2025
	if (!cfg || cfg->groups < 32)
2026
		groups = 32;
2027 2028
	else
		groups = cfg->groups;
2029

2030
	listeners = kzalloc(sizeof(*listeners) + NLGRPSZ(groups), GFP_KERNEL);
2031 2032 2033
	if (!listeners)
		goto out_sock_release;

L
Linus Torvalds 已提交
2034
	sk->sk_data_ready = netlink_data_ready;
2035 2036
	if (cfg && cfg->input)
		nlk_sk(sk)->netlink_rcv = cfg->input;
L
Linus Torvalds 已提交
2037

2038
	if (netlink_insert(sk, 0))
2039
		goto out_sock_release;
2040

2041
	nlk = nlk_sk(sk);
2042
	nlk->flags |= NETLINK_F_KERNEL_SOCKET;
2043 2044

	netlink_table_grab();
2045 2046
	if (!nl_table[unit].registered) {
		nl_table[unit].groups = groups;
2047
		rcu_assign_pointer(nl_table[unit].listeners, listeners);
2048 2049
		nl_table[unit].cb_mutex = cb_mutex;
		nl_table[unit].module = module;
2050 2051
		if (cfg) {
			nl_table[unit].bind = cfg->bind;
2052
			nl_table[unit].unbind = cfg->unbind;
2053
			nl_table[unit].flags = cfg->flags;
2054 2055
			if (cfg->compare)
				nl_table[unit].compare = cfg->compare;
2056
		}
2057
		nl_table[unit].registered = 1;
2058 2059
	} else {
		kfree(listeners);
2060
		nl_table[unit].registered++;
2061
	}
2062
	netlink_table_ungrab();
2063 2064
	return sk;

2065
out_sock_release:
2066
	kfree(listeners);
2067
	netlink_kernel_release(sk);
2068 2069 2070
	return NULL;

out_sock_release_nosk:
2071
	sock_release(sock);
2072
	return NULL;
L
Linus Torvalds 已提交
2073
}
2074
EXPORT_SYMBOL(__netlink_kernel_create);
2075 2076 2077 2078

/*
 * Release a kernel netlink socket.  A NULL @sk, or one without an attached
 * struct socket, is ignored.
 */
void
netlink_kernel_release(struct sock *sk)
{
	if (sk && sk->sk_socket)
		sock_release(sk->sk_socket);
}
EXPORT_SYMBOL(netlink_kernel_release);

2086
int __netlink_change_ngroups(struct sock *sk, unsigned int groups)
2087
{
2088
	struct listeners *new, *old;
2089 2090 2091 2092 2093 2094
	struct netlink_table *tbl = &nl_table[sk->sk_protocol];

	if (groups < 32)
		groups = 32;

	if (NLGRPSZ(tbl->groups) < NLGRPSZ(groups)) {
2095 2096
		new = kzalloc(sizeof(*new) + NLGRPSZ(groups), GFP_ATOMIC);
		if (!new)
2097
			return -ENOMEM;
2098
		old = nl_deref_protected(tbl->listeners);
2099 2100 2101
		memcpy(new->masks, old->masks, NLGRPSZ(tbl->groups));
		rcu_assign_pointer(tbl->listeners, new);

2102
		kfree_rcu(old, rcu);
2103 2104 2105
	}
	tbl->groups = groups;

2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126
	return 0;
}

/**
 * netlink_change_ngroups - change number of multicast groups
 * @sk: The kernel netlink socket, as returned by netlink_kernel_create().
 * @groups: The new number of groups.
 *
 * This changes the number of multicast groups that are available
 * on a certain netlink family. Note that it is not possible to
 * change the number of groups to below 32. Also note that it does
 * not implicitly call netlink_clear_multicast_users() when the
 * number of groups is reduced.
 *
 * The netlink table is grabbed for the duration of the change.
 *
 * Return: the result of __netlink_change_ngroups().
 */
int netlink_change_ngroups(struct sock *sk, unsigned int groups)
{
	int ret;

	netlink_table_grab();
	ret = __netlink_change_ngroups(sk, groups);
	netlink_table_ungrab();

	return ret;
}

2132 2133 2134 2135 2136
void __netlink_clear_multicast_users(struct sock *ksk, unsigned int group)
{
	struct sock *sk;
	struct netlink_table *tbl = &nl_table[ksk->sk_protocol];

2137
	sk_for_each_bound(sk, &tbl->mc_list)
2138 2139 2140
		netlink_update_socket_mc(nlk_sk(sk), group, 0);
}

2141
struct nlmsghdr *
2142
__nlmsg_put(struct sk_buff *skb, u32 portid, u32 seq, int type, int len, int flags)
2143 2144
{
	struct nlmsghdr *nlh;
2145
	int size = nlmsg_msg_size(len);
2146

2147
	nlh = skb_put(skb, NLMSG_ALIGN(size));
2148 2149 2150
	nlh->nlmsg_type = type;
	nlh->nlmsg_len = size;
	nlh->nlmsg_flags = flags;
2151
	nlh->nlmsg_pid = portid;
2152 2153
	nlh->nlmsg_seq = seq;
	if (!__builtin_constant_p(size) || NLMSG_ALIGN(size) - size != 0)
2154
		memset(nlmsg_data(nlh) + len, 0, NLMSG_ALIGN(size) - size);
2155 2156 2157 2158
	return nlh;
}
EXPORT_SYMBOL(__nlmsg_put);

L
Linus Torvalds 已提交
2159 2160 2161 2162 2163 2164 2165 2166 2167
/*
 * It looks a bit ugly.
 * It would be better to create kernel thread.
 */

static int netlink_dump(struct sock *sk)
{
	struct netlink_sock *nlk = nlk_sk(sk);
	struct netlink_callback *cb;
2168
	struct sk_buff *skb = NULL;
L
Linus Torvalds 已提交
2169
	struct nlmsghdr *nlh;
2170
	struct module *module;
2171
	int err = -ENOBUFS;
2172
	int alloc_min_size;
2173
	int alloc_size;
L
Linus Torvalds 已提交
2174

2175
	mutex_lock(nlk->cb_mutex);
2176
	if (!nlk->cb_running) {
2177 2178
		err = -EINVAL;
		goto errout_skb;
L
Linus Torvalds 已提交
2179 2180
	}

2181
	if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
2182
		goto errout_skb;
E
Eric Dumazet 已提交
2183 2184 2185 2186 2187 2188

	/* NLMSG_GOODSIZE is small to avoid high order allocations being
	 * required, but it makes sense to _attempt_ a 16K bytes allocation
	 * to reduce number of system calls on dump operations, if user
	 * ever provided a big enough buffer.
	 */
2189 2190 2191 2192 2193
	cb = &nlk->cb;
	alloc_min_size = max_t(int, cb->min_dump_alloc, NLMSG_GOODSIZE);

	if (alloc_min_size < nlk->max_recvmsg_len) {
		alloc_size = nlk->max_recvmsg_len;
2194 2195 2196
		skb = alloc_skb(alloc_size,
				(GFP_KERNEL & ~__GFP_DIRECT_RECLAIM) |
				__GFP_NOWARN | __GFP_NORETRY);
E
Eric Dumazet 已提交
2197
	}
2198 2199
	if (!skb) {
		alloc_size = alloc_min_size;
2200
		skb = alloc_skb(alloc_size, GFP_KERNEL);
2201
	}
2202
	if (!skb)
2203
		goto errout_skb;
2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214

	/* Trim skb to allocated size. User is expected to provide buffer as
	 * large as max(min_dump_alloc, 16KiB (mac_recvmsg_len capped at
	 * netlink_recvmsg())). dump will pack as many smaller messages as
	 * could fit within the allocated skb. skb is typically allocated
	 * with larger space than required (could be as much as near 2x the
	 * requested size with align to next power of 2 approach). Allowing
	 * dump to use the excess space makes it difficult for a user to have a
	 * reasonable static buffer based on the expected largest dump of a
	 * single netdev. The outcome is MSG_TRUNC error.
	 */
2215
	skb_reserve(skb, skb_tailroom(skb) - alloc_size);
2216
	netlink_skb_set_owner_r(skb, sk);
2217

2218 2219
	if (nlk->dump_done_errno > 0)
		nlk->dump_done_errno = cb->dump(skb, cb);
L
Linus Torvalds 已提交
2220

2221 2222
	if (nlk->dump_done_errno > 0 ||
	    skb_tailroom(skb) < nlmsg_total_size(sizeof(nlk->dump_done_errno))) {
2223
		mutex_unlock(nlk->cb_mutex);
2224 2225 2226

		if (sk_filter(sk, skb))
			kfree_skb(skb);
2227 2228
		else
			__netlink_sendskb(sk, skb);
L
Linus Torvalds 已提交
2229 2230 2231
		return 0;
	}

2232 2233 2234
	nlh = nlmsg_put_answer(skb, cb, NLMSG_DONE,
			       sizeof(nlk->dump_done_errno), NLM_F_MULTI);
	if (WARN_ON(!nlh))
2235 2236
		goto errout_skb;

2237 2238
	nl_dump_check_consistent(cb, nlh);

2239 2240
	memcpy(nlmsg_data(nlh), &nlk->dump_done_errno,
	       sizeof(nlk->dump_done_errno));
2241

2242 2243
	if (sk_filter(sk, skb))
		kfree_skb(skb);
2244 2245
	else
		__netlink_sendskb(sk, skb);
L
Linus Torvalds 已提交
2246

2247 2248
	if (cb->done)
		cb->done(cb);
L
Linus Torvalds 已提交
2249

2250
	nlk->cb_running = false;
2251 2252
	module = cb->module;
	skb = cb->skb;
2253
	mutex_unlock(nlk->cb_mutex);
2254 2255
	module_put(module);
	consume_skb(skb);
L
Linus Torvalds 已提交
2256
	return 0;
2257

2258
errout_skb:
2259
	mutex_unlock(nlk->cb_mutex);
2260 2261
	kfree_skb(skb);
	return err;
L
Linus Torvalds 已提交
2262 2263
}

2264 2265 2266
/*
 * Begin a netlink dump in response to request @skb / @nlh received on
 * kernel socket @ssk.
 *
 * The requesting socket is looked up by the portid stored in the skb's
 * control block.  An extra reference on @skb is taken up front; it is
 * stored in the callback state (cb->skb) and released by netlink_dump()
 * when the dump completes, or dropped here on any setup failure.  A
 * reference on @control->module is held for the lifetime of the dump.
 *
 * Returns:
 *   -EINTR          the dump was started; tells the caller not to send an
 *                   ACK even if one was requested
 *   -ECONNREFUSED   no socket is bound to the requesting portid
 *   -EBUSY          a dump is already running on that socket
 *   -EPROTONOSUPPORT the module reference could not be obtained
 *   other non-zero  error returned by @control->start() or netlink_dump()
 */
int __netlink_dump_start(struct sock *ssk, struct sk_buff *skb,
			 const struct nlmsghdr *nlh,
			 struct netlink_dump_control *control)
{
	struct netlink_callback *cb;
	struct sock *sk;
	struct netlink_sock *nlk;
	int ret;

	refcount_inc(&skb->users);

	sk = netlink_lookup(sock_net(ssk), ssk->sk_protocol, NETLINK_CB(skb).portid);
	if (sk == NULL) {
		ret = -ECONNREFUSED;
		goto error_free;
	}

	nlk = nlk_sk(sk);
	mutex_lock(nlk->cb_mutex);
	/* A dump is in progress... */
	if (nlk->cb_running) {
		ret = -EBUSY;
		goto error_unlock;
	}
	/* add reference of module which cb->dump belongs to */
	if (!try_module_get(control->module)) {
		ret = -EPROTONOSUPPORT;
		goto error_unlock;
	}

	cb = &nlk->cb;
	memset(cb, 0, sizeof(*cb));
	cb->start = control->start;
	cb->dump = control->dump;
	cb->done = control->done;
	cb->nlh = nlh;
	cb->data = control->data;
	cb->module = control->module;
	cb->min_dump_alloc = control->min_dump_alloc;
	cb->skb = skb;

	if (cb->start) {
		ret = cb->start(cb);
		/* The module reference was already taken above, so it must
		 * be dropped again when start() refuses the dump.
		 */
		if (ret)
			goto error_put;
	}

	nlk->cb_running = true;
	nlk->dump_done_errno = INT_MAX;

	mutex_unlock(nlk->cb_mutex);

	ret = netlink_dump(sk);

	sock_put(sk);

	if (ret)
		return ret;

	/* We successfully started a dump, by returning -EINTR we
	 * signal not to send ACK even if it was requested.
	 */
	return -EINTR;

error_put:
	module_put(control->module);
error_unlock:
	/* Unlock before dropping the socket reference: nlk lives inside sk,
	 * so it must not be dereferenced once our reference is gone.
	 */
	mutex_unlock(nlk->cb_mutex);
	sock_put(sk);
error_free:
	kfree_skb(skb);
	return ret;
}
EXPORT_SYMBOL(__netlink_dump_start);
L
Linus Torvalds 已提交
2336

J
Johannes Berg 已提交
2337 2338
void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err,
		 const struct netlink_ext_ack *extack)
L
Linus Torvalds 已提交
2339 2340 2341 2342
{
	struct sk_buff *skb;
	struct nlmsghdr *rep;
	struct nlmsgerr *errmsg;
2343
	size_t payload = sizeof(*errmsg);
J
Johannes Berg 已提交
2344
	size_t tlvlen = 0;
2345
	struct netlink_sock *nlk = nlk_sk(NETLINK_CB(in_skb).sk);
J
Johannes Berg 已提交
2346
	unsigned int flags = 0;
2347
	bool nlk_has_extack = nlk->flags & NETLINK_F_EXT_ACK;
L
Linus Torvalds 已提交
2348

2349
	/* Error messages get the original request appened, unless the user
J
Johannes Berg 已提交
2350 2351
	 * requests to cap the error message, and get extra error data if
	 * requested.
2352
	 */
2353 2354 2355
	if (nlk_has_extack && extack && extack->_msg)
		tlvlen += nla_total_size(strlen(extack->_msg) + 1);

J
Johannes Berg 已提交
2356 2357 2358 2359 2360
	if (err) {
		if (!(nlk->flags & NETLINK_F_CAP_ACK))
			payload += nlmsg_len(nlh);
		else
			flags |= NLM_F_CAPPED;
2361 2362
		if (nlk_has_extack && extack && extack->bad_attr)
			tlvlen += nla_total_size(sizeof(u32));
J
Johannes Berg 已提交
2363 2364
	} else {
		flags |= NLM_F_CAPPED;
2365

2366
		if (nlk_has_extack && extack && extack->cookie_len)
2367
			tlvlen += nla_total_size(extack->cookie_len);
J
Johannes Berg 已提交
2368
	}
L
Linus Torvalds 已提交
2369

J
Johannes Berg 已提交
2370 2371 2372 2373
	if (tlvlen)
		flags |= NLM_F_ACK_TLVS;

	skb = nlmsg_new(payload + tlvlen, GFP_KERNEL);
L
Linus Torvalds 已提交
2374
	if (!skb) {
2375 2376
		NETLINK_CB(in_skb).sk->sk_err = ENOBUFS;
		NETLINK_CB(in_skb).sk->sk_error_report(NETLINK_CB(in_skb).sk);
L
Linus Torvalds 已提交
2377 2378 2379
		return;
	}

2380
	rep = __nlmsg_put(skb, NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
J
Johannes Berg 已提交
2381
			  NLMSG_ERROR, payload, flags);
2382
	errmsg = nlmsg_data(rep);
L
Linus Torvalds 已提交
2383
	errmsg->error = err;
2384
	memcpy(&errmsg->msg, nlh, payload > sizeof(*errmsg) ? nlh->nlmsg_len : sizeof(*nlh));
J
Johannes Berg 已提交
2385

2386
	if (nlk_has_extack && extack) {
2387 2388 2389 2390
		if (extack->_msg) {
			WARN_ON(nla_put_string(skb, NLMSGERR_ATTR_MSG,
					       extack->_msg));
		}
2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404
		if (err) {
			if (extack->bad_attr &&
			    !WARN_ON((u8 *)extack->bad_attr < in_skb->data ||
				     (u8 *)extack->bad_attr >= in_skb->data +
							       in_skb->len))
				WARN_ON(nla_put_u32(skb, NLMSGERR_ATTR_OFFS,
						    (u8 *)extack->bad_attr -
						    in_skb->data));
		} else {
			if (extack->cookie_len)
				WARN_ON(nla_put(skb, NLMSGERR_ATTR_COOKIE,
						extack->cookie_len,
						extack->cookie));
		}
J
Johannes Berg 已提交
2405 2406 2407 2408
	}

	nlmsg_end(skb, rep);

2409
	netlink_unicast(in_skb->sk, skb, NETLINK_CB(in_skb).portid, MSG_DONTWAIT);
L
Linus Torvalds 已提交
2410
}
2411
EXPORT_SYMBOL(netlink_ack);
L
Linus Torvalds 已提交
2412

2413
int netlink_rcv_skb(struct sk_buff *skb, int (*cb)(struct sk_buff *,
J
Johannes Berg 已提交
2414 2415
						   struct nlmsghdr *,
						   struct netlink_ext_ack *))
2416
{
J
Johannes Berg 已提交
2417
	struct netlink_ext_ack extack = {};
2418 2419 2420 2421
	struct nlmsghdr *nlh;
	int err;

	while (skb->len >= nlmsg_total_size(0)) {
2422 2423
		int msglen;

2424
		nlh = nlmsg_hdr(skb);
2425
		err = 0;
2426

2427
		if (nlh->nlmsg_len < NLMSG_HDRLEN || skb->len < nlh->nlmsg_len)
2428 2429
			return 0;

2430 2431
		/* Only requests are handled by the kernel */
		if (!(nlh->nlmsg_flags & NLM_F_REQUEST))
2432
			goto ack;
2433 2434 2435

		/* Skip control messages */
		if (nlh->nlmsg_type < NLMSG_MIN_TYPE)
2436
			goto ack;
2437

J
Johannes Berg 已提交
2438
		err = cb(skb, nlh, &extack);
2439 2440 2441 2442
		if (err == -EINTR)
			goto skip;

ack:
2443
		if (nlh->nlmsg_flags & NLM_F_ACK || err)
J
Johannes Berg 已提交
2444
			netlink_ack(skb, nlh, err, &extack);
2445

2446
skip:
2447
		msglen = NLMSG_ALIGN(nlh->nlmsg_len);
2448 2449 2450
		if (msglen > skb->len)
			msglen = skb->len;
		skb_pull(skb, msglen);
2451 2452 2453 2454
	}

	return 0;
}
2455
EXPORT_SYMBOL(netlink_rcv_skb);
2456

2457 2458 2459 2460
/**
 * nlmsg_notify - send a notification netlink message
 * @sk: netlink socket to use
 * @skb: notification message
2461
 * @portid: destination netlink portid for reports or 0
2462 2463 2464 2465
 * @group: destination multicast group or 0
 * @report: 1 to report back, 0 to disable
 * @flags: allocation flags
 */
2466
int nlmsg_notify(struct sock *sk, struct sk_buff *skb, u32 portid,
2467 2468 2469 2470 2471
		 unsigned int group, int report, gfp_t flags)
{
	int err = 0;

	if (group) {
2472
		int exclude_portid = 0;
2473 2474

		if (report) {
2475
			refcount_inc(&skb->users);
2476
			exclude_portid = portid;
2477 2478
		}

2479 2480
		/* errors reported via destination sk->sk_err, but propagate
		 * delivery errors if NETLINK_BROADCAST_ERROR flag is set */
2481
		err = nlmsg_multicast(sk, skb, exclude_portid, group, flags);
2482 2483
	}

2484 2485 2486
	if (report) {
		int err2;

2487
		err2 = nlmsg_unicast(sk, skb, portid);
2488 2489 2490
		if (!err || err == -ESRCH)
			err = err2;
	}
2491 2492 2493

	return err;
}
2494
EXPORT_SYMBOL(nlmsg_notify);
2495

L
Linus Torvalds 已提交
2496 2497
#ifdef CONFIG_PROC_FS
struct nl_seq_iter {
2498
	struct seq_net_private p;
2499
	struct rhashtable_iter hti;
L
Linus Torvalds 已提交
2500 2501 2502
	int link;
};

2503
static int netlink_walk_start(struct nl_seq_iter *iter)
L
Linus Torvalds 已提交
2504
{
2505
	int err;
L
Linus Torvalds 已提交
2506

2507 2508
	err = rhashtable_walk_init(&nl_table[iter->link].hash, &iter->hti,
				   GFP_KERNEL);
2509 2510 2511
	if (err) {
		iter->link = MAX_LINKS;
		return err;
L
Linus Torvalds 已提交
2512
	}
2513

2514 2515 2516
	rhashtable_walk_start(&iter->hti);

	return 0;
L
Linus Torvalds 已提交
2517 2518
}

2519
static void netlink_walk_stop(struct nl_seq_iter *iter)
L
Linus Torvalds 已提交
2520
{
2521 2522
	rhashtable_walk_stop(&iter->hti);
	rhashtable_walk_exit(&iter->hti);
L
Linus Torvalds 已提交
2523 2524
}

2525
static void *__netlink_seq_next(struct seq_file *seq)
L
Linus Torvalds 已提交
2526
{
2527
	struct nl_seq_iter *iter = seq->private;
2528
	struct netlink_sock *nlk;
L
Linus Torvalds 已提交
2529

2530 2531 2532
	do {
		for (;;) {
			int err;
L
Linus Torvalds 已提交
2533

2534
			nlk = rhashtable_walk_next(&iter->hti);
2535

2536 2537 2538
			if (IS_ERR(nlk)) {
				if (PTR_ERR(nlk) == -EAGAIN)
					continue;
2539

2540 2541
				return nlk;
			}
L
Linus Torvalds 已提交
2542

2543 2544
			if (nlk)
				break;
L
Linus Torvalds 已提交
2545

2546 2547 2548
			netlink_walk_stop(iter);
			if (++iter->link >= MAX_LINKS)
				return NULL;
2549

2550 2551 2552
			err = netlink_walk_start(iter);
			if (err)
				return ERR_PTR(err);
L
Linus Torvalds 已提交
2553
		}
2554
	} while (sock_net(&nlk->sk) != seq_file_net(seq));
L
Linus Torvalds 已提交
2555

2556 2557
	return nlk;
}
L
Linus Torvalds 已提交
2558

2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581
/*
 * seq_file ->start: restart the walk from protocol 0 and skip forward to
 * position *@posp.  Position 0 yields SEQ_START_TOKEN (the header line).
 *
 * Returns the object at that position, NULL if the position is past the
 * end, or an ERR_PTR() if the walk could not be started or advanced.
 */
static void *netlink_seq_start(struct seq_file *seq, loff_t *posp)
{
	struct nl_seq_iter *iter = seq->private;
	void *cur = SEQ_START_TOKEN;
	loff_t remaining;
	int ret;

	iter->link = 0;

	ret = netlink_walk_start(iter);
	if (ret)
		return ERR_PTR(ret);

	remaining = *posp;
	while (remaining && cur && !IS_ERR(cur)) {
		cur = __netlink_seq_next(seq);
		remaining--;
	}

	return cur;
}

/* seq_file ->next: bump the position and return the next socket. */
static void *netlink_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	(*pos)++;

	return __netlink_seq_next(seq);
}

static void netlink_seq_stop(struct seq_file *seq, void *v)
{
2586 2587 2588 2589 2590 2591
	struct nl_seq_iter *iter = seq->private;

	if (iter->link >= MAX_LINKS)
		return;

	netlink_walk_stop(iter);
L
Linus Torvalds 已提交
2592 2593 2594 2595 2596
}


static int netlink_seq_show(struct seq_file *seq, void *v)
{
E
Eric Dumazet 已提交
2597
	if (v == SEQ_START_TOKEN) {
L
Linus Torvalds 已提交
2598 2599
		seq_puts(seq,
			 "sk       Eth Pid    Groups   "
2600
			 "Rmem     Wmem     Dump     Locks     Drops     Inode\n");
E
Eric Dumazet 已提交
2601
	} else {
L
Linus Torvalds 已提交
2602 2603 2604
		struct sock *s = v;
		struct netlink_sock *nlk = nlk_sk(s);

2605
		seq_printf(seq, "%pK %-3d %-6u %08x %-8d %-8d %d %-8d %-8d %-8lu\n",
L
Linus Torvalds 已提交
2606 2607
			   s,
			   s->sk_protocol,
2608
			   nlk->portid,
2609
			   nlk->groups ? (u32)nlk->groups[0] : 0,
2610 2611
			   sk_rmem_alloc_get(s),
			   sk_wmem_alloc_get(s),
2612
			   nlk->cb_running,
2613
			   refcount_read(&s->sk_refcnt),
2614 2615
			   atomic_read(&s->sk_drops),
			   sock_i_ino(s)
L
Linus Torvalds 已提交
2616 2617 2618 2619 2620 2621
			);

	}
	return 0;
}

2622
static const struct seq_operations netlink_seq_ops = {
L
Linus Torvalds 已提交
2623 2624 2625 2626 2627 2628 2629 2630 2631
	.start  = netlink_seq_start,
	.next   = netlink_seq_next,
	.stop   = netlink_seq_stop,
	.show   = netlink_seq_show,
};


static int netlink_seq_open(struct inode *inode, struct file *file)
{
2632 2633
	return seq_open_net(inode, file, &netlink_seq_ops,
				sizeof(struct nl_seq_iter));
2634 2635
}

2636
static const struct file_operations netlink_seq_fops = {
L
Linus Torvalds 已提交
2637 2638 2639 2640
	.owner		= THIS_MODULE,
	.open		= netlink_seq_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
2641
	.release	= seq_release_net,
L
Linus Torvalds 已提交
2642 2643 2644 2645 2646 2647
};

#endif

int netlink_register_notifier(struct notifier_block *nb)
{
W
WANG Cong 已提交
2648
	return blocking_notifier_chain_register(&netlink_chain, nb);
L
Linus Torvalds 已提交
2649
}
2650
EXPORT_SYMBOL(netlink_register_notifier);
L
Linus Torvalds 已提交
2651 2652 2653

int netlink_unregister_notifier(struct notifier_block *nb)
{
W
WANG Cong 已提交
2654
	return blocking_notifier_chain_unregister(&netlink_chain, nb);
L
Linus Torvalds 已提交
2655
}
2656
EXPORT_SYMBOL(netlink_unregister_notifier);
2657

2658
static const struct proto_ops netlink_ops = {
L
Linus Torvalds 已提交
2659 2660 2661 2662 2663 2664 2665 2666
	.family =	PF_NETLINK,
	.owner =	THIS_MODULE,
	.release =	netlink_release,
	.bind =		netlink_bind,
	.connect =	netlink_connect,
	.socketpair =	sock_no_socketpair,
	.accept =	sock_no_accept,
	.getname =	netlink_getname,
2667
	.poll =		datagram_poll,
2668
	.ioctl =	netlink_ioctl,
L
Linus Torvalds 已提交
2669 2670
	.listen =	sock_no_listen,
	.shutdown =	sock_no_shutdown,
2671 2672
	.setsockopt =	netlink_setsockopt,
	.getsockopt =	netlink_getsockopt,
L
Linus Torvalds 已提交
2673 2674
	.sendmsg =	netlink_sendmsg,
	.recvmsg =	netlink_recvmsg,
2675
	.mmap =		sock_no_mmap,
L
Linus Torvalds 已提交
2676 2677 2678
	.sendpage =	sock_no_sendpage,
};

2679
static const struct net_proto_family netlink_family_ops = {
L
Linus Torvalds 已提交
2680 2681 2682 2683 2684
	.family = PF_NETLINK,
	.create = netlink_create,
	.owner	= THIS_MODULE,	/* for consistency 8) */
};

2685
/*
 * Per-namespace init: with CONFIG_PROC_FS, create the "netlink" entry
 * under the namespace's proc_net directory.
 *
 * Returns 0 on success, -ENOMEM if the proc entry cannot be created.
 */
static int __net_init netlink_net_init(struct net *net)
{
#ifdef CONFIG_PROC_FS
	struct proc_dir_entry *entry;

	entry = proc_create("netlink", 0, net->proc_net, &netlink_seq_fops);
	if (!entry)
		return -ENOMEM;
#endif
	return 0;
}

2694
/*
 * Per-namespace exit: with CONFIG_PROC_FS, remove the "netlink" entry
 * from the namespace's proc_net directory.
 */
static void __net_exit netlink_net_exit(struct net *net)
{
#ifdef CONFIG_PROC_FS
	remove_proc_entry("netlink", net->proc_net);
#endif
}

2701 2702
/*
 * Register the NETLINK_USERSOCK table entry at boot: 32 multicast groups,
 * a zeroed listeners bitmap, owned by this module, and flagged
 * NL_CFG_F_NONROOT_SEND.  Panics if the bitmap cannot be allocated.
 */
static void __init netlink_add_usersock_entry(void)
{
	const int groups = 32;
	struct netlink_table *usersock;
	struct listeners *listeners;

	listeners = kzalloc(sizeof(*listeners) + NLGRPSZ(groups), GFP_KERNEL);
	if (!listeners)
		panic("netlink_add_usersock_entry: Cannot allocate listeners\n");

	netlink_table_grab();

	usersock = &nl_table[NETLINK_USERSOCK];
	usersock->groups = groups;
	rcu_assign_pointer(usersock->listeners, listeners);
	usersock->module = THIS_MODULE;
	usersock->registered = 1;
	usersock->flags = NL_CFG_F_NONROOT_SEND;

	netlink_table_ungrab();
}

2721
/* Per-network-namespace setup and teardown hooks for netlink. */
static struct pernet_operations __net_initdata netlink_net_ops = {
	.init	= netlink_net_init,
	.exit	= netlink_net_exit,
};

2726
/*
 * rhashtable object hash function: hashes the (network namespace, portid)
 * pair of the netlink socket @data with jhash2().  @len is unused.
 */
static inline u32 netlink_hash(const void *data, u32 len, u32 seed)
{
	const struct netlink_sock *nlk = data;
	struct netlink_compare_arg key;
	u32 words = netlink_compare_arg_len / sizeof(u32);

	netlink_compare_arg_init(&key, sock_net(&nlk->sk), nlk->portid);

	return jhash2((u32 *)&key, words, seed);
}

static const struct rhashtable_params netlink_rhashtable_params = {
	.head_offset = offsetof(struct netlink_sock, node),
	.key_len = netlink_compare_arg_len,
	.obj_hashfn = netlink_hash,
	.obj_cmpfn = netlink_compare,
2740
	.automatic_shrinking = true,
2741 2742
};

L
Linus Torvalds 已提交
2743 2744 2745 2746 2747 2748 2749 2750
/*
 * Boot-time initialisation of the netlink protocol family.
 *
 * Registers the protocol, allocates nl_table with one hash table per
 * protocol (MAX_LINKS entries), adds the NETLINK_USERSOCK entry, registers
 * the socket family and the per-namespace operations, and finally
 * initialises rtnetlink.
 *
 * Returns the proto_register() error if that step fails, 0 otherwise.
 * Failure to allocate nl_table or to initialise one of its hash tables
 * ends in panic().
 */
static int __init netlink_proto_init(void)
{
	int i;
	int err = proto_register(&netlink_proto, 0);

	if (err != 0)
		goto out;

	/* netlink's per-skb control block must fit in skb->cb. */
	BUILD_BUG_ON(sizeof(struct netlink_skb_parms) > FIELD_SIZEOF(struct sk_buff, cb));

	nl_table = kcalloc(MAX_LINKS, sizeof(*nl_table), GFP_KERNEL);
	if (!nl_table)
		goto panic;

	for (i = 0; i < MAX_LINKS; i++) {
		if (rhashtable_init(&nl_table[i].hash,
				    &netlink_rhashtable_params) < 0) {
			/* Unwind every table initialised so far, including
			 * the one at index 0.
			 */
			while (--i >= 0)
				rhashtable_destroy(&nl_table[i].hash);
			kfree(nl_table);
			goto panic;
		}
	}

	netlink_add_usersock_entry();

	sock_register(&netlink_family_ops);
	register_pernet_subsys(&netlink_net_ops);
	register_pernet_subsys(&netlink_tap_net_ops);
	/* The netlink device handler may be needed early. */
	rtnetlink_init();
out:
	return err;
panic:
	panic("netlink_init: Cannot allocate nl_table\n");
}

core_initcall(netlink_proto_init);