af_netlink.c 75.6 KB
Newer Older
L
Linus Torvalds 已提交
1 2 3
/*
 * NETLINK      Kernel-user communication protocol.
 *
4
 * 		Authors:	Alan Cox <alan@lxorguk.ukuu.org.uk>
L
Linus Torvalds 已提交
5
 * 				Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
6
 * 				Patrick McHardy <kaber@trash.net>
L
Linus Torvalds 已提交
7 8 9 10 11
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
12
 *
L
Linus Torvalds 已提交
13 14 15 16
 * Tue Jun 26 14:36:48 MEST 2001 Herbert "herp" Rosmanith
 *                               added netlink_proto_exit
 * Tue Jan 22 18:32:44 BRST 2002 Arnaldo C. de Melo <acme@conectiva.com.br>
 * 				 use nlk_sk, as sk->protinfo is on a diet 8)
17 18 19 20 21 22
 * Fri Jul 22 19:51:12 MEST 2005 Harald Welte <laforge@gnumonks.org>
 * 				 - inc module use count of module that owns
 * 				   the kernel socket in case userspace opens
 * 				   socket of same protocol
 * 				 - remove all module support, since netlink is
 * 				   mandatory if CONFIG_NET=y these days
L
Linus Torvalds 已提交
23 24 25 26
 */

#include <linux/module.h>

27
#include <linux/capability.h>
L
Linus Torvalds 已提交
28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/stat.h>
#include <linux/socket.h>
#include <linux/un.h>
#include <linux/fcntl.h>
#include <linux/termios.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/fs.h>
#include <linux/slab.h>
#include <asm/uaccess.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/rtnetlink.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/notifier.h>
#include <linux/security.h>
#include <linux/jhash.h>
#include <linux/jiffies.h>
#include <linux/random.h>
#include <linux/bitops.h>
#include <linux/mm.h>
#include <linux/types.h>
A
Andrew Morton 已提交
57
#include <linux/audit.h>
58
#include <linux/mutex.h>
59
#include <linux/vmalloc.h>
60
#include <linux/if_arp.h>
61
#include <linux/rhashtable.h>
62
#include <asm/cacheflush.h>
63
#include <linux/hash.h>
64
#include <linux/genetlink.h>
A
Andrew Morton 已提交
65

66
#include <net/net_namespace.h>
L
Linus Torvalds 已提交
67 68
#include <net/sock.h>
#include <net/scm.h>
69
#include <net/netlink.h>
L
Linus Torvalds 已提交
70

71
#include "af_netlink.h"
L
Linus Torvalds 已提交
72

73 74 75
struct listeners {
	struct rcu_head		rcu;
	unsigned long		masks[0];
76 77
};

78
/* state bits */
79
#define NETLINK_S_CONGESTED		0x0
80 81

/* flags */
82 83 84 85
#define NETLINK_F_KERNEL_SOCKET		0x1
#define NETLINK_F_RECV_PKTINFO		0x2
#define NETLINK_F_BROADCAST_SEND_ERROR	0x4
#define NETLINK_F_RECV_NO_ENOBUFS	0x8
86
#define NETLINK_F_LISTEN_ALL_NSID	0x10
87

88
static inline int netlink_is_kernel(struct sock *sk)
89
{
90
	return nlk_sk(sk)->flags & NETLINK_F_KERNEL_SOCKET;
91 92
}

93
struct netlink_table *nl_table __read_mostly;
94
EXPORT_SYMBOL_GPL(nl_table);
L
Linus Torvalds 已提交
95 96 97 98

static DECLARE_WAIT_QUEUE_HEAD(nl_table_wait);

static int netlink_dump(struct sock *sk);
99
static void netlink_skb_destructor(struct sk_buff *skb);
L
Linus Torvalds 已提交
100

101
/* nl_table locking explained:
102
 * Lookup and traversal are protected with an RCU read-side lock. Insertion
Y
Ying Xue 已提交
103
 * and removal are protected with per bucket lock while using RCU list
104 105 106 107
 * modification primitives and may run in parallel to RCU protected lookups.
 * Destruction of the Netlink socket may only occur *after* nl_table_lock has
 * been acquired * either during or after the socket has been removed from
 * the list and after an RCU grace period.
108
 */
109 110
DEFINE_RWLOCK(nl_table_lock);
EXPORT_SYMBOL_GPL(nl_table_lock);
L
Linus Torvalds 已提交
111 112
static atomic_t nl_table_users = ATOMIC_INIT(0);

113 114
#define nl_deref_protected(X) rcu_dereference_protected(X, lockdep_is_held(&nl_table_lock));

115
static ATOMIC_NOTIFIER_HEAD(netlink_chain);
L
Linus Torvalds 已提交
116

117 118 119
static DEFINE_SPINLOCK(netlink_tap_lock);
static struct list_head netlink_tap_all __read_mostly;

120 121
static const struct rhashtable_params netlink_rhashtable_params;

122
static inline u32 netlink_group_mask(u32 group)
123 124 125 126
{
	return group ? 1 << (group - 1) : 0;
}

127 128 129 130 131 132 133 134 135
int netlink_add_tap(struct netlink_tap *nt)
{
	if (unlikely(nt->dev->type != ARPHRD_NETLINK))
		return -EINVAL;

	spin_lock(&netlink_tap_lock);
	list_add_rcu(&nt->list, &netlink_tap_all);
	spin_unlock(&netlink_tap_lock);

136
	__module_get(nt->module);
137 138 139 140 141

	return 0;
}
EXPORT_SYMBOL_GPL(netlink_add_tap);

142
static int __netlink_remove_tap(struct netlink_tap *nt)
143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160
{
	bool found = false;
	struct netlink_tap *tmp;

	spin_lock(&netlink_tap_lock);

	list_for_each_entry(tmp, &netlink_tap_all, list) {
		if (nt == tmp) {
			list_del_rcu(&nt->list);
			found = true;
			goto out;
		}
	}

	pr_warn("__netlink_remove_tap: %p not found\n", nt);
out:
	spin_unlock(&netlink_tap_lock);

161
	if (found)
162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177
		module_put(nt->module);

	return found ? 0 : -ENODEV;
}

int netlink_remove_tap(struct netlink_tap *nt)
{
	int ret;

	ret = __netlink_remove_tap(nt);
	synchronize_net();

	return ret;
}
EXPORT_SYMBOL_GPL(netlink_remove_tap);

178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193
static bool netlink_filter_tap(const struct sk_buff *skb)
{
	struct sock *sk = skb->sk;

	/* We take the more conservative approach and
	 * whitelist socket protocols that may pass.
	 */
	switch (sk->sk_protocol) {
	case NETLINK_ROUTE:
	case NETLINK_USERSOCK:
	case NETLINK_SOCK_DIAG:
	case NETLINK_NFLOG:
	case NETLINK_XFRM:
	case NETLINK_FIB_LOOKUP:
	case NETLINK_NETFILTER:
	case NETLINK_GENERIC:
V
Varka Bhadram 已提交
194
		return true;
195 196
	}

V
Varka Bhadram 已提交
197
	return false;
198 199
}

200 201 202 203
static int __netlink_deliver_tap_skb(struct sk_buff *skb,
				     struct net_device *dev)
{
	struct sk_buff *nskb;
204
	struct sock *sk = skb->sk;
205 206 207 208 209 210
	int ret = -ENOMEM;

	dev_hold(dev);
	nskb = skb_clone(skb, GFP_ATOMIC);
	if (nskb) {
		nskb->dev = dev;
211
		nskb->protocol = htons((u16) sk->sk_protocol);
212 213
		nskb->pkt_type = netlink_is_kernel(sk) ?
				 PACKET_KERNEL : PACKET_USER;
214
		skb_reset_network_header(nskb);
215 216 217 218 219 220 221 222 223 224 225 226 227 228
		ret = dev_queue_xmit(nskb);
		if (unlikely(ret > 0))
			ret = net_xmit_errno(ret);
	}

	dev_put(dev);
	return ret;
}

static void __netlink_deliver_tap(struct sk_buff *skb)
{
	int ret;
	struct netlink_tap *tmp;

229 230 231
	if (!netlink_filter_tap(skb))
		return;

232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248
	list_for_each_entry_rcu(tmp, &netlink_tap_all, list) {
		ret = __netlink_deliver_tap_skb(skb, tmp->dev);
		if (unlikely(ret))
			break;
	}
}

static void netlink_deliver_tap(struct sk_buff *skb)
{
	rcu_read_lock();

	if (unlikely(!list_empty(&netlink_tap_all)))
		__netlink_deliver_tap(skb);

	rcu_read_unlock();
}

249 250 251 252 253 254 255
static void netlink_deliver_tap_kernel(struct sock *dst, struct sock *src,
				       struct sk_buff *skb)
{
	if (!(netlink_is_kernel(dst) && netlink_is_kernel(src)))
		netlink_deliver_tap(skb);
}

256 257 258 259
static void netlink_overrun(struct sock *sk)
{
	struct netlink_sock *nlk = nlk_sk(sk);

260 261 262
	if (!(nlk->flags & NETLINK_F_RECV_NO_ENOBUFS)) {
		if (!test_and_set_bit(NETLINK_S_CONGESTED,
				      &nlk_sk(sk)->state)) {
263 264 265 266 267 268 269 270 271 272 273 274
			sk->sk_err = ENOBUFS;
			sk->sk_error_report(sk);
		}
	}
	atomic_inc(&sk->sk_drops);
}

static void netlink_rcv_wake(struct sock *sk)
{
	struct netlink_sock *nlk = nlk_sk(sk);

	if (skb_queue_empty(&sk->sk_receive_queue))
275 276
		clear_bit(NETLINK_S_CONGESTED, &nlk->state);
	if (!test_bit(NETLINK_S_CONGESTED, &nlk->state))
277 278 279
		wake_up_interruptible(&nlk->wait);
}

280
#ifdef CONFIG_NETLINK_MMAP
281 282 283 284 285
static bool netlink_skb_is_mmaped(const struct sk_buff *skb)
{
	return NETLINK_CB(skb).flags & NETLINK_SKB_MMAPED;
}

286 287 288 289 290
static bool netlink_rx_is_mmaped(struct sock *sk)
{
	return nlk_sk(sk)->rx_ring.pg_vec != NULL;
}

291 292 293 294 295
static bool netlink_tx_is_mmaped(struct sock *sk)
{
	return nlk_sk(sk)->tx_ring.pg_vec != NULL;
}

296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341
static __pure struct page *pgvec_to_page(const void *addr)
{
	if (is_vmalloc_addr(addr))
		return vmalloc_to_page(addr);
	else
		return virt_to_page(addr);
}

static void free_pg_vec(void **pg_vec, unsigned int order, unsigned int len)
{
	unsigned int i;

	for (i = 0; i < len; i++) {
		if (pg_vec[i] != NULL) {
			if (is_vmalloc_addr(pg_vec[i]))
				vfree(pg_vec[i]);
			else
				free_pages((unsigned long)pg_vec[i], order);
		}
	}
	kfree(pg_vec);
}

static void *alloc_one_pg_vec_page(unsigned long order)
{
	void *buffer;
	gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP | __GFP_ZERO |
			  __GFP_NOWARN | __GFP_NORETRY;

	buffer = (void *)__get_free_pages(gfp_flags, order);
	if (buffer != NULL)
		return buffer;

	buffer = vzalloc((1 << order) * PAGE_SIZE);
	if (buffer != NULL)
		return buffer;

	gfp_flags &= ~__GFP_NORETRY;
	return (void *)__get_free_pages(gfp_flags, order);
}

static void **alloc_pg_vec(struct netlink_sock *nlk,
			   struct nl_mmap_req *req, unsigned int order)
{
	unsigned int block_nr = req->nm_block_nr;
	unsigned int i;
342
	void **pg_vec;
343 344 345 346 347 348

	pg_vec = kcalloc(block_nr, sizeof(void *), GFP_KERNEL);
	if (pg_vec == NULL)
		return NULL;

	for (i = 0; i < block_nr; i++) {
349
		pg_vec[i] = alloc_one_pg_vec_page(order);
350 351 352 353 354 355 356 357 358 359
		if (pg_vec[i] == NULL)
			goto err1;
	}

	return pg_vec;
err1:
	free_pg_vec(pg_vec, order, block_nr);
	return NULL;
}

360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391

static void
__netlink_set_ring(struct sock *sk, struct nl_mmap_req *req, bool tx_ring, void **pg_vec,
		   unsigned int order)
{
	struct netlink_sock *nlk = nlk_sk(sk);
	struct sk_buff_head *queue;
	struct netlink_ring *ring;

	queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;
	ring  = tx_ring ? &nlk->tx_ring : &nlk->rx_ring;

	spin_lock_bh(&queue->lock);

	ring->frame_max		= req->nm_frame_nr - 1;
	ring->head		= 0;
	ring->frame_size	= req->nm_frame_size;
	ring->pg_vec_pages	= req->nm_block_size / PAGE_SIZE;

	swap(ring->pg_vec_len, req->nm_block_nr);
	swap(ring->pg_vec_order, order);
	swap(ring->pg_vec, pg_vec);

	__skb_queue_purge(queue);
	spin_unlock_bh(&queue->lock);

	WARN_ON(atomic_read(&nlk->mapped));

	if (pg_vec)
		free_pg_vec(pg_vec, order, req->nm_block_nr);
}

392
static int netlink_set_ring(struct sock *sk, struct nl_mmap_req *req,
393
			    bool tx_ring)
394 395 396 397 398 399 400 401
{
	struct netlink_sock *nlk = nlk_sk(sk);
	struct netlink_ring *ring;
	void **pg_vec = NULL;
	unsigned int order = 0;

	ring  = tx_ring ? &nlk->tx_ring : &nlk->rx_ring;

402 403 404 405
	if (atomic_read(&nlk->mapped))
		return -EBUSY;
	if (atomic_read(&ring->pending))
		return -EBUSY;
406 407 408 409 410 411 412

	if (req->nm_block_nr) {
		if (ring->pg_vec != NULL)
			return -EBUSY;

		if ((int)req->nm_block_size <= 0)
			return -EINVAL;
T
Tobias Klauser 已提交
413
		if (!PAGE_ALIGNED(req->nm_block_size))
414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437
			return -EINVAL;
		if (req->nm_frame_size < NL_MMAP_HDRLEN)
			return -EINVAL;
		if (!IS_ALIGNED(req->nm_frame_size, NL_MMAP_MSG_ALIGNMENT))
			return -EINVAL;

		ring->frames_per_block = req->nm_block_size /
					 req->nm_frame_size;
		if (ring->frames_per_block == 0)
			return -EINVAL;
		if (ring->frames_per_block * req->nm_block_nr !=
		    req->nm_frame_nr)
			return -EINVAL;

		order = get_order(req->nm_block_size);
		pg_vec = alloc_pg_vec(nlk, req, order);
		if (pg_vec == NULL)
			return -ENOMEM;
	} else {
		if (req->nm_frame_nr)
			return -EINVAL;
	}

	mutex_lock(&nlk->pg_vec_lock);
438 439 440 441
	if (atomic_read(&nlk->mapped) == 0) {
		__netlink_set_ring(sk, req, tx_ring, pg_vec, order);
		mutex_unlock(&nlk->pg_vec_lock);
		return 0;
442
	}
443

444 445 446 447
	mutex_unlock(&nlk->pg_vec_lock);

	if (pg_vec)
		free_pg_vec(pg_vec, order, req->nm_block_nr);
448 449

	return -EBUSY;
450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531
}

static void netlink_mm_open(struct vm_area_struct *vma)
{
	struct file *file = vma->vm_file;
	struct socket *sock = file->private_data;
	struct sock *sk = sock->sk;

	if (sk)
		atomic_inc(&nlk_sk(sk)->mapped);
}

static void netlink_mm_close(struct vm_area_struct *vma)
{
	struct file *file = vma->vm_file;
	struct socket *sock = file->private_data;
	struct sock *sk = sock->sk;

	if (sk)
		atomic_dec(&nlk_sk(sk)->mapped);
}

static const struct vm_operations_struct netlink_mmap_ops = {
	.open	= netlink_mm_open,
	.close	= netlink_mm_close,
};

static int netlink_mmap(struct file *file, struct socket *sock,
			struct vm_area_struct *vma)
{
	struct sock *sk = sock->sk;
	struct netlink_sock *nlk = nlk_sk(sk);
	struct netlink_ring *ring;
	unsigned long start, size, expected;
	unsigned int i;
	int err = -EINVAL;

	if (vma->vm_pgoff)
		return -EINVAL;

	mutex_lock(&nlk->pg_vec_lock);

	expected = 0;
	for (ring = &nlk->rx_ring; ring <= &nlk->tx_ring; ring++) {
		if (ring->pg_vec == NULL)
			continue;
		expected += ring->pg_vec_len * ring->pg_vec_pages * PAGE_SIZE;
	}

	if (expected == 0)
		goto out;

	size = vma->vm_end - vma->vm_start;
	if (size != expected)
		goto out;

	start = vma->vm_start;
	for (ring = &nlk->rx_ring; ring <= &nlk->tx_ring; ring++) {
		if (ring->pg_vec == NULL)
			continue;

		for (i = 0; i < ring->pg_vec_len; i++) {
			struct page *page;
			void *kaddr = ring->pg_vec[i];
			unsigned int pg_num;

			for (pg_num = 0; pg_num < ring->pg_vec_pages; pg_num++) {
				page = pgvec_to_page(kaddr);
				err = vm_insert_page(vma, start, page);
				if (err < 0)
					goto out;
				start += PAGE_SIZE;
				kaddr += PAGE_SIZE;
			}
		}
	}

	atomic_inc(&nlk->mapped);
	vma->vm_ops = &netlink_mmap_ops;
	err = 0;
out:
	mutex_unlock(&nlk->pg_vec_lock);
532
	return err;
533
}
534

D
David Miller 已提交
535
static void netlink_frame_flush_dcache(const struct nl_mmap_hdr *hdr, unsigned int nm_len)
536 537 538 539 540 541
{
#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
	struct page *p_start, *p_end;

	/* First page is flushed through netlink_{get,set}_status */
	p_start = pgvec_to_page(hdr + PAGE_SIZE);
D
David Miller 已提交
542
	p_end   = pgvec_to_page((void *)hdr + NL_MMAP_HDRLEN + nm_len - 1);
543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559
	while (p_start <= p_end) {
		flush_dcache_page(p_start);
		p_start++;
	}
#endif
}

static enum nl_mmap_status netlink_get_status(const struct nl_mmap_hdr *hdr)
{
	smp_rmb();
	flush_dcache_page(pgvec_to_page(hdr));
	return hdr->nm_status;
}

static void netlink_set_status(struct nl_mmap_hdr *hdr,
			       enum nl_mmap_status status)
{
560
	smp_mb();
561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625
	hdr->nm_status = status;
	flush_dcache_page(pgvec_to_page(hdr));
}

static struct nl_mmap_hdr *
__netlink_lookup_frame(const struct netlink_ring *ring, unsigned int pos)
{
	unsigned int pg_vec_pos, frame_off;

	pg_vec_pos = pos / ring->frames_per_block;
	frame_off  = pos % ring->frames_per_block;

	return ring->pg_vec[pg_vec_pos] + (frame_off * ring->frame_size);
}

static struct nl_mmap_hdr *
netlink_lookup_frame(const struct netlink_ring *ring, unsigned int pos,
		     enum nl_mmap_status status)
{
	struct nl_mmap_hdr *hdr;

	hdr = __netlink_lookup_frame(ring, pos);
	if (netlink_get_status(hdr) != status)
		return NULL;

	return hdr;
}

static struct nl_mmap_hdr *
netlink_current_frame(const struct netlink_ring *ring,
		      enum nl_mmap_status status)
{
	return netlink_lookup_frame(ring, ring->head, status);
}

static struct nl_mmap_hdr *
netlink_previous_frame(const struct netlink_ring *ring,
		       enum nl_mmap_status status)
{
	unsigned int prev;

	prev = ring->head ? ring->head - 1 : ring->frame_max;
	return netlink_lookup_frame(ring, prev, status);
}

static void netlink_increment_head(struct netlink_ring *ring)
{
	ring->head = ring->head != ring->frame_max ? ring->head + 1 : 0;
}

static void netlink_forward_ring(struct netlink_ring *ring)
{
	unsigned int head = ring->head, pos = head;
	const struct nl_mmap_hdr *hdr;

	do {
		hdr = __netlink_lookup_frame(ring, pos);
		if (hdr->nm_status == NL_MMAP_STATUS_UNUSED)
			break;
		if (hdr->nm_status != NL_MMAP_STATUS_SKIP)
			break;
		netlink_increment_head(ring);
	} while (ring->head != head);
}

626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644
static bool netlink_dump_space(struct netlink_sock *nlk)
{
	struct netlink_ring *ring = &nlk->rx_ring;
	struct nl_mmap_hdr *hdr;
	unsigned int n;

	hdr = netlink_current_frame(ring, NL_MMAP_STATUS_UNUSED);
	if (hdr == NULL)
		return false;

	n = ring->head + ring->frame_max / 2;
	if (n > ring->frame_max)
		n -= ring->frame_max;

	hdr = __netlink_lookup_frame(ring, n);

	return hdr->nm_status == NL_MMAP_STATUS_UNUSED;
}

645 646 647 648 649 650
static unsigned int netlink_poll(struct file *file, struct socket *sock,
				 poll_table *wait)
{
	struct sock *sk = sock->sk;
	struct netlink_sock *nlk = nlk_sk(sk);
	unsigned int mask;
651
	int err;
652

653 654 655 656 657
	if (nlk->rx_ring.pg_vec != NULL) {
		/* Memory mapped sockets don't call recvmsg(), so flow control
		 * for dumps is performed here. A dump is allowed to continue
		 * if at least half the ring is unused.
		 */
658
		while (nlk->cb_running && netlink_dump_space(nlk)) {
659 660
			err = netlink_dump(sk);
			if (err < 0) {
661
				sk->sk_err = -err;
662 663 664 665 666 667
				sk->sk_error_report(sk);
				break;
			}
		}
		netlink_rcv_wake(sk);
	}
668

669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713
	mask = datagram_poll(file, sock, wait);

	spin_lock_bh(&sk->sk_receive_queue.lock);
	if (nlk->rx_ring.pg_vec) {
		netlink_forward_ring(&nlk->rx_ring);
		if (!netlink_previous_frame(&nlk->rx_ring, NL_MMAP_STATUS_UNUSED))
			mask |= POLLIN | POLLRDNORM;
	}
	spin_unlock_bh(&sk->sk_receive_queue.lock);

	spin_lock_bh(&sk->sk_write_queue.lock);
	if (nlk->tx_ring.pg_vec) {
		if (netlink_current_frame(&nlk->tx_ring, NL_MMAP_STATUS_UNUSED))
			mask |= POLLOUT | POLLWRNORM;
	}
	spin_unlock_bh(&sk->sk_write_queue.lock);

	return mask;
}

static struct nl_mmap_hdr *netlink_mmap_hdr(struct sk_buff *skb)
{
	return (struct nl_mmap_hdr *)(skb->head - NL_MMAP_HDRLEN);
}

static void netlink_ring_setup_skb(struct sk_buff *skb, struct sock *sk,
				   struct netlink_ring *ring,
				   struct nl_mmap_hdr *hdr)
{
	unsigned int size;
	void *data;

	size = ring->frame_size - NL_MMAP_HDRLEN;
	data = (void *)hdr + NL_MMAP_HDRLEN;

	skb->head	= data;
	skb->data	= data;
	skb_reset_tail_pointer(skb);
	skb->end	= skb->tail + size;
	skb->len	= 0;

	skb->destructor	= netlink_skb_destructor;
	NETLINK_CB(skb).flags |= NETLINK_SKB_MMAPED;
	NETLINK_CB(skb).sk = sk;
}
714 715 716

static int netlink_mmap_sendmsg(struct sock *sk, struct msghdr *msg,
				u32 dst_portid, u32 dst_group,
C
Christoph Hellwig 已提交
717
				struct scm_cookie *scm)
718 719 720 721 722 723 724 725 726 727 728 729 730 731
{
	struct netlink_sock *nlk = nlk_sk(sk);
	struct netlink_ring *ring;
	struct nl_mmap_hdr *hdr;
	struct sk_buff *skb;
	unsigned int maxlen;
	int err = 0, len = 0;

	mutex_lock(&nlk->pg_vec_lock);

	ring   = &nlk->tx_ring;
	maxlen = ring->frame_size - NL_MMAP_HDRLEN;

	do {
D
David Miller 已提交
732 733
		unsigned int nm_len;

734 735 736 737 738 739 740
		hdr = netlink_current_frame(ring, NL_MMAP_STATUS_VALID);
		if (hdr == NULL) {
			if (!(msg->msg_flags & MSG_DONTWAIT) &&
			    atomic_read(&nlk->tx_ring.pending))
				schedule();
			continue;
		}
D
David Miller 已提交
741 742 743

		nm_len = ACCESS_ONCE(hdr->nm_len);
		if (nm_len > maxlen) {
744 745 746 747
			err = -EINVAL;
			goto out;
		}

D
David Miller 已提交
748
		netlink_frame_flush_dcache(hdr, nm_len);
749

D
David Miller 已提交
750 751 752 753
		skb = alloc_skb(nm_len, GFP_KERNEL);
		if (skb == NULL) {
			err = -ENOBUFS;
			goto out;
754
		}
D
David Miller 已提交
755 756 757
		__skb_put(skb, nm_len);
		memcpy(skb->data, (void *)hdr + NL_MMAP_HDRLEN, nm_len);
		netlink_set_status(hdr, NL_MMAP_STATUS_UNUSED);
758 759 760 761 762

		netlink_increment_head(ring);

		NETLINK_CB(skb).portid	  = nlk->portid;
		NETLINK_CB(skb).dst_group = dst_group;
C
Christoph Hellwig 已提交
763
		NETLINK_CB(skb).creds	  = scm->creds;
764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791

		err = security_netlink_send(sk, skb);
		if (err) {
			kfree_skb(skb);
			goto out;
		}

		if (unlikely(dst_group)) {
			atomic_inc(&skb->users);
			netlink_broadcast(sk, skb, dst_portid, dst_group,
					  GFP_KERNEL);
		}
		err = netlink_unicast(sk, skb, dst_portid,
				      msg->msg_flags & MSG_DONTWAIT);
		if (err < 0)
			goto out;
		len += err;

	} while (hdr != NULL ||
		 (!(msg->msg_flags & MSG_DONTWAIT) &&
		  atomic_read(&nlk->tx_ring.pending)));

	if (len > 0)
		err = len;
out:
	mutex_unlock(&nlk->pg_vec_lock);
	return err;
}
792 793 794 795 796 797 798 799 800

static void netlink_queue_mmaped_skb(struct sock *sk, struct sk_buff *skb)
{
	struct nl_mmap_hdr *hdr;

	hdr = netlink_mmap_hdr(skb);
	hdr->nm_len	= skb->len;
	hdr->nm_group	= NETLINK_CB(skb).dst_group;
	hdr->nm_pid	= NETLINK_CB(skb).creds.pid;
801 802
	hdr->nm_uid	= from_kuid(sk_user_ns(sk), NETLINK_CB(skb).creds.uid);
	hdr->nm_gid	= from_kgid(sk_user_ns(sk), NETLINK_CB(skb).creds.gid);
D
David Miller 已提交
803
	netlink_frame_flush_dcache(hdr, hdr->nm_len);
804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820
	netlink_set_status(hdr, NL_MMAP_STATUS_VALID);

	NETLINK_CB(skb).flags |= NETLINK_SKB_DELIVERED;
	kfree_skb(skb);
}

static void netlink_ring_set_copied(struct sock *sk, struct sk_buff *skb)
{
	struct netlink_sock *nlk = nlk_sk(sk);
	struct netlink_ring *ring = &nlk->rx_ring;
	struct nl_mmap_hdr *hdr;

	spin_lock_bh(&sk->sk_receive_queue.lock);
	hdr = netlink_current_frame(ring, NL_MMAP_STATUS_UNUSED);
	if (hdr == NULL) {
		spin_unlock_bh(&sk->sk_receive_queue.lock);
		kfree_skb(skb);
821
		netlink_overrun(sk);
822 823 824 825 826 827 828 829 830
		return;
	}
	netlink_increment_head(ring);
	__skb_queue_tail(&sk->sk_receive_queue, skb);
	spin_unlock_bh(&sk->sk_receive_queue.lock);

	hdr->nm_len	= skb->len;
	hdr->nm_group	= NETLINK_CB(skb).dst_group;
	hdr->nm_pid	= NETLINK_CB(skb).creds.pid;
831 832
	hdr->nm_uid	= from_kuid(sk_user_ns(sk), NETLINK_CB(skb).creds.uid);
	hdr->nm_gid	= from_kgid(sk_user_ns(sk), NETLINK_CB(skb).creds.gid);
833 834 835
	netlink_set_status(hdr, NL_MMAP_STATUS_COPY);
}

836
#else /* CONFIG_NETLINK_MMAP */
837
#define netlink_skb_is_mmaped(skb)	false
838
#define netlink_rx_is_mmaped(sk)	false
839
#define netlink_tx_is_mmaped(sk)	false
840
#define netlink_mmap			sock_no_mmap
841
#define netlink_poll			datagram_poll
C
Christoph Hellwig 已提交
842
#define netlink_mmap_sendmsg(sk, msg, dst_portid, dst_group, scm)	0
843 844
#endif /* CONFIG_NETLINK_MMAP */

845 846
static void netlink_skb_destructor(struct sk_buff *skb)
{
847 848 849 850 851 852 853 854 855 856 857 858 859 860
#ifdef CONFIG_NETLINK_MMAP
	struct nl_mmap_hdr *hdr;
	struct netlink_ring *ring;
	struct sock *sk;

	/* If a packet from the kernel to userspace was freed because of an
	 * error without being delivered to userspace, the kernel must reset
	 * the status. In the direction userspace to kernel, the status is
	 * always reset here after the packet was processed and freed.
	 */
	if (netlink_skb_is_mmaped(skb)) {
		hdr = netlink_mmap_hdr(skb);
		sk = NETLINK_CB(skb).sk;

861 862 863 864 865 866 867 868 869
		if (NETLINK_CB(skb).flags & NETLINK_SKB_TX) {
			netlink_set_status(hdr, NL_MMAP_STATUS_UNUSED);
			ring = &nlk_sk(sk)->tx_ring;
		} else {
			if (!(NETLINK_CB(skb).flags & NETLINK_SKB_DELIVERED)) {
				hdr->nm_len = 0;
				netlink_set_status(hdr, NL_MMAP_STATUS_VALID);
			}
			ring = &nlk_sk(sk)->rx_ring;
870 871 872 873 874 875
		}

		WARN_ON(atomic_read(&ring->pending) == 0);
		atomic_dec(&ring->pending);
		sock_put(sk);

876
		skb->head = NULL;
877 878
	}
#endif
879
	if (is_vmalloc_addr(skb->head)) {
880 881 882 883
		if (!skb->cloned ||
		    !atomic_dec_return(&(skb_shinfo(skb)->dataref)))
			vfree(skb->head);

884 885
		skb->head = NULL;
	}
886 887
	if (skb->sk != NULL)
		sock_rfree(skb);
888 889 890 891 892 893 894 895 896 897 898
}

static void netlink_skb_set_owner_r(struct sk_buff *skb, struct sock *sk)
{
	WARN_ON(skb->sk != NULL);
	skb->sk = sk;
	skb->destructor = netlink_skb_destructor;
	atomic_add(skb->truesize, &sk->sk_rmem_alloc);
	sk_mem_charge(sk, skb->truesize);
}

L
Linus Torvalds 已提交
899 900
static void netlink_sock_destruct(struct sock *sk)
{
901 902
	struct netlink_sock *nlk = nlk_sk(sk);

903 904 905
	if (nlk->cb_running) {
		if (nlk->cb.done)
			nlk->cb.done(&nlk->cb);
906

907 908
		module_put(nlk->cb.module);
		kfree_skb(nlk->cb.skb);
909 910
	}

L
Linus Torvalds 已提交
911
	skb_queue_purge(&sk->sk_receive_queue);
912 913 914 915 916 917
#ifdef CONFIG_NETLINK_MMAP
	if (1) {
		struct nl_mmap_req req;

		memset(&req, 0, sizeof(req));
		if (nlk->rx_ring.pg_vec)
918
			__netlink_set_ring(sk, &req, false, NULL, 0);
919 920
		memset(&req, 0, sizeof(req));
		if (nlk->tx_ring.pg_vec)
921
			__netlink_set_ring(sk, &req, true, NULL, 0);
922 923
	}
#endif /* CONFIG_NETLINK_MMAP */
L
Linus Torvalds 已提交
924 925

	if (!sock_flag(sk, SOCK_DEAD)) {
926
		printk(KERN_ERR "Freeing alive netlink socket %p\n", sk);
L
Linus Torvalds 已提交
927 928
		return;
	}
929 930 931 932

	WARN_ON(atomic_read(&sk->sk_rmem_alloc));
	WARN_ON(atomic_read(&sk->sk_wmem_alloc));
	WARN_ON(nlk_sk(sk)->groups);
L
Linus Torvalds 已提交
933 934
}

935 936
/* This lock without WQ_FLAG_EXCLUSIVE is good on UP and it is _very_ bad on
 * SMP. Look, when several writers sleep and reader wakes them up, all but one
L
Linus Torvalds 已提交
937 938 939 940
 * immediately hit write lock and grab all the cpus. Exclusive sleep solves
 * this, _but_ remember, it adds useless work on UP machines.
 */

941
void netlink_table_grab(void)
942
	__acquires(nl_table_lock)
L
Linus Torvalds 已提交
943
{
944 945
	might_sleep();

946
	write_lock_irq(&nl_table_lock);
L
Linus Torvalds 已提交
947 948 949 950 951

	if (atomic_read(&nl_table_users)) {
		DECLARE_WAITQUEUE(wait, current);

		add_wait_queue_exclusive(&nl_table_wait, &wait);
952
		for (;;) {
L
Linus Torvalds 已提交
953 954 955
			set_current_state(TASK_UNINTERRUPTIBLE);
			if (atomic_read(&nl_table_users) == 0)
				break;
956
			write_unlock_irq(&nl_table_lock);
L
Linus Torvalds 已提交
957
			schedule();
958
			write_lock_irq(&nl_table_lock);
L
Linus Torvalds 已提交
959 960 961 962 963 964 965
		}

		__set_current_state(TASK_RUNNING);
		remove_wait_queue(&nl_table_wait, &wait);
	}
}

966
void netlink_table_ungrab(void)
967
	__releases(nl_table_lock)
L
Linus Torvalds 已提交
968
{
969
	write_unlock_irq(&nl_table_lock);
L
Linus Torvalds 已提交
970 971 972
	wake_up(&nl_table_wait);
}

973
static inline void
L
Linus Torvalds 已提交
974 975 976 977 978 979 980 981 982
netlink_lock_table(void)
{
	/* read_lock() synchronizes us to netlink_table_grab */

	read_lock(&nl_table_lock);
	atomic_inc(&nl_table_users);
	read_unlock(&nl_table_lock);
}

983
static inline void
L
Linus Torvalds 已提交
984 985 986 987 988 989
netlink_unlock_table(void)
{
	if (atomic_dec_and_test(&nl_table_users))
		wake_up(&nl_table_wait);
}

990
struct netlink_compare_arg
L
Linus Torvalds 已提交
991
{
992
	possible_net_t pnet;
993 994
	u32 portid;
};
L
Linus Torvalds 已提交
995

996 997 998
/* Doing sizeof directly may yield 4 extra bytes on 64-bit. */
#define netlink_compare_arg_len \
	(offsetof(struct netlink_compare_arg, portid) + sizeof(u32))
999 1000 1001

static inline int netlink_compare(struct rhashtable_compare_arg *arg,
				  const void *ptr)
L
Linus Torvalds 已提交
1002
{
1003 1004
	const struct netlink_compare_arg *x = arg->key;
	const struct netlink_sock *nlk = ptr;
L
Linus Torvalds 已提交
1005

1006 1007 1008 1009 1010 1011 1012 1013 1014 1015
	return nlk->portid != x->portid ||
	       !net_eq(sock_net(&nlk->sk), read_pnet(&x->pnet));
}

static void netlink_compare_arg_init(struct netlink_compare_arg *arg,
				     struct net *net, u32 portid)
{
	memset(arg, 0, sizeof(*arg));
	write_pnet(&arg->pnet, net);
	arg->portid = portid;
L
Linus Torvalds 已提交
1016 1017
}

1018 1019
static struct sock *__netlink_lookup(struct netlink_table *table, u32 portid,
				     struct net *net)
L
Linus Torvalds 已提交
1020
{
1021
	struct netlink_compare_arg arg;
L
Linus Torvalds 已提交
1022

1023 1024 1025
	netlink_compare_arg_init(&arg, net, portid);
	return rhashtable_lookup_fast(&table->hash, &arg,
				      netlink_rhashtable_params);
L
Linus Torvalds 已提交
1026 1027
}

1028
static int __netlink_insert(struct netlink_table *table, struct sock *sk)
Y
Ying Xue 已提交
1029
{
1030
	struct netlink_compare_arg arg;
Y
Ying Xue 已提交
1031

1032 1033 1034 1035
	netlink_compare_arg_init(&arg, sock_net(sk), nlk_sk(sk)->portid);
	return rhashtable_lookup_insert_key(&table->hash, &arg,
					    &nlk_sk(sk)->node,
					    netlink_rhashtable_params);
Y
Ying Xue 已提交
1036 1037
}

1038
static struct sock *netlink_lookup(struct net *net, int protocol, u32 portid)
L
Linus Torvalds 已提交
1039
{
1040 1041
	struct netlink_table *table = &nl_table[protocol];
	struct sock *sk;
L
Linus Torvalds 已提交
1042

1043 1044 1045 1046 1047
	rcu_read_lock();
	sk = __netlink_lookup(table, portid, net);
	if (sk)
		sock_hold(sk);
	rcu_read_unlock();
L
Linus Torvalds 已提交
1048

1049
	return sk;
L
Linus Torvalds 已提交
1050 1051
}

1052
static const struct proto_ops netlink_ops;
L
Linus Torvalds 已提交
1053

1054 1055 1056 1057 1058 1059
static void
netlink_update_listeners(struct sock *sk)
{
	struct netlink_table *tbl = &nl_table[sk->sk_protocol];
	unsigned long mask;
	unsigned int i;
1060 1061 1062 1063 1064
	struct listeners *listeners;

	listeners = nl_deref_protected(tbl->listeners);
	if (!listeners)
		return;
1065

1066
	for (i = 0; i < NLGRPLONGS(tbl->groups); i++) {
1067
		mask = 0;
1068
		sk_for_each_bound(sk, &tbl->mc_list) {
1069 1070 1071
			if (i < NLGRPLONGS(nlk_sk(sk)->ngroups))
				mask |= nlk_sk(sk)->groups[i];
		}
1072
		listeners->masks[i] = mask;
1073 1074 1075 1076 1077
	}
	/* this function is only called with the netlink table "grabbed", which
	 * makes sure updates are visible before bind or setsockopt return. */
}

1078
static int netlink_insert(struct sock *sk, u32 portid)
L
Linus Torvalds 已提交
1079
{
1080
	struct netlink_table *table = &nl_table[sk->sk_protocol];
1081
	int err;
L
Linus Torvalds 已提交
1082

Y
Ying Xue 已提交
1083
	lock_sock(sk);
L
Linus Torvalds 已提交
1084 1085

	err = -EBUSY;
1086
	if (nlk_sk(sk)->portid)
L
Linus Torvalds 已提交
1087 1088 1089
		goto err;

	err = -ENOMEM;
1090 1091
	if (BITS_PER_LONG > 32 &&
	    unlikely(atomic_read(&table->hash.nelems) >= UINT_MAX))
L
Linus Torvalds 已提交
1092 1093
		goto err;

1094
	nlk_sk(sk)->portid = portid;
1095
	sock_hold(sk);
1096

1097 1098
	err = __netlink_insert(table, sk);
	if (err) {
1099 1100 1101 1102 1103
		/* In case the hashtable backend returns with -EBUSY
		 * from here, it must not escape to the caller.
		 */
		if (unlikely(err == -EBUSY))
			err = -EOVERFLOW;
1104 1105
		if (err == -EEXIST)
			err = -EADDRINUSE;
1106
		nlk_sk(sk)->portid = 0;
Y
Ying Xue 已提交
1107
		sock_put(sk);
1108 1109
	}

L
Linus Torvalds 已提交
1110
err:
Y
Ying Xue 已提交
1111
	release_sock(sk);
L
Linus Torvalds 已提交
1112 1113 1114 1115 1116
	return err;
}

static void netlink_remove(struct sock *sk)
{
1117 1118 1119
	struct netlink_table *table;

	table = &nl_table[sk->sk_protocol];
1120 1121
	if (!rhashtable_remove_fast(&table->hash, &nlk_sk(sk)->node,
				    netlink_rhashtable_params)) {
1122 1123 1124 1125
		WARN_ON(atomic_read(&sk->sk_refcnt) == 1);
		__sock_put(sk);
	}

L
Linus Torvalds 已提交
1126
	netlink_table_grab();
1127
	if (nlk_sk(sk)->subscriptions) {
L
Linus Torvalds 已提交
1128
		__sk_del_bind_node(sk);
1129 1130
		netlink_update_listeners(sk);
	}
1131 1132
	if (sk->sk_protocol == NETLINK_GENERIC)
		atomic_inc(&genl_sk_destructing_cnt);
L
Linus Torvalds 已提交
1133 1134 1135 1136 1137 1138 1139 1140 1141
	netlink_table_ungrab();
}

static struct proto netlink_proto = {
	.name	  = "NETLINK",
	.owner	  = THIS_MODULE,
	.obj_size = sizeof(struct netlink_sock),
};

1142
static int __netlink_create(struct net *net, struct socket *sock,
1143 1144
			    struct mutex *cb_mutex, int protocol,
			    int kern)
L
Linus Torvalds 已提交
1145 1146 1147
{
	struct sock *sk;
	struct netlink_sock *nlk;
1148 1149 1150

	sock->ops = &netlink_ops;

1151
	sk = sk_alloc(net, PF_NETLINK, GFP_KERNEL, &netlink_proto, kern);
1152 1153 1154 1155 1156 1157
	if (!sk)
		return -ENOMEM;

	sock_init_data(sock, sk);

	nlk = nlk_sk(sk);
E
Eric Dumazet 已提交
1158
	if (cb_mutex) {
1159
		nlk->cb_mutex = cb_mutex;
E
Eric Dumazet 已提交
1160
	} else {
1161 1162 1163
		nlk->cb_mutex = &nlk->cb_def_mutex;
		mutex_init(nlk->cb_mutex);
	}
1164
	init_waitqueue_head(&nlk->wait);
1165 1166 1167
#ifdef CONFIG_NETLINK_MMAP
	mutex_init(&nlk->pg_vec_lock);
#endif
1168 1169 1170 1171 1172 1173

	sk->sk_destruct = netlink_sock_destruct;
	sk->sk_protocol = protocol;
	return 0;
}

1174 1175
static int netlink_create(struct net *net, struct socket *sock, int protocol,
			  int kern)
1176 1177
{
	struct module *module = NULL;
1178
	struct mutex *cb_mutex;
1179
	struct netlink_sock *nlk;
1180 1181
	int (*bind)(struct net *net, int group);
	void (*unbind)(struct net *net, int group);
1182
	int err = 0;
L
Linus Torvalds 已提交
1183 1184 1185 1186 1187 1188

	sock->state = SS_UNCONNECTED;

	if (sock->type != SOCK_RAW && sock->type != SOCK_DGRAM)
		return -ESOCKTNOSUPPORT;

1189
	if (protocol < 0 || protocol >= MAX_LINKS)
L
Linus Torvalds 已提交
1190 1191
		return -EPROTONOSUPPORT;

1192
	netlink_lock_table();
1193
#ifdef CONFIG_MODULES
1194
	if (!nl_table[protocol].registered) {
1195
		netlink_unlock_table();
1196
		request_module("net-pf-%d-proto-%d", PF_NETLINK, protocol);
1197
		netlink_lock_table();
1198
	}
1199 1200 1201 1202
#endif
	if (nl_table[protocol].registered &&
	    try_module_get(nl_table[protocol].module))
		module = nl_table[protocol].module;
1203 1204
	else
		err = -EPROTONOSUPPORT;
1205
	cb_mutex = nl_table[protocol].cb_mutex;
1206
	bind = nl_table[protocol].bind;
1207
	unbind = nl_table[protocol].unbind;
1208
	netlink_unlock_table();
1209

1210 1211 1212
	if (err < 0)
		goto out;

1213
	err = __netlink_create(net, sock, cb_mutex, protocol, kern);
1214
	if (err < 0)
1215 1216
		goto out_module;

1217
	local_bh_disable();
1218
	sock_prot_inuse_add(net, &netlink_proto, 1);
1219 1220
	local_bh_enable();

1221 1222
	nlk = nlk_sk(sock->sk);
	nlk->module = module;
1223
	nlk->netlink_bind = bind;
1224
	nlk->netlink_unbind = unbind;
1225 1226
out:
	return err;
L
Linus Torvalds 已提交
1227

1228 1229 1230
out_module:
	module_put(module);
	goto out;
L
Linus Torvalds 已提交
1231 1232
}

1233 1234 1235 1236 1237 1238 1239
static void deferred_put_nlk_sk(struct rcu_head *head)
{
	struct netlink_sock *nlk = container_of(head, struct netlink_sock, rcu);

	sock_put(&nlk->sk);
}

L
Linus Torvalds 已提交
1240 1241 1242 1243 1244 1245 1246 1247 1248
static int netlink_release(struct socket *sock)
{
	struct sock *sk = sock->sk;
	struct netlink_sock *nlk;

	if (!sk)
		return 0;

	netlink_remove(sk);
1249
	sock_orphan(sk);
L
Linus Torvalds 已提交
1250 1251
	nlk = nlk_sk(sk);

1252 1253 1254 1255
	/*
	 * OK. Socket is unlinked, any packets that arrive now
	 * will be purged.
	 */
L
Linus Torvalds 已提交
1256

1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270
	/* must not acquire netlink_table_lock in any way again before unbind
	 * and notifying genetlink is done as otherwise it might deadlock
	 */
	if (nlk->netlink_unbind) {
		int i;

		for (i = 0; i < nlk->ngroups; i++)
			if (test_bit(i, nlk->groups))
				nlk->netlink_unbind(sock_net(sk), i + 1);
	}
	if (sk->sk_protocol == NETLINK_GENERIC &&
	    atomic_dec_return(&genl_sk_destructing_cnt) == 0)
		wake_up(&genl_sk_destructing_waitq);

L
Linus Torvalds 已提交
1271 1272 1273 1274 1275
	sock->sk = NULL;
	wake_up_interruptible_all(&nlk->wait);

	skb_queue_purge(&sk->sk_write_queue);

1276
	if (nlk->portid) {
L
Linus Torvalds 已提交
1277
		struct netlink_notify n = {
1278
						.net = sock_net(sk),
L
Linus Torvalds 已提交
1279
						.protocol = sk->sk_protocol,
1280
						.portid = nlk->portid,
L
Linus Torvalds 已提交
1281
					  };
1282 1283
		atomic_notifier_call_chain(&netlink_chain,
				NETLINK_URELEASE, &n);
1284
	}
1285

1286
	module_put(nlk->module);
1287

1288
	if (netlink_is_kernel(sk)) {
1289
		netlink_table_grab();
1290 1291
		BUG_ON(nl_table[sk->sk_protocol].registered == 0);
		if (--nl_table[sk->sk_protocol].registered == 0) {
1292 1293 1294 1295 1296
			struct listeners *old;

			old = nl_deref_protected(nl_table[sk->sk_protocol].listeners);
			RCU_INIT_POINTER(nl_table[sk->sk_protocol].listeners, NULL);
			kfree_rcu(old, rcu);
1297
			nl_table[sk->sk_protocol].module = NULL;
1298
			nl_table[sk->sk_protocol].bind = NULL;
1299
			nl_table[sk->sk_protocol].unbind = NULL;
1300
			nl_table[sk->sk_protocol].flags = 0;
1301 1302
			nl_table[sk->sk_protocol].registered = 0;
		}
1303
		netlink_table_ungrab();
E
Eric Dumazet 已提交
1304
	}
1305

1306 1307 1308
	kfree(nlk->groups);
	nlk->groups = NULL;

1309
	local_bh_disable();
1310
	sock_prot_inuse_add(sock_net(sk), &netlink_proto, -1);
1311
	local_bh_enable();
1312
	call_rcu(&nlk->rcu, deferred_put_nlk_sk);
L
Linus Torvalds 已提交
1313 1314 1315 1316 1317 1318
	return 0;
}

static int netlink_autobind(struct socket *sock)
{
	struct sock *sk = sock->sk;
1319
	struct net *net = sock_net(sk);
1320
	struct netlink_table *table = &nl_table[sk->sk_protocol];
1321
	s32 portid = task_tgid_vnr(current);
L
Linus Torvalds 已提交
1322
	int err;
H
Herbert Xu 已提交
1323 1324
	s32 rover = -4096;
	bool ok;
L
Linus Torvalds 已提交
1325 1326 1327

retry:
	cond_resched();
1328
	rcu_read_lock();
H
Herbert Xu 已提交
1329 1330 1331
	ok = !__netlink_lookup(table, portid, net);
	rcu_read_unlock();
	if (!ok) {
1332
		/* Bind collision, search negative portid values. */
H
Herbert Xu 已提交
1333 1334 1335 1336
		if (rover == -4096)
			/* rover will be in range [S32_MIN, -4097] */
			rover = S32_MIN + prandom_u32_max(-4096 - S32_MIN);
		else if (rover >= -4096)
1337
			rover = -4097;
H
Herbert Xu 已提交
1338
		portid = rover--;
1339
		goto retry;
L
Linus Torvalds 已提交
1340 1341
	}

1342
	err = netlink_insert(sk, portid);
L
Linus Torvalds 已提交
1343 1344
	if (err == -EADDRINUSE)
		goto retry;
1345 1346 1347 1348 1349 1350

	/* If 2 threads race to autobind, that is fine.  */
	if (err == -EBUSY)
		err = 0;

	return err;
L
Linus Torvalds 已提交
1351 1352
}

1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365
/**
 * __netlink_ns_capable - General netlink message capability test
 * @nsp: NETLINK_CB of the socket buffer holding a netlink command from userspace.
 * @user_ns: The user namespace of the capability to use
 * @cap: The capability to use
 *
 * Test to see if the opener of the socket we received the message
 * from had when the netlink socket was created and the sender of the
 * message has has the capability @cap in the user namespace @user_ns.
 */
bool __netlink_ns_capable(const struct netlink_skb_parms *nsp,
			struct user_namespace *user_ns, int cap)
{
1366 1367 1368
	return ((nsp->flags & NETLINK_SKB_DST) ||
		file_ns_capable(nsp->sk->sk_socket->file, user_ns, cap)) &&
		ns_capable(user_ns, cap);
1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419
}
EXPORT_SYMBOL(__netlink_ns_capable);

/**
 * netlink_ns_capable - General netlink message capability test
 * @skb: socket buffer holding a netlink command from userspace
 * @user_ns: The user namespace of the capability to use
 * @cap: The capability to use
 *
 * Test to see if the opener of the socket we received the message
 * from had when the netlink socket was created and the sender of the
 * message has has the capability @cap in the user namespace @user_ns.
 */
bool netlink_ns_capable(const struct sk_buff *skb,
			struct user_namespace *user_ns, int cap)
{
	return __netlink_ns_capable(&NETLINK_CB(skb), user_ns, cap);
}
EXPORT_SYMBOL(netlink_ns_capable);

/**
 * netlink_capable - Netlink global message capability test
 * @skb: socket buffer holding a netlink command from userspace
 * @cap: The capability to use
 *
 * Test to see if the opener of the socket we received the message
 * from had when the netlink socket was created and the sender of the
 * message has has the capability @cap in all user namespaces.
 */
bool netlink_capable(const struct sk_buff *skb, int cap)
{
	return netlink_ns_capable(skb, &init_user_ns, cap);
}
EXPORT_SYMBOL(netlink_capable);

/**
 * netlink_net_capable - Netlink network namespace message capability test
 * @skb: socket buffer holding a netlink command from userspace
 * @cap: The capability to use
 *
 * Test to see if the opener of the socket we received the message
 * from had when the netlink socket was created and the sender of the
 * message has has the capability @cap over the network namespace of
 * the socket we received the message from.
 */
bool netlink_net_capable(const struct sk_buff *skb, int cap)
{
	return netlink_ns_capable(skb, sock_net(skb->sk)->user_ns, cap);
}
EXPORT_SYMBOL(netlink_net_capable);

1420
static inline int netlink_allowed(const struct socket *sock, unsigned int flag)
1421
{
1422
	return (nl_table[sock->sk->sk_protocol].flags & flag) ||
1423
		ns_capable(sock_net(sock->sk)->user_ns, CAP_NET_ADMIN);
1424
}
L
Linus Torvalds 已提交
1425

1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437
static void
netlink_update_subscriptions(struct sock *sk, unsigned int subscriptions)
{
	struct netlink_sock *nlk = nlk_sk(sk);

	if (nlk->subscriptions && !subscriptions)
		__sk_del_bind_node(sk);
	else if (!nlk->subscriptions && subscriptions)
		sk_add_bind_node(sk, &nl_table[sk->sk_protocol].mc_list);
	nlk->subscriptions = subscriptions;
}

1438
static int netlink_realloc_groups(struct sock *sk)
1439 1440 1441
{
	struct netlink_sock *nlk = nlk_sk(sk);
	unsigned int groups;
1442
	unsigned long *new_groups;
1443 1444
	int err = 0;

1445 1446
	netlink_table_grab();

1447
	groups = nl_table[sk->sk_protocol].groups;
1448
	if (!nl_table[sk->sk_protocol].registered) {
1449
		err = -ENOENT;
1450 1451
		goto out_unlock;
	}
1452

1453 1454
	if (nlk->ngroups >= groups)
		goto out_unlock;
1455

1456 1457 1458 1459 1460
	new_groups = krealloc(nlk->groups, NLGRPSZ(groups), GFP_ATOMIC);
	if (new_groups == NULL) {
		err = -ENOMEM;
		goto out_unlock;
	}
1461
	memset((char *)new_groups + NLGRPSZ(nlk->ngroups), 0,
1462 1463 1464
	       NLGRPSZ(groups) - NLGRPSZ(nlk->ngroups));

	nlk->groups = new_groups;
1465
	nlk->ngroups = groups;
1466 1467 1468
 out_unlock:
	netlink_table_ungrab();
	return err;
1469 1470
}

1471
static void netlink_undo_bind(int group, long unsigned int groups,
1472
			      struct sock *sk)
1473
{
1474
	struct netlink_sock *nlk = nlk_sk(sk);
1475 1476 1477 1478 1479 1480
	int undo;

	if (!nlk->netlink_unbind)
		return;

	for (undo = 0; undo < group; undo++)
1481
		if (test_bit(undo, &groups))
1482
			nlk->netlink_unbind(sock_net(sk), undo + 1);
1483 1484
}

1485 1486
static int netlink_bind(struct socket *sock, struct sockaddr *addr,
			int addr_len)
L
Linus Torvalds 已提交
1487 1488
{
	struct sock *sk = sock->sk;
1489
	struct net *net = sock_net(sk);
L
Linus Torvalds 已提交
1490 1491 1492
	struct netlink_sock *nlk = nlk_sk(sk);
	struct sockaddr_nl *nladdr = (struct sockaddr_nl *)addr;
	int err;
1493
	long unsigned int groups = nladdr->nl_groups;
1494

1495 1496 1497
	if (addr_len < sizeof(struct sockaddr_nl))
		return -EINVAL;

L
Linus Torvalds 已提交
1498 1499 1500 1501
	if (nladdr->nl_family != AF_NETLINK)
		return -EINVAL;

	/* Only superuser is allowed to listen multicasts */
1502
	if (groups) {
1503
		if (!netlink_allowed(sock, NL_CFG_F_NONROOT_RECV))
1504
			return -EPERM;
1505 1506 1507
		err = netlink_realloc_groups(sk);
		if (err)
			return err;
1508
	}
L
Linus Torvalds 已提交
1509

1510
	if (nlk->portid)
1511
		if (nladdr->nl_pid != nlk->portid)
L
Linus Torvalds 已提交
1512
			return -EINVAL;
1513 1514 1515 1516 1517 1518 1519

	if (nlk->netlink_bind && groups) {
		int group;

		for (group = 0; group < nlk->ngroups; group++) {
			if (!test_bit(group, &groups))
				continue;
1520
			err = nlk->netlink_bind(net, group + 1);
1521 1522
			if (!err)
				continue;
1523
			netlink_undo_bind(group, groups, sk);
1524 1525 1526 1527 1528
			return err;
		}
	}

	if (!nlk->portid) {
L
Linus Torvalds 已提交
1529
		err = nladdr->nl_pid ?
1530
			netlink_insert(sk, nladdr->nl_pid) :
L
Linus Torvalds 已提交
1531
			netlink_autobind(sock);
1532
		if (err) {
1533
			netlink_undo_bind(nlk->ngroups, groups, sk);
L
Linus Torvalds 已提交
1534
			return err;
1535
		}
L
Linus Torvalds 已提交
1536 1537
	}

1538
	if (!groups && (nlk->groups == NULL || !(u32)nlk->groups[0]))
L
Linus Torvalds 已提交
1539 1540 1541
		return 0;

	netlink_table_grab();
1542
	netlink_update_subscriptions(sk, nlk->subscriptions +
1543
					 hweight32(groups) -
1544
					 hweight32(nlk->groups[0]));
1545
	nlk->groups[0] = (nlk->groups[0] & ~0xffffffffUL) | groups;
1546
	netlink_update_listeners(sk);
L
Linus Torvalds 已提交
1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557
	netlink_table_ungrab();

	return 0;
}

static int netlink_connect(struct socket *sock, struct sockaddr *addr,
			   int alen, int flags)
{
	int err = 0;
	struct sock *sk = sock->sk;
	struct netlink_sock *nlk = nlk_sk(sk);
1558
	struct sockaddr_nl *nladdr = (struct sockaddr_nl *)addr;
L
Linus Torvalds 已提交
1559

1560 1561 1562
	if (alen < sizeof(addr->sa_family))
		return -EINVAL;

L
Linus Torvalds 已提交
1563 1564
	if (addr->sa_family == AF_UNSPEC) {
		sk->sk_state	= NETLINK_UNCONNECTED;
1565
		nlk->dst_portid	= 0;
1566
		nlk->dst_group  = 0;
L
Linus Torvalds 已提交
1567 1568 1569 1570 1571
		return 0;
	}
	if (addr->sa_family != AF_NETLINK)
		return -EINVAL;

1572
	if ((nladdr->nl_groups || nladdr->nl_pid) &&
1573
	    !netlink_allowed(sock, NL_CFG_F_NONROOT_SEND))
L
Linus Torvalds 已提交
1574 1575
		return -EPERM;

1576
	if (!nlk->portid)
L
Linus Torvalds 已提交
1577 1578 1579 1580
		err = netlink_autobind(sock);

	if (err == 0) {
		sk->sk_state	= NETLINK_CONNECTED;
1581
		nlk->dst_portid = nladdr->nl_pid;
1582
		nlk->dst_group  = ffs(nladdr->nl_groups);
L
Linus Torvalds 已提交
1583 1584 1585 1586 1587
	}

	return err;
}

1588 1589
static int netlink_getname(struct socket *sock, struct sockaddr *addr,
			   int *addr_len, int peer)
L
Linus Torvalds 已提交
1590 1591 1592
{
	struct sock *sk = sock->sk;
	struct netlink_sock *nlk = nlk_sk(sk);
1593
	DECLARE_SOCKADDR(struct sockaddr_nl *, nladdr, addr);
1594

L
Linus Torvalds 已提交
1595 1596 1597 1598 1599
	nladdr->nl_family = AF_NETLINK;
	nladdr->nl_pad = 0;
	*addr_len = sizeof(*nladdr);

	if (peer) {
1600
		nladdr->nl_pid = nlk->dst_portid;
1601
		nladdr->nl_groups = netlink_group_mask(nlk->dst_group);
L
Linus Torvalds 已提交
1602
	} else {
1603
		nladdr->nl_pid = nlk->portid;
1604
		nladdr->nl_groups = nlk->groups ? nlk->groups[0] : 0;
L
Linus Torvalds 已提交
1605 1606 1607 1608
	}
	return 0;
}

1609
static struct sock *netlink_getsockbyportid(struct sock *ssk, u32 portid)
L
Linus Torvalds 已提交
1610 1611 1612 1613
{
	struct sock *sock;
	struct netlink_sock *nlk;

1614
	sock = netlink_lookup(sock_net(ssk), ssk->sk_protocol, portid);
L
Linus Torvalds 已提交
1615 1616 1617 1618 1619
	if (!sock)
		return ERR_PTR(-ECONNREFUSED);

	/* Don't bother queuing skb if kernel socket has no input function */
	nlk = nlk_sk(sock);
1620
	if (sock->sk_state == NETLINK_CONNECTED &&
1621
	    nlk->dst_portid != nlk_sk(ssk)->portid) {
L
Linus Torvalds 已提交
1622 1623 1624 1625 1626 1627 1628 1629
		sock_put(sock);
		return ERR_PTR(-ECONNREFUSED);
	}
	return sock;
}

struct sock *netlink_getsockbyfilp(struct file *filp)
{
A
Al Viro 已提交
1630
	struct inode *inode = file_inode(filp);
L
Linus Torvalds 已提交
1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643
	struct sock *sock;

	if (!S_ISSOCK(inode->i_mode))
		return ERR_PTR(-ENOTSOCK);

	sock = SOCKET_I(inode)->sk;
	if (sock->sk_family != AF_NETLINK)
		return ERR_PTR(-EINVAL);

	sock_hold(sock);
	return sock;
}

1644 1645
static struct sk_buff *netlink_alloc_large_skb(unsigned int size,
					       int broadcast)
1646 1647 1648 1649
{
	struct sk_buff *skb;
	void *data;

1650
	if (size <= NLMSG_GOODSIZE || broadcast)
1651 1652
		return alloc_skb(size, GFP_KERNEL);

1653 1654
	size = SKB_DATA_ALIGN(size) +
	       SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
1655 1656 1657

	data = vmalloc(size);
	if (data == NULL)
1658
		return NULL;
1659

E
Eric Dumazet 已提交
1660
	skb = __build_skb(data, size);
1661 1662
	if (skb == NULL)
		vfree(data);
E
Eric Dumazet 已提交
1663
	else
1664
		skb->destructor = netlink_skb_destructor;
1665 1666 1667 1668

	return skb;
}

L
Linus Torvalds 已提交
1669 1670 1671 1672 1673 1674 1675 1676 1677 1678
/*
 * Attach a skb to a netlink socket.
 * The caller must hold a reference to the destination socket. On error, the
 * reference is dropped. The skb is not send to the destination, just all
 * all error checks are performed and memory in the queue is reserved.
 * Return values:
 * < 0: error. skb freed, reference to sock dropped.
 * 0: continue
 * 1: repeat lookup - reference dropped while waiting for socket memory.
 */
1679
int netlink_attachskb(struct sock *sk, struct sk_buff *skb,
P
Patrick McHardy 已提交
1680
		      long *timeo, struct sock *ssk)
L
Linus Torvalds 已提交
1681 1682 1683 1684 1685
{
	struct netlink_sock *nlk;

	nlk = nlk_sk(sk);

1686
	if ((atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf ||
1687
	     test_bit(NETLINK_S_CONGESTED, &nlk->state)) &&
1688
	    !netlink_skb_is_mmaped(skb)) {
L
Linus Torvalds 已提交
1689
		DECLARE_WAITQUEUE(wait, current);
P
Patrick McHardy 已提交
1690
		if (!*timeo) {
1691
			if (!ssk || netlink_is_kernel(ssk))
L
Linus Torvalds 已提交
1692 1693 1694 1695 1696 1697 1698 1699 1700 1701
				netlink_overrun(sk);
			sock_put(sk);
			kfree_skb(skb);
			return -EAGAIN;
		}

		__set_current_state(TASK_INTERRUPTIBLE);
		add_wait_queue(&nlk->wait, &wait);

		if ((atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf ||
1702
		     test_bit(NETLINK_S_CONGESTED, &nlk->state)) &&
L
Linus Torvalds 已提交
1703
		    !sock_flag(sk, SOCK_DEAD))
P
Patrick McHardy 已提交
1704
			*timeo = schedule_timeout(*timeo);
L
Linus Torvalds 已提交
1705 1706 1707 1708 1709 1710 1711

		__set_current_state(TASK_RUNNING);
		remove_wait_queue(&nlk->wait, &wait);
		sock_put(sk);

		if (signal_pending(current)) {
			kfree_skb(skb);
P
Patrick McHardy 已提交
1712
			return sock_intr_errno(*timeo);
L
Linus Torvalds 已提交
1713 1714 1715
		}
		return 1;
	}
1716
	netlink_skb_set_owner_r(skb, sk);
L
Linus Torvalds 已提交
1717 1718 1719
	return 0;
}

1720
static int __netlink_sendskb(struct sock *sk, struct sk_buff *skb)
L
Linus Torvalds 已提交
1721 1722 1723
{
	int len = skb->len;

1724 1725
	netlink_deliver_tap(skb);

1726 1727 1728 1729 1730 1731 1732 1733
#ifdef CONFIG_NETLINK_MMAP
	if (netlink_skb_is_mmaped(skb))
		netlink_queue_mmaped_skb(sk, skb);
	else if (netlink_rx_is_mmaped(sk))
		netlink_ring_set_copied(sk, skb);
	else
#endif /* CONFIG_NETLINK_MMAP */
		skb_queue_tail(&sk->sk_receive_queue, skb);
1734
	sk->sk_data_ready(sk);
1735 1736 1737 1738 1739 1740 1741
	return len;
}

int netlink_sendskb(struct sock *sk, struct sk_buff *skb)
{
	int len = __netlink_sendskb(sk, skb);

L
Linus Torvalds 已提交
1742 1743 1744 1745 1746 1747 1748 1749 1750 1751
	sock_put(sk);
	return len;
}

void netlink_detachskb(struct sock *sk, struct sk_buff *skb)
{
	kfree_skb(skb);
	sock_put(sk);
}

1752
static struct sk_buff *netlink_trim(struct sk_buff *skb, gfp_t allocation)
L
Linus Torvalds 已提交
1753 1754 1755
{
	int delta;

1756
	WARN_ON(skb->sk != NULL);
1757 1758
	if (netlink_skb_is_mmaped(skb))
		return skb;
L
Linus Torvalds 已提交
1759

1760
	delta = skb->end - skb->tail;
1761
	if (is_vmalloc_addr(skb->head) || delta * 2 < skb->truesize)
L
Linus Torvalds 已提交
1762 1763 1764 1765 1766 1767
		return skb;

	if (skb_shared(skb)) {
		struct sk_buff *nskb = skb_clone(skb, allocation);
		if (!nskb)
			return skb;
1768
		consume_skb(skb);
L
Linus Torvalds 已提交
1769 1770 1771 1772 1773 1774 1775 1776 1777
		skb = nskb;
	}

	if (!pskb_expand_head(skb, 0, -delta, allocation))
		skb->truesize -= delta;

	return skb;
}

1778 1779
static int netlink_unicast_kernel(struct sock *sk, struct sk_buff *skb,
				  struct sock *ssk)
1780 1781 1782 1783 1784 1785 1786
{
	int ret;
	struct netlink_sock *nlk = nlk_sk(sk);

	ret = -ECONNREFUSED;
	if (nlk->netlink_rcv != NULL) {
		ret = skb->len;
1787
		netlink_skb_set_owner_r(skb, sk);
1788
		NETLINK_CB(skb).sk = ssk;
1789
		netlink_deliver_tap_kernel(sk, ssk, skb);
1790
		nlk->netlink_rcv(skb);
1791 1792 1793
		consume_skb(skb);
	} else {
		kfree_skb(skb);
1794 1795 1796 1797 1798 1799
	}
	sock_put(sk);
	return ret;
}

int netlink_unicast(struct sock *ssk, struct sk_buff *skb,
1800
		    u32 portid, int nonblock)
L
Linus Torvalds 已提交
1801 1802 1803 1804 1805 1806 1807 1808 1809
{
	struct sock *sk;
	int err;
	long timeo;

	skb = netlink_trim(skb, gfp_any());

	timeo = sock_sndtimeo(ssk, nonblock);
retry:
1810
	sk = netlink_getsockbyportid(ssk, portid);
L
Linus Torvalds 已提交
1811 1812 1813 1814
	if (IS_ERR(sk)) {
		kfree_skb(skb);
		return PTR_ERR(sk);
	}
1815
	if (netlink_is_kernel(sk))
1816
		return netlink_unicast_kernel(sk, skb, ssk);
1817

1818
	if (sk_filter(sk, skb)) {
W
Wang Chen 已提交
1819
		err = skb->len;
1820 1821 1822 1823 1824
		kfree_skb(skb);
		sock_put(sk);
		return err;
	}

1825
	err = netlink_attachskb(sk, skb, &timeo, ssk);
L
Linus Torvalds 已提交
1826 1827 1828 1829 1830
	if (err == 1)
		goto retry;
	if (err)
		return err;

1831
	return netlink_sendskb(sk, skb);
L
Linus Torvalds 已提交
1832
}
1833
EXPORT_SYMBOL(netlink_unicast);
L
Linus Torvalds 已提交
1834

1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853
struct sk_buff *netlink_alloc_skb(struct sock *ssk, unsigned int size,
				  u32 dst_portid, gfp_t gfp_mask)
{
#ifdef CONFIG_NETLINK_MMAP
	struct sock *sk = NULL;
	struct sk_buff *skb;
	struct netlink_ring *ring;
	struct nl_mmap_hdr *hdr;
	unsigned int maxlen;

	sk = netlink_getsockbyportid(ssk, dst_portid);
	if (IS_ERR(sk))
		goto out;

	ring = &nlk_sk(sk)->rx_ring;
	/* fast-path without atomic ops for common case: non-mmaped receiver */
	if (ring->pg_vec == NULL)
		goto out_put;

1854 1855 1856
	if (ring->frame_size - NL_MMAP_HDRLEN < size)
		goto out_put;

1857 1858 1859 1860 1861 1862 1863 1864 1865
	skb = alloc_skb_head(gfp_mask);
	if (skb == NULL)
		goto err1;

	spin_lock_bh(&sk->sk_receive_queue.lock);
	/* check again under lock */
	if (ring->pg_vec == NULL)
		goto out_free;

1866
	/* check again under lock */
1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885
	maxlen = ring->frame_size - NL_MMAP_HDRLEN;
	if (maxlen < size)
		goto out_free;

	netlink_forward_ring(ring);
	hdr = netlink_current_frame(ring, NL_MMAP_STATUS_UNUSED);
	if (hdr == NULL)
		goto err2;
	netlink_ring_setup_skb(skb, sk, ring, hdr);
	netlink_set_status(hdr, NL_MMAP_STATUS_RESERVED);
	atomic_inc(&ring->pending);
	netlink_increment_head(ring);

	spin_unlock_bh(&sk->sk_receive_queue.lock);
	return skb;

err2:
	kfree_skb(skb);
	spin_unlock_bh(&sk->sk_receive_queue.lock);
1886
	netlink_overrun(sk);
1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901
err1:
	sock_put(sk);
	return NULL;

out_free:
	kfree_skb(skb);
	spin_unlock_bh(&sk->sk_receive_queue.lock);
out_put:
	sock_put(sk);
out:
#endif
	return alloc_skb(size, gfp_mask);
}
EXPORT_SYMBOL_GPL(netlink_alloc_skb);

1902 1903 1904
int netlink_has_listeners(struct sock *sk, unsigned int group)
{
	int res = 0;
1905
	struct listeners *listeners;
1906

1907
	BUG_ON(!netlink_is_kernel(sk));
1908 1909 1910 1911

	rcu_read_lock();
	listeners = rcu_dereference(nl_table[sk->sk_protocol].listeners);

1912
	if (listeners && group - 1 < nl_table[sk->sk_protocol].groups)
1913
		res = test_bit(group - 1, listeners->masks);
1914 1915 1916

	rcu_read_unlock();

1917 1918 1919 1920
	return res;
}
EXPORT_SYMBOL_GPL(netlink_has_listeners);

1921
static int netlink_broadcast_deliver(struct sock *sk, struct sk_buff *skb)
L
Linus Torvalds 已提交
1922 1923 1924 1925
{
	struct netlink_sock *nlk = nlk_sk(sk);

	if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf &&
1926
	    !test_bit(NETLINK_S_CONGESTED, &nlk->state)) {
1927
		netlink_skb_set_owner_r(skb, sk);
1928
		__netlink_sendskb(sk, skb);
1929
		return atomic_read(&sk->sk_rmem_alloc) > (sk->sk_rcvbuf >> 1);
L
Linus Torvalds 已提交
1930 1931 1932 1933 1934 1935
	}
	return -1;
}

struct netlink_broadcast_data {
	struct sock *exclude_sk;
1936
	struct net *net;
1937
	u32 portid;
L
Linus Torvalds 已提交
1938 1939
	u32 group;
	int failure;
1940
	int delivery_failure;
L
Linus Torvalds 已提交
1941 1942
	int congested;
	int delivered;
A
Al Viro 已提交
1943
	gfp_t allocation;
L
Linus Torvalds 已提交
1944
	struct sk_buff *skb, *skb2;
1945 1946
	int (*tx_filter)(struct sock *dsk, struct sk_buff *skb, void *data);
	void *tx_data;
L
Linus Torvalds 已提交
1947 1948
};

1949 1950
static void do_one_broadcast(struct sock *sk,
				    struct netlink_broadcast_data *p)
L
Linus Torvalds 已提交
1951 1952 1953 1954 1955
{
	struct netlink_sock *nlk = nlk_sk(sk);
	int val;

	if (p->exclude_sk == sk)
1956
		return;
L
Linus Torvalds 已提交
1957

1958
	if (nlk->portid == p->portid || p->group - 1 >= nlk->ngroups ||
1959
	    !test_bit(p->group - 1, nlk->groups))
1960
		return;
L
Linus Torvalds 已提交
1961

1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972
	if (!net_eq(sock_net(sk), p->net)) {
		if (!(nlk->flags & NETLINK_F_LISTEN_ALL_NSID))
			return;

		if (!peernet_has_id(sock_net(sk), p->net))
			return;

		if (!file_ns_capable(sk->sk_socket->file, p->net->user_ns,
				     CAP_NET_BROADCAST))
			return;
	}
1973

L
Linus Torvalds 已提交
1974 1975
	if (p->failure) {
		netlink_overrun(sk);
1976
		return;
L
Linus Torvalds 已提交
1977 1978 1979 1980
	}

	sock_hold(sk);
	if (p->skb2 == NULL) {
1981
		if (skb_shared(p->skb)) {
L
Linus Torvalds 已提交
1982 1983
			p->skb2 = skb_clone(p->skb, p->allocation);
		} else {
1984 1985 1986 1987 1988 1989
			p->skb2 = skb_get(p->skb);
			/*
			 * skb ownership may have been set when
			 * delivered to a previous socket.
			 */
			skb_orphan(p->skb2);
L
Linus Torvalds 已提交
1990 1991 1992 1993 1994 1995
		}
	}
	if (p->skb2 == NULL) {
		netlink_overrun(sk);
		/* Clone failed. Notify ALL listeners. */
		p->failure = 1;
1996
		if (nlk->flags & NETLINK_F_BROADCAST_SEND_ERROR)
1997
			p->delivery_failure = 1;
1998 1999 2000
		goto out;
	}
	if (p->tx_filter && p->tx_filter(sk, p->skb2, p->tx_data)) {
2001 2002
		kfree_skb(p->skb2);
		p->skb2 = NULL;
2003 2004 2005
		goto out;
	}
	if (sk_filter(sk, p->skb2)) {
2006 2007
		kfree_skb(p->skb2);
		p->skb2 = NULL;
2008 2009 2010 2011 2012 2013
		goto out;
	}
	NETLINK_CB(p->skb2).nsid = peernet2id(sock_net(sk), p->net);
	NETLINK_CB(p->skb2).nsid_is_set = true;
	val = netlink_broadcast_deliver(sk, p->skb2);
	if (val < 0) {
L
Linus Torvalds 已提交
2014
		netlink_overrun(sk);
2015
		if (nlk->flags & NETLINK_F_BROADCAST_SEND_ERROR)
2016
			p->delivery_failure = 1;
L
Linus Torvalds 已提交
2017 2018 2019 2020 2021
	} else {
		p->congested |= val;
		p->delivered = 1;
		p->skb2 = NULL;
	}
2022
out:
L
Linus Torvalds 已提交
2023 2024 2025
	sock_put(sk);
}

2026
int netlink_broadcast_filtered(struct sock *ssk, struct sk_buff *skb, u32 portid,
2027 2028 2029
	u32 group, gfp_t allocation,
	int (*filter)(struct sock *dsk, struct sk_buff *skb, void *data),
	void *filter_data)
L
Linus Torvalds 已提交
2030
{
2031
	struct net *net = sock_net(ssk);
L
Linus Torvalds 已提交
2032 2033 2034 2035 2036 2037
	struct netlink_broadcast_data info;
	struct sock *sk;

	skb = netlink_trim(skb, allocation);

	info.exclude_sk = ssk;
2038
	info.net = net;
2039
	info.portid = portid;
L
Linus Torvalds 已提交
2040 2041
	info.group = group;
	info.failure = 0;
2042
	info.delivery_failure = 0;
L
Linus Torvalds 已提交
2043 2044 2045 2046 2047
	info.congested = 0;
	info.delivered = 0;
	info.allocation = allocation;
	info.skb = skb;
	info.skb2 = NULL;
2048 2049
	info.tx_filter = filter;
	info.tx_data = filter_data;
L
Linus Torvalds 已提交
2050 2051 2052 2053 2054

	/* While we sleep in clone, do not allow to change socket list */

	netlink_lock_table();

2055
	sk_for_each_bound(sk, &nl_table[ssk->sk_protocol].mc_list)
L
Linus Torvalds 已提交
2056 2057
		do_one_broadcast(sk, &info);

2058
	consume_skb(skb);
2059

L
Linus Torvalds 已提交
2060 2061
	netlink_unlock_table();

2062 2063
	if (info.delivery_failure) {
		kfree_skb(info.skb2);
2064
		return -ENOBUFS;
E
Eric Dumazet 已提交
2065 2066
	}
	consume_skb(info.skb2);
2067

L
Linus Torvalds 已提交
2068 2069 2070 2071 2072 2073 2074
	if (info.delivered) {
		if (info.congested && (allocation & __GFP_WAIT))
			yield();
		return 0;
	}
	return -ESRCH;
}
2075 2076
EXPORT_SYMBOL(netlink_broadcast_filtered);

2077
int netlink_broadcast(struct sock *ssk, struct sk_buff *skb, u32 portid,
2078 2079
		      u32 group, gfp_t allocation)
{
2080
	return netlink_broadcast_filtered(ssk, skb, portid, group, allocation,
2081 2082
		NULL, NULL);
}
2083
EXPORT_SYMBOL(netlink_broadcast);
L
Linus Torvalds 已提交
2084 2085 2086

struct netlink_set_err_data {
	struct sock *exclude_sk;
2087
	u32 portid;
L
Linus Torvalds 已提交
2088 2089 2090 2091
	u32 group;
	int code;
};

2092
static int do_one_set_err(struct sock *sk, struct netlink_set_err_data *p)
L
Linus Torvalds 已提交
2093 2094
{
	struct netlink_sock *nlk = nlk_sk(sk);
2095
	int ret = 0;
L
Linus Torvalds 已提交
2096 2097 2098 2099

	if (sk == p->exclude_sk)
		goto out;

O
Octavian Purdila 已提交
2100
	if (!net_eq(sock_net(sk), sock_net(p->exclude_sk)))
2101 2102
		goto out;

2103
	if (nlk->portid == p->portid || p->group - 1 >= nlk->ngroups ||
2104
	    !test_bit(p->group - 1, nlk->groups))
L
Linus Torvalds 已提交
2105 2106
		goto out;

2107
	if (p->code == ENOBUFS && nlk->flags & NETLINK_F_RECV_NO_ENOBUFS) {
2108 2109 2110 2111
		ret = 1;
		goto out;
	}

L
Linus Torvalds 已提交
2112 2113 2114
	sk->sk_err = p->code;
	sk->sk_error_report(sk);
out:
2115
	return ret;
L
Linus Torvalds 已提交
2116 2117
}

2118 2119 2120
/**
 * netlink_set_err - report error to broadcast listeners
 * @ssk: the kernel netlink socket, as returned by netlink_kernel_create()
2121
 * @portid: the PORTID of a process that we want to skip (if any)
2122
 * @group: the broadcast group that will notice the error
2123
 * @code: error code, must be negative (as usual in kernelspace)
2124 2125
 *
 * This function returns the number of broadcast listeners that have set the
2126
 * NETLINK_NO_ENOBUFS socket option.
2127
 */
2128
int netlink_set_err(struct sock *ssk, u32 portid, u32 group, int code)
L
Linus Torvalds 已提交
2129 2130 2131
{
	struct netlink_set_err_data info;
	struct sock *sk;
2132
	int ret = 0;
L
Linus Torvalds 已提交
2133 2134

	info.exclude_sk = ssk;
2135
	info.portid = portid;
L
Linus Torvalds 已提交
2136
	info.group = group;
2137 2138
	/* sk->sk_err wants a positive error value */
	info.code = -code;
L
Linus Torvalds 已提交
2139 2140 2141

	read_lock(&nl_table_lock);

2142
	sk_for_each_bound(sk, &nl_table[ssk->sk_protocol].mc_list)
2143
		ret += do_one_set_err(sk, &info);
L
Linus Torvalds 已提交
2144 2145

	read_unlock(&nl_table_lock);
2146
	return ret;
L
Linus Torvalds 已提交
2147
}
2148
EXPORT_SYMBOL(netlink_set_err);
L
Linus Torvalds 已提交
2149

2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166
/* must be called with netlink table grabbed */
static void netlink_update_socket_mc(struct netlink_sock *nlk,
				     unsigned int group,
				     int is_new)
{
	int old, new = !!is_new, subscriptions;

	old = test_bit(group - 1, nlk->groups);
	subscriptions = nlk->subscriptions - old + new;
	if (new)
		__set_bit(group - 1, nlk->groups);
	else
		__clear_bit(group - 1, nlk->groups);
	netlink_update_subscriptions(&nlk->sk, subscriptions);
	netlink_update_listeners(&nlk->sk);
}

2167
static int netlink_setsockopt(struct socket *sock, int level, int optname,
2168
			      char __user *optval, unsigned int optlen)
2169 2170 2171
{
	struct sock *sk = sock->sk;
	struct netlink_sock *nlk = nlk_sk(sk);
2172 2173
	unsigned int val = 0;
	int err;
2174 2175 2176 2177

	if (level != SOL_NETLINK)
		return -ENOPROTOOPT;

2178 2179
	if (optname != NETLINK_RX_RING && optname != NETLINK_TX_RING &&
	    optlen >= sizeof(int) &&
2180
	    get_user(val, (unsigned int __user *)optval))
2181 2182 2183 2184 2185
		return -EFAULT;

	switch (optname) {
	case NETLINK_PKTINFO:
		if (val)
2186
			nlk->flags |= NETLINK_F_RECV_PKTINFO;
2187
		else
2188
			nlk->flags &= ~NETLINK_F_RECV_PKTINFO;
2189 2190 2191 2192
		err = 0;
		break;
	case NETLINK_ADD_MEMBERSHIP:
	case NETLINK_DROP_MEMBERSHIP: {
2193
		if (!netlink_allowed(sock, NL_CFG_F_NONROOT_RECV))
2194
			return -EPERM;
2195 2196 2197
		err = netlink_realloc_groups(sk);
		if (err)
			return err;
2198 2199
		if (!val || val - 1 >= nlk->ngroups)
			return -EINVAL;
2200
		if (optname == NETLINK_ADD_MEMBERSHIP && nlk->netlink_bind) {
2201
			err = nlk->netlink_bind(sock_net(sk), val);
2202 2203 2204
			if (err)
				return err;
		}
2205
		netlink_table_grab();
2206 2207
		netlink_update_socket_mc(nlk, val,
					 optname == NETLINK_ADD_MEMBERSHIP);
2208
		netlink_table_ungrab();
2209
		if (optname == NETLINK_DROP_MEMBERSHIP && nlk->netlink_unbind)
2210
			nlk->netlink_unbind(sock_net(sk), val);
2211

2212 2213 2214
		err = 0;
		break;
	}
2215 2216
	case NETLINK_BROADCAST_ERROR:
		if (val)
2217
			nlk->flags |= NETLINK_F_BROADCAST_SEND_ERROR;
2218
		else
2219
			nlk->flags &= ~NETLINK_F_BROADCAST_SEND_ERROR;
2220 2221
		err = 0;
		break;
2222 2223
	case NETLINK_NO_ENOBUFS:
		if (val) {
2224 2225
			nlk->flags |= NETLINK_F_RECV_NO_ENOBUFS;
			clear_bit(NETLINK_S_CONGESTED, &nlk->state);
2226
			wake_up_interruptible(&nlk->wait);
E
Eric Dumazet 已提交
2227
		} else {
2228
			nlk->flags &= ~NETLINK_F_RECV_NO_ENOBUFS;
E
Eric Dumazet 已提交
2229
		}
2230 2231
		err = 0;
		break;
2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245
#ifdef CONFIG_NETLINK_MMAP
	case NETLINK_RX_RING:
	case NETLINK_TX_RING: {
		struct nl_mmap_req req;

		/* Rings might consume more memory than queue limits, require
		 * CAP_NET_ADMIN.
		 */
		if (!capable(CAP_NET_ADMIN))
			return -EPERM;
		if (optlen < sizeof(req))
			return -EINVAL;
		if (copy_from_user(&req, optval, sizeof(req)))
			return -EFAULT;
2246
		err = netlink_set_ring(sk, &req,
2247 2248 2249 2250
				       optname == NETLINK_TX_RING);
		break;
	}
#endif /* CONFIG_NETLINK_MMAP */
2251 2252 2253 2254 2255 2256 2257 2258 2259 2260
	case NETLINK_LISTEN_ALL_NSID:
		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_BROADCAST))
			return -EPERM;

		if (val)
			nlk->flags |= NETLINK_F_LISTEN_ALL_NSID;
		else
			nlk->flags &= ~NETLINK_F_LISTEN_ALL_NSID;
		err = 0;
		break;
2261 2262 2263 2264 2265 2266 2267
	default:
		err = -ENOPROTOOPT;
	}
	return err;
}

static int netlink_getsockopt(struct socket *sock, int level, int optname,
2268
			      char __user *optval, int __user *optlen)
2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286
{
	struct sock *sk = sock->sk;
	struct netlink_sock *nlk = nlk_sk(sk);
	int len, val, err;

	if (level != SOL_NETLINK)
		return -ENOPROTOOPT;

	if (get_user(len, optlen))
		return -EFAULT;
	if (len < 0)
		return -EINVAL;

	switch (optname) {
	case NETLINK_PKTINFO:
		if (len < sizeof(int))
			return -EINVAL;
		len = sizeof(int);
2287
		val = nlk->flags & NETLINK_F_RECV_PKTINFO ? 1 : 0;
H
Heiko Carstens 已提交
2288 2289 2290
		if (put_user(len, optlen) ||
		    put_user(val, optval))
			return -EFAULT;
2291 2292
		err = 0;
		break;
2293 2294 2295 2296
	case NETLINK_BROADCAST_ERROR:
		if (len < sizeof(int))
			return -EINVAL;
		len = sizeof(int);
2297
		val = nlk->flags & NETLINK_F_BROADCAST_SEND_ERROR ? 1 : 0;
2298 2299 2300 2301 2302
		if (put_user(len, optlen) ||
		    put_user(val, optval))
			return -EFAULT;
		err = 0;
		break;
2303 2304 2305 2306
	case NETLINK_NO_ENOBUFS:
		if (len < sizeof(int))
			return -EINVAL;
		len = sizeof(int);
2307
		val = nlk->flags & NETLINK_F_RECV_NO_ENOBUFS ? 1 : 0;
2308 2309 2310 2311 2312
		if (put_user(len, optlen) ||
		    put_user(val, optval))
			return -EFAULT;
		err = 0;
		break;
2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334
	case NETLINK_LIST_MEMBERSHIPS: {
		int pos, idx, shift;

		err = 0;
		netlink_table_grab();
		for (pos = 0; pos * 8 < nlk->ngroups; pos += sizeof(u32)) {
			if (len - pos < sizeof(u32))
				break;

			idx = pos / sizeof(unsigned long);
			shift = (pos % sizeof(unsigned long)) * 8;
			if (put_user((u32)(nlk->groups[idx] >> shift),
				     (u32 __user *)(optval + pos))) {
				err = -EFAULT;
				break;
			}
		}
		if (put_user(ALIGN(nlk->ngroups / 8, sizeof(u32)), optlen))
			err = -EFAULT;
		netlink_table_ungrab();
		break;
	}
2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348
	default:
		err = -ENOPROTOOPT;
	}
	return err;
}

static void netlink_cmsg_recv_pktinfo(struct msghdr *msg, struct sk_buff *skb)
{
	struct nl_pktinfo info;

	info.group = NETLINK_CB(skb).dst_group;
	put_cmsg(msg, SOL_NETLINK, NETLINK_PKTINFO, sizeof(info), &info);
}

2349 2350 2351 2352 2353 2354 2355 2356 2357 2358
static void netlink_cmsg_listen_all_nsid(struct sock *sk, struct msghdr *msg,
					 struct sk_buff *skb)
{
	if (!NETLINK_CB(skb).nsid_is_set)
		return;

	put_cmsg(msg, SOL_NETLINK, NETLINK_LISTEN_ALL_NSID, sizeof(int),
		 &NETLINK_CB(skb).nsid);
}

2359
static int netlink_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
L
Linus Torvalds 已提交
2360 2361 2362
{
	struct sock *sk = sock->sk;
	struct netlink_sock *nlk = nlk_sk(sk);
2363
	DECLARE_SOCKADDR(struct sockaddr_nl *, addr, msg->msg_name);
2364
	u32 dst_portid;
2365
	u32 dst_group;
L
Linus Torvalds 已提交
2366 2367 2368
	struct sk_buff *skb;
	int err;
	struct scm_cookie scm;
2369
	u32 netlink_skb_flags = 0;
L
Linus Torvalds 已提交
2370 2371 2372 2373

	if (msg->msg_flags&MSG_OOB)
		return -EOPNOTSUPP;

C
Christoph Hellwig 已提交
2374
	err = scm_send(sock, msg, &scm, true);
L
Linus Torvalds 已提交
2375 2376 2377 2378
	if (err < 0)
		return err;

	if (msg->msg_namelen) {
2379
		err = -EINVAL;
L
Linus Torvalds 已提交
2380
		if (addr->nl_family != AF_NETLINK)
2381
			goto out;
2382
		dst_portid = addr->nl_pid;
2383
		dst_group = ffs(addr->nl_groups);
2384
		err =  -EPERM;
2385
		if ((dst_group || dst_portid) &&
2386
		    !netlink_allowed(sock, NL_CFG_F_NONROOT_SEND))
2387
			goto out;
2388
		netlink_skb_flags |= NETLINK_SKB_DST;
L
Linus Torvalds 已提交
2389
	} else {
2390
		dst_portid = nlk->dst_portid;
2391
		dst_group = nlk->dst_group;
L
Linus Torvalds 已提交
2392 2393
	}

2394
	if (!nlk->portid) {
L
Linus Torvalds 已提交
2395 2396 2397 2398 2399
		err = netlink_autobind(sock);
		if (err)
			goto out;
	}

2400 2401 2402
	/* It's a really convoluted way for userland to ask for mmaped
	 * sendmsg(), but that's what we've got...
	 */
2403
	if (netlink_tx_is_mmaped(sk) &&
2404 2405
	    msg->msg_iter.type == ITER_IOVEC &&
	    msg->msg_iter.nr_segs == 1 &&
A
Al Viro 已提交
2406
	    msg->msg_iter.iov->iov_base == NULL) {
2407
		err = netlink_mmap_sendmsg(sk, msg, dst_portid, dst_group,
C
Christoph Hellwig 已提交
2408
					   &scm);
2409 2410 2411
		goto out;
	}

L
Linus Torvalds 已提交
2412 2413 2414 2415
	err = -EMSGSIZE;
	if (len > sk->sk_sndbuf - 32)
		goto out;
	err = -ENOBUFS;
2416
	skb = netlink_alloc_large_skb(len, dst_group);
2417
	if (skb == NULL)
L
Linus Torvalds 已提交
2418 2419
		goto out;

2420
	NETLINK_CB(skb).portid	= nlk->portid;
2421
	NETLINK_CB(skb).dst_group = dst_group;
C
Christoph Hellwig 已提交
2422
	NETLINK_CB(skb).creds	= scm.creds;
2423
	NETLINK_CB(skb).flags	= netlink_skb_flags;
L
Linus Torvalds 已提交
2424 2425

	err = -EFAULT;
A
Al Viro 已提交
2426
	if (memcpy_from_msg(skb_put(skb, len), msg, len)) {
L
Linus Torvalds 已提交
2427 2428 2429 2430 2431 2432 2433 2434 2435 2436
		kfree_skb(skb);
		goto out;
	}

	err = security_netlink_send(sk, skb);
	if (err) {
		kfree_skb(skb);
		goto out;
	}

2437
	if (dst_group) {
L
Linus Torvalds 已提交
2438
		atomic_inc(&skb->users);
2439
		netlink_broadcast(sk, skb, dst_portid, dst_group, GFP_KERNEL);
L
Linus Torvalds 已提交
2440
	}
2441
	err = netlink_unicast(sk, skb, dst_portid, msg->msg_flags&MSG_DONTWAIT);
L
Linus Torvalds 已提交
2442 2443

out:
C
Christoph Hellwig 已提交
2444
	scm_destroy(&scm);
L
Linus Torvalds 已提交
2445 2446 2447
	return err;
}

2448
static int netlink_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
L
Linus Torvalds 已提交
2449 2450 2451 2452 2453 2454 2455
			   int flags)
{
	struct scm_cookie scm;
	struct sock *sk = sock->sk;
	struct netlink_sock *nlk = nlk_sk(sk);
	int noblock = flags&MSG_DONTWAIT;
	size_t copied;
J
Johannes Berg 已提交
2456
	struct sk_buff *skb, *data_skb;
2457
	int err, ret;
L
Linus Torvalds 已提交
2458 2459 2460 2461 2462 2463

	if (flags&MSG_OOB)
		return -EOPNOTSUPP;

	copied = 0;

2464 2465
	skb = skb_recv_datagram(sk, flags, noblock, &err);
	if (skb == NULL)
L
Linus Torvalds 已提交
2466 2467
		goto out;

J
Johannes Berg 已提交
2468 2469
	data_skb = skb;

2470 2471 2472
#ifdef CONFIG_COMPAT_NETLINK_MESSAGES
	if (unlikely(skb_shinfo(skb)->frag_list)) {
		/*
J
Johannes Berg 已提交
2473 2474 2475
		 * If this skb has a frag_list, then here that means that we
		 * will have to use the frag_list skb's data for compat tasks
		 * and the regular skb's data for normal (non-compat) tasks.
2476
		 *
J
Johannes Berg 已提交
2477 2478 2479 2480
		 * If we need to send the compat skb, assign it to the
		 * 'data_skb' variable so that it will be used below for data
		 * copying. We keep 'skb' for everything else, including
		 * freeing both later.
2481
		 */
J
Johannes Berg 已提交
2482 2483
		if (flags & MSG_CMSG_COMPAT)
			data_skb = skb_shinfo(skb)->frag_list;
2484 2485 2486
	}
#endif

E
Eric Dumazet 已提交
2487 2488 2489 2490 2491
	/* Record the max length of recvmsg() calls for future allocations */
	nlk->max_recvmsg_len = max(nlk->max_recvmsg_len, len);
	nlk->max_recvmsg_len = min_t(size_t, nlk->max_recvmsg_len,
				     16384);

J
Johannes Berg 已提交
2492
	copied = data_skb->len;
L
Linus Torvalds 已提交
2493 2494 2495 2496 2497
	if (len < copied) {
		msg->msg_flags |= MSG_TRUNC;
		copied = len;
	}

J
Johannes Berg 已提交
2498
	skb_reset_transport_header(data_skb);
2499
	err = skb_copy_datagram_msg(data_skb, 0, msg, copied);
L
Linus Torvalds 已提交
2500 2501

	if (msg->msg_name) {
2502
		DECLARE_SOCKADDR(struct sockaddr_nl *, addr, msg->msg_name);
L
Linus Torvalds 已提交
2503 2504
		addr->nl_family = AF_NETLINK;
		addr->nl_pad    = 0;
2505
		addr->nl_pid	= NETLINK_CB(skb).portid;
2506
		addr->nl_groups	= netlink_group_mask(NETLINK_CB(skb).dst_group);
L
Linus Torvalds 已提交
2507 2508 2509
		msg->msg_namelen = sizeof(*addr);
	}

2510
	if (nlk->flags & NETLINK_F_RECV_PKTINFO)
2511
		netlink_cmsg_recv_pktinfo(msg, skb);
2512 2513
	if (nlk->flags & NETLINK_F_LISTEN_ALL_NSID)
		netlink_cmsg_listen_all_nsid(sk, msg, skb);
2514

C
Christoph Hellwig 已提交
2515 2516
	memset(&scm, 0, sizeof(scm));
	scm.creds = *NETLINK_CREDS(skb);
2517
	if (flags & MSG_TRUNC)
J
Johannes Berg 已提交
2518
		copied = data_skb->len;
2519

L
Linus Torvalds 已提交
2520 2521
	skb_free_datagram(sk, skb);

2522 2523
	if (nlk->cb_running &&
	    atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf / 2) {
2524 2525
		ret = netlink_dump(sk);
		if (ret) {
2526
			sk->sk_err = -ret;
2527 2528 2529
			sk->sk_error_report(sk);
		}
	}
L
Linus Torvalds 已提交
2530

C
Christoph Hellwig 已提交
2531
	scm_recv(sock, msg, &scm, flags);
L
Linus Torvalds 已提交
2532 2533 2534 2535 2536
out:
	netlink_rcv_wake(sk);
	return err ? : copied;
}

2537
static void netlink_data_ready(struct sock *sk)
L
Linus Torvalds 已提交
2538
{
2539
	BUG();
L
Linus Torvalds 已提交
2540 2541 2542
}

/*
2543
 *	We export these functions to other modules. They provide a
L
Linus Torvalds 已提交
2544 2545 2546 2547 2548
 *	complete set of kernel non-blocking support for message
 *	queueing.
 */

struct sock *
2549 2550
__netlink_kernel_create(struct net *net, int unit, struct module *module,
			struct netlink_kernel_cfg *cfg)
L
Linus Torvalds 已提交
2551 2552 2553
{
	struct socket *sock;
	struct sock *sk;
2554
	struct netlink_sock *nlk;
2555
	struct listeners *listeners = NULL;
2556 2557
	struct mutex *cb_mutex = cfg ? cfg->cb_mutex : NULL;
	unsigned int groups;
L
Linus Torvalds 已提交
2558

2559
	BUG_ON(!nl_table);
L
Linus Torvalds 已提交
2560

2561
	if (unit < 0 || unit >= MAX_LINKS)
L
Linus Torvalds 已提交
2562 2563 2564 2565
		return NULL;

	if (sock_create_lite(PF_NETLINK, SOCK_DGRAM, unit, &sock))
		return NULL;
2566 2567

	if (__netlink_create(net, sock, cb_mutex, unit, 1) < 0)
2568 2569 2570
		goto out_sock_release_nosk;

	sk = sock->sk;
2571

2572
	if (!cfg || cfg->groups < 32)
2573
		groups = 32;
2574 2575
	else
		groups = cfg->groups;
2576

2577
	listeners = kzalloc(sizeof(*listeners) + NLGRPSZ(groups), GFP_KERNEL);
2578 2579 2580
	if (!listeners)
		goto out_sock_release;

L
Linus Torvalds 已提交
2581
	sk->sk_data_ready = netlink_data_ready;
2582 2583
	if (cfg && cfg->input)
		nlk_sk(sk)->netlink_rcv = cfg->input;
L
Linus Torvalds 已提交
2584

2585
	if (netlink_insert(sk, 0))
2586
		goto out_sock_release;
2587

2588
	nlk = nlk_sk(sk);
2589
	nlk->flags |= NETLINK_F_KERNEL_SOCKET;
2590 2591

	netlink_table_grab();
2592 2593
	if (!nl_table[unit].registered) {
		nl_table[unit].groups = groups;
2594
		rcu_assign_pointer(nl_table[unit].listeners, listeners);
2595 2596
		nl_table[unit].cb_mutex = cb_mutex;
		nl_table[unit].module = module;
2597 2598
		if (cfg) {
			nl_table[unit].bind = cfg->bind;
2599
			nl_table[unit].unbind = cfg->unbind;
2600
			nl_table[unit].flags = cfg->flags;
2601 2602
			if (cfg->compare)
				nl_table[unit].compare = cfg->compare;
2603
		}
2604
		nl_table[unit].registered = 1;
2605 2606
	} else {
		kfree(listeners);
2607
		nl_table[unit].registered++;
2608
	}
2609
	netlink_table_ungrab();
2610 2611
	return sk;

2612
out_sock_release:
2613
	kfree(listeners);
2614
	netlink_kernel_release(sk);
2615 2616 2617
	return NULL;

out_sock_release_nosk:
2618
	sock_release(sock);
2619
	return NULL;
L
Linus Torvalds 已提交
2620
}
2621
EXPORT_SYMBOL(__netlink_kernel_create);
2622 2623 2624 2625

void
netlink_kernel_release(struct sock *sk)
{
2626 2627 2628 2629
	if (sk == NULL || sk->sk_socket == NULL)
		return;

	sock_release(sk->sk_socket);
2630 2631 2632
}
EXPORT_SYMBOL(netlink_kernel_release);

2633
int __netlink_change_ngroups(struct sock *sk, unsigned int groups)
2634
{
2635
	struct listeners *new, *old;
2636 2637 2638 2639 2640 2641
	struct netlink_table *tbl = &nl_table[sk->sk_protocol];

	if (groups < 32)
		groups = 32;

	if (NLGRPSZ(tbl->groups) < NLGRPSZ(groups)) {
2642 2643
		new = kzalloc(sizeof(*new) + NLGRPSZ(groups), GFP_ATOMIC);
		if (!new)
2644
			return -ENOMEM;
2645
		old = nl_deref_protected(tbl->listeners);
2646 2647 2648
		memcpy(new->masks, old->masks, NLGRPSZ(tbl->groups));
		rcu_assign_pointer(tbl->listeners, new);

2649
		kfree_rcu(old, rcu);
2650 2651 2652
	}
	tbl->groups = groups;

2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673
	return 0;
}

/**
 * netlink_change_ngroups - change number of multicast groups
 *
 * This changes the number of multicast groups that are available
 * on a certain netlink family. Note that it is not possible to
 * change the number of groups to below 32. Also note that it does
 * not implicitly call netlink_clear_multicast_users() when the
 * number of groups is reduced.
 *
 * @sk: The kernel netlink socket, as returned by netlink_kernel_create().
 * @groups: The new number of groups.
 */
int netlink_change_ngroups(struct sock *sk, unsigned int groups)
{
	int err;

	netlink_table_grab();
	err = __netlink_change_ngroups(sk, groups);
2674
	netlink_table_ungrab();
2675

2676 2677 2678
	return err;
}

2679 2680 2681 2682 2683
void __netlink_clear_multicast_users(struct sock *ksk, unsigned int group)
{
	struct sock *sk;
	struct netlink_table *tbl = &nl_table[ksk->sk_protocol];

2684
	sk_for_each_bound(sk, &tbl->mc_list)
2685 2686 2687
		netlink_update_socket_mc(nlk_sk(sk), group, 0);
}

2688
struct nlmsghdr *
2689
__nlmsg_put(struct sk_buff *skb, u32 portid, u32 seq, int type, int len, int flags)
2690 2691
{
	struct nlmsghdr *nlh;
2692
	int size = nlmsg_msg_size(len);
2693

2694
	nlh = (struct nlmsghdr *)skb_put(skb, NLMSG_ALIGN(size));
2695 2696 2697
	nlh->nlmsg_type = type;
	nlh->nlmsg_len = size;
	nlh->nlmsg_flags = flags;
2698
	nlh->nlmsg_pid = portid;
2699 2700
	nlh->nlmsg_seq = seq;
	if (!__builtin_constant_p(size) || NLMSG_ALIGN(size) - size != 0)
2701
		memset(nlmsg_data(nlh) + len, 0, NLMSG_ALIGN(size) - size);
2702 2703 2704 2705
	return nlh;
}
EXPORT_SYMBOL(__nlmsg_put);

L
Linus Torvalds 已提交
2706 2707 2708 2709 2710 2711 2712 2713 2714
/*
 * It looks a bit ugly.
 * It would be better to create kernel thread.
 */

static int netlink_dump(struct sock *sk)
{
	struct netlink_sock *nlk = nlk_sk(sk);
	struct netlink_callback *cb;
2715
	struct sk_buff *skb = NULL;
L
Linus Torvalds 已提交
2716
	struct nlmsghdr *nlh;
2717
	int len, err = -ENOBUFS;
2718
	int alloc_size;
L
Linus Torvalds 已提交
2719

2720
	mutex_lock(nlk->cb_mutex);
2721
	if (!nlk->cb_running) {
2722 2723
		err = -EINVAL;
		goto errout_skb;
L
Linus Torvalds 已提交
2724 2725
	}

2726
	cb = &nlk->cb;
2727 2728
	alloc_size = max_t(int, cb->min_dump_alloc, NLMSG_GOODSIZE);

2729 2730 2731
	if (!netlink_rx_is_mmaped(sk) &&
	    atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
		goto errout_skb;
E
Eric Dumazet 已提交
2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752

	/* NLMSG_GOODSIZE is small to avoid high order allocations being
	 * required, but it makes sense to _attempt_ a 16K bytes allocation
	 * to reduce number of system calls on dump operations, if user
	 * ever provided a big enough buffer.
	 */
	if (alloc_size < nlk->max_recvmsg_len) {
		skb = netlink_alloc_skb(sk,
					nlk->max_recvmsg_len,
					nlk->portid,
					GFP_KERNEL |
					__GFP_NOWARN |
					__GFP_NORETRY);
		/* available room should be exact amount to avoid MSG_TRUNC */
		if (skb)
			skb_reserve(skb, skb_tailroom(skb) -
					 nlk->max_recvmsg_len);
	}
	if (!skb)
		skb = netlink_alloc_skb(sk, alloc_size, nlk->portid,
					GFP_KERNEL);
2753
	if (!skb)
2754
		goto errout_skb;
2755
	netlink_skb_set_owner_r(skb, sk);
2756

L
Linus Torvalds 已提交
2757 2758 2759
	len = cb->dump(skb, cb);

	if (len > 0) {
2760
		mutex_unlock(nlk->cb_mutex);
2761 2762 2763

		if (sk_filter(sk, skb))
			kfree_skb(skb);
2764 2765
		else
			__netlink_sendskb(sk, skb);
L
Linus Torvalds 已提交
2766 2767 2768
		return 0;
	}

2769 2770 2771 2772
	nlh = nlmsg_put_answer(skb, cb, NLMSG_DONE, sizeof(len), NLM_F_MULTI);
	if (!nlh)
		goto errout_skb;

2773 2774
	nl_dump_check_consistent(cb, nlh);

2775 2776
	memcpy(nlmsg_data(nlh), &len, sizeof(len));

2777 2778
	if (sk_filter(sk, skb))
		kfree_skb(skb);
2779 2780
	else
		__netlink_sendskb(sk, skb);
L
Linus Torvalds 已提交
2781

2782 2783
	if (cb->done)
		cb->done(cb);
L
Linus Torvalds 已提交
2784

2785 2786
	nlk->cb_running = false;
	mutex_unlock(nlk->cb_mutex);
2787
	module_put(cb->module);
2788
	consume_skb(cb->skb);
L
Linus Torvalds 已提交
2789
	return 0;
2790

2791
errout_skb:
2792
	mutex_unlock(nlk->cb_mutex);
2793 2794
	kfree_skb(skb);
	return err;
L
Linus Torvalds 已提交
2795 2796
}

2797 2798 2799
int __netlink_dump_start(struct sock *ssk, struct sk_buff *skb,
			 const struct nlmsghdr *nlh,
			 struct netlink_dump_control *control)
L
Linus Torvalds 已提交
2800 2801 2802 2803
{
	struct netlink_callback *cb;
	struct sock *sk;
	struct netlink_sock *nlk;
2804
	int ret;
L
Linus Torvalds 已提交
2805

2806 2807 2808 2809 2810 2811
	/* Memory mapped dump requests need to be copied to avoid looping
	 * on the pending state in netlink_mmap_sendmsg() while the CB hold
	 * a reference to the skb.
	 */
	if (netlink_skb_is_mmaped(skb)) {
		skb = skb_copy(skb, GFP_KERNEL);
2812
		if (skb == NULL)
2813 2814 2815 2816
			return -ENOBUFS;
	} else
		atomic_inc(&skb->users);

2817
	sk = netlink_lookup(sock_net(ssk), ssk->sk_protocol, NETLINK_CB(skb).portid);
L
Linus Torvalds 已提交
2818
	if (sk == NULL) {
2819 2820
		ret = -ECONNREFUSED;
		goto error_free;
L
Linus Torvalds 已提交
2821
	}
2822

2823
	nlk = nlk_sk(sk);
2824
	mutex_lock(nlk->cb_mutex);
2825
	/* A dump is in progress... */
2826
	if (nlk->cb_running) {
2827
		ret = -EBUSY;
2828
		goto error_unlock;
L
Linus Torvalds 已提交
2829
	}
2830
	/* add reference of module which cb->dump belongs to */
2831
	if (!try_module_get(control->module)) {
2832
		ret = -EPROTONOSUPPORT;
2833
		goto error_unlock;
2834 2835
	}

2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847
	cb = &nlk->cb;
	memset(cb, 0, sizeof(*cb));
	cb->dump = control->dump;
	cb->done = control->done;
	cb->nlh = nlh;
	cb->data = control->data;
	cb->module = control->module;
	cb->min_dump_alloc = control->min_dump_alloc;
	cb->skb = skb;

	nlk->cb_running = true;

2848
	mutex_unlock(nlk->cb_mutex);
L
Linus Torvalds 已提交
2849

2850
	ret = netlink_dump(sk);
L
Linus Torvalds 已提交
2851
	sock_put(sk);
2852

2853 2854 2855
	if (ret)
		return ret;

2856 2857 2858 2859
	/* We successfully started a dump, by returning -EINTR we
	 * signal not to send ACK even if it was requested.
	 */
	return -EINTR;
2860 2861 2862 2863 2864 2865 2866

error_unlock:
	sock_put(sk);
	mutex_unlock(nlk->cb_mutex);
error_free:
	kfree_skb(skb);
	return ret;
L
Linus Torvalds 已提交
2867
}
2868
EXPORT_SYMBOL(__netlink_dump_start);
L
Linus Torvalds 已提交
2869 2870 2871 2872 2873 2874

void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err)
{
	struct sk_buff *skb;
	struct nlmsghdr *rep;
	struct nlmsgerr *errmsg;
2875
	size_t payload = sizeof(*errmsg);
L
Linus Torvalds 已提交
2876

2877 2878 2879
	/* error messages get the original request appened */
	if (err)
		payload += nlmsg_len(nlh);
L
Linus Torvalds 已提交
2880

2881 2882
	skb = netlink_alloc_skb(in_skb->sk, nlmsg_total_size(payload),
				NETLINK_CB(in_skb).portid, GFP_KERNEL);
L
Linus Torvalds 已提交
2883 2884 2885
	if (!skb) {
		struct sock *sk;

2886
		sk = netlink_lookup(sock_net(in_skb->sk),
2887
				    in_skb->sk->sk_protocol,
2888
				    NETLINK_CB(in_skb).portid);
L
Linus Torvalds 已提交
2889 2890 2891 2892 2893 2894 2895 2896
		if (sk) {
			sk->sk_err = ENOBUFS;
			sk->sk_error_report(sk);
			sock_put(sk);
		}
		return;
	}

2897
	rep = __nlmsg_put(skb, NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
2898
			  NLMSG_ERROR, payload, 0);
2899
	errmsg = nlmsg_data(rep);
L
Linus Torvalds 已提交
2900
	errmsg->error = err;
2901
	memcpy(&errmsg->msg, nlh, err ? nlh->nlmsg_len : sizeof(*nlh));
2902
	netlink_unicast(in_skb->sk, skb, NETLINK_CB(in_skb).portid, MSG_DONTWAIT);
L
Linus Torvalds 已提交
2903
}
2904
EXPORT_SYMBOL(netlink_ack);
L
Linus Torvalds 已提交
2905

2906
int netlink_rcv_skb(struct sk_buff *skb, int (*cb)(struct sk_buff *,
2907
						     struct nlmsghdr *))
2908 2909 2910 2911 2912
{
	struct nlmsghdr *nlh;
	int err;

	while (skb->len >= nlmsg_total_size(0)) {
2913 2914
		int msglen;

2915
		nlh = nlmsg_hdr(skb);
2916
		err = 0;
2917

2918
		if (nlh->nlmsg_len < NLMSG_HDRLEN || skb->len < nlh->nlmsg_len)
2919 2920
			return 0;

2921 2922
		/* Only requests are handled by the kernel */
		if (!(nlh->nlmsg_flags & NLM_F_REQUEST))
2923
			goto ack;
2924 2925 2926

		/* Skip control messages */
		if (nlh->nlmsg_type < NLMSG_MIN_TYPE)
2927
			goto ack;
2928

2929
		err = cb(skb, nlh);
2930 2931 2932 2933
		if (err == -EINTR)
			goto skip;

ack:
2934
		if (nlh->nlmsg_flags & NLM_F_ACK || err)
2935 2936
			netlink_ack(skb, nlh, err);

2937
skip:
2938
		msglen = NLMSG_ALIGN(nlh->nlmsg_len);
2939 2940 2941
		if (msglen > skb->len)
			msglen = skb->len;
		skb_pull(skb, msglen);
2942 2943 2944 2945
	}

	return 0;
}
2946
EXPORT_SYMBOL(netlink_rcv_skb);
2947

2948 2949 2950 2951
/**
 * nlmsg_notify - send a notification netlink message
 * @sk: netlink socket to use
 * @skb: notification message
2952
 * @portid: destination netlink portid for reports or 0
2953 2954 2955 2956
 * @group: destination multicast group or 0
 * @report: 1 to report back, 0 to disable
 * @flags: allocation flags
 */
2957
int nlmsg_notify(struct sock *sk, struct sk_buff *skb, u32 portid,
2958 2959 2960 2961 2962
		 unsigned int group, int report, gfp_t flags)
{
	int err = 0;

	if (group) {
2963
		int exclude_portid = 0;
2964 2965 2966

		if (report) {
			atomic_inc(&skb->users);
2967
			exclude_portid = portid;
2968 2969
		}

2970 2971
		/* errors reported via destination sk->sk_err, but propagate
		 * delivery errors if NETLINK_BROADCAST_ERROR flag is set */
2972
		err = nlmsg_multicast(sk, skb, exclude_portid, group, flags);
2973 2974
	}

2975 2976 2977
	if (report) {
		int err2;

2978
		err2 = nlmsg_unicast(sk, skb, portid);
2979 2980 2981
		if (!err || err == -ESRCH)
			err = err2;
	}
2982 2983 2984

	return err;
}
2985
EXPORT_SYMBOL(nlmsg_notify);
2986

L
Linus Torvalds 已提交
2987 2988
#ifdef CONFIG_PROC_FS
struct nl_seq_iter {
2989
	struct seq_net_private p;
2990
	struct rhashtable_iter hti;
L
Linus Torvalds 已提交
2991 2992 2993
	int link;
};

2994
static int netlink_walk_start(struct nl_seq_iter *iter)
L
Linus Torvalds 已提交
2995
{
2996
	int err;
L
Linus Torvalds 已提交
2997

2998 2999 3000 3001
	err = rhashtable_walk_init(&nl_table[iter->link].hash, &iter->hti);
	if (err) {
		iter->link = MAX_LINKS;
		return err;
L
Linus Torvalds 已提交
3002
	}
3003 3004 3005

	err = rhashtable_walk_start(&iter->hti);
	return err == -EAGAIN ? 0 : err;
L
Linus Torvalds 已提交
3006 3007
}

3008
static void netlink_walk_stop(struct nl_seq_iter *iter)
L
Linus Torvalds 已提交
3009
{
3010 3011
	rhashtable_walk_stop(&iter->hti);
	rhashtable_walk_exit(&iter->hti);
L
Linus Torvalds 已提交
3012 3013
}

3014
static void *__netlink_seq_next(struct seq_file *seq)
L
Linus Torvalds 已提交
3015
{
3016
	struct nl_seq_iter *iter = seq->private;
3017
	struct netlink_sock *nlk;
L
Linus Torvalds 已提交
3018

3019 3020 3021
	do {
		for (;;) {
			int err;
L
Linus Torvalds 已提交
3022

3023
			nlk = rhashtable_walk_next(&iter->hti);
3024

3025 3026 3027
			if (IS_ERR(nlk)) {
				if (PTR_ERR(nlk) == -EAGAIN)
					continue;
3028

3029 3030
				return nlk;
			}
L
Linus Torvalds 已提交
3031

3032 3033
			if (nlk)
				break;
L
Linus Torvalds 已提交
3034

3035 3036 3037
			netlink_walk_stop(iter);
			if (++iter->link >= MAX_LINKS)
				return NULL;
3038

3039 3040 3041
			err = netlink_walk_start(iter);
			if (err)
				return ERR_PTR(err);
L
Linus Torvalds 已提交
3042
		}
3043
	} while (sock_net(&nlk->sk) != seq_file_net(seq));
L
Linus Torvalds 已提交
3044

3045 3046
	return nlk;
}
L
Linus Torvalds 已提交
3047

3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070
static void *netlink_seq_start(struct seq_file *seq, loff_t *posp)
{
	struct nl_seq_iter *iter = seq->private;
	void *obj = SEQ_START_TOKEN;
	loff_t pos;
	int err;

	iter->link = 0;

	err = netlink_walk_start(iter);
	if (err)
		return ERR_PTR(err);

	for (pos = *posp; pos && obj && !IS_ERR(obj); pos--)
		obj = __netlink_seq_next(seq);

	return obj;
}

static void *netlink_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	++*pos;
	return __netlink_seq_next(seq);
L
Linus Torvalds 已提交
3071 3072 3073 3074
}

static void netlink_seq_stop(struct seq_file *seq, void *v)
{
3075 3076 3077 3078 3079 3080
	struct nl_seq_iter *iter = seq->private;

	if (iter->link >= MAX_LINKS)
		return;

	netlink_walk_stop(iter);
L
Linus Torvalds 已提交
3081 3082 3083 3084 3085
}


static int netlink_seq_show(struct seq_file *seq, void *v)
{
E
Eric Dumazet 已提交
3086
	if (v == SEQ_START_TOKEN) {
L
Linus Torvalds 已提交
3087 3088
		seq_puts(seq,
			 "sk       Eth Pid    Groups   "
3089
			 "Rmem     Wmem     Dump     Locks     Drops     Inode\n");
E
Eric Dumazet 已提交
3090
	} else {
L
Linus Torvalds 已提交
3091 3092 3093
		struct sock *s = v;
		struct netlink_sock *nlk = nlk_sk(s);

3094
		seq_printf(seq, "%pK %-3d %-6u %08x %-8d %-8d %d %-8d %-8d %-8lu\n",
L
Linus Torvalds 已提交
3095 3096
			   s,
			   s->sk_protocol,
3097
			   nlk->portid,
3098
			   nlk->groups ? (u32)nlk->groups[0] : 0,
3099 3100
			   sk_rmem_alloc_get(s),
			   sk_wmem_alloc_get(s),
3101
			   nlk->cb_running,
3102
			   atomic_read(&s->sk_refcnt),
3103 3104
			   atomic_read(&s->sk_drops),
			   sock_i_ino(s)
L
Linus Torvalds 已提交
3105 3106 3107 3108 3109 3110
			);

	}
	return 0;
}

3111
static const struct seq_operations netlink_seq_ops = {
L
Linus Torvalds 已提交
3112 3113 3114 3115 3116 3117 3118 3119 3120
	.start  = netlink_seq_start,
	.next   = netlink_seq_next,
	.stop   = netlink_seq_stop,
	.show   = netlink_seq_show,
};


static int netlink_seq_open(struct inode *inode, struct file *file)
{
3121 3122
	return seq_open_net(inode, file, &netlink_seq_ops,
				sizeof(struct nl_seq_iter));
3123 3124
}

3125
static const struct file_operations netlink_seq_fops = {
L
Linus Torvalds 已提交
3126 3127 3128 3129
	.owner		= THIS_MODULE,
	.open		= netlink_seq_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
3130
	.release	= seq_release_net,
L
Linus Torvalds 已提交
3131 3132 3133 3134 3135 3136
};

#endif

int netlink_register_notifier(struct notifier_block *nb)
{
3137
	return atomic_notifier_chain_register(&netlink_chain, nb);
L
Linus Torvalds 已提交
3138
}
3139
EXPORT_SYMBOL(netlink_register_notifier);
L
Linus Torvalds 已提交
3140 3141 3142

int netlink_unregister_notifier(struct notifier_block *nb)
{
3143
	return atomic_notifier_chain_unregister(&netlink_chain, nb);
L
Linus Torvalds 已提交
3144
}
3145
EXPORT_SYMBOL(netlink_unregister_notifier);
3146

3147
static const struct proto_ops netlink_ops = {
L
Linus Torvalds 已提交
3148 3149 3150 3151 3152 3153 3154 3155
	.family =	PF_NETLINK,
	.owner =	THIS_MODULE,
	.release =	netlink_release,
	.bind =		netlink_bind,
	.connect =	netlink_connect,
	.socketpair =	sock_no_socketpair,
	.accept =	sock_no_accept,
	.getname =	netlink_getname,
3156
	.poll =		netlink_poll,
L
Linus Torvalds 已提交
3157 3158 3159
	.ioctl =	sock_no_ioctl,
	.listen =	sock_no_listen,
	.shutdown =	sock_no_shutdown,
3160 3161
	.setsockopt =	netlink_setsockopt,
	.getsockopt =	netlink_getsockopt,
L
Linus Torvalds 已提交
3162 3163
	.sendmsg =	netlink_sendmsg,
	.recvmsg =	netlink_recvmsg,
3164
	.mmap =		netlink_mmap,
L
Linus Torvalds 已提交
3165 3166 3167
	.sendpage =	sock_no_sendpage,
};

3168
static const struct net_proto_family netlink_family_ops = {
L
Linus Torvalds 已提交
3169 3170 3171 3172 3173
	.family = PF_NETLINK,
	.create = netlink_create,
	.owner	= THIS_MODULE,	/* for consistency 8) */
};

3174
static int __net_init netlink_net_init(struct net *net)
3175 3176
{
#ifdef CONFIG_PROC_FS
3177
	if (!proc_create("netlink", 0, net->proc_net, &netlink_seq_fops))
3178 3179 3180 3181 3182
		return -ENOMEM;
#endif
	return 0;
}

3183
static void __net_exit netlink_net_exit(struct net *net)
3184 3185
{
#ifdef CONFIG_PROC_FS
3186
	remove_proc_entry("netlink", net->proc_net);
3187 3188 3189
#endif
}

3190 3191
static void __init netlink_add_usersock_entry(void)
{
3192
	struct listeners *listeners;
3193 3194
	int groups = 32;

3195
	listeners = kzalloc(sizeof(*listeners) + NLGRPSZ(groups), GFP_KERNEL);
3196
	if (!listeners)
3197
		panic("netlink_add_usersock_entry: Cannot allocate listeners\n");
3198 3199 3200 3201

	netlink_table_grab();

	nl_table[NETLINK_USERSOCK].groups = groups;
3202
	rcu_assign_pointer(nl_table[NETLINK_USERSOCK].listeners, listeners);
3203 3204
	nl_table[NETLINK_USERSOCK].module = THIS_MODULE;
	nl_table[NETLINK_USERSOCK].registered = 1;
3205
	nl_table[NETLINK_USERSOCK].flags = NL_CFG_F_NONROOT_SEND;
3206 3207 3208 3209

	netlink_table_ungrab();
}

3210
static struct pernet_operations __net_initdata netlink_net_ops = {
3211 3212 3213 3214
	.init = netlink_net_init,
	.exit = netlink_net_exit,
};

3215
static inline u32 netlink_hash(const void *data, u32 len, u32 seed)
3216 3217 3218 3219 3220
{
	const struct netlink_sock *nlk = data;
	struct netlink_compare_arg arg;

	netlink_compare_arg_init(&arg, sock_net(&nlk->sk), nlk->portid);
3221
	return jhash2((u32 *)&arg, netlink_compare_arg_len / sizeof(u32), seed);
3222 3223 3224 3225 3226 3227 3228
}

static const struct rhashtable_params netlink_rhashtable_params = {
	.head_offset = offsetof(struct netlink_sock, node),
	.key_len = netlink_compare_arg_len,
	.obj_hashfn = netlink_hash,
	.obj_cmpfn = netlink_compare,
3229
	.automatic_shrinking = true,
3230 3231
};

L
Linus Torvalds 已提交
3232 3233 3234 3235 3236 3237 3238 3239
static int __init netlink_proto_init(void)
{
	int i;
	int err = proto_register(&netlink_proto, 0);

	if (err != 0)
		goto out;

3240
	BUILD_BUG_ON(sizeof(struct netlink_skb_parms) > FIELD_SIZEOF(struct sk_buff, cb));
L
Linus Torvalds 已提交
3241

3242
	nl_table = kcalloc(MAX_LINKS, sizeof(*nl_table), GFP_KERNEL);
3243 3244
	if (!nl_table)
		goto panic;
L
Linus Torvalds 已提交
3245 3246

	for (i = 0; i < MAX_LINKS; i++) {
3247 3248
		if (rhashtable_init(&nl_table[i].hash,
				    &netlink_rhashtable_params) < 0) {
3249 3250
			while (--i > 0)
				rhashtable_destroy(&nl_table[i].hash);
L
Linus Torvalds 已提交
3251
			kfree(nl_table);
3252
			goto panic;
L
Linus Torvalds 已提交
3253 3254 3255
		}
	}

3256 3257
	INIT_LIST_HEAD(&netlink_tap_all);

3258 3259
	netlink_add_usersock_entry();

L
Linus Torvalds 已提交
3260
	sock_register(&netlink_family_ops);
3261
	register_pernet_subsys(&netlink_net_ops);
3262
	/* The netlink device handler may be needed early. */
L
Linus Torvalds 已提交
3263 3264 3265
	rtnetlink_init();
out:
	return err;
3266 3267
panic:
	panic("netlink_init: Cannot allocate nl_table\n");
L
Linus Torvalds 已提交
3268 3269 3270
}

core_initcall(netlink_proto_init);