af_netlink.c
/*
 * NETLINK      Kernel-user communication protocol.
 *
 * 		Authors:	Alan Cox <alan@lxorguk.ukuu.org.uk>
 * 				Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
 * 				Patrick McHardy <kaber@trash.net>
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Tue Jun 26 14:36:48 MEST 2001 Herbert "herp" Rosmanith
 *                               added netlink_proto_exit
 * Tue Jan 22 18:32:44 BRST 2002 Arnaldo C. de Melo <acme@conectiva.com.br>
 * 				 use nlk_sk, as sk->protinfo is on a diet 8)
 * Fri Jul 22 19:51:12 MEST 2005 Harald Welte <laforge@gnumonks.org>
 * 				 - inc module use count of module that owns
 * 				   the kernel socket in case userspace opens
 * 				   socket of same protocol
 * 				 - remove all module support, since netlink is
 * 				   mandatory if CONFIG_NET=y these days
 */

#include <linux/module.h>

#include <linux/capability.h>
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/stat.h>
#include <linux/socket.h>
#include <linux/un.h>
#include <linux/fcntl.h>
#include <linux/termios.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/fs.h>
#include <linux/slab.h>
#include <asm/uaccess.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/rtnetlink.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/notifier.h>
#include <linux/security.h>
#include <linux/jhash.h>
#include <linux/jiffies.h>
#include <linux/random.h>
#include <linux/bitops.h>
#include <linux/mm.h>
#include <linux/types.h>
#include <linux/audit.h>
#include <linux/mutex.h>
#include <linux/vmalloc.h>
#include <linux/if_arp.h>
#include <asm/cacheflush.h>

#include <net/net_namespace.h>
#include <net/sock.h>
#include <net/scm.h>
#include <net/netlink.h>

#include "af_netlink.h"

struct listeners {
	struct rcu_head		rcu;
	unsigned long		masks[0];
};

/* state bits */
#define NETLINK_CONGESTED	0x0

/* flags */
#define NETLINK_KERNEL_SOCKET	0x1
#define NETLINK_RECV_PKTINFO	0x2
#define NETLINK_BROADCAST_SEND_ERROR	0x4
#define NETLINK_RECV_NO_ENOBUFS	0x8

static inline int netlink_is_kernel(struct sock *sk)
{
	return nlk_sk(sk)->flags & NETLINK_KERNEL_SOCKET;
}

struct netlink_table *nl_table;
EXPORT_SYMBOL_GPL(nl_table);

static DECLARE_WAIT_QUEUE_HEAD(nl_table_wait);

static int netlink_dump(struct sock *sk);
static void netlink_skb_destructor(struct sk_buff *skb);

DEFINE_RWLOCK(nl_table_lock);
EXPORT_SYMBOL_GPL(nl_table_lock);
static atomic_t nl_table_users = ATOMIC_INIT(0);

#define nl_deref_protected(X) rcu_dereference_protected(X, lockdep_is_held(&nl_table_lock))

static ATOMIC_NOTIFIER_HEAD(netlink_chain);

static DEFINE_SPINLOCK(netlink_tap_lock);
static struct list_head netlink_tap_all __read_mostly;

static inline u32 netlink_group_mask(u32 group)
{
	return group ? 1 << (group - 1) : 0;
}
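
/* Example: group 1 maps to bit 0 (mask 0x1) and group 5 to bit 4 (mask
 * 0x10); group 0 means "no group" and yields an empty mask. */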

static inline struct hlist_head *nl_portid_hashfn(struct nl_portid_hash *hash, u32 portid)
{
	return &hash->table[jhash_1word(portid, hash->rnd) & hash->mask];
}
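
/* The table has hash->mask + 1 buckets, always a power of two, so the
 * "& hash->mask" selects a bucket; hash->rnd is re-seeded on every rehash
 * (see nl_portid_hash_rehash() below). */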

int netlink_add_tap(struct netlink_tap *nt)
{
	if (unlikely(nt->dev->type != ARPHRD_NETLINK))
		return -EINVAL;

	spin_lock(&netlink_tap_lock);
	list_add_rcu(&nt->list, &netlink_tap_all);
	spin_unlock(&netlink_tap_lock);

	if (nt->module)
		__module_get(nt->module);

	return 0;
}
EXPORT_SYMBOL_GPL(netlink_add_tap);

int __netlink_remove_tap(struct netlink_tap *nt)
{
	bool found = false;
	struct netlink_tap *tmp;

	spin_lock(&netlink_tap_lock);

	list_for_each_entry(tmp, &netlink_tap_all, list) {
		if (nt == tmp) {
			list_del_rcu(&nt->list);
			found = true;
			goto out;
		}
	}

	pr_warn("__netlink_remove_tap: %p not found\n", nt);
out:
	spin_unlock(&netlink_tap_lock);

	if (found && nt->module)
		module_put(nt->module);

	return found ? 0 : -ENODEV;
}
EXPORT_SYMBOL_GPL(__netlink_remove_tap);

int netlink_remove_tap(struct netlink_tap *nt)
{
	int ret;

	ret = __netlink_remove_tap(nt);
	synchronize_net();

	return ret;
}
EXPORT_SYMBOL_GPL(netlink_remove_tap);
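
/* Usage sketch (hypothetical caller, not part of this file): a tap driver
 * registers an ARPHRD_NETLINK net_device and then receives a clone of every
 * netlink skb via dev_queue_xmit():
 *
 *	static struct netlink_tap my_tap = {
 *		.dev	= my_netlink_dev,	(assumed ARPHRD_NETLINK device)
 *		.module	= THIS_MODULE,
 *	};
 *
 *	err = netlink_add_tap(&my_tap);
 *	...
 *	netlink_remove_tap(&my_tap);
 */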

static int __netlink_deliver_tap_skb(struct sk_buff *skb,
				     struct net_device *dev)
{
	struct sk_buff *nskb;
	int ret = -ENOMEM;

	dev_hold(dev);
	nskb = skb_clone(skb, GFP_ATOMIC);
	if (nskb) {
		nskb->dev = dev;
		ret = dev_queue_xmit(nskb);
		if (unlikely(ret > 0))
			ret = net_xmit_errno(ret);
	}

	dev_put(dev);
	return ret;
}

static void __netlink_deliver_tap(struct sk_buff *skb)
{
	int ret;
	struct netlink_tap *tmp;

	list_for_each_entry_rcu(tmp, &netlink_tap_all, list) {
		ret = __netlink_deliver_tap_skb(skb, tmp->dev);
		if (unlikely(ret))
			break;
	}
}

static void netlink_deliver_tap(struct sk_buff *skb)
{
	rcu_read_lock();

	if (unlikely(!list_empty(&netlink_tap_all)))
		__netlink_deliver_tap(skb);

	rcu_read_unlock();
}

static void netlink_overrun(struct sock *sk)
{
	struct netlink_sock *nlk = nlk_sk(sk);

	if (!(nlk->flags & NETLINK_RECV_NO_ENOBUFS)) {
		if (!test_and_set_bit(NETLINK_CONGESTED, &nlk_sk(sk)->state)) {
			sk->sk_err = ENOBUFS;
			sk->sk_error_report(sk);
		}
	}
	atomic_inc(&sk->sk_drops);
}

static void netlink_rcv_wake(struct sock *sk)
{
	struct netlink_sock *nlk = nlk_sk(sk);

	if (skb_queue_empty(&sk->sk_receive_queue))
		clear_bit(NETLINK_CONGESTED, &nlk->state);
	if (!test_bit(NETLINK_CONGESTED, &nlk->state))
		wake_up_interruptible(&nlk->wait);
}

#ifdef CONFIG_NETLINK_MMAP
static bool netlink_skb_is_mmaped(const struct sk_buff *skb)
{
	return NETLINK_CB(skb).flags & NETLINK_SKB_MMAPED;
}

static bool netlink_rx_is_mmaped(struct sock *sk)
{
	return nlk_sk(sk)->rx_ring.pg_vec != NULL;
}

static bool netlink_tx_is_mmaped(struct sock *sk)
{
	return nlk_sk(sk)->tx_ring.pg_vec != NULL;
}

static __pure struct page *pgvec_to_page(const void *addr)
{
	if (is_vmalloc_addr(addr))
		return vmalloc_to_page(addr);
	else
		return virt_to_page(addr);
}

static void free_pg_vec(void **pg_vec, unsigned int order, unsigned int len)
{
	unsigned int i;

	for (i = 0; i < len; i++) {
		if (pg_vec[i] != NULL) {
			if (is_vmalloc_addr(pg_vec[i]))
				vfree(pg_vec[i]);
			else
				free_pages((unsigned long)pg_vec[i], order);
		}
	}
	kfree(pg_vec);
}
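
/* Allocation strategy for one ring block, as implemented below: try the page
 * allocator without retries first (cheap, may fail when memory is
 * fragmented), fall back to vmalloc, and only then retry the page allocator
 * with __GFP_NORETRY cleared. */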

static void *alloc_one_pg_vec_page(unsigned long order)
{
	void *buffer;
	gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP | __GFP_ZERO |
			  __GFP_NOWARN | __GFP_NORETRY;

	buffer = (void *)__get_free_pages(gfp_flags, order);
	if (buffer != NULL)
		return buffer;

	buffer = vzalloc((1 << order) * PAGE_SIZE);
	if (buffer != NULL)
		return buffer;

	gfp_flags &= ~__GFP_NORETRY;
	return (void *)__get_free_pages(gfp_flags, order);
}

static void **alloc_pg_vec(struct netlink_sock *nlk,
			   struct nl_mmap_req *req, unsigned int order)
{
	unsigned int block_nr = req->nm_block_nr;
	unsigned int i;
	void **pg_vec;

	pg_vec = kcalloc(block_nr, sizeof(void *), GFP_KERNEL);
	if (pg_vec == NULL)
		return NULL;

	for (i = 0; i < block_nr; i++) {
		pg_vec[i] = alloc_one_pg_vec_page(order);
		if (pg_vec[i] == NULL)
			goto err1;
	}

	return pg_vec;
err1:
	free_pg_vec(pg_vec, order, block_nr);
	return NULL;
}
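
/* Ring geometry, as validated below: nm_block_size must be a positive
 * multiple of PAGE_SIZE, nm_frame_size a multiple of NL_MMAP_MSG_ALIGNMENT
 * and at least NL_MMAP_HDRLEN, and nm_frame_nr must equal nm_block_nr times
 * the number of frames that fit into one block. */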

static int netlink_set_ring(struct sock *sk, struct nl_mmap_req *req,
			    bool closing, bool tx_ring)
{
	struct netlink_sock *nlk = nlk_sk(sk);
	struct netlink_ring *ring;
	struct sk_buff_head *queue;
	void **pg_vec = NULL;
	unsigned int order = 0;
	int err;

	ring  = tx_ring ? &nlk->tx_ring : &nlk->rx_ring;
	queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;

	if (!closing) {
		if (atomic_read(&nlk->mapped))
			return -EBUSY;
		if (atomic_read(&ring->pending))
			return -EBUSY;
	}

	if (req->nm_block_nr) {
		if (ring->pg_vec != NULL)
			return -EBUSY;

		if ((int)req->nm_block_size <= 0)
			return -EINVAL;
		if (!IS_ALIGNED(req->nm_block_size, PAGE_SIZE))
			return -EINVAL;
		if (req->nm_frame_size < NL_MMAP_HDRLEN)
			return -EINVAL;
		if (!IS_ALIGNED(req->nm_frame_size, NL_MMAP_MSG_ALIGNMENT))
			return -EINVAL;

		ring->frames_per_block = req->nm_block_size /
					 req->nm_frame_size;
		if (ring->frames_per_block == 0)
			return -EINVAL;
		if (ring->frames_per_block * req->nm_block_nr !=
		    req->nm_frame_nr)
			return -EINVAL;

		order = get_order(req->nm_block_size);
		pg_vec = alloc_pg_vec(nlk, req, order);
		if (pg_vec == NULL)
			return -ENOMEM;
	} else {
		if (req->nm_frame_nr)
			return -EINVAL;
	}

	err = -EBUSY;
	mutex_lock(&nlk->pg_vec_lock);
	if (closing || atomic_read(&nlk->mapped) == 0) {
		err = 0;
		spin_lock_bh(&queue->lock);

		ring->frame_max		= req->nm_frame_nr - 1;
		ring->head		= 0;
		ring->frame_size	= req->nm_frame_size;
		ring->pg_vec_pages	= req->nm_block_size / PAGE_SIZE;

		swap(ring->pg_vec_len, req->nm_block_nr);
		swap(ring->pg_vec_order, order);
		swap(ring->pg_vec, pg_vec);

		__skb_queue_purge(queue);
		spin_unlock_bh(&queue->lock);

		WARN_ON(atomic_read(&nlk->mapped));
	}
	mutex_unlock(&nlk->pg_vec_lock);

	if (pg_vec)
		free_pg_vec(pg_vec, order, req->nm_block_nr);
	return err;
}

static void netlink_mm_open(struct vm_area_struct *vma)
{
	struct file *file = vma->vm_file;
	struct socket *sock = file->private_data;
	struct sock *sk = sock->sk;

	if (sk)
		atomic_inc(&nlk_sk(sk)->mapped);
}

static void netlink_mm_close(struct vm_area_struct *vma)
{
	struct file *file = vma->vm_file;
	struct socket *sock = file->private_data;
	struct sock *sk = sock->sk;

	if (sk)
		atomic_dec(&nlk_sk(sk)->mapped);
}

static const struct vm_operations_struct netlink_mmap_ops = {
	.open	= netlink_mm_open,
	.close	= netlink_mm_close,
};

static int netlink_mmap(struct file *file, struct socket *sock,
			struct vm_area_struct *vma)
{
	struct sock *sk = sock->sk;
	struct netlink_sock *nlk = nlk_sk(sk);
	struct netlink_ring *ring;
	unsigned long start, size, expected;
	unsigned int i;
	int err = -EINVAL;

	if (vma->vm_pgoff)
		return -EINVAL;

	mutex_lock(&nlk->pg_vec_lock);

	expected = 0;
	for (ring = &nlk->rx_ring; ring <= &nlk->tx_ring; ring++) {
		if (ring->pg_vec == NULL)
			continue;
		expected += ring->pg_vec_len * ring->pg_vec_pages * PAGE_SIZE;
	}

	if (expected == 0)
		goto out;

	size = vma->vm_end - vma->vm_start;
	if (size != expected)
		goto out;

	start = vma->vm_start;
	for (ring = &nlk->rx_ring; ring <= &nlk->tx_ring; ring++) {
		if (ring->pg_vec == NULL)
			continue;

		for (i = 0; i < ring->pg_vec_len; i++) {
			struct page *page;
			void *kaddr = ring->pg_vec[i];
			unsigned int pg_num;

			for (pg_num = 0; pg_num < ring->pg_vec_pages; pg_num++) {
				page = pgvec_to_page(kaddr);
				err = vm_insert_page(vma, start, page);
				if (err < 0)
					goto out;
				start += PAGE_SIZE;
				kaddr += PAGE_SIZE;
			}
		}
	}

	atomic_inc(&nlk->mapped);
	vma->vm_ops = &netlink_mmap_ops;
	err = 0;
out:
	mutex_unlock(&nlk->pg_vec_lock);
	return err;
}

static void netlink_frame_flush_dcache(const struct nl_mmap_hdr *hdr)
{
#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
	struct page *p_start, *p_end;

	/* First page is flushed through netlink_{get,set}_status */
	p_start = pgvec_to_page((void *)hdr + PAGE_SIZE);
	p_end   = pgvec_to_page((void *)hdr + NL_MMAP_HDRLEN + hdr->nm_len - 1);
	while (p_start <= p_end) {
		flush_dcache_page(p_start);
		p_start++;
	}
#endif
}

static enum nl_mmap_status netlink_get_status(const struct nl_mmap_hdr *hdr)
{
	smp_rmb();
	flush_dcache_page(pgvec_to_page(hdr));
	return hdr->nm_status;
}

static void netlink_set_status(struct nl_mmap_hdr *hdr,
			       enum nl_mmap_status status)
{
	hdr->nm_status = status;
	flush_dcache_page(pgvec_to_page(hdr));
	smp_wmb();
}

static struct nl_mmap_hdr *
__netlink_lookup_frame(const struct netlink_ring *ring, unsigned int pos)
{
	unsigned int pg_vec_pos, frame_off;

	pg_vec_pos = pos / ring->frames_per_block;
	frame_off  = pos % ring->frames_per_block;

	return ring->pg_vec[pg_vec_pos] + (frame_off * ring->frame_size);
}
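
/* Example: with 4 frames per block, pos 6 resolves to pg_vec[1] plus two
 * frame sizes, i.e. the third frame of the second block. */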

static struct nl_mmap_hdr *
netlink_lookup_frame(const struct netlink_ring *ring, unsigned int pos,
		     enum nl_mmap_status status)
{
	struct nl_mmap_hdr *hdr;

	hdr = __netlink_lookup_frame(ring, pos);
	if (netlink_get_status(hdr) != status)
		return NULL;

	return hdr;
}

static struct nl_mmap_hdr *
netlink_current_frame(const struct netlink_ring *ring,
		      enum nl_mmap_status status)
{
	return netlink_lookup_frame(ring, ring->head, status);
}

static struct nl_mmap_hdr *
netlink_previous_frame(const struct netlink_ring *ring,
		       enum nl_mmap_status status)
{
	unsigned int prev;

	prev = ring->head ? ring->head - 1 : ring->frame_max;
	return netlink_lookup_frame(ring, prev, status);
}

static void netlink_increment_head(struct netlink_ring *ring)
{
	ring->head = ring->head != ring->frame_max ? ring->head + 1 : 0;
}

static void netlink_forward_ring(struct netlink_ring *ring)
{
	unsigned int head = ring->head, pos = head;
	const struct nl_mmap_hdr *hdr;

	do {
		hdr = __netlink_lookup_frame(ring, pos);
		if (hdr->nm_status == NL_MMAP_STATUS_UNUSED)
			break;
		if (hdr->nm_status != NL_MMAP_STATUS_SKIP)
			break;
		netlink_increment_head(ring);
	} while (ring->head != head);
}
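
/* Frame status life cycle (a sketch inferred from the call sites in this
 * file): the kernel takes an UNUSED rx frame, marks it RESERVED while
 * filling it, then VALID (or COPY when the skb must be fetched through
 * recvmsg()); user space hands tx frames over as VALID and gets them back
 * as UNUSED. SKIP frames are stepped over when advancing the ring head. */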

static bool netlink_dump_space(struct netlink_sock *nlk)
{
	struct netlink_ring *ring = &nlk->rx_ring;
	struct nl_mmap_hdr *hdr;
	unsigned int n;

	hdr = netlink_current_frame(ring, NL_MMAP_STATUS_UNUSED);
	if (hdr == NULL)
		return false;

	n = ring->head + ring->frame_max / 2;
	if (n > ring->frame_max)
		n -= ring->frame_max;

	hdr = __netlink_lookup_frame(ring, n);

	return hdr->nm_status == NL_MMAP_STATUS_UNUSED;
}

static unsigned int netlink_poll(struct file *file, struct socket *sock,
				 poll_table *wait)
{
	struct sock *sk = sock->sk;
	struct netlink_sock *nlk = nlk_sk(sk);
	unsigned int mask;
	int err;

	if (nlk->rx_ring.pg_vec != NULL) {
		/* Memory mapped sockets don't call recvmsg(), so flow control
		 * for dumps is performed here. A dump is allowed to continue
		 * if at least half the ring is unused.
		 */
		while (nlk->cb_running && netlink_dump_space(nlk)) {
			err = netlink_dump(sk);
			if (err < 0) {
				sk->sk_err = err;
				sk->sk_error_report(sk);
				break;
			}
		}
		netlink_rcv_wake(sk);
	}

	mask = datagram_poll(file, sock, wait);

	spin_lock_bh(&sk->sk_receive_queue.lock);
	if (nlk->rx_ring.pg_vec) {
		netlink_forward_ring(&nlk->rx_ring);
		if (!netlink_previous_frame(&nlk->rx_ring, NL_MMAP_STATUS_UNUSED))
			mask |= POLLIN | POLLRDNORM;
	}
	spin_unlock_bh(&sk->sk_receive_queue.lock);

	spin_lock_bh(&sk->sk_write_queue.lock);
	if (nlk->tx_ring.pg_vec) {
		if (netlink_current_frame(&nlk->tx_ring, NL_MMAP_STATUS_UNUSED))
			mask |= POLLOUT | POLLWRNORM;
	}
	spin_unlock_bh(&sk->sk_write_queue.lock);

	return mask;
}

static struct nl_mmap_hdr *netlink_mmap_hdr(struct sk_buff *skb)
{
	return (struct nl_mmap_hdr *)(skb->head - NL_MMAP_HDRLEN);
}

static void netlink_ring_setup_skb(struct sk_buff *skb, struct sock *sk,
				   struct netlink_ring *ring,
				   struct nl_mmap_hdr *hdr)
{
	unsigned int size;
	void *data;

	size = ring->frame_size - NL_MMAP_HDRLEN;
	data = (void *)hdr + NL_MMAP_HDRLEN;

	skb->head	= data;
	skb->data	= data;
	skb_reset_tail_pointer(skb);
	skb->end	= skb->tail + size;
	skb->len	= 0;

	skb->destructor	= netlink_skb_destructor;
	NETLINK_CB(skb).flags |= NETLINK_SKB_MMAPED;
	NETLINK_CB(skb).sk = sk;
}

static int netlink_mmap_sendmsg(struct sock *sk, struct msghdr *msg,
				u32 dst_portid, u32 dst_group,
				struct sock_iocb *siocb)
{
	struct netlink_sock *nlk = nlk_sk(sk);
	struct netlink_ring *ring;
	struct nl_mmap_hdr *hdr;
	struct sk_buff *skb;
	unsigned int maxlen;
	bool excl = true;
	int err = 0, len = 0;

	/* Netlink messages are validated by the receiver before processing.
	 * In order to avoid userspace changing the contents of the message
	 * after validation, the socket and the ring may only be used by a
	 * single process, otherwise we fall back to copying.
	 */
	if (atomic_long_read(&sk->sk_socket->file->f_count) > 2 ||
	    atomic_read(&nlk->mapped) > 1)
		excl = false;

	mutex_lock(&nlk->pg_vec_lock);

	ring   = &nlk->tx_ring;
	maxlen = ring->frame_size - NL_MMAP_HDRLEN;

	do {
		hdr = netlink_current_frame(ring, NL_MMAP_STATUS_VALID);
		if (hdr == NULL) {
			if (!(msg->msg_flags & MSG_DONTWAIT) &&
			    atomic_read(&nlk->tx_ring.pending))
				schedule();
			continue;
		}
		if (hdr->nm_len > maxlen) {
			err = -EINVAL;
			goto out;
		}

		netlink_frame_flush_dcache(hdr);

		if (likely(dst_portid == 0 && dst_group == 0 && excl)) {
			skb = alloc_skb_head(GFP_KERNEL);
			if (skb == NULL) {
				err = -ENOBUFS;
				goto out;
			}
			sock_hold(sk);
			netlink_ring_setup_skb(skb, sk, ring, hdr);
			NETLINK_CB(skb).flags |= NETLINK_SKB_TX;
			__skb_put(skb, hdr->nm_len);
			netlink_set_status(hdr, NL_MMAP_STATUS_RESERVED);
			atomic_inc(&ring->pending);
		} else {
			skb = alloc_skb(hdr->nm_len, GFP_KERNEL);
			if (skb == NULL) {
				err = -ENOBUFS;
				goto out;
			}
			__skb_put(skb, hdr->nm_len);
			memcpy(skb->data, (void *)hdr + NL_MMAP_HDRLEN, hdr->nm_len);
			netlink_set_status(hdr, NL_MMAP_STATUS_UNUSED);
		}

		netlink_increment_head(ring);

		NETLINK_CB(skb).portid	  = nlk->portid;
		NETLINK_CB(skb).dst_group = dst_group;
		NETLINK_CB(skb).creds	  = siocb->scm->creds;

		err = security_netlink_send(sk, skb);
		if (err) {
			kfree_skb(skb);
			goto out;
		}

		if (unlikely(dst_group)) {
			atomic_inc(&skb->users);
			netlink_broadcast(sk, skb, dst_portid, dst_group,
					  GFP_KERNEL);
		}
		err = netlink_unicast(sk, skb, dst_portid,
				      msg->msg_flags & MSG_DONTWAIT);
		if (err < 0)
			goto out;
		len += err;

	} while (hdr != NULL ||
		 (!(msg->msg_flags & MSG_DONTWAIT) &&
		  atomic_read(&nlk->tx_ring.pending)));

	if (len > 0)
		err = len;
out:
	mutex_unlock(&nlk->pg_vec_lock);
	return err;
}

static void netlink_queue_mmaped_skb(struct sock *sk, struct sk_buff *skb)
{
	struct nl_mmap_hdr *hdr;

	hdr = netlink_mmap_hdr(skb);
	hdr->nm_len	= skb->len;
	hdr->nm_group	= NETLINK_CB(skb).dst_group;
	hdr->nm_pid	= NETLINK_CB(skb).creds.pid;
	hdr->nm_uid	= from_kuid(sk_user_ns(sk), NETLINK_CB(skb).creds.uid);
	hdr->nm_gid	= from_kgid(sk_user_ns(sk), NETLINK_CB(skb).creds.gid);
	netlink_frame_flush_dcache(hdr);
	netlink_set_status(hdr, NL_MMAP_STATUS_VALID);

	NETLINK_CB(skb).flags |= NETLINK_SKB_DELIVERED;
	kfree_skb(skb);
}

static void netlink_ring_set_copied(struct sock *sk, struct sk_buff *skb)
{
	struct netlink_sock *nlk = nlk_sk(sk);
	struct netlink_ring *ring = &nlk->rx_ring;
	struct nl_mmap_hdr *hdr;

	spin_lock_bh(&sk->sk_receive_queue.lock);
	hdr = netlink_current_frame(ring, NL_MMAP_STATUS_UNUSED);
	if (hdr == NULL) {
		spin_unlock_bh(&sk->sk_receive_queue.lock);
		kfree_skb(skb);
		netlink_overrun(sk);
		return;
	}
	netlink_increment_head(ring);
	__skb_queue_tail(&sk->sk_receive_queue, skb);
	spin_unlock_bh(&sk->sk_receive_queue.lock);

	hdr->nm_len	= skb->len;
	hdr->nm_group	= NETLINK_CB(skb).dst_group;
	hdr->nm_pid	= NETLINK_CB(skb).creds.pid;
	hdr->nm_uid	= from_kuid(sk_user_ns(sk), NETLINK_CB(skb).creds.uid);
	hdr->nm_gid	= from_kgid(sk_user_ns(sk), NETLINK_CB(skb).creds.gid);
	netlink_set_status(hdr, NL_MMAP_STATUS_COPY);
}

#else /* CONFIG_NETLINK_MMAP */
#define netlink_skb_is_mmaped(skb)	false
#define netlink_rx_is_mmaped(sk)	false
#define netlink_tx_is_mmaped(sk)	false
#define netlink_mmap			sock_no_mmap
#define netlink_poll			datagram_poll
#define netlink_mmap_sendmsg(sk, msg, dst_portid, dst_group, siocb)	0
#endif /* CONFIG_NETLINK_MMAP */

static void netlink_skb_destructor(struct sk_buff *skb)
{
#ifdef CONFIG_NETLINK_MMAP
	struct nl_mmap_hdr *hdr;
	struct netlink_ring *ring;
	struct sock *sk;

	/* If a packet from the kernel to userspace was freed because of an
	 * error without being delivered to userspace, the kernel must reset
	 * the status. In the direction userspace to kernel, the status is
	 * always reset here after the packet was processed and freed.
	 */
	if (netlink_skb_is_mmaped(skb)) {
		hdr = netlink_mmap_hdr(skb);
		sk = NETLINK_CB(skb).sk;

		if (NETLINK_CB(skb).flags & NETLINK_SKB_TX) {
			netlink_set_status(hdr, NL_MMAP_STATUS_UNUSED);
			ring = &nlk_sk(sk)->tx_ring;
		} else {
			if (!(NETLINK_CB(skb).flags & NETLINK_SKB_DELIVERED)) {
				hdr->nm_len = 0;
				netlink_set_status(hdr, NL_MMAP_STATUS_VALID);
			}
			ring = &nlk_sk(sk)->rx_ring;
		}

		WARN_ON(atomic_read(&ring->pending) == 0);
		atomic_dec(&ring->pending);
		sock_put(sk);

		skb->head = NULL;
	}
#endif
	if (is_vmalloc_addr(skb->head)) {
		if (!skb->cloned ||
		    !atomic_dec_return(&(skb_shinfo(skb)->dataref)))
			vfree(skb->head);

		skb->head = NULL;
	}
	if (skb->sk != NULL)
		sock_rfree(skb);
}

static void netlink_skb_set_owner_r(struct sk_buff *skb, struct sock *sk)
{
	WARN_ON(skb->sk != NULL);
	skb->sk = sk;
	skb->destructor = netlink_skb_destructor;
	atomic_add(skb->truesize, &sk->sk_rmem_alloc);
	sk_mem_charge(sk, skb->truesize);
}

static void netlink_sock_destruct(struct sock *sk)
{
	struct netlink_sock *nlk = nlk_sk(sk);

	if (nlk->cb_running) {
		if (nlk->cb.done)
			nlk->cb.done(&nlk->cb);

	module_put(nlk->cb.module);
		kfree_skb(nlk->cb.skb);
	}

	skb_queue_purge(&sk->sk_receive_queue);
#ifdef CONFIG_NETLINK_MMAP
	if (1) {
		struct nl_mmap_req req;

		memset(&req, 0, sizeof(req));
		if (nlk->rx_ring.pg_vec)
			netlink_set_ring(sk, &req, true, false);
		memset(&req, 0, sizeof(req));
		if (nlk->tx_ring.pg_vec)
			netlink_set_ring(sk, &req, true, true);
	}
#endif /* CONFIG_NETLINK_MMAP */

	if (!sock_flag(sk, SOCK_DEAD)) {
		printk(KERN_ERR "Freeing alive netlink socket %p\n", sk);
		return;
	}

	WARN_ON(atomic_read(&sk->sk_rmem_alloc));
	WARN_ON(atomic_read(&sk->sk_wmem_alloc));
	WARN_ON(nlk_sk(sk)->groups);
}

/* This lock without WQ_FLAG_EXCLUSIVE is good on UP and it is _very_ bad on
 * SMP. Look, when several writers sleep and a reader wakes them up, all but
 * one immediately hit the write lock and grab all the CPUs. Exclusive sleep
 * solves this, _but_ remember, it adds useless work on UP machines.
 */

void netlink_table_grab(void)
	__acquires(nl_table_lock)
{
	might_sleep();

	write_lock_irq(&nl_table_lock);

	if (atomic_read(&nl_table_users)) {
		DECLARE_WAITQUEUE(wait, current);

		add_wait_queue_exclusive(&nl_table_wait, &wait);
		for (;;) {
			set_current_state(TASK_UNINTERRUPTIBLE);
			if (atomic_read(&nl_table_users) == 0)
				break;
			write_unlock_irq(&nl_table_lock);
			schedule();
			write_lock_irq(&nl_table_lock);
		}

		__set_current_state(TASK_RUNNING);
		remove_wait_queue(&nl_table_wait, &wait);
	}
}

void netlink_table_ungrab(void)
	__releases(nl_table_lock)
{
	write_unlock_irq(&nl_table_lock);
	wake_up(&nl_table_wait);
}

static inline void
netlink_lock_table(void)
{
	/* read_lock() synchronizes us to netlink_table_grab */

	read_lock(&nl_table_lock);
	atomic_inc(&nl_table_users);
	read_unlock(&nl_table_lock);
}

static inline void
netlink_unlock_table(void)
{
	if (atomic_dec_and_test(&nl_table_users))
		wake_up(&nl_table_wait);
}

static bool netlink_compare(struct net *net, struct sock *sk)
{
	return net_eq(sock_net(sk), net);
}

static struct sock *netlink_lookup(struct net *net, int protocol, u32 portid)
{
	struct netlink_table *table = &nl_table[protocol];
	struct nl_portid_hash *hash = &table->hash;
	struct hlist_head *head;
	struct sock *sk;

	read_lock(&nl_table_lock);
	head = nl_portid_hashfn(hash, portid);
	sk_for_each(sk, head) {
		if (table->compare(net, sk) &&
		    (nlk_sk(sk)->portid == portid)) {
			sock_hold(sk);
			goto found;
		}
	}
	sk = NULL;
found:
	read_unlock(&nl_table_lock);
	return sk;
}

static struct hlist_head *nl_portid_hash_zalloc(size_t size)
{
	if (size <= PAGE_SIZE)
		return kzalloc(size, GFP_ATOMIC);
	else
		return (struct hlist_head *)
			__get_free_pages(GFP_ATOMIC | __GFP_ZERO,
					 get_order(size));
}

static void nl_portid_hash_free(struct hlist_head *table, size_t size)
{
	if (size <= PAGE_SIZE)
		kfree(table);
	else
		free_pages((unsigned long)table, get_order(size));
}

static int nl_portid_hash_rehash(struct nl_portid_hash *hash, int grow)
{
	unsigned int omask, mask, shift;
	size_t osize, size;
	struct hlist_head *otable, *table;
	int i;

	omask = mask = hash->mask;
	osize = size = (mask + 1) * sizeof(*table);
	shift = hash->shift;

	if (grow) {
		if (++shift > hash->max_shift)
			return 0;
		mask = mask * 2 + 1;
		size *= 2;
	}

	table = nl_portid_hash_zalloc(size);
	if (!table)
		return 0;

	otable = hash->table;
	hash->table = table;
	hash->mask = mask;
	hash->shift = shift;
	get_random_bytes(&hash->rnd, sizeof(hash->rnd));

	for (i = 0; i <= omask; i++) {
		struct sock *sk;
		struct hlist_node *tmp;

		sk_for_each_safe(sk, tmp, &otable[i])
			__sk_add_node(sk, nl_portid_hashfn(hash, nlk_sk(sk)->portid));
	}

	nl_portid_hash_free(otable, osize);
	hash->rehash_time = jiffies + 10 * 60 * HZ;
	return 1;
}

static inline int nl_portid_hash_dilute(struct nl_portid_hash *hash, int len)
{
	int avg = hash->entries >> hash->shift;

	if (unlikely(avg > 1) && nl_portid_hash_rehash(hash, 1))
		return 1;

	if (unlikely(len > avg) && time_after(jiffies, hash->rehash_time)) {
		nl_portid_hash_rehash(hash, 0);
		return 1;
	}

	return 0;
}

static const struct proto_ops netlink_ops;

static void
netlink_update_listeners(struct sock *sk)
{
	struct netlink_table *tbl = &nl_table[sk->sk_protocol];
	unsigned long mask;
	unsigned int i;
	struct listeners *listeners;

	listeners = nl_deref_protected(tbl->listeners);
	if (!listeners)
		return;

	for (i = 0; i < NLGRPLONGS(tbl->groups); i++) {
		mask = 0;
		sk_for_each_bound(sk, &tbl->mc_list) {
			if (i < NLGRPLONGS(nlk_sk(sk)->ngroups))
				mask |= nlk_sk(sk)->groups[i];
		}
		listeners->masks[i] = mask;
	}
	/* this function is only called with the netlink table "grabbed", which
	 * makes sure updates are visible before bind or setsockopt return. */
}

static int netlink_insert(struct sock *sk, struct net *net, u32 portid)
{
	struct netlink_table *table = &nl_table[sk->sk_protocol];
	struct nl_portid_hash *hash = &table->hash;
	struct hlist_head *head;
	int err = -EADDRINUSE;
	struct sock *osk;
	int len;

	netlink_table_grab();
	head = nl_portid_hashfn(hash, portid);
	len = 0;
	sk_for_each(osk, head) {
		if (table->compare(net, osk) &&
		    (nlk_sk(osk)->portid == portid))
			break;
		len++;
	}
	if (osk)
		goto err;

	err = -EBUSY;
	if (nlk_sk(sk)->portid)
		goto err;

	err = -ENOMEM;
	if (BITS_PER_LONG > 32 && unlikely(hash->entries >= UINT_MAX))
		goto err;

	if (len && nl_portid_hash_dilute(hash, len))
		head = nl_portid_hashfn(hash, portid);
	hash->entries++;
	nlk_sk(sk)->portid = portid;
	sk_add_node(sk, head);
	err = 0;

err:
	netlink_table_ungrab();
	return err;
}

static void netlink_remove(struct sock *sk)
{
	netlink_table_grab();
	if (sk_del_node_init(sk))
		nl_table[sk->sk_protocol].hash.entries--;
	if (nlk_sk(sk)->subscriptions)
		__sk_del_bind_node(sk);
	netlink_table_ungrab();
}

static struct proto netlink_proto = {
	.name	  = "NETLINK",
	.owner	  = THIS_MODULE,
	.obj_size = sizeof(struct netlink_sock),
};

static int __netlink_create(struct net *net, struct socket *sock,
			    struct mutex *cb_mutex, int protocol)
{
	struct sock *sk;
	struct netlink_sock *nlk;

	sock->ops = &netlink_ops;

	sk = sk_alloc(net, PF_NETLINK, GFP_KERNEL, &netlink_proto);
	if (!sk)
		return -ENOMEM;

	sock_init_data(sock, sk);

	nlk = nlk_sk(sk);
	if (cb_mutex) {
		nlk->cb_mutex = cb_mutex;
	} else {
		nlk->cb_mutex = &nlk->cb_def_mutex;
		mutex_init(nlk->cb_mutex);
	}
	init_waitqueue_head(&nlk->wait);
#ifdef CONFIG_NETLINK_MMAP
	mutex_init(&nlk->pg_vec_lock);
#endif

	sk->sk_destruct = netlink_sock_destruct;
	sk->sk_protocol = protocol;
	return 0;
}

static int netlink_create(struct net *net, struct socket *sock, int protocol,
			  int kern)
{
	struct module *module = NULL;
	struct mutex *cb_mutex;
	struct netlink_sock *nlk;
	void (*bind)(int group);
	int err = 0;

	sock->state = SS_UNCONNECTED;

	if (sock->type != SOCK_RAW && sock->type != SOCK_DGRAM)
		return -ESOCKTNOSUPPORT;

	if (protocol < 0 || protocol >= MAX_LINKS)
		return -EPROTONOSUPPORT;

	netlink_lock_table();
#ifdef CONFIG_MODULES
	if (!nl_table[protocol].registered) {
		netlink_unlock_table();
		request_module("net-pf-%d-proto-%d", PF_NETLINK, protocol);
		netlink_lock_table();
	}
#endif
	if (nl_table[protocol].registered &&
	    try_module_get(nl_table[protocol].module))
		module = nl_table[protocol].module;
	else
		err = -EPROTONOSUPPORT;
	cb_mutex = nl_table[protocol].cb_mutex;
	bind = nl_table[protocol].bind;
	netlink_unlock_table();

	if (err < 0)
		goto out;

	err = __netlink_create(net, sock, cb_mutex, protocol);
	if (err < 0)
		goto out_module;

	local_bh_disable();
	sock_prot_inuse_add(net, &netlink_proto, 1);
	local_bh_enable();

	nlk = nlk_sk(sock->sk);
	nlk->module = module;
	nlk->netlink_bind = bind;
out:
	return err;

out_module:
	module_put(module);
	goto out;
}

static int netlink_release(struct socket *sock)
{
	struct sock *sk = sock->sk;
	struct netlink_sock *nlk;

	if (!sk)
		return 0;

	netlink_remove(sk);
	sock_orphan(sk);
	nlk = nlk_sk(sk);

	/*
	 * OK. Socket is unlinked, any packets that arrive now
	 * will be purged.
	 */

	sock->sk = NULL;
	wake_up_interruptible_all(&nlk->wait);

	skb_queue_purge(&sk->sk_write_queue);

	if (nlk->portid) {
		struct netlink_notify n = {
						.net = sock_net(sk),
						.protocol = sk->sk_protocol,
						.portid = nlk->portid,
					  };
		atomic_notifier_call_chain(&netlink_chain,
				NETLINK_URELEASE, &n);
	}

	module_put(nlk->module);

	netlink_table_grab();
	if (netlink_is_kernel(sk)) {
		BUG_ON(nl_table[sk->sk_protocol].registered == 0);
		if (--nl_table[sk->sk_protocol].registered == 0) {
			struct listeners *old;

			old = nl_deref_protected(nl_table[sk->sk_protocol].listeners);
			RCU_INIT_POINTER(nl_table[sk->sk_protocol].listeners, NULL);
			kfree_rcu(old, rcu);
			nl_table[sk->sk_protocol].module = NULL;
			nl_table[sk->sk_protocol].bind = NULL;
			nl_table[sk->sk_protocol].flags = 0;
			nl_table[sk->sk_protocol].registered = 0;
		}
	} else if (nlk->subscriptions) {
		netlink_update_listeners(sk);
	}
	netlink_table_ungrab();

	kfree(nlk->groups);
	nlk->groups = NULL;

	local_bh_disable();
	sock_prot_inuse_add(sock_net(sk), &netlink_proto, -1);
	local_bh_enable();
	sock_put(sk);
	return 0;
}

static int netlink_autobind(struct socket *sock)
{
	struct sock *sk = sock->sk;
	struct net *net = sock_net(sk);
	struct netlink_table *table = &nl_table[sk->sk_protocol];
	struct nl_portid_hash *hash = &table->hash;
	struct hlist_head *head;
	struct sock *osk;
	s32 portid = task_tgid_vnr(current);
	int err;
	static s32 rover = -4097;

retry:
	cond_resched();
	netlink_table_grab();
	head = nl_portid_hashfn(hash, portid);
	sk_for_each(osk, head) {
		if (!table->compare(net, osk))
			continue;
		if (nlk_sk(osk)->portid == portid) {
			/* Bind collision, search negative portid values. */
			portid = rover--;
			if (rover > -4097)
				rover = -4097;
			netlink_table_ungrab();
			goto retry;
		}
	}
	netlink_table_ungrab();

	err = netlink_insert(sk, net, portid);
	if (err == -EADDRINUSE)
		goto retry;

	/* If 2 threads race to autobind, that is fine.  */
	if (err == -EBUSY)
		err = 0;

	return err;
}

static inline int netlink_capable(const struct socket *sock, unsigned int flag)
{
	return (nl_table[sock->sk->sk_protocol].flags & flag) ||
		ns_capable(sock_net(sock->sk)->user_ns, CAP_NET_ADMIN);
}

static void
netlink_update_subscriptions(struct sock *sk, unsigned int subscriptions)
{
	struct netlink_sock *nlk = nlk_sk(sk);

	if (nlk->subscriptions && !subscriptions)
		__sk_del_bind_node(sk);
	else if (!nlk->subscriptions && subscriptions)
		sk_add_bind_node(sk, &nl_table[sk->sk_protocol].mc_list);
	nlk->subscriptions = subscriptions;
}

static int netlink_realloc_groups(struct sock *sk)
{
	struct netlink_sock *nlk = nlk_sk(sk);
	unsigned int groups;
	unsigned long *new_groups;
	int err = 0;

	netlink_table_grab();

	groups = nl_table[sk->sk_protocol].groups;
	if (!nl_table[sk->sk_protocol].registered) {
		err = -ENOENT;
		goto out_unlock;
	}

	if (nlk->ngroups >= groups)
		goto out_unlock;

	new_groups = krealloc(nlk->groups, NLGRPSZ(groups), GFP_ATOMIC);
	if (new_groups == NULL) {
		err = -ENOMEM;
		goto out_unlock;
	}
	memset((char *)new_groups + NLGRPSZ(nlk->ngroups), 0,
	       NLGRPSZ(groups) - NLGRPSZ(nlk->ngroups));

	nlk->groups = new_groups;
	nlk->ngroups = groups;
 out_unlock:
	netlink_table_ungrab();
	return err;
}

static int netlink_bind(struct socket *sock, struct sockaddr *addr,
			int addr_len)
{
	struct sock *sk = sock->sk;
	struct net *net = sock_net(sk);
	struct netlink_sock *nlk = nlk_sk(sk);
	struct sockaddr_nl *nladdr = (struct sockaddr_nl *)addr;
	int err;

	if (addr_len < sizeof(struct sockaddr_nl))
		return -EINVAL;

	if (nladdr->nl_family != AF_NETLINK)
		return -EINVAL;

	/* Only superuser is allowed to listen multicasts */
	if (nladdr->nl_groups) {
		if (!netlink_capable(sock, NL_CFG_F_NONROOT_RECV))
			return -EPERM;
		err = netlink_realloc_groups(sk);
		if (err)
			return err;
	}

	if (nlk->portid) {
		if (nladdr->nl_pid != nlk->portid)
			return -EINVAL;
	} else {
		err = nladdr->nl_pid ?
			netlink_insert(sk, net, nladdr->nl_pid) :
			netlink_autobind(sock);
		if (err)
			return err;
	}

	if (!nladdr->nl_groups && (nlk->groups == NULL || !(u32)nlk->groups[0]))
		return 0;

	netlink_table_grab();
	netlink_update_subscriptions(sk, nlk->subscriptions +
					 hweight32(nladdr->nl_groups) -
					 hweight32(nlk->groups[0]));
	nlk->groups[0] = (nlk->groups[0] & ~0xffffffffUL) | nladdr->nl_groups;
	netlink_update_listeners(sk);
	netlink_table_ungrab();

	if (nlk->netlink_bind && nlk->groups[0]) {
		int i;

		for (i = 0; i < nlk->ngroups; i++) {
			if (test_bit(i, nlk->groups))
				nlk->netlink_bind(i);
		}
	}

	return 0;
}

static int netlink_connect(struct socket *sock, struct sockaddr *addr,
			   int alen, int flags)
{
	int err = 0;
	struct sock *sk = sock->sk;
	struct netlink_sock *nlk = nlk_sk(sk);
	struct sockaddr_nl *nladdr = (struct sockaddr_nl *)addr;

	if (alen < sizeof(addr->sa_family))
		return -EINVAL;

	if (addr->sa_family == AF_UNSPEC) {
		sk->sk_state	= NETLINK_UNCONNECTED;
		nlk->dst_portid	= 0;
		nlk->dst_group  = 0;
		return 0;
	}
	if (addr->sa_family != AF_NETLINK)
		return -EINVAL;

	/* Only superuser is allowed to send multicasts */
	if (nladdr->nl_groups && !netlink_capable(sock, NL_CFG_F_NONROOT_SEND))
		return -EPERM;

	if (!nlk->portid)
		err = netlink_autobind(sock);

	if (err == 0) {
		sk->sk_state	= NETLINK_CONNECTED;
		nlk->dst_portid = nladdr->nl_pid;
		nlk->dst_group  = ffs(nladdr->nl_groups);
	}

	return err;
}

static int netlink_getname(struct socket *sock, struct sockaddr *addr,
			   int *addr_len, int peer)
{
	struct sock *sk = sock->sk;
	struct netlink_sock *nlk = nlk_sk(sk);
	DECLARE_SOCKADDR(struct sockaddr_nl *, nladdr, addr);

	nladdr->nl_family = AF_NETLINK;
	nladdr->nl_pad = 0;
	*addr_len = sizeof(*nladdr);

	if (peer) {
		nladdr->nl_pid = nlk->dst_portid;
		nladdr->nl_groups = netlink_group_mask(nlk->dst_group);
	} else {
		nladdr->nl_pid = nlk->portid;
		nladdr->nl_groups = nlk->groups ? nlk->groups[0] : 0;
	}
	return 0;
}

static struct sock *netlink_getsockbyportid(struct sock *ssk, u32 portid)
{
	struct sock *sock;
	struct netlink_sock *nlk;

	sock = netlink_lookup(sock_net(ssk), ssk->sk_protocol, portid);
	if (!sock)
		return ERR_PTR(-ECONNREFUSED);

	/* Don't bother queuing skb if kernel socket has no input function */
	nlk = nlk_sk(sock);
	if (sock->sk_state == NETLINK_CONNECTED &&
	    nlk->dst_portid != nlk_sk(ssk)->portid) {
		sock_put(sock);
		return ERR_PTR(-ECONNREFUSED);
	}
	return sock;
}

struct sock *netlink_getsockbyfilp(struct file *filp)
{
	struct inode *inode = file_inode(filp);
	struct sock *sock;

	if (!S_ISSOCK(inode->i_mode))
		return ERR_PTR(-ENOTSOCK);

	sock = SOCKET_I(inode)->sk;
	if (sock->sk_family != AF_NETLINK)
		return ERR_PTR(-EINVAL);

	sock_hold(sock);
	return sock;
}

static struct sk_buff *netlink_alloc_large_skb(unsigned int size,
					       int broadcast)
{
	struct sk_buff *skb;
	void *data;

	if (size <= NLMSG_GOODSIZE || broadcast)
		return alloc_skb(size, GFP_KERNEL);

	size = SKB_DATA_ALIGN(size) +
	       SKB_DATA_ALIGN(sizeof(struct skb_shared_info));

	data = vmalloc(size);
	if (data == NULL)
		return NULL;

	skb = build_skb(data, size);
	if (skb == NULL)
		vfree(data);
	else {
		skb->head_frag = 0;
		skb->destructor = netlink_skb_destructor;
	}

	return skb;
}

/*
 * Attach a skb to a netlink socket.
 * The caller must hold a reference to the destination socket. On error, the
 * reference is dropped. The skb is not sent to the destination; all error
 * checks are performed and memory in the queue is reserved.
 * Return values:
 * < 0: error. skb freed, reference to sock dropped.
 * 0: continue
 * 1: repeat lookup - reference dropped while waiting for socket memory.
 */
int netlink_attachskb(struct sock *sk, struct sk_buff *skb,
		      long *timeo, struct sock *ssk)
{
	struct netlink_sock *nlk;

	nlk = nlk_sk(sk);

	if ((atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf ||
	     test_bit(NETLINK_CONGESTED, &nlk->state)) &&
	    !netlink_skb_is_mmaped(skb)) {
		DECLARE_WAITQUEUE(wait, current);
		if (!*timeo) {
			if (!ssk || netlink_is_kernel(ssk))
				netlink_overrun(sk);
			sock_put(sk);
			kfree_skb(skb);
			return -EAGAIN;
		}

		__set_current_state(TASK_INTERRUPTIBLE);
		add_wait_queue(&nlk->wait, &wait);

		if ((atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf ||
		     test_bit(NETLINK_CONGESTED, &nlk->state)) &&
		    !sock_flag(sk, SOCK_DEAD))
			*timeo = schedule_timeout(*timeo);

		__set_current_state(TASK_RUNNING);
		remove_wait_queue(&nlk->wait, &wait);
		sock_put(sk);

		if (signal_pending(current)) {
			kfree_skb(skb);
			return sock_intr_errno(*timeo);
		}
		return 1;
	}
	netlink_skb_set_owner_r(skb, sk);
	return 0;
}

static int __netlink_sendskb(struct sock *sk, struct sk_buff *skb)
{
	int len = skb->len;

	netlink_deliver_tap(skb);

#ifdef CONFIG_NETLINK_MMAP
	if (netlink_skb_is_mmaped(skb))
		netlink_queue_mmaped_skb(sk, skb);
	else if (netlink_rx_is_mmaped(sk))
		netlink_ring_set_copied(sk, skb);
	else
#endif /* CONFIG_NETLINK_MMAP */
		skb_queue_tail(&sk->sk_receive_queue, skb);
	sk->sk_data_ready(sk, len);
	return len;
}

int netlink_sendskb(struct sock *sk, struct sk_buff *skb)
{
	int len = __netlink_sendskb(sk, skb);

	sock_put(sk);
	return len;
}

void netlink_detachskb(struct sock *sk, struct sk_buff *skb)
{
	kfree_skb(skb);
	sock_put(sk);
}

static struct sk_buff *netlink_trim(struct sk_buff *skb, gfp_t allocation)
{
	int delta;

	WARN_ON(skb->sk != NULL);
	if (netlink_skb_is_mmaped(skb))
		return skb;

	delta = skb->end - skb->tail;
	if (is_vmalloc_addr(skb->head) || delta * 2 < skb->truesize)
		return skb;

	if (skb_shared(skb)) {
		struct sk_buff *nskb = skb_clone(skb, allocation);
		if (!nskb)
			return skb;
		consume_skb(skb);
		skb = nskb;
	}

	if (!pskb_expand_head(skb, 0, -delta, allocation))
		skb->truesize -= delta;

	return skb;
}

static int netlink_unicast_kernel(struct sock *sk, struct sk_buff *skb,
				  struct sock *ssk)
{
	int ret;
	struct netlink_sock *nlk = nlk_sk(sk);

	ret = -ECONNREFUSED;
	if (nlk->netlink_rcv != NULL) {
		/* We could do a netlink_deliver_tap(skb) here as well
		 * but since this is intended for the kernel only, we
		 * should rather let it stay under the hood.
		 */

		ret = skb->len;
		netlink_skb_set_owner_r(skb, sk);
		NETLINK_CB(skb).sk = ssk;
		nlk->netlink_rcv(skb);
		consume_skb(skb);
	} else {
		kfree_skb(skb);
	}
	sock_put(sk);
	return ret;
}

int netlink_unicast(struct sock *ssk, struct sk_buff *skb,
		    u32 portid, int nonblock)
{
	struct sock *sk;
	int err;
	long timeo;

	skb = netlink_trim(skb, gfp_any());

	timeo = sock_sndtimeo(ssk, nonblock);
retry:
	sk = netlink_getsockbyportid(ssk, portid);
	if (IS_ERR(sk)) {
		kfree_skb(skb);
		return PTR_ERR(sk);
	}
	if (netlink_is_kernel(sk))
		return netlink_unicast_kernel(sk, skb, ssk);

	if (sk_filter(sk, skb)) {
		err = skb->len;
		kfree_skb(skb);
		sock_put(sk);
		return err;
	}

	err = netlink_attachskb(sk, skb, &timeo, ssk);
	if (err == 1)
		goto retry;
	if (err)
		return err;

	return netlink_sendskb(sk, skb);
}
EXPORT_SYMBOL(netlink_unicast);
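
/* Usage sketch (hypothetical kernel-side caller): reply to the portid that
 * sent a request; the skb is consumed on success and on failure:
 *
 *	err = netlink_unicast(my_kernel_sk, skb,
 *			      NETLINK_CB(req_skb).portid, MSG_DONTWAIT);
 *
 * A negative return is an error; otherwise it is the delivered length.
 */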

struct sk_buff *netlink_alloc_skb(struct sock *ssk, unsigned int size,
				  u32 dst_portid, gfp_t gfp_mask)
{
#ifdef CONFIG_NETLINK_MMAP
	struct sock *sk = NULL;
	struct sk_buff *skb;
	struct netlink_ring *ring;
	struct nl_mmap_hdr *hdr;
	unsigned int maxlen;

	sk = netlink_getsockbyportid(ssk, dst_portid);
	if (IS_ERR(sk))
		goto out;

	ring = &nlk_sk(sk)->rx_ring;
	/* fast-path without atomic ops for common case: non-mmaped receiver */
	if (ring->pg_vec == NULL)
		goto out_put;

	skb = alloc_skb_head(gfp_mask);
	if (skb == NULL)
		goto err1;

	spin_lock_bh(&sk->sk_receive_queue.lock);
	/* check again under lock */
	if (ring->pg_vec == NULL)
		goto out_free;

	maxlen = ring->frame_size - NL_MMAP_HDRLEN;
	if (maxlen < size)
		goto out_free;

	netlink_forward_ring(ring);
	hdr = netlink_current_frame(ring, NL_MMAP_STATUS_UNUSED);
	if (hdr == NULL)
		goto err2;
	netlink_ring_setup_skb(skb, sk, ring, hdr);
	netlink_set_status(hdr, NL_MMAP_STATUS_RESERVED);
	atomic_inc(&ring->pending);
	netlink_increment_head(ring);

	spin_unlock_bh(&sk->sk_receive_queue.lock);
	return skb;

err2:
	kfree_skb(skb);
	spin_unlock_bh(&sk->sk_receive_queue.lock);
	netlink_overrun(sk);
err1:
	sock_put(sk);
	return NULL;

out_free:
	kfree_skb(skb);
	spin_unlock_bh(&sk->sk_receive_queue.lock);
out_put:
	sock_put(sk);
out:
#endif
	return alloc_skb(size, gfp_mask);
}
EXPORT_SYMBOL_GPL(netlink_alloc_skb);

int netlink_has_listeners(struct sock *sk, unsigned int group)
{
	int res = 0;
	struct listeners *listeners;

	BUG_ON(!netlink_is_kernel(sk));

	rcu_read_lock();
	listeners = rcu_dereference(nl_table[sk->sk_protocol].listeners);

	if (listeners && group - 1 < nl_table[sk->sk_protocol].groups)
		res = test_bit(group - 1, listeners->masks);

	rcu_read_unlock();

	return res;
}
EXPORT_SYMBOL_GPL(netlink_has_listeners);
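
/* Typical pattern (sketch, MY_GRP hypothetical): skip building an expensive
 * notification when nobody subscribed:
 *
 *	if (!netlink_has_listeners(my_kernel_sk, MY_GRP))
 *		return;
 */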

static int netlink_broadcast_deliver(struct sock *sk, struct sk_buff *skb)
{
	struct netlink_sock *nlk = nlk_sk(sk);

	if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf &&
	    !test_bit(NETLINK_CONGESTED, &nlk->state)) {
		netlink_skb_set_owner_r(skb, sk);
		__netlink_sendskb(sk, skb);
		return atomic_read(&sk->sk_rmem_alloc) > (sk->sk_rcvbuf >> 1);
	}
	return -1;
}

struct netlink_broadcast_data {
	struct sock *exclude_sk;
	struct net *net;
	u32 portid;
	u32 group;
	int failure;
	int delivery_failure;
	int congested;
	int delivered;
	gfp_t allocation;
	struct sk_buff *skb, *skb2;
	int (*tx_filter)(struct sock *dsk, struct sk_buff *skb, void *data);
	void *tx_data;
};

static int do_one_broadcast(struct sock *sk,
				   struct netlink_broadcast_data *p)
{
	struct netlink_sock *nlk = nlk_sk(sk);
	int val;

	if (p->exclude_sk == sk)
		goto out;

	if (nlk->portid == p->portid || p->group - 1 >= nlk->ngroups ||
	    !test_bit(p->group - 1, nlk->groups))
		goto out;

	if (!net_eq(sock_net(sk), p->net))
		goto out;

	if (p->failure) {
		netlink_overrun(sk);
		goto out;
	}

	sock_hold(sk);
	if (p->skb2 == NULL) {
		if (skb_shared(p->skb)) {
			p->skb2 = skb_clone(p->skb, p->allocation);
		} else {
			p->skb2 = skb_get(p->skb);
			/*
			 * skb ownership may have been set when
			 * delivered to a previous socket.
			 */
			skb_orphan(p->skb2);
		}
	}
	if (p->skb2 == NULL) {
		netlink_overrun(sk);
		/* Clone failed. Notify ALL listeners. */
		p->failure = 1;
		if (nlk->flags & NETLINK_BROADCAST_SEND_ERROR)
			p->delivery_failure = 1;
	} else if (p->tx_filter && p->tx_filter(sk, p->skb2, p->tx_data)) {
		kfree_skb(p->skb2);
		p->skb2 = NULL;
	} else if (sk_filter(sk, p->skb2)) {
		kfree_skb(p->skb2);
		p->skb2 = NULL;
	} else if ((val = netlink_broadcast_deliver(sk, p->skb2)) < 0) {
		netlink_overrun(sk);
		if (nlk->flags & NETLINK_BROADCAST_SEND_ERROR)
			p->delivery_failure = 1;
	} else {
		p->congested |= val;
		p->delivered = 1;
		p->skb2 = NULL;
	}
	sock_put(sk);

out:
	return 0;
}

int netlink_broadcast_filtered(struct sock *ssk, struct sk_buff *skb, u32 portid,
	u32 group, gfp_t allocation,
	int (*filter)(struct sock *dsk, struct sk_buff *skb, void *data),
	void *filter_data)
{
	struct net *net = sock_net(ssk);
	struct netlink_broadcast_data info;
	struct sock *sk;

	skb = netlink_trim(skb, allocation);

	info.exclude_sk = ssk;
	info.net = net;
	info.portid = portid;
	info.group = group;
	info.failure = 0;
	info.delivery_failure = 0;
	info.congested = 0;
	info.delivered = 0;
	info.allocation = allocation;
	info.skb = skb;
	info.skb2 = NULL;
	info.tx_filter = filter;
	info.tx_data = filter_data;

	/* While we sleep in clone, do not allow to change socket list */

	netlink_lock_table();

	sk_for_each_bound(sk, &nl_table[ssk->sk_protocol].mc_list)
		do_one_broadcast(sk, &info);

	consume_skb(skb);

	netlink_unlock_table();

	if (info.delivery_failure) {
		kfree_skb(info.skb2);
		return -ENOBUFS;
	}
	consume_skb(info.skb2);

	if (info.delivered) {
		if (info.congested && (allocation & __GFP_WAIT))
			yield();
		return 0;
	}
	return -ESRCH;
}
EXPORT_SYMBOL(netlink_broadcast_filtered);

int netlink_broadcast(struct sock *ssk, struct sk_buff *skb, u32 portid,
		      u32 group, gfp_t allocation)
{
	return netlink_broadcast_filtered(ssk, skb, portid, group, allocation,
		NULL, NULL);
}
EXPORT_SYMBOL(netlink_broadcast);
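
/*
 * Usage sketch (illustrative, not part of this file): a kernel-side
 * caller typically broadcasts an event skb to a single multicast group
 * and treats -ESRCH (no listeners) as harmless; my_nl_sk and MY_GRP
 * are hypothetical names.
 *
 *	err = netlink_broadcast(my_nl_sk, skb, 0, MY_GRP, GFP_KERNEL);
 *	if (err && err != -ESRCH)
 *		pr_warn("event broadcast failed: %d\n", err);
 */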

struct netlink_set_err_data {
	struct sock *exclude_sk;
	u32 portid;
	u32 group;
	int code;
};

static int do_one_set_err(struct sock *sk, struct netlink_set_err_data *p)
{
	struct netlink_sock *nlk = nlk_sk(sk);
	int ret = 0;

	if (sk == p->exclude_sk)
		goto out;

	if (!net_eq(sock_net(sk), sock_net(p->exclude_sk)))
		goto out;

	if (nlk->portid == p->portid || p->group - 1 >= nlk->ngroups ||
	    !test_bit(p->group - 1, nlk->groups))
		goto out;

	if (p->code == ENOBUFS && nlk->flags & NETLINK_RECV_NO_ENOBUFS) {
		ret = 1;
		goto out;
	}

	sk->sk_err = p->code;
	sk->sk_error_report(sk);
out:
	return ret;
}

/**
 * netlink_set_err - report error to broadcast listeners
 * @ssk: the kernel netlink socket, as returned by netlink_kernel_create()
 * @portid: the PORTID of a process that we want to skip (if any)
 * @group: the broadcast group that will notice the error
 * @code: error code, must be negative (as usual in kernelspace)
 *
 * This function returns the number of broadcast listeners that have set the
 * NETLINK_RECV_NO_ENOBUFS socket option.
 */
int netlink_set_err(struct sock *ssk, u32 portid, u32 group, int code)
{
	struct netlink_set_err_data info;
	struct sock *sk;
	int ret = 0;

	info.exclude_sk = ssk;
	info.portid = portid;
	info.group = group;
	/* sk->sk_err wants a positive error value */
	info.code = -code;

	read_lock(&nl_table_lock);

	sk_for_each_bound(sk, &nl_table[ssk->sk_protocol].mc_list)
		ret += do_one_set_err(sk, &info);

	read_unlock(&nl_table_lock);
	return ret;
}
EXPORT_SYMBOL(netlink_set_err);
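
/*
 * Usage sketch (illustrative, hypothetical names): per the kernel-doc
 * above, @code is passed in negative, so a caller whose broadcast ran
 * out of buffer space might do:
 *
 *	if (netlink_broadcast(my_nl_sk, skb, 0, MY_GRP, GFP_KERNEL) == -ENOBUFS)
 *		netlink_set_err(my_nl_sk, 0, MY_GRP, -ENOBUFS);
 */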
L
Linus Torvalds 已提交
2017

/* must be called with netlink table grabbed */
static void netlink_update_socket_mc(struct netlink_sock *nlk,
				     unsigned int group,
				     int is_new)
{
	int old, new = !!is_new, subscriptions;

	old = test_bit(group - 1, nlk->groups);
	subscriptions = nlk->subscriptions - old + new;
	if (new)
		__set_bit(group - 1, nlk->groups);
	else
		__clear_bit(group - 1, nlk->groups);
	netlink_update_subscriptions(&nlk->sk, subscriptions);
	netlink_update_listeners(&nlk->sk);
}

static int netlink_setsockopt(struct socket *sock, int level, int optname,
			      char __user *optval, unsigned int optlen)
{
	struct sock *sk = sock->sk;
	struct netlink_sock *nlk = nlk_sk(sk);
	unsigned int val = 0;
	int err;

	if (level != SOL_NETLINK)
		return -ENOPROTOOPT;

	if (optname != NETLINK_RX_RING && optname != NETLINK_TX_RING &&
	    optlen >= sizeof(int) &&
	    get_user(val, (unsigned int __user *)optval))
		return -EFAULT;

	switch (optname) {
	case NETLINK_PKTINFO:
		if (val)
			nlk->flags |= NETLINK_RECV_PKTINFO;
		else
			nlk->flags &= ~NETLINK_RECV_PKTINFO;
		err = 0;
		break;
	case NETLINK_ADD_MEMBERSHIP:
	case NETLINK_DROP_MEMBERSHIP: {
		if (!netlink_capable(sock, NL_CFG_F_NONROOT_RECV))
			return -EPERM;
		err = netlink_realloc_groups(sk);
		if (err)
			return err;
		if (!val || val - 1 >= nlk->ngroups)
			return -EINVAL;
		netlink_table_grab();
		netlink_update_socket_mc(nlk, val,
					 optname == NETLINK_ADD_MEMBERSHIP);
		netlink_table_ungrab();

		if (nlk->netlink_bind)
			nlk->netlink_bind(val);

		err = 0;
		break;
	}
	case NETLINK_BROADCAST_ERROR:
		if (val)
			nlk->flags |= NETLINK_BROADCAST_SEND_ERROR;
		else
			nlk->flags &= ~NETLINK_BROADCAST_SEND_ERROR;
		err = 0;
		break;
	case NETLINK_NO_ENOBUFS:
		if (val) {
			nlk->flags |= NETLINK_RECV_NO_ENOBUFS;
			clear_bit(NETLINK_CONGESTED, &nlk->state);
			wake_up_interruptible(&nlk->wait);
		} else {
			nlk->flags &= ~NETLINK_RECV_NO_ENOBUFS;
		}
		err = 0;
		break;
#ifdef CONFIG_NETLINK_MMAP
	case NETLINK_RX_RING:
	case NETLINK_TX_RING: {
		struct nl_mmap_req req;

		/* Rings might consume more memory than queue limits, require
		 * CAP_NET_ADMIN.
		 */
		if (!capable(CAP_NET_ADMIN))
			return -EPERM;
		if (optlen < sizeof(req))
			return -EINVAL;
		if (copy_from_user(&req, optval, sizeof(req)))
			return -EFAULT;
		err = netlink_set_ring(sk, &req, false,
				       optname == NETLINK_TX_RING);
		break;
	}
#endif /* CONFIG_NETLINK_MMAP */
	default:
		err = -ENOPROTOOPT;
	}
	return err;
}
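
/*
 * Usage sketch (illustrative): from userspace these options are plain
 * setsockopt() calls at the SOL_NETLINK level, e.g. joining multicast
 * group 2 on an already-bound netlink socket fd:
 *
 *	int grp = 2;
 *	setsockopt(fd, SOL_NETLINK, NETLINK_ADD_MEMBERSHIP, &grp, sizeof(grp));
 */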

static int netlink_getsockopt(struct socket *sock, int level, int optname,
			      char __user *optval, int __user *optlen)
{
	struct sock *sk = sock->sk;
	struct netlink_sock *nlk = nlk_sk(sk);
	int len, val, err;

	if (level != SOL_NETLINK)
		return -ENOPROTOOPT;

	if (get_user(len, optlen))
		return -EFAULT;
	if (len < 0)
		return -EINVAL;

	switch (optname) {
	case NETLINK_PKTINFO:
		if (len < sizeof(int))
			return -EINVAL;
		len = sizeof(int);
		val = nlk->flags & NETLINK_RECV_PKTINFO ? 1 : 0;
		if (put_user(len, optlen) ||
		    put_user(val, optval))
			return -EFAULT;
		err = 0;
		break;
	case NETLINK_BROADCAST_ERROR:
		if (len < sizeof(int))
			return -EINVAL;
		len = sizeof(int);
		val = nlk->flags & NETLINK_BROADCAST_SEND_ERROR ? 1 : 0;
		if (put_user(len, optlen) ||
		    put_user(val, optval))
			return -EFAULT;
		err = 0;
		break;
	case NETLINK_NO_ENOBUFS:
		if (len < sizeof(int))
			return -EINVAL;
		len = sizeof(int);
		val = nlk->flags & NETLINK_RECV_NO_ENOBUFS ? 1 : 0;
		if (put_user(len, optlen) ||
		    put_user(val, optval))
			return -EFAULT;
		err = 0;
		break;
	default:
		err = -ENOPROTOOPT;
	}
	return err;
}

static void netlink_cmsg_recv_pktinfo(struct msghdr *msg, struct sk_buff *skb)
{
	struct nl_pktinfo info;

	info.group = NETLINK_CB(skb).dst_group;
	put_cmsg(msg, SOL_NETLINK, NETLINK_PKTINFO, sizeof(info), &info);
}

static int netlink_sendmsg(struct kiocb *kiocb, struct socket *sock,
			   struct msghdr *msg, size_t len)
{
	struct sock_iocb *siocb = kiocb_to_siocb(kiocb);
	struct sock *sk = sock->sk;
	struct netlink_sock *nlk = nlk_sk(sk);
	struct sockaddr_nl *addr = msg->msg_name;
	u32 dst_portid;
	u32 dst_group;
	struct sk_buff *skb;
	int err;
	struct scm_cookie scm;

	if (msg->msg_flags&MSG_OOB)
		return -EOPNOTSUPP;

	if (NULL == siocb->scm)
		siocb->scm = &scm;

	err = scm_send(sock, msg, siocb->scm, true);
	if (err < 0)
		return err;

	if (msg->msg_namelen) {
		err = -EINVAL;
		if (addr->nl_family != AF_NETLINK)
			goto out;
		dst_portid = addr->nl_pid;
		dst_group = ffs(addr->nl_groups);
		err = -EPERM;
		if ((dst_group || dst_portid) &&
		    !netlink_capable(sock, NL_CFG_F_NONROOT_SEND))
			goto out;
	} else {
		dst_portid = nlk->dst_portid;
		dst_group = nlk->dst_group;
	}

	if (!nlk->portid) {
		err = netlink_autobind(sock);
		if (err)
			goto out;
	}

	if (netlink_tx_is_mmaped(sk) &&
	    msg->msg_iov->iov_base == NULL) {
		err = netlink_mmap_sendmsg(sk, msg, dst_portid, dst_group,
					   siocb);
		goto out;
	}

	err = -EMSGSIZE;
	if (len > sk->sk_sndbuf - 32)
		goto out;
	err = -ENOBUFS;
	skb = netlink_alloc_large_skb(len, dst_group);
	if (skb == NULL)
		goto out;

	NETLINK_CB(skb).portid	= nlk->portid;
	NETLINK_CB(skb).dst_group = dst_group;
	NETLINK_CB(skb).creds	= siocb->scm->creds;

	err = -EFAULT;
	if (memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len)) {
		kfree_skb(skb);
		goto out;
	}

	err = security_netlink_send(sk, skb);
	if (err) {
		kfree_skb(skb);
		goto out;
	}

	if (dst_group) {
		atomic_inc(&skb->users);
		netlink_broadcast(sk, skb, dst_portid, dst_group, GFP_KERNEL);
	}
	err = netlink_unicast(sk, skb, dst_portid, msg->msg_flags&MSG_DONTWAIT);

out:
	scm_destroy(siocb->scm);
	return err;
}

static int netlink_recvmsg(struct kiocb *kiocb, struct socket *sock,
			   struct msghdr *msg, size_t len,
			   int flags)
{
	struct sock_iocb *siocb = kiocb_to_siocb(kiocb);
	struct scm_cookie scm;
	struct sock *sk = sock->sk;
	struct netlink_sock *nlk = nlk_sk(sk);
	int noblock = flags&MSG_DONTWAIT;
	size_t copied;
	struct sk_buff *skb, *data_skb;
	int err, ret;

	if (flags&MSG_OOB)
		return -EOPNOTSUPP;

	copied = 0;

	skb = skb_recv_datagram(sk, flags, noblock, &err);
	if (skb == NULL)
		goto out;

	data_skb = skb;

#ifdef CONFIG_COMPAT_NETLINK_MESSAGES
	if (unlikely(skb_shinfo(skb)->frag_list)) {
		/*
		 * If this skb has a frag_list, it means that we will
		 * have to use the frag_list skb's data for compat tasks
		 * and the regular skb's data for normal (non-compat) tasks.
		 *
		 * If we need to send the compat skb, assign it to the
		 * 'data_skb' variable so that it will be used below for data
		 * copying. We keep 'skb' for everything else, including
		 * freeing both later.
		 */
		if (flags & MSG_CMSG_COMPAT)
			data_skb = skb_shinfo(skb)->frag_list;
	}
#endif

	msg->msg_namelen = 0;

	copied = data_skb->len;
	if (len < copied) {
		msg->msg_flags |= MSG_TRUNC;
		copied = len;
	}

	skb_reset_transport_header(data_skb);
	err = skb_copy_datagram_iovec(data_skb, 0, msg->msg_iov, copied);

	if (msg->msg_name) {
		struct sockaddr_nl *addr = (struct sockaddr_nl *)msg->msg_name;
		addr->nl_family = AF_NETLINK;
		addr->nl_pad    = 0;
		addr->nl_pid	= NETLINK_CB(skb).portid;
		addr->nl_groups	= netlink_group_mask(NETLINK_CB(skb).dst_group);
		msg->msg_namelen = sizeof(*addr);
	}

	if (nlk->flags & NETLINK_RECV_PKTINFO)
		netlink_cmsg_recv_pktinfo(msg, skb);

	if (NULL == siocb->scm) {
		memset(&scm, 0, sizeof(scm));
		siocb->scm = &scm;
	}
	siocb->scm->creds = *NETLINK_CREDS(skb);
	if (flags & MSG_TRUNC)
		copied = data_skb->len;

	skb_free_datagram(sk, skb);

	if (nlk->cb_running &&
	    atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf / 2) {
		ret = netlink_dump(sk);
		if (ret) {
			sk->sk_err = ret;
			sk->sk_error_report(sk);
		}
	}

	scm_recv(sock, msg, siocb->scm, flags);
out:
	netlink_rcv_wake(sk);
	return err ? : copied;
}

static void netlink_data_ready(struct sock *sk, int len)
{
	BUG();
}

/*
 *	We export these functions to other modules. They provide a
 *	complete set of kernel non-blocking support for message
 *	queueing.
 */

struct sock *
__netlink_kernel_create(struct net *net, int unit, struct module *module,
			struct netlink_kernel_cfg *cfg)
{
	struct socket *sock;
	struct sock *sk;
	struct netlink_sock *nlk;
	struct listeners *listeners = NULL;
	struct mutex *cb_mutex = cfg ? cfg->cb_mutex : NULL;
	unsigned int groups;

	BUG_ON(!nl_table);

	if (unit < 0 || unit >= MAX_LINKS)
		return NULL;

	if (sock_create_lite(PF_NETLINK, SOCK_DGRAM, unit, &sock))
		return NULL;

	/*
	 * We have to just have a reference on the net from sk, but don't
	 * get_net it. Besides, we cannot get and then put the net here.
	 * So we create one inside init_net and then move it to net.
	 */

	if (__netlink_create(&init_net, sock, cb_mutex, unit) < 0)
		goto out_sock_release_nosk;

	sk = sock->sk;
	sk_change_net(sk, net);

	if (!cfg || cfg->groups < 32)
		groups = 32;
	else
		groups = cfg->groups;

	listeners = kzalloc(sizeof(*listeners) + NLGRPSZ(groups), GFP_KERNEL);
	if (!listeners)
		goto out_sock_release;

	sk->sk_data_ready = netlink_data_ready;
	if (cfg && cfg->input)
		nlk_sk(sk)->netlink_rcv = cfg->input;

	if (netlink_insert(sk, net, 0))
		goto out_sock_release;

	nlk = nlk_sk(sk);
	nlk->flags |= NETLINK_KERNEL_SOCKET;

	netlink_table_grab();
	if (!nl_table[unit].registered) {
		nl_table[unit].groups = groups;
		rcu_assign_pointer(nl_table[unit].listeners, listeners);
		nl_table[unit].cb_mutex = cb_mutex;
		nl_table[unit].module = module;
		if (cfg) {
			nl_table[unit].bind = cfg->bind;
			nl_table[unit].flags = cfg->flags;
			if (cfg->compare)
				nl_table[unit].compare = cfg->compare;
		}
		nl_table[unit].registered = 1;
	} else {
		kfree(listeners);
		nl_table[unit].registered++;
	}
	netlink_table_ungrab();
	return sk;

out_sock_release:
	kfree(listeners);
	netlink_kernel_release(sk);
	return NULL;

out_sock_release_nosk:
	sock_release(sock);
	return NULL;
}
EXPORT_SYMBOL(__netlink_kernel_create);

void
netlink_kernel_release(struct sock *sk)
{
	sk_release_kernel(sk);
}
EXPORT_SYMBOL(netlink_kernel_release);
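
/*
 * Usage sketch (illustrative): protocol families normally go through
 * the netlink_kernel_create() wrapper from <linux/netlink.h>, which
 * passes THIS_MODULE to __netlink_kernel_create(); MY_PROTO and
 * my_nl_rcv are hypothetical.
 *
 *	struct netlink_kernel_cfg cfg = {
 *		.groups	= 32,
 *		.input	= my_nl_rcv,
 *	};
 *	struct sock *sk = netlink_kernel_create(&init_net, MY_PROTO, &cfg);
 *	if (!sk)
 *		return -ENOMEM;
 */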

int __netlink_change_ngroups(struct sock *sk, unsigned int groups)
{
	struct listeners *new, *old;
	struct netlink_table *tbl = &nl_table[sk->sk_protocol];

	if (groups < 32)
		groups = 32;

	if (NLGRPSZ(tbl->groups) < NLGRPSZ(groups)) {
		new = kzalloc(sizeof(*new) + NLGRPSZ(groups), GFP_ATOMIC);
		if (!new)
			return -ENOMEM;
		old = nl_deref_protected(tbl->listeners);
		memcpy(new->masks, old->masks, NLGRPSZ(tbl->groups));
		rcu_assign_pointer(tbl->listeners, new);

		kfree_rcu(old, rcu);
	}
	tbl->groups = groups;

	return 0;
}

/**
 * netlink_change_ngroups - change number of multicast groups
 *
 * This changes the number of multicast groups that are available
 * on a certain netlink family. Note that it is not possible to
 * change the number of groups to below 32. Also note that it does
 * not implicitly call netlink_clear_multicast_users() when the
 * number of groups is reduced.
 *
 * @sk: The kernel netlink socket, as returned by netlink_kernel_create().
 * @groups: The new number of groups.
 */
int netlink_change_ngroups(struct sock *sk, unsigned int groups)
{
	int err;

	netlink_table_grab();
	err = __netlink_change_ngroups(sk, groups);
	netlink_table_ungrab();

	return err;
}

void __netlink_clear_multicast_users(struct sock *ksk, unsigned int group)
{
	struct sock *sk;
	struct netlink_table *tbl = &nl_table[ksk->sk_protocol];

	sk_for_each_bound(sk, &tbl->mc_list)
		netlink_update_socket_mc(nlk_sk(sk), group, 0);
}

/**
 * netlink_clear_multicast_users - kick off multicast listeners
 *
 * This function removes all listeners from the given group.
 * @ksk: The kernel netlink socket, as returned by
 *	netlink_kernel_create().
 * @group: The multicast group to clear.
 */
void netlink_clear_multicast_users(struct sock *ksk, unsigned int group)
{
	netlink_table_grab();
	__netlink_clear_multicast_users(ksk, group);
	netlink_table_ungrab();
}

struct nlmsghdr *
__nlmsg_put(struct sk_buff *skb, u32 portid, u32 seq, int type, int len, int flags)
{
	struct nlmsghdr *nlh;
	int size = nlmsg_msg_size(len);

	nlh = (struct nlmsghdr *)skb_put(skb, NLMSG_ALIGN(size));
	nlh->nlmsg_type = type;
	nlh->nlmsg_len = size;
	nlh->nlmsg_flags = flags;
	nlh->nlmsg_pid = portid;
	nlh->nlmsg_seq = seq;
	if (!__builtin_constant_p(size) || NLMSG_ALIGN(size) - size != 0)
		memset(nlmsg_data(nlh) + len, 0, NLMSG_ALIGN(size) - size);
	return nlh;
}
EXPORT_SYMBOL(__nlmsg_put);
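
/*
 * Usage sketch (illustrative): most message builders call the
 * nlmsg_put() wrapper from <net/netlink.h>, which returns NULL when
 * the skb has no room instead of writing past its tail; MY_CMD and
 * struct my_hdr are hypothetical.
 *
 *	nlh = nlmsg_put(skb, portid, seq, MY_CMD, sizeof(struct my_hdr), 0);
 *	if (!nlh)
 *		return -EMSGSIZE;
 */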

/*
 * It looks a bit ugly.
 * It would be better to create a kernel thread.
 */

static int netlink_dump(struct sock *sk)
{
	struct netlink_sock *nlk = nlk_sk(sk);
	struct netlink_callback *cb;
	struct sk_buff *skb = NULL;
	struct nlmsghdr *nlh;
	int len, err = -ENOBUFS;
	int alloc_size;

	mutex_lock(nlk->cb_mutex);
	if (!nlk->cb_running) {
		err = -EINVAL;
		goto errout_skb;
	}

	cb = &nlk->cb;
	alloc_size = max_t(int, cb->min_dump_alloc, NLMSG_GOODSIZE);

	if (!netlink_rx_is_mmaped(sk) &&
	    atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
		goto errout_skb;
	skb = netlink_alloc_skb(sk, alloc_size, nlk->portid, GFP_KERNEL);
	if (!skb)
		goto errout_skb;
	netlink_skb_set_owner_r(skb, sk);

	len = cb->dump(skb, cb);

	if (len > 0) {
		mutex_unlock(nlk->cb_mutex);

		if (sk_filter(sk, skb))
			kfree_skb(skb);
		else
			__netlink_sendskb(sk, skb);
		return 0;
	}

	nlh = nlmsg_put_answer(skb, cb, NLMSG_DONE, sizeof(len), NLM_F_MULTI);
	if (!nlh)
		goto errout_skb;

	nl_dump_check_consistent(cb, nlh);

	memcpy(nlmsg_data(nlh), &len, sizeof(len));

	if (sk_filter(sk, skb))
		kfree_skb(skb);
	else
		__netlink_sendskb(sk, skb);

	if (cb->done)
		cb->done(cb);

	nlk->cb_running = false;
	mutex_unlock(nlk->cb_mutex);
	module_put(cb->module);
	consume_skb(cb->skb);
	return 0;

errout_skb:
	mutex_unlock(nlk->cb_mutex);
	kfree_skb(skb);
	return err;
}

int __netlink_dump_start(struct sock *ssk, struct sk_buff *skb,
			 const struct nlmsghdr *nlh,
			 struct netlink_dump_control *control)
{
	struct netlink_callback *cb;
	struct sock *sk;
	struct netlink_sock *nlk;
	int ret;

	/* Memory mapped dump requests need to be copied to avoid looping
	 * on the pending state in netlink_mmap_sendmsg() while the CB holds
	 * a reference to the skb.
	 */
	if (netlink_skb_is_mmaped(skb)) {
		skb = skb_copy(skb, GFP_KERNEL);
		if (skb == NULL)
			return -ENOBUFS;
	} else
		atomic_inc(&skb->users);

	sk = netlink_lookup(sock_net(ssk), ssk->sk_protocol, NETLINK_CB(skb).portid);
	if (sk == NULL) {
		ret = -ECONNREFUSED;
		goto error_free;
	}

	nlk = nlk_sk(sk);
	mutex_lock(nlk->cb_mutex);
	/* A dump is in progress... */
	if (nlk->cb_running) {
		ret = -EBUSY;
		goto error_unlock;
	}
	/* add reference of module which cb->dump belongs to */
	if (!try_module_get(control->module)) {
		ret = -EPROTONOSUPPORT;
		goto error_unlock;
	}

	cb = &nlk->cb;
	memset(cb, 0, sizeof(*cb));
	cb->dump = control->dump;
	cb->done = control->done;
	cb->nlh = nlh;
	cb->data = control->data;
	cb->module = control->module;
	cb->min_dump_alloc = control->min_dump_alloc;
	cb->skb = skb;

	nlk->cb_running = true;

	mutex_unlock(nlk->cb_mutex);

	ret = netlink_dump(sk);
	sock_put(sk);

	if (ret)
		return ret;

	/* We successfully started a dump, by returning -EINTR we
	 * signal not to send ACK even if it was requested.
	 */
	return -EINTR;

error_unlock:
	sock_put(sk);
	mutex_unlock(nlk->cb_mutex);
error_free:
	kfree_skb(skb);
	return ret;
}
EXPORT_SYMBOL(__netlink_dump_start);
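
/*
 * Usage sketch (illustrative): request handlers normally start a dump
 * via the netlink_dump_start() wrapper from <linux/netlink.h> and
 * propagate its return value, relying on the -EINTR convention noted
 * above; my_dump is hypothetical.
 *
 *	if (nlh->nlmsg_flags & NLM_F_DUMP) {
 *		struct netlink_dump_control c = {
 *			.dump = my_dump,
 *		};
 *		return netlink_dump_start(my_nl_sk, skb, nlh, &c);
 *	}
 */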

void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err)
{
	struct sk_buff *skb;
	struct nlmsghdr *rep;
	struct nlmsgerr *errmsg;
	size_t payload = sizeof(*errmsg);

	/* error messages get the original request appended */
	if (err)
		payload += nlmsg_len(nlh);

	skb = netlink_alloc_skb(in_skb->sk, nlmsg_total_size(payload),
				NETLINK_CB(in_skb).portid, GFP_KERNEL);
	if (!skb) {
		struct sock *sk;

		sk = netlink_lookup(sock_net(in_skb->sk),
				    in_skb->sk->sk_protocol,
				    NETLINK_CB(in_skb).portid);
		if (sk) {
			sk->sk_err = ENOBUFS;
			sk->sk_error_report(sk);
			sock_put(sk);
		}
		return;
	}

	rep = __nlmsg_put(skb, NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
			  NLMSG_ERROR, payload, 0);
	errmsg = nlmsg_data(rep);
	errmsg->error = err;
	memcpy(&errmsg->msg, nlh, err ? nlh->nlmsg_len : sizeof(*nlh));
	netlink_unicast(in_skb->sk, skb, NETLINK_CB(in_skb).portid, MSG_DONTWAIT);
}
EXPORT_SYMBOL(netlink_ack);

int netlink_rcv_skb(struct sk_buff *skb, int (*cb)(struct sk_buff *,
						     struct nlmsghdr *))
{
	struct nlmsghdr *nlh;
	int err;

	while (skb->len >= nlmsg_total_size(0)) {
		int msglen;

		nlh = nlmsg_hdr(skb);
		err = 0;

		if (nlh->nlmsg_len < NLMSG_HDRLEN || skb->len < nlh->nlmsg_len)
			return 0;

		/* Only requests are handled by the kernel */
		if (!(nlh->nlmsg_flags & NLM_F_REQUEST))
			goto ack;

		/* Skip control messages */
		if (nlh->nlmsg_type < NLMSG_MIN_TYPE)
			goto ack;

		err = cb(skb, nlh);
		if (err == -EINTR)
			goto skip;

ack:
		if (nlh->nlmsg_flags & NLM_F_ACK || err)
			netlink_ack(skb, nlh, err);

skip:
		msglen = NLMSG_ALIGN(nlh->nlmsg_len);
		if (msglen > skb->len)
			msglen = skb->len;
		skb_pull(skb, msglen);
	}

	return 0;
}
EXPORT_SYMBOL(netlink_rcv_skb);
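
/*
 * Usage sketch (illustrative): a kernel socket's input callback usually
 * just feeds each queued skb to netlink_rcv_skb() with a per-message
 * handler; my_rcv_msg and my_nl_rcv are hypothetical.
 *
 *	static int my_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 *	{
 *		switch (nlh->nlmsg_type) { ... }
 *	}
 *
 *	static void my_nl_rcv(struct sk_buff *skb)
 *	{
 *		netlink_rcv_skb(skb, &my_rcv_msg);
 *	}
 */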

/**
 * nlmsg_notify - send a notification netlink message
 * @sk: netlink socket to use
 * @skb: notification message
 * @portid: destination netlink portid for reports or 0
 * @group: destination multicast group or 0
 * @report: 1 to report back, 0 to disable
 * @flags: allocation flags
 */
int nlmsg_notify(struct sock *sk, struct sk_buff *skb, u32 portid,
		 unsigned int group, int report, gfp_t flags)
{
	int err = 0;

	if (group) {
		int exclude_portid = 0;

		if (report) {
			atomic_inc(&skb->users);
			exclude_portid = portid;
		}

		/* errors reported via destination sk->sk_err, but propagate
		 * delivery errors if NETLINK_BROADCAST_ERROR flag is set */
		err = nlmsg_multicast(sk, skb, exclude_portid, group, flags);
	}

	if (report) {
		int err2;

		err2 = nlmsg_unicast(sk, skb, portid);
		if (!err || err == -ESRCH)
			err = err2;
	}

	return err;
}
EXPORT_SYMBOL(nlmsg_notify);
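
/*
 * Usage sketch (illustrative): rtnetlink-style notifiers pass the
 * requester's portid and turn NLM_F_ECHO into @report so the requester
 * also gets a unicast copy; the names here are hypothetical.
 *
 *	err = nlmsg_notify(my_nl_sk, skb, portid, MY_GRP,
 *			   nlh->nlmsg_flags & NLM_F_ECHO, GFP_KERNEL);
 */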

#ifdef CONFIG_PROC_FS
struct nl_seq_iter {
	struct seq_net_private p;
	int link;
	int hash_idx;
};

static struct sock *netlink_seq_socket_idx(struct seq_file *seq, loff_t pos)
{
	struct nl_seq_iter *iter = seq->private;
	int i, j;
	struct sock *s;
	loff_t off = 0;

	for (i = 0; i < MAX_LINKS; i++) {
		struct nl_portid_hash *hash = &nl_table[i].hash;

		for (j = 0; j <= hash->mask; j++) {
			sk_for_each(s, &hash->table[j]) {
				if (sock_net(s) != seq_file_net(seq))
					continue;
				if (off == pos) {
					iter->link = i;
					iter->hash_idx = j;
					return s;
				}
				++off;
			}
		}
	}
	return NULL;
}

static void *netlink_seq_start(struct seq_file *seq, loff_t *pos)
	__acquires(nl_table_lock)
{
	read_lock(&nl_table_lock);
	return *pos ? netlink_seq_socket_idx(seq, *pos - 1) : SEQ_START_TOKEN;
}

static void *netlink_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct sock *s;
	struct nl_seq_iter *iter;
	struct net *net;
	int i, j;

	++*pos;

	if (v == SEQ_START_TOKEN)
		return netlink_seq_socket_idx(seq, 0);

	net = seq_file_net(seq);
	iter = seq->private;
	s = v;
	do {
		s = sk_next(s);
	} while (s && !nl_table[s->sk_protocol].compare(net, s));
	if (s)
		return s;

	i = iter->link;
	j = iter->hash_idx + 1;

	do {
		struct nl_portid_hash *hash = &nl_table[i].hash;

		for (; j <= hash->mask; j++) {
			s = sk_head(&hash->table[j]);

			while (s && !nl_table[s->sk_protocol].compare(net, s))
				s = sk_next(s);
			if (s) {
				iter->link = i;
				iter->hash_idx = j;
				return s;
			}
		}

		j = 0;
	} while (++i < MAX_LINKS);

	return NULL;
}

static void netlink_seq_stop(struct seq_file *seq, void *v)
	__releases(nl_table_lock)
{
	read_unlock(&nl_table_lock);
}


static int netlink_seq_show(struct seq_file *seq, void *v)
{
	if (v == SEQ_START_TOKEN) {
		seq_puts(seq,
			 "sk       Eth Pid    Groups   "
			 "Rmem     Wmem     Dump     Locks     Drops     Inode\n");
	} else {
		struct sock *s = v;
		struct netlink_sock *nlk = nlk_sk(s);

		seq_printf(seq, "%pK %-3d %-6u %08x %-8d %-8d %d %-8d %-8d %-8lu\n",
			   s,
			   s->sk_protocol,
			   nlk->portid,
			   nlk->groups ? (u32)nlk->groups[0] : 0,
			   sk_rmem_alloc_get(s),
			   sk_wmem_alloc_get(s),
			   nlk->cb_running,
			   atomic_read(&s->sk_refcnt),
			   atomic_read(&s->sk_drops),
			   sock_i_ino(s)
			);

	}
	return 0;
}

static const struct seq_operations netlink_seq_ops = {
	.start  = netlink_seq_start,
	.next   = netlink_seq_next,
	.stop   = netlink_seq_stop,
	.show   = netlink_seq_show,
};


static int netlink_seq_open(struct inode *inode, struct file *file)
{
	return seq_open_net(inode, file, &netlink_seq_ops,
				sizeof(struct nl_seq_iter));
}

static const struct file_operations netlink_seq_fops = {
	.owner		= THIS_MODULE,
	.open		= netlink_seq_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release_net,
};

#endif

int netlink_register_notifier(struct notifier_block *nb)
{
	return atomic_notifier_chain_register(&netlink_chain, nb);
}
EXPORT_SYMBOL(netlink_register_notifier);

int netlink_unregister_notifier(struct notifier_block *nb)
{
	return atomic_notifier_chain_unregister(&netlink_chain, nb);
}
EXPORT_SYMBOL(netlink_unregister_notifier);
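
/*
 * Usage sketch (illustrative): subsystems typically register a notifier
 * to catch NETLINK_URELEASE and drop per-socket state; my_nl_notify and
 * my_nl_nb are hypothetical.
 *
 *	static int my_nl_notify(struct notifier_block *nb,
 *				unsigned long state, void *ptr)
 *	{
 *		struct netlink_notify *n = ptr;
 *
 *		if (state == NETLINK_URELEASE)
 *			... forget n->portid for n->protocol ...
 *		return NOTIFY_DONE;
 *	}
 *
 *	static struct notifier_block my_nl_nb = {
 *		.notifier_call = my_nl_notify,
 *	};
 */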

static const struct proto_ops netlink_ops = {
	.family =	PF_NETLINK,
	.owner =	THIS_MODULE,
	.release =	netlink_release,
	.bind =		netlink_bind,
	.connect =	netlink_connect,
	.socketpair =	sock_no_socketpair,
	.accept =	sock_no_accept,
	.getname =	netlink_getname,
	.poll =		netlink_poll,
	.ioctl =	sock_no_ioctl,
	.listen =	sock_no_listen,
	.shutdown =	sock_no_shutdown,
	.setsockopt =	netlink_setsockopt,
	.getsockopt =	netlink_getsockopt,
	.sendmsg =	netlink_sendmsg,
	.recvmsg =	netlink_recvmsg,
	.mmap =		netlink_mmap,
	.sendpage =	sock_no_sendpage,
};

static const struct net_proto_family netlink_family_ops = {
	.family = PF_NETLINK,
	.create = netlink_create,
	.owner	= THIS_MODULE,	/* for consistency 8) */
};

static int __net_init netlink_net_init(struct net *net)
{
#ifdef CONFIG_PROC_FS
	if (!proc_create("netlink", 0, net->proc_net, &netlink_seq_fops))
		return -ENOMEM;
#endif
	return 0;
}

static void __net_exit netlink_net_exit(struct net *net)
{
#ifdef CONFIG_PROC_FS
	remove_proc_entry("netlink", net->proc_net);
#endif
}

static void __init netlink_add_usersock_entry(void)
{
	struct listeners *listeners;
	int groups = 32;

	listeners = kzalloc(sizeof(*listeners) + NLGRPSZ(groups), GFP_KERNEL);
	if (!listeners)
		panic("netlink_add_usersock_entry: Cannot allocate listeners\n");

	netlink_table_grab();

	nl_table[NETLINK_USERSOCK].groups = groups;
	rcu_assign_pointer(nl_table[NETLINK_USERSOCK].listeners, listeners);
	nl_table[NETLINK_USERSOCK].module = THIS_MODULE;
	nl_table[NETLINK_USERSOCK].registered = 1;
	nl_table[NETLINK_USERSOCK].flags = NL_CFG_F_NONROOT_SEND;

	netlink_table_ungrab();
}

static struct pernet_operations __net_initdata netlink_net_ops = {
	.init = netlink_net_init,
	.exit = netlink_net_exit,
};

static int __init netlink_proto_init(void)
{
	int i;
	unsigned long limit;
	unsigned int order;
	int err = proto_register(&netlink_proto, 0);

	if (err != 0)
		goto out;

	BUILD_BUG_ON(sizeof(struct netlink_skb_parms) > FIELD_SIZEOF(struct sk_buff, cb));

	nl_table = kcalloc(MAX_LINKS, sizeof(*nl_table), GFP_KERNEL);
	if (!nl_table)
		goto panic;

	if (totalram_pages >= (128 * 1024))
		limit = totalram_pages >> (21 - PAGE_SHIFT);
	else
		limit = totalram_pages >> (23 - PAGE_SHIFT);

	order = get_bitmask_order(limit) - 1 + PAGE_SHIFT;
	limit = (1UL << order) / sizeof(struct hlist_head);
	order = get_bitmask_order(min(limit, (unsigned long)UINT_MAX)) - 1;

	for (i = 0; i < MAX_LINKS; i++) {
		struct nl_portid_hash *hash = &nl_table[i].hash;

		hash->table = nl_portid_hash_zalloc(1 * sizeof(*hash->table));
		if (!hash->table) {
			while (i-- > 0)
				nl_portid_hash_free(nl_table[i].hash.table,
						 1 * sizeof(*hash->table));
			kfree(nl_table);
			goto panic;
		}
		hash->max_shift = order;
		hash->shift = 0;
		hash->mask = 0;
		hash->rehash_time = jiffies;

		nl_table[i].compare = netlink_compare;
	}

	INIT_LIST_HEAD(&netlink_tap_all);

	netlink_add_usersock_entry();

	sock_register(&netlink_family_ops);
	register_pernet_subsys(&netlink_net_ops);
	/* The netlink device handler may be needed early. */
	rtnetlink_init();
out:
	return err;
panic:
	panic("netlink_init: Cannot allocate nl_table\n");
}

core_initcall(netlink_proto_init);