/*
 * NETLINK      Kernel-user communication protocol.
 *
 * 		Authors:	Alan Cox <alan@lxorguk.ukuu.org.uk>
 * 				Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
 * 				Patrick McHardy <kaber@trash.net>
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Tue Jun 26 14:36:48 MEST 2001 Herbert "herp" Rosmanith
 *                               added netlink_proto_exit
 * Tue Jan 22 18:32:44 BRST 2002 Arnaldo C. de Melo <acme@conectiva.com.br>
 * 				 use nlk_sk, as sk->protinfo is on a diet 8)
 * Fri Jul 22 19:51:12 MEST 2005 Harald Welte <laforge@gnumonks.org>
 * 				 - inc module use count of module that owns
 * 				   the kernel socket in case userspace opens
 * 				   socket of same protocol
 * 				 - remove all module support, since netlink is
 * 				   mandatory if CONFIG_NET=y these days
 */

#include <linux/module.h>

#include <linux/capability.h>
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/stat.h>
#include <linux/socket.h>
#include <linux/un.h>
#include <linux/fcntl.h>
#include <linux/termios.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/fs.h>
#include <linux/slab.h>
#include <asm/uaccess.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/rtnetlink.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/notifier.h>
#include <linux/security.h>
#include <linux/jhash.h>
#include <linux/jiffies.h>
#include <linux/random.h>
#include <linux/bitops.h>
#include <linux/mm.h>
#include <linux/types.h>
#include <linux/audit.h>
#include <linux/mutex.h>
#include <linux/vmalloc.h>
#include <linux/if_arp.h>
#include <linux/rhashtable.h>
#include <asm/cacheflush.h>
#include <linux/hash.h>

#include <net/net_namespace.h>
#include <net/sock.h>
#include <net/scm.h>
#include <net/netlink.h>

#include "af_netlink.h"

struct listeners {
	struct rcu_head		rcu;
	unsigned long		masks[0];
};

/* state bits */
#define NETLINK_CONGESTED	0x0

/* flags */
#define NETLINK_KERNEL_SOCKET	0x1
#define NETLINK_RECV_PKTINFO	0x2
#define NETLINK_BROADCAST_SEND_ERROR	0x4
#define NETLINK_RECV_NO_ENOBUFS	0x8

static inline int netlink_is_kernel(struct sock *sk)
{
	return nlk_sk(sk)->flags & NETLINK_KERNEL_SOCKET;
}

struct netlink_table *nl_table;
EXPORT_SYMBOL_GPL(nl_table);

static DECLARE_WAIT_QUEUE_HEAD(nl_table_wait);

static int netlink_dump(struct sock *sk);
static void netlink_skb_destructor(struct sk_buff *skb);

DEFINE_RWLOCK(nl_table_lock);
EXPORT_SYMBOL_GPL(nl_table_lock);
static atomic_t nl_table_users = ATOMIC_INIT(0);

#define nl_deref_protected(X) rcu_dereference_protected(X, lockdep_is_held(&nl_table_lock));

/* Protects netlink socket hash table mutations */
DEFINE_MUTEX(nl_sk_hash_lock);
EXPORT_SYMBOL_GPL(nl_sk_hash_lock);

static int lockdep_nl_sk_hash_is_held(void)
{
#ifdef CONFIG_LOCKDEP
	return (debug_locks) ? lockdep_is_held(&nl_sk_hash_lock) : 1;
#else
	return 1;
#endif
}

static ATOMIC_NOTIFIER_HEAD(netlink_chain);

static DEFINE_SPINLOCK(netlink_tap_lock);
static struct list_head netlink_tap_all __read_mostly;

static inline u32 netlink_group_mask(u32 group)
{
	return group ? 1 << (group - 1) : 0;
}

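/* Netlink taps: every message that passes the protocol whitelist in
 * netlink_filter_tap() is cloned to each registered ARPHRD_NETLINK
 * device (e.g. an nlmon interface) so the traffic can be captured.
 */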
int netlink_add_tap(struct netlink_tap *nt)
{
	if (unlikely(nt->dev->type != ARPHRD_NETLINK))
		return -EINVAL;

	spin_lock(&netlink_tap_lock);
	list_add_rcu(&nt->list, &netlink_tap_all);
	spin_unlock(&netlink_tap_lock);

	if (nt->module)
		__module_get(nt->module);

	return 0;
}
EXPORT_SYMBOL_GPL(netlink_add_tap);

static int __netlink_remove_tap(struct netlink_tap *nt)
{
	bool found = false;
	struct netlink_tap *tmp;

	spin_lock(&netlink_tap_lock);

	list_for_each_entry(tmp, &netlink_tap_all, list) {
		if (nt == tmp) {
			list_del_rcu(&nt->list);
			found = true;
			goto out;
		}
	}

	pr_warn("__netlink_remove_tap: %p not found\n", nt);
out:
	spin_unlock(&netlink_tap_lock);

	if (found && nt->module)
		module_put(nt->module);

	return found ? 0 : -ENODEV;
}

int netlink_remove_tap(struct netlink_tap *nt)
{
	int ret;

	ret = __netlink_remove_tap(nt);
	synchronize_net();

	return ret;
}
EXPORT_SYMBOL_GPL(netlink_remove_tap);

static bool netlink_filter_tap(const struct sk_buff *skb)
{
	struct sock *sk = skb->sk;

	/* We take the more conservative approach and
	 * whitelist socket protocols that may pass.
	 */
	switch (sk->sk_protocol) {
	case NETLINK_ROUTE:
	case NETLINK_USERSOCK:
	case NETLINK_SOCK_DIAG:
	case NETLINK_NFLOG:
	case NETLINK_XFRM:
	case NETLINK_FIB_LOOKUP:
	case NETLINK_NETFILTER:
	case NETLINK_GENERIC:
		return true;
	}

	return false;
}

static int __netlink_deliver_tap_skb(struct sk_buff *skb,
				     struct net_device *dev)
{
	struct sk_buff *nskb;
	struct sock *sk = skb->sk;
	int ret = -ENOMEM;

	dev_hold(dev);
	nskb = skb_clone(skb, GFP_ATOMIC);
	if (nskb) {
		nskb->dev = dev;
		nskb->protocol = htons((u16) sk->sk_protocol);
		nskb->pkt_type = netlink_is_kernel(sk) ?
				 PACKET_KERNEL : PACKET_USER;
		skb_reset_network_header(nskb);
		ret = dev_queue_xmit(nskb);
		if (unlikely(ret > 0))
			ret = net_xmit_errno(ret);
	}

	dev_put(dev);
	return ret;
}

static void __netlink_deliver_tap(struct sk_buff *skb)
{
	int ret;
	struct netlink_tap *tmp;

	if (!netlink_filter_tap(skb))
		return;

	list_for_each_entry_rcu(tmp, &netlink_tap_all, list) {
		ret = __netlink_deliver_tap_skb(skb, tmp->dev);
		if (unlikely(ret))
			break;
	}
}

static void netlink_deliver_tap(struct sk_buff *skb)
{
	rcu_read_lock();

	if (unlikely(!list_empty(&netlink_tap_all)))
		__netlink_deliver_tap(skb);

	rcu_read_unlock();
}

static void netlink_deliver_tap_kernel(struct sock *dst, struct sock *src,
				       struct sk_buff *skb)
{
	if (!(netlink_is_kernel(dst) && netlink_is_kernel(src)))
		netlink_deliver_tap(skb);
}

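/* Receive queue overflow: unless the owner opted out with
 * NETLINK_RECV_NO_ENOBUFS, flag the socket congested, report ENOBUFS
 * once per congestion period and account the drop.
 */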
static void netlink_overrun(struct sock *sk)
{
	struct netlink_sock *nlk = nlk_sk(sk);

	if (!(nlk->flags & NETLINK_RECV_NO_ENOBUFS)) {
		if (!test_and_set_bit(NETLINK_CONGESTED, &nlk_sk(sk)->state)) {
			sk->sk_err = ENOBUFS;
			sk->sk_error_report(sk);
		}
	}
	atomic_inc(&sk->sk_drops);
}

static void netlink_rcv_wake(struct sock *sk)
{
	struct netlink_sock *nlk = nlk_sk(sk);

	if (skb_queue_empty(&sk->sk_receive_queue))
		clear_bit(NETLINK_CONGESTED, &nlk->state);
	if (!test_bit(NETLINK_CONGESTED, &nlk->state))
		wake_up_interruptible(&nlk->wait);
}

#ifdef CONFIG_NETLINK_MMAP
static bool netlink_skb_is_mmaped(const struct sk_buff *skb)
{
	return NETLINK_CB(skb).flags & NETLINK_SKB_MMAPED;
}

static bool netlink_rx_is_mmaped(struct sock *sk)
{
	return nlk_sk(sk)->rx_ring.pg_vec != NULL;
}

static bool netlink_tx_is_mmaped(struct sock *sk)
{
	return nlk_sk(sk)->tx_ring.pg_vec != NULL;
}

static __pure struct page *pgvec_to_page(const void *addr)
{
	if (is_vmalloc_addr(addr))
		return vmalloc_to_page(addr);
	else
		return virt_to_page(addr);
}

static void free_pg_vec(void **pg_vec, unsigned int order, unsigned int len)
{
	unsigned int i;

	for (i = 0; i < len; i++) {
		if (pg_vec[i] != NULL) {
			if (is_vmalloc_addr(pg_vec[i]))
				vfree(pg_vec[i]);
			else
				free_pages((unsigned long)pg_vec[i], order);
		}
	}
	kfree(pg_vec);
}

static void *alloc_one_pg_vec_page(unsigned long order)
{
	void *buffer;
	gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP | __GFP_ZERO |
			  __GFP_NOWARN | __GFP_NORETRY;

	buffer = (void *)__get_free_pages(gfp_flags, order);
	if (buffer != NULL)
		return buffer;

	buffer = vzalloc((1 << order) * PAGE_SIZE);
	if (buffer != NULL)
		return buffer;

	gfp_flags &= ~__GFP_NORETRY;
	return (void *)__get_free_pages(gfp_flags, order);
}

static void **alloc_pg_vec(struct netlink_sock *nlk,
			   struct nl_mmap_req *req, unsigned int order)
{
	unsigned int block_nr = req->nm_block_nr;
	unsigned int i;
	void **pg_vec;

	pg_vec = kcalloc(block_nr, sizeof(void *), GFP_KERNEL);
	if (pg_vec == NULL)
		return NULL;

	for (i = 0; i < block_nr; i++) {
		pg_vec[i] = alloc_one_pg_vec_page(order);
		if (pg_vec[i] == NULL)
			goto err1;
	}

	return pg_vec;
err1:
	free_pg_vec(pg_vec, order, block_nr);
	return NULL;
}

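/* Install (or, when @closing, tear down) one mmaped ring.  The new page
 * vector is swapped in under both nlk->pg_vec_lock and the queue lock so
 * readers never see a half-initialised ring, and already queued skbs are
 * purged since they may reference frames of the previous mapping.
 */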
static int netlink_set_ring(struct sock *sk, struct nl_mmap_req *req,
			    bool closing, bool tx_ring)
{
	struct netlink_sock *nlk = nlk_sk(sk);
	struct netlink_ring *ring;
	struct sk_buff_head *queue;
	void **pg_vec = NULL;
	unsigned int order = 0;
	int err;

	ring  = tx_ring ? &nlk->tx_ring : &nlk->rx_ring;
	queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;

	if (!closing) {
		if (atomic_read(&nlk->mapped))
			return -EBUSY;
		if (atomic_read(&ring->pending))
			return -EBUSY;
	}

	if (req->nm_block_nr) {
		if (ring->pg_vec != NULL)
			return -EBUSY;

		if ((int)req->nm_block_size <= 0)
			return -EINVAL;
		if (!PAGE_ALIGNED(req->nm_block_size))
			return -EINVAL;
		if (req->nm_frame_size < NL_MMAP_HDRLEN)
			return -EINVAL;
		if (!IS_ALIGNED(req->nm_frame_size, NL_MMAP_MSG_ALIGNMENT))
			return -EINVAL;

		ring->frames_per_block = req->nm_block_size /
					 req->nm_frame_size;
		if (ring->frames_per_block == 0)
			return -EINVAL;
		if (ring->frames_per_block * req->nm_block_nr !=
		    req->nm_frame_nr)
			return -EINVAL;

		order = get_order(req->nm_block_size);
		pg_vec = alloc_pg_vec(nlk, req, order);
		if (pg_vec == NULL)
			return -ENOMEM;
	} else {
		if (req->nm_frame_nr)
			return -EINVAL;
	}

	err = -EBUSY;
	mutex_lock(&nlk->pg_vec_lock);
	if (closing || atomic_read(&nlk->mapped) == 0) {
		err = 0;
		spin_lock_bh(&queue->lock);

		ring->frame_max		= req->nm_frame_nr - 1;
		ring->head		= 0;
		ring->frame_size	= req->nm_frame_size;
		ring->pg_vec_pages	= req->nm_block_size / PAGE_SIZE;

		swap(ring->pg_vec_len, req->nm_block_nr);
		swap(ring->pg_vec_order, order);
		swap(ring->pg_vec, pg_vec);

		__skb_queue_purge(queue);
		spin_unlock_bh(&queue->lock);

		WARN_ON(atomic_read(&nlk->mapped));
	}
	mutex_unlock(&nlk->pg_vec_lock);

	if (pg_vec)
		free_pg_vec(pg_vec, order, req->nm_block_nr);
	return err;
}

static void netlink_mm_open(struct vm_area_struct *vma)
{
	struct file *file = vma->vm_file;
	struct socket *sock = file->private_data;
	struct sock *sk = sock->sk;

	if (sk)
		atomic_inc(&nlk_sk(sk)->mapped);
}

static void netlink_mm_close(struct vm_area_struct *vma)
{
	struct file *file = vma->vm_file;
	struct socket *sock = file->private_data;
	struct sock *sk = sock->sk;

	if (sk)
		atomic_dec(&nlk_sk(sk)->mapped);
}

static const struct vm_operations_struct netlink_mmap_ops = {
	.open	= netlink_mm_open,
	.close	= netlink_mm_close,
};

static int netlink_mmap(struct file *file, struct socket *sock,
			struct vm_area_struct *vma)
{
	struct sock *sk = sock->sk;
	struct netlink_sock *nlk = nlk_sk(sk);
	struct netlink_ring *ring;
	unsigned long start, size, expected;
	unsigned int i;
	int err = -EINVAL;

	if (vma->vm_pgoff)
		return -EINVAL;

	mutex_lock(&nlk->pg_vec_lock);

	expected = 0;
	for (ring = &nlk->rx_ring; ring <= &nlk->tx_ring; ring++) {
		if (ring->pg_vec == NULL)
			continue;
		expected += ring->pg_vec_len * ring->pg_vec_pages * PAGE_SIZE;
	}

	if (expected == 0)
		goto out;

	size = vma->vm_end - vma->vm_start;
	if (size != expected)
		goto out;

	start = vma->vm_start;
	for (ring = &nlk->rx_ring; ring <= &nlk->tx_ring; ring++) {
		if (ring->pg_vec == NULL)
			continue;

		for (i = 0; i < ring->pg_vec_len; i++) {
			struct page *page;
			void *kaddr = ring->pg_vec[i];
			unsigned int pg_num;

			for (pg_num = 0; pg_num < ring->pg_vec_pages; pg_num++) {
				page = pgvec_to_page(kaddr);
				err = vm_insert_page(vma, start, page);
				if (err < 0)
					goto out;
				start += PAGE_SIZE;
				kaddr += PAGE_SIZE;
			}
		}
	}

	atomic_inc(&nlk->mapped);
	vma->vm_ops = &netlink_mmap_ops;
	err = 0;
out:
	mutex_unlock(&nlk->pg_vec_lock);
	return err;
}

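/* The per-frame status word is the handshake with userspace; accesses to
 * it are paired with smp_rmb()/smp_wmb() and explicit dcache flushes so
 * payload and status stay ordered on architectures with aliasing caches.
 */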
static void netlink_frame_flush_dcache(const struct nl_mmap_hdr *hdr)
{
#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
	struct page *p_start, *p_end;

	/* First page is flushed through netlink_{get,set}_status */
	p_start = pgvec_to_page(hdr + PAGE_SIZE);
	p_end   = pgvec_to_page((void *)hdr + NL_MMAP_HDRLEN + hdr->nm_len - 1);
	while (p_start <= p_end) {
		flush_dcache_page(p_start);
		p_start++;
	}
#endif
}

static enum nl_mmap_status netlink_get_status(const struct nl_mmap_hdr *hdr)
{
	smp_rmb();
	flush_dcache_page(pgvec_to_page(hdr));
	return hdr->nm_status;
}

static void netlink_set_status(struct nl_mmap_hdr *hdr,
			       enum nl_mmap_status status)
{
	hdr->nm_status = status;
	flush_dcache_page(pgvec_to_page(hdr));
	smp_wmb();
}

static struct nl_mmap_hdr *
__netlink_lookup_frame(const struct netlink_ring *ring, unsigned int pos)
{
	unsigned int pg_vec_pos, frame_off;

	pg_vec_pos = pos / ring->frames_per_block;
	frame_off  = pos % ring->frames_per_block;

	return ring->pg_vec[pg_vec_pos] + (frame_off * ring->frame_size);
}

static struct nl_mmap_hdr *
netlink_lookup_frame(const struct netlink_ring *ring, unsigned int pos,
		     enum nl_mmap_status status)
{
	struct nl_mmap_hdr *hdr;

	hdr = __netlink_lookup_frame(ring, pos);
	if (netlink_get_status(hdr) != status)
		return NULL;

	return hdr;
}

static struct nl_mmap_hdr *
netlink_current_frame(const struct netlink_ring *ring,
		      enum nl_mmap_status status)
{
	return netlink_lookup_frame(ring, ring->head, status);
}

static struct nl_mmap_hdr *
netlink_previous_frame(const struct netlink_ring *ring,
		       enum nl_mmap_status status)
{
	unsigned int prev;

	prev = ring->head ? ring->head - 1 : ring->frame_max;
	return netlink_lookup_frame(ring, prev, status);
}

static void netlink_increment_head(struct netlink_ring *ring)
{
	ring->head = ring->head != ring->frame_max ? ring->head + 1 : 0;
}

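/* Advance the ring head over frames that userspace marked
 * NL_MMAP_STATUS_SKIP; stop at the first frame with any other status or
 * after one full lap of the ring.
 */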
static void netlink_forward_ring(struct netlink_ring *ring)
{
	unsigned int head = ring->head, pos = head;
	const struct nl_mmap_hdr *hdr;

	do {
		hdr = __netlink_lookup_frame(ring, pos);
		if (hdr->nm_status == NL_MMAP_STATUS_UNUSED)
			break;
		if (hdr->nm_status != NL_MMAP_STATUS_SKIP)
			break;
		netlink_increment_head(ring);
	} while (ring->head != head);
}

static bool netlink_dump_space(struct netlink_sock *nlk)
{
	struct netlink_ring *ring = &nlk->rx_ring;
	struct nl_mmap_hdr *hdr;
	unsigned int n;

	hdr = netlink_current_frame(ring, NL_MMAP_STATUS_UNUSED);
	if (hdr == NULL)
		return false;

	n = ring->head + ring->frame_max / 2;
	if (n > ring->frame_max)
		n -= ring->frame_max;

	hdr = __netlink_lookup_frame(ring, n);

	return hdr->nm_status == NL_MMAP_STATUS_UNUSED;
}

static unsigned int netlink_poll(struct file *file, struct socket *sock,
				 poll_table *wait)
{
	struct sock *sk = sock->sk;
	struct netlink_sock *nlk = nlk_sk(sk);
	unsigned int mask;
	int err;

	if (nlk->rx_ring.pg_vec != NULL) {
		/* Memory mapped sockets don't call recvmsg(), so flow control
		 * for dumps is performed here. A dump is allowed to continue
		 * if at least half the ring is unused.
		 */
		while (nlk->cb_running && netlink_dump_space(nlk)) {
			err = netlink_dump(sk);
			if (err < 0) {
				sk->sk_err = -err;
				sk->sk_error_report(sk);
				break;
			}
		}
		netlink_rcv_wake(sk);
	}

	mask = datagram_poll(file, sock, wait);

	spin_lock_bh(&sk->sk_receive_queue.lock);
	if (nlk->rx_ring.pg_vec) {
		netlink_forward_ring(&nlk->rx_ring);
		if (!netlink_previous_frame(&nlk->rx_ring, NL_MMAP_STATUS_UNUSED))
			mask |= POLLIN | POLLRDNORM;
	}
	spin_unlock_bh(&sk->sk_receive_queue.lock);

	spin_lock_bh(&sk->sk_write_queue.lock);
	if (nlk->tx_ring.pg_vec) {
		if (netlink_current_frame(&nlk->tx_ring, NL_MMAP_STATUS_UNUSED))
			mask |= POLLOUT | POLLWRNORM;
	}
	spin_unlock_bh(&sk->sk_write_queue.lock);

	return mask;
}

static struct nl_mmap_hdr *netlink_mmap_hdr(struct sk_buff *skb)
{
	return (struct nl_mmap_hdr *)(skb->head - NL_MMAP_HDRLEN);
}

static void netlink_ring_setup_skb(struct sk_buff *skb, struct sock *sk,
				   struct netlink_ring *ring,
				   struct nl_mmap_hdr *hdr)
{
	unsigned int size;
	void *data;

	size = ring->frame_size - NL_MMAP_HDRLEN;
	data = (void *)hdr + NL_MMAP_HDRLEN;

	skb->head	= data;
	skb->data	= data;
	skb_reset_tail_pointer(skb);
	skb->end	= skb->tail + size;
	skb->len	= 0;

	skb->destructor	= netlink_skb_destructor;
	NETLINK_CB(skb).flags |= NETLINK_SKB_MMAPED;
	NETLINK_CB(skb).sk = sk;
}

static int netlink_mmap_sendmsg(struct sock *sk, struct msghdr *msg,
				u32 dst_portid, u32 dst_group,
				struct sock_iocb *siocb)
{
	struct netlink_sock *nlk = nlk_sk(sk);
	struct netlink_ring *ring;
	struct nl_mmap_hdr *hdr;
	struct sk_buff *skb;
	unsigned int maxlen;
	bool excl = true;
	int err = 0, len = 0;

	/* Netlink messages are validated by the receiver before processing.
	 * In order to avoid userspace changing the contents of the message
	 * after validation, the socket and the ring may only be used by a
	 * single process, otherwise we fall back to copying.
	 */
	if (atomic_long_read(&sk->sk_socket->file->f_count) > 2 ||
	    atomic_read(&nlk->mapped) > 1)
		excl = false;

	mutex_lock(&nlk->pg_vec_lock);

	ring   = &nlk->tx_ring;
	maxlen = ring->frame_size - NL_MMAP_HDRLEN;

	do {
		hdr = netlink_current_frame(ring, NL_MMAP_STATUS_VALID);
		if (hdr == NULL) {
			if (!(msg->msg_flags & MSG_DONTWAIT) &&
			    atomic_read(&nlk->tx_ring.pending))
				schedule();
			continue;
		}
		if (hdr->nm_len > maxlen) {
			err = -EINVAL;
			goto out;
		}

		netlink_frame_flush_dcache(hdr);

		if (likely(dst_portid == 0 && dst_group == 0 && excl)) {
			skb = alloc_skb_head(GFP_KERNEL);
			if (skb == NULL) {
				err = -ENOBUFS;
				goto out;
			}
			sock_hold(sk);
			netlink_ring_setup_skb(skb, sk, ring, hdr);
			NETLINK_CB(skb).flags |= NETLINK_SKB_TX;
			__skb_put(skb, hdr->nm_len);
			netlink_set_status(hdr, NL_MMAP_STATUS_RESERVED);
			atomic_inc(&ring->pending);
		} else {
			skb = alloc_skb(hdr->nm_len, GFP_KERNEL);
			if (skb == NULL) {
				err = -ENOBUFS;
				goto out;
			}
			__skb_put(skb, hdr->nm_len);
			memcpy(skb->data, (void *)hdr + NL_MMAP_HDRLEN, hdr->nm_len);
			netlink_set_status(hdr, NL_MMAP_STATUS_UNUSED);
		}

		netlink_increment_head(ring);

		NETLINK_CB(skb).portid	  = nlk->portid;
		NETLINK_CB(skb).dst_group = dst_group;
		NETLINK_CB(skb).creds	  = siocb->scm->creds;

		err = security_netlink_send(sk, skb);
		if (err) {
			kfree_skb(skb);
			goto out;
		}

		if (unlikely(dst_group)) {
			atomic_inc(&skb->users);
			netlink_broadcast(sk, skb, dst_portid, dst_group,
					  GFP_KERNEL);
		}
		err = netlink_unicast(sk, skb, dst_portid,
				      msg->msg_flags & MSG_DONTWAIT);
		if (err < 0)
			goto out;
		len += err;

	} while (hdr != NULL ||
		 (!(msg->msg_flags & MSG_DONTWAIT) &&
		  atomic_read(&nlk->tx_ring.pending)));

	if (len > 0)
		err = len;
out:
	mutex_unlock(&nlk->pg_vec_lock);
	return err;
}

static void netlink_queue_mmaped_skb(struct sock *sk, struct sk_buff *skb)
{
	struct nl_mmap_hdr *hdr;

	hdr = netlink_mmap_hdr(skb);
	hdr->nm_len	= skb->len;
	hdr->nm_group	= NETLINK_CB(skb).dst_group;
	hdr->nm_pid	= NETLINK_CB(skb).creds.pid;
	hdr->nm_uid	= from_kuid(sk_user_ns(sk), NETLINK_CB(skb).creds.uid);
	hdr->nm_gid	= from_kgid(sk_user_ns(sk), NETLINK_CB(skb).creds.gid);
	netlink_frame_flush_dcache(hdr);
	netlink_set_status(hdr, NL_MMAP_STATUS_VALID);

	NETLINK_CB(skb).flags |= NETLINK_SKB_DELIVERED;
	kfree_skb(skb);
}

static void netlink_ring_set_copied(struct sock *sk, struct sk_buff *skb)
{
	struct netlink_sock *nlk = nlk_sk(sk);
	struct netlink_ring *ring = &nlk->rx_ring;
	struct nl_mmap_hdr *hdr;

	spin_lock_bh(&sk->sk_receive_queue.lock);
	hdr = netlink_current_frame(ring, NL_MMAP_STATUS_UNUSED);
	if (hdr == NULL) {
		spin_unlock_bh(&sk->sk_receive_queue.lock);
		kfree_skb(skb);
		netlink_overrun(sk);
		return;
	}
	netlink_increment_head(ring);
	__skb_queue_tail(&sk->sk_receive_queue, skb);
	spin_unlock_bh(&sk->sk_receive_queue.lock);

	hdr->nm_len	= skb->len;
	hdr->nm_group	= NETLINK_CB(skb).dst_group;
	hdr->nm_pid	= NETLINK_CB(skb).creds.pid;
	hdr->nm_uid	= from_kuid(sk_user_ns(sk), NETLINK_CB(skb).creds.uid);
	hdr->nm_gid	= from_kgid(sk_user_ns(sk), NETLINK_CB(skb).creds.gid);
	netlink_set_status(hdr, NL_MMAP_STATUS_COPY);
}

#else /* CONFIG_NETLINK_MMAP */
#define netlink_skb_is_mmaped(skb)	false
#define netlink_rx_is_mmaped(sk)	false
#define netlink_tx_is_mmaped(sk)	false
#define netlink_mmap			sock_no_mmap
#define netlink_poll			datagram_poll
#define netlink_mmap_sendmsg(sk, msg, dst_portid, dst_group, siocb)	0
#endif /* CONFIG_NETLINK_MMAP */

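/* Destructor shared by all netlink skbs: hand mmaped frames back to the
 * ring, free vmalloc()ed data from netlink_alloc_large_skb() and release
 * the receive-memory charge taken in netlink_skb_set_owner_r().
 */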
static void netlink_skb_destructor(struct sk_buff *skb)
{
#ifdef CONFIG_NETLINK_MMAP
	struct nl_mmap_hdr *hdr;
	struct netlink_ring *ring;
	struct sock *sk;

	/* If a packet from the kernel to userspace was freed because of an
	 * error without being delivered to userspace, the kernel must reset
	 * the status. In the direction userspace to kernel, the status is
	 * always reset here after the packet was processed and freed.
	 */
	if (netlink_skb_is_mmaped(skb)) {
		hdr = netlink_mmap_hdr(skb);
		sk = NETLINK_CB(skb).sk;

		if (NETLINK_CB(skb).flags & NETLINK_SKB_TX) {
			netlink_set_status(hdr, NL_MMAP_STATUS_UNUSED);
			ring = &nlk_sk(sk)->tx_ring;
		} else {
			if (!(NETLINK_CB(skb).flags & NETLINK_SKB_DELIVERED)) {
				hdr->nm_len = 0;
				netlink_set_status(hdr, NL_MMAP_STATUS_VALID);
			}
			ring = &nlk_sk(sk)->rx_ring;
876 877 878 879 880 881
		}

		WARN_ON(atomic_read(&ring->pending) == 0);
		atomic_dec(&ring->pending);
		sock_put(sk);

		skb->head = NULL;
	}
#endif
	if (is_vmalloc_addr(skb->head)) {
		if (!skb->cloned ||
		    !atomic_dec_return(&(skb_shinfo(skb)->dataref)))
			vfree(skb->head);

		skb->head = NULL;
	}
	if (skb->sk != NULL)
		sock_rfree(skb);
}

static void netlink_skb_set_owner_r(struct sk_buff *skb, struct sock *sk)
{
	WARN_ON(skb->sk != NULL);
	skb->sk = sk;
	skb->destructor = netlink_skb_destructor;
	atomic_add(skb->truesize, &sk->sk_rmem_alloc);
	sk_mem_charge(sk, skb->truesize);
}

static void netlink_sock_destruct(struct sock *sk)
{
	struct netlink_sock *nlk = nlk_sk(sk);

	if (nlk->cb_running) {
		if (nlk->cb.done)
			nlk->cb.done(&nlk->cb);

		module_put(nlk->cb.module);
		kfree_skb(nlk->cb.skb);
	}

	skb_queue_purge(&sk->sk_receive_queue);
#ifdef CONFIG_NETLINK_MMAP
	if (1) {
		struct nl_mmap_req req;

		memset(&req, 0, sizeof(req));
		if (nlk->rx_ring.pg_vec)
			netlink_set_ring(sk, &req, true, false);
		memset(&req, 0, sizeof(req));
		if (nlk->tx_ring.pg_vec)
			netlink_set_ring(sk, &req, true, true);
	}
#endif /* CONFIG_NETLINK_MMAP */

	if (!sock_flag(sk, SOCK_DEAD)) {
		printk(KERN_ERR "Freeing alive netlink socket %p\n", sk);
		return;
	}

	WARN_ON(atomic_read(&sk->sk_rmem_alloc));
	WARN_ON(atomic_read(&sk->sk_wmem_alloc));
	WARN_ON(nlk_sk(sk)->groups);
}

/* This lock without WQ_FLAG_EXCLUSIVE is good on UP and it is _very_ bad on
 * SMP. Look, when several writers sleep and reader wakes them up, all but one
 * immediately hit write lock and grab all the cpus. Exclusive sleep solves
 * this, _but_ remember, it adds useless work on UP machines.
 */

void netlink_table_grab(void)
	__acquires(nl_table_lock)
{
	might_sleep();

	write_lock_irq(&nl_table_lock);

	if (atomic_read(&nl_table_users)) {
		DECLARE_WAITQUEUE(wait, current);

		add_wait_queue_exclusive(&nl_table_wait, &wait);
		for (;;) {
			set_current_state(TASK_UNINTERRUPTIBLE);
			if (atomic_read(&nl_table_users) == 0)
				break;
			write_unlock_irq(&nl_table_lock);
			schedule();
			write_lock_irq(&nl_table_lock);
		}

		__set_current_state(TASK_RUNNING);
		remove_wait_queue(&nl_table_wait, &wait);
	}
}

void netlink_table_ungrab(void)
	__releases(nl_table_lock)
{
	write_unlock_irq(&nl_table_lock);
	wake_up(&nl_table_wait);
}

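/* Readers hold nl_table_lock only long enough to bump nl_table_users;
 * it is this count, together with nl_table_wait, that keeps
 * netlink_table_grab() out for the duration of the read-side section.
 */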
static inline void
netlink_lock_table(void)
{
	/* read_lock() synchronizes us to netlink_table_grab */

	read_lock(&nl_table_lock);
	atomic_inc(&nl_table_users);
	read_unlock(&nl_table_lock);
}

static inline void
netlink_unlock_table(void)
{
	if (atomic_dec_and_test(&nl_table_users))
		wake_up(&nl_table_wait);
}

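/* Socket lookup is an rhashtable keyed by portid alone; since different
 * network namespaces may reuse a portid, lookups go through
 * rhashtable_lookup_compare() with netlink_compare() also matching the net.
 */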
struct netlink_compare_arg
{
	struct net *net;
	u32 portid;
};

static bool netlink_compare(void *ptr, void *arg)
{
	struct netlink_compare_arg *x = arg;
	struct sock *sk = ptr;

	return nlk_sk(sk)->portid == x->portid &&
	       net_eq(sock_net(sk), x->net);
}

static struct sock *__netlink_lookup(struct netlink_table *table, u32 portid,
				     struct net *net)
{
	struct netlink_compare_arg arg = {
		.net = net,
		.portid = portid,
	};
	u32 hash;

	hash = rhashtable_hashfn(&table->hash, &portid, sizeof(portid));

	return rhashtable_lookup_compare(&table->hash, hash,
					 &netlink_compare, &arg);
}

static struct sock *netlink_lookup(struct net *net, int protocol, u32 portid)
{
	struct netlink_table *table = &nl_table[protocol];
	struct sock *sk;

	rcu_read_lock();
	sk = __netlink_lookup(table, portid, net);
	if (sk)
		sock_hold(sk);
	rcu_read_unlock();

	return sk;
}

static const struct proto_ops netlink_ops;

static void
netlink_update_listeners(struct sock *sk)
{
	struct netlink_table *tbl = &nl_table[sk->sk_protocol];
	unsigned long mask;
	unsigned int i;
	struct listeners *listeners;

	listeners = nl_deref_protected(tbl->listeners);
	if (!listeners)
		return;

	for (i = 0; i < NLGRPLONGS(tbl->groups); i++) {
		mask = 0;
		sk_for_each_bound(sk, &tbl->mc_list) {
			if (i < NLGRPLONGS(nlk_sk(sk)->ngroups))
				mask |= nlk_sk(sk)->groups[i];
		}
		listeners->masks[i] = mask;
	}
	/* this function is only called with the netlink table "grabbed", which
	 * makes sure updates are visible before bind or setsockopt return. */
}

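/* Bind @sk to @portid in the per-protocol hash table.  nl_sk_hash_lock
 * serialises the lookup-and-insert pair, so a portid cannot be claimed
 * twice: -EADDRINUSE means the portid is taken, -EBUSY that the socket
 * is already bound.
 */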
static int netlink_insert(struct sock *sk, struct net *net, u32 portid)
{
	struct netlink_table *table = &nl_table[sk->sk_protocol];
	int err = -EADDRINUSE;

	mutex_lock(&nl_sk_hash_lock);
	if (__netlink_lookup(table, portid, net))
		goto err;

	err = -EBUSY;
	if (nlk_sk(sk)->portid)
		goto err;

	err = -ENOMEM;
	if (BITS_PER_LONG > 32 && unlikely(table->hash.nelems >= UINT_MAX))
		goto err;

	nlk_sk(sk)->portid = portid;
	sock_hold(sk);
	rhashtable_insert(&table->hash, &nlk_sk(sk)->node, GFP_KERNEL);
	err = 0;
err:
	mutex_unlock(&nl_sk_hash_lock);
	return err;
}

static void netlink_remove(struct sock *sk)
{
	struct netlink_table *table;

	mutex_lock(&nl_sk_hash_lock);
	table = &nl_table[sk->sk_protocol];
	if (rhashtable_remove(&table->hash, &nlk_sk(sk)->node, GFP_KERNEL)) {
		WARN_ON(atomic_read(&sk->sk_refcnt) == 1);
		__sock_put(sk);
	}
	mutex_unlock(&nl_sk_hash_lock);

	netlink_table_grab();
	if (nlk_sk(sk)->subscriptions)
		__sk_del_bind_node(sk);
	netlink_table_ungrab();
}

static struct proto netlink_proto = {
	.name	  = "NETLINK",
	.owner	  = THIS_MODULE,
	.obj_size = sizeof(struct netlink_sock),
};

static int __netlink_create(struct net *net, struct socket *sock,
			    struct mutex *cb_mutex, int protocol)
{
	struct sock *sk;
	struct netlink_sock *nlk;

	sock->ops = &netlink_ops;

	sk = sk_alloc(net, PF_NETLINK, GFP_KERNEL, &netlink_proto);
	if (!sk)
		return -ENOMEM;

	sock_init_data(sock, sk);

	nlk = nlk_sk(sk);
	if (cb_mutex) {
		nlk->cb_mutex = cb_mutex;
	} else {
		nlk->cb_mutex = &nlk->cb_def_mutex;
		mutex_init(nlk->cb_mutex);
	}
	init_waitqueue_head(&nlk->wait);
#ifdef CONFIG_NETLINK_MMAP
	mutex_init(&nlk->pg_vec_lock);
#endif

	sk->sk_destruct = netlink_sock_destruct;
	sk->sk_protocol = protocol;
	return 0;
}

static int netlink_create(struct net *net, struct socket *sock, int protocol,
			  int kern)
{
	struct module *module = NULL;
	struct mutex *cb_mutex;
	struct netlink_sock *nlk;
	int (*bind)(int group);
	void (*unbind)(int group);
	int err = 0;

	sock->state = SS_UNCONNECTED;

	if (sock->type != SOCK_RAW && sock->type != SOCK_DGRAM)
		return -ESOCKTNOSUPPORT;

	if (protocol < 0 || protocol >= MAX_LINKS)
		return -EPROTONOSUPPORT;

	netlink_lock_table();
#ifdef CONFIG_MODULES
	if (!nl_table[protocol].registered) {
		netlink_unlock_table();
		request_module("net-pf-%d-proto-%d", PF_NETLINK, protocol);
		netlink_lock_table();
	}
#endif
	if (nl_table[protocol].registered &&
	    try_module_get(nl_table[protocol].module))
		module = nl_table[protocol].module;
	else
		err = -EPROTONOSUPPORT;
	cb_mutex = nl_table[protocol].cb_mutex;
	bind = nl_table[protocol].bind;
	unbind = nl_table[protocol].unbind;
	netlink_unlock_table();

	if (err < 0)
		goto out;

	err = __netlink_create(net, sock, cb_mutex, protocol);
	if (err < 0)
		goto out_module;

	local_bh_disable();
	sock_prot_inuse_add(net, &netlink_proto, 1);
	local_bh_enable();

	nlk = nlk_sk(sock->sk);
	nlk->module = module;
	nlk->netlink_bind = bind;
	nlk->netlink_unbind = unbind;
out:
	return err;

out_module:
	module_put(module);
	goto out;
}

static int netlink_release(struct socket *sock)
{
	struct sock *sk = sock->sk;
	struct netlink_sock *nlk;

	if (!sk)
		return 0;

	netlink_remove(sk);
	sock_orphan(sk);
	nlk = nlk_sk(sk);

	/*
	 * OK. Socket is unlinked, any packets that arrive now
	 * will be purged.
	 */

	sock->sk = NULL;
	wake_up_interruptible_all(&nlk->wait);

	skb_queue_purge(&sk->sk_write_queue);

	if (nlk->portid) {
		struct netlink_notify n = {
						.net = sock_net(sk),
						.protocol = sk->sk_protocol,
						.portid = nlk->portid,
					  };
		atomic_notifier_call_chain(&netlink_chain,
				NETLINK_URELEASE, &n);
	}

	module_put(nlk->module);

	netlink_table_grab();
	if (netlink_is_kernel(sk)) {
		BUG_ON(nl_table[sk->sk_protocol].registered == 0);
		if (--nl_table[sk->sk_protocol].registered == 0) {
			struct listeners *old;

			old = nl_deref_protected(nl_table[sk->sk_protocol].listeners);
			RCU_INIT_POINTER(nl_table[sk->sk_protocol].listeners, NULL);
			kfree_rcu(old, rcu);
			nl_table[sk->sk_protocol].module = NULL;
			nl_table[sk->sk_protocol].bind = NULL;
			nl_table[sk->sk_protocol].unbind = NULL;
			nl_table[sk->sk_protocol].flags = 0;
			nl_table[sk->sk_protocol].registered = 0;
		}
	} else if (nlk->subscriptions) {
		netlink_update_listeners(sk);
	}
	netlink_table_ungrab();

	/* Wait for readers to complete */
	synchronize_net();

	kfree(nlk->groups);
	nlk->groups = NULL;

	local_bh_disable();
	sock_prot_inuse_add(sock_net(sk), &netlink_proto, -1);
	local_bh_enable();
	sock_put(sk);
	return 0;
}

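/* Pick a kernel-assigned portid: start from the caller's thread group id
 * and, on collision, walk negative values downwards from -4097.  Losing a
 * race against a concurrent bind is fine; -EBUSY from netlink_insert()
 * then just means the socket already got an address.
 */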
static int netlink_autobind(struct socket *sock)
{
	struct sock *sk = sock->sk;
	struct net *net = sock_net(sk);
	struct netlink_table *table = &nl_table[sk->sk_protocol];
	s32 portid = task_tgid_vnr(current);
	int err;
	static s32 rover = -4097;

retry:
	cond_resched();
	rcu_read_lock();
	if (__netlink_lookup(table, portid, net)) {
		/* Bind collision, search negative portid values. */
		portid = rover--;
		if (rover > -4097)
			rover = -4097;
		rcu_read_unlock();
		goto retry;
L
1293
	rcu_read_unlock();
L
1295
	err = netlink_insert(sk, net, portid);
L
		goto retry;
1298 1299 1300 1301 1302 1303

	/* If 2 threads race to autobind, that is fine.  */
	if (err == -EBUSY)
		err = 0;

	return err;
L

1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318
/**
 * __netlink_ns_capable - General netlink message capability test
 * @nsp: NETLINK_CB of the socket buffer holding a netlink command from userspace.
 * @user_ns: The user namespace of the capability to use
 * @cap: The capability to use
 *
 * Test to see if the opener of the socket we received the message
 * from had when the netlink socket was created and the sender of the
 * message has has the capability @cap in the user namespace @user_ns.
 */
bool __netlink_ns_capable(const struct netlink_skb_parms *nsp,
			struct user_namespace *user_ns, int cap)
{
	return ((nsp->flags & NETLINK_SKB_DST) ||
		file_ns_capable(nsp->sk->sk_socket->file, user_ns, cap)) &&
		ns_capable(user_ns, cap);
}
EXPORT_SYMBOL(__netlink_ns_capable);

/**
 * netlink_ns_capable - General netlink message capability test
 * @skb: socket buffer holding a netlink command from userspace
 * @user_ns: The user namespace of the capability to use
 * @cap: The capability to use
 *
 * Test to see if the opener of the socket we received the message from had
 * the capability @cap when the netlink socket was created and that the
 * sender of the message has the capability @cap in the user namespace @user_ns.
 */
bool netlink_ns_capable(const struct sk_buff *skb,
			struct user_namespace *user_ns, int cap)
{
	return __netlink_ns_capable(&NETLINK_CB(skb), user_ns, cap);
}
EXPORT_SYMBOL(netlink_ns_capable);

/**
 * netlink_capable - Netlink global message capability test
 * @skb: socket buffer holding a netlink command from userspace
 * @cap: The capability to use
 *
 * Test to see if the opener of the socket we received the message from had
 * the capability @cap when the netlink socket was created and that the
 * sender of the message has the capability @cap in all user namespaces.
 */
bool netlink_capable(const struct sk_buff *skb, int cap)
{
	return netlink_ns_capable(skb, &init_user_ns, cap);
}
EXPORT_SYMBOL(netlink_capable);

/**
 * netlink_net_capable - Netlink network namespace message capability test
 * @skb: socket buffer holding a netlink command from userspace
 * @cap: The capability to use
 *
 * Test to see if the opener of the socket we received the message from had
 * the capability @cap when the netlink socket was created and that the
 * sender of the message has the capability @cap over the network namespace
 * of the socket we received the message from.
 */
bool netlink_net_capable(const struct sk_buff *skb, int cap)
{
	return netlink_ns_capable(skb, sock_net(skb->sk)->user_ns, cap);
}
EXPORT_SYMBOL(netlink_net_capable);

static inline int netlink_allowed(const struct socket *sock, unsigned int flag)
{
	return (nl_table[sock->sk->sk_protocol].flags & flag) ||
		ns_capable(sock_net(sock->sk)->user_ns, CAP_NET_ADMIN);
}

static void
netlink_update_subscriptions(struct sock *sk, unsigned int subscriptions)
{
	struct netlink_sock *nlk = nlk_sk(sk);

	if (nlk->subscriptions && !subscriptions)
		__sk_del_bind_node(sk);
	else if (!nlk->subscriptions && subscriptions)
		sk_add_bind_node(sk, &nl_table[sk->sk_protocol].mc_list);
	nlk->subscriptions = subscriptions;
}

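/* Grow nlk->groups so it covers every multicast group the protocol
 * currently registers.  The bitmap is only ever enlarged; the grabbed
 * table lock keeps the protocol's group count stable while copying.
 */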
static int netlink_realloc_groups(struct sock *sk)
{
	struct netlink_sock *nlk = nlk_sk(sk);
	unsigned int groups;
	unsigned long *new_groups;
	int err = 0;

	netlink_table_grab();

	groups = nl_table[sk->sk_protocol].groups;
	if (!nl_table[sk->sk_protocol].registered) {
		err = -ENOENT;
		goto out_unlock;
	}

	if (nlk->ngroups >= groups)
		goto out_unlock;

	new_groups = krealloc(nlk->groups, NLGRPSZ(groups), GFP_ATOMIC);
	if (new_groups == NULL) {
		err = -ENOMEM;
		goto out_unlock;
	}
	memset((char *)new_groups + NLGRPSZ(nlk->ngroups), 0,
	       NLGRPSZ(groups) - NLGRPSZ(nlk->ngroups));

	nlk->groups = new_groups;
	nlk->ngroups = groups;
 out_unlock:
	netlink_table_ungrab();
	return err;
}

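/* Undo the per-group bind callbacks for the groups below @group that were
 * part of the requested subscription mask; used to unwind a partially
 * failed bind.
 */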
static void netlink_unbind(int group, long unsigned int groups,
			   struct netlink_sock *nlk)
{
	int undo;

	if (!nlk->netlink_unbind)
		return;

	for (undo = 0; undo < group; undo++)
		if (test_bit(undo, &groups))
			nlk->netlink_unbind(undo);
}

static int netlink_bind(struct socket *sock, struct sockaddr *addr,
			int addr_len)
{
	struct sock *sk = sock->sk;
	struct net *net = sock_net(sk);
	struct netlink_sock *nlk = nlk_sk(sk);
	struct sockaddr_nl *nladdr = (struct sockaddr_nl *)addr;
	int err;
	long unsigned int groups = nladdr->nl_groups;

	if (addr_len < sizeof(struct sockaddr_nl))
		return -EINVAL;

	if (nladdr->nl_family != AF_NETLINK)
		return -EINVAL;

	/* Only superuser is allowed to listen to multicasts */
	if (groups) {
		if (!netlink_allowed(sock, NL_CFG_F_NONROOT_RECV))
			return -EPERM;
		err = netlink_realloc_groups(sk);
		if (err)
			return err;
	}

	if (nlk->portid)
		if (nladdr->nl_pid != nlk->portid)
			return -EINVAL;

	if (nlk->netlink_bind && groups) {
		int group;

		for (group = 0; group < nlk->ngroups; group++) {
			if (!test_bit(group, &groups))
				continue;
			err = nlk->netlink_bind(group);
			if (!err)
				continue;
			netlink_unbind(group, groups, nlk);
			return err;
		}
	}

	if (!nlk->portid) {
		err = nladdr->nl_pid ?
			netlink_insert(sk, net, nladdr->nl_pid) :
			netlink_autobind(sock);
		if (err) {
			netlink_unbind(nlk->ngroups - 1, groups, nlk);
			return err;
		}
	}

	if (!groups && (nlk->groups == NULL || !(u32)nlk->groups[0]))
		return 0;

	netlink_table_grab();
	netlink_update_subscriptions(sk, nlk->subscriptions +
					 hweight32(groups) -
					 hweight32(nlk->groups[0]));
	nlk->groups[0] = (nlk->groups[0] & ~0xffffffffUL) | groups;
	netlink_update_listeners(sk);
	netlink_table_ungrab();

	return 0;
}

static int netlink_connect(struct socket *sock, struct sockaddr *addr,
			   int alen, int flags)
{
	int err = 0;
	struct sock *sk = sock->sk;
	struct netlink_sock *nlk = nlk_sk(sk);
	struct sockaddr_nl *nladdr = (struct sockaddr_nl *)addr;

	if (alen < sizeof(addr->sa_family))
		return -EINVAL;

	if (addr->sa_family == AF_UNSPEC) {
		sk->sk_state	= NETLINK_UNCONNECTED;
		nlk->dst_portid	= 0;
		nlk->dst_group  = 0;
		return 0;
	}
	if (addr->sa_family != AF_NETLINK)
		return -EINVAL;

	if ((nladdr->nl_groups || nladdr->nl_pid) &&
	    !netlink_allowed(sock, NL_CFG_F_NONROOT_SEND))
		return -EPERM;

	if (!nlk->portid)
		err = netlink_autobind(sock);

	if (err == 0) {
		sk->sk_state	= NETLINK_CONNECTED;
		nlk->dst_portid = nladdr->nl_pid;
		nlk->dst_group  = ffs(nladdr->nl_groups);
	}

	return err;
}

static int netlink_getname(struct socket *sock, struct sockaddr *addr,
			   int *addr_len, int peer)
{
	struct sock *sk = sock->sk;
	struct netlink_sock *nlk = nlk_sk(sk);
	DECLARE_SOCKADDR(struct sockaddr_nl *, nladdr, addr);

	nladdr->nl_family = AF_NETLINK;
	nladdr->nl_pad = 0;
	*addr_len = sizeof(*nladdr);

	if (peer) {
		nladdr->nl_pid = nlk->dst_portid;
		nladdr->nl_groups = netlink_group_mask(nlk->dst_group);
	} else {
		nladdr->nl_pid = nlk->portid;
		nladdr->nl_groups = nlk->groups ? nlk->groups[0] : 0;
	}
	return 0;
}

static struct sock *netlink_getsockbyportid(struct sock *ssk, u32 portid)
{
	struct sock *sock;
	struct netlink_sock *nlk;

	sock = netlink_lookup(sock_net(ssk), ssk->sk_protocol, portid);
	if (!sock)
		return ERR_PTR(-ECONNREFUSED);

	/* Don't bother queuing skb if kernel socket has no input function */
	nlk = nlk_sk(sock);
	if (sock->sk_state == NETLINK_CONNECTED &&
	    nlk->dst_portid != nlk_sk(ssk)->portid) {
		sock_put(sock);
		return ERR_PTR(-ECONNREFUSED);
	}
	return sock;
}

struct sock *netlink_getsockbyfilp(struct file *filp)
{
	struct inode *inode = file_inode(filp);
	struct sock *sock;

	if (!S_ISSOCK(inode->i_mode))
		return ERR_PTR(-ENOTSOCK);

	sock = SOCKET_I(inode)->sk;
	if (sock->sk_family != AF_NETLINK)
		return ERR_PTR(-EINVAL);

	sock_hold(sock);
	return sock;
}

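/* Unicast messages larger than NLMSG_GOODSIZE get vmalloc()ed data (freed
 * again in netlink_skb_destructor()); broadcasts always use alloc_skb().
 */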
static struct sk_buff *netlink_alloc_large_skb(unsigned int size,
					       int broadcast)
{
	struct sk_buff *skb;
	void *data;

	if (size <= NLMSG_GOODSIZE || broadcast)
		return alloc_skb(size, GFP_KERNEL);

	size = SKB_DATA_ALIGN(size) +
	       SKB_DATA_ALIGN(sizeof(struct skb_shared_info));

	data = vmalloc(size);
	if (data == NULL)
		return NULL;

	skb = build_skb(data, size);
	if (skb == NULL)
		vfree(data);
	else {
		skb->head_frag = 0;
		skb->destructor = netlink_skb_destructor;
	}

	return skb;
}

/*
 * Attach a skb to a netlink socket.
 * The caller must hold a reference to the destination socket. On error, the
 * reference is dropped. The skb is not sent to the destination, just all
 * error checks are performed and memory in the queue is reserved.
 * Return values:
 * < 0: error. skb freed, reference to sock dropped.
 * 0: continue
 * 1: repeat lookup - reference dropped while waiting for socket memory.
 */
int netlink_attachskb(struct sock *sk, struct sk_buff *skb,
		      long *timeo, struct sock *ssk)
{
	struct netlink_sock *nlk;

	nlk = nlk_sk(sk);

	if ((atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf ||
	     test_bit(NETLINK_CONGESTED, &nlk->state)) &&
	    !netlink_skb_is_mmaped(skb)) {
		DECLARE_WAITQUEUE(wait, current);
		if (!*timeo) {
			if (!ssk || netlink_is_kernel(ssk))
				netlink_overrun(sk);
			sock_put(sk);
			kfree_skb(skb);
			return -EAGAIN;
		}

		__set_current_state(TASK_INTERRUPTIBLE);
		add_wait_queue(&nlk->wait, &wait);

		if ((atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf ||
		     test_bit(NETLINK_CONGESTED, &nlk->state)) &&
		    !sock_flag(sk, SOCK_DEAD))
			*timeo = schedule_timeout(*timeo);

		__set_current_state(TASK_RUNNING);
		remove_wait_queue(&nlk->wait, &wait);
		sock_put(sk);

		if (signal_pending(current)) {
			kfree_skb(skb);
			return sock_intr_errno(*timeo);
		}
		return 1;
	}
	netlink_skb_set_owner_r(skb, sk);
	return 0;
}

static int __netlink_sendskb(struct sock *sk, struct sk_buff *skb)
{
	int len = skb->len;

	netlink_deliver_tap(skb);

#ifdef CONFIG_NETLINK_MMAP
	if (netlink_skb_is_mmaped(skb))
		netlink_queue_mmaped_skb(sk, skb);
	else if (netlink_rx_is_mmaped(sk))
		netlink_ring_set_copied(sk, skb);
	else
#endif /* CONFIG_NETLINK_MMAP */
		skb_queue_tail(&sk->sk_receive_queue, skb);
	sk->sk_data_ready(sk);
	return len;
}

int netlink_sendskb(struct sock *sk, struct sk_buff *skb)
{
	int len = __netlink_sendskb(sk, skb);

	sock_put(sk);
	return len;
}

void netlink_detachskb(struct sock *sk, struct sk_buff *skb)
{
	kfree_skb(skb);
	sock_put(sk);
}

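/* Shrink an over-allocated skb before it is queued: when at least half of
 * the truesize is unused tail room, clone (if shared) and reallocate the
 * head so receive-buffer accounting reflects the real payload.  Mmaped
 * and vmalloc()ed heads are left untouched.
 */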
static struct sk_buff *netlink_trim(struct sk_buff *skb, gfp_t allocation)
{
	int delta;

	WARN_ON(skb->sk != NULL);
	if (netlink_skb_is_mmaped(skb))
		return skb;

	delta = skb->end - skb->tail;
	if (is_vmalloc_addr(skb->head) || delta * 2 < skb->truesize)
		return skb;

	if (skb_shared(skb)) {
		struct sk_buff *nskb = skb_clone(skb, allocation);
		if (!nskb)
			return skb;
		consume_skb(skb);
		skb = nskb;
	}

	if (!pskb_expand_head(skb, 0, -delta, allocation))
		skb->truesize -= delta;

	return skb;
}

static int netlink_unicast_kernel(struct sock *sk, struct sk_buff *skb,
				  struct sock *ssk)
{
	int ret;
	struct netlink_sock *nlk = nlk_sk(sk);

	ret = -ECONNREFUSED;
	if (nlk->netlink_rcv != NULL) {
		ret = skb->len;
		netlink_skb_set_owner_r(skb, sk);
		NETLINK_CB(skb).sk = ssk;
		netlink_deliver_tap_kernel(sk, ssk, skb);
		nlk->netlink_rcv(skb);
		consume_skb(skb);
	} else {
		kfree_skb(skb);
	}
	sock_put(sk);
	return ret;
}

int netlink_unicast(struct sock *ssk, struct sk_buff *skb,
		    u32 portid, int nonblock)
{
	struct sock *sk;
	int err;
	long timeo;

	skb = netlink_trim(skb, gfp_any());

	timeo = sock_sndtimeo(ssk, nonblock);
retry:
	sk = netlink_getsockbyportid(ssk, portid);
	if (IS_ERR(sk)) {
		kfree_skb(skb);
		return PTR_ERR(sk);
	}
	if (netlink_is_kernel(sk))
		return netlink_unicast_kernel(sk, skb, ssk);

	if (sk_filter(sk, skb)) {
		err = skb->len;
		kfree_skb(skb);
		sock_put(sk);
		return err;
	}

	err = netlink_attachskb(sk, skb, &timeo, ssk);
	if (err == 1)
		goto retry;
	if (err)
		return err;

	return netlink_sendskb(sk, skb);
}
EXPORT_SYMBOL(netlink_unicast);

struct sk_buff *netlink_alloc_skb(struct sock *ssk, unsigned int size,
				  u32 dst_portid, gfp_t gfp_mask)
{
#ifdef CONFIG_NETLINK_MMAP
	struct sock *sk = NULL;
	struct sk_buff *skb;
	struct netlink_ring *ring;
	struct nl_mmap_hdr *hdr;
	unsigned int maxlen;

	sk = netlink_getsockbyportid(ssk, dst_portid);
	if (IS_ERR(sk))
		goto out;

	ring = &nlk_sk(sk)->rx_ring;
	/* fast-path without atomic ops for common case: non-mmaped receiver */
	if (ring->pg_vec == NULL)
		goto out_put;

	if (ring->frame_size - NL_MMAP_HDRLEN < size)
		goto out_put;

	skb = alloc_skb_head(gfp_mask);
	if (skb == NULL)
		goto err1;

	spin_lock_bh(&sk->sk_receive_queue.lock);
	/* check again under lock */
	if (ring->pg_vec == NULL)
		goto out_free;

	/* check again under lock */
	maxlen = ring->frame_size - NL_MMAP_HDRLEN;
	if (maxlen < size)
		goto out_free;

	netlink_forward_ring(ring);
	hdr = netlink_current_frame(ring, NL_MMAP_STATUS_UNUSED);
	if (hdr == NULL)
		goto err2;
	netlink_ring_setup_skb(skb, sk, ring, hdr);
	netlink_set_status(hdr, NL_MMAP_STATUS_RESERVED);
	atomic_inc(&ring->pending);
	netlink_increment_head(ring);

	spin_unlock_bh(&sk->sk_receive_queue.lock);
	return skb;

err2:
	kfree_skb(skb);
	spin_unlock_bh(&sk->sk_receive_queue.lock);
	netlink_overrun(sk);
err1:
	sock_put(sk);
	return NULL;

out_free:
	kfree_skb(skb);
	spin_unlock_bh(&sk->sk_receive_queue.lock);
out_put:
	sock_put(sk);
out:
#endif
	return alloc_skb(size, gfp_mask);
}
EXPORT_SYMBOL_GPL(netlink_alloc_skb);

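/* Cheap RCU-protected test that lets kernel-side senders skip building a
 * message when nobody subscribed to @group.  Group numbers are 1-based,
 * hence the "group - 1" bit lookup.
 */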
int netlink_has_listeners(struct sock *sk, unsigned int group)
{
	int res = 0;
	struct listeners *listeners;

	BUG_ON(!netlink_is_kernel(sk));

	rcu_read_lock();
	listeners = rcu_dereference(nl_table[sk->sk_protocol].listeners);

	if (listeners && group - 1 < nl_table[sk->sk_protocol].groups)
		res = test_bit(group - 1, listeners->masks);

	rcu_read_unlock();

	return res;
}
EXPORT_SYMBOL_GPL(netlink_has_listeners);

static int netlink_broadcast_deliver(struct sock *sk, struct sk_buff *skb)
{
	struct netlink_sock *nlk = nlk_sk(sk);

	if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf &&
	    !test_bit(NETLINK_CONGESTED, &nlk->state)) {
		netlink_skb_set_owner_r(skb, sk);
		__netlink_sendskb(sk, skb);
		return atomic_read(&sk->sk_rmem_alloc) > (sk->sk_rcvbuf >> 1);
	}
	return -1;
}

struct netlink_broadcast_data {
	struct sock *exclude_sk;
	struct net *net;
	u32 portid;
	u32 group;
	int failure;
	int delivery_failure;
	int congested;
	int delivered;
	gfp_t allocation;
	struct sk_buff *skb, *skb2;
	int (*tx_filter)(struct sock *dsk, struct sk_buff *skb, void *data);
	void *tx_data;
};

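/* Deliver one broadcast copy to @sk if it is subscribed to the group in
 * the right namespace.  The original skb is cloned (or grabbed and
 * orphaned) lazily on first delivery; a failed clone sets p->failure so
 * every remaining listener sees an overrun instead.
 */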
static void do_one_broadcast(struct sock *sk,
				    struct netlink_broadcast_data *p)
{
	struct netlink_sock *nlk = nlk_sk(sk);
	int val;

	if (p->exclude_sk == sk)
		return;

	if (nlk->portid == p->portid || p->group - 1 >= nlk->ngroups ||
	    !test_bit(p->group - 1, nlk->groups))
		return;

1916
	if (!net_eq(sock_net(sk), p->net))
1917
		return;
1918

L
Linus Torvalds 已提交
1919 1920
	if (p->failure) {
		netlink_overrun(sk);
1921
		return;
L
Linus Torvalds 已提交
1922 1923 1924 1925
	}

	sock_hold(sk);
	if (p->skb2 == NULL) {
1926
		if (skb_shared(p->skb)) {
L
Linus Torvalds 已提交
1927 1928
			p->skb2 = skb_clone(p->skb, p->allocation);
		} else {
1929 1930 1931 1932 1933 1934
			p->skb2 = skb_get(p->skb);
			/*
			 * skb ownership may have been set when
			 * delivered to a previous socket.
			 */
			skb_orphan(p->skb2);
L
Linus Torvalds 已提交
1935 1936 1937 1938 1939 1940
		}
	}
	if (p->skb2 == NULL) {
		netlink_overrun(sk);
		/* Clone failed. Notify ALL listeners. */
		p->failure = 1;
1941 1942
		if (nlk->flags & NETLINK_BROADCAST_SEND_ERROR)
			p->delivery_failure = 1;
1943 1944 1945
	} else if (p->tx_filter && p->tx_filter(sk, p->skb2, p->tx_data)) {
		kfree_skb(p->skb2);
		p->skb2 = NULL;
1946 1947 1948
	} else if (sk_filter(sk, p->skb2)) {
		kfree_skb(p->skb2);
		p->skb2 = NULL;
L
Linus Torvalds 已提交
1949 1950
	} else if ((val = netlink_broadcast_deliver(sk, p->skb2)) < 0) {
		netlink_overrun(sk);
1951 1952
		if (nlk->flags & NETLINK_BROADCAST_SEND_ERROR)
			p->delivery_failure = 1;
L
Linus Torvalds 已提交
1953 1954 1955 1956 1957 1958 1959 1960
	} else {
		p->congested |= val;
		p->delivered = 1;
		p->skb2 = NULL;
	}
	sock_put(sk);
}

int netlink_broadcast_filtered(struct sock *ssk, struct sk_buff *skb, u32 portid,
	u32 group, gfp_t allocation,
	int (*filter)(struct sock *dsk, struct sk_buff *skb, void *data),
	void *filter_data)
{
	struct net *net = sock_net(ssk);
	struct netlink_broadcast_data info;
	struct sock *sk;

	skb = netlink_trim(skb, allocation);

	info.exclude_sk = ssk;
	info.net = net;
	info.portid = portid;
	info.group = group;
	info.failure = 0;
	info.delivery_failure = 0;
	info.congested = 0;
	info.delivered = 0;
	info.allocation = allocation;
	info.skb = skb;
	info.skb2 = NULL;
	info.tx_filter = filter;
	info.tx_data = filter_data;

	/* While we sleep in clone, do not allow to change socket list */

	netlink_lock_table();

	sk_for_each_bound(sk, &nl_table[ssk->sk_protocol].mc_list)
		do_one_broadcast(sk, &info);

	consume_skb(skb);

	netlink_unlock_table();

	if (info.delivery_failure) {
		kfree_skb(info.skb2);
		return -ENOBUFS;
	}
	consume_skb(info.skb2);

	if (info.delivered) {
		if (info.congested && (allocation & __GFP_WAIT))
			yield();
		return 0;
	}
	return -ESRCH;
}
EXPORT_SYMBOL(netlink_broadcast_filtered);

int netlink_broadcast(struct sock *ssk, struct sk_buff *skb, u32 portid,
		      u32 group, gfp_t allocation)
{
	return netlink_broadcast_filtered(ssk, skb, portid, group, allocation,
		NULL, NULL);
}
EXPORT_SYMBOL(netlink_broadcast);
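
/*
 * Illustrative sketch: building a small message and multicasting it with
 * netlink_broadcast().  The message type, attribute number and group used
 * here are hypothetical; real users define their own message formats.
 */
static void __maybe_unused example_broadcast_event(struct sock *kernel_sk,
						   u32 group, u32 value)
{
	struct sk_buff *skb;
	struct nlmsghdr *nlh;

	skb = nlmsg_new(nla_total_size(sizeof(u32)), GFP_KERNEL);
	if (!skb)
		return;

	nlh = nlmsg_put(skb, 0, 0, NLMSG_MIN_TYPE, 0, 0);
	if (!nlh || nla_put_u32(skb, 1, value)) {
		nlmsg_free(skb);
		return;
	}
	nlmsg_end(skb, nlh);

	/* portid 0: no receiver is excluded from the multicast */
	netlink_broadcast(kernel_sk, skb, 0, group, GFP_KERNEL);
}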

struct netlink_set_err_data {
	struct sock *exclude_sk;
	u32 portid;
	u32 group;
	int code;
};

static int do_one_set_err(struct sock *sk, struct netlink_set_err_data *p)
{
	struct netlink_sock *nlk = nlk_sk(sk);
	int ret = 0;

	if (sk == p->exclude_sk)
		goto out;

	if (!net_eq(sock_net(sk), sock_net(p->exclude_sk)))
		goto out;

	if (nlk->portid == p->portid || p->group - 1 >= nlk->ngroups ||
	    !test_bit(p->group - 1, nlk->groups))
		goto out;

	if (p->code == ENOBUFS && nlk->flags & NETLINK_RECV_NO_ENOBUFS) {
		ret = 1;
		goto out;
	}

	sk->sk_err = p->code;
	sk->sk_error_report(sk);
out:
	return ret;
}

/**
 * netlink_set_err - report error to broadcast listeners
 * @ssk: the kernel netlink socket, as returned by netlink_kernel_create()
 * @portid: the PORTID of a process that we want to skip (if any)
 * @group: the broadcast group that will notice the error
 * @code: error code, must be negative (as usual in kernelspace)
 *
 * This function returns the number of broadcast listeners that have set the
 * NETLINK_RECV_NO_ENOBUFS socket option.
 */
int netlink_set_err(struct sock *ssk, u32 portid, u32 group, int code)
{
	struct netlink_set_err_data info;
	struct sock *sk;
	int ret = 0;

	info.exclude_sk = ssk;
	info.portid = portid;
	info.group = group;
	/* sk->sk_err wants a positive error value */
	info.code = -code;

	read_lock(&nl_table_lock);

	sk_for_each_bound(sk, &nl_table[ssk->sk_protocol].mc_list)
		ret += do_one_set_err(sk, &info);

	read_unlock(&nl_table_lock);
	return ret;
}
EXPORT_SYMBOL(netlink_set_err);
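
/*
 * Illustrative sketch: reporting a loss of events to all members of a
 * multicast group, e.g. after an allocation failure prevented a
 * notification from being sent.  The helper name is hypothetical; note the
 * error is passed negative as required above, and the return value counts
 * listeners that opted out of ENOBUFS reporting.
 */
static void __maybe_unused example_report_overrun(struct sock *kernel_sk,
						  u32 group)
{
	int suppressed;

	/* portid 0: no process is exempted from the error report */
	suppressed = netlink_set_err(kernel_sk, 0, group, -ENOBUFS);
	if (suppressed)
		pr_debug("%d listeners suppress ENOBUFS reports\n", suppressed);
}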
/* must be called with netlink table grabbed */
static void netlink_update_socket_mc(struct netlink_sock *nlk,
				     unsigned int group,
				     int is_new)
{
	int old, new = !!is_new, subscriptions;

	old = test_bit(group - 1, nlk->groups);
	subscriptions = nlk->subscriptions - old + new;
	if (new)
		__set_bit(group - 1, nlk->groups);
	else
		__clear_bit(group - 1, nlk->groups);
	netlink_update_subscriptions(&nlk->sk, subscriptions);
	netlink_update_listeners(&nlk->sk);
}

static int netlink_setsockopt(struct socket *sock, int level, int optname,
			      char __user *optval, unsigned int optlen)
{
	struct sock *sk = sock->sk;
	struct netlink_sock *nlk = nlk_sk(sk);
	unsigned int val = 0;
	int err;

	if (level != SOL_NETLINK)
		return -ENOPROTOOPT;

	if (optname != NETLINK_RX_RING && optname != NETLINK_TX_RING &&
	    optlen >= sizeof(int) &&
	    get_user(val, (unsigned int __user *)optval))
		return -EFAULT;

	switch (optname) {
	case NETLINK_PKTINFO:
		if (val)
			nlk->flags |= NETLINK_RECV_PKTINFO;
		else
			nlk->flags &= ~NETLINK_RECV_PKTINFO;
		err = 0;
		break;
	case NETLINK_ADD_MEMBERSHIP:
	case NETLINK_DROP_MEMBERSHIP: {
		if (!netlink_allowed(sock, NL_CFG_F_NONROOT_RECV))
			return -EPERM;
		err = netlink_realloc_groups(sk);
		if (err)
			return err;
		if (!val || val - 1 >= nlk->ngroups)
			return -EINVAL;
		if (optname == NETLINK_ADD_MEMBERSHIP && nlk->netlink_bind) {
			err = nlk->netlink_bind(val);
			if (err)
				return err;
		}
		netlink_table_grab();
		netlink_update_socket_mc(nlk, val,
					 optname == NETLINK_ADD_MEMBERSHIP);
		netlink_table_ungrab();
		if (optname == NETLINK_DROP_MEMBERSHIP && nlk->netlink_unbind)
			nlk->netlink_unbind(val);

		err = 0;
		break;
	}
	case NETLINK_BROADCAST_ERROR:
		if (val)
			nlk->flags |= NETLINK_BROADCAST_SEND_ERROR;
		else
			nlk->flags &= ~NETLINK_BROADCAST_SEND_ERROR;
		err = 0;
		break;
	case NETLINK_NO_ENOBUFS:
		if (val) {
			nlk->flags |= NETLINK_RECV_NO_ENOBUFS;
			clear_bit(NETLINK_CONGESTED, &nlk->state);
			wake_up_interruptible(&nlk->wait);
		} else {
			nlk->flags &= ~NETLINK_RECV_NO_ENOBUFS;
		}
		err = 0;
		break;
#ifdef CONFIG_NETLINK_MMAP
	case NETLINK_RX_RING:
	case NETLINK_TX_RING: {
		struct nl_mmap_req req;

		/* Rings might consume more memory than queue limits, require
		 * CAP_NET_ADMIN.
		 */
		if (!capable(CAP_NET_ADMIN))
			return -EPERM;
		if (optlen < sizeof(req))
			return -EINVAL;
		if (copy_from_user(&req, optval, sizeof(req)))
			return -EFAULT;
		err = netlink_set_ring(sk, &req, false,
				       optname == NETLINK_TX_RING);
		break;
	}
#endif /* CONFIG_NETLINK_MMAP */
	default:
		err = -ENOPROTOOPT;
	}
	return err;
}

static int netlink_getsockopt(struct socket *sock, int level, int optname,
			      char __user *optval, int __user *optlen)
{
	struct sock *sk = sock->sk;
	struct netlink_sock *nlk = nlk_sk(sk);
	int len, val, err;

	if (level != SOL_NETLINK)
		return -ENOPROTOOPT;

	if (get_user(len, optlen))
		return -EFAULT;
	if (len < 0)
		return -EINVAL;

	switch (optname) {
	case NETLINK_PKTINFO:
		if (len < sizeof(int))
			return -EINVAL;
		len = sizeof(int);
		val = nlk->flags & NETLINK_RECV_PKTINFO ? 1 : 0;
		if (put_user(len, optlen) ||
		    put_user(val, optval))
			return -EFAULT;
		err = 0;
		break;
	case NETLINK_BROADCAST_ERROR:
		if (len < sizeof(int))
			return -EINVAL;
		len = sizeof(int);
		val = nlk->flags & NETLINK_BROADCAST_SEND_ERROR ? 1 : 0;
		if (put_user(len, optlen) ||
		    put_user(val, optval))
			return -EFAULT;
		err = 0;
		break;
	case NETLINK_NO_ENOBUFS:
		if (len < sizeof(int))
			return -EINVAL;
		len = sizeof(int);
		val = nlk->flags & NETLINK_RECV_NO_ENOBUFS ? 1 : 0;
		if (put_user(len, optlen) ||
		    put_user(val, optval))
			return -EFAULT;
		err = 0;
		break;
	default:
		err = -ENOPROTOOPT;
	}
	return err;
}

static void netlink_cmsg_recv_pktinfo(struct msghdr *msg, struct sk_buff *skb)
{
	struct nl_pktinfo info;

	info.group = NETLINK_CB(skb).dst_group;
	put_cmsg(msg, SOL_NETLINK, NETLINK_PKTINFO, sizeof(info), &info);
}

static int netlink_sendmsg(struct kiocb *kiocb, struct socket *sock,
			   struct msghdr *msg, size_t len)
{
	struct sock_iocb *siocb = kiocb_to_siocb(kiocb);
	struct sock *sk = sock->sk;
	struct netlink_sock *nlk = nlk_sk(sk);
	DECLARE_SOCKADDR(struct sockaddr_nl *, addr, msg->msg_name);
	u32 dst_portid;
	u32 dst_group;
	struct sk_buff *skb;
	int err;
	struct scm_cookie scm;
	u32 netlink_skb_flags = 0;

	if (msg->msg_flags&MSG_OOB)
		return -EOPNOTSUPP;

	if (NULL == siocb->scm)
		siocb->scm = &scm;

	err = scm_send(sock, msg, siocb->scm, true);
	if (err < 0)
		return err;

	if (msg->msg_namelen) {
		err = -EINVAL;
		if (addr->nl_family != AF_NETLINK)
			goto out;
		dst_portid = addr->nl_pid;
		dst_group = ffs(addr->nl_groups);
		err =  -EPERM;
		if ((dst_group || dst_portid) &&
		    !netlink_allowed(sock, NL_CFG_F_NONROOT_SEND))
			goto out;
		netlink_skb_flags |= NETLINK_SKB_DST;
	} else {
		dst_portid = nlk->dst_portid;
		dst_group = nlk->dst_group;
	}

	if (!nlk->portid) {
		err = netlink_autobind(sock);
		if (err)
			goto out;
	}

	if (netlink_tx_is_mmaped(sk) &&
	    msg->msg_iov->iov_base == NULL) {
		err = netlink_mmap_sendmsg(sk, msg, dst_portid, dst_group,
					   siocb);
		goto out;
	}

	err = -EMSGSIZE;
	if (len > sk->sk_sndbuf - 32)
		goto out;
	err = -ENOBUFS;
	skb = netlink_alloc_large_skb(len, dst_group);
	if (skb == NULL)
		goto out;

	NETLINK_CB(skb).portid	= nlk->portid;
	NETLINK_CB(skb).dst_group = dst_group;
	NETLINK_CB(skb).creds	= siocb->scm->creds;
	NETLINK_CB(skb).flags	= netlink_skb_flags;

	err = -EFAULT;
	if (memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len)) {
		kfree_skb(skb);
		goto out;
	}

	err = security_netlink_send(sk, skb);
	if (err) {
		kfree_skb(skb);
		goto out;
	}

	if (dst_group) {
		atomic_inc(&skb->users);
		netlink_broadcast(sk, skb, dst_portid, dst_group, GFP_KERNEL);
	}
	err = netlink_unicast(sk, skb, dst_portid, msg->msg_flags&MSG_DONTWAIT);

out:
	scm_destroy(siocb->scm);
	return err;
}

static int netlink_recvmsg(struct kiocb *kiocb, struct socket *sock,
			   struct msghdr *msg, size_t len,
			   int flags)
{
	struct sock_iocb *siocb = kiocb_to_siocb(kiocb);
	struct scm_cookie scm;
	struct sock *sk = sock->sk;
	struct netlink_sock *nlk = nlk_sk(sk);
	int noblock = flags&MSG_DONTWAIT;
	size_t copied;
	struct sk_buff *skb, *data_skb;
	int err, ret;

	if (flags&MSG_OOB)
		return -EOPNOTSUPP;

	copied = 0;

	skb = skb_recv_datagram(sk, flags, noblock, &err);
	if (skb == NULL)
		goto out;

	data_skb = skb;

#ifdef CONFIG_COMPAT_NETLINK_MESSAGES
	if (unlikely(skb_shinfo(skb)->frag_list)) {
		/*
		 * If this skb has a frag_list, then here that means that we
		 * will have to use the frag_list skb's data for compat tasks
		 * and the regular skb's data for normal (non-compat) tasks.
		 *
		 * If we need to send the compat skb, assign it to the
		 * 'data_skb' variable so that it will be used below for data
		 * copying. We keep 'skb' for everything else, including
		 * freeing both later.
		 */
		if (flags & MSG_CMSG_COMPAT)
			data_skb = skb_shinfo(skb)->frag_list;
	}
#endif

	/* Record the max length of recvmsg() calls for future allocations */
	nlk->max_recvmsg_len = max(nlk->max_recvmsg_len, len);
	nlk->max_recvmsg_len = min_t(size_t, nlk->max_recvmsg_len,
				     16384);

	copied = data_skb->len;
	if (len < copied) {
		msg->msg_flags |= MSG_TRUNC;
		copied = len;
	}

	skb_reset_transport_header(data_skb);
	err = skb_copy_datagram_iovec(data_skb, 0, msg->msg_iov, copied);

	if (msg->msg_name) {
		DECLARE_SOCKADDR(struct sockaddr_nl *, addr, msg->msg_name);
		addr->nl_family = AF_NETLINK;
		addr->nl_pad    = 0;
		addr->nl_pid	= NETLINK_CB(skb).portid;
		addr->nl_groups	= netlink_group_mask(NETLINK_CB(skb).dst_group);
		msg->msg_namelen = sizeof(*addr);
	}

	if (nlk->flags & NETLINK_RECV_PKTINFO)
		netlink_cmsg_recv_pktinfo(msg, skb);

	if (NULL == siocb->scm) {
		memset(&scm, 0, sizeof(scm));
		siocb->scm = &scm;
	}
	siocb->scm->creds = *NETLINK_CREDS(skb);
	if (flags & MSG_TRUNC)
		copied = data_skb->len;

	skb_free_datagram(sk, skb);

	if (nlk->cb_running &&
	    atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf / 2) {
		ret = netlink_dump(sk);
		if (ret) {
			sk->sk_err = -ret;
			sk->sk_error_report(sk);
		}
	}

	scm_recv(sock, msg, siocb->scm, flags);
out:
	netlink_rcv_wake(sk);
	return err ? : copied;
}

static void netlink_data_ready(struct sock *sk)
{
	BUG();
}

/*
 *	We export these functions to other modules. They provide a
 *	complete set of kernel non-blocking support for message
 *	queueing.
 */

struct sock *
__netlink_kernel_create(struct net *net, int unit, struct module *module,
			struct netlink_kernel_cfg *cfg)
{
	struct socket *sock;
	struct sock *sk;
	struct netlink_sock *nlk;
	struct listeners *listeners = NULL;
	struct mutex *cb_mutex = cfg ? cfg->cb_mutex : NULL;
	unsigned int groups;

	BUG_ON(!nl_table);

	if (unit < 0 || unit >= MAX_LINKS)
		return NULL;

	if (sock_create_lite(PF_NETLINK, SOCK_DGRAM, unit, &sock))
		return NULL;

	/*
	 * We have to just have a reference on the net from sk, but don't
	 * get_net it. Besides, we cannot get and then put the net here.
	 * So we create one inside init_net and the move it to net.
	 */

	if (__netlink_create(&init_net, sock, cb_mutex, unit) < 0)
		goto out_sock_release_nosk;

	sk = sock->sk;
	sk_change_net(sk, net);

	if (!cfg || cfg->groups < 32)
		groups = 32;
	else
		groups = cfg->groups;

	listeners = kzalloc(sizeof(*listeners) + NLGRPSZ(groups), GFP_KERNEL);
	if (!listeners)
		goto out_sock_release;

	sk->sk_data_ready = netlink_data_ready;
	if (cfg && cfg->input)
		nlk_sk(sk)->netlink_rcv = cfg->input;

	if (netlink_insert(sk, net, 0))
		goto out_sock_release;

	nlk = nlk_sk(sk);
	nlk->flags |= NETLINK_KERNEL_SOCKET;

	netlink_table_grab();
	if (!nl_table[unit].registered) {
		nl_table[unit].groups = groups;
		rcu_assign_pointer(nl_table[unit].listeners, listeners);
		nl_table[unit].cb_mutex = cb_mutex;
		nl_table[unit].module = module;
		if (cfg) {
			nl_table[unit].bind = cfg->bind;
			nl_table[unit].flags = cfg->flags;
			if (cfg->compare)
				nl_table[unit].compare = cfg->compare;
		}
		nl_table[unit].registered = 1;
	} else {
		kfree(listeners);
		nl_table[unit].registered++;
	}
	netlink_table_ungrab();
	return sk;

out_sock_release:
	kfree(listeners);
	netlink_kernel_release(sk);
	return NULL;

out_sock_release_nosk:
	sock_release(sock);
	return NULL;
}
EXPORT_SYMBOL(__netlink_kernel_create);

void
netlink_kernel_release(struct sock *sk)
{
	sk_release_kernel(sk);
}
EXPORT_SYMBOL(netlink_kernel_release);
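
/*
 * Illustrative sketch: how a subsystem typically creates its kernel-side
 * socket.  netlink_kernel_create() is the usual <linux/netlink.h> wrapper
 * that passes THIS_MODULE to __netlink_kernel_create().  The input handler
 * and the choice of NETLINK_USERSOCK as the protocol slot are hypothetical.
 */
static void __maybe_unused example_input(struct sk_buff *skb)
{
	/* called for every skb unicast to the kernel socket */
	pr_debug("received %u bytes from portid %u\n",
		 skb->len, NETLINK_CB(skb).portid);
}

static __maybe_unused struct sock *example_create_kernel_socket(struct net *net)
{
	struct netlink_kernel_cfg cfg = {
		.groups	= 32,
		.input	= example_input,
	};

	return netlink_kernel_create(net, NETLINK_USERSOCK, &cfg);
}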

int __netlink_change_ngroups(struct sock *sk, unsigned int groups)
{
	struct listeners *new, *old;
	struct netlink_table *tbl = &nl_table[sk->sk_protocol];

	if (groups < 32)
		groups = 32;

	if (NLGRPSZ(tbl->groups) < NLGRPSZ(groups)) {
		new = kzalloc(sizeof(*new) + NLGRPSZ(groups), GFP_ATOMIC);
		if (!new)
			return -ENOMEM;
		old = nl_deref_protected(tbl->listeners);
		memcpy(new->masks, old->masks, NLGRPSZ(tbl->groups));
		rcu_assign_pointer(tbl->listeners, new);

		kfree_rcu(old, rcu);
	}
	tbl->groups = groups;

	return 0;
}

/**
 * netlink_change_ngroups - change number of multicast groups
 *
 * This changes the number of multicast groups that are available
 * on a certain netlink family. Note that it is not possible to
 * change the number of groups to below 32. Also note that it does
 * not implicitly call netlink_clear_multicast_users() when the
 * number of groups is reduced.
 *
 * @sk: The kernel netlink socket, as returned by netlink_kernel_create().
 * @groups: The new number of groups.
 */
int netlink_change_ngroups(struct sock *sk, unsigned int groups)
{
	int err;

	netlink_table_grab();
	err = __netlink_change_ngroups(sk, groups);
	netlink_table_ungrab();

	return err;
}

void __netlink_clear_multicast_users(struct sock *ksk, unsigned int group)
{
	struct sock *sk;
	struct netlink_table *tbl = &nl_table[ksk->sk_protocol];

	sk_for_each_bound(sk, &tbl->mc_list)
		netlink_update_socket_mc(nlk_sk(sk), group, 0);
}

struct nlmsghdr *
__nlmsg_put(struct sk_buff *skb, u32 portid, u32 seq, int type, int len, int flags)
{
	struct nlmsghdr *nlh;
	int size = nlmsg_msg_size(len);

	nlh = (struct nlmsghdr *)skb_put(skb, NLMSG_ALIGN(size));
	nlh->nlmsg_type = type;
	nlh->nlmsg_len = size;
	nlh->nlmsg_flags = flags;
	nlh->nlmsg_pid = portid;
	nlh->nlmsg_seq = seq;
	if (!__builtin_constant_p(size) || NLMSG_ALIGN(size) - size != 0)
		memset(nlmsg_data(nlh) + len, 0, NLMSG_ALIGN(size) - size);
	return nlh;
}
EXPORT_SYMBOL(__nlmsg_put);
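
/*
 * Illustrative sketch: filling in a header with __nlmsg_put().  The caller
 * must have reserved enough tailroom, since __nlmsg_put() does an
 * unconditional skb_put().  In-tree users normally go through nlmsg_put()
 * from <net/netlink.h>, which checks the available room first.  The message
 * type and payload below are hypothetical.
 */
static void __maybe_unused example_fill_header(struct sk_buff *skb,
					       u32 portid, u32 seq)
{
	u32 payload = 0;	/* hypothetical 4-byte payload */
	struct nlmsghdr *nlh;

	nlh = __nlmsg_put(skb, portid, seq, NLMSG_MIN_TYPE, sizeof(payload), 0);
	memcpy(nlmsg_data(nlh), &payload, sizeof(payload));
}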

/*
 * It looks a bit ugly.
 * It would be better to create kernel thread.
 */

static int netlink_dump(struct sock *sk)
{
	struct netlink_sock *nlk = nlk_sk(sk);
	struct netlink_callback *cb;
	struct sk_buff *skb = NULL;
	struct nlmsghdr *nlh;
	int len, err = -ENOBUFS;
	int alloc_size;

	mutex_lock(nlk->cb_mutex);
	if (!nlk->cb_running) {
		err = -EINVAL;
		goto errout_skb;
	}

	cb = &nlk->cb;
	alloc_size = max_t(int, cb->min_dump_alloc, NLMSG_GOODSIZE);

	if (!netlink_rx_is_mmaped(sk) &&
	    atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
		goto errout_skb;

	/* NLMSG_GOODSIZE is small to avoid high order allocations being
	 * required, but it makes sense to _attempt_ a 16K bytes allocation
	 * to reduce number of system calls on dump operations, if user
	 * ever provided a big enough buffer.
	 */
	if (alloc_size < nlk->max_recvmsg_len) {
		skb = netlink_alloc_skb(sk,
					nlk->max_recvmsg_len,
					nlk->portid,
					GFP_KERNEL |
					__GFP_NOWARN |
					__GFP_NORETRY);
		/* available room should be exact amount to avoid MSG_TRUNC */
		if (skb)
			skb_reserve(skb, skb_tailroom(skb) -
					 nlk->max_recvmsg_len);
	}
	if (!skb)
		skb = netlink_alloc_skb(sk, alloc_size, nlk->portid,
					GFP_KERNEL);
	if (!skb)
		goto errout_skb;
	netlink_skb_set_owner_r(skb, sk);

	len = cb->dump(skb, cb);

	if (len > 0) {
		mutex_unlock(nlk->cb_mutex);

		if (sk_filter(sk, skb))
			kfree_skb(skb);
		else
			__netlink_sendskb(sk, skb);
		return 0;
	}

	nlh = nlmsg_put_answer(skb, cb, NLMSG_DONE, sizeof(len), NLM_F_MULTI);
	if (!nlh)
		goto errout_skb;

	nl_dump_check_consistent(cb, nlh);

	memcpy(nlmsg_data(nlh), &len, sizeof(len));

	if (sk_filter(sk, skb))
		kfree_skb(skb);
	else
		__netlink_sendskb(sk, skb);

	if (cb->done)
		cb->done(cb);

	nlk->cb_running = false;
	mutex_unlock(nlk->cb_mutex);
	module_put(cb->module);
	consume_skb(cb->skb);
	return 0;

errout_skb:
	mutex_unlock(nlk->cb_mutex);
	kfree_skb(skb);
	return err;
}

int __netlink_dump_start(struct sock *ssk, struct sk_buff *skb,
			 const struct nlmsghdr *nlh,
			 struct netlink_dump_control *control)
{
	struct netlink_callback *cb;
	struct sock *sk;
	struct netlink_sock *nlk;
	int ret;

	/* Memory mapped dump requests need to be copied to avoid looping
	 * on the pending state in netlink_mmap_sendmsg() while the CB hold
	 * a reference to the skb.
	 */
	if (netlink_skb_is_mmaped(skb)) {
		skb = skb_copy(skb, GFP_KERNEL);
		if (skb == NULL)
			return -ENOBUFS;
	} else
		atomic_inc(&skb->users);

	sk = netlink_lookup(sock_net(ssk), ssk->sk_protocol, NETLINK_CB(skb).portid);
	if (sk == NULL) {
		ret = -ECONNREFUSED;
		goto error_free;
	}

	nlk = nlk_sk(sk);
	mutex_lock(nlk->cb_mutex);
	/* A dump is in progress... */
	if (nlk->cb_running) {
		ret = -EBUSY;
		goto error_unlock;
	}
	/* add reference of module which cb->dump belongs to */
	if (!try_module_get(control->module)) {
		ret = -EPROTONOSUPPORT;
		goto error_unlock;
	}

	cb = &nlk->cb;
	memset(cb, 0, sizeof(*cb));
	cb->dump = control->dump;
	cb->done = control->done;
	cb->nlh = nlh;
	cb->data = control->data;
	cb->module = control->module;
	cb->min_dump_alloc = control->min_dump_alloc;
	cb->skb = skb;

	nlk->cb_running = true;

	mutex_unlock(nlk->cb_mutex);

	ret = netlink_dump(sk);
	sock_put(sk);

	if (ret)
		return ret;

	/* We successfully started a dump, by returning -EINTR we
	 * signal not to send ACK even if it was requested.
	 */
	return -EINTR;

error_unlock:
	sock_put(sk);
	mutex_unlock(nlk->cb_mutex);
error_free:
	kfree_skb(skb);
	return ret;
}
EXPORT_SYMBOL(__netlink_dump_start);

void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err)
{
	struct sk_buff *skb;
	struct nlmsghdr *rep;
	struct nlmsgerr *errmsg;
	size_t payload = sizeof(*errmsg);

	/* error messages get the original request appended */
	if (err)
		payload += nlmsg_len(nlh);

	skb = netlink_alloc_skb(in_skb->sk, nlmsg_total_size(payload),
				NETLINK_CB(in_skb).portid, GFP_KERNEL);
	if (!skb) {
		struct sock *sk;

		sk = netlink_lookup(sock_net(in_skb->sk),
				    in_skb->sk->sk_protocol,
				    NETLINK_CB(in_skb).portid);
		if (sk) {
			sk->sk_err = ENOBUFS;
			sk->sk_error_report(sk);
			sock_put(sk);
		}
		return;
	}

	rep = __nlmsg_put(skb, NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
			  NLMSG_ERROR, payload, 0);
	errmsg = nlmsg_data(rep);
	errmsg->error = err;
	memcpy(&errmsg->msg, nlh, err ? nlh->nlmsg_len : sizeof(*nlh));
	netlink_unicast(in_skb->sk, skb, NETLINK_CB(in_skb).portid, MSG_DONTWAIT);
}
EXPORT_SYMBOL(netlink_ack);

int netlink_rcv_skb(struct sk_buff *skb, int (*cb)(struct sk_buff *,
						     struct nlmsghdr *))
{
	struct nlmsghdr *nlh;
	int err;

	while (skb->len >= nlmsg_total_size(0)) {
		int msglen;

		nlh = nlmsg_hdr(skb);
		err = 0;

		if (nlh->nlmsg_len < NLMSG_HDRLEN || skb->len < nlh->nlmsg_len)
			return 0;

		/* Only requests are handled by the kernel */
		if (!(nlh->nlmsg_flags & NLM_F_REQUEST))
			goto ack;

		/* Skip control messages */
		if (nlh->nlmsg_type < NLMSG_MIN_TYPE)
			goto ack;

		err = cb(skb, nlh);
		if (err == -EINTR)
			goto skip;

ack:
		if (nlh->nlmsg_flags & NLM_F_ACK || err)
			netlink_ack(skb, nlh, err);

skip:
		msglen = NLMSG_ALIGN(nlh->nlmsg_len);
		if (msglen > skb->len)
			msglen = skb->len;
		skb_pull(skb, msglen);
	}

	return 0;
}
EXPORT_SYMBOL(netlink_rcv_skb);
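
/*
 * Illustrative sketch: the usual shape of a kernel receive path built on
 * netlink_rcv_skb().  The per-message handler is hypothetical; rtnetlink
 * and genetlink follow the same pattern.
 */
static int __maybe_unused example_rcv_msg(struct sk_buff *skb,
					  struct nlmsghdr *nlh)
{
	/* dispatch on nlh->nlmsg_type and return 0 or a negative errno;
	 * netlink_rcv_skb() turns the result into an ACK when one was
	 * requested.
	 */
	return 0;
}

static void __maybe_unused example_rcv(struct sk_buff *skb)
{
	/* suitable as the cfg->input callback of a kernel socket */
	netlink_rcv_skb(skb, &example_rcv_msg);
}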

/**
 * nlmsg_notify - send a notification netlink message
 * @sk: netlink socket to use
 * @skb: notification message
 * @portid: destination netlink portid for reports or 0
 * @group: destination multicast group or 0
 * @report: 1 to report back, 0 to disable
 * @flags: allocation flags
 */
int nlmsg_notify(struct sock *sk, struct sk_buff *skb, u32 portid,
		 unsigned int group, int report, gfp_t flags)
{
	int err = 0;

	if (group) {
		int exclude_portid = 0;

		if (report) {
			atomic_inc(&skb->users);
			exclude_portid = portid;
		}

		/* errors reported via destination sk->sk_err, but propagate
		 * delivery errors if NETLINK_BROADCAST_ERROR flag is set */
		err = nlmsg_multicast(sk, skb, exclude_portid, group, flags);
	}

	if (report) {
		int err2;

		err2 = nlmsg_unicast(sk, skb, portid);
		if (!err || err == -ESRCH)
			err = err2;
	}

	return err;
}
EXPORT_SYMBOL(nlmsg_notify);
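
/*
 * Illustrative sketch: typical use of nlmsg_notify() once an event message
 * has been built.  Whether the message is multicast, unicast back to the
 * requester, or both follows exactly from @group and @report as implemented
 * above; the group number is hypothetical.
 */
static int __maybe_unused example_send_notification(struct sock *kernel_sk,
						    struct sk_buff *skb,
						    u32 requester_portid,
						    int echo_requested)
{
	const unsigned int example_group = 1;	/* hypothetical group */

	return nlmsg_notify(kernel_sk, skb, requester_portid, example_group,
			    echo_requested, GFP_KERNEL);
}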

#ifdef CONFIG_PROC_FS
struct nl_seq_iter {
	struct seq_net_private p;
	int link;
	int hash_idx;
};

static struct sock *netlink_seq_socket_idx(struct seq_file *seq, loff_t pos)
{
	struct nl_seq_iter *iter = seq->private;
	int i, j;
	struct netlink_sock *nlk;
	struct sock *s;
	loff_t off = 0;

	for (i = 0; i < MAX_LINKS; i++) {
		struct rhashtable *ht = &nl_table[i].hash;
		const struct bucket_table *tbl = rht_dereference_rcu(ht->tbl, ht);

		for (j = 0; j < tbl->size; j++) {
			rht_for_each_entry_rcu(nlk, tbl->buckets[j], node) {
				s = (struct sock *)nlk;

				if (sock_net(s) != seq_file_net(seq))
					continue;
				if (off == pos) {
					iter->link = i;
					iter->hash_idx = j;
					return s;
				}
				++off;
			}
		}
	}
	return NULL;
}

static void *netlink_seq_start(struct seq_file *seq, loff_t *pos)
{
	rcu_read_lock();
	return *pos ? netlink_seq_socket_idx(seq, *pos - 1) : SEQ_START_TOKEN;
}

static void *netlink_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct netlink_sock *nlk;
	struct nl_seq_iter *iter;
	struct net *net;
	int i, j;

	++*pos;

	if (v == SEQ_START_TOKEN)
		return netlink_seq_socket_idx(seq, 0);

	net = seq_file_net(seq);
	iter = seq->private;
	nlk = v;

	rht_for_each_entry_rcu(nlk, nlk->node.next, node)
		if (net_eq(sock_net((struct sock *)nlk), net))
			return nlk;

	i = iter->link;
	j = iter->hash_idx + 1;

	do {
		struct rhashtable *ht = &nl_table[i].hash;
		const struct bucket_table *tbl = rht_dereference_rcu(ht->tbl, ht);

		for (; j < tbl->size; j++) {
			rht_for_each_entry_rcu(nlk, tbl->buckets[j], node) {
				if (net_eq(sock_net((struct sock *)nlk), net)) {
					iter->link = i;
					iter->hash_idx = j;
					return nlk;
				}
			}
		}

		j = 0;
	} while (++i < MAX_LINKS);

	return NULL;
}

static void netlink_seq_stop(struct seq_file *seq, void *v)
{
	rcu_read_unlock();
}


static int netlink_seq_show(struct seq_file *seq, void *v)
{
	if (v == SEQ_START_TOKEN) {
		seq_puts(seq,
			 "sk       Eth Pid    Groups   "
			 "Rmem     Wmem     Dump     Locks     Drops     Inode\n");
	} else {
		struct sock *s = v;
		struct netlink_sock *nlk = nlk_sk(s);

		seq_printf(seq, "%pK %-3d %-6u %08x %-8d %-8d %d %-8d %-8d %-8lu\n",
			   s,
			   s->sk_protocol,
			   nlk->portid,
			   nlk->groups ? (u32)nlk->groups[0] : 0,
			   sk_rmem_alloc_get(s),
			   sk_wmem_alloc_get(s),
			   nlk->cb_running,
			   atomic_read(&s->sk_refcnt),
			   atomic_read(&s->sk_drops),
			   sock_i_ino(s)
			);

	}
	return 0;
}

static const struct seq_operations netlink_seq_ops = {
	.start  = netlink_seq_start,
	.next   = netlink_seq_next,
	.stop   = netlink_seq_stop,
	.show   = netlink_seq_show,
};


static int netlink_seq_open(struct inode *inode, struct file *file)
{
	return seq_open_net(inode, file, &netlink_seq_ops,
				sizeof(struct nl_seq_iter));
}

static const struct file_operations netlink_seq_fops = {
	.owner		= THIS_MODULE,
	.open		= netlink_seq_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release_net,
};

#endif

int netlink_register_notifier(struct notifier_block *nb)
{
	return atomic_notifier_chain_register(&netlink_chain, nb);
}
EXPORT_SYMBOL(netlink_register_notifier);

int netlink_unregister_notifier(struct notifier_block *nb)
{
	return atomic_notifier_chain_unregister(&netlink_chain, nb);
}
EXPORT_SYMBOL(netlink_unregister_notifier);

static const struct proto_ops netlink_ops = {
	.family =	PF_NETLINK,
	.owner =	THIS_MODULE,
	.release =	netlink_release,
	.bind =		netlink_bind,
	.connect =	netlink_connect,
	.socketpair =	sock_no_socketpair,
	.accept =	sock_no_accept,
	.getname =	netlink_getname,
	.poll =		netlink_poll,
	.ioctl =	sock_no_ioctl,
	.listen =	sock_no_listen,
	.shutdown =	sock_no_shutdown,
	.setsockopt =	netlink_setsockopt,
	.getsockopt =	netlink_getsockopt,
	.sendmsg =	netlink_sendmsg,
	.recvmsg =	netlink_recvmsg,
	.mmap =		netlink_mmap,
	.sendpage =	sock_no_sendpage,
};

static const struct net_proto_family netlink_family_ops = {
	.family = PF_NETLINK,
	.create = netlink_create,
	.owner	= THIS_MODULE,	/* for consistency 8) */
};

static int __net_init netlink_net_init(struct net *net)
{
#ifdef CONFIG_PROC_FS
	if (!proc_create("netlink", 0, net->proc_net, &netlink_seq_fops))
		return -ENOMEM;
#endif
	return 0;
}

static void __net_exit netlink_net_exit(struct net *net)
{
#ifdef CONFIG_PROC_FS
	remove_proc_entry("netlink", net->proc_net);
#endif
}

static void __init netlink_add_usersock_entry(void)
{
	struct listeners *listeners;
	int groups = 32;

	listeners = kzalloc(sizeof(*listeners) + NLGRPSZ(groups), GFP_KERNEL);
	if (!listeners)
		panic("netlink_add_usersock_entry: Cannot allocate listeners\n");

	netlink_table_grab();

	nl_table[NETLINK_USERSOCK].groups = groups;
	rcu_assign_pointer(nl_table[NETLINK_USERSOCK].listeners, listeners);
	nl_table[NETLINK_USERSOCK].module = THIS_MODULE;
	nl_table[NETLINK_USERSOCK].registered = 1;
	nl_table[NETLINK_USERSOCK].flags = NL_CFG_F_NONROOT_SEND;

	netlink_table_ungrab();
}

static struct pernet_operations __net_initdata netlink_net_ops = {
	.init = netlink_net_init,
	.exit = netlink_net_exit,
};

static int __init netlink_proto_init(void)
{
	int i;
	int err = proto_register(&netlink_proto, 0);
	struct rhashtable_params ht_params = {
		.head_offset = offsetof(struct netlink_sock, node),
		.key_offset = offsetof(struct netlink_sock, portid),
		.key_len = sizeof(u32), /* portid */
		.hashfn = arch_fast_hash,
		.max_shift = 16, /* 64K */
		.grow_decision = rht_grow_above_75,
		.shrink_decision = rht_shrink_below_30,
		.mutex_is_held = lockdep_nl_sk_hash_is_held,
	};

	if (err != 0)
		goto out;

	BUILD_BUG_ON(sizeof(struct netlink_skb_parms) > FIELD_SIZEOF(struct sk_buff, cb));

	nl_table = kcalloc(MAX_LINKS, sizeof(*nl_table), GFP_KERNEL);
	if (!nl_table)
		goto panic;

	for (i = 0; i < MAX_LINKS; i++) {
		if (rhashtable_init(&nl_table[i].hash, &ht_params) < 0) {
			while (--i > 0)
				rhashtable_destroy(&nl_table[i].hash);
			kfree(nl_table);
			goto panic;
		}
	}

	INIT_LIST_HEAD(&netlink_tap_all);

	netlink_add_usersock_entry();

	sock_register(&netlink_family_ops);
	register_pernet_subsys(&netlink_net_ops);
	/* The netlink device handler may be needed early. */
	rtnetlink_init();
out:
	return err;
panic:
	panic("netlink_init: Cannot allocate nl_table\n");
}

core_initcall(netlink_proto_init);