/*
 * NETLINK      Kernel-user communication protocol.
 *
 * 		Authors:	Alan Cox <alan@lxorguk.ukuu.org.uk>
 * 				Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
 * 				Patrick McHardy <kaber@trash.net>
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Tue Jun 26 14:36:48 MEST 2001 Herbert "herp" Rosmanith
 *                               added netlink_proto_exit
 * Tue Jan 22 18:32:44 BRST 2002 Arnaldo C. de Melo <acme@conectiva.com.br>
 * 				 use nlk_sk, as sk->protinfo is on a diet 8)
 * Fri Jul 22 19:51:12 MEST 2005 Harald Welte <laforge@gnumonks.org>
 * 				 - inc module use count of module that owns
 * 				   the kernel socket in case userspace opens
 * 				   socket of same protocol
 * 				 - remove all module support, since netlink is
 * 				   mandatory if CONFIG_NET=y these days
 */

#include <linux/module.h>

#include <linux/capability.h>
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/stat.h>
#include <linux/socket.h>
#include <linux/un.h>
#include <linux/fcntl.h>
#include <linux/termios.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/fs.h>
#include <linux/slab.h>
#include <asm/uaccess.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/rtnetlink.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/notifier.h>
#include <linux/security.h>
#include <linux/jhash.h>
#include <linux/jiffies.h>
#include <linux/random.h>
#include <linux/bitops.h>
#include <linux/mm.h>
#include <linux/types.h>
#include <linux/audit.h>
#include <linux/mutex.h>
#include <linux/vmalloc.h>
#include <linux/if_arp.h>
#include <linux/rhashtable.h>
#include <asm/cacheflush.h>
#include <linux/hash.h>
#include <linux/genetlink.h>

#include <net/net_namespace.h>
#include <net/sock.h>
#include <net/scm.h>
#include <net/netlink.h>

#include "af_netlink.h"

struct listeners {
	struct rcu_head		rcu;
	unsigned long		masks[0];
};

/* state bits */
#define NETLINK_S_CONGESTED		0x0

/* flags */
#define NETLINK_F_KERNEL_SOCKET		0x1
#define NETLINK_F_RECV_PKTINFO		0x2
#define NETLINK_F_BROADCAST_SEND_ERROR	0x4
#define NETLINK_F_RECV_NO_ENOBUFS	0x8
#define NETLINK_F_LISTEN_ALL_NSID	0x10
#define NETLINK_F_CAP_ACK		0x20

static inline int netlink_is_kernel(struct sock *sk)
{
	return nlk_sk(sk)->flags & NETLINK_F_KERNEL_SOCKET;
}

struct netlink_table *nl_table __read_mostly;
EXPORT_SYMBOL_GPL(nl_table);

static DECLARE_WAIT_QUEUE_HEAD(nl_table_wait);

static int netlink_dump(struct sock *sk);
static void netlink_skb_destructor(struct sk_buff *skb);

/* nl_table locking explained:
 * Lookup and traversal are protected with an RCU read-side lock. Insertion
 * and removal are protected with per bucket lock while using RCU list
 * modification primitives and may run in parallel to RCU protected lookups.
 * Destruction of the Netlink socket may only occur *after* nl_table_lock has
 * been acquired - either during or after the socket has been removed from
 * the list and after an RCU grace period.
 */
DEFINE_RWLOCK(nl_table_lock);
EXPORT_SYMBOL_GPL(nl_table_lock);
static atomic_t nl_table_users = ATOMIC_INIT(0);

#define nl_deref_protected(X) rcu_dereference_protected(X, lockdep_is_held(&nl_table_lock));

static ATOMIC_NOTIFIER_HEAD(netlink_chain);

static DEFINE_SPINLOCK(netlink_tap_lock);
static struct list_head netlink_tap_all __read_mostly;

static const struct rhashtable_params netlink_rhashtable_params;

static inline u32 netlink_group_mask(u32 group)
{
	return group ? 1 << (group - 1) : 0;
}

static struct sk_buff *netlink_to_full_skb(const struct sk_buff *skb,
					   gfp_t gfp_mask)
{
	unsigned int len = skb_end_offset(skb);
	struct sk_buff *new;

	new = alloc_skb(len, gfp_mask);
	if (new == NULL)
		return NULL;

	NETLINK_CB(new).portid = NETLINK_CB(skb).portid;
	NETLINK_CB(new).dst_group = NETLINK_CB(skb).dst_group;
	NETLINK_CB(new).creds = NETLINK_CB(skb).creds;

	memcpy(skb_put(new, len), skb->data, len);
	return new;
}

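/* Netlink taps mirror netlink traffic to packet-capture devices of type
 * ARPHRD_NETLINK (e.g. the nlmon driver) so that it can be observed with
 * ordinary sniffing tools.
 */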
int netlink_add_tap(struct netlink_tap *nt)
{
	if (unlikely(nt->dev->type != ARPHRD_NETLINK))
		return -EINVAL;

	spin_lock(&netlink_tap_lock);
	list_add_rcu(&nt->list, &netlink_tap_all);
	spin_unlock(&netlink_tap_lock);

	__module_get(nt->module);

	return 0;
}
EXPORT_SYMBOL_GPL(netlink_add_tap);

static int __netlink_remove_tap(struct netlink_tap *nt)
{
	bool found = false;
	struct netlink_tap *tmp;

	spin_lock(&netlink_tap_lock);

	list_for_each_entry(tmp, &netlink_tap_all, list) {
		if (nt == tmp) {
			list_del_rcu(&nt->list);
			found = true;
			goto out;
		}
	}

	pr_warn("__netlink_remove_tap: %p not found\n", nt);
out:
	spin_unlock(&netlink_tap_lock);

	if (found)
		module_put(nt->module);

	return found ? 0 : -ENODEV;
}

int netlink_remove_tap(struct netlink_tap *nt)
{
	int ret;

	ret = __netlink_remove_tap(nt);
	synchronize_net();

	return ret;
}
EXPORT_SYMBOL_GPL(netlink_remove_tap);

static bool netlink_filter_tap(const struct sk_buff *skb)
{
	struct sock *sk = skb->sk;

	/* We take the more conservative approach and
	 * whitelist socket protocols that may pass.
	 */
	switch (sk->sk_protocol) {
	case NETLINK_ROUTE:
	case NETLINK_USERSOCK:
	case NETLINK_SOCK_DIAG:
	case NETLINK_NFLOG:
	case NETLINK_XFRM:
	case NETLINK_FIB_LOOKUP:
	case NETLINK_NETFILTER:
	case NETLINK_GENERIC:
		return true;
	}

	return false;
}

static int __netlink_deliver_tap_skb(struct sk_buff *skb,
				     struct net_device *dev)
{
	struct sk_buff *nskb;
	struct sock *sk = skb->sk;
	int ret = -ENOMEM;

	dev_hold(dev);

	if (netlink_skb_is_mmaped(skb) || is_vmalloc_addr(skb->head))
		nskb = netlink_to_full_skb(skb, GFP_ATOMIC);
	else
		nskb = skb_clone(skb, GFP_ATOMIC);
	if (nskb) {
		nskb->dev = dev;
		nskb->protocol = htons((u16) sk->sk_protocol);
		nskb->pkt_type = netlink_is_kernel(sk) ?
				 PACKET_KERNEL : PACKET_USER;
		skb_reset_network_header(nskb);
		ret = dev_queue_xmit(nskb);
		if (unlikely(ret > 0))
			ret = net_xmit_errno(ret);
	}

	dev_put(dev);
	return ret;
}

static void __netlink_deliver_tap(struct sk_buff *skb)
{
	int ret;
	struct netlink_tap *tmp;

	if (!netlink_filter_tap(skb))
		return;

	list_for_each_entry_rcu(tmp, &netlink_tap_all, list) {
		ret = __netlink_deliver_tap_skb(skb, tmp->dev);
		if (unlikely(ret))
			break;
	}
}

static void netlink_deliver_tap(struct sk_buff *skb)
{
	rcu_read_lock();

	if (unlikely(!list_empty(&netlink_tap_all)))
		__netlink_deliver_tap(skb);

	rcu_read_unlock();
}

static void netlink_deliver_tap_kernel(struct sock *dst, struct sock *src,
				       struct sk_buff *skb)
{
	if (!(netlink_is_kernel(dst) && netlink_is_kernel(src)))
		netlink_deliver_tap(skb);
}

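/* Note the overrun on the socket: unless the user opted out of ENOBUFS
 * reporting, the error is raised once per congestion episode, gated by
 * the NETLINK_S_CONGESTED bit that netlink_rcv_wake() clears again.
 */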
static void netlink_overrun(struct sock *sk)
{
	struct netlink_sock *nlk = nlk_sk(sk);

	if (!(nlk->flags & NETLINK_F_RECV_NO_ENOBUFS)) {
		if (!test_and_set_bit(NETLINK_S_CONGESTED,
				      &nlk_sk(sk)->state)) {
			sk->sk_err = ENOBUFS;
			sk->sk_error_report(sk);
		}
	}
	atomic_inc(&sk->sk_drops);
}

static void netlink_rcv_wake(struct sock *sk)
{
	struct netlink_sock *nlk = nlk_sk(sk);

	if (skb_queue_empty(&sk->sk_receive_queue))
		clear_bit(NETLINK_S_CONGESTED, &nlk->state);
	if (!test_bit(NETLINK_S_CONGESTED, &nlk->state))
		wake_up_interruptible(&nlk->wait);
}

#ifdef CONFIG_NETLINK_MMAP
static bool netlink_rx_is_mmaped(struct sock *sk)
{
	return nlk_sk(sk)->rx_ring.pg_vec != NULL;
}

static bool netlink_tx_is_mmaped(struct sock *sk)
{
	return nlk_sk(sk)->tx_ring.pg_vec != NULL;
}

static __pure struct page *pgvec_to_page(const void *addr)
{
	if (is_vmalloc_addr(addr))
		return vmalloc_to_page(addr);
	else
		return virt_to_page(addr);
}

static void free_pg_vec(void **pg_vec, unsigned int order, unsigned int len)
{
	unsigned int i;

	for (i = 0; i < len; i++) {
		if (pg_vec[i] != NULL) {
			if (is_vmalloc_addr(pg_vec[i]))
				vfree(pg_vec[i]);
			else
				free_pages((unsigned long)pg_vec[i], order);
		}
	}
	kfree(pg_vec);
}

static void *alloc_one_pg_vec_page(unsigned long order)
{
	void *buffer;
	gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP | __GFP_ZERO |
			  __GFP_NOWARN | __GFP_NORETRY;

	buffer = (void *)__get_free_pages(gfp_flags, order);
	if (buffer != NULL)
		return buffer;

	buffer = vzalloc((1 << order) * PAGE_SIZE);
	if (buffer != NULL)
		return buffer;

	gfp_flags &= ~__GFP_NORETRY;
	return (void *)__get_free_pages(gfp_flags, order);
}

static void **alloc_pg_vec(struct netlink_sock *nlk,
			   struct nl_mmap_req *req, unsigned int order)
{
	unsigned int block_nr = req->nm_block_nr;
	unsigned int i;
	void **pg_vec;

	pg_vec = kcalloc(block_nr, sizeof(void *), GFP_KERNEL);
	if (pg_vec == NULL)
		return NULL;

	for (i = 0; i < block_nr; i++) {
		pg_vec[i] = alloc_one_pg_vec_page(order);
		if (pg_vec[i] == NULL)
			goto err1;
	}

	return pg_vec;
err1:
	free_pg_vec(pg_vec, order, block_nr);
	return NULL;
}


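/* Publish a new rx/tx ring (or tear one down when pg_vec is NULL) under
 * the queue lock; the previous page vector, if any, is swapped out and
 * freed after the queue has been purged.
 */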
static void
__netlink_set_ring(struct sock *sk, struct nl_mmap_req *req, bool tx_ring, void **pg_vec,
		   unsigned int order)
{
	struct netlink_sock *nlk = nlk_sk(sk);
	struct sk_buff_head *queue;
	struct netlink_ring *ring;

	queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;
	ring  = tx_ring ? &nlk->tx_ring : &nlk->rx_ring;

	spin_lock_bh(&queue->lock);

	ring->frame_max		= req->nm_frame_nr - 1;
	ring->head		= 0;
	ring->frame_size	= req->nm_frame_size;
	ring->pg_vec_pages	= req->nm_block_size / PAGE_SIZE;

	swap(ring->pg_vec_len, req->nm_block_nr);
	swap(ring->pg_vec_order, order);
	swap(ring->pg_vec, pg_vec);

	__skb_queue_purge(queue);
	spin_unlock_bh(&queue->lock);

	WARN_ON(atomic_read(&nlk->mapped));

	if (pg_vec)
		free_pg_vec(pg_vec, order, req->nm_block_nr);
}

static int netlink_set_ring(struct sock *sk, struct nl_mmap_req *req,
			    bool tx_ring)
{
	struct netlink_sock *nlk = nlk_sk(sk);
	struct netlink_ring *ring;
	void **pg_vec = NULL;
	unsigned int order = 0;

	ring  = tx_ring ? &nlk->tx_ring : &nlk->rx_ring;

	if (atomic_read(&nlk->mapped))
		return -EBUSY;
	if (atomic_read(&ring->pending))
		return -EBUSY;

	if (req->nm_block_nr) {
		if (ring->pg_vec != NULL)
			return -EBUSY;

		if ((int)req->nm_block_size <= 0)
			return -EINVAL;
		if (!PAGE_ALIGNED(req->nm_block_size))
			return -EINVAL;
		if (req->nm_frame_size < NL_MMAP_HDRLEN)
			return -EINVAL;
		if (!IS_ALIGNED(req->nm_frame_size, NL_MMAP_MSG_ALIGNMENT))
			return -EINVAL;

		ring->frames_per_block = req->nm_block_size /
					 req->nm_frame_size;
		if (ring->frames_per_block == 0)
			return -EINVAL;
		if (ring->frames_per_block * req->nm_block_nr !=
		    req->nm_frame_nr)
			return -EINVAL;

		order = get_order(req->nm_block_size);
		pg_vec = alloc_pg_vec(nlk, req, order);
		if (pg_vec == NULL)
			return -ENOMEM;
	} else {
		if (req->nm_frame_nr)
			return -EINVAL;
	}

	mutex_lock(&nlk->pg_vec_lock);
	if (atomic_read(&nlk->mapped) == 0) {
		__netlink_set_ring(sk, req, tx_ring, pg_vec, order);
		mutex_unlock(&nlk->pg_vec_lock);
		return 0;
	}

	mutex_unlock(&nlk->pg_vec_lock);

	if (pg_vec)
		free_pg_vec(pg_vec, order, req->nm_block_nr);

	return -EBUSY;
}

static void netlink_mm_open(struct vm_area_struct *vma)
{
	struct file *file = vma->vm_file;
	struct socket *sock = file->private_data;
	struct sock *sk = sock->sk;

	if (sk)
		atomic_inc(&nlk_sk(sk)->mapped);
}

static void netlink_mm_close(struct vm_area_struct *vma)
{
	struct file *file = vma->vm_file;
	struct socket *sock = file->private_data;
	struct sock *sk = sock->sk;

	if (sk)
		atomic_dec(&nlk_sk(sk)->mapped);
}

static const struct vm_operations_struct netlink_mmap_ops = {
	.open	= netlink_mm_open,
	.close	= netlink_mm_close,
};

static int netlink_mmap(struct file *file, struct socket *sock,
			struct vm_area_struct *vma)
{
	struct sock *sk = sock->sk;
	struct netlink_sock *nlk = nlk_sk(sk);
	struct netlink_ring *ring;
	unsigned long start, size, expected;
	unsigned int i;
	int err = -EINVAL;

	if (vma->vm_pgoff)
		return -EINVAL;

	mutex_lock(&nlk->pg_vec_lock);

	expected = 0;
	for (ring = &nlk->rx_ring; ring <= &nlk->tx_ring; ring++) {
		if (ring->pg_vec == NULL)
			continue;
		expected += ring->pg_vec_len * ring->pg_vec_pages * PAGE_SIZE;
	}

	if (expected == 0)
		goto out;

	size = vma->vm_end - vma->vm_start;
	if (size != expected)
		goto out;

	start = vma->vm_start;
	for (ring = &nlk->rx_ring; ring <= &nlk->tx_ring; ring++) {
		if (ring->pg_vec == NULL)
			continue;

		for (i = 0; i < ring->pg_vec_len; i++) {
			struct page *page;
			void *kaddr = ring->pg_vec[i];
			unsigned int pg_num;

			for (pg_num = 0; pg_num < ring->pg_vec_pages; pg_num++) {
				page = pgvec_to_page(kaddr);
				err = vm_insert_page(vma, start, page);
				if (err < 0)
					goto out;
				start += PAGE_SIZE;
				kaddr += PAGE_SIZE;
			}
		}
	}

	atomic_inc(&nlk->mapped);
	vma->vm_ops = &netlink_mmap_ops;
	err = 0;
out:
	mutex_unlock(&nlk->pg_vec_lock);
	return err;
}

static void netlink_frame_flush_dcache(const struct nl_mmap_hdr *hdr, unsigned int nm_len)
{
#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
	struct page *p_start, *p_end;

	/* First page is flushed through netlink_{get,set}_status */
	p_start = pgvec_to_page(hdr + PAGE_SIZE);
	p_end   = pgvec_to_page((void *)hdr + NL_MMAP_HDRLEN + nm_len - 1);
	while (p_start <= p_end) {
		flush_dcache_page(p_start);
		p_start++;
	}
#endif
}

static enum nl_mmap_status netlink_get_status(const struct nl_mmap_hdr *hdr)
{
	smp_rmb();
	flush_dcache_page(pgvec_to_page(hdr));
	return hdr->nm_status;
}

static void netlink_set_status(struct nl_mmap_hdr *hdr,
			       enum nl_mmap_status status)
{
	smp_mb();
	hdr->nm_status = status;
	flush_dcache_page(pgvec_to_page(hdr));
}

static struct nl_mmap_hdr *
__netlink_lookup_frame(const struct netlink_ring *ring, unsigned int pos)
{
	unsigned int pg_vec_pos, frame_off;

	pg_vec_pos = pos / ring->frames_per_block;
	frame_off  = pos % ring->frames_per_block;

	return ring->pg_vec[pg_vec_pos] + (frame_off * ring->frame_size);
}

static struct nl_mmap_hdr *
netlink_lookup_frame(const struct netlink_ring *ring, unsigned int pos,
		     enum nl_mmap_status status)
{
	struct nl_mmap_hdr *hdr;

	hdr = __netlink_lookup_frame(ring, pos);
	if (netlink_get_status(hdr) != status)
		return NULL;

	return hdr;
}

static struct nl_mmap_hdr *
netlink_current_frame(const struct netlink_ring *ring,
		      enum nl_mmap_status status)
{
	return netlink_lookup_frame(ring, ring->head, status);
}

static void netlink_increment_head(struct netlink_ring *ring)
{
	ring->head = ring->head != ring->frame_max ? ring->head + 1 : 0;
}

static void netlink_forward_ring(struct netlink_ring *ring)
{
	unsigned int head = ring->head;
	const struct nl_mmap_hdr *hdr;

	do {
		hdr = __netlink_lookup_frame(ring, ring->head);
		if (hdr->nm_status == NL_MMAP_STATUS_UNUSED)
			break;
		if (hdr->nm_status != NL_MMAP_STATUS_SKIP)
			break;
		netlink_increment_head(ring);
	} while (ring->head != head);
}

static bool netlink_has_valid_frame(struct netlink_ring *ring)
{
	unsigned int head = ring->head, pos = head;
	const struct nl_mmap_hdr *hdr;

	do {
		hdr = __netlink_lookup_frame(ring, pos);
		if (hdr->nm_status == NL_MMAP_STATUS_VALID)
			return true;
		pos = pos != 0 ? pos - 1 : ring->frame_max;
	} while (pos != head);

	return false;
}

static bool netlink_dump_space(struct netlink_sock *nlk)
{
	struct netlink_ring *ring = &nlk->rx_ring;
	struct nl_mmap_hdr *hdr;
	unsigned int n;

	hdr = netlink_current_frame(ring, NL_MMAP_STATUS_UNUSED);
	if (hdr == NULL)
		return false;

	n = ring->head + ring->frame_max / 2;
	if (n > ring->frame_max)
		n -= ring->frame_max;

	hdr = __netlink_lookup_frame(ring, n);

	return hdr->nm_status == NL_MMAP_STATUS_UNUSED;
}

static unsigned int netlink_poll(struct file *file, struct socket *sock,
				 poll_table *wait)
{
	struct sock *sk = sock->sk;
	struct netlink_sock *nlk = nlk_sk(sk);
	unsigned int mask;
	int err;

	if (nlk->rx_ring.pg_vec != NULL) {
		/* Memory mapped sockets don't call recvmsg(), so flow control
		 * for dumps is performed here. A dump is allowed to continue
		 * if at least half the ring is unused.
		 */
		while (nlk->cb_running && netlink_dump_space(nlk)) {
			err = netlink_dump(sk);
			if (err < 0) {
				sk->sk_err = -err;
				sk->sk_error_report(sk);
				break;
			}
		}
		netlink_rcv_wake(sk);
	}

	mask = datagram_poll(file, sock, wait);

	/* We could already have received frames in the normal receive
	 * queue, that will show up as NL_MMAP_STATUS_COPY in the ring,
	 * so if mask contains pollin/etc already, there's no point
	 * walking the ring.
	 */
	if ((mask & (POLLIN | POLLRDNORM)) != (POLLIN | POLLRDNORM)) {
		spin_lock_bh(&sk->sk_receive_queue.lock);
		if (nlk->rx_ring.pg_vec) {
			if (netlink_has_valid_frame(&nlk->rx_ring))
				mask |= POLLIN | POLLRDNORM;
		}
		spin_unlock_bh(&sk->sk_receive_queue.lock);
	}

	spin_lock_bh(&sk->sk_write_queue.lock);
	if (nlk->tx_ring.pg_vec) {
		if (netlink_current_frame(&nlk->tx_ring, NL_MMAP_STATUS_UNUSED))
			mask |= POLLOUT | POLLWRNORM;
	}
	spin_unlock_bh(&sk->sk_write_queue.lock);

	return mask;
}

static struct nl_mmap_hdr *netlink_mmap_hdr(struct sk_buff *skb)
{
	return (struct nl_mmap_hdr *)(skb->head - NL_MMAP_HDRLEN);
}

static void netlink_ring_setup_skb(struct sk_buff *skb, struct sock *sk,
				   struct netlink_ring *ring,
				   struct nl_mmap_hdr *hdr)
{
	unsigned int size;
	void *data;

	size = ring->frame_size - NL_MMAP_HDRLEN;
	data = (void *)hdr + NL_MMAP_HDRLEN;

	skb->head	= data;
	skb->data	= data;
	skb_reset_tail_pointer(skb);
	skb->end	= skb->tail + size;
	skb->len	= 0;

	skb->destructor	= netlink_skb_destructor;
	NETLINK_CB(skb).flags |= NETLINK_SKB_MMAPED;
	NETLINK_CB(skb).sk = sk;
}

static int netlink_mmap_sendmsg(struct sock *sk, struct msghdr *msg,
				u32 dst_portid, u32 dst_group,
				struct scm_cookie *scm)
{
	struct netlink_sock *nlk = nlk_sk(sk);
	struct netlink_ring *ring;
	struct nl_mmap_hdr *hdr;
	struct sk_buff *skb;
	unsigned int maxlen;
	int err = 0, len = 0;

	mutex_lock(&nlk->pg_vec_lock);

	ring   = &nlk->tx_ring;
	maxlen = ring->frame_size - NL_MMAP_HDRLEN;

	do {
		unsigned int nm_len;

		hdr = netlink_current_frame(ring, NL_MMAP_STATUS_VALID);
		if (hdr == NULL) {
			if (!(msg->msg_flags & MSG_DONTWAIT) &&
			    atomic_read(&nlk->tx_ring.pending))
				schedule();
			continue;
		}

		nm_len = ACCESS_ONCE(hdr->nm_len);
		if (nm_len > maxlen) {
			err = -EINVAL;
			goto out;
		}

		netlink_frame_flush_dcache(hdr, nm_len);

		skb = alloc_skb(nm_len, GFP_KERNEL);
		if (skb == NULL) {
			err = -ENOBUFS;
			goto out;
		}
		__skb_put(skb, nm_len);
		memcpy(skb->data, (void *)hdr + NL_MMAP_HDRLEN, nm_len);
		netlink_set_status(hdr, NL_MMAP_STATUS_UNUSED);

		netlink_increment_head(ring);

		NETLINK_CB(skb).portid	  = nlk->portid;
		NETLINK_CB(skb).dst_group = dst_group;
		NETLINK_CB(skb).creds	  = scm->creds;

		err = security_netlink_send(sk, skb);
		if (err) {
			kfree_skb(skb);
			goto out;
		}

		if (unlikely(dst_group)) {
			atomic_inc(&skb->users);
			netlink_broadcast(sk, skb, dst_portid, dst_group,
					  GFP_KERNEL);
		}
		err = netlink_unicast(sk, skb, dst_portid,
				      msg->msg_flags & MSG_DONTWAIT);
		if (err < 0)
			goto out;
		len += err;

	} while (hdr != NULL ||
		 (!(msg->msg_flags & MSG_DONTWAIT) &&
		  atomic_read(&nlk->tx_ring.pending)));

	if (len > 0)
		err = len;
out:
	mutex_unlock(&nlk->pg_vec_lock);
	return err;
}

static void netlink_queue_mmaped_skb(struct sock *sk, struct sk_buff *skb)
{
	struct nl_mmap_hdr *hdr;

	hdr = netlink_mmap_hdr(skb);
	hdr->nm_len	= skb->len;
	hdr->nm_group	= NETLINK_CB(skb).dst_group;
	hdr->nm_pid	= NETLINK_CB(skb).creds.pid;
	hdr->nm_uid	= from_kuid(sk_user_ns(sk), NETLINK_CB(skb).creds.uid);
	hdr->nm_gid	= from_kgid(sk_user_ns(sk), NETLINK_CB(skb).creds.gid);
	netlink_frame_flush_dcache(hdr, hdr->nm_len);
	netlink_set_status(hdr, NL_MMAP_STATUS_VALID);

	NETLINK_CB(skb).flags |= NETLINK_SKB_DELIVERED;
	kfree_skb(skb);
}

static void netlink_ring_set_copied(struct sock *sk, struct sk_buff *skb)
{
	struct netlink_sock *nlk = nlk_sk(sk);
	struct netlink_ring *ring = &nlk->rx_ring;
	struct nl_mmap_hdr *hdr;

	spin_lock_bh(&sk->sk_receive_queue.lock);
	hdr = netlink_current_frame(ring, NL_MMAP_STATUS_UNUSED);
	if (hdr == NULL) {
		spin_unlock_bh(&sk->sk_receive_queue.lock);
		kfree_skb(skb);
		netlink_overrun(sk);
		return;
	}
	netlink_increment_head(ring);
	__skb_queue_tail(&sk->sk_receive_queue, skb);
	spin_unlock_bh(&sk->sk_receive_queue.lock);

	hdr->nm_len	= skb->len;
	hdr->nm_group	= NETLINK_CB(skb).dst_group;
	hdr->nm_pid	= NETLINK_CB(skb).creds.pid;
	hdr->nm_uid	= from_kuid(sk_user_ns(sk), NETLINK_CB(skb).creds.uid);
	hdr->nm_gid	= from_kgid(sk_user_ns(sk), NETLINK_CB(skb).creds.gid);
	netlink_set_status(hdr, NL_MMAP_STATUS_COPY);
}

#else /* CONFIG_NETLINK_MMAP */
#define netlink_rx_is_mmaped(sk)	false
#define netlink_tx_is_mmaped(sk)	false
#define netlink_mmap			sock_no_mmap
#define netlink_poll			datagram_poll
#define netlink_mmap_sendmsg(sk, msg, dst_portid, dst_group, scm)	0
#endif /* CONFIG_NETLINK_MMAP */

static void netlink_skb_destructor(struct sk_buff *skb)
{
#ifdef CONFIG_NETLINK_MMAP
	struct nl_mmap_hdr *hdr;
	struct netlink_ring *ring;
	struct sock *sk;

	/* If a packet from the kernel to userspace was freed because of an
	 * error without being delivered to userspace, the kernel must reset
	 * the status. In the direction userspace to kernel, the status is
	 * always reset here after the packet was processed and freed.
	 */
	if (netlink_skb_is_mmaped(skb)) {
		hdr = netlink_mmap_hdr(skb);
		sk = NETLINK_CB(skb).sk;

		if (NETLINK_CB(skb).flags & NETLINK_SKB_TX) {
			netlink_set_status(hdr, NL_MMAP_STATUS_UNUSED);
			ring = &nlk_sk(sk)->tx_ring;
		} else {
			if (!(NETLINK_CB(skb).flags & NETLINK_SKB_DELIVERED)) {
				hdr->nm_len = 0;
				netlink_set_status(hdr, NL_MMAP_STATUS_VALID);
			}
			ring = &nlk_sk(sk)->rx_ring;
		}

		WARN_ON(atomic_read(&ring->pending) == 0);
		atomic_dec(&ring->pending);
		sock_put(sk);

		skb->head = NULL;
	}
#endif
	if (is_vmalloc_addr(skb->head)) {
		if (!skb->cloned ||
		    !atomic_dec_return(&(skb_shinfo(skb)->dataref)))
			vfree(skb->head);

		skb->head = NULL;
	}
	if (skb->sk != NULL)
		sock_rfree(skb);
}

static void netlink_skb_set_owner_r(struct sk_buff *skb, struct sock *sk)
{
	WARN_ON(skb->sk != NULL);
	skb->sk = sk;
	skb->destructor = netlink_skb_destructor;
	atomic_add(skb->truesize, &sk->sk_rmem_alloc);
	sk_mem_charge(sk, skb->truesize);
}

static void netlink_sock_destruct(struct sock *sk)
{
	struct netlink_sock *nlk = nlk_sk(sk);

	if (nlk->cb_running) {
		if (nlk->cb.done)
			nlk->cb.done(&nlk->cb);

		module_put(nlk->cb.module);
		kfree_skb(nlk->cb.skb);
	}

	skb_queue_purge(&sk->sk_receive_queue);
#ifdef CONFIG_NETLINK_MMAP
	if (1) {
		struct nl_mmap_req req;

		memset(&req, 0, sizeof(req));
		if (nlk->rx_ring.pg_vec)
			__netlink_set_ring(sk, &req, false, NULL, 0);
		memset(&req, 0, sizeof(req));
		if (nlk->tx_ring.pg_vec)
			__netlink_set_ring(sk, &req, true, NULL, 0);
	}
#endif /* CONFIG_NETLINK_MMAP */

	if (!sock_flag(sk, SOCK_DEAD)) {
		printk(KERN_ERR "Freeing alive netlink socket %p\n", sk);
		return;
	}

	WARN_ON(atomic_read(&sk->sk_rmem_alloc));
	WARN_ON(atomic_read(&sk->sk_wmem_alloc));
	WARN_ON(nlk_sk(sk)->groups);
}

/* This lock without WQ_FLAG_EXCLUSIVE is good on UP and it is _very_ bad on
 * SMP. Look, when several writers sleep and reader wakes them up, all but one
 * immediately hit write lock and grab all the cpus. Exclusive sleep solves
 * this, _but_ remember, it adds useless work on UP machines.
 */

void netlink_table_grab(void)
	__acquires(nl_table_lock)
{
	might_sleep();

	write_lock_irq(&nl_table_lock);

	if (atomic_read(&nl_table_users)) {
		DECLARE_WAITQUEUE(wait, current);

		add_wait_queue_exclusive(&nl_table_wait, &wait);
		for (;;) {
			set_current_state(TASK_UNINTERRUPTIBLE);
			if (atomic_read(&nl_table_users) == 0)
				break;
			write_unlock_irq(&nl_table_lock);
			schedule();
			write_lock_irq(&nl_table_lock);
		}

		__set_current_state(TASK_RUNNING);
		remove_wait_queue(&nl_table_wait, &wait);
	}
}

void netlink_table_ungrab(void)
	__releases(nl_table_lock)
{
	write_unlock_irq(&nl_table_lock);
	wake_up(&nl_table_wait);
}

static inline void
netlink_lock_table(void)
{
	/* read_lock() synchronizes us to netlink_table_grab */

	read_lock(&nl_table_lock);
	atomic_inc(&nl_table_users);
	read_unlock(&nl_table_lock);
}

static inline void
netlink_unlock_table(void)
{
	if (atomic_dec_and_test(&nl_table_users))
		wake_up(&nl_table_wait);
}

struct netlink_compare_arg
{
	possible_net_t pnet;
	u32 portid;
};

/* Doing sizeof directly may yield 4 extra bytes on 64-bit. */
#define netlink_compare_arg_len \
	(offsetof(struct netlink_compare_arg, portid) + sizeof(u32))

static inline int netlink_compare(struct rhashtable_compare_arg *arg,
				  const void *ptr)
{
	const struct netlink_compare_arg *x = arg->key;
	const struct netlink_sock *nlk = ptr;

	return nlk->portid != x->portid ||
	       !net_eq(sock_net(&nlk->sk), read_pnet(&x->pnet));
}

static void netlink_compare_arg_init(struct netlink_compare_arg *arg,
				     struct net *net, u32 portid)
{
	memset(arg, 0, sizeof(*arg));
	write_pnet(&arg->pnet, net);
	arg->portid = portid;
}

static struct sock *__netlink_lookup(struct netlink_table *table, u32 portid,
				     struct net *net)
{
	struct netlink_compare_arg arg;

	netlink_compare_arg_init(&arg, net, portid);
	return rhashtable_lookup_fast(&table->hash, &arg,
				      netlink_rhashtable_params);
}

static int __netlink_insert(struct netlink_table *table, struct sock *sk)
{
	struct netlink_compare_arg arg;

	netlink_compare_arg_init(&arg, sock_net(sk), nlk_sk(sk)->portid);
	return rhashtable_lookup_insert_key(&table->hash, &arg,
					    &nlk_sk(sk)->node,
					    netlink_rhashtable_params);
}

static struct sock *netlink_lookup(struct net *net, int protocol, u32 portid)
{
	struct netlink_table *table = &nl_table[protocol];
	struct sock *sk;

	rcu_read_lock();
	sk = __netlink_lookup(table, portid, net);
	if (sk)
		sock_hold(sk);
	rcu_read_unlock();

	return sk;
}

static const struct proto_ops netlink_ops;

static void
netlink_update_listeners(struct sock *sk)
{
	struct netlink_table *tbl = &nl_table[sk->sk_protocol];
	unsigned long mask;
	unsigned int i;
	struct listeners *listeners;

	listeners = nl_deref_protected(tbl->listeners);
	if (!listeners)
		return;

	for (i = 0; i < NLGRPLONGS(tbl->groups); i++) {
		mask = 0;
		sk_for_each_bound(sk, &tbl->mc_list) {
			if (i < NLGRPLONGS(nlk_sk(sk)->ngroups))
				mask |= nlk_sk(sk)->groups[i];
		}
		listeners->masks[i] = mask;
	}
	/* this function is only called with the netlink table "grabbed", which
	 * makes sure updates are visible before bind or setsockopt return. */
}

static int netlink_insert(struct sock *sk, u32 portid)
{
	struct netlink_table *table = &nl_table[sk->sk_protocol];
	int err;

	lock_sock(sk);

	err = nlk_sk(sk)->portid == portid ? 0 : -EBUSY;
	if (nlk_sk(sk)->bound)
		goto err;

	err = -ENOMEM;
	if (BITS_PER_LONG > 32 &&
	    unlikely(atomic_read(&table->hash.nelems) >= UINT_MAX))
		goto err;

	nlk_sk(sk)->portid = portid;
	sock_hold(sk);

	err = __netlink_insert(table, sk);
	if (err) {
		/* In case the hashtable backend returns with -EBUSY
		 * from here, it must not escape to the caller.
		 */
		if (unlikely(err == -EBUSY))
			err = -EOVERFLOW;
		if (err == -EEXIST)
			err = -EADDRINUSE;
		sock_put(sk);
		goto err;
	}

	/* We need to ensure that the socket is hashed and visible. */
	smp_wmb();
	nlk_sk(sk)->bound = portid;

err:
	release_sock(sk);
	return err;
}

static void netlink_remove(struct sock *sk)
{
	struct netlink_table *table;

	table = &nl_table[sk->sk_protocol];
	if (!rhashtable_remove_fast(&table->hash, &nlk_sk(sk)->node,
				    netlink_rhashtable_params)) {
		WARN_ON(atomic_read(&sk->sk_refcnt) == 1);
		__sock_put(sk);
	}

	netlink_table_grab();
	if (nlk_sk(sk)->subscriptions) {
		__sk_del_bind_node(sk);
		netlink_update_listeners(sk);
	}
	if (sk->sk_protocol == NETLINK_GENERIC)
		atomic_inc(&genl_sk_destructing_cnt);
	netlink_table_ungrab();
}

static struct proto netlink_proto = {
	.name	  = "NETLINK",
	.owner	  = THIS_MODULE,
	.obj_size = sizeof(struct netlink_sock),
};

static int __netlink_create(struct net *net, struct socket *sock,
			    struct mutex *cb_mutex, int protocol,
			    int kern)
{
	struct sock *sk;
	struct netlink_sock *nlk;

	sock->ops = &netlink_ops;

	sk = sk_alloc(net, PF_NETLINK, GFP_KERNEL, &netlink_proto, kern);
	if (!sk)
		return -ENOMEM;

	sock_init_data(sock, sk);

	nlk = nlk_sk(sk);
	if (cb_mutex) {
		nlk->cb_mutex = cb_mutex;
	} else {
		nlk->cb_mutex = &nlk->cb_def_mutex;
		mutex_init(nlk->cb_mutex);
	}
	init_waitqueue_head(&nlk->wait);
#ifdef CONFIG_NETLINK_MMAP
	mutex_init(&nlk->pg_vec_lock);
#endif

	sk->sk_destruct = netlink_sock_destruct;
	sk->sk_protocol = protocol;
	return 0;
}

static int netlink_create(struct net *net, struct socket *sock, int protocol,
			  int kern)
{
	struct module *module = NULL;
	struct mutex *cb_mutex;
	struct netlink_sock *nlk;
	int (*bind)(struct net *net, int group);
	void (*unbind)(struct net *net, int group);
	int err = 0;

	sock->state = SS_UNCONNECTED;

	if (sock->type != SOCK_RAW && sock->type != SOCK_DGRAM)
		return -ESOCKTNOSUPPORT;

	if (protocol < 0 || protocol >= MAX_LINKS)
		return -EPROTONOSUPPORT;

	netlink_lock_table();
#ifdef CONFIG_MODULES
	if (!nl_table[protocol].registered) {
		netlink_unlock_table();
		request_module("net-pf-%d-proto-%d", PF_NETLINK, protocol);
		netlink_lock_table();
	}
#endif
	if (nl_table[protocol].registered &&
	    try_module_get(nl_table[protocol].module))
		module = nl_table[protocol].module;
	else
		err = -EPROTONOSUPPORT;
	cb_mutex = nl_table[protocol].cb_mutex;
	bind = nl_table[protocol].bind;
	unbind = nl_table[protocol].unbind;
	netlink_unlock_table();

	if (err < 0)
		goto out;

	err = __netlink_create(net, sock, cb_mutex, protocol, kern);
	if (err < 0)
		goto out_module;

	local_bh_disable();
	sock_prot_inuse_add(net, &netlink_proto, 1);
	local_bh_enable();

	nlk = nlk_sk(sock->sk);
	nlk->module = module;
	nlk->netlink_bind = bind;
	nlk->netlink_unbind = unbind;
out:
	return err;

out_module:
	module_put(module);
	goto out;
}

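/* The final sock_put() is deferred by an RCU grace period so that lockless
 * lookups which still see the socket cannot touch freed memory.
 */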
static void deferred_put_nlk_sk(struct rcu_head *head)
{
	struct netlink_sock *nlk = container_of(head, struct netlink_sock, rcu);

	sock_put(&nlk->sk);
}

static int netlink_release(struct socket *sock)
{
	struct sock *sk = sock->sk;
	struct netlink_sock *nlk;

	if (!sk)
		return 0;

	netlink_remove(sk);
	sock_orphan(sk);
	nlk = nlk_sk(sk);

	/*
	 * OK. Socket is unlinked, any packets that arrive now
	 * will be purged.
	 */

	/* must not acquire netlink_table_lock in any way again before unbind
	 * and notifying genetlink is done as otherwise it might deadlock
	 */
	if (nlk->netlink_unbind) {
		int i;

		for (i = 0; i < nlk->ngroups; i++)
			if (test_bit(i, nlk->groups))
				nlk->netlink_unbind(sock_net(sk), i + 1);
	}
	if (sk->sk_protocol == NETLINK_GENERIC &&
	    atomic_dec_return(&genl_sk_destructing_cnt) == 0)
		wake_up(&genl_sk_destructing_waitq);

	sock->sk = NULL;
	wake_up_interruptible_all(&nlk->wait);

	skb_queue_purge(&sk->sk_write_queue);

	if (nlk->portid) {
		struct netlink_notify n = {
						.net = sock_net(sk),
						.protocol = sk->sk_protocol,
						.portid = nlk->portid,
					  };
		atomic_notifier_call_chain(&netlink_chain,
				NETLINK_URELEASE, &n);
	}

	module_put(nlk->module);

	if (netlink_is_kernel(sk)) {
		netlink_table_grab();
		BUG_ON(nl_table[sk->sk_protocol].registered == 0);
		if (--nl_table[sk->sk_protocol].registered == 0) {
			struct listeners *old;

			old = nl_deref_protected(nl_table[sk->sk_protocol].listeners);
			RCU_INIT_POINTER(nl_table[sk->sk_protocol].listeners, NULL);
			kfree_rcu(old, rcu);
			nl_table[sk->sk_protocol].module = NULL;
			nl_table[sk->sk_protocol].bind = NULL;
			nl_table[sk->sk_protocol].unbind = NULL;
			nl_table[sk->sk_protocol].flags = 0;
			nl_table[sk->sk_protocol].registered = 0;
		}
		netlink_table_ungrab();
	}

	kfree(nlk->groups);
	nlk->groups = NULL;

	local_bh_disable();
	sock_prot_inuse_add(sock_net(sk), &netlink_proto, -1);
	local_bh_enable();
	call_rcu(&nlk->rcu, deferred_put_nlk_sk);
	return 0;
}

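/* Autobind picks a free portid: the caller's tgid is tried first; on
 * collision the search continues through pseudo-random negative values
 * that ordinary processes are unlikely to request explicitly.
 */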
static int netlink_autobind(struct socket *sock)
{
	struct sock *sk = sock->sk;
	struct net *net = sock_net(sk);
	struct netlink_table *table = &nl_table[sk->sk_protocol];
	s32 portid = task_tgid_vnr(current);
	int err;
	s32 rover = -4096;
	bool ok;

retry:
	cond_resched();
	rcu_read_lock();
	ok = !__netlink_lookup(table, portid, net);
	rcu_read_unlock();
	if (!ok) {
		/* Bind collision, search negative portid values. */
		if (rover == -4096)
			/* rover will be in range [S32_MIN, -4097] */
			rover = S32_MIN + prandom_u32_max(-4096 - S32_MIN);
		else if (rover >= -4096)
			rover = -4097;
		portid = rover--;
		goto retry;
	}

	err = netlink_insert(sk, portid);
	if (err == -EADDRINUSE)
		goto retry;

	/* If 2 threads race to autobind, that is fine.  */
	if (err == -EBUSY)
		err = 0;

	return err;
}

/**
 * __netlink_ns_capable - General netlink message capability test
 * @nsp: NETLINK_CB of the socket buffer holding a netlink command from userspace.
 * @user_ns: The user namespace of the capability to use
 * @cap: The capability to use
 *
 * Test to see if the opener of the socket we received the message
 * from had, when the netlink socket was created, and the sender of the
 * message has, the capability @cap in the user namespace @user_ns.
 */
bool __netlink_ns_capable(const struct netlink_skb_parms *nsp,
			struct user_namespace *user_ns, int cap)
{
	return ((nsp->flags & NETLINK_SKB_DST) ||
		file_ns_capable(nsp->sk->sk_socket->file, user_ns, cap)) &&
		ns_capable(user_ns, cap);
}
EXPORT_SYMBOL(__netlink_ns_capable);

/**
 * netlink_ns_capable - General netlink message capability test
 * @skb: socket buffer holding a netlink command from userspace
 * @user_ns: The user namespace of the capability to use
 * @cap: The capability to use
 *
 * Test to see if the opener of the socket we received the message
 * from had, when the netlink socket was created, and the sender of the
 * message has, the capability @cap in the user namespace @user_ns.
 */
bool netlink_ns_capable(const struct sk_buff *skb,
			struct user_namespace *user_ns, int cap)
{
	return __netlink_ns_capable(&NETLINK_CB(skb), user_ns, cap);
}
EXPORT_SYMBOL(netlink_ns_capable);

/**
 * netlink_capable - Netlink global message capability test
 * @skb: socket buffer holding a netlink command from userspace
 * @cap: The capability to use
 *
 * Test to see if the opener of the socket we received the message
 * from had, when the netlink socket was created, and the sender of the
 * message has, the capability @cap in all user namespaces.
 */
bool netlink_capable(const struct sk_buff *skb, int cap)
{
	return netlink_ns_capable(skb, &init_user_ns, cap);
}
EXPORT_SYMBOL(netlink_capable);

/**
 * netlink_net_capable - Netlink network namespace message capability test
 * @skb: socket buffer holding a netlink command from userspace
 * @cap: The capability to use
 *
 * Test to see if the opener of the socket we received the message
 * from had, when the netlink socket was created, and the sender of the
 * message has, the capability @cap over the network namespace of
 * the socket we received the message from.
 */
bool netlink_net_capable(const struct sk_buff *skb, int cap)
{
	return netlink_ns_capable(skb, sock_net(skb->sk)->user_ns, cap);
}
EXPORT_SYMBOL(netlink_net_capable);

static inline int netlink_allowed(const struct socket *sock, unsigned int flag)
{
	return (nl_table[sock->sk->sk_protocol].flags & flag) ||
		ns_capable(sock_net(sock->sk)->user_ns, CAP_NET_ADMIN);
}

static void
netlink_update_subscriptions(struct sock *sk, unsigned int subscriptions)
{
	struct netlink_sock *nlk = nlk_sk(sk);

	if (nlk->subscriptions && !subscriptions)
		__sk_del_bind_node(sk);
	else if (!nlk->subscriptions && subscriptions)
		sk_add_bind_node(sk, &nl_table[sk->sk_protocol].mc_list);
	nlk->subscriptions = subscriptions;
}

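/* Grow the per-socket group bitmap to cover all multicast groups
 * currently registered for this protocol.
 */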
static int netlink_realloc_groups(struct sock *sk)
{
	struct netlink_sock *nlk = nlk_sk(sk);
	unsigned int groups;
	unsigned long *new_groups;
	int err = 0;

	netlink_table_grab();

	groups = nl_table[sk->sk_protocol].groups;
	if (!nl_table[sk->sk_protocol].registered) {
		err = -ENOENT;
		goto out_unlock;
	}

	if (nlk->ngroups >= groups)
		goto out_unlock;

	new_groups = krealloc(nlk->groups, NLGRPSZ(groups), GFP_ATOMIC);
	if (new_groups == NULL) {
		err = -ENOMEM;
		goto out_unlock;
	}
	memset((char *)new_groups + NLGRPSZ(nlk->ngroups), 0,
	       NLGRPSZ(groups) - NLGRPSZ(nlk->ngroups));

	nlk->groups = new_groups;
	nlk->ngroups = groups;
 out_unlock:
	netlink_table_ungrab();
	return err;
}

static void netlink_undo_bind(int group, long unsigned int groups,
			      struct sock *sk)
{
	struct netlink_sock *nlk = nlk_sk(sk);
	int undo;

	if (!nlk->netlink_unbind)
		return;

	for (undo = 0; undo < group; undo++)
		if (test_bit(undo, &groups))
			nlk->netlink_unbind(sock_net(sk), undo + 1);
}

static int netlink_bind(struct socket *sock, struct sockaddr *addr,
			int addr_len)
{
	struct sock *sk = sock->sk;
	struct net *net = sock_net(sk);
	struct netlink_sock *nlk = nlk_sk(sk);
	struct sockaddr_nl *nladdr = (struct sockaddr_nl *)addr;
	int err;
	long unsigned int groups = nladdr->nl_groups;
	bool bound;

	if (addr_len < sizeof(struct sockaddr_nl))
		return -EINVAL;

	if (nladdr->nl_family != AF_NETLINK)
		return -EINVAL;

	/* Only superuser is allowed to listen multicasts */
	if (groups) {
		if (!netlink_allowed(sock, NL_CFG_F_NONROOT_RECV))
			return -EPERM;
		err = netlink_realloc_groups(sk);
		if (err)
			return err;
	}

	bound = nlk->bound;
	if (bound) {
		/* Ensure nlk->portid is up-to-date. */
		smp_rmb();

		if (nladdr->nl_pid != nlk->portid)
			return -EINVAL;
	}

	if (nlk->netlink_bind && groups) {
		int group;

		for (group = 0; group < nlk->ngroups; group++) {
			if (!test_bit(group, &groups))
				continue;
			err = nlk->netlink_bind(net, group + 1);
			if (!err)
				continue;
			netlink_undo_bind(group, groups, sk);
			return err;
		}
	}

	/* No need for barriers here as we return to user-space without
	 * using any of the bound attributes.
	 */
	if (!bound) {
		err = nladdr->nl_pid ?
			netlink_insert(sk, nladdr->nl_pid) :
			netlink_autobind(sock);
		if (err) {
			netlink_undo_bind(nlk->ngroups, groups, sk);
			return err;
		}
	}

	if (!groups && (nlk->groups == NULL || !(u32)nlk->groups[0]))
		return 0;

	netlink_table_grab();
	netlink_update_subscriptions(sk, nlk->subscriptions +
					 hweight32(groups) -
					 hweight32(nlk->groups[0]));
	nlk->groups[0] = (nlk->groups[0] & ~0xffffffffUL) | groups;
	netlink_update_listeners(sk);
	netlink_table_ungrab();

	return 0;
}

static int netlink_connect(struct socket *sock, struct sockaddr *addr,
			   int alen, int flags)
{
	int err = 0;
	struct sock *sk = sock->sk;
	struct netlink_sock *nlk = nlk_sk(sk);
	struct sockaddr_nl *nladdr = (struct sockaddr_nl *)addr;

	if (alen < sizeof(addr->sa_family))
		return -EINVAL;

	if (addr->sa_family == AF_UNSPEC) {
		sk->sk_state	= NETLINK_UNCONNECTED;
		nlk->dst_portid	= 0;
		nlk->dst_group  = 0;
		return 0;
	}
	if (addr->sa_family != AF_NETLINK)
		return -EINVAL;

	if ((nladdr->nl_groups || nladdr->nl_pid) &&
	    !netlink_allowed(sock, NL_CFG_F_NONROOT_SEND))
		return -EPERM;

	/* No need for barriers here as we return to user-space without
	 * using any of the bound attributes.
	 */
	if (!nlk->bound)
		err = netlink_autobind(sock);

	if (err == 0) {
		sk->sk_state	= NETLINK_CONNECTED;
		nlk->dst_portid = nladdr->nl_pid;
		nlk->dst_group  = ffs(nladdr->nl_groups);
	}

	return err;
}

static int netlink_getname(struct socket *sock, struct sockaddr *addr,
			   int *addr_len, int peer)
{
	struct sock *sk = sock->sk;
	struct netlink_sock *nlk = nlk_sk(sk);
	DECLARE_SOCKADDR(struct sockaddr_nl *, nladdr, addr);

	nladdr->nl_family = AF_NETLINK;
	nladdr->nl_pad = 0;
	*addr_len = sizeof(*nladdr);

	if (peer) {
		nladdr->nl_pid = nlk->dst_portid;
		nladdr->nl_groups = netlink_group_mask(nlk->dst_group);
	} else {
		nladdr->nl_pid = nlk->portid;
		nladdr->nl_groups = nlk->groups ? nlk->groups[0] : 0;
	}
	return 0;
}

static struct sock *netlink_getsockbyportid(struct sock *ssk, u32 portid)
{
	struct sock *sock;
	struct netlink_sock *nlk;

	sock = netlink_lookup(sock_net(ssk), ssk->sk_protocol, portid);
	if (!sock)
		return ERR_PTR(-ECONNREFUSED);

	/* Don't bother queuing skb if kernel socket has no input function */
	nlk = nlk_sk(sock);
	if (sock->sk_state == NETLINK_CONNECTED &&
	    nlk->dst_portid != nlk_sk(ssk)->portid) {
		sock_put(sock);
		return ERR_PTR(-ECONNREFUSED);
	}
	return sock;
}

struct sock *netlink_getsockbyfilp(struct file *filp)
{
	struct inode *inode = file_inode(filp);
	struct sock *sock;

	if (!S_ISSOCK(inode->i_mode))
		return ERR_PTR(-ENOTSOCK);

	sock = SOCKET_I(inode)->sk;
	if (sock->sk_family != AF_NETLINK)
		return ERR_PTR(-EINVAL);

	sock_hold(sock);
	return sock;
}

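/* Messages larger than NLMSG_GOODSIZE get a vmalloc()ed data area so that
 * a single large unicast message does not require a high-order page
 * allocation; broadcast skbs always take the plain alloc_skb() path.
 */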
static struct sk_buff *netlink_alloc_large_skb(unsigned int size,
					       int broadcast)
{
	struct sk_buff *skb;
	void *data;

	if (size <= NLMSG_GOODSIZE || broadcast)
		return alloc_skb(size, GFP_KERNEL);

	size = SKB_DATA_ALIGN(size) +
	       SKB_DATA_ALIGN(sizeof(struct skb_shared_info));

	data = vmalloc(size);
	if (data == NULL)
		return NULL;

	skb = __build_skb(data, size);
	if (skb == NULL)
		vfree(data);
	else
		skb->destructor = netlink_skb_destructor;

	return skb;
}

/*
 * Attach a skb to a netlink socket.
 * The caller must hold a reference to the destination socket. On error, the
 * reference is dropped. The skb is not sent to the destination, just all
 * error checks are performed and memory in the queue is reserved.
 * Return values:
 * < 0: error. skb freed, reference to sock dropped.
 * 0: continue
 * 1: repeat lookup - reference dropped while waiting for socket memory.
 */
int netlink_attachskb(struct sock *sk, struct sk_buff *skb,
		      long *timeo, struct sock *ssk)
{
	struct netlink_sock *nlk;

	nlk = nlk_sk(sk);

	if ((atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf ||
	     test_bit(NETLINK_S_CONGESTED, &nlk->state)) &&
	    !netlink_skb_is_mmaped(skb)) {
		DECLARE_WAITQUEUE(wait, current);
		if (!*timeo) {
			if (!ssk || netlink_is_kernel(ssk))
				netlink_overrun(sk);
			sock_put(sk);
			kfree_skb(skb);
			return -EAGAIN;
		}

		__set_current_state(TASK_INTERRUPTIBLE);
		add_wait_queue(&nlk->wait, &wait);

		if ((atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf ||
		     test_bit(NETLINK_S_CONGESTED, &nlk->state)) &&
		    !sock_flag(sk, SOCK_DEAD))
			*timeo = schedule_timeout(*timeo);

		__set_current_state(TASK_RUNNING);
		remove_wait_queue(&nlk->wait, &wait);
		sock_put(sk);

		if (signal_pending(current)) {
			kfree_skb(skb);
			return sock_intr_errno(*timeo);
		}
		return 1;
	}
	netlink_skb_set_owner_r(skb, sk);
	return 0;
}

static int __netlink_sendskb(struct sock *sk, struct sk_buff *skb)
{
	int len = skb->len;

	netlink_deliver_tap(skb);

#ifdef CONFIG_NETLINK_MMAP
	if (netlink_skb_is_mmaped(skb))
		netlink_queue_mmaped_skb(sk, skb);
	else if (netlink_rx_is_mmaped(sk))
		netlink_ring_set_copied(sk, skb);
	else
#endif /* CONFIG_NETLINK_MMAP */
		skb_queue_tail(&sk->sk_receive_queue, skb);
	sk->sk_data_ready(sk);
	return len;
}

int netlink_sendskb(struct sock *sk, struct sk_buff *skb)
{
	int len = __netlink_sendskb(sk, skb);

	sock_put(sk);
	return len;
}

void netlink_detachskb(struct sock *sk, struct sk_buff *skb)
{
	kfree_skb(skb);
	sock_put(sk);
}

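/* Trim an skb whose unused tailroom dominates its true size: shared skbs
 * are cloned first, then pskb_expand_head() releases the spare room.
 */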
static struct sk_buff *netlink_trim(struct sk_buff *skb, gfp_t allocation)
{
	int delta;

	WARN_ON(skb->sk != NULL);
	if (netlink_skb_is_mmaped(skb))
		return skb;

	delta = skb->end - skb->tail;
	if (is_vmalloc_addr(skb->head) || delta * 2 < skb->truesize)
		return skb;

	if (skb_shared(skb)) {
		struct sk_buff *nskb = skb_clone(skb, allocation);
		if (!nskb)
			return skb;
		consume_skb(skb);
		skb = nskb;
	}

	if (!pskb_expand_head(skb, 0, -delta, allocation))
		skb->truesize -= delta;

	return skb;
}

static int netlink_unicast_kernel(struct sock *sk, struct sk_buff *skb,
				  struct sock *ssk)
{
	int ret;
	struct netlink_sock *nlk = nlk_sk(sk);

	ret = -ECONNREFUSED;
	if (nlk->netlink_rcv != NULL) {
		ret = skb->len;
		netlink_skb_set_owner_r(skb, sk);
		NETLINK_CB(skb).sk = ssk;
		netlink_deliver_tap_kernel(sk, ssk, skb);
		nlk->netlink_rcv(skb);
		consume_skb(skb);
	} else {
		kfree_skb(skb);
	}
	sock_put(sk);
	return ret;
}

int netlink_unicast(struct sock *ssk, struct sk_buff *skb,
		    u32 portid, int nonblock)
{
	struct sock *sk;
	int err;
	long timeo;

	skb = netlink_trim(skb, gfp_any());

	timeo = sock_sndtimeo(ssk, nonblock);
retry:
	sk = netlink_getsockbyportid(ssk, portid);
	if (IS_ERR(sk)) {
		kfree_skb(skb);
		return PTR_ERR(sk);
	}
	if (netlink_is_kernel(sk))
		return netlink_unicast_kernel(sk, skb, ssk);

	if (sk_filter(sk, skb)) {
		err = skb->len;
		kfree_skb(skb);
		sock_put(sk);
		return err;
	}

	err = netlink_attachskb(sk, skb, &timeo, ssk);
	if (err == 1)
		goto retry;
	if (err)
		return err;

	return netlink_sendskb(sk, skb);
}
EXPORT_SYMBOL(netlink_unicast);

struct sk_buff *__netlink_alloc_skb(struct sock *ssk, unsigned int size,
				    unsigned int ldiff, u32 dst_portid,
				    gfp_t gfp_mask)
1882 1883
{
#ifdef CONFIG_NETLINK_MMAP
1884
	unsigned int maxlen, linear_size;
1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898
	struct sock *sk = NULL;
	struct sk_buff *skb;
	struct netlink_ring *ring;
	struct nl_mmap_hdr *hdr;

	sk = netlink_getsockbyportid(ssk, dst_portid);
	if (IS_ERR(sk))
		goto out;

	ring = &nlk_sk(sk)->rx_ring;
	/* fast-path without atomic ops for common case: non-mmaped receiver */
	if (ring->pg_vec == NULL)
		goto out_put;

1899 1900 1901 1902 1903
	/* We need to account the full linear size needed as a ring
	 * slot cannot have non-linear parts.
	 */
	linear_size = size + ldiff;
	if (ring->frame_size - NL_MMAP_HDRLEN < linear_size)
1904 1905
		goto out_put;

1906 1907 1908 1909 1910 1911 1912 1913 1914
	skb = alloc_skb_head(gfp_mask);
	if (skb == NULL)
		goto err1;

	spin_lock_bh(&sk->sk_receive_queue.lock);
	/* check again under lock */
	if (ring->pg_vec == NULL)
		goto out_free;

1915
	/* check again under lock */
1916
	maxlen = ring->frame_size - NL_MMAP_HDRLEN;
1917
	if (maxlen < linear_size)
1918 1919 1920 1921 1922 1923
		goto out_free;

	netlink_forward_ring(ring);
	hdr = netlink_current_frame(ring, NL_MMAP_STATUS_UNUSED);
	if (hdr == NULL)
		goto err2;
1924

1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935
	netlink_ring_setup_skb(skb, sk, ring, hdr);
	netlink_set_status(hdr, NL_MMAP_STATUS_RESERVED);
	atomic_inc(&ring->pending);
	netlink_increment_head(ring);

	spin_unlock_bh(&sk->sk_receive_queue.lock);
	return skb;

err2:
	kfree_skb(skb);
	spin_unlock_bh(&sk->sk_receive_queue.lock);
1936
	netlink_overrun(sk);
1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949
err1:
	sock_put(sk);
	return NULL;

out_free:
	kfree_skb(skb);
	spin_unlock_bh(&sk->sk_receive_queue.lock);
out_put:
	sock_put(sk);
out:
#endif
	return alloc_skb(size, gfp_mask);
}
1950
EXPORT_SYMBOL_GPL(__netlink_alloc_skb);
1951

int netlink_has_listeners(struct sock *sk, unsigned int group)
{
	int res = 0;
	struct listeners *listeners;

	BUG_ON(!netlink_is_kernel(sk));

	rcu_read_lock();
	listeners = rcu_dereference(nl_table[sk->sk_protocol].listeners);

	if (listeners && group - 1 < nl_table[sk->sk_protocol].groups)
		res = test_bit(group - 1, listeners->masks);

	rcu_read_unlock();

	return res;
}
EXPORT_SYMBOL_GPL(netlink_has_listeners);

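/* Deliver one copy of a broadcast. Returns -1 if the receiver has no
 * room (or is marked congested); otherwise 0/1, where 1 means the
 * receive queue is more than half full and the sender should back off.
 */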
static int netlink_broadcast_deliver(struct sock *sk, struct sk_buff *skb)
{
	struct netlink_sock *nlk = nlk_sk(sk);

	if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf &&
	    !test_bit(NETLINK_S_CONGESTED, &nlk->state)) {
		netlink_skb_set_owner_r(skb, sk);
		__netlink_sendskb(sk, skb);
		return atomic_read(&sk->sk_rmem_alloc) > (sk->sk_rcvbuf >> 1);
	}
	return -1;
}

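/* State shared by do_one_broadcast() while walking the mc_list of a
 * protocol; skb2 holds the per-receiver clone of the original skb.
 */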
struct netlink_broadcast_data {
	struct sock *exclude_sk;
	struct net *net;
	u32 portid;
	u32 group;
	int failure;
	int delivery_failure;
	int congested;
	int delivered;
	gfp_t allocation;
	struct sk_buff *skb, *skb2;
	int (*tx_filter)(struct sock *dsk, struct sk_buff *skb, void *data);
	void *tx_data;
};

static void do_one_broadcast(struct sock *sk,
			     struct netlink_broadcast_data *p)
{
	struct netlink_sock *nlk = nlk_sk(sk);
	int val;

	if (p->exclude_sk == sk)
		return;

	if (nlk->portid == p->portid || p->group - 1 >= nlk->ngroups ||
	    !test_bit(p->group - 1, nlk->groups))
		return;

	if (!net_eq(sock_net(sk), p->net)) {
		if (!(nlk->flags & NETLINK_F_LISTEN_ALL_NSID))
			return;

		if (!peernet_has_id(sock_net(sk), p->net))
			return;

		if (!file_ns_capable(sk->sk_socket->file, p->net->user_ns,
				     CAP_NET_BROADCAST))
			return;
	}

	if (p->failure) {
		netlink_overrun(sk);
		return;
	}

	sock_hold(sk);
	if (p->skb2 == NULL) {
		if (skb_shared(p->skb)) {
			p->skb2 = skb_clone(p->skb, p->allocation);
		} else {
			p->skb2 = skb_get(p->skb);
			/*
			 * skb ownership may have been set when
			 * delivered to a previous socket.
			 */
			skb_orphan(p->skb2);
		}
	}
	if (p->skb2 == NULL) {
		netlink_overrun(sk);
		/* Clone failed. Notify ALL listeners. */
		p->failure = 1;
		if (nlk->flags & NETLINK_F_BROADCAST_SEND_ERROR)
			p->delivery_failure = 1;
		goto out;
	}
	if (p->tx_filter && p->tx_filter(sk, p->skb2, p->tx_data)) {
		kfree_skb(p->skb2);
		p->skb2 = NULL;
		goto out;
	}
	if (sk_filter(sk, p->skb2)) {
		kfree_skb(p->skb2);
		p->skb2 = NULL;
		goto out;
	}
	NETLINK_CB(p->skb2).nsid = peernet2id(sock_net(sk), p->net);
	NETLINK_CB(p->skb2).nsid_is_set = true;
	val = netlink_broadcast_deliver(sk, p->skb2);
	if (val < 0) {
		netlink_overrun(sk);
		if (nlk->flags & NETLINK_F_BROADCAST_SEND_ERROR)
			p->delivery_failure = 1;
	} else {
		p->congested |= val;
		p->delivered = 1;
		p->skb2 = NULL;
	}
out:
	sock_put(sk);
}

int netlink_broadcast_filtered(struct sock *ssk, struct sk_buff *skb, u32 portid,
	u32 group, gfp_t allocation,
	int (*filter)(struct sock *dsk, struct sk_buff *skb, void *data),
	void *filter_data)
{
	struct net *net = sock_net(ssk);
	struct netlink_broadcast_data info;
	struct sock *sk;

	skb = netlink_trim(skb, allocation);

	info.exclude_sk = ssk;
	info.net = net;
	info.portid = portid;
	info.group = group;
	info.failure = 0;
	info.delivery_failure = 0;
	info.congested = 0;
	info.delivered = 0;
	info.allocation = allocation;
	info.skb = skb;
	info.skb2 = NULL;
	info.tx_filter = filter;
	info.tx_data = filter_data;

	/* While we sleep in clone, do not allow the socket list to change */

	netlink_lock_table();

	sk_for_each_bound(sk, &nl_table[ssk->sk_protocol].mc_list)
		do_one_broadcast(sk, &info);

	consume_skb(skb);

	netlink_unlock_table();

	if (info.delivery_failure) {
		kfree_skb(info.skb2);
		return -ENOBUFS;
	}
	consume_skb(info.skb2);

	if (info.delivered) {
		if (info.congested && gfpflags_allow_blocking(allocation))
			yield();
		return 0;
	}
	return -ESRCH;
}
EXPORT_SYMBOL(netlink_broadcast_filtered);

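/* netlink_broadcast() is the common unfiltered case. A minimal sketch of
 * kernel use (my_nlsk, MY_GRP and my_build_msg() are hypothetical):
 *
 *	struct sk_buff *skb = my_build_msg();
 *	if (skb)
 *		netlink_broadcast(my_nlsk, skb, 0, MY_GRP, GFP_KERNEL);
 */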
int netlink_broadcast(struct sock *ssk, struct sk_buff *skb, u32 portid,
		      u32 group, gfp_t allocation)
{
	return netlink_broadcast_filtered(ssk, skb, portid, group, allocation,
		NULL, NULL);
}
EXPORT_SYMBOL(netlink_broadcast);

struct netlink_set_err_data {
	struct sock *exclude_sk;
	u32 portid;
	u32 group;
	int code;
};

static int do_one_set_err(struct sock *sk, struct netlink_set_err_data *p)
{
	struct netlink_sock *nlk = nlk_sk(sk);
	int ret = 0;

	if (sk == p->exclude_sk)
		goto out;

	if (!net_eq(sock_net(sk), sock_net(p->exclude_sk)))
		goto out;

	if (nlk->portid == p->portid || p->group - 1 >= nlk->ngroups ||
	    !test_bit(p->group - 1, nlk->groups))
		goto out;

	if (p->code == ENOBUFS && nlk->flags & NETLINK_F_RECV_NO_ENOBUFS) {
		ret = 1;
		goto out;
	}

	sk->sk_err = p->code;
	sk->sk_error_report(sk);
out:
	return ret;
}

/**
 * netlink_set_err - report error to broadcast listeners
 * @ssk: the kernel netlink socket, as returned by netlink_kernel_create()
 * @portid: the PORTID of a process that we want to skip (if any)
 * @group: the broadcast group that will notice the error
 * @code: error code, must be negative (as usual in kernelspace)
 *
 * This function returns the number of broadcast listeners that have set the
 * NETLINK_NO_ENOBUFS socket option.
 */
int netlink_set_err(struct sock *ssk, u32 portid, u32 group, int code)
{
	struct netlink_set_err_data info;
	struct sock *sk;
	int ret = 0;

	info.exclude_sk = ssk;
	info.portid = portid;
	info.group = group;
	/* sk->sk_err wants a positive error value */
	info.code = -code;

	read_lock(&nl_table_lock);

	sk_for_each_bound(sk, &nl_table[ssk->sk_protocol].mc_list)
		ret += do_one_set_err(sk, &info);

	read_unlock(&nl_table_lock);
	return ret;
}
EXPORT_SYMBOL(netlink_set_err);

/* must be called with netlink table grabbed */
static void netlink_update_socket_mc(struct netlink_sock *nlk,
				     unsigned int group,
				     int is_new)
{
	int old, new = !!is_new, subscriptions;

	old = test_bit(group - 1, nlk->groups);
	subscriptions = nlk->subscriptions - old + new;
	if (new)
		__set_bit(group - 1, nlk->groups);
	else
		__clear_bit(group - 1, nlk->groups);
	netlink_update_subscriptions(&nlk->sk, subscriptions);
	netlink_update_listeners(&nlk->sk);
}

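/* SOL_NETLINK socket options. Note that NETLINK_RX_RING/NETLINK_TX_RING
 * take a struct nl_mmap_req rather than the int used by all other
 * options, which is why the get_user() before the switch skips them.
 */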
static int netlink_setsockopt(struct socket *sock, int level, int optname,
			      char __user *optval, unsigned int optlen)
{
	struct sock *sk = sock->sk;
	struct netlink_sock *nlk = nlk_sk(sk);
	unsigned int val = 0;
	int err;

	if (level != SOL_NETLINK)
		return -ENOPROTOOPT;

	if (optname != NETLINK_RX_RING && optname != NETLINK_TX_RING &&
	    optlen >= sizeof(int) &&
	    get_user(val, (unsigned int __user *)optval))
		return -EFAULT;

	switch (optname) {
	case NETLINK_PKTINFO:
		if (val)
			nlk->flags |= NETLINK_F_RECV_PKTINFO;
		else
			nlk->flags &= ~NETLINK_F_RECV_PKTINFO;
		err = 0;
		break;
	case NETLINK_ADD_MEMBERSHIP:
	case NETLINK_DROP_MEMBERSHIP: {
		if (!netlink_allowed(sock, NL_CFG_F_NONROOT_RECV))
			return -EPERM;
		err = netlink_realloc_groups(sk);
		if (err)
			return err;
		if (!val || val - 1 >= nlk->ngroups)
			return -EINVAL;
		if (optname == NETLINK_ADD_MEMBERSHIP && nlk->netlink_bind) {
			err = nlk->netlink_bind(sock_net(sk), val);
			if (err)
				return err;
		}
		netlink_table_grab();
		netlink_update_socket_mc(nlk, val,
					 optname == NETLINK_ADD_MEMBERSHIP);
		netlink_table_ungrab();
		if (optname == NETLINK_DROP_MEMBERSHIP && nlk->netlink_unbind)
			nlk->netlink_unbind(sock_net(sk), val);

		err = 0;
		break;
	}
	case NETLINK_BROADCAST_ERROR:
		if (val)
			nlk->flags |= NETLINK_F_BROADCAST_SEND_ERROR;
		else
			nlk->flags &= ~NETLINK_F_BROADCAST_SEND_ERROR;
		err = 0;
		break;
	case NETLINK_NO_ENOBUFS:
		if (val) {
			nlk->flags |= NETLINK_F_RECV_NO_ENOBUFS;
			clear_bit(NETLINK_S_CONGESTED, &nlk->state);
			wake_up_interruptible(&nlk->wait);
		} else {
			nlk->flags &= ~NETLINK_F_RECV_NO_ENOBUFS;
		}
		err = 0;
		break;
#ifdef CONFIG_NETLINK_MMAP
	case NETLINK_RX_RING:
	case NETLINK_TX_RING: {
		struct nl_mmap_req req;

		/* Rings might consume more memory than queue limits, require
		 * CAP_NET_ADMIN.
		 */
		if (!capable(CAP_NET_ADMIN))
			return -EPERM;
		if (optlen < sizeof(req))
			return -EINVAL;
		if (copy_from_user(&req, optval, sizeof(req)))
			return -EFAULT;
		err = netlink_set_ring(sk, &req,
				       optname == NETLINK_TX_RING);
		break;
	}
#endif /* CONFIG_NETLINK_MMAP */
	case NETLINK_LISTEN_ALL_NSID:
		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_BROADCAST))
			return -EPERM;

		if (val)
			nlk->flags |= NETLINK_F_LISTEN_ALL_NSID;
		else
			nlk->flags &= ~NETLINK_F_LISTEN_ALL_NSID;
		err = 0;
		break;
	case NETLINK_CAP_ACK:
		if (val)
			nlk->flags |= NETLINK_F_CAP_ACK;
		else
			nlk->flags &= ~NETLINK_F_CAP_ACK;
		err = 0;
		break;
	default:
		err = -ENOPROTOOPT;
	}
	return err;
}

static int netlink_getsockopt(struct socket *sock, int level, int optname,
			      char __user *optval, int __user *optlen)
{
	struct sock *sk = sock->sk;
	struct netlink_sock *nlk = nlk_sk(sk);
	int len, val, err;

	if (level != SOL_NETLINK)
		return -ENOPROTOOPT;

	if (get_user(len, optlen))
		return -EFAULT;
	if (len < 0)
		return -EINVAL;

	switch (optname) {
	case NETLINK_PKTINFO:
		if (len < sizeof(int))
			return -EINVAL;
		len = sizeof(int);
		val = nlk->flags & NETLINK_F_RECV_PKTINFO ? 1 : 0;
		if (put_user(len, optlen) ||
		    put_user(val, optval))
			return -EFAULT;
		err = 0;
		break;
	case NETLINK_BROADCAST_ERROR:
		if (len < sizeof(int))
			return -EINVAL;
		len = sizeof(int);
		val = nlk->flags & NETLINK_F_BROADCAST_SEND_ERROR ? 1 : 0;
		if (put_user(len, optlen) ||
		    put_user(val, optval))
			return -EFAULT;
		err = 0;
		break;
	case NETLINK_NO_ENOBUFS:
		if (len < sizeof(int))
			return -EINVAL;
		len = sizeof(int);
		val = nlk->flags & NETLINK_F_RECV_NO_ENOBUFS ? 1 : 0;
		if (put_user(len, optlen) ||
		    put_user(val, optval))
			return -EFAULT;
		err = 0;
		break;
	case NETLINK_LIST_MEMBERSHIPS: {
		int pos, idx, shift;

		err = 0;
		netlink_lock_table();
		for (pos = 0; pos * 8 < nlk->ngroups; pos += sizeof(u32)) {
			if (len - pos < sizeof(u32))
				break;

			idx = pos / sizeof(unsigned long);
			shift = (pos % sizeof(unsigned long)) * 8;
			if (put_user((u32)(nlk->groups[idx] >> shift),
				     (u32 __user *)(optval + pos))) {
				err = -EFAULT;
				break;
			}
		}
		if (put_user(ALIGN(nlk->ngroups / 8, sizeof(u32)), optlen))
			err = -EFAULT;
		netlink_unlock_table();
		break;
	}
	case NETLINK_CAP_ACK:
		if (len < sizeof(int))
			return -EINVAL;
		len = sizeof(int);
		val = nlk->flags & NETLINK_F_CAP_ACK ? 1 : 0;
		if (put_user(len, optlen) ||
		    put_user(val, optval))
			return -EFAULT;
		err = 0;
		break;
	default:
		err = -ENOPROTOOPT;
	}
	return err;
}

static void netlink_cmsg_recv_pktinfo(struct msghdr *msg, struct sk_buff *skb)
{
	struct nl_pktinfo info;

	info.group = NETLINK_CB(skb).dst_group;
	put_cmsg(msg, SOL_NETLINK, NETLINK_PKTINFO, sizeof(info), &info);
}

static void netlink_cmsg_listen_all_nsid(struct sock *sk, struct msghdr *msg,
					 struct sk_buff *skb)
{
	if (!NETLINK_CB(skb).nsid_is_set)
		return;

	put_cmsg(msg, SOL_NETLINK, NETLINK_LISTEN_ALL_NSID, sizeof(int),
		 &NETLINK_CB(skb).nsid);
}

static int netlink_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
{
	struct sock *sk = sock->sk;
	struct netlink_sock *nlk = nlk_sk(sk);
	DECLARE_SOCKADDR(struct sockaddr_nl *, addr, msg->msg_name);
	u32 dst_portid;
	u32 dst_group;
	struct sk_buff *skb;
	int err;
	struct scm_cookie scm;
	u32 netlink_skb_flags = 0;

	if (msg->msg_flags&MSG_OOB)
		return -EOPNOTSUPP;

	err = scm_send(sock, msg, &scm, true);
	if (err < 0)
		return err;

	if (msg->msg_namelen) {
		err = -EINVAL;
		if (addr->nl_family != AF_NETLINK)
			goto out;
		dst_portid = addr->nl_pid;
		dst_group = ffs(addr->nl_groups);
		err = -EPERM;
		if ((dst_group || dst_portid) &&
		    !netlink_allowed(sock, NL_CFG_F_NONROOT_SEND))
			goto out;
		netlink_skb_flags |= NETLINK_SKB_DST;
	} else {
		dst_portid = nlk->dst_portid;
		dst_group = nlk->dst_group;
	}

	if (!nlk->bound) {
		err = netlink_autobind(sock);
		if (err)
			goto out;
	} else {
		/* Ensure nlk is hashed and visible. */
		smp_rmb();
	}

	/* It's a really convoluted way for userland to ask for mmaped
	 * sendmsg(), but that's what we've got...
	 */
	if (netlink_tx_is_mmaped(sk) &&
	    iter_is_iovec(&msg->msg_iter) &&
	    msg->msg_iter.nr_segs == 1 &&
	    msg->msg_iter.iov->iov_base == NULL) {
		err = netlink_mmap_sendmsg(sk, msg, dst_portid, dst_group,
					   &scm);
		goto out;
	}

	err = -EMSGSIZE;
	if (len > sk->sk_sndbuf - 32)
		goto out;
	err = -ENOBUFS;
	skb = netlink_alloc_large_skb(len, dst_group);
	if (skb == NULL)
		goto out;

	NETLINK_CB(skb).portid	= nlk->portid;
	NETLINK_CB(skb).dst_group = dst_group;
	NETLINK_CB(skb).creds	= scm.creds;
	NETLINK_CB(skb).flags	= netlink_skb_flags;

	err = -EFAULT;
	if (memcpy_from_msg(skb_put(skb, len), msg, len)) {
		kfree_skb(skb);
		goto out;
	}

	err = security_netlink_send(sk, skb);
	if (err) {
		kfree_skb(skb);
		goto out;
	}

	if (dst_group) {
		atomic_inc(&skb->users);
		netlink_broadcast(sk, skb, dst_portid, dst_group, GFP_KERNEL);
	}
	err = netlink_unicast(sk, skb, dst_portid, msg->msg_flags&MSG_DONTWAIT);

out:
	scm_destroy(&scm);
	return err;
}

static int netlink_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
			   int flags)
{
	struct scm_cookie scm;
	struct sock *sk = sock->sk;
	struct netlink_sock *nlk = nlk_sk(sk);
	int noblock = flags&MSG_DONTWAIT;
	size_t copied;
	struct sk_buff *skb, *data_skb;
	int err, ret;

	if (flags&MSG_OOB)
		return -EOPNOTSUPP;

	copied = 0;

	skb = skb_recv_datagram(sk, flags, noblock, &err);
	if (skb == NULL)
		goto out;

	data_skb = skb;

#ifdef CONFIG_COMPAT_NETLINK_MESSAGES
	if (unlikely(skb_shinfo(skb)->frag_list)) {
		/*
		 * If this skb has a frag_list, then here that means that we
		 * will have to use the frag_list skb's data for compat tasks
		 * and the regular skb's data for normal (non-compat) tasks.
		 *
		 * If we need to send the compat skb, assign it to the
		 * 'data_skb' variable so that it will be used below for data
		 * copying. We keep 'skb' for everything else, including
		 * freeing both later.
		 */
		if (flags & MSG_CMSG_COMPAT)
			data_skb = skb_shinfo(skb)->frag_list;
	}
#endif

	/* Record the max length of recvmsg() calls for future allocations */
	nlk->max_recvmsg_len = max(nlk->max_recvmsg_len, len);
	nlk->max_recvmsg_len = min_t(size_t, nlk->max_recvmsg_len,
				     16384);

	copied = data_skb->len;
	if (len < copied) {
		msg->msg_flags |= MSG_TRUNC;
		copied = len;
	}

	skb_reset_transport_header(data_skb);
	err = skb_copy_datagram_msg(data_skb, 0, msg, copied);

	if (msg->msg_name) {
		DECLARE_SOCKADDR(struct sockaddr_nl *, addr, msg->msg_name);
		addr->nl_family = AF_NETLINK;
		addr->nl_pad    = 0;
		addr->nl_pid	= NETLINK_CB(skb).portid;
		addr->nl_groups	= netlink_group_mask(NETLINK_CB(skb).dst_group);
		msg->msg_namelen = sizeof(*addr);
	}

	if (nlk->flags & NETLINK_F_RECV_PKTINFO)
		netlink_cmsg_recv_pktinfo(msg, skb);
	if (nlk->flags & NETLINK_F_LISTEN_ALL_NSID)
		netlink_cmsg_listen_all_nsid(sk, msg, skb);

	memset(&scm, 0, sizeof(scm));
	scm.creds = *NETLINK_CREDS(skb);
	if (flags & MSG_TRUNC)
		copied = data_skb->len;

	skb_free_datagram(sk, skb);

	if (nlk->cb_running &&
	    atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf / 2) {
		ret = netlink_dump(sk);
		if (ret) {
			sk->sk_err = -ret;
			sk->sk_error_report(sk);
		}
	}

	scm_recv(sock, msg, &scm, flags);
out:
	netlink_rcv_wake(sk);
	return err ? : copied;
}

static void netlink_data_ready(struct sock *sk)
{
	BUG();
}

/*
 *	We export these functions to other modules. They provide a
 *	complete set of kernel non-blocking support for message
 *	queueing.
 */

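/* Typical use is via the netlink_kernel_create() wrapper, which passes
 * THIS_MODULE. A minimal sketch (NETLINK_TEST and my_input() are
 * hypothetical):
 *
 *	struct netlink_kernel_cfg cfg = {
 *		.input = my_input,
 *	};
 *	struct sock *nlsk = netlink_kernel_create(&init_net, NETLINK_TEST, &cfg);
 *	if (!nlsk)
 *		return -ENOMEM;
 */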
struct sock *
__netlink_kernel_create(struct net *net, int unit, struct module *module,
			struct netlink_kernel_cfg *cfg)
{
	struct socket *sock;
	struct sock *sk;
	struct netlink_sock *nlk;
	struct listeners *listeners = NULL;
	struct mutex *cb_mutex = cfg ? cfg->cb_mutex : NULL;
	unsigned int groups;

	BUG_ON(!nl_table);

	if (unit < 0 || unit >= MAX_LINKS)
		return NULL;

	if (sock_create_lite(PF_NETLINK, SOCK_DGRAM, unit, &sock))
		return NULL;

	if (__netlink_create(net, sock, cb_mutex, unit, 1) < 0)
		goto out_sock_release_nosk;

	sk = sock->sk;

	if (!cfg || cfg->groups < 32)
		groups = 32;
	else
		groups = cfg->groups;

	listeners = kzalloc(sizeof(*listeners) + NLGRPSZ(groups), GFP_KERNEL);
	if (!listeners)
		goto out_sock_release;

	sk->sk_data_ready = netlink_data_ready;
	if (cfg && cfg->input)
		nlk_sk(sk)->netlink_rcv = cfg->input;

	if (netlink_insert(sk, 0))
		goto out_sock_release;

	nlk = nlk_sk(sk);
	nlk->flags |= NETLINK_F_KERNEL_SOCKET;

	netlink_table_grab();
	if (!nl_table[unit].registered) {
		nl_table[unit].groups = groups;
		rcu_assign_pointer(nl_table[unit].listeners, listeners);
		nl_table[unit].cb_mutex = cb_mutex;
		nl_table[unit].module = module;
		if (cfg) {
			nl_table[unit].bind = cfg->bind;
			nl_table[unit].unbind = cfg->unbind;
			nl_table[unit].flags = cfg->flags;
			if (cfg->compare)
				nl_table[unit].compare = cfg->compare;
		}
		nl_table[unit].registered = 1;
	} else {
		kfree(listeners);
		nl_table[unit].registered++;
	}
	netlink_table_ungrab();
	return sk;

out_sock_release:
	kfree(listeners);
	netlink_kernel_release(sk);
	return NULL;

out_sock_release_nosk:
	sock_release(sock);
	return NULL;
}
EXPORT_SYMBOL(__netlink_kernel_create);

void
netlink_kernel_release(struct sock *sk)
{
	if (sk == NULL || sk->sk_socket == NULL)
		return;

	sock_release(sk->sk_socket);
}
EXPORT_SYMBOL(netlink_kernel_release);

int __netlink_change_ngroups(struct sock *sk, unsigned int groups)
{
	struct listeners *new, *old;
	struct netlink_table *tbl = &nl_table[sk->sk_protocol];

	if (groups < 32)
		groups = 32;

	if (NLGRPSZ(tbl->groups) < NLGRPSZ(groups)) {
		new = kzalloc(sizeof(*new) + NLGRPSZ(groups), GFP_ATOMIC);
		if (!new)
			return -ENOMEM;
		old = nl_deref_protected(tbl->listeners);
		memcpy(new->masks, old->masks, NLGRPSZ(tbl->groups));
		rcu_assign_pointer(tbl->listeners, new);

		kfree_rcu(old, rcu);
	}
	tbl->groups = groups;

	return 0;
}

/**
 * netlink_change_ngroups - change number of multicast groups
 *
 * This changes the number of multicast groups that are available
 * on a certain netlink family. Note that it is not possible to
 * change the number of groups to below 32. Also note that it does
 * not implicitly call netlink_clear_multicast_users() when the
 * number of groups is reduced.
 *
 * @sk: The kernel netlink socket, as returned by netlink_kernel_create().
 * @groups: The new number of groups.
 */
int netlink_change_ngroups(struct sock *sk, unsigned int groups)
{
	int err;

	netlink_table_grab();
	err = __netlink_change_ngroups(sk, groups);
	netlink_table_ungrab();

	return err;
}

void __netlink_clear_multicast_users(struct sock *ksk, unsigned int group)
{
	struct sock *sk;
	struct netlink_table *tbl = &nl_table[ksk->sk_protocol];

	sk_for_each_bound(sk, &tbl->mc_list)
		netlink_update_socket_mc(nlk_sk(sk), group, 0);
}

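/* Reserve and fill a netlink message header in @skb. The payload is
 * zero-padded up to NLMSG_ALIGN(); callers normally use the nlmsg_put()
 * wrapper from net/netlink.h instead of calling this directly.
 */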
struct nlmsghdr *
__nlmsg_put(struct sk_buff *skb, u32 portid, u32 seq, int type, int len, int flags)
{
	struct nlmsghdr *nlh;
	int size = nlmsg_msg_size(len);

	nlh = (struct nlmsghdr *)skb_put(skb, NLMSG_ALIGN(size));
	nlh->nlmsg_type = type;
	nlh->nlmsg_len = size;
	nlh->nlmsg_flags = flags;
	nlh->nlmsg_pid = portid;
	nlh->nlmsg_seq = seq;
	if (!__builtin_constant_p(size) || NLMSG_ALIGN(size) - size != 0)
		memset(nlmsg_data(nlh) + len, 0, NLMSG_ALIGN(size) - size);
	return nlh;
}
EXPORT_SYMBOL(__nlmsg_put);

/*
 * It looks a bit ugly.
 * It would be better to create a kernel thread.
 */

static int netlink_dump(struct sock *sk)
{
	struct netlink_sock *nlk = nlk_sk(sk);
	struct netlink_callback *cb;
	struct sk_buff *skb = NULL;
	struct nlmsghdr *nlh;
	int len, err = -ENOBUFS;
	int alloc_min_size;
	int alloc_size;

	mutex_lock(nlk->cb_mutex);
	if (!nlk->cb_running) {
		err = -EINVAL;
		goto errout_skb;
	}

	if (!netlink_rx_is_mmaped(sk) &&
	    atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
		goto errout_skb;

	/* NLMSG_GOODSIZE is small to avoid high order allocations being
	 * required, but it makes sense to _attempt_ a 16K bytes allocation
	 * to reduce number of system calls on dump operations, if user
	 * ever provided a big enough buffer.
	 */
	cb = &nlk->cb;
	alloc_min_size = max_t(int, cb->min_dump_alloc, NLMSG_GOODSIZE);

	if (alloc_min_size < nlk->max_recvmsg_len) {
		alloc_size = nlk->max_recvmsg_len;
		skb = netlink_alloc_skb(sk, alloc_size, nlk->portid,
					GFP_KERNEL |
					__GFP_NOWARN |
					__GFP_NORETRY);
	}
	if (!skb) {
		alloc_size = alloc_min_size;
		skb = netlink_alloc_skb(sk, alloc_size, nlk->portid,
					GFP_KERNEL);
	}
	if (!skb)
		goto errout_skb;

	/* Trim skb to allocated size. User is expected to provide buffer as
	 * large as max(min_dump_alloc, 16KiB (max_recvmsg_len capped at
	 * netlink_recvmsg())). dump will pack as many smaller messages as
	 * could fit within the allocated skb. skb is typically allocated
	 * with larger space than required (could be as much as near 2x the
	 * requested size with align to next power of 2 approach). Allowing
	 * dump to use the excess space makes it difficult for a user to have a
	 * reasonable static buffer based on the expected largest dump of a
	 * single netdev. The outcome is MSG_TRUNC error.
	 */
	skb_reserve(skb, skb_tailroom(skb) - alloc_size);
	netlink_skb_set_owner_r(skb, sk);

	len = cb->dump(skb, cb);

	if (len > 0) {
		mutex_unlock(nlk->cb_mutex);

		if (sk_filter(sk, skb))
			kfree_skb(skb);
		else
			__netlink_sendskb(sk, skb);
		return 0;
	}

	nlh = nlmsg_put_answer(skb, cb, NLMSG_DONE, sizeof(len), NLM_F_MULTI);
	if (!nlh)
		goto errout_skb;

	nl_dump_check_consistent(cb, nlh);

	memcpy(nlmsg_data(nlh), &len, sizeof(len));

	if (sk_filter(sk, skb))
		kfree_skb(skb);
	else
		__netlink_sendskb(sk, skb);

	if (cb->done)
		cb->done(cb);

	nlk->cb_running = false;
	mutex_unlock(nlk->cb_mutex);
	module_put(cb->module);
	consume_skb(cb->skb);
	return 0;

errout_skb:
	mutex_unlock(nlk->cb_mutex);
	kfree_skb(skb);
	return err;
}

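/* Start a dump for the socket that sent the request. Usually invoked via
 * the netlink_dump_start() wrapper, e.g. (my_dump() is hypothetical):
 *
 *	struct netlink_dump_control c = {
 *		.dump = my_dump,
 *	};
 *	return netlink_dump_start(nlsk, skb, nlh, &c);
 *
 * On success -EINTR is returned so that no ACK is sent for the request.
 */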
int __netlink_dump_start(struct sock *ssk, struct sk_buff *skb,
			 const struct nlmsghdr *nlh,
			 struct netlink_dump_control *control)
{
	struct netlink_callback *cb;
	struct sock *sk;
	struct netlink_sock *nlk;
	int ret;

	/* Memory mapped dump requests need to be copied to avoid looping
	 * on the pending state in netlink_mmap_sendmsg() while the CB holds
	 * a reference to the skb.
	 */
	if (netlink_skb_is_mmaped(skb)) {
		skb = skb_copy(skb, GFP_KERNEL);
		if (skb == NULL)
			return -ENOBUFS;
	} else
		atomic_inc(&skb->users);

	sk = netlink_lookup(sock_net(ssk), ssk->sk_protocol, NETLINK_CB(skb).portid);
	if (sk == NULL) {
		ret = -ECONNREFUSED;
		goto error_free;
	}

	nlk = nlk_sk(sk);
	mutex_lock(nlk->cb_mutex);
	/* A dump is in progress... */
	if (nlk->cb_running) {
		ret = -EBUSY;
		goto error_unlock;
	}
	/* add reference of module which cb->dump belongs to */
	if (!try_module_get(control->module)) {
		ret = -EPROTONOSUPPORT;
		goto error_unlock;
	}

	cb = &nlk->cb;
	memset(cb, 0, sizeof(*cb));
	cb->dump = control->dump;
	cb->done = control->done;
	cb->nlh = nlh;
	cb->data = control->data;
	cb->module = control->module;
	cb->min_dump_alloc = control->min_dump_alloc;
	cb->skb = skb;

	nlk->cb_running = true;

	mutex_unlock(nlk->cb_mutex);

	ret = netlink_dump(sk);
	sock_put(sk);

	if (ret)
		return ret;

	/* We successfully started a dump, by returning -EINTR we
	 * signal not to send ACK even if it was requested.
	 */
	return -EINTR;

error_unlock:
	sock_put(sk);
	mutex_unlock(nlk->cb_mutex);
error_free:
	kfree_skb(skb);
	return ret;
}
EXPORT_SYMBOL(__netlink_dump_start);

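/* Send an NLMSG_ERROR reply for @nlh back to its sender; @err == 0
 * acknowledges success, a negative @err reports failure.
 */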
void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err)
{
	struct sk_buff *skb;
	struct nlmsghdr *rep;
	struct nlmsgerr *errmsg;
	size_t payload = sizeof(*errmsg);
	struct netlink_sock *nlk = nlk_sk(NETLINK_CB(in_skb).sk);

	/* Error messages get the original request appended, unless the user
	 * requests to cap the error message.
	 */
	if (!(nlk->flags & NETLINK_F_CAP_ACK) && err)
		payload += nlmsg_len(nlh);

	skb = netlink_alloc_skb(in_skb->sk, nlmsg_total_size(payload),
				NETLINK_CB(in_skb).portid, GFP_KERNEL);
	if (!skb) {
		struct sock *sk;

		sk = netlink_lookup(sock_net(in_skb->sk),
				    in_skb->sk->sk_protocol,
				    NETLINK_CB(in_skb).portid);
		if (sk) {
			sk->sk_err = ENOBUFS;
			sk->sk_error_report(sk);
			sock_put(sk);
		}
		return;
	}

	rep = __nlmsg_put(skb, NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
			  NLMSG_ERROR, payload, 0);
	errmsg = nlmsg_data(rep);
	errmsg->error = err;
	memcpy(&errmsg->msg, nlh, payload > sizeof(*errmsg) ? nlh->nlmsg_len : sizeof(*nlh));
	netlink_unicast(in_skb->sk, skb, NETLINK_CB(in_skb).portid, MSG_DONTWAIT);
}
EXPORT_SYMBOL(netlink_ack);

int netlink_rcv_skb(struct sk_buff *skb, int (*cb)(struct sk_buff *,
						     struct nlmsghdr *))
{
	struct nlmsghdr *nlh;
	int err;

	while (skb->len >= nlmsg_total_size(0)) {
		int msglen;

		nlh = nlmsg_hdr(skb);
		err = 0;

		if (nlh->nlmsg_len < NLMSG_HDRLEN || skb->len < nlh->nlmsg_len)
			return 0;

		/* Only requests are handled by the kernel */
		if (!(nlh->nlmsg_flags & NLM_F_REQUEST))
			goto ack;

		/* Skip control messages */
		if (nlh->nlmsg_type < NLMSG_MIN_TYPE)
			goto ack;

		err = cb(skb, nlh);
		if (err == -EINTR)
			goto skip;

ack:
		if (nlh->nlmsg_flags & NLM_F_ACK || err)
			netlink_ack(skb, nlh, err);

skip:
		msglen = NLMSG_ALIGN(nlh->nlmsg_len);
		if (msglen > skb->len)
			msglen = skb->len;
		skb_pull(skb, msglen);
	}

	return 0;
}
EXPORT_SYMBOL(netlink_rcv_skb);

/**
 * nlmsg_notify - send a notification netlink message
 * @sk: netlink socket to use
 * @skb: notification message
 * @portid: destination netlink portid for reports or 0
 * @group: destination multicast group or 0
 * @report: 1 to report back, 0 to disable
 * @flags: allocation flags
 */
int nlmsg_notify(struct sock *sk, struct sk_buff *skb, u32 portid,
		 unsigned int group, int report, gfp_t flags)
{
	int err = 0;

	if (group) {
		int exclude_portid = 0;

		if (report) {
			atomic_inc(&skb->users);
			exclude_portid = portid;
		}

		/* errors reported via destination sk->sk_err, but propagate
		 * delivery errors if NETLINK_BROADCAST_ERROR flag is set */
		err = nlmsg_multicast(sk, skb, exclude_portid, group, flags);
	}

	if (report) {
		int err2;

		err2 = nlmsg_unicast(sk, skb, portid);
		if (!err || err == -ESRCH)
			err = err2;
	}

	return err;
}
EXPORT_SYMBOL(nlmsg_notify);

#ifdef CONFIG_PROC_FS
struct nl_seq_iter {
	struct seq_net_private p;
	struct rhashtable_iter hti;
	int link;
};

static int netlink_walk_start(struct nl_seq_iter *iter)
{
	int err;

	err = rhashtable_walk_init(&nl_table[iter->link].hash, &iter->hti);
	if (err) {
		iter->link = MAX_LINKS;
		return err;
	}

	err = rhashtable_walk_start(&iter->hti);
	return err == -EAGAIN ? 0 : err;
}

static void netlink_walk_stop(struct nl_seq_iter *iter)
{
	rhashtable_walk_stop(&iter->hti);
	rhashtable_walk_exit(&iter->hti);
}

static void *__netlink_seq_next(struct seq_file *seq)
{
	struct nl_seq_iter *iter = seq->private;
	struct netlink_sock *nlk;

	do {
		for (;;) {
			int err;

			nlk = rhashtable_walk_next(&iter->hti);

			if (IS_ERR(nlk)) {
				if (PTR_ERR(nlk) == -EAGAIN)
					continue;

				return nlk;
			}

			if (nlk)
				break;

			netlink_walk_stop(iter);
			if (++iter->link >= MAX_LINKS)
				return NULL;

			err = netlink_walk_start(iter);
			if (err)
				return ERR_PTR(err);
		}
	} while (sock_net(&nlk->sk) != seq_file_net(seq));

	return nlk;
}

static void *netlink_seq_start(struct seq_file *seq, loff_t *posp)
{
	struct nl_seq_iter *iter = seq->private;
	void *obj = SEQ_START_TOKEN;
	loff_t pos;
	int err;

	iter->link = 0;

	err = netlink_walk_start(iter);
	if (err)
		return ERR_PTR(err);

	for (pos = *posp; pos && obj && !IS_ERR(obj); pos--)
		obj = __netlink_seq_next(seq);

	return obj;
}

static void *netlink_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	++*pos;
	return __netlink_seq_next(seq);
}

static void netlink_seq_stop(struct seq_file *seq, void *v)
{
	struct nl_seq_iter *iter = seq->private;

	if (iter->link >= MAX_LINKS)
		return;

	netlink_walk_stop(iter);
}


static int netlink_seq_show(struct seq_file *seq, void *v)
{
	if (v == SEQ_START_TOKEN) {
		seq_puts(seq,
			 "sk       Eth Pid    Groups   "
			 "Rmem     Wmem     Dump     Locks     Drops     Inode\n");
	} else {
		struct sock *s = v;
		struct netlink_sock *nlk = nlk_sk(s);

		seq_printf(seq, "%pK %-3d %-6u %08x %-8d %-8d %d %-8d %-8d %-8lu\n",
			   s,
			   s->sk_protocol,
			   nlk->portid,
			   nlk->groups ? (u32)nlk->groups[0] : 0,
			   sk_rmem_alloc_get(s),
			   sk_wmem_alloc_get(s),
			   nlk->cb_running,
			   atomic_read(&s->sk_refcnt),
			   atomic_read(&s->sk_drops),
			   sock_i_ino(s)
			);

	}
	return 0;
}

static const struct seq_operations netlink_seq_ops = {
	.start  = netlink_seq_start,
	.next   = netlink_seq_next,
	.stop   = netlink_seq_stop,
	.show   = netlink_seq_show,
};


static int netlink_seq_open(struct inode *inode, struct file *file)
{
	return seq_open_net(inode, file, &netlink_seq_ops,
				sizeof(struct nl_seq_iter));
}

static const struct file_operations netlink_seq_fops = {
	.owner		= THIS_MODULE,
	.open		= netlink_seq_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release_net,
};

#endif

int netlink_register_notifier(struct notifier_block *nb)
{
	return atomic_notifier_chain_register(&netlink_chain, nb);
}
EXPORT_SYMBOL(netlink_register_notifier);

int netlink_unregister_notifier(struct notifier_block *nb)
{
	return atomic_notifier_chain_unregister(&netlink_chain, nb);
}
EXPORT_SYMBOL(netlink_unregister_notifier);

static const struct proto_ops netlink_ops = {
	.family =	PF_NETLINK,
	.owner =	THIS_MODULE,
	.release =	netlink_release,
	.bind =		netlink_bind,
	.connect =	netlink_connect,
	.socketpair =	sock_no_socketpair,
	.accept =	sock_no_accept,
	.getname =	netlink_getname,
	.poll =		netlink_poll,
	.ioctl =	sock_no_ioctl,
	.listen =	sock_no_listen,
	.shutdown =	sock_no_shutdown,
	.setsockopt =	netlink_setsockopt,
	.getsockopt =	netlink_getsockopt,
	.sendmsg =	netlink_sendmsg,
	.recvmsg =	netlink_recvmsg,
	.mmap =		netlink_mmap,
	.sendpage =	sock_no_sendpage,
};

static const struct net_proto_family netlink_family_ops = {
	.family = PF_NETLINK,
	.create = netlink_create,
	.owner	= THIS_MODULE,	/* for consistency 8) */
};

static int __net_init netlink_net_init(struct net *net)
{
#ifdef CONFIG_PROC_FS
	if (!proc_create("netlink", 0, net->proc_net, &netlink_seq_fops))
		return -ENOMEM;
#endif
	return 0;
}

static void __net_exit netlink_net_exit(struct net *net)
{
#ifdef CONFIG_PROC_FS
	remove_proc_entry("netlink", net->proc_net);
#endif
}

static void __init netlink_add_usersock_entry(void)
{
	struct listeners *listeners;
	int groups = 32;

	listeners = kzalloc(sizeof(*listeners) + NLGRPSZ(groups), GFP_KERNEL);
	if (!listeners)
		panic("netlink_add_usersock_entry: Cannot allocate listeners\n");

	netlink_table_grab();

	nl_table[NETLINK_USERSOCK].groups = groups;
	rcu_assign_pointer(nl_table[NETLINK_USERSOCK].listeners, listeners);
	nl_table[NETLINK_USERSOCK].module = THIS_MODULE;
	nl_table[NETLINK_USERSOCK].registered = 1;
	nl_table[NETLINK_USERSOCK].flags = NL_CFG_F_NONROOT_SEND;

	netlink_table_ungrab();
}

static struct pernet_operations __net_initdata netlink_net_ops = {
	.init = netlink_net_init,
	.exit = netlink_net_exit,
};

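/* Sockets are hashed over (net, portid); see netlink_compare_arg_init(). */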
static inline u32 netlink_hash(const void *data, u32 len, u32 seed)
{
	const struct netlink_sock *nlk = data;
	struct netlink_compare_arg arg;

	netlink_compare_arg_init(&arg, sock_net(&nlk->sk), nlk->portid);
	return jhash2((u32 *)&arg, netlink_compare_arg_len / sizeof(u32), seed);
}

static const struct rhashtable_params netlink_rhashtable_params = {
	.head_offset = offsetof(struct netlink_sock, node),
	.key_len = netlink_compare_arg_len,
	.obj_hashfn = netlink_hash,
	.obj_cmpfn = netlink_compare,
	.automatic_shrinking = true,
};

static int __init netlink_proto_init(void)
{
	int i;
	int err = proto_register(&netlink_proto, 0);

	if (err != 0)
		goto out;

	BUILD_BUG_ON(sizeof(struct netlink_skb_parms) > FIELD_SIZEOF(struct sk_buff, cb));

	nl_table = kcalloc(MAX_LINKS, sizeof(*nl_table), GFP_KERNEL);
	if (!nl_table)
		goto panic;

	for (i = 0; i < MAX_LINKS; i++) {
		if (rhashtable_init(&nl_table[i].hash,
				    &netlink_rhashtable_params) < 0) {
			/* destroy every table initialized so far, including
			 * index 0 */
			while (--i >= 0)
				rhashtable_destroy(&nl_table[i].hash);
			kfree(nl_table);
			goto panic;
		}
	}

	INIT_LIST_HEAD(&netlink_tap_all);

	netlink_add_usersock_entry();

	sock_register(&netlink_family_ops);
	register_pernet_subsys(&netlink_net_ops);
	/* The netlink device handler may be needed early. */
	rtnetlink_init();
out:
	return err;
panic:
	panic("netlink_init: Cannot allocate nl_table\n");
}

core_initcall(netlink_proto_init);