// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * NET		An implementation of the SOCKET network access protocol.
 *
 * Version:	@(#)socket.c	1.1.93	18/02/95
 *
 * Authors:	Orest Zborowski, <obz@Kodak.COM>
 *		Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *
 * Fixes:
 *		Anonymous	:	NOTSOCK/BADF cleanup. Error fix in
 *					shutdown()
 *		Alan Cox	:	verify_area() fixes
 *		Alan Cox	:	Removed DDI
 *		Jonathan Kamens	:	SOCK_DGRAM reconnect bug
 *		Alan Cox	:	Moved a load of checks to the very
 *					top level.
 *		Alan Cox	:	Move address structures to/from user
 *					mode above the protocol layers.
 *		Rob Janssen	:	Allow 0 length sends.
 *		Alan Cox	:	Asynchronous I/O support (cribbed from the
 *					tty drivers).
 *		Niibe Yutaka	:	Asynchronous I/O for writes (4.4BSD style)
 *		Jeff Uphoff	:	Made max number of sockets command-line
 *					configurable.
 *		Matti Aarnio	:	Made the number of sockets dynamic,
 *					to be allocated when needed, and mr.
 *					Uphoff's max is used as max to be
 *					allowed to allocate.
 *		Linus		:	Argh. removed all the socket allocation
 *					altogether: it's in the inode now.
 *		Alan Cox	:	Made sock_alloc()/sock_release() public
 *					for NetROM and future kernel nfsd type
 *					stuff.
 *		Alan Cox	:	sendmsg/recvmsg basics.
 *		Tom Dyas	:	Export net symbols.
 *		Marcin Dalecki	:	Fixed problems with CONFIG_NET="n".
 *		Alan Cox	:	Added thread locking to sys_* calls
 *					for sockets. May have errors at the
 *					moment.
 *		Kevin Buhr	:	Fixed the dumb errors in the above.
 *		Andi Kleen	:	Some small cleanups, optimizations,
 *					and fixed a copy_from_user() bug.
 *		Tigran Aivazian	:	sys_send(args) calls sys_sendto(args, NULL, 0)
 *		Tigran Aivazian	:	Made listen(2) backlog sanity checks
 *					protocol-independent
 *
 *	This module is effectively the top level interface to the BSD socket
 *	paradigm.
 *
 *	Based upon Swansea University Computer Society NET3.039
 */

#include <linux/mm.h>
#include <linux/socket.h>
#include <linux/file.h>
#include <linux/net.h>
#include <linux/interrupt.h>
U
Ulrich Drepper 已提交
60
#include <linux/thread_info.h>
61
#include <linux/rcupdate.h>
L
Linus Torvalds 已提交
62 63 64
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
A
Arjan van de Ven 已提交
65
#include <linux/mutex.h>
L
Linus Torvalds 已提交
66
#include <linux/if_bridge.h>
67 68
#include <linux/if_frad.h>
#include <linux/if_vlan.h>
69
#include <linux/ptp_classify.h>
L
Linus Torvalds 已提交
70 71 72 73 74 75 76 77 78 79
#include <linux/init.h>
#include <linux/poll.h>
#include <linux/cache.h>
#include <linux/module.h>
#include <linux/highmem.h>
#include <linux/mount.h>
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/compat.h>
#include <linux/kmod.h>
80
#include <linux/audit.h>
81
#include <linux/wireless.h>
82
#include <linux/nsproxy.h>
N
Nick Black 已提交
83
#include <linux/magic.h>
84
#include <linux/slab.h>
85
#include <linux/xattr.h>
86
#include <linux/nospec.h>
87
#include <linux/indirect_call_wrapper.h>
L
Linus Torvalds 已提交
88

89
#include <linux/uaccess.h>
L
Linus Torvalds 已提交
90 91 92
#include <asm/unistd.h>

#include <net/compat.h>
93
#include <net/wext.h>
94
#include <net/cls_cgroup.h>
L
Linus Torvalds 已提交
95 96 97 98

#include <net/sock.h>
#include <linux/netfilter.h>

99 100 101 102
#include <linux/if_tun.h>
#include <linux/ipv6_route.h>
#include <linux/route.h>
#include <linux/sockios.h>
103
#include <net/busy_poll.h>
104
#include <linux/errqueue.h>
E
Eliezer Tamir 已提交
105

106
#ifdef CONFIG_NET_RX_BUSY_POLL
/*
 * Globals that exist only when CONFIG_NET_RX_BUSY_POLL is enabled.
 * Both start out as zero (static storage, no initialiser).
 */
unsigned int sysctl_net_busy_read __read_mostly;
unsigned int sysctl_net_busy_poll __read_mostly;
#endif
110

111 112
static ssize_t sock_read_iter(struct kiocb *iocb, struct iov_iter *to);
static ssize_t sock_write_iter(struct kiocb *iocb, struct iov_iter *from);
113
static int sock_mmap(struct file *file, struct vm_area_struct *vma);
L
Linus Torvalds 已提交
114 115

static int sock_close(struct inode *inode, struct file *file);
116 117
static __poll_t sock_poll(struct file *file,
			      struct poll_table_struct *wait);
118
static long sock_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
119 120
#ifdef CONFIG_COMPAT
static long compat_sock_ioctl(struct file *file,
121
			      unsigned int cmd, unsigned long arg);
122
#endif
L
Linus Torvalds 已提交
123 124 125
static int sock_fasync(int fd, struct file *filp, int on);
static ssize_t sock_sendpage(struct file *file, struct page *page,
			     int offset, size_t size, loff_t *ppos, int more);
J
Jens Axboe 已提交
126
static ssize_t sock_splice_read(struct file *file, loff_t *ppos,
127
				struct pipe_inode_info *pipe, size_t len,
J
Jens Axboe 已提交
128
				unsigned int flags);
L
Linus Torvalds 已提交
129 130 131 132 133 134

/*
 *	Socket files have a set of 'special' operations as well as the generic file ones. These don't appear
 *	in the operation structures but are done directly via the socketcall() multiplexor.
 */

135
static const struct file_operations socket_file_ops = {
L
Linus Torvalds 已提交
136 137
	.owner =	THIS_MODULE,
	.llseek =	no_llseek,
138 139
	.read_iter =	sock_read_iter,
	.write_iter =	sock_write_iter,
L
Linus Torvalds 已提交
140 141
	.poll =		sock_poll,
	.unlocked_ioctl = sock_ioctl,
142 143 144
#ifdef CONFIG_COMPAT
	.compat_ioctl = compat_sock_ioctl,
#endif
L
Linus Torvalds 已提交
145 146 147
	.mmap =		sock_mmap,
	.release =	sock_close,
	.fasync =	sock_fasync,
148 149
	.sendpage =	sock_sendpage,
	.splice_write = generic_splice_sendpage,
J
Jens Axboe 已提交
150
	.splice_read =	sock_splice_read,
L
Linus Torvalds 已提交
151 152 153 154 155 156 157
};

/*
 *	The protocol list. Each protocol is registered in here.
 */

static DEFINE_SPINLOCK(net_family_lock);
E
Eric Dumazet 已提交
158
static const struct net_proto_family __rcu *net_families[NPROTO] __read_mostly;
L
Linus Torvalds 已提交
159 160

/*
161 162 163
 * Support routines.
 * Move socket addresses back and forth across the kernel/user
 * divide and look after the messy bits.
L
Linus Torvalds 已提交
164 165 166 167 168 169 170 171 172 173 174 175 176
 */

/**
 *	move_addr_to_kernel	-	copy a socket address into kernel space
 *	@uaddr: Address in user space
 *	@kaddr: Address in kernel space
 *	@ulen: Length in user space
 *
 *	The address is copied into kernel space. If the provided address is
 *	too long an error code of -EINVAL is returned. If the copy gives
 *	invalid addresses -EFAULT is returned. On a success 0 is returned.
 */

177
int move_addr_to_kernel(void __user *uaddr, int ulen, struct sockaddr_storage *kaddr)
L
Linus Torvalds 已提交
178
{
179
	if (ulen < 0 || ulen > sizeof(struct sockaddr_storage))
L
Linus Torvalds 已提交
180
		return -EINVAL;
181
	if (ulen == 0)
L
Linus Torvalds 已提交
182
		return 0;
183
	if (copy_from_user(kaddr, uaddr, ulen))
L
Linus Torvalds 已提交
184
		return -EFAULT;
185
	return audit_sockaddr(ulen, kaddr);
L
Linus Torvalds 已提交
186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203
}

/**
 *	move_addr_to_user	-	copy an address to user space
 *	@kaddr: kernel space address
 *	@klen: length of address in kernel
 *	@uaddr: user space address
 *	@ulen: pointer to user length field
 *
 *	The value pointed to by ulen on entry is the buffer length available.
 *	This is overwritten with the buffer space used. -EINVAL is returned
 *	if an overlong buffer is specified or a negative buffer size. -EFAULT
 *	is returned if either the buffer or the length field are not
 *	accessible.
 *	After copying the data up to the limit the user specifies, the true
 *	length of the data is written over the length limit the user
 *	specified. Zero is returned for a success.
 */
204

205
static int move_addr_to_user(struct sockaddr_storage *kaddr, int klen,
S
stephen hemminger 已提交
206
			     void __user *uaddr, int __user *ulen)
L
Linus Torvalds 已提交
207 208 209 210
{
	int err;
	int len;

211
	BUG_ON(klen > sizeof(struct sockaddr_storage));
212 213
	err = get_user(len, ulen);
	if (err)
L
Linus Torvalds 已提交
214
		return err;
215 216
	if (len > klen)
		len = klen;
217
	if (len < 0)
L
Linus Torvalds 已提交
218
		return -EINVAL;
219
	if (len) {
S
Steve Grubb 已提交
220 221
		if (audit_sockaddr(klen, kaddr))
			return -ENOMEM;
222
		if (copy_to_user(uaddr, kaddr, len))
L
Linus Torvalds 已提交
223 224 225
			return -EFAULT;
	}
	/*
226 227
	 *      "fromlen shall refer to the value before truncation.."
	 *                      1003.1g
L
Linus Torvalds 已提交
228 229 230 231
	 */
	return __put_user(klen, ulen);
}

232
/*
 * Slab cache for struct socket_alloc objects; created once by
 * init_inodecache() and used by sock_alloc_inode()/sock_free_inode().
 */
static struct kmem_cache *sock_inode_cachep __ro_after_init;
L
Linus Torvalds 已提交
233 234 235 236

static struct inode *sock_alloc_inode(struct super_block *sb)
{
	struct socket_alloc *ei;
237
	struct socket_wq *wq;
238

239
	ei = kmem_cache_alloc(sock_inode_cachep, GFP_KERNEL);
L
Linus Torvalds 已提交
240 241
	if (!ei)
		return NULL;
242 243
	wq = kmalloc(sizeof(*wq), GFP_KERNEL);
	if (!wq) {
244 245 246
		kmem_cache_free(sock_inode_cachep, ei);
		return NULL;
	}
247 248
	init_waitqueue_head(&wq->wait);
	wq->fasync_list = NULL;
249
	wq->flags = 0;
250
	ei->socket.wq = wq;
251

L
Linus Torvalds 已提交
252 253 254 255 256 257 258 259 260
	ei->socket.state = SS_UNCONNECTED;
	ei->socket.flags = 0;
	ei->socket.ops = NULL;
	ei->socket.sk = NULL;
	ei->socket.file = NULL;

	return &ei->vfs_inode;
}

A
Al Viro 已提交
261
static void sock_free_inode(struct inode *inode)
L
Linus Torvalds 已提交
262
{
263 264 265
	struct socket_alloc *ei;

	ei = container_of(inode, struct socket_alloc, vfs_inode);
A
Al Viro 已提交
266
	kfree(ei->socket.wq);
267
	kmem_cache_free(sock_inode_cachep, ei);
L
Linus Torvalds 已提交
268 269
}

270
static void init_once(void *foo)
L
Linus Torvalds 已提交
271
{
272
	struct socket_alloc *ei = (struct socket_alloc *)foo;
L
Linus Torvalds 已提交
273

C
Christoph Lameter 已提交
274
	inode_init_once(&ei->vfs_inode);
L
Linus Torvalds 已提交
275
}
276

277
static void init_inodecache(void)
L
Linus Torvalds 已提交
278 279
{
	sock_inode_cachep = kmem_cache_create("sock_inode_cache",
280 281 282 283
					      sizeof(struct socket_alloc),
					      0,
					      (SLAB_HWCACHE_ALIGN |
					       SLAB_RECLAIM_ACCOUNT |
284
					       SLAB_MEM_SPREAD | SLAB_ACCOUNT),
285
					      init_once);
286
	BUG_ON(sock_inode_cachep == NULL);
L
Linus Torvalds 已提交
287 288
}

289
static const struct super_operations sockfs_ops = {
290
	.alloc_inode	= sock_alloc_inode,
A
Al Viro 已提交
291
	.free_inode	= sock_free_inode,
292
	.statfs		= simple_statfs,
L
Linus Torvalds 已提交
293 294
};

295 296 297 298 299 300
/*
 * sockfs_dname() is called from d_path().
 */
static char *sockfs_dname(struct dentry *dentry, char *buffer, int buflen)
{
	return dynamic_dname(dentry, buffer, buflen, "socket:[%lu]",
D
David Howells 已提交
301
				d_inode(dentry)->i_ino);
302 303
}

A
Al Viro 已提交
304
static const struct dentry_operations sockfs_dentry_operations = {
305
	.d_dname  = sockfs_dname,
L
Linus Torvalds 已提交
306 307
};

308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328
/*
 * xattr ->get handler: report the dentry name, including its trailing
 * NUL, as the attribute value.
 *
 * With a NULL @value only the required size is returned.  Otherwise the
 * name is copied into @value, or -ERANGE is returned when @size is too
 * small to hold it.
 */
static int sockfs_xattr_get(const struct xattr_handler *handler,
			    struct dentry *dentry, struct inode *inode,
			    const char *suffix, void *value, size_t size)
{
	const struct qstr *name = &dentry->d_name;

	if (!value)
		return name->len + 1;

	if (name->len + 1 > size)
		return -ERANGE;

	memcpy(value, name->name, name->len + 1);
	return name->len + 1;
}

/*
 * Full attribute name is XATTR_SYSTEM_PREFIX followed by "sockprotoname".
 * The _LEN macro is the string length without the terminating NUL.
 */
#define XATTR_SOCKPROTONAME_SUFFIX "sockprotoname"
#define XATTR_NAME_SOCKPROTONAME (XATTR_SYSTEM_PREFIX XATTR_SOCKPROTONAME_SUFFIX)
#define XATTR_NAME_SOCKPROTONAME_LEN (sizeof(XATTR_NAME_SOCKPROTONAME)-1)

/* Read-only handler for the attribute named above. */
static const struct xattr_handler sockfs_xattr_handler = {
	.get	= sockfs_xattr_get,
	.name	= XATTR_NAME_SOCKPROTONAME,
};

329 330 331 332 333 334 335 336 337 338 339 340 341 342
/*
 * xattr ->set handler for the security prefix.  It stores nothing and
 * always returns -EAGAIN.
 */
static int sockfs_security_xattr_set(const struct xattr_handler *handler,
				     struct dentry *dentry, struct inode *inode,
				     const char *suffix, const void *value,
				     size_t size, int flags)
{
	/* Handled by LSM. */
	return -EAGAIN;
}

/* Write-only handler matching every attribute under the security prefix. */
static const struct xattr_handler sockfs_security_xattr_handler = {
	.set	= sockfs_security_xattr_set,
	.prefix	= XATTR_SECURITY_PREFIX,
};

343 344
static const struct xattr_handler *sockfs_xattr_handlers[] = {
	&sockfs_xattr_handler,
345
	&sockfs_security_xattr_handler,
346 347 348
	NULL
};

349 350 351
static struct dentry *sockfs_mount(struct file_system_type *fs_type,
			 int flags, const char *dev_name, void *data)
{
352 353 354
	return mount_pseudo_xattr(fs_type, "socket:", &sockfs_ops,
				  sockfs_xattr_handlers,
				  &sockfs_dentry_operations, SOCKFS_MAGIC);
355 356 357 358 359 360 361 362 363 364
}

/* Mount of sockfs; its superblock is where sock_alloc() creates inodes. */
static struct vfsmount *sock_mnt __read_mostly;

/* Filesystem type descriptor for sockfs. */
static struct file_system_type sock_fs_type = {
	.name		= "sockfs",
	.mount		= sockfs_mount,
	.kill_sb	= kill_anon_super,
};

L
Linus Torvalds 已提交
365 366 367
/*
 *	Obtains the first available file descriptor and sets it up for use.
 *
368 369
 *	These functions create file structures and map them to fd space
 *	of the current process. On success it returns file descriptor
L
Linus Torvalds 已提交
370 371 372 373 374 375 376 377 378 379 380 381
 *	and file struct implicitly stored in sock->file.
 *	Note that another thread may close file descriptor before we return
 *	from this function. We use the fact that now we do not refer
 *	to socket after mapping. If one day we will need it, this
 *	function will increment ref. count on file by 1.
 *
 *	In any case returned fd MAY BE not valid!
 *	This race condition is unavoidable
 *	with shared fd spaces, we cannot solve it inside kernel,
 *	but we take care of internal coherence yet.
 */

382 383 384 385 386 387 388 389 390 391 392 393
/**
 *	sock_alloc_file - Bind a &socket to a &file
 *	@sock: socket
 *	@flags: file status flags
 *	@dname: protocol name
 *
 *	Returns the &file bound with @sock, implicitly storing it
 *	in sock->file. If dname is %NULL, sets to "".
 *	On failure the return is a ERR pointer (see linux/err.h).
 *	This function uses GFP_KERNEL internally.
 */

394
struct file *sock_alloc_file(struct socket *sock, int flags, const char *dname)
L
Linus Torvalds 已提交
395
{
396
	struct file *file;
L
Linus Torvalds 已提交
397

A
Al Viro 已提交
398 399
	if (!dname)
		dname = sock->sk ? sock->sk->sk_prot_creator->name : "";
400

A
Al Viro 已提交
401 402 403
	file = alloc_file_pseudo(SOCK_INODE(sock), sock_mnt, dname,
				O_RDWR | (flags & O_NONBLOCK),
				&socket_file_ops);
404
	if (IS_ERR(file)) {
405
		sock_release(sock);
406
		return file;
A
Al Viro 已提交
407 408 409
	}

	sock->file = file;
410
	file->private_data = sock;
411
	return file;
412
}
413
EXPORT_SYMBOL(sock_alloc_file);
414

415
static int sock_map_fd(struct socket *sock, int flags)
416 417
{
	struct file *newfile;
418
	int fd = get_unused_fd_flags(flags);
419 420
	if (unlikely(fd < 0)) {
		sock_release(sock);
421
		return fd;
422
	}
423

424
	newfile = sock_alloc_file(sock, flags, NULL);
425
	if (!IS_ERR(newfile)) {
426
		fd_install(fd, newfile);
427 428
		return fd;
	}
429

430 431
	put_unused_fd(fd);
	return PTR_ERR(newfile);
L
Linus Torvalds 已提交
432 433
}

434 435 436 437 438 439 440 441
/**
 *	sock_from_file - Return the &socket bounded to @file.
 *	@file: file
 *	@err: pointer to an error code return
 *
 *	On failure returns %NULL and assigns -ENOTSOCK to @err.
 */

442
struct socket *sock_from_file(struct file *file, int *err)
443 444 445 446
{
	if (file->f_op == &socket_file_ops)
		return file->private_data;	/* set in sock_map_fd */

E
Eric Dumazet 已提交
447 448
	*err = -ENOTSOCK;
	return NULL;
449
}
450
EXPORT_SYMBOL(sock_from_file);
451

L
Linus Torvalds 已提交
452
/**
453
 *	sockfd_lookup - Go from a file number to its socket slot
L
Linus Torvalds 已提交
454 455 456 457
 *	@fd: file handle
 *	@err: pointer to an error code return
 *
 *	The file handle passed in is locked and the socket it is bound
458
 *	to is returned. If an error occurs the err pointer is overwritten
L
Linus Torvalds 已提交
459 460 461 462 463 464 465 466 467 468 469
 *	with a negative errno code and NULL is returned. The function checks
 *	for both invalid handles and passing a handle which is not a socket.
 *
 *	On a success the socket object pointer is returned.
 */

struct socket *sockfd_lookup(int fd, int *err)
{
	struct file *file;
	struct socket *sock;

470 471
	file = fget(fd);
	if (!file) {
L
Linus Torvalds 已提交
472 473 474
		*err = -EBADF;
		return NULL;
	}
475

476 477
	sock = sock_from_file(file, err);
	if (!sock)
L
Linus Torvalds 已提交
478
		fput(file);
479 480
	return sock;
}
481
EXPORT_SYMBOL(sockfd_lookup);
L
Linus Torvalds 已提交
482

483 484
static struct socket *sockfd_lookup_light(int fd, int *err, int *fput_needed)
{
485
	struct fd f = fdget(fd);
486 487
	struct socket *sock;

488
	*err = -EBADF;
489 490 491 492
	if (f.file) {
		sock = sock_from_file(f.file, err);
		if (likely(sock)) {
			*fput_needed = f.flags;
493
			return sock;
494 495
		}
		fdput(f);
L
Linus Torvalds 已提交
496
	}
497
	return NULL;
L
Linus Torvalds 已提交
498 499
}

500 501 502 503 504 505
/*
 * ->listxattr for socket inodes: the names reported by
 * security_inode_listsecurity() followed by the sockprotoname attribute
 * name (NUL included).
 *
 * Returns the total number of bytes needed.  When @buffer is non-NULL
 * the names are written into it, and -ERANGE is returned if @size is too
 * small.  A negative result from the security hook is passed through.
 */
static ssize_t sockfs_listxattr(struct dentry *dentry, char *buffer,
				size_t size)
{
	const ssize_t name_len = (XATTR_NAME_SOCKPROTONAME_LEN + 1);
	ssize_t sec_len;
	ssize_t total;

	sec_len = security_inode_listsecurity(d_inode(dentry), buffer, size);
	if (sec_len < 0)
		return sec_len;

	total = sec_len;
	if (buffer) {
		if (size < total)
			return -ERANGE;
		buffer += sec_len;
	}

	total += name_len;
	if (buffer) {
		if (size < total)
			return -ERANGE;
		memcpy(buffer, XATTR_NAME_SOCKPROTONAME, name_len);
	}

	return total;
}

528
static int sockfs_setattr(struct dentry *dentry, struct iattr *iattr)
529 530 531
{
	int err = simple_setattr(dentry, iattr);

532
	if (!err && (iattr->ia_valid & ATTR_UID)) {
533 534
		struct socket *sock = SOCKET_I(d_inode(dentry));

535 536 537 538
		if (sock->sk)
			sock->sk->sk_uid = iattr->ia_uid;
		else
			err = -ENOENT;
539 540 541 542 543
	}

	return err;
}

544 545
static const struct inode_operations sockfs_inode_ops = {
	.listxattr = sockfs_listxattr,
546
	.setattr = sockfs_setattr,
547 548
};

L
Linus Torvalds 已提交
549
/**
550
 *	sock_alloc - allocate a socket
551
 *
L
Linus Torvalds 已提交
552 553
 *	Allocate a new inode and socket object. The two are bound together
 *	and initialised. The socket is then returned. If we are out of inodes
554
 *	NULL is returned. This functions uses GFP_KERNEL internally.
L
Linus Torvalds 已提交
555 556
 */

T
Tom Herbert 已提交
557
struct socket *sock_alloc(void)
L
Linus Torvalds 已提交
558
{
559 560
	struct inode *inode;
	struct socket *sock;
L
Linus Torvalds 已提交
561

562
	inode = new_inode_pseudo(sock_mnt->mnt_sb);
L
Linus Torvalds 已提交
563 564 565 566 567
	if (!inode)
		return NULL;

	sock = SOCKET_I(inode);

568
	inode->i_ino = get_next_ino();
569
	inode->i_mode = S_IFSOCK | S_IRWXUGO;
570 571
	inode->i_uid = current_fsuid();
	inode->i_gid = current_fsgid();
572
	inode->i_op = &sockfs_inode_ops;
L
Linus Torvalds 已提交
573 574 575

	return sock;
}
T
Tom Herbert 已提交
576
EXPORT_SYMBOL(sock_alloc);
L
Linus Torvalds 已提交
577 578

/**
579
 *	sock_release - close a socket
L
Linus Torvalds 已提交
580 581 582 583
 *	@sock: socket to close
 *
 *	The socket is released from the protocol stack if it has a release
 *	callback, and the inode is then released if the socket is bound to
584
 *	an inode not a file.
L
Linus Torvalds 已提交
585
 */
586

587
static void __sock_release(struct socket *sock, struct inode *inode)
L
Linus Torvalds 已提交
588 589 590 591
{
	if (sock->ops) {
		struct module *owner = sock->ops->owner;

592 593
		if (inode)
			inode_lock(inode);
L
Linus Torvalds 已提交
594
		sock->ops->release(sock);
595
		sock->sk = NULL;
596 597
		if (inode)
			inode_unlock(inode);
L
Linus Torvalds 已提交
598 599 600 601
		sock->ops = NULL;
		module_put(owner);
	}

602
	if (sock->wq->fasync_list)
603
		pr_err("%s: fasync list not empty!\n", __func__);
L
Linus Torvalds 已提交
604 605 606 607 608

	if (!sock->file) {
		iput(SOCK_INODE(sock));
		return;
	}
609
	sock->file = NULL;
L
Linus Torvalds 已提交
610
}
611 612 613 614 615

/* Exported entry point: release @sock without locking any inode. */
void sock_release(struct socket *sock)
{
	__sock_release(sock, NULL);
}
EXPORT_SYMBOL(sock_release);
L
Linus Torvalds 已提交
617

618
/*
 * Translate SOF_TIMESTAMPING_TX_* request bits in @tsflags into the
 * corresponding SKBTX_*_TSTAMP bits and OR them into *@tx_flags.
 * Bits already set in *@tx_flags are preserved.
 */
void __sock_tx_timestamp(__u16 tsflags, __u8 *tx_flags)
{
	u8 extra = 0;

	if (tsflags & SOF_TIMESTAMPING_TX_HARDWARE)
		extra |= SKBTX_HW_TSTAMP;
	if (tsflags & SOF_TIMESTAMPING_TX_SOFTWARE)
		extra |= SKBTX_SW_TSTAMP;
	if (tsflags & SOF_TIMESTAMPING_TX_SCHED)
		extra |= SKBTX_SCHED_TSTAMP;

	*tx_flags |= extra;
}
EXPORT_SYMBOL(__sock_tx_timestamp);
634

635 636
INDIRECT_CALLABLE_DECLARE(int inet_sendmsg(struct socket *, struct msghdr *,
					   size_t));
637 638
INDIRECT_CALLABLE_DECLARE(int inet6_sendmsg(struct socket *, struct msghdr *,
					    size_t));
639
static inline int sock_sendmsg_nosec(struct socket *sock, struct msghdr *msg)
L
Linus Torvalds 已提交
640
{
641 642 643
	int ret = INDIRECT_CALL_INET(sock->ops->sendmsg, inet6_sendmsg,
				     inet_sendmsg, sock, msg,
				     msg_data_left(msg));
644 645
	BUG_ON(ret == -EIOCBQUEUED);
	return ret;
L
Linus Torvalds 已提交
646 647
}

648 649 650 651 652 653 654 655
/**
 *	sock_sendmsg - send a message through @sock
 *	@sock: socket
 *	@msg: message to send
 *
 *	Runs the LSM hook security_socket_sendmsg() first.  If it returns
 *	non-zero that value is returned and nothing is sent; otherwise @msg
 *	is passed to the protocol.
 *	Returns the number of bytes sent, or an error code.
 */
int sock_sendmsg(struct socket *sock, struct msghdr *msg)
{
	int err;

	err = security_socket_sendmsg(sock, msg, msg_data_left(msg));
	if (err)
		return err;

	return sock_sendmsg_nosec(sock, msg);
}
EXPORT_SYMBOL(sock_sendmsg);
L
Linus Torvalds 已提交
664

665 666 667 668 669 670 671 672 673 674 675 676
/**
 *	kernel_sendmsg - send a message through @sock (kernel-space)
 *	@sock: socket
 *	@msg: message header
 *	@vec: kernel vec
 *	@num: vec array length
 *	@size: total message data size
 *
 *	Builds the message data with @vec and sends it through @sock.
 *	Returns the number of bytes sent, or an error code.
 */

L
Linus Torvalds 已提交
677 678 679
int kernel_sendmsg(struct socket *sock, struct msghdr *msg,
		   struct kvec *vec, size_t num, size_t size)
{
680
	iov_iter_kvec(&msg->msg_iter, WRITE, vec, num, size);
681
	return sock_sendmsg(sock, msg);
L
Linus Torvalds 已提交
682
}
683
EXPORT_SYMBOL(kernel_sendmsg);
L
Linus Torvalds 已提交
684

685 686 687 688 689 690 691 692 693 694 695 696 697
/**
 *	kernel_sendmsg_locked - send a message through @sock (kernel-space)
 *	@sk: sock
 *	@msg: message header
 *	@vec: output s/g array
 *	@num: output s/g array length
 *	@size: total message data size
 *
 *	Builds the message data with @vec and sends it through @sock.
 *	Returns the number of bytes sent, or an error code.
 *	Caller must hold @sk.
 */

698 699 700 701 702 703
int kernel_sendmsg_locked(struct sock *sk, struct msghdr *msg,
			  struct kvec *vec, size_t num, size_t size)
{
	struct socket *sock = sk->sk_socket;

	if (!sock->ops->sendmsg_locked)
J
John Fastabend 已提交
704
		return sock_no_sendmsg_locked(sk, msg, size);
705

706
	iov_iter_kvec(&msg->msg_iter, WRITE, vec, num, size);
707 708 709 710 711

	return sock->ops->sendmsg_locked(sk, msg, msg_data_left(msg));
}
EXPORT_SYMBOL(kernel_sendmsg_locked);

712 713 714 715 716 717 718 719 720 721
/*
 * Tell whether @skb came from a socket error queue.
 *
 * skbs enqueued on the error queue have their pkt_type set to
 * PACKET_OUTGOING in skb_set_err_queue().  Testing for that value is only
 * safe in recvmsg, because skbs received on a local socket never carry
 * PACKET_OUTGOING.
 */
static bool skb_is_err_queue(const struct sk_buff *skb)
{
	return PACKET_OUTGOING == skb->pkt_type;
}

722 723 724 725 726
/* On transmit, software and hardware timestamps are returned independently.
 * As the two skb clones share the hardware timestamp, which may be updated
 * before the software timestamp is received, a hardware TX timestamp may be
 * returned only if there is no software TX timestamp. Ignore false software
 * timestamps, which may be made in the __sock_recv_timestamp() call when the
727
 * option SO_TIMESTAMP_OLD(NS) is enabled on the socket, even when the skb has a
728 729 730 731 732 733 734
 * hardware timestamp.
 */
static bool skb_is_swtx_tstamp(const struct sk_buff *skb, int false_tstamp)
{
	return skb->tstamp && !false_tstamp && skb_is_err_queue(skb);
}

735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755
static void put_ts_pktinfo(struct msghdr *msg, struct sk_buff *skb)
{
	struct scm_ts_pktinfo ts_pktinfo;
	struct net_device *orig_dev;

	if (!skb_mac_header_was_set(skb))
		return;

	memset(&ts_pktinfo, 0, sizeof(ts_pktinfo));

	rcu_read_lock();
	orig_dev = dev_get_by_napi_id(skb_napi_id(skb));
	if (orig_dev)
		ts_pktinfo.if_index = orig_dev->ifindex;
	rcu_read_unlock();

	ts_pktinfo.pkt_length = skb->len - skb_mac_offset(skb);
	put_cmsg(msg, SOL_SOCKET, SCM_TIMESTAMPING_PKTINFO,
		 sizeof(ts_pktinfo), &ts_pktinfo);
}

756 757 758 759 760 761
/*
 * called from sock_recv_timestamp() if sock_flag(sk, SOCK_RCVTSTAMP)
 */
void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk,
	struct sk_buff *skb)
{
762
	int need_software_tstamp = sock_flag(sk, SOCK_RCVTSTAMP);
763
	int new_tstamp = sock_flag(sk, SOCK_TSTAMP_NEW);
D
Deepa Dinamani 已提交
764 765
	struct scm_timestamping_internal tss;

766
	int empty = 1, false_tstamp = 0;
767 768 769 770 771
	struct skb_shared_hwtstamps *shhwtstamps =
		skb_hwtstamps(skb);

	/* Race occurred between timestamp enabling and packet
	   receiving.  Fill in the current time for now. */
772
	if (need_software_tstamp && skb->tstamp == 0) {
773
		__net_timestamp(skb);
774 775
		false_tstamp = 1;
	}
776 777 778

	if (need_software_tstamp) {
		if (!sock_flag(sk, SOCK_RCVTSTAMPNS)) {
779 780 781 782 783 784 785 786 787 788 789 790 791
			if (new_tstamp) {
				struct __kernel_sock_timeval tv;

				skb_get_new_timestamp(skb, &tv);
				put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMP_NEW,
					 sizeof(tv), &tv);
			} else {
				struct __kernel_old_timeval tv;

				skb_get_timestamp(skb, &tv);
				put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMP_OLD,
					 sizeof(tv), &tv);
			}
792
		} else {
793 794 795 796 797 798 799 800 801 802 803 804 805
			if (new_tstamp) {
				struct __kernel_timespec ts;

				skb_get_new_timestampns(skb, &ts);
				put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMPNS_NEW,
					 sizeof(ts), &ts);
			} else {
				struct timespec ts;

				skb_get_timestampns(skb, &ts);
				put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMPNS_OLD,
					 sizeof(ts), &ts);
			}
806 807 808
		}
	}

809
	memset(&tss, 0, sizeof(tss));
810
	if ((sk->sk_tsflags & SOF_TIMESTAMPING_SOFTWARE) &&
D
Deepa Dinamani 已提交
811
	    ktime_to_timespec64_cond(skb->tstamp, tss.ts + 0))
812
		empty = 0;
813
	if (shhwtstamps &&
814
	    (sk->sk_tsflags & SOF_TIMESTAMPING_RAW_HARDWARE) &&
815
	    !skb_is_swtx_tstamp(skb, false_tstamp) &&
D
Deepa Dinamani 已提交
816
	    ktime_to_timespec64_cond(shhwtstamps->hwtstamp, tss.ts + 2)) {
817
		empty = 0;
818 819 820 821
		if ((sk->sk_tsflags & SOF_TIMESTAMPING_OPT_PKTINFO) &&
		    !skb_is_err_queue(skb))
			put_ts_pktinfo(msg, skb);
	}
822
	if (!empty) {
D
Deepa Dinamani 已提交
823 824 825 826
		if (sock_flag(sk, SOCK_TSTAMP_NEW))
			put_cmsg_scm_timestamping64(msg, &tss);
		else
			put_cmsg_scm_timestamping(msg, &tss);
827

828
		if (skb_is_err_queue(skb) && skb->len &&
829
		    SKB_EXT_ERR(skb)->opt_stats)
830 831 832
			put_cmsg(msg, SOL_SOCKET, SCM_TIMESTAMPING_OPT_STATS,
				 skb->len, skb->data);
	}
833
}
834 835
EXPORT_SYMBOL_GPL(__sock_recv_timestamp);

836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851
/*
 * Emit an SCM_WIFI_STATUS control message carrying skb->wifi_acked.
 * Nothing is emitted unless SOCK_WIFI_STATUS is set on @sk and the skb
 * marks its ack status as valid.
 */
void __sock_recv_wifi_status(struct msghdr *msg, struct sock *sk,
	struct sk_buff *skb)
{
	if (sock_flag(sk, SOCK_WIFI_STATUS) && skb->wifi_acked_valid) {
		int ack = skb->wifi_acked;

		put_cmsg(msg, SOL_SOCKET, SCM_WIFI_STATUS, sizeof(ack), &ack);
	}
}
EXPORT_SYMBOL_GPL(__sock_recv_wifi_status);

S
stephen hemminger 已提交
852 853
static inline void sock_recv_drops(struct msghdr *msg, struct sock *sk,
				   struct sk_buff *skb)
854
{
855
	if (sock_flag(sk, SOCK_RXQ_OVFL) && skb && SOCK_SKB_CB(skb)->dropcount)
856
		put_cmsg(msg, SOL_SOCKET, SO_RXQ_OVFL,
857
			sizeof(__u32), &SOCK_SKB_CB(skb)->dropcount);
858 859
}

860
/* Emit the timestamp control messages, then the drop-count one. */
void __sock_recv_ts_and_drops(struct msghdr *msg, struct sock *sk,
	struct sk_buff *skb)
{
	sock_recv_timestamp(msg, sk, skb);
	sock_recv_drops(msg, sk, skb);
}
EXPORT_SYMBOL_GPL(__sock_recv_ts_and_drops);
867

868
INDIRECT_CALLABLE_DECLARE(int inet_recvmsg(struct socket *, struct msghdr *,
869 870 871
					   size_t, int));
INDIRECT_CALLABLE_DECLARE(int inet6_recvmsg(struct socket *, struct msghdr *,
					    size_t, int));
872
static inline int sock_recvmsg_nosec(struct socket *sock, struct msghdr *msg,
873
				     int flags)
L
Linus Torvalds 已提交
874
{
875 876 877
	return INDIRECT_CALL_INET(sock->ops->recvmsg, inet6_recvmsg,
				  inet_recvmsg, sock, msg, msg_data_left(msg),
				  flags);
L
Linus Torvalds 已提交
878 879
}

880 881 882 883 884 885 886 887 888
/**
 *	sock_recvmsg - receive a message from @sock
 *	@sock: socket
 *	@msg: message to receive
 *	@flags: message flags
 *
 *	Runs the LSM hook security_socket_recvmsg() first.  If it returns
 *	non-zero that value is returned and nothing is received; otherwise
 *	the protocol fills @msg.  Returns the total number of bytes received,
 *	or an error.
 */
int sock_recvmsg(struct socket *sock, struct msghdr *msg, int flags)
{
	int err;

	err = security_socket_recvmsg(sock, msg, msg_data_left(msg), flags);
	if (err)
		return err;

	return sock_recvmsg_nosec(sock, msg, flags);
}
EXPORT_SYMBOL(sock_recvmsg);
L
Linus Torvalds 已提交
896

897
/**
898 899 900 901 902 903 904
 *	kernel_recvmsg - Receive a message from a socket (kernel space)
 *	@sock: The socket to receive the message from
 *	@msg: Received message
 *	@vec: Input s/g array for message data
 *	@num: Size of input s/g array
 *	@size: Number of bytes to read
 *	@flags: Message flags (MSG_DONTWAIT, etc...)
905
 *
906 907 908
 *	On return the msg structure contains the scatter/gather array passed in the
 *	vec argument. The array is modified so that it consists of the unfilled
 *	portion of the original array.
909
 *
910
 *	The returned value is the total number of bytes received, or an error.
911
 */
912

913 914
int kernel_recvmsg(struct socket *sock, struct msghdr *msg,
		   struct kvec *vec, size_t num, size_t size, int flags)
L
Linus Torvalds 已提交
915 916 917 918
{
	mm_segment_t oldfs = get_fs();
	int result;

919
	iov_iter_kvec(&msg->msg_iter, READ, vec, num, size);
L
Linus Torvalds 已提交
920
	set_fs(KERNEL_DS);
921
	result = sock_recvmsg(sock, msg, flags);
L
Linus Torvalds 已提交
922 923 924
	set_fs(oldfs);
	return result;
}
925
EXPORT_SYMBOL(kernel_recvmsg);
L
Linus Torvalds 已提交
926

927 928
/*
 * ->sendpage file operation: forward the page to kernel_sendpage().
 * MSG_DONTWAIT is added for non-blocking files; @ppos is not used.
 */
static ssize_t sock_sendpage(struct file *file, struct page *page,
			     int offset, size_t size, loff_t *ppos, int more)
{
	struct socket *sock = file->private_data;
	/* more is a combination of MSG_MORE and MSG_SENDPAGE_NOTLAST */
	int flags = more;

	if (file->f_flags & O_NONBLOCK)
		flags |= MSG_DONTWAIT;

	return kernel_sendpage(sock, page, offset, size, flags);
}
L
Linus Torvalds 已提交
941

J
Jens Axboe 已提交
942
/*
 * ->splice_read file operation: use the protocol's own splice_read when
 * it provides one, otherwise fall back to generic_file_splice_read().
 */
static ssize_t sock_splice_read(struct file *file, loff_t *ppos,
				struct pipe_inode_info *pipe, size_t len,
				unsigned int flags)
{
	struct socket *sock = file->private_data;

	if (likely(sock->ops->splice_read))
		return sock->ops->splice_read(sock, ppos, pipe, len, flags);

	return generic_file_splice_read(file, ppos, pipe, len, flags);
}

954
static ssize_t sock_read_iter(struct kiocb *iocb, struct iov_iter *to)
955
{
956 957
	struct file *file = iocb->ki_filp;
	struct socket *sock = file->private_data;
958 959
	struct msghdr msg = {.msg_iter = *to,
			     .msg_iocb = iocb};
960
	ssize_t res;
961

962 963 964 965
	if (file->f_flags & O_NONBLOCK)
		msg.msg_flags = MSG_DONTWAIT;

	if (iocb->ki_pos != 0)
L
Linus Torvalds 已提交
966
		return -ESPIPE;
967

C
Christoph Hellwig 已提交
968
	if (!iov_iter_count(to))	/* Match SYS5 behaviour */
L
Linus Torvalds 已提交
969 970
		return 0;

971
	res = sock_recvmsg(sock, &msg, msg.msg_flags);
972 973
	*to = msg.msg_iter;
	return res;
L
Linus Torvalds 已提交
974 975
}

976
static ssize_t sock_write_iter(struct kiocb *iocb, struct iov_iter *from)
977
{
978 979
	struct file *file = iocb->ki_filp;
	struct socket *sock = file->private_data;
980 981
	struct msghdr msg = {.msg_iter = *from,
			     .msg_iocb = iocb};
982
	ssize_t res;
L
Linus Torvalds 已提交
983

984
	if (iocb->ki_pos != 0)
985
		return -ESPIPE;
986

987 988 989
	if (file->f_flags & O_NONBLOCK)
		msg.msg_flags = MSG_DONTWAIT;

990 991 992
	if (sock->type == SOCK_SEQPACKET)
		msg.msg_flags |= MSG_EOR;

993
	res = sock_sendmsg(sock, &msg);
994 995
	*from = msg.msg_iter;
	return res;
L
Linus Torvalds 已提交
996 997 998 999 1000 1001 1002
}

/*
 * Atomic setting of ioctl hooks to avoid race
 * with module unload.
 */

A
Arjan van de Ven 已提交
1003
static DEFINE_MUTEX(br_ioctl_mutex);
1004
static int (*br_ioctl_hook) (struct net *, unsigned int cmd, void __user *arg);
L
Linus Torvalds 已提交
1005

1006
void brioctl_set(int (*hook) (struct net *, unsigned int, void __user *))
L
Linus Torvalds 已提交
1007
{
A
Arjan van de Ven 已提交
1008
	mutex_lock(&br_ioctl_mutex);
L
Linus Torvalds 已提交
1009
	br_ioctl_hook = hook;
A
Arjan van de Ven 已提交
1010
	mutex_unlock(&br_ioctl_mutex);
L
Linus Torvalds 已提交
1011 1012 1013
}
EXPORT_SYMBOL(brioctl_set);

A
Arjan van de Ven 已提交
1014
static DEFINE_MUTEX(vlan_ioctl_mutex);
1015
static int (*vlan_ioctl_hook) (struct net *, void __user *arg);
L
Linus Torvalds 已提交
1016

1017
void vlan_ioctl_set(int (*hook) (struct net *, void __user *))
L
Linus Torvalds 已提交
1018
{
A
Arjan van de Ven 已提交
1019
	mutex_lock(&vlan_ioctl_mutex);
L
Linus Torvalds 已提交
1020
	vlan_ioctl_hook = hook;
A
Arjan van de Ven 已提交
1021
	mutex_unlock(&vlan_ioctl_mutex);
L
Linus Torvalds 已提交
1022 1023 1024
}
EXPORT_SYMBOL(vlan_ioctl_set);

A
Arjan van de Ven 已提交
1025
static DEFINE_MUTEX(dlci_ioctl_mutex);
1026
static int (*dlci_ioctl_hook) (unsigned int, void __user *);
L
Linus Torvalds 已提交
1027

1028
void dlci_ioctl_set(int (*hook) (unsigned int, void __user *))
L
Linus Torvalds 已提交
1029
{
A
Arjan van de Ven 已提交
1030
	mutex_lock(&dlci_ioctl_mutex);
L
Linus Torvalds 已提交
1031
	dlci_ioctl_hook = hook;
A
Arjan van de Ven 已提交
1032
	mutex_unlock(&dlci_ioctl_mutex);
L
Linus Torvalds 已提交
1033 1034 1035
}
EXPORT_SYMBOL(dlci_ioctl_set);

1036
static long sock_do_ioctl(struct net *net, struct socket *sock,
1037
			  unsigned int cmd, unsigned long arg)
1038 1039 1040 1041 1042 1043 1044 1045 1046 1047
{
	int err;
	void __user *argp = (void __user *)arg;

	err = sock->ops->ioctl(sock, cmd, arg);

	/*
	 * If this ioctl is unknown try to hand it down
	 * to the NIC driver.
	 */
1048 1049
	if (err != -ENOIOCTLCMD)
		return err;
1050

1051 1052 1053 1054 1055 1056 1057 1058 1059
	if (cmd == SIOCGIFCONF) {
		struct ifconf ifc;
		if (copy_from_user(&ifc, argp, sizeof(struct ifconf)))
			return -EFAULT;
		rtnl_lock();
		err = dev_ifconf(net, &ifc, sizeof(struct ifreq));
		rtnl_unlock();
		if (!err && copy_to_user(argp, &ifc, sizeof(struct ifconf)))
			err = -EFAULT;
1060 1061 1062
	} else {
		struct ifreq ifr;
		bool need_copyout;
1063
		if (copy_from_user(&ifr, argp, sizeof(struct ifreq)))
1064 1065 1066
			return -EFAULT;
		err = dev_ioctl(net, cmd, &ifr, &need_copyout);
		if (!err && need_copyout)
1067
			if (copy_to_user(argp, &ifr, sizeof(struct ifreq)))
1068
				return -EFAULT;
1069
	}
1070 1071 1072
	return err;
}

L
Linus Torvalds 已提交
1073 1074 1075 1076 1077
/*
 *	With an ioctl, arg may well be a user mode pointer, but we don't know
 *	what to do with it - that's up to the protocol still.
 */

1078 1079 1080 1081 1082 1083 1084
/**
 *	get_net_ns - increment the refcount of the network namespace
 *	@ns: common namespace (net)
 *
 *	Returns the net's common namespace.
 */

1085
struct ns_common *get_net_ns(struct ns_common *ns)
1086 1087 1088
{
	return &get_net(container_of(ns, struct net, ns))->ns;
}
1089
EXPORT_SYMBOL_GPL(get_net_ns);
1090

L
Linus Torvalds 已提交
1091 1092 1093
static long sock_ioctl(struct file *file, unsigned cmd, unsigned long arg)
{
	struct socket *sock;
1094
	struct sock *sk;
L
Linus Torvalds 已提交
1095 1096
	void __user *argp = (void __user *)arg;
	int pid, err;
1097
	struct net *net;
L
Linus Torvalds 已提交
1098

1099
	sock = file->private_data;
1100
	sk = sock->sk;
1101
	net = sock_net(sk);
1102 1103 1104 1105 1106 1107 1108 1109 1110
	if (unlikely(cmd >= SIOCDEVPRIVATE && cmd <= (SIOCDEVPRIVATE + 15))) {
		struct ifreq ifr;
		bool need_copyout;
		if (copy_from_user(&ifr, argp, sizeof(struct ifreq)))
			return -EFAULT;
		err = dev_ioctl(net, cmd, &ifr, &need_copyout);
		if (!err && need_copyout)
			if (copy_to_user(argp, &ifr, sizeof(struct ifreq)))
				return -EFAULT;
L
Linus Torvalds 已提交
1111
	} else
J
Johannes Berg 已提交
1112
#ifdef CONFIG_WEXT_CORE
L
Linus Torvalds 已提交
1113
	if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST) {
1114
		err = wext_handle_ioctl(net, cmd, argp);
L
Linus Torvalds 已提交
1115
	} else
J
Johannes Berg 已提交
1116
#endif
1117
		switch (cmd) {
L
Linus Torvalds 已提交
1118 1119 1120 1121 1122
		case FIOSETOWN:
		case SIOCSPGRP:
			err = -EFAULT;
			if (get_user(pid, (int __user *)argp))
				break;
1123
			err = f_setown(sock->file, pid, 1);
L
Linus Torvalds 已提交
1124 1125 1126
			break;
		case FIOGETOWN:
		case SIOCGPGRP:
1127
			err = put_user(f_getown(sock->file),
1128
				       (int __user *)argp);
L
Linus Torvalds 已提交
1129 1130 1131 1132 1133 1134 1135 1136 1137
			break;
		case SIOCGIFBR:
		case SIOCSIFBR:
		case SIOCBRADDBR:
		case SIOCBRDELBR:
			err = -ENOPKG;
			if (!br_ioctl_hook)
				request_module("bridge");

A
Arjan van de Ven 已提交
1138
			mutex_lock(&br_ioctl_mutex);
1139
			if (br_ioctl_hook)
1140
				err = br_ioctl_hook(net, cmd, argp);
A
Arjan van de Ven 已提交
1141
			mutex_unlock(&br_ioctl_mutex);
L
Linus Torvalds 已提交
1142 1143 1144 1145 1146 1147 1148
			break;
		case SIOCGIFVLAN:
		case SIOCSIFVLAN:
			err = -ENOPKG;
			if (!vlan_ioctl_hook)
				request_module("8021q");

A
Arjan van de Ven 已提交
1149
			mutex_lock(&vlan_ioctl_mutex);
L
Linus Torvalds 已提交
1150
			if (vlan_ioctl_hook)
1151
				err = vlan_ioctl_hook(net, argp);
A
Arjan van de Ven 已提交
1152
			mutex_unlock(&vlan_ioctl_mutex);
L
Linus Torvalds 已提交
1153 1154 1155 1156 1157 1158 1159
			break;
		case SIOCADDDLCI:
		case SIOCDELDLCI:
			err = -ENOPKG;
			if (!dlci_ioctl_hook)
				request_module("dlci");

1160 1161
			mutex_lock(&dlci_ioctl_mutex);
			if (dlci_ioctl_hook)
L
Linus Torvalds 已提交
1162
				err = dlci_ioctl_hook(cmd, argp);
1163
			mutex_unlock(&dlci_ioctl_mutex);
L
Linus Torvalds 已提交
1164
			break;
1165 1166 1167 1168 1169 1170 1171
		case SIOCGSKNS:
			err = -EPERM;
			if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
				break;

			err = open_related_ns(&net->ns, get_net_ns);
			break;
1172 1173
		case SIOCGSTAMP_OLD:
		case SIOCGSTAMPNS_OLD:
1174 1175 1176 1177 1178
			if (!sock->ops->gettstamp) {
				err = -ENOIOCTLCMD;
				break;
			}
			err = sock->ops->gettstamp(sock, argp,
1179 1180
						   cmd == SIOCGSTAMP_OLD,
						   !IS_ENABLED(CONFIG_64BIT));
1181
			break;
1182 1183 1184 1185 1186 1187 1188 1189 1190
		case SIOCGSTAMP_NEW:
		case SIOCGSTAMPNS_NEW:
			if (!sock->ops->gettstamp) {
				err = -ENOIOCTLCMD;
				break;
			}
			err = sock->ops->gettstamp(sock, argp,
						   cmd == SIOCGSTAMP_NEW,
						   false);
1191
			break;
L
Linus Torvalds 已提交
1192
		default:
1193
			err = sock_do_ioctl(net, sock, cmd, arg);
L
Linus Torvalds 已提交
1194
			break;
1195
		}
L
Linus Torvalds 已提交
1196 1197 1198
	return err;
}

1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211
/**
 *	sock_create_lite - creates a socket
 *	@family: protocol family (AF_INET, ...)
 *	@type: communication type (SOCK_STREAM, ...)
 *	@protocol: protocol (0, ...)
 *	@res: new socket
 *
 *	Creates a new socket and assigns it to @res, passing through LSM.
 *	The new socket initialization is not complete, see kernel_accept().
 *	Returns 0 or an error. On failure @res is set to %NULL.
 *	This function internally uses GFP_KERNEL.
 */

L
Linus Torvalds 已提交
1212 1213 1214 1215
int sock_create_lite(int family, int type, int protocol, struct socket **res)
{
	int err;
	struct socket *sock = NULL;
1216

L
Linus Torvalds 已提交
1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227
	err = security_socket_create(family, type, protocol, 1);
	if (err)
		goto out;

	sock = sock_alloc();
	if (!sock) {
		err = -ENOMEM;
		goto out;
	}

	sock->type = type;
V
Venkat Yekkirala 已提交
1228 1229 1230 1231
	err = security_socket_post_create(sock, family, type, protocol, 1);
	if (err)
		goto out_release;

L
Linus Torvalds 已提交
1232 1233 1234
out:
	*res = sock;
	return err;
V
Venkat Yekkirala 已提交
1235 1236 1237 1238
out_release:
	sock_release(sock);
	sock = NULL;
	goto out;
L
Linus Torvalds 已提交
1239
}
1240
EXPORT_SYMBOL(sock_create_lite);
L
Linus Torvalds 已提交
1241 1242

/* No kernel lock held - perfect */
A
Al Viro 已提交
1243
static __poll_t sock_poll(struct file *file, poll_table *wait)
L
Linus Torvalds 已提交
1244
{
C
Christoph Hellwig 已提交
1245
	struct socket *sock = file->private_data;
1246
	__poll_t events = poll_requested_events(wait), flag = 0;
1247

1248 1249
	if (!sock->ops->poll)
		return 0;
1250

1251 1252 1253 1254 1255 1256 1257 1258 1259 1260
	if (sk_can_busy_loop(sock->sk)) {
		/* poll once if requested by the syscall */
		if (events & POLL_BUSY_LOOP)
			sk_busy_loop(sock->sk, 1);

		/* if this socket can poll_ll, tell the system call */
		flag = POLL_BUSY_LOOP;
	}

	return sock->ops->poll(file, sock, wait) | flag;
L
Linus Torvalds 已提交
1261 1262
}

1263
static int sock_mmap(struct file *file, struct vm_area_struct *vma)
L
Linus Torvalds 已提交
1264
{
1265
	struct socket *sock = file->private_data;
L
Linus Torvalds 已提交
1266 1267 1268 1269

	return sock->ops->mmap(file, sock, vma);
}

/*
 * Release the socket that SOCKET_I() maps @inode to. @filp is not used.
 * Always returns 0.
 */
static int sock_close(struct inode *inode, struct file *filp)
{
	__sock_release(SOCKET_I(inode), inode);
	return 0;
}

/*
 *	Update the socket async list
 *
 *	Fasync_list locking strategy.
 *
 *	1. fasync_list is modified only under process context socket lock
 *	   i.e. under semaphore.
 *	2. fasync_list is used under read_lock(&sk->sk_callback_lock)
1284
 *	   or under socket lock
L
Linus Torvalds 已提交
1285 1286 1287 1288
 */

static int sock_fasync(int fd, struct file *filp, int on)
{
1289 1290
	struct socket *sock = filp->private_data;
	struct sock *sk = sock->sk;
1291
	struct socket_wq *wq;
L
Linus Torvalds 已提交
1292

1293
	if (sk == NULL)
L
Linus Torvalds 已提交
1294 1295 1296
		return -EINVAL;

	lock_sock(sk);
1297
	wq = sock->wq;
1298
	fasync_helper(fd, filp, on, &wq->fasync_list);
L
Linus Torvalds 已提交
1299

1300
	if (!wq->fasync_list)
1301 1302
		sock_reset_flag(sk, SOCK_FASYNC);
	else
E
Eric Dumazet 已提交
1303
		sock_set_flag(sk, SOCK_FASYNC);
L
Linus Torvalds 已提交
1304

1305
	release_sock(sk);
L
Linus Torvalds 已提交
1306 1307 1308
	return 0;
}

1309
/* This function may be called only under rcu_lock */
L
Linus Torvalds 已提交
1310

1311
int sock_wake_async(struct socket_wq *wq, int how, int band)
L
Linus Torvalds 已提交
1312
{
1313
	if (!wq || !wq->fasync_list)
L
Linus Torvalds 已提交
1314
		return -1;
1315

1316
	switch (how) {
1317
	case SOCK_WAKE_WAITD:
1318
		if (test_bit(SOCKWQ_ASYNC_WAITDATA, &wq->flags))
L
Linus Torvalds 已提交
1319 1320
			break;
		goto call_kill;
1321
	case SOCK_WAKE_SPACE:
1322
		if (!test_and_clear_bit(SOCKWQ_ASYNC_NOSPACE, &wq->flags))
L
Linus Torvalds 已提交
1323 1324
			break;
		/* fall through */
1325
	case SOCK_WAKE_IO:
1326
call_kill:
1327
		kill_fasync(&wq->fasync_list, SIGIO, band);
L
Linus Torvalds 已提交
1328
		break;
1329
	case SOCK_WAKE_URG:
1330
		kill_fasync(&wq->fasync_list, SIGURG, band);
L
Linus Torvalds 已提交
1331
	}
1332

L
Linus Torvalds 已提交
1333 1334
	return 0;
}
1335
EXPORT_SYMBOL(sock_wake_async);
L
Linus Torvalds 已提交
1336

1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351
/**
 *	__sock_create - creates a socket
 *	@net: net namespace
 *	@family: protocol family (AF_INET, ...)
 *	@type: communication type (SOCK_STREAM, ...)
 *	@protocol: protocol (0, ...)
 *	@res: new socket
 *	@kern: boolean for kernel space sockets
 *
 *	Creates a new socket and assigns it to @res, passing through LSM.
 *	Returns 0 or an error. On failure @res is set to %NULL. @kern must
 *	be set to true if the socket resides in kernel space.
 *	This function internally uses GFP_KERNEL.
 */

P
Pavel Emelyanov 已提交
1352
int __sock_create(struct net *net, int family, int type, int protocol,
1353
			 struct socket **res, int kern)
L
Linus Torvalds 已提交
1354 1355 1356
{
	int err;
	struct socket *sock;
1357
	const struct net_proto_family *pf;
L
Linus Torvalds 已提交
1358 1359

	/*
1360
	 *      Check protocol is in range
L
Linus Torvalds 已提交
1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372
	 */
	if (family < 0 || family >= NPROTO)
		return -EAFNOSUPPORT;
	if (type < 0 || type >= SOCK_MAX)
		return -EINVAL;

	/* Compatibility.

	   This uglymoron is moved from INET layer to here to avoid
	   deadlock in module load.
	 */
	if (family == PF_INET && type == SOCK_PACKET) {
1373 1374
		pr_info_once("%s uses obsolete (PF_INET,SOCK_PACKET)\n",
			     current->comm);
L
Linus Torvalds 已提交
1375 1376 1377 1378 1379 1380
		family = PF_PACKET;
	}

	err = security_socket_create(family, type, protocol, kern);
	if (err)
		return err;
1381

1382 1383 1384 1385 1386 1387 1388
	/*
	 *	Allocate the socket and allow the family to set things up. if
	 *	the protocol is 0, the family is instructed to select an appropriate
	 *	default.
	 */
	sock = sock_alloc();
	if (!sock) {
1389
		net_warn_ratelimited("socket: no more sockets\n");
1390 1391 1392 1393 1394 1395
		return -ENFILE;	/* Not exactly a match, but its the
				   closest posix thing */
	}

	sock->type = type;

1396
#ifdef CONFIG_MODULES
1397 1398 1399
	/* Attempt to load a protocol module if the find failed.
	 *
	 * 12/09/1996 Marcin: But! this makes REALLY only sense, if the user
L
Linus Torvalds 已提交
1400 1401 1402
	 * requested real, full-featured networking support upon configuration.
	 * Otherwise module support will break!
	 */
E
Eric Dumazet 已提交
1403
	if (rcu_access_pointer(net_families[family]) == NULL)
1404
		request_module("net-pf-%d", family);
L
Linus Torvalds 已提交
1405 1406
#endif

1407 1408 1409 1410 1411
	rcu_read_lock();
	pf = rcu_dereference(net_families[family]);
	err = -EAFNOSUPPORT;
	if (!pf)
		goto out_release;
L
Linus Torvalds 已提交
1412 1413 1414 1415 1416

	/*
	 * We will call the ->create function, that possibly is in a loadable
	 * module, so we have to bump that loadable module refcnt first.
	 */
1417
	if (!try_module_get(pf->owner))
L
Linus Torvalds 已提交
1418 1419
		goto out_release;

1420 1421 1422
	/* Now protected by module ref count */
	rcu_read_unlock();

1423
	err = pf->create(net, sock, protocol, kern);
1424
	if (err < 0)
L
Linus Torvalds 已提交
1425
		goto out_module_put;
1426

L
Linus Torvalds 已提交
1427 1428 1429 1430
	/*
	 * Now to bump the refcnt of the [loadable] module that owns this
	 * socket at sock_release time we decrement its refcnt.
	 */
1431 1432 1433
	if (!try_module_get(sock->ops->owner))
		goto out_module_busy;

L
Linus Torvalds 已提交
1434 1435 1436 1437
	/*
	 * Now that we're done with the ->create function, the [loadable]
	 * module can have its refcnt decremented
	 */
1438
	module_put(pf->owner);
V
Venkat Yekkirala 已提交
1439 1440
	err = security_socket_post_create(sock, family, type, protocol, kern);
	if (err)
1441
		goto out_sock_release;
1442
	*res = sock;
L
Linus Torvalds 已提交
1443

1444 1445 1446 1447
	return 0;

out_module_busy:
	err = -EAFNOSUPPORT;
L
Linus Torvalds 已提交
1448
out_module_put:
1449 1450 1451
	sock->ops = NULL;
	module_put(pf->owner);
out_sock_release:
L
Linus Torvalds 已提交
1452
	sock_release(sock);
1453 1454 1455 1456 1457
	return err;

out_release:
	rcu_read_unlock();
	goto out_sock_release;
L
Linus Torvalds 已提交
1458
}
P
Pavel Emelyanov 已提交
1459
EXPORT_SYMBOL(__sock_create);
L
Linus Torvalds 已提交
1460

1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471
/**
 *	sock_create - creates a socket
 *	@family: protocol family (AF_INET, ...)
 *	@type: communication type (SOCK_STREAM, ...)
 *	@protocol: protocol (0, ...)
 *	@res: new socket
 *
 *	A wrapper around __sock_create().
 *	Returns 0 or an error. This function internally uses GFP_KERNEL.
 */

L
Linus Torvalds 已提交
1472 1473
int sock_create(int family, int type, int protocol, struct socket **res)
{
1474
	return __sock_create(current->nsproxy->net_ns, family, type, protocol, res, 0);
L
Linus Torvalds 已提交
1475
}
1476
EXPORT_SYMBOL(sock_create);
L
Linus Torvalds 已提交
1477

/**
 *	sock_create_kern - creates a socket (kernel space)
 *	@net: net namespace
 *	@family: protocol family (AF_INET, ...)
 *	@type: communication type (SOCK_STREAM, ...)
 *	@protocol: protocol (0, ...)
 *	@res: new socket
 *
 *	A wrapper around __sock_create() that passes @kern as 1.
 *	Returns 0 or an error. This function internally uses GFP_KERNEL.
 */

int sock_create_kern(struct net *net, int family, int type, int protocol, struct socket **res)
{
	return __sock_create(net, family, type, protocol, res, 1);
}
EXPORT_SYMBOL(sock_create_kern);

1496
int __sys_socket(int family, int type, int protocol)
L
Linus Torvalds 已提交
1497 1498 1499
{
	int retval;
	struct socket *sock;
1500 1501
	int flags;

1502 1503 1504 1505 1506 1507
	/* Check the SOCK_* constants for consistency.  */
	BUILD_BUG_ON(SOCK_CLOEXEC != O_CLOEXEC);
	BUILD_BUG_ON((SOCK_MAX | SOCK_TYPE_MASK) != SOCK_TYPE_MASK);
	BUILD_BUG_ON(SOCK_CLOEXEC & SOCK_TYPE_MASK);
	BUILD_BUG_ON(SOCK_NONBLOCK & SOCK_TYPE_MASK);

1508
	flags = type & ~SOCK_TYPE_MASK;
1509
	if (flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
1510 1511
		return -EINVAL;
	type &= SOCK_TYPE_MASK;
L
Linus Torvalds 已提交
1512

U
Ulrich Drepper 已提交
1513 1514 1515
	if (SOCK_NONBLOCK != O_NONBLOCK && (flags & SOCK_NONBLOCK))
		flags = (flags & ~SOCK_NONBLOCK) | O_NONBLOCK;

L
Linus Torvalds 已提交
1516 1517
	retval = sock_create(family, type, protocol, &sock);
	if (retval < 0)
1518
		return retval;
L
Linus Torvalds 已提交
1519

1520
	return sock_map_fd(sock, flags & (O_CLOEXEC | O_NONBLOCK));
L
Linus Torvalds 已提交
1521 1522
}

1523 1524 1525 1526 1527
SYSCALL_DEFINE3(socket, int, family, int, type, int, protocol)
{
	return __sys_socket(family, type, protocol);
}

L
Linus Torvalds 已提交
1528 1529 1530 1531
/*
 *	Create a pair of connected sockets.
 */

1532
int __sys_socketpair(int family, int type, int protocol, int __user *usockvec)
L
Linus Torvalds 已提交
1533 1534 1535
{
	struct socket *sock1, *sock2;
	int fd1, fd2, err;
A
Al Viro 已提交
1536
	struct file *newfile1, *newfile2;
1537 1538 1539
	int flags;

	flags = type & ~SOCK_TYPE_MASK;
1540
	if (flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
1541 1542
		return -EINVAL;
	type &= SOCK_TYPE_MASK;
L
Linus Torvalds 已提交
1543

U
Ulrich Drepper 已提交
1544 1545 1546
	if (SOCK_NONBLOCK != O_NONBLOCK && (flags & SOCK_NONBLOCK))
		flags = (flags & ~SOCK_NONBLOCK) | O_NONBLOCK;

A
Al Viro 已提交
1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568
	/*
	 * reserve descriptors and make sure we won't fail
	 * to return them to userland.
	 */
	fd1 = get_unused_fd_flags(flags);
	if (unlikely(fd1 < 0))
		return fd1;

	fd2 = get_unused_fd_flags(flags);
	if (unlikely(fd2 < 0)) {
		put_unused_fd(fd1);
		return fd2;
	}

	err = put_user(fd1, &usockvec[0]);
	if (err)
		goto out;

	err = put_user(fd2, &usockvec[1]);
	if (err)
		goto out;

L
Linus Torvalds 已提交
1569 1570 1571 1572 1573 1574
	/*
	 * Obtain the first socket and check if the underlying protocol
	 * supports the socketpair call.
	 */

	err = sock_create(family, type, protocol, &sock1);
A
Al Viro 已提交
1575
	if (unlikely(err < 0))
L
Linus Torvalds 已提交
1576 1577 1578
		goto out;

	err = sock_create(family, type, protocol, &sock2);
A
Al Viro 已提交
1579 1580 1581
	if (unlikely(err < 0)) {
		sock_release(sock1);
		goto out;
1582
	}
1583

D
David Herrmann 已提交
1584 1585 1586 1587 1588 1589 1590
	err = security_socket_socketpair(sock1, sock2);
	if (unlikely(err)) {
		sock_release(sock2);
		sock_release(sock1);
		goto out;
	}

A
Al Viro 已提交
1591 1592 1593 1594 1595
	err = sock1->ops->socketpair(sock1, sock2);
	if (unlikely(err < 0)) {
		sock_release(sock2);
		sock_release(sock1);
		goto out;
1596 1597
	}

1598
	newfile1 = sock_alloc_file(sock1, flags, NULL);
1599
	if (IS_ERR(newfile1)) {
1600
		err = PTR_ERR(newfile1);
A
Al Viro 已提交
1601 1602
		sock_release(sock2);
		goto out;
1603 1604
	}

1605
	newfile2 = sock_alloc_file(sock2, flags, NULL);
1606 1607
	if (IS_ERR(newfile2)) {
		err = PTR_ERR(newfile2);
A
Al Viro 已提交
1608 1609
		fput(newfile1);
		goto out;
A
Al Viro 已提交
1610 1611
	}

A
Al Viro 已提交
1612
	audit_fd_pair(fd1, fd2);
1613

A
Al Viro 已提交
1614 1615
	fd_install(fd1, newfile1);
	fd_install(fd2, newfile2);
1616
	return 0;
L
Linus Torvalds 已提交
1617

A
Al Viro 已提交
1618
out:
1619 1620
	put_unused_fd(fd2);
	put_unused_fd(fd1);
L
Linus Torvalds 已提交
1621 1622 1623
	return err;
}

1624 1625 1626 1627 1628 1629
SYSCALL_DEFINE4(socketpair, int, family, int, type, int, protocol,
		int __user *, usockvec)
{
	return __sys_socketpair(family, type, protocol, usockvec);
}

L
Linus Torvalds 已提交
1630 1631 1632 1633 1634 1635 1636 1637
/*
 *	Bind a name to a socket. Nothing much to do here since it's
 *	the protocol's responsibility to handle the local address.
 *
 *	We move the socket address to kernel space before we call
 *	the protocol layer (having also checked the address is ok).
 */

1638
int __sys_bind(int fd, struct sockaddr __user *umyaddr, int addrlen)
L
Linus Torvalds 已提交
1639 1640
{
	struct socket *sock;
1641
	struct sockaddr_storage address;
1642
	int err, fput_needed;
L
Linus Torvalds 已提交
1643

1644
	sock = sockfd_lookup_light(fd, &err, &fput_needed);
1645
	if (sock) {
1646
		err = move_addr_to_kernel(umyaddr, addrlen, &address);
1647
		if (!err) {
1648
			err = security_socket_bind(sock,
1649
						   (struct sockaddr *)&address,
1650
						   addrlen);
1651 1652
			if (!err)
				err = sock->ops->bind(sock,
1653
						      (struct sockaddr *)
1654
						      &address, addrlen);
L
Linus Torvalds 已提交
1655
		}
1656
		fput_light(sock->file, fput_needed);
1657
	}
L
Linus Torvalds 已提交
1658 1659 1660
	return err;
}

1661 1662 1663 1664 1665
SYSCALL_DEFINE3(bind, int, fd, struct sockaddr __user *, umyaddr, int, addrlen)
{
	return __sys_bind(fd, umyaddr, addrlen);
}

L
Linus Torvalds 已提交
1666 1667 1668 1669 1670 1671
/*
 *	Perform a listen. Basically, we allow the protocol to do anything
 *	necessary for a listen, and if that works, we mark the socket as
 *	ready for listening.
 */

1672
int __sys_listen(int fd, int backlog)
L
Linus Torvalds 已提交
1673 1674
{
	struct socket *sock;
1675
	int err, fput_needed;
1676
	int somaxconn;
1677 1678 1679

	sock = sockfd_lookup_light(fd, &err, &fput_needed);
	if (sock) {
1680
		somaxconn = sock_net(sock->sk)->core.sysctl_somaxconn;
1681
		if ((unsigned int)backlog > somaxconn)
1682
			backlog = somaxconn;
L
Linus Torvalds 已提交
1683 1684

		err = security_socket_listen(sock, backlog);
1685 1686
		if (!err)
			err = sock->ops->listen(sock, backlog);
L
Linus Torvalds 已提交
1687

1688
		fput_light(sock->file, fput_needed);
L
Linus Torvalds 已提交
1689 1690 1691 1692
	}
	return err;
}

1693 1694 1695 1696 1697
SYSCALL_DEFINE2(listen, int, fd, int, backlog)
{
	return __sys_listen(fd, backlog);
}

L
Linus Torvalds 已提交
1698 1699 1700 1701 1702 1703 1704 1705 1706
/*
 *	For accept, we attempt to create a new socket, set up the link
 *	with the client, wake up the client, then return the new
 *	connected fd. We collect the address of the connector in kernel
 *	space and move it to user at the very end. This is unclean because
 *	we open the socket then return an error.
 *
 *	1003.1g adds the ability to recvmsg() to query connection pending
 *	status to recvmsg. We need to add that support in a way thats
1707
 *	clean when we restructure accept also.
L
Linus Torvalds 已提交
1708 1709
 */

1710 1711
int __sys_accept4(int fd, struct sockaddr __user *upeer_sockaddr,
		  int __user *upeer_addrlen, int flags)
L
Linus Torvalds 已提交
1712 1713
{
	struct socket *sock, *newsock;
1714
	struct file *newfile;
1715
	int err, len, newfd, fput_needed;
1716
	struct sockaddr_storage address;
L
Linus Torvalds 已提交
1717

1718
	if (flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
U
Ulrich Drepper 已提交
1719 1720 1721 1722 1723
		return -EINVAL;

	if (SOCK_NONBLOCK != O_NONBLOCK && (flags & SOCK_NONBLOCK))
		flags = (flags & ~SOCK_NONBLOCK) | O_NONBLOCK;

1724
	sock = sockfd_lookup_light(fd, &err, &fput_needed);
L
Linus Torvalds 已提交
1725 1726 1727 1728
	if (!sock)
		goto out;

	err = -ENFILE;
1729 1730
	newsock = sock_alloc();
	if (!newsock)
L
Linus Torvalds 已提交
1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741
		goto out_put;

	newsock->type = sock->type;
	newsock->ops = sock->ops;

	/*
	 * We don't need try_module_get here, as the listening socket (sock)
	 * has the protocol module (sock->ops->owner) held.
	 */
	__module_get(newsock->ops->owner);

1742
	newfd = get_unused_fd_flags(flags);
1743 1744
	if (unlikely(newfd < 0)) {
		err = newfd;
1745 1746
		sock_release(newsock);
		goto out_put;
1747
	}
1748
	newfile = sock_alloc_file(newsock, flags, sock->sk->sk_prot_creator->name);
1749
	if (IS_ERR(newfile)) {
1750 1751 1752 1753
		err = PTR_ERR(newfile);
		put_unused_fd(newfd);
		goto out_put;
	}
1754

1755 1756
	err = security_socket_accept(sock, newsock);
	if (err)
1757
		goto out_fd;
1758

1759
	err = sock->ops->accept(sock, newsock, sock->file->f_flags, false);
L
Linus Torvalds 已提交
1760
	if (err < 0)
1761
		goto out_fd;
L
Linus Torvalds 已提交
1762 1763

	if (upeer_sockaddr) {
1764 1765 1766
		len = newsock->ops->getname(newsock,
					(struct sockaddr *)&address, 2);
		if (len < 0) {
L
Linus Torvalds 已提交
1767
			err = -ECONNABORTED;
1768
			goto out_fd;
L
Linus Torvalds 已提交
1769
		}
1770
		err = move_addr_to_user(&address,
1771
					len, upeer_sockaddr, upeer_addrlen);
L
Linus Torvalds 已提交
1772
		if (err < 0)
1773
			goto out_fd;
L
Linus Torvalds 已提交
1774 1775 1776 1777
	}

	/* File flags are not inherited via accept() unlike another OSes. */

1778 1779
	fd_install(newfd, newfile);
	err = newfd;
L
Linus Torvalds 已提交
1780 1781

out_put:
1782
	fput_light(sock->file, fput_needed);
L
Linus Torvalds 已提交
1783 1784
out:
	return err;
1785
out_fd:
1786
	fput(newfile);
1787
	put_unused_fd(newfd);
L
Linus Torvalds 已提交
1788 1789 1790
	goto out_put;
}

1791 1792 1793 1794 1795 1796
SYSCALL_DEFINE4(accept4, int, fd, struct sockaddr __user *, upeer_sockaddr,
		int __user *, upeer_addrlen, int, flags)
{
	return __sys_accept4(fd, upeer_sockaddr, upeer_addrlen, flags);
}

1797 1798
SYSCALL_DEFINE3(accept, int, fd, struct sockaddr __user *, upeer_sockaddr,
		int __user *, upeer_addrlen)
U
Ulrich Drepper 已提交
1799
{
1800
	return __sys_accept4(fd, upeer_sockaddr, upeer_addrlen, 0);
U
Ulrich Drepper 已提交
1801 1802
}

L
Linus Torvalds 已提交
1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814
/*
 *	Attempt to connect to a socket with the server address.  The address
 *	is in user space so we verify it is OK and move it to kernel space.
 *
 *	For 1003.1g we need to add clean support for a bind to AF_UNSPEC to
 *	break bindings
 *
 *	NOTE: 1003.1g draft 6.3 is broken with respect to AX.25/NetROM and
 *	other SEQPACKET protocols that take time to connect() as it doesn't
 *	include the -EINPROGRESS status for such sockets.
 */

1815
int __sys_connect(int fd, struct sockaddr __user *uservaddr, int addrlen)
L
Linus Torvalds 已提交
1816 1817
{
	struct socket *sock;
1818
	struct sockaddr_storage address;
1819
	int err, fput_needed;
L
Linus Torvalds 已提交
1820

1821
	sock = sockfd_lookup_light(fd, &err, &fput_needed);
L
Linus Torvalds 已提交
1822 1823
	if (!sock)
		goto out;
1824
	err = move_addr_to_kernel(uservaddr, addrlen, &address);
L
Linus Torvalds 已提交
1825 1826 1827
	if (err < 0)
		goto out_put;

1828
	err =
1829
	    security_socket_connect(sock, (struct sockaddr *)&address, addrlen);
L
Linus Torvalds 已提交
1830 1831 1832
	if (err)
		goto out_put;

1833
	err = sock->ops->connect(sock, (struct sockaddr *)&address, addrlen,
L
Linus Torvalds 已提交
1834 1835
				 sock->file->f_flags);
out_put:
1836
	fput_light(sock->file, fput_needed);
L
Linus Torvalds 已提交
1837 1838 1839 1840
out:
	return err;
}

1841 1842 1843 1844 1845 1846
SYSCALL_DEFINE3(connect, int, fd, struct sockaddr __user *, uservaddr,
		int, addrlen)
{
	return __sys_connect(fd, uservaddr, addrlen);
}

L
Linus Torvalds 已提交
1847 1848 1849 1850 1851
/*
 *	Get the local address ('name') of a socket object. Move the obtained
 *	name to user space.
 */

1852 1853
int __sys_getsockname(int fd, struct sockaddr __user *usockaddr,
		      int __user *usockaddr_len)
L
Linus Torvalds 已提交
1854 1855
{
	struct socket *sock;
1856
	struct sockaddr_storage address;
1857
	int err, fput_needed;
1858

1859
	sock = sockfd_lookup_light(fd, &err, &fput_needed);
L
Linus Torvalds 已提交
1860 1861 1862 1863 1864 1865 1866
	if (!sock)
		goto out;

	err = security_socket_getsockname(sock);
	if (err)
		goto out_put;

1867 1868
	err = sock->ops->getname(sock, (struct sockaddr *)&address, 0);
	if (err < 0)
L
Linus Torvalds 已提交
1869
		goto out_put;
1870 1871
        /* "err" is actually length in this case */
	err = move_addr_to_user(&address, err, usockaddr, usockaddr_len);
L
Linus Torvalds 已提交
1872 1873

out_put:
1874
	fput_light(sock->file, fput_needed);
L
Linus Torvalds 已提交
1875 1876 1877 1878
out:
	return err;
}

1879 1880 1881 1882 1883 1884
SYSCALL_DEFINE3(getsockname, int, fd, struct sockaddr __user *, usockaddr,
		int __user *, usockaddr_len)
{
	return __sys_getsockname(fd, usockaddr, usockaddr_len);
}

L
Linus Torvalds 已提交
1885 1886 1887 1888 1889
/*
 *	Get the remote address ('name') of a socket object. Move the obtained
 *	name to user space.
 */

1890 1891
int __sys_getpeername(int fd, struct sockaddr __user *usockaddr,
		      int __user *usockaddr_len)
L
Linus Torvalds 已提交
1892 1893
{
	struct socket *sock;
1894
	struct sockaddr_storage address;
1895
	int err, fput_needed;
L
Linus Torvalds 已提交
1896

1897 1898
	sock = sockfd_lookup_light(fd, &err, &fput_needed);
	if (sock != NULL) {
L
Linus Torvalds 已提交
1899 1900
		err = security_socket_getpeername(sock);
		if (err) {
1901
			fput_light(sock->file, fput_needed);
L
Linus Torvalds 已提交
1902 1903 1904
			return err;
		}

1905 1906 1907 1908
		err = sock->ops->getname(sock, (struct sockaddr *)&address, 1);
		if (err >= 0)
			/* "err" is actually length in this case */
			err = move_addr_to_user(&address, err, usockaddr,
1909
						usockaddr_len);
1910
		fput_light(sock->file, fput_needed);
L
Linus Torvalds 已提交
1911 1912 1913 1914
	}
	return err;
}

1915 1916 1917 1918 1919 1920
SYSCALL_DEFINE3(getpeername, int, fd, struct sockaddr __user *, usockaddr,
		int __user *, usockaddr_len)
{
	return __sys_getpeername(fd, usockaddr, usockaddr_len);
}

L
Linus Torvalds 已提交
1921 1922 1923 1924 1925
/*
 *	Send a datagram to a given address. We move the address into kernel
 *	space and check the user space data area is readable before invoking
 *	the protocol.
 */
1926 1927
int __sys_sendto(int fd, void __user *buff, size_t len, unsigned int flags,
		 struct sockaddr __user *addr,  int addr_len)
L
Linus Torvalds 已提交
1928 1929
{
	struct socket *sock;
1930
	struct sockaddr_storage address;
L
Linus Torvalds 已提交
1931 1932 1933
	int err;
	struct msghdr msg;
	struct iovec iov;
1934 1935
	int fput_needed;

1936 1937 1938
	err = import_single_range(WRITE, buff, len, &iov, &msg.msg_iter);
	if (unlikely(err))
		return err;
1939 1940
	sock = sockfd_lookup_light(fd, &err, &fput_needed);
	if (!sock)
1941
		goto out;
1942

1943 1944 1945 1946
	msg.msg_name = NULL;
	msg.msg_control = NULL;
	msg.msg_controllen = 0;
	msg.msg_namelen = 0;
1947
	if (addr) {
1948
		err = move_addr_to_kernel(addr, addr_len, &address);
L
Linus Torvalds 已提交
1949 1950
		if (err < 0)
			goto out_put;
1951
		msg.msg_name = (struct sockaddr *)&address;
1952
		msg.msg_namelen = addr_len;
L
Linus Torvalds 已提交
1953 1954 1955 1956
	}
	if (sock->file->f_flags & O_NONBLOCK)
		flags |= MSG_DONTWAIT;
	msg.msg_flags = flags;
1957
	err = sock_sendmsg(sock, &msg);
L
Linus Torvalds 已提交
1958

1959
out_put:
1960
	fput_light(sock->file, fput_needed);
1961
out:
L
Linus Torvalds 已提交
1962 1963 1964
	return err;
}

1965 1966 1967 1968 1969 1970 1971
/* sendto(2) entry point; all work is done by __sys_sendto(). */
SYSCALL_DEFINE6(sendto, int, fd, void __user *, buff, size_t, len,
		unsigned int, flags,
		struct sockaddr __user *, addr, int, addr_len)
{
	return __sys_sendto(fd, buff, len, flags, addr, addr_len);
}

L
Linus Torvalds 已提交
1972
/*
1973
 *	Send a datagram down a socket.
L
Linus Torvalds 已提交
1974 1975
 */

1976
SYSCALL_DEFINE4(send, int, fd, void __user *, buff, size_t, len,
1977
		unsigned int, flags)
L
Linus Torvalds 已提交
1978
{
1979
	return __sys_sendto(fd, buff, len, flags, NULL, 0);
L
Linus Torvalds 已提交
1980 1981 1982
}

/*
1983
 *	Receive a frame from the socket and optionally record the address of the
L
Linus Torvalds 已提交
1984 1985 1986
 *	sender. We verify the buffers are writable and if needed move the
 *	sender address from kernel to user space.
 */
1987 1988
int __sys_recvfrom(int fd, void __user *ubuf, size_t size, unsigned int flags,
		   struct sockaddr __user *addr, int __user *addr_len)
L
Linus Torvalds 已提交
1989 1990 1991 1992
{
	struct socket *sock;
	struct iovec iov;
	struct msghdr msg;
1993
	struct sockaddr_storage address;
1994
	int err, err2;
1995 1996
	int fput_needed;

1997 1998 1999
	err = import_single_range(READ, ubuf, size, &iov, &msg.msg_iter);
	if (unlikely(err))
		return err;
2000
	sock = sockfd_lookup_light(fd, &err, &fput_needed);
L
Linus Torvalds 已提交
2001
	if (!sock)
2002
		goto out;
L
Linus Torvalds 已提交
2003

2004 2005
	msg.msg_control = NULL;
	msg.msg_controllen = 0;
2006 2007 2008 2009
	/* Save some cycles and don't copy the address if not needed */
	msg.msg_name = addr ? (struct sockaddr *)&address : NULL;
	/* We assume all kernel code knows the size of sockaddr_storage */
	msg.msg_namelen = 0;
2010
	msg.msg_iocb = NULL;
2011
	msg.msg_flags = 0;
L
Linus Torvalds 已提交
2012 2013
	if (sock->file->f_flags & O_NONBLOCK)
		flags |= MSG_DONTWAIT;
2014
	err = sock_recvmsg(sock, &msg, flags);
L
Linus Torvalds 已提交
2015

2016
	if (err >= 0 && addr != NULL) {
2017
		err2 = move_addr_to_user(&address,
2018
					 msg.msg_namelen, addr, addr_len);
2019 2020
		if (err2 < 0)
			err = err2;
L
Linus Torvalds 已提交
2021
	}
2022 2023

	fput_light(sock->file, fput_needed);
2024
out:
L
Linus Torvalds 已提交
2025 2026 2027
	return err;
}

2028 2029 2030 2031 2032 2033 2034
/* recvfrom(2) entry point; all work is done by __sys_recvfrom(). */
SYSCALL_DEFINE6(recvfrom, int, fd, void __user *, ubuf, size_t, size,
		unsigned int, flags,
		struct sockaddr __user *, addr, int __user *, addr_len)
{
	return __sys_recvfrom(fd, ubuf, size, flags, addr, addr_len);
}

L
Linus Torvalds 已提交
2035
/*
2036
 *	Receive a datagram from a socket.
L
Linus Torvalds 已提交
2037 2038
 */

2039 2040
SYSCALL_DEFINE4(recv, int, fd, void __user *, ubuf, size_t, size,
		unsigned int, flags)
L
Linus Torvalds 已提交
2041
{
2042
	return __sys_recvfrom(fd, ubuf, size, flags, NULL, NULL);
L
Linus Torvalds 已提交
2043 2044 2045 2046 2047 2048 2049
}

/*
 *	Set a socket option. Because we don't know the option lengths we have
 *	to pass the user mode parameter for the protocols to sort out.
 */

2050 2051
static int __sys_setsockopt(int fd, int level, int optname,
			    char __user *optval, int optlen)
L
Linus Torvalds 已提交
2052
{
2053 2054
	mm_segment_t oldfs = get_fs();
	char *kernel_optval = NULL;
2055
	int err, fput_needed;
L
Linus Torvalds 已提交
2056 2057 2058 2059
	struct socket *sock;

	if (optlen < 0)
		return -EINVAL;
2060 2061 2062 2063

	sock = sockfd_lookup_light(fd, &err, &fput_needed);
	if (sock != NULL) {
		err = security_socket_setsockopt(sock, level, optname);
2064 2065
		if (err)
			goto out_put;
L
Linus Torvalds 已提交
2066

2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082
		err = BPF_CGROUP_RUN_PROG_SETSOCKOPT(sock->sk, &level,
						     &optname, optval, &optlen,
						     &kernel_optval);

		if (err < 0) {
			goto out_put;
		} else if (err > 0) {
			err = 0;
			goto out_put;
		}

		if (kernel_optval) {
			set_fs(KERNEL_DS);
			optval = (char __user __force *)kernel_optval;
		}

L
Linus Torvalds 已提交
2083
		if (level == SOL_SOCKET)
2084 2085 2086
			err =
			    sock_setsockopt(sock, level, optname, optval,
					    optlen);
L
Linus Torvalds 已提交
2087
		else
2088 2089 2090
			err =
			    sock->ops->setsockopt(sock, level, optname, optval,
						  optlen);
2091 2092 2093 2094 2095

		if (kernel_optval) {
			set_fs(oldfs);
			kfree(kernel_optval);
		}
2096 2097
out_put:
		fput_light(sock->file, fput_needed);
L
Linus Torvalds 已提交
2098 2099 2100 2101
	}
	return err;
}

2102 2103 2104 2105 2106 2107
/* setsockopt(2) entry point; all work is done by __sys_setsockopt(). */
SYSCALL_DEFINE5(setsockopt, int, fd, int, level, int, optname,
		char __user *, optval,
		int, optlen)
{
	return __sys_setsockopt(fd, level, optname, optval, optlen);
}

L
Linus Torvalds 已提交
2108 2109 2110 2111 2112
/*
 *	Get a socket option. Because we don't know the option lengths we have
 *	to pass a user mode parameter for the protocols to sort out.
 */

2113 2114
static int __sys_getsockopt(int fd, int level, int optname,
			    char __user *optval, int __user *optlen)
L
Linus Torvalds 已提交
2115
{
2116
	int err, fput_needed;
L
Linus Torvalds 已提交
2117
	struct socket *sock;
2118
	int max_optlen;
L
Linus Torvalds 已提交
2119

2120 2121
	sock = sockfd_lookup_light(fd, &err, &fput_needed);
	if (sock != NULL) {
2122 2123 2124
		err = security_socket_getsockopt(sock, level, optname);
		if (err)
			goto out_put;
L
Linus Torvalds 已提交
2125

2126 2127
		max_optlen = BPF_CGROUP_GETSOCKOPT_MAX_OPTLEN(optlen);

L
Linus Torvalds 已提交
2128
		if (level == SOL_SOCKET)
2129 2130 2131
			err =
			    sock_getsockopt(sock, level, optname, optval,
					    optlen);
L
Linus Torvalds 已提交
2132
		else
2133 2134 2135
			err =
			    sock->ops->getsockopt(sock, level, optname, optval,
						  optlen);
2136 2137 2138 2139

		err = BPF_CGROUP_RUN_PROG_GETSOCKOPT(sock->sk, level, optname,
						     optval, optlen,
						     max_optlen, err);
2140 2141
out_put:
		fput_light(sock->file, fput_needed);
L
Linus Torvalds 已提交
2142 2143 2144 2145
	}
	return err;
}

2146 2147 2148 2149 2150 2151
/* getsockopt(2) entry point; all work is done by __sys_getsockopt(). */
SYSCALL_DEFINE5(getsockopt, int, fd, int, level, int, optname,
		char __user *, optval,
		int __user *, optlen)
{
	return __sys_getsockopt(fd, level, optname, optval, optlen);
}

L
Linus Torvalds 已提交
2152 2153 2154 2155
/*
 *	Shutdown a socket.
 */

2156
int __sys_shutdown(int fd, int how)
L
Linus Torvalds 已提交
2157
{
2158
	int err, fput_needed;
L
Linus Torvalds 已提交
2159 2160
	struct socket *sock;

2161 2162
	sock = sockfd_lookup_light(fd, &err, &fput_needed);
	if (sock != NULL) {
L
Linus Torvalds 已提交
2163
		err = security_socket_shutdown(sock, how);
2164 2165 2166
		if (!err)
			err = sock->ops->shutdown(sock, how);
		fput_light(sock->file, fput_needed);
L
Linus Torvalds 已提交
2167 2168 2169 2170
	}
	return err;
}

2171 2172 2173 2174 2175
/* shutdown(2) entry point; all work is done by __sys_shutdown(). */
SYSCALL_DEFINE2(shutdown, int, fd,
		int, how)
{
	return __sys_shutdown(fd, how);
}

2176
/*
 * Helper macros yielding the address of a header field that has the same
 * type (int / unsigned) in both the native and the 32-bit compat layout.
 *
 * They are not self-contained: the expansion reads a variable named
 * `flags` and, for an argument `msg`, also refers to `msg_compat`, so both
 * must exist in the scope where the macro is used.
 */
#define COMPAT_MSG(msg, member)					\
	((flags & MSG_CMSG_COMPAT) ? &msg##_compat->member : &msg->member)
#define COMPAT_NAMELEN(msg)	COMPAT_MSG(msg, msg_namelen)
#define COMPAT_FLAGS(msg)	COMPAT_MSG(msg, msg_flags)

2183 2184 2185 2186 2187
/*
 * Destination address of the most recent successful send.  __sys_sendmmsg()
 * keeps one instance across its loop and ___sys_sendmsg() compares the next
 * destination against it to decide whether sock_sendmsg_nosec() may be used.
 */
struct used_address {
	struct sockaddr_storage name;	/* copy of the last destination */
	unsigned int name_len;		/* its length; seeded with UINT_MAX */
};

2188 2189 2190 2191
/*
 * Build the kernel msghdr @kmsg from the user-space header @umsg.
 *
 * @save_addr selects the direction: when non-NULL (receive path) the user
 * address pointer is stored there and no address is copied in; when NULL
 * (send path) the address is copied into the buffer kmsg->msg_name already
 * points at.  On success *@iov describes the imported iovec array.
 *
 * Returns the result of import_iovec(), or -EFAULT / -EINVAL / -EMSGSIZE
 * (or the move_addr_to_kernel() error) on failure.
 */
static int copy_msghdr_from_user(struct msghdr *kmsg,
				 struct user_msghdr __user *umsg,
				 struct sockaddr __user **save_addr,
				 struct iovec **iov)
{
	struct user_msghdr hdr;
	int err;

	if (copy_from_user(&hdr, umsg, sizeof(*umsg)))
		return -EFAULT;

	kmsg->msg_control = (void __force *)hdr.msg_control;
	kmsg->msg_controllen = hdr.msg_controllen;
	kmsg->msg_flags = hdr.msg_flags;

	/* Without an address pointer the supplied length is ignored. */
	kmsg->msg_namelen = hdr.msg_name ? hdr.msg_namelen : 0;

	if (kmsg->msg_namelen < 0)
		return -EINVAL;

	/* Oversized lengths are clamped rather than rejected. */
	if (kmsg->msg_namelen > sizeof(struct sockaddr_storage))
		kmsg->msg_namelen = sizeof(struct sockaddr_storage);

	if (save_addr)
		*save_addr = hdr.msg_name;

	if (!hdr.msg_name || !kmsg->msg_namelen) {
		kmsg->msg_name = NULL;
		kmsg->msg_namelen = 0;
	} else if (!save_addr) {
		err = move_addr_to_kernel(hdr.msg_name,
					  kmsg->msg_namelen,
					  kmsg->msg_name);
		if (err < 0)
			return err;
	}

	if (hdr.msg_iovlen > UIO_MAXIOV)
		return -EMSGSIZE;

	kmsg->msg_iocb = NULL;

	return import_iovec(save_addr ? READ : WRITE,
			    hdr.msg_iov, hdr.msg_iovlen,
			    UIO_FASTIOV, iov, &kmsg->msg_iter);
}

2239
/*
 * Common body of sendmsg() and sendmmsg(): import the user message header,
 * bring the control data into kernel memory, and hand the message to the
 * socket.
 *
 * @used_address is NULL for plain sendmsg().  For sendmmsg() it carries the
 * last successfully used destination; when the current destination matches
 * it, the send goes through sock_sendmsg_nosec().
 * @allowed_msghdr_flags masks which flags from the user header are merged
 * into @flags.
 *
 * Returns the send result, or a negative error.
 */
static int ___sys_sendmsg(struct socket *sock, struct user_msghdr __user *msg,
			 struct msghdr *msg_sys, unsigned int flags,
			 struct used_address *used_address,
			 unsigned int allowed_msghdr_flags)
{
	struct compat_msghdr __user *msg_compat =
	    (struct compat_msghdr __user *)msg;
	struct iovec iovstack[UIO_FASTIOV], *iov = iovstack;
	/* 20 is size of ipv6_pktinfo */
	unsigned char ctl[sizeof(struct cmsghdr) + 20]
				__aligned(sizeof(__kernel_size_t));
	unsigned char *ctl_buf = ctl;
	struct sockaddr_storage address;
	int ctl_len;
	ssize_t err;

	msg_sys->msg_name = &address;

	if (flags & MSG_CMSG_COMPAT)
		err = get_compat_msghdr(msg_sys, msg_compat, NULL, &iov);
	else
		err = copy_msghdr_from_user(msg_sys, msg, NULL, &iov);
	if (err < 0)
		return err;

	/* Both the size check and a failed allocation report -ENOBUFS. */
	err = -ENOBUFS;

	if (msg_sys->msg_controllen > INT_MAX)
		goto out_freeiov;

	flags |= (msg_sys->msg_flags & allowed_msghdr_flags);
	ctl_len = msg_sys->msg_controllen;

	if (ctl_len && (flags & MSG_CMSG_COMPAT)) {
		err = cmsghdr_from_user_compat_to_kern(msg_sys, sock->sk, ctl,
						       sizeof(ctl));
		if (err)
			goto out_freeiov;
		/* The helper stored the buffer and length in msg_sys. */
		ctl_buf = msg_sys->msg_control;
		ctl_len = msg_sys->msg_controllen;
	} else if (ctl_len) {
		BUILD_BUG_ON(sizeof(struct cmsghdr) !=
			     CMSG_ALIGN(sizeof(struct cmsghdr)));

		/* Fall back to an allocation if the stack buffer is small. */
		if (ctl_len > sizeof(ctl)) {
			ctl_buf = sock_kmalloc(sock->sk, ctl_len, GFP_KERNEL);
			if (ctl_buf == NULL)
				goto out_freeiov;
		}

		err = -EFAULT;
		/*
		 * Careful! Before this, msg_sys->msg_control contains a user
		 * pointer.  Afterwards, it will be a kernel pointer.  Thus
		 * the compiler-assisted checking falls down on this.
		 */
		if (copy_from_user(ctl_buf,
				   (void __user __force *)msg_sys->msg_control,
				   ctl_len))
			goto out_freectl;
		msg_sys->msg_control = ctl_buf;
	}

	msg_sys->msg_flags = flags;
	if (sock->file->f_flags & O_NONBLOCK)
		msg_sys->msg_flags |= MSG_DONTWAIT;

	/*
	 * If this is sendmmsg() and current destination address is same as
	 * previously succeeded address, omit asking LSM's decision.
	 * used_address->name_len is initialized to UINT_MAX so that the first
	 * destination address never matches.
	 */
	if (used_address && msg_sys->msg_name &&
	    used_address->name_len == msg_sys->msg_namelen &&
	    !memcmp(&used_address->name, msg_sys->msg_name,
		    used_address->name_len)) {
		err = sock_sendmsg_nosec(sock, msg_sys);
	} else {
		err = sock_sendmsg(sock, msg_sys);
		/*
		 * If this is sendmmsg() and sending to current destination
		 * address was successful, remember it.
		 */
		if (used_address && err >= 0) {
			used_address->name_len = msg_sys->msg_namelen;
			if (msg_sys->msg_name)
				memcpy(&used_address->name, msg_sys->msg_name,
				       used_address->name_len);
		}
	}

out_freectl:
	if (ctl_buf != ctl)
		sock_kfree_s(sock->sk, ctl_buf, ctl_len);
out_freeiov:
	kfree(iov);
	return err;
}

/*
 *	BSD sendmsg interface
 */

2339 2340
long __sys_sendmsg(int fd, struct user_msghdr __user *msg, unsigned int flags,
		   bool forbid_cmsg_compat)
2341 2342 2343
{
	int fput_needed, err;
	struct msghdr msg_sys;
2344 2345
	struct socket *sock;

2346 2347 2348
	if (forbid_cmsg_compat && (flags & MSG_CMSG_COMPAT))
		return -EINVAL;

2349
	sock = sockfd_lookup_light(fd, &err, &fput_needed);
2350 2351 2352
	if (!sock)
		goto out;

2353
	err = ___sys_sendmsg(sock, msg, &msg_sys, flags, NULL, 0);
2354

2355
	fput_light(sock->file, fput_needed);
2356
out:
L
Linus Torvalds 已提交
2357 2358 2359
	return err;
}

2360
/* sendmsg(2) entry point; MSG_CMSG_COMPAT is forbidden here. */
SYSCALL_DEFINE3(sendmsg, int, fd, struct user_msghdr __user *, msg,
		unsigned int, flags)
{
	return __sys_sendmsg(fd, msg, flags, true);
}

2365 2366 2367 2368 2369
/*
 *	Linux sendmmsg interface
 *
 *	Sends up to @vlen messages (capped at UIO_MAXIOV) from @mmsg and
 *	writes each message's result into its msg_len field.  Returns the
 *	number of messages sent, or an error if none could be sent.
 */

int __sys_sendmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen,
		   unsigned int flags, bool forbid_cmsg_compat)
{
	struct compat_mmsghdr __user *compat_entry;
	struct mmsghdr __user *entry;
	struct used_address used_address;
	unsigned int oflags = flags;
	struct msghdr msg_sys;
	struct socket *sock;
	int datagrams = 0;
	int fput_needed;
	int err;

	if (forbid_cmsg_compat && (flags & MSG_CMSG_COMPAT))
		return -EINVAL;

	if (vlen > UIO_MAXIOV)
		vlen = UIO_MAXIOV;

	sock = sockfd_lookup_light(fd, &err, &fput_needed);
	if (!sock)
		return err;

	/* UINT_MAX never equals a real length: first address never matches. */
	used_address.name_len = UINT_MAX;
	entry = mmsg;
	compat_entry = (struct compat_mmsghdr __user *)mmsg;
	err = 0;
	flags |= MSG_BATCH;

	while (datagrams < vlen) {
		/* Last message: go back to the flags the caller passed in. */
		if (datagrams == vlen - 1)
			flags = oflags;

		if (flags & MSG_CMSG_COMPAT) {
			err = ___sys_sendmsg(sock,
					     (struct user_msghdr __user *)compat_entry,
					     &msg_sys, flags, &used_address,
					     MSG_EOR);
			if (err < 0)
				break;
			err = __put_user(err, &compat_entry->msg_len);
			++compat_entry;
		} else {
			err = ___sys_sendmsg(sock,
					     (struct user_msghdr __user *)entry,
					     &msg_sys, flags, &used_address,
					     MSG_EOR);
			if (err < 0)
				break;
			err = put_user(err, &entry->msg_len);
			++entry;
		}

		if (err)
			break;
		++datagrams;

		/* Stop after a message that was not sent completely. */
		if (msg_data_left(&msg_sys))
			break;
		cond_resched();
	}

	fput_light(sock->file, fput_needed);

	/* We only return an error if no datagrams were able to be sent */
	return datagrams ? datagrams : err;
}

/* sendmmsg(2) entry point; MSG_CMSG_COMPAT is forbidden here. */
SYSCALL_DEFINE4(sendmmsg, int, fd,
		struct mmsghdr __user *, mmsg,
		unsigned int, vlen, unsigned int, flags)
{
	return __sys_sendmmsg(fd, mmsg, vlen, flags, true);
}

2442
/*
 * Common body of recvmsg() and recvmmsg(): import the user message header,
 * receive one message, then write the sender address, the resulting flags
 * and the consumed control length back to the user header.
 *
 * A non-zero @nosec selects sock_recvmsg_nosec() instead of sock_recvmsg().
 *
 * Returns the number of bytes received or a negative error.
 */
static int ___sys_recvmsg(struct socket *sock, struct user_msghdr __user *msg,
			 struct msghdr *msg_sys, unsigned int flags, int nosec)
{
	struct compat_msghdr __user *msg_compat =
	    (struct compat_msghdr __user *)msg;
	struct iovec iovstack[UIO_FASTIOV];
	struct iovec *iov = iovstack;
	unsigned long cmsg_ptr;
	unsigned long ctl_used;
	ssize_t err;
	int len;

	/* kernel mode address */
	struct sockaddr_storage addr;

	/* user mode address pointers */
	struct sockaddr __user *uaddr;
	int __user *uaddr_len = COMPAT_NAMELEN(msg);

	msg_sys->msg_name = &addr;

	if (flags & MSG_CMSG_COMPAT)
		err = get_compat_msghdr(msg_sys, msg_compat, &uaddr, &iov);
	else
		err = copy_msghdr_from_user(msg_sys, msg, &uaddr, &iov);
	if (err < 0)
		return err;

	/* Remember where control data starts to compute its length later. */
	cmsg_ptr = (unsigned long)msg_sys->msg_control;
	msg_sys->msg_flags = flags & (MSG_CMSG_CLOEXEC|MSG_CMSG_COMPAT);

	/* We assume all kernel code knows the size of sockaddr_storage */
	msg_sys->msg_namelen = 0;

	if (sock->file->f_flags & O_NONBLOCK)
		flags |= MSG_DONTWAIT;

	if (nosec)
		err = sock_recvmsg_nosec(sock, msg_sys, flags);
	else
		err = sock_recvmsg(sock, msg_sys, flags);
	if (err < 0)
		goto out_freeiov;
	len = err;

	if (uaddr != NULL) {
		err = move_addr_to_user(&addr,
					msg_sys->msg_namelen, uaddr,
					uaddr_len);
		if (err < 0)
			goto out_freeiov;
	}

	err = __put_user((msg_sys->msg_flags & ~MSG_CMSG_COMPAT),
			 COMPAT_FLAGS(msg));
	if (err)
		goto out_freeiov;

	/* Distance msg_control advanced during the receive. */
	ctl_used = (unsigned long)msg_sys->msg_control - cmsg_ptr;
	if (flags & MSG_CMSG_COMPAT)
		err = __put_user(ctl_used, &msg_compat->msg_controllen);
	else
		err = __put_user(ctl_used, &msg->msg_controllen);
	if (err)
		goto out_freeiov;

	err = len;

out_freeiov:
	kfree(iov);
	return err;
}

/*
 *	BSD recvmsg interface
 */

2512 2513
long __sys_recvmsg(int fd, struct user_msghdr __user *msg, unsigned int flags,
		   bool forbid_cmsg_compat)
2514 2515 2516
{
	int fput_needed, err;
	struct msghdr msg_sys;
2517 2518
	struct socket *sock;

2519 2520 2521
	if (forbid_cmsg_compat && (flags & MSG_CMSG_COMPAT))
		return -EINVAL;

2522
	sock = sockfd_lookup_light(fd, &err, &fput_needed);
2523 2524 2525
	if (!sock)
		goto out;

2526
	err = ___sys_recvmsg(sock, msg, &msg_sys, flags, 0);
2527

2528
	fput_light(sock->file, fput_needed);
L
Linus Torvalds 已提交
2529 2530 2531 2532
out:
	return err;
}

2533
/* recvmsg(2) entry point; MSG_CMSG_COMPAT is forbidden here. */
SYSCALL_DEFINE3(recvmsg, int, fd,
		struct user_msghdr __user *, msg,
		unsigned int, flags)
{
	return __sys_recvmsg(fd, msg, flags, true);
}

2539 2540 2541 2542
/*
 *     Linux recvmmsg interface
 */

2543 2544 2545
static int do_recvmmsg(int fd, struct mmsghdr __user *mmsg,
			  unsigned int vlen, unsigned int flags,
			  struct timespec64 *timeout)
2546 2547 2548 2549
{
	int fput_needed, err, datagrams;
	struct socket *sock;
	struct mmsghdr __user *entry;
2550
	struct compat_mmsghdr __user *compat_entry;
2551
	struct msghdr msg_sys;
2552 2553
	struct timespec64 end_time;
	struct timespec64 timeout64;
2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565

	if (timeout &&
	    poll_select_set_timeout(&end_time, timeout->tv_sec,
				    timeout->tv_nsec))
		return -EINVAL;

	datagrams = 0;

	sock = sockfd_lookup_light(fd, &err, &fput_needed);
	if (!sock)
		return err;

2566 2567 2568 2569 2570 2571
	if (likely(!(flags & MSG_ERRQUEUE))) {
		err = sock_error(sock->sk);
		if (err) {
			datagrams = err;
			goto out_put;
		}
2572
	}
2573 2574

	entry = mmsg;
2575
	compat_entry = (struct compat_mmsghdr __user *)mmsg;
2576 2577 2578 2579 2580

	while (datagrams < vlen) {
		/*
		 * No need to ask LSM for more than the first datagram.
		 */
2581
		if (MSG_CMSG_COMPAT & flags) {
2582
			err = ___sys_recvmsg(sock, (struct user_msghdr __user *)compat_entry,
2583 2584
					     &msg_sys, flags & ~MSG_WAITFORONE,
					     datagrams);
2585 2586 2587 2588 2589
			if (err < 0)
				break;
			err = __put_user(err, &compat_entry->msg_len);
			++compat_entry;
		} else {
2590
			err = ___sys_recvmsg(sock,
2591
					     (struct user_msghdr __user *)entry,
2592 2593
					     &msg_sys, flags & ~MSG_WAITFORONE,
					     datagrams);
2594 2595 2596 2597 2598 2599
			if (err < 0)
				break;
			err = put_user(err, &entry->msg_len);
			++entry;
		}

2600 2601 2602 2603
		if (err)
			break;
		++datagrams;

2604 2605 2606 2607
		/* MSG_WAITFORONE turns on MSG_DONTWAIT after one packet */
		if (flags & MSG_WAITFORONE)
			flags |= MSG_DONTWAIT;

2608
		if (timeout) {
2609
			ktime_get_ts64(&timeout64);
2610
			*timeout = timespec64_sub(end_time, timeout64);
2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623
			if (timeout->tv_sec < 0) {
				timeout->tv_sec = timeout->tv_nsec = 0;
				break;
			}

			/* Timeout, return less than vlen datagrams */
			if (timeout->tv_nsec == 0 && timeout->tv_sec == 0)
				break;
		}

		/* Out of band data, return right away */
		if (msg_sys.msg_flags & MSG_OOB)
			break;
2624
		cond_resched();
2625 2626 2627
	}

	if (err == 0)
2628 2629 2630 2631 2632 2633
		goto out_put;

	if (datagrams == 0) {
		datagrams = err;
		goto out_put;
	}
2634

2635 2636 2637 2638 2639
	/*
	 * We may return less entries than requested (vlen) if the
	 * sock is non block and there aren't enough datagrams...
	 */
	if (err != -EAGAIN) {
2640
		/*
2641 2642 2643 2644
		 * ... or  if recvmsg returns an error after we
		 * received some datagrams, where we record the
		 * error to return on the next call or if the
		 * app asks about it using getsockopt(SO_ERROR).
2645
		 */
2646
		sock->sk->sk_err = -err;
2647
	}
2648 2649
out_put:
	fput_light(sock->file, fput_needed);
2650

2651
	return datagrams;
2652 2653
}

2654 2655 2656 2657
/*
 * recvmmsg() with the timeout still in user space.  At most one of
 * @timeout (64-bit layout) and @timeout32 (old 32-bit layout) is expected
 * to be non-NULL; whichever is given is read before the receive and, if at
 * least one message was received, updated with the remaining time after it.
 */
int __sys_recvmmsg(int fd, struct mmsghdr __user *mmsg,
		   unsigned int vlen, unsigned int flags,
		   struct __kernel_timespec __user *timeout,
		   struct old_timespec32 __user *timeout32)
{
	struct timespec64 timeout_sys;
	struct timespec64 *ktimeout = NULL;
	int datagrams;

	if (timeout && get_timespec64(&timeout_sys, timeout))
		return -EFAULT;

	if (timeout32 && get_old_timespec32(&timeout_sys, timeout32))
		return -EFAULT;

	if (timeout || timeout32)
		ktimeout = &timeout_sys;

	datagrams = do_recvmmsg(fd, mmsg, vlen, flags, ktimeout);

	/* Nothing to write back without a timeout or without any message. */
	if (!ktimeout || datagrams <= 0)
		return datagrams;

	if (timeout && put_timespec64(&timeout_sys, timeout))
		datagrams = -EFAULT;

	if (timeout32 && put_old_timespec32(&timeout_sys, timeout32))
		datagrams = -EFAULT;

	return datagrams;
}

2685 2686
/*
 * recvmmsg(2) entry point taking a struct __kernel_timespec timeout.
 * MSG_CMSG_COMPAT is rejected with -EINVAL.
 */
SYSCALL_DEFINE5(recvmmsg, int, fd, struct mmsghdr __user *, mmsg,
		unsigned int, vlen, unsigned int, flags,
		struct __kernel_timespec __user *, timeout)
{
	if (flags & MSG_CMSG_COMPAT)
		return -EINVAL;

	return __sys_recvmmsg(fd, mmsg, vlen, flags, timeout, NULL);
}

#ifdef CONFIG_COMPAT_32BIT_TIME
/*
 * recvmmsg(2) variant taking a struct old_timespec32 timeout.
 * MSG_CMSG_COMPAT is rejected with -EINVAL.
 */
SYSCALL_DEFINE5(recvmmsg_time32, int, fd, struct mmsghdr __user *, mmsg,
		unsigned int, vlen, unsigned int, flags,
		struct old_timespec32 __user *, timeout)
{
	if (flags & MSG_CMSG_COMPAT)
		return -EINVAL;

	return __sys_recvmmsg(fd, mmsg, vlen, flags, NULL, timeout);
}
2705
#endif
2706

2707
#ifdef __ARCH_WANT_SYS_SOCKETCALL
L
Linus Torvalds 已提交
2708 2709
/*
 * Argument list sizes for sys_socketcall, in bytes, indexed by call number.
 * Entry 0 is unused.  The per-entry names below assume the SYS_* numbering
 * from the uapi <linux/net.h> header -- confirm against that header.
 */
#define AL(x) ((x) * sizeof(unsigned long))
static const unsigned char nargs[21] = {
	AL(0),	/*  0: (unused) */
	AL(3),	/*  1: socket */
	AL(3),	/*  2: bind */
	AL(3),	/*  3: connect */
	AL(2),	/*  4: listen */
	AL(3),	/*  5: accept */
	AL(3),	/*  6: getsockname */
	AL(3),	/*  7: getpeername */
	AL(4),	/*  8: socketpair */
	AL(4),	/*  9: send */
	AL(4),	/* 10: recv */
	AL(6),	/* 11: sendto */
	AL(6),	/* 12: recvfrom */
	AL(2),	/* 13: shutdown */
	AL(5),	/* 14: setsockopt */
	AL(5),	/* 15: getsockopt */
	AL(3),	/* 16: sendmsg */
	AL(3),	/* 17: recvmsg */
	AL(4),	/* 18: accept4 */
	AL(5),	/* 19: recvmmsg */
	AL(4),	/* 20: sendmmsg */
};

#undef AL

/*
2720
 *	System call vectors.
L
Linus Torvalds 已提交
2721 2722 2723
 *
 *	Argument checking cleaned up. Saved 20% in size.
 *  This function doesn't need to set the kernel lock because
2724
 *  it is set by the callees.
L
Linus Torvalds 已提交
2725 2726
 */

2727
SYSCALL_DEFINE2(socketcall, int, call, unsigned long __user *, args)
L
Linus Torvalds 已提交
2728
{
2729
	unsigned long a[AUDITSC_ARGS];
2730
	unsigned long a0, a1;
L
Linus Torvalds 已提交
2731
	int err;
2732
	unsigned int len;
L
Linus Torvalds 已提交
2733

2734
	if (call < 1 || call > SYS_SENDMMSG)
L
Linus Torvalds 已提交
2735
		return -EINVAL;
2736
	call = array_index_nospec(call, SYS_SENDMMSG + 1);
L
Linus Torvalds 已提交
2737

2738 2739 2740 2741
	len = nargs[call];
	if (len > sizeof(a))
		return -EINVAL;

L
Linus Torvalds 已提交
2742
	/* copy_from_user should be SMP safe. */
2743
	if (copy_from_user(a, args, len))
L
Linus Torvalds 已提交
2744
		return -EFAULT;
2745

2746 2747 2748
	err = audit_socketcall(nargs[call] / sizeof(unsigned long), a);
	if (err)
		return err;
2749

2750 2751 2752 2753 2754
	a0 = a[0];
	a1 = a[1];

	switch (call) {
	case SYS_SOCKET:
2755
		err = __sys_socket(a0, a1, a[2]);
2756 2757
		break;
	case SYS_BIND:
2758
		err = __sys_bind(a0, (struct sockaddr __user *)a1, a[2]);
2759 2760
		break;
	case SYS_CONNECT:
2761
		err = __sys_connect(a0, (struct sockaddr __user *)a1, a[2]);
2762 2763
		break;
	case SYS_LISTEN:
2764
		err = __sys_listen(a0, a1);
2765 2766
		break;
	case SYS_ACCEPT:
2767 2768
		err = __sys_accept4(a0, (struct sockaddr __user *)a1,
				    (int __user *)a[2], 0);
2769 2770 2771
		break;
	case SYS_GETSOCKNAME:
		err =
2772 2773
		    __sys_getsockname(a0, (struct sockaddr __user *)a1,
				      (int __user *)a[2]);
2774 2775 2776
		break;
	case SYS_GETPEERNAME:
		err =
2777 2778
		    __sys_getpeername(a0, (struct sockaddr __user *)a1,
				      (int __user *)a[2]);
2779 2780
		break;
	case SYS_SOCKETPAIR:
2781
		err = __sys_socketpair(a0, a1, a[2], (int __user *)a[3]);
2782 2783
		break;
	case SYS_SEND:
2784 2785
		err = __sys_sendto(a0, (void __user *)a1, a[2], a[3],
				   NULL, 0);
2786 2787
		break;
	case SYS_SENDTO:
2788 2789
		err = __sys_sendto(a0, (void __user *)a1, a[2], a[3],
				   (struct sockaddr __user *)a[4], a[5]);
2790 2791
		break;
	case SYS_RECV:
2792 2793
		err = __sys_recvfrom(a0, (void __user *)a1, a[2], a[3],
				     NULL, NULL);
2794 2795
		break;
	case SYS_RECVFROM:
2796 2797 2798
		err = __sys_recvfrom(a0, (void __user *)a1, a[2], a[3],
				     (struct sockaddr __user *)a[4],
				     (int __user *)a[5]);
2799 2800
		break;
	case SYS_SHUTDOWN:
2801
		err = __sys_shutdown(a0, a1);
2802 2803
		break;
	case SYS_SETSOCKOPT:
2804 2805
		err = __sys_setsockopt(a0, a1, a[2], (char __user *)a[3],
				       a[4]);
2806 2807 2808
		break;
	case SYS_GETSOCKOPT:
		err =
2809 2810
		    __sys_getsockopt(a0, a1, a[2], (char __user *)a[3],
				     (int __user *)a[4]);
2811 2812
		break;
	case SYS_SENDMSG:
2813 2814
		err = __sys_sendmsg(a0, (struct user_msghdr __user *)a1,
				    a[2], true);
2815
		break;
2816
	case SYS_SENDMMSG:
2817 2818
		err = __sys_sendmmsg(a0, (struct mmsghdr __user *)a1, a[2],
				     a[3], true);
2819
		break;
2820
	case SYS_RECVMSG:
2821 2822
		err = __sys_recvmsg(a0, (struct user_msghdr __user *)a1,
				    a[2], true);
2823
		break;
2824
	case SYS_RECVMMSG:
2825 2826 2827 2828 2829 2830 2831 2832 2833
		if (IS_ENABLED(CONFIG_64BIT) || !IS_ENABLED(CONFIG_64BIT_TIME))
			err = __sys_recvmmsg(a0, (struct mmsghdr __user *)a1,
					     a[2], a[3],
					     (struct __kernel_timespec __user *)a[4],
					     NULL);
		else
			err = __sys_recvmmsg(a0, (struct mmsghdr __user *)a1,
					     a[2], a[3], NULL,
					     (struct old_timespec32 __user *)a[4]);
2834
		break;
U
Ulrich Drepper 已提交
2835
	case SYS_ACCEPT4:
2836 2837
		err = __sys_accept4(a0, (struct sockaddr __user *)a1,
				    (int __user *)a[2], a[3]);
U
Ulrich Drepper 已提交
2838
		break;
2839 2840 2841
	default:
		err = -EINVAL;
		break;
L
Linus Torvalds 已提交
2842 2843 2844 2845
	}
	return err;
}

2846
#endif				/* __ARCH_WANT_SYS_SOCKETCALL */
L
Linus Torvalds 已提交
2847

2848 2849 2850 2851
/**
 *	sock_register - add a socket protocol handler
 *	@ops: description of protocol
 *
L
Linus Torvalds 已提交
2852 2853
 *	This function is called by a protocol handler that wants to
 *	advertise its address family, and have it linked into the
2854
 *	socket interface. The value ops->family corresponds to the
2855
 *	socket system call protocol family.
L
Linus Torvalds 已提交
2856
 */
2857
int sock_register(const struct net_proto_family *ops)
L
Linus Torvalds 已提交
2858 2859 2860 2861
{
	int err;

	if (ops->family >= NPROTO) {
2862
		pr_crit("protocol %d >= NPROTO(%d)\n", ops->family, NPROTO);
L
Linus Torvalds 已提交
2863 2864
		return -ENOBUFS;
	}
2865 2866

	spin_lock(&net_family_lock);
E
Eric Dumazet 已提交
2867 2868
	if (rcu_dereference_protected(net_families[ops->family],
				      lockdep_is_held(&net_family_lock)))
2869 2870
		err = -EEXIST;
	else {
2871
		rcu_assign_pointer(net_families[ops->family], ops);
L
Linus Torvalds 已提交
2872 2873
		err = 0;
	}
2874 2875
	spin_unlock(&net_family_lock);

2876
	pr_info("NET: Registered protocol family %d\n", ops->family);
L
Linus Torvalds 已提交
2877 2878
	return err;
}
2879
EXPORT_SYMBOL(sock_register);
L
Linus Torvalds 已提交
2880

2881 2882 2883 2884
/**
 *	sock_unregister - remove a protocol handler
 *	@family: protocol family to remove
 *
L
Linus Torvalds 已提交
2885 2886
 *	This function is called by a protocol handler that wants to
 *	remove its address family, and have it unlinked from the
2887 2888 2889 2890 2891 2892
 *	new socket creation.
 *
 *	If protocol handler is a module, then it can use module reference
 *	counts to protect against new references. If protocol handler is not
 *	a module then it needs to provide its own protection in
 *	the ops->create routine.
L
Linus Torvalds 已提交
2893
 */
2894
void sock_unregister(int family)
L
Linus Torvalds 已提交
2895
{
2896
	BUG_ON(family < 0 || family >= NPROTO);
L
Linus Torvalds 已提交
2897

2898
	spin_lock(&net_family_lock);
2899
	RCU_INIT_POINTER(net_families[family], NULL);
2900 2901 2902 2903
	spin_unlock(&net_family_lock);

	synchronize_rcu();

2904
	pr_info("NET: Unregistered protocol family %d\n", family);
L
Linus Torvalds 已提交
2905
}
2906
EXPORT_SYMBOL(sock_unregister);
L
Linus Torvalds 已提交
2907

2908 2909
bool sock_is_registered(int family)
{
2910
	return family < NPROTO && rcu_access_pointer(net_families[family]);
2911 2912
}

2913
/*
 * Core socket layer initialisation, run as a core_initcall.
 *
 * Sets up the network sysctl infrastructure, the skbuff caches and the
 * socket inode cache, then registers and mounts the socket filesystem.
 * Returns 0 on success or the first error encountered.
 */
static int __init sock_init(void)
{
	int err;

	/*
	 *      Initialize the network sysctl infrastructure.
	 */
	err = net_sysctl_init();
	if (err)
		return err;

	/*
	 *      Initialize skbuff SLAB cache
	 */
	skb_init();

	/*
	 *      Initialize the protocols module.
	 */
	init_inodecache();

	err = register_filesystem(&sock_fs_type);
	if (err)
		return err;

	sock_mnt = kern_mount(&sock_fs_type);
	if (IS_ERR(sock_mnt)) {
		err = PTR_ERR(sock_mnt);
		unregister_filesystem(&sock_fs_type);
		return err;
	}

	/* The real protocol initialization is performed in later initcalls.
	 */

#ifdef CONFIG_NETFILTER
	/*
	 * NOTE(review): a failure here returns without undoing the mount or
	 * the filesystem registration above -- confirm that is intended.
	 */
	err = netfilter_init();
	if (err)
		return err;
#endif

	ptp_classifier_init();

	return 0;
}

core_initcall(sock_init);	/* early initcall */

L
Linus Torvalds 已提交
2965 2966 2967
#ifdef CONFIG_PROC_FS
void socket_seq_show(struct seq_file *seq)
{
2968 2969
	seq_printf(seq, "sockets: used %d\n",
		   sock_inuse_get(seq->private));
L
Linus Torvalds 已提交
2970
}
2971
#endif				/* CONFIG_PROC_FS */
L
Linus Torvalds 已提交
2972

2973
#ifdef CONFIG_COMPAT
2974
/* Compat SIOCGIFCONF: read the 32-bit ifconf from user space, run
 * dev_ifconf() on a native copy under the RTNL lock, then write the
 * resulting length back to the caller's structure.
 *
 * Returns 0 on success, -EFAULT on a user copy failure, or the error
 * from dev_ifconf().
 */
static int compat_dev_ifconf(struct net *net, struct compat_ifconf __user *uifc32)
{
	struct compat_ifconf cifc;
	struct ifconf native;
	int rc;

	if (copy_from_user(&cifc, uifc32, sizeof(cifc)))
		return -EFAULT;

	/* Widen the compat buffer pointer and carry the length over. */
	native.ifc_len = cifc.ifc_len;
	native.ifc_req = compat_ptr(cifc.ifcbuf);

	rtnl_lock();
	rc = dev_ifconf(net, &native, sizeof(struct compat_ifreq));
	rtnl_unlock();
	if (rc)
		return rc;

	/* Only the length is updated; the buffer pointer is returned as given. */
	cifc.ifc_len = native.ifc_len;

	return copy_to_user(uifc32, &cifc, sizeof(cifc)) ? -EFAULT : 0;
}

2999
/* Compat handler for SIOCETHTOOL.
 *
 * Most ethtool structures are defined without padding. Unfortunately
 * struct ethtool_rxnfc is an exception, so for the commands listed in
 * the switch below the compat structure is converted into a native one
 * held in compat_alloc_user_space() memory before dev_ioctl() runs, and
 * converted back afterwards for the commands that return data. All other
 * commands are handed to dev_ioctl() with the caller's pointer as is.
 *
 * Returns 0 on success, -EFAULT when user memory cannot be accessed,
 * -ENOMEM when the requested rule count is too large, or the error
 * returned by dev_ioctl().
 */
static int ethtool_ioctl(struct net *net, struct compat_ifreq __user *ifr32)
{
	struct compat_ethtool_rxnfc __user *compat_rxnfc;
	struct ethtool_rxnfc __user *rxnfc = NULL;
	/* Bytes from the start of the struct through the end of fs.m_ext. */
	const size_t head_len = offsetof(struct ethtool_rxnfc, fs.m_ext) +
				sizeof(rxnfc->fs.m_ext);
	/* Bytes from fs.ring_cookie through the end of fs.location. */
	const size_t tail_len = offsetof(struct ethtool_rxnfc, fs.location) +
				sizeof(rxnfc->fs.location) -
				offsetof(struct ethtool_rxnfc, fs.ring_cookie);
	bool convert_in = false, convert_out = false;
	u32 rule_cnt = 0, actual_rule_cnt;
	size_t buf_size = 0;
	struct ifreq ifr;
	u32 ethcmd;
	u32 data;
	int ret;

	if (get_user(data, &ifr32->ifr_ifru.ifru_data))
		return -EFAULT;

	compat_rxnfc = compat_ptr(data);

	if (get_user(ethcmd, &compat_rxnfc->cmd))
		return -EFAULT;

	/* Decide, per command, whether the rxnfc layout must be converted
	 * on the way in, on the way out, or not at all.
	 */
	switch (ethcmd) {
	case ETHTOOL_GRXCLSRLALL:
		/* Buffer size is variable */
		if (get_user(rule_cnt, &compat_rxnfc->rule_cnt))
			return -EFAULT;
		if (rule_cnt > KMALLOC_MAX_SIZE / sizeof(u32))
			return -ENOMEM;
		buf_size += rule_cnt * sizeof(u32);
		/* fall through */
	case ETHTOOL_GRXRINGS:
	case ETHTOOL_GRXCLSRLCNT:
	case ETHTOOL_GRXCLSRULE:
	case ETHTOOL_SRXCLSRLINS:
		convert_out = true;
		/* fall through */
	case ETHTOOL_SRXCLSRLDEL:
		buf_size += sizeof(struct ethtool_rxnfc);
		convert_in = true;
		rxnfc = compat_alloc_user_space(buf_size);
		break;
	default:
		break;
	}

	if (copy_from_user(&ifr.ifr_name, &ifr32->ifr_name, IFNAMSIZ))
		return -EFAULT;

	if (convert_in)
		ifr.ifr_data = rxnfc;
	else
		ifr.ifr_data = (void __user *)compat_rxnfc;

	if (convert_in) {
		/* We expect there to be holes between fs.m_ext and
		 * fs.ring_cookie and at the end of fs, but nowhere else.
		 */
		BUILD_BUG_ON(offsetof(struct compat_ethtool_rxnfc, fs.m_ext) +
			     sizeof(compat_rxnfc->fs.m_ext) !=
			     offsetof(struct ethtool_rxnfc, fs.m_ext) +
			     sizeof(rxnfc->fs.m_ext));
		BUILD_BUG_ON(
			offsetof(struct compat_ethtool_rxnfc, fs.location) -
			offsetof(struct compat_ethtool_rxnfc, fs.ring_cookie) !=
			offsetof(struct ethtool_rxnfc, fs.location) -
			offsetof(struct ethtool_rxnfc, fs.ring_cookie));

		/* Copy the two hole-free runs separately. */
		if (copy_in_user(rxnfc, compat_rxnfc, head_len))
			return -EFAULT;
		if (copy_in_user(&rxnfc->fs.ring_cookie,
				 &compat_rxnfc->fs.ring_cookie, tail_len))
			return -EFAULT;

		if (ethcmd == ETHTOOL_GRXCLSRLALL) {
			/* Store the rule count fetched above instead of
			 * reading it from user memory a second time.
			 */
			if (put_user(rule_cnt, &rxnfc->rule_cnt))
				return -EFAULT;
		} else if (copy_in_user(&rxnfc->rule_cnt,
					&compat_rxnfc->rule_cnt,
					sizeof(rxnfc->rule_cnt)))
			return -EFAULT;
	}

	ret = dev_ioctl(net, SIOCETHTOOL, &ifr, NULL);
	if (ret)
		return ret;

	if (!convert_out)
		return 0;

	/* Mirror of the inbound conversion: native -> compat. */
	if (copy_in_user(compat_rxnfc, rxnfc, head_len))
		return -EFAULT;
	if (copy_in_user(&compat_rxnfc->fs.ring_cookie,
			 &rxnfc->fs.ring_cookie, tail_len))
		return -EFAULT;
	if (copy_in_user(&compat_rxnfc->rule_cnt, &rxnfc->rule_cnt,
			 sizeof(rxnfc->rule_cnt)))
		return -EFAULT;

	if (ethcmd == ETHTOOL_GRXCLSRLALL) {
		/* As an optimisation, we only copy the actual number of
		 * rules that the underlying function returned. Since
		 * Mallory might change the rule count in user memory, we
		 * check that it is less than the rule count originally
		 * given (as the user buffer size), which has been
		 * range-checked.
		 */
		if (get_user(actual_rule_cnt, &rxnfc->rule_cnt))
			return -EFAULT;
		if (actual_rule_cnt < rule_cnt)
			rule_cnt = actual_rule_cnt;
		if (copy_in_user(&compat_rxnfc->rule_locs[0],
				 &rxnfc->rule_locs[0],
				 rule_cnt * sizeof(u32)))
			return -EFAULT;
	}

	return 0;
}

3121 3122 3123
/* Compat SIOCWANDEV: copy in the 32-bit ifreq, replace the settings
 * pointer with its compat_ptr() form for dev_ioctl(), and on success
 * copy the ifreq back with the original pointer bytes restored.
 *
 * Returns 0 on success, -EFAULT on a user copy failure, or the error
 * from dev_ioctl().
 */
static int compat_siocwandev(struct net *net, struct compat_ifreq __user *uifr32)
{
	compat_uptr_t user_ptr;
	void __user *orig_ptr;
	struct ifreq ifr;
	int err;

	if (copy_from_user(&ifr, uifr32, sizeof(*uifr32)))
		return -EFAULT;

	if (get_user(user_ptr, &uifr32->ifr_settings.ifs_ifsu))
		return -EFAULT;

	/* Remember what was copied in so it can be put back afterwards. */
	orig_ptr = ifr.ifr_settings.ifs_ifsu.raw_hdlc;
	ifr.ifr_settings.ifs_ifsu.raw_hdlc = compat_ptr(user_ptr);

	err = dev_ioctl(net, SIOCWANDEV, &ifr, NULL);
	if (err)
		return err;

	ifr.ifr_settings.ifs_ifsu.raw_hdlc = orig_ptr;
	if (copy_to_user(uifr32, &ifr, sizeof(*uifr32)))
		return -EFAULT;

	return 0;
}

3146 3147
/* Handle ioctls that use ifreq::ifr_data and just need struct ifreq converted */
static int compat_ifr_data_ioctl(struct net *net, unsigned int cmd,
3148
				 struct compat_ifreq __user *u_ifreq32)
3149
{
3150
	struct ifreq ifreq;
3151 3152
	u32 data32;

3153
	if (copy_from_user(ifreq.ifr_name, u_ifreq32->ifr_name, IFNAMSIZ))
3154
		return -EFAULT;
3155
	if (get_user(data32, &u_ifreq32->ifr_data))
3156
		return -EFAULT;
3157
	ifreq.ifr_data = compat_ptr(data32);
3158

3159
	return dev_ioctl(net, cmd, &ifreq, NULL);
3160 3161
}

J
Johannes Berg 已提交
3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200
/* Run an ifreq-based ioctl for a compat caller.
 *
 * struct ifreq has the same *layout* on 32/64 bit for everything except
 * ifreq::ifru_ifmap and ifreq::ifru_data (both handled elsewhere), but
 * its *size* still differs because of ifreq::ifru_ifmap (16 bytes on
 * 32 bit, 24 bytes on 64 bit, making struct ifreq 32 and 40 bytes
 * respectively). If the caller's structure sat at the end of a page with
 * an inaccessible page after it, a full-size access would fault. To
 * avoid that, the request is bounced through a full-size structure.
 *
 * Returns -EFAULT on a copy failure, otherwise sock_do_ioctl()'s result.
 */
static int compat_ifreq_ioctl(struct net *net, struct socket *sock,
			      unsigned int cmd,
			      struct compat_ifreq __user *uifr32)
{
	struct ifreq __user *uifr = compat_alloc_user_space(sizeof(*uifr));
	int err;

	if (copy_in_user(uifr, uifr32, sizeof(*uifr32)))
		return -EFAULT;

	err = sock_do_ioctl(net, sock, cmd, (unsigned long)uifr);
	if (err)
		return err;

	/* Only these commands have their result copied back to the caller. */
	switch (cmd) {
	case SIOCGIFFLAGS:
	case SIOCGIFMETRIC:
	case SIOCGIFMTU:
	case SIOCGIFMEM:
	case SIOCGIFHWADDR:
	case SIOCGIFINDEX:
	case SIOCGIFADDR:
	case SIOCGIFBRDADDR:
	case SIOCGIFDSTADDR:
	case SIOCGIFNETMASK:
	case SIOCGIFPFLAGS:
	case SIOCGIFTXQLEN:
	case SIOCGMIIPHY:
	case SIOCGMIIREG:
	case SIOCGIFNAME:
		if (copy_in_user(uifr32, uifr, sizeof(*uifr32)))
			return -EFAULT;
		break;
	}

	return 0;
}

3210 3211 3212 3213 3214 3215 3216 3217 3218
/* Compat SIOCGIFMAP/SIOCSIFMAP: struct ifmap is converted field by field
 * between the compat and native forms around the dev_ioctl() call.
 *
 * Returns -EFAULT if any user access fails, otherwise dev_ioctl()'s
 * result. Results are only written back for SIOCGIFMAP.
 */
static int compat_sioc_ifmap(struct net *net, unsigned int cmd,
			struct compat_ifreq __user *uifr32)
{
	struct compat_ifmap __user *umap = &uifr32->ifr_ifru.ifru_map;
	struct ifreq ifr;
	int err;

	/* Failures are OR-ed together: every access is attempted and a
	 * single check follows.
	 */
	err = copy_from_user(&ifr, uifr32, sizeof(ifr.ifr_name));
	err |= get_user(ifr.ifr_map.mem_start, &umap->mem_start);
	err |= get_user(ifr.ifr_map.mem_end, &umap->mem_end);
	err |= get_user(ifr.ifr_map.base_addr, &umap->base_addr);
	err |= get_user(ifr.ifr_map.irq, &umap->irq);
	err |= get_user(ifr.ifr_map.dma, &umap->dma);
	err |= get_user(ifr.ifr_map.port, &umap->port);
	if (err)
		return -EFAULT;

	err = dev_ioctl(net, cmd, &ifr, NULL);
	if (err || cmd != SIOCGIFMAP)
		return err;

	err = copy_to_user(uifr32, &ifr, sizeof(ifr.ifr_name));
	err |= put_user(ifr.ifr_map.mem_start, &umap->mem_start);
	err |= put_user(ifr.ifr_map.mem_end, &umap->mem_end);
	err |= put_user(ifr.ifr_map.base_addr, &umap->base_addr);
	err |= put_user(ifr.ifr_map.irq, &umap->irq);
	err |= put_user(ifr.ifr_map.dma, &umap->dma);
	err |= put_user(ifr.ifr_map.port, &umap->port);

	return err ? -EFAULT : 0;
}

3244
struct rtentry32 {
3245
	u32		rt_pad1;
3246 3247 3248
	struct sockaddr rt_dst;         /* target address               */
	struct sockaddr rt_gateway;     /* gateway addr (RTF_GATEWAY)   */
	struct sockaddr rt_genmask;     /* target network mask (IP)     */
3249 3250 3251 3252 3253 3254 3255
	unsigned short	rt_flags;
	short		rt_pad2;
	u32		rt_pad3;
	unsigned char	rt_tos;
	unsigned char	rt_class;
	short		rt_pad4;
	short		rt_metric;      /* +1 for binary compatibility! */
3256
	/* char * */ u32 rt_dev;        /* forcing the device at add    */
3257 3258
	u32		rt_mtu;         /* per route MTU/Window         */
	u32		rt_window;      /* Window clamping              */
3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274
	unsigned short  rt_irtt;        /* Initial RTT                  */
};

/* Compat form of the IPv6 route request: routing_ioctl() reads these
 * fields from user space one by one into a struct in6_rtmsg.
 */
struct in6_rtmsg32 {
	/* The three addresses are contiguous; routing_ioctl() copies them
	 * with a single copy_from_user() starting at rtmsg_dst.
	 */
	struct in6_addr		rtmsg_dst;
	struct in6_addr		rtmsg_src;
	struct in6_addr		rtmsg_gateway;
	u32			rtmsg_type;
	u16			rtmsg_dst_len;
	u16			rtmsg_src_len;
	u32			rtmsg_metric;
	u32			rtmsg_info;
	u32			rtmsg_flags;
	s32			rtmsg_ifindex;
};

3275 3276
/* Compat SIOCADDRT/SIOCDELRT: build a native route request on the kernel
 * stack from the 32-bit user structure (struct in6_rtmsg32 for AF_INET6
 * sockets, struct rtentry32 otherwise) and pass it to sock_do_ioctl()
 * with the address limit temporarily set to KERNEL_DS.
 *
 * Returns -EFAULT if any user read fails, otherwise sock_do_ioctl()'s
 * result.
 */
static int routing_ioctl(struct net *net, struct socket *sock,
			 unsigned int cmd, void __user *argp)
{
	mm_segment_t old_fs = get_fs();
	struct in6_rtmsg r6;
	struct rtentry r4;
	char devname[16];
	void *r = NULL;
	u32 rtdev;
	int ret;

	if (sock && sock->sk && sock->sk->sk_family == AF_INET6) { /* ipv6 */
		struct in6_rtmsg32 __user *ur6 = argp;

		/* dst, src and gateway are adjacent: one copy covers all three. */
		ret = copy_from_user(&r6.rtmsg_dst, &(ur6->rtmsg_dst),
				     3 * sizeof(struct in6_addr));
		ret |= get_user(r6.rtmsg_type, &(ur6->rtmsg_type));
		ret |= get_user(r6.rtmsg_dst_len, &(ur6->rtmsg_dst_len));
		ret |= get_user(r6.rtmsg_src_len, &(ur6->rtmsg_src_len));
		ret |= get_user(r6.rtmsg_metric, &(ur6->rtmsg_metric));
		ret |= get_user(r6.rtmsg_info, &(ur6->rtmsg_info));
		ret |= get_user(r6.rtmsg_flags, &(ur6->rtmsg_flags));
		ret |= get_user(r6.rtmsg_ifindex, &(ur6->rtmsg_ifindex));

		r = (void *)&r6;
	} else { /* ipv4 */
		struct rtentry32 __user *ur4 = argp;

		/* dst, gateway and genmask are adjacent: one copy covers all three. */
		ret = copy_from_user(&r4.rt_dst, &(ur4->rt_dst),
				     3 * sizeof(struct sockaddr));
		ret |= get_user(r4.rt_flags, &(ur4->rt_flags));
		ret |= get_user(r4.rt_metric, &(ur4->rt_metric));
		ret |= get_user(r4.rt_mtu, &(ur4->rt_mtu));
		ret |= get_user(r4.rt_window, &(ur4->rt_window));
		ret |= get_user(r4.rt_irtt, &(ur4->rt_irtt));
		ret |= get_user(rtdev, &(ur4->rt_dev));

		if (!rtdev) {
			r4.rt_dev = NULL;
		} else {
			/* Pull the device name into a kernel buffer and
			 * always terminate it.
			 */
			ret |= copy_from_user(devname, compat_ptr(rtdev),
					      sizeof(devname) - 1);
			r4.rt_dev = (char __user __force *)devname;
			devname[sizeof(devname) - 1] = 0;
		}

		r = (void *)&r4;
	}

	if (ret)
		return -EFAULT;

	/* The request now lives in kernel memory, hence KERNEL_DS. */
	set_fs(KERNEL_DS);
	ret = sock_do_ioctl(net, sock, cmd, (unsigned long)r);
	set_fs(old_fs);

	return ret;
}

/* Since old style bridge ioctl's endup using SIOCDEVPRIVATE
 * for some operations; this forces use of the newer bridge-utils that
L
Lucas De Marchi 已提交
3334
 * use compatible ioctls
3335
 */
3336
static int old_bridge_ioctl(compat_ulong_t __user *argp)
3337
{
3338
	compat_ulong_t tmp;
3339

3340
	if (get_user(tmp, argp))
3341 3342 3343 3344 3345 3346
		return -EFAULT;
	if (tmp == BRCTL_GET_VERSION)
		return BRCTL_VERSION + 1;
	return -EINVAL;
}

3347 3348 3349 3350 3351 3352
static int compat_sock_ioctl_trans(struct file *file, struct socket *sock,
			 unsigned int cmd, unsigned long arg)
{
	void __user *argp = compat_ptr(arg);
	struct sock *sk = sock->sk;
	struct net *net = sock_net(sk);
3353

3354
	if (cmd >= SIOCDEVPRIVATE && cmd <= (SIOCDEVPRIVATE + 15))
3355
		return compat_ifr_data_ioctl(net, cmd, argp);
3356 3357 3358 3359 3360 3361

	switch (cmd) {
	case SIOCSIFBR:
	case SIOCGIFBR:
		return old_bridge_ioctl(argp);
	case SIOCGIFCONF:
3362
		return compat_dev_ifconf(net, argp);
3363 3364
	case SIOCETHTOOL:
		return ethtool_ioctl(net, argp);
3365 3366
	case SIOCWANDEV:
		return compat_siocwandev(net, argp);
3367 3368 3369
	case SIOCGIFMAP:
	case SIOCSIFMAP:
		return compat_sioc_ifmap(net, cmd, argp);
3370 3371 3372
	case SIOCADDRT:
	case SIOCDELRT:
		return routing_ioctl(net, sock, cmd, argp);
3373 3374
	case SIOCGSTAMP_OLD:
	case SIOCGSTAMPNS_OLD:
3375 3376
		if (!sock->ops->gettstamp)
			return -ENOIOCTLCMD;
3377
		return sock->ops->gettstamp(sock, argp, cmd == SIOCGSTAMP_OLD,
3378 3379
					    !COMPAT_USE_64BIT_TIME);

3380 3381
	case SIOCBONDSLAVEINFOQUERY:
	case SIOCBONDINFOQUERY:
3382
	case SIOCSHWTSTAMP:
3383
	case SIOCGHWTSTAMP:
3384
		return compat_ifr_data_ioctl(net, cmd, argp);
3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395

	case FIOSETOWN:
	case SIOCSPGRP:
	case FIOGETOWN:
	case SIOCGPGRP:
	case SIOCBRADDBR:
	case SIOCBRDELBR:
	case SIOCGIFVLAN:
	case SIOCSIFVLAN:
	case SIOCADDDLCI:
	case SIOCDELDLCI:
3396
	case SIOCGSKNS:
3397 3398
	case SIOCGSTAMP_NEW:
	case SIOCGSTAMPNS_NEW:
3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429
		return sock_ioctl(file, cmd, arg);

	case SIOCGIFFLAGS:
	case SIOCSIFFLAGS:
	case SIOCGIFMETRIC:
	case SIOCSIFMETRIC:
	case SIOCGIFMTU:
	case SIOCSIFMTU:
	case SIOCGIFMEM:
	case SIOCSIFMEM:
	case SIOCGIFHWADDR:
	case SIOCSIFHWADDR:
	case SIOCADDMULTI:
	case SIOCDELMULTI:
	case SIOCGIFINDEX:
	case SIOCGIFADDR:
	case SIOCSIFADDR:
	case SIOCSIFHWBROADCAST:
	case SIOCDIFADDR:
	case SIOCGIFBRDADDR:
	case SIOCSIFBRDADDR:
	case SIOCGIFDSTADDR:
	case SIOCSIFDSTADDR:
	case SIOCGIFNETMASK:
	case SIOCSIFNETMASK:
	case SIOCSIFPFLAGS:
	case SIOCGIFPFLAGS:
	case SIOCGIFTXQLEN:
	case SIOCSIFTXQLEN:
	case SIOCBRADDIF:
	case SIOCBRDELIF:
3430
	case SIOCGIFNAME:
3431 3432 3433 3434
	case SIOCSIFNAME:
	case SIOCGMIIPHY:
	case SIOCGMIIREG:
	case SIOCSMIIREG:
A
Al Viro 已提交
3435 3436 3437 3438
	case SIOCBONDENSLAVE:
	case SIOCBONDRELEASE:
	case SIOCBONDSETHWADDR:
	case SIOCBONDCHANGEACTIVE:
J
Johannes Berg 已提交
3439 3440
		return compat_ifreq_ioctl(net, sock, cmd, argp);

3441 3442 3443 3444
	case SIOCSARP:
	case SIOCGARP:
	case SIOCDARP:
	case SIOCATMARK:
3445
		return sock_do_ioctl(net, sock, cmd, arg);
3446 3447
	}

3448 3449
	return -ENOIOCTLCMD;
}
3450

3451
static long compat_sock_ioctl(struct file *file, unsigned int cmd,
3452
			      unsigned long arg)
3453 3454 3455
{
	struct socket *sock = file->private_data;
	int ret = -ENOIOCTLCMD;
3456 3457 3458 3459 3460
	struct sock *sk;
	struct net *net;

	sk = sock->sk;
	net = sock_net(sk);
3461 3462 3463 3464

	if (sock->ops->compat_ioctl)
		ret = sock->ops->compat_ioctl(sock, cmd, arg);

3465 3466 3467 3468
	if (ret == -ENOIOCTLCMD &&
	    (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST))
		ret = compat_wext_handle_ioctl(net, cmd, arg);

3469 3470 3471
	if (ret == -ENOIOCTLCMD)
		ret = compat_sock_ioctl_trans(file, sock, cmd, arg);

3472 3473 3474 3475
	return ret;
}
#endif

3476 3477 3478 3479 3480 3481 3482 3483 3484
/**
 *	kernel_bind - bind an address to a socket (kernel space)
 *	@sock: socket
 *	@addr: address
 *	@addrlen: length of address
 *
 *	Returns 0 or an error.
 */

3485 3486 3487 3488
int kernel_bind(struct socket *sock, struct sockaddr *addr, int addrlen)
{
	return sock->ops->bind(sock, addr, addrlen);
}
3489
EXPORT_SYMBOL(kernel_bind);
3490

3491 3492 3493 3494 3495 3496 3497 3498
/**
 *	kernel_listen - move socket to listening state (kernel space)
 *	@sock: socket
 *	@backlog: pending connections queue size
 *
 *	Returns 0 or an error.
 */

3499 3500 3501 3502
int kernel_listen(struct socket *sock, int backlog)
{
	return sock->ops->listen(sock, backlog);
}
3503
EXPORT_SYMBOL(kernel_listen);
3504

3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515
/**
 *	kernel_accept - accept a connection (kernel space)
 *	@sock: listening socket
 *	@newsock: new connected socket
 *	@flags: flags
 *
 *	@flags must be SOCK_CLOEXEC, SOCK_NONBLOCK or 0.
 *	If it fails, @newsock is guaranteed to be %NULL.
 *	Returns 0 or an error.
 */

3516 3517 3518 3519 3520 3521 3522 3523 3524 3525
int kernel_accept(struct socket *sock, struct socket **newsock, int flags)
{
	struct sock *sk = sock->sk;
	int err;

	err = sock_create_lite(sk->sk_family, sk->sk_type, sk->sk_protocol,
			       newsock);
	if (err < 0)
		goto done;

3526
	err = sock->ops->accept(sock, *newsock, flags, true);
3527 3528
	if (err < 0) {
		sock_release(*newsock);
3529
		*newsock = NULL;
3530 3531 3532 3533
		goto done;
	}

	(*newsock)->ops = sock->ops;
3534
	__module_get((*newsock)->ops->owner);
3535 3536 3537 3538

done:
	return err;
}
3539
EXPORT_SYMBOL(kernel_accept);
3540

3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553
/**
 *	kernel_connect - connect a socket (kernel space)
 *	@sock: socket
 *	@addr: address
 *	@addrlen: address length
 *	@flags: flags (O_NONBLOCK, ...)
 *
 *	For datagram sockets, @addr is the addres to which datagrams are sent
 *	by default, and the only address from which datagrams are received.
 *	For stream sockets, attempts to connect to @addr.
 *	Returns 0 or an error code.
 */

3554
int kernel_connect(struct socket *sock, struct sockaddr *addr, int addrlen,
3555
		   int flags)
3556 3557 3558
{
	return sock->ops->connect(sock, addr, addrlen, flags);
}
3559
EXPORT_SYMBOL(kernel_connect);
3560

3561 3562 3563 3564 3565 3566 3567 3568 3569
/**
 *	kernel_getsockname - get the address which the socket is bound (kernel space)
 *	@sock: socket
 *	@addr: address holder
 *
 * 	Fills the @addr pointer with the address which the socket is bound.
 *	Returns 0 or an error code.
 */

3570
int kernel_getsockname(struct socket *sock, struct sockaddr *addr)
3571
{
3572
	return sock->ops->getname(sock, addr, 0);
3573
}
3574
EXPORT_SYMBOL(kernel_getsockname);
3575

3576 3577 3578 3579 3580 3581 3582 3583 3584
/**
 *	kernel_peername - get the address which the socket is connected (kernel space)
 *	@sock: socket
 *	@addr: address holder
 *
 * 	Fills the @addr pointer with the address which the socket is connected.
 *	Returns 0 or an error code.
 */

3585
int kernel_getpeername(struct socket *sock, struct sockaddr *addr)
3586
{
3587
	return sock->ops->getname(sock, addr, 1);
3588
}
3589
EXPORT_SYMBOL(kernel_getpeername);
3590

3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602
/**
 *	kernel_getsockopt - get a socket option (kernel space)
 *	@sock: socket
 *	@level: API level (SOL_SOCKET, ...)
 *	@optname: option tag
 *	@optval: option value
 *	@optlen: option length
 *
 *	Assigns the option length to @optlen.
 *	Returns 0 or an error.
 */

3603 3604 3605 3606
int kernel_getsockopt(struct socket *sock, int level, int optname,
			char *optval, int *optlen)
{
	mm_segment_t oldfs = get_fs();
3607 3608
	char __user *uoptval;
	int __user *uoptlen;
3609 3610
	int err;

3611 3612 3613
	uoptval = (char __user __force *) optval;
	uoptlen = (int __user __force *) optlen;

3614 3615
	set_fs(KERNEL_DS);
	if (level == SOL_SOCKET)
3616
		err = sock_getsockopt(sock, level, optname, uoptval, uoptlen);
3617
	else
3618 3619
		err = sock->ops->getsockopt(sock, level, optname, uoptval,
					    uoptlen);
3620 3621 3622
	set_fs(oldfs);
	return err;
}
3623
EXPORT_SYMBOL(kernel_getsockopt);
3624

3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635
/**
 *	kernel_setsockopt - set a socket option (kernel space)
 *	@sock: socket
 *	@level: API level (SOL_SOCKET, ...)
 *	@optname: option tag
 *	@optval: option value
 *	@optlen: option length
 *
 *	Returns 0 or an error.
 */

3636
int kernel_setsockopt(struct socket *sock, int level, int optname,
3637
			char *optval, unsigned int optlen)
3638 3639
{
	mm_segment_t oldfs = get_fs();
3640
	char __user *uoptval;
3641 3642
	int err;

3643 3644
	uoptval = (char __user __force *) optval;

3645 3646
	set_fs(KERNEL_DS);
	if (level == SOL_SOCKET)
3647
		err = sock_setsockopt(sock, level, optname, uoptval, optlen);
3648
	else
3649
		err = sock->ops->setsockopt(sock, level, optname, uoptval,
3650 3651 3652 3653
					    optlen);
	set_fs(oldfs);
	return err;
}
3654
EXPORT_SYMBOL(kernel_setsockopt);
3655

3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666
/**
 *	kernel_sendpage - send a &page through a socket (kernel space)
 *	@sock: socket
 *	@page: page
 *	@offset: page offset
 *	@size: total size in bytes
 *	@flags: flags (MSG_DONTWAIT, ...)
 *
 *	Returns the total amount sent in bytes or an error.
 */

3667 3668 3669 3670 3671 3672 3673 3674
int kernel_sendpage(struct socket *sock, struct page *page, int offset,
		    size_t size, int flags)
{
	if (sock->ops->sendpage)
		return sock->ops->sendpage(sock, page, offset, size, flags);

	return sock_no_sendpage(sock, page, offset, size, flags);
}
3675
EXPORT_SYMBOL(kernel_sendpage);
3676

3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688
/**
 *	kernel_sendpage_locked - send a &page through the locked sock (kernel space)
 *	@sk: sock
 *	@page: page
 *	@offset: page offset
 *	@size: total size in bytes
 *	@flags: flags (MSG_DONTWAIT, ...)
 *
 *	Returns the total amount sent in bytes or an error.
 *	Caller must hold @sk.
 */

3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701
int kernel_sendpage_locked(struct sock *sk, struct page *page, int offset,
			   size_t size, int flags)
{
	struct socket *sock = sk->sk_socket;

	if (sock->ops->sendpage_locked)
		return sock->ops->sendpage_locked(sk, page, offset, size,
						  flags);

	return sock_no_sendpage_locked(sk, page, offset, size, flags);
}
EXPORT_SYMBOL(kernel_sendpage_locked);

3702 3703 3704 3705 3706 3707 3708 3709
/**
 *	kernel_shutdown - shut down part of a full-duplex connection (kernel space)
 *	@sock: socket
 *	@how: connection part
 *
 *	Returns 0 or an error.
 */

3710 3711 3712 3713 3714
int kernel_sock_shutdown(struct socket *sock, enum sock_shutdown_cmd how)
{
	return sock->ops->shutdown(sock, how);
}
EXPORT_SYMBOL(kernel_sock_shutdown);
3715

3716 3717 3718 3719 3720 3721 3722 3723
/**
 *	kernel_sock_ip_overhead - returns the IP overhead imposed by a socket
 *	@sk: socket
 *
 *	This routine returns the IP overhead imposed by a socket i.e.
 *	the length of the underlying IP header, depending on whether
 *	this is an IPv4 or IPv6 socket and the length from IP options turned
 *	on at the socket. Assumes that the caller has a lock on the socket.
3724
 */
3725

3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743
u32 kernel_sock_ip_overhead(struct sock *sk)
{
	struct inet_sock *inet;
	struct ip_options_rcu *opt;
	u32 overhead = 0;
#if IS_ENABLED(CONFIG_IPV6)
	struct ipv6_pinfo *np;
	struct ipv6_txoptions *optv6 = NULL;
#endif /* IS_ENABLED(CONFIG_IPV6) */

	if (!sk)
		return overhead;

	switch (sk->sk_family) {
	case AF_INET:
		inet = inet_sk(sk);
		overhead += sizeof(struct iphdr);
		opt = rcu_dereference_protected(inet->inet_opt,
3744
						sock_owned_by_user(sk));
3745 3746 3747 3748 3749 3750 3751 3752 3753
		if (opt)
			overhead += opt->opt.optlen;
		return overhead;
#if IS_ENABLED(CONFIG_IPV6)
	case AF_INET6:
		np = inet6_sk(sk);
		overhead += sizeof(struct ipv6hdr);
		if (np)
			optv6 = rcu_dereference_protected(np->opt,
3754
							  sock_owned_by_user(sk));
3755 3756 3757 3758 3759 3760 3761 3762 3763
		if (optv6)
			overhead += (optv6->opt_flen + optv6->opt_nflen);
		return overhead;
#endif /* IS_ENABLED(CONFIG_IPV6) */
	default: /* Returns 0 overhead if the socket is not ipv4 or ipv6 */
		return overhead;
	}
}
EXPORT_SYMBOL(kernel_sock_ip_overhead);