tun.c 41.5 KB
Newer Older
L
Linus Torvalds 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
/*
 *  TUN - Universal TUN/TAP device driver.
 *  Copyright (C) 1999-2002 Maxim Krasnyansky <maxk@qualcomm.com>
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 *  GNU General Public License for more details.
 *
 *  $Id: tun.c,v 1.15 2002/03/01 02:44:24 maxk Exp $
 */

/*
 *  Changes:
 *
 *  Mike Kershaw <dragorn@kismetwireless.net> 2005/08/14
 *    Add TUNSETLINK ioctl to set the link encapsulation
 *
 *  Mark Smith <markzzzsmith@yahoo.com.au>
 *    Use eth_random_addr() for tap MAC address.
 *
 *  Harald Roelle <harald.roelle@ifi.lmu.de>  2004/04/20
 *    Fixes in packet dropping, queue length setting and queue wakeup.
 *    Increased default tx queue length.
 *    Added ethtool API.
 *    Minor cleanups
 *
 *  Daniel Podlejski <underley@underley.eu.org>
 *    Modifications for 2.3.99-pre5 kernel.
 */

37 38
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

L
Linus Torvalds 已提交
39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57
#define DRV_NAME	"tun"
#define DRV_VERSION	"1.6"
#define DRV_DESCRIPTION	"Universal TUN/TAP device driver"
#define DRV_COPYRIGHT	"(C) 1999-2004 Max Krasnyansky <maxk@qualcomm.com>"

#include <linux/module.h>
#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/major.h>
#include <linux/slab.h>
#include <linux/poll.h>
#include <linux/fcntl.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/miscdevice.h>
#include <linux/ethtool.h>
#include <linux/rtnetlink.h>
58
#include <linux/compat.h>
L
Linus Torvalds 已提交
59 60 61 62 63
#include <linux/if.h>
#include <linux/if_arp.h>
#include <linux/if_ether.h>
#include <linux/if_tun.h>
#include <linux/crc32.h>
64
#include <linux/nsproxy.h>
65
#include <linux/virtio_net.h>
66
#include <linux/rcupdate.h>
67
#include <net/net_namespace.h>
68
#include <net/netns/generic.h>
69
#include <net/rtnetlink.h>
70
#include <net/sock.h>
L
Linus Torvalds 已提交
71 72 73

#include <asm/uaccess.h>

74 75 76
/* Uncomment to enable debugging */
/* #define TUN_DEBUG 1 */

L
Linus Torvalds 已提交
77 78
#ifdef TUN_DEBUG
static int debug;

/* Per-device debug print: emitted only when the device's own
 * debug field is non-zero. */
#define tun_debug(level, tun, fmt, args...)			\
do {								\
	if ((tun)->debug)					\
		netdev_printk(level, (tun)->dev, fmt, ##args);	\
} while (0)
/* Driver-wide debug print: emitted only when the file-scope
 * 'debug' variable equals 2. */
#define DBG1(level, fmt, args...)				\
do {								\
	if (debug == 2)						\
		printk(level fmt, ##args);			\
} while (0)
#else
/* Debugging disabled: the calls sit behind "if (0)" so the arguments
 * are still seen by the compiler but the call is never executed. */
#define tun_debug(level, tun, fmt, args...)			\
do {								\
	if (0)							\
		netdev_printk(level, (tun)->dev, fmt, ##args);	\
} while (0)
#define DBG1(level, fmt, args...)				\
do {								\
	if (0)							\
		printk(level fmt, ##args);			\
} while (0)
#endif

103 104
#define GOODCOPY_LEN 128

105 106 107 108 109 110 111
/* Number of destination addresses that are matched exactly. */
#define FLT_EXACT_COUNT 8
/* Destination-MAC filter applied in the transmit path (see
 * check_filter()/run_filter()). */
struct tap_filter {
	unsigned int    count;    /* Number of exact addrs in use. Zero means the filter is disabled */
	u32             mask[2];  /* 64-bit hash bitmap of the remaining (multicast) addrs */
	unsigned char	addr[FLT_EXACT_COUNT][ETH_ALEN]; /* Exact-match addrs */
};

112
struct tun_file {
113
	atomic_t count;
114
	struct tun_struct *tun;
115
	struct net *net;
116 117
};

118 119
struct tun_sock;

120
struct tun_struct {
121
	struct tun_file		*tfile;
122
	unsigned int 		flags;
123 124 125 126
	uid_t			owner;
	gid_t			group;

	struct net_device	*dev;
127
	netdev_features_t	set_features;
128 129
#define TUN_USER_FEATURES (NETIF_F_HW_CSUM|NETIF_F_TSO_ECN|NETIF_F_TSO| \
			  NETIF_F_TSO6|NETIF_F_UFO)
130
	struct fasync_struct	*fasync;
131

132
	struct tap_filter       txflt;
133
	struct socket		socket;
134
	struct socket_wq	wq;
135 136 137

	int			vnet_hdr_sz;

138 139
#ifdef TUN_DEBUG
	int debug;
L
Linus Torvalds 已提交
140
#endif
141
};
L
Linus Torvalds 已提交
142

143 144 145 146 147 148 149 150 151 152
/* Socket object for a tun device; 'sk' is embedded so that tun_sk()
 * can recover the container with container_of(). */
struct tun_sock {
	struct sock		sk;
	struct tun_struct	*tun;	/* back-pointer to the owning device */
};

/* Map an embedded struct sock back to its enclosing struct tun_sock. */
static inline struct tun_sock *tun_sk(struct sock *sk)
{
	struct tun_sock *tsk = container_of(sk, struct tun_sock, sk);

	return tsk;
}

153 154
static int tun_attach(struct tun_struct *tun, struct file *file)
{
155
	struct tun_file *tfile = file->private_data;
156
	int err;
157 158 159

	ASSERT_RTNL();

160 161 162 163 164 165 166 167 168 169 170
	netif_tx_lock_bh(tun->dev);

	err = -EINVAL;
	if (tfile->tun)
		goto out;

	err = -EBUSY;
	if (tun->tfile)
		goto out;

	err = 0;
171 172
	tfile->tun = tun;
	tun->tfile = tfile;
173
	tun->socket.file = file;
174
	netif_carrier_on(tun->dev);
175
	dev_hold(tun->dev);
176
	sock_hold(tun->socket.sk);
177
	atomic_inc(&tfile->count);
178

179 180 181
out:
	netif_tx_unlock_bh(tun->dev);
	return err;
182 183
}

184 185 186
static void __tun_detach(struct tun_struct *tun)
{
	/* Detach from net device */
187
	netif_tx_lock_bh(tun->dev);
188
	netif_carrier_off(tun->dev);
189
	tun->tfile = NULL;
190
	netif_tx_unlock_bh(tun->dev);
191 192

	/* Drop read queue */
193
	skb_queue_purge(&tun->socket.sk->sk_receive_queue);
194 195 196 197 198 199 200 201 202 203

	/* Drop the extra count on the net device */
	dev_put(tun->dev);
}

/* __tun_detach() wrapped in rtnl_lock()/rtnl_unlock(). */
static void tun_detach(struct tun_struct *tun)
{
	rtnl_lock();
	__tun_detach(tun);
	rtnl_unlock();
}

static struct tun_struct *__tun_get(struct tun_file *tfile)
{
208 209 210 211 212 213
	struct tun_struct *tun = NULL;

	if (atomic_inc_not_zero(&tfile->count))
		tun = tfile->tun;

	return tun;
214 215 216 217 218 219 220 221 222
}

/* Same as __tun_get(), starting from the struct file. */
static struct tun_struct *tun_get(struct file *file)
{
	struct tun_file *tfile = file->private_data;

	return __tun_get(tfile);
}

static void tun_put(struct tun_struct *tun)
{
223 224 225 226
	struct tun_file *tfile = tun->tfile;

	if (atomic_dec_and_test(&tfile->count))
		tun_detach(tfile->tun);
227 228
}

229
/* TAP filtering */
230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278
/*
 * Set the bit for @addr in the two-word hash bitmap @mask.
 * The bit index is the top 6 bits of the Ethernet CRC of the address.
 */
static void addr_hash_set(u32 *mask, const u8 *addr)
{
	unsigned int n = ether_crc(ETH_ALEN, addr) >> 26;

	/* 1U: shifting a signed 1 into bit 31 is undefined behaviour */
	mask[n >> 5] |= (1U << (n & 31));
}

/*
 * Test the bit for @addr in the two-word hash bitmap @mask.
 * Returns non-zero when the bit is set, 0 otherwise.
 */
static unsigned int addr_hash_test(const u32 *mask, const u8 *addr)
{
	unsigned int n = ether_crc(ETH_ALEN, addr) >> 26;

	/* 1U: shifting a signed 1 into bit 31 is undefined behaviour */
	return mask[n >> 5] & (1U << (n & 31));
}

/*
 * Replace @filter with the address list supplied by user space.
 *
 * @arg points to a struct tun_filter header followed by uf.count
 * Ethernet addresses.  The first FLT_EXACT_COUNT addresses become exact
 * matches; the rest must be multicast and are folded into the hash mask.
 *
 * Returns the number of exact filters installed, 0 when the filter ends
 * up disabled (count of zero, or a non-multicast address among the
 * hashed ones), or a negative errno (-EFAULT, -ENOMEM).
 */
static int update_filter(struct tap_filter *filter, void __user *arg)
{
	struct { u8 u[ETH_ALEN]; } *addr;
	struct tun_filter uf;
	int err, alen, n, nexact;

	if (copy_from_user(&uf, arg, sizeof(uf)))
		return -EFAULT;

	if (!uf.count) {
		/* Disabled */
		filter->count = 0;
		return 0;
	}

	alen = ETH_ALEN * uf.count;
	addr = kmalloc(alen, GFP_KERNEL);
	if (!addr)
		return -ENOMEM;

	if (copy_from_user(addr, arg + sizeof(uf), alen)) {
		err = -EFAULT;
		goto done;
	}

	/* The filter is updated without holding any locks. Which is
	 * perfectly safe. We disable it first and in the worst
	 * case we'll accept a few undesired packets. */
	filter->count = 0;
	wmb();

	/* Use first set of addresses as an exact filter */
	for (n = 0; n < uf.count && n < FLT_EXACT_COUNT; n++)
		memcpy(filter->addr[n], addr[n].u, ETH_ALEN);

	nexact = n;

	/* Remaining multicast addresses are hashed,
	 * unicast will leave the filter disabled. */
	memset(filter->mask, 0, sizeof(filter->mask));
	for (; n < uf.count; n++) {
		if (!is_multicast_ether_addr(addr[n].u)) {
			err = 0; /* no filter */
			goto done;
		}
		addr_hash_set(filter->mask, addr[n].u);
	}

	/* For ALLMULTI just set the mask to all ones.
	 * This overrides the mask populated above. */
	if ((uf.flags & TUN_FLT_ALLMULTI))
		memset(filter->mask, ~0, sizeof(filter->mask));

	/* Now enable the filter */
	wmb();
	filter->count = nexact;

	/* Return the number of exact filters */
	err = nexact;

done:
	kfree(addr);
	return err;
}

/*
 * Match the destination MAC at the start of skb->data against @filter:
 * first the exact list, then (multicast only) the hash mask.
 * Returns: 0 - drop, !=0 - accept
 */
static int run_filter(struct tap_filter *filter, const struct sk_buff *skb)
{
	/* Cannot use eth_hdr(skb) here because skb_mac_hdr() is incorrect
	 * at this point. */
	struct ethhdr *eh = (struct ethhdr *) skb->data;
	unsigned int i;	/* unsigned to match filter->count */

	/* Exact match */
	for (i = 0; i < filter->count; i++)
		if (ether_addr_equal(eh->h_dest, filter->addr[i]))
			return 1;

	/* Inexact match (multicast only) */
	if (is_multicast_ether_addr(eh->h_dest))
		return addr_hash_test(filter->mask, eh->h_dest);

	return 0;
}

/*
 * Decide whether @skb passes @filter.  A filter whose count is zero is
 * disabled and accepts everything; otherwise run_filter() decides.
 * Returns: 0 - drop, !=0 - accept
 */
static int check_filter(struct tap_filter *filter, const struct sk_buff *skb)
{
	return filter->count ? run_filter(filter, skb) : 1;
}

L
Linus Torvalds 已提交
339 340
/* Network device part of the driver */

341
static const struct ethtool_ops tun_ethtool_ops;
L
Linus Torvalds 已提交
342

343 344 345 346 347 348 349 350 351
/* Net device detach from fd. */
static void tun_net_uninit(struct net_device *dev)
{
	struct tun_struct *tun = netdev_priv(dev);
	struct tun_file *tfile = tun->tfile;

	/* Inform the methods they need to stop using the dev.
	 */
	if (tfile) {
352
		wake_up_all(&tun->wq.wait);
353 354 355 356 357
		if (atomic_dec_and_test(&tfile->count))
			__tun_detach(tun);
	}
}

358 359 360 361
static void tun_free_netdev(struct net_device *dev)
{
	struct tun_struct *tun = netdev_priv(dev);

362 363
	BUG_ON(!test_bit(SOCK_EXTERNALLY_ALLOCATED, &tun->socket.flags));

364
	sk_release_kernel(tun->socket.sk);
365 366
}

L
Linus Torvalds 已提交
367 368 369 370 371 372 373 374 375 376 377 378 379 380 381
/* Net device open (ndo_open): calls netif_start_queue() on the device.
 * Always returns 0. */
static int tun_net_open(struct net_device *dev)
{
	netif_start_queue(dev);
	return 0;
}

/* Net device close (ndo_stop): calls netif_stop_queue() on the device.
 * Always returns 0. */
static int tun_net_close(struct net_device *dev)
{
	netif_stop_queue(dev);
	return 0;
}

/* Net device start xmit */
382
static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
L
Linus Torvalds 已提交
383 384 385
{
	struct tun_struct *tun = netdev_priv(dev);

386
	tun_debug(KERN_INFO, tun, "tun_net_xmit %d\n", skb->len);
L
Linus Torvalds 已提交
387 388

	/* Drop packet if interface is not attached */
389
	if (!tun->tfile)
L
Linus Torvalds 已提交
390 391
		goto drop;

392 393 394 395 396 397
	/* Drop if the filter does not like it.
	 * This is a noop if the filter is disabled.
	 * Filter can be enabled only for the TAP devices. */
	if (!check_filter(&tun->txflt, skb))
		goto drop;

398 399 400 401
	if (tun->socket.sk->sk_filter &&
	    sk_filter(tun->socket.sk, skb))
		goto drop;

402
	if (skb_queue_len(&tun->socket.sk->sk_receive_queue) >= dev->tx_queue_len) {
L
Linus Torvalds 已提交
403 404 405 406 407 408 409
		if (!(tun->flags & TUN_ONE_QUEUE)) {
			/* Normal queueing mode. */
			/* Packet scheduler handles dropping of further packets. */
			netif_stop_queue(dev);

			/* We won't see all dropped packets individually, so overrun
			 * error is more appropriate. */
410
			dev->stats.tx_fifo_errors++;
L
Linus Torvalds 已提交
411 412 413 414 415 416 417
		} else {
			/* Single queue mode.
			 * Driver handles dropping of all packets itself. */
			goto drop;
		}
	}

418 419
	/* Orphan the skb - required as we might hang on to it
	 * for indefinite time. */
420 421
	if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
		goto drop;
422 423
	skb_orphan(skb);

424
	/* Enqueue packet */
425
	skb_queue_tail(&tun->socket.sk->sk_receive_queue, skb);
L
Linus Torvalds 已提交
426 427 428 429

	/* Notify and wake up reader process */
	if (tun->flags & TUN_FASYNC)
		kill_fasync(&tun->fasync, SIGIO, POLL_IN);
430
	wake_up_interruptible_poll(&tun->wq.wait, POLLIN |
431
				   POLLRDNORM | POLLRDBAND);
432
	return NETDEV_TX_OK;
L
Linus Torvalds 已提交
433 434

drop:
435
	dev->stats.tx_dropped++;
L
Linus Torvalds 已提交
436
	kfree_skb(skb);
437
	return NETDEV_TX_OK;
L
Linus Torvalds 已提交
438 439
}

440
/* ndo_set_rx_mode for tap devices: intentionally empty. */
static void tun_net_mclist(struct net_device *dev)
{
	/*
	 * This callback is supposed to deal with mc filter in
	 * _rx_ path and has nothing to do with the _tx_ path.
	 * In rx path we always accept everything userspace gives us.
	 */
}

449 450 451 452 453 454 455 456 457 458 459 460
#define MIN_MTU 68
#define MAX_MTU 65535

/*
 * ndo_change_mtu: accept @new_mtu only if it is at least MIN_MTU and,
 * together with the link-layer header length, does not exceed MAX_MTU.
 * Returns 0 on success, -EINVAL otherwise.
 */
static int
tun_net_change_mtu(struct net_device *dev, int new_mtu)
{
	if (new_mtu < MIN_MTU)
		return -EINVAL;
	if (new_mtu + dev->hard_header_len > MAX_MTU)
		return -EINVAL;

	dev->mtu = new_mtu;
	return 0;
}

461 462
/*
 * ndo_fix_features: of the bits in TUN_USER_FEATURES, keep only those
 * also present in tun->set_features; all other requested feature bits
 * pass through unchanged.
 */
static netdev_features_t tun_net_fix_features(struct net_device *dev,
	netdev_features_t features)
{
	struct tun_struct *tun = netdev_priv(dev);

	return (features & tun->set_features) | (features & ~TUN_USER_FEATURES);
}
468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484
#ifdef CONFIG_NET_POLL_CONTROLLER
/* ndo_poll_controller: deliberately a no-op, see below. */
static void tun_poll_controller(struct net_device *dev)
{
	/*
	 * Tun only receives frames when:
	 * 1) the char device endpoint gets data from user space
	 * 2) the tun socket gets a sendmsg call from user space
	 * Since both of those are synchronous operations, we are guaranteed
	 * never to have pending data when we poll for it,
	 * so there's nothing to do here but return.
	 * We need this though so netpoll recognizes us as an interface that
	 * supports polling, which enables bridge devices in virt setups to
	 * still use netconsole.
	 */
	return;
}
#endif
485
static const struct net_device_ops tun_netdev_ops = {
486
	.ndo_uninit		= tun_net_uninit,
487 488
	.ndo_open		= tun_net_open,
	.ndo_stop		= tun_net_close,
489
	.ndo_start_xmit		= tun_net_xmit,
490
	.ndo_change_mtu		= tun_net_change_mtu,
491
	.ndo_fix_features	= tun_net_fix_features,
492 493 494
#ifdef CONFIG_NET_POLL_CONTROLLER
	.ndo_poll_controller	= tun_poll_controller,
#endif
495 496 497
};

static const struct net_device_ops tap_netdev_ops = {
498
	.ndo_uninit		= tun_net_uninit,
499 500
	.ndo_open		= tun_net_open,
	.ndo_stop		= tun_net_close,
501
	.ndo_start_xmit		= tun_net_xmit,
502
	.ndo_change_mtu		= tun_net_change_mtu,
503
	.ndo_fix_features	= tun_net_fix_features,
504
	.ndo_set_rx_mode	= tun_net_mclist,
505 506
	.ndo_set_mac_address	= eth_mac_addr,
	.ndo_validate_addr	= eth_validate_addr,
507 508 509
#ifdef CONFIG_NET_POLL_CONTROLLER
	.ndo_poll_controller	= tun_poll_controller,
#endif
510 511
};

L
Linus Torvalds 已提交
512 513 514 515
/* Initialize net device. */
static void tun_net_init(struct net_device *dev)
{
	struct tun_struct *tun = netdev_priv(dev);
516

L
Linus Torvalds 已提交
517 518
	switch (tun->flags & TUN_TYPE_MASK) {
	case TUN_TUN_DEV:
519 520
		dev->netdev_ops = &tun_netdev_ops;

L
Linus Torvalds 已提交
521 522 523 524 525 526
		/* Point-to-Point TUN Device */
		dev->hard_header_len = 0;
		dev->addr_len = 0;
		dev->mtu = 1500;

		/* Zero header length */
527
		dev->type = ARPHRD_NONE;
L
Linus Torvalds 已提交
528 529 530 531 532
		dev->flags = IFF_POINTOPOINT | IFF_NOARP | IFF_MULTICAST;
		dev->tx_queue_len = TUN_READQ_SIZE;  /* We prefer our own queue length */
		break;

	case TUN_TAP_DEV:
533
		dev->netdev_ops = &tap_netdev_ops;
L
Linus Torvalds 已提交
534 535
		/* Ethernet TAP Device */
		ether_setup(dev);
536
		dev->priv_flags &= ~IFF_TX_SKB_SHARING;
537

538
		eth_hw_addr_random(dev);
539

L
Linus Torvalds 已提交
540 541 542 543 544 545 546 547 548
		dev->tx_queue_len = TUN_READQ_SIZE;  /* We prefer our own queue length */
		break;
	}
}

/* Character device part */

/*
 * Poll.
 * Returns POLLERR when the file is not attached to a device or the
 * device is no longer registered; otherwise POLLIN|POLLRDNORM when a
 * packet is queued and POLLOUT|POLLWRNORM when the socket is writeable.
 */
static unsigned int tun_chr_poll(struct file *file, poll_table * wait)
{
	struct tun_file *tfile = file->private_data;
	struct tun_struct *tun = __tun_get(tfile);
	struct sock *sk;
	unsigned int mask = 0;

	if (!tun)
		return POLLERR;

	sk = tun->socket.sk;

	tun_debug(KERN_INFO, tun, "tun_chr_poll\n");

	poll_wait(file, &tun->wq.wait, wait);

	if (!skb_queue_empty(&sk->sk_receive_queue))
		mask |= POLLIN | POLLRDNORM;

	/* Re-check writeability after setting SOCK_ASYNC_NOSPACE, which
	 * tun_sock_write_space() tests before waking writers. */
	if (sock_writeable(sk) ||
	    (!test_and_set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags) &&
	     sock_writeable(sk)))
		mask |= POLLOUT | POLLWRNORM;

	if (tun->dev->reg_state != NETREG_REGISTERED)
		mask = POLLERR;

	/* Balance the reference taken by __tun_get() */
	tun_put(tun);
	return mask;
}

579 580
/* prepad is the amount to reserve at front.  len is length after that.
 * linear is a hint as to how much to copy (usually headers). */
581 582 583
static struct sk_buff *tun_alloc_skb(struct tun_struct *tun,
				     size_t prepad, size_t len,
				     size_t linear, int noblock)
584
{
585
	struct sock *sk = tun->socket.sk;
586
	struct sk_buff *skb;
587
	int err;
588

589 590
	sock_update_classid(sk);

591
	/* Under a page?  Don't bother with paged skb. */
592
	if (prepad + len < PAGE_SIZE || !linear)
593
		linear = len;
594

595 596
	skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock,
				   &err);
597
	if (!skb)
598
		return ERR_PTR(err);
599 600 601

	skb_reserve(skb, prepad);
	skb_put(skb, linear);
602 603
	skb->data_len = len - linear;
	skb->len += len - linear;
604 605 606 607

	return skb;
}

608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685
/*
 * set skb frags from iovec, this can move to core network code for reuse
 *
 * The first skb_headlen(skb) bytes found after @offset in the iovec are
 * copied into the skb head; the user pages backing the remaining data
 * are pinned and attached to the skb as page fragments.
 *
 * Returns 0 on success, -EFAULT if copying or pinning fails, or
 * -EMSGSIZE if more than MAX_SKB_FRAGS pages would be needed.  On error
 * the fragments already attached stay on the skb.
 */
static int zerocopy_sg_from_iovec(struct sk_buff *skb, const struct iovec *from,
				  int offset, size_t count)
{
	int len = iov_length(from, count) - offset;
	int copy = skb_headlen(skb);
	int size, offset1 = 0;
	int i = 0;

	/* Skip over from offset */
	while (count && (offset >= from->iov_len)) {
		offset -= from->iov_len;
		++from;
		--count;
	}

	/* copy up to skb headlen */
	while (count && (copy > 0)) {
		size = min_t(unsigned int, copy, from->iov_len - offset);
		if (copy_from_user(skb->data + offset1, from->iov_base + offset,
				   size))
			return -EFAULT;
		if (copy > size) {
			++from;
			--count;
			offset = 0;
		} else
			offset += size;
		copy -= size;
		offset1 += size;
	}

	if (len == offset1)
		return 0;

	while (count--) {
		struct page *page[MAX_SKB_FRAGS];
		int num_pages;
		unsigned long base;
		unsigned long truesize;

		len = from->iov_len - offset;
		if (!len) {
			offset = 0;
			++from;
			continue;
		}
		base = (unsigned long)from->iov_base + offset;
		/* Number of pages spanned by [base, base + len) */
		size = ((base & ~PAGE_MASK) + len + ~PAGE_MASK) >> PAGE_SHIFT;
		if (i + size > MAX_SKB_FRAGS)
			return -EMSGSIZE;
		num_pages = get_user_pages_fast(base, size, 0, &page[i]);
		if (num_pages != size) {
			int j;

			/* Release only the pages pinned by this call; they
			 * were stored starting at page[i].  'i' itself must
			 * not be reused as the loop index. */
			for (j = 0; j < num_pages; j++)
				put_page(page[i + j]);
			return -EFAULT;
		}
		truesize = size * PAGE_SIZE;
		skb->data_len += len;
		skb->len += len;
		skb->truesize += truesize;
		/* increase sk_wmem_alloc */
		atomic_add(truesize, &skb->sk->sk_wmem_alloc);
		while (len) {
			int off = base & ~PAGE_MASK;
			int frag_len = min_t(int, len, PAGE_SIZE - off);

			__skb_fill_page_desc(skb, i, page[i], off, frag_len);
			skb_shinfo(skb)->nr_frags++;
			base += frag_len;
			len -= frag_len;
			i++;
		}
		offset = 0;
		++from;
	}
	return 0;
}

L
Linus Torvalds 已提交
686
/* Get packet from user space buffer */
687 688 689
static ssize_t tun_get_user(struct tun_struct *tun, void *msg_control,
			    const struct iovec *iv, size_t total_len,
			    size_t count, int noblock)
L
Linus Torvalds 已提交
690
{
691
	struct tun_pi pi = { 0, cpu_to_be16(ETH_P_IP) };
L
Linus Torvalds 已提交
692
	struct sk_buff *skb;
693
	size_t len = total_len, align = NET_SKB_PAD;
694
	struct virtio_net_hdr gso = { 0 };
695
	int offset = 0;
696 697 698
	int copylen;
	bool zerocopy = false;
	int err;
L
Linus Torvalds 已提交
699 700

	if (!(tun->flags & TUN_NO_PI)) {
701
		if ((len -= sizeof(pi)) > total_len)
L
Linus Torvalds 已提交
702 703
			return -EINVAL;

704
		if (memcpy_fromiovecend((void *)&pi, iv, 0, sizeof(pi)))
L
Linus Torvalds 已提交
705
			return -EFAULT;
706
		offset += sizeof(pi);
L
Linus Torvalds 已提交
707 708
	}

709
	if (tun->flags & TUN_VNET_HDR) {
710
		if ((len -= tun->vnet_hdr_sz) > total_len)
711 712
			return -EINVAL;

713
		if (memcpy_fromiovecend((void *)&gso, iv, offset, sizeof(gso)))
714 715
			return -EFAULT;

716 717 718 719
		if ((gso.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
		    gso.csum_start + gso.csum_offset + 2 > gso.hdr_len)
			gso.hdr_len = gso.csum_start + gso.csum_offset + 2;

720 721
		if (gso.hdr_len > len)
			return -EINVAL;
722
		offset += tun->vnet_hdr_sz;
723 724
	}

725
	if ((tun->flags & TUN_TYPE_MASK) == TUN_TAP_DEV) {
726
		align += NET_IP_ALIGN;
727 728
		if (unlikely(len < ETH_HLEN ||
			     (gso.hdr_len && gso.hdr_len < ETH_HLEN)))
729 730
			return -EINVAL;
	}
731

732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759
	if (msg_control)
		zerocopy = true;

	if (zerocopy) {
		/* Userspace may produce vectors with count greater than
		 * MAX_SKB_FRAGS, so we need to linearize parts of the skb
		 * to let the rest of data to be fit in the frags.
		 */
		if (count > MAX_SKB_FRAGS) {
			copylen = iov_length(iv, count - MAX_SKB_FRAGS);
			if (copylen < offset)
				copylen = 0;
			else
				copylen -= offset;
		} else
				copylen = 0;
		/* There are 256 bytes to be copied in skb, so there is enough
		 * room for skb expand head in case it is used.
		 * The rest of the buffer is mapped from userspace.
		 */
		if (copylen < gso.hdr_len)
			copylen = gso.hdr_len;
		if (!copylen)
			copylen = GOODCOPY_LEN;
	} else
		copylen = len;

	skb = tun_alloc_skb(tun, align, copylen, gso.hdr_len, noblock);
760 761 762 763
	if (IS_ERR(skb)) {
		if (PTR_ERR(skb) != -EAGAIN)
			tun->dev->stats.rx_dropped++;
		return PTR_ERR(skb);
L
Linus Torvalds 已提交
764 765
	}

766 767 768 769 770 771
	if (zerocopy)
		err = zerocopy_sg_from_iovec(skb, iv, offset, count);
	else
		err = skb_copy_datagram_from_iovec(skb, 0, iv, offset, len);

	if (err) {
772
		tun->dev->stats.rx_dropped++;
773
		kfree_skb(skb);
L
Linus Torvalds 已提交
774
		return -EFAULT;
775
	}
L
Linus Torvalds 已提交
776

777 778 779 780 781 782 783
	if (gso.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
		if (!skb_partial_csum_set(skb, gso.csum_start,
					  gso.csum_offset)) {
			tun->dev->stats.rx_frame_errors++;
			kfree_skb(skb);
			return -EINVAL;
		}
784
	}
785

L
Linus Torvalds 已提交
786 787
	switch (tun->flags & TUN_TYPE_MASK) {
	case TUN_TUN_DEV:
788 789 790 791 792 793 794 795 796 797 798 799 800 801 802
		if (tun->flags & TUN_NO_PI) {
			switch (skb->data[0] & 0xf0) {
			case 0x40:
				pi.proto = htons(ETH_P_IP);
				break;
			case 0x60:
				pi.proto = htons(ETH_P_IPV6);
				break;
			default:
				tun->dev->stats.rx_dropped++;
				kfree_skb(skb);
				return -EINVAL;
			}
		}

803
		skb_reset_mac_header(skb);
L
Linus Torvalds 已提交
804
		skb->protocol = pi.proto;
805
		skb->dev = tun->dev;
L
Linus Torvalds 已提交
806 807 808 809
		break;
	case TUN_TAP_DEV:
		skb->protocol = eth_type_trans(skb, tun->dev);
		break;
810
	}
L
Linus Torvalds 已提交
811

812 813 814 815 816 817 818 819 820
	if (gso.gso_type != VIRTIO_NET_HDR_GSO_NONE) {
		pr_debug("GSO!\n");
		switch (gso.gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
		case VIRTIO_NET_HDR_GSO_TCPV4:
			skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;
			break;
		case VIRTIO_NET_HDR_GSO_TCPV6:
			skb_shinfo(skb)->gso_type = SKB_GSO_TCPV6;
			break;
821 822 823
		case VIRTIO_NET_HDR_GSO_UDP:
			skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
			break;
824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843
		default:
			tun->dev->stats.rx_frame_errors++;
			kfree_skb(skb);
			return -EINVAL;
		}

		if (gso.gso_type & VIRTIO_NET_HDR_GSO_ECN)
			skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN;

		skb_shinfo(skb)->gso_size = gso.gso_size;
		if (skb_shinfo(skb)->gso_size == 0) {
			tun->dev->stats.rx_frame_errors++;
			kfree_skb(skb);
			return -EINVAL;
		}

		/* Header must be checked, and gso_segs computed. */
		skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
		skb_shinfo(skb)->gso_segs = 0;
	}
844

845 846 847 848 849 850
	/* copy skb_ubuf_info for callback when skb has no error */
	if (zerocopy) {
		skb_shinfo(skb)->destructor_arg = msg_control;
		skb_shinfo(skb)->tx_flags |= SKBTX_DEV_ZEROCOPY;
	}

L
Linus Torvalds 已提交
851
	netif_rx_ni(skb);
852

853 854
	tun->dev->stats.rx_packets++;
	tun->dev->stats.rx_bytes += len;
L
Linus Torvalds 已提交
855

856
	return total_len;
857
}
L
Linus Torvalds 已提交
858

859 860
/*
 * aio write on the character device: feed the iovec to tun_get_user().
 * Returns -EBADFD when the file is not attached to a device, otherwise
 * whatever tun_get_user() returns.
 */
static ssize_t tun_chr_aio_write(struct kiocb *iocb, const struct iovec *iv,
			      unsigned long count, loff_t pos)
{
	struct file *file = iocb->ki_filp;
	struct tun_struct *tun = tun_get(file);
	ssize_t result;

	if (!tun)
		return -EBADFD;

	/* count is unsigned long, hence %lu */
	tun_debug(KERN_INFO, tun, "tun_chr_write %lu\n", count);

	result = tun_get_user(tun, NULL, iv, iov_length(iv, count), count,
			      file->f_flags & O_NONBLOCK);

	tun_put(tun);
	return result;
}

/* Put packet to the user space buffer */
879 880 881
static ssize_t tun_put_user(struct tun_struct *tun,
			    struct sk_buff *skb,
			    const struct iovec *iv, int len)
L
Linus Torvalds 已提交
882 883 884 885 886 887 888 889 890 891 892 893
{
	struct tun_pi pi = { 0, skb->protocol };
	ssize_t total = 0;

	if (!(tun->flags & TUN_NO_PI)) {
		if ((len -= sizeof(pi)) < 0)
			return -EINVAL;

		if (len < skb->len) {
			/* Packet will be striped */
			pi.flags |= TUN_PKT_STRIP;
		}
894

895
		if (memcpy_toiovecend(iv, (void *) &pi, 0, sizeof(pi)))
L
Linus Torvalds 已提交
896 897
			return -EFAULT;
		total += sizeof(pi);
898
	}
L
Linus Torvalds 已提交
899

900 901
	if (tun->flags & TUN_VNET_HDR) {
		struct virtio_net_hdr gso = { 0 }; /* no info leak */
902
		if ((len -= tun->vnet_hdr_sz) < 0)
903 904 905 906 907 908 909 910 911 912 913 914
			return -EINVAL;

		if (skb_is_gso(skb)) {
			struct skb_shared_info *sinfo = skb_shinfo(skb);

			/* This is a hint as to how much should be linear. */
			gso.hdr_len = skb_headlen(skb);
			gso.gso_size = sinfo->gso_size;
			if (sinfo->gso_type & SKB_GSO_TCPV4)
				gso.gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
			else if (sinfo->gso_type & SKB_GSO_TCPV6)
				gso.gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
915 916
			else if (sinfo->gso_type & SKB_GSO_UDP)
				gso.gso_type = VIRTIO_NET_HDR_GSO_UDP;
917
			else {
918
				pr_err("unexpected GSO type: "
919 920 921 922 923 924 925 926 927 928
				       "0x%x, gso_size %d, hdr_len %d\n",
				       sinfo->gso_type, gso.gso_size,
				       gso.hdr_len);
				print_hex_dump(KERN_ERR, "tun: ",
					       DUMP_PREFIX_NONE,
					       16, 1, skb->head,
					       min((int)gso.hdr_len, 64), true);
				WARN_ON_ONCE(1);
				return -EINVAL;
			}
929 930 931 932 933 934 935
			if (sinfo->gso_type & SKB_GSO_TCP_ECN)
				gso.gso_type |= VIRTIO_NET_HDR_GSO_ECN;
		} else
			gso.gso_type = VIRTIO_NET_HDR_GSO_NONE;

		if (skb->ip_summed == CHECKSUM_PARTIAL) {
			gso.flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
936
			gso.csum_start = skb_checksum_start_offset(skb);
937
			gso.csum_offset = skb->csum_offset;
938 939
		} else if (skb->ip_summed == CHECKSUM_UNNECESSARY) {
			gso.flags = VIRTIO_NET_HDR_F_DATA_VALID;
940 941
		} /* else everything is zero */

942 943
		if (unlikely(memcpy_toiovecend(iv, (void *)&gso, total,
					       sizeof(gso))))
944
			return -EFAULT;
945
		total += tun->vnet_hdr_sz;
946 947
	}

L
Linus Torvalds 已提交
948 949
	len = min_t(int, skb->len, len);

950
	skb_copy_datagram_const_iovec(skb, 0, iv, total, len);
951
	total += skb->len;
L
Linus Torvalds 已提交
952

953 954
	tun->dev->stats.tx_packets++;
	tun->dev->stats.tx_bytes += len;
L
Linus Torvalds 已提交
955 956 957 958

	return total;
}

959 960 961
/*
 * Dequeue one packet from the device socket's receive queue and copy it
 * to @iv with tun_put_user().
 *
 * With @noblock set, returns -EAGAIN when the queue is empty.  Otherwise
 * sleeps on the device wait queue until a packet arrives, a signal is
 * pending (-ERESTARTSYS) or the device is no longer registered (-EIO).
 * A @len of zero returns 0 without touching the queue.
 */
static ssize_t tun_do_read(struct tun_struct *tun,
			   struct kiocb *iocb, const struct iovec *iv,
			   ssize_t len, int noblock)
{
	DECLARE_WAITQUEUE(wait, current);
	struct sk_buff *skb;
	ssize_t ret = 0;

	tun_debug(KERN_INFO, tun, "tun_chr_read\n");

	if (unlikely(!noblock))
		add_wait_queue(&tun->wq.wait, &wait);
	while (len) {
		current->state = TASK_INTERRUPTIBLE;

		/* Read frames from the queue */
		if (!(skb=skb_dequeue(&tun->socket.sk->sk_receive_queue))) {
			if (noblock) {
				ret = -EAGAIN;
				break;
			}
			if (signal_pending(current)) {
				ret = -ERESTARTSYS;
				break;
			}
			if (tun->dev->reg_state != NETREG_REGISTERED) {
				ret = -EIO;
				break;
			}

			/* Nothing to read, let's sleep */
			schedule();
			continue;
		}
		/* A slot was freed: let the transmit queue run again */
		netif_wake_queue(tun->dev);

		ret = tun_put_user(tun, skb, iv, len);
		kfree_skb(skb);
		break;
	}

	current->state = TASK_RUNNING;
	if (unlikely(!noblock))
		remove_wait_queue(&tun->wq.wait, &wait);

	return ret;
}

/*
 * aio read on the character device.
 * Returns -EBADFD when the file is not attached, -EINVAL when the total
 * iovec length is negative, otherwise the result of tun_do_read()
 * capped at the buffer length.
 */
static ssize_t tun_chr_aio_read(struct kiocb *iocb, const struct iovec *iv,
			    unsigned long count, loff_t pos)
{
	struct file *file = iocb->ki_filp;
	struct tun_file *tfile = file->private_data;
	struct tun_struct *tun = __tun_get(tfile);
	ssize_t len, ret;

	if (!tun)
		return -EBADFD;
	len = iov_length(iv, count);
	if (len < 0) {
		ret = -EINVAL;
		goto out;
	}

	ret = tun_do_read(tun, iocb, iv, len, file->f_flags & O_NONBLOCK);
	/* tun_do_read() may report more than was copied (truncation) */
	ret = min_t(ssize_t, ret, len);
out:
	tun_put(tun);
	return ret;
}

static void tun_setup(struct net_device *dev)
{
	struct tun_struct *tun = netdev_priv(dev);

	tun->owner = -1;
1035
	tun->group = -1;
L
Linus Torvalds 已提交
1036 1037

	dev->ethtool_ops = &tun_ethtool_ops;
1038
	dev->destructor = tun_free_netdev;
L
Linus Torvalds 已提交
1039 1040
}

1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055
/* Trivial set of netlink ops to allow deleting tun or tap
 * device with netlink.
 *
 * The validate callback unconditionally returns -EINVAL.
 * NOTE(review): presumably this is what prevents creating devices
 * through netlink - confirm against the rtnetlink core.
 */
static int tun_validate(struct nlattr *tb[], struct nlattr *data[])
{
	return -EINVAL;
}

/* rtnetlink link ops for kind DRV_NAME; the private area of each
 * device is a struct tun_struct. */
static struct rtnl_link_ops tun_link_ops __read_mostly = {
	.kind		= DRV_NAME,
	.priv_size	= sizeof(struct tun_struct),
	.setup		= tun_setup,
	.validate	= tun_validate,
};

1056 1057 1058
static void tun_sock_write_space(struct sock *sk)
{
	struct tun_struct *tun;
1059
	wait_queue_head_t *wqueue;
1060 1061 1062 1063 1064 1065 1066

	if (!sock_writeable(sk))
		return;

	if (!test_and_clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags))
		return;

1067 1068 1069
	wqueue = sk_sleep(sk);
	if (wqueue && waitqueue_active(wqueue))
		wake_up_interruptible_sync_poll(wqueue, POLLOUT |
1070
						POLLWRNORM | POLLWRBAND);
H
Herbert Xu 已提交
1071

1072
	tun = tun_sk(sk)->tun;
1073 1074 1075 1076 1077
	kill_fasync(&tun->fasync, SIGIO, POLL_OUT);
}

static void tun_sock_destruct(struct sock *sk)
{
1078
	free_netdev(tun_sk(sk)->tun->dev);
1079 1080
}

1081 1082 1083 1084
/*
 * proto_ops sendmsg: inject the message's iovec through tun_get_user().
 * m->msg_control is passed on as the zerocopy control pointer and
 * MSG_DONTWAIT selects non-blocking allocation.
 */
static int tun_sendmsg(struct kiocb *iocb, struct socket *sock,
		       struct msghdr *m, size_t total_len)
{
	struct tun_struct *tun = container_of(sock, struct tun_struct, socket);

	return tun_get_user(tun, m->msg_control, m->msg_iov, total_len,
			    m->msg_iovlen, m->msg_flags & MSG_DONTWAIT);
}

/*
 * tun_recvmsg - recvmsg operation of the socket embedded in tun_struct.
 * @iocb:      passed through to tun_do_read().
 * @sock:      the tun_struct's embedded socket.
 * @m:         message whose iovec receives the packet; MSG_TRUNC is set in
 *             msg_flags when the packet did not fit.
 * @total_len: capacity of the iovec in bytes.
 * @flags:     only MSG_DONTWAIT and MSG_TRUNC are accepted.
 *
 * Returns -EINVAL for unsupported flags, a negative error from
 * tun_do_read(), or the number of bytes.  When the packet is longer than
 * @total_len the return value is the full packet length if the caller
 * passed MSG_TRUNC, otherwise @total_len.
 */
static int tun_recvmsg(struct kiocb *iocb, struct socket *sock,
		       struct msghdr *m, size_t total_len,
		       int flags)
{
	struct tun_struct *tun = container_of(sock, struct tun_struct, socket);
	int ret;

	if (flags & ~(MSG_DONTWAIT|MSG_TRUNC))
		return -EINVAL;

	ret = tun_do_read(tun, iocb, m->msg_iov, total_len,
			  flags & MSG_DONTWAIT);
	/*
	 * Compare as signed: with a plain "ret > total_len" a negative error
	 * code is converted to a huge unsigned value, is mistaken for a
	 * truncated packet and gets replaced by total_len.
	 */
	if (ret > (ssize_t)total_len) {
		m->msg_flags |= MSG_TRUNC;
		ret = flags & MSG_TRUNC ? ret : total_len;
	}
	return ret;
}

/*
 * tun_release - release operation of the tun socket.
 * @sock: socket being released.
 *
 * Drops one reference on the attached struct sock, if there is one.
 * Always returns 0.
 */
static int tun_release(struct socket *sock)
{
	struct sock *sk = sock->sk;

	if (sk)
		sock_put(sk);

	return 0;
}

/* Ops structure to mimic raw sockets with tun */
static const struct proto_ops tun_socket_ops = {
	.sendmsg = tun_sendmsg,
	.recvmsg = tun_recvmsg,
1117
	.release = tun_release,
1118 1119
};

/*
 * Protocol descriptor passed to sk_alloc() in tun_set_iff(); obj_size makes
 * each allocated sock large enough to hold a struct tun_sock.
 */
static struct proto tun_proto = {
	.name		= "tun",
	.owner		= THIS_MODULE,
	.obj_size	= sizeof(struct tun_sock),
};

/*
 * tun_flags - translate the driver-internal TUN_* flags to IFF_* flags.
 * @tun: device to inspect.
 *
 * The result always carries exactly one of IFF_TUN / IFF_TAP, plus
 * IFF_NO_PI, IFF_ONE_QUEUE and IFF_VNET_HDR when the matching TUN_* bit is
 * set in @tun->flags.
 */
static int tun_flags(struct tun_struct *tun)
{
	/* Device type: anything that is not a TUN device is reported as TAP. */
	int iff = (tun->flags & TUN_TUN_DEV) ? IFF_TUN : IFF_TAP;

	if (tun->flags & TUN_NO_PI)
		iff |= IFF_NO_PI;
	if (tun->flags & TUN_ONE_QUEUE)
		iff |= IFF_ONE_QUEUE;
	if (tun->flags & TUN_VNET_HDR)
		iff |= IFF_VNET_HDR;

	return iff;
}

/* sysfs show handler: prints the IFF_* view of the device flags in hex. */
static ssize_t tun_show_flags(struct device *dev, struct device_attribute *attr,
			      char *buf)
{
	struct net_device *netdev = to_net_dev(dev);
	struct tun_struct *tun = netdev_priv(netdev);
	int iff = tun_flags(tun);

	return sprintf(buf, "0x%x\n", iff);
}

/* sysfs show handler: prints the device owner as a signed decimal. */
static ssize_t tun_show_owner(struct device *dev, struct device_attribute *attr,
			      char *buf)
{
	struct net_device *netdev = to_net_dev(dev);
	struct tun_struct *tun = netdev_priv(netdev);

	return sprintf(buf, "%d\n", tun->owner);
}

/* sysfs show handler: prints the device group as a signed decimal. */
static ssize_t tun_show_group(struct device *dev, struct device_attribute *attr,
			      char *buf)
{
	struct net_device *netdev = to_net_dev(dev);
	struct tun_struct *tun = netdev_priv(netdev);

	return sprintf(buf, "%d\n", tun->group);
}

/* Read-only (0444) sysfs attributes; no store handler is provided. */
static DEVICE_ATTR(tun_flags, 0444, tun_show_flags, NULL);
static DEVICE_ATTR(owner, 0444, tun_show_owner, NULL);
static DEVICE_ATTR(group, 0444, tun_show_group, NULL);

static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
L
Linus Torvalds 已提交
1173
{
1174
	struct sock *sk;
L
Linus Torvalds 已提交
1175 1176 1177 1178
	struct tun_struct *tun;
	struct net_device *dev;
	int err;

1179 1180
	dev = __dev_get_by_name(net, ifr->ifr_name);
	if (dev) {
1181 1182
		const struct cred *cred = current_cred();

1183 1184
		if (ifr->ifr_flags & IFF_TUN_EXCL)
			return -EBUSY;
1185 1186 1187 1188 1189 1190 1191
		if ((ifr->ifr_flags & IFF_TUN) && dev->netdev_ops == &tun_netdev_ops)
			tun = netdev_priv(dev);
		else if ((ifr->ifr_flags & IFF_TAP) && dev->netdev_ops == &tap_netdev_ops)
			tun = netdev_priv(dev);
		else
			return -EINVAL;

1192 1193 1194 1195
		if (((tun->owner != -1 && cred->euid != tun->owner) ||
		     (tun->group != -1 && !in_egroup_p(tun->group))) &&
		    !capable(CAP_NET_ADMIN))
			return -EPERM;
1196
		err = security_tun_dev_attach(tun->socket.sk);
1197 1198 1199
		if (err < 0)
			return err;

1200 1201 1202
		err = tun_attach(tun, file);
		if (err < 0)
			return err;
1203
	}
L
Linus Torvalds 已提交
1204 1205 1206 1207
	else {
		char *name;
		unsigned long flags = 0;

1208 1209
		if (!capable(CAP_NET_ADMIN))
			return -EPERM;
1210 1211 1212
		err = security_tun_dev_create();
		if (err < 0)
			return err;
1213

L
Linus Torvalds 已提交
1214 1215 1216 1217 1218 1219 1220 1221 1222
		/* Set dev type */
		if (ifr->ifr_flags & IFF_TUN) {
			/* TUN device */
			flags |= TUN_TUN_DEV;
			name = "tun%d";
		} else if (ifr->ifr_flags & IFF_TAP) {
			/* TAP device */
			flags |= TUN_TAP_DEV;
			name = "tap%d";
1223
		} else
1224
			return -EINVAL;
1225

L
Linus Torvalds 已提交
1226 1227 1228 1229 1230 1231 1232 1233
		if (*ifr->ifr_name)
			name = ifr->ifr_name;

		dev = alloc_netdev(sizeof(struct tun_struct), name,
				   tun_setup);
		if (!dev)
			return -ENOMEM;

1234
		dev_net_set(dev, net);
1235
		dev->rtnl_link_ops = &tun_link_ops;
1236

L
Linus Torvalds 已提交
1237 1238 1239
		tun = netdev_priv(dev);
		tun->dev = dev;
		tun->flags = flags;
1240
		tun->txflt.count = 0;
1241
		tun->vnet_hdr_sz = sizeof(struct virtio_net_hdr);
1242
		set_bit(SOCK_EXTERNALLY_ALLOCATED, &tun->socket.flags);
L
Linus Torvalds 已提交
1243

1244
		err = -ENOMEM;
1245
		sk = sk_alloc(&init_net, AF_UNSPEC, GFP_KERNEL, &tun_proto);
1246 1247 1248
		if (!sk)
			goto err_free_dev;

1249
		sk_change_net(sk, net);
1250 1251
		tun->socket.wq = &tun->wq;
		init_waitqueue_head(&tun->wq.wait);
1252
		tun->socket.ops = &tun_socket_ops;
1253 1254 1255
		sock_init_data(&tun->socket, sk);
		sk->sk_write_space = tun_sock_write_space;
		sk->sk_sndbuf = INT_MAX;
1256
		sock_set_flag(sk, SOCK_ZEROCOPY);
1257

1258
		tun_sk(sk)->tun = tun;
1259

1260 1261
		security_tun_dev_post_create(sk);

L
Linus Torvalds 已提交
1262 1263
		tun_net_init(dev);

1264 1265 1266 1267
		dev->hw_features = NETIF_F_SG | NETIF_F_FRAGLIST |
			TUN_USER_FEATURES;
		dev->features = dev->hw_features;

L
Linus Torvalds 已提交
1268 1269
		err = register_netdevice(tun->dev);
		if (err < 0)
1270 1271
			goto err_free_sk;

1272 1273 1274
		if (device_create_file(&tun->dev->dev, &dev_attr_tun_flags) ||
		    device_create_file(&tun->dev->dev, &dev_attr_owner) ||
		    device_create_file(&tun->dev->dev, &dev_attr_group))
1275
			pr_err("Failed to create tun sysfs files\n");
1276

1277
		sk->sk_destruct = tun_sock_destruct;
1278 1279 1280

		err = tun_attach(tun, file);
		if (err < 0)
1281
			goto failed;
L
Linus Torvalds 已提交
1282 1283
	}

1284
	tun_debug(KERN_INFO, tun, "tun_set_iff\n");
L
Linus Torvalds 已提交
1285 1286 1287

	if (ifr->ifr_flags & IFF_NO_PI)
		tun->flags |= TUN_NO_PI;
1288 1289
	else
		tun->flags &= ~TUN_NO_PI;
L
Linus Torvalds 已提交
1290 1291 1292

	if (ifr->ifr_flags & IFF_ONE_QUEUE)
		tun->flags |= TUN_ONE_QUEUE;
1293 1294
	else
		tun->flags &= ~TUN_ONE_QUEUE;
L
Linus Torvalds 已提交
1295

1296 1297 1298 1299 1300
	if (ifr->ifr_flags & IFF_VNET_HDR)
		tun->flags |= TUN_VNET_HDR;
	else
		tun->flags &= ~TUN_VNET_HDR;

1301 1302 1303 1304 1305 1306
	/* Make sure persistent devices do not get stuck in
	 * xoff state.
	 */
	if (netif_running(tun->dev))
		netif_wake_queue(tun->dev);

L
Linus Torvalds 已提交
1307 1308 1309
	strcpy(ifr->ifr_name, tun->dev->name);
	return 0;

1310
 err_free_sk:
1311
	tun_free_netdev(dev);
L
Linus Torvalds 已提交
1312 1313 1314 1315 1316 1317
 err_free_dev:
	free_netdev(dev);
 failed:
	return err;
}

static int tun_get_iff(struct net *net, struct tun_struct *tun,
		       struct ifreq *ifr)
1320
{
1321
	tun_debug(KERN_INFO, tun, "tun_get_iff\n");
1322 1323 1324

	strcpy(ifr->ifr_name, tun->dev->name);

1325
	ifr->ifr_flags = tun_flags(tun);
1326 1327 1328 1329

	return 0;
}

/* This is like a cut-down ethtool ops, except done via tun fd so no
 * privs required. */
1332
static int set_offload(struct tun_struct *tun, unsigned long arg)
1333
{
1334
	netdev_features_t features = 0;
1335 1336

	if (arg & TUN_F_CSUM) {
1337
		features |= NETIF_F_HW_CSUM;
1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350
		arg &= ~TUN_F_CSUM;

		if (arg & (TUN_F_TSO4|TUN_F_TSO6)) {
			if (arg & TUN_F_TSO_ECN) {
				features |= NETIF_F_TSO_ECN;
				arg &= ~TUN_F_TSO_ECN;
			}
			if (arg & TUN_F_TSO4)
				features |= NETIF_F_TSO;
			if (arg & TUN_F_TSO6)
				features |= NETIF_F_TSO6;
			arg &= ~(TUN_F_TSO4|TUN_F_TSO6);
		}
1351 1352 1353 1354 1355

		if (arg & TUN_F_UFO) {
			features |= NETIF_F_UFO;
			arg &= ~TUN_F_UFO;
		}
1356 1357 1358 1359 1360 1361 1362
	}

	/* This gives the user a way to test for new features in future by
	 * trying to set them. */
	if (arg)
		return -EINVAL;

1363 1364
	tun->set_features = features;
	netdev_update_features(tun->dev);
1365 1366 1367 1368

	return 0;
}

static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
			    unsigned long arg, int ifreq_len)
L
Linus Torvalds 已提交
1371
{
1372
	struct tun_file *tfile = file->private_data;
1373
	struct tun_struct *tun;
L
Linus Torvalds 已提交
1374
	void __user* argp = (void __user*)arg;
1375
	struct sock_fprog fprog;
L
Linus Torvalds 已提交
1376
	struct ifreq ifr;
1377
	int sndbuf;
1378
	int vnet_hdr_sz;
1379
	int ret;
L
Linus Torvalds 已提交
1380

1381
	if (cmd == TUNSETIFF || _IOC_TYPE(cmd) == 0x89) {
1382
		if (copy_from_user(&ifr, argp, ifreq_len))
L
Linus Torvalds 已提交
1383
			return -EFAULT;
D
David S. Miller 已提交
1384
	} else {
1385
		memset(&ifr, 0, sizeof(ifr));
D
David S. Miller 已提交
1386
	}
1387 1388 1389 1390 1391 1392 1393 1394 1395
	if (cmd == TUNGETFEATURES) {
		/* Currently this just means: "what IFF flags are valid?".
		 * This is needed because we never checked for invalid flags on
		 * TUNSETIFF. */
		return put_user(IFF_TUN | IFF_TAP | IFF_NO_PI | IFF_ONE_QUEUE |
				IFF_VNET_HDR,
				(unsigned int __user*)argp);
	}

1396 1397
	rtnl_lock();

1398
	tun = __tun_get(tfile);
L
Linus Torvalds 已提交
1399 1400 1401
	if (cmd == TUNSETIFF && !tun) {
		ifr.ifr_name[IFNAMSIZ-1] = '\0';

1402
		ret = tun_set_iff(tfile->net, file, &ifr);
L
Linus Torvalds 已提交
1403

1404 1405
		if (ret)
			goto unlock;
L
Linus Torvalds 已提交
1406

1407
		if (copy_to_user(argp, &ifr, ifreq_len))
1408 1409
			ret = -EFAULT;
		goto unlock;
L
Linus Torvalds 已提交
1410 1411
	}

1412
	ret = -EBADFD;
L
Linus Torvalds 已提交
1413
	if (!tun)
1414
		goto unlock;
L
Linus Torvalds 已提交
1415

1416
	tun_debug(KERN_INFO, tun, "tun_chr_ioctl cmd %d\n", cmd);
L
Linus Torvalds 已提交
1417

1418
	ret = 0;
L
Linus Torvalds 已提交
1419
	switch (cmd) {
1420
	case TUNGETIFF:
1421
		ret = tun_get_iff(current->nsproxy->net_ns, tun, &ifr);
1422
		if (ret)
1423
			break;
1424

1425
		if (copy_to_user(argp, &ifr, ifreq_len))
1426
			ret = -EFAULT;
1427 1428
		break;

L
Linus Torvalds 已提交
1429 1430 1431
	case TUNSETNOCSUM:
		/* Disable/Enable checksum */

1432 1433
		/* [unimplemented] */
		tun_debug(KERN_INFO, tun, "ignored: set checksum %s\n",
1434
			  arg ? "disabled" : "enabled");
L
Linus Torvalds 已提交
1435 1436 1437 1438 1439 1440 1441 1442 1443
		break;

	case TUNSETPERSIST:
		/* Disable/Enable persist mode */
		if (arg)
			tun->flags |= TUN_PERSIST;
		else
			tun->flags &= ~TUN_PERSIST;

1444 1445
		tun_debug(KERN_INFO, tun, "persist %s\n",
			  arg ? "enabled" : "disabled");
L
Linus Torvalds 已提交
1446 1447 1448 1449 1450 1451
		break;

	case TUNSETOWNER:
		/* Set owner of the device */
		tun->owner = (uid_t) arg;

1452
		tun_debug(KERN_INFO, tun, "owner set to %d\n", tun->owner);
L
Linus Torvalds 已提交
1453 1454
		break;

1455 1456 1457 1458
	case TUNSETGROUP:
		/* Set group of the device */
		tun->group= (gid_t) arg;

1459
		tun_debug(KERN_INFO, tun, "group set to %d\n", tun->group);
1460 1461
		break;

1462 1463 1464
	case TUNSETLINK:
		/* Only allow setting the type when the interface is down */
		if (tun->dev->flags & IFF_UP) {
1465 1466
			tun_debug(KERN_INFO, tun,
				  "Linktype set failed because interface is up\n");
1467
			ret = -EBUSY;
1468 1469
		} else {
			tun->dev->type = (int) arg;
1470 1471
			tun_debug(KERN_INFO, tun, "linktype set to %d\n",
				  tun->dev->type);
1472
			ret = 0;
1473
		}
1474
		break;
1475

L
Linus Torvalds 已提交
1476 1477 1478 1479 1480
#ifdef TUN_DEBUG
	case TUNSETDEBUG:
		tun->debug = arg;
		break;
#endif
1481
	case TUNSETOFFLOAD:
1482
		ret = set_offload(tun, arg);
1483
		break;
1484

1485 1486
	case TUNSETTXFILTER:
		/* Can be set only for TAPs */
1487
		ret = -EINVAL;
1488
		if ((tun->flags & TUN_TYPE_MASK) != TUN_TAP_DEV)
1489
			break;
H
Harvey Harrison 已提交
1490
		ret = update_filter(&tun->txflt, (void __user *)arg);
1491
		break;
L
Linus Torvalds 已提交
1492 1493

	case SIOCGIFHWADDR:
1494
		/* Get hw address */
1495 1496
		memcpy(ifr.ifr_hwaddr.sa_data, tun->dev->dev_addr, ETH_ALEN);
		ifr.ifr_hwaddr.sa_family = tun->dev->type;
1497
		if (copy_to_user(argp, &ifr, ifreq_len))
1498 1499
			ret = -EFAULT;
		break;
L
Linus Torvalds 已提交
1500 1501

	case SIOCSIFHWADDR:
1502
		/* Set hw address */
1503 1504
		tun_debug(KERN_DEBUG, tun, "set hw address: %pM\n",
			  ifr.ifr_hwaddr.sa_data);
1505 1506

		ret = dev_set_mac_address(tun->dev, &ifr.ifr_hwaddr);
1507
		break;
1508 1509

	case TUNGETSNDBUF:
1510
		sndbuf = tun->socket.sk->sk_sndbuf;
1511 1512 1513 1514 1515 1516 1517 1518 1519 1520
		if (copy_to_user(argp, &sndbuf, sizeof(sndbuf)))
			ret = -EFAULT;
		break;

	case TUNSETSNDBUF:
		if (copy_from_user(&sndbuf, argp, sizeof(sndbuf))) {
			ret = -EFAULT;
			break;
		}

1521
		tun->socket.sk->sk_sndbuf = sndbuf;
1522 1523
		break;

1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542
	case TUNGETVNETHDRSZ:
		vnet_hdr_sz = tun->vnet_hdr_sz;
		if (copy_to_user(argp, &vnet_hdr_sz, sizeof(vnet_hdr_sz)))
			ret = -EFAULT;
		break;

	case TUNSETVNETHDRSZ:
		if (copy_from_user(&vnet_hdr_sz, argp, sizeof(vnet_hdr_sz))) {
			ret = -EFAULT;
			break;
		}
		if (vnet_hdr_sz < (int)sizeof(struct virtio_net_hdr)) {
			ret = -EINVAL;
			break;
		}

		tun->vnet_hdr_sz = vnet_hdr_sz;
		break;

1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562
	case TUNATTACHFILTER:
		/* Can be set only for TAPs */
		ret = -EINVAL;
		if ((tun->flags & TUN_TYPE_MASK) != TUN_TAP_DEV)
			break;
		ret = -EFAULT;
		if (copy_from_user(&fprog, argp, sizeof(fprog)))
			break;

		ret = sk_attach_filter(&fprog, tun->socket.sk);
		break;

	case TUNDETACHFILTER:
		/* Can be set only for TAPs */
		ret = -EINVAL;
		if ((tun->flags & TUN_TYPE_MASK) != TUN_TAP_DEV)
			break;
		ret = sk_detach_filter(tun->socket.sk);
		break;

L
Linus Torvalds 已提交
1563
	default:
1564 1565
		ret = -EINVAL;
		break;
1566
	}
L
Linus Torvalds 已提交
1567

1568 1569 1570 1571
unlock:
	rtnl_unlock();
	if (tun)
		tun_put(tun);
1572
	return ret;
L
Linus Torvalds 已提交
1573 1574
}

/* Native ioctl entry point: a full struct ifreq is exchanged with user space. */
static long tun_chr_ioctl(struct file *file,
			  unsigned int cmd, unsigned long arg)
{
	const int ifreq_len = sizeof(struct ifreq);

	return __tun_chr_ioctl(file, cmd, arg, ifreq_len);
}

#ifdef CONFIG_COMPAT
/*
 * Compat ioctl entry point.  Commands in the list below carry a user
 * pointer, which is converted with compat_ptr(); for every other command
 * the argument is truncated to compat_ulong_t.
 */
static long tun_chr_compat_ioctl(struct file *file,
			 unsigned int cmd, unsigned long arg)
{
	bool arg_is_pointer;

	switch (cmd) {
	case TUNSETIFF:
	case TUNGETIFF:
	case TUNSETTXFILTER:
	case TUNGETSNDBUF:
	case TUNSETSNDBUF:
	case SIOCGIFHWADDR:
	case SIOCSIFHWADDR:
		arg_is_pointer = true;
		break;
	default:
		arg_is_pointer = false;
		break;
	}

	if (arg_is_pointer)
		arg = (unsigned long)compat_ptr(arg);
	else
		arg = (compat_ulong_t)arg;

	/*
	 * struct compat_ifreq is shorter than struct ifreq, so only that
	 * many bytes may be accessed.  The fields this driver uses have the
	 * same layout in both, so the contents need no conversion.
	 */
	return __tun_chr_ioctl(file, cmd, arg, sizeof(struct compat_ifreq));
}
#endif /* CONFIG_COMPAT */

static int tun_chr_fasync(int fd, struct file *file, int on)
{
1612
	struct tun_struct *tun = tun_get(file);
L
Linus Torvalds 已提交
1613 1614 1615 1616 1617
	int ret;

	if (!tun)
		return -EBADFD;

1618
	tun_debug(KERN_INFO, tun, "tun_chr_fasync %d\n", on);
L
Linus Torvalds 已提交
1619 1620

	if ((ret = fasync_helper(fd, file, on, &tun->fasync)) < 0)
1621
		goto out;
1622

L
Linus Torvalds 已提交
1623
	if (on) {
1624
		ret = __f_setown(file, task_pid(current), PIDTYPE_PID, 0);
L
Linus Torvalds 已提交
1625
		if (ret)
1626
			goto out;
L
Linus Torvalds 已提交
1627
		tun->flags |= TUN_FASYNC;
1628
	} else
L
Linus Torvalds 已提交
1629
		tun->flags &= ~TUN_FASYNC;
1630 1631
	ret = 0;
out:
1632
	tun_put(tun);
1633
	return ret;
L
Linus Torvalds 已提交
1634 1635 1636 1637
}

static int tun_chr_open(struct inode *inode, struct file * file)
{
1638
	struct tun_file *tfile;
1639

1640
	DBG1(KERN_INFO, "tunX: tun_chr_open\n");
1641 1642 1643 1644

	tfile = kmalloc(sizeof(*tfile), GFP_KERNEL);
	if (!tfile)
		return -ENOMEM;
1645
	atomic_set(&tfile->count, 0);
1646
	tfile->tun = NULL;
1647
	tfile->net = get_net(current->nsproxy->net_ns);
1648
	file->private_data = tfile;
L
Linus Torvalds 已提交
1649 1650 1651 1652 1653
	return 0;
}

static int tun_chr_close(struct inode *inode, struct file *file)
{
1654
	struct tun_file *tfile = file->private_data;
1655
	struct tun_struct *tun;
L
Linus Torvalds 已提交
1656

1657
	tun = __tun_get(tfile);
1658
	if (tun) {
1659 1660
		struct net_device *dev = tun->dev;

1661
		tun_debug(KERN_INFO, tun, "tun_chr_close\n");
L
Linus Torvalds 已提交
1662

1663
		__tun_detach(tun);
L
Linus Torvalds 已提交
1664

1665
		/* If desirable, unregister the netdevice. */
1666 1667 1668 1669 1670 1671
		if (!(tun->flags & TUN_PERSIST)) {
			rtnl_lock();
			if (dev->reg_state == NETREG_REGISTERED)
				unregister_netdevice(dev);
			rtnl_unlock();
		}
1672
	}
L
Linus Torvalds 已提交
1673

1674 1675
	tun = tfile->tun;
	if (tun)
1676
		sock_put(tun->socket.sk);
1677

1678
	put_net(tfile->net);
1679
	kfree(tfile);
L
Linus Torvalds 已提交
1680 1681 1682 1683

	return 0;
}

static const struct file_operations tun_fops = {
1685
	.owner	= THIS_MODULE,
L
Linus Torvalds 已提交
1686
	.llseek = no_llseek,
1687 1688 1689 1690
	.read  = do_sync_read,
	.aio_read  = tun_chr_aio_read,
	.write = do_sync_write,
	.aio_write = tun_chr_aio_write,
L
Linus Torvalds 已提交
1691
	.poll	= tun_chr_poll,
1692 1693 1694 1695
	.unlocked_ioctl	= tun_chr_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl = tun_chr_compat_ioctl,
#endif
L
Linus Torvalds 已提交
1696 1697
	.open	= tun_chr_open,
	.release = tun_chr_close,
1698
	.fasync = tun_chr_fasync
L
Linus Torvalds 已提交
1699 1700 1701 1702 1703
};

static struct miscdevice tun_miscdev = {
	.minor = TUN_MINOR,
	.name = "tun",
1704
	.nodename = "net/tun",
L
Linus Torvalds 已提交
1705 1706 1707 1708 1709 1710 1711 1712 1713
	.fops = &tun_fops,
};

/* ethtool interface */

static int tun_get_settings(struct net_device *dev, struct ethtool_cmd *cmd)
{
	cmd->supported		= 0;
	cmd->advertising	= 0;
1714
	ethtool_cmd_speed_set(cmd, SPEED_10);
L
Linus Torvalds 已提交
1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728
	cmd->duplex		= DUPLEX_FULL;
	cmd->port		= PORT_TP;
	cmd->phy_address	= 0;
	cmd->transceiver	= XCVR_INTERNAL;
	cmd->autoneg		= AUTONEG_DISABLE;
	cmd->maxtxpkt		= 0;
	cmd->maxrxpkt		= 0;
	return 0;
}

static void tun_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info)
{
	struct tun_struct *tun = netdev_priv(dev);

1729 1730
	strlcpy(info->driver, DRV_NAME, sizeof(info->driver));
	strlcpy(info->version, DRV_VERSION, sizeof(info->version));
L
Linus Torvalds 已提交
1731 1732 1733

	switch (tun->flags & TUN_TYPE_MASK) {
	case TUN_TUN_DEV:
1734
		strlcpy(info->bus_info, "tun", sizeof(info->bus_info));
L
Linus Torvalds 已提交
1735 1736
		break;
	case TUN_TAP_DEV:
1737
		strlcpy(info->bus_info, "tap", sizeof(info->bus_info));
L
Linus Torvalds 已提交
1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759
		break;
	}
}

/*
 * tun_get_msglevel - ethtool get_msglevel.
 *
 * With TUN_DEBUG defined, returns the device's debug level.
 * NOTE(review): without TUN_DEBUG, -EOPNOTSUPP is returned through a u32
 * return type, so the caller receives a large positive value rather than
 * a negative errno - confirm this is intended.
 */
static u32 tun_get_msglevel(struct net_device *dev)
{
#ifdef TUN_DEBUG
	struct tun_struct *tun = netdev_priv(dev);
	return tun->debug;
#else
	return -EOPNOTSUPP;
#endif
}

/*
 * tun_set_msglevel - ethtool set_msglevel.
 *
 * With TUN_DEBUG defined, stores @value as the device's debug level;
 * otherwise the function body is empty and the request is ignored.
 */
static void tun_set_msglevel(struct net_device *dev, u32 value)
{
#ifdef TUN_DEBUG
	struct tun_struct *tun = netdev_priv(dev);
	tun->debug = value;
#endif
}

static const struct ethtool_ops tun_ethtool_ops = {
L
Linus Torvalds 已提交
1761 1762 1763 1764
	.get_settings	= tun_get_settings,
	.get_drvinfo	= tun_get_drvinfo,
	.get_msglevel	= tun_get_msglevel,
	.set_msglevel	= tun_set_msglevel,
1765
	.get_link	= ethtool_op_get_link,
L
Linus Torvalds 已提交
1766 1767
};

/*
 * tun_init - module initialisation.
 *
 * Registers the rtnl link ops first and the misc device second; if the
 * misc device cannot be registered the link ops are unregistered again.
 * Returns 0 on success or the error of the failing registration.
 */
static int __init tun_init(void)
{
	int ret = 0;

	pr_info("%s, %s\n", DRV_DESCRIPTION, DRV_VERSION);
	pr_info("%s\n", DRV_COPYRIGHT);

	ret = rtnl_link_register(&tun_link_ops);
	if (ret) {
		pr_err("Can't register link_ops\n");
		goto err_linkops;
	}

	ret = misc_register(&tun_miscdev);
	if (ret) {
		pr_err("Can't register misc device %d\n", TUN_MINOR);
		goto err_misc;
	}
	return 0;
err_misc:
	rtnl_link_unregister(&tun_link_ops);
err_linkops:
	return ret;
}

static void tun_cleanup(void)
{
1796
	misc_deregister(&tun_miscdev);
1797
	rtnl_link_unregister(&tun_link_ops);
L
Linus Torvalds 已提交
1798 1799
}

/*
 * tun_get_socket - look up the socket object behind a tun file.
 * @file: file to inspect.
 *
 * Returns ERR_PTR(-EINVAL) when @file is not a tun character-device file
 * and ERR_PTR(-EBADFD) when it has no device attached.  Otherwise returns
 * the socket embedded in the attached device; it works like a packet
 * socket and can be used for sock_sendmsg/sock_recvmsg.
 *
 * The device reference taken for the lookup is dropped before returning,
 * so the caller is responsible for holding a reference to @file for as
 * long as the socket is in use.
 */
struct socket *tun_get_socket(struct file *file)
{
	struct tun_struct *tun;
	struct socket *sock;

	if (file->f_op != &tun_fops)
		return ERR_PTR(-EINVAL);

	tun = tun_get(file);
	if (!tun)
		return ERR_PTR(-EBADFD);

	tun_put(tun);
	sock = &tun->socket;

	return sock;
}
EXPORT_SYMBOL_GPL(tun_get_socket);

/* Module entry/exit points and metadata. */
module_init(tun_init);
module_exit(tun_cleanup);
MODULE_DESCRIPTION(DRV_DESCRIPTION);
MODULE_AUTHOR(DRV_COPYRIGHT);
MODULE_LICENSE("GPL");
MODULE_ALIAS_MISCDEV(TUN_MINOR);
MODULE_ALIAS("devname:net/tun");