tun.c 55.8 KB
Newer Older
L
Linus Torvalds 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
/*
 *  TUN - Universal TUN/TAP device driver.
 *  Copyright (C) 1999-2002 Maxim Krasnyansky <maxk@qualcomm.com>
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 *  GNU General Public License for more details.
 *
 *  $Id: tun.c,v 1.15 2002/03/01 02:44:24 maxk Exp $
 */

/*
 *  Changes:
 *
21 22 23
 *  Mike Kershaw <dragorn@kismetwireless.net> 2005/08/14
 *    Add TUNSETLINK ioctl to set the link encapsulation
 *
L
Linus Torvalds 已提交
24
 *  Mark Smith <markzzzsmith@yahoo.com.au>
J
Joe Perches 已提交
25
 *    Use eth_random_addr() for tap MAC address.
L
Linus Torvalds 已提交
26 27 28 29 30 31 32 33 34 35 36
 *
 *  Harald Roelle <harald.roelle@ifi.lmu.de>  2004/04/20
 *    Fixes in packet dropping, queue length setting and queue wakeup.
 *    Increased default tx queue length.
 *    Added ethtool API.
 *    Minor cleanups
 *
 *  Daniel Podlejski <underley@underley.eu.org>
 *    Modifications for 2.3.99-pre5 kernel.
 */

37 38
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

L
Linus Torvalds 已提交
39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57
#define DRV_NAME	"tun"
#define DRV_VERSION	"1.6"
#define DRV_DESCRIPTION	"Universal TUN/TAP device driver"
#define DRV_COPYRIGHT	"(C) 1999-2004 Max Krasnyansky <maxk@qualcomm.com>"

#include <linux/module.h>
#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/major.h>
#include <linux/slab.h>
#include <linux/poll.h>
#include <linux/fcntl.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/miscdevice.h>
#include <linux/ethtool.h>
#include <linux/rtnetlink.h>
58
#include <linux/compat.h>
L
Linus Torvalds 已提交
59 60 61 62
#include <linux/if.h>
#include <linux/if_arp.h>
#include <linux/if_ether.h>
#include <linux/if_tun.h>
J
Jason Wang 已提交
63
#include <linux/if_vlan.h>
L
Linus Torvalds 已提交
64
#include <linux/crc32.h>
65
#include <linux/nsproxy.h>
66
#include <linux/virtio_net.h>
M
Michael S. Tsirkin 已提交
67
#include <linux/rcupdate.h>
68
#include <net/ipv6.h>
69
#include <net/net_namespace.h>
70
#include <net/netns/generic.h>
71
#include <net/rtnetlink.h>
72
#include <net/sock.h>
73
#include <linux/seq_file.h>
H
Herbert Xu 已提交
74
#include <linux/uio.h>
L
Linus Torvalds 已提交
75 76 77

#include <asm/uaccess.h>

78 79 80
/* Uncomment to enable debugging */
/* #define TUN_DEBUG 1 */

L
Linus Torvalds 已提交
81 82
#ifdef TUN_DEBUG
static int debug;
83

84 85 86 87 88 89 90 91 92 93
#define tun_debug(level, tun, fmt, args...)			\
do {								\
	if (tun->debug)						\
		netdev_printk(level, tun->dev, fmt, ##args);	\
} while (0)
#define DBG1(level, fmt, args...)				\
do {								\
	if (debug == 2)						\
		printk(level fmt, ##args);			\
} while (0)
94
#else
95 96 97 98 99 100 101 102 103 104
#define tun_debug(level, tun, fmt, args...)			\
do {								\
	if (0)							\
		netdev_printk(level, tun->dev, fmt, ##args);	\
} while (0)
#define DBG1(level, fmt, args...)				\
do {								\
	if (0)							\
		printk(level fmt, ##args);			\
} while (0)
105 106
#endif

107 108 109 110 111 112
/* TUN device flags */

/* IFF_ATTACH_QUEUE is never stored in device flags,
 * overload it to mean fasync when stored there.
 */
#define TUN_FASYNC	IFF_ATTACH_QUEUE
M
Michael S. Tsirkin 已提交
113 114
/* High bits in flags field are unused. */
#define TUN_VNET_LE     0x80000000
115 116

#define TUN_FEATURES (IFF_NO_PI | IFF_ONE_QUEUE | IFF_VNET_HDR | \
M
Michael S. Tsirkin 已提交
117
		      IFF_MULTI_QUEUE)
118 119
#define GOODCOPY_LEN 128

120 121 122 123 124 125 126
#define FLT_EXACT_COUNT 8
struct tap_filter {
	unsigned int    count;    /* Number of addrs. Zero means disabled */
	u32             mask[2];  /* Mask of the hashed addrs */
	unsigned char	addr[FLT_EXACT_COUNT][ETH_ALEN];
};

S
stephen hemminger 已提交
127
/* DEFAULT_MAX_NUM_RSS_QUEUES were chosen to let the rx/tx queues allocated for
128 129 130
 * the netdevice to be fit in one page. So we can make sure the success of
 * memory allocation. TODO: increase the limit. */
#define MAX_TAP_QUEUES DEFAULT_MAX_NUM_RSS_QUEUES
131
#define MAX_TAP_FLOWS  4096
J
Jason Wang 已提交
132

J
Jason Wang 已提交
133 134
#define TUN_FLOW_EXPIRE (3 * HZ)

J
Jason Wang 已提交
135
/* A tun_file connects an open character device to a tuntap netdevice. It
S
stephen hemminger 已提交
136
 * also contains all socket related structures (except sock_fprog and tap_filter)
J
Jason Wang 已提交
137 138
 * to serve as one transmit queue for tuntap device. The sock_fprog and
 * tap_filter were kept in tun_struct since they were used for filtering for the
R
Rami Rosen 已提交
139
 * netdevice not for a specific queue (at least I didn't see the requirement for
J
Jason Wang 已提交
140
 * this).
141 142
 *
 * RCU usage:
R
Rami Rosen 已提交
143
 * The tun_file and tun_struct are loosely coupled, the pointer from one to the
144
 * other can only be read while rcu_read_lock or rtnl_lock is held.
J
Jason Wang 已提交
145
 */
E
Eric W. Biederman 已提交
146
struct tun_file {
J
Jason Wang 已提交
147 148 149
	struct sock sk;
	struct socket socket;
	struct socket_wq wq;
150
	struct tun_struct __rcu *tun;
E
Eric W. Biederman 已提交
151
	struct net *net;
J
Jason Wang 已提交
152 153 154
	struct fasync_struct *fasync;
	/* only used for fasnyc */
	unsigned int flags;
155 156 157 158
	union {
		u16 queue_index;
		unsigned int ifindex;
	};
J
Jason Wang 已提交
159 160
	struct list_head next;
	struct tun_struct *detached;
E
Eric W. Biederman 已提交
161 162
};

J
Jason Wang 已提交
163 164 165 166 167 168
struct tun_flow_entry {
	struct hlist_node hash_link;
	struct rcu_head rcu;
	struct tun_struct *tun;

	u32 rxhash;
169
	u32 rps_rxhash;
J
Jason Wang 已提交
170 171 172 173 174 175
	int queue_index;
	unsigned long updated;
};

#define TUN_NUM_FLOW_ENTRIES 1024

J
Jason Wang 已提交
176
/* Since the socket were moved to tun_file, to preserve the behavior of persist
R
Rami Rosen 已提交
177
 * device, socket filter, sndbuf and vnet header size were restore when the
J
Jason Wang 已提交
178 179
 * file were attached to a persist device.
 */
180
struct tun_struct {
J
Jason Wang 已提交
181 182
	struct tun_file __rcu	*tfiles[MAX_TAP_QUEUES];
	unsigned int            numqueues;
183
	unsigned int 		flags;
184 185
	kuid_t			owner;
	kgid_t			group;
186 187

	struct net_device	*dev;
188
	netdev_features_t	set_features;
189
#define TUN_USER_FEATURES (NETIF_F_HW_CSUM|NETIF_F_TSO_ECN|NETIF_F_TSO| \
190
			  NETIF_F_TSO6)
191 192

	int			vnet_hdr_sz;
J
Jason Wang 已提交
193 194 195 196 197
	int			sndbuf;
	struct tap_filter	txflt;
	struct sock_fprog	fprog;
	/* protected by rtnl lock */
	bool			filter_attached;
198 199
#ifdef TUN_DEBUG
	int debug;
L
Linus Torvalds 已提交
200
#endif
J
Jason Wang 已提交
201 202 203 204
	spinlock_t lock;
	struct hlist_head flows[TUN_NUM_FLOW_ENTRIES];
	struct timer_list flow_gc_timer;
	unsigned long ageing_time;
J
Jason Wang 已提交
205 206
	unsigned int numdisabled;
	struct list_head disabled;
207
	void *security;
208
	u32 flow_count;
209
};
L
Linus Torvalds 已提交
210

211 212
static inline u16 tun16_to_cpu(struct tun_struct *tun, __virtio16 val)
{
M
Michael S. Tsirkin 已提交
213
	return __virtio16_to_cpu(tun->flags & TUN_VNET_LE, val);
214 215 216 217
}

static inline __virtio16 cpu_to_tun16(struct tun_struct *tun, u16 val)
{
M
Michael S. Tsirkin 已提交
218
	return __cpu_to_virtio16(tun->flags & TUN_VNET_LE, val);
219 220
}

J
Jason Wang 已提交
221 222 223 224 225 226 227 228 229
static inline u32 tun_hashfn(u32 rxhash)
{
	return rxhash & 0x3ff;
}

static struct tun_flow_entry *tun_flow_find(struct hlist_head *head, u32 rxhash)
{
	struct tun_flow_entry *e;

230
	hlist_for_each_entry_rcu(e, head, hash_link) {
J
Jason Wang 已提交
231 232 233 234 235 236 237 238 239 240
		if (e->rxhash == rxhash)
			return e;
	}
	return NULL;
}

static struct tun_flow_entry *tun_flow_create(struct tun_struct *tun,
					      struct hlist_head *head,
					      u32 rxhash, u16 queue_index)
{
241 242
	struct tun_flow_entry *e = kmalloc(sizeof(*e), GFP_ATOMIC);

J
Jason Wang 已提交
243 244 245 246 247
	if (e) {
		tun_debug(KERN_INFO, tun, "create flow: hash %u index %u\n",
			  rxhash, queue_index);
		e->updated = jiffies;
		e->rxhash = rxhash;
248
		e->rps_rxhash = 0;
J
Jason Wang 已提交
249 250 251
		e->queue_index = queue_index;
		e->tun = tun;
		hlist_add_head_rcu(&e->hash_link, head);
252
		++tun->flow_count;
J
Jason Wang 已提交
253 254 255 256 257 258 259 260
	}
	return e;
}

static void tun_flow_delete(struct tun_struct *tun, struct tun_flow_entry *e)
{
	tun_debug(KERN_INFO, tun, "delete flow: hash %u index %u\n",
		  e->rxhash, e->queue_index);
261
	sock_rps_reset_flow_hash(e->rps_rxhash);
J
Jason Wang 已提交
262
	hlist_del_rcu(&e->hash_link);
263
	kfree_rcu(e, rcu);
264
	--tun->flow_count;
J
Jason Wang 已提交
265 266 267 268 269 270 271 272 273
}

static void tun_flow_flush(struct tun_struct *tun)
{
	int i;

	spin_lock_bh(&tun->lock);
	for (i = 0; i < TUN_NUM_FLOW_ENTRIES; i++) {
		struct tun_flow_entry *e;
274
		struct hlist_node *n;
J
Jason Wang 已提交
275

276
		hlist_for_each_entry_safe(e, n, &tun->flows[i], hash_link)
J
Jason Wang 已提交
277 278 279 280 281 282 283 284 285 286 287 288
			tun_flow_delete(tun, e);
	}
	spin_unlock_bh(&tun->lock);
}

static void tun_flow_delete_by_queue(struct tun_struct *tun, u16 queue_index)
{
	int i;

	spin_lock_bh(&tun->lock);
	for (i = 0; i < TUN_NUM_FLOW_ENTRIES; i++) {
		struct tun_flow_entry *e;
289
		struct hlist_node *n;
J
Jason Wang 已提交
290

291
		hlist_for_each_entry_safe(e, n, &tun->flows[i], hash_link) {
J
Jason Wang 已提交
292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311
			if (e->queue_index == queue_index)
				tun_flow_delete(tun, e);
		}
	}
	spin_unlock_bh(&tun->lock);
}

static void tun_flow_cleanup(unsigned long data)
{
	struct tun_struct *tun = (struct tun_struct *)data;
	unsigned long delay = tun->ageing_time;
	unsigned long next_timer = jiffies + delay;
	unsigned long count = 0;
	int i;

	tun_debug(KERN_INFO, tun, "tun_flow_cleanup\n");

	spin_lock_bh(&tun->lock);
	for (i = 0; i < TUN_NUM_FLOW_ENTRIES; i++) {
		struct tun_flow_entry *e;
312
		struct hlist_node *n;
J
Jason Wang 已提交
313

314
		hlist_for_each_entry_safe(e, n, &tun->flows[i], hash_link) {
J
Jason Wang 已提交
315 316 317 318 319 320 321 322 323 324 325 326 327 328 329
			unsigned long this_timer;
			count++;
			this_timer = e->updated + delay;
			if (time_before_eq(this_timer, jiffies))
				tun_flow_delete(tun, e);
			else if (time_before(this_timer, next_timer))
				next_timer = this_timer;
		}
	}

	if (count)
		mod_timer(&tun->flow_gc_timer, round_jiffies_up(next_timer));
	spin_unlock_bh(&tun->lock);
}

330
static void tun_flow_update(struct tun_struct *tun, u32 rxhash,
331
			    struct tun_file *tfile)
J
Jason Wang 已提交
332 333 334 335
{
	struct hlist_head *head;
	struct tun_flow_entry *e;
	unsigned long delay = tun->ageing_time;
336
	u16 queue_index = tfile->queue_index;
J
Jason Wang 已提交
337 338 339 340 341 342 343 344

	if (!rxhash)
		return;
	else
		head = &tun->flows[tun_hashfn(rxhash)];

	rcu_read_lock();

345 346 347
	/* We may get a very small possibility of OOO during switching, not
	 * worth to optimize.*/
	if (tun->numqueues == 1 || tfile->detached)
J
Jason Wang 已提交
348 349 350 351 352 353 354
		goto unlock;

	e = tun_flow_find(head, rxhash);
	if (likely(e)) {
		/* TODO: keep queueing to old queue until it's empty? */
		e->queue_index = queue_index;
		e->updated = jiffies;
355
		sock_rps_record_flow_hash(e->rps_rxhash);
J
Jason Wang 已提交
356 357
	} else {
		spin_lock_bh(&tun->lock);
358 359
		if (!tun_flow_find(head, rxhash) &&
		    tun->flow_count < MAX_TAP_FLOWS)
J
Jason Wang 已提交
360 361 362 363 364 365 366 367 368 369 370 371
			tun_flow_create(tun, head, rxhash, queue_index);

		if (!timer_pending(&tun->flow_gc_timer))
			mod_timer(&tun->flow_gc_timer,
				  round_jiffies_up(jiffies + delay));
		spin_unlock_bh(&tun->lock);
	}

unlock:
	rcu_read_unlock();
}

372 373 374 375 376 377 378 379 380 381 382 383
/**
 * Save the hash received in the stack receive path and update the
 * flow_hash table accordingly.
 */
static inline void tun_flow_save_rps_rxhash(struct tun_flow_entry *e, u32 hash)
{
	if (unlikely(e->rps_rxhash != hash)) {
		sock_rps_reset_flow_hash(e->rps_rxhash);
		e->rps_rxhash = hash;
	}
}

J
Jason Wang 已提交
384
/* We try to identify a flow through its rxhash first. The reason that
S
stephen hemminger 已提交
385
 * we do not check rxq no. is because some cards(e.g 82599), chooses
J
Jason Wang 已提交
386 387 388 389 390
 * the rxq based on the txq where the last packet of the flow comes. As
 * the userspace application move between processors, we may get a
 * different rxq no. here. If we could not get rxhash, then we would
 * hope the rxq no. may help here.
 */
391
static u16 tun_select_queue(struct net_device *dev, struct sk_buff *skb,
392
			    void *accel_priv, select_queue_fallback_t fallback)
J
Jason Wang 已提交
393 394
{
	struct tun_struct *tun = netdev_priv(dev);
J
Jason Wang 已提交
395
	struct tun_flow_entry *e;
J
Jason Wang 已提交
396 397 398 399
	u32 txq = 0;
	u32 numqueues = 0;

	rcu_read_lock();
400
	numqueues = ACCESS_ONCE(tun->numqueues);
J
Jason Wang 已提交
401

402
	txq = skb_get_hash(skb);
J
Jason Wang 已提交
403
	if (txq) {
J
Jason Wang 已提交
404
		e = tun_flow_find(&tun->flows[tun_hashfn(txq)], txq);
405 406
		if (e) {
			tun_flow_save_rps_rxhash(e, txq);
407
			txq = e->queue_index;
408
		} else
J
Jason Wang 已提交
409 410
			/* use multiply and shift instead of expensive divide */
			txq = ((u64)txq * numqueues) >> 32;
J
Jason Wang 已提交
411 412 413 414 415 416 417 418 419 420
	} else if (likely(skb_rx_queue_recorded(skb))) {
		txq = skb_get_rx_queue(skb);
		while (unlikely(txq >= numqueues))
			txq -= numqueues;
	}

	rcu_read_unlock();
	return txq;
}

421 422 423
static inline bool tun_not_capable(struct tun_struct *tun)
{
	const struct cred *cred = current_cred();
424
	struct net *net = dev_net(tun->dev);
425 426 427

	return ((uid_valid(tun->owner) && !uid_eq(cred->euid, tun->owner)) ||
		  (gid_valid(tun->group) && !in_egroup_p(tun->group))) &&
428
		!ns_capable(net->user_ns, CAP_NET_ADMIN);
429 430
}

J
Jason Wang 已提交
431 432 433 434 435 436
static void tun_set_real_num_queues(struct tun_struct *tun)
{
	netif_set_real_num_tx_queues(tun->dev, tun->numqueues);
	netif_set_real_num_rx_queues(tun->dev, tun->numqueues);
}

J
Jason Wang 已提交
437 438 439 440 441 442 443
static void tun_disable_queue(struct tun_struct *tun, struct tun_file *tfile)
{
	tfile->detached = tun;
	list_add_tail(&tfile->next, &tun->disabled);
	++tun->numdisabled;
}

J
Jason Wang 已提交
444
static struct tun_struct *tun_enable_queue(struct tun_file *tfile)
J
Jason Wang 已提交
445 446 447 448 449 450 451 452 453
{
	struct tun_struct *tun = tfile->detached;

	tfile->detached = NULL;
	list_del_init(&tfile->next);
	--tun->numdisabled;
	return tun;
}

454 455 456 457 458 459
static void tun_queue_purge(struct tun_file *tfile)
{
	skb_queue_purge(&tfile->sk.sk_receive_queue);
	skb_queue_purge(&tfile->sk.sk_error_queue);
}

J
Jason Wang 已提交
460 461 462 463 464
static void __tun_detach(struct tun_file *tfile, bool clean)
{
	struct tun_file *ntfile;
	struct tun_struct *tun;

465 466
	tun = rtnl_dereference(tfile->tun);

467
	if (tun && !tfile->detached) {
J
Jason Wang 已提交
468 469 470 471 472
		u16 index = tfile->queue_index;
		BUG_ON(index >= tun->numqueues);

		rcu_assign_pointer(tun->tfiles[index],
				   tun->tfiles[tun->numqueues - 1]);
473
		ntfile = rtnl_dereference(tun->tfiles[index]);
J
Jason Wang 已提交
474 475 476
		ntfile->queue_index = index;

		--tun->numqueues;
477
		if (clean) {
478
			RCU_INIT_POINTER(tfile->tun, NULL);
J
Jason Wang 已提交
479
			sock_put(&tfile->sk);
480
		} else
J
Jason Wang 已提交
481
			tun_disable_queue(tun, tfile);
J
Jason Wang 已提交
482 483

		synchronize_net();
J
Jason Wang 已提交
484
		tun_flow_delete_by_queue(tun, tun->numqueues + 1);
J
Jason Wang 已提交
485
		/* Drop read queue */
486
		tun_queue_purge(tfile);
J
Jason Wang 已提交
487
		tun_set_real_num_queues(tun);
J
Jason Wang 已提交
488
	} else if (tfile->detached && clean) {
J
Jason Wang 已提交
489
		tun = tun_enable_queue(tfile);
J
Jason Wang 已提交
490 491
		sock_put(&tfile->sk);
	}
J
Jason Wang 已提交
492 493

	if (clean) {
494 495 496
		if (tun && tun->numqueues == 0 && tun->numdisabled == 0) {
			netif_carrier_off(tun->dev);

M
Michael S. Tsirkin 已提交
497
			if (!(tun->flags & IFF_PERSIST) &&
498
			    tun->dev->reg_state == NETREG_REGISTERED)
J
Jason Wang 已提交
499
				unregister_netdevice(tun->dev);
500
		}
J
Jason Wang 已提交
501

J
Jason Wang 已提交
502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517
		BUG_ON(!test_bit(SOCK_EXTERNALLY_ALLOCATED,
				 &tfile->socket.flags));
		sk_release_kernel(&tfile->sk);
	}
}

static void tun_detach(struct tun_file *tfile, bool clean)
{
	rtnl_lock();
	__tun_detach(tfile, clean);
	rtnl_unlock();
}

static void tun_detach_all(struct net_device *dev)
{
	struct tun_struct *tun = netdev_priv(dev);
J
Jason Wang 已提交
518
	struct tun_file *tfile, *tmp;
J
Jason Wang 已提交
519 520 521
	int i, n = tun->numqueues;

	for (i = 0; i < n; i++) {
522
		tfile = rtnl_dereference(tun->tfiles[i]);
J
Jason Wang 已提交
523
		BUG_ON(!tfile);
524
		tfile->socket.sk->sk_data_ready(tfile->socket.sk);
525
		RCU_INIT_POINTER(tfile->tun, NULL);
J
Jason Wang 已提交
526 527
		--tun->numqueues;
	}
528
	list_for_each_entry(tfile, &tun->disabled, next) {
529
		tfile->socket.sk->sk_data_ready(tfile->socket.sk);
530
		RCU_INIT_POINTER(tfile->tun, NULL);
531
	}
J
Jason Wang 已提交
532 533 534 535
	BUG_ON(tun->numqueues != 0);

	synchronize_net();
	for (i = 0; i < n; i++) {
536
		tfile = rtnl_dereference(tun->tfiles[i]);
J
Jason Wang 已提交
537
		/* Drop read queue */
538
		tun_queue_purge(tfile);
J
Jason Wang 已提交
539 540
		sock_put(&tfile->sk);
	}
J
Jason Wang 已提交
541 542
	list_for_each_entry_safe(tfile, tmp, &tun->disabled, next) {
		tun_enable_queue(tfile);
543
		tun_queue_purge(tfile);
J
Jason Wang 已提交
544 545 546
		sock_put(&tfile->sk);
	}
	BUG_ON(tun->numdisabled != 0);
J
Jason Wang 已提交
547

M
Michael S. Tsirkin 已提交
548
	if (tun->flags & IFF_PERSIST)
J
Jason Wang 已提交
549
		module_put(THIS_MODULE);
J
Jason Wang 已提交
550 551
}

552
static int tun_attach(struct tun_struct *tun, struct file *file, bool skip_filter)
553
{
E
Eric W. Biederman 已提交
554
	struct tun_file *tfile = file->private_data;
555
	int err;
556

557 558 559 560
	err = security_tun_dev_attach(tfile->socket.sk, tun->security);
	if (err < 0)
		goto out;

561
	err = -EINVAL;
562
	if (rtnl_dereference(tfile->tun) && !tfile->detached)
563 564 565
		goto out;

	err = -EBUSY;
M
Michael S. Tsirkin 已提交
566
	if (!(tun->flags & IFF_MULTI_QUEUE) && tun->numqueues == 1)
J
Jason Wang 已提交
567 568 569
		goto out;

	err = -E2BIG;
J
Jason Wang 已提交
570 571
	if (!tfile->detached &&
	    tun->numqueues + tun->numdisabled == MAX_TAP_QUEUES)
572 573 574
		goto out;

	err = 0;
J
Jason Wang 已提交
575

S
stephen hemminger 已提交
576
	/* Re-attach the filter to persist device */
577
	if (!skip_filter && (tun->filter_attached == true)) {
J
Jason Wang 已提交
578 579 580 581
		err = sk_attach_filter(&tun->fprog, tfile->socket.sk);
		if (!err)
			goto out;
	}
J
Jason Wang 已提交
582
	tfile->queue_index = tun->numqueues;
583
	rcu_assign_pointer(tfile->tun, tun);
J
Jason Wang 已提交
584 585
	rcu_assign_pointer(tun->tfiles[tun->numqueues], tfile);
	tun->numqueues++;
586

J
Jason Wang 已提交
587 588 589 590 591
	if (tfile->detached)
		tun_enable_queue(tfile);
	else
		sock_hold(&tfile->sk);

J
Jason Wang 已提交
592
	tun_set_real_num_queues(tun);
593

J
Jason Wang 已提交
594 595 596 597 598 599
	/* device is allowed to go away first, so no need to hold extra
	 * refcnt.
	 */

out:
	return err;
E
Eric W. Biederman 已提交
600 601 602 603
}

static struct tun_struct *__tun_get(struct tun_file *tfile)
{
604
	struct tun_struct *tun;
605

606 607 608 609 610
	rcu_read_lock();
	tun = rcu_dereference(tfile->tun);
	if (tun)
		dev_hold(tun->dev);
	rcu_read_unlock();
611 612

	return tun;
E
Eric W. Biederman 已提交
613 614 615 616 617 618 619 620 621
}

static struct tun_struct *tun_get(struct file *file)
{
	return __tun_get(file->private_data);
}

static void tun_put(struct tun_struct *tun)
{
622
	dev_put(tun->dev);
E
Eric W. Biederman 已提交
623 624
}

625
/* TAP filtering */
626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674
static void addr_hash_set(u32 *mask, const u8 *addr)
{
	int n = ether_crc(ETH_ALEN, addr) >> 26;
	mask[n >> 5] |= (1 << (n & 31));
}

static unsigned int addr_hash_test(const u32 *mask, const u8 *addr)
{
	int n = ether_crc(ETH_ALEN, addr) >> 26;
	return mask[n >> 5] & (1 << (n & 31));
}

static int update_filter(struct tap_filter *filter, void __user *arg)
{
	struct { u8 u[ETH_ALEN]; } *addr;
	struct tun_filter uf;
	int err, alen, n, nexact;

	if (copy_from_user(&uf, arg, sizeof(uf)))
		return -EFAULT;

	if (!uf.count) {
		/* Disabled */
		filter->count = 0;
		return 0;
	}

	alen = ETH_ALEN * uf.count;
	addr = kmalloc(alen, GFP_KERNEL);
	if (!addr)
		return -ENOMEM;

	if (copy_from_user(addr, arg + sizeof(uf), alen)) {
		err = -EFAULT;
		goto done;
	}

	/* The filter is updated without holding any locks. Which is
	 * perfectly safe. We disable it first and in the worst
	 * case we'll accept a few undesired packets. */
	filter->count = 0;
	wmb();

	/* Use first set of addresses as an exact filter */
	for (n = 0; n < uf.count && n < FLT_EXACT_COUNT; n++)
		memcpy(filter->addr[n], addr[n].u, ETH_ALEN);

	nexact = n;

675 676
	/* Remaining multicast addresses are hashed,
	 * unicast will leave the filter disabled. */
677
	memset(filter->mask, 0, sizeof(filter->mask));
678 679 680 681 682
	for (; n < uf.count; n++) {
		if (!is_multicast_ether_addr(addr[n].u)) {
			err = 0; /* no filter */
			goto done;
		}
683
		addr_hash_set(filter->mask, addr[n].u);
684
	}
685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712

	/* For ALLMULTI just set the mask to all ones.
	 * This overrides the mask populated above. */
	if ((uf.flags & TUN_FLT_ALLMULTI))
		memset(filter->mask, ~0, sizeof(filter->mask));

	/* Now enable the filter */
	wmb();
	filter->count = nexact;

	/* Return the number of exact filters */
	err = nexact;

done:
	kfree(addr);
	return err;
}

/* Returns: 0 - drop, !=0 - accept */
static int run_filter(struct tap_filter *filter, const struct sk_buff *skb)
{
	/* Cannot use eth_hdr(skb) here because skb_mac_hdr() is incorrect
	 * at this point. */
	struct ethhdr *eh = (struct ethhdr *) skb->data;
	int i;

	/* Exact match */
	for (i = 0; i < filter->count; i++)
713
		if (ether_addr_equal(eh->h_dest, filter->addr[i]))
714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734
			return 1;

	/* Inexact match (multicast only) */
	if (is_multicast_ether_addr(eh->h_dest))
		return addr_hash_test(filter->mask, eh->h_dest);

	return 0;
}

/*
 * Checks whether the packet is accepted or not.
 * Returns: 0 - drop, !=0 - accept
 */
static int check_filter(struct tap_filter *filter, const struct sk_buff *skb)
{
	if (!filter->count)
		return 1;

	return run_filter(filter, skb);
}

L
Linus Torvalds 已提交
735 736
/* Network device part of the driver */

737
static const struct ethtool_ops tun_ethtool_ops;
L
Linus Torvalds 已提交
738

739 740 741
/* Net device detach from fd. */
static void tun_net_uninit(struct net_device *dev)
{
J
Jason Wang 已提交
742
	tun_detach_all(dev);
743 744
}

L
Linus Torvalds 已提交
745 746 747
/* Net device open. */
static int tun_net_open(struct net_device *dev)
{
J
Jason Wang 已提交
748
	netif_tx_start_all_queues(dev);
L
Linus Torvalds 已提交
749 750 751 752 753 754
	return 0;
}

/* Net device close. */
static int tun_net_close(struct net_device *dev)
{
J
Jason Wang 已提交
755
	netif_tx_stop_all_queues(dev);
L
Linus Torvalds 已提交
756 757 758 759
	return 0;
}

/* Net device start xmit */
760
static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
L
Linus Torvalds 已提交
761 762
{
	struct tun_struct *tun = netdev_priv(dev);
J
Jason Wang 已提交
763
	int txq = skb->queue_mapping;
764
	struct tun_file *tfile;
765
	u32 numqueues = 0;
L
Linus Torvalds 已提交
766

767
	rcu_read_lock();
J
Jason Wang 已提交
768
	tfile = rcu_dereference(tun->tfiles[txq]);
769
	numqueues = ACCESS_ONCE(tun->numqueues);
J
Jason Wang 已提交
770

L
Linus Torvalds 已提交
771
	/* Drop packet if interface is not attached */
772
	if (txq >= numqueues)
L
Linus Torvalds 已提交
773 774
		goto drop;

775
	if (numqueues == 1) {
776 777 778 779 780 781 782 783 784 785 786 787 788 789 790
		/* Select queue was not called for the skbuff, so we extract the
		 * RPS hash and save it into the flow_table here.
		 */
		__u32 rxhash;

		rxhash = skb_get_hash(skb);
		if (rxhash) {
			struct tun_flow_entry *e;
			e = tun_flow_find(&tun->flows[tun_hashfn(rxhash)],
					rxhash);
			if (e)
				tun_flow_save_rps_rxhash(e, rxhash);
		}
	}

791 792
	tun_debug(KERN_INFO, tun, "tun_net_xmit %d\n", skb->len);

J
Jason Wang 已提交
793 794
	BUG_ON(!tfile);

795 796 797 798 799 800
	/* Drop if the filter does not like it.
	 * This is a noop if the filter is disabled.
	 * Filter can be enabled only for the TAP devices. */
	if (!check_filter(&tun->txflt, skb))
		goto drop;

J
Jason Wang 已提交
801 802
	if (tfile->socket.sk->sk_filter &&
	    sk_filter(tfile->socket.sk, skb))
M
Michael S. Tsirkin 已提交
803 804
		goto drop;

R
Rami Rosen 已提交
805
	/* Limit the number of packets queued by dividing txq length with the
J
Jason Wang 已提交
806 807
	 * number of queues.
	 */
808 809
	if (skb_queue_len(&tfile->socket.sk->sk_receive_queue) * numqueues
			  >= dev->tx_queue_len)
810
		goto drop;
L
Linus Torvalds 已提交
811

812 813 814
	if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
		goto drop;

815 816 817 818 819
	if (skb->sk) {
		sock_tx_timestamp(skb->sk, &skb_shinfo(skb)->tx_flags);
		sw_tx_timestamp(skb);
	}

M
Michael S. Tsirkin 已提交
820
	/* Orphan the skb - required as we might hang on to it
821 822
	 * for indefinite time.
	 */
M
Michael S. Tsirkin 已提交
823 824
	skb_orphan(skb);

825 826
	nf_reset(skb);

827
	/* Enqueue packet */
J
Jason Wang 已提交
828
	skb_queue_tail(&tfile->socket.sk->sk_receive_queue, skb);
L
Linus Torvalds 已提交
829 830

	/* Notify and wake up reader process */
J
Jason Wang 已提交
831 832
	if (tfile->flags & TUN_FASYNC)
		kill_fasync(&tfile->fasync, SIGIO, POLL_IN);
833
	tfile->socket.sk->sk_data_ready(tfile->socket.sk);
834 835

	rcu_read_unlock();
836
	return NETDEV_TX_OK;
L
Linus Torvalds 已提交
837 838

drop:
839
	dev->stats.tx_dropped++;
840
	skb_tx_error(skb);
L
Linus Torvalds 已提交
841
	kfree_skb(skb);
842
	rcu_read_unlock();
843
	return NET_XMIT_DROP;
L
Linus Torvalds 已提交
844 845
}

846
static void tun_net_mclist(struct net_device *dev)
L
Linus Torvalds 已提交
847
{
848 849 850 851 852
	/*
	 * This callback is supposed to deal with mc filter in
	 * _rx_ path and has nothing to do with the _tx_ path.
	 * In rx path we always accept everything userspace gives us.
	 */
L
Linus Torvalds 已提交
853 854
}

E
Ed Swierk 已提交
855 856 857 858 859 860 861 862 863 864 865 866
#define MIN_MTU 68
#define MAX_MTU 65535

static int
tun_net_change_mtu(struct net_device *dev, int new_mtu)
{
	if (new_mtu < MIN_MTU || new_mtu + dev->hard_header_len > MAX_MTU)
		return -EINVAL;
	dev->mtu = new_mtu;
	return 0;
}

867 868
static netdev_features_t tun_net_fix_features(struct net_device *dev,
	netdev_features_t features)
869 870 871 872 873
{
	struct tun_struct *tun = netdev_priv(dev);

	return (features & tun->set_features) | (features & ~TUN_USER_FEATURES);
}
874 875 876 877 878 879 880
#ifdef CONFIG_NET_POLL_CONTROLLER
static void tun_poll_controller(struct net_device *dev)
{
	/*
	 * Tun only receives frames when:
	 * 1) the char device endpoint gets data from user space
	 * 2) the tun socket gets a sendmsg call from user space
S
stephen hemminger 已提交
881
	 * Since both of those are synchronous operations, we are guaranteed
882
	 * never to have pending data when we poll for it
S
stephen hemminger 已提交
883
	 * so there is nothing to do here but return.
884 885 886 887 888 889 890
	 * We need this though so netpoll recognizes us as an interface that
	 * supports polling, which enables bridge devices in virt setups to
	 * still use netconsole
	 */
	return;
}
#endif
S
Stephen Hemminger 已提交
891
static const struct net_device_ops tun_netdev_ops = {
892
	.ndo_uninit		= tun_net_uninit,
S
Stephen Hemminger 已提交
893 894
	.ndo_open		= tun_net_open,
	.ndo_stop		= tun_net_close,
895
	.ndo_start_xmit		= tun_net_xmit,
S
Stephen Hemminger 已提交
896
	.ndo_change_mtu		= tun_net_change_mtu,
897
	.ndo_fix_features	= tun_net_fix_features,
J
Jason Wang 已提交
898
	.ndo_select_queue	= tun_select_queue,
899 900 901
#ifdef CONFIG_NET_POLL_CONTROLLER
	.ndo_poll_controller	= tun_poll_controller,
#endif
S
Stephen Hemminger 已提交
902 903 904
};

static const struct net_device_ops tap_netdev_ops = {
905
	.ndo_uninit		= tun_net_uninit,
S
Stephen Hemminger 已提交
906 907
	.ndo_open		= tun_net_open,
	.ndo_stop		= tun_net_close,
908
	.ndo_start_xmit		= tun_net_xmit,
S
Stephen Hemminger 已提交
909
	.ndo_change_mtu		= tun_net_change_mtu,
910
	.ndo_fix_features	= tun_net_fix_features,
911
	.ndo_set_rx_mode	= tun_net_mclist,
S
Stephen Hemminger 已提交
912 913
	.ndo_set_mac_address	= eth_mac_addr,
	.ndo_validate_addr	= eth_validate_addr,
J
Jason Wang 已提交
914
	.ndo_select_queue	= tun_select_queue,
915 916 917
#ifdef CONFIG_NET_POLL_CONTROLLER
	.ndo_poll_controller	= tun_poll_controller,
#endif
S
Stephen Hemminger 已提交
918 919
};

920
static void tun_flow_init(struct tun_struct *tun)
J
Jason Wang 已提交
921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938
{
	int i;

	for (i = 0; i < TUN_NUM_FLOW_ENTRIES; i++)
		INIT_HLIST_HEAD(&tun->flows[i]);

	tun->ageing_time = TUN_FLOW_EXPIRE;
	setup_timer(&tun->flow_gc_timer, tun_flow_cleanup, (unsigned long)tun);
	mod_timer(&tun->flow_gc_timer,
		  round_jiffies_up(jiffies + tun->ageing_time));
}

static void tun_flow_uninit(struct tun_struct *tun)
{
	del_timer_sync(&tun->flow_gc_timer);
	tun_flow_flush(tun);
}

L
Linus Torvalds 已提交
939 940 941 942
/* Initialize net device. */
static void tun_net_init(struct net_device *dev)
{
	struct tun_struct *tun = netdev_priv(dev);
943

L
Linus Torvalds 已提交
944
	switch (tun->flags & TUN_TYPE_MASK) {
M
Michael S. Tsirkin 已提交
945
	case IFF_TUN:
S
Stephen Hemminger 已提交
946 947
		dev->netdev_ops = &tun_netdev_ops;

L
Linus Torvalds 已提交
948 949 950 951 952 953
		/* Point-to-Point TUN Device */
		dev->hard_header_len = 0;
		dev->addr_len = 0;
		dev->mtu = 1500;

		/* Zero header length */
954
		dev->type = ARPHRD_NONE;
L
Linus Torvalds 已提交
955 956 957 958
		dev->flags = IFF_POINTOPOINT | IFF_NOARP | IFF_MULTICAST;
		dev->tx_queue_len = TUN_READQ_SIZE;  /* We prefer our own queue length */
		break;

M
Michael S. Tsirkin 已提交
959
	case IFF_TAP:
K
Kusanagi Kouichi 已提交
960
		dev->netdev_ops = &tap_netdev_ops;
L
Linus Torvalds 已提交
961 962
		/* Ethernet TAP Device */
		ether_setup(dev);
963
		dev->priv_flags &= ~IFF_TX_SKB_SHARING;
964
		dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
965

966
		eth_hw_addr_random(dev);
967

L
Linus Torvalds 已提交
968 969 970 971 972 973 974 975
		dev->tx_queue_len = TUN_READQ_SIZE;  /* We prefer our own queue length */
		break;
	}
}

/* Character device part */

/* Poll */
J
Jason Wang 已提交
976
static unsigned int tun_chr_poll(struct file *file, poll_table *wait)
977
{
978 979
	struct tun_file *tfile = file->private_data;
	struct tun_struct *tun = __tun_get(tfile);
980
	struct sock *sk;
981
	unsigned int mask = 0;
L
Linus Torvalds 已提交
982 983

	if (!tun)
984
		return POLLERR;
L
Linus Torvalds 已提交
985

J
Jason Wang 已提交
986
	sk = tfile->socket.sk;
987

988
	tun_debug(KERN_INFO, tun, "tun_chr_poll\n");
L
Linus Torvalds 已提交
989

990
	poll_wait(file, sk_sleep(sk), wait);
991

992
	if (!skb_queue_empty(&sk->sk_receive_queue))
L
Linus Torvalds 已提交
993 994
		mask |= POLLIN | POLLRDNORM;

995 996 997 998 999
	if (sock_writeable(sk) ||
	    (!test_and_set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags) &&
	     sock_writeable(sk)))
		mask |= POLLOUT | POLLWRNORM;

1000 1001 1002
	if (tun->dev->reg_state != NETREG_REGISTERED)
		mask = POLLERR;

E
Eric W. Biederman 已提交
1003
	tun_put(tun);
L
Linus Torvalds 已提交
1004 1005 1006
	return mask;
}

1007 1008
/* prepad is the amount to reserve at front.  len is length after that.
 * linear is a hint as to how much to copy (usually headers). */
J
Jason Wang 已提交
1009
static struct sk_buff *tun_alloc_skb(struct tun_file *tfile,
1010 1011
				     size_t prepad, size_t len,
				     size_t linear, int noblock)
1012
{
J
Jason Wang 已提交
1013
	struct sock *sk = tfile->socket.sk;
1014
	struct sk_buff *skb;
1015
	int err;
1016 1017

	/* Under a page?  Don't bother with paged skb. */
H
Herbert Xu 已提交
1018
	if (prepad + len < PAGE_SIZE || !linear)
1019
		linear = len;
1020

1021
	skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock,
1022
				   &err, 0);
1023
	if (!skb)
1024
		return ERR_PTR(err);
1025 1026 1027

	skb_reserve(skb, prepad);
	skb_put(skb, linear);
1028 1029
	skb->data_len = len - linear;
	skb->len += len - linear;
1030 1031 1032 1033

	return skb;
}

L
Linus Torvalds 已提交
1034
/* Get packet from user space buffer */
J
Jason Wang 已提交
1035
static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
1036 1037
			    void *msg_control, struct iov_iter *from,
			    int noblock)
L
Linus Torvalds 已提交
1038
{
1039
	struct tun_pi pi = { 0, cpu_to_be16(ETH_P_IP) };
L
Linus Torvalds 已提交
1040
	struct sk_buff *skb;
1041
	size_t total_len = iov_iter_count(from);
1042
	size_t len = total_len, align = NET_SKB_PAD, linear;
1043
	struct virtio_net_hdr gso = { 0 };
1044
	int good_linear;
1045 1046 1047
	int copylen;
	bool zerocopy = false;
	int err;
1048
	u32 rxhash;
1049
	ssize_t n;
L
Linus Torvalds 已提交
1050

M
Michael S. Tsirkin 已提交
1051
	if (!(tun->flags & IFF_NO_PI)) {
1052
		if (len < sizeof(pi))
L
Linus Torvalds 已提交
1053
			return -EINVAL;
1054
		len -= sizeof(pi);
L
Linus Torvalds 已提交
1055

1056 1057
		n = copy_from_iter(&pi, sizeof(pi), from);
		if (n != sizeof(pi))
L
Linus Torvalds 已提交
1058 1059 1060
			return -EFAULT;
	}

M
Michael S. Tsirkin 已提交
1061
	if (tun->flags & IFF_VNET_HDR) {
1062
		if (len < tun->vnet_hdr_sz)
1063
			return -EINVAL;
1064
		len -= tun->vnet_hdr_sz;
1065

1066 1067
		n = copy_from_iter(&gso, sizeof(gso), from);
		if (n != sizeof(gso))
1068 1069
			return -EFAULT;

1070
		if ((gso.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
1071 1072
		    tun16_to_cpu(tun, gso.csum_start) + tun16_to_cpu(tun, gso.csum_offset) + 2 > tun16_to_cpu(tun, gso.hdr_len))
			gso.hdr_len = cpu_to_tun16(tun, tun16_to_cpu(tun, gso.csum_start) + tun16_to_cpu(tun, gso.csum_offset) + 2);
1073

1074
		if (tun16_to_cpu(tun, gso.hdr_len) > len)
1075
			return -EINVAL;
1076
		iov_iter_advance(from, tun->vnet_hdr_sz - sizeof(gso));
1077 1078
	}

M
Michael S. Tsirkin 已提交
1079
	if ((tun->flags & TUN_TYPE_MASK) == IFF_TAP) {
1080
		align += NET_IP_ALIGN;
H
Herbert Xu 已提交
1081
		if (unlikely(len < ETH_HLEN ||
1082
			     (gso.hdr_len && tun16_to_cpu(tun, gso.hdr_len) < ETH_HLEN)))
1083 1084
			return -EINVAL;
	}
1085

1086 1087
	good_linear = SKB_MAX_HEAD(align);

1088
	if (msg_control) {
1089 1090
		struct iov_iter i = *from;

1091 1092
		/* There are 256 bytes to be copied in skb, so there is
		 * enough room for skb expand head in case it is used.
1093 1094
		 * The rest of the buffer is mapped from userspace.
		 */
1095
		copylen = gso.hdr_len ? tun16_to_cpu(tun, gso.hdr_len) : GOODCOPY_LEN;
1096 1097
		if (copylen > good_linear)
			copylen = good_linear;
1098
		linear = copylen;
1099 1100
		iov_iter_advance(&i, copylen);
		if (iov_iter_npages(&i, INT_MAX) <= MAX_SKB_FRAGS)
1101 1102 1103 1104
			zerocopy = true;
	}

	if (!zerocopy) {
1105
		copylen = len;
1106
		if (tun16_to_cpu(tun, gso.hdr_len) > good_linear)
1107 1108
			linear = good_linear;
		else
1109
			linear = tun16_to_cpu(tun, gso.hdr_len);
1110
	}
1111

1112
	skb = tun_alloc_skb(tfile, align, copylen, linear, noblock);
1113 1114 1115 1116
	if (IS_ERR(skb)) {
		if (PTR_ERR(skb) != -EAGAIN)
			tun->dev->stats.rx_dropped++;
		return PTR_ERR(skb);
L
Linus Torvalds 已提交
1117 1118
	}

1119
	if (zerocopy)
1120
		err = zerocopy_sg_from_iter(skb, from);
1121
	else {
1122
		err = skb_copy_datagram_from_iter(skb, 0, from, len);
1123 1124 1125 1126 1127
		if (!err && msg_control) {
			struct ubuf_info *uarg = msg_control;
			uarg->callback(uarg, false);
		}
	}
1128 1129

	if (err) {
1130
		tun->dev->stats.rx_dropped++;
D
Dave Jones 已提交
1131
		kfree_skb(skb);
L
Linus Torvalds 已提交
1132
		return -EFAULT;
D
Dave Jones 已提交
1133
	}
L
Linus Torvalds 已提交
1134

1135
	if (gso.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
1136 1137
		if (!skb_partial_csum_set(skb, tun16_to_cpu(tun, gso.csum_start),
					  tun16_to_cpu(tun, gso.csum_offset))) {
1138 1139 1140 1141
			tun->dev->stats.rx_frame_errors++;
			kfree_skb(skb);
			return -EINVAL;
		}
1142
	}
1143

L
Linus Torvalds 已提交
1144
	switch (tun->flags & TUN_TYPE_MASK) {
M
Michael S. Tsirkin 已提交
1145 1146
	case IFF_TUN:
		if (tun->flags & IFF_NO_PI) {
1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160
			switch (skb->data[0] & 0xf0) {
			case 0x40:
				pi.proto = htons(ETH_P_IP);
				break;
			case 0x60:
				pi.proto = htons(ETH_P_IPV6);
				break;
			default:
				tun->dev->stats.rx_dropped++;
				kfree_skb(skb);
				return -EINVAL;
			}
		}

1161
		skb_reset_mac_header(skb);
L
Linus Torvalds 已提交
1162
		skb->protocol = pi.proto;
1163
		skb->dev = tun->dev;
L
Linus Torvalds 已提交
1164
		break;
M
Michael S. Tsirkin 已提交
1165
	case IFF_TAP:
L
Linus Torvalds 已提交
1166 1167
		skb->protocol = eth_type_trans(skb, tun->dev);
		break;
1168
	}
L
Linus Torvalds 已提交
1169

1170 1171
	skb_reset_network_header(skb);

1172 1173 1174 1175
	if (gso.gso_type != VIRTIO_NET_HDR_GSO_NONE) {
		pr_debug("GSO!\n");
		switch (gso.gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
		case VIRTIO_NET_HDR_GSO_TCPV4:
1176
			skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;
1177 1178
			break;
		case VIRTIO_NET_HDR_GSO_TCPV6:
1179
			skb_shinfo(skb)->gso_type = SKB_GSO_TCPV6;
1180
			break;
1181
		case VIRTIO_NET_HDR_GSO_UDP:
1182 1183 1184 1185 1186 1187 1188 1189 1190
		{
			static bool warned;

			if (!warned) {
				warned = true;
				netdev_warn(tun->dev,
					    "%s: using disabled UFO feature; please fix this program\n",
					    current->comm);
			}
1191
			skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
1192 1193
			if (skb->protocol == htons(ETH_P_IPV6))
				ipv6_proxy_select_ident(skb);
1194
			break;
1195
		}
1196 1197 1198 1199 1200 1201 1202
		default:
			tun->dev->stats.rx_frame_errors++;
			kfree_skb(skb);
			return -EINVAL;
		}

		if (gso.gso_type & VIRTIO_NET_HDR_GSO_ECN)
1203
			skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN;
1204

1205
		skb_shinfo(skb)->gso_size = tun16_to_cpu(tun, gso.gso_size);
1206 1207 1208 1209 1210 1211 1212 1213 1214 1215
		if (skb_shinfo(skb)->gso_size == 0) {
			tun->dev->stats.rx_frame_errors++;
			kfree_skb(skb);
			return -EINVAL;
		}

		/* Header must be checked, and gso_segs computed. */
		skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
		skb_shinfo(skb)->gso_segs = 0;
	}
1216

1217 1218 1219 1220
	/* copy skb_ubuf_info for callback when skb has no error */
	if (zerocopy) {
		skb_shinfo(skb)->destructor_arg = msg_control;
		skb_shinfo(skb)->tx_flags |= SKBTX_DEV_ZEROCOPY;
1221
		skb_shinfo(skb)->tx_flags |= SKBTX_SHARED_FRAG;
1222 1223
	}

1224
	skb_probe_transport_header(skb, 0);
1225

1226
	rxhash = skb_get_hash(skb);
L
Linus Torvalds 已提交
1227
	netif_rx_ni(skb);
1228

1229 1230
	tun->dev->stats.rx_packets++;
	tun->dev->stats.rx_bytes += len;
L
Linus Torvalds 已提交
1231

1232
	tun_flow_update(tun, rxhash, tfile);
1233
	return total_len;
1234
}
L
Linus Torvalds 已提交
1235

1236
static ssize_t tun_chr_write_iter(struct kiocb *iocb, struct iov_iter *from)
L
Linus Torvalds 已提交
1237
{
1238
	struct file *file = iocb->ki_filp;
H
Herbert Xu 已提交
1239
	struct tun_struct *tun = tun_get(file);
J
Jason Wang 已提交
1240
	struct tun_file *tfile = file->private_data;
E
Eric W. Biederman 已提交
1241
	ssize_t result;
L
Linus Torvalds 已提交
1242 1243 1244 1245

	if (!tun)
		return -EBADFD;

1246
	result = tun_get_user(tun, tfile, NULL, from, file->f_flags & O_NONBLOCK);
E
Eric W. Biederman 已提交
1247 1248 1249

	tun_put(tun);
	return result;
L
Linus Torvalds 已提交
1250 1251 1252
}

/* Put packet to the user space buffer */
1253
static ssize_t tun_put_user(struct tun_struct *tun,
J
Jason Wang 已提交
1254
			    struct tun_file *tfile,
1255
			    struct sk_buff *skb,
H
Herbert Xu 已提交
1256
			    struct iov_iter *iter)
L
Linus Torvalds 已提交
1257 1258
{
	struct tun_pi pi = { 0, skb->protocol };
H
Herbert Xu 已提交
1259
	ssize_t total;
1260
	int vlan_offset = 0;
1261
	int vlan_hlen = 0;
H
Herbert Xu 已提交
1262
	int vnet_hdr_sz = 0;
1263 1264 1265

	if (vlan_tx_tag_present(skb))
		vlan_hlen = VLAN_HLEN;
L
Linus Torvalds 已提交
1266

M
Michael S. Tsirkin 已提交
1267
	if (tun->flags & IFF_VNET_HDR)
H
Herbert Xu 已提交
1268
		vnet_hdr_sz = tun->vnet_hdr_sz;
L
Linus Torvalds 已提交
1269

H
Herbert Xu 已提交
1270 1271
	total = skb->len + vlan_hlen + vnet_hdr_sz;

M
Michael S. Tsirkin 已提交
1272
	if (!(tun->flags & IFF_NO_PI)) {
H
Herbert Xu 已提交
1273
		if (iov_iter_count(iter) < sizeof(pi))
L
Linus Torvalds 已提交
1274 1275
			return -EINVAL;

H
Herbert Xu 已提交
1276 1277
		total += sizeof(pi);
		if (iov_iter_count(iter) < total) {
L
Linus Torvalds 已提交
1278 1279 1280
			/* Packet will be striped */
			pi.flags |= TUN_PKT_STRIP;
		}
1281

H
Herbert Xu 已提交
1282
		if (copy_to_iter(&pi, sizeof(pi), iter) != sizeof(pi))
L
Linus Torvalds 已提交
1283
			return -EFAULT;
1284
	}
L
Linus Torvalds 已提交
1285

H
Herbert Xu 已提交
1286
	if (vnet_hdr_sz) {
1287
		struct virtio_net_hdr gso = { 0 }; /* no info leak */
H
Herbert Xu 已提交
1288
		if (iov_iter_count(iter) < vnet_hdr_sz)
1289 1290 1291 1292 1293 1294
			return -EINVAL;

		if (skb_is_gso(skb)) {
			struct skb_shared_info *sinfo = skb_shinfo(skb);

			/* This is a hint as to how much should be linear. */
1295 1296
			gso.hdr_len = cpu_to_tun16(tun, skb_headlen(skb));
			gso.gso_size = cpu_to_tun16(tun, sinfo->gso_size);
1297 1298 1299 1300
			if (sinfo->gso_type & SKB_GSO_TCPV4)
				gso.gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
			else if (sinfo->gso_type & SKB_GSO_TCPV6)
				gso.gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
1301
			else {
1302
				pr_err("unexpected GSO type: "
1303
				       "0x%x, gso_size %d, hdr_len %d\n",
1304 1305
				       sinfo->gso_type, tun16_to_cpu(tun, gso.gso_size),
				       tun16_to_cpu(tun, gso.hdr_len));
1306 1307 1308
				print_hex_dump(KERN_ERR, "tun: ",
					       DUMP_PREFIX_NONE,
					       16, 1, skb->head,
1309
					       min((int)tun16_to_cpu(tun, gso.hdr_len), 64), true);
1310 1311 1312
				WARN_ON_ONCE(1);
				return -EINVAL;
			}
1313 1314 1315 1316 1317 1318 1319
			if (sinfo->gso_type & SKB_GSO_TCP_ECN)
				gso.gso_type |= VIRTIO_NET_HDR_GSO_ECN;
		} else
			gso.gso_type = VIRTIO_NET_HDR_GSO_NONE;

		if (skb->ip_summed == CHECKSUM_PARTIAL) {
			gso.flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
1320 1321 1322
			gso.csum_start = cpu_to_tun16(tun, skb_checksum_start_offset(skb) +
						      vlan_hlen);
			gso.csum_offset = cpu_to_tun16(tun, skb->csum_offset);
1323 1324
		} else if (skb->ip_summed == CHECKSUM_UNNECESSARY) {
			gso.flags = VIRTIO_NET_HDR_F_DATA_VALID;
1325 1326
		} /* else everything is zero */

H
Herbert Xu 已提交
1327
		if (copy_to_iter(&gso, sizeof(gso), iter) != sizeof(gso))
1328
			return -EFAULT;
1329 1330

		iov_iter_advance(iter, vnet_hdr_sz - sizeof(gso));
1331 1332
	}

1333
	if (vlan_hlen) {
H
Herbert Xu 已提交
1334
		int ret;
J
Jason Wang 已提交
1335 1336 1337 1338 1339 1340 1341 1342 1343 1344
		struct {
			__be16 h_vlan_proto;
			__be16 h_vlan_TCI;
		} veth;

		veth.h_vlan_proto = skb->vlan_proto;
		veth.h_vlan_TCI = htons(vlan_tx_tag_get(skb));

		vlan_offset = offsetof(struct vlan_ethhdr, h_vlan_proto);

H
Herbert Xu 已提交
1345 1346
		ret = skb_copy_datagram_iter(skb, 0, iter, vlan_offset);
		if (ret || !iov_iter_count(iter))
J
Jason Wang 已提交
1347 1348
			goto done;

H
Herbert Xu 已提交
1349 1350
		ret = copy_to_iter(&veth, sizeof(veth), iter);
		if (ret != sizeof(veth) || !iov_iter_count(iter))
J
Jason Wang 已提交
1351 1352
			goto done;
	}
L
Linus Torvalds 已提交
1353

H
Herbert Xu 已提交
1354
	skb_copy_datagram_iter(skb, vlan_offset, iter, skb->len - vlan_offset);
L
Linus Torvalds 已提交
1355

J
Jason Wang 已提交
1356
done:
1357
	tun->dev->stats.tx_packets++;
H
Herbert Xu 已提交
1358
	tun->dev->stats.tx_bytes += skb->len + vlan_hlen;
L
Linus Torvalds 已提交
1359 1360 1361 1362

	return total;
}

J
Jason Wang 已提交
1363
static ssize_t tun_do_read(struct tun_struct *tun, struct tun_file *tfile,
1364 1365
			   struct iov_iter *to,
			   int noblock)
L
Linus Torvalds 已提交
1366 1367
{
	struct sk_buff *skb;
1368
	ssize_t ret;
1369
	int peeked, err, off = 0;
L
Linus Torvalds 已提交
1370

1371
	tun_debug(KERN_INFO, tun, "tun_do_read\n");
L
Linus Torvalds 已提交
1372

1373 1374
	if (!iov_iter_count(to))
		return 0;
L
Linus Torvalds 已提交
1375

1376 1377
	if (tun->dev->reg_state != NETREG_REGISTERED)
		return -EIO;
L
Linus Torvalds 已提交
1378

1379 1380 1381
	/* Read frames from queue */
	skb = __skb_recv_datagram(tfile->socket.sk, noblock ? MSG_DONTWAIT : 0,
				  &peeked, &off, &err);
H
Herbert Xu 已提交
1382
	if (!skb)
1383
		return 0;
H
Herbert Xu 已提交
1384

1385
	ret = tun_put_user(tun, tfile, skb, to);
1386
	if (unlikely(ret < 0))
1387
		kfree_skb(skb);
1388 1389
	else
		consume_skb(skb);
L
Linus Torvalds 已提交
1390

1391 1392 1393
	return ret;
}

1394
static ssize_t tun_chr_read_iter(struct kiocb *iocb, struct iov_iter *to)
1395 1396 1397 1398
{
	struct file *file = iocb->ki_filp;
	struct tun_file *tfile = file->private_data;
	struct tun_struct *tun = __tun_get(tfile);
1399
	ssize_t len = iov_iter_count(to), ret;
1400 1401 1402

	if (!tun)
		return -EBADFD;
1403
	ret = tun_do_read(tun, tfile, to, file->f_flags & O_NONBLOCK);
1404
	ret = min_t(ssize_t, ret, len);
Z
Zhi Yong Wu 已提交
1405 1406
	if (ret > 0)
		iocb->ki_pos = ret;
E
Eric W. Biederman 已提交
1407
	tun_put(tun);
L
Linus Torvalds 已提交
1408 1409 1410
	return ret;
}

J
Jason Wang 已提交
1411 1412 1413 1414
static void tun_free_netdev(struct net_device *dev)
{
	struct tun_struct *tun = netdev_priv(dev);

J
Jason Wang 已提交
1415
	BUG_ON(!(list_empty(&tun->disabled)));
J
Jason Wang 已提交
1416
	tun_flow_uninit(tun);
1417
	security_tun_dev_free_security(tun->security);
J
Jason Wang 已提交
1418 1419 1420
	free_netdev(dev);
}

L
Linus Torvalds 已提交
1421 1422 1423 1424
static void tun_setup(struct net_device *dev)
{
	struct tun_struct *tun = netdev_priv(dev);

1425 1426
	tun->owner = INVALID_UID;
	tun->group = INVALID_GID;
L
Linus Torvalds 已提交
1427 1428

	dev->ethtool_ops = &tun_ethtool_ops;
J
Jason Wang 已提交
1429
	dev->destructor = tun_free_netdev;
L
Linus Torvalds 已提交
1430 1431
}

1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446
/* Trivial set of netlink ops to allow deleting tun or tap
 * device with netlink.
 */
static int tun_validate(struct nlattr *tb[], struct nlattr *data[])
{
	return -EINVAL;
}

static struct rtnl_link_ops tun_link_ops __read_mostly = {
	.kind		= DRV_NAME,
	.priv_size	= sizeof(struct tun_struct),
	.setup		= tun_setup,
	.validate	= tun_validate,
};

1447 1448
static void tun_sock_write_space(struct sock *sk)
{
J
Jason Wang 已提交
1449
	struct tun_file *tfile;
1450
	wait_queue_head_t *wqueue;
1451 1452 1453 1454 1455 1456 1457

	if (!sock_writeable(sk))
		return;

	if (!test_and_clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags))
		return;

1458 1459 1460
	wqueue = sk_sleep(sk);
	if (wqueue && waitqueue_active(wqueue))
		wake_up_interruptible_sync_poll(wqueue, POLLOUT |
1461
						POLLWRNORM | POLLWRBAND);
H
Herbert Xu 已提交
1462

J
Jason Wang 已提交
1463 1464
	tfile = container_of(sk, struct tun_file, sk);
	kill_fasync(&tfile->fasync, SIGIO, POLL_OUT);
1465 1466
}

1467 1468 1469
static int tun_sendmsg(struct kiocb *iocb, struct socket *sock,
		       struct msghdr *m, size_t total_len)
{
J
Jason Wang 已提交
1470 1471 1472 1473 1474 1475
	int ret;
	struct tun_file *tfile = container_of(sock, struct tun_file, socket);
	struct tun_struct *tun = __tun_get(tfile);

	if (!tun)
		return -EBADFD;
1476

A
Al Viro 已提交
1477
	ret = tun_get_user(tun, tfile, m->msg_control, &m->msg_iter,
1478
			   m->msg_flags & MSG_DONTWAIT);
J
Jason Wang 已提交
1479 1480
	tun_put(tun);
	return ret;
1481 1482 1483 1484 1485 1486
}

static int tun_recvmsg(struct kiocb *iocb, struct socket *sock,
		       struct msghdr *m, size_t total_len,
		       int flags)
{
J
Jason Wang 已提交
1487 1488
	struct tun_file *tfile = container_of(sock, struct tun_file, socket);
	struct tun_struct *tun = __tun_get(tfile);
1489
	int ret;
J
Jason Wang 已提交
1490 1491 1492 1493

	if (!tun)
		return -EBADFD;

1494
	if (flags & ~(MSG_DONTWAIT|MSG_TRUNC|MSG_ERRQUEUE)) {
1495 1496 1497
		ret = -EINVAL;
		goto out;
	}
1498 1499 1500 1501 1502
	if (flags & MSG_ERRQUEUE) {
		ret = sock_recv_errqueue(sock->sk, m, total_len,
					 SOL_PACKET, TUN_TX_TIMESTAMP);
		goto out;
	}
A
Al Viro 已提交
1503
	ret = tun_do_read(tun, tfile, &m->msg_iter, flags & MSG_DONTWAIT);
1504 1505 1506 1507
	if (ret > total_len) {
		m->msg_flags |= MSG_TRUNC;
		ret = flags & MSG_TRUNC ? ret : total_len;
	}
1508
out:
J
Jason Wang 已提交
1509
	tun_put(tun);
1510 1511 1512
	return ret;
}

1513 1514 1515 1516 1517 1518 1519
static int tun_release(struct socket *sock)
{
	if (sock->sk)
		sock_put(sock->sk);
	return 0;
}

1520 1521 1522 1523
/* Ops structure to mimic raw sockets with tun */
static const struct proto_ops tun_socket_ops = {
	.sendmsg = tun_sendmsg,
	.recvmsg = tun_recvmsg,
1524
	.release = tun_release,
1525 1526
};

1527 1528 1529
static struct proto tun_proto = {
	.name		= "tun",
	.owner		= THIS_MODULE,
J
Jason Wang 已提交
1530
	.obj_size	= sizeof(struct tun_file),
1531
};
1532

1533 1534
static int tun_flags(struct tun_struct *tun)
{
1535
	return tun->flags & (TUN_FEATURES | IFF_PERSIST | IFF_TUN | IFF_TAP);
1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548
}

static ssize_t tun_show_flags(struct device *dev, struct device_attribute *attr,
			      char *buf)
{
	struct tun_struct *tun = netdev_priv(to_net_dev(dev));
	return sprintf(buf, "0x%x\n", tun_flags(tun));
}

static ssize_t tun_show_owner(struct device *dev, struct device_attribute *attr,
			      char *buf)
{
	struct tun_struct *tun = netdev_priv(to_net_dev(dev));
1549 1550 1551 1552
	return uid_valid(tun->owner)?
		sprintf(buf, "%u\n",
			from_kuid_munged(current_user_ns(), tun->owner)):
		sprintf(buf, "-1\n");
1553 1554 1555 1556 1557 1558
}

static ssize_t tun_show_group(struct device *dev, struct device_attribute *attr,
			      char *buf)
{
	struct tun_struct *tun = netdev_priv(to_net_dev(dev));
1559 1560 1561 1562
	return gid_valid(tun->group) ?
		sprintf(buf, "%u\n",
			from_kgid_munged(current_user_ns(), tun->group)):
		sprintf(buf, "-1\n");
1563 1564 1565 1566 1567 1568
}

static DEVICE_ATTR(tun_flags, 0444, tun_show_flags, NULL);
static DEVICE_ATTR(owner, 0444, tun_show_owner, NULL);
static DEVICE_ATTR(group, 0444, tun_show_group, NULL);

1569
static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
L
Linus Torvalds 已提交
1570 1571
{
	struct tun_struct *tun;
J
Jason Wang 已提交
1572
	struct tun_file *tfile = file->private_data;
L
Linus Torvalds 已提交
1573 1574 1575
	struct net_device *dev;
	int err;

1576 1577 1578
	if (tfile->detached)
		return -EINVAL;

1579 1580
	dev = __dev_get_by_name(net, ifr->ifr_name);
	if (dev) {
1581 1582
		if (ifr->ifr_flags & IFF_TUN_EXCL)
			return -EBUSY;
1583 1584 1585 1586 1587 1588 1589
		if ((ifr->ifr_flags & IFF_TUN) && dev->netdev_ops == &tun_netdev_ops)
			tun = netdev_priv(dev);
		else if ((ifr->ifr_flags & IFF_TAP) && dev->netdev_ops == &tap_netdev_ops)
			tun = netdev_priv(dev);
		else
			return -EINVAL;

1590
		if (!!(ifr->ifr_flags & IFF_MULTI_QUEUE) !=
M
Michael S. Tsirkin 已提交
1591
		    !!(tun->flags & IFF_MULTI_QUEUE))
1592 1593
			return -EINVAL;

1594
		if (tun_not_capable(tun))
P
Paul Moore 已提交
1595
			return -EPERM;
1596
		err = security_tun_dev_open(tun->security);
P
Paul Moore 已提交
1597 1598 1599
		if (err < 0)
			return err;

1600
		err = tun_attach(tun, file, ifr->ifr_flags & IFF_NOFILTER);
1601 1602
		if (err < 0)
			return err;
J
Jason Wang 已提交
1603

M
Michael S. Tsirkin 已提交
1604
		if (tun->flags & IFF_MULTI_QUEUE &&
1605 1606 1607 1608 1609 1610
		    (tun->numqueues + tun->numdisabled > 1)) {
			/* One or more queue has already been attached, no need
			 * to initialize the device again.
			 */
			return 0;
		}
1611
	}
L
Linus Torvalds 已提交
1612 1613 1614
	else {
		char *name;
		unsigned long flags = 0;
1615 1616
		int queues = ifr->ifr_flags & IFF_MULTI_QUEUE ?
			     MAX_TAP_QUEUES : 1;
L
Linus Torvalds 已提交
1617

1618
		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
1619
			return -EPERM;
P
Paul Moore 已提交
1620 1621 1622
		err = security_tun_dev_create();
		if (err < 0)
			return err;
1623

L
Linus Torvalds 已提交
1624 1625 1626
		/* Set dev type */
		if (ifr->ifr_flags & IFF_TUN) {
			/* TUN device */
M
Michael S. Tsirkin 已提交
1627
			flags |= IFF_TUN;
L
Linus Torvalds 已提交
1628 1629 1630
			name = "tun%d";
		} else if (ifr->ifr_flags & IFF_TAP) {
			/* TAP device */
M
Michael S. Tsirkin 已提交
1631
			flags |= IFF_TAP;
L
Linus Torvalds 已提交
1632
			name = "tap%d";
1633
		} else
1634
			return -EINVAL;
1635

L
Linus Torvalds 已提交
1636 1637 1638
		if (*ifr->ifr_name)
			name = ifr->ifr_name;

J
Jason Wang 已提交
1639
		dev = alloc_netdev_mqs(sizeof(struct tun_struct), name,
1640 1641
				       NET_NAME_UNKNOWN, tun_setup, queues,
				       queues);
1642

L
Linus Torvalds 已提交
1643 1644 1645
		if (!dev)
			return -ENOMEM;

1646
		dev_net_set(dev, net);
1647
		dev->rtnl_link_ops = &tun_link_ops;
1648
		dev->ifindex = tfile->ifindex;
S
Stephen Hemminger 已提交
1649

L
Linus Torvalds 已提交
1650 1651 1652
		tun = netdev_priv(dev);
		tun->dev = dev;
		tun->flags = flags;
1653
		tun->txflt.count = 0;
1654
		tun->vnet_hdr_sz = sizeof(struct virtio_net_hdr);
1655

J
Jason Wang 已提交
1656 1657
		tun->filter_attached = false;
		tun->sndbuf = tfile->socket.sk->sk_sndbuf;
1658

J
Jason Wang 已提交
1659 1660
		spin_lock_init(&tun->lock);

1661 1662 1663
		err = security_tun_dev_alloc_security(&tun->security);
		if (err < 0)
			goto err_free_dev;
P
Paul Moore 已提交
1664

L
Linus Torvalds 已提交
1665
		tun_net_init(dev);
1666
		tun_flow_init(tun);
J
Jason Wang 已提交
1667

1668
		dev->hw_features = NETIF_F_SG | NETIF_F_FRAGLIST |
J
Jason Wang 已提交
1669 1670
				   TUN_USER_FEATURES | NETIF_F_HW_VLAN_CTAG_TX |
				   NETIF_F_HW_VLAN_STAG_TX;
1671
		dev->features = dev->hw_features;
1672 1673 1674
		dev->vlan_features = dev->features &
				     ~(NETIF_F_HW_VLAN_CTAG_TX |
				       NETIF_F_HW_VLAN_STAG_TX);
1675

J
Jason Wang 已提交
1676
		INIT_LIST_HEAD(&tun->disabled);
1677
		err = tun_attach(tun, file, false);
1678
		if (err < 0)
1679
			goto err_free_flow;
1680

L
Linus Torvalds 已提交
1681 1682
		err = register_netdevice(tun->dev);
		if (err < 0)
1683
			goto err_detach;
1684

1685 1686 1687
		if (device_create_file(&tun->dev->dev, &dev_attr_tun_flags) ||
		    device_create_file(&tun->dev->dev, &dev_attr_owner) ||
		    device_create_file(&tun->dev->dev, &dev_attr_group))
1688
			pr_err("Failed to create tun sysfs files\n");
L
Linus Torvalds 已提交
1689 1690
	}

1691 1692
	netif_carrier_on(tun->dev);

1693
	tun_debug(KERN_INFO, tun, "tun_set_iff\n");
L
Linus Torvalds 已提交
1694

1695 1696
	tun->flags = (tun->flags & ~TUN_FEATURES) |
		(ifr->ifr_flags & TUN_FEATURES);
J
Jason Wang 已提交
1697

1698 1699 1700 1701
	/* Make sure persistent devices do not get stuck in
	 * xoff state.
	 */
	if (netif_running(tun->dev))
J
Jason Wang 已提交
1702
		netif_tx_wake_all_queues(tun->dev);
1703

L
Linus Torvalds 已提交
1704 1705 1706
	strcpy(ifr->ifr_name, tun->dev->name);
	return 0;

1707 1708 1709 1710 1711 1712
err_detach:
	tun_detach_all(dev);
err_free_flow:
	tun_flow_uninit(tun);
	security_tun_dev_free_security(tun->security);
err_free_dev:
L
Linus Torvalds 已提交
1713 1714 1715 1716
	free_netdev(dev);
	return err;
}

R
Rami Rosen 已提交
1717
static void tun_get_iff(struct net *net, struct tun_struct *tun,
1718
		       struct ifreq *ifr)
1719
{
1720
	tun_debug(KERN_INFO, tun, "tun_get_iff\n");
1721 1722 1723

	strcpy(ifr->ifr_name, tun->dev->name);

1724
	ifr->ifr_flags = tun_flags(tun);
1725 1726 1727

}

1728 1729
/* This is like a cut-down ethtool ops, except done via tun fd so no
 * privs required. */
1730
static int set_offload(struct tun_struct *tun, unsigned long arg)
1731
{
1732
	netdev_features_t features = 0;
1733 1734

	if (arg & TUN_F_CSUM) {
1735
		features |= NETIF_F_HW_CSUM;
1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755
		arg &= ~TUN_F_CSUM;

		if (arg & (TUN_F_TSO4|TUN_F_TSO6)) {
			if (arg & TUN_F_TSO_ECN) {
				features |= NETIF_F_TSO_ECN;
				arg &= ~TUN_F_TSO_ECN;
			}
			if (arg & TUN_F_TSO4)
				features |= NETIF_F_TSO;
			if (arg & TUN_F_TSO6)
				features |= NETIF_F_TSO6;
			arg &= ~(TUN_F_TSO4|TUN_F_TSO6);
		}
	}

	/* This gives the user a way to test for new features in future by
	 * trying to set them. */
	if (arg)
		return -EINVAL;

1756 1757
	tun->set_features = features;
	netdev_update_features(tun->dev);
1758 1759 1760 1761

	return 0;
}

J
Jason Wang 已提交
1762 1763 1764 1765 1766 1767
static void tun_detach_filter(struct tun_struct *tun, int n)
{
	int i;
	struct tun_file *tfile;

	for (i = 0; i < n; i++) {
1768
		tfile = rtnl_dereference(tun->tfiles[i]);
J
Jason Wang 已提交
1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780
		sk_detach_filter(tfile->socket.sk);
	}

	tun->filter_attached = false;
}

static int tun_attach_filter(struct tun_struct *tun)
{
	int i, ret = 0;
	struct tun_file *tfile;

	for (i = 0; i < tun->numqueues; i++) {
1781
		tfile = rtnl_dereference(tun->tfiles[i]);
J
Jason Wang 已提交
1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798
		ret = sk_attach_filter(&tun->fprog, tfile->socket.sk);
		if (ret) {
			tun_detach_filter(tun, i);
			return ret;
		}
	}

	tun->filter_attached = true;
	return ret;
}

static void tun_set_sndbuf(struct tun_struct *tun)
{
	struct tun_file *tfile;
	int i;

	for (i = 0; i < tun->numqueues; i++) {
1799
		tfile = rtnl_dereference(tun->tfiles[i]);
J
Jason Wang 已提交
1800 1801 1802 1803
		tfile->socket.sk->sk_sndbuf = tun->sndbuf;
	}
}

1804 1805 1806 1807 1808 1809 1810 1811 1812
static int tun_set_queue(struct file *file, struct ifreq *ifr)
{
	struct tun_file *tfile = file->private_data;
	struct tun_struct *tun;
	int ret = 0;

	rtnl_lock();

	if (ifr->ifr_flags & IFF_ATTACH_QUEUE) {
J
Jason Wang 已提交
1813
		tun = tfile->detached;
1814
		if (!tun) {
1815
			ret = -EINVAL;
1816 1817 1818 1819 1820
			goto unlock;
		}
		ret = security_tun_dev_attach_queue(tun->security);
		if (ret < 0)
			goto unlock;
1821
		ret = tun_attach(tun, file, false);
J
Jason Wang 已提交
1822
	} else if (ifr->ifr_flags & IFF_DETACH_QUEUE) {
1823
		tun = rtnl_dereference(tfile->tun);
M
Michael S. Tsirkin 已提交
1824
		if (!tun || !(tun->flags & IFF_MULTI_QUEUE) || tfile->detached)
J
Jason Wang 已提交
1825 1826 1827 1828
			ret = -EINVAL;
		else
			__tun_detach(tfile, false);
	} else
1829 1830
		ret = -EINVAL;

1831
unlock:
1832 1833 1834 1835
	rtnl_unlock();
	return ret;
}

1836 1837
static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
			    unsigned long arg, int ifreq_len)
L
Linus Torvalds 已提交
1838
{
E
Eric W. Biederman 已提交
1839
	struct tun_file *tfile = file->private_data;
E
Eric W. Biederman 已提交
1840
	struct tun_struct *tun;
L
Linus Torvalds 已提交
1841 1842
	void __user* argp = (void __user*)arg;
	struct ifreq ifr;
1843 1844
	kuid_t owner;
	kgid_t group;
1845
	int sndbuf;
1846
	int vnet_hdr_sz;
1847
	unsigned int ifindex;
M
Michael S. Tsirkin 已提交
1848
	int le;
1849
	int ret;
L
Linus Torvalds 已提交
1850

1851
	if (cmd == TUNSETIFF || cmd == TUNSETQUEUE || _IOC_TYPE(cmd) == 0x89) {
1852
		if (copy_from_user(&ifr, argp, ifreq_len))
L
Linus Torvalds 已提交
1853
			return -EFAULT;
D
David S. Miller 已提交
1854
	} else {
1855
		memset(&ifr, 0, sizeof(ifr));
D
David S. Miller 已提交
1856
	}
E
Eric W. Biederman 已提交
1857 1858 1859
	if (cmd == TUNGETFEATURES) {
		/* Currently this just means: "what IFF flags are valid?".
		 * This is needed because we never checked for invalid flags on
1860 1861 1862
		 * TUNSETIFF.
		 */
		return put_user(IFF_TUN | IFF_TAP | TUN_FEATURES,
E
Eric W. Biederman 已提交
1863
				(unsigned int __user*)argp);
1864 1865
	} else if (cmd == TUNSETQUEUE)
		return tun_set_queue(file, &ifr);
E
Eric W. Biederman 已提交
1866

J
Jason Wang 已提交
1867
	ret = 0;
1868 1869
	rtnl_lock();

E
Eric W. Biederman 已提交
1870
	tun = __tun_get(tfile);
L
Linus Torvalds 已提交
1871 1872 1873
	if (cmd == TUNSETIFF && !tun) {
		ifr.ifr_name[IFNAMSIZ-1] = '\0';

1874
		ret = tun_set_iff(tfile->net, file, &ifr);
L
Linus Torvalds 已提交
1875

1876 1877
		if (ret)
			goto unlock;
L
Linus Torvalds 已提交
1878

1879
		if (copy_to_user(argp, &ifr, ifreq_len))
1880 1881
			ret = -EFAULT;
		goto unlock;
L
Linus Torvalds 已提交
1882
	}
1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895
	if (cmd == TUNSETIFINDEX) {
		ret = -EPERM;
		if (tun)
			goto unlock;

		ret = -EFAULT;
		if (copy_from_user(&ifindex, argp, sizeof(ifindex)))
			goto unlock;

		ret = 0;
		tfile->ifindex = ifindex;
		goto unlock;
	}
L
Linus Torvalds 已提交
1896

1897
	ret = -EBADFD;
L
Linus Torvalds 已提交
1898
	if (!tun)
1899
		goto unlock;
L
Linus Torvalds 已提交
1900

1901
	tun_debug(KERN_INFO, tun, "tun_chr_ioctl cmd %u\n", cmd);
L
Linus Torvalds 已提交
1902

E
Eric W. Biederman 已提交
1903
	ret = 0;
L
Linus Torvalds 已提交
1904
	switch (cmd) {
1905
	case TUNGETIFF:
R
Rami Rosen 已提交
1906
		tun_get_iff(current->nsproxy->net_ns, tun, &ifr);
1907

1908 1909
		if (tfile->detached)
			ifr.ifr_flags |= IFF_DETACH_QUEUE;
1910 1911
		if (!tfile->socket.sk->sk_filter)
			ifr.ifr_flags |= IFF_NOFILTER;
1912

1913
		if (copy_to_user(argp, &ifr, ifreq_len))
E
Eric W. Biederman 已提交
1914
			ret = -EFAULT;
1915 1916
		break;

L
Linus Torvalds 已提交
1917 1918 1919
	case TUNSETNOCSUM:
		/* Disable/Enable checksum */

1920 1921
		/* [unimplemented] */
		tun_debug(KERN_INFO, tun, "ignored: set checksum %s\n",
1922
			  arg ? "disabled" : "enabled");
L
Linus Torvalds 已提交
1923 1924 1925
		break;

	case TUNSETPERSIST:
J
Jason Wang 已提交
1926 1927 1928
		/* Disable/Enable persist mode. Keep an extra reference to the
		 * module to prevent the module being unprobed.
		 */
M
Michael S. Tsirkin 已提交
1929 1930
		if (arg && !(tun->flags & IFF_PERSIST)) {
			tun->flags |= IFF_PERSIST;
J
Jason Wang 已提交
1931
			__module_get(THIS_MODULE);
J
Jason Wang 已提交
1932
		}
M
Michael S. Tsirkin 已提交
1933 1934
		if (!arg && (tun->flags & IFF_PERSIST)) {
			tun->flags &= ~IFF_PERSIST;
J
Jason Wang 已提交
1935 1936
			module_put(THIS_MODULE);
		}
L
Linus Torvalds 已提交
1937

1938 1939
		tun_debug(KERN_INFO, tun, "persist %s\n",
			  arg ? "enabled" : "disabled");
L
Linus Torvalds 已提交
1940 1941 1942 1943
		break;

	case TUNSETOWNER:
		/* Set owner of the device */
1944 1945 1946 1947 1948 1949
		owner = make_kuid(current_user_ns(), arg);
		if (!uid_valid(owner)) {
			ret = -EINVAL;
			break;
		}
		tun->owner = owner;
1950
		tun_debug(KERN_INFO, tun, "owner set to %u\n",
1951
			  from_kuid(&init_user_ns, tun->owner));
L
Linus Torvalds 已提交
1952 1953
		break;

1954 1955
	case TUNSETGROUP:
		/* Set group of the device */
1956 1957 1958 1959 1960 1961
		group = make_kgid(current_user_ns(), arg);
		if (!gid_valid(group)) {
			ret = -EINVAL;
			break;
		}
		tun->group = group;
1962
		tun_debug(KERN_INFO, tun, "group set to %u\n",
1963
			  from_kgid(&init_user_ns, tun->group));
1964 1965
		break;

1966 1967 1968
	case TUNSETLINK:
		/* Only allow setting the type when the interface is down */
		if (tun->dev->flags & IFF_UP) {
1969 1970
			tun_debug(KERN_INFO, tun,
				  "Linktype set failed because interface is up\n");
1971
			ret = -EBUSY;
1972 1973
		} else {
			tun->dev->type = (int) arg;
1974 1975
			tun_debug(KERN_INFO, tun, "linktype set to %d\n",
				  tun->dev->type);
1976
			ret = 0;
1977
		}
E
Eric W. Biederman 已提交
1978
		break;
1979

L
Linus Torvalds 已提交
1980 1981 1982 1983 1984
#ifdef TUN_DEBUG
	case TUNSETDEBUG:
		tun->debug = arg;
		break;
#endif
1985
	case TUNSETOFFLOAD:
1986
		ret = set_offload(tun, arg);
E
Eric W. Biederman 已提交
1987
		break;
1988

1989 1990
	case TUNSETTXFILTER:
		/* Can be set only for TAPs */
E
Eric W. Biederman 已提交
1991
		ret = -EINVAL;
M
Michael S. Tsirkin 已提交
1992
		if ((tun->flags & TUN_TYPE_MASK) != IFF_TAP)
E
Eric W. Biederman 已提交
1993
			break;
H
Harvey Harrison 已提交
1994
		ret = update_filter(&tun->txflt, (void __user *)arg);
E
Eric W. Biederman 已提交
1995
		break;
L
Linus Torvalds 已提交
1996 1997

	case SIOCGIFHWADDR:
1998
		/* Get hw address */
1999 2000
		memcpy(ifr.ifr_hwaddr.sa_data, tun->dev->dev_addr, ETH_ALEN);
		ifr.ifr_hwaddr.sa_family = tun->dev->type;
2001
		if (copy_to_user(argp, &ifr, ifreq_len))
E
Eric W. Biederman 已提交
2002 2003
			ret = -EFAULT;
		break;
L
Linus Torvalds 已提交
2004 2005

	case SIOCSIFHWADDR:
2006
		/* Set hw address */
2007 2008
		tun_debug(KERN_DEBUG, tun, "set hw address: %pM\n",
			  ifr.ifr_hwaddr.sa_data);
2009 2010

		ret = dev_set_mac_address(tun->dev, &ifr.ifr_hwaddr);
E
Eric W. Biederman 已提交
2011
		break;
2012 2013

	case TUNGETSNDBUF:
J
Jason Wang 已提交
2014
		sndbuf = tfile->socket.sk->sk_sndbuf;
2015 2016 2017 2018 2019 2020 2021 2022 2023 2024
		if (copy_to_user(argp, &sndbuf, sizeof(sndbuf)))
			ret = -EFAULT;
		break;

	case TUNSETSNDBUF:
		if (copy_from_user(&sndbuf, argp, sizeof(sndbuf))) {
			ret = -EFAULT;
			break;
		}

J
Jason Wang 已提交
2025 2026
		tun->sndbuf = sndbuf;
		tun_set_sndbuf(tun);
2027 2028
		break;

2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047
	case TUNGETVNETHDRSZ:
		vnet_hdr_sz = tun->vnet_hdr_sz;
		if (copy_to_user(argp, &vnet_hdr_sz, sizeof(vnet_hdr_sz)))
			ret = -EFAULT;
		break;

	case TUNSETVNETHDRSZ:
		if (copy_from_user(&vnet_hdr_sz, argp, sizeof(vnet_hdr_sz))) {
			ret = -EFAULT;
			break;
		}
		if (vnet_hdr_sz < (int)sizeof(struct virtio_net_hdr)) {
			ret = -EINVAL;
			break;
		}

		tun->vnet_hdr_sz = vnet_hdr_sz;
		break;

M
Michael S. Tsirkin 已提交
2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064
	case TUNGETVNETLE:
		le = !!(tun->flags & TUN_VNET_LE);
		if (put_user(le, (int __user *)argp))
			ret = -EFAULT;
		break;

	case TUNSETVNETLE:
		if (get_user(le, (int __user *)argp)) {
			ret = -EFAULT;
			break;
		}
		if (le)
			tun->flags |= TUN_VNET_LE;
		else
			tun->flags &= ~TUN_VNET_LE;
		break;

M
Michael S. Tsirkin 已提交
2065 2066 2067
	case TUNATTACHFILTER:
		/* Can be set only for TAPs */
		ret = -EINVAL;
M
Michael S. Tsirkin 已提交
2068
		if ((tun->flags & TUN_TYPE_MASK) != IFF_TAP)
M
Michael S. Tsirkin 已提交
2069 2070
			break;
		ret = -EFAULT;
J
Jason Wang 已提交
2071
		if (copy_from_user(&tun->fprog, argp, sizeof(tun->fprog)))
M
Michael S. Tsirkin 已提交
2072 2073
			break;

J
Jason Wang 已提交
2074
		ret = tun_attach_filter(tun);
M
Michael S. Tsirkin 已提交
2075 2076 2077 2078 2079
		break;

	case TUNDETACHFILTER:
		/* Can be set only for TAPs */
		ret = -EINVAL;
M
Michael S. Tsirkin 已提交
2080
		if ((tun->flags & TUN_TYPE_MASK) != IFF_TAP)
M
Michael S. Tsirkin 已提交
2081
			break;
J
Jason Wang 已提交
2082 2083
		ret = 0;
		tun_detach_filter(tun, tun->numqueues);
M
Michael S. Tsirkin 已提交
2084 2085
		break;

P
Pavel Emelyanov 已提交
2086 2087
	case TUNGETFILTER:
		ret = -EINVAL;
M
Michael S. Tsirkin 已提交
2088
		if ((tun->flags & TUN_TYPE_MASK) != IFF_TAP)
P
Pavel Emelyanov 已提交
2089 2090 2091 2092 2093 2094 2095
			break;
		ret = -EFAULT;
		if (copy_to_user(argp, &tun->fprog, sizeof(tun->fprog)))
			break;
		ret = 0;
		break;

L
Linus Torvalds 已提交
2096
	default:
E
Eric W. Biederman 已提交
2097 2098
		ret = -EINVAL;
		break;
2099
	}
L
Linus Torvalds 已提交
2100

2101 2102 2103 2104
unlock:
	rtnl_unlock();
	if (tun)
		tun_put(tun);
E
Eric W. Biederman 已提交
2105
	return ret;
L
Linus Torvalds 已提交
2106 2107
}

2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142
static long tun_chr_ioctl(struct file *file,
			  unsigned int cmd, unsigned long arg)
{
	return __tun_chr_ioctl(file, cmd, arg, sizeof (struct ifreq));
}

#ifdef CONFIG_COMPAT
static long tun_chr_compat_ioctl(struct file *file,
			 unsigned int cmd, unsigned long arg)
{
	switch (cmd) {
	case TUNSETIFF:
	case TUNGETIFF:
	case TUNSETTXFILTER:
	case TUNGETSNDBUF:
	case TUNSETSNDBUF:
	case SIOCGIFHWADDR:
	case SIOCSIFHWADDR:
		arg = (unsigned long)compat_ptr(arg);
		break;
	default:
		arg = (compat_ulong_t)arg;
		break;
	}

	/*
	 * compat_ifreq is shorter than ifreq, so we must not access beyond
	 * the end of that structure. All fields that are used in this
	 * driver are compatible though, we don't need to convert the
	 * contents.
	 */
	return __tun_chr_ioctl(file, cmd, arg, sizeof(struct compat_ifreq));
}
#endif /* CONFIG_COMPAT */

L
Linus Torvalds 已提交
2143 2144
static int tun_chr_fasync(int fd, struct file *file, int on)
{
J
Jason Wang 已提交
2145
	struct tun_file *tfile = file->private_data;
L
Linus Torvalds 已提交
2146 2147
	int ret;

J
Jason Wang 已提交
2148
	if ((ret = fasync_helper(fd, file, on, &tfile->fasync)) < 0)
J
Jonathan Corbet 已提交
2149
		goto out;
2150

L
Linus Torvalds 已提交
2151
	if (on) {
2152
		__f_setown(file, task_pid(current), PIDTYPE_PID, 0);
J
Jason Wang 已提交
2153
		tfile->flags |= TUN_FASYNC;
2154
	} else
J
Jason Wang 已提交
2155
		tfile->flags &= ~TUN_FASYNC;
J
Jonathan Corbet 已提交
2156 2157 2158
	ret = 0;
out:
	return ret;
L
Linus Torvalds 已提交
2159 2160 2161 2162
}

static int tun_chr_open(struct inode *inode, struct file * file)
{
E
Eric W. Biederman 已提交
2163
	struct tun_file *tfile;
T
Thomas Gleixner 已提交
2164

2165
	DBG1(KERN_INFO, "tunX: tun_chr_open\n");
E
Eric W. Biederman 已提交
2166

J
Jason Wang 已提交
2167 2168
	tfile = (struct tun_file *)sk_alloc(&init_net, AF_UNSPEC, GFP_KERNEL,
					    &tun_proto);
E
Eric W. Biederman 已提交
2169 2170
	if (!tfile)
		return -ENOMEM;
2171
	RCU_INIT_POINTER(tfile->tun, NULL);
E
Eric W. Biederman 已提交
2172
	tfile->net = get_net(current->nsproxy->net_ns);
J
Jason Wang 已提交
2173
	tfile->flags = 0;
2174
	tfile->ifindex = 0;
J
Jason Wang 已提交
2175 2176

	init_waitqueue_head(&tfile->wq.wait);
2177
	RCU_INIT_POINTER(tfile->socket.wq, &tfile->wq);
J
Jason Wang 已提交
2178 2179 2180 2181 2182 2183 2184 2185 2186 2187

	tfile->socket.file = file;
	tfile->socket.ops = &tun_socket_ops;

	sock_init_data(&tfile->socket, &tfile->sk);
	sk_change_net(&tfile->sk, tfile->net);

	tfile->sk.sk_write_space = tun_sock_write_space;
	tfile->sk.sk_sndbuf = INT_MAX;

E
Eric W. Biederman 已提交
2188
	file->private_data = tfile;
J
Jason Wang 已提交
2189
	set_bit(SOCK_EXTERNALLY_ALLOCATED, &tfile->socket.flags);
J
Jason Wang 已提交
2190
	INIT_LIST_HEAD(&tfile->next);
J
Jason Wang 已提交
2191

2192 2193
	sock_set_flag(&tfile->sk, SOCK_ZEROCOPY);

L
Linus Torvalds 已提交
2194 2195 2196 2197 2198
	return 0;
}

static int tun_chr_close(struct inode *inode, struct file *file)
{
E
Eric W. Biederman 已提交
2199
	struct tun_file *tfile = file->private_data;
J
Jason Wang 已提交
2200
	struct net *net = tfile->net;
L
Linus Torvalds 已提交
2201

J
Jason Wang 已提交
2202
	tun_detach(tfile, true);
J
Jason Wang 已提交
2203
	put_net(net);
L
Linus Torvalds 已提交
2204 2205 2206 2207

	return 0;
}

2208
#ifdef CONFIG_PROC_FS
2209
static void tun_chr_show_fdinfo(struct seq_file *m, struct file *f)
2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224
{
	struct tun_struct *tun;
	struct ifreq ifr;

	memset(&ifr, 0, sizeof(ifr));

	rtnl_lock();
	tun = tun_get(f);
	if (tun)
		tun_get_iff(current->nsproxy->net_ns, tun, &ifr);
	rtnl_unlock();

	if (tun)
		tun_put(tun);

2225
	seq_printf(m, "iff:\t%s\n", ifr.ifr_name);
2226 2227 2228
}
#endif

2229
static const struct file_operations tun_fops = {
2230
	.owner	= THIS_MODULE,
L
Linus Torvalds 已提交
2231
	.llseek = no_llseek,
2232
	.read  = new_sync_read,
2233
	.write = new_sync_write,
2234
	.read_iter  = tun_chr_read_iter,
2235
	.write_iter = tun_chr_write_iter,
L
Linus Torvalds 已提交
2236
	.poll	= tun_chr_poll,
2237 2238 2239 2240
	.unlocked_ioctl	= tun_chr_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl = tun_chr_compat_ioctl,
#endif
L
Linus Torvalds 已提交
2241 2242
	.open	= tun_chr_open,
	.release = tun_chr_close,
2243 2244 2245 2246
	.fasync = tun_chr_fasync,
#ifdef CONFIG_PROC_FS
	.show_fdinfo = tun_chr_show_fdinfo,
#endif
L
Linus Torvalds 已提交
2247 2248 2249 2250 2251
};

static struct miscdevice tun_miscdev = {
	.minor = TUN_MINOR,
	.name = "tun",
2252
	.nodename = "net/tun",
L
Linus Torvalds 已提交
2253 2254 2255 2256 2257 2258 2259 2260 2261
	.fops = &tun_fops,
};

/* ethtool interface */

static int tun_get_settings(struct net_device *dev, struct ethtool_cmd *cmd)
{
	cmd->supported		= 0;
	cmd->advertising	= 0;
2262
	ethtool_cmd_speed_set(cmd, SPEED_10);
L
Linus Torvalds 已提交
2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276
	cmd->duplex		= DUPLEX_FULL;
	cmd->port		= PORT_TP;
	cmd->phy_address	= 0;
	cmd->transceiver	= XCVR_INTERNAL;
	cmd->autoneg		= AUTONEG_DISABLE;
	cmd->maxtxpkt		= 0;
	cmd->maxrxpkt		= 0;
	return 0;
}

static void tun_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info)
{
	struct tun_struct *tun = netdev_priv(dev);

2277 2278
	strlcpy(info->driver, DRV_NAME, sizeof(info->driver));
	strlcpy(info->version, DRV_VERSION, sizeof(info->version));
L
Linus Torvalds 已提交
2279 2280

	switch (tun->flags & TUN_TYPE_MASK) {
M
Michael S. Tsirkin 已提交
2281
	case IFF_TUN:
2282
		strlcpy(info->bus_info, "tun", sizeof(info->bus_info));
L
Linus Torvalds 已提交
2283
		break;
M
Michael S. Tsirkin 已提交
2284
	case IFF_TAP:
2285
		strlcpy(info->bus_info, "tap", sizeof(info->bus_info));
L
Linus Torvalds 已提交
2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307
		break;
	}
}

static u32 tun_get_msglevel(struct net_device *dev)
{
#ifdef TUN_DEBUG
	struct tun_struct *tun = netdev_priv(dev);
	return tun->debug;
#else
	return -EOPNOTSUPP;
#endif
}

static void tun_set_msglevel(struct net_device *dev, u32 value)
{
#ifdef TUN_DEBUG
	struct tun_struct *tun = netdev_priv(dev);
	tun->debug = value;
#endif
}

2308
static const struct ethtool_ops tun_ethtool_ops = {
L
Linus Torvalds 已提交
2309 2310 2311 2312
	.get_settings	= tun_get_settings,
	.get_drvinfo	= tun_get_drvinfo,
	.get_msglevel	= tun_get_msglevel,
	.set_msglevel	= tun_set_msglevel,
2313
	.get_link	= ethtool_op_get_link,
2314
	.get_ts_info	= ethtool_op_get_ts_info,
L
Linus Torvalds 已提交
2315 2316
};

2317

L
Linus Torvalds 已提交
2318 2319 2320 2321
static int __init tun_init(void)
{
	int ret = 0;

2322 2323
	pr_info("%s, %s\n", DRV_DESCRIPTION, DRV_VERSION);
	pr_info("%s\n", DRV_COPYRIGHT);
L
Linus Torvalds 已提交
2324

2325
	ret = rtnl_link_register(&tun_link_ops);
2326
	if (ret) {
2327
		pr_err("Can't register link_ops\n");
2328
		goto err_linkops;
2329 2330
	}

L
Linus Torvalds 已提交
2331
	ret = misc_register(&tun_miscdev);
2332
	if (ret) {
2333
		pr_err("Can't register misc device %d\n", TUN_MINOR);
2334 2335
		goto err_misc;
	}
2336
	return  0;
2337
err_misc:
2338 2339
	rtnl_link_unregister(&tun_link_ops);
err_linkops:
L
Linus Torvalds 已提交
2340 2341 2342 2343 2344
	return ret;
}

static void tun_cleanup(void)
{
2345
	misc_deregister(&tun_miscdev);
2346
	rtnl_link_unregister(&tun_link_ops);
L
Linus Torvalds 已提交
2347 2348
}

2349 2350 2351 2352 2353 2354
/* Get an underlying socket object from tun file.  Returns error unless file is
 * attached to a device.  The returned object works like a packet socket, it
 * can be used for sock_sendmsg/sock_recvmsg.  The caller is responsible for
 * holding a reference to the file for as long as the socket is in use. */
struct socket *tun_get_socket(struct file *file)
{
2355
	struct tun_file *tfile;
2356 2357
	if (file->f_op != &tun_fops)
		return ERR_PTR(-EINVAL);
2358 2359
	tfile = file->private_data;
	if (!tfile)
2360
		return ERR_PTR(-EBADFD);
J
Jason Wang 已提交
2361
	return &tfile->socket;
2362 2363 2364
}
EXPORT_SYMBOL_GPL(tun_get_socket);

L
Linus Torvalds 已提交
2365 2366 2367 2368 2369 2370
module_init(tun_init);
module_exit(tun_cleanup);
MODULE_DESCRIPTION(DRV_DESCRIPTION);
MODULE_AUTHOR(DRV_COPYRIGHT);
MODULE_LICENSE("GPL");
MODULE_ALIAS_MISCDEV(TUN_MINOR);
2371
MODULE_ALIAS("devname:net/tun");