tap.c 30.4 KB
Newer Older
A
Arnd Bergmann 已提交
1
#include <linux/etherdevice.h>
2
#include <linux/if_tap.h>
3
#include <linux/if_vlan.h>
A
Arnd Bergmann 已提交
4 5 6 7 8 9 10
#include <linux/interrupt.h>
#include <linux/nsproxy.h>
#include <linux/compat.h>
#include <linux/if_tun.h>
#include <linux/module.h>
#include <linux/skbuff.h>
#include <linux/cache.h>
11
#include <linux/sched/signal.h>
A
Arnd Bergmann 已提交
12
#include <linux/types.h>
13
#include <linux/slab.h>
A
Arnd Bergmann 已提交
14 15
#include <linux/wait.h>
#include <linux/cdev.h>
A
Al Viro 已提交
16
#include <linux/idr.h>
A
Arnd Bergmann 已提交
17
#include <linux/fs.h>
H
Herbert Xu 已提交
18
#include <linux/uio.h>
A
Arnd Bergmann 已提交
19 20 21 22

#include <net/net_namespace.h>
#include <net/rtnetlink.h>
#include <net/sock.h>
23
#include <linux/virtio_net.h>
J
Jason Wang 已提交
24
#include <linux/skb_array.h>
A
Arnd Bergmann 已提交
25

26
/* IFF_* bits userspace may toggle through TUNSETIFF (see tap_ioctl()). */
#define TAP_IFFEATURES (IFF_VNET_HDR | IFF_MULTI_QUEUE)

/* Per-queue byte-order overrides kept in the top bits of tap_queue::flags. */
#define TAP_VNET_LE 0x80000000
#define TAP_VNET_BE 0x40000000
30 31

#ifdef CONFIG_TUN_VNET_CROSS_LE
32
static inline bool tap_legacy_is_little_endian(struct tap_queue *q)
33
{
34
	return q->flags & TAP_VNET_BE ? false :
35 36 37
		virtio_legacy_is_little_endian();
}

38
/* TUNGETVNETBE: store 1 at @sp if TAP_VNET_BE is set on @q, 0 otherwise.
 * Returns 0, or -EFAULT if the user pointer cannot be written.
 */
static long tap_get_vnet_be(struct tap_queue *q, int __user *sp)
{
	int be = (q->flags & TAP_VNET_BE) ? 1 : 0;

	return put_user(be, sp) ? -EFAULT : 0;
}

48
/* TUNSETVNETBE: read an int from @sp and set (non-zero) or clear (zero)
 * TAP_VNET_BE on @q. Returns 0, or -EFAULT if the user read fails.
 */
static long tap_set_vnet_be(struct tap_queue *q, int __user *sp)
{
	int be;

	if (get_user(be, sp))
		return -EFAULT;

	if (!be)
		q->flags &= ~TAP_VNET_BE;
	else
		q->flags |= TAP_VNET_BE;

	return 0;
}
#else
63
/* Without CONFIG_TUN_VNET_CROSS_LE there is no per-queue override: the
 * answer is whatever virtio_legacy_is_little_endian() says.
 */
static inline bool tap_legacy_is_little_endian(struct tap_queue *q)
{
	return virtio_legacy_is_little_endian();
}

68
/* TUNGETVNETBE stub used when CONFIG_TUN_VNET_CROSS_LE is off: always -EINVAL. */
static long tap_get_vnet_be(struct tap_queue *q, int __user *argp)
{
	return -EINVAL;
}

73
/* TUNSETVNETBE stub used when CONFIG_TUN_VNET_CROSS_LE is off: always -EINVAL. */
static long tap_set_vnet_be(struct tap_queue *q, int __user *argp)
{
	return -EINVAL;
}
#endif /* CONFIG_TUN_VNET_CROSS_LE */
M
Michael S. Tsirkin 已提交
78

79
static inline bool tap_is_little_endian(struct tap_queue *q)
80
{
81 82
	return q->flags & TAP_VNET_LE ||
		tap_legacy_is_little_endian(q);
83
}
M
Michael S. Tsirkin 已提交
84

85
/* Convert a 16-bit virtio header field to CPU order using @q's endianness. */
static inline u16 tap16_to_cpu(struct tap_queue *q, __virtio16 val)
{
	bool little_endian = tap_is_little_endian(q);

	return __virtio16_to_cpu(little_endian, val);
}

90
/* Convert a CPU-order 16-bit value to the virtio byte order in use on @q. */
static inline __virtio16 cpu_to_tap16(struct tap_queue *q, u16 val)
{
	bool little_endian = tap_is_little_endian(q);

	return __cpu_to_virtio16(little_endian, val);
}

95 96
static struct proto tap_proto = {
	.name = "tap",
A
Arnd Bergmann 已提交
97
	.owner = THIS_MODULE,
98
	.obj_size = sizeof(struct tap_queue),
A
Arnd Bergmann 已提交
99 100
};

101
#define TAP_NUM_DEVS (1U << MINORBITS)
102 103 104

static LIST_HEAD(major_list);

105
struct major_info {
106
	struct rcu_head rcu;
107 108
	dev_t major;
	struct idr minor_idr;
W
WANG Cong 已提交
109
	spinlock_t minor_lock;
110
	const char *device_name;
111 112
	struct list_head next;
};
113

114
#define GOODCOPY_LEN 128
A
Arnd Bergmann 已提交
115

116
static const struct proto_ops tap_socket_ops;
A
Arnd Bergmann 已提交
117

118
#define RX_OFFLOADS (NETIF_F_GRO | NETIF_F_LRO)
119
#define TAP_FEATURES (NETIF_F_GSO | NETIF_F_SG | NETIF_F_FRAGLIST)
120

121
static struct tap_dev *tap_dev_get_rcu(const struct net_device *dev)
122 123 124 125
{
	return rcu_dereference(dev->rx_handler_data);
}

A
Arnd Bergmann 已提交
126 127
/*
 * RCU usage:
 * The tap_queue and the tap_dev are loosely coupled, the
 * pointers from one to the other can only be read while rcu_read_lock
 * or rtnl is held.
 *
 * Both the file and the tap_dev hold a reference on the tap_queue
 * through sock_hold(&q->sk). When the tap_dev goes away first,
 * q->tap becomes inaccessible. When the file gets closed,
 * tap_get_queue() fails.
 *
 * There may still be references to the struct sock inside of the
 * queue from outbound SKBs, but these never reference back to the
 * file or the dev. The data structure is freed through __sk_free
 * when both our references and any pending SKBs are gone.
 */

143
static int tap_enable_queue(struct tap_dev *tap, struct file *file,
144
			    struct tap_queue *q)
J
Jason Wang 已提交
145 146 147
{
	int err = -EINVAL;

148
	ASSERT_RTNL();
J
Jason Wang 已提交
149 150 151 152 153

	if (q->enabled)
		goto out;

	err = 0;
154 155
	rcu_assign_pointer(tap->taps[tap->numvtaps], q);
	q->queue_index = tap->numvtaps;
J
Jason Wang 已提交
156 157
	q->enabled = true;

158
	tap->numvtaps++;
J
Jason Wang 已提交
159 160 161 162
out:
	return err;
}

163
/* Requires RTNL */
164
static int tap_set_queue(struct tap_dev *tap, struct file *file,
165
			 struct tap_queue *q)
A
Arnd Bergmann 已提交
166
{
167
	if (tap->numqueues == MAX_TAP_QUEUES)
168
		return -EBUSY;
A
Arnd Bergmann 已提交
169

170 171
	rcu_assign_pointer(q->tap, tap);
	rcu_assign_pointer(tap->taps[tap->numvtaps], q);
172
	sock_hold(&q->sk);
A
Arnd Bergmann 已提交
173 174

	q->file = file;
175
	q->queue_index = tap->numvtaps;
J
Jason Wang 已提交
176
	q->enabled = true;
177
	file->private_data = q;
178
	list_add_tail(&q->next, &tap->queue_list);
A
Arnd Bergmann 已提交
179

180 181
	tap->numvtaps++;
	tap->numqueues++;
182

183
	return 0;
A
Arnd Bergmann 已提交
184 185
}

186
static int tap_disable_queue(struct tap_queue *q)
J
Jason Wang 已提交
187
{
188
	struct tap_dev *tap;
189
	struct tap_queue *nq;
J
Jason Wang 已提交
190

191
	ASSERT_RTNL();
J
Jason Wang 已提交
192 193 194
	if (!q->enabled)
		return -EINVAL;

195
	tap = rtnl_dereference(q->tap);
196

197
	if (tap) {
J
Jason Wang 已提交
198
		int index = q->queue_index;
199 200
		BUG_ON(index >= tap->numvtaps);
		nq = rtnl_dereference(tap->taps[tap->numvtaps - 1]);
J
Jason Wang 已提交
201 202
		nq->queue_index = index;

203 204
		rcu_assign_pointer(tap->taps[index], nq);
		RCU_INIT_POINTER(tap->taps[tap->numvtaps - 1], NULL);
J
Jason Wang 已提交
205 206
		q->enabled = false;

207
		tap->numvtaps--;
J
Jason Wang 已提交
208 209 210 211 212
	}

	return 0;
}

A
Arnd Bergmann 已提交
213
/*
214 215 216
 * The file owning the queue got closed, give up both
 * the reference that the files holds as well as the
 * one from the macvlan_dev if that still exists.
A
Arnd Bergmann 已提交
217 218 219 220
 *
 * Using the spinlock makes sure that we don't get
 * to the queue again after destroying it.
 */
221
static void tap_put_queue(struct tap_queue *q)
A
Arnd Bergmann 已提交
222
{
223
	struct tap_dev *tap;
A
Arnd Bergmann 已提交
224

225
	rtnl_lock();
226
	tap = rtnl_dereference(q->tap);
227

228
	if (tap) {
J
Jason Wang 已提交
229
		if (q->enabled)
230
			BUG_ON(tap_disable_queue(q));
J
Jason Wang 已提交
231

232 233
		tap->numqueues--;
		RCU_INIT_POINTER(q->tap, NULL);
234
		sock_put(&q->sk);
J
Jason Wang 已提交
235
		list_del_init(&q->next);
A
Arnd Bergmann 已提交
236 237
	}

238
	rtnl_unlock();
A
Arnd Bergmann 已提交
239 240 241 242 243 244

	synchronize_rcu();
	sock_put(&q->sk);
}

/*
245 246 247 248 249
 * Select a queue based on the rxq of the device on which this packet
 * arrived. If the incoming device is not mq, calculate a flow hash
 * to select a queue. If all fails, find the first available queue.
 * Cache vlan->numvtaps since it can become zero during the execution
 * of this function.
A
Arnd Bergmann 已提交
250
 */
251
static struct tap_queue *tap_get_queue(struct tap_dev *tap,
252
				       struct sk_buff *skb)
A
Arnd Bergmann 已提交
253
{
254
	struct tap_queue *queue = NULL;
J
Jason Wang 已提交
255 256 257 258 259
	/* Access to taps array is protected by rcu, but access to numvtaps
	 * isn't. Below we use it to lookup a queue, but treat it as a hint
	 * and validate that the result isn't NULL - in case we are
	 * racing against queue removal.
	 */
260
	int numvtaps = READ_ONCE(tap->numvtaps);
261 262 263 264 265
	__u32 rxq;

	if (!numvtaps)
		goto out;

266 267 268
	if (numvtaps == 1)
		goto single;

269
	/* Check if we can use flow to select a queue */
270
	rxq = skb_get_hash(skb);
271
	if (rxq) {
272
		queue = rcu_dereference(tap->taps[rxq % numvtaps]);
J
Jason Wang 已提交
273
		goto out;
274 275
	}

276 277
	if (likely(skb_rx_queue_recorded(skb))) {
		rxq = skb_get_rx_queue(skb);
A
Arnd Bergmann 已提交
278

279 280 281
		while (unlikely(rxq >= numvtaps))
			rxq -= numvtaps;

282
		queue = rcu_dereference(tap->taps[rxq]);
J
Jason Wang 已提交
283
		goto out;
284 285
	}

286
single:
287
	queue = rcu_dereference(tap->taps[0]);
288
out:
289
	return queue;
A
Arnd Bergmann 已提交
290 291
}

292 293
/*
 * The net_device is going away, give up the reference
294 295
 * that it holds on all queues and safely set the pointer
 * from the queues to NULL.
296
 */
297
void tap_del_queues(struct tap_dev *tap)
A
Arnd Bergmann 已提交
298
{
299
	struct tap_queue *q, *tmp;
300

301
	ASSERT_RTNL();
302
	list_for_each_entry_safe(q, tmp, &tap->queue_list, next) {
J
Jason Wang 已提交
303
		list_del_init(&q->next);
304
		RCU_INIT_POINTER(q->tap, NULL);
J
Jason Wang 已提交
305
		if (q->enabled)
306 307
			tap->numvtaps--;
		tap->numqueues--;
308
		sock_put(&q->sk);
309
	}
310 311
	BUG_ON(tap->numvtaps);
	BUG_ON(tap->numqueues);
312
	/* guarantee that any future tap_set_queue will fail */
313
	tap->numvtaps = MAX_TAP_QUEUES;
A
Arnd Bergmann 已提交
314
}
315
EXPORT_SYMBOL_GPL(tap_del_queues);
A
Arnd Bergmann 已提交
316

317
rx_handler_result_t tap_handle_frame(struct sk_buff **pskb)
A
Arnd Bergmann 已提交
318
{
319 320
	struct sk_buff *skb = *pskb;
	struct net_device *dev = skb->dev;
321
	struct tap_dev *tap;
322
	struct tap_queue *q;
323 324
	netdev_features_t features = TAP_FEATURES;

325 326
	tap = tap_dev_get_rcu(dev);
	if (!tap)
327 328
		return RX_HANDLER_PASS;

329
	q = tap_get_queue(tap, skb);
A
Arnd Bergmann 已提交
330
	if (!q)
331
		return RX_HANDLER_PASS;
H
Herbert Xu 已提交
332

333 334
	skb_push(skb, ETH_HLEN);

335
	/* Apply the forward feature mask so that we perform segmentation
336 337
	 * according to users wishes.  This only works if VNET_HDR is
	 * enabled.
338
	 */
339
	if (q->flags & IFF_VNET_HDR)
340
		features |= tap->tap_features;
341
	if (netif_needs_gso(skb, features)) {
342 343 344 345 346 347
		struct sk_buff *segs = __skb_gso_segment(skb, features, false);

		if (IS_ERR(segs))
			goto drop;

		if (!segs) {
348
			if (ptr_ring_produce(&q->ring, skb))
J
Jason Wang 已提交
349
				goto drop;
350 351 352
			goto wake_up;
		}

353
		consume_skb(skb);
354 355 356 357
		while (segs) {
			struct sk_buff *nskb = segs->next;

			segs->next = NULL;
358
			if (ptr_ring_produce(&q->ring, segs)) {
J
Jason Wang 已提交
359 360 361 362
				kfree_skb(segs);
				kfree_skb_list(nskb);
				break;
			}
363 364 365
			segs = nskb;
		}
	} else {
366 367 368
		/* If we receive a partial checksum and the tap side
		 * doesn't support checksum offload, compute the checksum.
		 * Note: it doesn't matter which checksum feature to
S
Sainath Grandhi 已提交
369
		 *	  check, we either support them all or none.
370 371
		 */
		if (skb->ip_summed == CHECKSUM_PARTIAL &&
372
		    !(features & NETIF_F_CSUM_MASK) &&
373 374
		    skb_checksum_help(skb))
			goto drop;
375
		if (ptr_ring_produce(&q->ring, skb))
J
Jason Wang 已提交
376
			goto drop;
377 378 379
	}

wake_up:
380
	wake_up_interruptible_poll(sk_sleep(&q->sk), EPOLLIN | EPOLLRDNORM | EPOLLRDBAND);
381
	return RX_HANDLER_CONSUMED;
H
Herbert Xu 已提交
382 383

drop:
384
	/* Count errors/drops only here, thus don't care about args. */
385 386
	if (tap->count_rx_dropped)
		tap->count_rx_dropped(tap);
H
Herbert Xu 已提交
387
	kfree_skb(skb);
388
	return RX_HANDLER_CONSUMED;
A
Arnd Bergmann 已提交
389
}
390
EXPORT_SYMBOL_GPL(tap_handle_frame);
A
Arnd Bergmann 已提交
391

392 393 394 395 396 397 398 399 400 401 402 403 404
/* Find the major_info whose ->major equals @major by walking major_list
 * with list_for_each_entry_rcu(). Returns NULL when there is no match.
 */
static struct major_info *tap_get_major(int major)
{
	struct major_info *entry;

	list_for_each_entry_rcu(entry, &major_list, next) {
		if (entry->major != major)
			continue;
		return entry;
	}

	return NULL;
}

/* Allocate a minor number (>= 1, < TAP_NUM_DEVS) for @tap under the major
 * encoded in @major and record it in tap->minor.
 *
 * Returns 0 on success, -EINVAL if the major is unknown or the minor
 * space is exhausted, or any other negative error from idr_alloc().
 */
int tap_get_minor(dev_t major, struct tap_dev *tap)
{
	struct major_info *tap_major;
	int retval;
	int id;

	rcu_read_lock();
	tap_major = tap_get_major(MAJOR(major));
	if (!tap_major) {
		rcu_read_unlock();
		return -EINVAL;
	}

	spin_lock(&tap_major->minor_lock);
	id = idr_alloc(&tap_major->minor_idr, tap, 1, TAP_NUM_DEVS, GFP_ATOMIC);
	if (id >= 0) {
		tap->minor = id;
		retval = 0;
	} else if (id == -ENOSPC) {
		netdev_err(tap->dev, "Too many tap devices\n");
		retval = -EINVAL;
	} else {
		retval = id;
	}
	spin_unlock(&tap_major->minor_lock);

	rcu_read_unlock();
	return retval;
}
EXPORT_SYMBOL_GPL(tap_get_minor);
431

432
/* Release the minor held by @tap (if tap->minor is non-zero) from the idr
 * of the major encoded in @major and reset tap->minor to 0. Does nothing
 * when the major is unknown.
 */
void tap_free_minor(dev_t major, struct tap_dev *tap)
{
	struct major_info *tap_major;

	rcu_read_lock();
	tap_major = tap_get_major(MAJOR(major));
	if (tap_major) {
		spin_lock(&tap_major->minor_lock);
		if (tap->minor) {
			idr_remove(&tap_major->minor_idr, tap->minor);
			tap->minor = 0;
		}
		spin_unlock(&tap_major->minor_lock);
	}
	rcu_read_unlock();
}
EXPORT_SYMBOL_GPL(tap_free_minor);
453

454
static struct tap_dev *dev_get_by_tap_file(int major, int minor)
455 456
{
	struct net_device *dev = NULL;
457
	struct tap_dev *tap;
458
	struct major_info *tap_major;
459

460 461 462 463 464 465 466
	rcu_read_lock();
	tap_major = tap_get_major(major);
	if (!tap_major) {
		tap = NULL;
		goto unlock;
	}

W
WANG Cong 已提交
467
	spin_lock(&tap_major->minor_lock);
468
	tap = idr_find(&tap_major->minor_idr, minor);
469 470
	if (tap) {
		dev = tap->dev;
471 472
		dev_hold(dev);
	}
W
WANG Cong 已提交
473
	spin_unlock(&tap_major->minor_lock);
474 475 476

unlock:
	rcu_read_unlock();
477
	return tap;
478 479
}

480
static void tap_sock_write_space(struct sock *sk)
A
Arnd Bergmann 已提交
481
{
482 483
	wait_queue_head_t *wqueue;

A
Arnd Bergmann 已提交
484
	if (!sock_writeable(sk) ||
485
	    !test_and_clear_bit(SOCKWQ_ASYNC_NOSPACE, &sk->sk_socket->flags))
A
Arnd Bergmann 已提交
486 487
		return;

488 489
	wqueue = sk_sleep(sk);
	if (wqueue && waitqueue_active(wqueue))
490
		wake_up_interruptible_poll(wqueue, EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND);
A
Arnd Bergmann 已提交
491 492
}

493
static void tap_sock_destruct(struct sock *sk)
494
{
495
	struct tap_queue *q = container_of(sk, struct tap_queue, sk);
J
Jason Wang 已提交
496

497
	ptr_ring_cleanup(&q->ring, __skb_array_destroy_skb);
498 499
}

500
static int tap_open(struct inode *inode, struct file *file)
A
Arnd Bergmann 已提交
501 502
{
	struct net *net = current->nsproxy->net_ns;
503
	struct tap_dev *tap;
504
	struct tap_queue *q;
505
	int err = -ENODEV;
A
Arnd Bergmann 已提交
506

507
	rtnl_lock();
508
	tap = dev_get_by_tap_file(imajor(inode), iminor(inode));
509
	if (!tap)
J
Jason Wang 已提交
510
		goto err;
A
Arnd Bergmann 已提交
511 512

	err = -ENOMEM;
513 514
	q = (struct tap_queue *)sk_alloc(net, AF_UNSPEC, GFP_KERNEL,
					     &tap_proto, 0);
A
Arnd Bergmann 已提交
515
	if (!q)
J
Jason Wang 已提交
516
		goto err;
517
	if (ptr_ring_init(&q->ring, tap->dev->tx_queue_len, GFP_KERNEL)) {
518 519 520
		sk_free(&q->sk);
		goto err;
	}
A
Arnd Bergmann 已提交
521

J
Jason Wang 已提交
522
	RCU_INIT_POINTER(q->sock.wq, &q->wq);
523
	init_waitqueue_head(&q->wq.wait);
A
Arnd Bergmann 已提交
524 525
	q->sock.type = SOCK_RAW;
	q->sock.state = SS_CONNECTED;
A
Arnd Bergmann 已提交
526
	q->sock.file = file;
527
	q->sock.ops = &tap_socket_ops;
A
Arnd Bergmann 已提交
528
	sock_init_data(&q->sock, &q->sk);
529 530
	q->sk.sk_write_space = tap_sock_write_space;
	q->sk.sk_destruct = tap_sock_destruct;
531
	q->flags = IFF_VNET_HDR | IFF_NO_PI | IFF_TAP;
532
	q->vnet_hdr_sz = sizeof(struct virtio_net_hdr);
A
Arnd Bergmann 已提交
533

534
	/*
535
	 * so far only KVM virtio_net uses tap, enable zero copy between
536
	 * guest kernel and host kernel when lower device supports zerocopy
537 538 539
	 *
	 * The macvlan supports zerocopy iff the lower device supports zero
	 * copy so we don't have to look at the lower device directly.
540
	 */
541
	if ((tap->dev->features & NETIF_F_HIGHDMA) && (tap->dev->features & NETIF_F_SG))
542
		sock_set_flag(&q->sk, SOCK_ZEROCOPY);
543

544
	err = tap_set_queue(tap, file, q);
545
	if (err) {
546
		/* tap_sock_destruct() will take care of freeing ptr_ring */
547 548
		goto err_put;
	}
A
Arnd Bergmann 已提交
549

550
	dev_put(tap->dev);
J
Jason Wang 已提交
551 552 553 554

	rtnl_unlock();
	return err;

555
err_put:
J
Jason Wang 已提交
556 557
	sock_put(&q->sk);
err:
558 559
	if (tap)
		dev_put(tap->dev);
A
Arnd Bergmann 已提交
560

561
	rtnl_unlock();
A
Arnd Bergmann 已提交
562 563 564
	return err;
}

565
static int tap_release(struct inode *inode, struct file *file)
A
Arnd Bergmann 已提交
566
{
567 568
	struct tap_queue *q = file->private_data;
	tap_put_queue(q);
A
Arnd Bergmann 已提交
569 570 571
	return 0;
}

572
/* poll() handler.
 *
 * Returns EPOLLERR when the file has no queue. Otherwise reports
 * EPOLLIN | EPOLLRDNORM when the ring is non-empty and
 * EPOLLOUT | EPOLLWRNORM when the socket is writeable.
 */
static __poll_t tap_poll(struct file *file, poll_table *wait)
{
	struct tap_queue *q = file->private_data;
	__poll_t mask = 0;

	if (!q)
		return EPOLLERR;

	poll_wait(file, &q->wq.wait, wait);

	if (!ptr_ring_empty(&q->ring))
		mask |= EPOLLIN | EPOLLRDNORM;

	/* If not writeable, set SOCKWQ_ASYNC_NOSPACE and test writeability
	 * once more after the bit has been set.
	 */
	if (sock_writeable(&q->sk) ||
	    (!test_and_set_bit(SOCKWQ_ASYNC_NOSPACE, &q->sock.flags) &&
	     sock_writeable(&q->sk)))
		mask |= EPOLLOUT | EPOLLWRNORM;

	return mask;
}

595 596
/* Allocate an skb for @len bytes of payload with @prepad bytes of headroom.
 *
 * @linear bytes go in the linear area and the rest is accounted as paged
 * data; when prepad + len is under a page, or @linear is 0, everything is
 * linear. Returns NULL on failure, in which case *@err has been handed to
 * sock_alloc_send_pskb() to fill in.
 */
static inline struct sk_buff *tap_alloc_skb(struct sock *sk, size_t prepad,
					    size_t len, size_t linear,
						int noblock, int *err)
{
	struct sk_buff *skb;
	size_t paged;

	/* Under a page?  Don't bother with paged skb. */
	if (prepad + len < PAGE_SIZE || !linear)
		linear = len;
	paged = len - linear;

	skb = sock_alloc_send_pskb(sk, prepad + linear, paged, noblock,
				   err, 0);
	if (!skb)
		return NULL;

	skb_reserve(skb, prepad);
	skb_put(skb, linear);
	skb->data_len = paged;
	skb->len += paged;

	return skb;
}

618
/* Headroom reserved in front of each skb built by tap_get_user().
 * Neighbour code has some assumptions on HH_DATA_MOD alignment.
 */
#define TAP_RESERVE HH_DATA_OFF(ETH_HLEN)
620

A
Arnd Bergmann 已提交
621
/* Get packet from user space buffer */
622
static ssize_t tap_get_user(struct tap_queue *q, void *msg_control,
623
			    struct iov_iter *from, int noblock)
A
Arnd Bergmann 已提交
624
{
625
	int good_linear = SKB_MAX_HEAD(TAP_RESERVE);
A
Arnd Bergmann 已提交
626
	struct sk_buff *skb;
627
	struct tap_dev *tap;
628
	unsigned long total_len = iov_iter_count(from);
629
	unsigned long len = total_len;
A
Arnd Bergmann 已提交
630
	int err;
631 632
	struct virtio_net_hdr vnet_hdr = { 0 };
	int vnet_hdr_len = 0;
633
	int copylen = 0;
634
	int depth;
635
	bool zerocopy = false;
636
	size_t linear;
637 638

	if (q->flags & IFF_VNET_HDR) {
639
		vnet_hdr_len = READ_ONCE(q->vnet_hdr_sz);
640 641

		err = -EINVAL;
642
		if (len < vnet_hdr_len)
643
			goto err;
644
		len -= vnet_hdr_len;
645

646
		err = -EFAULT;
647
		if (!copy_from_iter_full(&vnet_hdr, sizeof(vnet_hdr), from))
648
			goto err;
649
		iov_iter_advance(from, vnet_hdr_len - sizeof(vnet_hdr));
650
		if ((vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
651 652 653 654 655 656
		     tap16_to_cpu(q, vnet_hdr.csum_start) +
		     tap16_to_cpu(q, vnet_hdr.csum_offset) + 2 >
			     tap16_to_cpu(q, vnet_hdr.hdr_len))
			vnet_hdr.hdr_len = cpu_to_tap16(q,
				 tap16_to_cpu(q, vnet_hdr.csum_start) +
				 tap16_to_cpu(q, vnet_hdr.csum_offset) + 2);
657
		err = -EINVAL;
658
		if (tap16_to_cpu(q, vnet_hdr.hdr_len) > len)
659 660
			goto err;
	}
A
Arnd Bergmann 已提交
661

662
	err = -EINVAL;
A
Arnd Bergmann 已提交
663
	if (unlikely(len < ETH_HLEN))
664
		goto err;
A
Arnd Bergmann 已提交
665

666
	if (msg_control && sock_flag(&q->sk, SOCK_ZEROCOPY)) {
667 668
		struct iov_iter i;

M
Michael S. Tsirkin 已提交
669
		copylen = vnet_hdr.hdr_len ?
670
			tap16_to_cpu(q, vnet_hdr.hdr_len) : GOODCOPY_LEN;
671 672
		if (copylen > good_linear)
			copylen = good_linear;
673 674
		else if (copylen < ETH_HLEN)
			copylen = ETH_HLEN;
675
		linear = copylen;
676 677 678
		i = *from;
		iov_iter_advance(&i, copylen);
		if (iov_iter_npages(&i, INT_MAX) <= MAX_SKB_FRAGS)
679 680 681 682
			zerocopy = true;
	}

	if (!zerocopy) {
683
		copylen = len;
684
		linear = tap16_to_cpu(q, vnet_hdr.hdr_len);
685
		if (linear > good_linear)
686
			linear = good_linear;
687 688
		else if (linear < ETH_HLEN)
			linear = ETH_HLEN;
689
	}
690

691 692
	skb = tap_alloc_skb(&q->sk, TAP_RESERVE, copylen,
			    linear, noblock, &err);
693 694
	if (!skb)
		goto err;
A
Arnd Bergmann 已提交
695

696
	if (zerocopy)
697
		err = zerocopy_sg_from_iter(skb, from);
698
	else
699
		err = skb_copy_datagram_from_iter(skb, 0, from, len);
700

701
	if (err)
702
		goto err_kfree;
A
Arnd Bergmann 已提交
703 704

	skb_set_network_header(skb, ETH_HLEN);
705 706 707 708
	skb_reset_mac_header(skb);
	skb->protocol = eth_hdr(skb)->h_proto;

	if (vnet_hdr_len) {
709
		err = virtio_net_hdr_to_skb(skb, &vnet_hdr,
710
					    tap_is_little_endian(q));
711 712 713 714
		if (err)
			goto err_kfree;
	}

715
	skb_probe_transport_header(skb, ETH_HLEN);
716

717 718 719 720 721 722
	/* Move network header to the right position for VLAN tagged packets */
	if ((skb->protocol == htons(ETH_P_8021Q) ||
	     skb->protocol == htons(ETH_P_8021AD)) &&
	    __vlan_get_protocol(skb, skb->protocol, &depth) != 0)
		skb_set_network_header(skb, depth);

723
	rcu_read_lock();
724
	tap = rcu_dereference(q->tap);
725
	/* copy skb_ubuf_info for callback when skb has no error */
726
	if (zerocopy) {
727
		skb_shinfo(skb)->destructor_arg = msg_control;
728
		skb_shinfo(skb)->tx_flags |= SKBTX_DEV_ZEROCOPY;
729
		skb_shinfo(skb)->tx_flags |= SKBTX_SHARED_FRAG;
730 731
	} else if (msg_control) {
		struct ubuf_info *uarg = msg_control;
732
		uarg->callback(uarg, false);
733
	}
734

735 736
	if (tap) {
		skb->dev = tap->dev;
737
		dev_queue_xmit(skb);
E
Eric Dumazet 已提交
738
	} else {
739
		kfree_skb(skb);
E
Eric Dumazet 已提交
740
	}
741
	rcu_read_unlock();
A
Arnd Bergmann 已提交
742

743
	return total_len;
744

745 746 747
err_kfree:
	kfree_skb(skb);

748
err:
749
	rcu_read_lock();
750 751 752
	tap = rcu_dereference(q->tap);
	if (tap && tap->count_tx_dropped)
		tap->count_tx_dropped(tap);
753
	rcu_read_unlock();
754 755

	return err;
A
Arnd Bergmann 已提交
756 757
}

758
static ssize_t tap_write_iter(struct kiocb *iocb, struct iov_iter *from)
A
Arnd Bergmann 已提交
759 760
{
	struct file *file = iocb->ki_filp;
761
	struct tap_queue *q = file->private_data;
A
Arnd Bergmann 已提交
762

763
	return tap_get_user(q, NULL, from, file->f_flags & O_NONBLOCK);
A
Arnd Bergmann 已提交
764 765 766
}

/* Put packet to the user space buffer */
767 768 769
static ssize_t tap_put_user(struct tap_queue *q,
			    const struct sk_buff *skb,
			    struct iov_iter *iter)
A
Arnd Bergmann 已提交
770 771
{
	int ret;
772
	int vnet_hdr_len = 0;
773
	int vlan_offset = 0;
H
Herbert Xu 已提交
774
	int total;
775 776

	if (q->flags & IFF_VNET_HDR) {
777
		int vlan_hlen = skb_vlan_tag_present(skb) ? VLAN_HLEN : 0;
778
		struct virtio_net_hdr vnet_hdr;
779

780
		vnet_hdr_len = READ_ONCE(q->vnet_hdr_sz);
H
Herbert Xu 已提交
781
		if (iov_iter_count(iter) < vnet_hdr_len)
782 783
			return -EINVAL;

784
		if (virtio_net_hdr_from_skb(skb, &vnet_hdr,
785 786
					    tap_is_little_endian(q), true,
					    vlan_hlen))
787
			BUG();
788

H
Herbert Xu 已提交
789 790
		if (copy_to_iter(&vnet_hdr, sizeof(vnet_hdr), iter) !=
		    sizeof(vnet_hdr))
791
			return -EFAULT;
792 793

		iov_iter_advance(iter, vnet_hdr_len - sizeof(vnet_hdr));
794
	}
H
Herbert Xu 已提交
795
	total = vnet_hdr_len;
J
Jason Wang 已提交
796
	total += skb->len;
797

798
	if (skb_vlan_tag_present(skb)) {
799 800 801 802
		struct {
			__be16 h_vlan_proto;
			__be16 h_vlan_TCI;
		} veth;
803
		veth.h_vlan_proto = skb->vlan_proto;
804
		veth.h_vlan_TCI = htons(skb_vlan_tag_get(skb));
805 806

		vlan_offset = offsetof(struct vlan_ethhdr, h_vlan_proto);
J
Jason Wang 已提交
807
		total += VLAN_HLEN;
808

H
Herbert Xu 已提交
809 810
		ret = skb_copy_datagram_iter(skb, 0, iter, vlan_offset);
		if (ret || !iov_iter_count(iter))
811 812
			goto done;

H
Herbert Xu 已提交
813 814
		ret = copy_to_iter(&veth, sizeof(veth), iter);
		if (ret != sizeof(veth) || !iov_iter_count(iter))
815 816
			goto done;
	}
A
Arnd Bergmann 已提交
817

H
Herbert Xu 已提交
818 819
	ret = skb_copy_datagram_iter(skb, vlan_offset, iter,
				     skb->len - vlan_offset);
A
Arnd Bergmann 已提交
820

821
done:
J
Jason Wang 已提交
822
	return ret ? ret : total;
A
Arnd Bergmann 已提交
823 824
}

825 826
static ssize_t tap_do_read(struct tap_queue *q,
			   struct iov_iter *to,
827
			   int noblock, struct sk_buff *skb)
A
Arnd Bergmann 已提交
828
{
829
	DEFINE_WAIT(wait);
A
Arnd Bergmann 已提交
830
	ssize_t ret = 0;
A
Arnd Bergmann 已提交
831

W
Wei Xu 已提交
832
	if (!iov_iter_count(to)) {
833
		kfree_skb(skb);
A
Al Viro 已提交
834
		return 0;
W
Wei Xu 已提交
835
	}
A
Al Viro 已提交
836

837 838 839
	if (skb)
		goto put;

A
Al Viro 已提交
840
	while (1) {
841 842 843
		if (!noblock)
			prepare_to_wait(sk_sleep(&q->sk), &wait,
					TASK_INTERRUPTIBLE);
A
Arnd Bergmann 已提交
844 845

		/* Read frames from the queue */
846
		skb = ptr_ring_consume(&q->ring);
A
Al Viro 已提交
847 848 849 850 851
		if (skb)
			break;
		if (noblock) {
			ret = -EAGAIN;
			break;
A
Arnd Bergmann 已提交
852
		}
A
Al Viro 已提交
853 854 855 856 857 858 859
		if (signal_pending(current)) {
			ret = -ERESTARTSYS;
			break;
		}
		/* Nothing to read, let's sleep */
		schedule();
	}
860 861 862
	if (!noblock)
		finish_wait(sk_sleep(&q->sk), &wait);

863
put:
A
Al Viro 已提交
864
	if (skb) {
865
		ret = tap_put_user(q, skb, to);
866 867 868 869
		if (unlikely(ret < 0))
			kfree_skb(skb);
		else
			consume_skb(skb);
A
Arnd Bergmann 已提交
870
	}
A
Arnd Bergmann 已提交
871 872 873
	return ret;
}

874
static ssize_t tap_read_iter(struct kiocb *iocb, struct iov_iter *to)
A
Arnd Bergmann 已提交
875 876
{
	struct file *file = iocb->ki_filp;
877
	struct tap_queue *q = file->private_data;
A
Al Viro 已提交
878
	ssize_t len = iov_iter_count(to), ret;
A
Arnd Bergmann 已提交
879

880
	ret = tap_do_read(q, to, file->f_flags & O_NONBLOCK, NULL);
J
Jason Wang 已提交
881
	ret = min_t(ssize_t, ret, len);
882 883
	if (ret > 0)
		iocb->ki_pos = ret;
A
Arnd Bergmann 已提交
884 885 886
	return ret;
}

887
static struct tap_dev *tap_get_tap_dev(struct tap_queue *q)
888
{
889
	struct tap_dev *tap;
890

891
	ASSERT_RTNL();
892 893 894
	tap = rtnl_dereference(q->tap);
	if (tap)
		dev_hold(tap->dev);
895

896
	return tap;
897 898
}

899
static void tap_put_tap_dev(struct tap_dev *tap)
900
{
901
	dev_put(tap->dev);
902 903
}

904
static int tap_ioctl_set_queue(struct file *file, unsigned int flags)
J
Jason Wang 已提交
905
{
906
	struct tap_queue *q = file->private_data;
907
	struct tap_dev *tap;
J
Jason Wang 已提交
908 909
	int ret;

910 911
	tap = tap_get_tap_dev(q);
	if (!tap)
J
Jason Wang 已提交
912 913 914
		return -EINVAL;

	if (flags & IFF_ATTACH_QUEUE)
915
		ret = tap_enable_queue(tap, file, q);
J
Jason Wang 已提交
916
	else if (flags & IFF_DETACH_QUEUE)
917
		ret = tap_disable_queue(q);
918 919
	else
		ret = -EINVAL;
J
Jason Wang 已提交
920

921
	tap_put_tap_dev(tap);
J
Jason Wang 已提交
922 923 924
	return ret;
}

925
static int set_offload(struct tap_queue *q, unsigned long arg)
926
{
927
	struct tap_dev *tap;
928 929 930
	netdev_features_t features;
	netdev_features_t feature_mask = 0;

931 932
	tap = rtnl_dereference(q->tap);
	if (!tap)
933 934
		return -ENOLINK;

935
	features = tap->dev->features;
936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953

	if (arg & TUN_F_CSUM) {
		feature_mask = NETIF_F_HW_CSUM;

		if (arg & (TUN_F_TSO4 | TUN_F_TSO6)) {
			if (arg & TUN_F_TSO_ECN)
				feature_mask |= NETIF_F_TSO_ECN;
			if (arg & TUN_F_TSO4)
				feature_mask |= NETIF_F_TSO;
			if (arg & TUN_F_TSO6)
				feature_mask |= NETIF_F_TSO6;
		}
	}

	/* tun/tap driver inverts the usage for TSO offloads, where
	 * setting the TSO bit means that the userspace wants to
	 * accept TSO frames and turning it off means that user space
	 * does not support TSO.
954
	 * For tap, we have to invert it to mean the same thing.
955 956 957
	 * When user space turns off TSO, we turn off GSO/LRO so that
	 * user-space will not receive TSO frames.
	 */
958
	if (feature_mask & (NETIF_F_TSO | NETIF_F_TSO6))
959 960 961 962 963 964 965
		features |= RX_OFFLOADS;
	else
		features &= ~RX_OFFLOADS;

	/* tap_features are the same as features on tun/tap and
	 * reflect user expectations.
	 */
966 967 968
	tap->tap_features = feature_mask;
	if (tap->update_features)
		tap->update_features(tap, features);
969 970 971 972

	return 0;
}

A
Arnd Bergmann 已提交
973 974 975
/*
 * provide compatibility with generic tun/tap interface
 */
976 977
static long tap_ioctl(struct file *file, unsigned int cmd,
		      unsigned long arg)
A
Arnd Bergmann 已提交
978
{
979
	struct tap_queue *q = file->private_data;
980
	struct tap_dev *tap;
A
Arnd Bergmann 已提交
981 982 983
	void __user *argp = (void __user *)arg;
	struct ifreq __user *ifr = argp;
	unsigned int __user *up = argp;
984
	unsigned short u;
985
	int __user *sp = argp;
986
	struct sockaddr sa;
987
	int s;
988
	int ret;
A
Arnd Bergmann 已提交
989 990 991 992 993 994

	switch (cmd) {
	case TUNSETIFF:
		/* ignore the name, just look at flags */
		if (get_user(u, &ifr->ifr_flags))
			return -EFAULT;
995 996

		ret = 0;
997
		if ((u & ~TAP_IFFEATURES) != (IFF_NO_PI | IFF_TAP))
998 999
			ret = -EINVAL;
		else
1000
			q->flags = (q->flags & ~TAP_IFFEATURES) | u;
1001 1002

		return ret;
A
Arnd Bergmann 已提交
1003 1004

	case TUNGETIFF:
1005
		rtnl_lock();
1006 1007
		tap = tap_get_tap_dev(q);
		if (!tap) {
1008
			rtnl_unlock();
A
Arnd Bergmann 已提交
1009
			return -ENOLINK;
1010
		}
A
Arnd Bergmann 已提交
1011

1012
		ret = 0;
1013
		u = q->flags;
1014
		if (copy_to_user(&ifr->ifr_name, tap->dev->name, IFNAMSIZ) ||
1015
		    put_user(u, &ifr->ifr_flags))
1016
			ret = -EFAULT;
1017
		tap_put_tap_dev(tap);
1018
		rtnl_unlock();
1019
		return ret;
A
Arnd Bergmann 已提交
1020

J
Jason Wang 已提交
1021 1022 1023
	case TUNSETQUEUE:
		if (get_user(u, &ifr->ifr_flags))
			return -EFAULT;
1024
		rtnl_lock();
1025
		ret = tap_ioctl_set_queue(file, u);
1026
		rtnl_unlock();
1027
		return ret;
J
Jason Wang 已提交
1028

A
Arnd Bergmann 已提交
1029
	case TUNGETFEATURES:
1030
		if (put_user(IFF_TAP | IFF_NO_PI | TAP_IFFEATURES, up))
A
Arnd Bergmann 已提交
1031 1032 1033 1034
			return -EFAULT;
		return 0;

	case TUNSETSNDBUF:
1035
		if (get_user(s, sp))
A
Arnd Bergmann 已提交
1036
			return -EFAULT;
1037 1038
		if (s <= 0)
			return -EINVAL;
A
Arnd Bergmann 已提交
1039

1040
		q->sk.sk_sndbuf = s;
A
Arnd Bergmann 已提交
1041 1042
		return 0;

1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057
	case TUNGETVNETHDRSZ:
		s = q->vnet_hdr_sz;
		if (put_user(s, sp))
			return -EFAULT;
		return 0;

	case TUNSETVNETHDRSZ:
		if (get_user(s, sp))
			return -EFAULT;
		if (s < (int)sizeof(struct virtio_net_hdr))
			return -EINVAL;

		q->vnet_hdr_sz = s;
		return 0;

1058
	case TUNGETVNETLE:
1059
		s = !!(q->flags & TAP_VNET_LE);
1060 1061 1062 1063 1064 1065 1066 1067
		if (put_user(s, sp))
			return -EFAULT;
		return 0;

	case TUNSETVNETLE:
		if (get_user(s, sp))
			return -EFAULT;
		if (s)
1068
			q->flags |= TAP_VNET_LE;
1069
		else
1070
			q->flags &= ~TAP_VNET_LE;
1071 1072
		return 0;

1073
	case TUNGETVNETBE:
1074
		return tap_get_vnet_be(q, sp);
1075 1076

	case TUNSETVNETBE:
1077
		return tap_set_vnet_be(q, sp);
1078

A
Arnd Bergmann 已提交
1079 1080 1081
	case TUNSETOFFLOAD:
		/* let the user check for future flags */
		if (arg & ~(TUN_F_CSUM | TUN_F_TSO4 | TUN_F_TSO6 |
1082
			    TUN_F_TSO_ECN | TUN_F_UFO))
A
Arnd Bergmann 已提交
1083 1084
			return -EINVAL;

1085 1086 1087 1088
		rtnl_lock();
		ret = set_offload(q, arg);
		rtnl_unlock();
		return ret;
A
Arnd Bergmann 已提交
1089

1090 1091
	case SIOCGIFHWADDR:
		rtnl_lock();
1092 1093
		tap = tap_get_tap_dev(q);
		if (!tap) {
1094 1095 1096 1097
			rtnl_unlock();
			return -ENOLINK;
		}
		ret = 0;
1098 1099 1100
		u = tap->dev->type;
		if (copy_to_user(&ifr->ifr_name, tap->dev->name, IFNAMSIZ) ||
		    copy_to_user(&ifr->ifr_hwaddr.sa_data, tap->dev->dev_addr, ETH_ALEN) ||
1101 1102
		    put_user(u, &ifr->ifr_hwaddr.sa_family))
			ret = -EFAULT;
1103
		tap_put_tap_dev(tap);
1104 1105 1106 1107
		rtnl_unlock();
		return ret;

	case SIOCSIFHWADDR:
1108 1109
		if (copy_from_user(&sa, &ifr->ifr_hwaddr, sizeof(sa)))
			return -EFAULT;
1110
		rtnl_lock();
1111 1112
		tap = tap_get_tap_dev(q);
		if (!tap) {
1113 1114 1115
			rtnl_unlock();
			return -ENOLINK;
		}
1116 1117
		ret = dev_set_mac_address(tap->dev, &sa);
		tap_put_tap_dev(tap);
1118 1119 1120
		rtnl_unlock();
		return ret;

A
Arnd Bergmann 已提交
1121 1122 1123 1124 1125 1126
	default:
		return -EINVAL;
	}
}

#ifdef CONFIG_COMPAT
1127 1128
/*
 * compat_ioctl handler: convert the compat user pointer carried in @arg with
 * compat_ptr() and forward the request unchanged to tap_ioctl().
 */
static long tap_compat_ioctl(struct file *file, unsigned int cmd,
			     unsigned long arg)
{
	unsigned long native_arg = (unsigned long)compat_ptr(arg);

	return tap_ioctl(file, cmd, native_arg);
}
#endif

1134
static const struct file_operations tap_fops = {
A
Arnd Bergmann 已提交
1135
	.owner		= THIS_MODULE,
1136 1137 1138 1139 1140
	.open		= tap_open,
	.release	= tap_release,
	.read_iter	= tap_read_iter,
	.write_iter	= tap_write_iter,
	.poll		= tap_poll,
A
Arnd Bergmann 已提交
1141
	.llseek		= no_llseek,
1142
	.unlocked_ioctl	= tap_ioctl,
A
Arnd Bergmann 已提交
1143
#ifdef CONFIG_COMPAT
1144
	.compat_ioctl	= tap_compat_ioctl,
A
Arnd Bergmann 已提交
1145 1146 1147
#endif
};

1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210
static int tap_get_user_xdp(struct tap_queue *q, struct xdp_buff *xdp)
{
	struct tun_xdp_hdr *hdr = xdp->data_hard_start;
	struct virtio_net_hdr *gso = &hdr->gso;
	int buflen = hdr->buflen;
	int vnet_hdr_len = 0;
	struct tap_dev *tap;
	struct sk_buff *skb;
	int err, depth;

	if (q->flags & IFF_VNET_HDR)
		vnet_hdr_len = READ_ONCE(q->vnet_hdr_sz);

	skb = build_skb(xdp->data_hard_start, buflen);
	if (!skb) {
		err = -ENOMEM;
		goto err;
	}

	skb_reserve(skb, xdp->data - xdp->data_hard_start);
	skb_put(skb, xdp->data_end - xdp->data);

	skb_set_network_header(skb, ETH_HLEN);
	skb_reset_mac_header(skb);
	skb->protocol = eth_hdr(skb)->h_proto;

	if (vnet_hdr_len) {
		err = virtio_net_hdr_to_skb(skb, gso, tap_is_little_endian(q));
		if (err)
			goto err_kfree;
	}

	skb_probe_transport_header(skb, ETH_HLEN);

	/* Move network header to the right position for VLAN tagged packets */
	if ((skb->protocol == htons(ETH_P_8021Q) ||
	     skb->protocol == htons(ETH_P_8021AD)) &&
	    __vlan_get_protocol(skb, skb->protocol, &depth) != 0)
		skb_set_network_header(skb, depth);

	rcu_read_lock();
	tap = rcu_dereference(q->tap);
	if (tap) {
		skb->dev = tap->dev;
		dev_queue_xmit(skb);
	} else {
		kfree_skb(skb);
	}
	rcu_read_unlock();

	return 0;

err_kfree:
	kfree_skb(skb);
err:
	rcu_read_lock();
		tap = rcu_dereference(q->tap);
	if (tap && tap->count_tx_dropped)
		tap->count_tx_dropped(tap);
	rcu_read_unlock();
	return err;
}

1211 1212
/*
 * proto_ops sendmsg hook for the tap queue socket.
 *
 * When m->msg_control is a tun_msg_ctl of type TUN_MSG_PTR, ctl->ptr is
 * treated as an array of ctl->num xdp_buff entries and each one is passed to
 * tap_get_user_xdp(); per-buffer results are not propagated and 0 is
 * returned.  Otherwise the message is passed to tap_get_user() together with
 * ctl->ptr (or NULL when no control block was supplied).
 */
static int tap_sendmsg(struct socket *sock, struct msghdr *m,
		       size_t total_len)
{
	struct tap_queue *q = container_of(sock, struct tap_queue, sock);
	struct tun_msg_ctl *ctl = m->msg_control;
	void *msg_control = NULL;
	int idx;

	if (ctl) {
		if (ctl->type == TUN_MSG_PTR) {
			struct xdp_buff *bufs = ctl->ptr;

			for (idx = 0; idx < ctl->num; idx++)
				tap_get_user_xdp(q, &bufs[idx]);
			return 0;
		}
		msg_control = ctl->ptr;
	}

	return tap_get_user(q, msg_control, &m->msg_iter,
			    m->msg_flags & MSG_DONTWAIT);
}

1231 1232
/*
 * proto_ops recvmsg hook for the tap queue socket.
 *
 * m->msg_control is interpreted as an skb pointer and forwarded to
 * tap_do_read(); if @flags contains anything other than MSG_DONTWAIT or
 * MSG_TRUNC that skb is freed here and -EINVAL is returned.
 *
 * Returns the value of tap_do_read().  When that value is a length larger
 * than @total_len, MSG_TRUNC is set in m->msg_flags and the return value is
 * clamped to @total_len unless the caller passed MSG_TRUNC.
 */
static int tap_recvmsg(struct socket *sock, struct msghdr *m,
		       size_t total_len, int flags)
{
	struct tap_queue *q = container_of(sock, struct tap_queue, sock);
	struct sk_buff *skb = m->msg_control;
	int ret;

	if (flags & ~(MSG_DONTWAIT|MSG_TRUNC)) {
		kfree_skb(skb);
		return -EINVAL;
	}

	ret = tap_do_read(q, &m->msg_iter, flags & MSG_DONTWAIT, skb);

	/*
	 * Only a positive length can mean truncation.  Comparing a negative
	 * error code directly against the unsigned total_len would convert it
	 * to a huge value and turn the error into a bogus "truncated" read.
	 */
	if (ret > 0 && (size_t)ret > total_len) {
		m->msg_flags |= MSG_TRUNC;
		ret = flags & MSG_TRUNC ? ret : total_len;
	}
	return ret;
}

1249
static int tap_peek_len(struct socket *sock)
J
Jason Wang 已提交
1250
{
1251
	struct tap_queue *q = container_of(sock, struct tap_queue,
J
Jason Wang 已提交
1252
					       sock);
1253
	return PTR_RING_PEEK_CALL(&q->ring, __skb_array_len_with_tag);
J
Jason Wang 已提交
1254 1255
}

A
Arnd Bergmann 已提交
1256
/* Ops structure to mimic raw sockets with tun */
1257 1258 1259 1260
static const struct proto_ops tap_socket_ops = {
	.sendmsg = tap_sendmsg,
	.recvmsg = tap_recvmsg,
	.peek_len = tap_peek_len,
A
Arnd Bergmann 已提交
1261 1262 1263 1264 1265 1266
};

/* Get an underlying socket object from tun file.  Returns error unless file is
 * attached to a device.  The returned object works like a packet socket, it
 * can be used for sock_sendmsg/sock_recvmsg.  The caller is responsible for
 * holding a reference to the file for as long as the socket is in use. */
1267
struct socket *tap_get_socket(struct file *file)
A
Arnd Bergmann 已提交
1268
{
1269 1270
	struct tap_queue *q;
	if (file->f_op != &tap_fops)
A
Arnd Bergmann 已提交
1271 1272 1273 1274 1275 1276
		return ERR_PTR(-EINVAL);
	q = file->private_data;
	if (!q)
		return ERR_PTR(-EBADFD);
	return &q->sock;
}
1277
EXPORT_SYMBOL_GPL(tap_get_socket);
A
Arnd Bergmann 已提交
1278

1279
struct ptr_ring *tap_get_ptr_ring(struct file *file)
J
Jason Wang 已提交
1280 1281 1282 1283 1284 1285 1286 1287
{
	struct tap_queue *q;

	if (file->f_op != &tap_fops)
		return ERR_PTR(-EINVAL);
	q = file->private_data;
	if (!q)
		return ERR_PTR(-EBADFD);
1288
	return &q->ring;
J
Jason Wang 已提交
1289
}
1290
EXPORT_SYMBOL_GPL(tap_get_ptr_ring);
J
Jason Wang 已提交
1291

1292
int tap_queue_resize(struct tap_dev *tap)
J
Jason Wang 已提交
1293
{
1294
	struct net_device *dev = tap->dev;
1295
	struct tap_queue *q;
1296
	struct ptr_ring **rings;
1297
	int n = tap->numqueues;
J
Jason Wang 已提交
1298 1299
	int ret, i = 0;

1300 1301
	rings = kmalloc_array(n, sizeof(*rings), GFP_KERNEL);
	if (!rings)
J
Jason Wang 已提交
1302 1303
		return -ENOMEM;

1304
	list_for_each_entry(q, &tap->queue_list, next)
1305
		rings[i++] = &q->ring;
J
Jason Wang 已提交
1306

1307 1308 1309
	ret = ptr_ring_resize_multiple(rings, n,
				       dev->tx_queue_len, GFP_KERNEL,
				       __skb_array_destroy_skb);
J
Jason Wang 已提交
1310

1311
	kfree(rings);
J
Jason Wang 已提交
1312 1313
	return ret;
}
1314
EXPORT_SYMBOL_GPL(tap_queue_resize);
1315

1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326
/*
 * Allocate a major_info record for @major, initialise its minor IDR and
 * lock, and append it to major_list.
 *
 * @device_name is stored by pointer, not copied.
 *
 * Returns 0 on success or -ENOMEM if the record cannot be allocated.
 */
static int tap_list_add(dev_t major, const char *device_name)
{
	struct major_info *info = kzalloc(sizeof(*info), GFP_ATOMIC);

	if (!info)
		return -ENOMEM;

	info->major = MAJOR(major);
	info->device_name = device_name;
	idr_init(&info->minor_idr);
	spin_lock_init(&info->minor_lock);

	/* Add to the list only after the record is fully initialised. */
	list_add_tail_rcu(&info->next, &major_list);
	return 0;
}

1335 1336
/*
 * Register a tap character device.
 *
 * Allocates a region of TAP_NUM_DEVS device numbers named @device_name
 * (stored in *@tap_major), initialises @tap_cdev with tap_fops and @module as
 * owner, adds the cdev, and records the major via tap_list_add().
 *
 * Returns 0 on success.  On failure the steps already completed are undone
 * and the error of the failing step is returned.
 */
int tap_create_cdev(struct cdev *tap_cdev, dev_t *tap_major,
		    const char *device_name, struct module *module)
{
	int err;

	err = alloc_chrdev_region(tap_major, 0, TAP_NUM_DEVS, device_name);
	if (err)
		return err;

	cdev_init(tap_cdev, &tap_fops);
	tap_cdev->owner = module;

	err = cdev_add(tap_cdev, *tap_major, TAP_NUM_DEVS);
	if (err)
		goto unregister_region;

	err = tap_list_add(*tap_major, device_name);
	if (err)
		goto delete_cdev;

	return 0;

delete_cdev:
	cdev_del(tap_cdev);
unregister_region:
	unregister_chrdev_region(*tap_major, TAP_NUM_DEVS);
	return err;
}
1363
EXPORT_SYMBOL_GPL(tap_create_cdev);
1364 1365 1366

/*
 * Undo tap_create_cdev(): delete @tap_cdev, release the TAP_NUM_DEVS device
 * numbers starting at @major, and remove every major_list entry whose major
 * matches, destroying its minor IDR and freeing it with kfree_rcu().
 */
void tap_destroy_cdev(dev_t major, struct cdev *tap_cdev)
{
	struct major_info *entry, *next_entry;

	cdev_del(tap_cdev);
	unregister_chrdev_region(major, TAP_NUM_DEVS);

	list_for_each_entry_safe(entry, next_entry, &major_list, next) {
		if (entry->major != MAJOR(major))
			continue;

		idr_destroy(&entry->minor_idr);
		list_del_rcu(&entry->next);
		kfree_rcu(entry, rcu);
	}
}
1379 1380 1381 1382 1383
EXPORT_SYMBOL_GPL(tap_destroy_cdev);

MODULE_AUTHOR("Arnd Bergmann <arnd@arndb.de>");
MODULE_AUTHOR("Sainath Grandhi <sainath.grandhi@intel.com>");
MODULE_LICENSE("GPL");