tap.c 30.8 KB
Newer Older
1
// SPDX-License-Identifier: GPL-2.0-only
A
Arnd Bergmann 已提交
2
#include <linux/etherdevice.h>
3
#include <linux/if_tap.h>
4
#include <linux/if_vlan.h>
A
Arnd Bergmann 已提交
5 6 7 8 9 10 11
#include <linux/interrupt.h>
#include <linux/nsproxy.h>
#include <linux/compat.h>
#include <linux/if_tun.h>
#include <linux/module.h>
#include <linux/skbuff.h>
#include <linux/cache.h>
12
#include <linux/sched/signal.h>
A
Arnd Bergmann 已提交
13
#include <linux/types.h>
14
#include <linux/slab.h>
A
Arnd Bergmann 已提交
15 16
#include <linux/wait.h>
#include <linux/cdev.h>
A
Al Viro 已提交
17
#include <linux/idr.h>
A
Arnd Bergmann 已提交
18
#include <linux/fs.h>
H
Herbert Xu 已提交
19
#include <linux/uio.h>
A
Arnd Bergmann 已提交
20 21 22 23

#include <net/net_namespace.h>
#include <net/rtnetlink.h>
#include <net/sock.h>
24
#include <linux/virtio_net.h>
J
Jason Wang 已提交
25
#include <linux/skb_array.h>
A
Arnd Bergmann 已提交
26

27
#define TAP_IFFEATURES (IFF_VNET_HDR | IFF_MULTI_QUEUE)
28

29 30
#define TAP_VNET_LE 0x80000000
#define TAP_VNET_BE 0x40000000
31 32

#ifdef CONFIG_TUN_VNET_CROSS_LE
33
static inline bool tap_legacy_is_little_endian(struct tap_queue *q)
34
{
35
	return q->flags & TAP_VNET_BE ? false :
36 37 38
		virtio_legacy_is_little_endian();
}

39
static long tap_get_vnet_be(struct tap_queue *q, int __user *sp)
40
{
41
	int s = !!(q->flags & TAP_VNET_BE);
42 43 44 45 46 47 48

	if (put_user(s, sp))
		return -EFAULT;

	return 0;
}

49
static long tap_set_vnet_be(struct tap_queue *q, int __user *sp)
50 51 52 53 54 55 56
{
	int s;

	if (get_user(s, sp))
		return -EFAULT;

	if (s)
57
		q->flags |= TAP_VNET_BE;
58
	else
59
		q->flags &= ~TAP_VNET_BE;
60 61 62 63

	return 0;
}
#else
64
static inline bool tap_legacy_is_little_endian(struct tap_queue *q)
65 66 67 68
{
	return virtio_legacy_is_little_endian();
}

69
static long tap_get_vnet_be(struct tap_queue *q, int __user *argp)
70 71 72 73
{
	return -EINVAL;
}

74
static long tap_set_vnet_be(struct tap_queue *q, int __user *argp)
75 76 77 78
{
	return -EINVAL;
}
#endif /* CONFIG_TUN_VNET_CROSS_LE */
M
Michael S. Tsirkin 已提交
79

80
static inline bool tap_is_little_endian(struct tap_queue *q)
81
{
82 83
	return q->flags & TAP_VNET_LE ||
		tap_legacy_is_little_endian(q);
84
}
M
Michael S. Tsirkin 已提交
85

86
/* Convert a virtio 16-bit value to CPU order per the queue's endianness. */
static inline u16 tap16_to_cpu(struct tap_queue *q, __virtio16 val)
{
	return __virtio16_to_cpu(tap_is_little_endian(q), val);
}

91
/* Convert a CPU-order 16-bit value to virtio order per the queue's endianness. */
static inline __virtio16 cpu_to_tap16(struct tap_queue *q, u16 val)
{
	return __cpu_to_virtio16(tap_is_little_endian(q), val);
}

96 97
static struct proto tap_proto = {
	.name = "tap",
A
Arnd Bergmann 已提交
98
	.owner = THIS_MODULE,
99
	.obj_size = sizeof(struct tap_queue),
A
Arnd Bergmann 已提交
100 101
};

102
#define TAP_NUM_DEVS (1U << MINORBITS)
103 104 105

static LIST_HEAD(major_list);

106
struct major_info {
107
	struct rcu_head rcu;
108 109
	dev_t major;
	struct idr minor_idr;
W
WANG Cong 已提交
110
	spinlock_t minor_lock;
111
	const char *device_name;
112 113
	struct list_head next;
};
114

115
#define GOODCOPY_LEN 128
A
Arnd Bergmann 已提交
116

117
static const struct proto_ops tap_socket_ops;
A
Arnd Bergmann 已提交
118

119
#define RX_OFFLOADS (NETIF_F_GRO | NETIF_F_LRO)
120
#define TAP_FEATURES (NETIF_F_GSO | NETIF_F_SG | NETIF_F_FRAGLIST)
121

122
static struct tap_dev *tap_dev_get_rcu(const struct net_device *dev)
123 124 125 126
{
	return rcu_dereference(dev->rx_handler_data);
}

A
Arnd Bergmann 已提交
127 128
/*
 * RCU usage:
129
 * The tap_queue and the macvlan_dev are loosely coupled, the
130
 * pointers from one to the other can only be read while rcu_read_lock
131
 * or rtnl is held.
A
Arnd Bergmann 已提交
132
 *
133
 * Both the file and the macvlan_dev hold a reference on the tap_queue
134 135
 * through sock_hold(&q->sk). When the macvlan_dev goes away first,
 * q->vlan becomes inaccessible. When the files gets closed,
136
 * tap_get_queue() fails.
A
Arnd Bergmann 已提交
137
 *
138 139 140 141
 * There may still be references to the struct sock inside of the
 * queue from outbound SKBs, but these never reference back to the
 * file or the dev. The data structure is freed through __sk_free
 * when both our references and any pending SKBs are gone.
A
Arnd Bergmann 已提交
142 143
 */

144
static int tap_enable_queue(struct tap_dev *tap, struct file *file,
145
			    struct tap_queue *q)
J
Jason Wang 已提交
146 147 148
{
	int err = -EINVAL;

149
	ASSERT_RTNL();
J
Jason Wang 已提交
150 151 152 153 154

	if (q->enabled)
		goto out;

	err = 0;
155 156
	rcu_assign_pointer(tap->taps[tap->numvtaps], q);
	q->queue_index = tap->numvtaps;
J
Jason Wang 已提交
157 158
	q->enabled = true;

159
	tap->numvtaps++;
J
Jason Wang 已提交
160 161 162 163
out:
	return err;
}

164
/* Requires RTNL */
165
static int tap_set_queue(struct tap_dev *tap, struct file *file,
166
			 struct tap_queue *q)
A
Arnd Bergmann 已提交
167
{
168
	if (tap->numqueues == MAX_TAP_QUEUES)
169
		return -EBUSY;
A
Arnd Bergmann 已提交
170

171 172
	rcu_assign_pointer(q->tap, tap);
	rcu_assign_pointer(tap->taps[tap->numvtaps], q);
173
	sock_hold(&q->sk);
A
Arnd Bergmann 已提交
174 175

	q->file = file;
176
	q->queue_index = tap->numvtaps;
J
Jason Wang 已提交
177
	q->enabled = true;
178
	file->private_data = q;
179
	list_add_tail(&q->next, &tap->queue_list);
A
Arnd Bergmann 已提交
180

181 182
	tap->numvtaps++;
	tap->numqueues++;
183

184
	return 0;
A
Arnd Bergmann 已提交
185 186
}

187
static int tap_disable_queue(struct tap_queue *q)
J
Jason Wang 已提交
188
{
189
	struct tap_dev *tap;
190
	struct tap_queue *nq;
J
Jason Wang 已提交
191

192
	ASSERT_RTNL();
J
Jason Wang 已提交
193 194 195
	if (!q->enabled)
		return -EINVAL;

196
	tap = rtnl_dereference(q->tap);
197

198
	if (tap) {
J
Jason Wang 已提交
199
		int index = q->queue_index;
200 201
		BUG_ON(index >= tap->numvtaps);
		nq = rtnl_dereference(tap->taps[tap->numvtaps - 1]);
J
Jason Wang 已提交
202 203
		nq->queue_index = index;

204 205
		rcu_assign_pointer(tap->taps[index], nq);
		RCU_INIT_POINTER(tap->taps[tap->numvtaps - 1], NULL);
J
Jason Wang 已提交
206 207
		q->enabled = false;

208
		tap->numvtaps--;
J
Jason Wang 已提交
209 210 211 212 213
	}

	return 0;
}

A
Arnd Bergmann 已提交
214
/*
215 216 217
 * The file owning the queue got closed, give up both
 * the reference that the files holds as well as the
 * one from the macvlan_dev if that still exists.
A
Arnd Bergmann 已提交
218 219 220 221
 *
 * Using the spinlock makes sure that we don't get
 * to the queue again after destroying it.
 */
222
static void tap_put_queue(struct tap_queue *q)
A
Arnd Bergmann 已提交
223
{
224
	struct tap_dev *tap;
A
Arnd Bergmann 已提交
225

226
	rtnl_lock();
227
	tap = rtnl_dereference(q->tap);
228

229
	if (tap) {
J
Jason Wang 已提交
230
		if (q->enabled)
231
			BUG_ON(tap_disable_queue(q));
J
Jason Wang 已提交
232

233 234
		tap->numqueues--;
		RCU_INIT_POINTER(q->tap, NULL);
235
		sock_put(&q->sk);
J
Jason Wang 已提交
236
		list_del_init(&q->next);
A
Arnd Bergmann 已提交
237 238
	}

239
	rtnl_unlock();
A
Arnd Bergmann 已提交
240 241 242 243 244 245

	synchronize_rcu();
	sock_put(&q->sk);
}

/*
246 247 248 249 250
 * Select a queue based on the rxq of the device on which this packet
 * arrived. If the incoming device is not mq, calculate a flow hash
 * to select a queue. If all fails, find the first available queue.
 * Cache vlan->numvtaps since it can become zero during the execution
 * of this function.
A
Arnd Bergmann 已提交
251
 */
252
static struct tap_queue *tap_get_queue(struct tap_dev *tap,
253
				       struct sk_buff *skb)
A
Arnd Bergmann 已提交
254
{
255
	struct tap_queue *queue = NULL;
J
Jason Wang 已提交
256 257 258 259 260
	/* Access to taps array is protected by rcu, but access to numvtaps
	 * isn't. Below we use it to lookup a queue, but treat it as a hint
	 * and validate that the result isn't NULL - in case we are
	 * racing against queue removal.
	 */
261
	int numvtaps = READ_ONCE(tap->numvtaps);
262 263 264 265 266
	__u32 rxq;

	if (!numvtaps)
		goto out;

267 268 269
	if (numvtaps == 1)
		goto single;

270
	/* Check if we can use flow to select a queue */
271
	rxq = skb_get_hash(skb);
272
	if (rxq) {
273
		queue = rcu_dereference(tap->taps[rxq % numvtaps]);
J
Jason Wang 已提交
274
		goto out;
275 276
	}

277 278
	if (likely(skb_rx_queue_recorded(skb))) {
		rxq = skb_get_rx_queue(skb);
A
Arnd Bergmann 已提交
279

280 281 282
		while (unlikely(rxq >= numvtaps))
			rxq -= numvtaps;

283
		queue = rcu_dereference(tap->taps[rxq]);
J
Jason Wang 已提交
284
		goto out;
285 286
	}

287
single:
288
	queue = rcu_dereference(tap->taps[0]);
289
out:
290
	return queue;
A
Arnd Bergmann 已提交
291 292
}

293 294
/*
 * The net_device is going away, give up the reference
295 296
 * that it holds on all queues and safely set the pointer
 * from the queues to NULL.
297
 */
298
void tap_del_queues(struct tap_dev *tap)
A
Arnd Bergmann 已提交
299
{
300
	struct tap_queue *q, *tmp;
301

302
	ASSERT_RTNL();
303
	list_for_each_entry_safe(q, tmp, &tap->queue_list, next) {
J
Jason Wang 已提交
304
		list_del_init(&q->next);
305
		RCU_INIT_POINTER(q->tap, NULL);
J
Jason Wang 已提交
306
		if (q->enabled)
307 308
			tap->numvtaps--;
		tap->numqueues--;
309
		sock_put(&q->sk);
310
	}
311 312
	BUG_ON(tap->numvtaps);
	BUG_ON(tap->numqueues);
313
	/* guarantee that any future tap_set_queue will fail */
314
	tap->numvtaps = MAX_TAP_QUEUES;
A
Arnd Bergmann 已提交
315
}
316
EXPORT_SYMBOL_GPL(tap_del_queues);
A
Arnd Bergmann 已提交
317

318
rx_handler_result_t tap_handle_frame(struct sk_buff **pskb)
A
Arnd Bergmann 已提交
319
{
320 321
	struct sk_buff *skb = *pskb;
	struct net_device *dev = skb->dev;
322
	struct tap_dev *tap;
323
	struct tap_queue *q;
324
	netdev_features_t features = TAP_FEATURES;
325
	enum skb_drop_reason drop_reason;
326

327 328
	tap = tap_dev_get_rcu(dev);
	if (!tap)
329 330
		return RX_HANDLER_PASS;

331
	q = tap_get_queue(tap, skb);
A
Arnd Bergmann 已提交
332
	if (!q)
333
		return RX_HANDLER_PASS;
H
Herbert Xu 已提交
334

335 336
	skb_push(skb, ETH_HLEN);

337
	/* Apply the forward feature mask so that we perform segmentation
338 339
	 * according to users wishes.  This only works if VNET_HDR is
	 * enabled.
340
	 */
341
	if (q->flags & IFF_VNET_HDR)
342
		features |= tap->tap_features;
343
	if (netif_needs_gso(skb, features)) {
344
		struct sk_buff *segs = __skb_gso_segment(skb, features, false);
345
		struct sk_buff *next;
346

347 348
		if (IS_ERR(segs)) {
			drop_reason = SKB_DROP_REASON_SKB_GSO_SEG;
349
			goto drop;
350
		}
351 352

		if (!segs) {
353 354
			if (ptr_ring_produce(&q->ring, skb)) {
				drop_reason = SKB_DROP_REASON_FULL_RING;
J
Jason Wang 已提交
355
				goto drop;
356
			}
357 358 359
			goto wake_up;
		}

360
		consume_skb(skb);
361 362 363
		skb_list_walk_safe(segs, skb, next) {
			skb_mark_not_on_list(skb);
			if (ptr_ring_produce(&q->ring, skb)) {
364 365 366
				drop_reason = SKB_DROP_REASON_FULL_RING;
				kfree_skb_reason(skb, drop_reason);
				kfree_skb_list_reason(next, drop_reason);
J
Jason Wang 已提交
367 368
				break;
			}
369 370
		}
	} else {
371 372 373
		/* If we receive a partial checksum and the tap side
		 * doesn't support checksum offload, compute the checksum.
		 * Note: it doesn't matter which checksum feature to
S
Sainath Grandhi 已提交
374
		 *	  check, we either support them all or none.
375 376
		 */
		if (skb->ip_summed == CHECKSUM_PARTIAL &&
377
		    !(features & NETIF_F_CSUM_MASK) &&
378 379
		    skb_checksum_help(skb)) {
			drop_reason = SKB_DROP_REASON_SKB_CSUM;
380
			goto drop;
381 382 383
		}
		if (ptr_ring_produce(&q->ring, skb)) {
			drop_reason = SKB_DROP_REASON_FULL_RING;
J
Jason Wang 已提交
384
			goto drop;
385
		}
386 387 388
	}

wake_up:
389
	wake_up_interruptible_poll(sk_sleep(&q->sk), EPOLLIN | EPOLLRDNORM | EPOLLRDBAND);
390
	return RX_HANDLER_CONSUMED;
H
Herbert Xu 已提交
391 392

drop:
393
	/* Count errors/drops only here, thus don't care about args. */
394 395
	if (tap->count_rx_dropped)
		tap->count_rx_dropped(tap);
396
	kfree_skb_reason(skb, drop_reason);
397
	return RX_HANDLER_CONSUMED;
A
Arnd Bergmann 已提交
398
}
399
EXPORT_SYMBOL_GPL(tap_handle_frame);
A
Arnd Bergmann 已提交
400

401 402 403 404 405 406 407 408 409 410 411 412 413
/* Look up the major_info for a char-dev major; RCU read lock must be held. */
static struct major_info *tap_get_major(int major)
{
	struct major_info *tap_major;

	list_for_each_entry_rcu(tap_major, &major_list, next) {
		if (tap_major->major == major)
			return tap_major;
	}

	return NULL;
}

/* Allocate a minor number for @tap under the given major.
 * Returns 0 on success or a negative errno.
 */
int tap_get_minor(dev_t major, struct tap_dev *tap)
{
	int retval = -ENOMEM;
	struct major_info *tap_major;

	rcu_read_lock();
	tap_major = tap_get_major(MAJOR(major));
	if (!tap_major) {
		retval = -EINVAL;
		goto unlock;
	}

	spin_lock(&tap_major->minor_lock);
	retval = idr_alloc(&tap_major->minor_idr, tap, 1, TAP_NUM_DEVS, GFP_ATOMIC);
	if (retval >= 0) {
		tap->minor = retval;
	} else if (retval == -ENOSPC) {
		netdev_err(tap->dev, "Too many tap devices\n");
		retval = -EINVAL;
	}
	spin_unlock(&tap_major->minor_lock);

unlock:
	rcu_read_unlock();
	return retval < 0 ? retval : 0;
}
EXPORT_SYMBOL_GPL(tap_get_minor);
440

441
/* Release @tap's minor number, if it holds one. */
void tap_free_minor(dev_t major, struct tap_dev *tap)
{
	struct major_info *tap_major;

	rcu_read_lock();
	tap_major = tap_get_major(MAJOR(major));
	if (!tap_major) {
		goto unlock;
	}

	spin_lock(&tap_major->minor_lock);
	if (tap->minor) {
		idr_remove(&tap_major->minor_idr, tap->minor);
		tap->minor = 0;
	}
	spin_unlock(&tap_major->minor_lock);

unlock:
	rcu_read_unlock();
}
EXPORT_SYMBOL_GPL(tap_free_minor);
462

463
static struct tap_dev *dev_get_by_tap_file(int major, int minor)
464 465
{
	struct net_device *dev = NULL;
466
	struct tap_dev *tap;
467
	struct major_info *tap_major;
468

469 470 471 472 473 474 475
	rcu_read_lock();
	tap_major = tap_get_major(major);
	if (!tap_major) {
		tap = NULL;
		goto unlock;
	}

W
WANG Cong 已提交
476
	spin_lock(&tap_major->minor_lock);
477
	tap = idr_find(&tap_major->minor_idr, minor);
478 479
	if (tap) {
		dev = tap->dev;
480 481
		dev_hold(dev);
	}
W
WANG Cong 已提交
482
	spin_unlock(&tap_major->minor_lock);
483 484 485

unlock:
	rcu_read_unlock();
486
	return tap;
487 488
}

489
static void tap_sock_write_space(struct sock *sk)
A
Arnd Bergmann 已提交
490
{
491 492
	wait_queue_head_t *wqueue;

A
Arnd Bergmann 已提交
493
	if (!sock_writeable(sk) ||
494
	    !test_and_clear_bit(SOCKWQ_ASYNC_NOSPACE, &sk->sk_socket->flags))
A
Arnd Bergmann 已提交
495 496
		return;

497 498
	wqueue = sk_sleep(sk);
	if (wqueue && waitqueue_active(wqueue))
499
		wake_up_interruptible_poll(wqueue, EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND);
A
Arnd Bergmann 已提交
500 501
}

502
static void tap_sock_destruct(struct sock *sk)
503
{
504
	struct tap_queue *q = container_of(sk, struct tap_queue, sk);
J
Jason Wang 已提交
505

506
	ptr_ring_cleanup(&q->ring, __skb_array_destroy_skb);
507 508
}

509
static int tap_open(struct inode *inode, struct file *file)
A
Arnd Bergmann 已提交
510 511
{
	struct net *net = current->nsproxy->net_ns;
512
	struct tap_dev *tap;
513
	struct tap_queue *q;
514
	int err = -ENODEV;
A
Arnd Bergmann 已提交
515

516
	rtnl_lock();
517
	tap = dev_get_by_tap_file(imajor(inode), iminor(inode));
518
	if (!tap)
J
Jason Wang 已提交
519
		goto err;
A
Arnd Bergmann 已提交
520 521

	err = -ENOMEM;
522 523
	q = (struct tap_queue *)sk_alloc(net, AF_UNSPEC, GFP_KERNEL,
					     &tap_proto, 0);
A
Arnd Bergmann 已提交
524
	if (!q)
J
Jason Wang 已提交
525
		goto err;
526
	if (ptr_ring_init(&q->ring, tap->dev->tx_queue_len, GFP_KERNEL)) {
527 528 529
		sk_free(&q->sk);
		goto err;
	}
A
Arnd Bergmann 已提交
530

531
	init_waitqueue_head(&q->sock.wq.wait);
A
Arnd Bergmann 已提交
532 533
	q->sock.type = SOCK_RAW;
	q->sock.state = SS_CONNECTED;
A
Arnd Bergmann 已提交
534
	q->sock.file = file;
535
	q->sock.ops = &tap_socket_ops;
536
	sock_init_data_uid(&q->sock, &q->sk, inode->i_uid);
537 538
	q->sk.sk_write_space = tap_sock_write_space;
	q->sk.sk_destruct = tap_sock_destruct;
539
	q->flags = IFF_VNET_HDR | IFF_NO_PI | IFF_TAP;
540
	q->vnet_hdr_sz = sizeof(struct virtio_net_hdr);
A
Arnd Bergmann 已提交
541

542
	/*
543
	 * so far only KVM virtio_net uses tap, enable zero copy between
544
	 * guest kernel and host kernel when lower device supports zerocopy
545 546 547
	 *
	 * The macvlan supports zerocopy iff the lower device supports zero
	 * copy so we don't have to look at the lower device directly.
548
	 */
549
	if ((tap->dev->features & NETIF_F_HIGHDMA) && (tap->dev->features & NETIF_F_SG))
550
		sock_set_flag(&q->sk, SOCK_ZEROCOPY);
551

552
	err = tap_set_queue(tap, file, q);
553
	if (err) {
554
		/* tap_sock_destruct() will take care of freeing ptr_ring */
555 556
		goto err_put;
	}
A
Arnd Bergmann 已提交
557

558
	dev_put(tap->dev);
J
Jason Wang 已提交
559 560 561 562

	rtnl_unlock();
	return err;

563
err_put:
J
Jason Wang 已提交
564 565
	sock_put(&q->sk);
err:
566 567
	if (tap)
		dev_put(tap->dev);
A
Arnd Bergmann 已提交
568

569
	rtnl_unlock();
A
Arnd Bergmann 已提交
570 571 572
	return err;
}

573
static int tap_release(struct inode *inode, struct file *file)
A
Arnd Bergmann 已提交
574
{
575 576
	struct tap_queue *q = file->private_data;
	tap_put_queue(q);
A
Arnd Bergmann 已提交
577 578 579
	return 0;
}

580
/* poll(): readable when the ring has frames, writeable per the sock state. */
static __poll_t tap_poll(struct file *file, poll_table *wait)
{
	struct tap_queue *q = file->private_data;
	__poll_t mask = EPOLLERR;

	if (!q)
		goto out;

	mask = 0;
	poll_wait(file, &q->sock.wq.wait, wait);

	if (!ptr_ring_empty(&q->ring))
		mask |= EPOLLIN | EPOLLRDNORM;

	/* Recheck writeability after arming SOCKWQ_ASYNC_NOSPACE to avoid
	 * missing a wakeup between the first test and the bit set.
	 */
	if (sock_writeable(&q->sk) ||
	    (!test_and_set_bit(SOCKWQ_ASYNC_NOSPACE, &q->sock.flags) &&
	     sock_writeable(&q->sk)))
		mask |= EPOLLOUT | EPOLLWRNORM;

out:
	return mask;
}

603 604
/* Allocate an skb with @prepad headroom, @linear linear bytes and the
 * remainder of @len in page fragments.
 */
static inline struct sk_buff *tap_alloc_skb(struct sock *sk, size_t prepad,
					    size_t len, size_t linear,
						int noblock, int *err)
{
	struct sk_buff *skb;

	/* Under a page?  Don't bother with paged skb. */
	if (prepad + len < PAGE_SIZE || !linear)
		linear = len;

	skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock,
				   err, 0);
	if (!skb)
		return NULL;

	skb_reserve(skb, prepad);
	skb_put(skb, linear);
	skb->data_len = len - linear;
	skb->len += len - linear;

	return skb;
}

626
/* Neighbour code has some assumptions on HH_DATA_MOD alignment */
627
#define TAP_RESERVE HH_DATA_OFF(ETH_HLEN)
628

A
Arnd Bergmann 已提交
629
/* Get packet from user space buffer */
630
static ssize_t tap_get_user(struct tap_queue *q, void *msg_control,
631
			    struct iov_iter *from, int noblock)
A
Arnd Bergmann 已提交
632
{
633
	int good_linear = SKB_MAX_HEAD(TAP_RESERVE);
A
Arnd Bergmann 已提交
634
	struct sk_buff *skb;
635
	struct tap_dev *tap;
636
	unsigned long total_len = iov_iter_count(from);
637
	unsigned long len = total_len;
A
Arnd Bergmann 已提交
638
	int err;
639 640
	struct virtio_net_hdr vnet_hdr = { 0 };
	int vnet_hdr_len = 0;
641
	int copylen = 0;
642
	int depth;
643
	bool zerocopy = false;
644
	size_t linear;
645
	enum skb_drop_reason drop_reason;
646 647

	if (q->flags & IFF_VNET_HDR) {
648
		vnet_hdr_len = READ_ONCE(q->vnet_hdr_sz);
649 650

		err = -EINVAL;
651
		if (len < vnet_hdr_len)
652
			goto err;
653
		len -= vnet_hdr_len;
654

655
		err = -EFAULT;
656
		if (!copy_from_iter_full(&vnet_hdr, sizeof(vnet_hdr), from))
657
			goto err;
658
		iov_iter_advance(from, vnet_hdr_len - sizeof(vnet_hdr));
659
		if ((vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
660 661 662 663 664 665
		     tap16_to_cpu(q, vnet_hdr.csum_start) +
		     tap16_to_cpu(q, vnet_hdr.csum_offset) + 2 >
			     tap16_to_cpu(q, vnet_hdr.hdr_len))
			vnet_hdr.hdr_len = cpu_to_tap16(q,
				 tap16_to_cpu(q, vnet_hdr.csum_start) +
				 tap16_to_cpu(q, vnet_hdr.csum_offset) + 2);
666
		err = -EINVAL;
667
		if (tap16_to_cpu(q, vnet_hdr.hdr_len) > len)
668 669
			goto err;
	}
A
Arnd Bergmann 已提交
670

671
	err = -EINVAL;
A
Arnd Bergmann 已提交
672
	if (unlikely(len < ETH_HLEN))
673
		goto err;
A
Arnd Bergmann 已提交
674

675
	if (msg_control && sock_flag(&q->sk, SOCK_ZEROCOPY)) {
676 677
		struct iov_iter i;

M
Michael S. Tsirkin 已提交
678
		copylen = vnet_hdr.hdr_len ?
679
			tap16_to_cpu(q, vnet_hdr.hdr_len) : GOODCOPY_LEN;
680 681
		if (copylen > good_linear)
			copylen = good_linear;
682 683
		else if (copylen < ETH_HLEN)
			copylen = ETH_HLEN;
684
		linear = copylen;
685 686 687
		i = *from;
		iov_iter_advance(&i, copylen);
		if (iov_iter_npages(&i, INT_MAX) <= MAX_SKB_FRAGS)
688 689 690 691
			zerocopy = true;
	}

	if (!zerocopy) {
692
		copylen = len;
693
		linear = tap16_to_cpu(q, vnet_hdr.hdr_len);
694
		if (linear > good_linear)
695
			linear = good_linear;
696 697
		else if (linear < ETH_HLEN)
			linear = ETH_HLEN;
698
	}
699

700 701
	skb = tap_alloc_skb(&q->sk, TAP_RESERVE, copylen,
			    linear, noblock, &err);
702 703
	if (!skb)
		goto err;
A
Arnd Bergmann 已提交
704

705
	if (zerocopy)
706
		err = zerocopy_sg_from_iter(skb, from);
707
	else
708
		err = skb_copy_datagram_from_iter(skb, 0, from, len);
709

710 711
	if (err) {
		drop_reason = SKB_DROP_REASON_SKB_UCOPY_FAULT;
712
		goto err_kfree;
713
	}
A
Arnd Bergmann 已提交
714 715

	skb_set_network_header(skb, ETH_HLEN);
716 717 718
	skb_reset_mac_header(skb);
	skb->protocol = eth_hdr(skb)->h_proto;

719 720 721 722 723 724 725 726 727
	rcu_read_lock();
	tap = rcu_dereference(q->tap);
	if (!tap) {
		kfree_skb(skb);
		rcu_read_unlock();
		return total_len;
	}
	skb->dev = tap->dev;

728
	if (vnet_hdr_len) {
729
		err = virtio_net_hdr_to_skb(skb, &vnet_hdr,
730
					    tap_is_little_endian(q));
731
		if (err) {
732
			rcu_read_unlock();
733
			drop_reason = SKB_DROP_REASON_DEV_HDR;
734
			goto err_kfree;
735
		}
736 737
	}

738
	skb_probe_transport_header(skb);
739

740
	/* Move network header to the right position for VLAN tagged packets */
741
	if (eth_type_vlan(skb->protocol) &&
742 743 744
	    __vlan_get_protocol(skb, skb->protocol, &depth) != 0)
		skb_set_network_header(skb, depth);

745
	/* copy skb_ubuf_info for callback when skb has no error */
746
	if (zerocopy) {
747
		skb_zcopy_init(skb, msg_control);
748 749
	} else if (msg_control) {
		struct ubuf_info *uarg = msg_control;
750
		uarg->callback(NULL, uarg, false);
751
	}
752

753
	dev_queue_xmit(skb);
754
	rcu_read_unlock();
755
	return total_len;
756

757
err_kfree:
758
	kfree_skb_reason(skb, drop_reason);
759

760
err:
761
	rcu_read_lock();
762 763 764
	tap = rcu_dereference(q->tap);
	if (tap && tap->count_tx_dropped)
		tap->count_tx_dropped(tap);
765
	rcu_read_unlock();
766 767

	return err;
A
Arnd Bergmann 已提交
768 769
}

770
static ssize_t tap_write_iter(struct kiocb *iocb, struct iov_iter *from)
A
Arnd Bergmann 已提交
771 772
{
	struct file *file = iocb->ki_filp;
773
	struct tap_queue *q = file->private_data;
A
Arnd Bergmann 已提交
774

775
	return tap_get_user(q, NULL, from, file->f_flags & O_NONBLOCK);
A
Arnd Bergmann 已提交
776 777 778
}

/* Put packet to the user space buffer */
779 780 781
static ssize_t tap_put_user(struct tap_queue *q,
			    const struct sk_buff *skb,
			    struct iov_iter *iter)
A
Arnd Bergmann 已提交
782 783
{
	int ret;
784
	int vnet_hdr_len = 0;
785
	int vlan_offset = 0;
H
Herbert Xu 已提交
786
	int total;
787 788

	if (q->flags & IFF_VNET_HDR) {
789
		int vlan_hlen = skb_vlan_tag_present(skb) ? VLAN_HLEN : 0;
790
		struct virtio_net_hdr vnet_hdr;
791

792
		vnet_hdr_len = READ_ONCE(q->vnet_hdr_sz);
H
Herbert Xu 已提交
793
		if (iov_iter_count(iter) < vnet_hdr_len)
794 795
			return -EINVAL;

796
		if (virtio_net_hdr_from_skb(skb, &vnet_hdr,
797 798
					    tap_is_little_endian(q), true,
					    vlan_hlen))
799
			BUG();
800

H
Herbert Xu 已提交
801 802
		if (copy_to_iter(&vnet_hdr, sizeof(vnet_hdr), iter) !=
		    sizeof(vnet_hdr))
803
			return -EFAULT;
804 805

		iov_iter_advance(iter, vnet_hdr_len - sizeof(vnet_hdr));
806
	}
H
Herbert Xu 已提交
807
	total = vnet_hdr_len;
J
Jason Wang 已提交
808
	total += skb->len;
809

810
	if (skb_vlan_tag_present(skb)) {
811 812 813 814
		struct {
			__be16 h_vlan_proto;
			__be16 h_vlan_TCI;
		} veth;
815
		veth.h_vlan_proto = skb->vlan_proto;
816
		veth.h_vlan_TCI = htons(skb_vlan_tag_get(skb));
817 818

		vlan_offset = offsetof(struct vlan_ethhdr, h_vlan_proto);
J
Jason Wang 已提交
819
		total += VLAN_HLEN;
820

H
Herbert Xu 已提交
821 822
		ret = skb_copy_datagram_iter(skb, 0, iter, vlan_offset);
		if (ret || !iov_iter_count(iter))
823 824
			goto done;

H
Herbert Xu 已提交
825 826
		ret = copy_to_iter(&veth, sizeof(veth), iter);
		if (ret != sizeof(veth) || !iov_iter_count(iter))
827 828
			goto done;
	}
A
Arnd Bergmann 已提交
829

H
Herbert Xu 已提交
830 831
	ret = skb_copy_datagram_iter(skb, vlan_offset, iter,
				     skb->len - vlan_offset);
A
Arnd Bergmann 已提交
832

833
done:
J
Jason Wang 已提交
834
	return ret ? ret : total;
A
Arnd Bergmann 已提交
835 836
}

837 838
static ssize_t tap_do_read(struct tap_queue *q,
			   struct iov_iter *to,
839
			   int noblock, struct sk_buff *skb)
A
Arnd Bergmann 已提交
840
{
841
	DEFINE_WAIT(wait);
A
Arnd Bergmann 已提交
842
	ssize_t ret = 0;
A
Arnd Bergmann 已提交
843

W
Wei Xu 已提交
844
	if (!iov_iter_count(to)) {
845
		kfree_skb(skb);
A
Al Viro 已提交
846
		return 0;
W
Wei Xu 已提交
847
	}
A
Al Viro 已提交
848

849 850 851
	if (skb)
		goto put;

A
Al Viro 已提交
852
	while (1) {
853 854 855
		if (!noblock)
			prepare_to_wait(sk_sleep(&q->sk), &wait,
					TASK_INTERRUPTIBLE);
A
Arnd Bergmann 已提交
856 857

		/* Read frames from the queue */
858
		skb = ptr_ring_consume(&q->ring);
A
Al Viro 已提交
859 860 861 862 863
		if (skb)
			break;
		if (noblock) {
			ret = -EAGAIN;
			break;
A
Arnd Bergmann 已提交
864
		}
A
Al Viro 已提交
865 866 867 868 869 870 871
		if (signal_pending(current)) {
			ret = -ERESTARTSYS;
			break;
		}
		/* Nothing to read, let's sleep */
		schedule();
	}
872 873 874
	if (!noblock)
		finish_wait(sk_sleep(&q->sk), &wait);

875
put:
A
Al Viro 已提交
876
	if (skb) {
877
		ret = tap_put_user(q, skb, to);
878 879 880 881
		if (unlikely(ret < 0))
			kfree_skb(skb);
		else
			consume_skb(skb);
A
Arnd Bergmann 已提交
882
	}
A
Arnd Bergmann 已提交
883 884 885
	return ret;
}

886
static ssize_t tap_read_iter(struct kiocb *iocb, struct iov_iter *to)
A
Arnd Bergmann 已提交
887 888
{
	struct file *file = iocb->ki_filp;
889
	struct tap_queue *q = file->private_data;
A
Al Viro 已提交
890
	ssize_t len = iov_iter_count(to), ret;
A
Arnd Bergmann 已提交
891

892
	ret = tap_do_read(q, to, file->f_flags & O_NONBLOCK, NULL);
J
Jason Wang 已提交
893
	ret = min_t(ssize_t, ret, len);
894 895
	if (ret > 0)
		iocb->ki_pos = ret;
A
Arnd Bergmann 已提交
896 897 898
	return ret;
}

899
static struct tap_dev *tap_get_tap_dev(struct tap_queue *q)
900
{
901
	struct tap_dev *tap;
902

903
	ASSERT_RTNL();
904 905 906
	tap = rtnl_dereference(q->tap);
	if (tap)
		dev_hold(tap->dev);
907

908
	return tap;
909 910
}

911
static void tap_put_tap_dev(struct tap_dev *tap)
912
{
913
	dev_put(tap->dev);
914 915
}

916
static int tap_ioctl_set_queue(struct file *file, unsigned int flags)
J
Jason Wang 已提交
917
{
918
	struct tap_queue *q = file->private_data;
919
	struct tap_dev *tap;
J
Jason Wang 已提交
920 921
	int ret;

922 923
	tap = tap_get_tap_dev(q);
	if (!tap)
J
Jason Wang 已提交
924 925 926
		return -EINVAL;

	if (flags & IFF_ATTACH_QUEUE)
927
		ret = tap_enable_queue(tap, file, q);
J
Jason Wang 已提交
928
	else if (flags & IFF_DETACH_QUEUE)
929
		ret = tap_disable_queue(q);
930 931
	else
		ret = -EINVAL;
J
Jason Wang 已提交
932

933
	tap_put_tap_dev(tap);
J
Jason Wang 已提交
934 935 936
	return ret;
}

937
static int set_offload(struct tap_queue *q, unsigned long arg)
938
{
939
	struct tap_dev *tap;
940 941 942
	netdev_features_t features;
	netdev_features_t feature_mask = 0;

943 944
	tap = rtnl_dereference(q->tap);
	if (!tap)
945 946
		return -ENOLINK;

947
	features = tap->dev->features;
948 949 950 951 952 953 954 955 956 957 958 959

	if (arg & TUN_F_CSUM) {
		feature_mask = NETIF_F_HW_CSUM;

		if (arg & (TUN_F_TSO4 | TUN_F_TSO6)) {
			if (arg & TUN_F_TSO_ECN)
				feature_mask |= NETIF_F_TSO_ECN;
			if (arg & TUN_F_TSO4)
				feature_mask |= NETIF_F_TSO;
			if (arg & TUN_F_TSO6)
				feature_mask |= NETIF_F_TSO6;
		}
960 961 962 963

		/* TODO: for now USO4 and USO6 should work simultaneously */
		if ((arg & (TUN_F_USO4 | TUN_F_USO6)) == (TUN_F_USO4 | TUN_F_USO6))
			features |= NETIF_F_GSO_UDP_L4;
964 965 966 967 968 969
	}

	/* tun/tap driver inverts the usage for TSO offloads, where
	 * setting the TSO bit means that the userspace wants to
	 * accept TSO frames and turning it off means that user space
	 * does not support TSO.
970
	 * For tap, we have to invert it to mean the same thing.
971 972 973
	 * When user space turns off TSO, we turn off GSO/LRO so that
	 * user-space will not receive TSO frames.
	 */
974 975
	if (feature_mask & (NETIF_F_TSO | NETIF_F_TSO6) ||
	    (feature_mask & (TUN_F_USO4 | TUN_F_USO6)) == (TUN_F_USO4 | TUN_F_USO6))
976 977 978 979 980 981 982
		features |= RX_OFFLOADS;
	else
		features &= ~RX_OFFLOADS;

	/* tap_features are the same as features on tun/tap and
	 * reflect user expectations.
	 */
983 984 985
	tap->tap_features = feature_mask;
	if (tap->update_features)
		tap->update_features(tap, features);
986 987 988 989

	return 0;
}

A
Arnd Bergmann 已提交
990 991 992
/*
 * provide compatibility with generic tun/tap interface
 */
993 994
static long tap_ioctl(struct file *file, unsigned int cmd,
		      unsigned long arg)
A
Arnd Bergmann 已提交
995
{
996
	struct tap_queue *q = file->private_data;
997
	struct tap_dev *tap;
A
Arnd Bergmann 已提交
998 999 1000
	void __user *argp = (void __user *)arg;
	struct ifreq __user *ifr = argp;
	unsigned int __user *up = argp;
1001
	unsigned short u;
1002
	int __user *sp = argp;
1003
	struct sockaddr sa;
1004
	int s;
1005
	int ret;
A
Arnd Bergmann 已提交
1006 1007 1008 1009 1010 1011

	switch (cmd) {
	case TUNSETIFF:
		/* ignore the name, just look at flags */
		if (get_user(u, &ifr->ifr_flags))
			return -EFAULT;
1012 1013

		ret = 0;
1014
		if ((u & ~TAP_IFFEATURES) != (IFF_NO_PI | IFF_TAP))
1015 1016
			ret = -EINVAL;
		else
1017
			q->flags = (q->flags & ~TAP_IFFEATURES) | u;
1018 1019

		return ret;
A
Arnd Bergmann 已提交
1020 1021

	case TUNGETIFF:
1022
		rtnl_lock();
1023 1024
		tap = tap_get_tap_dev(q);
		if (!tap) {
1025
			rtnl_unlock();
A
Arnd Bergmann 已提交
1026
			return -ENOLINK;
1027
		}
A
Arnd Bergmann 已提交
1028

1029
		ret = 0;
1030
		u = q->flags;
1031
		if (copy_to_user(&ifr->ifr_name, tap->dev->name, IFNAMSIZ) ||
1032
		    put_user(u, &ifr->ifr_flags))
1033
			ret = -EFAULT;
1034
		tap_put_tap_dev(tap);
1035
		rtnl_unlock();
1036
		return ret;
A
Arnd Bergmann 已提交
1037

J
Jason Wang 已提交
1038 1039 1040
	case TUNSETQUEUE:
		if (get_user(u, &ifr->ifr_flags))
			return -EFAULT;
1041
		rtnl_lock();
1042
		ret = tap_ioctl_set_queue(file, u);
1043
		rtnl_unlock();
1044
		return ret;
J
Jason Wang 已提交
1045

A
Arnd Bergmann 已提交
1046
	case TUNGETFEATURES:
1047
		if (put_user(IFF_TAP | IFF_NO_PI | TAP_IFFEATURES, up))
A
Arnd Bergmann 已提交
1048 1049 1050 1051
			return -EFAULT;
		return 0;

	case TUNSETSNDBUF:
1052
		if (get_user(s, sp))
A
Arnd Bergmann 已提交
1053
			return -EFAULT;
1054 1055
		if (s <= 0)
			return -EINVAL;
A
Arnd Bergmann 已提交
1056

1057
		q->sk.sk_sndbuf = s;
A
Arnd Bergmann 已提交
1058 1059
		return 0;

1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074
	case TUNGETVNETHDRSZ:
		s = q->vnet_hdr_sz;
		if (put_user(s, sp))
			return -EFAULT;
		return 0;

	case TUNSETVNETHDRSZ:
		if (get_user(s, sp))
			return -EFAULT;
		if (s < (int)sizeof(struct virtio_net_hdr))
			return -EINVAL;

		q->vnet_hdr_sz = s;
		return 0;

1075
	case TUNGETVNETLE:
1076
		s = !!(q->flags & TAP_VNET_LE);
1077 1078 1079 1080 1081 1082 1083 1084
		if (put_user(s, sp))
			return -EFAULT;
		return 0;

	case TUNSETVNETLE:
		if (get_user(s, sp))
			return -EFAULT;
		if (s)
1085
			q->flags |= TAP_VNET_LE;
1086
		else
1087
			q->flags &= ~TAP_VNET_LE;
1088 1089
		return 0;

1090
	case TUNGETVNETBE:
1091
		return tap_get_vnet_be(q, sp);
1092 1093

	case TUNSETVNETBE:
1094
		return tap_set_vnet_be(q, sp);
1095

A
Arnd Bergmann 已提交
1096 1097 1098
	case TUNSETOFFLOAD:
		/* let the user check for future flags */
		if (arg & ~(TUN_F_CSUM | TUN_F_TSO4 | TUN_F_TSO6 |
1099 1100
			    TUN_F_TSO_ECN | TUN_F_UFO |
			    TUN_F_USO4 | TUN_F_USO6))
A
Arnd Bergmann 已提交
1101 1102
			return -EINVAL;

1103 1104 1105 1106
		rtnl_lock();
		ret = set_offload(q, arg);
		rtnl_unlock();
		return ret;
A
Arnd Bergmann 已提交
1107

1108 1109
	case SIOCGIFHWADDR:
		rtnl_lock();
1110 1111
		tap = tap_get_tap_dev(q);
		if (!tap) {
1112 1113 1114 1115
			rtnl_unlock();
			return -ENOLINK;
		}
		ret = 0;
1116
		dev_get_mac_address(&sa, dev_net(tap->dev), tap->dev->name);
1117
		if (copy_to_user(&ifr->ifr_name, tap->dev->name, IFNAMSIZ) ||
1118
		    copy_to_user(&ifr->ifr_hwaddr, &sa, sizeof(sa)))
1119
			ret = -EFAULT;
1120
		tap_put_tap_dev(tap);
1121 1122 1123 1124
		rtnl_unlock();
		return ret;

	case SIOCSIFHWADDR:
1125 1126
		if (copy_from_user(&sa, &ifr->ifr_hwaddr, sizeof(sa)))
			return -EFAULT;
1127
		rtnl_lock();
1128 1129
		tap = tap_get_tap_dev(q);
		if (!tap) {
1130 1131 1132
			rtnl_unlock();
			return -ENOLINK;
		}
1133
		ret = dev_set_mac_address_user(tap->dev, &sa, NULL);
1134
		tap_put_tap_dev(tap);
1135 1136 1137
		rtnl_unlock();
		return ret;

A
Arnd Bergmann 已提交
1138 1139 1140 1141 1142
	default:
		return -EINVAL;
	}
}

1143
static const struct file_operations tap_fops = {
A
Arnd Bergmann 已提交
1144
	.owner		= THIS_MODULE,
1145 1146 1147 1148 1149
	.open		= tap_open,
	.release	= tap_release,
	.read_iter	= tap_read_iter,
	.write_iter	= tap_write_iter,
	.poll		= tap_poll,
A
Arnd Bergmann 已提交
1150
	.llseek		= no_llseek,
1151
	.unlocked_ioctl	= tap_ioctl,
1152
	.compat_ioctl	= compat_ptr_ioctl,
A
Arnd Bergmann 已提交
1153 1154
};

1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187
static int tap_get_user_xdp(struct tap_queue *q, struct xdp_buff *xdp)
{
	struct tun_xdp_hdr *hdr = xdp->data_hard_start;
	struct virtio_net_hdr *gso = &hdr->gso;
	int buflen = hdr->buflen;
	int vnet_hdr_len = 0;
	struct tap_dev *tap;
	struct sk_buff *skb;
	int err, depth;

	if (q->flags & IFF_VNET_HDR)
		vnet_hdr_len = READ_ONCE(q->vnet_hdr_sz);

	skb = build_skb(xdp->data_hard_start, buflen);
	if (!skb) {
		err = -ENOMEM;
		goto err;
	}

	skb_reserve(skb, xdp->data - xdp->data_hard_start);
	skb_put(skb, xdp->data_end - xdp->data);

	skb_set_network_header(skb, ETH_HLEN);
	skb_reset_mac_header(skb);
	skb->protocol = eth_hdr(skb)->h_proto;

	if (vnet_hdr_len) {
		err = virtio_net_hdr_to_skb(skb, gso, tap_is_little_endian(q));
		if (err)
			goto err_kfree;
	}

	/* Move network header to the right position for VLAN tagged packets */
1188
	if (eth_type_vlan(skb->protocol) &&
1189 1190 1191 1192 1193 1194 1195
	    __vlan_get_protocol(skb, skb->protocol, &depth) != 0)
		skb_set_network_header(skb, depth);

	rcu_read_lock();
	tap = rcu_dereference(q->tap);
	if (tap) {
		skb->dev = tap->dev;
1196
		skb_probe_transport_header(skb);
1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208
		dev_queue_xmit(skb);
	} else {
		kfree_skb(skb);
	}
	rcu_read_unlock();

	return 0;

err_kfree:
	kfree_skb(skb);
err:
	rcu_read_lock();
1209
	tap = rcu_dereference(q->tap);
1210 1211 1212 1213 1214 1215
	if (tap && tap->count_tx_dropped)
		tap->count_tx_dropped(tap);
	rcu_read_unlock();
	return err;
}

1216 1217
static int tap_sendmsg(struct socket *sock, struct msghdr *m,
		       size_t total_len)
A
Arnd Bergmann 已提交
1218
{
1219
	struct tap_queue *q = container_of(sock, struct tap_queue, sock);
1220
	struct tun_msg_ctl *ctl = m->msg_control;
1221 1222
	struct xdp_buff *xdp;
	int i;
1223

1224 1225
	if (m->msg_controllen == sizeof(struct tun_msg_ctl) &&
	    ctl && ctl->type == TUN_MSG_PTR) {
1226 1227 1228 1229 1230 1231
		for (i = 0; i < ctl->num; i++) {
			xdp = &((struct xdp_buff *)ctl->ptr)[i];
			tap_get_user_xdp(q, xdp);
		}
		return 0;
	}
1232 1233 1234

	return tap_get_user(q, ctl ? ctl->ptr : NULL, &m->msg_iter,
			    m->msg_flags & MSG_DONTWAIT);
A
Arnd Bergmann 已提交
1235 1236
}

1237 1238
static int tap_recvmsg(struct socket *sock, struct msghdr *m,
		       size_t total_len, int flags)
A
Arnd Bergmann 已提交
1239
{
1240
	struct tap_queue *q = container_of(sock, struct tap_queue, sock);
W
Wei Xu 已提交
1241
	struct sk_buff *skb = m->msg_control;
A
Arnd Bergmann 已提交
1242
	int ret;
W
Wei Xu 已提交
1243
	if (flags & ~(MSG_DONTWAIT|MSG_TRUNC)) {
1244
		kfree_skb(skb);
A
Arnd Bergmann 已提交
1245
		return -EINVAL;
W
Wei Xu 已提交
1246 1247
	}
	ret = tap_do_read(q, &m->msg_iter, flags & MSG_DONTWAIT, skb);
1248 1249 1250 1251
	if (ret > total_len) {
		m->msg_flags |= MSG_TRUNC;
		ret = flags & MSG_TRUNC ? ret : total_len;
	}
A
Arnd Bergmann 已提交
1252 1253 1254
	return ret;
}

1255
static int tap_peek_len(struct socket *sock)
J
Jason Wang 已提交
1256
{
1257
	struct tap_queue *q = container_of(sock, struct tap_queue,
J
Jason Wang 已提交
1258
					       sock);
1259
	return PTR_RING_PEEK_CALL(&q->ring, __skb_array_len_with_tag);
J
Jason Wang 已提交
1260 1261
}

A
Arnd Bergmann 已提交
1262
/* Ops structure to mimic raw sockets with tun */
1263 1264 1265 1266
static const struct proto_ops tap_socket_ops = {
	.sendmsg = tap_sendmsg,
	.recvmsg = tap_recvmsg,
	.peek_len = tap_peek_len,
A
Arnd Bergmann 已提交
1267 1268 1269 1270 1271 1272
};

/* Get an underlying socket object from tun file.  Returns error unless file is
 * attached to a device.  The returned object works like a packet socket, it
 * can be used for sock_sendmsg/sock_recvmsg.  The caller is responsible for
 * holding a reference to the file for as long as the socket is in use. */
1273
struct socket *tap_get_socket(struct file *file)
A
Arnd Bergmann 已提交
1274
{
1275 1276
	struct tap_queue *q;
	if (file->f_op != &tap_fops)
A
Arnd Bergmann 已提交
1277 1278 1279 1280 1281 1282
		return ERR_PTR(-EINVAL);
	q = file->private_data;
	if (!q)
		return ERR_PTR(-EBADFD);
	return &q->sock;
}
1283
EXPORT_SYMBOL_GPL(tap_get_socket);
A
Arnd Bergmann 已提交
1284

1285
struct ptr_ring *tap_get_ptr_ring(struct file *file)
J
Jason Wang 已提交
1286 1287 1288 1289 1290 1291 1292 1293
{
	struct tap_queue *q;

	if (file->f_op != &tap_fops)
		return ERR_PTR(-EINVAL);
	q = file->private_data;
	if (!q)
		return ERR_PTR(-EBADFD);
1294
	return &q->ring;
J
Jason Wang 已提交
1295
}
1296
EXPORT_SYMBOL_GPL(tap_get_ptr_ring);
J
Jason Wang 已提交
1297

1298
int tap_queue_resize(struct tap_dev *tap)
J
Jason Wang 已提交
1299
{
1300
	struct net_device *dev = tap->dev;
1301
	struct tap_queue *q;
1302
	struct ptr_ring **rings;
1303
	int n = tap->numqueues;
J
Jason Wang 已提交
1304 1305
	int ret, i = 0;

1306 1307
	rings = kmalloc_array(n, sizeof(*rings), GFP_KERNEL);
	if (!rings)
J
Jason Wang 已提交
1308 1309
		return -ENOMEM;

1310
	list_for_each_entry(q, &tap->queue_list, next)
1311
		rings[i++] = &q->ring;
J
Jason Wang 已提交
1312

1313 1314 1315
	ret = ptr_ring_resize_multiple(rings, n,
				       dev->tx_queue_len, GFP_KERNEL,
				       __skb_array_destroy_skb);
J
Jason Wang 已提交
1316

1317
	kfree(rings);
J
Jason Wang 已提交
1318 1319
	return ret;
}
1320
EXPORT_SYMBOL_GPL(tap_queue_resize);
1321

1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332
static int tap_list_add(dev_t major, const char *device_name)
{
	struct major_info *tap_major;

	tap_major = kzalloc(sizeof(*tap_major), GFP_ATOMIC);
	if (!tap_major)
		return -ENOMEM;

	tap_major->major = MAJOR(major);

	idr_init(&tap_major->minor_idr);
W
WANG Cong 已提交
1333
	spin_lock_init(&tap_major->minor_lock);
1334 1335 1336 1337 1338 1339 1340

	tap_major->device_name = device_name;

	list_add_tail_rcu(&tap_major->next, &major_list);
	return 0;
}

1341 1342
int tap_create_cdev(struct cdev *tap_cdev, dev_t *tap_major,
		    const char *device_name, struct module *module)
1343 1344 1345 1346 1347 1348 1349 1350
{
	int err;

	err = alloc_chrdev_region(tap_major, 0, TAP_NUM_DEVS, device_name);
	if (err)
		goto out1;

	cdev_init(tap_cdev, &tap_fops);
1351
	tap_cdev->owner = module;
1352 1353 1354 1355
	err = cdev_add(tap_cdev, *tap_major, TAP_NUM_DEVS);
	if (err)
		goto out2;

1356 1357 1358
	err =  tap_list_add(*tap_major, device_name);
	if (err)
		goto out3;
1359 1360 1361

	return 0;

1362 1363
out3:
	cdev_del(tap_cdev);
1364 1365 1366 1367 1368
out2:
	unregister_chrdev_region(*tap_major, TAP_NUM_DEVS);
out1:
	return err;
}
1369
EXPORT_SYMBOL_GPL(tap_create_cdev);
1370 1371 1372

void tap_destroy_cdev(dev_t major, struct cdev *tap_cdev)
{
1373 1374
	struct major_info *tap_major, *tmp;

1375 1376
	cdev_del(tap_cdev);
	unregister_chrdev_region(major, TAP_NUM_DEVS);
1377 1378 1379 1380 1381 1382 1383
	list_for_each_entry_safe(tap_major, tmp, &major_list, next) {
		if (tap_major->major == MAJOR(major)) {
			idr_destroy(&tap_major->minor_idr);
			list_del_rcu(&tap_major->next);
			kfree_rcu(tap_major, rcu);
		}
	}
1384
}
1385 1386 1387 1388 1389
EXPORT_SYMBOL_GPL(tap_destroy_cdev);

/* Module authorship and license metadata. */
MODULE_AUTHOR("Arnd Bergmann <arnd@arndb.de>");
MODULE_AUTHOR("Sainath Grandhi <sainath.grandhi@intel.com>");
MODULE_LICENSE("GPL");