macvtap.c 30.2 KB
Newer Older
A
Arnd Bergmann 已提交
1 2
#include <linux/etherdevice.h>
#include <linux/if_macvlan.h>
3
#include <linux/if_vlan.h>
A
Arnd Bergmann 已提交
4 5 6 7 8 9 10 11 12
#include <linux/interrupt.h>
#include <linux/nsproxy.h>
#include <linux/compat.h>
#include <linux/if_tun.h>
#include <linux/module.h>
#include <linux/skbuff.h>
#include <linux/cache.h>
#include <linux/sched.h>
#include <linux/types.h>
13
#include <linux/slab.h>
A
Arnd Bergmann 已提交
14 15
#include <linux/wait.h>
#include <linux/cdev.h>
A
Al Viro 已提交
16
#include <linux/idr.h>
A
Arnd Bergmann 已提交
17
#include <linux/fs.h>
H
Herbert Xu 已提交
18
#include <linux/uio.h>
A
Arnd Bergmann 已提交
19

20
#include <net/ipv6.h>
A
Arnd Bergmann 已提交
21 22 23
#include <net/net_namespace.h>
#include <net/rtnetlink.h>
#include <net/sock.h>
24
#include <linux/virtio_net.h>
A
Arnd Bergmann 已提交
25 26 27 28 29 30 31 32 33 34 35 36 37 38

/*
 * A macvtap queue is the central object of this driver, it connects
 * an open character device to a macvlan interface. There can be
 * multiple queues on one interface, which map back to queues
 * implemented in hardware on the underlying device.
 *
 * macvtap_proto is used to allocate queues through the sock allocation
 * mechanism.
 *
 */
struct macvtap_queue {
	struct sock sk;
	struct socket sock;
39
	struct socket_wq wq;
40
	int vnet_hdr_sz;
41
	struct macvlan_dev __rcu *vlan;
A
Arnd Bergmann 已提交
42
	struct file *file;
43
	unsigned int flags;
J
Jason Wang 已提交
44
	u16 queue_index;
J
Jason Wang 已提交
45 46
	bool enabled;
	struct list_head next;
A
Arnd Bergmann 已提交
47 48
};

49 50 51
/* IFF_* bits that TUNSETIFF lets user space choose; every other bit must
 * equal IFF_NO_PI | IFF_TAP.
 */
#define MACVTAP_FEATURES (IFF_VNET_HDR | IFF_MULTI_QUEUE)

/* Private bit stored in q->flags; macvtap16_to_cpu()/cpu_to_macvtap16()
 * pass its state on to the virtio16 conversion helpers.
 */
#define MACVTAP_VNET_LE 0x80000000
M
Michael S. Tsirkin 已提交
52 53 54

/*
 * Convert a 16-bit virtio header field to a CPU-order u16 for queue @q.
 * Whether MACVTAP_VNET_LE is set in q->flags is forwarded as the first
 * argument of __virtio16_to_cpu().
 */
static inline u16 macvtap16_to_cpu(struct macvtap_queue *q, __virtio16 val)
{
	return __virtio16_to_cpu(q->flags & MACVTAP_VNET_LE, val);
}

/*
 * Convert a CPU-order u16 to a 16-bit virtio header field for queue @q.
 * Whether MACVTAP_VNET_LE is set in q->flags is forwarded as the first
 * argument of __cpu_to_virtio16().
 */
static inline __virtio16 cpu_to_macvtap16(struct macvtap_queue *q, u16 val)
{
	return __cpu_to_virtio16(q->flags & MACVTAP_VNET_LE, val);
}

A
Arnd Bergmann 已提交
63 64 65 66 67 68 69
/*
 * Protocol descriptor passed to sk_alloc() in macvtap_open(); obj_size
 * makes each allocated sock large enough to hold a whole macvtap_queue.
 */
static struct proto macvtap_proto = {
	.obj_size	= sizeof(struct macvtap_queue),
	.owner		= THIS_MODULE,
	.name		= "macvtap",
};

/*
 * Variables for dealing with macvtap device numbers.
 */
72
static dev_t macvtap_major;
/* Upper bound handed to idr_alloc() when picking a minor number. */
#define MACVTAP_NUM_DEVS (1U << MINORBITS)
/* minor_lock serialises every access to minor_idr. */
static DEFINE_MUTEX(minor_lock);
/* Maps an allocated minor number to its macvlan_dev. */
static DEFINE_IDR(minor_idr);

/* Bytes copied into the linear area on the zerocopy path when the vnet
 * header does not supply a hdr_len.
 */
#define GOODCOPY_LEN 128
static struct class *macvtap_class;
static struct cdev macvtap_cdev;

A
Arnd Bergmann 已提交
81 82
static const struct proto_ops macvtap_socket_ops;

83
/* Default tap_features assigned to a new link in macvtap_newlink(). */
#define TUN_OFFLOADS (NETIF_F_HW_CSUM | NETIF_F_TSO_ECN | NETIF_F_TSO | \
		      NETIF_F_TSO6)
/* Device features that set_offload() switches on or off together. */
#define RX_OFFLOADS (NETIF_F_GRO | NETIF_F_LRO)
/* Base feature mask used for every received frame in macvtap_handle_frame(). */
#define TAP_FEATURES (NETIF_F_GSO | NETIF_F_SG)

88 89 90 91 92
/*
 * Return the macvlan_dev stored as @dev's rx handler data. The pointer is
 * fetched with rcu_dereference(), so the caller is expected to be inside an
 * RCU read-side critical section.
 */
static struct macvlan_dev *macvtap_get_vlan_rcu(const struct net_device *dev)
{
	struct macvlan_dev *vlan = rcu_dereference(dev->rx_handler_data);

	return vlan;
}

A
Arnd Bergmann 已提交
93 94
/*
 * RCU usage:
 * The macvtap_queue and the macvlan_dev are loosely coupled, the
 * pointers from one to the other can only be read while rcu_read_lock
 * or rtnl is held.
 *
 * Both the file and the macvlan_dev hold a reference on the macvtap_queue
 * through sock_hold(&q->sk). When the macvlan_dev goes away first,
 * q->vlan becomes inaccessible. When the file gets closed,
 * macvtap_get_queue() fails.
 *
 * There may still be references to the struct sock inside of the
 * queue from outbound SKBs, but these never reference back to the
 * file or the dev. The data structure is freed through __sk_free
 * when both our references and any pending SKBs are gone.
 */

J
Jason Wang 已提交
110
static int macvtap_enable_queue(struct net_device *dev, struct file *file,
A
Arnd Bergmann 已提交
111
				struct macvtap_queue *q)
J
Jason Wang 已提交
112 113 114 115
{
	struct macvlan_dev *vlan = netdev_priv(dev);
	int err = -EINVAL;

116
	ASSERT_RTNL();
J
Jason Wang 已提交
117 118 119 120 121 122 123 124 125 126 127 128 129 130

	if (q->enabled)
		goto out;

	err = 0;
	rcu_assign_pointer(vlan->taps[vlan->numvtaps], q);
	q->queue_index = vlan->numvtaps;
	q->enabled = true;

	vlan->numvtaps++;
out:
	return err;
}

131
/* Requires RTNL */
J
Jason Wang 已提交
132 133
static int macvtap_set_queue(struct net_device *dev, struct file *file,
			     struct macvtap_queue *q)
A
Arnd Bergmann 已提交
134 135 136
{
	struct macvlan_dev *vlan = netdev_priv(dev);

J
Jason Wang 已提交
137
	if (vlan->numqueues == MAX_MACVTAP_QUEUES)
138
		return -EBUSY;
A
Arnd Bergmann 已提交
139

140
	rcu_assign_pointer(q->vlan, vlan);
J
Jason Wang 已提交
141
	rcu_assign_pointer(vlan->taps[vlan->numvtaps], q);
142
	sock_hold(&q->sk);
A
Arnd Bergmann 已提交
143 144

	q->file = file;
J
Jason Wang 已提交
145
	q->queue_index = vlan->numvtaps;
J
Jason Wang 已提交
146
	q->enabled = true;
147
	file->private_data = q;
J
Jason Wang 已提交
148
	list_add_tail(&q->next, &vlan->queue_list);
A
Arnd Bergmann 已提交
149

150
	vlan->numvtaps++;
J
Jason Wang 已提交
151
	vlan->numqueues++;
152

153
	return 0;
A
Arnd Bergmann 已提交
154 155
}

156
static int macvtap_disable_queue(struct macvtap_queue *q)
J
Jason Wang 已提交
157 158 159 160
{
	struct macvlan_dev *vlan;
	struct macvtap_queue *nq;

161
	ASSERT_RTNL();
J
Jason Wang 已提交
162 163 164
	if (!q->enabled)
		return -EINVAL;

165 166
	vlan = rtnl_dereference(q->vlan);

J
Jason Wang 已提交
167 168 169
	if (vlan) {
		int index = q->queue_index;
		BUG_ON(index >= vlan->numvtaps);
170
		nq = rtnl_dereference(vlan->taps[vlan->numvtaps - 1]);
J
Jason Wang 已提交
171 172 173 174 175 176 177 178 179 180 181 182
		nq->queue_index = index;

		rcu_assign_pointer(vlan->taps[index], nq);
		RCU_INIT_POINTER(vlan->taps[vlan->numvtaps - 1], NULL);
		q->enabled = false;

		vlan->numvtaps--;
	}

	return 0;
}

A
Arnd Bergmann 已提交
183
/*
184 185 186
 * The file owning the queue got closed, give up both
 * the reference that the files holds as well as the
 * one from the macvlan_dev if that still exists.
A
Arnd Bergmann 已提交
187 188 189 190
 *
 * Using the spinlock makes sure that we don't get
 * to the queue again after destroying it.
 */
191
static void macvtap_put_queue(struct macvtap_queue *q)
A
Arnd Bergmann 已提交
192
{
193
	struct macvlan_dev *vlan;
A
Arnd Bergmann 已提交
194

195 196 197
	rtnl_lock();
	vlan = rtnl_dereference(q->vlan);

198
	if (vlan) {
J
Jason Wang 已提交
199
		if (q->enabled)
200
			BUG_ON(macvtap_disable_queue(q));
J
Jason Wang 已提交
201

J
Jason Wang 已提交
202
		vlan->numqueues--;
203
		RCU_INIT_POINTER(q->vlan, NULL);
204
		sock_put(&q->sk);
J
Jason Wang 已提交
205
		list_del_init(&q->next);
A
Arnd Bergmann 已提交
206 207
	}

208
	rtnl_unlock();
A
Arnd Bergmann 已提交
209 210 211 212 213 214

	synchronize_rcu();
	sock_put(&q->sk);
}

/*
215 216 217 218 219
 * Select a queue based on the rxq of the device on which this packet
 * arrived. If the incoming device is not mq, calculate a flow hash
 * to select a queue. If all fails, find the first available queue.
 * Cache vlan->numvtaps since it can become zero during the execution
 * of this function.
A
Arnd Bergmann 已提交
220 221 222 223 224
 */
static struct macvtap_queue *macvtap_get_queue(struct net_device *dev,
					       struct sk_buff *skb)
{
	struct macvlan_dev *vlan = netdev_priv(dev);
225
	struct macvtap_queue *tap = NULL;
J
Jason Wang 已提交
226 227 228 229 230
	/* Access to taps array is protected by rcu, but access to numvtaps
	 * isn't. Below we use it to lookup a queue, but treat it as a hint
	 * and validate that the result isn't NULL - in case we are
	 * racing against queue removal.
	 */
231
	int numvtaps = ACCESS_ONCE(vlan->numvtaps);
232 233 234 235 236
	__u32 rxq;

	if (!numvtaps)
		goto out;

237
	/* Check if we can use flow to select a queue */
238
	rxq = skb_get_hash(skb);
239 240
	if (rxq) {
		tap = rcu_dereference(vlan->taps[rxq % numvtaps]);
J
Jason Wang 已提交
241
		goto out;
242 243
	}

244 245
	if (likely(skb_rx_queue_recorded(skb))) {
		rxq = skb_get_rx_queue(skb);
A
Arnd Bergmann 已提交
246

247 248 249 250
		while (unlikely(rxq >= numvtaps))
			rxq -= numvtaps;

		tap = rcu_dereference(vlan->taps[rxq]);
J
Jason Wang 已提交
251
		goto out;
252 253
	}

J
Jason Wang 已提交
254
	tap = rcu_dereference(vlan->taps[0]);
255 256
out:
	return tap;
A
Arnd Bergmann 已提交
257 258
}

259 260
/*
 * The net_device is going away, give up the reference
261 262
 * that it holds on all queues and safely set the pointer
 * from the queues to NULL.
263
 */
A
Arnd Bergmann 已提交
264 265 266
static void macvtap_del_queues(struct net_device *dev)
{
	struct macvlan_dev *vlan = netdev_priv(dev);
J
Jason Wang 已提交
267
	struct macvtap_queue *q, *tmp, *qlist[MAX_MACVTAP_QUEUES];
268
	int i, j = 0;
269

270
	ASSERT_RTNL();
J
Jason Wang 已提交
271 272
	list_for_each_entry_safe(q, tmp, &vlan->queue_list, next) {
		list_del_init(&q->next);
J
Jason Wang 已提交
273 274
		qlist[j++] = q;
		RCU_INIT_POINTER(q->vlan, NULL);
J
Jason Wang 已提交
275 276 277
		if (q->enabled)
			vlan->numvtaps--;
		vlan->numqueues--;
278
	}
J
Jason Wang 已提交
279 280 281 282
	for (i = 0; i < vlan->numvtaps; i++)
		RCU_INIT_POINTER(vlan->taps[i], NULL);
	BUG_ON(vlan->numvtaps);
	BUG_ON(vlan->numqueues);
283 284
	/* guarantee that any future macvtap_set_queue will fail */
	vlan->numvtaps = MAX_MACVTAP_QUEUES;
285 286 287

	for (--j; j >= 0; j--)
		sock_put(&qlist[j]->sk);
A
Arnd Bergmann 已提交
288 289
}

290
static rx_handler_result_t macvtap_handle_frame(struct sk_buff **pskb)
A
Arnd Bergmann 已提交
291
{
292 293 294 295
	struct sk_buff *skb = *pskb;
	struct net_device *dev = skb->dev;
	struct macvlan_dev *vlan;
	struct macvtap_queue *q;
296 297
	netdev_features_t features = TAP_FEATURES;

298 299 300 301 302
	vlan = macvtap_get_vlan_rcu(dev);
	if (!vlan)
		return RX_HANDLER_PASS;

	q = macvtap_get_queue(dev, skb);
A
Arnd Bergmann 已提交
303
	if (!q)
304
		return RX_HANDLER_PASS;
H
Herbert Xu 已提交
305 306 307

	if (skb_queue_len(&q->sk.sk_receive_queue) >= dev->tx_queue_len)
		goto drop;
A
Arnd Bergmann 已提交
308

309 310
	skb_push(skb, ETH_HLEN);

311
	/* Apply the forward feature mask so that we perform segmentation
312 313
	 * according to users wishes.  This only works if VNET_HDR is
	 * enabled.
314
	 */
315 316
	if (q->flags & IFF_VNET_HDR)
		features |= vlan->tap_features;
T
Tom Herbert 已提交
317
	if (netif_needs_gso(dev, skb, features)) {
318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336
		struct sk_buff *segs = __skb_gso_segment(skb, features, false);

		if (IS_ERR(segs))
			goto drop;

		if (!segs) {
			skb_queue_tail(&q->sk.sk_receive_queue, skb);
			goto wake_up;
		}

		kfree_skb(skb);
		while (segs) {
			struct sk_buff *nskb = segs->next;

			segs->next = NULL;
			skb_queue_tail(&q->sk.sk_receive_queue, segs);
			segs = nskb;
		}
	} else {
337 338 339 340 341 342 343 344 345
		/* If we receive a partial checksum and the tap side
		 * doesn't support checksum offload, compute the checksum.
		 * Note: it doesn't matter which checksum feature to
		 *        check, we either support them all or none.
		 */
		if (skb->ip_summed == CHECKSUM_PARTIAL &&
		    !(features & NETIF_F_ALL_CSUM) &&
		    skb_checksum_help(skb))
			goto drop;
346 347 348 349
		skb_queue_tail(&q->sk.sk_receive_queue, skb);
	}

wake_up:
E
Eric Dumazet 已提交
350
	wake_up_interruptible_poll(sk_sleep(&q->sk), POLLIN | POLLRDNORM | POLLRDBAND);
351
	return RX_HANDLER_CONSUMED;
H
Herbert Xu 已提交
352 353

drop:
354 355
	/* Count errors/drops only here, thus don't care about args. */
	macvlan_count_rx(vlan, 0, 0, 0);
H
Herbert Xu 已提交
356
	kfree_skb(skb);
357
	return RX_HANDLER_CONSUMED;
A
Arnd Bergmann 已提交
358 359
}

360 361 362 363 364
static int macvtap_get_minor(struct macvlan_dev *vlan)
{
	int retval = -ENOMEM;

	mutex_lock(&minor_lock);
T
Tejun Heo 已提交
365 366 367 368
	retval = idr_alloc(&minor_idr, vlan, 1, MACVTAP_NUM_DEVS, GFP_KERNEL);
	if (retval >= 0) {
		vlan->minor = retval;
	} else if (retval == -ENOSPC) {
369 370 371 372
		printk(KERN_ERR "too many macvtap devices\n");
		retval = -EINVAL;
	}
	mutex_unlock(&minor_lock);
T
Tejun Heo 已提交
373
	return retval < 0 ? retval : 0;
374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400
}

/*
 * Release the minor number held by @vlan, if any, and reset vlan->minor
 * to 0. Does nothing when no minor is assigned.
 */
static void macvtap_free_minor(struct macvlan_dev *vlan)
{
	mutex_lock(&minor_lock);
	if (!vlan->minor)
		goto unlock;

	idr_remove(&minor_idr, vlan->minor);
	vlan->minor = 0;
unlock:
	mutex_unlock(&minor_lock);
}

/*
 * Look up the net_device registered under @minor.
 *
 * On a hit the device is returned with a reference taken via dev_hold();
 * the caller has to release it with dev_put(). Returns NULL when the minor
 * is not in use.
 */
static struct net_device *dev_get_by_macvtap_minor(int minor)
{
	struct net_device *netdev = NULL;
	struct macvlan_dev *found;

	mutex_lock(&minor_lock);
	found = idr_find(&minor_idr, minor);
	if (!found)
		goto unlock;

	netdev = found->dev;
	dev_hold(netdev);
unlock:
	mutex_unlock(&minor_lock);
	return netdev;
}

A
Arnd Bergmann 已提交
401 402 403 404 405
static int macvtap_newlink(struct net *src_net,
			   struct net_device *dev,
			   struct nlattr *tb[],
			   struct nlattr *data[])
{
J
Jason Wang 已提交
406
	struct macvlan_dev *vlan = netdev_priv(dev);
407 408
	int err;

J
Jason Wang 已提交
409 410
	INIT_LIST_HEAD(&vlan->queue_list);

411 412 413 414 415
	/* Since macvlan supports all offloads by default, make
	 * tap support all offloads also.
	 */
	vlan->tap_features = TUN_OFFLOADS;

416 417 418 419
	err = netdev_rx_handler_register(dev, macvtap_handle_frame, vlan);
	if (err)
		return err;

420 421 422
	/* Don't put anything that may fail after macvlan_common_newlink
	 * because we can't undo what it does.
	 */
423
	return macvlan_common_newlink(src_net, dev, tb, data);
A
Arnd Bergmann 已提交
424 425 426 427 428
}

/*
 * rtnl_link_ops .dellink handler: detach the rx handler, release every
 * queue still attached to the device, then run the macvlan teardown.
 */
static void macvtap_dellink(struct net_device *dev,
			    struct list_head *head)
{
	netdev_rx_handler_unregister(dev);
	macvtap_del_queues(dev);
	macvlan_dellink(dev, head);
}

H
Herbert Xu 已提交
434 435 436 437 438 439
/*
 * rtnl_link_ops .setup handler: run the common macvlan setup, then set
 * tx_queue_len to TUN_READQ_SIZE. macvtap_handle_frame() uses tx_queue_len
 * as the limit on each queue's receive backlog.
 */
static void macvtap_setup(struct net_device *dev)
{
	macvlan_common_setup(dev);
	dev->tx_queue_len = TUN_READQ_SIZE;
}

A
Arnd Bergmann 已提交
440 441
/* rtnetlink operations for link kind "macvtap". */
static struct rtnl_link_ops macvtap_link_ops __read_mostly = {
	.kind		= "macvtap",
	.setup		= macvtap_setup,
	.newlink	= macvtap_newlink,
	.dellink	= macvtap_dellink,
};


static void macvtap_sock_write_space(struct sock *sk)
{
450 451
	wait_queue_head_t *wqueue;

A
Arnd Bergmann 已提交
452 453 454 455
	if (!sock_writeable(sk) ||
	    !test_and_clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags))
		return;

456 457 458
	wqueue = sk_sleep(sk);
	if (wqueue && waitqueue_active(wqueue))
		wake_up_interruptible_poll(wqueue, POLLOUT | POLLWRNORM | POLLWRBAND);
A
Arnd Bergmann 已提交
459 460
}

461 462 463 464 465
/*
 * sk_destruct callback installed in macvtap_open(): discard every skb
 * still sitting on the socket's receive queue.
 */
static void macvtap_sock_destruct(struct sock *sk)
{
	struct sk_buff_head *pending = &sk->sk_receive_queue;

	skb_queue_purge(pending);
}

A
Arnd Bergmann 已提交
466 467 468
static int macvtap_open(struct inode *inode, struct file *file)
{
	struct net *net = current->nsproxy->net_ns;
469
	struct net_device *dev;
A
Arnd Bergmann 已提交
470
	struct macvtap_queue *q;
471
	int err = -ENODEV;
A
Arnd Bergmann 已提交
472

473 474
	rtnl_lock();
	dev = dev_get_by_macvtap_minor(iminor(inode));
A
Arnd Bergmann 已提交
475 476 477 478 479 480 481 482 483
	if (!dev)
		goto out;

	err = -ENOMEM;
	q = (struct macvtap_queue *)sk_alloc(net, AF_UNSPEC, GFP_KERNEL,
					     &macvtap_proto);
	if (!q)
		goto out;

J
Jason Wang 已提交
484
	RCU_INIT_POINTER(q->sock.wq, &q->wq);
485
	init_waitqueue_head(&q->wq.wait);
A
Arnd Bergmann 已提交
486 487
	q->sock.type = SOCK_RAW;
	q->sock.state = SS_CONNECTED;
A
Arnd Bergmann 已提交
488 489
	q->sock.file = file;
	q->sock.ops = &macvtap_socket_ops;
A
Arnd Bergmann 已提交
490 491
	sock_init_data(&q->sock, &q->sk);
	q->sk.sk_write_space = macvtap_sock_write_space;
492
	q->sk.sk_destruct = macvtap_sock_destruct;
493
	q->flags = IFF_VNET_HDR | IFF_NO_PI | IFF_TAP;
494
	q->vnet_hdr_sz = sizeof(struct virtio_net_hdr);
A
Arnd Bergmann 已提交
495

496 497 498
	/*
	 * so far only KVM virtio_net uses macvtap, enable zero copy between
	 * guest kernel and host kernel when lower device supports zerocopy
499 500 501
	 *
	 * The macvlan supports zerocopy iff the lower device supports zero
	 * copy so we don't have to look at the lower device directly.
502
	 */
503 504
	if ((dev->features & NETIF_F_HIGHDMA) && (dev->features & NETIF_F_SG))
		sock_set_flag(&q->sk, SOCK_ZEROCOPY);
505

A
Arnd Bergmann 已提交
506 507 508 509 510 511 512 513
	err = macvtap_set_queue(dev, file, q);
	if (err)
		sock_put(&q->sk);

out:
	if (dev)
		dev_put(dev);

514
	rtnl_unlock();
A
Arnd Bergmann 已提交
515 516 517 518 519
	return err;
}

static int macvtap_release(struct inode *inode, struct file *file)
{
520 521
	struct macvtap_queue *q = file->private_data;
	macvtap_put_queue(q);
A
Arnd Bergmann 已提交
522 523 524 525 526
	return 0;
}

/*
 * poll() handler.
 *
 * Returns POLLERR when the file has no queue. Otherwise reports
 * POLLIN | POLLRDNORM when the receive queue is non-empty and
 * POLLOUT | POLLWRNORM when the socket is writeable.
 */
static unsigned int macvtap_poll(struct file *file, poll_table * wait)
{
	struct macvtap_queue *q = file->private_data;
	unsigned int mask = POLLERR;

	if (!q)
		goto out;

	mask = 0;
	poll_wait(file, &q->wq.wait, wait);

	if (!skb_queue_empty(&q->sk.sk_receive_queue))
		mask |= POLLIN | POLLRDNORM;

	/* If not writeable, set SOCK_ASYNC_NOSPACE and, when this call is
	 * the one that set it, check writeability once more.
	 */
	if (sock_writeable(&q->sk) ||
	    (!test_and_set_bit(SOCK_ASYNC_NOSPACE, &q->sock.flags) &&
	     sock_writeable(&q->sk)))
		mask |= POLLOUT | POLLWRNORM;

out:
	return mask;
}

548 549 550 551 552 553 554 555 556 557 558
/*
 * Allocate an skb of total length @len for socket @sk.
 *
 * @prepad: headroom reserved in front of the data
 * @len:    total packet length
 * @linear: requested size of the linear part; the whole packet is made
 *          linear when @linear is 0 or when @prepad + @len is below
 *          PAGE_SIZE
 * @noblock, @err: passed through to sock_alloc_send_pskb()
 *
 * Returns the skb with its linear and paged lengths already set, or NULL
 * on allocation failure.
 */
static inline struct sk_buff *macvtap_alloc_skb(struct sock *sk, size_t prepad,
						size_t len, size_t linear,
						int noblock, int *err)
{
	struct sk_buff *skb;

	/* Under a page?  Don't bother with paged skb. */
	if (prepad + len < PAGE_SIZE || !linear)
		linear = len;

	skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock,
				   err, 0);
	if (!skb)
		return NULL;

	skb_reserve(skb, prepad);
	skb_put(skb, linear);
	skb->data_len = len - linear;
	skb->len += len - linear;

	return skb;
}

/*
 * macvtap_skb_from_vnet_hdr and macvtap_skb_to_vnet_hdr should
 * be shared with the tun/tap driver.
 */
M
Michael S. Tsirkin 已提交
575 576
static int macvtap_skb_from_vnet_hdr(struct macvtap_queue *q,
				     struct sk_buff *skb,
577 578 579 580 581 582 583 584 585 586 587 588
				     struct virtio_net_hdr *vnet_hdr)
{
	unsigned short gso_type = 0;
	if (vnet_hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE) {
		switch (vnet_hdr->gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
		case VIRTIO_NET_HDR_GSO_TCPV4:
			gso_type = SKB_GSO_TCPV4;
			break;
		case VIRTIO_NET_HDR_GSO_TCPV6:
			gso_type = SKB_GSO_TCPV6;
			break;
		case VIRTIO_NET_HDR_GSO_UDP:
589 590
			pr_warn_once("macvtap: %s: using disabled UFO feature; please fix this program\n",
				     current->comm);
591
			gso_type = SKB_GSO_UDP;
592 593
			if (skb->protocol == htons(ETH_P_IPV6))
				ipv6_proxy_select_ident(skb);
594 595 596 597 598 599 600 601 602 603 604 605 606
			break;
		default:
			return -EINVAL;
		}

		if (vnet_hdr->gso_type & VIRTIO_NET_HDR_GSO_ECN)
			gso_type |= SKB_GSO_TCP_ECN;

		if (vnet_hdr->gso_size == 0)
			return -EINVAL;
	}

	if (vnet_hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
M
Michael S. Tsirkin 已提交
607 608
		if (!skb_partial_csum_set(skb, macvtap16_to_cpu(q, vnet_hdr->csum_start),
					  macvtap16_to_cpu(q, vnet_hdr->csum_offset)))
609 610 611 612
			return -EINVAL;
	}

	if (vnet_hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE) {
M
Michael S. Tsirkin 已提交
613
		skb_shinfo(skb)->gso_size = macvtap16_to_cpu(q, vnet_hdr->gso_size);
614
		skb_shinfo(skb)->gso_type = gso_type;
615 616 617 618 619 620 621 622

		/* Header must be checked, and gso_segs computed. */
		skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
		skb_shinfo(skb)->gso_segs = 0;
	}
	return 0;
}

M
Michael S. Tsirkin 已提交
623 624 625
static void macvtap_skb_to_vnet_hdr(struct macvtap_queue *q,
				    const struct sk_buff *skb,
				    struct virtio_net_hdr *vnet_hdr)
626 627 628 629 630 631 632
{
	memset(vnet_hdr, 0, sizeof(*vnet_hdr));

	if (skb_is_gso(skb)) {
		struct skb_shared_info *sinfo = skb_shinfo(skb);

		/* This is a hint as to how much should be linear. */
M
Michael S. Tsirkin 已提交
633 634
		vnet_hdr->hdr_len = cpu_to_macvtap16(q, skb_headlen(skb));
		vnet_hdr->gso_size = cpu_to_macvtap16(q, sinfo->gso_size);
635 636 637 638 639 640 641 642 643 644 645 646 647
		if (sinfo->gso_type & SKB_GSO_TCPV4)
			vnet_hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
		else if (sinfo->gso_type & SKB_GSO_TCPV6)
			vnet_hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
		else
			BUG();
		if (sinfo->gso_type & SKB_GSO_TCP_ECN)
			vnet_hdr->gso_type |= VIRTIO_NET_HDR_GSO_ECN;
	} else
		vnet_hdr->gso_type = VIRTIO_NET_HDR_GSO_NONE;

	if (skb->ip_summed == CHECKSUM_PARTIAL) {
		vnet_hdr->flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
648
		if (skb_vlan_tag_present(skb))
M
Michael S. Tsirkin 已提交
649 650 651 652 653 654
			vnet_hdr->csum_start = cpu_to_macvtap16(q,
				skb_checksum_start_offset(skb) + VLAN_HLEN);
		else
			vnet_hdr->csum_start = cpu_to_macvtap16(q,
				skb_checksum_start_offset(skb));
		vnet_hdr->csum_offset = cpu_to_macvtap16(q, skb->csum_offset);
655 656
	} else if (skb->ip_summed == CHECKSUM_UNNECESSARY) {
		vnet_hdr->flags = VIRTIO_NET_HDR_F_DATA_VALID;
657 658 659
	} /* else everything is zero */
}

A
Arnd Bergmann 已提交
660
/* Get packet from user space buffer */
661
static ssize_t macvtap_get_user(struct macvtap_queue *q, struct msghdr *m,
662
				struct iov_iter *from, int noblock)
A
Arnd Bergmann 已提交
663
{
664
	int good_linear = SKB_MAX_HEAD(NET_IP_ALIGN);
A
Arnd Bergmann 已提交
665
	struct sk_buff *skb;
666
	struct macvlan_dev *vlan;
667
	unsigned long total_len = iov_iter_count(from);
668
	unsigned long len = total_len;
A
Arnd Bergmann 已提交
669
	int err;
670 671
	struct virtio_net_hdr vnet_hdr = { 0 };
	int vnet_hdr_len = 0;
672
	int copylen = 0;
673
	bool zerocopy = false;
674
	size_t linear;
675
	ssize_t n;
676 677

	if (q->flags & IFF_VNET_HDR) {
678
		vnet_hdr_len = q->vnet_hdr_sz;
679 680

		err = -EINVAL;
681
		if (len < vnet_hdr_len)
682
			goto err;
683
		len -= vnet_hdr_len;
684

685 686 687
		err = -EFAULT;
		n = copy_from_iter(&vnet_hdr, sizeof(vnet_hdr), from);
		if (n != sizeof(vnet_hdr))
688
			goto err;
689
		iov_iter_advance(from, vnet_hdr_len - sizeof(vnet_hdr));
690
		if ((vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
M
Michael S. Tsirkin 已提交
691 692 693 694 695 696
		     macvtap16_to_cpu(q, vnet_hdr.csum_start) +
		     macvtap16_to_cpu(q, vnet_hdr.csum_offset) + 2 >
			     macvtap16_to_cpu(q, vnet_hdr.hdr_len))
			vnet_hdr.hdr_len = cpu_to_macvtap16(q,
				 macvtap16_to_cpu(q, vnet_hdr.csum_start) +
				 macvtap16_to_cpu(q, vnet_hdr.csum_offset) + 2);
697
		err = -EINVAL;
M
Michael S. Tsirkin 已提交
698
		if (macvtap16_to_cpu(q, vnet_hdr.hdr_len) > len)
699 700
			goto err;
	}
A
Arnd Bergmann 已提交
701

702
	err = -EINVAL;
A
Arnd Bergmann 已提交
703
	if (unlikely(len < ETH_HLEN))
704
		goto err;
A
Arnd Bergmann 已提交
705

706
	if (m && m->msg_control && sock_flag(&q->sk, SOCK_ZEROCOPY)) {
707 708
		struct iov_iter i;

M
Michael S. Tsirkin 已提交
709 710
		copylen = vnet_hdr.hdr_len ?
			macvtap16_to_cpu(q, vnet_hdr.hdr_len) : GOODCOPY_LEN;
711 712
		if (copylen > good_linear)
			copylen = good_linear;
713
		linear = copylen;
714 715 716
		i = *from;
		iov_iter_advance(&i, copylen);
		if (iov_iter_npages(&i, INT_MAX) <= MAX_SKB_FRAGS)
717 718 719 720
			zerocopy = true;
	}

	if (!zerocopy) {
721
		copylen = len;
M
Michael S. Tsirkin 已提交
722
		if (macvtap16_to_cpu(q, vnet_hdr.hdr_len) > good_linear)
723 724
			linear = good_linear;
		else
M
Michael S. Tsirkin 已提交
725
			linear = macvtap16_to_cpu(q, vnet_hdr.hdr_len);
726
	}
727 728

	skb = macvtap_alloc_skb(&q->sk, NET_IP_ALIGN, copylen,
729
				linear, noblock, &err);
730 731
	if (!skb)
		goto err;
A
Arnd Bergmann 已提交
732

733
	if (zerocopy)
734
		err = zerocopy_sg_from_iter(skb, from);
735
	else {
736
		err = skb_copy_datagram_from_iter(skb, 0, from, len);
737 738 739 740 741 742
		if (!err && m && m->msg_control) {
			struct ubuf_info *uarg = m->msg_control;
			uarg->callback(uarg, false);
		}
	}

743
	if (err)
744
		goto err_kfree;
A
Arnd Bergmann 已提交
745 746

	skb_set_network_header(skb, ETH_HLEN);
747 748 749 750
	skb_reset_mac_header(skb);
	skb->protocol = eth_hdr(skb)->h_proto;

	if (vnet_hdr_len) {
M
Michael S. Tsirkin 已提交
751
		err = macvtap_skb_from_vnet_hdr(q, skb, &vnet_hdr);
752 753 754 755
		if (err)
			goto err_kfree;
	}

756
	skb_probe_transport_header(skb, ETH_HLEN);
757

758 759
	rcu_read_lock();
	vlan = rcu_dereference(q->vlan);
760
	/* copy skb_ubuf_info for callback when skb has no error */
761
	if (zerocopy) {
762
		skb_shinfo(skb)->destructor_arg = m->msg_control;
763
		skb_shinfo(skb)->tx_flags |= SKBTX_DEV_ZEROCOPY;
764
		skb_shinfo(skb)->tx_flags |= SKBTX_SHARED_FRAG;
765
	}
E
Eric Dumazet 已提交
766
	if (vlan) {
767 768
		skb->dev = vlan->dev;
		dev_queue_xmit(skb);
E
Eric Dumazet 已提交
769
	} else {
770
		kfree_skb(skb);
E
Eric Dumazet 已提交
771
	}
772
	rcu_read_unlock();
A
Arnd Bergmann 已提交
773

774
	return total_len;
775

776 777 778
err_kfree:
	kfree_skb(skb);

779
err:
780 781
	rcu_read_lock();
	vlan = rcu_dereference(q->vlan);
782
	if (vlan)
783
		this_cpu_inc(vlan->pcpu_stats->tx_dropped);
784
	rcu_read_unlock();
785 786

	return err;
A
Arnd Bergmann 已提交
787 788
}

789
static ssize_t macvtap_write_iter(struct kiocb *iocb, struct iov_iter *from)
A
Arnd Bergmann 已提交
790 791
{
	struct file *file = iocb->ki_filp;
792
	struct macvtap_queue *q = file->private_data;
A
Arnd Bergmann 已提交
793

794
	return macvtap_get_user(q, NULL, from, file->f_flags & O_NONBLOCK);
A
Arnd Bergmann 已提交
795 796 797 798 799
}

/* Put packet to the user space buffer */
static ssize_t macvtap_put_user(struct macvtap_queue *q,
				const struct sk_buff *skb,
H
Herbert Xu 已提交
800
				struct iov_iter *iter)
A
Arnd Bergmann 已提交
801 802
{
	int ret;
803
	int vnet_hdr_len = 0;
804
	int vlan_offset = 0;
H
Herbert Xu 已提交
805
	int total;
806 807 808

	if (q->flags & IFF_VNET_HDR) {
		struct virtio_net_hdr vnet_hdr;
809
		vnet_hdr_len = q->vnet_hdr_sz;
H
Herbert Xu 已提交
810
		if (iov_iter_count(iter) < vnet_hdr_len)
811 812
			return -EINVAL;

M
Michael S. Tsirkin 已提交
813
		macvtap_skb_to_vnet_hdr(q, skb, &vnet_hdr);
814

H
Herbert Xu 已提交
815 816
		if (copy_to_iter(&vnet_hdr, sizeof(vnet_hdr), iter) !=
		    sizeof(vnet_hdr))
817
			return -EFAULT;
818 819

		iov_iter_advance(iter, vnet_hdr_len - sizeof(vnet_hdr));
820
	}
H
Herbert Xu 已提交
821
	total = vnet_hdr_len;
J
Jason Wang 已提交
822
	total += skb->len;
823

824
	if (skb_vlan_tag_present(skb)) {
825 826 827 828
		struct {
			__be16 h_vlan_proto;
			__be16 h_vlan_TCI;
		} veth;
829
		veth.h_vlan_proto = skb->vlan_proto;
830
		veth.h_vlan_TCI = htons(skb_vlan_tag_get(skb));
831 832

		vlan_offset = offsetof(struct vlan_ethhdr, h_vlan_proto);
J
Jason Wang 已提交
833
		total += VLAN_HLEN;
834

H
Herbert Xu 已提交
835 836
		ret = skb_copy_datagram_iter(skb, 0, iter, vlan_offset);
		if (ret || !iov_iter_count(iter))
837 838
			goto done;

H
Herbert Xu 已提交
839 840
		ret = copy_to_iter(&veth, sizeof(veth), iter);
		if (ret != sizeof(veth) || !iov_iter_count(iter))
841 842
			goto done;
	}
A
Arnd Bergmann 已提交
843

H
Herbert Xu 已提交
844 845
	ret = skb_copy_datagram_iter(skb, vlan_offset, iter,
				     skb->len - vlan_offset);
A
Arnd Bergmann 已提交
846

847
done:
J
Jason Wang 已提交
848
	return ret ? ret : total;
A
Arnd Bergmann 已提交
849 850
}

851
static ssize_t macvtap_do_read(struct macvtap_queue *q,
A
Al Viro 已提交
852
			       struct iov_iter *to,
A
Arnd Bergmann 已提交
853
			       int noblock)
A
Arnd Bergmann 已提交
854
{
855
	DEFINE_WAIT(wait);
A
Arnd Bergmann 已提交
856
	struct sk_buff *skb;
A
Arnd Bergmann 已提交
857
	ssize_t ret = 0;
A
Arnd Bergmann 已提交
858

A
Al Viro 已提交
859 860 861 862
	if (!iov_iter_count(to))
		return 0;

	while (1) {
863 864 865
		if (!noblock)
			prepare_to_wait(sk_sleep(&q->sk), &wait,
					TASK_INTERRUPTIBLE);
A
Arnd Bergmann 已提交
866 867 868

		/* Read frames from the queue */
		skb = skb_dequeue(&q->sk.sk_receive_queue);
A
Al Viro 已提交
869 870 871 872 873
		if (skb)
			break;
		if (noblock) {
			ret = -EAGAIN;
			break;
A
Arnd Bergmann 已提交
874
		}
A
Al Viro 已提交
875 876 877 878 879 880 881 882 883
		if (signal_pending(current)) {
			ret = -ERESTARTSYS;
			break;
		}
		/* Nothing to read, let's sleep */
		schedule();
	}
	if (skb) {
		ret = macvtap_put_user(q, skb, to);
884 885 886 887
		if (unlikely(ret < 0))
			kfree_skb(skb);
		else
			consume_skb(skb);
A
Arnd Bergmann 已提交
888
	}
889 890
	if (!noblock)
		finish_wait(sk_sleep(&q->sk), &wait);
A
Arnd Bergmann 已提交
891 892 893
	return ret;
}

A
Al Viro 已提交
894
static ssize_t macvtap_read_iter(struct kiocb *iocb, struct iov_iter *to)
A
Arnd Bergmann 已提交
895 896 897
{
	struct file *file = iocb->ki_filp;
	struct macvtap_queue *q = file->private_data;
A
Al Viro 已提交
898
	ssize_t len = iov_iter_count(to), ret;
A
Arnd Bergmann 已提交
899

A
Al Viro 已提交
900
	ret = macvtap_do_read(q, to, file->f_flags & O_NONBLOCK);
J
Jason Wang 已提交
901
	ret = min_t(ssize_t, ret, len);
902 903
	if (ret > 0)
		iocb->ki_pos = ret;
A
Arnd Bergmann 已提交
904 905 906
	return ret;
}

907 908 909 910
static struct macvlan_dev *macvtap_get_vlan(struct macvtap_queue *q)
{
	struct macvlan_dev *vlan;

911 912
	ASSERT_RTNL();
	vlan = rtnl_dereference(q->vlan);
913 914 915 916 917 918 919 920 921 922 923
	if (vlan)
		dev_hold(vlan->dev);

	return vlan;
}

/* Drop the net_device reference taken by macvtap_get_vlan(). */
static void macvtap_put_vlan(struct macvlan_dev *vlan)
{
	struct net_device *netdev = vlan->dev;

	dev_put(netdev);
}

J
Jason Wang 已提交
924 925 926 927 928 929 930 931 932 933 934 935 936 937
/*
 * Backend of the TUNSETQUEUE ioctl: enable the file's queue when @flags
 * has IFF_ATTACH_QUEUE, disable it when @flags has IFF_DETACH_QUEUE.
 *
 * Caller must hold RTNL (asserted in macvtap_get_vlan()). Returns -EINVAL
 * when the queue has no device or neither flag is set; otherwise the
 * result of the enable/disable helper.
 */
static int macvtap_ioctl_set_queue(struct file *file, unsigned int flags)
{
	struct macvtap_queue *q = file->private_data;
	struct macvlan_dev *vlan;
	int ret;

	vlan = macvtap_get_vlan(q);
	if (!vlan)
		return -EINVAL;

	if (flags & IFF_ATTACH_QUEUE)
		ret = macvtap_enable_queue(vlan->dev, file, q);
	else if (flags & IFF_DETACH_QUEUE)
		ret = macvtap_disable_queue(q);
	else
		ret = -EINVAL;

	macvtap_put_vlan(vlan);
	return ret;
}

945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977
/*
 * Translate the TUN_F_* offload bits in @arg into device features for the
 * macvlan device behind @q and apply them.
 *
 * TSO bits are only honoured together with TUN_F_CSUM. The queue's device
 * is read with rtnl_dereference(), so the caller is expected to hold RTNL.
 *
 * Returns 0 on success or -ENOLINK when the queue has no device.
 */
static int set_offload(struct macvtap_queue *q, unsigned long arg)
{
	struct macvlan_dev *vlan;
	netdev_features_t features;
	netdev_features_t feature_mask = 0;

	vlan = rtnl_dereference(q->vlan);
	if (!vlan)
		return -ENOLINK;

	features = vlan->dev->features;

	if (arg & TUN_F_CSUM) {
		feature_mask = NETIF_F_HW_CSUM;

		if (arg & (TUN_F_TSO4 | TUN_F_TSO6)) {
			if (arg & TUN_F_TSO_ECN)
				feature_mask |= NETIF_F_TSO_ECN;
			if (arg & TUN_F_TSO4)
				feature_mask |= NETIF_F_TSO;
			if (arg & TUN_F_TSO6)
				feature_mask |= NETIF_F_TSO6;
		}
	}

	/* tun/tap driver inverts the usage for TSO offloads, where
	 * setting the TSO bit means that the userspace wants to
	 * accept TSO frames and turning it off means that user space
	 * does not support TSO.
	 * For macvtap, we have to invert it to mean the same thing.
	 * When user space turns off TSO, we turn off GSO/LRO so that
	 * user-space will not receive TSO frames.
	 */
	if (feature_mask & (NETIF_F_TSO | NETIF_F_TSO6))
		features |= RX_OFFLOADS;
	else
		features &= ~RX_OFFLOADS;

	/* tap_features are the same as features on tun/tap and
	 * reflect user expectations.
	 */
	vlan->tap_features = feature_mask;
	vlan->set_features = features;
	netdev_update_features(vlan->dev);

	return 0;
}

A
Arnd Bergmann 已提交
993 994 995 996 997 998
/*
 * provide compatibility with generic tun/tap interface
 */
static long macvtap_ioctl(struct file *file, unsigned int cmd,
			  unsigned long arg)
{
999 1000
	struct macvtap_queue *q = file->private_data;
	struct macvlan_dev *vlan;
A
Arnd Bergmann 已提交
1001 1002 1003
	void __user *argp = (void __user *)arg;
	struct ifreq __user *ifr = argp;
	unsigned int __user *up = argp;
1004
	unsigned short u;
1005 1006
	int __user *sp = argp;
	int s;
1007
	int ret;
A
Arnd Bergmann 已提交
1008 1009 1010 1011 1012 1013

	switch (cmd) {
	case TUNSETIFF:
		/* ignore the name, just look at flags */
		if (get_user(u, &ifr->ifr_flags))
			return -EFAULT;
1014 1015

		ret = 0;
M
Michael S. Tsirkin 已提交
1016
		if ((u & ~MACVTAP_FEATURES) != (IFF_NO_PI | IFF_TAP))
1017 1018
			ret = -EINVAL;
		else
1019
			q->flags = (q->flags & ~MACVTAP_FEATURES) | u;
1020 1021

		return ret;
A
Arnd Bergmann 已提交
1022 1023

	case TUNGETIFF:
1024
		rtnl_lock();
1025
		vlan = macvtap_get_vlan(q);
1026 1027
		if (!vlan) {
			rtnl_unlock();
A
Arnd Bergmann 已提交
1028
			return -ENOLINK;
1029
		}
A
Arnd Bergmann 已提交
1030

1031
		ret = 0;
1032
		u = q->flags;
1033
		if (copy_to_user(&ifr->ifr_name, vlan->dev->name, IFNAMSIZ) ||
1034
		    put_user(u, &ifr->ifr_flags))
1035
			ret = -EFAULT;
1036
		macvtap_put_vlan(vlan);
1037
		rtnl_unlock();
1038
		return ret;
A
Arnd Bergmann 已提交
1039

J
Jason Wang 已提交
1040 1041 1042
	case TUNSETQUEUE:
		if (get_user(u, &ifr->ifr_flags))
			return -EFAULT;
1043 1044 1045
		rtnl_lock();
		ret = macvtap_ioctl_set_queue(file, u);
		rtnl_unlock();
1046
		return ret;
J
Jason Wang 已提交
1047

A
Arnd Bergmann 已提交
1048
	case TUNGETFEATURES:
M
Michael S. Tsirkin 已提交
1049
		if (put_user(IFF_TAP | IFF_NO_PI | MACVTAP_FEATURES, up))
A
Arnd Bergmann 已提交
1050 1051 1052 1053 1054 1055 1056 1057 1058 1059
			return -EFAULT;
		return 0;

	case TUNSETSNDBUF:
		if (get_user(u, up))
			return -EFAULT;

		q->sk.sk_sndbuf = u;
		return 0;

1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074
	case TUNGETVNETHDRSZ:
		s = q->vnet_hdr_sz;
		if (put_user(s, sp))
			return -EFAULT;
		return 0;

	case TUNSETVNETHDRSZ:
		if (get_user(s, sp))
			return -EFAULT;
		if (s < (int)sizeof(struct virtio_net_hdr))
			return -EINVAL;

		q->vnet_hdr_sz = s;
		return 0;

1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089
	case TUNGETVNETLE:
		s = !!(q->flags & MACVTAP_VNET_LE);
		if (put_user(s, sp))
			return -EFAULT;
		return 0;

	case TUNSETVNETLE:
		if (get_user(s, sp))
			return -EFAULT;
		if (s)
			q->flags |= MACVTAP_VNET_LE;
		else
			q->flags &= ~MACVTAP_VNET_LE;
		return 0;

A
Arnd Bergmann 已提交
1090 1091 1092
	case TUNSETOFFLOAD:
		/* let the user check for future flags */
		if (arg & ~(TUN_F_CSUM | TUN_F_TSO4 | TUN_F_TSO6 |
1093
			    TUN_F_TSO_ECN))
A
Arnd Bergmann 已提交
1094 1095
			return -EINVAL;

1096 1097 1098 1099
		rtnl_lock();
		ret = set_offload(q, arg);
		rtnl_unlock();
		return ret;
A
Arnd Bergmann 已提交
1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117

	default:
		return -EINVAL;
	}
}

#ifdef CONFIG_COMPAT
/*
 * Compat ioctl entry point: convert @arg with compat_ptr() and forward
 * the call unchanged to macvtap_ioctl().
 */
static long macvtap_compat_ioctl(struct file *file, unsigned int cmd,
				 unsigned long arg)
{
	unsigned long native_arg = (unsigned long)compat_ptr(arg);

	return macvtap_ioctl(file, cmd, native_arg);
}
#endif

static const struct file_operations macvtap_fops = {
	.owner		= THIS_MODULE,
	.open		= macvtap_open,
	.release	= macvtap_release,
A
Al Viro 已提交
1118
	.read		= new_sync_read,
1119
	.write		= new_sync_write,
A
Al Viro 已提交
1120
	.read_iter	= macvtap_read_iter,
1121
	.write_iter	= macvtap_write_iter,
A
Arnd Bergmann 已提交
1122 1123 1124 1125 1126 1127 1128 1129
	.poll		= macvtap_poll,
	.llseek		= no_llseek,
	.unlocked_ioctl	= macvtap_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= macvtap_compat_ioctl,
#endif
};

A
Arnd Bergmann 已提交
1130 1131 1132 1133
/*
 * proto_ops sendmsg hook: find the queue that embeds @sock and pass the
 * message iterator to macvtap_get_user().  @iocb and @total_len are not
 * used.  Returns whatever macvtap_get_user() returns.
 */
static int macvtap_sendmsg(struct kiocb *iocb, struct socket *sock,
			   struct msghdr *m, size_t total_len)
{
	struct macvtap_queue *queue;
	int noblock;

	queue = container_of(sock, struct macvtap_queue, sock);
	noblock = m->msg_flags & MSG_DONTWAIT;

	return macvtap_get_user(queue, m, &m->msg_iter, noblock);
}

/*
 * proto_ops recvmsg hook: read one packet from the queue that embeds
 * @sock into @m's iterator.
 *
 * @flags may only contain MSG_DONTWAIT and MSG_TRUNC; anything else is
 * rejected with -EINVAL.  When the read result exceeds @total_len,
 * MSG_TRUNC is set in m->msg_flags and the return value is the full
 * length if the caller passed MSG_TRUNC, otherwise @total_len.
 * @iocb is not used.
 */
static int macvtap_recvmsg(struct kiocb *iocb, struct socket *sock,
			   struct msghdr *m, size_t total_len,
			   int flags)
{
	struct macvtap_queue *q = container_of(sock, struct macvtap_queue, sock);
	int ret;

	if (flags & ~(MSG_DONTWAIT|MSG_TRUNC))
		return -EINVAL;

	ret = macvtap_do_read(q, &m->msg_iter, flags & MSG_DONTWAIT);

	/* Only a positive length can be "too long".  Comparing the int
	 * directly against the size_t converts a negative ret to a huge
	 * unsigned value, which would flag MSG_TRUNC and could replace the
	 * error code with total_len (assuming macvtap_do_read() reports
	 * failures as negative values).
	 */
	if (ret > 0 && (size_t)ret > total_len) {
		m->msg_flags |= MSG_TRUNC;
		ret = flags & MSG_TRUNC ? ret : total_len;
	}
	return ret;
}

/*
 * Socket operations of the socket embedded in each macvtap queue (the one
 * macvtap_get_socket() hands out).  Only the two message hooks are
 * provided, to mimic raw sockets with tun.
 */
static const struct proto_ops macvtap_socket_ops = {
	.recvmsg	= macvtap_recvmsg,
	.sendmsg	= macvtap_sendmsg,
};

/*
 * Look up the socket object that backs a macvtap file.
 *
 * The returned object works like a packet socket and can be used for
 * sock_sendmsg/sock_recvmsg.  The caller is responsible for holding a
 * reference to the file for as long as the socket is in use.
 *
 * Returns ERR_PTR(-EINVAL) when @file is not a macvtap file and
 * ERR_PTR(-EBADFD) when it has no queue attached.
 */
struct socket *macvtap_get_socket(struct file *file)
{
	struct macvtap_queue *queue;

	/* anything not opened through macvtap_fops is not ours */
	if (file->f_op != &macvtap_fops)
		return ERR_PTR(-EINVAL);

	queue = file->private_data;
	return queue ? &queue->sock : ERR_PTR(-EBADFD);
}
EXPORT_SYMBOL_GPL(macvtap_get_socket);

1175 1176 1177
static int macvtap_device_event(struct notifier_block *unused,
				unsigned long event, void *ptr)
{
1178
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
1179
	struct macvlan_dev *vlan;
1180 1181
	struct device *classdev;
	dev_t devt;
1182
	int err;
1183 1184 1185 1186

	if (dev->rtnl_link_ops != &macvtap_link_ops)
		return NOTIFY_DONE;

1187
	vlan = netdev_priv(dev);
1188 1189 1190 1191 1192 1193 1194

	switch (event) {
	case NETDEV_REGISTER:
		/* Create the device node here after the network device has
		 * been registered but before register_netdevice has
		 * finished running.
		 */
1195 1196 1197 1198 1199
		err = macvtap_get_minor(vlan);
		if (err)
			return notifier_from_errno(err);

		devt = MKDEV(MAJOR(macvtap_major), vlan->minor);
1200 1201
		classdev = device_create(macvtap_class, &dev->dev, devt,
					 dev, "tap%d", dev->ifindex);
1202 1203
		if (IS_ERR(classdev)) {
			macvtap_free_minor(vlan);
1204
			return notifier_from_errno(PTR_ERR(classdev));
1205
		}
1206 1207
		break;
	case NETDEV_UNREGISTER:
1208
		devt = MKDEV(MAJOR(macvtap_major), vlan->minor);
1209
		device_destroy(macvtap_class, devt);
1210
		macvtap_free_minor(vlan);
1211 1212 1213 1214 1215 1216 1217 1218 1219 1220
		break;
	}

	return NOTIFY_DONE;
}

/* Notifier block that routes netdevice events to macvtap_device_event(). */
static struct notifier_block macvtap_notifier_block __read_mostly = {
	.notifier_call	= macvtap_device_event,
};

A
Arnd Bergmann 已提交
1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240
static int macvtap_init(void)
{
	int err;

	err = alloc_chrdev_region(&macvtap_major, 0,
				MACVTAP_NUM_DEVS, "macvtap");
	if (err)
		goto out1;

	cdev_init(&macvtap_cdev, &macvtap_fops);
	err = cdev_add(&macvtap_cdev, macvtap_major, MACVTAP_NUM_DEVS);
	if (err)
		goto out2;

	macvtap_class = class_create(THIS_MODULE, "macvtap");
	if (IS_ERR(macvtap_class)) {
		err = PTR_ERR(macvtap_class);
		goto out3;
	}

1241
	err = register_netdevice_notifier(&macvtap_notifier_block);
A
Arnd Bergmann 已提交
1242 1243 1244
	if (err)
		goto out4;

1245 1246 1247 1248
	err = macvlan_link_register(&macvtap_link_ops);
	if (err)
		goto out5;

A
Arnd Bergmann 已提交
1249 1250
	return 0;

1251 1252
out5:
	unregister_netdevice_notifier(&macvtap_notifier_block);
A
Arnd Bergmann 已提交
1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266
out4:
	class_unregister(macvtap_class);
out3:
	cdev_del(&macvtap_cdev);
out2:
	unregister_chrdev_region(macvtap_major, MACVTAP_NUM_DEVS);
out1:
	return err;
}
module_init(macvtap_init);

static void macvtap_exit(void)
{
	rtnl_link_unregister(&macvtap_link_ops);
1267
	unregister_netdevice_notifier(&macvtap_notifier_block);
A
Arnd Bergmann 已提交
1268 1269 1270 1271 1272 1273 1274 1275 1276
	class_unregister(macvtap_class);
	cdev_del(&macvtap_cdev);
	unregister_chrdev_region(macvtap_major, MACVTAP_NUM_DEVS);
}
module_exit(macvtap_exit);

/* Module metadata: rtnl link alias "macvtap", author and license. */
MODULE_ALIAS_RTNL_LINK("macvtap");
MODULE_AUTHOR("Arnd Bergmann <arnd@arndb.de>");
MODULE_LICENSE("GPL");