/*
 * Back-end of the driver for virtual network devices. This portion of the
 * driver exports a 'unified' network-device interface that can be accessed
 * by any operating system that implements a compatible front end. A
 * reference front-end implementation can be found in:
 *  drivers/net/xen-netfront.c
 *
 * Copyright (c) 2002-2005, K A Fraser
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License version 2
 * as published by the Free Software Foundation; or, when distributed
 * separately from the Linux kernel or incorporated into other
 * software packages, subject to the following license:
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this source file (the "Software"), to deal in the Software without
 * restriction, including without limitation the rights to use, copy, modify,
 * merge, publish, distribute, sublicense, and/or sell copies of the Software,
 * and to permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "common.h"

#include <linux/kthread.h>
#include <linux/if_vlan.h>
#include <linux/udp.h>
#include <linux/highmem.h>

#include <net/tcp.h>

#include <xen/xen.h>
#include <xen/events.h>
#include <xen/interface/memory.h>
#include <xen/page.h>

#include <asm/xen/hypercall.h>

/* Provide an option to disable split event channels at load time as
 * event channels are limited resource. Split event channels are
 * enabled by default.
 */
bool separate_tx_rx_irq = true;
module_param(separate_tx_rx_irq, bool, 0644);

/* The time that packets can stay on the guest Rx internal queue
 * before they are dropped.
 */
unsigned int rx_drain_timeout_msecs = 10000;
module_param(rx_drain_timeout_msecs, uint, 0444);

/* The length of time before the frontend is considered unresponsive
 * because it isn't providing Rx slots.
 */
unsigned int rx_stall_timeout_msecs = 60000;
module_param(rx_stall_timeout_msecs, uint, 0444);

#define MAX_QUEUES_DEFAULT 8
unsigned int xenvif_max_queues;
module_param_named(max_queues, xenvif_max_queues, uint, 0644);
MODULE_PARM_DESC(max_queues,
		 "Maximum number of queues per virtual interface");

/*
 * This is the maximum slots a skb can have. If a guest sends a skb
 * which exceeds this limit it is considered malicious.
 */
#define FATAL_SKB_SLOTS_DEFAULT 20
static unsigned int fatal_skb_slots = FATAL_SKB_SLOTS_DEFAULT;
module_param(fatal_skb_slots, uint, 0444);

/* The amount to copy out of the first guest Tx slot into the skb's
 * linear area.  If the first slot has more data, it will be mapped
 * and put into the first frag.
 *
 * This is sized to avoid pulling headers from the frags for most
 * TCP/IP packets.
 */
#define XEN_NETBACK_TX_COPY_LEN 128

/* This is the maximum number of flows in the hash cache. */
#define XENVIF_HASH_CACHE_SIZE_DEFAULT 64
unsigned int xenvif_hash_cache_size = XENVIF_HASH_CACHE_SIZE_DEFAULT;
module_param_named(hash_cache_size, xenvif_hash_cache_size, uint, 0644);
MODULE_PARM_DESC(hash_cache_size, "Number of flows in the hash cache");

static void xenvif_idx_release(struct xenvif_queue *queue, u16 pending_idx,
			       u8 status);

static void make_tx_response(struct xenvif_queue *queue,
			     struct xen_netif_tx_request *txp,
			     unsigned int extra_count,
			     s8       st);
static void push_tx_responses(struct xenvif_queue *queue);

static inline int tx_work_todo(struct xenvif_queue *queue);

static inline unsigned long idx_to_pfn(struct xenvif_queue *queue,
				       u16 idx)
{
	return page_to_pfn(queue->mmap_pages[idx]);
}

static inline unsigned long idx_to_kaddr(struct xenvif_queue *queue,
					 u16 idx)
{
	return (unsigned long)pfn_to_kaddr(idx_to_pfn(queue, idx));
}

#define callback_param(vif, pending_idx) \
	(vif->pending_tx_info[pending_idx].callback_struct)

/* Find the containing VIF's structure from a pointer in pending_tx_info array
 */
static inline struct xenvif_queue *ubuf_to_queue(const struct ubuf_info *ubuf)
{
	u16 pending_idx = ubuf->desc;
	struct pending_tx_info *temp =
		container_of(ubuf, struct pending_tx_info, callback_struct);
	return container_of(temp - pending_idx,
			    struct xenvif_queue,
			    pending_tx_info[0]);
}

static u16 frag_get_pending_idx(skb_frag_t *frag)
{
	return (u16)frag->page_offset;
}

static void frag_set_pending_idx(skb_frag_t *frag, u16 pending_idx)
{
	frag->page_offset = pending_idx;
}

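/* Pending-request ring indices wrap by masking with MAX_PENDING_REQS - 1,
 * which relies on MAX_PENDING_REQS being a power of two.
 */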
static inline pending_ring_idx_t pending_index(unsigned i)
{
	return i & (MAX_PENDING_REQS-1);
}

void xenvif_kick_thread(struct xenvif_queue *queue)
{
	wake_up(&queue->wq);
}

void xenvif_napi_schedule_or_enable_events(struct xenvif_queue *queue)
{
	int more_to_do;

	RING_FINAL_CHECK_FOR_REQUESTS(&queue->tx, more_to_do);

	if (more_to_do)
		napi_schedule(&queue->napi);
}

static void tx_add_credit(struct xenvif_queue *queue)
{
	unsigned long max_burst, max_credit;

	/*
	 * Allow a burst big enough to transmit a jumbo packet of up to 128kB.
	 * Otherwise the interface can seize up due to insufficient credit.
	 */
	max_burst = max(131072UL, queue->credit_bytes);

	/* Take care that adding a new chunk of credit doesn't wrap to zero. */
	max_credit = queue->remaining_credit + queue->credit_bytes;
	if (max_credit < queue->remaining_credit)
		max_credit = ULONG_MAX; /* wrapped: clamp to ULONG_MAX */

	queue->remaining_credit = min(max_credit, max_burst);
}

void xenvif_tx_credit_callback(unsigned long data)
{
	struct xenvif_queue *queue = (struct xenvif_queue *)data;
	tx_add_credit(queue);
	xenvif_napi_schedule_or_enable_events(queue);
}

static void xenvif_tx_err(struct xenvif_queue *queue,
			  struct xen_netif_tx_request *txp,
			  unsigned int extra_count, RING_IDX end)
{
	RING_IDX cons = queue->tx.req_cons;
	unsigned long flags;

	do {
		spin_lock_irqsave(&queue->response_lock, flags);
		make_tx_response(queue, txp, extra_count, XEN_NETIF_RSP_ERROR);
		push_tx_responses(queue);
		spin_unlock_irqrestore(&queue->response_lock, flags);
		if (cons == end)
			break;
		RING_COPY_REQUEST(&queue->tx, cons++, txp);
		extra_count = 0; /* only the first frag can have extras */
	} while (1);
	queue->tx.req_cons = cons;
}

static void xenvif_fatal_tx_err(struct xenvif *vif)
{
	netdev_err(vif->dev, "fatal error; disabling device\n");
	vif->disabled = true;
	/* Disable the vif from queue 0's kthread */
	if (vif->queues)
		xenvif_kick_thread(&vif->queues[0]);
}

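/* Walk the chain of tx requests that follow 'first' (linked by the
 * XEN_NETTXF_more_data flag), copying them into txp[].  Returns the number
 * of extra slots used, or a negative errno if the request chain is invalid
 * (truly malicious chains also disable the vif via xenvif_fatal_tx_err()).
 */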
static int xenvif_count_requests(struct xenvif_queue *queue,
				 struct xen_netif_tx_request *first,
				 unsigned int extra_count,
				 struct xen_netif_tx_request *txp,
				 int work_to_do)
{
	RING_IDX cons = queue->tx.req_cons;
	int slots = 0;
	int drop_err = 0;
	int more_data;

	if (!(first->flags & XEN_NETTXF_more_data))
		return 0;

	do {
		struct xen_netif_tx_request dropped_tx = { 0 };

		if (slots >= work_to_do) {
			netdev_err(queue->vif->dev,
				   "Asked for %d slots but exceeds this limit\n",
				   work_to_do);
			xenvif_fatal_tx_err(queue->vif);
			return -ENODATA;
		}

		/* This guest is really using too many slots and
		 * considered malicious.
		 */
		if (unlikely(slots >= fatal_skb_slots)) {
			netdev_err(queue->vif->dev,
				   "Malicious frontend using %d slots, threshold %u\n",
				   slots, fatal_skb_slots);
			xenvif_fatal_tx_err(queue->vif);
			return -E2BIG;
		}

		/* Xen network protocol had implicit dependency on
		 * MAX_SKB_FRAGS. XEN_NETBK_LEGACY_SLOTS_MAX is set to
		 * the historical MAX_SKB_FRAGS value 18 to honor the
		 * same behavior as before. Any packet using more than
		 * 18 slots but less than fatal_skb_slots slots is
		 * dropped.
		 */
		if (!drop_err && slots >= XEN_NETBK_LEGACY_SLOTS_MAX) {
			if (net_ratelimit())
				netdev_dbg(queue->vif->dev,
					   "Too many slots (%d) exceeding limit (%d), dropping packet\n",
					   slots, XEN_NETBK_LEGACY_SLOTS_MAX);
			drop_err = -E2BIG;
		}

		if (drop_err)
			txp = &dropped_tx;

		RING_COPY_REQUEST(&queue->tx, cons + slots, txp);

		/* If the guest submitted a frame >= 64 KiB then
		 * first->size overflowed and following slots will
		 * appear to be larger than the frame.
		 *
		 * This cannot be a fatal error as there are buggy
		 * frontends that do this.
		 *
		 * Consume all slots and drop the packet.
		 */
		if (!drop_err && txp->size > first->size) {
			if (net_ratelimit())
				netdev_dbg(queue->vif->dev,
					   "Invalid tx request, slot size %u > remaining size %u\n",
					   txp->size, first->size);
			drop_err = -EIO;
		}

		first->size -= txp->size;
		slots++;

		if (unlikely((txp->offset + txp->size) > XEN_PAGE_SIZE)) {
			netdev_err(queue->vif->dev, "Cross page boundary, txp->offset: %u, size: %u\n",
				 txp->offset, txp->size);
			xenvif_fatal_tx_err(queue->vif);
			return -EINVAL;
		}

		more_data = txp->flags & XEN_NETTXF_more_data;

		if (!drop_err)
			txp++;

	} while (more_data);

	if (drop_err) {
		xenvif_tx_err(queue, first, extra_count, cons + slots);
		return drop_err;
	}

	return slots;
}


struct xenvif_tx_cb {
	u16 pending_idx;
};

#define XENVIF_TX_CB(skb) ((struct xenvif_tx_cb *)(skb)->cb)

static inline void xenvif_tx_create_map_op(struct xenvif_queue *queue,
					   u16 pending_idx,
					   struct xen_netif_tx_request *txp,
					   unsigned int extra_count,
					   struct gnttab_map_grant_ref *mop)
{
	queue->pages_to_map[mop-queue->tx_map_ops] = queue->mmap_pages[pending_idx];
	gnttab_set_map_op(mop, idx_to_kaddr(queue, pending_idx),
			  GNTMAP_host_map | GNTMAP_readonly,
			  txp->gref, queue->vif->domid);

	memcpy(&queue->pending_tx_info[pending_idx].req, txp,
	       sizeof(*txp));
	queue->pending_tx_info[pending_idx].extra_count = extra_count;
}

static inline struct sk_buff *xenvif_alloc_skb(unsigned int size)
{
	struct sk_buff *skb =
		alloc_skb(size + NET_SKB_PAD + NET_IP_ALIGN,
			  GFP_ATOMIC | __GFP_NOWARN);
	if (unlikely(skb == NULL))
		return NULL;

	/* Packets passed to netif_rx() must have some headroom. */
	skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN);

	/* Initialize it here to avoid later surprises */
	skb_shinfo(skb)->destructor_arg = NULL;

	return skb;
}

static struct gnttab_map_grant_ref *xenvif_get_requests(struct xenvif_queue *queue,
							struct sk_buff *skb,
							struct xen_netif_tx_request *txp,
							struct gnttab_map_grant_ref *gop,
							unsigned int frag_overflow,
							struct sk_buff *nskb)
{
	struct skb_shared_info *shinfo = skb_shinfo(skb);
	skb_frag_t *frags = shinfo->frags;
	u16 pending_idx = XENVIF_TX_CB(skb)->pending_idx;
	int start;
	pending_ring_idx_t index;
	unsigned int nr_slots;

	nr_slots = shinfo->nr_frags;

	/* Skip first skb fragment if it is on same page as header fragment. */
	start = (frag_get_pending_idx(&shinfo->frags[0]) == pending_idx);

	for (shinfo->nr_frags = start; shinfo->nr_frags < nr_slots;
	     shinfo->nr_frags++, txp++, gop++) {
		index = pending_index(queue->pending_cons++);
		pending_idx = queue->pending_ring[index];
		xenvif_tx_create_map_op(queue, pending_idx, txp, 0, gop);
		frag_set_pending_idx(&frags[shinfo->nr_frags], pending_idx);
	}

	if (frag_overflow) {

		shinfo = skb_shinfo(nskb);
		frags = shinfo->frags;

		for (shinfo->nr_frags = 0; shinfo->nr_frags < frag_overflow;
		     shinfo->nr_frags++, txp++, gop++) {
			index = pending_index(queue->pending_cons++);
			pending_idx = queue->pending_ring[index];
			xenvif_tx_create_map_op(queue, pending_idx, txp, 0,
						gop);
			frag_set_pending_idx(&frags[shinfo->nr_frags],
					     pending_idx);
		}

		skb_shinfo(skb)->frag_list = nskb;
	}

	return gop;
}

static inline void xenvif_grant_handle_set(struct xenvif_queue *queue,
					   u16 pending_idx,
					   grant_handle_t handle)
{
	if (unlikely(queue->grant_tx_handle[pending_idx] !=
		     NETBACK_INVALID_HANDLE)) {
		netdev_err(queue->vif->dev,
			   "Trying to overwrite active handle! pending_idx: 0x%x\n",
			   pending_idx);
		BUG();
	}
	queue->grant_tx_handle[pending_idx] = handle;
}

static inline void xenvif_grant_handle_reset(struct xenvif_queue *queue,
					     u16 pending_idx)
{
	if (unlikely(queue->grant_tx_handle[pending_idx] ==
		     NETBACK_INVALID_HANDLE)) {
		netdev_err(queue->vif->dev,
			   "Trying to unmap invalid handle! pending_idx: 0x%x\n",
			   pending_idx);
		BUG();
	}
	queue->grant_tx_handle[pending_idx] = NETBACK_INVALID_HANDLE;
}

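/* Check the results of the grant copy (packet header) and grant map
 * (fragment) operations issued for this skb.  On failure the affected
 * slots are released or unmapped here, so callers must not free them
 * again; the first error encountered is retained and returned.
 */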
static int xenvif_tx_check_gop(struct xenvif_queue *queue,
			       struct sk_buff *skb,
			       struct gnttab_map_grant_ref **gopp_map,
			       struct gnttab_copy **gopp_copy)
{
	struct gnttab_map_grant_ref *gop_map = *gopp_map;
	u16 pending_idx = XENVIF_TX_CB(skb)->pending_idx;
	/* This always points to the shinfo of the skb being checked, which
	 * could be either the first or the one on the frag_list
	 */
	struct skb_shared_info *shinfo = skb_shinfo(skb);
	/* If this is non-NULL, we are currently checking the frag_list skb, and
	 * this points to the shinfo of the first one
	 */
	struct skb_shared_info *first_shinfo = NULL;
	int nr_frags = shinfo->nr_frags;
	const bool sharedslot = nr_frags &&
				frag_get_pending_idx(&shinfo->frags[0]) == pending_idx;
	int i, err;

	/* Check status of header. */
	err = (*gopp_copy)->status;
	if (unlikely(err)) {
		if (net_ratelimit())
			netdev_dbg(queue->vif->dev,
				   "Grant copy of header failed! status: %d pending_idx: %u ref: %u\n",
				   (*gopp_copy)->status,
				   pending_idx,
				   (*gopp_copy)->source.u.ref);
		/* The first frag might still have this slot mapped */
		if (!sharedslot)
			xenvif_idx_release(queue, pending_idx,
					   XEN_NETIF_RSP_ERROR);
	}
	(*gopp_copy)++;

check_frags:
	for (i = 0; i < nr_frags; i++, gop_map++) {
		int j, newerr;

		pending_idx = frag_get_pending_idx(&shinfo->frags[i]);

		/* Check error status: if okay then remember grant handle. */
		newerr = gop_map->status;

		if (likely(!newerr)) {
			xenvif_grant_handle_set(queue,
						pending_idx,
						gop_map->handle);
			/* Had a previous error? Invalidate this fragment. */
			if (unlikely(err)) {
				xenvif_idx_unmap(queue, pending_idx);
				/* If the mapping of the first frag was OK, but
				 * the header's copy failed, and they are
				 * sharing a slot, send an error
				 */
				if (i == 0 && sharedslot)
					xenvif_idx_release(queue, pending_idx,
							   XEN_NETIF_RSP_ERROR);
				else
					xenvif_idx_release(queue, pending_idx,
							   XEN_NETIF_RSP_OKAY);
			}
			continue;
		}

		/* Error on this fragment: respond to client with an error. */
		if (net_ratelimit())
			netdev_dbg(queue->vif->dev,
				   "Grant map of %d. frag failed! status: %d pending_idx: %u ref: %u\n",
				   i,
				   gop_map->status,
				   pending_idx,
				   gop_map->ref);

		xenvif_idx_release(queue, pending_idx, XEN_NETIF_RSP_ERROR);

		/* Not the first error? Preceding frags already invalidated. */
		if (err)
			continue;

		/* First error: if the header hasn't shared a slot with the
		 * first frag, release it as well.
		 */
		if (!sharedslot)
			xenvif_idx_release(queue,
					   XENVIF_TX_CB(skb)->pending_idx,
					   XEN_NETIF_RSP_OKAY);

		/* Invalidate preceding fragments of this skb. */
		for (j = 0; j < i; j++) {
			pending_idx = frag_get_pending_idx(&shinfo->frags[j]);
			xenvif_idx_unmap(queue, pending_idx);
			xenvif_idx_release(queue, pending_idx,
					   XEN_NETIF_RSP_OKAY);
		}

		/* And if we found the error while checking the frag_list, unmap
		 * the first skb's frags
		 */
		if (first_shinfo) {
			for (j = 0; j < first_shinfo->nr_frags; j++) {
				pending_idx = frag_get_pending_idx(&first_shinfo->frags[j]);
				xenvif_idx_unmap(queue, pending_idx);
				xenvif_idx_release(queue, pending_idx,
						   XEN_NETIF_RSP_OKAY);
			}
		}

		/* Remember the error: invalidate all subsequent fragments. */
		err = newerr;
	}

	if (skb_has_frag_list(skb) && !first_shinfo) {
		first_shinfo = skb_shinfo(skb);
		shinfo = skb_shinfo(skb_shinfo(skb)->frag_list);
		nr_frags = shinfo->nr_frags;

		goto check_frags;
	}

	*gopp_map = gop_map;
	return err;
}

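/* Attach the mapped grant pages to the skb's frags and chain the
 * pending_tx_info callback structures together so that the zerocopy
 * callback can walk every slot that backs this skb.
 */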
static void xenvif_fill_frags(struct xenvif_queue *queue, struct sk_buff *skb)
{
	struct skb_shared_info *shinfo = skb_shinfo(skb);
	int nr_frags = shinfo->nr_frags;
	int i;
	u16 prev_pending_idx = INVALID_PENDING_IDX;

	for (i = 0; i < nr_frags; i++) {
		skb_frag_t *frag = shinfo->frags + i;
		struct xen_netif_tx_request *txp;
		struct page *page;
		u16 pending_idx;

		pending_idx = frag_get_pending_idx(frag);

		/* If this is not the first frag, chain it to the previous. */
		if (prev_pending_idx == INVALID_PENDING_IDX)
			skb_shinfo(skb)->destructor_arg =
				&callback_param(queue, pending_idx);
		else
			callback_param(queue, prev_pending_idx).ctx =
				&callback_param(queue, pending_idx);

		callback_param(queue, pending_idx).ctx = NULL;
		prev_pending_idx = pending_idx;

		txp = &queue->pending_tx_info[pending_idx].req;
		page = virt_to_page(idx_to_kaddr(queue, pending_idx));
		__skb_fill_page_desc(skb, i, page, txp->offset, txp->size);
		skb->len += txp->size;
		skb->data_len += txp->size;
		skb->truesize += txp->size;

		/* Take an extra reference to offset network stack's put_page */
		get_page(queue->mmap_pages[pending_idx]);
	}
}

static int xenvif_get_extras(struct xenvif_queue *queue,
			     struct xen_netif_extra_info *extras,
			     unsigned int *extra_count,
			     int work_to_do)
{
	struct xen_netif_extra_info extra;
	RING_IDX cons = queue->tx.req_cons;

	do {
		if (unlikely(work_to_do-- <= 0)) {
			netdev_err(queue->vif->dev, "Missing extra info\n");
			xenvif_fatal_tx_err(queue->vif);
			return -EBADR;
		}

		RING_COPY_REQUEST(&queue->tx, cons, &extra);

		queue->tx.req_cons = ++cons;
		(*extra_count)++;

		if (unlikely(!extra.type ||
			     extra.type >= XEN_NETIF_EXTRA_TYPE_MAX)) {
			netdev_err(queue->vif->dev,
				   "Invalid extra type: %d\n", extra.type);
			xenvif_fatal_tx_err(queue->vif);
			return -EINVAL;
		}

		memcpy(&extras[extra.type - 1], &extra, sizeof(extra));
	} while (extra.flags & XEN_NETIF_EXTRA_FLAG_MORE);

	return work_to_do;
}

static int xenvif_set_skb_gso(struct xenvif *vif,
			      struct sk_buff *skb,
			      struct xen_netif_extra_info *gso)
{
	if (!gso->u.gso.size) {
		netdev_err(vif->dev, "GSO size must not be zero.\n");
		xenvif_fatal_tx_err(vif);
		return -EINVAL;
	}

	switch (gso->u.gso.type) {
	case XEN_NETIF_GSO_TYPE_TCPV4:
		skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;
		break;
	case XEN_NETIF_GSO_TYPE_TCPV6:
		skb_shinfo(skb)->gso_type = SKB_GSO_TCPV6;
		break;
	default:
		netdev_err(vif->dev, "Bad GSO type %d.\n", gso->u.gso.type);
		xenvif_fatal_tx_err(vif);
		return -EINVAL;
	}

	skb_shinfo(skb)->gso_size = gso->u.gso.size;
	/* gso_segs will be calculated later */

	return 0;
}

static int checksum_setup(struct xenvif_queue *queue, struct sk_buff *skb)
{
	bool recalculate_partial_csum = false;

	/* A GSO SKB must be CHECKSUM_PARTIAL. However some buggy
	 * peers can fail to set NETRXF_csum_blank when sending a GSO
	 * frame. In this case force the SKB to CHECKSUM_PARTIAL and
	 * recalculate the partial checksum.
	 */
	if (skb->ip_summed != CHECKSUM_PARTIAL && skb_is_gso(skb)) {
		queue->stats.rx_gso_checksum_fixup++;
		skb->ip_summed = CHECKSUM_PARTIAL;
		recalculate_partial_csum = true;
	}

	/* A non-CHECKSUM_PARTIAL SKB does not require setup. */
	if (skb->ip_summed != CHECKSUM_PARTIAL)
		return 0;

	return skb_checksum_setup(skb, recalculate_partial_csum);
}

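/* Credit-based scheduling: returns true if sending 'size' bytes now would
 * exceed the queue's remaining credit.  In that case the credit timer is
 * armed so that transmission resumes once the credit window is replenished.
 */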
static bool tx_credit_exceeded(struct xenvif_queue *queue, unsigned size)
{
	u64 now = get_jiffies_64();
	u64 next_credit = queue->credit_window_start +
		msecs_to_jiffies(queue->credit_usec / 1000);

	/* Timer could already be pending in rare cases. */
	if (timer_pending(&queue->credit_timeout))
		return true;

	/* Passed the point where we can replenish credit? */
	if (time_after_eq64(now, next_credit)) {
		queue->credit_window_start = now;
		tx_add_credit(queue);
	}

	/* Still too big to send right now? Set a callback. */
	if (size > queue->remaining_credit) {
		queue->credit_timeout.data     =
			(unsigned long)queue;
		mod_timer(&queue->credit_timeout,
			  next_credit);
		queue->credit_window_start = next_credit;

		return true;
	}

	return false;
}

/* No locking is required in xenvif_mcast_add/del() as they are
 * only ever invoked from NAPI poll. An RCU list is used because
 * xenvif_mcast_match() is called asynchronously, during start_xmit.
 */

static int xenvif_mcast_add(struct xenvif *vif, const u8 *addr)
{
	struct xenvif_mcast_addr *mcast;

	if (vif->fe_mcast_count == XEN_NETBK_MCAST_MAX) {
		if (net_ratelimit())
			netdev_err(vif->dev,
				   "Too many multicast addresses\n");
		return -ENOSPC;
	}

	mcast = kzalloc(sizeof(*mcast), GFP_ATOMIC);
	if (!mcast)
		return -ENOMEM;

	ether_addr_copy(mcast->addr, addr);
	list_add_tail_rcu(&mcast->entry, &vif->fe_mcast_addr);
	vif->fe_mcast_count++;

	return 0;
}

static void xenvif_mcast_del(struct xenvif *vif, const u8 *addr)
{
	struct xenvif_mcast_addr *mcast;

	list_for_each_entry_rcu(mcast, &vif->fe_mcast_addr, entry) {
		if (ether_addr_equal(addr, mcast->addr)) {
			--vif->fe_mcast_count;
			list_del_rcu(&mcast->entry);
			kfree_rcu(mcast, rcu);
			break;
		}
	}
}

bool xenvif_mcast_match(struct xenvif *vif, const u8 *addr)
{
	struct xenvif_mcast_addr *mcast;

	rcu_read_lock();
	list_for_each_entry_rcu(mcast, &vif->fe_mcast_addr, entry) {
		if (ether_addr_equal(addr, mcast->addr)) {
			rcu_read_unlock();
			return true;
		}
	}
	rcu_read_unlock();

	return false;
}

void xenvif_mcast_addr_list_free(struct xenvif *vif)
{
	/* No need for locking or RCU here. NAPI poll and TX queue
	 * are stopped.
	 */
	while (!list_empty(&vif->fe_mcast_addr)) {
		struct xenvif_mcast_addr *mcast;

		mcast = list_first_entry(&vif->fe_mcast_addr,
					 struct xenvif_mcast_addr,
					 entry);
		--vif->fe_mcast_count;
		list_del(&mcast->entry);
		kfree(mcast);
	}
}

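/* Pull requests off the frontend's TX ring and turn them into grant
 * operations: a grant copy for up to XEN_NETBACK_TX_COPY_LEN bytes of the
 * first slot (the skb's linear area) and grant maps for any remaining
 * data, which ends up in frags.  The counts of copy and map operations
 * built are returned through *copy_ops and *map_ops.
 */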
static void xenvif_tx_build_gops(struct xenvif_queue *queue,
				     int budget,
				     unsigned *copy_ops,
				     unsigned *map_ops)
{
	struct gnttab_map_grant_ref *gop = queue->tx_map_ops;
	struct sk_buff *skb, *nskb;
	int ret;
	unsigned int frag_overflow;

	while (skb_queue_len(&queue->tx_queue) < budget) {
		struct xen_netif_tx_request txreq;
		struct xen_netif_tx_request txfrags[XEN_NETBK_LEGACY_SLOTS_MAX];
		struct xen_netif_extra_info extras[XEN_NETIF_EXTRA_TYPE_MAX-1];
		unsigned int extra_count;
		u16 pending_idx;
		RING_IDX idx;
		int work_to_do;
		unsigned int data_len;
		pending_ring_idx_t index;

		if (queue->tx.sring->req_prod - queue->tx.req_cons >
		    XEN_NETIF_TX_RING_SIZE) {
			netdev_err(queue->vif->dev,
				   "Impossible number of requests. "
				   "req_prod %d, req_cons %d, size %ld\n",
				   queue->tx.sring->req_prod, queue->tx.req_cons,
				   XEN_NETIF_TX_RING_SIZE);
			xenvif_fatal_tx_err(queue->vif);
			break;
		}

		work_to_do = RING_HAS_UNCONSUMED_REQUESTS(&queue->tx);
		if (!work_to_do)
			break;

		idx = queue->tx.req_cons;
		rmb(); /* Ensure that we see the request before we copy it. */
		RING_COPY_REQUEST(&queue->tx, idx, &txreq);

		/* Credit-based scheduling. */
		if (txreq.size > queue->remaining_credit &&
		    tx_credit_exceeded(queue, txreq.size))
			break;

		queue->remaining_credit -= txreq.size;

		work_to_do--;
		queue->tx.req_cons = ++idx;

		memset(extras, 0, sizeof(extras));
		extra_count = 0;
		if (txreq.flags & XEN_NETTXF_extra_info) {
			work_to_do = xenvif_get_extras(queue, extras,
						       &extra_count,
						       work_to_do);
			idx = queue->tx.req_cons;
			if (unlikely(work_to_do < 0))
				break;
		}

		if (extras[XEN_NETIF_EXTRA_TYPE_MCAST_ADD - 1].type) {
			struct xen_netif_extra_info *extra;

			extra = &extras[XEN_NETIF_EXTRA_TYPE_MCAST_ADD - 1];
			ret = xenvif_mcast_add(queue->vif, extra->u.mcast.addr);

			make_tx_response(queue, &txreq, extra_count,
					 (ret == 0) ?
					 XEN_NETIF_RSP_OKAY :
					 XEN_NETIF_RSP_ERROR);
			push_tx_responses(queue);
			continue;
		}

		if (extras[XEN_NETIF_EXTRA_TYPE_MCAST_DEL - 1].type) {
			struct xen_netif_extra_info *extra;

			extra = &extras[XEN_NETIF_EXTRA_TYPE_MCAST_DEL - 1];
			xenvif_mcast_del(queue->vif, extra->u.mcast.addr);

			make_tx_response(queue, &txreq, extra_count,
					 XEN_NETIF_RSP_OKAY);
			push_tx_responses(queue);
			continue;
		}

		ret = xenvif_count_requests(queue, &txreq, extra_count,
					    txfrags, work_to_do);
		if (unlikely(ret < 0))
			break;

		idx += ret;

		if (unlikely(txreq.size < ETH_HLEN)) {
			netdev_dbg(queue->vif->dev,
				   "Bad packet size: %d\n", txreq.size);
			xenvif_tx_err(queue, &txreq, extra_count, idx);
			break;
		}

		/* No crossing a page as the payload mustn't fragment. */
		if (unlikely((txreq.offset + txreq.size) > XEN_PAGE_SIZE)) {
			netdev_err(queue->vif->dev,
				   "txreq.offset: %u, size: %u, end: %lu\n",
				   txreq.offset, txreq.size,
				   (unsigned long)(txreq.offset&~XEN_PAGE_MASK) + txreq.size);
			xenvif_fatal_tx_err(queue->vif);
			break;
		}

		index = pending_index(queue->pending_cons);
		pending_idx = queue->pending_ring[index];

		data_len = (txreq.size > XEN_NETBACK_TX_COPY_LEN &&
			    ret < XEN_NETBK_LEGACY_SLOTS_MAX) ?
			XEN_NETBACK_TX_COPY_LEN : txreq.size;

		skb = xenvif_alloc_skb(data_len);
		if (unlikely(skb == NULL)) {
			netdev_dbg(queue->vif->dev,
				   "Can't allocate a skb in start_xmit.\n");
			xenvif_tx_err(queue, &txreq, extra_count, idx);
			break;
		}

		skb_shinfo(skb)->nr_frags = ret;
		if (data_len < txreq.size)
			skb_shinfo(skb)->nr_frags++;
		/* At this point shinfo->nr_frags is in fact the number of
		 * slots, which can be as large as XEN_NETBK_LEGACY_SLOTS_MAX.
		 */
		frag_overflow = 0;
		nskb = NULL;
		if (skb_shinfo(skb)->nr_frags > MAX_SKB_FRAGS) {
			frag_overflow = skb_shinfo(skb)->nr_frags - MAX_SKB_FRAGS;
			BUG_ON(frag_overflow > MAX_SKB_FRAGS);
			skb_shinfo(skb)->nr_frags = MAX_SKB_FRAGS;
			nskb = xenvif_alloc_skb(0);
			if (unlikely(nskb == NULL)) {
				kfree_skb(skb);
				xenvif_tx_err(queue, &txreq, extra_count, idx);
				if (net_ratelimit())
					netdev_err(queue->vif->dev,
						   "Can't allocate the frag_list skb.\n");
				break;
			}
		}

		if (extras[XEN_NETIF_EXTRA_TYPE_GSO - 1].type) {
			struct xen_netif_extra_info *gso;
			gso = &extras[XEN_NETIF_EXTRA_TYPE_GSO - 1];

			if (xenvif_set_skb_gso(queue->vif, skb, gso)) {
				/* Failure in xenvif_set_skb_gso is fatal. */
				kfree_skb(skb);
				kfree_skb(nskb);
				break;
			}
		}

		if (extras[XEN_NETIF_EXTRA_TYPE_HASH - 1].type) {
			struct xen_netif_extra_info *extra;
			enum pkt_hash_types type = PKT_HASH_TYPE_NONE;

			extra = &extras[XEN_NETIF_EXTRA_TYPE_HASH - 1];

			switch (extra->u.hash.type) {
			case _XEN_NETIF_CTRL_HASH_TYPE_IPV4:
			case _XEN_NETIF_CTRL_HASH_TYPE_IPV6:
				type = PKT_HASH_TYPE_L3;
				break;

			case _XEN_NETIF_CTRL_HASH_TYPE_IPV4_TCP:
			case _XEN_NETIF_CTRL_HASH_TYPE_IPV6_TCP:
				type = PKT_HASH_TYPE_L4;
				break;

			default:
				break;
			}

			if (type != PKT_HASH_TYPE_NONE)
				skb_set_hash(skb,
					     *(u32 *)extra->u.hash.value,
					     type);
		}

		XENVIF_TX_CB(skb)->pending_idx = pending_idx;

		__skb_put(skb, data_len);
		queue->tx_copy_ops[*copy_ops].source.u.ref = txreq.gref;
		queue->tx_copy_ops[*copy_ops].source.domid = queue->vif->domid;
		queue->tx_copy_ops[*copy_ops].source.offset = txreq.offset;

		queue->tx_copy_ops[*copy_ops].dest.u.gmfn =
			virt_to_gfn(skb->data);
		queue->tx_copy_ops[*copy_ops].dest.domid = DOMID_SELF;
		queue->tx_copy_ops[*copy_ops].dest.offset =
			offset_in_page(skb->data) & ~XEN_PAGE_MASK;

		queue->tx_copy_ops[*copy_ops].len = data_len;
		queue->tx_copy_ops[*copy_ops].flags = GNTCOPY_source_gref;

		(*copy_ops)++;

		if (data_len < txreq.size) {
			frag_set_pending_idx(&skb_shinfo(skb)->frags[0],
					     pending_idx);
			xenvif_tx_create_map_op(queue, pending_idx, &txreq,
						extra_count, gop);
			gop++;
		} else {
			frag_set_pending_idx(&skb_shinfo(skb)->frags[0],
					     INVALID_PENDING_IDX);
			memcpy(&queue->pending_tx_info[pending_idx].req,
			       &txreq, sizeof(txreq));
			queue->pending_tx_info[pending_idx].extra_count =
				extra_count;
		}

		queue->pending_cons++;

		gop = xenvif_get_requests(queue, skb, txfrags, gop,
				          frag_overflow, nskb);

		__skb_queue_tail(&queue->tx_queue, skb);

		queue->tx.req_cons = idx;

		if (((gop-queue->tx_map_ops) >= ARRAY_SIZE(queue->tx_map_ops)) ||
		    (*copy_ops >= ARRAY_SIZE(queue->tx_copy_ops)))
			break;
	}

	(*map_ops) = gop - queue->tx_map_ops;
	return;
}

/* Consolidate skb with a frag_list into a brand new one with local pages on
 * frags. Returns 0 or -ENOMEM if can't allocate new pages.
 */
static int xenvif_handle_frag_list(struct xenvif_queue *queue, struct sk_buff *skb)
{
	unsigned int offset = skb_headlen(skb);
	skb_frag_t frags[MAX_SKB_FRAGS];
	int i, f;
	struct ubuf_info *uarg;
	struct sk_buff *nskb = skb_shinfo(skb)->frag_list;

	queue->stats.tx_zerocopy_sent += 2;
	queue->stats.tx_frag_overflow++;

	xenvif_fill_frags(queue, nskb);
	/* Subtract frags size, we will correct it later */
	skb->truesize -= skb->data_len;
	skb->len += nskb->len;
	skb->data_len += nskb->len;

	/* create a brand new frags array and coalesce there */
	for (i = 0; offset < skb->len; i++) {
		struct page *page;
		unsigned int len;

		BUG_ON(i >= MAX_SKB_FRAGS);
		page = alloc_page(GFP_ATOMIC);
		if (!page) {
			int j;
			skb->truesize += skb->data_len;
			for (j = 0; j < i; j++)
				put_page(frags[j].page.p);
			return -ENOMEM;
		}

		if (offset + PAGE_SIZE < skb->len)
			len = PAGE_SIZE;
		else
			len = skb->len - offset;
		if (skb_copy_bits(skb, offset, page_address(page), len))
			BUG();

		offset += len;
		frags[i].page.p = page;
		frags[i].page_offset = 0;
		skb_frag_size_set(&frags[i], len);
	}

	/* Copied all the bits from the frag list -- free it. */
	skb_frag_list_init(skb);
	xenvif_skb_zerocopy_prepare(queue, nskb);
	kfree_skb(nskb);

	/* Release all the original (foreign) frags. */
	for (f = 0; f < skb_shinfo(skb)->nr_frags; f++)
		skb_frag_unref(skb, f);
	uarg = skb_shinfo(skb)->destructor_arg;
	/* increase inflight counter to offset decrement in callback */
	atomic_inc(&queue->inflight_packets);
	uarg->callback(uarg, true);
	skb_shinfo(skb)->destructor_arg = NULL;

	/* Fill the skb with the new (local) frags. */
	memcpy(skb_shinfo(skb)->frags, frags, i * sizeof(skb_frag_t));
	skb_shinfo(skb)->nr_frags = i;
	skb->truesize += i * PAGE_SIZE;

	return 0;
}

static int xenvif_tx_submit(struct xenvif_queue *queue)
{
	struct gnttab_map_grant_ref *gop_map = queue->tx_map_ops;
	struct gnttab_copy *gop_copy = queue->tx_copy_ops;
	struct sk_buff *skb;
	int work_done = 0;

	while ((skb = __skb_dequeue(&queue->tx_queue)) != NULL) {
		struct xen_netif_tx_request *txp;
		u16 pending_idx;
		unsigned data_len;

		pending_idx = XENVIF_TX_CB(skb)->pending_idx;
		txp = &queue->pending_tx_info[pending_idx].req;

		/* Check the remap error code. */
		if (unlikely(xenvif_tx_check_gop(queue, skb, &gop_map, &gop_copy))) {
			/* If there was an error, xenvif_tx_check_gop is
			 * expected to release all the frags which were mapped,
			 * so kfree_skb shouldn't do it again
			 */
			skb_shinfo(skb)->nr_frags = 0;
			if (skb_has_frag_list(skb)) {
				struct sk_buff *nskb =
						skb_shinfo(skb)->frag_list;
				skb_shinfo(nskb)->nr_frags = 0;
			}
			kfree_skb(skb);
			continue;
		}

		data_len = skb->len;
		callback_param(queue, pending_idx).ctx = NULL;
		if (data_len < txp->size) {
			/* Append the packet payload as a fragment. */
			txp->offset += data_len;
			txp->size -= data_len;
		} else {
			/* Schedule a response immediately. */
			xenvif_idx_release(queue, pending_idx,
					   XEN_NETIF_RSP_OKAY);
		}

		if (txp->flags & XEN_NETTXF_csum_blank)
			skb->ip_summed = CHECKSUM_PARTIAL;
		else if (txp->flags & XEN_NETTXF_data_validated)
			skb->ip_summed = CHECKSUM_UNNECESSARY;

		xenvif_fill_frags(queue, skb);

		if (unlikely(skb_has_frag_list(skb))) {
			if (xenvif_handle_frag_list(queue, skb)) {
				if (net_ratelimit())
					netdev_err(queue->vif->dev,
						   "Not enough memory to consolidate frag_list!\n");
				xenvif_skb_zerocopy_prepare(queue, skb);
				kfree_skb(skb);
				continue;
			}
		}

		skb->dev      = queue->vif->dev;
		skb->protocol = eth_type_trans(skb, skb->dev);
		skb_reset_network_header(skb);

		if (checksum_setup(queue, skb)) {
			netdev_dbg(queue->vif->dev,
				   "Can't setup checksum in net_tx_action\n");
			/* We have to set this flag to trigger the callback */
			if (skb_shinfo(skb)->destructor_arg)
				xenvif_skb_zerocopy_prepare(queue, skb);
			kfree_skb(skb);
			continue;
		}

		skb_probe_transport_header(skb, 0);

		/* If the packet is GSO then we will have just set up the
		 * transport header offset in checksum_setup so it's now
		 * straightforward to calculate gso_segs.
		 */
		if (skb_is_gso(skb)) {
			int mss = skb_shinfo(skb)->gso_size;
			int hdrlen = skb_transport_header(skb) -
				skb_mac_header(skb) +
				tcp_hdrlen(skb);

			skb_shinfo(skb)->gso_segs =
				DIV_ROUND_UP(skb->len - hdrlen, mss);
		}

		queue->stats.rx_bytes += skb->len;
		queue->stats.rx_packets++;

		work_done++;

		/* Set this flag right before netif_receive_skb, otherwise
		 * someone might think this packet already left netback, and
		 * do a skb_copy_ubufs while we are still in control of the
		 * skb. E.g. the __pskb_pull_tail earlier can do such a thing.
		 */
		if (skb_shinfo(skb)->destructor_arg) {
			xenvif_skb_zerocopy_prepare(queue, skb);
			queue->stats.tx_zerocopy_sent++;
		}

		netif_receive_skb(skb);
	}

	return work_done;
}

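/* Completion callback for skbs whose frags still reference foreign
 * (granted) pages.  It runs once the network stack releases the skb and
 * queues each pending index on the dealloc ring; the actual grant unmap
 * happens later in xenvif_tx_dealloc_action().
 */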
void xenvif_zerocopy_callback(struct ubuf_info *ubuf, bool zerocopy_success)
{
	unsigned long flags;
	pending_ring_idx_t index;
	struct xenvif_queue *queue = ubuf_to_queue(ubuf);

	/* This is the only place where we grab this lock, to protect callbacks
	 * from each other.
	 */
	spin_lock_irqsave(&queue->callback_lock, flags);
	do {
		u16 pending_idx = ubuf->desc;
		ubuf = (struct ubuf_info *) ubuf->ctx;
		BUG_ON(queue->dealloc_prod - queue->dealloc_cons >=
			MAX_PENDING_REQS);
		index = pending_index(queue->dealloc_prod);
		queue->dealloc_ring[index] = pending_idx;
		/* Sync with xenvif_tx_dealloc_action:
		 * insert idx then incr producer.
		 */
		smp_wmb();
		queue->dealloc_prod++;
	} while (ubuf);
	spin_unlock_irqrestore(&queue->callback_lock, flags);

	if (likely(zerocopy_success))
		queue->stats.tx_zerocopy_success++;
	else
		queue->stats.tx_zerocopy_fail++;
	xenvif_skb_zerocopy_complete(queue);
}

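/* Drain the dealloc ring filled by xenvif_zerocopy_callback(): batch up
 * grant unmap operations for all completed slots, submit them in one
 * gnttab_unmap_refs() call and then release the pending indices.
 */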
static inline void xenvif_tx_dealloc_action(struct xenvif_queue *queue)
{
	struct gnttab_unmap_grant_ref *gop;
	pending_ring_idx_t dc, dp;
	u16 pending_idx, pending_idx_release[MAX_PENDING_REQS];
	unsigned int i = 0;

	dc = queue->dealloc_cons;
	gop = queue->tx_unmap_ops;

	/* Free up any grants we have finished using */
	do {
		dp = queue->dealloc_prod;

		/* Ensure we see all indices enqueued by all
		 * xenvif_zerocopy_callback().
		 */
		smp_rmb();

		while (dc != dp) {
			BUG_ON(gop - queue->tx_unmap_ops >= MAX_PENDING_REQS);
			pending_idx =
				queue->dealloc_ring[pending_index(dc++)];

			pending_idx_release[gop - queue->tx_unmap_ops] =
				pending_idx;
			queue->pages_to_unmap[gop - queue->tx_unmap_ops] =
				queue->mmap_pages[pending_idx];
			gnttab_set_unmap_op(gop,
					    idx_to_kaddr(queue, pending_idx),
					    GNTMAP_host_map,
					    queue->grant_tx_handle[pending_idx]);
			xenvif_grant_handle_reset(queue, pending_idx);
			++gop;
		}

	} while (dp != queue->dealloc_prod);

	queue->dealloc_cons = dc;

	if (gop - queue->tx_unmap_ops > 0) {
		int ret;
		ret = gnttab_unmap_refs(queue->tx_unmap_ops,
					NULL,
					queue->pages_to_unmap,
					gop - queue->tx_unmap_ops);
		if (ret) {
			netdev_err(queue->vif->dev, "Unmap fail: nr_ops %tu ret %d\n",
				   gop - queue->tx_unmap_ops, ret);
			for (i = 0; i < gop - queue->tx_unmap_ops; ++i) {
				if (gop[i].status != GNTST_okay)
					netdev_err(queue->vif->dev,
						   " host_addr: 0x%llx handle: 0x%x status: %d\n",
						   gop[i].host_addr,
						   gop[i].handle,
						   gop[i].status);
			}
			BUG();
		}
	}

	for (i = 0; i < gop - queue->tx_unmap_ops; ++i)
		xenvif_idx_release(queue, pending_idx_release[i],
				   XEN_NETIF_RSP_OKAY);
}


/* Called after netfront has transmitted */
int xenvif_tx_action(struct xenvif_queue *queue, int budget)
{
	unsigned nr_mops, nr_cops = 0;
	int work_done, ret;

	if (unlikely(!tx_work_todo(queue)))
		return 0;

	xenvif_tx_build_gops(queue, budget, &nr_cops, &nr_mops);

	if (nr_cops == 0)
		return 0;

	gnttab_batch_copy(queue->tx_copy_ops, nr_cops);
	if (nr_mops != 0) {
		ret = gnttab_map_refs(queue->tx_map_ops,
				      NULL,
				      queue->pages_to_map,
				      nr_mops);
		BUG_ON(ret);
	}

	work_done = xenvif_tx_submit(queue);

	return work_done;
}

static void xenvif_idx_release(struct xenvif_queue *queue, u16 pending_idx,
			       u8 status)
{
	struct pending_tx_info *pending_tx_info;
	pending_ring_idx_t index;
	unsigned long flags;

	pending_tx_info = &queue->pending_tx_info[pending_idx];

	spin_lock_irqsave(&queue->response_lock, flags);

	make_tx_response(queue, &pending_tx_info->req,
			 pending_tx_info->extra_count, status);

	/* Release the pending index before pushing the Tx response so
	 * it's available before a new Tx request is pushed by the
	 * frontend.
	 */
	index = pending_index(queue->pending_prod++);
	queue->pending_ring[index] = pending_idx;

	push_tx_responses(queue);

	spin_unlock_irqrestore(&queue->response_lock, flags);
}


static void make_tx_response(struct xenvif_queue *queue,
			     struct xen_netif_tx_request *txp,
			     unsigned int extra_count,
			     s8       st)
{
	RING_IDX i = queue->tx.rsp_prod_pvt;
	struct xen_netif_tx_response *resp;

	resp = RING_GET_RESPONSE(&queue->tx, i);
	resp->id     = txp->id;
	resp->status = st;

	while (extra_count-- != 0)
		RING_GET_RESPONSE(&queue->tx, ++i)->status = XEN_NETIF_RSP_NULL;

	queue->tx.rsp_prod_pvt = ++i;
}

static void push_tx_responses(struct xenvif_queue *queue)
{
	int notify;

	RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&queue->tx, notify);
	if (notify)
		notify_remote_via_irq(queue->tx_irq);
}

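/* Unmap a single pending slot immediately; this is only used on error
 * paths, the common case batches unmaps in xenvif_tx_dealloc_action().
 */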
void xenvif_idx_unmap(struct xenvif_queue *queue, u16 pending_idx)
{
	int ret;
	struct gnttab_unmap_grant_ref tx_unmap_op;

	gnttab_set_unmap_op(&tx_unmap_op,
			    idx_to_kaddr(queue, pending_idx),
			    GNTMAP_host_map,
			    queue->grant_tx_handle[pending_idx]);
	xenvif_grant_handle_reset(queue, pending_idx);

	ret = gnttab_unmap_refs(&tx_unmap_op, NULL,
				&queue->mmap_pages[pending_idx], 1);
	if (ret) {
		netdev_err(queue->vif->dev,
			   "Unmap fail: ret: %d pending_idx: %d host_addr: %llx handle: 0x%x status: %d\n",
			   ret,
			   pending_idx,
			   tx_unmap_op.host_addr,
			   tx_unmap_op.handle,
			   tx_unmap_op.status);
		BUG();
	}
}

static inline int tx_work_todo(struct xenvif_queue *queue)
{
	if (likely(RING_HAS_UNCONSUMED_REQUESTS(&queue->tx)))
		return 1;

	return 0;
}

static inline bool tx_dealloc_work_todo(struct xenvif_queue *queue)
{
	return queue->dealloc_cons != queue->dealloc_prod;
}

void xenvif_unmap_frontend_data_rings(struct xenvif_queue *queue)
{
	if (queue->tx.sring)
		xenbus_unmap_ring_vfree(xenvif_to_xenbus_device(queue->vif),
					queue->tx.sring);
	if (queue->rx.sring)
		xenbus_unmap_ring_vfree(xenvif_to_xenbus_device(queue->vif),
					queue->rx.sring);
}

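/* Map the shared TX and RX rings granted by the frontend into backend
 * address space and initialise the ring state.  On failure any ring that
 * was already mapped is torn down again via the err: path.
 */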
int xenvif_map_frontend_data_rings(struct xenvif_queue *queue,
				   grant_ref_t tx_ring_ref,
				   grant_ref_t rx_ring_ref)
{
	void *addr;
	struct xen_netif_tx_sring *txs;
	struct xen_netif_rx_sring *rxs;

	int err = -ENOMEM;

	err = xenbus_map_ring_valloc(xenvif_to_xenbus_device(queue->vif),
				     &tx_ring_ref, 1, &addr);
	if (err)
		goto err;

	txs = (struct xen_netif_tx_sring *)addr;
	BACK_RING_INIT(&queue->tx, txs, XEN_PAGE_SIZE);

	err = xenbus_map_ring_valloc(xenvif_to_xenbus_device(queue->vif),
				     &rx_ring_ref, 1, &addr);
	if (err)
		goto err;

	rxs = (struct xen_netif_rx_sring *)addr;
	BACK_RING_INIT(&queue->rx, rxs, XEN_PAGE_SIZE);

	return 0;

err:
	xenvif_unmap_frontend_data_rings(queue);
	return err;
}

static bool xenvif_dealloc_kthread_should_stop(struct xenvif_queue *queue)
{
	/* Dealloc thread must remain running until all inflight
	 * packets complete.
	 */
	return kthread_should_stop() &&
		!atomic_read(&queue->inflight_packets);
}

int xenvif_dealloc_kthread(void *data)
{
	struct xenvif_queue *queue = data;

	for (;;) {
		wait_event_interruptible(queue->dealloc_wq,
					 tx_dealloc_work_todo(queue) ||
					 xenvif_dealloc_kthread_should_stop(queue));
		if (xenvif_dealloc_kthread_should_stop(queue))
			break;

		xenvif_tx_dealloc_action(queue);
		cond_resched();
	}

	/* Unmap anything remaining */
	if (tx_dealloc_work_todo(queue))
		xenvif_tx_dealloc_action(queue);

	return 0;
}

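/* Control ring support: requests arriving on the vif's control ring are
 * copied out, dispatched in process_ctrl_request() (currently hash
 * configuration only) and answered with a xen_netif_ctrl_response.
 */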
static void make_ctrl_response(struct xenvif *vif,
			       const struct xen_netif_ctrl_request *req,
			       u32 status, u32 data)
{
	RING_IDX idx = vif->ctrl.rsp_prod_pvt;
	struct xen_netif_ctrl_response rsp = {
		.id = req->id,
		.type = req->type,
		.status = status,
		.data = data,
	};

	*RING_GET_RESPONSE(&vif->ctrl, idx) = rsp;
	vif->ctrl.rsp_prod_pvt = ++idx;
}

static void push_ctrl_response(struct xenvif *vif)
{
	int notify;

	RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&vif->ctrl, notify);
	if (notify)
		notify_remote_via_irq(vif->ctrl_irq);
}

static void process_ctrl_request(struct xenvif *vif,
				 const struct xen_netif_ctrl_request *req)
{
	u32 status = XEN_NETIF_CTRL_STATUS_NOT_SUPPORTED;
	u32 data = 0;

	switch (req->type) {
	case XEN_NETIF_CTRL_TYPE_SET_HASH_ALGORITHM:
		status = xenvif_set_hash_alg(vif, req->data[0]);
		break;

	case XEN_NETIF_CTRL_TYPE_GET_HASH_FLAGS:
		status = xenvif_get_hash_flags(vif, &data);
		break;

	case XEN_NETIF_CTRL_TYPE_SET_HASH_FLAGS:
		status = xenvif_set_hash_flags(vif, req->data[0]);
		break;

	case XEN_NETIF_CTRL_TYPE_SET_HASH_KEY:
		status = xenvif_set_hash_key(vif, req->data[0],
					     req->data[1]);
		break;

	case XEN_NETIF_CTRL_TYPE_GET_HASH_MAPPING_SIZE:
		status = XEN_NETIF_CTRL_STATUS_SUCCESS;
		data = XEN_NETBK_MAX_HASH_MAPPING_SIZE;
		break;

	case XEN_NETIF_CTRL_TYPE_SET_HASH_MAPPING_SIZE:
		status = xenvif_set_hash_mapping_size(vif,
						      req->data[0]);
		break;

	case XEN_NETIF_CTRL_TYPE_SET_HASH_MAPPING:
		status = xenvif_set_hash_mapping(vif, req->data[0],
						 req->data[1],
						 req->data[2]);
		break;

	default:
		break;
	}

	make_ctrl_response(vif, req, status, data);
	push_ctrl_response(vif);
}

static void xenvif_ctrl_action(struct xenvif *vif)
{
	for (;;) {
		RING_IDX req_prod, req_cons;

		req_prod = vif->ctrl.sring->req_prod;
		req_cons = vif->ctrl.req_cons;

		/* Make sure we can see requests before we process them. */
		rmb();

		if (req_cons == req_prod)
			break;

		while (req_cons != req_prod) {
			struct xen_netif_ctrl_request req;

			RING_COPY_REQUEST(&vif->ctrl, req_cons, &req);
			req_cons++;

			process_ctrl_request(vif, &req);
		}

		vif->ctrl.req_cons = req_cons;
		vif->ctrl.sring->req_event = req_cons + 1;
	}
}

static bool xenvif_ctrl_work_todo(struct xenvif *vif)
{
	if (likely(RING_HAS_UNCONSUMED_REQUESTS(&vif->ctrl)))
		return 1;

	return 0;
}

irqreturn_t xenvif_ctrl_irq_fn(int irq, void *data)
{
	struct xenvif *vif = data;

	while (xenvif_ctrl_work_todo(vif))
		xenvif_ctrl_action(vif);

	return IRQ_HANDLED;
}

static int __init netback_init(void)
{
	int rc = 0;

	if (!xen_domain())
		return -ENODEV;

	/* Allow as many queues as there are CPUs but max. 8 if user has not
	 * specified a value.
	 */
	if (xenvif_max_queues == 0)
		xenvif_max_queues = min_t(unsigned int, MAX_QUEUES_DEFAULT,
					  num_online_cpus());

	if (fatal_skb_slots < XEN_NETBK_LEGACY_SLOTS_MAX) {
		pr_info("fatal_skb_slots too small (%d), bump it to XEN_NETBK_LEGACY_SLOTS_MAX (%d)\n",
			fatal_skb_slots, XEN_NETBK_LEGACY_SLOTS_MAX);
		fatal_skb_slots = XEN_NETBK_LEGACY_SLOTS_MAX;
	}

	rc = xenvif_xenbus_init();
	if (rc)
		goto failed_init;

#ifdef CONFIG_DEBUG_FS
	xen_netback_dbg_root = debugfs_create_dir("xen-netback", NULL);
	if (IS_ERR_OR_NULL(xen_netback_dbg_root))
		pr_warn("Init of debugfs returned %ld!\n",
			PTR_ERR(xen_netback_dbg_root));
#endif /* CONFIG_DEBUG_FS */

	return 0;

failed_init:
	return rc;
}

module_init(netback_init);

static void __exit netback_fini(void)
{
#ifdef CONFIG_DEBUG_FS
	if (!IS_ERR_OR_NULL(xen_netback_dbg_root))
		debugfs_remove_recursive(xen_netback_dbg_root);
#endif /* CONFIG_DEBUG_FS */
	xenvif_xenbus_fini();
}
module_exit(netback_fini);

MODULE_LICENSE("Dual BSD/GPL");
MODULE_ALIAS("xen-backend:vif");