/*
 * Back-end of the driver for virtual network devices. This portion of the
 * driver exports a 'unified' network-device interface that can be accessed
 * by any operating system that implements a compatible front end. A
 * reference front-end implementation can be found in:
 *  drivers/net/xen-netfront.c
 *
 * Copyright (c) 2002-2005, K A Fraser
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License version 2
 * as published by the Free Software Foundation; or, when distributed
 * separately from the Linux kernel or incorporated into other
 * software packages, subject to the following license:
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this source file (the "Software"), to deal in the Software without
 * restriction, including without limitation the rights to use, copy, modify,
 * merge, publish, distribute, sublicense, and/or sell copies of the Software,
 * and to permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "common.h"

#include <linux/kthread.h>
#include <linux/if_vlan.h>
#include <linux/udp.h>
#include <linux/highmem.h>

#include <net/tcp.h>

#include <xen/xen.h>
#include <xen/events.h>
#include <xen/interface/memory.h>

#include <asm/xen/hypercall.h>
#include <asm/xen/page.h>

/* Provide an option to disable split event channels at load time as
 * event channels are limited resource. Split event channels are
 * enabled by default.
 */
bool separate_tx_rx_irq = 1;
module_param(separate_tx_rx_irq, bool, 0644);

/* When the guest ring is filled up, qdisc queues the packets for us, but we
 * have to time them out, otherwise other guests' packets can get stuck there.
 */
unsigned int rx_drain_timeout_msecs = 10000;
module_param(rx_drain_timeout_msecs, uint, 0444);
unsigned int rx_drain_timeout_jiffies;

unsigned int xenvif_max_queues;
module_param_named(max_queues, xenvif_max_queues, uint, 0644);
MODULE_PARM_DESC(max_queues,
		 "Maximum number of queues per virtual interface");

/*
 * This is the maximum number of slots a skb can have. If a guest sends a
 * skb which exceeds this limit it is considered malicious.
 */
#define FATAL_SKB_SLOTS_DEFAULT 20
static unsigned int fatal_skb_slots = FATAL_SKB_SLOTS_DEFAULT;
module_param(fatal_skb_slots, uint, 0444);

static void xenvif_idx_release(struct xenvif_queue *queue, u16 pending_idx,
			       u8 status);

static void make_tx_response(struct xenvif_queue *queue,
			     struct xen_netif_tx_request *txp,
			     s8       st);

static inline int tx_work_todo(struct xenvif_queue *queue);
static inline int rx_work_todo(struct xenvif_queue *queue);

static struct xen_netif_rx_response *make_rx_response(struct xenvif_queue *queue,
					     u16      id,
					     s8       st,
					     u16      offset,
					     u16      size,
					     u16      flags);

static inline unsigned long idx_to_pfn(struct xenvif_queue *queue,
				       u16 idx)
{
	return page_to_pfn(queue->mmap_pages[idx]);
}

static inline unsigned long idx_to_kaddr(struct xenvif_queue *queue,
					 u16 idx)
{
	return (unsigned long)pfn_to_kaddr(idx_to_pfn(queue, idx));
}

#define callback_param(vif, pending_idx) \
	(vif->pending_tx_info[pending_idx].callback_struct)

/* Find the containing queue's structure from a pointer in the
 * pending_tx_info array.
 */
static inline struct xenvif_queue *ubuf_to_queue(const struct ubuf_info *ubuf)
{
	u16 pending_idx = ubuf->desc;
	struct pending_tx_info *temp =
		container_of(ubuf, struct pending_tx_info, callback_struct);
	return container_of(temp - pending_idx,
			    struct xenvif_queue,
			    pending_tx_info[0]);
}

/* This is a minimum size for the linear area to avoid lots of
 * calls to __pskb_pull_tail() as we set up checksum offsets. The
 * value 128 was chosen as it covers all IPv4 and most likely
 * IPv6 headers.
 */
#define PKT_PROT_LEN 128

static u16 frag_get_pending_idx(skb_frag_t *frag)
{
	return (u16)frag->page_offset;
}

static void frag_set_pending_idx(skb_frag_t *frag, u16 pending_idx)
{
	frag->page_offset = pending_idx;
}

static inline pending_ring_idx_t pending_index(unsigned i)
{
	return i & (MAX_PENDING_REQS-1);
}

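/* Check whether the frontend has posted at least 'needed' rx requests.
 * If not, request a notification via req_event before giving up.
 */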
bool xenvif_rx_ring_slots_available(struct xenvif_queue *queue, int needed)
{
	RING_IDX prod, cons;

	do {
		prod = queue->rx.sring->req_prod;
		cons = queue->rx.req_cons;

		if (prod - cons >= needed)
			return true;

		queue->rx.sring->req_event = prod + 1;

		/* Make sure event is visible before we check prod
		 * again.
		 */
		mb();
	} while (queue->rx.sring->req_prod != prod);

	return false;
}

/*
 * Returns true if we should start a new receive buffer instead of
 * adding 'size' bytes to a buffer which currently contains 'offset'
 * bytes.
 */
static bool start_new_rx_buffer(int offset, unsigned long size, int head,
				bool full_coalesce)
{
	/* simple case: we have completely filled the current buffer. */
	if (offset == MAX_BUFFER_OFFSET)
		return true;

	/*
	 * complex case: start a fresh buffer if the current frag
	 * would overflow the current buffer but only if:
	 *     (i)   this frag would fit completely in the next buffer
	 * and (ii)  there is already some data in the current buffer
	 * and (iii) this is not the head buffer.
	 * and (iv)  there is no need to fully utilize the buffers
	 *
	 * Where:
	 * - (i) stops us splitting a frag into two copies
	 *   unless the frag is too large for a single buffer.
	 * - (ii) stops us from leaving a buffer pointlessly empty.
	 * - (iii) stops us leaving the first buffer
	 *   empty. Strictly speaking this is already covered
	 *   by (ii) but is explicitly checked because
	 *   netfront relies on the first buffer being
	 *   non-empty and can crash otherwise.
	 * - (iv) is needed for skbs which can use up more than
	 *   MAX_SKB_FRAGS slots
	 *
	 * This means we will effectively linearise small frags but do
	 * not needlessly split large buffers into multiple copies,
	 * tending to give large frags their own buffers as before.
	 */
	BUG_ON(size > MAX_BUFFER_OFFSET);
	if ((offset + size > MAX_BUFFER_OFFSET) && offset && !head &&
	    !full_coalesce)
		return true;

	return false;
}

struct netrx_pending_operations {
	unsigned copy_prod, copy_cons;
	unsigned meta_prod, meta_cons;
	struct gnttab_copy *copy;
	struct xenvif_rx_meta *meta;
	int copy_off;
	grant_ref_t copy_gref;
};

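/* Consume the next rx ring request and start a fresh meta slot for it,
 * resetting the copy offset and destination grant ref.
 */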
static struct xenvif_rx_meta *get_next_rx_buffer(struct xenvif_queue *queue,
						 struct netrx_pending_operations *npo)
{
	struct xenvif_rx_meta *meta;
	struct xen_netif_rx_request *req;

	req = RING_GET_REQUEST(&queue->rx, queue->rx.req_cons++);

	meta = npo->meta + npo->meta_prod++;
	meta->gso_type = XEN_NETIF_GSO_TYPE_NONE;
	meta->gso_size = 0;
	meta->size = 0;
	meta->id = req->id;

	npo->copy_off = 0;
	npo->copy_gref = req->gref;

	return meta;
}

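/* Per-skb state stashed in skb->cb while the packet waits on the internal
 * rx queue: the number of meta slots it used and whether it has to be
 * fully coalesced into at most MAX_SKB_FRAGS ring slots.
 */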
struct xenvif_rx_cb {
	int meta_slots_used;
	bool full_coalesce;
};

#define XENVIF_RX_CB(skb) ((struct xenvif_rx_cb *)(skb)->cb)

/*
 * Set up the grant operations for this fragment. If it's a flipping
 * interface, we also set up the unmap request from here.
 */
static void xenvif_gop_frag_copy(struct xenvif_queue *queue, struct sk_buff *skb,
				 struct netrx_pending_operations *npo,
				 struct page *page, unsigned long size,
				 unsigned long offset, int *head,
				 struct xenvif_queue *foreign_queue,
				 grant_ref_t foreign_gref)
{
	struct gnttab_copy *copy_gop;
	struct xenvif_rx_meta *meta;
	unsigned long bytes;
	int gso_type = XEN_NETIF_GSO_TYPE_NONE;

	/* Data must not cross a page boundary. */
	BUG_ON(size + offset > PAGE_SIZE<<compound_order(page));

	meta = npo->meta + npo->meta_prod - 1;

	/* Skip unused frames from start of page */
	page += offset >> PAGE_SHIFT;
	offset &= ~PAGE_MASK;

	while (size > 0) {
		BUG_ON(offset >= PAGE_SIZE);
		BUG_ON(npo->copy_off > MAX_BUFFER_OFFSET);

		bytes = PAGE_SIZE - offset;

		if (bytes > size)
			bytes = size;

		if (start_new_rx_buffer(npo->copy_off,
					bytes,
					*head,
					XENVIF_RX_CB(skb)->full_coalesce)) {
			/*
			 * Netfront requires there to be some data in the head
			 * buffer.
			 */
			BUG_ON(*head);

			meta = get_next_rx_buffer(queue, npo);
		}

		if (npo->copy_off + bytes > MAX_BUFFER_OFFSET)
			bytes = MAX_BUFFER_OFFSET - npo->copy_off;

		copy_gop = npo->copy + npo->copy_prod++;
		copy_gop->flags = GNTCOPY_dest_gref;
		copy_gop->len = bytes;

		if (foreign_queue) {
			copy_gop->source.domid = foreign_queue->vif->domid;
			copy_gop->source.u.ref = foreign_gref;
			copy_gop->flags |= GNTCOPY_source_gref;
		} else {
			copy_gop->source.domid = DOMID_SELF;
			copy_gop->source.u.gmfn =
				virt_to_mfn(page_address(page));
		}
		copy_gop->source.offset = offset;

		copy_gop->dest.domid = queue->vif->domid;
		copy_gop->dest.offset = npo->copy_off;
		copy_gop->dest.u.ref = npo->copy_gref;

		npo->copy_off += bytes;
		meta->size += bytes;

		offset += bytes;
		size -= bytes;

		/* Next frame */
		if (offset == PAGE_SIZE && size) {
			BUG_ON(!PageCompound(page));
			page++;
			offset = 0;
		}

		/* Leave a gap for the GSO descriptor. */
		if (skb_is_gso(skb)) {
			if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4)
				gso_type = XEN_NETIF_GSO_TYPE_TCPV4;
			else if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV6)
				gso_type = XEN_NETIF_GSO_TYPE_TCPV6;
		}

		if (*head && ((1 << gso_type) & queue->vif->gso_mask))
			queue->rx.req_cons++;

		*head = 0; /* There must be something in this buffer now. */

	}
}

/*
 * Find the grant ref for a given frag in a chain of struct ubuf_info's
 * skb: the skb itself
 * i: the frag's number
 * ubuf: a pointer to an element in the chain. It should not be NULL
 *
 * Returns a pointer to the element in the chain where the page was found. If
 * not found, returns NULL.
 * See the definition of callback_struct in common.h for more details about
 * the chain.
 */
static const struct ubuf_info *xenvif_find_gref(const struct sk_buff *const skb,
						const int i,
						const struct ubuf_info *ubuf)
{
	struct xenvif_queue *foreign_queue = ubuf_to_queue(ubuf);

	do {
		u16 pending_idx = ubuf->desc;

		if (skb_shinfo(skb)->frags[i].page.p ==
		    foreign_queue->mmap_pages[pending_idx])
			break;
		ubuf = (struct ubuf_info *) ubuf->ctx;
	} while (ubuf);

	return ubuf;
}

/*
 * Prepare an SKB to be transmitted to the frontend.
 *
 * This function is responsible for allocating grant operations, meta
 * structures, etc.
 *
 * It returns the number of meta structures consumed. The number of
 * ring slots used is always equal to the number of meta slots used
 * plus the number of GSO descriptors used. Currently, we use either
 * zero GSO descriptors (for non-GSO packets) or one descriptor (for
 * frontend-side LRO).
 */
static int xenvif_gop_skb(struct sk_buff *skb,
			  struct netrx_pending_operations *npo,
			  struct xenvif_queue *queue)
{
	struct xenvif *vif = netdev_priv(skb->dev);
	int nr_frags = skb_shinfo(skb)->nr_frags;
	int i;
	struct xen_netif_rx_request *req;
	struct xenvif_rx_meta *meta;
	unsigned char *data;
	int head = 1;
	int old_meta_prod;
	int gso_type;
	const struct ubuf_info *ubuf = skb_shinfo(skb)->destructor_arg;
	const struct ubuf_info *const head_ubuf = ubuf;

	old_meta_prod = npo->meta_prod;

	gso_type = XEN_NETIF_GSO_TYPE_NONE;
	if (skb_is_gso(skb)) {
		if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4)
			gso_type = XEN_NETIF_GSO_TYPE_TCPV4;
		else if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV6)
			gso_type = XEN_NETIF_GSO_TYPE_TCPV6;
	}

	/* Set up a GSO prefix descriptor, if necessary */
	if ((1 << gso_type) & vif->gso_prefix_mask) {
		req = RING_GET_REQUEST(&queue->rx, queue->rx.req_cons++);
		meta = npo->meta + npo->meta_prod++;
		meta->gso_type = gso_type;
		meta->gso_size = skb_shinfo(skb)->gso_size;
		meta->size = 0;
		meta->id = req->id;
	}

	req = RING_GET_REQUEST(&queue->rx, queue->rx.req_cons++);
	meta = npo->meta + npo->meta_prod++;

	if ((1 << gso_type) & vif->gso_mask) {
		meta->gso_type = gso_type;
		meta->gso_size = skb_shinfo(skb)->gso_size;
	} else {
		meta->gso_type = XEN_NETIF_GSO_TYPE_NONE;
		meta->gso_size = 0;
	}

	meta->size = 0;
	meta->id = req->id;
	npo->copy_off = 0;
	npo->copy_gref = req->gref;

	data = skb->data;
	while (data < skb_tail_pointer(skb)) {
		unsigned int offset = offset_in_page(data);
		unsigned int len = PAGE_SIZE - offset;

		if (data + len > skb_tail_pointer(skb))
			len = skb_tail_pointer(skb) - data;

		xenvif_gop_frag_copy(queue, skb, npo,
				     virt_to_page(data), len, offset, &head,
				     NULL,
				     0);
		data += len;
	}

	for (i = 0; i < nr_frags; i++) {
		/* This variable also signals whether foreign_gref has a real
		 * value or not.
		 */
		struct xenvif_queue *foreign_queue = NULL;
		grant_ref_t foreign_gref;

		if ((skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) &&
			(ubuf->callback == &xenvif_zerocopy_callback)) {
			const struct ubuf_info *const startpoint = ubuf;

			/* Ideally ubuf points to the chain element which
			 * belongs to this frag. Or if frags were removed from
			 * the beginning, then shortly before it.
			 */
			ubuf = xenvif_find_gref(skb, i, ubuf);

			/* Try again from the beginning of the list, if we
			 * haven't tried from there. This only makes sense in
			 * the unlikely event of reordering the original frags.
			 * For injected local pages it's an unnecessary second
			 * run.
			 */
			if (unlikely(!ubuf) && startpoint != head_ubuf)
				ubuf = xenvif_find_gref(skb, i, head_ubuf);

			if (likely(ubuf)) {
				u16 pending_idx = ubuf->desc;

				foreign_queue = ubuf_to_queue(ubuf);
				foreign_gref =
					foreign_queue->pending_tx_info[pending_idx].req.gref;
				/* Just a safety measure. If this was the last
				 * element on the list, the for loop will
				 * iterate again if a local page were added to
				 * the end. Using head_ubuf here prevents the
				 * second search on the chain. Or the original
				 * frags changed order, but that's less likely.
				 * In any way, ubuf shouldn't be NULL.
				 */
				ubuf = ubuf->ctx ?
					(struct ubuf_info *) ubuf->ctx :
					head_ubuf;
			} else
				/* This frag was a local page, added to the
				 * array after the skb left netback.
				 */
				ubuf = head_ubuf;
		}
		xenvif_gop_frag_copy(queue, skb, npo,
				     skb_frag_page(&skb_shinfo(skb)->frags[i]),
				     skb_frag_size(&skb_shinfo(skb)->frags[i]),
				     skb_shinfo(skb)->frags[i].page_offset,
				     &head,
				     foreign_queue,
				     foreign_queue ? foreign_gref : UINT_MAX);
	}

	return npo->meta_prod - old_meta_prod;
}

/*
 * This is a twin to xenvif_gop_skb.  Assume that xenvif_gop_skb was
 * used to set up the operations on the top of
 * netrx_pending_operations, which have since been done.  Check that
 * they didn't give any errors and advance over them.
 */
static int xenvif_check_gop(struct xenvif *vif, int nr_meta_slots,
			    struct netrx_pending_operations *npo)
{
	struct gnttab_copy     *copy_op;
	int status = XEN_NETIF_RSP_OKAY;
	int i;

	for (i = 0; i < nr_meta_slots; i++) {
		copy_op = npo->copy + npo->copy_cons++;
		if (copy_op->status != GNTST_okay) {
			netdev_dbg(vif->dev,
				   "Bad status %d from copy to DOM%d.\n",
				   copy_op->status, vif->domid);
			status = XEN_NETIF_RSP_ERROR;
		}
	}

	return status;
}

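/* Emit rx responses for the non-head meta slots of a packet. */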
static void xenvif_add_frag_responses(struct xenvif_queue *queue, int status,
				      struct xenvif_rx_meta *meta,
				      int nr_meta_slots)
{
	int i;
	unsigned long offset;

	/* No fragments used */
	if (nr_meta_slots <= 1)
		return;

	nr_meta_slots--;

	for (i = 0; i < nr_meta_slots; i++) {
		int flags;
		if (i == nr_meta_slots - 1)
			flags = 0;
		else
			flags = XEN_NETRXF_more_data;

		offset = 0;
		make_rx_response(queue, meta[i].id, status, offset,
				 meta[i].size, flags);
	}
}

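/* Wake the queue's kernel thread, which sleeps on queue->wq. */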
void xenvif_kick_thread(struct xenvif_queue *queue)
{
	wake_up(&queue->wq);
}

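/* Drain the internal rx queue: estimate the ring slots each skb needs,
 * build and issue the grant copy batch, then push the rx responses
 * (including any GSO extra info) to the frontend.
 */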
static void xenvif_rx_action(struct xenvif_queue *queue)
{
	s8 status;
	u16 flags;
	struct xen_netif_rx_response *resp;
	struct sk_buff_head rxq;
	struct sk_buff *skb;
	LIST_HEAD(notify);
	int ret;
	unsigned long offset;
	bool need_to_notify = false;

	struct netrx_pending_operations npo = {
		.copy  = queue->grant_copy_op,
		.meta  = queue->meta,
	};

	skb_queue_head_init(&rxq);

	while ((skb = skb_dequeue(&queue->rx_queue)) != NULL) {
		RING_IDX max_slots_needed;
		RING_IDX old_req_cons;
		RING_IDX ring_slots_used;
		int i;

		/* We need a cheap worse case estimate for the number of
		 * slots we'll use.
		 */

		max_slots_needed = DIV_ROUND_UP(offset_in_page(skb->data) +
						skb_headlen(skb),
						PAGE_SIZE);
		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
			unsigned int size;
			unsigned int offset;

			size = skb_frag_size(&skb_shinfo(skb)->frags[i]);
			offset = skb_shinfo(skb)->frags[i].page_offset;

			/* For a worse-case estimate we need to factor in
			 * the fragment page offset as this will affect the
			 * number of times xenvif_gop_frag_copy() will
			 * call start_new_rx_buffer().
			 */
			max_slots_needed += DIV_ROUND_UP(offset + size,
							 PAGE_SIZE);
		}

		/* To avoid the estimate becoming too pessimal for some
		 * frontends that limit posted rx requests, cap the estimate
		 * at MAX_SKB_FRAGS. In this case netback will fully coalesce
		 * the skb into the provided slots.
		 */
		if (max_slots_needed > MAX_SKB_FRAGS) {
			max_slots_needed = MAX_SKB_FRAGS;
			XENVIF_RX_CB(skb)->full_coalesce = true;
		} else {
			XENVIF_RX_CB(skb)->full_coalesce = false;
		}

		/* We may need one more slot for GSO metadata */
		if (skb_is_gso(skb) &&
		   (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4 ||
		    skb_shinfo(skb)->gso_type & SKB_GSO_TCPV6))
			max_slots_needed++;

		/* If the skb may not fit then bail out now */
		if (!xenvif_rx_ring_slots_available(queue, max_slots_needed)) {
			skb_queue_head(&queue->rx_queue, skb);
			need_to_notify = true;
			queue->rx_last_skb_slots = max_slots_needed;
			break;
		} else
			queue->rx_last_skb_slots = 0;

		old_req_cons = queue->rx.req_cons;
		XENVIF_RX_CB(skb)->meta_slots_used = xenvif_gop_skb(skb, &npo, queue);
		ring_slots_used = queue->rx.req_cons - old_req_cons;

		BUG_ON(ring_slots_used > max_slots_needed);

		__skb_queue_tail(&rxq, skb);
	}

	BUG_ON(npo.meta_prod > ARRAY_SIZE(queue->meta));

	if (!npo.copy_prod)
		goto done;

	BUG_ON(npo.copy_prod > MAX_GRANT_COPY_OPS);
	gnttab_batch_copy(queue->grant_copy_op, npo.copy_prod);

	while ((skb = __skb_dequeue(&rxq)) != NULL) {

		if ((1 << queue->meta[npo.meta_cons].gso_type) &
		    queue->vif->gso_prefix_mask) {
			resp = RING_GET_RESPONSE(&queue->rx,
						 queue->rx.rsp_prod_pvt++);

			resp->flags = XEN_NETRXF_gso_prefix | XEN_NETRXF_more_data;

			resp->offset = queue->meta[npo.meta_cons].gso_size;
			resp->id = queue->meta[npo.meta_cons].id;
			resp->status = XENVIF_RX_CB(skb)->meta_slots_used;

			npo.meta_cons++;
			XENVIF_RX_CB(skb)->meta_slots_used--;
		}


		queue->stats.tx_bytes += skb->len;
		queue->stats.tx_packets++;

		status = xenvif_check_gop(queue->vif,
					  XENVIF_RX_CB(skb)->meta_slots_used,
					  &npo);

		if (XENVIF_RX_CB(skb)->meta_slots_used == 1)
			flags = 0;
		else
			flags = XEN_NETRXF_more_data;

		if (skb->ip_summed == CHECKSUM_PARTIAL) /* local packet? */
			flags |= XEN_NETRXF_csum_blank | XEN_NETRXF_data_validated;
		else if (skb->ip_summed == CHECKSUM_UNNECESSARY)
			/* remote but checksummed. */
			flags |= XEN_NETRXF_data_validated;

		offset = 0;
		resp = make_rx_response(queue, queue->meta[npo.meta_cons].id,
					status, offset,
					queue->meta[npo.meta_cons].size,
					flags);

		if ((1 << queue->meta[npo.meta_cons].gso_type) &
		    queue->vif->gso_mask) {
			struct xen_netif_extra_info *gso =
				(struct xen_netif_extra_info *)
				RING_GET_RESPONSE(&queue->rx,
						  queue->rx.rsp_prod_pvt++);

			resp->flags |= XEN_NETRXF_extra_info;

			gso->u.gso.type = queue->meta[npo.meta_cons].gso_type;
			gso->u.gso.size = queue->meta[npo.meta_cons].gso_size;
			gso->u.gso.pad = 0;
			gso->u.gso.features = 0;

			gso->type = XEN_NETIF_EXTRA_TYPE_GSO;
			gso->flags = 0;
		}

		xenvif_add_frag_responses(queue, status,
					  queue->meta + npo.meta_cons + 1,
					  XENVIF_RX_CB(skb)->meta_slots_used);

		RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&queue->rx, ret);

		need_to_notify |= !!ret;

		npo.meta_cons += XENVIF_RX_CB(skb)->meta_slots_used;
		dev_kfree_skb(skb);
	}

done:
	if (need_to_notify)
		notify_remote_via_irq(queue->rx_irq);
}

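/* RING_FINAL_CHECK_FOR_REQUESTS re-enables frontend notifications; if
 * requests slipped in meanwhile, reschedule NAPI rather than waiting
 * for the next interrupt.
 */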
void xenvif_napi_schedule_or_enable_events(struct xenvif_queue *queue)
{
	int more_to_do;

	RING_FINAL_CHECK_FOR_REQUESTS(&queue->tx, more_to_do);

	if (more_to_do)
		napi_schedule(&queue->napi);
}

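/* Replenish the queue's transmit credit, allowing a burst big enough for
 * the next request (capped at 128kB) but never less than credit_bytes.
 */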
static void tx_add_credit(struct xenvif_queue *queue)
{
	unsigned long max_burst, max_credit;

	/*
	 * Allow a burst big enough to transmit a jumbo packet of up to 128kB.
	 * Otherwise the interface can seize up due to insufficient credit.
	 */
	max_burst = RING_GET_REQUEST(&queue->tx, queue->tx.req_cons)->size;
	max_burst = min(max_burst, 131072UL);
	max_burst = max(max_burst, queue->credit_bytes);

	/* Take care that adding a new chunk of credit doesn't wrap to zero. */
	max_credit = queue->remaining_credit + queue->credit_bytes;
	if (max_credit < queue->remaining_credit)
		max_credit = ULONG_MAX; /* wrapped: clamp to ULONG_MAX */

	queue->remaining_credit = min(max_credit, max_burst);
}

static void tx_credit_callback(unsigned long data)
{
	struct xenvif_queue *queue = (struct xenvif_queue *)data;
	tx_add_credit(queue);
	xenvif_napi_schedule_or_enable_events(queue);
}

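/* Complete every tx request from *txp up to ring index 'end' with an
 * error response, consuming those requests.
 */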
static void xenvif_tx_err(struct xenvif_queue *queue,
			  struct xen_netif_tx_request *txp, RING_IDX end)
{
	RING_IDX cons = queue->tx.req_cons;
	unsigned long flags;

	do {
		spin_lock_irqsave(&queue->response_lock, flags);
		make_tx_response(queue, txp, XEN_NETIF_RSP_ERROR);
		spin_unlock_irqrestore(&queue->response_lock, flags);
		if (cons == end)
			break;
		txp = RING_GET_REQUEST(&queue->tx, cons++);
	} while (1);
	queue->tx.req_cons = cons;
}

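/* The frontend violated the protocol: log it, mark the vif disabled and
 * kick queue 0's kthread so it notices the disabled state.
 */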
static void xenvif_fatal_tx_err(struct xenvif *vif)
{
	netdev_err(vif->dev, "fatal error; disabling device\n");
	vif->disabled = true;
	/* Disable the vif from queue 0's kthread */
	if (vif->queues)
		xenvif_kick_thread(&vif->queues[0]);
}

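/* Walk the chain of tx requests following 'first', copying them into txp[].
 * Returns the number of extra slots used, or a negative error (fatal
 * errors also disable the vif).
 */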
static int xenvif_count_requests(struct xenvif_queue *queue,
				 struct xen_netif_tx_request *first,
				 struct xen_netif_tx_request *txp,
				 int work_to_do)
{
	RING_IDX cons = queue->tx.req_cons;
	int slots = 0;
	int drop_err = 0;
	int more_data;

	if (!(first->flags & XEN_NETTXF_more_data))
		return 0;

	do {
		struct xen_netif_tx_request dropped_tx = { 0 };

		if (slots >= work_to_do) {
			netdev_err(queue->vif->dev,
				   "Asked for %d slots but exceeds this limit\n",
				   work_to_do);
			xenvif_fatal_tx_err(queue->vif);
			return -ENODATA;
		}

		/* This guest is really using too many slots and
		 * considered malicious.
		 */
		if (unlikely(slots >= fatal_skb_slots)) {
			netdev_err(queue->vif->dev,
				   "Malicious frontend using %d slots, threshold %u\n",
				   slots, fatal_skb_slots);
			xenvif_fatal_tx_err(queue->vif);
			return -E2BIG;
		}

		/* Xen network protocol had implicit dependency on
		 * MAX_SKB_FRAGS. XEN_NETBK_LEGACY_SLOTS_MAX is set to
		 * the historical MAX_SKB_FRAGS value 18 to honor the
		 * same behavior as before. Any packet using more than
		 * 18 slots but less than fatal_skb_slots slots is
		 * dropped.
		 */
		if (!drop_err && slots >= XEN_NETBK_LEGACY_SLOTS_MAX) {
			if (net_ratelimit())
				netdev_dbg(queue->vif->dev,
					   "Too many slots (%d) exceeding limit (%d), dropping packet\n",
					   slots, XEN_NETBK_LEGACY_SLOTS_MAX);
			drop_err = -E2BIG;
		}

		if (drop_err)
			txp = &dropped_tx;

		memcpy(txp, RING_GET_REQUEST(&queue->tx, cons + slots),
		       sizeof(*txp));

		/* If the guest submitted a frame >= 64 KiB then
		 * first->size overflowed and following slots will
		 * appear to be larger than the frame.
		 *
		 * This cannot be a fatal error as there are buggy
		 * frontends that do this.
		 *
		 * Consume all slots and drop the packet.
		 */
		if (!drop_err && txp->size > first->size) {
			if (net_ratelimit())
				netdev_dbg(queue->vif->dev,
					   "Invalid tx request, slot size %u > remaining size %u\n",
					   txp->size, first->size);
			drop_err = -EIO;
		}

		first->size -= txp->size;
		slots++;

		if (unlikely((txp->offset + txp->size) > PAGE_SIZE)) {
			netdev_err(queue->vif->dev, "Cross page boundary, txp->offset: %x, size: %u\n",
				 txp->offset, txp->size);
			xenvif_fatal_tx_err(queue->vif);
			return -EINVAL;
		}

		more_data = txp->flags & XEN_NETTXF_more_data;

		if (!drop_err)
			txp++;

	} while (more_data);

	if (drop_err) {
		xenvif_tx_err(queue, first, cons + slots);
		return drop_err;
	}

	return slots;
}


struct xenvif_tx_cb {
	u16 pending_idx;
};

#define XENVIF_TX_CB(skb) ((struct xenvif_tx_cb *)(skb)->cb)

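/* Fill in one grant map operation for a tx request and stash the request
 * in pending_tx_info so a response can be sent when the slot is released.
 */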
static inline void xenvif_tx_create_map_op(struct xenvif_queue *queue,
					  u16 pending_idx,
					  struct xen_netif_tx_request *txp,
					  struct gnttab_map_grant_ref *mop)
{
	queue->pages_to_map[mop-queue->tx_map_ops] = queue->mmap_pages[pending_idx];
	gnttab_set_map_op(mop, idx_to_kaddr(queue, pending_idx),
			  GNTMAP_host_map | GNTMAP_readonly,
			  txp->gref, queue->vif->domid);

	memcpy(&queue->pending_tx_info[pending_idx].req, txp,
	       sizeof(*txp));
}

static inline struct sk_buff *xenvif_alloc_skb(unsigned int size)
{
	struct sk_buff *skb =
		alloc_skb(size + NET_SKB_PAD + NET_IP_ALIGN,
			  GFP_ATOMIC | __GFP_NOWARN);
	if (unlikely(skb == NULL))
		return NULL;

	/* Packets passed to netif_rx() must have some headroom. */
	skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN);

	/* Initialize it here to avoid later surprises */
	skb_shinfo(skb)->destructor_arg = NULL;

	return skb;
}

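/* Create grant map ops for every slot beyond the linear header; slots that
 * do not fit in MAX_SKB_FRAGS are parked on a frag_list skb. Returns a
 * pointer past the last map op used, or NULL if allocation fails.
 */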
static struct gnttab_map_grant_ref *xenvif_get_requests(struct xenvif_queue *queue,
							struct sk_buff *skb,
							struct xen_netif_tx_request *txp,
							struct gnttab_map_grant_ref *gop)
{
	struct skb_shared_info *shinfo = skb_shinfo(skb);
	skb_frag_t *frags = shinfo->frags;
	u16 pending_idx = XENVIF_TX_CB(skb)->pending_idx;
	int start;
	pending_ring_idx_t index;
	unsigned int nr_slots, frag_overflow = 0;

	/* At this point shinfo->nr_frags is in fact the number of
	 * slots, which can be as large as XEN_NETBK_LEGACY_SLOTS_MAX.
	 */
	if (shinfo->nr_frags > MAX_SKB_FRAGS) {
		frag_overflow = shinfo->nr_frags - MAX_SKB_FRAGS;
		BUG_ON(frag_overflow > MAX_SKB_FRAGS);
		shinfo->nr_frags = MAX_SKB_FRAGS;
	}
	nr_slots = shinfo->nr_frags;

	/* Skip first skb fragment if it is on same page as header fragment. */
	start = (frag_get_pending_idx(&shinfo->frags[0]) == pending_idx);

	for (shinfo->nr_frags = start; shinfo->nr_frags < nr_slots;
	     shinfo->nr_frags++, txp++, gop++) {
		index = pending_index(queue->pending_cons++);
		pending_idx = queue->pending_ring[index];
		xenvif_tx_create_map_op(queue, pending_idx, txp, gop);
		frag_set_pending_idx(&frags[shinfo->nr_frags], pending_idx);
	}

	if (frag_overflow) {
		struct sk_buff *nskb = xenvif_alloc_skb(0);
		if (unlikely(nskb == NULL)) {
			if (net_ratelimit())
				netdev_err(queue->vif->dev,
					   "Can't allocate the frag_list skb.\n");
			return NULL;
		}

		shinfo = skb_shinfo(nskb);
		frags = shinfo->frags;

		for (shinfo->nr_frags = 0; shinfo->nr_frags < frag_overflow;
		     shinfo->nr_frags++, txp++, gop++) {
			index = pending_index(queue->pending_cons++);
			pending_idx = queue->pending_ring[index];
			xenvif_tx_create_map_op(queue, pending_idx, txp, gop);
			frag_set_pending_idx(&frags[shinfo->nr_frags],
					     pending_idx);
		}

		skb_shinfo(skb)->frag_list = nskb;
	}

	return gop;
}

static inline void xenvif_grant_handle_set(struct xenvif_queue *queue,
					   u16 pending_idx,
					   grant_handle_t handle)
{
	if (unlikely(queue->grant_tx_handle[pending_idx] !=
		     NETBACK_INVALID_HANDLE)) {
		netdev_err(queue->vif->dev,
			   "Trying to overwrite active handle! pending_idx: %x\n",
			   pending_idx);
		BUG();
	}
	queue->grant_tx_handle[pending_idx] = handle;
}

static inline void xenvif_grant_handle_reset(struct xenvif_queue *queue,
					     u16 pending_idx)
{
	if (unlikely(queue->grant_tx_handle[pending_idx] ==
		     NETBACK_INVALID_HANDLE)) {
		netdev_err(queue->vif->dev,
			   "Trying to unmap invalid handle! pending_idx: %x\n",
			   pending_idx);
		BUG();
	}
	queue->grant_tx_handle[pending_idx] = NETBACK_INVALID_HANDLE;
}

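/* Check the result of the header grant copy and of each fragment's grant
 * map, releasing or invalidating slots as needed. Returns 0 on success or
 * the first error encountered.
 */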
static int xenvif_tx_check_gop(struct xenvif_queue *queue,
			       struct sk_buff *skb,
			       struct gnttab_map_grant_ref **gopp_map,
			       struct gnttab_copy **gopp_copy)
{
	struct gnttab_map_grant_ref *gop_map = *gopp_map;
	u16 pending_idx = XENVIF_TX_CB(skb)->pending_idx;
	/* This always points to the shinfo of the skb being checked, which
	 * could be either the first or the one on the frag_list
	 */
	struct skb_shared_info *shinfo = skb_shinfo(skb);
	/* If this is non-NULL, we are currently checking the frag_list skb, and
	 * this points to the shinfo of the first one
	 */
	struct skb_shared_info *first_shinfo = NULL;
	int nr_frags = shinfo->nr_frags;
	const bool sharedslot = nr_frags &&
				frag_get_pending_idx(&shinfo->frags[0]) == pending_idx;
	int i, err;

	/* Check status of header. */
	err = (*gopp_copy)->status;
	if (unlikely(err)) {
		if (net_ratelimit())
			netdev_dbg(queue->vif->dev,
				   "Grant copy of header failed! status: %d pending_idx: %u ref: %u\n",
				   (*gopp_copy)->status,
				   pending_idx,
				   (*gopp_copy)->source.u.ref);
		/* The first frag might still have this slot mapped */
		if (!sharedslot)
			xenvif_idx_release(queue, pending_idx,
					   XEN_NETIF_RSP_ERROR);
	}
	(*gopp_copy)++;

check_frags:
	for (i = 0; i < nr_frags; i++, gop_map++) {
		int j, newerr;

		pending_idx = frag_get_pending_idx(&shinfo->frags[i]);

		/* Check error status: if okay then remember grant handle. */
		newerr = gop_map->status;

		if (likely(!newerr)) {
			xenvif_grant_handle_set(queue,
						pending_idx,
						gop_map->handle);
			/* Had a previous error? Invalidate this fragment. */
			if (unlikely(err)) {
				xenvif_idx_unmap(queue, pending_idx);
				/* If the mapping of the first frag was OK, but
				 * the header's copy failed, and they are
				 * sharing a slot, send an error
				 */
				if (i == 0 && sharedslot)
					xenvif_idx_release(queue, pending_idx,
							   XEN_NETIF_RSP_ERROR);
				else
					xenvif_idx_release(queue, pending_idx,
							   XEN_NETIF_RSP_OKAY);
			}
			continue;
		}

		/* Error on this fragment: respond to client with an error. */
		if (net_ratelimit())
			netdev_dbg(queue->vif->dev,
				   "Grant map of %d. frag failed! status: %d pending_idx: %u ref: %u\n",
				   i,
				   gop_map->status,
				   pending_idx,
				   gop_map->ref);

		xenvif_idx_release(queue, pending_idx, XEN_NETIF_RSP_ERROR);

		/* Not the first error? Preceding frags already invalidated. */
		if (err)
			continue;

		/* First error: if the header hasn't shared a slot with the
		 * first frag, release it as well.
		 */
		if (!sharedslot)
			xenvif_idx_release(queue,
					   XENVIF_TX_CB(skb)->pending_idx,
					   XEN_NETIF_RSP_OKAY);

		/* Invalidate preceding fragments of this skb. */
		for (j = 0; j < i; j++) {
			pending_idx = frag_get_pending_idx(&shinfo->frags[j]);
			xenvif_idx_unmap(queue, pending_idx);
			xenvif_idx_release(queue, pending_idx,
					   XEN_NETIF_RSP_OKAY);
		}

		/* And if we found the error while checking the frag_list, unmap
		 * the first skb's frags
		 */
		if (first_shinfo) {
			for (j = 0; j < first_shinfo->nr_frags; j++) {
				pending_idx = frag_get_pending_idx(&first_shinfo->frags[j]);
				xenvif_idx_unmap(queue, pending_idx);
				xenvif_idx_release(queue, pending_idx,
						   XEN_NETIF_RSP_OKAY);
			}
		}

		/* Remember the error: invalidate all subsequent fragments. */
		err = newerr;
	}

	if (skb_has_frag_list(skb) && !first_shinfo) {
		first_shinfo = skb_shinfo(skb);
		shinfo = skb_shinfo(skb_shinfo(skb)->frag_list);
		nr_frags = shinfo->nr_frags;

		goto check_frags;
	}

	*gopp_map = gop_map;
	return err;
}

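/* Turn the pending_idx placeholders in each frag into real page descriptors
 * and chain the zerocopy callback structures together.
 */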
static void xenvif_fill_frags(struct xenvif_queue *queue, struct sk_buff *skb)
{
	struct skb_shared_info *shinfo = skb_shinfo(skb);
	int nr_frags = shinfo->nr_frags;
	int i;
	u16 prev_pending_idx = INVALID_PENDING_IDX;

	for (i = 0; i < nr_frags; i++) {
		skb_frag_t *frag = shinfo->frags + i;
		struct xen_netif_tx_request *txp;
		struct page *page;
		u16 pending_idx;

		pending_idx = frag_get_pending_idx(frag);

		/* If this is not the first frag, chain it to the previous */
		if (prev_pending_idx == INVALID_PENDING_IDX)
			skb_shinfo(skb)->destructor_arg =
				&callback_param(queue, pending_idx);
		else
			callback_param(queue, prev_pending_idx).ctx =
				&callback_param(queue, pending_idx);

		callback_param(queue, pending_idx).ctx = NULL;
		prev_pending_idx = pending_idx;

		txp = &queue->pending_tx_info[pending_idx].req;
		page = virt_to_page(idx_to_kaddr(queue, pending_idx));
		__skb_fill_page_desc(skb, i, page, txp->offset, txp->size);
		skb->len += txp->size;
		skb->data_len += txp->size;
		skb->truesize += txp->size;

		/* Take an extra reference to offset network stack's put_page */
		get_page(queue->mmap_pages[pending_idx]);
	}
	/* FIXME: __skb_fill_page_desc set this to true because page->pfmemalloc
	 * overlaps with "index", and "mapping" is not set. I think mapping
	 * should be set. If delivered to local stack, it would drop this
	 * skb in sk_filter unless the socket has the right to use it.
	 */
	skb->pfmemalloc	= false;
}

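/* Consume the extra-info requests that follow a tx request, storing them
 * by type. Returns the remaining work_to_do or a negative (fatal) error.
 */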
static int xenvif_get_extras(struct xenvif_queue *queue,
				struct xen_netif_extra_info *extras,
				int work_to_do)
{
	struct xen_netif_extra_info extra;
	RING_IDX cons = queue->tx.req_cons;

	do {
		if (unlikely(work_to_do-- <= 0)) {
			netdev_err(queue->vif->dev, "Missing extra info\n");
			xenvif_fatal_tx_err(queue->vif);
			return -EBADR;
		}

		memcpy(&extra, RING_GET_REQUEST(&queue->tx, cons),
		       sizeof(extra));
		if (unlikely(!extra.type ||
			     extra.type >= XEN_NETIF_EXTRA_TYPE_MAX)) {
			queue->tx.req_cons = ++cons;
			netdev_err(queue->vif->dev,
				   "Invalid extra type: %d\n", extra.type);
			xenvif_fatal_tx_err(queue->vif);
			return -EINVAL;
		}

		memcpy(&extras[extra.type - 1], &extra, sizeof(extra));
		queue->tx.req_cons = ++cons;
	} while (extra.flags & XEN_NETIF_EXTRA_FLAG_MORE);

	return work_to_do;
}

static int xenvif_set_skb_gso(struct xenvif *vif,
			      struct sk_buff *skb,
			      struct xen_netif_extra_info *gso)
{
	if (!gso->u.gso.size) {
		netdev_err(vif->dev, "GSO size must not be zero.\n");
		xenvif_fatal_tx_err(vif);
		return -EINVAL;
	}

	switch (gso->u.gso.type) {
	case XEN_NETIF_GSO_TYPE_TCPV4:
		skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;
		break;
	case XEN_NETIF_GSO_TYPE_TCPV6:
		skb_shinfo(skb)->gso_type = SKB_GSO_TCPV6;
		break;
	default:
		netdev_err(vif->dev, "Bad GSO type %d.\n", gso->u.gso.type);
		xenvif_fatal_tx_err(vif);
		return -EINVAL;
	}

	skb_shinfo(skb)->gso_size = gso->u.gso.size;
	/* gso_segs will be calculated later */

	return 0;
}

static int checksum_setup(struct xenvif_queue *queue, struct sk_buff *skb)
{
	bool recalculate_partial_csum = false;

	/* A GSO SKB must be CHECKSUM_PARTIAL. However some buggy
	 * peers can fail to set NETRXF_csum_blank when sending a GSO
	 * frame. In this case force the SKB to CHECKSUM_PARTIAL and
	 * recalculate the partial checksum.
	 */
	if (skb->ip_summed != CHECKSUM_PARTIAL && skb_is_gso(skb)) {
		queue->stats.rx_gso_checksum_fixup++;
		skb->ip_summed = CHECKSUM_PARTIAL;
		recalculate_partial_csum = true;
	}

	/* A non-CHECKSUM_PARTIAL SKB does not require setup. */
	if (skb->ip_summed != CHECKSUM_PARTIAL)
		return 0;

	return skb_checksum_setup(skb, recalculate_partial_csum);
}

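/* Returns true if sending 'size' bytes now would exceed the remaining
 * credit; in that case the credit timer is armed so transmission resumes
 * once the window is replenished.
 */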
static bool tx_credit_exceeded(struct xenvif_queue *queue, unsigned size)
{
	u64 now = get_jiffies_64();
	u64 next_credit = queue->credit_window_start +
		msecs_to_jiffies(queue->credit_usec / 1000);

	/* Timer could already be pending in rare cases. */
	if (timer_pending(&queue->credit_timeout))
		return true;

	/* Passed the point where we can replenish credit? */
	if (time_after_eq64(now, next_credit)) {
		queue->credit_window_start = now;
		tx_add_credit(queue);
	}

	/* Still too big to send right now? Set a callback. */
	if (size > queue->remaining_credit) {
		queue->credit_timeout.data     =
			(unsigned long)queue;
		queue->credit_timeout.function =
			tx_credit_callback;
		mod_timer(&queue->credit_timeout,
			  next_credit);
		queue->credit_window_start = next_credit;

		return true;
	}

	return false;
}

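/* Pull tx requests off the shared ring (subject to the credit limit) and
 * turn them into grant copy ops for the packet headers plus grant map ops
 * for the remaining payload slots.
 */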
static void xenvif_tx_build_gops(struct xenvif_queue *queue,
				     int budget,
				     unsigned *copy_ops,
				     unsigned *map_ops)
{
	struct gnttab_map_grant_ref *gop = queue->tx_map_ops, *request_gop;
	struct sk_buff *skb;
	int ret;

	while (skb_queue_len(&queue->tx_queue) < budget) {
		struct xen_netif_tx_request txreq;
		struct xen_netif_tx_request txfrags[XEN_NETBK_LEGACY_SLOTS_MAX];
		struct xen_netif_extra_info extras[XEN_NETIF_EXTRA_TYPE_MAX-1];
		u16 pending_idx;
		RING_IDX idx;
		int work_to_do;
		unsigned int data_len;
		pending_ring_idx_t index;

		if (queue->tx.sring->req_prod - queue->tx.req_cons >
		    XEN_NETIF_TX_RING_SIZE) {
			netdev_err(queue->vif->dev,
				   "Impossible number of requests. "
				   "req_prod %d, req_cons %d, size %ld\n",
				   queue->tx.sring->req_prod, queue->tx.req_cons,
				   XEN_NETIF_TX_RING_SIZE);
			xenvif_fatal_tx_err(queue->vif);
			break;
		}

		work_to_do = RING_HAS_UNCONSUMED_REQUESTS(&queue->tx);
		if (!work_to_do)
			break;

		idx = queue->tx.req_cons;
		rmb(); /* Ensure that we see the request before we copy it. */
		memcpy(&txreq, RING_GET_REQUEST(&queue->tx, idx), sizeof(txreq));

		/* Credit-based scheduling. */
		if (txreq.size > queue->remaining_credit &&
		    tx_credit_exceeded(queue, txreq.size))
			break;

		queue->remaining_credit -= txreq.size;

		work_to_do--;
		queue->tx.req_cons = ++idx;

		memset(extras, 0, sizeof(extras));
		if (txreq.flags & XEN_NETTXF_extra_info) {
			work_to_do = xenvif_get_extras(queue, extras,
						       work_to_do);
			idx = queue->tx.req_cons;
			if (unlikely(work_to_do < 0))
				break;
		}

		ret = xenvif_count_requests(queue, &txreq, txfrags, work_to_do);
		if (unlikely(ret < 0))
			break;

		idx += ret;

		if (unlikely(txreq.size < ETH_HLEN)) {
			netdev_dbg(queue->vif->dev,
				   "Bad packet size: %d\n", txreq.size);
			xenvif_tx_err(queue, &txreq, idx);
			break;
		}

		/* No crossing a page as the payload mustn't fragment. */
		if (unlikely((txreq.offset + txreq.size) > PAGE_SIZE)) {
			netdev_err(queue->vif->dev,
				   "txreq.offset: %x, size: %u, end: %lu\n",
				   txreq.offset, txreq.size,
				   (txreq.offset&~PAGE_MASK) + txreq.size);
			xenvif_fatal_tx_err(queue->vif);
			break;
		}

		index = pending_index(queue->pending_cons);
		pending_idx = queue->pending_ring[index];

		data_len = (txreq.size > PKT_PROT_LEN &&
			    ret < XEN_NETBK_LEGACY_SLOTS_MAX) ?
			PKT_PROT_LEN : txreq.size;

		skb = xenvif_alloc_skb(data_len);
		if (unlikely(skb == NULL)) {
			netdev_dbg(queue->vif->dev,
				   "Can't allocate a skb in start_xmit.\n");
			xenvif_tx_err(queue, &txreq, idx);
			break;
		}

		if (extras[XEN_NETIF_EXTRA_TYPE_GSO - 1].type) {
			struct xen_netif_extra_info *gso;
			gso = &extras[XEN_NETIF_EXTRA_TYPE_GSO - 1];

			if (xenvif_set_skb_gso(queue->vif, skb, gso)) {
				/* Failure in xenvif_set_skb_gso is fatal. */
				kfree_skb(skb);
				break;
			}
		}

		XENVIF_TX_CB(skb)->pending_idx = pending_idx;

		__skb_put(skb, data_len);
		queue->tx_copy_ops[*copy_ops].source.u.ref = txreq.gref;
		queue->tx_copy_ops[*copy_ops].source.domid = queue->vif->domid;
		queue->tx_copy_ops[*copy_ops].source.offset = txreq.offset;

		queue->tx_copy_ops[*copy_ops].dest.u.gmfn =
			virt_to_mfn(skb->data);
		queue->tx_copy_ops[*copy_ops].dest.domid = DOMID_SELF;
		queue->tx_copy_ops[*copy_ops].dest.offset =
			offset_in_page(skb->data);

		queue->tx_copy_ops[*copy_ops].len = data_len;
		queue->tx_copy_ops[*copy_ops].flags = GNTCOPY_source_gref;

		(*copy_ops)++;

		skb_shinfo(skb)->nr_frags = ret;
		if (data_len < txreq.size) {
			skb_shinfo(skb)->nr_frags++;
			frag_set_pending_idx(&skb_shinfo(skb)->frags[0],
					     pending_idx);
			xenvif_tx_create_map_op(queue, pending_idx, &txreq, gop);
			gop++;
		} else {
			frag_set_pending_idx(&skb_shinfo(skb)->frags[0],
					     INVALID_PENDING_IDX);
			memcpy(&queue->pending_tx_info[pending_idx].req, &txreq,
			       sizeof(txreq));
		}

		queue->pending_cons++;

		request_gop = xenvif_get_requests(queue, skb, txfrags, gop);
		if (request_gop == NULL) {
			kfree_skb(skb);
			xenvif_tx_err(queue, &txreq, idx);
			break;
		}
		gop = request_gop;

		__skb_queue_tail(&queue->tx_queue, skb);

		queue->tx.req_cons = idx;

		if (((gop-queue->tx_map_ops) >= ARRAY_SIZE(queue->tx_map_ops)) ||
		    (*copy_ops >= ARRAY_SIZE(queue->tx_copy_ops)))
			break;
	}

	(*map_ops) = gop - queue->tx_map_ops;
	return;
}

/* Consolidate skb with a frag_list into a brand new one with local pages on
 * frags. Returns 0 or -ENOMEM if can't allocate new pages.
 */
static int xenvif_handle_frag_list(struct xenvif_queue *queue, struct sk_buff *skb)
{
	unsigned int offset = skb_headlen(skb);
	skb_frag_t frags[MAX_SKB_FRAGS];
	int i;
	struct ubuf_info *uarg;
	struct sk_buff *nskb = skb_shinfo(skb)->frag_list;

	queue->stats.tx_zerocopy_sent += 2;
	queue->stats.tx_frag_overflow++;

	xenvif_fill_frags(queue, nskb);
	/* Subtract frags size, we will correct it later */
	skb->truesize -= skb->data_len;
	skb->len += nskb->len;
	skb->data_len += nskb->len;

	/* create a brand new frags array and coalesce there */
	for (i = 0; offset < skb->len; i++) {
		struct page *page;
		unsigned int len;

		BUG_ON(i >= MAX_SKB_FRAGS);
		page = alloc_page(GFP_ATOMIC|__GFP_COLD);
		if (!page) {
			int j;
			skb->truesize += skb->data_len;
			for (j = 0; j < i; j++)
				put_page(frags[j].page.p);
			return -ENOMEM;
		}

		if (offset + PAGE_SIZE < skb->len)
			len = PAGE_SIZE;
		else
			len = skb->len - offset;
		if (skb_copy_bits(skb, offset, page_address(page), len))
			BUG();

		offset += len;
		frags[i].page.p = page;
		frags[i].page_offset = 0;
		skb_frag_size_set(&frags[i], len);
	}
	/* swap out with old one */
	memcpy(skb_shinfo(skb)->frags,
	       frags,
	       i * sizeof(skb_frag_t));
	skb_shinfo(skb)->nr_frags = i;
	skb->truesize += i * PAGE_SIZE;

	/* remove traces of mapped pages and frag_list */
	skb_frag_list_init(skb);
	uarg = skb_shinfo(skb)->destructor_arg;
	uarg->callback(uarg, true);
	skb_shinfo(skb)->destructor_arg = NULL;

	skb_shinfo(nskb)->tx_flags |= SKBTX_DEV_ZEROCOPY;
	kfree_skb(nskb);

	return 0;
}

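/* Second half of tx processing: once the copy/map batch has completed,
 * check the result for each queued skb, fix up checksum and GSO state and
 * pass the packet on to the network stack.
 */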
static int xenvif_tx_submit(struct xenvif_queue *queue)
{
	struct gnttab_map_grant_ref *gop_map = queue->tx_map_ops;
	struct gnttab_copy *gop_copy = queue->tx_copy_ops;
	struct sk_buff *skb;
	int work_done = 0;

	while ((skb = __skb_dequeue(&queue->tx_queue)) != NULL) {
		struct xen_netif_tx_request *txp;
		u16 pending_idx;
		unsigned data_len;

		pending_idx = XENVIF_TX_CB(skb)->pending_idx;
		txp = &queue->pending_tx_info[pending_idx].req;

		/* Check the remap error code. */
		if (unlikely(xenvif_tx_check_gop(queue, skb, &gop_map, &gop_copy))) {
			/* If there was an error, xenvif_tx_check_gop is
			 * expected to release all the frags which were mapped,
			 * so kfree_skb shouldn't do it again
			 */
			skb_shinfo(skb)->nr_frags = 0;
			if (skb_has_frag_list(skb)) {
				struct sk_buff *nskb =
						skb_shinfo(skb)->frag_list;
				skb_shinfo(nskb)->nr_frags = 0;
			}
			kfree_skb(skb);
			continue;
		}

		data_len = skb->len;
		callback_param(queue, pending_idx).ctx = NULL;
		if (data_len < txp->size) {
			/* Append the packet payload as a fragment. */
			txp->offset += data_len;
			txp->size -= data_len;
		} else {
			/* Schedule a response immediately. */
			xenvif_idx_release(queue, pending_idx,
					   XEN_NETIF_RSP_OKAY);
		}

		if (txp->flags & XEN_NETTXF_csum_blank)
			skb->ip_summed = CHECKSUM_PARTIAL;
		else if (txp->flags & XEN_NETTXF_data_validated)
			skb->ip_summed = CHECKSUM_UNNECESSARY;

		xenvif_fill_frags(queue, skb);

		if (unlikely(skb_has_frag_list(skb))) {
			if (xenvif_handle_frag_list(queue, skb)) {
				if (net_ratelimit())
					netdev_err(queue->vif->dev,
						   "Not enough memory to consolidate frag_list!\n");
				skb_shinfo(skb)->tx_flags |= SKBTX_DEV_ZEROCOPY;
				kfree_skb(skb);
				continue;
			}
		}

		if (skb_is_nonlinear(skb) && skb_headlen(skb) < PKT_PROT_LEN) {
			int target = min_t(int, skb->len, PKT_PROT_LEN);
			__pskb_pull_tail(skb, target - skb_headlen(skb));
		}

		skb->dev      = queue->vif->dev;
		skb->protocol = eth_type_trans(skb, skb->dev);
		skb_reset_network_header(skb);

		if (checksum_setup(queue, skb)) {
			netdev_dbg(queue->vif->dev,
				   "Can't setup checksum in net_tx_action\n");
			/* We have to set this flag to trigger the callback */
			if (skb_shinfo(skb)->destructor_arg)
				skb_shinfo(skb)->tx_flags |= SKBTX_DEV_ZEROCOPY;
			kfree_skb(skb);
			continue;
		}

		skb_probe_transport_header(skb, 0);

		/* If the packet is GSO then we will have just set up the
		 * transport header offset in checksum_setup so it's now
		 * straightforward to calculate gso_segs.
		 */
		if (skb_is_gso(skb)) {
			int mss = skb_shinfo(skb)->gso_size;
			int hdrlen = skb_transport_header(skb) -
				skb_mac_header(skb) +
				tcp_hdrlen(skb);

			skb_shinfo(skb)->gso_segs =
				DIV_ROUND_UP(skb->len - hdrlen, mss);
		}

		queue->stats.rx_bytes += skb->len;
		queue->stats.rx_packets++;

		work_done++;

		/* Set this flag right before netif_receive_skb, otherwise
		 * someone might think this packet already left netback, and
		 * do a skb_copy_ubufs while we are still in control of the
		 * skb. E.g. the __pskb_pull_tail earlier can do such thing.
		 */
		if (skb_shinfo(skb)->destructor_arg) {
			skb_shinfo(skb)->tx_flags |= SKBTX_DEV_ZEROCOPY;
			queue->stats.tx_zerocopy_sent++;
		}

		netif_receive_skb(skb);
	}

	return work_done;
}

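/*
 * Callback attached to zerocopy skbs via their ubuf_info: invoked once
 * the stack has finished with the skb's mapped frags.  It queues the
 * corresponding pending indices on the dealloc ring and wakes the
 * dealloc thread, which performs the actual grant unmapping.
 */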
void xenvif_zerocopy_callback(struct ubuf_info *ubuf, bool zerocopy_success)
{
	unsigned long flags;
	pending_ring_idx_t index;
	struct xenvif_queue *queue = ubuf_to_queue(ubuf);

	/* This is the only place where we grab this lock, to protect callbacks
	 * from each other.
	 */
	spin_lock_irqsave(&queue->callback_lock, flags);
	do {
		u16 pending_idx = ubuf->desc;
		ubuf = (struct ubuf_info *) ubuf->ctx;
		BUG_ON(queue->dealloc_prod - queue->dealloc_cons >=
			MAX_PENDING_REQS);
		index = pending_index(queue->dealloc_prod);
		queue->dealloc_ring[index] = pending_idx;
		/* Sync with xenvif_tx_dealloc_action:
		 * insert idx then incr producer.
		 */
		smp_wmb();
		queue->dealloc_prod++;
	} while (ubuf);
	wake_up(&queue->dealloc_wq);
	spin_unlock_irqrestore(&queue->callback_lock, flags);

	if (likely(zerocopy_success))
		queue->stats.tx_zerocopy_success++;
	else
		queue->stats.tx_zerocopy_fail++;
}

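/*
 * Unmap the grants of all pending slots that xenvif_zerocopy_callback()
 * has queued on the dealloc ring, then release those slots, which also
 * sends the TX responses back to the frontend.
 */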
static inline void xenvif_tx_dealloc_action(struct xenvif_queue *queue)
{
	struct gnttab_unmap_grant_ref *gop;
	pending_ring_idx_t dc, dp;
	u16 pending_idx, pending_idx_release[MAX_PENDING_REQS];
	unsigned int i = 0;

	dc = queue->dealloc_cons;
	gop = queue->tx_unmap_ops;

	/* Free up any grants we have finished using */
	do {
		dp = queue->dealloc_prod;

		/* Ensure we see all indices enqueued by all
		 * xenvif_zerocopy_callback().
		 */
		smp_rmb();

		while (dc != dp) {
			BUG_ON(gop - queue->tx_unmap_ops > MAX_PENDING_REQS);
			pending_idx =
				queue->dealloc_ring[pending_index(dc++)];

			pending_idx_release[gop-queue->tx_unmap_ops] =
				pending_idx;
			queue->pages_to_unmap[gop-queue->tx_unmap_ops] =
				queue->mmap_pages[pending_idx];
			gnttab_set_unmap_op(gop,
					    idx_to_kaddr(queue, pending_idx),
					    GNTMAP_host_map,
					    queue->grant_tx_handle[pending_idx]);
			xenvif_grant_handle_reset(queue, pending_idx);
			++gop;
		}

	} while (dp != queue->dealloc_prod);

	queue->dealloc_cons = dc;

	if (gop - queue->tx_unmap_ops > 0) {
		int ret;
		ret = gnttab_unmap_refs(queue->tx_unmap_ops,
					NULL,
					queue->pages_to_unmap,
					gop - queue->tx_unmap_ops);
		if (ret) {
			netdev_err(queue->vif->dev, "Unmap fail: nr_ops %tx ret %d\n",
				   gop - queue->tx_unmap_ops, ret);
			for (i = 0; i < gop - queue->tx_unmap_ops; ++i) {
				if (gop[i].status != GNTST_okay)
					netdev_err(queue->vif->dev,
						   " host_addr: %llx handle: %x status: %d\n",
						   gop[i].host_addr,
						   gop[i].handle,
						   gop[i].status);
			}
			BUG();
		}
	}

	for (i = 0; i < gop - queue->tx_unmap_ops; ++i)
		xenvif_idx_release(queue, pending_idx_release[i],
				   XEN_NETIF_RSP_OKAY);
}

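/*
 * Consume pending requests from the shared TX ring: build and issue the
 * batched grant copy/map operations, then hand the resulting skbs to the
 * stack via xenvif_tx_submit().  Returns the number of packets delivered,
 * limited by @budget.
 */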
/* Called after netfront has transmitted */
int xenvif_tx_action(struct xenvif_queue *queue, int budget)
{
	unsigned nr_mops, nr_cops = 0;
	int work_done, ret;

	if (unlikely(!tx_work_todo(queue)))
		return 0;

	xenvif_tx_build_gops(queue, budget, &nr_cops, &nr_mops);

	if (nr_cops == 0)
		return 0;

	gnttab_batch_copy(queue->tx_copy_ops, nr_cops);
	if (nr_mops != 0) {
		ret = gnttab_map_refs(queue->tx_map_ops,
				      NULL,
				      queue->pages_to_map,
				      nr_mops);
		BUG_ON(ret);
	}

	work_done = xenvif_tx_submit(queue);

	return work_done;
}

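/*
 * Return a pending slot to the pending ring and send the TX response for
 * it to the frontend.  Serialised against other responders with
 * response_lock.
 */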
static void xenvif_idx_release(struct xenvif_queue *queue, u16 pending_idx,
			       u8 status)
{
	struct pending_tx_info *pending_tx_info;
	pending_ring_idx_t index;
	unsigned long flags;

	pending_tx_info = &queue->pending_tx_info[pending_idx];
	spin_lock_irqsave(&queue->response_lock, flags);
	make_tx_response(queue, &pending_tx_info->req, status);
	index = pending_index(queue->pending_prod);
	queue->pending_ring[index] = pending_idx;
	/* TX shouldn't use the index before we give it back here */
	mb();
	queue->pending_prod++;
	spin_unlock_irqrestore(&queue->response_lock, flags);
}

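/*
 * Write a TX response (plus a NULL extra slot if the request carried
 * extra info) on the shared ring and notify the frontend over the TX
 * event channel if it is waiting.
 */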
static void make_tx_response(struct xenvif_queue *queue,
			     struct xen_netif_tx_request *txp,
			     s8       st)
{
	RING_IDX i = queue->tx.rsp_prod_pvt;
	struct xen_netif_tx_response *resp;
	int notify;

	resp = RING_GET_RESPONSE(&queue->tx, i);
	resp->id     = txp->id;
	resp->status = st;

	if (txp->flags & XEN_NETTXF_extra_info)
		RING_GET_RESPONSE(&queue->tx, ++i)->status = XEN_NETIF_RSP_NULL;

	queue->tx.rsp_prod_pvt = ++i;
	RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&queue->tx, notify);
	if (notify)
		notify_remote_via_irq(queue->tx_irq);
}

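/*
 * Write an RX response on the shared ring.  A negative status overrides
 * the size in the status field; pushing the producer index and notifying
 * the frontend are left to the caller.
 */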
static struct xen_netif_rx_response *make_rx_response(struct xenvif_queue *queue,
					     u16      id,
					     s8       st,
					     u16      offset,
					     u16      size,
					     u16      flags)
{
	RING_IDX i = queue->rx.rsp_prod_pvt;
	struct xen_netif_rx_response *resp;

	resp = RING_GET_RESPONSE(&queue->rx, i);
	resp->offset     = offset;
	resp->flags      = flags;
	resp->id         = id;
	resp->status     = (s16)size;
	if (st < 0)
		resp->status = (s16)st;

	queue->rx.rsp_prod_pvt = ++i;

	return resp;
}

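/*
 * Unmap the grant backing a single pending slot synchronously, instead
 * of deferring it to the dealloc thread.
 */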
void xenvif_idx_unmap(struct xenvif_queue *queue, u16 pending_idx)
{
	int ret;
	struct gnttab_unmap_grant_ref tx_unmap_op;

	gnttab_set_unmap_op(&tx_unmap_op,
			    idx_to_kaddr(queue, pending_idx),
			    GNTMAP_host_map,
			    queue->grant_tx_handle[pending_idx]);
	xenvif_grant_handle_reset(queue, pending_idx);

	ret = gnttab_unmap_refs(&tx_unmap_op, NULL,
				&queue->mmap_pages[pending_idx], 1);
	if (ret) {
		netdev_err(queue->vif->dev,
			   "Unmap fail: ret: %d pending_idx: %d host_addr: %llx handle: %x status: %d\n",
			   ret,
			   pending_idx,
			   tx_unmap_op.host_addr,
			   tx_unmap_op.handle,
			   tx_unmap_op.status);
		BUG();
	}
}

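/*
 * Work-pending predicates: rx_work_todo() is the wait condition of the
 * per-queue RX kthread, tx_work_todo() gates xenvif_tx_action(), and
 * tx_dealloc_work_todo() is the wait condition of the dealloc thread.
 */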
static inline int rx_work_todo(struct xenvif_queue *queue)
{
	return (!skb_queue_empty(&queue->rx_queue) &&
	       xenvif_rx_ring_slots_available(queue, queue->rx_last_skb_slots));
}

static inline int tx_work_todo(struct xenvif_queue *queue)
{
	if (likely(RING_HAS_UNCONSUMED_REQUESTS(&queue->tx)))
		return 1;

	return 0;
}

static inline bool tx_dealloc_work_todo(struct xenvif_queue *queue)
{
	return queue->dealloc_cons != queue->dealloc_prod;
}

void xenvif_unmap_frontend_rings(struct xenvif_queue *queue)
{
	if (queue->tx.sring)
		xenbus_unmap_ring_vfree(xenvif_to_xenbus_device(queue->vif),
					queue->tx.sring);
	if (queue->rx.sring)
		xenbus_unmap_ring_vfree(xenvif_to_xenbus_device(queue->vif),
					queue->rx.sring);
}

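/*
 * Map the TX and RX shared rings granted by the frontend and initialise
 * the backend ring state.  On failure any partial mapping is torn down
 * via xenvif_unmap_frontend_rings().
 */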
int xenvif_map_frontend_rings(struct xenvif_queue *queue,
			      grant_ref_t tx_ring_ref,
			      grant_ref_t rx_ring_ref)
{
	void *addr;
	struct xen_netif_tx_sring *txs;
	struct xen_netif_rx_sring *rxs;

	int err = -ENOMEM;

	err = xenbus_map_ring_valloc(xenvif_to_xenbus_device(queue->vif),
				     tx_ring_ref, &addr);
	if (err)
		goto err;

	txs = (struct xen_netif_tx_sring *)addr;
	BACK_RING_INIT(&queue->tx, txs, PAGE_SIZE);

	err = xenbus_map_ring_valloc(xenvif_to_xenbus_device(queue->vif),
				     rx_ring_ref, &addr);
	if (err)
		goto err;

	rxs = (struct xen_netif_rx_sring *)addr;
	BACK_RING_INIT(&queue->rx, rxs, PAGE_SIZE);

	return 0;

err:
	xenvif_unmap_frontend_rings(queue);
	return err;
}

static void xenvif_start_queue(struct xenvif_queue *queue)
{
	if (xenvif_schedulable(queue->vif))
		xenvif_wake_queue(queue);
}

/* Only called from the queue's thread, it handles the situation when the guest
 * doesn't post enough requests on the receiving ring.
 * First xenvif_start_xmit disables QDisc and starts a timer, and then either
 * the timer fires, or the guest sends an interrupt after posting new requests.
 * If it is the timer, the carrier is turned off here.
 */
static void xenvif_rx_purge_event(struct xenvif_queue *queue)
{
	/* Either the last unsuccessful skb or at least 1 slot should fit */
	int needed = queue->rx_last_skb_slots ?
		     queue->rx_last_skb_slots : 1;

	/* It is assumed that if the guest posts new slots after this, the RX
	 * interrupt will set the QUEUE_STATUS_RX_PURGE_EVENT bit and wake up
	 * the thread again
	 */
	set_bit(QUEUE_STATUS_RX_STALLED, &queue->status);
	if (!xenvif_rx_ring_slots_available(queue, needed)) {
		rtnl_lock();
		if (netif_carrier_ok(queue->vif->dev)) {
			/* Timer fired and there are still no slots. Turn off
			 * everything except the interrupts
			 */
			netif_carrier_off(queue->vif->dev);
			skb_queue_purge(&queue->rx_queue);
			queue->rx_last_skb_slots = 0;
			if (net_ratelimit())
				netdev_err(queue->vif->dev, "Carrier off due to lack of guest response on queue %d\n", queue->id);
		} else {
			/* Probably another queue already turned the carrier
			 * off; make sure nothing is stuck in the internal
			 * queue of this queue
			 */
			skb_queue_purge(&queue->rx_queue);
			queue->rx_last_skb_slots = 0;
		}
		rtnl_unlock();
	} else if (!netif_carrier_ok(queue->vif->dev)) {
		unsigned int num_queues = queue->vif->num_queues;
		unsigned int i;
		/* The carrier was down, but an interrupt kicked
		 * the thread again after new requests were
		 * posted
		 */
		clear_bit(QUEUE_STATUS_RX_STALLED,
			  &queue->status);
		rtnl_lock();
		netif_carrier_on(queue->vif->dev);
		netif_tx_wake_all_queues(queue->vif->dev);
		rtnl_unlock();

		for (i = 0; i < num_queues; i++) {
			struct xenvif_queue *temp = &queue->vif->queues[i];

			xenvif_napi_schedule_or_enable_events(temp);
		}
		if (net_ratelimit())
			netdev_err(queue->vif->dev, "Carrier on again\n");
	} else {
		/* Queueing was stopped, but the guest posted
		 * new requests and sent an interrupt
		 */
		clear_bit(QUEUE_STATUS_RX_STALLED,
			  &queue->status);
		del_timer_sync(&queue->rx_stalled);
		xenvif_start_queue(queue);
	}
}

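/*
 * Per-queue kernel thread feeding the guest RX ring.  It also disables a
 * frontend that has been flagged as rogue (queue 0 only) and handles RX
 * purge events when the guest stops posting receive requests.
 */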
int xenvif_kthread_guest_rx(void *data)
{
	struct xenvif_queue *queue = data;
	struct sk_buff *skb;

	while (!kthread_should_stop()) {
		wait_event_interruptible(queue->wq,
					 rx_work_todo(queue) ||
					 queue->vif->disabled ||
					 test_bit(QUEUE_STATUS_RX_PURGE_EVENT, &queue->status) ||
					 kthread_should_stop());

		if (kthread_should_stop())
			break;

		/* This frontend is found to be rogue, disable it in
		 * kthread context. Currently this is only set when
		 * netback finds out the frontend sends malformed packets,
		 * but we cannot disable the interface in softirq
		 * context so we defer it here, if this thread is
		 * associated with queue 0.
		 */
		if (unlikely(queue->vif->disabled && queue->id == 0))
			xenvif_carrier_off(queue->vif);
		else if (unlikely(test_and_clear_bit(QUEUE_STATUS_RX_PURGE_EVENT,
						     &queue->status))) {
			xenvif_rx_purge_event(queue);
		} else if (!netif_carrier_ok(queue->vif->dev)) {
			/* Another queue stalled and turned the carrier off, so
			 * purge this queue's internal rx_queue as well, even
			 * though it was not the one that stalled
			 */
			skb_queue_purge(&queue->rx_queue);
			queue->rx_last_skb_slots = 0;
		}

		if (!skb_queue_empty(&queue->rx_queue))
			xenvif_rx_action(queue);

		cond_resched();
	}

	/* Bin any remaining skbs */
	while ((skb = skb_dequeue(&queue->rx_queue)) != NULL)
		dev_kfree_skb(skb);

	return 0;
}

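/*
 * Per-queue kernel thread that unmaps grants for completed zerocopy
 * transmissions.  Woken by xenvif_zerocopy_callback() and drained once
 * more on exit so nothing stays mapped.
 */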
int xenvif_dealloc_kthread(void *data)
{
	struct xenvif_queue *queue = data;

	while (!kthread_should_stop()) {
		wait_event_interruptible(queue->dealloc_wq,
					 tx_dealloc_work_todo(queue) ||
					 kthread_should_stop());
		if (kthread_should_stop())
			break;

		xenvif_tx_dealloc_action(queue);
		cond_resched();
	}

	/* Unmap anything remaining */
	if (tx_dealloc_work_todo(queue))
		xenvif_tx_dealloc_action(queue);

	return 0;
}

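/*
 * Module init: only runs in a Xen domain.  Sets the default number of
 * queues, sanity-checks fatal_skb_slots, registers the xenbus backend
 * and creates the debugfs root when CONFIG_DEBUG_FS is enabled.
 */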
static int __init netback_init(void)
{
	int rc = 0;

	if (!xen_domain())
		return -ENODEV;

	/* Allow as many queues as there are CPUs, by default */
	xenvif_max_queues = num_online_cpus();

	if (fatal_skb_slots < XEN_NETBK_LEGACY_SLOTS_MAX) {
		pr_info("fatal_skb_slots too small (%d), bump it to XEN_NETBK_LEGACY_SLOTS_MAX (%d)\n",
			fatal_skb_slots, XEN_NETBK_LEGACY_SLOTS_MAX);
		fatal_skb_slots = XEN_NETBK_LEGACY_SLOTS_MAX;
	}

	rc = xenvif_xenbus_init();
	if (rc)
		goto failed_init;

	rx_drain_timeout_jiffies = msecs_to_jiffies(rx_drain_timeout_msecs);

#ifdef CONFIG_DEBUG_FS
	xen_netback_dbg_root = debugfs_create_dir("xen-netback", NULL);
	if (IS_ERR_OR_NULL(xen_netback_dbg_root))
		pr_warn("Init of debugfs returned %ld!\n",
			PTR_ERR(xen_netback_dbg_root));
#endif /* CONFIG_DEBUG_FS */

	return 0;

failed_init:
	return rc;
}

module_init(netback_init);

static void __exit netback_fini(void)
{
#ifdef CONFIG_DEBUG_FS
	if (!IS_ERR_OR_NULL(xen_netback_dbg_root))
		debugfs_remove_recursive(xen_netback_dbg_root);
#endif /* CONFIG_DEBUG_FS */
	xenvif_xenbus_fini();
}
module_exit(netback_fini);

MODULE_LICENSE("Dual BSD/GPL");
MODULE_ALIAS("xen-backend:vif");