/****************************************************************************
 * Driver for Solarflare network controllers and boards
 * Copyright 2005-2006 Fen Systems Ltd.
 * Copyright 2005-2013 Solarflare Communications Inc.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 as published
 * by the Free Software Foundation, incorporated herein by reference.
 */

#include <linux/socket.h>
#include <linux/in.h>
#include <linux/slab.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/prefetch.h>
#include <linux/moduleparam.h>
#include <linux/iommu.h>
#include <net/ip.h>
#include <net/checksum.h>
#include "net_driver.h"
#include "efx.h"
#include "filter.h"
#include "nic.h"
#include "selftest.h"
#include "workarounds.h"

/* Preferred number of descriptors to fill at once */
#define EFX_RX_PREFERRED_BATCH 8U

/* Number of RX buffers to recycle pages for.  When creating the RX page recycle
 * ring, this number is divided by the number of buffers per page to calculate
 * the number of pages to store in the RX page recycle ring.
 */
#define EFX_RECYCLE_RING_SIZE_IOMMU 4096
#define EFX_RECYCLE_RING_SIZE_NOIOMMU (2 * EFX_RX_PREFERRED_BATCH)

/* Size of buffer allocated for skb header area. */
#define EFX_SKB_HEADERS  128u

/* This is the percentage fill level below which new RX descriptors
 * will be added to the RX descriptor ring.
 */
static unsigned int rx_refill_threshold;

/* Each packet can consume up to ceil(max_frame_len / buffer_size) buffers */
#define EFX_RX_MAX_FRAGS DIV_ROUND_UP(EFX_MAX_FRAME_LEN(EFX_MAX_MTU), \
				      EFX_RX_USR_BUF_SIZE)

/*
 * RX maximum head room required.
 *
 * This must be at least 1 to prevent overflow, plus one packet-worth
 * to allow pipelined receives.
 */
#define EFX_RXD_HEAD_ROOM (1 + EFX_RX_MAX_FRAGS)

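/* Return the virtual address at which an RX buffer's received data starts */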
static inline u8 *efx_rx_buf_va(struct efx_rx_buffer *buf)
{
	return page_address(buf->page) + buf->page_offset;
}

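/* Extract the 32-bit RX flow hash that the NIC wrote into the packet prefix */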
static inline u32 efx_rx_buf_hash(struct efx_nic *efx, const u8 *eh)
{
#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)
	return __le32_to_cpup((const __le32 *)(eh + efx->rx_packet_hash_offset));
#else
	const u8 *data = eh + efx->rx_packet_hash_offset;
	return (u32)data[0]	  |
	       (u32)data[1] << 8  |
	       (u32)data[2] << 16 |
	       (u32)data[3] << 24;
#endif
}

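/* Return the buffer following @rx_buf, wrapping around at the end of the ring */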
static inline struct efx_rx_buffer *
efx_rx_buf_next(struct efx_rx_queue *rx_queue, struct efx_rx_buffer *rx_buf)
{
	if (unlikely(rx_buf == efx_rx_buffer(rx_queue, rx_queue->ptr_mask)))
		return efx_rx_buffer(rx_queue, 0);
	else
		return rx_buf + 1;
}

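/* Make the first @len bytes of an RX buffer's DMA mapping visible to the CPU */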
static inline void efx_sync_rx_buffer(struct efx_nic *efx,
				      struct efx_rx_buffer *rx_buf,
				      unsigned int len)
{
	dma_sync_single_for_cpu(&efx->pci_dev->dev, rx_buf->dma_addr, len,
				DMA_FROM_DEVICE);
}

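/* Work out how receive buffers are laid out within each page: the step
 * between buffers, the number of buffers per page, their truesize and
 * the number of pages needed per refill batch.
 */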
void efx_rx_config_page_split(struct efx_nic *efx)
{
	efx->rx_page_buf_step = ALIGN(efx->rx_dma_len + efx->rx_ip_align,
				      EFX_RX_BUF_ALIGNMENT);
	efx->rx_bufs_per_page = efx->rx_buffer_order ? 1 :
		((PAGE_SIZE - sizeof(struct efx_rx_page_state)) /
		 efx->rx_page_buf_step);
	efx->rx_buffer_truesize = (PAGE_SIZE << efx->rx_buffer_order) /
		efx->rx_bufs_per_page;
	efx->rx_pages_per_batch = DIV_ROUND_UP(EFX_RX_PREFERRED_BATCH,
					       efx->rx_bufs_per_page);
}

/* Check the RX page recycle ring for a page that can be reused. */
static struct page *efx_reuse_page(struct efx_rx_queue *rx_queue)
{
	struct efx_nic *efx = rx_queue->efx;
	struct page *page;
	struct efx_rx_page_state *state;
	unsigned index;

	index = rx_queue->page_remove & rx_queue->page_ptr_mask;
	page = rx_queue->page_ring[index];
	if (page == NULL)
		return NULL;

	rx_queue->page_ring[index] = NULL;
	/* page_remove cannot exceed page_add. */
	if (rx_queue->page_remove != rx_queue->page_add)
		++rx_queue->page_remove;

	/* If page_count is 1 then we hold the only reference to this page. */
	if (page_count(page) == 1) {
		++rx_queue->page_recycle_count;
		return page;
	} else {
		state = page_address(page);
		dma_unmap_page(&efx->pci_dev->dev, state->dma_addr,
			       PAGE_SIZE << efx->rx_buffer_order,
			       DMA_FROM_DEVICE);
		put_page(page);
		++rx_queue->page_recycle_failed;
	}

	return NULL;
}

/**
 * efx_init_rx_buffers - create EFX_RX_PREFERRED_BATCH page-based RX buffers
 *
 * @rx_queue:		Efx RX queue
 * @atomic:		control memory allocation flags
 *
 * This allocates a batch of pages, maps them for DMA, and populates
 * struct efx_rx_buffers for each one. Return a negative error code or
 * 0 on success. If a single page can be used for multiple buffers,
 * then the page will either be inserted fully, or not at all.
 */
static int efx_init_rx_buffers(struct efx_rx_queue *rx_queue, bool atomic)
{
	struct efx_nic *efx = rx_queue->efx;
	struct efx_rx_buffer *rx_buf;
	struct page *page;
	unsigned int page_offset;
	struct efx_rx_page_state *state;
	dma_addr_t dma_addr;
	unsigned index, count;

	count = 0;
	do {
		page = efx_reuse_page(rx_queue);
		if (page == NULL) {
			page = alloc_pages(__GFP_COLD | __GFP_COMP |
					   (atomic ? GFP_ATOMIC : GFP_KERNEL),
					   efx->rx_buffer_order);
			if (unlikely(page == NULL))
				return -ENOMEM;
			dma_addr =
				dma_map_page(&efx->pci_dev->dev, page, 0,
					     PAGE_SIZE << efx->rx_buffer_order,
					     DMA_FROM_DEVICE);
			if (unlikely(dma_mapping_error(&efx->pci_dev->dev,
						       dma_addr))) {
				__free_pages(page, efx->rx_buffer_order);
				return -EIO;
			}
			state = page_address(page);
			state->dma_addr = dma_addr;
		} else {
			state = page_address(page);
			dma_addr = state->dma_addr;
		}

		dma_addr += sizeof(struct efx_rx_page_state);
		page_offset = sizeof(struct efx_rx_page_state);

		do {
			index = rx_queue->added_count & rx_queue->ptr_mask;
			rx_buf = efx_rx_buffer(rx_queue, index);
			rx_buf->dma_addr = dma_addr + efx->rx_ip_align;
			rx_buf->page = page;
			rx_buf->page_offset = page_offset + efx->rx_ip_align;
			rx_buf->len = efx->rx_dma_len;
			rx_buf->flags = 0;
			++rx_queue->added_count;
			get_page(page);
			dma_addr += efx->rx_page_buf_step;
			page_offset += efx->rx_page_buf_step;
		} while (page_offset + efx->rx_page_buf_step <= PAGE_SIZE);

		rx_buf->flags = EFX_RX_BUF_LAST_IN_PAGE;
	} while (++count < efx->rx_pages_per_batch);

	return 0;
}

/* Unmap a DMA-mapped page.  This function is only called for the final RX
 * buffer in a page.
 */
static void efx_unmap_rx_buffer(struct efx_nic *efx,
				struct efx_rx_buffer *rx_buf)
{
	struct page *page = rx_buf->page;

	if (page) {
		struct efx_rx_page_state *state = page_address(page);
		dma_unmap_page(&efx->pci_dev->dev,
			       state->dma_addr,
			       PAGE_SIZE << efx->rx_buffer_order,
			       DMA_FROM_DEVICE);
	}
}

static void efx_free_rx_buffer(struct efx_rx_buffer *rx_buf)
{
	if (rx_buf->page) {
		put_page(rx_buf->page);
		rx_buf->page = NULL;
	}
}

/* Attempt to recycle the page if there is an RX recycle ring; the page can
 * only be added if this is the final RX buffer, to prevent pages being used in
 * the descriptor ring and appearing in the recycle ring simultaneously.
 */
static void efx_recycle_rx_page(struct efx_channel *channel,
				struct efx_rx_buffer *rx_buf)
{
	struct page *page = rx_buf->page;
	struct efx_rx_queue *rx_queue = efx_channel_get_rx_queue(channel);
	struct efx_nic *efx = rx_queue->efx;
	unsigned index;

	/* Only recycle the page after processing the final buffer. */
	if (!(rx_buf->flags & EFX_RX_BUF_LAST_IN_PAGE))
		return;

	index = rx_queue->page_add & rx_queue->page_ptr_mask;
	if (rx_queue->page_ring[index] == NULL) {
		unsigned read_index = rx_queue->page_remove &
			rx_queue->page_ptr_mask;
		/* The next slot in the recycle ring is available, but
		 * increment page_remove if the read pointer currently
		 * points here.
		 */
		if (read_index == index)
			++rx_queue->page_remove;
		rx_queue->page_ring[index] = page;
		++rx_queue->page_add;
		return;
	}
	++rx_queue->page_recycle_full;
	efx_unmap_rx_buffer(efx, rx_buf);
	put_page(rx_buf->page);
}

static void efx_fini_rx_buffer(struct efx_rx_queue *rx_queue,
			       struct efx_rx_buffer *rx_buf)
{
	/* Release the page reference we hold for the buffer. */
	if (rx_buf->page)
		put_page(rx_buf->page);

	/* If this is the last buffer in a page, unmap and free it. */
	if (rx_buf->flags & EFX_RX_BUF_LAST_IN_PAGE) {
		efx_unmap_rx_buffer(rx_queue->efx, rx_buf);
		efx_free_rx_buffer(rx_buf);
	}
	rx_buf->page = NULL;
}

/* Recycle the pages that are used by buffers that have just been received. */
static void efx_recycle_rx_pages(struct efx_channel *channel,
				 struct efx_rx_buffer *rx_buf,
				 unsigned int n_frags)
{
	struct efx_rx_queue *rx_queue = efx_channel_get_rx_queue(channel);

	do {
		efx_recycle_rx_page(channel, rx_buf);
		rx_buf = efx_rx_buf_next(rx_queue, rx_buf);
	} while (--n_frags);
}

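/* Discard a packet: recycle its pages into the recycle ring, then drop the
 * reference held on each of its buffers.
 */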
static void efx_discard_rx_packet(struct efx_channel *channel,
				  struct efx_rx_buffer *rx_buf,
				  unsigned int n_frags)
{
	struct efx_rx_queue *rx_queue = efx_channel_get_rx_queue(channel);

	efx_recycle_rx_pages(channel, rx_buf, n_frags);

	do {
		efx_free_rx_buffer(rx_buf);
		rx_buf = efx_rx_buf_next(rx_queue, rx_buf);
	} while (--n_frags);
}

/**
 * efx_fast_push_rx_descriptors - push new RX descriptors quickly
 * @rx_queue:		RX descriptor queue
 * @atomic:		control memory allocation flags
 *
 * This will aim to fill the RX descriptor queue up to
 * @rx_queue->@max_fill. If there is insufficient atomic
 * memory to do so, a slow fill will be scheduled.
 *
 * The caller must provide serialisation (none is used here). In practice,
 * this means this function must run from the NAPI handler, or be called
 * when NAPI is disabled.
 */
void efx_fast_push_rx_descriptors(struct efx_rx_queue *rx_queue, bool atomic)
{
	struct efx_nic *efx = rx_queue->efx;
	unsigned int fill_level, batch_size;
	int space, rc = 0;

	if (!rx_queue->refill_enabled)
		return;

	/* Calculate current fill level, and exit if we don't need to fill */
	fill_level = (rx_queue->added_count - rx_queue->removed_count);
	EFX_BUG_ON_PARANOID(fill_level > rx_queue->efx->rxq_entries);
	if (fill_level >= rx_queue->fast_fill_trigger)
		goto out;

	/* Record minimum fill level */
	if (unlikely(fill_level < rx_queue->min_fill)) {
		if (fill_level)
			rx_queue->min_fill = fill_level;
	}

	batch_size = efx->rx_pages_per_batch * efx->rx_bufs_per_page;
	space = rx_queue->max_fill - fill_level;
	EFX_BUG_ON_PARANOID(space < batch_size);

	netif_vdbg(rx_queue->efx, rx_status, rx_queue->efx->net_dev,
		   "RX queue %d fast-filling descriptor ring from"
		   " level %d to level %d\n",
		   efx_rx_queue_index(rx_queue), fill_level,
		   rx_queue->max_fill);


	do {
		rc = efx_init_rx_buffers(rx_queue, atomic);
		if (unlikely(rc)) {
			/* Ensure that we don't leave the rx queue empty */
			if (rx_queue->added_count == rx_queue->removed_count)
				efx_schedule_slow_fill(rx_queue);
			goto out;
		}
	} while ((space -= batch_size) >= batch_size);

	netif_vdbg(rx_queue->efx, rx_status, rx_queue->efx->net_dev,
		   "RX queue %d fast-filled descriptor ring "
		   "to level %d\n", efx_rx_queue_index(rx_queue),
		   rx_queue->added_count - rx_queue->removed_count);

 out:
	if (rx_queue->notified_count != rx_queue->added_count)
		efx_nic_notify_rx_desc(rx_queue);
}

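/* Timer callback for a deferred (slow) refill: post a fill event so that
 * NAPI runs and retries refilling the queue.
 */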
void efx_rx_slow_fill(unsigned long context)
{
	struct efx_rx_queue *rx_queue = (struct efx_rx_queue *)context;

	/* Post an event to cause NAPI to run and refill the queue */
	efx_nic_generate_fill_event(rx_queue);
	++rx_queue->slow_fill_count;
}

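/* Check that the completed length fits within the RX buffer.  If it does
 * not, mark the packet for discard and, for seriously overlength events,
 * schedule an RX recovery reset.
 */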
static void efx_rx_packet__check_len(struct efx_rx_queue *rx_queue,
				     struct efx_rx_buffer *rx_buf,
				     int len)
{
	struct efx_nic *efx = rx_queue->efx;
	unsigned max_len = rx_buf->len - efx->type->rx_buffer_padding;

	if (likely(len <= max_len))
		return;

	/* The packet must be discarded, but this is only a fatal error
	 * if the caller indicated it was
	 */
	rx_buf->flags |= EFX_RX_PKT_DISCARD;

	if ((len > rx_buf->len) && EFX_WORKAROUND_8071(efx)) {
		if (net_ratelimit())
			netif_err(efx, rx_err, efx->net_dev,
				  " RX queue %d seriously overlength "
				  "RX event (0x%x > 0x%x+0x%x). Leaking\n",
				  efx_rx_queue_index(rx_queue), len, max_len,
				  efx->type->rx_buffer_padding);
		efx_schedule_reset(efx, RESET_TYPE_RX_RECOVERY);
	} else {
		if (net_ratelimit())
			netif_err(efx, rx_err, efx->net_dev,
				  " RX queue %d overlength RX event "
				  "(0x%x > 0x%x)\n",
				  efx_rx_queue_index(rx_queue), len, max_len);
	}

	efx_rx_queue_channel(rx_queue)->n_rx_overlength++;
}

/* Pass a received packet up through GRO.  GRO can handle pages
 * regardless of checksum state and skbs with a good checksum.
 */
static void
efx_rx_packet_gro(struct efx_channel *channel, struct efx_rx_buffer *rx_buf,
		  unsigned int n_frags, u8 *eh)
{
	struct napi_struct *napi = &channel->napi_str;
	gro_result_t gro_result;
	struct efx_nic *efx = channel->efx;
	struct sk_buff *skb;

	skb = napi_get_frags(napi);
	if (unlikely(!skb)) {
		while (n_frags--) {
			put_page(rx_buf->page);
			rx_buf->page = NULL;
			rx_buf = efx_rx_buf_next(&channel->rx_queue, rx_buf);
		}
		return;
	}

	if (efx->net_dev->features & NETIF_F_RXHASH)
		skb->rxhash = efx_rx_buf_hash(efx, eh);
	skb->ip_summed = ((rx_buf->flags & EFX_RX_PKT_CSUMMED) ?
			  CHECKSUM_UNNECESSARY : CHECKSUM_NONE);

	for (;;) {
		skb_fill_page_desc(skb, skb_shinfo(skb)->nr_frags,
				   rx_buf->page, rx_buf->page_offset,
				   rx_buf->len);
		rx_buf->page = NULL;
		skb->len += rx_buf->len;
		if (skb_shinfo(skb)->nr_frags == n_frags)
			break;

		rx_buf = efx_rx_buf_next(&channel->rx_queue, rx_buf);
	}

	skb->data_len = skb->len;
	skb->truesize += n_frags * efx->rx_buffer_truesize;

	skb_record_rx_queue(skb, channel->rx_queue.core_index);

	gro_result = napi_gro_frags(napi);
	if (gro_result != GRO_DROP)
		channel->irq_mod_score += 2;
}

/* Allocate and construct an SKB around page fragments */
static struct sk_buff *efx_rx_mk_skb(struct efx_channel *channel,
				     struct efx_rx_buffer *rx_buf,
				     unsigned int n_frags,
				     u8 *eh, int hdr_len)
{
	struct efx_nic *efx = channel->efx;
	struct sk_buff *skb;

	/* Allocate an SKB to store the headers */
	skb = netdev_alloc_skb(efx->net_dev,
			       efx->rx_ip_align + efx->rx_prefix_size +
			       hdr_len);
	if (unlikely(skb == NULL))
		return NULL;

	EFX_BUG_ON_PARANOID(rx_buf->len < hdr_len);

	memcpy(skb->data + efx->rx_ip_align, eh - efx->rx_prefix_size,
	       efx->rx_prefix_size + hdr_len);
	skb_reserve(skb, efx->rx_ip_align + efx->rx_prefix_size);
	__skb_put(skb, hdr_len);

	/* Append the remaining page(s) onto the frag list */
	if (rx_buf->len > hdr_len) {
		rx_buf->page_offset += hdr_len;
		rx_buf->len -= hdr_len;

		for (;;) {
			skb_fill_page_desc(skb, skb_shinfo(skb)->nr_frags,
					   rx_buf->page, rx_buf->page_offset,
					   rx_buf->len);
			rx_buf->page = NULL;
			skb->len += rx_buf->len;
			skb->data_len += rx_buf->len;
			if (skb_shinfo(skb)->nr_frags == n_frags)
				break;

			rx_buf = efx_rx_buf_next(&channel->rx_queue, rx_buf);
		}
	} else {
		__free_pages(rx_buf->page, efx->rx_buffer_order);
		rx_buf->page = NULL;
		n_frags = 0;
	}

	skb->truesize += n_frags * efx->rx_buffer_truesize;

	/* Move past the ethernet header */
	skb->protocol = eth_type_trans(skb, efx->net_dev);

	return skb;
}

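/* Handle a received packet.  First half: validate the completion, sync the
 * DMA mappings, recycle the pages and stash the packet for __efx_rx_packet().
 */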
void efx_rx_packet(struct efx_rx_queue *rx_queue, unsigned int index,
		   unsigned int n_frags, unsigned int len, u16 flags)
{
	struct efx_nic *efx = rx_queue->efx;
	struct efx_channel *channel = efx_rx_queue_channel(rx_queue);
	struct efx_rx_buffer *rx_buf;

	rx_buf = efx_rx_buffer(rx_queue, index);
	rx_buf->flags |= flags;

	/* Validate the number of fragments and completed length */
	if (n_frags == 1) {
		if (!(flags & EFX_RX_PKT_PREFIX_LEN))
			efx_rx_packet__check_len(rx_queue, rx_buf, len);
	} else if (unlikely(n_frags > EFX_RX_MAX_FRAGS) ||
		   unlikely(len <= (n_frags - 1) * efx->rx_dma_len) ||
		   unlikely(len > n_frags * efx->rx_dma_len) ||
		   unlikely(!efx->rx_scatter)) {
		/* If this isn't an explicit discard request, either
		 * the hardware or the driver is broken.
		 */
		WARN_ON(!(len == 0 && rx_buf->flags & EFX_RX_PKT_DISCARD));
		rx_buf->flags |= EFX_RX_PKT_DISCARD;
	}

	netif_vdbg(efx, rx_status, efx->net_dev,
		   "RX queue %d received ids %x-%x len %d %s%s\n",
		   efx_rx_queue_index(rx_queue), index,
		   (index + n_frags - 1) & rx_queue->ptr_mask, len,
		   (rx_buf->flags & EFX_RX_PKT_CSUMMED) ? " [SUMMED]" : "",
		   (rx_buf->flags & EFX_RX_PKT_DISCARD) ? " [DISCARD]" : "");

	/* Discard packet, if instructed to do so.  Process the
	 * previous receive first.
	 */
	if (unlikely(rx_buf->flags & EFX_RX_PKT_DISCARD)) {
		efx_rx_flush_packet(channel);
		efx_discard_rx_packet(channel, rx_buf, n_frags);
		return;
	}

	if (n_frags == 1 && !(flags & EFX_RX_PKT_PREFIX_LEN))
		rx_buf->len = len;

	/* Release and/or sync the DMA mapping - assumes all RX buffers
	 * consumed in-order per RX queue.
	 */
	efx_sync_rx_buffer(efx, rx_buf, rx_buf->len);

	/* Prefetch nice and early so data will (hopefully) be in cache by
	 * the time we look at it.
	 */
	prefetch(efx_rx_buf_va(rx_buf));

	rx_buf->page_offset += efx->rx_prefix_size;
	rx_buf->len -= efx->rx_prefix_size;

	if (n_frags > 1) {
		/* Release/sync DMA mapping for additional fragments.
		 * Fix length for last fragment.
		 */
		unsigned int tail_frags = n_frags - 1;

		for (;;) {
			rx_buf = efx_rx_buf_next(rx_queue, rx_buf);
			if (--tail_frags == 0)
				break;
			efx_sync_rx_buffer(efx, rx_buf, efx->rx_dma_len);
		}
		rx_buf->len = len - (n_frags - 1) * efx->rx_dma_len;
		efx_sync_rx_buffer(efx, rx_buf, rx_buf->len);
	}

	/* All fragments have been DMA-synced, so recycle pages. */
	rx_buf = efx_rx_buffer(rx_queue, index);
	efx_recycle_rx_pages(channel, rx_buf, n_frags);

	/* Pipeline receives so that we give time for packet headers to be
	 * prefetched into cache.
	 */
	efx_rx_flush_packet(channel);
	channel->rx_pkt_n_frags = n_frags;
	channel->rx_pkt_index = index;
}

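/* Construct an skb around the received fragments and pass it up the stack
 * (or hand it to the channel's receive_skb() method if one is set).
 */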
static void efx_rx_deliver(struct efx_channel *channel, u8 *eh,
			   struct efx_rx_buffer *rx_buf,
			   unsigned int n_frags)
{
	struct sk_buff *skb;
	u16 hdr_len = min_t(u16, rx_buf->len, EFX_SKB_HEADERS);

	skb = efx_rx_mk_skb(channel, rx_buf, n_frags, eh, hdr_len);
	if (unlikely(skb == NULL)) {
		efx_free_rx_buffer(rx_buf);
		return;
	}
	skb_record_rx_queue(skb, channel->rx_queue.core_index);

	/* Set the SKB flags */
	skb_checksum_none_assert(skb);
	if (likely(rx_buf->flags & EFX_RX_PKT_CSUMMED))
		skb->ip_summed = CHECKSUM_UNNECESSARY;

	if (channel->type->receive_skb)
		if (channel->type->receive_skb(channel, skb))
			return;

	/* Pass the packet up */
	netif_receive_skb(skb);
}

/* Handle a received packet.  Second half: Touches packet payload. */
void __efx_rx_packet(struct efx_channel *channel)
{
	struct efx_nic *efx = channel->efx;
	struct efx_rx_buffer *rx_buf =
		efx_rx_buffer(&channel->rx_queue, channel->rx_pkt_index);
	u8 *eh = efx_rx_buf_va(rx_buf);

	/* Read length from the prefix if necessary.  This already
	 * excludes the length of the prefix itself.
	 */
	if (rx_buf->flags & EFX_RX_PKT_PREFIX_LEN)
		rx_buf->len = le16_to_cpup((__le16 *)
					   (eh + efx->rx_packet_len_offset));

	/* If we're in loopback test, then pass the packet directly to the
	 * loopback layer, and free the rx_buf here
	 */
	if (unlikely(efx->loopback_selftest)) {
		efx_loopback_rx_packet(efx, eh, rx_buf->len);
		efx_free_rx_buffer(rx_buf);
		goto out;
	}

	if (unlikely(!(efx->net_dev->features & NETIF_F_RXCSUM)))
		rx_buf->flags &= ~EFX_RX_PKT_CSUMMED;

	if ((rx_buf->flags & EFX_RX_PKT_TCP) && !channel->type->receive_skb)
		efx_rx_packet_gro(channel, rx_buf, channel->rx_pkt_n_frags, eh);
	else
		efx_rx_deliver(channel, eh, rx_buf, channel->rx_pkt_n_frags);
out:
	channel->rx_pkt_n_frags = 0;
}

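/* Allocate the software buffer table for an RX queue and probe the
 * corresponding NIC resources.
 */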
int efx_probe_rx_queue(struct efx_rx_queue *rx_queue)
{
	struct efx_nic *efx = rx_queue->efx;
	unsigned int entries;
	int rc;

	/* Create the smallest power-of-two aligned ring */
	entries = max(roundup_pow_of_two(efx->rxq_entries), EFX_MIN_DMAQ_SIZE);
	EFX_BUG_ON_PARANOID(entries > EFX_MAX_DMAQ_SIZE);
	rx_queue->ptr_mask = entries - 1;

	netif_dbg(efx, probe, efx->net_dev,
		  "creating RX queue %d size %#x mask %#x\n",
		  efx_rx_queue_index(rx_queue), efx->rxq_entries,
		  rx_queue->ptr_mask);

	/* Allocate RX buffers */
	rx_queue->buffer = kcalloc(entries, sizeof(*rx_queue->buffer),
				   GFP_KERNEL);
	if (!rx_queue->buffer)
		return -ENOMEM;

	rc = efx_nic_probe_rx(rx_queue);
	if (rc) {
		kfree(rx_queue->buffer);
		rx_queue->buffer = NULL;
	}

	return rc;
}

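/* Size and allocate the page recycle ring for an RX queue; a larger ring
 * is used when an IOMMU is present.
 */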
static void efx_init_rx_recycle_ring(struct efx_nic *efx,
				     struct efx_rx_queue *rx_queue)
{
	unsigned int bufs_in_recycle_ring, page_ring_size;

	/* Set the RX recycle ring size */
#ifdef CONFIG_PPC64
	bufs_in_recycle_ring = EFX_RECYCLE_RING_SIZE_IOMMU;
#else
	if (iommu_present(&pci_bus_type))
		bufs_in_recycle_ring = EFX_RECYCLE_RING_SIZE_IOMMU;
	else
		bufs_in_recycle_ring = EFX_RECYCLE_RING_SIZE_NOIOMMU;
#endif /* CONFIG_PPC64 */

	page_ring_size = roundup_pow_of_two(bufs_in_recycle_ring /
					    efx->rx_bufs_per_page);
	rx_queue->page_ring = kcalloc(page_ring_size,
				      sizeof(*rx_queue->page_ring), GFP_KERNEL);
	rx_queue->page_ptr_mask = page_ring_size - 1;
}

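/* Reset the queue's counters and page recycle state, calculate the fill
 * limits, and set up the hardware descriptor ring.
 */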
void efx_init_rx_queue(struct efx_rx_queue *rx_queue)
{
	struct efx_nic *efx = rx_queue->efx;
	unsigned int max_fill, trigger, max_trigger;

	netif_dbg(rx_queue->efx, drv, rx_queue->efx->net_dev,
		  "initialising RX queue %d\n", efx_rx_queue_index(rx_queue));

	/* Initialise ptr fields */
	rx_queue->added_count = 0;
	rx_queue->notified_count = 0;
	rx_queue->removed_count = 0;
	rx_queue->min_fill = -1U;
	efx_init_rx_recycle_ring(efx, rx_queue);

	rx_queue->page_remove = 0;
	rx_queue->page_add = rx_queue->page_ptr_mask + 1;
	rx_queue->page_recycle_count = 0;
	rx_queue->page_recycle_failed = 0;
	rx_queue->page_recycle_full = 0;

	/* Initialise limit fields */
	max_fill = efx->rxq_entries - EFX_RXD_HEAD_ROOM;
	max_trigger =
		max_fill - efx->rx_pages_per_batch * efx->rx_bufs_per_page;
	if (rx_refill_threshold != 0) {
		trigger = max_fill * min(rx_refill_threshold, 100U) / 100U;
		if (trigger > max_trigger)
			trigger = max_trigger;
	} else {
		trigger = max_trigger;
	}

	rx_queue->max_fill = max_fill;
	rx_queue->fast_fill_trigger = trigger;
	rx_queue->refill_enabled = true;

	/* Set up RX descriptor ring */
	efx_nic_init_rx(rx_queue);
}

void efx_fini_rx_queue(struct efx_rx_queue *rx_queue)
{
	int i;
	struct efx_nic *efx = rx_queue->efx;
	struct efx_rx_buffer *rx_buf;

	netif_dbg(rx_queue->efx, drv, rx_queue->efx->net_dev,
		  "shutting down RX queue %d\n", efx_rx_queue_index(rx_queue));

	del_timer_sync(&rx_queue->slow_fill);

	/* Release RX buffers from the current read ptr to the write ptr */
	if (rx_queue->buffer) {
		for (i = rx_queue->removed_count; i < rx_queue->added_count;
		     i++) {
			unsigned index = i & rx_queue->ptr_mask;
			rx_buf = efx_rx_buffer(rx_queue, index);
			efx_fini_rx_buffer(rx_queue, rx_buf);
		}
	}

	/* Unmap and release the pages in the recycle ring. Remove the ring. */
	for (i = 0; i <= rx_queue->page_ptr_mask; i++) {
		struct page *page = rx_queue->page_ring[i];
		struct efx_rx_page_state *state;

		if (page == NULL)
			continue;

		state = page_address(page);
		dma_unmap_page(&efx->pci_dev->dev, state->dma_addr,
			       PAGE_SIZE << efx->rx_buffer_order,
			       DMA_FROM_DEVICE);
		put_page(page);
	}
	kfree(rx_queue->page_ring);
	rx_queue->page_ring = NULL;
}

void efx_remove_rx_queue(struct efx_rx_queue *rx_queue)
{
	netif_dbg(rx_queue->efx, drv, rx_queue->efx->net_dev,
		  "destroying RX queue %d\n", efx_rx_queue_index(rx_queue));

	efx_nic_remove_rx(rx_queue);

	kfree(rx_queue->buffer);
	rx_queue->buffer = NULL;
}


module_param(rx_refill_threshold, uint, 0444);
MODULE_PARM_DESC(rx_refill_threshold,
		 "RX descriptor ring refill threshold (%)");
#ifdef CONFIG_RFS_ACCEL

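/* Accelerated RFS: build an IP 5-tuple filter from the packet headers and
 * insert it so that this flow is steered to the requested RX queue.
 */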
int efx_filter_rfs(struct net_device *net_dev, const struct sk_buff *skb,
		   u16 rxq_index, u32 flow_id)
{
	struct efx_nic *efx = netdev_priv(net_dev);
	struct efx_channel *channel;
	struct efx_filter_spec spec;
	const __be16 *ports;
	__be16 ether_type;
	int nhoff;
	int rc;

	/* The core RPS/RFS code has already parsed and validated
	 * VLAN, IP and transport headers.  We assume they are in the
	 * header area.
	 */

	if (skb->protocol == htons(ETH_P_8021Q)) {
		const struct vlan_hdr *vh =
			(const struct vlan_hdr *)skb->data;
		/* We can't filter on the IP 5-tuple and the vlan
		 * together, so just strip the vlan header and filter
		 * on the IP part.
		 */
		EFX_BUG_ON_PARANOID(skb_headlen(skb) < sizeof(*vh));
		ether_type = vh->h_vlan_encapsulated_proto;
		nhoff = sizeof(struct vlan_hdr);
	} else {
		ether_type = skb->protocol;
		nhoff = 0;
	}

	if (ether_type != htons(ETH_P_IP) && ether_type != htons(ETH_P_IPV6))
		return -EPROTONOSUPPORT;

	efx_filter_init_rx(&spec, EFX_FILTER_PRI_HINT,
			   efx->rx_scatter ? EFX_FILTER_FLAG_RX_SCATTER : 0,
			   rxq_index);
	spec.match_flags =
		EFX_FILTER_MATCH_ETHER_TYPE | EFX_FILTER_MATCH_IP_PROTO |
		EFX_FILTER_MATCH_LOC_HOST | EFX_FILTER_MATCH_LOC_PORT |
		EFX_FILTER_MATCH_REM_HOST | EFX_FILTER_MATCH_REM_PORT;
	spec.ether_type = ether_type;

	if (ether_type == htons(ETH_P_IP)) {
		const struct iphdr *ip =
			(const struct iphdr *)(skb->data + nhoff);

		EFX_BUG_ON_PARANOID(skb_headlen(skb) < nhoff + sizeof(*ip));
		if (ip_is_fragment(ip))
			return -EPROTONOSUPPORT;
		spec.ip_proto = ip->protocol;
		spec.rem_host[0] = ip->saddr;
		spec.loc_host[0] = ip->daddr;
		EFX_BUG_ON_PARANOID(skb_headlen(skb) < nhoff + 4 * ip->ihl + 4);
		ports = (const __be16 *)(skb->data + nhoff + 4 * ip->ihl);
	} else {
		const struct ipv6hdr *ip6 =
			(const struct ipv6hdr *)(skb->data + nhoff);

		EFX_BUG_ON_PARANOID(skb_headlen(skb) <
				    nhoff + sizeof(*ip6) + 4);
		spec.ip_proto = ip6->nexthdr;
		memcpy(spec.rem_host, &ip6->saddr, sizeof(ip6->saddr));
		memcpy(spec.loc_host, &ip6->daddr, sizeof(ip6->daddr));
		ports = (const __be16 *)(ip6 + 1);
	}

	spec.rem_port = ports[0];
	spec.loc_port = ports[1];

	rc = efx->type->filter_rfs_insert(efx, &spec);
	if (rc < 0)
		return rc;

	/* Remember this so we can check whether to expire the filter later */
	efx->rps_flow_id[rc] = flow_id;
	channel = efx_get_channel(efx, skb_get_rx_queue(skb));
	++channel->rfs_filters_added;

	if (ether_type == htons(ETH_P_IP))
		netif_info(efx, rx_status, efx->net_dev,
			   "steering %s %pI4:%u:%pI4:%u to queue %u [flow %u filter %d]\n",
			   (spec.ip_proto == IPPROTO_TCP) ? "TCP" : "UDP",
			   spec.rem_host, ntohs(ports[0]), spec.loc_host,
			   ntohs(ports[1]), rxq_index, flow_id, rc);
	else
		netif_info(efx, rx_status, efx->net_dev,
			   "steering %s [%pI6]:%u:[%pI6]:%u to queue %u [flow %u filter %d]\n",
			   (spec.ip_proto == IPPROTO_TCP) ? "TCP" : "UDP",
			   spec.rem_host, ntohs(ports[0]), spec.loc_host,
			   ntohs(ports[1]), rxq_index, flow_id, rc);

	return rc;
}

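/* Check up to @quota RFS filter entries for expiry, removing any whose
 * flows are no longer active.  Returns false if the filter table lock
 * could not be taken immediately, in which case nothing is checked.
 */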
bool __efx_filter_rfs_expire(struct efx_nic *efx, unsigned int quota)
{
	bool (*expire_one)(struct efx_nic *efx, u32 flow_id, unsigned int index);
	unsigned int index, size;
	u32 flow_id;

	if (!spin_trylock_bh(&efx->filter_lock))
		return false;

	expire_one = efx->type->filter_rfs_expire_one;
	index = efx->rps_expire_index;
	size = efx->type->max_rx_ip_filters;
	while (quota--) {
		flow_id = efx->rps_flow_id[index];
		if (expire_one(efx, flow_id, index))
			netif_info(efx, rx_status, efx->net_dev,
				   "expired filter %d [flow %u]\n",
				   index, flow_id);
		if (++index == size)
			index = 0;
	}
	efx->rps_expire_index = index;

	spin_unlock_bh(&efx->filter_lock);
	return true;
}

#endif /* CONFIG_RFS_ACCEL */

/**
 * efx_filter_is_mc_recipient - test whether spec is a multicast recipient
 * @spec: Specification to test
 *
 * Return: %true if the specification is a non-drop RX filter that
 * matches a local MAC address I/G bit value of 1 or matches a local
 * IPv4 or IPv6 address value in the respective multicast address
 * range.  Otherwise %false.
 */
bool efx_filter_is_mc_recipient(const struct efx_filter_spec *spec)
{
	if (!(spec->flags & EFX_FILTER_FLAG_RX) ||
	    spec->dmaq_id == EFX_FILTER_RX_DMAQ_ID_DROP)
		return false;

	if (spec->match_flags &
	    (EFX_FILTER_MATCH_LOC_MAC | EFX_FILTER_MATCH_LOC_MAC_IG) &&
	    is_multicast_ether_addr(spec->loc_mac))
		return true;

	if ((spec->match_flags &
	     (EFX_FILTER_MATCH_ETHER_TYPE | EFX_FILTER_MATCH_LOC_HOST)) ==
	    (EFX_FILTER_MATCH_ETHER_TYPE | EFX_FILTER_MATCH_LOC_HOST)) {
		if (spec->ether_type == htons(ETH_P_IP) &&
		    ipv4_is_multicast(spec->loc_host[0]))
			return true;
		if (spec->ether_type == htons(ETH_P_IPV6) &&
		    ((const u8 *)spec->loc_host)[0] == 0xff)
			return true;
	}

	return false;
}