/*
 * Copyright (c) 2007 Mellanox Technologies. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 */

#include <asm/page.h>
#include <linux/mlx4/cq.h>
#include <linux/slab.h>
#include <linux/mlx4/qp.h>
#include <linux/skbuff.h>
#include <linux/if_vlan.h>
#include <linux/prefetch.h>
#include <linux/vmalloc.h>
#include <linux/tcp.h>
#include <linux/ip.h>
#include <linux/moduleparam.h>

#include "mlx4_en.h"

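/* Allocate a TX ring and its resources: the tx_info array, the bounce
 * buffer, the HW descriptor buffer, a QP and (when available) a BlueFlame
 * register.  Allocations are tried on the requested NUMA node first and
 * fall back to any node.
 */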
int mlx4_en_create_tx_ring(struct mlx4_en_priv *priv,
			   struct mlx4_en_tx_ring **pring, u32 size,
			   u16 stride, int node, int queue_index)
{
	struct mlx4_en_dev *mdev = priv->mdev;
	struct mlx4_en_tx_ring *ring;
	int tmp;
	int err;

	ring = kzalloc_node(sizeof(*ring), GFP_KERNEL, node);
	if (!ring) {
		ring = kzalloc(sizeof(*ring), GFP_KERNEL);
		if (!ring) {
			en_err(priv, "Failed allocating TX ring\n");
			return -ENOMEM;
		}
	}

	ring->size = size;
	ring->size_mask = size - 1;
	ring->stride = stride;
	ring->full_size = ring->size - HEADROOM - MAX_DESC_TXBBS;

	tmp = size * sizeof(struct mlx4_en_tx_info);
	ring->tx_info = kmalloc_node(tmp, GFP_KERNEL | __GFP_NOWARN, node);
	if (!ring->tx_info) {
		ring->tx_info = vmalloc(tmp);
		if (!ring->tx_info) {
			err = -ENOMEM;
			goto err_ring;
		}
	}

	en_dbg(DRV, priv, "Allocated tx_info ring at addr:%p size:%d\n",
		 ring->tx_info, tmp);

	ring->bounce_buf = kmalloc_node(MAX_DESC_SIZE, GFP_KERNEL, node);
	if (!ring->bounce_buf) {
		ring->bounce_buf = kmalloc(MAX_DESC_SIZE, GFP_KERNEL);
		if (!ring->bounce_buf) {
			err = -ENOMEM;
			goto err_info;
		}
	}
	ring->buf_size = ALIGN(size * ring->stride, MLX4_EN_PAGE_SIZE);

	/* Allocate HW buffers on provided NUMA node */
	set_dev_node(&mdev->dev->persist->pdev->dev, node);
	err = mlx4_alloc_hwq_res(mdev->dev, &ring->wqres, ring->buf_size,
				 2 * PAGE_SIZE);
	set_dev_node(&mdev->dev->persist->pdev->dev, mdev->dev->numa_node);
	if (err) {
		en_err(priv, "Failed allocating hwq resources\n");
		goto err_bounce;
	}

	err = mlx4_en_map_buffer(&ring->wqres.buf);
	if (err) {
		en_err(priv, "Failed to map TX buffer\n");
		goto err_hwq_res;
	}

	ring->buf = ring->wqres.buf.direct.buf;

	en_dbg(DRV, priv, "Allocated TX ring (addr:%p) - buf:%p size:%d buf_size:%d dma:%llx\n",
	       ring, ring->buf, ring->size, ring->buf_size,
	       (unsigned long long) ring->wqres.buf.direct.map);

	err = mlx4_qp_reserve_range(mdev->dev, 1, 1, &ring->qpn,
				    MLX4_RESERVE_ETH_BF_QP);
	if (err) {
		en_err(priv, "failed reserving qp for TX ring\n");
		goto err_map;
	}

	err = mlx4_qp_alloc(mdev->dev, ring->qpn, &ring->qp, GFP_KERNEL);
	if (err) {
		en_err(priv, "Failed allocating qp %d\n", ring->qpn);
		goto err_reserve;
	}
	ring->qp.event = mlx4_en_sqp_event;

	err = mlx4_bf_alloc(mdev->dev, &ring->bf, node);
	if (err) {
		en_dbg(DRV, priv, "working without blueflame (%d)\n", err);
		ring->bf.uar = &mdev->priv_uar;
		ring->bf.uar->map = mdev->uar_map;
		ring->bf_enabled = false;
		ring->bf_alloced = false;
		priv->pflags &= ~MLX4_EN_PRIV_FLAGS_BLUEFLAME;
	} else {
		ring->bf_alloced = true;
		ring->bf_enabled = !!(priv->pflags &
				      MLX4_EN_PRIV_FLAGS_BLUEFLAME);
	}

	ring->hwtstamp_tx_type = priv->hwtstamp_config.tx_type;
	ring->queue_index = queue_index;

	if (queue_index < priv->num_tx_rings_p_up)
		cpumask_set_cpu(cpumask_local_spread(queue_index,
						     priv->mdev->dev->numa_node),
				&ring->affinity_mask);

	*pring = ring;
	return 0;

err_reserve:
	mlx4_qp_release_range(mdev->dev, ring->qpn, 1);
err_map:
	mlx4_en_unmap_buffer(&ring->wqres.buf);
err_hwq_res:
	mlx4_free_hwq_res(mdev->dev, &ring->wqres, ring->buf_size);
err_bounce:
	kfree(ring->bounce_buf);
	ring->bounce_buf = NULL;
err_info:
	kvfree(ring->tx_info);
	ring->tx_info = NULL;
err_ring:
	kfree(ring);
	*pring = NULL;
	return err;
}

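/* Release everything that was allocated by mlx4_en_create_tx_ring() */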
void mlx4_en_destroy_tx_ring(struct mlx4_en_priv *priv,
			     struct mlx4_en_tx_ring **pring)
{
	struct mlx4_en_dev *mdev = priv->mdev;
	struct mlx4_en_tx_ring *ring = *pring;
	en_dbg(DRV, priv, "Destroying tx ring, qpn: %d\n", ring->qpn);

	if (ring->bf_alloced)
		mlx4_bf_free(mdev->dev, &ring->bf);
	mlx4_qp_remove(mdev->dev, &ring->qp);
	mlx4_qp_free(mdev->dev, &ring->qp);
	mlx4_qp_release_range(priv->mdev->dev, ring->qpn, 1);
	mlx4_en_unmap_buffer(&ring->wqres.buf);
	mlx4_free_hwq_res(mdev->dev, &ring->wqres, ring->buf_size);
	kfree(ring->bounce_buf);
	ring->bounce_buf = NULL;
	kvfree(ring->tx_info);
	ring->tx_info = NULL;
	kfree(ring);
	*pring = NULL;
}

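/* Bring the ring's QP from reset to ready, attach it to its completion
 * queue and set the XPS affinity of the corresponding netdev queue.
 */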
int mlx4_en_activate_tx_ring(struct mlx4_en_priv *priv,
			     struct mlx4_en_tx_ring *ring,
			     int cq, int user_prio)
{
	struct mlx4_en_dev *mdev = priv->mdev;
	int err;

	ring->cqn = cq;
	ring->prod = 0;
	ring->cons = 0xffffffff;
	ring->last_nr_txbb = 1;
	memset(ring->tx_info, 0, ring->size * sizeof(struct mlx4_en_tx_info));
	memset(ring->buf, 0, ring->buf_size);

	ring->qp_state = MLX4_QP_STATE_RST;
	ring->doorbell_qpn = cpu_to_be32(ring->qp.qpn << 8);
	ring->mr_key = cpu_to_be32(mdev->mr.key);

	mlx4_en_fill_qp_context(priv, ring->size, ring->stride, 1, 0, ring->qpn,
				ring->cqn, user_prio, &ring->context);
	if (ring->bf_alloced)
		ring->context.usr_page =
			cpu_to_be32(mlx4_to_hw_uar_index(mdev->dev,
							 ring->bf.uar->index));

	err = mlx4_qp_to_ready(mdev->dev, &ring->wqres.mtt, &ring->context,
			       &ring->qp, &ring->qp_state);
	if (!cpumask_empty(&ring->affinity_mask))
		netif_set_xps_queue(priv->dev, &ring->affinity_mask,
				    ring->queue_index);

	return err;
}

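/* Move the ring's QP back to the reset state */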
void mlx4_en_deactivate_tx_ring(struct mlx4_en_priv *priv,
				struct mlx4_en_tx_ring *ring)
{
	struct mlx4_en_dev *mdev = priv->mdev;

	mlx4_qp_modify(mdev->dev, NULL, ring->qp_state,
		       MLX4_QP_STATE_RST, NULL, 0, 0, &ring->qp);
}

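/* The ring counts as full once fewer than HEADROOM + MAX_DESC_TXBBS TXBBs
 * are left free (see the full_size computation in mlx4_en_create_tx_ring()).
 */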
static inline bool mlx4_en_is_tx_ring_full(struct mlx4_en_tx_ring *ring)
{
	return ring->prod - ring->cons > ring->full_size;
}

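/* Stamp the TXBBs of a completed descriptor with the current SW ownership
 * value; the stamp is flipped whenever the walk wraps past the end of the
 * ring buffer.
 */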
static void mlx4_en_stamp_wqe(struct mlx4_en_priv *priv,
			      struct mlx4_en_tx_ring *ring, int index,
			      u8 owner)
{
	__be32 stamp = cpu_to_be32(STAMP_VAL | (!!owner << STAMP_SHIFT));
	struct mlx4_en_tx_desc *tx_desc = ring->buf + index * TXBB_SIZE;
	struct mlx4_en_tx_info *tx_info = &ring->tx_info[index];
	void *end = ring->buf + ring->buf_size;
	__be32 *ptr = (__be32 *)tx_desc;
	int i;

	/* Optimize the common case when there are no wraparounds */
	if (likely((void *)tx_desc + tx_info->nr_txbb * TXBB_SIZE <= end)) {
		/* Stamp the freed descriptor */
		for (i = 0; i < tx_info->nr_txbb * TXBB_SIZE;
		     i += STAMP_STRIDE) {
			*ptr = stamp;
			ptr += STAMP_DWORDS;
		}
	} else {
		/* Stamp the freed descriptor */
		for (i = 0; i < tx_info->nr_txbb * TXBB_SIZE;
		     i += STAMP_STRIDE) {
			*ptr = stamp;
			ptr += STAMP_DWORDS;
			if ((void *)ptr >= end) {
				ptr = ring->buf;
				stamp ^= cpu_to_be32(0x80000000);
			}
		}
	}
}


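/* Release a completed descriptor: unmap its DMA buffers (unless the data
 * was sent inline), deliver a TX timestamp if one was requested and free
 * the skb.  Returns the number of TXBBs the descriptor occupied.
 */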
static u32 mlx4_en_free_tx_desc(struct mlx4_en_priv *priv,
				struct mlx4_en_tx_ring *ring,
				int index, u8 owner, u64 timestamp,
				int napi_mode)
{
	struct mlx4_en_tx_info *tx_info = &ring->tx_info[index];
	struct mlx4_en_tx_desc *tx_desc = ring->buf + index * TXBB_SIZE;
	struct mlx4_wqe_data_seg *data = (void *) tx_desc + tx_info->data_offset;
	void *end = ring->buf + ring->buf_size;
	struct sk_buff *skb = tx_info->skb;
	int nr_maps = tx_info->nr_maps;
	int i;

	/* We do not touch skb here, so prefetch skb->users location
	 * to speed up consume_skb()
	 */
	prefetchw(&skb->users);

	if (unlikely(timestamp)) {
		struct skb_shared_hwtstamps hwts;

		mlx4_en_fill_hwtstamps(priv->mdev, &hwts, timestamp);
		skb_tstamp_tx(skb, &hwts);
	}

	/* Optimize the common case when there are no wraparounds */
	if (likely((void *) tx_desc + tx_info->nr_txbb * TXBB_SIZE <= end)) {
		if (!tx_info->inl) {
			if (tx_info->linear)
				dma_unmap_single(priv->ddev,
						tx_info->map0_dma,
						tx_info->map0_byte_count,
						PCI_DMA_TODEVICE);
			else
				dma_unmap_page(priv->ddev,
					       tx_info->map0_dma,
					       tx_info->map0_byte_count,
					       PCI_DMA_TODEVICE);
			for (i = 1; i < nr_maps; i++) {
				data++;
				dma_unmap_page(priv->ddev,
					(dma_addr_t)be64_to_cpu(data->addr),
					be32_to_cpu(data->byte_count),
					PCI_DMA_TODEVICE);
			}
		}
	} else {
		if (!tx_info->inl) {
			if ((void *) data >= end) {
				data = ring->buf + ((void *)data - end);
			}

			if (tx_info->linear)
				dma_unmap_single(priv->ddev,
						tx_info->map0_dma,
						tx_info->map0_byte_count,
						PCI_DMA_TODEVICE);
			else
				dma_unmap_page(priv->ddev,
					       tx_info->map0_dma,
					       tx_info->map0_byte_count,
					       PCI_DMA_TODEVICE);
			for (i = 1; i < nr_maps; i++) {
				data++;
				/* Check for wraparound before unmapping */
				if ((void *) data >= end)
					data = ring->buf;
				dma_unmap_page(priv->ddev,
					(dma_addr_t)be64_to_cpu(data->addr),
					be32_to_cpu(data->byte_count),
					PCI_DMA_TODEVICE);
			}
		}
	}
	napi_consume_skb(skb, napi_mode);

	return tx_info->nr_txbb;
}


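/* Free all descriptors still pending on the ring, e.g. when the port is
 * going down.  Returns the number of descriptors freed.
 */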
int mlx4_en_free_tx_buf(struct net_device *dev, struct mlx4_en_tx_ring *ring)
{
	struct mlx4_en_priv *priv = netdev_priv(dev);
	int cnt = 0;

	/* Skip last polled descriptor */
	ring->cons += ring->last_nr_txbb;
	en_dbg(DRV, priv, "Freeing Tx buf - cons:0x%x prod:0x%x\n",
		 ring->cons, ring->prod);

	if ((u32) (ring->prod - ring->cons) > ring->size) {
		if (netif_msg_tx_err(priv))
			en_warn(priv, "Tx consumer passed producer!\n");
		return 0;
	}

	while (ring->cons != ring->prod) {
		ring->last_nr_txbb = mlx4_en_free_tx_desc(priv, ring,
						ring->cons & ring->size_mask,
						!!(ring->cons & ring->size), 0,
						0 /* Non-NAPI caller */);
		ring->cons += ring->last_nr_txbb;
		cnt++;
	}

	netdev_tx_reset_queue(ring->tx_queue);

	if (cnt)
		en_dbg(DRV, priv, "Freed %d uncompleted tx descriptors\n", cnt);

	return cnt;
}

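/* TX completion handling: walk the CQ up to the TX work limit, free and
 * stamp the completed descriptors, update BQL accounting and wake the TX
 * queue if it was stopped.  Returns true if all pending completions were
 * processed within the budget.
 */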
static bool mlx4_en_process_tx_cq(struct net_device *dev,
				  struct mlx4_en_cq *cq, int napi_budget)
{
	struct mlx4_en_priv *priv = netdev_priv(dev);
	struct mlx4_cq *mcq = &cq->mcq;
	struct mlx4_en_tx_ring *ring = priv->tx_ring[cq->ring];
	struct mlx4_cqe *cqe;
	u16 index;
	u16 new_index, ring_index, stamp_index;
	u32 txbbs_skipped = 0;
	u32 txbbs_stamp = 0;
	u32 cons_index = mcq->cons_index;
	int size = cq->size;
	u32 size_mask = ring->size_mask;
	struct mlx4_cqe *buf = cq->buf;
	u32 packets = 0;
	u32 bytes = 0;
	int factor = priv->cqe_factor;
	u64 timestamp = 0;
	int done = 0;
	int budget = priv->tx_work_limit;
	u32 last_nr_txbb;
	u32 ring_cons;

	if (!priv->port_up)
		return true;

	netdev_txq_bql_complete_prefetchw(ring->tx_queue);

	index = cons_index & size_mask;
	cqe = mlx4_en_get_cqe(buf, index, priv->cqe_size) + factor;
	last_nr_txbb = ACCESS_ONCE(ring->last_nr_txbb);
	ring_cons = ACCESS_ONCE(ring->cons);
	ring_index = ring_cons & size_mask;
	stamp_index = ring_index;

	/* Process all completed CQEs */
	while (XNOR(cqe->owner_sr_opcode & MLX4_CQE_OWNER_MASK,
			cons_index & size) && (done < budget)) {
		/*
		 * make sure we read the CQE after we read the
		 * ownership bit
		 */
		dma_rmb();

		if (unlikely((cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) ==
			     MLX4_CQE_OPCODE_ERROR)) {
			struct mlx4_err_cqe *cqe_err = (struct mlx4_err_cqe *)cqe;

			en_err(priv, "CQE error - vendor syndrome: 0x%x syndrome: 0x%x\n",
			       cqe_err->vendor_err_syndrome,
			       cqe_err->syndrome);
		}

		/* Skip over last polled CQE */
		new_index = be16_to_cpu(cqe->wqe_index) & size_mask;

		do {
			txbbs_skipped += last_nr_txbb;
			ring_index = (ring_index + last_nr_txbb) & size_mask;
			if (ring->tx_info[ring_index].ts_requested)
				timestamp = mlx4_en_get_cqe_ts(cqe);

			/* free next descriptor */
			last_nr_txbb = mlx4_en_free_tx_desc(
					priv, ring, ring_index,
					!!((ring_cons + txbbs_skipped) &
					ring->size), timestamp, napi_budget);

			mlx4_en_stamp_wqe(priv, ring, stamp_index,
					  !!((ring_cons + txbbs_stamp) &
						ring->size));
			stamp_index = ring_index;
			txbbs_stamp = txbbs_skipped;
			packets++;
			bytes += ring->tx_info[ring_index].nr_bytes;
		} while ((++done < budget) && (ring_index != new_index));

		++cons_index;
		index = cons_index & size_mask;
		cqe = mlx4_en_get_cqe(buf, index, priv->cqe_size) + factor;
	}


	/*
	 * To prevent CQ overflow we first update CQ consumer and only then
	 * the ring consumer.
	 */
	mcq->cons_index = cons_index;
	mlx4_cq_set_ci(mcq);
	wmb();

	/* we want to dirty this cache line once */
	ACCESS_ONCE(ring->last_nr_txbb) = last_nr_txbb;
	ACCESS_ONCE(ring->cons) = ring_cons + txbbs_skipped;

	netdev_tx_completed_queue(ring->tx_queue, packets, bytes);

	/* Wake up the Tx queue if it was stopped and the ring is not full */
	if (netif_tx_queue_stopped(ring->tx_queue) &&
	    !mlx4_en_is_tx_ring_full(ring)) {
		netif_tx_wake_queue(ring->tx_queue);
		ring->wake_queue++;
	}
	return done < budget;
}

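/* TX completion interrupt: defer the work to NAPI context */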
void mlx4_en_tx_irq(struct mlx4_cq *mcq)
{
	struct mlx4_en_cq *cq = container_of(mcq, struct mlx4_en_cq, mcq);
	struct mlx4_en_priv *priv = netdev_priv(cq->dev);

	if (likely(priv->port_up))
		napi_schedule_irqoff(&cq->napi);
	else
		mlx4_en_arm_cq(priv, cq);
}

/* TX CQ polling - called by NAPI */
int mlx4_en_poll_tx_cq(struct napi_struct *napi, int budget)
{
	struct mlx4_en_cq *cq = container_of(napi, struct mlx4_en_cq, napi);
	struct net_device *dev = cq->dev;
	struct mlx4_en_priv *priv = netdev_priv(dev);
	int clean_complete;

	clean_complete = mlx4_en_process_tx_cq(dev, cq, budget);
	if (!clean_complete)
		return budget;

	napi_complete(napi);
	mlx4_en_arm_cq(priv, cq);

	return 0;
}

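/* Copy a descriptor that was built in the bounce buffer (because it would
 * have wrapped past the end of the ring) into its real location.  The copy
 * runs backwards, issues a barrier at every TXBB boundary and skips the
 * descriptor's first dword (owner_opcode), which the caller writes last.
 */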
static struct mlx4_en_tx_desc *mlx4_en_bounce_to_desc(struct mlx4_en_priv *priv,
						      struct mlx4_en_tx_ring *ring,
						      u32 index,
						      unsigned int desc_size)
{
	u32 copy = (ring->size - index) * TXBB_SIZE;
	int i;

	for (i = desc_size - copy - 4; i >= 0; i -= 4) {
		if ((i & (TXBB_SIZE - 1)) == 0)
			wmb();

		*((u32 *) (ring->buf + i)) =
			*((u32 *) (ring->bounce_buf + copy + i));
	}

	for (i = copy - 4; i >= 4 ; i -= 4) {
		if ((i & (TXBB_SIZE - 1)) == 0)
			wmb();

		*((u32 *) (ring->buf + index * TXBB_SIZE + i)) =
			*((u32 *) (ring->bounce_buf + i));
	}

	/* Return real descriptor location */
	return ring->buf + index * TXBB_SIZE;
}

/* Decide if skb can be inlined in tx descriptor to avoid dma mapping
 *
 * It seems strange we do not simply use skb_copy_bits().
 * This would allow us to inline all skbs iff skb->len <= inline_thold
 *
 * Note that caller already checked skb was not a gso packet
 */
static bool is_inline(int inline_thold, const struct sk_buff *skb,
		      const struct skb_shared_info *shinfo,
		      void **pfrag)
{
	void *ptr;

	if (skb->len > inline_thold || !inline_thold)
		return false;

	if (shinfo->nr_frags == 1) {
		ptr = skb_frag_address_safe(&shinfo->frags[0]);
		if (unlikely(!ptr))
			return false;
		*pfrag = ptr;
		return true;
	}
	if (shinfo->nr_frags)
		return false;
	return true;
}

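/* Descriptor bytes needed to send the skb inline, aligned to 16 bytes;
 * a second inline segment header is needed when the data does not fit
 * within MLX4_INLINE_ALIGN.
 */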
static int inline_size(const struct sk_buff *skb)
{
	if (skb->len + CTRL_SIZE + sizeof(struct mlx4_wqe_inline_seg)
	    <= MLX4_INLINE_ALIGN)
		return ALIGN(skb->len + CTRL_SIZE +
			     sizeof(struct mlx4_wqe_inline_seg), 16);
	else
		return ALIGN(skb->len + CTRL_SIZE + 2 *
			     sizeof(struct mlx4_wqe_inline_seg), 16);
}

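/* Compute the descriptor size in bytes for this skb: either the inline
 * size, or the control segment plus one data segment per buffer (plus the
 * LSO prefix for TSO packets).  Returns 0 if the packet cannot be sent,
 * i.e. when LSO headers are not in the linear part of the skb.
 */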
static int get_real_size(const struct sk_buff *skb,
			 const struct skb_shared_info *shinfo,
			 struct net_device *dev,
			 int *lso_header_size,
			 bool *inline_ok,
			 void **pfrag)
{
	struct mlx4_en_priv *priv = netdev_priv(dev);
	int real_size;

	if (shinfo->gso_size) {
		*inline_ok = false;
		if (skb->encapsulation)
			*lso_header_size = (skb_inner_transport_header(skb) - skb->data) + inner_tcp_hdrlen(skb);
		else
			*lso_header_size = skb_transport_offset(skb) + tcp_hdrlen(skb);
		real_size = CTRL_SIZE + shinfo->nr_frags * DS_SIZE +
			ALIGN(*lso_header_size + 4, DS_SIZE);
		if (unlikely(*lso_header_size != skb_headlen(skb))) {
			/* We add a segment for the skb linear buffer only if
			 * it contains data */
			if (*lso_header_size < skb_headlen(skb))
				real_size += DS_SIZE;
			else {
				if (netif_msg_tx_err(priv))
					en_warn(priv, "Non-linear headers\n");
				return 0;
			}
		}
	} else {
		*lso_header_size = 0;
		*inline_ok = is_inline(priv->prof->inline_thold, skb,
				       shinfo, pfrag);

		if (*inline_ok)
			real_size = inline_size(skb);
		else
			real_size = CTRL_SIZE +
				    (shinfo->nr_frags + 1) * DS_SIZE;
	}

	return real_size;
}

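/* Copy the whole packet into the descriptor's inline segment(s).  A packet
 * larger than the first segment's capacity is split into two segments; the
 * second segment's byte_count is made visible only after the data copy
 * (dma_wmb), so HW never reads a partially written segment.
 */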
static void build_inline_wqe(struct mlx4_en_tx_desc *tx_desc,
			     const struct sk_buff *skb,
			     const struct skb_shared_info *shinfo,
			     int real_size, u16 *vlan_tag,
			     int tx_ind, void *fragptr)
{
	struct mlx4_wqe_inline_seg *inl = &tx_desc->inl;
	int spc = MLX4_INLINE_ALIGN - CTRL_SIZE - sizeof *inl;
	unsigned int hlen = skb_headlen(skb);

	if (skb->len <= spc) {
		if (likely(skb->len >= MIN_PKT_LEN)) {
			inl->byte_count = cpu_to_be32(1 << 31 | skb->len);
		} else {
			inl->byte_count = cpu_to_be32(1 << 31 | MIN_PKT_LEN);
			memset(((void *)(inl + 1)) + skb->len, 0,
			       MIN_PKT_LEN - skb->len);
		}
		skb_copy_from_linear_data(skb, inl + 1, hlen);
		if (shinfo->nr_frags)
			memcpy(((void *)(inl + 1)) + hlen, fragptr,
			       skb_frag_size(&shinfo->frags[0]));

	} else {
		inl->byte_count = cpu_to_be32(1 << 31 | spc);
		if (hlen <= spc) {
			skb_copy_from_linear_data(skb, inl + 1, hlen);
			if (hlen < spc) {
				memcpy(((void *)(inl + 1)) + hlen,
				       fragptr, spc - hlen);
				fragptr +=  spc - hlen;
			}
			inl = (void *) (inl + 1) + spc;
			memcpy(((void *)(inl + 1)), fragptr, skb->len - spc);
		} else {
			skb_copy_from_linear_data(skb, inl + 1, spc);
			inl = (void *) (inl + 1) + spc;
			skb_copy_from_linear_data_offset(skb, spc, inl + 1,
							 hlen - spc);
			if (shinfo->nr_frags)
				memcpy(((void *)(inl + 1)) + hlen - spc,
				       fragptr,
				       skb_frag_size(&shinfo->frags[0]));
		}

		dma_wmb();
		inl->byte_count = cpu_to_be32(1 << 31 | (skb->len - spc));
	}
}

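/* Select a TX queue: when multiple traffic classes are configured use the
 * standard TX hash, otherwise hash into the group of rings that serves the
 * packet's VLAN priority.
 */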
u16 mlx4_en_select_queue(struct net_device *dev, struct sk_buff *skb,
			 void *accel_priv, select_queue_fallback_t fallback)
{
	struct mlx4_en_priv *priv = netdev_priv(dev);
	u16 rings_p_up = priv->num_tx_rings_p_up;
	u8 up = 0;

	if (dev->num_tc)
		return skb_tx_hash(dev, skb);

	if (skb_vlan_tag_present(skb))
		up = skb_vlan_tag_get(skb) >> VLAN_PRIO_SHIFT;

	return fallback(dev, skb) % rings_p_up + up * rings_p_up;
}

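/* Copy the descriptor into the BlueFlame register as 64-bit words */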
static void mlx4_bf_copy(void __iomem *dst, const void *src,
			 unsigned int bytecnt)
{
	__iowrite64_copy(dst, src, bytecnt / 8);
}

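/* Main transmit path: build the TX descriptor (inline, LSO or gather),
 * DMA-map the fragments as needed, hand the descriptor to HW and ring the
 * doorbell, either through the BlueFlame register or a regular doorbell
 * write.
 */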
netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev)
{
	struct skb_shared_info *shinfo = skb_shinfo(skb);
	struct mlx4_en_priv *priv = netdev_priv(dev);
	struct device *ddev = priv->ddev;
	struct mlx4_en_tx_ring *ring;
	struct mlx4_en_tx_desc *tx_desc;
	struct mlx4_wqe_data_seg *data;
	struct mlx4_en_tx_info *tx_info;
	int tx_ind = 0;
	int nr_txbb;
	int desc_size;
	int real_size;
	u32 index, bf_index;
	__be32 op_own;
	u16 vlan_tag = 0;
	u16 vlan_proto = 0;
	int i_frag;
	int lso_header_size;
	void *fragptr = NULL;
	bool bounce = false;
	bool send_doorbell;
	bool stop_queue;
	bool inline_ok;
	u32 ring_cons;

	if (!priv->port_up)
		goto tx_drop;

	tx_ind = skb_get_queue_mapping(skb);
	ring = priv->tx_ring[tx_ind];

	/* fetch ring->cons far ahead before needing it to avoid stall */
	ring_cons = ACCESS_ONCE(ring->cons);

	real_size = get_real_size(skb, shinfo, dev, &lso_header_size,
				  &inline_ok, &fragptr);
	if (unlikely(!real_size))
		goto tx_drop;

	/* Align descriptor to TXBB size */
	desc_size = ALIGN(real_size, TXBB_SIZE);
	nr_txbb = desc_size / TXBB_SIZE;
	if (unlikely(nr_txbb > MAX_DESC_TXBBS)) {
		if (netif_msg_tx_err(priv))
			en_warn(priv, "Oversized header or SG list\n");
		goto tx_drop;
	}

	if (skb_vlan_tag_present(skb)) {
		vlan_tag = skb_vlan_tag_get(skb);
		vlan_proto = be16_to_cpu(skb->vlan_proto);
	}

	netdev_txq_bql_enqueue_prefetchw(ring->tx_queue);

	/* Track current inflight packets for performance analysis */
	AVG_PERF_COUNTER(priv->pstats.inflight_avg,
			 (u32)(ring->prod - ring_cons - 1));

	/* Packet is good - grab an index and transmit it */
	index = ring->prod & ring->size_mask;
	bf_index = ring->prod;

	/* See if we have enough space for whole descriptor TXBB for setting
	 * SW ownership on next descriptor; if not, use a bounce buffer. */
	if (likely(index + nr_txbb <= ring->size))
		tx_desc = ring->buf + index * TXBB_SIZE;
	else {
		tx_desc = (struct mlx4_en_tx_desc *) ring->bounce_buf;
		bounce = true;
	}

	/* Save skb in tx_info ring */
	tx_info = &ring->tx_info[index];
	tx_info->skb = skb;
	tx_info->nr_txbb = nr_txbb;

	data = &tx_desc->data;
	if (lso_header_size)
		data = ((void *)&tx_desc->lso + ALIGN(lso_header_size + 4,
						      DS_SIZE));

	/* valid only for non-inline segments */
	tx_info->data_offset = (void *)data - (void *)tx_desc;

	tx_info->inl = inline_ok;

	tx_info->linear = (lso_header_size < skb_headlen(skb) &&
			   !inline_ok) ? 1 : 0;

	tx_info->nr_maps = shinfo->nr_frags + tx_info->linear;
	data += tx_info->nr_maps - 1;

	if (!tx_info->inl) {
		dma_addr_t dma = 0;
		u32 byte_count = 0;

		/* Map fragments if any */
		for (i_frag = shinfo->nr_frags - 1; i_frag >= 0; i_frag--) {
			const struct skb_frag_struct *frag;

			frag = &shinfo->frags[i_frag];
			byte_count = skb_frag_size(frag);
			dma = skb_frag_dma_map(ddev, frag,
					       0, byte_count,
					       DMA_TO_DEVICE);
			if (dma_mapping_error(ddev, dma))
				goto tx_drop_unmap;

			data->addr = cpu_to_be64(dma);
			data->lkey = ring->mr_key;
			dma_wmb();
			data->byte_count = cpu_to_be32(byte_count);
			--data;
		}

		/* Map linear part if needed */
		if (tx_info->linear) {
			byte_count = skb_headlen(skb) - lso_header_size;

			dma = dma_map_single(ddev, skb->data +
					     lso_header_size, byte_count,
					     PCI_DMA_TODEVICE);
			if (dma_mapping_error(ddev, dma))
				goto tx_drop_unmap;

			data->addr = cpu_to_be64(dma);
			data->lkey = ring->mr_key;
			dma_wmb();
			data->byte_count = cpu_to_be32(byte_count);
		}
		/* tx completion can avoid cache line miss for common cases */
		tx_info->map0_dma = dma;
		tx_info->map0_byte_count = byte_count;
	}

	/*
	 * For timestamping add flag to skb_shinfo and
	 * set flag for further reference
	 */
	tx_info->ts_requested = 0;
	if (unlikely(ring->hwtstamp_tx_type == HWTSTAMP_TX_ON &&
		     shinfo->tx_flags & SKBTX_HW_TSTAMP)) {
		shinfo->tx_flags |= SKBTX_IN_PROGRESS;
		tx_info->ts_requested = 1;
	}

	/* Prepare ctrl segment apart from opcode+ownership, which depends on
	 * whether LSO is used */
	tx_desc->ctrl.srcrb_flags = priv->ctrl_flags;
	if (likely(skb->ip_summed == CHECKSUM_PARTIAL)) {
		if (!skb->encapsulation)
			tx_desc->ctrl.srcrb_flags |= cpu_to_be32(MLX4_WQE_CTRL_IP_CSUM |
								 MLX4_WQE_CTRL_TCP_UDP_CSUM);
		else
			tx_desc->ctrl.srcrb_flags |= cpu_to_be32(MLX4_WQE_CTRL_IP_CSUM);
		ring->tx_csum++;
	}

	if (priv->flags & MLX4_EN_FLAG_ENABLE_HW_LOOPBACK) {
		struct ethhdr *ethh;

		/* Copy dst mac address to wqe. This allows loopback in eSwitch,
		 * so that VFs and PF can communicate with each other
		 */
		ethh = (struct ethhdr *)skb->data;
		tx_desc->ctrl.srcrb_flags16[0] = get_unaligned((__be16 *)ethh->h_dest);
		tx_desc->ctrl.imm = get_unaligned((__be32 *)(ethh->h_dest + 2));
	}

	/* Handle LSO (TSO) packets */
	if (lso_header_size) {
		int i;

		/* Mark opcode as LSO */
		op_own = cpu_to_be32(MLX4_OPCODE_LSO | (1 << 6)) |
			((ring->prod & ring->size) ?
				cpu_to_be32(MLX4_EN_BIT_DESC_OWN) : 0);

		/* Fill in the LSO prefix */
		tx_desc->lso.mss_hdr_size = cpu_to_be32(
			shinfo->gso_size << 16 | lso_header_size);

		/* Copy headers;
		 * note that we already verified that it is linear */
		memcpy(tx_desc->lso.header, skb->data, lso_header_size);

		ring->tso_packets++;

		i = ((skb->len - lso_header_size) / shinfo->gso_size) +
			!!((skb->len - lso_header_size) % shinfo->gso_size);
		tx_info->nr_bytes = skb->len + (i - 1) * lso_header_size;
		ring->packets += i;
	} else {
		/* Normal (Non LSO) packet */
		op_own = cpu_to_be32(MLX4_OPCODE_SEND) |
			((ring->prod & ring->size) ?
			 cpu_to_be32(MLX4_EN_BIT_DESC_OWN) : 0);
		tx_info->nr_bytes = max_t(unsigned int, skb->len, ETH_ZLEN);
		ring->packets++;
	}
	ring->bytes += tx_info->nr_bytes;
	netdev_tx_sent_queue(ring->tx_queue, tx_info->nr_bytes);
	AVG_PERF_COUNTER(priv->pstats.tx_pktsz_avg, skb->len);

	if (tx_info->inl)
		build_inline_wqe(tx_desc, skb, shinfo, real_size, &vlan_tag,
				 tx_ind, fragptr);

	if (skb->encapsulation) {
		struct iphdr *ipv4 = (struct iphdr *)skb_inner_network_header(skb);
		if (ipv4->protocol == IPPROTO_TCP || ipv4->protocol == IPPROTO_UDP)
			op_own |= cpu_to_be32(MLX4_WQE_CTRL_IIP | MLX4_WQE_CTRL_ILP);
		else
			op_own |= cpu_to_be32(MLX4_WQE_CTRL_IIP);
	}

	ring->prod += nr_txbb;

	/* If we used a bounce buffer then copy descriptor back into place */
	if (unlikely(bounce))
		tx_desc = mlx4_en_bounce_to_desc(priv, ring, index, desc_size);

	skb_tx_timestamp(skb);

	/* Check available TXBBs and 2K spare for prefetch */
	stop_queue = mlx4_en_is_tx_ring_full(ring);
	if (unlikely(stop_queue)) {
		netif_tx_stop_queue(ring->tx_queue);
		ring->queue_stopped++;
	}
	send_doorbell = !skb->xmit_more || netif_xmit_stopped(ring->tx_queue);

	real_size = (real_size / 16) & 0x3f;

	if (ring->bf_enabled && desc_size <= MAX_BF && !bounce &&
	    !skb_vlan_tag_present(skb) && send_doorbell) {
		tx_desc->ctrl.bf_qpn = ring->doorbell_qpn |
				       cpu_to_be32(real_size);

		op_own |= htonl((bf_index & 0xffff) << 8);
		/* Ensure new descriptor hits memory
		 * before setting ownership of this descriptor to HW
		 */
		dma_wmb();
		tx_desc->ctrl.owner_opcode = op_own;

		wmb();

		mlx4_bf_copy(ring->bf.reg + ring->bf.offset, &tx_desc->ctrl,
			     desc_size);

		wmb();

		ring->bf.offset ^= ring->bf.buf_size;
	} else {
		tx_desc->ctrl.vlan_tag = cpu_to_be16(vlan_tag);
		if (vlan_proto == ETH_P_8021AD)
			tx_desc->ctrl.ins_vlan = MLX4_WQE_CTRL_INS_SVLAN;
		else if (vlan_proto == ETH_P_8021Q)
			tx_desc->ctrl.ins_vlan = MLX4_WQE_CTRL_INS_CVLAN;
		else
			tx_desc->ctrl.ins_vlan = 0;

		tx_desc->ctrl.fence_size = real_size;

		/* Ensure new descriptor hits memory
		 * before setting ownership of this descriptor to HW
		 */
		dma_wmb();
		tx_desc->ctrl.owner_opcode = op_own;
		if (send_doorbell) {
			wmb();
			/* Since there is no iowrite*_native() that writes the
			 * value as is, without byteswapping - using the one
			 * that doesn't do byteswapping in the relevant arch
			 * endianness.
			 */
#if defined(__LITTLE_ENDIAN)
			iowrite32(
#else
			iowrite32be(
#endif
				  ring->doorbell_qpn,
				  ring->bf.uar->map + MLX4_SEND_DOORBELL);
		} else {
			ring->xmit_more++;
		}
	}

	if (unlikely(stop_queue)) {
		/* If queue was emptied after the if (stop_queue), and before
		 * the netif_tx_stop_queue() - need to wake the queue,
		 * or else it will remain stopped forever.
		 * Need a memory barrier to make sure ring->cons was not
		 * updated before queue was stopped.
		 */
		smp_rmb();

		ring_cons = ACCESS_ONCE(ring->cons);
		if (unlikely(!mlx4_en_is_tx_ring_full(ring))) {
			netif_tx_wake_queue(ring->tx_queue);
			ring->wake_queue++;
		}
	}
	return NETDEV_TX_OK;

tx_drop_unmap:
	en_err(priv, "DMA mapping error\n");

	while (++i_frag < shinfo->nr_frags) {
		++data;
		dma_unmap_page(ddev, (dma_addr_t) be64_to_cpu(data->addr),
			       be32_to_cpu(data->byte_count),
			       PCI_DMA_TODEVICE);
	}

tx_drop:
	dev_kfree_skb_any(skb);
	priv->stats.tx_dropped++;
	return NETDEV_TX_OK;
}