/*
 * Copyright (c) 2007 Mellanox Technologies. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 */

#include <asm/page.h>
#include <linux/mlx4/cq.h>
#include <linux/slab.h>
#include <linux/mlx4/qp.h>
#include <linux/skbuff.h>
#include <linux/if_vlan.h>
#include <linux/prefetch.h>
#include <linux/vmalloc.h>
#include <linux/tcp.h>
#include <linux/ip.h>
#include <linux/moduleparam.h>

#include "mlx4_en.h"

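/* Set up one TX ring: the ring structure and tx_info array are allocated
 * near the given NUMA node (with plain kmalloc/vmalloc fallbacks), a bounce
 * buffer is kept for descriptors that would wrap past the end of the ring,
 * HW queue resources are allocated on the requested node, and a QP is
 * reserved and allocated for the ring; a BlueFlame register is used for
 * doorbells when one can be obtained.
 */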
int mlx4_en_create_tx_ring(struct mlx4_en_priv *priv,
			   struct mlx4_en_tx_ring **pring, u32 size,
			   u16 stride, int node, int queue_index)
{
	struct mlx4_en_dev *mdev = priv->mdev;
	struct mlx4_en_tx_ring *ring;
	int tmp;
	int err;

	ring = kzalloc_node(sizeof(*ring), GFP_KERNEL, node);
	if (!ring) {
		ring = kzalloc(sizeof(*ring), GFP_KERNEL);
		if (!ring) {
			en_err(priv, "Failed allocating TX ring\n");
			return -ENOMEM;
		}
	}

	ring->size = size;
	ring->size_mask = size - 1;
	ring->stride = stride;

	tmp = size * sizeof(struct mlx4_en_tx_info);
	ring->tx_info = kmalloc_node(tmp, GFP_KERNEL | __GFP_NOWARN, node);
	if (!ring->tx_info) {
		ring->tx_info = vmalloc(tmp);
		if (!ring->tx_info) {
			err = -ENOMEM;
			goto err_ring;
		}
	}

	en_dbg(DRV, priv, "Allocated tx_info ring at addr:%p size:%d\n",
		 ring->tx_info, tmp);

	ring->bounce_buf = kmalloc_node(MAX_DESC_SIZE, GFP_KERNEL, node);
	if (!ring->bounce_buf) {
		ring->bounce_buf = kmalloc(MAX_DESC_SIZE, GFP_KERNEL);
		if (!ring->bounce_buf) {
			err = -ENOMEM;
			goto err_info;
		}
	}
	ring->buf_size = ALIGN(size * ring->stride, MLX4_EN_PAGE_SIZE);

	/* Allocate HW buffers on provided NUMA node */
	set_dev_node(&mdev->dev->persist->pdev->dev, node);
	err = mlx4_alloc_hwq_res(mdev->dev, &ring->wqres, ring->buf_size,
				 2 * PAGE_SIZE);
	set_dev_node(&mdev->dev->persist->pdev->dev, mdev->dev->numa_node);
	if (err) {
		en_err(priv, "Failed allocating hwq resources\n");
		goto err_bounce;
	}

	err = mlx4_en_map_buffer(&ring->wqres.buf);
	if (err) {
		en_err(priv, "Failed to map TX buffer\n");
		goto err_hwq_res;
	}

	ring->buf = ring->wqres.buf.direct.buf;

	en_dbg(DRV, priv, "Allocated TX ring (addr:%p) - buf:%p size:%d buf_size:%d dma:%llx\n",
	       ring, ring->buf, ring->size, ring->buf_size,
	       (unsigned long long) ring->wqres.buf.direct.map);

	err = mlx4_qp_reserve_range(mdev->dev, 1, 1, &ring->qpn,
				    MLX4_RESERVE_ETH_BF_QP);
	if (err) {
		en_err(priv, "failed reserving qp for TX ring\n");
		goto err_map;
	}

	err = mlx4_qp_alloc(mdev->dev, ring->qpn, &ring->qp, GFP_KERNEL);
	if (err) {
		en_err(priv, "Failed allocating qp %d\n", ring->qpn);
		goto err_reserve;
	}
	ring->qp.event = mlx4_en_sqp_event;

	err = mlx4_bf_alloc(mdev->dev, &ring->bf, node);
	if (err) {
		en_dbg(DRV, priv, "working without blueflame (%d)\n", err);
		ring->bf.uar = &mdev->priv_uar;
		ring->bf.uar->map = mdev->uar_map;
		ring->bf_enabled = false;
		ring->bf_alloced = false;
		priv->pflags &= ~MLX4_EN_PRIV_FLAGS_BLUEFLAME;
	} else {
		ring->bf_alloced = true;
		ring->bf_enabled = !!(priv->pflags &
				      MLX4_EN_PRIV_FLAGS_BLUEFLAME);
	}

	ring->hwtstamp_tx_type = priv->hwtstamp_config.tx_type;
	ring->queue_index = queue_index;

	if (queue_index < priv->num_tx_rings_p_up && cpu_online(queue_index))
		cpumask_set_cpu(queue_index, &ring->affinity_mask);

	*pring = ring;
	return 0;

err_reserve:
	mlx4_qp_release_range(mdev->dev, ring->qpn, 1);
err_map:
	mlx4_en_unmap_buffer(&ring->wqres.buf);
err_hwq_res:
	mlx4_free_hwq_res(mdev->dev, &ring->wqres, ring->buf_size);
err_bounce:
	kfree(ring->bounce_buf);
	ring->bounce_buf = NULL;
err_info:
	kvfree(ring->tx_info);
	ring->tx_info = NULL;
err_ring:
	kfree(ring);
	*pring = NULL;
	return err;
}

void mlx4_en_destroy_tx_ring(struct mlx4_en_priv *priv,
			     struct mlx4_en_tx_ring **pring)
{
	struct mlx4_en_dev *mdev = priv->mdev;
	struct mlx4_en_tx_ring *ring = *pring;
	en_dbg(DRV, priv, "Destroying tx ring, qpn: %d\n", ring->qpn);

	if (ring->bf_alloced)
		mlx4_bf_free(mdev->dev, &ring->bf);
	mlx4_qp_remove(mdev->dev, &ring->qp);
	mlx4_qp_free(mdev->dev, &ring->qp);
	mlx4_en_unmap_buffer(&ring->wqres.buf);
	mlx4_free_hwq_res(mdev->dev, &ring->wqres, ring->buf_size);
	kfree(ring->bounce_buf);
	ring->bounce_buf = NULL;
	kvfree(ring->tx_info);
	ring->tx_info = NULL;
	kfree(ring);
	*pring = NULL;
}

int mlx4_en_activate_tx_ring(struct mlx4_en_priv *priv,
			     struct mlx4_en_tx_ring *ring,
			     int cq, int user_prio)
{
	struct mlx4_en_dev *mdev = priv->mdev;
	int err;

	ring->cqn = cq;
	ring->prod = 0;
	ring->cons = 0xffffffff;
	ring->last_nr_txbb = 1;
	memset(ring->tx_info, 0, ring->size * sizeof(struct mlx4_en_tx_info));
	memset(ring->buf, 0, ring->buf_size);

	ring->qp_state = MLX4_QP_STATE_RST;
	ring->doorbell_qpn = cpu_to_be32(ring->qp.qpn << 8);
	ring->mr_key = cpu_to_be32(mdev->mr.key);

	mlx4_en_fill_qp_context(priv, ring->size, ring->stride, 1, 0, ring->qpn,
				ring->cqn, user_prio, &ring->context);
	if (ring->bf_alloced)
		ring->context.usr_page = cpu_to_be32(ring->bf.uar->index);

	err = mlx4_qp_to_ready(mdev->dev, &ring->wqres.mtt, &ring->context,
			       &ring->qp, &ring->qp_state);
	if (!user_prio && cpu_online(ring->queue_index))
		netif_set_xps_queue(priv->dev, &ring->affinity_mask,
				    ring->queue_index);

	return err;
}

void mlx4_en_deactivate_tx_ring(struct mlx4_en_priv *priv,
				struct mlx4_en_tx_ring *ring)
{
	struct mlx4_en_dev *mdev = priv->mdev;

	mlx4_qp_modify(mdev->dev, NULL, ring->qp_state,
		       MLX4_QP_STATE_RST, NULL, 0, 0, &ring->qp);
}

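/* Stamp the TXBBs of a freed descriptor with STAMP_VAL plus the expected
 * ownership bit every STAMP_STRIDE bytes, so later passes over the ring can
 * tell reclaimed descriptors apart from ones still owned by HW.  When the
 * descriptor wraps past the end of the ring, the stamped ownership bit is
 * flipped for the wrapped part, since the next lap around the ring uses the
 * opposite owner value.
 */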
static void mlx4_en_stamp_wqe(struct mlx4_en_priv *priv,
			      struct mlx4_en_tx_ring *ring, int index,
			      u8 owner)
{
	__be32 stamp = cpu_to_be32(STAMP_VAL | (!!owner << STAMP_SHIFT));
	struct mlx4_en_tx_desc *tx_desc = ring->buf + index * TXBB_SIZE;
	struct mlx4_en_tx_info *tx_info = &ring->tx_info[index];
	void *end = ring->buf + ring->buf_size;
	__be32 *ptr = (__be32 *)tx_desc;
	int i;

	/* Optimize the common case when there are no wraparounds */
	if (likely((void *)tx_desc + tx_info->nr_txbb * TXBB_SIZE <= end)) {
		/* Stamp the freed descriptor */
		for (i = 0; i < tx_info->nr_txbb * TXBB_SIZE;
		     i += STAMP_STRIDE) {
			*ptr = stamp;
			ptr += STAMP_DWORDS;
		}
	} else {
		/* Stamp the freed descriptor */
		for (i = 0; i < tx_info->nr_txbb * TXBB_SIZE;
		     i += STAMP_STRIDE) {
			*ptr = stamp;
			ptr += STAMP_DWORDS;
			if ((void *)ptr >= end) {
				ptr = ring->buf;
				stamp ^= cpu_to_be32(0x80000000);
			}
		}
	}
}


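/* Release one completed descriptor: deliver the HW timestamp if one was
 * requested, unmap the linear part and any page fragments (accounting for
 * data segments that wrap past the end of the ring), consume the skb and
 * return the number of TXBBs this descriptor occupied.
 */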
static u32 mlx4_en_free_tx_desc(struct mlx4_en_priv *priv,
				struct mlx4_en_tx_ring *ring,
				int index, u8 owner, u64 timestamp)
{
	struct mlx4_en_tx_info *tx_info = &ring->tx_info[index];
	struct mlx4_en_tx_desc *tx_desc = ring->buf + index * TXBB_SIZE;
	struct mlx4_wqe_data_seg *data = (void *) tx_desc + tx_info->data_offset;
	void *end = ring->buf + ring->buf_size;
	struct sk_buff *skb = tx_info->skb;
	int nr_maps = tx_info->nr_maps;
	int i;

	/* We do not touch skb here, so prefetch skb->users location
	 * to speedup consume_skb()
	 */
	prefetchw(&skb->users);

	if (unlikely(timestamp)) {
		struct skb_shared_hwtstamps hwts;

		mlx4_en_fill_hwtstamps(priv->mdev, &hwts, timestamp);
		skb_tstamp_tx(skb, &hwts);
	}

	/* Optimize the common case when there are no wraparounds */
	if (likely((void *) tx_desc + tx_info->nr_txbb * TXBB_SIZE <= end)) {
		if (!tx_info->inl) {
			if (tx_info->linear)
				dma_unmap_single(priv->ddev,
						tx_info->map0_dma,
						tx_info->map0_byte_count,
						PCI_DMA_TODEVICE);
			else
				dma_unmap_page(priv->ddev,
					       tx_info->map0_dma,
					       tx_info->map0_byte_count,
					       PCI_DMA_TODEVICE);
			for (i = 1; i < nr_maps; i++) {
				data++;
				dma_unmap_page(priv->ddev,
					(dma_addr_t)be64_to_cpu(data->addr),
					be32_to_cpu(data->byte_count),
					PCI_DMA_TODEVICE);
			}
		}
	} else {
		if (!tx_info->inl) {
			if ((void *) data >= end) {
				data = ring->buf + ((void *)data - end);
			}

			if (tx_info->linear)
				dma_unmap_single(priv->ddev,
						tx_info->map0_dma,
						tx_info->map0_byte_count,
						PCI_DMA_TODEVICE);
			else
				dma_unmap_page(priv->ddev,
					       tx_info->map0_dma,
					       tx_info->map0_byte_count,
					       PCI_DMA_TODEVICE);
			for (i = 1; i < nr_maps; i++) {
				data++;
				/* Check for wraparound before unmapping */
				if ((void *) data >= end)
					data = ring->buf;
				dma_unmap_page(priv->ddev,
					(dma_addr_t)be64_to_cpu(data->addr),
					be32_to_cpu(data->byte_count),
					PCI_DMA_TODEVICE);
			}
		}
	}
	dev_consume_skb_any(skb);
	return tx_info->nr_txbb;
}


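/* Free any descriptors still pending on the ring, typically while the port
 * is being torn down; returns the number of uncompleted descriptors freed.
 */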
int mlx4_en_free_tx_buf(struct net_device *dev, struct mlx4_en_tx_ring *ring)
{
	struct mlx4_en_priv *priv = netdev_priv(dev);
	int cnt = 0;

	/* Skip last polled descriptor */
	ring->cons += ring->last_nr_txbb;
	en_dbg(DRV, priv, "Freeing Tx buf - cons:0x%x prod:0x%x\n",
		 ring->cons, ring->prod);

	if ((u32) (ring->prod - ring->cons) > ring->size) {
		if (netif_msg_tx_err(priv))
			en_warn(priv, "Tx consumer passed producer!\n");
		return 0;
	}

	while (ring->cons != ring->prod) {
		ring->last_nr_txbb = mlx4_en_free_tx_desc(priv, ring,
						ring->cons & ring->size_mask,
						!!(ring->cons & ring->size), 0);
		ring->cons += ring->last_nr_txbb;
		cnt++;
	}

	netdev_tx_reset_queue(ring->tx_queue);

	if (cnt)
		en_dbg(DRV, priv, "Freed %d uncompleted tx descriptors\n", cnt);

	return cnt;
}

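/* TX completion handling (NAPI context): walk the CQ until the ownership
 * bit no longer matches or the tx_work_limit budget is spent, free and
 * stamp the descriptors covered by each CQE, then publish the new CQ and
 * ring consumer values, update BQL accounting and wake the TX queue if it
 * was stopped.  Returns true when the CQ was fully drained within budget.
 */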
static bool mlx4_en_process_tx_cq(struct net_device *dev,
				 struct mlx4_en_cq *cq)
{
	struct mlx4_en_priv *priv = netdev_priv(dev);
	struct mlx4_cq *mcq = &cq->mcq;
	struct mlx4_en_tx_ring *ring = priv->tx_ring[cq->ring];
	struct mlx4_cqe *cqe;
	u16 index;
	u16 new_index, ring_index, stamp_index;
	u32 txbbs_skipped = 0;
	u32 txbbs_stamp = 0;
	u32 cons_index = mcq->cons_index;
	int size = cq->size;
	u32 size_mask = ring->size_mask;
	struct mlx4_cqe *buf = cq->buf;
	u32 packets = 0;
	u32 bytes = 0;
	int factor = priv->cqe_factor;
	u64 timestamp = 0;
	int done = 0;
	int budget = priv->tx_work_limit;
	u32 last_nr_txbb;
	u32 ring_cons;

	if (!priv->port_up)
		return true;

	netdev_txq_bql_complete_prefetchw(ring->tx_queue);

	index = cons_index & size_mask;
	cqe = mlx4_en_get_cqe(buf, index, priv->cqe_size) + factor;
	last_nr_txbb = ACCESS_ONCE(ring->last_nr_txbb);
	ring_cons = ACCESS_ONCE(ring->cons);
	ring_index = ring_cons & size_mask;
	stamp_index = ring_index;

	/* Process all completed CQEs */
	while (XNOR(cqe->owner_sr_opcode & MLX4_CQE_OWNER_MASK,
			cons_index & size) && (done < budget)) {
		/*
		 * make sure we read the CQE after we read the
		 * ownership bit
		 */
		dma_rmb();

		if (unlikely((cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) ==
			     MLX4_CQE_OPCODE_ERROR)) {
			struct mlx4_err_cqe *cqe_err = (struct mlx4_err_cqe *)cqe;

			en_err(priv, "CQE error - vendor syndrome: 0x%x syndrome: 0x%x\n",
			       cqe_err->vendor_err_syndrome,
			       cqe_err->syndrome);
		}

		/* Skip over last polled CQE */
		new_index = be16_to_cpu(cqe->wqe_index) & size_mask;

		do {
			txbbs_skipped += last_nr_txbb;
			ring_index = (ring_index + last_nr_txbb) & size_mask;
			if (ring->tx_info[ring_index].ts_requested)
				timestamp = mlx4_en_get_cqe_ts(cqe);

			/* free next descriptor */
			last_nr_txbb = mlx4_en_free_tx_desc(
					priv, ring, ring_index,
					!!((ring_cons + txbbs_skipped) &
					ring->size), timestamp);

			mlx4_en_stamp_wqe(priv, ring, stamp_index,
					  !!((ring_cons + txbbs_stamp) &
						ring->size));
			stamp_index = ring_index;
			txbbs_stamp = txbbs_skipped;
			packets++;
			bytes += ring->tx_info[ring_index].nr_bytes;
		} while ((++done < budget) && (ring_index != new_index));

		++cons_index;
		index = cons_index & size_mask;
		cqe = mlx4_en_get_cqe(buf, index, priv->cqe_size) + factor;
	}


	/*
	 * To prevent CQ overflow we first update CQ consumer and only then
	 * the ring consumer.
	 */
	mcq->cons_index = cons_index;
	mlx4_cq_set_ci(mcq);
	wmb();

	/* we want to dirty this cache line once */
	ACCESS_ONCE(ring->last_nr_txbb) = last_nr_txbb;
	ACCESS_ONCE(ring->cons) = ring_cons + txbbs_skipped;

	netdev_tx_completed_queue(ring->tx_queue, packets, bytes);

	/*
	 * Wake up the Tx queue if it was stopped and at least one
	 * packet was completed
	 */
	if (netif_tx_queue_stopped(ring->tx_queue) && txbbs_skipped > 0) {
		netif_tx_wake_queue(ring->tx_queue);
		ring->wake_queue++;
	}
	return done < budget;
}

void mlx4_en_tx_irq(struct mlx4_cq *mcq)
{
	struct mlx4_en_cq *cq = container_of(mcq, struct mlx4_en_cq, mcq);
	struct mlx4_en_priv *priv = netdev_priv(cq->dev);

	if (likely(priv->port_up))
		napi_schedule_irqoff(&cq->napi);
	else
		mlx4_en_arm_cq(priv, cq);
}

/* TX CQ polling - called by NAPI */
int mlx4_en_poll_tx_cq(struct napi_struct *napi, int budget)
{
	struct mlx4_en_cq *cq = container_of(napi, struct mlx4_en_cq, napi);
	struct net_device *dev = cq->dev;
	struct mlx4_en_priv *priv = netdev_priv(dev);
	int clean_complete;

	clean_complete = mlx4_en_process_tx_cq(dev, cq);
	if (!clean_complete)
		return budget;

	napi_complete(napi);
	mlx4_en_arm_cq(priv, cq);

	return 0;
}

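/* Copy a descriptor built in the bounce buffer back into the ring: first
 * the part that wraps to the start of the ring, then the part at the
 * original index, with a barrier at each TXBB boundary.  The first dword
 * (owner_opcode) of the first TXBB is skipped here; ownership is set
 * separately once the rest of the descriptor is in place.
 */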
static struct mlx4_en_tx_desc *mlx4_en_bounce_to_desc(struct mlx4_en_priv *priv,
						      struct mlx4_en_tx_ring *ring,
						      u32 index,
						      unsigned int desc_size)
{
	u32 copy = (ring->size - index) * TXBB_SIZE;
	int i;

	for (i = desc_size - copy - 4; i >= 0; i -= 4) {
		if ((i & (TXBB_SIZE - 1)) == 0)
			wmb();

		*((u32 *) (ring->buf + i)) =
			*((u32 *) (ring->bounce_buf + copy + i));
	}

	for (i = copy - 4; i >= 4 ; i -= 4) {
		if ((i & (TXBB_SIZE - 1)) == 0)
			wmb();

		*((u32 *) (ring->buf + index * TXBB_SIZE + i)) =
			*((u32 *) (ring->bounce_buf + i));
	}

	/* Return real descriptor location */
	return ring->buf + index * TXBB_SIZE;
}

/* Decide if skb can be inlined in tx descriptor to avoid dma mapping
 *
 * It seems strange we do not simply use skb_copy_bits().
 * This would allow inlining all skbs iff skb->len <= inline_thold
 *
 * Note that caller already checked skb was not a gso packet
 */
static bool is_inline(int inline_thold, const struct sk_buff *skb,
		      const struct skb_shared_info *shinfo,
		      void **pfrag)
{
	void *ptr;

	if (skb->len > inline_thold || !inline_thold)
		return false;

	if (shinfo->nr_frags == 1) {
		ptr = skb_frag_address_safe(&shinfo->frags[0]);
		if (unlikely(!ptr))
			return false;
		*pfrag = ptr;
		return true;
	}
	if (shinfo->nr_frags)
		return false;
	return true;
}

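/* Inline descriptor size for this skb: if the payload plus the control
 * segment and one inline segment header fit within MLX4_INLINE_ALIGN, a
 * single inline segment is used; otherwise build_inline_wqe() splits the
 * data in two, so a second segment header is accounted for.  The result is
 * rounded up to a multiple of 16 bytes.
 */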
static int inline_size(const struct sk_buff *skb)
{
	if (skb->len + CTRL_SIZE + sizeof(struct mlx4_wqe_inline_seg)
	    <= MLX4_INLINE_ALIGN)
		return ALIGN(skb->len + CTRL_SIZE +
			     sizeof(struct mlx4_wqe_inline_seg), 16);
	else
		return ALIGN(skb->len + CTRL_SIZE + 2 *
			     sizeof(struct mlx4_wqe_inline_seg), 16);
}

static int get_real_size(const struct sk_buff *skb,
			 const struct skb_shared_info *shinfo,
			 struct net_device *dev,
			 int *lso_header_size,
			 bool *inline_ok,
			 void **pfrag)
{
	struct mlx4_en_priv *priv = netdev_priv(dev);
	int real_size;

	if (shinfo->gso_size) {
		*inline_ok = false;
		if (skb->encapsulation)
			*lso_header_size = (skb_inner_transport_header(skb) - skb->data) + inner_tcp_hdrlen(skb);
		else
			*lso_header_size = skb_transport_offset(skb) + tcp_hdrlen(skb);
		real_size = CTRL_SIZE + shinfo->nr_frags * DS_SIZE +
			ALIGN(*lso_header_size + 4, DS_SIZE);
		if (unlikely(*lso_header_size != skb_headlen(skb))) {
			/* We add a segment for the skb linear buffer only if
			 * it contains data */
			if (*lso_header_size < skb_headlen(skb))
				real_size += DS_SIZE;
			else {
				if (netif_msg_tx_err(priv))
					en_warn(priv, "Non-linear headers\n");
				return 0;
			}
		}
	} else {
		*lso_header_size = 0;
		*inline_ok = is_inline(priv->prof->inline_thold, skb,
				       shinfo, pfrag);

		if (*inline_ok)
			real_size = inline_size(skb);
		else
			real_size = CTRL_SIZE +
				    (shinfo->nr_frags + 1) * DS_SIZE;
	}

	return real_size;
}

static void build_inline_wqe(struct mlx4_en_tx_desc *tx_desc,
			     const struct sk_buff *skb,
			     const struct skb_shared_info *shinfo,
			     int real_size, u16 *vlan_tag,
			     int tx_ind, void *fragptr)
{
	struct mlx4_wqe_inline_seg *inl = &tx_desc->inl;
	int spc = MLX4_INLINE_ALIGN - CTRL_SIZE - sizeof *inl;
	unsigned int hlen = skb_headlen(skb);

	if (skb->len <= spc) {
		if (likely(skb->len >= MIN_PKT_LEN)) {
			inl->byte_count = cpu_to_be32(1 << 31 | skb->len);
		} else {
			inl->byte_count = cpu_to_be32(1 << 31 | MIN_PKT_LEN);
			memset(((void *)(inl + 1)) + skb->len, 0,
			       MIN_PKT_LEN - skb->len);
		}
		skb_copy_from_linear_data(skb, inl + 1, hlen);
		if (shinfo->nr_frags)
			memcpy(((void *)(inl + 1)) + hlen, fragptr,
			       skb_frag_size(&shinfo->frags[0]));

	} else {
		inl->byte_count = cpu_to_be32(1 << 31 | spc);
		if (hlen <= spc) {
			skb_copy_from_linear_data(skb, inl + 1, hlen);
			if (hlen < spc) {
				memcpy(((void *)(inl + 1)) + hlen,
				       fragptr, spc - hlen);
				fragptr +=  spc - hlen;
			}
			inl = (void *) (inl + 1) + spc;
			memcpy(((void *)(inl + 1)), fragptr, skb->len - spc);
		} else {
			skb_copy_from_linear_data(skb, inl + 1, spc);
			inl = (void *) (inl + 1) + spc;
			skb_copy_from_linear_data_offset(skb, spc, inl + 1,
							 hlen - spc);
			if (shinfo->nr_frags)
				memcpy(((void *)(inl + 1)) + hlen - spc,
				       fragptr,
				       skb_frag_size(&shinfo->frags[0]));
		}

		dma_wmb();
		inl->byte_count = cpu_to_be32(1 << 31 | (skb->len - spc));
	}
}

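/* Pick a TX queue: when traffic classes are configured the stack hash is
 * used directly; otherwise the VLAN priority selects a band of
 * num_tx_rings_p_up rings and the fallback hash picks a ring within it.
 */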
u16 mlx4_en_select_queue(struct net_device *dev, struct sk_buff *skb,
			 void *accel_priv, select_queue_fallback_t fallback)
{
	struct mlx4_en_priv *priv = netdev_priv(dev);
	u16 rings_p_up = priv->num_tx_rings_p_up;
	u8 up = 0;

	if (dev->num_tc)
		return skb_tx_hash(dev, skb);

	if (skb_vlan_tag_present(skb))
		up = skb_vlan_tag_get(skb) >> VLAN_PRIO_SHIFT;

	return fallback(dev, skb) % rings_p_up + up * rings_p_up;
}

static void mlx4_bf_copy(void __iomem *dst, const void *src,
			 unsigned int bytecnt)
{
	__iowrite64_copy(dst, src, bytecnt / 8);
}

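/* Main transmit path: work out the descriptor size (LSO header, data
 * segments or fully inlined payload), map the fragments, build the
 * control/LSO/data segments (via the bounce buffer when the descriptor
 * would wrap past the end of the ring) and hand ownership to HW, either
 * by a BlueFlame copy of the whole descriptor or a regular doorbell.
 */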
netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev)
{
	struct skb_shared_info *shinfo = skb_shinfo(skb);
	struct mlx4_en_priv *priv = netdev_priv(dev);
	struct device *ddev = priv->ddev;
	struct mlx4_en_tx_ring *ring;
	struct mlx4_en_tx_desc *tx_desc;
	struct mlx4_wqe_data_seg *data;
	struct mlx4_en_tx_info *tx_info;
	int tx_ind = 0;
	int nr_txbb;
	int desc_size;
	int real_size;
	u32 index, bf_index;
	__be32 op_own;
	u16 vlan_tag = 0;
	int i_frag;
	int lso_header_size;
	void *fragptr = NULL;
	bool bounce = false;
	bool send_doorbell;
	bool stop_queue;
	bool inline_ok;
	u32 ring_cons;

	if (!priv->port_up)
		goto tx_drop;

	tx_ind = skb_get_queue_mapping(skb);
	ring = priv->tx_ring[tx_ind];

	/* fetch ring->cons far ahead before needing it to avoid stall */
	ring_cons = ACCESS_ONCE(ring->cons);

	real_size = get_real_size(skb, shinfo, dev, &lso_header_size,
				  &inline_ok, &fragptr);
	if (unlikely(!real_size))
		goto tx_drop;

	/* Align descriptor to TXBB size */
	desc_size = ALIGN(real_size, TXBB_SIZE);
	nr_txbb = desc_size / TXBB_SIZE;
	if (unlikely(nr_txbb > MAX_DESC_TXBBS)) {
		if (netif_msg_tx_err(priv))
			en_warn(priv, "Oversized header or SG list\n");
		goto tx_drop;
	}

	if (skb_vlan_tag_present(skb))
		vlan_tag = skb_vlan_tag_get(skb);


	netdev_txq_bql_enqueue_prefetchw(ring->tx_queue);

	/* Track current inflight packets for performance analysis */
	AVG_PERF_COUNTER(priv->pstats.inflight_avg,
			 (u32)(ring->prod - ring_cons - 1));

	/* Packet is good - grab an index and transmit it */
	index = ring->prod & ring->size_mask;
	bf_index = ring->prod;

	/* See if we have enough space for whole descriptor TXBB for setting
	 * SW ownership on next descriptor; if not, use a bounce buffer. */
	if (likely(index + nr_txbb <= ring->size))
		tx_desc = ring->buf + index * TXBB_SIZE;
	else {
		tx_desc = (struct mlx4_en_tx_desc *) ring->bounce_buf;
		bounce = true;
	}

	/* Save skb in tx_info ring */
	tx_info = &ring->tx_info[index];
	tx_info->skb = skb;
	tx_info->nr_txbb = nr_txbb;

	data = &tx_desc->data;
	if (lso_header_size)
		data = ((void *)&tx_desc->lso + ALIGN(lso_header_size + 4,
						      DS_SIZE));

	/* valid only for none inline segments */
	tx_info->data_offset = (void *)data - (void *)tx_desc;

	tx_info->inl = inline_ok;

	tx_info->linear = (lso_header_size < skb_headlen(skb) &&
			   !inline_ok) ? 1 : 0;

	tx_info->nr_maps = shinfo->nr_frags + tx_info->linear;
	data += tx_info->nr_maps - 1;

	if (!tx_info->inl) {
		dma_addr_t dma = 0;
		u32 byte_count = 0;

		/* Map fragments if any */
		for (i_frag = shinfo->nr_frags - 1; i_frag >= 0; i_frag--) {
			const struct skb_frag_struct *frag;

			frag = &shinfo->frags[i_frag];
			byte_count = skb_frag_size(frag);
			dma = skb_frag_dma_map(ddev, frag,
					       0, byte_count,
					       DMA_TO_DEVICE);
			if (dma_mapping_error(ddev, dma))
				goto tx_drop_unmap;

			data->addr = cpu_to_be64(dma);
			data->lkey = ring->mr_key;
			dma_wmb();
			data->byte_count = cpu_to_be32(byte_count);
			--data;
		}

		/* Map linear part if needed */
		if (tx_info->linear) {
			byte_count = skb_headlen(skb) - lso_header_size;

			dma = dma_map_single(ddev, skb->data +
					     lso_header_size, byte_count,
					     PCI_DMA_TODEVICE);
			if (dma_mapping_error(ddev, dma))
				goto tx_drop_unmap;

			data->addr = cpu_to_be64(dma);
			data->lkey = ring->mr_key;
			dma_wmb();
			data->byte_count = cpu_to_be32(byte_count);
		}
		/* tx completion can avoid cache line miss for common cases */
		tx_info->map0_dma = dma;
		tx_info->map0_byte_count = byte_count;
	}

	/*
	 * For timestamping, set the in-progress flag in skb_shinfo and
	 * record the request in tx_info for the completion path
	 */
	tx_info->ts_requested = 0;
	if (unlikely(ring->hwtstamp_tx_type == HWTSTAMP_TX_ON &&
		     shinfo->tx_flags & SKBTX_HW_TSTAMP)) {
		shinfo->tx_flags |= SKBTX_IN_PROGRESS;
		tx_info->ts_requested = 1;
	}

	/* Prepare ctrl segment apart from opcode+ownership, which depends
	 * on whether LSO is used */
	tx_desc->ctrl.srcrb_flags = priv->ctrl_flags;
	if (likely(skb->ip_summed == CHECKSUM_PARTIAL)) {
		if (!skb->encapsulation)
			tx_desc->ctrl.srcrb_flags |= cpu_to_be32(MLX4_WQE_CTRL_IP_CSUM |
								 MLX4_WQE_CTRL_TCP_UDP_CSUM);
		else
			tx_desc->ctrl.srcrb_flags |= cpu_to_be32(MLX4_WQE_CTRL_IP_CSUM);
		ring->tx_csum++;
	}

	if (priv->flags & MLX4_EN_FLAG_ENABLE_HW_LOOPBACK) {
		struct ethhdr *ethh;

		/* Copy dst mac address to wqe. This allows loopback in eSwitch,
		 * so that VFs and PF can communicate with each other
		 */
		ethh = (struct ethhdr *)skb->data;
		tx_desc->ctrl.srcrb_flags16[0] = get_unaligned((__be16 *)ethh->h_dest);
		tx_desc->ctrl.imm = get_unaligned((__be32 *)(ethh->h_dest + 2));
	}

	/* Handle LSO (TSO) packets */
	if (lso_header_size) {
		int i;

		/* Mark opcode as LSO */
		op_own = cpu_to_be32(MLX4_OPCODE_LSO | (1 << 6)) |
			((ring->prod & ring->size) ?
				cpu_to_be32(MLX4_EN_BIT_DESC_OWN) : 0);

		/* Fill in the LSO prefix */
		tx_desc->lso.mss_hdr_size = cpu_to_be32(
			shinfo->gso_size << 16 | lso_header_size);

		/* Copy headers;
		 * note that we already verified that it is linear */
		memcpy(tx_desc->lso.header, skb->data, lso_header_size);

		ring->tso_packets++;

		i = ((skb->len - lso_header_size) / shinfo->gso_size) +
			!!((skb->len - lso_header_size) % shinfo->gso_size);
		tx_info->nr_bytes = skb->len + (i - 1) * lso_header_size;
		ring->packets += i;
	} else {
		/* Normal (Non LSO) packet */
		op_own = cpu_to_be32(MLX4_OPCODE_SEND) |
			((ring->prod & ring->size) ?
			 cpu_to_be32(MLX4_EN_BIT_DESC_OWN) : 0);
		tx_info->nr_bytes = max_t(unsigned int, skb->len, ETH_ZLEN);
		ring->packets++;
	}
	ring->bytes += tx_info->nr_bytes;
	netdev_tx_sent_queue(ring->tx_queue, tx_info->nr_bytes);
	AVG_PERF_COUNTER(priv->pstats.tx_pktsz_avg, skb->len);

	if (tx_info->inl)
		build_inline_wqe(tx_desc, skb, shinfo, real_size, &vlan_tag,
				 tx_ind, fragptr);

	if (skb->encapsulation) {
		struct iphdr *ipv4 = (struct iphdr *)skb_inner_network_header(skb);
		if (ipv4->protocol == IPPROTO_TCP || ipv4->protocol == IPPROTO_UDP)
			op_own |= cpu_to_be32(MLX4_WQE_CTRL_IIP | MLX4_WQE_CTRL_ILP);
		else
			op_own |= cpu_to_be32(MLX4_WQE_CTRL_IIP);
	}

	ring->prod += nr_txbb;

	/* If we used a bounce buffer then copy descriptor back into place */
	if (unlikely(bounce))
		tx_desc = mlx4_en_bounce_to_desc(priv, ring, index, desc_size);

	skb_tx_timestamp(skb);

	/* Check available TXBBs and 2K spare for prefetch */
	stop_queue = (int)(ring->prod - ring_cons) >
		      ring->size - HEADROOM - MAX_DESC_TXBBS;
	if (unlikely(stop_queue)) {
		netif_tx_stop_queue(ring->tx_queue);
		ring->queue_stopped++;
	}
	send_doorbell = !skb->xmit_more || netif_xmit_stopped(ring->tx_queue);

	real_size = (real_size / 16) & 0x3f;

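	/* Two ways to ring the doorbell: for a small descriptor with no VLAN
	 * tag, BlueFlame copies the whole descriptor into the dedicated UAR
	 * region so the HW need not fetch the WQE from memory; otherwise
	 * ownership is set in host memory and a single write of doorbell_qpn
	 * kicks the send queue.
	 */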
	if (ring->bf_enabled && desc_size <= MAX_BF && !bounce &&
	    !skb_vlan_tag_present(skb) && send_doorbell) {
		tx_desc->ctrl.bf_qpn = ring->doorbell_qpn |
				       cpu_to_be32(real_size);

		op_own |= htonl((bf_index & 0xffff) << 8);
		/* Ensure new descriptor hits memory
		 * before setting ownership of this descriptor to HW
		 */
		dma_wmb();
		tx_desc->ctrl.owner_opcode = op_own;

		wmb();

		mlx4_bf_copy(ring->bf.reg + ring->bf.offset, &tx_desc->ctrl,
			     desc_size);

		wmb();

		ring->bf.offset ^= ring->bf.buf_size;
	} else {
		tx_desc->ctrl.vlan_tag = cpu_to_be16(vlan_tag);
		tx_desc->ctrl.ins_vlan = MLX4_WQE_CTRL_INS_VLAN *
			!!skb_vlan_tag_present(skb);
		tx_desc->ctrl.fence_size = real_size;

		/* Ensure new descriptor hits memory
		 * before setting ownership of this descriptor to HW
		 */
		dma_wmb();
		tx_desc->ctrl.owner_opcode = op_own;
		if (send_doorbell) {
			wmb();
			/* There is no iowrite*_native() that writes the
			 * value as-is, without byteswapping, so use the
			 * variant that does not byteswap for this
			 * architecture's endianness.
			 */
#if defined(__LITTLE_ENDIAN)
			iowrite32(
#else
			iowrite32be(
#endif
				  ring->doorbell_qpn,
				  ring->bf.uar->map + MLX4_SEND_DOORBELL);
		} else {
			ring->xmit_more++;
		}
	}

	if (unlikely(stop_queue)) {
		/* If queue was emptied after the if (stop_queue), and before
		 * the netif_tx_stop_queue() - need to wake the queue,
		 * or else it will remain stopped forever.
		 * Need a memory barrier to make sure ring->cons was not
		 * updated before queue was stopped.
		 */
		smp_rmb();

		ring_cons = ACCESS_ONCE(ring->cons);
		if (unlikely(((int)(ring->prod - ring_cons)) <=
			     ring->size - HEADROOM - MAX_DESC_TXBBS)) {
			netif_tx_wake_queue(ring->tx_queue);
			ring->wake_queue++;
		}
	}
	return NETDEV_TX_OK;

tx_drop_unmap:
	en_err(priv, "DMA mapping error\n");

	while (++i_frag < shinfo->nr_frags) {
		++data;
		dma_unmap_page(ddev, (dma_addr_t) be64_to_cpu(data->addr),
			       be32_to_cpu(data->byte_count),
			       PCI_DMA_TODEVICE);
	}

tx_drop:
	dev_kfree_skb_any(skb);
	priv->stats.tx_dropped++;
	return NETDEV_TX_OK;
}