/* mlx4_en TX path: TX ring creation/teardown, transmit (mlx4_en_xmit)
 * and completion (CQ) processing for the Mellanox ConnectX Ethernet driver.
 */
/*
 * Copyright (c) 2007 Mellanox Technologies. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 */

#include <asm/page.h>

#include <linux/if_vlan.h>
#include <linux/ip.h>
#include <linux/mlx4/cq.h>
#include <linux/mlx4/qp.h>
#include <linux/moduleparam.h>
#include <linux/prefetch.h>
#include <linux/skbuff.h>
#include <linux/slab.h>
#include <linux/tcp.h>
#include <linux/vmalloc.h>

#include "mlx4_en.h"

int mlx4_en_create_tx_ring(struct mlx4_en_priv *priv,
49
			   struct mlx4_en_tx_ring **pring, int qpn, u32 size,
50
			   u16 stride, int node, int queue_index)
51 52
{
	struct mlx4_en_dev *mdev = priv->mdev;
53
	struct mlx4_en_tx_ring *ring;
54 55 56
	int tmp;
	int err;

57
	ring = kzalloc_node(sizeof(*ring), GFP_KERNEL, node);
58
	if (!ring) {
59 60 61 62 63
		ring = kzalloc(sizeof(*ring), GFP_KERNEL);
		if (!ring) {
			en_err(priv, "Failed allocating TX ring\n");
			return -ENOMEM;
		}
64 65
	}

66 67 68 69 70
	ring->size = size;
	ring->size_mask = size - 1;
	ring->stride = stride;

	tmp = size * sizeof(struct mlx4_en_tx_info);
71
	ring->tx_info = kmalloc_node(tmp, GFP_KERNEL | __GFP_NOWARN, node);
72
	if (!ring->tx_info) {
73 74 75 76 77
		ring->tx_info = vmalloc(tmp);
		if (!ring->tx_info) {
			err = -ENOMEM;
			goto err_ring;
		}
78
	}
79

80
	en_dbg(DRV, priv, "Allocated tx_info ring at addr:%p size:%d\n",
81 82
		 ring->tx_info, tmp);

83
	ring->bounce_buf = kmalloc_node(MAX_DESC_SIZE, GFP_KERNEL, node);
84
	if (!ring->bounce_buf) {
85 86 87 88 89
		ring->bounce_buf = kmalloc(MAX_DESC_SIZE, GFP_KERNEL);
		if (!ring->bounce_buf) {
			err = -ENOMEM;
			goto err_info;
		}
90 91 92
	}
	ring->buf_size = ALIGN(size * ring->stride, MLX4_EN_PAGE_SIZE);

93 94
	/* Allocate HW buffers on provided NUMA node */
	set_dev_node(&mdev->dev->pdev->dev, node);
95 96
	err = mlx4_alloc_hwq_res(mdev->dev, &ring->wqres, ring->buf_size,
				 2 * PAGE_SIZE);
97
	set_dev_node(&mdev->dev->pdev->dev, mdev->dev->numa_node);
98
	if (err) {
99
		en_err(priv, "Failed allocating hwq resources\n");
100 101 102 103 104
		goto err_bounce;
	}

	err = mlx4_en_map_buffer(&ring->wqres.buf);
	if (err) {
105
		en_err(priv, "Failed to map TX buffer\n");
106 107 108 109 110
		goto err_hwq_res;
	}

	ring->buf = ring->wqres.buf.direct.buf;

J
Joe Perches 已提交
111 112 113
	en_dbg(DRV, priv, "Allocated TX ring (addr:%p) - buf:%p size:%d buf_size:%d dma:%llx\n",
	       ring, ring->buf, ring->size, ring->buf_size,
	       (unsigned long long) ring->wqres.buf.direct.map);
114

115
	ring->qpn = qpn;
116
	err = mlx4_qp_alloc(mdev->dev, ring->qpn, &ring->qp, GFP_KERNEL);
117
	if (err) {
118
		en_err(priv, "Failed allocating qp %d\n", ring->qpn);
119
		goto err_map;
120
	}
121
	ring->qp.event = mlx4_en_sqp_event;
122

123
	err = mlx4_bf_alloc(mdev->dev, &ring->bf, node);
124
	if (err) {
J
Joe Perches 已提交
125
		en_dbg(DRV, priv, "working without blueflame (%d)\n", err);
126 127 128
		ring->bf.uar = &mdev->priv_uar;
		ring->bf.uar->map = mdev->uar_map;
		ring->bf_enabled = false;
129 130 131 132 133 134 135
		ring->bf_alloced = false;
		priv->pflags &= ~MLX4_EN_PRIV_FLAGS_BLUEFLAME;
	} else {
		ring->bf_alloced = true;
		ring->bf_enabled = !!(priv->pflags &
				      MLX4_EN_PRIV_FLAGS_BLUEFLAME);
	}
136

137
	ring->hwtstamp_tx_type = priv->hwtstamp_config.tx_type;
138 139 140 141
	ring->queue_index = queue_index;

	if (queue_index < priv->num_tx_rings_p_up && cpu_online(queue_index))
		cpumask_set_cpu(queue_index, &ring->affinity_mask);
142

143
	*pring = ring;
144 145 146 147 148 149 150 151 152
	return 0;

err_map:
	mlx4_en_unmap_buffer(&ring->wqres.buf);
err_hwq_res:
	mlx4_free_hwq_res(mdev->dev, &ring->wqres, ring->buf_size);
err_bounce:
	kfree(ring->bounce_buf);
	ring->bounce_buf = NULL;
153
err_info:
154
	kvfree(ring->tx_info);
155
	ring->tx_info = NULL;
156 157 158
err_ring:
	kfree(ring);
	*pring = NULL;
159 160 161 162
	return err;
}

void mlx4_en_destroy_tx_ring(struct mlx4_en_priv *priv,
163
			     struct mlx4_en_tx_ring **pring)
164 165
{
	struct mlx4_en_dev *mdev = priv->mdev;
166
	struct mlx4_en_tx_ring *ring = *pring;
167
	en_dbg(DRV, priv, "Destroying tx ring, qpn: %d\n", ring->qpn);
168

169
	if (ring->bf_alloced)
170
		mlx4_bf_free(mdev->dev, &ring->bf);
171 172 173 174 175 176
	mlx4_qp_remove(mdev->dev, &ring->qp);
	mlx4_qp_free(mdev->dev, &ring->qp);
	mlx4_en_unmap_buffer(&ring->wqres.buf);
	mlx4_free_hwq_res(mdev->dev, &ring->wqres, ring->buf_size);
	kfree(ring->bounce_buf);
	ring->bounce_buf = NULL;
177
	kvfree(ring->tx_info);
178
	ring->tx_info = NULL;
179 180
	kfree(ring);
	*pring = NULL;
181 182 183 184
}

int mlx4_en_activate_tx_ring(struct mlx4_en_priv *priv,
			     struct mlx4_en_tx_ring *ring,
185
			     int cq, int user_prio)
186 187 188 189 190 191 192 193 194 195 196 197
{
	struct mlx4_en_dev *mdev = priv->mdev;
	int err;

	ring->cqn = cq;
	ring->prod = 0;
	ring->cons = 0xffffffff;
	ring->last_nr_txbb = 1;
	memset(ring->tx_info, 0, ring->size * sizeof(struct mlx4_en_tx_info));
	memset(ring->buf, 0, ring->buf_size);

	ring->qp_state = MLX4_QP_STATE_RST;
198 199
	ring->doorbell_qpn = cpu_to_be32(ring->qp.qpn << 8);
	ring->mr_key = cpu_to_be32(mdev->mr.key);
200 201

	mlx4_en_fill_qp_context(priv, ring->size, ring->stride, 1, 0, ring->qpn,
202
				ring->cqn, user_prio, &ring->context);
203
	if (ring->bf_alloced)
204
		ring->context.usr_page = cpu_to_be32(ring->bf.uar->index);
205 206 207

	err = mlx4_qp_to_ready(mdev->dev, &ring->wqres.mtt, &ring->context,
			       &ring->qp, &ring->qp_state);
208 209 210
	if (!user_prio && cpu_online(ring->queue_index))
		netif_set_xps_queue(priv->dev, &ring->affinity_mask,
				    ring->queue_index);
211 212 213 214 215 216 217 218 219 220 221 222 223

	return err;
}

/* Quiesce a TX ring by moving its QP from the current state back to RESET. */
void mlx4_en_deactivate_tx_ring(struct mlx4_en_priv *priv,
				struct mlx4_en_tx_ring *ring)
{
	struct mlx4_en_dev *mdev = priv->mdev;

	mlx4_qp_modify(mdev->dev, NULL, ring->qp_state, MLX4_QP_STATE_RST,
		       NULL, 0, 0, &ring->qp);
}

224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256
/* Overwrite every STAMP_STRIDE bytes of a freed descriptor's TXBBs with
 * STAMP_VAL plus the current SW ownership bit (@owner), so stale data in
 * reused TXBBs is not misread on the next pass around the ring.
 *
 * @index: first TXBB of the descriptor to stamp
 * @owner: ownership bit for the ring pass containing @index
 */
static void mlx4_en_stamp_wqe(struct mlx4_en_priv *priv,
			      struct mlx4_en_tx_ring *ring, int index,
			      u8 owner)
{
	__be32 stamp = cpu_to_be32(STAMP_VAL | (!!owner << STAMP_SHIFT));
	struct mlx4_en_tx_desc *tx_desc = ring->buf + index * TXBB_SIZE;
	struct mlx4_en_tx_info *tx_info = &ring->tx_info[index];
	void *end = ring->buf + ring->buf_size;
	__be32 *ptr = (__be32 *)tx_desc;
	int i;

	/* Optimize the common case when there are no wraparounds */
	if (likely((void *)tx_desc + tx_info->nr_txbb * TXBB_SIZE <= end)) {
		/* Stamp the freed descriptor */
		for (i = 0; i < tx_info->nr_txbb * TXBB_SIZE;
		     i += STAMP_STRIDE) {
			*ptr = stamp;
			ptr += STAMP_DWORDS;
		}
	} else {
		/* Stamp the freed descriptor */
		for (i = 0; i < tx_info->nr_txbb * TXBB_SIZE;
		     i += STAMP_STRIDE) {
			*ptr = stamp;
			ptr += STAMP_DWORDS;
			if ((void *)ptr >= end) {
				ptr = ring->buf;
				/* wrapped to ring start: the ownership bit
				 * flips for the new pass */
				stamp ^= cpu_to_be32(0x80000000);
			}
		}
	}
}

257 258 259

/* Release one completed TX descriptor: deliver the HW timestamp if one
 * was requested, unmap all DMA mappings (unless the packet was inlined),
 * and consume the skb.
 *
 * @index:     first TXBB of the descriptor
 * @owner:     SW ownership bit for this slot (unused here, kept for the
 *             call signature shared with the completion path)
 * @timestamp: raw HW timestamp, or 0 when none was requested
 *
 * Returns the number of TXBBs the descriptor occupied, so the caller can
 * advance the consumer index.
 */
static u32 mlx4_en_free_tx_desc(struct mlx4_en_priv *priv,
				struct mlx4_en_tx_ring *ring,
				int index, u8 owner, u64 timestamp)
{
	struct mlx4_en_tx_info *tx_info = &ring->tx_info[index];
	struct mlx4_en_tx_desc *tx_desc = ring->buf + index * TXBB_SIZE;
	struct mlx4_wqe_data_seg *data = (void *) tx_desc + tx_info->data_offset;
	void *end = ring->buf + ring->buf_size;
	struct sk_buff *skb = tx_info->skb;
	int nr_maps = tx_info->nr_maps;
	int i;

	/* We do not touch skb here, so prefetch skb->users location
	 * to speedup consume_skb()
	 */
	prefetchw(&skb->users);

	if (unlikely(timestamp)) {
		struct skb_shared_hwtstamps hwts;

		mlx4_en_fill_hwtstamps(priv->mdev, &hwts, timestamp);
		skb_tstamp_tx(skb, &hwts);
	}

	/* Optimize the common case when there are no wraparounds */
	if (likely((void *) tx_desc + tx_info->nr_txbb * TXBB_SIZE <= end)) {
		if (!tx_info->inl) {
			/* First mapping: linear part (dma_map_single) or
			 * first frag (dma_map_page); cached in map0_* to
			 * avoid re-reading the descriptor.
			 */
			if (tx_info->linear)
				dma_unmap_single(priv->ddev,
						tx_info->map0_dma,
						tx_info->map0_byte_count,
						PCI_DMA_TODEVICE);
			else
				dma_unmap_page(priv->ddev,
					       tx_info->map0_dma,
					       tx_info->map0_byte_count,
					       PCI_DMA_TODEVICE);
			for (i = 1; i < nr_maps; i++) {
				data++;
				dma_unmap_page(priv->ddev,
					(dma_addr_t)be64_to_cpu(data->addr),
					be32_to_cpu(data->byte_count),
					PCI_DMA_TODEVICE);
			}
		}
	} else {
		if (!tx_info->inl) {
			/* Descriptor wraps past the ring end; translate the
			 * data pointer back into the ring before walking it.
			 */
			if ((void *) data >= end) {
				data = ring->buf + ((void *)data - end);
			}

			if (tx_info->linear)
				dma_unmap_single(priv->ddev,
						tx_info->map0_dma,
						tx_info->map0_byte_count,
						PCI_DMA_TODEVICE);
			else
				dma_unmap_page(priv->ddev,
					       tx_info->map0_dma,
					       tx_info->map0_byte_count,
					       PCI_DMA_TODEVICE);
			for (i = 1; i < nr_maps; i++) {
				data++;
				/* Check for wraparound before unmapping */
				if ((void *) data >= end)
					data = ring->buf;
				dma_unmap_page(priv->ddev,
					(dma_addr_t)be64_to_cpu(data->addr),
					be32_to_cpu(data->byte_count),
					PCI_DMA_TODEVICE);
			}
		}
	}
	dev_consume_skb_any(skb);
	return tx_info->nr_txbb;
}


int mlx4_en_free_tx_buf(struct net_device *dev, struct mlx4_en_tx_ring *ring)
{
	struct mlx4_en_priv *priv = netdev_priv(dev);
	int cnt = 0;

	/* Skip last polled descriptor */
	ring->cons += ring->last_nr_txbb;
343
	en_dbg(DRV, priv, "Freeing Tx buf - cons:0x%x prod:0x%x\n",
344 345 346 347
		 ring->cons, ring->prod);

	if ((u32) (ring->prod - ring->cons) > ring->size) {
		if (netif_msg_tx_err(priv))
348
			en_warn(priv, "Tx consumer passed producer!\n");
349 350 351 352 353 354
		return 0;
	}

	while (ring->cons != ring->prod) {
		ring->last_nr_txbb = mlx4_en_free_tx_desc(priv, ring,
						ring->cons & ring->size_mask,
355
						!!(ring->cons & ring->size), 0);
356 357 358 359
		ring->cons += ring->last_nr_txbb;
		cnt++;
	}

360 361
	netdev_tx_reset_queue(ring->tx_queue);

362
	if (cnt)
363
		en_dbg(DRV, priv, "Freed %d uncompleted tx descriptors\n", cnt);
364 365 366 367

	return cnt;
}

368 369
static bool mlx4_en_process_tx_cq(struct net_device *dev,
				 struct mlx4_en_cq *cq)
370 371 372
{
	struct mlx4_en_priv *priv = netdev_priv(dev);
	struct mlx4_cq *mcq = &cq->mcq;
373
	struct mlx4_en_tx_ring *ring = priv->tx_ring[cq->ring];
374
	struct mlx4_cqe *cqe;
375
	u16 index;
376
	u16 new_index, ring_index, stamp_index;
377
	u32 txbbs_skipped = 0;
378
	u32 txbbs_stamp = 0;
379 380 381 382
	u32 cons_index = mcq->cons_index;
	int size = cq->size;
	u32 size_mask = ring->size_mask;
	struct mlx4_cqe *buf = cq->buf;
383 384
	u32 packets = 0;
	u32 bytes = 0;
O
Or Gerlitz 已提交
385
	int factor = priv->cqe_factor;
386
	u64 timestamp = 0;
387
	int done = 0;
388
	int budget = priv->tx_work_limit;
389 390
	u32 last_nr_txbb;
	u32 ring_cons;
391 392

	if (!priv->port_up)
393
		return true;
394

395 396
	netdev_txq_bql_complete_prefetchw(ring->tx_queue);

397
	index = cons_index & size_mask;
398
	cqe = mlx4_en_get_cqe(buf, index, priv->cqe_size) + factor;
399 400 401
	last_nr_txbb = ACCESS_ONCE(ring->last_nr_txbb);
	ring_cons = ACCESS_ONCE(ring->cons);
	ring_index = ring_cons & size_mask;
402
	stamp_index = ring_index;
403 404 405

	/* Process all completed CQEs */
	while (XNOR(cqe->owner_sr_opcode & MLX4_CQE_OWNER_MASK,
406
			cons_index & size) && (done < budget)) {
407 408 409 410 411 412
		/*
		 * make sure we read the CQE after we read the
		 * ownership bit
		 */
		rmb();

413 414 415 416 417 418 419 420 421
		if (unlikely((cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) ==
			     MLX4_CQE_OPCODE_ERROR)) {
			struct mlx4_err_cqe *cqe_err = (struct mlx4_err_cqe *)cqe;

			en_err(priv, "CQE error - vendor syndrome: 0x%x syndrome: 0x%x\n",
			       cqe_err->vendor_err_syndrome,
			       cqe_err->syndrome);
		}

422 423 424
		/* Skip over last polled CQE */
		new_index = be16_to_cpu(cqe->wqe_index) & size_mask;

425
		do {
426 427
			txbbs_skipped += last_nr_txbb;
			ring_index = (ring_index + last_nr_txbb) & size_mask;
428 429 430
			if (ring->tx_info[ring_index].ts_requested)
				timestamp = mlx4_en_get_cqe_ts(cqe);

431
			/* free next descriptor */
432
			last_nr_txbb = mlx4_en_free_tx_desc(
433
					priv, ring, ring_index,
434
					!!((ring_cons + txbbs_skipped) &
435
					ring->size), timestamp);
436 437

			mlx4_en_stamp_wqe(priv, ring, stamp_index,
438
					  !!((ring_cons + txbbs_stamp) &
439 440 441
						ring->size));
			stamp_index = ring_index;
			txbbs_stamp = txbbs_skipped;
442 443
			packets++;
			bytes += ring->tx_info[ring_index].nr_bytes;
444
		} while ((++done < budget) && (ring_index != new_index));
445 446 447

		++cons_index;
		index = cons_index & size_mask;
448
		cqe = mlx4_en_get_cqe(buf, index, priv->cqe_size) + factor;
449
	}
450 451 452 453 454 455


	/*
	 * To prevent CQ overflow we first update CQ consumer and only then
	 * the ring consumer.
	 */
456
	mcq->cons_index = cons_index;
457 458
	mlx4_cq_set_ci(mcq);
	wmb();
459 460 461 462 463

	/* we want to dirty this cache line once */
	ACCESS_ONCE(ring->last_nr_txbb) = last_nr_txbb;
	ACCESS_ONCE(ring->cons) = ring_cons + txbbs_skipped;

464
	netdev_tx_completed_queue(ring->tx_queue, packets, bytes);
465

466 467 468 469 470 471
	/*
	 * Wakeup Tx queue if this stopped, and at least 1 packet
	 * was completed
	 */
	if (netif_tx_queue_stopped(ring->tx_queue) && txbbs_skipped > 0) {
		netif_tx_wake_queue(ring->tx_queue);
472
		ring->wake_queue++;
473
	}
474
	return done < budget;
475 476 477 478 479 480 481
}

void mlx4_en_tx_irq(struct mlx4_cq *mcq)
{
	struct mlx4_en_cq *cq = container_of(mcq, struct mlx4_en_cq, mcq);
	struct mlx4_en_priv *priv = netdev_priv(cq->dev);

482 483 484 485
	if (priv->port_up)
		napi_schedule(&cq->napi);
	else
		mlx4_en_arm_cq(priv, cq);
486 487
}

488 489 490 491 492 493
/* TX CQ polling - called by NAPI.
 *
 * Returns the full @budget while completions remain (keeps NAPI polling),
 * or 0 once the CQ is drained, after completing NAPI and re-arming the CQ.
 */
int mlx4_en_poll_tx_cq(struct napi_struct *napi, int budget)
{
	struct mlx4_en_cq *cq = container_of(napi, struct mlx4_en_cq, napi);
	struct net_device *dev = cq->dev;
	struct mlx4_en_priv *priv = netdev_priv(dev);
	int clean_complete;

	clean_complete = mlx4_en_process_tx_cq(dev, cq);
	if (!clean_complete)
		return budget;

	napi_complete(napi);
	mlx4_en_arm_cq(priv, cq);

	return 0;
}
505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533

/* Copy a descriptor that was built in ring->bounce_buf (because it would
 * have wrapped past the end of the ring) into its real location: the
 * wrapped tail goes to the start of the ring buffer, the head to
 * index * TXBB_SIZE.  Both copies run backwards, dword by dword, with a
 * wmb() at each TXBB boundary; the head copy stops at i == 4 so the first
 * dword (which holds owner_opcode, written later by the caller) is left
 * untouched.
 *
 * Returns the descriptor's real address inside the ring buffer.
 */
static struct mlx4_en_tx_desc *mlx4_en_bounce_to_desc(struct mlx4_en_priv *priv,
						      struct mlx4_en_tx_ring *ring,
						      u32 index,
						      unsigned int desc_size)
{
	/* Bytes of the descriptor that fit before the ring end */
	u32 copy = (ring->size - index) * TXBB_SIZE;
	int i;

	/* Tail portion that wrapped: lands at the start of the ring buffer */
	for (i = desc_size - copy - 4; i >= 0; i -= 4) {
		if ((i & (TXBB_SIZE - 1)) == 0)
			wmb();

		*((u32 *) (ring->buf + i)) =
			*((u32 *) (ring->bounce_buf + copy + i));
	}

	/* Head portion; i >= 4 skips the owner_opcode dword */
	for (i = copy - 4; i >= 4 ; i -= 4) {
		if ((i & (TXBB_SIZE - 1)) == 0)
			wmb();

		*((u32 *) (ring->buf + index * TXBB_SIZE + i)) =
			*((u32 *) (ring->bounce_buf + i));
	}

	/* Return real descriptor location */
	return ring->buf + index * TXBB_SIZE;
}

534 535 536 537 538 539 540
/* Decide if skb can be inlined in tx descriptor to avoid dma mapping
 *
 * It seems strange we do not simply use skb_copy_bits().
 * This would allow to inline all skbs iff skb->len <= inline_thold
 *
 * Note that caller already checked skb was not a gso packet
 */
541
static bool is_inline(int inline_thold, const struct sk_buff *skb,
542
		      const struct skb_shared_info *shinfo,
543
		      void **pfrag)
544 545 546
{
	void *ptr;

547 548
	if (skb->len > inline_thold || !inline_thold)
		return false;
549

550 551 552 553 554 555
	if (shinfo->nr_frags == 1) {
		ptr = skb_frag_address_safe(&shinfo->frags[0]);
		if (unlikely(!ptr))
			return false;
		*pfrag = ptr;
		return true;
556
	}
557 558 559
	if (shinfo->nr_frags)
		return false;
	return true;
560 561
}

562
static int inline_size(const struct sk_buff *skb)
563 564 565 566 567 568 569 570 571 572
{
	if (skb->len + CTRL_SIZE + sizeof(struct mlx4_wqe_inline_seg)
	    <= MLX4_INLINE_ALIGN)
		return ALIGN(skb->len + CTRL_SIZE +
			     sizeof(struct mlx4_wqe_inline_seg), 16);
	else
		return ALIGN(skb->len + CTRL_SIZE + 2 *
			     sizeof(struct mlx4_wqe_inline_seg), 16);
}

573
/* Compute the exact TX descriptor size (in bytes) needed for @skb.
 *
 * Outputs:
 *   *lso_header_size - LSO header length (0 for non-GSO packets)
 *   *inline_ok       - true when the packet will be inlined
 *   *pfrag           - address of the single inlinable fragment, if any
 *
 * Returns the descriptor size, or 0 if the packet cannot be sent
 * (GSO headers not contained in the linear part).
 */
static int get_real_size(const struct sk_buff *skb,
			 const struct skb_shared_info *shinfo,
			 struct net_device *dev,
			 int *lso_header_size,
			 bool *inline_ok,
			 void **pfrag)
{
	struct mlx4_en_priv *priv = netdev_priv(dev);
	int real_size;

	if (shinfo->gso_size) {
		*inline_ok = false;
		/* Tunneled GSO: header extends to the inner TCP header */
		if (skb->encapsulation)
			*lso_header_size = (skb_inner_transport_header(skb) - skb->data) + inner_tcp_hdrlen(skb);
		else
			*lso_header_size = skb_transport_offset(skb) + tcp_hdrlen(skb);
		real_size = CTRL_SIZE + shinfo->nr_frags * DS_SIZE +
			ALIGN(*lso_header_size + 4, DS_SIZE);
		if (unlikely(*lso_header_size != skb_headlen(skb))) {
			/* We add a segment for the skb linear buffer only if
			 * it contains data */
			if (*lso_header_size < skb_headlen(skb))
				real_size += DS_SIZE;
			else {
				if (netif_msg_tx_err(priv))
					en_warn(priv, "Non-linear headers\n");
				return 0;
			}
		}
	} else {
		*lso_header_size = 0;
		*inline_ok = is_inline(priv->prof->inline_thold, skb,
				       shinfo, pfrag);

		if (*inline_ok)
			real_size = inline_size(skb);
		else
			/* One data segment per frag plus the linear part */
			real_size = CTRL_SIZE +
				    (shinfo->nr_frags + 1) * DS_SIZE;
	}

	return real_size;
}

617 618
/* Copy an inlinable packet directly into the descriptor's inline
 * segment(s).  Packets up to @spc bytes fit in one segment; larger ones
 * are split across two, and the second segment's byte_count is written
 * last (after a wmb()) since the top bit of byte_count marks the inline
 * data as valid.  Packets shorter than MIN_PKT_LEN are zero-padded.
 */
static void build_inline_wqe(struct mlx4_en_tx_desc *tx_desc,
			     const struct sk_buff *skb,
			     const struct skb_shared_info *shinfo,
			     int real_size, u16 *vlan_tag,
			     int tx_ind, void *fragptr)
{
	struct mlx4_wqe_inline_seg *inl = &tx_desc->inl;
	int spc = MLX4_INLINE_ALIGN - CTRL_SIZE - sizeof *inl;
	unsigned int hlen = skb_headlen(skb);

	if (skb->len <= spc) {
		/* Whole packet fits in a single inline segment */
		if (likely(skb->len >= MIN_PKT_LEN)) {
			inl->byte_count = cpu_to_be32(1 << 31 | skb->len);
		} else {
			inl->byte_count = cpu_to_be32(1 << 31 | MIN_PKT_LEN);
			memset(((void *)(inl + 1)) + skb->len, 0,
			       MIN_PKT_LEN - skb->len);
		}
		skb_copy_from_linear_data(skb, inl + 1, hlen);
		if (shinfo->nr_frags)
			memcpy(((void *)(inl + 1)) + hlen, fragptr,
			       skb_frag_size(&shinfo->frags[0]));

	} else {
		/* Split across two inline segments of spc bytes + remainder */
		inl->byte_count = cpu_to_be32(1 << 31 | spc);
		if (hlen <= spc) {
			skb_copy_from_linear_data(skb, inl + 1, hlen);
			if (hlen < spc) {
				memcpy(((void *)(inl + 1)) + hlen,
				       fragptr, spc - hlen);
				fragptr +=  spc - hlen;
			}
			inl = (void *) (inl + 1) + spc;
			memcpy(((void *)(inl + 1)), fragptr, skb->len - spc);
		} else {
			skb_copy_from_linear_data(skb, inl + 1, spc);
			inl = (void *) (inl + 1) + spc;
			skb_copy_from_linear_data_offset(skb, spc, inl + 1,
							 hlen - spc);
			if (shinfo->nr_frags)
				memcpy(((void *)(inl + 1)) + hlen - spc,
				       fragptr,
				       skb_frag_size(&shinfo->frags[0]));
		}

		/* Data must be visible before the second byte_count is set */
		wmb();
		inl->byte_count = cpu_to_be32(1 << 31 | (skb->len - spc));
	}
}

667
/* ndo_select_queue: pick a TX queue for @skb.  With traffic classes
 * configured, defer to skb_tx_hash(); otherwise hash into the per-UP
 * ring group selected by the VLAN priority bits (UP 0 when untagged).
 */
u16 mlx4_en_select_queue(struct net_device *dev, struct sk_buff *skb,
			 void *accel_priv, select_queue_fallback_t fallback)
{
	struct mlx4_en_priv *priv = netdev_priv(dev);
	u16 rings_p_up = priv->num_tx_rings_p_up;
	u8 up = 0;

	if (dev->num_tc)
		return skb_tx_hash(dev, skb);

	if (vlan_tx_tag_present(skb))
		up = vlan_tx_tag_get(skb) >> VLAN_PRIO_SHIFT;

	return fallback(dev, skb) % rings_p_up + up * rings_p_up;
}

683 684
/* Copy @bytecnt bytes (must be a multiple of 8) of descriptor data into
 * the blueflame register using 64-bit MMIO writes.
 */
static void mlx4_bf_copy(void __iomem *dst, const void *src,
			 unsigned int bytecnt)
{
	__iowrite64_copy(dst, src, bytecnt / 8);
}

689
netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev)
690
{
691
	struct skb_shared_info *shinfo = skb_shinfo(skb);
692
	struct mlx4_en_priv *priv = netdev_priv(dev);
693
	struct device *ddev = priv->ddev;
694 695 696 697 698 699 700 701
	struct mlx4_en_tx_ring *ring;
	struct mlx4_en_tx_desc *tx_desc;
	struct mlx4_wqe_data_seg *data;
	struct mlx4_en_tx_info *tx_info;
	int tx_ind = 0;
	int nr_txbb;
	int desc_size;
	int real_size;
702
	u32 index, bf_index;
703
	__be32 op_own;
Y
Yevgeny Petrilin 已提交
704
	u16 vlan_tag = 0;
705
	int i_frag;
706
	int lso_header_size;
707
	void *fragptr = NULL;
708
	bool bounce = false;
709
	bool send_doorbell;
E
Eric Dumazet 已提交
710
	bool stop_queue;
711
	bool inline_ok;
712
	u32 ring_cons;
713

714 715 716
	if (!priv->port_up)
		goto tx_drop;

717 718 719 720 721 722
	tx_ind = skb_get_queue_mapping(skb);
	ring = priv->tx_ring[tx_ind];

	/* fetch ring->cons far ahead before needing it to avoid stall */
	ring_cons = ACCESS_ONCE(ring->cons);

723 724
	real_size = get_real_size(skb, shinfo, dev, &lso_header_size,
				  &inline_ok, &fragptr);
725
	if (unlikely(!real_size))
726
		goto tx_drop;
727

L
Lucas De Marchi 已提交
728
	/* Align descriptor to TXBB size */
729 730 731 732
	desc_size = ALIGN(real_size, TXBB_SIZE);
	nr_txbb = desc_size / TXBB_SIZE;
	if (unlikely(nr_txbb > MAX_DESC_TXBBS)) {
		if (netif_msg_tx_err(priv))
733
			en_warn(priv, "Oversized header or SG list\n");
734
		goto tx_drop;
735 736
	}

737
	if (vlan_tx_tag_present(skb))
Y
Yevgeny Petrilin 已提交
738
		vlan_tag = vlan_tx_tag_get(skb);
739 740


741
	netdev_txq_bql_enqueue_prefetchw(ring->tx_queue);
742

743 744
	/* Track current inflight packets for performance analysis */
	AVG_PERF_COUNTER(priv->pstats.inflight_avg,
745
			 (u32)(ring->prod - ring_cons - 1));
746 747 748

	/* Packet is good - grab an index and transmit it */
	index = ring->prod & ring->size_mask;
749
	bf_index = ring->prod;
750 751 752 753 754

	/* See if we have enough space for whole descriptor TXBB for setting
	 * SW ownership on next descriptor; if not, use a bounce buffer. */
	if (likely(index + nr_txbb <= ring->size))
		tx_desc = ring->buf + index * TXBB_SIZE;
755
	else {
756
		tx_desc = (struct mlx4_en_tx_desc *) ring->bounce_buf;
757 758
		bounce = true;
	}
759 760 761 762 763 764

	/* Save skb in tx_info ring */
	tx_info = &ring->tx_info[index];
	tx_info->skb = skb;
	tx_info->nr_txbb = nr_txbb;

765
	data = &tx_desc->data;
766 767 768 769 770 771 772
	if (lso_header_size)
		data = ((void *)&tx_desc->lso + ALIGN(lso_header_size + 4,
						      DS_SIZE));

	/* valid only for none inline segments */
	tx_info->data_offset = (void *)data - (void *)tx_desc;

773 774
	tx_info->inl = inline_ok;

775
	tx_info->linear = (lso_header_size < skb_headlen(skb) &&
776
			   !inline_ok) ? 1 : 0;
777

778
	tx_info->nr_maps = shinfo->nr_frags + tx_info->linear;
779
	data += tx_info->nr_maps - 1;
780

781
	if (!tx_info->inl) {
782 783 784
		dma_addr_t dma = 0;
		u32 byte_count = 0;

785
		/* Map fragments if any */
786
		for (i_frag = shinfo->nr_frags - 1; i_frag >= 0; i_frag--) {
787
			const struct skb_frag_struct *frag;
788 789

			frag = &shinfo->frags[i_frag];
790
			byte_count = skb_frag_size(frag);
791
			dma = skb_frag_dma_map(ddev, frag,
792
					       0, byte_count,
793 794 795 796 797
					       DMA_TO_DEVICE);
			if (dma_mapping_error(ddev, dma))
				goto tx_drop_unmap;

			data->addr = cpu_to_be64(dma);
798
			data->lkey = ring->mr_key;
799
			wmb();
800
			data->byte_count = cpu_to_be32(byte_count);
801 802 803
			--data;
		}

804
		/* Map linear part if needed */
805
		if (tx_info->linear) {
806
			byte_count = skb_headlen(skb) - lso_header_size;
807

808 809 810 811 812 813 814
			dma = dma_map_single(ddev, skb->data +
					     lso_header_size, byte_count,
					     PCI_DMA_TODEVICE);
			if (dma_mapping_error(ddev, dma))
				goto tx_drop_unmap;

			data->addr = cpu_to_be64(dma);
815
			data->lkey = ring->mr_key;
816 817 818
			wmb();
			data->byte_count = cpu_to_be32(byte_count);
		}
819 820 821
		/* tx completion can avoid cache line miss for common cases */
		tx_info->map0_dma = dma;
		tx_info->map0_byte_count = byte_count;
822 823
	}

824 825 826 827
	/*
	 * For timestamping add flag to skb_shinfo and
	 * set flag for further reference
	 */
828
	tx_info->ts_requested = 0;
829 830 831
	if (unlikely(ring->hwtstamp_tx_type == HWTSTAMP_TX_ON &&
		     shinfo->tx_flags & SKBTX_HW_TSTAMP)) {
		shinfo->tx_flags |= SKBTX_IN_PROGRESS;
832 833 834
		tx_info->ts_requested = 1;
	}

835 836
	/* Prepare ctrl segement apart opcode+ownership, which depends on
	 * whether LSO is used */
A
Amir Vadai 已提交
837
	tx_desc->ctrl.srcrb_flags = priv->ctrl_flags;
838 839 840
	if (likely(skb->ip_summed == CHECKSUM_PARTIAL)) {
		tx_desc->ctrl.srcrb_flags |= cpu_to_be32(MLX4_WQE_CTRL_IP_CSUM |
							 MLX4_WQE_CTRL_TCP_UDP_CSUM);
841
		ring->tx_csum++;
842 843
	}

844
	if (priv->flags & MLX4_EN_FLAG_ENABLE_HW_LOOPBACK) {
845 846
		struct ethhdr *ethh;

847 848 849 850 851 852 853 854
		/* Copy dst mac address to wqe. This allows loopback in eSwitch,
		 * so that VFs and PF can communicate with each other
		 */
		ethh = (struct ethhdr *)skb->data;
		tx_desc->ctrl.srcrb_flags16[0] = get_unaligned((__be16 *)ethh->h_dest);
		tx_desc->ctrl.imm = get_unaligned((__be32 *)(ethh->h_dest + 2));
	}

855 856
	/* Handle LSO (TSO) packets */
	if (lso_header_size) {
857 858
		int i;

859 860 861 862 863 864 865
		/* Mark opcode as LSO */
		op_own = cpu_to_be32(MLX4_OPCODE_LSO | (1 << 6)) |
			((ring->prod & ring->size) ?
				cpu_to_be32(MLX4_EN_BIT_DESC_OWN) : 0);

		/* Fill in the LSO prefix */
		tx_desc->lso.mss_hdr_size = cpu_to_be32(
866
			shinfo->gso_size << 16 | lso_header_size);
867 868 869 870 871

		/* Copy headers;
		 * note that we already verified that it is linear */
		memcpy(tx_desc->lso.header, skb->data, lso_header_size);

E
Eric Dumazet 已提交
872
		ring->tso_packets++;
873 874 875

		i = ((skb->len - lso_header_size) / shinfo->gso_size) +
			!!((skb->len - lso_header_size) % shinfo->gso_size);
876
		tx_info->nr_bytes = skb->len + (i - 1) * lso_header_size;
877 878 879 880 881 882
		ring->packets += i;
	} else {
		/* Normal (Non LSO) packet */
		op_own = cpu_to_be32(MLX4_OPCODE_SEND) |
			((ring->prod & ring->size) ?
			 cpu_to_be32(MLX4_EN_BIT_DESC_OWN) : 0);
883
		tx_info->nr_bytes = max_t(unsigned int, skb->len, ETH_ZLEN);
884 885
		ring->packets++;
	}
886 887
	ring->bytes += tx_info->nr_bytes;
	netdev_tx_sent_queue(ring->tx_queue, tx_info->nr_bytes);
888 889
	AVG_PERF_COUNTER(priv->pstats.tx_pktsz_avg, skb->len);

890
	if (tx_info->inl)
891 892
		build_inline_wqe(tx_desc, skb, shinfo, real_size, &vlan_tag,
				 tx_ind, fragptr);
893

894 895 896 897 898 899 900 901
	if (skb->encapsulation) {
		struct iphdr *ipv4 = (struct iphdr *)skb_inner_network_header(skb);
		if (ipv4->protocol == IPPROTO_TCP || ipv4->protocol == IPPROTO_UDP)
			op_own |= cpu_to_be32(MLX4_WQE_CTRL_IIP | MLX4_WQE_CTRL_ILP);
		else
			op_own |= cpu_to_be32(MLX4_WQE_CTRL_IIP);
	}

902 903 904
	ring->prod += nr_txbb;

	/* If we used a bounce buffer then copy descriptor back into place */
905
	if (unlikely(bounce))
906 907
		tx_desc = mlx4_en_bounce_to_desc(priv, ring, index, desc_size);

908 909
	skb_tx_timestamp(skb);

E
Eric Dumazet 已提交
910 911 912 913 914 915 916
	/* Check available TXBBs And 2K spare for prefetch */
	stop_queue = (int)(ring->prod - ring_cons) >
		      ring->size - HEADROOM - MAX_DESC_TXBBS;
	if (unlikely(stop_queue)) {
		netif_tx_stop_queue(ring->tx_queue);
		ring->queue_stopped++;
	}
917 918
	send_doorbell = !skb->xmit_more || netif_xmit_stopped(ring->tx_queue);

919 920
	real_size = (real_size / 16) & 0x3f;

921 922
	if (ring->bf_enabled && desc_size <= MAX_BF && !bounce &&
	    !vlan_tx_tag_present(skb) && send_doorbell) {
923 924
		tx_desc->ctrl.bf_qpn = ring->doorbell_qpn |
				       cpu_to_be32(real_size);
925

926
		op_own |= htonl((bf_index & 0xffff) << 8);
927 928 929
		/* Ensure new descriptor hits memory
		 * before setting ownership of this descriptor to HW
		 */
930 931
		wmb();
		tx_desc->ctrl.owner_opcode = op_own;
932

933 934
		wmb();

935 936
		mlx4_bf_copy(ring->bf.reg + ring->bf.offset, &tx_desc->ctrl,
			     desc_size);
937 938 939 940 941

		wmb();

		ring->bf.offset ^= ring->bf.buf_size;
	} else {
942 943 944 945 946
		tx_desc->ctrl.vlan_tag = cpu_to_be16(vlan_tag);
		tx_desc->ctrl.ins_vlan = MLX4_WQE_CTRL_INS_VLAN *
			!!vlan_tx_tag_present(skb);
		tx_desc->ctrl.fence_size = real_size;

947 948 949
		/* Ensure new descriptor hits memory
		 * before setting ownership of this descriptor to HW
		 */
950 951
		wmb();
		tx_desc->ctrl.owner_opcode = op_own;
952 953
		if (send_doorbell) {
			wmb();
954 955
			iowrite32(ring->doorbell_qpn,
				  ring->bf.uar->map + MLX4_SEND_DOORBELL);
E
Eric Dumazet 已提交
956 957
		} else {
			ring->xmit_more++;
958
		}
959
	}
960

E
Eric Dumazet 已提交
961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976
	if (unlikely(stop_queue)) {
		/* If queue was emptied after the if (stop_queue) , and before
		 * the netif_tx_stop_queue() - need to wake the queue,
		 * or else it will remain stopped forever.
		 * Need a memory barrier to make sure ring->cons was not
		 * updated before queue was stopped.
		 */
		smp_rmb();

		ring_cons = ACCESS_ONCE(ring->cons);
		if (unlikely(((int)(ring->prod - ring_cons)) <=
			     ring->size - HEADROOM - MAX_DESC_TXBBS)) {
			netif_tx_wake_queue(ring->tx_queue);
			ring->wake_queue++;
		}
	}
977
	return NETDEV_TX_OK;
978

979 980 981
tx_drop_unmap:
	en_err(priv, "DMA mapping error\n");

982 983
	while (++i_frag < shinfo->nr_frags) {
		++data;
984 985 986 987 988
		dma_unmap_page(ddev, (dma_addr_t) be64_to_cpu(data->addr),
			       be32_to_cpu(data->byte_count),
			       PCI_DMA_TODEVICE);
	}

989 990 991 992
tx_drop:
	dev_kfree_skb_any(skb);
	priv->stats.tx_dropped++;
	return NETDEV_TX_OK;
993 994
}