/*
 * Copyright (c) 2007 Mellanox Technologies. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 */

#include <asm/page.h>
#include <linux/mlx4/cq.h>
#include <linux/slab.h>
#include <linux/mlx4/qp.h>
#include <linux/skbuff.h>
#include <linux/if_vlan.h>
#include <linux/prefetch.h>
#include <linux/vmalloc.h>
#include <linux/tcp.h>
#include <linux/ip.h>
#include <linux/moduleparam.h>

#include "mlx4_en.h"

int mlx4_en_create_tx_ring(struct mlx4_en_priv *priv,
			   struct mlx4_en_tx_ring **pring, int qpn, u32 size,
			   u16 stride, int node, int queue_index)
{
	struct mlx4_en_dev *mdev = priv->mdev;
	struct mlx4_en_tx_ring *ring;
	int tmp;
	int err;

	ring = kzalloc_node(sizeof(*ring), GFP_KERNEL, node);
	if (!ring) {
		ring = kzalloc(sizeof(*ring), GFP_KERNEL);
		if (!ring) {
			en_err(priv, "Failed allocating TX ring\n");
			return -ENOMEM;
		}
	}

	ring->size = size;
	ring->size_mask = size - 1;
	ring->stride = stride;
	ring->inline_thold = priv->prof->inline_thold;

	tmp = size * sizeof(struct mlx4_en_tx_info);
	ring->tx_info = kmalloc_node(tmp, GFP_KERNEL | __GFP_NOWARN, node);
	if (!ring->tx_info) {
		ring->tx_info = vmalloc(tmp);
		if (!ring->tx_info) {
			err = -ENOMEM;
			goto err_ring;
		}
	}

	en_dbg(DRV, priv, "Allocated tx_info ring at addr:%p size:%d\n",
		 ring->tx_info, tmp);

	ring->bounce_buf = kmalloc_node(MAX_DESC_SIZE, GFP_KERNEL, node);
	if (!ring->bounce_buf) {
		ring->bounce_buf = kmalloc(MAX_DESC_SIZE, GFP_KERNEL);
		if (!ring->bounce_buf) {
			err = -ENOMEM;
			goto err_info;
		}
	}
	ring->buf_size = ALIGN(size * ring->stride, MLX4_EN_PAGE_SIZE);

	/* Allocate HW buffers on provided NUMA node */
	set_dev_node(&mdev->dev->pdev->dev, node);
	err = mlx4_alloc_hwq_res(mdev->dev, &ring->wqres, ring->buf_size,
				 2 * PAGE_SIZE);
	set_dev_node(&mdev->dev->pdev->dev, mdev->dev->numa_node);
	if (err) {
		en_err(priv, "Failed allocating hwq resources\n");
		goto err_bounce;
	}

	err = mlx4_en_map_buffer(&ring->wqres.buf);
	if (err) {
		en_err(priv, "Failed to map TX buffer\n");
		goto err_hwq_res;
	}

	ring->buf = ring->wqres.buf.direct.buf;

	en_dbg(DRV, priv, "Allocated TX ring (addr:%p) - buf:%p size:%d buf_size:%d dma:%llx\n",
	       ring, ring->buf, ring->size, ring->buf_size,
	       (unsigned long long) ring->wqres.buf.direct.map);

	ring->qpn = qpn;
	err = mlx4_qp_alloc(mdev->dev, ring->qpn, &ring->qp, GFP_KERNEL);
	if (err) {
		en_err(priv, "Failed allocating qp %d\n", ring->qpn);
		goto err_map;
	}
	ring->qp.event = mlx4_en_sqp_event;

	err = mlx4_bf_alloc(mdev->dev, &ring->bf, node);
	if (err) {
		en_dbg(DRV, priv, "working without blueflame (%d)\n", err);
		ring->bf.uar = &mdev->priv_uar;
		ring->bf.uar->map = mdev->uar_map;
		ring->bf_enabled = false;
		ring->bf_alloced = false;
		priv->pflags &= ~MLX4_EN_PRIV_FLAGS_BLUEFLAME;
	} else {
		ring->bf_alloced = true;
		ring->bf_enabled = !!(priv->pflags &
				      MLX4_EN_PRIV_FLAGS_BLUEFLAME);
	}

	ring->hwtstamp_tx_type = priv->hwtstamp_config.tx_type;
	ring->queue_index = queue_index;

	if (queue_index < priv->num_tx_rings_p_up && cpu_online(queue_index))
		cpumask_set_cpu(queue_index, &ring->affinity_mask);

	*pring = ring;
	return 0;

err_map:
	mlx4_en_unmap_buffer(&ring->wqres.buf);
err_hwq_res:
	mlx4_free_hwq_res(mdev->dev, &ring->wqres, ring->buf_size);
err_bounce:
	kfree(ring->bounce_buf);
	ring->bounce_buf = NULL;
err_info:
	kvfree(ring->tx_info);
	ring->tx_info = NULL;
err_ring:
	kfree(ring);
	*pring = NULL;
	return err;
}

void mlx4_en_destroy_tx_ring(struct mlx4_en_priv *priv,
			     struct mlx4_en_tx_ring **pring)
{
	struct mlx4_en_dev *mdev = priv->mdev;
	struct mlx4_en_tx_ring *ring = *pring;
	en_dbg(DRV, priv, "Destroying tx ring, qpn: %d\n", ring->qpn);

	if (ring->bf_alloced)
		mlx4_bf_free(mdev->dev, &ring->bf);
	mlx4_qp_remove(mdev->dev, &ring->qp);
	mlx4_qp_free(mdev->dev, &ring->qp);
	mlx4_en_unmap_buffer(&ring->wqres.buf);
	mlx4_free_hwq_res(mdev->dev, &ring->wqres, ring->buf_size);
	kfree(ring->bounce_buf);
	ring->bounce_buf = NULL;
	kvfree(ring->tx_info);
	ring->tx_info = NULL;
	kfree(ring);
	*pring = NULL;
}

int mlx4_en_activate_tx_ring(struct mlx4_en_priv *priv,
			     struct mlx4_en_tx_ring *ring,
			     int cq, int user_prio)
{
	struct mlx4_en_dev *mdev = priv->mdev;
	int err;

	ring->cqn = cq;
	ring->prod = 0;
	ring->cons = 0xffffffff;
	ring->last_nr_txbb = 1;
	memset(ring->tx_info, 0, ring->size * sizeof(struct mlx4_en_tx_info));
	memset(ring->buf, 0, ring->buf_size);

	ring->qp_state = MLX4_QP_STATE_RST;
	ring->doorbell_qpn = cpu_to_be32(ring->qp.qpn << 8);
	ring->mr_key = cpu_to_be32(mdev->mr.key);

	mlx4_en_fill_qp_context(priv, ring->size, ring->stride, 1, 0, ring->qpn,
				ring->cqn, user_prio, &ring->context);
	if (ring->bf_alloced)
		ring->context.usr_page = cpu_to_be32(ring->bf.uar->index);

	err = mlx4_qp_to_ready(mdev->dev, &ring->wqres.mtt, &ring->context,
			       &ring->qp, &ring->qp_state);
	if (!user_prio && cpu_online(ring->queue_index))
		netif_set_xps_queue(priv->dev, &ring->affinity_mask,
				    ring->queue_index);

	return err;
}

void mlx4_en_deactivate_tx_ring(struct mlx4_en_priv *priv,
				struct mlx4_en_tx_ring *ring)
{
	struct mlx4_en_dev *mdev = priv->mdev;

	mlx4_qp_modify(mdev->dev, NULL, ring->qp_state,
		       MLX4_QP_STATE_RST, NULL, 0, 0, &ring->qp);
}

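/* Overwrite a reclaimed descriptor's TXBBs with the stamp pattern, one
 * 32-bit write per STAMP_STRIDE bytes, flipping the stamped ownership bit
 * whenever the walk wraps past the end of the ring buffer.
 */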
static void mlx4_en_stamp_wqe(struct mlx4_en_priv *priv,
			      struct mlx4_en_tx_ring *ring, int index,
			      u8 owner)
{
	__be32 stamp = cpu_to_be32(STAMP_VAL | (!!owner << STAMP_SHIFT));
	struct mlx4_en_tx_desc *tx_desc = ring->buf + index * TXBB_SIZE;
	struct mlx4_en_tx_info *tx_info = &ring->tx_info[index];
	void *end = ring->buf + ring->buf_size;
	__be32 *ptr = (__be32 *)tx_desc;
	int i;

	/* Optimize the common case when there are no wraparounds */
	if (likely((void *)tx_desc + tx_info->nr_txbb * TXBB_SIZE <= end)) {
		/* Stamp the freed descriptor */
		for (i = 0; i < tx_info->nr_txbb * TXBB_SIZE;
		     i += STAMP_STRIDE) {
			*ptr = stamp;
			ptr += STAMP_DWORDS;
		}
	} else {
		/* Stamp the freed descriptor */
		for (i = 0; i < tx_info->nr_txbb * TXBB_SIZE;
		     i += STAMP_STRIDE) {
			*ptr = stamp;
			ptr += STAMP_DWORDS;
			if ((void *)ptr >= end) {
				ptr = ring->buf;
				stamp ^= cpu_to_be32(0x80000000);
			}
		}
	}
}


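/* Release a completed TX descriptor: deliver the HW timestamp if one was
 * requested, unmap the linear part and the page fragments (unless the
 * packet was sent inline), handling data segments that wrap past the end
 * of the ring, then consume the skb.  Returns the number of TXBBs the
 * descriptor occupied so the caller can advance the consumer index.
 */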
static u32 mlx4_en_free_tx_desc(struct mlx4_en_priv *priv,
				struct mlx4_en_tx_ring *ring,
				int index, u8 owner, u64 timestamp)
{
	struct mlx4_en_tx_info *tx_info = &ring->tx_info[index];
	struct mlx4_en_tx_desc *tx_desc = ring->buf + index * TXBB_SIZE;
	struct mlx4_wqe_data_seg *data = (void *) tx_desc + tx_info->data_offset;
	void *end = ring->buf + ring->buf_size;
	struct sk_buff *skb = tx_info->skb;
	int nr_maps = tx_info->nr_maps;
	int i;

	/* We do not touch the skb here, so prefetch the skb->users location
	 * to speed up consume_skb()
	 */
	prefetchw(&skb->users);

	if (unlikely(timestamp)) {
		struct skb_shared_hwtstamps hwts;

		mlx4_en_fill_hwtstamps(priv->mdev, &hwts, timestamp);
		skb_tstamp_tx(skb, &hwts);
	}

	/* Optimize the common case when there are no wraparounds */
	if (likely((void *) tx_desc + tx_info->nr_txbb * TXBB_SIZE <= end)) {
		if (!tx_info->inl) {
			if (tx_info->linear)
				dma_unmap_single(priv->ddev,
						tx_info->map0_dma,
						tx_info->map0_byte_count,
						PCI_DMA_TODEVICE);
			else
				dma_unmap_page(priv->ddev,
					       tx_info->map0_dma,
					       tx_info->map0_byte_count,
					       PCI_DMA_TODEVICE);
			for (i = 1; i < nr_maps; i++) {
				data++;
				dma_unmap_page(priv->ddev,
					(dma_addr_t)be64_to_cpu(data->addr),
					be32_to_cpu(data->byte_count),
					PCI_DMA_TODEVICE);
			}
		}
	} else {
		if (!tx_info->inl) {
			if ((void *) data >= end) {
				data = ring->buf + ((void *)data - end);
			}

			if (tx_info->linear)
				dma_unmap_single(priv->ddev,
						tx_info->map0_dma,
						tx_info->map0_byte_count,
						PCI_DMA_TODEVICE);
			else
				dma_unmap_page(priv->ddev,
					       tx_info->map0_dma,
					       tx_info->map0_byte_count,
					       PCI_DMA_TODEVICE);
			for (i = 1; i < nr_maps; i++) {
				data++;
				/* Check for wraparound before unmapping */
				if ((void *) data >= end)
					data = ring->buf;
				dma_unmap_page(priv->ddev,
					(dma_addr_t)be64_to_cpu(data->addr),
					be32_to_cpu(data->byte_count),
					PCI_DMA_TODEVICE);
			}
		}
	}
	dev_consume_skb_any(skb);
	return tx_info->nr_txbb;
}


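/* Reclaim every descriptor still outstanding on the ring (used when the
 * ring is being torn down) and reset the BQL state of its netdev queue.
 * Returns the number of descriptors freed.
 */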
int mlx4_en_free_tx_buf(struct net_device *dev, struct mlx4_en_tx_ring *ring)
{
	struct mlx4_en_priv *priv = netdev_priv(dev);
	int cnt = 0;

	/* Skip last polled descriptor */
	ring->cons += ring->last_nr_txbb;
	en_dbg(DRV, priv, "Freeing Tx buf - cons:0x%x prod:0x%x\n",
		 ring->cons, ring->prod);

	if ((u32) (ring->prod - ring->cons) > ring->size) {
		if (netif_msg_tx_err(priv))
			en_warn(priv, "Tx consumer passed producer!\n");
		return 0;
	}

	while (ring->cons != ring->prod) {
		ring->last_nr_txbb = mlx4_en_free_tx_desc(priv, ring,
						ring->cons & ring->size_mask,
						!!(ring->cons & ring->size), 0);
		ring->cons += ring->last_nr_txbb;
		cnt++;
	}

	netdev_tx_reset_queue(ring->tx_queue);

	if (cnt)
		en_dbg(DRV, priv, "Freed %d uncompleted tx descriptors\n", cnt);

	return cnt;
}

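/* Poll the TX completion queue, up to the tx_work_limit budget: free and
 * stamp the descriptors covered by each CQE, update the CQ consumer index
 * before the ring consumer (to avoid CQ overflow), report the completed
 * packets and bytes to BQL and wake the queue if it was stopped.
 * Returns true when the budget was not exhausted.
 */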
static bool mlx4_en_process_tx_cq(struct net_device *dev,
				 struct mlx4_en_cq *cq)
{
	struct mlx4_en_priv *priv = netdev_priv(dev);
	struct mlx4_cq *mcq = &cq->mcq;
	struct mlx4_en_tx_ring *ring = priv->tx_ring[cq->ring];
	struct mlx4_cqe *cqe;
	u16 index;
	u16 new_index, ring_index, stamp_index;
	u32 txbbs_skipped = 0;
	u32 txbbs_stamp = 0;
	u32 cons_index = mcq->cons_index;
	int size = cq->size;
	u32 size_mask = ring->size_mask;
	struct mlx4_cqe *buf = cq->buf;
	u32 packets = 0;
	u32 bytes = 0;
	int factor = priv->cqe_factor;
	u64 timestamp = 0;
	int done = 0;
	int budget = priv->tx_work_limit;
	u32 last_nr_txbb;
	u32 ring_cons;

	if (!priv->port_up)
		return true;

	prefetchw(&ring->tx_queue->dql.limit);
	index = cons_index & size_mask;
	cqe = mlx4_en_get_cqe(buf, index, priv->cqe_size) + factor;
	last_nr_txbb = ACCESS_ONCE(ring->last_nr_txbb);
	ring_cons = ACCESS_ONCE(ring->cons);
	ring_index = ring_cons & size_mask;
	stamp_index = ring_index;

	/* Process all completed CQEs */
	while (XNOR(cqe->owner_sr_opcode & MLX4_CQE_OWNER_MASK,
			cons_index & size) && (done < budget)) {
		/*
		 * make sure we read the CQE after we read the
		 * ownership bit
		 */
		rmb();

		if (unlikely((cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) ==
			     MLX4_CQE_OPCODE_ERROR)) {
			struct mlx4_err_cqe *cqe_err = (struct mlx4_err_cqe *)cqe;

			en_err(priv, "CQE error - vendor syndrome: 0x%x syndrome: 0x%x\n",
			       cqe_err->vendor_err_syndrome,
			       cqe_err->syndrome);
		}

		/* Skip over last polled CQE */
		new_index = be16_to_cpu(cqe->wqe_index) & size_mask;

		do {
			txbbs_skipped += last_nr_txbb;
			ring_index = (ring_index + last_nr_txbb) & size_mask;
			if (ring->tx_info[ring_index].ts_requested)
				timestamp = mlx4_en_get_cqe_ts(cqe);

			/* free next descriptor */
			last_nr_txbb = mlx4_en_free_tx_desc(
					priv, ring, ring_index,
					!!((ring_cons + txbbs_skipped) &
					ring->size), timestamp);

			mlx4_en_stamp_wqe(priv, ring, stamp_index,
					  !!((ring_cons + txbbs_stamp) &
						ring->size));
			stamp_index = ring_index;
			txbbs_stamp = txbbs_skipped;
			packets++;
			bytes += ring->tx_info[ring_index].nr_bytes;
		} while ((++done < budget) && (ring_index != new_index));

		++cons_index;
		index = cons_index & size_mask;
		cqe = mlx4_en_get_cqe(buf, index, priv->cqe_size) + factor;
	}


	/*
	 * To prevent CQ overflow we first update CQ consumer and only then
	 * the ring consumer.
	 */
	mcq->cons_index = cons_index;
	mlx4_cq_set_ci(mcq);
	wmb();

	/* we want to dirty this cache line once */
	ACCESS_ONCE(ring->last_nr_txbb) = last_nr_txbb;
	ACCESS_ONCE(ring->cons) = ring_cons + txbbs_skipped;

	netdev_tx_completed_queue(ring->tx_queue, packets, bytes);

	/*
	 * Wake up the Tx queue if it was stopped and at least one packet
	 * was completed
	 */
	if (netif_tx_queue_stopped(ring->tx_queue) && txbbs_skipped > 0) {
		netif_tx_wake_queue(ring->tx_queue);
		ring->wake_queue++;
	}
	return done < budget;
}

void mlx4_en_tx_irq(struct mlx4_cq *mcq)
{
	struct mlx4_en_cq *cq = container_of(mcq, struct mlx4_en_cq, mcq);
	struct mlx4_en_priv *priv = netdev_priv(cq->dev);

	if (priv->port_up)
		napi_schedule(&cq->napi);
	else
		mlx4_en_arm_cq(priv, cq);
}

/* TX CQ polling - called by NAPI */
int mlx4_en_poll_tx_cq(struct napi_struct *napi, int budget)
{
	struct mlx4_en_cq *cq = container_of(napi, struct mlx4_en_cq, napi);
	struct net_device *dev = cq->dev;
	struct mlx4_en_priv *priv = netdev_priv(dev);
	int clean_complete;

	clean_complete = mlx4_en_process_tx_cq(dev, cq);
	if (!clean_complete)
		return budget;

	napi_complete(napi);
	mlx4_en_arm_cq(priv, cq);

	return 0;
}

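/* The descriptor was built in the bounce buffer because it would wrap past
 * the end of the ring.  Copy it back into the ring in two parts (the
 * wrapped tail to the start of the buffer first), one 32-bit word at a
 * time and backwards, with a barrier at each TXBB boundary; the first word
 * of the descriptor is skipped so the caller can set ownership last.
 */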
static struct mlx4_en_tx_desc *mlx4_en_bounce_to_desc(struct mlx4_en_priv *priv,
						      struct mlx4_en_tx_ring *ring,
						      u32 index,
						      unsigned int desc_size)
{
	u32 copy = (ring->size - index) * TXBB_SIZE;
	int i;

	for (i = desc_size - copy - 4; i >= 0; i -= 4) {
		if ((i & (TXBB_SIZE - 1)) == 0)
			wmb();

		*((u32 *) (ring->buf + i)) =
			*((u32 *) (ring->bounce_buf + copy + i));
	}

	for (i = copy - 4; i >= 4 ; i -= 4) {
		if ((i & (TXBB_SIZE - 1)) == 0)
			wmb();

		*((u32 *) (ring->buf + index * TXBB_SIZE + i)) =
			*((u32 *) (ring->bounce_buf + i));
	}

	/* Return real descriptor location */
	return ring->buf + index * TXBB_SIZE;
}

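/* Decide whether the skb should be sent inline in the WQE: only when an
 * inline threshold is set, the skb is not GSO and fits under the threshold,
 * and it has at most one page fragment whose address can be resolved
 * (returned through pfrag).
 */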
static bool is_inline(int inline_thold, const struct sk_buff *skb,
		      const struct skb_shared_info *shinfo,
		      void **pfrag)
{
	void *ptr;

	if (inline_thold && !skb_is_gso(skb) && skb->len <= inline_thold) {
		if (shinfo->nr_frags == 1) {
			ptr = skb_frag_address_safe(&shinfo->frags[0]);
			if (unlikely(!ptr))
				return 0;

			if (pfrag)
				*pfrag = ptr;

			return 1;
		} else if (unlikely(shinfo->nr_frags))
			return 0;
		else
			return 1;
	}

	return 0;
}

static int inline_size(const struct sk_buff *skb)
{
	if (skb->len + CTRL_SIZE + sizeof(struct mlx4_wqe_inline_seg)
	    <= MLX4_INLINE_ALIGN)
		return ALIGN(skb->len + CTRL_SIZE +
			     sizeof(struct mlx4_wqe_inline_seg), 16);
	else
		return ALIGN(skb->len + CTRL_SIZE + 2 *
			     sizeof(struct mlx4_wqe_inline_seg), 16);
}

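/* Compute the WQE size in bytes needed for this skb: for TSO, a control
 * segment, the aligned LSO header and one data segment per page fragment
 * (plus one more if the linear buffer carries payload beyond the headers);
 * otherwise, either the inline size or a control segment plus one data
 * segment per fragment and one for the linear part.  Returns 0 when the
 * TSO headers are not fully contained in the linear buffer.
 */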
static int get_real_size(const struct sk_buff *skb,
			 const struct skb_shared_info *shinfo,
			 struct net_device *dev,
			 int *lso_header_size)
{
	struct mlx4_en_priv *priv = netdev_priv(dev);
	int real_size;

	if (shinfo->gso_size) {
		if (skb->encapsulation)
			*lso_header_size = (skb_inner_transport_header(skb) - skb->data) + inner_tcp_hdrlen(skb);
		else
			*lso_header_size = skb_transport_offset(skb) + tcp_hdrlen(skb);
		real_size = CTRL_SIZE + shinfo->nr_frags * DS_SIZE +
			ALIGN(*lso_header_size + 4, DS_SIZE);
		if (unlikely(*lso_header_size != skb_headlen(skb))) {
			/* We add a segment for the skb linear buffer only if
			 * it contains data */
			if (*lso_header_size < skb_headlen(skb))
				real_size += DS_SIZE;
			else {
				if (netif_msg_tx_err(priv))
					en_warn(priv, "Non-linear headers\n");
				return 0;
			}
		}
	} else {
		*lso_header_size = 0;
		if (!is_inline(priv->prof->inline_thold, skb, shinfo, NULL))
			real_size = CTRL_SIZE + (shinfo->nr_frags + 1) * DS_SIZE;
		else
			real_size = inline_size(skb);
	}

	return real_size;
}

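/* Copy the whole packet into the descriptor's inline segment(s).  Short
 * packets are padded to MIN_PKT_LEN; packets that do not fit in the first
 * inline chunk are split across two inline segments, and the second
 * segment's byte count is only written once the copied data is in place.
 */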
static void build_inline_wqe(struct mlx4_en_tx_desc *tx_desc,
			     const struct sk_buff *skb,
			     const struct skb_shared_info *shinfo,
			     int real_size, u16 *vlan_tag,
			     int tx_ind, void *fragptr)
{
	struct mlx4_wqe_inline_seg *inl = &tx_desc->inl;
	int spc = MLX4_INLINE_ALIGN - CTRL_SIZE - sizeof *inl;
	unsigned int hlen = skb_headlen(skb);

	if (skb->len <= spc) {
		if (likely(skb->len >= MIN_PKT_LEN)) {
			inl->byte_count = cpu_to_be32(1 << 31 | skb->len);
		} else {
			inl->byte_count = cpu_to_be32(1 << 31 | MIN_PKT_LEN);
			memset(((void *)(inl + 1)) + skb->len, 0,
			       MIN_PKT_LEN - skb->len);
		}
		skb_copy_from_linear_data(skb, inl + 1, hlen);
		if (shinfo->nr_frags)
			memcpy(((void *)(inl + 1)) + hlen, fragptr,
			       skb_frag_size(&shinfo->frags[0]));

	} else {
		inl->byte_count = cpu_to_be32(1 << 31 | spc);
		if (hlen <= spc) {
			skb_copy_from_linear_data(skb, inl + 1, hlen);
			if (hlen < spc) {
				memcpy(((void *)(inl + 1)) + hlen,
				       fragptr, spc - hlen);
				fragptr +=  spc - hlen;
			}
			inl = (void *) (inl + 1) + spc;
			memcpy(((void *)(inl + 1)), fragptr, skb->len - spc);
		} else {
			skb_copy_from_linear_data(skb, inl + 1, spc);
			inl = (void *) (inl + 1) + spc;
			skb_copy_from_linear_data_offset(skb, spc, inl + 1,
							 hlen - spc);
			if (shinfo->nr_frags)
				memcpy(((void *)(inl + 1)) + hlen - spc,
				       fragptr,
				       skb_frag_size(&shinfo->frags[0]));
		}

		wmb();
		inl->byte_count = cpu_to_be32(1 << 31 | (skb->len - spc));
	}
}

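/* Select the TX queue: with traffic classes configured defer to
 * skb_tx_hash(); otherwise use the fallback selector within a group of
 * rings_p_up rings, where the group is taken from the VLAN priority bits.
 */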
u16 mlx4_en_select_queue(struct net_device *dev, struct sk_buff *skb,
			 void *accel_priv, select_queue_fallback_t fallback)
{
	struct mlx4_en_priv *priv = netdev_priv(dev);
	u16 rings_p_up = priv->num_tx_rings_p_up;
	u8 up = 0;

	if (dev->num_tc)
		return skb_tx_hash(dev, skb);

	if (vlan_tx_tag_present(skb))
		up = vlan_tx_tag_get(skb) >> VLAN_PRIO_SHIFT;

	return fallback(dev, skb) % rings_p_up + up * rings_p_up;
}

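/* Push the descriptor into the BlueFlame register in 64-bit chunks. */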
static void mlx4_bf_copy(void __iomem *dst, const void *src,
			 unsigned int bytecnt)
{
	__iowrite64_copy(dst, src, bytecnt / 8);
}

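/* Transmit path (ndo_start_xmit): build a TX WQE for the skb, post it to
 * the selected ring and ring the doorbell (or use BlueFlame) when needed.
 */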
netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev)
{
	struct skb_shared_info *shinfo = skb_shinfo(skb);
	struct mlx4_en_priv *priv = netdev_priv(dev);
	struct device *ddev = priv->ddev;
	struct mlx4_en_tx_ring *ring;
	struct mlx4_en_tx_desc *tx_desc;
	struct mlx4_wqe_data_seg *data;
	struct mlx4_en_tx_info *tx_info;
	int tx_ind = 0;
	int nr_txbb;
	int desc_size;
	int real_size;
	u32 index, bf_index;
	__be32 op_own;
	u16 vlan_tag = 0;
	int i_frag;
	int lso_header_size;
	void *fragptr;
	bool bounce = false;
	bool send_doorbell;
	u32 ring_cons;

	if (!priv->port_up)
		goto tx_drop;

	tx_ind = skb_get_queue_mapping(skb);
	ring = priv->tx_ring[tx_ind];

	/* fetch ring->cons far ahead before needing it to avoid stall */
	ring_cons = ACCESS_ONCE(ring->cons);

	real_size = get_real_size(skb, shinfo, dev, &lso_header_size);
	if (unlikely(!real_size))
		goto tx_drop;

	/* Align descriptor to TXBB size */
	desc_size = ALIGN(real_size, TXBB_SIZE);
	nr_txbb = desc_size / TXBB_SIZE;
	if (unlikely(nr_txbb > MAX_DESC_TXBBS)) {
		if (netif_msg_tx_err(priv))
			en_warn(priv, "Oversized header or SG list\n");
		goto tx_drop;
	}

	if (vlan_tx_tag_present(skb))
		vlan_tag = vlan_tx_tag_get(skb);

	/* Check available TXBBs and 2K spare for prefetch */
	if (unlikely(((int)(ring->prod - ring_cons)) >
		     ring->size - HEADROOM - MAX_DESC_TXBBS)) {
		/* every full Tx ring stops queue */
		netif_tx_stop_queue(ring->tx_queue);
		ring->queue_stopped++;

		/* If queue was emptied after the if, and before the
		 * stop_queue - need to wake the queue, or else it will remain
		 * stopped forever.
		 * Need a memory barrier to make sure ring->cons was not
		 * updated before queue was stopped.
		 */
		wmb();

		ring_cons = ACCESS_ONCE(ring->cons);
		if (unlikely(((int)(ring->prod - ring_cons)) <=
			     ring->size - HEADROOM - MAX_DESC_TXBBS)) {
			netif_tx_wake_queue(ring->tx_queue);
			ring->wake_queue++;
		} else {
			return NETDEV_TX_BUSY;
		}
	}

	prefetchw(&ring->tx_queue->dql);

	/* Track current inflight packets for performance analysis */
	AVG_PERF_COUNTER(priv->pstats.inflight_avg,
			 (u32)(ring->prod - ring_cons - 1));

	/* Packet is good - grab an index and transmit it */
	index = ring->prod & ring->size_mask;
	bf_index = ring->prod;

	/* See if we have enough space for whole descriptor TXBB for setting
	 * SW ownership on next descriptor; if not, use a bounce buffer. */
	if (likely(index + nr_txbb <= ring->size))
		tx_desc = ring->buf + index * TXBB_SIZE;
	else {
		tx_desc = (struct mlx4_en_tx_desc *) ring->bounce_buf;
		bounce = true;
	}

	/* Save skb in tx_info ring */
	tx_info = &ring->tx_info[index];
	tx_info->skb = skb;
	tx_info->nr_txbb = nr_txbb;

	data = &tx_desc->data;
	if (lso_header_size)
		data = ((void *)&tx_desc->lso + ALIGN(lso_header_size + 4,
						      DS_SIZE));

	/* valid only for non-inline segments */
	tx_info->data_offset = (void *)data - (void *)tx_desc;

	tx_info->linear = (lso_header_size < skb_headlen(skb) &&
			   !is_inline(ring->inline_thold, skb, shinfo, NULL)) ? 1 : 0;

	tx_info->nr_maps = shinfo->nr_frags + tx_info->linear;
	data += tx_info->nr_maps - 1;

	if (is_inline(ring->inline_thold, skb, shinfo, &fragptr)) {
		tx_info->inl = 1;
	} else {
		dma_addr_t dma = 0;
		u32 byte_count = 0;

		/* Map fragments if any */
		for (i_frag = shinfo->nr_frags - 1; i_frag >= 0; i_frag--) {
			const struct skb_frag_struct *frag;

			frag = &shinfo->frags[i_frag];
			byte_count = skb_frag_size(frag);
			dma = skb_frag_dma_map(ddev, frag,
					       0, byte_count,
					       DMA_TO_DEVICE);
			if (dma_mapping_error(ddev, dma))
				goto tx_drop_unmap;

			data->addr = cpu_to_be64(dma);
			data->lkey = ring->mr_key;
			wmb();
			data->byte_count = cpu_to_be32(byte_count);
			--data;
		}

		/* Map linear part if needed */
		if (tx_info->linear) {
			byte_count = skb_headlen(skb) - lso_header_size;

			dma = dma_map_single(ddev, skb->data +
					     lso_header_size, byte_count,
					     PCI_DMA_TODEVICE);
			if (dma_mapping_error(ddev, dma))
				goto tx_drop_unmap;

			data->addr = cpu_to_be64(dma);
			data->lkey = ring->mr_key;
			wmb();
			data->byte_count = cpu_to_be32(byte_count);
		}
		tx_info->inl = 0;
		/* tx completion can avoid cache line miss for common cases */
		tx_info->map0_dma = dma;
		tx_info->map0_byte_count = byte_count;
	}

	/*
	 * For hardware timestamping, mark the skb as in progress and
	 * set a flag in tx_info for the completion path to reference
	 */
	if (unlikely(ring->hwtstamp_tx_type == HWTSTAMP_TX_ON &&
		     shinfo->tx_flags & SKBTX_HW_TSTAMP)) {
		shinfo->tx_flags |= SKBTX_IN_PROGRESS;
		tx_info->ts_requested = 1;
	}

	/* Prepare ctrl segment apart from opcode+ownership, which depends
	 * on whether LSO is used */
	tx_desc->ctrl.srcrb_flags = priv->ctrl_flags;
	if (likely(skb->ip_summed == CHECKSUM_PARTIAL)) {
		tx_desc->ctrl.srcrb_flags |= cpu_to_be32(MLX4_WQE_CTRL_IP_CSUM |
							 MLX4_WQE_CTRL_TCP_UDP_CSUM);
		ring->tx_csum++;
	}

	if (priv->flags & MLX4_EN_FLAG_ENABLE_HW_LOOPBACK) {
		struct ethhdr *ethh;

		/* Copy dst mac address to wqe. This allows loopback in eSwitch,
		 * so that VFs and PF can communicate with each other
		 */
		ethh = (struct ethhdr *)skb->data;
		tx_desc->ctrl.srcrb_flags16[0] = get_unaligned((__be16 *)ethh->h_dest);
		tx_desc->ctrl.imm = get_unaligned((__be32 *)(ethh->h_dest + 2));
	}

	/* Handle LSO (TSO) packets */
	if (lso_header_size) {
		int i;

		/* Mark opcode as LSO */
		op_own = cpu_to_be32(MLX4_OPCODE_LSO | (1 << 6)) |
			((ring->prod & ring->size) ?
				cpu_to_be32(MLX4_EN_BIT_DESC_OWN) : 0);

		/* Fill in the LSO prefix */
		tx_desc->lso.mss_hdr_size = cpu_to_be32(
			shinfo->gso_size << 16 | lso_header_size);

		/* Copy headers;
		 * note that we already verified that it is linear */
		memcpy(tx_desc->lso.header, skb->data, lso_header_size);

		ring->tso_packets++;

		i = ((skb->len - lso_header_size) / shinfo->gso_size) +
			!!((skb->len - lso_header_size) % shinfo->gso_size);
		tx_info->nr_bytes = skb->len + (i - 1) * lso_header_size;
		ring->packets += i;
	} else {
		/* Normal (Non LSO) packet */
		op_own = cpu_to_be32(MLX4_OPCODE_SEND) |
			((ring->prod & ring->size) ?
			 cpu_to_be32(MLX4_EN_BIT_DESC_OWN) : 0);
		tx_info->nr_bytes = max_t(unsigned int, skb->len, ETH_ZLEN);
		ring->packets++;
	}
	ring->bytes += tx_info->nr_bytes;
	netdev_tx_sent_queue(ring->tx_queue, tx_info->nr_bytes);
	AVG_PERF_COUNTER(priv->pstats.tx_pktsz_avg, skb->len);

	if (tx_info->inl) {
		build_inline_wqe(tx_desc, skb, shinfo, real_size, &vlan_tag,
				 tx_ind, fragptr);
		tx_info->inl = 1;
	}

	if (skb->encapsulation) {
		struct iphdr *ipv4 = (struct iphdr *)skb_inner_network_header(skb);
		if (ipv4->protocol == IPPROTO_TCP || ipv4->protocol == IPPROTO_UDP)
			op_own |= cpu_to_be32(MLX4_WQE_CTRL_IIP | MLX4_WQE_CTRL_ILP);
		else
			op_own |= cpu_to_be32(MLX4_WQE_CTRL_IIP);
	}

	ring->prod += nr_txbb;

	/* If we used a bounce buffer then copy descriptor back into place */
	if (unlikely(bounce))
		tx_desc = mlx4_en_bounce_to_desc(priv, ring, index, desc_size);

	skb_tx_timestamp(skb);

	send_doorbell = !skb->xmit_more || netif_xmit_stopped(ring->tx_queue);

	real_size = (real_size / 16) & 0x3f;

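	/* BlueFlame path: for a small descriptor that did not need the bounce
	 * buffer, carries no VLAN tag and is due a doorbell, copy the whole
	 * descriptor through the BlueFlame register; otherwise finish the
	 * ctrl segment and, if needed, ring the regular doorbell via the UAR.
	 */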
	if (ring->bf_enabled && desc_size <= MAX_BF && !bounce &&
	    !vlan_tx_tag_present(skb) && send_doorbell) {
		tx_desc->ctrl.bf_qpn = ring->doorbell_qpn |
				       cpu_to_be32(real_size);

		op_own |= htonl((bf_index & 0xffff) << 8);
		/* Ensure new descriptor hits memory
		 * before setting ownership of this descriptor to HW
		 */
		wmb();
		tx_desc->ctrl.owner_opcode = op_own;

		wmb();

		mlx4_bf_copy(ring->bf.reg + ring->bf.offset, &tx_desc->ctrl,
			     desc_size);

		wmb();

		ring->bf.offset ^= ring->bf.buf_size;
	} else {
		tx_desc->ctrl.vlan_tag = cpu_to_be16(vlan_tag);
		tx_desc->ctrl.ins_vlan = MLX4_WQE_CTRL_INS_VLAN *
			!!vlan_tx_tag_present(skb);
		tx_desc->ctrl.fence_size = real_size;

		/* Ensure new descriptor hits memory
		 * before setting ownership of this descriptor to HW
		 */
		wmb();
		tx_desc->ctrl.owner_opcode = op_own;
		if (send_doorbell) {
			wmb();
			iowrite32(ring->doorbell_qpn,
				  ring->bf.uar->map + MLX4_SEND_DOORBELL);
		} else {
			ring->xmit_more++;
		}
	}

	return NETDEV_TX_OK;

tx_drop_unmap:
	en_err(priv, "DMA mapping error\n");

	while (++i_frag < shinfo->nr_frags) {
		++data;
		dma_unmap_page(ddev, (dma_addr_t) be64_to_cpu(data->addr),
			       be32_to_cpu(data->byte_count),
			       PCI_DMA_TODEVICE);
	}

tx_drop:
	dev_kfree_skb_any(skb);
	priv->stats.tx_dropped++;
	return NETDEV_TX_OK;
}