Commit 4e62ccd9 authored by David S. Miller

Merge branch 'mlx4-next'

Amir Vadai says:

====================
net/mlx4_en: Optimizations to TX flow

This patchset contains optimizations to the TX flow in the mlx4_en driver. It also
introduces setting/getting the tx copybreak, to enable controlling the inline threshold dynamically.

The TX flow optimizations were authored and posted to the mailing list by Eric
Dumazet [1] as a single patch. I split that patch into smaller patches,
reviewed them, and tested them.
Changes from the original patch:
- s/iowrite32be/iowrite32/, since ring->doorbell_qpn is stored as be32

The tx copybreak patch was also suggested by Eric Dumazet, and was edited and
reviewed by me. The user space patch will be sent after the kernel code is ready.

I am sending this patchset now since the merge window is near and I don't want
to miss it.

More work still needs to be done:
- Disable BF when xmit_more is in use
- Make TSO use xmit_more too, maybe by splitting small TSO packets in the
  driver itself, to avoid the extra cpu/memory costs of GSO before the driver
- Fix mlx4_en_xmit()'s buggy handling of a queue becoming full in the middle of
  a burst that was partially posted to the send queue using xmit_more

Eric, I edited the patches to have you as the Author and the first
Signed-off-by. I hope that is ok with you (I wasn't sure whether it is ok to
sign on your behalf); in any case, all the credit for those changes should go to you.

The patchset was tested and applied over commit 1e203c1a ("net: sched:
suspicious RCU usage in qdisc_watchdog")

[1] - https://patchwork.ozlabs.org/patch/394256/
====================
Signed-off-by: David S. Miller <davem@davemloft.net>
@@ -1267,6 +1267,48 @@ static u32 mlx4_en_get_priv_flags(struct net_device *dev)
 	return priv->pflags;
 }
 
+static int mlx4_en_get_tunable(struct net_device *dev,
+			       const struct ethtool_tunable *tuna,
+			       void *data)
+{
+	const struct mlx4_en_priv *priv = netdev_priv(dev);
+	int ret = 0;
+
+	switch (tuna->id) {
+	case ETHTOOL_TX_COPYBREAK:
+		*(u32 *)data = priv->prof->inline_thold;
+		break;
+	default:
+		ret = -EINVAL;
+		break;
+	}
+
+	return ret;
+}
+
+static int mlx4_en_set_tunable(struct net_device *dev,
+			       const struct ethtool_tunable *tuna,
+			       const void *data)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	int val, ret = 0;
+
+	switch (tuna->id) {
+	case ETHTOOL_TX_COPYBREAK:
+		val = *(u32 *)data;
+		if (val < MIN_PKT_LEN || val > MAX_INLINE)
+			ret = -EINVAL;
+		else
+			priv->prof->inline_thold = val;
+		break;
+	default:
+		ret = -EINVAL;
+		break;
+	}
+
+	return ret;
+}
+
 const struct ethtool_ops mlx4_en_ethtool_ops = {
 	.get_drvinfo = mlx4_en_get_drvinfo,
@@ -1297,6 +1339,8 @@ const struct ethtool_ops mlx4_en_ethtool_ops = {
 	.get_ts_info = mlx4_en_get_ts_info,
 	.set_priv_flags = mlx4_en_set_priv_flags,
 	.get_priv_flags = mlx4_en_get_priv_flags,
+	.get_tunable = mlx4_en_get_tunable,
+	.set_tunable = mlx4_en_set_tunable,
 };
...
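The get/set tunable hooks above expose the driver's inline threshold through the generic ethtool tunable interface. The matching ethtool user space patch mentioned in the cover letter is not part of this series, so until it lands the tunable can only be reached through the raw SIOCETHTOOL ioctl. The following minimal sketch shows how that could look; the wrapper program, the default interface name "eth0" and the example value 104 are illustrative assumptions, while the ioctl command numbers and the struct ethtool_tunable layout come from the existing ethtool UAPI (ETHTOOL_GTUNABLE works the same way for reading the value back).

/* Hypothetical helper, not part of this patchset: set the mlx4_en TX
 * copybreak (inline threshold) through the generic ETHTOOL_STUNABLE ioctl.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <net/if.h>
#include <linux/ethtool.h>
#include <linux/sockios.h>

int main(int argc, char **argv)
{
	const char *ifname = argc > 1 ? argv[1] : "eth0";  /* assumed name */
	__u32 val = argc > 2 ? (__u32)atoi(argv[2]) : 104; /* bytes to inline */
	struct {
		struct ethtool_tunable hdr;
		__u32 data;		/* payload follows the header */
	} req;
	struct ifreq ifr;
	int fd;

	memset(&req, 0, sizeof(req));
	req.hdr.cmd = ETHTOOL_STUNABLE;
	req.hdr.id = ETHTOOL_TX_COPYBREAK;	/* added by this patchset */
	req.hdr.type_id = ETHTOOL_TUNABLE_U32;
	req.hdr.len = sizeof(__u32);
	req.data = val;

	memset(&ifr, 0, sizeof(ifr));
	strncpy(ifr.ifr_name, ifname, IFNAMSIZ - 1);
	ifr.ifr_data = (void *)&req;

	fd = socket(AF_INET, SOCK_DGRAM, 0);
	if (fd < 0 || ioctl(fd, SIOCETHTOOL, &ifr) < 0) {
		perror("ETHTOOL_STUNABLE");
		return 1;
	}
	printf("%s: tx-copybreak set to %u bytes\n", ifname, val);
	close(fd);
	return 0;
}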
@@ -37,6 +37,7 @@
 #include <linux/mlx4/qp.h>
 #include <linux/skbuff.h>
 #include <linux/if_vlan.h>
+#include <linux/prefetch.h>
 #include <linux/vmalloc.h>
 #include <linux/tcp.h>
 #include <linux/ip.h>
@@ -65,10 +66,9 @@ int mlx4_en_create_tx_ring(struct mlx4_en_priv *priv,
 	ring->size = size;
 	ring->size_mask = size - 1;
 	ring->stride = stride;
-	ring->inline_thold = priv->prof->inline_thold;
 
 	tmp = size * sizeof(struct mlx4_en_tx_info);
-	ring->tx_info = vmalloc_node(tmp, node);
+	ring->tx_info = kmalloc_node(tmp, GFP_KERNEL | __GFP_NOWARN, node);
 	if (!ring->tx_info) {
 		ring->tx_info = vmalloc(tmp);
 		if (!ring->tx_info) {
@@ -151,7 +151,7 @@ int mlx4_en_create_tx_ring(struct mlx4_en_priv *priv,
 	kfree(ring->bounce_buf);
 	ring->bounce_buf = NULL;
 err_info:
-	vfree(ring->tx_info);
+	kvfree(ring->tx_info);
 	ring->tx_info = NULL;
 err_ring:
 	kfree(ring);
@@ -174,7 +174,7 @@ void mlx4_en_destroy_tx_ring(struct mlx4_en_priv *priv,
 	mlx4_free_hwq_res(mdev->dev, &ring->wqres, ring->buf_size);
 	kfree(ring->bounce_buf);
 	ring->bounce_buf = NULL;
-	vfree(ring->tx_info);
+	kvfree(ring->tx_info);
 	ring->tx_info = NULL;
 	kfree(ring);
 	*pring = NULL;
@@ -191,12 +191,12 @@ int mlx4_en_activate_tx_ring(struct mlx4_en_priv *priv,
 	ring->prod = 0;
 	ring->cons = 0xffffffff;
 	ring->last_nr_txbb = 1;
-	ring->poll_cnt = 0;
 	memset(ring->tx_info, 0, ring->size * sizeof(struct mlx4_en_tx_info));
 	memset(ring->buf, 0, ring->buf_size);
 
 	ring->qp_state = MLX4_QP_STATE_RST;
-	ring->doorbell_qpn = ring->qp.qpn << 8;
+	ring->doorbell_qpn = cpu_to_be32(ring->qp.qpn << 8);
+	ring->mr_key = cpu_to_be32(mdev->mr.key);
 
 	mlx4_en_fill_qp_context(priv, ring->size, ring->stride, 1, 0, ring->qpn,
 				ring->cqn, user_prio, &ring->context);
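The doorbell_qpn conversion above is what the cover letter's "s/iowrite32be/iowrite32/" note refers to: the queue pair number is byte-swapped to device (big-endian) order once, at ring activation, so the per-packet doorbell write further down in mlx4_en_xmit() can be a plain 32-bit store. A tiny stand-alone sketch of the idea follows; struct tx_ring, ring_activate() and mmio_write32() are stand-ins invented for illustration, not driver code.

#include <stdint.h>
#include <arpa/inet.h>	/* htonl() stands in for the kernel's cpu_to_be32() */

struct tx_ring {
	uint32_t doorbell_qpn;	/* already stored in big-endian byte order */
};

/* Stand-in for iowrite32(): stores the value as-is, no byte swap. */
static inline void mmio_write32(volatile uint32_t *reg, uint32_t val)
{
	*reg = val;
}

/* Slow path, runs once per ring: do the byte swap up front. */
static void ring_activate(struct tx_ring *ring, uint32_t qpn)
{
	ring->doorbell_qpn = htonl(qpn << 8);
}

/* Hot path, runs per doorbell: no per-packet conversion needed. */
static void ring_doorbell(struct tx_ring *ring, volatile uint32_t *db_reg)
{
	mmio_write32(db_reg, ring->doorbell_qpn);
}

int main(void)
{
	uint32_t fake_doorbell_reg = 0;
	struct tx_ring ring;

	ring_activate(&ring, 0x123);
	ring_doorbell(&ring, &fake_doorbell_reg);
	return fake_doorbell_reg == htonl(0x123 << 8) ? 0 : 1;
}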
@@ -259,38 +259,45 @@ static u32 mlx4_en_free_tx_desc(struct mlx4_en_priv *priv,
 				 struct mlx4_en_tx_ring *ring,
 				 int index, u8 owner, u64 timestamp)
 {
-	struct mlx4_en_dev *mdev = priv->mdev;
 	struct mlx4_en_tx_info *tx_info = &ring->tx_info[index];
 	struct mlx4_en_tx_desc *tx_desc = ring->buf + index * TXBB_SIZE;
 	struct mlx4_wqe_data_seg *data = (void *) tx_desc + tx_info->data_offset;
-	struct sk_buff *skb = tx_info->skb;
-	struct skb_frag_struct *frag;
 	void *end = ring->buf + ring->buf_size;
-	int frags = skb_shinfo(skb)->nr_frags;
+	struct sk_buff *skb = tx_info->skb;
+	int nr_maps = tx_info->nr_maps;
 	int i;
-	struct skb_shared_hwtstamps hwts;
 
-	if (timestamp) {
-		mlx4_en_fill_hwtstamps(mdev, &hwts, timestamp);
+	/* We do not touch skb here, so prefetch skb->users location
+	 * to speedup consume_skb()
+	 */
+	prefetchw(&skb->users);
+
+	if (unlikely(timestamp)) {
+		struct skb_shared_hwtstamps hwts;
+
+		mlx4_en_fill_hwtstamps(priv->mdev, &hwts, timestamp);
 		skb_tstamp_tx(skb, &hwts);
 	}
 
 	/* Optimize the common case when there are no wraparounds */
 	if (likely((void *) tx_desc + tx_info->nr_txbb * TXBB_SIZE <= end)) {
 		if (!tx_info->inl) {
-			if (tx_info->linear) {
+			if (tx_info->linear)
 				dma_unmap_single(priv->ddev,
-					(dma_addr_t) be64_to_cpu(data->addr),
-					be32_to_cpu(data->byte_count),
-					PCI_DMA_TODEVICE);
-				++data;
-			}
-
-			for (i = 0; i < frags; i++) {
-				frag = &skb_shinfo(skb)->frags[i];
+						 tx_info->map0_dma,
+						 tx_info->map0_byte_count,
+						 PCI_DMA_TODEVICE);
+			else
+				dma_unmap_page(priv->ddev,
+					       tx_info->map0_dma,
+					       tx_info->map0_byte_count,
+					       PCI_DMA_TODEVICE);
+			for (i = 1; i < nr_maps; i++) {
+				data++;
 				dma_unmap_page(priv->ddev,
-					(dma_addr_t) be64_to_cpu(data[i].addr),
-					skb_frag_size(frag), PCI_DMA_TODEVICE);
+					(dma_addr_t)be64_to_cpu(data->addr),
+					be32_to_cpu(data->byte_count),
+					PCI_DMA_TODEVICE);
 			}
 		}
 	} else {
@@ -299,23 +306,25 @@ static u32 mlx4_en_free_tx_desc(struct mlx4_en_priv *priv,
 				data = ring->buf + ((void *)data - end);
 			}
 
-			if (tx_info->linear) {
+			if (tx_info->linear)
 				dma_unmap_single(priv->ddev,
-					(dma_addr_t) be64_to_cpu(data->addr),
-					be32_to_cpu(data->byte_count),
-					PCI_DMA_TODEVICE);
-				++data;
-			}
-
-			for (i = 0; i < frags; i++) {
+						 tx_info->map0_dma,
+						 tx_info->map0_byte_count,
+						 PCI_DMA_TODEVICE);
+			else
+				dma_unmap_page(priv->ddev,
+					       tx_info->map0_dma,
+					       tx_info->map0_byte_count,
+					       PCI_DMA_TODEVICE);
+			for (i = 1; i < nr_maps; i++) {
+				data++;
 				/* Check for wraparound before unmapping */
 				if ((void *) data >= end)
 					data = ring->buf;
-				frag = &skb_shinfo(skb)->frags[i];
 				dma_unmap_page(priv->ddev,
-					(dma_addr_t) be64_to_cpu(data->addr),
-					skb_frag_size(frag), PCI_DMA_TODEVICE);
-				++data;
+					(dma_addr_t)be64_to_cpu(data->addr),
+					be32_to_cpu(data->byte_count),
+					PCI_DMA_TODEVICE);
 			}
 		}
 	}
@@ -377,13 +386,18 @@ static bool mlx4_en_process_tx_cq(struct net_device *dev,
 	u64 timestamp = 0;
 	int done = 0;
 	int budget = priv->tx_work_limit;
+	u32 last_nr_txbb;
+	u32 ring_cons;
 
 	if (!priv->port_up)
 		return true;
 
+	prefetchw(&ring->tx_queue->dql.limit);
 	index = cons_index & size_mask;
 	cqe = mlx4_en_get_cqe(buf, index, priv->cqe_size) + factor;
-	ring_index = ring->cons & size_mask;
+	last_nr_txbb = ACCESS_ONCE(ring->last_nr_txbb);
+	ring_cons = ACCESS_ONCE(ring->cons);
+	ring_index = ring_cons & size_mask;
 	stamp_index = ring_index;
 
 	/* Process all completed CQEs */
@@ -408,19 +422,19 @@ static bool mlx4_en_process_tx_cq(struct net_device *dev,
 		new_index = be16_to_cpu(cqe->wqe_index) & size_mask;
 
 		do {
-			txbbs_skipped += ring->last_nr_txbb;
-			ring_index = (ring_index + ring->last_nr_txbb) & size_mask;
+			txbbs_skipped += last_nr_txbb;
+			ring_index = (ring_index + last_nr_txbb) & size_mask;
 			if (ring->tx_info[ring_index].ts_requested)
 				timestamp = mlx4_en_get_cqe_ts(cqe);
 
 			/* free next descriptor */
-			ring->last_nr_txbb = mlx4_en_free_tx_desc(
+			last_nr_txbb = mlx4_en_free_tx_desc(
 					priv, ring, ring_index,
-					!!((ring->cons + txbbs_skipped) &
+					!!((ring_cons + txbbs_skipped) &
 					ring->size), timestamp);
 
 			mlx4_en_stamp_wqe(priv, ring, stamp_index,
-					  !!((ring->cons + txbbs_stamp) &
+					  !!((ring_cons + txbbs_stamp) &
 					    ring->size));
 			stamp_index = ring_index;
 			txbbs_stamp = txbbs_skipped;
@@ -441,7 +455,11 @@ static bool mlx4_en_process_tx_cq(struct net_device *dev,
 	mcq->cons_index = cons_index;
 	mlx4_cq_set_ci(mcq);
 	wmb();
-	ring->cons += txbbs_skipped;
+
+	/* we want to dirty this cache line once */
+	ACCESS_ONCE(ring->last_nr_txbb) = last_nr_txbb;
+	ACCESS_ONCE(ring->cons) = ring_cons + txbbs_skipped;
+
 	netdev_tx_completed_queue(ring->tx_queue, packets, bytes);
 
 	/*
@@ -512,30 +530,35 @@ static struct mlx4_en_tx_desc *mlx4_en_bounce_to_desc(struct mlx4_en_priv *priv,
 	return ring->buf + index * TXBB_SIZE;
 }
 
-static int is_inline(int inline_thold, struct sk_buff *skb, void **pfrag)
+/* Decide if skb can be inlined in tx descriptor to avoid dma mapping
+ *
+ * It seems strange we do not simply use skb_copy_bits().
+ * This would allow to inline all skbs iff skb->len <= inline_thold
+ *
+ * Note that caller already checked skb was not a gso packet
+ */
+static bool is_inline(int inline_thold, const struct sk_buff *skb,
+		      const struct skb_shared_info *shinfo,
+		      void **pfrag)
 {
 	void *ptr;
 
-	if (inline_thold && !skb_is_gso(skb) && skb->len <= inline_thold) {
-		if (skb_shinfo(skb)->nr_frags == 1) {
-			ptr = skb_frag_address_safe(&skb_shinfo(skb)->frags[0]);
-			if (unlikely(!ptr))
-				return 0;
-
-			if (pfrag)
-				*pfrag = ptr;
+	if (skb->len > inline_thold || !inline_thold)
+		return false;
 
-			return 1;
-		} else if (unlikely(skb_shinfo(skb)->nr_frags))
-			return 0;
-		else
-			return 1;
+	if (shinfo->nr_frags == 1) {
+		ptr = skb_frag_address_safe(&shinfo->frags[0]);
+		if (unlikely(!ptr))
+			return false;
+		*pfrag = ptr;
+		return true;
 	}
-
-	return 0;
+	if (shinfo->nr_frags)
+		return false;
+	return true;
 }
 
-static int inline_size(struct sk_buff *skb)
+static int inline_size(const struct sk_buff *skb)
 {
 	if (skb->len + CTRL_SIZE + sizeof(struct mlx4_wqe_inline_seg)
 	    <= MLX4_INLINE_ALIGN)
@@ -546,18 +569,23 @@ static int inline_size(struct sk_buff *skb)
 		       sizeof(struct mlx4_wqe_inline_seg), 16);
 }
 
-static int get_real_size(struct sk_buff *skb, struct net_device *dev,
-			 int *lso_header_size)
+static int get_real_size(const struct sk_buff *skb,
+			 const struct skb_shared_info *shinfo,
+			 struct net_device *dev,
+			 int *lso_header_size,
+			 bool *inline_ok,
+			 void **pfrag)
 {
 	struct mlx4_en_priv *priv = netdev_priv(dev);
 	int real_size;
 
-	if (skb_is_gso(skb)) {
+	if (shinfo->gso_size) {
+		*inline_ok = false;
 		if (skb->encapsulation)
 			*lso_header_size = (skb_inner_transport_header(skb) - skb->data) + inner_tcp_hdrlen(skb);
 		else
 			*lso_header_size = skb_transport_offset(skb) + tcp_hdrlen(skb);
-		real_size = CTRL_SIZE + skb_shinfo(skb)->nr_frags * DS_SIZE +
+		real_size = CTRL_SIZE + shinfo->nr_frags * DS_SIZE +
 			ALIGN(*lso_header_size + 4, DS_SIZE);
 		if (unlikely(*lso_header_size != skb_headlen(skb))) {
 			/* We add a segment for the skb linear buffer only if
@@ -572,20 +600,28 @@ static int get_real_size(struct sk_buff *skb, struct net_device *dev,
 		}
 	} else {
 		*lso_header_size = 0;
-		if (!is_inline(priv->prof->inline_thold, skb, NULL))
-			real_size = CTRL_SIZE + (skb_shinfo(skb)->nr_frags + 1) * DS_SIZE;
-		else
+		*inline_ok = is_inline(priv->prof->inline_thold, skb,
+				       shinfo, pfrag);
+
+		if (*inline_ok)
 			real_size = inline_size(skb);
+		else
+			real_size = CTRL_SIZE +
+				    (shinfo->nr_frags + 1) * DS_SIZE;
 	}
 
 	return real_size;
 }
 
-static void build_inline_wqe(struct mlx4_en_tx_desc *tx_desc, struct sk_buff *skb,
-			     int real_size, u16 *vlan_tag, int tx_ind, void *fragptr)
+static void build_inline_wqe(struct mlx4_en_tx_desc *tx_desc,
+			     const struct sk_buff *skb,
+			     const struct skb_shared_info *shinfo,
+			     int real_size, u16 *vlan_tag,
+			     int tx_ind, void *fragptr)
 {
 	struct mlx4_wqe_inline_seg *inl = &tx_desc->inl;
 	int spc = MLX4_INLINE_ALIGN - CTRL_SIZE - sizeof *inl;
+	unsigned int hlen = skb_headlen(skb);
 
 	if (skb->len <= spc) {
 		if (likely(skb->len >= MIN_PKT_LEN)) {
@@ -595,19 +631,19 @@ static void build_inline_wqe(struct mlx4_en_tx_desc *tx_desc, struct sk_buff *sk
 			memset(((void *)(inl + 1)) + skb->len, 0,
 			       MIN_PKT_LEN - skb->len);
 		}
-		skb_copy_from_linear_data(skb, inl + 1, skb_headlen(skb));
-		if (skb_shinfo(skb)->nr_frags)
-			memcpy(((void *)(inl + 1)) + skb_headlen(skb), fragptr,
-			       skb_frag_size(&skb_shinfo(skb)->frags[0]));
+		skb_copy_from_linear_data(skb, inl + 1, hlen);
+		if (shinfo->nr_frags)
+			memcpy(((void *)(inl + 1)) + hlen, fragptr,
+			       skb_frag_size(&shinfo->frags[0]));
 	} else {
 		inl->byte_count = cpu_to_be32(1 << 31 | spc);
-		if (skb_headlen(skb) <= spc) {
-			skb_copy_from_linear_data(skb, inl + 1, skb_headlen(skb));
-			if (skb_headlen(skb) < spc) {
-				memcpy(((void *)(inl + 1)) + skb_headlen(skb),
-				       fragptr, spc - skb_headlen(skb));
-				fragptr += spc - skb_headlen(skb);
+		if (hlen <= spc) {
+			skb_copy_from_linear_data(skb, inl + 1, hlen);
+			if (hlen < spc) {
+				memcpy(((void *)(inl + 1)) + hlen,
+				       fragptr, spc - hlen);
+				fragptr += spc - hlen;
 			}
 			inl = (void *) (inl + 1) + spc;
 			memcpy(((void *)(inl + 1)), fragptr, skb->len - spc);
@@ -615,10 +651,11 @@ static void build_inline_wqe(struct mlx4_en_tx_desc *tx_desc, struct sk_buff *sk
 			skb_copy_from_linear_data(skb, inl + 1, spc);
 			inl = (void *) (inl + 1) + spc;
 			skb_copy_from_linear_data_offset(skb, spc, inl + 1,
-					skb_headlen(skb) - spc);
-			if (skb_shinfo(skb)->nr_frags)
-				memcpy(((void *)(inl + 1)) + skb_headlen(skb) - spc,
-				       fragptr, skb_frag_size(&skb_shinfo(skb)->frags[0]));
+							 hlen - spc);
+			if (shinfo->nr_frags)
+				memcpy(((void *)(inl + 1)) + hlen - spc,
+				       fragptr,
+				       skb_frag_size(&shinfo->frags[0]));
 		}
 
 		wmb();
@@ -642,15 +679,16 @@ u16 mlx4_en_select_queue(struct net_device *dev, struct sk_buff *skb,
 	return fallback(dev, skb) % rings_p_up + up * rings_p_up;
 }
 
-static void mlx4_bf_copy(void __iomem *dst, unsigned long *src, unsigned bytecnt)
+static void mlx4_bf_copy(void __iomem *dst, const void *src,
+			 unsigned int bytecnt)
 {
 	__iowrite64_copy(dst, src, bytecnt / 8);
 }
 
 netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev)
 {
+	struct skb_shared_info *shinfo = skb_shinfo(skb);
 	struct mlx4_en_priv *priv = netdev_priv(dev);
-	struct mlx4_en_dev *mdev = priv->mdev;
 	struct device *ddev = priv->ddev;
 	struct mlx4_en_tx_ring *ring;
 	struct mlx4_en_tx_desc *tx_desc;
@@ -663,16 +701,25 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev)
 	u32 index, bf_index;
 	__be32 op_own;
 	u16 vlan_tag = 0;
-	int i;
+	int i_frag;
 	int lso_header_size;
-	void *fragptr;
+	void *fragptr = NULL;
 	bool bounce = false;
 	bool send_doorbell;
+	bool inline_ok;
+	u32 ring_cons;
 
 	if (!priv->port_up)
 		goto tx_drop;
 
-	real_size = get_real_size(skb, dev, &lso_header_size);
+	tx_ind = skb_get_queue_mapping(skb);
+	ring = priv->tx_ring[tx_ind];
+
+	/* fetch ring->cons far ahead before needing it to avoid stall */
+	ring_cons = ACCESS_ONCE(ring->cons);
+
+	real_size = get_real_size(skb, shinfo, dev, &lso_header_size,
+				  &inline_ok, &fragptr);
 	if (unlikely(!real_size))
 		goto tx_drop;
 
@@ -685,13 +732,11 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev)
 		goto tx_drop;
 	}
 
-	tx_ind = skb->queue_mapping;
-	ring = priv->tx_ring[tx_ind];
 	if (vlan_tx_tag_present(skb))
 		vlan_tag = vlan_tx_tag_get(skb);
 
 	/* Check available TXBBs And 2K spare for prefetch */
-	if (unlikely(((int)(ring->prod - ring->cons)) >
+	if (unlikely(((int)(ring->prod - ring_cons)) >
 		     ring->size - HEADROOM - MAX_DESC_TXBBS)) {
 		/* every full Tx ring stops queue */
 		netif_tx_stop_queue(ring->tx_queue);
@@ -705,7 +750,8 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev)
 		 */
 		wmb();
 
-		if (unlikely(((int)(ring->prod - ring->cons)) <=
+		ring_cons = ACCESS_ONCE(ring->cons);
+		if (unlikely(((int)(ring->prod - ring_cons)) <=
 			     ring->size - HEADROOM - MAX_DESC_TXBBS)) {
 			netif_tx_wake_queue(ring->tx_queue);
 			ring->wake_queue++;
@@ -714,9 +760,11 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev)
 		}
 	}
 
+	prefetchw(&ring->tx_queue->dql);
+
 	/* Track current inflight packets for performance analysis */
 	AVG_PERF_COUNTER(priv->pstats.inflight_avg,
-			 (u32) (ring->prod - ring->cons - 1));
+			 (u32)(ring->prod - ring_cons - 1));
 
 	/* Packet is good - grab an index and transmit it */
 	index = ring->prod & ring->size_mask;
@@ -736,46 +784,48 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev)
 	tx_info->skb = skb;
 	tx_info->nr_txbb = nr_txbb;
 
+	data = &tx_desc->data;
 	if (lso_header_size)
 		data = ((void *)&tx_desc->lso + ALIGN(lso_header_size + 4,
 						      DS_SIZE));
-	else
-		data = &tx_desc->data;
 
 	/* valid only for none inline segments */
 	tx_info->data_offset = (void *)data - (void *)tx_desc;
 
+	tx_info->inl = inline_ok;
+
 	tx_info->linear = (lso_header_size < skb_headlen(skb) &&
-			   !is_inline(ring->inline_thold, skb, NULL)) ? 1 : 0;
+			   !inline_ok) ? 1 : 0;
 
-	data += skb_shinfo(skb)->nr_frags + tx_info->linear - 1;
+	tx_info->nr_maps = shinfo->nr_frags + tx_info->linear;
+	data += tx_info->nr_maps - 1;
 
-	if (is_inline(ring->inline_thold, skb, &fragptr)) {
-		tx_info->inl = 1;
-	} else {
-		/* Map fragments */
-		for (i = skb_shinfo(skb)->nr_frags - 1; i >= 0; i--) {
-			struct skb_frag_struct *frag;
-			dma_addr_t dma;
+	if (!tx_info->inl) {
+		dma_addr_t dma = 0;
+		u32 byte_count = 0;
+
+		/* Map fragments if any */
+		for (i_frag = shinfo->nr_frags - 1; i_frag >= 0; i_frag--) {
+			const struct skb_frag_struct *frag;
 
-			frag = &skb_shinfo(skb)->frags[i];
+			frag = &shinfo->frags[i_frag];
+			byte_count = skb_frag_size(frag);
 			dma = skb_frag_dma_map(ddev, frag,
-					       0, skb_frag_size(frag),
+					       0, byte_count,
 					       DMA_TO_DEVICE);
 			if (dma_mapping_error(ddev, dma))
 				goto tx_drop_unmap;
 
 			data->addr = cpu_to_be64(dma);
-			data->lkey = cpu_to_be32(mdev->mr.key);
+			data->lkey = ring->mr_key;
 			wmb();
-			data->byte_count = cpu_to_be32(skb_frag_size(frag));
+			data->byte_count = cpu_to_be32(byte_count);
 			--data;
 		}
 
-		/* Map linear part */
+		/* Map linear part if needed */
 		if (tx_info->linear) {
-			u32 byte_count = skb_headlen(skb) - lso_header_size;
-			dma_addr_t dma;
+			byte_count = skb_headlen(skb) - lso_header_size;
 
 			dma = dma_map_single(ddev, skb->data +
 					     lso_header_size, byte_count,
@@ -784,29 +834,28 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev)
 				goto tx_drop_unmap;
 
 			data->addr = cpu_to_be64(dma);
-			data->lkey = cpu_to_be32(mdev->mr.key);
+			data->lkey = ring->mr_key;
 			wmb();
 			data->byte_count = cpu_to_be32(byte_count);
 		}
-		tx_info->inl = 0;
+		/* tx completion can avoid cache line miss for common cases */
+		tx_info->map0_dma = dma;
+		tx_info->map0_byte_count = byte_count;
 	}
 
 	/*
 	 * For timestamping add flag to skb_shinfo and
 	 * set flag for further reference
 	 */
-	if (ring->hwtstamp_tx_type == HWTSTAMP_TX_ON &&
-	    skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP) {
-		skb_shinfo(skb)->tx_flags |= SKBTX_IN_PROGRESS;
+	tx_info->ts_requested = 0;
+	if (unlikely(ring->hwtstamp_tx_type == HWTSTAMP_TX_ON &&
+		     shinfo->tx_flags & SKBTX_HW_TSTAMP)) {
+		shinfo->tx_flags |= SKBTX_IN_PROGRESS;
 		tx_info->ts_requested = 1;
 	}
 
 	/* Prepare ctrl segement apart opcode+ownership, which depends on
 	 * whether LSO is used */
-	tx_desc->ctrl.vlan_tag = cpu_to_be16(vlan_tag);
-	tx_desc->ctrl.ins_vlan = MLX4_WQE_CTRL_INS_VLAN *
-		!!vlan_tx_tag_present(skb);
-	tx_desc->ctrl.fence_size = (real_size / 16) & 0x3f;
 	tx_desc->ctrl.srcrb_flags = priv->ctrl_flags;
 	if (likely(skb->ip_summed == CHECKSUM_PARTIAL)) {
 		tx_desc->ctrl.srcrb_flags |= cpu_to_be32(MLX4_WQE_CTRL_IP_CSUM |
@@ -827,6 +876,8 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev)
 
 	/* Handle LSO (TSO) packets */
 	if (lso_header_size) {
+		int i;
+
 		/* Mark opcode as LSO */
 		op_own = cpu_to_be32(MLX4_OPCODE_LSO | (1 << 6)) |
 			((ring->prod & ring->size) ?
@@ -834,15 +885,16 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev)
 
 		/* Fill in the LSO prefix */
 		tx_desc->lso.mss_hdr_size = cpu_to_be32(
-			skb_shinfo(skb)->gso_size << 16 | lso_header_size);
+			shinfo->gso_size << 16 | lso_header_size);
 
 		/* Copy headers;
 		 * note that we already verified that it is linear */
 		memcpy(tx_desc->lso.header, skb->data, lso_header_size);
 
 		ring->tso_packets++;
-		i = ((skb->len - lso_header_size) / skb_shinfo(skb)->gso_size) +
-			!!((skb->len - lso_header_size) % skb_shinfo(skb)->gso_size);
+
+		i = ((skb->len - lso_header_size) / shinfo->gso_size) +
+			!!((skb->len - lso_header_size) % shinfo->gso_size);
 		tx_info->nr_bytes = skb->len + (i - 1) * lso_header_size;
 		ring->packets += i;
 	} else {
@@ -852,16 +904,14 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev)
 			 cpu_to_be32(MLX4_EN_BIT_DESC_OWN) : 0);
 		tx_info->nr_bytes = max_t(unsigned int, skb->len, ETH_ZLEN);
 		ring->packets++;
 	}
 	ring->bytes += tx_info->nr_bytes;
 	netdev_tx_sent_queue(ring->tx_queue, tx_info->nr_bytes);
 	AVG_PERF_COUNTER(priv->pstats.tx_pktsz_avg, skb->len);
 
-	if (tx_info->inl) {
-		build_inline_wqe(tx_desc, skb, real_size, &vlan_tag, tx_ind, fragptr);
-		tx_info->inl = 1;
-	}
+	if (tx_info->inl)
+		build_inline_wqe(tx_desc, skb, shinfo, real_size, &vlan_tag,
+				 tx_ind, fragptr);
 
 	if (skb->encapsulation) {
 		struct iphdr *ipv4 = (struct iphdr *)skb_inner_network_header(skb);
@@ -874,16 +924,19 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev)
 	ring->prod += nr_txbb;
 
 	/* If we used a bounce buffer then copy descriptor back into place */
-	if (bounce)
+	if (unlikely(bounce))
 		tx_desc = mlx4_en_bounce_to_desc(priv, ring, index, desc_size);
 
 	skb_tx_timestamp(skb);
 
 	send_doorbell = !skb->xmit_more || netif_xmit_stopped(ring->tx_queue);
 
+	real_size = (real_size / 16) & 0x3f;
+
 	if (ring->bf_enabled && desc_size <= MAX_BF && !bounce &&
 	    !vlan_tx_tag_present(skb) && send_doorbell) {
-		tx_desc->ctrl.bf_qpn |= cpu_to_be32(ring->doorbell_qpn);
+		tx_desc->ctrl.bf_qpn = ring->doorbell_qpn |
+				       cpu_to_be32(real_size);
 
 		op_own |= htonl((bf_index & 0xffff) << 8);
 		/* Ensure new descriptor hits memory
@@ -894,13 +947,18 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev)
 		 */
 		wmb();
 
-		mlx4_bf_copy(ring->bf.reg + ring->bf.offset, (unsigned long *) &tx_desc->ctrl,
+		mlx4_bf_copy(ring->bf.reg + ring->bf.offset, &tx_desc->ctrl,
 			     desc_size);
 
 		wmb();
 
 		ring->bf.offset ^= ring->bf.buf_size;
 	} else {
+		tx_desc->ctrl.vlan_tag = cpu_to_be16(vlan_tag);
+		tx_desc->ctrl.ins_vlan = MLX4_WQE_CTRL_INS_VLAN *
+			!!vlan_tx_tag_present(skb);
+		tx_desc->ctrl.fence_size = real_size;
+
 		/* Ensure new descriptor hits memory
 		 * before setting ownership of this descriptor to HW
 		 */
@@ -908,8 +966,8 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev)
 		tx_desc->ctrl.owner_opcode = op_own;
 		if (send_doorbell) {
 			wmb();
-			iowrite32be(ring->doorbell_qpn,
-				    ring->bf.uar->map + MLX4_SEND_DOORBELL);
+			iowrite32(ring->doorbell_qpn,
+				  ring->bf.uar->map + MLX4_SEND_DOORBELL);
 		} else {
 			ring->xmit_more++;
 		}
@@ -920,8 +978,8 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev)
 
 tx_drop_unmap:
 	en_err(priv, "DMA mapping error\n");
 
-	for (i++; i < skb_shinfo(skb)->nr_frags; i++) {
-		data++;
+	while (++i_frag < shinfo->nr_frags) {
+		++data;
 		dma_unmap_page(ddev, (dma_addr_t) be64_to_cpu(data->addr),
 			       be32_to_cpu(data->byte_count),
 			       PCI_DMA_TODEVICE);
...
@@ -216,13 +216,16 @@ enum cq_type {
 struct mlx4_en_tx_info {
 	struct sk_buff *skb;
-	u32 nr_txbb;
-	u32 nr_bytes;
-	u8 linear;
-	u8 data_offset;
-	u8 inl;
-	u8 ts_requested;
-};
+	dma_addr_t	map0_dma;
+	u32		map0_byte_count;
+	u32		nr_txbb;
+	u32		nr_bytes;
+	u8		linear;
+	u8		data_offset;
+	u8		inl;
+	u8		ts_requested;
+	u8		nr_maps;
+} ____cacheline_aligned_in_smp;
 
 #define MLX4_EN_BIT_DESC_OWN	0x80000000
@@ -253,41 +256,46 @@ struct mlx4_en_rx_alloc {
 };
 
 struct mlx4_en_tx_ring {
+	/* cache line used and dirtied in tx completion
+	 * (mlx4_en_free_tx_buf())
+	 */
+	u32			last_nr_txbb;
+	u32			cons;
+	unsigned long		wake_queue;
+
+	/* cache line used and dirtied in mlx4_en_xmit() */
+	u32			prod ____cacheline_aligned_in_smp;
+	unsigned long		bytes;
+	unsigned long		packets;
+	unsigned long		tx_csum;
+	unsigned long		tso_packets;
+	unsigned long		xmit_more;
+	struct mlx4_bf		bf;
+	unsigned long		queue_stopped;
+
+	/* Following part should be mostly read */
+	cpumask_t		affinity_mask;
+	struct mlx4_qp		qp;
 	struct mlx4_hwq_resources wqres;
-	u32 size ; /* number of TXBBs */
+	u32			size; /* number of TXBBs */
 	u32 size_mask;
 	u16 stride;
 	u16 cqn;	/* index of port CQ associated with this ring */
-	u32 prod;
-	u32 cons;
-	u32 buf_size;
-	u32 doorbell_qpn;
-	void *buf;
-	u16 poll_cnt;
-	struct mlx4_en_tx_info *tx_info;
-	u8 *bounce_buf;
-	u8 queue_index;
-	cpumask_t affinity_mask;
-	u32 last_nr_txbb;
-	struct mlx4_qp qp;
-	struct mlx4_qp_context context;
-	int qpn;
-	enum mlx4_qp_state qp_state;
-	struct mlx4_srq dummy;
-	unsigned long bytes;
-	unsigned long packets;
-	unsigned long tx_csum;
-	unsigned long queue_stopped;
-	unsigned long wake_queue;
-	unsigned long tso_packets;
-	unsigned long xmit_more;
-	struct mlx4_bf bf;
-	bool bf_enabled;
-	bool bf_alloced;
-	struct netdev_queue *tx_queue;
-	int hwtstamp_tx_type;
-	int inline_thold;
-};
+	u32			buf_size;
+	__be32			doorbell_qpn;
+	__be32			mr_key;
+	void			*buf;
+	struct mlx4_en_tx_info	*tx_info;
+	u8			*bounce_buf;
+	struct mlx4_qp_context	context;
+	int			qpn;
+	enum mlx4_qp_state	qp_state;
+	u8			queue_index;
+	bool			bf_enabled;
+	bool			bf_alloced;
+	struct netdev_queue	*tx_queue;
+	int			hwtstamp_tx_type;
+} ____cacheline_aligned_in_smp;
 
 struct mlx4_en_rx_desc {
 	/* actual number of entries depends on rx ring stride */
...
@@ -583,7 +583,7 @@ struct mlx4_uar {
 };
 
 struct mlx4_bf {
-	unsigned long offset;
+	unsigned int offset;
 	int buf_size;
 	struct mlx4_uar *uar;
 	void __iomem *reg;
...
@@ -212,6 +212,7 @@ struct ethtool_value {
 enum tunable_id {
 	ETHTOOL_ID_UNSPEC,
 	ETHTOOL_RX_COPYBREAK,
+	ETHTOOL_TX_COPYBREAK,
 };
 
 enum tunable_type_id {
...
@@ -1625,6 +1625,7 @@ static int ethtool_tunable_valid(const struct ethtool_tunable *tuna)
 {
 	switch (tuna->id) {
 	case ETHTOOL_RX_COPYBREAK:
+	case ETHTOOL_TX_COPYBREAK:
 		if (tuna->len != sizeof(u32) ||
 		    tuna->type_id != ETHTOOL_TUNABLE_U32)
 			return -EINVAL;
...