提交 c4268778 编写于 作者: Y Yuval Shaia 提交者: Doug Ledford

IB/ipoib: Scatter-Gather support in connected mode

By default, IPoIB-CM driver uses 64k MTU. Larger MTU gives better
performance.
This MTU plus overhead puts the memory allocation for IP based packets at
32 4k pages (order 5), which have to be contiguous.
When the system memory under pressure, it was observed that allocating 128k
contiguous physical memory is difficult and causes serious errors (such as
system becomes unusable).

This enhancement resolve the issue by removing the physically contiguous
memory requirement using Scatter/Gather feature that exists in Linux stack.

With this fix Scatter-Gather will be supported also in connected mode.

This change reverts some of the change made in commit e112373f
("IPoIB/cm: Reduce connected mode TX object size").

The ability to use SG in IPoIB CM is possible because the coupling
between NETIF_F_SG and NETIF_F_CSUM was removed in commit
ec5f0615 ("net: Kill link between CSUM and SG features.")
Signed-off-by: NYuval Shaia <yuval.shaia@oracle.com>
Acked-by: NChristian Marie <christian@ponies.io>
Signed-off-by: NDoug Ledford <dledford@redhat.com>
上级 59d40dd9
...@@ -239,7 +239,7 @@ struct ipoib_cm_tx { ...@@ -239,7 +239,7 @@ struct ipoib_cm_tx {
struct net_device *dev; struct net_device *dev;
struct ipoib_neigh *neigh; struct ipoib_neigh *neigh;
struct ipoib_path *path; struct ipoib_path *path;
struct ipoib_cm_tx_buf *tx_ring; struct ipoib_tx_buf *tx_ring;
unsigned tx_head; unsigned tx_head;
unsigned tx_tail; unsigned tx_tail;
unsigned long flags; unsigned long flags;
...@@ -504,6 +504,33 @@ int ipoib_mcast_stop_thread(struct net_device *dev); ...@@ -504,6 +504,33 @@ int ipoib_mcast_stop_thread(struct net_device *dev);
void ipoib_mcast_dev_down(struct net_device *dev); void ipoib_mcast_dev_down(struct net_device *dev);
void ipoib_mcast_dev_flush(struct net_device *dev); void ipoib_mcast_dev_flush(struct net_device *dev);
int ipoib_dma_map_tx(struct ib_device *ca, struct ipoib_tx_buf *tx_req);
void ipoib_dma_unmap_tx(struct ipoib_dev_priv *priv,
struct ipoib_tx_buf *tx_req);
static inline void ipoib_build_sge(struct ipoib_dev_priv *priv,
struct ipoib_tx_buf *tx_req)
{
int i, off;
struct sk_buff *skb = tx_req->skb;
skb_frag_t *frags = skb_shinfo(skb)->frags;
int nr_frags = skb_shinfo(skb)->nr_frags;
u64 *mapping = tx_req->mapping;
if (skb_headlen(skb)) {
priv->tx_sge[0].addr = mapping[0];
priv->tx_sge[0].length = skb_headlen(skb);
off = 1;
} else
off = 0;
for (i = 0; i < nr_frags; ++i) {
priv->tx_sge[i + off].addr = mapping[i + off];
priv->tx_sge[i + off].length = skb_frag_size(&frags[i]);
}
priv->tx_wr.num_sge = nr_frags + off;
}
#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
struct ipoib_mcast_iter *ipoib_mcast_iter_init(struct net_device *dev); struct ipoib_mcast_iter *ipoib_mcast_iter_init(struct net_device *dev);
int ipoib_mcast_iter_next(struct ipoib_mcast_iter *iter); int ipoib_mcast_iter_next(struct ipoib_mcast_iter *iter);
......
...@@ -694,14 +694,12 @@ void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc) ...@@ -694,14 +694,12 @@ void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc)
static inline int post_send(struct ipoib_dev_priv *priv, static inline int post_send(struct ipoib_dev_priv *priv,
struct ipoib_cm_tx *tx, struct ipoib_cm_tx *tx,
unsigned int wr_id, unsigned int wr_id,
u64 addr, int len) struct ipoib_tx_buf *tx_req)
{ {
struct ib_send_wr *bad_wr; struct ib_send_wr *bad_wr;
priv->tx_sge[0].addr = addr; ipoib_build_sge(priv, tx_req);
priv->tx_sge[0].length = len;
priv->tx_wr.num_sge = 1;
priv->tx_wr.wr_id = wr_id | IPOIB_OP_CM; priv->tx_wr.wr_id = wr_id | IPOIB_OP_CM;
return ib_post_send(tx->qp, &priv->tx_wr, &bad_wr); return ib_post_send(tx->qp, &priv->tx_wr, &bad_wr);
...@@ -710,8 +708,7 @@ static inline int post_send(struct ipoib_dev_priv *priv, ...@@ -710,8 +708,7 @@ static inline int post_send(struct ipoib_dev_priv *priv,
void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_cm_tx *tx) void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_cm_tx *tx)
{ {
struct ipoib_dev_priv *priv = netdev_priv(dev); struct ipoib_dev_priv *priv = netdev_priv(dev);
struct ipoib_cm_tx_buf *tx_req; struct ipoib_tx_buf *tx_req;
u64 addr;
int rc; int rc;
if (unlikely(skb->len > tx->mtu)) { if (unlikely(skb->len > tx->mtu)) {
...@@ -735,24 +732,21 @@ void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_cm_ ...@@ -735,24 +732,21 @@ void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_cm_
*/ */
tx_req = &tx->tx_ring[tx->tx_head & (ipoib_sendq_size - 1)]; tx_req = &tx->tx_ring[tx->tx_head & (ipoib_sendq_size - 1)];
tx_req->skb = skb; tx_req->skb = skb;
addr = ib_dma_map_single(priv->ca, skb->data, skb->len, DMA_TO_DEVICE);
if (unlikely(ib_dma_mapping_error(priv->ca, addr))) { if (unlikely(ipoib_dma_map_tx(priv->ca, tx_req))) {
++dev->stats.tx_errors; ++dev->stats.tx_errors;
dev_kfree_skb_any(skb); dev_kfree_skb_any(skb);
return; return;
} }
tx_req->mapping = addr;
skb_orphan(skb); skb_orphan(skb);
skb_dst_drop(skb); skb_dst_drop(skb);
rc = post_send(priv, tx, tx->tx_head & (ipoib_sendq_size - 1), rc = post_send(priv, tx, tx->tx_head & (ipoib_sendq_size - 1), tx_req);
addr, skb->len);
if (unlikely(rc)) { if (unlikely(rc)) {
ipoib_warn(priv, "post_send failed, error %d\n", rc); ipoib_warn(priv, "post_send failed, error %d\n", rc);
++dev->stats.tx_errors; ++dev->stats.tx_errors;
ib_dma_unmap_single(priv->ca, addr, skb->len, DMA_TO_DEVICE); ipoib_dma_unmap_tx(priv, tx_req);
dev_kfree_skb_any(skb); dev_kfree_skb_any(skb);
} else { } else {
dev->trans_start = jiffies; dev->trans_start = jiffies;
...@@ -777,7 +771,7 @@ void ipoib_cm_handle_tx_wc(struct net_device *dev, struct ib_wc *wc) ...@@ -777,7 +771,7 @@ void ipoib_cm_handle_tx_wc(struct net_device *dev, struct ib_wc *wc)
struct ipoib_dev_priv *priv = netdev_priv(dev); struct ipoib_dev_priv *priv = netdev_priv(dev);
struct ipoib_cm_tx *tx = wc->qp->qp_context; struct ipoib_cm_tx *tx = wc->qp->qp_context;
unsigned int wr_id = wc->wr_id & ~IPOIB_OP_CM; unsigned int wr_id = wc->wr_id & ~IPOIB_OP_CM;
struct ipoib_cm_tx_buf *tx_req; struct ipoib_tx_buf *tx_req;
unsigned long flags; unsigned long flags;
ipoib_dbg_data(priv, "cm send completion: id %d, status: %d\n", ipoib_dbg_data(priv, "cm send completion: id %d, status: %d\n",
...@@ -791,7 +785,7 @@ void ipoib_cm_handle_tx_wc(struct net_device *dev, struct ib_wc *wc) ...@@ -791,7 +785,7 @@ void ipoib_cm_handle_tx_wc(struct net_device *dev, struct ib_wc *wc)
tx_req = &tx->tx_ring[wr_id]; tx_req = &tx->tx_ring[wr_id];
ib_dma_unmap_single(priv->ca, tx_req->mapping, tx_req->skb->len, DMA_TO_DEVICE); ipoib_dma_unmap_tx(priv, tx_req);
/* FIXME: is this right? Shouldn't we only increment on success? */ /* FIXME: is this right? Shouldn't we only increment on success? */
++dev->stats.tx_packets; ++dev->stats.tx_packets;
...@@ -1036,6 +1030,9 @@ static struct ib_qp *ipoib_cm_create_tx_qp(struct net_device *dev, struct ipoib_ ...@@ -1036,6 +1030,9 @@ static struct ib_qp *ipoib_cm_create_tx_qp(struct net_device *dev, struct ipoib_
struct ib_qp *tx_qp; struct ib_qp *tx_qp;
if (dev->features & NETIF_F_SG)
attr.cap.max_send_sge = MAX_SKB_FRAGS + 1;
tx_qp = ib_create_qp(priv->pd, &attr); tx_qp = ib_create_qp(priv->pd, &attr);
if (PTR_ERR(tx_qp) == -EINVAL) { if (PTR_ERR(tx_qp) == -EINVAL) {
ipoib_warn(priv, "can't use GFP_NOIO for QPs on device %s, using GFP_KERNEL\n", ipoib_warn(priv, "can't use GFP_NOIO for QPs on device %s, using GFP_KERNEL\n",
...@@ -1170,7 +1167,7 @@ static int ipoib_cm_tx_init(struct ipoib_cm_tx *p, u32 qpn, ...@@ -1170,7 +1167,7 @@ static int ipoib_cm_tx_init(struct ipoib_cm_tx *p, u32 qpn,
static void ipoib_cm_tx_destroy(struct ipoib_cm_tx *p) static void ipoib_cm_tx_destroy(struct ipoib_cm_tx *p)
{ {
struct ipoib_dev_priv *priv = netdev_priv(p->dev); struct ipoib_dev_priv *priv = netdev_priv(p->dev);
struct ipoib_cm_tx_buf *tx_req; struct ipoib_tx_buf *tx_req;
unsigned long begin; unsigned long begin;
ipoib_dbg(priv, "Destroy active connection 0x%x head 0x%x tail 0x%x\n", ipoib_dbg(priv, "Destroy active connection 0x%x head 0x%x tail 0x%x\n",
...@@ -1197,8 +1194,7 @@ static void ipoib_cm_tx_destroy(struct ipoib_cm_tx *p) ...@@ -1197,8 +1194,7 @@ static void ipoib_cm_tx_destroy(struct ipoib_cm_tx *p)
while ((int) p->tx_tail - (int) p->tx_head < 0) { while ((int) p->tx_tail - (int) p->tx_head < 0) {
tx_req = &p->tx_ring[p->tx_tail & (ipoib_sendq_size - 1)]; tx_req = &p->tx_ring[p->tx_tail & (ipoib_sendq_size - 1)];
ib_dma_unmap_single(priv->ca, tx_req->mapping, tx_req->skb->len, ipoib_dma_unmap_tx(priv, tx_req);
DMA_TO_DEVICE);
dev_kfree_skb_any(tx_req->skb); dev_kfree_skb_any(tx_req->skb);
++p->tx_tail; ++p->tx_tail;
netif_tx_lock_bh(p->dev); netif_tx_lock_bh(p->dev);
...@@ -1455,7 +1451,6 @@ static void ipoib_cm_stale_task(struct work_struct *work) ...@@ -1455,7 +1451,6 @@ static void ipoib_cm_stale_task(struct work_struct *work)
spin_unlock_irq(&priv->lock); spin_unlock_irq(&priv->lock);
} }
static ssize_t show_mode(struct device *d, struct device_attribute *attr, static ssize_t show_mode(struct device *d, struct device_attribute *attr,
char *buf) char *buf)
{ {
......
...@@ -263,8 +263,7 @@ static void ipoib_ib_handle_rx_wc(struct net_device *dev, struct ib_wc *wc) ...@@ -263,8 +263,7 @@ static void ipoib_ib_handle_rx_wc(struct net_device *dev, struct ib_wc *wc)
"for buf %d\n", wr_id); "for buf %d\n", wr_id);
} }
static int ipoib_dma_map_tx(struct ib_device *ca, int ipoib_dma_map_tx(struct ib_device *ca, struct ipoib_tx_buf *tx_req)
struct ipoib_tx_buf *tx_req)
{ {
struct sk_buff *skb = tx_req->skb; struct sk_buff *skb = tx_req->skb;
u64 *mapping = tx_req->mapping; u64 *mapping = tx_req->mapping;
...@@ -305,8 +304,8 @@ static int ipoib_dma_map_tx(struct ib_device *ca, ...@@ -305,8 +304,8 @@ static int ipoib_dma_map_tx(struct ib_device *ca,
return -EIO; return -EIO;
} }
static void ipoib_dma_unmap_tx(struct ib_device *ca, void ipoib_dma_unmap_tx(struct ipoib_dev_priv *priv,
struct ipoib_tx_buf *tx_req) struct ipoib_tx_buf *tx_req)
{ {
struct sk_buff *skb = tx_req->skb; struct sk_buff *skb = tx_req->skb;
u64 *mapping = tx_req->mapping; u64 *mapping = tx_req->mapping;
...@@ -314,7 +313,8 @@ static void ipoib_dma_unmap_tx(struct ib_device *ca, ...@@ -314,7 +313,8 @@ static void ipoib_dma_unmap_tx(struct ib_device *ca,
int off; int off;
if (skb_headlen(skb)) { if (skb_headlen(skb)) {
ib_dma_unmap_single(ca, mapping[0], skb_headlen(skb), DMA_TO_DEVICE); ib_dma_unmap_single(priv->ca, mapping[0], skb_headlen(skb),
DMA_TO_DEVICE);
off = 1; off = 1;
} else } else
off = 0; off = 0;
...@@ -322,8 +322,8 @@ static void ipoib_dma_unmap_tx(struct ib_device *ca, ...@@ -322,8 +322,8 @@ static void ipoib_dma_unmap_tx(struct ib_device *ca,
for (i = 0; i < skb_shinfo(skb)->nr_frags; ++i) { for (i = 0; i < skb_shinfo(skb)->nr_frags; ++i) {
const skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; const skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
ib_dma_unmap_page(ca, mapping[i + off], skb_frag_size(frag), ib_dma_unmap_page(priv->ca, mapping[i + off],
DMA_TO_DEVICE); skb_frag_size(frag), DMA_TO_DEVICE);
} }
} }
...@@ -389,7 +389,7 @@ static void ipoib_ib_handle_tx_wc(struct net_device *dev, struct ib_wc *wc) ...@@ -389,7 +389,7 @@ static void ipoib_ib_handle_tx_wc(struct net_device *dev, struct ib_wc *wc)
tx_req = &priv->tx_ring[wr_id]; tx_req = &priv->tx_ring[wr_id];
ipoib_dma_unmap_tx(priv->ca, tx_req); ipoib_dma_unmap_tx(priv, tx_req);
++dev->stats.tx_packets; ++dev->stats.tx_packets;
dev->stats.tx_bytes += tx_req->skb->len; dev->stats.tx_bytes += tx_req->skb->len;
...@@ -514,24 +514,10 @@ static inline int post_send(struct ipoib_dev_priv *priv, ...@@ -514,24 +514,10 @@ static inline int post_send(struct ipoib_dev_priv *priv,
void *head, int hlen) void *head, int hlen)
{ {
struct ib_send_wr *bad_wr; struct ib_send_wr *bad_wr;
int i, off;
struct sk_buff *skb = tx_req->skb; struct sk_buff *skb = tx_req->skb;
skb_frag_t *frags = skb_shinfo(skb)->frags;
int nr_frags = skb_shinfo(skb)->nr_frags;
u64 *mapping = tx_req->mapping;
if (skb_headlen(skb)) { ipoib_build_sge(priv, tx_req);
priv->tx_sge[0].addr = mapping[0];
priv->tx_sge[0].length = skb_headlen(skb);
off = 1;
} else
off = 0;
for (i = 0; i < nr_frags; ++i) {
priv->tx_sge[i + off].addr = mapping[i + off];
priv->tx_sge[i + off].length = skb_frag_size(&frags[i]);
}
priv->tx_wr.num_sge = nr_frags + off;
priv->tx_wr.wr_id = wr_id; priv->tx_wr.wr_id = wr_id;
priv->tx_wr.wr.ud.remote_qpn = qpn; priv->tx_wr.wr.ud.remote_qpn = qpn;
priv->tx_wr.wr.ud.ah = address; priv->tx_wr.wr.ud.ah = address;
...@@ -617,7 +603,7 @@ void ipoib_send(struct net_device *dev, struct sk_buff *skb, ...@@ -617,7 +603,7 @@ void ipoib_send(struct net_device *dev, struct sk_buff *skb,
ipoib_warn(priv, "post_send failed, error %d\n", rc); ipoib_warn(priv, "post_send failed, error %d\n", rc);
++dev->stats.tx_errors; ++dev->stats.tx_errors;
--priv->tx_outstanding; --priv->tx_outstanding;
ipoib_dma_unmap_tx(priv->ca, tx_req); ipoib_dma_unmap_tx(priv, tx_req);
dev_kfree_skb_any(skb); dev_kfree_skb_any(skb);
if (netif_queue_stopped(dev)) if (netif_queue_stopped(dev))
netif_wake_queue(dev); netif_wake_queue(dev);
...@@ -868,7 +854,7 @@ int ipoib_ib_dev_stop(struct net_device *dev) ...@@ -868,7 +854,7 @@ int ipoib_ib_dev_stop(struct net_device *dev)
while ((int) priv->tx_tail - (int) priv->tx_head < 0) { while ((int) priv->tx_tail - (int) priv->tx_head < 0) {
tx_req = &priv->tx_ring[priv->tx_tail & tx_req = &priv->tx_ring[priv->tx_tail &
(ipoib_sendq_size - 1)]; (ipoib_sendq_size - 1)];
ipoib_dma_unmap_tx(priv->ca, tx_req); ipoib_dma_unmap_tx(priv, tx_req);
dev_kfree_skb_any(tx_req->skb); dev_kfree_skb_any(tx_req->skb);
++priv->tx_tail; ++priv->tx_tail;
--priv->tx_outstanding; --priv->tx_outstanding;
......
...@@ -190,7 +190,7 @@ static netdev_features_t ipoib_fix_features(struct net_device *dev, netdev_featu ...@@ -190,7 +190,7 @@ static netdev_features_t ipoib_fix_features(struct net_device *dev, netdev_featu
struct ipoib_dev_priv *priv = netdev_priv(dev); struct ipoib_dev_priv *priv = netdev_priv(dev);
if (test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags)) if (test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags))
features &= ~(NETIF_F_SG | NETIF_F_IP_CSUM | NETIF_F_TSO); features &= ~(NETIF_F_IP_CSUM | NETIF_F_TSO);
return features; return features;
} }
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册