提交 2f8af120 编写于 作者: M Michael Chan 提交者: David S. Miller

[BNX2]: Fix tx race condition.

Fix a subtle race condition between bnx2_start_xmit() and bnx2_tx_int()
similar to the one in tg3 discovered by Herbert Xu:

CPU0					CPU1
bnx2_start_xmit()
	if (tx_ring_full) {
		tx_lock
					bnx2_tx()
						if (!netif_queue_stopped)
		netif_stop_queue()
		if (!tx_ring_full)
						update_tx_ring
			netif_wake_queue()
		tx_unlock
	}

Even though tx_ring is updated before the if statement in bnx2_tx_int() in
program order, it can be re-ordered by the CPU as shown above.  This
scenario can cause the tx queue to be stopped forever if bnx2_tx_int() has
just freed up the entire tx_ring.  The possibility of this happening
should be very rare though.

The following changes are made, very much identical to the tg3 fix:

1. Add memory barrier to fix the above race condition.

2. Eliminate the private tx_lock altogether and rely solely on
netif_tx_lock.  This eliminates one spinlock in bnx2_start_xmit()
when the ring is full.

3. Because of 2, use netif_tx_lock in bnx2_tx_int() before calling
netif_wake_queue().

4. Add memory barrier to bnx2_tx_avail().

5. Add bp->tx_wake_thresh which is set to half the tx ring size.

6. Check for the full wake queue condition before getting
netif_tx_lock in tg3_tx().  This reduces the number of unnecessary
spinlocks when the tx ring is full in a steady-state condition.
Signed-off-by: NMichael Chan <mchan@broadcom.com>
Signed-off-by: NDavid S. Miller <davem@davemloft.net>
上级 fb33f825
...@@ -209,8 +209,10 @@ MODULE_DEVICE_TABLE(pci, bnx2_pci_tbl); ...@@ -209,8 +209,10 @@ MODULE_DEVICE_TABLE(pci, bnx2_pci_tbl);
static inline u32 bnx2_tx_avail(struct bnx2 *bp) static inline u32 bnx2_tx_avail(struct bnx2 *bp)
{ {
u32 diff = TX_RING_IDX(bp->tx_prod) - TX_RING_IDX(bp->tx_cons); u32 diff;
smp_mb();
diff = TX_RING_IDX(bp->tx_prod) - TX_RING_IDX(bp->tx_cons);
if (diff > MAX_TX_DESC_CNT) if (diff > MAX_TX_DESC_CNT)
diff = (diff & MAX_TX_DESC_CNT) - 1; diff = (diff & MAX_TX_DESC_CNT) - 1;
return (bp->tx_ring_size - diff); return (bp->tx_ring_size - diff);
...@@ -1686,15 +1688,20 @@ bnx2_tx_int(struct bnx2 *bp) ...@@ -1686,15 +1688,20 @@ bnx2_tx_int(struct bnx2 *bp)
} }
bp->tx_cons = sw_cons; bp->tx_cons = sw_cons;
/* Need to make the tx_cons update visible to bnx2_start_xmit()
* before checking for netif_queue_stopped(). Without the
* memory barrier, there is a small possibility that bnx2_start_xmit()
* will miss it and cause the queue to be stopped forever.
*/
smp_mb();
if (unlikely(netif_queue_stopped(bp->dev))) { if (unlikely(netif_queue_stopped(bp->dev)) &&
spin_lock(&bp->tx_lock); (bnx2_tx_avail(bp) > bp->tx_wake_thresh)) {
netif_tx_lock(bp->dev);
if ((netif_queue_stopped(bp->dev)) && if ((netif_queue_stopped(bp->dev)) &&
(bnx2_tx_avail(bp) > MAX_SKB_FRAGS)) { (bnx2_tx_avail(bp) > bp->tx_wake_thresh))
netif_wake_queue(bp->dev); netif_wake_queue(bp->dev);
} netif_tx_unlock(bp->dev);
spin_unlock(&bp->tx_lock);
} }
} }
...@@ -3503,6 +3510,8 @@ bnx2_init_tx_ring(struct bnx2 *bp) ...@@ -3503,6 +3510,8 @@ bnx2_init_tx_ring(struct bnx2 *bp)
struct tx_bd *txbd; struct tx_bd *txbd;
u32 val; u32 val;
bp->tx_wake_thresh = bp->tx_ring_size / 2;
txbd = &bp->tx_desc_ring[MAX_TX_DESC_CNT]; txbd = &bp->tx_desc_ring[MAX_TX_DESC_CNT];
txbd->tx_bd_haddr_hi = (u64) bp->tx_desc_mapping >> 32; txbd->tx_bd_haddr_hi = (u64) bp->tx_desc_mapping >> 32;
...@@ -4390,10 +4399,8 @@ bnx2_vlan_rx_kill_vid(struct net_device *dev, uint16_t vid) ...@@ -4390,10 +4399,8 @@ bnx2_vlan_rx_kill_vid(struct net_device *dev, uint16_t vid)
#endif #endif
/* Called with netif_tx_lock. /* Called with netif_tx_lock.
* hard_start_xmit is pseudo-lockless - a lock is only required when * bnx2_tx_int() runs without netif_tx_lock unless it needs to call
* the tx queue is full. This way, we get the benefit of lockless * netif_wake_queue().
* operations most of the time without the complexities to handle
* netif_stop_queue/wake_queue race conditions.
*/ */
static int static int
bnx2_start_xmit(struct sk_buff *skb, struct net_device *dev) bnx2_start_xmit(struct sk_buff *skb, struct net_device *dev)
...@@ -4512,12 +4519,9 @@ bnx2_start_xmit(struct sk_buff *skb, struct net_device *dev) ...@@ -4512,12 +4519,9 @@ bnx2_start_xmit(struct sk_buff *skb, struct net_device *dev)
dev->trans_start = jiffies; dev->trans_start = jiffies;
if (unlikely(bnx2_tx_avail(bp) <= MAX_SKB_FRAGS)) { if (unlikely(bnx2_tx_avail(bp) <= MAX_SKB_FRAGS)) {
spin_lock(&bp->tx_lock);
netif_stop_queue(dev); netif_stop_queue(dev);
if (bnx2_tx_avail(bp) > bp->tx_wake_thresh)
if (bnx2_tx_avail(bp) > MAX_SKB_FRAGS)
netif_wake_queue(dev); netif_wake_queue(dev);
spin_unlock(&bp->tx_lock);
} }
return NETDEV_TX_OK; return NETDEV_TX_OK;
...@@ -5628,7 +5632,6 @@ bnx2_init_board(struct pci_dev *pdev, struct net_device *dev) ...@@ -5628,7 +5632,6 @@ bnx2_init_board(struct pci_dev *pdev, struct net_device *dev)
bp->pdev = pdev; bp->pdev = pdev;
spin_lock_init(&bp->phy_lock); spin_lock_init(&bp->phy_lock);
spin_lock_init(&bp->tx_lock);
INIT_WORK(&bp->reset_task, bnx2_reset_task, bp); INIT_WORK(&bp->reset_task, bnx2_reset_task, bp);
dev->base_addr = dev->mem_start = pci_resource_start(pdev, 0); dev->base_addr = dev->mem_start = pci_resource_start(pdev, 0);
......
...@@ -3890,10 +3890,6 @@ struct bnx2 { ...@@ -3890,10 +3890,6 @@ struct bnx2 {
u32 tx_prod_bseq __attribute__((aligned(L1_CACHE_BYTES))); u32 tx_prod_bseq __attribute__((aligned(L1_CACHE_BYTES)));
u16 tx_prod; u16 tx_prod;
struct tx_bd *tx_desc_ring;
struct sw_bd *tx_buf_ring;
int tx_ring_size;
u16 tx_cons __attribute__((aligned(L1_CACHE_BYTES))); u16 tx_cons __attribute__((aligned(L1_CACHE_BYTES)));
u16 hw_tx_cons; u16 hw_tx_cons;
...@@ -3916,9 +3912,11 @@ struct bnx2 { ...@@ -3916,9 +3912,11 @@ struct bnx2 {
struct sw_bd *rx_buf_ring; struct sw_bd *rx_buf_ring;
struct rx_bd *rx_desc_ring[MAX_RX_RINGS]; struct rx_bd *rx_desc_ring[MAX_RX_RINGS];
/* Only used to synchronize netif_stop_queue/wake_queue when tx */ /* TX constants */
/* ring is full */ struct tx_bd *tx_desc_ring;
spinlock_t tx_lock; struct sw_bd *tx_buf_ring;
int tx_ring_size;
u32 tx_wake_thresh;
/* End of fields used in the performance code paths. */ /* End of fields used in the performance code paths. */
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册