/*
 * Copyright (c) 2004, 2005 Topspin Communications.  All rights reserved.
 * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
 * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
 * Copyright (c) 2004, 2005 Voltaire, Inc. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include <linux/delay.h>
#include <linux/moduleparam.h>
#include <linux/dma-mapping.h>
#include <linux/slab.h>

#include <linux/ip.h>
#include <linux/tcp.h>

#include "ipoib.h"

#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG_DATA
static int data_debug_level;

module_param(data_debug_level, int, 0644);
MODULE_PARM_DESC(data_debug_level,
		 "Enable data path debug tracing if > 0");
#endif

struct ipoib_ah *ipoib_create_ah(struct net_device *dev,
				 struct ib_pd *pd, struct rdma_ah_attr *attr)
{
	struct ipoib_ah *ah;
	struct ib_ah *vah;

	ah = kmalloc(sizeof *ah, GFP_KERNEL);
	if (!ah)
		return ERR_PTR(-ENOMEM);

	ah->dev       = dev;
	ah->last_send = 0;
	kref_init(&ah->ref);

	vah = rdma_create_ah(pd, attr);
	if (IS_ERR(vah)) {
		kfree(ah);
		ah = (struct ipoib_ah *)vah;
	} else {
		ah->ah = vah;
		ipoib_dbg(ipoib_priv(dev), "Created ah %p\n", ah->ah);
	}

	return ah;
}

void ipoib_free_ah(struct kref *kref)
{
	struct ipoib_ah *ah = container_of(kref, struct ipoib_ah, ref);
	struct ipoib_dev_priv *priv = ipoib_priv(ah->dev);

	unsigned long flags;

	spin_lock_irqsave(&priv->lock, flags);
	list_add_tail(&ah->list, &priv->dead_ahs);
	spin_unlock_irqrestore(&priv->lock, flags);
}

static void ipoib_ud_dma_unmap_rx(struct ipoib_dev_priv *priv,
				  u64 mapping[IPOIB_UD_RX_SG])
{
	ib_dma_unmap_single(priv->ca, mapping[0],
			    IPOIB_UD_BUF_SIZE(priv->max_ib_mtu),
			    DMA_FROM_DEVICE);
}

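/*
 * Post one receive work request for RX ring slot @id.  On failure the
 * slot's buffer is unmapped and freed, leaving the slot empty.
 */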
static int ipoib_ib_post_receive(struct net_device *dev, int id)
{
	struct ipoib_dev_priv *priv = ipoib_priv(dev);
	struct ib_recv_wr *bad_wr;
	int ret;

	priv->rx_wr.wr_id   = id | IPOIB_OP_RECV;
	priv->rx_sge[0].addr = priv->rx_ring[id].mapping[0];
	priv->rx_sge[1].addr = priv->rx_ring[id].mapping[1];

	ret = ib_post_recv(priv->qp, &priv->rx_wr, &bad_wr);
	if (unlikely(ret)) {
		ipoib_warn(priv, "receive failed for buf %d (%d)\n", id, ret);
		ipoib_ud_dma_unmap_rx(priv, priv->rx_ring[id].mapping);
		dev_kfree_skb_any(priv->rx_ring[id].skb);
		priv->rx_ring[id].skb = NULL;
	}

	return ret;
}

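/*
 * Allocate an skb for RX ring slot @id and DMA-map it for receive.
 * Returns the skb on success, or NULL if allocation or mapping fails.
 */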
static struct sk_buff *ipoib_alloc_rx_skb(struct net_device *dev, int id)
{
	struct ipoib_dev_priv *priv = ipoib_priv(dev);
	struct sk_buff *skb;
	int buf_size;
	u64 *mapping;

	buf_size = IPOIB_UD_BUF_SIZE(priv->max_ib_mtu);

	skb = dev_alloc_skb(buf_size + IPOIB_HARD_LEN);
	if (unlikely(!skb))
		return NULL;

	/*
	 * The IP header will be at IPOIB_HARD_LEN + IB_GRH_BYTES, i.e.
	 * 64 bytes aligned
	 */
	skb_reserve(skb, sizeof(struct ipoib_pseudo_header));

	mapping = priv->rx_ring[id].mapping;
	mapping[0] = ib_dma_map_single(priv->ca, skb->data, buf_size,
				       DMA_FROM_DEVICE);
	if (unlikely(ib_dma_mapping_error(priv->ca, mapping[0])))
		goto error;

	priv->rx_ring[id].skb = skb;
	return skb;
error:
	dev_kfree_skb_any(skb);
	return NULL;
}

static int ipoib_ib_post_receives(struct net_device *dev)
{
	struct ipoib_dev_priv *priv = ipoib_priv(dev);
	int i;

	for (i = 0; i < ipoib_recvq_size; ++i) {
		if (!ipoib_alloc_rx_skb(dev, i)) {
			ipoib_warn(priv, "failed to allocate receive buffer %d\n", i);
			return -ENOMEM;
		}
		if (ipoib_ib_post_receive(dev, i)) {
			ipoib_warn(priv, "ipoib_ib_post_receive failed for buf %d\n", i);
			return -EIO;
		}
	}

	return 0;
}

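/*
 * Handle one receive completion: validate the work request id, refill
 * the ring slot, set up skb metadata and pass the packet to the stack.
 */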
static void ipoib_ib_handle_rx_wc(struct net_device *dev, struct ib_wc *wc)
{
	struct ipoib_dev_priv *priv = ipoib_priv(dev);
	unsigned int wr_id = wc->wr_id & ~IPOIB_OP_RECV;
	struct sk_buff *skb;
	u64 mapping[IPOIB_UD_RX_SG];
	union ib_gid *dgid;
	union ib_gid *sgid;

	ipoib_dbg_data(priv, "recv completion: id %d, status: %d\n",
		       wr_id, wc->status);

	if (unlikely(wr_id >= ipoib_recvq_size)) {
		ipoib_warn(priv, "recv completion event with wrid %d (> %d)\n",
			   wr_id, ipoib_recvq_size);
		return;
	}

	skb  = priv->rx_ring[wr_id].skb;

	if (unlikely(wc->status != IB_WC_SUCCESS)) {
		if (wc->status != IB_WC_WR_FLUSH_ERR)
			ipoib_warn(priv, "failed recv event "
				   "(status=%d, wrid=%d vend_err %x)\n",
				   wc->status, wr_id, wc->vendor_err);
		ipoib_ud_dma_unmap_rx(priv, priv->rx_ring[wr_id].mapping);
		dev_kfree_skb_any(skb);
		priv->rx_ring[wr_id].skb = NULL;
		return;
	}

	memcpy(mapping, priv->rx_ring[wr_id].mapping,
	       IPOIB_UD_RX_SG * sizeof *mapping);

	/*
	 * If we can't allocate a new RX buffer, dump
	 * this packet and reuse the old buffer.
	 */
	if (unlikely(!ipoib_alloc_rx_skb(dev, wr_id))) {
		++dev->stats.rx_dropped;
		goto repost;
	}

	ipoib_dbg_data(priv, "received %d bytes, SLID 0x%04x\n",
		       wc->byte_len, wc->slid);

	ipoib_ud_dma_unmap_rx(priv, mapping);

	skb_put(skb, wc->byte_len);

	/* First byte of dgid signals multicast when 0xff */
	dgid = &((struct ib_grh *)skb->data)->dgid;

	if (!(wc->wc_flags & IB_WC_GRH) || dgid->raw[0] != 0xff)
		skb->pkt_type = PACKET_HOST;
	else if (memcmp(dgid, dev->broadcast + 4, sizeof(union ib_gid)) == 0)
		skb->pkt_type = PACKET_BROADCAST;
	else
		skb->pkt_type = PACKET_MULTICAST;

	sgid = &((struct ib_grh *)skb->data)->sgid;

	/*
	 * Drop packets that this interface sent, i.e. multicast packets
	 * that the HCA has replicated.
	 */
	if (wc->slid == priv->local_lid && wc->src_qp == priv->qp->qp_num) {
		int need_repost = 1;

		if ((wc->wc_flags & IB_WC_GRH) &&
		    sgid->global.interface_id != priv->local_gid.global.interface_id)
			need_repost = 0;

		if (need_repost) {
			dev_kfree_skb_any(skb);
			goto repost;
		}
	}

	skb_pull(skb, IB_GRH_BYTES);

	skb->protocol = ((struct ipoib_header *) skb->data)->proto;
	skb_add_pseudo_hdr(skb);

	++dev->stats.rx_packets;
	dev->stats.rx_bytes += skb->len;

	skb->dev = dev;
	if ((dev->features & NETIF_F_RXCSUM) &&
			likely(wc->wc_flags & IB_WC_IP_CSUM_OK))
		skb->ip_summed = CHECKSUM_UNNECESSARY;

	napi_gro_receive(&priv->napi, skb);

repost:
	if (unlikely(ipoib_ib_post_receive(dev, wr_id)))
		ipoib_warn(priv, "ipoib_ib_post_receive failed "
			   "for buf %d\n", wr_id);
}

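/*
 * DMA-map the linear head and all paged fragments of @tx_req->skb for
 * transmit.  On a partial failure everything mapped so far is unwound.
 * Returns 0 on success or -EIO on a mapping error.
 */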
int ipoib_dma_map_tx(struct ib_device *ca, struct ipoib_tx_buf *tx_req)
{
	struct sk_buff *skb = tx_req->skb;
	u64 *mapping = tx_req->mapping;
	int i;
	int off;

	if (skb_headlen(skb)) {
		mapping[0] = ib_dma_map_single(ca, skb->data, skb_headlen(skb),
					       DMA_TO_DEVICE);
		if (unlikely(ib_dma_mapping_error(ca, mapping[0])))
			return -EIO;

		off = 1;
	} else
		off = 0;

	for (i = 0; i < skb_shinfo(skb)->nr_frags; ++i) {
		const skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
		mapping[i + off] = ib_dma_map_page(ca,
						 skb_frag_page(frag),
						 frag->page_offset, skb_frag_size(frag),
						 DMA_TO_DEVICE);
		if (unlikely(ib_dma_mapping_error(ca, mapping[i + off])))
			goto partial_error;
	}
	return 0;

partial_error:
	for (; i > 0; --i) {
		const skb_frag_t *frag = &skb_shinfo(skb)->frags[i - 1];

		ib_dma_unmap_page(ca, mapping[i - !off], skb_frag_size(frag), DMA_TO_DEVICE);
	}

	if (off)
		ib_dma_unmap_single(ca, mapping[0], skb_headlen(skb), DMA_TO_DEVICE);

	return -EIO;
}

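/*
 * Undo ipoib_dma_map_tx(): unmap the linear head (if any) and every
 * paged fragment of @tx_req->skb.
 */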
void ipoib_dma_unmap_tx(struct ipoib_dev_priv *priv,
			struct ipoib_tx_buf *tx_req)
{
	struct sk_buff *skb = tx_req->skb;
	u64 *mapping = tx_req->mapping;
	int i;
	int off;

	if (skb_headlen(skb)) {
		ib_dma_unmap_single(priv->ca, mapping[0], skb_headlen(skb),
				    DMA_TO_DEVICE);
		off = 1;
	} else
		off = 0;

	for (i = 0; i < skb_shinfo(skb)->nr_frags; ++i) {
		const skb_frag_t *frag = &skb_shinfo(skb)->frags[i];

		ib_dma_unmap_page(priv->ca, mapping[i + off],
				  skb_frag_size(frag), DMA_TO_DEVICE);
	}
}

/*
 * As a result of a completion error the QP can be transitioned to the
 * SQE state.  This function checks whether the (send) QP is in the SQE
 * state and, if so, moves it back to RTS so that it is functional again.
 */
static void ipoib_qp_state_validate_work(struct work_struct *work)
{
	struct ipoib_qp_state_validate *qp_work =
		container_of(work, struct ipoib_qp_state_validate, work);

	struct ipoib_dev_priv *priv = qp_work->priv;
	struct ib_qp_attr qp_attr;
	struct ib_qp_init_attr query_init_attr;
	int ret;

	ret = ib_query_qp(priv->qp, &qp_attr, IB_QP_STATE, &query_init_attr);
	if (ret) {
		ipoib_warn(priv, "%s: Failed to query QP ret: %d\n",
			   __func__, ret);
		goto free_res;
	}
	pr_info("%s: QP: 0x%x is in state: %d\n",
		__func__, priv->qp->qp_num, qp_attr.qp_state);

	/* currently we only support the SQE->RTS transition */
	if (qp_attr.qp_state == IB_QPS_SQE) {
		qp_attr.qp_state = IB_QPS_RTS;

		ret = ib_modify_qp(priv->qp, &qp_attr, IB_QP_STATE);
		if (ret) {
			pr_warn("failed(%d) modify QP:0x%x SQE->RTS\n",
				ret, priv->qp->qp_num);
			goto free_res;
		}
		pr_info("%s: QP: 0x%x moved from IB_QPS_SQE to IB_QPS_RTS\n",
			__func__, priv->qp->qp_num);
	} else {
		pr_warn("QP (%d) will stay in state: %d\n",
			priv->qp->qp_num, qp_attr.qp_state);
	}

free_res:
	kfree(qp_work);
}

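/*
 * Handle one send completion: unmap and free the skb, update counters,
 * wake the netdev queue if it was stopped, and on a real error kick off
 * the QP state validation work to recover from the SQE state.
 */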
static void ipoib_ib_handle_tx_wc(struct net_device *dev, struct ib_wc *wc)
{
	struct ipoib_dev_priv *priv = ipoib_priv(dev);
	unsigned int wr_id = wc->wr_id;
	struct ipoib_tx_buf *tx_req;

	ipoib_dbg_data(priv, "send completion: id %d, status: %d\n",
		       wr_id, wc->status);

	if (unlikely(wr_id >= ipoib_sendq_size)) {
		ipoib_warn(priv, "send completion event with wrid %d (> %d)\n",
			   wr_id, ipoib_sendq_size);
		return;
	}

	tx_req = &priv->tx_ring[wr_id];

	ipoib_dma_unmap_tx(priv, tx_req);

	++dev->stats.tx_packets;
	dev->stats.tx_bytes += tx_req->skb->len;

	dev_kfree_skb_any(tx_req->skb);

	++priv->tx_tail;
	if (unlikely(--priv->tx_outstanding == ipoib_sendq_size >> 1) &&
	    netif_queue_stopped(dev) &&
	    test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags))
		netif_wake_queue(dev);

	if (wc->status != IB_WC_SUCCESS &&
	    wc->status != IB_WC_WR_FLUSH_ERR) {
		struct ipoib_qp_state_validate *qp_work;
		ipoib_warn(priv, "failed send event "
			   "(status=%d, wrid=%d vend_err %x)\n",
			   wc->status, wr_id, wc->vendor_err);
		qp_work = kzalloc(sizeof(*qp_work), GFP_ATOMIC);
		if (!qp_work)
			return;

		INIT_WORK(&qp_work->work, ipoib_qp_state_validate_work);
		qp_work->priv = priv;
		queue_work(priv->wq, &qp_work->work);
	}
}

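/*
 * Drain up to MAX_SEND_CQE send completions from the send CQ.  Returns
 * nonzero if the CQ may still hold more completions, so callers loop
 * until it returns 0.
 */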
static int poll_tx(struct ipoib_dev_priv *priv)
{
	int n, i;

	n = ib_poll_cq(priv->send_cq, MAX_SEND_CQE, priv->send_wc);
	for (i = 0; i < n; ++i)
		ipoib_ib_handle_tx_wc(priv->dev, priv->send_wc + i);

	return n == MAX_SEND_CQE;
}

int ipoib_poll(struct napi_struct *napi, int budget)
{
	struct ipoib_dev_priv *priv = container_of(napi, struct ipoib_dev_priv, napi);
	struct net_device *dev = priv->dev;
	int done;
	int t;
	int n, i;

	done  = 0;

poll_more:
	while (done < budget) {
		int max = (budget - done);

		t = min(IPOIB_NUM_WC, max);
		n = ib_poll_cq(priv->recv_cq, t, priv->ibwc);

		for (i = 0; i < n; i++) {
			struct ib_wc *wc = priv->ibwc + i;

			if (wc->wr_id & IPOIB_OP_RECV) {
				++done;
				if (wc->wr_id & IPOIB_OP_CM)
					ipoib_cm_handle_rx_wc(dev, wc);
				else
					ipoib_ib_handle_rx_wc(dev, wc);
			} else
				ipoib_cm_handle_tx_wc(priv->dev, wc);
		}

		if (n != t)
			break;
	}

	if (done < budget) {
		napi_complete(napi);
		if (unlikely(ib_req_notify_cq(priv->recv_cq,
					      IB_CQ_NEXT_COMP |
					      IB_CQ_REPORT_MISSED_EVENTS)) &&
		    napi_reschedule(napi))
			goto poll_more;
	}

	return done;
}

void ipoib_ib_completion(struct ib_cq *cq, void *dev_ptr)
{
	struct net_device *dev = dev_ptr;
	struct ipoib_dev_priv *priv = ipoib_priv(dev);

	napi_schedule(&priv->napi);
}

static void drain_tx_cq(struct net_device *dev)
{
	struct ipoib_dev_priv *priv = ipoib_priv(dev);

	netif_tx_lock(dev);
	while (poll_tx(priv))
		; /* nothing */

	if (netif_queue_stopped(dev))
		mod_timer(&priv->poll_timer, jiffies + 1);

	netif_tx_unlock(dev);
}

void ipoib_send_comp_handler(struct ib_cq *cq, void *dev_ptr)
{
	struct ipoib_dev_priv *priv = ipoib_priv(dev_ptr);

	mod_timer(&priv->poll_timer, jiffies);
}

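/*
 * Build the scatter/gather list for @tx_req and post it on the UD QP,
 * using an LSO work request when a GSO header is supplied.
 */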
static inline int post_send(struct ipoib_dev_priv *priv,
			    unsigned int wr_id,
			    struct ib_ah *address, u32 dqpn,
			    struct ipoib_tx_buf *tx_req,
			    void *head, int hlen)
{
	struct ib_send_wr *bad_wr;
	struct sk_buff *skb = tx_req->skb;

	ipoib_build_sge(priv, tx_req);

	priv->tx_wr.wr.wr_id	= wr_id;
	priv->tx_wr.remote_qpn	= dqpn;
	priv->tx_wr.ah		= address;

	if (head) {
		priv->tx_wr.mss		= skb_shinfo(skb)->gso_size;
		priv->tx_wr.header	= head;
		priv->tx_wr.hlen	= hlen;
		priv->tx_wr.wr.opcode	= IB_WR_LSO;
	} else
		priv->tx_wr.wr.opcode	= IB_WR_SEND;

	return ib_post_send(priv->qp, &priv->tx_wr.wr, &bad_wr);
}

int ipoib_send(struct net_device *dev, struct sk_buff *skb,
	       struct ib_ah *address, u32 dqpn)
{
	struct ipoib_dev_priv *priv = ipoib_priv(dev);
	struct ipoib_tx_buf *tx_req;
	int hlen, rc;
	void *phead;
	unsigned usable_sge = priv->max_send_sge - !!skb_headlen(skb);

	if (skb_is_gso(skb)) {
		hlen = skb_transport_offset(skb) + tcp_hdrlen(skb);
		phead = skb->data;
		if (unlikely(!skb_pull(skb, hlen))) {
			ipoib_warn(priv, "linear data too small\n");
			++dev->stats.tx_dropped;
			++dev->stats.tx_errors;
			dev_kfree_skb_any(skb);
			return -1;
		}
	} else {
		if (unlikely(skb->len > priv->mcast_mtu + IPOIB_ENCAP_LEN)) {
			ipoib_warn(priv, "packet len %d (> %d) too long to send, dropping\n",
				   skb->len, priv->mcast_mtu + IPOIB_ENCAP_LEN);
			++dev->stats.tx_dropped;
			++dev->stats.tx_errors;
			ipoib_cm_skb_too_long(dev, skb, priv->mcast_mtu);
			return -1;
		}
		phead = NULL;
		hlen  = 0;
	}
	if (skb_shinfo(skb)->nr_frags > usable_sge) {
		if (skb_linearize(skb) < 0) {
			ipoib_warn(priv, "skb could not be linearized\n");
			++dev->stats.tx_dropped;
			++dev->stats.tx_errors;
			dev_kfree_skb_any(skb);
			return -1;
		}
		/* Does skb_linearize return ok without reducing nr_frags? */
		if (skb_shinfo(skb)->nr_frags > usable_sge) {
			ipoib_warn(priv, "too many frags after skb linearize\n");
			++dev->stats.tx_dropped;
			++dev->stats.tx_errors;
			dev_kfree_skb_any(skb);
			return -1;
		}
	}

	ipoib_dbg_data(priv,
		       "sending packet, length=%d address=%p dqpn=0x%06x\n",
		       skb->len, address, dqpn);

	/*
	 * We put the skb into the tx_ring _before_ we call post_send()
	 * because it's entirely possible that the completion handler will
	 * run before we execute anything after the post_send().  That
	 * means we have to make sure everything is properly recorded and
	 * our state is consistent before we call post_send().
	 */
	tx_req = &priv->tx_ring[priv->tx_head & (ipoib_sendq_size - 1)];
	tx_req->skb = skb;
	if (unlikely(ipoib_dma_map_tx(priv->ca, tx_req))) {
		++dev->stats.tx_errors;
		dev_kfree_skb_any(skb);
		return -1;
	}

	if (skb->ip_summed == CHECKSUM_PARTIAL)
		priv->tx_wr.wr.send_flags |= IB_SEND_IP_CSUM;
	else
		priv->tx_wr.wr.send_flags &= ~IB_SEND_IP_CSUM;

	if (++priv->tx_outstanding == ipoib_sendq_size) {
		ipoib_dbg(priv, "TX ring full, stopping kernel net queue\n");
		if (ib_req_notify_cq(priv->send_cq, IB_CQ_NEXT_COMP))
			ipoib_warn(priv, "request notify on send CQ failed\n");
		netif_stop_queue(dev);
	}

	skb_orphan(skb);
	skb_dst_drop(skb);

	rc = post_send(priv, priv->tx_head & (ipoib_sendq_size - 1),
		       address, dqpn, tx_req, phead, hlen);
	if (unlikely(rc)) {
		ipoib_warn(priv, "post_send failed, error %d\n", rc);
		++dev->stats.tx_errors;
		--priv->tx_outstanding;
		ipoib_dma_unmap_tx(priv, tx_req);
		dev_kfree_skb_any(skb);
		if (netif_queue_stopped(dev))
			netif_wake_queue(dev);
		rc = 0;
	} else {
		netif_trans_update(dev);

		rc = priv->tx_head;
		++priv->tx_head;
	}

	if (unlikely(priv->tx_outstanding > MAX_SEND_CQE))
		while (poll_tx(priv))
			; /* nothing */

	return rc;
}

static void __ipoib_reap_ah(struct net_device *dev)
{
	struct ipoib_dev_priv *priv = ipoib_priv(dev);
	struct ipoib_ah *ah, *tah;
	LIST_HEAD(remove_list);
	unsigned long flags;

	netif_tx_lock_bh(dev);
	spin_lock_irqsave(&priv->lock, flags);

	list_for_each_entry_safe(ah, tah, &priv->dead_ahs, list)
		if ((int) priv->tx_tail - (int) ah->last_send >= 0) {
			list_del(&ah->list);
			rdma_destroy_ah(ah->ah);
			kfree(ah);
		}

	spin_unlock_irqrestore(&priv->lock, flags);
	netif_tx_unlock_bh(dev);
}

void ipoib_reap_ah(struct work_struct *work)
{
	struct ipoib_dev_priv *priv =
		container_of(work, struct ipoib_dev_priv, ah_reap_task.work);
	struct net_device *dev = priv->dev;

	__ipoib_reap_ah(dev);

	if (!test_bit(IPOIB_STOP_REAPER, &priv->flags))
		queue_delayed_work(priv->wq, &priv->ah_reap_task,
				   round_jiffies_relative(HZ));
}

static void ipoib_flush_ah(struct net_device *dev)
{
	struct ipoib_dev_priv *priv = ipoib_priv(dev);

	cancel_delayed_work(&priv->ah_reap_task);
	flush_workqueue(priv->wq);
	ipoib_reap_ah(&priv->ah_reap_task.work);
}

static void ipoib_stop_ah(struct net_device *dev)
{
	struct ipoib_dev_priv *priv = ipoib_priv(dev);

	set_bit(IPOIB_STOP_REAPER, &priv->flags);
	ipoib_flush_ah(dev);
}

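/* Count the RX ring slots that still have a posted receive buffer. */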
static int recvs_pending(struct net_device *dev)
{
	struct ipoib_dev_priv *priv = ipoib_priv(dev);
	int pending = 0;
	int i;

	for (i = 0; i < ipoib_recvq_size; ++i)
		if (priv->rx_ring[i].skb)
			++pending;

	return pending;
}

int ipoib_ib_dev_stop_default(struct net_device *dev)
{
	struct ipoib_dev_priv *priv = ipoib_priv(dev);
	struct ib_qp_attr qp_attr;
	unsigned long begin;
	struct ipoib_tx_buf *tx_req;
	int i;

	if (test_bit(IPOIB_FLAG_INITIALIZED, &priv->flags))
		napi_disable(&priv->napi);

	ipoib_cm_dev_stop(dev);

	/*
	 * Move our QP to the error state and then reinitialize it once
	 * all work requests have completed or have been flushed.
	 */
	qp_attr.qp_state = IB_QPS_ERR;
	if (ib_modify_qp(priv->qp, &qp_attr, IB_QP_STATE))
		ipoib_warn(priv, "Failed to modify QP to ERROR state\n");

	/* Wait for all sends and receives to complete */
	begin = jiffies;

	while (priv->tx_head != priv->tx_tail || recvs_pending(dev)) {
		if (time_after(jiffies, begin + 5 * HZ)) {
			ipoib_warn(priv,
				   "timing out; %d sends %d receives not completed\n",
				   priv->tx_head - priv->tx_tail,
				   recvs_pending(dev));

			/*
			 * assume the HW is wedged and just free up
			 * all our pending work requests.
			 */
			while ((int)priv->tx_tail - (int)priv->tx_head < 0) {
				tx_req = &priv->tx_ring[priv->tx_tail &
							(ipoib_sendq_size - 1)];
				ipoib_dma_unmap_tx(priv, tx_req);
				dev_kfree_skb_any(tx_req->skb);
				++priv->tx_tail;
				--priv->tx_outstanding;
			}

			for (i = 0; i < ipoib_recvq_size; ++i) {
				struct ipoib_rx_buf *rx_req;

				rx_req = &priv->rx_ring[i];
				if (!rx_req->skb)
					continue;
				ipoib_ud_dma_unmap_rx(priv,
						      priv->rx_ring[i].mapping);
				dev_kfree_skb_any(rx_req->skb);
				rx_req->skb = NULL;
			}

			goto timeout;
		}

		ipoib_drain_cq(dev);

		msleep(1);
774 775
	}

	ipoib_dbg(priv, "All sends and receives done.\n");

timeout:
	del_timer_sync(&priv->poll_timer);
	qp_attr.qp_state = IB_QPS_RESET;
	if (ib_modify_qp(priv->qp, &qp_attr, IB_QP_STATE))
		ipoib_warn(priv, "Failed to modify QP to RESET state\n");

	ib_req_notify_cq(priv->recv_cq, IB_CQ_NEXT_COMP);

	return 0;
}

int ipoib_ib_dev_stop(struct net_device *dev)
{
	struct ipoib_dev_priv *priv = ipoib_priv(dev);

	priv->rn_ops->ndo_stop(dev);

	clear_bit(IPOIB_FLAG_INITIALIZED, &priv->flags);
	ipoib_flush_ah(dev);

	return 0;
}

void ipoib_ib_tx_timer_func(unsigned long ctx)
{
	drain_tx_cq((struct net_device *)ctx);
}

int ipoib_ib_dev_open_default(struct net_device *dev)
{
	struct ipoib_dev_priv *priv = ipoib_priv(dev);
	int ret;

	ret = ipoib_init_qp(dev);
	if (ret) {
		ipoib_warn(priv, "ipoib_init_qp returned %d\n", ret);
		return -1;
	}

	ret = ipoib_ib_post_receives(dev);
	if (ret) {
		ipoib_warn(priv, "ipoib_ib_post_receives returned %d\n", ret);
		goto out;
	}

	ret = ipoib_cm_dev_open(dev);
	if (ret) {
		ipoib_warn(priv, "ipoib_cm_dev_open returned %d\n", ret);
		goto out;
	}

	if (!test_bit(IPOIB_FLAG_INITIALIZED, &priv->flags))
		napi_enable(&priv->napi);

	return 0;
out:
	return -1;
}

int ipoib_ib_dev_open(struct net_device *dev)
{
	struct ipoib_dev_priv *priv = ipoib_priv(dev);

	ipoib_pkey_dev_check_presence(dev);

	if (!test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags)) {
		ipoib_warn(priv, "P_Key 0x%04x is %s\n", priv->pkey,
			   (!(priv->pkey & 0x7fff) ? "Invalid" : "not found"));
		return -1;
	}

	clear_bit(IPOIB_STOP_REAPER, &priv->flags);
	queue_delayed_work(priv->wq, &priv->ah_reap_task,
			   round_jiffies_relative(HZ));

	if (priv->rn_ops->ndo_open(dev)) {
		pr_warn("%s: Failed to open dev\n", dev->name);
		goto dev_stop;
	}

	set_bit(IPOIB_FLAG_INITIALIZED, &priv->flags);

	return 0;

dev_stop:
	set_bit(IPOIB_STOP_REAPER, &priv->flags);
	cancel_delayed_work(&priv->ah_reap_task);
	set_bit(IPOIB_FLAG_INITIALIZED, &priv->flags);
	napi_enable(&priv->napi);
	ipoib_ib_dev_stop(dev);
	return -1;
}

void ipoib_pkey_dev_check_presence(struct net_device *dev)
{
	struct ipoib_dev_priv *priv = ipoib_priv(dev);

	if (!(priv->pkey & 0x7fff) ||
	    ib_find_pkey(priv->ca, priv->port, priv->pkey,
			 &priv->pkey_index))
		clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags);
	else
		set_bit(IPOIB_PKEY_ASSIGNED, &priv->flags);
}

void ipoib_ib_dev_up(struct net_device *dev)
{
	struct ipoib_dev_priv *priv = ipoib_priv(dev);

	ipoib_pkey_dev_check_presence(dev);

	if (!test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags)) {
		ipoib_dbg(priv, "PKEY is not assigned.\n");
		return;
	}

	set_bit(IPOIB_FLAG_OPER_UP, &priv->flags);

	ipoib_mcast_start_thread(dev);
}

void ipoib_ib_dev_down(struct net_device *dev)
{
	struct ipoib_dev_priv *priv = ipoib_priv(dev);

	ipoib_dbg(priv, "downing ib_dev\n");

	clear_bit(IPOIB_FLAG_OPER_UP, &priv->flags);
	netif_carrier_off(dev);

	ipoib_mcast_stop_thread(dev);
	ipoib_mcast_dev_flush(dev);

	ipoib_flush_paths(dev);
}

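/*
 * Poll the receive CQ until it is empty, converting any successful
 * completions to flush errors, then drain the send CQ as well.  Used
 * while bringing the interface down.
 */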
void ipoib_drain_cq(struct net_device *dev)
{
	struct ipoib_dev_priv *priv = ipoib_priv(dev);
	int i, n;

	/*
	 * We call completion handling routines that expect to be
	 * called from the BH-disabled NAPI poll context, so disable
	 * BHs here too.
	 */
	local_bh_disable();

	do {
		n = ib_poll_cq(priv->recv_cq, IPOIB_NUM_WC, priv->ibwc);
		for (i = 0; i < n; ++i) {
			/*
			 * Convert any successful completions to flush
			 * errors to avoid passing packets up the
			 * stack after bringing the device down.
			 */
			if (priv->ibwc[i].status == IB_WC_SUCCESS)
				priv->ibwc[i].status = IB_WC_WR_FLUSH_ERR;

			if (priv->ibwc[i].wr_id & IPOIB_OP_RECV) {
				if (priv->ibwc[i].wr_id & IPOIB_OP_CM)
					ipoib_cm_handle_rx_wc(dev, priv->ibwc + i);
				else
					ipoib_ib_handle_rx_wc(dev, priv->ibwc + i);
			} else
				ipoib_cm_handle_tx_wc(dev, priv->ibwc + i);
		}
	} while (n == IPOIB_NUM_WC);

	while (poll_tx(priv))
		; /* nothing */

	local_bh_enable();
}

/*
 * Takes whatever value is in pkey index 0 and updates priv->pkey;
 * returns 0 if the pkey value was changed.
 */
static inline int update_parent_pkey(struct ipoib_dev_priv *priv)
{
	int result;
	u16 prev_pkey;

	prev_pkey = priv->pkey;
	result = ib_query_pkey(priv->ca, priv->port, 0, &priv->pkey);
	if (result) {
		ipoib_warn(priv, "ib_query_pkey port %d failed (ret = %d)\n",
			   priv->port, result);
		return result;
	}

	priv->pkey |= 0x8000;

	if (prev_pkey != priv->pkey) {
		ipoib_dbg(priv, "pkey changed from 0x%x to 0x%x\n",
			  prev_pkey, priv->pkey);
		/*
		 * Update the pkey in the broadcast address, while making sure to set
		 * the full membership bit, so that we join the right broadcast group.
		 */
		priv->dev->broadcast[8] = priv->pkey >> 8;
		priv->dev->broadcast[9] = priv->pkey & 0xff;

		/*
		 * Update the broadcast address in the priv->broadcast object,
		 * in case it already exists, otherwise no one will do that.
		 */
		if (priv->broadcast) {
			spin_lock_irq(&priv->lock);
			memcpy(priv->broadcast->mcmember.mgid.raw,
			       priv->dev->broadcast + 4,
			       sizeof(union ib_gid));
			spin_unlock_irq(&priv->lock);
		}

		return 0;
	}

	return 1;
}
/*
 * returns 0 if pkey value was found in a different slot.
 */
static inline int update_child_pkey(struct ipoib_dev_priv *priv)
{
	u16 old_index = priv->pkey_index;

	priv->pkey_index = 0;
	ipoib_pkey_dev_check_presence(priv->dev);

	if (test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags) &&
	    (old_index == priv->pkey_index))
		return 1;
	return 0;
}

/*
 * Returns true if the device address of the ipoib interface has changed and the
 * new address is a valid one (i.e. in the gid table), false otherwise.
 */
static bool ipoib_dev_addr_changed_valid(struct ipoib_dev_priv *priv)
{
	union ib_gid search_gid;
	union ib_gid gid0;
	union ib_gid *netdev_gid;
	int err;
	u16 index;
	u8 port;
	bool ret = false;

	netdev_gid = (union ib_gid *)(priv->dev->dev_addr + 4);
	if (ib_query_gid(priv->ca, priv->port, 0, &gid0, NULL))
		return false;

	netif_addr_lock_bh(priv->dev);

	/* The subnet prefix may have changed, update it now so we won't have
	 * to do it later
	 */
	priv->local_gid.global.subnet_prefix = gid0.global.subnet_prefix;
	netdev_gid->global.subnet_prefix = gid0.global.subnet_prefix;
	search_gid.global.subnet_prefix = gid0.global.subnet_prefix;

	search_gid.global.interface_id = priv->local_gid.global.interface_id;

	netif_addr_unlock_bh(priv->dev);

	err = ib_find_gid(priv->ca, &search_gid, IB_GID_TYPE_IB,
			  priv->dev, &port, &index);

	netif_addr_lock_bh(priv->dev);

	if (search_gid.global.interface_id !=
	    priv->local_gid.global.interface_id)
		/* There was a change while we were looking up the gid, bail
		 * here and let the next work sort this out
		 */
		goto out;

	/* The next section of code needs some background:
	 * Per IB spec the port GUID can't change if the HCA is powered on.
	 * port GUID is the basis for GID at index 0 which is the basis for
	 * the default device address of a ipoib interface.
	 *
	 * so it seems the flow should be:
	 * if user_changed_dev_addr && gid in gid tbl
	 *	set bit dev_addr_set
	 *	return true
	 * else
	 *	return false
	 *
	 * The issue is that there are devices that don't follow the spec,
	 * they change the port GUID when the HCA is powered, so in order
	 * not to break userspace applications, we need to check if the
	 * user wanted to control the device address and we assume that
	 * if he sets the device address back to be based on GID index 0,
	 * he no longer wishes to control it.
	 *
	 * If the user doesn't control the device address,
	 * IPOIB_FLAG_DEV_ADDR_SET is set and ib_find_gid failed, it means
	 * the port GUID has changed and GID at index 0 has changed
	 * so we need to change priv->local_gid and priv->dev->dev_addr
	 * to reflect the new GID.
	 */
	if (!test_bit(IPOIB_FLAG_DEV_ADDR_SET, &priv->flags)) {
		if (!err && port == priv->port) {
			set_bit(IPOIB_FLAG_DEV_ADDR_SET, &priv->flags);
			if (index == 0)
				clear_bit(IPOIB_FLAG_DEV_ADDR_CTRL,
					  &priv->flags);
			else
				set_bit(IPOIB_FLAG_DEV_ADDR_CTRL, &priv->flags);
			ret = true;
		} else {
			ret = false;
		}
	} else {
		if (!err && port == priv->port) {
			ret = true;
		} else {
			if (!test_bit(IPOIB_FLAG_DEV_ADDR_CTRL, &priv->flags)) {
				memcpy(&priv->local_gid, &gid0,
				       sizeof(priv->local_gid));
				memcpy(priv->dev->dev_addr + 4, &gid0,
				       sizeof(priv->local_gid));
				ret = true;
			}
		}
	}

out:
	netif_addr_unlock_bh(priv->dev);

	return ret;
}

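/*
 * Core of the flush machinery: recurses into child interfaces, updates
 * the P_Key if needed, and restarts the device according to @level
 * (LIGHT, NORMAL or HEAVY).
 */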
static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv,
				enum ipoib_flush_level level,
				int nesting)
{
	struct ipoib_dev_priv *cpriv;
	struct net_device *dev = priv->dev;
	int result;

	down_read_nested(&priv->vlan_rwsem, nesting);

	/*
	 * Flush any child interfaces too -- they might be up even if
	 * the parent is down.
	 */
	list_for_each_entry(cpriv, &priv->child_intfs, list)
		__ipoib_ib_dev_flush(cpriv, level, nesting + 1);

	up_read(&priv->vlan_rwsem);

	if (!test_bit(IPOIB_FLAG_INITIALIZED, &priv->flags) &&
	    level != IPOIB_FLUSH_HEAVY) {
		/* Make sure the dev_addr is set even if not flushing */
		if (level == IPOIB_FLUSH_LIGHT)
			ipoib_dev_addr_changed_valid(priv);
		ipoib_dbg(priv, "Not flushing - IPOIB_FLAG_INITIALIZED not set.\n");
		return;
	}

	if (!test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags)) {
		/* interface is down. update pkey and leave. */
		if (level == IPOIB_FLUSH_HEAVY) {
			if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags))
				update_parent_pkey(priv);
			else
				update_child_pkey(priv);
		} else if (level == IPOIB_FLUSH_LIGHT)
			ipoib_dev_addr_changed_valid(priv);
		ipoib_dbg(priv, "Not flushing - IPOIB_FLAG_ADMIN_UP not set.\n");
		return;
	}

	if (level == IPOIB_FLUSH_HEAVY) {
		/* child devices chase their origin pkey value, while non-child
		 * (parent) devices should always take what is present in pkey
		 * index 0
		 */
		if (test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) {
			result = update_child_pkey(priv);
			if (result) {
				/* restart QP only if P_Key index is changed */
				ipoib_dbg(priv, "Not flushing - P_Key index not changed.\n");
				return;
			}

		} else {
			result = update_parent_pkey(priv);
			/* restart QP only if P_Key value changed */
			if (result) {
				ipoib_dbg(priv, "Not flushing - P_Key value not changed.\n");
				return;
			}
		}
	}

	if (level == IPOIB_FLUSH_LIGHT) {
		int oper_up;
		ipoib_mark_paths_invalid(dev);
		/* Set IPoIB operation as down to prevent races between:
		 * the flush flow which leaves MCG and on the fly joins
		 * which can happen during that time. mcast restart task
		 * should deal with join requests we missed.
		 */
		oper_up = test_and_clear_bit(IPOIB_FLAG_OPER_UP, &priv->flags);
		ipoib_mcast_dev_flush(dev);
		if (oper_up)
			set_bit(IPOIB_FLAG_OPER_UP, &priv->flags);
		ipoib_flush_ah(dev);
	}

	if (level >= IPOIB_FLUSH_NORMAL)
		ipoib_ib_dev_down(dev);

	if (level == IPOIB_FLUSH_HEAVY) {
		if (test_bit(IPOIB_FLAG_INITIALIZED, &priv->flags))
			ipoib_ib_dev_stop(dev);
		if (ipoib_ib_dev_open(dev) != 0)
			return;
		if (netif_queue_stopped(dev))
			netif_start_queue(dev);
	}

	/*
	 * The device could have been brought down between the start and when
	 * we get here, don't bring it back up if it's not configured up
	 */
	if (test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags)) {
		if (level >= IPOIB_FLUSH_NORMAL)
			ipoib_ib_dev_up(dev);
		if (ipoib_dev_addr_changed_valid(priv))
			ipoib_mcast_restart_task(&priv->restart_task);
	}
}

void ipoib_ib_dev_flush_light(struct work_struct *work)
{
	struct ipoib_dev_priv *priv =
		container_of(work, struct ipoib_dev_priv, flush_light);

	__ipoib_ib_dev_flush(priv, IPOIB_FLUSH_LIGHT, 0);
}

void ipoib_ib_dev_flush_normal(struct work_struct *work)
{
	struct ipoib_dev_priv *priv =
		container_of(work, struct ipoib_dev_priv, flush_normal);

	__ipoib_ib_dev_flush(priv, IPOIB_FLUSH_NORMAL, 0);
}

void ipoib_ib_dev_flush_heavy(struct work_struct *work)
{
	struct ipoib_dev_priv *priv =
		container_of(work, struct ipoib_dev_priv, flush_heavy);

	__ipoib_ib_dev_flush(priv, IPOIB_FLUSH_HEAVY, 0);
}

void ipoib_ib_dev_cleanup(struct net_device *dev)
{
	struct ipoib_dev_priv *priv = ipoib_priv(dev);

	ipoib_dbg(priv, "cleaning up ib_dev\n");
	/*
	 * We must make sure there are no more (path) completions
	 * that may wish to touch priv fields that are no longer valid
	 */
	ipoib_flush_paths(dev);

	ipoib_mcast_stop_thread(dev);
	ipoib_mcast_dev_flush(dev);

	/*
	 * All of our ah references aren't free until after
	 * ipoib_mcast_dev_flush(), ipoib_flush_paths, and
	 * the neighbor garbage collection is stopped and reaped.
	 * That should all be done now, so make a final ah flush.
	 */
	ipoib_stop_ah(dev);
D
Doug Ledford 已提交
1262

	clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags);

	priv->rn_ops->ndo_uninit(dev);

	if (priv->pd) {
		ib_dealloc_pd(priv->pd);
		priv->pd = NULL;
	}
}