ipoib_main.c 44.6 KB
Newer Older
L
Linus Torvalds 已提交
1 2
/*
 * Copyright (c) 2004 Topspin Communications.  All rights reserved.
3 4
 * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
 * Copyright (c) 2004 Voltaire, Inc. All rights reserved.
L
Linus Torvalds 已提交
5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "ipoib.h"

#include <linux/module.h>

#include <linux/init.h>
#include <linux/slab.h>
41
#include <linux/kernel.h>
42
#include <linux/vmalloc.h>
L
Linus Torvalds 已提交
43 44 45 46 47 48

#include <linux/if_arp.h>	/* For ARPHRD_xxx */

#include <linux/ip.h>
#include <linux/in.h>

49 50
#include <linux/jhash.h>
#include <net/arp.h>
51

52 53 54 55
#define DRV_VERSION "1.0.0"

const char ipoib_driver_version[] = DRV_VERSION;

L
Linus Torvalds 已提交
56 57 58
MODULE_AUTHOR("Roland Dreier");
MODULE_DESCRIPTION("IP-over-InfiniBand net driver");
MODULE_LICENSE("Dual BSD/GPL");
59
MODULE_VERSION(DRV_VERSION);
L
Linus Torvalds 已提交
60

61 62 63 64 65 66 67 68
/*
 * Send and receive queue depths.  They default to the ring-size constants
 * and are exposed as the "send_queue_size" / "recv_queue_size" module
 * parameters with mode 0444 (readable, not writable through sysfs).
 */
int ipoib_sendq_size __read_mostly = IPOIB_TX_RING_SIZE;
int ipoib_recvq_size __read_mostly = IPOIB_RX_RING_SIZE;

module_param_named(send_queue_size, ipoib_sendq_size, int, 0444);
MODULE_PARM_DESC(send_queue_size, "Number of descriptors in send queue");
module_param_named(recv_queue_size, ipoib_recvq_size, int, 0444);
MODULE_PARM_DESC(recv_queue_size, "Number of descriptors in receive queue");

L
Linus Torvalds 已提交
69 70 71 72 73 74 75
#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
/*
 * Debug verbosity, only built when CONFIG_INFINIBAND_IPOIB_DEBUG is set.
 * Exposed as the "debug_level" module parameter with mode 0644.
 */
int ipoib_debug_level;

module_param_named(debug_level, ipoib_debug_level, int, 0644);
MODULE_PARM_DESC(debug_level, "Enable debug tracing if > 0");
#endif

76 77 78 79 80
/*
 * Cursor used by the ipoib_path_iter_*() helpers to walk a device's path
 * tree one entry at a time.
 */
struct ipoib_path_iter {
	struct net_device *dev;		/* device whose path tree is walked */
	struct ipoib_path  path;	/* copy of the entry last visited */
};

L
Linus Torvalds 已提交
81 82 83 84 85 86 87 88
/*
 * 20-byte hardware address used for IPv4 broadcast.  Elsewhere in this file
 * the leading 4 bytes of a hardware address are consumed through IPOIB_QPN()
 * and the 16-byte GID is taken from offset 4.
 * NOTE(review): bytes 8-9 look like the P_Key slot that ipoib_start_xmit()
 * overwrites for multicast - confirm against the users of this table.
 */
static const u8 ipv4_bcast_addr[] = {
	0x00, 0xff, 0xff, 0xff,
	0xff, 0x12, 0x40, 0x1b,	0x00, 0x00, 0x00, 0x00,
	0x00, 0x00, 0x00, 0x00,	0xff, 0xff, 0xff, 0xff
};

struct workqueue_struct *ipoib_workqueue;

89 90
struct ib_sa_client ipoib_sa_client;

L
Linus Torvalds 已提交
91
static void ipoib_add_one(struct ib_device *device);
92
static void ipoib_remove_one(struct ib_device *device, void *client_data);
93
static void ipoib_neigh_reclaim(struct rcu_head *rp);
L
Linus Torvalds 已提交
94 95 96 97 98 99 100 101 102 103 104 105 106

/*
 * IB client descriptor: names this driver "ipoib" and supplies the
 * per-device add/remove callbacks declared above.
 * NOTE(review): presumably registered with the IB core in the module init
 * path, which is outside this excerpt.
 */
static struct ib_client ipoib_client = {
	.name   = "ipoib",
	.add    = ipoib_add_one,
	.remove = ipoib_remove_one
};

int ipoib_open(struct net_device *dev)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);

	ipoib_dbg(priv, "bringing up interface\n");

107 108
	netif_carrier_off(dev);

109
	set_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags);
L
Linus Torvalds 已提交
110

111
	if (ipoib_ib_dev_open(dev)) {
112 113
		if (!test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags))
			return 0;
114
		goto err_disable;
115
	}
116

117 118
	if (ipoib_ib_dev_up(dev))
		goto err_stop;
L
Linus Torvalds 已提交
119 120 121 122 123

	if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) {
		struct ipoib_dev_priv *cpriv;

		/* Bring up any child interfaces too */
124
		down_read(&priv->vlan_rwsem);
L
Linus Torvalds 已提交
125 126 127 128 129 130 131 132 133
		list_for_each_entry(cpriv, &priv->child_intfs, list) {
			int flags;

			flags = cpriv->dev->flags;
			if (flags & IFF_UP)
				continue;

			dev_change_flags(cpriv->dev, flags | IFF_UP);
		}
134
		up_read(&priv->vlan_rwsem);
L
Linus Torvalds 已提交
135 136 137 138 139
	}

	netif_start_queue(dev);

	return 0;
140 141

err_stop:
142
	ipoib_ib_dev_stop(dev);
143 144 145 146 147

err_disable:
	clear_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags);

	return -EINVAL;
L
Linus Torvalds 已提交
148 149 150 151 152 153 154 155 156 157 158 159
}

static int ipoib_stop(struct net_device *dev)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);

	ipoib_dbg(priv, "stopping interface\n");

	clear_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags);

	netif_stop_queue(dev);

160 161
	ipoib_ib_dev_down(dev);
	ipoib_ib_dev_stop(dev);
L
Linus Torvalds 已提交
162 163 164 165 166

	if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) {
		struct ipoib_dev_priv *cpriv;

		/* Bring down any child interfaces too */
167
		down_read(&priv->vlan_rwsem);
L
Linus Torvalds 已提交
168 169 170 171 172 173 174 175 176
		list_for_each_entry(cpriv, &priv->child_intfs, list) {
			int flags;

			flags = cpriv->dev->flags;
			if (!(flags & IFF_UP))
				continue;

			dev_change_flags(cpriv->dev, flags & ~IFF_UP);
		}
177
		up_read(&priv->vlan_rwsem);
L
Linus Torvalds 已提交
178 179 180 181 182
	}

	return 0;
}

O
Or Gerlitz 已提交
183 184 185 186 187
/* Device uninit hook: simply forwards to ipoib_dev_cleanup(). */
static void ipoib_uninit(struct net_device *dev)
{
	ipoib_dev_cleanup(dev);
}

188
/*
 * ipoib_fix_features - adjust the requested feature set for this device.
 * @dev:      the network device.
 * @features: feature mask being requested.
 *
 * When IPOIB_FLAG_ADMIN_CM is set, NETIF_F_IP_CSUM and NETIF_F_TSO are
 * masked out; otherwise the mask is returned unchanged.
 */
static netdev_features_t ipoib_fix_features(struct net_device *dev, netdev_features_t features)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);

	if (test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags))
		features &= ~(NETIF_F_IP_CSUM | NETIF_F_TSO);

	return features;
}

L
Linus Torvalds 已提交
198 199 200 201
static int ipoib_change_mtu(struct net_device *dev, int new_mtu)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);

202
	/* dev->mtu > 2K ==> connected mode */
203 204 205 206
	if (ipoib_cm_admin_enabled(dev)) {
		if (new_mtu > ipoib_cm_max_mtu(dev))
			return -EINVAL;

207 208 209
		if (new_mtu > priv->mcast_mtu)
			ipoib_warn(priv, "mtu > %d will cause multicast packet drops.\n",
				   priv->mcast_mtu);
210

211 212 213 214
		dev->mtu = new_mtu;
		return 0;
	}

215
	if (new_mtu > IPOIB_UD_MTU(priv->max_ib_mtu))
L
Linus Torvalds 已提交
216 217 218 219 220 221 222 223 224
		return -EINVAL;

	priv->admin_mtu = new_mtu;

	dev->mtu = min(priv->mcast_mtu, priv->admin_mtu);

	return 0;
}

225 226 227 228 229 230 231 232 233 234
/*
 * ipoib_set_mode - switch between "connected" and "datagram" mode.
 * @dev: the network device.
 * @buf: mode string; must be exactly "connected\n" or "datagram\n".
 *
 * "connected" is honoured only when IPOIB_CM_SUPPORTED(dev->dev_addr) is
 * true.  Either switch updates the feature set and MTU and then flushes all
 * paths so that connections are restarted.  The RTNL lock is dropped around
 * ipoib_flush_paths() and re-taken before returning, so this appears to
 * expect the caller to hold RTNL on entry.
 *
 * Returns 0 on success, -EINVAL for any other (or unsupported) mode string.
 */
int ipoib_set_mode(struct net_device *dev, const char *buf)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);

	/* flush paths if we switch modes so that connections are restarted */
	if (IPOIB_CM_SUPPORTED(dev->dev_addr) && !strcmp(buf, "connected\n")) {
		set_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags);
		ipoib_warn(priv, "enabling connected mode "
			   "will cause multicast packet drops\n");
		netdev_update_features(dev);
		dev_set_mtu(dev, ipoib_cm_max_mtu(dev));
		rtnl_unlock();
		priv->tx_wr.send_flags &= ~IB_SEND_IP_CSUM;

		ipoib_flush_paths(dev);
		rtnl_lock();
		return 0;
	}

	if (!strcmp(buf, "datagram\n")) {
		clear_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags);
		netdev_update_features(dev);
		dev_set_mtu(dev, min(priv->mcast_mtu, dev->mtu));
		rtnl_unlock();
		ipoib_flush_paths(dev);
		rtnl_lock();
		return 0;
	}

	return -EINVAL;
}

257
static struct ipoib_path *__path_find(struct net_device *dev, void *gid)
L
Linus Torvalds 已提交
258 259 260 261 262 263 264 265 266
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);
	struct rb_node *n = priv->path_tree.rb_node;
	struct ipoib_path *path;
	int ret;

	while (n) {
		path = rb_entry(n, struct ipoib_path, rb_node);

267
		ret = memcmp(gid, path->pathrec.dgid.raw,
L
Linus Torvalds 已提交
268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317
			     sizeof (union ib_gid));

		if (ret < 0)
			n = n->rb_left;
		else if (ret > 0)
			n = n->rb_right;
		else
			return path;
	}

	return NULL;
}

/*
 * __path_add - insert a path into the device's path tree and path list.
 * @dev:  the network device owning the tree.
 * @path: the path to insert, keyed by its destination GID.
 *
 * Returns 0 after linking the path into priv->path_tree and appending it to
 * priv->path_list, or -EEXIST (leaving both untouched) if an entry with the
 * same destination GID is already present.
 */
static int __path_add(struct net_device *dev, struct ipoib_path *path)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);
	struct rb_node **link = &priv->path_tree.rb_node;
	struct rb_node *parent = NULL;

	/* Descend to the leaf slot where the new node belongs */
	while (*link) {
		struct ipoib_path *cur;
		int cmp;

		parent = *link;
		cur = rb_entry(parent, struct ipoib_path, rb_node);

		cmp = memcmp(path->pathrec.dgid.raw, cur->pathrec.dgid.raw,
			     sizeof (union ib_gid));
		if (!cmp)
			return -EEXIST;

		link = cmp < 0 ? &parent->rb_left : &parent->rb_right;
	}

	rb_link_node(&path->rb_node, parent, link);
	rb_insert_color(&path->rb_node, &priv->path_tree);

	list_add_tail(&path->list, &priv->path_list);

	return 0;
}

/*
 * path_free - release a path and everything hanging off it.
 * @dev:  the network device the path belongs to.
 * @path: the path to free; must not be used afterwards.
 *
 * Frees every skb still queued on the path, deletes all neighbours that
 * share the path's destination GID, drops the path's address-handle
 * reference (if any) and frees the path itself.
 */
static void path_free(struct net_device *dev, struct ipoib_path *path)
{
	struct sk_buff *skb;

	while ((skb = __skb_dequeue(&path->queue)))
		dev_kfree_skb_irq(skb);

	ipoib_dbg(netdev_priv(dev), "path_free\n");

	/* remove all neigh connected to this path */
	ipoib_del_neighs_by_gid(dev, path->pathrec.dgid.raw);

	if (path->ah)
		ipoib_put_ah(path->ah);

	kfree(path);
}

329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386
#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG

/*
 * ipoib_path_iter_init - allocate a path iterator positioned on the first
 * path of @dev.
 *
 * The iterator's GID is zeroed so that the first ipoib_path_iter_next()
 * picks the first tree entry whose GID compares greater than all-zero.
 *
 * Returns the new iterator, or NULL if allocation fails or no such path
 * exists.  The caller owns the returned iterator.
 */
struct ipoib_path_iter *ipoib_path_iter_init(struct net_device *dev)
{
	struct ipoib_path_iter *it = kmalloc(sizeof *it, GFP_KERNEL);

	if (it == NULL)
		return NULL;

	it->dev = dev;
	memset(it->path.pathrec.dgid.raw, 0, 16);

	/* Success: the iterator now holds a copy of the first path */
	if (!ipoib_path_iter_next(it))
		return it;

	kfree(it);
	return NULL;
}

/*
 * ipoib_path_iter_next - advance the iterator to the next path.
 *
 * Under priv->lock, scans the path tree in order from rb_first() and copies
 * the first entry whose destination GID compares greater than the GID
 * currently stored in @iter.
 *
 * Returns 0 if the iterator was advanced, 1 if there is no further path.
 */
int ipoib_path_iter_next(struct ipoib_path_iter *iter)
{
	struct ipoib_dev_priv *priv = netdev_priv(iter->dev);
	struct rb_node *node;
	int ret = 1;

	spin_lock_irq(&priv->lock);

	for (node = rb_first(&priv->path_tree); node; node = rb_next(node)) {
		struct ipoib_path *cand =
			rb_entry(node, struct ipoib_path, rb_node);

		/* Skip entries at or before the current position */
		if (memcmp(iter->path.pathrec.dgid.raw, cand->pathrec.dgid.raw,
			   sizeof (union ib_gid)) >= 0)
			continue;

		iter->path = *cand;
		ret = 0;
		break;
	}

	spin_unlock_irq(&priv->lock);

	return ret;
}

/* Copy the path the iterator currently holds into the caller's @path. */
void ipoib_path_iter_read(struct ipoib_path_iter *iter,
			  struct ipoib_path *path)
{
	*path = iter->path;
}

#endif /* CONFIG_INFINIBAND_IPOIB_DEBUG */

387 388 389 390 391 392 393 394
/*
 * ipoib_mark_paths_invalid - clear the valid flag on every known path.
 * @dev: the network device whose path list is walked.
 *
 * Runs under priv->lock.  Paths stay in the list and tree; only
 * path->valid is reset to 0.
 */
void ipoib_mark_paths_invalid(struct net_device *dev)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);
	struct ipoib_path *path, *tp;

	spin_lock_irq(&priv->lock);

	list_for_each_entry_safe(path, tp, &priv->path_list, list) {
		ipoib_dbg(priv, "mark path LID 0x%04x GID %pI6 invalid\n",
			be16_to_cpu(path->pathrec.dlid),
			path->pathrec.dgid.raw);
		path->valid = 0;
	}

	spin_unlock_irq(&priv->lock);
}

L
Linus Torvalds 已提交
404 405 406 407 408
void ipoib_flush_paths(struct net_device *dev)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);
	struct ipoib_path *path, *tp;
	LIST_HEAD(remove_list);
409
	unsigned long flags;
L
Linus Torvalds 已提交
410

411 412
	netif_tx_lock_bh(dev);
	spin_lock_irqsave(&priv->lock, flags);
L
Linus Torvalds 已提交
413

414
	list_splice_init(&priv->path_list, &remove_list);
L
Linus Torvalds 已提交
415 416 417 418 419 420 421

	list_for_each_entry(path, &remove_list, list)
		rb_erase(&path->rb_node, &priv->path_tree);

	list_for_each_entry_safe(path, tp, &remove_list, list) {
		if (path->query)
			ib_sa_cancel_query(path->query_id, path->query);
422 423
		spin_unlock_irqrestore(&priv->lock, flags);
		netif_tx_unlock_bh(dev);
L
Linus Torvalds 已提交
424 425
		wait_for_completion(&path->done);
		path_free(dev, path);
426 427
		netif_tx_lock_bh(dev);
		spin_lock_irqsave(&priv->lock, flags);
L
Linus Torvalds 已提交
428
	}
429 430 431

	spin_unlock_irqrestore(&priv->lock, flags);
	netif_tx_unlock_bh(dev);
L
Linus Torvalds 已提交
432 433 434 435 436 437 438 439 440 441
}

static void path_rec_completion(int status,
				struct ib_sa_path_rec *pathrec,
				void *path_ptr)
{
	struct ipoib_path *path = path_ptr;
	struct net_device *dev = path->dev;
	struct ipoib_dev_priv *priv = netdev_priv(dev);
	struct ipoib_ah *ah = NULL;
442
	struct ipoib_ah *old_ah = NULL;
443
	struct ipoib_neigh *neigh, *tn;
L
Linus Torvalds 已提交
444 445 446 447
	struct sk_buff_head skqueue;
	struct sk_buff *skb;
	unsigned long flags;

448
	if (!status)
H
Harvey Harrison 已提交
449
		ipoib_dbg(priv, "PathRec LID 0x%04x for GID %pI6\n",
450
			  be16_to_cpu(pathrec->dlid), pathrec->dgid.raw);
L
Linus Torvalds 已提交
451
	else
H
Harvey Harrison 已提交
452
		ipoib_dbg(priv, "PathRec status %d for GID %pI6\n",
453
			  status, path->pathrec.dgid.raw);
L
Linus Torvalds 已提交
454 455 456 457

	skb_queue_head_init(&skqueue);

	if (!status) {
458 459 460 461
		struct ib_ah_attr av;

		if (!ib_init_ah_from_path(priv->ca, priv->port, pathrec, &av))
			ah = ipoib_create_ah(dev, priv->pd, &av);
L
Linus Torvalds 已提交
462 463 464 465
	}

	spin_lock_irqsave(&priv->lock, flags);

466
	if (!IS_ERR_OR_NULL(ah)) {
L
Linus Torvalds 已提交
467 468
		path->pathrec = *pathrec;

469 470 471
		old_ah   = path->ah;
		path->ah = ah;

L
Linus Torvalds 已提交
472 473 474 475 476 477
		ipoib_dbg(priv, "created address handle %p for LID 0x%04x, SL %d\n",
			  ah, be16_to_cpu(pathrec->dlid), pathrec->sl);

		while ((skb = __skb_dequeue(&path->queue)))
			__skb_queue_tail(&skqueue, skb);

478
		list_for_each_entry_safe(neigh, tn, &path->neigh_list, list) {
479 480 481 482 483 484 485 486 487 488 489
			if (neigh->ah) {
				WARN_ON(neigh->ah != old_ah);
				/*
				 * Dropping the ah reference inside
				 * priv->lock is safe here, because we
				 * will hold one more reference from
				 * the original value of path->ah (ie
				 * old_ah).
				 */
				ipoib_put_ah(neigh->ah);
			}
L
Linus Torvalds 已提交
490 491 492
			kref_get(&path->ah->ref);
			neigh->ah = path->ah;

493
			if (ipoib_cm_enabled(dev, neigh->daddr)) {
494 495 496 497 498
				if (!ipoib_cm_get(neigh))
					ipoib_cm_set(neigh, ipoib_cm_create_tx(dev,
									       path,
									       neigh));
				if (!ipoib_cm_get(neigh)) {
499
					ipoib_neigh_free(neigh);
500 501 502 503
					continue;
				}
			}

L
Linus Torvalds 已提交
504 505 506
			while ((skb = __skb_dequeue(&neigh->queue)))
				__skb_queue_tail(&skqueue, skb);
		}
507
		path->valid = 1;
508
	}
L
Linus Torvalds 已提交
509

510
	path->query = NULL;
L
Linus Torvalds 已提交
511 512 513 514
	complete(&path->done);

	spin_unlock_irqrestore(&priv->lock, flags);

515 516 517
	if (IS_ERR_OR_NULL(ah))
		ipoib_del_neighs_by_gid(dev, path->pathrec.dgid.raw);

518 519 520
	if (old_ah)
		ipoib_put_ah(old_ah);

L
Linus Torvalds 已提交
521 522 523 524 525 526 527 528
	while ((skb = __skb_dequeue(&skqueue))) {
		skb->dev = dev;
		if (dev_queue_xmit(skb))
			ipoib_warn(priv, "dev_queue_xmit failed "
				   "to requeue packet\n");
	}
}

529
static struct ipoib_path *path_rec_create(struct net_device *dev, void *gid)
L
Linus Torvalds 已提交
530 531 532 533
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);
	struct ipoib_path *path;

534 535 536
	if (!priv->broadcast)
		return NULL;

537
	path = kzalloc(sizeof *path, GFP_ATOMIC);
L
Linus Torvalds 已提交
538 539 540
	if (!path)
		return NULL;

541
	path->dev = dev;
L
Linus Torvalds 已提交
542 543 544 545 546

	skb_queue_head_init(&path->queue);

	INIT_LIST_HEAD(&path->neigh_list);

547
	memcpy(path->pathrec.dgid.raw, gid, sizeof (union ib_gid));
548 549
	path->pathrec.sgid	    = priv->local_gid;
	path->pathrec.pkey	    = cpu_to_be16(priv->pkey);
550 551
	path->pathrec.numb_path     = 1;
	path->pathrec.traffic_class = priv->broadcast->mcmember.traffic_class;
L
Linus Torvalds 已提交
552 553 554 555 556 557 558 559 560

	return path;
}

static int path_rec_start(struct net_device *dev,
			  struct ipoib_path *path)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);

H
Harvey Harrison 已提交
561
	ipoib_dbg(priv, "Start path record lookup for %pI6\n",
562
		  path->pathrec.dgid.raw);
L
Linus Torvalds 已提交
563

564 565
	init_completion(&path->done);

L
Linus Torvalds 已提交
566
	path->query_id =
567
		ib_sa_path_rec_get(&ipoib_sa_client, priv->ca, priv->port,
L
Linus Torvalds 已提交
568 569 570 571
				   &path->pathrec,
				   IB_SA_PATH_REC_DGID		|
				   IB_SA_PATH_REC_SGID		|
				   IB_SA_PATH_REC_NUMB_PATH	|
572
				   IB_SA_PATH_REC_TRAFFIC_CLASS |
L
Linus Torvalds 已提交
573 574 575 576 577
				   IB_SA_PATH_REC_PKEY,
				   1000, GFP_ATOMIC,
				   path_rec_completion,
				   path, &path->query);
	if (path->query_id < 0) {
578
		ipoib_warn(priv, "ib_sa_path_rec_get failed: %d\n", path->query_id);
L
Linus Torvalds 已提交
579
		path->query = NULL;
580
		complete(&path->done);
L
Linus Torvalds 已提交
581 582 583 584 585 586
		return path->query_id;
	}

	return 0;
}

587 588
/*
 * neigh_add_path - create a neighbour for @daddr, attach it to its path and
 * send or queue @skb.
 * @skb:   the packet to transmit; always consumed.
 * @daddr: destination hardware address (GID starts at offset 4).
 * @dev:   the network device.
 *
 * Runs under priv->lock.  The neighbour is obtained through
 * ipoib_neigh_alloc() and linked onto the path for the destination GID,
 * creating the path if necessary.
 *
 *  - Path has an address handle, connected mode enabled: ensure a CM TX
 *    context exists and queue the skb on the neighbour (bounded by
 *    IPOIB_MAX_PATH_REC_QUEUE).
 *  - Path has an address handle, datagram mode: drop the lock and send.
 *  - Path not resolved yet: start a path record query if none is pending
 *    and queue the skb on the neighbour.
 *
 * On any failure the skb is freed and tx_dropped is incremented.  The
 * reference taken by ipoib_neigh_alloc() is released before returning.
 */
static void neigh_add_path(struct sk_buff *skb, u8 *daddr,
			   struct net_device *dev)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);
	struct ipoib_path *path;
	struct ipoib_neigh *neigh;
	unsigned long flags;

	spin_lock_irqsave(&priv->lock, flags);
	neigh = ipoib_neigh_alloc(daddr, dev);
	if (!neigh) {
		spin_unlock_irqrestore(&priv->lock, flags);
		++dev->stats.tx_dropped;
		dev_kfree_skb_any(skb);
		return;
	}

	path = __path_find(dev, daddr + 4);
	if (!path) {
		path = path_rec_create(dev, daddr + 4);
		if (!path)
			goto err_path;

		__path_add(dev, path);
	}

	list_add_tail(&neigh->list, &path->neigh_list);

	if (path->ah) {
		kref_get(&path->ah->ref);
		neigh->ah = path->ah;

		if (ipoib_cm_enabled(dev, neigh->daddr)) {
			if (!ipoib_cm_get(neigh))
				ipoib_cm_set(neigh, ipoib_cm_create_tx(dev, path, neigh));
			if (!ipoib_cm_get(neigh)) {
				ipoib_neigh_free(neigh);
				goto err_drop;
			}
			if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE)
				__skb_queue_tail(&neigh->queue, skb);
			else {
				ipoib_warn(priv, "queue length limit %d. Packet drop.\n",
					   skb_queue_len(&neigh->queue));
				goto err_drop;
			}
		} else {
			spin_unlock_irqrestore(&priv->lock, flags);
			ipoib_send(dev, skb, path->ah, IPOIB_QPN(daddr));
			ipoib_neigh_put(neigh);
			return;
		}
	} else {
		neigh->ah  = NULL;

		if (!path->query && path_rec_start(dev, path))
			goto err_path;
		if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE)
			__skb_queue_tail(&neigh->queue, skb);
		else
			goto err_drop;
	}

	spin_unlock_irqrestore(&priv->lock, flags);
	ipoib_neigh_put(neigh);
	return;

err_path:
	ipoib_neigh_free(neigh);
err_drop:
	++dev->stats.tx_dropped;
	dev_kfree_skb_any(skb);

	spin_unlock_irqrestore(&priv->lock, flags);
	ipoib_neigh_put(neigh);
}

static void unicast_arp_send(struct sk_buff *skb, struct net_device *dev,
665
			     struct ipoib_cb *cb)
L
Linus Torvalds 已提交
666 667 668
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);
	struct ipoib_path *path;
669
	unsigned long flags;
L
Linus Torvalds 已提交
670

671
	spin_lock_irqsave(&priv->lock, flags);
L
Linus Torvalds 已提交
672

673
	path = __path_find(dev, cb->hwaddr + 4);
674
	if (!path || !path->valid) {
675 676 677
		int new_path = 0;

		if (!path) {
678
			path = path_rec_create(dev, cb->hwaddr + 4);
679 680
			new_path = 1;
		}
L
Linus Torvalds 已提交
681
		if (path) {
682 683 684 685 686 687
			if (skb_queue_len(&path->queue) < IPOIB_MAX_PATH_REC_QUEUE) {
				__skb_queue_tail(&path->queue, skb);
			} else {
				++dev->stats.tx_dropped;
				dev_kfree_skb_any(skb);
			}
L
Linus Torvalds 已提交
688

689
			if (!path->query && path_rec_start(dev, path)) {
690
				spin_unlock_irqrestore(&priv->lock, flags);
691 692
				if (new_path)
					path_free(dev, path);
L
Linus Torvalds 已提交
693 694 695 696
				return;
			} else
				__path_add(dev, path);
		} else {
697
			++dev->stats.tx_dropped;
L
Linus Torvalds 已提交
698 699 700
			dev_kfree_skb_any(skb);
		}

701
		spin_unlock_irqrestore(&priv->lock, flags);
L
Linus Torvalds 已提交
702 703 704
		return;
	}

705
	if (path->ah) {
L
Linus Torvalds 已提交
706 707 708
		ipoib_dbg(priv, "Send unicast ARP to %04x\n",
			  be16_to_cpu(path->pathrec.dlid));

709
		spin_unlock_irqrestore(&priv->lock, flags);
710
		ipoib_send(dev, skb, path->ah, IPOIB_QPN(cb->hwaddr));
711
		return;
L
Linus Torvalds 已提交
712 713 714 715
	} else if ((path->query || !path_rec_start(dev, path)) &&
		   skb_queue_len(&path->queue) < IPOIB_MAX_PATH_REC_QUEUE) {
		__skb_queue_tail(&path->queue, skb);
	} else {
716
		++dev->stats.tx_dropped;
L
Linus Torvalds 已提交
717 718 719
		dev_kfree_skb_any(skb);
	}

720
	spin_unlock_irqrestore(&priv->lock, flags);
L
Linus Torvalds 已提交
721 722 723 724 725 726
}

static int ipoib_start_xmit(struct sk_buff *skb, struct net_device *dev)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);
	struct ipoib_neigh *neigh;
727
	struct ipoib_cb *cb = ipoib_skb_cb(skb);
728
	struct ipoib_header *header;
L
Linus Torvalds 已提交
729 730
	unsigned long flags;

731 732 733 734 735 736 737
	header = (struct ipoib_header *) skb->data;

	if (unlikely(cb->hwaddr[4] == 0xff)) {
		/* multicast, arrange "if" according to probability */
		if ((header->proto != htons(ETH_P_IP)) &&
		    (header->proto != htons(ETH_P_IPV6)) &&
		    (header->proto != htons(ETH_P_ARP)) &&
738 739
		    (header->proto != htons(ETH_P_RARP)) &&
		    (header->proto != htons(ETH_P_TIPC))) {
740
			/* ethertype not supported by IPoIB */
741 742
			++dev->stats.tx_dropped;
			dev_kfree_skb_any(skb);
743
			return NETDEV_TX_OK;
744
		}
745 746 747 748 749 750 751 752 753
		/* Add in the P_Key for multicast*/
		cb->hwaddr[8] = (priv->pkey >> 8) & 0xff;
		cb->hwaddr[9] = priv->pkey & 0xff;

		neigh = ipoib_neigh_get(dev, cb->hwaddr);
		if (likely(neigh))
			goto send_using_neigh;
		ipoib_mcast_send(dev, cb->hwaddr, skb);
		return NETDEV_TX_OK;
754
	}
L
Linus Torvalds 已提交
755

756 757 758 759
	/* unicast, arrange "switch" according to probability */
	switch (header->proto) {
	case htons(ETH_P_IP):
	case htons(ETH_P_IPV6):
760
	case htons(ETH_P_TIPC):
761 762 763 764
		neigh = ipoib_neigh_get(dev, cb->hwaddr);
		if (unlikely(!neigh)) {
			neigh_add_path(skb, cb->hwaddr, dev);
			return NETDEV_TX_OK;
765
		}
766 767 768 769 770 771 772 773 774 775 776 777
		break;
	case htons(ETH_P_ARP):
	case htons(ETH_P_RARP):
		/* for unicast ARP and RARP should always perform path find */
		unicast_arp_send(skb, dev, cb);
		return NETDEV_TX_OK;
	default:
		/* ethertype not supported by IPoIB */
		++dev->stats.tx_dropped;
		dev_kfree_skb_any(skb);
		return NETDEV_TX_OK;
	}
778

779 780 781 782 783 784
send_using_neigh:
	/* note we now hold a ref to neigh */
	if (ipoib_cm_get(neigh)) {
		if (ipoib_cm_up(neigh)) {
			ipoib_cm_send(dev, skb, ipoib_cm_get(neigh));
			goto unref;
L
Linus Torvalds 已提交
785
		}
786 787 788 789
	} else if (neigh->ah) {
		ipoib_send(dev, skb, neigh->ah, IPOIB_QPN(cb->hwaddr));
		goto unref;
	}
L
Linus Torvalds 已提交
790

791 792 793 794
	if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE) {
		spin_lock_irqsave(&priv->lock, flags);
		__skb_queue_tail(&neigh->queue, skb);
		spin_unlock_irqrestore(&priv->lock, flags);
L
Linus Torvalds 已提交
795
	} else {
796 797 798
		++dev->stats.tx_dropped;
		dev_kfree_skb_any(skb);
	}
L
Linus Torvalds 已提交
799

800 801
unref:
	ipoib_neigh_put(neigh);
L
Linus Torvalds 已提交
802 803 804 805 806 807 808 809

	return NETDEV_TX_OK;
}

static void ipoib_timeout(struct net_device *dev)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);

810 811 812 813 814
	ipoib_warn(priv, "transmit timeout: latency %d msecs\n",
		   jiffies_to_msecs(jiffies - dev->trans_start));
	ipoib_warn(priv, "queue stopped %d, tx_head %u, tx_tail %u\n",
		   netif_queue_stopped(dev),
		   priv->tx_head, priv->tx_tail);
L
Linus Torvalds 已提交
815 816 817 818 819 820
	/* XXX reset QP, etc. */
}

static int ipoib_hard_header(struct sk_buff *skb,
			     struct net_device *dev,
			     unsigned short type,
821
			     const void *daddr, const void *saddr, unsigned len)
L
Linus Torvalds 已提交
822 823
{
	struct ipoib_header *header;
824
	struct ipoib_cb *cb = ipoib_skb_cb(skb);
L
Linus Torvalds 已提交
825 826 827 828 829 830 831

	header = (struct ipoib_header *) skb_push(skb, sizeof *header);

	header->proto = htons(type);
	header->reserved = 0;

	/*
832
	 * we don't rely on dst_entry structure,  always stuff the
833 834
	 * destination address into skb->cb so we can figure out where
	 * to send the packet later.
L
Linus Torvalds 已提交
835
	 */
836
	memcpy(cb->hwaddr, daddr, INFINIBAND_ALEN);
L
Linus Torvalds 已提交
837

838
	return sizeof *header;
L
Linus Torvalds 已提交
839 840 841 842 843 844
}

static void ipoib_set_mcast_list(struct net_device *dev)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);

L
Leonid Arsh 已提交
845 846 847 848 849
	if (!test_bit(IPOIB_FLAG_OPER_UP, &priv->flags)) {
		ipoib_dbg(priv, "IPOIB_FLAG_OPER_UP not set");
		return;
	}

850
	queue_work(priv->wq, &priv->restart_task);
L
Linus Torvalds 已提交
851 852
}

853 854 855 856
static int ipoib_get_iflink(const struct net_device *dev)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);

E
Erez Shitrit 已提交
857 858 859 860 861
	/* parent interface */
	if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags))
		return dev->ifindex;

	/* child/vlan interface */
862 863 864
	return priv->parent->ifindex;
}

865
/*
 * ipoib_addr_hash - map a hardware address to a neighbour hash bucket.
 * @htbl:  the hash table; only its mask is used.
 * @daddr: hardware address, read as an array of 32-bit words.
 *
 * Returns the bucket index (jhash of the selected words, masked by
 * htbl->mask).
 */
static u32 ipoib_addr_hash(struct ipoib_neigh_hash *htbl, u8 *daddr)
{
	/*
	 * Use only the address parts that contributes to spreading
	 * The subnet prefix is not used as one can not connect to
	 * same remote port (GUID) using the same remote QPN via two
	 * different subnets.
	 */
	 /* qpn octets[1:4) & port GUID octets[12:20) */
	u32 *d32 = (u32 *) daddr;
	u32 hv;

	hv = jhash_3words(d32[3], d32[4], IPOIB_QPN_MASK & d32[0], 0);
	return hv & htbl->mask;
}

/*
 * ipoib_neigh_get - look up a neighbour by hardware address.
 * @dev:   the network device.
 * @daddr: hardware address, INFINIBAND_ALEN bytes.
 *
 * Performs the lookup inside an rcu_read_lock_bh() section.  On a hit a
 * reference is taken on behalf of the caller and the neighbour's "alive"
 * timestamp is refreshed.
 *
 * Returns the neighbour, or NULL if there is no hash table, no match, or
 * the matching entry's reference count has already dropped to zero.
 */
struct ipoib_neigh *ipoib_neigh_get(struct net_device *dev, u8 *daddr)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);
	struct ipoib_neigh_table *ntbl = &priv->ntbl;
	struct ipoib_neigh_hash *htbl;
	struct ipoib_neigh *neigh = NULL;
	u32 hash_val;

	rcu_read_lock_bh();

	htbl = rcu_dereference_bh(ntbl->htbl);
	if (htbl) {
		hash_val = ipoib_addr_hash(htbl, daddr);
		for (neigh = rcu_dereference_bh(htbl->buckets[hash_val]);
		     neigh != NULL;
		     neigh = rcu_dereference_bh(neigh->hnext)) {
			if (memcmp(daddr, neigh->daddr, INFINIBAND_ALEN) != 0)
				continue;

			/* found, take one ref on behalf of the caller */
			if (atomic_inc_not_zero(&neigh->refcnt))
				neigh->alive = jiffies;
			else
				neigh = NULL;	/* deleted */
			break;
		}
	}

	rcu_read_unlock_bh();
	return neigh;
}

static void __ipoib_reap_neigh(struct ipoib_dev_priv *priv)
{
	struct ipoib_neigh_table *ntbl = &priv->ntbl;
	struct ipoib_neigh_hash *htbl;
	unsigned long neigh_obsolete;
	unsigned long dt;
L
Linus Torvalds 已提交
923
	unsigned long flags;
924
	int i;
L
Linus Torvalds 已提交
925

926
	if (test_bit(IPOIB_STOP_NEIGH_GC, &priv->flags))
927
		return;
L
Linus Torvalds 已提交
928

929
	spin_lock_irqsave(&priv->lock, flags);
930 931

	htbl = rcu_dereference_protected(ntbl->htbl,
932
					 lockdep_is_held(&priv->lock));
933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948

	if (!htbl)
		goto out_unlock;

	/* neigh is obsolete if it was idle for two GC periods */
	dt = 2 * arp_tbl.gc_interval;
	neigh_obsolete = jiffies - dt;
	/* handle possible race condition */
	if (test_bit(IPOIB_STOP_NEIGH_GC, &priv->flags))
		goto out_unlock;

	for (i = 0; i < htbl->size; i++) {
		struct ipoib_neigh *neigh;
		struct ipoib_neigh __rcu **np = &htbl->buckets[i];

		while ((neigh = rcu_dereference_protected(*np,
949
							  lockdep_is_held(&priv->lock))) != NULL) {
950 951 952 953
			/* was the neigh idle for two GC periods */
			if (time_after(neigh_obsolete, neigh->alive)) {
				rcu_assign_pointer(*np,
						   rcu_dereference_protected(neigh->hnext,
954
									     lockdep_is_held(&priv->lock)));
955 956 957 958 959 960
				/* remove from path/mc list */
				list_del(&neigh->list);
				call_rcu(&neigh->rcu, ipoib_neigh_reclaim);
			} else {
				np = &neigh->hnext;
			}
L
Linus Torvalds 已提交
961

962 963
		}
	}
L
Linus Torvalds 已提交
964

965
out_unlock:
966
	spin_unlock_irqrestore(&priv->lock, flags);
967
}
L
Linus Torvalds 已提交
968

969 970 971 972 973 974 975 976
static void ipoib_reap_neigh(struct work_struct *work)
{
	struct ipoib_dev_priv *priv =
		container_of(work, struct ipoib_dev_priv, neigh_reap_task.work);

	__ipoib_reap_neigh(priv);

	if (!test_bit(IPOIB_STOP_NEIGH_GC, &priv->flags))
977
		queue_delayed_work(priv->wq, &priv->neigh_reap_task,
978
				   arp_tbl.gc_interval);
L
Linus Torvalds 已提交
979 980
}

981 982

/*
 * ipoib_neigh_ctor - allocate and initialise a neighbour entry.
 * @daddr: hardware address copied into the new entry.
 * @dev:   the network device the neighbour belongs to.
 *
 * The entry is zero-allocated with GFP_ATOMIC, its skb queue and list head
 * are initialised, its connected-mode TX context is cleared, and its
 * reference count starts at 1 on behalf of the caller.  The entry is not
 * inserted into the hash table here.
 *
 * Returns the new neighbour, or NULL on allocation failure.
 */
static struct ipoib_neigh *ipoib_neigh_ctor(u8 *daddr,
				      struct net_device *dev)
{
	struct ipoib_neigh *neigh;

	neigh = kzalloc(sizeof *neigh, GFP_ATOMIC);
	if (!neigh)
		return NULL;

	neigh->dev = dev;
	memcpy(&neigh->daddr, daddr, sizeof(neigh->daddr));
	skb_queue_head_init(&neigh->queue);
	INIT_LIST_HEAD(&neigh->list);
	ipoib_cm_set(neigh, NULL);
	/* one ref on behalf of the caller */
	atomic_set(&neigh->refcnt, 1);

	return neigh;
}

/*
 * ipoib_neigh_alloc - find or create the neighbour for a hardware address.
 * @daddr: hardware address, INFINIBAND_ALEN bytes.
 * @dev:   the network device.
 *
 * The hash table is dereferenced with lockdep_is_held(&priv->lock), i.e.
 * the caller is expected to hold priv->lock (neigh_add_path() does).  No
 * lock is taken or released here, despite the name of the exit label.
 *
 * An existing live entry is returned with an extra reference for the
 * caller.  Otherwise a new entry is constructed, given a second reference
 * on behalf of the hash table, inserted at the head of its bucket, and
 * counted in ntbl->entries.
 *
 * Returns the neighbour (with one reference owned by the caller), or NULL
 * if there is no hash table or allocation fails.
 */
struct ipoib_neigh *ipoib_neigh_alloc(u8 *daddr,
				      struct net_device *dev)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);
	struct ipoib_neigh_table *ntbl = &priv->ntbl;
	struct ipoib_neigh_hash *htbl;
	struct ipoib_neigh *neigh;
	u32 hash_val;

	htbl = rcu_dereference_protected(ntbl->htbl,
					 lockdep_is_held(&priv->lock));
	if (!htbl) {
		neigh = NULL;
		goto out_unlock;
	}

	/* need to add a new neigh, but maybe some other thread succeeded?
	 * recalc hash, maybe hash resize took place so we do a search
	 */
	hash_val = ipoib_addr_hash(htbl, daddr);
	for (neigh = rcu_dereference_protected(htbl->buckets[hash_val],
					       lockdep_is_held(&priv->lock));
	     neigh != NULL;
	     neigh = rcu_dereference_protected(neigh->hnext,
					       lockdep_is_held(&priv->lock))) {
		if (memcmp(daddr, neigh->daddr, INFINIBAND_ALEN) == 0) {
			/* found, take one ref on behalf of the caller */
			if (!atomic_inc_not_zero(&neigh->refcnt)) {
				/* deleted */
				neigh = NULL;
				break;
			}
			neigh->alive = jiffies;
			goto out_unlock;
		}
	}

	neigh = ipoib_neigh_ctor(daddr, dev);
	if (!neigh)
		goto out_unlock;

	/* one ref on behalf of the hash table */
	atomic_inc(&neigh->refcnt);
	neigh->alive = jiffies;
	/* put in hash */
	rcu_assign_pointer(neigh->hnext,
			   rcu_dereference_protected(htbl->buckets[hash_val],
						     lockdep_is_held(&priv->lock)));
	rcu_assign_pointer(htbl->buckets[hash_val], neigh);
	atomic_inc(&ntbl->entries);

out_unlock:

	return neigh;
}

1058
void ipoib_neigh_dtor(struct ipoib_neigh *neigh)
1059
{
1060 1061 1062
	/* neigh reference count was dropprd to zero */
	struct net_device *dev = neigh->dev;
	struct ipoib_dev_priv *priv = netdev_priv(dev);
1063
	struct sk_buff *skb;
1064 1065
	if (neigh->ah)
		ipoib_put_ah(neigh->ah);
1066
	while ((skb = __skb_dequeue(&neigh->queue))) {
1067
		++dev->stats.tx_dropped;
1068 1069
		dev_kfree_skb_any(skb);
	}
1070 1071
	if (ipoib_cm_get(neigh))
		ipoib_cm_destroy_tx(ipoib_cm_get(neigh));
1072 1073 1074 1075
	ipoib_dbg(netdev_priv(dev),
		  "neigh free for %06x %pI6\n",
		  IPOIB_QPN(neigh->daddr),
		  neigh->daddr + 4);
1076
	kfree(neigh);
1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088
	if (atomic_dec_and_test(&priv->ntbl.entries)) {
		if (test_bit(IPOIB_NEIGH_TBL_FLUSH, &priv->flags))
			complete(&priv->ntbl.flushed);
	}
}

/*
 * RCU callback queued when an entry is unlinked from the hash table.
 * Drops one reference on the entry; the TX path may still hold another,
 * so this does not necessarily destroy it.
 */
static void ipoib_neigh_reclaim(struct rcu_head *rp)
{
	ipoib_neigh_put(container_of(rp, struct ipoib_neigh, rcu));
}

1091
void ipoib_neigh_free(struct ipoib_neigh *neigh)
L
Linus Torvalds 已提交
1092
{
1093 1094 1095 1096 1097 1098 1099 1100 1101
	struct net_device *dev = neigh->dev;
	struct ipoib_dev_priv *priv = netdev_priv(dev);
	struct ipoib_neigh_table *ntbl = &priv->ntbl;
	struct ipoib_neigh_hash *htbl;
	struct ipoib_neigh __rcu **np;
	struct ipoib_neigh *n;
	u32 hash_val;

	htbl = rcu_dereference_protected(ntbl->htbl,
1102
					lockdep_is_held(&priv->lock));
1103
	if (!htbl)
1104
		return;
1105 1106 1107 1108

	hash_val = ipoib_addr_hash(htbl, neigh->daddr);
	np = &htbl->buckets[hash_val];
	for (n = rcu_dereference_protected(*np,
1109
					    lockdep_is_held(&priv->lock));
1110
	     n != NULL;
1111
	     n = rcu_dereference_protected(*np,
1112
					lockdep_is_held(&priv->lock))) {
1113 1114 1115 1116
		if (n == neigh) {
			/* found */
			rcu_assign_pointer(*np,
					   rcu_dereference_protected(neigh->hnext,
1117
								     lockdep_is_held(&priv->lock)));
1118 1119
			/* remove from parent list */
			list_del(&neigh->list);
1120
			call_rcu(&neigh->rcu, ipoib_neigh_reclaim);
1121
			return;
1122 1123 1124 1125 1126 1127 1128 1129 1130 1131
		} else {
			np = &n->hnext;
		}
	}
}

static int ipoib_neigh_hash_init(struct ipoib_dev_priv *priv)
{
	struct ipoib_neigh_table *ntbl = &priv->ntbl;
	struct ipoib_neigh_hash *htbl;
1132
	struct ipoib_neigh __rcu **buckets;
1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149
	u32 size;

	clear_bit(IPOIB_NEIGH_TBL_FLUSH, &priv->flags);
	ntbl->htbl = NULL;
	htbl = kzalloc(sizeof(*htbl), GFP_KERNEL);
	if (!htbl)
		return -ENOMEM;
	set_bit(IPOIB_STOP_NEIGH_GC, &priv->flags);
	size = roundup_pow_of_two(arp_tbl.gc_thresh3);
	buckets = kzalloc(size * sizeof(*buckets), GFP_KERNEL);
	if (!buckets) {
		kfree(htbl);
		return -ENOMEM;
	}
	htbl->size = size;
	htbl->mask = (size - 1);
	htbl->buckets = buckets;
1150
	RCU_INIT_POINTER(ntbl->htbl, htbl);
1151
	htbl->ntbl = ntbl;
1152 1153 1154 1155
	atomic_set(&ntbl->entries, 0);

	/* start garbage collection */
	clear_bit(IPOIB_STOP_NEIGH_GC, &priv->flags);
1156
	queue_delayed_work(priv->wq, &priv->neigh_reap_task,
1157
			   arp_tbl.gc_interval);
L
Linus Torvalds 已提交
1158 1159 1160 1161

	return 0;
}

1162 1163 1164 1165 1166 1167
static void neigh_hash_free_rcu(struct rcu_head *head)
{
	struct ipoib_neigh_hash *htbl = container_of(head,
						    struct ipoib_neigh_hash,
						    rcu);
	struct ipoib_neigh __rcu **buckets = htbl->buckets;
1168
	struct ipoib_neigh_table *ntbl = htbl->ntbl;
1169 1170 1171

	kfree(buckets);
	kfree(htbl);
1172
	complete(&ntbl->deleted);
1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183
}

/*
 * Remove every neighbour whose GID (bytes 4.. of its hardware address)
 * equals @gid, i.e. all neighbours attached to a given path or multicast
 * group.  Runs with priv->lock held, interrupts disabled; the removed
 * entries are released through call_rcu().
 */
void ipoib_del_neighs_by_gid(struct net_device *dev, u8 *gid)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);
	struct ipoib_neigh_table *ntbl = &priv->ntbl;
	struct ipoib_neigh_hash *htbl;
	unsigned long flags;
	int i;

	spin_lock_irqsave(&priv->lock, flags);

	htbl = rcu_dereference_protected(ntbl->htbl,
					 lockdep_is_held(&priv->lock));
	if (htbl) {
		for (i = 0; i < htbl->size; i++) {
			struct ipoib_neigh __rcu **link = &htbl->buckets[i];
			struct ipoib_neigh *cur;

			while ((cur = rcu_dereference_protected(*link,
								lockdep_is_held(&priv->lock))) != NULL) {
				if (memcmp(gid, cur->daddr + 4, sizeof (union ib_gid))) {
					/* belongs to another parent: keep it */
					link = &cur->hnext;
					continue;
				}

				/* unlink; *link now refers to the successor */
				rcu_assign_pointer(*link,
						   rcu_dereference_protected(cur->hnext,
									     lockdep_is_held(&priv->lock)));
				/* remove from parent list */
				list_del(&cur->list);
				call_rcu(&cur->rcu, ipoib_neigh_reclaim);
			}
		}
	}

	spin_unlock_irqrestore(&priv->lock, flags);
}

static void ipoib_flush_neighs(struct ipoib_dev_priv *priv)
{
	struct ipoib_neigh_table *ntbl = &priv->ntbl;
	struct ipoib_neigh_hash *htbl;
	unsigned long flags;
1221
	int i, wait_flushed = 0;
1222

1223
	init_completion(&priv->ntbl.flushed);
1224

1225
	spin_lock_irqsave(&priv->lock, flags);
1226 1227

	htbl = rcu_dereference_protected(ntbl->htbl,
1228
					lockdep_is_held(&priv->lock));
1229 1230 1231
	if (!htbl)
		goto out_unlock;

1232 1233 1234 1235
	wait_flushed = atomic_read(&priv->ntbl.entries);
	if (!wait_flushed)
		goto free_htbl;

1236 1237 1238 1239 1240
	for (i = 0; i < htbl->size; i++) {
		struct ipoib_neigh *neigh;
		struct ipoib_neigh __rcu **np = &htbl->buckets[i];

		while ((neigh = rcu_dereference_protected(*np,
1241
				       lockdep_is_held(&priv->lock))) != NULL) {
1242 1243
			rcu_assign_pointer(*np,
					   rcu_dereference_protected(neigh->hnext,
1244
								     lockdep_is_held(&priv->lock)));
1245 1246 1247 1248 1249 1250
			/* remove from path/mc list */
			list_del(&neigh->list);
			call_rcu(&neigh->rcu, ipoib_neigh_reclaim);
		}
	}

1251
free_htbl:
1252 1253 1254 1255
	rcu_assign_pointer(ntbl->htbl, NULL);
	call_rcu(&htbl->rcu, neigh_hash_free_rcu);

out_unlock:
1256
	spin_unlock_irqrestore(&priv->lock, flags);
1257 1258
	if (wait_flushed)
		wait_for_completion(&priv->ntbl.flushed);
1259 1260 1261 1262 1263 1264 1265 1266
}

static void ipoib_neigh_hash_uninit(struct net_device *dev)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);
	int stopped;

	ipoib_dbg(priv, "ipoib_neigh_hash_uninit\n");
1267
	init_completion(&priv->ntbl.deleted);
1268 1269 1270 1271 1272 1273 1274
	set_bit(IPOIB_NEIGH_TBL_FLUSH, &priv->flags);

	/* Stop GC if called at init fail need to cancel work */
	stopped = test_and_set_bit(IPOIB_STOP_NEIGH_GC, &priv->flags);
	if (!stopped)
		cancel_delayed_work(&priv->neigh_reap_task);

1275 1276 1277
	ipoib_flush_neighs(priv);

	wait_for_completion(&priv->ntbl.deleted);
1278 1279 1280
}


L
Linus Torvalds 已提交
1281 1282 1283 1284 1285
int ipoib_dev_init(struct net_device *dev, struct ib_device *ca, int port)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);

	/* Allocate RX/TX "rings" to hold queued skbs */
1286
	priv->rx_ring =	kzalloc(ipoib_recvq_size * sizeof *priv->rx_ring,
L
Linus Torvalds 已提交
1287 1288 1289
				GFP_KERNEL);
	if (!priv->rx_ring) {
		printk(KERN_WARNING "%s: failed to allocate RX ring (%d entries)\n",
1290
		       ca->name, ipoib_recvq_size);
1291
		goto out;
L
Linus Torvalds 已提交
1292 1293
	}

1294
	priv->tx_ring = vzalloc(ipoib_sendq_size * sizeof *priv->tx_ring);
L
Linus Torvalds 已提交
1295 1296
	if (!priv->tx_ring) {
		printk(KERN_WARNING "%s: failed to allocate TX ring (%d entries)\n",
1297
		       ca->name, ipoib_sendq_size);
L
Linus Torvalds 已提交
1298 1299 1300
		goto out_rx_ring_cleanup;
	}

1301
	/* priv->tx_head, tx_tail & tx_outstanding are already 0 */
L
Linus Torvalds 已提交
1302 1303 1304 1305

	if (ipoib_ib_dev_init(dev, ca, port))
		goto out_tx_ring_cleanup;

1306 1307 1308 1309 1310 1311 1312
	/*
	 * Must be after ipoib_ib_dev_init so we can allocate a per
	 * device wq there and use it here
	 */
	if (ipoib_neigh_hash_init(priv) < 0)
		goto out_dev_uninit;

L
Linus Torvalds 已提交
1313 1314
	return 0;

1315 1316 1317
out_dev_uninit:
	ipoib_ib_dev_cleanup(dev);

L
Linus Torvalds 已提交
1318
out_tx_ring_cleanup:
1319
	vfree(priv->tx_ring);
L
Linus Torvalds 已提交
1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330

out_rx_ring_cleanup:
	kfree(priv->rx_ring);

out:
	return -ENOMEM;
}

void ipoib_dev_cleanup(struct net_device *dev)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev), *cpriv, *tcpriv;
O
Or Gerlitz 已提交
1331 1332 1333
	LIST_HEAD(head);

	ASSERT_RTNL();
L
Linus Torvalds 已提交
1334

1335
	ipoib_delete_debug_files(dev);
L
Linus Torvalds 已提交
1336 1337 1338

	/* Delete any child interfaces first */
	list_for_each_entry_safe(cpriv, tcpriv, &priv->child_intfs, list) {
1339 1340 1341
		/* Stop GC on child */
		set_bit(IPOIB_STOP_NEIGH_GC, &cpriv->flags);
		cancel_delayed_work(&cpriv->neigh_reap_task);
O
Or Gerlitz 已提交
1342
		unregister_netdevice_queue(cpriv->dev, &head);
L
Linus Torvalds 已提交
1343
	}
O
Or Gerlitz 已提交
1344
	unregister_netdevice_many(&head);
L
Linus Torvalds 已提交
1345

1346 1347 1348 1349 1350 1351
	/*
	 * Must be before ipoib_ib_dev_cleanup or we delete an in use
	 * work queue
	 */
	ipoib_neigh_hash_uninit(dev);

L
Linus Torvalds 已提交
1352 1353
	ipoib_ib_dev_cleanup(dev);

1354
	kfree(priv->rx_ring);
1355
	vfree(priv->tx_ring);
L
Linus Torvalds 已提交
1356

1357 1358
	priv->rx_ring = NULL;
	priv->tx_ring = NULL;
L
Linus Torvalds 已提交
1359 1360
}

1361 1362 1363 1364
/* Link-layer header operations: only header creation is provided. */
static const struct header_ops ipoib_header_ops = {
	.create	= ipoib_hard_header,
};

1365
static const struct net_device_ops ipoib_netdev_ops = {
O
Or Gerlitz 已提交
1366
	.ndo_uninit		 = ipoib_uninit,
1367 1368 1369
	.ndo_open		 = ipoib_open,
	.ndo_stop		 = ipoib_stop,
	.ndo_change_mtu		 = ipoib_change_mtu,
1370
	.ndo_fix_features	 = ipoib_fix_features,
1371 1372
	.ndo_start_xmit	 	 = ipoib_start_xmit,
	.ndo_tx_timeout		 = ipoib_timeout,
1373
	.ndo_set_rx_mode	 = ipoib_set_mcast_list,
1374
	.ndo_get_iflink		 = ipoib_get_iflink,
1375 1376
};

O
Or Gerlitz 已提交
1377
void ipoib_setup(struct net_device *dev)
L
Linus Torvalds 已提交
1378 1379 1380
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);

1381
	dev->netdev_ops		 = &ipoib_netdev_ops;
1382
	dev->header_ops		 = &ipoib_header_ops;
1383

E
Eli Cohen 已提交
1384 1385
	ipoib_set_ethtool_ops(dev);

M
Michal Schmidt 已提交
1386
	netif_napi_add(dev, &priv->napi, ipoib_poll, NAPI_POLL_WEIGHT);
L
Linus Torvalds 已提交
1387

1388
	dev->watchdog_timeo	 = HZ;
L
Linus Torvalds 已提交
1389

1390
	dev->flags		|= IFF_BROADCAST | IFF_MULTICAST;
L
Linus Torvalds 已提交
1391

1392
	dev->hard_header_len	 = IPOIB_ENCAP_LEN;
1393 1394 1395
	dev->addr_len		 = INFINIBAND_ALEN;
	dev->type		 = ARPHRD_INFINIBAND;
	dev->tx_queue_len	 = ipoib_sendq_size * 2;
E
Eli Cohen 已提交
1396 1397
	dev->features		 = (NETIF_F_VLAN_CHALLENGED	|
				    NETIF_F_HIGHDMA);
1398
	netif_keep_dst(dev);
L
Linus Torvalds 已提交
1399 1400 1401 1402 1403 1404 1405

	memcpy(dev->broadcast, ipv4_bcast_addr, INFINIBAND_ALEN);

	priv->dev = dev;

	spin_lock_init(&priv->lock);

1406
	init_rwsem(&priv->vlan_rwsem);
L
Linus Torvalds 已提交
1407 1408 1409 1410 1411 1412

	INIT_LIST_HEAD(&priv->path_list);
	INIT_LIST_HEAD(&priv->child_intfs);
	INIT_LIST_HEAD(&priv->dead_ahs);
	INIT_LIST_HEAD(&priv->multicast_list);

D
David Howells 已提交
1413
	INIT_DELAYED_WORK(&priv->mcast_task,   ipoib_mcast_join_task);
1414
	INIT_WORK(&priv->carrier_on_task, ipoib_mcast_carrier_on_task);
1415 1416 1417
	INIT_WORK(&priv->flush_light,   ipoib_ib_dev_flush_light);
	INIT_WORK(&priv->flush_normal,   ipoib_ib_dev_flush_normal);
	INIT_WORK(&priv->flush_heavy,   ipoib_ib_dev_flush_heavy);
D
David Howells 已提交
1418 1419
	INIT_WORK(&priv->restart_task, ipoib_mcast_restart_task);
	INIT_DELAYED_WORK(&priv->ah_reap_task, ipoib_reap_ah);
1420
	INIT_DELAYED_WORK(&priv->neigh_reap_task, ipoib_reap_neigh);
L
Linus Torvalds 已提交
1421 1422 1423 1424 1425 1426
}

struct ipoib_dev_priv *ipoib_intf_alloc(const char *name)
{
	struct net_device *dev;

1427 1428
	dev = alloc_netdev((int)sizeof(struct ipoib_dev_priv), name,
			   NET_NAME_UNKNOWN, ipoib_setup);
L
Linus Torvalds 已提交
1429 1430 1431 1432 1433 1434
	if (!dev)
		return NULL;

	return netdev_priv(dev);
}

1435 1436
static ssize_t show_pkey(struct device *dev,
			 struct device_attribute *attr, char *buf)
L
Linus Torvalds 已提交
1437
{
1438
	struct ipoib_dev_priv *priv = netdev_priv(to_net_dev(dev));
L
Linus Torvalds 已提交
1439 1440 1441

	return sprintf(buf, "0x%04x\n", priv->pkey);
}
1442
static DEVICE_ATTR(pkey, S_IRUGO, show_pkey, NULL);
L
Linus Torvalds 已提交
1443

1444 1445 1446 1447 1448 1449 1450 1451
/* sysfs "umcast" read side: prints the state of IPOIB_FLAG_UMCAST. */
static ssize_t show_umcast(struct device *dev,
			   struct device_attribute *attr, char *buf)
{
	struct net_device *ndev = to_net_dev(dev);
	struct ipoib_dev_priv *priv = netdev_priv(ndev);

	return sprintf(buf, "%d\n", test_bit(IPOIB_FLAG_UMCAST, &priv->flags));
}

1452
void ipoib_set_umcast(struct net_device *ndev, int umcast_val)
1453
{
1454
	struct ipoib_dev_priv *priv = netdev_priv(ndev);
1455 1456 1457 1458 1459 1460 1461

	if (umcast_val > 0) {
		set_bit(IPOIB_FLAG_UMCAST, &priv->flags);
		ipoib_warn(priv, "ignoring multicast groups joined directly "
				"by userspace\n");
	} else
		clear_bit(IPOIB_FLAG_UMCAST, &priv->flags);
1462 1463 1464 1465 1466 1467 1468 1469 1470
}

/*
 * sysfs "umcast" write side: parses @buf with simple_strtoul() (base
 * auto-detected) and passes the value to ipoib_set_umcast().  Always
 * reports the whole write as consumed.
 */
static ssize_t set_umcast(struct device *dev,
			  struct device_attribute *attr,
			  const char *buf, size_t count)
{
	ipoib_set_umcast(to_net_dev(dev), simple_strtoul(buf, NULL, 0));

	return count;
}
static DEVICE_ATTR(umcast, S_IWUSR | S_IRUGO, show_umcast, set_umcast);

/*
 * Create the "umcast" sysfs attribute on @dev.
 * Returns the result of device_create_file().
 */
int ipoib_add_umcast_attr(struct net_device *dev)
{
	struct device *sysfs_dev = &dev->dev;

	return device_create_file(sysfs_dev, &dev_attr_umcast);
}

1481 1482
/*
 * sysfs "create_child" (write-only): parse a P_Key from @buf and add a
 * child interface for it via ipoib_vlan_add().
 *
 * Accepted values are 1..0xffff excluding 0x8000; anything else, or
 * unparsable input, yields -EINVAL.  Returns @count on success or the
 * error from ipoib_vlan_add().
 */
static ssize_t create_child(struct device *dev,
			    struct device_attribute *attr,
			    const char *buf, size_t count)
{
	int pkey;
	int err;

	if (sscanf(buf, "%i", &pkey) != 1)
		return -EINVAL;

	if (pkey <= 0 || pkey > 0xffff || pkey == 0x8000)
		return -EINVAL;

	/*
	 * Set the full membership bit, so that we join the right
	 * broadcast group, etc.
	 */
	err = ipoib_vlan_add(to_net_dev(dev), pkey | 0x8000);
	if (err)
		return err;

	return count;
}
static DEVICE_ATTR(create_child, S_IWUSR, NULL, create_child);
L
Linus Torvalds 已提交
1505

1506 1507
/*
 * sysfs "delete_child" (write-only): parse a P_Key from @buf and remove
 * the matching child interface via ipoib_vlan_delete().
 *
 * Values outside 0..0xffff, or unparsable input, yield -EINVAL.  Returns
 * @count on success or the error from ipoib_vlan_delete().
 */
static ssize_t delete_child(struct device *dev,
			    struct device_attribute *attr,
			    const char *buf, size_t count)
{
	int pkey;
	int err;

	if (sscanf(buf, "%i", &pkey) != 1)
		return -EINVAL;

	if (pkey < 0 || pkey > 0xffff)
		return -EINVAL;

	err = ipoib_vlan_delete(to_net_dev(dev), pkey);
	if (err)
		return err;

	return count;
}
static DEVICE_ATTR(delete_child, S_IWUSR, NULL, delete_child);
L
Linus Torvalds 已提交
1525 1526 1527

int ipoib_add_pkey_attr(struct net_device *dev)
{
1528
	return device_create_file(&dev->dev, &dev_attr_pkey);
L
Linus Torvalds 已提交
1529 1530
}

1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554
int ipoib_set_dev_features(struct ipoib_dev_priv *priv, struct ib_device *hca)
{
	struct ib_device_attr *device_attr;
	int result = -ENOMEM;

	device_attr = kmalloc(sizeof *device_attr, GFP_KERNEL);
	if (!device_attr) {
		printk(KERN_WARNING "%s: allocation of %zu bytes failed\n",
		       hca->name, sizeof *device_attr);
		return result;
	}

	result = ib_query_device(hca, device_attr);
	if (result) {
		printk(KERN_WARNING "%s: ib_query_device failed (ret = %d)\n",
		       hca->name, result);
		kfree(device_attr);
		return result;
	}
	priv->hca_caps = device_attr->device_cap_flags;

	kfree(device_attr);

	if (priv->hca_caps & IB_DEVICE_UD_IP_CSUM) {
1555 1556
		priv->dev->hw_features = NETIF_F_SG |
			NETIF_F_IP_CSUM | NETIF_F_RXCSUM;
1557

1558 1559
		if (priv->hca_caps & IB_DEVICE_UD_TSO)
			priv->dev->hw_features |= NETIF_F_TSO;
O
Or Gerlitz 已提交
1560

1561 1562
		priv->dev->features |= priv->dev->hw_features;
	}
1563 1564 1565 1566

	return 0;
}

L
Linus Torvalds 已提交
1567 1568 1569 1570
/*
 * Create, initialise and register the IPoIB net device for one port.
 *
 * @format: interface name format handed to ipoib_intf_alloc()
 * @hca:    IB device the port belongs to
 * @port:   port number on @hca
 *
 * Returns the registered net_device on success or an ERR_PTR() carrying
 * a negative errno on failure; it never returns NULL.  On failure
 * everything set up so far is torn down again.
 */
static struct net_device *ipoib_add_port(const char *format,
					 struct ib_device *hca, u8 port)
{
	struct ipoib_dev_priv *priv;
	struct ib_port_attr attr;
	int result = -ENOMEM;

	priv = ipoib_intf_alloc(format);
	if (!priv)
		goto alloc_mem_failed;

	SET_NETDEV_DEV(priv->dev, hca->dma_device);
	priv->dev->dev_id = port - 1;

	result = ib_query_port(hca, port, &attr);
	if (!result)
		priv->max_ib_mtu = ib_mtu_enum_to_int(attr.max_mtu);
	else {
		printk(KERN_WARNING "%s: ib_query_port %d failed\n",
		       hca->name, port);
		goto device_init_failed;
	}

	/* MTU will be reset when mcast join happens */
	priv->dev->mtu  = IPOIB_UD_MTU(priv->max_ib_mtu);
	priv->mcast_mtu  = priv->admin_mtu = priv->dev->mtu;

	priv->dev->neigh_priv_len = sizeof(struct ipoib_neigh);

	result = ib_query_pkey(hca, port, 0, &priv->pkey);
	if (result) {
		printk(KERN_WARNING "%s: ib_query_pkey port %d failed (ret = %d)\n",
		       hca->name, port, result);
		goto device_init_failed;
	}

	result = ipoib_set_dev_features(priv, hca);
	if (result)
		goto device_init_failed;

	/*
	 * Set the full membership bit, so that we join the right
	 * broadcast group, etc.
	 */
	priv->pkey |= 0x8000;

	priv->dev->broadcast[8] = priv->pkey >> 8;
	priv->dev->broadcast[9] = priv->pkey & 0xff;

	result = ib_query_gid(hca, port, 0, &priv->local_gid);
	if (result) {
		printk(KERN_WARNING "%s: ib_query_gid port %d failed (ret = %d)\n",
		       hca->name, port, result);
		goto device_init_failed;
	} else
		memcpy(priv->dev->dev_addr + 4, priv->local_gid.raw, sizeof (union ib_gid));

	result = ipoib_dev_init(priv->dev, hca, port);
	if (result < 0) {
		printk(KERN_WARNING "%s: failed to initialize port %d (ret = %d)\n",
		       hca->name, port, result);
		goto device_init_failed;
	}

	INIT_IB_EVENT_HANDLER(&priv->event_handler,
			      priv->ca, ipoib_event);
	result = ib_register_event_handler(&priv->event_handler);
	if (result < 0) {
		printk(KERN_WARNING "%s: ib_register_event_handler failed for "
		       "port %d (ret = %d)\n",
		       hca->name, port, result);
		goto event_failed;
	}

	result = register_netdev(priv->dev);
	if (result) {
		printk(KERN_WARNING "%s: couldn't register ipoib port %d; error %d\n",
		       hca->name, port, result);
		goto register_failed;
	}

	ipoib_create_debug_files(priv->dev);

	/*
	 * Keep the error code of each sysfs step in 'result'.  It is 0
	 * after a successful register_netdev(), and jumping to the error
	 * path with it unchanged would return ERR_PTR(0), i.e. NULL, which
	 * the caller's IS_ERR() check treats as a valid device.
	 */
	result = ipoib_cm_add_mode_attr(priv->dev);
	if (result)
		goto sysfs_failed;
	result = ipoib_add_pkey_attr(priv->dev);
	if (result)
		goto sysfs_failed;
	result = ipoib_add_umcast_attr(priv->dev);
	if (result)
		goto sysfs_failed;
	result = device_create_file(&priv->dev->dev, &dev_attr_create_child);
	if (result)
		goto sysfs_failed;
	result = device_create_file(&priv->dev->dev, &dev_attr_delete_child);
	if (result)
		goto sysfs_failed;

	return priv->dev;

sysfs_failed:
	ipoib_delete_debug_files(priv->dev);
	unregister_netdev(priv->dev);

register_failed:
	ib_unregister_event_handler(&priv->event_handler);
	flush_workqueue(ipoib_workqueue);
	/* Stop GC if started before flush */
	set_bit(IPOIB_STOP_NEIGH_GC, &priv->flags);
	cancel_delayed_work(&priv->neigh_reap_task);
	flush_workqueue(priv->wq);

event_failed:
	ipoib_dev_cleanup(priv->dev);

device_init_failed:
	free_netdev(priv->dev);

alloc_mem_failed:
	return ERR_PTR(result);
}

static void ipoib_add_one(struct ib_device *device)
{
	struct list_head *dev_list;
	struct net_device *dev;
	struct ipoib_dev_priv *priv;
1690
	int p;
M
Michael Wang 已提交
1691
	int count = 0;
T
Tom Tucker 已提交
1692

L
Linus Torvalds 已提交
1693 1694 1695 1696 1697 1698
	dev_list = kmalloc(sizeof *dev_list, GFP_KERNEL);
	if (!dev_list)
		return;

	INIT_LIST_HEAD(dev_list);

1699
	for (p = rdma_start_port(device); p <= rdma_end_port(device); ++p) {
M
Michael Wang 已提交
1700
		if (!rdma_protocol_ib(device, p))
E
Eli Cohen 已提交
1701
			continue;
L
Linus Torvalds 已提交
1702 1703 1704 1705
		dev = ipoib_add_port("ib%d", device, p);
		if (!IS_ERR(dev)) {
			priv = netdev_priv(dev);
			list_add_tail(&priv->list, dev_list);
M
Michael Wang 已提交
1706
			count++;
L
Linus Torvalds 已提交
1707 1708 1709
		}
	}

M
Michael Wang 已提交
1710 1711 1712 1713 1714
	if (!count) {
		kfree(dev_list);
		return;
	}

L
Linus Torvalds 已提交
1715 1716 1717
	ib_set_client_data(device, &ipoib_client, dev_list);
}

1718
static void ipoib_remove_one(struct ib_device *device, void *client_data)
L
Linus Torvalds 已提交
1719 1720
{
	struct ipoib_dev_priv *priv, *tmp;
1721
	struct list_head *dev_list = client_data;
L
Linus Torvalds 已提交
1722

1723 1724
	if (!dev_list)
		return;
L
Linus Torvalds 已提交
1725 1726 1727

	list_for_each_entry_safe(priv, tmp, dev_list, list) {
		ib_unregister_event_handler(&priv->event_handler);
1728
		flush_workqueue(ipoib_workqueue);
1729 1730 1731 1732 1733

		rtnl_lock();
		dev_change_flags(priv->dev, priv->dev->flags & ~IFF_UP);
		rtnl_unlock();

1734 1735 1736
		/* Stop GC */
		set_bit(IPOIB_STOP_NEIGH_GC, &priv->flags);
		cancel_delayed_work(&priv->neigh_reap_task);
1737
		flush_workqueue(priv->wq);
L
Linus Torvalds 已提交
1738 1739 1740 1741

		unregister_netdev(priv->dev);
		free_netdev(priv->dev);
	}
1742 1743

	kfree(dev_list);
L
Linus Torvalds 已提交
1744 1745 1746 1747 1748 1749
}

/*
 * Module entry point: normalise the queue-size parameters, then register
 * debugfs, the global flush workqueue, the SA client, the IB client and
 * the netlink ops, unwinding in reverse order on failure.
 *
 * Returns 0 on success or a negative errno.
 */
static int __init ipoib_init_module(void)
{
	int ret;

	/* queue sizes: power of two, clamped to the supported range */
	ipoib_recvq_size = roundup_pow_of_two(ipoib_recvq_size);
	ipoib_recvq_size = min(ipoib_recvq_size, IPOIB_MAX_QUEUE_SIZE);
	ipoib_recvq_size = max(ipoib_recvq_size, IPOIB_MIN_QUEUE_SIZE);

	ipoib_sendq_size = roundup_pow_of_two(ipoib_sendq_size);
	ipoib_sendq_size = min(ipoib_sendq_size, IPOIB_MAX_QUEUE_SIZE);
	ipoib_sendq_size = max3(ipoib_sendq_size, 2 * MAX_SEND_CQE, IPOIB_MIN_QUEUE_SIZE);
#ifdef CONFIG_INFINIBAND_IPOIB_CM
	ipoib_max_conn_qp = min(ipoib_max_conn_qp, IPOIB_CM_MAX_CONN_QP);
#endif

	/*
	 * When copying small received packets, we only copy from the
	 * linear data part of the SKB, so we rely on this condition.
	 */
	BUILD_BUG_ON(IPOIB_CM_COPYBREAK > IPOIB_CM_HEAD_SIZE);

	ret = ipoib_register_debugfs();
	if (ret)
		return ret;

	/*
	 * A single global workqueue serves all flush operations.
	 * Flushing a workqueue from a task running on that same
	 * workqueue deadlocks the system, and we need to be able to
	 * flush the tasks of one specific net device.  Each netdevice
	 * therefore gets a private workqueue for its own tasks, while
	 * only flush events are queued on the global flush workqueue.
	 * This avoids the deadlocks.
	 */
	ipoib_workqueue = create_singlethread_workqueue("ipoib_flush");
	if (!ipoib_workqueue) {
		ret = -ENOMEM;
		goto err_fs;
	}

	ib_sa_register_client(&ipoib_sa_client);

	ret = ib_register_client(&ipoib_client);
	if (ret)
		goto err_sa;

	ret = ipoib_netlink_init();
	if (ret)
		goto err_client;

	return 0;

err_client:
	ib_unregister_client(&ipoib_client);

err_sa:
	ib_sa_unregister_client(&ipoib_sa_client);
	destroy_workqueue(ipoib_workqueue);

err_fs:
	ipoib_unregister_debugfs();

	return ret;
}

/*
 * Module exit: unregister netlink ops, the IB and SA clients and debugfs,
 * then destroy the global flush workqueue.
 */
static void __exit ipoib_cleanup_module(void)
{
	ipoib_netlink_fini();

	ib_unregister_client(&ipoib_client);
	ib_sa_unregister_client(&ipoib_sa_client);

	ipoib_unregister_debugfs();

	destroy_workqueue(ipoib_workqueue);
}

/* Register the module's load and unload entry points. */
module_init(ipoib_init_module);
module_exit(ipoib_cleanup_module);