/*
 * Copyright (c) 2004 Topspin Communications.  All rights reserved.
 * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
 * Copyright (c) 2004 Voltaire, Inc. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "ipoib.h"

#include <linux/module.h>

#include <linux/init.h>
#include <linux/slab.h>
#include <linux/kernel.h>
#include <linux/vmalloc.h>

#include <linux/if_arp.h>	/* For ARPHRD_xxx */

#include <linux/ip.h>
#include <linux/in.h>

#include <linux/jhash.h>
#include <net/arp.h>
#include <net/addrconf.h>
#include <linux/inetdevice.h>
#include <rdma/ib_cache.h>
#include <linux/pci.h>

#define DRV_VERSION "1.0.0"

/* Driver version string; same value that MODULE_VERSION() publishes below. */
const char ipoib_driver_version[] = DRV_VERSION;

MODULE_AUTHOR("Roland Dreier");
MODULE_DESCRIPTION("IP-over-InfiniBand net driver");
MODULE_LICENSE("Dual BSD/GPL");
MODULE_VERSION(DRV_VERSION);

/*
 * Send/receive queue depths.  They default to the ring-size constants and
 * are exposed as read-only (0444) module parameters.
 */
int ipoib_sendq_size __read_mostly = IPOIB_TX_RING_SIZE;
int ipoib_recvq_size __read_mostly = IPOIB_RX_RING_SIZE;

module_param_named(send_queue_size, ipoib_sendq_size, int, 0444);
MODULE_PARM_DESC(send_queue_size, "Number of descriptors in send queue");
module_param_named(recv_queue_size, ipoib_recvq_size, int, 0444);
MODULE_PARM_DESC(recv_queue_size, "Number of descriptors in receive queue");

#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
/* Debug verbosity; writable at runtime (0644) when debug support is built. */
int ipoib_debug_level;

module_param_named(debug_level, ipoib_debug_level, int, 0644);
MODULE_PARM_DESC(debug_level, "Enable debug tracing if > 0");
#endif

struct ipoib_path_iter {
	struct net_device *dev;
	struct ipoib_path  path;
};

L
Linus Torvalds 已提交
85 86 87 88 89 90 91 92
static const u8 ipv4_bcast_addr[] = {
	0x00, 0xff, 0xff, 0xff,
	0xff, 0x12, 0x40, 0x1b,	0x00, 0x00, 0x00, 0x00,
	0x00, 0x00, 0x00, 0x00,	0xff, 0xff, 0xff, 0xff
};

struct workqueue_struct *ipoib_workqueue;

93 94
struct ib_sa_client ipoib_sa_client;

L
Linus Torvalds 已提交
95
static void ipoib_add_one(struct ib_device *device);
96
static void ipoib_remove_one(struct ib_device *device, void *client_data);
97
static void ipoib_neigh_reclaim(struct rcu_head *rp);
98 99 100 101
static struct net_device *ipoib_get_net_dev_by_params(
		struct ib_device *dev, u8 port, u16 pkey,
		const union ib_gid *gid, const struct sockaddr *addr,
		void *client_data);
102
static int ipoib_set_mac(struct net_device *dev, void *addr);
L
Linus Torvalds 已提交
103 104 105 106

static struct ib_client ipoib_client = {
	.name   = "ipoib",
	.add    = ipoib_add_one,
107 108
	.remove = ipoib_remove_one,
	.get_net_dev_by_params = ipoib_get_net_dev_by_params,
L
Linus Torvalds 已提交
109 110
};

111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137
#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
/*
 * Netdevice notifier callback that keeps the per-device debug files in step
 * with the device: created on register, re-created on rename, removed on
 * unregister.  Devices whose ndo_open is not ipoib_open are ignored.
 * Always returns NOTIFY_DONE.
 */
static int ipoib_netdev_event(struct notifier_block *this,
			      unsigned long event, void *ptr)
{
	struct netdev_notifier_info *info = ptr;
	struct net_device *ndev = info->dev;

	if (ndev->netdev_ops->ndo_open == ipoib_open) {
		/* A rename drops the old files first, then creates new ones. */
		if (event == NETDEV_CHANGENAME || event == NETDEV_UNREGISTER)
			ipoib_delete_debug_files(ndev);
		if (event == NETDEV_REGISTER || event == NETDEV_CHANGENAME)
			ipoib_create_debug_files(ndev);
	}

	return NOTIFY_DONE;
}
#endif

L
Linus Torvalds 已提交
138 139
int ipoib_open(struct net_device *dev)
{
140
	struct ipoib_dev_priv *priv = ipoib_priv(dev);
L
Linus Torvalds 已提交
141 142 143

	ipoib_dbg(priv, "bringing up interface\n");

144 145
	netif_carrier_off(dev);

146
	set_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags);
L
Linus Torvalds 已提交
147

148 149
	priv->sm_fullmember_sendonly_support = false;

150
	if (ipoib_ib_dev_open(dev)) {
151 152
		if (!test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags))
			return 0;
153
		goto err_disable;
154
	}
155

156
	ipoib_ib_dev_up(dev);
L
Linus Torvalds 已提交
157 158 159 160 161

	if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) {
		struct ipoib_dev_priv *cpriv;

		/* Bring up any child interfaces too */
162
		down_read(&priv->vlan_rwsem);
L
Linus Torvalds 已提交
163 164 165 166 167 168 169 170 171
		list_for_each_entry(cpriv, &priv->child_intfs, list) {
			int flags;

			flags = cpriv->dev->flags;
			if (flags & IFF_UP)
				continue;

			dev_change_flags(cpriv->dev, flags | IFF_UP);
		}
172
		up_read(&priv->vlan_rwsem);
L
Linus Torvalds 已提交
173 174 175 176 177
	}

	netif_start_queue(dev);

	return 0;
178 179 180 181 182

err_disable:
	clear_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags);

	return -EINVAL;
L
Linus Torvalds 已提交
183 184 185 186
}

static int ipoib_stop(struct net_device *dev)
{
187
	struct ipoib_dev_priv *priv = ipoib_priv(dev);
L
Linus Torvalds 已提交
188 189 190 191 192 193 194

	ipoib_dbg(priv, "stopping interface\n");

	clear_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags);

	netif_stop_queue(dev);

195
	ipoib_ib_dev_down(dev);
196
	ipoib_ib_dev_stop(dev);
L
Linus Torvalds 已提交
197 198 199 200 201

	if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) {
		struct ipoib_dev_priv *cpriv;

		/* Bring down any child interfaces too */
202
		down_read(&priv->vlan_rwsem);
L
Linus Torvalds 已提交
203 204 205 206 207 208 209 210 211
		list_for_each_entry(cpriv, &priv->child_intfs, list) {
			int flags;

			flags = cpriv->dev->flags;
			if (!(flags & IFF_UP))
				continue;

			dev_change_flags(cpriv->dev, flags & ~IFF_UP);
		}
212
		up_read(&priv->vlan_rwsem);
L
Linus Torvalds 已提交
213 214 215 216 217
	}

	return 0;
}

O
Or Gerlitz 已提交
218 219 220 221 222
/* Device uninit hook: delegates all teardown to ipoib_dev_cleanup(). */
static void ipoib_uninit(struct net_device *dev)
{
	ipoib_dev_cleanup(dev);
}

/*
 * Feature fix-up hook: when the ADMIN_CM flag is set, mask out
 * NETIF_F_IP_CSUM and NETIF_F_TSO from the requested feature set;
 * otherwise return the features unchanged.
 */
static netdev_features_t ipoib_fix_features(struct net_device *dev, netdev_features_t features)
{
	struct ipoib_dev_priv *priv = ipoib_priv(dev);

	if (test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags))
		features &= ~(NETIF_F_IP_CSUM | NETIF_F_TSO);

	return features;
}

static int ipoib_change_mtu(struct net_device *dev, int new_mtu)
{
235
	struct ipoib_dev_priv *priv = ipoib_priv(dev);
L
Linus Torvalds 已提交
236

237
	/* dev->mtu > 2K ==> connected mode */
238 239 240 241
	if (ipoib_cm_admin_enabled(dev)) {
		if (new_mtu > ipoib_cm_max_mtu(dev))
			return -EINVAL;

242 243 244
		if (new_mtu > priv->mcast_mtu)
			ipoib_warn(priv, "mtu > %d will cause multicast packet drops.\n",
				   priv->mcast_mtu);
245

246 247 248 249
		dev->mtu = new_mtu;
		return 0;
	}

250
	if (new_mtu > IPOIB_UD_MTU(priv->max_ib_mtu))
L
Linus Torvalds 已提交
251 252 253 254
		return -EINVAL;

	priv->admin_mtu = new_mtu;

255 256 257 258
	if (priv->mcast_mtu < priv->admin_mtu)
		ipoib_dbg(priv, "MTU must be smaller than the underlying "
				"link layer MTU - 4 (%u)\n", priv->mcast_mtu);

L
Linus Torvalds 已提交
259 260 261 262 263
	dev->mtu = min(priv->mcast_mtu, priv->admin_mtu);

	return 0;
}

264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321
/* Called with an RCU read lock taken */
static bool ipoib_is_dev_match_addr_rcu(const struct sockaddr *addr,
					struct net_device *dev)
{
	struct net *net = dev_net(dev);
	struct in_device *in_dev;
	struct sockaddr_in *addr_in = (struct sockaddr_in *)addr;
	struct sockaddr_in6 *addr_in6 = (struct sockaddr_in6 *)addr;
	__be32 ret_addr;

	switch (addr->sa_family) {
	case AF_INET:
		in_dev = in_dev_get(dev);
		if (!in_dev)
			return false;

		ret_addr = inet_confirm_addr(net, in_dev, 0,
					     addr_in->sin_addr.s_addr,
					     RT_SCOPE_HOST);
		in_dev_put(in_dev);
		if (ret_addr)
			return true;

		break;
	case AF_INET6:
		if (IS_ENABLED(CONFIG_IPV6) &&
		    ipv6_chk_addr(net, &addr_in6->sin6_addr, dev, 1))
			return true;

		break;
	}
	return false;
}

/**
 * Resolve the master net_device stacked on top of an IPoIB net_device.
 * @dev: base IPoIB net_device
 *
 * Returns the master upper device when one exists, otherwise @dev itself.
 * In both cases the returned device has had a reference taken with
 * dev_hold().
 */
static struct net_device *ipoib_get_master_net_dev(struct net_device *dev)
{
	struct net_device *chosen;

	rcu_read_lock();
	chosen = netdev_master_upper_dev_get_rcu(dev);
	/* No master: fall back to the base device. */
	if (!chosen)
		chosen = dev;
	dev_hold(chosen);
	rcu_read_unlock();

	return chosen;
}

/* Context handed to ipoib_upper_walk() while walking upper devices. */
struct ipoib_walk_data {
	const struct sockaddr *addr;	/* address being searched for */
	struct net_device *result;	/* matching device, reference held */
};

/*
 * Per-device callback for netdev_walk_all_upper_dev_rcu().
 *
 * When @upper owns the address in the walk context, take a reference on it,
 * store it as the result and return 1; otherwise return 0.
 * NOTE(review): a non-zero return presumably stops the walk - confirm
 * against the walker's documentation.
 */
static int ipoib_upper_walk(struct net_device *upper, void *_data)
{
	struct ipoib_walk_data *walk = _data;

	if (!ipoib_is_dev_match_addr_rcu(walk->addr, upper))
		return 0;

	dev_hold(upper);
	walk->result = upper;
	return 1;
}

341 342 343 344 345 346 347 348 349 350 351 352
/**
 * Find a net_device matching the given address, which is an upper device of
 * the given net_device.
 * @addr: IP address to look for.
 * @dev: base IPoIB net_device
 *
 * If found, returns the net_device with a reference held. Otherwise return
 * NULL.
 */
static struct net_device *ipoib_get_net_dev_match_addr(
		const struct sockaddr *addr, struct net_device *dev)
{
D
David Ahern 已提交
353 354 355
	struct ipoib_walk_data data = {
		.addr = addr,
	};
356 357 358 359

	rcu_read_lock();
	if (ipoib_is_dev_match_addr_rcu(addr, dev)) {
		dev_hold(dev);
D
David Ahern 已提交
360
		data.result = dev;
361 362 363
		goto out;
	}

D
David Ahern 已提交
364
	netdev_walk_all_upper_dev_rcu(dev, ipoib_upper_walk, &data);
365 366
out:
	rcu_read_unlock();
D
David Ahern 已提交
367
	return data.result;
368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495
}

/*
 * Count the IPoIB netdevs at or below @priv that match @pkey_index, @gid
 * (when non-NULL) and @addr (when non-NULL).
 *
 * @nesting is the lockdep nesting level used for this device's vlan_rwsem;
 * each recursion into a child interface uses @nesting + 1.
 *
 * @found_net_dev: receives the first matching net_device, with a reference
 * held, when the return value is >= 1.  References taken on any further
 * matches are dropped again.  The child scan stops as soon as more than one
 * match has been counted.
 */
static int ipoib_match_gid_pkey_addr(struct ipoib_dev_priv *priv,
				     const union ib_gid *gid,
				     u16 pkey_index,
				     const struct sockaddr *addr,
				     int nesting,
				     struct net_device **found_net_dev)
{
	struct ipoib_dev_priv *child;
	struct net_device *candidate = NULL;
	int count = 0;
	bool l2_match;

	l2_match = priv->pkey_index == pkey_index &&
		   (!gid || memcmp(gid, &priv->local_gid, sizeof(*gid)) == 0);

	if (l2_match) {
		/*
		 * With an address, verify the net_device matches it, as
		 * IPoIB child devices currently share a GID.
		 */
		if (addr)
			candidate = ipoib_get_net_dev_match_addr(addr,
								 priv->dev);
		else
			candidate = ipoib_get_master_net_dev(priv->dev);

		if (candidate) {
			if (*found_net_dev)
				dev_put(candidate);
			else
				*found_net_dev = candidate;
			count++;
		}
	}

	/* Recurse into the child interfaces. */
	down_read_nested(&priv->vlan_rwsem, nesting);
	list_for_each_entry(child, &priv->child_intfs, list) {
		count += ipoib_match_gid_pkey_addr(child, gid, pkey_index,
						   addr, nesting + 1,
						   found_net_dev);
		if (count > 1)
			break;
	}
	up_read(&priv->vlan_rwsem);

	return count;
}

/*
 * Scan @dev_list for devices on @port matching the P_Key index, GID and
 * address.  Returns the number of matches; the scan stops once that number
 * exceeds one.  *@net_dev is reset to NULL first and, when at least one
 * match exists, is left pointing at a matching net_device with a reference
 * held.
 */
static int __ipoib_get_net_dev_by_params(struct list_head *dev_list, u8 port,
					 u16 pkey_index,
					 const union ib_gid *gid,
					 const struct sockaddr *addr,
					 struct net_device **net_dev)
{
	struct ipoib_dev_priv *entry;
	int total = 0;

	*net_dev = NULL;

	list_for_each_entry(entry, dev_list, list) {
		if (entry->port == port) {
			total += ipoib_match_gid_pkey_addr(entry, gid,
							   pkey_index, addr,
							   0, net_dev);
			if (total > 1)
				break;
		}
	}

	return total;
}

/*
 * ib_client lookup callback: map (device, port, P_Key, GID, address) to a
 * net_device.
 *
 * A lookup on the L2 parameters alone is tried first.  Only when that yields
 * more than one candidate is the L3 address used to disambiguate; if that
 * second lookup is still ambiguous a rate-limited warning is logged and the
 * first match is returned.
 *
 * Returns the net_device with a reference held, or NULL when the port is not
 * an IB port, the P_Key is not in the cache, there is no client data, or
 * nothing matches.
 */
static struct net_device *ipoib_get_net_dev_by_params(
		struct ib_device *dev, u8 port, u16 pkey,
		const union ib_gid *gid, const struct sockaddr *addr,
		void *client_data)
{
	struct list_head *dev_list = client_data;
	struct net_device *net_dev;
	u16 pkey_index;
	int found;

	if (!rdma_protocol_ib(dev, port))
		return NULL;

	if (ib_find_cached_pkey(dev, port, pkey, &pkey_index))
		return NULL;

	if (!dev_list)
		return NULL;

	/* See if we can find a unique device matching the L2 parameters */
	found = __ipoib_get_net_dev_by_params(dev_list, port, pkey_index,
					      gid, NULL, &net_dev);
	if (found == 0)
		return NULL;
	if (found == 1)
		return net_dev;

	/* Ambiguous: drop the reference taken by the first lookup. */
	dev_put(net_dev);

	/* Couldn't find a unique device with L2 parameters only. Use L3
	 * address to uniquely match the net device */
	found = __ipoib_get_net_dev_by_params(dev_list, port, pkey_index,
					      gid, addr, &net_dev);
	if (found == 0)
		return NULL;

	if (found != 1)
		dev_warn_ratelimited(&dev->dev,
				     "duplicate IP address detected\n");

	return net_dev;
}

496 497
int ipoib_set_mode(struct net_device *dev, const char *buf)
{
498
	struct ipoib_dev_priv *priv = ipoib_priv(dev);
499

500 501 502 503 504 505 506
	if ((test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags) &&
	     !strcmp(buf, "connected\n")) ||
	     (!test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags) &&
	     !strcmp(buf, "datagram\n"))) {
		return 0;
	}

507 508 509 510 511 512
	/* flush paths if we switch modes so that connections are restarted */
	if (IPOIB_CM_SUPPORTED(dev->dev_addr) && !strcmp(buf, "connected\n")) {
		set_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags);
		ipoib_warn(priv, "enabling connected mode "
			   "will cause multicast packet drops\n");
		netdev_update_features(dev);
513
		dev_set_mtu(dev, ipoib_cm_max_mtu(dev));
514
		rtnl_unlock();
C
Christoph Hellwig 已提交
515
		priv->tx_wr.wr.send_flags &= ~IB_SEND_IP_CSUM;
516 517

		ipoib_flush_paths(dev);
518
		return (!rtnl_trylock()) ? -EBUSY : 0;
519 520 521 522 523 524 525 526
	}

	if (!strcmp(buf, "datagram\n")) {
		clear_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags);
		netdev_update_features(dev);
		dev_set_mtu(dev, min(priv->mcast_mtu, dev->mtu));
		rtnl_unlock();
		ipoib_flush_paths(dev);
527
		return (!rtnl_trylock()) ? -EBUSY : 0;
528 529 530 531 532
	}

	return -EINVAL;
}

533
struct ipoib_path *__path_find(struct net_device *dev, void *gid)
L
Linus Torvalds 已提交
534
{
535
	struct ipoib_dev_priv *priv = ipoib_priv(dev);
L
Linus Torvalds 已提交
536 537 538 539 540 541 542
	struct rb_node *n = priv->path_tree.rb_node;
	struct ipoib_path *path;
	int ret;

	while (n) {
		path = rb_entry(n, struct ipoib_path, rb_node);

543
		ret = memcmp(gid, path->pathrec.dgid.raw,
L
Linus Torvalds 已提交
544 545 546 547 548 549 550 551 552 553 554 555 556 557 558
			     sizeof (union ib_gid));

		if (ret < 0)
			n = n->rb_left;
		else if (ret > 0)
			n = n->rb_right;
		else
			return path;
	}

	return NULL;
}

static int __path_add(struct net_device *dev, struct ipoib_path *path)
{
559
	struct ipoib_dev_priv *priv = ipoib_priv(dev);
L
Linus Torvalds 已提交
560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593
	struct rb_node **n = &priv->path_tree.rb_node;
	struct rb_node *pn = NULL;
	struct ipoib_path *tpath;
	int ret;

	while (*n) {
		pn = *n;
		tpath = rb_entry(pn, struct ipoib_path, rb_node);

		ret = memcmp(path->pathrec.dgid.raw, tpath->pathrec.dgid.raw,
			     sizeof (union ib_gid));
		if (ret < 0)
			n = &pn->rb_left;
		else if (ret > 0)
			n = &pn->rb_right;
		else
			return -EEXIST;
	}

	rb_link_node(&path->rb_node, pn, n);
	rb_insert_color(&path->rb_node, &priv->path_tree);

	list_add_tail(&path->list, &priv->path_list);

	return 0;
}

/*
 * Release a path: free every skb still queued on it, delete all neighbours
 * that refer to its destination GID, drop the path's address-handle
 * reference if it has one, and free the path itself.
 */
static void path_free(struct net_device *dev, struct ipoib_path *path)
{
	struct sk_buff *skb;

	while ((skb = __skb_dequeue(&path->queue)))
		dev_kfree_skb_irq(skb);

	ipoib_dbg(ipoib_priv(dev), "path_free\n");

	/* remove all neigh connected to this path */
	ipoib_del_neighs_by_gid(dev, path->pathrec.dgid.raw);

	if (path->ah)
		ipoib_put_ah(path->ah);

	kfree(path);
}

#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG

/*
 * Allocate a path iterator for @dev and position it on the first path.
 *
 * The iterator's GID starts out as all zeroes and is then advanced once with
 * ipoib_path_iter_next().  Returns the iterator, or NULL when the allocation
 * fails or that first advance finds no path (the iterator is freed in that
 * case).
 */
struct ipoib_path_iter *ipoib_path_iter_init(struct net_device *dev)
{
	struct ipoib_path_iter *it = kmalloc(sizeof(*it), GFP_KERNEL);

	if (!it)
		return NULL;

	it->dev = dev;
	memset(it->path.pathrec.dgid.raw, 0, 16);

	if (!ipoib_path_iter_next(it))
		return it;

	kfree(it);
	return NULL;
}

int ipoib_path_iter_next(struct ipoib_path_iter *iter)
{
628
	struct ipoib_dev_priv *priv = ipoib_priv(iter->dev);
629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662
	struct rb_node *n;
	struct ipoib_path *path;
	int ret = 1;

	spin_lock_irq(&priv->lock);

	n = rb_first(&priv->path_tree);

	while (n) {
		path = rb_entry(n, struct ipoib_path, rb_node);

		if (memcmp(iter->path.pathrec.dgid.raw, path->pathrec.dgid.raw,
			   sizeof (union ib_gid)) < 0) {
			iter->path = *path;
			ret = 0;
			break;
		}

		n = rb_next(n);
	}

	spin_unlock_irq(&priv->lock);

	return ret;
}

/* Copy the iterator's current path snapshot into the caller's *@path. */
void ipoib_path_iter_read(struct ipoib_path_iter *iter,
			  struct ipoib_path *path)
{
	*path = iter->path;
}

#endif /* CONFIG_INFINIBAND_IPOIB_DEBUG */

663 664
void ipoib_mark_paths_invalid(struct net_device *dev)
{
665
	struct ipoib_dev_priv *priv = ipoib_priv(dev);
666 667 668 669 670
	struct ipoib_path *path, *tp;

	spin_lock_irq(&priv->lock);

	list_for_each_entry_safe(path, tp, &priv->path_list, list) {
H
Harvey Harrison 已提交
671
		ipoib_dbg(priv, "mark path LID 0x%04x GID %pI6 invalid\n",
672
			be16_to_cpu(path->pathrec.dlid),
673
			path->pathrec.dgid.raw);
674 675 676 677 678 679
		path->valid =  0;
	}

	spin_unlock_irq(&priv->lock);
}

680 681 682 683 684 685 686 687
/*
 * Prepend an ipoib_pseudo_header to @skb and fill its hwaddr field with
 * INFINIBAND_ALEN bytes copied from @daddr.
 */
static void push_pseudo_header(struct sk_buff *skb, const char *daddr)
{
	struct ipoib_pseudo_header *hdr =
		(struct ipoib_pseudo_header *)skb_push(skb, sizeof(*hdr));

	memcpy(hdr->hwaddr, daddr, INFINIBAND_ALEN);
}

L
Linus Torvalds 已提交
688 689
void ipoib_flush_paths(struct net_device *dev)
{
690
	struct ipoib_dev_priv *priv = ipoib_priv(dev);
L
Linus Torvalds 已提交
691 692
	struct ipoib_path *path, *tp;
	LIST_HEAD(remove_list);
693
	unsigned long flags;
L
Linus Torvalds 已提交
694

695 696
	netif_tx_lock_bh(dev);
	spin_lock_irqsave(&priv->lock, flags);
L
Linus Torvalds 已提交
697

698
	list_splice_init(&priv->path_list, &remove_list);
L
Linus Torvalds 已提交
699 700 701 702 703 704 705

	list_for_each_entry(path, &remove_list, list)
		rb_erase(&path->rb_node, &priv->path_tree);

	list_for_each_entry_safe(path, tp, &remove_list, list) {
		if (path->query)
			ib_sa_cancel_query(path->query_id, path->query);
706 707
		spin_unlock_irqrestore(&priv->lock, flags);
		netif_tx_unlock_bh(dev);
L
Linus Torvalds 已提交
708 709
		wait_for_completion(&path->done);
		path_free(dev, path);
710 711
		netif_tx_lock_bh(dev);
		spin_lock_irqsave(&priv->lock, flags);
L
Linus Torvalds 已提交
712
	}
713 714 715

	spin_unlock_irqrestore(&priv->lock, flags);
	netif_tx_unlock_bh(dev);
L
Linus Torvalds 已提交
716 717 718
}

static void path_rec_completion(int status,
719
				struct sa_path_rec *pathrec,
L
Linus Torvalds 已提交
720 721 722 723
				void *path_ptr)
{
	struct ipoib_path *path = path_ptr;
	struct net_device *dev = path->dev;
724
	struct ipoib_dev_priv *priv = ipoib_priv(dev);
L
Linus Torvalds 已提交
725
	struct ipoib_ah *ah = NULL;
726
	struct ipoib_ah *old_ah = NULL;
727
	struct ipoib_neigh *neigh, *tn;
L
Linus Torvalds 已提交
728 729 730 731
	struct sk_buff_head skqueue;
	struct sk_buff *skb;
	unsigned long flags;

732
	if (!status)
H
Harvey Harrison 已提交
733
		ipoib_dbg(priv, "PathRec LID 0x%04x for GID %pI6\n",
734
			  be16_to_cpu(pathrec->dlid), pathrec->dgid.raw);
L
Linus Torvalds 已提交
735
	else
H
Harvey Harrison 已提交
736
		ipoib_dbg(priv, "PathRec status %d for GID %pI6\n",
737
			  status, path->pathrec.dgid.raw);
L
Linus Torvalds 已提交
738 739 740 741

	skb_queue_head_init(&skqueue);

	if (!status) {
742
		struct rdma_ah_attr av;
743 744 745

		if (!ib_init_ah_from_path(priv->ca, priv->port, pathrec, &av))
			ah = ipoib_create_ah(dev, priv->pd, &av);
L
Linus Torvalds 已提交
746 747 748 749
	}

	spin_lock_irqsave(&priv->lock, flags);

750
	if (!IS_ERR_OR_NULL(ah)) {
L
Linus Torvalds 已提交
751 752
		path->pathrec = *pathrec;

753 754 755
		old_ah   = path->ah;
		path->ah = ah;

L
Linus Torvalds 已提交
756 757 758 759 760 761
		ipoib_dbg(priv, "created address handle %p for LID 0x%04x, SL %d\n",
			  ah, be16_to_cpu(pathrec->dlid), pathrec->sl);

		while ((skb = __skb_dequeue(&path->queue)))
			__skb_queue_tail(&skqueue, skb);

762
		list_for_each_entry_safe(neigh, tn, &path->neigh_list, list) {
763 764 765 766 767 768 769 770 771 772 773
			if (neigh->ah) {
				WARN_ON(neigh->ah != old_ah);
				/*
				 * Dropping the ah reference inside
				 * priv->lock is safe here, because we
				 * will hold one more reference from
				 * the original value of path->ah (ie
				 * old_ah).
				 */
				ipoib_put_ah(neigh->ah);
			}
L
Linus Torvalds 已提交
774 775 776
			kref_get(&path->ah->ref);
			neigh->ah = path->ah;

777
			if (ipoib_cm_enabled(dev, neigh->daddr)) {
778 779 780 781 782
				if (!ipoib_cm_get(neigh))
					ipoib_cm_set(neigh, ipoib_cm_create_tx(dev,
									       path,
									       neigh));
				if (!ipoib_cm_get(neigh)) {
783
					ipoib_neigh_free(neigh);
784 785 786 787
					continue;
				}
			}

L
Linus Torvalds 已提交
788 789 790
			while ((skb = __skb_dequeue(&neigh->queue)))
				__skb_queue_tail(&skqueue, skb);
		}
791
		path->valid = 1;
792
	}
L
Linus Torvalds 已提交
793

794
	path->query = NULL;
L
Linus Torvalds 已提交
795 796 797 798
	complete(&path->done);

	spin_unlock_irqrestore(&priv->lock, flags);

799 800 801
	if (IS_ERR_OR_NULL(ah))
		ipoib_del_neighs_by_gid(dev, path->pathrec.dgid.raw);

802 803 804
	if (old_ah)
		ipoib_put_ah(old_ah);

L
Linus Torvalds 已提交
805
	while ((skb = __skb_dequeue(&skqueue))) {
806
		int ret;
L
Linus Torvalds 已提交
807
		skb->dev = dev;
808 809 810 811
		ret = dev_queue_xmit(skb);
		if (ret)
			ipoib_warn(priv, "%s: dev_queue_xmit failed to re-queue packet, ret:%d\n",
				   __func__, ret);
L
Linus Torvalds 已提交
812 813 814
	}
}

815
static struct ipoib_path *path_rec_create(struct net_device *dev, void *gid)
L
Linus Torvalds 已提交
816
{
817
	struct ipoib_dev_priv *priv = ipoib_priv(dev);
L
Linus Torvalds 已提交
818 819
	struct ipoib_path *path;

820 821 822
	if (!priv->broadcast)
		return NULL;

823
	path = kzalloc(sizeof *path, GFP_ATOMIC);
L
Linus Torvalds 已提交
824 825 826
	if (!path)
		return NULL;

827
	path->dev = dev;
L
Linus Torvalds 已提交
828 829 830 831 832

	skb_queue_head_init(&path->queue);

	INIT_LIST_HEAD(&path->neigh_list);

833
	memcpy(path->pathrec.dgid.raw, gid, sizeof (union ib_gid));
834 835
	path->pathrec.sgid	    = priv->local_gid;
	path->pathrec.pkey	    = cpu_to_be16(priv->pkey);
836 837
	path->pathrec.numb_path     = 1;
	path->pathrec.traffic_class = priv->broadcast->mcmember.traffic_class;
L
Linus Torvalds 已提交
838 839 840 841 842 843 844

	return path;
}

static int path_rec_start(struct net_device *dev,
			  struct ipoib_path *path)
{
845
	struct ipoib_dev_priv *priv = ipoib_priv(dev);
L
Linus Torvalds 已提交
846

H
Harvey Harrison 已提交
847
	ipoib_dbg(priv, "Start path record lookup for %pI6\n",
848
		  path->pathrec.dgid.raw);
L
Linus Torvalds 已提交
849

850 851
	init_completion(&path->done);

L
Linus Torvalds 已提交
852
	path->query_id =
853
		ib_sa_path_rec_get(&ipoib_sa_client, priv->ca, priv->port,
L
Linus Torvalds 已提交
854 855 856 857
				   &path->pathrec,
				   IB_SA_PATH_REC_DGID		|
				   IB_SA_PATH_REC_SGID		|
				   IB_SA_PATH_REC_NUMB_PATH	|
858
				   IB_SA_PATH_REC_TRAFFIC_CLASS |
L
Linus Torvalds 已提交
859 860 861 862 863
				   IB_SA_PATH_REC_PKEY,
				   1000, GFP_ATOMIC,
				   path_rec_completion,
				   path, &path->query);
	if (path->query_id < 0) {
864
		ipoib_warn(priv, "ib_sa_path_rec_get failed: %d\n", path->query_id);
L
Linus Torvalds 已提交
865
		path->query = NULL;
866
		complete(&path->done);
L
Linus Torvalds 已提交
867 868 869 870 871 872
		return path->query_id;
	}

	return 0;
}

873 874
/*
 * Create a neighbour for @daddr, attach it to the path for its GID and
 * dispose of @skb.
 *
 * The GID is taken from @daddr + 4.  Under priv->lock a neighbour is
 * allocated and the path is looked up, or created and added if missing.
 * Then:
 *  - path has an address handle, connected mode applies to the neighbour:
 *    a CM TX context is created if needed and the skb is queued on the
 *    neighbour (dropped if the queue already holds
 *    IPOIB_MAX_PATH_REC_QUEUE packets);
 *  - path has an address handle, datagram mode: the lock is released and
 *    the skb is sent immediately through rn->send();
 *  - path has no address handle yet: a path query is started if none is
 *    outstanding and the skb is queued on the neighbour, or dropped when
 *    the queue is full.
 *
 * Every drop increments dev->stats.tx_dropped and frees the skb.  The
 * function consumes @skb on all paths.
 */
static void neigh_add_path(struct sk_buff *skb, u8 *daddr,
			   struct net_device *dev)
{
	struct ipoib_dev_priv *priv = ipoib_priv(dev);
	struct rdma_netdev *rn = netdev_priv(dev);
	struct ipoib_path *path;
	struct ipoib_neigh *neigh;
	unsigned long flags;

	spin_lock_irqsave(&priv->lock, flags);
	neigh = ipoib_neigh_alloc(daddr, dev);
	if (!neigh) {
		spin_unlock_irqrestore(&priv->lock, flags);
		++dev->stats.tx_dropped;
		dev_kfree_skb_any(skb);
		return;
	}

	path = __path_find(dev, daddr + 4);
	if (!path) {
		path = path_rec_create(dev, daddr + 4);
		if (!path)
			goto err_path;

		__path_add(dev, path);
	}

	list_add_tail(&neigh->list, &path->neigh_list);

	if (path->ah) {
		kref_get(&path->ah->ref);
		neigh->ah = path->ah;

		if (ipoib_cm_enabled(dev, neigh->daddr)) {
			if (!ipoib_cm_get(neigh))
				ipoib_cm_set(neigh, ipoib_cm_create_tx(dev, path, neigh));
			if (!ipoib_cm_get(neigh)) {
				ipoib_neigh_free(neigh);
				goto err_drop;
			}
			if (skb_queue_len(&neigh->queue) <
			    IPOIB_MAX_PATH_REC_QUEUE) {
				push_pseudo_header(skb, neigh->daddr);
				__skb_queue_tail(&neigh->queue, skb);
			} else {
				ipoib_warn(priv, "queue length limit %d. Packet drop.\n",
					   skb_queue_len(&neigh->queue));
				goto err_drop;
			}
		} else {
			/* Datagram mode: send right away, outside the lock. */
			spin_unlock_irqrestore(&priv->lock, flags);
			path->ah->last_send = rn->send(dev, skb, path->ah->ah,
						       IPOIB_QPN(daddr));
			ipoib_neigh_put(neigh);
			return;
		}
	} else {
		neigh->ah  = NULL;

		if (!path->query && path_rec_start(dev, path))
			goto err_path;
		if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE) {
			push_pseudo_header(skb, neigh->daddr);
			__skb_queue_tail(&neigh->queue, skb);
		} else {
			goto err_drop;
		}
	}

	spin_unlock_irqrestore(&priv->lock, flags);
	ipoib_neigh_put(neigh);
	return;

err_path:
	ipoib_neigh_free(neigh);
err_drop:
	++dev->stats.tx_dropped;
	dev_kfree_skb_any(skb);

	spin_unlock_irqrestore(&priv->lock, flags);
	ipoib_neigh_put(neigh);
}

static void unicast_arp_send(struct sk_buff *skb, struct net_device *dev,
957
			     struct ipoib_pseudo_header *phdr)
L
Linus Torvalds 已提交
958
{
959
	struct ipoib_dev_priv *priv = ipoib_priv(dev);
960
	struct rdma_netdev *rn = netdev_priv(dev);
L
Linus Torvalds 已提交
961
	struct ipoib_path *path;
962
	unsigned long flags;
L
Linus Torvalds 已提交
963

964
	spin_lock_irqsave(&priv->lock, flags);
L
Linus Torvalds 已提交
965

966
	path = __path_find(dev, phdr->hwaddr + 4);
967
	if (!path || !path->valid) {
968 969 970
		int new_path = 0;

		if (!path) {
971
			path = path_rec_create(dev, phdr->hwaddr + 4);
972 973
			new_path = 1;
		}
L
Linus Torvalds 已提交
974
		if (path) {
975
			if (skb_queue_len(&path->queue) < IPOIB_MAX_PATH_REC_QUEUE) {
976
				push_pseudo_header(skb, phdr->hwaddr);
977 978 979 980 981
				__skb_queue_tail(&path->queue, skb);
			} else {
				++dev->stats.tx_dropped;
				dev_kfree_skb_any(skb);
			}
L
Linus Torvalds 已提交
982

983
			if (!path->query && path_rec_start(dev, path)) {
984
				spin_unlock_irqrestore(&priv->lock, flags);
985 986
				if (new_path)
					path_free(dev, path);
L
Linus Torvalds 已提交
987 988 989 990
				return;
			} else
				__path_add(dev, path);
		} else {
991
			++dev->stats.tx_dropped;
L
Linus Torvalds 已提交
992 993 994
			dev_kfree_skb_any(skb);
		}

995
		spin_unlock_irqrestore(&priv->lock, flags);
L
Linus Torvalds 已提交
996 997 998
		return;
	}

999
	if (path->ah) {
L
Linus Torvalds 已提交
1000 1001 1002
		ipoib_dbg(priv, "Send unicast ARP to %04x\n",
			  be16_to_cpu(path->pathrec.dlid));

1003
		spin_unlock_irqrestore(&priv->lock, flags);
1004 1005
		path->ah->last_send = rn->send(dev, skb, path->ah->ah,
					       IPOIB_QPN(phdr->hwaddr));
1006
		return;
L
Linus Torvalds 已提交
1007 1008
	} else if ((path->query || !path_rec_start(dev, path)) &&
		   skb_queue_len(&path->queue) < IPOIB_MAX_PATH_REC_QUEUE) {
1009
		push_pseudo_header(skb, phdr->hwaddr);
L
Linus Torvalds 已提交
1010 1011
		__skb_queue_tail(&path->queue, skb);
	} else {
1012
		++dev->stats.tx_dropped;
L
Linus Torvalds 已提交
1013 1014 1015
		dev_kfree_skb_any(skb);
	}

1016
	spin_unlock_irqrestore(&priv->lock, flags);
L
Linus Torvalds 已提交
1017 1018 1019 1020
}

static int ipoib_start_xmit(struct sk_buff *skb, struct net_device *dev)
{
1021
	struct ipoib_dev_priv *priv = ipoib_priv(dev);
1022
	struct rdma_netdev *rn = netdev_priv(dev);
L
Linus Torvalds 已提交
1023
	struct ipoib_neigh *neigh;
1024
	struct ipoib_pseudo_header *phdr;
1025
	struct ipoib_header *header;
L
Linus Torvalds 已提交
1026 1027
	unsigned long flags;

1028 1029
	phdr = (struct ipoib_pseudo_header *) skb->data;
	skb_pull(skb, sizeof(*phdr));
1030 1031
	header = (struct ipoib_header *) skb->data;

1032
	if (unlikely(phdr->hwaddr[4] == 0xff)) {
1033 1034 1035 1036
		/* multicast, arrange "if" according to probability */
		if ((header->proto != htons(ETH_P_IP)) &&
		    (header->proto != htons(ETH_P_IPV6)) &&
		    (header->proto != htons(ETH_P_ARP)) &&
1037 1038
		    (header->proto != htons(ETH_P_RARP)) &&
		    (header->proto != htons(ETH_P_TIPC))) {
1039
			/* ethertype not supported by IPoIB */
1040 1041
			++dev->stats.tx_dropped;
			dev_kfree_skb_any(skb);
1042
			return NETDEV_TX_OK;
1043
		}
1044
		/* Add in the P_Key for multicast*/
1045 1046
		phdr->hwaddr[8] = (priv->pkey >> 8) & 0xff;
		phdr->hwaddr[9] = priv->pkey & 0xff;
1047

1048
		neigh = ipoib_neigh_get(dev, phdr->hwaddr);
1049 1050
		if (likely(neigh))
			goto send_using_neigh;
1051
		ipoib_mcast_send(dev, phdr->hwaddr, skb);
1052
		return NETDEV_TX_OK;
1053
	}
L
Linus Torvalds 已提交
1054

1055 1056 1057 1058
	/* unicast, arrange "switch" according to probability */
	switch (header->proto) {
	case htons(ETH_P_IP):
	case htons(ETH_P_IPV6):
1059
	case htons(ETH_P_TIPC):
1060
		neigh = ipoib_neigh_get(dev, phdr->hwaddr);
1061
		if (unlikely(!neigh)) {
1062
			neigh_add_path(skb, phdr->hwaddr, dev);
1063
			return NETDEV_TX_OK;
1064
		}
1065 1066 1067 1068
		break;
	case htons(ETH_P_ARP):
	case htons(ETH_P_RARP):
		/* for unicast ARP and RARP should always perform path find */
1069
		unicast_arp_send(skb, dev, phdr);
1070 1071 1072 1073 1074 1075 1076
		return NETDEV_TX_OK;
	default:
		/* ethertype not supported by IPoIB */
		++dev->stats.tx_dropped;
		dev_kfree_skb_any(skb);
		return NETDEV_TX_OK;
	}
1077

1078 1079 1080 1081 1082 1083
send_using_neigh:
	/* note we now hold a ref to neigh */
	if (ipoib_cm_get(neigh)) {
		if (ipoib_cm_up(neigh)) {
			ipoib_cm_send(dev, skb, ipoib_cm_get(neigh));
			goto unref;
L
Linus Torvalds 已提交
1084
		}
1085
	} else if (neigh->ah) {
1086 1087
		neigh->ah->last_send = rn->send(dev, skb, neigh->ah->ah,
						IPOIB_QPN(phdr->hwaddr));
1088 1089
		goto unref;
	}
L
Linus Torvalds 已提交
1090

1091
	if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE) {
1092
		push_pseudo_header(skb, phdr->hwaddr);
1093 1094 1095
		spin_lock_irqsave(&priv->lock, flags);
		__skb_queue_tail(&neigh->queue, skb);
		spin_unlock_irqrestore(&priv->lock, flags);
L
Linus Torvalds 已提交
1096
	} else {
1097 1098 1099
		++dev->stats.tx_dropped;
		dev_kfree_skb_any(skb);
	}
L
Linus Torvalds 已提交
1100

1101 1102
unref:
	ipoib_neigh_put(neigh);
L
Linus Torvalds 已提交
1103 1104 1105 1106 1107 1108

	return NETDEV_TX_OK;
}

static void ipoib_timeout(struct net_device *dev)
{
1109
	struct ipoib_dev_priv *priv = ipoib_priv(dev);
L
Linus Torvalds 已提交
1110

1111
	ipoib_warn(priv, "transmit timeout: latency %d msecs\n",
1112
		   jiffies_to_msecs(jiffies - dev_trans_start(dev)));
1113 1114 1115
	ipoib_warn(priv, "queue stopped %d, tx_head %u, tx_tail %u\n",
		   netif_queue_stopped(dev),
		   priv->tx_head, priv->tx_tail);
L
Linus Torvalds 已提交
1116 1117 1118 1119 1120 1121
	/* XXX reset QP, etc. */
}

static int ipoib_hard_header(struct sk_buff *skb,
			     struct net_device *dev,
			     unsigned short type,
1122
			     const void *daddr, const void *saddr, unsigned len)
L
Linus Torvalds 已提交
1123 1124 1125 1126 1127 1128 1129 1130 1131
{
	struct ipoib_header *header;

	header = (struct ipoib_header *) skb_push(skb, sizeof *header);

	header->proto = htons(type);
	header->reserved = 0;

	/*
1132
	 * we don't rely on dst_entry structure,  always stuff the
1133
	 * destination address into skb hard header so we can figure out where
1134
	 * to send the packet later.
L
Linus Torvalds 已提交
1135
	 */
1136
	push_pseudo_header(skb, daddr);
L
Linus Torvalds 已提交
1137

1138
	return IPOIB_HARD_LEN;
L
Linus Torvalds 已提交
1139 1140 1141 1142
}

static void ipoib_set_mcast_list(struct net_device *dev)
{
1143
	struct ipoib_dev_priv *priv = ipoib_priv(dev);
L
Linus Torvalds 已提交
1144

L
Leonid Arsh 已提交
1145 1146 1147 1148 1149
	if (!test_bit(IPOIB_FLAG_OPER_UP, &priv->flags)) {
		ipoib_dbg(priv, "IPOIB_FLAG_OPER_UP not set");
		return;
	}

1150
	queue_work(priv->wq, &priv->restart_task);
L
Linus Torvalds 已提交
1151 1152
}

1153 1154
static int ipoib_get_iflink(const struct net_device *dev)
{
1155
	struct ipoib_dev_priv *priv = ipoib_priv(dev);
1156

E
Erez Shitrit 已提交
1157 1158 1159 1160 1161
	/* parent interface */
	if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags))
		return dev->ifindex;

	/* child/vlan interface */
1162 1163 1164
	return priv->parent->ifindex;
}

1165
/*
 * Map a destination hardware address to a bucket index of @htbl.
 *
 * Only the parts of the address that help spreading are hashed: the QPN
 * (octets [1:4)) and the port GUID (octets [12:20)).  The subnet prefix is
 * left out because the same remote port (GUID) cannot be reached with the
 * same remote QPN via two different subnets.
 */
static u32 ipoib_addr_hash(struct ipoib_neigh_hash *htbl, u8 *daddr)
{
	u32 *words = (u32 *) daddr;
	u32 qpn = IPOIB_QPN_MASK & words[0];

	return jhash_3words(words[3], words[4], qpn, 0) & htbl->mask;
}

/*
 * Look up the neigh for @daddr under rcu_read_lock_bh().
 *
 * Returns the entry with one reference taken for the caller, or NULL when
 * there is no hash table, no matching entry, or the matching entry's
 * refcount has already dropped to zero.  The entry's "alive" stamp is
 * refreshed only while its pending queue is below IPOIB_MAX_PATH_REC_QUEUE.
 */
struct ipoib_neigh *ipoib_neigh_get(struct net_device *dev, u8 *daddr)
{
	struct ipoib_dev_priv *priv = ipoib_priv(dev);
	struct ipoib_neigh_hash *htbl;
	struct ipoib_neigh *found = NULL;
	struct ipoib_neigh *cur;

	rcu_read_lock_bh();

	htbl = rcu_dereference_bh(priv->ntbl.htbl);
	if (!htbl)
		goto out_unlock;

	cur = rcu_dereference_bh(htbl->buckets[ipoib_addr_hash(htbl, daddr)]);
	while (cur != NULL) {
		if (memcmp(daddr, cur->daddr, INFINIBAND_ALEN) != 0) {
			cur = rcu_dereference_bh(cur->hnext);
			continue;
		}

		/* match: take one ref on behalf of the caller unless deleted */
		if (atomic_inc_not_zero(&cur->refcnt)) {
			if (likely(skb_queue_len(&cur->queue) < IPOIB_MAX_PATH_REC_QUEUE))
				cur->alive = jiffies;
			found = cur;
		}
		break;
	}

out_unlock:
	rcu_read_unlock_bh();
	return found;
}

static void __ipoib_reap_neigh(struct ipoib_dev_priv *priv)
{
	struct ipoib_neigh_table *ntbl = &priv->ntbl;
	struct ipoib_neigh_hash *htbl;
	unsigned long neigh_obsolete;
	unsigned long dt;
L
Linus Torvalds 已提交
1225
	unsigned long flags;
1226
	int i;
1227
	LIST_HEAD(remove_list);
L
Linus Torvalds 已提交
1228

1229
	if (test_bit(IPOIB_STOP_NEIGH_GC, &priv->flags))
1230
		return;
L
Linus Torvalds 已提交
1231

1232
	spin_lock_irqsave(&priv->lock, flags);
1233 1234

	htbl = rcu_dereference_protected(ntbl->htbl,
1235
					 lockdep_is_held(&priv->lock));
1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251

	if (!htbl)
		goto out_unlock;

	/* neigh is obsolete if it was idle for two GC periods */
	dt = 2 * arp_tbl.gc_interval;
	neigh_obsolete = jiffies - dt;
	/* handle possible race condition */
	if (test_bit(IPOIB_STOP_NEIGH_GC, &priv->flags))
		goto out_unlock;

	for (i = 0; i < htbl->size; i++) {
		struct ipoib_neigh *neigh;
		struct ipoib_neigh __rcu **np = &htbl->buckets[i];

		while ((neigh = rcu_dereference_protected(*np,
1252
							  lockdep_is_held(&priv->lock))) != NULL) {
1253 1254
			/* was the neigh idle for two GC periods */
			if (time_after(neigh_obsolete, neigh->alive)) {
1255

1256
				ipoib_check_and_add_mcast_sendonly(priv, neigh->daddr + 4, &remove_list);
1257

1258 1259
				rcu_assign_pointer(*np,
						   rcu_dereference_protected(neigh->hnext,
1260
									     lockdep_is_held(&priv->lock)));
1261
				/* remove from path/mc list */
1262
				list_del_init(&neigh->list);
1263 1264 1265 1266
				call_rcu(&neigh->rcu, ipoib_neigh_reclaim);
			} else {
				np = &neigh->hnext;
			}
L
Linus Torvalds 已提交
1267

1268 1269
		}
	}
L
Linus Torvalds 已提交
1270

1271
out_unlock:
1272
	spin_unlock_irqrestore(&priv->lock, flags);
1273
	ipoib_mcast_remove_list(&remove_list);
1274
}
L
Linus Torvalds 已提交
1275

1276 1277 1278 1279 1280 1281 1282 1283
static void ipoib_reap_neigh(struct work_struct *work)
{
	struct ipoib_dev_priv *priv =
		container_of(work, struct ipoib_dev_priv, neigh_reap_task.work);

	__ipoib_reap_neigh(priv);

	if (!test_bit(IPOIB_STOP_NEIGH_GC, &priv->flags))
1284
		queue_delayed_work(priv->wq, &priv->neigh_reap_task,
1285
				   arp_tbl.gc_interval);
L
Linus Torvalds 已提交
1286 1287
}

1288 1289

/*
 * Allocate (GFP_ATOMIC) and initialise a neigh for @daddr on @dev.
 * Returns the new entry holding one reference for the caller, or NULL if
 * the allocation fails.
 */
static struct ipoib_neigh *ipoib_neigh_ctor(u8 *daddr,
				      struct net_device *dev)
{
	struct ipoib_neigh *n = kzalloc(sizeof(*n), GFP_ATOMIC);

	if (n == NULL)
		return NULL;

	n->dev = dev;
	memcpy(&n->daddr, daddr, sizeof(n->daddr));
	skb_queue_head_init(&n->queue);
	INIT_LIST_HEAD(&n->list);
	ipoib_cm_set(n, NULL);
	/* one ref on behalf of the caller */
	atomic_set(&n->refcnt, 1);

	return n;
}

/*
 * Find or create the neigh for @daddr.  The table is dereferenced with
 * lockdep_is_held(&priv->lock), i.e. the caller is expected to hold
 * priv->lock.
 *
 * Returns an entry with one reference for the caller (a new entry also
 * carries a second reference owned by the hash table), or NULL when there
 * is no hash table or the allocation fails.
 */
struct ipoib_neigh *ipoib_neigh_alloc(u8 *daddr,
				      struct net_device *dev)
{
	struct ipoib_dev_priv *priv = ipoib_priv(dev);
	struct ipoib_neigh_table *ntbl = &priv->ntbl;
	struct ipoib_neigh_hash *htbl;
	struct ipoib_neigh *neigh;
	u32 bucket;

	htbl = rcu_dereference_protected(ntbl->htbl,
					 lockdep_is_held(&priv->lock));
	if (!htbl)
		return NULL;

	/*
	 * Another thread may have added the entry already, and the hash may
	 * have been resized, so recompute the hash and search first.
	 */
	bucket = ipoib_addr_hash(htbl, daddr);
	neigh = rcu_dereference_protected(htbl->buckets[bucket],
					  lockdep_is_held(&priv->lock));
	while (neigh != NULL) {
		if (memcmp(daddr, neigh->daddr, INFINIBAND_ALEN) == 0) {
			/* found, take one ref on behalf of the caller */
			if (atomic_inc_not_zero(&neigh->refcnt)) {
				neigh->alive = jiffies;
				return neigh;
			}
			/* entry is being deleted: create a fresh one */
			break;
		}
		neigh = rcu_dereference_protected(neigh->hnext,
						  lockdep_is_held(&priv->lock));
	}

	neigh = ipoib_neigh_ctor(daddr, dev);
	if (!neigh)
		return NULL;

	/* one ref on behalf of the hash table */
	atomic_inc(&neigh->refcnt);
	neigh->alive = jiffies;

	/* insert at the head of the bucket chain */
	rcu_assign_pointer(neigh->hnext,
			   rcu_dereference_protected(htbl->buckets[bucket],
						     lockdep_is_held(&priv->lock)));
	rcu_assign_pointer(htbl->buckets[bucket], neigh);
	atomic_inc(&ntbl->entries);

	return neigh;
}

1365
void ipoib_neigh_dtor(struct ipoib_neigh *neigh)
1366
{
1367 1368
	/* neigh reference count was dropprd to zero */
	struct net_device *dev = neigh->dev;
1369
	struct ipoib_dev_priv *priv = ipoib_priv(dev);
1370
	struct sk_buff *skb;
1371 1372
	if (neigh->ah)
		ipoib_put_ah(neigh->ah);
1373
	while ((skb = __skb_dequeue(&neigh->queue))) {
1374
		++dev->stats.tx_dropped;
1375 1376
		dev_kfree_skb_any(skb);
	}
1377 1378
	if (ipoib_cm_get(neigh))
		ipoib_cm_destroy_tx(ipoib_cm_get(neigh));
1379
	ipoib_dbg(ipoib_priv(dev),
1380 1381 1382
		  "neigh free for %06x %pI6\n",
		  IPOIB_QPN(neigh->daddr),
		  neigh->daddr + 4);
1383
	kfree(neigh);
1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395
	if (atomic_dec_and_test(&priv->ntbl.entries)) {
		if (test_bit(IPOIB_NEIGH_TBL_FLUSH, &priv->flags))
			complete(&priv->ntbl.flushed);
	}
}

/*
 * RCU callback queued when a neigh is unlinked from the hash table: drops
 * one reference on it.  The TX path may still hold another reference.
 */
static void ipoib_neigh_reclaim(struct rcu_head *rp)
{
	ipoib_neigh_put(container_of(rp, struct ipoib_neigh, rcu));
}

1398
void ipoib_neigh_free(struct ipoib_neigh *neigh)
L
Linus Torvalds 已提交
1399
{
1400
	struct net_device *dev = neigh->dev;
1401
	struct ipoib_dev_priv *priv = ipoib_priv(dev);
1402 1403 1404 1405 1406 1407 1408
	struct ipoib_neigh_table *ntbl = &priv->ntbl;
	struct ipoib_neigh_hash *htbl;
	struct ipoib_neigh __rcu **np;
	struct ipoib_neigh *n;
	u32 hash_val;

	htbl = rcu_dereference_protected(ntbl->htbl,
1409
					lockdep_is_held(&priv->lock));
1410
	if (!htbl)
1411
		return;
1412 1413 1414 1415

	hash_val = ipoib_addr_hash(htbl, neigh->daddr);
	np = &htbl->buckets[hash_val];
	for (n = rcu_dereference_protected(*np,
1416
					    lockdep_is_held(&priv->lock));
1417
	     n != NULL;
1418
	     n = rcu_dereference_protected(*np,
1419
					lockdep_is_held(&priv->lock))) {
1420 1421 1422 1423
		if (n == neigh) {
			/* found */
			rcu_assign_pointer(*np,
					   rcu_dereference_protected(neigh->hnext,
1424
								     lockdep_is_held(&priv->lock)));
1425
			/* remove from parent list */
1426
			list_del_init(&neigh->list);
1427
			call_rcu(&neigh->rcu, ipoib_neigh_reclaim);
1428
			return;
1429 1430 1431 1432 1433 1434 1435 1436 1437 1438
		} else {
			np = &n->hnext;
		}
	}
}

static int ipoib_neigh_hash_init(struct ipoib_dev_priv *priv)
{
	struct ipoib_neigh_table *ntbl = &priv->ntbl;
	struct ipoib_neigh_hash *htbl;
1439
	struct ipoib_neigh __rcu **buckets;
1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456
	u32 size;

	clear_bit(IPOIB_NEIGH_TBL_FLUSH, &priv->flags);
	ntbl->htbl = NULL;
	htbl = kzalloc(sizeof(*htbl), GFP_KERNEL);
	if (!htbl)
		return -ENOMEM;
	set_bit(IPOIB_STOP_NEIGH_GC, &priv->flags);
	size = roundup_pow_of_two(arp_tbl.gc_thresh3);
	buckets = kzalloc(size * sizeof(*buckets), GFP_KERNEL);
	if (!buckets) {
		kfree(htbl);
		return -ENOMEM;
	}
	htbl->size = size;
	htbl->mask = (size - 1);
	htbl->buckets = buckets;
1457
	RCU_INIT_POINTER(ntbl->htbl, htbl);
1458
	htbl->ntbl = ntbl;
1459 1460 1461 1462
	atomic_set(&ntbl->entries, 0);

	/* start garbage collection */
	clear_bit(IPOIB_STOP_NEIGH_GC, &priv->flags);
1463
	queue_delayed_work(priv->wq, &priv->neigh_reap_task,
1464
			   arp_tbl.gc_interval);
L
Linus Torvalds 已提交
1465 1466 1467 1468

	return 0;
}

1469 1470 1471 1472 1473 1474
static void neigh_hash_free_rcu(struct rcu_head *head)
{
	struct ipoib_neigh_hash *htbl = container_of(head,
						    struct ipoib_neigh_hash,
						    rcu);
	struct ipoib_neigh __rcu **buckets = htbl->buckets;
1475
	struct ipoib_neigh_table *ntbl = htbl->ntbl;
1476 1477 1478

	kfree(buckets);
	kfree(htbl);
1479
	complete(&ntbl->deleted);
1480 1481 1482 1483
}

/*
 * Remove every neigh whose GID (daddr + 4) equals @gid, i.e. all neighs
 * connected to a given path or mcast.  Runs under priv->lock; each removed
 * entry is reclaimed through call_rcu().
 */
void ipoib_del_neighs_by_gid(struct net_device *dev, u8 *gid)
{
	struct ipoib_dev_priv *priv = ipoib_priv(dev);
	struct ipoib_neigh_hash *htbl;
	unsigned long flags;
	int i;

	spin_lock_irqsave(&priv->lock, flags);

	htbl = rcu_dereference_protected(priv->ntbl.htbl,
					 lockdep_is_held(&priv->lock));
	if (!htbl)
		goto out_unlock;

	for (i = 0; i < htbl->size; i++) {
		struct ipoib_neigh __rcu **link = &htbl->buckets[i];
		struct ipoib_neigh *cur;

		for (;;) {
			cur = rcu_dereference_protected(*link,
							lockdep_is_held(&priv->lock));
			if (cur == NULL)
				break;

			/* not ours: keep it and advance */
			if (memcmp(gid, cur->daddr + 4, sizeof (union ib_gid))) {
				link = &cur->hnext;
				continue;
			}

			/* delete neighs belong to this parent */
			rcu_assign_pointer(*link,
					   rcu_dereference_protected(cur->hnext,
								     lockdep_is_held(&priv->lock)));
			/* remove from parent list */
			list_del_init(&cur->list);
			call_rcu(&cur->rcu, ipoib_neigh_reclaim);
		}
	}

out_unlock:
	spin_unlock_irqrestore(&priv->lock, flags);
}

static void ipoib_flush_neighs(struct ipoib_dev_priv *priv)
{
	struct ipoib_neigh_table *ntbl = &priv->ntbl;
	struct ipoib_neigh_hash *htbl;
	unsigned long flags;
1528
	int i, wait_flushed = 0;
1529

1530
	init_completion(&priv->ntbl.flushed);
1531

1532
	spin_lock_irqsave(&priv->lock, flags);
1533 1534

	htbl = rcu_dereference_protected(ntbl->htbl,
1535
					lockdep_is_held(&priv->lock));
1536 1537 1538
	if (!htbl)
		goto out_unlock;

1539 1540 1541 1542
	wait_flushed = atomic_read(&priv->ntbl.entries);
	if (!wait_flushed)
		goto free_htbl;

1543 1544 1545 1546 1547
	for (i = 0; i < htbl->size; i++) {
		struct ipoib_neigh *neigh;
		struct ipoib_neigh __rcu **np = &htbl->buckets[i];

		while ((neigh = rcu_dereference_protected(*np,
1548
				       lockdep_is_held(&priv->lock))) != NULL) {
1549 1550
			rcu_assign_pointer(*np,
					   rcu_dereference_protected(neigh->hnext,
1551
								     lockdep_is_held(&priv->lock)));
1552
			/* remove from path/mc list */
1553
			list_del_init(&neigh->list);
1554 1555 1556 1557
			call_rcu(&neigh->rcu, ipoib_neigh_reclaim);
		}
	}

1558
free_htbl:
1559 1560 1561 1562
	rcu_assign_pointer(ntbl->htbl, NULL);
	call_rcu(&htbl->rcu, neigh_hash_free_rcu);

out_unlock:
1563
	spin_unlock_irqrestore(&priv->lock, flags);
1564 1565
	if (wait_flushed)
		wait_for_completion(&priv->ntbl.flushed);
1566 1567 1568 1569
}

static void ipoib_neigh_hash_uninit(struct net_device *dev)
{
1570
	struct ipoib_dev_priv *priv = ipoib_priv(dev);
1571 1572 1573
	int stopped;

	ipoib_dbg(priv, "ipoib_neigh_hash_uninit\n");
1574
	init_completion(&priv->ntbl.deleted);
1575 1576 1577 1578 1579 1580 1581
	set_bit(IPOIB_NEIGH_TBL_FLUSH, &priv->flags);

	/* Stop GC if called at init fail need to cancel work */
	stopped = test_and_set_bit(IPOIB_STOP_NEIGH_GC, &priv->flags);
	if (!stopped)
		cancel_delayed_work(&priv->neigh_reap_task);

1582 1583 1584
	ipoib_flush_neighs(priv);

	wait_for_completion(&priv->ntbl.deleted);
1585 1586
}

1587 1588
void ipoib_dev_uninit_default(struct net_device *dev)
{
1589
	struct ipoib_dev_priv *priv = ipoib_priv(dev);
1590

1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601
	ipoib_transport_dev_cleanup(dev);

	ipoib_cm_dev_cleanup(dev);

	kfree(priv->rx_ring);
	vfree(priv->tx_ring);

	priv->rx_ring = NULL;
	priv->tx_ring = NULL;
}

1602
static int ipoib_dev_init_default(struct net_device *dev)
L
Linus Torvalds 已提交
1603
{
1604
	struct ipoib_dev_priv *priv = ipoib_priv(dev);
L
Linus Torvalds 已提交
1605

1606 1607
	netif_napi_add(dev, &priv->napi, ipoib_poll, NAPI_POLL_WEIGHT);

L
Linus Torvalds 已提交
1608
	/* Allocate RX/TX "rings" to hold queued skbs */
1609
	priv->rx_ring =	kzalloc(ipoib_recvq_size * sizeof *priv->rx_ring,
L
Linus Torvalds 已提交
1610
				GFP_KERNEL);
1611
	if (!priv->rx_ring)
1612
		goto out;
L
Linus Torvalds 已提交
1613

1614
	priv->tx_ring = vzalloc(ipoib_sendq_size * sizeof *priv->tx_ring);
L
Linus Torvalds 已提交
1615 1616
	if (!priv->tx_ring) {
		printk(KERN_WARNING "%s: failed to allocate TX ring (%d entries)\n",
1617
		       priv->ca->name, ipoib_sendq_size);
L
Linus Torvalds 已提交
1618 1619 1620
		goto out_rx_ring_cleanup;
	}

1621
	/* priv->tx_head, tx_tail & tx_outstanding are already 0 */
L
Linus Torvalds 已提交
1622

1623 1624 1625
	if (ipoib_transport_dev_init(dev, priv->ca)) {
		pr_warn("%s: ipoib_transport_dev_init failed\n",
			priv->ca->name);
L
Linus Torvalds 已提交
1626
		goto out_tx_ring_cleanup;
1627 1628
	}

1629 1630 1631 1632 1633
	/* after qp created set dev address */
	priv->dev->dev_addr[1] = (priv->qp->qp_num >> 16) & 0xff;
	priv->dev->dev_addr[2] = (priv->qp->qp_num >>  8) & 0xff;
	priv->dev->dev_addr[3] = (priv->qp->qp_num) & 0xff;

1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650
	setup_timer(&priv->poll_timer, ipoib_ib_tx_timer_func,
		    (unsigned long)dev);

	return 0;

out_tx_ring_cleanup:
	vfree(priv->tx_ring);

out_rx_ring_cleanup:
	kfree(priv->rx_ring);

out:
	return -ENOMEM;
}

int ipoib_dev_init(struct net_device *dev, struct ib_device *ca, int port)
{
1651
	struct ipoib_dev_priv *priv = ipoib_priv(dev);
1652 1653 1654 1655 1656
	int ret = -ENOMEM;

	priv->ca = ca;
	priv->port = port;
	priv->qp = NULL;
L
Linus Torvalds 已提交
1657

1658
	/*
1659 1660
	 * the various IPoIB tasks assume they will never race against
	 * themselves, so always use a single thread workqueue
1661
	 */
1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674
	priv->wq = alloc_ordered_workqueue("ipoib_wq", WQ_MEM_RECLAIM);
	if (!priv->wq) {
		pr_warn("%s: failed to allocate device WQ\n", dev->name);
		goto out;
	}

	/* create pd, which used both for control and datapath*/
	priv->pd = ib_alloc_pd(priv->ca, 0);
	if (IS_ERR(priv->pd)) {
		pr_warn("%s: failed to allocate PD\n", ca->name);
		goto clean_wq;
	}

1675
	ret = priv->rn_ops->ndo_init(dev);
1676 1677 1678 1679 1680 1681 1682
	if (ret) {
		pr_warn("%s failed to init HW resource\n", dev->name);
		goto out_free_pd;
	}

	if (ipoib_neigh_hash_init(priv) < 0) {
		pr_warn("%s failed to init neigh hash\n", dev->name);
1683
		goto out_dev_uninit;
1684 1685 1686 1687 1688 1689 1690 1691 1692
	}

	if (dev->flags & IFF_UP) {
		if (ipoib_ib_dev_open(dev)) {
			pr_warn("%s failed to open device\n", dev->name);
			ret = -ENODEV;
			goto out_dev_uninit;
		}
	}
1693

L
Linus Torvalds 已提交
1694 1695
	return 0;

1696 1697 1698
out_dev_uninit:
	ipoib_ib_dev_cleanup(dev);

1699 1700 1701 1702 1703
out_free_pd:
	if (priv->pd) {
		ib_dealloc_pd(priv->pd);
		priv->pd = NULL;
	}
L
Linus Torvalds 已提交
1704

1705 1706 1707 1708 1709
clean_wq:
	if (priv->wq) {
		destroy_workqueue(priv->wq);
		priv->wq = NULL;
	}
L
Linus Torvalds 已提交
1710 1711

out:
1712
	return ret;
L
Linus Torvalds 已提交
1713 1714 1715 1716
}

void ipoib_dev_cleanup(struct net_device *dev)
{
1717
	struct ipoib_dev_priv *priv = ipoib_priv(dev), *cpriv, *tcpriv;
O
Or Gerlitz 已提交
1718 1719 1720
	LIST_HEAD(head);

	ASSERT_RTNL();
L
Linus Torvalds 已提交
1721 1722 1723

	/* Delete any child interfaces first */
	list_for_each_entry_safe(cpriv, tcpriv, &priv->child_intfs, list) {
1724 1725 1726
		/* Stop GC on child */
		set_bit(IPOIB_STOP_NEIGH_GC, &cpriv->flags);
		cancel_delayed_work(&cpriv->neigh_reap_task);
O
Or Gerlitz 已提交
1727
		unregister_netdevice_queue(cpriv->dev, &head);
L
Linus Torvalds 已提交
1728
	}
O
Or Gerlitz 已提交
1729
	unregister_netdevice_many(&head);
L
Linus Torvalds 已提交
1730

1731 1732
	ipoib_neigh_hash_uninit(dev);

L
Linus Torvalds 已提交
1733 1734
	ipoib_ib_dev_cleanup(dev);

1735 1736 1737 1738 1739 1740
	/* no more works over the priv->wq */
	if (priv->wq) {
		flush_workqueue(priv->wq);
		destroy_workqueue(priv->wq);
		priv->wq = NULL;
	}
L
Linus Torvalds 已提交
1741 1742
}

1743 1744
static int ipoib_set_vf_link_state(struct net_device *dev, int vf, int link_state)
{
1745
	struct ipoib_dev_priv *priv = ipoib_priv(dev);
1746 1747 1748 1749 1750 1751 1752

	return ib_set_vf_link_state(priv->ca, vf, priv->port, link_state);
}

static int ipoib_get_vf_config(struct net_device *dev, int vf,
			       struct ifla_vf_info *ivf)
{
1753
	struct ipoib_dev_priv *priv = ipoib_priv(dev);
1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766
	int err;

	err = ib_get_vf_config(priv->ca, vf, priv->port, ivf);
	if (err)
		return err;

	ivf->vf = vf;

	return 0;
}

/*
 * ndo_set_vf_guid: only node and port GUID types are accepted (-EINVAL
 * otherwise); the request is then forwarded to the IB device.
 */
static int ipoib_set_vf_guid(struct net_device *dev, int vf, u64 guid, int type)
{
	struct ipoib_dev_priv *p = ipoib_priv(dev);

	switch (type) {
	case IFLA_VF_IB_NODE_GUID:
	case IFLA_VF_IB_PORT_GUID:
		return ib_set_vf_guid(p->ca, vf, p->port, guid, type);
	default:
		return -EINVAL;
	}
}

static int ipoib_get_vf_stats(struct net_device *dev, int vf,
			      struct ifla_vf_stats *vf_stats)
{
1778
	struct ipoib_dev_priv *priv = ipoib_priv(dev);
1779 1780 1781 1782

	return ib_get_vf_stats(priv->ca, vf, priv->port, vf_stats);
}

1783 1784 1785 1786
/* Link-layer header operations: only header creation is provided. */
static const struct header_ops ipoib_header_ops = {
	.create = ipoib_hard_header,
};

1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800
static const struct net_device_ops ipoib_netdev_ops_pf = {
	.ndo_uninit		 = ipoib_uninit,
	.ndo_open		 = ipoib_open,
	.ndo_stop		 = ipoib_stop,
	.ndo_change_mtu		 = ipoib_change_mtu,
	.ndo_fix_features	 = ipoib_fix_features,
	.ndo_start_xmit		 = ipoib_start_xmit,
	.ndo_tx_timeout		 = ipoib_timeout,
	.ndo_set_rx_mode	 = ipoib_set_mcast_list,
	.ndo_get_iflink		 = ipoib_get_iflink,
	.ndo_set_vf_link_state	 = ipoib_set_vf_link_state,
	.ndo_get_vf_config	 = ipoib_get_vf_config,
	.ndo_get_vf_stats	 = ipoib_get_vf_stats,
	.ndo_set_vf_guid	 = ipoib_set_vf_guid,
1801
	.ndo_set_mac_address	 = ipoib_set_mac,
1802 1803 1804
};

static const struct net_device_ops ipoib_netdev_ops_vf = {
O
Or Gerlitz 已提交
1805
	.ndo_uninit		 = ipoib_uninit,
1806 1807 1808
	.ndo_open		 = ipoib_open,
	.ndo_stop		 = ipoib_stop,
	.ndo_change_mtu		 = ipoib_change_mtu,
1809
	.ndo_fix_features	 = ipoib_fix_features,
1810 1811
	.ndo_start_xmit	 	 = ipoib_start_xmit,
	.ndo_tx_timeout		 = ipoib_timeout,
1812
	.ndo_set_rx_mode	 = ipoib_set_mcast_list,
1813
	.ndo_get_iflink		 = ipoib_get_iflink,
1814 1815
};

1816
void ipoib_setup_common(struct net_device *dev)
L
Linus Torvalds 已提交
1817
{
1818
	dev->header_ops		 = &ipoib_header_ops;
1819

E
Eli Cohen 已提交
1820 1821
	ipoib_set_ethtool_ops(dev);

1822
	dev->watchdog_timeo	 = HZ;
L
Linus Torvalds 已提交
1823

1824
	dev->flags		|= IFF_BROADCAST | IFF_MULTICAST;
L
Linus Torvalds 已提交
1825

1826
	dev->hard_header_len	 = IPOIB_HARD_LEN;
1827 1828 1829
	dev->addr_len		 = INFINIBAND_ALEN;
	dev->type		 = ARPHRD_INFINIBAND;
	dev->tx_queue_len	 = ipoib_sendq_size * 2;
E
Eli Cohen 已提交
1830 1831
	dev->features		 = (NETIF_F_VLAN_CHALLENGED	|
				    NETIF_F_HIGHDMA);
1832
	netif_keep_dst(dev);
L
Linus Torvalds 已提交
1833 1834

	memcpy(dev->broadcast, ipv4_bcast_addr, INFINIBAND_ALEN);
1835
}
L
Linus Torvalds 已提交
1836

1837 1838 1839
static void ipoib_build_priv(struct net_device *dev)
{
	struct ipoib_dev_priv *priv = ipoib_priv(dev);
L
Linus Torvalds 已提交
1840

1841
	priv->dev = dev;
L
Linus Torvalds 已提交
1842
	spin_lock_init(&priv->lock);
1843
	init_rwsem(&priv->vlan_rwsem);
L
Linus Torvalds 已提交
1844 1845 1846 1847 1848 1849

	INIT_LIST_HEAD(&priv->path_list);
	INIT_LIST_HEAD(&priv->child_intfs);
	INIT_LIST_HEAD(&priv->dead_ahs);
	INIT_LIST_HEAD(&priv->multicast_list);

D
David Howells 已提交
1850
	INIT_DELAYED_WORK(&priv->mcast_task,   ipoib_mcast_join_task);
1851
	INIT_WORK(&priv->carrier_on_task, ipoib_mcast_carrier_on_task);
1852 1853 1854
	INIT_WORK(&priv->flush_light,   ipoib_ib_dev_flush_light);
	INIT_WORK(&priv->flush_normal,   ipoib_ib_dev_flush_normal);
	INIT_WORK(&priv->flush_heavy,   ipoib_ib_dev_flush_heavy);
D
David Howells 已提交
1855 1856
	INIT_WORK(&priv->restart_task, ipoib_mcast_restart_task);
	INIT_DELAYED_WORK(&priv->ah_reap_task, ipoib_reap_ah);
1857
	INIT_DELAYED_WORK(&priv->neigh_reap_task, ipoib_reap_neigh);
L
Linus Torvalds 已提交
1858 1859
}

1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871
/*
 * Ops installed on a netdev created by ipoib_create_netdev_default();
 * ipoib_intf_alloc() saves them as priv->rn_ops.
 */
static const struct net_device_ops ipoib_netdev_default_pf = {
	.ndo_init	= ipoib_dev_init_default,
	.ndo_uninit	= ipoib_dev_uninit_default,
	.ndo_open	= ipoib_ib_dev_open_default,
	.ndo_stop	= ipoib_ib_dev_stop_default,
};

static struct net_device
*ipoib_create_netdev_default(struct ib_device *hca,
			     const char *name,
			     unsigned char name_assign_type,
			     void (*setup)(struct net_device *))
L
Linus Torvalds 已提交
1872 1873
{
	struct net_device *dev;
1874
	struct rdma_netdev *rn;
L
Linus Torvalds 已提交
1875

1876 1877 1878
	dev = alloc_netdev((int)sizeof(struct rdma_netdev),
			   name,
			   name_assign_type, setup);
L
Linus Torvalds 已提交
1879 1880 1881
	if (!dev)
		return NULL;

1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945
	rn = netdev_priv(dev);

	rn->send = ipoib_send;
	rn->attach_mcast = ipoib_mcast_attach;
	rn->detach_mcast = ipoib_mcast_detach;
	rn->hca = hca;

	dev->netdev_ops = &ipoib_netdev_default_pf;

	return dev;
}

/*
 * Obtain the netdev for an IPoIB interface.
 *
 * When the HCA provides alloc_rdma_netdev it is tried first: a valid device
 * is returned as is, -EOPNOTSUPP falls back to the default netdev, and a
 * NULL or any other error result makes this function return NULL.  Without
 * the callback the default netdev is created directly.
 */
static struct net_device *ipoib_get_netdev(struct ib_device *hca, u8 port,
					   const char *name)
{
	if (hca->alloc_rdma_netdev) {
		struct net_device *ndev;

		ndev = hca->alloc_rdma_netdev(hca, port,
					      RDMA_NETDEV_IPOIB, name,
					      NET_NAME_UNKNOWN,
					      ipoib_setup_common);
		if (!IS_ERR_OR_NULL(ndev))
			return ndev;

		if (PTR_ERR(ndev) != -EOPNOTSUPP)
			return NULL;
	}

	return ipoib_create_netdev_default(hca, name, NET_NAME_UNKNOWN,
					   ipoib_setup_common);
}

/*
 * Allocate the IPoIB private structure together with its netdev, remember
 * the netdev's original ops in priv->rn_ops, install the IPoIB netdev ops
 * and link the two objects through rn->clnt_priv.
 *
 * Returns the private structure, or NULL on allocation failure.
 */
struct ipoib_dev_priv *ipoib_intf_alloc(struct ib_device *hca, u8 port,
					const char *name)
{
	struct ipoib_dev_priv *priv;
	struct net_device *ndev;
	struct rdma_netdev *rn;

	priv = kzalloc(sizeof(*priv), GFP_KERNEL);
	if (priv == NULL)
		return NULL;

	ndev = ipoib_get_netdev(hca, port, name);
	if (ndev == NULL) {
		kfree(priv);
		return NULL;
	}

	priv->rn_ops = ndev->netdev_ops;

	/*
	 * fixme : should be after the query_cap
	 * NOTE(review): priv was zero-allocated just above and hca_caps is
	 * not assigned in this function, so the VF branch looks unreachable
	 * here; confirm where hca_caps gets filled in.
	 */
	ndev->netdev_ops = (priv->hca_caps & IB_DEVICE_VIRTUAL_FUNCTION) ?
			   &ipoib_netdev_ops_vf : &ipoib_netdev_ops_pf;

	rn = netdev_priv(ndev);
	rn->clnt_priv = priv;
	ipoib_build_priv(ndev);

	return priv;
}

1948 1949
static ssize_t show_pkey(struct device *dev,
			 struct device_attribute *attr, char *buf)
L
Linus Torvalds 已提交
1950
{
1951 1952
	struct net_device *ndev = to_net_dev(dev);
	struct ipoib_dev_priv *priv = ipoib_priv(ndev);
L
Linus Torvalds 已提交
1953 1954 1955

	return sprintf(buf, "0x%04x\n", priv->pkey);
}
1956
static DEVICE_ATTR(pkey, S_IRUGO, show_pkey, NULL);
L
Linus Torvalds 已提交
1957

1958 1959 1960
static ssize_t show_umcast(struct device *dev,
			   struct device_attribute *attr, char *buf)
{
1961 1962
	struct net_device *ndev = to_net_dev(dev);
	struct ipoib_dev_priv *priv = ipoib_priv(ndev);
1963 1964 1965 1966

	return sprintf(buf, "%d\n", test_bit(IPOIB_FLAG_UMCAST, &priv->flags));
}

1967
void ipoib_set_umcast(struct net_device *ndev, int umcast_val)
1968
{
1969
	struct ipoib_dev_priv *priv = ipoib_priv(ndev);
1970 1971 1972 1973 1974 1975 1976

	if (umcast_val > 0) {
		set_bit(IPOIB_FLAG_UMCAST, &priv->flags);
		ipoib_warn(priv, "ignoring multicast groups joined directly "
				"by userspace\n");
	} else
		clear_bit(IPOIB_FLAG_UMCAST, &priv->flags);
1977 1978 1979 1980 1981 1982 1983 1984 1985
}

/*
 * sysfs "umcast" store handler: parse the written value with
 * simple_strtoul() (base auto-detected) and apply it.  Always consumes the
 * whole write.
 */
static ssize_t set_umcast(struct device *dev,
			  struct device_attribute *attr,
			  const char *buf, size_t count)
{
	unsigned long requested;

	requested = simple_strtoul(buf, NULL, 0);
	ipoib_set_umcast(to_net_dev(dev), requested);

	return count;
}
static DEVICE_ATTR(umcast, S_IWUSR | S_IRUGO, show_umcast, set_umcast);

/* Create the "umcast" sysfs attribute on @dev; returns device_create_file()'s result. */
int ipoib_add_umcast_attr(struct net_device *dev)
{
	struct device *d = &dev->dev;

	return device_create_file(d, &dev_attr_umcast);
}

1996 1997 1998 1999 2000
static void set_base_guid(struct ipoib_dev_priv *priv, union ib_gid *gid)
{
	struct ipoib_dev_priv *child_priv;
	struct net_device *netdev = priv->dev;

2001
	netif_addr_lock_bh(netdev);
2002 2003 2004 2005 2006 2007 2008

	memcpy(&priv->local_gid.global.interface_id,
	       &gid->global.interface_id,
	       sizeof(gid->global.interface_id));
	memcpy(netdev->dev_addr + 4, &priv->local_gid, sizeof(priv->local_gid));
	clear_bit(IPOIB_FLAG_DEV_ADDR_SET, &priv->flags);

2009
	netif_addr_unlock_bh(netdev);
2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024

	if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) {
		down_read(&priv->vlan_rwsem);
		list_for_each_entry(child_priv, &priv->child_intfs, list)
			set_base_guid(child_priv, gid);
		up_read(&priv->vlan_rwsem);
	}
}

static int ipoib_check_lladdr(struct net_device *dev,
			      struct sockaddr_storage *ss)
{
	union ib_gid *gid = (union ib_gid *)(ss->__data + 4);
	int ret = 0;

2025
	netif_addr_lock_bh(dev);
2026 2027 2028 2029 2030 2031 2032 2033 2034

	/* Make sure the QPN, reserved and subnet prefix match the current
	 * lladdr, it also makes sure the lladdr is unicast.
	 */
	if (memcmp(dev->dev_addr, ss->__data,
		   4 + sizeof(gid->global.subnet_prefix)) ||
	    gid->global.interface_id == 0)
		ret = -EINVAL;

2035
	netif_addr_unlock_bh(dev);
2036 2037 2038 2039 2040 2041

	return ret;
}

static int ipoib_set_mac(struct net_device *dev, void *addr)
{
2042
	struct ipoib_dev_priv *priv = ipoib_priv(dev);
2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059
	struct sockaddr_storage *ss = addr;
	int ret;

	if (!(dev->priv_flags & IFF_LIVE_ADDR_CHANGE) && netif_running(dev))
		return -EBUSY;

	ret = ipoib_check_lladdr(dev, ss);
	if (ret)
		return ret;

	set_base_guid(priv, (union ib_gid *)(ss->__data + 4));

	queue_work(ipoib_workqueue, &priv->flush_light);

	return 0;
}

2060 2061
/*
 * sysfs "store" handler for create_child.
 *
 * Parses a P_Key from @buf, rejects values outside 1..0xffff as well as
 * 0x8000, and asks ipoib_vlan_add() to create the child interface.
 * Returns @count on success or a negative errno.
 */
static ssize_t create_child(struct device *dev,
			    struct device_attribute *attr,
			    const char *buf, size_t count)
{
	int pkey;
	int err;

	if (sscanf(buf, "%i", &pkey) != 1)
		return -EINVAL;

	if (pkey <= 0 || pkey > 0xffff || pkey == 0x8000)
		return -EINVAL;

	/*
	 * Force the full membership bit on so that the right broadcast
	 * group, etc. gets joined.
	 */
	err = ipoib_vlan_add(to_net_dev(dev), pkey | 0x8000);
	if (err)
		return err;

	return count;
}
2083
static DEVICE_ATTR(create_child, S_IWUSR, NULL, create_child);
L
Linus Torvalds 已提交
2084

2085 2086
/*
 * sysfs "store" handler for delete_child.
 *
 * Parses a P_Key from @buf, rejects values outside 0..0xffff, and asks
 * ipoib_vlan_delete() to remove the matching child interface.
 * Returns @count on success or a negative errno.
 */
static ssize_t delete_child(struct device *dev,
			    struct device_attribute *attr,
			    const char *buf, size_t count)
{
	int pkey;
	int err;

	if (sscanf(buf, "%i", &pkey) != 1)
		return -EINVAL;

	if (pkey < 0 || pkey > 0xffff)
		return -EINVAL;

	err = ipoib_vlan_delete(to_net_dev(dev), pkey);
	if (err)
		return err;

	return count;
}
2103
static DEVICE_ATTR(delete_child, S_IWUSR, NULL, delete_child);
L
Linus Torvalds 已提交
2104 2105 2106

int ipoib_add_pkey_attr(struct net_device *dev)
{
2107
	return device_create_file(&dev->dev, &dev_attr_pkey);
L
Linus Torvalds 已提交
2108 2109
}

2110
void ipoib_set_dev_features(struct ipoib_dev_priv *priv, struct ib_device *hca)
2111
{
2112
	priv->hca_caps = hca->attrs.device_cap_flags;
2113 2114

	if (priv->hca_caps & IB_DEVICE_UD_IP_CSUM) {
2115
		priv->dev->hw_features |= NETIF_F_IP_CSUM | NETIF_F_RXCSUM;
2116

2117 2118
		if (priv->hca_caps & IB_DEVICE_UD_TSO)
			priv->dev->hw_features |= NETIF_F_TSO;
O
Or Gerlitz 已提交
2119

2120 2121
		priv->dev->features |= priv->dev->hw_features;
	}
2122 2123
}

L
Linus Torvalds 已提交
2124 2125 2126 2127
/*
 * Allocate, configure and register the IPoIB net_device for one HCA port.
 *
 * @format: interface name format handed to ipoib_intf_alloc()
 * @hca:    IB device owning the port
 * @port:   port number on @hca
 *
 * The port attributes, P_Key 0 and GID 0 are queried to seed the MTU,
 * broadcast address and device address; the device is then initialized,
 * hooked up to IB events, registered, and given its sysfs attributes.
 *
 * Returns the net_device on success, or ERR_PTR(result) on failure after
 * unwinding whatever had been set up.
 */
static struct net_device *ipoib_add_port(const char *format,
					 struct ib_device *hca, u8 port)
{
	struct ipoib_dev_priv *priv;
	struct ib_port_attr attr;
	int result = -ENOMEM;

	priv = ipoib_intf_alloc(hca, port, format);
	if (!priv)
		goto alloc_mem_failed;

	SET_NETDEV_DEV(priv->dev, hca->dev.parent);
	priv->dev->dev_id = port - 1;

	result = ib_query_port(hca, port, &attr);
	if (result) {
		printk(KERN_WARNING "%s: ib_query_port %d failed\n",
		       hca->name, port);
		goto device_init_failed;
	}
	priv->max_ib_mtu = ib_mtu_enum_to_int(attr.max_mtu);

	/* MTU will be reset when mcast join happens */
	priv->dev->mtu  = IPOIB_UD_MTU(priv->max_ib_mtu);
	priv->mcast_mtu  = priv->admin_mtu = priv->dev->mtu;
	priv->dev->max_mtu = IPOIB_CM_MTU;

	priv->dev->neigh_priv_len = sizeof(struct ipoib_neigh);

	result = ib_query_pkey(hca, port, 0, &priv->pkey);
	if (result) {
		printk(KERN_WARNING "%s: ib_query_pkey port %d failed (ret = %d)\n",
		       hca->name, port, result);
		goto device_init_failed;
	}

	ipoib_set_dev_features(priv, hca);

	/*
	 * Set the full membership bit, so that we join the right
	 * broadcast group, etc.
	 */
	priv->pkey |= 0x8000;

	priv->dev->broadcast[8] = priv->pkey >> 8;
	priv->dev->broadcast[9] = priv->pkey & 0xff;

	result = ib_query_gid(hca, port, 0, &priv->local_gid, NULL);
	if (result) {
		printk(KERN_WARNING "%s: ib_query_gid port %d failed (ret = %d)\n",
		       hca->name, port, result);
		goto device_init_failed;
	}
	memcpy(priv->dev->dev_addr + 4, priv->local_gid.raw, sizeof (union ib_gid));
	set_bit(IPOIB_FLAG_DEV_ADDR_SET, &priv->flags);

	result = ipoib_dev_init(priv->dev, hca, port);
	if (result < 0) {
		printk(KERN_WARNING "%s: failed to initialize port %d (ret = %d)\n",
		       hca->name, port, result);
		goto device_init_failed;
	}

	INIT_IB_EVENT_HANDLER(&priv->event_handler,
			      priv->ca, ipoib_event);
	result = ib_register_event_handler(&priv->event_handler);
	if (result < 0) {
		printk(KERN_WARNING "%s: ib_register_event_handler failed for "
		       "port %d (ret = %d)\n",
		       hca->name, port, result);
		goto event_failed;
	}

	result = register_netdev(priv->dev);
	if (result) {
		printk(KERN_WARNING "%s: couldn't register ipoib port %d; error %d\n",
		       hca->name, port, result);
		goto register_failed;
	}

	/*
	 * register_netdev() succeeded, so result is 0 here.  Give the sysfs
	 * failure paths a real error code: returning ERR_PTR(0) would hand
	 * the caller a NULL pointer that passes an IS_ERR() check.
	 */
	result = -ENOMEM;
	if (ipoib_cm_add_mode_attr(priv->dev))
		goto sysfs_failed;
	if (ipoib_add_pkey_attr(priv->dev))
		goto sysfs_failed;
	if (ipoib_add_umcast_attr(priv->dev))
		goto sysfs_failed;
	if (device_create_file(&priv->dev->dev, &dev_attr_create_child))
		goto sysfs_failed;
	if (device_create_file(&priv->dev->dev, &dev_attr_delete_child))
		goto sysfs_failed;

	return priv->dev;

sysfs_failed:
	unregister_netdev(priv->dev);

register_failed:
	ib_unregister_event_handler(&priv->event_handler);
	flush_workqueue(ipoib_workqueue);
	/* Stop GC if started before flush */
	set_bit(IPOIB_STOP_NEIGH_GC, &priv->flags);
	cancel_delayed_work(&priv->neigh_reap_task);
	flush_workqueue(priv->wq);

event_failed:
	ipoib_dev_cleanup(priv->dev);

device_init_failed:
	free_netdev(priv->dev);
	/*
	 * ipoib_remove_one() releases priv with kfree() after free_netdev();
	 * do the same here so the error path does not leak it.
	 */
	kfree(priv);

alloc_mem_failed:
	return ERR_PTR(result);
}

static void ipoib_add_one(struct ib_device *device)
{
	struct list_head *dev_list;
	struct net_device *dev;
	struct ipoib_dev_priv *priv;
2244
	int p;
M
Michael Wang 已提交
2245
	int count = 0;
T
Tom Tucker 已提交
2246

L
Linus Torvalds 已提交
2247 2248 2249 2250 2251 2252
	dev_list = kmalloc(sizeof *dev_list, GFP_KERNEL);
	if (!dev_list)
		return;

	INIT_LIST_HEAD(dev_list);

2253
	for (p = rdma_start_port(device); p <= rdma_end_port(device); ++p) {
M
Michael Wang 已提交
2254
		if (!rdma_protocol_ib(device, p))
E
Eli Cohen 已提交
2255
			continue;
L
Linus Torvalds 已提交
2256 2257
		dev = ipoib_add_port("ib%d", device, p);
		if (!IS_ERR(dev)) {
2258
			priv = ipoib_priv(dev);
L
Linus Torvalds 已提交
2259
			list_add_tail(&priv->list, dev_list);
M
Michael Wang 已提交
2260
			count++;
L
Linus Torvalds 已提交
2261 2262 2263
		}
	}

M
Michael Wang 已提交
2264 2265 2266 2267 2268
	if (!count) {
		kfree(dev_list);
		return;
	}

L
Linus Torvalds 已提交
2269 2270 2271
	ib_set_client_data(device, &ipoib_client, dev_list);
}

2272
static void ipoib_remove_one(struct ib_device *device, void *client_data)
L
Linus Torvalds 已提交
2273 2274
{
	struct ipoib_dev_priv *priv, *tmp;
2275
	struct list_head *dev_list = client_data;
L
Linus Torvalds 已提交
2276

2277 2278
	if (!dev_list)
		return;
L
Linus Torvalds 已提交
2279 2280 2281

	list_for_each_entry_safe(priv, tmp, dev_list, list) {
		ib_unregister_event_handler(&priv->event_handler);
2282
		flush_workqueue(ipoib_workqueue);
2283

2284 2285 2286
		/* mark interface in the middle of destruction */
		set_bit(IPOIB_FLAG_GOING_DOWN, &priv->flags);

2287 2288 2289 2290
		rtnl_lock();
		dev_change_flags(priv->dev, priv->dev->flags & ~IFF_UP);
		rtnl_unlock();

2291 2292 2293
		/* Stop GC */
		set_bit(IPOIB_STOP_NEIGH_GC, &priv->flags);
		cancel_delayed_work(&priv->neigh_reap_task);
2294
		flush_workqueue(priv->wq);
L
Linus Torvalds 已提交
2295 2296 2297

		unregister_netdev(priv->dev);
		free_netdev(priv->dev);
2298
		kfree(priv);
L
Linus Torvalds 已提交
2299
	}
2300 2301

	kfree(dev_list);
L
Linus Torvalds 已提交
2302 2303
}

2304 2305 2306 2307 2308 2309
/*
 * Netdevice notifier that dispatches to ipoib_netdev_event().  Only built
 * when CONFIG_INFINIBAND_IPOIB_DEBUG is enabled.
 */
#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
static struct notifier_block ipoib_netdev_notifier = {
	.notifier_call = ipoib_netdev_event,
};
#endif

L
Linus Torvalds 已提交
2310 2311 2312 2313
/*
 * Module entry point.
 *
 * Normalizes the queue-size module parameters, then brings up debugfs,
 * the global flush workqueue, the SA client, the IB client and netlink
 * support in that order.  On failure, everything set up so far is undone
 * in reverse and the error is returned.
 */
static int __init ipoib_init_module(void)
{
	int err;

	/*
	 * Receive queue: round up to a power of two, cap at the maximum,
	 * then raise to at least the minimum.
	 */
	ipoib_recvq_size = roundup_pow_of_two(ipoib_recvq_size);
	ipoib_recvq_size = min(ipoib_recvq_size, IPOIB_MAX_QUEUE_SIZE);
	ipoib_recvq_size = max(ipoib_recvq_size, IPOIB_MIN_QUEUE_SIZE);

	/* Send queue: same treatment, but also at least 2 * MAX_SEND_CQE. */
	ipoib_sendq_size = roundup_pow_of_two(ipoib_sendq_size);
	ipoib_sendq_size = min(ipoib_sendq_size, IPOIB_MAX_QUEUE_SIZE);
	ipoib_sendq_size = max3(ipoib_sendq_size, 2 * MAX_SEND_CQE, IPOIB_MIN_QUEUE_SIZE);
#ifdef CONFIG_INFINIBAND_IPOIB_CM
	ipoib_max_conn_qp = min(ipoib_max_conn_qp, IPOIB_CM_MAX_CONN_QP);
#endif

	/*
	 * Small received packets are copied from the linear part of the
	 * SKB only, which requires the copybreak to fit in the head.
	 */
	BUILD_BUG_ON(IPOIB_CM_COPYBREAK > IPOIB_CM_HEAD_SIZE);

	err = ipoib_register_debugfs();
	if (err)
		return err;

	/*
	 * This global workqueue carries all flush operations.  Flushing a
	 * workqueue from a task running on that same workqueue deadlocks
	 * the system, and we need to be able to flush the tasks belonging
	 * to one specific net device.  So every netdevice also gets its
	 * own workqueue: per-device tasks go on the private queue, and
	 * only flush events go on the global one.  That avoids the
	 * deadlocks.
	 */
	ipoib_workqueue = alloc_ordered_workqueue("ipoib_flush",
						  WQ_MEM_RECLAIM);
	if (!ipoib_workqueue) {
		err = -ENOMEM;
		goto out_debugfs;
	}

	ib_sa_register_client(&ipoib_sa_client);

	err = ib_register_client(&ipoib_client);
	if (err)
		goto out_sa_client;

	err = ipoib_netlink_init();
	if (err)
		goto out_ib_client;

#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
	register_netdevice_notifier(&ipoib_netdev_notifier);
#endif
	return 0;

out_ib_client:
	ib_unregister_client(&ipoib_client);

out_sa_client:
	ib_sa_unregister_client(&ipoib_sa_client);
	destroy_workqueue(ipoib_workqueue);

out_debugfs:
	ipoib_unregister_debugfs();

	return err;
}

/*
 * Module exit point: undo what ipoib_init_module() set up -- the debug
 * netdevice notifier (if built), netlink support, the IB and SA clients,
 * debugfs, and finally the global flush workqueue.
 */
static void __exit ipoib_cleanup_module(void)
{
#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
	unregister_netdevice_notifier(&ipoib_netdev_notifier);
#endif
	ipoib_netlink_fini();

	ib_unregister_client(&ipoib_client);
	ib_sa_unregister_client(&ipoib_sa_client);

	ipoib_unregister_debugfs();
	destroy_workqueue(ipoib_workqueue);
}

/* Hook the init and cleanup routines into module load and unload. */
module_init(ipoib_init_module);
module_exit(ipoib_cleanup_module);