/*
 * Copyright (c) 2004 Topspin Communications.  All rights reserved.
 * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
 * Copyright (c) 2004 Voltaire, Inc. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "ipoib.h"

#include <linux/module.h>

#include <linux/init.h>
#include <linux/slab.h>
41
#include <linux/kernel.h>
42
#include <linux/vmalloc.h>
L
Linus Torvalds 已提交
43 44 45 46 47 48

#include <linux/if_arp.h>	/* For ARPHRD_xxx */

#include <linux/ip.h>
#include <linux/in.h>

49 50
#include <linux/jhash.h>
#include <net/arp.h>
51 52 53
#include <net/addrconf.h>
#include <linux/inetdevice.h>
#include <rdma/ib_cache.h>
54

55 56 57 58
/* Driver version string, exported as ipoib_driver_version. */
#define DRV_VERSION "1.0.0"

const char ipoib_driver_version[] = DRV_VERSION;

L
Linus Torvalds 已提交
59 60 61 62
/* Module metadata: author, description and licence. */
MODULE_AUTHOR("Roland Dreier");
MODULE_DESCRIPTION("IP-over-InfiniBand net driver");
MODULE_LICENSE("Dual BSD/GPL");

63 64 65 66 67 68 69 70
/*
 * Send and receive queue sizes.  They start at the IPOIB_TX_RING_SIZE /
 * IPOIB_RX_RING_SIZE defaults and are exposed as the send_queue_size and
 * recv_queue_size module parameters with mode 0444.
 */
int ipoib_sendq_size __read_mostly = IPOIB_TX_RING_SIZE;
int ipoib_recvq_size __read_mostly = IPOIB_RX_RING_SIZE;

module_param_named(send_queue_size, ipoib_sendq_size, int, 0444);
MODULE_PARM_DESC(send_queue_size, "Number of descriptors in send queue");
module_param_named(recv_queue_size, ipoib_recvq_size, int, 0444);
MODULE_PARM_DESC(recv_queue_size, "Number of descriptors in receive queue");

L
Linus Torvalds 已提交
71 72 73 74 75 76 77
#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
/*
 * Debug tracing level, only built with CONFIG_INFINIBAND_IPOIB_DEBUG and
 * exposed as the debug_level module parameter with mode 0644.
 */
int ipoib_debug_level;

module_param_named(debug_level, ipoib_debug_level, int, 0644);
MODULE_PARM_DESC(debug_level, "Enable debug tracing if > 0");
#endif

78 79 80 81 82
/*
 * Iterator state for the ipoib_path_iter_*() helpers: the device whose path
 * tree is walked, and a by-value copy of the path most recently visited.
 */
struct ipoib_path_iter {
	struct net_device *dev;
	struct ipoib_path  path;
};

L
Linus Torvalds 已提交
83 84 85 86 87 88 89 90
/*
 * 20-byte constant address.
 * NOTE(review): the name suggests this is the IPoIB IPv4 broadcast hardware
 * address; confirm the byte layout against its users.
 */
static const u8 ipv4_bcast_addr[] = {
	0x00, 0xff, 0xff, 0xff,
	0xff, 0x12, 0x40, 0x1b,	0x00, 0x00, 0x00, 0x00,
	0x00, 0x00, 0x00, 0x00,	0xff, 0xff, 0xff, 0xff
};

/* Driver-wide workqueue pointer; it is not assigned in this part of the file. */
struct workqueue_struct *ipoib_workqueue;

91 92
/* SA client handle passed to ib_sa_path_rec_get() by path_rec_start(). */
struct ib_sa_client ipoib_sa_client;

L
Linus Torvalds 已提交
93
static void ipoib_add_one(struct ib_device *device);
94
static void ipoib_remove_one(struct ib_device *device, void *client_data);
95
static void ipoib_neigh_reclaim(struct rcu_head *rp);
96 97 98 99
static struct net_device *ipoib_get_net_dev_by_params(
		struct ib_device *dev, u8 port, u16 pkey,
		const union ib_gid *gid, const struct sockaddr *addr,
		void *client_data);
100
static int ipoib_set_mac(struct net_device *dev, void *addr);
101 102
static int ipoib_ioctl(struct net_device *dev, struct ifreq *ifr,
		       int cmd);
L
Linus Torvalds 已提交
103 104 105 106

static struct ib_client ipoib_client = {
	.name   = "ipoib",
	.add    = ipoib_add_one,
107 108
	.remove = ipoib_remove_one,
	.get_net_dev_by_params = ipoib_get_net_dev_by_params,
L
Linus Torvalds 已提交
109 110
};

111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137
#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
/*
 * Netdevice notifier callback that creates and deletes the per-interface
 * debug files on register, rename and unregister events.
 *
 * Devices whose ndo_open is not ipoib_open are ignored.  Always returns
 * NOTIFY_DONE.
 */
static int ipoib_netdev_event(struct notifier_block *this,
			      unsigned long event, void *ptr)
{
	struct netdev_notifier_info *info = ptr;
	struct net_device *ndev = info->dev;

	if (ndev->netdev_ops->ndo_open != ipoib_open)
		return NOTIFY_DONE;

	if (event == NETDEV_REGISTER) {
		ipoib_create_debug_files(ndev);
	} else if (event == NETDEV_CHANGENAME) {
		/* Delete and recreate the debug files on rename. */
		ipoib_delete_debug_files(ndev);
		ipoib_create_debug_files(ndev);
	} else if (event == NETDEV_UNREGISTER) {
		ipoib_delete_debug_files(ndev);
	}

	return NOTIFY_DONE;
}
#endif

L
Linus Torvalds 已提交
138 139
int ipoib_open(struct net_device *dev)
{
140
	struct ipoib_dev_priv *priv = ipoib_priv(dev);
L
Linus Torvalds 已提交
141 142 143

	ipoib_dbg(priv, "bringing up interface\n");

144 145
	netif_carrier_off(dev);

146
	set_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags);
L
Linus Torvalds 已提交
147

148 149
	priv->sm_fullmember_sendonly_support = false;

150
	if (ipoib_ib_dev_open(dev)) {
151 152
		if (!test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags))
			return 0;
153
		goto err_disable;
154
	}
155

156
	ipoib_ib_dev_up(dev);
L
Linus Torvalds 已提交
157 158 159 160 161

	if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) {
		struct ipoib_dev_priv *cpriv;

		/* Bring up any child interfaces too */
162
		down_read(&priv->vlan_rwsem);
L
Linus Torvalds 已提交
163 164 165 166 167 168 169 170 171
		list_for_each_entry(cpriv, &priv->child_intfs, list) {
			int flags;

			flags = cpriv->dev->flags;
			if (flags & IFF_UP)
				continue;

			dev_change_flags(cpriv->dev, flags | IFF_UP);
		}
172
		up_read(&priv->vlan_rwsem);
L
Linus Torvalds 已提交
173 174 175 176 177
	}

	netif_start_queue(dev);

	return 0;
178 179 180 181 182

err_disable:
	clear_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags);

	return -EINVAL;
L
Linus Torvalds 已提交
183 184 185 186
}

static int ipoib_stop(struct net_device *dev)
{
187
	struct ipoib_dev_priv *priv = ipoib_priv(dev);
L
Linus Torvalds 已提交
188 189 190 191 192 193 194

	ipoib_dbg(priv, "stopping interface\n");

	clear_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags);

	netif_stop_queue(dev);

195
	ipoib_ib_dev_down(dev);
196
	ipoib_ib_dev_stop(dev);
L
Linus Torvalds 已提交
197 198 199 200 201

	if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) {
		struct ipoib_dev_priv *cpriv;

		/* Bring down any child interfaces too */
202
		down_read(&priv->vlan_rwsem);
L
Linus Torvalds 已提交
203 204 205 206 207 208 209 210 211
		list_for_each_entry(cpriv, &priv->child_intfs, list) {
			int flags;

			flags = cpriv->dev->flags;
			if (!(flags & IFF_UP))
				continue;

			dev_change_flags(cpriv->dev, flags & ~IFF_UP);
		}
212
		up_read(&priv->vlan_rwsem);
L
Linus Torvalds 已提交
213 214 215 216 217
	}

	return 0;
}

O
Or Gerlitz 已提交
218 219 220 221 222
/* Uninit hook: hands the device to ipoib_dev_cleanup(). */
static void ipoib_uninit(struct net_device *dev)
{
	ipoib_dev_cleanup(dev);
}

223
/*
 * Adjust the requested feature set: while IPOIB_FLAG_ADMIN_CM is set,
 * NETIF_F_IP_CSUM and NETIF_F_TSO are masked out.  Otherwise @features is
 * returned unchanged.
 */
static netdev_features_t ipoib_fix_features(struct net_device *dev,
					    netdev_features_t features)
{
	struct ipoib_dev_priv *priv = ipoib_priv(dev);

	if (!test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags))
		return features;

	return features & ~(NETIF_F_IP_CSUM | NETIF_F_TSO);
}

L
Linus Torvalds 已提交
233 234
static int ipoib_change_mtu(struct net_device *dev, int new_mtu)
{
235
	struct ipoib_dev_priv *priv = ipoib_priv(dev);
236
	int ret = 0;
L
Linus Torvalds 已提交
237

238
	/* dev->mtu > 2K ==> connected mode */
239 240 241 242
	if (ipoib_cm_admin_enabled(dev)) {
		if (new_mtu > ipoib_cm_max_mtu(dev))
			return -EINVAL;

243 244 245
		if (new_mtu > priv->mcast_mtu)
			ipoib_warn(priv, "mtu > %d will cause multicast packet drops.\n",
				   priv->mcast_mtu);
246

247 248 249 250
		dev->mtu = new_mtu;
		return 0;
	}

251
	if (new_mtu > IPOIB_UD_MTU(priv->max_ib_mtu))
L
Linus Torvalds 已提交
252 253 254 255
		return -EINVAL;

	priv->admin_mtu = new_mtu;

256 257 258 259
	if (priv->mcast_mtu < priv->admin_mtu)
		ipoib_dbg(priv, "MTU must be smaller than the underlying "
				"link layer MTU - 4 (%u)\n", priv->mcast_mtu);

260
	new_mtu = min(priv->mcast_mtu, priv->admin_mtu);
L
Linus Torvalds 已提交
261

262 263 264 265 266 267 268 269 270 271 272 273 274 275 276
	if (priv->rn_ops->ndo_change_mtu) {
		bool carrier_status = netif_carrier_ok(dev);

		netif_carrier_off(dev);

		/* notify lower level on the real mtu */
		ret = priv->rn_ops->ndo_change_mtu(dev, new_mtu);

		if (carrier_status)
			netif_carrier_on(dev);
	} else {
		dev->mtu = new_mtu;
	}

	return ret;
L
Linus Torvalds 已提交
277 278
}

279 280 281 282 283 284 285 286 287 288 289
/*
 * Fill @stats for @dev.  Uses the rdma_netdev ndo_get_stats64 hook when one
 * is provided, otherwise converts dev->stats.
 */
static void ipoib_get_stats(struct net_device *dev,
			    struct rtnl_link_stats64 *stats)
{
	struct ipoib_dev_priv *priv = ipoib_priv(dev);

	if (!priv->rn_ops->ndo_get_stats64) {
		netdev_stats_to_stats64(stats, &dev->stats);
		return;
	}

	priv->rn_ops->ndo_get_stats64(dev, stats);
}

290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347
/*
 * Check whether @addr is configured on @dev.
 *
 * AF_INET addresses are checked with inet_confirm_addr() at host scope,
 * AF_INET6 addresses with ipv6_chk_addr() (only when CONFIG_IPV6 is
 * enabled).  Any other address family never matches.
 *
 * Called with an RCU read lock taken.
 */
static bool ipoib_is_dev_match_addr_rcu(const struct sockaddr *addr,
					struct net_device *dev)
{
	struct net *net = dev_net(dev);

	if (addr->sa_family == AF_INET) {
		struct sockaddr_in *sin = (struct sockaddr_in *)addr;
		struct in_device *in_dev = in_dev_get(dev);
		__be32 confirmed;

		if (!in_dev)
			return false;

		confirmed = inet_confirm_addr(net, in_dev, 0,
					      sin->sin_addr.s_addr,
					      RT_SCOPE_HOST);
		in_dev_put(in_dev);

		return confirmed ? true : false;
	}

	if (addr->sa_family == AF_INET6) {
		struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)addr;

		if (IS_ENABLED(CONFIG_IPV6) &&
		    ipv6_chk_addr(net, &sin6->sin6_addr, dev, 1))
			return true;
	}

	return false;
}

/**
 * ipoib_get_master_net_dev() - find the master net_device above @dev
 * @dev: base IPoIB net_device
 *
 * Return: the master net_device, or @dev itself if no master exists.  In
 * both cases a reference is held on the returned device.
 */
static struct net_device *ipoib_get_master_net_dev(struct net_device *dev)
{
	struct net_device *found;

	rcu_read_lock();
	found = netdev_master_upper_dev_get_rcu(dev);
	if (found)
		dev_hold(found);
	rcu_read_unlock();

	if (!found) {
		/* No master: fall back to the device itself. */
		dev_hold(dev);
		found = dev;
	}

	return found;
}

D
David Ahern 已提交
348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366
/*
 * Context handed to ipoib_upper_walk(): the address being looked for and,
 * once a device matches, that device (with a reference held).
 */
struct ipoib_walk_data {
	const struct sockaddr *addr;
	struct net_device *result;
};

/*
 * Upper-device walk callback.  If @upper owns the address in the walk
 * context, take a reference on it, record it as the result and return 1;
 * otherwise return 0.
 * NOTE(review): a non-zero return presumably stops the walk; confirm
 * against netdev_walk_all_upper_dev_rcu().
 */
static int ipoib_upper_walk(struct net_device *upper, void *_data)
{
	struct ipoib_walk_data *walk = _data;

	if (!ipoib_is_dev_match_addr_rcu(walk->addr, upper))
		return 0;

	dev_hold(upper);
	walk->result = upper;

	return 1;
}

367 368 369 370 371 372 373 374 375 376 377 378
/**
 * Find a net_device matching the given address, which is an upper device of
 * the given net_device.
 * @addr: IP address to look for.
 * @dev: base IPoIB net_device
 *
 * If found, returns the net_device with a reference held. Otherwise return
 * NULL.
 */
static struct net_device *ipoib_get_net_dev_match_addr(
		const struct sockaddr *addr, struct net_device *dev)
{
D
David Ahern 已提交
379 380 381
	struct ipoib_walk_data data = {
		.addr = addr,
	};
382 383 384 385

	rcu_read_lock();
	if (ipoib_is_dev_match_addr_rcu(addr, dev)) {
		dev_hold(dev);
D
David Ahern 已提交
386
		data.result = dev;
387 388 389
		goto out;
	}

D
David Ahern 已提交
390
	netdev_walk_all_upper_dev_rcu(dev, ipoib_upper_walk, &data);
391 392
out:
	rcu_read_unlock();
D
David Ahern 已提交
393
	return data.result;
394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521
}

/*
 * Count the IPoIB netdevs at or below @priv that match @pkey_index, @gid
 * (when non-NULL) and @addr (when non-NULL).  Recurses into child
 * interfaces and stops descending once more than one match is found.
 *
 * @nesting is the lockdep subclass used for priv->vlan_rwsem at this level.
 *
 * @found_net_dev: receives the first matching net_device, with a reference
 * held, if the return value is >= 1.  References taken on further matches
 * are dropped again.
 */
static int ipoib_match_gid_pkey_addr(struct ipoib_dev_priv *priv,
				     const union ib_gid *gid,
				     u16 pkey_index,
				     const struct sockaddr *addr,
				     int nesting,
				     struct net_device **found_net_dev)
{
	struct ipoib_dev_priv *child;
	struct net_device *candidate = NULL;
	int count = 0;

	if (priv->pkey_index == pkey_index &&
	    (!gid || !memcmp(gid, &priv->local_gid, sizeof(*gid)))) {
		/*
		 * With an address, verify the net_device matches it, as
		 * IPoIB child devices currently share a GID.
		 */
		if (addr)
			candidate = ipoib_get_net_dev_match_addr(addr,
								 priv->dev);
		else
			candidate = ipoib_get_master_net_dev(priv->dev);

		if (candidate) {
			if (*found_net_dev)
				dev_put(candidate);
			else
				*found_net_dev = candidate;
			++count;
		}
	}

	/* Check child interfaces */
	down_read_nested(&priv->vlan_rwsem, nesting);
	list_for_each_entry(child, &priv->child_intfs, list) {
		count += ipoib_match_gid_pkey_addr(child, gid, pkey_index,
						   addr, nesting + 1,
						   found_net_dev);
		if (count > 1)
			break;
	}
	up_read(&priv->vlan_rwsem);

	return count;
}

/*
 * Search every entry of @dev_list on @port for net_devices matching the
 * given P_Key index, GID and address.
 *
 * Returns the number of matching net_devs found; the search stops once
 * more than one has been seen.  *@net_dev is set to NULL first and, if the
 * number of matches is >= 1, to the matching net_device with a reference
 * held.
 */
static int __ipoib_get_net_dev_by_params(struct list_head *dev_list, u8 port,
					 u16 pkey_index,
					 const union ib_gid *gid,
					 const struct sockaddr *addr,
					 struct net_device **net_dev)
{
	struct ipoib_dev_priv *entry;
	int total = 0;

	*net_dev = NULL;

	list_for_each_entry(entry, dev_list, list) {
		if (entry->port == port) {
			total += ipoib_match_gid_pkey_addr(entry, gid,
							   pkey_index, addr,
							   0, net_dev);
			if (total > 1)
				break;
		}
	}

	return total;
}

/*
 * ib_client lookup callback: find the IPoIB net_device for an
 * (IB device, port, P_Key, GID, address) tuple.
 *
 * The first pass matches on port, P_Key and GID only.  If that is
 * ambiguous, the reference from the first pass is dropped and the search is
 * repeated with @addr.  If the second pass still finds more than one device,
 * a rate-limited warning is logged and the first match is returned.
 *
 * Returns the net_device with a reference held, or NULL when the port is
 * not an IB port, the P_Key is not in the cache, @client_data is NULL, or
 * nothing matches.
 */
static struct net_device *ipoib_get_net_dev_by_params(
		struct ib_device *dev, u8 port, u16 pkey,
		const union ib_gid *gid, const struct sockaddr *addr,
		void *client_data)
{
	struct list_head *dev_list = client_data;
	struct net_device *net_dev;
	u16 pkey_index;
	int found;

	if (!rdma_protocol_ib(dev, port))
		return NULL;

	if (ib_find_cached_pkey(dev, port, pkey, &pkey_index))
		return NULL;

	if (!dev_list)
		return NULL;

	/* See if we can find a unique device matching the L2 parameters */
	found = __ipoib_get_net_dev_by_params(dev_list, port, pkey_index,
					      gid, NULL, &net_dev);
	if (found == 0)
		return NULL;
	if (found == 1)
		return net_dev;

	dev_put(net_dev);

	/*
	 * Couldn't find a unique device with L2 parameters only. Use L3
	 * address to uniquely match the net device
	 */
	found = __ipoib_get_net_dev_by_params(dev_list, port, pkey_index,
					      gid, addr, &net_dev);
	if (found == 0)
		return NULL;

	if (found != 1)
		dev_warn_ratelimited(&dev->dev,
				     "duplicate IP address detected\n");

	return net_dev;
}

522 523
int ipoib_set_mode(struct net_device *dev, const char *buf)
{
524
	struct ipoib_dev_priv *priv = ipoib_priv(dev);
525

526 527 528 529 530 531 532
	if ((test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags) &&
	     !strcmp(buf, "connected\n")) ||
	     (!test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags) &&
	     !strcmp(buf, "datagram\n"))) {
		return 0;
	}

533 534 535 536 537 538
	/* flush paths if we switch modes so that connections are restarted */
	if (IPOIB_CM_SUPPORTED(dev->dev_addr) && !strcmp(buf, "connected\n")) {
		set_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags);
		ipoib_warn(priv, "enabling connected mode "
			   "will cause multicast packet drops\n");
		netdev_update_features(dev);
539
		dev_set_mtu(dev, ipoib_cm_max_mtu(dev));
540
		rtnl_unlock();
C
Christoph Hellwig 已提交
541
		priv->tx_wr.wr.send_flags &= ~IB_SEND_IP_CSUM;
542 543

		ipoib_flush_paths(dev);
544
		return (!rtnl_trylock()) ? -EBUSY : 0;
545 546 547 548 549 550 551 552
	}

	if (!strcmp(buf, "datagram\n")) {
		clear_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags);
		netdev_update_features(dev);
		dev_set_mtu(dev, min(priv->mcast_mtu, dev->mtu));
		rtnl_unlock();
		ipoib_flush_paths(dev);
553
		return (!rtnl_trylock()) ? -EBUSY : 0;
554 555 556 557 558
	}

	return -EINVAL;
}

559
struct ipoib_path *__path_find(struct net_device *dev, void *gid)
L
Linus Torvalds 已提交
560
{
561
	struct ipoib_dev_priv *priv = ipoib_priv(dev);
L
Linus Torvalds 已提交
562 563 564 565 566 567 568
	struct rb_node *n = priv->path_tree.rb_node;
	struct ipoib_path *path;
	int ret;

	while (n) {
		path = rb_entry(n, struct ipoib_path, rb_node);

569
		ret = memcmp(gid, path->pathrec.dgid.raw,
L
Linus Torvalds 已提交
570 571 572 573 574 575 576 577 578 579 580 581 582 583 584
			     sizeof (union ib_gid));

		if (ret < 0)
			n = n->rb_left;
		else if (ret > 0)
			n = n->rb_right;
		else
			return path;
	}

	return NULL;
}

static int __path_add(struct net_device *dev, struct ipoib_path *path)
{
585
	struct ipoib_dev_priv *priv = ipoib_priv(dev);
L
Linus Torvalds 已提交
586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619
	struct rb_node **n = &priv->path_tree.rb_node;
	struct rb_node *pn = NULL;
	struct ipoib_path *tpath;
	int ret;

	while (*n) {
		pn = *n;
		tpath = rb_entry(pn, struct ipoib_path, rb_node);

		ret = memcmp(path->pathrec.dgid.raw, tpath->pathrec.dgid.raw,
			     sizeof (union ib_gid));
		if (ret < 0)
			n = &pn->rb_left;
		else if (ret > 0)
			n = &pn->rb_right;
		else
			return -EEXIST;
	}

	rb_link_node(&path->rb_node, pn, n);
	rb_insert_color(&path->rb_node, &priv->path_tree);

	list_add_tail(&path->list, &priv->path_list);

	return 0;
}

static void path_free(struct net_device *dev, struct ipoib_path *path)
{
	struct sk_buff *skb;

	while ((skb = __skb_dequeue(&path->queue)))
		dev_kfree_skb_irq(skb);

620
	ipoib_dbg(ipoib_priv(dev), "path_free\n");
L
Linus Torvalds 已提交
621

622 623
	/* remove all neigh connected to this path */
	ipoib_del_neighs_by_gid(dev, path->pathrec.dgid.raw);
L
Linus Torvalds 已提交
624 625 626 627 628 629 630

	if (path->ah)
		ipoib_put_ah(path->ah);

	kfree(path);
}

631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653
#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG

struct ipoib_path_iter *ipoib_path_iter_init(struct net_device *dev)
{
	struct ipoib_path_iter *iter;

	iter = kmalloc(sizeof *iter, GFP_KERNEL);
	if (!iter)
		return NULL;

	iter->dev = dev;
	memset(iter->path.pathrec.dgid.raw, 0, 16);

	if (ipoib_path_iter_next(iter)) {
		kfree(iter);
		return NULL;
	}

	return iter;
}

int ipoib_path_iter_next(struct ipoib_path_iter *iter)
{
654
	struct ipoib_dev_priv *priv = ipoib_priv(iter->dev);
655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688
	struct rb_node *n;
	struct ipoib_path *path;
	int ret = 1;

	spin_lock_irq(&priv->lock);

	n = rb_first(&priv->path_tree);

	while (n) {
		path = rb_entry(n, struct ipoib_path, rb_node);

		if (memcmp(iter->path.pathrec.dgid.raw, path->pathrec.dgid.raw,
			   sizeof (union ib_gid)) < 0) {
			iter->path = *path;
			ret = 0;
			break;
		}

		n = rb_next(n);
	}

	spin_unlock_irq(&priv->lock);

	return ret;
}

/* Copy the path currently held by @iter into @path (by value). */
void ipoib_path_iter_read(struct ipoib_path_iter *iter,
			  struct ipoib_path *path)
{
	*path = iter->path;
}

#endif /* CONFIG_INFINIBAND_IPOIB_DEBUG */

689 690
void ipoib_mark_paths_invalid(struct net_device *dev)
{
691
	struct ipoib_dev_priv *priv = ipoib_priv(dev);
692 693 694 695 696
	struct ipoib_path *path, *tp;

	spin_lock_irq(&priv->lock);

	list_for_each_entry_safe(path, tp, &priv->path_list, list) {
697 698 699
		ipoib_dbg(priv, "mark path LID 0x%08x GID %pI6 invalid\n",
			  be32_to_cpu(sa_path_get_dlid(&path->pathrec)),
			  path->pathrec.dgid.raw);
700 701 702 703 704 705
		path->valid =  0;
	}

	spin_unlock_irq(&priv->lock);
}

706 707 708 709
static void push_pseudo_header(struct sk_buff *skb, const char *daddr)
{
	struct ipoib_pseudo_header *phdr;

710
	phdr = skb_push(skb, sizeof(*phdr));
711 712 713
	memcpy(phdr->hwaddr, daddr, INFINIBAND_ALEN);
}

L
Linus Torvalds 已提交
714 715
void ipoib_flush_paths(struct net_device *dev)
{
716
	struct ipoib_dev_priv *priv = ipoib_priv(dev);
L
Linus Torvalds 已提交
717 718
	struct ipoib_path *path, *tp;
	LIST_HEAD(remove_list);
719
	unsigned long flags;
L
Linus Torvalds 已提交
720

721 722
	netif_tx_lock_bh(dev);
	spin_lock_irqsave(&priv->lock, flags);
L
Linus Torvalds 已提交
723

724
	list_splice_init(&priv->path_list, &remove_list);
L
Linus Torvalds 已提交
725 726 727 728 729 730 731

	list_for_each_entry(path, &remove_list, list)
		rb_erase(&path->rb_node, &priv->path_tree);

	list_for_each_entry_safe(path, tp, &remove_list, list) {
		if (path->query)
			ib_sa_cancel_query(path->query_id, path->query);
732 733
		spin_unlock_irqrestore(&priv->lock, flags);
		netif_tx_unlock_bh(dev);
L
Linus Torvalds 已提交
734 735
		wait_for_completion(&path->done);
		path_free(dev, path);
736 737
		netif_tx_lock_bh(dev);
		spin_lock_irqsave(&priv->lock, flags);
L
Linus Torvalds 已提交
738
	}
739 740 741

	spin_unlock_irqrestore(&priv->lock, flags);
	netif_tx_unlock_bh(dev);
L
Linus Torvalds 已提交
742 743 744
}

static void path_rec_completion(int status,
745
				struct sa_path_rec *pathrec,
L
Linus Torvalds 已提交
746 747 748 749
				void *path_ptr)
{
	struct ipoib_path *path = path_ptr;
	struct net_device *dev = path->dev;
750
	struct ipoib_dev_priv *priv = ipoib_priv(dev);
L
Linus Torvalds 已提交
751
	struct ipoib_ah *ah = NULL;
752
	struct ipoib_ah *old_ah = NULL;
753
	struct ipoib_neigh *neigh, *tn;
L
Linus Torvalds 已提交
754 755 756 757
	struct sk_buff_head skqueue;
	struct sk_buff *skb;
	unsigned long flags;

758
	if (!status)
H
Harvey Harrison 已提交
759
		ipoib_dbg(priv, "PathRec LID 0x%04x for GID %pI6\n",
760
			  be32_to_cpu(sa_path_get_dlid(pathrec)),
761
			  pathrec->dgid.raw);
L
Linus Torvalds 已提交
762
	else
H
Harvey Harrison 已提交
763
		ipoib_dbg(priv, "PathRec status %d for GID %pI6\n",
764
			  status, path->pathrec.dgid.raw);
L
Linus Torvalds 已提交
765 766 767 768

	skb_queue_head_init(&skqueue);

	if (!status) {
769
		struct rdma_ah_attr av;
770 771 772

		if (!ib_init_ah_from_path(priv->ca, priv->port, pathrec, &av))
			ah = ipoib_create_ah(dev, priv->pd, &av);
L
Linus Torvalds 已提交
773 774 775 776
	}

	spin_lock_irqsave(&priv->lock, flags);

777
	if (!IS_ERR_OR_NULL(ah)) {
L
Linus Torvalds 已提交
778 779
		path->pathrec = *pathrec;

780 781 782
		old_ah   = path->ah;
		path->ah = ah;

L
Linus Torvalds 已提交
783
		ipoib_dbg(priv, "created address handle %p for LID 0x%04x, SL %d\n",
784
			  ah, be32_to_cpu(sa_path_get_dlid(pathrec)),
785
			  pathrec->sl);
L
Linus Torvalds 已提交
786 787 788 789

		while ((skb = __skb_dequeue(&path->queue)))
			__skb_queue_tail(&skqueue, skb);

790
		list_for_each_entry_safe(neigh, tn, &path->neigh_list, list) {
791 792 793 794 795 796 797 798 799 800 801
			if (neigh->ah) {
				WARN_ON(neigh->ah != old_ah);
				/*
				 * Dropping the ah reference inside
				 * priv->lock is safe here, because we
				 * will hold one more reference from
				 * the original value of path->ah (ie
				 * old_ah).
				 */
				ipoib_put_ah(neigh->ah);
			}
L
Linus Torvalds 已提交
802 803 804
			kref_get(&path->ah->ref);
			neigh->ah = path->ah;

805
			if (ipoib_cm_enabled(dev, neigh->daddr)) {
806 807 808 809 810
				if (!ipoib_cm_get(neigh))
					ipoib_cm_set(neigh, ipoib_cm_create_tx(dev,
									       path,
									       neigh));
				if (!ipoib_cm_get(neigh)) {
811
					ipoib_neigh_free(neigh);
812 813 814 815
					continue;
				}
			}

L
Linus Torvalds 已提交
816 817 818
			while ((skb = __skb_dequeue(&neigh->queue)))
				__skb_queue_tail(&skqueue, skb);
		}
819
		path->valid = 1;
820
	}
L
Linus Torvalds 已提交
821

822
	path->query = NULL;
L
Linus Torvalds 已提交
823 824 825 826
	complete(&path->done);

	spin_unlock_irqrestore(&priv->lock, flags);

827 828 829
	if (IS_ERR_OR_NULL(ah))
		ipoib_del_neighs_by_gid(dev, path->pathrec.dgid.raw);

830 831 832
	if (old_ah)
		ipoib_put_ah(old_ah);

L
Linus Torvalds 已提交
833
	while ((skb = __skb_dequeue(&skqueue))) {
834
		int ret;
L
Linus Torvalds 已提交
835
		skb->dev = dev;
836 837 838 839
		ret = dev_queue_xmit(skb);
		if (ret)
			ipoib_warn(priv, "%s: dev_queue_xmit failed to re-queue packet, ret:%d\n",
				   __func__, ret);
L
Linus Torvalds 已提交
840 841 842
	}
}

843
static struct ipoib_path *path_rec_create(struct net_device *dev, void *gid)
L
Linus Torvalds 已提交
844
{
845
	struct ipoib_dev_priv *priv = ipoib_priv(dev);
L
Linus Torvalds 已提交
846 847
	struct ipoib_path *path;

848 849 850
	if (!priv->broadcast)
		return NULL;

851
	path = kzalloc(sizeof *path, GFP_ATOMIC);
L
Linus Torvalds 已提交
852 853 854
	if (!path)
		return NULL;

855
	path->dev = dev;
L
Linus Torvalds 已提交
856 857 858 859 860

	skb_queue_head_init(&path->queue);

	INIT_LIST_HEAD(&path->neigh_list);

861 862 863 864
	if (rdma_cap_opa_ah(priv->ca, priv->port))
		path->pathrec.rec_type = SA_PATH_REC_TYPE_OPA;
	else
		path->pathrec.rec_type = SA_PATH_REC_TYPE_IB;
865
	memcpy(path->pathrec.dgid.raw, gid, sizeof (union ib_gid));
866 867
	path->pathrec.sgid	    = priv->local_gid;
	path->pathrec.pkey	    = cpu_to_be16(priv->pkey);
868 869
	path->pathrec.numb_path     = 1;
	path->pathrec.traffic_class = priv->broadcast->mcmember.traffic_class;
L
Linus Torvalds 已提交
870 871 872 873 874 875 876

	return path;
}

static int path_rec_start(struct net_device *dev,
			  struct ipoib_path *path)
{
877
	struct ipoib_dev_priv *priv = ipoib_priv(dev);
L
Linus Torvalds 已提交
878

H
Harvey Harrison 已提交
879
	ipoib_dbg(priv, "Start path record lookup for %pI6\n",
880
		  path->pathrec.dgid.raw);
L
Linus Torvalds 已提交
881

882 883
	init_completion(&path->done);

L
Linus Torvalds 已提交
884
	path->query_id =
885
		ib_sa_path_rec_get(&ipoib_sa_client, priv->ca, priv->port,
L
Linus Torvalds 已提交
886 887 888 889
				   &path->pathrec,
				   IB_SA_PATH_REC_DGID		|
				   IB_SA_PATH_REC_SGID		|
				   IB_SA_PATH_REC_NUMB_PATH	|
890
				   IB_SA_PATH_REC_TRAFFIC_CLASS |
L
Linus Torvalds 已提交
891 892 893 894 895
				   IB_SA_PATH_REC_PKEY,
				   1000, GFP_ATOMIC,
				   path_rec_completion,
				   path, &path->query);
	if (path->query_id < 0) {
896
		ipoib_warn(priv, "ib_sa_path_rec_get failed: %d\n", path->query_id);
L
Linus Torvalds 已提交
897
		path->query = NULL;
898
		complete(&path->done);
L
Linus Torvalds 已提交
899 900 901 902 903 904
		return path->query_id;
	}

	return 0;
}

905 906
/*
 * Transmit @skb to @daddr when no neighbour entry exists yet.
 *
 * A neighbour is allocated and attached to the path for the destination
 * GID (@daddr + 4), creating the path if necessary.  Then:
 *  - path has an address handle and connected mode is not enabled for the
 *    neighbour: the skb is sent at once through the rdma_netdev send hook;
 *  - path has an address handle and connected mode is enabled: a CM TX
 *    context is created if missing and the skb is queued on the neighbour;
 *  - path has no address handle: a path record query is started if none is
 *    pending and the skb is queued on the neighbour.
 *
 * The skb is dropped (tx_dropped incremented) on allocation failure, query
 * start failure, CM TX creation failure, or when the neighbour queue
 * already holds IPOIB_MAX_PATH_REC_QUEUE packets.
 */
static void neigh_add_path(struct sk_buff *skb, u8 *daddr,
			   struct net_device *dev)
{
	struct ipoib_dev_priv *priv = ipoib_priv(dev);
	struct rdma_netdev *rn = netdev_priv(dev);
	struct ipoib_path *path;
	struct ipoib_neigh *neigh;
	unsigned long flags;

	spin_lock_irqsave(&priv->lock, flags);
	neigh = ipoib_neigh_alloc(daddr, dev);
	if (!neigh) {
		spin_unlock_irqrestore(&priv->lock, flags);
		++dev->stats.tx_dropped;
		dev_kfree_skb_any(skb);
		return;
	}

	path = __path_find(dev, daddr + 4);
	if (!path) {
		path = path_rec_create(dev, daddr + 4);
		if (!path)
			goto err_path;

		__path_add(dev, path);
	}

	list_add_tail(&neigh->list, &path->neigh_list);

	if (!path->ah) {
		/* No address handle yet: park the skb until the query ends. */
		neigh->ah = NULL;

		if (!path->query && path_rec_start(dev, path))
			goto err_path;
		if (skb_queue_len(&neigh->queue) >= IPOIB_MAX_PATH_REC_QUEUE)
			goto err_drop;

		push_pseudo_header(skb, neigh->daddr);
		__skb_queue_tail(&neigh->queue, skb);
		goto out_unlock;
	}

	kref_get(&path->ah->ref);
	neigh->ah = path->ah;

	if (!ipoib_cm_enabled(dev, neigh->daddr)) {
		/* Send right away through the rdma_netdev send hook. */
		spin_unlock_irqrestore(&priv->lock, flags);
		path->ah->last_send = rn->send(dev, skb, path->ah->ah,
					       IPOIB_QPN(daddr));
		ipoib_neigh_put(neigh);
		return;
	}

	if (!ipoib_cm_get(neigh))
		ipoib_cm_set(neigh, ipoib_cm_create_tx(dev, path, neigh));
	if (!ipoib_cm_get(neigh)) {
		ipoib_neigh_free(neigh);
		goto err_drop;
	}

	if (skb_queue_len(&neigh->queue) >= IPOIB_MAX_PATH_REC_QUEUE) {
		ipoib_warn(priv, "queue length limit %d. Packet drop.\n",
			   skb_queue_len(&neigh->queue));
		goto err_drop;
	}

	push_pseudo_header(skb, neigh->daddr);
	__skb_queue_tail(&neigh->queue, skb);

out_unlock:
	spin_unlock_irqrestore(&priv->lock, flags);
	ipoib_neigh_put(neigh);
	return;

err_path:
	ipoib_neigh_free(neigh);
err_drop:
	++dev->stats.tx_dropped;
	dev_kfree_skb_any(skb);

	spin_unlock_irqrestore(&priv->lock, flags);
	ipoib_neigh_put(neigh);
}

static void unicast_arp_send(struct sk_buff *skb, struct net_device *dev,
989
			     struct ipoib_pseudo_header *phdr)
L
Linus Torvalds 已提交
990
{
991
	struct ipoib_dev_priv *priv = ipoib_priv(dev);
992
	struct rdma_netdev *rn = netdev_priv(dev);
L
Linus Torvalds 已提交
993
	struct ipoib_path *path;
994
	unsigned long flags;
L
Linus Torvalds 已提交
995

996
	spin_lock_irqsave(&priv->lock, flags);
L
Linus Torvalds 已提交
997

998
	path = __path_find(dev, phdr->hwaddr + 4);
999
	if (!path || !path->valid) {
1000 1001 1002
		int new_path = 0;

		if (!path) {
1003
			path = path_rec_create(dev, phdr->hwaddr + 4);
1004 1005
			new_path = 1;
		}
L
Linus Torvalds 已提交
1006
		if (path) {
1007
			if (skb_queue_len(&path->queue) < IPOIB_MAX_PATH_REC_QUEUE) {
1008
				push_pseudo_header(skb, phdr->hwaddr);
1009 1010 1011 1012 1013
				__skb_queue_tail(&path->queue, skb);
			} else {
				++dev->stats.tx_dropped;
				dev_kfree_skb_any(skb);
			}
L
Linus Torvalds 已提交
1014

1015
			if (!path->query && path_rec_start(dev, path)) {
1016
				spin_unlock_irqrestore(&priv->lock, flags);
1017 1018
				if (new_path)
					path_free(dev, path);
L
Linus Torvalds 已提交
1019 1020 1021 1022
				return;
			} else
				__path_add(dev, path);
		} else {
1023
			++dev->stats.tx_dropped;
L
Linus Torvalds 已提交
1024 1025 1026
			dev_kfree_skb_any(skb);
		}

1027
		spin_unlock_irqrestore(&priv->lock, flags);
L
Linus Torvalds 已提交
1028 1029 1030
		return;
	}

1031
	if (path->ah) {
1032 1033
		ipoib_dbg(priv, "Send unicast ARP to %08x\n",
			  be32_to_cpu(sa_path_get_dlid(&path->pathrec)));
L
Linus Torvalds 已提交
1034

1035
		spin_unlock_irqrestore(&priv->lock, flags);
1036 1037
		path->ah->last_send = rn->send(dev, skb, path->ah->ah,
					       IPOIB_QPN(phdr->hwaddr));
1038
		return;
L
Linus Torvalds 已提交
1039 1040
	} else if ((path->query || !path_rec_start(dev, path)) &&
		   skb_queue_len(&path->queue) < IPOIB_MAX_PATH_REC_QUEUE) {
1041
		push_pseudo_header(skb, phdr->hwaddr);
L
Linus Torvalds 已提交
1042 1043
		__skb_queue_tail(&path->queue, skb);
	} else {
1044
		++dev->stats.tx_dropped;
L
Linus Torvalds 已提交
1045 1046 1047
		dev_kfree_skb_any(skb);
	}

1048
	spin_unlock_irqrestore(&priv->lock, flags);
L
Linus Torvalds 已提交
1049 1050 1051 1052
}

static int ipoib_start_xmit(struct sk_buff *skb, struct net_device *dev)
{
1053
	struct ipoib_dev_priv *priv = ipoib_priv(dev);
1054
	struct rdma_netdev *rn = netdev_priv(dev);
L
Linus Torvalds 已提交
1055
	struct ipoib_neigh *neigh;
1056
	struct ipoib_pseudo_header *phdr;
1057
	struct ipoib_header *header;
L
Linus Torvalds 已提交
1058 1059
	unsigned long flags;

1060 1061
	phdr = (struct ipoib_pseudo_header *) skb->data;
	skb_pull(skb, sizeof(*phdr));
1062 1063
	header = (struct ipoib_header *) skb->data;

1064
	if (unlikely(phdr->hwaddr[4] == 0xff)) {
1065 1066 1067 1068
		/* multicast, arrange "if" according to probability */
		if ((header->proto != htons(ETH_P_IP)) &&
		    (header->proto != htons(ETH_P_IPV6)) &&
		    (header->proto != htons(ETH_P_ARP)) &&
1069 1070
		    (header->proto != htons(ETH_P_RARP)) &&
		    (header->proto != htons(ETH_P_TIPC))) {
1071
			/* ethertype not supported by IPoIB */
1072 1073
			++dev->stats.tx_dropped;
			dev_kfree_skb_any(skb);
1074
			return NETDEV_TX_OK;
1075
		}
1076
		/* Add in the P_Key for multicast*/
1077 1078
		phdr->hwaddr[8] = (priv->pkey >> 8) & 0xff;
		phdr->hwaddr[9] = priv->pkey & 0xff;
1079

1080
		neigh = ipoib_neigh_get(dev, phdr->hwaddr);
1081 1082
		if (likely(neigh))
			goto send_using_neigh;
1083
		ipoib_mcast_send(dev, phdr->hwaddr, skb);
1084
		return NETDEV_TX_OK;
1085
	}
L
Linus Torvalds 已提交
1086

1087 1088 1089 1090
	/* unicast, arrange "switch" according to probability */
	switch (header->proto) {
	case htons(ETH_P_IP):
	case htons(ETH_P_IPV6):
1091
	case htons(ETH_P_TIPC):
1092
		neigh = ipoib_neigh_get(dev, phdr->hwaddr);
1093
		if (unlikely(!neigh)) {
1094
			neigh_add_path(skb, phdr->hwaddr, dev);
1095
			return NETDEV_TX_OK;
1096
		}
1097 1098 1099 1100
		break;
	case htons(ETH_P_ARP):
	case htons(ETH_P_RARP):
		/* for unicast ARP and RARP should always perform path find */
1101
		unicast_arp_send(skb, dev, phdr);
1102 1103 1104 1105 1106 1107 1108
		return NETDEV_TX_OK;
	default:
		/* ethertype not supported by IPoIB */
		++dev->stats.tx_dropped;
		dev_kfree_skb_any(skb);
		return NETDEV_TX_OK;
	}
1109

1110 1111 1112 1113 1114 1115
send_using_neigh:
	/* note we now hold a ref to neigh */
	if (ipoib_cm_get(neigh)) {
		if (ipoib_cm_up(neigh)) {
			ipoib_cm_send(dev, skb, ipoib_cm_get(neigh));
			goto unref;
L
Linus Torvalds 已提交
1116
		}
1117
	} else if (neigh->ah) {
1118 1119
		neigh->ah->last_send = rn->send(dev, skb, neigh->ah->ah,
						IPOIB_QPN(phdr->hwaddr));
1120 1121
		goto unref;
	}
L
Linus Torvalds 已提交
1122

1123
	if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE) {
1124
		push_pseudo_header(skb, phdr->hwaddr);
1125 1126 1127
		spin_lock_irqsave(&priv->lock, flags);
		__skb_queue_tail(&neigh->queue, skb);
		spin_unlock_irqrestore(&priv->lock, flags);
L
Linus Torvalds 已提交
1128
	} else {
1129 1130 1131
		++dev->stats.tx_dropped;
		dev_kfree_skb_any(skb);
	}
L
Linus Torvalds 已提交
1132

1133 1134
unref:
	ipoib_neigh_put(neigh);
L
Linus Torvalds 已提交
1135 1136 1137 1138 1139 1140

	return NETDEV_TX_OK;
}

static void ipoib_timeout(struct net_device *dev)
{
1141
	struct ipoib_dev_priv *priv = ipoib_priv(dev);
L
Linus Torvalds 已提交
1142

1143
	ipoib_warn(priv, "transmit timeout: latency %d msecs\n",
1144
		   jiffies_to_msecs(jiffies - dev_trans_start(dev)));
1145 1146 1147
	ipoib_warn(priv, "queue stopped %d, tx_head %u, tx_tail %u\n",
		   netif_queue_stopped(dev),
		   priv->tx_head, priv->tx_tail);
L
Linus Torvalds 已提交
1148 1149 1150 1151 1152 1153
	/* XXX reset QP, etc. */
}

static int ipoib_hard_header(struct sk_buff *skb,
			     struct net_device *dev,
			     unsigned short type,
1154
			     const void *daddr, const void *saddr, unsigned len)
L
Linus Torvalds 已提交
1155 1156 1157
{
	struct ipoib_header *header;

1158
	header = skb_push(skb, sizeof *header);
L
Linus Torvalds 已提交
1159 1160 1161 1162 1163

	header->proto = htons(type);
	header->reserved = 0;

	/*
1164
	 * we don't rely on dst_entry structure,  always stuff the
1165
	 * destination address into skb hard header so we can figure out where
1166
	 * to send the packet later.
L
Linus Torvalds 已提交
1167
	 */
1168
	push_pseudo_header(skb, daddr);
L
Linus Torvalds 已提交
1169

1170
	return IPOIB_HARD_LEN;
L
Linus Torvalds 已提交
1171 1172 1173 1174
}

static void ipoib_set_mcast_list(struct net_device *dev)
{
1175
	struct ipoib_dev_priv *priv = ipoib_priv(dev);
L
Linus Torvalds 已提交
1176

L
Leonid Arsh 已提交
1177 1178 1179 1180 1181
	if (!test_bit(IPOIB_FLAG_OPER_UP, &priv->flags)) {
		ipoib_dbg(priv, "IPOIB_FLAG_OPER_UP not set");
		return;
	}

1182
	queue_work(priv->wq, &priv->restart_task);
L
Linus Torvalds 已提交
1183 1184
}

1185 1186
static int ipoib_get_iflink(const struct net_device *dev)
{
1187
	struct ipoib_dev_priv *priv = ipoib_priv(dev);
1188

E
Erez Shitrit 已提交
1189 1190 1191 1192 1193
	/* parent interface */
	if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags))
		return dev->ifindex;

	/* child/vlan interface */
1194 1195 1196
	return priv->parent->ifindex;
}

1197
/*
 * Map a hardware address to a bucket index of @htbl.
 *
 * Only the parts of the address that contribute to spreading are hashed:
 * the QPN (octets [1:4)) and the port GUID (octets [12:20)).  The subnet
 * prefix is left out because one can not connect to the same remote port
 * (GUID) using the same remote QPN via two different subnets.
 */
static u32 ipoib_addr_hash(struct ipoib_neigh_hash *htbl, u8 *daddr)
{
	u32 *words = (u32 *) daddr;

	return jhash_3words(words[3], words[4],
			    IPOIB_QPN_MASK & words[0], 0) & htbl->mask;
}

/*
 * Look up the neigh for @daddr in the device's neighbour hash table.
 *
 * The lookup runs under rcu_read_lock_bh().  On a match one reference is
 * taken on behalf of the caller; if the refcount already reached zero the
 * entry is treated as deleted and NULL is returned.  neigh->alive is
 * refreshed only while the neigh's skb queue is below
 * IPOIB_MAX_PATH_REC_QUEUE.
 *
 * Returns the referenced neigh, or NULL if there is no table or no match.
 */
struct ipoib_neigh *ipoib_neigh_get(struct net_device *dev, u8 *daddr)
{
	struct ipoib_dev_priv *priv = ipoib_priv(dev);
	struct ipoib_neigh_table *ntbl = &priv->ntbl;
	struct ipoib_neigh_hash *htbl;
	struct ipoib_neigh *neigh = NULL;
	u32 hash_val;

	rcu_read_lock_bh();

	htbl = rcu_dereference_bh(ntbl->htbl);
	if (htbl) {
		hash_val = ipoib_addr_hash(htbl, daddr);
		neigh = rcu_dereference_bh(htbl->buckets[hash_val]);

		while (neigh != NULL) {
			if (memcmp(daddr, neigh->daddr, INFINIBAND_ALEN) != 0) {
				neigh = rcu_dereference_bh(neigh->hnext);
				continue;
			}

			/* found, take one ref on behalf of the caller */
			if (!atomic_inc_not_zero(&neigh->refcnt))
				neigh = NULL;	/* deleted */
			else if (likely(skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE))
				neigh->alive = jiffies;
			break;
		}
	}

	rcu_read_unlock_bh();
	return neigh;
}

static void __ipoib_reap_neigh(struct ipoib_dev_priv *priv)
{
	struct ipoib_neigh_table *ntbl = &priv->ntbl;
	struct ipoib_neigh_hash *htbl;
	unsigned long neigh_obsolete;
	unsigned long dt;
L
Linus Torvalds 已提交
1257
	unsigned long flags;
1258
	int i;
1259
	LIST_HEAD(remove_list);
L
Linus Torvalds 已提交
1260

1261
	if (test_bit(IPOIB_STOP_NEIGH_GC, &priv->flags))
1262
		return;
L
Linus Torvalds 已提交
1263

1264
	spin_lock_irqsave(&priv->lock, flags);
1265 1266

	htbl = rcu_dereference_protected(ntbl->htbl,
1267
					 lockdep_is_held(&priv->lock));
1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283

	if (!htbl)
		goto out_unlock;

	/* neigh is obsolete if it was idle for two GC periods */
	dt = 2 * arp_tbl.gc_interval;
	neigh_obsolete = jiffies - dt;
	/* handle possible race condition */
	if (test_bit(IPOIB_STOP_NEIGH_GC, &priv->flags))
		goto out_unlock;

	for (i = 0; i < htbl->size; i++) {
		struct ipoib_neigh *neigh;
		struct ipoib_neigh __rcu **np = &htbl->buckets[i];

		while ((neigh = rcu_dereference_protected(*np,
1284
							  lockdep_is_held(&priv->lock))) != NULL) {
1285 1286
			/* was the neigh idle for two GC periods */
			if (time_after(neigh_obsolete, neigh->alive)) {
1287

1288
				ipoib_check_and_add_mcast_sendonly(priv, neigh->daddr + 4, &remove_list);
1289

1290 1291
				rcu_assign_pointer(*np,
						   rcu_dereference_protected(neigh->hnext,
1292
									     lockdep_is_held(&priv->lock)));
1293
				/* remove from path/mc list */
1294
				list_del_init(&neigh->list);
1295 1296 1297 1298
				call_rcu(&neigh->rcu, ipoib_neigh_reclaim);
			} else {
				np = &neigh->hnext;
			}
L
Linus Torvalds 已提交
1299

1300 1301
		}
	}
L
Linus Torvalds 已提交
1302

1303
out_unlock:
1304
	spin_unlock_irqrestore(&priv->lock, flags);
1305
	ipoib_mcast_remove_list(&remove_list);
1306
}
L
Linus Torvalds 已提交
1307

1308 1309 1310 1311 1312 1313 1314 1315
static void ipoib_reap_neigh(struct work_struct *work)
{
	struct ipoib_dev_priv *priv =
		container_of(work, struct ipoib_dev_priv, neigh_reap_task.work);

	__ipoib_reap_neigh(priv);

	if (!test_bit(IPOIB_STOP_NEIGH_GC, &priv->flags))
1316
		queue_delayed_work(priv->wq, &priv->neigh_reap_task,
1317
				   arp_tbl.gc_interval);
L
Linus Torvalds 已提交
1318 1319
}

1320 1321

/*
 * Allocate (GFP_ATOMIC) and initialise a neigh for @daddr on @dev.
 *
 * The new neigh has an empty skb queue, an empty list node, no CM
 * context and a reference count of 1, which belongs to the caller.
 * It is not inserted into the hash table here.
 *
 * Returns the new neigh, or NULL if the allocation failed.
 */
static struct ipoib_neigh *ipoib_neigh_ctor(u8 *daddr,
				      struct net_device *dev)
{
	struct ipoib_neigh *neigh = kzalloc(sizeof(*neigh), GFP_ATOMIC);

	if (!neigh)
		return NULL;

	memcpy(&neigh->daddr, daddr, sizeof(neigh->daddr));
	neigh->dev = dev;
	INIT_LIST_HEAD(&neigh->list);
	skb_queue_head_init(&neigh->queue);
	ipoib_cm_set(neigh, NULL);

	/* one ref on behalf of the caller */
	atomic_set(&neigh->refcnt, 1);

	return neigh;
}

/*
 * Return a referenced neigh for @daddr, creating and hashing a new one
 * if none is found.
 *
 * The lockdep annotations expect priv->lock to be held; this function
 * does not take or release it.  The bucket is searched first because
 * another thread may already have added the entry.  A match whose
 * refcount has dropped to zero counts as deleted and a fresh neigh is
 * created instead.
 *
 * A newly created neigh carries two references: one for the caller and
 * one for the hash table.
 *
 * Returns NULL if there is no hash table or the allocation failed.
 */
struct ipoib_neigh *ipoib_neigh_alloc(u8 *daddr,
				      struct net_device *dev)
{
	struct ipoib_dev_priv *priv = ipoib_priv(dev);
	struct ipoib_neigh_table *ntbl = &priv->ntbl;
	struct ipoib_neigh_hash *htbl;
	struct ipoib_neigh *neigh;
	u32 hash_val;

	htbl = rcu_dereference_protected(ntbl->htbl,
					 lockdep_is_held(&priv->lock));
	if (!htbl)
		return NULL;

	/* need to add a new neigh, but maybe some other thread succeeded?
	 * recalc hash, maybe hash resize took place so we do a search
	 */
	hash_val = ipoib_addr_hash(htbl, daddr);
	for (neigh = rcu_dereference_protected(htbl->buckets[hash_val],
					       lockdep_is_held(&priv->lock));
	     neigh != NULL;
	     neigh = rcu_dereference_protected(neigh->hnext,
					       lockdep_is_held(&priv->lock))) {
		if (memcmp(daddr, neigh->daddr, INFINIBAND_ALEN) != 0)
			continue;

		/* found, take one ref on behalf of the caller */
		if (atomic_inc_not_zero(&neigh->refcnt)) {
			neigh->alive = jiffies;
			return neigh;
		}

		/* deleted: stop searching and build a replacement */
		break;
	}

	neigh = ipoib_neigh_ctor(daddr, dev);
	if (!neigh)
		return NULL;

	/* one ref on behalf of the hash table */
	atomic_inc(&neigh->refcnt);
	neigh->alive = jiffies;

	/* put in hash, at the head of the bucket chain */
	rcu_assign_pointer(neigh->hnext,
			   rcu_dereference_protected(htbl->buckets[hash_val],
						     lockdep_is_held(&priv->lock)));
	rcu_assign_pointer(htbl->buckets[hash_val], neigh);
	atomic_inc(&ntbl->entries);

	return neigh;
}

1397
void ipoib_neigh_dtor(struct ipoib_neigh *neigh)
1398
{
1399 1400
	/* neigh reference count was dropprd to zero */
	struct net_device *dev = neigh->dev;
1401
	struct ipoib_dev_priv *priv = ipoib_priv(dev);
1402
	struct sk_buff *skb;
1403 1404
	if (neigh->ah)
		ipoib_put_ah(neigh->ah);
1405
	while ((skb = __skb_dequeue(&neigh->queue))) {
1406
		++dev->stats.tx_dropped;
1407 1408
		dev_kfree_skb_any(skb);
	}
1409 1410
	if (ipoib_cm_get(neigh))
		ipoib_cm_destroy_tx(ipoib_cm_get(neigh));
1411
	ipoib_dbg(ipoib_priv(dev),
1412 1413 1414
		  "neigh free for %06x %pI6\n",
		  IPOIB_QPN(neigh->daddr),
		  neigh->daddr + 4);
1415
	kfree(neigh);
1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427
	if (atomic_dec_and_test(&priv->ntbl.entries)) {
		if (test_bit(IPOIB_NEIGH_TBL_FLUSH, &priv->flags))
			complete(&priv->ntbl.flushed);
	}
}

/*
 * RCU callback queued when a neigh is removed from the hash table.
 * Drops one reference on the neigh embedding @rp; the TX context may
 * still hold another one.
 */
static void ipoib_neigh_reclaim(struct rcu_head *rp)
{
	ipoib_neigh_put(container_of(rp, struct ipoib_neigh, rcu));
}

1430
void ipoib_neigh_free(struct ipoib_neigh *neigh)
L
Linus Torvalds 已提交
1431
{
1432
	struct net_device *dev = neigh->dev;
1433
	struct ipoib_dev_priv *priv = ipoib_priv(dev);
1434 1435 1436 1437 1438 1439 1440
	struct ipoib_neigh_table *ntbl = &priv->ntbl;
	struct ipoib_neigh_hash *htbl;
	struct ipoib_neigh __rcu **np;
	struct ipoib_neigh *n;
	u32 hash_val;

	htbl = rcu_dereference_protected(ntbl->htbl,
1441
					lockdep_is_held(&priv->lock));
1442
	if (!htbl)
1443
		return;
1444 1445 1446 1447

	hash_val = ipoib_addr_hash(htbl, neigh->daddr);
	np = &htbl->buckets[hash_val];
	for (n = rcu_dereference_protected(*np,
1448
					    lockdep_is_held(&priv->lock));
1449
	     n != NULL;
1450
	     n = rcu_dereference_protected(*np,
1451
					lockdep_is_held(&priv->lock))) {
1452 1453 1454 1455
		if (n == neigh) {
			/* found */
			rcu_assign_pointer(*np,
					   rcu_dereference_protected(neigh->hnext,
1456
								     lockdep_is_held(&priv->lock)));
1457
			/* remove from parent list */
1458
			list_del_init(&neigh->list);
1459
			call_rcu(&neigh->rcu, ipoib_neigh_reclaim);
1460
			return;
1461 1462 1463 1464 1465 1466 1467 1468 1469 1470
		} else {
			np = &n->hnext;
		}
	}
}

static int ipoib_neigh_hash_init(struct ipoib_dev_priv *priv)
{
	struct ipoib_neigh_table *ntbl = &priv->ntbl;
	struct ipoib_neigh_hash *htbl;
1471
	struct ipoib_neigh __rcu **buckets;
1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488
	u32 size;

	clear_bit(IPOIB_NEIGH_TBL_FLUSH, &priv->flags);
	ntbl->htbl = NULL;
	htbl = kzalloc(sizeof(*htbl), GFP_KERNEL);
	if (!htbl)
		return -ENOMEM;
	set_bit(IPOIB_STOP_NEIGH_GC, &priv->flags);
	size = roundup_pow_of_two(arp_tbl.gc_thresh3);
	buckets = kzalloc(size * sizeof(*buckets), GFP_KERNEL);
	if (!buckets) {
		kfree(htbl);
		return -ENOMEM;
	}
	htbl->size = size;
	htbl->mask = (size - 1);
	htbl->buckets = buckets;
1489
	RCU_INIT_POINTER(ntbl->htbl, htbl);
1490
	htbl->ntbl = ntbl;
1491 1492 1493 1494
	atomic_set(&ntbl->entries, 0);

	/* start garbage collection */
	clear_bit(IPOIB_STOP_NEIGH_GC, &priv->flags);
1495
	queue_delayed_work(priv->wq, &priv->neigh_reap_task,
1496
			   arp_tbl.gc_interval);
L
Linus Torvalds 已提交
1497 1498 1499 1500

	return 0;
}

1501 1502 1503 1504 1505 1506
static void neigh_hash_free_rcu(struct rcu_head *head)
{
	struct ipoib_neigh_hash *htbl = container_of(head,
						    struct ipoib_neigh_hash,
						    rcu);
	struct ipoib_neigh __rcu **buckets = htbl->buckets;
1507
	struct ipoib_neigh_table *ntbl = htbl->ntbl;
1508 1509 1510

	kfree(buckets);
	kfree(htbl);
1511
	complete(&ntbl->deleted);
1512 1513 1514 1515
}

/*
 * Remove all neighs connected to a given path or mcast.
 *
 * Under priv->lock every bucket is walked, and each neigh whose GID
 * (daddr + 4) equals @gid is unlinked from the hash chain and from its
 * parent list, with its release deferred through call_rcu().
 */
void ipoib_del_neighs_by_gid(struct net_device *dev, u8 *gid)
{
	struct ipoib_dev_priv *priv = ipoib_priv(dev);
	struct ipoib_neigh_table *ntbl = &priv->ntbl;
	struct ipoib_neigh_hash *htbl;
	unsigned long flags;
	int i;

	spin_lock_irqsave(&priv->lock, flags);

	htbl = rcu_dereference_protected(ntbl->htbl,
					 lockdep_is_held(&priv->lock));
	if (!htbl)
		goto out_unlock;

	for (i = 0; i < htbl->size; i++) {
		struct ipoib_neigh __rcu **np = &htbl->buckets[i];
		struct ipoib_neigh *neigh;

		for (;;) {
			neigh = rcu_dereference_protected(*np,
							  lockdep_is_held(&priv->lock));
			if (neigh == NULL)
				break;

			/* not ours: keep it and step to the next link */
			if (memcmp(gid, neigh->daddr + 4, sizeof (union ib_gid))) {
				np = &neigh->hnext;
				continue;
			}

			/* delete neighs belong to this parent */
			rcu_assign_pointer(*np,
					   rcu_dereference_protected(neigh->hnext,
								     lockdep_is_held(&priv->lock)));
			/* remove from parent list */
			list_del_init(&neigh->list);
			call_rcu(&neigh->rcu, ipoib_neigh_reclaim);
		}
	}

out_unlock:
	spin_unlock_irqrestore(&priv->lock, flags);
}

static void ipoib_flush_neighs(struct ipoib_dev_priv *priv)
{
	struct ipoib_neigh_table *ntbl = &priv->ntbl;
	struct ipoib_neigh_hash *htbl;
	unsigned long flags;
1560
	int i, wait_flushed = 0;
1561

1562
	init_completion(&priv->ntbl.flushed);
1563
	set_bit(IPOIB_NEIGH_TBL_FLUSH, &priv->flags);
1564

1565
	spin_lock_irqsave(&priv->lock, flags);
1566 1567

	htbl = rcu_dereference_protected(ntbl->htbl,
1568
					lockdep_is_held(&priv->lock));
1569 1570 1571
	if (!htbl)
		goto out_unlock;

1572 1573 1574 1575
	wait_flushed = atomic_read(&priv->ntbl.entries);
	if (!wait_flushed)
		goto free_htbl;

1576 1577 1578 1579 1580
	for (i = 0; i < htbl->size; i++) {
		struct ipoib_neigh *neigh;
		struct ipoib_neigh __rcu **np = &htbl->buckets[i];

		while ((neigh = rcu_dereference_protected(*np,
1581
				       lockdep_is_held(&priv->lock))) != NULL) {
1582 1583
			rcu_assign_pointer(*np,
					   rcu_dereference_protected(neigh->hnext,
1584
								     lockdep_is_held(&priv->lock)));
1585
			/* remove from path/mc list */
1586
			list_del_init(&neigh->list);
1587 1588 1589 1590
			call_rcu(&neigh->rcu, ipoib_neigh_reclaim);
		}
	}

1591
free_htbl:
1592 1593 1594 1595
	rcu_assign_pointer(ntbl->htbl, NULL);
	call_rcu(&htbl->rcu, neigh_hash_free_rcu);

out_unlock:
1596
	spin_unlock_irqrestore(&priv->lock, flags);
1597 1598
	if (wait_flushed)
		wait_for_completion(&priv->ntbl.flushed);
1599 1600 1601 1602
}

static void ipoib_neigh_hash_uninit(struct net_device *dev)
{
1603
	struct ipoib_dev_priv *priv = ipoib_priv(dev);
1604 1605 1606
	int stopped;

	ipoib_dbg(priv, "ipoib_neigh_hash_uninit\n");
1607
	init_completion(&priv->ntbl.deleted);
1608 1609 1610 1611 1612 1613

	/* Stop GC if called at init fail need to cancel work */
	stopped = test_and_set_bit(IPOIB_STOP_NEIGH_GC, &priv->flags);
	if (!stopped)
		cancel_delayed_work(&priv->neigh_reap_task);

1614 1615 1616
	ipoib_flush_neighs(priv);

	wait_for_completion(&priv->ntbl.deleted);
1617 1618
}

E
Erez Shitrit 已提交
1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634
/*
 * Register the device's two NAPI contexts: recv_napi, polled by
 * ipoib_rx_poll with weight IPOIB_NUM_WC, and send_napi, polled by
 * ipoib_tx_poll with weight MAX_SEND_CQE.
 */
static void ipoib_napi_add(struct net_device *dev)
{
	struct ipoib_dev_priv *priv = ipoib_priv(dev);

	netif_napi_add(dev, &priv->recv_napi, ipoib_rx_poll, IPOIB_NUM_WC);
	netif_napi_add(dev, &priv->send_napi, ipoib_tx_poll, MAX_SEND_CQE);
}

/*
 * Unregister the receive and send NAPI contexts added by
 * ipoib_napi_add().
 */
static void ipoib_napi_del(struct net_device *dev)
{
	struct ipoib_dev_priv *priv = ipoib_priv(dev);

	netif_napi_del(&priv->recv_napi);
	netif_napi_del(&priv->send_napi);
}

1635
static void ipoib_dev_uninit_default(struct net_device *dev)
1636
{
1637
	struct ipoib_dev_priv *priv = ipoib_priv(dev);
1638

1639 1640
	ipoib_transport_dev_cleanup(dev);

E
Erez Shitrit 已提交
1641
	ipoib_napi_del(dev);
1642

1643 1644 1645 1646 1647 1648 1649 1650 1651
	ipoib_cm_dev_cleanup(dev);

	kfree(priv->rx_ring);
	vfree(priv->tx_ring);

	priv->rx_ring = NULL;
	priv->tx_ring = NULL;
}

1652
static int ipoib_dev_init_default(struct net_device *dev)
L
Linus Torvalds 已提交
1653
{
1654
	struct ipoib_dev_priv *priv = ipoib_priv(dev);
L
Linus Torvalds 已提交
1655

E
Erez Shitrit 已提交
1656
	ipoib_napi_add(dev);
1657

L
Linus Torvalds 已提交
1658
	/* Allocate RX/TX "rings" to hold queued skbs */
1659
	priv->rx_ring =	kzalloc(ipoib_recvq_size * sizeof *priv->rx_ring,
L
Linus Torvalds 已提交
1660
				GFP_KERNEL);
1661
	if (!priv->rx_ring)
1662
		goto out;
L
Linus Torvalds 已提交
1663

1664
	priv->tx_ring = vzalloc(ipoib_sendq_size * sizeof *priv->tx_ring);
L
Linus Torvalds 已提交
1665
	if (!priv->tx_ring) {
1666 1667
		pr_warn("%s: failed to allocate TX ring (%d entries)\n",
			priv->ca->name, ipoib_sendq_size);
L
Linus Torvalds 已提交
1668 1669 1670
		goto out_rx_ring_cleanup;
	}

1671
	/* priv->tx_head, tx_tail & tx_outstanding are already 0 */
L
Linus Torvalds 已提交
1672

1673 1674 1675
	if (ipoib_transport_dev_init(dev, priv->ca)) {
		pr_warn("%s: ipoib_transport_dev_init failed\n",
			priv->ca->name);
L
Linus Torvalds 已提交
1676
		goto out_tx_ring_cleanup;
1677 1678
	}

1679 1680 1681 1682 1683
	/* after qp created set dev address */
	priv->dev->dev_addr[1] = (priv->qp->qp_num >> 16) & 0xff;
	priv->dev->dev_addr[2] = (priv->qp->qp_num >>  8) & 0xff;
	priv->dev->dev_addr[3] = (priv->qp->qp_num) & 0xff;

1684 1685 1686 1687 1688 1689 1690 1691 1692
	return 0;

out_tx_ring_cleanup:
	vfree(priv->tx_ring);

out_rx_ring_cleanup:
	kfree(priv->rx_ring);

out:
E
Erez Shitrit 已提交
1693
	ipoib_napi_del(dev);
1694 1695 1696
	return -ENOMEM;
}

1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707
/*
 * .ndo_do_ioctl: forward the request to the underlying rdma_netdev's
 * ndo_do_ioctl, or return -EOPNOTSUPP when it provides none.
 */
static int ipoib_ioctl(struct net_device *dev, struct ifreq *ifr,
		       int cmd)
{
	struct ipoib_dev_priv *priv = ipoib_priv(dev);

	if (priv->rn_ops->ndo_do_ioctl)
		return priv->rn_ops->ndo_do_ioctl(dev, ifr, cmd);

	return -EOPNOTSUPP;
}

1708 1709
int ipoib_dev_init(struct net_device *dev, struct ib_device *ca, int port)
{
1710
	struct ipoib_dev_priv *priv = ipoib_priv(dev);
1711 1712 1713 1714 1715
	int ret = -ENOMEM;

	priv->ca = ca;
	priv->port = port;
	priv->qp = NULL;
L
Linus Torvalds 已提交
1716

1717
	/*
1718 1719
	 * the various IPoIB tasks assume they will never race against
	 * themselves, so always use a single thread workqueue
1720
	 */
1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733
	priv->wq = alloc_ordered_workqueue("ipoib_wq", WQ_MEM_RECLAIM);
	if (!priv->wq) {
		pr_warn("%s: failed to allocate device WQ\n", dev->name);
		goto out;
	}

	/* create pd, which used both for control and datapath*/
	priv->pd = ib_alloc_pd(priv->ca, 0);
	if (IS_ERR(priv->pd)) {
		pr_warn("%s: failed to allocate PD\n", ca->name);
		goto clean_wq;
	}

1734
	ret = priv->rn_ops->ndo_init(dev);
1735 1736 1737 1738 1739 1740 1741
	if (ret) {
		pr_warn("%s failed to init HW resource\n", dev->name);
		goto out_free_pd;
	}

	if (ipoib_neigh_hash_init(priv) < 0) {
		pr_warn("%s failed to init neigh hash\n", dev->name);
1742
		goto out_dev_uninit;
1743 1744 1745 1746 1747 1748 1749 1750 1751
	}

	if (dev->flags & IFF_UP) {
		if (ipoib_ib_dev_open(dev)) {
			pr_warn("%s failed to open device\n", dev->name);
			ret = -ENODEV;
			goto out_dev_uninit;
		}
	}
1752

L
Linus Torvalds 已提交
1753 1754
	return 0;

1755 1756 1757
out_dev_uninit:
	ipoib_ib_dev_cleanup(dev);

1758 1759 1760 1761 1762
out_free_pd:
	if (priv->pd) {
		ib_dealloc_pd(priv->pd);
		priv->pd = NULL;
	}
L
Linus Torvalds 已提交
1763

1764 1765 1766 1767 1768
clean_wq:
	if (priv->wq) {
		destroy_workqueue(priv->wq);
		priv->wq = NULL;
	}
L
Linus Torvalds 已提交
1769 1770

out:
1771
	return ret;
L
Linus Torvalds 已提交
1772 1773 1774 1775
}

void ipoib_dev_cleanup(struct net_device *dev)
{
1776
	struct ipoib_dev_priv *priv = ipoib_priv(dev), *cpriv, *tcpriv;
O
Or Gerlitz 已提交
1777 1778 1779
	LIST_HEAD(head);

	ASSERT_RTNL();
L
Linus Torvalds 已提交
1780 1781 1782

	/* Delete any child interfaces first */
	list_for_each_entry_safe(cpriv, tcpriv, &priv->child_intfs, list) {
1783 1784 1785
		/* Stop GC on child */
		set_bit(IPOIB_STOP_NEIGH_GC, &cpriv->flags);
		cancel_delayed_work(&cpriv->neigh_reap_task);
O
Or Gerlitz 已提交
1786
		unregister_netdevice_queue(cpriv->dev, &head);
L
Linus Torvalds 已提交
1787
	}
O
Or Gerlitz 已提交
1788
	unregister_netdevice_many(&head);
L
Linus Torvalds 已提交
1789

1790 1791
	ipoib_neigh_hash_uninit(dev);

L
Linus Torvalds 已提交
1792 1793
	ipoib_ib_dev_cleanup(dev);

1794 1795 1796 1797 1798 1799
	/* no more works over the priv->wq */
	if (priv->wq) {
		flush_workqueue(priv->wq);
		destroy_workqueue(priv->wq);
		priv->wq = NULL;
	}
L
Linus Torvalds 已提交
1800 1801
}

1802 1803
static int ipoib_set_vf_link_state(struct net_device *dev, int vf, int link_state)
{
1804
	struct ipoib_dev_priv *priv = ipoib_priv(dev);
1805 1806 1807 1808 1809 1810 1811

	return ib_set_vf_link_state(priv->ca, vf, priv->port, link_state);
}

static int ipoib_get_vf_config(struct net_device *dev, int vf,
			       struct ifla_vf_info *ivf)
{
1812
	struct ipoib_dev_priv *priv = ipoib_priv(dev);
1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825
	int err;

	err = ib_get_vf_config(priv->ca, vf, priv->port, ivf);
	if (err)
		return err;

	ivf->vf = vf;

	return 0;
}

/*
 * .ndo_set_vf_guid: set a VF's node or port GUID through
 * ib_set_vf_guid().  Any @type other than IFLA_VF_IB_NODE_GUID or
 * IFLA_VF_IB_PORT_GUID is rejected with -EINVAL.
 */
static int ipoib_set_vf_guid(struct net_device *dev, int vf, u64 guid, int type)
{
	struct ipoib_dev_priv *priv = ipoib_priv(dev);

	if (type == IFLA_VF_IB_NODE_GUID || type == IFLA_VF_IB_PORT_GUID)
		return ib_set_vf_guid(priv->ca, vf, priv->port, guid, type);

	return -EINVAL;
}

static int ipoib_get_vf_stats(struct net_device *dev, int vf,
			      struct ifla_vf_stats *vf_stats)
{
1837
	struct ipoib_dev_priv *priv = ipoib_priv(dev);
1838 1839 1840 1841

	return ib_get_vf_stats(priv->ca, vf, priv->port, vf_stats);
}

1842 1843 1844 1845
/* Link-layer header operations: only header creation is provided. */
static const struct header_ops ipoib_header_ops = {
	.create	= ipoib_hard_header,
};

1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859
static const struct net_device_ops ipoib_netdev_ops_pf = {
	.ndo_uninit		 = ipoib_uninit,
	.ndo_open		 = ipoib_open,
	.ndo_stop		 = ipoib_stop,
	.ndo_change_mtu		 = ipoib_change_mtu,
	.ndo_fix_features	 = ipoib_fix_features,
	.ndo_start_xmit		 = ipoib_start_xmit,
	.ndo_tx_timeout		 = ipoib_timeout,
	.ndo_set_rx_mode	 = ipoib_set_mcast_list,
	.ndo_get_iflink		 = ipoib_get_iflink,
	.ndo_set_vf_link_state	 = ipoib_set_vf_link_state,
	.ndo_get_vf_config	 = ipoib_get_vf_config,
	.ndo_get_vf_stats	 = ipoib_get_vf_stats,
	.ndo_set_vf_guid	 = ipoib_set_vf_guid,
1860
	.ndo_set_mac_address	 = ipoib_set_mac,
1861
	.ndo_get_stats64	 = ipoib_get_stats,
1862
	.ndo_do_ioctl		 = ipoib_ioctl,
1863 1864 1865
};

static const struct net_device_ops ipoib_netdev_ops_vf = {
O
Or Gerlitz 已提交
1866
	.ndo_uninit		 = ipoib_uninit,
1867 1868 1869
	.ndo_open		 = ipoib_open,
	.ndo_stop		 = ipoib_stop,
	.ndo_change_mtu		 = ipoib_change_mtu,
1870
	.ndo_fix_features	 = ipoib_fix_features,
1871 1872
	.ndo_start_xmit	 	 = ipoib_start_xmit,
	.ndo_tx_timeout		 = ipoib_timeout,
1873
	.ndo_set_rx_mode	 = ipoib_set_mcast_list,
1874
	.ndo_get_iflink		 = ipoib_get_iflink,
1875
	.ndo_get_stats64	 = ipoib_get_stats,
1876
	.ndo_do_ioctl		 = ipoib_ioctl,
1877 1878
};

1879
void ipoib_setup_common(struct net_device *dev)
L
Linus Torvalds 已提交
1880
{
1881
	dev->header_ops		 = &ipoib_header_ops;
1882

E
Eli Cohen 已提交
1883 1884
	ipoib_set_ethtool_ops(dev);

1885
	dev->watchdog_timeo	 = HZ;
L
Linus Torvalds 已提交
1886

1887
	dev->flags		|= IFF_BROADCAST | IFF_MULTICAST;
L
Linus Torvalds 已提交
1888

1889
	dev->hard_header_len	 = IPOIB_HARD_LEN;
1890 1891 1892
	dev->addr_len		 = INFINIBAND_ALEN;
	dev->type		 = ARPHRD_INFINIBAND;
	dev->tx_queue_len	 = ipoib_sendq_size * 2;
E
Eli Cohen 已提交
1893 1894
	dev->features		 = (NETIF_F_VLAN_CHALLENGED	|
				    NETIF_F_HIGHDMA);
1895
	netif_keep_dst(dev);
L
Linus Torvalds 已提交
1896 1897

	memcpy(dev->broadcast, ipv4_bcast_addr, INFINIBAND_ALEN);
1898
}
L
Linus Torvalds 已提交
1899

1900 1901 1902
static void ipoib_build_priv(struct net_device *dev)
{
	struct ipoib_dev_priv *priv = ipoib_priv(dev);
L
Linus Torvalds 已提交
1903

1904
	priv->dev = dev;
L
Linus Torvalds 已提交
1905
	spin_lock_init(&priv->lock);
1906
	init_rwsem(&priv->vlan_rwsem);
1907
	mutex_init(&priv->mcast_mutex);
1908
	mutex_init(&priv->sysfs_mutex);
L
Linus Torvalds 已提交
1909 1910 1911 1912 1913 1914

	INIT_LIST_HEAD(&priv->path_list);
	INIT_LIST_HEAD(&priv->child_intfs);
	INIT_LIST_HEAD(&priv->dead_ahs);
	INIT_LIST_HEAD(&priv->multicast_list);

D
David Howells 已提交
1915
	INIT_DELAYED_WORK(&priv->mcast_task,   ipoib_mcast_join_task);
1916
	INIT_WORK(&priv->carrier_on_task, ipoib_mcast_carrier_on_task);
1917 1918 1919
	INIT_WORK(&priv->flush_light,   ipoib_ib_dev_flush_light);
	INIT_WORK(&priv->flush_normal,   ipoib_ib_dev_flush_normal);
	INIT_WORK(&priv->flush_heavy,   ipoib_ib_dev_flush_heavy);
D
David Howells 已提交
1920 1921
	INIT_WORK(&priv->restart_task, ipoib_mcast_restart_task);
	INIT_DELAYED_WORK(&priv->ah_reap_task, ipoib_reap_ah);
1922
	INIT_DELAYED_WORK(&priv->neigh_reap_task, ipoib_reap_neigh);
L
Linus Torvalds 已提交
1923 1924
}

1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936
/*
 * Operations of the default (software) rdma_netdev created by
 * ipoib_create_netdev_default().
 */
static const struct net_device_ops ipoib_netdev_default_pf = {
	.ndo_init		 = ipoib_dev_init_default,
	.ndo_uninit		 = ipoib_dev_uninit_default,
	.ndo_open		 = ipoib_ib_dev_open_default,
	.ndo_stop		 = ipoib_ib_dev_stop_default,
};

static struct net_device
*ipoib_create_netdev_default(struct ib_device *hca,
			     const char *name,
			     unsigned char name_assign_type,
			     void (*setup)(struct net_device *))
L
Linus Torvalds 已提交
1937 1938
{
	struct net_device *dev;
1939
	struct rdma_netdev *rn;
L
Linus Torvalds 已提交
1940

1941 1942 1943
	dev = alloc_netdev((int)sizeof(struct rdma_netdev),
			   name,
			   name_assign_type, setup);
L
Linus Torvalds 已提交
1944 1945 1946
	if (!dev)
		return NULL;

1947 1948 1949 1950 1951
	rn = netdev_priv(dev);

	rn->send = ipoib_send;
	rn->attach_mcast = ipoib_mcast_attach;
	rn->detach_mcast = ipoib_mcast_detach;
1952
	rn->free_rdma_netdev = free_netdev;
1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011
	rn->hca = hca;

	dev->netdev_ops = &ipoib_netdev_default_pf;

	return dev;
}

/*
 * Obtain the net_device for an IPoIB interface on @hca / @port.
 *
 * If the HCA provides alloc_rdma_netdev() it is tried first.  A valid
 * result is returned as is.  NULL, or any error other than -EOPNOTSUPP,
 * makes this function return NULL.  When the hook is absent or answers
 * -EOPNOTSUPP, the default software netdev is created instead.
 */
static struct net_device *ipoib_get_netdev(struct ib_device *hca, u8 port,
					   const char *name)
{
	struct net_device *dev;

	if (hca->alloc_rdma_netdev) {
		dev = hca->alloc_rdma_netdev(hca, port,
					     RDMA_NETDEV_IPOIB, name,
					     NET_NAME_UNKNOWN,
					     ipoib_setup_common);
		if (!IS_ERR_OR_NULL(dev))
			return dev;

		if (PTR_ERR(dev) != -EOPNOTSUPP)
			return NULL;
	}

	return ipoib_create_netdev_default(hca, name, NET_NAME_UNKNOWN,
					   ipoib_setup_common);
}

/*
 * Allocate an IPoIB interface: a zeroed ipoib_dev_priv plus the
 * net_device from ipoib_get_netdev().
 *
 * The netdev_ops supplied by the rdma_netdev are saved in priv->rn_ops
 * and replaced by the IPoIB PF or VF operations.  The private data is
 * then linked through rn->clnt_priv and initialised by
 * ipoib_build_priv().
 *
 * Returns the new priv, or NULL on failure.
 */
struct ipoib_dev_priv *ipoib_intf_alloc(struct ib_device *hca, u8 port,
					const char *name)
{
	struct ipoib_dev_priv *priv;
	struct net_device *dev;
	struct rdma_netdev *rn;

	priv = kzalloc(sizeof(*priv), GFP_KERNEL);
	if (!priv)
		return NULL;

	dev = ipoib_get_netdev(hca, port, name);
	if (!dev) {
		kfree(priv);
		return NULL;
	}

	priv->rn_ops = dev->netdev_ops;

	/* fixme : should be after the query_cap */
	/*
	 * NOTE(review): priv was zero-allocated just above and hca_caps is
	 * not written before this test, so the VF branch cannot be taken
	 * here.
	 */
	dev->netdev_ops = (priv->hca_caps & IB_DEVICE_VIRTUAL_FUNCTION) ?
			  &ipoib_netdev_ops_vf : &ipoib_netdev_ops_pf;

	rn = netdev_priv(dev);
	rn->clnt_priv = priv;
	ipoib_build_priv(dev);

	return priv;
}

2014 2015
static ssize_t show_pkey(struct device *dev,
			 struct device_attribute *attr, char *buf)
L
Linus Torvalds 已提交
2016
{
2017 2018
	struct net_device *ndev = to_net_dev(dev);
	struct ipoib_dev_priv *priv = ipoib_priv(ndev);
L
Linus Torvalds 已提交
2019 2020 2021

	return sprintf(buf, "0x%04x\n", priv->pkey);
}
2022
static DEVICE_ATTR(pkey, S_IRUGO, show_pkey, NULL);
L
Linus Torvalds 已提交
2023

2024 2025 2026
static ssize_t show_umcast(struct device *dev,
			   struct device_attribute *attr, char *buf)
{
2027 2028
	struct net_device *ndev = to_net_dev(dev);
	struct ipoib_dev_priv *priv = ipoib_priv(ndev);
2029 2030 2031 2032

	return sprintf(buf, "%d\n", test_bit(IPOIB_FLAG_UMCAST, &priv->flags));
}

2033
void ipoib_set_umcast(struct net_device *ndev, int umcast_val)
2034
{
2035
	struct ipoib_dev_priv *priv = ipoib_priv(ndev);
2036 2037 2038 2039 2040 2041 2042

	if (umcast_val > 0) {
		set_bit(IPOIB_FLAG_UMCAST, &priv->flags);
		ipoib_warn(priv, "ignoring multicast groups joined directly "
				"by userspace\n");
	} else
		clear_bit(IPOIB_FLAG_UMCAST, &priv->flags);
2043 2044 2045 2046 2047 2048 2049 2050 2051
}

/*
 * sysfs "umcast" write handler: parses an unsigned number from @buf
 * (base auto-detected) and enables the umcast flag for any non-zero
 * value, disabling it for zero.  Always consumes the whole write.
 */
static ssize_t set_umcast(struct device *dev,
			  struct device_attribute *attr,
			  const char *buf, size_t count)
{
	unsigned long umcast_val = simple_strtoul(buf, NULL, 0);

	/*
	 * ipoib_set_umcast() takes an int and only enables the flag for
	 * values > 0.  Collapse the parsed value to 0/1 first so that a
	 * large unsigned value cannot be narrowed to zero or to a negative
	 * int and be treated as "off".
	 */
	ipoib_set_umcast(to_net_dev(dev), umcast_val ? 1 : 0);

	return count;
}
static DEVICE_ATTR(umcast, S_IWUSR | S_IRUGO, show_umcast, set_umcast);

/*
 * Create the "umcast" sysfs attribute on @dev.
 * Returns the result of device_create_file().
 */
int ipoib_add_umcast_attr(struct net_device *dev)
{
	struct device *sysfs_dev = &dev->dev;

	return device_create_file(sysfs_dev, &dev_attr_umcast);
}

2062 2063 2064 2065 2066
static void set_base_guid(struct ipoib_dev_priv *priv, union ib_gid *gid)
{
	struct ipoib_dev_priv *child_priv;
	struct net_device *netdev = priv->dev;

2067
	netif_addr_lock_bh(netdev);
2068 2069 2070 2071 2072 2073 2074

	memcpy(&priv->local_gid.global.interface_id,
	       &gid->global.interface_id,
	       sizeof(gid->global.interface_id));
	memcpy(netdev->dev_addr + 4, &priv->local_gid, sizeof(priv->local_gid));
	clear_bit(IPOIB_FLAG_DEV_ADDR_SET, &priv->flags);

2075
	netif_addr_unlock_bh(netdev);
2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090

	if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) {
		down_read(&priv->vlan_rwsem);
		list_for_each_entry(child_priv, &priv->child_intfs, list)
			set_base_guid(child_priv, gid);
		up_read(&priv->vlan_rwsem);
	}
}

static int ipoib_check_lladdr(struct net_device *dev,
			      struct sockaddr_storage *ss)
{
	union ib_gid *gid = (union ib_gid *)(ss->__data + 4);
	int ret = 0;

2091
	netif_addr_lock_bh(dev);
2092 2093 2094 2095 2096 2097 2098 2099 2100

	/* Make sure the QPN, reserved and subnet prefix match the current
	 * lladdr, it also makes sure the lladdr is unicast.
	 */
	if (memcmp(dev->dev_addr, ss->__data,
		   4 + sizeof(gid->global.subnet_prefix)) ||
	    gid->global.interface_id == 0)
		ret = -EINVAL;

2101
	netif_addr_unlock_bh(dev);
2102 2103 2104 2105 2106 2107

	return ret;
}

static int ipoib_set_mac(struct net_device *dev, void *addr)
{
2108
	struct ipoib_dev_priv *priv = ipoib_priv(dev);
2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125
	struct sockaddr_storage *ss = addr;
	int ret;

	if (!(dev->priv_flags & IFF_LIVE_ADDR_CHANGE) && netif_running(dev))
		return -EBUSY;

	ret = ipoib_check_lladdr(dev, ss);
	if (ret)
		return ret;

	set_base_guid(priv, (union ib_gid *)(ss->__data + 4));

	queue_work(ipoib_workqueue, &priv->flush_light);

	return 0;
}

2126 2127
/*
 * sysfs "create_child" write handler: parses a pkey from @buf and adds a
 * child interface for it via ipoib_vlan_add().
 *
 * The value must parse as an integer in the range 1..0xffff and must not
 * be exactly 0x8000; otherwise -EINVAL is returned.  Returns @count on
 * success or the error from ipoib_vlan_add().
 */
static ssize_t create_child(struct device *dev,
			    struct device_attribute *attr,
			    const char *buf, size_t count)
{
	int pkey;
	int err;

	if (sscanf(buf, "%i", &pkey) != 1)
		return -EINVAL;

	if (pkey <= 0 || pkey > 0xffff || pkey == 0x8000)
		return -EINVAL;

	/*
	 * Set the full membership bit, so that we join the right
	 * broadcast group, etc.
	 */
	pkey |= 0x8000;

	err = ipoib_vlan_add(to_net_dev(dev), pkey);
	if (err)
		return err;

	return count;
}
static DEVICE_ATTR(create_child, S_IWUSR, NULL, create_child);
L
Linus Torvalds 已提交
2150

2151 2152
/*
 * sysfs "delete_child" write handler: parses a pkey from @buf and removes
 * the matching child interface via ipoib_vlan_delete().
 *
 * The value must parse as an integer in the range 0..0xffff; otherwise
 * -EINVAL is returned.  Returns @count on success or the error from
 * ipoib_vlan_delete().
 */
static ssize_t delete_child(struct device *dev,
			    struct device_attribute *attr,
			    const char *buf, size_t count)
{
	int pkey;
	int err;

	if (sscanf(buf, "%i", &pkey) != 1)
		return -EINVAL;

	if (pkey < 0 || pkey > 0xffff)
		return -EINVAL;

	err = ipoib_vlan_delete(to_net_dev(dev), pkey);
	if (err)
		return err;

	return count;
}
static DEVICE_ATTR(delete_child, S_IWUSR, NULL, delete_child);
L
Linus Torvalds 已提交
2170 2171 2172

int ipoib_add_pkey_attr(struct net_device *dev)
{
2173
	return device_create_file(&dev->dev, &dev_attr_pkey);
L
Linus Torvalds 已提交
2174 2175
}

2176
void ipoib_set_dev_features(struct ipoib_dev_priv *priv, struct ib_device *hca)
2177
{
2178
	priv->hca_caps = hca->attrs.device_cap_flags;
2179 2180

	if (priv->hca_caps & IB_DEVICE_UD_IP_CSUM) {
2181
		priv->dev->hw_features |= NETIF_F_IP_CSUM | NETIF_F_RXCSUM;
2182

2183 2184
		if (priv->hca_caps & IB_DEVICE_UD_TSO)
			priv->dev->hw_features |= NETIF_F_TSO;
O
Or Gerlitz 已提交
2185

2186 2187
		priv->dev->features |= priv->dev->hw_features;
	}
2188 2189
}

L
Linus Torvalds 已提交
2190 2191 2192 2193
/*
 * Create, initialise and register the IPoIB net_device for @port of @hca.
 *
 * @format is the interface name template handed to ipoib_intf_alloc().
 *
 * The function queries port attributes, pkey and GID from the HCA, fills
 * in MTU, broadcast and hardware address fields, initialises the device,
 * registers the IB event handler and the net_device, and finally creates
 * the sysfs attributes.
 *
 * Returns the registered net_device on success.  On failure everything
 * set up so far is torn down and an ERR_PTR() carrying the error code of
 * the step that failed is returned (-ENOMEM when allocation fails).
 */
static struct net_device *ipoib_add_port(const char *format,
					 struct ib_device *hca, u8 port)
{
	struct ipoib_dev_priv *priv;
	struct ib_port_attr attr;
	struct rdma_netdev *rn;
	int result = -ENOMEM;

	priv = ipoib_intf_alloc(hca, port, format);
	if (!priv)
		goto alloc_mem_failed;

	SET_NETDEV_DEV(priv->dev, hca->dev.parent);
	priv->dev->dev_id = port - 1;

	result = ib_query_port(hca, port, &attr);
	if (result) {
		pr_warn("%s: ib_query_port %d failed\n", hca->name, port);
		goto device_init_failed;
	}

	priv->max_ib_mtu = ib_mtu_enum_to_int(attr.max_mtu);

	/* MTU will be reset when mcast join happens */
	priv->dev->mtu  = IPOIB_UD_MTU(priv->max_ib_mtu);
	priv->mcast_mtu  = priv->admin_mtu = priv->dev->mtu;
	priv->dev->max_mtu = IPOIB_CM_MTU;

	priv->dev->neigh_priv_len = sizeof(struct ipoib_neigh);

	result = ib_query_pkey(hca, port, 0, &priv->pkey);
	if (result) {
		pr_warn("%s: ib_query_pkey port %d failed (ret = %d)\n",
			hca->name, port, result);
		goto device_init_failed;
	}

	ipoib_set_dev_features(priv, hca);

	/*
	 * Set the full membership bit, so that we join the right
	 * broadcast group, etc.
	 */
	priv->pkey |= 0x8000;

	/* Bytes 8 and 9 of the broadcast address carry the pkey, MSB first. */
	priv->dev->broadcast[8] = priv->pkey >> 8;
	priv->dev->broadcast[9] = priv->pkey & 0xff;

	result = ib_query_gid(hca, port, 0, &priv->local_gid, NULL);
	if (result) {
		pr_warn("%s: ib_query_gid port %d failed (ret = %d)\n",
			hca->name, port, result);
		goto device_init_failed;
	}

	/* The port GID forms the hardware address from byte 4 onwards. */
	memcpy(priv->dev->dev_addr + 4, priv->local_gid.raw,
	       sizeof(union ib_gid));
	set_bit(IPOIB_FLAG_DEV_ADDR_SET, &priv->flags);

	result = ipoib_dev_init(priv->dev, hca, port);
	if (result) {
		pr_warn("%s: failed to initialize port %d (ret = %d)\n",
			hca->name, port, result);
		goto device_init_failed;
	}

	INIT_IB_EVENT_HANDLER(&priv->event_handler,
			      priv->ca, ipoib_event);
	ib_register_event_handler(&priv->event_handler);

	result = register_netdev(priv->dev);
	if (result) {
		pr_warn("%s: couldn't register ipoib port %d; error %d\n",
			hca->name, port, result);
		goto register_failed;
	}

	/*
	 * Create the sysfs attributes.  Keep the real error code of the
	 * first failing call instead of reporting every failure as -ENOMEM.
	 */
	result = ipoib_cm_add_mode_attr(priv->dev);
	if (result)
		goto sysfs_failed;
	result = ipoib_add_pkey_attr(priv->dev);
	if (result)
		goto sysfs_failed;
	result = ipoib_add_umcast_attr(priv->dev);
	if (result)
		goto sysfs_failed;
	result = device_create_file(&priv->dev->dev, &dev_attr_create_child);
	if (result)
		goto sysfs_failed;
	result = device_create_file(&priv->dev->dev, &dev_attr_delete_child);
	if (result)
		goto sysfs_failed;

	return priv->dev;

sysfs_failed:
	unregister_netdev(priv->dev);

register_failed:
	ib_unregister_event_handler(&priv->event_handler);
	flush_workqueue(ipoib_workqueue);
	/* Stop GC if started before flush */
	set_bit(IPOIB_STOP_NEIGH_GC, &priv->flags);
	cancel_delayed_work(&priv->neigh_reap_task);
	flush_workqueue(priv->wq);
	ipoib_dev_cleanup(priv->dev);

device_init_failed:
	/* priv is a separate allocation from the net_device; free both. */
	rn = netdev_priv(priv->dev);
	rn->free_rdma_netdev(priv->dev);
	kfree(priv);

alloc_mem_failed:
	return ERR_PTR(result);
}

static void ipoib_add_one(struct ib_device *device)
{
	struct list_head *dev_list;
	struct net_device *dev;
	struct ipoib_dev_priv *priv;
2307
	int p;
M
Michael Wang 已提交
2308
	int count = 0;
T
Tom Tucker 已提交
2309

L
Linus Torvalds 已提交
2310 2311 2312 2313 2314 2315
	dev_list = kmalloc(sizeof *dev_list, GFP_KERNEL);
	if (!dev_list)
		return;

	INIT_LIST_HEAD(dev_list);

2316
	for (p = rdma_start_port(device); p <= rdma_end_port(device); ++p) {
M
Michael Wang 已提交
2317
		if (!rdma_protocol_ib(device, p))
E
Eli Cohen 已提交
2318
			continue;
L
Linus Torvalds 已提交
2319 2320
		dev = ipoib_add_port("ib%d", device, p);
		if (!IS_ERR(dev)) {
2321
			priv = ipoib_priv(dev);
L
Linus Torvalds 已提交
2322
			list_add_tail(&priv->list, dev_list);
M
Michael Wang 已提交
2323
			count++;
L
Linus Torvalds 已提交
2324 2325 2326
		}
	}

M
Michael Wang 已提交
2327
	if (!count) {
2328 2329
		pr_err("Failed to init port, removing it\n");
		ipoib_remove_one(device, dev_list);
M
Michael Wang 已提交
2330 2331 2332
		return;
	}

L
Linus Torvalds 已提交
2333 2334 2335
	ib_set_client_data(device, &ipoib_client, dev_list);
}

2336
static void ipoib_remove_one(struct ib_device *device, void *client_data)
L
Linus Torvalds 已提交
2337
{
2338
	struct ipoib_dev_priv *priv, *tmp, *cpriv, *tcpriv;
2339
	struct list_head *dev_list = client_data;
L
Linus Torvalds 已提交
2340

2341 2342
	if (!dev_list)
		return;
L
Linus Torvalds 已提交
2343 2344

	list_for_each_entry_safe(priv, tmp, dev_list, list) {
2345
		struct rdma_netdev *parent_rn = netdev_priv(priv->dev);
2346

L
Linus Torvalds 已提交
2347
		ib_unregister_event_handler(&priv->event_handler);
2348
		flush_workqueue(ipoib_workqueue);
2349

2350 2351 2352
		/* mark interface in the middle of destruction */
		set_bit(IPOIB_FLAG_GOING_DOWN, &priv->flags);

2353 2354 2355 2356
		rtnl_lock();
		dev_change_flags(priv->dev, priv->dev->flags & ~IFF_UP);
		rtnl_unlock();

2357 2358 2359
		/* Stop GC */
		set_bit(IPOIB_STOP_NEIGH_GC, &priv->flags);
		cancel_delayed_work(&priv->neigh_reap_task);
2360
		flush_workqueue(priv->wq);
L
Linus Torvalds 已提交
2361

2362 2363
		/* Wrap rtnl_lock/unlock with mutex to protect sysfs calls */
		mutex_lock(&priv->sysfs_mutex);
L
Linus Torvalds 已提交
2364
		unregister_netdev(priv->dev);
2365 2366
		mutex_unlock(&priv->sysfs_mutex);

2367 2368 2369 2370
		parent_rn->free_rdma_netdev(priv->dev);

		list_for_each_entry_safe(cpriv, tcpriv, &priv->child_intfs, list) {
			struct rdma_netdev *child_rn;
2371

2372 2373
			child_rn = netdev_priv(cpriv->dev);
			child_rn->free_rdma_netdev(cpriv->dev);
2374
			kfree(cpriv);
2375
		}
2376

2377
		kfree(priv);
L
Linus Torvalds 已提交
2378
	}
2379 2380

	kfree(dev_list);
L
Linus Torvalds 已提交
2381 2382
}

2383 2384 2385 2386 2387 2388
#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
/*
 * Netdevice notifier block, built only with CONFIG_INFINIBAND_IPOIB_DEBUG.
 * Events are dispatched to ipoib_netdev_event(); the block is registered
 * in ipoib_init_module() and unregistered in ipoib_cleanup_module().
 */
static struct notifier_block ipoib_netdev_notifier = {
	.notifier_call = ipoib_netdev_event,
};
#endif

L
Linus Torvalds 已提交
2389 2390 2391 2392
/*
 * Module entry point.
 *
 * Normalises the queue-size (and, with CM support, connection-QP) module
 * parameters, then brings up debugfs, the global flush workqueue, the SA
 * client, the IB client and the netlink ops, in that order.  Anything
 * already set up is undone in reverse order if a later step fails.
 *
 * Returns 0 on success or a negative errno.
 */
static int __init ipoib_init_module(void)
{
	int ret;

	/* Receive queue: power of two, clamped to [MIN, MAX]. */
	ipoib_recvq_size = roundup_pow_of_two(ipoib_recvq_size);
	ipoib_recvq_size = min(ipoib_recvq_size, IPOIB_MAX_QUEUE_SIZE);
	ipoib_recvq_size = max(ipoib_recvq_size, IPOIB_MIN_QUEUE_SIZE);

	/*
	 * Send queue: power of two, capped at MAX, and at least the larger
	 * of 2 * MAX_SEND_CQE and MIN.
	 */
	ipoib_sendq_size = roundup_pow_of_two(ipoib_sendq_size);
	ipoib_sendq_size = min(ipoib_sendq_size, IPOIB_MAX_QUEUE_SIZE);
	ipoib_sendq_size = max3(ipoib_sendq_size, 2 * MAX_SEND_CQE, IPOIB_MIN_QUEUE_SIZE);
#ifdef CONFIG_INFINIBAND_IPOIB_CM
	/* Connection QP limit: clamped to [0, IPOIB_CM_MAX_CONN_QP]. */
	ipoib_max_conn_qp = min(ipoib_max_conn_qp, IPOIB_CM_MAX_CONN_QP);
	ipoib_max_conn_qp = max(ipoib_max_conn_qp, 0);
#endif

	/*
	 * When copying small received packets, we only copy from the
	 * linear data part of the SKB, so we rely on this condition.
	 */
	BUILD_BUG_ON(IPOIB_CM_COPYBREAK > IPOIB_CM_HEAD_SIZE);

	ret = ipoib_register_debugfs();
	if (ret)
		return ret;

	/*
	 * This global workqueue carries flush operations only.  Flushing
	 * a workqueue from a task running on that same workqueue
	 * deadlocks, and we still need to be able to flush the work of an
	 * individual net device, so each netdevice additionally gets its
	 * own workqueue.  Per-device work is queued on the private
	 * workqueue; only flush events go on this global one.
	 */
	ipoib_workqueue = alloc_ordered_workqueue("ipoib_flush",
						  WQ_MEM_RECLAIM);
	if (!ipoib_workqueue) {
		ret = -ENOMEM;
		goto err_fs;
	}

	ib_sa_register_client(&ipoib_sa_client);

	ret = ib_register_client(&ipoib_client);
	if (ret)
		goto err_sa;

	ret = ipoib_netlink_init();
	if (ret)
		goto err_client;

#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
	register_netdevice_notifier(&ipoib_netdev_notifier);
#endif
	return 0;

err_client:
	ib_unregister_client(&ipoib_client);

err_sa:
	ib_sa_unregister_client(&ipoib_sa_client);
	destroy_workqueue(ipoib_workqueue);

err_fs:
	ipoib_unregister_debugfs();

	return ret;
}

/*
 * Module exit point: unregisters the debug netdevice notifier (when
 * built), the netlink ops, the IB client and the SA client, removes the
 * debugfs entries and finally destroys the global flush workqueue.
 */
static void __exit ipoib_cleanup_module(void)
{
#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
	unregister_netdevice_notifier(&ipoib_netdev_notifier);
#endif
	ipoib_netlink_fini();

	ib_unregister_client(&ipoib_client);
	ib_sa_unregister_client(&ipoib_sa_client);

	ipoib_unregister_debugfs();

	/* Destroyed last, after both clients have been unregistered. */
	destroy_workqueue(ipoib_workqueue);
}

/* Hook the init and cleanup routines defined above into module load/unload. */
module_init(ipoib_init_module);
module_exit(ipoib_cleanup_module);