bond_main.c 151.6 KB
Newer Older
L
Linus Torvalds 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41
/*
 * originally based on the dummy device.
 *
 * Copyright 1999, Thomas Davis, tadavis@lbl.gov.
 * Licensed under the GPL. Based on dummy.c, and eql.c devices.
 *
 * bonding.c: an Ethernet Bonding driver
 *
 * This is useful to talk to a Cisco EtherChannel compatible equipment:
 *	Cisco 5500
 *	Sun Trunking (Solaris)
 *	Alteon AceDirector Trunks
 *	Linux Bonding
 *	and probably many L2 switches ...
 *
 * How it works:
 *    ifconfig bond0 ipaddress netmask up
 *      will setup a network device, with an ip address.  No mac address
 *	will be assigned at this time.  The hw mac address will come from
 *	the first slave bonded to the channel.  All slaves will then use
 *	this hw mac address.
 *
 *    ifconfig bond0 down
 *         will release all slaves, marking them as down.
 *
 *    ifenslave bond0 eth0
 *	will attach eth0 to bond0 as a slave.  eth0 hw mac address will either
 *	a: be used as initial mac address
 *	b: if a hw mac address already is there, eth0's hw mac address
 *	   will then be set from bond0.
 *
 */

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/interrupt.h>
#include <linux/ptrace.h>
#include <linux/ioport.h>
#include <linux/in.h>
42
#include <net/ip.h>
L
Linus Torvalds 已提交
43
#include <linux/ip.h>
M
Matteo Croce 已提交
44 45
#include <linux/icmp.h>
#include <linux/icmpv6.h>
46 47
#include <linux/tcp.h>
#include <linux/udp.h>
L
Linus Torvalds 已提交
48 49 50 51 52 53 54 55
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/init.h>
#include <linux/timer.h>
#include <linux/socket.h>
#include <linux/ctype.h>
#include <linux/inet.h>
#include <linux/bitops.h>
S
Stephen Hemminger 已提交
56
#include <linux/io.h>
L
Linus Torvalds 已提交
57
#include <asm/dma.h>
S
Stephen Hemminger 已提交
58
#include <linux/uaccess.h>
L
Linus Torvalds 已提交
59 60 61
#include <linux/errno.h>
#include <linux/netdevice.h>
#include <linux/inetdevice.h>
62
#include <linux/igmp.h>
L
Linus Torvalds 已提交
63 64 65 66 67 68 69 70 71 72 73
#include <linux/etherdevice.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <linux/rtnetlink.h>
#include <linux/smp.h>
#include <linux/if_ether.h>
#include <net/arp.h>
#include <linux/mii.h>
#include <linux/ethtool.h>
#include <linux/if_vlan.h>
#include <linux/if_bonding.h>
D
David Sterba 已提交
74
#include <linux/jiffies.h>
75
#include <linux/preempt.h>
J
Jay Vosburgh 已提交
76
#include <net/route.h>
77
#include <net/net_namespace.h>
78
#include <net/netns/generic.h>
79
#include <net/pkt_sched.h>
80
#include <linux/rculist.h>
81
#include <net/flow_dissector.h>
82
#include <net/xfrm.h>
83 84 85
#include <net/bonding.h>
#include <net/bond_3ad.h>
#include <net/bond_alb.h>
86 87 88
#if IS_ENABLED(CONFIG_TLS_DEVICE)
#include <net/tls.h>
#endif
L
Linus Torvalds 已提交
89

90 91
#include "bonding_priv.h"

L
Linus Torvalds 已提交
92 93 94 95 96
/*---------------------------- Module parameters ----------------------------*/

/* monitor all links that often (in milliseconds). <=0 disables monitoring */

static int max_bonds	= BOND_DEFAULT_MAX_BONDS;
97
static int tx_queues	= BOND_DEFAULT_TX_QUEUES;
98
static int num_peer_notif = 1;
99
static int miimon;
S
Stephen Hemminger 已提交
100 101
static int updelay;
static int downdelay;
L
Linus Torvalds 已提交
102
static int use_carrier	= 1;
S
Stephen Hemminger 已提交
103 104
static char *mode;
static char *primary;
105
static char *primary_reselect;
S
Stephen Hemminger 已提交
106
static char *lacp_rate;
107
static int min_links;
S
Stephen Hemminger 已提交
108 109
static char *ad_select;
static char *xmit_hash_policy;
110
static int arp_interval;
S
Stephen Hemminger 已提交
111 112
static char *arp_ip_target[BOND_MAX_ARP_TARGETS];
static char *arp_validate;
113
static char *arp_all_targets;
S
Stephen Hemminger 已提交
114
static char *fail_over_mac;
115
static int all_slaves_active;
116
static struct bond_params bonding_defaults;
117
static int resend_igmp = BOND_DEFAULT_RESEND_IGMP;
118
static int packets_per_slave = 1;
119
static int lp_interval = BOND_ALB_DEFAULT_LP_INTERVAL;
L
Linus Torvalds 已提交
120 121 122

/* Registration of the module parameters and their modinfo descriptions.
 * Only the two num_peer_notif aliases are registered with a non-zero
 * permission mask (0644); every other parameter uses 0.
 */
module_param(max_bonds, int, 0);
MODULE_PARM_DESC(max_bonds, "Max number of bonded devices");

module_param(tx_queues, int, 0);
MODULE_PARM_DESC(tx_queues, "Max number of transmit queues (default = 16)");

/* Two parameter names backed by the same variable */
module_param_named(num_grat_arp, num_peer_notif, int, 0644);
MODULE_PARM_DESC(num_grat_arp,
		 "Number of peer notifications to send on failover event (alias of num_unsol_na)");
module_param_named(num_unsol_na, num_peer_notif, int, 0644);
MODULE_PARM_DESC(num_unsol_na,
		 "Number of peer notifications to send on failover event (alias of num_grat_arp)");

module_param(miimon, int, 0);
MODULE_PARM_DESC(miimon, "Link check interval in milliseconds");

module_param(updelay, int, 0);
MODULE_PARM_DESC(updelay, "Delay before considering link up, in milliseconds");

module_param(downdelay, int, 0);
MODULE_PARM_DESC(downdelay,
		 "Delay before considering link down, in milliseconds");

module_param(use_carrier, int, 0);
MODULE_PARM_DESC(use_carrier,
		 "Use netif_carrier_ok (vs MII ioctls) in miimon; 0 for off, 1 for on (default)");

module_param(mode, charp, 0);
MODULE_PARM_DESC(mode,
		 "Mode of operation; "
		 "0 for balance-rr, 1 for active-backup, 2 for balance-xor, "
		 "3 for broadcast, 4 for 802.3ad, 5 for balance-tlb, 6 for balance-alb");

module_param(primary, charp, 0);
MODULE_PARM_DESC(primary, "Primary network device to use");

module_param(primary_reselect, charp, 0);
MODULE_PARM_DESC(primary_reselect,
		 "Reselect primary slave once it comes up; "
		 "0 for always (default), "
		 "1 for only if speed of primary is better, "
		 "2 for only on active slave failure");

module_param(lacp_rate, charp, 0);
MODULE_PARM_DESC(lacp_rate,
		 "LACPDU tx rate to request from 802.3ad partner; 0 for slow, 1 for fast");

module_param(ad_select, charp, 0);
MODULE_PARM_DESC(ad_select,
		 "802.3ad aggregation selection logic; 0 for stable (default), 1 for bandwidth, 2 for count");

module_param(min_links, int, 0);
MODULE_PARM_DESC(min_links, "Minimum number of available links before turning on carrier");

module_param(xmit_hash_policy, charp, 0);
MODULE_PARM_DESC(xmit_hash_policy,
		 "balance-alb, balance-tlb, balance-xor, 802.3ad hashing method; "
		 "0 for layer 2 (default), 1 for layer 3+4, 2 for layer 2+3, "
		 "3 for encap layer 2+3, 4 for encap layer 3+4, 5 for vlan+srcmac");

module_param(arp_interval, int, 0);
MODULE_PARM_DESC(arp_interval, "arp interval in milliseconds");

module_param_array(arp_ip_target, charp, NULL, 0);
MODULE_PARM_DESC(arp_ip_target, "arp targets in n.n.n.n form");

module_param(arp_validate, charp, 0);
MODULE_PARM_DESC(arp_validate,
		 "validate src/dst of ARP probes; 0 for none (default), 1 for active, 2 for backup, 3 for all");

module_param(arp_all_targets, charp, 0);
MODULE_PARM_DESC(arp_all_targets, "fail on any/all arp targets timeout; 0 for any (default), 1 for all");

module_param(fail_over_mac, charp, 0);
MODULE_PARM_DESC(fail_over_mac,
		 "For active-backup, do not set all slaves to the same MAC; "
		 "0 for none (default), 1 for active, 2 for follow");

module_param(all_slaves_active, int, 0);
MODULE_PARM_DESC(all_slaves_active,
		 "Keep all frames received on an interface by setting active flag for all slaves; "
		 "0 for never (default), 1 for always.");

module_param(resend_igmp, int, 0);
MODULE_PARM_DESC(resend_igmp,
		 "Number of IGMP membership reports to send on link failure");

module_param(packets_per_slave, int, 0);
MODULE_PARM_DESC(packets_per_slave,
		 "Packets to send per slave in balance-rr mode; "
		 "0 for a random slave, 1 packet per slave (default), >1 packets per slave.");

module_param(lp_interval, uint, 0);
MODULE_PARM_DESC(lp_interval,
		 "The number of seconds between instances where the bonding driver sends "
		 "learning packets to each slaves peer switch. The default is 1.");
L
Linus Torvalds 已提交
200 201 202

/*----------------------------- Global variables ----------------------------*/

203
#ifdef CONFIG_NET_POLL_CONTROLLER
204
atomic_t netpoll_block_tx = ATOMIC_INIT(0);
205 206
#endif

207
unsigned int bond_net_id __read_mostly;
L
Linus Torvalds 已提交
208

209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253
/* Flow dissector keys used by the bonding driver.  Each entry pairs a
 * dissector key id with the offset of the struct flow_keys member that
 * receives the dissected value.
 */
static const struct flow_dissector_key flow_keys_bonding_keys[] = {
	{ .key_id = FLOW_DISSECTOR_KEY_CONTROL,
	  .offset = offsetof(struct flow_keys, control) },
	{ .key_id = FLOW_DISSECTOR_KEY_BASIC,
	  .offset = offsetof(struct flow_keys, basic) },
	{ .key_id = FLOW_DISSECTOR_KEY_IPV4_ADDRS,
	  .offset = offsetof(struct flow_keys, addrs.v4addrs) },
	{ .key_id = FLOW_DISSECTOR_KEY_IPV6_ADDRS,
	  .offset = offsetof(struct flow_keys, addrs.v6addrs) },
	{ .key_id = FLOW_DISSECTOR_KEY_TIPC,
	  .offset = offsetof(struct flow_keys, addrs.tipckey) },
	{ .key_id = FLOW_DISSECTOR_KEY_PORTS,
	  .offset = offsetof(struct flow_keys, ports) },
	{ .key_id = FLOW_DISSECTOR_KEY_ICMP,
	  .offset = offsetof(struct flow_keys, icmp) },
	{ .key_id = FLOW_DISSECTOR_KEY_VLAN,
	  .offset = offsetof(struct flow_keys, vlan) },
	{ .key_id = FLOW_DISSECTOR_KEY_FLOW_LABEL,
	  .offset = offsetof(struct flow_keys, tags) },
	{ .key_id = FLOW_DISSECTOR_KEY_GRE_KEYID,
	  .offset = offsetof(struct flow_keys, keyid) },
};

/* NOTE(review): presumably initialised from the table above during driver
 * init; that code is outside this part of the file.
 */
static struct flow_dissector flow_keys_bonding __read_mostly;

L
Linus Torvalds 已提交
254 255
/*-------------------------- Forward declarations ---------------------------*/

256
static int bond_init(struct net_device *bond_dev);
257
static void bond_uninit(struct net_device *bond_dev);
258 259
static void bond_get_stats(struct net_device *bond_dev,
			   struct rtnl_link_stats64 *stats);
260
static void bond_slave_arr_handler(struct work_struct *work);
261 262
static bool bond_time_in_interval(struct bonding *bond, unsigned long last_act,
				  int mod);
263
static void bond_netdev_notify_work(struct work_struct *work);
L
Linus Torvalds 已提交
264 265 266

/*---------------------------- General routines -----------------------------*/

267
const char *bond_mode_name(int mode)
L
Linus Torvalds 已提交
268
{
269 270 271 272 273
	static const char *names[] = {
		[BOND_MODE_ROUNDROBIN] = "load balancing (round-robin)",
		[BOND_MODE_ACTIVEBACKUP] = "fault-tolerance (active-backup)",
		[BOND_MODE_XOR] = "load balancing (xor)",
		[BOND_MODE_BROADCAST] = "fault-tolerance (broadcast)",
S
Stephen Hemminger 已提交
274
		[BOND_MODE_8023AD] = "IEEE 802.3ad Dynamic link aggregation",
275 276 277 278
		[BOND_MODE_TLB] = "transmit load balancing",
		[BOND_MODE_ALB] = "adaptive load balancing",
	};

279
	if (mode < BOND_MODE_ROUNDROBIN || mode > BOND_MODE_ALB)
L
Linus Torvalds 已提交
280
		return "unknown";
281 282

	return names[mode];
L
Linus Torvalds 已提交
283 284 285 286
}

/**
 * bond_dev_queue_xmit - Prepare skb for xmit.
S
Stephen Hemminger 已提交
287
 *
L
Linus Torvalds 已提交
288 289 290 291
 * @bond: bond device that got this skb for tx.
 * @skb: hw accel VLAN tagged skb to transmit
 * @slave_dev: slave that is supposed to xmit this skbuff
 */
292
netdev_tx_t bond_dev_queue_xmit(struct bonding *bond, struct sk_buff *skb,
S
Stephen Hemminger 已提交
293
			struct net_device *slave_dev)
L
Linus Torvalds 已提交
294
{
295
	skb->dev = slave_dev;
296

297
	BUILD_BUG_ON(sizeof(skb->queue_mapping) !=
298
		     sizeof(qdisc_skb_cb(skb)->slave_dev_queue_mapping));
299
	skb_set_queue_mapping(skb, qdisc_skb_cb(skb)->slave_dev_queue_mapping);
300

301
	if (unlikely(netpoll_tx_running(bond->dev)))
302 303 304
		return bond_netpoll_send_skb(bond_get_slave_by_dev(bond, slave_dev), skb);

	return dev_queue_xmit(skb);
L
Linus Torvalds 已提交
305 306
}

307 308 309 310 311 312 313 314 315 316 317 318 319
/* Returns true only when the bond runs in 802.3ad or XOR mode with the
 * layer3+4 transmit hash policy; false in every other configuration.
 */
bool bond_sk_check(struct bonding *bond)
{
	int bond_mode = BOND_MODE(bond);

	if (bond_mode != BOND_MODE_8023AD && bond_mode != BOND_MODE_XOR)
		return false;

	return bond->params.xmit_policy == BOND_XMIT_POLICY_LAYER34;
}

320 321
/*---------------------------------- VLAN -----------------------------------*/

322
/* In the following 2 functions, bond_vlan_rx_add_vid and bond_vlan_rx_kill_vid,
 * we don't protect the slave list iteration with a lock because:
 * a. This operation is performed in IOCTL context,
 * b. The operation is protected by the RTNL semaphore in the 8021q code,
 * c. Holding a lock with BH disabled while directly calling a base driver
 *    entry point is generally a BAD idea.
 *
 * The design of synchronization/protection for this operation in the 8021q
 * module is good for one or more VLAN devices over a single physical device
 * and cannot be extended for a teaming solution like bonding, so there is a
 * potential race condition here where a net device from the vlan group might
 * be referenced (either by a base driver or the 8021q code) while it is being
 * removed from the system. However, it turns out we're not making matters
 * worse, and if it works for regular VLAN usage it will work here too.
 */

/**
 * bond_vlan_rx_add_vid - Propagates adding an id to slaves
 * @bond_dev: bonding net device that got called
341
 * @proto: network protocol ID
L
Linus Torvalds 已提交
342 343
 * @vid: vlan id being added
 */
344 345
static int bond_vlan_rx_add_vid(struct net_device *bond_dev,
				__be16 proto, u16 vid)
L
Linus Torvalds 已提交
346
{
347
	struct bonding *bond = netdev_priv(bond_dev);
348
	struct slave *slave, *rollback_slave;
349
	struct list_head *iter;
350
	int res;
L
Linus Torvalds 已提交
351

352
	bond_for_each_slave(bond, slave, iter) {
353
		res = vlan_vid_add(slave->dev, proto, vid);
354 355
		if (res)
			goto unwind;
L
Linus Torvalds 已提交
356 357
	}

358
	return 0;
359 360

unwind:
361
	/* unwind to the slave that failed */
362
	bond_for_each_slave(bond, rollback_slave, iter) {
363 364 365 366 367
		if (rollback_slave == slave)
			break;

		vlan_vid_del(rollback_slave->dev, proto, vid);
	}
368 369

	return res;
L
Linus Torvalds 已提交
370 371 372 373 374
}

/**
 * bond_vlan_rx_kill_vid - Propagates deleting an id to slaves
 * @bond_dev: bonding net device that got called
375
 * @proto: network protocol ID
L
Linus Torvalds 已提交
376 377
 * @vid: vlan id being removed
 */
378 379
static int bond_vlan_rx_kill_vid(struct net_device *bond_dev,
				 __be16 proto, u16 vid)
L
Linus Torvalds 已提交
380
{
381
	struct bonding *bond = netdev_priv(bond_dev);
382
	struct list_head *iter;
L
Linus Torvalds 已提交
383 384
	struct slave *slave;

385
	bond_for_each_slave(bond, slave, iter)
386
		vlan_vid_del(slave->dev, proto, vid);
L
Linus Torvalds 已提交
387

388 389
	if (bond_is_lb(bond))
		bond_alb_clear_vlan(bond, vid);
390 391

	return 0;
L
Linus Torvalds 已提交
392 393
}

394 395 396 397 398 399 400 401 402 403
/*---------------------------------- XFRM -----------------------------------*/

#ifdef CONFIG_XFRM_OFFLOAD
/**
 * bond_ipsec_add_sa - program device with a security association
 * @xs: pointer to transformer state struct
 **/
static int bond_ipsec_add_sa(struct xfrm_state *xs)
{
	struct net_device *bond_dev = xs->xso.dev;
404 405
	struct bonding *bond;
	struct slave *slave;
406

407 408 409 410
	if (!bond_dev)
		return -EINVAL;

	bond = netdev_priv(bond_dev);
411
	slave = rcu_dereference(bond->curr_active_slave);
412
	xs->xso.real_dev = slave->dev;
413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430
	bond->xs = xs;

	if (!(slave->dev->xfrmdev_ops
	      && slave->dev->xfrmdev_ops->xdo_dev_state_add)) {
		slave_warn(bond_dev, slave->dev, "Slave does not support ipsec offload\n");
		return -EINVAL;
	}

	return slave->dev->xfrmdev_ops->xdo_dev_state_add(xs);
}

/**
 * bond_ipsec_del_sa - clear out this specific SA
 * @xs: pointer to transformer state struct
 **/
static void bond_ipsec_del_sa(struct xfrm_state *xs)
{
	struct net_device *bond_dev = xs->xso.dev;
431 432 433 434 435 436 437
	struct bonding *bond;
	struct slave *slave;

	if (!bond_dev)
		return;

	bond = netdev_priv(bond_dev);
438
	slave = rcu_dereference(bond->curr_active_slave);
439 440 441 442

	if (!slave)
		return;

443
	xs->xso.real_dev = slave->dev;
444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462

	if (!(slave->dev->xfrmdev_ops
	      && slave->dev->xfrmdev_ops->xdo_dev_state_delete)) {
		slave_warn(bond_dev, slave->dev, "%s: no slave xdo_dev_state_delete\n", __func__);
		return;
	}

	slave->dev->xfrmdev_ops->xdo_dev_state_delete(xs);
}

/**
 * bond_ipsec_offload_ok - can this packet use the xfrm hw offload
 * @skb: current data packet
 * @xs: pointer to transformer state struct
 **/
static bool bond_ipsec_offload_ok(struct sk_buff *skb, struct xfrm_state *xs)
{
	struct net_device *bond_dev = xs->xso.dev;
	struct bonding *bond = netdev_priv(bond_dev);
463
	struct slave *curr_active = rcu_dereference(bond->curr_active_slave);
464 465
	struct net_device *slave_dev = curr_active->dev;

466 467 468
	if (BOND_MODE(bond) != BOND_MODE_ACTIVEBACKUP)
		return true;

469 470 471 472 473 474
	if (!(slave_dev->xfrmdev_ops
	      && slave_dev->xfrmdev_ops->xdo_dev_offload_ok)) {
		slave_warn(bond_dev, slave_dev, "%s: no slave xdo_dev_offload_ok\n", __func__);
		return false;
	}

475
	xs->xso.real_dev = slave_dev;
476 477 478 479 480 481 482 483 484 485
	return slave_dev->xfrmdev_ops->xdo_dev_offload_ok(skb, xs);
}

/* xfrm device offload callbacks of the bond, implemented by the
 * bond_ipsec_*() functions above.
 */
static const struct xfrmdev_ops bond_xfrmdev_ops = {
	.xdo_dev_state_add = bond_ipsec_add_sa,
	.xdo_dev_state_delete = bond_ipsec_del_sa,
	.xdo_dev_offload_ok = bond_ipsec_offload_ok,
};
#endif /* CONFIG_XFRM_OFFLOAD */

L
Linus Torvalds 已提交
486 487
/*------------------------------- Link status -------------------------------*/

488
/* Set the carrier state for the master according to the state of its
489 490 491 492 493
 * slaves.  If any slaves are up, the master is up.  In 802.3ad mode,
 * do special 802.3ad magic.
 *
 * Returns zero if carrier state does not change, nonzero if it does.
 */
494
int bond_set_carrier(struct bonding *bond)
495
{
496
	struct list_head *iter;
497 498
	struct slave *slave;

499
	if (!bond_has_slaves(bond))
500 501
		goto down;

502
	if (BOND_MODE(bond) == BOND_MODE_8023AD)
503 504
		return bond_3ad_set_carrier(bond);

505
	bond_for_each_slave(bond, slave, iter) {
506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522
		if (slave->link == BOND_LINK_UP) {
			if (!netif_carrier_ok(bond->dev)) {
				netif_carrier_on(bond->dev);
				return 1;
			}
			return 0;
		}
	}

down:
	if (netif_carrier_ok(bond->dev)) {
		netif_carrier_off(bond->dev);
		return 1;
	}
	return 0;
}

523
/* Get link speed and duplex from the slave's base driver
L
Linus Torvalds 已提交
524
 * using ethtool. If for some reason the call fails or the
525
 * values are invalid, set speed and duplex to -1,
526 527
 * and return. Return 1 if speed or duplex settings are
 * UNKNOWN; 0 otherwise.
L
Linus Torvalds 已提交
528
 */
529
static int bond_update_speed_duplex(struct slave *slave)
L
Linus Torvalds 已提交
530 531
{
	struct net_device *slave_dev = slave->dev;
532
	struct ethtool_link_ksettings ecmd;
533
	int res;
L
Linus Torvalds 已提交
534

535 536
	slave->speed = SPEED_UNKNOWN;
	slave->duplex = DUPLEX_UNKNOWN;
L
Linus Torvalds 已提交
537

538
	res = __ethtool_get_link_ksettings(slave_dev, &ecmd);
539
	if (res < 0)
540
		return 1;
541
	if (ecmd.base.speed == 0 || ecmd.base.speed == ((__u32)-1))
542
		return 1;
543
	switch (ecmd.base.duplex) {
L
Linus Torvalds 已提交
544 545 546 547
	case DUPLEX_FULL:
	case DUPLEX_HALF:
		break;
	default:
548
		return 1;
L
Linus Torvalds 已提交
549 550
	}

551 552
	slave->speed = ecmd.base.speed;
	slave->duplex = ecmd.base.duplex;
L
Linus Torvalds 已提交
553

554
	return 0;
L
Linus Torvalds 已提交
555 556
}

557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572
/* Translate a BOND_LINK_* state into a constant descriptive string;
 * unrecognised values yield "unknown".
 */
const char *bond_slave_link_status(s8 link)
{
	if (link == BOND_LINK_UP)
		return "up";
	if (link == BOND_LINK_FAIL)
		return "going down";
	if (link == BOND_LINK_DOWN)
		return "down";
	if (link == BOND_LINK_BACK)
		return "going back";

	return "unknown";
}

573
/* if <dev> supports MII link status reporting, check its link status.
L
Linus Torvalds 已提交
574 575
 *
 * We either do MII/ETHTOOL ioctls, or check netif_carrier_ok(),
S
Stephen Hemminger 已提交
576
 * depending upon the setting of the use_carrier parameter.
L
Linus Torvalds 已提交
577 578 579 580 581 582 583 584 585 586 587
 *
 * Return either BMSR_LSTATUS, meaning that the link is up (or we
 * can't tell and just pretend it is), or 0, meaning that the link is
 * down.
 *
 * If reporting is non-zero, instead of faking link up, return -1 if
 * both ETHTOOL and MII ioctls fail (meaning the device does not
 * support them).  If use_carrier is set, return whatever it says.
 * It'd be nice if there was a good way to tell if a driver supports
 * netif_carrier, but there really isn't.
 */
S
Stephen Hemminger 已提交
588 589
static int bond_check_dev_link(struct bonding *bond,
			       struct net_device *slave_dev, int reporting)
L
Linus Torvalds 已提交
590
{
591
	const struct net_device_ops *slave_ops = slave_dev->netdev_ops;
592
	int (*ioctl)(struct net_device *, struct ifreq *, int);
L
Linus Torvalds 已提交
593 594 595
	struct ifreq ifr;
	struct mii_ioctl_data *mii;

596 597 598
	if (!reporting && !netif_running(slave_dev))
		return 0;

599
	if (bond->params.use_carrier)
600
		return netif_carrier_ok(slave_dev) ? BMSR_LSTATUS : 0;
L
Linus Torvalds 已提交
601

602
	/* Try to get link status using Ethtool first. */
603 604 605
	if (slave_dev->ethtool_ops->get_link)
		return slave_dev->ethtool_ops->get_link(slave_dev) ?
			BMSR_LSTATUS : 0;
606

S
Stephen Hemminger 已提交
607
	/* Ethtool can't be used, fallback to MII ioctls. */
608
	ioctl = slave_ops->ndo_do_ioctl;
L
Linus Torvalds 已提交
609
	if (ioctl) {
610 611 612 613 614 615 616 617
		/* TODO: set pointer to correct ioctl on a per team member
		 *       bases to make this more efficient. that is, once
		 *       we determine the correct ioctl, we will always
		 *       call it and not the others for that team
		 *       member.
		 */

		/* We cannot assume that SIOCGMIIPHY will also read a
L
Linus Torvalds 已提交
618 619 620 621 622 623 624
		 * register; not all network drivers (e.g., e100)
		 * support that.
		 */

		/* Yes, the mii is overlaid on the ifreq.ifr_ifru */
		strncpy(ifr.ifr_name, slave_dev->name, IFNAMSIZ);
		mii = if_mii(&ifr);
A
Al Viro 已提交
625
		if (ioctl(slave_dev, &ifr, SIOCGMIIPHY) == 0) {
L
Linus Torvalds 已提交
626
			mii->reg_num = MII_BMSR;
A
Al Viro 已提交
627
			if (ioctl(slave_dev, &ifr, SIOCGMIIREG) == 0)
S
Stephen Hemminger 已提交
628
				return mii->val_out & BMSR_LSTATUS;
L
Linus Torvalds 已提交
629 630 631
		}
	}

632
	/* If reporting, report that either there's no dev->do_ioctl,
633
	 * or both SIOCGMIIREG and get_link failed (meaning that we
L
Linus Torvalds 已提交
634 635 636
	 * cannot report link status).  If not reporting, pretend
	 * we're ok.
	 */
S
Stephen Hemminger 已提交
637
	return reporting ? -1 : BMSR_LSTATUS;
L
Linus Torvalds 已提交
638 639 640 641
}

/*----------------------------- Multicast list ------------------------------*/

642
/* Push the promiscuity flag down to appropriate slaves */
643
static int bond_set_promiscuity(struct bonding *bond, int inc)
L
Linus Torvalds 已提交
644
{
645
	struct list_head *iter;
646
	int err = 0;
647

648
	if (bond_uses_primary(bond)) {
649
		struct slave *curr_active = rtnl_dereference(bond->curr_active_slave);
650 651 652

		if (curr_active)
			err = dev_set_promiscuity(curr_active->dev, inc);
L
Linus Torvalds 已提交
653 654
	} else {
		struct slave *slave;
655

656
		bond_for_each_slave(bond, slave, iter) {
657 658 659
			err = dev_set_promiscuity(slave->dev, inc);
			if (err)
				return err;
L
Linus Torvalds 已提交
660 661
		}
	}
662
	return err;
L
Linus Torvalds 已提交
663 664
}

665
/* Push the allmulti flag down to all slaves */
666
static int bond_set_allmulti(struct bonding *bond, int inc)
L
Linus Torvalds 已提交
667
{
668
	struct list_head *iter;
669
	int err = 0;
670

671
	if (bond_uses_primary(bond)) {
672
		struct slave *curr_active = rtnl_dereference(bond->curr_active_slave);
673 674 675

		if (curr_active)
			err = dev_set_allmulti(curr_active->dev, inc);
L
Linus Torvalds 已提交
676 677
	} else {
		struct slave *slave;
678

679
		bond_for_each_slave(bond, slave, iter) {
680 681 682
			err = dev_set_allmulti(slave->dev, inc);
			if (err)
				return err;
L
Linus Torvalds 已提交
683 684
		}
	}
685
	return err;
L
Linus Torvalds 已提交
686 687
}

688
/* Retrieve the list of registered multicast addresses for the bonding
689 690 691
 * device and retransmit an IGMP JOIN request to the current active
 * slave.
 */
692
static void bond_resend_igmp_join_requests_delayed(struct work_struct *work)
693
{
694 695 696
	struct bonding *bond = container_of(work, struct bonding,
					    mcast_work.work);

697
	if (!rtnl_trylock()) {
698
		queue_delayed_work(bond->wq, &bond->mcast_work, 1);
699
		return;
700
	}
701
	call_netdevice_notifiers(NETDEV_RESEND_IGMP, bond->dev);
702

703 704
	if (bond->igmp_retrans > 1) {
		bond->igmp_retrans--;
705
		queue_delayed_work(bond->wq, &bond->mcast_work, HZ/5);
706
	}
707
	rtnl_unlock();
708 709
}

710
/* Flush bond's hardware addresses from slave */
711
static void bond_hw_addr_flush(struct net_device *bond_dev,
S
Stephen Hemminger 已提交
712
			       struct net_device *slave_dev)
L
Linus Torvalds 已提交
713
{
714
	struct bonding *bond = netdev_priv(bond_dev);
L
Linus Torvalds 已提交
715

716 717
	dev_uc_unsync(slave_dev, bond_dev);
	dev_mc_unsync(slave_dev, bond_dev);
L
Linus Torvalds 已提交
718

719
	if (BOND_MODE(bond) == BOND_MODE_8023AD) {
L
Linus Torvalds 已提交
720 721 722
		/* del lacpdu mc addr from mc list */
		u8 lacpdu_multicast[ETH_ALEN] = MULTICAST_LACPDU_ADDR;

723
		dev_mc_del(slave_dev, lacpdu_multicast);
L
Linus Torvalds 已提交
724 725 726 727 728
	}
}

/*--------------------------- Active slave change ---------------------------*/

729
/* Update the hardware address list and promisc/allmulti for the new and
730 731
 * old active slaves (if any).  Modes that are not using primary keep all
 * slaves up date at all times; only the modes that use primary need to call
732
 * this function to swap these settings during a failover.
L
Linus Torvalds 已提交
733
 */
734 735
static void bond_hw_addr_swap(struct bonding *bond, struct slave *new_active,
			      struct slave *old_active)
L
Linus Torvalds 已提交
736 737
{
	if (old_active) {
S
Stephen Hemminger 已提交
738
		if (bond->dev->flags & IFF_PROMISC)
L
Linus Torvalds 已提交
739 740
			dev_set_promiscuity(old_active->dev, -1);

S
Stephen Hemminger 已提交
741
		if (bond->dev->flags & IFF_ALLMULTI)
L
Linus Torvalds 已提交
742 743
			dev_set_allmulti(old_active->dev, -1);

744
		bond_hw_addr_flush(bond->dev, old_active->dev);
L
Linus Torvalds 已提交
745 746 747
	}

	if (new_active) {
748
		/* FIXME: Signal errors upstream. */
S
Stephen Hemminger 已提交
749
		if (bond->dev->flags & IFF_PROMISC)
L
Linus Torvalds 已提交
750 751
			dev_set_promiscuity(new_active->dev, 1);

S
Stephen Hemminger 已提交
752
		if (bond->dev->flags & IFF_ALLMULTI)
L
Linus Torvalds 已提交
753 754
			dev_set_allmulti(new_active->dev, 1);

755
		netif_addr_lock_bh(bond->dev);
756 757
		dev_uc_sync(new_active->dev, bond->dev);
		dev_mc_sync(new_active->dev, bond->dev);
758
		netif_addr_unlock_bh(bond->dev);
L
Linus Torvalds 已提交
759 760 761
	}
}

762 763 764 765 766 767 768
/**
 * bond_set_dev_addr - clone slave's address to bond
 * @bond_dev: bond net device
 * @slave_dev: slave net device
 *
 * Should be called with RTNL held.
 */
769 770
static int bond_set_dev_addr(struct net_device *bond_dev,
			     struct net_device *slave_dev)
771
{
772 773
	int err;

774 775
	slave_dbg(bond_dev, slave_dev, "bond_dev=%p slave_dev=%p slave_dev->addr_len=%d\n",
		  bond_dev, slave_dev, slave_dev->addr_len);
776 777 778 779
	err = dev_pre_changeaddr_notify(bond_dev, slave_dev->dev_addr, NULL);
	if (err)
		return err;

780 781 782
	memcpy(bond_dev->dev_addr, slave_dev->dev_addr, slave_dev->addr_len);
	bond_dev->addr_assign_type = NET_ADDR_STOLEN;
	call_netdevice_notifiers(NETDEV_CHANGEADDR, bond_dev);
783
	return 0;
784 785
}

786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802
/* Find the first slave other than @new_active whose device address equals
 * the bond device's address.  Returns NULL when there is none.
 */
static struct slave *bond_get_old_active(struct bonding *bond,
					 struct slave *new_active)
{
	struct list_head *pos;
	struct slave *candidate;

	bond_for_each_slave(bond, candidate, pos) {
		if (candidate != new_active &&
		    ether_addr_equal(bond->dev->dev_addr,
				     candidate->dev->dev_addr))
			return candidate;
	}

	return NULL;
}

803
/* bond_do_fail_over_mac
804 805 806
 *
 * Perform special MAC address swapping for fail_over_mac settings
 *
807
 * Called with RTNL
808 809 810 811 812
 */
static void bond_do_fail_over_mac(struct bonding *bond,
				  struct slave *new_active,
				  struct slave *old_active)
{
813 814
	u8 tmp_mac[MAX_ADDR_LEN];
	struct sockaddr_storage ss;
815 816 817 818
	int rv;

	switch (bond->params.fail_over_mac) {
	case BOND_FOM_ACTIVE:
819 820 821
		if (new_active) {
			rv = bond_set_dev_addr(bond->dev, new_active->dev);
			if (rv)
822 823
				slave_err(bond->dev, new_active->dev, "Error %d setting bond MAC from slave\n",
					  -rv);
824
		}
825 826
		break;
	case BOND_FOM_FOLLOW:
827
		/* if new_active && old_active, swap them
828 829 830 831 832 833
		 * if just old_active, do nothing (going to no active slave)
		 * if just new_active, set new_active to bond's MAC
		 */
		if (!new_active)
			return;

834 835 836
		if (!old_active)
			old_active = bond_get_old_active(bond, new_active);

837
		if (old_active) {
838 839 840 841 842 843
			bond_hw_addr_copy(tmp_mac, new_active->dev->dev_addr,
					  new_active->dev->addr_len);
			bond_hw_addr_copy(ss.__data,
					  old_active->dev->dev_addr,
					  old_active->dev->addr_len);
			ss.ss_family = new_active->dev->type;
844
		} else {
845 846 847
			bond_hw_addr_copy(ss.__data, bond->dev->dev_addr,
					  bond->dev->addr_len);
			ss.ss_family = bond->dev->type;
848 849
		}

850
		rv = dev_set_mac_address(new_active->dev,
851
					 (struct sockaddr *)&ss, NULL);
852
		if (rv) {
853 854
			slave_err(bond->dev, new_active->dev, "Error %d setting MAC of new active slave\n",
				  -rv);
855 856 857 858 859 860
			goto out;
		}

		if (!old_active)
			goto out;

861 862 863
		bond_hw_addr_copy(ss.__data, tmp_mac,
				  new_active->dev->addr_len);
		ss.ss_family = old_active->dev->type;
864

865
		rv = dev_set_mac_address(old_active->dev,
866
					 (struct sockaddr *)&ss, NULL);
867
		if (rv)
868 869
			slave_err(bond->dev, old_active->dev, "Error %d setting MAC of old active slave\n",
				  -rv);
870 871 872
out:
		break;
	default:
873 874
		netdev_err(bond->dev, "bond_do_fail_over_mac impossible: bad policy %d\n",
			   bond->params.fail_over_mac);
875 876 877 878 879
		break;
	}

}

880
static struct slave *bond_choose_primary_or_current(struct bonding *bond)
881
{
882
	struct slave *prim = rtnl_dereference(bond->primary_slave);
883
	struct slave *curr = rtnl_dereference(bond->curr_active_slave);
884

885 886 887 888 889 890
	if (!prim || prim->link != BOND_LINK_UP) {
		if (!curr || curr->link != BOND_LINK_UP)
			return NULL;
		return curr;
	}

891 892
	if (bond->force_primary) {
		bond->force_primary = false;
893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914
		return prim;
	}

	if (!curr || curr->link != BOND_LINK_UP)
		return prim;

	/* At this point, prim and curr are both up */
	switch (bond->params.primary_reselect) {
	case BOND_PRI_RESELECT_ALWAYS:
		return prim;
	case BOND_PRI_RESELECT_BETTER:
		if (prim->speed < curr->speed)
			return curr;
		if (prim->speed == curr->speed && prim->duplex <= curr->duplex)
			return curr;
		return prim;
	case BOND_PRI_RESELECT_FAILURE:
		return curr;
	default:
		netdev_err(bond->dev, "impossible primary_reselect %d\n",
			   bond->params.primary_reselect);
		return curr;
915 916
	}
}
917

L
Linus Torvalds 已提交
918
/**
919
 * bond_find_best_slave - select the best available slave to be the active one
L
Linus Torvalds 已提交
920 921 922 923
 * @bond: our bonding struct
 */
static struct slave *bond_find_best_slave(struct bonding *bond)
{
924
	struct slave *slave, *bestslave = NULL;
925
	struct list_head *iter;
L
Linus Torvalds 已提交
926 927
	int mintime = bond->params.updelay;

928 929 930
	slave = bond_choose_primary_or_current(bond);
	if (slave)
		return slave;
L
Linus Torvalds 已提交
931

932 933 934
	bond_for_each_slave(bond, slave, iter) {
		if (slave->link == BOND_LINK_UP)
			return slave;
935
		if (slave->link == BOND_LINK_BACK && bond_slave_is_up(slave) &&
936 937 938
		    slave->delay < mintime) {
			mintime = slave->delay;
			bestslave = slave;
L
Linus Torvalds 已提交
939 940 941 942 943 944
		}
	}

	return bestslave;
}

945 946
static bool bond_should_notify_peers(struct bonding *bond)
{
947 948 949 950 951
	struct slave *slave;

	rcu_read_lock();
	slave = rcu_dereference(bond->curr_active_slave);
	rcu_read_unlock();
952

953 954
	netdev_dbg(bond->dev, "bond_should_notify_peers: slave %s\n",
		   slave ? slave->dev->name : "NULL");
955 956

	if (!slave || !bond->send_peer_notif ||
957 958
	    bond->send_peer_notif %
	    max(1, bond->params.peer_notif_delay) != 0 ||
959
	    !netif_carrier_ok(bond->dev) ||
960 961 962 963 964 965
	    test_bit(__LINK_STATE_LINKWATCH_PENDING, &slave->dev->state))
		return false;

	return true;
}

L
Linus Torvalds 已提交
966
/**
967
 * bond_change_active_slave - change the active slave into the specified one
L
Linus Torvalds 已提交
968
 * @bond: our bonding struct
969
 * @new_active: the new slave to make the active one
L
Linus Torvalds 已提交
970 971 972 973 974 975 976 977 978
 *
 * Set the new slave to the bond's settings and unset them on the old
 * curr_active_slave.
 * Setting include flags, mc-list, promiscuity, allmulti, etc.
 *
 * If @new's link state is %BOND_LINK_BACK we'll set it to %BOND_LINK_UP,
 * because it is apparently the best available slave we have, even though its
 * updelay hasn't timed out yet.
 *
979
 * Caller must hold RTNL.
L
Linus Torvalds 已提交
980
 */
981
void bond_change_active_slave(struct bonding *bond, struct slave *new_active)
L
Linus Torvalds 已提交
982
{
983 984
	struct slave *old_active;

985 986 987
	ASSERT_RTNL();

	old_active = rtnl_dereference(bond->curr_active_slave);
L
Linus Torvalds 已提交
988

S
Stephen Hemminger 已提交
989
	if (old_active == new_active)
L
Linus Torvalds 已提交
990 991
		return;

992
#ifdef CONFIG_XFRM_OFFLOAD
993 994
	if (old_active && bond->xs)
		bond_ipsec_del_sa(bond->xs);
995 996
#endif /* CONFIG_XFRM_OFFLOAD */

997
	if (new_active) {
998
		new_active->last_link_up = jiffies;
999

L
Linus Torvalds 已提交
1000
		if (new_active->link == BOND_LINK_BACK) {
1001
			if (bond_uses_primary(bond)) {
1002 1003
				slave_info(bond->dev, new_active->dev, "making interface the new active one %d ms earlier\n",
					   (bond->params.updelay - new_active->delay) * bond->params.miimon);
L
Linus Torvalds 已提交
1004 1005 1006
			}

			new_active->delay = 0;
1007 1008
			bond_set_slave_link_state(new_active, BOND_LINK_UP,
						  BOND_SLAVE_NOTIFY_NOW);
L
Linus Torvalds 已提交
1009

1010
			if (BOND_MODE(bond) == BOND_MODE_8023AD)
L
Linus Torvalds 已提交
1011 1012
				bond_3ad_handle_link_change(new_active, BOND_LINK_UP);

1013
			if (bond_is_lb(bond))
L
Linus Torvalds 已提交
1014 1015
				bond_alb_handle_link_change(bond, new_active, BOND_LINK_UP);
		} else {
1016
			if (bond_uses_primary(bond)) {
1017
				slave_info(bond->dev, new_active->dev, "making interface the new active one\n");
L
Linus Torvalds 已提交
1018 1019 1020 1021
			}
		}
	}

1022
	if (bond_uses_primary(bond))
1023
		bond_hw_addr_swap(bond, new_active, old_active);
L
Linus Torvalds 已提交
1024

1025
	if (bond_is_lb(bond)) {
L
Linus Torvalds 已提交
1026
		bond_alb_handle_active_change(bond, new_active);
1027
		if (old_active)
1028 1029
			bond_set_slave_inactive_flags(old_active,
						      BOND_SLAVE_NOTIFY_NOW);
1030
		if (new_active)
1031 1032
			bond_set_slave_active_flags(new_active,
						    BOND_SLAVE_NOTIFY_NOW);
L
Linus Torvalds 已提交
1033
	} else {
1034
		rcu_assign_pointer(bond->curr_active_slave, new_active);
L
Linus Torvalds 已提交
1035
	}
J
Jay Vosburgh 已提交
1036

1037
	if (BOND_MODE(bond) == BOND_MODE_ACTIVEBACKUP) {
S
Stephen Hemminger 已提交
1038
		if (old_active)
1039 1040
			bond_set_slave_inactive_flags(old_active,
						      BOND_SLAVE_NOTIFY_NOW);
J
Jay Vosburgh 已提交
1041 1042

		if (new_active) {
1043 1044
			bool should_notify_peers = false;

1045 1046
			bond_set_slave_active_flags(new_active,
						    BOND_SLAVE_NOTIFY_NOW);
1047

1048 1049 1050
			if (bond->params.fail_over_mac)
				bond_do_fail_over_mac(bond, new_active,
						      old_active);
1051

1052 1053
			if (netif_running(bond->dev)) {
				bond->send_peer_notif =
1054 1055
					bond->params.num_peer_notif *
					max(1, bond->params.peer_notif_delay);
1056 1057 1058 1059
				should_notify_peers =
					bond_should_notify_peers(bond);
			}

1060
			call_netdevice_notifiers(NETDEV_BONDING_FAILOVER, bond->dev);
1061 1062
			if (should_notify_peers) {
				bond->send_peer_notif--;
1063 1064
				call_netdevice_notifiers(NETDEV_NOTIFY_PEERS,
							 bond->dev);
1065
			}
1066
		}
J
Jay Vosburgh 已提交
1067
	}
1068

1069 1070 1071 1072 1073 1074 1075
#ifdef CONFIG_XFRM_OFFLOAD
	if (new_active && bond->xs) {
		xfrm_dev_state_flush(dev_net(bond->dev), bond->dev, true);
		bond_ipsec_add_sa(bond->xs);
	}
#endif /* CONFIG_XFRM_OFFLOAD */

1076
	/* resend IGMP joins since active slave has changed or
1077 1078
	 * all were sent on curr_active_slave.
	 * resend only if bond is brought up with the affected
1079 1080
	 * bonding modes and the retransmission is enabled
	 */
1081
	if (netif_running(bond->dev) && (bond->params.resend_igmp > 0) &&
1082
	    ((bond_uses_primary(bond) && new_active) ||
1083
	     BOND_MODE(bond) == BOND_MODE_ROUNDROBIN)) {
1084
		bond->igmp_retrans = bond->params.resend_igmp;
1085
		queue_delayed_work(bond->wq, &bond->mcast_work, 1);
1086
	}
L
Linus Torvalds 已提交
1087 1088 1089 1090 1091 1092
}

/**
 * bond_select_active_slave - select a new active slave, if needed
 * @bond: our bonding struct
 *
S
Stephen Hemminger 已提交
1093
 * This functions should be called when one of the following occurs:
L
Linus Torvalds 已提交
1094 1095 1096 1097
 * - The old curr_active_slave has been released or lost its link.
 * - The primary_slave has got its link back.
 * - A slave has got its link back and there's no old curr_active_slave.
 *
1098
 * Caller must hold RTNL.
L
Linus Torvalds 已提交
1099
 */
1100
void bond_select_active_slave(struct bonding *bond)
L
Linus Torvalds 已提交
1101 1102
{
	struct slave *best_slave;
1103
	int rv;
L
Linus Torvalds 已提交
1104

1105 1106
	ASSERT_RTNL();

L
Linus Torvalds 已提交
1107
	best_slave = bond_find_best_slave(bond);
1108
	if (best_slave != rtnl_dereference(bond->curr_active_slave)) {
L
Linus Torvalds 已提交
1109
		bond_change_active_slave(bond, best_slave);
1110 1111 1112 1113
		rv = bond_set_carrier(bond);
		if (!rv)
			return;

Z
Zhang Shengju 已提交
1114
		if (netif_carrier_ok(bond->dev))
1115
			netdev_info(bond->dev, "active interface up!\n");
Z
Zhang Shengju 已提交
1116
		else
1117
			netdev_info(bond->dev, "now running without any active interface!\n");
L
Linus Torvalds 已提交
1118 1119 1120
	}
}

1121
#ifdef CONFIG_NET_POLL_CONTROLLER
1122
static inline int slave_enable_netpoll(struct slave *slave)
1123
{
1124 1125
	struct netpoll *np;
	int err = 0;
1126

1127
	np = kzalloc(sizeof(*np), GFP_KERNEL);
1128 1129 1130 1131
	err = -ENOMEM;
	if (!np)
		goto out;

1132
	err = __netpoll_setup(np, slave->dev);
1133 1134 1135
	if (err) {
		kfree(np);
		goto out;
1136
	}
1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148
	slave->np = np;
out:
	return err;
}
/* Detach and free the netpoll instance of @slave, if it has one.
 * slave->np is cleared before the instance is released.
 */
static inline void slave_disable_netpoll(struct slave *slave)
{
	struct netpoll *np = slave->np;

	if (np) {
		slave->np = NULL;
		__netpoll_free(np);
	}
}
1152 1153 1154

/* Poll every slave device that is up.  In 802.3ad mode nothing is polled
 * when the active aggregator info cannot be obtained, and slaves attached
 * to an aggregator other than the active one are skipped.
 */
static void bond_poll_controller(struct net_device *bond_dev)
{
	struct bonding *bond = netdev_priv(bond_dev);
	struct slave *slave = NULL;
	struct ad_info ad_info;
	struct list_head *iter;

	if (BOND_MODE(bond) == BOND_MODE_8023AD &&
	    bond_3ad_get_active_agg_info(bond, &ad_info))
		return;

	bond_for_each_slave_rcu(bond, slave, iter) {
		if (!bond_slave_is_up(slave))
			continue;

		if (BOND_MODE(bond) == BOND_MODE_8023AD) {
			struct aggregator *agg;

			agg = SLAVE_AD_INFO(slave)->port.aggregator;
			if (agg &&
			    agg->aggregator_identifier != ad_info.aggregator_id)
				continue;
		}

		netpoll_poll_dev(slave->dev);
	}
}

1181
static void bond_netpoll_cleanup(struct net_device *bond_dev)
1182
{
1183
	struct bonding *bond = netdev_priv(bond_dev);
1184
	struct list_head *iter;
1185 1186
	struct slave *slave;

1187
	bond_for_each_slave(bond, slave, iter)
1188
		if (bond_slave_is_up(slave))
1189
			slave_disable_netpoll(slave);
1190
}
1191

1192
static int bond_netpoll_setup(struct net_device *dev, struct netpoll_info *ni)
1193 1194
{
	struct bonding *bond = netdev_priv(dev);
1195
	struct list_head *iter;
1196
	struct slave *slave;
1197
	int err = 0;
1198

1199
	bond_for_each_slave(bond, slave, iter) {
1200 1201
		err = slave_enable_netpoll(slave);
		if (err) {
1202
			bond_netpoll_cleanup(dev);
1203
			break;
1204 1205
		}
	}
1206
	return err;
1207
}
1208 1209 1210 1211 1212 1213 1214 1215
#else
/* CONFIG_NET_POLL_CONTROLLER is not set: nothing to enable, report success. */
static inline int slave_enable_netpoll(struct slave *slave)
{
	return 0;
}
/* CONFIG_NET_POLL_CONTROLLER is not set: nothing to disable. */
static inline void slave_disable_netpoll(struct slave *slave)
{
}
1216 1217 1218 1219 1220
/* CONFIG_NET_POLL_CONTROLLER is not set: nothing to clean up. */
static void bond_netpoll_cleanup(struct net_device *bond_dev)
{
}
#endif

L
Linus Torvalds 已提交
1221 1222
/*---------------------------------- IOCTL ----------------------------------*/

1223
/* Compute the feature set the bond device may advertise.
 *
 * When TLS device offload is built in, BOND_TLS_FEATURES is set or cleared
 * according to bond_sk_check().  The resulting request is then used as the
 * mask while every slave's features are folded in with
 * netdev_increment_features(), and netdev_add_tso_features() is applied to
 * the result.
 */
static netdev_features_t bond_fix_features(struct net_device *dev,
					   netdev_features_t features)
{
	struct bonding *bond = netdev_priv(dev);
	netdev_features_t wanted;
	struct list_head *iter;
	struct slave *slave;

#if IS_ENABLED(CONFIG_TLS_DEVICE)
	if (!bond_sk_check(bond))
		features &= ~BOND_TLS_FEATURES;
	else
		features |= BOND_TLS_FEATURES;
#endif

	/* remember the request before it is rewritten below */
	wanted = features;

	features = (features & ~NETIF_F_ONE_FOR_ALL) | NETIF_F_ALL_FOR_ALL;

	bond_for_each_slave(bond, slave, iter)
		features = netdev_increment_features(features,
						     slave->dev->features,
						     wanted);

	return netdev_add_tso_features(features, wanted);
}

1253
/* Initial value and mask used for the bond's vlan_features */
#define BOND_VLAN_FEATURES	(NETIF_F_HW_CSUM | \
				 NETIF_F_SG | \
				 NETIF_F_FRAGLIST | \
				 NETIF_F_GSO_SOFTWARE | \
				 NETIF_F_HIGHDMA | \
				 NETIF_F_LRO)

/* Initial value and mask used for the bond's hw_enc_features */
#define BOND_ENC_FEATURES	(NETIF_F_HW_CSUM | \
				 NETIF_F_SG | \
				 NETIF_F_RXCSUM | \
				 NETIF_F_GSO_SOFTWARE)

/* Initial value and mask used for the bond's mpls_features */
#define BOND_MPLS_FEATURES	(NETIF_F_HW_CSUM | \
				 NETIF_F_SG | \
				 NETIF_F_GSO_SOFTWARE)
1262

1263

1264 1265
static void bond_compute_features(struct bonding *bond)
{
1266 1267
	unsigned int dst_release_flag = IFF_XMIT_DST_RELEASE |
					IFF_XMIT_DST_RELEASE_PERM;
1268
	netdev_features_t vlan_features = BOND_VLAN_FEATURES;
1269
	netdev_features_t enc_features  = BOND_ENC_FEATURES;
1270 1271 1272
#ifdef CONFIG_XFRM_OFFLOAD
	netdev_features_t xfrm_features  = BOND_XFRM_FEATURES;
#endif /* CONFIG_XFRM_OFFLOAD */
1273
	netdev_features_t mpls_features  = BOND_MPLS_FEATURES;
1274 1275 1276
	struct net_device *bond_dev = bond->dev;
	struct list_head *iter;
	struct slave *slave;
1277
	unsigned short max_hard_header_len = ETH_HLEN;
1278 1279
	unsigned int gso_max_size = GSO_MAX_SIZE;
	u16 gso_max_segs = GSO_MAX_SEGS;
1280

1281
	if (!bond_has_slaves(bond))
1282
		goto done;
1283
	vlan_features &= NETIF_F_ALL_FOR_ALL;
1284
	mpls_features &= NETIF_F_ALL_FOR_ALL;
1285

1286
	bond_for_each_slave(bond, slave, iter) {
1287
		vlan_features = netdev_increment_features(vlan_features,
1288 1289
			slave->dev->vlan_features, BOND_VLAN_FEATURES);

1290 1291 1292
		enc_features = netdev_increment_features(enc_features,
							 slave->dev->hw_enc_features,
							 BOND_ENC_FEATURES);
1293

1294 1295 1296 1297 1298 1299
#ifdef CONFIG_XFRM_OFFLOAD
		xfrm_features = netdev_increment_features(xfrm_features,
							  slave->dev->hw_enc_features,
							  BOND_XFRM_FEATURES);
#endif /* CONFIG_XFRM_OFFLOAD */

1300 1301 1302 1303
		mpls_features = netdev_increment_features(mpls_features,
							  slave->dev->mpls_features,
							  BOND_MPLS_FEATURES);

1304
		dst_release_flag &= slave->dev->priv_flags;
1305 1306
		if (slave->dev->hard_header_len > max_hard_header_len)
			max_hard_header_len = slave->dev->hard_header_len;
1307 1308 1309

		gso_max_size = min(gso_max_size, slave->dev->gso_max_size);
		gso_max_segs = min(gso_max_segs, slave->dev->gso_max_segs);
1310
	}
1311
	bond_dev->hard_header_len = max_hard_header_len;
1312

1313
done:
1314
	bond_dev->vlan_features = vlan_features;
1315
	bond_dev->hw_enc_features = enc_features | NETIF_F_GSO_ENCAP_ALL |
1316
				    NETIF_F_HW_VLAN_CTAG_TX |
1317
				    NETIF_F_HW_VLAN_STAG_TX;
1318 1319 1320
#ifdef CONFIG_XFRM_OFFLOAD
	bond_dev->hw_enc_features |= xfrm_features;
#endif /* CONFIG_XFRM_OFFLOAD */
1321
	bond_dev->mpls_features = mpls_features;
1322 1323
	bond_dev->gso_max_segs = gso_max_segs;
	netif_set_gso_max_size(bond_dev, gso_max_size);
1324

1325 1326 1327 1328
	bond_dev->priv_flags &= ~IFF_XMIT_DST_RELEASE;
	if ((bond_dev->priv_flags & IFF_XMIT_DST_RELEASE_PERM) &&
	    dst_release_flag == (IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM))
		bond_dev->priv_flags |= IFF_XMIT_DST_RELEASE;
1329

1330
	netdev_change_features(bond_dev);
1331 1332
}

1333 1334 1335
/* Copy the link-layer identity of @slave_dev onto @bond_dev: header ops,
 * device type, header length, needed headroom, address length and the
 * broadcast address (addr_len bytes of it).
 */
static void bond_setup_by_slave(struct net_device *bond_dev,
				struct net_device *slave_dev)
{
	bond_dev->type = slave_dev->type;
	bond_dev->header_ops = slave_dev->header_ops;

	bond_dev->hard_header_len = slave_dev->hard_header_len;
	bond_dev->needed_headroom = slave_dev->needed_headroom;

	bond_dev->addr_len = slave_dev->addr_len;
	memcpy(bond_dev->broadcast, slave_dev->broadcast,
	       slave_dev->addr_len);
}

1347
/* On bonding slaves other than the currently active slave, suppress
1348
 * duplicates except for alb non-mcast/bcast.
1349 1350
 */
static bool bond_should_deliver_exact_match(struct sk_buff *skb,
1351 1352
					    struct slave *slave,
					    struct bonding *bond)
1353
{
1354
	if (bond_is_slave_inactive(slave)) {
1355
		if (BOND_MODE(bond) == BOND_MODE_ALB &&
1356 1357 1358 1359 1360 1361 1362 1363
		    skb->pkt_type != PACKET_BROADCAST &&
		    skb->pkt_type != PACKET_MULTICAST)
			return false;
		return true;
	}
	return false;
}

1364
static rx_handler_result_t bond_handle_frame(struct sk_buff **pskb)
1365
{
1366
	struct sk_buff *skb = *pskb;
1367
	struct slave *slave;
1368
	struct bonding *bond;
1369 1370
	int (*recv_probe)(const struct sk_buff *, struct bonding *,
			  struct slave *);
1371
	int ret = RX_HANDLER_ANOTHER;
1372

1373 1374 1375 1376 1377
	skb = skb_share_check(skb, GFP_ATOMIC);
	if (unlikely(!skb))
		return RX_HANDLER_CONSUMED;

	*pskb = skb;
1378

J
Jiri Pirko 已提交
1379 1380
	slave = bond_slave_get_rcu(skb->dev);
	bond = slave->bond;
1381

1382
	recv_probe = READ_ONCE(bond->recv_probe);
1383
	if (recv_probe) {
1384 1385 1386 1387
		ret = recv_probe(skb, bond, slave);
		if (ret == RX_HANDLER_CONSUMED) {
			consume_skb(skb);
			return ret;
1388 1389 1390
		}
	}

1391 1392 1393 1394 1395 1396 1397 1398 1399 1400
	/*
	 * For packets determined by bond_should_deliver_exact_match() call to
	 * be suppressed we want to make an exception for link-local packets.
	 * This is necessary for e.g. LLDP daemons to be able to monitor
	 * inactive slave links without being forced to bind to them
	 * explicitly.
	 *
	 * At the same time, packets that are passed to the bonding master
	 * (including link-local ones) can have their originating interface
	 * determined via PACKET_ORIGDEV socket option.
1401
	 */
1402 1403 1404
	if (bond_should_deliver_exact_match(skb, slave, bond)) {
		if (is_link_local_ether_addr(eth_hdr(skb)->h_dest))
			return RX_HANDLER_PASS;
1405
		return RX_HANDLER_EXACT;
1406
	}
1407

J
Jiri Pirko 已提交
1408
	skb->dev = bond->dev;
1409

1410
	if (BOND_MODE(bond) == BOND_MODE_ALB &&
1411
	    netif_is_bridge_port(bond->dev) &&
1412 1413
	    skb->pkt_type == PACKET_HOST) {

1414 1415 1416
		if (unlikely(skb_cow_head(skb,
					  skb->data - skb_mac_header(skb)))) {
			kfree_skb(skb);
1417
			return RX_HANDLER_CONSUMED;
1418
		}
1419 1420
		bond_hw_addr_copy(eth_hdr(skb)->h_dest, bond->dev->dev_addr,
				  bond->dev->addr_len);
1421 1422
	}

1423
	return ret;
1424 1425
}

1426
static enum netdev_lag_tx_type bond_lag_tx_type(struct bonding *bond)
1427
{
1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442
	switch (BOND_MODE(bond)) {
	case BOND_MODE_ROUNDROBIN:
		return NETDEV_LAG_TX_TYPE_ROUNDROBIN;
	case BOND_MODE_ACTIVEBACKUP:
		return NETDEV_LAG_TX_TYPE_ACTIVEBACKUP;
	case BOND_MODE_BROADCAST:
		return NETDEV_LAG_TX_TYPE_BROADCAST;
	case BOND_MODE_XOR:
	case BOND_MODE_8023AD:
		return NETDEV_LAG_TX_TYPE_HASH;
	default:
		return NETDEV_LAG_TX_TYPE_UNKNOWN;
	}
}

1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459
static enum netdev_lag_hash bond_lag_hash_type(struct bonding *bond,
					       enum netdev_lag_tx_type type)
{
	if (type != NETDEV_LAG_TX_TYPE_HASH)
		return NETDEV_LAG_HASH_NONE;

	switch (bond->params.xmit_policy) {
	case BOND_XMIT_POLICY_LAYER2:
		return NETDEV_LAG_HASH_L2;
	case BOND_XMIT_POLICY_LAYER34:
		return NETDEV_LAG_HASH_L34;
	case BOND_XMIT_POLICY_LAYER23:
		return NETDEV_LAG_HASH_L23;
	case BOND_XMIT_POLICY_ENCAP23:
		return NETDEV_LAG_HASH_E23;
	case BOND_XMIT_POLICY_ENCAP34:
		return NETDEV_LAG_HASH_E34;
1460 1461
	case BOND_XMIT_POLICY_VLAN_SRCMAC:
		return NETDEV_LAG_HASH_VLAN_SRCMAC;
1462 1463 1464 1465 1466
	default:
		return NETDEV_LAG_HASH_UNKNOWN;
	}
}

1467 1468
static int bond_master_upper_dev_link(struct bonding *bond, struct slave *slave,
				      struct netlink_ext_ack *extack)
1469 1470
{
	struct netdev_lag_upper_info lag_upper_info;
1471
	enum netdev_lag_tx_type type;
1472

1473 1474 1475
	type = bond_lag_tx_type(bond);
	lag_upper_info.tx_type = type;
	lag_upper_info.hash_type = bond_lag_hash_type(bond, type);
1476 1477 1478

	return netdev_master_upper_dev_link(slave->dev, bond->dev, slave,
					    &lag_upper_info, extack);
1479 1480
}

1481
static void bond_upper_dev_unlink(struct bonding *bond, struct slave *slave)
1482
{
1483 1484
	netdev_upper_dev_unlink(slave->dev, bond->dev);
	slave->dev->flags &= ~IFF_SLAVE;
1485 1486
}

1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519
/* kobject release callback for a slave: waits for/cancels its pending
 * notify work, frees the 802.3ad per-slave info when the bond is in
 * 802.3ad mode, and finally frees the slave itself.
 */
static void slave_kobj_release(struct kobject *kobj)
{
	struct slave *slave = to_slave(kobj);
	struct bonding *bond;

	cancel_delayed_work_sync(&slave->notify_work);

	bond = bond_get_bond_by_slave(slave);
	if (BOND_MODE(bond) == BOND_MODE_8023AD)
		kfree(SLAVE_AD_INFO(slave));

	kfree(slave);
}

/* kobject type of the per-slave kobject: slave_kobj_release() frees the
 * slave, and the sysfs ops are hooked up only when sysfs is configured.
 */
static struct kobj_type slave_ktype = {
	.release = slave_kobj_release,
#ifdef CONFIG_SYSFS
	.sysfs_ops = &slave_sysfs_ops,
#endif
};

/* Initialise the slave's kobject and add it as "bonding_slave" below the
 * slave device's kobject.  On failure the reference is dropped with
 * kobject_put() (which ends up in slave_kobj_release()) and the error is
 * returned; 0 on success.
 */
static int bond_kobj_init(struct slave *slave)
{
	int err = kobject_init_and_add(&slave->kobj, &slave_ktype,
				       &slave->dev->dev.kobj, "bonding_slave");

	if (err)
		kobject_put(&slave->kobj);

	return err;
}

static struct slave *bond_alloc_slave(struct bonding *bond,
				      struct net_device *slave_dev)
1520 1521 1522
{
	struct slave *slave = NULL;

Z
Zhang Shengju 已提交
1523
	slave = kzalloc(sizeof(*slave), GFP_KERNEL);
1524 1525 1526
	if (!slave)
		return NULL;

1527 1528 1529 1530 1531 1532
	slave->bond = bond;
	slave->dev = slave_dev;

	if (bond_kobj_init(slave))
		return NULL;

1533
	if (BOND_MODE(bond) == BOND_MODE_8023AD) {
1534 1535 1536
		SLAVE_AD_INFO(slave) = kzalloc(sizeof(struct ad_slave_info),
					       GFP_KERNEL);
		if (!SLAVE_AD_INFO(slave)) {
1537
			kobject_put(&slave->kobj);
1538 1539 1540
			return NULL;
		}
	}
1541 1542
	INIT_DELAYED_WORK(&slave->notify_work, bond_netdev_notify_work);

1543 1544 1545
	return slave;
}

1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560
/* Fill @info with the bond's mode, miimon value and slave count. */
static void bond_fill_ifbond(struct bonding *bond, struct ifbond *info)
{
	info->num_slaves = bond->slave_cnt;
	info->miimon = bond->params.miimon;
	info->bond_mode = BOND_MODE(bond);
}

/* Fill @info with the slave's device name, link state, bonding state and
 * link failure count.
 */
static void bond_fill_ifslave(struct slave *slave, struct ifslave *info)
{
	info->link_failure_count = slave->link_failure_count;
	info->state = bond_slave_state(slave);
	info->link = slave->link;
	strcpy(info->slave_name, slave->dev->name);
}

1561 1562
static void bond_netdev_notify_work(struct work_struct *_work)
{
1563 1564 1565 1566 1567
	struct slave *slave = container_of(_work, struct slave,
					   notify_work.work);

	if (rtnl_trylock()) {
		struct netdev_bonding_info binfo;
1568

1569 1570 1571 1572 1573 1574 1575
		bond_fill_ifslave(slave, &binfo.slave);
		bond_fill_ifbond(slave->bond, &binfo.master);
		netdev_bonding_info_change(slave->dev, &binfo);
		rtnl_unlock();
	} else {
		queue_delayed_work(slave->bond->wq, &slave->notify_work, 1);
	}
1576 1577 1578 1579
}

void bond_queue_slave_event(struct slave *slave)
{
1580
	queue_delayed_work(slave->bond->wq, &slave->notify_work, 0);
1581 1582
}

1583 1584 1585 1586 1587 1588 1589 1590 1591 1592
void bond_lower_state_changed(struct slave *slave)
{
	struct netdev_lag_lower_state_info info;

	info.link_up = slave->link == BOND_LINK_UP ||
		       slave->link == BOND_LINK_FAIL;
	info.tx_enabled = bond_is_active_slave(slave);
	netdev_lower_state_changed(slave->dev, &info);
}

L
Linus Torvalds 已提交
1593
/* enslave device <slave> to bond device <master> */
D
David Ahern 已提交
1594 1595
int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev,
		 struct netlink_ext_ack *extack)
L
Linus Torvalds 已提交
1596
{
1597
	struct bonding *bond = netdev_priv(bond_dev);
1598
	const struct net_device_ops *slave_ops = slave_dev->netdev_ops;
1599
	struct slave *new_slave = NULL, *prev_slave;
1600
	struct sockaddr_storage ss;
L
Linus Torvalds 已提交
1601
	int link_reporting;
1602
	int res = 0, i;
L
Linus Torvalds 已提交
1603

1604 1605 1606
	if (!bond->params.use_carrier &&
	    slave_dev->ethtool_ops->get_link == NULL &&
	    slave_ops->ndo_do_ioctl == NULL) {
1607
		slave_warn(bond_dev, slave_dev, "no link monitoring support\n");
L
Linus Torvalds 已提交
1608 1609
	}

M
Mahesh Bandewar 已提交
1610 1611
	/* already in-use? */
	if (netdev_is_rx_handler_busy(slave_dev)) {
1612
		NL_SET_ERR_MSG(extack, "Device is in use and cannot be enslaved");
1613 1614
		slave_err(bond_dev, slave_dev,
			  "Error: Device is in use and cannot be enslaved\n");
L
Linus Torvalds 已提交
1615 1616 1617
		return -EBUSY;
	}

1618
	if (bond_dev == slave_dev) {
1619
		NL_SET_ERR_MSG(extack, "Cannot enslave bond to itself.");
1620
		netdev_err(bond_dev, "cannot enslave bond to itself.\n");
1621 1622 1623
		return -EPERM;
	}

L
Linus Torvalds 已提交
1624 1625 1626
	/* vlan challenged mutual exclusion */
	/* no need to lock since we're protected by rtnl_lock */
	if (slave_dev->features & NETIF_F_VLAN_CHALLENGED) {
1627
		slave_dbg(bond_dev, slave_dev, "is NETIF_F_VLAN_CHALLENGED\n");
1628
		if (vlan_uses_dev(bond_dev)) {
1629
			NL_SET_ERR_MSG(extack, "Can not enslave VLAN challenged device to VLAN enabled bond");
1630
			slave_err(bond_dev, slave_dev, "Error: cannot enslave VLAN challenged slave on VLAN enabled bond\n");
L
Linus Torvalds 已提交
1631 1632
			return -EPERM;
		} else {
1633
			slave_warn(bond_dev, slave_dev, "enslaved VLAN challenged slave. Adding VLANs will be blocked as long as it is part of bond.\n");
L
Linus Torvalds 已提交
1634 1635
		}
	} else {
1636
		slave_dbg(bond_dev, slave_dev, "is !NETIF_F_VLAN_CHALLENGED\n");
L
Linus Torvalds 已提交
1637 1638
	}

1639 1640 1641
	if (slave_dev->features & NETIF_F_HW_ESP)
		slave_dbg(bond_dev, slave_dev, "is esp-hw-offload capable\n");

1642
	/* Old ifenslave binaries are no longer supported.  These can
S
Stephen Hemminger 已提交
1643
	 * be identified with moderate accuracy by the state of the slave:
1644 1645 1646
	 * the current ifenslave will set the interface down prior to
	 * enslaving it; the old ifenslave will not.
	 */
Y
yzhu1 已提交
1647
	if (slave_dev->flags & IFF_UP) {
1648
		NL_SET_ERR_MSG(extack, "Device can not be enslaved while up");
1649
		slave_err(bond_dev, slave_dev, "slave is up - this may be due to an out of date ifenslave\n");
1650
		return -EPERM;
1651
	}
L
Linus Torvalds 已提交
1652

1653 1654 1655 1656 1657 1658 1659
	/* set bonding device ether type by slave - bonding netdevices are
	 * created with ether_setup, so when the slave type is not ARPHRD_ETHER
	 * there is a need to override some of the type dependent attribs/funcs.
	 *
	 * bond ether type mutual exclusion - don't allow slaves of dissimilar
	 * ether type (eg ARPHRD_ETHER and ARPHRD_INFINIBAND) share the same bond
	 */
1660
	if (!bond_has_slaves(bond)) {
1661
		if (bond_dev->type != slave_dev->type) {
1662 1663
			slave_dbg(bond_dev, slave_dev, "change device type from %d to %d\n",
				  bond_dev->type, slave_dev->type);
1664

1665 1666
			res = call_netdevice_notifiers(NETDEV_PRE_TYPE_CHANGE,
						       bond_dev);
1667 1668
			res = notifier_to_errno(res);
			if (res) {
1669
				slave_err(bond_dev, slave_dev, "refused to change device type\n");
1670
				return -EBUSY;
1671
			}
1672

1673
			/* Flush unicast and multicast addresses */
1674
			dev_uc_flush(bond_dev);
1675
			dev_mc_flush(bond_dev);
1676

1677 1678
			if (slave_dev->type != ARPHRD_ETHER)
				bond_setup_by_slave(bond_dev, slave_dev);
1679
			else {
1680
				ether_setup(bond_dev);
1681 1682
				bond_dev->priv_flags &= ~IFF_TX_SKB_SHARING;
			}
1683

1684 1685
			call_netdevice_notifiers(NETDEV_POST_TYPE_CHANGE,
						 bond_dev);
1686
		}
1687
	} else if (bond_dev->type != slave_dev->type) {
1688
		NL_SET_ERR_MSG(extack, "Device type is different from other slaves");
1689 1690
		slave_err(bond_dev, slave_dev, "ether type (%d) is different from other slaves (%d), can not enslave it\n",
			  slave_dev->type, bond_dev->type);
1691
		return -EINVAL;
1692 1693
	}

1694 1695
	if (slave_dev->type == ARPHRD_INFINIBAND &&
	    BOND_MODE(bond) != BOND_MODE_ACTIVEBACKUP) {
1696
		NL_SET_ERR_MSG(extack, "Only active-backup mode is supported for infiniband slaves");
1697 1698
		slave_warn(bond_dev, slave_dev, "Type (%d) supports only active-backup mode\n",
			   slave_dev->type);
1699 1700 1701 1702 1703 1704
		res = -EOPNOTSUPP;
		goto err_undo_flags;
	}

	if (!slave_ops->ndo_set_mac_address ||
	    slave_dev->type == ARPHRD_INFINIBAND) {
1705
		slave_warn(bond_dev, slave_dev, "The slave device specified does not support setting the MAC address\n");
1706 1707 1708
		if (BOND_MODE(bond) == BOND_MODE_ACTIVEBACKUP &&
		    bond->params.fail_over_mac != BOND_FOM_ACTIVE) {
			if (!bond_has_slaves(bond)) {
1709
				bond->params.fail_over_mac = BOND_FOM_ACTIVE;
1710
				slave_warn(bond_dev, slave_dev, "Setting fail_over_mac to active for active-backup mode\n");
1711
			} else {
1712
				NL_SET_ERR_MSG(extack, "Slave device does not support setting the MAC address, but fail_over_mac is not set to active");
1713
				slave_err(bond_dev, slave_dev, "The slave device specified does not support setting the MAC address, but fail_over_mac is not set to active\n");
1714 1715
				res = -EOPNOTSUPP;
				goto err_undo_flags;
1716
			}
1717
		}
L
Linus Torvalds 已提交
1718 1719
	}

1720 1721
	call_netdevice_notifiers(NETDEV_JOIN, slave_dev);

1722
	/* If this is the first slave, then we need to set the master's hardware
1723 1724
	 * address to be the same as the slave's.
	 */
1725
	if (!bond_has_slaves(bond) &&
1726 1727 1728 1729 1730
	    bond->dev->addr_assign_type == NET_ADDR_RANDOM) {
		res = bond_set_dev_addr(bond->dev, slave_dev);
		if (res)
			goto err_undo_flags;
	}
1731

1732
	new_slave = bond_alloc_slave(bond, slave_dev);
L
Linus Torvalds 已提交
1733 1734 1735 1736
	if (!new_slave) {
		res = -ENOMEM;
		goto err_undo_flags;
	}
1737

1738
	/* Set the new_slave's queue_id to be zero.  Queue ID mapping
1739 1740 1741 1742
	 * is set via sysfs or module option if desired.
	 */
	new_slave->queue_id = 0;

1743 1744 1745 1746
	/* Save slave's original mtu and then set it to match the bond */
	new_slave->original_mtu = slave_dev->mtu;
	res = dev_set_mtu(slave_dev, bond->dev->mtu);
	if (res) {
1747
		slave_err(bond_dev, slave_dev, "Error %d calling dev_set_mtu\n", res);
1748 1749 1750
		goto err_free;
	}

1751
	/* Save slave's original ("permanent") mac address for modes
1752 1753 1754
	 * that need it, and for restoring it upon release, and then
	 * set it to the master's address
	 */
1755 1756
	bond_hw_addr_copy(new_slave->perm_hwaddr, slave_dev->dev_addr,
			  slave_dev->addr_len);
L
Linus Torvalds 已提交
1757

1758
	if (!bond->params.fail_over_mac ||
1759
	    BOND_MODE(bond) != BOND_MODE_ACTIVEBACKUP) {
1760
		/* Set slave to master's mac address.  The application already
1761 1762
		 * set the master's mac address to that of the first slave
		 */
1763 1764
		memcpy(ss.__data, bond_dev->dev_addr, bond_dev->addr_len);
		ss.ss_family = slave_dev->type;
1765 1766
		res = dev_set_mac_address(slave_dev, (struct sockaddr *)&ss,
					  extack);
1767
		if (res) {
1768
			slave_err(bond_dev, slave_dev, "Error %d calling set_mac_address\n", res);
1769
			goto err_restore_mtu;
1770
		}
1771
	}
L
Linus Torvalds 已提交
1772

1773 1774 1775
	/* set slave flag before open to prevent IPv6 addrconf */
	slave_dev->flags |= IFF_SLAVE;

1776
	/* open the slave since the application closed it */
1777
	res = dev_open(slave_dev, extack);
1778
	if (res) {
1779
		slave_err(bond_dev, slave_dev, "Opening slave failed\n");
1780
		goto err_restore_mac;
L
Linus Torvalds 已提交
1781 1782
	}

1783
	slave_dev->priv_flags |= IFF_BONDING;
1784 1785
	/* initialize slave stats */
	dev_get_stats(new_slave->dev, &new_slave->slave_stats);
L
Linus Torvalds 已提交
1786

1787
	if (bond_is_lb(bond)) {
L
Linus Torvalds 已提交
1788 1789 1790 1791
		/* bond_alb_init_slave() must be called before all other stages since
		 * it might fail and we do not want to have to undo everything
		 */
		res = bond_alb_init_slave(bond, new_slave);
S
Stephen Hemminger 已提交
1792
		if (res)
1793
			goto err_close;
L
Linus Torvalds 已提交
1794 1795
	}

1796 1797
	res = vlan_vids_add_by_dev(slave_dev, bond_dev);
	if (res) {
1798
		slave_err(bond_dev, slave_dev, "Couldn't add bond vlan ids\n");
1799
		goto err_close;
1800
	}
L
Linus Torvalds 已提交
1801

1802
	prev_slave = bond_last_slave(bond);
L
Linus Torvalds 已提交
1803 1804 1805 1806

	new_slave->delay = 0;
	new_slave->link_failure_count = 0;

1807 1808
	if (bond_update_speed_duplex(new_slave) &&
	    bond_needs_speed_duplex(bond))
1809
		new_slave->link = BOND_LINK_DOWN;
1810

1811
	new_slave->last_rx = jiffies -
1812
		(msecs_to_jiffies(bond->params.arp_interval) + 1);
1813
	for (i = 0; i < BOND_MAX_ARP_TARGETS; i++)
1814
		new_slave->target_last_arp_rx[i] = new_slave->last_rx;
1815

L
Linus Torvalds 已提交
1816 1817 1818 1819
	if (bond->params.miimon && !bond->params.use_carrier) {
		link_reporting = bond_check_dev_link(bond, slave_dev, 1);

		if ((link_reporting == -1) && !bond->params.arp_interval) {
1820
			/* miimon is set but a bonded network driver
L
Linus Torvalds 已提交
1821 1822 1823 1824 1825 1826 1827
			 * does not support ETHTOOL/MII and
			 * arp_interval is not set.  Note: if
			 * use_carrier is enabled, we will never go
			 * here (because netif_carrier is always
			 * supported); thus, we don't need to change
			 * the messages for netif_carrier.
			 */
1828
			slave_warn(bond_dev, slave_dev, "MII and ETHTOOL support not available for slave, and arp_interval/arp_ip_target module parameters not specified, thus bonding will not detect link failures! see bonding.txt for details\n");
L
Linus Torvalds 已提交
1829 1830
		} else if (link_reporting == -1) {
			/* unable get link status using mii/ethtool */
1831
			slave_warn(bond_dev, slave_dev, "can't get link status from slave; the network driver associated with this interface does not support MII or ETHTOOL link status reporting, thus miimon has no effect on this interface\n");
L
Linus Torvalds 已提交
1832 1833 1834 1835
		}
	}

	/* check for initial state */
1836
	new_slave->link = BOND_LINK_NOCHANGE;
1837 1838 1839
	if (bond->params.miimon) {
		if (bond_check_dev_link(bond, slave_dev, 0) == BMSR_LSTATUS) {
			if (bond->params.updelay) {
1840
				bond_set_slave_link_state(new_slave,
1841 1842
							  BOND_LINK_BACK,
							  BOND_SLAVE_NOTIFY_NOW);
1843 1844
				new_slave->delay = bond->params.updelay;
			} else {
1845
				bond_set_slave_link_state(new_slave,
1846 1847
							  BOND_LINK_UP,
							  BOND_SLAVE_NOTIFY_NOW);
1848
			}
L
Linus Torvalds 已提交
1849
		} else {
1850 1851
			bond_set_slave_link_state(new_slave, BOND_LINK_DOWN,
						  BOND_SLAVE_NOTIFY_NOW);
L
Linus Torvalds 已提交
1852
		}
1853
	} else if (bond->params.arp_interval) {
1854 1855
		bond_set_slave_link_state(new_slave,
					  (netif_carrier_ok(slave_dev) ?
1856 1857
					  BOND_LINK_UP : BOND_LINK_DOWN),
					  BOND_SLAVE_NOTIFY_NOW);
L
Linus Torvalds 已提交
1858
	} else {
1859 1860
		bond_set_slave_link_state(new_slave, BOND_LINK_UP,
					  BOND_SLAVE_NOTIFY_NOW);
L
Linus Torvalds 已提交
1861 1862
	}

1863
	if (new_slave->link != BOND_LINK_DOWN)
1864
		new_slave->last_link_up = jiffies;
1865 1866 1867
	slave_dbg(bond_dev, slave_dev, "Initial state of slave is BOND_LINK_%s\n",
		  new_slave->link == BOND_LINK_DOWN ? "DOWN" :
		  (new_slave->link == BOND_LINK_UP ? "UP" : "BACK"));
1868

1869
	if (bond_uses_primary(bond) && bond->params.primary[0]) {
L
Linus Torvalds 已提交
1870
		/* if there is a primary slave, remember it */
1871
		if (strcmp(bond->params.primary, new_slave->dev->name) == 0) {
1872
			rcu_assign_pointer(bond->primary_slave, new_slave);
1873 1874
			bond->force_primary = true;
		}
L
Linus Torvalds 已提交
1875 1876
	}

1877
	switch (BOND_MODE(bond)) {
L
Linus Torvalds 已提交
1878
	case BOND_MODE_ACTIVEBACKUP:
1879 1880
		bond_set_slave_inactive_flags(new_slave,
					      BOND_SLAVE_NOTIFY_NOW);
L
Linus Torvalds 已提交
1881 1882 1883 1884 1885 1886
		break;
	case BOND_MODE_8023AD:
		/* in 802.3ad mode, the internal mechanism
		 * will activate the slaves in the selected
		 * aggregator
		 */
1887
		bond_set_slave_inactive_flags(new_slave, BOND_SLAVE_NOTIFY_NOW);
L
Linus Torvalds 已提交
1888
		/* if this is the first slave */
1889
		if (!prev_slave) {
1890
			SLAVE_AD_INFO(new_slave)->id = 1;
L
Linus Torvalds 已提交
1891 1892 1893
			/* Initialize AD with the number of times that the AD timer is called in 1 second
			 * can be called only after the mac address of the bond is set
			 */
1894
			bond_3ad_initialize(bond, 1000/AD_TIMER_INTERVAL);
L
Linus Torvalds 已提交
1895
		} else {
1896 1897
			SLAVE_AD_INFO(new_slave)->id =
				SLAVE_AD_INFO(prev_slave)->id + 1;
L
Linus Torvalds 已提交
1898 1899 1900 1901 1902 1903
		}

		bond_3ad_bind_slave(new_slave);
		break;
	case BOND_MODE_TLB:
	case BOND_MODE_ALB:
J
Jiri Pirko 已提交
1904
		bond_set_active_slave(new_slave);
1905
		bond_set_slave_inactive_flags(new_slave, BOND_SLAVE_NOTIFY_NOW);
L
Linus Torvalds 已提交
1906 1907
		break;
	default:
1908
		slave_dbg(bond_dev, slave_dev, "This slave is always active in trunk mode\n");
L
Linus Torvalds 已提交
1909 1910

		/* always active in trunk mode */
J
Jiri Pirko 已提交
1911
		bond_set_active_slave(new_slave);
L
Linus Torvalds 已提交
1912 1913 1914 1915 1916

		/* In trunking mode there is little meaning to curr_active_slave
		 * anyway (it holds no special properties of the bond device),
		 * so we can change it without calling change_active_interface()
		 */
1917 1918
		if (!rcu_access_pointer(bond->curr_active_slave) &&
		    new_slave->link == BOND_LINK_UP)
1919
			rcu_assign_pointer(bond->curr_active_slave, new_slave);
S
Stephen Hemminger 已提交
1920

L
Linus Torvalds 已提交
1921 1922 1923
		break;
	} /* switch(bond_mode) */

1924
#ifdef CONFIG_NET_POLL_CONTROLLER
1925
	if (bond->dev->npinfo) {
1926
		if (slave_enable_netpoll(new_slave)) {
1927
			slave_info(bond_dev, slave_dev, "master_dev is using netpoll, but new slave device does not support netpoll\n");
1928
			res = -EBUSY;
1929
			goto err_detach;
1930
		}
1931 1932
	}
#endif
1933

1934 1935 1936
	if (!(bond_dev->features & NETIF_F_LRO))
		dev_disable_lro(slave_dev);

J
Jiri Pirko 已提交
1937 1938 1939
	res = netdev_rx_handler_register(slave_dev, bond_handle_frame,
					 new_slave);
	if (res) {
1940
		slave_dbg(bond_dev, slave_dev, "Error %d calling netdev_rx_handler_register\n", res);
1941
		goto err_detach;
J
Jiri Pirko 已提交
1942 1943
	}

1944
	res = bond_master_upper_dev_link(bond, new_slave, extack);
1945
	if (res) {
1946
		slave_dbg(bond_dev, slave_dev, "Error %d calling bond_master_upper_dev_link\n", res);
1947 1948 1949
		goto err_unregister;
	}

1950 1951
	bond_lower_state_changed(new_slave);

1952 1953
	res = bond_sysfs_slave_add(new_slave);
	if (res) {
1954
		slave_dbg(bond_dev, slave_dev, "Error %d calling bond_sysfs_slave_add\n", res);
1955 1956 1957
		goto err_upper_unlink;
	}

1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971
	/* If the mode uses primary, then the following is handled by
	 * bond_change_active_slave().
	 */
	if (!bond_uses_primary(bond)) {
		/* set promiscuity level to new slave */
		if (bond_dev->flags & IFF_PROMISC) {
			res = dev_set_promiscuity(slave_dev, 1);
			if (res)
				goto err_sysfs_del;
		}

		/* set allmulti level to new slave */
		if (bond_dev->flags & IFF_ALLMULTI) {
			res = dev_set_allmulti(slave_dev, 1);
1972 1973 1974
			if (res) {
				if (bond_dev->flags & IFF_PROMISC)
					dev_set_promiscuity(slave_dev, -1);
1975
				goto err_sysfs_del;
1976
			}
1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991
		}

		netif_addr_lock_bh(bond_dev);
		dev_mc_sync_multiple(slave_dev, bond_dev);
		dev_uc_sync_multiple(slave_dev, bond_dev);
		netif_addr_unlock_bh(bond_dev);

		if (BOND_MODE(bond) == BOND_MODE_8023AD) {
			/* add lacpdu mc addr to mc list */
			u8 lacpdu_multicast[ETH_ALEN] = MULTICAST_LACPDU_ADDR;

			dev_mc_add(slave_dev, lacpdu_multicast);
		}
	}

1992 1993 1994 1995
	bond->slave_cnt++;
	bond_compute_features(bond);
	bond_set_carrier(bond);

1996
	if (bond_uses_primary(bond)) {
1997
		block_netpoll_tx();
1998
		bond_select_active_slave(bond);
1999
		unblock_netpoll_tx();
2000
	}
2001

2002
	if (bond_mode_can_use_xmit_hash(bond))
2003 2004
		bond_update_slave_arr(bond, NULL);

2005

2006 2007 2008
	slave_info(bond_dev, slave_dev, "Enslaving as %s interface with %s link\n",
		   bond_is_active_slave(new_slave) ? "an active" : "a backup",
		   new_slave->link != BOND_LINK_DOWN ? "an up" : "a down");
L
Linus Torvalds 已提交
2009 2010

	/* enslave is successful */
2011
	bond_queue_slave_event(new_slave);
L
Linus Torvalds 已提交
2012 2013 2014
	return 0;

/* Undo stages on error */
2015 2016 2017
err_sysfs_del:
	bond_sysfs_slave_del(new_slave);

2018
err_upper_unlink:
2019
	bond_upper_dev_unlink(bond, new_slave);
2020

2021 2022 2023
err_unregister:
	netdev_rx_handler_unregister(slave_dev);

2024
err_detach:
2025
	vlan_vids_del_by_dev(slave_dev, bond_dev);
2026 2027
	if (rcu_access_pointer(bond->primary_slave) == new_slave)
		RCU_INIT_POINTER(bond->primary_slave, NULL);
2028
	if (rcu_access_pointer(bond->curr_active_slave) == new_slave) {
2029
		block_netpoll_tx();
2030
		bond_change_active_slave(bond, NULL);
2031
		bond_select_active_slave(bond);
2032
		unblock_netpoll_tx();
2033
	}
2034 2035
	/* either primary_slave or curr_active_slave might've changed */
	synchronize_rcu();
2036
	slave_disable_netpoll(new_slave);
2037

L
Linus Torvalds 已提交
2038
err_close:
2039 2040
	if (!netif_is_bond_master(slave_dev))
		slave_dev->priv_flags &= ~IFF_BONDING;
L
Linus Torvalds 已提交
2041 2042 2043
	dev_close(slave_dev);

err_restore_mac:
2044
	slave_dev->flags &= ~IFF_SLAVE;
2045
	if (!bond->params.fail_over_mac ||
2046
	    BOND_MODE(bond) != BOND_MODE_ACTIVEBACKUP) {
2047 2048 2049 2050
		/* XXX TODO - fom follow mode needs to change master's
		 * MAC if this slave's MAC is in use by the bond, or at
		 * least print a warning.
		 */
2051 2052 2053
		bond_hw_addr_copy(ss.__data, new_slave->perm_hwaddr,
				  new_slave->dev->addr_len);
		ss.ss_family = slave_dev->type;
2054
		dev_set_mac_address(slave_dev, (struct sockaddr *)&ss, NULL);
2055
	}
L
Linus Torvalds 已提交
2056

2057 2058 2059
err_restore_mtu:
	dev_set_mtu(slave_dev, new_slave->original_mtu);

L
Linus Torvalds 已提交
2060
err_free:
2061
	kobject_put(&new_slave->kobj);
L
Linus Torvalds 已提交
2062 2063

err_undo_flags:
2064
	/* Enslave of first slave has failed and we need to fix master's mac */
2065 2066 2067 2068 2069
	if (!bond_has_slaves(bond)) {
		if (ether_addr_equal_64bits(bond_dev->dev_addr,
					    slave_dev->dev_addr))
			eth_hw_addr_random(bond_dev);
		if (bond_dev->type != ARPHRD_ETHER) {
2070
			dev_close(bond_dev);
2071 2072 2073 2074 2075
			ether_setup(bond_dev);
			bond_dev->flags |= IFF_MASTER;
			bond_dev->priv_flags &= ~IFF_TX_SKB_SHARING;
		}
	}
S
Stephen Hemminger 已提交
2076

L
Linus Torvalds 已提交
2077 2078 2079
	return res;
}

2080
/* Try to release the slave device <slave> from the bond device <master>
L
Linus Torvalds 已提交
2081
 * It is legal to access curr_active_slave without a lock because all the function
2082
 * is RTNL-locked. If "all" is true it means that the function is being called
2083
 * while destroying a bond interface and all slaves are being released.
L
Linus Torvalds 已提交
2084 2085 2086 2087 2088 2089 2090
 *
 * The rules for slave state should be:
 *   for Active/Backup:
 *     Active stays on all backups go down
 *   for Bonded connections:
 *     The first up interface should be left on and all others downed.
 */
2091 2092
static int __bond_release_one(struct net_device *bond_dev,
			      struct net_device *slave_dev,
2093
			      bool all, bool unregister)
L
Linus Torvalds 已提交
2094
{
2095
	struct bonding *bond = netdev_priv(bond_dev);
L
Linus Torvalds 已提交
2096
	struct slave *slave, *oldcurrent;
2097
	struct sockaddr_storage ss;
2098
	int old_flags = bond_dev->flags;
2099
	netdev_features_t old_features = bond_dev->features;
L
Linus Torvalds 已提交
2100 2101 2102

	/* slave is not a slave or master is not master of this slave */
	if (!(slave_dev->flags & IFF_SLAVE) ||
2103
	    !netdev_has_upper_dev(slave_dev, bond_dev)) {
2104
		slave_dbg(bond_dev, slave_dev, "cannot release slave\n");
L
Linus Torvalds 已提交
2105 2106 2107
		return -EINVAL;
	}

2108
	block_netpoll_tx();
L
Linus Torvalds 已提交
2109 2110 2111 2112

	slave = bond_get_slave_by_dev(bond, slave_dev);
	if (!slave) {
		/* not a slave of this bond */
2113
		slave_info(bond_dev, slave_dev, "interface not enslaved\n");
2114
		unblock_netpoll_tx();
L
Linus Torvalds 已提交
2115 2116 2117
		return -EINVAL;
	}

2118 2119
	bond_set_slave_inactive_flags(slave, BOND_SLAVE_NOTIFY_NOW);

2120 2121
	bond_sysfs_slave_del(slave);

2122 2123 2124
	/* recompute stats just before removing the slave */
	bond_get_stats(bond->dev, &bond->bond_stats);

2125
	bond_upper_dev_unlink(bond, slave);
J
Jiri Pirko 已提交
2126 2127 2128 2129 2130
	/* unregister rx_handler early so bond_handle_frame wouldn't be called
	 * for this slave anymore.
	 */
	netdev_rx_handler_unregister(slave_dev);

2131
	if (BOND_MODE(bond) == BOND_MODE_8023AD)
L
Linus Torvalds 已提交
2132 2133
		bond_3ad_unbind_slave(slave);

2134
	if (bond_mode_can_use_xmit_hash(bond))
2135 2136
		bond_update_slave_arr(bond, slave);

2137 2138
	slave_info(bond_dev, slave_dev, "Releasing %s interface\n",
		    bond_is_active_slave(slave) ? "active" : "backup");
L
Linus Torvalds 已提交
2139

2140
	oldcurrent = rcu_access_pointer(bond->curr_active_slave);
L
Linus Torvalds 已提交
2141

2142
	RCU_INIT_POINTER(bond->current_arp_slave, NULL);
L
Linus Torvalds 已提交
2143

2144
	if (!all && (!bond->params.fail_over_mac ||
2145
		     BOND_MODE(bond) != BOND_MODE_ACTIVEBACKUP)) {
2146
		if (ether_addr_equal_64bits(bond_dev->dev_addr, slave->perm_hwaddr) &&
2147
		    bond_has_slaves(bond))
2148 2149
			slave_warn(bond_dev, slave_dev, "the permanent HWaddr of slave - %pM - is still in use by bond - set the HWaddr of slave to a different address to avoid conflicts\n",
				   slave->perm_hwaddr);
2150 2151
	}

2152 2153
	if (rtnl_dereference(bond->primary_slave) == slave)
		RCU_INIT_POINTER(bond->primary_slave, NULL);
L
Linus Torvalds 已提交
2154

2155
	if (oldcurrent == slave)
L
Linus Torvalds 已提交
2156 2157
		bond_change_active_slave(bond, NULL);

2158
	if (bond_is_lb(bond)) {
L
Linus Torvalds 已提交
2159 2160 2161 2162 2163 2164 2165 2166
		/* Must be called only after the slave has been
		 * detached from the list and the curr_active_slave
		 * has been cleared (if our_slave == old_current),
		 * but before a new active slave is selected.
		 */
		bond_alb_deinit_slave(bond, slave);
	}

2167
	if (all) {
2168
		RCU_INIT_POINTER(bond->curr_active_slave, NULL);
2169
	} else if (oldcurrent == slave) {
2170
		/* Note that we hold RTNL over this sequence, so there
2171 2172 2173
		 * is no concern that another slave add/remove event
		 * will interfere.
		 */
L
Linus Torvalds 已提交
2174
		bond_select_active_slave(bond);
2175 2176
	}

2177
	if (!bond_has_slaves(bond)) {
2178
		bond_set_carrier(bond);
2179
		eth_hw_addr_random(bond_dev);
L
Linus Torvalds 已提交
2180 2181
	}

2182
	unblock_netpoll_tx();
2183
	synchronize_rcu();
2184
	bond->slave_cnt--;
L
Linus Torvalds 已提交
2185

2186
	if (!bond_has_slaves(bond)) {
2187
		call_netdevice_notifiers(NETDEV_CHANGEADDR, bond->dev);
2188 2189
		call_netdevice_notifiers(NETDEV_RELEASE, bond->dev);
	}
2190

2191 2192 2193
	bond_compute_features(bond);
	if (!(bond_dev->features & NETIF_F_VLAN_CHALLENGED) &&
	    (old_features & NETIF_F_VLAN_CHALLENGED))
2194
		slave_info(bond_dev, slave_dev, "last VLAN challenged slave left bond - VLAN blocking is removed\n");
2195

2196
	vlan_vids_del_by_dev(slave_dev, bond_dev);
L
Linus Torvalds 已提交
2197

2198
	/* If the mode uses primary, then this case was handled above by
2199
	 * bond_change_active_slave(..., NULL)
L
Linus Torvalds 已提交
2200
	 */
2201
	if (!bond_uses_primary(bond)) {
2202 2203 2204 2205 2206 2207 2208 2209
		/* unset promiscuity level from slave
		 * NOTE: The NETDEV_CHANGEADDR call above may change the value
		 * of the IFF_PROMISC flag in the bond_dev, but we need the
		 * value of that flag before that change, as that was the value
		 * when this slave was attached, so we cache at the start of the
		 * function and use it here. Same goes for ALLMULTI below
		 */
		if (old_flags & IFF_PROMISC)
L
Linus Torvalds 已提交
2210 2211 2212
			dev_set_promiscuity(slave_dev, -1);

		/* unset allmulti level from slave */
2213
		if (old_flags & IFF_ALLMULTI)
L
Linus Torvalds 已提交
2214 2215
			dev_set_allmulti(slave_dev, -1);

2216
		bond_hw_addr_flush(bond_dev, slave_dev);
L
Linus Torvalds 已提交
2217 2218
	}

2219
	slave_disable_netpoll(slave);
2220

L
Linus Torvalds 已提交
2221 2222 2223
	/* close slave before restoring its mac address */
	dev_close(slave_dev);

2224
	if (bond->params.fail_over_mac != BOND_FOM_ACTIVE ||
2225
	    BOND_MODE(bond) != BOND_MODE_ACTIVEBACKUP) {
2226
		/* restore original ("permanent") mac address */
2227 2228 2229
		bond_hw_addr_copy(ss.__data, slave->perm_hwaddr,
				  slave->dev->addr_len);
		ss.ss_family = slave_dev->type;
2230
		dev_set_mac_address(slave_dev, (struct sockaddr *)&ss, NULL);
2231
	}
L
Linus Torvalds 已提交
2232

2233 2234 2235 2236
	if (unregister)
		__dev_set_mtu(slave_dev, slave->original_mtu);
	else
		dev_set_mtu(slave_dev, slave->original_mtu);
2237

2238 2239
	if (!netif_is_bond_master(slave_dev))
		slave_dev->priv_flags &= ~IFF_BONDING;
L
Linus Torvalds 已提交
2240

2241
	kobject_put(&slave->kobj);
L
Linus Torvalds 已提交
2242

2243
	return 0;
L
Linus Torvalds 已提交
2244 2245
}

2246 2247 2248
/* A wrapper used because of ndo_del_link */
int bond_release(struct net_device *bond_dev, struct net_device *slave_dev)
{
2249
	return __bond_release_one(bond_dev, slave_dev, false, false);
2250 2251
}

2252 2253 2254
/* First release a slave and then destroy the bond if no more slaves are left.
 * Must be under rtnl_lock when this function is called.
 */
2255 2256
static int bond_release_and_destroy(struct net_device *bond_dev,
				    struct net_device *slave_dev)
2257
{
2258
	struct bonding *bond = netdev_priv(bond_dev);
2259 2260
	int ret;

2261
	ret = __bond_release_one(bond_dev, slave_dev, false, true);
2262 2263
	if (ret == 0 && !bond_has_slaves(bond) &&
	    bond_dev->reg_state != NETREG_UNREGISTERING) {
2264
		bond_dev->priv_flags |= IFF_DISABLE_NETPOLL;
2265
		netdev_info(bond_dev, "Destroying bond\n");
2266
		bond_remove_proc_entry(bond);
S
Stephen Hemminger 已提交
2267
		unregister_netdevice(bond_dev);
2268 2269 2270 2271
	}
	return ret;
}

2272
/* Fill <info> for the bond behind <bond_dev> via bond_fill_ifbond(). */
static void bond_info_query(struct net_device *bond_dev, struct ifbond *info)
{
	bond_fill_ifbond(netdev_priv(bond_dev), info);
}

static int bond_slave_info_query(struct net_device *bond_dev, struct ifslave *info)
{
2281
	struct bonding *bond = netdev_priv(bond_dev);
2282
	struct list_head *iter;
2283
	int i = 0, res = -ENODEV;
L
Linus Torvalds 已提交
2284 2285
	struct slave *slave;

2286
	bond_for_each_slave(bond, slave, iter) {
2287
		if (i++ == (int)info->slave_id) {
2288
			res = 0;
2289
			bond_fill_ifslave(slave, info);
L
Linus Torvalds 已提交
2290 2291 2292 2293
			break;
		}
	}

2294
	return res;
L
Linus Torvalds 已提交
2295 2296 2297 2298
}

/*-------------------------------- Monitoring -------------------------------*/

2299
/* called with rcu_read_lock() */
J
Jay Vosburgh 已提交
2300 2301
static int bond_miimon_inspect(struct bonding *bond)
{
2302
	int link_state, commit = 0;
2303
	struct list_head *iter;
J
Jay Vosburgh 已提交
2304
	struct slave *slave;
2305 2306
	bool ignore_updelay;

2307
	ignore_updelay = !rcu_dereference(bond->curr_active_slave);
L
Linus Torvalds 已提交
2308

2309
	bond_for_each_slave_rcu(bond, slave, iter) {
2310
		bond_propose_link_state(slave, BOND_LINK_NOCHANGE);
L
Linus Torvalds 已提交
2311

J
Jay Vosburgh 已提交
2312
		link_state = bond_check_dev_link(bond, slave->dev, 0);
L
Linus Torvalds 已提交
2313 2314

		switch (slave->link) {
J
Jay Vosburgh 已提交
2315 2316 2317
		case BOND_LINK_UP:
			if (link_state)
				continue;
L
Linus Torvalds 已提交
2318

2319
			bond_propose_link_state(slave, BOND_LINK_FAIL);
2320
			commit++;
J
Jay Vosburgh 已提交
2321 2322
			slave->delay = bond->params.downdelay;
			if (slave->delay) {
2323 2324 2325 2326 2327 2328
				slave_info(bond->dev, slave->dev, "link status down for %sinterface, disabling it in %d ms\n",
					   (BOND_MODE(bond) ==
					    BOND_MODE_ACTIVEBACKUP) ?
					    (bond_is_active_slave(slave) ?
					     "active " : "backup ") : "",
					   bond->params.downdelay * bond->params.miimon);
L
Linus Torvalds 已提交
2329
			}
2330
			fallthrough;
J
Jay Vosburgh 已提交
2331 2332
		case BOND_LINK_FAIL:
			if (link_state) {
2333
				/* recovered before downdelay expired */
2334
				bond_propose_link_state(slave, BOND_LINK_UP);
2335
				slave->last_link_up = jiffies;
2336 2337 2338
				slave_info(bond->dev, slave->dev, "link status up again after %d ms\n",
					   (bond->params.downdelay - slave->delay) *
					   bond->params.miimon);
2339
				commit++;
J
Jay Vosburgh 已提交
2340
				continue;
L
Linus Torvalds 已提交
2341
			}
J
Jay Vosburgh 已提交
2342 2343

			if (slave->delay <= 0) {
2344
				bond_propose_link_state(slave, BOND_LINK_DOWN);
J
Jay Vosburgh 已提交
2345 2346
				commit++;
				continue;
L
Linus Torvalds 已提交
2347 2348
			}

J
Jay Vosburgh 已提交
2349 2350 2351 2352 2353 2354 2355
			slave->delay--;
			break;

		case BOND_LINK_DOWN:
			if (!link_state)
				continue;

2356
			bond_propose_link_state(slave, BOND_LINK_BACK);
2357
			commit++;
J
Jay Vosburgh 已提交
2358 2359 2360
			slave->delay = bond->params.updelay;

			if (slave->delay) {
2361 2362 2363 2364
				slave_info(bond->dev, slave->dev, "link status up, enabling it in %d ms\n",
					   ignore_updelay ? 0 :
					   bond->params.updelay *
					   bond->params.miimon);
J
Jay Vosburgh 已提交
2365
			}
2366
			fallthrough;
J
Jay Vosburgh 已提交
2367 2368
		case BOND_LINK_BACK:
			if (!link_state) {
2369
				bond_propose_link_state(slave, BOND_LINK_DOWN);
2370 2371 2372
				slave_info(bond->dev, slave->dev, "link status down again after %d ms\n",
					   (bond->params.updelay - slave->delay) *
					   bond->params.miimon);
2373
				commit++;
J
Jay Vosburgh 已提交
2374 2375 2376
				continue;
			}

2377 2378 2379
			if (ignore_updelay)
				slave->delay = 0;

J
Jay Vosburgh 已提交
2380
			if (slave->delay <= 0) {
2381
				bond_propose_link_state(slave, BOND_LINK_UP);
J
Jay Vosburgh 已提交
2382
				commit++;
2383
				ignore_updelay = false;
J
Jay Vosburgh 已提交
2384
				continue;
L
Linus Torvalds 已提交
2385
			}
J
Jay Vosburgh 已提交
2386 2387

			slave->delay--;
L
Linus Torvalds 已提交
2388
			break;
J
Jay Vosburgh 已提交
2389 2390
		}
	}
L
Linus Torvalds 已提交
2391

J
Jay Vosburgh 已提交
2392 2393
	return commit;
}
L
Linus Torvalds 已提交
2394

2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412
/* Forward a slave link change to the handler of the current bonding mode:
 * 802.3ad and TLB/ALB have dedicated handlers, XOR rebuilds the slave array,
 * and every other mode needs nothing here.
 */
static void bond_miimon_link_change(struct bonding *bond,
				    struct slave *slave,
				    char link)
{
	const int mode = BOND_MODE(bond);

	if (mode == BOND_MODE_8023AD)
		bond_3ad_handle_link_change(slave, link);
	else if (mode == BOND_MODE_TLB || mode == BOND_MODE_ALB)
		bond_alb_handle_link_change(bond, slave, link);
	else if (mode == BOND_MODE_XOR)
		bond_update_slave_arr(bond, NULL);
}

J
Jay Vosburgh 已提交
2413 2414
static void bond_miimon_commit(struct bonding *bond)
{
2415
	struct list_head *iter;
2416
	struct slave *slave, *primary;
J
Jay Vosburgh 已提交
2417

2418
	bond_for_each_slave(bond, slave, iter) {
2419
		switch (slave->link_new_state) {
J
Jay Vosburgh 已提交
2420
		case BOND_LINK_NOCHANGE:
2421 2422 2423 2424 2425 2426 2427 2428 2429
			/* For 802.3ad mode, check current slave speed and
			 * duplex again in case its port was disabled after
			 * invalid speed/duplex reporting but recovered before
			 * link monitoring could make a decision on the actual
			 * link status
			 */
			if (BOND_MODE(bond) == BOND_MODE_8023AD &&
			    slave->link == BOND_LINK_UP)
				bond_3ad_adapter_speed_duplex_changed(slave);
J
Jay Vosburgh 已提交
2430
			continue;
L
Linus Torvalds 已提交
2431

J
Jay Vosburgh 已提交
2432
		case BOND_LINK_UP:
2433 2434
			if (bond_update_speed_duplex(slave) &&
			    bond_needs_speed_duplex(bond)) {
2435
				slave->link = BOND_LINK_DOWN;
2436
				if (net_ratelimit())
2437 2438
					slave_warn(bond->dev, slave->dev,
						   "failed to get link speed/duplex\n");
2439 2440
				continue;
			}
2441 2442
			bond_set_slave_link_state(slave, BOND_LINK_UP,
						  BOND_SLAVE_NOTIFY_NOW);
2443
			slave->last_link_up = jiffies;
J
Jay Vosburgh 已提交
2444

2445
			primary = rtnl_dereference(bond->primary_slave);
2446
			if (BOND_MODE(bond) == BOND_MODE_8023AD) {
J
Jay Vosburgh 已提交
2447
				/* prevent it from being the active one */
J
Jiri Pirko 已提交
2448
				bond_set_backup_slave(slave);
2449
			} else if (BOND_MODE(bond) != BOND_MODE_ACTIVEBACKUP) {
J
Jay Vosburgh 已提交
2450
				/* make it immediately active */
J
Jiri Pirko 已提交
2451
				bond_set_active_slave(slave);
L
Linus Torvalds 已提交
2452 2453
			}

2454 2455 2456
			slave_info(bond->dev, slave->dev, "link status definitely up, %u Mbps %s duplex\n",
				   slave->speed == SPEED_UNKNOWN ? 0 : slave->speed,
				   slave->duplex ? "full" : "half");
L
Linus Torvalds 已提交
2457

2458
			bond_miimon_link_change(bond, slave, BOND_LINK_UP);
2459

2460
			if (!bond->curr_active_slave || slave == primary)
J
Jay Vosburgh 已提交
2461
				goto do_failover;
L
Linus Torvalds 已提交
2462

J
Jay Vosburgh 已提交
2463
			continue;
2464

J
Jay Vosburgh 已提交
2465
		case BOND_LINK_DOWN:
J
Jay Vosburgh 已提交
2466 2467 2468
			if (slave->link_failure_count < UINT_MAX)
				slave->link_failure_count++;

2469 2470
			bond_set_slave_link_state(slave, BOND_LINK_DOWN,
						  BOND_SLAVE_NOTIFY_NOW);
L
Linus Torvalds 已提交
2471

2472 2473
			if (BOND_MODE(bond) == BOND_MODE_ACTIVEBACKUP ||
			    BOND_MODE(bond) == BOND_MODE_8023AD)
2474 2475
				bond_set_slave_inactive_flags(slave,
							      BOND_SLAVE_NOTIFY_NOW);
J
Jay Vosburgh 已提交
2476

2477
			slave_info(bond->dev, slave->dev, "link status definitely down, disabling slave\n");
J
Jay Vosburgh 已提交
2478

2479
			bond_miimon_link_change(bond, slave, BOND_LINK_DOWN);
2480

2481
			if (slave == rcu_access_pointer(bond->curr_active_slave))
J
Jay Vosburgh 已提交
2482 2483 2484 2485 2486
				goto do_failover;

			continue;

		default:
2487
			slave_err(bond->dev, slave->dev, "invalid new link %d on slave\n",
2488 2489
				  slave->link_new_state);
			bond_propose_link_state(slave, BOND_LINK_NOCHANGE);
J
Jay Vosburgh 已提交
2490 2491 2492 2493 2494

			continue;
		}

do_failover:
2495
		block_netpoll_tx();
J
Jay Vosburgh 已提交
2496
		bond_select_active_slave(bond);
2497
		unblock_netpoll_tx();
J
Jay Vosburgh 已提交
2498 2499 2500
	}

	bond_set_carrier(bond);
L
Linus Torvalds 已提交
2501 2502
}

2503
/* bond_mii_monitor
2504 2505
 *
 * Really a wrapper that splits the mii monitor into two phases: an
J
Jay Vosburgh 已提交
2506 2507 2508
 * inspection, then (if inspection indicates something needs to be done)
 * an acquisition of appropriate locks followed by a commit phase to
 * implement whatever link state changes are indicated.
2509
 */
2510
static void bond_mii_monitor(struct work_struct *work)
2511 2512 2513
{
	struct bonding *bond = container_of(work, struct bonding,
					    mii_work.work);
2514
	bool should_notify_peers = false;
2515
	bool commit;
2516
	unsigned long delay;
2517 2518
	struct slave *slave;
	struct list_head *iter;
2519

2520 2521 2522
	delay = msecs_to_jiffies(bond->params.miimon);

	if (!bond_has_slaves(bond))
J
Jay Vosburgh 已提交
2523
		goto re_arm;
2524

2525
	rcu_read_lock();
2526
	should_notify_peers = bond_should_notify_peers(bond);
2527 2528 2529 2530 2531 2532 2533 2534
	commit = !!bond_miimon_inspect(bond);
	if (bond->send_peer_notif) {
		rcu_read_unlock();
		if (rtnl_trylock()) {
			bond->send_peer_notif--;
			rtnl_unlock();
		}
	} else {
2535
		rcu_read_unlock();
2536
	}
J
Jay Vosburgh 已提交
2537

2538
	if (commit) {
2539 2540 2541 2542 2543 2544
		/* Race avoidance with bond_close cancel of workqueue */
		if (!rtnl_trylock()) {
			delay = 1;
			should_notify_peers = false;
			goto re_arm;
		}
2545

2546 2547 2548
		bond_for_each_slave(bond, slave, iter) {
			bond_commit_link_state(slave, BOND_SLAVE_NOTIFY_LATER);
		}
2549 2550 2551
		bond_miimon_commit(bond);

		rtnl_unlock();	/* might sleep, hold no other locks */
2552
	}
2553

J
Jay Vosburgh 已提交
2554
re_arm:
2555
	if (bond->params.miimon)
2556 2557 2558 2559 2560 2561 2562 2563
		queue_delayed_work(bond->wq, &bond->mii_work, delay);

	if (should_notify_peers) {
		if (!rtnl_trylock())
			return;
		call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, bond->dev);
		rtnl_unlock();
	}
2564
}
J
Jay Vosburgh 已提交
2565

2566 2567
static int bond_upper_dev_walk(struct net_device *upper,
			       struct netdev_nested_priv *priv)
2568
{
2569
	__be32 ip = *(__be32 *)priv->data;
2570 2571 2572 2573

	return ip == bond_confirm_addr(upper, 0, ip);
}

2574
/* Report whether <ip> is confirmed on the bond device itself or on any of
 * its upper devices (checked with bond_upper_dev_walk() under
 * rcu_read_lock()).
 */
static bool bond_has_this_ip(struct bonding *bond, __be32 ip)
{
	struct netdev_nested_priv priv = {
		.data = (void *)&ip,
	};
	bool found;

	/* the bond device itself is checked first, without the walk */
	if (bond_confirm_addr(bond->dev, 0, ip) == ip)
		return true;

	rcu_read_lock();
	found = netdev_walk_all_upper_dev_rcu(bond->dev, bond_upper_dev_walk,
					      &priv) != 0;
	rcu_read_unlock();

	return found;
}

2592
/* We go to the (large) trouble of VLAN tagging ARP frames because
J
Jay Vosburgh 已提交
2593 2594 2595
 * switches in VLAN mode (especially if ports are configured as
 * "native" to a VLAN) might not pass non-tagged frames.
 */
2596 2597
static void bond_arp_send(struct slave *slave, int arp_op, __be32 dest_ip,
			  __be32 src_ip, struct bond_vlan_tag *tags)
J
Jay Vosburgh 已提交
2598 2599
{
	struct sk_buff *skb;
2600
	struct bond_vlan_tag *outer_tag = tags;
2601 2602
	struct net_device *slave_dev = slave->dev;
	struct net_device *bond_dev = slave->bond->dev;
J
Jay Vosburgh 已提交
2603

2604 2605
	slave_dbg(bond_dev, slave_dev, "arp %d on slave: dst %pI4 src %pI4\n",
		  arp_op, &dest_ip, &src_ip);
S
Stephen Hemminger 已提交
2606

J
Jay Vosburgh 已提交
2607 2608 2609 2610
	skb = arp_create(arp_op, ETH_P_ARP, dest_ip, slave_dev, src_ip,
			 NULL, slave_dev->dev_addr, NULL);

	if (!skb) {
2611
		net_err_ratelimited("ARP packet allocation failed\n");
J
Jay Vosburgh 已提交
2612 2613
		return;
	}
2614

2615 2616 2617 2618 2619
	if (!tags || tags->vlan_proto == VLAN_N_VID)
		goto xmit;

	tags++;

2620
	/* Go through all the tags backwards and add them to the packet */
2621 2622 2623
	while (tags->vlan_proto != VLAN_N_VID) {
		if (!tags->vlan_id) {
			tags++;
2624
			continue;
2625
		}
2626

2627 2628
		slave_dbg(bond_dev, slave_dev, "inner tag: proto %X vid %X\n",
			  ntohs(outer_tag->vlan_proto), tags->vlan_id);
2629 2630
		skb = vlan_insert_tag_set_proto(skb, tags->vlan_proto,
						tags->vlan_id);
2631 2632 2633 2634
		if (!skb) {
			net_err_ratelimited("failed to insert inner VLAN tag\n");
			return;
		}
2635 2636

		tags++;
2637 2638
	}
	/* Set the outer tag */
2639
	if (outer_tag->vlan_id) {
2640 2641
		slave_dbg(bond_dev, slave_dev, "outer tag: proto %X vid %X\n",
			  ntohs(outer_tag->vlan_proto), outer_tag->vlan_id);
J
Jiri Pirko 已提交
2642 2643
		__vlan_hwaccel_put_tag(skb, outer_tag->vlan_proto,
				       outer_tag->vlan_id);
J
Jay Vosburgh 已提交
2644
	}
2645 2646

xmit:
J
Jay Vosburgh 已提交
2647 2648 2649
	arp_xmit(skb);
}

2650 2651 2652 2653 2654 2655
/* Validate the device path between the @start_dev and the @end_dev.
 * The path is valid if the @end_dev is reachable through device
 * stacking.
 * When the path is validated, collect any vlan information in the
 * path.
 */
2656 2657 2658
struct bond_vlan_tag *bond_verify_device_path(struct net_device *start_dev,
					      struct net_device *end_dev,
					      int level)
2659
{
2660
	struct bond_vlan_tag *tags;
2661 2662 2663
	struct net_device *upper;
	struct list_head  *iter;

2664
	if (start_dev == end_dev) {
K
Kees Cook 已提交
2665
		tags = kcalloc(level + 1, sizeof(*tags), GFP_ATOMIC);
2666 2667 2668 2669 2670
		if (!tags)
			return ERR_PTR(-ENOMEM);
		tags[level].vlan_proto = VLAN_N_VID;
		return tags;
	}
2671 2672

	netdev_for_each_upper_dev_rcu(start_dev, upper, iter) {
2673 2674 2675 2676 2677
		tags = bond_verify_device_path(upper, end_dev, level + 1);
		if (IS_ERR_OR_NULL(tags)) {
			if (IS_ERR(tags))
				return tags;
			continue;
2678
		}
2679 2680 2681 2682 2683 2684
		if (is_vlan_dev(upper)) {
			tags[level].vlan_proto = vlan_dev_vlan_proto(upper);
			tags[level].vlan_id = vlan_dev_vlan_id(upper);
		}

		return tags;
2685 2686
	}

2687
	return NULL;
2688
}
J
Jay Vosburgh 已提交
2689

L
Linus Torvalds 已提交
2690 2691
static void bond_arp_send_all(struct bonding *bond, struct slave *slave)
{
J
Jay Vosburgh 已提交
2692
	struct rtable *rt;
2693
	struct bond_vlan_tag *tags;
2694
	__be32 *targets = bond->params.arp_targets, addr;
2695
	int i;
L
Linus Torvalds 已提交
2696

2697
	for (i = 0; i < BOND_MAX_ARP_TARGETS && targets[i]; i++) {
2698 2699
		slave_dbg(bond->dev, slave->dev, "%s: target %pI4\n",
			  __func__, &targets[i]);
2700
		tags = NULL;
J
Jay Vosburgh 已提交
2701

2702
		/* Find out through which dev should the packet go */
2703 2704
		rt = ip_route_output(dev_net(bond->dev), targets[i], 0,
				     RTO_ONLINK, 0);
2705
		if (IS_ERR(rt)) {
2706 2707 2708
			/* there's no route to target - try to send arp
			 * probe to generate any traffic (arp_validate=0)
			 */
2709 2710 2711 2712
			if (bond->params.arp_validate)
				net_warn_ratelimited("%s: no route to arp_ip_target %pI4 and arp_validate is set\n",
						     bond->dev->name,
						     &targets[i]);
2713
			bond_arp_send(slave, ARPOP_REQUEST, targets[i],
2714
				      0, tags);
J
Jay Vosburgh 已提交
2715 2716 2717
			continue;
		}

2718 2719 2720 2721 2722
		/* bond device itself */
		if (rt->dst.dev == bond->dev)
			goto found;

		rcu_read_lock();
2723
		tags = bond_verify_device_path(bond->dev, rt->dst.dev, 0);
2724
		rcu_read_unlock();
J
Jay Vosburgh 已提交
2725

2726
		if (!IS_ERR_OR_NULL(tags))
2727 2728
			goto found;

2729
		/* Not our device - skip */
2730
		slave_dbg(bond->dev, slave->dev, "no path to arp_ip_target %pI4 via rt.dev %s\n",
2731
			   &targets[i], rt->dst.dev ? rt->dst.dev->name : "NULL");
2732

2733
		ip_rt_put(rt);
2734 2735 2736 2737 2738
		continue;

found:
		addr = bond_confirm_addr(rt->dst.dev, targets[i], 0);
		ip_rt_put(rt);
2739
		bond_arp_send(slave, ARPOP_REQUEST, targets[i], addr, tags);
2740
		kfree(tags);
J
Jay Vosburgh 已提交
2741 2742 2743
	}
}

2744
/* Account a received ARP on @slave if it matches the monitor configuration.
 *
 * The ARP is accepted only when @sip is non-zero, @tip passes
 * bond_has_this_ip() and @sip is found in the configured arp_targets.
 * On acceptance slave->last_rx and the per-target receive timestamp are set
 * to the current jiffies; otherwise nothing is changed.
 */
static void bond_validate_arp(struct bonding *bond, struct slave *slave, __be32 sip, __be32 tip)
{
	int i;

	if (!sip || !bond_has_this_ip(bond, tip)) {
		slave_dbg(bond->dev, slave->dev, "%s: sip %pI4 tip %pI4 not found\n",
			   __func__, &sip, &tip);
		return;
	}

	/* index of @sip in arp_targets, or -1 when it is not a target */
	i = bond_get_targets_ip(bond->params.arp_targets, sip);
	if (i == -1) {
		slave_dbg(bond->dev, slave->dev, "%s: sip %pI4 not found in targets\n",
			   __func__, &sip);
		return;
	}
	slave->last_rx = jiffies;
	slave->target_last_arp_rx[i] = jiffies;
}

2764 2765
int bond_arp_rcv(const struct sk_buff *skb, struct bonding *bond,
		 struct slave *slave)
2766
{
2767
	struct arphdr *arp = (struct arphdr *)skb->data;
2768
	struct slave *curr_active_slave, *curr_arp_slave;
2769
	unsigned char *arp_ptr;
2770
	__be32 sip, tip;
2771 2772
	int is_arp = skb->protocol == __cpu_to_be16(ETH_P_ARP);
	unsigned int alen;
2773

2774
	if (!slave_do_arp_validate(bond, slave)) {
2775 2776
		if ((slave_do_arp_validate_only(bond) && is_arp) ||
		    !slave_do_arp_validate_only(bond))
2777
			slave->last_rx = jiffies;
2778
		return RX_HANDLER_ANOTHER;
2779 2780 2781
	} else if (!is_arp) {
		return RX_HANDLER_ANOTHER;
	}
2782

2783
	alen = arp_hdr_len(bond->dev);
2784

2785 2786
	slave_dbg(bond->dev, slave->dev, "%s: skb->dev %s\n",
		   __func__, skb->dev->name);
2787

2788 2789 2790 2791 2792 2793 2794
	if (alen > skb_headlen(skb)) {
		arp = kmalloc(alen, GFP_ATOMIC);
		if (!arp)
			goto out_unlock;
		if (skb_copy_bits(skb, 0, arp, alen) < 0)
			goto out_unlock;
	}
2795

2796
	if (arp->ar_hln != bond->dev->addr_len ||
2797 2798 2799 2800 2801 2802 2803 2804
	    skb->pkt_type == PACKET_OTHERHOST ||
	    skb->pkt_type == PACKET_LOOPBACK ||
	    arp->ar_hrd != htons(ARPHRD_ETHER) ||
	    arp->ar_pro != htons(ETH_P_IP) ||
	    arp->ar_pln != 4)
		goto out_unlock;

	arp_ptr = (unsigned char *)(arp + 1);
2805
	arp_ptr += bond->dev->addr_len;
2806
	memcpy(&sip, arp_ptr, 4);
2807
	arp_ptr += 4 + bond->dev->addr_len;
2808 2809
	memcpy(&tip, arp_ptr, 4);

2810 2811 2812 2813
	slave_dbg(bond->dev, slave->dev, "%s: %s/%d av %d sv %d sip %pI4 tip %pI4\n",
		  __func__, slave->dev->name, bond_slave_state(slave),
		  bond->params.arp_validate, slave_do_arp_validate(bond, slave),
		  &sip, &tip);
2814

2815
	curr_active_slave = rcu_dereference(bond->curr_active_slave);
2816
	curr_arp_slave = rcu_dereference(bond->current_arp_slave);
2817

2818
	/* We 'trust' the received ARP enough to validate it if:
2819
	 *
2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838
	 * (a) the slave receiving the ARP is active (which includes the
	 * current ARP slave, if any), or
	 *
	 * (b) the receiving slave isn't active, but there is a currently
	 * active slave and it received valid arp reply(s) after it became
	 * the currently active slave, or
	 *
	 * (c) there is an ARP slave that sent an ARP during the prior ARP
	 * interval, and we receive an ARP reply on any slave.  We accept
	 * these because switch FDB update delays may deliver the ARP
	 * reply to a slave other than the sender of the ARP request.
	 *
	 * Note: for (b), backup slaves are receiving the broadcast ARP
	 * request, not a reply.  This request passes from the sending
	 * slave through the L2 switch(es) to the receiving slave.  Since
	 * this is checking the request, sip/tip are swapped for
	 * validation.
	 *
	 * This is done to avoid endless looping when we can't reach the
2839
	 * arp_ip_target and fool ourselves with our own arp requests.
2840
	 */
J
Jiri Pirko 已提交
2841
	if (bond_is_active_slave(slave))
2842
		bond_validate_arp(bond, slave, sip, tip);
2843 2844 2845
	else if (curr_active_slave &&
		 time_after(slave_last_rx(bond, curr_active_slave),
			    curr_active_slave->last_link_up))
2846
		bond_validate_arp(bond, slave, tip, sip);
2847 2848 2849 2850
	else if (curr_arp_slave && (arp->ar_op == htons(ARPOP_REPLY)) &&
		 bond_time_in_interval(bond,
				       dev_trans_start(curr_arp_slave->dev), 1))
		bond_validate_arp(bond, slave, sip, tip);
2851 2852

out_unlock:
2853 2854
	if (arp != (struct arphdr *)skb->data)
		kfree(arp);
2855
	return RX_HANDLER_ANOTHER;
2856 2857
}

2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871
/* Tell whether the current jiffies value lies inside the ARP monitor window
 * around @last_act. With delta = arp_interval converted to jiffies, the
 * window is
 *	[last_act - delta, last_act + mod * delta + delta/2]
 * The extra delta/2 of slack is needed for really fast networks.
 */
static bool bond_time_in_interval(struct bonding *bond, unsigned long last_act,
				  int mod)
{
	int ticks = msecs_to_jiffies(bond->params.arp_interval);
	unsigned long window_start = last_act - ticks;
	unsigned long window_end = last_act + mod * ticks + ticks/2;

	return time_in_range(jiffies, window_start, window_end);
}

2872
/* This function is called regularly to monitor each slave's link
L
Linus Torvalds 已提交
2873 2874 2875 2876 2877
 * ensuring that traffic is being sent and received when arp monitoring
 * is used in load-balancing mode. if the adapter has been dormant, then an
 * arp is transmitted to generate traffic. see activebackup_arp_monitor for
 * arp monitoring in active backup mode.
 */
2878
static void bond_loadbalance_arp_mon(struct bonding *bond)
L
Linus Torvalds 已提交
2879 2880
{
	struct slave *slave, *oldcurrent;
2881
	struct list_head *iter;
2882
	int do_failover = 0, slave_state_changed = 0;
L
Linus Torvalds 已提交
2883

2884
	if (!bond_has_slaves(bond))
L
Linus Torvalds 已提交
2885 2886
		goto re_arm;

2887 2888
	rcu_read_lock();

2889
	oldcurrent = rcu_dereference(bond->curr_active_slave);
L
Linus Torvalds 已提交
2890 2891
	/* see if any of the previous devices are up now (i.e. they have
	 * xmt and rcv traffic). the curr_active_slave does not come into
2892 2893 2894
	 * the picture unless it is null. also, slave->last_link_up is not
	 * needed here because we send an arp on each slave and give a slave
	 * as long as it needs to get the tx/rx within the delta.
L
Linus Torvalds 已提交
2895 2896 2897
	 * TODO: what about up/down delay in arp mode? it wasn't here before
	 *       so it can wait
	 */
2898
	bond_for_each_slave_rcu(bond, slave, iter) {
2899 2900
		unsigned long trans_start = dev_trans_start(slave->dev);

2901
		bond_propose_link_state(slave, BOND_LINK_NOCHANGE);
2902

L
Linus Torvalds 已提交
2903
		if (slave->link != BOND_LINK_UP) {
2904
			if (bond_time_in_interval(bond, trans_start, 1) &&
2905
			    bond_time_in_interval(bond, slave->last_rx, 1)) {
L
Linus Torvalds 已提交
2906

2907
				bond_propose_link_state(slave, BOND_LINK_UP);
2908
				slave_state_changed = 1;
L
Linus Torvalds 已提交
2909 2910 2911 2912 2913 2914 2915

				/* primary_slave has no meaning in round-robin
				 * mode. the window of a slave being up and
				 * curr_active_slave being null after enslaving
				 * is closed.
				 */
				if (!oldcurrent) {
2916
					slave_info(bond->dev, slave->dev, "link status definitely up\n");
L
Linus Torvalds 已提交
2917 2918
					do_failover = 1;
				} else {
2919
					slave_info(bond->dev, slave->dev, "interface is now up\n");
L
Linus Torvalds 已提交
2920 2921 2922 2923 2924 2925 2926 2927 2928
				}
			}
		} else {
			/* slave->link == BOND_LINK_UP */

			/* not all switches will respond to an arp request
			 * when the source ip is 0, so don't take the link down
			 * if we don't know our ip yet
			 */
2929
			if (!bond_time_in_interval(bond, trans_start, 2) ||
2930
			    !bond_time_in_interval(bond, slave->last_rx, 2)) {
L
Linus Torvalds 已提交
2931

2932
				bond_propose_link_state(slave, BOND_LINK_DOWN);
2933
				slave_state_changed = 1;
L
Linus Torvalds 已提交
2934

S
Stephen Hemminger 已提交
2935
				if (slave->link_failure_count < UINT_MAX)
L
Linus Torvalds 已提交
2936 2937
					slave->link_failure_count++;

2938
				slave_info(bond->dev, slave->dev, "interface is now down\n");
L
Linus Torvalds 已提交
2939

S
Stephen Hemminger 已提交
2940
				if (slave == oldcurrent)
L
Linus Torvalds 已提交
2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951
					do_failover = 1;
			}
		}

		/* note: if switch is in round-robin mode, all links
		 * must tx arp to ensure all links rx an arp - otherwise
		 * links may oscillate or not come up at all; if switch is
		 * in something like xor mode, there is nothing we can
		 * do - all replies will be rx'ed on same link causing slaves
		 * to be unstable during low/no traffic periods
		 */
2952
		if (bond_slave_is_up(slave))
L
Linus Torvalds 已提交
2953 2954 2955
			bond_arp_send_all(bond, slave);
	}

2956 2957
	rcu_read_unlock();

2958
	if (do_failover || slave_state_changed) {
2959 2960
		if (!rtnl_trylock())
			goto re_arm;
L
Linus Torvalds 已提交
2961

2962
		bond_for_each_slave(bond, slave, iter) {
2963 2964
			if (slave->link_new_state != BOND_LINK_NOCHANGE)
				slave->link = slave->link_new_state;
2965 2966
		}

2967 2968
		if (slave_state_changed) {
			bond_slave_state_change(bond);
2969 2970
			if (BOND_MODE(bond) == BOND_MODE_XOR)
				bond_update_slave_arr(bond, NULL);
2971 2972
		}
		if (do_failover) {
2973 2974 2975 2976
			block_netpoll_tx();
			bond_select_active_slave(bond);
			unblock_netpoll_tx();
		}
2977
		rtnl_unlock();
L
Linus Torvalds 已提交
2978 2979 2980
	}

re_arm:
2981
	if (bond->params.arp_interval)
2982 2983
		queue_delayed_work(bond->wq, &bond->arp_work,
				   msecs_to_jiffies(bond->params.arp_interval));
L
Linus Torvalds 已提交
2984 2985
}

2986
/* Called to inspect slaves for active-backup mode ARP monitor link state
2987 2988 2989
 * changes.  Sets proposed link state in slaves to specify what action
 * should take place for the slave.  Returns 0 if no changes are found, >0
 * if changes to link states must be committed.
2990
 *
2991
 * Called with rcu_read_lock held.
L
Linus Torvalds 已提交
2992
 */
2993
static int bond_ab_arp_inspect(struct bonding *bond)
L
Linus Torvalds 已提交
2994
{
2995
	unsigned long trans_start, last_rx;
2996
	struct list_head *iter;
2997 2998
	struct slave *slave;
	int commit = 0;
2999

3000
	bond_for_each_slave_rcu(bond, slave, iter) {
3001
		bond_propose_link_state(slave, BOND_LINK_NOCHANGE);
3002
		last_rx = slave_last_rx(bond, slave);
L
Linus Torvalds 已提交
3003

3004
		if (slave->link != BOND_LINK_UP) {
3005
			if (bond_time_in_interval(bond, last_rx, 1)) {
3006
				bond_propose_link_state(slave, BOND_LINK_UP);
3007
				commit++;
3008 3009 3010
			} else if (slave->link == BOND_LINK_BACK) {
				bond_propose_link_state(slave, BOND_LINK_FAIL);
				commit++;
3011 3012 3013
			}
			continue;
		}
L
Linus Torvalds 已提交
3014

3015
		/* Give slaves 2*delta after being enslaved or made
3016 3017 3018
		 * active.  This avoids bouncing, as the last receive
		 * times need a full ARP monitor cycle to be updated.
		 */
3019
		if (bond_time_in_interval(bond, slave->last_link_up, 2))
3020 3021
			continue;

3022
		/* Backup slave is down if:
3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033
		 * - No current_arp_slave AND
		 * - more than 3*delta since last receive AND
		 * - the bond has an IP address
		 *
		 * Note: a non-null current_arp_slave indicates
		 * the curr_active_slave went down and we are
		 * searching for a new one; under this condition
		 * we only take the curr_active_slave down - this
		 * gives each slave a chance to tx/rx traffic
		 * before being taken out
		 */
J
Jiri Pirko 已提交
3034
		if (!bond_is_active_slave(slave) &&
3035
		    !rcu_access_pointer(bond->current_arp_slave) &&
3036
		    !bond_time_in_interval(bond, last_rx, 3)) {
3037
			bond_propose_link_state(slave, BOND_LINK_DOWN);
3038 3039 3040
			commit++;
		}

3041
		/* Active slave is down if:
3042 3043 3044 3045
		 * - more than 2*delta since transmitting OR
		 * - (more than 2*delta since receive AND
		 *    the bond has an IP address)
		 */
3046
		trans_start = dev_trans_start(slave->dev);
J
Jiri Pirko 已提交
3047
		if (bond_is_active_slave(slave) &&
3048 3049
		    (!bond_time_in_interval(bond, trans_start, 2) ||
		     !bond_time_in_interval(bond, last_rx, 2))) {
3050
			bond_propose_link_state(slave, BOND_LINK_DOWN);
3051 3052
			commit++;
		}
L
Linus Torvalds 已提交
3053 3054
	}

3055 3056
	return commit;
}
L
Linus Torvalds 已提交
3057

3058
/* Called to commit link state changes noted by inspection step of
3059 3060
 * active-backup mode ARP monitor.
 *
3061
 * Called with RTNL hold.
3062
 */
3063
static void bond_ab_arp_commit(struct bonding *bond)
3064
{
3065
	unsigned long trans_start;
3066
	struct list_head *iter;
3067
	struct slave *slave;
L
Linus Torvalds 已提交
3068

3069
	bond_for_each_slave(bond, slave, iter) {
3070
		switch (slave->link_new_state) {
3071 3072
		case BOND_LINK_NOCHANGE:
			continue;
3073

3074
		case BOND_LINK_UP:
3075
			trans_start = dev_trans_start(slave->dev);
3076 3077
			if (rtnl_dereference(bond->curr_active_slave) != slave ||
			    (!rtnl_dereference(bond->curr_active_slave) &&
3078
			     bond_time_in_interval(bond, trans_start, 1))) {
3079 3080 3081
				struct slave *current_arp_slave;

				current_arp_slave = rtnl_dereference(bond->current_arp_slave);
3082 3083
				bond_set_slave_link_state(slave, BOND_LINK_UP,
							  BOND_SLAVE_NOTIFY_NOW);
3084
				if (current_arp_slave) {
3085
					bond_set_slave_inactive_flags(
3086
						current_arp_slave,
3087
						BOND_SLAVE_NOTIFY_NOW);
3088
					RCU_INIT_POINTER(bond->current_arp_slave, NULL);
3089
				}
3090

3091
				slave_info(bond->dev, slave->dev, "link status definitely up\n");
3092

3093
				if (!rtnl_dereference(bond->curr_active_slave) ||
3094
				    slave == rtnl_dereference(bond->primary_slave))
3095
					goto do_failover;
L
Linus Torvalds 已提交
3096

3097
			}
L
Linus Torvalds 已提交
3098

3099
			continue;
L
Linus Torvalds 已提交
3100

3101 3102 3103 3104
		case BOND_LINK_DOWN:
			if (slave->link_failure_count < UINT_MAX)
				slave->link_failure_count++;

3105 3106
			bond_set_slave_link_state(slave, BOND_LINK_DOWN,
						  BOND_SLAVE_NOTIFY_NOW);
3107 3108
			bond_set_slave_inactive_flags(slave,
						      BOND_SLAVE_NOTIFY_NOW);
3109

3110
			slave_info(bond->dev, slave->dev, "link status definitely down, disabling slave\n");
3111

3112
			if (slave == rtnl_dereference(bond->curr_active_slave)) {
3113
				RCU_INIT_POINTER(bond->current_arp_slave, NULL);
3114
				goto do_failover;
L
Linus Torvalds 已提交
3115
			}
3116 3117

			continue;
3118

3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131
		case BOND_LINK_FAIL:
			bond_set_slave_link_state(slave, BOND_LINK_FAIL,
						  BOND_SLAVE_NOTIFY_NOW);
			bond_set_slave_inactive_flags(slave,
						      BOND_SLAVE_NOTIFY_NOW);

			/* A slave has just been enslaved and has become
			 * the current active slave.
			 */
			if (rtnl_dereference(bond->curr_active_slave))
				RCU_INIT_POINTER(bond->current_arp_slave, NULL);
			continue;

3132
		default:
3133 3134 3135
			slave_err(bond->dev, slave->dev,
				  "impossible: link_new_state %d on slave\n",
				  slave->link_new_state);
3136
			continue;
L
Linus Torvalds 已提交
3137 3138
		}

3139
do_failover:
3140
		block_netpoll_tx();
3141
		bond_select_active_slave(bond);
3142
		unblock_netpoll_tx();
3143
	}
L
Linus Torvalds 已提交
3144

3145 3146
	bond_set_carrier(bond);
}
L
Linus Torvalds 已提交
3147

3148
/* Send ARP probes for active-backup mode ARP monitor.
3149
 *
3150
 * Called with rcu_read_lock held.
3151
 */
3152
static bool bond_ab_arp_probe(struct bonding *bond)
3153
{
3154
	struct slave *slave, *before = NULL, *new_slave = NULL,
3155 3156
		     *curr_arp_slave = rcu_dereference(bond->current_arp_slave),
		     *curr_active_slave = rcu_dereference(bond->curr_active_slave);
3157 3158
	struct list_head *iter;
	bool found = false;
3159
	bool should_notify_rtnl = BOND_SLAVE_NOTIFY_LATER;
3160

3161
	if (curr_arp_slave && curr_active_slave)
3162 3163 3164
		netdev_info(bond->dev, "PROBE: c_arp %s && cas %s BAD\n",
			    curr_arp_slave->dev->name,
			    curr_active_slave->dev->name);
L
Linus Torvalds 已提交
3165

3166 3167
	if (curr_active_slave) {
		bond_arp_send_all(bond, curr_active_slave);
3168
		return should_notify_rtnl;
3169
	}
L
Linus Torvalds 已提交
3170

3171 3172 3173 3174
	/* if we don't have a curr_active_slave, search for the next available
	 * backup slave from the current_arp_slave and make it the candidate
	 * for becoming the curr_active_slave
	 */
L
Linus Torvalds 已提交
3175

3176
	if (!curr_arp_slave) {
3177 3178 3179
		curr_arp_slave = bond_first_slave_rcu(bond);
		if (!curr_arp_slave)
			return should_notify_rtnl;
3180
	}
L
Linus Torvalds 已提交
3181

3182
	bond_for_each_slave_rcu(bond, slave, iter) {
3183
		if (!found && !before && bond_slave_is_up(slave))
3184
			before = slave;
L
Linus Torvalds 已提交
3185

3186
		if (found && !new_slave && bond_slave_is_up(slave))
3187
			new_slave = slave;
3188 3189 3190 3191 3192 3193
		/* if the link state is up at this point, we
		 * mark it down - this can happen if we have
		 * simultaneous link failures and
		 * reselect_active_interface doesn't make this
		 * one the current slave so it is still marked
		 * up when it is actually down
L
Linus Torvalds 已提交
3194
		 */
3195
		if (!bond_slave_is_up(slave) && slave->link == BOND_LINK_UP) {
3196 3197
			bond_set_slave_link_state(slave, BOND_LINK_DOWN,
						  BOND_SLAVE_NOTIFY_LATER);
3198 3199
			if (slave->link_failure_count < UINT_MAX)
				slave->link_failure_count++;
L
Linus Torvalds 已提交
3200

3201
			bond_set_slave_inactive_flags(slave,
3202
						      BOND_SLAVE_NOTIFY_LATER);
3203

3204
			slave_info(bond->dev, slave->dev, "backup interface is now down\n");
L
Linus Torvalds 已提交
3205
		}
3206
		if (slave == curr_arp_slave)
3207
			found = true;
3208
	}
3209 3210 3211 3212

	if (!new_slave && before)
		new_slave = before;

3213 3214
	if (!new_slave)
		goto check_state;
3215

3216 3217
	bond_set_slave_link_state(new_slave, BOND_LINK_BACK,
				  BOND_SLAVE_NOTIFY_LATER);
3218
	bond_set_slave_active_flags(new_slave, BOND_SLAVE_NOTIFY_LATER);
3219
	bond_arp_send_all(bond, new_slave);
3220
	new_slave->last_link_up = jiffies;
3221
	rcu_assign_pointer(bond->current_arp_slave, new_slave);
3222

3223 3224
check_state:
	bond_for_each_slave_rcu(bond, slave, iter) {
3225
		if (slave->should_notify || slave->should_notify_link) {
3226 3227 3228 3229 3230
			should_notify_rtnl = BOND_SLAVE_NOTIFY_NOW;
			break;
		}
	}
	return should_notify_rtnl;
3231
}
L
Linus Torvalds 已提交
3232

3233
static void bond_activebackup_arp_mon(struct bonding *bond)
3234
{
3235 3236
	bool should_notify_peers = false;
	bool should_notify_rtnl = false;
3237
	int delta_in_ticks;
L
Linus Torvalds 已提交
3238

3239 3240 3241
	delta_in_ticks = msecs_to_jiffies(bond->params.arp_interval);

	if (!bond_has_slaves(bond))
3242 3243
		goto re_arm;

3244
	rcu_read_lock();
3245

3246 3247
	should_notify_peers = bond_should_notify_peers(bond);

3248 3249 3250
	if (bond_ab_arp_inspect(bond)) {
		rcu_read_unlock();

3251 3252 3253 3254 3255 3256
		/* Race avoidance with bond_close flush of workqueue */
		if (!rtnl_trylock()) {
			delta_in_ticks = 1;
			should_notify_peers = false;
			goto re_arm;
		}
3257

3258
		bond_ab_arp_commit(bond);
3259

3260
		rtnl_unlock();
3261
		rcu_read_lock();
3262 3263
	}

3264 3265
	should_notify_rtnl = bond_ab_arp_probe(bond);
	rcu_read_unlock();
3266

3267 3268
re_arm:
	if (bond->params.arp_interval)
3269 3270
		queue_delayed_work(bond->wq, &bond->arp_work, delta_in_ticks);

3271
	if (should_notify_peers || should_notify_rtnl) {
3272 3273
		if (!rtnl_trylock())
			return;
3274 3275 3276 3277

		if (should_notify_peers)
			call_netdevice_notifiers(NETDEV_NOTIFY_PEERS,
						 bond->dev);
3278
		if (should_notify_rtnl) {
3279
			bond_slave_state_notify(bond);
3280 3281
			bond_slave_link_notify(bond);
		}
3282

3283 3284
		rtnl_unlock();
	}
L
Linus Torvalds 已提交
3285 3286
}

3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297
/* Delayed-work handler for bond->arp_work: run the ARP monitor variant that
 * matches the bonding mode.
 */
static void bond_arp_monitor(struct work_struct *work)
{
	struct bonding *bond;

	bond = container_of(work, struct bonding, arp_work.work);

	if (BOND_MODE(bond) != BOND_MODE_ACTIVEBACKUP) {
		bond_loadbalance_arp_mon(bond);
		return;
	}

	bond_activebackup_arp_mon(bond);
}

L
Linus Torvalds 已提交
3298 3299
/*-------------------------- netdev event handling --------------------------*/

3300
/* Change device name */
L
Linus Torvalds 已提交
3301 3302 3303 3304
static int bond_event_changename(struct bonding *bond)
{
	bond_remove_proc_entry(bond);
	bond_create_proc_entry(bond);
3305

3306 3307
	bond_debug_reregister(bond);

L
Linus Torvalds 已提交
3308 3309 3310
	return NOTIFY_DONE;
}

S
Stephen Hemminger 已提交
3311 3312
static int bond_master_netdev_event(unsigned long event,
				    struct net_device *bond_dev)
L
Linus Torvalds 已提交
3313
{
3314
	struct bonding *event_bond = netdev_priv(bond_dev);
L
Linus Torvalds 已提交
3315

3316 3317
	netdev_dbg(bond_dev, "%s called\n", __func__);

L
Linus Torvalds 已提交
3318 3319 3320
	switch (event) {
	case NETDEV_CHANGENAME:
		return bond_event_changename(event_bond);
3321 3322 3323 3324 3325 3326
	case NETDEV_UNREGISTER:
		bond_remove_proc_entry(event_bond);
		break;
	case NETDEV_REGISTER:
		bond_create_proc_entry(event_bond);
		break;
L
Linus Torvalds 已提交
3327 3328 3329 3330 3331 3332 3333
	default:
		break;
	}

	return NOTIFY_DONE;
}

S
Stephen Hemminger 已提交
3334 3335
static int bond_slave_netdev_event(unsigned long event,
				   struct net_device *slave_dev)
L
Linus Torvalds 已提交
3336
{
3337
	struct slave *slave = bond_slave_get_rtnl(slave_dev), *primary;
3338 3339
	struct bonding *bond;
	struct net_device *bond_dev;
L
Linus Torvalds 已提交
3340

3341 3342 3343 3344
	/* A netdev event can be generated while enslaving a device
	 * before netdev_rx_handler_register is called in which case
	 * slave will be NULL
	 */
3345 3346
	if (!slave) {
		netdev_dbg(slave_dev, "%s called on NULL slave\n", __func__);
3347
		return NOTIFY_DONE;
3348 3349
	}

3350 3351
	bond_dev = slave->bond->dev;
	bond = slave->bond;
3352
	primary = rtnl_dereference(bond->primary_slave);
3353

3354 3355
	slave_dbg(bond_dev, slave_dev, "%s called\n", __func__);

L
Linus Torvalds 已提交
3356 3357
	switch (event) {
	case NETDEV_UNREGISTER:
3358
		if (bond_dev->type != ARPHRD_ETHER)
3359 3360
			bond_release_and_destroy(bond_dev, slave_dev);
		else
3361
			__bond_release_one(bond_dev, slave_dev, false, true);
L
Linus Torvalds 已提交
3362
		break;
3363
	case NETDEV_UP:
L
Linus Torvalds 已提交
3364
	case NETDEV_CHANGE:
3365 3366
		/* For 802.3ad mode only:
		 * Getting invalid Speed/Duplex values here will put slave
3367 3368 3369 3370
		 * in weird state. Mark it as link-fail if the link was
		 * previously up or link-down if it hasn't yet come up, and
		 * let link-monitoring (miimon) set it right when correct
		 * speeds/duplex are available.
3371 3372
		 */
		if (bond_update_speed_duplex(slave) &&
3373 3374 3375 3376 3377 3378
		    BOND_MODE(bond) == BOND_MODE_8023AD) {
			if (slave->last_link_up)
				slave->link = BOND_LINK_FAIL;
			else
				slave->link = BOND_LINK_DOWN;
		}
3379

3380 3381
		if (BOND_MODE(bond) == BOND_MODE_8023AD)
			bond_3ad_adapter_speed_duplex_changed(slave);
3382
		fallthrough;
M
Mahesh Bandewar 已提交
3383
	case NETDEV_DOWN:
3384 3385 3386 3387 3388 3389 3390 3391
		/* Refresh slave-array if applicable!
		 * If the setup does not use miimon or arpmon (mode-specific!),
		 * then these events will not cause the slave-array to be
		 * refreshed. This will cause xmit to use a slave that is not
		 * usable. Avoid such situation by refeshing the array at these
		 * events. If these (miimon/arpmon) parameters are configured
		 * then array gets refreshed twice and that should be fine!
		 */
3392
		if (bond_mode_can_use_xmit_hash(bond))
3393
			bond_update_slave_arr(bond, NULL);
L
Linus Torvalds 已提交
3394 3395
		break;
	case NETDEV_CHANGEMTU:
3396
		/* TODO: Should slaves be allowed to
L
Linus Torvalds 已提交
3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408
		 * independently alter their MTU?  For
		 * an active-backup bond, slaves need
		 * not be the same type of device, so
		 * MTUs may vary.  For other modes,
		 * slaves arguably should have the
		 * same MTUs. To do this, we'd need to
		 * take over the slave's change_mtu
		 * function for the duration of their
		 * servitude.
		 */
		break;
	case NETDEV_CHANGENAME:
3409
		/* we don't care if we don't have primary set */
3410
		if (!bond_uses_primary(bond) ||
3411 3412 3413
		    !bond->params.primary[0])
			break;

3414
		if (slave == primary) {
3415
			/* slave's name changed - he's no longer primary */
3416
			RCU_INIT_POINTER(bond->primary_slave, NULL);
3417 3418
		} else if (!strcmp(slave_dev->name, bond->params.primary)) {
			/* we have a new primary slave */
3419
			rcu_assign_pointer(bond->primary_slave, slave);
3420 3421 3422 3423
		} else { /* we didn't change primary - exit */
			break;
		}

3424
		netdev_info(bond->dev, "Primary slave changed to %s, reselecting active slave\n",
3425
			    primary ? slave_dev->name : "none");
3426 3427

		block_netpoll_tx();
3428
		bond_select_active_slave(bond);
3429
		unblock_netpoll_tx();
L
Linus Torvalds 已提交
3430
		break;
3431 3432 3433
	case NETDEV_FEAT_CHANGE:
		bond_compute_features(bond);
		break;
3434 3435 3436 3437
	case NETDEV_RESEND_IGMP:
		/* Propagate to master device */
		call_netdevice_notifiers(event, slave->bond->dev);
		break;
L
Linus Torvalds 已提交
3438 3439 3440 3441 3442 3443 3444
	default:
		break;
	}

	return NOTIFY_DONE;
}

3445
/* bond_netdev_event: handle netdev notifier chain events.
L
Linus Torvalds 已提交
3446 3447
 *
 * This function receives events for the netdev chain.  The caller (an
3448
 * ioctl handler calling blocking_notifier_call_chain) holds the necessary
L
Linus Torvalds 已提交
3449 3450 3451
 * locks for us to safely manipulate the slave devices (RTNL lock,
 * dev_probe_lock).
 */
S
Stephen Hemminger 已提交
3452 3453
static int bond_netdev_event(struct notifier_block *this,
			     unsigned long event, void *ptr)
L
Linus Torvalds 已提交
3454
{
3455
	struct net_device *event_dev = netdev_notifier_info_to_dev(ptr);
L
Linus Torvalds 已提交
3456

3457 3458
	netdev_dbg(event_dev, "%s received %s\n",
		   __func__, netdev_cmd_to_name(event));
L
Linus Torvalds 已提交
3459

3460 3461 3462
	if (!(event_dev->priv_flags & IFF_BONDING))
		return NOTIFY_DONE;

L
Linus Torvalds 已提交
3463
	if (event_dev->flags & IFF_MASTER) {
3464 3465 3466 3467 3468
		int ret;

		ret = bond_master_netdev_event(event, event_dev);
		if (ret != NOTIFY_DONE)
			return ret;
L
Linus Torvalds 已提交
3469 3470
	}

3471
	if (event_dev->flags & IFF_SLAVE)
L
Linus Torvalds 已提交
3472 3473 3474 3475 3476 3477 3478 3479 3480
		return bond_slave_netdev_event(event, event_dev);

	return NOTIFY_DONE;
}

/* Notifier block whose callback is bond_netdev_event() */
static struct notifier_block bond_netdev_notifier = {
	.notifier_call = bond_netdev_event,
};

3481 3482
/*---------------------------- Hashing Policies -----------------------------*/

3483 3484
/* L2 hash helper */
static inline u32 bond_eth_hash(struct sk_buff *skb)
3485
{
3486
	struct ethhdr *ep, hdr_tmp;
3487

3488 3489 3490
	ep = skb_header_pointer(skb, 0, sizeof(hdr_tmp), &hdr_tmp);
	if (ep)
		return ep->h_dest[5] ^ ep->h_source[5] ^ ep->h_proto;
3491 3492 3493
	return 0;
}

M
Matteo Croce 已提交
3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524
/* Fill @fk with the IPv4/IPv6 addresses of the header located at offset
 * *@noff in @skb and advance *@noff past that header.
 *
 * *@proto is set to the next protocol (left untouched for IPv4 fragments).
 * When @l34 is set and *@proto is not negative, the ports are extracted
 * into @fk as well.
 *
 * Returns false when skb->protocol is neither IPv4 nor IPv6 or when the
 * header cannot be pulled, true otherwise.
 */
static bool bond_flow_ip(struct sk_buff *skb, struct flow_keys *fk,
			 int *noff, int *proto, bool l34)
{
	if (skb->protocol == htons(ETH_P_IP)) {
		const struct iphdr *ip4;

		if (unlikely(!pskb_may_pull(skb, *noff + sizeof(*ip4))))
			return false;

		ip4 = (const struct iphdr *)(skb->data + *noff);
		iph_to_flow_copy_v4addrs(fk, ip4);
		*noff += ip4->ihl << 2;
		if (!ip_is_fragment(ip4))
			*proto = ip4->protocol;
	} else if (skb->protocol == htons(ETH_P_IPV6)) {
		const struct ipv6hdr *ip6;

		if (unlikely(!pskb_may_pull(skb, *noff + sizeof(*ip6))))
			return false;

		ip6 = (const struct ipv6hdr *)(skb->data + *noff);
		iph_to_flow_copy_v6addrs(fk, ip6);
		*noff += sizeof(*ip6);
		*proto = ip6->nexthdr;
	} else {
		return false;
	}

	/* ports are only wanted for layer3+4 hashing with a known protocol */
	if (!l34 || *proto < 0)
		return true;

	fk->ports.ports = skb_flow_get_ports(skb, *noff, *proto);
	return true;
}

3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545
static u32 bond_vlan_srcmac_hash(struct sk_buff *skb)
{
	struct ethhdr *mac_hdr = (struct ethhdr *)skb_mac_header(skb);
	u32 srcmac_vendor = 0, srcmac_dev = 0;
	u16 vlan;
	int i;

	for (i = 0; i < 3; i++)
		srcmac_vendor = (srcmac_vendor << 8) | mac_hdr->h_source[i];

	for (i = 3; i < ETH_ALEN; i++)
		srcmac_dev = (srcmac_dev << 8) | mac_hdr->h_source[i];

	if (!skb_vlan_tag_present(skb))
		return srcmac_vendor ^ srcmac_dev;

	vlan = skb_vlan_tag_get(skb);

	return vlan ^ srcmac_vendor ^ srcmac_dev;
}

3546 3547 3548
/* Extract the appropriate headers based on bond's xmit policy */
static bool bond_flow_dissect(struct bonding *bond, struct sk_buff *skb,
			      struct flow_keys *fk)
3549
{
M
Matteo Croce 已提交
3550
	bool l34 = bond->params.xmit_policy == BOND_XMIT_POLICY_LAYER34;
3551
	int noff, proto = -1;
3552

3553 3554 3555
	switch (bond->params.xmit_policy) {
	case BOND_XMIT_POLICY_ENCAP23:
	case BOND_XMIT_POLICY_ENCAP34:
3556 3557 3558
		memset(fk, 0, sizeof(*fk));
		return __skb_flow_dissect(NULL, skb, &flow_keys_bonding,
					  fk, NULL, 0, 0, 0, 0);
3559 3560
	default:
		break;
3561
	}
3562

3563
	fk->ports.ports = 0;
3564
	memset(&fk->icmp, 0, sizeof(fk->icmp));
3565
	noff = skb_network_offset(skb);
M
Matteo Croce 已提交
3566
	if (!bond_flow_ip(skb, fk, &noff, &proto, l34))
3567
		return false;
M
Matteo Croce 已提交
3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589

	/* ICMP error packets contains at least 8 bytes of the header
	 * of the packet which generated the error. Use this information
	 * to correlate ICMP error packets within the same flow which
	 * generated the error.
	 */
	if (proto == IPPROTO_ICMP || proto == IPPROTO_ICMPV6) {
		skb_flow_get_icmp_tci(skb, &fk->icmp, skb->data,
				      skb_transport_offset(skb),
				      skb_headlen(skb));
		if (proto == IPPROTO_ICMP) {
			if (!icmp_is_err(fk->icmp.type))
				return true;

			noff += sizeof(struct icmphdr);
		} else if (proto == IPPROTO_ICMPV6) {
			if (!icmpv6_is_err(fk->icmp.type))
				return true;

			noff += sizeof(struct icmp6hdr);
		}
		return bond_flow_ip(skb, fk, &noff, &proto, l34);
3590
	}
3591

3592
	return true;
3593 3594
}

3595 3596 3597 3598 3599 3600 3601 3602 3603 3604
/* Mix the 32-bit source and destination addresses of @flow into @hash,
 * fold the upper bits down and return the result shifted right by one.
 */
static u32 bond_ip_hash(u32 hash, struct flow_keys *flow)
{
	const u32 dst = (__force u32)flow_get_u32_dst(flow);
	const u32 src = (__force u32)flow_get_u32_src(flow);
	u32 mixed = hash ^ dst ^ src;

	mixed ^= mixed >> 16;
	mixed ^= mixed >> 8;

	/* discard lowest hash bit to deal with the common even ports pattern */
	return mixed >> 1;
}

3605 3606 3607 3608 3609 3610 3611
/**
 * bond_xmit_hash - generate a hash value based on the xmit policy
 * @bond: bonding device
 * @skb: buffer to use for headers
 *
 * This function will extract the necessary headers from the skb buffer and use
 * them to generate a hash based on the xmit_policy set in the bonding device
3612
 */
3613
u32 bond_xmit_hash(struct bonding *bond, struct sk_buff *skb)
3614
{
3615 3616
	struct flow_keys flow;
	u32 hash;
3617

E
Eric Dumazet 已提交
3618 3619 3620 3621
	if (bond->params.xmit_policy == BOND_XMIT_POLICY_ENCAP34 &&
	    skb->l4_hash)
		return skb->hash;

3622 3623 3624
	if (bond->params.xmit_policy == BOND_XMIT_POLICY_VLAN_SRCMAC)
		return bond_vlan_srcmac_hash(skb);

3625 3626
	if (bond->params.xmit_policy == BOND_XMIT_POLICY_LAYER2 ||
	    !bond_flow_dissect(bond, skb, &flow))
3627
		return bond_eth_hash(skb);
3628

3629
	if (bond->params.xmit_policy == BOND_XMIT_POLICY_LAYER23 ||
3630
	    bond->params.xmit_policy == BOND_XMIT_POLICY_ENCAP23) {
3631
		hash = bond_eth_hash(skb);
3632 3633 3634 3635 3636 3637
	} else {
		if (flow.icmp.id)
			memcpy(&hash, &flow.icmp, sizeof(hash));
		else
			memcpy(&hash, &flow.ports.ports, sizeof(hash));
	}
3638

3639
	return bond_ip_hash(hash, &flow);
3640 3641
}

L
Linus Torvalds 已提交
3642 3643
/*-------------------------- Device entry points ----------------------------*/

3644
void bond_work_init_all(struct bonding *bond)
3645 3646 3647 3648 3649
{
	INIT_DELAYED_WORK(&bond->mcast_work,
			  bond_resend_igmp_join_requests_delayed);
	INIT_DELAYED_WORK(&bond->alb_work, bond_alb_monitor);
	INIT_DELAYED_WORK(&bond->mii_work, bond_mii_monitor);
3650
	INIT_DELAYED_WORK(&bond->arp_work, bond_arp_monitor);
3651
	INIT_DELAYED_WORK(&bond->ad_work, bond_3ad_state_machine_handler);
3652
	INIT_DELAYED_WORK(&bond->slave_arr_work, bond_slave_arr_handler);
3653 3654 3655 3656 3657 3658 3659 3660 3661
}

/* Synchronously cancel every delayed work item owned by the bond. */
static void bond_work_cancel_all(struct bonding *bond)
{
	/* cancelled in array order */
	struct delayed_work *works[] = {
		&bond->mii_work,
		&bond->arp_work,
		&bond->alb_work,
		&bond->ad_work,
		&bond->mcast_work,
		&bond->slave_arr_work,
	};
	unsigned int i;

	for (i = 0; i < ARRAY_SIZE(works); i++)
		cancel_delayed_work_sync(works[i]);
}

L
Linus Torvalds 已提交
3665 3666
static int bond_open(struct net_device *bond_dev)
{
3667
	struct bonding *bond = netdev_priv(bond_dev);
3668
	struct list_head *iter;
3669
	struct slave *slave;
L
Linus Torvalds 已提交
3670

3671
	/* reset slave->backup and slave->inactive */
3672
	if (bond_has_slaves(bond)) {
3673
		bond_for_each_slave(bond, slave, iter) {
3674 3675
			if (bond_uses_primary(bond) &&
			    slave != rcu_access_pointer(bond->curr_active_slave)) {
3676 3677
				bond_set_slave_inactive_flags(slave,
							      BOND_SLAVE_NOTIFY_NOW);
3678
			} else if (BOND_MODE(bond) != BOND_MODE_8023AD) {
3679 3680
				bond_set_slave_active_flags(slave,
							    BOND_SLAVE_NOTIFY_NOW);
3681 3682 3683 3684
			}
		}
	}

3685
	if (bond_is_lb(bond)) {
L
Linus Torvalds 已提交
3686 3687 3688
		/* bond_alb_initialize must be called before the timer
		 * is started.
		 */
3689
		if (bond_alb_initialize(bond, (BOND_MODE(bond) == BOND_MODE_ALB)))
3690
			return -ENOMEM;
3691
		if (bond->params.tlb_dynamic_lb || BOND_MODE(bond) == BOND_MODE_ALB)
3692
			queue_delayed_work(bond->wq, &bond->alb_work, 0);
L
Linus Torvalds 已提交
3693 3694
	}

3695
	if (bond->params.miimon)  /* link check interval, in milliseconds. */
3696
		queue_delayed_work(bond->wq, &bond->mii_work, 0);
L
Linus Torvalds 已提交
3697 3698

	if (bond->params.arp_interval) {  /* arp interval, in milliseconds. */
3699
		queue_delayed_work(bond->wq, &bond->arp_work, 0);
3700
		bond->recv_probe = bond_arp_rcv;
L
Linus Torvalds 已提交
3701 3702
	}

3703
	if (BOND_MODE(bond) == BOND_MODE_8023AD) {
3704
		queue_delayed_work(bond->wq, &bond->ad_work, 0);
L
Linus Torvalds 已提交
3705
		/* register to receive LACPDUs */
3706
		bond->recv_probe = bond_3ad_lacpdu_recv;
3707
		bond_3ad_initiate_agg_selection(bond, 1);
L
Linus Torvalds 已提交
3708 3709
	}

3710
	if (bond_mode_can_use_xmit_hash(bond))
3711 3712
		bond_update_slave_arr(bond, NULL);

L
Linus Torvalds 已提交
3713 3714 3715 3716 3717
	return 0;
}

static int bond_close(struct net_device *bond_dev)
{
3718
	struct bonding *bond = netdev_priv(bond_dev);
L
Linus Torvalds 已提交
3719

3720
	bond_work_cancel_all(bond);
3721
	bond->send_peer_notif = 0;
3722
	if (bond_is_lb(bond))
L
Linus Torvalds 已提交
3723
		bond_alb_deinitialize(bond);
3724
	bond->recv_probe = NULL;
L
Linus Torvalds 已提交
3725 3726 3727 3728

	return 0;
}

E
Eric Dumazet 已提交
3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743
/* fold stats, assuming all rtnl_link_stats64 fields are u64, but
 * that some drivers can provide 32bit values only.
 */
static void bond_fold_stats(struct rtnl_link_stats64 *_res,
			    const struct rtnl_link_stats64 *_new,
			    const struct rtnl_link_stats64 *_old)
{
	const u64 *new = (const u64 *)_new;
	const u64 *old = (const u64 *)_old;
	u64 *res = (u64 *)_res;
	int i;

	for (i = 0; i < sizeof(*_res) / sizeof(u64); i++) {
		u64 nv = new[i];
		u64 ov = old[i];
3744
		s64 delta = nv - ov;
E
Eric Dumazet 已提交
3745 3746 3747

		/* detects if this particular field is 32bit only */
		if (((nv | ov) >> 32) == 0)
3748 3749 3750 3751 3752 3753 3754
			delta = (s64)(s32)((u32)nv - (u32)ov);

		/* filter anomalies, some drivers reset their stats
		 * at down/up events.
		 */
		if (delta > 0)
			res[i] += delta;
E
Eric Dumazet 已提交
3755 3756 3757
	}
}

3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798
#ifdef CONFIG_LOCKDEP
/* Walk the lower-device adjacency lists below @dev depth first, using
 * explicit stacks instead of recursion, and return the deepest nesting
 * level that was reached (0 when @dev has no lower devices).
 */
static int bond_get_lowest_level_rcu(struct net_device *dev)
{
	struct net_device *ldev, *now, *dev_stack[MAX_NEST_DEV + 1];
	struct list_head *iter, *iter_stack[MAX_NEST_DEV + 1];
	int cur = 0, max = 0;

	now = dev;
	iter = &dev->adj_list.lower;

	for (;;) {
		ldev = netdev_next_lower_dev_rcu(now, &iter);
		if (ldev) {
			/* remember where to resume, then descend */
			dev_stack[cur] = now;
			iter_stack[cur] = iter;
			cur++;
			if (cur > max)
				max = cur;

			now = ldev;
			iter = &ldev->adj_list.lower;
			continue;
		}

		/* current level exhausted: done at the root, else pop */
		if (!cur)
			return max;

		cur--;
		now = dev_stack[cur];
		iter = iter_stack[cur];
	}
}
#endif

3799 3800
static void bond_get_stats(struct net_device *bond_dev,
			   struct rtnl_link_stats64 *stats)
L
Linus Torvalds 已提交
3801
{
3802
	struct bonding *bond = netdev_priv(bond_dev);
3803
	struct rtnl_link_stats64 temp;
3804
	struct list_head *iter;
L
Linus Torvalds 已提交
3805
	struct slave *slave;
3806
	int nest_level = 0;
L
Linus Torvalds 已提交
3807 3808


E
Eric Dumazet 已提交
3809
	rcu_read_lock();
3810 3811 3812 3813 3814 3815 3816
#ifdef CONFIG_LOCKDEP
	nest_level = bond_get_lowest_level_rcu(bond_dev);
#endif

	spin_lock_nested(&bond->stats_lock, nest_level);
	memcpy(stats, &bond->bond_stats, sizeof(*stats));

E
Eric Dumazet 已提交
3817 3818
	bond_for_each_slave_rcu(bond, slave, iter) {
		const struct rtnl_link_stats64 *new =
3819
			dev_get_stats(slave->dev, &temp);
E
Eric Dumazet 已提交
3820 3821

		bond_fold_stats(stats, new, &slave->slave_stats);
3822 3823

		/* save off the slave stats for the next run */
E
Eric Dumazet 已提交
3824
		memcpy(&slave->slave_stats, new, sizeof(*new));
3825
	}
E
Eric Dumazet 已提交
3826

3827
	memcpy(&bond->bond_stats, stats, sizeof(*stats));
E
Eric Dumazet 已提交
3828
	spin_unlock(&bond->stats_lock);
3829
	rcu_read_unlock();
L
Linus Torvalds 已提交
3830 3831 3832 3833
}

static int bond_do_ioctl(struct net_device *bond_dev, struct ifreq *ifr, int cmd)
{
3834
	struct bonding *bond = netdev_priv(bond_dev);
L
Linus Torvalds 已提交
3835 3836 3837 3838 3839 3840
	struct net_device *slave_dev = NULL;
	struct ifbond k_binfo;
	struct ifbond __user *u_binfo = NULL;
	struct ifslave k_sinfo;
	struct ifslave __user *u_sinfo = NULL;
	struct mii_ioctl_data *mii = NULL;
3841
	struct bond_opt_value newval;
3842
	struct net *net;
L
Linus Torvalds 已提交
3843 3844
	int res = 0;

3845
	netdev_dbg(bond_dev, "bond_ioctl: cmd=%d\n", cmd);
L
Linus Torvalds 已提交
3846 3847 3848 3849

	switch (cmd) {
	case SIOCGMIIPHY:
		mii = if_mii(ifr);
S
Stephen Hemminger 已提交
3850
		if (!mii)
L
Linus Torvalds 已提交
3851
			return -EINVAL;
S
Stephen Hemminger 已提交
3852

L
Linus Torvalds 已提交
3853
		mii->phy_id = 0;
3854
		fallthrough;
L
Linus Torvalds 已提交
3855
	case SIOCGMIIREG:
3856
		/* We do this again just in case we were called by SIOCGMIIREG
L
Linus Torvalds 已提交
3857 3858 3859
		 * instead of SIOCGMIIPHY.
		 */
		mii = if_mii(ifr);
S
Stephen Hemminger 已提交
3860
		if (!mii)
L
Linus Torvalds 已提交
3861
			return -EINVAL;
S
Stephen Hemminger 已提交
3862

L
Linus Torvalds 已提交
3863 3864
		if (mii->reg_num == 1) {
			mii->val_out = 0;
S
Stephen Hemminger 已提交
3865
			if (netif_carrier_ok(bond->dev))
L
Linus Torvalds 已提交
3866 3867 3868 3869 3870 3871 3872 3873
				mii->val_out = BMSR_LSTATUS;
		}

		return 0;
	case BOND_INFO_QUERY_OLD:
	case SIOCBONDINFOQUERY:
		u_binfo = (struct ifbond __user *)ifr->ifr_data;

S
Stephen Hemminger 已提交
3874
		if (copy_from_user(&k_binfo, u_binfo, sizeof(ifbond)))
L
Linus Torvalds 已提交
3875 3876
			return -EFAULT;

3877 3878
		bond_info_query(bond_dev, &k_binfo);
		if (copy_to_user(u_binfo, &k_binfo, sizeof(ifbond)))
S
Stephen Hemminger 已提交
3879
			return -EFAULT;
L
Linus Torvalds 已提交
3880

3881
		return 0;
L
Linus Torvalds 已提交
3882 3883 3884 3885
	case BOND_SLAVE_INFO_QUERY_OLD:
	case SIOCBONDSLAVEINFOQUERY:
		u_sinfo = (struct ifslave __user *)ifr->ifr_data;

S
Stephen Hemminger 已提交
3886
		if (copy_from_user(&k_sinfo, u_sinfo, sizeof(ifslave)))
L
Linus Torvalds 已提交
3887 3888 3889
			return -EFAULT;

		res = bond_slave_info_query(bond_dev, &k_sinfo);
S
Stephen Hemminger 已提交
3890 3891 3892
		if (res == 0 &&
		    copy_to_user(u_sinfo, &k_sinfo, sizeof(ifslave)))
			return -EFAULT;
L
Linus Torvalds 已提交
3893 3894 3895 3896 3897 3898

		return res;
	default:
		break;
	}

3899 3900 3901
	net = dev_net(bond_dev);

	if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
L
Linus Torvalds 已提交
3902 3903
		return -EPERM;

3904
	slave_dev = __dev_get_by_name(net, ifr->ifr_slave);
L
Linus Torvalds 已提交
3905

3906
	slave_dbg(bond_dev, slave_dev, "slave_dev=%p:\n", slave_dev);
L
Linus Torvalds 已提交
3907

S
Stephen Hemminger 已提交
3908
	if (!slave_dev)
3909
		return -ENODEV;
L
Linus Torvalds 已提交
3910

3911 3912 3913
	switch (cmd) {
	case BOND_ENSLAVE_OLD:
	case SIOCBONDENSLAVE:
D
David Ahern 已提交
3914
		res = bond_enslave(bond_dev, slave_dev, NULL);
3915 3916 3917 3918 3919 3920 3921
		break;
	case BOND_RELEASE_OLD:
	case SIOCBONDRELEASE:
		res = bond_release(bond_dev, slave_dev);
		break;
	case BOND_SETHWADDR_OLD:
	case SIOCBONDSETHWADDR:
3922
		res = bond_set_dev_addr(bond_dev, slave_dev);
3923 3924 3925
		break;
	case BOND_CHANGE_ACTIVE_OLD:
	case SIOCBONDCHANGEACTIVE:
3926
		bond_opt_initstr(&newval, slave_dev->name);
3927 3928
		res = __bond_opt_set_notify(bond, BOND_OPT_ACTIVE_SLAVE,
					    &newval);
3929 3930 3931
		break;
	default:
		res = -EOPNOTSUPP;
L
Linus Torvalds 已提交
3932 3933 3934 3935 3936
	}

	return res;
}

3937
static void bond_change_rx_flags(struct net_device *bond_dev, int change)
L
Linus Torvalds 已提交
3938
{
3939
	struct bonding *bond = netdev_priv(bond_dev);
L
Linus Torvalds 已提交
3940

3941 3942 3943
	if (change & IFF_PROMISC)
		bond_set_promiscuity(bond,
				     bond_dev->flags & IFF_PROMISC ? 1 : -1);
S
Stephen Hemminger 已提交
3944

3945 3946 3947 3948
	if (change & IFF_ALLMULTI)
		bond_set_allmulti(bond,
				  bond_dev->flags & IFF_ALLMULTI ? 1 : -1);
}
L
Linus Torvalds 已提交
3949

3950
static void bond_set_rx_mode(struct net_device *bond_dev)
3951 3952
{
	struct bonding *bond = netdev_priv(bond_dev);
3953
	struct list_head *iter;
3954
	struct slave *slave;
L
Linus Torvalds 已提交
3955

3956
	rcu_read_lock();
3957
	if (bond_uses_primary(bond)) {
3958
		slave = rcu_dereference(bond->curr_active_slave);
3959 3960 3961 3962 3963
		if (slave) {
			dev_uc_sync(slave->dev, bond_dev);
			dev_mc_sync(slave->dev, bond_dev);
		}
	} else {
3964
		bond_for_each_slave_rcu(bond, slave, iter) {
3965 3966 3967
			dev_uc_sync_multiple(slave->dev, bond_dev);
			dev_mc_sync_multiple(slave->dev, bond_dev);
		}
L
Linus Torvalds 已提交
3968
	}
3969
	rcu_read_unlock();
L
Linus Torvalds 已提交
3970 3971
}

3972
static int bond_neigh_init(struct neighbour *n)
3973
{
3974 3975 3976
	struct bonding *bond = netdev_priv(n->dev);
	const struct net_device_ops *slave_ops;
	struct neigh_parms parms;
3977
	struct slave *slave;
E
Eric Dumazet 已提交
3978
	int ret = 0;
3979

E
Eric Dumazet 已提交
3980 3981
	rcu_read_lock();
	slave = bond_first_slave_rcu(bond);
3982
	if (!slave)
E
Eric Dumazet 已提交
3983
		goto out;
3984
	slave_ops = slave->dev->netdev_ops;
3985
	if (!slave_ops->ndo_neigh_setup)
E
Eric Dumazet 已提交
3986
		goto out;
3987

E
Eric Dumazet 已提交
3988 3989 3990 3991 3992 3993 3994 3995 3996
	/* TODO: find another way [1] to implement this.
	 * Passing a zeroed structure is fragile,
	 * but at least we do not pass garbage.
	 *
	 * [1] One way would be that ndo_neigh_setup() never touch
	 *     struct neigh_parms, but propagate the new neigh_setup()
	 *     back to ___neigh_create() / neigh_parms_alloc()
	 */
	memset(&parms, 0, sizeof(parms));
3997 3998
	ret = slave_ops->ndo_neigh_setup(slave->dev, &parms);

E
Eric Dumazet 已提交
3999 4000
	if (ret)
		goto out;
4001

E
Eric Dumazet 已提交
4002 4003 4004 4005 4006
	if (parms.neigh_setup)
		ret = parms.neigh_setup(n);
out:
	rcu_read_unlock();
	return ret;
4007 4008
}

4009
/* The bonding ndo_neigh_setup is called at init time beofre any
4010 4011
 * slave exists. So we must declare proxy setup function which will
 * be used at run time to resolve the actual slave neigh param setup.
4012 4013 4014 4015
 *
 * It's also called by master devices (such as vlans) to setup their
 * underlying devices. In that case - do nothing, we're already set up from
 * our init.
4016 4017 4018 4019
 */
static int bond_neigh_setup(struct net_device *dev,
			    struct neigh_parms *parms)
{
4020 4021 4022
	/* modify only our neigh_parms */
	if (parms->dev == dev)
		parms->neigh_setup = bond_neigh_init;
4023 4024 4025 4026

	return 0;
}

4027
/* Change the MTU of all of a master's slaves to match the master */
L
Linus Torvalds 已提交
4028 4029
static int bond_change_mtu(struct net_device *bond_dev, int new_mtu)
{
4030
	struct bonding *bond = netdev_priv(bond_dev);
4031
	struct slave *slave, *rollback_slave;
4032
	struct list_head *iter;
L
Linus Torvalds 已提交
4033 4034
	int res = 0;

4035
	netdev_dbg(bond_dev, "bond=%p, new_mtu=%d\n", bond, new_mtu);
L
Linus Torvalds 已提交
4036

4037
	bond_for_each_slave(bond, slave, iter) {
4038
		slave_dbg(bond_dev, slave->dev, "s %p c_m %p\n",
4039
			   slave, slave->dev->netdev_ops->ndo_change_mtu);
4040

L
Linus Torvalds 已提交
4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051
		res = dev_set_mtu(slave->dev, new_mtu);

		if (res) {
			/* If we failed to set the slave's mtu to the new value
			 * we must abort the operation even in ACTIVE_BACKUP
			 * mode, because if we allow the backup slaves to have
			 * different mtu values than the active slave we'll
			 * need to change their mtu when doing a failover. That
			 * means changing their mtu from timer context, which
			 * is probably not a good idea.
			 */
4052 4053
			slave_dbg(bond_dev, slave->dev, "err %d setting mtu to %d\n",
				  res, new_mtu);
L
Linus Torvalds 已提交
4054 4055 4056 4057 4058 4059 4060 4061 4062 4063
			goto unwind;
		}
	}

	bond_dev->mtu = new_mtu;

	return 0;

unwind:
	/* unwind from head to the slave that failed */
4064
	bond_for_each_slave(bond, rollback_slave, iter) {
L
Linus Torvalds 已提交
4065 4066
		int tmp_res;

4067 4068 4069 4070
		if (rollback_slave == slave)
			break;

		tmp_res = dev_set_mtu(rollback_slave->dev, bond_dev->mtu);
4071 4072 4073
		if (tmp_res)
			slave_dbg(bond_dev, rollback_slave->dev, "unwind err %d\n",
				  tmp_res);
L
Linus Torvalds 已提交
4074 4075 4076 4077 4078
	}

	return res;
}

4079
/* Change HW address
L
Linus Torvalds 已提交
4080 4081 4082 4083 4084 4085 4086
 *
 * Note that many devices must be down to change the HW address, and
 * downing the master releases all slaves.  We can make bonds full of
 * bonding devices to test this, however.
 */
static int bond_set_mac_address(struct net_device *bond_dev, void *addr)
{
4087
	struct bonding *bond = netdev_priv(bond_dev);
4088
	struct slave *slave, *rollback_slave;
4089
	struct sockaddr_storage *ss = addr, tmp_ss;
4090
	struct list_head *iter;
L
Linus Torvalds 已提交
4091 4092
	int res = 0;

4093
	if (BOND_MODE(bond) == BOND_MODE_ALB)
4094 4095 4096
		return bond_alb_set_mac_address(bond_dev, addr);


4097
	netdev_dbg(bond_dev, "%s: bond=%p\n", __func__, bond);
L
Linus Torvalds 已提交
4098

4099 4100
	/* If fail_over_mac is enabled, do nothing and return success.
	 * Returning an error causes ifenslave to fail.
4101
	 */
4102
	if (bond->params.fail_over_mac &&
4103
	    BOND_MODE(bond) == BOND_MODE_ACTIVEBACKUP)
4104
		return 0;
4105

4106
	if (!is_valid_ether_addr(ss->__data))
L
Linus Torvalds 已提交
4107 4108
		return -EADDRNOTAVAIL;

4109
	bond_for_each_slave(bond, slave, iter) {
4110 4111
		slave_dbg(bond_dev, slave->dev, "%s: slave=%p\n",
			  __func__, slave);
4112
		res = dev_set_mac_address(slave->dev, addr, NULL);
L
Linus Torvalds 已提交
4113 4114 4115 4116 4117 4118 4119
		if (res) {
			/* TODO: consider downing the slave
			 * and retry ?
			 * User should expect communications
			 * breakage anyway until ARP finish
			 * updating, so...
			 */
4120 4121
			slave_dbg(bond_dev, slave->dev, "%s: err %d\n",
				  __func__, res);
L
Linus Torvalds 已提交
4122 4123 4124 4125 4126
			goto unwind;
		}
	}

	/* success */
4127
	memcpy(bond_dev->dev_addr, ss->__data, bond_dev->addr_len);
L
Linus Torvalds 已提交
4128 4129 4130
	return 0;

unwind:
4131 4132
	memcpy(tmp_ss.__data, bond_dev->dev_addr, bond_dev->addr_len);
	tmp_ss.ss_family = bond_dev->type;
L
Linus Torvalds 已提交
4133 4134

	/* unwind from head to the slave that failed */
4135
	bond_for_each_slave(bond, rollback_slave, iter) {
L
Linus Torvalds 已提交
4136 4137
		int tmp_res;

4138 4139 4140
		if (rollback_slave == slave)
			break;

4141
		tmp_res = dev_set_mac_address(rollback_slave->dev,
4142
					      (struct sockaddr *)&tmp_ss, NULL);
L
Linus Torvalds 已提交
4143
		if (tmp_res) {
4144 4145
			slave_dbg(bond_dev, rollback_slave->dev, "%s: unwind err %d\n",
				   __func__, tmp_res);
L
Linus Torvalds 已提交
4146 4147 4148 4149 4150 4151
		}
	}

	return res;
}

4152
/**
4153
 * bond_get_slave_by_id - get xmit slave with slave_id
4154 4155 4156
 * @bond: bonding device that is transmitting
 * @slave_id: slave id up to slave_cnt-1 through which to transmit
 *
4157
 * This function tries to get slave with slave_id but in case
4158 4159
 * it fails, it tries to find the first available slave for transmission.
 */
4160 4161
static struct slave *bond_get_slave_by_id(struct bonding *bond,
					  int slave_id)
4162
{
4163
	struct list_head *iter;
4164 4165 4166 4167
	struct slave *slave;
	int i = slave_id;

	/* Here we start from the slave with slave_id */
4168
	bond_for_each_slave_rcu(bond, slave, iter) {
4169
		if (--i < 0) {
4170
			if (bond_slave_can_tx(slave))
4171
				return slave;
4172 4173 4174 4175 4176
		}
	}

	/* Here we start from the first slave up to slave_id */
	i = slave_id;
4177
	bond_for_each_slave_rcu(bond, slave, iter) {
4178 4179
		if (--i < 0)
			break;
4180
		if (bond_slave_can_tx(slave))
4181
			return slave;
4182 4183
	}
	/* no slave that can tx has been found */
4184
	return NULL;
4185 4186
}

4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197
/**
 * bond_rr_gen_slave_id - generate slave id based on packets_per_slave
 * @bond: bonding device to use
 *
 * Based on the value of the bonding device's packets_per_slave parameter
 * this function generates a slave id, which is usually used as the next
 * slave to transmit through.
 *
 * packets_per_slave == 0 yields a random id, 1 yields the tx counter
 * itself, and any other value yields the tx counter divided by
 * packets_per_slave (via the precomputed reciprocal).  The tx counter is
 * incremented on every call.
 */
static u32 bond_rr_gen_slave_id(struct bonding *bond)
{
	const int packets_per_slave = bond->params.packets_per_slave;
	u32 slave_id;

	if (packets_per_slave == 0)
		slave_id = prandom_u32();
	else if (packets_per_slave == 1)
		slave_id = bond->rr_tx_counter;
	else
		slave_id = reciprocal_divide(bond->rr_tx_counter,
					     bond->params.reciprocal_packets_per_slave);

	bond->rr_tx_counter++;

	return slave_id;
}

4220 4221
static struct slave *bond_xmit_roundrobin_slave_get(struct bonding *bond,
						    struct sk_buff *skb)
L
Linus Torvalds 已提交
4222
{
4223
	struct slave *slave;
4224
	int slave_cnt;
4225
	u32 slave_id;
L
Linus Torvalds 已提交
4226

4227
	/* Start with the curr_active_slave that joined the bond as the
4228 4229 4230 4231
	 * default for sending IGMP traffic.  For failover purposes one
	 * needs to maintain some consistency for the interface that will
	 * send the join/membership reports.  The curr_active_slave found
	 * will send all of this type of traffic.
4232
	 */
4233 4234 4235
	if (skb->protocol == htons(ETH_P_IP)) {
		int noff = skb_network_offset(skb);
		struct iphdr *iph;
4236

4237 4238 4239 4240 4241 4242 4243
		if (unlikely(!pskb_may_pull(skb, noff + sizeof(*iph))))
			goto non_igmp;

		iph = ip_hdr(skb);
		if (iph->protocol == IPPROTO_IGMP) {
			slave = rcu_dereference(bond->curr_active_slave);
			if (slave)
4244 4245
				return slave;
			return bond_get_slave_by_id(bond, 0);
4246
		}
L
Linus Torvalds 已提交
4247
	}
4248

4249 4250 4251
non_igmp:
	slave_cnt = READ_ONCE(bond->slave_cnt);
	if (likely(slave_cnt)) {
4252 4253
		slave_id = bond_rr_gen_slave_id(bond) % slave_cnt;
		return bond_get_slave_by_id(bond, slave_id);
4254
	}
4255 4256 4257 4258 4259 4260 4261 4262 4263 4264
	return NULL;
}

static netdev_tx_t bond_xmit_roundrobin(struct sk_buff *skb,
					struct net_device *bond_dev)
{
	struct bonding *bond = netdev_priv(bond_dev);
	struct slave *slave;

	slave = bond_xmit_roundrobin_slave_get(bond, skb);
4265 4266 4267
	if (likely(slave))
		return bond_dev_queue_xmit(bond, skb, slave->dev);

4268
	return bond_tx_drop(bond_dev, skb);
L
Linus Torvalds 已提交
4269 4270
}

4271 4272 4273 4274 4275 4276
/* Active-backup slave selection: the current active slave (may be NULL).
 * @skb is not used for the choice.
 */
static struct slave *bond_xmit_activebackup_slave_get(struct bonding *bond,
						      struct sk_buff *skb)
{
	struct slave *active = rcu_dereference(bond->curr_active_slave);

	return active;
}

4277
/* In active-backup mode, we know that bond->curr_active_slave is always valid if
L
Linus Torvalds 已提交
4278 4279
 * the bond has a usable interface.
 */
4280 4281
static netdev_tx_t bond_xmit_activebackup(struct sk_buff *skb,
					  struct net_device *bond_dev)
L
Linus Torvalds 已提交
4282
{
4283
	struct bonding *bond = netdev_priv(bond_dev);
4284
	struct slave *slave;
L
Linus Torvalds 已提交
4285

4286
	slave = bond_xmit_activebackup_slave_get(bond, skb);
4287
	if (slave)
4288
		return bond_dev_queue_xmit(bond, skb, slave->dev);
4289

4290
	return bond_tx_drop(bond_dev, skb);
L
Linus Torvalds 已提交
4291 4292
}

4293 4294 4295
/* Use this to update slave_array when (a) it's not appropriate to update
 * slave_array right away (note that update_slave_array() may sleep)
 * and / or (b) RTNL is not held.
L
Linus Torvalds 已提交
4296
 */
4297
void bond_slave_arr_work_rearm(struct bonding *bond, unsigned long delay)
L
Linus Torvalds 已提交
4298
{
4299 4300
	queue_delayed_work(bond->wq, &bond->slave_arr_work, delay);
}
L
Linus Torvalds 已提交
4301

4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323
/* Slave array work handler. Holds only RTNL.
 *
 * Rebuilds the slave arrays when RTNL can be taken without blocking.
 * If the lock is busy or the rebuild fails, the work is re-armed with a
 * delay of 1.
 */
static void bond_slave_arr_handler(struct work_struct *work)
{
	struct bonding *bond = container_of(work, struct bonding,
					    slave_arr_work.work);
	int ret;

	if (rtnl_trylock()) {
		ret = bond_update_slave_arr(bond, NULL);
		rtnl_unlock();
		if (!ret)
			return;

		pr_warn_ratelimited("Failed to update slave array from WT\n");
	}

	/* try again later */
	bond_slave_arr_work_rearm(bond, 1);
}

4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346
/* Remove @skipslave from the array @slaves in place, if it is present.
 * A NULL @slaves is accepted and ignored.
 */
static void bond_skip_slave(struct bond_up_slave *slaves,
			    struct slave *skipslave)
{
	int idx;

	if (!slaves)
		return;

	/* Rare situation where caller has asked to skip a specific
	 * slave but allocation failed (most likely!). BTW this is
	 * only possible when the call is initiated from
	 * __bond_release_one(). In this situation; overwrite the
	 * skipslave entry in the array with the last entry from the
	 * array to avoid a situation where the xmit path may choose
	 * this to-be-skipped slave to send a packet out.
	 */
	for (idx = 0; idx < slaves->count; idx++) {
		if (slaves->arr[idx] != skipslave)
			continue;

		slaves->arr[idx] = slaves->arr[slaves->count - 1];
		slaves->count--;
		return;
	}
}

M
Maor Gottlieb 已提交
4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378
/* Publish new usable/all slave arrays and free the previous ones through
 * kfree_rcu().  The old pointers are read with rtnl_dereference().
 */
static void bond_set_slave_arr(struct bonding *bond,
			       struct bond_up_slave *usable_slaves,
			       struct bond_up_slave *all_slaves)
{
	struct bond_up_slave *old_usable, *old_all;

	old_usable = rtnl_dereference(bond->usable_slaves);
	rcu_assign_pointer(bond->usable_slaves, usable_slaves);
	kfree_rcu(old_usable, rcu);

	old_all = rtnl_dereference(bond->all_slaves);
	rcu_assign_pointer(bond->all_slaves, all_slaves);
	kfree_rcu(old_all, rcu);
}

/* Clear both published slave arrays, freeing each existing one through
 * kfree_rcu().  The old pointers are read with rtnl_dereference().
 */
static void bond_reset_slave_arr(struct bonding *bond)
{
	struct bond_up_slave *old_usable, *old_all;

	old_usable = rtnl_dereference(bond->usable_slaves);
	if (old_usable) {
		RCU_INIT_POINTER(bond->usable_slaves, NULL);
		kfree_rcu(old_usable, rcu);
	}

	old_all = rtnl_dereference(bond->all_slaves);
	if (old_all) {
		RCU_INIT_POINTER(bond->all_slaves, NULL);
		kfree_rcu(old_all, rcu);
	}
}

4379 4380 4381 4382
/* Build the usable slaves array in control path for modes that use xmit-hash
 * to determine the slave interface -
 * (a) BOND_MODE_8023AD
 * (b) BOND_MODE_XOR
4383
 * (c) (BOND_MODE_TLB || BOND_MODE_ALB) && tlb_dynamic_lb == 0
4384 4385 4386 4387 4388
 *
 * The caller is expected to hold RTNL only and NO other lock!
 */
int bond_update_slave_arr(struct bonding *bond, struct slave *skipslave)
{
M
Maor Gottlieb 已提交
4389
	struct bond_up_slave *usable_slaves = NULL, *all_slaves = NULL;
4390 4391 4392 4393 4394
	struct slave *slave;
	struct list_head *iter;
	int agg_id = 0;
	int ret = 0;

4395
	might_sleep();
4396

4397 4398
	usable_slaves = kzalloc(struct_size(usable_slaves, arr,
					    bond->slave_cnt), GFP_KERNEL);
M
Maor Gottlieb 已提交
4399 4400 4401
	all_slaves = kzalloc(struct_size(all_slaves, arr,
					 bond->slave_cnt), GFP_KERNEL);
	if (!usable_slaves || !all_slaves) {
4402 4403 4404 4405 4406 4407
		ret = -ENOMEM;
		goto out;
	}
	if (BOND_MODE(bond) == BOND_MODE_8023AD) {
		struct ad_info ad_info;

4408
		spin_lock_bh(&bond->mode_lock);
4409
		if (bond_3ad_get_active_agg_info(bond, &ad_info)) {
4410
			spin_unlock_bh(&bond->mode_lock);
4411 4412 4413 4414
			pr_debug("bond_3ad_get_active_agg_info failed\n");
			/* No active aggragator means it's not safe to use
			 * the previous array.
			 */
M
Maor Gottlieb 已提交
4415
			bond_reset_slave_arr(bond);
4416 4417
			goto out;
		}
4418
		spin_unlock_bh(&bond->mode_lock);
4419 4420 4421
		agg_id = ad_info.aggregator_id;
	}
	bond_for_each_slave(bond, slave, iter) {
M
Maor Gottlieb 已提交
4422 4423 4424 4425
		if (skipslave == slave)
			continue;

		all_slaves->arr[all_slaves->count++] = slave;
4426 4427 4428 4429 4430 4431 4432 4433 4434
		if (BOND_MODE(bond) == BOND_MODE_8023AD) {
			struct aggregator *agg;

			agg = SLAVE_AD_INFO(slave)->port.aggregator;
			if (!agg || agg->aggregator_identifier != agg_id)
				continue;
		}
		if (!bond_slave_can_tx(slave))
			continue;
4435

4436
		slave_dbg(bond->dev, slave->dev, "Adding slave to tx hash array[%d]\n",
4437
			  usable_slaves->count);
4438

4439
		usable_slaves->arr[usable_slaves->count++] = slave;
4440 4441
	}

M
Maor Gottlieb 已提交
4442 4443
	bond_set_slave_arr(bond, usable_slaves, all_slaves);
	return ret;
4444 4445
out:
	if (ret != 0 && skipslave) {
M
Maor Gottlieb 已提交
4446 4447
		bond_skip_slave(rtnl_dereference(bond->all_slaves),
				skipslave);
4448 4449
		bond_skip_slave(rtnl_dereference(bond->usable_slaves),
				skipslave);
4450
	}
M
Maor Gottlieb 已提交
4451 4452
	kfree_rcu(all_slaves, rcu);
	kfree_rcu(usable_slaves, rcu);
4453

4454 4455 4456
	return ret;
}

4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473
/* Select a slave from @slaves by the xmit hash of @skb.
 * Returns NULL when @slaves is NULL or empty.
 */
static struct slave *bond_xmit_3ad_xor_slave_get(struct bonding *bond,
						 struct sk_buff *skb,
						 struct bond_up_slave *slaves)
{
	/* the hash is always computed first, even if no slave is usable */
	const u32 hash = bond_xmit_hash(bond, skb);
	unsigned int count;

	if (!slaves)
		return NULL;

	count = READ_ONCE(slaves->count);
	if (unlikely(!count))
		return NULL;

	return slaves->arr[hash % count];
}

4474 4475 4476 4477
/* Use this Xmit function for 3AD as well as XOR modes. The current
 * usable slave array is formed in the control path. The xmit function
 * just calculates hash and sends the packet out.
 */
4478 4479
static netdev_tx_t bond_3ad_xor_xmit(struct sk_buff *skb,
				     struct net_device *dev)
4480 4481 4482
{
	struct bonding *bond = netdev_priv(dev);
	struct bond_up_slave *slaves;
4483
	struct slave *slave;
4484

4485
	slaves = rcu_dereference(bond->usable_slaves);
4486 4487
	slave = bond_xmit_3ad_xor_slave_get(bond, skb, slaves);
	if (likely(slave))
4488
		return bond_dev_queue_xmit(bond, skb, slave->dev);
4489

4490
	return bond_tx_drop(dev, skb);
L
Linus Torvalds 已提交
4491 4492
}

4493
/* in broadcast mode, we send everything to all usable interfaces. */
4494 4495
static netdev_tx_t bond_xmit_broadcast(struct sk_buff *skb,
				       struct net_device *bond_dev)
L
Linus Torvalds 已提交
4496
{
4497
	struct bonding *bond = netdev_priv(bond_dev);
4498
	struct slave *slave = NULL;
4499
	struct list_head *iter;
L
Linus Torvalds 已提交
4500

4501
	bond_for_each_slave_rcu(bond, slave, iter) {
4502 4503
		if (bond_is_last_slave(bond, slave))
			break;
4504
		if (bond_slave_is_up(slave) && slave->link == BOND_LINK_UP) {
4505
			struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
L
Linus Torvalds 已提交
4506

4507
			if (!skb2) {
4508 4509
				net_err_ratelimited("%s: Error: %s: skb_clone() failed\n",
						    bond_dev->name, __func__);
4510
				continue;
L
Linus Torvalds 已提交
4511
			}
4512
			bond_dev_queue_xmit(bond, skb2, slave->dev);
L
Linus Torvalds 已提交
4513 4514
		}
	}
4515
	if (slave && bond_slave_is_up(slave) && slave->link == BOND_LINK_UP)
4516
		return bond_dev_queue_xmit(bond, skb, slave->dev);
S
Stephen Hemminger 已提交
4517

4518
	return bond_tx_drop(bond_dev, skb);
L
Linus Torvalds 已提交
4519 4520 4521 4522
}

/*------------------------- Device initialization ---------------------------*/

4523
/* Lookup the slave that corresponds to a qid */
4524 4525 4526 4527
static inline int bond_slave_override(struct bonding *bond,
				      struct sk_buff *skb)
{
	struct slave *slave = NULL;
4528
	struct list_head *iter;
4529

4530
	if (!skb_rx_queue_recorded(skb))
4531
		return 1;
4532 4533

	/* Find out if any slaves have the same mapping as this skb. */
4534
	bond_for_each_slave_rcu(bond, slave, iter) {
4535
		if (slave->queue_id == skb_get_queue_mapping(skb)) {
4536 4537
			if (bond_slave_is_up(slave) &&
			    slave->link == BOND_LINK_UP) {
4538 4539 4540 4541
				bond_dev_queue_xmit(bond, skb, slave->dev);
				return 0;
			}
			/* If the slave isn't UP, use default transmit policy. */
4542 4543 4544 4545
			break;
		}
	}

4546
	return 1;
4547 4548
}

4549

4550
static u16 bond_select_queue(struct net_device *dev, struct sk_buff *skb,
4551
			     struct net_device *sb_dev)
4552
{
4553
	/* This helper function exists to help dev_pick_tx get the correct
P
Phil Oester 已提交
4554
	 * destination queue.  Using a helper function skips a call to
4555 4556 4557
	 * skb_tx_hash and will put the skbs in the queue we expect on their
	 * way down to the bonding driver.
	 */
P
Phil Oester 已提交
4558 4559
	u16 txq = skb_rx_queue_recorded(skb) ? skb_get_rx_queue(skb) : 0;

4560
	/* Save the original txq to restore before passing to the driver */
4561
	qdisc_skb_cb(skb)->slave_dev_queue_mapping = skb_get_queue_mapping(skb);
4562

P
Phil Oester 已提交
4563
	if (unlikely(txq >= dev->real_num_tx_queues)) {
4564
		do {
P
Phil Oester 已提交
4565
			txq -= dev->real_num_tx_queues;
4566
		} while (txq >= dev->real_num_tx_queues);
P
Phil Oester 已提交
4567 4568
	}
	return txq;
4569 4570
}

4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612
/* Return the slave device the current bonding mode would pick to transmit
 * @skb, or NULL when no slave is selected (always the case in broadcast
 * mode and for an unknown mode).
 *
 * For 802.3ad and XOR modes, @all_slaves selects whether the choice is made
 * from bond->all_slaves or from bond->usable_slaves.
 */
static struct net_device *bond_xmit_get_slave(struct net_device *master_dev,
					      struct sk_buff *skb,
					      bool all_slaves)
{
	struct bonding *bond = netdev_priv(master_dev);
	struct bond_up_slave *candidates;
	struct slave *chosen = NULL;

	switch (BOND_MODE(bond)) {
	case BOND_MODE_ROUNDROBIN:
		chosen = bond_xmit_roundrobin_slave_get(bond, skb);
		break;
	case BOND_MODE_ACTIVEBACKUP:
		chosen = bond_xmit_activebackup_slave_get(bond, skb);
		break;
	case BOND_MODE_8023AD:
	case BOND_MODE_XOR:
		candidates = all_slaves ?
			     rcu_dereference(bond->all_slaves) :
			     rcu_dereference(bond->usable_slaves);
		chosen = bond_xmit_3ad_xor_slave_get(bond, skb, candidates);
		break;
	case BOND_MODE_BROADCAST:
		/* No single slave to report */
		break;
	case BOND_MODE_ALB:
		chosen = bond_xmit_alb_slave_get(bond, skb);
		break;
	case BOND_MODE_TLB:
		chosen = bond_xmit_tlb_slave_get(bond, skb);
		break;
	default:
		/* Should never happen, mode already checked */
		WARN_ONCE(true, "Unknown bonding mode");
		break;
	}

	return chosen ? chosen->dev : NULL;
}

4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691
static void bond_sk_to_flow(struct sock *sk, struct flow_keys *flow)
{
	switch (sk->sk_family) {
#if IS_ENABLED(CONFIG_IPV6)
	case AF_INET6:
		if (sk->sk_ipv6only ||
		    ipv6_addr_type(&sk->sk_v6_daddr) != IPV6_ADDR_MAPPED) {
			flow->control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
			flow->addrs.v6addrs.src = inet6_sk(sk)->saddr;
			flow->addrs.v6addrs.dst = sk->sk_v6_daddr;
			break;
		}
		fallthrough;
#endif
	default: /* AF_INET */
		flow->control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
		flow->addrs.v4addrs.src = inet_sk(sk)->inet_rcv_saddr;
		flow->addrs.v4addrs.dst = inet_sk(sk)->inet_daddr;
		break;
	}

	flow->ports.src = inet_sk(sk)->inet_sport;
	flow->ports.dst = inet_sk(sk)->inet_dport;
}

/**
 * bond_sk_hash_l34 - generate a hash value based on the socket's L3 and L4 fields
 * @sk: socket to use for headers
 *
 * This function will extract the necessary field from the socket and use
 * them to generate a hash based on the LAYER34 xmit_policy.
 * Assumes that sk is a TCP or UDP socket.
 */
static u32 bond_sk_hash_l34(struct sock *sk)
{
	struct flow_keys flow;
	u32 hash;

	bond_sk_to_flow(sk, &flow);

	/* L4 */
	memcpy(&hash, &flow.ports.ports, sizeof(hash));
	/* L3 */
	return bond_ip_hash(hash, &flow);
}

/* Select the slave device that socket @sk hashes to among the bond's
 * usable slaves.  Returns NULL when there is no usable-slave array or it
 * is empty.  Uses rcu_dereference(), so the caller is expected to be in an
 * RCU read-side section.
 */
static struct net_device *__bond_sk_get_lower_dev(struct bonding *bond,
						  struct sock *sk)
{
	struct bond_up_slave *usable = rcu_dereference(bond->usable_slaves);
	unsigned int nr_slaves = 0;

	if (usable)
		nr_slaves = READ_ONCE(usable->count);
	if (unlikely(!nr_slaves))
		return NULL;

	return usable->arr[bond_sk_hash_l34(sk) % nr_slaves]->dev;
}

/* ndo_sk_get_lower_dev handler: return the slave device that socket @sk
 * maps to, or NULL when bond_sk_check() rejects the bond's configuration
 * or no slave can be selected.  The lookup runs under rcu_read_lock().
 */
static struct net_device *bond_sk_get_lower_dev(struct net_device *dev,
						struct sock *sk)
{
	struct bonding *bond = netdev_priv(dev);
	struct net_device *lower;

	rcu_read_lock();
	lower = bond_sk_check(bond) ? __bond_sk_get_lower_dev(bond, sk) : NULL;
	rcu_read_unlock();

	return lower;
}

4692 4693 4694 4695 4696 4697 4698 4699 4700 4701
#if IS_ENABLED(CONFIG_TLS_DEVICE)
/* Transmit a TLS-offloaded skb through the device recorded in the
 * socket's TLS context, provided that device is currently a slave of
 * @bond; otherwise the skb is dropped via bond_tx_drop().
 *
 * The context's netdev pointer is read exactly once so that the device
 * that was validated is the very same device the skb is queued on.
 */
static netdev_tx_t bond_tls_device_xmit(struct bonding *bond, struct sk_buff *skb,
					struct net_device *dev)
{
	struct net_device *tls_netdev = READ_ONCE(tls_get_ctx(skb->sk)->netdev);

	if (likely(bond_get_slave_by_dev(bond, tls_netdev)))
		return bond_dev_queue_xmit(bond, skb, tls_netdev);
	return bond_tx_drop(dev, skb);
}
#endif

4702
static netdev_tx_t __bond_start_xmit(struct sk_buff *skb, struct net_device *dev)
4703
{
4704 4705
	struct bonding *bond = netdev_priv(dev);

4706 4707 4708
	if (bond_should_override_tx_queue(bond) &&
	    !bond_slave_override(bond, skb))
		return NETDEV_TX_OK;
4709

4710 4711 4712 4713 4714
#if IS_ENABLED(CONFIG_TLS_DEVICE)
	if (skb->sk && tls_is_sk_tx_device_offloaded(skb->sk))
		return bond_tls_device_xmit(bond, skb, dev);
#endif

4715
	switch (BOND_MODE(bond)) {
4716 4717 4718 4719
	case BOND_MODE_ROUNDROBIN:
		return bond_xmit_roundrobin(skb, dev);
	case BOND_MODE_ACTIVEBACKUP:
		return bond_xmit_activebackup(skb, dev);
4720
	case BOND_MODE_8023AD:
4721
	case BOND_MODE_XOR:
4722
		return bond_3ad_xor_xmit(skb, dev);
4723 4724 4725 4726
	case BOND_MODE_BROADCAST:
		return bond_xmit_broadcast(skb, dev);
	case BOND_MODE_ALB:
		return bond_alb_xmit(skb, dev);
4727 4728
	case BOND_MODE_TLB:
		return bond_tlb_xmit(skb, dev);
4729 4730
	default:
		/* Should never happen, mode already checked */
4731
		netdev_err(dev, "Unknown bonding mode %d\n", BOND_MODE(bond));
4732
		WARN_ON_ONCE(1);
4733
		return bond_tx_drop(dev, skb);
4734 4735 4736
	}
}

4737 4738 4739 4740 4741
static netdev_tx_t bond_start_xmit(struct sk_buff *skb, struct net_device *dev)
{
	struct bonding *bond = netdev_priv(dev);
	netdev_tx_t ret = NETDEV_TX_OK;

4742
	/* If we risk deadlock from transmitting this in the
4743 4744
	 * netpoll path, tell netpoll to queue the frame for later tx
	 */
4745
	if (unlikely(is_netpoll_tx_blocked(dev)))
4746 4747
		return NETDEV_TX_BUSY;

4748
	rcu_read_lock();
4749
	if (bond_has_slaves(bond))
4750 4751
		ret = __bond_start_xmit(skb, dev);
	else
4752
		ret = bond_tx_drop(dev, skb);
4753
	rcu_read_unlock();
4754 4755 4756

	return ret;
}
4757

4758 4759 4760 4761 4762 4763 4764 4765 4766 4767
/* Broadcast-mode speed accumulator: returns the smaller of the running
 * @speed and @slave's speed.  A running value of 0 or SPEED_UNKNOWN means
 * nothing has been recorded yet, in which case the slave's speed is taken
 * as is.
 */
static u32 bond_mode_bcast_speed(struct slave *slave, u32 speed)
{
	if (speed == 0 || speed == SPEED_UNKNOWN)
		return slave->speed;

	return min(speed, slave->speed);
}

4768 4769
static int bond_ethtool_get_link_ksettings(struct net_device *bond_dev,
					   struct ethtool_link_ksettings *cmd)
4770 4771
{
	struct bonding *bond = netdev_priv(bond_dev);
4772
	struct list_head *iter;
4773
	struct slave *slave;
4774
	u32 speed = 0;
4775

4776 4777
	cmd->base.duplex = DUPLEX_UNKNOWN;
	cmd->base.port = PORT_OTHER;
4778

4779
	/* Since bond_slave_can_tx returns false for all inactive or down slaves, we
4780 4781 4782 4783
	 * do not need to check mode.  Though link speed might not represent
	 * the true receive or transmit bandwidth (not all modes are symmetric)
	 * this is an accurate maximum.
	 */
4784
	bond_for_each_slave(bond, slave, iter) {
4785
		if (bond_slave_can_tx(slave)) {
4786 4787 4788 4789 4790 4791 4792
			if (slave->speed != SPEED_UNKNOWN) {
				if (BOND_MODE(bond) == BOND_MODE_BROADCAST)
					speed = bond_mode_bcast_speed(slave,
								      speed);
				else
					speed += slave->speed;
			}
4793
			if (cmd->base.duplex == DUPLEX_UNKNOWN &&
4794
			    slave->duplex != DUPLEX_UNKNOWN)
4795
				cmd->base.duplex = slave->duplex;
4796 4797
		}
	}
4798
	cmd->base.speed = speed ? : SPEED_UNKNOWN;
4799

4800 4801 4802
	return 0;
}

4803
static void bond_ethtool_get_drvinfo(struct net_device *bond_dev,
4804
				     struct ethtool_drvinfo *drvinfo)
4805
{
4806 4807 4808
	strlcpy(drvinfo->driver, DRV_NAME, sizeof(drvinfo->driver));
	snprintf(drvinfo->fw_version, sizeof(drvinfo->fw_version), "%d",
		 BOND_ABI_VERSION);
4809 4810
}

4811
static const struct ethtool_ops bond_ethtool_ops = {
4812
	.get_drvinfo		= bond_ethtool_get_drvinfo,
4813
	.get_link		= ethtool_op_get_link,
4814
	.get_link_ksettings	= bond_ethtool_get_link_ksettings,
4815 4816
};

4817
static const struct net_device_ops bond_netdev_ops = {
4818
	.ndo_init		= bond_init,
S
Stephen Hemminger 已提交
4819
	.ndo_uninit		= bond_uninit,
4820 4821
	.ndo_open		= bond_open,
	.ndo_stop		= bond_close,
4822
	.ndo_start_xmit		= bond_start_xmit,
4823
	.ndo_select_queue	= bond_select_queue,
4824
	.ndo_get_stats64	= bond_get_stats,
4825
	.ndo_do_ioctl		= bond_do_ioctl,
4826
	.ndo_change_rx_flags	= bond_change_rx_flags,
4827
	.ndo_set_rx_mode	= bond_set_rx_mode,
4828
	.ndo_change_mtu		= bond_change_mtu,
J
Jiri Pirko 已提交
4829
	.ndo_set_mac_address	= bond_set_mac_address,
4830
	.ndo_neigh_setup	= bond_neigh_setup,
J
Jiri Pirko 已提交
4831
	.ndo_vlan_rx_add_vid	= bond_vlan_rx_add_vid,
4832
	.ndo_vlan_rx_kill_vid	= bond_vlan_rx_kill_vid,
4833
#ifdef CONFIG_NET_POLL_CONTROLLER
4834
	.ndo_netpoll_setup	= bond_netpoll_setup,
4835 4836 4837
	.ndo_netpoll_cleanup	= bond_netpoll_cleanup,
	.ndo_poll_controller	= bond_poll_controller,
#endif
J
Jiri Pirko 已提交
4838 4839
	.ndo_add_slave		= bond_enslave,
	.ndo_del_slave		= bond_release,
4840
	.ndo_fix_features	= bond_fix_features,
4841
	.ndo_features_check	= passthru_features_check,
4842
	.ndo_get_xmit_slave	= bond_xmit_get_slave,
4843
	.ndo_sk_get_lower_dev	= bond_sk_get_lower_dev,
4844 4845
};

4846 4847 4848 4849
/* Device type attached to bond netdevs through SET_NETDEV_DEVTYPE() */
static const struct device_type bond_type = {
	.name = "bond",
};

4850 4851 4852
static void bond_destructor(struct net_device *bond_dev)
{
	struct bonding *bond = netdev_priv(bond_dev);
4853

4854 4855 4856 4857
	if (bond->wq)
		destroy_workqueue(bond->wq);
}

4858
void bond_setup(struct net_device *bond_dev)
L
Linus Torvalds 已提交
4859
{
4860
	struct bonding *bond = netdev_priv(bond_dev);
L
Linus Torvalds 已提交
4861

4862
	spin_lock_init(&bond->mode_lock);
4863
	bond->params = bonding_defaults;
L
Linus Torvalds 已提交
4864 4865 4866 4867 4868

	/* Initialize pointers */
	bond->dev = bond_dev;

	/* Initialize the device entry points */
4869
	ether_setup(bond_dev);
W
WANG Cong 已提交
4870
	bond_dev->max_mtu = ETH_MAX_MTU;
4871
	bond_dev->netdev_ops = &bond_netdev_ops;
4872
	bond_dev->ethtool_ops = &bond_ethtool_ops;
L
Linus Torvalds 已提交
4873

4874 4875
	bond_dev->needs_free_netdev = true;
	bond_dev->priv_destructor = bond_destructor;
L
Linus Torvalds 已提交
4876

4877 4878
	SET_NETDEV_DEVTYPE(bond_dev, &bond_type);

L
Linus Torvalds 已提交
4879
	/* Initialize the device options */
4880
	bond_dev->flags |= IFF_MASTER;
4881
	bond_dev->priv_flags |= IFF_BONDING | IFF_UNICAST_FLT | IFF_NO_QUEUE;
4882
	bond_dev->priv_flags &= ~(IFF_XMIT_DST_RELEASE | IFF_TX_SKB_SHARING);
4883

4884 4885
#ifdef CONFIG_XFRM_OFFLOAD
	/* set up xfrm device ops (only supported in active-backup right now) */
4886
	bond_dev->xfrmdev_ops = &bond_xfrmdev_ops;
4887 4888 4889
	bond->xs = NULL;
#endif /* CONFIG_XFRM_OFFLOAD */

4890
	/* don't acquire bond device's netif_tx_lock when transmitting */
L
Linus Torvalds 已提交
4891 4892 4893 4894 4895 4896 4897 4898 4899
	bond_dev->features |= NETIF_F_LLTX;

	/* By default, we declare the bond to be fully
	 * VLAN hardware accelerated capable. Special
	 * care is taken in the various xmit functions
	 * when there are slaves that are not hw accel
	 * capable
	 */

4900 4901 4902
	/* Don't allow bond devices to change network namespaces. */
	bond_dev->features |= NETIF_F_NETNS_LOCAL;

4903
	bond_dev->hw_features = BOND_VLAN_FEATURES |
4904 4905
				NETIF_F_HW_VLAN_CTAG_RX |
				NETIF_F_HW_VLAN_CTAG_FILTER;
4906

4907
	bond_dev->hw_features |= NETIF_F_GSO_ENCAP_ALL;
4908
	bond_dev->features |= bond_dev->hw_features;
4909
	bond_dev->features |= NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_STAG_TX;
4910
#ifdef CONFIG_XFRM_OFFLOAD
4911 4912 4913 4914
	bond_dev->hw_features |= BOND_XFRM_FEATURES;
	/* Only enable XFRM features if this is an active-backup config */
	if (BOND_MODE(bond) == BOND_MODE_ACTIVEBACKUP)
		bond_dev->features |= BOND_XFRM_FEATURES;
4915
#endif /* CONFIG_XFRM_OFFLOAD */
4916 4917 4918 4919
#if IS_ENABLED(CONFIG_TLS_DEVICE)
	if (bond_sk_check(bond))
		bond_dev->features |= BOND_TLS_FEATURES;
#endif
L
Linus Torvalds 已提交
4920 4921
}

4922 4923 4924
/* Destroy a bonding device.
 * Must be under rtnl_lock when this function is called.
 */
4925
static void bond_uninit(struct net_device *bond_dev)
J
Jay Vosburgh 已提交
4926
{
4927
	struct bonding *bond = netdev_priv(bond_dev);
M
Maor Gottlieb 已提交
4928
	struct bond_up_slave *usable, *all;
4929 4930
	struct list_head *iter;
	struct slave *slave;
J
Jay Vosburgh 已提交
4931

4932 4933
	bond_netpoll_cleanup(bond_dev);

4934
	/* Release the bonded slaves */
4935
	bond_for_each_slave(bond, slave, iter)
4936
		__bond_release_one(bond_dev, slave->dev, true, true);
4937
	netdev_info(bond_dev, "Released all slaves\n");
4938

M
Maor Gottlieb 已提交
4939 4940
	usable = rtnl_dereference(bond->usable_slaves);
	if (usable) {
4941
		RCU_INIT_POINTER(bond->usable_slaves, NULL);
M
Maor Gottlieb 已提交
4942 4943 4944 4945 4946 4947 4948
		kfree_rcu(usable, rcu);
	}

	all = rtnl_dereference(bond->all_slaves);
	if (all) {
		RCU_INIT_POINTER(bond->all_slaves, NULL);
		kfree_rcu(all, rcu);
4949 4950
	}

J
Jay Vosburgh 已提交
4951 4952
	list_del(&bond->bond_list);

4953
	bond_debug_unregister(bond);
J
Jay Vosburgh 已提交
4954 4955
}

L
Linus Torvalds 已提交
4956 4957 4958 4959
/*------------------------- Module initialization ---------------------------*/

static int bond_check_params(struct bond_params *params)
{
4960
	int arp_validate_value, fail_over_mac_value, primary_reselect_value, i;
4961 4962
	struct bond_opt_value newval;
	const struct bond_opt_value *valptr;
4963
	int arp_all_targets_value = 0;
4964
	u16 ad_actor_sys_prio = 0;
4965
	u16 ad_user_port_key = 0;
4966
	__be32 arp_target[BOND_MAX_ARP_TARGETS] = { 0 };
4967 4968 4969 4970
	int arp_ip_count;
	int bond_mode	= BOND_MODE_ROUNDROBIN;
	int xmit_hashtype = BOND_XMIT_POLICY_LAYER2;
	int lacp_fast = 0;
4971
	int tlb_dynamic_lb;
4972

4973
	/* Convert string parameters. */
L
Linus Torvalds 已提交
4974
	if (mode) {
4975 4976 4977 4978
		bond_opt_initstr(&newval, mode);
		valptr = bond_opt_parse(bond_opt_get(BOND_OPT_MODE), &newval);
		if (!valptr) {
			pr_err("Error: Invalid bonding mode \"%s\"\n", mode);
L
Linus Torvalds 已提交
4979 4980
			return -EINVAL;
		}
4981
		bond_mode = valptr->value;
L
Linus Torvalds 已提交
4982 4983
	}

4984
	if (xmit_hash_policy) {
4985 4986 4987
		if (bond_mode == BOND_MODE_ROUNDROBIN ||
		    bond_mode == BOND_MODE_ACTIVEBACKUP ||
		    bond_mode == BOND_MODE_BROADCAST) {
J
Joe Perches 已提交
4988
			pr_info("xmit_hash_policy param is irrelevant in mode %s\n",
J
Joe Perches 已提交
4989
				bond_mode_name(bond_mode));
4990
		} else {
4991 4992 4993 4994
			bond_opt_initstr(&newval, xmit_hash_policy);
			valptr = bond_opt_parse(bond_opt_get(BOND_OPT_XMIT_HASH),
						&newval);
			if (!valptr) {
J
Joe Perches 已提交
4995
				pr_err("Error: Invalid xmit_hash_policy \"%s\"\n",
4996 4997 4998
				       xmit_hash_policy);
				return -EINVAL;
			}
4999
			xmit_hashtype = valptr->value;
5000 5001 5002
		}
	}

L
Linus Torvalds 已提交
5003 5004
	if (lacp_rate) {
		if (bond_mode != BOND_MODE_8023AD) {
J
Joe Perches 已提交
5005 5006
			pr_info("lacp_rate param is irrelevant in mode %s\n",
				bond_mode_name(bond_mode));
L
Linus Torvalds 已提交
5007
		} else {
5008 5009 5010 5011
			bond_opt_initstr(&newval, lacp_rate);
			valptr = bond_opt_parse(bond_opt_get(BOND_OPT_LACP_RATE),
						&newval);
			if (!valptr) {
J
Joe Perches 已提交
5012
				pr_err("Error: Invalid lacp rate \"%s\"\n",
5013
				       lacp_rate);
L
Linus Torvalds 已提交
5014 5015
				return -EINVAL;
			}
5016
			lacp_fast = valptr->value;
L
Linus Torvalds 已提交
5017 5018 5019
		}
	}

5020
	if (ad_select) {
5021
		bond_opt_initstr(&newval, ad_select);
5022 5023 5024 5025
		valptr = bond_opt_parse(bond_opt_get(BOND_OPT_AD_SELECT),
					&newval);
		if (!valptr) {
			pr_err("Error: Invalid ad_select \"%s\"\n", ad_select);
5026 5027
			return -EINVAL;
		}
5028 5029
		params->ad_select = valptr->value;
		if (bond_mode != BOND_MODE_8023AD)
5030
			pr_warn("ad_select param only affects 802.3ad mode\n");
5031 5032 5033 5034
	} else {
		params->ad_select = BOND_AD_STABLE;
	}

5035
	if (max_bonds < 0) {
5036 5037
		pr_warn("Warning: max_bonds (%d) not in range %d-%d, so it was reset to BOND_DEFAULT_MAX_BONDS (%d)\n",
			max_bonds, 0, INT_MAX, BOND_DEFAULT_MAX_BONDS);
L
Linus Torvalds 已提交
5038 5039 5040 5041
		max_bonds = BOND_DEFAULT_MAX_BONDS;
	}

	if (miimon < 0) {
5042 5043
		pr_warn("Warning: miimon module parameter (%d), not in range 0-%d, so it was reset to 0\n",
			miimon, INT_MAX);
5044
		miimon = 0;
L
Linus Torvalds 已提交
5045 5046 5047
	}

	if (updelay < 0) {
5048 5049
		pr_warn("Warning: updelay module parameter (%d), not in range 0-%d, so it was reset to 0\n",
			updelay, INT_MAX);
L
Linus Torvalds 已提交
5050 5051 5052 5053
		updelay = 0;
	}

	if (downdelay < 0) {
5054 5055
		pr_warn("Warning: downdelay module parameter (%d), not in range 0-%d, so it was reset to 0\n",
			downdelay, INT_MAX);
L
Linus Torvalds 已提交
5056 5057 5058
		downdelay = 0;
	}

5059 5060
	if ((use_carrier != 0) && (use_carrier != 1)) {
		pr_warn("Warning: use_carrier module parameter (%d), not of valid value (0/1), so it was set to 1\n",
5061
			use_carrier);
L
Linus Torvalds 已提交
5062 5063 5064
		use_carrier = 1;
	}

5065
	if (num_peer_notif < 0 || num_peer_notif > 255) {
5066 5067
		pr_warn("Warning: num_grat_arp/num_unsol_na (%d) not in range 0-255 so it was reset to 1\n",
			num_peer_notif);
5068 5069 5070
		num_peer_notif = 1;
	}

5071
	/* reset values for 802.3ad/TLB/ALB */
5072
	if (!bond_mode_uses_arp(bond_mode)) {
L
Linus Torvalds 已提交
5073
		if (!miimon) {
5074 5075
			pr_warn("Warning: miimon must be specified, otherwise bonding will not detect link failure, speed and duplex which are essential for 802.3ad operation\n");
			pr_warn("Forcing miimon to 100msec\n");
5076
			miimon = BOND_DEFAULT_MIIMON;
L
Linus Torvalds 已提交
5077 5078 5079
		}
	}

5080
	if (tx_queues < 1 || tx_queues > 255) {
5081 5082
		pr_warn("Warning: tx_queues (%d) should be between 1 and 255, resetting to %d\n",
			tx_queues, BOND_DEFAULT_TX_QUEUES);
5083 5084 5085
		tx_queues = BOND_DEFAULT_TX_QUEUES;
	}

5086
	if ((all_slaves_active != 0) && (all_slaves_active != 1)) {
5087 5088
		pr_warn("Warning: all_slaves_active module parameter (%d), not of valid value (0/1), so it was set to 0\n",
			all_slaves_active);
5089 5090 5091
		all_slaves_active = 0;
	}

5092
	if (resend_igmp < 0 || resend_igmp > 255) {
5093 5094
		pr_warn("Warning: resend_igmp (%d) should be between 0 and 255, resetting to %d\n",
			resend_igmp, BOND_DEFAULT_RESEND_IGMP);
5095 5096 5097
		resend_igmp = BOND_DEFAULT_RESEND_IGMP;
	}

5098 5099
	bond_opt_initval(&newval, packets_per_slave);
	if (!bond_opt_parse(bond_opt_get(BOND_OPT_PACKETS_PER_SLAVE), &newval)) {
5100 5101 5102 5103 5104
		pr_warn("Warning: packets_per_slave (%d) should be between 0 and %u resetting to 1\n",
			packets_per_slave, USHRT_MAX);
		packets_per_slave = 1;
	}

L
Linus Torvalds 已提交
5105
	if (bond_mode == BOND_MODE_ALB) {
J
Joe Perches 已提交
5106 5107
		pr_notice("In ALB mode you might experience client disconnections upon reconnection of a link if the bonding module updelay parameter (%d msec) is incompatible with the forwarding delay time of the switch\n",
			  updelay);
L
Linus Torvalds 已提交
5108 5109 5110 5111 5112 5113 5114
	}

	if (!miimon) {
		if (updelay || downdelay) {
			/* just warn the user the up/down delay will have
			 * no effect since miimon is zero...
			 */
5115 5116
			pr_warn("Warning: miimon module parameter not set and updelay (%d) or downdelay (%d) module parameter is set; updelay and downdelay have no effect unless miimon is set\n",
				updelay, downdelay);
L
Linus Torvalds 已提交
5117 5118 5119 5120
		}
	} else {
		/* don't allow arp monitoring */
		if (arp_interval) {
5121 5122
			pr_warn("Warning: miimon (%d) and arp_interval (%d) can't be used simultaneously, disabling ARP monitoring\n",
				miimon, arp_interval);
L
Linus Torvalds 已提交
5123 5124 5125 5126
			arp_interval = 0;
		}

		if ((updelay % miimon) != 0) {
5127 5128
			pr_warn("Warning: updelay (%d) is not a multiple of miimon (%d), updelay rounded to %d ms\n",
				updelay, miimon, (updelay / miimon) * miimon);
L
Linus Torvalds 已提交
5129 5130 5131 5132 5133
		}

		updelay /= miimon;

		if ((downdelay % miimon) != 0) {
5134 5135 5136
			pr_warn("Warning: downdelay (%d) is not a multiple of miimon (%d), downdelay rounded to %d ms\n",
				downdelay, miimon,
				(downdelay / miimon) * miimon);
L
Linus Torvalds 已提交
5137 5138 5139 5140 5141 5142
		}

		downdelay /= miimon;
	}

	if (arp_interval < 0) {
5143 5144
		pr_warn("Warning: arp_interval module parameter (%d), not in range 0-%d, so it was reset to 0\n",
			arp_interval, INT_MAX);
5145
		arp_interval = 0;
L
Linus Torvalds 已提交
5146 5147
	}

5148 5149
	for (arp_ip_count = 0, i = 0;
	     (arp_ip_count < BOND_MAX_ARP_TARGETS) && arp_ip_target[i]; i++) {
5150
		__be32 ip;
5151 5152

		/* not a complete check, but good enough to catch mistakes */
5153
		if (!in4_pton(arp_ip_target[i], -1, (u8 *)&ip, -1, NULL) ||
5154
		    !bond_is_ip_target_ok(ip)) {
5155 5156
			pr_warn("Warning: bad arp_ip_target module parameter (%s), ARP monitoring will not be performed\n",
				arp_ip_target[i]);
L
Linus Torvalds 已提交
5157 5158
			arp_interval = 0;
		} else {
5159 5160 5161
			if (bond_get_targets_ip(arp_target, ip) == -1)
				arp_target[arp_ip_count++] = ip;
			else
5162 5163
				pr_warn("Warning: duplicate address %pI4 in arp_ip_target, skipping\n",
					&ip);
L
Linus Torvalds 已提交
5164 5165 5166 5167 5168
		}
	}

	if (arp_interval && !arp_ip_count) {
		/* don't allow arping if no arp_ip_target given... */
5169 5170
		pr_warn("Warning: arp_interval module parameter (%d) specified without providing an arp_ip_target parameter, arp_interval was reset to 0\n",
			arp_interval);
L
Linus Torvalds 已提交
5171 5172 5173
		arp_interval = 0;
	}

5174 5175
	if (arp_validate) {
		if (!arp_interval) {
J
Joe Perches 已提交
5176
			pr_err("arp_validate requires arp_interval\n");
5177 5178 5179
			return -EINVAL;
		}

5180 5181 5182 5183
		bond_opt_initstr(&newval, arp_validate);
		valptr = bond_opt_parse(bond_opt_get(BOND_OPT_ARP_VALIDATE),
					&newval);
		if (!valptr) {
J
Joe Perches 已提交
5184
			pr_err("Error: invalid arp_validate \"%s\"\n",
5185
			       arp_validate);
5186 5187
			return -EINVAL;
		}
5188 5189
		arp_validate_value = valptr->value;
	} else {
5190
		arp_validate_value = 0;
5191
	}
5192

5193
	if (arp_all_targets) {
5194 5195 5196 5197
		bond_opt_initstr(&newval, arp_all_targets);
		valptr = bond_opt_parse(bond_opt_get(BOND_OPT_ARP_ALL_TARGETS),
					&newval);
		if (!valptr) {
5198 5199 5200
			pr_err("Error: invalid arp_all_targets_value \"%s\"\n",
			       arp_all_targets);
			arp_all_targets_value = 0;
5201 5202
		} else {
			arp_all_targets_value = valptr->value;
5203 5204 5205
		}
	}

L
Linus Torvalds 已提交
5206
	if (miimon) {
J
Joe Perches 已提交
5207
		pr_info("MII link monitoring set to %d ms\n", miimon);
L
Linus Torvalds 已提交
5208
	} else if (arp_interval) {
5209 5210
		valptr = bond_opt_get_val(BOND_OPT_ARP_VALIDATE,
					  arp_validate_value);
J
Joe Perches 已提交
5211
		pr_info("ARP monitoring set to %d ms, validate %s, with %d target(s):",
5212
			arp_interval, valptr->string, arp_ip_count);
L
Linus Torvalds 已提交
5213 5214

		for (i = 0; i < arp_ip_count; i++)
J
Joe Perches 已提交
5215
			pr_cont(" %s", arp_ip_target[i]);
L
Linus Torvalds 已提交
5216

J
Joe Perches 已提交
5217
		pr_cont("\n");
L
Linus Torvalds 已提交
5218

5219
	} else if (max_bonds) {
L
Linus Torvalds 已提交
5220 5221 5222
		/* miimon and arp_interval not set, we need one so things
		 * work as expected, see bonding.txt for details
		 */
J
Joe Perches 已提交
5223
		pr_debug("Warning: either miimon or arp_interval and arp_ip_target module parameters must be specified, otherwise bonding will not detect link failures! see bonding.txt for details\n");
L
Linus Torvalds 已提交
5224 5225
	}

5226
	if (primary && !bond_mode_uses_primary(bond_mode)) {
L
Linus Torvalds 已提交
5227 5228 5229
		/* currently, using a primary only makes sense
		 * in active backup, TLB or ALB modes
		 */
5230 5231
		pr_warn("Warning: %s primary device specified but has no effect in %s mode\n",
			primary, bond_mode_name(bond_mode));
L
Linus Torvalds 已提交
5232 5233 5234
		primary = NULL;
	}

5235
	if (primary && primary_reselect) {
5236 5237 5238 5239
		bond_opt_initstr(&newval, primary_reselect);
		valptr = bond_opt_parse(bond_opt_get(BOND_OPT_PRIMARY_RESELECT),
					&newval);
		if (!valptr) {
J
Joe Perches 已提交
5240
			pr_err("Error: Invalid primary_reselect \"%s\"\n",
5241
			       primary_reselect);
5242 5243
			return -EINVAL;
		}
5244
		primary_reselect_value = valptr->value;
5245 5246 5247 5248
	} else {
		primary_reselect_value = BOND_PRI_RESELECT_ALWAYS;
	}

5249
	if (fail_over_mac) {
5250 5251 5252 5253
		bond_opt_initstr(&newval, fail_over_mac);
		valptr = bond_opt_parse(bond_opt_get(BOND_OPT_FAIL_OVER_MAC),
					&newval);
		if (!valptr) {
J
Joe Perches 已提交
5254
			pr_err("Error: invalid fail_over_mac \"%s\"\n",
5255
			       fail_over_mac);
5256 5257
			return -EINVAL;
		}
5258
		fail_over_mac_value = valptr->value;
5259
		if (bond_mode != BOND_MODE_ACTIVEBACKUP)
5260
			pr_warn("Warning: fail_over_mac only affects active-backup mode\n");
5261 5262 5263
	} else {
		fail_over_mac_value = BOND_FOM_NONE;
	}
5264

5265 5266 5267 5268 5269 5270 5271 5272 5273 5274
	bond_opt_initstr(&newval, "default");
	valptr = bond_opt_parse(
			bond_opt_get(BOND_OPT_AD_ACTOR_SYS_PRIO),
				     &newval);
	if (!valptr) {
		pr_err("Error: No ad_actor_sys_prio default value");
		return -EINVAL;
	}
	ad_actor_sys_prio = valptr->value;

5275 5276 5277 5278 5279 5280 5281 5282
	valptr = bond_opt_parse(bond_opt_get(BOND_OPT_AD_USER_PORT_KEY),
				&newval);
	if (!valptr) {
		pr_err("Error: No ad_user_port_key default value");
		return -EINVAL;
	}
	ad_user_port_key = valptr->value;

5283 5284 5285 5286 5287
	bond_opt_initstr(&newval, "default");
	valptr = bond_opt_parse(bond_opt_get(BOND_OPT_TLB_DYNAMIC_LB), &newval);
	if (!valptr) {
		pr_err("Error: No tlb_dynamic_lb default value");
		return -EINVAL;
5288
	}
5289
	tlb_dynamic_lb = valptr->value;
5290

5291
	if (lp_interval == 0) {
5292 5293
		pr_warn("Warning: ip_interval must be between 1 and %d, so it was reset to %d\n",
			INT_MAX, BOND_ALB_DEFAULT_LP_INTERVAL);
5294 5295 5296
		lp_interval = BOND_ALB_DEFAULT_LP_INTERVAL;
	}

L
Linus Torvalds 已提交
5297 5298
	/* fill params struct with the proper values */
	params->mode = bond_mode;
5299
	params->xmit_policy = xmit_hashtype;
L
Linus Torvalds 已提交
5300
	params->miimon = miimon;
5301
	params->num_peer_notif = num_peer_notif;
L
Linus Torvalds 已提交
5302
	params->arp_interval = arp_interval;
5303
	params->arp_validate = arp_validate_value;
5304
	params->arp_all_targets = arp_all_targets_value;
L
Linus Torvalds 已提交
5305 5306
	params->updelay = updelay;
	params->downdelay = downdelay;
5307
	params->peer_notif_delay = 0;
L
Linus Torvalds 已提交
5308 5309 5310
	params->use_carrier = use_carrier;
	params->lacp_fast = lacp_fast;
	params->primary[0] = 0;
5311
	params->primary_reselect = primary_reselect_value;
5312
	params->fail_over_mac = fail_over_mac_value;
5313
	params->tx_queues = tx_queues;
5314
	params->all_slaves_active = all_slaves_active;
5315
	params->resend_igmp = resend_igmp;
5316
	params->min_links = min_links;
5317
	params->lp_interval = lp_interval;
5318
	params->packets_per_slave = packets_per_slave;
5319
	params->tlb_dynamic_lb = tlb_dynamic_lb;
5320
	params->ad_actor_sys_prio = ad_actor_sys_prio;
5321
	eth_zero_addr(params->ad_actor_system);
5322
	params->ad_user_port_key = ad_user_port_key;
5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333
	if (packets_per_slave > 0) {
		params->reciprocal_packets_per_slave =
			reciprocal_value(packets_per_slave);
	} else {
		/* reciprocal_packets_per_slave is unused if
		 * packets_per_slave is 0 or 1, just initialize it
		 */
		params->reciprocal_packets_per_slave =
			(struct reciprocal_value) { 0 };
	}

L
Linus Torvalds 已提交
5334 5335 5336 5337 5338 5339 5340 5341 5342 5343
	if (primary) {
		strncpy(params->primary, primary, IFNAMSIZ);
		params->primary[IFNAMSIZ - 1] = 0;
	}

	memcpy(params->arp_targets, arp_target, sizeof(arp_target));

	return 0;
}

5344
/* Called from registration process */
5345 5346 5347
static int bond_init(struct net_device *bond_dev)
{
	struct bonding *bond = netdev_priv(bond_dev);
5348
	struct bond_net *bn = net_generic(dev_net(bond_dev), bond_net_id);
5349

5350
	netdev_dbg(bond_dev, "Begin bond_init\n");
5351

5352
	bond->wq = alloc_ordered_workqueue(bond_dev->name, WQ_MEM_RECLAIM);
5353 5354 5355
	if (!bond->wq)
		return -ENOMEM;

5356
	spin_lock_init(&bond->stats_lock);
5357
	netdev_lockdep_set_classes(bond_dev);
5358

5359
	list_add_tail(&bond->bond_list, &bn->dev_list);
5360

5361
	bond_prepare_sysfs_group(bond);
5362

5363 5364
	bond_debug_register(bond);

5365 5366
	/* Ensure valid dev_addr */
	if (is_zero_ether_addr(bond_dev->dev_addr) &&
5367
	    bond_dev->addr_assign_type == NET_ADDR_PERM)
5368 5369
		eth_hw_addr_random(bond_dev);

5370 5371 5372
	return 0;
}

5373
unsigned int bond_get_num_tx_queues(void)
5374
{
5375
	return tx_queues;
5376 5377
}

5378
/* Create a new bond based on the specified name and bonding parameters.
5379
 * If name is NULL, obtain a suitable "bond%d" name for us.
5380 5381 5382
 * Caller must NOT hold rtnl_lock; we need to release it here before we
 * set up our sysfs entries.
 */
5383
int bond_create(struct net *net, const char *name)
5384 5385
{
	struct net_device *bond_dev;
5386 5387
	struct bonding *bond;
	struct alb_bond_info *bond_info;
5388 5389 5390
	int res;

	rtnl_lock();
5391

5392
	bond_dev = alloc_netdev_mq(sizeof(struct bonding),
5393
				   name ? name : "bond%d", NET_NAME_UNKNOWN,
5394
				   bond_setup, tx_queues);
5395
	if (!bond_dev) {
J
Joe Perches 已提交
5396
		pr_err("%s: eek! can't alloc netdev!\n", name);
5397 5398
		rtnl_unlock();
		return -ENOMEM;
5399 5400
	}

5401 5402 5403 5404 5405 5406 5407 5408
	/*
	 * Initialize rx_hashtbl_used_head to RLB_NULL_INDEX.
	 * It is set to 0 by default which is wrong.
	 */
	bond = netdev_priv(bond_dev);
	bond_info = &(BOND_ALB_INFO(bond));
	bond_info->rx_hashtbl_used_head = RLB_NULL_INDEX;

5409
	dev_net_set(bond_dev, net);
5410 5411
	bond_dev->rtnl_link_ops = &bond_link_ops;

5412
	res = register_netdevice(bond_dev);
5413 5414 5415 5416 5417 5418
	if (res < 0) {
		free_netdev(bond_dev);
		rtnl_unlock();

		return res;
	}
5419

5420 5421
	netif_carrier_off(bond_dev);

5422 5423
	bond_work_init_all(bond);

5424
	rtnl_unlock();
5425
	return 0;
5426 5427
}

5428
/* Per-namespace init: set up the bonding state of @net (back pointer and
 * empty device list) and create its proc directory and sysfs entry.
 * Always returns 0.
 */
static int __net_init bond_net_init(struct net *net)
{
	struct bond_net *bn;

	bn = net_generic(net, bond_net_id);
	bn->net = net;
	INIT_LIST_HEAD(&bn->dev_list);

	bond_create_proc_dir(bn);
	bond_create_sysfs(bn);

	return 0;
}

5441
/* Per-namespace exit: remove the sysfs entry, unregister every bond still
 * on the namespace's device list, then remove the proc directory.
 */
static void __net_exit bond_net_exit(struct net *net)
{
	struct bond_net *bn = net_generic(net, bond_net_id);
	struct bonding *cur, *next;
	LIST_HEAD(kill_list);

	bond_destroy_sysfs(bn);

	/* Kill off any bonds created after unregistering bond rtnl ops */
	rtnl_lock();
	list_for_each_entry_safe(cur, next, &bn->dev_list, bond_list)
		unregister_netdevice_queue(cur->dev, &kill_list);
	unregister_netdevice_many(&kill_list);
	rtnl_unlock();

	bond_destroy_proc_dir(bn);
}

static struct pernet_operations bond_net_ops = {
	.init = bond_net_init,
	.exit = bond_net_exit,
5462 5463
	.id   = &bond_net_id,
	.size = sizeof(struct bond_net),
5464 5465
};

L
Linus Torvalds 已提交
5466 5467 5468 5469 5470
/* Module init: validate the module parameters, register the per-namespace
 * operations and the netlink interface, create debugfs, create max_bonds
 * bond devices in init_net, initialise the bonding flow dissector and
 * finally register the netdevice notifier.
 *
 * Returns 0 on success or the first error encountered; on failure every
 * step that has already been performed is undone.
 */
static int __init bonding_init(void)
{
	int i;
	int res;

	res = bond_check_params(&bonding_defaults);
	if (res)
		goto out;

	res = register_pernet_subsys(&bond_net_ops);
	if (res)
		goto out;

	res = bond_netlink_init();
	if (res)
		goto err_link;

	bond_create_debugfs();

	for (i = 0; i < max_bonds; i++) {
		res = bond_create(&init_net, NULL);
		if (res)
			goto err;
	}

	skb_flow_dissector_init(&flow_keys_bonding,
				flow_keys_bonding_keys,
				ARRAY_SIZE(flow_keys_bonding_keys));

	/* A failed notifier registration must not be ignored: unwind
	 * through the same path as a bond_create() failure.
	 */
	res = register_netdevice_notifier(&bond_netdev_notifier);
	if (res)
		goto err;
out:
	return res;
err:
	bond_destroy_debugfs();
	bond_netlink_fini();
err_link:
	unregister_pernet_subsys(&bond_net_ops);
	goto out;
}

/* Module exit: unregister the netdevice notifier, then remove debugfs,
 * the netlink interface and the per-namespace operations.
 */
static void __exit bonding_exit(void)
{
	unregister_netdevice_notifier(&bond_netdev_notifier);

	bond_destroy_debugfs();

	bond_netlink_fini();
	unregister_pernet_subsys(&bond_net_ops);

#ifdef CONFIG_NET_POLL_CONTROLLER
	/* Make sure we don't have an imbalance on our netpoll blocking */
	WARN_ON(atomic_read(&netpoll_block_tx));
#endif
}

/* Module entry/exit points and metadata */
module_init(bonding_init);
module_exit(bonding_exit);
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION(DRV_DESCRIPTION);
MODULE_AUTHOR("Thomas Davis, tadavis@lbl.gov and many others");