bond_main.c 133.0 KB
Newer Older
L
Linus Torvalds 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41
/*
 * originally based on the dummy device.
 *
 * Copyright 1999, Thomas Davis, tadavis@lbl.gov.
 * Licensed under the GPL. Based on dummy.c, and eql.c devices.
 *
 * bonding.c: an Ethernet Bonding driver
 *
 * This is useful to talk to a Cisco EtherChannel compatible equipment:
 *	Cisco 5500
 *	Sun Trunking (Solaris)
 *	Alteon AceDirector Trunks
 *	Linux Bonding
 *	and probably many L2 switches ...
 *
 * How it works:
 *    ifconfig bond0 ipaddress netmask up
 *      will setup a network device, with an ip address.  No mac address
 *	will be assigned at this time.  The hw mac address will come from
 *	the first slave bonded to the channel.  All slaves will then use
 *	this hw mac address.
 *
 *    ifconfig bond0 down
 *         will release all slaves, marking them as down.
 *
 *    ifenslave bond0 eth0
 *	will attach eth0 to bond0 as a slave.  eth0 hw mac address will either
 *	a: be used as initial mac address
 *	b: if a hw mac address already is there, eth0's hw mac address
 *	   will then be set from bond0.
 *
 */

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/interrupt.h>
#include <linux/ptrace.h>
#include <linux/ioport.h>
#include <linux/in.h>
42
#include <net/ip.h>
L
Linus Torvalds 已提交
43
#include <linux/ip.h>
44 45
#include <linux/tcp.h>
#include <linux/udp.h>
L
Linus Torvalds 已提交
46 47 48 49 50 51 52 53
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/init.h>
#include <linux/timer.h>
#include <linux/socket.h>
#include <linux/ctype.h>
#include <linux/inet.h>
#include <linux/bitops.h>
S
Stephen Hemminger 已提交
54
#include <linux/io.h>
L
Linus Torvalds 已提交
55
#include <asm/dma.h>
S
Stephen Hemminger 已提交
56
#include <linux/uaccess.h>
L
Linus Torvalds 已提交
57 58 59
#include <linux/errno.h>
#include <linux/netdevice.h>
#include <linux/inetdevice.h>
60
#include <linux/igmp.h>
L
Linus Torvalds 已提交
61 62 63 64 65 66 67 68 69 70 71
#include <linux/etherdevice.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <linux/rtnetlink.h>
#include <linux/smp.h>
#include <linux/if_ether.h>
#include <net/arp.h>
#include <linux/mii.h>
#include <linux/ethtool.h>
#include <linux/if_vlan.h>
#include <linux/if_bonding.h>
D
David Sterba 已提交
72
#include <linux/jiffies.h>
73
#include <linux/preempt.h>
J
Jay Vosburgh 已提交
74
#include <net/route.h>
75
#include <net/net_namespace.h>
76
#include <net/netns/generic.h>
77
#include <net/pkt_sched.h>
78
#include <linux/rculist.h>
79
#include <net/flow_dissector.h>
80
#include <net/switchdev.h>
81 82 83
#include <net/bonding.h>
#include <net/bond_3ad.h>
#include <net/bond_alb.h>
L
Linus Torvalds 已提交
84

85 86
#include "bonding_priv.h"

L
Linus Torvalds 已提交
87 88 89 90 91
/*---------------------------- Module parameters ----------------------------*/

/* monitor all links that often (in milliseconds). <=0 disables monitoring */

static int max_bonds	= BOND_DEFAULT_MAX_BONDS;
92
static int tx_queues	= BOND_DEFAULT_TX_QUEUES;
93
static int num_peer_notif = 1;
94
static int miimon;
S
Stephen Hemminger 已提交
95 96
static int updelay;
static int downdelay;
L
Linus Torvalds 已提交
97
static int use_carrier	= 1;
S
Stephen Hemminger 已提交
98 99
static char *mode;
static char *primary;
100
static char *primary_reselect;
S
Stephen Hemminger 已提交
101
static char *lacp_rate;
102
static int min_links;
S
Stephen Hemminger 已提交
103 104
static char *ad_select;
static char *xmit_hash_policy;
105
static int arp_interval;
S
Stephen Hemminger 已提交
106 107
static char *arp_ip_target[BOND_MAX_ARP_TARGETS];
static char *arp_validate;
108
static char *arp_all_targets;
S
Stephen Hemminger 已提交
109
static char *fail_over_mac;
110
static int all_slaves_active;
111
static struct bond_params bonding_defaults;
112
static int resend_igmp = BOND_DEFAULT_RESEND_IGMP;
113
static int packets_per_slave = 1;
114
static int lp_interval = BOND_ALB_DEFAULT_LP_INTERVAL;
L
Linus Torvalds 已提交
115 116 117

module_param(max_bonds, int, 0);
MODULE_PARM_DESC(max_bonds, "Max number of bonded devices");
118 119
module_param(tx_queues, int, 0);
MODULE_PARM_DESC(tx_queues, "Max number of transmit queues (default = 16)");
120
module_param_named(num_grat_arp, num_peer_notif, int, 0644);
121 122
MODULE_PARM_DESC(num_grat_arp, "Number of peer notifications to send on "
			       "failover event (alias of num_unsol_na)");
123
module_param_named(num_unsol_na, num_peer_notif, int, 0644);
124 125
MODULE_PARM_DESC(num_unsol_na, "Number of peer notifications to send on "
			       "failover event (alias of num_grat_arp)");
L
Linus Torvalds 已提交
126 127 128 129 130
module_param(miimon, int, 0);
MODULE_PARM_DESC(miimon, "Link check interval in milliseconds");
module_param(updelay, int, 0);
MODULE_PARM_DESC(updelay, "Delay before considering link up, in milliseconds");
module_param(downdelay, int, 0);
131 132
MODULE_PARM_DESC(downdelay, "Delay before considering link down, "
			    "in milliseconds");
L
Linus Torvalds 已提交
133
module_param(use_carrier, int, 0);
134 135
MODULE_PARM_DESC(use_carrier, "Use netif_carrier_ok (vs MII ioctls) in miimon; "
			      "0 for off, 1 for on (default)");
L
Linus Torvalds 已提交
136
module_param(mode, charp, 0);
137
MODULE_PARM_DESC(mode, "Mode of operation; 0 for balance-rr, "
138 139 140
		       "1 for active-backup, 2 for balance-xor, "
		       "3 for broadcast, 4 for 802.3ad, 5 for balance-tlb, "
		       "6 for balance-alb");
L
Linus Torvalds 已提交
141 142
module_param(primary, charp, 0);
MODULE_PARM_DESC(primary, "Primary network device to use");
143 144 145 146 147 148 149 150
module_param(primary_reselect, charp, 0);
MODULE_PARM_DESC(primary_reselect, "Reselect primary slave "
				   "once it comes up; "
				   "0 for always (default), "
				   "1 for only if speed of primary is "
				   "better, "
				   "2 for only on active slave "
				   "failure");
L
Linus Torvalds 已提交
151
module_param(lacp_rate, charp, 0);
152 153
MODULE_PARM_DESC(lacp_rate, "LACPDU tx rate to request from 802.3ad partner; "
			    "0 for slow, 1 for fast");
154
module_param(ad_select, charp, 0);
Z
Zhu Yanjun 已提交
155
MODULE_PARM_DESC(ad_select, "802.3ad aggregation selection logic; "
156 157
			    "0 for stable (default), 1 for bandwidth, "
			    "2 for count");
158 159 160
module_param(min_links, int, 0);
MODULE_PARM_DESC(min_links, "Minimum number of available links before turning on carrier");

161
module_param(xmit_hash_policy, charp, 0);
162 163
MODULE_PARM_DESC(xmit_hash_policy, "balance-xor and 802.3ad hashing method; "
				   "0 for layer 2 (default), 1 for layer 3+4, "
164 165
				   "2 for layer 2+3, 3 for encap layer 2+3, "
				   "4 for encap layer 3+4");
L
Linus Torvalds 已提交
166 167 168 169
module_param(arp_interval, int, 0);
MODULE_PARM_DESC(arp_interval, "arp interval in milliseconds");
module_param_array(arp_ip_target, charp, NULL, 0);
MODULE_PARM_DESC(arp_ip_target, "arp targets in n.n.n.n form");
170
module_param(arp_validate, charp, 0);
171 172 173
MODULE_PARM_DESC(arp_validate, "validate src/dst of ARP probes; "
			       "0 for none (default), 1 for active, "
			       "2 for backup, 3 for all");
174 175
module_param(arp_all_targets, charp, 0);
MODULE_PARM_DESC(arp_all_targets, "fail on any/all arp targets timeout; 0 for any (default), 1 for all");
176
module_param(fail_over_mac, charp, 0);
177 178 179
MODULE_PARM_DESC(fail_over_mac, "For active-backup, do not set all slaves to "
				"the same MAC; 0 for none (default), "
				"1 for active, 2 for follow");
180
module_param(all_slaves_active, int, 0);
181
MODULE_PARM_DESC(all_slaves_active, "Keep all frames received on an interface "
182
				     "by setting active flag for all slaves; "
183
				     "0 for never (default), 1 for always.");
184
module_param(resend_igmp, int, 0);
185 186
MODULE_PARM_DESC(resend_igmp, "Number of IGMP membership reports to send on "
			      "link failure");
187 188 189 190
module_param(packets_per_slave, int, 0);
MODULE_PARM_DESC(packets_per_slave, "Packets to send per slave in balance-rr "
				    "mode; 0 for a random slave, 1 packet per "
				    "slave (default), >1 packets per slave.");
191 192 193 194
module_param(lp_interval, uint, 0);
MODULE_PARM_DESC(lp_interval, "The number of seconds between instances where "
			      "the bonding driver sends learning packets to "
			      "each slaves peer switch. The default is 1.");
L
Linus Torvalds 已提交
195 196 197

/*----------------------------- Global variables ----------------------------*/

198
#ifdef CONFIG_NET_POLL_CONTROLLER
199
atomic_t netpoll_block_tx = ATOMIC_INIT(0);
200 201
#endif

202
unsigned int bond_net_id __read_mostly;
L
Linus Torvalds 已提交
203 204 205

/*-------------------------- Forward declarations ---------------------------*/

206
static int bond_init(struct net_device *bond_dev);
207
static void bond_uninit(struct net_device *bond_dev);
208 209
static void bond_get_stats(struct net_device *bond_dev,
			   struct rtnl_link_stats64 *stats);
210
static void bond_slave_arr_handler(struct work_struct *work);
211 212
static bool bond_time_in_interval(struct bonding *bond, unsigned long last_act,
				  int mod);
L
Linus Torvalds 已提交
213 214 215

/*---------------------------- General routines -----------------------------*/

216
const char *bond_mode_name(int mode)
L
Linus Torvalds 已提交
217
{
218 219 220 221 222
	static const char *names[] = {
		[BOND_MODE_ROUNDROBIN] = "load balancing (round-robin)",
		[BOND_MODE_ACTIVEBACKUP] = "fault-tolerance (active-backup)",
		[BOND_MODE_XOR] = "load balancing (xor)",
		[BOND_MODE_BROADCAST] = "fault-tolerance (broadcast)",
S
Stephen Hemminger 已提交
223
		[BOND_MODE_8023AD] = "IEEE 802.3ad Dynamic link aggregation",
224 225 226 227
		[BOND_MODE_TLB] = "transmit load balancing",
		[BOND_MODE_ALB] = "adaptive load balancing",
	};

228
	if (mode < BOND_MODE_ROUNDROBIN || mode > BOND_MODE_ALB)
L
Linus Torvalds 已提交
229
		return "unknown";
230 231

	return names[mode];
L
Linus Torvalds 已提交
232 233 234 235 236 237
}

/*---------------------------------- VLAN -----------------------------------*/

/**
 * bond_dev_queue_xmit - Prepare skb for xmit.
S
Stephen Hemminger 已提交
238
 *
L
Linus Torvalds 已提交
239 240 241 242
 * @bond: bond device that got this skb for tx.
 * @skb: hw accel VLAN tagged skb to transmit
 * @slave_dev: slave that is supposed to xmit this skbuff
 */
243
void bond_dev_queue_xmit(struct bonding *bond, struct sk_buff *skb,
S
Stephen Hemminger 已提交
244
			struct net_device *slave_dev)
L
Linus Torvalds 已提交
245
{
246
	skb->dev = slave_dev;
247

248
	BUILD_BUG_ON(sizeof(skb->queue_mapping) !=
249 250
		     sizeof(qdisc_skb_cb(skb)->slave_dev_queue_mapping));
	skb->queue_mapping = qdisc_skb_cb(skb)->slave_dev_queue_mapping;
251

252
	if (unlikely(netpoll_tx_running(bond->dev)))
253
		bond_netpoll_send_skb(bond_get_slave_by_dev(bond, slave_dev), skb);
254
	else
255
		dev_queue_xmit(skb);
L
Linus Torvalds 已提交
256 257
}

258
/* In the following 2 functions, bond_vlan_rx_add_vid and bond_vlan_rx_kill_vid,
J
Jiri Pirko 已提交
259
 * We don't protect the slave list iteration with a lock because:
L
Linus Torvalds 已提交
260 261 262 263
 * a. This operation is performed in IOCTL context,
 * b. The operation is protected by the RTNL semaphore in the 8021q code,
 * c. Holding a lock with BH disabled while directly calling a base driver
 *    entry point is generally a BAD idea.
S
Stephen Hemminger 已提交
264
 *
L
Linus Torvalds 已提交
265 266 267 268 269 270 271 272 273 274 275 276 277 278
 * The design of synchronization/protection for this operation in the 8021q
 * module is good for one or more VLAN devices over a single physical device
 * and cannot be extended for a teaming solution like bonding, so there is a
 * potential race condition here where a net device from the vlan group might
 * be referenced (either by a base driver or the 8021q code) while it is being
 * removed from the system. However, it turns out we're not making matters
 * worse, and if it works for regular VLAN usage it will work here too.
*/

/**
 * bond_vlan_rx_add_vid - Propagates adding an id to slaves
 * @bond_dev: bonding net device that got called
 * @vid: vlan id being added
 */
279 280
static int bond_vlan_rx_add_vid(struct net_device *bond_dev,
				__be16 proto, u16 vid)
L
Linus Torvalds 已提交
281
{
282
	struct bonding *bond = netdev_priv(bond_dev);
283
	struct slave *slave, *rollback_slave;
284
	struct list_head *iter;
285
	int res;
L
Linus Torvalds 已提交
286

287
	bond_for_each_slave(bond, slave, iter) {
288
		res = vlan_vid_add(slave->dev, proto, vid);
289 290
		if (res)
			goto unwind;
L
Linus Torvalds 已提交
291 292
	}

293
	return 0;
294 295

unwind:
296
	/* unwind to the slave that failed */
297
	bond_for_each_slave(bond, rollback_slave, iter) {
298 299 300 301 302
		if (rollback_slave == slave)
			break;

		vlan_vid_del(rollback_slave->dev, proto, vid);
	}
303 304

	return res;
L
Linus Torvalds 已提交
305 306 307 308 309 310 311
}

/**
 * bond_vlan_rx_kill_vid - Propagates deleting an id to slaves
 * @bond_dev: bonding net device that got called
 * @vid: vlan id being removed
 */
312 313
static int bond_vlan_rx_kill_vid(struct net_device *bond_dev,
				 __be16 proto, u16 vid)
L
Linus Torvalds 已提交
314
{
315
	struct bonding *bond = netdev_priv(bond_dev);
316
	struct list_head *iter;
L
Linus Torvalds 已提交
317 318
	struct slave *slave;

319
	bond_for_each_slave(bond, slave, iter)
320
		vlan_vid_del(slave->dev, proto, vid);
L
Linus Torvalds 已提交
321

322 323
	if (bond_is_lb(bond))
		bond_alb_clear_vlan(bond, vid);
324 325

	return 0;
L
Linus Torvalds 已提交
326 327 328 329
}

/*------------------------------- Link status -------------------------------*/

330
/* Set the carrier state for the master according to the state of its
331 332 333 334 335
 * slaves.  If any slaves are up, the master is up.  In 802.3ad mode,
 * do special 802.3ad magic.
 *
 * Returns zero if carrier state does not change, nonzero if it does.
 */
336
int bond_set_carrier(struct bonding *bond)
337
{
338
	struct list_head *iter;
339 340
	struct slave *slave;

341
	if (!bond_has_slaves(bond))
342 343
		goto down;

344
	if (BOND_MODE(bond) == BOND_MODE_8023AD)
345 346
		return bond_3ad_set_carrier(bond);

347
	bond_for_each_slave(bond, slave, iter) {
348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364
		if (slave->link == BOND_LINK_UP) {
			if (!netif_carrier_ok(bond->dev)) {
				netif_carrier_on(bond->dev);
				return 1;
			}
			return 0;
		}
	}

down:
	if (netif_carrier_ok(bond->dev)) {
		netif_carrier_off(bond->dev);
		return 1;
	}
	return 0;
}

365
/* Get link speed and duplex from the slave's base driver
L
Linus Torvalds 已提交
366
 * using ethtool. If for some reason the call fails or the
367
 * values are invalid, set speed and duplex to -1,
368 369
 * and return. Return 1 if speed or duplex settings are
 * UNKNOWN; 0 otherwise.
L
Linus Torvalds 已提交
370
 */
371
static int bond_update_speed_duplex(struct slave *slave)
L
Linus Torvalds 已提交
372 373
{
	struct net_device *slave_dev = slave->dev;
374
	struct ethtool_link_ksettings ecmd;
375
	int res;
L
Linus Torvalds 已提交
376

377 378
	slave->speed = SPEED_UNKNOWN;
	slave->duplex = DUPLEX_UNKNOWN;
L
Linus Torvalds 已提交
379

380
	res = __ethtool_get_link_ksettings(slave_dev, &ecmd);
381 382 383 384 385 386 387 388
	if (res < 0) {
		slave->link = BOND_LINK_DOWN;
		return 1;
	}
	if (ecmd.base.speed == 0 || ecmd.base.speed == ((__u32)-1)) {
		slave->link = BOND_LINK_DOWN;
		return 1;
	}
389
	switch (ecmd.base.duplex) {
L
Linus Torvalds 已提交
390 391 392 393
	case DUPLEX_FULL:
	case DUPLEX_HALF:
		break;
	default:
394 395
		slave->link = BOND_LINK_DOWN;
		return 1;
L
Linus Torvalds 已提交
396 397
	}

398 399
	slave->speed = ecmd.base.speed;
	slave->duplex = ecmd.base.duplex;
L
Linus Torvalds 已提交
400

401
	return 0;
L
Linus Torvalds 已提交
402 403
}

404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419
const char *bond_slave_link_status(s8 link)
{
	switch (link) {
	case BOND_LINK_UP:
		return "up";
	case BOND_LINK_FAIL:
		return "going down";
	case BOND_LINK_DOWN:
		return "down";
	case BOND_LINK_BACK:
		return "going back";
	default:
		return "unknown";
	}
}

420
/* if <dev> supports MII link status reporting, check its link status.
L
Linus Torvalds 已提交
421 422
 *
 * We either do MII/ETHTOOL ioctls, or check netif_carrier_ok(),
S
Stephen Hemminger 已提交
423
 * depending upon the setting of the use_carrier parameter.
L
Linus Torvalds 已提交
424 425 426 427 428 429 430 431 432 433 434
 *
 * Return either BMSR_LSTATUS, meaning that the link is up (or we
 * can't tell and just pretend it is), or 0, meaning that the link is
 * down.
 *
 * If reporting is non-zero, instead of faking link up, return -1 if
 * both ETHTOOL and MII ioctls fail (meaning the device does not
 * support them).  If use_carrier is set, return whatever it says.
 * It'd be nice if there was a good way to tell if a driver supports
 * netif_carrier, but there really isn't.
 */
S
Stephen Hemminger 已提交
435 436
static int bond_check_dev_link(struct bonding *bond,
			       struct net_device *slave_dev, int reporting)
L
Linus Torvalds 已提交
437
{
438
	const struct net_device_ops *slave_ops = slave_dev->netdev_ops;
439
	int (*ioctl)(struct net_device *, struct ifreq *, int);
L
Linus Torvalds 已提交
440 441 442
	struct ifreq ifr;
	struct mii_ioctl_data *mii;

443 444 445
	if (!reporting && !netif_running(slave_dev))
		return 0;

446
	if (bond->params.use_carrier)
L
Linus Torvalds 已提交
447 448
		return netif_carrier_ok(slave_dev) ? BMSR_LSTATUS : 0;

449
	/* Try to get link status using Ethtool first. */
450 451 452
	if (slave_dev->ethtool_ops->get_link)
		return slave_dev->ethtool_ops->get_link(slave_dev) ?
			BMSR_LSTATUS : 0;
453

S
Stephen Hemminger 已提交
454
	/* Ethtool can't be used, fallback to MII ioctls. */
455
	ioctl = slave_ops->ndo_do_ioctl;
L
Linus Torvalds 已提交
456
	if (ioctl) {
457 458 459 460 461 462 463 464
		/* TODO: set pointer to correct ioctl on a per team member
		 *       bases to make this more efficient. that is, once
		 *       we determine the correct ioctl, we will always
		 *       call it and not the others for that team
		 *       member.
		 */

		/* We cannot assume that SIOCGMIIPHY will also read a
L
Linus Torvalds 已提交
465 466 467 468 469 470 471
		 * register; not all network drivers (e.g., e100)
		 * support that.
		 */

		/* Yes, the mii is overlaid on the ifreq.ifr_ifru */
		strncpy(ifr.ifr_name, slave_dev->name, IFNAMSIZ);
		mii = if_mii(&ifr);
A
Al Viro 已提交
472
		if (ioctl(slave_dev, &ifr, SIOCGMIIPHY) == 0) {
L
Linus Torvalds 已提交
473
			mii->reg_num = MII_BMSR;
A
Al Viro 已提交
474
			if (ioctl(slave_dev, &ifr, SIOCGMIIREG) == 0)
S
Stephen Hemminger 已提交
475
				return mii->val_out & BMSR_LSTATUS;
L
Linus Torvalds 已提交
476 477 478
		}
	}

479
	/* If reporting, report that either there's no dev->do_ioctl,
480
	 * or both SIOCGMIIREG and get_link failed (meaning that we
L
Linus Torvalds 已提交
481 482 483
	 * cannot report link status).  If not reporting, pretend
	 * we're ok.
	 */
S
Stephen Hemminger 已提交
484
	return reporting ? -1 : BMSR_LSTATUS;
L
Linus Torvalds 已提交
485 486 487 488
}

/*----------------------------- Multicast list ------------------------------*/

489
/* Push the promiscuity flag down to appropriate slaves */
490
static int bond_set_promiscuity(struct bonding *bond, int inc)
L
Linus Torvalds 已提交
491
{
492
	struct list_head *iter;
493
	int err = 0;
494

495
	if (bond_uses_primary(bond)) {
496
		struct slave *curr_active = rtnl_dereference(bond->curr_active_slave);
497 498 499

		if (curr_active)
			err = dev_set_promiscuity(curr_active->dev, inc);
L
Linus Torvalds 已提交
500 501
	} else {
		struct slave *slave;
502

503
		bond_for_each_slave(bond, slave, iter) {
504 505 506
			err = dev_set_promiscuity(slave->dev, inc);
			if (err)
				return err;
L
Linus Torvalds 已提交
507 508
		}
	}
509
	return err;
L
Linus Torvalds 已提交
510 511
}

512
/* Push the allmulti flag down to all slaves */
513
static int bond_set_allmulti(struct bonding *bond, int inc)
L
Linus Torvalds 已提交
514
{
515
	struct list_head *iter;
516
	int err = 0;
517

518
	if (bond_uses_primary(bond)) {
519
		struct slave *curr_active = rtnl_dereference(bond->curr_active_slave);
520 521 522

		if (curr_active)
			err = dev_set_allmulti(curr_active->dev, inc);
L
Linus Torvalds 已提交
523 524
	} else {
		struct slave *slave;
525

526
		bond_for_each_slave(bond, slave, iter) {
527 528 529
			err = dev_set_allmulti(slave->dev, inc);
			if (err)
				return err;
L
Linus Torvalds 已提交
530 531
		}
	}
532
	return err;
L
Linus Torvalds 已提交
533 534
}

535
/* Retrieve the list of registered multicast addresses for the bonding
536 537 538
 * device and retransmit an IGMP JOIN request to the current active
 * slave.
 */
539
static void bond_resend_igmp_join_requests_delayed(struct work_struct *work)
540
{
541 542 543
	struct bonding *bond = container_of(work, struct bonding,
					    mcast_work.work);

544
	if (!rtnl_trylock()) {
545
		queue_delayed_work(bond->wq, &bond->mcast_work, 1);
546
		return;
547
	}
548
	call_netdevice_notifiers(NETDEV_RESEND_IGMP, bond->dev);
549

550 551
	if (bond->igmp_retrans > 1) {
		bond->igmp_retrans--;
552
		queue_delayed_work(bond->wq, &bond->mcast_work, HZ/5);
553
	}
554
	rtnl_unlock();
555 556
}

557
/* Flush bond's hardware addresses from slave */
558
static void bond_hw_addr_flush(struct net_device *bond_dev,
S
Stephen Hemminger 已提交
559
			       struct net_device *slave_dev)
L
Linus Torvalds 已提交
560
{
561
	struct bonding *bond = netdev_priv(bond_dev);
L
Linus Torvalds 已提交
562

563 564
	dev_uc_unsync(slave_dev, bond_dev);
	dev_mc_unsync(slave_dev, bond_dev);
L
Linus Torvalds 已提交
565

566
	if (BOND_MODE(bond) == BOND_MODE_8023AD) {
L
Linus Torvalds 已提交
567 568 569
		/* del lacpdu mc addr from mc list */
		u8 lacpdu_multicast[ETH_ALEN] = MULTICAST_LACPDU_ADDR;

570
		dev_mc_del(slave_dev, lacpdu_multicast);
L
Linus Torvalds 已提交
571 572 573 574 575
	}
}

/*--------------------------- Active slave change ---------------------------*/

576
/* Update the hardware address list and promisc/allmulti for the new and
577 578
 * old active slaves (if any).  Modes that are not using primary keep all
 * slaves up date at all times; only the modes that use primary need to call
579
 * this function to swap these settings during a failover.
L
Linus Torvalds 已提交
580
 */
581 582
static void bond_hw_addr_swap(struct bonding *bond, struct slave *new_active,
			      struct slave *old_active)
L
Linus Torvalds 已提交
583 584
{
	if (old_active) {
S
Stephen Hemminger 已提交
585
		if (bond->dev->flags & IFF_PROMISC)
L
Linus Torvalds 已提交
586 587
			dev_set_promiscuity(old_active->dev, -1);

S
Stephen Hemminger 已提交
588
		if (bond->dev->flags & IFF_ALLMULTI)
L
Linus Torvalds 已提交
589 590
			dev_set_allmulti(old_active->dev, -1);

591
		bond_hw_addr_flush(bond->dev, old_active->dev);
L
Linus Torvalds 已提交
592 593 594
	}

	if (new_active) {
595
		/* FIXME: Signal errors upstream. */
S
Stephen Hemminger 已提交
596
		if (bond->dev->flags & IFF_PROMISC)
L
Linus Torvalds 已提交
597 598
			dev_set_promiscuity(new_active->dev, 1);

S
Stephen Hemminger 已提交
599
		if (bond->dev->flags & IFF_ALLMULTI)
L
Linus Torvalds 已提交
600 601
			dev_set_allmulti(new_active->dev, 1);

602
		netif_addr_lock_bh(bond->dev);
603 604
		dev_uc_sync(new_active->dev, bond->dev);
		dev_mc_sync(new_active->dev, bond->dev);
605
		netif_addr_unlock_bh(bond->dev);
L
Linus Torvalds 已提交
606 607 608
	}
}

609 610 611 612 613 614 615 616 617 618
/**
 * bond_set_dev_addr - clone slave's address to bond
 * @bond_dev: bond net device
 * @slave_dev: slave net device
 *
 * Should be called with RTNL held.
 */
static void bond_set_dev_addr(struct net_device *bond_dev,
			      struct net_device *slave_dev)
{
619 620
	netdev_dbg(bond_dev, "bond_dev=%p slave_dev=%p slave_dev->name=%s slave_dev->addr_len=%d\n",
		   bond_dev, slave_dev, slave_dev->name, slave_dev->addr_len);
621 622 623 624 625
	memcpy(bond_dev->dev_addr, slave_dev->dev_addr, slave_dev->addr_len);
	bond_dev->addr_assign_type = NET_ADDR_STOLEN;
	call_netdevice_notifiers(NETDEV_CHANGEADDR, bond_dev);
}

626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642
static struct slave *bond_get_old_active(struct bonding *bond,
					 struct slave *new_active)
{
	struct slave *slave;
	struct list_head *iter;

	bond_for_each_slave(bond, slave, iter) {
		if (slave == new_active)
			continue;

		if (ether_addr_equal(bond->dev->dev_addr, slave->dev->dev_addr))
			return slave;
	}

	return NULL;
}

643
/* bond_do_fail_over_mac
644 645 646
 *
 * Perform special MAC address swapping for fail_over_mac settings
 *
647
 * Called with RTNL
648 649 650 651 652 653 654 655 656 657 658
 */
static void bond_do_fail_over_mac(struct bonding *bond,
				  struct slave *new_active,
				  struct slave *old_active)
{
	u8 tmp_mac[ETH_ALEN];
	struct sockaddr saddr;
	int rv;

	switch (bond->params.fail_over_mac) {
	case BOND_FOM_ACTIVE:
659
		if (new_active)
660
			bond_set_dev_addr(bond->dev, new_active->dev);
661 662
		break;
	case BOND_FOM_FOLLOW:
663
		/* if new_active && old_active, swap them
664 665 666 667 668 669
		 * if just old_active, do nothing (going to no active slave)
		 * if just new_active, set new_active to bond's MAC
		 */
		if (!new_active)
			return;

670 671 672
		if (!old_active)
			old_active = bond_get_old_active(bond, new_active);

673
		if (old_active) {
674
			ether_addr_copy(tmp_mac, new_active->dev->dev_addr);
675 676
			ether_addr_copy(saddr.sa_data,
					old_active->dev->dev_addr);
677 678
			saddr.sa_family = new_active->dev->type;
		} else {
679
			ether_addr_copy(saddr.sa_data, bond->dev->dev_addr);
680 681 682 683 684
			saddr.sa_family = bond->dev->type;
		}

		rv = dev_set_mac_address(new_active->dev, &saddr);
		if (rv) {
685 686
			netdev_err(bond->dev, "Error %d setting MAC of slave %s\n",
				   -rv, new_active->dev->name);
687 688 689 690 691 692
			goto out;
		}

		if (!old_active)
			goto out;

693
		ether_addr_copy(saddr.sa_data, tmp_mac);
694 695 696 697
		saddr.sa_family = old_active->dev->type;

		rv = dev_set_mac_address(old_active->dev, &saddr);
		if (rv)
698 699
			netdev_err(bond->dev, "Error %d setting MAC of slave %s\n",
				   -rv, new_active->dev->name);
700 701 702
out:
		break;
	default:
703 704
		netdev_err(bond->dev, "bond_do_fail_over_mac impossible: bad policy %d\n",
			   bond->params.fail_over_mac);
705 706 707 708 709
		break;
	}

}

710
static struct slave *bond_choose_primary_or_current(struct bonding *bond)
711
{
712
	struct slave *prim = rtnl_dereference(bond->primary_slave);
713
	struct slave *curr = rtnl_dereference(bond->curr_active_slave);
714

715 716 717 718 719 720
	if (!prim || prim->link != BOND_LINK_UP) {
		if (!curr || curr->link != BOND_LINK_UP)
			return NULL;
		return curr;
	}

721 722
	if (bond->force_primary) {
		bond->force_primary = false;
723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744
		return prim;
	}

	if (!curr || curr->link != BOND_LINK_UP)
		return prim;

	/* At this point, prim and curr are both up */
	switch (bond->params.primary_reselect) {
	case BOND_PRI_RESELECT_ALWAYS:
		return prim;
	case BOND_PRI_RESELECT_BETTER:
		if (prim->speed < curr->speed)
			return curr;
		if (prim->speed == curr->speed && prim->duplex <= curr->duplex)
			return curr;
		return prim;
	case BOND_PRI_RESELECT_FAILURE:
		return curr;
	default:
		netdev_err(bond->dev, "impossible primary_reselect %d\n",
			   bond->params.primary_reselect);
		return curr;
745 746
	}
}
747

L
Linus Torvalds 已提交
748
/**
749
 * bond_find_best_slave - select the best available slave to be the active one
L
Linus Torvalds 已提交
750 751 752 753
 * @bond: our bonding struct
 */
static struct slave *bond_find_best_slave(struct bonding *bond)
{
754
	struct slave *slave, *bestslave = NULL;
755
	struct list_head *iter;
L
Linus Torvalds 已提交
756 757
	int mintime = bond->params.updelay;

758 759 760
	slave = bond_choose_primary_or_current(bond);
	if (slave)
		return slave;
L
Linus Torvalds 已提交
761

762 763 764
	bond_for_each_slave(bond, slave, iter) {
		if (slave->link == BOND_LINK_UP)
			return slave;
765
		if (slave->link == BOND_LINK_BACK && bond_slave_is_up(slave) &&
766 767 768
		    slave->delay < mintime) {
			mintime = slave->delay;
			bestslave = slave;
L
Linus Torvalds 已提交
769 770 771 772 773 774
		}
	}

	return bestslave;
}

775 776
static bool bond_should_notify_peers(struct bonding *bond)
{
777 778 779 780 781
	struct slave *slave;

	rcu_read_lock();
	slave = rcu_dereference(bond->curr_active_slave);
	rcu_read_unlock();
782

783 784
	netdev_dbg(bond->dev, "bond_should_notify_peers: slave %s\n",
		   slave ? slave->dev->name : "NULL");
785 786

	if (!slave || !bond->send_peer_notif ||
787
	    !netif_carrier_ok(bond->dev) ||
788 789 790 791 792 793
	    test_bit(__LINK_STATE_LINKWATCH_PENDING, &slave->dev->state))
		return false;

	return true;
}

L
Linus Torvalds 已提交
794 795 796 797 798 799 800 801 802 803 804 805 806
/**
 * change_active_interface - change the active slave into the specified one
 * @bond: our bonding struct
 * @new: the new slave to make the active one
 *
 * Set the new slave to the bond's settings and unset them on the old
 * curr_active_slave.
 * Setting include flags, mc-list, promiscuity, allmulti, etc.
 *
 * If @new's link state is %BOND_LINK_BACK we'll set it to %BOND_LINK_UP,
 * because it is apparently the best available slave we have, even though its
 * updelay hasn't timed out yet.
 *
807
 * Caller must hold RTNL.
L
Linus Torvalds 已提交
808
 */
809
void bond_change_active_slave(struct bonding *bond, struct slave *new_active)
L
Linus Torvalds 已提交
810
{
811 812
	struct slave *old_active;

813 814 815
	ASSERT_RTNL();

	old_active = rtnl_dereference(bond->curr_active_slave);
L
Linus Torvalds 已提交
816

S
Stephen Hemminger 已提交
817
	if (old_active == new_active)
L
Linus Torvalds 已提交
818 819 820
		return;

	if (new_active) {
821
		new_active->last_link_up = jiffies;
822

L
Linus Torvalds 已提交
823
		if (new_active->link == BOND_LINK_BACK) {
824
			if (bond_uses_primary(bond)) {
825 826 827
				netdev_info(bond->dev, "making interface %s the new active one %d ms earlier\n",
					    new_active->dev->name,
					    (bond->params.updelay - new_active->delay) * bond->params.miimon);
L
Linus Torvalds 已提交
828 829 830
			}

			new_active->delay = 0;
831 832
			bond_set_slave_link_state(new_active, BOND_LINK_UP,
						  BOND_SLAVE_NOTIFY_NOW);
L
Linus Torvalds 已提交
833

834
			if (BOND_MODE(bond) == BOND_MODE_8023AD)
L
Linus Torvalds 已提交
835 836
				bond_3ad_handle_link_change(new_active, BOND_LINK_UP);

837
			if (bond_is_lb(bond))
L
Linus Torvalds 已提交
838 839
				bond_alb_handle_link_change(bond, new_active, BOND_LINK_UP);
		} else {
840
			if (bond_uses_primary(bond)) {
841 842
				netdev_info(bond->dev, "making interface %s the new active one\n",
					    new_active->dev->name);
L
Linus Torvalds 已提交
843 844 845 846
			}
		}
	}

847
	if (bond_uses_primary(bond))
848
		bond_hw_addr_swap(bond, new_active, old_active);
L
Linus Torvalds 已提交
849

850
	if (bond_is_lb(bond)) {
L
Linus Torvalds 已提交
851
		bond_alb_handle_active_change(bond, new_active);
852
		if (old_active)
853 854
			bond_set_slave_inactive_flags(old_active,
						      BOND_SLAVE_NOTIFY_NOW);
855
		if (new_active)
856 857
			bond_set_slave_active_flags(new_active,
						    BOND_SLAVE_NOTIFY_NOW);
L
Linus Torvalds 已提交
858
	} else {
859
		rcu_assign_pointer(bond->curr_active_slave, new_active);
L
Linus Torvalds 已提交
860
	}
J
Jay Vosburgh 已提交
861

862
	if (BOND_MODE(bond) == BOND_MODE_ACTIVEBACKUP) {
S
Stephen Hemminger 已提交
863
		if (old_active)
864 865
			bond_set_slave_inactive_flags(old_active,
						      BOND_SLAVE_NOTIFY_NOW);
J
Jay Vosburgh 已提交
866 867

		if (new_active) {
868 869
			bool should_notify_peers = false;

870 871
			bond_set_slave_active_flags(new_active,
						    BOND_SLAVE_NOTIFY_NOW);
872

873 874 875
			if (bond->params.fail_over_mac)
				bond_do_fail_over_mac(bond, new_active,
						      old_active);
876

877 878 879 880 881 882 883
			if (netif_running(bond->dev)) {
				bond->send_peer_notif =
					bond->params.num_peer_notif;
				should_notify_peers =
					bond_should_notify_peers(bond);
			}

884
			call_netdevice_notifiers(NETDEV_BONDING_FAILOVER, bond->dev);
885
			if (should_notify_peers)
886 887
				call_netdevice_notifiers(NETDEV_NOTIFY_PEERS,
							 bond->dev);
888
		}
J
Jay Vosburgh 已提交
889
	}
890

891
	/* resend IGMP joins since active slave has changed or
892 893
	 * all were sent on curr_active_slave.
	 * resend only if bond is brought up with the affected
894 895
	 * bonding modes and the retransmission is enabled
	 */
896
	if (netif_running(bond->dev) && (bond->params.resend_igmp > 0) &&
897
	    ((bond_uses_primary(bond) && new_active) ||
898
	     BOND_MODE(bond) == BOND_MODE_ROUNDROBIN)) {
899
		bond->igmp_retrans = bond->params.resend_igmp;
900
		queue_delayed_work(bond->wq, &bond->mcast_work, 1);
901
	}
L
Linus Torvalds 已提交
902 903 904 905 906 907
}

/**
 * bond_select_active_slave - select a new active slave, if needed
 * @bond: our bonding struct
 *
S
Stephen Hemminger 已提交
908
 * This functions should be called when one of the following occurs:
L
Linus Torvalds 已提交
909 910 911 912
 * - The old curr_active_slave has been released or lost its link.
 * - The primary_slave has got its link back.
 * - A slave has got its link back and there's no old curr_active_slave.
 *
913
 * Caller must hold RTNL.
L
Linus Torvalds 已提交
914
 */
915
void bond_select_active_slave(struct bonding *bond)
L
Linus Torvalds 已提交
916 917
{
	struct slave *best_slave;
918
	int rv;
L
Linus Torvalds 已提交
919

920 921
	ASSERT_RTNL();

L
Linus Torvalds 已提交
922
	best_slave = bond_find_best_slave(bond);
923
	if (best_slave != rtnl_dereference(bond->curr_active_slave)) {
L
Linus Torvalds 已提交
924
		bond_change_active_slave(bond, best_slave);
925 926 927 928
		rv = bond_set_carrier(bond);
		if (!rv)
			return;

Z
Zhang Shengju 已提交
929
		if (netif_carrier_ok(bond->dev))
930
			netdev_info(bond->dev, "first active interface up!\n");
Z
Zhang Shengju 已提交
931
		else
932
			netdev_info(bond->dev, "now running without any active interface!\n");
L
Linus Torvalds 已提交
933 934 935
	}
}

936
#ifdef CONFIG_NET_POLL_CONTROLLER
937
static inline int slave_enable_netpoll(struct slave *slave)
938
{
939 940
	struct netpoll *np;
	int err = 0;
941

942
	np = kzalloc(sizeof(*np), GFP_KERNEL);
943 944 945 946
	err = -ENOMEM;
	if (!np)
		goto out;

947
	err = __netpoll_setup(np, slave->dev);
948 949 950
	if (err) {
		kfree(np);
		goto out;
951
	}
952 953 954 955 956 957 958 959 960 961 962 963
	slave->np = np;
out:
	return err;
}
static inline void slave_disable_netpoll(struct slave *slave)
{
	struct netpoll *np = slave->np;

	if (!np)
		return;

	slave->np = NULL;
964
	__netpoll_free_async(np);
965
}
966 967 968

static void bond_poll_controller(struct net_device *bond_dev)
{
969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999
	struct bonding *bond = netdev_priv(bond_dev);
	struct slave *slave = NULL;
	struct list_head *iter;
	struct ad_info ad_info;
	struct netpoll_info *ni;
	const struct net_device_ops *ops;

	if (BOND_MODE(bond) == BOND_MODE_8023AD)
		if (bond_3ad_get_active_agg_info(bond, &ad_info))
			return;

	bond_for_each_slave_rcu(bond, slave, iter) {
		ops = slave->dev->netdev_ops;
		if (!bond_slave_is_up(slave) || !ops->ndo_poll_controller)
			continue;

		if (BOND_MODE(bond) == BOND_MODE_8023AD) {
			struct aggregator *agg =
			    SLAVE_AD_INFO(slave)->port.aggregator;

			if (agg &&
			    agg->aggregator_identifier != ad_info.aggregator_id)
				continue;
		}

		ni = rcu_dereference_bh(slave->dev->npinfo);
		if (down_trylock(&ni->dev_lock))
			continue;
		ops->ndo_poll_controller(slave->dev);
		up(&ni->dev_lock);
	}
1000 1001
}

1002
static void bond_netpoll_cleanup(struct net_device *bond_dev)
1003
{
1004
	struct bonding *bond = netdev_priv(bond_dev);
1005
	struct list_head *iter;
1006 1007
	struct slave *slave;

1008
	bond_for_each_slave(bond, slave, iter)
1009
		if (bond_slave_is_up(slave))
1010
			slave_disable_netpoll(slave);
1011
}
1012

1013
static int bond_netpoll_setup(struct net_device *dev, struct netpoll_info *ni)
1014 1015
{
	struct bonding *bond = netdev_priv(dev);
1016
	struct list_head *iter;
1017
	struct slave *slave;
1018
	int err = 0;
1019

1020
	bond_for_each_slave(bond, slave, iter) {
1021 1022
		err = slave_enable_netpoll(slave);
		if (err) {
1023
			bond_netpoll_cleanup(dev);
1024
			break;
1025 1026
		}
	}
1027
	return err;
1028
}
1029 1030 1031 1032 1033 1034 1035 1036
#else
static inline int slave_enable_netpoll(struct slave *slave)
{
	return 0;
}
static inline void slave_disable_netpoll(struct slave *slave)
{
}
1037 1038 1039 1040 1041
static void bond_netpoll_cleanup(struct net_device *bond_dev)
{
}
#endif

L
Linus Torvalds 已提交
1042 1043
/*---------------------------------- IOCTL ----------------------------------*/

1044
static netdev_features_t bond_fix_features(struct net_device *dev,
1045
					   netdev_features_t features)
1046
{
1047
	struct bonding *bond = netdev_priv(dev);
1048
	struct list_head *iter;
1049
	netdev_features_t mask;
1050
	struct slave *slave;
1051

1052
	mask = features;
1053

1054
	features &= ~NETIF_F_ONE_FOR_ALL;
1055
	features |= NETIF_F_ALL_FOR_ALL;
1056

1057
	bond_for_each_slave(bond, slave, iter) {
1058 1059
		features = netdev_increment_features(features,
						     slave->dev->features,
1060 1061
						     mask);
	}
1062
	features = netdev_add_tso_features(features, mask);
1063 1064 1065 1066

	return features;
}

1067
#define BOND_VLAN_FEATURES	(NETIF_F_HW_CSUM | NETIF_F_SG | \
1068 1069
				 NETIF_F_FRAGLIST | NETIF_F_ALL_TSO | \
				 NETIF_F_HIGHDMA | NETIF_F_LRO)
1070

1071 1072
#define BOND_ENC_FEATURES	(NETIF_F_HW_CSUM | NETIF_F_SG | \
				 NETIF_F_RXCSUM | NETIF_F_ALL_TSO)
1073

1074 1075
static void bond_compute_features(struct bonding *bond)
{
1076 1077
	unsigned int dst_release_flag = IFF_XMIT_DST_RELEASE |
					IFF_XMIT_DST_RELEASE_PERM;
1078
	netdev_features_t vlan_features = BOND_VLAN_FEATURES;
1079
	netdev_features_t enc_features  = BOND_ENC_FEATURES;
1080 1081 1082
	struct net_device *bond_dev = bond->dev;
	struct list_head *iter;
	struct slave *slave;
1083
	unsigned short max_hard_header_len = ETH_HLEN;
1084 1085
	unsigned int gso_max_size = GSO_MAX_SIZE;
	u16 gso_max_segs = GSO_MAX_SEGS;
1086

1087
	if (!bond_has_slaves(bond))
1088
		goto done;
1089
	vlan_features &= NETIF_F_ALL_FOR_ALL;
1090

1091
	bond_for_each_slave(bond, slave, iter) {
1092
		vlan_features = netdev_increment_features(vlan_features,
1093 1094
			slave->dev->vlan_features, BOND_VLAN_FEATURES);

1095 1096 1097
		enc_features = netdev_increment_features(enc_features,
							 slave->dev->hw_enc_features,
							 BOND_ENC_FEATURES);
1098
		dst_release_flag &= slave->dev->priv_flags;
1099 1100
		if (slave->dev->hard_header_len > max_hard_header_len)
			max_hard_header_len = slave->dev->hard_header_len;
1101 1102 1103

		gso_max_size = min(gso_max_size, slave->dev->gso_max_size);
		gso_max_segs = min(gso_max_segs, slave->dev->gso_max_segs);
1104
	}
1105

1106
done:
1107
	bond_dev->vlan_features = vlan_features;
E
Eric Dumazet 已提交
1108
	bond_dev->hw_enc_features = enc_features | NETIF_F_GSO_ENCAP_ALL;
1109
	bond_dev->hard_header_len = max_hard_header_len;
1110 1111
	bond_dev->gso_max_segs = gso_max_segs;
	netif_set_gso_max_size(bond_dev, gso_max_size);
1112

1113 1114 1115 1116
	bond_dev->priv_flags &= ~IFF_XMIT_DST_RELEASE;
	if ((bond_dev->priv_flags & IFF_XMIT_DST_RELEASE_PERM) &&
	    dst_release_flag == (IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM))
		bond_dev->priv_flags |= IFF_XMIT_DST_RELEASE;
1117

1118
	netdev_change_features(bond_dev);
1119 1120
}

1121 1122 1123
static void bond_setup_by_slave(struct net_device *bond_dev,
				struct net_device *slave_dev)
{
1124
	bond_dev->header_ops	    = slave_dev->header_ops;
1125 1126 1127 1128 1129 1130 1131 1132 1133

	bond_dev->type		    = slave_dev->type;
	bond_dev->hard_header_len   = slave_dev->hard_header_len;
	bond_dev->addr_len	    = slave_dev->addr_len;

	memcpy(bond_dev->broadcast, slave_dev->broadcast,
		slave_dev->addr_len);
}

1134
/* On bonding slaves other than the currently active slave, suppress
1135
 * duplicates except for alb non-mcast/bcast.
1136 1137
 */
static bool bond_should_deliver_exact_match(struct sk_buff *skb,
1138 1139
					    struct slave *slave,
					    struct bonding *bond)
1140
{
1141
	if (bond_is_slave_inactive(slave)) {
1142
		if (BOND_MODE(bond) == BOND_MODE_ALB &&
1143 1144 1145 1146 1147 1148 1149 1150
		    skb->pkt_type != PACKET_BROADCAST &&
		    skb->pkt_type != PACKET_MULTICAST)
			return false;
		return true;
	}
	return false;
}

1151
static rx_handler_result_t bond_handle_frame(struct sk_buff **pskb)
1152
{
1153
	struct sk_buff *skb = *pskb;
1154
	struct slave *slave;
1155
	struct bonding *bond;
1156 1157
	int (*recv_probe)(const struct sk_buff *, struct bonding *,
			  struct slave *);
1158
	int ret = RX_HANDLER_ANOTHER;
1159

1160 1161 1162 1163 1164
	skb = skb_share_check(skb, GFP_ATOMIC);
	if (unlikely(!skb))
		return RX_HANDLER_CONSUMED;

	*pskb = skb;
1165

J
Jiri Pirko 已提交
1166 1167
	slave = bond_slave_get_rcu(skb->dev);
	bond = slave->bond;
1168

1169 1170
	recv_probe = ACCESS_ONCE(bond->recv_probe);
	if (recv_probe) {
1171 1172 1173 1174
		ret = recv_probe(skb, bond, slave);
		if (ret == RX_HANDLER_CONSUMED) {
			consume_skb(skb);
			return ret;
1175 1176 1177
		}
	}

Z
Zhang Shengju 已提交
1178
	if (bond_should_deliver_exact_match(skb, slave, bond))
1179
		return RX_HANDLER_EXACT;
1180

J
Jiri Pirko 已提交
1181
	skb->dev = bond->dev;
1182

1183
	if (BOND_MODE(bond) == BOND_MODE_ALB &&
J
Jiri Pirko 已提交
1184
	    bond->dev->priv_flags & IFF_BRIDGE_PORT &&
1185 1186
	    skb->pkt_type == PACKET_HOST) {

1187 1188 1189
		if (unlikely(skb_cow_head(skb,
					  skb->data - skb_mac_header(skb)))) {
			kfree_skb(skb);
1190
			return RX_HANDLER_CONSUMED;
1191
		}
1192
		ether_addr_copy(eth_hdr(skb)->h_dest, bond->dev->dev_addr);
1193 1194
	}

1195
	return ret;
1196 1197
}

1198
static enum netdev_lag_tx_type bond_lag_tx_type(struct bonding *bond)
1199
{
1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217
	switch (BOND_MODE(bond)) {
	case BOND_MODE_ROUNDROBIN:
		return NETDEV_LAG_TX_TYPE_ROUNDROBIN;
	case BOND_MODE_ACTIVEBACKUP:
		return NETDEV_LAG_TX_TYPE_ACTIVEBACKUP;
	case BOND_MODE_BROADCAST:
		return NETDEV_LAG_TX_TYPE_BROADCAST;
	case BOND_MODE_XOR:
	case BOND_MODE_8023AD:
		return NETDEV_LAG_TX_TYPE_HASH;
	default:
		return NETDEV_LAG_TX_TYPE_UNKNOWN;
	}
}

static int bond_master_upper_dev_link(struct bonding *bond, struct slave *slave)
{
	struct netdev_lag_upper_info lag_upper_info;
1218 1219
	int err;

1220 1221 1222
	lag_upper_info.tx_type = bond_lag_tx_type(bond);
	err = netdev_master_upper_dev_link(slave->dev, bond->dev, slave,
					   &lag_upper_info);
1223 1224
	if (err)
		return err;
1225
	rtmsg_ifinfo(RTM_NEWLINK, slave->dev, IFF_SLAVE, GFP_KERNEL);
1226 1227 1228
	return 0;
}

1229
static void bond_upper_dev_unlink(struct bonding *bond, struct slave *slave)
1230
{
1231 1232 1233
	netdev_upper_dev_unlink(slave->dev, bond->dev);
	slave->dev->flags &= ~IFF_SLAVE;
	rtmsg_ifinfo(RTM_NEWLINK, slave->dev, IFF_SLAVE, GFP_KERNEL);
1234 1235
}

1236 1237 1238 1239
static struct slave *bond_alloc_slave(struct bonding *bond)
{
	struct slave *slave = NULL;

Z
Zhang Shengju 已提交
1240
	slave = kzalloc(sizeof(*slave), GFP_KERNEL);
1241 1242 1243
	if (!slave)
		return NULL;

1244
	if (BOND_MODE(bond) == BOND_MODE_8023AD) {
1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258
		SLAVE_AD_INFO(slave) = kzalloc(sizeof(struct ad_slave_info),
					       GFP_KERNEL);
		if (!SLAVE_AD_INFO(slave)) {
			kfree(slave);
			return NULL;
		}
	}
	return slave;
}

static void bond_free_slave(struct slave *slave)
{
	struct bonding *bond = bond_get_bond_by_slave(slave);

1259
	if (BOND_MODE(bond) == BOND_MODE_8023AD)
1260 1261 1262 1263 1264
		kfree(SLAVE_AD_INFO(slave));

	kfree(slave);
}

1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279
static void bond_fill_ifbond(struct bonding *bond, struct ifbond *info)
{
	info->bond_mode = BOND_MODE(bond);
	info->miimon = bond->params.miimon;
	info->num_slaves = bond->slave_cnt;
}

static void bond_fill_ifslave(struct slave *slave, struct ifslave *info)
{
	strcpy(info->slave_name, slave->dev->name);
	info->link = slave->link;
	info->state = bond_slave_state(slave);
	info->link_failure_count = slave->link_failure_count;
}

1280 1281
static void bond_netdev_notify(struct net_device *dev,
			       struct netdev_bonding_info *info)
1282 1283
{
	rtnl_lock();
1284
	netdev_bonding_info_change(dev, info);
1285 1286 1287 1288 1289 1290 1291 1292
	rtnl_unlock();
}

static void bond_netdev_notify_work(struct work_struct *_work)
{
	struct netdev_notify_work *w =
		container_of(_work, struct netdev_notify_work, work.work);

1293
	bond_netdev_notify(w->dev, &w->bonding_info);
1294
	dev_put(w->dev);
1295
	kfree(w);
1296 1297 1298 1299
}

void bond_queue_slave_event(struct slave *slave)
{
1300
	struct bonding *bond = slave->bond;
1301 1302 1303 1304 1305
	struct netdev_notify_work *nnw = kzalloc(sizeof(*nnw), GFP_ATOMIC);

	if (!nnw)
		return;

1306
	dev_hold(slave->dev);
1307
	nnw->dev = slave->dev;
1308 1309 1310
	bond_fill_ifslave(slave, &nnw->bonding_info.slave);
	bond_fill_ifbond(bond, &nnw->bonding_info.master);
	INIT_DELAYED_WORK(&nnw->work, bond_netdev_notify_work);
1311

1312
	queue_delayed_work(slave->bond->wq, &nnw->work, 0);
1313 1314
}

1315 1316 1317 1318 1319 1320 1321 1322 1323 1324
void bond_lower_state_changed(struct slave *slave)
{
	struct netdev_lag_lower_state_info info;

	info.link_up = slave->link == BOND_LINK_UP ||
		       slave->link == BOND_LINK_FAIL;
	info.tx_enabled = bond_is_active_slave(slave);
	netdev_lower_state_changed(slave->dev, &info);
}

L
Linus Torvalds 已提交
1325
/* enslave device <slave> to bond device <master> */
1326
int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev)
L
Linus Torvalds 已提交
1327
{
1328
	struct bonding *bond = netdev_priv(bond_dev);
1329
	const struct net_device_ops *slave_ops = slave_dev->netdev_ops;
1330
	struct slave *new_slave = NULL, *prev_slave;
L
Linus Torvalds 已提交
1331 1332
	struct sockaddr addr;
	int link_reporting;
1333
	int res = 0, i;
L
Linus Torvalds 已提交
1334

1335 1336 1337
	if (!bond->params.use_carrier &&
	    slave_dev->ethtool_ops->get_link == NULL &&
	    slave_ops->ndo_do_ioctl == NULL) {
1338 1339
		netdev_warn(bond_dev, "no link monitoring support for %s\n",
			    slave_dev->name);
L
Linus Torvalds 已提交
1340 1341
	}

M
Mahesh Bandewar 已提交
1342 1343 1344 1345
	/* already in-use? */
	if (netdev_is_rx_handler_busy(slave_dev)) {
		netdev_err(bond_dev,
			   "Error: Device is in use and cannot be enslaved\n");
L
Linus Torvalds 已提交
1346 1347 1348
		return -EBUSY;
	}

1349
	if (bond_dev == slave_dev) {
1350
		netdev_err(bond_dev, "cannot enslave bond to itself.\n");
1351 1352 1353
		return -EPERM;
	}

L
Linus Torvalds 已提交
1354 1355 1356
	/* vlan challenged mutual exclusion */
	/* no need to lock since we're protected by rtnl_lock */
	if (slave_dev->features & NETIF_F_VLAN_CHALLENGED) {
1357 1358
		netdev_dbg(bond_dev, "%s is NETIF_F_VLAN_CHALLENGED\n",
			   slave_dev->name);
1359
		if (vlan_uses_dev(bond_dev)) {
1360 1361
			netdev_err(bond_dev, "Error: cannot enslave VLAN challenged slave %s on VLAN enabled bond %s\n",
				   slave_dev->name, bond_dev->name);
L
Linus Torvalds 已提交
1362 1363
			return -EPERM;
		} else {
1364 1365 1366
			netdev_warn(bond_dev, "enslaved VLAN challenged slave %s. Adding VLANs will be blocked as long as %s is part of bond %s\n",
				    slave_dev->name, slave_dev->name,
				    bond_dev->name);
L
Linus Torvalds 已提交
1367 1368
		}
	} else {
1369 1370
		netdev_dbg(bond_dev, "%s is !NETIF_F_VLAN_CHALLENGED\n",
			   slave_dev->name);
L
Linus Torvalds 已提交
1371 1372
	}

1373
	/* Old ifenslave binaries are no longer supported.  These can
S
Stephen Hemminger 已提交
1374
	 * be identified with moderate accuracy by the state of the slave:
1375 1376 1377
	 * the current ifenslave will set the interface down prior to
	 * enslaving it; the old ifenslave will not.
	 */
Y
yzhu1 已提交
1378
	if (slave_dev->flags & IFF_UP) {
1379 1380
		netdev_err(bond_dev, "%s is up - this may be due to an out of date ifenslave\n",
			   slave_dev->name);
1381
		return -EPERM;
1382
	}
L
Linus Torvalds 已提交
1383

1384 1385 1386 1387 1388 1389 1390
	/* set bonding device ether type by slave - bonding netdevices are
	 * created with ether_setup, so when the slave type is not ARPHRD_ETHER
	 * there is a need to override some of the type dependent attribs/funcs.
	 *
	 * bond ether type mutual exclusion - don't allow slaves of dissimilar
	 * ether type (eg ARPHRD_ETHER and ARPHRD_INFINIBAND) share the same bond
	 */
1391
	if (!bond_has_slaves(bond)) {
1392
		if (bond_dev->type != slave_dev->type) {
1393 1394
			netdev_dbg(bond_dev, "change device type from %d to %d\n",
				   bond_dev->type, slave_dev->type);
1395

1396 1397
			res = call_netdevice_notifiers(NETDEV_PRE_TYPE_CHANGE,
						       bond_dev);
1398 1399
			res = notifier_to_errno(res);
			if (res) {
1400
				netdev_err(bond_dev, "refused to change device type\n");
1401
				return -EBUSY;
1402
			}
1403

1404
			/* Flush unicast and multicast addresses */
1405
			dev_uc_flush(bond_dev);
1406
			dev_mc_flush(bond_dev);
1407

1408 1409
			if (slave_dev->type != ARPHRD_ETHER)
				bond_setup_by_slave(bond_dev, slave_dev);
1410
			else {
1411
				ether_setup(bond_dev);
1412 1413
				bond_dev->priv_flags &= ~IFF_TX_SKB_SHARING;
			}
1414

1415 1416
			call_netdevice_notifiers(NETDEV_POST_TYPE_CHANGE,
						 bond_dev);
1417
		}
1418
	} else if (bond_dev->type != slave_dev->type) {
1419 1420
		netdev_err(bond_dev, "%s ether type (%d) is different from other slaves (%d), can not enslave it\n",
			   slave_dev->name, slave_dev->type, bond_dev->type);
1421
		return -EINVAL;
1422 1423
	}

1424 1425 1426 1427 1428 1429 1430 1431 1432 1433
	if (slave_dev->type == ARPHRD_INFINIBAND &&
	    BOND_MODE(bond) != BOND_MODE_ACTIVEBACKUP) {
		netdev_warn(bond_dev, "Type (%d) supports only active-backup mode\n",
			    slave_dev->type);
		res = -EOPNOTSUPP;
		goto err_undo_flags;
	}

	if (!slave_ops->ndo_set_mac_address ||
	    slave_dev->type == ARPHRD_INFINIBAND) {
1434
		netdev_warn(bond_dev, "The slave device specified does not support setting the MAC address\n");
1435 1436 1437
		if (BOND_MODE(bond) == BOND_MODE_ACTIVEBACKUP &&
		    bond->params.fail_over_mac != BOND_FOM_ACTIVE) {
			if (!bond_has_slaves(bond)) {
1438
				bond->params.fail_over_mac = BOND_FOM_ACTIVE;
1439
				netdev_warn(bond_dev, "Setting fail_over_mac to active for active-backup mode\n");
1440
			} else {
1441
				netdev_err(bond_dev, "The slave device specified does not support setting the MAC address, but fail_over_mac is not set to active\n");
1442 1443
				res = -EOPNOTSUPP;
				goto err_undo_flags;
1444
			}
1445
		}
L
Linus Torvalds 已提交
1446 1447
	}

1448 1449
	call_netdevice_notifiers(NETDEV_JOIN, slave_dev);

1450
	/* If this is the first slave, then we need to set the master's hardware
1451 1452
	 * address to be the same as the slave's.
	 */
1453
	if (!bond_has_slaves(bond) &&
1454
	    bond->dev->addr_assign_type == NET_ADDR_RANDOM)
1455
		bond_set_dev_addr(bond->dev, slave_dev);
1456

1457
	new_slave = bond_alloc_slave(bond);
L
Linus Torvalds 已提交
1458 1459 1460 1461
	if (!new_slave) {
		res = -ENOMEM;
		goto err_undo_flags;
	}
1462

1463 1464
	new_slave->bond = bond;
	new_slave->dev = slave_dev;
1465
	/* Set the new_slave's queue_id to be zero.  Queue ID mapping
1466 1467 1468 1469
	 * is set via sysfs or module option if desired.
	 */
	new_slave->queue_id = 0;

1470 1471 1472 1473
	/* Save slave's original mtu and then set it to match the bond */
	new_slave->original_mtu = slave_dev->mtu;
	res = dev_set_mtu(slave_dev, bond->dev->mtu);
	if (res) {
1474
		netdev_dbg(bond_dev, "Error %d calling dev_set_mtu\n", res);
1475 1476 1477
		goto err_free;
	}

1478
	/* Save slave's original ("permanent") mac address for modes
1479 1480 1481
	 * that need it, and for restoring it upon release, and then
	 * set it to the master's address
	 */
1482
	ether_addr_copy(new_slave->perm_hwaddr, slave_dev->dev_addr);
L
Linus Torvalds 已提交
1483

1484
	if (!bond->params.fail_over_mac ||
1485
	    BOND_MODE(bond) != BOND_MODE_ACTIVEBACKUP) {
1486
		/* Set slave to master's mac address.  The application already
1487 1488 1489 1490 1491 1492
		 * set the master's mac address to that of the first slave
		 */
		memcpy(addr.sa_data, bond_dev->dev_addr, bond_dev->addr_len);
		addr.sa_family = slave_dev->type;
		res = dev_set_mac_address(slave_dev, &addr);
		if (res) {
1493
			netdev_dbg(bond_dev, "Error %d calling set_mac_address\n", res);
1494
			goto err_restore_mtu;
1495
		}
1496
	}
L
Linus Torvalds 已提交
1497

1498 1499 1500
	/* set slave flag before open to prevent IPv6 addrconf */
	slave_dev->flags |= IFF_SLAVE;

1501 1502 1503
	/* open the slave since the application closed it */
	res = dev_open(slave_dev);
	if (res) {
1504
		netdev_dbg(bond_dev, "Opening slave %s failed\n", slave_dev->name);
		goto err_restore_mac;
	}

	slave_dev->priv_flags |= IFF_BONDING;
	/* initialize slave stats */
	dev_get_stats(new_slave->dev, &new_slave->slave_stats);

	if (bond_is_lb(bond)) {
		/* bond_alb_init_slave() must be called before all other stages since
		 * it might fail and we do not want to have to undo everything
		 */
		res = bond_alb_init_slave(bond, new_slave);
		if (res)
			goto err_close;
	}

	/* If the mode uses primary, then the following is handled by
	 * bond_change_active_slave().
	 */
	if (!bond_uses_primary(bond)) {
		/* set promiscuity level to new slave */
		if (bond_dev->flags & IFF_PROMISC) {
			res = dev_set_promiscuity(slave_dev, 1);
			if (res)
				goto err_close;
		}

		/* set allmulti level to new slave */
		if (bond_dev->flags & IFF_ALLMULTI) {
			res = dev_set_allmulti(slave_dev, 1);
			if (res)
				goto err_close;
		}

		netif_addr_lock_bh(bond_dev);

		dev_mc_sync_multiple(slave_dev, bond_dev);
		dev_uc_sync_multiple(slave_dev, bond_dev);

		netif_addr_unlock_bh(bond_dev);
	}

	if (BOND_MODE(bond) == BOND_MODE_8023AD) {
		/* add lacpdu mc addr to mc list */
		u8 lacpdu_multicast[ETH_ALEN] = MULTICAST_LACPDU_ADDR;

		dev_mc_add(slave_dev, lacpdu_multicast);
	}

	res = vlan_vids_add_by_dev(slave_dev, bond_dev);
	if (res) {
		netdev_err(bond_dev, "Couldn't add bond vlan ids to %s\n",
			   slave_dev->name);
		goto err_close;
	}

	prev_slave = bond_last_slave(bond);

	new_slave->delay = 0;
	new_slave->link_failure_count = 0;

	bond_update_speed_duplex(new_slave);

	new_slave->last_rx = jiffies -
		(msecs_to_jiffies(bond->params.arp_interval) + 1);
	for (i = 0; i < BOND_MAX_ARP_TARGETS; i++)
		new_slave->target_last_arp_rx[i] = new_slave->last_rx;

	if (bond->params.miimon && !bond->params.use_carrier) {
		link_reporting = bond_check_dev_link(bond, slave_dev, 1);

		if ((link_reporting == -1) && !bond->params.arp_interval) {
			/* miimon is set but a bonded network driver
			 * does not support ETHTOOL/MII and
			 * arp_interval is not set.  Note: if
			 * use_carrier is enabled, we will never go
			 * here (because netif_carrier is always
			 * supported); thus, we don't need to change
			 * the messages for netif_carrier.
			 */
			netdev_warn(bond_dev, "MII and ETHTOOL support not available for interface %s, and arp_interval/arp_ip_target module parameters not specified, thus bonding will not detect link failures! see bonding.txt for details\n",
				    slave_dev->name);
		} else if (link_reporting == -1) {
			/* unable to get link status using mii/ethtool */
			netdev_warn(bond_dev, "can't get link status from interface %s; the network driver associated with this interface does not support MII or ETHTOOL link status reporting, thus miimon has no effect on this interface\n",
				    slave_dev->name);
		}
	}

	/* check for initial state */
	new_slave->link = BOND_LINK_NOCHANGE;
	if (bond->params.miimon) {
		if (bond_check_dev_link(bond, slave_dev, 0) == BMSR_LSTATUS) {
			if (bond->params.updelay) {
				bond_set_slave_link_state(new_slave,
							  BOND_LINK_BACK,
							  BOND_SLAVE_NOTIFY_NOW);
				new_slave->delay = bond->params.updelay;
			} else {
				bond_set_slave_link_state(new_slave,
							  BOND_LINK_UP,
							  BOND_SLAVE_NOTIFY_NOW);
			}
		} else {
			bond_set_slave_link_state(new_slave, BOND_LINK_DOWN,
						  BOND_SLAVE_NOTIFY_NOW);
		}
	} else if (bond->params.arp_interval) {
		bond_set_slave_link_state(new_slave,
					  (netif_carrier_ok(slave_dev) ?
					  BOND_LINK_UP : BOND_LINK_DOWN),
					  BOND_SLAVE_NOTIFY_NOW);
	} else {
		bond_set_slave_link_state(new_slave, BOND_LINK_UP,
					  BOND_SLAVE_NOTIFY_NOW);
	}

	if (new_slave->link != BOND_LINK_DOWN)
		new_slave->last_link_up = jiffies;
	netdev_dbg(bond_dev, "Initial state of slave_dev is BOND_LINK_%s\n",
		   new_slave->link == BOND_LINK_DOWN ? "DOWN" :
		   (new_slave->link == BOND_LINK_UP ? "UP" : "BACK"));

	if (bond_uses_primary(bond) && bond->params.primary[0]) {
		/* if there is a primary slave, remember it */
		if (strcmp(bond->params.primary, new_slave->dev->name) == 0) {
			rcu_assign_pointer(bond->primary_slave, new_slave);
			bond->force_primary = true;
		}
	}

	switch (BOND_MODE(bond)) {
	case BOND_MODE_ACTIVEBACKUP:
		bond_set_slave_inactive_flags(new_slave,
					      BOND_SLAVE_NOTIFY_NOW);
		break;
	case BOND_MODE_8023AD:
		/* in 802.3ad mode, the internal mechanism
		 * will activate the slaves in the selected
		 * aggregator
		 */
		bond_set_slave_inactive_flags(new_slave, BOND_SLAVE_NOTIFY_NOW);
		/* if this is the first slave */
		if (!prev_slave) {
			SLAVE_AD_INFO(new_slave)->id = 1;
			/* Initialize AD with the number of times that the AD timer is called in 1 second
			 * can be called only after the mac address of the bond is set
			 */
			bond_3ad_initialize(bond, 1000/AD_TIMER_INTERVAL);
		} else {
			SLAVE_AD_INFO(new_slave)->id =
				SLAVE_AD_INFO(prev_slave)->id + 1;
		}

		bond_3ad_bind_slave(new_slave);
		break;
	case BOND_MODE_TLB:
	case BOND_MODE_ALB:
		bond_set_active_slave(new_slave);
		bond_set_slave_inactive_flags(new_slave, BOND_SLAVE_NOTIFY_NOW);
		break;
	default:
		netdev_dbg(bond_dev, "This slave is always active in trunk mode\n");

		/* always active in trunk mode */
		bond_set_active_slave(new_slave);

		/* In trunking mode there is little meaning to curr_active_slave
		 * anyway (it holds no special properties of the bond device),
		 * so we can change it without calling change_active_interface()
		 */
		if (!rcu_access_pointer(bond->curr_active_slave) &&
		    new_slave->link == BOND_LINK_UP)
			rcu_assign_pointer(bond->curr_active_slave, new_slave);

		break;
	} /* switch(bond_mode) */

#ifdef CONFIG_NET_POLL_CONTROLLER
	slave_dev->npinfo = bond->dev->npinfo;
	if (slave_dev->npinfo) {
		if (slave_enable_netpoll(new_slave)) {
			netdev_info(bond_dev, "master_dev is using netpoll, but new slave device does not support netpoll\n");
			res = -EBUSY;
			goto err_detach;
		}
	}
#endif

	if (!(bond_dev->features & NETIF_F_LRO))
		dev_disable_lro(slave_dev);

	res = netdev_rx_handler_register(slave_dev, bond_handle_frame,
					 new_slave);
	if (res) {
		netdev_dbg(bond_dev, "Error %d calling netdev_rx_handler_register\n", res);
		goto err_detach;
	}

	res = bond_master_upper_dev_link(bond, new_slave);
	if (res) {
		netdev_dbg(bond_dev, "Error %d calling bond_master_upper_dev_link\n", res);
		goto err_unregister;
	}

	res = bond_sysfs_slave_add(new_slave);
	if (res) {
		netdev_dbg(bond_dev, "Error %d calling bond_sysfs_slave_add\n", res);
		goto err_upper_unlink;
	}

	bond->slave_cnt++;
	bond_compute_features(bond);
	bond_set_carrier(bond);

	if (bond_uses_primary(bond)) {
		block_netpoll_tx();
		bond_select_active_slave(bond);
		unblock_netpoll_tx();
	}

	if (bond_mode_uses_xmit_hash(bond))
		bond_update_slave_arr(bond, NULL);

	netdev_info(bond_dev, "Enslaving %s as %s interface with %s link\n",
		    slave_dev->name,
		    bond_is_active_slave(new_slave) ? "an active" : "a backup",
		    new_slave->link != BOND_LINK_DOWN ? "an up" : "a down");

	/* enslave is successful */
	bond_queue_slave_event(new_slave);
	return 0;

/* Undo stages on error */
err_upper_unlink:
	bond_upper_dev_unlink(bond, new_slave);

err_unregister:
	netdev_rx_handler_unregister(slave_dev);

err_detach:
	if (!bond_uses_primary(bond))
		bond_hw_addr_flush(bond_dev, slave_dev);

	vlan_vids_del_by_dev(slave_dev, bond_dev);
	if (rcu_access_pointer(bond->primary_slave) == new_slave)
		RCU_INIT_POINTER(bond->primary_slave, NULL);
	if (rcu_access_pointer(bond->curr_active_slave) == new_slave) {
		block_netpoll_tx();
		bond_change_active_slave(bond, NULL);
		bond_select_active_slave(bond);
		unblock_netpoll_tx();
	}
	/* either primary_slave or curr_active_slave might've changed */
	synchronize_rcu();
	slave_disable_netpoll(new_slave);

err_close:
	slave_dev->priv_flags &= ~IFF_BONDING;
	dev_close(slave_dev);

err_restore_mac:
	slave_dev->flags &= ~IFF_SLAVE;
	if (!bond->params.fail_over_mac ||
	    BOND_MODE(bond) != BOND_MODE_ACTIVEBACKUP) {
		/* XXX TODO - fom follow mode needs to change master's
		 * MAC if this slave's MAC is in use by the bond, or at
		 * least print a warning.
		 */
		ether_addr_copy(addr.sa_data, new_slave->perm_hwaddr);
		addr.sa_family = slave_dev->type;
		dev_set_mac_address(slave_dev, &addr);
	}

err_restore_mtu:
	dev_set_mtu(slave_dev, new_slave->original_mtu);

err_free:
	bond_free_slave(new_slave);

err_undo_flags:
	/* Enslave of first slave has failed and we need to fix master's mac */
	if (!bond_has_slaves(bond)) {
		if (ether_addr_equal_64bits(bond_dev->dev_addr,
					    slave_dev->dev_addr))
			eth_hw_addr_random(bond_dev);
		if (bond_dev->type != ARPHRD_ETHER) {
			dev_close(bond_dev);
			ether_setup(bond_dev);
			bond_dev->flags |= IFF_MASTER;
			bond_dev->priv_flags &= ~IFF_TX_SKB_SHARING;
		}
	}

	return res;
}
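
/* Note on the error path above: each err_* label undoes only the setup
 * steps that had completed before the failure, in reverse order of setup
 * (upper-dev link, rx_handler, address/vlan state, netpoll, device state,
 * MAC, MTU, slave allocation), so a failed enslave leaves both the bond
 * and the candidate slave as they were before the call.
 */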

/* Try to release the slave device <slave> from the bond device <master>.
 * It is legal to access curr_active_slave without a lock because the whole
 * function is RTNL-locked. If "all" is true it means that the function is
 * being called while destroying a bond interface and all slaves are being
 * released.
 *
 * The rules for slave state should be:
 *   for Active/Backup:
 *     Active stays on, all backups go down
 *   for Bonded connections:
 *     The first up interface should be left on and all others downed.
 */
static int __bond_release_one(struct net_device *bond_dev,
			      struct net_device *slave_dev,
			      bool all)
{
	struct bonding *bond = netdev_priv(bond_dev);
	struct slave *slave, *oldcurrent;
	struct sockaddr addr;
	int old_flags = bond_dev->flags;
	netdev_features_t old_features = bond_dev->features;

	/* slave is not a slave or master is not master of this slave */
	if (!(slave_dev->flags & IFF_SLAVE) ||
	    !netdev_has_upper_dev(slave_dev, bond_dev)) {
		netdev_dbg(bond_dev, "cannot release %s\n",
			   slave_dev->name);
		return -EINVAL;
	}

	block_netpoll_tx();

	slave = bond_get_slave_by_dev(bond, slave_dev);
	if (!slave) {
		/* not a slave of this bond */
		netdev_info(bond_dev, "%s not enslaved\n",
			    slave_dev->name);
		unblock_netpoll_tx();
		return -EINVAL;
	}

	bond_set_slave_inactive_flags(slave, BOND_SLAVE_NOTIFY_NOW);

	bond_sysfs_slave_del(slave);

	/* recompute stats just before removing the slave */
	bond_get_stats(bond->dev, &bond->bond_stats);

	bond_upper_dev_unlink(bond, slave);
	/* unregister rx_handler early so bond_handle_frame wouldn't be called
	 * for this slave anymore.
	 */
	netdev_rx_handler_unregister(slave_dev);

	if (BOND_MODE(bond) == BOND_MODE_8023AD)
		bond_3ad_unbind_slave(slave);

	if (bond_mode_uses_xmit_hash(bond))
		bond_update_slave_arr(bond, slave);

	netdev_info(bond_dev, "Releasing %s interface %s\n",
		    bond_is_active_slave(slave) ? "active" : "backup",
		    slave_dev->name);

	oldcurrent = rcu_access_pointer(bond->curr_active_slave);

	RCU_INIT_POINTER(bond->current_arp_slave, NULL);

	if (!all && (!bond->params.fail_over_mac ||
		     BOND_MODE(bond) != BOND_MODE_ACTIVEBACKUP)) {
		if (ether_addr_equal_64bits(bond_dev->dev_addr, slave->perm_hwaddr) &&
		    bond_has_slaves(bond))
			netdev_warn(bond_dev, "the permanent HWaddr of %s - %pM - is still in use by %s - set the HWaddr of %s to a different address to avoid conflicts\n",
				    slave_dev->name, slave->perm_hwaddr,
				    bond_dev->name, slave_dev->name);
	}

	if (rtnl_dereference(bond->primary_slave) == slave)
		RCU_INIT_POINTER(bond->primary_slave, NULL);

	if (oldcurrent == slave)
		bond_change_active_slave(bond, NULL);

	if (bond_is_lb(bond)) {
		/* Must be called only after the slave has been
		 * detached from the list and the curr_active_slave
		 * has been cleared (if our_slave == old_current),
		 * but before a new active slave is selected.
		 */
		bond_alb_deinit_slave(bond, slave);
	}

	if (all) {
		RCU_INIT_POINTER(bond->curr_active_slave, NULL);
	} else if (oldcurrent == slave) {
		/* Note that we hold RTNL over this sequence, so there
		 * is no concern that another slave add/remove event
		 * will interfere.
		 */
		bond_select_active_slave(bond);
	}

	if (!bond_has_slaves(bond)) {
		bond_set_carrier(bond);
		eth_hw_addr_random(bond_dev);
	}

	unblock_netpoll_tx();
	synchronize_rcu();
	bond->slave_cnt--;

	if (!bond_has_slaves(bond)) {
		call_netdevice_notifiers(NETDEV_CHANGEADDR, bond->dev);
		call_netdevice_notifiers(NETDEV_RELEASE, bond->dev);
	}

	bond_compute_features(bond);
	if (!(bond_dev->features & NETIF_F_VLAN_CHALLENGED) &&
	    (old_features & NETIF_F_VLAN_CHALLENGED))
		netdev_info(bond_dev, "last VLAN challenged slave %s left bond %s - VLAN blocking is removed\n",
			    slave_dev->name, bond_dev->name);

	vlan_vids_del_by_dev(slave_dev, bond_dev);

	/* If the mode uses primary, then this case was handled above by
	 * bond_change_active_slave(..., NULL)
	 */
	if (!bond_uses_primary(bond)) {
		/* unset promiscuity level from slave
		 * NOTE: The NETDEV_CHANGEADDR call above may change the value
		 * of the IFF_PROMISC flag in the bond_dev, but we need the
		 * value of that flag before that change, as that was the value
		 * when this slave was attached, so we cache at the start of the
		 * function and use it here. Same goes for ALLMULTI below
		 */
		if (old_flags & IFF_PROMISC)
			dev_set_promiscuity(slave_dev, -1);

		/* unset allmulti level from slave */
		if (old_flags & IFF_ALLMULTI)
			dev_set_allmulti(slave_dev, -1);

		bond_hw_addr_flush(bond_dev, slave_dev);
	}

	slave_disable_netpoll(slave);

	/* close slave before restoring its mac address */
	dev_close(slave_dev);

	if (bond->params.fail_over_mac != BOND_FOM_ACTIVE ||
	    BOND_MODE(bond) != BOND_MODE_ACTIVEBACKUP) {
		/* restore original ("permanent") mac address */
		ether_addr_copy(addr.sa_data, slave->perm_hwaddr);
		addr.sa_family = slave_dev->type;
		dev_set_mac_address(slave_dev, &addr);
	}

	dev_set_mtu(slave_dev, slave->original_mtu);

	slave_dev->priv_flags &= ~IFF_BONDING;

	bond_free_slave(slave);

	return 0;
}

/* A wrapper used because of ndo_del_link */
int bond_release(struct net_device *bond_dev, struct net_device *slave_dev)
{
	return __bond_release_one(bond_dev, slave_dev, false);
}

/* First release a slave and then destroy the bond if no more slaves are left.
 * Must be under rtnl_lock when this function is called.
 */
static int  bond_release_and_destroy(struct net_device *bond_dev,
				     struct net_device *slave_dev)
{
	struct bonding *bond = netdev_priv(bond_dev);
	int ret;

	ret = bond_release(bond_dev, slave_dev);
	if (ret == 0 && !bond_has_slaves(bond)) {
		bond_dev->priv_flags |= IFF_DISABLE_NETPOLL;
		netdev_info(bond_dev, "Destroying bond %s\n",
			    bond_dev->name);
		bond_remove_proc_entry(bond);
		unregister_netdevice(bond_dev);
	}
	return ret;
}

static void bond_info_query(struct net_device *bond_dev, struct ifbond *info)
{
	struct bonding *bond = netdev_priv(bond_dev);
	bond_fill_ifbond(bond, info);
}

static int bond_slave_info_query(struct net_device *bond_dev, struct ifslave *info)
{
	struct bonding *bond = netdev_priv(bond_dev);
	struct list_head *iter;
	int i = 0, res = -ENODEV;
	struct slave *slave;

	bond_for_each_slave(bond, slave, iter) {
		if (i++ == (int)info->slave_id) {
			res = 0;
			bond_fill_ifslave(slave, info);
			break;
		}
	}

	return res;
}

/*-------------------------------- Monitoring -------------------------------*/

/* called with rcu_read_lock() */
static int bond_miimon_inspect(struct bonding *bond)
{
	int link_state, commit = 0;
	struct list_head *iter;
	struct slave *slave;
	bool ignore_updelay;

	ignore_updelay = !rcu_dereference(bond->curr_active_slave);

	bond_for_each_slave_rcu(bond, slave, iter) {
		slave->new_link = BOND_LINK_NOCHANGE;

		link_state = bond_check_dev_link(bond, slave->dev, 0);

		switch (slave->link) {
		case BOND_LINK_UP:
			if (link_state)
				continue;

			bond_propose_link_state(slave, BOND_LINK_FAIL);
			slave->delay = bond->params.downdelay;
			if (slave->delay) {
				netdev_info(bond->dev, "link status down for %sinterface %s, disabling it in %d ms\n",
					    (BOND_MODE(bond) ==
					     BOND_MODE_ACTIVEBACKUP) ?
					     (bond_is_active_slave(slave) ?
					      "active " : "backup ") : "",
					    slave->dev->name,
					    bond->params.downdelay * bond->params.miimon);
			}
			/*FALLTHRU*/
		case BOND_LINK_FAIL:
			if (link_state) {
				/* recovered before downdelay expired */
				bond_propose_link_state(slave, BOND_LINK_UP);
				slave->last_link_up = jiffies;
				netdev_info(bond->dev, "link status up again after %d ms for interface %s\n",
					    (bond->params.downdelay - slave->delay) *
					    bond->params.miimon,
					    slave->dev->name);
				continue;
			}

			if (slave->delay <= 0) {
				slave->new_link = BOND_LINK_DOWN;
				commit++;
				continue;
			}

			slave->delay--;
			break;

		case BOND_LINK_DOWN:
			if (!link_state)
				continue;

			bond_propose_link_state(slave, BOND_LINK_BACK);
			slave->delay = bond->params.updelay;

			if (slave->delay) {
				netdev_info(bond->dev, "link status up for interface %s, enabling it in %d ms\n",
					    slave->dev->name,
					    ignore_updelay ? 0 :
					    bond->params.updelay *
					    bond->params.miimon);
			}
			/*FALLTHRU*/
		case BOND_LINK_BACK:
			if (!link_state) {
				bond_propose_link_state(slave, BOND_LINK_DOWN);
				netdev_info(bond->dev, "link status down again after %d ms for interface %s\n",
					    (bond->params.updelay - slave->delay) *
					    bond->params.miimon,
					    slave->dev->name);

				continue;
			}

			if (ignore_updelay)
				slave->delay = 0;

			if (slave->delay <= 0) {
				slave->new_link = BOND_LINK_UP;
				commit++;
				ignore_updelay = false;
				continue;
			}

			slave->delay--;
			break;
		}
	}

	return commit;
}
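
/* A short summary of the state machine driven above, one pass per miimon
 * interval: a slave that loses carrier goes UP -> FAIL and must stay
 * carrier-down for 'downdelay' passes before BOND_LINK_DOWN is proposed;
 * a slave that regains carrier goes DOWN -> BACK and must stay carrier-up
 * for 'updelay' passes before BOND_LINK_UP is proposed (updelay is skipped
 * while the bond has no active slave).  A slave that flips back before its
 * delay expires is simply proposed back to its previous state; only
 * UP/DOWN decisions are counted in 'commit'.
 */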

static void bond_miimon_commit(struct bonding *bond)
{
	struct list_head *iter;
	struct slave *slave, *primary;

	bond_for_each_slave(bond, slave, iter) {
		switch (slave->new_link) {
		case BOND_LINK_NOCHANGE:
			continue;

		case BOND_LINK_UP:
			if (bond_update_speed_duplex(slave)) {
				netdev_warn(bond->dev,
					    "failed to get link speed/duplex for %s\n",
					    slave->dev->name);
				continue;
			}
			bond_set_slave_link_state(slave, BOND_LINK_UP,
						  BOND_SLAVE_NOTIFY_NOW);
			slave->last_link_up = jiffies;

			primary = rtnl_dereference(bond->primary_slave);
			if (BOND_MODE(bond) == BOND_MODE_8023AD) {
				/* prevent it from being the active one */
				bond_set_backup_slave(slave);
			} else if (BOND_MODE(bond) != BOND_MODE_ACTIVEBACKUP) {
				/* make it immediately active */
				bond_set_active_slave(slave);
			} else if (slave != primary) {
				/* prevent it from being the active one */
				bond_set_backup_slave(slave);
			}

			netdev_info(bond->dev, "link status definitely up for interface %s, %u Mbps %s duplex\n",
				    slave->dev->name,
				    slave->speed == SPEED_UNKNOWN ? 0 : slave->speed,
				    slave->duplex ? "full" : "half");

			/* notify ad that the link status has changed */
			if (BOND_MODE(bond) == BOND_MODE_8023AD)
				bond_3ad_handle_link_change(slave, BOND_LINK_UP);

			if (bond_is_lb(bond))
				bond_alb_handle_link_change(bond, slave,
							    BOND_LINK_UP);

			if (BOND_MODE(bond) == BOND_MODE_XOR)
				bond_update_slave_arr(bond, NULL);

			if (!bond->curr_active_slave || slave == primary)
				goto do_failover;

			continue;

		case BOND_LINK_DOWN:
			if (slave->link_failure_count < UINT_MAX)
				slave->link_failure_count++;

			bond_set_slave_link_state(slave, BOND_LINK_DOWN,
						  BOND_SLAVE_NOTIFY_NOW);

			if (BOND_MODE(bond) == BOND_MODE_ACTIVEBACKUP ||
			    BOND_MODE(bond) == BOND_MODE_8023AD)
				bond_set_slave_inactive_flags(slave,
							      BOND_SLAVE_NOTIFY_NOW);

			netdev_info(bond->dev, "link status definitely down for interface %s, disabling it\n",
				    slave->dev->name);

			if (BOND_MODE(bond) == BOND_MODE_8023AD)
				bond_3ad_handle_link_change(slave,
							    BOND_LINK_DOWN);

			if (bond_is_lb(bond))
				bond_alb_handle_link_change(bond, slave,
							    BOND_LINK_DOWN);

			if (BOND_MODE(bond) == BOND_MODE_XOR)
				bond_update_slave_arr(bond, NULL);

			if (slave == rcu_access_pointer(bond->curr_active_slave))
				goto do_failover;

			continue;

		default:
			netdev_err(bond->dev, "invalid new link %d on slave %s\n",
				   slave->new_link, slave->dev->name);
			slave->new_link = BOND_LINK_NOCHANGE;

			continue;
		}

do_failover:
		block_netpoll_tx();
		bond_select_active_slave(bond);
		unblock_netpoll_tx();
	}

	bond_set_carrier(bond);
}
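
/* One detail of the commit phase worth noting: if refreshing speed/duplex
 * fails for a slave that just came up, the slave is left in its old state
 * and will be re-inspected on the next miimon pass rather than being
 * promoted with unknown link parameters.
 */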

/* bond_mii_monitor
 *
 * Really a wrapper that splits the mii monitor into two phases: an
 * inspection, then (if inspection indicates something needs to be done)
 * an acquisition of appropriate locks followed by a commit phase to
 * implement whatever link state changes are indicated.
 */
static void bond_mii_monitor(struct work_struct *work)
{
	struct bonding *bond = container_of(work, struct bonding,
					    mii_work.work);
	bool should_notify_peers = false;
	unsigned long delay;
	struct slave *slave;
	struct list_head *iter;

	delay = msecs_to_jiffies(bond->params.miimon);

	if (!bond_has_slaves(bond))
		goto re_arm;

	rcu_read_lock();

	should_notify_peers = bond_should_notify_peers(bond);

	if (bond_miimon_inspect(bond)) {
		rcu_read_unlock();

		/* Race avoidance with bond_close cancel of workqueue */
		if (!rtnl_trylock()) {
			delay = 1;
			should_notify_peers = false;
			goto re_arm;
		}

		bond_for_each_slave(bond, slave, iter) {
			bond_commit_link_state(slave, BOND_SLAVE_NOTIFY_LATER);
		}
		bond_miimon_commit(bond);

		rtnl_unlock();	/* might sleep, hold no other locks */
	} else
		rcu_read_unlock();

re_arm:
	if (bond->params.miimon)
		queue_delayed_work(bond->wq, &bond->mii_work, delay);

	if (should_notify_peers) {
		if (!rtnl_trylock())
			return;
		call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, bond->dev);
		rtnl_unlock();
	}
}
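
/* If rtnl_trylock() fails in either place above, the monitor does not
 * block: the commit pass is simply retried on a one-jiffy re-arm, and the
 * peer notification is dropped for this pass.  This avoids deadlocking
 * against bond_close(), which cancels this work while holding RTNL.
 */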

static int bond_upper_dev_walk(struct net_device *upper, void *data)
{
	__be32 ip = *((__be32 *)data);

	return ip == bond_confirm_addr(upper, 0, ip);
}

static bool bond_has_this_ip(struct bonding *bond, __be32 ip)
{
	bool ret = false;

	if (ip == bond_confirm_addr(bond->dev, 0, ip))
		return true;

	rcu_read_lock();
	if (netdev_walk_all_upper_dev_rcu(bond->dev, bond_upper_dev_walk, &ip))
		ret = true;
	rcu_read_unlock();

	return ret;
}

/* We go to the (large) trouble of VLAN tagging ARP frames because
 * switches in VLAN mode (especially if ports are configured as
 * "native" to a VLAN) might not pass non-tagged frames.
 */
static void bond_arp_send(struct net_device *slave_dev, int arp_op,
			  __be32 dest_ip, __be32 src_ip,
			  struct bond_vlan_tag *tags)
{
	struct sk_buff *skb;
	struct bond_vlan_tag *outer_tag = tags;

	netdev_dbg(slave_dev, "arp %d on slave %s: dst %pI4 src %pI4\n",
		   arp_op, slave_dev->name, &dest_ip, &src_ip);

	skb = arp_create(arp_op, ETH_P_ARP, dest_ip, slave_dev, src_ip,
			 NULL, slave_dev->dev_addr, NULL);

	if (!skb) {
		net_err_ratelimited("ARP packet allocation failed\n");
		return;
	}

	if (!tags || tags->vlan_proto == VLAN_N_VID)
		goto xmit;

	tags++;

	/* Go through all the tags backwards and add them to the packet */
	while (tags->vlan_proto != VLAN_N_VID) {
		if (!tags->vlan_id) {
			tags++;
			continue;
		}

		netdev_dbg(slave_dev, "inner tag: proto %X vid %X\n",
			   ntohs(outer_tag->vlan_proto), tags->vlan_id);
		skb = vlan_insert_tag_set_proto(skb, tags->vlan_proto,
						tags->vlan_id);
		if (!skb) {
			net_err_ratelimited("failed to insert inner VLAN tag\n");
			return;
		}

		tags++;
	}
	/* Set the outer tag */
	if (outer_tag->vlan_id) {
		netdev_dbg(slave_dev, "outer tag: proto %X vid %X\n",
			   ntohs(outer_tag->vlan_proto), outer_tag->vlan_id);
		__vlan_hwaccel_put_tag(skb, outer_tag->vlan_proto,
				       outer_tag->vlan_id);
	}

xmit:
	arp_xmit(skb);
}

/* Validate the device path between the @start_dev and the @end_dev.
 * The path is valid if the @end_dev is reachable through device
 * stacking.
 * When the path is validated, collect any vlan information in the
 * path.
 */
struct bond_vlan_tag *bond_verify_device_path(struct net_device *start_dev,
					      struct net_device *end_dev,
					      int level)
{
	struct bond_vlan_tag *tags;
	struct net_device *upper;
	struct list_head  *iter;

	if (start_dev == end_dev) {
		tags = kzalloc(sizeof(*tags) * (level + 1), GFP_ATOMIC);
		if (!tags)
			return ERR_PTR(-ENOMEM);
		tags[level].vlan_proto = VLAN_N_VID;
		return tags;
	}

	netdev_for_each_upper_dev_rcu(start_dev, upper, iter) {
		tags = bond_verify_device_path(upper, end_dev, level + 1);
		if (IS_ERR_OR_NULL(tags)) {
			if (IS_ERR(tags))
				return tags;
			continue;
		}
		if (is_vlan_dev(upper)) {
			tags[level].vlan_proto = vlan_dev_vlan_proto(upper);
			tags[level].vlan_id = vlan_dev_vlan_id(upper);
		}

		return tags;
	}

	return NULL;
}
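
/* Worked example (hypothetical setup): for a stacked-VLAN configuration
 * such as bond0 <- bond0.100 <- bond0.100.200, with the route's output
 * device being bond0.100.200, the walk above returns three entries:
 * tags[0] holds VLAN 100 (the vlan device directly on top of the bond,
 * used as the outer tag by bond_arp_send()), tags[1] holds VLAN 200, and
 * the final entry is the terminator with vlan_proto set to VLAN_N_VID.
 */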

static void bond_arp_send_all(struct bonding *bond, struct slave *slave)
{
	struct rtable *rt;
	struct bond_vlan_tag *tags;
	__be32 *targets = bond->params.arp_targets, addr;
	int i;

	for (i = 0; i < BOND_MAX_ARP_TARGETS && targets[i]; i++) {
		netdev_dbg(bond->dev, "basa: target %pI4\n", &targets[i]);
		tags = NULL;

		/* Find out through which dev should the packet go */
		rt = ip_route_output(dev_net(bond->dev), targets[i], 0,
				     RTO_ONLINK, 0);
		if (IS_ERR(rt)) {
			/* there's no route to target - try to send arp
			 * probe to generate any traffic (arp_validate=0)
			 */
			if (bond->params.arp_validate)
				net_warn_ratelimited("%s: no route to arp_ip_target %pI4 and arp_validate is set\n",
						     bond->dev->name,
						     &targets[i]);
			bond_arp_send(slave->dev, ARPOP_REQUEST, targets[i],
				      0, tags);
			continue;
		}

		/* bond device itself */
		if (rt->dst.dev == bond->dev)
			goto found;

		rcu_read_lock();
		tags = bond_verify_device_path(bond->dev, rt->dst.dev, 0);
		rcu_read_unlock();

		if (!IS_ERR_OR_NULL(tags))
			goto found;

		/* Not our device - skip */
		netdev_dbg(bond->dev, "no path to arp_ip_target %pI4 via rt.dev %s\n",
			   &targets[i], rt->dst.dev ? rt->dst.dev->name : "NULL");

		ip_rt_put(rt);
		continue;

found:
		addr = bond_confirm_addr(rt->dst.dev, targets[i], 0);
		ip_rt_put(rt);
		bond_arp_send(slave->dev, ARPOP_REQUEST, targets[i],
			      addr, tags);
		kfree(tags);
	}
}

static void bond_validate_arp(struct bonding *bond, struct slave *slave, __be32 sip, __be32 tip)
{
	int i;

	if (!sip || !bond_has_this_ip(bond, tip)) {
		netdev_dbg(bond->dev, "bva: sip %pI4 tip %pI4 not found\n",
			   &sip, &tip);
		return;
	}

	i = bond_get_targets_ip(bond->params.arp_targets, sip);
	if (i == -1) {
		netdev_dbg(bond->dev, "bva: sip %pI4 not found in targets\n",
			   &sip);
		return;
	}
	slave->last_rx = jiffies;
	slave->target_last_arp_rx[i] = jiffies;
}

int bond_arp_rcv(const struct sk_buff *skb, struct bonding *bond,
		 struct slave *slave)
{
	struct arphdr *arp = (struct arphdr *)skb->data;
	struct slave *curr_active_slave, *curr_arp_slave;
	unsigned char *arp_ptr;
	__be32 sip, tip;
	int alen, is_arp = skb->protocol == __cpu_to_be16(ETH_P_ARP);

	if (!slave_do_arp_validate(bond, slave)) {
		if ((slave_do_arp_validate_only(bond) && is_arp) ||
		    !slave_do_arp_validate_only(bond))
			slave->last_rx = jiffies;
		return RX_HANDLER_ANOTHER;
	} else if (!is_arp) {
		return RX_HANDLER_ANOTHER;
	}

	alen = arp_hdr_len(bond->dev);

	netdev_dbg(bond->dev, "bond_arp_rcv: skb->dev %s\n",
		   skb->dev->name);

	if (alen > skb_headlen(skb)) {
		arp = kmalloc(alen, GFP_ATOMIC);
		if (!arp)
			goto out_unlock;
		if (skb_copy_bits(skb, 0, arp, alen) < 0)
			goto out_unlock;
	}

	if (arp->ar_hln != bond->dev->addr_len ||
	    skb->pkt_type == PACKET_OTHERHOST ||
	    skb->pkt_type == PACKET_LOOPBACK ||
	    arp->ar_hrd != htons(ARPHRD_ETHER) ||
	    arp->ar_pro != htons(ETH_P_IP) ||
	    arp->ar_pln != 4)
		goto out_unlock;

	arp_ptr = (unsigned char *)(arp + 1);
	arp_ptr += bond->dev->addr_len;
	memcpy(&sip, arp_ptr, 4);
	arp_ptr += 4 + bond->dev->addr_len;
	memcpy(&tip, arp_ptr, 4);

	netdev_dbg(bond->dev, "bond_arp_rcv: %s/%d av %d sv %d sip %pI4 tip %pI4\n",
		   slave->dev->name, bond_slave_state(slave),
		     bond->params.arp_validate, slave_do_arp_validate(bond, slave),
		     &sip, &tip);

	curr_active_slave = rcu_dereference(bond->curr_active_slave);
	curr_arp_slave = rcu_dereference(bond->current_arp_slave);

	/* We 'trust' the received ARP enough to validate it if:
	 *
	 * (a) the slave receiving the ARP is active (which includes the
	 * current ARP slave, if any), or
	 *
	 * (b) the receiving slave isn't active, but there is a currently
	 * active slave and it received valid arp reply(s) after it became
	 * the currently active slave, or
	 *
	 * (c) there is an ARP slave that sent an ARP during the prior ARP
	 * interval, and we receive an ARP reply on any slave.  We accept
	 * these because switch FDB update delays may deliver the ARP
	 * reply to a slave other than the sender of the ARP request.
	 *
	 * Note: for (b), backup slaves are receiving the broadcast ARP
	 * request, not a reply.  This request passes from the sending
	 * slave through the L2 switch(es) to the receiving slave.  Since
	 * this is checking the request, sip/tip are swapped for
	 * validation.
	 *
	 * This is done to avoid endless looping when we can't reach the
	 * arp_ip_target and fool ourselves with our own arp requests.
	 */
	if (bond_is_active_slave(slave))
		bond_validate_arp(bond, slave, sip, tip);
	else if (curr_active_slave &&
		 time_after(slave_last_rx(bond, curr_active_slave),
			    curr_active_slave->last_link_up))
		bond_validate_arp(bond, slave, tip, sip);
	else if (curr_arp_slave && (arp->ar_op == htons(ARPOP_REPLY)) &&
		 bond_time_in_interval(bond,
				       dev_trans_start(curr_arp_slave->dev), 1))
		bond_validate_arp(bond, slave, sip, tip);

out_unlock:
	if (arp != (struct arphdr *)skb->data)
		kfree(arp);
	return RX_HANDLER_ANOTHER;
}

/* function to verify if we're in the arp_interval timeslice, returns true if
 * (last_act - arp_interval) <= jiffies <= (last_act + mod * arp_interval +
 * arp_interval/2) . the arp_interval/2 is needed for really fast networks.
 */
static bool bond_time_in_interval(struct bonding *bond, unsigned long last_act,
				  int mod)
{
	int delta_in_ticks = msecs_to_jiffies(bond->params.arp_interval);

	return time_in_range(jiffies,
			     last_act - delta_in_ticks,
			     last_act + mod * delta_in_ticks + delta_in_ticks/2);
}
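
/* Example: with arp_interval=1000 (ms) and HZ=1000, delta_in_ticks is 1000
 * jiffies, so bond_time_in_interval(bond, last_act, 1) accepts any jiffies
 * value in [last_act - 1000, last_act + 1500], and mod=2 widens the upper
 * bound to last_act + 2500.  The extra delta/2 of slack is what keeps very
 * fast networks from being declared down right at the interval boundary.
 */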

/* This function is called regularly to monitor each slave's link,
 * ensuring that traffic is being sent and received when arp monitoring
 * is used in load-balancing mode. if the adapter has been dormant, then an
 * arp is transmitted to generate traffic. see activebackup_arp_monitor for
 * arp monitoring in active backup mode.
 */
static void bond_loadbalance_arp_mon(struct bonding *bond)
{
	struct slave *slave, *oldcurrent;
	struct list_head *iter;
	int do_failover = 0, slave_state_changed = 0;

	if (!bond_has_slaves(bond))
		goto re_arm;

	rcu_read_lock();

	oldcurrent = rcu_dereference(bond->curr_active_slave);
	/* see if any of the previous devices are up now (i.e. they have
	 * xmt and rcv traffic). the curr_active_slave does not come into
	 * the picture unless it is null. also, slave->last_link_up is not
	 * needed here because we send an arp on each slave and give a slave
	 * as long as it needs to get the tx/rx within the delta.
	 * TODO: what about up/down delay in arp mode? it wasn't here before
	 *       so it can wait
	 */
	bond_for_each_slave_rcu(bond, slave, iter) {
		unsigned long trans_start = dev_trans_start(slave->dev);

		if (slave->link != BOND_LINK_UP) {
			if (bond_time_in_interval(bond, trans_start, 1) &&
			    bond_time_in_interval(bond, slave->last_rx, 1)) {

				slave->link  = BOND_LINK_UP;
				slave_state_changed = 1;

				/* primary_slave has no meaning in round-robin
				 * mode. the window of a slave being up and
				 * curr_active_slave being null after enslaving
				 * is closed.
				 */
				if (!oldcurrent) {
					netdev_info(bond->dev, "link status definitely up for interface %s\n",
						    slave->dev->name);
					do_failover = 1;
				} else {
					netdev_info(bond->dev, "interface %s is now up\n",
						    slave->dev->name);
				}
			}
		} else {
			/* slave->link == BOND_LINK_UP */

			/* not all switches will respond to an arp request
			 * when the source ip is 0, so don't take the link down
			 * if we don't know our ip yet
			 */
			if (!bond_time_in_interval(bond, trans_start, 2) ||
			    !bond_time_in_interval(bond, slave->last_rx, 2)) {

				slave->link  = BOND_LINK_DOWN;
				slave_state_changed = 1;

				if (slave->link_failure_count < UINT_MAX)
					slave->link_failure_count++;

				netdev_info(bond->dev, "interface %s is now down\n",
					    slave->dev->name);

				if (slave == oldcurrent)
					do_failover = 1;
			}
		}

		/* note: if switch is in round-robin mode, all links
		 * must tx arp to ensure all links rx an arp - otherwise
		 * links may oscillate or not come up at all; if switch is
		 * in something like xor mode, there is nothing we can
		 * do - all replies will be rx'ed on same link causing slaves
		 * to be unstable during low/no traffic periods
		 */
		if (bond_slave_is_up(slave))
			bond_arp_send_all(bond, slave);
	}

	rcu_read_unlock();

	if (do_failover || slave_state_changed) {
		if (!rtnl_trylock())
			goto re_arm;

		if (slave_state_changed) {
			bond_slave_state_change(bond);
			if (BOND_MODE(bond) == BOND_MODE_XOR)
				bond_update_slave_arr(bond, NULL);
		}
		if (do_failover) {
			block_netpoll_tx();
			bond_select_active_slave(bond);
			unblock_netpoll_tx();
		}
		rtnl_unlock();
	}

re_arm:
	if (bond->params.arp_interval)
		queue_delayed_work(bond->wq, &bond->arp_work,
				   msecs_to_jiffies(bond->params.arp_interval));
}
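
/* Unlike the active-backup monitor below, load-balancing ARP monitoring
 * does not validate ARP replies: a slave is considered up as long as it
 * has both transmitted and received something within the last interval
 * (and is only taken down after 2*delta without tx or rx), while every
 * slave that is up keeps sending ARPs to all arp_ip_target addresses to
 * generate that traffic.
 */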

/* Called to inspect slaves for active-backup mode ARP monitor link state
 * changes.  Sets new_link in slaves to specify what action should take
 * place for the slave.  Returns 0 if no changes are found, >0 if changes
 * to link states must be committed.
 *
 * Called with rcu_read_lock held.
 */
static int bond_ab_arp_inspect(struct bonding *bond)
{
	unsigned long trans_start, last_rx;
	struct list_head *iter;
	struct slave *slave;
	int commit = 0;

	bond_for_each_slave_rcu(bond, slave, iter) {
		slave->new_link = BOND_LINK_NOCHANGE;
		last_rx = slave_last_rx(bond, slave);

		if (slave->link != BOND_LINK_UP) {
			if (bond_time_in_interval(bond, last_rx, 1)) {
				slave->new_link = BOND_LINK_UP;
				commit++;
			}
			continue;
		}

		/* Give slaves 2*delta after being enslaved or made
		 * active.  This avoids bouncing, as the last receive
		 * times need a full ARP monitor cycle to be updated.
		 */
		if (bond_time_in_interval(bond, slave->last_link_up, 2))
			continue;

		/* Backup slave is down if:
		 * - No current_arp_slave AND
		 * - more than 3*delta since last receive AND
		 * - the bond has an IP address
		 *
		 * Note: a non-null current_arp_slave indicates
		 * the curr_active_slave went down and we are
		 * searching for a new one; under this condition
		 * we only take the curr_active_slave down - this
		 * gives each slave a chance to tx/rx traffic
		 * before being taken out
		 */
		if (!bond_is_active_slave(slave) &&
		    !rcu_access_pointer(bond->current_arp_slave) &&
		    !bond_time_in_interval(bond, last_rx, 3)) {
			slave->new_link = BOND_LINK_DOWN;
			commit++;
		}

		/* Active slave is down if:
		 * - more than 2*delta since transmitting OR
		 * - (more than 2*delta since receive AND
		 *    the bond has an IP address)
		 */
		trans_start = dev_trans_start(slave->dev);
		if (bond_is_active_slave(slave) &&
		    (!bond_time_in_interval(bond, trans_start, 2) ||
		     !bond_time_in_interval(bond, last_rx, 2))) {
			slave->new_link = BOND_LINK_DOWN;
			commit++;
		}
	}

	return commit;
}

/* Called to commit link state changes noted by inspection step of
 * active-backup mode ARP monitor.
 *
 * Called with RTNL held.
 */
static void bond_ab_arp_commit(struct bonding *bond)
{
	unsigned long trans_start;
	struct list_head *iter;
	struct slave *slave;

	bond_for_each_slave(bond, slave, iter) {
		switch (slave->new_link) {
		case BOND_LINK_NOCHANGE:
			continue;

		case BOND_LINK_UP:
			trans_start = dev_trans_start(slave->dev);
			if (rtnl_dereference(bond->curr_active_slave) != slave ||
			    (!rtnl_dereference(bond->curr_active_slave) &&
			     bond_time_in_interval(bond, trans_start, 1))) {
				struct slave *current_arp_slave;

				current_arp_slave = rtnl_dereference(bond->current_arp_slave);
				bond_set_slave_link_state(slave, BOND_LINK_UP,
							  BOND_SLAVE_NOTIFY_NOW);
				if (current_arp_slave) {
					bond_set_slave_inactive_flags(
						current_arp_slave,
						BOND_SLAVE_NOTIFY_NOW);
					RCU_INIT_POINTER(bond->current_arp_slave, NULL);
				}

				netdev_info(bond->dev, "link status definitely up for interface %s\n",
					    slave->dev->name);

				if (!rtnl_dereference(bond->curr_active_slave) ||
				    slave == rtnl_dereference(bond->primary_slave))
					goto do_failover;

			}

			continue;

		case BOND_LINK_DOWN:
			if (slave->link_failure_count < UINT_MAX)
				slave->link_failure_count++;

			bond_set_slave_link_state(slave, BOND_LINK_DOWN,
						  BOND_SLAVE_NOTIFY_NOW);
			bond_set_slave_inactive_flags(slave,
						      BOND_SLAVE_NOTIFY_NOW);

			netdev_info(bond->dev, "link status definitely down for interface %s, disabling it\n",
				    slave->dev->name);

			if (slave == rtnl_dereference(bond->curr_active_slave)) {
				RCU_INIT_POINTER(bond->current_arp_slave, NULL);
				goto do_failover;
			}

			continue;

		default:
			netdev_err(bond->dev, "impossible: new_link %d on slave %s\n",
				   slave->new_link, slave->dev->name);
			continue;
		}

do_failover:
		block_netpoll_tx();
		bond_select_active_slave(bond);
		unblock_netpoll_tx();
	}

	bond_set_carrier(bond);
}

/* Send ARP probes for active-backup mode ARP monitor.
 *
 * Called with rcu_read_lock held.
 */
static bool bond_ab_arp_probe(struct bonding *bond)
{
	struct slave *slave, *before = NULL, *new_slave = NULL,
		     *curr_arp_slave = rcu_dereference(bond->current_arp_slave),
		     *curr_active_slave = rcu_dereference(bond->curr_active_slave);
	struct list_head *iter;
	bool found = false;
	bool should_notify_rtnl = BOND_SLAVE_NOTIFY_LATER;

	if (curr_arp_slave && curr_active_slave)
		netdev_info(bond->dev, "PROBE: c_arp %s && cas %s BAD\n",
			    curr_arp_slave->dev->name,
			    curr_active_slave->dev->name);

	if (curr_active_slave) {
		bond_arp_send_all(bond, curr_active_slave);
		return should_notify_rtnl;
	}

	/* if we don't have a curr_active_slave, search for the next available
	 * backup slave from the current_arp_slave and make it the candidate
	 * for becoming the curr_active_slave
	 */

	if (!curr_arp_slave) {
		curr_arp_slave = bond_first_slave_rcu(bond);
		if (!curr_arp_slave)
			return should_notify_rtnl;
	}

	bond_set_slave_inactive_flags(curr_arp_slave, BOND_SLAVE_NOTIFY_LATER);

	bond_for_each_slave_rcu(bond, slave, iter) {
		if (!found && !before && bond_slave_is_up(slave))
			before = slave;

		if (found && !new_slave && bond_slave_is_up(slave))
			new_slave = slave;
		/* if the link state is up at this point, we
		 * mark it down - this can happen if we have
		 * simultaneous link failures and
		 * reselect_active_interface doesn't make this
		 * one the current slave so it is still marked
		 * up when it is actually down
		 */
		if (!bond_slave_is_up(slave) && slave->link == BOND_LINK_UP) {
			bond_set_slave_link_state(slave, BOND_LINK_DOWN,
						  BOND_SLAVE_NOTIFY_LATER);
			if (slave->link_failure_count < UINT_MAX)
				slave->link_failure_count++;

			bond_set_slave_inactive_flags(slave,
						      BOND_SLAVE_NOTIFY_LATER);

			netdev_info(bond->dev, "backup interface %s is now down\n",
				    slave->dev->name);
		}
		if (slave == curr_arp_slave)
			found = true;
	}

	if (!new_slave && before)
		new_slave = before;

	if (!new_slave)
		goto check_state;

	bond_set_slave_link_state(new_slave, BOND_LINK_BACK,
				  BOND_SLAVE_NOTIFY_LATER);
	bond_set_slave_active_flags(new_slave, BOND_SLAVE_NOTIFY_LATER);
	bond_arp_send_all(bond, new_slave);
	new_slave->last_link_up = jiffies;
	rcu_assign_pointer(bond->current_arp_slave, new_slave);

check_state:
	bond_for_each_slave_rcu(bond, slave, iter) {
		if (slave->should_notify || slave->should_notify_link) {
			should_notify_rtnl = BOND_SLAVE_NOTIFY_NOW;
			break;
		}
	}
	return should_notify_rtnl;
}
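
/* In short: while an active slave exists, the probe only refreshes it with
 * ARPs.  Otherwise the next usable slave after the previous
 * current_arp_slave (wrapping to the start of the list) is put into
 * BOND_LINK_BACK and probed as the new candidate, and any state or link
 * changes queued with NOTIFY_LATER are reported via the return value so
 * the caller can send the notifications under RTNL.
 */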

static void bond_activebackup_arp_mon(struct bonding *bond)
{
	bool should_notify_peers = false;
	bool should_notify_rtnl = false;
	int delta_in_ticks;

	delta_in_ticks = msecs_to_jiffies(bond->params.arp_interval);

	if (!bond_has_slaves(bond))
		goto re_arm;

	rcu_read_lock();

	should_notify_peers = bond_should_notify_peers(bond);

	if (bond_ab_arp_inspect(bond)) {
		rcu_read_unlock();

		/* Race avoidance with bond_close flush of workqueue */
		if (!rtnl_trylock()) {
			delta_in_ticks = 1;
			should_notify_peers = false;
			goto re_arm;
		}

		bond_ab_arp_commit(bond);

		rtnl_unlock();
		rcu_read_lock();
	}

	should_notify_rtnl = bond_ab_arp_probe(bond);
	rcu_read_unlock();

re_arm:
	if (bond->params.arp_interval)
		queue_delayed_work(bond->wq, &bond->arp_work, delta_in_ticks);

	if (should_notify_peers || should_notify_rtnl) {
		if (!rtnl_trylock())
			return;

		if (should_notify_peers)
			call_netdevice_notifiers(NETDEV_NOTIFY_PEERS,
						 bond->dev);
		if (should_notify_rtnl) {
			bond_slave_state_notify(bond);
			bond_slave_link_notify(bond);
		}

		rtnl_unlock();
	}
}
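
/* Like bond_mii_monitor(), this runs as inspect (under RCU), commit (under
 * RTNL, taken with rtnl_trylock() so it cannot deadlock with bond_close()
 * cancelling the work), then probe.  A failed trylock simply re-arms the
 * work one jiffy later, and the final notifications are likewise sent only
 * if RTNL can be taken without blocking.
 */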

static void bond_arp_monitor(struct work_struct *work)
{
	struct bonding *bond = container_of(work, struct bonding,
					    arp_work.work);

	if (BOND_MODE(bond) == BOND_MODE_ACTIVEBACKUP)
		bond_activebackup_arp_mon(bond);
	else
		bond_loadbalance_arp_mon(bond);
}

/*-------------------------- netdev event handling --------------------------*/

/* Change device name */
static int bond_event_changename(struct bonding *bond)
{
	bond_remove_proc_entry(bond);
	bond_create_proc_entry(bond);

	bond_debug_reregister(bond);

	return NOTIFY_DONE;
}

static int bond_master_netdev_event(unsigned long event,
				    struct net_device *bond_dev)
{
	struct bonding *event_bond = netdev_priv(bond_dev);

	switch (event) {
	case NETDEV_CHANGENAME:
		return bond_event_changename(event_bond);
	case NETDEV_UNREGISTER:
		bond_remove_proc_entry(event_bond);
		break;
	case NETDEV_REGISTER:
		bond_create_proc_entry(event_bond);
		break;
	case NETDEV_NOTIFY_PEERS:
		if (event_bond->send_peer_notif)
			event_bond->send_peer_notif--;
		break;
	default:
		break;
	}

	return NOTIFY_DONE;
}

static int bond_slave_netdev_event(unsigned long event,
				   struct net_device *slave_dev)
{
	struct slave *slave = bond_slave_get_rtnl(slave_dev), *primary;
	struct bonding *bond;
	struct net_device *bond_dev;

	/* A netdev event can be generated while enslaving a device
	 * before netdev_rx_handler_register is called in which case
	 * slave will be NULL
	 */
	if (!slave)
		return NOTIFY_DONE;
	bond_dev = slave->bond->dev;
	bond = slave->bond;
	primary = rtnl_dereference(bond->primary_slave);

	switch (event) {
	case NETDEV_UNREGISTER:
		if (bond_dev->type != ARPHRD_ETHER)
			bond_release_and_destroy(bond_dev, slave_dev);
		else
			bond_release(bond_dev, slave_dev);
		break;
	case NETDEV_UP:
	case NETDEV_CHANGE:
		bond_update_speed_duplex(slave);
		if (BOND_MODE(bond) == BOND_MODE_8023AD)
			bond_3ad_adapter_speed_duplex_changed(slave);
		/* Fallthrough */
	case NETDEV_DOWN:
		/* Refresh slave-array if applicable!
		 * If the setup does not use miimon or arpmon (mode-specific!),
		 * then these events will not cause the slave-array to be
		 * refreshed. This will cause xmit to use a slave that is not
		 * usable. Avoid such a situation by refreshing the array at
		 * these events. If these (miimon/arpmon) parameters are
		 * configured then the array gets refreshed twice and that
		 * should be fine!
		 */
		if (bond_mode_uses_xmit_hash(bond))
			bond_update_slave_arr(bond, NULL);
		break;
	case NETDEV_CHANGEMTU:
		/* TODO: Should slaves be allowed to
		 * independently alter their MTU?  For
		 * an active-backup bond, slaves need
		 * not be the same type of device, so
		 * MTUs may vary.  For other modes,
		 * slaves arguably should have the
		 * same MTUs. To do this, we'd need to
		 * take over the slave's change_mtu
		 * function for the duration of their
		 * servitude.
		 */
		break;
	case NETDEV_CHANGENAME:
		/* we don't care if we don't have primary set */
		if (!bond_uses_primary(bond) ||
		    !bond->params.primary[0])
			break;

		if (slave == primary) {
			/* slave's name changed - it's no longer primary */
			RCU_INIT_POINTER(bond->primary_slave, NULL);
		} else if (!strcmp(slave_dev->name, bond->params.primary)) {
			/* we have a new primary slave */
			rcu_assign_pointer(bond->primary_slave, slave);
		} else { /* we didn't change primary - exit */
			break;
		}

		netdev_info(bond->dev, "Primary slave changed to %s, reselecting active slave\n",
			    primary ? slave_dev->name : "none");

		block_netpoll_tx();
		bond_select_active_slave(bond);
		unblock_netpoll_tx();
		break;
	case NETDEV_FEAT_CHANGE:
		bond_compute_features(bond);
		break;
	case NETDEV_RESEND_IGMP:
		/* Propagate to master device */
		call_netdevice_notifiers(event, slave->bond->dev);
		break;
	default:
		break;
	}

	return NOTIFY_DONE;
}

/* bond_netdev_event: handle netdev notifier chain events.
 *
 * This function receives events for the netdev chain.  The caller (an
 * ioctl handler calling blocking_notifier_call_chain) holds the necessary
 * locks for us to safely manipulate the slave devices (RTNL lock,
 * dev_probe_lock).
 */
static int bond_netdev_event(struct notifier_block *this,
			     unsigned long event, void *ptr)
{
	struct net_device *event_dev = netdev_notifier_info_to_dev(ptr);

	netdev_dbg(event_dev, "event: %lx\n", event);

	if (!(event_dev->priv_flags & IFF_BONDING))
		return NOTIFY_DONE;

	if (event_dev->flags & IFF_MASTER) {
		netdev_dbg(event_dev, "IFF_MASTER\n");
		return bond_master_netdev_event(event, event_dev);
	}

	if (event_dev->flags & IFF_SLAVE) {
		netdev_dbg(event_dev, "IFF_SLAVE\n");
		return bond_slave_netdev_event(event, event_dev);
	}

	return NOTIFY_DONE;
}

static struct notifier_block bond_netdev_notifier = {
	.notifier_call = bond_netdev_event,
};

/*---------------------------- Hashing Policies -----------------------------*/

/* L2 hash helper */
static inline u32 bond_eth_hash(struct sk_buff *skb)
{
	struct ethhdr *ep, hdr_tmp;

	ep = skb_header_pointer(skb, 0, sizeof(hdr_tmp), &hdr_tmp);
	if (ep)
		return ep->h_dest[5] ^ ep->h_source[5] ^ ep->h_proto;
	return 0;
}
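
/* The L2 policy folds the low byte of each MAC address and the EtherType
 * into a single value, so with xmit_policy=layer2 all traffic between a
 * given pair of hosts maps to the same slave regardless of protocol or
 * port.
 */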

3162 3163 3164
/* Extract the appropriate headers based on bond's xmit policy */
static bool bond_flow_dissect(struct bonding *bond, struct sk_buff *skb,
			      struct flow_keys *fk)
3165
{
3166
	const struct ipv6hdr *iph6;
3167
	const struct iphdr *iph;
3168
	int noff, proto = -1;
3169

3170
	if (bond->params.xmit_policy > BOND_XMIT_POLICY_LAYER23)
3171
		return skb_flow_dissect_flow_keys(skb, fk, 0);

	fk->ports.ports = 0;
	noff = skb_network_offset(skb);
	if (skb->protocol == htons(ETH_P_IP)) {
		if (unlikely(!pskb_may_pull(skb, noff + sizeof(*iph))))
			return false;
		iph = ip_hdr(skb);
		iph_to_flow_copy_v4addrs(fk, iph);
		noff += iph->ihl << 2;
		if (!ip_is_fragment(iph))
			proto = iph->protocol;
	} else if (skb->protocol == htons(ETH_P_IPV6)) {
		if (unlikely(!pskb_may_pull(skb, noff + sizeof(*iph6))))
			return false;
		iph6 = ipv6_hdr(skb);
		iph_to_flow_copy_v6addrs(fk, iph6);
		noff += sizeof(*iph6);
		proto = iph6->nexthdr;
	} else {
		return false;
	}
	if (bond->params.xmit_policy == BOND_XMIT_POLICY_LAYER34 && proto >= 0)
		fk->ports.ports = skb_flow_get_ports(skb, noff, proto);

	return true;
}

/**
 * bond_xmit_hash - generate a hash value based on the xmit policy
 * @bond: bonding device
 * @skb: buffer to use for headers
 *
 * This function will extract the necessary headers from the skb buffer and use
 * them to generate a hash based on the xmit_policy set in the bonding device
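 *
 * Illustrative example: with the layer3+4 policy the 32-bit src/dst port
 * pair seeds the hash, the IPv4/IPv6 source and destination addresses are
 * XORed in, and the result is folded with hash ^= hash >> 16 and
 * hash ^= hash >> 8 so high-order entropy also reaches the low bits that
 * callers reduce with a modulo.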
 */
u32 bond_xmit_hash(struct bonding *bond, struct sk_buff *skb)
{
	struct flow_keys flow;
	u32 hash;

	if (bond->params.xmit_policy == BOND_XMIT_POLICY_ENCAP34 &&
	    skb->l4_hash)
		return skb->hash;

	if (bond->params.xmit_policy == BOND_XMIT_POLICY_LAYER2 ||
	    !bond_flow_dissect(bond, skb, &flow))
		return bond_eth_hash(skb);

	if (bond->params.xmit_policy == BOND_XMIT_POLICY_LAYER23 ||
	    bond->params.xmit_policy == BOND_XMIT_POLICY_ENCAP23)
		hash = bond_eth_hash(skb);
	else
		hash = (__force u32)flow.ports.ports;
	hash ^= (__force u32)flow_get_u32_dst(&flow) ^
		(__force u32)flow_get_u32_src(&flow);
	hash ^= (hash >> 16);
	hash ^= (hash >> 8);

	return hash;
}

/*-------------------------- Device entry points ----------------------------*/

static void bond_work_init_all(struct bonding *bond)
{
	INIT_DELAYED_WORK(&bond->mcast_work,
			  bond_resend_igmp_join_requests_delayed);
	INIT_DELAYED_WORK(&bond->alb_work, bond_alb_monitor);
	INIT_DELAYED_WORK(&bond->mii_work, bond_mii_monitor);
	INIT_DELAYED_WORK(&bond->arp_work, bond_arp_monitor);
	INIT_DELAYED_WORK(&bond->ad_work, bond_3ad_state_machine_handler);
	INIT_DELAYED_WORK(&bond->slave_arr_work, bond_slave_arr_handler);
}

static void bond_work_cancel_all(struct bonding *bond)
{
	cancel_delayed_work_sync(&bond->mii_work);
	cancel_delayed_work_sync(&bond->arp_work);
	cancel_delayed_work_sync(&bond->alb_work);
	cancel_delayed_work_sync(&bond->ad_work);
	cancel_delayed_work_sync(&bond->mcast_work);
	cancel_delayed_work_sync(&bond->slave_arr_work);
}

static int bond_open(struct net_device *bond_dev)
{
	struct bonding *bond = netdev_priv(bond_dev);
	struct list_head *iter;
	struct slave *slave;

	/* reset slave->backup and slave->inactive */
	if (bond_has_slaves(bond)) {
		bond_for_each_slave(bond, slave, iter) {
			if (bond_uses_primary(bond) &&
			    slave != rcu_access_pointer(bond->curr_active_slave)) {
				bond_set_slave_inactive_flags(slave,
							      BOND_SLAVE_NOTIFY_NOW);
			} else if (BOND_MODE(bond) != BOND_MODE_8023AD) {
				bond_set_slave_active_flags(slave,
							    BOND_SLAVE_NOTIFY_NOW);
			}
		}
	}

	if (bond_is_lb(bond)) {
		/* bond_alb_initialize must be called before the timer
		 * is started.
		 */
		if (bond_alb_initialize(bond, (BOND_MODE(bond) == BOND_MODE_ALB)))
			return -ENOMEM;
		if (bond->params.tlb_dynamic_lb)
			queue_delayed_work(bond->wq, &bond->alb_work, 0);
	}

	if (bond->params.miimon)  /* link check interval, in milliseconds. */
		queue_delayed_work(bond->wq, &bond->mii_work, 0);

	if (bond->params.arp_interval) {  /* arp interval, in milliseconds. */
		queue_delayed_work(bond->wq, &bond->arp_work, 0);
		bond->recv_probe = bond_arp_rcv;
	}

	if (BOND_MODE(bond) == BOND_MODE_8023AD) {
		queue_delayed_work(bond->wq, &bond->ad_work, 0);
		/* register to receive LACPDUs */
		bond->recv_probe = bond_3ad_lacpdu_recv;
		bond_3ad_initiate_agg_selection(bond, 1);
	}

	if (bond_mode_uses_xmit_hash(bond))
		bond_update_slave_arr(bond, NULL);

	return 0;
}

static int bond_close(struct net_device *bond_dev)
{
	struct bonding *bond = netdev_priv(bond_dev);

	bond_work_cancel_all(bond);
	bond->send_peer_notif = 0;
	if (bond_is_lb(bond))
		bond_alb_deinitialize(bond);
	bond->recv_probe = NULL;

	return 0;
}

/* Fold stats, assuming all rtnl_link_stats64 fields are u64, while
 * allowing for the fact that some drivers only provide 32bit values.
 */
static void bond_fold_stats(struct rtnl_link_stats64 *_res,
			    const struct rtnl_link_stats64 *_new,
			    const struct rtnl_link_stats64 *_old)
{
	const u64 *new = (const u64 *)_new;
	const u64 *old = (const u64 *)_old;
	u64 *res = (u64 *)_res;
	int i;

	for (i = 0; i < sizeof(*_res) / sizeof(u64); i++) {
		u64 nv = new[i];
		u64 ov = old[i];
		s64 delta = nv - ov;

		/* detects if this particular field is 32bit only */
		if (((nv | ov) >> 32) == 0)
			delta = (s64)(s32)((u32)nv - (u32)ov);

		/* filter anomalies, some drivers reset their stats
		 * at down/up events.
		 */
		if (delta > 0)
			res[i] += delta;
	}
}
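
/* Example of the 32-bit handling above: if a driver exposes a 32-bit
 * counter that wraps from 0xfffffff0 to 0x00000010, both snapshots fit in
 * 32 bits, so the delta is computed as (s64)(s32)(0x10 - 0xfffffff0) = 0x20
 * instead of a huge negative value, and the wrap is absorbed.
 */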

static void bond_get_stats(struct net_device *bond_dev,
			   struct rtnl_link_stats64 *stats)
{
	struct bonding *bond = netdev_priv(bond_dev);
	struct rtnl_link_stats64 temp;
	struct list_head *iter;
	struct slave *slave;

	spin_lock(&bond->stats_lock);
	memcpy(stats, &bond->bond_stats, sizeof(*stats));

	rcu_read_lock();
	bond_for_each_slave_rcu(bond, slave, iter) {
		const struct rtnl_link_stats64 *new =
			dev_get_stats(slave->dev, &temp);

		bond_fold_stats(stats, new, &slave->slave_stats);

		/* save off the slave stats for the next run */
		memcpy(&slave->slave_stats, new, sizeof(*new));
	}
	rcu_read_unlock();

	memcpy(&bond->bond_stats, stats, sizeof(*stats));
	spin_unlock(&bond->stats_lock);
}

static int bond_do_ioctl(struct net_device *bond_dev, struct ifreq *ifr, int cmd)
{
	struct bonding *bond = netdev_priv(bond_dev);
	struct net_device *slave_dev = NULL;
	struct ifbond k_binfo;
	struct ifbond __user *u_binfo = NULL;
	struct ifslave k_sinfo;
	struct ifslave __user *u_sinfo = NULL;
	struct mii_ioctl_data *mii = NULL;
	struct bond_opt_value newval;
	struct net *net;
	int res = 0;

	netdev_dbg(bond_dev, "bond_ioctl: cmd=%d\n", cmd);

	switch (cmd) {
	case SIOCGMIIPHY:
		mii = if_mii(ifr);
		if (!mii)
			return -EINVAL;

		mii->phy_id = 0;
		/* Fall Through */
	case SIOCGMIIREG:
		/* We do this again just in case we were called by SIOCGMIIREG
		 * instead of SIOCGMIIPHY.
		 */
		mii = if_mii(ifr);
		if (!mii)
			return -EINVAL;

		if (mii->reg_num == 1) {
			mii->val_out = 0;
			if (netif_carrier_ok(bond->dev))
				mii->val_out = BMSR_LSTATUS;
		}

		return 0;
	case BOND_INFO_QUERY_OLD:
	case SIOCBONDINFOQUERY:
		u_binfo = (struct ifbond __user *)ifr->ifr_data;

		if (copy_from_user(&k_binfo, u_binfo, sizeof(ifbond)))
			return -EFAULT;

		bond_info_query(bond_dev, &k_binfo);
		if (copy_to_user(u_binfo, &k_binfo, sizeof(ifbond)))
			return -EFAULT;

		return 0;
	case BOND_SLAVE_INFO_QUERY_OLD:
	case SIOCBONDSLAVEINFOQUERY:
		u_sinfo = (struct ifslave __user *)ifr->ifr_data;

		if (copy_from_user(&k_sinfo, u_sinfo, sizeof(ifslave)))
			return -EFAULT;

		res = bond_slave_info_query(bond_dev, &k_sinfo);
		if (res == 0 &&
		    copy_to_user(u_sinfo, &k_sinfo, sizeof(ifslave)))
			return -EFAULT;

		return res;
	default:
		break;
	}

	net = dev_net(bond_dev);

	if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
		return -EPERM;

	slave_dev = __dev_get_by_name(net, ifr->ifr_slave);

	netdev_dbg(bond_dev, "slave_dev=%p:\n", slave_dev);

	if (!slave_dev)
		return -ENODEV;

	netdev_dbg(bond_dev, "slave_dev->name=%s:\n", slave_dev->name);
	switch (cmd) {
	case BOND_ENSLAVE_OLD:
	case SIOCBONDENSLAVE:
		res = bond_enslave(bond_dev, slave_dev);
		break;
	case BOND_RELEASE_OLD:
	case SIOCBONDRELEASE:
		res = bond_release(bond_dev, slave_dev);
		break;
	case BOND_SETHWADDR_OLD:
	case SIOCBONDSETHWADDR:
		bond_set_dev_addr(bond_dev, slave_dev);
		res = 0;
		break;
	case BOND_CHANGE_ACTIVE_OLD:
	case SIOCBONDCHANGEACTIVE:
		bond_opt_initstr(&newval, slave_dev->name);
		res = __bond_opt_set(bond, BOND_OPT_ACTIVE_SLAVE, &newval);
		break;
	default:
		res = -EOPNOTSUPP;
	}

	return res;
}

static void bond_change_rx_flags(struct net_device *bond_dev, int change)
{
	struct bonding *bond = netdev_priv(bond_dev);

	if (change & IFF_PROMISC)
		bond_set_promiscuity(bond,
				     bond_dev->flags & IFF_PROMISC ? 1 : -1);

	if (change & IFF_ALLMULTI)
		bond_set_allmulti(bond,
				  bond_dev->flags & IFF_ALLMULTI ? 1 : -1);
}

static void bond_set_rx_mode(struct net_device *bond_dev)
{
	struct bonding *bond = netdev_priv(bond_dev);
	struct list_head *iter;
	struct slave *slave;

	rcu_read_lock();
	if (bond_uses_primary(bond)) {
		slave = rcu_dereference(bond->curr_active_slave);
		if (slave) {
			dev_uc_sync(slave->dev, bond_dev);
			dev_mc_sync(slave->dev, bond_dev);
		}
	} else {
		bond_for_each_slave_rcu(bond, slave, iter) {
			dev_uc_sync_multiple(slave->dev, bond_dev);
			dev_mc_sync_multiple(slave->dev, bond_dev);
		}
	}
	rcu_read_unlock();
}

static int bond_neigh_init(struct neighbour *n)
{
	struct bonding *bond = netdev_priv(n->dev);
	const struct net_device_ops *slave_ops;
	struct neigh_parms parms;
	struct slave *slave;
	int ret;

	slave = bond_first_slave(bond);
	if (!slave)
		return 0;
	slave_ops = slave->dev->netdev_ops;
	if (!slave_ops->ndo_neigh_setup)
		return 0;

	parms.neigh_setup = NULL;
	parms.neigh_cleanup = NULL;
	ret = slave_ops->ndo_neigh_setup(slave->dev, &parms);
	if (ret)
		return ret;

	/* Assign slave's neigh_cleanup to neighbour in case cleanup is called
	 * after the last slave has been detached.  Assumes that all slaves
	 * utilize the same neigh_cleanup (true at this writing as only user
	 * is ipoib).
	 */
	n->parms->neigh_cleanup = parms.neigh_cleanup;

	if (!parms.neigh_setup)
		return 0;

	return parms.neigh_setup(n);
}

/* The bonding ndo_neigh_setup is called at init time before any
 * slave exists. So we must declare a proxy setup function which will
 * be used at run time to resolve the actual slave neigh param setup.
 *
 * It's also called by master devices (such as vlans) to setup their
 * underlying devices. In that case - do nothing, we're already set up from
 * our init.
 */
static int bond_neigh_setup(struct net_device *dev,
			    struct neigh_parms *parms)
{
	/* modify only our neigh_parms */
	if (parms->dev == dev)
		parms->neigh_setup = bond_neigh_init;

	return 0;
}

/* Change the MTU of all of a master's slaves to match the master */
static int bond_change_mtu(struct net_device *bond_dev, int new_mtu)
{
	struct bonding *bond = netdev_priv(bond_dev);
	struct slave *slave, *rollback_slave;
	struct list_head *iter;
	int res = 0;

	netdev_dbg(bond_dev, "bond=%p, new_mtu=%d\n", bond, new_mtu);

	bond_for_each_slave(bond, slave, iter) {
		netdev_dbg(bond_dev, "s %p c_m %p\n",
			   slave, slave->dev->netdev_ops->ndo_change_mtu);

		res = dev_set_mtu(slave->dev, new_mtu);

		if (res) {
			/* If we failed to set the slave's mtu to the new value
			 * we must abort the operation even in ACTIVE_BACKUP
			 * mode, because if we allow the backup slaves to have
			 * different mtu values than the active slave we'll
			 * need to change their mtu when doing a failover. That
			 * means changing their mtu from timer context, which
			 * is probably not a good idea.
			 */
			netdev_dbg(bond_dev, "err %d %s\n", res,
				   slave->dev->name);
			goto unwind;
		}
	}

	bond_dev->mtu = new_mtu;

	return 0;

unwind:
	/* unwind from head to the slave that failed */
	bond_for_each_slave(bond, rollback_slave, iter) {
		int tmp_res;

		if (rollback_slave == slave)
			break;

		tmp_res = dev_set_mtu(rollback_slave->dev, bond_dev->mtu);
		if (tmp_res) {
			netdev_dbg(bond_dev, "unwind err %d dev %s\n",
				   tmp_res, rollback_slave->dev->name);
		}
	}

	return res;
}

/* Change HW address
 *
 * Note that many devices must be down to change the HW address, and
 * downing the master releases all slaves.  We can make bonds full of
 * bonding devices to test this, however.
 */
static int bond_set_mac_address(struct net_device *bond_dev, void *addr)
{
	struct bonding *bond = netdev_priv(bond_dev);
	struct slave *slave, *rollback_slave;
	struct sockaddr *sa = addr, tmp_sa;
	struct list_head *iter;
	int res = 0;

	if (BOND_MODE(bond) == BOND_MODE_ALB)
		return bond_alb_set_mac_address(bond_dev, addr);


	netdev_dbg(bond_dev, "bond=%p\n", bond);

	/* If fail_over_mac is enabled, do nothing and return success.
	 * Returning an error causes ifenslave to fail.
	 */
	if (bond->params.fail_over_mac &&
	    BOND_MODE(bond) == BOND_MODE_ACTIVEBACKUP)
		return 0;

	if (!is_valid_ether_addr(sa->sa_data))
		return -EADDRNOTAVAIL;

	bond_for_each_slave(bond, slave, iter) {
		netdev_dbg(bond_dev, "slave %p %s\n", slave, slave->dev->name);
		res = dev_set_mac_address(slave->dev, addr);
		if (res) {
			/* TODO: consider downing the slave
			 * and retry ?
			 * User should expect communications
			 * breakage anyway until ARP finish
			 * updating, so...
			 */
			netdev_dbg(bond_dev, "err %d %s\n", res, slave->dev->name);
			goto unwind;
		}
	}

	/* success */
	memcpy(bond_dev->dev_addr, sa->sa_data, bond_dev->addr_len);
	return 0;

unwind:
	memcpy(tmp_sa.sa_data, bond_dev->dev_addr, bond_dev->addr_len);
	tmp_sa.sa_family = bond_dev->type;

	/* unwind from head to the slave that failed */
	bond_for_each_slave(bond, rollback_slave, iter) {
		int tmp_res;

		if (rollback_slave == slave)
			break;

		tmp_res = dev_set_mac_address(rollback_slave->dev, &tmp_sa);
		if (tmp_res) {
			netdev_dbg(bond_dev, "unwind err %d dev %s\n",
				   tmp_res, rollback_slave->dev->name);
		}
	}

	return res;
}

/**
 * bond_xmit_slave_id - transmit skb through slave with slave_id
 * @bond: bonding device that is transmitting
 * @skb: buffer to transmit
 * @slave_id: slave id up to slave_cnt-1 through which to transmit
 *
 * This function tries to transmit through slave with slave_id but in case
 * it fails, it tries to find the first available slave for transmission.
 * The skb is consumed in all cases, thus the function is void.
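 *
 * For example, with five slaves and slave_id 3, transmission is attempted
 * on slaves 3 and 4 first, then wraps to slaves 0, 1 and 2, stopping at
 * the first slave that can currently transmit.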
 */
static void bond_xmit_slave_id(struct bonding *bond, struct sk_buff *skb, int slave_id)
{
	struct list_head *iter;
	struct slave *slave;
	int i = slave_id;

	/* Here we start from the slave with slave_id */
	bond_for_each_slave_rcu(bond, slave, iter) {
		if (--i < 0) {
			if (bond_slave_can_tx(slave)) {
				bond_dev_queue_xmit(bond, skb, slave->dev);
				return;
			}
		}
	}

	/* Here we start from the first slave up to slave_id */
	i = slave_id;
	bond_for_each_slave_rcu(bond, slave, iter) {
		if (--i < 0)
			break;
		if (bond_slave_can_tx(slave)) {
			bond_dev_queue_xmit(bond, skb, slave->dev);
			return;
		}
	}
	/* no slave that can tx has been found */
	bond_tx_drop(bond->dev, skb);
}

/**
 * bond_rr_gen_slave_id - generate slave id based on packets_per_slave
 * @bond: bonding device to use
 *
 * Based on the value of the bonding device's packets_per_slave parameter
 * this function generates a slave id, which is usually used as the next
 * slave to transmit through.
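 *
 * For example, packets_per_slave == 1 simply returns the running
 * rr_tx_counter, while packets_per_slave == 3 maps counter values 0-2 to
 * id 0, 3-5 to id 1 and so on via the reciprocal divide, and
 * packets_per_slave == 0 picks a pseudo-random id instead.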
 */
static u32 bond_rr_gen_slave_id(struct bonding *bond)
{
	u32 slave_id;
	struct reciprocal_value reciprocal_packets_per_slave;
	int packets_per_slave = bond->params.packets_per_slave;

	switch (packets_per_slave) {
	case 0:
		slave_id = prandom_u32();
		break;
	case 1:
		slave_id = bond->rr_tx_counter;
		break;
	default:
		reciprocal_packets_per_slave =
			bond->params.reciprocal_packets_per_slave;
		slave_id = reciprocal_divide(bond->rr_tx_counter,
					     reciprocal_packets_per_slave);
		break;
	}
	bond->rr_tx_counter++;

	return slave_id;
}

static int bond_xmit_roundrobin(struct sk_buff *skb, struct net_device *bond_dev)
{
	struct bonding *bond = netdev_priv(bond_dev);
	struct iphdr *iph = ip_hdr(skb);
	struct slave *slave;
	u32 slave_id;

	/* Start with the curr_active_slave that joined the bond as the
	 * default for sending IGMP traffic.  For failover purposes one
	 * needs to maintain some consistency for the interface that will
	 * send the join/membership reports.  The curr_active_slave found
	 * will send all of this type of traffic.
	 */
	if (iph->protocol == IPPROTO_IGMP && skb->protocol == htons(ETH_P_IP)) {
		slave = rcu_dereference(bond->curr_active_slave);
		if (slave)
			bond_dev_queue_xmit(bond, skb, slave->dev);
		else
			bond_xmit_slave_id(bond, skb, 0);
	} else {
		int slave_cnt = ACCESS_ONCE(bond->slave_cnt);

		if (likely(slave_cnt)) {
			slave_id = bond_rr_gen_slave_id(bond);
			bond_xmit_slave_id(bond, skb, slave_id % slave_cnt);
		} else {
			bond_tx_drop(bond_dev, skb);
		}
	}

	return NETDEV_TX_OK;
}

/* In active-backup mode, we know that bond->curr_active_slave is always valid if
 * the bond has a usable interface.
 */
static int bond_xmit_activebackup(struct sk_buff *skb, struct net_device *bond_dev)
{
	struct bonding *bond = netdev_priv(bond_dev);
	struct slave *slave;

	slave = rcu_dereference(bond->curr_active_slave);
	if (slave)
		bond_dev_queue_xmit(bond, skb, slave->dev);
	else
		bond_tx_drop(bond_dev, skb);

	return NETDEV_TX_OK;
}

/* Use this to update the slave array when (a) it's not appropriate to
 * update it right away (note that bond_update_slave_arr() may sleep)
 * and/or (b) RTNL is not held.
 */
void bond_slave_arr_work_rearm(struct bonding *bond, unsigned long delay)
{
	queue_delayed_work(bond->wq, &bond->slave_arr_work, delay);
}

/* Slave array work handler. Holds only RTNL */
static void bond_slave_arr_handler(struct work_struct *work)
{
	struct bonding *bond = container_of(work, struct bonding,
					    slave_arr_work.work);
	int ret;

	if (!rtnl_trylock())
		goto err;

	ret = bond_update_slave_arr(bond, NULL);
	rtnl_unlock();
	if (ret) {
		pr_warn_ratelimited("Failed to update slave array from WT\n");
		goto err;
	}
	return;

err:
	bond_slave_arr_work_rearm(bond, 1);
}

/* Build the usable slaves array in control path for modes that use xmit-hash
 * to determine the slave interface -
 * (a) BOND_MODE_8023AD
 * (b) BOND_MODE_XOR
 * (c) BOND_MODE_TLB && tlb_dynamic_lb == 0
 *
 * The caller is expected to hold RTNL only and NO other lock!
 */
int bond_update_slave_arr(struct bonding *bond, struct slave *skipslave)
{
	struct slave *slave;
	struct list_head *iter;
	struct bond_up_slave *new_arr, *old_arr;
	int agg_id = 0;
	int ret = 0;

#ifdef CONFIG_LOCKDEP
	WARN_ON(lockdep_is_held(&bond->mode_lock));
#endif

	new_arr = kzalloc(offsetof(struct bond_up_slave, arr[bond->slave_cnt]),
			  GFP_KERNEL);
	if (!new_arr) {
		ret = -ENOMEM;
		pr_err("Failed to build slave-array.\n");
		goto out;
	}
	if (BOND_MODE(bond) == BOND_MODE_8023AD) {
		struct ad_info ad_info;

		if (bond_3ad_get_active_agg_info(bond, &ad_info)) {
			pr_debug("bond_3ad_get_active_agg_info failed\n");
			kfree_rcu(new_arr, rcu);
			/* No active aggregator means it's not safe to use
			 * the previous array.
			 */
			old_arr = rtnl_dereference(bond->slave_arr);
			if (old_arr) {
				RCU_INIT_POINTER(bond->slave_arr, NULL);
				kfree_rcu(old_arr, rcu);
			}
			goto out;
		}
		agg_id = ad_info.aggregator_id;
	}
	bond_for_each_slave(bond, slave, iter) {
		if (BOND_MODE(bond) == BOND_MODE_8023AD) {
			struct aggregator *agg;

			agg = SLAVE_AD_INFO(slave)->port.aggregator;
			if (!agg || agg->aggregator_identifier != agg_id)
				continue;
		}
		if (!bond_slave_can_tx(slave))
			continue;
		if (skipslave == slave)
			continue;
		new_arr->arr[new_arr->count++] = slave;
	}

	old_arr = rtnl_dereference(bond->slave_arr);
	rcu_assign_pointer(bond->slave_arr, new_arr);
	if (old_arr)
		kfree_rcu(old_arr, rcu);
out:
	if (ret != 0 && skipslave) {
		int idx;

		/* Rare situation where caller has asked to skip a specific
		 * slave but allocation failed (most likely!). BTW this is
		 * only possible when the call is initiated from
		 * __bond_release_one(). In this situation; overwrite the
		 * skipslave entry in the array with the last entry from the
		 * array to avoid a situation where the xmit path may choose
		 * this to-be-skipped slave to send a packet out.
		 */
		old_arr = rtnl_dereference(bond->slave_arr);
		for (idx = 0; idx < old_arr->count; idx++) {
			if (skipslave == old_arr->arr[idx]) {
				old_arr->arr[idx] =
				    old_arr->arr[old_arr->count-1];
				old_arr->count--;
				break;
			}
		}
	}
	return ret;
}

/* Use this Xmit function for 3AD as well as XOR modes. The current
 * usable slave array is formed in the control path. The xmit function
 * just calculates hash and sends the packet out.
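 * Slave choice is simply slave_arr[bond_xmit_hash(bond, skb) % count], so
 * the distribution follows the configured xmit_hash_policy.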
 */
static int bond_3ad_xor_xmit(struct sk_buff *skb, struct net_device *dev)
{
	struct bonding *bond = netdev_priv(dev);
	struct slave *slave;
	struct bond_up_slave *slaves;
	unsigned int count;

	slaves = rcu_dereference(bond->slave_arr);
	count = slaves ? ACCESS_ONCE(slaves->count) : 0;
	if (likely(count)) {
		slave = slaves->arr[bond_xmit_hash(bond, skb) % count];
		bond_dev_queue_xmit(bond, skb, slave->dev);
	} else {
		bond_tx_drop(dev, skb);
	}

	return NETDEV_TX_OK;
}

/* in broadcast mode, we send everything to all usable interfaces. */
static int bond_xmit_broadcast(struct sk_buff *skb, struct net_device *bond_dev)
{
	struct bonding *bond = netdev_priv(bond_dev);
	struct slave *slave = NULL;
	struct list_head *iter;

	bond_for_each_slave_rcu(bond, slave, iter) {
		if (bond_is_last_slave(bond, slave))
			break;
		if (bond_slave_is_up(slave) && slave->link == BOND_LINK_UP) {
			struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);

			if (!skb2) {
				net_err_ratelimited("%s: Error: %s: skb_clone() failed\n",
						    bond_dev->name, __func__);
				continue;
			}
			bond_dev_queue_xmit(bond, skb2, slave->dev);
		}
	}
	if (slave && bond_slave_is_up(slave) && slave->link == BOND_LINK_UP)
		bond_dev_queue_xmit(bond, skb, slave->dev);
	else
		bond_tx_drop(bond_dev, skb);

	return NETDEV_TX_OK;
}
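
/* Broadcast note: every usable slave except the last list entry gets a
 * clone of the skb, and the original is transmitted on the last slave if
 * it is usable (otherwise dropped), so the skb is always consumed exactly
 * once.
 */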

/*------------------------- Device initialization ---------------------------*/

/* Lookup the slave that corresponds to a qid */
static inline int bond_slave_override(struct bonding *bond,
				      struct sk_buff *skb)
{
	struct slave *slave = NULL;
	struct list_head *iter;

	if (!skb->queue_mapping)
		return 1;

	/* Find out if any slaves have the same mapping as this skb. */
	bond_for_each_slave_rcu(bond, slave, iter) {
		if (slave->queue_id == skb->queue_mapping) {
			if (bond_slave_is_up(slave) &&
			    slave->link == BOND_LINK_UP) {
				bond_dev_queue_xmit(bond, skb, slave->dev);
				return 0;
			}
			/* If the slave isn't UP, use default transmit policy. */
			break;
		}
	}

	return 1;
}
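
/* Note on the override above: queue_mapping 0 means "no explicit queue",
 * so the mode's normal transmit policy applies.  A non-zero mapping that
 * matches an up slave's queue_id pins the skb to that slave; if that slave
 * is down, the default transmit policy is used as a fallback.
 */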


static u16 bond_select_queue(struct net_device *dev, struct sk_buff *skb,
			     void *accel_priv, select_queue_fallback_t fallback)
{
	/* This helper function exists to help dev_pick_tx get the correct
	 * destination queue.  Using a helper function skips a call to
	 * skb_tx_hash and will put the skbs in the queue we expect on their
	 * way down to the bonding driver.
	 */
	u16 txq = skb_rx_queue_recorded(skb) ? skb_get_rx_queue(skb) : 0;

	/* Save the original txq to restore before passing to the driver */
	qdisc_skb_cb(skb)->slave_dev_queue_mapping = skb->queue_mapping;

	if (unlikely(txq >= dev->real_num_tx_queues)) {
		do {
			txq -= dev->real_num_tx_queues;
		} while (txq >= dev->real_num_tx_queues);
	}
	return txq;
}
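
/* Example of the wrap-around above: with 16 real tx queues, a recorded rx
 * queue of 18 selects tx queue 2, while queue ids below the real count are
 * used unchanged; the original mapping saved in qdisc_skb_cb() is restored
 * before the skb reaches the slave driver.
 */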

static netdev_tx_t __bond_start_xmit(struct sk_buff *skb, struct net_device *dev)
{
	struct bonding *bond = netdev_priv(dev);

	if (bond_should_override_tx_queue(bond) &&
	    !bond_slave_override(bond, skb))
		return NETDEV_TX_OK;

	switch (BOND_MODE(bond)) {
	case BOND_MODE_ROUNDROBIN:
		return bond_xmit_roundrobin(skb, dev);
	case BOND_MODE_ACTIVEBACKUP:
		return bond_xmit_activebackup(skb, dev);
	case BOND_MODE_8023AD:
	case BOND_MODE_XOR:
		return bond_3ad_xor_xmit(skb, dev);
	case BOND_MODE_BROADCAST:
		return bond_xmit_broadcast(skb, dev);
	case BOND_MODE_ALB:
		return bond_alb_xmit(skb, dev);
	case BOND_MODE_TLB:
		return bond_tlb_xmit(skb, dev);
	default:
		/* Should never happen, mode already checked */
		netdev_err(dev, "Unknown bonding mode %d\n", BOND_MODE(bond));
		WARN_ON_ONCE(1);
		bond_tx_drop(dev, skb);
		return NETDEV_TX_OK;
	}
}

static netdev_tx_t bond_start_xmit(struct sk_buff *skb, struct net_device *dev)
{
	struct bonding *bond = netdev_priv(dev);
	netdev_tx_t ret = NETDEV_TX_OK;

	/* If we risk deadlock from transmitting this in the
	 * netpoll path, tell netpoll to queue the frame for later tx
	 */
	if (unlikely(is_netpoll_tx_blocked(dev)))
		return NETDEV_TX_BUSY;

	rcu_read_lock();
	if (bond_has_slaves(bond))
		ret = __bond_start_xmit(skb, dev);
	else
		bond_tx_drop(dev, skb);
	rcu_read_unlock();

	return ret;
}

static int bond_ethtool_get_link_ksettings(struct net_device *bond_dev,
					   struct ethtool_link_ksettings *cmd)
{
	struct bonding *bond = netdev_priv(bond_dev);
	unsigned long speed = 0;
	struct list_head *iter;
	struct slave *slave;

	cmd->base.duplex = DUPLEX_UNKNOWN;
	cmd->base.port = PORT_OTHER;

	/* Since bond_slave_can_tx returns false for all inactive or down slaves, we
	 * do not need to check mode.  Though link speed might not represent
	 * the true receive or transmit bandwidth (not all modes are symmetric)
	 * this is an accurate maximum.
	 */
	bond_for_each_slave(bond, slave, iter) {
		if (bond_slave_can_tx(slave)) {
			if (slave->speed != SPEED_UNKNOWN)
				speed += slave->speed;
			if (cmd->base.duplex == DUPLEX_UNKNOWN &&
			    slave->duplex != DUPLEX_UNKNOWN)
				cmd->base.duplex = slave->duplex;
		}
	}
	cmd->base.speed = speed ? : SPEED_UNKNOWN;

	return 0;
}

static void bond_ethtool_get_drvinfo(struct net_device *bond_dev,
				     struct ethtool_drvinfo *drvinfo)
{
	strlcpy(drvinfo->driver, DRV_NAME, sizeof(drvinfo->driver));
	strlcpy(drvinfo->version, DRV_VERSION, sizeof(drvinfo->version));
	snprintf(drvinfo->fw_version, sizeof(drvinfo->fw_version), "%d",
		 BOND_ABI_VERSION);
}

static const struct ethtool_ops bond_ethtool_ops = {
	.get_drvinfo		= bond_ethtool_get_drvinfo,
	.get_link		= ethtool_op_get_link,
	.get_link_ksettings	= bond_ethtool_get_link_ksettings,
};

static const struct net_device_ops bond_netdev_ops = {
	.ndo_init		= bond_init,
	.ndo_uninit		= bond_uninit,
	.ndo_open		= bond_open,
	.ndo_stop		= bond_close,
	.ndo_start_xmit		= bond_start_xmit,
	.ndo_select_queue	= bond_select_queue,
	.ndo_get_stats64	= bond_get_stats,
	.ndo_do_ioctl		= bond_do_ioctl,
	.ndo_change_rx_flags	= bond_change_rx_flags,
	.ndo_set_rx_mode	= bond_set_rx_mode,
	.ndo_change_mtu		= bond_change_mtu,
	.ndo_set_mac_address	= bond_set_mac_address,
	.ndo_neigh_setup	= bond_neigh_setup,
	.ndo_vlan_rx_add_vid	= bond_vlan_rx_add_vid,
	.ndo_vlan_rx_kill_vid	= bond_vlan_rx_kill_vid,
#ifdef CONFIG_NET_POLL_CONTROLLER
	.ndo_netpoll_setup	= bond_netpoll_setup,
	.ndo_netpoll_cleanup	= bond_netpoll_cleanup,
	.ndo_poll_controller	= bond_poll_controller,
#endif
	.ndo_add_slave		= bond_enslave,
	.ndo_del_slave		= bond_release,
	.ndo_fix_features	= bond_fix_features,
	.ndo_bridge_setlink	= switchdev_port_bridge_setlink,
	.ndo_bridge_getlink	= switchdev_port_bridge_getlink,
	.ndo_bridge_dellink	= switchdev_port_bridge_dellink,
	.ndo_fdb_add		= switchdev_port_fdb_add,
	.ndo_fdb_del		= switchdev_port_fdb_del,
	.ndo_fdb_dump		= switchdev_port_fdb_dump,
	.ndo_features_check	= passthru_features_check,
};

static const struct device_type bond_type = {
	.name = "bond",
};

static void bond_destructor(struct net_device *bond_dev)
{
	struct bonding *bond = netdev_priv(bond_dev);
	if (bond->wq)
		destroy_workqueue(bond->wq);
	free_netdev(bond_dev);
}

void bond_setup(struct net_device *bond_dev)
{
	struct bonding *bond = netdev_priv(bond_dev);

	spin_lock_init(&bond->mode_lock);
	spin_lock_init(&bond->stats_lock);
	bond->params = bonding_defaults;

	/* Initialize pointers */
	bond->dev = bond_dev;

	/* Initialize the device entry points */
	ether_setup(bond_dev);
	bond_dev->max_mtu = ETH_MAX_MTU;
	bond_dev->netdev_ops = &bond_netdev_ops;
	bond_dev->ethtool_ops = &bond_ethtool_ops;

	bond_dev->destructor = bond_destructor;

	SET_NETDEV_DEVTYPE(bond_dev, &bond_type);

	/* Initialize the device options */
	bond_dev->flags |= IFF_MASTER;
	bond_dev->priv_flags |= IFF_BONDING | IFF_UNICAST_FLT | IFF_NO_QUEUE;
	bond_dev->priv_flags &= ~(IFF_XMIT_DST_RELEASE | IFF_TX_SKB_SHARING);

	/* don't acquire bond device's netif_tx_lock when transmitting */
	bond_dev->features |= NETIF_F_LLTX;

	/* By default, we declare the bond to be fully
	 * VLAN hardware accelerated capable. Special
	 * care is taken in the various xmit functions
	 * when there are slaves that are not hw accel
	 * capable
	 */

4215 4216 4217
	/* Don't allow bond devices to change network namespaces. */
	bond_dev->features |= NETIF_F_NETNS_LOCAL;

4218
	bond_dev->hw_features = BOND_VLAN_FEATURES |
4219 4220 4221
				NETIF_F_HW_VLAN_CTAG_TX |
				NETIF_F_HW_VLAN_CTAG_RX |
				NETIF_F_HW_VLAN_CTAG_FILTER;
4222

E
4224
	bond_dev->features |= bond_dev->hw_features;
L

4227 4228 4229
/* Destroy a bonding device.
 * Must be under rtnl_lock when this function is called.
 */
4230
static void bond_uninit(struct net_device *bond_dev)
J
4232
	struct bonding *bond = netdev_priv(bond_dev);
4233 4234
	struct list_head *iter;
	struct slave *slave;
4235
	struct bond_up_slave *arr;
J
4237 4238
	bond_netpoll_cleanup(bond_dev);

4239
	/* Release the bonded slaves */
4240
	bond_for_each_slave(bond, slave, iter)
4241
		__bond_release_one(bond_dev, slave->dev, true);
4242
	netdev_info(bond_dev, "Released all slaves\n");
4243

4244 4245 4246 4247 4248 4249
	arr = rtnl_dereference(bond->slave_arr);
	if (arr) {
		RCU_INIT_POINTER(bond->slave_arr, NULL);
		kfree_rcu(arr, rcu);
	}

J

4252
	bond_debug_unregister(bond);
J

L

static int bond_check_params(struct bond_params *params)
{
4259
	int arp_validate_value, fail_over_mac_value, primary_reselect_value, i;
4260 4261
	struct bond_opt_value newval;
	const struct bond_opt_value *valptr;
4262
	int arp_all_targets_value;
4263
	u16 ad_actor_sys_prio = 0;
4264
	u16 ad_user_port_key = 0;
4265 4266 4267 4268 4269
	__be32 arp_target[BOND_MAX_ARP_TARGETS];
	int arp_ip_count;
	int bond_mode	= BOND_MODE_ROUNDROBIN;
	int xmit_hashtype = BOND_XMIT_POLICY_LAYER2;
	int lacp_fast = 0;
4270
	int tlb_dynamic_lb = 0;
4271

4272
	/* Convert string parameters. */
L
4274 4275 4276 4277
		bond_opt_initstr(&newval, mode);
		valptr = bond_opt_parse(bond_opt_get(BOND_OPT_MODE), &newval);
		if (!valptr) {
			pr_err("Error: Invalid bonding mode \"%s\"\n", mode);
L
		}
4280
		bond_mode = valptr->value;
L

4283 4284
	if (xmit_hash_policy) {
		if ((bond_mode != BOND_MODE_XOR) &&
4285 4286
		    (bond_mode != BOND_MODE_8023AD) &&
		    (bond_mode != BOND_MODE_TLB)) {
J
J
4289
		} else {
4290 4291 4292 4293
			bond_opt_initstr(&newval, xmit_hash_policy);
			valptr = bond_opt_parse(bond_opt_get(BOND_OPT_XMIT_HASH),
						&newval);
			if (!valptr) {
J
4295 4296 4297
				       xmit_hash_policy);
				return -EINVAL;
			}
4298
			xmit_hashtype = valptr->value;
4299 4300 4301
		}
	}

L
		if (bond_mode != BOND_MODE_8023AD) {
J
				bond_mode_name(bond_mode));
L
4307 4308 4309 4310
			bond_opt_initstr(&newval, lacp_rate);
			valptr = bond_opt_parse(bond_opt_get(BOND_OPT_LACP_RATE),
						&newval);
			if (!valptr) {
J
4312
				       lacp_rate);
L
			}
4315
			lacp_fast = valptr->value;
L
	}

4319
	if (ad_select) {
4320
		bond_opt_initstr(&newval, ad_select);
4321 4322 4323 4324
		valptr = bond_opt_parse(bond_opt_get(BOND_OPT_AD_SELECT),
					&newval);
		if (!valptr) {
			pr_err("Error: Invalid ad_select \"%s\"\n", ad_select);
4325 4326
			return -EINVAL;
		}
4327 4328
		params->ad_select = valptr->value;
		if (bond_mode != BOND_MODE_8023AD)
4329
			pr_warn("ad_select param only affects 802.3ad mode\n");
4330 4331 4332 4333
	} else {
		params->ad_select = BOND_AD_STABLE;
	}

4334
	if (max_bonds < 0) {
4335 4336
		pr_warn("Warning: max_bonds (%d) not in range %d-%d, so it was reset to BOND_DEFAULT_MAX_BONDS (%d)\n",
			max_bonds, 0, INT_MAX, BOND_DEFAULT_MAX_BONDS);
L
	}

	if (miimon < 0) {
4341 4342
		pr_warn("Warning: miimon module parameter (%d), not in range 0-%d, so it was reset to 0\n",
			miimon, INT_MAX);
4343
		miimon = 0;
L

	if (updelay < 0) {
4347 4348
		pr_warn("Warning: updelay module parameter (%d), not in range 0-%d, so it was reset to 0\n",
			updelay, INT_MAX);
L
	}

	if (downdelay < 0) {
4353 4354
		pr_warn("Warning: downdelay module parameter (%d), not in range 0-%d, so it was reset to 0\n",
			downdelay, INT_MAX);
L
	}

	if ((use_carrier != 0) && (use_carrier != 1)) {
4359 4360
		pr_warn("Warning: use_carrier module parameter (%d), not of valid value (0/1), so it was set to 1\n",
			use_carrier);
L
	}

4364
	if (num_peer_notif < 0 || num_peer_notif > 255) {
4365 4366
		pr_warn("Warning: num_grat_arp/num_unsol_na (%d) not in range 0-255 so it was reset to 1\n",
			num_peer_notif);
4367 4368 4369
		num_peer_notif = 1;
	}

4370
	/* reset values for 802.3ad/TLB/ALB */
4371
	if (!bond_mode_uses_arp(bond_mode)) {
L
4373 4374
			pr_warn("Warning: miimon must be specified, otherwise bonding will not detect link failure, speed and duplex which are essential for 802.3ad operation\n");
			pr_warn("Forcing miimon to 100msec\n");
4375
			miimon = BOND_DEFAULT_MIIMON;
L
	}

4379
	if (tx_queues < 1 || tx_queues > 255) {
4380 4381
		pr_warn("Warning: tx_queues (%d) should be between 1 and 255, resetting to %d\n",
			tx_queues, BOND_DEFAULT_TX_QUEUES);
4382 4383 4384
		tx_queues = BOND_DEFAULT_TX_QUEUES;
	}

4385
	if ((all_slaves_active != 0) && (all_slaves_active != 1)) {
4386 4387
		pr_warn("Warning: all_slaves_active module parameter (%d), not of valid value (0/1), so it was set to 0\n",
			all_slaves_active);
4388 4389 4390
		all_slaves_active = 0;
	}

4391
	if (resend_igmp < 0 || resend_igmp > 255) {
4392 4393
		pr_warn("Warning: resend_igmp (%d) should be between 0 and 255, resetting to %d\n",
			resend_igmp, BOND_DEFAULT_RESEND_IGMP);
4394 4395 4396
		resend_igmp = BOND_DEFAULT_RESEND_IGMP;
	}

4397 4398
	bond_opt_initval(&newval, packets_per_slave);
	if (!bond_opt_parse(bond_opt_get(BOND_OPT_PACKETS_PER_SLAVE), &newval)) {
4399 4400 4401 4402 4403
		pr_warn("Warning: packets_per_slave (%d) should be between 0 and %u resetting to 1\n",
			packets_per_slave, USHRT_MAX);
		packets_per_slave = 1;
	}

L
J
			  updelay);
L

	if (!miimon) {
		if (updelay || downdelay) {
			/* just warn the user the up/down delay will have
			 * no effect since miimon is zero...
			 */
4414 4415
			pr_warn("Warning: miimon module parameter not set and updelay (%d) or downdelay (%d) module parameter is set; updelay and downdelay have no effect unless miimon is set\n",
				updelay, downdelay);
L
	} else {
		/* don't allow arp monitoring */
		if (arp_interval) {
4420 4421
			pr_warn("Warning: miimon (%d) and arp_interval (%d) can't be used simultaneously, disabling ARP monitoring\n",
				miimon, arp_interval);
L
		}

		if ((updelay % miimon) != 0) {
4426 4427
			pr_warn("Warning: updelay (%d) is not a multiple of miimon (%d), updelay rounded to %d ms\n",
				updelay, miimon, (updelay / miimon) * miimon);
L

		updelay /= miimon;

		if ((downdelay % miimon) != 0) {
4433 4434 4435
			pr_warn("Warning: downdelay (%d) is not a multiple of miimon (%d), downdelay rounded to %d ms\n",
				downdelay, miimon,
				(downdelay / miimon) * miimon);
L

		downdelay /= miimon;
	}

	if (arp_interval < 0) {
4442 4443
		pr_warn("Warning: arp_interval module parameter (%d), not in range 0-%d, so it was reset to 0\n",
			arp_interval, INT_MAX);
4444
		arp_interval = 0;
L

4447 4448
	for (arp_ip_count = 0, i = 0;
	     (arp_ip_count < BOND_MAX_ARP_TARGETS) && arp_ip_target[i]; i++) {
4449
		__be32 ip;
4450 4451

		/* not a complete check, but good enough to catch mistakes */
4452
		if (!in4_pton(arp_ip_target[i], -1, (u8 *)&ip, -1, NULL) ||
4453
		    !bond_is_ip_target_ok(ip)) {
4454 4455
			pr_warn("Warning: bad arp_ip_target module parameter (%s), ARP monitoring will not be performed\n",
				arp_ip_target[i]);
L
		} else {
4458 4459 4460
			if (bond_get_targets_ip(arp_target, ip) == -1)
				arp_target[arp_ip_count++] = ip;
			else
4461 4462
				pr_warn("Warning: duplicate address %pI4 in arp_ip_target, skipping\n",
					&ip);
L
	}

	if (arp_interval && !arp_ip_count) {
		/* don't allow arping if no arp_ip_target given... */
4468 4469
		pr_warn("Warning: arp_interval module parameter (%d) specified without providing an arp_ip_target parameter, arp_interval was reset to 0\n",
			arp_interval);
L
	}

4473 4474
	if (arp_validate) {
		if (!arp_interval) {
J
4476 4477 4478
			return -EINVAL;
		}

4479 4480 4481 4482
		bond_opt_initstr(&newval, arp_validate);
		valptr = bond_opt_parse(bond_opt_get(BOND_OPT_ARP_VALIDATE),
					&newval);
		if (!valptr) {
J
4484
			       arp_validate);
4485 4486
			return -EINVAL;
		}
4487 4488
		arp_validate_value = valptr->value;
	} else {
4489
		arp_validate_value = 0;
4490
	}
4491

4492 4493
	arp_all_targets_value = 0;
	if (arp_all_targets) {
4494 4495 4496 4497
		bond_opt_initstr(&newval, arp_all_targets);
		valptr = bond_opt_parse(bond_opt_get(BOND_OPT_ARP_ALL_TARGETS),
					&newval);
		if (!valptr) {
4498 4499 4500
			pr_err("Error: invalid arp_all_targets_value \"%s\"\n",
			       arp_all_targets);
			arp_all_targets_value = 0;
4501 4502
		} else {
			arp_all_targets_value = valptr->value;
4503 4504 4505
		}
	}

L
J
L
4509 4510
		valptr = bond_opt_get_val(BOND_OPT_ARP_VALIDATE,
					  arp_validate_value);
J
4512
			arp_interval, valptr->string, arp_ip_count);
L
		for (i = 0; i < arp_ip_count; i++)
J
L
J
L
4519
	} else if (max_bonds) {
L
		 * work as expected, see bonding.txt for details
		 */
J
L

4526
	if (primary && !bond_mode_uses_primary(bond_mode)) {
L
		 * in active backup, TLB or ALB modes
		 */
4530 4531
		pr_warn("Warning: %s primary device specified but has no effect in %s mode\n",
			primary, bond_mode_name(bond_mode));
L
	}

4535
	if (primary && primary_reselect) {
4536 4537 4538 4539
		bond_opt_initstr(&newval, primary_reselect);
		valptr = bond_opt_parse(bond_opt_get(BOND_OPT_PRIMARY_RESELECT),
					&newval);
		if (!valptr) {
J
4541
			       primary_reselect);
4542 4543
			return -EINVAL;
		}
4544
		primary_reselect_value = valptr->value;
4545 4546 4547 4548
	} else {
		primary_reselect_value = BOND_PRI_RESELECT_ALWAYS;
	}

4549
	if (fail_over_mac) {
4550 4551 4552 4553
		bond_opt_initstr(&newval, fail_over_mac);
		valptr = bond_opt_parse(bond_opt_get(BOND_OPT_FAIL_OVER_MAC),
					&newval);
		if (!valptr) {
J
4555
			       fail_over_mac);
4556 4557
			return -EINVAL;
		}
4558
		fail_over_mac_value = valptr->value;
4559
		if (bond_mode != BOND_MODE_ACTIVEBACKUP)
4560
			pr_warn("Warning: fail_over_mac only affects active-backup mode\n");
4561 4562 4563
	} else {
		fail_over_mac_value = BOND_FOM_NONE;
	}
4564

4565 4566 4567 4568 4569 4570 4571 4572 4573 4574
	bond_opt_initstr(&newval, "default");
	valptr = bond_opt_parse(
			bond_opt_get(BOND_OPT_AD_ACTOR_SYS_PRIO),
				     &newval);
	if (!valptr) {
		pr_err("Error: No ad_actor_sys_prio default value");
		return -EINVAL;
	}
	ad_actor_sys_prio = valptr->value;

4575 4576 4577 4578 4579 4580 4581 4582
	valptr = bond_opt_parse(bond_opt_get(BOND_OPT_AD_USER_PORT_KEY),
				&newval);
	if (!valptr) {
		pr_err("Error: No ad_user_port_key default value");
		return -EINVAL;
	}
	ad_user_port_key = valptr->value;

4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593
	if (bond_mode == BOND_MODE_TLB) {
		bond_opt_initstr(&newval, "default");
		valptr = bond_opt_parse(bond_opt_get(BOND_OPT_TLB_DYNAMIC_LB),
					&newval);
		if (!valptr) {
			pr_err("Error: No tlb_dynamic_lb default value");
			return -EINVAL;
		}
		tlb_dynamic_lb = valptr->value;
	}

4594
	if (lp_interval == 0) {
4595 4596
		pr_warn("Warning: ip_interval must be between 1 and %d, so it was reset to %d\n",
			INT_MAX, BOND_ALB_DEFAULT_LP_INTERVAL);
4597 4598 4599
		lp_interval = BOND_ALB_DEFAULT_LP_INTERVAL;
	}

L
	params->mode = bond_mode;
4602
	params->xmit_policy = xmit_hashtype;
L
4604
	params->num_peer_notif = num_peer_notif;
L
4606
	params->arp_validate = arp_validate_value;
4607
	params->arp_all_targets = arp_all_targets_value;
L
	params->downdelay = downdelay;
	params->use_carrier = use_carrier;
	params->lacp_fast = lacp_fast;
	params->primary[0] = 0;
4613
	params->primary_reselect = primary_reselect_value;
4614
	params->fail_over_mac = fail_over_mac_value;
4615
	params->tx_queues = tx_queues;
4616
	params->all_slaves_active = all_slaves_active;
4617
	params->resend_igmp = resend_igmp;
4618
	params->min_links = min_links;
4619
	params->lp_interval = lp_interval;
4620
	params->packets_per_slave = packets_per_slave;
4621
	params->tlb_dynamic_lb = tlb_dynamic_lb;
4622
	params->ad_actor_sys_prio = ad_actor_sys_prio;
4623
	eth_zero_addr(params->ad_actor_system);
4624
	params->ad_user_port_key = ad_user_port_key;
4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635
	if (packets_per_slave > 0) {
		params->reciprocal_packets_per_slave =
			reciprocal_value(packets_per_slave);
	} else {
		/* reciprocal_packets_per_slave is unused if
		 * packets_per_slave is 0 or 1, just initialize it
		 */
		params->reciprocal_packets_per_slave =
			(struct reciprocal_value) { 0 };
	}

	if (primary) {
		strncpy(params->primary, primary, IFNAMSIZ);
		params->primary[IFNAMSIZ - 1] = 0;
	}

	memcpy(params->arp_targets, arp_target, sizeof(arp_target));

	return 0;
}

/* Called from registration process */
static int bond_init(struct net_device *bond_dev)
{
	struct bonding *bond = netdev_priv(bond_dev);
	struct bond_net *bn = net_generic(dev_net(bond_dev), bond_net_id);

	netdev_dbg(bond_dev, "Begin bond_init\n");

	bond->wq = alloc_ordered_workqueue(bond_dev->name, WQ_MEM_RECLAIM);
	if (!bond->wq)
		return -ENOMEM;

	netdev_lockdep_set_classes(bond_dev);

	list_add_tail(&bond->bond_list, &bn->dev_list);

	bond_prepare_sysfs_group(bond);

	bond_debug_register(bond);

	/* Ensure valid dev_addr */
	if (is_zero_ether_addr(bond_dev->dev_addr) &&
	    bond_dev->addr_assign_type == NET_ADDR_PERM)
		eth_hw_addr_random(bond_dev);

	return 0;
}

unsigned int bond_get_num_tx_queues(void)
{
	return tx_queues;
}

/* Create a new bond based on the specified name and bonding parameters.
 * If name is NULL, obtain a suitable "bond%d" name for us.
 * Caller must NOT hold rtnl_lock; we need to release it here before we
 * set up our sysfs entries.
 */
int bond_create(struct net *net, const char *name)
{
	struct net_device *bond_dev;
	struct bonding *bond;
	struct alb_bond_info *bond_info;
	int res;

	rtnl_lock();

	bond_dev = alloc_netdev_mq(sizeof(struct bonding),
				   name ? name : "bond%d", NET_NAME_UNKNOWN,
				   bond_setup, tx_queues);
	if (!bond_dev) {
		pr_err("%s: eek! can't alloc netdev!\n", name);
		rtnl_unlock();
		return -ENOMEM;
	}

	/*
	 * Initialize rx_hashtbl_used_head to RLB_NULL_INDEX.
	 * It is set to 0 by default which is wrong.
	 */
	bond = netdev_priv(bond_dev);
	bond_info = &(BOND_ALB_INFO(bond));
	bond_info->rx_hashtbl_used_head = RLB_NULL_INDEX;

	dev_net_set(bond_dev, net);
	bond_dev->rtnl_link_ops = &bond_link_ops;

	res = register_netdevice(bond_dev);

	netif_carrier_off(bond_dev);

	bond_work_init_all(bond);

	rtnl_unlock();
	if (res < 0)
		bond_destructor(bond_dev);
	return res;
}

static int __net_init bond_net_init(struct net *net)
{
	struct bond_net *bn = net_generic(net, bond_net_id);

	bn->net = net;
	INIT_LIST_HEAD(&bn->dev_list);

	bond_create_proc_dir(bn);
	bond_create_sysfs(bn);

	return 0;
}

static void __net_exit bond_net_exit(struct net *net)
{
	struct bond_net *bn = net_generic(net, bond_net_id);
	struct bonding *bond, *tmp_bond;
	LIST_HEAD(list);

	bond_destroy_sysfs(bn);

	/* Kill off any bonds created after unregistering bond rtnl ops */
	rtnl_lock();
	list_for_each_entry_safe(bond, tmp_bond, &bn->dev_list, bond_list)
		unregister_netdevice_queue(bond->dev, &list);
	unregister_netdevice_many(&list);
	rtnl_unlock();

	bond_destroy_proc_dir(bn);
}

static struct pernet_operations bond_net_ops = {
	.init = bond_net_init,
	.exit = bond_net_exit,
	.id   = &bond_net_id,
	.size = sizeof(struct bond_net),
};

static int __init bonding_init(void)
{
	int i;
	int res;

	pr_info("%s", bond_version);

	res = bond_check_params(&bonding_defaults);
	if (res)
		goto out;

	res = register_pernet_subsys(&bond_net_ops);
	if (res)
		goto out;

	res = bond_netlink_init();
	if (res)
		goto err_link;

	bond_create_debugfs();

	for (i = 0; i < max_bonds; i++) {
		res = bond_create(&init_net, NULL);
		if (res)
			goto err;
	}

	register_netdevice_notifier(&bond_netdev_notifier);
out:
	return res;
err:
	bond_destroy_debugfs();
	bond_netlink_fini();
err_link:
	unregister_pernet_subsys(&bond_net_ops);
	goto out;

}

static void __exit bonding_exit(void)
{
	unregister_netdevice_notifier(&bond_netdev_notifier);

	bond_destroy_debugfs();

	bond_netlink_fini();
	unregister_pernet_subsys(&bond_net_ops);

#ifdef CONFIG_NET_POLL_CONTROLLER
	/* Make sure we don't have an imbalance on our netpoll blocking */
	WARN_ON(atomic_read(&netpoll_block_tx));
#endif
}

module_init(bonding_init);
module_exit(bonding_exit);
MODULE_LICENSE("GPL");
MODULE_VERSION(DRV_VERSION);
MODULE_DESCRIPTION(DRV_DESCRIPTION ", v" DRV_VERSION);
MODULE_AUTHOR("Thomas Davis, tadavis@lbl.gov and many others");