bond_main.c 164.2 KB
Newer Older
L
Linus Torvalds 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41
/*
 * originally based on the dummy device.
 *
 * Copyright 1999, Thomas Davis, tadavis@lbl.gov.
 * Licensed under the GPL. Based on dummy.c, and eql.c devices.
 *
 * bonding.c: an Ethernet Bonding driver
 *
 * This is useful to talk to a Cisco EtherChannel compatible equipment:
 *	Cisco 5500
 *	Sun Trunking (Solaris)
 *	Alteon AceDirector Trunks
 *	Linux Bonding
 *	and probably many L2 switches ...
 *
 * How it works:
 *    ifconfig bond0 ipaddress netmask up
 *      will setup a network device, with an ip address.  No mac address
 *	will be assigned at this time.  The hw mac address will come from
 *	the first slave bonded to the channel.  All slaves will then use
 *	this hw mac address.
 *
 *    ifconfig bond0 down
 *         will release all slaves, marking them as down.
 *
 *    ifenslave bond0 eth0
 *	will attach eth0 to bond0 as a slave.  eth0 hw mac address will either
 *	a: be used as initial mac address
 *	b: if a hw mac address already is there, eth0's hw mac address
 *	   will then be set from bond0.
 *
 */

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/interrupt.h>
#include <linux/ptrace.h>
#include <linux/ioport.h>
#include <linux/in.h>
42
#include <net/ip.h>
L
Linus Torvalds 已提交
43
#include <linux/ip.h>
M
Matteo Croce 已提交
44 45
#include <linux/icmp.h>
#include <linux/icmpv6.h>
46 47
#include <linux/tcp.h>
#include <linux/udp.h>
L
Linus Torvalds 已提交
48 49 50 51 52 53 54 55
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/init.h>
#include <linux/timer.h>
#include <linux/socket.h>
#include <linux/ctype.h>
#include <linux/inet.h>
#include <linux/bitops.h>
S
Stephen Hemminger 已提交
56
#include <linux/io.h>
L
Linus Torvalds 已提交
57
#include <asm/dma.h>
S
Stephen Hemminger 已提交
58
#include <linux/uaccess.h>
L
Linus Torvalds 已提交
59 60 61
#include <linux/errno.h>
#include <linux/netdevice.h>
#include <linux/inetdevice.h>
62
#include <linux/igmp.h>
L
Linus Torvalds 已提交
63 64 65 66 67 68 69 70 71 72 73
#include <linux/etherdevice.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <linux/rtnetlink.h>
#include <linux/smp.h>
#include <linux/if_ether.h>
#include <net/arp.h>
#include <linux/mii.h>
#include <linux/ethtool.h>
#include <linux/if_vlan.h>
#include <linux/if_bonding.h>
D
David Sterba 已提交
74
#include <linux/jiffies.h>
75
#include <linux/preempt.h>
J
Jay Vosburgh 已提交
76
#include <net/route.h>
77
#include <net/net_namespace.h>
78
#include <net/netns/generic.h>
79
#include <net/pkt_sched.h>
80
#include <linux/rculist.h>
81
#include <net/flow_dissector.h>
82
#include <net/xfrm.h>
83 84 85
#include <net/bonding.h>
#include <net/bond_3ad.h>
#include <net/bond_alb.h>
86 87 88
#if IS_ENABLED(CONFIG_TLS_DEVICE)
#include <net/tls.h>
#endif
L
Linus Torvalds 已提交
89

90 91
#include "bonding_priv.h"

L
Linus Torvalds 已提交
92 93 94 95 96
/*---------------------------- Module parameters ----------------------------*/

/* monitor all links that often (in milliseconds). <=0 disables monitoring */

static int max_bonds	= BOND_DEFAULT_MAX_BONDS;
97
static int tx_queues	= BOND_DEFAULT_TX_QUEUES;
98
static int num_peer_notif = 1;
99
static int miimon;
S
Stephen Hemminger 已提交
100 101
static int updelay;
static int downdelay;
L
Linus Torvalds 已提交
102
static int use_carrier	= 1;
S
Stephen Hemminger 已提交
103 104
static char *mode;
static char *primary;
105
static char *primary_reselect;
S
Stephen Hemminger 已提交
106
static char *lacp_rate;
107
static int min_links;
S
Stephen Hemminger 已提交
108 109
static char *ad_select;
static char *xmit_hash_policy;
110
static int arp_interval;
S
Stephen Hemminger 已提交
111 112
static char *arp_ip_target[BOND_MAX_ARP_TARGETS];
static char *arp_validate;
113
static char *arp_all_targets;
S
Stephen Hemminger 已提交
114
static char *fail_over_mac;
115
static int all_slaves_active;
116
static struct bond_params bonding_defaults;
117
static int resend_igmp = BOND_DEFAULT_RESEND_IGMP;
118
static int packets_per_slave = 1;
119
static int lp_interval = BOND_ALB_DEFAULT_LP_INTERVAL;
L
Linus Torvalds 已提交
120 121 122

module_param(max_bonds, int, 0);
MODULE_PARM_DESC(max_bonds, "Max number of bonded devices");
123 124
module_param(tx_queues, int, 0);
MODULE_PARM_DESC(tx_queues, "Max number of transmit queues (default = 16)");
125
module_param_named(num_grat_arp, num_peer_notif, int, 0644);
126 127
MODULE_PARM_DESC(num_grat_arp, "Number of peer notifications to send on "
			       "failover event (alias of num_unsol_na)");
128
module_param_named(num_unsol_na, num_peer_notif, int, 0644);
129 130
MODULE_PARM_DESC(num_unsol_na, "Number of peer notifications to send on "
			       "failover event (alias of num_grat_arp)");
L
Linus Torvalds 已提交
131 132 133 134 135
module_param(miimon, int, 0);
MODULE_PARM_DESC(miimon, "Link check interval in milliseconds");
module_param(updelay, int, 0);
MODULE_PARM_DESC(updelay, "Delay before considering link up, in milliseconds");
module_param(downdelay, int, 0);
136 137
MODULE_PARM_DESC(downdelay, "Delay before considering link down, "
			    "in milliseconds");
L
Linus Torvalds 已提交
138
module_param(use_carrier, int, 0);
139
MODULE_PARM_DESC(use_carrier, "Use netif_carrier_ok (vs MII ioctls) in miimon; "
140
			      "0 for off, 1 for on (default)");
L
Linus Torvalds 已提交
141
module_param(mode, charp, 0);
142
MODULE_PARM_DESC(mode, "Mode of operation; 0 for balance-rr, "
143 144 145
		       "1 for active-backup, 2 for balance-xor, "
		       "3 for broadcast, 4 for 802.3ad, 5 for balance-tlb, "
		       "6 for balance-alb");
L
Linus Torvalds 已提交
146 147
module_param(primary, charp, 0);
MODULE_PARM_DESC(primary, "Primary network device to use");
148 149 150 151 152 153 154 155
module_param(primary_reselect, charp, 0);
MODULE_PARM_DESC(primary_reselect, "Reselect primary slave "
				   "once it comes up; "
				   "0 for always (default), "
				   "1 for only if speed of primary is "
				   "better, "
				   "2 for only on active slave "
				   "failure");
L
Linus Torvalds 已提交
156
module_param(lacp_rate, charp, 0);
157 158
MODULE_PARM_DESC(lacp_rate, "LACPDU tx rate to request from 802.3ad partner; "
			    "0 for slow, 1 for fast");
159
module_param(ad_select, charp, 0);
Z
Zhu Yanjun 已提交
160
MODULE_PARM_DESC(ad_select, "802.3ad aggregation selection logic; "
161 162
			    "0 for stable (default), 1 for bandwidth, "
			    "2 for count");
163 164 165
module_param(min_links, int, 0);
MODULE_PARM_DESC(min_links, "Minimum number of available links before turning on carrier");

166
module_param(xmit_hash_policy, charp, 0);
167
MODULE_PARM_DESC(xmit_hash_policy, "balance-alb, balance-tlb, balance-xor, 802.3ad hashing method; "
168
				   "0 for layer 2 (default), 1 for layer 3+4, "
169
				   "2 for layer 2+3, 3 for encap layer 2+3, "
170
				   "4 for encap layer 3+4, 5 for vlan+srcmac");
L
Linus Torvalds 已提交
171 172 173 174
module_param(arp_interval, int, 0);
MODULE_PARM_DESC(arp_interval, "arp interval in milliseconds");
module_param_array(arp_ip_target, charp, NULL, 0);
MODULE_PARM_DESC(arp_ip_target, "arp targets in n.n.n.n form");
175
module_param(arp_validate, charp, 0);
176 177 178
MODULE_PARM_DESC(arp_validate, "validate src/dst of ARP probes; "
			       "0 for none (default), 1 for active, "
			       "2 for backup, 3 for all");
179 180
module_param(arp_all_targets, charp, 0);
MODULE_PARM_DESC(arp_all_targets, "fail on any/all arp targets timeout; 0 for any (default), 1 for all");
181
module_param(fail_over_mac, charp, 0);
182 183 184
MODULE_PARM_DESC(fail_over_mac, "For active-backup, do not set all slaves to "
				"the same MAC; 0 for none (default), "
				"1 for active, 2 for follow");
185
module_param(all_slaves_active, int, 0);
186
MODULE_PARM_DESC(all_slaves_active, "Keep all frames received on an interface "
187
				     "by setting active flag for all slaves; "
188
				     "0 for never (default), 1 for always.");
189
module_param(resend_igmp, int, 0);
190 191
MODULE_PARM_DESC(resend_igmp, "Number of IGMP membership reports to send on "
			      "link failure");
192 193 194 195
module_param(packets_per_slave, int, 0);
MODULE_PARM_DESC(packets_per_slave, "Packets to send per slave in balance-rr "
				    "mode; 0 for a random slave, 1 packet per "
				    "slave (default), >1 packets per slave.");
196 197 198 199
module_param(lp_interval, uint, 0);
MODULE_PARM_DESC(lp_interval, "The number of seconds between instances where "
			      "the bonding driver sends learning packets to "
			      "each slaves peer switch. The default is 1.");
L
Linus Torvalds 已提交
200 201 202

/*----------------------------- Global variables ----------------------------*/

203
#ifdef CONFIG_NET_POLL_CONTROLLER
204
atomic_t netpoll_block_tx = ATOMIC_INIT(0);
205 206
#endif

207
unsigned int bond_net_id __read_mostly;
L
Linus Torvalds 已提交
208

209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253
/* Subset of flow-dissector keys that the bonding driver asks the core
 * dissector to extract; used for computing transmit hash policies.
 * Each entry maps a key id to its storage offset inside struct flow_keys.
 */
static const struct flow_dissector_key flow_keys_bonding_keys[] = {
	{
		.key_id = FLOW_DISSECTOR_KEY_CONTROL,
		.offset = offsetof(struct flow_keys, control),
	},
	{
		.key_id = FLOW_DISSECTOR_KEY_BASIC,
		.offset = offsetof(struct flow_keys, basic),
	},
	{
		.key_id = FLOW_DISSECTOR_KEY_IPV4_ADDRS,
		.offset = offsetof(struct flow_keys, addrs.v4addrs),
	},
	{
		.key_id = FLOW_DISSECTOR_KEY_IPV6_ADDRS,
		.offset = offsetof(struct flow_keys, addrs.v6addrs),
	},
	{
		.key_id = FLOW_DISSECTOR_KEY_TIPC,
		.offset = offsetof(struct flow_keys, addrs.tipckey),
	},
	{
		.key_id = FLOW_DISSECTOR_KEY_PORTS,
		.offset = offsetof(struct flow_keys, ports),
	},
	{
		.key_id = FLOW_DISSECTOR_KEY_ICMP,
		.offset = offsetof(struct flow_keys, icmp),
	},
	{
		.key_id = FLOW_DISSECTOR_KEY_VLAN,
		.offset = offsetof(struct flow_keys, vlan),
	},
	{
		.key_id = FLOW_DISSECTOR_KEY_FLOW_LABEL,
		.offset = offsetof(struct flow_keys, tags),
	},
	{
		.key_id = FLOW_DISSECTOR_KEY_GRE_KEYID,
		.offset = offsetof(struct flow_keys, keyid),
	},
};

/* dissector instance initialized from the key table above */
static struct flow_dissector flow_keys_bonding __read_mostly;

L
Linus Torvalds 已提交
254 255
/*-------------------------- Forward declarations ---------------------------*/

256
static int bond_init(struct net_device *bond_dev);
257
static void bond_uninit(struct net_device *bond_dev);
258 259
static void bond_get_stats(struct net_device *bond_dev,
			   struct rtnl_link_stats64 *stats);
260
static void bond_slave_arr_handler(struct work_struct *work);
261 262
static bool bond_time_in_interval(struct bonding *bond, unsigned long last_act,
				  int mod);
263
static void bond_netdev_notify_work(struct work_struct *work);
L
Linus Torvalds 已提交
264 265 266

/*---------------------------- General routines -----------------------------*/

267
const char *bond_mode_name(int mode)
L
Linus Torvalds 已提交
268
{
269 270 271 272 273
	static const char *names[] = {
		[BOND_MODE_ROUNDROBIN] = "load balancing (round-robin)",
		[BOND_MODE_ACTIVEBACKUP] = "fault-tolerance (active-backup)",
		[BOND_MODE_XOR] = "load balancing (xor)",
		[BOND_MODE_BROADCAST] = "fault-tolerance (broadcast)",
S
Stephen Hemminger 已提交
274
		[BOND_MODE_8023AD] = "IEEE 802.3ad Dynamic link aggregation",
275 276 277 278
		[BOND_MODE_TLB] = "transmit load balancing",
		[BOND_MODE_ALB] = "adaptive load balancing",
	};

279
	if (mode < BOND_MODE_ROUNDROBIN || mode > BOND_MODE_ALB)
L
Linus Torvalds 已提交
280
		return "unknown";
281 282

	return names[mode];
L
Linus Torvalds 已提交
283 284 285 286
}

/**
 * bond_dev_queue_xmit - Prepare skb for xmit.
S
Stephen Hemminger 已提交
287
 *
L
Linus Torvalds 已提交
288 289 290 291
 * @bond: bond device that got this skb for tx.
 * @skb: hw accel VLAN tagged skb to transmit
 * @slave_dev: slave that is supposed to xmit this skbuff
 */
292
netdev_tx_t bond_dev_queue_xmit(struct bonding *bond, struct sk_buff *skb,
S
Stephen Hemminger 已提交
293
			struct net_device *slave_dev)
L
Linus Torvalds 已提交
294
{
295
	skb->dev = slave_dev;
296

297
	BUILD_BUG_ON(sizeof(skb->queue_mapping) !=
298
		     sizeof(qdisc_skb_cb(skb)->slave_dev_queue_mapping));
299
	skb_set_queue_mapping(skb, qdisc_skb_cb(skb)->slave_dev_queue_mapping);
300

301
	if (unlikely(netpoll_tx_running(bond->dev)))
302 303 304
		return bond_netpoll_send_skb(bond_get_slave_by_dev(bond, slave_dev), skb);

	return dev_queue_xmit(skb);
L
Linus Torvalds 已提交
305 306
}

307 308 309 310 311 312 313 314 315 316 317 318 319
/* True only for 802.3ad and balance-xor modes running with the
 * layer3+4 transmit hash policy; false for every other combination.
 */
bool bond_sk_check(struct bonding *bond)
{
	int mode = BOND_MODE(bond);

	if (mode != BOND_MODE_8023AD && mode != BOND_MODE_XOR)
		return false;

	return bond->params.xmit_policy == BOND_XMIT_POLICY_LAYER34;
}

320 321 322 323 324 325 326 327 328 329 330 331 332
/* Report whether the current bonding mode is one of those that
 * support XDP operation.
 */
static bool bond_xdp_check(struct bonding *bond)
{
	int mode = BOND_MODE(bond);

	return mode == BOND_MODE_ROUNDROBIN ||
	       mode == BOND_MODE_ACTIVEBACKUP ||
	       mode == BOND_MODE_8023AD ||
	       mode == BOND_MODE_XOR;
}

333 334
/*---------------------------------- VLAN -----------------------------------*/

335
/* In the following 2 functions, bond_vlan_rx_add_vid and bond_vlan_rx_kill_vid,
J
Jiri Pirko 已提交
336
 * We don't protect the slave list iteration with a lock because:
L
Linus Torvalds 已提交
337 338 339 340
 * a. This operation is performed in IOCTL context,
 * b. The operation is protected by the RTNL semaphore in the 8021q code,
 * c. Holding a lock with BH disabled while directly calling a base driver
 *    entry point is generally a BAD idea.
S
Stephen Hemminger 已提交
341
 *
L
Linus Torvalds 已提交
342 343 344 345 346 347 348 349 350 351 352 353
 * The design of synchronization/protection for this operation in the 8021q
 * module is good for one or more VLAN devices over a single physical device
 * and cannot be extended for a teaming solution like bonding, so there is a
 * potential race condition here where a net device from the vlan group might
 * be referenced (either by a base driver or the 8021q code) while it is being
 * removed from the system. However, it turns out we're not making matters
 * worse, and if it works for regular VLAN usage it will work here too.
*/

/**
 * bond_vlan_rx_add_vid - Propagates adding an id to slaves
 * @bond_dev: bonding net device that got called
354
 * @proto: network protocol ID
L
Linus Torvalds 已提交
355 356
 * @vid: vlan id being added
 */
357 358
static int bond_vlan_rx_add_vid(struct net_device *bond_dev,
				__be16 proto, u16 vid)
L
Linus Torvalds 已提交
359
{
360
	struct bonding *bond = netdev_priv(bond_dev);
361
	struct slave *slave, *rollback_slave;
362
	struct list_head *iter;
363
	int res;
L
Linus Torvalds 已提交
364

365
	bond_for_each_slave(bond, slave, iter) {
366
		res = vlan_vid_add(slave->dev, proto, vid);
367 368
		if (res)
			goto unwind;
L
Linus Torvalds 已提交
369 370
	}

371
	return 0;
372 373

unwind:
374
	/* unwind to the slave that failed */
375
	bond_for_each_slave(bond, rollback_slave, iter) {
376 377 378 379 380
		if (rollback_slave == slave)
			break;

		vlan_vid_del(rollback_slave->dev, proto, vid);
	}
381 382

	return res;
L
Linus Torvalds 已提交
383 384 385 386 387
}

/**
 * bond_vlan_rx_kill_vid - Propagates deleting an id to slaves
 * @bond_dev: bonding net device that got called
388
 * @proto: network protocol ID
L
Linus Torvalds 已提交
389 390
 * @vid: vlan id being removed
 */
391 392
static int bond_vlan_rx_kill_vid(struct net_device *bond_dev,
				 __be16 proto, u16 vid)
L
Linus Torvalds 已提交
393
{
394
	struct bonding *bond = netdev_priv(bond_dev);
395
	struct list_head *iter;
L
Linus Torvalds 已提交
396 397
	struct slave *slave;

398
	bond_for_each_slave(bond, slave, iter)
399
		vlan_vid_del(slave->dev, proto, vid);
L
Linus Torvalds 已提交
400

401 402
	if (bond_is_lb(bond))
		bond_alb_clear_vlan(bond, vid);
403 404

	return 0;
L
Linus Torvalds 已提交
405 406
}

407 408 409 410 411 412 413 414 415 416
/*---------------------------------- XFRM -----------------------------------*/

#ifdef CONFIG_XFRM_OFFLOAD
/**
 * bond_ipsec_add_sa - program device with a security association
 * @xs: pointer to transformer state struct
 **/
static int bond_ipsec_add_sa(struct xfrm_state *xs)
{
	struct net_device *bond_dev = xs->xso.dev;
417
	struct bond_ipsec *ipsec;
418 419
	struct bonding *bond;
	struct slave *slave;
420
	int err;
421

422 423 424
	if (!bond_dev)
		return -EINVAL;

425
	rcu_read_lock();
426
	bond = netdev_priv(bond_dev);
427
	slave = rcu_dereference(bond->curr_active_slave);
428 429 430 431 432
	if (!slave) {
		rcu_read_unlock();
		return -ENODEV;
	}

433 434 435
	if (!slave->dev->xfrmdev_ops ||
	    !slave->dev->xfrmdev_ops->xdo_dev_state_add ||
	    netif_is_bond_master(slave->dev)) {
436
		slave_warn(bond_dev, slave->dev, "Slave does not support ipsec offload\n");
437
		rcu_read_unlock();
438 439 440
		return -EINVAL;
	}

441 442 443 444 445 446 447
	ipsec = kmalloc(sizeof(*ipsec), GFP_ATOMIC);
	if (!ipsec) {
		rcu_read_unlock();
		return -ENOMEM;
	}
	xs->xso.real_dev = slave->dev;

448
	err = slave->dev->xfrmdev_ops->xdo_dev_state_add(xs);
449 450 451 452 453 454 455 456 457
	if (!err) {
		ipsec->xs = xs;
		INIT_LIST_HEAD(&ipsec->list);
		spin_lock_bh(&bond->ipsec_lock);
		list_add(&ipsec->list, &bond->ipsec_list);
		spin_unlock_bh(&bond->ipsec_lock);
	} else {
		kfree(ipsec);
	}
458 459
	rcu_read_unlock();
	return err;
460 461
}

462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497
/* Re-program every SA tracked on bond->ipsec_list into the current
 * active slave's hardware.  NOTE(review): presumably invoked when the
 * active slave changes — confirm against callers outside this chunk.
 */
static void bond_ipsec_add_sa_all(struct bonding *bond)
{
	struct net_device *bond_dev = bond->dev;
	struct bond_ipsec *ipsec;
	struct slave *slave;

	rcu_read_lock();
	slave = rcu_dereference(bond->curr_active_slave);
	if (!slave)
		goto out;

	/* Warn only when there actually are SAs that cannot be offloaded;
	 * the list check is done under ipsec_lock.
	 */
	if (!slave->dev->xfrmdev_ops ||
	    !slave->dev->xfrmdev_ops->xdo_dev_state_add ||
	    netif_is_bond_master(slave->dev)) {
		spin_lock_bh(&bond->ipsec_lock);
		if (!list_empty(&bond->ipsec_list))
			slave_warn(bond_dev, slave->dev,
				   "%s: no slave xdo_dev_state_add\n",
				   __func__);
		spin_unlock_bh(&bond->ipsec_lock);
		goto out;
	}

	spin_lock_bh(&bond->ipsec_lock);
	list_for_each_entry(ipsec, &bond->ipsec_list, list) {
		ipsec->xs->xso.real_dev = slave->dev;
		/* on failure, mark the SA as not offloaded anywhere */
		if (slave->dev->xfrmdev_ops->xdo_dev_state_add(ipsec->xs)) {
			slave_warn(bond_dev, slave->dev, "%s: failed to add SA\n", __func__);
			ipsec->xs->xso.real_dev = NULL;
		}
	}
	spin_unlock_bh(&bond->ipsec_lock);
out:
	rcu_read_unlock();
}

498 499 500 501 502 503 504
/**
 * bond_ipsec_del_sa - clear out this specific SA
 * @xs: pointer to transformer state struct
 **/
static void bond_ipsec_del_sa(struct xfrm_state *xs)
{
	struct net_device *bond_dev = xs->xso.dev;
505
	struct bond_ipsec *ipsec;
506 507 508 509 510 511
	struct bonding *bond;
	struct slave *slave;

	if (!bond_dev)
		return;

512
	rcu_read_lock();
513
	bond = netdev_priv(bond_dev);
514
	slave = rcu_dereference(bond->curr_active_slave);
515 516

	if (!slave)
517
		goto out;
518

519 520 521 522
	if (!xs->xso.real_dev)
		goto out;

	WARN_ON(xs->xso.real_dev != slave->dev);
523

524 525 526
	if (!slave->dev->xfrmdev_ops ||
	    !slave->dev->xfrmdev_ops->xdo_dev_state_delete ||
	    netif_is_bond_master(slave->dev)) {
527
		slave_warn(bond_dev, slave->dev, "%s: no slave xdo_dev_state_delete\n", __func__);
528
		goto out;
529 530 531
	}

	slave->dev->xfrmdev_ops->xdo_dev_state_delete(xs);
532
out:
533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574
	spin_lock_bh(&bond->ipsec_lock);
	list_for_each_entry(ipsec, &bond->ipsec_list, list) {
		if (ipsec->xs == xs) {
			list_del(&ipsec->list);
			kfree(ipsec);
			break;
		}
	}
	spin_unlock_bh(&bond->ipsec_lock);
	rcu_read_unlock();
}

/* Remove hardware offload state for every tracked SA from the current
 * active slave and mark each SA as no longer offloaded.
 */
static void bond_ipsec_del_sa_all(struct bonding *bond)
{
	struct net_device *bond_dev = bond->dev;
	struct bond_ipsec *entry;
	struct slave *slave;

	rcu_read_lock();
	slave = rcu_dereference(bond->curr_active_slave);
	if (!slave) {
		rcu_read_unlock();
		return;
	}

	spin_lock_bh(&bond->ipsec_lock);
	list_for_each_entry(entry, &bond->ipsec_list, list) {
		/* skip SAs that were never offloaded to a device */
		if (!entry->xs->xso.real_dev)
			continue;

		if (!slave->dev->xfrmdev_ops ||
		    !slave->dev->xfrmdev_ops->xdo_dev_state_delete ||
		    netif_is_bond_master(slave->dev)) {
			slave_warn(bond_dev, slave->dev,
				   "%s: no slave xdo_dev_state_delete\n",
				   __func__);
		} else {
			slave->dev->xfrmdev_ops->xdo_dev_state_delete(entry->xs);
		}
		entry->xs->xso.real_dev = NULL;
	}
	spin_unlock_bh(&bond->ipsec_lock);
	rcu_read_unlock();
}

/**
 * bond_ipsec_offload_ok - can this packet use the xfrm hw offload
 * @skb: current data packet
 * @xs: pointer to transformer state struct
 **/
static bool bond_ipsec_offload_ok(struct sk_buff *skb, struct xfrm_state *xs)
{
	struct net_device *bond_dev = xs->xso.dev;
586 587 588
	struct net_device *real_dev;
	struct slave *curr_active;
	struct bonding *bond;
589
	int err;
590 591

	bond = netdev_priv(bond_dev);
592
	rcu_read_lock();
593 594
	curr_active = rcu_dereference(bond->curr_active_slave);
	real_dev = curr_active->dev;
595

596
	if (BOND_MODE(bond) != BOND_MODE_ACTIVEBACKUP) {
597
		err = false;
598 599
		goto out;
	}
600

601 602 603 604
	if (!xs->xso.real_dev) {
		err = false;
		goto out;
	}
605 606 607 608

	if (!real_dev->xfrmdev_ops ||
	    !real_dev->xfrmdev_ops->xdo_dev_offload_ok ||
	    netif_is_bond_master(real_dev)) {
609 610
		err = false;
		goto out;
611 612
	}

613 614 615 616
	err = real_dev->xfrmdev_ops->xdo_dev_offload_ok(skb, xs);
out:
	rcu_read_unlock();
	return err;
617 618 619 620 621 622 623 624 625
}

/* xfrm offload callbacks exposed on the bond device; each forwards the
 * operation to the current active slave's implementation.
 */
static const struct xfrmdev_ops bond_xfrmdev_ops = {
	.xdo_dev_state_add = bond_ipsec_add_sa,
	.xdo_dev_state_delete = bond_ipsec_del_sa,
	.xdo_dev_offload_ok = bond_ipsec_offload_ok,
};
#endif /* CONFIG_XFRM_OFFLOAD */

L
Linus Torvalds 已提交
626 627
/*------------------------------- Link status -------------------------------*/

628
/* Set the carrier state for the master according to the state of its
629 630 631 632 633
 * slaves.  If any slaves are up, the master is up.  In 802.3ad mode,
 * do special 802.3ad magic.
 *
 * Returns zero if carrier state does not change, nonzero if it does.
 */
634
int bond_set_carrier(struct bonding *bond)
635
{
636
	struct list_head *iter;
637 638
	struct slave *slave;

639
	if (!bond_has_slaves(bond))
640 641
		goto down;

642
	if (BOND_MODE(bond) == BOND_MODE_8023AD)
643 644
		return bond_3ad_set_carrier(bond);

645
	bond_for_each_slave(bond, slave, iter) {
646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662
		if (slave->link == BOND_LINK_UP) {
			if (!netif_carrier_ok(bond->dev)) {
				netif_carrier_on(bond->dev);
				return 1;
			}
			return 0;
		}
	}

down:
	if (netif_carrier_ok(bond->dev)) {
		netif_carrier_off(bond->dev);
		return 1;
	}
	return 0;
}

663
/* Get link speed and duplex from the slave's base driver
L
Linus Torvalds 已提交
664
 * using ethtool. If for some reason the call fails or the
665
 * values are invalid, set speed and duplex to -1,
666 667
 * and return. Return 1 if speed or duplex settings are
 * UNKNOWN; 0 otherwise.
L
Linus Torvalds 已提交
668
 */
669
static int bond_update_speed_duplex(struct slave *slave)
L
Linus Torvalds 已提交
670 671
{
	struct net_device *slave_dev = slave->dev;
672
	struct ethtool_link_ksettings ecmd;
673
	int res;
L
Linus Torvalds 已提交
674

675 676
	slave->speed = SPEED_UNKNOWN;
	slave->duplex = DUPLEX_UNKNOWN;
L
Linus Torvalds 已提交
677

678
	res = __ethtool_get_link_ksettings(slave_dev, &ecmd);
679
	if (res < 0)
680
		return 1;
681
	if (ecmd.base.speed == 0 || ecmd.base.speed == ((__u32)-1))
682
		return 1;
683
	switch (ecmd.base.duplex) {
L
Linus Torvalds 已提交
684 685 686 687
	case DUPLEX_FULL:
	case DUPLEX_HALF:
		break;
	default:
688
		return 1;
L
Linus Torvalds 已提交
689 690
	}

691 692
	slave->speed = ecmd.base.speed;
	slave->duplex = ecmd.base.duplex;
L
Linus Torvalds 已提交
693

694
	return 0;
L
Linus Torvalds 已提交
695 696
}

697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712
/* Human-readable name for a slave's link state. */
const char *bond_slave_link_status(s8 link)
{
	if (link == BOND_LINK_UP)
		return "up";
	if (link == BOND_LINK_FAIL)
		return "going down";
	if (link == BOND_LINK_DOWN)
		return "down";
	if (link == BOND_LINK_BACK)
		return "going back";
	return "unknown";
}

713
/* if <dev> supports MII link status reporting, check its link status.
L
Linus Torvalds 已提交
714 715
 *
 * We either do MII/ETHTOOL ioctls, or check netif_carrier_ok(),
S
Stephen Hemminger 已提交
716
 * depending upon the setting of the use_carrier parameter.
L
Linus Torvalds 已提交
717 718 719 720 721 722 723 724 725 726 727
 *
 * Return either BMSR_LSTATUS, meaning that the link is up (or we
 * can't tell and just pretend it is), or 0, meaning that the link is
 * down.
 *
 * If reporting is non-zero, instead of faking link up, return -1 if
 * both ETHTOOL and MII ioctls fail (meaning the device does not
 * support them).  If use_carrier is set, return whatever it says.
 * It'd be nice if there was a good way to tell if a driver supports
 * netif_carrier, but there really isn't.
 */
S
Stephen Hemminger 已提交
728 729
static int bond_check_dev_link(struct bonding *bond,
			       struct net_device *slave_dev, int reporting)
L
Linus Torvalds 已提交
730
{
731
	const struct net_device_ops *slave_ops = slave_dev->netdev_ops;
732
	int (*ioctl)(struct net_device *, struct ifreq *, int);
L
Linus Torvalds 已提交
733 734 735
	struct ifreq ifr;
	struct mii_ioctl_data *mii;

736 737 738
	if (!reporting && !netif_running(slave_dev))
		return 0;

739
	if (bond->params.use_carrier)
740
		return netif_carrier_ok(slave_dev) ? BMSR_LSTATUS : 0;
L
Linus Torvalds 已提交
741

742
	/* Try to get link status using Ethtool first. */
743 744 745
	if (slave_dev->ethtool_ops->get_link)
		return slave_dev->ethtool_ops->get_link(slave_dev) ?
			BMSR_LSTATUS : 0;
746

S
Stephen Hemminger 已提交
747
	/* Ethtool can't be used, fallback to MII ioctls. */
748
	ioctl = slave_ops->ndo_eth_ioctl;
L
Linus Torvalds 已提交
749
	if (ioctl) {
750 751 752 753 754 755 756 757
		/* TODO: set pointer to correct ioctl on a per team member
		 *       bases to make this more efficient. that is, once
		 *       we determine the correct ioctl, we will always
		 *       call it and not the others for that team
		 *       member.
		 */

		/* We cannot assume that SIOCGMIIPHY will also read a
L
Linus Torvalds 已提交
758 759 760 761 762
		 * register; not all network drivers (e.g., e100)
		 * support that.
		 */

		/* Yes, the mii is overlaid on the ifreq.ifr_ifru */
763
		strscpy_pad(ifr.ifr_name, slave_dev->name, IFNAMSIZ);
L
Linus Torvalds 已提交
764
		mii = if_mii(&ifr);
A
Al Viro 已提交
765
		if (ioctl(slave_dev, &ifr, SIOCGMIIPHY) == 0) {
L
Linus Torvalds 已提交
766
			mii->reg_num = MII_BMSR;
A
Al Viro 已提交
767
			if (ioctl(slave_dev, &ifr, SIOCGMIIREG) == 0)
S
Stephen Hemminger 已提交
768
				return mii->val_out & BMSR_LSTATUS;
L
Linus Torvalds 已提交
769 770 771
		}
	}

772
	/* If reporting, report that either there's no ndo_eth_ioctl,
773
	 * or both SIOCGMIIREG and get_link failed (meaning that we
L
Linus Torvalds 已提交
774 775 776
	 * cannot report link status).  If not reporting, pretend
	 * we're ok.
	 */
S
Stephen Hemminger 已提交
777
	return reporting ? -1 : BMSR_LSTATUS;
L
Linus Torvalds 已提交
778 779 780 781
}

/*----------------------------- Multicast list ------------------------------*/

782
/* Push the promiscuity flag down to appropriate slaves */
783
static int bond_set_promiscuity(struct bonding *bond, int inc)
L
Linus Torvalds 已提交
784
{
785
	struct list_head *iter;
786
	int err = 0;
787

788
	if (bond_uses_primary(bond)) {
789
		struct slave *curr_active = rtnl_dereference(bond->curr_active_slave);
790 791 792

		if (curr_active)
			err = dev_set_promiscuity(curr_active->dev, inc);
L
Linus Torvalds 已提交
793 794
	} else {
		struct slave *slave;
795

796
		bond_for_each_slave(bond, slave, iter) {
797 798 799
			err = dev_set_promiscuity(slave->dev, inc);
			if (err)
				return err;
L
Linus Torvalds 已提交
800 801
		}
	}
802
	return err;
L
Linus Torvalds 已提交
803 804
}

805
/* Push the allmulti flag down to all slaves */
806
static int bond_set_allmulti(struct bonding *bond, int inc)
L
Linus Torvalds 已提交
807
{
808
	struct list_head *iter;
809
	int err = 0;
810

811
	if (bond_uses_primary(bond)) {
812
		struct slave *curr_active = rtnl_dereference(bond->curr_active_slave);
813 814 815

		if (curr_active)
			err = dev_set_allmulti(curr_active->dev, inc);
L
Linus Torvalds 已提交
816 817
	} else {
		struct slave *slave;
818

819
		bond_for_each_slave(bond, slave, iter) {
820 821 822
			err = dev_set_allmulti(slave->dev, inc);
			if (err)
				return err;
L
Linus Torvalds 已提交
823 824
		}
	}
825
	return err;
L
Linus Torvalds 已提交
826 827
}

828
/* Retrieve the list of registered multicast addresses for the bonding
829 830 831
 * device and retransmit an IGMP JOIN request to the current active
 * slave.
 */
832
static void bond_resend_igmp_join_requests_delayed(struct work_struct *work)
833
{
834 835 836
	struct bonding *bond = container_of(work, struct bonding,
					    mcast_work.work);

837
	if (!rtnl_trylock()) {
838
		queue_delayed_work(bond->wq, &bond->mcast_work, 1);
839
		return;
840
	}
841
	call_netdevice_notifiers(NETDEV_RESEND_IGMP, bond->dev);
842

843 844
	if (bond->igmp_retrans > 1) {
		bond->igmp_retrans--;
845
		queue_delayed_work(bond->wq, &bond->mcast_work, HZ/5);
846
	}
847
	rtnl_unlock();
848 849
}

850
/* Flush bond's hardware addresses from slave */
851
static void bond_hw_addr_flush(struct net_device *bond_dev,
S
Stephen Hemminger 已提交
852
			       struct net_device *slave_dev)
L
Linus Torvalds 已提交
853
{
854
	struct bonding *bond = netdev_priv(bond_dev);
L
Linus Torvalds 已提交
855

856 857
	dev_uc_unsync(slave_dev, bond_dev);
	dev_mc_unsync(slave_dev, bond_dev);
L
Linus Torvalds 已提交
858

859
	if (BOND_MODE(bond) == BOND_MODE_8023AD) {
L
Linus Torvalds 已提交
860 861 862
		/* del lacpdu mc addr from mc list */
		u8 lacpdu_multicast[ETH_ALEN] = MULTICAST_LACPDU_ADDR;

863
		dev_mc_del(slave_dev, lacpdu_multicast);
L
Linus Torvalds 已提交
864 865 866 867 868
	}
}

/*--------------------------- Active slave change ---------------------------*/

869
/* Update the hardware address list and promisc/allmulti for the new and
870 871
 * old active slaves (if any).  Modes that are not using primary keep all
 * slaves up date at all times; only the modes that use primary need to call
872
 * this function to swap these settings during a failover.
L
Linus Torvalds 已提交
873
 */
874 875
static void bond_hw_addr_swap(struct bonding *bond, struct slave *new_active,
			      struct slave *old_active)
L
Linus Torvalds 已提交
876 877
{
	if (old_active) {
S
Stephen Hemminger 已提交
878
		if (bond->dev->flags & IFF_PROMISC)
L
Linus Torvalds 已提交
879 880
			dev_set_promiscuity(old_active->dev, -1);

S
Stephen Hemminger 已提交
881
		if (bond->dev->flags & IFF_ALLMULTI)
L
Linus Torvalds 已提交
882 883
			dev_set_allmulti(old_active->dev, -1);

884
		bond_hw_addr_flush(bond->dev, old_active->dev);
L
Linus Torvalds 已提交
885 886 887
	}

	if (new_active) {
888
		/* FIXME: Signal errors upstream. */
S
Stephen Hemminger 已提交
889
		if (bond->dev->flags & IFF_PROMISC)
L
Linus Torvalds 已提交
890 891
			dev_set_promiscuity(new_active->dev, 1);

S
Stephen Hemminger 已提交
892
		if (bond->dev->flags & IFF_ALLMULTI)
L
Linus Torvalds 已提交
893 894
			dev_set_allmulti(new_active->dev, 1);

895
		netif_addr_lock_bh(bond->dev);
896 897
		dev_uc_sync(new_active->dev, bond->dev);
		dev_mc_sync(new_active->dev, bond->dev);
898
		netif_addr_unlock_bh(bond->dev);
L
Linus Torvalds 已提交
899 900 901
	}
}

902 903 904 905 906 907 908
/**
 * bond_set_dev_addr - clone slave's address to bond
 * @bond_dev: bond net device
 * @slave_dev: slave net device
 *
 * Should be called with RTNL held.
 *
 * Return: 0 on success, or the error from dev_pre_changeaddr_notify()
 * if a notifier chain member vetoed the address change.
 */
static int bond_set_dev_addr(struct net_device *bond_dev,
			     struct net_device *slave_dev)
{
	int err;

	slave_dbg(bond_dev, slave_dev, "bond_dev=%p slave_dev=%p slave_dev->addr_len=%d\n",
		  bond_dev, slave_dev, slave_dev->addr_len);
	/* Give notifiers a chance to veto before touching dev_addr. */
	err = dev_pre_changeaddr_notify(bond_dev, slave_dev->dev_addr, NULL);
	if (err)
		return err;

	memcpy(bond_dev->dev_addr, slave_dev->dev_addr, slave_dev->addr_len);
	/* Mark the address as borrowed from the slave. */
	bond_dev->addr_assign_type = NET_ADDR_STOLEN;
	call_netdevice_notifiers(NETDEV_CHANGEADDR, bond_dev);
	return 0;
}

/* Find a slave (other than @new_active) that still carries the bond's MAC
 * address; used as a stand-in "old active" for fail_over_mac=follow.
 */
static struct slave *bond_get_old_active(struct bonding *bond,
					 struct slave *new_active)
{
	struct list_head *iter;
	struct slave *slave;

	bond_for_each_slave(bond, slave, iter) {
		if (slave != new_active &&
		    ether_addr_equal(bond->dev->dev_addr, slave->dev->dev_addr))
			return slave;
	}

	return NULL;
}

/* bond_do_fail_over_mac
 *
 * Perform special MAC address swapping for fail_over_mac settings
 *
 * Called with RTNL
 */
static void bond_do_fail_over_mac(struct bonding *bond,
				  struct slave *new_active,
				  struct slave *old_active)
{
	u8 tmp_mac[MAX_ADDR_LEN];
	struct sockaddr_storage ss;
	int rv;

	switch (bond->params.fail_over_mac) {
	case BOND_FOM_ACTIVE:
		/* bond adopts the new active slave's MAC. */
		if (new_active) {
			rv = bond_set_dev_addr(bond->dev, new_active->dev);
			if (rv)
				slave_err(bond->dev, new_active->dev, "Error %d setting bond MAC from slave\n",
					  -rv);
		}
		break;
	case BOND_FOM_FOLLOW:
		/* if new_active && old_active, swap them
		 * if just old_active, do nothing (going to no active slave)
		 * if just new_active, set new_active to bond's MAC
		 */
		if (!new_active)
			return;

		/* No explicit old active: find whichever slave still holds
		 * the bond's MAC so we can hand it the new active's MAC.
		 */
		if (!old_active)
			old_active = bond_get_old_active(bond, new_active);

		if (old_active) {
			/* Stash new_active's MAC before overwriting it. */
			bond_hw_addr_copy(tmp_mac, new_active->dev->dev_addr,
					  new_active->dev->addr_len);
			bond_hw_addr_copy(ss.__data,
					  old_active->dev->dev_addr,
					  old_active->dev->addr_len);
			ss.ss_family = new_active->dev->type;
		} else {
			bond_hw_addr_copy(ss.__data, bond->dev->dev_addr,
					  bond->dev->addr_len);
			ss.ss_family = bond->dev->type;
		}

		rv = dev_set_mac_address(new_active->dev,
					 (struct sockaddr *)&ss, NULL);
		if (rv) {
			slave_err(bond->dev, new_active->dev, "Error %d setting MAC of new active slave\n",
				  -rv);
			goto out;
		}

		if (!old_active)
			goto out;

		/* Complete the swap: old active takes new active's old MAC. */
		bond_hw_addr_copy(ss.__data, tmp_mac,
				  new_active->dev->addr_len);
		ss.ss_family = old_active->dev->type;

		rv = dev_set_mac_address(old_active->dev,
					 (struct sockaddr *)&ss, NULL);
		if (rv)
			slave_err(bond->dev, old_active->dev, "Error %d setting MAC of old active slave\n",
				  -rv);
out:
		break;
	default:
		netdev_err(bond->dev, "bond_do_fail_over_mac impossible: bad policy %d\n",
			   bond->params.fail_over_mac);
		break;
	}

}

/* Pick between the configured primary slave and the current active slave,
 * honoring the primary_reselect policy.  Returns NULL when neither is up.
 * Called under RTNL (uses rtnl_dereference).
 */
static struct slave *bond_choose_primary_or_current(struct bonding *bond)
{
	struct slave *prim = rtnl_dereference(bond->primary_slave);
	struct slave *curr = rtnl_dereference(bond->curr_active_slave);

	/* No usable primary: fall back to current active if it is up. */
	if (!prim || prim->link != BOND_LINK_UP) {
		if (!curr || curr->link != BOND_LINK_UP)
			return NULL;
		return curr;
	}

	/* One-shot override (e.g. after the primary option changed). */
	if (bond->force_primary) {
		bond->force_primary = false;
		return prim;
	}

	if (!curr || curr->link != BOND_LINK_UP)
		return prim;

	/* At this point, prim and curr are both up */
	switch (bond->params.primary_reselect) {
	case BOND_PRI_RESELECT_ALWAYS:
		return prim;
	case BOND_PRI_RESELECT_BETTER:
		/* Prefer the primary only if it is strictly better. */
		if (prim->speed < curr->speed)
			return curr;
		if (prim->speed == curr->speed && prim->duplex <= curr->duplex)
			return curr;
		return prim;
	case BOND_PRI_RESELECT_FAILURE:
		return curr;
	default:
		netdev_err(bond->dev, "impossible primary_reselect %d\n",
			   bond->params.primary_reselect);
		return curr;
	}
}

/**
1059
 * bond_find_best_slave - select the best available slave to be the active one
L
Linus Torvalds 已提交
1060 1061 1062 1063
 * @bond: our bonding struct
 */
static struct slave *bond_find_best_slave(struct bonding *bond)
{
1064
	struct slave *slave, *bestslave = NULL;
1065
	struct list_head *iter;
L
Linus Torvalds 已提交
1066 1067
	int mintime = bond->params.updelay;

1068 1069 1070
	slave = bond_choose_primary_or_current(bond);
	if (slave)
		return slave;
L
Linus Torvalds 已提交
1071

1072 1073 1074
	bond_for_each_slave(bond, slave, iter) {
		if (slave->link == BOND_LINK_UP)
			return slave;
1075
		if (slave->link == BOND_LINK_BACK && bond_slave_is_up(slave) &&
1076 1077 1078
		    slave->delay < mintime) {
			mintime = slave->delay;
			bestslave = slave;
L
Linus Torvalds 已提交
1079 1080 1081 1082 1083 1084
		}
	}

	return bestslave;
}

1085 1086
static bool bond_should_notify_peers(struct bonding *bond)
{
1087 1088 1089 1090 1091
	struct slave *slave;

	rcu_read_lock();
	slave = rcu_dereference(bond->curr_active_slave);
	rcu_read_unlock();
1092

1093 1094
	netdev_dbg(bond->dev, "bond_should_notify_peers: slave %s\n",
		   slave ? slave->dev->name : "NULL");
1095 1096

	if (!slave || !bond->send_peer_notif ||
1097 1098
	    bond->send_peer_notif %
	    max(1, bond->params.peer_notif_delay) != 0 ||
1099
	    !netif_carrier_ok(bond->dev) ||
1100 1101 1102 1103 1104 1105
	    test_bit(__LINK_STATE_LINKWATCH_PENDING, &slave->dev->state))
		return false;

	return true;
}

L
Linus Torvalds 已提交
1106
/**
 * bond_change_active_slave - change the active slave into the specified one
 * @bond: our bonding struct
 * @new_active: the new slave to make the active one
 *
 * Set the new slave to the bond's settings and unset them on the old
 * curr_active_slave.
 * Setting include flags, mc-list, promiscuity, allmulti, etc.
 *
 * If @new's link state is %BOND_LINK_BACK we'll set it to %BOND_LINK_UP,
 * because it is apparently the best available slave we have, even though its
 * updelay hasn't timed out yet.
 *
 * Caller must hold RTNL.
 */
void bond_change_active_slave(struct bonding *bond, struct slave *new_active)
{
	struct slave *old_active;

	ASSERT_RTNL();

	old_active = rtnl_dereference(bond->curr_active_slave);

	if (old_active == new_active)
		return;

#ifdef CONFIG_XFRM_OFFLOAD
	/* Offloaded SAs are bound to the active slave; drop them before the
	 * switch and re-add them afterwards (bottom of this function).
	 */
	bond_ipsec_del_sa_all(bond);
#endif /* CONFIG_XFRM_OFFLOAD */

	if (new_active) {
		new_active->last_link_up = jiffies;

		if (new_active->link == BOND_LINK_BACK) {
			/* Promote early: updelay has not expired yet. */
			if (bond_uses_primary(bond)) {
				slave_info(bond->dev, new_active->dev, "making interface the new active one %d ms earlier\n",
					   (bond->params.updelay - new_active->delay) * bond->params.miimon);
			}

			new_active->delay = 0;
			bond_set_slave_link_state(new_active, BOND_LINK_UP,
						  BOND_SLAVE_NOTIFY_NOW);

			if (BOND_MODE(bond) == BOND_MODE_8023AD)
				bond_3ad_handle_link_change(new_active, BOND_LINK_UP);

			if (bond_is_lb(bond))
				bond_alb_handle_link_change(bond, new_active, BOND_LINK_UP);
		} else {
			if (bond_uses_primary(bond))
				slave_info(bond->dev, new_active->dev, "making interface the new active one\n");
		}
	}

	/* Primary-based modes move promisc/allmulti and the address lists
	 * from the old active to the new one.
	 */
	if (bond_uses_primary(bond))
		bond_hw_addr_swap(bond, new_active, old_active);

	if (bond_is_lb(bond)) {
		/* ALB/TLB: let the balancer update its state, then flip the
		 * active/inactive flags; the balancer owns curr_active_slave.
		 */
		bond_alb_handle_active_change(bond, new_active);
		if (old_active)
			bond_set_slave_inactive_flags(old_active,
						      BOND_SLAVE_NOTIFY_NOW);
		if (new_active)
			bond_set_slave_active_flags(new_active,
						    BOND_SLAVE_NOTIFY_NOW);
	} else {
		rcu_assign_pointer(bond->curr_active_slave, new_active);
	}

	if (BOND_MODE(bond) == BOND_MODE_ACTIVEBACKUP) {
		if (old_active)
			bond_set_slave_inactive_flags(old_active,
						      BOND_SLAVE_NOTIFY_NOW);

		if (new_active) {
			bool should_notify_peers = false;

			bond_set_slave_active_flags(new_active,
						    BOND_SLAVE_NOTIFY_NOW);

			if (bond->params.fail_over_mac)
				bond_do_fail_over_mac(bond, new_active,
						      old_active);

			if (netif_running(bond->dev)) {
				/* Arm the periodic peer-notification budget. */
				bond->send_peer_notif =
					bond->params.num_peer_notif *
					max(1, bond->params.peer_notif_delay);
				should_notify_peers =
					bond_should_notify_peers(bond);
			}

			call_netdevice_notifiers(NETDEV_BONDING_FAILOVER, bond->dev);
			if (should_notify_peers) {
				bond->send_peer_notif--;
				call_netdevice_notifiers(NETDEV_NOTIFY_PEERS,
							 bond->dev);
			}
		}
	}

#ifdef CONFIG_XFRM_OFFLOAD
	bond_ipsec_add_sa_all(bond);
#endif /* CONFIG_XFRM_OFFLOAD */

	/* resend IGMP joins since active slave has changed or
	 * all were sent on curr_active_slave.
	 * resend only if bond is brought up with the affected
	 * bonding modes and the retransmission is enabled
	 */
	if (netif_running(bond->dev) && (bond->params.resend_igmp > 0) &&
	    ((bond_uses_primary(bond) && new_active) ||
	     BOND_MODE(bond) == BOND_MODE_ROUNDROBIN)) {
		bond->igmp_retrans = bond->params.resend_igmp;
		queue_delayed_work(bond->wq, &bond->mcast_work, 1);
	}
}

/**
 * bond_select_active_slave - select a new active slave, if needed
 * @bond: our bonding struct
 *
 * This functions should be called when one of the following occurs:
 * - The old curr_active_slave has been released or lost its link.
 * - The primary_slave has got its link back.
 * - A slave has got its link back and there's no old curr_active_slave.
 *
 * Caller must hold RTNL.
 */
void bond_select_active_slave(struct bonding *bond)
{
	struct slave *best_slave;
	int rv;

	ASSERT_RTNL();

	best_slave = bond_find_best_slave(bond);
	/* Only act when the choice actually changes (may be NULL). */
	if (best_slave != rtnl_dereference(bond->curr_active_slave)) {
		bond_change_active_slave(bond, best_slave);
		/* bond_set_carrier() returns nonzero when carrier changed;
		 * log the transition only in that case.
		 */
		rv = bond_set_carrier(bond);
		if (!rv)
			return;

		if (netif_carrier_ok(bond->dev))
			netdev_info(bond->dev, "active interface up!\n");
		else
			netdev_info(bond->dev, "now running without any active interface!\n");
	}
}

#ifdef CONFIG_NET_POLL_CONTROLLER
/* Allocate and attach a netpoll instance to @slave.
 * Returns 0 on success, -ENOMEM or the __netpoll_setup() error otherwise.
 */
static inline int slave_enable_netpoll(struct slave *slave)
{
	struct netpoll *np;
	int err;

	np = kzalloc(sizeof(*np), GFP_KERNEL);
	if (!np)
		return -ENOMEM;

	err = __netpoll_setup(np, slave->dev);
	if (err) {
		kfree(np);
		return err;
	}

	slave->np = np;
	return 0;
}
/* Detach and free @slave's netpoll instance, if any. */
static inline void slave_disable_netpoll(struct slave *slave)
{
	struct netpoll *np = slave->np;

	if (np) {
		/* Clear the pointer before freeing so nobody sees a
		 * dangling netpoll via slave->np.
		 */
		slave->np = NULL;
		__netpoll_free(np);
	}
}

/* Netpoll poll handler: poll every usable slave.  In 802.3ad mode only
 * slaves belonging to the active aggregator are polled.
 */
static void bond_poll_controller(struct net_device *bond_dev)
{
	struct bonding *bond = netdev_priv(bond_dev);
	struct slave *slave = NULL;
	struct list_head *iter;
	struct ad_info ad_info;

	/* Nonzero return means no active aggregator info is available. */
	if (BOND_MODE(bond) == BOND_MODE_8023AD)
		if (bond_3ad_get_active_agg_info(bond, &ad_info))
			return;

	bond_for_each_slave_rcu(bond, slave, iter) {
		if (!bond_slave_is_up(slave))
			continue;

		if (BOND_MODE(bond) == BOND_MODE_8023AD) {
			struct aggregator *agg =
			    SLAVE_AD_INFO(slave)->port.aggregator;

			/* Skip slaves outside the active aggregator. */
			if (agg &&
			    agg->aggregator_identifier != ad_info.aggregator_id)
				continue;
		}

		netpoll_poll_dev(slave->dev);
	}
}

/* Tear down netpoll state on every up slave of @bond_dev. */
static void bond_netpoll_cleanup(struct net_device *bond_dev)
{
	struct bonding *bond = netdev_priv(bond_dev);
	struct list_head *iter;
	struct slave *slave;

	bond_for_each_slave(bond, slave, iter)
		if (bond_slave_is_up(slave))
			slave_disable_netpoll(slave);
}

/* Enable netpoll on all slaves; on any failure, undo the ones already
 * enabled via bond_netpoll_cleanup() and return the error.
 */
static int bond_netpoll_setup(struct net_device *dev, struct netpoll_info *ni)
{
	struct bonding *bond = netdev_priv(dev);
	struct list_head *iter;
	struct slave *slave;
	int err = 0;

	bond_for_each_slave(bond, slave, iter) {
		err = slave_enable_netpoll(slave);
		if (err) {
			bond_netpoll_cleanup(dev);
			break;
		}
	}
	return err;
}
#else
/* Netpoll disabled: no-op stubs so callers need no #ifdefs of their own. */
static inline int slave_enable_netpoll(struct slave *slave)
{
	return 0;
}
static inline void slave_disable_netpoll(struct slave *slave)
{
}
static void bond_netpoll_cleanup(struct net_device *bond_dev)
{
}
#endif

L
Linus Torvalds 已提交
1356 1357
/*---------------------------------- IOCTL ----------------------------------*/

/* ndo_fix_features: constrain the bond's feature set to what every slave
 * can support (intersection for one-for-all flags, union for all-for-all).
 */
static netdev_features_t bond_fix_features(struct net_device *dev,
					   netdev_features_t features)
{
	struct bonding *bond = netdev_priv(dev);
	struct list_head *iter;
	netdev_features_t mask;
	struct slave *slave;

#if IS_ENABLED(CONFIG_TLS_DEVICE)
	/* TLS offload is only advertised when the mode supports it. */
	if (bond_sk_check(bond))
		features |= BOND_TLS_FEATURES;
	else
		features &= ~BOND_TLS_FEATURES;
#endif

	mask = features;

	features &= ~NETIF_F_ONE_FOR_ALL;
	features |= NETIF_F_ALL_FOR_ALL;

	bond_for_each_slave(bond, slave, iter) {
		features = netdev_increment_features(features,
						     slave->dev->features,
						     mask);
	}
	features = netdev_add_tso_features(features, mask);

	return features;
}

1388
/* Feature masks used by bond_compute_features() when accumulating the
 * vlan/encapsulation/MPLS feature sets across all slaves.
 */
#define BOND_VLAN_FEATURES	(NETIF_F_HW_CSUM | NETIF_F_SG | \
				 NETIF_F_FRAGLIST | NETIF_F_GSO_SOFTWARE | \
				 NETIF_F_HIGHDMA | NETIF_F_LRO)

#define BOND_ENC_FEATURES	(NETIF_F_HW_CSUM | NETIF_F_SG | \
				 NETIF_F_RXCSUM | NETIF_F_GSO_SOFTWARE)

#define BOND_MPLS_FEATURES	(NETIF_F_HW_CSUM | NETIF_F_SG | \
				 NETIF_F_GSO_SOFTWARE)


/* Recompute the bond device's offload capabilities as the combination of
 * all slaves' capabilities, then propagate via netdev_change_features().
 * Called whenever the slave set changes.
 */
static void bond_compute_features(struct bonding *bond)
{
	unsigned int dst_release_flag = IFF_XMIT_DST_RELEASE |
					IFF_XMIT_DST_RELEASE_PERM;
	netdev_features_t vlan_features = BOND_VLAN_FEATURES;
	netdev_features_t enc_features  = BOND_ENC_FEATURES;
#ifdef CONFIG_XFRM_OFFLOAD
	netdev_features_t xfrm_features  = BOND_XFRM_FEATURES;
#endif /* CONFIG_XFRM_OFFLOAD */
	netdev_features_t mpls_features  = BOND_MPLS_FEATURES;
	struct net_device *bond_dev = bond->dev;
	struct list_head *iter;
	struct slave *slave;
	unsigned short max_hard_header_len = ETH_HLEN;
	unsigned int gso_max_size = GSO_MAX_SIZE;
	u16 gso_max_segs = GSO_MAX_SEGS;

	/* No slaves: fall through to "done" with the default masks. */
	if (!bond_has_slaves(bond))
		goto done;
	vlan_features &= NETIF_F_ALL_FOR_ALL;
	mpls_features &= NETIF_F_ALL_FOR_ALL;

	bond_for_each_slave(bond, slave, iter) {
		vlan_features = netdev_increment_features(vlan_features,
			slave->dev->vlan_features, BOND_VLAN_FEATURES);

		enc_features = netdev_increment_features(enc_features,
							 slave->dev->hw_enc_features,
							 BOND_ENC_FEATURES);

#ifdef CONFIG_XFRM_OFFLOAD
		xfrm_features = netdev_increment_features(xfrm_features,
							  slave->dev->hw_enc_features,
							  BOND_XFRM_FEATURES);
#endif /* CONFIG_XFRM_OFFLOAD */

		mpls_features = netdev_increment_features(mpls_features,
							  slave->dev->mpls_features,
							  BOND_MPLS_FEATURES);

		dst_release_flag &= slave->dev->priv_flags;
		/* Header length and GSO limits are bounded by the most
		 * restrictive slave.
		 */
		if (slave->dev->hard_header_len > max_hard_header_len)
			max_hard_header_len = slave->dev->hard_header_len;

		gso_max_size = min(gso_max_size, slave->dev->gso_max_size);
		gso_max_segs = min(gso_max_segs, slave->dev->gso_max_segs);
	}
	bond_dev->hard_header_len = max_hard_header_len;

done:
	bond_dev->vlan_features = vlan_features;
	bond_dev->hw_enc_features = enc_features | NETIF_F_GSO_ENCAP_ALL |
				    NETIF_F_HW_VLAN_CTAG_TX |
				    NETIF_F_HW_VLAN_STAG_TX;
#ifdef CONFIG_XFRM_OFFLOAD
	bond_dev->hw_enc_features |= xfrm_features;
#endif /* CONFIG_XFRM_OFFLOAD */
	bond_dev->mpls_features = mpls_features;
	bond_dev->gso_max_segs = gso_max_segs;
	netif_set_gso_max_size(bond_dev, gso_max_size);

	/* XMIT_DST_RELEASE only if every slave allows it permanently. */
	bond_dev->priv_flags &= ~IFF_XMIT_DST_RELEASE;
	if ((bond_dev->priv_flags & IFF_XMIT_DST_RELEASE_PERM) &&
	    dst_release_flag == (IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM))
		bond_dev->priv_flags |= IFF_XMIT_DST_RELEASE;

	netdev_change_features(bond_dev);
}

/* Copy link-layer identity (header ops, type, lengths, broadcast address)
 * from the first enslaved device onto the bond, for non-Ethernet slaves.
 */
static void bond_setup_by_slave(struct net_device *bond_dev,
				struct net_device *slave_dev)
{
	bond_dev->header_ops	    = slave_dev->header_ops;

	bond_dev->type		    = slave_dev->type;
	bond_dev->hard_header_len   = slave_dev->hard_header_len;
	bond_dev->needed_headroom   = slave_dev->needed_headroom;
	bond_dev->addr_len	    = slave_dev->addr_len;

	memcpy(bond_dev->broadcast, slave_dev->broadcast,
		slave_dev->addr_len);
}

/* On bonding slaves other than the currently active slave, suppress
1483
 * duplicates except for alb non-mcast/bcast.
1484 1485
 */
static bool bond_should_deliver_exact_match(struct sk_buff *skb,
1486 1487
					    struct slave *slave,
					    struct bonding *bond)
1488
{
1489
	if (bond_is_slave_inactive(slave)) {
1490
		if (BOND_MODE(bond) == BOND_MODE_ALB &&
1491 1492 1493 1494 1495 1496 1497 1498
		    skb->pkt_type != PACKET_BROADCAST &&
		    skb->pkt_type != PACKET_MULTICAST)
			return false;
		return true;
	}
	return false;
}

1499
/* rx_handler installed on every slave; runs in the RCU-protected RX path.
 * Steers received skbs to the bond device, feeding ARP/LACP probes to the
 * registered recv_probe first.
 */
static rx_handler_result_t bond_handle_frame(struct sk_buff **pskb)
{
	struct sk_buff *skb = *pskb;
	struct slave *slave;
	struct bonding *bond;
	int (*recv_probe)(const struct sk_buff *, struct bonding *,
			  struct slave *);
	int ret = RX_HANDLER_ANOTHER;

	/* We may need to write to the skb below; unshare it first. */
	skb = skb_share_check(skb, GFP_ATOMIC);
	if (unlikely(!skb))
		return RX_HANDLER_CONSUMED;

	*pskb = skb;

	slave = bond_slave_get_rcu(skb->dev);
	bond = slave->bond;

	/* Mode-specific probe handler (e.g. ARP monitor, LACPDU). */
	recv_probe = READ_ONCE(bond->recv_probe);
	if (recv_probe) {
		ret = recv_probe(skb, bond, slave);
		if (ret == RX_HANDLER_CONSUMED) {
			consume_skb(skb);
			return ret;
		}
	}

	/*
	 * For packets determined by bond_should_deliver_exact_match() call to
	 * be suppressed we want to make an exception for link-local packets.
	 * This is necessary for e.g. LLDP daemons to be able to monitor
	 * inactive slave links without being forced to bind to them
	 * explicitly.
	 *
	 * At the same time, packets that are passed to the bonding master
	 * (including link-local ones) can have their originating interface
	 * determined via PACKET_ORIGDEV socket option.
	 */
	if (bond_should_deliver_exact_match(skb, slave, bond)) {
		if (is_link_local_ether_addr(eth_hdr(skb)->h_dest))
			return RX_HANDLER_PASS;
		return RX_HANDLER_EXACT;
	}

	skb->dev = bond->dev;

	/* ALB behind a bridge: rewrite the destination MAC to the bond's so
	 * the bridge sees a consistent address regardless of which slave's
	 * MAC the peer replied to.
	 */
	if (BOND_MODE(bond) == BOND_MODE_ALB &&
	    netif_is_bridge_port(bond->dev) &&
	    skb->pkt_type == PACKET_HOST) {

		if (unlikely(skb_cow_head(skb,
					  skb->data - skb_mac_header(skb)))) {
			kfree_skb(skb);
			return RX_HANDLER_CONSUMED;
		}
		bond_hw_addr_copy(eth_hdr(skb)->h_dest, bond->dev->dev_addr,
				  bond->dev->addr_len);
	}

	return ret;
}

/* Map the bonding mode to the generic LAG TX type reported to lower
 * devices via netdev_master_upper_dev_link().
 */
static enum netdev_lag_tx_type bond_lag_tx_type(struct bonding *bond)
{
	switch (BOND_MODE(bond)) {
	case BOND_MODE_ROUNDROBIN:
		return NETDEV_LAG_TX_TYPE_ROUNDROBIN;
	case BOND_MODE_ACTIVEBACKUP:
		return NETDEV_LAG_TX_TYPE_ACTIVEBACKUP;
	case BOND_MODE_BROADCAST:
		return NETDEV_LAG_TX_TYPE_BROADCAST;
	case BOND_MODE_XOR:
	case BOND_MODE_8023AD:
		return NETDEV_LAG_TX_TYPE_HASH;
	default:
		/* ALB/TLB have no generic LAG equivalent. */
		return NETDEV_LAG_TX_TYPE_UNKNOWN;
	}
}

/* Map the bond's xmit_policy to the generic LAG hash type; only meaningful
 * when the TX type is hash-based.
 */
static enum netdev_lag_hash bond_lag_hash_type(struct bonding *bond,
					       enum netdev_lag_tx_type type)
{
	if (type != NETDEV_LAG_TX_TYPE_HASH)
		return NETDEV_LAG_HASH_NONE;

	switch (bond->params.xmit_policy) {
	case BOND_XMIT_POLICY_LAYER2:
		return NETDEV_LAG_HASH_L2;
	case BOND_XMIT_POLICY_LAYER34:
		return NETDEV_LAG_HASH_L34;
	case BOND_XMIT_POLICY_LAYER23:
		return NETDEV_LAG_HASH_L23;
	case BOND_XMIT_POLICY_ENCAP23:
		return NETDEV_LAG_HASH_E23;
	case BOND_XMIT_POLICY_ENCAP34:
		return NETDEV_LAG_HASH_E34;
	case BOND_XMIT_POLICY_VLAN_SRCMAC:
		return NETDEV_LAG_HASH_VLAN_SRCMAC;
	default:
		return NETDEV_LAG_HASH_UNKNOWN;
	}
}

static int bond_master_upper_dev_link(struct bonding *bond, struct slave *slave,
				      struct netlink_ext_ack *extack)
1604 1605
{
	struct netdev_lag_upper_info lag_upper_info;
1606
	enum netdev_lag_tx_type type;
1607

1608 1609 1610
	type = bond_lag_tx_type(bond);
	lag_upper_info.tx_type = type;
	lag_upper_info.hash_type = bond_lag_hash_type(bond, type);
1611 1612 1613

	return netdev_master_upper_dev_link(slave->dev, bond->dev, slave,
					    &lag_upper_info, extack);
1614 1615
}

1616
/* Undo bond_master_upper_dev_link(): detach the slave from the bond's
 * upper/lower topology and clear its IFF_SLAVE flag.
 */
static void bond_upper_dev_unlink(struct bonding *bond, struct slave *slave)
{
	netdev_upper_dev_unlink(slave->dev, bond->dev);
	slave->dev->flags &= ~IFF_SLAVE;
}

/* kobject release callback: final teardown of a struct slave once its
 * last reference is dropped (frees 8023ad state and the slave itself).
 */
static void slave_kobj_release(struct kobject *kobj)
{
	struct slave *slave = to_slave(kobj);
	struct bonding *bond = bond_get_bond_by_slave(slave);

	/* Make sure no deferred notify work runs after the slave is freed. */
	cancel_delayed_work_sync(&slave->notify_work);
	if (BOND_MODE(bond) == BOND_MODE_8023AD)
		kfree(SLAVE_AD_INFO(slave));

	kfree(slave);
}

/* kobject type for per-slave sysfs entries ("bonding_slave"). */
static struct kobj_type slave_ktype = {
	.release = slave_kobj_release,
#ifdef CONFIG_SYSFS
	.sysfs_ops = &slave_sysfs_ops,
#endif
};

/* Initialize the slave's kobject under the slave netdev's sysfs directory.
 * On failure the kobject_put() triggers slave_kobj_release(), which frees
 * the slave — the caller must not free it again.
 */
static int bond_kobj_init(struct slave *slave)
{
	int err;

	err = kobject_init_and_add(&slave->kobj, &slave_ktype,
				   &(slave->dev->dev.kobj), "bonding_slave");
	if (err)
		kobject_put(&slave->kobj);

	return err;
}

/* Allocate and minimally initialize a struct slave for @slave_dev.
 * Returns NULL on failure.  Note: once bond_kobj_init() has run, the
 * slave's lifetime is owned by its kobject — failure paths use
 * kobject_put() (which frees via slave_kobj_release()), not kfree().
 */
static struct slave *bond_alloc_slave(struct bonding *bond,
				      struct net_device *slave_dev)
{
	struct slave *slave = NULL;

	slave = kzalloc(sizeof(*slave), GFP_KERNEL);
	if (!slave)
		return NULL;

	slave->bond = bond;
	slave->dev = slave_dev;
	INIT_DELAYED_WORK(&slave->notify_work, bond_netdev_notify_work);

	/* On error bond_kobj_init() already dropped the kobject ref,
	 * which freed the slave.
	 */
	if (bond_kobj_init(slave))
		return NULL;

	if (BOND_MODE(bond) == BOND_MODE_8023AD) {
		SLAVE_AD_INFO(slave) = kzalloc(sizeof(struct ad_slave_info),
					       GFP_KERNEL);
		if (!SLAVE_AD_INFO(slave)) {
			kobject_put(&slave->kobj);
			return NULL;
		}
	}

	return slave;
}

/* Fill a userspace-visible struct ifbond snapshot from @bond. */
static void bond_fill_ifbond(struct bonding *bond, struct ifbond *info)
{
	info->bond_mode = BOND_MODE(bond);
	info->miimon = bond->params.miimon;
	info->num_slaves = bond->slave_cnt;
}

/* Fill a userspace-visible struct ifslave snapshot from @slave. */
static void bond_fill_ifslave(struct slave *slave, struct ifslave *info)
{
	strcpy(info->slave_name, slave->dev->name);
	info->link = slave->link;
	info->state = bond_slave_state(slave);
	info->link_failure_count = slave->link_failure_count;
}

/* Deferred worker that publishes bonding info for a slave via
 * netdev_bonding_info_change().  Requires RTNL; if it cannot be taken
 * without blocking, the work is re-queued and retried later.
 */
static void bond_netdev_notify_work(struct work_struct *_work)
{
	struct slave *slave = container_of(_work, struct slave,
					   notify_work.work);

	if (rtnl_trylock()) {
		struct netdev_bonding_info binfo;

		bond_fill_ifslave(slave, &binfo.slave);
		bond_fill_ifbond(slave->bond, &binfo.master);
		netdev_bonding_info_change(slave->dev, &binfo);
		rtnl_unlock();
	} else {
		/* RTNL busy: retry on the bond workqueue after 1 jiffy. */
		queue_delayed_work(slave->bond->wq, &slave->notify_work, 1);
	}
}

/* Schedule an immediate bonding-info notification for @slave. */
void bond_queue_slave_event(struct slave *slave)
{
	queue_delayed_work(slave->bond->wq, &slave->notify_work, 0);
}

/* Propagate the slave's link/tx state to the lower-device LAG state so
 * upper layers (e.g. switchdev drivers) see it.
 */
void bond_lower_state_changed(struct slave *slave)
{
	struct netdev_lag_lower_state_info info;

	/* BOND_LINK_FAIL still counts as link-up (downdelay running). */
	info.link_up = slave->link == BOND_LINK_UP ||
		       slave->link == BOND_LINK_FAIL;
	info.tx_enabled = bond_is_active_slave(slave);
	netdev_lower_state_changed(slave->dev, &info);
}

L
Linus Torvalds 已提交
1728
/* enslave device <slave> to bond device <master> */
D
David Ahern 已提交
1729 1730
int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev,
		 struct netlink_ext_ack *extack)
L
Linus Torvalds 已提交
1731
{
1732
	struct bonding *bond = netdev_priv(bond_dev);
1733
	const struct net_device_ops *slave_ops = slave_dev->netdev_ops;
1734
	struct slave *new_slave = NULL, *prev_slave;
1735
	struct sockaddr_storage ss;
L
Linus Torvalds 已提交
1736
	int link_reporting;
1737
	int res = 0, i;
L
Linus Torvalds 已提交
1738

1739 1740 1741
	if (slave_dev->flags & IFF_MASTER &&
	    !netif_is_bond_master(slave_dev)) {
		NL_SET_ERR_MSG(extack, "Device with IFF_MASTER cannot be enslaved");
1742 1743 1744 1745 1746
		netdev_err(bond_dev,
			   "Error: Device with IFF_MASTER cannot be enslaved\n");
		return -EPERM;
	}

1747 1748
	if (!bond->params.use_carrier &&
	    slave_dev->ethtool_ops->get_link == NULL &&
1749
	    slave_ops->ndo_eth_ioctl == NULL) {
1750
		slave_warn(bond_dev, slave_dev, "no link monitoring support\n");
L
Linus Torvalds 已提交
1751 1752
	}

M
Mahesh Bandewar 已提交
1753 1754
	/* already in-use? */
	if (netdev_is_rx_handler_busy(slave_dev)) {
1755
		NL_SET_ERR_MSG(extack, "Device is in use and cannot be enslaved");
1756 1757
		slave_err(bond_dev, slave_dev,
			  "Error: Device is in use and cannot be enslaved\n");
L
Linus Torvalds 已提交
1758 1759 1760
		return -EBUSY;
	}

1761
	if (bond_dev == slave_dev) {
1762
		NL_SET_ERR_MSG(extack, "Cannot enslave bond to itself.");
1763
		netdev_err(bond_dev, "cannot enslave bond to itself.\n");
1764 1765 1766
		return -EPERM;
	}

L
Linus Torvalds 已提交
1767 1768 1769
	/* vlan challenged mutual exclusion */
	/* no need to lock since we're protected by rtnl_lock */
	if (slave_dev->features & NETIF_F_VLAN_CHALLENGED) {
1770
		slave_dbg(bond_dev, slave_dev, "is NETIF_F_VLAN_CHALLENGED\n");
1771
		if (vlan_uses_dev(bond_dev)) {
1772
			NL_SET_ERR_MSG(extack, "Can not enslave VLAN challenged device to VLAN enabled bond");
1773
			slave_err(bond_dev, slave_dev, "Error: cannot enslave VLAN challenged slave on VLAN enabled bond\n");
L
Linus Torvalds 已提交
1774 1775
			return -EPERM;
		} else {
1776
			slave_warn(bond_dev, slave_dev, "enslaved VLAN challenged slave. Adding VLANs will be blocked as long as it is part of bond.\n");
L
Linus Torvalds 已提交
1777 1778
		}
	} else {
1779
		slave_dbg(bond_dev, slave_dev, "is !NETIF_F_VLAN_CHALLENGED\n");
L
Linus Torvalds 已提交
1780 1781
	}

1782 1783 1784
	if (slave_dev->features & NETIF_F_HW_ESP)
		slave_dbg(bond_dev, slave_dev, "is esp-hw-offload capable\n");

1785
	/* Old ifenslave binaries are no longer supported.  These can
S
Stephen Hemminger 已提交
1786
	 * be identified with moderate accuracy by the state of the slave:
1787 1788 1789
	 * the current ifenslave will set the interface down prior to
	 * enslaving it; the old ifenslave will not.
	 */
Y
yzhu1 已提交
1790
	if (slave_dev->flags & IFF_UP) {
1791
		NL_SET_ERR_MSG(extack, "Device can not be enslaved while up");
1792
		slave_err(bond_dev, slave_dev, "slave is up - this may be due to an out of date ifenslave\n");
1793
		return -EPERM;
1794
	}
L
Linus Torvalds 已提交
1795

1796 1797 1798 1799 1800 1801 1802
	/* set bonding device ether type by slave - bonding netdevices are
	 * created with ether_setup, so when the slave type is not ARPHRD_ETHER
	 * there is a need to override some of the type dependent attribs/funcs.
	 *
	 * bond ether type mutual exclusion - don't allow slaves of dissimilar
	 * ether type (eg ARPHRD_ETHER and ARPHRD_INFINIBAND) share the same bond
	 */
1803
	if (!bond_has_slaves(bond)) {
1804
		if (bond_dev->type != slave_dev->type) {
1805 1806
			slave_dbg(bond_dev, slave_dev, "change device type from %d to %d\n",
				  bond_dev->type, slave_dev->type);
1807

1808 1809
			res = call_netdevice_notifiers(NETDEV_PRE_TYPE_CHANGE,
						       bond_dev);
1810 1811
			res = notifier_to_errno(res);
			if (res) {
1812
				slave_err(bond_dev, slave_dev, "refused to change device type\n");
1813
				return -EBUSY;
1814
			}
1815

1816
			/* Flush unicast and multicast addresses */
1817
			dev_uc_flush(bond_dev);
1818
			dev_mc_flush(bond_dev);
1819

1820 1821
			if (slave_dev->type != ARPHRD_ETHER)
				bond_setup_by_slave(bond_dev, slave_dev);
1822
			else {
1823
				ether_setup(bond_dev);
1824 1825
				bond_dev->priv_flags &= ~IFF_TX_SKB_SHARING;
			}
1826

1827 1828
			call_netdevice_notifiers(NETDEV_POST_TYPE_CHANGE,
						 bond_dev);
1829
		}
1830
	} else if (bond_dev->type != slave_dev->type) {
1831
		NL_SET_ERR_MSG(extack, "Device type is different from other slaves");
1832 1833
		slave_err(bond_dev, slave_dev, "ether type (%d) is different from other slaves (%d), can not enslave it\n",
			  slave_dev->type, bond_dev->type);
1834
		return -EINVAL;
1835 1836
	}

1837 1838
	if (slave_dev->type == ARPHRD_INFINIBAND &&
	    BOND_MODE(bond) != BOND_MODE_ACTIVEBACKUP) {
1839
		NL_SET_ERR_MSG(extack, "Only active-backup mode is supported for infiniband slaves");
1840 1841
		slave_warn(bond_dev, slave_dev, "Type (%d) supports only active-backup mode\n",
			   slave_dev->type);
1842 1843 1844 1845 1846 1847
		res = -EOPNOTSUPP;
		goto err_undo_flags;
	}

	if (!slave_ops->ndo_set_mac_address ||
	    slave_dev->type == ARPHRD_INFINIBAND) {
1848
		slave_warn(bond_dev, slave_dev, "The slave device specified does not support setting the MAC address\n");
1849 1850 1851
		if (BOND_MODE(bond) == BOND_MODE_ACTIVEBACKUP &&
		    bond->params.fail_over_mac != BOND_FOM_ACTIVE) {
			if (!bond_has_slaves(bond)) {
1852
				bond->params.fail_over_mac = BOND_FOM_ACTIVE;
1853
				slave_warn(bond_dev, slave_dev, "Setting fail_over_mac to active for active-backup mode\n");
1854
			} else {
1855
				NL_SET_ERR_MSG(extack, "Slave device does not support setting the MAC address, but fail_over_mac is not set to active");
1856
				slave_err(bond_dev, slave_dev, "The slave device specified does not support setting the MAC address, but fail_over_mac is not set to active\n");
1857 1858
				res = -EOPNOTSUPP;
				goto err_undo_flags;
1859
			}
1860
		}
L
Linus Torvalds 已提交
1861 1862
	}

1863 1864
	call_netdevice_notifiers(NETDEV_JOIN, slave_dev);

1865
	/* If this is the first slave, then we need to set the master's hardware
1866 1867
	 * address to be the same as the slave's.
	 */
1868
	if (!bond_has_slaves(bond) &&
1869 1870 1871 1872 1873
	    bond->dev->addr_assign_type == NET_ADDR_RANDOM) {
		res = bond_set_dev_addr(bond->dev, slave_dev);
		if (res)
			goto err_undo_flags;
	}
1874

1875
	new_slave = bond_alloc_slave(bond, slave_dev);
L
Linus Torvalds 已提交
1876 1877 1878 1879
	if (!new_slave) {
		res = -ENOMEM;
		goto err_undo_flags;
	}
1880

1881
	/* Set the new_slave's queue_id to be zero.  Queue ID mapping
1882 1883 1884 1885
	 * is set via sysfs or module option if desired.
	 */
	new_slave->queue_id = 0;

1886 1887 1888 1889
	/* Save slave's original mtu and then set it to match the bond */
	new_slave->original_mtu = slave_dev->mtu;
	res = dev_set_mtu(slave_dev, bond->dev->mtu);
	if (res) {
1890
		slave_err(bond_dev, slave_dev, "Error %d calling dev_set_mtu\n", res);
1891 1892 1893
		goto err_free;
	}

1894
	/* Save slave's original ("permanent") mac address for modes
1895 1896 1897
	 * that need it, and for restoring it upon release, and then
	 * set it to the master's address
	 */
1898 1899
	bond_hw_addr_copy(new_slave->perm_hwaddr, slave_dev->dev_addr,
			  slave_dev->addr_len);
L
Linus Torvalds 已提交
1900

1901
	if (!bond->params.fail_over_mac ||
1902
	    BOND_MODE(bond) != BOND_MODE_ACTIVEBACKUP) {
1903
		/* Set slave to master's mac address.  The application already
1904 1905
		 * set the master's mac address to that of the first slave
		 */
1906 1907
		memcpy(ss.__data, bond_dev->dev_addr, bond_dev->addr_len);
		ss.ss_family = slave_dev->type;
1908 1909
		res = dev_set_mac_address(slave_dev, (struct sockaddr *)&ss,
					  extack);
1910
		if (res) {
1911
			slave_err(bond_dev, slave_dev, "Error %d calling set_mac_address\n", res);
1912
			goto err_restore_mtu;
1913
		}
1914
	}
L
Linus Torvalds 已提交
1915

1916 1917 1918
	/* set slave flag before open to prevent IPv6 addrconf */
	slave_dev->flags |= IFF_SLAVE;

1919
	/* open the slave since the application closed it */
1920
	res = dev_open(slave_dev, extack);
1921
	if (res) {
1922
		slave_err(bond_dev, slave_dev, "Opening slave failed\n");
1923
		goto err_restore_mac;
L
Linus Torvalds 已提交
1924 1925
	}

1926
	slave_dev->priv_flags |= IFF_BONDING;
1927 1928
	/* initialize slave stats */
	dev_get_stats(new_slave->dev, &new_slave->slave_stats);
L
Linus Torvalds 已提交
1929

1930
	if (bond_is_lb(bond)) {
L
Linus Torvalds 已提交
1931 1932 1933 1934
		/* bond_alb_init_slave() must be called before all other stages since
		 * it might fail and we do not want to have to undo everything
		 */
		res = bond_alb_init_slave(bond, new_slave);
S
Stephen Hemminger 已提交
1935
		if (res)
1936
			goto err_close;
L
Linus Torvalds 已提交
1937 1938
	}

1939 1940
	res = vlan_vids_add_by_dev(slave_dev, bond_dev);
	if (res) {
1941
		slave_err(bond_dev, slave_dev, "Couldn't add bond vlan ids\n");
1942
		goto err_close;
1943
	}
L
Linus Torvalds 已提交
1944

1945
	prev_slave = bond_last_slave(bond);
L
Linus Torvalds 已提交
1946 1947 1948 1949

	new_slave->delay = 0;
	new_slave->link_failure_count = 0;

1950 1951
	if (bond_update_speed_duplex(new_slave) &&
	    bond_needs_speed_duplex(bond))
1952
		new_slave->link = BOND_LINK_DOWN;
1953

1954
	new_slave->last_rx = jiffies -
1955
		(msecs_to_jiffies(bond->params.arp_interval) + 1);
1956
	for (i = 0; i < BOND_MAX_ARP_TARGETS; i++)
1957
		new_slave->target_last_arp_rx[i] = new_slave->last_rx;
1958

L
Linus Torvalds 已提交
1959 1960 1961 1962
	if (bond->params.miimon && !bond->params.use_carrier) {
		link_reporting = bond_check_dev_link(bond, slave_dev, 1);

		if ((link_reporting == -1) && !bond->params.arp_interval) {
1963
			/* miimon is set but a bonded network driver
L
Linus Torvalds 已提交
1964 1965 1966 1967 1968 1969 1970
			 * does not support ETHTOOL/MII and
			 * arp_interval is not set.  Note: if
			 * use_carrier is enabled, we will never go
			 * here (because netif_carrier is always
			 * supported); thus, we don't need to change
			 * the messages for netif_carrier.
			 */
1971
			slave_warn(bond_dev, slave_dev, "MII and ETHTOOL support not available for slave, and arp_interval/arp_ip_target module parameters not specified, thus bonding will not detect link failures! see bonding.txt for details\n");
L
Linus Torvalds 已提交
1972 1973
		} else if (link_reporting == -1) {
			/* unable get link status using mii/ethtool */
1974
			slave_warn(bond_dev, slave_dev, "can't get link status from slave; the network driver associated with this interface does not support MII or ETHTOOL link status reporting, thus miimon has no effect on this interface\n");
L
Linus Torvalds 已提交
1975 1976 1977 1978
		}
	}

	/* check for initial state */
1979
	new_slave->link = BOND_LINK_NOCHANGE;
1980 1981 1982
	if (bond->params.miimon) {
		if (bond_check_dev_link(bond, slave_dev, 0) == BMSR_LSTATUS) {
			if (bond->params.updelay) {
1983
				bond_set_slave_link_state(new_slave,
1984 1985
							  BOND_LINK_BACK,
							  BOND_SLAVE_NOTIFY_NOW);
1986 1987
				new_slave->delay = bond->params.updelay;
			} else {
1988
				bond_set_slave_link_state(new_slave,
1989 1990
							  BOND_LINK_UP,
							  BOND_SLAVE_NOTIFY_NOW);
1991
			}
L
Linus Torvalds 已提交
1992
		} else {
1993 1994
			bond_set_slave_link_state(new_slave, BOND_LINK_DOWN,
						  BOND_SLAVE_NOTIFY_NOW);
L
Linus Torvalds 已提交
1995
		}
1996
	} else if (bond->params.arp_interval) {
1997 1998
		bond_set_slave_link_state(new_slave,
					  (netif_carrier_ok(slave_dev) ?
1999 2000
					  BOND_LINK_UP : BOND_LINK_DOWN),
					  BOND_SLAVE_NOTIFY_NOW);
L
Linus Torvalds 已提交
2001
	} else {
2002 2003
		bond_set_slave_link_state(new_slave, BOND_LINK_UP,
					  BOND_SLAVE_NOTIFY_NOW);
L
Linus Torvalds 已提交
2004 2005
	}

2006
	if (new_slave->link != BOND_LINK_DOWN)
2007
		new_slave->last_link_up = jiffies;
2008 2009 2010
	slave_dbg(bond_dev, slave_dev, "Initial state of slave is BOND_LINK_%s\n",
		  new_slave->link == BOND_LINK_DOWN ? "DOWN" :
		  (new_slave->link == BOND_LINK_UP ? "UP" : "BACK"));
2011

2012
	if (bond_uses_primary(bond) && bond->params.primary[0]) {
L
Linus Torvalds 已提交
2013
		/* if there is a primary slave, remember it */
2014
		if (strcmp(bond->params.primary, new_slave->dev->name) == 0) {
2015
			rcu_assign_pointer(bond->primary_slave, new_slave);
2016 2017
			bond->force_primary = true;
		}
L
Linus Torvalds 已提交
2018 2019
	}

2020
	switch (BOND_MODE(bond)) {
L
Linus Torvalds 已提交
2021
	case BOND_MODE_ACTIVEBACKUP:
2022 2023
		bond_set_slave_inactive_flags(new_slave,
					      BOND_SLAVE_NOTIFY_NOW);
L
Linus Torvalds 已提交
2024 2025 2026 2027 2028 2029
		break;
	case BOND_MODE_8023AD:
		/* in 802.3ad mode, the internal mechanism
		 * will activate the slaves in the selected
		 * aggregator
		 */
2030
		bond_set_slave_inactive_flags(new_slave, BOND_SLAVE_NOTIFY_NOW);
L
Linus Torvalds 已提交
2031
		/* if this is the first slave */
2032
		if (!prev_slave) {
2033
			SLAVE_AD_INFO(new_slave)->id = 1;
L
Linus Torvalds 已提交
2034 2035 2036
			/* Initialize AD with the number of times that the AD timer is called in 1 second
			 * can be called only after the mac address of the bond is set
			 */
2037
			bond_3ad_initialize(bond, 1000/AD_TIMER_INTERVAL);
L
Linus Torvalds 已提交
2038
		} else {
2039 2040
			SLAVE_AD_INFO(new_slave)->id =
				SLAVE_AD_INFO(prev_slave)->id + 1;
L
Linus Torvalds 已提交
2041 2042 2043 2044 2045 2046
		}

		bond_3ad_bind_slave(new_slave);
		break;
	case BOND_MODE_TLB:
	case BOND_MODE_ALB:
J
Jiri Pirko 已提交
2047
		bond_set_active_slave(new_slave);
2048
		bond_set_slave_inactive_flags(new_slave, BOND_SLAVE_NOTIFY_NOW);
L
Linus Torvalds 已提交
2049 2050
		break;
	default:
2051
		slave_dbg(bond_dev, slave_dev, "This slave is always active in trunk mode\n");
L
Linus Torvalds 已提交
2052 2053

		/* always active in trunk mode */
J
Jiri Pirko 已提交
2054
		bond_set_active_slave(new_slave);
L
Linus Torvalds 已提交
2055 2056 2057 2058 2059

		/* In trunking mode there is little meaning to curr_active_slave
		 * anyway (it holds no special properties of the bond device),
		 * so we can change it without calling change_active_interface()
		 */
2060 2061
		if (!rcu_access_pointer(bond->curr_active_slave) &&
		    new_slave->link == BOND_LINK_UP)
2062
			rcu_assign_pointer(bond->curr_active_slave, new_slave);
S
Stephen Hemminger 已提交
2063

L
Linus Torvalds 已提交
2064 2065 2066
		break;
	} /* switch(bond_mode) */

2067
#ifdef CONFIG_NET_POLL_CONTROLLER
2068
	if (bond->dev->npinfo) {
2069
		if (slave_enable_netpoll(new_slave)) {
2070
			slave_info(bond_dev, slave_dev, "master_dev is using netpoll, but new slave device does not support netpoll\n");
2071
			res = -EBUSY;
2072
			goto err_detach;
2073
		}
2074 2075
	}
#endif
2076

2077 2078 2079
	if (!(bond_dev->features & NETIF_F_LRO))
		dev_disable_lro(slave_dev);

J
Jiri Pirko 已提交
2080 2081 2082
	res = netdev_rx_handler_register(slave_dev, bond_handle_frame,
					 new_slave);
	if (res) {
2083
		slave_dbg(bond_dev, slave_dev, "Error %d calling netdev_rx_handler_register\n", res);
2084
		goto err_detach;
J
Jiri Pirko 已提交
2085 2086
	}

2087
	res = bond_master_upper_dev_link(bond, new_slave, extack);
2088
	if (res) {
2089
		slave_dbg(bond_dev, slave_dev, "Error %d calling bond_master_upper_dev_link\n", res);
2090 2091 2092
		goto err_unregister;
	}

2093 2094
	bond_lower_state_changed(new_slave);

2095 2096
	res = bond_sysfs_slave_add(new_slave);
	if (res) {
2097
		slave_dbg(bond_dev, slave_dev, "Error %d calling bond_sysfs_slave_add\n", res);
2098 2099 2100
		goto err_upper_unlink;
	}

2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114
	/* If the mode uses primary, then the following is handled by
	 * bond_change_active_slave().
	 */
	if (!bond_uses_primary(bond)) {
		/* set promiscuity level to new slave */
		if (bond_dev->flags & IFF_PROMISC) {
			res = dev_set_promiscuity(slave_dev, 1);
			if (res)
				goto err_sysfs_del;
		}

		/* set allmulti level to new slave */
		if (bond_dev->flags & IFF_ALLMULTI) {
			res = dev_set_allmulti(slave_dev, 1);
2115 2116 2117
			if (res) {
				if (bond_dev->flags & IFF_PROMISC)
					dev_set_promiscuity(slave_dev, -1);
2118
				goto err_sysfs_del;
2119
			}
2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134
		}

		netif_addr_lock_bh(bond_dev);
		dev_mc_sync_multiple(slave_dev, bond_dev);
		dev_uc_sync_multiple(slave_dev, bond_dev);
		netif_addr_unlock_bh(bond_dev);

		if (BOND_MODE(bond) == BOND_MODE_8023AD) {
			/* add lacpdu mc addr to mc list */
			u8 lacpdu_multicast[ETH_ALEN] = MULTICAST_LACPDU_ADDR;

			dev_mc_add(slave_dev, lacpdu_multicast);
		}
	}

2135 2136 2137 2138
	bond->slave_cnt++;
	bond_compute_features(bond);
	bond_set_carrier(bond);

2139
	if (bond_uses_primary(bond)) {
2140
		block_netpoll_tx();
2141
		bond_select_active_slave(bond);
2142
		unblock_netpoll_tx();
2143
	}
2144

2145
	if (bond_mode_can_use_xmit_hash(bond))
2146 2147
		bond_update_slave_arr(bond, NULL);

2148

2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183
	if (!slave_dev->netdev_ops->ndo_bpf ||
	    !slave_dev->netdev_ops->ndo_xdp_xmit) {
		if (bond->xdp_prog) {
			NL_SET_ERR_MSG(extack, "Slave does not support XDP");
			slave_err(bond_dev, slave_dev, "Slave does not support XDP\n");
			res = -EOPNOTSUPP;
			goto err_sysfs_del;
		}
	} else {
		struct netdev_bpf xdp = {
			.command = XDP_SETUP_PROG,
			.flags   = 0,
			.prog    = bond->xdp_prog,
			.extack  = extack,
		};

		if (dev_xdp_prog_count(slave_dev) > 0) {
			NL_SET_ERR_MSG(extack,
				       "Slave has XDP program loaded, please unload before enslaving");
			slave_err(bond_dev, slave_dev,
				  "Slave has XDP program loaded, please unload before enslaving\n");
			res = -EOPNOTSUPP;
			goto err_sysfs_del;
		}

		res = slave_dev->netdev_ops->ndo_bpf(slave_dev, &xdp);
		if (res < 0) {
			/* ndo_bpf() sets extack error message */
			slave_dbg(bond_dev, slave_dev, "Error %d calling ndo_bpf\n", res);
			goto err_sysfs_del;
		}
		if (bond->xdp_prog)
			bpf_prog_inc(bond->xdp_prog);
	}

2184 2185 2186
	slave_info(bond_dev, slave_dev, "Enslaving as %s interface with %s link\n",
		   bond_is_active_slave(new_slave) ? "an active" : "a backup",
		   new_slave->link != BOND_LINK_DOWN ? "an up" : "a down");
L
Linus Torvalds 已提交
2187 2188

	/* enslave is successful */
2189
	bond_queue_slave_event(new_slave);
L
Linus Torvalds 已提交
2190 2191 2192
	return 0;

/* Undo stages on error */
2193 2194 2195
err_sysfs_del:
	bond_sysfs_slave_del(new_slave);

2196
err_upper_unlink:
2197
	bond_upper_dev_unlink(bond, new_slave);
2198

2199 2200 2201
err_unregister:
	netdev_rx_handler_unregister(slave_dev);

2202
err_detach:
2203
	vlan_vids_del_by_dev(slave_dev, bond_dev);
2204 2205
	if (rcu_access_pointer(bond->primary_slave) == new_slave)
		RCU_INIT_POINTER(bond->primary_slave, NULL);
2206
	if (rcu_access_pointer(bond->curr_active_slave) == new_slave) {
2207
		block_netpoll_tx();
2208
		bond_change_active_slave(bond, NULL);
2209
		bond_select_active_slave(bond);
2210
		unblock_netpoll_tx();
2211
	}
2212 2213
	/* either primary_slave or curr_active_slave might've changed */
	synchronize_rcu();
2214
	slave_disable_netpoll(new_slave);
2215

L
Linus Torvalds 已提交
2216
err_close:
2217 2218
	if (!netif_is_bond_master(slave_dev))
		slave_dev->priv_flags &= ~IFF_BONDING;
L
Linus Torvalds 已提交
2219 2220 2221
	dev_close(slave_dev);

err_restore_mac:
2222
	slave_dev->flags &= ~IFF_SLAVE;
2223
	if (!bond->params.fail_over_mac ||
2224
	    BOND_MODE(bond) != BOND_MODE_ACTIVEBACKUP) {
2225 2226 2227 2228
		/* XXX TODO - fom follow mode needs to change master's
		 * MAC if this slave's MAC is in use by the bond, or at
		 * least print a warning.
		 */
2229 2230 2231
		bond_hw_addr_copy(ss.__data, new_slave->perm_hwaddr,
				  new_slave->dev->addr_len);
		ss.ss_family = slave_dev->type;
2232
		dev_set_mac_address(slave_dev, (struct sockaddr *)&ss, NULL);
2233
	}
L
Linus Torvalds 已提交
2234

2235 2236 2237
err_restore_mtu:
	dev_set_mtu(slave_dev, new_slave->original_mtu);

L
Linus Torvalds 已提交
2238
err_free:
2239
	kobject_put(&new_slave->kobj);
L
Linus Torvalds 已提交
2240 2241

err_undo_flags:
2242
	/* Enslave of first slave has failed and we need to fix master's mac */
2243 2244 2245 2246 2247
	if (!bond_has_slaves(bond)) {
		if (ether_addr_equal_64bits(bond_dev->dev_addr,
					    slave_dev->dev_addr))
			eth_hw_addr_random(bond_dev);
		if (bond_dev->type != ARPHRD_ETHER) {
2248
			dev_close(bond_dev);
2249 2250 2251 2252 2253
			ether_setup(bond_dev);
			bond_dev->flags |= IFF_MASTER;
			bond_dev->priv_flags &= ~IFF_TX_SKB_SHARING;
		}
	}
S
Stephen Hemminger 已提交
2254

L
Linus Torvalds 已提交
2255 2256 2257
	return res;
}

2258
/* Try to release the slave device <slave> from the bond device <master>
L
Linus Torvalds 已提交
2259
 * It is legal to access curr_active_slave without a lock because all the function
2260
 * is RTNL-locked. If "all" is true it means that the function is being called
2261
 * while destroying a bond interface and all slaves are being released.
L
Linus Torvalds 已提交
2262 2263 2264 2265 2266 2267 2268
 *
 * The rules for slave state should be:
 *   for Active/Backup:
 *     Active stays on all backups go down
 *   for Bonded connections:
 *     The first up interface should be left on and all others downed.
 */
2269 2270
static int __bond_release_one(struct net_device *bond_dev,
			      struct net_device *slave_dev,
2271
			      bool all, bool unregister)
L
Linus Torvalds 已提交
2272
{
2273
	struct bonding *bond = netdev_priv(bond_dev);
L
Linus Torvalds 已提交
2274
	struct slave *slave, *oldcurrent;
2275
	struct sockaddr_storage ss;
2276
	int old_flags = bond_dev->flags;
2277
	netdev_features_t old_features = bond_dev->features;
L
Linus Torvalds 已提交
2278 2279 2280

	/* slave is not a slave or master is not master of this slave */
	if (!(slave_dev->flags & IFF_SLAVE) ||
2281
	    !netdev_has_upper_dev(slave_dev, bond_dev)) {
2282
		slave_dbg(bond_dev, slave_dev, "cannot release slave\n");
L
Linus Torvalds 已提交
2283 2284 2285
		return -EINVAL;
	}

2286
	block_netpoll_tx();
L
Linus Torvalds 已提交
2287 2288 2289 2290

	slave = bond_get_slave_by_dev(bond, slave_dev);
	if (!slave) {
		/* not a slave of this bond */
2291
		slave_info(bond_dev, slave_dev, "interface not enslaved\n");
2292
		unblock_netpoll_tx();
L
Linus Torvalds 已提交
2293 2294 2295
		return -EINVAL;
	}

2296 2297
	bond_set_slave_inactive_flags(slave, BOND_SLAVE_NOTIFY_NOW);

2298 2299
	bond_sysfs_slave_del(slave);

2300 2301 2302
	/* recompute stats just before removing the slave */
	bond_get_stats(bond->dev, &bond->bond_stats);

2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313
	if (bond->xdp_prog) {
		struct netdev_bpf xdp = {
			.command = XDP_SETUP_PROG,
			.flags   = 0,
			.prog	 = NULL,
			.extack  = NULL,
		};
		if (slave_dev->netdev_ops->ndo_bpf(slave_dev, &xdp))
			slave_warn(bond_dev, slave_dev, "failed to unload XDP program\n");
	}

J
Jiri Pirko 已提交
2314 2315 2316 2317 2318
	/* unregister rx_handler early so bond_handle_frame wouldn't be called
	 * for this slave anymore.
	 */
	netdev_rx_handler_unregister(slave_dev);

2319
	if (BOND_MODE(bond) == BOND_MODE_8023AD)
L
Linus Torvalds 已提交
2320 2321
		bond_3ad_unbind_slave(slave);

2322 2323
	bond_upper_dev_unlink(bond, slave);

2324
	if (bond_mode_can_use_xmit_hash(bond))
2325 2326
		bond_update_slave_arr(bond, slave);

2327 2328
	slave_info(bond_dev, slave_dev, "Releasing %s interface\n",
		    bond_is_active_slave(slave) ? "active" : "backup");
L
Linus Torvalds 已提交
2329

2330
	oldcurrent = rcu_access_pointer(bond->curr_active_slave);
L
Linus Torvalds 已提交
2331

2332
	RCU_INIT_POINTER(bond->current_arp_slave, NULL);
L
Linus Torvalds 已提交
2333

2334
	if (!all && (!bond->params.fail_over_mac ||
2335
		     BOND_MODE(bond) != BOND_MODE_ACTIVEBACKUP)) {
2336
		if (ether_addr_equal_64bits(bond_dev->dev_addr, slave->perm_hwaddr) &&
2337
		    bond_has_slaves(bond))
2338 2339
			slave_warn(bond_dev, slave_dev, "the permanent HWaddr of slave - %pM - is still in use by bond - set the HWaddr of slave to a different address to avoid conflicts\n",
				   slave->perm_hwaddr);
2340 2341
	}

2342 2343
	if (rtnl_dereference(bond->primary_slave) == slave)
		RCU_INIT_POINTER(bond->primary_slave, NULL);
L
Linus Torvalds 已提交
2344

2345
	if (oldcurrent == slave)
L
Linus Torvalds 已提交
2346 2347
		bond_change_active_slave(bond, NULL);

2348
	if (bond_is_lb(bond)) {
L
Linus Torvalds 已提交
2349 2350 2351 2352 2353 2354 2355 2356
		/* Must be called only after the slave has been
		 * detached from the list and the curr_active_slave
		 * has been cleared (if our_slave == old_current),
		 * but before a new active slave is selected.
		 */
		bond_alb_deinit_slave(bond, slave);
	}

2357
	if (all) {
2358
		RCU_INIT_POINTER(bond->curr_active_slave, NULL);
2359
	} else if (oldcurrent == slave) {
2360
		/* Note that we hold RTNL over this sequence, so there
2361 2362 2363
		 * is no concern that another slave add/remove event
		 * will interfere.
		 */
L
Linus Torvalds 已提交
2364
		bond_select_active_slave(bond);
2365 2366
	}

2367
	if (!bond_has_slaves(bond)) {
2368
		bond_set_carrier(bond);
2369
		eth_hw_addr_random(bond_dev);
L
Linus Torvalds 已提交
2370 2371
	}

2372
	unblock_netpoll_tx();
2373
	synchronize_rcu();
2374
	bond->slave_cnt--;
L
Linus Torvalds 已提交
2375

2376
	if (!bond_has_slaves(bond)) {
2377
		call_netdevice_notifiers(NETDEV_CHANGEADDR, bond->dev);
2378 2379
		call_netdevice_notifiers(NETDEV_RELEASE, bond->dev);
	}
2380

2381 2382 2383
	bond_compute_features(bond);
	if (!(bond_dev->features & NETIF_F_VLAN_CHALLENGED) &&
	    (old_features & NETIF_F_VLAN_CHALLENGED))
2384
		slave_info(bond_dev, slave_dev, "last VLAN challenged slave left bond - VLAN blocking is removed\n");
2385

2386
	vlan_vids_del_by_dev(slave_dev, bond_dev);
L
Linus Torvalds 已提交
2387

2388
	/* If the mode uses primary, then this case was handled above by
2389
	 * bond_change_active_slave(..., NULL)
L
Linus Torvalds 已提交
2390
	 */
2391
	if (!bond_uses_primary(bond)) {
2392 2393 2394 2395 2396 2397 2398 2399
		/* unset promiscuity level from slave
		 * NOTE: The NETDEV_CHANGEADDR call above may change the value
		 * of the IFF_PROMISC flag in the bond_dev, but we need the
		 * value of that flag before that change, as that was the value
		 * when this slave was attached, so we cache at the start of the
		 * function and use it here. Same goes for ALLMULTI below
		 */
		if (old_flags & IFF_PROMISC)
L
Linus Torvalds 已提交
2400 2401 2402
			dev_set_promiscuity(slave_dev, -1);

		/* unset allmulti level from slave */
2403
		if (old_flags & IFF_ALLMULTI)
L
Linus Torvalds 已提交
2404 2405
			dev_set_allmulti(slave_dev, -1);

2406
		bond_hw_addr_flush(bond_dev, slave_dev);
L
Linus Torvalds 已提交
2407 2408
	}

2409
	slave_disable_netpoll(slave);
2410

L
Linus Torvalds 已提交
2411 2412 2413
	/* close slave before restoring its mac address */
	dev_close(slave_dev);

2414
	if (bond->params.fail_over_mac != BOND_FOM_ACTIVE ||
2415
	    BOND_MODE(bond) != BOND_MODE_ACTIVEBACKUP) {
2416
		/* restore original ("permanent") mac address */
2417 2418 2419
		bond_hw_addr_copy(ss.__data, slave->perm_hwaddr,
				  slave->dev->addr_len);
		ss.ss_family = slave_dev->type;
2420
		dev_set_mac_address(slave_dev, (struct sockaddr *)&ss, NULL);
2421
	}
L
Linus Torvalds 已提交
2422

2423 2424 2425 2426
	if (unregister)
		__dev_set_mtu(slave_dev, slave->original_mtu);
	else
		dev_set_mtu(slave_dev, slave->original_mtu);
2427

2428 2429
	if (!netif_is_bond_master(slave_dev))
		slave_dev->priv_flags &= ~IFF_BONDING;
L
Linus Torvalds 已提交
2430

2431
	kobject_put(&slave->kobj);
L
Linus Torvalds 已提交
2432

2433
	return 0;
L
Linus Torvalds 已提交
2434 2435
}

2436 2437 2438
/* A wrapper used because of ndo_del_link */
int bond_release(struct net_device *bond_dev, struct net_device *slave_dev)
{
2439
	return __bond_release_one(bond_dev, slave_dev, false, false);
2440 2441
}

2442 2443 2444
/* First release a slave and then destroy the bond if no more slaves are left.
 * Must be under rtnl_lock when this function is called.
 */
2445 2446
static int bond_release_and_destroy(struct net_device *bond_dev,
				    struct net_device *slave_dev)
2447
{
2448
	struct bonding *bond = netdev_priv(bond_dev);
2449 2450
	int ret;

2451
	ret = __bond_release_one(bond_dev, slave_dev, false, true);
2452 2453
	if (ret == 0 && !bond_has_slaves(bond) &&
	    bond_dev->reg_state != NETREG_UNREGISTERING) {
2454
		bond_dev->priv_flags |= IFF_DISABLE_NETPOLL;
2455
		netdev_info(bond_dev, "Destroying bond\n");
2456
		bond_remove_proc_entry(bond);
S
Stephen Hemminger 已提交
2457
		unregister_netdevice(bond_dev);
2458 2459 2460 2461
	}
	return ret;
}

2462
/* Fill @info with this bond's mode/miimon/slave count (ioctl helper). */
static void bond_info_query(struct net_device *bond_dev, struct ifbond *info)
{
	struct bonding *bond = netdev_priv(bond_dev);

	bond_fill_ifbond(bond, info);
}

static int bond_slave_info_query(struct net_device *bond_dev, struct ifslave *info)
{
2471
	struct bonding *bond = netdev_priv(bond_dev);
2472
	struct list_head *iter;
2473
	int i = 0, res = -ENODEV;
L
Linus Torvalds 已提交
2474 2475
	struct slave *slave;

2476
	bond_for_each_slave(bond, slave, iter) {
2477
		if (i++ == (int)info->slave_id) {
2478
			res = 0;
2479
			bond_fill_ifslave(slave, info);
L
Linus Torvalds 已提交
2480 2481 2482 2483
			break;
		}
	}

2484
	return res;
L
Linus Torvalds 已提交
2485 2486 2487 2488
}

/*-------------------------------- Monitoring -------------------------------*/

2489
/* called with rcu_read_lock() */
J
Jay Vosburgh 已提交
2490 2491
static int bond_miimon_inspect(struct bonding *bond)
{
2492
	int link_state, commit = 0;
2493
	struct list_head *iter;
J
Jay Vosburgh 已提交
2494
	struct slave *slave;
2495 2496
	bool ignore_updelay;

2497
	ignore_updelay = !rcu_dereference(bond->curr_active_slave);
L
Linus Torvalds 已提交
2498

2499
	bond_for_each_slave_rcu(bond, slave, iter) {
2500
		bond_propose_link_state(slave, BOND_LINK_NOCHANGE);
L
Linus Torvalds 已提交
2501

J
Jay Vosburgh 已提交
2502
		link_state = bond_check_dev_link(bond, slave->dev, 0);
L
Linus Torvalds 已提交
2503 2504

		switch (slave->link) {
J
Jay Vosburgh 已提交
2505 2506 2507
		case BOND_LINK_UP:
			if (link_state)
				continue;
L
Linus Torvalds 已提交
2508

2509
			bond_propose_link_state(slave, BOND_LINK_FAIL);
2510
			commit++;
J
Jay Vosburgh 已提交
2511 2512
			slave->delay = bond->params.downdelay;
			if (slave->delay) {
2513 2514 2515 2516 2517 2518
				slave_info(bond->dev, slave->dev, "link status down for %sinterface, disabling it in %d ms\n",
					   (BOND_MODE(bond) ==
					    BOND_MODE_ACTIVEBACKUP) ?
					    (bond_is_active_slave(slave) ?
					     "active " : "backup ") : "",
					   bond->params.downdelay * bond->params.miimon);
L
Linus Torvalds 已提交
2519
			}
2520
			fallthrough;
J
Jay Vosburgh 已提交
2521 2522
		case BOND_LINK_FAIL:
			if (link_state) {
2523
				/* recovered before downdelay expired */
2524
				bond_propose_link_state(slave, BOND_LINK_UP);
2525
				slave->last_link_up = jiffies;
2526 2527 2528
				slave_info(bond->dev, slave->dev, "link status up again after %d ms\n",
					   (bond->params.downdelay - slave->delay) *
					   bond->params.miimon);
2529
				commit++;
J
Jay Vosburgh 已提交
2530
				continue;
L
Linus Torvalds 已提交
2531
			}
J
Jay Vosburgh 已提交
2532 2533

			if (slave->delay <= 0) {
2534
				bond_propose_link_state(slave, BOND_LINK_DOWN);
J
Jay Vosburgh 已提交
2535 2536
				commit++;
				continue;
L
Linus Torvalds 已提交
2537 2538
			}

J
Jay Vosburgh 已提交
2539 2540 2541 2542 2543 2544 2545
			slave->delay--;
			break;

		case BOND_LINK_DOWN:
			if (!link_state)
				continue;

2546
			bond_propose_link_state(slave, BOND_LINK_BACK);
2547
			commit++;
J
Jay Vosburgh 已提交
2548 2549 2550
			slave->delay = bond->params.updelay;

			if (slave->delay) {
2551 2552 2553 2554
				slave_info(bond->dev, slave->dev, "link status up, enabling it in %d ms\n",
					   ignore_updelay ? 0 :
					   bond->params.updelay *
					   bond->params.miimon);
J
Jay Vosburgh 已提交
2555
			}
2556
			fallthrough;
J
Jay Vosburgh 已提交
2557 2558
		case BOND_LINK_BACK:
			if (!link_state) {
2559
				bond_propose_link_state(slave, BOND_LINK_DOWN);
2560 2561 2562
				slave_info(bond->dev, slave->dev, "link status down again after %d ms\n",
					   (bond->params.updelay - slave->delay) *
					   bond->params.miimon);
2563
				commit++;
J
Jay Vosburgh 已提交
2564 2565 2566
				continue;
			}

2567 2568 2569
			if (ignore_updelay)
				slave->delay = 0;

J
Jay Vosburgh 已提交
2570
			if (slave->delay <= 0) {
2571
				bond_propose_link_state(slave, BOND_LINK_UP);
J
Jay Vosburgh 已提交
2572
				commit++;
2573
				ignore_updelay = false;
J
Jay Vosburgh 已提交
2574
				continue;
L
Linus Torvalds 已提交
2575
			}
J
Jay Vosburgh 已提交
2576 2577

			slave->delay--;
L
Linus Torvalds 已提交
2578
			break;
J
Jay Vosburgh 已提交
2579 2580
		}
	}
L
Linus Torvalds 已提交
2581

J
Jay Vosburgh 已提交
2582 2583
	return commit;
}
L
Linus Torvalds 已提交
2584

2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602
/* Propagate a slave's new link state (@link) to the mode-specific
 * machinery that reacts to per-slave link transitions: 802.3ad port
 * logic, ALB/TLB rebalancing, or the XOR transmit slave array.
 */
static void bond_miimon_link_change(struct bonding *bond,
				    struct slave *slave,
				    char link)
{
	int mode = BOND_MODE(bond);

	if (mode == BOND_MODE_8023AD)
		bond_3ad_handle_link_change(slave, link);
	else if (mode == BOND_MODE_TLB || mode == BOND_MODE_ALB)
		bond_alb_handle_link_change(bond, slave, link);
	else if (mode == BOND_MODE_XOR)
		bond_update_slave_arr(bond, NULL);
}

J
Jay Vosburgh 已提交
2603 2604
/* bond_miimon_commit - apply the per-slave link state changes proposed by
 * the inspection phase (slave->link_new_state) and perform the resulting
 * side effects: slave flag updates, mode-specific link-change handling,
 * and active-slave reselection.  Called under RTNL (see bond_mii_monitor,
 * which takes rtnl before invoking this).
 */
static void bond_miimon_commit(struct bonding *bond)
{
	struct list_head *iter;
	struct slave *slave, *primary;

	bond_for_each_slave(bond, slave, iter) {
		switch (slave->link_new_state) {
		case BOND_LINK_NOCHANGE:
			/* For 802.3ad mode, check current slave speed and
			 * duplex again in case its port was disabled after
			 * invalid speed/duplex reporting but recovered before
			 * link monitoring could make a decision on the actual
			 * link status
			 */
			if (BOND_MODE(bond) == BOND_MODE_8023AD &&
			    slave->link == BOND_LINK_UP)
				bond_3ad_adapter_speed_duplex_changed(slave);
			continue;

		case BOND_LINK_UP:
			/* Modes that need valid speed/duplex refuse to bring
			 * the slave up if the values cannot be read.
			 */
			if (bond_update_speed_duplex(slave) &&
			    bond_needs_speed_duplex(bond)) {
				slave->link = BOND_LINK_DOWN;
				if (net_ratelimit())
					slave_warn(bond->dev, slave->dev,
						   "failed to get link speed/duplex\n");
				continue;
			}
			bond_set_slave_link_state(slave, BOND_LINK_UP,
						  BOND_SLAVE_NOTIFY_NOW);
			slave->last_link_up = jiffies;

			primary = rtnl_dereference(bond->primary_slave);
			if (BOND_MODE(bond) == BOND_MODE_8023AD) {
				/* prevent it from being the active one */
				bond_set_backup_slave(slave);
			} else if (BOND_MODE(bond) != BOND_MODE_ACTIVEBACKUP) {
				/* make it immediately active */
				bond_set_active_slave(slave);
			}

			slave_info(bond->dev, slave->dev, "link status definitely up, %u Mbps %s duplex\n",
				   slave->speed == SPEED_UNKNOWN ? 0 : slave->speed,
				   slave->duplex ? "full" : "half");

			bond_miimon_link_change(bond, slave, BOND_LINK_UP);

			/* reselect if we have no active slave yet, or the
			 * configured primary just came back
			 */
			if (!bond->curr_active_slave || slave == primary)
				goto do_failover;

			continue;

		case BOND_LINK_DOWN:
			if (slave->link_failure_count < UINT_MAX)
				slave->link_failure_count++;

			bond_set_slave_link_state(slave, BOND_LINK_DOWN,
						  BOND_SLAVE_NOTIFY_NOW);

			if (BOND_MODE(bond) == BOND_MODE_ACTIVEBACKUP ||
			    BOND_MODE(bond) == BOND_MODE_8023AD)
				bond_set_slave_inactive_flags(slave,
							      BOND_SLAVE_NOTIFY_NOW);

			slave_info(bond->dev, slave->dev, "link status definitely down, disabling slave\n");

			bond_miimon_link_change(bond, slave, BOND_LINK_DOWN);

			if (slave == rcu_access_pointer(bond->curr_active_slave))
				goto do_failover;

			continue;

		default:
			/* should not happen: inspect only proposes the three
			 * states above; reset to NOCHANGE defensively
			 */
			slave_err(bond->dev, slave->dev, "invalid new link %d on slave\n",
				  slave->link_new_state);
			bond_propose_link_state(slave, BOND_LINK_NOCHANGE);

			continue;
		}

do_failover:
		block_netpoll_tx();
		bond_select_active_slave(bond);
		unblock_netpoll_tx();
	}

	bond_set_carrier(bond);
}

2693
/* bond_mii_monitor
 *
 * Really a wrapper that splits the mii monitor into two phases: an
 * inspection, then (if inspection indicates something needs to be done)
 * an acquisition of appropriate locks followed by a commit phase to
 * implement whatever link state changes are indicated.
 */
static void bond_mii_monitor(struct work_struct *work)
{
	struct bonding *bond = container_of(work, struct bonding,
					    mii_work.work);
	bool should_notify_peers = false;
	bool commit;
	unsigned long delay;
	struct slave *slave;
	struct list_head *iter;

	delay = msecs_to_jiffies(bond->params.miimon);

	if (!bond_has_slaves(bond))
		goto re_arm;

	/* inspection phase runs under RCU only */
	rcu_read_lock();
	should_notify_peers = bond_should_notify_peers(bond);
	commit = !!bond_miimon_inspect(bond);
	if (bond->send_peer_notif) {
		rcu_read_unlock();
		/* decrement the pending peer-notify count under RTNL;
		 * if the trylock fails we simply retry on the next run
		 */
		if (rtnl_trylock()) {
			bond->send_peer_notif--;
			rtnl_unlock();
		}
	} else {
		rcu_read_unlock();
	}

	if (commit) {
		/* Race avoidance with bond_close cancel of workqueue */
		if (!rtnl_trylock()) {
			delay = 1;
			should_notify_peers = false;
			goto re_arm;
		}

		/* latch the proposed states before committing them */
		bond_for_each_slave(bond, slave, iter) {
			bond_commit_link_state(slave, BOND_SLAVE_NOTIFY_LATER);
		}
		bond_miimon_commit(bond);

		rtnl_unlock();	/* might sleep, hold no other locks */
	}

re_arm:
	if (bond->params.miimon)
		queue_delayed_work(bond->wq, &bond->mii_work, delay);

	if (should_notify_peers) {
		if (!rtnl_trylock())
			return;
		call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, bond->dev);
		rtnl_unlock();
	}
}
J
Jay Vosburgh 已提交
2755

2756 2757
static int bond_upper_dev_walk(struct net_device *upper,
			       struct netdev_nested_priv *priv)
2758
{
2759
	__be32 ip = *(__be32 *)priv->data;
2760 2761 2762 2763

	return ip == bond_confirm_addr(upper, 0, ip);
}

2764
/* Return true if @ip is configured on the bond device itself or on any
 * device stacked above it (e.g. a VLAN on top of the bond).
 */
static bool bond_has_this_ip(struct bonding *bond, __be32 ip)
{
	struct netdev_nested_priv priv = {
		.data = (void *)&ip,
	};
	bool found;

	/* fast path: the address lives on the bond device itself */
	if (bond_confirm_addr(bond->dev, 0, ip) == ip)
		return true;

	/* otherwise search every upper device under RCU */
	rcu_read_lock();
	found = netdev_walk_all_upper_dev_rcu(bond->dev, bond_upper_dev_walk,
					      &priv) != 0;
	rcu_read_unlock();

	return found;
}

2782
/* We go to the (large) trouble of VLAN tagging ARP frames because
 * switches in VLAN mode (especially if ports are configured as
 * "native" to a VLAN) might not pass non-tagged frames.
 *
 * @tags is an array terminated by an entry with vlan_proto == VLAN_N_VID,
 * ordered outermost-first (built by bond_verify_device_path); may be NULL
 * when no tagging is required.
 */
static void bond_arp_send(struct slave *slave, int arp_op, __be32 dest_ip,
			  __be32 src_ip, struct bond_vlan_tag *tags)
{
	struct sk_buff *skb;
	struct bond_vlan_tag *outer_tag = tags;
	struct net_device *slave_dev = slave->dev;
	struct net_device *bond_dev = slave->bond->dev;

	slave_dbg(bond_dev, slave_dev, "arp %d on slave: dst %pI4 src %pI4\n",
		  arp_op, &dest_ip, &src_ip);

	skb = arp_create(arp_op, ETH_P_ARP, dest_ip, slave_dev, src_ip,
			 NULL, slave_dev->dev_addr, NULL);

	if (!skb) {
		net_err_ratelimited("ARP packet allocation failed\n");
		return;
	}

	/* no tags, or only the terminator: send untagged */
	if (!tags || tags->vlan_proto == VLAN_N_VID)
		goto xmit;

	/* skip the outer tag; it is applied last via hwaccel below */
	tags++;

	/* Go through all the tags backwards and add them to the packet */
	while (tags->vlan_proto != VLAN_N_VID) {
		if (!tags->vlan_id) {
			tags++;
			continue;
		}

		slave_dbg(bond_dev, slave_dev, "inner tag: proto %X vid %X\n",
			  ntohs(outer_tag->vlan_proto), tags->vlan_id);
		/* vlan_insert_tag_set_proto consumes skb on failure */
		skb = vlan_insert_tag_set_proto(skb, tags->vlan_proto,
						tags->vlan_id);
		if (!skb) {
			net_err_ratelimited("failed to insert inner VLAN tag\n");
			return;
		}

		tags++;
	}
	/* Set the outer tag */
	if (outer_tag->vlan_id) {
		slave_dbg(bond_dev, slave_dev, "outer tag: proto %X vid %X\n",
			  ntohs(outer_tag->vlan_proto), outer_tag->vlan_id);
		__vlan_hwaccel_put_tag(skb, outer_tag->vlan_proto,
				       outer_tag->vlan_id);
	}

xmit:
	arp_xmit(skb);
}

2840 2841 2842 2843 2844 2845
/* Validate the device path between the @start_dev and the @end_dev.
 * The path is valid if the @end_dev is reachable through device
 * stacking.
 * When the path is validated, collect any vlan information in the
 * path.
 *
 * Returns a kcalloc'd array of @level+1 bond_vlan_tag entries terminated
 * by vlan_proto == VLAN_N_VID (caller frees), NULL if @end_dev is not
 * reachable, or ERR_PTR(-ENOMEM).  Recurses once per stacking level;
 * callers hold rcu_read_lock (allocation uses GFP_ATOMIC).
 */
struct bond_vlan_tag *bond_verify_device_path(struct net_device *start_dev,
					      struct net_device *end_dev,
					      int level)
{
	struct bond_vlan_tag *tags;
	struct net_device *upper;
	struct list_head  *iter;

	if (start_dev == end_dev) {
		/* reached the target: allocate the (zeroed) tag array and
		 * place the terminator; recursion unwinding fills the rest
		 */
		tags = kcalloc(level + 1, sizeof(*tags), GFP_ATOMIC);
		if (!tags)
			return ERR_PTR(-ENOMEM);
		tags[level].vlan_proto = VLAN_N_VID;
		return tags;
	}

	netdev_for_each_upper_dev_rcu(start_dev, upper, iter) {
		tags = bond_verify_device_path(upper, end_dev, level + 1);
		if (IS_ERR_OR_NULL(tags)) {
			if (IS_ERR(tags))
				return tags;
			continue;
		}
		/* record this level's VLAN tag while unwinding */
		if (is_vlan_dev(upper)) {
			tags[level].vlan_proto = vlan_dev_vlan_proto(upper);
			tags[level].vlan_id = vlan_dev_vlan_id(upper);
		}

		return tags;
	}

	return NULL;
}
J
Jay Vosburgh 已提交
2879

L
Linus Torvalds 已提交
2880 2881
/* Send an ARP request on @slave to every configured arp_ip_target,
 * routing each target to find the egress device and collecting any
 * VLAN tags needed along the stacking path.
 */
static void bond_arp_send_all(struct bonding *bond, struct slave *slave)
{
	struct rtable *rt;
	struct bond_vlan_tag *tags;
	__be32 *targets = bond->params.arp_targets, addr;
	int i;

	for (i = 0; i < BOND_MAX_ARP_TARGETS && targets[i]; i++) {
		slave_dbg(bond->dev, slave->dev, "%s: target %pI4\n",
			  __func__, &targets[i]);
		tags = NULL;

		/* Find out through which dev should the packet go */
		rt = ip_route_output(dev_net(bond->dev), targets[i], 0,
				     RTO_ONLINK, 0);
		if (IS_ERR(rt)) {
			/* there's no route to target - try to send arp
			 * probe to generate any traffic (arp_validate=0)
			 */
			if (bond->params.arp_validate)
				net_warn_ratelimited("%s: no route to arp_ip_target %pI4 and arp_validate is set\n",
						     bond->dev->name,
						     &targets[i]);
			bond_arp_send(slave, ARPOP_REQUEST, targets[i],
				      0, tags);
			continue;
		}

		/* bond device itself */
		if (rt->dst.dev == bond->dev)
			goto found;

		rcu_read_lock();
		tags = bond_verify_device_path(bond->dev, rt->dst.dev, 0);
		rcu_read_unlock();

		if (!IS_ERR_OR_NULL(tags))
			goto found;

		/* Not our device - skip */
		slave_dbg(bond->dev, slave->dev, "no path to arp_ip_target %pI4 via rt.dev %s\n",
			   &targets[i], rt->dst.dev ? rt->dst.dev->name : "NULL");

		ip_rt_put(rt);
		continue;

found:
		/* use the local address on the egress device as ARP source */
		addr = bond_confirm_addr(rt->dst.dev, targets[i], 0);
		ip_rt_put(rt);
		bond_arp_send(slave, ARPOP_REQUEST, targets[i], addr, tags);
		kfree(tags);
	}
}

2934
/* Record a validated ARP receive on @slave: stamp both the slave's
 * global last_rx and the per-target last-receive slot matching @sip.
 * @sip/@tip are the ARP source/target addresses (the caller swaps them
 * when validating a broadcast request seen on a backup slave).
 */
static void bond_validate_arp(struct bonding *bond, struct slave *slave, __be32 sip, __be32 tip)
{
	int idx;

	/* a zero source or a target address we don't own is not ours */
	if (!sip || !bond_has_this_ip(bond, tip)) {
		slave_dbg(bond->dev, slave->dev, "%s: sip %pI4 tip %pI4 not found\n",
			   __func__, &sip, &tip);
		return;
	}

	idx = bond_get_targets_ip(bond->params.arp_targets, sip);
	if (idx == -1) {
		slave_dbg(bond->dev, slave->dev, "%s: sip %pI4 not found in targets\n",
			   __func__, &sip);
		return;
	}

	slave->last_rx = jiffies;
	slave->target_last_arp_rx[idx] = jiffies;
}

2954 2955
/* bond_arp_rcv - rx_handler hook that feeds the ARP monitor's
 * arp_validate logic.  Updates last_rx bookkeeping when the received
 * ARP is trusted; always returns RX_HANDLER_ANOTHER so the frame
 * continues through normal receive processing.
 */
int bond_arp_rcv(const struct sk_buff *skb, struct bonding *bond,
		 struct slave *slave)
{
	struct arphdr *arp = (struct arphdr *)skb->data;
	struct slave *curr_active_slave, *curr_arp_slave;
	unsigned char *arp_ptr;
	__be32 sip, tip;
	int is_arp = skb->protocol == __cpu_to_be16(ETH_P_ARP);
	unsigned int alen;

	if (!slave_do_arp_validate(bond, slave)) {
		/* validation disabled: count any traffic (or only ARP,
		 * if arp_validate_only) as proof of life
		 */
		if ((slave_do_arp_validate_only(bond) && is_arp) ||
		    !slave_do_arp_validate_only(bond))
			slave->last_rx = jiffies;
		return RX_HANDLER_ANOTHER;
	} else if (!is_arp) {
		return RX_HANDLER_ANOTHER;
	}

	alen = arp_hdr_len(bond->dev);

	slave_dbg(bond->dev, slave->dev, "%s: skb->dev %s\n",
		   __func__, skb->dev->name);

	/* header may be fragmented: linearize into a temp copy */
	if (alen > skb_headlen(skb)) {
		arp = kmalloc(alen, GFP_ATOMIC);
		if (!arp)
			goto out_unlock;
		if (skb_copy_bits(skb, 0, arp, alen) < 0)
			goto out_unlock;
	}

	if (arp->ar_hln != bond->dev->addr_len ||
	    skb->pkt_type == PACKET_OTHERHOST ||
	    skb->pkt_type == PACKET_LOOPBACK ||
	    arp->ar_hrd != htons(ARPHRD_ETHER) ||
	    arp->ar_pro != htons(ETH_P_IP) ||
	    arp->ar_pln != 4)
		goto out_unlock;

	/* walk past sender hw addr to sender IP, then past target hw
	 * addr to target IP
	 */
	arp_ptr = (unsigned char *)(arp + 1);
	arp_ptr += bond->dev->addr_len;
	memcpy(&sip, arp_ptr, 4);
	arp_ptr += 4 + bond->dev->addr_len;
	memcpy(&tip, arp_ptr, 4);

	slave_dbg(bond->dev, slave->dev, "%s: %s/%d av %d sv %d sip %pI4 tip %pI4\n",
		  __func__, slave->dev->name, bond_slave_state(slave),
		  bond->params.arp_validate, slave_do_arp_validate(bond, slave),
		  &sip, &tip);

	curr_active_slave = rcu_dereference(bond->curr_active_slave);
	curr_arp_slave = rcu_dereference(bond->current_arp_slave);

	/* We 'trust' the received ARP enough to validate it if:
	 *
	 * (a) the slave receiving the ARP is active (which includes the
	 * current ARP slave, if any), or
	 *
	 * (b) the receiving slave isn't active, but there is a currently
	 * active slave and it received valid arp reply(s) after it became
	 * the currently active slave, or
	 *
	 * (c) there is an ARP slave that sent an ARP during the prior ARP
	 * interval, and we receive an ARP reply on any slave.  We accept
	 * these because switch FDB update delays may deliver the ARP
	 * reply to a slave other than the sender of the ARP request.
	 *
	 * Note: for (b), backup slaves are receiving the broadcast ARP
	 * request, not a reply.  This request passes from the sending
	 * slave through the L2 switch(es) to the receiving slave.  Since
	 * this is checking the request, sip/tip are swapped for
	 * validation.
	 *
	 * This is done to avoid endless looping when we can't reach the
	 * arp_ip_target and fool ourselves with our own arp requests.
	 */
	if (bond_is_active_slave(slave))
		bond_validate_arp(bond, slave, sip, tip);
	else if (curr_active_slave &&
		 time_after(slave_last_rx(bond, curr_active_slave),
			    curr_active_slave->last_link_up))
		bond_validate_arp(bond, slave, tip, sip);
	else if (curr_arp_slave && (arp->ar_op == htons(ARPOP_REPLY)) &&
		 bond_time_in_interval(bond,
				       dev_trans_start(curr_arp_slave->dev), 1))
		bond_validate_arp(bond, slave, sip, tip);

out_unlock:
	/* free the linearized copy if one was allocated above */
	if (arp != (struct arphdr *)skb->data)
		kfree(arp);
	return RX_HANDLER_ANOTHER;
}

3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061
/* Return true iff jiffies currently lies within the ARP monitor
 * timeslice around @last_act:
 *   [last_act - interval, last_act + mod * interval + interval/2]
 * The extra interval/2 of slack is needed for really fast networks.
 */
static bool bond_time_in_interval(struct bonding *bond, unsigned long last_act,
				  int mod)
{
	int interval = msecs_to_jiffies(bond->params.arp_interval);
	unsigned long earliest = last_act - interval;
	unsigned long latest = last_act + mod * interval + interval / 2;

	return time_in_range(jiffies, earliest, latest);
}

3062
/* This function is called regularly to monitor each slave's link
 * ensuring that traffic is being sent and received when arp monitoring
 * is used in load-balancing mode. if the adapter has been dormant, then an
 * arp is transmitted to generate traffic. see activebackup_arp_monitor for
 * arp monitoring in active backup mode.
 */
static void bond_loadbalance_arp_mon(struct bonding *bond)
{
	struct slave *slave, *oldcurrent;
	struct list_head *iter;
	int do_failover = 0, slave_state_changed = 0;

	if (!bond_has_slaves(bond))
		goto re_arm;

	rcu_read_lock();

	oldcurrent = rcu_dereference(bond->curr_active_slave);
	/* see if any of the previous devices are up now (i.e. they have
	 * xmt and rcv traffic). the curr_active_slave does not come into
	 * the picture unless it is null. also, slave->last_link_up is not
	 * needed here because we send an arp on each slave and give a slave
	 * as long as it needs to get the tx/rx within the delta.
	 *
	 * TODO: what about up/down delay in arp mode? it wasn't here before
	 *       so it can wait
	 */
	bond_for_each_slave_rcu(bond, slave, iter) {
		unsigned long trans_start = dev_trans_start(slave->dev);

		bond_propose_link_state(slave, BOND_LINK_NOCHANGE);

		if (slave->link != BOND_LINK_UP) {
			/* down slave is back when both tx and rx are fresh */
			if (bond_time_in_interval(bond, trans_start, 1) &&
			    bond_time_in_interval(bond, slave->last_rx, 1)) {

				bond_propose_link_state(slave, BOND_LINK_UP);
				slave_state_changed = 1;

				/* primary_slave has no meaning in round-robin
				 * mode. the window of a slave being up and
				 * curr_active_slave being null after enslaving
				 * is closed.
				 */
				if (!oldcurrent) {
					slave_info(bond->dev, slave->dev, "link status definitely up\n");
					do_failover = 1;
				} else {
					slave_info(bond->dev, slave->dev, "interface is now up\n");
				}
			}
		} else {
			/* slave->link == BOND_LINK_UP */

			/* not all switches will respond to an arp request
			 * when the source ip is 0, so don't take the link down
			 * if we don't know our ip yet
			 */
			if (!bond_time_in_interval(bond, trans_start, 2) ||
			    !bond_time_in_interval(bond, slave->last_rx, 2)) {

				bond_propose_link_state(slave, BOND_LINK_DOWN);
				slave_state_changed = 1;

				if (slave->link_failure_count < UINT_MAX)
					slave->link_failure_count++;

				slave_info(bond->dev, slave->dev, "interface is now down\n");

				if (slave == oldcurrent)
					do_failover = 1;
			}
		}

		/* note: if switch is in round-robin mode, all links
		 * must tx arp to ensure all links rx an arp - otherwise
		 * links may oscillate or not come up at all; if switch is
		 * in something like xor mode, there is nothing we can
		 * do - all replies will be rx'ed on same link causing slaves
		 * to be unstable during low/no traffic periods
		 */
		if (bond_slave_is_up(slave))
			bond_arp_send_all(bond, slave);
	}

	rcu_read_unlock();

	if (do_failover || slave_state_changed) {
		/* Race avoidance with bond_close: retry next interval */
		if (!rtnl_trylock())
			goto re_arm;

		/* commit the proposed states now that RTNL is held */
		bond_for_each_slave(bond, slave, iter) {
			if (slave->link_new_state != BOND_LINK_NOCHANGE)
				slave->link = slave->link_new_state;
		}

		if (slave_state_changed) {
			bond_slave_state_change(bond);
			if (BOND_MODE(bond) == BOND_MODE_XOR)
				bond_update_slave_arr(bond, NULL);
		}
		if (do_failover) {
			block_netpoll_tx();
			bond_select_active_slave(bond);
			unblock_netpoll_tx();
		}
		rtnl_unlock();
	}

re_arm:
	if (bond->params.arp_interval)
		queue_delayed_work(bond->wq, &bond->arp_work,
				   msecs_to_jiffies(bond->params.arp_interval));
}

3176
/* Called to inspect slaves for active-backup mode ARP monitor link state
 * changes.  Sets proposed link state in slaves to specify what action
 * should take place for the slave.  Returns 0 if no changes are found, >0
 * if changes to link states must be committed.
 *
 * Called with rcu_read_lock held.
 */
static int bond_ab_arp_inspect(struct bonding *bond)
{
	unsigned long trans_start, last_rx;
	struct list_head *iter;
	struct slave *slave;
	int commit = 0;

	bond_for_each_slave_rcu(bond, slave, iter) {
		bond_propose_link_state(slave, BOND_LINK_NOCHANGE);
		last_rx = slave_last_rx(bond, slave);

		if (slave->link != BOND_LINK_UP) {
			if (bond_time_in_interval(bond, last_rx, 1)) {
				/* down/back slave saw recent rx: bring up */
				bond_propose_link_state(slave, BOND_LINK_UP);
				commit++;
			} else if (slave->link == BOND_LINK_BACK) {
				/* probe candidate never confirmed: fail it */
				bond_propose_link_state(slave, BOND_LINK_FAIL);
				commit++;
			}
			continue;
		}

		/* Give slaves 2*delta after being enslaved or made
		 * active.  This avoids bouncing, as the last receive
		 * times need a full ARP monitor cycle to be updated.
		 */
		if (bond_time_in_interval(bond, slave->last_link_up, 2))
			continue;

		/* Backup slave is down if:
		 * - No current_arp_slave AND
		 * - more than 3*delta since last receive AND
		 * - the bond has an IP address
		 *
		 * Note: a non-null current_arp_slave indicates
		 * the curr_active_slave went down and we are
		 * searching for a new one; under this condition
		 * we only take the curr_active_slave down - this
		 * gives each slave a chance to tx/rx traffic
		 * before being taken out
		 */
		if (!bond_is_active_slave(slave) &&
		    !rcu_access_pointer(bond->current_arp_slave) &&
		    !bond_time_in_interval(bond, last_rx, 3)) {
			bond_propose_link_state(slave, BOND_LINK_DOWN);
			commit++;
		}

		/* Active slave is down if:
		 * - more than 2*delta since transmitting OR
		 * - (more than 2*delta since receive AND
		 *    the bond has an IP address)
		 */
		trans_start = dev_trans_start(slave->dev);
		if (bond_is_active_slave(slave) &&
		    (!bond_time_in_interval(bond, trans_start, 2) ||
		     !bond_time_in_interval(bond, last_rx, 2))) {
			bond_propose_link_state(slave, BOND_LINK_DOWN);
			commit++;
		}
	}

	return commit;
}
L
Linus Torvalds 已提交
3247

3248
/* Called to commit link state changes noted by inspection step of
 * active-backup mode ARP monitor.
 *
 * Called with RTNL hold.
 */
static void bond_ab_arp_commit(struct bonding *bond)
{
	unsigned long trans_start;
	struct list_head *iter;
	struct slave *slave;

	bond_for_each_slave(bond, slave, iter) {
		switch (slave->link_new_state) {
		case BOND_LINK_NOCHANGE:
			continue;

		case BOND_LINK_UP:
			trans_start = dev_trans_start(slave->dev);
			/* promote unless this slave is already the active
			 * one; a missing active slave additionally requires
			 * a fresh transmit within the last interval
			 */
			if (rtnl_dereference(bond->curr_active_slave) != slave ||
			    (!rtnl_dereference(bond->curr_active_slave) &&
			     bond_time_in_interval(bond, trans_start, 1))) {
				struct slave *current_arp_slave;

				current_arp_slave = rtnl_dereference(bond->current_arp_slave);
				bond_set_slave_link_state(slave, BOND_LINK_UP,
							  BOND_SLAVE_NOTIFY_NOW);
				/* the probe slave's job is done */
				if (current_arp_slave) {
					bond_set_slave_inactive_flags(
						current_arp_slave,
						BOND_SLAVE_NOTIFY_NOW);
					RCU_INIT_POINTER(bond->current_arp_slave, NULL);
				}

				slave_info(bond->dev, slave->dev, "link status definitely up\n");

				if (!rtnl_dereference(bond->curr_active_slave) ||
				    slave == rtnl_dereference(bond->primary_slave))
					goto do_failover;

			}

			continue;

		case BOND_LINK_DOWN:
			if (slave->link_failure_count < UINT_MAX)
				slave->link_failure_count++;

			bond_set_slave_link_state(slave, BOND_LINK_DOWN,
						  BOND_SLAVE_NOTIFY_NOW);
			bond_set_slave_inactive_flags(slave,
						      BOND_SLAVE_NOTIFY_NOW);

			slave_info(bond->dev, slave->dev, "link status definitely down, disabling slave\n");

			if (slave == rtnl_dereference(bond->curr_active_slave)) {
				RCU_INIT_POINTER(bond->current_arp_slave, NULL);
				goto do_failover;
			}

			continue;

		case BOND_LINK_FAIL:
			bond_set_slave_link_state(slave, BOND_LINK_FAIL,
						  BOND_SLAVE_NOTIFY_NOW);
			bond_set_slave_inactive_flags(slave,
						      BOND_SLAVE_NOTIFY_NOW);

			/* A slave has just been enslaved and has become
			 * the current active slave.
			 */
			if (rtnl_dereference(bond->curr_active_slave))
				RCU_INIT_POINTER(bond->current_arp_slave, NULL);
			continue;

		default:
			slave_err(bond->dev, slave->dev,
				  "impossible: link_new_state %d on slave\n",
				  slave->link_new_state);
			continue;
		}

do_failover:
		block_netpoll_tx();
		bond_select_active_slave(bond);
		unblock_netpoll_tx();
	}

	bond_set_carrier(bond);
}
L
Linus Torvalds 已提交
3337

3338
/* Send ARP probes for active-backup mode ARP monitor.
 *
 * Probes via the active slave when one exists; otherwise rotates the
 * current_arp_slave to the next up slave and probes through it.
 * Returns BOND_SLAVE_NOTIFY_NOW when deferred state/link notifications
 * are pending and the caller must send them under RTNL.
 *
 * Called with rcu_read_lock held.
 */
static bool bond_ab_arp_probe(struct bonding *bond)
{
	struct slave *slave, *before = NULL, *new_slave = NULL,
		     *curr_arp_slave = rcu_dereference(bond->current_arp_slave),
		     *curr_active_slave = rcu_dereference(bond->curr_active_slave);
	struct list_head *iter;
	bool found = false;
	bool should_notify_rtnl = BOND_SLAVE_NOTIFY_LATER;

	/* both set at once should not happen; commit clears the probe
	 * slave when an active slave is selected
	 */
	if (curr_arp_slave && curr_active_slave)
		netdev_info(bond->dev, "PROBE: c_arp %s && cas %s BAD\n",
			    curr_arp_slave->dev->name,
			    curr_active_slave->dev->name);

	if (curr_active_slave) {
		bond_arp_send_all(bond, curr_active_slave);
		return should_notify_rtnl;
	}

	/* if we don't have a curr_active_slave, search for the next available
	 * backup slave from the current_arp_slave and make it the candidate
	 * for becoming the curr_active_slave
	 */

	if (!curr_arp_slave) {
		curr_arp_slave = bond_first_slave_rcu(bond);
		if (!curr_arp_slave)
			return should_notify_rtnl;
	}

	/* pick the first up slave AFTER curr_arp_slave (new_slave); keep
	 * the first up slave overall (before) as wrap-around fallback
	 */
	bond_for_each_slave_rcu(bond, slave, iter) {
		if (!found && !before && bond_slave_is_up(slave))
			before = slave;

		if (found && !new_slave && bond_slave_is_up(slave))
			new_slave = slave;
		/* if the link state is up at this point, we
		 * mark it down - this can happen if we have
		 * simultaneous link failures and
		 * reselect_active_interface doesn't make this
		 * one the current slave so it is still marked
		 * up when it is actually down
		 */
		if (!bond_slave_is_up(slave) && slave->link == BOND_LINK_UP) {
			bond_set_slave_link_state(slave, BOND_LINK_DOWN,
						  BOND_SLAVE_NOTIFY_LATER);
			if (slave->link_failure_count < UINT_MAX)
				slave->link_failure_count++;

			bond_set_slave_inactive_flags(slave,
						      BOND_SLAVE_NOTIFY_LATER);

			slave_info(bond->dev, slave->dev, "backup interface is now down\n");
		}
		if (slave == curr_arp_slave)
			found = true;
	}

	if (!new_slave && before)
		new_slave = before;

	if (!new_slave)
		goto check_state;

	/* mark the candidate BACK and probe through it */
	bond_set_slave_link_state(new_slave, BOND_LINK_BACK,
				  BOND_SLAVE_NOTIFY_LATER);
	bond_set_slave_active_flags(new_slave, BOND_SLAVE_NOTIFY_LATER);
	bond_arp_send_all(bond, new_slave);
	new_slave->last_link_up = jiffies;
	rcu_assign_pointer(bond->current_arp_slave, new_slave);

check_state:
	/* any deferred notification forces the caller to take RTNL */
	bond_for_each_slave_rcu(bond, slave, iter) {
		if (slave->should_notify || slave->should_notify_link) {
			should_notify_rtnl = BOND_SLAVE_NOTIFY_NOW;
			break;
		}
	}
	return should_notify_rtnl;
}
L
Linus Torvalds 已提交
3422

3423
/* Active-backup mode ARP monitor body: inspect under RCU, commit under
 * RTNL if needed, then send probes; finally re-arm the work and deliver
 * any pending peer/RTNL notifications.
 */
static void bond_activebackup_arp_mon(struct bonding *bond)
{
	bool should_notify_peers = false;
	bool should_notify_rtnl = false;
	int delta_in_ticks;

	delta_in_ticks = msecs_to_jiffies(bond->params.arp_interval);

	if (!bond_has_slaves(bond))
		goto re_arm;

	rcu_read_lock();

	should_notify_peers = bond_should_notify_peers(bond);

	if (bond_ab_arp_inspect(bond)) {
		rcu_read_unlock();

		/* Race avoidance with bond_close flush of workqueue */
		if (!rtnl_trylock()) {
			delta_in_ticks = 1;
			should_notify_peers = false;
			goto re_arm;
		}

		bond_ab_arp_commit(bond);

		rtnl_unlock();
		rcu_read_lock();
	}

	should_notify_rtnl = bond_ab_arp_probe(bond);
	rcu_read_unlock();

re_arm:
	if (bond->params.arp_interval)
		queue_delayed_work(bond->wq, &bond->arp_work, delta_in_ticks);

	if (should_notify_peers || should_notify_rtnl) {
		if (!rtnl_trylock())
			return;

		if (should_notify_peers)
			call_netdevice_notifiers(NETDEV_NOTIFY_PEERS,
						 bond->dev);
		if (should_notify_rtnl) {
			bond_slave_state_notify(bond);
			bond_slave_link_notify(bond);
		}

		rtnl_unlock();
	}
}

3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487
/* Delayed-work entry point for the ARP monitor: dispatch to the
 * active-backup or the load-balancing flavour based on bond mode.
 */
static void bond_arp_monitor(struct work_struct *work)
{
	struct bonding *bond = container_of(work, struct bonding,
					    arp_work.work);

	if (BOND_MODE(bond) != BOND_MODE_ACTIVEBACKUP)
		bond_loadbalance_arp_mon(bond);
	else
		bond_activebackup_arp_mon(bond);
}

L
Linus Torvalds 已提交
3488 3489
/*-------------------------- netdev event handling --------------------------*/

3490
/* Change device name */
static int bond_event_changename(struct bonding *bond)
{
	/* procfs entries are keyed by device name: drop and re-create */
	bond_remove_proc_entry(bond);
	bond_create_proc_entry(bond);

	/* keep the debugfs directory in sync with the new name */
	bond_debug_reregister(bond);

	return NOTIFY_DONE;
}

S
Stephen Hemminger 已提交
3501 3502
static int bond_master_netdev_event(unsigned long event,
				    struct net_device *bond_dev)
L
Linus Torvalds 已提交
3503
{
3504
	struct bonding *event_bond = netdev_priv(bond_dev);
L
Linus Torvalds 已提交
3505

3506 3507
	netdev_dbg(bond_dev, "%s called\n", __func__);

L
Linus Torvalds 已提交
3508 3509 3510
	switch (event) {
	case NETDEV_CHANGENAME:
		return bond_event_changename(event_bond);
3511 3512
	case NETDEV_UNREGISTER:
		bond_remove_proc_entry(event_bond);
M
Mahesh Bandewar 已提交
3513
#ifdef CONFIG_XFRM_OFFLOAD
3514
		xfrm_dev_state_flush(dev_net(bond_dev), bond_dev, true);
M
Mahesh Bandewar 已提交
3515
#endif /* CONFIG_XFRM_OFFLOAD */
3516 3517 3518 3519
		break;
	case NETDEV_REGISTER:
		bond_create_proc_entry(event_bond);
		break;
L
Linus Torvalds 已提交
3520 3521 3522 3523 3524 3525 3526
	default:
		break;
	}

	return NOTIFY_DONE;
}

S
Stephen Hemminger 已提交
3527 3528
static int bond_slave_netdev_event(unsigned long event,
				   struct net_device *slave_dev)
L
Linus Torvalds 已提交
3529
{
3530
	struct slave *slave = bond_slave_get_rtnl(slave_dev), *primary;
3531 3532
	struct bonding *bond;
	struct net_device *bond_dev;
L
Linus Torvalds 已提交
3533

3534 3535 3536 3537
	/* A netdev event can be generated while enslaving a device
	 * before netdev_rx_handler_register is called in which case
	 * slave will be NULL
	 */
3538 3539
	if (!slave) {
		netdev_dbg(slave_dev, "%s called on NULL slave\n", __func__);
3540
		return NOTIFY_DONE;
3541 3542
	}

3543 3544
	bond_dev = slave->bond->dev;
	bond = slave->bond;
3545
	primary = rtnl_dereference(bond->primary_slave);
3546

3547 3548
	slave_dbg(bond_dev, slave_dev, "%s called\n", __func__);

L
Linus Torvalds 已提交
3549 3550
	switch (event) {
	case NETDEV_UNREGISTER:
3551
		if (bond_dev->type != ARPHRD_ETHER)
3552 3553
			bond_release_and_destroy(bond_dev, slave_dev);
		else
3554
			__bond_release_one(bond_dev, slave_dev, false, true);
L
Linus Torvalds 已提交
3555
		break;
3556
	case NETDEV_UP:
L
Linus Torvalds 已提交
3557
	case NETDEV_CHANGE:
3558 3559
		/* For 802.3ad mode only:
		 * Getting invalid Speed/Duplex values here will put slave
3560 3561 3562 3563
		 * in weird state. Mark it as link-fail if the link was
		 * previously up or link-down if it hasn't yet come up, and
		 * let link-monitoring (miimon) set it right when correct
		 * speeds/duplex are available.
3564 3565
		 */
		if (bond_update_speed_duplex(slave) &&
3566 3567 3568 3569 3570 3571
		    BOND_MODE(bond) == BOND_MODE_8023AD) {
			if (slave->last_link_up)
				slave->link = BOND_LINK_FAIL;
			else
				slave->link = BOND_LINK_DOWN;
		}
3572

3573 3574
		if (BOND_MODE(bond) == BOND_MODE_8023AD)
			bond_3ad_adapter_speed_duplex_changed(slave);
3575
		fallthrough;
M
Mahesh Bandewar 已提交
3576
	case NETDEV_DOWN:
3577 3578 3579 3580 3581 3582 3583 3584
		/* Refresh slave-array if applicable!
		 * If the setup does not use miimon or arpmon (mode-specific!),
		 * then these events will not cause the slave-array to be
		 * refreshed. This will cause xmit to use a slave that is not
		 * usable. Avoid such situation by refeshing the array at these
		 * events. If these (miimon/arpmon) parameters are configured
		 * then array gets refreshed twice and that should be fine!
		 */
3585
		if (bond_mode_can_use_xmit_hash(bond))
3586
			bond_update_slave_arr(bond, NULL);
L
Linus Torvalds 已提交
3587 3588
		break;
	case NETDEV_CHANGEMTU:
3589
		/* TODO: Should slaves be allowed to
L
Linus Torvalds 已提交
3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601
		 * independently alter their MTU?  For
		 * an active-backup bond, slaves need
		 * not be the same type of device, so
		 * MTUs may vary.  For other modes,
		 * slaves arguably should have the
		 * same MTUs. To do this, we'd need to
		 * take over the slave's change_mtu
		 * function for the duration of their
		 * servitude.
		 */
		break;
	case NETDEV_CHANGENAME:
3602
		/* we don't care if we don't have primary set */
3603
		if (!bond_uses_primary(bond) ||
3604 3605 3606
		    !bond->params.primary[0])
			break;

3607
		if (slave == primary) {
3608
			/* slave's name changed - he's no longer primary */
3609
			RCU_INIT_POINTER(bond->primary_slave, NULL);
3610 3611
		} else if (!strcmp(slave_dev->name, bond->params.primary)) {
			/* we have a new primary slave */
3612
			rcu_assign_pointer(bond->primary_slave, slave);
3613 3614 3615 3616
		} else { /* we didn't change primary - exit */
			break;
		}

3617
		netdev_info(bond->dev, "Primary slave changed to %s, reselecting active slave\n",
3618
			    primary ? slave_dev->name : "none");
3619 3620

		block_netpoll_tx();
3621
		bond_select_active_slave(bond);
3622
		unblock_netpoll_tx();
L
Linus Torvalds 已提交
3623
		break;
3624 3625 3626
	case NETDEV_FEAT_CHANGE:
		bond_compute_features(bond);
		break;
3627 3628 3629 3630
	case NETDEV_RESEND_IGMP:
		/* Propagate to master device */
		call_netdevice_notifiers(event, slave->bond->dev);
		break;
L
Linus Torvalds 已提交
3631 3632 3633 3634 3635 3636 3637
	default:
		break;
	}

	return NOTIFY_DONE;
}

3638
/* bond_netdev_event: handle netdev notifier chain events.
L
Linus Torvalds 已提交
3639 3640
 *
 * This function receives events for the netdev chain.  The caller (an
3641
 * ioctl handler calling blocking_notifier_call_chain) holds the necessary
L
Linus Torvalds 已提交
3642 3643 3644
 * locks for us to safely manipulate the slave devices (RTNL lock,
 * dev_probe_lock).
 */
S
Stephen Hemminger 已提交
3645 3646
static int bond_netdev_event(struct notifier_block *this,
			     unsigned long event, void *ptr)
L
Linus Torvalds 已提交
3647
{
3648
	struct net_device *event_dev = netdev_notifier_info_to_dev(ptr);
L
Linus Torvalds 已提交
3649

3650 3651
	netdev_dbg(event_dev, "%s received %s\n",
		   __func__, netdev_cmd_to_name(event));
L
Linus Torvalds 已提交
3652

3653 3654 3655
	if (!(event_dev->priv_flags & IFF_BONDING))
		return NOTIFY_DONE;

L
Linus Torvalds 已提交
3656
	if (event_dev->flags & IFF_MASTER) {
3657 3658 3659 3660 3661
		int ret;

		ret = bond_master_netdev_event(event, event_dev);
		if (ret != NOTIFY_DONE)
			return ret;
L
Linus Torvalds 已提交
3662 3663
	}

3664
	if (event_dev->flags & IFF_SLAVE)
L
Linus Torvalds 已提交
3665 3666 3667 3668 3669 3670 3671 3672 3673
		return bond_slave_netdev_event(event, event_dev);

	return NOTIFY_DONE;
}

static struct notifier_block bond_netdev_notifier = {
	.notifier_call = bond_netdev_event,
};

3674 3675
/*---------------------------- Hashing Policies -----------------------------*/

3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689
/* Helper to access data in a packet, with or without a backing skb.
 * If skb is given the data is linearized if necessary via pskb_may_pull.
 * Returns a pointer valid for reading @n bytes, or NULL on failure.
 */
static inline const void *bond_pull_data(struct sk_buff *skb,
					 const void *data, int hlen, int n)
{
	if (likely(n <= hlen))
		return data;

	if (skb && likely(pskb_may_pull(skb, n)))
		return skb->head;

	return NULL;
}

3690
/* L2 hash helper */
3691
static inline u32 bond_eth_hash(struct sk_buff *skb, const void *data, int mhoff, int hlen)
3692
{
3693
	struct ethhdr *ep;
3694

3695 3696 3697 3698 3699
	data = bond_pull_data(skb, data, hlen, mhoff + sizeof(struct ethhdr));
	if (!data)
		return 0;

	ep = (struct ethhdr *)(data + mhoff);
3700
	return ep->h_dest[5] ^ ep->h_source[5] ^ be16_to_cpu(ep->h_proto);
3701 3702
}

3703 3704
/* Extract IPv4/IPv6 addresses (and, for l34 policy, L4 ports) into @fk.
 * Advances *nhoff past the network header and records the transport
 * protocol in *ip_proto (left untouched for fragmented IPv4).
 * Returns false if the packet is neither IPv4 nor IPv6 or cannot be pulled.
 */
static bool bond_flow_ip(struct sk_buff *skb, struct flow_keys *fk, const void *data,
			 int hlen, __be16 l2_proto, int *nhoff, int *ip_proto, bool l34)
{
	const struct ipv6hdr *iph6;
	const struct iphdr *iph;

	if (l2_proto == htons(ETH_P_IP)) {
		data = bond_pull_data(skb, data, hlen, *nhoff + sizeof(*iph));
		if (!data)
			return false;

		iph = (const struct iphdr *)(data + *nhoff);
		iph_to_flow_copy_v4addrs(fk, iph);
		*nhoff += iph->ihl << 2;
		if (!ip_is_fragment(iph))
			*ip_proto = iph->protocol;
	} else if (l2_proto == htons(ETH_P_IPV6)) {
		data = bond_pull_data(skb, data, hlen, *nhoff + sizeof(*iph6));
		if (!data)
			return false;

		iph6 = (const struct ipv6hdr *)(data + *nhoff);
		iph_to_flow_copy_v6addrs(fk, iph6);
		*nhoff += sizeof(*iph6);
		*ip_proto = iph6->nexthdr;
	} else {
		return false;
	}

	if (l34 && *ip_proto >= 0)
		fk->ports.ports = __skb_flow_get_ports(skb, *nhoff, *ip_proto, data, hlen);

	return true;
}

3738
static u32 bond_vlan_srcmac_hash(struct sk_buff *skb, const void *data, int mhoff, int hlen)
3739
{
3740
	struct ethhdr *mac_hdr;
3741 3742 3743 3744
	u32 srcmac_vendor = 0, srcmac_dev = 0;
	u16 vlan;
	int i;

3745 3746 3747 3748 3749
	data = bond_pull_data(skb, data, hlen, mhoff + sizeof(struct ethhdr));
	if (!data)
		return 0;
	mac_hdr = (struct ethhdr *)(data + mhoff);

3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763
	for (i = 0; i < 3; i++)
		srcmac_vendor = (srcmac_vendor << 8) | mac_hdr->h_source[i];

	for (i = 3; i < ETH_ALEN; i++)
		srcmac_dev = (srcmac_dev << 8) | mac_hdr->h_source[i];

	if (!skb_vlan_tag_present(skb))
		return srcmac_vendor ^ srcmac_dev;

	vlan = skb_vlan_tag_get(skb);

	return vlan ^ srcmac_vendor ^ srcmac_dev;
}

3764
/* Extract the appropriate headers based on bond's xmit policy */
3765 3766
static bool bond_flow_dissect(struct bonding *bond, struct sk_buff *skb, const void *data,
			      __be16 l2_proto, int nhoff, int hlen, struct flow_keys *fk)
3767
{
M
Matteo Croce 已提交
3768
	bool l34 = bond->params.xmit_policy == BOND_XMIT_POLICY_LAYER34;
3769
	int ip_proto = -1;
3770

3771 3772 3773
	switch (bond->params.xmit_policy) {
	case BOND_XMIT_POLICY_ENCAP23:
	case BOND_XMIT_POLICY_ENCAP34:
3774 3775
		memset(fk, 0, sizeof(*fk));
		return __skb_flow_dissect(NULL, skb, &flow_keys_bonding,
3776
					  fk, data, l2_proto, nhoff, hlen, 0);
3777 3778
	default:
		break;
3779
	}
3780

3781
	fk->ports.ports = 0;
3782
	memset(&fk->icmp, 0, sizeof(fk->icmp));
3783
	if (!bond_flow_ip(skb, fk, data, hlen, l2_proto, &nhoff, &ip_proto, l34))
3784
		return false;
M
Matteo Croce 已提交
3785 3786 3787 3788 3789 3790

	/* ICMP error packets contains at least 8 bytes of the header
	 * of the packet which generated the error. Use this information
	 * to correlate ICMP error packets within the same flow which
	 * generated the error.
	 */
3791 3792 3793
	if (ip_proto == IPPROTO_ICMP || ip_proto == IPPROTO_ICMPV6) {
		skb_flow_get_icmp_tci(skb, &fk->icmp, data, nhoff, hlen);
		if (ip_proto == IPPROTO_ICMP) {
M
Matteo Croce 已提交
3794 3795 3796
			if (!icmp_is_err(fk->icmp.type))
				return true;

3797 3798
			nhoff += sizeof(struct icmphdr);
		} else if (ip_proto == IPPROTO_ICMPV6) {
M
Matteo Croce 已提交
3799 3800 3801
			if (!icmpv6_is_err(fk->icmp.type))
				return true;

3802
			nhoff += sizeof(struct icmp6hdr);
M
Matteo Croce 已提交
3803
		}
3804
		return bond_flow_ip(skb, fk, data, hlen, l2_proto, &nhoff, &ip_proto, l34);
3805
	}
3806

3807
	return true;
3808 3809
}

3810 3811 3812 3813 3814 3815 3816 3817 3818 3819
/* Mix the flow's source/destination addresses into @hash and fold the
 * high bits down so short hash ranges are still well distributed.
 */
static u32 bond_ip_hash(u32 hash, struct flow_keys *flow)
{
	hash ^= (__force u32)flow_get_u32_dst(flow) ^
		(__force u32)flow_get_u32_src(flow);
	hash ^= (hash >> 16);
	hash ^= (hash >> 8);
	/* discard lowest hash bit to deal with the common even ports pattern */
	return hash >> 1;
}

3820 3821 3822
/* Generate hash based on xmit policy. If @skb is given it is used to linearize
 * the data as required, but this function can be used without it if the data is
 * known to be linear (e.g. with xdp_buff).
3823
 */
3824 3825
static u32 __bond_xmit_hash(struct bonding *bond, struct sk_buff *skb, const void *data,
			    __be16 l2_proto, int mhoff, int nhoff, int hlen)
3826
{
3827 3828
	struct flow_keys flow;
	u32 hash;
3829

3830
	if (bond->params.xmit_policy == BOND_XMIT_POLICY_VLAN_SRCMAC)
3831
		return bond_vlan_srcmac_hash(skb, data, mhoff, hlen);
3832

3833
	if (bond->params.xmit_policy == BOND_XMIT_POLICY_LAYER2 ||
3834 3835
	    !bond_flow_dissect(bond, skb, data, l2_proto, nhoff, hlen, &flow))
		return bond_eth_hash(skb, data, mhoff, hlen);
3836

3837
	if (bond->params.xmit_policy == BOND_XMIT_POLICY_LAYER23 ||
3838
	    bond->params.xmit_policy == BOND_XMIT_POLICY_ENCAP23) {
3839
		hash = bond_eth_hash(skb, data, mhoff, hlen);
3840 3841 3842 3843 3844 3845
	} else {
		if (flow.icmp.id)
			memcpy(&hash, &flow.icmp, sizeof(hash));
		else
			memcpy(&hash, &flow.ports.ports, sizeof(hash));
	}
3846

3847
	return bond_ip_hash(hash, &flow);
3848 3849
}

3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868
/**
 * bond_xmit_hash - generate a hash value based on the xmit policy
 * @bond: bonding device
 * @skb: buffer to use for headers
 *
 * This function will extract the necessary headers from the skb buffer and use
 * them to generate a hash based on the xmit_policy set in the bonding device
 */
u32 bond_xmit_hash(struct bonding *bond, struct sk_buff *skb)
{
	/* reuse the stack-provided L4 hash when the encap3+4 policy allows */
	if (bond->params.xmit_policy == BOND_XMIT_POLICY_ENCAP34 &&
	    skb->l4_hash)
		return skb->hash;

	return __bond_xmit_hash(bond, skb, skb->head, skb->protocol,
				skb->mac_header, skb->network_header,
				skb_headlen(skb));
}

3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888
/**
 * bond_xmit_hash_xdp - generate a hash value based on the xmit policy
 * @bond: bonding device
 * @xdp: buffer to use for headers
 *
 * The XDP variant of bond_xmit_hash.
 */
static u32 bond_xmit_hash_xdp(struct bonding *bond, struct xdp_buff *xdp)
{
	struct ethhdr *eth;

	/* frame must hold at least an Ethernet header */
	if (xdp->data + sizeof(struct ethhdr) > xdp->data_end)
		return 0;

	eth = (struct ethhdr *)xdp->data;

	return __bond_xmit_hash(bond, NULL, xdp->data, eth->h_proto, 0,
				sizeof(struct ethhdr), xdp->data_end - xdp->data);
}

L
Linus Torvalds 已提交
3889 3890
/*-------------------------- Device entry points ----------------------------*/

3891
void bond_work_init_all(struct bonding *bond)
3892 3893 3894 3895 3896
{
	INIT_DELAYED_WORK(&bond->mcast_work,
			  bond_resend_igmp_join_requests_delayed);
	INIT_DELAYED_WORK(&bond->alb_work, bond_alb_monitor);
	INIT_DELAYED_WORK(&bond->mii_work, bond_mii_monitor);
3897
	INIT_DELAYED_WORK(&bond->arp_work, bond_arp_monitor);
3898
	INIT_DELAYED_WORK(&bond->ad_work, bond_3ad_state_machine_handler);
3899
	INIT_DELAYED_WORK(&bond->slave_arr_work, bond_slave_arr_handler);
3900 3901 3902 3903 3904 3905 3906 3907 3908
}

/* Synchronously cancel all of the bond's delayed work items. */
static void bond_work_cancel_all(struct bonding *bond)
{
	cancel_delayed_work_sync(&bond->mii_work);
	cancel_delayed_work_sync(&bond->arp_work);
	cancel_delayed_work_sync(&bond->alb_work);
	cancel_delayed_work_sync(&bond->ad_work);
	cancel_delayed_work_sync(&bond->mcast_work);
	cancel_delayed_work_sync(&bond->slave_arr_work);
}

L
Linus Torvalds 已提交
3912 3913
static int bond_open(struct net_device *bond_dev)
{
3914
	struct bonding *bond = netdev_priv(bond_dev);
3915
	struct list_head *iter;
3916
	struct slave *slave;
L
Linus Torvalds 已提交
3917

3918
	/* reset slave->backup and slave->inactive */
3919
	if (bond_has_slaves(bond)) {
3920
		bond_for_each_slave(bond, slave, iter) {
3921 3922
			if (bond_uses_primary(bond) &&
			    slave != rcu_access_pointer(bond->curr_active_slave)) {
3923 3924
				bond_set_slave_inactive_flags(slave,
							      BOND_SLAVE_NOTIFY_NOW);
3925
			} else if (BOND_MODE(bond) != BOND_MODE_8023AD) {
3926 3927
				bond_set_slave_active_flags(slave,
							    BOND_SLAVE_NOTIFY_NOW);
3928 3929 3930 3931
			}
		}
	}

3932
	if (bond_is_lb(bond)) {
L
Linus Torvalds 已提交
3933 3934 3935
		/* bond_alb_initialize must be called before the timer
		 * is started.
		 */
3936
		if (bond_alb_initialize(bond, (BOND_MODE(bond) == BOND_MODE_ALB)))
3937
			return -ENOMEM;
3938
		if (bond->params.tlb_dynamic_lb || BOND_MODE(bond) == BOND_MODE_ALB)
3939
			queue_delayed_work(bond->wq, &bond->alb_work, 0);
L
Linus Torvalds 已提交
3940 3941
	}

3942
	if (bond->params.miimon)  /* link check interval, in milliseconds. */
3943
		queue_delayed_work(bond->wq, &bond->mii_work, 0);
L
Linus Torvalds 已提交
3944 3945

	if (bond->params.arp_interval) {  /* arp interval, in milliseconds. */
3946
		queue_delayed_work(bond->wq, &bond->arp_work, 0);
3947
		bond->recv_probe = bond_arp_rcv;
L
Linus Torvalds 已提交
3948 3949
	}

3950
	if (BOND_MODE(bond) == BOND_MODE_8023AD) {
3951
		queue_delayed_work(bond->wq, &bond->ad_work, 0);
L
Linus Torvalds 已提交
3952
		/* register to receive LACPDUs */
3953
		bond->recv_probe = bond_3ad_lacpdu_recv;
3954
		bond_3ad_initiate_agg_selection(bond, 1);
L
Linus Torvalds 已提交
3955 3956
	}

3957
	if (bond_mode_can_use_xmit_hash(bond))
3958 3959
		bond_update_slave_arr(bond, NULL);

L
Linus Torvalds 已提交
3960 3961 3962 3963 3964
	return 0;
}

static int bond_close(struct net_device *bond_dev)
{
3965
	struct bonding *bond = netdev_priv(bond_dev);
L
Linus Torvalds 已提交
3966

3967
	bond_work_cancel_all(bond);
3968
	bond->send_peer_notif = 0;
3969
	if (bond_is_lb(bond))
L
Linus Torvalds 已提交
3970
		bond_alb_deinitialize(bond);
3971
	bond->recv_probe = NULL;
L
Linus Torvalds 已提交
3972 3973 3974 3975

	return 0;
}

E
Eric Dumazet 已提交
3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990
/* fold stats, assuming all rtnl_link_stats64 fields are u64, but
 * that some drivers can provide 32bit values only.
 */
static void bond_fold_stats(struct rtnl_link_stats64 *_res,
			    const struct rtnl_link_stats64 *_new,
			    const struct rtnl_link_stats64 *_old)
{
	const u64 *new = (const u64 *)_new;
	const u64 *old = (const u64 *)_old;
	u64 *res = (u64 *)_res;
	int i;

	for (i = 0; i < sizeof(*_res) / sizeof(u64); i++) {
		u64 nv = new[i];
		u64 ov = old[i];
3991
		s64 delta = nv - ov;
E
Eric Dumazet 已提交
3992 3993 3994

		/* detects if this particular field is 32bit only */
		if (((nv | ov) >> 32) == 0)
3995 3996 3997 3998 3999 4000 4001
			delta = (s64)(s32)((u32)nv - (u32)ov);

		/* filter anomalies, some drivers reset their stats
		 * at down/up events.
		 */
		if (delta > 0)
			res[i] += delta;
E
Eric Dumazet 已提交
4002 4003 4004
	}
}

4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045
#ifdef CONFIG_LOCKDEP
/* Walk the lower-device graph (iteratively, with an explicit stack) and
 * return the deepest nesting level reached; used to pick the lockdep
 * subclass for stats_lock.  Must be called under RCU.
 */
static int bond_get_lowest_level_rcu(struct net_device *dev)
{
	struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
	struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
	int cur = 0, max = 0;

	now = dev;
	iter = &dev->adj_list.lower;

	while (1) {
		next = NULL;
		while (1) {
			ldev = netdev_next_lower_dev_rcu(now, &iter);
			if (!ldev)
				break;

			/* descend one level, remembering where to resume */
			next = ldev;
			niter = &ldev->adj_list.lower;
			dev_stack[cur] = now;
			iter_stack[cur++] = iter;
			if (max <= cur)
				max = cur;
			break;
		}

		if (!next) {
			/* nothing lower: pop back up, or finish at the root */
			if (!cur)
				return max;
			next = dev_stack[--cur];
			niter = iter_stack[cur];
		}

		now = next;
		iter = niter;
	}

	return max;
}
#endif

4046 4047
static void bond_get_stats(struct net_device *bond_dev,
			   struct rtnl_link_stats64 *stats)
L
Linus Torvalds 已提交
4048
{
4049
	struct bonding *bond = netdev_priv(bond_dev);
4050
	struct rtnl_link_stats64 temp;
4051
	struct list_head *iter;
L
Linus Torvalds 已提交
4052
	struct slave *slave;
4053
	int nest_level = 0;
L
Linus Torvalds 已提交
4054 4055


E
Eric Dumazet 已提交
4056
	rcu_read_lock();
4057 4058 4059 4060 4061 4062 4063
#ifdef CONFIG_LOCKDEP
	nest_level = bond_get_lowest_level_rcu(bond_dev);
#endif

	spin_lock_nested(&bond->stats_lock, nest_level);
	memcpy(stats, &bond->bond_stats, sizeof(*stats));

E
Eric Dumazet 已提交
4064 4065
	bond_for_each_slave_rcu(bond, slave, iter) {
		const struct rtnl_link_stats64 *new =
4066
			dev_get_stats(slave->dev, &temp);
E
Eric Dumazet 已提交
4067 4068

		bond_fold_stats(stats, new, &slave->slave_stats);
4069 4070

		/* save off the slave stats for the next run */
E
Eric Dumazet 已提交
4071
		memcpy(&slave->slave_stats, new, sizeof(*new));
4072
	}
E
Eric Dumazet 已提交
4073

4074
	memcpy(&bond->bond_stats, stats, sizeof(*stats));
E
Eric Dumazet 已提交
4075
	spin_unlock(&bond->stats_lock);
4076
	rcu_read_unlock();
L
Linus Torvalds 已提交
4077 4078
}

4079
static int bond_eth_ioctl(struct net_device *bond_dev, struct ifreq *ifr, int cmd)
L
Linus Torvalds 已提交
4080
{
4081
	struct bonding *bond = netdev_priv(bond_dev);
L
Linus Torvalds 已提交
4082
	struct mii_ioctl_data *mii = NULL;
4083
	int res;
L
Linus Torvalds 已提交
4084

4085
	netdev_dbg(bond_dev, "bond_eth_ioctl: cmd=%d\n", cmd);
L
Linus Torvalds 已提交
4086 4087 4088 4089

	switch (cmd) {
	case SIOCGMIIPHY:
		mii = if_mii(ifr);
S
Stephen Hemminger 已提交
4090
		if (!mii)
L
Linus Torvalds 已提交
4091
			return -EINVAL;
S
Stephen Hemminger 已提交
4092

L
Linus Torvalds 已提交
4093
		mii->phy_id = 0;
4094
		fallthrough;
L
Linus Torvalds 已提交
4095
	case SIOCGMIIREG:
4096
		/* We do this again just in case we were called by SIOCGMIIREG
L
Linus Torvalds 已提交
4097 4098 4099
		 * instead of SIOCGMIIPHY.
		 */
		mii = if_mii(ifr);
S
Stephen Hemminger 已提交
4100
		if (!mii)
L
Linus Torvalds 已提交
4101
			return -EINVAL;
S
Stephen Hemminger 已提交
4102

L
Linus Torvalds 已提交
4103 4104
		if (mii->reg_num == 1) {
			mii->val_out = 0;
S
Stephen Hemminger 已提交
4105
			if (netif_carrier_ok(bond->dev))
L
Linus Torvalds 已提交
4106 4107 4108 4109
				mii->val_out = BMSR_LSTATUS;
		}

		return 0;
4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131
	default:
		res = -EOPNOTSUPP;
	}

	return res;
}

static int bond_do_ioctl(struct net_device *bond_dev, struct ifreq *ifr, int cmd)
{
	struct bonding *bond = netdev_priv(bond_dev);
	struct net_device *slave_dev = NULL;
	struct ifbond k_binfo;
	struct ifbond __user *u_binfo = NULL;
	struct ifslave k_sinfo;
	struct ifslave __user *u_sinfo = NULL;
	struct bond_opt_value newval;
	struct net *net;
	int res = 0;

	netdev_dbg(bond_dev, "bond_ioctl: cmd=%d\n", cmd);

	switch (cmd) {
L
Linus Torvalds 已提交
4132 4133 4134
	case SIOCBONDINFOQUERY:
		u_binfo = (struct ifbond __user *)ifr->ifr_data;

S
Stephen Hemminger 已提交
4135
		if (copy_from_user(&k_binfo, u_binfo, sizeof(ifbond)))
L
Linus Torvalds 已提交
4136 4137
			return -EFAULT;

4138 4139
		bond_info_query(bond_dev, &k_binfo);
		if (copy_to_user(u_binfo, &k_binfo, sizeof(ifbond)))
S
Stephen Hemminger 已提交
4140
			return -EFAULT;
L
Linus Torvalds 已提交
4141

4142
		return 0;
L
Linus Torvalds 已提交
4143 4144 4145
	case SIOCBONDSLAVEINFOQUERY:
		u_sinfo = (struct ifslave __user *)ifr->ifr_data;

S
Stephen Hemminger 已提交
4146
		if (copy_from_user(&k_sinfo, u_sinfo, sizeof(ifslave)))
L
Linus Torvalds 已提交
4147 4148 4149
			return -EFAULT;

		res = bond_slave_info_query(bond_dev, &k_sinfo);
S
Stephen Hemminger 已提交
4150 4151 4152
		if (res == 0 &&
		    copy_to_user(u_sinfo, &k_sinfo, sizeof(ifslave)))
			return -EFAULT;
L
Linus Torvalds 已提交
4153 4154 4155 4156 4157 4158

		return res;
	default:
		break;
	}

4159 4160 4161
	net = dev_net(bond_dev);

	if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
L
Linus Torvalds 已提交
4162 4163
		return -EPERM;

4164
	slave_dev = __dev_get_by_name(net, ifr->ifr_slave);
L
Linus Torvalds 已提交
4165

4166
	slave_dbg(bond_dev, slave_dev, "slave_dev=%p:\n", slave_dev);
L
Linus Torvalds 已提交
4167

S
Stephen Hemminger 已提交
4168
	if (!slave_dev)
4169
		return -ENODEV;
L
Linus Torvalds 已提交
4170

4171 4172
	switch (cmd) {
	case SIOCBONDENSLAVE:
D
David Ahern 已提交
4173
		res = bond_enslave(bond_dev, slave_dev, NULL);
4174 4175 4176 4177 4178
		break;
	case SIOCBONDRELEASE:
		res = bond_release(bond_dev, slave_dev);
		break;
	case SIOCBONDSETHWADDR:
4179
		res = bond_set_dev_addr(bond_dev, slave_dev);
4180 4181
		break;
	case SIOCBONDCHANGEACTIVE:
4182
		bond_opt_initstr(&newval, slave_dev->name);
4183 4184
		res = __bond_opt_set_notify(bond, BOND_OPT_ACTIVE_SLAVE,
					    &newval);
4185 4186 4187
		break;
	default:
		res = -EOPNOTSUPP;
L
Linus Torvalds 已提交
4188 4189 4190 4191 4192
	}

	return res;
}

A
Arnd Bergmann 已提交
4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215
/* ndo_siocdevprivate: translate the legacy BOND_*_OLD private ioctls to
 * their modern SIOCBOND* equivalents and forward to bond_do_ioctl().
 */
static int bond_siocdevprivate(struct net_device *bond_dev, struct ifreq *ifr,
			       void __user *data, int cmd)
{
	struct ifreq ifrdata = { .ifr_data = data };

	switch (cmd) {
	case BOND_INFO_QUERY_OLD:
		return bond_do_ioctl(bond_dev, &ifrdata, SIOCBONDINFOQUERY);
	case BOND_SLAVE_INFO_QUERY_OLD:
		return bond_do_ioctl(bond_dev, &ifrdata, SIOCBONDSLAVEINFOQUERY);
	case BOND_ENSLAVE_OLD:
		return bond_do_ioctl(bond_dev, ifr, SIOCBONDENSLAVE);
	case BOND_RELEASE_OLD:
		return bond_do_ioctl(bond_dev, ifr, SIOCBONDRELEASE);
	case BOND_SETHWADDR_OLD:
		return bond_do_ioctl(bond_dev, ifr, SIOCBONDSETHWADDR);
	case BOND_CHANGE_ACTIVE_OLD:
		return bond_do_ioctl(bond_dev, ifr, SIOCBONDCHANGEACTIVE);
	}

	return -EOPNOTSUPP;
}

4216
static void bond_change_rx_flags(struct net_device *bond_dev, int change)
L
Linus Torvalds 已提交
4217
{
4218
	struct bonding *bond = netdev_priv(bond_dev);
L
Linus Torvalds 已提交
4219

4220 4221 4222
	if (change & IFF_PROMISC)
		bond_set_promiscuity(bond,
				     bond_dev->flags & IFF_PROMISC ? 1 : -1);
S
Stephen Hemminger 已提交
4223

4224 4225 4226 4227
	if (change & IFF_ALLMULTI)
		bond_set_allmulti(bond,
				  bond_dev->flags & IFF_ALLMULTI ? 1 : -1);
}
L
Linus Torvalds 已提交
4228

4229
static void bond_set_rx_mode(struct net_device *bond_dev)
4230 4231
{
	struct bonding *bond = netdev_priv(bond_dev);
4232
	struct list_head *iter;
4233
	struct slave *slave;
L
Linus Torvalds 已提交
4234

4235
	rcu_read_lock();
4236
	if (bond_uses_primary(bond)) {
4237
		slave = rcu_dereference(bond->curr_active_slave);
4238 4239 4240 4241 4242
		if (slave) {
			dev_uc_sync(slave->dev, bond_dev);
			dev_mc_sync(slave->dev, bond_dev);
		}
	} else {
4243
		bond_for_each_slave_rcu(bond, slave, iter) {
4244 4245 4246
			dev_uc_sync_multiple(slave->dev, bond_dev);
			dev_mc_sync_multiple(slave->dev, bond_dev);
		}
L
Linus Torvalds 已提交
4247
	}
4248
	rcu_read_unlock();
L
Linus Torvalds 已提交
4249 4250
}

4251
static int bond_neigh_init(struct neighbour *n)
4252
{
4253 4254 4255
	struct bonding *bond = netdev_priv(n->dev);
	const struct net_device_ops *slave_ops;
	struct neigh_parms parms;
4256
	struct slave *slave;
E
Eric Dumazet 已提交
4257
	int ret = 0;
4258

E
Eric Dumazet 已提交
4259 4260
	rcu_read_lock();
	slave = bond_first_slave_rcu(bond);
4261
	if (!slave)
E
Eric Dumazet 已提交
4262
		goto out;
4263
	slave_ops = slave->dev->netdev_ops;
4264
	if (!slave_ops->ndo_neigh_setup)
E
Eric Dumazet 已提交
4265
		goto out;
4266

E
Eric Dumazet 已提交
4267 4268 4269 4270 4271 4272 4273 4274 4275
	/* TODO: find another way [1] to implement this.
	 * Passing a zeroed structure is fragile,
	 * but at least we do not pass garbage.
	 *
	 * [1] One way would be that ndo_neigh_setup() never touch
	 *     struct neigh_parms, but propagate the new neigh_setup()
	 *     back to ___neigh_create() / neigh_parms_alloc()
	 */
	memset(&parms, 0, sizeof(parms));
4276 4277
	ret = slave_ops->ndo_neigh_setup(slave->dev, &parms);

E
Eric Dumazet 已提交
4278 4279
	if (ret)
		goto out;
4280

E
Eric Dumazet 已提交
4281 4282 4283 4284 4285
	if (parms.neigh_setup)
		ret = parms.neigh_setup(n);
out:
	rcu_read_unlock();
	return ret;
4286 4287
}

4288
/* The bonding ndo_neigh_setup is called at init time beofre any
4289 4290
 * slave exists. So we must declare proxy setup function which will
 * be used at run time to resolve the actual slave neigh param setup.
4291 4292 4293 4294
 *
 * It's also called by master devices (such as vlans) to setup their
 * underlying devices. In that case - do nothing, we're already set up from
 * our init.
4295 4296 4297 4298
 */
static int bond_neigh_setup(struct net_device *dev,
			    struct neigh_parms *parms)
{
4299 4300 4301
	/* modify only our neigh_parms */
	if (parms->dev == dev)
		parms->neigh_setup = bond_neigh_init;
4302 4303 4304 4305

	return 0;
}

4306
/* Change the MTU of all of a master's slaves to match the master */
L
Linus Torvalds 已提交
4307 4308
static int bond_change_mtu(struct net_device *bond_dev, int new_mtu)
{
4309
	struct bonding *bond = netdev_priv(bond_dev);
4310
	struct slave *slave, *rollback_slave;
4311
	struct list_head *iter;
L
Linus Torvalds 已提交
4312 4313
	int res = 0;

4314
	netdev_dbg(bond_dev, "bond=%p, new_mtu=%d\n", bond, new_mtu);
L
Linus Torvalds 已提交
4315

4316
	bond_for_each_slave(bond, slave, iter) {
4317
		slave_dbg(bond_dev, slave->dev, "s %p c_m %p\n",
4318
			   slave, slave->dev->netdev_ops->ndo_change_mtu);
4319

L
Linus Torvalds 已提交
4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330
		res = dev_set_mtu(slave->dev, new_mtu);

		if (res) {
			/* If we failed to set the slave's mtu to the new value
			 * we must abort the operation even in ACTIVE_BACKUP
			 * mode, because if we allow the backup slaves to have
			 * different mtu values than the active slave we'll
			 * need to change their mtu when doing a failover. That
			 * means changing their mtu from timer context, which
			 * is probably not a good idea.
			 */
4331 4332
			slave_dbg(bond_dev, slave->dev, "err %d setting mtu to %d\n",
				  res, new_mtu);
L
Linus Torvalds 已提交
4333 4334 4335 4336 4337 4338 4339 4340 4341 4342
			goto unwind;
		}
	}

	bond_dev->mtu = new_mtu;

	return 0;

unwind:
	/* unwind from head to the slave that failed */
4343
	bond_for_each_slave(bond, rollback_slave, iter) {
L
Linus Torvalds 已提交
4344 4345
		int tmp_res;

4346 4347 4348 4349
		if (rollback_slave == slave)
			break;

		tmp_res = dev_set_mtu(rollback_slave->dev, bond_dev->mtu);
4350 4351 4352
		if (tmp_res)
			slave_dbg(bond_dev, rollback_slave->dev, "unwind err %d\n",
				  tmp_res);
L
Linus Torvalds 已提交
4353 4354 4355 4356 4357
	}

	return res;
}

4358
/* Change HW address
L
Linus Torvalds 已提交
4359 4360 4361 4362 4363 4364 4365
 *
 * Note that many devices must be down to change the HW address, and
 * downing the master releases all slaves.  We can make bonds full of
 * bonding devices to test this, however.
 */
static int bond_set_mac_address(struct net_device *bond_dev, void *addr)
{
4366
	struct bonding *bond = netdev_priv(bond_dev);
4367
	struct slave *slave, *rollback_slave;
4368
	struct sockaddr_storage *ss = addr, tmp_ss;
4369
	struct list_head *iter;
L
Linus Torvalds 已提交
4370 4371
	int res = 0;

4372
	if (BOND_MODE(bond) == BOND_MODE_ALB)
4373 4374 4375
		return bond_alb_set_mac_address(bond_dev, addr);


4376
	netdev_dbg(bond_dev, "%s: bond=%p\n", __func__, bond);
L
Linus Torvalds 已提交
4377

4378 4379
	/* If fail_over_mac is enabled, do nothing and return success.
	 * Returning an error causes ifenslave to fail.
4380
	 */
4381
	if (bond->params.fail_over_mac &&
4382
	    BOND_MODE(bond) == BOND_MODE_ACTIVEBACKUP)
4383
		return 0;
4384

4385
	if (!is_valid_ether_addr(ss->__data))
L
Linus Torvalds 已提交
4386 4387
		return -EADDRNOTAVAIL;

4388
	bond_for_each_slave(bond, slave, iter) {
4389 4390
		slave_dbg(bond_dev, slave->dev, "%s: slave=%p\n",
			  __func__, slave);
4391
		res = dev_set_mac_address(slave->dev, addr, NULL);
L
Linus Torvalds 已提交
4392 4393 4394 4395 4396 4397 4398
		if (res) {
			/* TODO: consider downing the slave
			 * and retry ?
			 * User should expect communications
			 * breakage anyway until ARP finish
			 * updating, so...
			 */
4399 4400
			slave_dbg(bond_dev, slave->dev, "%s: err %d\n",
				  __func__, res);
L
Linus Torvalds 已提交
4401 4402 4403 4404 4405
			goto unwind;
		}
	}

	/* success */
4406
	memcpy(bond_dev->dev_addr, ss->__data, bond_dev->addr_len);
L
Linus Torvalds 已提交
4407 4408 4409
	return 0;

unwind:
4410 4411
	memcpy(tmp_ss.__data, bond_dev->dev_addr, bond_dev->addr_len);
	tmp_ss.ss_family = bond_dev->type;
L
Linus Torvalds 已提交
4412 4413

	/* unwind from head to the slave that failed */
4414
	bond_for_each_slave(bond, rollback_slave, iter) {
L
Linus Torvalds 已提交
4415 4416
		int tmp_res;

4417 4418 4419
		if (rollback_slave == slave)
			break;

4420
		tmp_res = dev_set_mac_address(rollback_slave->dev,
4421
					      (struct sockaddr *)&tmp_ss, NULL);
L
Linus Torvalds 已提交
4422
		if (tmp_res) {
4423 4424
			slave_dbg(bond_dev, rollback_slave->dev, "%s: unwind err %d\n",
				   __func__, tmp_res);
L
Linus Torvalds 已提交
4425 4426 4427 4428 4429 4430
		}
	}

	return res;
}

4431
/**
4432
 * bond_get_slave_by_id - get xmit slave with slave_id
4433 4434 4435
 * @bond: bonding device that is transmitting
 * @slave_id: slave id up to slave_cnt-1 through which to transmit
 *
4436
 * This function tries to get slave with slave_id but in case
4437 4438
 * it fails, it tries to find the first available slave for transmission.
 */
4439 4440
static struct slave *bond_get_slave_by_id(struct bonding *bond,
					  int slave_id)
4441
{
4442
	struct list_head *iter;
4443 4444 4445 4446
	struct slave *slave;
	int i = slave_id;

	/* Here we start from the slave with slave_id */
4447
	bond_for_each_slave_rcu(bond, slave, iter) {
4448
		if (--i < 0) {
4449
			if (bond_slave_can_tx(slave))
4450
				return slave;
4451 4452 4453 4454 4455
		}
	}

	/* Here we start from the first slave up to slave_id */
	i = slave_id;
4456
	bond_for_each_slave_rcu(bond, slave, iter) {
4457 4458
		if (--i < 0)
			break;
4459
		if (bond_slave_can_tx(slave))
4460
			return slave;
4461 4462
	}
	/* no slave that can tx has been found */
4463
	return NULL;
4464 4465
}

4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476
/**
 * bond_rr_gen_slave_id - generate slave id based on packets_per_slave
 * @bond: bonding device to use
 *
 * Based on the value of the bonding device's packets_per_slave parameter
 * this function generates a slave id, which is usually used as the next
 * slave to transmit through.
 */
static u32 bond_rr_gen_slave_id(struct bonding *bond)
{
	u32 slave_id;
4477 4478
	struct reciprocal_value reciprocal_packets_per_slave;
	int packets_per_slave = bond->params.packets_per_slave;
4479 4480 4481 4482 4483 4484

	switch (packets_per_slave) {
	case 0:
		slave_id = prandom_u32();
		break;
	case 1:
4485
		slave_id = this_cpu_inc_return(*bond->rr_tx_counter);
4486 4487
		break;
	default:
4488 4489
		reciprocal_packets_per_slave =
			bond->params.reciprocal_packets_per_slave;
4490 4491
		slave_id = this_cpu_inc_return(*bond->rr_tx_counter);
		slave_id = reciprocal_divide(slave_id,
4492
					     reciprocal_packets_per_slave);
4493 4494 4495 4496 4497 4498
		break;
	}

	return slave_id;
}

4499 4500
static struct slave *bond_xmit_roundrobin_slave_get(struct bonding *bond,
						    struct sk_buff *skb)
L
Linus Torvalds 已提交
4501
{
4502
	struct slave *slave;
4503
	int slave_cnt;
4504
	u32 slave_id;
L
Linus Torvalds 已提交
4505

4506
	/* Start with the curr_active_slave that joined the bond as the
4507 4508 4509 4510
	 * default for sending IGMP traffic.  For failover purposes one
	 * needs to maintain some consistency for the interface that will
	 * send the join/membership reports.  The curr_active_slave found
	 * will send all of this type of traffic.
4511
	 */
4512 4513 4514
	if (skb->protocol == htons(ETH_P_IP)) {
		int noff = skb_network_offset(skb);
		struct iphdr *iph;
4515

4516 4517 4518 4519 4520 4521 4522
		if (unlikely(!pskb_may_pull(skb, noff + sizeof(*iph))))
			goto non_igmp;

		iph = ip_hdr(skb);
		if (iph->protocol == IPPROTO_IGMP) {
			slave = rcu_dereference(bond->curr_active_slave);
			if (slave)
4523 4524
				return slave;
			return bond_get_slave_by_id(bond, 0);
4525
		}
L
Linus Torvalds 已提交
4526
	}
4527

4528 4529 4530
non_igmp:
	slave_cnt = READ_ONCE(bond->slave_cnt);
	if (likely(slave_cnt)) {
4531 4532
		slave_id = bond_rr_gen_slave_id(bond) % slave_cnt;
		return bond_get_slave_by_id(bond, slave_id);
4533
	}
4534 4535 4536
	return NULL;
}

4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577
static struct slave *bond_xdp_xmit_roundrobin_slave_get(struct bonding *bond,
							struct xdp_buff *xdp)
{
	const struct ethhdr *eth;
	void *data = xdp->data;
	struct slave *slave;
	int slave_cnt;
	u32 slave_id;

	/* bounds-check before touching the Ethernet header */
	if (data + sizeof(struct ethhdr) > xdp->data_end)
		goto non_igmp;

	eth = (struct ethhdr *)data;
	data += sizeof(struct ethhdr);

	/* See comment on IGMP in bond_xmit_roundrobin_slave_get() */
	if (eth->h_proto == htons(ETH_P_IP)) {
		const struct iphdr *iph;

		if (data + sizeof(struct iphdr) > xdp->data_end)
			goto non_igmp;

		iph = (struct iphdr *)data;

		if (iph->protocol == IPPROTO_IGMP) {
			slave = rcu_dereference(bond->curr_active_slave);
			if (slave)
				return slave;
			return bond_get_slave_by_id(bond, 0);
		}
	}

non_igmp:
	slave_cnt = READ_ONCE(bond->slave_cnt);
	if (unlikely(!slave_cnt))
		return NULL;
	slave_id = bond_rr_gen_slave_id(bond) % slave_cnt;
	return bond_get_slave_by_id(bond, slave_id);
}

4578 4579 4580 4581 4582 4583 4584
static netdev_tx_t bond_xmit_roundrobin(struct sk_buff *skb,
					struct net_device *bond_dev)
{
	struct bonding *bond = netdev_priv(bond_dev);
	struct slave *slave;

	slave = bond_xmit_roundrobin_slave_get(bond, skb);
4585 4586 4587
	if (likely(slave))
		return bond_dev_queue_xmit(bond, skb, slave->dev);

4588
	return bond_tx_drop(bond_dev, skb);
L
Linus Torvalds 已提交
4589 4590
}

4591
static struct slave *bond_xmit_activebackup_slave_get(struct bonding *bond)
4592 4593 4594 4595
{
	return rcu_dereference(bond->curr_active_slave);
}

4596
/* In active-backup mode, we know that bond->curr_active_slave is always valid if
L
Linus Torvalds 已提交
4597 4598
 * the bond has a usable interface.
 */
4599 4600
static netdev_tx_t bond_xmit_activebackup(struct sk_buff *skb,
					  struct net_device *bond_dev)
L
Linus Torvalds 已提交
4601
{
4602
	struct bonding *bond = netdev_priv(bond_dev);
4603
	struct slave *slave;
L
Linus Torvalds 已提交
4604

4605
	slave = bond_xmit_activebackup_slave_get(bond);
4606
	if (slave)
4607
		return bond_dev_queue_xmit(bond, skb, slave->dev);
4608

4609
	return bond_tx_drop(bond_dev, skb);
L
Linus Torvalds 已提交
4610 4611
}

4612 4613 4614
/* Use this to update slave_array when (a) it's not appropriate to update
 * slave_array right away (note that update_slave_array() may sleep)
 * and / or (b) RTNL is not held.
L
Linus Torvalds 已提交
4615
 */
4616
void bond_slave_arr_work_rearm(struct bonding *bond, unsigned long delay)
L
Linus Torvalds 已提交
4617
{
4618 4619
	queue_delayed_work(bond->wq, &bond->slave_arr_work, delay);
}
L
Linus Torvalds 已提交
4620

4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642
/* Slave array work handler. Holds only RTNL */
static void bond_slave_arr_handler(struct work_struct *work)
{
	struct bonding *bond = container_of(work, struct bonding,
					    slave_arr_work.work);
	int ret;

	if (rtnl_trylock()) {
		ret = bond_update_slave_arr(bond, NULL);
		rtnl_unlock();
		if (!ret)
			return;
		pr_warn_ratelimited("Failed to update slave array from WT\n");
	}
	/* RTNL was contended or the update failed: retry shortly */
	bond_slave_arr_work_rearm(bond, 1);
}

4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665
static void bond_skip_slave(struct bond_up_slave *slaves,
			    struct slave *skipslave)
{
	int idx;

	/* Rare situation where caller has asked to skip a specific
	 * slave but allocation failed (most likely!). BTW this is
	 * only possible when the call is initiated from
	 * __bond_release_one(). In this situation; overwrite the
	 * skipslave entry in the array with the last entry from the
	 * array to avoid a situation where the xmit path may choose
	 * this to-be-skipped slave to send a packet out.
	 */
	if (!slaves)
		return;

	for (idx = 0; idx < slaves->count; idx++) {
		if (skipslave != slaves->arr[idx])
			continue;
		/* replace with the last entry and shrink the array */
		slaves->arr[idx] = slaves->arr[slaves->count - 1];
		slaves->count--;
		break;
	}
}

M
Maor Gottlieb 已提交
4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697
/* Publish new usable/all slave arrays and RCU-free the previous ones */
static void bond_set_slave_arr(struct bonding *bond,
			       struct bond_up_slave *usable_slaves,
			       struct bond_up_slave *all_slaves)
{
	struct bond_up_slave *old;

	old = rtnl_dereference(bond->usable_slaves);
	rcu_assign_pointer(bond->usable_slaves, usable_slaves);
	kfree_rcu(old, rcu);

	old = rtnl_dereference(bond->all_slaves);
	rcu_assign_pointer(bond->all_slaves, all_slaves);
	kfree_rcu(old, rcu);
}

/* Drop both slave arrays; freeing is deferred past an RCU grace period */
static void bond_reset_slave_arr(struct bonding *bond)
{
	struct bond_up_slave *arr;

	arr = rtnl_dereference(bond->usable_slaves);
	if (arr) {
		RCU_INIT_POINTER(bond->usable_slaves, NULL);
		kfree_rcu(arr, rcu);
	}

	arr = rtnl_dereference(bond->all_slaves);
	if (arr) {
		RCU_INIT_POINTER(bond->all_slaves, NULL);
		kfree_rcu(arr, rcu);
	}
}

4698 4699 4700 4701
/* Build the usable slaves array in control path for modes that use xmit-hash
 * to determine the slave interface -
 * (a) BOND_MODE_8023AD
 * (b) BOND_MODE_XOR
4702
 * (c) (BOND_MODE_TLB || BOND_MODE_ALB) && tlb_dynamic_lb == 0
4703 4704 4705 4706 4707
 *
 * The caller is expected to hold RTNL only and NO other lock!
 */
int bond_update_slave_arr(struct bonding *bond, struct slave *skipslave)
{
M
Maor Gottlieb 已提交
4708
	struct bond_up_slave *usable_slaves = NULL, *all_slaves = NULL;
4709 4710 4711 4712 4713
	struct slave *slave;
	struct list_head *iter;
	int agg_id = 0;
	int ret = 0;

4714
	might_sleep();
4715

4716 4717
	usable_slaves = kzalloc(struct_size(usable_slaves, arr,
					    bond->slave_cnt), GFP_KERNEL);
M
Maor Gottlieb 已提交
4718 4719 4720
	all_slaves = kzalloc(struct_size(all_slaves, arr,
					 bond->slave_cnt), GFP_KERNEL);
	if (!usable_slaves || !all_slaves) {
4721 4722 4723 4724 4725 4726
		ret = -ENOMEM;
		goto out;
	}
	if (BOND_MODE(bond) == BOND_MODE_8023AD) {
		struct ad_info ad_info;

4727
		spin_lock_bh(&bond->mode_lock);
4728
		if (bond_3ad_get_active_agg_info(bond, &ad_info)) {
4729
			spin_unlock_bh(&bond->mode_lock);
4730 4731 4732 4733
			pr_debug("bond_3ad_get_active_agg_info failed\n");
			/* No active aggragator means it's not safe to use
			 * the previous array.
			 */
M
Maor Gottlieb 已提交
4734
			bond_reset_slave_arr(bond);
4735 4736
			goto out;
		}
4737
		spin_unlock_bh(&bond->mode_lock);
4738 4739 4740
		agg_id = ad_info.aggregator_id;
	}
	bond_for_each_slave(bond, slave, iter) {
M
Maor Gottlieb 已提交
4741 4742 4743 4744
		if (skipslave == slave)
			continue;

		all_slaves->arr[all_slaves->count++] = slave;
4745 4746 4747 4748 4749 4750 4751 4752 4753
		if (BOND_MODE(bond) == BOND_MODE_8023AD) {
			struct aggregator *agg;

			agg = SLAVE_AD_INFO(slave)->port.aggregator;
			if (!agg || agg->aggregator_identifier != agg_id)
				continue;
		}
		if (!bond_slave_can_tx(slave))
			continue;
4754

4755
		slave_dbg(bond->dev, slave->dev, "Adding slave to tx hash array[%d]\n",
4756
			  usable_slaves->count);
4757

4758
		usable_slaves->arr[usable_slaves->count++] = slave;
4759 4760
	}

M
Maor Gottlieb 已提交
4761 4762
	bond_set_slave_arr(bond, usable_slaves, all_slaves);
	return ret;
4763 4764
out:
	if (ret != 0 && skipslave) {
M
Maor Gottlieb 已提交
4765 4766
		bond_skip_slave(rtnl_dereference(bond->all_slaves),
				skipslave);
4767 4768
		bond_skip_slave(rtnl_dereference(bond->usable_slaves),
				skipslave);
4769
	}
M
Maor Gottlieb 已提交
4770 4771
	kfree_rcu(all_slaves, rcu);
	kfree_rcu(usable_slaves, rcu);
4772

4773 4774 4775
	return ret;
}

4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792
/* Pick a slave from @slaves by hashing the skb (3ad/xor modes) */
static struct slave *bond_xmit_3ad_xor_slave_get(struct bonding *bond,
						 struct sk_buff *skb,
						 struct bond_up_slave *slaves)
{
	unsigned int count;
	u32 hash;

	hash = bond_xmit_hash(bond, skb);
	count = slaves ? READ_ONCE(slaves->count) : 0;
	if (unlikely(!count))
		return NULL;

	return slaves->arr[hash % count];
}

4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808
/* XDP variant of bond_xmit_3ad_xor_slave_get(); hashes the xdp_buff */
static struct slave *bond_xdp_xmit_3ad_xor_slave_get(struct bonding *bond,
						     struct xdp_buff *xdp)
{
	struct bond_up_slave *slaves;
	unsigned int count;
	u32 hash;

	hash = bond_xmit_hash_xdp(bond, xdp);
	slaves = rcu_dereference(bond->usable_slaves);
	count = slaves ? READ_ONCE(slaves->count) : 0;
	if (unlikely(!count))
		return NULL;

	return slaves->arr[hash % count];
}

4809 4810 4811 4812
/* Use this Xmit function for 3AD as well as XOR modes. The current
 * usable slave array is formed in the control path. The xmit function
 * just calculates hash and sends the packet out.
 */
4813 4814
static netdev_tx_t bond_3ad_xor_xmit(struct sk_buff *skb,
				     struct net_device *dev)
4815 4816 4817
{
	struct bonding *bond = netdev_priv(dev);
	struct bond_up_slave *slaves;
4818
	struct slave *slave;
4819

4820
	slaves = rcu_dereference(bond->usable_slaves);
4821 4822
	slave = bond_xmit_3ad_xor_slave_get(bond, skb, slaves);
	if (likely(slave))
4823
		return bond_dev_queue_xmit(bond, skb, slave->dev);
4824

4825
	return bond_tx_drop(dev, skb);
L
Linus Torvalds 已提交
4826 4827
}

4828
/* in broadcast mode, we send everything to all usable interfaces. */
4829 4830
static netdev_tx_t bond_xmit_broadcast(struct sk_buff *skb,
				       struct net_device *bond_dev)
L
Linus Torvalds 已提交
4831
{
4832
	struct bonding *bond = netdev_priv(bond_dev);
4833
	struct slave *slave = NULL;
4834
	struct list_head *iter;
L
Linus Torvalds 已提交
4835

4836
	bond_for_each_slave_rcu(bond, slave, iter) {
4837 4838
		if (bond_is_last_slave(bond, slave))
			break;
4839
		if (bond_slave_is_up(slave) && slave->link == BOND_LINK_UP) {
4840
			struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
L
Linus Torvalds 已提交
4841

4842
			if (!skb2) {
4843 4844
				net_err_ratelimited("%s: Error: %s: skb_clone() failed\n",
						    bond_dev->name, __func__);
4845
				continue;
L
Linus Torvalds 已提交
4846
			}
4847
			bond_dev_queue_xmit(bond, skb2, slave->dev);
L
Linus Torvalds 已提交
4848 4849
		}
	}
4850
	if (slave && bond_slave_is_up(slave) && slave->link == BOND_LINK_UP)
4851
		return bond_dev_queue_xmit(bond, skb, slave->dev);
S
Stephen Hemminger 已提交
4852

4853
	return bond_tx_drop(bond_dev, skb);
L
Linus Torvalds 已提交
4854 4855 4856 4857
}

/*------------------------- Device initialization ---------------------------*/

4858
/* Lookup the slave that corresponds to a qid */
4859 4860 4861 4862
static inline int bond_slave_override(struct bonding *bond,
				      struct sk_buff *skb)
{
	struct slave *slave = NULL;
4863
	struct list_head *iter;
4864

4865
	if (!skb_rx_queue_recorded(skb))
4866
		return 1;
4867 4868

	/* Find out if any slaves have the same mapping as this skb. */
4869
	bond_for_each_slave_rcu(bond, slave, iter) {
4870
		if (slave->queue_id == skb_get_queue_mapping(skb)) {
4871 4872
			if (bond_slave_is_up(slave) &&
			    slave->link == BOND_LINK_UP) {
4873 4874 4875 4876
				bond_dev_queue_xmit(bond, skb, slave->dev);
				return 0;
			}
			/* If the slave isn't UP, use default transmit policy. */
4877 4878 4879 4880
			break;
		}
	}

4881
	return 1;
4882 4883
}

4884

4885
static u16 bond_select_queue(struct net_device *dev, struct sk_buff *skb,
4886
			     struct net_device *sb_dev)
4887
{
4888
	/* This helper function exists to help dev_pick_tx get the correct
P
Phil Oester 已提交
4889
	 * destination queue.  Using a helper function skips a call to
4890 4891 4892
	 * skb_tx_hash and will put the skbs in the queue we expect on their
	 * way down to the bonding driver.
	 */
P
Phil Oester 已提交
4893 4894
	u16 txq = skb_rx_queue_recorded(skb) ? skb_get_rx_queue(skb) : 0;

4895
	/* Save the original txq to restore before passing to the driver */
4896
	qdisc_skb_cb(skb)->slave_dev_queue_mapping = skb_get_queue_mapping(skb);
4897

P
Phil Oester 已提交
4898
	if (unlikely(txq >= dev->real_num_tx_queues)) {
4899
		do {
P
Phil Oester 已提交
4900
			txq -= dev->real_num_tx_queues;
4901
		} while (txq >= dev->real_num_tx_queues);
P
Phil Oester 已提交
4902 4903
	}
	return txq;
4904 4905
}

4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918
/* ndo_get_xmit_slave: report which slave device would carry @skb */
static struct net_device *bond_xmit_get_slave(struct net_device *master_dev,
					      struct sk_buff *skb,
					      bool all_slaves)
{
	struct bonding *bond = netdev_priv(master_dev);
	struct bond_up_slave *slaves;
	struct slave *slave = NULL;

	switch (BOND_MODE(bond)) {
	case BOND_MODE_ROUNDROBIN:
		slave = bond_xmit_roundrobin_slave_get(bond, skb);
		break;
	case BOND_MODE_ACTIVEBACKUP:
		slave = bond_xmit_activebackup_slave_get(bond);
		break;
	case BOND_MODE_8023AD:
	case BOND_MODE_XOR:
		slaves = all_slaves ? rcu_dereference(bond->all_slaves) :
				      rcu_dereference(bond->usable_slaves);
		slave = bond_xmit_3ad_xor_slave_get(bond, skb, slaves);
		break;
	case BOND_MODE_BROADCAST:
		/* broadcast has no single xmit slave */
		break;
	case BOND_MODE_ALB:
		slave = bond_xmit_alb_slave_get(bond, skb);
		break;
	case BOND_MODE_TLB:
		slave = bond_xmit_tlb_slave_get(bond, skb);
		break;
	default:
		/* Should never happen, mode already checked */
		WARN_ONCE(true, "Unknown bonding mode");
		break;
	}

	return slave ? slave->dev : NULL;
}

4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026
/* Fill @flow's address and port fields from an established socket @sk */
static void bond_sk_to_flow(struct sock *sk, struct flow_keys *flow)
{
	switch (sk->sk_family) {
#if IS_ENABLED(CONFIG_IPV6)
	case AF_INET6:
		if (sk->sk_ipv6only ||
		    ipv6_addr_type(&sk->sk_v6_daddr) != IPV6_ADDR_MAPPED) {
			flow->control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
			flow->addrs.v6addrs.src = inet6_sk(sk)->saddr;
			flow->addrs.v6addrs.dst = sk->sk_v6_daddr;
			break;
		}
		/* v4-mapped v6 sockets fall back to the IPv4 fields */
		fallthrough;
#endif
	default: /* AF_INET */
		flow->control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
		flow->addrs.v4addrs.src = inet_sk(sk)->inet_rcv_saddr;
		flow->addrs.v4addrs.dst = inet_sk(sk)->inet_daddr;
		break;
	}

	flow->ports.src = inet_sk(sk)->inet_sport;
	flow->ports.dst = inet_sk(sk)->inet_dport;
}

/**
 * bond_sk_hash_l34 - generate a hash value based on the socket's L3 and L4 fields
 * @sk: socket to use for headers
 *
 * This function will extract the necessary field from the socket and use
 * them to generate a hash based on the LAYER34 xmit_policy.
 * Assumes that sk is a TCP or UDP socket.
 */
static u32 bond_sk_hash_l34(struct sock *sk)
{
	struct flow_keys flow;
	u32 port_hash;

	bond_sk_to_flow(sk, &flow);

	/* seed with the L4 ports, then mix in the L3 addresses */
	memcpy(&port_hash, &flow.ports.ports, sizeof(port_hash));
	return bond_ip_hash(port_hash, &flow);
}

/* Hash @sk onto one of the usable slaves; NULL if the array is empty */
static struct net_device *__bond_sk_get_lower_dev(struct bonding *bond,
						  struct sock *sk)
{
	struct bond_up_slave *slaves;
	unsigned int count;

	slaves = rcu_dereference(bond->usable_slaves);
	count = slaves ? READ_ONCE(slaves->count) : 0;
	if (unlikely(!count))
		return NULL;

	return slaves->arr[bond_sk_hash_l34(sk) % count]->dev;
}

/* ndo_sk_get_lower_dev: map a socket to its lower slave device */
static struct net_device *bond_sk_get_lower_dev(struct net_device *dev,
						struct sock *sk)
{
	struct bonding *bond = netdev_priv(dev);
	struct net_device *lower = NULL;

	rcu_read_lock();
	/* only meaningful for modes that maintain the usable array */
	if (bond_sk_check(bond))
		lower = __bond_sk_get_lower_dev(bond, sk);
	rcu_read_unlock();

	return lower;
}

5027 5028 5029 5030 5031 5032 5033 5034 5035 5036
#if IS_ENABLED(CONFIG_TLS_DEVICE)
/* Steer a TLS-offloaded skb straight to the device holding its TLS context */
static netdev_tx_t bond_tls_device_xmit(struct bonding *bond, struct sk_buff *skb,
					struct net_device *dev)
{
	struct net_device *tls_netdev = tls_get_ctx(skb->sk)->netdev;

	/* only transmit if that device is still one of our slaves */
	if (likely(bond_get_slave_by_dev(bond, tls_netdev)))
		return bond_dev_queue_xmit(bond, skb, tls_netdev);
	return bond_tx_drop(dev, skb);
}
#endif

5037
static netdev_tx_t __bond_start_xmit(struct sk_buff *skb, struct net_device *dev)
5038
{
5039 5040
	struct bonding *bond = netdev_priv(dev);

5041 5042 5043
	if (bond_should_override_tx_queue(bond) &&
	    !bond_slave_override(bond, skb))
		return NETDEV_TX_OK;
5044

5045 5046 5047 5048 5049
#if IS_ENABLED(CONFIG_TLS_DEVICE)
	if (skb->sk && tls_is_sk_tx_device_offloaded(skb->sk))
		return bond_tls_device_xmit(bond, skb, dev);
#endif

5050
	switch (BOND_MODE(bond)) {
5051 5052 5053 5054
	case BOND_MODE_ROUNDROBIN:
		return bond_xmit_roundrobin(skb, dev);
	case BOND_MODE_ACTIVEBACKUP:
		return bond_xmit_activebackup(skb, dev);
5055
	case BOND_MODE_8023AD:
5056
	case BOND_MODE_XOR:
5057
		return bond_3ad_xor_xmit(skb, dev);
5058 5059 5060 5061
	case BOND_MODE_BROADCAST:
		return bond_xmit_broadcast(skb, dev);
	case BOND_MODE_ALB:
		return bond_alb_xmit(skb, dev);
5062 5063
	case BOND_MODE_TLB:
		return bond_tlb_xmit(skb, dev);
5064 5065
	default:
		/* Should never happen, mode already checked */
5066
		netdev_err(dev, "Unknown bonding mode %d\n", BOND_MODE(bond));
5067
		WARN_ON_ONCE(1);
5068
		return bond_tx_drop(dev, skb);
5069 5070 5071
	}
}

5072 5073 5074 5075 5076
static netdev_tx_t bond_start_xmit(struct sk_buff *skb, struct net_device *dev)
{
	struct bonding *bond = netdev_priv(dev);
	netdev_tx_t ret = NETDEV_TX_OK;

5077
	/* If we risk deadlock from transmitting this in the
5078 5079
	 * netpoll path, tell netpoll to queue the frame for later tx
	 */
5080
	if (unlikely(is_netpoll_tx_blocked(dev)))
5081 5082
		return NETDEV_TX_BUSY;

5083
	rcu_read_lock();
5084
	if (bond_has_slaves(bond))
5085 5086
		ret = __bond_start_xmit(skb, dev);
	else
5087
		ret = bond_tx_drop(dev, skb);
5088
	rcu_read_unlock();
5089 5090 5091

	return ret;
}
5092

5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260
/* XDP counterpart of bond_xmit_get_slave(); caller must hold rcu_read_lock() */
static struct net_device *
bond_xdp_get_xmit_slave(struct net_device *bond_dev, struct xdp_buff *xdp)
{
	struct bonding *bond = netdev_priv(bond_dev);
	struct slave *slave;

	switch (BOND_MODE(bond)) {
	case BOND_MODE_ROUNDROBIN:
		slave = bond_xdp_xmit_roundrobin_slave_get(bond, xdp);
		break;

	case BOND_MODE_ACTIVEBACKUP:
		slave = bond_xmit_activebackup_slave_get(bond);
		break;

	case BOND_MODE_8023AD:
	case BOND_MODE_XOR:
		slave = bond_xdp_xmit_3ad_xor_slave_get(bond, xdp);
		break;

	default:
		/* Should never happen. Mode guarded by bond_xdp_check() */
		netdev_err(bond_dev, "Unknown bonding mode %d for xdp xmit\n", BOND_MODE(bond));
		WARN_ON_ONCE(1);
		return NULL;
	}

	return slave ? slave->dev : NULL;
}

/* ndo_xdp_xmit: forward each frame through its per-frame selected slave */
static int bond_xdp_xmit(struct net_device *bond_dev,
			 int n, struct xdp_frame **frames, u32 flags)
{
	int nxmit, err = -ENXIO;

	rcu_read_lock();

	for (nxmit = 0; nxmit < n; nxmit++) {
		struct xdp_frame *frame = frames[nxmit];
		struct xdp_frame *frames1[] = {frame};
		struct net_device *slave_dev;
		struct xdp_buff xdp;

		/* rebuild an xdp_buff so the slave selector can parse headers */
		xdp_convert_frame_to_buff(frame, &xdp);

		slave_dev = bond_xdp_get_xmit_slave(bond_dev, &xdp);
		if (!slave_dev) {
			err = -ENXIO;
			break;
		}

		err = slave_dev->netdev_ops->ndo_xdp_xmit(slave_dev, 1, frames1, flags);
		if (err < 1)
			break;
	}

	rcu_read_unlock();

	/* If error happened on the first frame then we can pass the error up, otherwise
	 * report the number of frames that were xmitted.
	 */
	if (err < 0)
		return (nxmit == 0 ? err : nxmit);

	return nxmit;
}

/* Install (or remove, when @prog is NULL) an XDP program on the bond and
 * propagate it to every slave.  On any per-slave failure the change is
 * rolled back on the slaves already updated.  Called under RTNL.
 */
static int bond_xdp_set(struct net_device *dev, struct bpf_prog *prog,
			struct netlink_ext_ack *extack)
{
	struct bonding *bond = netdev_priv(dev);
	struct list_head *iter;
	struct slave *slave, *rollback_slave;
	struct bpf_prog *old_prog;
	struct netdev_bpf xdp = {
		.command = XDP_SETUP_PROG,
		.flags   = 0,
		.prog    = prog,
		.extack  = extack,
	};
	int err;

	ASSERT_RTNL();

	/* only some bonding modes support XDP */
	if (!bond_xdp_check(bond))
		return -EOPNOTSUPP;

	/* publish the new prog on the master before touching the slaves */
	old_prog = bond->xdp_prog;
	bond->xdp_prog = prog;

	bond_for_each_slave(bond, slave, iter) {
		struct net_device *slave_dev = slave->dev;

		if (!slave_dev->netdev_ops->ndo_bpf ||
		    !slave_dev->netdev_ops->ndo_xdp_xmit) {
			NL_SET_ERR_MSG(extack, "Slave device does not support XDP");
			slave_err(dev, slave_dev, "Slave does not support XDP\n");
			err = -EOPNOTSUPP;
			goto err;
		}

		/* refuse slaves that already carry their own XDP program */
		if (dev_xdp_prog_count(slave_dev) > 0) {
			NL_SET_ERR_MSG(extack,
				       "Slave has XDP program loaded, please unload before enslaving");
			slave_err(dev, slave_dev,
				  "Slave has XDP program loaded, please unload before enslaving\n");
			err = -EOPNOTSUPP;
			goto err;
		}

		err = slave_dev->netdev_ops->ndo_bpf(slave_dev, &xdp);
		if (err < 0) {
			/* ndo_bpf() sets extack error message */
			slave_err(dev, slave_dev, "Error %d calling ndo_bpf\n", err);
			goto err;
		}
		/* each slave holds its own reference on the program */
		if (prog)
			bpf_prog_inc(prog);
	}

	if (old_prog)
		bpf_prog_put(old_prog);

	/* toggle the fast-path redirect key to match install/remove */
	if (prog)
		static_branch_inc(&bpf_master_redirect_enabled_key);
	else
		static_branch_dec(&bpf_master_redirect_enabled_key);

	return 0;

err:
	/* unwind the program changes */
	bond->xdp_prog = old_prog;
	xdp.prog = old_prog;
	xdp.extack = NULL; /* do not overwrite original error */

	/* restore old_prog on every slave updated before the failure */
	bond_for_each_slave(bond, rollback_slave, iter) {
		struct net_device *slave_dev = rollback_slave->dev;
		int err_unwind;

		if (slave == rollback_slave)
			break;

		err_unwind = slave_dev->netdev_ops->ndo_bpf(slave_dev, &xdp);
		if (err_unwind < 0)
			slave_err(dev, slave_dev,
				  "Error %d when unwinding XDP program change\n", err_unwind);
		else if (xdp.prog)
			bpf_prog_inc(xdp.prog);
	}
	return err;
}

/* ndo_bpf entry point: only XDP program setup is supported */
static int bond_xdp(struct net_device *dev, struct netdev_bpf *xdp)
{
	if (xdp->command == XDP_SETUP_PROG)
		return bond_xdp_set(dev, xdp->prog, xdp->extack);
	return -EINVAL;
}

5261 5262 5263 5264 5265 5266 5267 5268 5269 5270
/* Broadcast mode bandwidth is bounded by the slowest participating slave */
static u32 bond_mode_bcast_speed(struct slave *slave, u32 speed)
{
	if (speed == 0 || speed == SPEED_UNKNOWN)
		return slave->speed;

	return min(speed, slave->speed);
}

5271 5272
static int bond_ethtool_get_link_ksettings(struct net_device *bond_dev,
					   struct ethtool_link_ksettings *cmd)
5273 5274
{
	struct bonding *bond = netdev_priv(bond_dev);
5275
	struct list_head *iter;
5276
	struct slave *slave;
5277
	u32 speed = 0;
5278

5279 5280
	cmd->base.duplex = DUPLEX_UNKNOWN;
	cmd->base.port = PORT_OTHER;
5281

5282
	/* Since bond_slave_can_tx returns false for all inactive or down slaves, we
5283 5284 5285 5286
	 * do not need to check mode.  Though link speed might not represent
	 * the true receive or transmit bandwidth (not all modes are symmetric)
	 * this is an accurate maximum.
	 */
5287
	bond_for_each_slave(bond, slave, iter) {
5288
		if (bond_slave_can_tx(slave)) {
5289 5290 5291 5292 5293 5294 5295
			if (slave->speed != SPEED_UNKNOWN) {
				if (BOND_MODE(bond) == BOND_MODE_BROADCAST)
					speed = bond_mode_bcast_speed(slave,
								      speed);
				else
					speed += slave->speed;
			}
5296
			if (cmd->base.duplex == DUPLEX_UNKNOWN &&
5297
			    slave->duplex != DUPLEX_UNKNOWN)
5298
				cmd->base.duplex = slave->duplex;
5299 5300
		}
	}
5301
	cmd->base.speed = speed ? : SPEED_UNKNOWN;
5302

5303 5304 5305
	return 0;
}

5306
static void bond_ethtool_get_drvinfo(struct net_device *bond_dev,
5307
				     struct ethtool_drvinfo *drvinfo)
5308
{
5309 5310 5311
	strlcpy(drvinfo->driver, DRV_NAME, sizeof(drvinfo->driver));
	snprintf(drvinfo->fw_version, sizeof(drvinfo->fw_version), "%d",
		 BOND_ABI_VERSION);
5312 5313
}

5314
static const struct ethtool_ops bond_ethtool_ops = {
5315
	.get_drvinfo		= bond_ethtool_get_drvinfo,
5316
	.get_link		= ethtool_op_get_link,
5317
	.get_link_ksettings	= bond_ethtool_get_link_ksettings,
5318 5319
};

5320
static const struct net_device_ops bond_netdev_ops = {
5321
	.ndo_init		= bond_init,
S
Stephen Hemminger 已提交
5322
	.ndo_uninit		= bond_uninit,
5323 5324
	.ndo_open		= bond_open,
	.ndo_stop		= bond_close,
5325
	.ndo_start_xmit		= bond_start_xmit,
5326
	.ndo_select_queue	= bond_select_queue,
5327
	.ndo_get_stats64	= bond_get_stats,
5328
	.ndo_eth_ioctl		= bond_eth_ioctl,
5329
	.ndo_siocbond		= bond_do_ioctl,
A
Arnd Bergmann 已提交
5330
	.ndo_siocdevprivate	= bond_siocdevprivate,
5331
	.ndo_change_rx_flags	= bond_change_rx_flags,
5332
	.ndo_set_rx_mode	= bond_set_rx_mode,
5333
	.ndo_change_mtu		= bond_change_mtu,
J
Jiri Pirko 已提交
5334
	.ndo_set_mac_address	= bond_set_mac_address,
5335
	.ndo_neigh_setup	= bond_neigh_setup,
J
Jiri Pirko 已提交
5336
	.ndo_vlan_rx_add_vid	= bond_vlan_rx_add_vid,
5337
	.ndo_vlan_rx_kill_vid	= bond_vlan_rx_kill_vid,
5338
#ifdef CONFIG_NET_POLL_CONTROLLER
5339
	.ndo_netpoll_setup	= bond_netpoll_setup,
5340 5341 5342
	.ndo_netpoll_cleanup	= bond_netpoll_cleanup,
	.ndo_poll_controller	= bond_poll_controller,
#endif
J
Jiri Pirko 已提交
5343 5344
	.ndo_add_slave		= bond_enslave,
	.ndo_del_slave		= bond_release,
5345
	.ndo_fix_features	= bond_fix_features,
5346
	.ndo_features_check	= passthru_features_check,
5347
	.ndo_get_xmit_slave	= bond_xmit_get_slave,
5348
	.ndo_sk_get_lower_dev	= bond_sk_get_lower_dev,
5349 5350 5351
	.ndo_bpf		= bond_xdp,
	.ndo_xdp_xmit           = bond_xdp_xmit,
	.ndo_xdp_get_xmit_slave = bond_xdp_get_xmit_slave,
5352 5353
};

5354 5355 5356 5357
/* sysfs device type for bond master devices */
static const struct device_type bond_type = {
	.name = "bond",
};

5358 5359 5360
static void bond_destructor(struct net_device *bond_dev)
{
	struct bonding *bond = netdev_priv(bond_dev);
5361

5362 5363
	if (bond->wq)
		destroy_workqueue(bond->wq);
5364 5365 5366

	if (bond->rr_tx_counter)
		free_percpu(bond->rr_tx_counter);
5367 5368
}

5369
void bond_setup(struct net_device *bond_dev)
L
Linus Torvalds 已提交
5370
{
5371
	struct bonding *bond = netdev_priv(bond_dev);
L
Linus Torvalds 已提交
5372

5373
	spin_lock_init(&bond->mode_lock);
5374
	bond->params = bonding_defaults;
L
Linus Torvalds 已提交
5375 5376 5377 5378 5379

	/* Initialize pointers */
	bond->dev = bond_dev;

	/* Initialize the device entry points */
5380
	ether_setup(bond_dev);
W
WANG Cong 已提交
5381
	bond_dev->max_mtu = ETH_MAX_MTU;
5382
	bond_dev->netdev_ops = &bond_netdev_ops;
5383
	bond_dev->ethtool_ops = &bond_ethtool_ops;
L
Linus Torvalds 已提交
5384

5385 5386
	bond_dev->needs_free_netdev = true;
	bond_dev->priv_destructor = bond_destructor;
L
Linus Torvalds 已提交
5387

5388 5389
	SET_NETDEV_DEVTYPE(bond_dev, &bond_type);

L
Linus Torvalds 已提交
5390
	/* Initialize the device options */
5391
	bond_dev->flags |= IFF_MASTER;
5392
	bond_dev->priv_flags |= IFF_BONDING | IFF_UNICAST_FLT | IFF_NO_QUEUE;
5393
	bond_dev->priv_flags &= ~(IFF_XMIT_DST_RELEASE | IFF_TX_SKB_SHARING);
5394

5395 5396
#ifdef CONFIG_XFRM_OFFLOAD
	/* set up xfrm device ops (only supported in active-backup right now) */
5397
	bond_dev->xfrmdev_ops = &bond_xfrmdev_ops;
5398 5399
	INIT_LIST_HEAD(&bond->ipsec_list);
	spin_lock_init(&bond->ipsec_lock);
5400 5401
#endif /* CONFIG_XFRM_OFFLOAD */

5402
	/* don't acquire bond device's netif_tx_lock when transmitting */
L
Linus Torvalds 已提交
5403 5404 5405 5406 5407 5408 5409 5410 5411
	bond_dev->features |= NETIF_F_LLTX;

	/* By default, we declare the bond to be fully
	 * VLAN hardware accelerated capable. Special
	 * care is taken in the various xmit functions
	 * when there are slaves that are not hw accel
	 * capable
	 */

5412 5413 5414
	/* Don't allow bond devices to change network namespaces. */
	bond_dev->features |= NETIF_F_NETNS_LOCAL;

5415
	bond_dev->hw_features = BOND_VLAN_FEATURES |
5416 5417
				NETIF_F_HW_VLAN_CTAG_RX |
				NETIF_F_HW_VLAN_CTAG_FILTER;
5418

5419
	bond_dev->hw_features |= NETIF_F_GSO_ENCAP_ALL;
5420
	bond_dev->features |= bond_dev->hw_features;
5421
	bond_dev->features |= NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_STAG_TX;
5422
#ifdef CONFIG_XFRM_OFFLOAD
5423 5424 5425 5426
	bond_dev->hw_features |= BOND_XFRM_FEATURES;
	/* Only enable XFRM features if this is an active-backup config */
	if (BOND_MODE(bond) == BOND_MODE_ACTIVEBACKUP)
		bond_dev->features |= BOND_XFRM_FEATURES;
5427
#endif /* CONFIG_XFRM_OFFLOAD */
5428 5429 5430 5431
#if IS_ENABLED(CONFIG_TLS_DEVICE)
	if (bond_sk_check(bond))
		bond_dev->features |= BOND_TLS_FEATURES;
#endif
L
Linus Torvalds 已提交
5432 5433
}

5434 5435 5436
/* Destroy a bonding device.
 * Must be under rtnl_lock when this function is called.
 */
static void bond_uninit(struct net_device *bond_dev)
{
	struct bonding *bond = netdev_priv(bond_dev);
	struct bond_up_slave *usable, *all;
	struct list_head *iter;
	struct slave *slave;

	bond_netpoll_cleanup(bond_dev);

	/* Release the bonded slaves */
	bond_for_each_slave(bond, slave, iter)
		__bond_release_one(bond_dev, slave->dev, true, true);
	netdev_info(bond_dev, "Released all slaves\n");

	/* Detach the slave arrays under rtnl, then free them after a grace
	 * period so concurrent RCU readers of the old arrays stay safe.
	 */
	usable = rtnl_dereference(bond->usable_slaves);
	if (usable) {
		RCU_INIT_POINTER(bond->usable_slaves, NULL);
		kfree_rcu(usable, rcu);
	}

	all = rtnl_dereference(bond->all_slaves);
	if (all) {
		RCU_INIT_POINTER(bond->all_slaves, NULL);
		kfree_rcu(all, rcu);
	}

	/* Remove this bond from its per-netns device list (bn->dev_list). */
	list_del(&bond->bond_list);

	bond_debug_unregister(bond);
}

L
Linus Torvalds 已提交
5468 5469 5470 5471
/*------------------------- Module initialization ---------------------------*/

static int bond_check_params(struct bond_params *params)
{
5472
	int arp_validate_value, fail_over_mac_value, primary_reselect_value, i;
5473 5474
	struct bond_opt_value newval;
	const struct bond_opt_value *valptr;
5475
	int arp_all_targets_value = 0;
5476
	u16 ad_actor_sys_prio = 0;
5477
	u16 ad_user_port_key = 0;
5478
	__be32 arp_target[BOND_MAX_ARP_TARGETS] = { 0 };
5479 5480 5481 5482
	int arp_ip_count;
	int bond_mode	= BOND_MODE_ROUNDROBIN;
	int xmit_hashtype = BOND_XMIT_POLICY_LAYER2;
	int lacp_fast = 0;
5483
	int tlb_dynamic_lb;
5484

5485
	/* Convert string parameters. */
L
Linus Torvalds 已提交
5486
	if (mode) {
5487 5488 5489 5490
		bond_opt_initstr(&newval, mode);
		valptr = bond_opt_parse(bond_opt_get(BOND_OPT_MODE), &newval);
		if (!valptr) {
			pr_err("Error: Invalid bonding mode \"%s\"\n", mode);
L
Linus Torvalds 已提交
5491 5492
			return -EINVAL;
		}
5493
		bond_mode = valptr->value;
L
Linus Torvalds 已提交
5494 5495
	}

5496
	if (xmit_hash_policy) {
5497 5498 5499
		if (bond_mode == BOND_MODE_ROUNDROBIN ||
		    bond_mode == BOND_MODE_ACTIVEBACKUP ||
		    bond_mode == BOND_MODE_BROADCAST) {
J
Joe Perches 已提交
5500
			pr_info("xmit_hash_policy param is irrelevant in mode %s\n",
J
Joe Perches 已提交
5501
				bond_mode_name(bond_mode));
5502
		} else {
5503 5504 5505 5506
			bond_opt_initstr(&newval, xmit_hash_policy);
			valptr = bond_opt_parse(bond_opt_get(BOND_OPT_XMIT_HASH),
						&newval);
			if (!valptr) {
J
Joe Perches 已提交
5507
				pr_err("Error: Invalid xmit_hash_policy \"%s\"\n",
5508 5509 5510
				       xmit_hash_policy);
				return -EINVAL;
			}
5511
			xmit_hashtype = valptr->value;
5512 5513 5514
		}
	}

L
Linus Torvalds 已提交
5515 5516
	if (lacp_rate) {
		if (bond_mode != BOND_MODE_8023AD) {
J
Joe Perches 已提交
5517 5518
			pr_info("lacp_rate param is irrelevant in mode %s\n",
				bond_mode_name(bond_mode));
L
Linus Torvalds 已提交
5519
		} else {
5520 5521 5522 5523
			bond_opt_initstr(&newval, lacp_rate);
			valptr = bond_opt_parse(bond_opt_get(BOND_OPT_LACP_RATE),
						&newval);
			if (!valptr) {
J
Joe Perches 已提交
5524
				pr_err("Error: Invalid lacp rate \"%s\"\n",
5525
				       lacp_rate);
L
Linus Torvalds 已提交
5526 5527
				return -EINVAL;
			}
5528
			lacp_fast = valptr->value;
L
Linus Torvalds 已提交
5529 5530 5531
		}
	}

5532
	if (ad_select) {
5533
		bond_opt_initstr(&newval, ad_select);
5534 5535 5536 5537
		valptr = bond_opt_parse(bond_opt_get(BOND_OPT_AD_SELECT),
					&newval);
		if (!valptr) {
			pr_err("Error: Invalid ad_select \"%s\"\n", ad_select);
5538 5539
			return -EINVAL;
		}
5540 5541
		params->ad_select = valptr->value;
		if (bond_mode != BOND_MODE_8023AD)
5542
			pr_warn("ad_select param only affects 802.3ad mode\n");
5543 5544 5545 5546
	} else {
		params->ad_select = BOND_AD_STABLE;
	}

5547
	if (max_bonds < 0) {
5548 5549
		pr_warn("Warning: max_bonds (%d) not in range %d-%d, so it was reset to BOND_DEFAULT_MAX_BONDS (%d)\n",
			max_bonds, 0, INT_MAX, BOND_DEFAULT_MAX_BONDS);
L
Linus Torvalds 已提交
5550 5551 5552 5553
		max_bonds = BOND_DEFAULT_MAX_BONDS;
	}

	if (miimon < 0) {
5554 5555
		pr_warn("Warning: miimon module parameter (%d), not in range 0-%d, so it was reset to 0\n",
			miimon, INT_MAX);
5556
		miimon = 0;
L
Linus Torvalds 已提交
5557 5558 5559
	}

	if (updelay < 0) {
5560 5561
		pr_warn("Warning: updelay module parameter (%d), not in range 0-%d, so it was reset to 0\n",
			updelay, INT_MAX);
L
Linus Torvalds 已提交
5562 5563 5564 5565
		updelay = 0;
	}

	if (downdelay < 0) {
5566 5567
		pr_warn("Warning: downdelay module parameter (%d), not in range 0-%d, so it was reset to 0\n",
			downdelay, INT_MAX);
L
Linus Torvalds 已提交
5568 5569 5570
		downdelay = 0;
	}

5571 5572
	if ((use_carrier != 0) && (use_carrier != 1)) {
		pr_warn("Warning: use_carrier module parameter (%d), not of valid value (0/1), so it was set to 1\n",
5573
			use_carrier);
L
Linus Torvalds 已提交
5574 5575 5576
		use_carrier = 1;
	}

5577
	if (num_peer_notif < 0 || num_peer_notif > 255) {
5578 5579
		pr_warn("Warning: num_grat_arp/num_unsol_na (%d) not in range 0-255 so it was reset to 1\n",
			num_peer_notif);
5580 5581 5582
		num_peer_notif = 1;
	}

5583
	/* reset values for 802.3ad/TLB/ALB */
5584
	if (!bond_mode_uses_arp(bond_mode)) {
L
Linus Torvalds 已提交
5585
		if (!miimon) {
5586 5587
			pr_warn("Warning: miimon must be specified, otherwise bonding will not detect link failure, speed and duplex which are essential for 802.3ad operation\n");
			pr_warn("Forcing miimon to 100msec\n");
5588
			miimon = BOND_DEFAULT_MIIMON;
L
Linus Torvalds 已提交
5589 5590 5591
		}
	}

5592
	if (tx_queues < 1 || tx_queues > 255) {
5593 5594
		pr_warn("Warning: tx_queues (%d) should be between 1 and 255, resetting to %d\n",
			tx_queues, BOND_DEFAULT_TX_QUEUES);
5595 5596 5597
		tx_queues = BOND_DEFAULT_TX_QUEUES;
	}

5598
	if ((all_slaves_active != 0) && (all_slaves_active != 1)) {
5599 5600
		pr_warn("Warning: all_slaves_active module parameter (%d), not of valid value (0/1), so it was set to 0\n",
			all_slaves_active);
5601 5602 5603
		all_slaves_active = 0;
	}

5604
	if (resend_igmp < 0 || resend_igmp > 255) {
5605 5606
		pr_warn("Warning: resend_igmp (%d) should be between 0 and 255, resetting to %d\n",
			resend_igmp, BOND_DEFAULT_RESEND_IGMP);
5607 5608 5609
		resend_igmp = BOND_DEFAULT_RESEND_IGMP;
	}

5610 5611
	bond_opt_initval(&newval, packets_per_slave);
	if (!bond_opt_parse(bond_opt_get(BOND_OPT_PACKETS_PER_SLAVE), &newval)) {
5612 5613 5614 5615 5616
		pr_warn("Warning: packets_per_slave (%d) should be between 0 and %u resetting to 1\n",
			packets_per_slave, USHRT_MAX);
		packets_per_slave = 1;
	}

L
Linus Torvalds 已提交
5617
	if (bond_mode == BOND_MODE_ALB) {
J
Joe Perches 已提交
5618 5619
		pr_notice("In ALB mode you might experience client disconnections upon reconnection of a link if the bonding module updelay parameter (%d msec) is incompatible with the forwarding delay time of the switch\n",
			  updelay);
L
Linus Torvalds 已提交
5620 5621 5622 5623 5624 5625 5626
	}

	if (!miimon) {
		if (updelay || downdelay) {
			/* just warn the user the up/down delay will have
			 * no effect since miimon is zero...
			 */
5627 5628
			pr_warn("Warning: miimon module parameter not set and updelay (%d) or downdelay (%d) module parameter is set; updelay and downdelay have no effect unless miimon is set\n",
				updelay, downdelay);
L
Linus Torvalds 已提交
5629 5630 5631 5632
		}
	} else {
		/* don't allow arp monitoring */
		if (arp_interval) {
5633 5634
			pr_warn("Warning: miimon (%d) and arp_interval (%d) can't be used simultaneously, disabling ARP monitoring\n",
				miimon, arp_interval);
L
Linus Torvalds 已提交
5635 5636 5637 5638
			arp_interval = 0;
		}

		if ((updelay % miimon) != 0) {
5639 5640
			pr_warn("Warning: updelay (%d) is not a multiple of miimon (%d), updelay rounded to %d ms\n",
				updelay, miimon, (updelay / miimon) * miimon);
L
Linus Torvalds 已提交
5641 5642 5643 5644 5645
		}

		updelay /= miimon;

		if ((downdelay % miimon) != 0) {
5646 5647 5648
			pr_warn("Warning: downdelay (%d) is not a multiple of miimon (%d), downdelay rounded to %d ms\n",
				downdelay, miimon,
				(downdelay / miimon) * miimon);
L
Linus Torvalds 已提交
5649 5650 5651 5652 5653 5654
		}

		downdelay /= miimon;
	}

	if (arp_interval < 0) {
5655 5656
		pr_warn("Warning: arp_interval module parameter (%d), not in range 0-%d, so it was reset to 0\n",
			arp_interval, INT_MAX);
5657
		arp_interval = 0;
L
Linus Torvalds 已提交
5658 5659
	}

5660 5661
	for (arp_ip_count = 0, i = 0;
	     (arp_ip_count < BOND_MAX_ARP_TARGETS) && arp_ip_target[i]; i++) {
5662
		__be32 ip;
5663 5664

		/* not a complete check, but good enough to catch mistakes */
5665
		if (!in4_pton(arp_ip_target[i], -1, (u8 *)&ip, -1, NULL) ||
5666
		    !bond_is_ip_target_ok(ip)) {
5667 5668
			pr_warn("Warning: bad arp_ip_target module parameter (%s), ARP monitoring will not be performed\n",
				arp_ip_target[i]);
L
Linus Torvalds 已提交
5669 5670
			arp_interval = 0;
		} else {
5671 5672 5673
			if (bond_get_targets_ip(arp_target, ip) == -1)
				arp_target[arp_ip_count++] = ip;
			else
5674 5675
				pr_warn("Warning: duplicate address %pI4 in arp_ip_target, skipping\n",
					&ip);
L
Linus Torvalds 已提交
5676 5677 5678 5679 5680
		}
	}

	if (arp_interval && !arp_ip_count) {
		/* don't allow arping if no arp_ip_target given... */
5681 5682
		pr_warn("Warning: arp_interval module parameter (%d) specified without providing an arp_ip_target parameter, arp_interval was reset to 0\n",
			arp_interval);
L
Linus Torvalds 已提交
5683 5684 5685
		arp_interval = 0;
	}

5686 5687
	if (arp_validate) {
		if (!arp_interval) {
J
Joe Perches 已提交
5688
			pr_err("arp_validate requires arp_interval\n");
5689 5690 5691
			return -EINVAL;
		}

5692 5693 5694 5695
		bond_opt_initstr(&newval, arp_validate);
		valptr = bond_opt_parse(bond_opt_get(BOND_OPT_ARP_VALIDATE),
					&newval);
		if (!valptr) {
J
Joe Perches 已提交
5696
			pr_err("Error: invalid arp_validate \"%s\"\n",
5697
			       arp_validate);
5698 5699
			return -EINVAL;
		}
5700 5701
		arp_validate_value = valptr->value;
	} else {
5702
		arp_validate_value = 0;
5703
	}
5704

5705
	if (arp_all_targets) {
5706 5707 5708 5709
		bond_opt_initstr(&newval, arp_all_targets);
		valptr = bond_opt_parse(bond_opt_get(BOND_OPT_ARP_ALL_TARGETS),
					&newval);
		if (!valptr) {
5710 5711 5712
			pr_err("Error: invalid arp_all_targets_value \"%s\"\n",
			       arp_all_targets);
			arp_all_targets_value = 0;
5713 5714
		} else {
			arp_all_targets_value = valptr->value;
5715 5716 5717
		}
	}

L
Linus Torvalds 已提交
5718
	if (miimon) {
J
Joe Perches 已提交
5719
		pr_info("MII link monitoring set to %d ms\n", miimon);
L
Linus Torvalds 已提交
5720
	} else if (arp_interval) {
5721 5722
		valptr = bond_opt_get_val(BOND_OPT_ARP_VALIDATE,
					  arp_validate_value);
J
Joe Perches 已提交
5723
		pr_info("ARP monitoring set to %d ms, validate %s, with %d target(s):",
5724
			arp_interval, valptr->string, arp_ip_count);
L
Linus Torvalds 已提交
5725 5726

		for (i = 0; i < arp_ip_count; i++)
J
Joe Perches 已提交
5727
			pr_cont(" %s", arp_ip_target[i]);
L
Linus Torvalds 已提交
5728

J
Joe Perches 已提交
5729
		pr_cont("\n");
L
Linus Torvalds 已提交
5730

5731
	} else if (max_bonds) {
L
Linus Torvalds 已提交
5732 5733 5734
		/* miimon and arp_interval not set, we need one so things
		 * work as expected, see bonding.txt for details
		 */
J
Joe Perches 已提交
5735
		pr_debug("Warning: either miimon or arp_interval and arp_ip_target module parameters must be specified, otherwise bonding will not detect link failures! see bonding.txt for details\n");
L
Linus Torvalds 已提交
5736 5737
	}

5738
	if (primary && !bond_mode_uses_primary(bond_mode)) {
L
Linus Torvalds 已提交
5739 5740 5741
		/* currently, using a primary only makes sense
		 * in active backup, TLB or ALB modes
		 */
5742 5743
		pr_warn("Warning: %s primary device specified but has no effect in %s mode\n",
			primary, bond_mode_name(bond_mode));
L
Linus Torvalds 已提交
5744 5745 5746
		primary = NULL;
	}

5747
	if (primary && primary_reselect) {
5748 5749 5750 5751
		bond_opt_initstr(&newval, primary_reselect);
		valptr = bond_opt_parse(bond_opt_get(BOND_OPT_PRIMARY_RESELECT),
					&newval);
		if (!valptr) {
J
Joe Perches 已提交
5752
			pr_err("Error: Invalid primary_reselect \"%s\"\n",
5753
			       primary_reselect);
5754 5755
			return -EINVAL;
		}
5756
		primary_reselect_value = valptr->value;
5757 5758 5759 5760
	} else {
		primary_reselect_value = BOND_PRI_RESELECT_ALWAYS;
	}

5761
	if (fail_over_mac) {
5762 5763 5764 5765
		bond_opt_initstr(&newval, fail_over_mac);
		valptr = bond_opt_parse(bond_opt_get(BOND_OPT_FAIL_OVER_MAC),
					&newval);
		if (!valptr) {
J
Joe Perches 已提交
5766
			pr_err("Error: invalid fail_over_mac \"%s\"\n",
5767
			       fail_over_mac);
5768 5769
			return -EINVAL;
		}
5770
		fail_over_mac_value = valptr->value;
5771
		if (bond_mode != BOND_MODE_ACTIVEBACKUP)
5772
			pr_warn("Warning: fail_over_mac only affects active-backup mode\n");
5773 5774 5775
	} else {
		fail_over_mac_value = BOND_FOM_NONE;
	}
5776

5777 5778 5779 5780 5781 5782 5783 5784 5785 5786
	bond_opt_initstr(&newval, "default");
	valptr = bond_opt_parse(
			bond_opt_get(BOND_OPT_AD_ACTOR_SYS_PRIO),
				     &newval);
	if (!valptr) {
		pr_err("Error: No ad_actor_sys_prio default value");
		return -EINVAL;
	}
	ad_actor_sys_prio = valptr->value;

5787 5788 5789 5790 5791 5792 5793 5794
	valptr = bond_opt_parse(bond_opt_get(BOND_OPT_AD_USER_PORT_KEY),
				&newval);
	if (!valptr) {
		pr_err("Error: No ad_user_port_key default value");
		return -EINVAL;
	}
	ad_user_port_key = valptr->value;

5795 5796 5797 5798 5799
	bond_opt_initstr(&newval, "default");
	valptr = bond_opt_parse(bond_opt_get(BOND_OPT_TLB_DYNAMIC_LB), &newval);
	if (!valptr) {
		pr_err("Error: No tlb_dynamic_lb default value");
		return -EINVAL;
5800
	}
5801
	tlb_dynamic_lb = valptr->value;
5802

5803
	if (lp_interval == 0) {
5804 5805
		pr_warn("Warning: ip_interval must be between 1 and %d, so it was reset to %d\n",
			INT_MAX, BOND_ALB_DEFAULT_LP_INTERVAL);
5806 5807 5808
		lp_interval = BOND_ALB_DEFAULT_LP_INTERVAL;
	}

L
Linus Torvalds 已提交
5809 5810
	/* fill params struct with the proper values */
	params->mode = bond_mode;
5811
	params->xmit_policy = xmit_hashtype;
L
Linus Torvalds 已提交
5812
	params->miimon = miimon;
5813
	params->num_peer_notif = num_peer_notif;
L
Linus Torvalds 已提交
5814
	params->arp_interval = arp_interval;
5815
	params->arp_validate = arp_validate_value;
5816
	params->arp_all_targets = arp_all_targets_value;
L
Linus Torvalds 已提交
5817 5818
	params->updelay = updelay;
	params->downdelay = downdelay;
5819
	params->peer_notif_delay = 0;
L
Linus Torvalds 已提交
5820
	params->use_carrier = use_carrier;
5821
	params->lacp_active = 1;
L
Linus Torvalds 已提交
5822 5823
	params->lacp_fast = lacp_fast;
	params->primary[0] = 0;
5824
	params->primary_reselect = primary_reselect_value;
5825
	params->fail_over_mac = fail_over_mac_value;
5826
	params->tx_queues = tx_queues;
5827
	params->all_slaves_active = all_slaves_active;
5828
	params->resend_igmp = resend_igmp;
5829
	params->min_links = min_links;
5830
	params->lp_interval = lp_interval;
5831
	params->packets_per_slave = packets_per_slave;
5832
	params->tlb_dynamic_lb = tlb_dynamic_lb;
5833
	params->ad_actor_sys_prio = ad_actor_sys_prio;
5834
	eth_zero_addr(params->ad_actor_system);
5835
	params->ad_user_port_key = ad_user_port_key;
5836 5837 5838 5839 5840 5841 5842 5843 5844 5845 5846
	if (packets_per_slave > 0) {
		params->reciprocal_packets_per_slave =
			reciprocal_value(packets_per_slave);
	} else {
		/* reciprocal_packets_per_slave is unused if
		 * packets_per_slave is 0 or 1, just initialize it
		 */
		params->reciprocal_packets_per_slave =
			(struct reciprocal_value) { 0 };
	}

5847 5848
	if (primary)
		strscpy_pad(params->primary, primary, sizeof(params->primary));
L
Linus Torvalds 已提交
5849 5850 5851 5852 5853 5854

	memcpy(params->arp_targets, arp_target, sizeof(arp_target));

	return 0;
}

5855
/* Called from registration process */
static int bond_init(struct net_device *bond_dev)
{
	struct bonding *bond = netdev_priv(bond_dev);
	struct bond_net *bn = net_generic(dev_net(bond_dev), bond_net_id);

	netdev_dbg(bond_dev, "Begin bond_init\n");

	/* Per-bond ordered workqueue for the monitor/notify work items. */
	bond->wq = alloc_ordered_workqueue(bond_dev->name, WQ_MEM_RECLAIM);
	if (!bond->wq)
		return -ENOMEM;

	/* Round-robin mode needs a per-CPU tx counter; unwind the workqueue
	 * allocation if it cannot be obtained.
	 */
	if (BOND_MODE(bond) == BOND_MODE_ROUNDROBIN) {
		bond->rr_tx_counter = alloc_percpu(u32);
		if (!bond->rr_tx_counter) {
			destroy_workqueue(bond->wq);
			bond->wq = NULL;
			return -ENOMEM;
		}
	}

	spin_lock_init(&bond->stats_lock);
	netdev_lockdep_set_classes(bond_dev);

	/* Track this bond in the per-netns device list. */
	list_add_tail(&bond->bond_list, &bn->dev_list);

	bond_prepare_sysfs_group(bond);

	bond_debug_register(bond);

	/* Ensure valid dev_addr */
	if (is_zero_ether_addr(bond_dev->dev_addr) &&
	    bond_dev->addr_assign_type == NET_ADDR_PERM)
		eth_hw_addr_random(bond_dev);

	return 0;
}

5893
/* Number of tx queues to allocate per bond device, as configured by the
 * tx_queues module parameter (validated in bond_check_params()).
 */
unsigned int bond_get_num_tx_queues(void)
{
	return tx_queues;
}

5898
/* Create a new bond based on the specified name and bonding parameters.
 * If name is NULL, obtain a suitable "bond%d" name for us.
 * Caller must NOT hold rtnl_lock; we need to release it here before we
 * set up our sysfs entries.
 */
int bond_create(struct net *net, const char *name)
{
	struct net_device *bond_dev;
	struct bonding *bond;
	struct alb_bond_info *bond_info;
	int res;

	rtnl_lock();

	bond_dev = alloc_netdev_mq(sizeof(struct bonding),
				   name ? name : "bond%d", NET_NAME_UNKNOWN,
				   bond_setup, tx_queues);
	if (!bond_dev) {
		pr_err("%s: eek! can't alloc netdev!\n", name);
		rtnl_unlock();
		return -ENOMEM;
	}

	/*
	 * Initialize rx_hashtbl_used_head to RLB_NULL_INDEX.
	 * It is set to 0 by default which is wrong.
	 */
	bond = netdev_priv(bond_dev);
	bond_info = &(BOND_ALB_INFO(bond));
	bond_info->rx_hashtbl_used_head = RLB_NULL_INDEX;

	dev_net_set(bond_dev, net);
	bond_dev->rtnl_link_ops = &bond_link_ops;

	/* register_netdevice() takes ownership on success; on failure the
	 * device must be freed here.
	 */
	res = register_netdevice(bond_dev);
	if (res < 0) {
		free_netdev(bond_dev);
		rtnl_unlock();

		return res;
	}

	/* No slaves yet, so no carrier until one is enslaved and up. */
	netif_carrier_off(bond_dev);

	bond_work_init_all(bond);

	rtnl_unlock();
	return 0;
}

5948
/* Per-network-namespace init: set up this namespace's bond device list
 * and its procfs/sysfs entries.  Always succeeds (returns 0).
 */
static int __net_init bond_net_init(struct net *net)
{
	struct bond_net *bn = net_generic(net, bond_net_id);

	bn->net = net;
	INIT_LIST_HEAD(&bn->dev_list);

	bond_create_proc_dir(bn);
	bond_create_sysfs(bn);

	return 0;
}

5961
/* Per-network-namespace teardown: remove sysfs first so no new bonds can
 * be created through it, then unregister all remaining bond devices in
 * one batch, and finally remove the proc directory.
 */
static void __net_exit bond_net_exit(struct net *net)
{
	struct bond_net *bn = net_generic(net, bond_net_id);
	struct bonding *bond, *tmp_bond;
	LIST_HEAD(list);

	bond_destroy_sysfs(bn);

	/* Kill off any bonds created after unregistering bond rtnl ops */
	rtnl_lock();
	list_for_each_entry_safe(bond, tmp_bond, &bn->dev_list, bond_list)
		unregister_netdevice_queue(bond->dev, &list);
	unregister_netdevice_many(&list);
	rtnl_unlock();

	bond_destroy_proc_dir(bn);
}

/* Pernet operations: one struct bond_net is allocated per namespace. */
static struct pernet_operations bond_net_ops = {
	.init = bond_net_init,
	.exit = bond_net_exit,
	.id   = &bond_net_id,
	.size = sizeof(struct bond_net),
};

L
Linus Torvalds 已提交
5986 5987 5988 5989 5990
/* Module entry point: validate parameters, register pernet/netlink hooks,
 * create the initial max_bonds devices, and install the netdev notifier.
 * Uses goto-based unwinding so each registration is undone on failure.
 */
static int __init bonding_init(void)
{
	int i;
	int res;

	res = bond_check_params(&bonding_defaults);
	if (res)
		goto out;

	res = register_pernet_subsys(&bond_net_ops);
	if (res)
		goto out;

	res = bond_netlink_init();
	if (res)
		goto err_link;

	bond_create_debugfs();

	/* Create the initial set of bond devices in the init namespace. */
	for (i = 0; i < max_bonds; i++) {
		res = bond_create(&init_net, NULL);
		if (res)
			goto err;
	}

	skb_flow_dissector_init(&flow_keys_bonding,
				flow_keys_bonding_keys,
				ARRAY_SIZE(flow_keys_bonding_keys));

	register_netdevice_notifier(&bond_netdev_notifier);
out:
	return res;
err:
	bond_destroy_debugfs();
	bond_netlink_fini();
err_link:
	unregister_pernet_subsys(&bond_net_ops);
	goto out;

}

/* Module exit: tear down in reverse order of bonding_init().  The pernet
 * exit callback unregisters any remaining bond devices.
 */
static void __exit bonding_exit(void)
{
	unregister_netdevice_notifier(&bond_netdev_notifier);

	bond_destroy_debugfs();

	bond_netlink_fini();
	unregister_pernet_subsys(&bond_net_ops);

#ifdef CONFIG_NET_POLL_CONTROLLER
	/* Make sure we don't have an imbalance on our netpoll blocking */
	WARN_ON(atomic_read(&netpoll_block_tx));
#endif
}

/* Module entry/exit points and metadata. */
module_init(bonding_init);
module_exit(bonding_exit);
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION(DRV_DESCRIPTION);
MODULE_AUTHOR("Thomas Davis, tadavis@lbl.gov and many others");