// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *      NET3    Protocol independent device support routines.
 *
 *	Derived from the non IP parts of dev.c 1.0.19
 *              Authors:	Ross Biro
 *				Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *				Mark Evans, <evansmp@uhura.aston.ac.uk>
 *
 *	Additional Authors:
 *		Florian la Roche <rzsfl@rz.uni-sb.de>
 *		Alan Cox <gw4pts@gw4pts.ampr.org>
 *		David Hinds <dahinds@users.sourceforge.net>
 *		Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
 *		Adam Sulmicki <adam@cfar.umd.edu>
 *              Pekka Riikonen <priikone@poesidon.pspt.fi>
 *
 *	Changes:
 *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
 *                                      to 2 if register_netdev gets called
 *                                      before net_dev_init & also removed a
 *                                      few lines of code in the process.
 *		Alan Cox	:	device private ioctl copies fields back.
 *		Alan Cox	:	Transmit queue code does relevant
 *					stunts to keep the queue safe.
 *		Alan Cox	:	Fixed double lock.
 *		Alan Cox	:	Fixed promisc NULL pointer trap
 *		????????	:	Support the full private ioctl range
 *		Alan Cox	:	Moved ioctl permission check into
 *					drivers
 *		Tim Kordas	:	SIOCADDMULTI/SIOCDELMULTI
 *		Alan Cox	:	100 backlog just doesn't cut it when
 *					you start doing multicast video 8)
 *		Alan Cox	:	Rewrote net_bh and list manager.
 *              Alan Cox        :       Fix ETH_P_ALL echoback lengths.
 *		Alan Cox	:	Took out transmit every packet pass
 *					Saved a few bytes in the ioctl handler
 *		Alan Cox	:	Network driver sets packet type before
 *					calling netif_rx. Saves a function
 *					call a packet.
 *		Alan Cox	:	Hashed net_bh()
 *		Richard Kooijman:	Timestamp fixes.
 *		Alan Cox	:	Wrong field in SIOCGIFDSTADDR
 *		Alan Cox	:	Device lock protection.
 *              Alan Cox        :       Fixed nasty side effect of device close
 *					changes.
 *		Rudi Cilibrasi	:	Pass the right thing to
 *					set_mac_address()
 *		Dave Miller	:	32bit quantity for the device lock to
 *					make it work out on a Sparc.
 *		Bjorn Ekwall	:	Added KERNELD hack.
 *		Alan Cox	:	Cleaned up the backlog initialise.
 *		Craig Metz	:	SIOCGIFCONF fix if space for under
 *					1 device.
 *	    Thomas Bogendoerfer :	Return ENODEV for dev_open, if there
 *					is no device open function.
 *		Andi Kleen	:	Fix error reporting for SIOCGIFCONF
 *	    Michael Chastain	:	Fix signed/unsigned for SIOCGIFCONF
 *		Cyrus Durgin	:	Cleaned for KMOD
 *		Adam Sulmicki   :	Bug Fix : Network Device Unload
 *					A network device unload needs to purge
 *					the backlog queue.
 *	Paul Rusty Russell	:	SIOCSIFNAME
 *              Pekka Riikonen  :	Netdev boot-time settings code
 *              Andrew Morton   :       Make unregister_netdevice wait
 *                                      indefinitely on dev->refcnt
 *              J Hadi Salim    :       - Backlog queue sampling
 *				        - netif_rx() feedback
 */

#include <linux/uaccess.h>
#include <linux/bitops.h>
#include <linux/capability.h>
#include <linux/cpu.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/hash.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/mutex.h>
#include <linux/rwsem.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/if_ether.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/ethtool.h>
#include <linux/skbuff.h>
#include <linux/bpf.h>
#include <linux/bpf_trace.h>
#include <net/net_namespace.h>
#include <net/sock.h>
#include <net/busy_poll.h>
#include <linux/rtnetlink.h>
#include <linux/stat.h>
#include <net/dst.h>
#include <net/dst_metadata.h>
#include <net/pkt_sched.h>
#include <net/pkt_cls.h>
#include <net/checksum.h>
#include <net/xfrm.h>
#include <linux/highmem.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/netpoll.h>
#include <linux/rcupdate.h>
#include <linux/delay.h>
#include <net/iw_handler.h>
#include <asm/current.h>
#include <linux/audit.h>
#include <linux/dmaengine.h>
#include <linux/err.h>
#include <linux/ctype.h>
#include <linux/if_arp.h>
#include <linux/if_vlan.h>
#include <linux/ip.h>
#include <net/ip.h>
#include <net/mpls.h>
#include <linux/ipv6.h>
#include <linux/in.h>
#include <linux/jhash.h>
#include <linux/random.h>
#include <trace/events/napi.h>
#include <trace/events/net.h>
#include <trace/events/skb.h>
#include <linux/inetdevice.h>
#include <linux/cpu_rmap.h>
#include <linux/static_key.h>
#include <linux/hashtable.h>
#include <linux/vmalloc.h>
#include <linux/if_macvlan.h>
#include <linux/errqueue.h>
#include <linux/hrtimer.h>
#include <linux/netfilter_ingress.h>
#include <linux/crash_dump.h>
#include <linux/sctp.h>
#include <net/udp_tunnel.h>
#include <linux/net_namespace.h>
#include <linux/indirect_call_wrapper.h>
#include <net/devlink.h>
#include <linux/pm_runtime.h>

#include "net-sysfs.h"

#define MAX_GRO_SKBS 8

/* This should be increased if a protocol with a bigger head is added. */
#define GRO_MAX_HEAD (MAX_HEADER + 128)

static DEFINE_SPINLOCK(ptype_lock);
static DEFINE_SPINLOCK(offload_lock);
struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
struct list_head ptype_all __read_mostly;	/* Taps */
static struct list_head offload_base __read_mostly;

static int netif_rx_internal(struct sk_buff *skb);
static int call_netdevice_notifiers_info(unsigned long val,
					 struct netdev_notifier_info *info);
static int call_netdevice_notifiers_extack(unsigned long val,
					   struct net_device *dev,
					   struct netlink_ext_ack *extack);
static struct napi_struct *napi_by_id(unsigned int napi_id);

/*
 * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 * semaphore.
 *
 * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
 *
 * Writers must hold the rtnl semaphore while they loop through the
 * dev_base_head list, and hold dev_base_lock for writing when they do the
 * actual updates.  This allows pure readers to access the list even
 * while a writer is preparing to update it.
 *
 * To put it another way, dev_base_lock is held for writing only to
 * protect against pure readers; the rtnl semaphore provides the
 * protection against other writers.
 *
 * See, for example usages, register_netdevice() and
 * unregister_netdevice(), which must be called with the rtnl
 * semaphore held.
 */
DEFINE_RWLOCK(dev_base_lock);
EXPORT_SYMBOL(dev_base_lock);

static DEFINE_MUTEX(ifalias_mutex);

/* protects napi_hash addition/deletion and napi_gen_id */
static DEFINE_SPINLOCK(napi_hash_lock);

static unsigned int napi_gen_id = NR_CPUS;
static DEFINE_READ_MOSTLY_HASHTABLE(napi_hash, 8);

static DECLARE_RWSEM(devnet_rename_sem);

static inline void dev_base_seq_inc(struct net *net)
{
	while (++net->dev_base_seq == 0)
		;
}

static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
{
	unsigned int hash = full_name_hash(net, name, strnlen(name, IFNAMSIZ));

	return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
}

static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
{
	return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
}

static inline void rps_lock(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	spin_lock(&sd->input_pkt_queue.lock);
#endif
}

static inline void rps_unlock(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	spin_unlock(&sd->input_pkt_queue.lock);
#endif
}

static struct netdev_name_node *netdev_name_node_alloc(struct net_device *dev,
						       const char *name)
{
	struct netdev_name_node *name_node;

	name_node = kmalloc(sizeof(*name_node), GFP_KERNEL);
	if (!name_node)
		return NULL;
	INIT_HLIST_NODE(&name_node->hlist);
	name_node->dev = dev;
	name_node->name = name;
	return name_node;
}

static struct netdev_name_node *
netdev_name_node_head_alloc(struct net_device *dev)
{
	struct netdev_name_node *name_node;

	name_node = netdev_name_node_alloc(dev, dev->name);
	if (!name_node)
		return NULL;
	INIT_LIST_HEAD(&name_node->list);
	return name_node;
}

static void netdev_name_node_free(struct netdev_name_node *name_node)
{
	kfree(name_node);
}

static void netdev_name_node_add(struct net *net,
				 struct netdev_name_node *name_node)
{
	hlist_add_head_rcu(&name_node->hlist,
			   dev_name_hash(net, name_node->name));
}

static void netdev_name_node_del(struct netdev_name_node *name_node)
{
	hlist_del_rcu(&name_node->hlist);
}

static struct netdev_name_node *netdev_name_node_lookup(struct net *net,
							const char *name)
{
	struct hlist_head *head = dev_name_hash(net, name);
	struct netdev_name_node *name_node;

	hlist_for_each_entry(name_node, head, hlist)
		if (!strcmp(name_node->name, name))
			return name_node;
	return NULL;
}

static struct netdev_name_node *netdev_name_node_lookup_rcu(struct net *net,
							    const char *name)
{
	struct hlist_head *head = dev_name_hash(net, name);
	struct netdev_name_node *name_node;

	hlist_for_each_entry_rcu(name_node, head, hlist)
		if (!strcmp(name_node->name, name))
			return name_node;
	return NULL;
}

int netdev_name_node_alt_create(struct net_device *dev, const char *name)
{
	struct netdev_name_node *name_node;
	struct net *net = dev_net(dev);

	name_node = netdev_name_node_lookup(net, name);
	if (name_node)
		return -EEXIST;
	name_node = netdev_name_node_alloc(dev, name);
	if (!name_node)
		return -ENOMEM;
	netdev_name_node_add(net, name_node);
	/* The node that holds dev->name acts as a head of per-device list. */
	list_add_tail(&name_node->list, &dev->name_node->list);

	return 0;
}
EXPORT_SYMBOL(netdev_name_node_alt_create);

static void __netdev_name_node_alt_destroy(struct netdev_name_node *name_node)
{
	list_del(&name_node->list);
	netdev_name_node_del(name_node);
	kfree(name_node->name);
	netdev_name_node_free(name_node);
}

int netdev_name_node_alt_destroy(struct net_device *dev, const char *name)
{
	struct netdev_name_node *name_node;
	struct net *net = dev_net(dev);

	name_node = netdev_name_node_lookup(net, name);
	if (!name_node)
		return -ENOENT;
	/* lookup might have found our primary name or a name belonging
	 * to another device.
	 */
	if (name_node == dev->name_node || name_node->dev != dev)
		return -EINVAL;

	__netdev_name_node_alt_destroy(name_node);

	return 0;
}
EXPORT_SYMBOL(netdev_name_node_alt_destroy);

static void netdev_name_node_alt_flush(struct net_device *dev)
{
	struct netdev_name_node *name_node, *tmp;

	list_for_each_entry_safe(name_node, tmp, &dev->name_node->list, list)
		__netdev_name_node_alt_destroy(name_node);
}

/* Device list insertion */
static void list_netdevice(struct net_device *dev)
{
	struct net *net = dev_net(dev);

	ASSERT_RTNL();

	write_lock_bh(&dev_base_lock);
	list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
	netdev_name_node_add(net, dev->name_node);
	hlist_add_head_rcu(&dev->index_hlist,
			   dev_index_hash(net, dev->ifindex));
	write_unlock_bh(&dev_base_lock);

	dev_base_seq_inc(net);
}

/* Device list removal
 * caller must respect an RCU grace period before freeing/reusing dev
 */
static void unlist_netdevice(struct net_device *dev)
{
	ASSERT_RTNL();

	/* Unlink dev from the device chain */
	write_lock_bh(&dev_base_lock);
	list_del_rcu(&dev->dev_list);
	netdev_name_node_del(dev->name_node);
	hlist_del_rcu(&dev->index_hlist);
	write_unlock_bh(&dev_base_lock);

	dev_base_seq_inc(dev_net(dev));
}

/*
 *	Our notifier list
 */

static RAW_NOTIFIER_HEAD(netdev_chain);

/*
 *	Device drivers call our routines to queue packets here. We empty the
 *	queue in the local softnet handler.
 */

DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
EXPORT_PER_CPU_SYMBOL(softnet_data);

#ifdef CONFIG_LOCKDEP
/*
 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
 * according to dev->type
 */
static const unsigned short netdev_lock_type[] = {
	 ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
	 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
	 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
	 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
	 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
	 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
	 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
	 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
	 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
	 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
	 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
	 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
	 ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
	 ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
	 ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};

static const char *const netdev_lock_name[] = {
	"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
	"_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
	"_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
	"_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
	"_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
	"_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
	"_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
	"_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
	"_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
	"_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
	"_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
	"_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
	"_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
	"_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
	"_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};

static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];

static inline unsigned short netdev_lock_pos(unsigned short dev_type)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
		if (netdev_lock_type[i] == dev_type)
			return i;
	/* the last key is used by default */
	return ARRAY_SIZE(netdev_lock_type) - 1;
}

static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
						 unsigned short dev_type)
{
	int i;

	i = netdev_lock_pos(dev_type);
	lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
				   netdev_lock_name[i]);
}

static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
{
	int i;

	i = netdev_lock_pos(dev->type);
	lockdep_set_class_and_name(&dev->addr_list_lock,
				   &netdev_addr_lock_key[i],
				   netdev_lock_name[i]);
}
#else
static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
						 unsigned short dev_type)
{
}

static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
{
}
#endif

/*******************************************************************************
 *
 *		Protocol management and registration routines
 *
 *******************************************************************************/


/*
 *	Add a protocol ID to the list. Now that the input handler is
 *	smarter we can dispense with all the messy stuff that used to be
 *	here.
 *
 *	BEWARE!!! Protocol handlers, mangling input packets,
 *	MUST BE last in hash buckets and checking protocol handlers
 *	MUST start from promiscuous ptype_all chain in net_bh.
 *	It is true now, do not change it.
 *	Explanation follows: if protocol handler, mangling packet, will
 *	be the first on list, it is not able to sense, that packet
 *	is cloned and should be copied-on-write, so that it will
 *	change it and subsequent readers will get broken packet.
 *							--ANK (980803)
 */

static inline struct list_head *ptype_head(const struct packet_type *pt)
{
	if (pt->type == htons(ETH_P_ALL))
		return pt->dev ? &pt->dev->ptype_all : &ptype_all;
	else
		return pt->dev ? &pt->dev->ptype_specific :
				 &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
}

/**
 *	dev_add_pack - add packet handler
 *	@pt: packet type declaration
 *
 *	Add a protocol handler to the networking stack. The passed &packet_type
 *	is linked into kernel lists and may not be freed until it has been
 *	removed from the kernel lists.
 *
 *	This call does not sleep therefore it cannot
 *	guarantee that all CPUs that are in the middle of receiving packets
 *	will see the new packet type (until the next received packet).
 */

void dev_add_pack(struct packet_type *pt)
{
	struct list_head *head = ptype_head(pt);

	spin_lock(&ptype_lock);
	list_add_rcu(&pt->list, head);
	spin_unlock(&ptype_lock);
}
EXPORT_SYMBOL(dev_add_pack);

/**
 *	__dev_remove_pack	 - remove packet handler
 *	@pt: packet type declaration
 *
 *	Remove a protocol handler that was previously added to the kernel
 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
 *	from the kernel lists and can be freed or reused once this function
 *	returns.
 *
 *      The packet type might still be in use by receivers
 *	and must not be freed until after all the CPUs have gone
 *	through a quiescent state.
 */
void __dev_remove_pack(struct packet_type *pt)
{
	struct list_head *head = ptype_head(pt);
	struct packet_type *pt1;

	spin_lock(&ptype_lock);

	list_for_each_entry(pt1, head, list) {
		if (pt == pt1) {
			list_del_rcu(&pt->list);
			goto out;
		}
	}

	pr_warn("dev_remove_pack: %p not found\n", pt);
out:
	spin_unlock(&ptype_lock);
}
EXPORT_SYMBOL(__dev_remove_pack);

/**
 *	dev_remove_pack	 - remove packet handler
 *	@pt: packet type declaration
 *
 *	Remove a protocol handler that was previously added to the kernel
 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
 *	from the kernel lists and can be freed or reused once this function
 *	returns.
 *
 *	This call sleeps to guarantee that no CPU is looking at the packet
 *	type after return.
 */
void dev_remove_pack(struct packet_type *pt)
{
	__dev_remove_pack(pt);

	synchronize_net();
}
EXPORT_SYMBOL(dev_remove_pack);
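
/*
 * Illustrative sketch (not part of the original file, not compiled): how a
 * module would typically pair dev_add_pack()/dev_remove_pack() to tap every
 * received frame.  The "my_tap" names below are hypothetical.
 */
#if 0
static int my_tap_rcv(struct sk_buff *skb, struct net_device *dev,
		      struct packet_type *pt, struct net_device *orig_dev)
{
	/* Taps see shared clones of every frame; drop our reference here. */
	kfree_skb(skb);
	return NET_RX_SUCCESS;
}

static struct packet_type my_tap __read_mostly = {
	.type = cpu_to_be16(ETH_P_ALL),	/* all protocols, i.e. a tap */
	.func = my_tap_rcv,
};

static int __init my_tap_init(void)
{
	dev_add_pack(&my_tap);		/* does not sleep */
	return 0;
}

static void __exit my_tap_exit(void)
{
	dev_remove_pack(&my_tap);	/* sleeps until no CPU can see &my_tap */
}
#endif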


/**
 *	dev_add_offload - register offload handlers
 *	@po: protocol offload declaration
 *
 *	Add protocol offload handlers to the networking stack. The passed
 *	&proto_offload is linked into kernel lists and may not be freed until
 *	it has been removed from the kernel lists.
 *
 *	This call does not sleep therefore it cannot
 *	guarantee that all CPUs that are in the middle of receiving packets
 *	will see the new offload handlers (until the next received packet).
 */
void dev_add_offload(struct packet_offload *po)
{
	struct packet_offload *elem;

	spin_lock(&offload_lock);
	list_for_each_entry(elem, &offload_base, list) {
		if (po->priority < elem->priority)
			break;
	}
	list_add_rcu(&po->list, elem->list.prev);
	spin_unlock(&offload_lock);
}
EXPORT_SYMBOL(dev_add_offload);

/**
 *	__dev_remove_offload	 - remove offload handler
 *	@po: packet offload declaration
 *
 *	Remove a protocol offload handler that was previously added to the
 *	kernel offload handlers by dev_add_offload(). The passed &offload_type
 *	is removed from the kernel lists and can be freed or reused once this
 *	function returns.
 *
 *      The packet type might still be in use by receivers
 *	and must not be freed until after all the CPUs have gone
 *	through a quiescent state.
 */
static void __dev_remove_offload(struct packet_offload *po)
{
	struct list_head *head = &offload_base;
	struct packet_offload *po1;

	spin_lock(&offload_lock);

	list_for_each_entry(po1, head, list) {
		if (po == po1) {
			list_del_rcu(&po->list);
			goto out;
		}
	}

	pr_warn("dev_remove_offload: %p not found\n", po);
out:
	spin_unlock(&offload_lock);
}

/**
 *	dev_remove_offload	 - remove packet offload handler
 *	@po: packet offload declaration
 *
 *	Remove a packet offload handler that was previously added to the kernel
 *	offload handlers by dev_add_offload(). The passed &offload_type is
 *	removed from the kernel lists and can be freed or reused once this
 *	function returns.
 *
 *	This call sleeps to guarantee that no CPU is looking at the packet
 *	type after return.
 */
void dev_remove_offload(struct packet_offload *po)
{
	__dev_remove_offload(po);

	synchronize_net();
}
EXPORT_SYMBOL(dev_remove_offload);

/******************************************************************************
 *
 *		      Device Boot-time Settings Routines
 *
 ******************************************************************************/

/* Boot time configuration table */
static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];

/**
 *	netdev_boot_setup_add	- add new setup entry
 *	@name: name of the device
 *	@map: configured settings for the device
 *
 *	Adds new setup entry to the dev_boot_setup list.  The function
 *	returns 0 on error and 1 on success.  This is a generic routine for
 *	all netdevices.
 */
static int netdev_boot_setup_add(char *name, struct ifmap *map)
{
	struct netdev_boot_setup *s;
	int i;

	s = dev_boot_setup;
	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
		if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
			memset(s[i].name, 0, sizeof(s[i].name));
			strlcpy(s[i].name, name, IFNAMSIZ);
			memcpy(&s[i].map, map, sizeof(s[i].map));
			break;
		}
	}

	return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
}

/**
 * netdev_boot_setup_check	- check boot time settings
 * @dev: the netdevice
 *
 * Check boot time settings for the device.
 * The found settings are set for the device to be used
 * later in the device probing.
 * Returns 0 if no settings found, 1 if they are.
 */
int netdev_boot_setup_check(struct net_device *dev)
{
	struct netdev_boot_setup *s = dev_boot_setup;
	int i;

	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
		if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
		    !strcmp(dev->name, s[i].name)) {
			dev->irq = s[i].map.irq;
			dev->base_addr = s[i].map.base_addr;
			dev->mem_start = s[i].map.mem_start;
			dev->mem_end = s[i].map.mem_end;
			return 1;
		}
	}
	return 0;
}
EXPORT_SYMBOL(netdev_boot_setup_check);


/**
 * netdev_boot_base	- get address from boot time settings
 * @prefix: prefix for network device
 * @unit: id for network device
 *
 * Check boot time settings for the base address of device.
 * The found settings are set for the device to be used
 * later in the device probing.
 * Returns 0 if no settings found.
 */
unsigned long netdev_boot_base(const char *prefix, int unit)
{
	const struct netdev_boot_setup *s = dev_boot_setup;
	char name[IFNAMSIZ];
	int i;

	sprintf(name, "%s%d", prefix, unit);

	/*
	 * If device already registered then return base of 1
	 * to indicate not to probe for this interface
	 */
	if (__dev_get_by_name(&init_net, name))
		return 1;

	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
		if (!strcmp(name, s[i].name))
			return s[i].map.base_addr;
	return 0;
}

/*
 * Saves at boot time configured settings for any netdevice.
 */
int __init netdev_boot_setup(char *str)
{
	int ints[5];
	struct ifmap map;

	str = get_options(str, ARRAY_SIZE(ints), ints);
	if (!str || !*str)
		return 0;

	/* Save settings */
	memset(&map, 0, sizeof(map));
	if (ints[0] > 0)
		map.irq = ints[1];
	if (ints[0] > 1)
		map.base_addr = ints[2];
	if (ints[0] > 2)
		map.mem_start = ints[3];
	if (ints[0] > 3)
		map.mem_end = ints[4];

	/* Add new entry to the list */
	return netdev_boot_setup_add(str, &map);
}

__setup("netdev=", netdev_boot_setup);

/*******************************************************************************
 *
 *			    Device Interface Subroutines
 *
 *******************************************************************************/

/**
 *	dev_get_iflink	- get 'iflink' value of an interface
 *	@dev: targeted interface
 *
 *	Indicates the ifindex the interface is linked to.
 *	Physical interfaces have the same 'ifindex' and 'iflink' values.
 */

int dev_get_iflink(const struct net_device *dev)
{
	if (dev->netdev_ops && dev->netdev_ops->ndo_get_iflink)
		return dev->netdev_ops->ndo_get_iflink(dev);

	return dev->ifindex;
}
EXPORT_SYMBOL(dev_get_iflink);

/**
 *	dev_fill_metadata_dst - Retrieve tunnel egress information.
 *	@dev: targeted interface
 *	@skb: The packet.
 *
 *	For better visibility of tunnel traffic OVS needs to retrieve
 *	egress tunnel information for a packet. The following API allows
 *	the user to get this info.
 */
int dev_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
{
	struct ip_tunnel_info *info;

	if (!dev->netdev_ops || !dev->netdev_ops->ndo_fill_metadata_dst)
		return -EINVAL;

	info = skb_tunnel_info_unclone(skb);
	if (!info)
		return -ENOMEM;
	if (unlikely(!(info->mode & IP_TUNNEL_INFO_TX)))
		return -EINVAL;

	return dev->netdev_ops->ndo_fill_metadata_dst(dev, skb);
}
EXPORT_SYMBOL_GPL(dev_fill_metadata_dst);

/**
 *	__dev_get_by_name	- find a device by its name
 *	@net: the applicable net namespace
 *	@name: name to find
 *
 *	Find an interface by name. Must be called under RTNL semaphore
 *	or @dev_base_lock. If the name is found a pointer to the device
 *	is returned. If the name is not found then %NULL is returned. The
 *	reference counters are not incremented so the caller must be
 *	careful with locks.
 */

struct net_device *__dev_get_by_name(struct net *net, const char *name)
{
	struct netdev_name_node *node_name;

	node_name = netdev_name_node_lookup(net, name);
	return node_name ? node_name->dev : NULL;
}
EXPORT_SYMBOL(__dev_get_by_name);

/**
 * dev_get_by_name_rcu	- find a device by its name
 * @net: the applicable net namespace
 * @name: name to find
 *
 * Find an interface by name.
 * If the name is found a pointer to the device is returned.
 * If the name is not found then %NULL is returned.
 * The reference counters are not incremented so the caller must be
 * careful with locks. The caller must hold RCU lock.
 */

struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
{
	struct netdev_name_node *node_name;

	node_name = netdev_name_node_lookup_rcu(net, name);
	return node_name ? node_name->dev : NULL;
}
EXPORT_SYMBOL(dev_get_by_name_rcu);

/**
 *	dev_get_by_name		- find a device by its name
 *	@net: the applicable net namespace
 *	@name: name to find
 *
 *	Find an interface by name. This can be called from any
 *	context and does its own locking. The returned handle has
 *	the usage count incremented and the caller must use dev_put() to
 *	release it when it is no longer needed. %NULL is returned if no
 *	matching device is found.
 */

struct net_device *dev_get_by_name(struct net *net, const char *name)
{
	struct net_device *dev;

	rcu_read_lock();
	dev = dev_get_by_name_rcu(net, name);
	if (dev)
		dev_hold(dev);
	rcu_read_unlock();
	return dev;
}
EXPORT_SYMBOL(dev_get_by_name);
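
/*
 * Illustrative sketch (not part of the original file, not compiled):
 * looking a device up by name.  The device name and the pr_info() output
 * are hypothetical.
 */
#if 0
static void my_lookup_example(void)
{
	struct net_device *dev;

	/* Sleeping context: takes a reference that must be released. */
	dev = dev_get_by_name(&init_net, "eth0");
	if (dev) {
		pr_info("%s has ifindex %d\n", dev->name, dev->ifindex);
		dev_put(dev);
	}

	/* Lockless fast path: no reference taken, RCU pins the pointer. */
	rcu_read_lock();
	dev = dev_get_by_name_rcu(&init_net, "eth0");
	if (dev)
		pr_info("%s mtu is %u\n", dev->name, dev->mtu);
	rcu_read_unlock();
}
#endif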

/**
 *	__dev_get_by_index - find a device by its ifindex
 *	@net: the applicable net namespace
 *	@ifindex: index of device
 *
 *	Search for an interface by index. Returns %NULL if the device
 *	is not found or a pointer to the device. The device has not
 *	had its reference counter increased so the caller must be careful
 *	about locking. The caller must hold either the RTNL semaphore
 *	or @dev_base_lock.
 */

struct net_device *__dev_get_by_index(struct net *net, int ifindex)
{
	struct net_device *dev;
	struct hlist_head *head = dev_index_hash(net, ifindex);

	hlist_for_each_entry(dev, head, index_hlist)
		if (dev->ifindex == ifindex)
			return dev;

	return NULL;
}
EXPORT_SYMBOL(__dev_get_by_index);

/**
 *	dev_get_by_index_rcu - find a device by its ifindex
 *	@net: the applicable net namespace
 *	@ifindex: index of device
 *
 *	Search for an interface by index. Returns %NULL if the device
 *	is not found or a pointer to the device. The device has not
 *	had its reference counter increased so the caller must be careful
 *	about locking. The caller must hold RCU lock.
 */

struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
{
	struct net_device *dev;
	struct hlist_head *head = dev_index_hash(net, ifindex);

	hlist_for_each_entry_rcu(dev, head, index_hlist)
		if (dev->ifindex == ifindex)
			return dev;

	return NULL;
}
EXPORT_SYMBOL(dev_get_by_index_rcu);


/**
 *	dev_get_by_index - find a device by its ifindex
 *	@net: the applicable net namespace
 *	@ifindex: index of device
 *
 *	Search for an interface by index. Returns NULL if the device
 *	is not found or a pointer to the device. The device returned has
 *	had a reference added and the pointer is safe until the user calls
 *	dev_put to indicate they have finished with it.
 */

struct net_device *dev_get_by_index(struct net *net, int ifindex)
{
	struct net_device *dev;

	rcu_read_lock();
	dev = dev_get_by_index_rcu(net, ifindex);
	if (dev)
		dev_hold(dev);
	rcu_read_unlock();
	return dev;
}
EXPORT_SYMBOL(dev_get_by_index);

/**
 *	dev_get_by_napi_id - find a device by napi_id
 *	@napi_id: ID of the NAPI struct
 *
 *	Search for an interface by NAPI ID. Returns %NULL if the device
 *	is not found or a pointer to the device. The device has not had
 *	its reference counter increased so the caller must be careful
 *	about locking. The caller must hold RCU lock.
 */

struct net_device *dev_get_by_napi_id(unsigned int napi_id)
{
	struct napi_struct *napi;

	WARN_ON_ONCE(!rcu_read_lock_held());

	if (napi_id < MIN_NAPI_ID)
		return NULL;

	napi = napi_by_id(napi_id);

	return napi ? napi->dev : NULL;
}
EXPORT_SYMBOL(dev_get_by_napi_id);

/**
 *	netdev_get_name - get a netdevice name, knowing its ifindex.
 *	@net: network namespace
 *	@name: a pointer to the buffer where the name will be stored.
 *	@ifindex: the ifindex of the interface to get the name from.
 */
int netdev_get_name(struct net *net, char *name, int ifindex)
{
	struct net_device *dev;
	int ret;

	down_read(&devnet_rename_sem);
	rcu_read_lock();

	dev = dev_get_by_index_rcu(net, ifindex);
	if (!dev) {
		ret = -ENODEV;
		goto out;
	}

	strcpy(name, dev->name);

	ret = 0;
out:
	rcu_read_unlock();
	up_read(&devnet_rename_sem);
	return ret;
}

/**
 *	dev_getbyhwaddr_rcu - find a device by its hardware address
 *	@net: the applicable net namespace
 *	@type: media type of device
 *	@ha: hardware address
 *
 *	Search for an interface by MAC address. Returns NULL if the device
 *	is not found or a pointer to the device.
 *	The caller must hold RCU or RTNL.
 *	The returned device has not had its ref count increased
 *	and the caller must therefore be careful about locking
 *
 */

struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
				       const char *ha)
{
	struct net_device *dev;

	for_each_netdev_rcu(net, dev)
		if (dev->type == type &&
		    !memcmp(dev->dev_addr, ha, dev->addr_len))
			return dev;

	return NULL;
}
EXPORT_SYMBOL(dev_getbyhwaddr_rcu);

struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
{
	struct net_device *dev;

	ASSERT_RTNL();
	for_each_netdev(net, dev)
		if (dev->type == type)
			return dev;

	return NULL;
}
EXPORT_SYMBOL(__dev_getfirstbyhwtype);

struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
{
	struct net_device *dev, *ret = NULL;

	rcu_read_lock();
	for_each_netdev_rcu(net, dev)
		if (dev->type == type) {
			dev_hold(dev);
			ret = dev;
			break;
		}
	rcu_read_unlock();
	return ret;
}
EXPORT_SYMBOL(dev_getfirstbyhwtype);

/**
 *	__dev_get_by_flags - find any device with given flags
 *	@net: the applicable net namespace
 *	@if_flags: IFF_* values
 *	@mask: bitmask of bits in if_flags to check
 *
 *	Search for any interface with the given flags. Returns NULL if a device
 *	is not found or a pointer to the device. Must be called inside
 *	rtnl_lock(), and result refcount is unchanged.
 */

struct net_device *__dev_get_by_flags(struct net *net, unsigned short if_flags,
				      unsigned short mask)
{
	struct net_device *dev, *ret;

	ASSERT_RTNL();

	ret = NULL;
	for_each_netdev(net, dev) {
		if (((dev->flags ^ if_flags) & mask) == 0) {
			ret = dev;
			break;
		}
	}
	return ret;
}
EXPORT_SYMBOL(__dev_get_by_flags);

/**
 *	dev_valid_name - check if name is okay for network device
 *	@name: name string
 *
 *	Network device names need to be valid file names
 *	to allow sysfs to work.  We also disallow any kind of
 *	whitespace.
 */
bool dev_valid_name(const char *name)
{
	if (*name == '\0')
		return false;
	if (strnlen(name, IFNAMSIZ) == IFNAMSIZ)
		return false;
	if (!strcmp(name, ".") || !strcmp(name, ".."))
		return false;

	while (*name) {
		if (*name == '/' || *name == ':' || isspace(*name))
			return false;
		name++;
	}
	return true;
}
EXPORT_SYMBOL(dev_valid_name);

/**
 *	__dev_alloc_name - allocate a name for a device
 *	@net: network namespace to allocate the device name in
 *	@name: name format string
 *	@buf:  scratch buffer and result name string
 *
 *	Passed a format string - eg "lt%d" it will try and find a suitable
 *	id. It scans list of devices to build up a free map, then chooses
 *	the first empty slot. The caller must hold the dev_base or rtnl lock
 *	while allocating the name and adding the device in order to avoid
 *	duplicates.
 *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 *	Returns the number of the unit assigned or a negative errno code.
 */

static int __dev_alloc_name(struct net *net, const char *name, char *buf)
{
	int i = 0;
	const char *p;
	const int max_netdevices = 8*PAGE_SIZE;
	unsigned long *inuse;
	struct net_device *d;

	if (!dev_valid_name(name))
		return -EINVAL;

	p = strchr(name, '%');
	if (p) {
		/*
		 * Verify the string as this thing may have come from
		 * the user.  There must be either one "%d" and no other "%"
		 * characters.
		 */
		if (p[1] != 'd' || strchr(p + 2, '%'))
			return -EINVAL;

		/* Use one page as a bit array of possible slots */
		inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
		if (!inuse)
			return -ENOMEM;

		for_each_netdev(net, d) {
			if (!sscanf(d->name, name, &i))
				continue;
			if (i < 0 || i >= max_netdevices)
				continue;

			/*  avoid cases where sscanf is not exact inverse of printf */
			snprintf(buf, IFNAMSIZ, name, i);
			if (!strncmp(buf, d->name, IFNAMSIZ))
				set_bit(i, inuse);
		}

		i = find_first_zero_bit(inuse, max_netdevices);
		free_page((unsigned long) inuse);
	}

	snprintf(buf, IFNAMSIZ, name, i);
	if (!__dev_get_by_name(net, buf))
		return i;

	/* It is possible to run out of possible slots
	 * when the name is long and there isn't enough space left
	 * for the digits, or if all bits are used.
	 */
	return -ENFILE;
}

static int dev_alloc_name_ns(struct net *net,
			     struct net_device *dev,
			     const char *name)
{
	char buf[IFNAMSIZ];
	int ret;

	BUG_ON(!net);
	ret = __dev_alloc_name(net, name, buf);
	if (ret >= 0)
		strlcpy(dev->name, buf, IFNAMSIZ);
	return ret;
}

/**
 *	dev_alloc_name - allocate a name for a device
 *	@dev: device
 *	@name: name format string
 *
 *	Passed a format string - eg "lt%d" it will try and find a suitable
 *	id. It scans list of devices to build up a free map, then chooses
 *	the first empty slot. The caller must hold the dev_base or rtnl lock
 *	while allocating the name and adding the device in order to avoid
 *	duplicates.
 *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 *	Returns the number of the unit assigned or a negative errno code.
 */

int dev_alloc_name(struct net_device *dev, const char *name)
{
	return dev_alloc_name_ns(dev_net(dev), dev, name);
}
EXPORT_SYMBOL(dev_alloc_name);
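
/*
 * Illustrative sketch (not part of the original file, not compiled): a
 * driver-style use of dev_alloc_name() with a "%d" format before
 * registration.  The "mydev%d" name is hypothetical.
 */
#if 0
static int my_register_example(struct net_device *dev)
{
	int err;

	rtnl_lock();
	err = dev_alloc_name(dev, "mydev%d");	/* picks the first free mydevN */
	if (err >= 0)
		err = register_netdevice(dev);
	rtnl_unlock();
	return err < 0 ? err : 0;
}
#endif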

static int dev_get_valid_name(struct net *net, struct net_device *dev,
			      const char *name)
{
	BUG_ON(!net);

	if (!dev_valid_name(name))
		return -EINVAL;

	if (strchr(name, '%'))
		return dev_alloc_name_ns(net, dev, name);
	else if (__dev_get_by_name(net, name))
		return -EEXIST;
	else if (dev->name != name)
		strlcpy(dev->name, name, IFNAMSIZ);

	return 0;
}

/**
 *	dev_change_name - change name of a device
 *	@dev: device
 *	@newname: name (or format string) must be at least IFNAMSIZ
 *
 *	Change name of a device, can pass format strings "eth%d"
 *	for wildcarding.
 */
int dev_change_name(struct net_device *dev, const char *newname)
{
	unsigned char old_assign_type;
	char oldname[IFNAMSIZ];
	int err = 0;
	int ret;
	struct net *net;

	ASSERT_RTNL();
	BUG_ON(!dev_net(dev));

	net = dev_net(dev);

	/* Some auto-enslaved devices e.g. failover slaves are
	 * special, as userspace might rename the device after
	 * the interface had been brought up and running since
	 * the point kernel initiated auto-enslavement. Allow
	 * live name change even when these slave devices are
	 * up and running.
	 *
	 * Typically, users of these auto-enslaving devices
	 * don't actually care about slave name change, as
	 * they are supposed to operate on master interface
	 * directly.
	 */
	if (dev->flags & IFF_UP &&
	    likely(!(dev->priv_flags & IFF_LIVE_RENAME_OK)))
		return -EBUSY;

	down_write(&devnet_rename_sem);

	if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
		up_write(&devnet_rename_sem);
		return 0;
	}

	memcpy(oldname, dev->name, IFNAMSIZ);

	err = dev_get_valid_name(net, dev, newname);
	if (err < 0) {
		up_write(&devnet_rename_sem);
		return err;
	}

	if (oldname[0] && !strchr(oldname, '%'))
		netdev_info(dev, "renamed from %s\n", oldname);

	old_assign_type = dev->name_assign_type;
	dev->name_assign_type = NET_NAME_RENAMED;

rollback:
	ret = device_rename(&dev->dev, dev->name);
	if (ret) {
		memcpy(dev->name, oldname, IFNAMSIZ);
		dev->name_assign_type = old_assign_type;
		up_write(&devnet_rename_sem);
		return ret;
	}

	up_write(&devnet_rename_sem);

	netdev_adjacent_rename_links(dev, oldname);

	write_lock_bh(&dev_base_lock);
	netdev_name_node_del(dev->name_node);
	write_unlock_bh(&dev_base_lock);

	synchronize_rcu();

	write_lock_bh(&dev_base_lock);
	netdev_name_node_add(net, dev->name_node);
	write_unlock_bh(&dev_base_lock);

	ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
	ret = notifier_to_errno(ret);

	if (ret) {
		/* err >= 0 after dev_alloc_name() or stores the first errno */
		if (err >= 0) {
			err = ret;
			down_write(&devnet_rename_sem);
			memcpy(dev->name, oldname, IFNAMSIZ);
			memcpy(oldname, newname, IFNAMSIZ);
			dev->name_assign_type = old_assign_type;
			old_assign_type = NET_NAME_RENAMED;
			goto rollback;
		} else {
			pr_err("%s: name change rollback failed: %d\n",
			       dev->name, ret);
		}
	}

	return err;
}

/**
 *	dev_set_alias - change ifalias of a device
 *	@dev: device
 *	@alias: name up to IFALIASZ
 *	@len: limit of bytes to copy from info
 *
 *	Set ifalias for a device.
 */
int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
{
	struct dev_ifalias *new_alias = NULL;

	if (len >= IFALIASZ)
		return -EINVAL;

	if (len) {
		new_alias = kmalloc(sizeof(*new_alias) + len + 1, GFP_KERNEL);
		if (!new_alias)
			return -ENOMEM;

		memcpy(new_alias->ifalias, alias, len);
		new_alias->ifalias[len] = 0;
	}

	mutex_lock(&ifalias_mutex);
	new_alias = rcu_replace_pointer(dev->ifalias, new_alias,
					mutex_is_locked(&ifalias_mutex));
	mutex_unlock(&ifalias_mutex);

	if (new_alias)
		kfree_rcu(new_alias, rcuhead);

	return len;
}
EXPORT_SYMBOL(dev_set_alias);

/**
 *	dev_get_alias - get ifalias of a device
 *	@dev: device
 *	@name: buffer to store name of ifalias
 *	@len: size of buffer
 *
 *	get ifalias for a device.  Caller must make sure dev cannot go
 *	away,  e.g. rcu read lock or own a reference count to device.
 */
int dev_get_alias(const struct net_device *dev, char *name, size_t len)
{
	const struct dev_ifalias *alias;
	int ret = 0;

	rcu_read_lock();
	alias = rcu_dereference(dev->ifalias);
	if (alias)
		ret = snprintf(name, len, "%s", alias->ifalias);
	rcu_read_unlock();

	return ret;
}
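
/*
 * Illustrative sketch (not part of the original file, not compiled):
 * setting and reading back an ifalias.  In practice this happens via the
 * rtnetlink/sysfs paths under RTNL; the strings here are hypothetical.
 */
#if 0
static void my_alias_example(struct net_device *dev)
{
	char buf[IFALIASZ];

	dev_set_alias(dev, "uplink-to-core", strlen("uplink-to-core"));
	if (dev_get_alias(dev, buf, sizeof(buf)) > 0)
		pr_info("%s alias: %s\n", dev->name, buf);
}
#endif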

/**
 *	netdev_features_change - device changes features
 *	@dev: device to cause notification
 *
 *	Called to indicate a device has changed features.
 */
void netdev_features_change(struct net_device *dev)
{
	call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
}
EXPORT_SYMBOL(netdev_features_change);

/**
 *	netdev_state_change - device changes state
 *	@dev: device to cause notification
 *
 *	Called to indicate a device has changed state. This function calls
 *	the notifier chains for netdev_chain and sends a NEWLINK message
 *	to the routing socket.
 */
void netdev_state_change(struct net_device *dev)
{
	if (dev->flags & IFF_UP) {
		struct netdev_notifier_change_info change_info = {
			.info.dev = dev,
		};

		call_netdevice_notifiers_info(NETDEV_CHANGE,
					      &change_info.info);
		rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL);
	}
}
EXPORT_SYMBOL(netdev_state_change);

/**
 * netdev_notify_peers - notify network peers about existence of @dev
 * @dev: network device
 *
 * Generate traffic such that interested network peers are aware of
 * @dev, such as by generating a gratuitous ARP. This may be used when
 * a device wants to inform the rest of the network about some sort of
 * reconfiguration such as a failover event or virtual machine
 * migration.
 */
void netdev_notify_peers(struct net_device *dev)
{
	rtnl_lock();
	call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
	call_netdevice_notifiers(NETDEV_RESEND_IGMP, dev);
	rtnl_unlock();
}
EXPORT_SYMBOL(netdev_notify_peers);

static int __dev_open(struct net_device *dev, struct netlink_ext_ack *extack)
{
	const struct net_device_ops *ops = dev->netdev_ops;
	int ret;

	ASSERT_RTNL();

	if (!netif_device_present(dev)) {
		/* may be detached because parent is runtime-suspended */
		if (dev->dev.parent)
			pm_runtime_resume(dev->dev.parent);
		if (!netif_device_present(dev))
			return -ENODEV;
	}

	/* Block netpoll from trying to do any rx path servicing.
	 * If we don't do this there is a chance ndo_poll_controller
	 * or ndo_poll may be running while we open the device
	 */
	netpoll_poll_disable(dev);

	ret = call_netdevice_notifiers_extack(NETDEV_PRE_UP, dev, extack);
	ret = notifier_to_errno(ret);
	if (ret)
		return ret;

	set_bit(__LINK_STATE_START, &dev->state);

	if (ops->ndo_validate_addr)
		ret = ops->ndo_validate_addr(dev);

	if (!ret && ops->ndo_open)
		ret = ops->ndo_open(dev);

	netpoll_poll_enable(dev);

	if (ret)
		clear_bit(__LINK_STATE_START, &dev->state);
	else {
		dev->flags |= IFF_UP;
		dev_set_rx_mode(dev);
		dev_activate(dev);
		add_device_randomness(dev->dev_addr, dev->addr_len);
	}

	return ret;
}

/**
 *	dev_open	- prepare an interface for use.
 *	@dev: device to open
 *	@extack: netlink extended ack
 *
 *	Takes a device from down to up state. The device's private open
 *	function is invoked and then the multicast lists are loaded. Finally
 *	the device is moved into the up state and a %NETDEV_UP message is
 *	sent to the netdev notifier chain.
 *
 *	Calling this function on an active interface is a nop. On a failure
 *	a negative errno code is returned.
 */
int dev_open(struct net_device *dev, struct netlink_ext_ack *extack)
{
	int ret;

	if (dev->flags & IFF_UP)
		return 0;

	ret = __dev_open(dev, extack);
	if (ret < 0)
		return ret;

	rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
	call_netdevice_notifiers(NETDEV_UP, dev);

	return ret;
}
EXPORT_SYMBOL(dev_open);

static void __dev_close_many(struct list_head *head)
{
	struct net_device *dev;

	ASSERT_RTNL();
	might_sleep();

	list_for_each_entry(dev, head, close_list) {
		/* Temporarily disable netpoll until the interface is down */
		netpoll_poll_disable(dev);

		call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);

		clear_bit(__LINK_STATE_START, &dev->state);

		/* Synchronize to scheduled poll. We cannot touch poll list, it
		 * can be even on different cpu. So just clear netif_running().
		 *
		 * dev->stop() will invoke napi_disable() on all of its
		 * napi_struct instances on this device.
		 */
		smp_mb__after_atomic(); /* Commit netif_running(). */
	}

	dev_deactivate_many(head);

	list_for_each_entry(dev, head, close_list) {
		const struct net_device_ops *ops = dev->netdev_ops;

		/*
		 *	Call the device specific close. This cannot fail.
		 *	Only if device is UP
		 *
		 *	We allow it to be called even after a DETACH hot-plug
		 *	event.
		 */
		if (ops->ndo_stop)
			ops->ndo_stop(dev);

		dev->flags &= ~IFF_UP;
		netpoll_poll_enable(dev);
	}
}

static void __dev_close(struct net_device *dev)
{
	LIST_HEAD(single);

	list_add(&dev->close_list, &single);
	__dev_close_many(&single);
	list_del(&single);
}

void dev_close_many(struct list_head *head, bool unlink)
{
	struct net_device *dev, *tmp;

	/* Remove the devices that don't need to be closed */
	list_for_each_entry_safe(dev, tmp, head, close_list)
		if (!(dev->flags & IFF_UP))
			list_del_init(&dev->close_list);

	__dev_close_many(head);

	list_for_each_entry_safe(dev, tmp, head, close_list) {
		rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
		call_netdevice_notifiers(NETDEV_DOWN, dev);
		if (unlink)
			list_del_init(&dev->close_list);
	}
}
EXPORT_SYMBOL(dev_close_many);

/**
 *	dev_close - shutdown an interface.
 *	@dev: device to shutdown
 *
 *	This function moves an active device into down state. A
 *	%NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
 *	is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
 *	chain.
 */
void dev_close(struct net_device *dev)
{
	if (dev->flags & IFF_UP) {
		LIST_HEAD(single);

		list_add(&dev->close_list, &single);
		dev_close_many(&single, true);
		list_del(&single);
	}
}
EXPORT_SYMBOL(dev_close);
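
/*
 * Illustrative sketch (not part of the original file, not compiled):
 * bringing an interface up and down from kernel code.  Both calls must be
 * made under RTNL; passing a NULL extack is common when no netlink request
 * is involved.  The "my_" name is hypothetical.
 */
#if 0
static int my_bounce_example(struct net_device *dev)
{
	int err;

	rtnl_lock();
	err = dev_open(dev, NULL);	/* nop if the device is already up */
	if (!err)
		dev_close(dev);		/* never fails */
	rtnl_unlock();
	return err;
}
#endif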
L
Linus Torvalds 已提交
1661 1662


1663 1664 1665 1666 1667 1668 1669 1670 1671 1672
/**
 *	dev_disable_lro - disable Large Receive Offload on a device
 *	@dev: device
 *
 *	Disable Large Receive Offload (LRO) on a net device.  Must be
 *	called under RTNL.  This is needed if received packets may be
 *	forwarded to another interface.
 */
void dev_disable_lro(struct net_device *dev)
{
	struct net_device *lower_dev;
	struct list_head *iter;

	dev->wanted_features &= ~NETIF_F_LRO;
	netdev_update_features(dev);

	if (unlikely(dev->features & NETIF_F_LRO))
		netdev_WARN(dev, "failed to disable LRO!\n");

	netdev_for_each_lower_dev(dev, lower_dev, iter)
		dev_disable_lro(lower_dev);
}
EXPORT_SYMBOL(dev_disable_lro);
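/* Illustrative sketch: a stacking driver that will forward frames received
 * on a lower device would typically disable LRO on it while holding RTNL,
 * e.g. from its enslave path ("lower_dev" is whatever device it stacks on):
 *
 *	ASSERT_RTNL();
 *	dev_disable_lro(lower_dev);
 */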

/**
 *	dev_disable_gro_hw - disable HW Generic Receive Offload on a device
 *	@dev: device
 *
 *	Disable HW Generic Receive Offload (GRO_HW) on a net device.  Must be
 *	called under RTNL.  This is needed if Generic XDP is installed on
 *	the device.
 */
static void dev_disable_gro_hw(struct net_device *dev)
{
	dev->wanted_features &= ~NETIF_F_GRO_HW;
	netdev_update_features(dev);

	if (unlikely(dev->features & NETIF_F_GRO_HW))
		netdev_WARN(dev, "failed to disable GRO_HW!\n");
}

const char *netdev_cmd_to_name(enum netdev_cmd cmd)
{
#define N(val) 						\
	case NETDEV_##val:				\
		return "NETDEV_" __stringify(val);
	switch (cmd) {
	N(UP) N(DOWN) N(REBOOT) N(CHANGE) N(REGISTER) N(UNREGISTER)
	N(CHANGEMTU) N(CHANGEADDR) N(GOING_DOWN) N(CHANGENAME) N(FEAT_CHANGE)
	N(BONDING_FAILOVER) N(PRE_UP) N(PRE_TYPE_CHANGE) N(POST_TYPE_CHANGE)
	N(POST_INIT) N(RELEASE) N(NOTIFY_PEERS) N(JOIN) N(CHANGEUPPER)
	N(RESEND_IGMP) N(PRECHANGEMTU) N(CHANGEINFODATA) N(BONDING_INFO)
	N(PRECHANGEUPPER) N(CHANGELOWERSTATE) N(UDP_TUNNEL_PUSH_INFO)
	N(UDP_TUNNEL_DROP_INFO) N(CHANGE_TX_QUEUE_LEN)
	N(CVLAN_FILTER_PUSH_INFO) N(CVLAN_FILTER_DROP_INFO)
	N(SVLAN_FILTER_PUSH_INFO) N(SVLAN_FILTER_DROP_INFO)
	N(PRE_CHANGEADDR)
	}
#undef N
	return "UNKNOWN_NETDEV_EVENT";
}
EXPORT_SYMBOL_GPL(netdev_cmd_to_name);
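/* Illustrative sketch: netdev_cmd_to_name() is handy for tracing inside a
 * notifier callback; my_notifier_cb() below is hypothetical:
 *
 *	static int my_notifier_cb(struct notifier_block *nb,
 *				  unsigned long event, void *ptr)
 *	{
 *		struct net_device *dev = netdev_notifier_info_to_dev(ptr);
 *
 *		pr_debug("%s: %s\n", dev->name, netdev_cmd_to_name(event));
 *		return NOTIFY_DONE;
 *	}
 */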

static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
				   struct net_device *dev)
{
	struct netdev_notifier_info info = {
		.dev = dev,
	};

	return nb->notifier_call(nb, val, &info);
}

static int call_netdevice_register_notifiers(struct notifier_block *nb,
					     struct net_device *dev)
{
	int err;

	err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
	err = notifier_to_errno(err);
	if (err)
		return err;

	if (!(dev->flags & IFF_UP))
		return 0;

	call_netdevice_notifier(nb, NETDEV_UP, dev);
	return 0;
}

static void call_netdevice_unregister_notifiers(struct notifier_block *nb,
						struct net_device *dev)
{
	if (dev->flags & IFF_UP) {
		call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
					dev);
		call_netdevice_notifier(nb, NETDEV_DOWN, dev);
	}
	call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
}

static int call_netdevice_register_net_notifiers(struct notifier_block *nb,
						 struct net *net)
{
	struct net_device *dev;
	int err;

	for_each_netdev(net, dev) {
		err = call_netdevice_register_notifiers(nb, dev);
		if (err)
			goto rollback;
	}
	return 0;

rollback:
	for_each_netdev_continue_reverse(net, dev)
		call_netdevice_unregister_notifiers(nb, dev);
	return err;
}

static void call_netdevice_unregister_net_notifiers(struct notifier_block *nb,
						    struct net *net)
{
	struct net_device *dev;

	for_each_netdev(net, dev)
		call_netdevice_unregister_notifiers(nb, dev);
}

static int dev_boot_phase = 1;

/**
 * register_netdevice_notifier - register a network notifier block
 * @nb: notifier
 *
 * Register a notifier to be called when network device events occur.
 * The notifier passed is linked into the kernel structures and must
 * not be reused until it has been unregistered. A negative errno code
 * is returned on a failure.
 *
 * When registered all registration and up events are replayed
 * to the new notifier to allow device to have a race free
 * view of the network device list.
 */

int register_netdevice_notifier(struct notifier_block *nb)
{
	struct net *net;
	int err;

	/* Close race with setup_net() and cleanup_net() */
	down_write(&pernet_ops_rwsem);
	rtnl_lock();
	err = raw_notifier_chain_register(&netdev_chain, nb);
	if (err)
		goto unlock;
	if (dev_boot_phase)
		goto unlock;
	for_each_net(net) {
		err = call_netdevice_register_net_notifiers(nb, net);
		if (err)
			goto rollback;
	}

unlock:
	rtnl_unlock();
	up_write(&pernet_ops_rwsem);
	return err;

rollback:
	for_each_net_continue_reverse(net)
		call_netdevice_unregister_net_notifiers(nb, net);

	raw_notifier_chain_unregister(&netdev_chain, nb);
	goto unlock;
}
EXPORT_SYMBOL(register_netdevice_notifier);
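/* Illustrative sketch: a typical user registers a notifier_block once at
 * module init and removes it on exit; my_netdev_event()/my_netdev_nb are
 * hypothetical names, not something defined in this file:
 *
 *	static int my_netdev_event(struct notifier_block *nb,
 *				   unsigned long event, void *ptr)
 *	{
 *		struct net_device *dev = netdev_notifier_info_to_dev(ptr);
 *
 *		if (event == NETDEV_UP)
 *			pr_info("%s is up\n", dev->name);
 *		return NOTIFY_DONE;
 *	}
 *
 *	static struct notifier_block my_netdev_nb = {
 *		.notifier_call = my_netdev_event,
 *	};
 *
 *	register_netdevice_notifier(&my_netdev_nb);
 *	...
 *	unregister_netdevice_notifier(&my_netdev_nb);
 */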

/**
 * unregister_netdevice_notifier - unregister a network notifier block
 * @nb: notifier
 *
 * Unregister a notifier previously registered by
 * register_netdevice_notifier(). The notifier is unlinked from the
 * kernel structures and may then be reused. A negative errno code
 * is returned on a failure.
 *
 * After unregistering, unregister and down device events are synthesized
 * for all devices on the device list to the removed notifier to remove
 * the need for special case cleanup code.
 */

int unregister_netdevice_notifier(struct notifier_block *nb)
{
	struct net *net;
	int err;

	/* Close race with setup_net() and cleanup_net() */
	down_write(&pernet_ops_rwsem);
	rtnl_lock();
	err = raw_notifier_chain_unregister(&netdev_chain, nb);
	if (err)
		goto unlock;

	for_each_net(net)
		call_netdevice_unregister_net_notifiers(nb, net);

unlock:
	rtnl_unlock();
	up_write(&pernet_ops_rwsem);
	return err;
}
EXPORT_SYMBOL(unregister_netdevice_notifier);

static int __register_netdevice_notifier_net(struct net *net,
					     struct notifier_block *nb,
					     bool ignore_call_fail)
{
	int err;

	err = raw_notifier_chain_register(&net->netdev_chain, nb);
	if (err)
		return err;
	if (dev_boot_phase)
		return 0;

	err = call_netdevice_register_net_notifiers(nb, net);
	if (err && !ignore_call_fail)
		goto chain_unregister;

	return 0;

chain_unregister:
	raw_notifier_chain_unregister(&net->netdev_chain, nb);
	return err;
}

static int __unregister_netdevice_notifier_net(struct net *net,
					       struct notifier_block *nb)
{
	int err;

	err = raw_notifier_chain_unregister(&net->netdev_chain, nb);
	if (err)
		return err;

	call_netdevice_unregister_net_notifiers(nb, net);
	return 0;
}

/**
 * register_netdevice_notifier_net - register a per-netns network notifier block
 * @net: network namespace
 * @nb: notifier
 *
 * Register a notifier to be called when network device events occur.
 * The notifier passed is linked into the kernel structures and must
 * not be reused until it has been unregistered. A negative errno code
 * is returned on a failure.
 *
 * When registered all registration and up events are replayed
 * to the new notifier to allow device to have a race free
 * view of the network device list.
 */

int register_netdevice_notifier_net(struct net *net, struct notifier_block *nb)
{
	int err;

	rtnl_lock();
	err = __register_netdevice_notifier_net(net, nb, false);
	rtnl_unlock();
	return err;
}
EXPORT_SYMBOL(register_netdevice_notifier_net);

/**
 * unregister_netdevice_notifier_net - unregister a per-netns
 *                                     network notifier block
 * @net: network namespace
 * @nb: notifier
 *
 * Unregister a notifier previously registered by
 * register_netdevice_notifier(). The notifier is unlinked into the
 * kernel structures and may then be reused. A negative errno code
 * is returned on a failure.
 *
 * After unregistering unregister and down device events are synthesized
 * for all devices on the device list to the removed notifier to remove
 * the need for special case cleanup code.
 */

int unregister_netdevice_notifier_net(struct net *net,
				      struct notifier_block *nb)
{
	int err;

	rtnl_lock();
	err = __unregister_netdevice_notifier_net(net, nb);
	rtnl_unlock();
	return err;
}
EXPORT_SYMBOL(unregister_netdevice_notifier_net);

int register_netdevice_notifier_dev_net(struct net_device *dev,
					struct notifier_block *nb,
					struct netdev_net_notifier *nn)
{
	int err;

	rtnl_lock();
	err = __register_netdevice_notifier_net(dev_net(dev), nb, false);
	if (!err) {
		nn->nb = nb;
		list_add(&nn->list, &dev->net_notifier_list);
	}
	rtnl_unlock();
	return err;
}
EXPORT_SYMBOL(register_netdevice_notifier_dev_net);

int unregister_netdevice_notifier_dev_net(struct net_device *dev,
					  struct notifier_block *nb,
					  struct netdev_net_notifier *nn)
{
	int err;

	rtnl_lock();
	list_del(&nn->list);
	err = __unregister_netdevice_notifier_net(dev_net(dev), nb);
	rtnl_unlock();
	return err;
}
EXPORT_SYMBOL(unregister_netdevice_notifier_dev_net);

static void move_netdevice_notifiers_dev_net(struct net_device *dev,
					     struct net *net)
{
	struct netdev_net_notifier *nn;

	list_for_each_entry(nn, &dev->net_notifier_list, list) {
		__unregister_netdevice_notifier_net(dev_net(dev), nn->nb);
		__register_netdevice_notifier_net(net, nn->nb, true);
	}
}

/**
 *	call_netdevice_notifiers_info - call all network notifier blocks
 *	@val: value passed unmodified to notifier function
 *	@info: notifier information data
 *
 *	Call all network notifier blocks.  Parameters and return value
 *	are as for raw_notifier_call_chain().
 */

static int call_netdevice_notifiers_info(unsigned long val,
					 struct netdev_notifier_info *info)
{
	struct net *net = dev_net(info->dev);
	int ret;

	ASSERT_RTNL();

	/* Run per-netns notifier block chain first, then run the global one.
	 * Hopefully, one day, the global one is going to be removed after
	 * all notifier block registrators get converted to be per-netns.
	 */
	ret = raw_notifier_call_chain(&net->netdev_chain, val, info);
	if (ret & NOTIFY_STOP_MASK)
		return ret;
	return raw_notifier_call_chain(&netdev_chain, val, info);
}

static int call_netdevice_notifiers_extack(unsigned long val,
					   struct net_device *dev,
					   struct netlink_ext_ack *extack)
{
	struct netdev_notifier_info info = {
		.dev = dev,
		.extack = extack,
	};

	return call_netdevice_notifiers_info(val, &info);
}

/**
 *	call_netdevice_notifiers - call all network notifier blocks
 *	@val: value passed unmodified to notifier function
 *	@dev: net_device pointer passed unmodified to notifier function
 *
 *	Call all network notifier blocks.  Parameters and return value
 *	are as for raw_notifier_call_chain().
 */

int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
{
	return call_netdevice_notifiers_extack(val, dev, NULL);
}
EXPORT_SYMBOL(call_netdevice_notifiers);
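/* Illustrative sketch: core code fires these events under RTNL after it has
 * changed device state, for example after updating dev->flags:
 *
 *	ASSERT_RTNL();
 *	call_netdevice_notifiers(NETDEV_CHANGE, dev);
 *
 * Notifiers may veto events such as NETDEV_PRE_UP by returning an error
 * wrapped with notifier_from_errno().
 */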

/**
 *	call_netdevice_notifiers_mtu - call all network notifier blocks
 *	@val: value passed unmodified to notifier function
 *	@dev: net_device pointer passed unmodified to notifier function
 *	@arg: additional u32 argument passed to the notifier function
 *
 *	Call all network notifier blocks.  Parameters and return value
 *	are as for raw_notifier_call_chain().
 */
static int call_netdevice_notifiers_mtu(unsigned long val,
					struct net_device *dev, u32 arg)
{
	struct netdev_notifier_info_ext info = {
		.info.dev = dev,
		.ext.mtu = arg,
	};

	BUILD_BUG_ON(offsetof(struct netdev_notifier_info_ext, info) != 0);

	return call_netdevice_notifiers_info(val, &info.info);
}

2085
#ifdef CONFIG_NET_INGRESS
2086
static DEFINE_STATIC_KEY_FALSE(ingress_needed_key);
2087 2088 2089

void net_inc_ingress_queue(void)
{
2090
	static_branch_inc(&ingress_needed_key);
2091 2092 2093 2094 2095
}
EXPORT_SYMBOL_GPL(net_inc_ingress_queue);

void net_dec_ingress_queue(void)
{
2096
	static_branch_dec(&ingress_needed_key);
2097 2098 2099 2100
}
EXPORT_SYMBOL_GPL(net_dec_ingress_queue);
#endif

D
Daniel Borkmann 已提交
2101
#ifdef CONFIG_NET_EGRESS
2102
static DEFINE_STATIC_KEY_FALSE(egress_needed_key);
D
Daniel Borkmann 已提交
2103 2104 2105

void net_inc_egress_queue(void)
{
2106
	static_branch_inc(&egress_needed_key);
D
Daniel Borkmann 已提交
2107 2108 2109 2110 2111
}
EXPORT_SYMBOL_GPL(net_inc_egress_queue);

void net_dec_egress_queue(void)
{
2112
	static_branch_dec(&egress_needed_key);
D
Daniel Borkmann 已提交
2113 2114 2115 2116
}
EXPORT_SYMBOL_GPL(net_dec_egress_queue);
#endif

2117
static DEFINE_STATIC_KEY_FALSE(netstamp_needed_key);
2118
#ifdef CONFIG_JUMP_LABEL
2119
static atomic_t netstamp_needed_deferred;
2120
static atomic_t netstamp_wanted;
2121
static void netstamp_clear(struct work_struct *work)
L
Linus Torvalds 已提交
2122
{
2123
	int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
2124
	int wanted;
2125

2126 2127
	wanted = atomic_add_return(deferred, &netstamp_wanted);
	if (wanted > 0)
2128
		static_branch_enable(&netstamp_needed_key);
2129
	else
2130
		static_branch_disable(&netstamp_needed_key);
2131 2132
}
static DECLARE_WORK(netstamp_work, netstamp_clear);
2133
#endif
2134 2135 2136

void net_enable_timestamp(void)
{
2137
#ifdef CONFIG_JUMP_LABEL
2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149
	int wanted;

	while (1) {
		wanted = atomic_read(&netstamp_wanted);
		if (wanted <= 0)
			break;
		if (atomic_cmpxchg(&netstamp_wanted, wanted, wanted + 1) == wanted)
			return;
	}
	atomic_inc(&netstamp_needed_deferred);
	schedule_work(&netstamp_work);
#else
2150
	static_branch_inc(&netstamp_needed_key);
2151
#endif
L
Linus Torvalds 已提交
2152
}
E
Eric Dumazet 已提交
2153
EXPORT_SYMBOL(net_enable_timestamp);
L
Linus Torvalds 已提交
2154 2155 2156

void net_disable_timestamp(void)
{
2157
#ifdef CONFIG_JUMP_LABEL
2158 2159 2160 2161 2162 2163 2164 2165 2166 2167
	int wanted;

	while (1) {
		wanted = atomic_read(&netstamp_wanted);
		if (wanted <= 1)
			break;
		if (atomic_cmpxchg(&netstamp_wanted, wanted, wanted - 1) == wanted)
			return;
	}
	atomic_dec(&netstamp_needed_deferred);
2168 2169
	schedule_work(&netstamp_work);
#else
2170
	static_branch_dec(&netstamp_needed_key);
2171
#endif
L
Linus Torvalds 已提交
2172
}
E
Eric Dumazet 已提交
2173
EXPORT_SYMBOL(net_disable_timestamp);
L
Linus Torvalds 已提交
2174

E
Eric Dumazet 已提交
2175
static inline void net_timestamp_set(struct sk_buff *skb)
L
Linus Torvalds 已提交
2176
{
T
Thomas Gleixner 已提交
2177
	skb->tstamp = 0;
2178
	if (static_branch_unlikely(&netstamp_needed_key))
2179
		__net_timestamp(skb);
L
Linus Torvalds 已提交
2180 2181
}

2182 2183 2184 2185 2186
#define net_timestamp_check(COND, SKB)				\
	if (static_branch_unlikely(&netstamp_needed_key)) {	\
		if ((COND) && !(SKB)->tstamp)			\
			__net_timestamp(SKB);			\
	}							\
E
Eric Dumazet 已提交
2187

2188
bool is_skb_forwardable(const struct net_device *dev, const struct sk_buff *skb)
2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206
{
	unsigned int len;

	if (!(dev->flags & IFF_UP))
		return false;

	len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
	if (skb->len <= len)
		return true;

	/* if TSO is enabled, we don't care about the length as the packet
	 * could be forwarded without being segmented before
	 */
	if (skb_is_gso(skb))
		return true;

	return false;
}
2207
EXPORT_SYMBOL_GPL(is_skb_forwardable);
2208

H
Herbert Xu 已提交
2209 2210
int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
{
2211
	int ret = ____dev_forward_skb(dev, skb);
H
Herbert Xu 已提交
2212

2213 2214 2215 2216
	if (likely(!ret)) {
		skb->protocol = eth_type_trans(skb, dev);
		skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
	}
H
Herbert Xu 已提交
2217

2218
	return ret;
H
Herbert Xu 已提交
2219 2220 2221
}
EXPORT_SYMBOL_GPL(__dev_forward_skb);

/**
 * dev_forward_skb - loopback an skb to another netif
 *
 * @dev: destination network device
 * @skb: buffer to forward
 *
 * return values:
 *	NET_RX_SUCCESS	(no congestion)
 *	NET_RX_DROP     (packet was dropped, but freed)
 *
 * dev_forward_skb can be used for injecting an skb from the
 * start_xmit function of one device into the receive queue
 * of another device.
 *
 * The receiving device may be in another namespace, so
 * we have to clear all information in the skb that could
 * impact namespace isolation.
 */
int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
{
	return __dev_forward_skb(dev, skb) ?: netif_rx_internal(skb);
}
EXPORT_SYMBOL_GPL(dev_forward_skb);
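/* Illustrative sketch: a pair-style virtual driver can hand a transmitted
 * skb straight to its peer from ndo_start_xmit(), roughly as veth does.
 * my_xmit() and my_get_peer() are hypothetical, standing in for however
 * the driver finds the other end:
 *
 *	static netdev_tx_t my_xmit(struct sk_buff *skb, struct net_device *dev)
 *	{
 *		struct net_device *peer = my_get_peer(dev);
 *
 *		if (dev_forward_skb(peer, skb) != NET_RX_SUCCESS)
 *			dev->stats.tx_dropped++;
 *		return NETDEV_TX_OK;
 *	}
 */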

2246 2247 2248 2249
static inline int deliver_skb(struct sk_buff *skb,
			      struct packet_type *pt_prev,
			      struct net_device *orig_dev)
{
W
Willem de Bruijn 已提交
2250
	if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC)))
2251
		return -ENOMEM;
2252
	refcount_inc(&skb->users);
2253 2254 2255
	return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
}

2256 2257
static inline void deliver_ptype_list_skb(struct sk_buff *skb,
					  struct packet_type **pt,
2258 2259
					  struct net_device *orig_dev,
					  __be16 type,
2260 2261 2262 2263 2264 2265 2266 2267
					  struct list_head *ptype_list)
{
	struct packet_type *ptype, *pt_prev = *pt;

	list_for_each_entry_rcu(ptype, ptype_list, list) {
		if (ptype->type != type)
			continue;
		if (pt_prev)
2268
			deliver_skb(skb, pt_prev, orig_dev);
2269 2270 2271 2272 2273
		pt_prev = ptype;
	}
	*pt = pt_prev;
}

2274 2275
static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
{
2276
	if (!ptype->af_packet_priv || !skb->sk)
2277 2278 2279 2280 2281 2282 2283 2284 2285 2286
		return false;

	if (ptype->id_match)
		return ptype->id_match(ptype, skb->sk);
	else if ((struct sock *)ptype->af_packet_priv == skb->sk)
		return true;

	return false;
}

2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297
/**
 * dev_nit_active - return true if any network interface taps are in use
 *
 * @dev: network device to check for the presence of taps
 */
bool dev_nit_active(struct net_device *dev)
{
	return !list_empty(&ptype_all) || !list_empty(&dev->ptype_all);
}
EXPORT_SYMBOL_GPL(dev_nit_active);

L
Linus Torvalds 已提交
2298 2299 2300 2301 2302
/*
 *	Support routine. Sends outgoing frames to any network
 *	taps currently in use.
 */

2303
void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
L
Linus Torvalds 已提交
2304 2305
{
	struct packet_type *ptype;
2306 2307
	struct sk_buff *skb2 = NULL;
	struct packet_type *pt_prev = NULL;
2308
	struct list_head *ptype_list = &ptype_all;
2309

L
Linus Torvalds 已提交
2310
	rcu_read_lock();
2311 2312
again:
	list_for_each_entry_rcu(ptype, ptype_list, list) {
2313 2314 2315
		if (ptype->ignore_outgoing)
			continue;

L
Linus Torvalds 已提交
2316 2317 2318
		/* Never send packets back to the socket
		 * they originated from - MvS (miquels@drinkel.ow.org)
		 */
2319 2320
		if (skb_loop_sk(ptype, skb))
			continue;
2321

2322 2323 2324 2325 2326
		if (pt_prev) {
			deliver_skb(skb2, pt_prev, skb->dev);
			pt_prev = ptype;
			continue;
		}
L
Linus Torvalds 已提交
2327

2328 2329 2330 2331
		/* need to clone skb, done only once */
		skb2 = skb_clone(skb, GFP_ATOMIC);
		if (!skb2)
			goto out_unlock;
2332

2333
		net_timestamp_set(skb2);
L
Linus Torvalds 已提交
2334

2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346
		/* skb->nh should be correctly
		 * set by sender, so that the second statement is
		 * just protection against buggy protocols.
		 */
		skb_reset_mac_header(skb2);

		if (skb_network_header(skb2) < skb2->data ||
		    skb_network_header(skb2) > skb_tail_pointer(skb2)) {
			net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
					     ntohs(skb2->protocol),
					     dev->name);
			skb_reset_network_header(skb2);
L
Linus Torvalds 已提交
2347
		}
2348 2349 2350 2351 2352 2353 2354 2355 2356

		skb2->transport_header = skb2->network_header;
		skb2->pkt_type = PACKET_OUTGOING;
		pt_prev = ptype;
	}

	if (ptype_list == &ptype_all) {
		ptype_list = &dev->ptype_all;
		goto again;
L
Linus Torvalds 已提交
2357
	}
2358
out_unlock:
2359 2360 2361 2362 2363 2364
	if (pt_prev) {
		if (!skb_orphan_frags_rx(skb2, GFP_ATOMIC))
			pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
		else
			kfree_skb(skb2);
	}
L
Linus Torvalds 已提交
2365 2366
	rcu_read_unlock();
}
2367
EXPORT_SYMBOL_GPL(dev_queue_xmit_nit);
L
Linus Torvalds 已提交
2368

2369 2370
/**
 * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381
 * @dev: Network device
 * @txq: number of queues available
 *
 * If real_num_tx_queues is changed the tc mappings may no longer be
 * valid. To resolve this verify the tc mapping remains valid and if
 * not NULL the mapping. With no priorities mapping to this
 * offset/count pair it will no longer be used. In the worst case TC0
 * is invalid nothing can be done so disable priority mappings. If is
 * expected that drivers will fix this mapping if they can before
 * calling netif_set_real_num_tx_queues.
 */
E
Eric Dumazet 已提交
2382
static void netif_setup_tc(struct net_device *dev, unsigned int txq)
2383 2384 2385 2386 2387 2388
{
	int i;
	struct netdev_tc_txq *tc = &dev->tc_to_txq[0];

	/* If TC0 is invalidated disable TC mapping */
	if (tc->offset + tc->count > txq) {
2389
		pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
2390 2391 2392 2393 2394 2395 2396 2397 2398 2399
		dev->num_tc = 0;
		return;
	}

	/* Invalidated prio to tc mappings set to TC0 */
	for (i = 1; i < TC_BITMASK + 1; i++) {
		int q = netdev_get_prio_tc_map(dev, i);

		tc = &dev->tc_to_txq[q];
		if (tc->offset + tc->count > txq) {
2400 2401
			pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
				i, q);
2402 2403 2404 2405 2406
			netdev_set_prio_tc_map(dev, i, 0);
		}
	}
}

2407 2408 2409 2410 2411 2412
int netdev_txq_to_tc(struct net_device *dev, unsigned int txq)
{
	if (dev->num_tc) {
		struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
		int i;

2413
		/* walk through the TCs and see if it falls into any of them */
2414 2415 2416 2417 2418
		for (i = 0; i < TC_MAX_QUEUE; i++, tc++) {
			if ((txq - tc->offset) < tc->count)
				return i;
		}

2419
		/* didn't find it, just return -1 to indicate no match */
2420 2421 2422 2423 2424
		return -1;
	}

	return 0;
}
2425
EXPORT_SYMBOL(netdev_txq_to_tc);
2426

2427
#ifdef CONFIG_XPS
2428 2429 2430 2431
struct static_key xps_needed __read_mostly;
EXPORT_SYMBOL(xps_needed);
struct static_key xps_rxqs_needed __read_mostly;
EXPORT_SYMBOL(xps_rxqs_needed);
2432 2433 2434 2435
static DEFINE_MUTEX(xps_map_mutex);
#define xmap_dereference(P)		\
	rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))

2436 2437
static bool remove_xps_queue(struct xps_dev_maps *dev_maps,
			     int tci, u16 index)
2438
{
2439 2440
	struct xps_map *map = NULL;
	int pos;
2441

2442
	if (dev_maps)
2443
		map = xmap_dereference(dev_maps->attr_map[tci]);
2444 2445
	if (!map)
		return false;
2446

2447 2448 2449 2450 2451 2452
	for (pos = map->len; pos--;) {
		if (map->queues[pos] != index)
			continue;

		if (map->len > 1) {
			map->queues[pos] = map->queues[--map->len];
2453
			break;
2454
		}
2455

2456
		RCU_INIT_POINTER(dev_maps->attr_map[tci], NULL);
2457 2458
		kfree_rcu(map, rcu);
		return false;
2459 2460
	}

2461
	return true;
2462 2463
}

2464 2465 2466 2467
static bool remove_xps_queue_cpu(struct net_device *dev,
				 struct xps_dev_maps *dev_maps,
				 int cpu, u16 offset, u16 count)
{
2468 2469 2470
	int num_tc = dev->num_tc ? : 1;
	bool active = false;
	int tci;
2471

2472 2473 2474 2475
	for (tci = cpu * num_tc; num_tc--; tci++) {
		int i, j;

		for (i = count, j = offset; i--; j++) {
2476
			if (!remove_xps_queue(dev_maps, tci, j))
2477 2478 2479 2480
				break;
		}

		active |= i < 0;
2481 2482
	}

2483
	return active;
2484 2485
}

2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499
static void reset_xps_maps(struct net_device *dev,
			   struct xps_dev_maps *dev_maps,
			   bool is_rxqs_map)
{
	if (is_rxqs_map) {
		static_key_slow_dec_cpuslocked(&xps_rxqs_needed);
		RCU_INIT_POINTER(dev->xps_rxqs_map, NULL);
	} else {
		RCU_INIT_POINTER(dev->xps_cpus_map, NULL);
	}
	static_key_slow_dec_cpuslocked(&xps_needed);
	kfree_rcu(dev_maps, rcu);
}

2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510
static void clean_xps_maps(struct net_device *dev, const unsigned long *mask,
			   struct xps_dev_maps *dev_maps, unsigned int nr_ids,
			   u16 offset, u16 count, bool is_rxqs_map)
{
	bool active = false;
	int i, j;

	for (j = -1; j = netif_attrmask_next(j, mask, nr_ids),
	     j < nr_ids;)
		active |= remove_xps_queue_cpu(dev, dev_maps, j, offset,
					       count);
2511 2512
	if (!active)
		reset_xps_maps(dev, dev_maps, is_rxqs_map);
2513

2514 2515 2516 2517 2518
	if (!is_rxqs_map) {
		for (i = offset + (count - 1); count--; i--) {
			netdev_queue_numa_node_write(
				netdev_get_tx_queue(dev, i),
				NUMA_NO_NODE);
2519 2520 2521 2522
		}
	}
}

2523 2524
static void netif_reset_xps_queues(struct net_device *dev, u16 offset,
				   u16 count)
2525
{
2526
	const unsigned long *possible_mask = NULL;
2527
	struct xps_dev_maps *dev_maps;
2528
	unsigned int nr_ids;
2529

2530 2531
	if (!static_key_false(&xps_needed))
		return;
2532

2533
	cpus_read_lock();
2534
	mutex_lock(&xps_map_mutex);
2535

2536 2537 2538 2539 2540 2541 2542
	if (static_key_false(&xps_rxqs_needed)) {
		dev_maps = xmap_dereference(dev->xps_rxqs_map);
		if (dev_maps) {
			nr_ids = dev->num_rx_queues;
			clean_xps_maps(dev, possible_mask, dev_maps, nr_ids,
				       offset, count, true);
		}
2543 2544
	}

2545 2546 2547 2548 2549 2550 2551 2552 2553
	dev_maps = xmap_dereference(dev->xps_cpus_map);
	if (!dev_maps)
		goto out_no_maps;

	if (num_possible_cpus() > 1)
		possible_mask = cpumask_bits(cpu_possible_mask);
	nr_ids = nr_cpu_ids;
	clean_xps_maps(dev, possible_mask, dev_maps, nr_ids, offset, count,
		       false);
2554

2555 2556
out_no_maps:
	mutex_unlock(&xps_map_mutex);
2557
	cpus_read_unlock();
2558 2559
}

2560 2561 2562 2563 2564
static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
{
	netif_reset_xps_queues(dev, index, dev->num_tx_queues - index);
}

2565 2566
static struct xps_map *expand_xps_map(struct xps_map *map, int attr_index,
				      u16 index, bool is_rxqs_map)
2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577
{
	struct xps_map *new_map;
	int alloc_len = XPS_MIN_MAP_ALLOC;
	int i, pos;

	for (pos = 0; map && pos < map->len; pos++) {
		if (map->queues[pos] != index)
			continue;
		return map;
	}

2578
	/* Need to add tx-queue to this CPU's/rx-queue's existing map */
2579 2580 2581 2582 2583 2584 2585
	if (map) {
		if (pos < map->alloc_len)
			return map;

		alloc_len = map->alloc_len * 2;
	}

2586 2587 2588 2589 2590 2591 2592 2593
	/* Need to allocate new map to store tx-queue on this CPU's/rx-queue's
	 *  map
	 */
	if (is_rxqs_map)
		new_map = kzalloc(XPS_MAP_SIZE(alloc_len), GFP_KERNEL);
	else
		new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
				       cpu_to_node(attr_index));
2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604
	if (!new_map)
		return NULL;

	for (i = 0; i < pos; i++)
		new_map->queues[i] = map->queues[i];
	new_map->alloc_len = alloc_len;
	new_map->len = pos;

	return new_map;
}

2605
/* Must be called under cpus_read_lock */
2606 2607
int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask,
			  u16 index, bool is_rxqs_map)
2608
{
2609
	const unsigned long *online_mask = NULL, *possible_mask = NULL;
2610
	struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
2611
	int i, j, tci, numa_node_id = -2;
2612
	int maps_sz, num_tc = 1, tc = 0;
2613
	struct xps_map *map, *new_map;
2614
	bool active = false;
2615
	unsigned int nr_ids;
2616

2617
	if (dev->num_tc) {
2618
		/* Do not allow XPS on subordinate device directly */
2619
		num_tc = dev->num_tc;
2620 2621 2622 2623 2624 2625
		if (num_tc < 0)
			return -EINVAL;

		/* If queue belongs to subordinate dev use its map */
		dev = netdev_get_tx_queue(dev, index)->sb_dev ? : dev;

2626 2627 2628 2629 2630
		tc = netdev_txq_to_tc(dev, index);
		if (tc < 0)
			return -EINVAL;
	}

2631
	mutex_lock(&xps_map_mutex);
2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644
	if (is_rxqs_map) {
		maps_sz = XPS_RXQ_DEV_MAPS_SIZE(num_tc, dev->num_rx_queues);
		dev_maps = xmap_dereference(dev->xps_rxqs_map);
		nr_ids = dev->num_rx_queues;
	} else {
		maps_sz = XPS_CPU_DEV_MAPS_SIZE(num_tc);
		if (num_possible_cpus() > 1) {
			online_mask = cpumask_bits(cpu_online_mask);
			possible_mask = cpumask_bits(cpu_possible_mask);
		}
		dev_maps = xmap_dereference(dev->xps_cpus_map);
		nr_ids = nr_cpu_ids;
	}
2645

2646 2647
	if (maps_sz < L1_CACHE_BYTES)
		maps_sz = L1_CACHE_BYTES;
2648

2649
	/* allocate memory for queue storage */
2650 2651
	for (j = -1; j = netif_attrmask_next_and(j, online_mask, mask, nr_ids),
	     j < nr_ids;) {
2652 2653
		if (!new_dev_maps)
			new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
2654 2655
		if (!new_dev_maps) {
			mutex_unlock(&xps_map_mutex);
2656
			return -ENOMEM;
2657
		}
2658

2659 2660
		tci = j * num_tc + tc;
		map = dev_maps ? xmap_dereference(dev_maps->attr_map[tci]) :
2661 2662
				 NULL;

2663
		map = expand_xps_map(map, j, index, is_rxqs_map);
2664 2665 2666
		if (!map)
			goto error;

2667
		RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
2668 2669 2670 2671 2672
	}

	if (!new_dev_maps)
		goto out_no_new_maps;

2673 2674 2675 2676 2677 2678
	if (!dev_maps) {
		/* Increment static keys at most once per type */
		static_key_slow_inc_cpuslocked(&xps_needed);
		if (is_rxqs_map)
			static_key_slow_inc_cpuslocked(&xps_rxqs_needed);
	}
2679

2680 2681
	for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids),
	     j < nr_ids;) {
2682
		/* copy maps belonging to foreign traffic classes */
2683
		for (i = tc, tci = j * num_tc; dev_maps && i--; tci++) {
2684
			/* fill in the new device map from the old device map */
2685 2686
			map = xmap_dereference(dev_maps->attr_map[tci]);
			RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
2687 2688 2689 2690 2691
		}

		/* We need to explicitly update tci as prevous loop
		 * could break out early if dev_maps is NULL.
		 */
2692
		tci = j * num_tc + tc;
2693

2694 2695 2696
		if (netif_attr_test_mask(j, mask, nr_ids) &&
		    netif_attr_test_online(j, online_mask, nr_ids)) {
			/* add tx-queue to CPU/rx-queue maps */
2697 2698
			int pos = 0;

2699
			map = xmap_dereference(new_dev_maps->attr_map[tci]);
2700 2701 2702 2703 2704
			while ((pos < map->len) && (map->queues[pos] != index))
				pos++;

			if (pos == map->len)
				map->queues[map->len++] = index;
2705
#ifdef CONFIG_NUMA
2706 2707 2708 2709 2710 2711
			if (!is_rxqs_map) {
				if (numa_node_id == -2)
					numa_node_id = cpu_to_node(j);
				else if (numa_node_id != cpu_to_node(j))
					numa_node_id = -1;
			}
2712
#endif
2713 2714
		} else if (dev_maps) {
			/* fill in the new device map from the old device map */
2715 2716
			map = xmap_dereference(dev_maps->attr_map[tci]);
			RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
2717
		}
2718

2719 2720 2721
		/* copy maps belonging to foreign traffic classes */
		for (i = num_tc - tc, tci++; dev_maps && --i; tci++) {
			/* fill in the new device map from the old device map */
2722 2723
			map = xmap_dereference(dev_maps->attr_map[tci]);
			RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
2724
		}
2725 2726
	}

2727 2728 2729 2730
	if (is_rxqs_map)
		rcu_assign_pointer(dev->xps_rxqs_map, new_dev_maps);
	else
		rcu_assign_pointer(dev->xps_cpus_map, new_dev_maps);
2731

2732
	/* Cleanup old maps */
2733 2734 2735
	if (!dev_maps)
		goto out_no_old_maps;

2736 2737 2738 2739 2740
	for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids),
	     j < nr_ids;) {
		for (i = num_tc, tci = j * num_tc; i--; tci++) {
			new_map = xmap_dereference(new_dev_maps->attr_map[tci]);
			map = xmap_dereference(dev_maps->attr_map[tci]);
2741 2742 2743
			if (map && map != new_map)
				kfree_rcu(map, rcu);
		}
2744 2745
	}

2746 2747 2748
	kfree_rcu(dev_maps, rcu);

out_no_old_maps:
2749 2750
	dev_maps = new_dev_maps;
	active = true;
2751

2752
out_no_new_maps:
2753 2754 2755 2756 2757 2758
	if (!is_rxqs_map) {
		/* update Tx queue numa node */
		netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
					     (numa_node_id >= 0) ?
					     numa_node_id : NUMA_NO_NODE);
	}
2759

2760 2761 2762
	if (!dev_maps)
		goto out_no_maps;

2763 2764 2765 2766
	/* removes tx-queue from unused CPUs/rx-queues */
	for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids),
	     j < nr_ids;) {
		for (i = tc, tci = j * num_tc; i--; tci++)
2767
			active |= remove_xps_queue(dev_maps, tci, index);
2768 2769
		if (!netif_attr_test_mask(j, mask, nr_ids) ||
		    !netif_attr_test_online(j, online_mask, nr_ids))
2770 2771 2772
			active |= remove_xps_queue(dev_maps, tci, index);
		for (i = num_tc - tc, tci++; --i; tci++)
			active |= remove_xps_queue(dev_maps, tci, index);
2773 2774 2775
	}

	/* free map if not active */
2776 2777
	if (!active)
		reset_xps_maps(dev, dev_maps, is_rxqs_map);
2778 2779

out_no_maps:
2780 2781 2782 2783
	mutex_unlock(&xps_map_mutex);

	return 0;
error:
2784
	/* remove any maps that we added */
2785 2786 2787 2788
	for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids),
	     j < nr_ids;) {
		for (i = num_tc, tci = j * num_tc; i--; tci++) {
			new_map = xmap_dereference(new_dev_maps->attr_map[tci]);
2789
			map = dev_maps ?
2790
			      xmap_dereference(dev_maps->attr_map[tci]) :
2791 2792 2793 2794
			      NULL;
			if (new_map && new_map != map)
				kfree(new_map);
		}
2795 2796
	}

2797 2798 2799 2800 2801
	mutex_unlock(&xps_map_mutex);

	kfree(new_dev_maps);
	return -ENOMEM;
}
2802
EXPORT_SYMBOL_GPL(__netif_set_xps_queue);

int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
			u16 index)
{
	int ret;

	cpus_read_lock();
	ret = __netif_set_xps_queue(dev, cpumask_bits(mask), index, false);
	cpus_read_unlock();

	return ret;
}
EXPORT_SYMBOL(netif_set_xps_queue);

#endif
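/* Illustrative sketch (only meaningful when CONFIG_XPS is enabled): a driver
 * that wants to steer transmit queue "qid" to a single CPU "cpu" could
 * program XPS like this; qid, cpu and the error handling are placeholders:
 *
 *	cpumask_var_t xps_mask;
 *
 *	if (zalloc_cpumask_var(&xps_mask, GFP_KERNEL)) {
 *		cpumask_set_cpu(cpu, xps_mask);
 *		netif_set_xps_queue(dev, xps_mask, qid);
 *		free_cpumask_var(xps_mask);
 *	}
 */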
static void netdev_unbind_all_sb_channels(struct net_device *dev)
{
	struct netdev_queue *txq = &dev->_tx[dev->num_tx_queues];

	/* Unbind any subordinate channels */
	while (txq-- != &dev->_tx[0]) {
		if (txq->sb_dev)
			netdev_unbind_sb_channel(dev, txq->sb_dev);
	}
}

2829 2830
void netdev_reset_tc(struct net_device *dev)
{
2831 2832 2833
#ifdef CONFIG_XPS
	netif_reset_xps_queues_gt(dev, 0);
#endif
2834 2835 2836
	netdev_unbind_all_sb_channels(dev);

	/* Reset TC configuration of device */
2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847
	dev->num_tc = 0;
	memset(dev->tc_to_txq, 0, sizeof(dev->tc_to_txq));
	memset(dev->prio_tc_map, 0, sizeof(dev->prio_tc_map));
}
EXPORT_SYMBOL(netdev_reset_tc);

int netdev_set_tc_queue(struct net_device *dev, u8 tc, u16 count, u16 offset)
{
	if (tc >= dev->num_tc)
		return -EINVAL;

2848 2849 2850
#ifdef CONFIG_XPS
	netif_reset_xps_queues(dev, offset, count);
#endif
2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861
	dev->tc_to_txq[tc].count = count;
	dev->tc_to_txq[tc].offset = offset;
	return 0;
}
EXPORT_SYMBOL(netdev_set_tc_queue);

int netdev_set_num_tc(struct net_device *dev, u8 num_tc)
{
	if (num_tc > TC_MAX_QUEUE)
		return -EINVAL;

#ifdef CONFIG_XPS
	netif_reset_xps_queues_gt(dev, 0);
#endif
	netdev_unbind_all_sb_channels(dev);

	dev->num_tc = num_tc;
	return 0;
}
EXPORT_SYMBOL(netdev_set_num_tc);
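/* Illustrative sketch: a driver exposing two traffic classes over eight
 * transmit queues would typically set up the mapping like this (the queue
 * counts are made up for the example):
 *
 *	netdev_set_num_tc(dev, 2);
 *	netdev_set_tc_queue(dev, 0, 4, 0);	(TC 0 -> queues 0..3)
 *	netdev_set_tc_queue(dev, 1, 4, 4);	(TC 1 -> queues 4..7)
 */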

void netdev_unbind_sb_channel(struct net_device *dev,
			      struct net_device *sb_dev)
{
	struct netdev_queue *txq = &dev->_tx[dev->num_tx_queues];

#ifdef CONFIG_XPS
	netif_reset_xps_queues_gt(sb_dev, 0);
#endif
	memset(sb_dev->tc_to_txq, 0, sizeof(sb_dev->tc_to_txq));
	memset(sb_dev->prio_tc_map, 0, sizeof(sb_dev->prio_tc_map));

	while (txq-- != &dev->_tx[0]) {
		if (txq->sb_dev == sb_dev)
			txq->sb_dev = NULL;
	}
}
EXPORT_SYMBOL(netdev_unbind_sb_channel);

int netdev_bind_sb_channel_queue(struct net_device *dev,
				 struct net_device *sb_dev,
				 u8 tc, u16 count, u16 offset)
{
	/* Make certain the sb_dev and dev are already configured */
	if (sb_dev->num_tc >= 0 || tc >= dev->num_tc)
		return -EINVAL;

	/* We cannot hand out queues we don't have */
	if ((offset + count) > dev->real_num_tx_queues)
		return -EINVAL;

	/* Record the mapping */
	sb_dev->tc_to_txq[tc].count = count;
	sb_dev->tc_to_txq[tc].offset = offset;

	/* Provide a way for Tx queue to find the tc_to_txq map or
	 * XPS map for itself.
	 */
	while (count--)
		netdev_get_tx_queue(dev, count + offset)->sb_dev = sb_dev;

	return 0;
}
EXPORT_SYMBOL(netdev_bind_sb_channel_queue);

int netdev_set_sb_channel(struct net_device *dev, u16 channel)
{
	/* Do not use a multiqueue device to represent a subordinate channel */
	if (netif_is_multiqueue(dev))
		return -ENODEV;

	/* We allow channels 1 - 32767 to be used for subordinate channels.
	 * Channel 0 is meant to be "native" mode and used only to represent
	 * the main root device. We allow writing 0 to reset the device back
	 * to normal mode after being used as a subordinate channel.
	 */
	if (channel > S16_MAX)
		return -EINVAL;

	dev->num_tc = -channel;

	return 0;
}
EXPORT_SYMBOL(netdev_set_sb_channel);

/*
 * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
 * greater than real_num_tx_queues stale skbs on the qdisc must be flushed.
 */
int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
{
	bool disabling;
	int rc;

	disabling = txq < dev->real_num_tx_queues;

	if (txq < 1 || txq > dev->num_tx_queues)
		return -EINVAL;

	if (dev->reg_state == NETREG_REGISTERED ||
	    dev->reg_state == NETREG_UNREGISTERING) {
		ASSERT_RTNL();

		rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
						  txq);
		if (rc)
			return rc;

		if (dev->num_tc)
			netif_setup_tc(dev, txq);

		dev->real_num_tx_queues = txq;

		if (disabling) {
			synchronize_net();
			qdisc_reset_all_tx_gt(dev, txq);
#ifdef CONFIG_XPS
			netif_reset_xps_queues_gt(dev, txq);
#endif
		}
	} else {
		dev->real_num_tx_queues = txq;
	}

	return 0;
}
EXPORT_SYMBOL(netif_set_real_num_tx_queues);
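/* Illustrative sketch: after (re)configuring its channels a driver shrinks
 * or grows the set of queues actually backed by hardware, under RTNL.
 * new_txq_count/new_rxq_count are whatever the driver negotiated:
 *
 *	err = netif_set_real_num_tx_queues(dev, new_txq_count);
 *	if (!err)
 *		err = netif_set_real_num_rx_queues(dev, new_rxq_count);
 */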

#ifdef CONFIG_SYSFS
2980 2981 2982 2983 2984 2985 2986
/**
 *	netif_set_real_num_rx_queues - set actual number of RX queues used
 *	@dev: Network device
 *	@rxq: Actual number of RX queues
 *
 *	This must be called either with the rtnl_lock held or before
 *	registration of the net device.  Returns 0 on success, or a
2987 2988
 *	negative error code.  If called before registration, it always
 *	succeeds.
2989 2990 2991 2992 2993
 */
int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
{
	int rc;

T
Tom Herbert 已提交
2994 2995 2996
	if (rxq < 1 || rxq > dev->num_rx_queues)
		return -EINVAL;

2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011
	if (dev->reg_state == NETREG_REGISTERED) {
		ASSERT_RTNL();

		rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
						  rxq);
		if (rc)
			return rc;
	}

	dev->real_num_rx_queues = rxq;
	return 0;
}
EXPORT_SYMBOL(netif_set_real_num_rx_queues);
#endif

3012 3013
/**
 * netif_get_num_default_rss_queues - default number of RSS queues
3014 3015 3016 3017
 *
 * This routine should set an upper limit on the number of RSS queues
 * used by default by multiqueue devices.
 */
3018
int netif_get_num_default_rss_queues(void)
3019
{
3020 3021
	return is_kdump_kernel() ?
		1 : min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
3022 3023 3024
}
EXPORT_SYMBOL(netif_get_num_default_rss_queues);

3025
static void __netif_reschedule(struct Qdisc *q)
3026
{
3027 3028
	struct softnet_data *sd;
	unsigned long flags;
3029

3030
	local_irq_save(flags);
3031
	sd = this_cpu_ptr(&softnet_data);
3032 3033 3034
	q->next_sched = NULL;
	*sd->output_queue_tailp = q;
	sd->output_queue_tailp = &q->next_sched;
3035 3036 3037 3038 3039 3040 3041 3042
	raise_softirq_irqoff(NET_TX_SOFTIRQ);
	local_irq_restore(flags);
}

void __netif_schedule(struct Qdisc *q)
{
	if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
		__netif_reschedule(q);
3043 3044 3045
}
EXPORT_SYMBOL(__netif_schedule);

3046 3047 3048 3049 3050
struct dev_kfree_skb_cb {
	enum skb_free_reason reason;
};

static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb)
3051
{
3052 3053 3054
	return (struct dev_kfree_skb_cb *)skb->cb;
}

3055 3056 3057
void netif_schedule_queue(struct netdev_queue *txq)
{
	rcu_read_lock();
3058
	if (!netif_xmit_stopped(txq)) {
3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079
		struct Qdisc *q = rcu_dereference(txq->qdisc);

		__netif_schedule(q);
	}
	rcu_read_unlock();
}
EXPORT_SYMBOL(netif_schedule_queue);

void netif_tx_wake_queue(struct netdev_queue *dev_queue)
{
	if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) {
		struct Qdisc *q;

		rcu_read_lock();
		q = rcu_dereference(dev_queue->qdisc);
		__netif_schedule(q);
		rcu_read_unlock();
	}
}
EXPORT_SYMBOL(netif_tx_wake_queue);

3080
void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason)
3081
{
3082
	unsigned long flags;
3083

3084 3085 3086
	if (unlikely(!skb))
		return;

3087
	if (likely(refcount_read(&skb->users) == 1)) {
3088
		smp_rmb();
3089 3090
		refcount_set(&skb->users, 0);
	} else if (likely(!refcount_dec_and_test(&skb->users))) {
3091
		return;
3092
	}
3093 3094 3095 3096 3097 3098
	get_kfree_skb_cb(skb)->reason = reason;
	local_irq_save(flags);
	skb->next = __this_cpu_read(softnet_data.completion_queue);
	__this_cpu_write(softnet_data.completion_queue, skb);
	raise_softirq_irqoff(NET_TX_SOFTIRQ);
	local_irq_restore(flags);
3099
}
3100
EXPORT_SYMBOL(__dev_kfree_skb_irq);
3101

3102
void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason)
3103 3104
{
	if (in_irq() || irqs_disabled())
3105
		__dev_kfree_skb_irq(skb, reason);
3106 3107 3108
	else
		dev_kfree_skb(skb);
}
3109
EXPORT_SYMBOL(__dev_kfree_skb_any);
3110 3111


/**
 * netif_device_detach - mark device as removed
 * @dev: network device
 *
 * Mark device as removed from the system and therefore no longer available.
 */
void netif_device_detach(struct net_device *dev)
{
	if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
	    netif_running(dev)) {
		netif_tx_stop_all_queues(dev);
	}
}
EXPORT_SYMBOL(netif_device_detach);

/**
 * netif_device_attach - mark device as attached
 * @dev: network device
 *
 * Mark device as attached to the system and restart if needed.
 */
void netif_device_attach(struct net_device *dev)
{
	if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
	    netif_running(dev)) {
		netif_tx_wake_all_queues(dev);
		__netdev_watchdog_up(dev);
	}
}
EXPORT_SYMBOL(netif_device_attach);
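/* Illustrative sketch: drivers commonly pair these in their power-management
 * hooks so the stack stops handing them packets while the hardware is away.
 * my_suspend()/my_resume() are hypothetical driver callbacks:
 *
 *	static int my_suspend(struct device *d)
 *	{
 *		struct net_device *dev = dev_get_drvdata(d);
 *
 *		netif_device_detach(dev);
 *		return 0;
 *	}
 *
 *	static int my_resume(struct device *d)
 *	{
 *		struct net_device *dev = dev_get_drvdata(d);
 *
 *		netif_device_attach(dev);
 *		return 0;
 *	}
 */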

/*
 * Returns a Tx hash based on the given packet descriptor a Tx queues' number
 * to be used as a distribution range.
 */
3147 3148 3149
static u16 skb_tx_hash(const struct net_device *dev,
		       const struct net_device *sb_dev,
		       struct sk_buff *skb)
J
Jiri Pirko 已提交
3150 3151 3152
{
	u32 hash;
	u16 qoffset = 0;
3153
	u16 qcount = dev->real_num_tx_queues;
J
Jiri Pirko 已提交
3154

3155 3156 3157 3158 3159 3160 3161
	if (dev->num_tc) {
		u8 tc = netdev_get_prio_tc_map(dev, skb->priority);

		qoffset = sb_dev->tc_to_txq[tc].offset;
		qcount = sb_dev->tc_to_txq[tc].count;
	}

J
Jiri Pirko 已提交
3162 3163
	if (skb_rx_queue_recorded(skb)) {
		hash = skb_get_rx_queue(skb);
3164 3165
		if (hash >= qoffset)
			hash -= qoffset;
3166 3167
		while (unlikely(hash >= qcount))
			hash -= qcount;
3168
		return hash + qoffset;
J
Jiri Pirko 已提交
3169 3170 3171 3172 3173
	}

	return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset;
}

3174 3175
static void skb_warn_bad_offload(const struct sk_buff *skb)
{
W
Wei Tang 已提交
3176
	static const netdev_features_t null_features;
3177
	struct net_device *dev = skb->dev;
3178
	const char *name = "";
3179

3180 3181 3182
	if (!net_ratelimit())
		return;

3183 3184 3185 3186 3187 3188
	if (dev) {
		if (dev->dev.parent)
			name = dev_driver_string(dev->dev.parent);
		else
			name = netdev_name(dev);
	}
3189 3190
	skb_dump(KERN_WARNING, skb, false);
	WARN(1, "%s: caps=(%pNF, %pNF)\n",
3191
	     name, dev ? &dev->features : &null_features,
3192
	     skb->sk ? &skb->sk->sk_route_caps : &null_features);
3193 3194
}

L
Linus Torvalds 已提交
3195 3196 3197 3198
/*
 * Invalidate hardware checksum when packet is to be mangled, and
 * complete checksum manually on outgoing path.
 */
3199
int skb_checksum_help(struct sk_buff *skb)
L
Linus Torvalds 已提交
3200
{
3201
	__wsum csum;
3202
	int ret = 0, offset;
L
Linus Torvalds 已提交
3203

3204
	if (skb->ip_summed == CHECKSUM_COMPLETE)
3205 3206 3207
		goto out_set_summed;

	if (unlikely(skb_shinfo(skb)->gso_size)) {
3208 3209
		skb_warn_bad_offload(skb);
		return -EINVAL;
L
Linus Torvalds 已提交
3210 3211
	}

3212 3213 3214 3215 3216 3217 3218 3219 3220
	/* Before computing a checksum, we should make sure no frag could
	 * be modified by an external entity : checksum could be wrong.
	 */
	if (skb_has_shared_frag(skb)) {
		ret = __skb_linearize(skb);
		if (ret)
			goto out;
	}

3221
	offset = skb_checksum_start_offset(skb);
3222 3223 3224 3225 3226 3227
	BUG_ON(offset >= skb_headlen(skb));
	csum = skb_checksum(skb, offset, skb->len - offset, 0);

	offset += skb->csum_offset;
	BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));

3228 3229 3230
	ret = skb_ensure_writable(skb, offset + sizeof(__sum16));
	if (ret)
		goto out;
L
Linus Torvalds 已提交
3231

3232
	*(__sum16 *)(skb->data + offset) = csum_fold(csum) ?: CSUM_MANGLED_0;
3233
out_set_summed:
L
Linus Torvalds 已提交
3234
	skb->ip_summed = CHECKSUM_NONE;
3235
out:
L
Linus Torvalds 已提交
3236 3237
	return ret;
}
EXPORT_SYMBOL(skb_checksum_help);
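/* Illustrative sketch: a driver whose hardware cannot checksum a particular
 * frame can fall back to software in its xmit path. my_hw_can_csum() and
 * the "drop" label are hypothetical:
 *
 *	if (skb->ip_summed == CHECKSUM_PARTIAL && !my_hw_can_csum(skb)) {
 *		if (skb_checksum_help(skb))
 *			goto drop;
 *	}
 */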

int skb_crc32c_csum_help(struct sk_buff *skb)
{
	__le32 crc32c_csum;
	int ret = 0, offset, start;

	if (skb->ip_summed != CHECKSUM_PARTIAL)
		goto out;

	if (unlikely(skb_is_gso(skb)))
		goto out;

	/* Before computing a checksum, we should make sure no frag could
	 * be modified by an external entity : checksum could be wrong.
	 */
	if (unlikely(skb_has_shared_frag(skb))) {
		ret = __skb_linearize(skb);
		if (ret)
			goto out;
	}
	start = skb_checksum_start_offset(skb);
	offset = start + offsetof(struct sctphdr, checksum);
	if (WARN_ON_ONCE(offset >= skb_headlen(skb))) {
		ret = -EINVAL;
		goto out;
	}
3265 3266 3267 3268 3269

	ret = skb_ensure_writable(skb, offset + sizeof(__le32));
	if (ret)
		goto out;

3270 3271 3272 3273 3274
	crc32c_csum = cpu_to_le32(~__skb_checksum(skb, start,
						  skb->len - start, ~(__u32)0,
						  crc32c_csum_stub));
	*(__le32 *)(skb->data + offset) = crc32c_csum;
	skb->ip_summed = CHECKSUM_NONE;
3275
	skb->csum_not_inet = 0;
3276 3277 3278 3279
out:
	return ret;
}

3280
__be16 skb_network_protocol(struct sk_buff *skb, int *depth)
3281
{
A
Al Viro 已提交
3282
	__be16 type = skb->protocol;
3283

3284 3285 3286 3287 3288 3289 3290
	/* Tunnel gso handlers can set protocol to ethernet. */
	if (type == htons(ETH_P_TEB)) {
		struct ethhdr *eth;

		if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
			return 0;

3291
		eth = (struct ethhdr *)skb->data;
3292 3293 3294
		type = eth->h_proto;
	}

3295
	return __vlan_get_protocol(skb, type, depth);
3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307
}

/**
 *	skb_mac_gso_segment - mac layer segmentation handler.
 *	@skb: buffer to segment
 *	@features: features for the output path (see dev->features)
 */
struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
				    netdev_features_t features)
{
	struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
	struct packet_offload *ptype;
3308 3309
	int vlan_depth = skb->mac_len;
	__be16 type = skb_network_protocol(skb, &vlan_depth);
3310 3311 3312 3313

	if (unlikely(!type))
		return ERR_PTR(-EINVAL);

3314
	__skb_pull(skb, vlan_depth);
3315 3316

	rcu_read_lock();
3317
	list_for_each_entry_rcu(ptype, &offload_base, list) {
3318 3319
		if (ptype->type == type && ptype->callbacks.gso_segment) {
			segs = ptype->callbacks.gso_segment(skb, features);
3320 3321 3322 3323 3324
			break;
		}
	}
	rcu_read_unlock();

3325
	__skb_push(skb, skb->data - skb_mac_header(skb));
3326

3327 3328
	return segs;
}
3329 3330 3331 3332 3333 3334 3335 3336
EXPORT_SYMBOL(skb_mac_gso_segment);


/* openvswitch calls this on rx path, so we need a different check.
 */
static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
{
	if (tx_path)
3337 3338
		return skb->ip_summed != CHECKSUM_PARTIAL &&
		       skb->ip_summed != CHECKSUM_UNNECESSARY;
3339 3340

	return skb->ip_summed == CHECKSUM_NONE;
3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352
}

/**
 *	__skb_gso_segment - Perform segmentation on skb.
 *	@skb: buffer to segment
 *	@features: features for the output path (see dev->features)
 *	@tx_path: whether it is called in TX path
 *
 *	This function segments the given skb and returns a list of segments.
 *
 *	It may return NULL if the skb requires no segmentation.  This is
 *	only possible when GSO is used for verifying header integrity.
3353
 *
C
Cambda Zhu 已提交
3354
 *	Segmentation preserves SKB_GSO_CB_OFFSET bytes of previous skb cb.
3355 3356 3357 3358
 */
struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
				  netdev_features_t features, bool tx_path)
{
3359 3360
	struct sk_buff *segs;

3361 3362 3363
	if (unlikely(skb_needs_check(skb, tx_path))) {
		int err;

3364
		/* We're going to init ->check field in TCP or UDP header */
3365 3366
		err = skb_cow_head(skb, 0);
		if (err < 0)
3367 3368 3369
			return ERR_PTR(err);
	}

3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382
	/* Only report GSO partial support if it will enable us to
	 * support segmentation on this frame without needing additional
	 * work.
	 */
	if (features & NETIF_F_GSO_PARTIAL) {
		netdev_features_t partial_features = NETIF_F_GSO_ROBUST;
		struct net_device *dev = skb->dev;

		partial_features |= dev->features & dev->gso_partial_features;
		if (!skb_gso_ok(skb, features | partial_features))
			features &= ~NETIF_F_GSO_PARTIAL;
	}

C
Cambda Zhu 已提交
3383
	BUILD_BUG_ON(SKB_GSO_CB_OFFSET +
3384 3385
		     sizeof(*SKB_GSO_CB(skb)) > sizeof(skb->cb));

3386
	SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
3387 3388
	SKB_GSO_CB(skb)->encap_level = 0;

3389 3390 3391
	skb_reset_mac_header(skb);
	skb_reset_mac_len(skb);

3392 3393
	segs = skb_mac_gso_segment(skb, features);

3394
	if (segs != skb && unlikely(skb_needs_check(skb, tx_path) && !IS_ERR(segs)))
3395 3396 3397
		skb_warn_bad_offload(skb);

	return segs;
3398
}
3399
EXPORT_SYMBOL(__skb_gso_segment);
3400

3401 3402
/* Take action when hardware reception checksum errors are detected. */
#ifdef CONFIG_BUG
3403
void netdev_rx_csum_fault(struct net_device *dev, struct sk_buff *skb)
3404 3405
{
	if (net_ratelimit()) {
3406
		pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
3407
		skb_dump(KERN_ERR, skb, true);
3408 3409 3410 3411 3412 3413
		dump_stack();
	}
}
EXPORT_SYMBOL(netdev_rx_csum_fault);
#endif

3414
/* XXX: check that highmem exists at all on the given machine. */
3415
static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
L
Linus Torvalds 已提交
3416
{
3417
#ifdef CONFIG_HIGHMEM
L
Linus Torvalds 已提交
3418
	int i;
3419

3420
	if (!(dev->features & NETIF_F_HIGHDMA)) {
3421 3422
		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
			skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
3423

3424
			if (PageHighMem(skb_frag_page(frag)))
3425
				return 1;
3426
		}
3427
	}
3428
#endif
L
Linus Torvalds 已提交
3429 3430 3431
	return 0;
}

3432 3433 3434
/* If MPLS offload request, verify we are testing hardware MPLS features
 * instead of standard features for the netdev.
 */
P
Pravin B Shelar 已提交
3435
#if IS_ENABLED(CONFIG_NET_MPLS_GSO)
3436 3437 3438 3439
static netdev_features_t net_mpls_features(struct sk_buff *skb,
					   netdev_features_t features,
					   __be16 type)
{
3440
	if (eth_p_mpls(type))
3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453
		features &= skb->dev->mpls_features;

	return features;
}
#else
static netdev_features_t net_mpls_features(struct sk_buff *skb,
					   netdev_features_t features,
					   __be16 type)
{
	return features;
}
#endif

3454
static netdev_features_t harmonize_features(struct sk_buff *skb,
3455
	netdev_features_t features)
3456
{
3457 3458
	__be16 type;

3459
	type = skb_network_protocol(skb, NULL);
3460
	features = net_mpls_features(skb, features, type);
3461

3462
	if (skb->ip_summed != CHECKSUM_NONE &&
3463
	    !can_checksum_protocol(features, type)) {
3464
		features &= ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK);
3465
	}
3466 3467
	if (illegal_highdma(skb->dev, skb))
		features &= ~NETIF_F_SG;
3468 3469 3470 3471

	return features;
}

3472 3473 3474 3475 3476 3477 3478 3479
netdev_features_t passthru_features_check(struct sk_buff *skb,
					  struct net_device *dev,
					  netdev_features_t features)
{
	return features;
}
EXPORT_SYMBOL(passthru_features_check);

3480
static netdev_features_t dflt_features_check(struct sk_buff *skb,
3481 3482 3483 3484 3485 3486
					     struct net_device *dev,
					     netdev_features_t features)
{
	return vlan_features_check(skb, features);
}

3487 3488 3489 3490 3491 3492 3493 3494 3495
static netdev_features_t gso_features_check(const struct sk_buff *skb,
					    struct net_device *dev,
					    netdev_features_t features)
{
	u16 gso_segs = skb_shinfo(skb)->gso_segs;

	if (gso_segs > dev->gso_max_segs)
		return features & ~NETIF_F_GSO_MASK;

3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506
	/* Support for GSO partial features requires software
	 * intervention before we can actually process the packets
	 * so we need to strip support for any partial features now
	 * and we can pull them back in after we have partially
	 * segmented the frame.
	 */
	if (!(skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL))
		features &= ~dev->gso_partial_features;

	/* Make sure to clear the IPv4 ID mangling feature if the
	 * IPv4 header has the potential to be fragmented.
	 */
	if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) {
		struct iphdr *iph = skb->encapsulation ?
				    inner_ip_hdr(skb) : ip_hdr(skb);

		if (!(iph->frag_off & htons(IP_DF)))
			features &= ~NETIF_F_TSO_MANGLEID;
	}

	return features;
}

netdev_features_t netif_skb_features(struct sk_buff *skb)
{
	struct net_device *dev = skb->dev;
	netdev_features_t features = dev->features;

	if (skb_is_gso(skb))
		features = gso_features_check(skb, dev, features);

	/* If encapsulation offload request, verify we are testing
	 * hardware encapsulation features instead of standard
	 * features for the netdev
	 */
	if (skb->encapsulation)
		features &= dev->hw_enc_features;

	if (skb_vlan_tagged(skb))
		features = netdev_intersect_features(features,
						     dev->vlan_features |
						     NETIF_F_HW_VLAN_CTAG_TX |
						     NETIF_F_HW_VLAN_STAG_TX);

	if (dev->netdev_ops->ndo_features_check)
		features &= dev->netdev_ops->ndo_features_check(skb, dev,
								features);
	else
		features &= dflt_features_check(skb, dev, features);

	return harmonize_features(skb, features);
}
EXPORT_SYMBOL(netif_skb_features);

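/*
 * Editorial note (not from the original source): netif_skb_features() above
 * narrows dev->features down to what may be used for this particular skb --
 * GSO segment limits and partial-GSO features, encapsulation and VLAN
 * constraints, an optional ndo_features_check() callback, and finally
 * harmonize_features(), which drops checksum/GSO/scatter-gather bits the
 * device cannot apply to this protocol or to highmem fragments.
 */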
static int xmit_one(struct sk_buff *skb, struct net_device *dev,
		    struct netdev_queue *txq, bool more)
{
	unsigned int len;
	int rc;

	if (dev_nit_active(dev))
		dev_queue_xmit_nit(skb, dev);

	len = skb->len;
	trace_net_dev_start_xmit(skb, dev);
	rc = netdev_start_xmit(skb, dev, txq, more);
	trace_net_dev_xmit(skb, rc, dev, len);

	return rc;
}

struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *dev,
				    struct netdev_queue *txq, int *ret)
{
	struct sk_buff *skb = first;
	int rc = NETDEV_TX_OK;

	while (skb) {
		struct sk_buff *next = skb->next;

		skb_mark_not_on_list(skb);
		rc = xmit_one(skb, dev, txq, next != NULL);
		if (unlikely(!dev_xmit_complete(rc))) {
			skb->next = next;
			goto out;
		}

		skb = next;
		if (netif_tx_queue_stopped(txq) && skb) {
			rc = NETDEV_TX_BUSY;
			break;
		}
	}

out:
	*ret = rc;
	return skb;
}

static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb,
					  netdev_features_t features)
{
	if (skb_vlan_tag_present(skb) &&
	    !vlan_hw_offload_capable(features, skb->vlan_proto))
		skb = __vlan_hwaccel_push_inside(skb);
	return skb;
}

int skb_csum_hwoffload_help(struct sk_buff *skb,
			    const netdev_features_t features)
{
	if (unlikely(skb->csum_not_inet))
		return !!(features & NETIF_F_SCTP_CRC) ? 0 :
			skb_crc32c_csum_help(skb);

	return !!(features & NETIF_F_CSUM_MASK) ? 0 : skb_checksum_help(skb);
}
EXPORT_SYMBOL(skb_csum_hwoffload_help);

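/*
 * Editorial note (not from the original source): skb_csum_hwoffload_help()
 * returns 0 when the device advertises a checksum feature that covers this
 * skb, and otherwise falls back to computing the checksum in software --
 * CRC32c for skbs flagged csum_not_inet, the ordinary Internet checksum for
 * everything else.
 */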
static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev, bool *again)
{
	netdev_features_t features;

	features = netif_skb_features(skb);
	skb = validate_xmit_vlan(skb, features);
	if (unlikely(!skb))
		goto out_null;

	skb = sk_validate_xmit_skb(skb, dev);
	if (unlikely(!skb))
		goto out_null;

	if (netif_needs_gso(skb, features)) {
		struct sk_buff *segs;

		segs = skb_gso_segment(skb, features);
		if (IS_ERR(segs)) {
			goto out_kfree_skb;
		} else if (segs) {
			consume_skb(skb);
			skb = segs;
		}
	} else {
		if (skb_needs_linearize(skb, features) &&
		    __skb_linearize(skb))
			goto out_kfree_skb;

		/* If packet is not checksummed and device does not
		 * support checksumming for this protocol, complete
		 * checksumming here.
		 */
		if (skb->ip_summed == CHECKSUM_PARTIAL) {
			if (skb->encapsulation)
				skb_set_inner_transport_header(skb,
							       skb_checksum_start_offset(skb));
			else
				skb_set_transport_header(skb,
							 skb_checksum_start_offset(skb));
			if (skb_csum_hwoffload_help(skb, features))
				goto out_kfree_skb;
		}
	}

	skb = validate_xmit_xfrm(skb, features, again);

	return skb;

out_kfree_skb:
	kfree_skb(skb);
out_null:
	atomic_long_inc(&dev->tx_dropped);
	return NULL;
}

struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev, bool *again)
{
	struct sk_buff *next, *head = NULL, *tail;

	for (; skb != NULL; skb = next) {
		next = skb->next;
		skb_mark_not_on_list(skb);

		/* in case skb won't be segmented, point to itself */
		skb->prev = skb;

		skb = validate_xmit_skb(skb, dev, again);
		if (!skb)
			continue;

		if (!head)
			head = skb;
		else
			tail->next = skb;
		/* If skb was segmented, skb->prev points to
		 * the last segment. If not, it still contains skb.
		 */
		tail = skb->prev;
	}
	return head;
}
EXPORT_SYMBOL_GPL(validate_xmit_skb_list);

static void qdisc_pkt_len_init(struct sk_buff *skb)
{
	const struct skb_shared_info *shinfo = skb_shinfo(skb);

	qdisc_skb_cb(skb)->pkt_len = skb->len;

	/* To get more precise estimation of bytes sent on wire,
	 * we add to pkt_len the headers size of all segments
	 */
	if (shinfo->gso_size && skb_transport_header_was_set(skb)) {
		unsigned int hdr_len;
		u16 gso_segs = shinfo->gso_segs;

		/* mac layer + network layer */
		hdr_len = skb_transport_header(skb) - skb_mac_header(skb);

		/* + transport layer */
		if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) {
			const struct tcphdr *th;
			struct tcphdr _tcphdr;

			th = skb_header_pointer(skb, skb_transport_offset(skb),
						sizeof(_tcphdr), &_tcphdr);
			if (likely(th))
				hdr_len += __tcp_hdrlen(th);
		} else {
			struct udphdr _udphdr;

			if (skb_header_pointer(skb, skb_transport_offset(skb),
					       sizeof(_udphdr), &_udphdr))
				hdr_len += sizeof(struct udphdr);
		}

		if (shinfo->gso_type & SKB_GSO_DODGY)
			gso_segs = DIV_ROUND_UP(skb->len - hdr_len,
						shinfo->gso_size);

		qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
	}
}

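/*
 * Editorial example (not from the original source): for a TCP GSO skb with
 * shinfo->gso_segs == 3 and hdr_len == 66 (14 MAC + 20 IP + 32 TCP), the
 * code above accounts for the two extra copies of the headers that will go
 * on the wire, i.e. pkt_len = skb->len + 2 * 66.
 */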
static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
				 struct net_device *dev,
				 struct netdev_queue *txq)
{
	spinlock_t *root_lock = qdisc_lock(q);
	struct sk_buff *to_free = NULL;
	bool contended;
	int rc;

	qdisc_calculate_pkt_len(skb, q);

	if (q->flags & TCQ_F_NOLOCK) {
		rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK;
		qdisc_run(q);

		if (unlikely(to_free))
			kfree_skb_list(to_free);
		return rc;
	}

	/*
	 * Heuristic to force contended enqueues to serialize on a
	 * separate lock before trying to get the qdisc main lock.
	 * This permits the qdisc->running owner to get the lock more
	 * often and dequeue packets faster.
	 */
	contended = qdisc_is_running(q);
	if (unlikely(contended))
		spin_lock(&q->busylock);

	spin_lock(root_lock);
	if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
		__qdisc_drop(skb, &to_free);
		rc = NET_XMIT_DROP;
	} else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
		   qdisc_run_begin(q)) {
		/*
		 * This is a work-conserving queue; there are no old skbs
		 * waiting to be sent out; and the qdisc is not running -
		 * xmit the skb directly.
		 */

		qdisc_bstats_update(q, skb);

		if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) {
			if (unlikely(contended)) {
				spin_unlock(&q->busylock);
				contended = false;
			}
			__qdisc_run(q);
		}

		qdisc_run_end(q);
		rc = NET_XMIT_SUCCESS;
	} else {
		rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK;
		if (qdisc_run_begin(q)) {
			if (unlikely(contended)) {
				spin_unlock(&q->busylock);
				contended = false;
			}
			__qdisc_run(q);
			qdisc_run_end(q);
		}
	}
	spin_unlock(root_lock);
	if (unlikely(to_free))
		kfree_skb_list(to_free);
	if (unlikely(contended))
		spin_unlock(&q->busylock);
	return rc;
}

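/*
 * Editorial note (not from the original source): __dev_xmit_skb() above has
 * three outcomes -- lockless (TCQ_F_NOLOCK) qdiscs enqueue and run without
 * the root lock, an empty TCQ_F_CAN_BYPASS qdisc transmits the skb directly
 * via sch_direct_xmit(), and everything else is enqueued and drained by
 * __qdisc_run() under the qdisc root lock.
 */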
#if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
static void skb_update_prio(struct sk_buff *skb)
{
	const struct netprio_map *map;
	const struct sock *sk;
	unsigned int prioidx;

	if (skb->priority)
		return;
	map = rcu_dereference_bh(skb->dev->priomap);
	if (!map)
		return;
	sk = skb_to_full_sk(skb);
	if (!sk)
		return;

	prioidx = sock_cgroup_prioidx(&sk->sk_cgrp_data);

	if (prioidx < map->priomap_len)
		skb->priority = map->priomap[prioidx];
}
#else
#define skb_update_prio(skb)
#endif

/**
 *	dev_loopback_xmit - loop back @skb
 *	@net: network namespace this loopback is happening in
 *	@sk:  sk needed to be a netfilter okfn
 *	@skb: buffer to transmit
 */
int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	skb_reset_mac_header(skb);
	__skb_pull(skb, skb_network_offset(skb));
	skb->pkt_type = PACKET_LOOPBACK;
	skb->ip_summed = CHECKSUM_UNNECESSARY;
	WARN_ON(!skb_dst(skb));
	skb_dst_force(skb);
	netif_rx_ni(skb);
	return 0;
}
EXPORT_SYMBOL(dev_loopback_xmit);

#ifdef CONFIG_NET_EGRESS
static struct sk_buff *
sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
{
	struct mini_Qdisc *miniq = rcu_dereference_bh(dev->miniq_egress);
	struct tcf_result cl_res;

	if (!miniq)
		return skb;

	/* qdisc_skb_cb(skb)->pkt_len was already set by the caller. */
	mini_qdisc_bstats_cpu_update(miniq, skb);

	switch (tcf_classify(skb, miniq->filter_list, &cl_res, false)) {
	case TC_ACT_OK:
	case TC_ACT_RECLASSIFY:
		skb->tc_index = TC_H_MIN(cl_res.classid);
		break;
	case TC_ACT_SHOT:
		mini_qdisc_qstats_cpu_drop(miniq);
		*ret = NET_XMIT_DROP;
		kfree_skb(skb);
		return NULL;
	case TC_ACT_STOLEN:
	case TC_ACT_QUEUED:
	case TC_ACT_TRAP:
		*ret = NET_XMIT_SUCCESS;
		consume_skb(skb);
		return NULL;
	case TC_ACT_REDIRECT:
		/* No need to push/pop skb's mac_header here on egress! */
		skb_do_redirect(skb);
		*ret = NET_XMIT_SUCCESS;
		return NULL;
	default:
		break;
	}

	return skb;
}
#endif /* CONFIG_NET_EGRESS */

#ifdef CONFIG_XPS
static int __get_xps_queue_idx(struct net_device *dev, struct sk_buff *skb,
			       struct xps_dev_maps *dev_maps, unsigned int tci)
{
	struct xps_map *map;
	int queue_index = -1;

	if (dev->num_tc) {
		tci *= dev->num_tc;
		tci += netdev_get_prio_tc_map(dev, skb->priority);
	}

	map = rcu_dereference(dev_maps->attr_map[tci]);
	if (map) {
		if (map->len == 1)
			queue_index = map->queues[0];
		else
			queue_index = map->queues[reciprocal_scale(
						skb_get_hash(skb), map->len)];
		if (unlikely(queue_index >= dev->real_num_tx_queues))
			queue_index = -1;
	}
	return queue_index;
}
#endif

static int get_xps_queue(struct net_device *dev, struct net_device *sb_dev,
			 struct sk_buff *skb)
{
#ifdef CONFIG_XPS
	struct xps_dev_maps *dev_maps;
	struct sock *sk = skb->sk;
	int queue_index = -1;

	if (!static_key_false(&xps_needed))
		return -1;

	rcu_read_lock();
	if (!static_key_false(&xps_rxqs_needed))
		goto get_cpus_map;

	dev_maps = rcu_dereference(sb_dev->xps_rxqs_map);
	if (dev_maps) {
		int tci = sk_rx_queue_get(sk);

		if (tci >= 0 && tci < dev->num_rx_queues)
			queue_index = __get_xps_queue_idx(dev, skb, dev_maps,
							  tci);
	}

get_cpus_map:
	if (queue_index < 0) {
		dev_maps = rcu_dereference(sb_dev->xps_cpus_map);
		if (dev_maps) {
			unsigned int tci = skb->sender_cpu - 1;

			queue_index = __get_xps_queue_idx(dev, skb, dev_maps,
							  tci);
		}
	}
	rcu_read_unlock();

	return queue_index;
#else
	return -1;
#endif
}

u16 dev_pick_tx_zero(struct net_device *dev, struct sk_buff *skb,
		     struct net_device *sb_dev)
{
	return 0;
}
EXPORT_SYMBOL(dev_pick_tx_zero);

u16 dev_pick_tx_cpu_id(struct net_device *dev, struct sk_buff *skb,
		       struct net_device *sb_dev)
{
	return (u16)raw_smp_processor_id() % dev->real_num_tx_queues;
}
EXPORT_SYMBOL(dev_pick_tx_cpu_id);

u16 netdev_pick_tx(struct net_device *dev, struct sk_buff *skb,
		     struct net_device *sb_dev)
{
	struct sock *sk = skb->sk;
	int queue_index = sk_tx_queue_get(sk);

	sb_dev = sb_dev ? : dev;

	if (queue_index < 0 || skb->ooo_okay ||
	    queue_index >= dev->real_num_tx_queues) {
		int new_index = get_xps_queue(dev, sb_dev, skb);

		if (new_index < 0)
			new_index = skb_tx_hash(dev, sb_dev, skb);

		if (queue_index != new_index && sk &&
		    sk_fullsock(sk) &&
		    rcu_access_pointer(sk->sk_dst_cache))
			sk_tx_queue_set(sk, new_index);

		queue_index = new_index;
	}

	return queue_index;
}
EXPORT_SYMBOL(netdev_pick_tx);

struct netdev_queue *netdev_core_pick_tx(struct net_device *dev,
					 struct sk_buff *skb,
					 struct net_device *sb_dev)
{
	int queue_index = 0;

#ifdef CONFIG_XPS
	u32 sender_cpu = skb->sender_cpu - 1;

	if (sender_cpu >= (u32)NR_CPUS)
		skb->sender_cpu = raw_smp_processor_id() + 1;
#endif

	if (dev->real_num_tx_queues != 1) {
		const struct net_device_ops *ops = dev->netdev_ops;

		if (ops->ndo_select_queue)
			queue_index = ops->ndo_select_queue(dev, skb, sb_dev);
		else
			queue_index = netdev_pick_tx(dev, skb, sb_dev);

		queue_index = netdev_cap_txqueue(dev, queue_index);
	}

	skb_set_queue_mapping(skb, queue_index);
	return netdev_get_tx_queue(dev, queue_index);
}

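/*
 * Editorial note (not from the original source): netdev_core_pick_tx() above
 * prefers the driver's ndo_select_queue() when one is provided, otherwise it
 * falls back to netdev_pick_tx(), which reuses the socket's cached queue or
 * consults XPS / skb_tx_hash(); the result is always clamped by
 * netdev_cap_txqueue().
 */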
/**
 *	__dev_queue_xmit - transmit a buffer
 *	@skb: buffer to transmit
 *	@sb_dev: subordinate device used for L2 forwarding offload
 *
 *	Queue a buffer for transmission to a network device. The caller must
 *	have set the device and priority and built the buffer before calling
 *	this function. The function can be called from an interrupt.
 *
 *	A negative errno code is returned on a failure. A success does not
 *	guarantee the frame will be transmitted as it may be dropped due
 *	to congestion or traffic shaping.
 *
 * -----------------------------------------------------------------------------------
 *      I notice this method can also return errors from the queue disciplines,
 *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
 *      be positive.
 *
 *      Regardless of the return value, the skb is consumed, so it is currently
 *      difficult to retry a send to this method.  (You can bump the ref count
 *      before sending to hold a reference for retry if you are careful.)
 *
 *      When calling this method, interrupts MUST be enabled.  This is because
 *      the BH enable code must have IRQs enabled so that it will not deadlock.
 *          --BLG
 */
static int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev)
{
	struct net_device *dev = skb->dev;
	struct netdev_queue *txq;
	struct Qdisc *q;
	int rc = -ENOMEM;
	bool again = false;

	skb_reset_mac_header(skb);

	if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP))
		__skb_tstamp_tx(skb, NULL, skb->sk, SCM_TSTAMP_SCHED);

	/* Disable soft irqs for various locks below. Also
	 * stops preemption for RCU.
	 */
	rcu_read_lock_bh();

	skb_update_prio(skb);

	qdisc_pkt_len_init(skb);
#ifdef CONFIG_NET_CLS_ACT
	skb->tc_at_ingress = 0;
# ifdef CONFIG_NET_EGRESS
	if (static_branch_unlikely(&egress_needed_key)) {
		skb = sch_handle_egress(skb, &rc, dev);
		if (!skb)
			goto out;
	}
# endif
#endif
	/* If device/qdisc don't need skb->dst, release it right now while
	 * its hot in this cpu cache.
	 */
	if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
		skb_dst_drop(skb);
	else
		skb_dst_force(skb);

	txq = netdev_core_pick_tx(dev, skb, sb_dev);
	q = rcu_dereference_bh(txq->qdisc);

	trace_net_dev_queue(skb);
	if (q->enqueue) {
		rc = __dev_xmit_skb(skb, q, dev, txq);
		goto out;
	}

	/* The device has no queue. Common case for software devices:
	 * loopback, all the sorts of tunnels...

	 * Really, it is unlikely that netif_tx_lock protection is necessary
	 * here.  (f.e. loopback and IP tunnels are clean ignoring statistics
	 * counters.)
	 * However, it is possible, that they rely on protection
	 * made by us here.

	 * Check this and take the lock. It is not prone to deadlocks.
	 * Either way, the noqueue qdisc case is even simpler 8)
	 */
	if (dev->flags & IFF_UP) {
		int cpu = smp_processor_id(); /* ok because BHs are off */

		if (txq->xmit_lock_owner != cpu) {
			if (dev_xmit_recursion())
				goto recursion_alert;

			skb = validate_xmit_skb(skb, dev, &again);
			if (!skb)
				goto out;

			HARD_TX_LOCK(dev, txq, cpu);

			if (!netif_xmit_stopped(txq)) {
				dev_xmit_recursion_inc();
				skb = dev_hard_start_xmit(skb, dev, txq, &rc);
				dev_xmit_recursion_dec();
				if (dev_xmit_complete(rc)) {
					HARD_TX_UNLOCK(dev, txq);
					goto out;
				}
			}
			HARD_TX_UNLOCK(dev, txq);
			net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
					     dev->name);
		} else {
			/* Recursion is detected! It is possible,
			 * unfortunately
			 */
recursion_alert:
			net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
					     dev->name);
		}
	}

	rc = -ENETDOWN;
	rcu_read_unlock_bh();

	atomic_long_inc(&dev->tx_dropped);
	kfree_skb_list(skb);
	return rc;
out:
	rcu_read_unlock_bh();
	return rc;
}

int dev_queue_xmit(struct sk_buff *skb)
{
	return __dev_queue_xmit(skb, NULL);
}
EXPORT_SYMBOL(dev_queue_xmit);

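/*
 * Editorial usage sketch (not part of the original file): per the kernel-doc
 * above, a caller builds the frame, sets skb->dev and skb->priority, and
 * then hands the skb off.  The helper name and context below are
 * hypothetical:
 *
 *	static int example_xmit(struct net_device *dev, struct sk_buff *skb)
 *	{
 *		skb->dev = dev;
 *		skb->priority = TC_PRIO_CONTROL;
 *		return dev_queue_xmit(skb);	// consumes the skb
 *	}
 */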
int dev_queue_xmit_accel(struct sk_buff *skb, struct net_device *sb_dev)
{
	return __dev_queue_xmit(skb, sb_dev);
}
EXPORT_SYMBOL(dev_queue_xmit_accel);

int dev_direct_xmit(struct sk_buff *skb, u16 queue_id)
{
	struct net_device *dev = skb->dev;
	struct sk_buff *orig_skb = skb;
	struct netdev_queue *txq;
	int ret = NETDEV_TX_BUSY;
	bool again = false;

	if (unlikely(!netif_running(dev) ||
		     !netif_carrier_ok(dev)))
		goto drop;

	skb = validate_xmit_skb_list(skb, dev, &again);
	if (skb != orig_skb)
		goto drop;

	skb_set_queue_mapping(skb, queue_id);
	txq = skb_get_tx_queue(dev, skb);

	local_bh_disable();

	dev_xmit_recursion_inc();
	HARD_TX_LOCK(dev, txq, smp_processor_id());
	if (!netif_xmit_frozen_or_drv_stopped(txq))
		ret = netdev_start_xmit(skb, dev, txq, false);
	HARD_TX_UNLOCK(dev, txq);
	dev_xmit_recursion_dec();

	local_bh_enable();

	if (!dev_xmit_complete(ret))
		kfree_skb(skb);

	return ret;
drop:
	atomic_long_inc(&dev->tx_dropped);
	kfree_skb_list(skb);
	return NET_XMIT_DROP;
}
EXPORT_SYMBOL(dev_direct_xmit);

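/*
 * Editorial note (not from the original source): dev_direct_xmit() above
 * bypasses the qdisc layer entirely -- it validates the skb, maps it onto
 * the caller-chosen queue_id and transmits under HARD_TX_LOCK with bottom
 * halves disabled.  A device that is not running drops the skb
 * (NET_XMIT_DROP); a frozen or stopped queue leaves ret at NETDEV_TX_BUSY.
 */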
/*************************************************************************
 *			Receiver routines
 *************************************************************************/

int netdev_max_backlog __read_mostly = 1000;
EXPORT_SYMBOL(netdev_max_backlog);

int netdev_tstamp_prequeue __read_mostly = 1;
int netdev_budget __read_mostly = 300;
/* Must be at least 2 jiffies to guarantee 1 jiffy timeout */
unsigned int __read_mostly netdev_budget_usecs = 2 * USEC_PER_SEC / HZ;
int weight_p __read_mostly = 64;           /* old backlog weight */
int dev_weight_rx_bias __read_mostly = 1;  /* bias for backlog weight */
int dev_weight_tx_bias __read_mostly = 1;  /* bias for output_queue quota */
int dev_rx_weight __read_mostly = 64;
int dev_tx_weight __read_mostly = 64;
/* Maximum number of GRO_NORMAL skbs to batch up for list-RX */
int gro_normal_batch __read_mostly = 8;

/* Called with irq disabled */
static inline void ____napi_schedule(struct softnet_data *sd,
				     struct napi_struct *napi)
{
	list_add_tail(&napi->poll_list, &sd->poll_list);
	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
}

#ifdef CONFIG_RPS

/* One global table that all flow-based protocols share. */
struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
EXPORT_SYMBOL(rps_sock_flow_table);
u32 rps_cpu_mask __read_mostly;
EXPORT_SYMBOL(rps_cpu_mask);

struct static_key_false rps_needed __read_mostly;
EXPORT_SYMBOL(rps_needed);
struct static_key_false rfs_needed __read_mostly;
EXPORT_SYMBOL(rfs_needed);

static struct rps_dev_flow *
set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
	    struct rps_dev_flow *rflow, u16 next_cpu)
{
	if (next_cpu < nr_cpu_ids) {
#ifdef CONFIG_RFS_ACCEL
		struct netdev_rx_queue *rxqueue;
		struct rps_dev_flow_table *flow_table;
		struct rps_dev_flow *old_rflow;
		u32 flow_id;
		u16 rxq_index;
		int rc;

		/* Should we steer this flow to a different hardware queue? */
		if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
		    !(dev->features & NETIF_F_NTUPLE))
			goto out;
		rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
		if (rxq_index == skb_get_rx_queue(skb))
			goto out;

		rxqueue = dev->_rx + rxq_index;
		flow_table = rcu_dereference(rxqueue->rps_flow_table);
		if (!flow_table)
			goto out;
		flow_id = skb_get_hash(skb) & flow_table->mask;
		rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
							rxq_index, flow_id);
		if (rc < 0)
			goto out;
		old_rflow = rflow;
		rflow = &flow_table->flows[flow_id];
		rflow->filter = rc;
		if (old_rflow->filter == rflow->filter)
			old_rflow->filter = RPS_NO_FILTER;
	out:
#endif
		rflow->last_qtail =
			per_cpu(softnet_data, next_cpu).input_queue_head;
	}

	rflow->cpu = next_cpu;
	return rflow;
}

/*
 * get_rps_cpu is called from netif_receive_skb and returns the target
 * CPU from the RPS map of the receiving queue for a given skb.
 * rcu_read_lock must be held on entry.
 */
static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
		       struct rps_dev_flow **rflowp)
{
	const struct rps_sock_flow_table *sock_flow_table;
	struct netdev_rx_queue *rxqueue = dev->_rx;
	struct rps_dev_flow_table *flow_table;
	struct rps_map *map;
	int cpu = -1;
	u32 tcpu;
	u32 hash;

	if (skb_rx_queue_recorded(skb)) {
		u16 index = skb_get_rx_queue(skb);

		if (unlikely(index >= dev->real_num_rx_queues)) {
			WARN_ONCE(dev->real_num_rx_queues > 1,
				  "%s received packet on queue %u, but number "
				  "of RX queues is %u\n",
				  dev->name, index, dev->real_num_rx_queues);
			goto done;
		}
		rxqueue += index;
	}

	/* Avoid computing hash if RFS/RPS is not active for this rxqueue */

	flow_table = rcu_dereference(rxqueue->rps_flow_table);
	map = rcu_dereference(rxqueue->rps_map);
	if (!flow_table && !map)
		goto done;

	skb_reset_network_header(skb);
	hash = skb_get_hash(skb);
	if (!hash)
		goto done;

	sock_flow_table = rcu_dereference(rps_sock_flow_table);
	if (flow_table && sock_flow_table) {
		struct rps_dev_flow *rflow;
		u32 next_cpu;
		u32 ident;

		/* First check into global flow table if there is a match */
		ident = sock_flow_table->ents[hash & sock_flow_table->mask];
		if ((ident ^ hash) & ~rps_cpu_mask)
			goto try_rps;

		next_cpu = ident & rps_cpu_mask;

		/* OK, now we know there is a match,
		 * we can look at the local (per receive queue) flow table
		 */
		rflow = &flow_table->flows[hash & flow_table->mask];
		tcpu = rflow->cpu;

		/*
		 * If the desired CPU (where last recvmsg was done) is
		 * different from current CPU (one in the rx-queue flow
		 * table entry), switch if one of the following holds:
		 *   - Current CPU is unset (>= nr_cpu_ids).
		 *   - Current CPU is offline.
		 *   - The current CPU's queue tail has advanced beyond the
		 *     last packet that was enqueued using this table entry.
		 *     This guarantees that all previous packets for the flow
		 *     have been dequeued, thus preserving in order delivery.
		 */
		if (unlikely(tcpu != next_cpu) &&
		    (tcpu >= nr_cpu_ids || !cpu_online(tcpu) ||
		     ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
		      rflow->last_qtail)) >= 0)) {
			tcpu = next_cpu;
			rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
		}

		if (tcpu < nr_cpu_ids && cpu_online(tcpu)) {
			*rflowp = rflow;
			cpu = tcpu;
			goto done;
		}
	}

try_rps:

	if (map) {
		tcpu = map->cpus[reciprocal_scale(hash, map->len)];
		if (cpu_online(tcpu)) {
			cpu = tcpu;
			goto done;
		}
	}

done:
	return cpu;
}

#ifdef CONFIG_RFS_ACCEL

/**
 * rps_may_expire_flow - check whether an RFS hardware filter may be removed
 * @dev: Device on which the filter was set
 * @rxq_index: RX queue index
 * @flow_id: Flow ID passed to ndo_rx_flow_steer()
 * @filter_id: Filter ID returned by ndo_rx_flow_steer()
 *
 * Drivers that implement ndo_rx_flow_steer() should periodically call
 * this function for each installed filter and remove the filters for
 * which it returns %true.
 */
bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
			 u32 flow_id, u16 filter_id)
{
	struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
	struct rps_dev_flow_table *flow_table;
	struct rps_dev_flow *rflow;
	bool expire = true;
	unsigned int cpu;

	rcu_read_lock();
	flow_table = rcu_dereference(rxqueue->rps_flow_table);
	if (flow_table && flow_id <= flow_table->mask) {
		rflow = &flow_table->flows[flow_id];
		cpu = READ_ONCE(rflow->cpu);
		if (rflow->filter == filter_id && cpu < nr_cpu_ids &&
		    ((int)(per_cpu(softnet_data, cpu).input_queue_head -
			   rflow->last_qtail) <
		     (int)(10 * flow_table->mask)))
			expire = false;
	}
	rcu_read_unlock();
	return expire;
}
EXPORT_SYMBOL(rps_may_expire_flow);

#endif /* CONFIG_RFS_ACCEL */

/* Called from hardirq (IPI) context */
static void rps_trigger_softirq(void *data)
{
	struct softnet_data *sd = data;

	____napi_schedule(sd, &sd->backlog);
	sd->received_rps++;
}

#endif /* CONFIG_RPS */

/*
 * Check if this softnet_data structure is another cpu one
 * If yes, queue it to our IPI list and return 1
 * If no, return 0
 */
static int rps_ipi_queued(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	struct softnet_data *mysd = this_cpu_ptr(&softnet_data);

	if (sd != mysd) {
		sd->rps_ipi_next = mysd->rps_ipi_list;
		mysd->rps_ipi_list = sd;

		__raise_softirq_irqoff(NET_RX_SOFTIRQ);
		return 1;
	}
#endif /* CONFIG_RPS */
	return 0;
}

#ifdef CONFIG_NET_FLOW_LIMIT
int netdev_flow_limit_table_len __read_mostly = (1 << 12);
#endif

static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
{
#ifdef CONFIG_NET_FLOW_LIMIT
	struct sd_flow_limit *fl;
	struct softnet_data *sd;
	unsigned int old_flow, new_flow;

	if (qlen < (netdev_max_backlog >> 1))
		return false;

	sd = this_cpu_ptr(&softnet_data);

	rcu_read_lock();
	fl = rcu_dereference(sd->flow_limit);
	if (fl) {
		new_flow = skb_get_hash(skb) & (fl->num_buckets - 1);
		old_flow = fl->history[fl->history_head];
		fl->history[fl->history_head] = new_flow;

		fl->history_head++;
		fl->history_head &= FLOW_LIMIT_HISTORY - 1;

		if (likely(fl->buckets[old_flow]))
			fl->buckets[old_flow]--;

		if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
			fl->count++;
			rcu_read_unlock();
			return true;
		}
	}
	rcu_read_unlock();
#endif
	return false;
}

/*
 * enqueue_to_backlog is called to queue an skb to a per CPU backlog
 * queue (may be a remote CPU queue).
 */
static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
			      unsigned int *qtail)
{
	struct softnet_data *sd;
	unsigned long flags;
	unsigned int qlen;

	sd = &per_cpu(softnet_data, cpu);

	local_irq_save(flags);

	rps_lock(sd);
	if (!netif_running(skb->dev))
		goto drop;
	qlen = skb_queue_len(&sd->input_pkt_queue);
	if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
		if (qlen) {
enqueue:
			__skb_queue_tail(&sd->input_pkt_queue, skb);
			input_queue_tail_incr_save(sd, qtail);
			rps_unlock(sd);
			local_irq_restore(flags);
			return NET_RX_SUCCESS;
		}

		/* Schedule NAPI for backlog device
		 * We can use non atomic operation since we own the queue lock
		 */
		if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
			if (!rps_ipi_queued(sd))
				____napi_schedule(sd, &sd->backlog);
		}
		goto enqueue;
	}

drop:
	sd->dropped++;
	rps_unlock(sd);

	local_irq_restore(flags);

	atomic_long_inc(&skb->dev->rx_dropped);
	kfree_skb(skb);
	return NET_RX_DROP;
}
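/*
 * Editorial note (not from the original source): enqueue_to_backlog() above
 * appends the skb to the target CPU's softnet input_pkt_queue.  The queue is
 * bounded by netdev_max_backlog and by skb_flow_limit(); if the backlog NAPI
 * is not yet scheduled it is kicked locally, or via an RPS IPI when the
 * target is a remote CPU.
 */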

static struct netdev_rx_queue *netif_get_rxqueue(struct sk_buff *skb)
{
	struct net_device *dev = skb->dev;
	struct netdev_rx_queue *rxqueue;

	rxqueue = dev->_rx;

	if (skb_rx_queue_recorded(skb)) {
		u16 index = skb_get_rx_queue(skb);

		if (unlikely(index >= dev->real_num_rx_queues)) {
			WARN_ONCE(dev->real_num_rx_queues > 1,
				  "%s received packet on queue %u, but number "
				  "of RX queues is %u\n",
				  dev->name, index, dev->real_num_rx_queues);

			return rxqueue; /* Return first rxqueue */
		}
		rxqueue += index;
	}
	return rxqueue;
}

static u32 netif_receive_generic_xdp(struct sk_buff *skb,
				     struct xdp_buff *xdp,
				     struct bpf_prog *xdp_prog)
{
	struct netdev_rx_queue *rxqueue;
	void *orig_data, *orig_data_end;
	u32 metalen, act = XDP_DROP;
	__be16 orig_eth_type;
	struct ethhdr *eth;
	bool orig_bcast;
	int hlen, off;
	u32 mac_len;

	/* Reinjected packets coming from act_mirred or similar should
	 * not get XDP generic processing.
	 */
	if (skb_is_redirected(skb))
		return XDP_PASS;

	/* XDP packets must be linear and must have sufficient headroom
	 * of XDP_PACKET_HEADROOM bytes. This is the guarantee that also
	 * native XDP provides, thus we need to do it here as well.
	 */
	if (skb_cloned(skb) || skb_is_nonlinear(skb) ||
	    skb_headroom(skb) < XDP_PACKET_HEADROOM) {
		int hroom = XDP_PACKET_HEADROOM - skb_headroom(skb);
		int troom = skb->tail + skb->data_len - skb->end;

		/* In case we have to go down the path and also linearize,
		 * then lets do the pskb_expand_head() work just once here.
		 */
		if (pskb_expand_head(skb,
				     hroom > 0 ? ALIGN(hroom, NET_SKB_PAD) : 0,
				     troom > 0 ? troom + 128 : 0, GFP_ATOMIC))
			goto do_drop;
		if (skb_linearize(skb))
			goto do_drop;
	}

	/* The XDP program wants to see the packet starting at the MAC
	 * header.
	 */
	mac_len = skb->data - skb_mac_header(skb);
	hlen = skb_headlen(skb) + mac_len;
	xdp->data = skb->data - mac_len;
	xdp->data_meta = xdp->data;
	xdp->data_end = xdp->data + hlen;
	xdp->data_hard_start = skb->data - skb_headroom(skb);

	/* SKB "head" area always have tailroom for skb_shared_info */
	xdp->frame_sz  = (void *)skb_end_pointer(skb) - xdp->data_hard_start;
	xdp->frame_sz += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));

	orig_data_end = xdp->data_end;
	orig_data = xdp->data;
	eth = (struct ethhdr *)xdp->data;
	orig_bcast = is_multicast_ether_addr_64bits(eth->h_dest);
	orig_eth_type = eth->h_proto;

	rxqueue = netif_get_rxqueue(skb);
	xdp->rxq = &rxqueue->xdp_rxq;

	act = bpf_prog_run_xdp(xdp_prog, xdp);

	/* check if bpf_xdp_adjust_head was used */
	off = xdp->data - orig_data;
	if (off) {
		if (off > 0)
			__skb_pull(skb, off);
		else if (off < 0)
			__skb_push(skb, -off);

		skb->mac_header += off;
		skb_reset_network_header(skb);
	}

	/* check if bpf_xdp_adjust_tail was used */
	off = xdp->data_end - orig_data_end;
	if (off != 0) {
		skb_set_tail_pointer(skb, xdp->data_end - xdp->data);
		skb->len += off; /* positive on grow, negative on shrink */
	}

	/* check if XDP changed eth hdr such SKB needs update */
	eth = (struct ethhdr *)xdp->data;
	if ((orig_eth_type != eth->h_proto) ||
	    (orig_bcast != is_multicast_ether_addr_64bits(eth->h_dest))) {
		__skb_push(skb, ETH_HLEN);
		skb->protocol = eth_type_trans(skb, skb->dev);
	}

	switch (act) {
	case XDP_REDIRECT:
	case XDP_TX:
		__skb_push(skb, mac_len);
		break;
	case XDP_PASS:
		metalen = xdp->data - xdp->data_meta;
		if (metalen)
			skb_metadata_set(skb, metalen);
		break;
	default:
		bpf_warn_invalid_xdp_action(act);
		/* fall through */
	case XDP_ABORTED:
		trace_xdp_exception(skb->dev, xdp_prog, act);
		/* fall through */
	case XDP_DROP:
	do_drop:
		kfree_skb(skb);
		break;
	}

	return act;
}

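/*
 * Editorial note (not from the original source): netif_receive_generic_xdp()
 * above emulates native XDP on an skb -- it linearizes and re-allocates
 * headroom if needed, builds an xdp_buff over the skb head, runs the BPF
 * program, and then propagates any head/tail/MAC-header adjustments the
 * program made back into the skb before acting on the verdict.
 */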
/* When doing generic XDP we have to bypass the qdisc layer and the
 * network taps in order to match in-driver-XDP behavior.
 */
void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog)
{
	struct net_device *dev = skb->dev;
	struct netdev_queue *txq;
	bool free_skb = true;
	int cpu, rc;

	txq = netdev_core_pick_tx(dev, skb, NULL);
	cpu = smp_processor_id();
	HARD_TX_LOCK(dev, txq, cpu);
	if (!netif_xmit_stopped(txq)) {
		rc = netdev_start_xmit(skb, dev, txq, 0);
		if (dev_xmit_complete(rc))
			free_skb = false;
	}
	HARD_TX_UNLOCK(dev, txq);
	if (free_skb) {
		trace_xdp_exception(dev, xdp_prog, XDP_TX);
		kfree_skb(skb);
	}
}

static DEFINE_STATIC_KEY_FALSE(generic_xdp_needed_key);

int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff *skb)
{
	if (xdp_prog) {
		struct xdp_buff xdp;
		u32 act;
		int err;

		act = netif_receive_generic_xdp(skb, &xdp, xdp_prog);
		if (act != XDP_PASS) {
			switch (act) {
			case XDP_REDIRECT:
				err = xdp_do_generic_redirect(skb->dev, skb,
							      &xdp, xdp_prog);
				if (err)
					goto out_redir;
				break;
			case XDP_TX:
				generic_xdp_tx(skb, xdp_prog);
				break;
			}
			return XDP_DROP;
		}
	}
	return XDP_PASS;
out_redir:
	kfree_skb(skb);
	return XDP_DROP;
}
EXPORT_SYMBOL_GPL(do_xdp_generic);

static int netif_rx_internal(struct sk_buff *skb)
{
	int ret;

	net_timestamp_check(netdev_tstamp_prequeue, skb);

	trace_netif_rx(skb);

#ifdef CONFIG_RPS
	if (static_branch_unlikely(&rps_needed)) {
		struct rps_dev_flow voidflow, *rflow = &voidflow;
		int cpu;

		preempt_disable();
		rcu_read_lock();

		cpu = get_rps_cpu(skb->dev, skb, &rflow);
		if (cpu < 0)
			cpu = smp_processor_id();

		ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);

		rcu_read_unlock();
		preempt_enable();
	} else
#endif
	{
		unsigned int qtail;

		ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
		put_cpu();
	}
	return ret;
}

/**
 *	netif_rx	-	post buffer to the network code
 *	@skb: buffer to post
 *
 *	This function receives a packet from a device driver and queues it for
 *	the upper (protocol) levels to process.  It always succeeds. The buffer
 *	may be dropped during processing for congestion control or by the
 *	protocol layers.
 *
 *	return values:
 *	NET_RX_SUCCESS	(no congestion)
 *	NET_RX_DROP     (packet was dropped)
 *
 */

int netif_rx(struct sk_buff *skb)
{
	int ret;

	trace_netif_rx_entry(skb);

	ret = netif_rx_internal(skb);
	trace_netif_rx_exit(ret);

	return ret;
}
EXPORT_SYMBOL(netif_rx);

int netif_rx_ni(struct sk_buff *skb)
{
	int err;

	trace_netif_rx_ni_entry(skb);

	preempt_disable();
	err = netif_rx_internal(skb);
	if (local_softirq_pending())
		do_softirq();
	preempt_enable();
	trace_netif_rx_ni_exit(err);

	return err;
}
EXPORT_SYMBOL(netif_rx_ni);

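/*
 * Editorial note (not from the original source): netif_rx() may be called
 * from interrupt context and only queues the packet on a backlog, while
 * netif_rx_ni() is the process-context variant -- it disables preemption
 * around the same enqueue and then runs any pending softirqs itself.
 */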
static __latent_entropy void net_tx_action(struct softirq_action *h)
{
	struct softnet_data *sd = this_cpu_ptr(&softnet_data);

	if (sd->completion_queue) {
		struct sk_buff *clist;

		local_irq_disable();
		clist = sd->completion_queue;
		sd->completion_queue = NULL;
		local_irq_enable();

		while (clist) {
			struct sk_buff *skb = clist;

			clist = clist->next;

			WARN_ON(refcount_read(&skb->users));
			if (likely(get_kfree_skb_cb(skb)->reason == SKB_REASON_CONSUMED))
				trace_consume_skb(skb);
			else
				trace_kfree_skb(skb, net_tx_action);

			if (skb->fclone != SKB_FCLONE_UNAVAILABLE)
				__kfree_skb(skb);
			else
				__kfree_skb_defer(skb);
		}

		__kfree_skb_flush();
	}

	if (sd->output_queue) {
		struct Qdisc *head;

		local_irq_disable();
		head = sd->output_queue;
		sd->output_queue = NULL;
		sd->output_queue_tailp = &sd->output_queue;
		local_irq_enable();

		while (head) {
			struct Qdisc *q = head;
			spinlock_t *root_lock = NULL;

			head = head->next_sched;

			if (!(q->flags & TCQ_F_NOLOCK)) {
				root_lock = qdisc_lock(q);
				spin_lock(root_lock);
			}
			/* We need to make sure head->next_sched is read
			 * before clearing __QDISC_STATE_SCHED
			 */
			smp_mb__before_atomic();
			clear_bit(__QDISC_STATE_SCHED, &q->state);
			qdisc_run(q);
			if (root_lock)
				spin_unlock(root_lock);
		}
	}

	xfrm_dev_backlog(sd);
}

#if IS_ENABLED(CONFIG_BRIDGE) && IS_ENABLED(CONFIG_ATM_LANE)
/* This hook is defined here for ATM LANE */
int (*br_fdb_test_addr_hook)(struct net_device *dev,
			     unsigned char *addr) __read_mostly;
EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
#endif

static inline struct sk_buff *
sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
		   struct net_device *orig_dev)
{
#ifdef CONFIG_NET_CLS_ACT
	struct mini_Qdisc *miniq = rcu_dereference_bh(skb->dev->miniq_ingress);
	struct tcf_result cl_res;

	/* If there's at least one ingress present somewhere (so
	 * we get here via enabled static key), remaining devices
	 * that are not configured with an ingress qdisc will bail
	 * out here.
	 */
	if (!miniq)
		return skb;

	if (*pt_prev) {
		*ret = deliver_skb(skb, *pt_prev, orig_dev);
		*pt_prev = NULL;
	}

	qdisc_skb_cb(skb)->pkt_len = skb->len;
	skb->tc_at_ingress = 1;
	mini_qdisc_bstats_cpu_update(miniq, skb);

	switch (tcf_classify_ingress(skb, miniq->block, miniq->filter_list,
				     &cl_res, false)) {
	case TC_ACT_OK:
	case TC_ACT_RECLASSIFY:
		skb->tc_index = TC_H_MIN(cl_res.classid);
		break;
	case TC_ACT_SHOT:
		mini_qdisc_qstats_cpu_drop(miniq);
		kfree_skb(skb);
		return NULL;
	case TC_ACT_STOLEN:
	case TC_ACT_QUEUED:
	case TC_ACT_TRAP:
		consume_skb(skb);
		return NULL;
	case TC_ACT_REDIRECT:
		/* skb_mac_header check was done by cls/act_bpf, so
		 * we can safely push the L2 header back before
		 * redirecting to another netdev
		 */
		__skb_push(skb, skb->mac_len);
		skb_do_redirect(skb);
		return NULL;
	case TC_ACT_CONSUMED:
		return NULL;
	default:
		break;
	}
#endif /* CONFIG_NET_CLS_ACT */
	return skb;
}

/**
 *	netdev_is_rx_handler_busy - check if receive handler is registered
 *	@dev: device to check
 *
 *	Check if a receive handler is already registered for a given device.
 *	Return true if there is one.
 *
 *	The caller must hold the rtnl_mutex.
 */
bool netdev_is_rx_handler_busy(struct net_device *dev)
{
	ASSERT_RTNL();
	return dev && rtnl_dereference(dev->rx_handler);
}
EXPORT_SYMBOL_GPL(netdev_is_rx_handler_busy);

/**
 *	netdev_rx_handler_register - register receive handler
 *	@dev: device to register a handler for
 *	@rx_handler: receive handler to register
 *	@rx_handler_data: data pointer that is used by rx handler
 *
 *	Register a receive handler for a device. This handler will then be
 *	called from __netif_receive_skb. A negative errno code is returned
 *	on a failure.
 *
 *	The caller must hold the rtnl_mutex.
 *
 *	For a general description of rx_handler, see enum rx_handler_result.
 */
int netdev_rx_handler_register(struct net_device *dev,
			       rx_handler_func_t *rx_handler,
			       void *rx_handler_data)
{
	if (netdev_is_rx_handler_busy(dev))
		return -EBUSY;

	if (dev->priv_flags & IFF_NO_RX_HANDLER)
		return -EINVAL;

	/* Note: rx_handler_data must be set before rx_handler */
	rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
	rcu_assign_pointer(dev->rx_handler, rx_handler);

	return 0;
}
EXPORT_SYMBOL_GPL(netdev_rx_handler_register);

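/*
 * Editorial usage sketch (not part of the original file): a stacked device
 * such as a bridge or bonding port attaches a handler roughly like this --
 * the names example_handler/example_port are hypothetical:
 *
 *	static rx_handler_result_t example_handler(struct sk_buff **pskb)
 *	{
 *		// inspect or redirect *pskb here
 *		return RX_HANDLER_PASS;
 *	}
 *
 *	// caller holds rtnl_lock()
 *	err = netdev_rx_handler_register(dev, example_handler, example_port);
 */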
/**
 *	netdev_rx_handler_unregister - unregister receive handler
 *	@dev: device to unregister a handler from
 *
 *	Unregister a receive handler from a device.
 *
 *	The caller must hold the rtnl_mutex.
 */
void netdev_rx_handler_unregister(struct net_device *dev)
{

	ASSERT_RTNL();
	RCU_INIT_POINTER(dev->rx_handler, NULL);
	/* a reader seeing a non NULL rx_handler in a rcu_read_lock()
	 * section has a guarantee to see a non NULL rx_handler_data
	 * as well.
	 */
	synchronize_net();
	RCU_INIT_POINTER(dev->rx_handler_data, NULL);
}
EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);

/*
 * Limit the use of PFMEMALLOC reserves to those protocols that implement
 * the special handling of PFMEMALLOC skbs.
 */
static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
{
	switch (skb->protocol) {
	case htons(ETH_P_ARP):
	case htons(ETH_P_IP):
	case htons(ETH_P_IPV6):
	case htons(ETH_P_8021Q):
	case htons(ETH_P_8021AD):
		return true;
	default:
		return false;
	}
}

static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev,
			     int *ret, struct net_device *orig_dev)
{
	if (nf_hook_ingress_active(skb)) {
		int ingress_retval;

		if (*pt_prev) {
			*ret = deliver_skb(skb, *pt_prev, orig_dev);
			*pt_prev = NULL;
		}

		rcu_read_lock();
		ingress_retval = nf_hook_ingress(skb);
		rcu_read_unlock();
		return ingress_retval;
	}
	return 0;
}

static int __netif_receive_skb_core(struct sk_buff **pskb, bool pfmemalloc,
				    struct packet_type **ppt_prev)
{
	struct packet_type *ptype, *pt_prev;
	rx_handler_func_t *rx_handler;
	struct sk_buff *skb = *pskb;
	struct net_device *orig_dev;
	bool deliver_exact = false;
	int ret = NET_RX_DROP;
	__be16 type;

	net_timestamp_check(!netdev_tstamp_prequeue, skb);

	trace_netif_receive_skb(skb);

	orig_dev = skb->dev;

	skb_reset_network_header(skb);
	if (!skb_transport_header_was_set(skb))
		skb_reset_transport_header(skb);
	skb_reset_mac_len(skb);

	pt_prev = NULL;

another_round:
	skb->skb_iif = skb->dev->ifindex;

	__this_cpu_inc(softnet_data.processed);

	if (static_branch_unlikely(&generic_xdp_needed_key)) {
		int ret2;

		preempt_disable();
		ret2 = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog), skb);
		preempt_enable();

		if (ret2 != XDP_PASS) {
			ret = NET_RX_DROP;
			goto out;
		}
		skb_reset_mac_len(skb);
	}

	if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
	    skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
		skb = skb_vlan_untag(skb);
		if (unlikely(!skb))
			goto out;
	}

	if (skb_skip_tc_classify(skb))
		goto skip_classify;

	if (pfmemalloc)
		goto skip_taps;

	list_for_each_entry_rcu(ptype, &ptype_all, list) {
		if (pt_prev)
			ret = deliver_skb(skb, pt_prev, orig_dev);
		pt_prev = ptype;
	}

	list_for_each_entry_rcu(ptype, &skb->dev->ptype_all, list) {
		if (pt_prev)
			ret = deliver_skb(skb, pt_prev, orig_dev);
		pt_prev = ptype;
	}

skip_taps:
#ifdef CONFIG_NET_INGRESS
	if (static_branch_unlikely(&ingress_needed_key)) {
		skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev);
		if (!skb)
			goto out;

		if (nf_ingress(skb, &pt_prev, &ret, orig_dev) < 0)
			goto out;
	}
#endif
	skb_reset_redirect(skb);
skip_classify:
	if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
		goto drop;

	if (skb_vlan_tag_present(skb)) {
		if (pt_prev) {
			ret = deliver_skb(skb, pt_prev, orig_dev);
			pt_prev = NULL;
		}
		if (vlan_do_receive(&skb))
			goto another_round;
		else if (unlikely(!skb))
			goto out;
	}

	rx_handler = rcu_dereference(skb->dev->rx_handler);
	if (rx_handler) {
		if (pt_prev) {
			ret = deliver_skb(skb, pt_prev, orig_dev);
			pt_prev = NULL;
		}
		switch (rx_handler(&skb)) {
		case RX_HANDLER_CONSUMED:
			ret = NET_RX_SUCCESS;
			goto out;
		case RX_HANDLER_ANOTHER:
			goto another_round;
		case RX_HANDLER_EXACT:
			deliver_exact = true;
		case RX_HANDLER_PASS:
			break;
		default:
			BUG();
		}
	}

	if (unlikely(skb_vlan_tag_present(skb))) {
check_vlan_id:
		if (skb_vlan_tag_get_id(skb)) {
			/* Vlan id is non 0 and vlan_do_receive() above couldn't
			 * find vlan device.
			 */
			skb->pkt_type = PACKET_OTHERHOST;
		} else if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
			   skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
			/* Outer header is 802.1P with vlan 0, inner header is
			 * 802.1Q or 802.1AD and vlan_do_receive() above could
			 * not find vlan dev for vlan id 0.
			 */
			__vlan_hwaccel_clear_tag(skb);
			skb = skb_vlan_untag(skb);
			if (unlikely(!skb))
				goto out;
			if (vlan_do_receive(&skb))
				/* After stripping off 802.1P header with vlan 0
				 * vlan dev is found for inner header.
				 */
				goto another_round;
			else if (unlikely(!skb))
				goto out;
			else
				/* We have stripped outer 802.1P vlan 0 header.
				 * But could not find vlan dev.
				 * check again for vlan id to set OTHERHOST.
				 */
				goto check_vlan_id;
		}
		/* Note: we might in the future use prio bits
		 * and set skb->priority like in vlan_do_receive()
		 * For the time being, just ignore Priority Code Point
		 */
		__vlan_hwaccel_clear_tag(skb);
	}

	type = skb->protocol;

	/* deliver only exact match when indicated */
	if (likely(!deliver_exact)) {
		deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
				       &ptype_base[ntohs(type) &
						   PTYPE_HASH_MASK]);
	}

	deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
			       &orig_dev->ptype_specific);

	if (unlikely(skb->dev != orig_dev)) {
		deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
				       &skb->dev->ptype_specific);
	}

	if (pt_prev) {
		if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC)))
			goto drop;
		*ppt_prev = pt_prev;
	} else {
drop:
		if (!deliver_exact)
			atomic_long_inc(&skb->dev->rx_dropped);
		else
			atomic_long_inc(&skb->dev->rx_nohandler);
		kfree_skb(skb);
		/* Jamal, now you will not be able to escape explaining
		 * to me how you were going to use this. :-)
		 */
		ret = NET_RX_DROP;
	}

out:
	/* The invariant here is that if *ppt_prev is not NULL
	 * then skb should also be non-NULL.
	 *
	 * Apparently *ppt_prev assignment above holds this invariant due to
	 * skb dereferencing near it.
	 */
	*pskb = skb;
	return ret;
}

static int __netif_receive_skb_one_core(struct sk_buff *skb, bool pfmemalloc)
{
	struct net_device *orig_dev = skb->dev;
	struct packet_type *pt_prev = NULL;
	int ret;

	ret = __netif_receive_skb_core(&skb, pfmemalloc, &pt_prev);
	if (pt_prev)
		ret = INDIRECT_CALL_INET(pt_prev->func, ipv6_rcv, ip_rcv, skb,
					 skb->dev, pt_prev, orig_dev);
	return ret;
}

/**
 *	netif_receive_skb_core - special purpose version of netif_receive_skb
 *	@skb: buffer to process
 *
 *	More direct receive version of netif_receive_skb().  It should
 *	only be used by callers that have a need to skip RPS and Generic XDP.
 *	Caller must also take care of handling if ``(page_is_)pfmemalloc``.
 *
 *	This function may only be called from softirq context and interrupts
 *	should be enabled.
 *
 *	Return values (usually ignored):
 *	NET_RX_SUCCESS: no congestion
 *	NET_RX_DROP: packet was dropped
 */
int netif_receive_skb_core(struct sk_buff *skb)
{
	int ret;

	rcu_read_lock();
	ret = __netif_receive_skb_one_core(skb, false);
	rcu_read_unlock();

	return ret;
}
EXPORT_SYMBOL(netif_receive_skb_core);

5318 5319 5320
static inline void __netif_receive_skb_list_ptype(struct list_head *head,
						  struct packet_type *pt_prev,
						  struct net_device *orig_dev)
5321 5322 5323
{
	struct sk_buff *skb, *next;

5324 5325 5326 5327
	if (!pt_prev)
		return;
	if (list_empty(head))
		return;
5328
	if (pt_prev->list_func != NULL)
5329 5330
		INDIRECT_CALL_INET(pt_prev->list_func, ipv6_list_rcv,
				   ip_list_rcv, head, pt_prev, orig_dev);
5331
	else
5332 5333
		list_for_each_entry_safe(skb, next, head, list) {
			skb_list_del_init(skb);
5334
			pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
5335
		}
5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355
}

static void __netif_receive_skb_list_core(struct list_head *head, bool pfmemalloc)
{
	/* Fast-path assumptions:
	 * - There is no RX handler.
	 * - Only one packet_type matches.
	 * If either of these fails, we will end up doing some per-packet
	 * processing in-line, then handling the 'last ptype' for the whole
	 * sublist.  This can't cause out-of-order delivery to any single ptype,
	 * because the 'last ptype' must be constant across the sublist, and all
	 * other ptypes are handled per-packet.
	 */
	/* Current (common) ptype of sublist */
	struct packet_type *pt_curr = NULL;
	/* Current (common) orig_dev of sublist */
	struct net_device *od_curr = NULL;
	struct list_head sublist;
	struct sk_buff *skb, *next;

5356
	INIT_LIST_HEAD(&sublist);
5357 5358 5359 5360
	list_for_each_entry_safe(skb, next, head, list) {
		struct net_device *orig_dev = skb->dev;
		struct packet_type *pt_prev = NULL;

5361
		skb_list_del_init(skb);
5362
		__netif_receive_skb_core(&skb, pfmemalloc, &pt_prev);
5363 5364
		if (!pt_prev)
			continue;
5365 5366 5367 5368
		if (pt_curr != pt_prev || od_curr != orig_dev) {
			/* dispatch old sublist */
			__netif_receive_skb_list_ptype(&sublist, pt_curr, od_curr);
			/* start new sublist */
5369
			INIT_LIST_HEAD(&sublist);
5370 5371 5372
			pt_curr = pt_prev;
			od_curr = orig_dev;
		}
5373
		list_add_tail(&skb->list, &sublist);
5374 5375 5376
	}

	/* dispatch final sublist */
5377
	__netif_receive_skb_list_ptype(&sublist, pt_curr, od_curr);
5378 5379
}

5380 5381 5382 5383 5384
static int __netif_receive_skb(struct sk_buff *skb)
{
	int ret;

	if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
5385
		unsigned int noreclaim_flag;
5386 5387 5388 5389 5390 5391 5392 5393 5394 5395

		/*
		 * PFMEMALLOC skbs are special, they should
		 * - be delivered to SOCK_MEMALLOC sockets only
		 * - stay away from userspace
		 * - have bounded memory usage
		 *
		 * Use PF_MEMALLOC as this saves us from propagating the allocation
		 * context down to all allocation sites.
		 */
5396
		noreclaim_flag = memalloc_noreclaim_save();
5397
		ret = __netif_receive_skb_one_core(skb, true);
5398
		memalloc_noreclaim_restore(noreclaim_flag);
5399
	} else
5400
		ret = __netif_receive_skb_one_core(skb, false);
5401

L
Linus Torvalds 已提交
5402 5403
	return ret;
}
T
Tom Herbert 已提交
5404

5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416
static void __netif_receive_skb_list(struct list_head *head)
{
	unsigned long noreclaim_flag = 0;
	struct sk_buff *skb, *next;
	bool pfmemalloc = false; /* Is current sublist PF_MEMALLOC? */

	list_for_each_entry_safe(skb, next, head, list) {
		if ((sk_memalloc_socks() && skb_pfmemalloc(skb)) != pfmemalloc) {
			struct list_head sublist;

			/* Handle the previous sublist */
			list_cut_before(&sublist, head, &skb->list);
5417 5418
			if (!list_empty(&sublist))
				__netif_receive_skb_list_core(&sublist, pfmemalloc);
5419 5420 5421 5422 5423 5424 5425 5426 5427
			pfmemalloc = !pfmemalloc;
			/* See comments in __netif_receive_skb */
			if (pfmemalloc)
				noreclaim_flag = memalloc_noreclaim_save();
			else
				memalloc_noreclaim_restore(noreclaim_flag);
		}
	}
	/* Handle the remaining sublist */
5428 5429
	if (!list_empty(head))
		__netif_receive_skb_list_core(head, pfmemalloc);
5430 5431 5432 5433 5434
	/* Restore pflags */
	if (pfmemalloc)
		memalloc_noreclaim_restore(noreclaim_flag);
}

5435
static int generic_xdp_install(struct net_device *dev, struct netdev_bpf *xdp)
D
David S. Miller 已提交
5436
{
M
Martin KaFai Lau 已提交
5437
	struct bpf_prog *old = rtnl_dereference(dev->xdp_prog);
D
David S. Miller 已提交
5438 5439 5440
	struct bpf_prog *new = xdp->prog;
	int ret = 0;

5441 5442 5443 5444 5445 5446 5447 5448 5449
	if (new) {
		u32 i;

		/* generic XDP does not work with DEVMAPs that can
		 * have a bpf_prog installed on an entry
		 */
		for (i = 0; i < new->aux->used_map_cnt; i++) {
			if (dev_map_can_have_prog(new->aux->used_maps[i]))
				return -EINVAL;
5450 5451
			if (cpu_map_prog_allowed(new->aux->used_maps[i]))
				return -EINVAL;
5452 5453 5454
		}
	}

D
David S. Miller 已提交
5455
	switch (xdp->command) {
M
Martin KaFai Lau 已提交
5456
	case XDP_SETUP_PROG:
D
David S. Miller 已提交
5457 5458 5459 5460 5461
		rcu_assign_pointer(dev->xdp_prog, new);
		if (old)
			bpf_prog_put(old);

		if (old && !new) {
5462
			static_branch_dec(&generic_xdp_needed_key);
D
David S. Miller 已提交
5463
		} else if (new && !old) {
5464
			static_branch_inc(&generic_xdp_needed_key);
D
David S. Miller 已提交
5465
			dev_disable_lro(dev);
5466
			dev_disable_gro_hw(dev);
D
David S. Miller 已提交
5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477
		}
		break;

	default:
		ret = -EINVAL;
		break;
	}

	return ret;
}

5478
static int netif_receive_skb_internal(struct sk_buff *skb)
T
Tom Herbert 已提交
5479
{
5480 5481
	int ret;

5482
	net_timestamp_check(netdev_tstamp_prequeue, skb);
E
Eric Dumazet 已提交
5483

5484 5485 5486
	if (skb_defer_rx_timestamp(skb))
		return NET_RX_SUCCESS;

5487
	rcu_read_lock();
E
Eric Dumazet 已提交
5488
#ifdef CONFIG_RPS
5489
	if (static_branch_unlikely(&rps_needed)) {
E
Eric Dumazet 已提交
5490
		struct rps_dev_flow voidflow, *rflow = &voidflow;
5491
		int cpu = get_rps_cpu(skb->dev, skb, &rflow);
T
Tom Herbert 已提交
5492

E
Eric Dumazet 已提交
5493 5494 5495
		if (cpu >= 0) {
			ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
			rcu_read_unlock();
5496
			return ret;
E
Eric Dumazet 已提交
5497
		}
T
Tom Herbert 已提交
5498
	}
5499
#endif
5500 5501 5502
	ret = __netif_receive_skb(skb);
	rcu_read_unlock();
	return ret;
T
Tom Herbert 已提交
5503
}
5504

5505 5506 5507
static void netif_receive_skb_list_internal(struct list_head *head)
{
	struct sk_buff *skb, *next;
5508
	struct list_head sublist;
5509

5510
	INIT_LIST_HEAD(&sublist);
5511 5512
	list_for_each_entry_safe(skb, next, head, list) {
		net_timestamp_check(netdev_tstamp_prequeue, skb);
5513
		skb_list_del_init(skb);
5514 5515
		if (!skb_defer_rx_timestamp(skb))
			list_add_tail(&skb->list, &sublist);
5516
	}
5517
	list_splice_init(&sublist, head);
5518 5519 5520

	rcu_read_lock();
#ifdef CONFIG_RPS
5521
	if (static_branch_unlikely(&rps_needed)) {
5522 5523 5524 5525 5526
		list_for_each_entry_safe(skb, next, head, list) {
			struct rps_dev_flow voidflow, *rflow = &voidflow;
			int cpu = get_rps_cpu(skb->dev, skb, &rflow);

			if (cpu >= 0) {
5527
				/* Will be handled, remove from list */
5528
				skb_list_del_init(skb);
5529
				enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
5530 5531 5532 5533 5534 5535 5536 5537
			}
		}
	}
#endif
	__netif_receive_skb_list(head);
	rcu_read_unlock();
}

5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552
/**
 *	netif_receive_skb - process receive buffer from network
 *	@skb: buffer to process
 *
 *	netif_receive_skb() is the main receive data processing function.
 *	It always succeeds. The buffer may be dropped during processing
 *	for congestion control or by the protocol layers.
 *
 *	This function may only be called from softirq context and interrupts
 *	should be enabled.
 *
 *	Return values (usually ignored):
 *	NET_RX_SUCCESS: no congestion
 *	NET_RX_DROP: packet was dropped
 */
5553
int netif_receive_skb(struct sk_buff *skb)
5554
{
5555 5556
	int ret;

5557 5558
	trace_netif_receive_skb_entry(skb);

5559 5560 5561 5562
	ret = netif_receive_skb_internal(skb);
	trace_netif_receive_skb_exit(ret);

	return ret;
5563
}
5564
EXPORT_SYMBOL(netif_receive_skb);
L
Linus Torvalds 已提交
5565

5566 5567 5568 5569
/**
 *	netif_receive_skb_list - process many receive buffers from network
 *	@head: list of skbs to process.
 *
5570 5571
 *	Since return value of netif_receive_skb() is normally ignored, and
 *	wouldn't be meaningful for a list, this function returns void.
5572 5573 5574 5575 5576 5577
 *
 *	This function may only be called from softirq context and interrupts
 *	should be enabled.
 */
void netif_receive_skb_list(struct list_head *head)
{
5578
	struct sk_buff *skb;
5579

5580 5581
	if (list_empty(head))
		return;
5582 5583 5584 5585
	if (trace_netif_receive_skb_list_entry_enabled()) {
		list_for_each_entry(skb, head, list)
			trace_netif_receive_skb_list_entry(skb);
	}
5586
	netif_receive_skb_list_internal(head);
5587
	trace_netif_receive_skb_list_exit(0);
5588 5589 5590
}
EXPORT_SYMBOL(netif_receive_skb_list);

5591
static DEFINE_PER_CPU(struct work_struct, flush_works);
5592 5593 5594

/* Network device is going away, flush any packets still pending */
static void flush_backlog(struct work_struct *work)
5595 5596
{
	struct sk_buff *skb, *tmp;
5597 5598 5599 5600
	struct softnet_data *sd;

	local_bh_disable();
	sd = this_cpu_ptr(&softnet_data);
5601

5602
	local_irq_disable();
E
Eric Dumazet 已提交
5603
	rps_lock(sd);
5604
	skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
5605
		if (skb->dev->reg_state == NETREG_UNREGISTERING) {
E
Eric Dumazet 已提交
5606
			__skb_unlink(skb, &sd->input_pkt_queue);
5607
			dev_kfree_skb_irq(skb);
5608
			input_queue_head_incr(sd);
5609
		}
5610
	}
E
Eric Dumazet 已提交
5611
	rps_unlock(sd);
5612
	local_irq_enable();
5613 5614

	skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
5615
		if (skb->dev->reg_state == NETREG_UNREGISTERING) {
5616 5617
			__skb_unlink(skb, &sd->process_queue);
			kfree_skb(skb);
5618
			input_queue_head_incr(sd);
5619 5620
		}
	}
5621 5622 5623
	local_bh_enable();
}

5624
static void flush_all_backlogs(void)
5625 5626 5627 5628 5629
{
	unsigned int cpu;

	get_online_cpus();

5630 5631 5632
	for_each_online_cpu(cpu)
		queue_work_on(cpu, system_highpri_wq,
			      per_cpu_ptr(&flush_works, cpu));
5633 5634

	for_each_online_cpu(cpu)
5635
		flush_work(per_cpu_ptr(&flush_works, cpu));
5636 5637

	put_online_cpus();
5638 5639
}

5640 5641 5642 5643 5644 5645 5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659
/* Pass the currently batched GRO_NORMAL SKBs up to the stack. */
static void gro_normal_list(struct napi_struct *napi)
{
	if (!napi->rx_count)
		return;
	netif_receive_skb_list_internal(&napi->rx_list);
	INIT_LIST_HEAD(&napi->rx_list);
	napi->rx_count = 0;
}

/* Queue one GRO_NORMAL SKB up for list processing. If batch size exceeded,
 * pass the whole batch up to the stack.
 */
static void gro_normal_one(struct napi_struct *napi, struct sk_buff *skb)
{
	list_add_tail(&skb->list, &napi->rx_list);
	if (++napi->rx_count >= gro_normal_batch)
		gro_normal_list(napi);
}

5660 5661
INDIRECT_CALLABLE_DECLARE(int inet_gro_complete(struct sk_buff *, int));
INDIRECT_CALLABLE_DECLARE(int ipv6_gro_complete(struct sk_buff *, int));
5662
static int napi_gro_complete(struct napi_struct *napi, struct sk_buff *skb)
5663
{
5664
	struct packet_offload *ptype;
5665
	__be16 type = skb->protocol;
5666
	struct list_head *head = &offload_base;
5667 5668
	int err = -ENOENT;

5669 5670
	BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));

5671 5672
	if (NAPI_GRO_CB(skb)->count == 1) {
		skb_shinfo(skb)->gso_size = 0;
5673
		goto out;
5674
	}
5675 5676 5677

	rcu_read_lock();
	list_for_each_entry_rcu(ptype, head, list) {
5678
		if (ptype->type != type || !ptype->callbacks.gro_complete)
5679 5680
			continue;

5681 5682 5683
		err = INDIRECT_CALL_INET(ptype->callbacks.gro_complete,
					 ipv6_gro_complete, inet_gro_complete,
					 skb, 0);
5684 5685 5686 5687 5688 5689 5690 5691 5692 5693 5694
		break;
	}
	rcu_read_unlock();

	if (err) {
		WARN_ON(&ptype->list == head);
		kfree_skb(skb);
		return NET_RX_SUCCESS;
	}

out:
5695 5696
	gro_normal_one(napi, skb);
	return NET_RX_SUCCESS;
5697 5698
}

5699
static void __napi_gro_flush_chain(struct napi_struct *napi, u32 index,
5700
				   bool flush_old)
5701
{
5702
	struct list_head *head = &napi->gro_hash[index].list;
5703
	struct sk_buff *skb, *p;
5704

5705
	list_for_each_entry_safe_reverse(skb, p, head, list) {
5706 5707
		if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
			return;
5708
		skb_list_del_init(skb);
5709
		napi_gro_complete(napi, skb);
5710
		napi->gro_hash[index].count--;
5711
	}
L
Li RongQing 已提交
5712 5713 5714

	if (!napi->gro_hash[index].count)
		__clear_bit(index, &napi->gro_bitmask);
5715
}
5716

5717
/* napi->gro_hash[].list contains packets ordered by age.
5718 5719 5720 5721 5722
 * youngest packets at the head of it.
 * Complete skbs in reverse order to reduce latencies.
 */
void napi_gro_flush(struct napi_struct *napi, bool flush_old)
{
5723 5724
	unsigned long bitmask = napi->gro_bitmask;
	unsigned int i, base = ~0U;
5725

5726 5727 5728 5729
	while ((i = ffs(bitmask)) != 0) {
		bitmask >>= i;
		base += i;
		__napi_gro_flush_chain(napi, base, flush_old);
L
Li RongQing 已提交
5730
	}
5731
}
E
Eric Dumazet 已提交
5732
EXPORT_SYMBOL(napi_gro_flush);
5733

5734 5735
static struct list_head *gro_list_prepare(struct napi_struct *napi,
					  struct sk_buff *skb)
E
Eric Dumazet 已提交
5736 5737
{
	unsigned int maclen = skb->dev->hard_header_len;
5738
	u32 hash = skb_get_hash_raw(skb);
5739
	struct list_head *head;
5740
	struct sk_buff *p;
E
Eric Dumazet 已提交
5741

5742
	head = &napi->gro_hash[hash & (GRO_HASH_BUCKETS - 1)].list;
5743
	list_for_each_entry(p, head, list) {
E
Eric Dumazet 已提交
5744 5745
		unsigned long diffs;

5746 5747 5748 5749 5750 5751 5752
		NAPI_GRO_CB(p)->flush = 0;

		if (hash != skb_get_hash_raw(p)) {
			NAPI_GRO_CB(p)->same_flow = 0;
			continue;
		}

E
Eric Dumazet 已提交
5753
		diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
5754 5755
		diffs |= skb_vlan_tag_present(p) ^ skb_vlan_tag_present(skb);
		if (skb_vlan_tag_present(p))
5756
			diffs |= skb_vlan_tag_get(p) ^ skb_vlan_tag_get(skb);
5757
		diffs |= skb_metadata_dst_cmp(p, skb);
5758
		diffs |= skb_metadata_differs(p, skb);
E
Eric Dumazet 已提交
5759 5760
		if (maclen == ETH_HLEN)
			diffs |= compare_ether_header(skb_mac_header(p),
5761
						      skb_mac_header(skb));
E
Eric Dumazet 已提交
5762 5763
		else if (!diffs)
			diffs = memcmp(skb_mac_header(p),
5764
				       skb_mac_header(skb),
E
Eric Dumazet 已提交
5765 5766 5767
				       maclen);
		NAPI_GRO_CB(p)->same_flow = !diffs;
	}
5768 5769

	return head;
E
Eric Dumazet 已提交
5770 5771
}

5772 5773 5774 5775 5776 5777 5778 5779 5780
static void skb_gro_reset_offset(struct sk_buff *skb)
{
	const struct skb_shared_info *pinfo = skb_shinfo(skb);
	const skb_frag_t *frag0 = &pinfo->frags[0];

	NAPI_GRO_CB(skb)->data_offset = 0;
	NAPI_GRO_CB(skb)->frag0 = NULL;
	NAPI_GRO_CB(skb)->frag0_len = 0;

5781
	if (!skb_headlen(skb) && pinfo->nr_frags &&
5782 5783
	    !PageHighMem(skb_frag_page(frag0))) {
		NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
5784 5785 5786
		NAPI_GRO_CB(skb)->frag0_len = min_t(unsigned int,
						    skb_frag_size(frag0),
						    skb->end - skb->tail);
E
Eric Dumazet 已提交
5787 5788 5789
	}
}

5790 5791 5792 5793 5794 5795 5796 5797 5798 5799 5800
static void gro_pull_from_frag0(struct sk_buff *skb, int grow)
{
	struct skb_shared_info *pinfo = skb_shinfo(skb);

	BUG_ON(skb->end - skb->tail < grow);

	memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);

	skb->data_len -= grow;
	skb->tail += grow;

J
Jonathan Lemon 已提交
5801
	skb_frag_off_add(&pinfo->frags[0], grow);
5802 5803 5804 5805 5806 5807 5808 5809 5810
	skb_frag_size_sub(&pinfo->frags[0], grow);

	if (unlikely(!skb_frag_size(&pinfo->frags[0]))) {
		skb_frag_unref(skb, 0);
		memmove(pinfo->frags, pinfo->frags + 1,
			--pinfo->nr_frags * sizeof(pinfo->frags[0]));
	}
}

5811
static void gro_flush_oldest(struct napi_struct *napi, struct list_head *head)
5812
{
5813
	struct sk_buff *oldest;
5814

5815
	oldest = list_last_entry(head, struct sk_buff, list);
5816

5817
	/* We are called with head length >= MAX_GRO_SKBS, so this is
5818 5819 5820 5821 5822
	 * impossible.
	 */
	if (WARN_ON_ONCE(!oldest))
		return;

L
Li RongQing 已提交
5823 5824
	/* Do not adjust napi->gro_hash[].count, caller is adding a new
	 * SKB to the chain.
5825
	 */
5826
	skb_list_del_init(oldest);
5827
	napi_gro_complete(napi, oldest);
5828 5829
}

5830 5831 5832 5833
INDIRECT_CALLABLE_DECLARE(struct sk_buff *inet_gro_receive(struct list_head *,
							   struct sk_buff *));
INDIRECT_CALLABLE_DECLARE(struct sk_buff *ipv6_gro_receive(struct list_head *,
							   struct sk_buff *));
R
Rami Rosen 已提交
5834
static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
5835
{
5836
	u32 hash = skb_get_hash_raw(skb) & (GRO_HASH_BUCKETS - 1);
5837
	struct list_head *head = &offload_base;
5838
	struct packet_offload *ptype;
5839
	__be16 type = skb->protocol;
5840
	struct list_head *gro_head;
5841
	struct sk_buff *pp = NULL;
5842
	enum gro_result ret;
5843
	int same_flow;
5844
	int grow;
5845

D
David S. Miller 已提交
5846
	if (netif_elide_gro(skb->dev))
5847 5848
		goto normal;

5849
	gro_head = gro_list_prepare(napi, skb);
E
Eric Dumazet 已提交
5850

5851 5852
	rcu_read_lock();
	list_for_each_entry_rcu(ptype, head, list) {
5853
		if (ptype->type != type || !ptype->callbacks.gro_receive)
5854 5855
			continue;

5856
		skb_set_network_header(skb, skb_gro_offset(skb));
5857
		skb_reset_mac_len(skb);
5858
		NAPI_GRO_CB(skb)->same_flow = 0;
E
Eric Dumazet 已提交
5859
		NAPI_GRO_CB(skb)->flush = skb_is_gso(skb) || skb_has_frag_list(skb);
H
Herbert Xu 已提交
5860
		NAPI_GRO_CB(skb)->free = 0;
5861
		NAPI_GRO_CB(skb)->encap_mark = 0;
S
Sabrina Dubroca 已提交
5862
		NAPI_GRO_CB(skb)->recursion_counter = 0;
5863
		NAPI_GRO_CB(skb)->is_fou = 0;
5864
		NAPI_GRO_CB(skb)->is_atomic = 1;
5865
		NAPI_GRO_CB(skb)->gro_remcsum_start = 0;
5866

5867 5868 5869 5870 5871 5872 5873 5874 5875 5876 5877 5878 5879 5880 5881
		/* Setup for GRO checksum validation */
		switch (skb->ip_summed) {
		case CHECKSUM_COMPLETE:
			NAPI_GRO_CB(skb)->csum = skb->csum;
			NAPI_GRO_CB(skb)->csum_valid = 1;
			NAPI_GRO_CB(skb)->csum_cnt = 0;
			break;
		case CHECKSUM_UNNECESSARY:
			NAPI_GRO_CB(skb)->csum_cnt = skb->csum_level + 1;
			NAPI_GRO_CB(skb)->csum_valid = 0;
			break;
		default:
			NAPI_GRO_CB(skb)->csum_cnt = 0;
			NAPI_GRO_CB(skb)->csum_valid = 0;
		}
5882

5883 5884 5885
		pp = INDIRECT_CALL_INET(ptype->callbacks.gro_receive,
					ipv6_gro_receive, inet_gro_receive,
					gro_head, skb);
5886 5887 5888 5889 5890 5891 5892
		break;
	}
	rcu_read_unlock();

	if (&ptype->list == head)
		goto normal;

5893
	if (PTR_ERR(pp) == -EINPROGRESS) {
5894 5895 5896 5897
		ret = GRO_CONSUMED;
		goto ok;
	}

H
Herbert Xu 已提交
5898
	same_flow = NAPI_GRO_CB(skb)->same_flow;
5899
	ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
H
Herbert Xu 已提交
5900

5901
	if (pp) {
5902
		skb_list_del_init(pp);
5903
		napi_gro_complete(napi, pp);
5904
		napi->gro_hash[hash].count--;
5905 5906
	}

H
Herbert Xu 已提交
5907
	if (same_flow)
5908 5909
		goto ok;

5910
	if (NAPI_GRO_CB(skb)->flush)
5911 5912
		goto normal;

5913
	if (unlikely(napi->gro_hash[hash].count >= MAX_GRO_SKBS)) {
5914
		gro_flush_oldest(napi, gro_head);
5915
	} else {
5916
		napi->gro_hash[hash].count++;
5917
	}
5918
	NAPI_GRO_CB(skb)->count = 1;
5919
	NAPI_GRO_CB(skb)->age = jiffies;
5920
	NAPI_GRO_CB(skb)->last = skb;
5921
	skb_shinfo(skb)->gso_size = skb_gro_len(skb);
5922
	list_add(&skb->list, gro_head);
5923
	ret = GRO_HELD;
5924

5925
pull:
5926 5927 5928
	grow = skb_gro_offset(skb) - skb_headlen(skb);
	if (grow > 0)
		gro_pull_from_frag0(skb, grow);
5929
ok:
L
Li RongQing 已提交
5930 5931 5932 5933 5934 5935 5936
	if (napi->gro_hash[hash].count) {
		if (!test_bit(hash, &napi->gro_bitmask))
			__set_bit(hash, &napi->gro_bitmask);
	} else if (test_bit(hash, &napi->gro_bitmask)) {
		__clear_bit(hash, &napi->gro_bitmask);
	}

5937
	return ret;
5938 5939

normal:
5940 5941
	ret = GRO_NORMAL;
	goto pull;
H
Herbert Xu 已提交
5942
}
5943

5944 5945 5946 5947 5948 5949 5950 5951 5952 5953 5954 5955
struct packet_offload *gro_find_receive_by_type(__be16 type)
{
	struct list_head *offload_head = &offload_base;
	struct packet_offload *ptype;

	list_for_each_entry_rcu(ptype, offload_head, list) {
		if (ptype->type != type || !ptype->callbacks.gro_receive)
			continue;
		return ptype;
	}
	return NULL;
}
5956
EXPORT_SYMBOL(gro_find_receive_by_type);
5957 5958 5959 5960 5961 5962 5963 5964 5965 5966 5967 5968 5969

struct packet_offload *gro_find_complete_by_type(__be16 type)
{
	struct list_head *offload_head = &offload_base;
	struct packet_offload *ptype;

	list_for_each_entry_rcu(ptype, offload_head, list) {
		if (ptype->type != type || !ptype->callbacks.gro_complete)
			continue;
		return ptype;
	}
	return NULL;
}
5970
EXPORT_SYMBOL(gro_find_complete_by_type);
H
Herbert Xu 已提交
5971

5972 5973 5974
static void napi_skb_free_stolen_head(struct sk_buff *skb)
{
	skb_dst_drop(skb);
5975
	skb_ext_put(skb);
5976 5977 5978
	kmem_cache_free(skbuff_head_cache, skb);
}

5979 5980 5981
static gro_result_t napi_skb_finish(struct napi_struct *napi,
				    struct sk_buff *skb,
				    gro_result_t ret)
H
Herbert Xu 已提交
5982
{
5983 5984
	switch (ret) {
	case GRO_NORMAL:
5985
		gro_normal_one(napi, skb);
5986
		break;
H
Herbert Xu 已提交
5987

5988
	case GRO_DROP:
H
Herbert Xu 已提交
5989 5990
		kfree_skb(skb);
		break;
5991

5992
	case GRO_MERGED_FREE:
5993 5994 5995
		if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
			napi_skb_free_stolen_head(skb);
		else
5996
			__kfree_skb(skb);
5997 5998
		break;

5999 6000
	case GRO_HELD:
	case GRO_MERGED:
6001
	case GRO_CONSUMED:
6002
		break;
H
Herbert Xu 已提交
6003 6004
	}

6005
	return ret;
6006 6007
}

6008
gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
6009
{
6010 6011
	gro_result_t ret;

6012
	skb_mark_napi_id(skb, napi);
6013
	trace_napi_gro_receive_entry(skb);
6014

6015 6016
	skb_gro_reset_offset(skb);

6017
	ret = napi_skb_finish(napi, skb, dev_gro_receive(napi, skb));
6018 6019 6020
	trace_napi_gro_receive_exit(ret);

	return ret;
6021 6022 6023
}
EXPORT_SYMBOL(napi_gro_receive);

S
stephen hemminger 已提交
6024
static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
6025
{
6026 6027 6028 6029
	if (unlikely(skb->pfmemalloc)) {
		consume_skb(skb);
		return;
	}
6030
	__skb_pull(skb, skb_headlen(skb));
6031 6032
	/* restore the reserve we had after netdev_alloc_skb_ip_align() */
	skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
6033
	__vlan_hwaccel_clear_tag(skb);
H
Herbert Xu 已提交
6034
	skb->dev = napi->dev;
A
Andy Gospodarek 已提交
6035
	skb->skb_iif = 0;
6036 6037 6038 6039

	/* eth_type_trans() assumes pkt_type is PACKET_HOST */
	skb->pkt_type = PACKET_HOST;

6040 6041
	skb->encapsulation = 0;
	skb_shinfo(skb)->gso_type = 0;
6042
	skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
6043
	skb_ext_reset(skb);
6044 6045 6046 6047

	napi->skb = skb;
}

6048
struct sk_buff *napi_get_frags(struct napi_struct *napi)
H
Herbert Xu 已提交
6049 6050 6051 6052
{
	struct sk_buff *skb = napi->skb;

	if (!skb) {
6053
		skb = napi_alloc_skb(napi, GRO_MAX_HEAD);
6054 6055 6056 6057
		if (skb) {
			napi->skb = skb;
			skb_mark_napi_id(skb, napi);
		}
6058
	}
6059 6060
	return skb;
}
6061
EXPORT_SYMBOL(napi_get_frags);
6062

6063 6064 6065
static gro_result_t napi_frags_finish(struct napi_struct *napi,
				      struct sk_buff *skb,
				      gro_result_t ret)
6066
{
6067 6068
	switch (ret) {
	case GRO_NORMAL:
6069 6070 6071
	case GRO_HELD:
		__skb_push(skb, ETH_HLEN);
		skb->protocol = eth_type_trans(skb, skb->dev);
6072 6073
		if (ret == GRO_NORMAL)
			gro_normal_one(napi, skb);
6074
		break;
H
Herbert Xu 已提交
6075

6076 6077 6078
	case GRO_DROP:
		napi_reuse_skb(napi, skb);
		break;
6079

6080 6081 6082 6083 6084 6085 6086
	case GRO_MERGED_FREE:
		if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
			napi_skb_free_stolen_head(skb);
		else
			napi_reuse_skb(napi, skb);
		break;

6087
	case GRO_MERGED:
6088
	case GRO_CONSUMED:
6089
		break;
6090
	}
H
Herbert Xu 已提交
6091

6092
	return ret;
H
Herbert Xu 已提交
6093
}
6094

6095 6096 6097 6098
/* Upper GRO stack assumes network header starts at gro_offset=0
 * Drivers could call both napi_gro_frags() and napi_gro_receive()
 * We copy ethernet header into skb->data to have a common layout.
 */
E
Eric Dumazet 已提交
6099
static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
6100 6101
{
	struct sk_buff *skb = napi->skb;
6102 6103
	const struct ethhdr *eth;
	unsigned int hlen = sizeof(*eth);
6104 6105 6106

	napi->skb = NULL;

6107 6108 6109 6110 6111 6112
	skb_reset_mac_header(skb);
	skb_gro_reset_offset(skb);

	if (unlikely(skb_gro_header_hard(skb, hlen))) {
		eth = skb_gro_header_slow(skb, hlen, 0);
		if (unlikely(!eth)) {
6113 6114
			net_warn_ratelimited("%s: dropping impossible skb from %s\n",
					     __func__, napi->dev->name);
6115 6116 6117 6118
			napi_reuse_skb(napi, skb);
			return NULL;
		}
	} else {
6119
		eth = (const struct ethhdr *)skb->data;
6120 6121 6122
		gro_pull_from_frag0(skb, hlen);
		NAPI_GRO_CB(skb)->frag0 += hlen;
		NAPI_GRO_CB(skb)->frag0_len -= hlen;
6123
	}
6124 6125 6126 6127 6128 6129 6130 6131
	__skb_pull(skb, hlen);

	/*
	 * This works because the only protocols we care about don't require
	 * special handling.
	 * We'll fix it up properly in napi_frags_finish()
	 */
	skb->protocol = eth->h_proto;
6132 6133 6134 6135

	return skb;
}

6136
gro_result_t napi_gro_frags(struct napi_struct *napi)
6137
{
6138
	gro_result_t ret;
6139
	struct sk_buff *skb = napi_frags_skb(napi);
6140 6141

	if (!skb)
6142
		return GRO_DROP;
6143

6144 6145
	trace_napi_gro_frags_entry(skb);

6146 6147 6148 6149
	ret = napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
	trace_napi_gro_frags_exit(ret);

	return ret;
6150
}
H
Herbert Xu 已提交
6151 6152
EXPORT_SYMBOL(napi_gro_frags);

T
Tom Herbert 已提交
6153 6154 6155 6156 6157 6158 6159 6160 6161 6162 6163 6164
/* Compute the checksum from gro_offset and return the folded value
 * after adding in any pseudo checksum.
 */
__sum16 __skb_gro_checksum_complete(struct sk_buff *skb)
{
	__wsum wsum;
	__sum16 sum;

	wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb), 0);

	/* NAPI_GRO_CB(skb)->csum holds pseudo checksum */
	sum = csum_fold(csum_add(NAPI_GRO_CB(skb)->csum, wsum));
6165
	/* See comments in __skb_checksum_complete(). */
T
Tom Herbert 已提交
6166 6167 6168
	if (likely(!sum)) {
		if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
		    !skb->csum_complete_sw)
6169
			netdev_rx_csum_fault(skb->dev, skb);
T
Tom Herbert 已提交
6170 6171 6172 6173 6174 6175 6176 6177 6178
	}

	NAPI_GRO_CB(skb)->csum = wsum;
	NAPI_GRO_CB(skb)->csum_valid = 1;

	return sum;
}
EXPORT_SYMBOL(__skb_gro_checksum_complete);

6179 6180 6181 6182 6183 6184 6185 6186 6187 6188 6189 6190 6191
static void net_rps_send_ipi(struct softnet_data *remsd)
{
#ifdef CONFIG_RPS
	while (remsd) {
		struct softnet_data *next = remsd->rps_ipi_next;

		if (cpu_online(remsd->cpu))
			smp_call_function_single_async(remsd->cpu, &remsd->csd);
		remsd = next;
	}
#endif
}

6192
/*
6193
 * net_rps_action_and_irq_enable sends any pending IPI's for rps.
6194 6195 6196 6197 6198 6199 6200 6201 6202 6203 6204 6205 6206
 * Note: called with local irq disabled, but exits with local irq enabled.
 */
static void net_rps_action_and_irq_enable(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	struct softnet_data *remsd = sd->rps_ipi_list;

	if (remsd) {
		sd->rps_ipi_list = NULL;

		local_irq_enable();

		/* Send pending IPI's to kick RPS processing on remote cpus. */
6207
		net_rps_send_ipi(remsd);
6208 6209 6210 6211 6212
	} else
#endif
		local_irq_enable();
}

E
Eric Dumazet 已提交
6213 6214 6215 6216 6217 6218 6219 6220 6221
static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	return sd->rps_ipi_list != NULL;
#else
	return false;
#endif
}

6222
static int process_backlog(struct napi_struct *napi, int quota)
L
Linus Torvalds 已提交
6223
{
E
Eric Dumazet 已提交
6224
	struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
6225 6226
	bool again = true;
	int work = 0;
L
Linus Torvalds 已提交
6227

6228 6229 6230
	/* Check if we have pending ipi, its better to send them now,
	 * not waiting net_rx_action() end.
	 */
E
Eric Dumazet 已提交
6231
	if (sd_has_rps_ipi_waiting(sd)) {
6232 6233 6234
		local_irq_disable();
		net_rps_action_and_irq_enable(sd);
	}
E
Eric Dumazet 已提交
6235

6236
	napi->weight = dev_rx_weight;
6237
	while (again) {
L
Linus Torvalds 已提交
6238
		struct sk_buff *skb;
6239 6240

		while ((skb = __skb_dequeue(&sd->process_queue))) {
6241
			rcu_read_lock();
6242
			__netif_receive_skb(skb);
6243
			rcu_read_unlock();
6244
			input_queue_head_incr(sd);
6245
			if (++work >= quota)
6246
				return work;
6247

6248
		}
L
Linus Torvalds 已提交
6249

6250
		local_irq_disable();
E
Eric Dumazet 已提交
6251
		rps_lock(sd);
6252
		if (skb_queue_empty(&sd->input_pkt_queue)) {
E
Eric Dumazet 已提交
6253 6254 6255
			/*
			 * Inline a custom version of __napi_complete().
			 * only current cpu owns and manipulates this napi,
6256 6257 6258
			 * and NAPI_STATE_SCHED is the only possible flag set
			 * on backlog.
			 * We can use a plain write instead of clear_bit(),
E
Eric Dumazet 已提交
6259 6260 6261
			 * and we dont need an smp_mb() memory barrier.
			 */
			napi->state = 0;
6262 6263 6264 6265
			again = false;
		} else {
			skb_queue_splice_tail_init(&sd->input_pkt_queue,
						   &sd->process_queue);
6266
		}
E
Eric Dumazet 已提交
6267
		rps_unlock(sd);
6268
		local_irq_enable();
6269
	}
L
Linus Torvalds 已提交
6270

6271 6272
	return work;
}
L
Linus Torvalds 已提交
6273

6274 6275
/**
 * __napi_schedule - schedule for receive
6276
 * @n: entry to schedule
6277
 *
6278 6279
 * The entry's receive function will be scheduled to run.
 * Consider using __napi_schedule_irqoff() if hard irqs are masked.
6280
 */
H
Harvey Harrison 已提交
6281
void __napi_schedule(struct napi_struct *n)
6282 6283
{
	unsigned long flags;
L
Linus Torvalds 已提交
6284

6285
	local_irq_save(flags);
6286
	____napi_schedule(this_cpu_ptr(&softnet_data), n);
6287
	local_irq_restore(flags);
L
Linus Torvalds 已提交
6288
}
6289 6290
EXPORT_SYMBOL(__napi_schedule);

E
Eric Dumazet 已提交
6291 6292 6293 6294 6295 6296 6297 6298 6299 6300 6301 6302 6303 6304 6305 6306 6307 6308 6309 6310 6311 6312 6313 6314 6315 6316 6317 6318 6319 6320 6321 6322 6323
/**
 *	napi_schedule_prep - check if napi can be scheduled
 *	@n: napi context
 *
 * Test if NAPI routine is already running, and if not mark
 * it as running.  This is used as a condition variable
 * insure only one NAPI poll instance runs.  We also make
 * sure there is no pending NAPI disable.
 */
bool napi_schedule_prep(struct napi_struct *n)
{
	unsigned long val, new;

	do {
		val = READ_ONCE(n->state);
		if (unlikely(val & NAPIF_STATE_DISABLE))
			return false;
		new = val | NAPIF_STATE_SCHED;

		/* Sets STATE_MISSED bit if STATE_SCHED was already set
		 * This was suggested by Alexander Duyck, as compiler
		 * emits better code than :
		 * if (val & NAPIF_STATE_SCHED)
		 *     new |= NAPIF_STATE_MISSED;
		 */
		new |= (val & NAPIF_STATE_SCHED) / NAPIF_STATE_SCHED *
						   NAPIF_STATE_MISSED;
	} while (cmpxchg(&n->state, val, new) != val);

	return !(val & NAPIF_STATE_SCHED);
}
EXPORT_SYMBOL(napi_schedule_prep);

6324 6325 6326 6327 6328 6329 6330 6331 6332 6333 6334 6335
/**
 * __napi_schedule_irqoff - schedule for receive
 * @n: entry to schedule
 *
 * Variant of __napi_schedule() assuming hard irqs are masked
 */
void __napi_schedule_irqoff(struct napi_struct *n)
{
	____napi_schedule(this_cpu_ptr(&softnet_data), n);
}
EXPORT_SYMBOL(__napi_schedule_irqoff);

6336
bool napi_complete_done(struct napi_struct *n, int work_done)
6337
{
6338 6339
	unsigned long flags, val, new, timeout = 0;
	bool ret = true;
6340 6341

	/*
6342 6343 6344 6345
	 * 1) Don't let napi dequeue from the cpu poll list
	 *    just in case its running on a different cpu.
	 * 2) If we are busy polling, do nothing here, we have
	 *    the guarantee we will be called later.
6346
	 */
6347 6348
	if (unlikely(n->state & (NAPIF_STATE_NPSVC |
				 NAPIF_STATE_IN_BUSY_POLL)))
6349
		return false;
6350

6351 6352
	if (work_done) {
		if (n->gro_bitmask)
6353 6354
			timeout = READ_ONCE(n->dev->gro_flush_timeout);
		n->defer_hard_irqs_count = READ_ONCE(n->dev->napi_defer_hard_irqs);
6355 6356 6357
	}
	if (n->defer_hard_irqs_count > 0) {
		n->defer_hard_irqs_count--;
6358
		timeout = READ_ONCE(n->dev->gro_flush_timeout);
6359 6360 6361 6362
		if (timeout)
			ret = false;
	}
	if (n->gro_bitmask) {
6363 6364 6365 6366 6367
		/* When the NAPI instance uses a timeout and keeps postponing
		 * it, we need to bound somehow the time packets are kept in
		 * the GRO layer
		 */
		napi_gro_flush(n, !!timeout);
6368
	}
6369 6370 6371

	gro_normal_list(n);

E
Eric Dumazet 已提交
6372
	if (unlikely(!list_empty(&n->poll_list))) {
E
Eric Dumazet 已提交
6373 6374
		/* If n->poll_list is not empty, we need to mask irqs */
		local_irq_save(flags);
E
Eric Dumazet 已提交
6375
		list_del_init(&n->poll_list);
E
Eric Dumazet 已提交
6376 6377
		local_irq_restore(flags);
	}
E
Eric Dumazet 已提交
6378 6379 6380 6381 6382 6383 6384 6385 6386 6387 6388 6389 6390 6391 6392 6393 6394 6395 6396 6397 6398

	do {
		val = READ_ONCE(n->state);

		WARN_ON_ONCE(!(val & NAPIF_STATE_SCHED));

		new = val & ~(NAPIF_STATE_MISSED | NAPIF_STATE_SCHED);

		/* If STATE_MISSED was set, leave STATE_SCHED set,
		 * because we will call napi->poll() one more time.
		 * This C code was suggested by Alexander Duyck to help gcc.
		 */
		new |= (val & NAPIF_STATE_MISSED) / NAPIF_STATE_MISSED *
						    NAPIF_STATE_SCHED;
	} while (cmpxchg(&n->state, val, new) != val);

	if (unlikely(val & NAPIF_STATE_MISSED)) {
		__napi_schedule(n);
		return false;
	}

6399 6400 6401 6402
	if (timeout)
		hrtimer_start(&n->timer, ns_to_ktime(timeout),
			      HRTIMER_MODE_REL_PINNED);
	return ret;
6403
}
6404
EXPORT_SYMBOL(napi_complete_done);
6405

E
Eliezer Tamir 已提交
6406
/* must be called under rcu_read_lock(), as we dont take a reference */
E
Eric Dumazet 已提交
6407
static struct napi_struct *napi_by_id(unsigned int napi_id)
E
Eliezer Tamir 已提交
6408 6409 6410 6411 6412 6413 6414 6415 6416 6417
{
	unsigned int hash = napi_id % HASH_SIZE(napi_hash);
	struct napi_struct *napi;

	hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node)
		if (napi->napi_id == napi_id)
			return napi;

	return NULL;
}
E
Eric Dumazet 已提交
6418 6419

#if defined(CONFIG_NET_RX_BUSY_POLL)
6420

6421
#define BUSY_POLL_BUDGET 8
6422 6423 6424 6425 6426

static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock)
{
	int rc;

E
Eric Dumazet 已提交
6427 6428 6429 6430 6431 6432 6433 6434 6435 6436
	/* Busy polling means there is a high chance device driver hard irq
	 * could not grab NAPI_STATE_SCHED, and that NAPI_STATE_MISSED was
	 * set in napi_schedule_prep().
	 * Since we are about to call napi->poll() once more, we can safely
	 * clear NAPI_STATE_MISSED.
	 *
	 * Note: x86 could use a single "lock and ..." instruction
	 * to perform these two clear_bit()
	 */
	clear_bit(NAPI_STATE_MISSED, &napi->state);
6437 6438 6439 6440 6441 6442 6443 6444
	clear_bit(NAPI_STATE_IN_BUSY_POLL, &napi->state);

	local_bh_disable();

	/* All we really want here is to re-enable device interrupts.
	 * Ideally, a new ndo_busy_poll_stop() could avoid another round.
	 */
	rc = napi->poll(napi, BUSY_POLL_BUDGET);
6445 6446 6447 6448
	/* We can't gro_normal_list() here, because napi->poll() might have
	 * rearmed the napi (napi_complete_done()) in which case it could
	 * already be running on another CPU.
	 */
6449
	trace_napi_poll(napi, rc, BUSY_POLL_BUDGET);
6450
	netpoll_poll_unlock(have_poll_lock);
6451 6452 6453 6454 6455
	if (rc == BUSY_POLL_BUDGET) {
		/* As the whole budget was spent, we still own the napi so can
		 * safely handle the rx_list.
		 */
		gro_normal_list(napi);
6456
		__napi_schedule(napi);
6457
	}
6458 6459 6460
	local_bh_enable();
}

6461 6462 6463
void napi_busy_loop(unsigned int napi_id,
		    bool (*loop_end)(void *, unsigned long),
		    void *loop_end_arg)
E
Eric Dumazet 已提交
6464
{
6465
	unsigned long start_time = loop_end ? busy_loop_current_time() : 0;
6466 6467
	int (*napi_poll)(struct napi_struct *napi, int budget);
	void *have_poll_lock = NULL;
E
Eric Dumazet 已提交
6468
	struct napi_struct *napi;
6469 6470 6471

restart:
	napi_poll = NULL;
E
Eric Dumazet 已提交
6472

6473
	rcu_read_lock();
E
Eric Dumazet 已提交
6474

6475
	napi = napi_by_id(napi_id);
E
Eric Dumazet 已提交
6476 6477 6478
	if (!napi)
		goto out;

6479 6480
	preempt_disable();
	for (;;) {
6481 6482
		int work = 0;

6483
		local_bh_disable();
6484 6485 6486 6487 6488 6489 6490 6491 6492 6493 6494 6495 6496 6497 6498 6499
		if (!napi_poll) {
			unsigned long val = READ_ONCE(napi->state);

			/* If multiple threads are competing for this napi,
			 * we avoid dirtying napi->state as much as we can.
			 */
			if (val & (NAPIF_STATE_DISABLE | NAPIF_STATE_SCHED |
				   NAPIF_STATE_IN_BUSY_POLL))
				goto count;
			if (cmpxchg(&napi->state, val,
				    val | NAPIF_STATE_IN_BUSY_POLL |
					  NAPIF_STATE_SCHED) != val)
				goto count;
			have_poll_lock = netpoll_poll_lock(napi);
			napi_poll = napi->poll;
		}
6500 6501
		work = napi_poll(napi, BUSY_POLL_BUDGET);
		trace_napi_poll(napi, work, BUSY_POLL_BUDGET);
6502
		gro_normal_list(napi);
6503
count:
6504
		if (work > 0)
6505
			__NET_ADD_STATS(dev_net(napi->dev),
6506
					LINUX_MIB_BUSYPOLLRXPACKETS, work);
6507
		local_bh_enable();
E
Eric Dumazet 已提交
6508

6509
		if (!loop_end || loop_end(loop_end_arg, start_time))
6510
			break;
E
Eric Dumazet 已提交
6511

6512 6513 6514 6515 6516 6517
		if (unlikely(need_resched())) {
			if (napi_poll)
				busy_poll_stop(napi, have_poll_lock);
			preempt_enable();
			rcu_read_unlock();
			cond_resched();
6518
			if (loop_end(loop_end_arg, start_time))
6519
				return;
6520 6521
			goto restart;
		}
6522
		cpu_relax();
6523 6524 6525 6526
	}
	if (napi_poll)
		busy_poll_stop(napi, have_poll_lock);
	preempt_enable();
E
Eric Dumazet 已提交
6527
out:
6528
	rcu_read_unlock();
E
Eric Dumazet 已提交
6529
}
6530
EXPORT_SYMBOL(napi_busy_loop);
E
Eric Dumazet 已提交
6531 6532

#endif /* CONFIG_NET_RX_BUSY_POLL */
E
Eliezer Tamir 已提交
6533

6534
static void napi_hash_add(struct napi_struct *napi)
E
Eliezer Tamir 已提交
6535
{
E
Eric Dumazet 已提交
6536 6537
	if (test_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state) ||
	    test_and_set_bit(NAPI_STATE_HASHED, &napi->state))
6538
		return;
E
Eliezer Tamir 已提交
6539

6540
	spin_lock(&napi_hash_lock);
E
Eliezer Tamir 已提交
6541

6542
	/* 0..NR_CPUS range is reserved for sender_cpu use */
6543
	do {
6544 6545
		if (unlikely(++napi_gen_id < MIN_NAPI_ID))
			napi_gen_id = MIN_NAPI_ID;
6546 6547
	} while (napi_by_id(napi_gen_id));
	napi->napi_id = napi_gen_id;
E
Eliezer Tamir 已提交
6548

6549 6550
	hlist_add_head_rcu(&napi->napi_hash_node,
			   &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
E
Eliezer Tamir 已提交
6551

6552
	spin_unlock(&napi_hash_lock);
E
Eliezer Tamir 已提交
6553 6554 6555 6556 6557
}

/* Warning : caller is responsible to make sure rcu grace period
 * is respected before freeing memory containing @napi
 */
6558
bool napi_hash_del(struct napi_struct *napi)
E
Eliezer Tamir 已提交
6559
{
6560 6561
	bool rcu_sync_needed = false;

E
Eliezer Tamir 已提交
6562 6563
	spin_lock(&napi_hash_lock);

6564 6565
	if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state)) {
		rcu_sync_needed = true;
E
Eliezer Tamir 已提交
6566
		hlist_del_rcu(&napi->napi_hash_node);
6567
	}
E
Eliezer Tamir 已提交
6568
	spin_unlock(&napi_hash_lock);
6569
	return rcu_sync_needed;
E
Eliezer Tamir 已提交
6570 6571 6572
}
EXPORT_SYMBOL_GPL(napi_hash_del);

6573 6574 6575 6576 6577
static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
{
	struct napi_struct *napi;

	napi = container_of(timer, struct napi_struct, timer);
E
Eric Dumazet 已提交
6578 6579 6580 6581

	/* Note : we use a relaxed variant of napi_schedule_prep() not setting
	 * NAPI_STATE_MISSED, since we do not react to a device IRQ.
	 */
6582
	if (!napi_disable_pending(napi) &&
E
Eric Dumazet 已提交
6583 6584
	    !test_and_set_bit(NAPI_STATE_SCHED, &napi->state))
		__napi_schedule_irqoff(napi);
6585 6586 6587 6588

	return HRTIMER_NORESTART;
}

6589
static void init_gro_hash(struct napi_struct *napi)
6590
{
6591 6592
	int i;

6593 6594 6595 6596
	for (i = 0; i < GRO_HASH_BUCKETS; i++) {
		INIT_LIST_HEAD(&napi->gro_hash[i].list);
		napi->gro_hash[i].count = 0;
	}
6597 6598 6599 6600 6601 6602 6603 6604 6605 6606
	napi->gro_bitmask = 0;
}

void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
		    int (*poll)(struct napi_struct *, int), int weight)
{
	INIT_LIST_HEAD(&napi->poll_list);
	hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
	napi->timer.function = napi_watchdog;
	init_gro_hash(napi);
H
Herbert Xu 已提交
6607
	napi->skb = NULL;
6608 6609
	INIT_LIST_HEAD(&napi->rx_list);
	napi->rx_count = 0;
6610
	napi->poll = poll;
E
Eric Dumazet 已提交
6611
	if (weight > NAPI_POLL_WEIGHT)
Q
Qian Cai 已提交
6612 6613
		netdev_err_once(dev, "%s() called with weight %d\n", __func__,
				weight);
6614 6615 6616
	napi->weight = weight;
	list_add(&napi->dev_list, &dev->napi_list);
	napi->dev = dev;
H
Herbert Xu 已提交
6617
#ifdef CONFIG_NETPOLL
6618 6619 6620
	napi->poll_owner = -1;
#endif
	set_bit(NAPI_STATE_SCHED, &napi->state);
6621
	napi_hash_add(napi);
6622 6623 6624
}
EXPORT_SYMBOL(netif_napi_add);

6625 6626 6627 6628 6629 6630 6631
void napi_disable(struct napi_struct *n)
{
	might_sleep();
	set_bit(NAPI_STATE_DISABLE, &n->state);

	while (test_and_set_bit(NAPI_STATE_SCHED, &n->state))
		msleep(1);
6632 6633
	while (test_and_set_bit(NAPI_STATE_NPSVC, &n->state))
		msleep(1);
6634 6635 6636 6637 6638 6639 6640

	hrtimer_cancel(&n->timer);

	clear_bit(NAPI_STATE_DISABLE, &n->state);
}
EXPORT_SYMBOL(napi_disable);

6641
static void flush_gro_hash(struct napi_struct *napi)
6642
{
6643
	int i;
6644

6645 6646 6647
	for (i = 0; i < GRO_HASH_BUCKETS; i++) {
		struct sk_buff *skb, *n;

6648
		list_for_each_entry_safe(skb, n, &napi->gro_hash[i].list, list)
6649
			kfree_skb(skb);
6650
		napi->gro_hash[i].count = 0;
6651
	}
6652 6653
}

6654
/* Must be called in process context */
6655 6656
void netif_napi_del(struct napi_struct *napi)
{
6657 6658 6659
	might_sleep();
	if (napi_hash_del(napi))
		synchronize_net();
6660
	list_del_init(&napi->dev_list);
6661
	napi_free_frags(napi);
6662

6663
	flush_gro_hash(napi);
L
Li RongQing 已提交
6664
	napi->gro_bitmask = 0;
6665 6666 6667
}
EXPORT_SYMBOL(netif_napi_del);

6668 6669 6670 6671 6672 6673 6674 6675 6676 6677 6678 6679 6680 6681 6682 6683 6684 6685 6686 6687
static int napi_poll(struct napi_struct *n, struct list_head *repoll)
{
	void *have;
	int work, weight;

	list_del_init(&n->poll_list);

	have = netpoll_poll_lock(n);

	weight = n->weight;

	/* This NAPI_STATE_SCHED test is for avoiding a race
	 * with netpoll's poll_napi().  Only the entity which
	 * obtains the lock and sees NAPI_STATE_SCHED set will
	 * actually make the ->poll() call.  Therefore we avoid
	 * accidentally calling ->poll() when NAPI is not scheduled.
	 */
	work = 0;
	if (test_bit(NAPI_STATE_SCHED, &n->state)) {
		work = n->poll(n, weight);
6688
		trace_napi_poll(n, work, weight);
6689 6690
	}

6691 6692 6693
	if (unlikely(work > weight))
		pr_err_once("NAPI poll function %pS returned %d, exceeding its budget of %d.\n",
			    n->poll, work, weight);
6694 6695 6696 6697 6698 6699 6700 6701 6702 6703 6704 6705 6706 6707

	if (likely(work < weight))
		goto out_unlock;

	/* Drivers must not modify the NAPI state if they
	 * consume the entire weight.  In such cases this code
	 * still "owns" the NAPI instance and therefore can
	 * move the instance around on the list at-will.
	 */
	if (unlikely(napi_disable_pending(n))) {
		napi_complete(n);
		goto out_unlock;
	}

L
Li RongQing 已提交
6708
	if (n->gro_bitmask) {
6709 6710 6711 6712 6713 6714
		/* flush too old packets
		 * If HZ < 1000, flush all packets.
		 */
		napi_gro_flush(n, HZ >= 1000);
	}

6715 6716
	gro_normal_list(n);

6717 6718 6719 6720 6721 6722 6723 6724 6725
	/* Some drivers may have called napi_schedule
	 * prior to exhausting their budget.
	 */
	if (unlikely(!list_empty(&n->poll_list))) {
		pr_warn_once("%s: Budget exhausted after napi rescheduled\n",
			     n->dev ? n->dev->name : "backlog");
		goto out_unlock;
	}

6726 6727 6728 6729 6730 6731 6732 6733
	list_add_tail(&n->poll_list, repoll);

out_unlock:
	netpoll_poll_unlock(have);

	return work;
}

6734
static __latent_entropy void net_rx_action(struct softirq_action *h)
L
Linus Torvalds 已提交
6735
{
6736
	struct softnet_data *sd = this_cpu_ptr(&softnet_data);
6737 6738
	unsigned long time_limit = jiffies +
		usecs_to_jiffies(netdev_budget_usecs);
6739
	int budget = netdev_budget;
E
Eric Dumazet 已提交
6740 6741
	LIST_HEAD(list);
	LIST_HEAD(repoll);
6742

L
Linus Torvalds 已提交
6743
	local_irq_disable();
E
Eric Dumazet 已提交
6744 6745
	list_splice_init(&sd->poll_list, &list);
	local_irq_enable();
L
Linus Torvalds 已提交
6746

H
Herbert Xu 已提交
6747
	for (;;) {
6748
		struct napi_struct *n;
L
Linus Torvalds 已提交
6749

H
Herbert Xu 已提交
6750 6751
		if (list_empty(&list)) {
			if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll))
6752
				goto out;
H
Herbert Xu 已提交
6753 6754 6755
			break;
		}

6756 6757 6758
		n = list_first_entry(&list, struct napi_struct, poll_list);
		budget -= napi_poll(n, &repoll);

E
Eric Dumazet 已提交
6759
		/* If softirq window is exhausted then punt.
6760 6761
		 * Allow this to run for 2 jiffies since which will allow
		 * an average latency of 1.5/HZ.
6762
		 */
H
Herbert Xu 已提交
6763 6764 6765 6766 6767
		if (unlikely(budget <= 0 ||
			     time_after_eq(jiffies, time_limit))) {
			sd->time_squeeze++;
			break;
		}
L
Linus Torvalds 已提交
6768
	}
E
Eric Dumazet 已提交
6769 6770 6771 6772 6773 6774 6775 6776 6777

	local_irq_disable();

	list_splice_tail_init(&sd->poll_list, &list);
	list_splice_tail(&repoll, &list);
	list_splice(&list, &sd->poll_list);
	if (!list_empty(&sd->poll_list))
		__raise_softirq_irqoff(NET_RX_SOFTIRQ);

6778
	net_rps_action_and_irq_enable(sd);
6779 6780
out:
	__kfree_skb_flush();
L
Linus Torvalds 已提交
6781 6782
}

6783
struct netdev_adjacent {
J
Jiri Pirko 已提交
6784
	struct net_device *dev;
6785 6786

	/* upper master flag, there can only be one master device per list */
J
Jiri Pirko 已提交
6787
	bool master;
6788

6789 6790 6791
	/* lookup ignore flag */
	bool ignore;

6792 6793 6794
	/* counter for the number of times this device was added to us */
	u16 ref_nr;

6795 6796 6797
	/* private field for the users */
	void *private;

J
Jiri Pirko 已提交
6798 6799 6800 6801
	struct list_head list;
	struct rcu_head rcu;
};

6802
static struct netdev_adjacent *__netdev_find_adj(struct net_device *adj_dev,
6803
						 struct list_head *adj_list)
J
Jiri Pirko 已提交
6804
{
6805 6806
	struct netdev_adjacent *adj;

6807
	list_for_each_entry(adj, adj_list, list) {
6808 6809
		if (adj->dev == adj_dev)
			return adj;
J
Jiri Pirko 已提交
6810 6811 6812 6813
	}
	return NULL;
}

6814
static int ____netdev_has_upper_dev(struct net_device *upper_dev, void *data)
6815 6816 6817 6818 6819 6820
{
	struct net_device *dev = data;

	return upper_dev == dev;
}

J
Jiri Pirko 已提交
6821 6822 6823 6824 6825 6826 6827 6828 6829 6830 6831 6832 6833 6834
/**
 * netdev_has_upper_dev - Check if device is linked to an upper device
 * @dev: device
 * @upper_dev: upper device to check
 *
 * Find out if a device is linked to specified upper device and return true
 * in case it is. Note that this checks only immediate upper device,
 * not through a complete stack of devices. The caller must hold the RTNL lock.
 */
bool netdev_has_upper_dev(struct net_device *dev,
			  struct net_device *upper_dev)
{
	ASSERT_RTNL();

6835
	return netdev_walk_all_upper_dev_rcu(dev, ____netdev_has_upper_dev,
6836
					     upper_dev);
J
Jiri Pirko 已提交
6837 6838 6839
}
EXPORT_SYMBOL(netdev_has_upper_dev);

6840 6841 6842 6843 6844 6845 6846 6847 6848 6849 6850 6851 6852
/**
 * netdev_has_upper_dev_all - Check if device is linked to an upper device
 * @dev: device
 * @upper_dev: upper device to check
 *
 * Find out if a device is linked to specified upper device and return true
 * in case it is. Note that this checks the entire upper device chain.
 * The caller must hold rcu lock.
 */

bool netdev_has_upper_dev_all_rcu(struct net_device *dev,
				  struct net_device *upper_dev)
{
6853
	return !!netdev_walk_all_upper_dev_rcu(dev, ____netdev_has_upper_dev,
6854 6855 6856 6857
					       upper_dev);
}
EXPORT_SYMBOL(netdev_has_upper_dev_all_rcu);

J
Jiri Pirko 已提交
6858 6859 6860 6861 6862 6863 6864
/**
 * netdev_has_any_upper_dev - Check if device is linked to some device
 * @dev: device
 *
 * Find out if a device is linked to an upper device and return true in case
 * it is. The caller must hold the RTNL lock.
 */
6865
bool netdev_has_any_upper_dev(struct net_device *dev)
J
Jiri Pirko 已提交
6866 6867 6868
{
	ASSERT_RTNL();

6869
	return !list_empty(&dev->adj_list.upper);
J
Jiri Pirko 已提交
6870
}
6871
EXPORT_SYMBOL(netdev_has_any_upper_dev);
J
Jiri Pirko 已提交
6872 6873 6874 6875 6876 6877 6878 6879 6880 6881

/**
 * netdev_master_upper_dev_get - Get master upper device
 * @dev: device
 *
 * Find a master upper device and return pointer to it or NULL in case
 * it's not there. The caller must hold the RTNL lock.
 */
struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
{
6882
	struct netdev_adjacent *upper;
J
Jiri Pirko 已提交
6883 6884 6885

	ASSERT_RTNL();

6886
	if (list_empty(&dev->adj_list.upper))
J
Jiri Pirko 已提交
6887 6888
		return NULL;

6889
	upper = list_first_entry(&dev->adj_list.upper,
6890
				 struct netdev_adjacent, list);
J
Jiri Pirko 已提交
6891 6892 6893 6894 6895 6896
	if (likely(upper->master))
		return upper->dev;
	return NULL;
}
EXPORT_SYMBOL(netdev_master_upper_dev_get);

6897 6898 6899 6900 6901 6902 6903 6904 6905 6906 6907 6908 6909 6910 6911 6912
static struct net_device *__netdev_master_upper_dev_get(struct net_device *dev)
{
	struct netdev_adjacent *upper;

	ASSERT_RTNL();

	if (list_empty(&dev->adj_list.upper))
		return NULL;

	upper = list_first_entry(&dev->adj_list.upper,
				 struct netdev_adjacent, list);
	if (likely(upper->master) && !upper->ignore)
		return upper->dev;
	return NULL;
}

6913 6914 6915 6916 6917 6918 6919 6920 6921 6922 6923 6924 6925 6926
/**
 * netdev_has_any_lower_dev - Check if device is linked to some device
 * @dev: device
 *
 * Find out if a device is linked to a lower device and return true in case
 * it is. The caller must hold the RTNL lock.
 */
static bool netdev_has_any_lower_dev(struct net_device *dev)
{
	ASSERT_RTNL();

	return !list_empty(&dev->adj_list.lower);
}

6927 6928 6929 6930 6931 6932 6933 6934 6935 6936
void *netdev_adjacent_get_private(struct list_head *adj_list)
{
	struct netdev_adjacent *adj;

	adj = list_entry(adj_list, struct netdev_adjacent, list);

	return adj->private;
}
EXPORT_SYMBOL(netdev_adjacent_get_private);

6937 6938 6939 6940 6941 6942 6943 6944 6945 6946 6947 6948 6949 6950 6951 6952 6953 6954 6955 6956 6957 6958 6959 6960 6961 6962
/**
 * netdev_upper_get_next_dev_rcu - Get the next dev from upper list
 * @dev: device
 * @iter: list_head ** of the current position
 *
 * Gets the next device from the dev's upper list, starting from iter
 * position. The caller must hold RCU read lock.
 */
struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
						 struct list_head **iter)
{
	struct netdev_adjacent *upper;

	WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());

	upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);

	if (&upper->list == &dev->adj_list.upper)
		return NULL;

	*iter = &upper->list;

	return upper->dev;
}
EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu);

6963 6964 6965
static struct net_device *__netdev_next_upper_dev(struct net_device *dev,
						  struct list_head **iter,
						  bool *ignore)
T
Taehee Yoo 已提交
6966 6967 6968 6969 6970 6971 6972 6973 6974
{
	struct netdev_adjacent *upper;

	upper = list_entry((*iter)->next, struct netdev_adjacent, list);

	if (&upper->list == &dev->adj_list.upper)
		return NULL;

	*iter = &upper->list;
6975
	*ignore = upper->ignore;
T
Taehee Yoo 已提交
6976 6977 6978 6979

	return upper->dev;
}

6980 6981 6982 6983 6984 6985 6986 6987 6988 6989 6990 6991 6992 6993 6994 6995 6996
static struct net_device *netdev_next_upper_dev_rcu(struct net_device *dev,
						    struct list_head **iter)
{
	struct netdev_adjacent *upper;

	WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());

	upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);

	if (&upper->list == &dev->adj_list.upper)
		return NULL;

	*iter = &upper->list;

	return upper->dev;
}

6997 6998 6999 7000
static int __netdev_walk_all_upper_dev(struct net_device *dev,
				       int (*fn)(struct net_device *dev,
						 void *data),
				       void *data)
T
Taehee Yoo 已提交
7001 7002 7003 7004
{
	struct net_device *udev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
	struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
	int ret, cur = 0;
7005
	bool ignore;
T
Taehee Yoo 已提交
7006 7007 7008 7009 7010 7011 7012 7013 7014 7015 7016 7017 7018

	now = dev;
	iter = &dev->adj_list.upper;

	while (1) {
		if (now != dev) {
			ret = fn(now, data);
			if (ret)
				return ret;
		}

		next = NULL;
		while (1) {
7019
			udev = __netdev_next_upper_dev(now, &iter, &ignore);
T
Taehee Yoo 已提交
7020 7021
			if (!udev)
				break;
7022 7023
			if (ignore)
				continue;
T
Taehee Yoo 已提交
7024 7025 7026 7027 7028 7029 7030 7031 7032 7033 7034 7035 7036 7037 7038 7039 7040 7041 7042 7043 7044 7045

			next = udev;
			niter = &udev->adj_list.upper;
			dev_stack[cur] = now;
			iter_stack[cur++] = iter;
			break;
		}

		if (!next) {
			if (!cur)
				return 0;
			next = dev_stack[--cur];
			niter = iter_stack[cur];
		}

		now = next;
		iter = niter;
	}

	return 0;
}

7046 7047 7048 7049 7050
int netdev_walk_all_upper_dev_rcu(struct net_device *dev,
				  int (*fn)(struct net_device *dev,
					    void *data),
				  void *data)
{
T
Taehee Yoo 已提交
7051 7052 7053
	struct net_device *udev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
	struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
	int ret, cur = 0;
7054

T
Taehee Yoo 已提交
7055 7056
	now = dev;
	iter = &dev->adj_list.upper;
7057

T
Taehee Yoo 已提交
7058 7059 7060 7061 7062 7063 7064 7065 7066 7067 7068 7069 7070 7071 7072 7073 7074 7075 7076 7077 7078 7079 7080 7081 7082 7083 7084 7085 7086
	while (1) {
		if (now != dev) {
			ret = fn(now, data);
			if (ret)
				return ret;
		}

		next = NULL;
		while (1) {
			udev = netdev_next_upper_dev_rcu(now, &iter);
			if (!udev)
				break;

			next = udev;
			niter = &udev->adj_list.upper;
			dev_stack[cur] = now;
			iter_stack[cur++] = iter;
			break;
		}

		if (!next) {
			if (!cur)
				return 0;
			next = dev_stack[--cur];
			niter = iter_stack[cur];
		}

		now = next;
		iter = niter;
7087 7088 7089 7090 7091 7092
	}

	return 0;
}
EXPORT_SYMBOL_GPL(netdev_walk_all_upper_dev_rcu);

7093 7094 7095 7096 7097 7098 7099 7100 7101
static bool __netdev_has_upper_dev(struct net_device *dev,
				   struct net_device *upper_dev)
{
	ASSERT_RTNL();

	return __netdev_walk_all_upper_dev(dev, ____netdev_has_upper_dev,
					   upper_dev);
}

7102 7103 7104 7105 7106 7107 7108 7109 7110
/**
 * netdev_lower_get_next_private - Get the next ->private from the
 *				   lower neighbour list
 * @dev: device
 * @iter: list_head ** of the current position
 *
 * Gets the next netdev_adjacent->private from the dev's lower neighbour
 * list, starting from iter position. The caller must hold either hold the
 * RTNL lock or its own locking that guarantees that the neighbour lower
7111
 * list will remain unchanged.
7112 7113 7114 7115 7116 7117 7118 7119 7120 7121 7122
 */
void *netdev_lower_get_next_private(struct net_device *dev,
				    struct list_head **iter)
{
	struct netdev_adjacent *lower;

	lower = list_entry(*iter, struct netdev_adjacent, list);

	if (&lower->list == &dev->adj_list.lower)
		return NULL;

7123
	*iter = lower->list.next;
7124 7125 7126 7127 7128 7129 7130 7131 7132 7133 7134 7135 7136 7137 7138 7139 7140 7141 7142 7143 7144 7145 7146 7147 7148 7149 7150

	return lower->private;
}
EXPORT_SYMBOL(netdev_lower_get_next_private);

/**
 * netdev_lower_get_next_private_rcu - Get the next ->private from the
 *				       lower neighbour list, RCU
 *				       variant
 * @dev: device
 * @iter: list_head ** of the current position
 *
 * Gets the next netdev_adjacent->private from the dev's lower neighbour
 * list, starting from iter position. The caller must hold RCU read lock.
 */
void *netdev_lower_get_next_private_rcu(struct net_device *dev,
					struct list_head **iter)
{
	struct netdev_adjacent *lower;

	WARN_ON_ONCE(!rcu_read_lock_held());

	lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);

	if (&lower->list == &dev->adj_list.lower)
		return NULL;

7151
	*iter = &lower->list;
7152 7153 7154 7155 7156

	return lower->private;
}
EXPORT_SYMBOL(netdev_lower_get_next_private_rcu);

7157 7158 7159 7160 7161 7162 7163 7164 7165
/**
 * netdev_lower_get_next - Get the next device from the lower neighbour
 *                         list
 * @dev: device
 * @iter: list_head ** of the current position
 *
 * Gets the next netdev_adjacent from the dev's lower neighbour
 * list, starting from iter position. The caller must hold RTNL lock or
 * its own locking that guarantees that the neighbour lower
7166
 * list will remain unchanged.
7167 7168 7169 7170 7171
 */
void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter)
{
	struct netdev_adjacent *lower;

7172
	lower = list_entry(*iter, struct netdev_adjacent, list);
7173 7174 7175 7176

	if (&lower->list == &dev->adj_list.lower)
		return NULL;

7177
	*iter = lower->list.next;
7178 7179 7180 7181 7182

	return lower->dev;
}
EXPORT_SYMBOL(netdev_lower_get_next);

7183 7184 7185 7186 7187
static struct net_device *netdev_next_lower_dev(struct net_device *dev,
						struct list_head **iter)
{
	struct netdev_adjacent *lower;

7188
	lower = list_entry((*iter)->next, struct netdev_adjacent, list);
7189 7190 7191 7192

	if (&lower->list == &dev->adj_list.lower)
		return NULL;

7193
	*iter = &lower->list;
7194 7195 7196 7197

	return lower->dev;
}

7198 7199 7200 7201 7202 7203 7204 7205 7206 7207 7208 7209 7210 7211 7212 7213 7214
static struct net_device *__netdev_next_lower_dev(struct net_device *dev,
						  struct list_head **iter,
						  bool *ignore)
{
	struct netdev_adjacent *lower;

	lower = list_entry((*iter)->next, struct netdev_adjacent, list);

	if (&lower->list == &dev->adj_list.lower)
		return NULL;

	*iter = &lower->list;
	*ignore = lower->ignore;

	return lower->dev;
}

7215 7216 7217 7218 7219
int netdev_walk_all_lower_dev(struct net_device *dev,
			      int (*fn)(struct net_device *dev,
					void *data),
			      void *data)
{
T
Taehee Yoo 已提交
7220 7221 7222
	struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
	struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
	int ret, cur = 0;
7223

T
Taehee Yoo 已提交
7224 7225
	now = dev;
	iter = &dev->adj_list.lower;
7226

T
Taehee Yoo 已提交
7227 7228 7229 7230 7231 7232 7233 7234 7235 7236 7237 7238 7239 7240 7241 7242 7243 7244 7245 7246 7247 7248 7249 7250 7251 7252 7253 7254 7255
	while (1) {
		if (now != dev) {
			ret = fn(now, data);
			if (ret)
				return ret;
		}

		next = NULL;
		while (1) {
			ldev = netdev_next_lower_dev(now, &iter);
			if (!ldev)
				break;

			next = ldev;
			niter = &ldev->adj_list.lower;
			dev_stack[cur] = now;
			iter_stack[cur++] = iter;
			break;
		}

		if (!next) {
			if (!cur)
				return 0;
			next = dev_stack[--cur];
			niter = iter_stack[cur];
		}

		now = next;
		iter = niter;
7256 7257 7258 7259 7260 7261
	}

	return 0;
}
EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev);

7262 7263 7264 7265 7266 7267 7268 7269 7270 7271 7272 7273 7274 7275 7276 7277 7278 7279 7280 7281 7282 7283 7284 7285 7286 7287 7288 7289 7290 7291 7292 7293 7294 7295 7296 7297 7298 7299 7300 7301 7302 7303 7304 7305 7306 7307 7308 7309 7310
static int __netdev_walk_all_lower_dev(struct net_device *dev,
				       int (*fn)(struct net_device *dev,
						 void *data),
				       void *data)
{
	struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
	struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
	int ret, cur = 0;
	bool ignore;

	now = dev;
	iter = &dev->adj_list.lower;

	while (1) {
		if (now != dev) {
			ret = fn(now, data);
			if (ret)
				return ret;
		}

		next = NULL;
		while (1) {
			ldev = __netdev_next_lower_dev(now, &iter, &ignore);
			if (!ldev)
				break;
			if (ignore)
				continue;

			next = ldev;
			niter = &ldev->adj_list.lower;
			dev_stack[cur] = now;
			iter_stack[cur++] = iter;
			break;
		}

		if (!next) {
			if (!cur)
				return 0;
			next = dev_stack[--cur];
			niter = iter_stack[cur];
		}

		now = next;
		iter = niter;
	}

	return 0;
}

7311 7312
struct net_device *netdev_next_lower_dev_rcu(struct net_device *dev,
					     struct list_head **iter)
7313 7314 7315 7316 7317 7318 7319 7320 7321 7322 7323
{
	struct netdev_adjacent *lower;

	lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
	if (&lower->list == &dev->adj_list.lower)
		return NULL;

	*iter = &lower->list;

	return lower->dev;
}
7324
EXPORT_SYMBOL(netdev_next_lower_dev_rcu);
7325

T
Taehee Yoo 已提交
7326 7327 7328 7329 7330
static u8 __netdev_upper_depth(struct net_device *dev)
{
	struct net_device *udev;
	struct list_head *iter;
	u8 max_depth = 0;
7331
	bool ignore;
T
Taehee Yoo 已提交
7332 7333

	for (iter = &dev->adj_list.upper,
7334
	     udev = __netdev_next_upper_dev(dev, &iter, &ignore);
T
Taehee Yoo 已提交
7335
	     udev;
7336 7337 7338
	     udev = __netdev_next_upper_dev(dev, &iter, &ignore)) {
		if (ignore)
			continue;
T
Taehee Yoo 已提交
7339 7340 7341 7342 7343 7344 7345 7346
		if (max_depth < udev->upper_level)
			max_depth = udev->upper_level;
	}

	return max_depth;
}

static u8 __netdev_lower_depth(struct net_device *dev)
7347 7348 7349
{
	struct net_device *ldev;
	struct list_head *iter;
T
Taehee Yoo 已提交
7350
	u8 max_depth = 0;
7351
	bool ignore;
7352 7353

	for (iter = &dev->adj_list.lower,
7354
	     ldev = __netdev_next_lower_dev(dev, &iter, &ignore);
7355
	     ldev;
7356 7357 7358
	     ldev = __netdev_next_lower_dev(dev, &iter, &ignore)) {
		if (ignore)
			continue;
T
Taehee Yoo 已提交
7359 7360 7361
		if (max_depth < ldev->lower_level)
			max_depth = ldev->lower_level;
	}
7362

T
Taehee Yoo 已提交
7363 7364 7365 7366 7367 7368 7369 7370 7371 7372 7373 7374 7375 7376 7377 7378 7379 7380 7381 7382 7383 7384 7385 7386 7387 7388 7389 7390 7391 7392 7393 7394 7395 7396 7397 7398 7399 7400 7401 7402 7403 7404 7405 7406 7407 7408 7409 7410 7411 7412 7413 7414 7415 7416 7417 7418
	return max_depth;
}

static int __netdev_update_upper_level(struct net_device *dev, void *data)
{
	dev->upper_level = __netdev_upper_depth(dev) + 1;
	return 0;
}

static int __netdev_update_lower_level(struct net_device *dev, void *data)
{
	dev->lower_level = __netdev_lower_depth(dev) + 1;
	return 0;
}

int netdev_walk_all_lower_dev_rcu(struct net_device *dev,
				  int (*fn)(struct net_device *dev,
					    void *data),
				  void *data)
{
	struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
	struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
	int ret, cur = 0;

	now = dev;
	iter = &dev->adj_list.lower;

	while (1) {
		if (now != dev) {
			ret = fn(now, data);
			if (ret)
				return ret;
		}

		next = NULL;
		while (1) {
			ldev = netdev_next_lower_dev_rcu(now, &iter);
			if (!ldev)
				break;

			next = ldev;
			niter = &ldev->adj_list.lower;
			dev_stack[cur] = now;
			iter_stack[cur++] = iter;
			break;
		}

		if (!next) {
			if (!cur)
				return 0;
			next = dev_stack[--cur];
			niter = iter_stack[cur];
		}

		now = next;
		iter = niter;
7419 7420 7421 7422 7423 7424
	}

	return 0;
}
EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev_rcu);

7425 7426 7427 7428 7429 7430 7431 7432 7433 7434 7435 7436 7437 7438 7439 7440 7441 7442 7443 7444 7445
/**
 * netdev_lower_get_first_private_rcu - Get the first ->private from the
 *				       lower neighbour list, RCU
 *				       variant
 * @dev: device
 *
 * Gets the first netdev_adjacent->private from the dev's lower neighbour
 * list. The caller must hold RCU read lock.
 */
void *netdev_lower_get_first_private_rcu(struct net_device *dev)
{
	struct netdev_adjacent *lower;

	lower = list_first_or_null_rcu(&dev->adj_list.lower,
			struct netdev_adjacent, list);
	if (lower)
		return lower->private;
	return NULL;
}
EXPORT_SYMBOL(netdev_lower_get_first_private_rcu);

J
Jiri Pirko 已提交
7446 7447 7448 7449 7450 7451 7452 7453 7454
/**
 * netdev_master_upper_dev_get_rcu - Get master upper device
 * @dev: device
 *
 * Find a master upper device and return pointer to it or NULL in case
 * it's not there. The caller must hold the RCU read lock.
 */
struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
{
7455
	struct netdev_adjacent *upper;
J
Jiri Pirko 已提交
7456

7457
	upper = list_first_or_null_rcu(&dev->adj_list.upper,
7458
				       struct netdev_adjacent, list);
J
Jiri Pirko 已提交
7459 7460 7461 7462 7463 7464
	if (upper && likely(upper->master))
		return upper->dev;
	return NULL;
}
EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);

7465
static int netdev_adjacent_sysfs_add(struct net_device *dev,
7466 7467 7468 7469
			      struct net_device *adj_dev,
			      struct list_head *dev_list)
{
	char linkname[IFNAMSIZ+7];
7470

7471 7472 7473 7474 7475
	sprintf(linkname, dev_list == &dev->adj_list.upper ?
		"upper_%s" : "lower_%s", adj_dev->name);
	return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj),
				 linkname);
}
7476
static void netdev_adjacent_sysfs_del(struct net_device *dev,
7477 7478 7479 7480
			       char *name,
			       struct list_head *dev_list)
{
	char linkname[IFNAMSIZ+7];
7481

7482 7483 7484 7485 7486
	sprintf(linkname, dev_list == &dev->adj_list.upper ?
		"upper_%s" : "lower_%s", name);
	sysfs_remove_link(&(dev->dev.kobj), linkname);
}

7487 7488 7489 7490 7491 7492 7493 7494
static inline bool netdev_adjacent_is_neigh_list(struct net_device *dev,
						 struct net_device *adj_dev,
						 struct list_head *dev_list)
{
	return (dev_list == &dev->adj_list.upper ||
		dev_list == &dev->adj_list.lower) &&
		net_eq(dev_net(dev), dev_net(adj_dev));
}
7495

7496 7497
static int __netdev_adjacent_dev_insert(struct net_device *dev,
					struct net_device *adj_dev,
7498
					struct list_head *dev_list,
7499
					void *private, bool master)
7500 7501
{
	struct netdev_adjacent *adj;
7502
	int ret;
7503

7504
	adj = __netdev_find_adj(adj_dev, dev_list);
7505 7506

	if (adj) {
7507
		adj->ref_nr += 1;
7508 7509 7510
		pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d\n",
			 dev->name, adj_dev->name, adj->ref_nr);

7511 7512 7513 7514 7515 7516 7517 7518 7519
		return 0;
	}

	adj = kmalloc(sizeof(*adj), GFP_KERNEL);
	if (!adj)
		return -ENOMEM;

	adj->dev = adj_dev;
	adj->master = master;
7520
	adj->ref_nr = 1;
7521
	adj->private = private;
7522
	adj->ignore = false;
7523
	dev_hold(adj_dev);
7524

7525 7526
	pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d; dev_hold on %s\n",
		 dev->name, adj_dev->name, adj->ref_nr, adj_dev->name);
7527

7528
	if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) {
7529
		ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list);
7530 7531 7532 7533
		if (ret)
			goto free_adj;
	}

7534
	/* Ensure that master link is always the first item in list. */
7535 7536 7537 7538
	if (master) {
		ret = sysfs_create_link(&(dev->dev.kobj),
					&(adj_dev->dev.kobj), "master");
		if (ret)
7539
			goto remove_symlinks;
7540

7541
		list_add_rcu(&adj->list, dev_list);
7542
	} else {
7543
		list_add_tail_rcu(&adj->list, dev_list);
7544
	}
7545 7546

	return 0;
7547

7548
remove_symlinks:
7549
	if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
7550
		netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
7551 7552
free_adj:
	kfree(adj);
7553
	dev_put(adj_dev);
7554 7555

	return ret;
7556 7557
}

S
stephen hemminger 已提交
7558 7559
static void __netdev_adjacent_dev_remove(struct net_device *dev,
					 struct net_device *adj_dev,
7560
					 u16 ref_nr,
S
stephen hemminger 已提交
7561
					 struct list_head *dev_list)
7562 7563 7564
{
	struct netdev_adjacent *adj;

7565 7566 7567
	pr_debug("Remove adjacency: dev %s adj_dev %s ref_nr %d\n",
		 dev->name, adj_dev->name, ref_nr);

7568
	adj = __netdev_find_adj(adj_dev, dev_list);
7569

7570
	if (!adj) {
7571
		pr_err("Adjacency does not exist for device %s from %s\n",
7572
		       dev->name, adj_dev->name);
7573 7574
		WARN_ON(1);
		return;
7575
	}
7576

7577
	if (adj->ref_nr > ref_nr) {
7578 7579 7580
		pr_debug("adjacency: %s to %s ref_nr - %d = %d\n",
			 dev->name, adj_dev->name, ref_nr,
			 adj->ref_nr - ref_nr);
7581
		adj->ref_nr -= ref_nr;
7582 7583 7584
		return;
	}

7585 7586 7587
	if (adj->master)
		sysfs_remove_link(&(dev->dev.kobj), "master");

7588
	if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
7589
		netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
7590

7591
	list_del_rcu(&adj->list);
7592
	pr_debug("adjacency: dev_put for %s, because link removed from %s to %s\n",
7593
		 adj_dev->name, dev->name, adj_dev->name);
7594 7595 7596 7597
	dev_put(adj_dev);
	kfree_rcu(adj, rcu);
}

S
stephen hemminger 已提交
7598 7599 7600 7601 7602
static int __netdev_adjacent_dev_link_lists(struct net_device *dev,
					    struct net_device *upper_dev,
					    struct list_head *up_list,
					    struct list_head *down_list,
					    void *private, bool master)
7603 7604 7605
{
	int ret;

7606
	ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list,
7607
					   private, master);
7608 7609 7610
	if (ret)
		return ret;

7611
	ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list,
7612
					   private, false);
7613
	if (ret) {
7614
		__netdev_adjacent_dev_remove(dev, upper_dev, 1, up_list);
7615 7616 7617 7618 7619 7620
		return ret;
	}

	return 0;
}

S
stephen hemminger 已提交
7621 7622
static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev,
					       struct net_device *upper_dev,
7623
					       u16 ref_nr,
S
stephen hemminger 已提交
7624 7625
					       struct list_head *up_list,
					       struct list_head *down_list)
7626
{
7627 7628
	__netdev_adjacent_dev_remove(dev, upper_dev, ref_nr, up_list);
	__netdev_adjacent_dev_remove(upper_dev, dev, ref_nr, down_list);
7629 7630
}

S
stephen hemminger 已提交
7631 7632 7633
static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev,
						struct net_device *upper_dev,
						void *private, bool master)
7634
{
7635 7636 7637 7638
	return __netdev_adjacent_dev_link_lists(dev, upper_dev,
						&dev->adj_list.upper,
						&upper_dev->adj_list.lower,
						private, master);
7639 7640
}

S
stephen hemminger 已提交
7641 7642
static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev,
						   struct net_device *upper_dev)
7643
{
7644
	__netdev_adjacent_dev_unlink_lists(dev, upper_dev, 1,
7645 7646 7647
					   &dev->adj_list.upper,
					   &upper_dev->adj_list.lower);
}
7648

J
Jiri Pirko 已提交
7649
static int __netdev_upper_dev_link(struct net_device *dev,
7650
				   struct net_device *upper_dev, bool master,
7651 7652
				   void *upper_priv, void *upper_info,
				   struct netlink_ext_ack *extack)
J
Jiri Pirko 已提交
7653
{
7654 7655 7656
	struct netdev_notifier_changeupper_info changeupper_info = {
		.info = {
			.dev = dev,
7657
			.extack = extack,
7658 7659 7660 7661 7662 7663
		},
		.upper_dev = upper_dev,
		.master = master,
		.linking = true,
		.upper_info = upper_info,
	};
7664
	struct net_device *master_dev;
7665
	int ret = 0;
J
Jiri Pirko 已提交
7666 7667 7668 7669 7670 7671 7672

	ASSERT_RTNL();

	if (dev == upper_dev)
		return -EBUSY;

	/* To prevent loops, check if dev is not upper device to upper_dev. */
7673
	if (__netdev_has_upper_dev(upper_dev, dev))
J
Jiri Pirko 已提交
7674 7675
		return -EBUSY;

T
Taehee Yoo 已提交
7676 7677 7678
	if ((dev->lower_level + upper_dev->upper_level) > MAX_NEST_DEV)
		return -EMLINK;

7679
	if (!master) {
7680
		if (__netdev_has_upper_dev(dev, upper_dev))
7681 7682
			return -EEXIST;
	} else {
7683
		master_dev = __netdev_master_upper_dev_get(dev);
7684 7685 7686
		if (master_dev)
			return master_dev == upper_dev ? -EEXIST : -EBUSY;
	}
J
Jiri Pirko 已提交
7687

7688
	ret = call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER,
7689 7690 7691 7692 7693
					    &changeupper_info.info);
	ret = notifier_to_errno(ret);
	if (ret)
		return ret;

7694
	ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, upper_priv,
7695
						   master);
7696 7697
	if (ret)
		return ret;
J
Jiri Pirko 已提交
7698

7699
	ret = call_netdevice_notifiers_info(NETDEV_CHANGEUPPER,
7700 7701 7702
					    &changeupper_info.info);
	ret = notifier_to_errno(ret);
	if (ret)
7703
		goto rollback;
7704

T
Taehee Yoo 已提交
7705
	__netdev_update_upper_level(dev, NULL);
7706
	__netdev_walk_all_lower_dev(dev, __netdev_update_upper_level, NULL);
T
Taehee Yoo 已提交
7707 7708

	__netdev_update_lower_level(upper_dev, NULL);
7709 7710
	__netdev_walk_all_upper_dev(upper_dev, __netdev_update_lower_level,
				    NULL);
T
Taehee Yoo 已提交
7711

J
Jiri Pirko 已提交
7712
	return 0;
7713

7714
rollback:
7715
	__netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
7716 7717

	return ret;
J
Jiri Pirko 已提交
7718 7719 7720 7721 7722 7723
}

/**
 * netdev_upper_dev_link - Add a link to the upper device
 * @dev: device
 * @upper_dev: new upper device
7724
 * @extack: netlink extended ack
J
Jiri Pirko 已提交
7725 7726 7727 7728 7729 7730 7731
 *
 * Adds a link to device which is upper to this one. The caller must hold
 * the RTNL lock. On a failure a negative errno code is returned.
 * On success the reference counts are adjusted and the function
 * returns zero.
 */
int netdev_upper_dev_link(struct net_device *dev,
7732 7733
			  struct net_device *upper_dev,
			  struct netlink_ext_ack *extack)
J
Jiri Pirko 已提交
7734
{
7735 7736
	return __netdev_upper_dev_link(dev, upper_dev, false,
				       NULL, NULL, extack);
J
Jiri Pirko 已提交
7737 7738 7739 7740 7741 7742 7743
}
EXPORT_SYMBOL(netdev_upper_dev_link);

/**
 * netdev_master_upper_dev_link - Add a master link to the upper device
 * @dev: device
 * @upper_dev: new upper device
7744
 * @upper_priv: upper device private
7745
 * @upper_info: upper info to be passed down via notifier
7746
 * @extack: netlink extended ack
J
Jiri Pirko 已提交
7747 7748 7749 7750 7751 7752 7753 7754
 *
 * Adds a link to device which is upper to this one. In this case, only
 * one master upper device can be linked, although other non-master devices
 * might be linked as well. The caller must hold the RTNL lock.
 * On a failure a negative errno code is returned. On success the reference
 * counts are adjusted and the function returns zero.
 */
int netdev_master_upper_dev_link(struct net_device *dev,
7755
				 struct net_device *upper_dev,
7756 7757
				 void *upper_priv, void *upper_info,
				 struct netlink_ext_ack *extack)
J
Jiri Pirko 已提交
7758
{
7759
	return __netdev_upper_dev_link(dev, upper_dev, true,
7760
				       upper_priv, upper_info, extack);
J
Jiri Pirko 已提交
7761 7762 7763 7764 7765 7766 7767 7768 7769 7770 7771 7772 7773 7774
}
EXPORT_SYMBOL(netdev_master_upper_dev_link);

/**
 * netdev_upper_dev_unlink - Removes a link to upper device
 * @dev: device
 * @upper_dev: new upper device
 *
 * Removes a link to device which is upper to this one. The caller must hold
 * the RTNL lock.
 */
void netdev_upper_dev_unlink(struct net_device *dev,
			     struct net_device *upper_dev)
{
7775 7776 7777 7778 7779 7780 7781
	struct netdev_notifier_changeupper_info changeupper_info = {
		.info = {
			.dev = dev,
		},
		.upper_dev = upper_dev,
		.linking = false,
	};
7782

J
Jiri Pirko 已提交
7783 7784
	ASSERT_RTNL();

7785 7786
	changeupper_info.master = netdev_master_upper_dev_get(dev) == upper_dev;

7787
	call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER,
7788 7789
				      &changeupper_info.info);

7790
	__netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
7791

7792
	call_netdevice_notifiers_info(NETDEV_CHANGEUPPER,
7793
				      &changeupper_info.info);
T
Taehee Yoo 已提交
7794 7795

	__netdev_update_upper_level(dev, NULL);
7796
	__netdev_walk_all_lower_dev(dev, __netdev_update_upper_level, NULL);
T
Taehee Yoo 已提交
7797 7798

	__netdev_update_lower_level(upper_dev, NULL);
7799 7800
	__netdev_walk_all_upper_dev(upper_dev, __netdev_update_lower_level,
				    NULL);
J
Jiri Pirko 已提交
7801 7802 7803
}
EXPORT_SYMBOL(netdev_upper_dev_unlink);

7804 7805 7806 7807 7808 7809 7810 7811 7812 7813 7814 7815 7816 7817 7818 7819 7820 7821 7822 7823 7824 7825 7826 7827 7828 7829 7830 7831 7832 7833 7834 7835 7836 7837 7838 7839 7840 7841 7842 7843 7844 7845 7846 7847 7848 7849 7850 7851 7852 7853 7854 7855 7856 7857 7858 7859 7860 7861 7862 7863 7864 7865 7866 7867 7868 7869 7870 7871 7872 7873 7874 7875 7876 7877 7878 7879 7880 7881 7882 7883
static void __netdev_adjacent_dev_set(struct net_device *upper_dev,
				      struct net_device *lower_dev,
				      bool val)
{
	struct netdev_adjacent *adj;

	adj = __netdev_find_adj(lower_dev, &upper_dev->adj_list.lower);
	if (adj)
		adj->ignore = val;

	adj = __netdev_find_adj(upper_dev, &lower_dev->adj_list.upper);
	if (adj)
		adj->ignore = val;
}

static void netdev_adjacent_dev_disable(struct net_device *upper_dev,
					struct net_device *lower_dev)
{
	__netdev_adjacent_dev_set(upper_dev, lower_dev, true);
}

static void netdev_adjacent_dev_enable(struct net_device *upper_dev,
				       struct net_device *lower_dev)
{
	__netdev_adjacent_dev_set(upper_dev, lower_dev, false);
}

int netdev_adjacent_change_prepare(struct net_device *old_dev,
				   struct net_device *new_dev,
				   struct net_device *dev,
				   struct netlink_ext_ack *extack)
{
	int err;

	if (!new_dev)
		return 0;

	if (old_dev && new_dev != old_dev)
		netdev_adjacent_dev_disable(dev, old_dev);

	err = netdev_upper_dev_link(new_dev, dev, extack);
	if (err) {
		if (old_dev && new_dev != old_dev)
			netdev_adjacent_dev_enable(dev, old_dev);
		return err;
	}

	return 0;
}
EXPORT_SYMBOL(netdev_adjacent_change_prepare);

void netdev_adjacent_change_commit(struct net_device *old_dev,
				   struct net_device *new_dev,
				   struct net_device *dev)
{
	if (!new_dev || !old_dev)
		return;

	if (new_dev == old_dev)
		return;

	netdev_adjacent_dev_enable(dev, old_dev);
	netdev_upper_dev_unlink(old_dev, dev);
}
EXPORT_SYMBOL(netdev_adjacent_change_commit);

void netdev_adjacent_change_abort(struct net_device *old_dev,
				  struct net_device *new_dev,
				  struct net_device *dev)
{
	if (!new_dev)
		return;

	if (old_dev && new_dev != old_dev)
		netdev_adjacent_dev_enable(dev, old_dev);

	netdev_upper_dev_unlink(new_dev, dev);
}
EXPORT_SYMBOL(netdev_adjacent_change_abort);

7884 7885 7886
/**
 * netdev_bonding_info_change - Dispatch event about slave change
 * @dev: device
7887
 * @bonding_info: info to dispatch
7888 7889 7890 7891 7892 7893 7894
 *
 * Send NETDEV_BONDING_INFO to netdev notifiers with info.
 * The caller must hold the RTNL lock.
 */
void netdev_bonding_info_change(struct net_device *dev,
				struct netdev_bonding_info *bonding_info)
{
7895 7896 7897
	struct netdev_notifier_bonding_info info = {
		.info.dev = dev,
	};
7898 7899 7900

	memcpy(&info.bonding_info, bonding_info,
	       sizeof(struct netdev_bonding_info));
7901
	call_netdevice_notifiers_info(NETDEV_BONDING_INFO,
7902 7903 7904 7905
				      &info.info);
}
EXPORT_SYMBOL(netdev_bonding_info_change);

7906 7907
/**
 * netdev_get_xmit_slave - Get the xmit slave of master device
A
Andrew Lunn 已提交
7908
 * @dev: device
7909 7910 7911 7912 7913 7914 7915 7916 7917 7918 7919 7920 7921 7922 7923 7924 7925 7926 7927 7928
 * @skb: The packet
 * @all_slaves: assume all the slaves are active
 *
 * The reference counters are not incremented so the caller must be
 * careful with locks. The caller must hold RCU lock.
 * %NULL is returned if no slave is found.
 */

struct net_device *netdev_get_xmit_slave(struct net_device *dev,
					 struct sk_buff *skb,
					 bool all_slaves)
{
	const struct net_device_ops *ops = dev->netdev_ops;

	if (!ops->ndo_get_xmit_slave)
		return NULL;
	return ops->ndo_get_xmit_slave(dev, skb, all_slaves);
}
EXPORT_SYMBOL(netdev_get_xmit_slave);

E
Eric Dumazet 已提交
7929
static void netdev_adjacent_add_links(struct net_device *dev)
7930 7931 7932 7933 7934 7935
{
	struct netdev_adjacent *iter;

	struct net *net = dev_net(dev);

	list_for_each_entry(iter, &dev->adj_list.upper, list) {
W
Wei Tang 已提交
7936
		if (!net_eq(net, dev_net(iter->dev)))
7937 7938 7939 7940 7941 7942 7943 7944
			continue;
		netdev_adjacent_sysfs_add(iter->dev, dev,
					  &iter->dev->adj_list.lower);
		netdev_adjacent_sysfs_add(dev, iter->dev,
					  &dev->adj_list.upper);
	}

	list_for_each_entry(iter, &dev->adj_list.lower, list) {
W
Wei Tang 已提交
7945
		if (!net_eq(net, dev_net(iter->dev)))
7946 7947 7948 7949 7950 7951 7952 7953
			continue;
		netdev_adjacent_sysfs_add(iter->dev, dev,
					  &iter->dev->adj_list.upper);
		netdev_adjacent_sysfs_add(dev, iter->dev,
					  &dev->adj_list.lower);
	}
}

E
Eric Dumazet 已提交
7954
static void netdev_adjacent_del_links(struct net_device *dev)
7955 7956 7957 7958 7959 7960
{
	struct netdev_adjacent *iter;

	struct net *net = dev_net(dev);

	list_for_each_entry(iter, &dev->adj_list.upper, list) {
W
Wei Tang 已提交
7961
		if (!net_eq(net, dev_net(iter->dev)))
7962 7963 7964 7965 7966 7967 7968 7969
			continue;
		netdev_adjacent_sysfs_del(iter->dev, dev->name,
					  &iter->dev->adj_list.lower);
		netdev_adjacent_sysfs_del(dev, iter->dev->name,
					  &dev->adj_list.upper);
	}

	list_for_each_entry(iter, &dev->adj_list.lower, list) {
W
Wei Tang 已提交
7970
		if (!net_eq(net, dev_net(iter->dev)))
7971 7972 7973 7974 7975 7976 7977 7978
			continue;
		netdev_adjacent_sysfs_del(iter->dev, dev->name,
					  &iter->dev->adj_list.upper);
		netdev_adjacent_sysfs_del(dev, iter->dev->name,
					  &dev->adj_list.lower);
	}
}

7979
void netdev_adjacent_rename_links(struct net_device *dev, char *oldname)
7980
{
7981
	struct netdev_adjacent *iter;
7982

7983 7984
	struct net *net = dev_net(dev);

7985
	list_for_each_entry(iter, &dev->adj_list.upper, list) {
W
Wei Tang 已提交
7986
		if (!net_eq(net, dev_net(iter->dev)))
7987
			continue;
7988 7989 7990 7991 7992
		netdev_adjacent_sysfs_del(iter->dev, oldname,
					  &iter->dev->adj_list.lower);
		netdev_adjacent_sysfs_add(iter->dev, dev,
					  &iter->dev->adj_list.lower);
	}
7993

7994
	list_for_each_entry(iter, &dev->adj_list.lower, list) {
W
Wei Tang 已提交
7995
		if (!net_eq(net, dev_net(iter->dev)))
7996
			continue;
7997 7998 7999 8000 8001
		netdev_adjacent_sysfs_del(iter->dev, oldname,
					  &iter->dev->adj_list.upper);
		netdev_adjacent_sysfs_add(iter->dev, dev,
					  &iter->dev->adj_list.upper);
	}
8002 8003 8004 8005 8006 8007 8008 8009 8010
}

void *netdev_lower_dev_get_private(struct net_device *dev,
				   struct net_device *lower_dev)
{
	struct netdev_adjacent *lower;

	if (!lower_dev)
		return NULL;
8011
	lower = __netdev_find_adj(lower_dev, &dev->adj_list.lower);
8012 8013 8014 8015 8016 8017 8018
	if (!lower)
		return NULL;

	return lower->private;
}
EXPORT_SYMBOL(netdev_lower_dev_get_private);

8019

8020 8021 8022 8023 8024 8025 8026 8027 8028 8029 8030
/**
 * netdev_lower_change - Dispatch event about lower device state change
 * @lower_dev: device
 * @lower_state_info: state to dispatch
 *
 * Send NETDEV_CHANGELOWERSTATE to netdev notifiers with info.
 * The caller must hold the RTNL lock.
 */
void netdev_lower_state_changed(struct net_device *lower_dev,
				void *lower_state_info)
{
8031 8032 8033
	struct netdev_notifier_changelowerstate_info changelowerstate_info = {
		.info.dev = lower_dev,
	};
8034 8035 8036

	ASSERT_RTNL();
	changelowerstate_info.lower_state_info = lower_state_info;
8037
	call_netdevice_notifiers_info(NETDEV_CHANGELOWERSTATE,
8038 8039 8040 8041
				      &changelowerstate_info.info);
}
EXPORT_SYMBOL(netdev_lower_state_changed);

8042 8043
static void dev_change_rx_flags(struct net_device *dev, int flags)
{
8044 8045
	const struct net_device_ops *ops = dev->netdev_ops;

8046
	if (ops->ndo_change_rx_flags)
8047
		ops->ndo_change_rx_flags(dev, flags);
8048 8049
}

8050
static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
L
Linus Torvalds 已提交
8051
{
8052
	unsigned int old_flags = dev->flags;
8053 8054
	kuid_t uid;
	kgid_t gid;
L
Linus Torvalds 已提交
8055

8056 8057
	ASSERT_RTNL();

8058 8059 8060 8061 8062 8063 8064 8065 8066 8067 8068
	dev->flags |= IFF_PROMISC;
	dev->promiscuity += inc;
	if (dev->promiscuity == 0) {
		/*
		 * Avoid overflow.
		 * If inc causes overflow, untouch promisc and return error.
		 */
		if (inc < 0)
			dev->flags &= ~IFF_PROMISC;
		else {
			dev->promiscuity -= inc;
8069 8070
			pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
				dev->name);
8071 8072 8073
			return -EOVERFLOW;
		}
	}
8074
	if (dev->flags != old_flags) {
8075 8076 8077
		pr_info("device %s %s promiscuous mode\n",
			dev->name,
			dev->flags & IFF_PROMISC ? "entered" : "left");
8078 8079
		if (audit_enabled) {
			current_uid_gid(&uid, &gid);
8080 8081 8082 8083 8084 8085 8086 8087 8088
			audit_log(audit_context(), GFP_ATOMIC,
				  AUDIT_ANOM_PROMISCUOUS,
				  "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
				  dev->name, (dev->flags & IFF_PROMISC),
				  (old_flags & IFF_PROMISC),
				  from_kuid(&init_user_ns, audit_get_loginuid(current)),
				  from_kuid(&init_user_ns, uid),
				  from_kgid(&init_user_ns, gid),
				  audit_get_sessionid(current));
8089
		}
8090

8091
		dev_change_rx_flags(dev, IFF_PROMISC);
L
Linus Torvalds 已提交
8092
	}
8093 8094
	if (notify)
		__dev_notify_flags(dev, old_flags, IFF_PROMISC);
8095
	return 0;
L
Linus Torvalds 已提交
8096 8097
}

8098 8099 8100 8101 8102 8103 8104 8105 8106
/**
 *	dev_set_promiscuity	- update promiscuity count on a device
 *	@dev: device
 *	@inc: modifier
 *
 *	Add or remove promiscuity from a device. While the count in the device
 *	remains above zero the interface remains promiscuous. Once it hits zero
 *	the device reverts back to normal filtering operation. A negative inc
 *	value is used to drop promiscuity on the device.
8107
 *	Return 0 if successful or a negative errno code on error.
8108
 */
8109
int dev_set_promiscuity(struct net_device *dev, int inc)
8110
{
8111
	unsigned int old_flags = dev->flags;
8112
	int err;
8113

8114
	err = __dev_set_promiscuity(dev, inc, true);
8115
	if (err < 0)
8116
		return err;
8117 8118
	if (dev->flags != old_flags)
		dev_set_rx_mode(dev);
8119
	return err;
8120
}
E
Eric Dumazet 已提交
8121
EXPORT_SYMBOL(dev_set_promiscuity);
8122

8123
static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify)
L
Linus Torvalds 已提交
8124
{
8125
	unsigned int old_flags = dev->flags, old_gflags = dev->gflags;
L
Linus Torvalds 已提交
8126

8127 8128
	ASSERT_RTNL();

L
Linus Torvalds 已提交
8129
	dev->flags |= IFF_ALLMULTI;
8130 8131 8132 8133 8134 8135 8136 8137 8138 8139
	dev->allmulti += inc;
	if (dev->allmulti == 0) {
		/*
		 * Avoid overflow.
		 * If inc causes overflow, untouch allmulti and return error.
		 */
		if (inc < 0)
			dev->flags &= ~IFF_ALLMULTI;
		else {
			dev->allmulti -= inc;
8140 8141
			pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
				dev->name);
8142 8143 8144
			return -EOVERFLOW;
		}
	}
8145
	if (dev->flags ^ old_flags) {
8146
		dev_change_rx_flags(dev, IFF_ALLMULTI);
8147
		dev_set_rx_mode(dev);
8148 8149 8150
		if (notify)
			__dev_notify_flags(dev, old_flags,
					   dev->gflags ^ old_gflags);
8151
	}
8152
	return 0;
8153
}
8154 8155 8156 8157 8158 8159 8160 8161 8162 8163 8164 8165 8166 8167 8168 8169 8170 8171

/**
 *	dev_set_allmulti	- update allmulti count on a device
 *	@dev: device
 *	@inc: modifier
 *
 *	Add or remove reception of all multicast frames to a device. While the
 *	count in the device remains above zero the interface remains listening
 *	to all interfaces. Once it hits zero the device reverts back to normal
 *	filtering operation. A negative @inc value is used to drop the counter
 *	when releasing a resource needing all multicasts.
 *	Return 0 if successful or a negative errno code on error.
 */

int dev_set_allmulti(struct net_device *dev, int inc)
{
	return __dev_set_allmulti(dev, inc, true);
}
E
Eric Dumazet 已提交
8172
EXPORT_SYMBOL(dev_set_allmulti);
8173 8174 8175 8176

/*
 *	Upload unicast and multicast address lists to device and
 *	configure RX filtering. When the device doesn't support unicast
J
Joe Perches 已提交
8177
 *	filtering it is put in promiscuous mode while unicast addresses
8178 8179 8180 8181
 *	are present.
 */
void __dev_set_rx_mode(struct net_device *dev)
{
8182 8183
	const struct net_device_ops *ops = dev->netdev_ops;

8184 8185 8186 8187 8188
	/* dev_open will call this function so the list will stay sane. */
	if (!(dev->flags&IFF_UP))
		return;

	if (!netif_device_present(dev))
8189
		return;
8190

8191
	if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
8192 8193 8194
		/* Unicast addresses changes may only happen under the rtnl,
		 * therefore calling __dev_set_promiscuity here is safe.
		 */
8195
		if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
8196
			__dev_set_promiscuity(dev, 1, false);
8197
			dev->uc_promisc = true;
8198
		} else if (netdev_uc_empty(dev) && dev->uc_promisc) {
8199
			__dev_set_promiscuity(dev, -1, false);
8200
			dev->uc_promisc = false;
8201 8202
		}
	}
8203 8204 8205

	if (ops->ndo_set_rx_mode)
		ops->ndo_set_rx_mode(dev);
8206 8207 8208 8209
}

void dev_set_rx_mode(struct net_device *dev)
{
8210
	netif_addr_lock_bh(dev);
8211
	__dev_set_rx_mode(dev);
8212
	netif_addr_unlock_bh(dev);
L
Linus Torvalds 已提交
8213 8214
}

8215 8216 8217 8218 8219 8220
/**
 *	dev_get_flags - get flags reported to userspace
 *	@dev: device
 *
 *	Get the combination of flag bits exported through APIs to userspace.
 */
8221
unsigned int dev_get_flags(const struct net_device *dev)
L
Linus Torvalds 已提交
8222
{
8223
	unsigned int flags;
L
Linus Torvalds 已提交
8224 8225 8226

	flags = (dev->flags & ~(IFF_PROMISC |
				IFF_ALLMULTI |
S
Stefan Rompf 已提交
8227 8228 8229
				IFF_RUNNING |
				IFF_LOWER_UP |
				IFF_DORMANT)) |
L
Linus Torvalds 已提交
8230 8231 8232
		(dev->gflags & (IFF_PROMISC |
				IFF_ALLMULTI));

S
Stefan Rompf 已提交
8233 8234 8235 8236 8237 8238 8239 8240
	if (netif_running(dev)) {
		if (netif_oper_up(dev))
			flags |= IFF_RUNNING;
		if (netif_carrier_ok(dev))
			flags |= IFF_LOWER_UP;
		if (netif_dormant(dev))
			flags |= IFF_DORMANT;
	}
L
Linus Torvalds 已提交
8241 8242 8243

	return flags;
}
E
Eric Dumazet 已提交
8244
EXPORT_SYMBOL(dev_get_flags);
L
Linus Torvalds 已提交
8245

8246 8247
int __dev_change_flags(struct net_device *dev, unsigned int flags,
		       struct netlink_ext_ack *extack)
L
Linus Torvalds 已提交
8248
{
8249
	unsigned int old_flags = dev->flags;
8250
	int ret;
L
Linus Torvalds 已提交
8251

8252 8253
	ASSERT_RTNL();

L
Linus Torvalds 已提交
8254 8255 8256 8257 8258 8259 8260 8261 8262 8263 8264 8265 8266 8267
	/*
	 *	Set the flags on our device.
	 */

	dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
			       IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
			       IFF_AUTOMEDIA)) |
		     (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
				    IFF_ALLMULTI));

	/*
	 *	Load in the correct multicast list now the flags have changed.
	 */

8268 8269
	if ((old_flags ^ flags) & IFF_MULTICAST)
		dev_change_rx_flags(dev, IFF_MULTICAST);
8270

8271
	dev_set_rx_mode(dev);
L
Linus Torvalds 已提交
8272 8273 8274 8275 8276 8277 8278 8279

	/*
	 *	Have we downed the interface. We handle IFF_UP ourselves
	 *	according to user attempts to set it, rather than blindly
	 *	setting it.
	 */

	ret = 0;
8280 8281 8282 8283
	if ((old_flags ^ flags) & IFF_UP) {
		if (old_flags & IFF_UP)
			__dev_close(dev);
		else
8284
			ret = __dev_open(dev, extack);
8285
	}
L
Linus Torvalds 已提交
8286 8287

	if ((flags ^ dev->gflags) & IFF_PROMISC) {
E
Eric Dumazet 已提交
8288
		int inc = (flags & IFF_PROMISC) ? 1 : -1;
8289
		unsigned int old_flags = dev->flags;
E
Eric Dumazet 已提交
8290

L
Linus Torvalds 已提交
8291
		dev->gflags ^= IFF_PROMISC;
8292 8293 8294 8295

		if (__dev_set_promiscuity(dev, inc, false) >= 0)
			if (dev->flags != old_flags)
				dev_set_rx_mode(dev);
L
Linus Torvalds 已提交
8296 8297 8298
	}

	/* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
8299 8300
	 * is important. Some (broken) drivers set IFF_PROMISC, when
	 * IFF_ALLMULTI is requested not asking us and not reporting.
L
Linus Torvalds 已提交
8301 8302
	 */
	if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
E
Eric Dumazet 已提交
8303 8304
		int inc = (flags & IFF_ALLMULTI) ? 1 : -1;

L
Linus Torvalds 已提交
8305
		dev->gflags ^= IFF_ALLMULTI;
8306
		__dev_set_allmulti(dev, inc, false);
L
Linus Torvalds 已提交
8307 8308
	}

8309 8310 8311
	return ret;
}

8312 8313
void __dev_notify_flags(struct net_device *dev, unsigned int old_flags,
			unsigned int gchanges)
8314 8315 8316
{
	unsigned int changes = dev->flags ^ old_flags;

8317
	if (gchanges)
8318
		rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC);
8319

8320 8321 8322 8323 8324 8325 8326 8327
	if (changes & IFF_UP) {
		if (dev->flags & IFF_UP)
			call_netdevice_notifiers(NETDEV_UP, dev);
		else
			call_netdevice_notifiers(NETDEV_DOWN, dev);
	}

	if (dev->flags & IFF_UP &&
8328
	    (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) {
8329 8330 8331 8332 8333 8334
		struct netdev_notifier_change_info change_info = {
			.info = {
				.dev = dev,
			},
			.flags_changed = changes,
		};
8335

8336
		call_netdevice_notifiers_info(NETDEV_CHANGE, &change_info.info);
8337
	}
8338 8339 8340 8341 8342 8343
}

/**
 *	dev_change_flags - change device settings
 *	@dev: device
 *	@flags: device state flags
8344
 *	@extack: netlink extended ack
8345 8346 8347 8348
 *
 *	Change settings on device based state flags. The flags are
 *	in the userspace exported format.
 */
8349 8350
int dev_change_flags(struct net_device *dev, unsigned int flags,
		     struct netlink_ext_ack *extack)
8351
{
8352
	int ret;
8353
	unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags;
8354

8355
	ret = __dev_change_flags(dev, flags, extack);
8356 8357 8358
	if (ret < 0)
		return ret;

8359
	changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags);
8360
	__dev_notify_flags(dev, old_flags, changes);
L
Linus Torvalds 已提交
8361 8362
	return ret;
}
E
Eric Dumazet 已提交
8363
EXPORT_SYMBOL(dev_change_flags);
L
Linus Torvalds 已提交
8364

8365
int __dev_set_mtu(struct net_device *dev, int new_mtu)
8366 8367 8368 8369 8370 8371
{
	const struct net_device_ops *ops = dev->netdev_ops;

	if (ops->ndo_change_mtu)
		return ops->ndo_change_mtu(dev, new_mtu);

8372 8373
	/* Pairs with all the lockless reads of dev->mtu in the stack */
	WRITE_ONCE(dev->mtu, new_mtu);
8374 8375
	return 0;
}
8376
EXPORT_SYMBOL(__dev_set_mtu);
8377

8378 8379 8380 8381 8382 8383 8384 8385 8386 8387 8388 8389 8390 8391 8392 8393
int dev_validate_mtu(struct net_device *dev, int new_mtu,
		     struct netlink_ext_ack *extack)
{
	/* MTU must be positive, and in range */
	if (new_mtu < 0 || new_mtu < dev->min_mtu) {
		NL_SET_ERR_MSG(extack, "mtu less than device minimum");
		return -EINVAL;
	}

	if (dev->max_mtu > 0 && new_mtu > dev->max_mtu) {
		NL_SET_ERR_MSG(extack, "mtu greater than device maximum");
		return -EINVAL;
	}
	return 0;
}

8394
/**
8395
 *	dev_set_mtu_ext - Change maximum transfer unit
8396 8397
 *	@dev: device
 *	@new_mtu: new transfer unit
8398
 *	@extack: netlink extended ack
8399 8400 8401
 *
 *	Change the maximum transfer size of the network device.
 */
8402 8403
int dev_set_mtu_ext(struct net_device *dev, int new_mtu,
		    struct netlink_ext_ack *extack)
L
Linus Torvalds 已提交
8404
{
8405
	int err, orig_mtu;
L
Linus Torvalds 已提交
8406 8407 8408 8409

	if (new_mtu == dev->mtu)
		return 0;

8410 8411 8412
	err = dev_validate_mtu(dev, new_mtu, extack);
	if (err)
		return err;
L
Linus Torvalds 已提交
8413 8414 8415 8416

	if (!netif_device_present(dev))
		return -ENODEV;

8417 8418 8419 8420
	err = call_netdevice_notifiers(NETDEV_PRECHANGEMTU, dev);
	err = notifier_to_errno(err);
	if (err)
		return err;
8421

8422 8423
	orig_mtu = dev->mtu;
	err = __dev_set_mtu(dev, new_mtu);
8424

8425
	if (!err) {
8426 8427
		err = call_netdevice_notifiers_mtu(NETDEV_CHANGEMTU, dev,
						   orig_mtu);
8428 8429 8430 8431 8432 8433
		err = notifier_to_errno(err);
		if (err) {
			/* setting mtu back and notifying everyone again,
			 * so that they have a chance to revert changes.
			 */
			__dev_set_mtu(dev, orig_mtu);
8434 8435
			call_netdevice_notifiers_mtu(NETDEV_CHANGEMTU, dev,
						     new_mtu);
8436 8437
		}
	}
L
Linus Torvalds 已提交
8438 8439
	return err;
}
8440 8441 8442 8443 8444 8445

int dev_set_mtu(struct net_device *dev, int new_mtu)
{
	struct netlink_ext_ack extack;
	int err;

8446
	memset(&extack, 0, sizeof(extack));
8447
	err = dev_set_mtu_ext(dev, new_mtu, &extack);
8448
	if (err && extack._msg)
8449 8450 8451
		net_err_ratelimited("%s: %s\n", dev->name, extack._msg);
	return err;
}
E
Eric Dumazet 已提交
8452
EXPORT_SYMBOL(dev_set_mtu);
L
Linus Torvalds 已提交
8453

8454 8455 8456 8457 8458 8459 8460 8461 8462 8463 8464 8465 8466 8467 8468 8469 8470
/**
 *	dev_change_tx_queue_len - Change TX queue length of a netdevice
 *	@dev: device
 *	@new_len: new tx queue length
 */
int dev_change_tx_queue_len(struct net_device *dev, unsigned long new_len)
{
	unsigned int orig_len = dev->tx_queue_len;
	int res;

	if (new_len != (unsigned int)new_len)
		return -ERANGE;

	if (new_len != orig_len) {
		dev->tx_queue_len = new_len;
		res = call_netdevice_notifiers(NETDEV_CHANGE_TX_QUEUE_LEN, dev);
		res = notifier_to_errno(res);
8471 8472 8473 8474 8475
		if (res)
			goto err_rollback;
		res = dev_qdisc_change_tx_queue_len(dev);
		if (res)
			goto err_rollback;
8476 8477 8478
	}

	return 0;
8479 8480 8481 8482 8483

err_rollback:
	netdev_err(dev, "refused to change device tx_queue_len\n");
	dev->tx_queue_len = orig_len;
	return res;
8484 8485
}

8486 8487 8488 8489 8490 8491 8492 8493 8494 8495 8496
/**
 *	dev_set_group - Change group this device belongs to
 *	@dev: device
 *	@new_group: group this device should belong to
 */
void dev_set_group(struct net_device *dev, int new_group)
{
	dev->group = new_group;
}
EXPORT_SYMBOL(dev_set_group);

8497 8498 8499 8500 8501 8502 8503 8504 8505 8506 8507 8508 8509 8510 8511 8512 8513 8514 8515 8516 8517
/**
 *	dev_pre_changeaddr_notify - Call NETDEV_PRE_CHANGEADDR.
 *	@dev: device
 *	@addr: new address
 *	@extack: netlink extended ack
 */
int dev_pre_changeaddr_notify(struct net_device *dev, const char *addr,
			      struct netlink_ext_ack *extack)
{
	struct netdev_notifier_pre_changeaddr_info info = {
		.info.dev = dev,
		.info.extack = extack,
		.dev_addr = addr,
	};
	int rc;

	rc = call_netdevice_notifiers_info(NETDEV_PRE_CHANGEADDR, &info.info);
	return notifier_to_errno(rc);
}
EXPORT_SYMBOL(dev_pre_changeaddr_notify);

8518 8519 8520 8521
/**
 *	dev_set_mac_address - Change Media Access Control Address
 *	@dev: device
 *	@sa: new address
8522
 *	@extack: netlink extended ack
8523 8524 8525
 *
 *	Change the hardware (MAC) address of the device
 */
8526 8527
int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa,
			struct netlink_ext_ack *extack)
L
Linus Torvalds 已提交
8528
{
8529
	const struct net_device_ops *ops = dev->netdev_ops;
L
Linus Torvalds 已提交
8530 8531
	int err;

8532
	if (!ops->ndo_set_mac_address)
L
Linus Torvalds 已提交
8533 8534 8535 8536 8537
		return -EOPNOTSUPP;
	if (sa->sa_family != dev->type)
		return -EINVAL;
	if (!netif_device_present(dev))
		return -ENODEV;
8538 8539 8540
	err = dev_pre_changeaddr_notify(dev, sa->sa_data, extack);
	if (err)
		return err;
8541
	err = ops->ndo_set_mac_address(dev, sa);
8542 8543
	if (err)
		return err;
J
Jiri Pirko 已提交
8544
	dev->addr_assign_type = NET_ADDR_SET;
8545
	call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
8546
	add_device_randomness(dev->dev_addr, dev->addr_len);
8547
	return 0;
L
Linus Torvalds 已提交
8548
}
E
Eric Dumazet 已提交
8549
EXPORT_SYMBOL(dev_set_mac_address);
L
Linus Torvalds 已提交
8550

J
Jiri Pirko 已提交
8551 8552 8553
/**
 *	dev_change_carrier - Change device carrier
 *	@dev: device
8554
 *	@new_carrier: new value
J
Jiri Pirko 已提交
8555 8556 8557 8558 8559 8560 8561 8562 8563 8564 8565 8566 8567 8568 8569
 *
 *	Change device carrier
 */
int dev_change_carrier(struct net_device *dev, bool new_carrier)
{
	const struct net_device_ops *ops = dev->netdev_ops;

	if (!ops->ndo_change_carrier)
		return -EOPNOTSUPP;
	if (!netif_device_present(dev))
		return -ENODEV;
	return ops->ndo_change_carrier(dev, new_carrier);
}
EXPORT_SYMBOL(dev_change_carrier);

8570 8571 8572 8573 8574 8575 8576 8577
/**
 *	dev_get_phys_port_id - Get device physical port ID
 *	@dev: device
 *	@ppid: port ID
 *
 *	Get device physical port ID
 */
int dev_get_phys_port_id(struct net_device *dev,
8578
			 struct netdev_phys_item_id *ppid)
8579 8580 8581 8582 8583 8584 8585 8586 8587
{
	const struct net_device_ops *ops = dev->netdev_ops;

	if (!ops->ndo_get_phys_port_id)
		return -EOPNOTSUPP;
	return ops->ndo_get_phys_port_id(dev, ppid);
}
EXPORT_SYMBOL(dev_get_phys_port_id);

8588 8589 8590 8591
/**
 *	dev_get_phys_port_name - Get device physical port name
 *	@dev: device
 *	@name: port name
8592
 *	@len: limit of bytes to copy to name
8593 8594 8595 8596 8597 8598 8599
 *
 *	Get device physical port name
 */
int dev_get_phys_port_name(struct net_device *dev,
			   char *name, size_t len)
{
	const struct net_device_ops *ops = dev->netdev_ops;
8600
	int err;
8601

8602 8603 8604 8605 8606 8607
	if (ops->ndo_get_phys_port_name) {
		err = ops->ndo_get_phys_port_name(dev, name, len);
		if (err != -EOPNOTSUPP)
			return err;
	}
	return devlink_compat_phys_port_name_get(dev, name, len);
8608 8609 8610
}
EXPORT_SYMBOL(dev_get_phys_port_name);

8611 8612 8613 8614 8615 8616 8617 8618 8619 8620 8621 8622 8623 8624 8625 8626
/**
 *	dev_get_port_parent_id - Get the device's port parent identifier
 *	@dev: network device
 *	@ppid: pointer to a storage for the port's parent identifier
 *	@recurse: allow/disallow recursion to lower devices
 *
 *	Get the devices's port parent identifier
 */
int dev_get_port_parent_id(struct net_device *dev,
			   struct netdev_phys_item_id *ppid,
			   bool recurse)
{
	const struct net_device_ops *ops = dev->netdev_ops;
	struct netdev_phys_item_id first = { };
	struct net_device *lower_dev;
	struct list_head *iter;
8627 8628 8629 8630 8631 8632 8633
	int err;

	if (ops->ndo_get_port_parent_id) {
		err = ops->ndo_get_port_parent_id(dev, ppid);
		if (err != -EOPNOTSUPP)
			return err;
	}
8634

8635 8636 8637
	err = devlink_compat_switch_id_get(dev, ppid);
	if (!err || err != -EOPNOTSUPP)
		return err;
8638 8639

	if (!recurse)
8640
		return -EOPNOTSUPP;
8641 8642 8643 8644 8645 8646 8647 8648 8649 8650 8651 8652 8653 8654 8655 8656 8657 8658 8659 8660 8661 8662 8663 8664 8665 8666 8667 8668 8669 8670 8671 8672 8673 8674

	netdev_for_each_lower_dev(dev, lower_dev, iter) {
		err = dev_get_port_parent_id(lower_dev, ppid, recurse);
		if (err)
			break;
		if (!first.id_len)
			first = *ppid;
		else if (memcmp(&first, ppid, sizeof(*ppid)))
			return -ENODATA;
	}

	return err;
}
EXPORT_SYMBOL(dev_get_port_parent_id);

/**
 *	netdev_port_same_parent_id - Indicate if two network devices have
 *	the same port parent identifier
 *	@a: first network device
 *	@b: second network device
 */
bool netdev_port_same_parent_id(struct net_device *a, struct net_device *b)
{
	struct netdev_phys_item_id a_id = { };
	struct netdev_phys_item_id b_id = { };

	if (dev_get_port_parent_id(a, &a_id, true) ||
	    dev_get_port_parent_id(b, &b_id, true))
		return false;

	return netdev_phys_item_id_same(&a_id, &b_id);
}
EXPORT_SYMBOL(netdev_port_same_parent_id);

8675 8676 8677 8678 8679 8680 8681 8682 8683 8684 8685 8686 8687 8688 8689 8690 8691 8692 8693 8694
/**
 *	dev_change_proto_down - update protocol port state information
 *	@dev: device
 *	@proto_down: new value
 *
 *	This info can be used by switch drivers to set the phys state of the
 *	port.
 */
int dev_change_proto_down(struct net_device *dev, bool proto_down)
{
	const struct net_device_ops *ops = dev->netdev_ops;

	if (!ops->ndo_change_proto_down)
		return -EOPNOTSUPP;
	if (!netif_device_present(dev))
		return -ENODEV;
	return ops->ndo_change_proto_down(dev, proto_down);
}
EXPORT_SYMBOL(dev_change_proto_down);

8695 8696 8697 8698 8699 8700 8701 8702 8703 8704 8705 8706 8707 8708 8709 8710 8711 8712 8713
/**
 *	dev_change_proto_down_generic - generic implementation for
 * 	ndo_change_proto_down that sets carrier according to
 * 	proto_down.
 *
 *	@dev: device
 *	@proto_down: new value
 */
int dev_change_proto_down_generic(struct net_device *dev, bool proto_down)
{
	if (proto_down)
		netif_carrier_off(dev);
	else
		netif_carrier_on(dev);
	dev->proto_down = proto_down;
	return 0;
}
EXPORT_SYMBOL(dev_change_proto_down_generic);

8714 8715 8716 8717 8718 8719 8720 8721 8722 8723 8724 8725 8726 8727 8728 8729 8730 8731 8732 8733 8734 8735 8736 8737 8738
/**
 *	dev_change_proto_down_reason - proto down reason
 *
 *	@dev: device
 *	@mask: proto down mask
 *	@value: proto down value
 */
void dev_change_proto_down_reason(struct net_device *dev, unsigned long mask,
				  u32 value)
{
	int b;

	if (!mask) {
		dev->proto_down_reason = value;
	} else {
		for_each_set_bit(b, &mask, 32) {
			if (value & (1 << b))
				dev->proto_down_reason |= BIT(b);
			else
				dev->proto_down_reason &= ~BIT(b);
		}
	}
}
EXPORT_SYMBOL(dev_change_proto_down_reason);

8739 8740 8741 8742 8743 8744
struct bpf_xdp_link {
	struct bpf_link link;
	struct net_device *dev; /* protected by rtnl_lock, no refcnt held */
	int flags;
};

8745
static enum bpf_xdp_mode dev_xdp_mode(u32 flags)
8746
{
8747 8748 8749 8750 8751 8752
	if (flags & XDP_FLAGS_HW_MODE)
		return XDP_MODE_HW;
	if (flags & XDP_FLAGS_DRV_MODE)
		return XDP_MODE_DRV;
	return XDP_MODE_SKB;
}
8753

8754 8755 8756 8757 8758 8759 8760 8761 8762 8763 8764 8765
static bpf_op_t dev_xdp_bpf_op(struct net_device *dev, enum bpf_xdp_mode mode)
{
	switch (mode) {
	case XDP_MODE_SKB:
		return generic_xdp_install;
	case XDP_MODE_DRV:
	case XDP_MODE_HW:
		return dev->netdev_ops->ndo_bpf;
	default:
		return NULL;
	};
}
8766

8767 8768 8769 8770 8771 8772
static struct bpf_xdp_link *dev_xdp_link(struct net_device *dev,
					 enum bpf_xdp_mode mode)
{
	return dev->xdp_state[mode].link;
}

8773 8774 8775
static struct bpf_prog *dev_xdp_prog(struct net_device *dev,
				     enum bpf_xdp_mode mode)
{
8776 8777 8778 8779
	struct bpf_xdp_link *link = dev_xdp_link(dev, mode);

	if (link)
		return link->link.prog;
8780 8781 8782 8783 8784 8785
	return dev->xdp_state[mode].prog;
}

u32 dev_xdp_prog_id(struct net_device *dev, enum bpf_xdp_mode mode)
{
	struct bpf_prog *prog = dev_xdp_prog(dev, mode);
8786

8787 8788
	return prog ? prog->aux->id : 0;
}
M
Martin KaFai Lau 已提交
8789

8790 8791 8792 8793 8794
static void dev_xdp_set_link(struct net_device *dev, enum bpf_xdp_mode mode,
			     struct bpf_xdp_link *link)
{
	dev->xdp_state[mode].link = link;
	dev->xdp_state[mode].prog = NULL;
8795 8796
}

8797 8798 8799
static void dev_xdp_set_prog(struct net_device *dev, enum bpf_xdp_mode mode,
			     struct bpf_prog *prog)
{
8800
	dev->xdp_state[mode].link = NULL;
8801
	dev->xdp_state[mode].prog = prog;
8802 8803
}

8804 8805 8806
static int dev_xdp_install(struct net_device *dev, enum bpf_xdp_mode mode,
			   bpf_op_t bpf_op, struct netlink_ext_ack *extack,
			   u32 flags, struct bpf_prog *prog)
8807
{
8808
	struct netdev_bpf xdp;
8809 8810
	int err;

8811
	memset(&xdp, 0, sizeof(xdp));
8812
	xdp.command = mode == XDP_MODE_HW ? XDP_SETUP_PROG_HW : XDP_SETUP_PROG;
8813
	xdp.extack = extack;
8814
	xdp.flags = flags;
8815 8816
	xdp.prog = prog;

8817 8818 8819 8820 8821 8822 8823 8824
	/* Drivers assume refcnt is already incremented (i.e, prog pointer is
	 * "moved" into driver), so they don't increment it on their own, but
	 * they do decrement refcnt when program is detached or replaced.
	 * Given net_device also owns link/prog, we need to bump refcnt here
	 * to prevent drivers from underflowing it.
	 */
	if (prog)
		bpf_prog_inc(prog);
8825
	err = bpf_op(dev, &xdp);
8826 8827 8828 8829 8830
	if (err) {
		if (prog)
			bpf_prog_put(prog);
		return err;
	}
8831

8832 8833
	if (mode != XDP_MODE_HW)
		bpf_prog_change_xdp(dev_xdp_prog(dev, mode), prog);
8834

8835
	return 0;
8836 8837
}

8838 8839
static void dev_xdp_uninstall(struct net_device *dev)
{
8840
	struct bpf_xdp_link *link;
8841 8842 8843
	struct bpf_prog *prog;
	enum bpf_xdp_mode mode;
	bpf_op_t bpf_op;
8844

8845
	ASSERT_RTNL();
8846

8847 8848 8849 8850
	for (mode = XDP_MODE_SKB; mode < __MAX_XDP_MODE; mode++) {
		prog = dev_xdp_prog(dev, mode);
		if (!prog)
			continue;
8851

8852 8853 8854
		bpf_op = dev_xdp_bpf_op(dev, mode);
		if (!bpf_op)
			continue;
8855

8856 8857
		WARN_ON(dev_xdp_install(dev, mode, bpf_op, NULL, 0, NULL));

8858 8859 8860 8861 8862 8863 8864 8865
		/* auto-detach link from net device */
		link = dev_xdp_link(dev, mode);
		if (link)
			link->dev = NULL;
		else
			bpf_prog_put(prog);

		dev_xdp_set_link(dev, mode, NULL);
8866
	}
8867 8868
}

8869
static int dev_xdp_attach(struct net_device *dev, struct netlink_ext_ack *extack,
8870 8871
			  struct bpf_xdp_link *link, struct bpf_prog *new_prog,
			  struct bpf_prog *old_prog, u32 flags)
8872
{
8873 8874
	struct bpf_prog *cur_prog;
	enum bpf_xdp_mode mode;
8875
	bpf_op_t bpf_op;
8876 8877
	int err;

8878 8879
	ASSERT_RTNL();

8880 8881 8882 8883 8884 8885 8886 8887
	/* either link or prog attachment, never both */
	if (link && (new_prog || old_prog))
		return -EINVAL;
	/* link supports only XDP mode flags */
	if (link && (flags & ~XDP_FLAGS_MODES)) {
		NL_SET_ERR_MSG(extack, "Invalid XDP flags for BPF link attachment");
		return -EINVAL;
	}
8888 8889 8890 8891 8892 8893 8894 8895 8896
	/* just one XDP mode bit should be set, zero defaults to SKB mode */
	if (hweight32(flags & XDP_FLAGS_MODES) > 1) {
		NL_SET_ERR_MSG(extack, "Only one XDP mode flag can be set");
		return -EINVAL;
	}
	/* old_prog != NULL implies XDP_FLAGS_REPLACE is set */
	if (old_prog && !(flags & XDP_FLAGS_REPLACE)) {
		NL_SET_ERR_MSG(extack, "XDP_FLAGS_REPLACE is not specified");
		return -EINVAL;
8897
	}
8898

8899
	mode = dev_xdp_mode(flags);
8900 8901 8902 8903
	/* can't replace attached link */
	if (dev_xdp_link(dev, mode)) {
		NL_SET_ERR_MSG(extack, "Can't replace active BPF XDP link");
		return -EBUSY;
8904
	}
8905

8906
	cur_prog = dev_xdp_prog(dev, mode);
8907 8908 8909 8910 8911
	/* can't replace attached prog with link */
	if (link && cur_prog) {
		NL_SET_ERR_MSG(extack, "Can't replace active XDP program with BPF link");
		return -EBUSY;
	}
8912 8913 8914
	if ((flags & XDP_FLAGS_REPLACE) && cur_prog != old_prog) {
		NL_SET_ERR_MSG(extack, "Active program does not match expected");
		return -EEXIST;
8915
	}
8916

8917 8918 8919
	/* put effective new program into new_prog */
	if (link)
		new_prog = link->link.prog;
8920

8921 8922
	if (new_prog) {
		bool offload = mode == XDP_MODE_HW;
8923 8924
		enum bpf_xdp_mode other_mode = mode == XDP_MODE_SKB
					       ? XDP_MODE_DRV : XDP_MODE_SKB;
8925

8926 8927 8928 8929
		if ((flags & XDP_FLAGS_UPDATE_IF_NOEXIST) && cur_prog) {
			NL_SET_ERR_MSG(extack, "XDP program already attached");
			return -EBUSY;
		}
8930
		if (!offload && dev_xdp_prog(dev, other_mode)) {
8931
			NL_SET_ERR_MSG(extack, "Native and generic XDP can't be active at the same time");
8932
			return -EEXIST;
8933
		}
8934
		if (!offload && bpf_prog_is_dev_bound(new_prog->aux)) {
8935
			NL_SET_ERR_MSG(extack, "Using device-bound program without HW_MODE flag is not supported");
8936 8937
			return -EINVAL;
		}
8938
		if (new_prog->expected_attach_type == BPF_XDP_DEVMAP) {
8939 8940 8941
			NL_SET_ERR_MSG(extack, "BPF_XDP_DEVMAP programs can not be attached to a device");
			return -EINVAL;
		}
8942 8943
		if (new_prog->expected_attach_type == BPF_XDP_CPUMAP) {
			NL_SET_ERR_MSG(extack, "BPF_XDP_CPUMAP programs can not be attached to a device");
8944 8945
			return -EINVAL;
		}
8946
	}
8947

8948 8949 8950 8951 8952 8953
	/* don't call drivers if the effective program didn't change */
	if (new_prog != cur_prog) {
		bpf_op = dev_xdp_bpf_op(dev, mode);
		if (!bpf_op) {
			NL_SET_ERR_MSG(extack, "Underlying driver does not support XDP in native mode");
			return -EOPNOTSUPP;
8954
		}
8955

8956 8957 8958
		err = dev_xdp_install(dev, mode, bpf_op, extack, flags, new_prog);
		if (err)
			return err;
8959
	}
8960

8961 8962 8963 8964
	if (link)
		dev_xdp_set_link(dev, mode, link);
	else
		dev_xdp_set_prog(dev, mode, new_prog);
8965 8966
	if (cur_prog)
		bpf_prog_put(cur_prog);
8967

8968
	return 0;
8969 8970
}

8971 8972 8973 8974 8975 8976 8977 8978 8979 8980 8981 8982 8983 8984 8985 8986 8987 8988 8989 8990 8991 8992 8993 8994 8995 8996 8997 8998 8999 9000 9001 9002 9003 9004 9005
static int dev_xdp_attach_link(struct net_device *dev,
			       struct netlink_ext_ack *extack,
			       struct bpf_xdp_link *link)
{
	return dev_xdp_attach(dev, extack, link, NULL, NULL, link->flags);
}

static int dev_xdp_detach_link(struct net_device *dev,
			       struct netlink_ext_ack *extack,
			       struct bpf_xdp_link *link)
{
	enum bpf_xdp_mode mode;
	bpf_op_t bpf_op;

	ASSERT_RTNL();

	mode = dev_xdp_mode(link->flags);
	if (dev_xdp_link(dev, mode) != link)
		return -EINVAL;

	bpf_op = dev_xdp_bpf_op(dev, mode);
	WARN_ON(dev_xdp_install(dev, mode, bpf_op, NULL, 0, NULL));
	dev_xdp_set_link(dev, mode, NULL);
	return 0;
}

static void bpf_xdp_link_release(struct bpf_link *link)
{
	struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);

	rtnl_lock();

	/* if racing with net_device's tear down, xdp_link->dev might be
	 * already NULL, in which case link was already auto-detached
	 */
9006
	if (xdp_link->dev) {
9007
		WARN_ON(dev_xdp_detach_link(xdp_link->dev, NULL, xdp_link));
9008 9009
		xdp_link->dev = NULL;
	}
9010 9011 9012 9013

	rtnl_unlock();
}

9014 9015 9016 9017 9018 9019
static int bpf_xdp_link_detach(struct bpf_link *link)
{
	bpf_xdp_link_release(link);
	return 0;
}

9020 9021 9022 9023 9024 9025 9026
static void bpf_xdp_link_dealloc(struct bpf_link *link)
{
	struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);

	kfree(xdp_link);
}

9027 9028 9029 9030 9031 9032 9033 9034 9035 9036 9037 9038 9039 9040 9041 9042 9043 9044 9045 9046 9047 9048 9049 9050 9051 9052 9053 9054 9055
static void bpf_xdp_link_show_fdinfo(const struct bpf_link *link,
				     struct seq_file *seq)
{
	struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
	u32 ifindex = 0;

	rtnl_lock();
	if (xdp_link->dev)
		ifindex = xdp_link->dev->ifindex;
	rtnl_unlock();

	seq_printf(seq, "ifindex:\t%u\n", ifindex);
}

static int bpf_xdp_link_fill_link_info(const struct bpf_link *link,
				       struct bpf_link_info *info)
{
	struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
	u32 ifindex = 0;

	rtnl_lock();
	if (xdp_link->dev)
		ifindex = xdp_link->dev->ifindex;
	rtnl_unlock();

	info->xdp.ifindex = ifindex;
	return 0;
}

9056 9057 9058 9059 9060 9061 9062 9063 9064 9065 9066 9067 9068 9069 9070 9071 9072 9073 9074 9075 9076 9077 9078 9079 9080 9081 9082 9083 9084 9085 9086 9087 9088 9089 9090 9091 9092 9093 9094 9095 9096 9097
static int bpf_xdp_link_update(struct bpf_link *link, struct bpf_prog *new_prog,
			       struct bpf_prog *old_prog)
{
	struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
	enum bpf_xdp_mode mode;
	bpf_op_t bpf_op;
	int err = 0;

	rtnl_lock();

	/* link might have been auto-released already, so fail */
	if (!xdp_link->dev) {
		err = -ENOLINK;
		goto out_unlock;
	}

	if (old_prog && link->prog != old_prog) {
		err = -EPERM;
		goto out_unlock;
	}
	old_prog = link->prog;
	if (old_prog == new_prog) {
		/* no-op, don't disturb drivers */
		bpf_prog_put(new_prog);
		goto out_unlock;
	}

	mode = dev_xdp_mode(xdp_link->flags);
	bpf_op = dev_xdp_bpf_op(xdp_link->dev, mode);
	err = dev_xdp_install(xdp_link->dev, mode, bpf_op, NULL,
			      xdp_link->flags, new_prog);
	if (err)
		goto out_unlock;

	old_prog = xchg(&link->prog, new_prog);
	bpf_prog_put(old_prog);

out_unlock:
	rtnl_unlock();
	return err;
}

9098 9099 9100
static const struct bpf_link_ops bpf_xdp_link_lops = {
	.release = bpf_xdp_link_release,
	.dealloc = bpf_xdp_link_dealloc,
9101
	.detach = bpf_xdp_link_detach,
9102 9103
	.show_fdinfo = bpf_xdp_link_show_fdinfo,
	.fill_link_info = bpf_xdp_link_fill_link_info,
9104
	.update_prog = bpf_xdp_link_update,
9105 9106 9107 9108 9109 9110 9111 9112 9113 9114 9115 9116 9117 9118 9119 9120 9121 9122 9123 9124 9125 9126 9127 9128 9129 9130 9131 9132 9133 9134 9135 9136 9137 9138 9139 9140 9141 9142 9143 9144 9145 9146 9147 9148 9149 9150 9151 9152 9153
};

int bpf_xdp_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
{
	struct net *net = current->nsproxy->net_ns;
	struct bpf_link_primer link_primer;
	struct bpf_xdp_link *link;
	struct net_device *dev;
	int err, fd;

	dev = dev_get_by_index(net, attr->link_create.target_ifindex);
	if (!dev)
		return -EINVAL;

	link = kzalloc(sizeof(*link), GFP_USER);
	if (!link) {
		err = -ENOMEM;
		goto out_put_dev;
	}

	bpf_link_init(&link->link, BPF_LINK_TYPE_XDP, &bpf_xdp_link_lops, prog);
	link->dev = dev;
	link->flags = attr->link_create.flags;

	err = bpf_link_prime(&link->link, &link_primer);
	if (err) {
		kfree(link);
		goto out_put_dev;
	}

	rtnl_lock();
	err = dev_xdp_attach_link(dev, NULL, link);
	rtnl_unlock();

	if (err) {
		bpf_link_cleanup(&link_primer);
		goto out_put_dev;
	}

	fd = bpf_link_settle(&link_primer);
	/* link itself doesn't hold dev's refcnt to not complicate shutdown */
	dev_put(dev);
	return fd;

out_put_dev:
	dev_put(dev);
	return err;
}

9154 9155 9156 9157 9158 9159 9160 9161 9162 9163 9164 9165 9166 9167 9168 9169 9170 9171 9172 9173 9174 9175 9176 9177 9178 9179 9180 9181 9182 9183 9184 9185 9186
/**
 *	dev_change_xdp_fd - set or clear a bpf program for a device rx path
 *	@dev: device
 *	@extack: netlink extended ack
 *	@fd: new program fd or negative value to clear
 *	@expected_fd: old program fd that userspace expects to replace or clear
 *	@flags: xdp-related flags
 *
 *	Set or clear a bpf program for a device
 */
int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack,
		      int fd, int expected_fd, u32 flags)
{
	enum bpf_xdp_mode mode = dev_xdp_mode(flags);
	struct bpf_prog *new_prog = NULL, *old_prog = NULL;
	int err;

	ASSERT_RTNL();

	if (fd >= 0) {
		new_prog = bpf_prog_get_type_dev(fd, BPF_PROG_TYPE_XDP,
						 mode != XDP_MODE_SKB);
		if (IS_ERR(new_prog))
			return PTR_ERR(new_prog);
	}

	if (expected_fd >= 0) {
		old_prog = bpf_prog_get_type_dev(expected_fd, BPF_PROG_TYPE_XDP,
						 mode != XDP_MODE_SKB);
		if (IS_ERR(old_prog)) {
			err = PTR_ERR(old_prog);
			old_prog = NULL;
			goto err_out;
9187
		}
9188 9189
	}

9190
	err = dev_xdp_attach(dev, extack, NULL, new_prog, old_prog, flags);
9191

9192 9193 9194 9195 9196
err_out:
	if (err && new_prog)
		bpf_prog_put(new_prog);
	if (old_prog)
		bpf_prog_put(old_prog);
9197 9198 9199
	return err;
}

L
Linus Torvalds 已提交
9200 9201
/**
 *	dev_new_index	-	allocate an ifindex
9202
 *	@net: the applicable net namespace
L
Linus Torvalds 已提交
9203 9204 9205 9206 9207
 *
 *	Returns a suitable unique value for a new device interface
 *	number.  The caller must hold the rtnl semaphore or the
 *	dev_base_lock to be sure it remains unique.
 */
9208
static int dev_new_index(struct net *net)
L
Linus Torvalds 已提交
9209
{
9210
	int ifindex = net->ifindex;
9211

L
Linus Torvalds 已提交
9212 9213 9214
	for (;;) {
		if (++ifindex <= 0)
			ifindex = 1;
9215
		if (!__dev_get_by_index(net, ifindex))
9216
			return net->ifindex = ifindex;
L
Linus Torvalds 已提交
9217 9218 9219 9220
	}
}

/* Delayed registration/unregisteration */
9221
static LIST_HEAD(net_todo_list);
9222
DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);
L
Linus Torvalds 已提交
9223

9224
static void net_set_todo(struct net_device *dev)
L
Linus Torvalds 已提交
9225 9226
{
	list_add_tail(&dev->todo_list, &net_todo_list);
9227
	dev_net(dev)->dev_unreg_count++;
L
Linus Torvalds 已提交
9228 9229
}

9230
static void rollback_registered_many(struct list_head *head)
9231
{
9232
	struct net_device *dev, *tmp;
9233
	LIST_HEAD(close_head);
9234

9235 9236 9237
	BUG_ON(dev_boot_phase);
	ASSERT_RTNL();

9238
	list_for_each_entry_safe(dev, tmp, head, unreg_list) {
9239
		/* Some devices call without registering
9240 9241
		 * for initialization unwind. Remove those
		 * devices and proceed with the remaining.
9242 9243
		 */
		if (dev->reg_state == NETREG_UNINITIALIZED) {
9244 9245
			pr_debug("unregister_netdevice: device %s/%p never was registered\n",
				 dev->name, dev);
9246

9247
			WARN_ON(1);
9248 9249
			list_del(&dev->unreg_list);
			continue;
9250
		}
9251
		dev->dismantle = true;
9252
		BUG_ON(dev->reg_state != NETREG_REGISTERED);
9253
	}
9254

9255
	/* If device is running, close it first. */
9256 9257
	list_for_each_entry(dev, head, unreg_list)
		list_add_tail(&dev->close_list, &close_head);
9258
	dev_close_many(&close_head, true);
9259

9260
	list_for_each_entry(dev, head, unreg_list) {
9261 9262
		/* And unlink it from device chain. */
		unlist_netdevice(dev);
9263

9264 9265
		dev->reg_state = NETREG_UNREGISTERING;
	}
9266
	flush_all_backlogs();
9267 9268 9269

	synchronize_net();

9270
	list_for_each_entry(dev, head, unreg_list) {
9271 9272
		struct sk_buff *skb = NULL;

9273 9274
		/* Shutdown queueing discipline. */
		dev_shutdown(dev);
9275

9276
		dev_xdp_uninstall(dev);
9277

9278
		/* Notify protocols, that we are about to destroy
9279 9280
		 * this device. They should clean all the things.
		 */
9281
		call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
9282

9283 9284
		if (!dev->rtnl_link_ops ||
		    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
9285
			skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U, 0,
9286
						     GFP_KERNEL, NULL, 0);
9287

9288 9289 9290
		/*
		 *	Flush the unicast and multicast chains
		 */
9291
		dev_uc_flush(dev);
9292
		dev_mc_flush(dev);
9293

9294
		netdev_name_node_alt_flush(dev);
9295 9296
		netdev_name_node_free(dev->name_node);

9297 9298
		if (dev->netdev_ops->ndo_uninit)
			dev->netdev_ops->ndo_uninit(dev);
9299

9300 9301
		if (skb)
			rtmsg_ifinfo_send(skb, dev, GFP_KERNEL);
9302

J
Jiri Pirko 已提交
9303 9304
		/* Notifier chain MUST detach us all upper devices. */
		WARN_ON(netdev_has_any_upper_dev(dev));
9305
		WARN_ON(netdev_has_any_lower_dev(dev));
9306

9307 9308
		/* Remove entries from kobject tree */
		netdev_unregister_kobject(dev);
9309 9310 9311 9312
#ifdef CONFIG_XPS
		/* Remove XPS queueing entries */
		netif_reset_xps_queues_gt(dev, 0);
#endif
9313
	}
9314

9315
	synchronize_net();
9316

9317
	list_for_each_entry(dev, head, unreg_list)
9318 9319 9320 9321 9322 9323 9324 9325 9326
		dev_put(dev);
}

static void rollback_registered(struct net_device *dev)
{
	LIST_HEAD(single);

	list_add(&dev->unreg_list, &single);
	rollback_registered_many(&single);
E
Eric Dumazet 已提交
9327
	list_del(&single);
9328 9329
}

9330 9331 9332 9333 9334
static netdev_features_t netdev_sync_upper_features(struct net_device *lower,
	struct net_device *upper, netdev_features_t features)
{
	netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
	netdev_features_t feature;
9335
	int feature_bit;
9336

9337
	for_each_netdev_feature(upper_disables, feature_bit) {
9338
		feature = __NETIF_F_BIT(feature_bit);
9339 9340 9341 9342 9343 9344 9345 9346 9347 9348 9349 9350 9351 9352 9353 9354
		if (!(upper->wanted_features & feature)
		    && (features & feature)) {
			netdev_dbg(lower, "Dropping feature %pNF, upper dev %s has it off.\n",
				   &feature, upper->name);
			features &= ~feature;
		}
	}

	return features;
}

static void netdev_sync_lower_features(struct net_device *upper,
	struct net_device *lower, netdev_features_t features)
{
	netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
	netdev_features_t feature;
9355
	int feature_bit;
9356

9357
	for_each_netdev_feature(upper_disables, feature_bit) {
9358
		feature = __NETIF_F_BIT(feature_bit);
9359 9360 9361 9362
		if (!(features & feature) && (lower->features & feature)) {
			netdev_dbg(upper, "Disabling feature %pNF on lower dev %s.\n",
				   &feature, lower->name);
			lower->wanted_features &= ~feature;
9363
			__netdev_update_features(lower);
9364 9365 9366 9367

			if (unlikely(lower->features & feature))
				netdev_WARN(upper, "failed to disable %pNF on %s!\n",
					    &feature, lower->name);
9368 9369
			else
				netdev_features_change(lower);
9370 9371 9372 9373
		}
	}
}

9374 9375
static netdev_features_t netdev_fix_features(struct net_device *dev,
	netdev_features_t features)
9376
{
9377 9378 9379
	/* Fix illegal checksum combinations */
	if ((features & NETIF_F_HW_CSUM) &&
	    (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
9380
		netdev_warn(dev, "mixed HW and IP checksum settings.\n");
9381 9382 9383
		features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
	}

9384
	/* TSO requires that SG is present as well. */
9385
	if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
9386
		netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
9387
		features &= ~NETIF_F_ALL_TSO;
9388 9389
	}

9390 9391 9392 9393 9394 9395 9396 9397 9398 9399 9400 9401 9402
	if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) &&
					!(features & NETIF_F_IP_CSUM)) {
		netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n");
		features &= ~NETIF_F_TSO;
		features &= ~NETIF_F_TSO_ECN;
	}

	if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) &&
					 !(features & NETIF_F_IPV6_CSUM)) {
		netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n");
		features &= ~NETIF_F_TSO6;
	}

9403 9404 9405 9406
	/* TSO with IPv4 ID mangling requires IPv4 TSO be enabled */
	if ((features & NETIF_F_TSO_MANGLEID) && !(features & NETIF_F_TSO))
		features &= ~NETIF_F_TSO_MANGLEID;

9407 9408 9409 9410
	/* TSO ECN requires that TSO is present as well. */
	if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
		features &= ~NETIF_F_TSO_ECN;

9411 9412
	/* Software GSO depends on SG. */
	if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
9413
		netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
9414 9415 9416
		features &= ~NETIF_F_GSO;
	}

9417 9418 9419 9420 9421 9422 9423 9424
	/* GSO partial features require GSO partial be set */
	if ((features & dev->gso_partial_features) &&
	    !(features & NETIF_F_GSO_PARTIAL)) {
		netdev_dbg(dev,
			   "Dropping partially supported GSO features since no GSO partial.\n");
		features &= ~dev->gso_partial_features;
	}

M
Michael Chan 已提交
9425 9426 9427 9428 9429 9430 9431 9432 9433 9434 9435 9436
	if (!(features & NETIF_F_RXCSUM)) {
		/* NETIF_F_GRO_HW implies doing RXCSUM since every packet
		 * successfully merged by hardware must also have the
		 * checksum verified by hardware.  If the user does not
		 * want to enable RXCSUM, logically, we should disable GRO_HW.
		 */
		if (features & NETIF_F_GRO_HW) {
			netdev_dbg(dev, "Dropping NETIF_F_GRO_HW since no RXCSUM feature.\n");
			features &= ~NETIF_F_GRO_HW;
		}
	}

9437 9438 9439 9440 9441 9442 9443 9444 9445 9446 9447
	/* LRO/HW-GRO features cannot be combined with RX-FCS */
	if (features & NETIF_F_RXFCS) {
		if (features & NETIF_F_LRO) {
			netdev_dbg(dev, "Dropping LRO feature since RX-FCS is requested.\n");
			features &= ~NETIF_F_LRO;
		}

		if (features & NETIF_F_GRO_HW) {
			netdev_dbg(dev, "Dropping HW-GRO feature since RX-FCS is requested.\n");
			features &= ~NETIF_F_GRO_HW;
		}
9448 9449
	}

9450 9451 9452
	return features;
}

9453
int __netdev_update_features(struct net_device *dev)
9454
{
9455
	struct net_device *upper, *lower;
9456
	netdev_features_t features;
9457
	struct list_head *iter;
9458
	int err = -1;
9459

9460 9461
	ASSERT_RTNL();

9462 9463 9464 9465 9466 9467 9468 9469
	features = netdev_get_wanted_features(dev);

	if (dev->netdev_ops->ndo_fix_features)
		features = dev->netdev_ops->ndo_fix_features(dev, features);

	/* driver might be less strict about feature dependencies */
	features = netdev_fix_features(dev, features);

9470 9471 9472 9473
	/* some features can't be enabled if they're off an an upper device */
	netdev_for_each_upper_dev_rcu(dev, upper, iter)
		features = netdev_sync_upper_features(dev, upper, features);

9474
	if (dev->features == features)
9475
		goto sync_lower;
9476

9477 9478
	netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
		&dev->features, &features);
9479 9480 9481

	if (dev->netdev_ops->ndo_set_features)
		err = dev->netdev_ops->ndo_set_features(dev, features);
9482 9483
	else
		err = 0;
9484

9485
	if (unlikely(err < 0)) {
9486
		netdev_err(dev,
9487 9488
			"set_features() failed (%d); wanted %pNF, left %pNF\n",
			err, &features, &dev->features);
9489 9490 9491 9492
		/* return non-0 since some features might have changed and
		 * it's better to fire a spurious notification than miss it
		 */
		return -1;
9493 9494
	}

9495
sync_lower:
9496 9497 9498 9499 9500 9501
	/* some features must be disabled on lower devices when disabled
	 * on an upper device (think: bonding master or bridge)
	 */
	netdev_for_each_lower_dev(dev, lower, iter)
		netdev_sync_lower_features(dev, lower, features);

9502 9503 9504 9505 9506 9507 9508 9509 9510 9511 9512 9513 9514 9515 9516 9517 9518 9519 9520
	if (!err) {
		netdev_features_t diff = features ^ dev->features;

		if (diff & NETIF_F_RX_UDP_TUNNEL_PORT) {
			/* udp_tunnel_{get,drop}_rx_info both need
			 * NETIF_F_RX_UDP_TUNNEL_PORT enabled on the
			 * device, or they won't do anything.
			 * Thus we need to update dev->features
			 * *before* calling udp_tunnel_get_rx_info,
			 * but *after* calling udp_tunnel_drop_rx_info.
			 */
			if (features & NETIF_F_RX_UDP_TUNNEL_PORT) {
				dev->features = features;
				udp_tunnel_get_rx_info(dev);
			} else {
				udp_tunnel_drop_rx_info(dev);
			}
		}

9521 9522 9523 9524 9525 9526 9527 9528 9529 9530 9531 9532 9533 9534 9535 9536 9537 9538
		if (diff & NETIF_F_HW_VLAN_CTAG_FILTER) {
			if (features & NETIF_F_HW_VLAN_CTAG_FILTER) {
				dev->features = features;
				err |= vlan_get_rx_ctag_filter_info(dev);
			} else {
				vlan_drop_rx_ctag_filter_info(dev);
			}
		}

		if (diff & NETIF_F_HW_VLAN_STAG_FILTER) {
			if (features & NETIF_F_HW_VLAN_STAG_FILTER) {
				dev->features = features;
				err |= vlan_get_rx_stag_filter_info(dev);
			} else {
				vlan_drop_rx_stag_filter_info(dev);
			}
		}

9539
		dev->features = features;
9540
	}
9541

9542
	return err < 0 ? 0 : 1;
9543 9544
}

9545 9546 9547 9548 9549 9550 9551 9552
/**
 *	netdev_update_features - recalculate device features
 *	@dev: the device to check
 *
 *	Recalculate dev->features set and send notifications if it
 *	has changed. Should be called after driver or hardware dependent
 *	conditions might have changed that influence the features.
 */
9553 9554 9555 9556
void netdev_update_features(struct net_device *dev)
{
	if (__netdev_update_features(dev))
		netdev_features_change(dev);
9557 9558 9559
}
EXPORT_SYMBOL(netdev_update_features);

9560 9561 9562 9563 9564 9565 9566 9567 9568 9569 9570 9571 9572 9573 9574 9575 9576
/**
 *	netdev_change_features - recalculate device features
 *	@dev: the device to check
 *
 *	Recalculate dev->features set and send notifications even
 *	if they have not changed. Should be called instead of
 *	netdev_update_features() if also dev->vlan_features might
 *	have changed to allow the changes to be propagated to stacked
 *	VLAN devices.
 */
void netdev_change_features(struct net_device *dev)
{
	__netdev_update_features(dev);
	netdev_features_change(dev);
}
EXPORT_SYMBOL(netdev_change_features);

9577 9578 9579 9580 9581 9582 9583 9584 9585 9586 9587 9588 9589 9590 9591 9592 9593
/**
 *	netif_stacked_transfer_operstate -	transfer operstate
 *	@rootdev: the root or lower level device to transfer state from
 *	@dev: the device to transfer operstate to
 *
 *	Transfer operational state from root to device. This is normally
 *	called when a stacking relationship exists between the root
 *	device and the device(a leaf device).
 */
void netif_stacked_transfer_operstate(const struct net_device *rootdev,
					struct net_device *dev)
{
	if (rootdev->operstate == IF_OPER_DORMANT)
		netif_dormant_on(dev);
	else
		netif_dormant_off(dev);

A
Andrew Lunn 已提交
9594 9595 9596 9597 9598
	if (rootdev->operstate == IF_OPER_TESTING)
		netif_testing_on(dev);
	else
		netif_testing_off(dev);

9599 9600 9601 9602
	if (netif_carrier_ok(rootdev))
		netif_carrier_on(dev);
	else
		netif_carrier_off(dev);
9603 9604 9605
}
EXPORT_SYMBOL(netif_stacked_transfer_operstate);

9606 9607 9608
static int netif_alloc_rx_queues(struct net_device *dev)
{
	unsigned int i, count = dev->num_rx_queues;
T
Tom Herbert 已提交
9609
	struct netdev_rx_queue *rx;
9610
	size_t sz = count * sizeof(*rx);
9611
	int err = 0;
9612

T
Tom Herbert 已提交
9613
	BUG_ON(count < 1);
9614

9615
	rx = kvzalloc(sz, GFP_KERNEL | __GFP_RETRY_MAYFAIL);
9616 9617 9618
	if (!rx)
		return -ENOMEM;

T
Tom Herbert 已提交
9619 9620
	dev->_rx = rx;

9621
	for (i = 0; i < count; i++) {
T
Tom Herbert 已提交
9622
		rx[i].dev = dev;
9623 9624 9625 9626 9627 9628

		/* XDP RX-queue setup */
		err = xdp_rxq_info_reg(&rx[i].xdp_rxq, dev, i);
		if (err < 0)
			goto err_rxq_info;
	}
9629
	return 0;
9630 9631 9632 9633 9634

err_rxq_info:
	/* Rollback successful reg's and free other resources */
	while (i--)
		xdp_rxq_info_unreg(&rx[i].xdp_rxq);
9635
	kvfree(dev->_rx);
9636 9637 9638 9639 9640 9641 9642 9643 9644 9645 9646 9647 9648
	dev->_rx = NULL;
	return err;
}

static void netif_free_rx_queues(struct net_device *dev)
{
	unsigned int i, count = dev->num_rx_queues;

	/* netif_alloc_rx_queues alloc failed, resources have been unreg'ed */
	if (!dev->_rx)
		return;

	for (i = 0; i < count; i++)
J
Jakub Kicinski 已提交
9649 9650 9651
		xdp_rxq_info_unreg(&dev->_rx[i].xdp_rxq);

	kvfree(dev->_rx);
9652 9653
}

C
Changli Gao 已提交
9654 9655 9656 9657 9658
static void netdev_init_one_queue(struct net_device *dev,
				  struct netdev_queue *queue, void *_unused)
{
	/* Initialize queue lock */
	spin_lock_init(&queue->_xmit_lock);
9659
	netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
C
Changli Gao 已提交
9660
	queue->xmit_lock_owner = -1;
9661
	netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
C
Changli Gao 已提交
9662
	queue->dev = dev;
T
Tom Herbert 已提交
9663 9664 9665
#ifdef CONFIG_BQL
	dql_init(&queue->dql, HZ);
#endif
C
Changli Gao 已提交
9666 9667
}

9668 9669
static void netif_free_tx_queues(struct net_device *dev)
{
W
WANG Cong 已提交
9670
	kvfree(dev->_tx);
9671 9672
}

9673 9674 9675 9676
static int netif_alloc_netdev_queues(struct net_device *dev)
{
	unsigned int count = dev->num_tx_queues;
	struct netdev_queue *tx;
9677
	size_t sz = count * sizeof(*tx);
9678

9679 9680
	if (count < 1 || count > 0xffff)
		return -EINVAL;
9681

9682
	tx = kvzalloc(sz, GFP_KERNEL | __GFP_RETRY_MAYFAIL);
9683 9684 9685
	if (!tx)
		return -ENOMEM;

9686
	dev->_tx = tx;
T
Tom Herbert 已提交
9687

9688 9689
	netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
	spin_lock_init(&dev->tx_global_lock);
C
Changli Gao 已提交
9690 9691

	return 0;
9692 9693
}

9694 9695 9696 9697 9698 9699
void netif_tx_stop_all_queues(struct net_device *dev)
{
	unsigned int i;

	for (i = 0; i < dev->num_tx_queues; i++) {
		struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
9700

9701 9702 9703 9704 9705
		netif_tx_stop_queue(txq);
	}
}
EXPORT_SYMBOL(netif_tx_stop_all_queues);

L
Linus Torvalds 已提交
9706 9707 9708 9709 9710 9711 9712 9713 9714 9715 9716 9717 9718 9719 9720 9721 9722 9723 9724 9725
/**
 *	register_netdevice	- register a network device
 *	@dev: device to register
 *
 *	Take a completed network device structure and add it to the kernel
 *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
 *	chain. 0 is returned on success. A negative errno code is returned
 *	on a failure to set up the device, or if the name is a duplicate.
 *
 *	Callers must hold the rtnl semaphore. You may want
 *	register_netdev() instead of this.
 *
 *	BUGS:
 *	The locking appears insufficient to guarantee two parallel registers
 *	will not get the same name.
 */

int register_netdevice(struct net_device *dev)
{
	int ret;
9726
	struct net *net = dev_net(dev);
L
Linus Torvalds 已提交
9727

9728 9729
	BUILD_BUG_ON(sizeof(netdev_features_t) * BITS_PER_BYTE <
		     NETDEV_FEATURE_COUNT);
L
Linus Torvalds 已提交
9730 9731 9732
	BUG_ON(dev_boot_phase);
	ASSERT_RTNL();

9733 9734
	might_sleep();

L
Linus Torvalds 已提交
9735 9736
	/* When net_device's are persistent, this will be fatal. */
	BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
9737
	BUG_ON(!net);
L
Linus Torvalds 已提交
9738

9739 9740 9741 9742
	ret = ethtool_check_ops(dev->ethtool_ops);
	if (ret)
		return ret;

9743
	spin_lock_init(&dev->addr_list_lock);
9744
	netdev_set_addr_lockdep_class(dev);
L
Linus Torvalds 已提交
9745

9746
	ret = dev_get_valid_name(net, dev, dev->name);
9747 9748 9749
	if (ret < 0)
		goto out;

9750
	ret = -ENOMEM;
9751 9752 9753 9754
	dev->name_node = netdev_name_node_head_alloc(dev);
	if (!dev->name_node)
		goto out;

L
Linus Torvalds 已提交
9755
	/* Init, if this function is available */
9756 9757
	if (dev->netdev_ops->ndo_init) {
		ret = dev->netdev_ops->ndo_init(dev);
L
Linus Torvalds 已提交
9758 9759 9760
		if (ret) {
			if (ret > 0)
				ret = -EIO;
9761
			goto err_free_name;
L
Linus Torvalds 已提交
9762 9763
		}
	}
9764

9765 9766
	if (((dev->hw_features | dev->features) &
	     NETIF_F_HW_VLAN_CTAG_FILTER) &&
9767 9768 9769 9770 9771 9772 9773
	    (!dev->netdev_ops->ndo_vlan_rx_add_vid ||
	     !dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
		netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
		ret = -EINVAL;
		goto err_uninit;
	}

9774 9775 9776 9777 9778 9779
	ret = -EBUSY;
	if (!dev->ifindex)
		dev->ifindex = dev_new_index(net);
	else if (__dev_get_by_index(net, dev->ifindex))
		goto err_uninit;

9780 9781 9782
	/* Transfer changeable features to wanted_features and enable
	 * software offloads (GSO and GRO).
	 */
9783
	dev->hw_features |= (NETIF_F_SOFT_FEATURES | NETIF_F_SOFT_FEATURES_OFF);
9784
	dev->features |= NETIF_F_SOFT_FEATURES;
9785 9786 9787 9788 9789 9790

	if (dev->netdev_ops->ndo_udp_tunnel_add) {
		dev->features |= NETIF_F_RX_UDP_TUNNEL_PORT;
		dev->hw_features |= NETIF_F_RX_UDP_TUNNEL_PORT;
	}

9791
	dev->wanted_features = dev->features & dev->hw_features;
L
Linus Torvalds 已提交
9792

9793
	if (!(dev->flags & IFF_LOOPBACK))
9794
		dev->hw_features |= NETIF_F_NOCACHE_COPY;
9795

9796 9797 9798 9799 9800
	/* If IPv4 TCP segmentation offload is supported we should also
	 * allow the device to enable segmenting the frame with the option
	 * of ignoring a static IP ID value.  This doesn't enable the
	 * feature itself but allows the user to enable it later.
	 */
9801 9802
	if (dev->hw_features & NETIF_F_TSO)
		dev->hw_features |= NETIF_F_TSO_MANGLEID;
9803 9804 9805 9806 9807 9808
	if (dev->vlan_features & NETIF_F_TSO)
		dev->vlan_features |= NETIF_F_TSO_MANGLEID;
	if (dev->mpls_features & NETIF_F_TSO)
		dev->mpls_features |= NETIF_F_TSO_MANGLEID;
	if (dev->hw_enc_features & NETIF_F_TSO)
		dev->hw_enc_features |= NETIF_F_TSO_MANGLEID;
9809

9810
	/* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
9811
	 */
9812
	dev->vlan_features |= NETIF_F_HIGHDMA;
9813

9814 9815
	/* Make NETIF_F_SG inheritable to tunnel devices.
	 */
9816
	dev->hw_enc_features |= NETIF_F_SG | NETIF_F_GSO_PARTIAL;
9817

S
Simon Horman 已提交
9818 9819 9820 9821
	/* Make NETIF_F_SG inheritable to MPLS.
	 */
	dev->mpls_features |= NETIF_F_SG;

9822 9823 9824 9825 9826
	ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
	ret = notifier_to_errno(ret);
	if (ret)
		goto err_uninit;

9827
	ret = netdev_register_kobject(dev);
9828 9829
	if (ret) {
		dev->reg_state = NETREG_UNREGISTERED;
9830
		goto err_uninit;
9831
	}
9832 9833
	dev->reg_state = NETREG_REGISTERED;

9834
	__netdev_update_features(dev);
9835

L
Linus Torvalds 已提交
9836 9837 9838 9839 9840 9841 9842
	/*
	 *	Default initial state at registry is that the
	 *	device is present.
	 */

	set_bit(__LINK_STATE_PRESENT, &dev->state);

9843 9844
	linkwatch_init_dev(dev);

L
Linus Torvalds 已提交
9845 9846
	dev_init_scheduler(dev);
	dev_hold(dev);
9847
	list_netdevice(dev);
9848
	add_device_randomness(dev->dev_addr, dev->addr_len);
L
Linus Torvalds 已提交
9849

9850 9851 9852 9853 9854 9855 9856
	/* If the device has permanent device address, driver should
	 * set dev_addr and also addr_assign_type should be set to
	 * NET_ADDR_PERM (default value).
	 */
	if (dev->addr_assign_type == NET_ADDR_PERM)
		memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);

L
Linus Torvalds 已提交
9857
	/* Notify protocols, that a new device appeared. */
9858
	ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
9859
	ret = notifier_to_errno(ret);
9860 9861
	if (ret) {
		rollback_registered(dev);
9862 9863
		rcu_barrier();

9864
		dev->reg_state = NETREG_UNREGISTERED;
9865 9866 9867 9868 9869 9870 9871
		/* We should put the kobject that hold in
		 * netdev_unregister_kobject(), otherwise
		 * the net device cannot be freed when
		 * driver calls free_netdev(), because the
		 * kobject is being hold.
		 */
		kobject_put(&dev->dev.kobj);
9872
	}
9873 9874 9875 9876
	/*
	 *	Prevent userspace races by waiting until the network
	 *	device is fully setup before sending notifications.
	 */
9877 9878
	if (!dev->rtnl_link_ops ||
	    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
9879
		rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
L
Linus Torvalds 已提交
9880 9881 9882

out:
	return ret;
9883 9884

err_uninit:
9885 9886
	if (dev->netdev_ops->ndo_uninit)
		dev->netdev_ops->ndo_uninit(dev);
9887 9888
	if (dev->priv_destructor)
		dev->priv_destructor(dev);
9889 9890
err_free_name:
	netdev_name_node_free(dev->name_node);
9891
	goto out;
L
Linus Torvalds 已提交
9892
}
E
Eric Dumazet 已提交
9893
EXPORT_SYMBOL(register_netdevice);
L
Linus Torvalds 已提交
9894

9895 9896 9897 9898 9899 9900 9901 9902 9903 9904 9905 9906 9907 9908 9909 9910 9911 9912 9913 9914 9915 9916 9917 9918 9919 9920 9921 9922 9923 9924 9925
/**
 *	init_dummy_netdev	- init a dummy network device for NAPI
 *	@dev: device to init
 *
 *	This takes a network device structure and initialize the minimum
 *	amount of fields so it can be used to schedule NAPI polls without
 *	registering a full blown interface. This is to be used by drivers
 *	that need to tie several hardware interfaces to a single NAPI
 *	poll scheduler due to HW limitations.
 */
int init_dummy_netdev(struct net_device *dev)
{
	/* Clear everything. Note we don't initialize spinlocks
	 * are they aren't supposed to be taken by any of the
	 * NAPI code and this dummy netdev is supposed to be
	 * only ever used for NAPI polls
	 */
	memset(dev, 0, sizeof(struct net_device));

	/* make sure we BUG if trying to hit standard
	 * register/unregister code path
	 */
	dev->reg_state = NETREG_DUMMY;

	/* NAPI wants this */
	INIT_LIST_HEAD(&dev->napi_list);

	/* a dummy interface is started by default */
	set_bit(__LINK_STATE_PRESENT, &dev->state);
	set_bit(__LINK_STATE_START, &dev->state);

9926 9927 9928
	/* napi_busy_loop stats accounting wants this */
	dev_net_set(dev, &init_net);

E
Eric Dumazet 已提交
9929 9930 9931 9932 9933
	/* Note : We dont allocate pcpu_refcnt for dummy devices,
	 * because users of this 'device' dont need to change
	 * its refcount.
	 */

9934 9935 9936 9937 9938
	return 0;
}
EXPORT_SYMBOL_GPL(init_dummy_netdev);


L
Linus Torvalds 已提交
9939 9940 9941 9942 9943 9944 9945 9946 9947
/**
 *	register_netdev	- register a network device
 *	@dev: device to register
 *
 *	Take a completed network device structure and add it to the kernel
 *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
 *	chain. 0 is returned on success. A negative errno code is returned
 *	on a failure to set up the device, or if the name is a duplicate.
 *
9948
 *	This is a wrapper around register_netdevice that takes the rtnl semaphore
L
Linus Torvalds 已提交
9949 9950 9951 9952 9953 9954 9955
 *	and expands the device name if you passed a format string to
 *	alloc_netdev.
 */
int register_netdev(struct net_device *dev)
{
	int err;

9956 9957
	if (rtnl_lock_killable())
		return -EINTR;
L
Linus Torvalds 已提交
9958 9959 9960 9961 9962 9963
	err = register_netdevice(dev);
	rtnl_unlock();
	return err;
}
EXPORT_SYMBOL(register_netdev);

E
Eric Dumazet 已提交
9964 9965 9966 9967 9968 9969 9970 9971 9972 9973
int netdev_refcnt_read(const struct net_device *dev)
{
	int i, refcnt = 0;

	for_each_possible_cpu(i)
		refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
	return refcnt;
}
EXPORT_SYMBOL(netdev_refcnt_read);

9974
/**
L
Linus Torvalds 已提交
9975
 * netdev_wait_allrefs - wait until all references are gone.
9976
 * @dev: target net_device
L
Linus Torvalds 已提交
9977 9978 9979 9980 9981 9982 9983
 *
 * This is called when unregistering network devices.
 *
 * Any protocol or device that holds a reference should register
 * for netdevice notification, and cleanup and put back the
 * reference if they receive an UNREGISTER event.
 * We can get stuck here if buggy protocols don't correctly
9984
 * call dev_put.
L
Linus Torvalds 已提交
9985 9986 9987 9988
 */
static void netdev_wait_allrefs(struct net_device *dev)
{
	unsigned long rebroadcast_time, warning_time;
E
Eric Dumazet 已提交
9989
	int refcnt;
L
Linus Torvalds 已提交
9990

9991 9992
	linkwatch_forget_dev(dev);

L
Linus Torvalds 已提交
9993
	rebroadcast_time = warning_time = jiffies;
E
Eric Dumazet 已提交
9994 9995 9996
	refcnt = netdev_refcnt_read(dev);

	while (refcnt != 0) {
L
Linus Torvalds 已提交
9997
		if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
9998
			rtnl_lock();
L
Linus Torvalds 已提交
9999 10000

			/* Rebroadcast unregister notification */
10001
			call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
L
Linus Torvalds 已提交
10002

10003
			__rtnl_unlock();
10004
			rcu_barrier();
10005 10006
			rtnl_lock();

L
Linus Torvalds 已提交
10007 10008 10009 10010 10011 10012 10013 10014 10015 10016 10017
			if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
				     &dev->state)) {
				/* We must not have linkwatch events
				 * pending on unregister. If this
				 * happens, we simply run the queue
				 * unscheduled, resulting in a noop
				 * for this device.
				 */
				linkwatch_run_queue();
			}

10018
			__rtnl_unlock();
L
Linus Torvalds 已提交
10019 10020 10021 10022 10023 10024

			rebroadcast_time = jiffies;
		}

		msleep(250);

E
Eric Dumazet 已提交
10025 10026
		refcnt = netdev_refcnt_read(dev);

10027
		if (refcnt && time_after(jiffies, warning_time + 10 * HZ)) {
10028 10029
			pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
				 dev->name, refcnt);
L
Linus Torvalds 已提交
10030 10031 10032 10033 10034 10035 10036 10037 10038 10039 10040 10041 10042 10043 10044 10045 10046 10047 10048
			warning_time = jiffies;
		}
	}
}

/* The sequence is:
 *
 *	rtnl_lock();
 *	...
 *	register_netdevice(x1);
 *	register_netdevice(x2);
 *	...
 *	unregister_netdevice(y1);
 *	unregister_netdevice(y2);
 *      ...
 *	rtnl_unlock();
 *	free_netdev(y1);
 *	free_netdev(y2);
 *
H
Herbert Xu 已提交
10049
 * We are invoked by rtnl_unlock().
L
Linus Torvalds 已提交
10050
 * This allows us to deal with problems:
10051
 * 1) We can delete sysfs objects which invoke hotplug
L
Linus Torvalds 已提交
10052 10053 10054
 *    without deadlocking with linkwatch via keventd.
 * 2) Since we run with the RTNL semaphore not held, we can sleep
 *    safely in order to wait for the netdev refcnt to drop to zero.
H
Herbert Xu 已提交
10055 10056 10057
 *
 * We must not return until all unregister events added during
 * the interval the lock was held have been completed.
L
Linus Torvalds 已提交
10058 10059 10060
 */
void netdev_run_todo(void)
{
10061
	struct list_head list;
L
Linus Torvalds 已提交
10062 10063

	/* Snapshot list, allow later requests */
10064
	list_replace_init(&net_todo_list, &list);
H
Herbert Xu 已提交
10065 10066

	__rtnl_unlock();
10067

10068 10069

	/* Wait for rcu callbacks to finish before next phase */
10070 10071 10072
	if (!list_empty(&list))
		rcu_barrier();

L
Linus Torvalds 已提交
10073 10074
	while (!list_empty(&list)) {
		struct net_device *dev
10075
			= list_first_entry(&list, struct net_device, todo_list);
L
Linus Torvalds 已提交
10076 10077
		list_del(&dev->todo_list);

10078
		if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
10079
			pr_err("network todo '%s' but state %d\n",
10080 10081 10082 10083
			       dev->name, dev->reg_state);
			dump_stack();
			continue;
		}
L
Linus Torvalds 已提交
10084

10085
		dev->reg_state = NETREG_UNREGISTERED;
L
Linus Torvalds 已提交
10086

10087
		netdev_wait_allrefs(dev);
L
Linus Torvalds 已提交
10088

10089
		/* paranoia */
E
Eric Dumazet 已提交
10090
		BUG_ON(netdev_refcnt_read(dev));
10091 10092
		BUG_ON(!list_empty(&dev->ptype_all));
		BUG_ON(!list_empty(&dev->ptype_specific));
10093 10094
		WARN_ON(rcu_access_pointer(dev->ip_ptr));
		WARN_ON(rcu_access_pointer(dev->ip6_ptr));
10095
#if IS_ENABLED(CONFIG_DECNET)
10096
		WARN_ON(dev->dn_ptr);
10097
#endif
10098 10099 10100 10101
		if (dev->priv_destructor)
			dev->priv_destructor(dev);
		if (dev->needs_free_netdev)
			free_netdev(dev);
10102

10103 10104 10105 10106 10107 10108
		/* Report a network device has been unregistered */
		rtnl_lock();
		dev_net(dev)->dev_unreg_count--;
		__rtnl_unlock();
		wake_up(&netdev_unregistering_wq);

10109 10110
		/* Free network device */
		kobject_put(&dev->dev.kobj);
L
Linus Torvalds 已提交
10111 10112 10113
	}
}

10114 10115 10116 10117
/* Convert net_device_stats to rtnl_link_stats64. rtnl_link_stats64 has
 * all the same fields in the same order as net_device_stats, with only
 * the type differing, but rtnl_link_stats64 may have additional fields
 * at the end for newer counters.
10118
 */
10119 10120
void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
			     const struct net_device_stats *netdev_stats)
10121 10122
{
#if BITS_PER_LONG == 64
10123
	BUILD_BUG_ON(sizeof(*stats64) < sizeof(*netdev_stats));
10124
	memcpy(stats64, netdev_stats, sizeof(*netdev_stats));
10125 10126 10127
	/* zero out counters that only exist in rtnl_link_stats64 */
	memset((char *)stats64 + sizeof(*netdev_stats), 0,
	       sizeof(*stats64) - sizeof(*netdev_stats));
10128
#else
10129
	size_t i, n = sizeof(*netdev_stats) / sizeof(unsigned long);
10130 10131 10132
	const unsigned long *src = (const unsigned long *)netdev_stats;
	u64 *dst = (u64 *)stats64;

10133
	BUILD_BUG_ON(n > sizeof(*stats64) / sizeof(u64));
10134 10135
	for (i = 0; i < n; i++)
		dst[i] = src[i];
10136 10137 10138
	/* zero out counters that only exist in rtnl_link_stats64 */
	memset((char *)stats64 + n * sizeof(u64), 0,
	       sizeof(*stats64) - n * sizeof(u64));
10139 10140
#endif
}
10141
EXPORT_SYMBOL(netdev_stats_to_stats64);
10142

10143 10144 10145
/**
 *	dev_get_stats	- get network device statistics
 *	@dev: device to get statistics from
10146
 *	@storage: place to store stats
10147
 *
10148 10149 10150 10151
 *	Get network statistics from device. Return @storage.
 *	The device driver may provide its own method by setting
 *	dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
 *	otherwise the internal statistics structure is used.
10152
 */
10153 10154
struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
					struct rtnl_link_stats64 *storage)
10155
{
10156 10157
	const struct net_device_ops *ops = dev->netdev_ops;

10158 10159
	if (ops->ndo_get_stats64) {
		memset(storage, 0, sizeof(*storage));
10160 10161
		ops->ndo_get_stats64(dev, storage);
	} else if (ops->ndo_get_stats) {
10162
		netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
10163 10164
	} else {
		netdev_stats_to_stats64(storage, &dev->stats);
10165
	}
10166 10167 10168
	storage->rx_dropped += (unsigned long)atomic_long_read(&dev->rx_dropped);
	storage->tx_dropped += (unsigned long)atomic_long_read(&dev->tx_dropped);
	storage->rx_nohandler += (unsigned long)atomic_long_read(&dev->rx_nohandler);
10169
	return storage;
R
Rusty Russell 已提交
10170
}
10171
EXPORT_SYMBOL(dev_get_stats);
R
Rusty Russell 已提交
10172

10173
struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
10174
{
10175
	struct netdev_queue *queue = dev_ingress_queue(dev);
10176

10177 10178 10179 10180 10181 10182 10183
#ifdef CONFIG_NET_CLS_ACT
	if (queue)
		return queue;
	queue = kzalloc(sizeof(*queue), GFP_KERNEL);
	if (!queue)
		return NULL;
	netdev_init_one_queue(dev, queue, NULL);
E
Eric Dumazet 已提交
10184
	RCU_INIT_POINTER(queue->qdisc, &noop_qdisc);
10185 10186 10187 10188
	queue->qdisc_sleeping = &noop_qdisc;
	rcu_assign_pointer(dev->ingress_queue, queue);
#endif
	return queue;
10189 10190
}

10191 10192
static const struct ethtool_ops default_ethtool_ops;

10193 10194 10195 10196 10197 10198 10199 10200
void netdev_set_default_ethtool_ops(struct net_device *dev,
				    const struct ethtool_ops *ops)
{
	if (dev->ethtool_ops == &default_ethtool_ops)
		dev->ethtool_ops = ops;
}
EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);

10201 10202 10203 10204
void netdev_freemem(struct net_device *dev)
{
	char *addr = (char *)dev - dev->padded;

W
WANG Cong 已提交
10205
	kvfree(addr);
10206 10207
}

L
Linus Torvalds 已提交
10208
/**
10209 10210 10211 10212 10213 10214 10215 10216 10217 10218 10219
 * alloc_netdev_mqs - allocate network device
 * @sizeof_priv: size of private data to allocate space for
 * @name: device name format string
 * @name_assign_type: origin of device name
 * @setup: callback to initialize device
 * @txqs: the number of TX subqueues to allocate
 * @rxqs: the number of RX subqueues to allocate
 *
 * Allocates a struct net_device with private data area for driver use
 * and performs basic initialization.  Also allocates subqueue structs
 * for each queue on the device.
L
Linus Torvalds 已提交
10220
 */
T
Tom Herbert 已提交
10221
struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
10222
		unsigned char name_assign_type,
T
Tom Herbert 已提交
10223 10224
		void (*setup)(struct net_device *),
		unsigned int txqs, unsigned int rxqs)
L
Linus Torvalds 已提交
10225 10226
{
	struct net_device *dev;
10227
	unsigned int alloc_size;
10228
	struct net_device *p;
L
Linus Torvalds 已提交
10229

10230 10231
	BUG_ON(strlen(name) >= sizeof(dev->name));

T
Tom Herbert 已提交
10232
	if (txqs < 1) {
10233
		pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
10234 10235 10236
		return NULL;
	}

T
Tom Herbert 已提交
10237
	if (rxqs < 1) {
10238
		pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
T
Tom Herbert 已提交
10239 10240 10241
		return NULL;
	}

10242
	alloc_size = sizeof(struct net_device);
10243 10244
	if (sizeof_priv) {
		/* ensure 32-byte alignment of private area */
10245
		alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
10246 10247 10248
		alloc_size += sizeof_priv;
	}
	/* ensure 32-byte alignment of whole construct */
10249
	alloc_size += NETDEV_ALIGN - 1;
L
Linus Torvalds 已提交
10250

10251
	p = kvzalloc(alloc_size, GFP_KERNEL | __GFP_RETRY_MAYFAIL);
10252
	if (!p)
L
Linus Torvalds 已提交
10253 10254
		return NULL;

10255
	dev = PTR_ALIGN(p, NETDEV_ALIGN);
L
Linus Torvalds 已提交
10256
	dev->padded = (char *)dev - (char *)p;
10257

E
Eric Dumazet 已提交
10258 10259
	dev->pcpu_refcnt = alloc_percpu(int);
	if (!dev->pcpu_refcnt)
10260
		goto free_dev;
10261 10262

	if (dev_addr_init(dev))
E
Eric Dumazet 已提交
10263
		goto free_pcpu;
10264

10265
	dev_mc_init(dev);
10266
	dev_uc_init(dev);
J
Jiri Pirko 已提交
10267

10268
	dev_net_set(dev, &init_net);
L
Linus Torvalds 已提交
10269

10270
	dev->gso_max_size = GSO_MAX_SIZE;
10271
	dev->gso_max_segs = GSO_MAX_SEGS;
T
Taehee Yoo 已提交
10272 10273
	dev->upper_level = 1;
	dev->lower_level = 1;
10274 10275 10276

	INIT_LIST_HEAD(&dev->napi_list);
	INIT_LIST_HEAD(&dev->unreg_list);
10277
	INIT_LIST_HEAD(&dev->close_list);
10278
	INIT_LIST_HEAD(&dev->link_watch_list);
10279 10280
	INIT_LIST_HEAD(&dev->adj_list.upper);
	INIT_LIST_HEAD(&dev->adj_list.lower);
10281 10282
	INIT_LIST_HEAD(&dev->ptype_all);
	INIT_LIST_HEAD(&dev->ptype_specific);
10283
	INIT_LIST_HEAD(&dev->net_notifier_list);
10284 10285 10286
#ifdef CONFIG_NET_SCHED
	hash_init(dev->qdisc_hash);
#endif
10287
	dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
10288 10289
	setup(dev);

10290
	if (!dev->tx_queue_len) {
10291
		dev->priv_flags |= IFF_NO_QUEUE;
10292
		dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN;
10293
	}
10294

T
Tom Herbert 已提交
10295 10296
	dev->num_tx_queues = txqs;
	dev->real_num_tx_queues = txqs;
10297
	if (netif_alloc_netdev_queues(dev))
10298
		goto free_all;
10299

T
Tom Herbert 已提交
10300 10301
	dev->num_rx_queues = rxqs;
	dev->real_num_rx_queues = rxqs;
T
Tom Herbert 已提交
10302
	if (netif_alloc_rx_queues(dev))
10303
		goto free_all;
T
Tom Herbert 已提交
10304

L
Linus Torvalds 已提交
10305
	strcpy(dev->name, name);
10306
	dev->name_assign_type = name_assign_type;
10307
	dev->group = INIT_NETDEV_GROUP;
10308 10309
	if (!dev->ethtool_ops)
		dev->ethtool_ops = &default_ethtool_ops;
10310

10311
	nf_hook_ingress_init(dev);
10312

L
Linus Torvalds 已提交
10313
	return dev;
10314

10315 10316 10317 10318
free_all:
	free_netdev(dev);
	return NULL;

E
Eric Dumazet 已提交
10319 10320
free_pcpu:
	free_percpu(dev->pcpu_refcnt);
10321 10322
free_dev:
	netdev_freemem(dev);
10323
	return NULL;
L
Linus Torvalds 已提交
10324
}
T
Tom Herbert 已提交
10325
EXPORT_SYMBOL(alloc_netdev_mqs);
L
Linus Torvalds 已提交
10326 10327

/**
10328 10329
 * free_netdev - free network device
 * @dev: device
L
Linus Torvalds 已提交
10330
 *
10331 10332 10333 10334
 * This function does the last stage of destroying an allocated device
 * interface. The reference to the device object is released. If this
 * is the last reference then it will be freed.Must be called in process
 * context.
L
Linus Torvalds 已提交
10335 10336 10337
 */
void free_netdev(struct net_device *dev)
{
10338 10339
	struct napi_struct *p, *n;

10340
	might_sleep();
10341
	netif_free_tx_queues(dev);
10342
	netif_free_rx_queues(dev);
10343

10344
	kfree(rcu_dereference_protected(dev->ingress_queue, 1));
10345

10346 10347 10348
	/* Flush device addresses */
	dev_addr_flush(dev);

10349 10350 10351
	list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
		netif_napi_del(p);

E
Eric Dumazet 已提交
10352 10353
	free_percpu(dev->pcpu_refcnt);
	dev->pcpu_refcnt = NULL;
10354 10355
	free_percpu(dev->xdp_bulkq);
	dev->xdp_bulkq = NULL;
E
Eric Dumazet 已提交
10356

S
Stephen Hemminger 已提交
10357
	/*  Compatibility with error handling in drivers */
L
Linus Torvalds 已提交
10358
	if (dev->reg_state == NETREG_UNINITIALIZED) {
10359
		netdev_freemem(dev);
L
Linus Torvalds 已提交
10360 10361 10362 10363 10364 10365
		return;
	}

	BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
	dev->reg_state = NETREG_RELEASED;

10366 10367
	/* will free via device release */
	put_device(&dev->dev);
L
Linus Torvalds 已提交
10368
}
E
Eric Dumazet 已提交
10369
EXPORT_SYMBOL(free_netdev);
10370

10371 10372 10373 10374 10375 10376
/**
 *	synchronize_net -  Synchronize with packet receive processing
 *
 *	Wait for packets currently being received to be done.
 *	Does not block later packets from starting.
 */
10377
void synchronize_net(void)
L
Linus Torvalds 已提交
10378 10379
{
	might_sleep();
10380 10381 10382 10383
	if (rtnl_is_locked())
		synchronize_rcu_expedited();
	else
		synchronize_rcu();
L
Linus Torvalds 已提交
10384
}
E
Eric Dumazet 已提交
10385
EXPORT_SYMBOL(synchronize_net);
L
Linus Torvalds 已提交
10386 10387

/**
10388
 *	unregister_netdevice_queue - remove device from the kernel
L
Linus Torvalds 已提交
10389
 *	@dev: device
10390
 *	@head: list
10391
 *
L
Linus Torvalds 已提交
10392
 *	This function shuts down a device interface and removes it
10393
 *	from the kernel tables.
10394
 *	If head not NULL, device is queued to be unregistered later.
L
Linus Torvalds 已提交
10395 10396 10397 10398 10399
 *
 *	Callers must hold the rtnl semaphore.  You may want
 *	unregister_netdev() instead of this.
 */

10400
void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
L
Linus Torvalds 已提交
10401
{
10402 10403
	ASSERT_RTNL();

10404
	if (head) {
10405
		list_move_tail(&dev->unreg_list, head);
10406 10407 10408 10409 10410
	} else {
		rollback_registered(dev);
		/* Finish processing unregister after unlock */
		net_set_todo(dev);
	}
L
Linus Torvalds 已提交
10411
}
10412
EXPORT_SYMBOL(unregister_netdevice_queue);
L
Linus Torvalds 已提交
10413

10414 10415 10416
/**
 *	unregister_netdevice_many - unregister many devices
 *	@head: list of devices
10417 10418 10419
 *
 *  Note: As most callers use a stack allocated list_head,
 *  we force a list_del() to make sure stack wont be corrupted later.
10420 10421 10422 10423 10424 10425 10426 10427 10428
 */
void unregister_netdevice_many(struct list_head *head)
{
	struct net_device *dev;

	if (!list_empty(head)) {
		rollback_registered_many(head);
		list_for_each_entry(dev, head, unreg_list)
			net_set_todo(dev);
10429
		list_del(head);
10430 10431
	}
}
10432
EXPORT_SYMBOL(unregister_netdevice_many);
10433

L
Linus Torvalds 已提交
10434 10435 10436 10437 10438
/**
 *	unregister_netdev - remove device from the kernel
 *	@dev: device
 *
 *	This function shuts down a device interface and removes it
10439
 *	from the kernel tables.
L
Linus Torvalds 已提交
10440 10441 10442 10443 10444 10445 10446 10447 10448 10449 10450 10451 10452
 *
 *	This is just a wrapper for unregister_netdevice that takes
 *	the rtnl semaphore.  In general you want to use this and not
 *	unregister_netdevice.
 */
void unregister_netdev(struct net_device *dev)
{
	rtnl_lock();
	unregister_netdevice(dev);
	rtnl_unlock();
}
EXPORT_SYMBOL(unregister_netdev);

10453 10454 10455 10456 10457 10458 10459 10460 10461 10462 10463 10464 10465 10466 10467 10468
/**
 *	dev_change_net_namespace - move device to different nethost namespace
 *	@dev: device
 *	@net: network namespace
 *	@pat: If not NULL name pattern to try if the current device name
 *	      is already taken in the destination network namespace.
 *
 *	This function shuts down a device interface and moves it
 *	to a new network namespace. On success 0 is returned, on
 *	a failure a netagive errno code is returned.
 *
 *	Callers must hold the rtnl semaphore.
 */

int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
{
10469
	struct net *net_old = dev_net(dev);
10470
	int err, new_nsid, new_ifindex;
10471 10472 10473 10474 10475 10476 10477 10478 10479 10480 10481 10482 10483 10484

	ASSERT_RTNL();

	/* Don't allow namespace local devices to be moved. */
	err = -EINVAL;
	if (dev->features & NETIF_F_NETNS_LOCAL)
		goto out;

	/* Ensure the device has been registrered */
	if (dev->reg_state != NETREG_REGISTERED)
		goto out;

	/* Get out if there is nothing todo */
	err = 0;
10485
	if (net_eq(net_old, net))
10486 10487 10488 10489 10490 10491
		goto out;

	/* Pick the destination device name, and ensure
	 * we can use it in the destination network namespace.
	 */
	err = -EEXIST;
10492
	if (__dev_get_by_name(net, dev->name)) {
10493 10494 10495
		/* We get here if we can't use the current device name */
		if (!pat)
			goto out;
10496 10497
		err = dev_get_valid_name(net, dev, pat);
		if (err < 0)
10498 10499 10500 10501 10502 10503 10504 10505
			goto out;
	}

	/*
	 * And now a mini version of register_netdevice unregister_netdevice.
	 */

	/* If device is running close it first. */
10506
	dev_close(dev);
10507 10508 10509 10510 10511 10512 10513 10514 10515 10516

	/* And unlink it from device chain */
	unlist_netdevice(dev);

	synchronize_net();

	/* Shutdown queueing discipline. */
	dev_shutdown(dev);

	/* Notify protocols, that we are about to destroy
10517 10518 10519 10520 10521 10522
	 * this device. They should clean all the things.
	 *
	 * Note that dev->reg_state stays at NETREG_REGISTERED.
	 * This is wanted because this way 8021q and macvlan know
	 * the device is just moving and can keep their slaves up.
	 */
10523
	call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
10524
	rcu_barrier();
10525

10526
	new_nsid = peernet2id_alloc(dev_net(dev), net, GFP_KERNEL);
10527 10528 10529 10530 10531 10532 10533 10534
	/* If there is an ifindex conflict assign a new one */
	if (__dev_get_by_index(net, dev->ifindex))
		new_ifindex = dev_new_index(net);
	else
		new_ifindex = dev->ifindex;

	rtmsg_ifinfo_newnet(RTM_DELLINK, dev, ~0U, GFP_KERNEL, &new_nsid,
			    new_ifindex);
10535 10536 10537 10538

	/*
	 *	Flush the unicast and multicast chains
	 */
10539
	dev_uc_flush(dev);
10540
	dev_mc_flush(dev);
10541

10542 10543
	/* Send a netdev-removed uevent to the old namespace */
	kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
10544
	netdev_adjacent_del_links(dev);
10545

10546 10547 10548
	/* Move per-net netdevice notifiers that are following the netdevice */
	move_netdevice_notifiers_dev_net(dev, net);

10549
	/* Actually switch the network namespace */
10550
	dev_net_set(dev, net);
10551
	dev->ifindex = new_ifindex;
10552

10553 10554
	/* Send a netdev-add uevent to the new namespace */
	kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
10555
	netdev_adjacent_add_links(dev);
10556

10557
	/* Fixup kobjects */
10558
	err = device_rename(&dev->dev, dev->name);
10559
	WARN_ON(err);
10560

10561 10562 10563 10564 10565 10566
	/* Adapt owner in case owning user namespace of target network
	 * namespace is different from the original one.
	 */
	err = netdev_change_owner(dev, net_old, net);
	WARN_ON(err);

10567 10568 10569 10570 10571 10572
	/* Add the device back in the hashes */
	list_netdevice(dev);

	/* Notify protocols, that a new device appeared. */
	call_netdevice_notifiers(NETDEV_REGISTER, dev);

10573 10574 10575 10576
	/*
	 *	Prevent userspace races by waiting until the network
	 *	device is fully setup before sending notifications.
	 */
10577
	rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
10578

10579 10580 10581 10582 10583
	synchronize_net();
	err = 0;
out:
	return err;
}
10584
EXPORT_SYMBOL_GPL(dev_change_net_namespace);
10585

10586
static int dev_cpu_dead(unsigned int oldcpu)
L
Linus Torvalds 已提交
10587 10588 10589
{
	struct sk_buff **list_skb;
	struct sk_buff *skb;
10590
	unsigned int cpu;
10591
	struct softnet_data *sd, *oldsd, *remsd = NULL;
L
Linus Torvalds 已提交
10592 10593 10594 10595 10596 10597 10598 10599 10600 10601 10602 10603 10604 10605 10606

	local_irq_disable();
	cpu = smp_processor_id();
	sd = &per_cpu(softnet_data, cpu);
	oldsd = &per_cpu(softnet_data, oldcpu);

	/* Find end of our completion_queue. */
	list_skb = &sd->completion_queue;
	while (*list_skb)
		list_skb = &(*list_skb)->next;
	/* Append completion queue from offline CPU. */
	*list_skb = oldsd->completion_queue;
	oldsd->completion_queue = NULL;

	/* Append output queue from offline CPU. */
10607 10608 10609 10610 10611 10612
	if (oldsd->output_queue) {
		*sd->output_queue_tailp = oldsd->output_queue;
		sd->output_queue_tailp = oldsd->output_queue_tailp;
		oldsd->output_queue = NULL;
		oldsd->output_queue_tailp = &oldsd->output_queue;
	}
E
Eric Dumazet 已提交
10613 10614 10615 10616 10617 10618 10619 10620 10621 10622 10623 10624 10625 10626
	/* Append NAPI poll list from offline CPU, with one exception :
	 * process_backlog() must be called by cpu owning percpu backlog.
	 * We properly handle process_queue & input_pkt_queue later.
	 */
	while (!list_empty(&oldsd->poll_list)) {
		struct napi_struct *napi = list_first_entry(&oldsd->poll_list,
							    struct napi_struct,
							    poll_list);

		list_del_init(&napi->poll_list);
		if (napi->poll == process_backlog)
			napi->state = 0;
		else
			____napi_schedule(sd, napi);
10627
	}
L
Linus Torvalds 已提交
10628 10629 10630 10631

	raise_softirq_irqoff(NET_TX_SOFTIRQ);
	local_irq_enable();

10632 10633 10634 10635 10636 10637 10638
#ifdef CONFIG_RPS
	remsd = oldsd->rps_ipi_list;
	oldsd->rps_ipi_list = NULL;
#endif
	/* send out pending IPI's on offline CPU */
	net_rps_send_ipi(remsd);

L
Linus Torvalds 已提交
10639
	/* Process offline CPU's input_pkt_queue */
10640
	while ((skb = __skb_dequeue(&oldsd->process_queue))) {
10641
		netif_rx_ni(skb);
10642
		input_queue_head_incr(oldsd);
T
Tom Herbert 已提交
10643
	}
E
Eric Dumazet 已提交
10644
	while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) {
10645
		netif_rx_ni(skb);
10646 10647
		input_queue_head_incr(oldsd);
	}
L
Linus Torvalds 已提交
10648

10649
	return 0;
L
Linus Torvalds 已提交
10650 10651
}

10652
/**
10653 10654 10655 10656
 *	netdev_increment_features - increment feature set by one
 *	@all: current feature set
 *	@one: new feature set
 *	@mask: mask feature set
10657 10658
 *
 *	Computes a new feature set after adding a device with feature set
10659 10660
 *	@one to the master device with current feature set @all.  Will not
 *	enable anything that is off in @mask. Returns the new feature set.
10661
 */
10662 10663
netdev_features_t netdev_increment_features(netdev_features_t all,
	netdev_features_t one, netdev_features_t mask)
10664
{
10665
	if (mask & NETIF_F_HW_CSUM)
10666
		mask |= NETIF_F_CSUM_MASK;
10667
	mask |= NETIF_F_VLAN_CHALLENGED;
10668

10669
	all |= one & (NETIF_F_ONE_FOR_ALL | NETIF_F_CSUM_MASK) & mask;
10670
	all &= one | ~NETIF_F_ALL_FOR_ALL;
10671

10672
	/* If one device supports hw checksumming, set for all. */
10673 10674
	if (all & NETIF_F_HW_CSUM)
		all &= ~(NETIF_F_CSUM_MASK & ~NETIF_F_HW_CSUM);
10675 10676 10677

	return all;
}
10678
EXPORT_SYMBOL(netdev_increment_features);
10679

10680
static struct hlist_head * __net_init netdev_create_hash(void)
10681 10682 10683 10684
{
	int i;
	struct hlist_head *hash;

10685
	hash = kmalloc_array(NETDEV_HASHENTRIES, sizeof(*hash), GFP_KERNEL);
10686 10687 10688 10689 10690 10691 10692
	if (hash != NULL)
		for (i = 0; i < NETDEV_HASHENTRIES; i++)
			INIT_HLIST_HEAD(&hash[i]);

	return hash;
}

/* Initialize per network namespace state */
static int __net_init netdev_init(struct net *net)
{
	BUILD_BUG_ON(GRO_HASH_BUCKETS >
		     8 * sizeof_field(struct napi_struct, gro_bitmask));

	if (net != &init_net)
		INIT_LIST_HEAD(&net->dev_base_head);

	net->dev_name_head = netdev_create_hash();
	if (net->dev_name_head == NULL)
		goto err_name;

	net->dev_index_head = netdev_create_hash();
	if (net->dev_index_head == NULL)
		goto err_idx;

	RAW_INIT_NOTIFIER_HEAD(&net->netdev_chain);

	return 0;

err_idx:
	kfree(net->dev_name_head);
err_name:
	return -ENOMEM;
}

/**
 *	netdev_drivername - network driver for the device
 *	@dev: network device
 *
 *	Determine the network driver for the device.
 */
const char *netdev_drivername(const struct net_device *dev)
{
	const struct device_driver *driver;
	const struct device *parent;
	const char *empty = "";

	parent = dev->dev.parent;
	if (!parent)
		return empty;

	driver = parent->driver;
	if (driver && driver->name)
		return driver->name;
	return empty;
}
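
/* Used mostly for diagnostics elsewhere in the stack; for example, the
 * transmit-queue watchdog includes this string in its "NETDEV WATCHDOG"
 * timeout warning alongside the interface name.
 */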

static void __netdev_printk(const char *level, const struct net_device *dev,
			    struct va_format *vaf)
{
	if (dev && dev->dev.parent) {
		dev_printk_emit(level[1] - '0',
				dev->dev.parent,
				"%s %s %s%s: %pV",
				dev_driver_string(dev->dev.parent),
				dev_name(dev->dev.parent),
				netdev_name(dev), netdev_reg_state(dev),
				vaf);
	} else if (dev) {
		printk("%s%s%s: %pV",
		       level, netdev_name(dev), netdev_reg_state(dev), vaf);
	} else {
		printk("%s(NULL net_device): %pV", level, vaf);
	}
}

void netdev_printk(const char *level, const struct net_device *dev,
		   const char *format, ...)
{
	struct va_format vaf;
	va_list args;

	va_start(args, format);

	vaf.fmt = format;
	vaf.va = &args;

	__netdev_printk(level, dev, &vaf);

	va_end(args);
}
EXPORT_SYMBOL(netdev_printk);
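
/* Example with a hypothetical netdev pointer and ring index:
 *
 *	netdev_printk(KERN_DEBUG, dev, "TX ring %d stalled\n", ring);
 *
 * Most callers use the level-specific wrappers generated below
 * (netdev_err(), netdev_warn(), netdev_info(), ...) instead.
 */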

#define define_netdev_printk_level(func, level)			\
void func(const struct net_device *dev, const char *fmt, ...)	\
{								\
	struct va_format vaf;					\
	va_list args;						\
								\
	va_start(args, fmt);					\
								\
	vaf.fmt = fmt;						\
	vaf.va = &args;						\
								\
	__netdev_printk(level, dev, &vaf);			\
								\
	va_end(args);						\
}								\
EXPORT_SYMBOL(func);

define_netdev_printk_level(netdev_emerg, KERN_EMERG);
define_netdev_printk_level(netdev_alert, KERN_ALERT);
define_netdev_printk_level(netdev_crit, KERN_CRIT);
define_netdev_printk_level(netdev_err, KERN_ERR);
define_netdev_printk_level(netdev_warn, KERN_WARNING);
define_netdev_printk_level(netdev_notice, KERN_NOTICE);
define_netdev_printk_level(netdev_info, KERN_INFO);
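
/* With the wrappers above, a (hypothetical) driver call such as
 *
 *	netdev_warn(dev, "link flap on port %d\n", port);
 *
 * lands in __netdev_printk() at KERN_WARNING and, when dev->dev.parent is
 * set, is prefixed with the parent's driver name and bus id plus the
 * interface name.
 */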

static void __net_exit netdev_exit(struct net *net)
{
	kfree(net->dev_name_head);
	kfree(net->dev_index_head);
	if (net != &init_net)
		WARN_ON_ONCE(!list_empty(&net->dev_base_head));
}

static struct pernet_operations __net_initdata netdev_net_ops = {
	.init = netdev_init,
	.exit = netdev_exit,
};
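
/* netdev_net_ops is registered via register_pernet_subsys() from
 * net_dev_init() below, so each network namespace gets its name and
 * ifindex hash tables and its notifier chain set up in netdev_init()
 * and released again in netdev_exit().
 */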

static void __net_exit default_device_exit(struct net *net)
{
	struct net_device *dev, *aux;
	/*
	 * Push all migratable network devices back to the
	 * initial network namespace
	 */
	rtnl_lock();
	for_each_netdev_safe(net, dev, aux) {
		int err;
		char fb_name[IFNAMSIZ];

		/* Ignore unmovable devices (i.e. loopback) */
		if (dev->features & NETIF_F_NETNS_LOCAL)
			continue;

		/* Leave virtual devices for the generic cleanup */
		if (dev->rtnl_link_ops)
			continue;

		/* Push remaining network devices to init_net */
		snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
		if (__dev_get_by_name(&init_net, fb_name))
			snprintf(fb_name, IFNAMSIZ, "dev%%d");
		err = dev_change_net_namespace(dev, &init_net, fb_name);
		if (err) {
			pr_emerg("%s: failed to move %s to init_net: %d\n",
				 __func__, dev->name, err);
			BUG();
		}
	}
	rtnl_unlock();
}

static void __net_exit rtnl_lock_unregistering(struct list_head *net_list)
{
	/* Return with the rtnl_lock held when there are no network
	 * devices unregistering in any network namespace in net_list.
	 */
	struct net *net;
	bool unregistering;
	DEFINE_WAIT_FUNC(wait, woken_wake_function);

	add_wait_queue(&netdev_unregistering_wq, &wait);
	for (;;) {
		unregistering = false;
		rtnl_lock();
		list_for_each_entry(net, net_list, exit_list) {
			if (net->dev_unreg_count > 0) {
				unregistering = true;
				break;
			}
		}
		if (!unregistering)
			break;
		__rtnl_unlock();

		wait_woken(&wait, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
	}
	remove_wait_queue(&netdev_unregistering_wq, &wait);
}

static void __net_exit default_device_exit_batch(struct list_head *net_list)
{
	/* At exit all network devices must be removed from a network
	 * namespace.  Do this in the reverse order of registration.
	 * Do this across as many network namespaces as possible to
	 * improve batching efficiency.
	 */
	struct net_device *dev;
	struct net *net;
	LIST_HEAD(dev_kill_list);

	/* To prevent network device cleanup code from dereferencing
	 * loopback devices or network devices that have been freed,
	 * wait here for all pending unregistrations to complete
	 * before unregistering the loopback device and allowing the
	 * network namespace to be freed.
	 *
	 * The netdev todo list containing all network device
	 * unregistrations that happen in default_device_exit_batch
	 * will run in the rtnl_unlock() at the end of
	 * default_device_exit_batch.
	 */
	rtnl_lock_unregistering(net_list);
	list_for_each_entry(net, net_list, exit_list) {
		for_each_netdev_reverse(net, dev) {
			if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink)
				dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
			else
				unregister_netdevice_queue(dev, &dev_kill_list);
		}
	}
	unregister_netdevice_many(&dev_kill_list);
	rtnl_unlock();
}

static struct pernet_operations __net_initdata default_device_ops = {
	.exit = default_device_exit,
	.exit_batch = default_device_exit_batch,
};
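
/* On namespace teardown the pernet core calls .exit once per namespace and
 * then .exit_batch once for the whole batch, so migratable devices are
 * pushed back to init_net before the remaining ones are unregistered in
 * bulk.
 */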

/*
 *	Initialize the DEV module. At boot time this walks the device list and
 *	unhooks any devices that fail to initialise (normally hardware not
 *	present) and leaves us with a valid list of present and active devices.
 *
 */

/*
 *       This is called single threaded during boot, so no need
 *       to take the rtnl semaphore.
 */
static int __init net_dev_init(void)
{
	int i, rc = -ENOMEM;

	BUG_ON(!dev_boot_phase);

	if (dev_proc_init())
		goto out;

	if (netdev_kobject_init())
		goto out;

	INIT_LIST_HEAD(&ptype_all);
	for (i = 0; i < PTYPE_HASH_SIZE; i++)
		INIT_LIST_HEAD(&ptype_base[i]);

	INIT_LIST_HEAD(&offload_base);

	if (register_pernet_subsys(&netdev_net_ops))
		goto out;

	/*
	 *	Initialise the packet receive queues.
	 */

	for_each_possible_cpu(i) {
		struct work_struct *flush = per_cpu_ptr(&flush_works, i);
		struct softnet_data *sd = &per_cpu(softnet_data, i);

		INIT_WORK(flush, flush_backlog);

		skb_queue_head_init(&sd->input_pkt_queue);
		skb_queue_head_init(&sd->process_queue);
#ifdef CONFIG_XFRM_OFFLOAD
		skb_queue_head_init(&sd->xfrm_backlog);
#endif
		INIT_LIST_HEAD(&sd->poll_list);
		sd->output_queue_tailp = &sd->output_queue;
#ifdef CONFIG_RPS
		sd->csd.func = rps_trigger_softirq;
		sd->csd.info = sd;
		sd->cpu = i;
#endif
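		/* The per-CPU backlog NAPI set up below is what later drains
		 * input_pkt_queue/process_queue for packets queued via
		 * netif_rx().
		 */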

		init_gro_hash(&sd->backlog);
		sd->backlog.poll = process_backlog;
		sd->backlog.weight = weight_p;
	}

	dev_boot_phase = 0;

	/* The loopback device is special: if any other network device
	 * is present in a network namespace, the loopback device must
	 * be present too. Since we now dynamically allocate and free the
	 * loopback device, ensure this invariant is maintained by
	 * keeping the loopback device as the first device on the
	 * list of network devices, so that it is the first device that
	 * appears and the last network device that disappears.
	 */
	if (register_pernet_device(&loopback_net_ops))
		goto out;

	if (register_pernet_device(&default_device_ops))
		goto out;

	open_softirq(NET_TX_SOFTIRQ, net_tx_action);
	open_softirq(NET_RX_SOFTIRQ, net_rx_action);

	rc = cpuhp_setup_state_nocalls(CPUHP_NET_DEV_DEAD, "net/dev:dead",
				       NULL, dev_cpu_dead);
	WARN_ON(rc < 0);
	rc = 0;
out:
	return rc;
}

subsys_initcall(net_dev_init);
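
/* subsys_initcall() runs before the device-level initcalls most network
 * drivers use, so the packet-type lists, softirqs and per-CPU backlog state
 * above are in place before the first net_device is registered.
 */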