fib_semantics.c 44.7 KB
Newer Older
L
Linus Torvalds 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		IPv4 Forwarding Information Base: semantics.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */

16
#include <linux/uaccess.h>
L
Linus Torvalds 已提交
17 18 19 20 21 22 23 24 25 26 27
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/jiffies.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
28
#include <linux/inetdevice.h>
L
Linus Torvalds 已提交
29 30 31 32 33
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/proc_fs.h>
#include <linux/skbuff.h>
#include <linux/init.h>
34
#include <linux/slab.h>
35
#include <linux/netlink.h>
L
Linus Torvalds 已提交
36

37
#include <net/arp.h>
L
Linus Torvalds 已提交
38 39 40 41 42 43
#include <net/ip.h>
#include <net/protocol.h>
#include <net/route.h>
#include <net/tcp.h>
#include <net/sock.h>
#include <net/ip_fib.h>
44
#include <net/netlink.h>
45
#include <net/nexthop.h>
46
#include <net/lwtunnel.h>
47
#include <net/fib_notifier.h>
D
David Ahern 已提交
48
#include <net/addrconf.h>
L
Linus Torvalds 已提交
49 50 51

#include "fib_lookup.h"

52
/* Taken around updates and walks of the fib_info hash tables below. */
static DEFINE_SPINLOCK(fib_info_lock);
/* fib_info buckets, indexed by fib_info_hashfn(). */
static struct hlist_head *fib_info_hash;
/* Buckets linked through fib_info::fib_lhash (entries with a prefsrc);
 * presumably indexed by fib_laddr_hashfn() - confirm against the insert path.
 */
static struct hlist_head *fib_info_laddrhash;
/* Number of buckets in each of the two tables above; used as (size - 1)
 * mask by the hash functions.
 */
static unsigned int fib_info_hash_size;
/* Count of fib_info objects; decremented in free_fib_info(). */
static unsigned int fib_info_cnt;

#define DEVINDEX_HASHBITS 8
#define DEVINDEX_HASHSIZE (1U << DEVINDEX_HASHBITS)
/* Nexthop buckets (fib_nh::nh_hash), indexed by fib_devindex_hashfn(). */
static struct hlist_head fib_info_devhash[DEVINDEX_HASHSIZE];

/*
 * Nexthop iteration helpers.
 *
 * for_nexthops(fi) opens a block and a loop that exposes a read-only
 * cursor "nh" and an index "nhsel"; change_nexthops(fi) does the same with
 * a writable cursor "nexthop_nh".  Each use must be closed with
 * endfor_nexthops(fi), which supplies the matching closing brace.
 *
 * With CONFIG_IP_ROUTE_MULTIPATH the loop runs over fi->fib_nhs entries;
 * without it the loop body runs exactly once on fi->fib_nh.
 */
#ifdef CONFIG_IP_ROUTE_MULTIPATH

#define for_nexthops(fi) {						\
	int nhsel; const struct fib_nh *nh;				\
	for (nhsel = 0, nh = (fi)->fib_nh;				\
	     nhsel < (fi)->fib_nhs;					\
	     nh++, nhsel++)

#define change_nexthops(fi) {						\
	int nhsel; struct fib_nh *nexthop_nh;				\
	for (nhsel = 0,	nexthop_nh = (struct fib_nh *)((fi)->fib_nh);	\
	     nhsel < (fi)->fib_nhs;					\
	     nexthop_nh++, nhsel++)

#else /* CONFIG_IP_ROUTE_MULTIPATH */

/* Hope, that gcc will optimize it to get rid of dummy loop */

#define for_nexthops(fi) {						\
	int nhsel; const struct fib_nh *nh = (fi)->fib_nh;		\
	for (nhsel = 0; nhsel < 1; nhsel++)

#define change_nexthops(fi) {						\
	int nhsel;							\
	struct fib_nh *nexthop_nh = (struct fib_nh *)((fi)->fib_nh);	\
	for (nhsel = 0; nhsel < 1; nhsel++)

#endif /* CONFIG_IP_ROUTE_MULTIPATH */

#define endfor_nexthops(fi) }


94
const struct fib_prop fib_props[RTN_MAX + 1] = {
E
Eric Dumazet 已提交
95
	[RTN_UNSPEC] = {
L
Linus Torvalds 已提交
96 97
		.error	= 0,
		.scope	= RT_SCOPE_NOWHERE,
E
Eric Dumazet 已提交
98 99
	},
	[RTN_UNICAST] = {
L
Linus Torvalds 已提交
100 101
		.error	= 0,
		.scope	= RT_SCOPE_UNIVERSE,
E
Eric Dumazet 已提交
102 103
	},
	[RTN_LOCAL] = {
L
Linus Torvalds 已提交
104 105
		.error	= 0,
		.scope	= RT_SCOPE_HOST,
E
Eric Dumazet 已提交
106 107
	},
	[RTN_BROADCAST] = {
L
Linus Torvalds 已提交
108 109
		.error	= 0,
		.scope	= RT_SCOPE_LINK,
E
Eric Dumazet 已提交
110 111
	},
	[RTN_ANYCAST] = {
L
Linus Torvalds 已提交
112 113
		.error	= 0,
		.scope	= RT_SCOPE_LINK,
E
Eric Dumazet 已提交
114 115
	},
	[RTN_MULTICAST] = {
L
Linus Torvalds 已提交
116 117
		.error	= 0,
		.scope	= RT_SCOPE_UNIVERSE,
E
Eric Dumazet 已提交
118 119
	},
	[RTN_BLACKHOLE] = {
L
Linus Torvalds 已提交
120 121
		.error	= -EINVAL,
		.scope	= RT_SCOPE_UNIVERSE,
E
Eric Dumazet 已提交
122 123
	},
	[RTN_UNREACHABLE] = {
L
Linus Torvalds 已提交
124 125
		.error	= -EHOSTUNREACH,
		.scope	= RT_SCOPE_UNIVERSE,
E
Eric Dumazet 已提交
126 127
	},
	[RTN_PROHIBIT] = {
L
Linus Torvalds 已提交
128 129
		.error	= -EACCES,
		.scope	= RT_SCOPE_UNIVERSE,
E
Eric Dumazet 已提交
130 131
	},
	[RTN_THROW] = {
L
Linus Torvalds 已提交
132 133
		.error	= -EAGAIN,
		.scope	= RT_SCOPE_UNIVERSE,
E
Eric Dumazet 已提交
134 135
	},
	[RTN_NAT] = {
L
Linus Torvalds 已提交
136 137
		.error	= -EINVAL,
		.scope	= RT_SCOPE_NOWHERE,
E
Eric Dumazet 已提交
138 139
	},
	[RTN_XRESOLVE] = {
L
Linus Torvalds 已提交
140 141
		.error	= -EINVAL,
		.scope	= RT_SCOPE_NOWHERE,
E
Eric Dumazet 已提交
142
	},
L
Linus Torvalds 已提交
143 144
};

145 146 147 148 149 150 151 152 153 154 155 156
/* Drop the cached route stored in *rtp, if any: detach its device with
 * dst_dev_put() and release the dst reference immediately.
 */
static void rt_fibinfo_free(struct rtable __rcu **rtp)
{
	struct rtable *rt = rcu_dereference_protected(*rtp, 1);

	if (!rt)
		return;

	/* Not even needed : RCU_INIT_POINTER(*rtp, NULL);
	 * because we waited an RCU grace period before calling
	 * free_fib_info_rcu()
	 */

	dst_dev_put(&rt->dst);
	dst_release_immediate(&rt->dst);
}

161 162
static void free_nh_exceptions(struct fib_nh *nh)
{
163
	struct fnhe_hash_bucket *hash;
164 165
	int i;

166 167 168
	hash = rcu_dereference_protected(nh->nh_exceptions, 1);
	if (!hash)
		return;
169 170 171
	for (i = 0; i < FNHE_HASH_SIZE; i++) {
		struct fib_nh_exception *fnhe;

E
Eric Dumazet 已提交
172
		fnhe = rcu_dereference_protected(hash[i].chain, 1);
173 174
		while (fnhe) {
			struct fib_nh_exception *next;
S
Stephen Hemminger 已提交
175

E
Eric Dumazet 已提交
176
			next = rcu_dereference_protected(fnhe->fnhe_next, 1);
177

178 179
			rt_fibinfo_free(&fnhe->fnhe_rth_input);
			rt_fibinfo_free(&fnhe->fnhe_rth_output);
180

181 182 183 184 185 186 187 188
			kfree(fnhe);

			fnhe = next;
		}
	}
	kfree(hash);
}

189
/* Release the per-cpu cached routes in @rtp (one slot per possible CPU)
 * and free the per-cpu allocation.  A NULL @rtp is accepted.
 */
static void rt_fibinfo_free_cpus(struct rtable __rcu * __percpu *rtp)
{
	int cpu;

	if (!rtp)
		return;

	for_each_possible_cpu(cpu) {
		struct rtable *rt;

		rt = rcu_dereference_protected(*per_cpu_ptr(rtp, cpu), 1);
		if (rt) {
			dst_dev_put(&rt->dst);
			dst_release_immediate(&rt->dst);
		}
	}
	free_percpu(rtp);
}

208 209 210 211 212 213 214 215 216
/* Drop the references held by the common part of a nexthop: the device
 * reference (when a device is attached) and the lwtunnel state reference.
 */
void fib_nh_common_release(struct fib_nh_common *nhc)
{
	struct net_device *dev = nhc->nhc_dev;

	if (dev)
		dev_put(dev);

	lwtstate_put(nhc->nhc_lwtstate);
}
EXPORT_SYMBOL_GPL(fib_nh_common_release);

217 218 219 220 221 222
/* Release everything an IPv4 nexthop owns: its classid user accounting
 * (CONFIG_IP_ROUTE_CLASSID), the common device/lwtunnel references, the
 * exception hash and the cached per-cpu output and input routes.
 */
void fib_nh_release(struct net *net, struct fib_nh *fib_nh)
{
#ifdef CONFIG_IP_ROUTE_CLASSID
	if (fib_nh->nh_tclassid)
		net->ipv4.fib_num_tclassid_users--;
#endif
	fib_nh_common_release(&fib_nh->nh_common);
	free_nh_exceptions(fib_nh);
	rt_fibinfo_free_cpus(fib_nh->nh_pcpu_rth_output);
	rt_fibinfo_free(&fib_nh->nh_rth_input);
}

L
Linus Torvalds 已提交
229
/* Release a nexthop info record */
230 231 232 233
static void free_fib_info_rcu(struct rcu_head *head)
{
	struct fib_info *fi = container_of(head, struct fib_info, rcu);

234
	change_nexthops(fi) {
235
		fib_nh_release(fi->fib_net, nexthop_nh);
236 237
	} endfor_nexthops(fi);

238 239
	ip_fib_metrics_put(fi->fib_metrics);

240 241
	kfree(fi);
}
L
Linus Torvalds 已提交
242 243 244 245

void free_fib_info(struct fib_info *fi)
{
	if (fi->fib_dead == 0) {
J
Joe Perches 已提交
246
		pr_warn("Freeing alive fib_info %p\n", fi);
L
Linus Torvalds 已提交
247 248 249
		return;
	}
	fib_info_cnt--;
250

251
	call_rcu(&fi->rcu, free_fib_info_rcu);
L
Linus Torvalds 已提交
252
}
I
Ido Schimmel 已提交
253
EXPORT_SYMBOL_GPL(free_fib_info);
L
Linus Torvalds 已提交
254 255 256

void fib_release_info(struct fib_info *fi)
{
257
	spin_lock_bh(&fib_info_lock);
L
Linus Torvalds 已提交
258 259 260 261 262
	if (fi && --fi->fib_treeref == 0) {
		hlist_del(&fi->fib_hash);
		if (fi->fib_prefsrc)
			hlist_del(&fi->fib_lhash);
		change_nexthops(fi) {
D
David Ahern 已提交
263
			if (!nexthop_nh->fib_nh_dev)
L
Linus Torvalds 已提交
264
				continue;
265
			hlist_del(&nexthop_nh->nh_hash);
L
Linus Torvalds 已提交
266 267 268 269
		} endfor_nexthops(fi)
		fi->fib_dead = 1;
		fib_info_put(fi);
	}
270
	spin_unlock_bh(&fib_info_lock);
L
Linus Torvalds 已提交
271 272
}

E
Eric Dumazet 已提交
273
static inline int nh_comp(const struct fib_info *fi, const struct fib_info *ofi)
L
Linus Torvalds 已提交
274 275 276 277
{
	const struct fib_nh *onh = ofi->fib_nh;

	for_nexthops(fi) {
D
David Ahern 已提交
278 279 280
		if (nh->fib_nh_oif != onh->fib_nh_oif ||
		    nh->fib_nh_gw4 != onh->fib_nh_gw4 ||
		    nh->fib_nh_scope != onh->fib_nh_scope ||
L
Linus Torvalds 已提交
281
#ifdef CONFIG_IP_ROUTE_MULTIPATH
D
David Ahern 已提交
282
		    nh->fib_nh_weight != onh->fib_nh_weight ||
L
Linus Torvalds 已提交
283
#endif
284
#ifdef CONFIG_IP_ROUTE_CLASSID
L
Linus Torvalds 已提交
285 286
		    nh->nh_tclassid != onh->nh_tclassid ||
#endif
D
David Ahern 已提交
287 288
		    lwtunnel_cmp_encap(nh->fib_nh_lws, onh->fib_nh_lws) ||
		    ((nh->fib_nh_flags ^ onh->fib_nh_flags) & ~RTNH_COMPARE_MASK))
L
Linus Torvalds 已提交
289 290 291 292 293 294
			return -1;
		onh++;
	} endfor_nexthops(fi);
	return 0;
}

295 296 297 298 299 300 301 302 303
/* Fold @val (a device ifindex in the visible callers) into a bucket index
 * for fib_info_devhash by XOR-ing it with itself shifted right by one and
 * two hash widths, then masking to DEVINDEX_HASHSIZE buckets.
 */
static inline unsigned int fib_devindex_hashfn(unsigned int val)
{
	unsigned int folded = val;

	folded ^= val >> DEVINDEX_HASHBITS;
	folded ^= val >> (DEVINDEX_HASHBITS * 2);

	return folded & (DEVINDEX_HASHSIZE - 1);
}

L
Linus Torvalds 已提交
304 305
static inline unsigned int fib_info_hashfn(const struct fib_info *fi)
{
306
	unsigned int mask = (fib_info_hash_size - 1);
L
Linus Torvalds 已提交
307 308
	unsigned int val = fi->fib_nhs;

309
	val ^= (fi->fib_protocol << 8) | fi->fib_scope;
A
Al Viro 已提交
310
	val ^= (__force u32)fi->fib_prefsrc;
L
Linus Torvalds 已提交
311
	val ^= fi->fib_priority;
312
	for_nexthops(fi) {
D
David Ahern 已提交
313
		val ^= fib_devindex_hashfn(nh->fib_nh_oif);
314
	} endfor_nexthops(fi)
L
Linus Torvalds 已提交
315 316 317 318 319 320 321 322 323 324 325 326 327

	return (val ^ (val >> 7) ^ (val >> 12)) & mask;
}

static struct fib_info *fib_find_info(const struct fib_info *nfi)
{
	struct hlist_head *head;
	struct fib_info *fi;
	unsigned int hash;

	hash = fib_info_hashfn(nfi);
	head = &fib_info_hash[hash];

328
	hlist_for_each_entry(fi, head, fib_hash) {
O
Octavian Purdila 已提交
329
		if (!net_eq(fi->fib_net, nfi->fib_net))
330
			continue;
L
Linus Torvalds 已提交
331 332 333
		if (fi->fib_nhs != nfi->fib_nhs)
			continue;
		if (nfi->fib_protocol == fi->fib_protocol &&
334
		    nfi->fib_scope == fi->fib_scope &&
L
Linus Torvalds 已提交
335 336
		    nfi->fib_prefsrc == fi->fib_prefsrc &&
		    nfi->fib_priority == fi->fib_priority &&
E
Eric Dumazet 已提交
337
		    nfi->fib_type == fi->fib_type &&
L
Linus Torvalds 已提交
338
		    memcmp(nfi->fib_metrics, fi->fib_metrics,
E
Eric Dumazet 已提交
339
			   sizeof(u32) * RTAX_MAX) == 0 &&
340
		    !((nfi->fib_flags ^ fi->fib_flags) & ~RTNH_COMPARE_MASK) &&
L
Linus Torvalds 已提交
341 342 343 344 345 346 347 348
		    (nfi->fib_nhs == 0 || nh_comp(fi, nfi) == 0))
			return fi;
	}

	return NULL;
}

/* Check, that the gateway is already configured.
E
Eric Dumazet 已提交
349
 * Used only by redirect accept routine.
L
Linus Torvalds 已提交
350
 */
351
int ip_fib_check_default(__be32 gw, struct net_device *dev)
L
Linus Torvalds 已提交
352 353 354 355 356
{
	struct hlist_head *head;
	struct fib_nh *nh;
	unsigned int hash;

357
	spin_lock(&fib_info_lock);
L
Linus Torvalds 已提交
358 359 360

	hash = fib_devindex_hashfn(dev->ifindex);
	head = &fib_info_devhash[hash];
361
	hlist_for_each_entry(nh, head, nh_hash) {
D
David Ahern 已提交
362 363 364
		if (nh->fib_nh_dev == dev &&
		    nh->fib_nh_gw4 == gw &&
		    !(nh->fib_nh_flags & RTNH_F_DEAD)) {
365
			spin_unlock(&fib_info_lock);
L
Linus Torvalds 已提交
366 367 368 369
			return 0;
		}
	}

370
	spin_unlock(&fib_info_lock);
L
Linus Torvalds 已提交
371 372 373 374

	return -1;
}

375 376 377 378 379 380
static inline size_t fib_nlmsg_size(struct fib_info *fi)
{
	size_t payload = NLMSG_ALIGN(sizeof(struct rtmsg))
			 + nla_total_size(4) /* RTA_TABLE */
			 + nla_total_size(4) /* RTA_DST */
			 + nla_total_size(4) /* RTA_PRIORITY */
381 382
			 + nla_total_size(4) /* RTA_PREFSRC */
			 + nla_total_size(TCP_CA_NAME_MAX); /* RTAX_CC_ALGO */
383 384 385 386 387

	/* space for nested metrics */
	payload += nla_total_size((RTAX_MAX * nla_total_size(4)));

	if (fi->fib_nhs) {
388
		size_t nh_encapsize = 0;
389 390 391 392 393 394 395 396
		/* Also handles the special case fib_nhs == 1 */

		/* each nexthop is packed in an attribute */
		size_t nhsize = nla_total_size(sizeof(struct rtnexthop));

		/* may contain flow and gateway attribute */
		nhsize += 2 * nla_total_size(4);

397 398
		/* grab encap info */
		for_nexthops(fi) {
D
David Ahern 已提交
399
			if (nh->fib_nh_lws) {
400 401
				/* RTA_ENCAP_TYPE */
				nh_encapsize += lwtunnel_get_encap_size(
D
David Ahern 已提交
402
						nh->fib_nh_lws);
403 404 405 406 407
				/* RTA_ENCAP */
				nh_encapsize +=  nla_total_size(2);
			}
		} endfor_nexthops(fi);

408
		/* all nexthops are packed in a nested attribute */
409 410 411
		payload += nla_total_size((fi->fib_nhs * nhsize) +
					  nh_encapsize);

412 413 414 415 416
	}

	return payload;
}

A
Al Viro 已提交
417
/* Build a route notification for @fa and send it to the
 * RTNLGRP_IPV4_ROUTE group.
 *
 * The skb is sized with fib_nlmsg_size() and filled by fib_dump_info().
 * If allocation or filling fails, the error is reported to the group's
 * listeners with rtnl_set_sk_err() instead.
 */
void rtmsg_fib(int event, __be32 key, struct fib_alias *fa,
	       int dst_len, u32 tb_id, const struct nl_info *info,
	       unsigned int nlm_flags)
{
	struct sk_buff *skb;
	u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
	int err = -ENOBUFS;

	skb = nlmsg_new(fib_nlmsg_size(fa->fa_info), GFP_KERNEL);
	if (!skb)
		goto errout;

	err = fib_dump_info(skb, info->portid, seq, event, tb_id,
			    fa->fa_type, key, dst_len,
			    fa->fa_tos, fa->fa_info, nlm_flags);
	if (err < 0) {
		/* -EMSGSIZE implies BUG in fib_nlmsg_size() */
		WARN_ON(err == -EMSGSIZE);
		kfree_skb(skb);
		goto errout;
	}
	rtnl_notify(skb, info->nl_net, info->portid, RTNLGRP_IPV4_ROUTE,
		    info->nlh, GFP_KERNEL);
	return;
errout:
	if (err < 0)
		rtnl_set_sk_err(info->nl_net, RTNLGRP_IPV4_ROUTE, err);
}

446 447 448
static int fib_detect_death(struct fib_info *fi, int order,
			    struct fib_info **last_resort, int *last_idx,
			    int dflt)
L
Linus Torvalds 已提交
449 450 451 452
{
	struct neighbour *n;
	int state = NUD_NONE;

D
David Ahern 已提交
453
	n = neigh_lookup(&arp_tbl, &fi->fib_nh[0].fib_nh_gw4, fi->fib_dev);
L
Linus Torvalds 已提交
454 455 456
	if (n) {
		state = n->nud_state;
		neigh_release(n);
457 458
	} else {
		return 0;
L
Linus Torvalds 已提交
459
	}
460
	if (state == NUD_REACHABLE)
L
Linus Torvalds 已提交
461
		return 0;
E
Eric Dumazet 已提交
462
	if ((state & NUD_VALID) && order != dflt)
L
Linus Torvalds 已提交
463
		return 0;
E
Eric Dumazet 已提交
464
	if ((state & NUD_VALID) ||
465
	    (*last_idx < 0 && order > dflt && state != NUD_INCOMPLETE)) {
L
Linus Torvalds 已提交
466 467 468 469 470 471
		*last_resort = fi;
		*last_idx = order;
	}
	return 1;
}

472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495
/* Initialise the lwtunnel part of a common nexthop.
 *
 * When @encap is NULL there is nothing to do and 0 is returned.
 * Otherwise @encap_type must not be LWTUNNEL_ENCAP_NONE (-EINVAL with an
 * extack message); the state is built with lwtunnel_build_state() and a
 * reference to it is stored in @nhc.  Build errors are returned as is.
 */
int fib_nh_common_init(struct fib_nh_common *nhc, struct nlattr *encap,
		       u16 encap_type, void *cfg, gfp_t gfp_flags,
		       struct netlink_ext_ack *extack)
{
	struct lwtunnel_state *state;
	int rc;

	/* No encap attribute supplied: nothing to set up. */
	if (!encap)
		return 0;

	if (encap_type == LWTUNNEL_ENCAP_NONE) {
		NL_SET_ERR_MSG(extack, "LWT encap type not specified");
		return -EINVAL;
	}

	rc = lwtunnel_build_state(encap_type, encap, nhc->nhc_family,
				  cfg, &state, extack);
	if (rc)
		return rc;

	nhc->nhc_lwtstate = lwtstate_get(state);
	return 0;
}
EXPORT_SYMBOL_GPL(fib_nh_common_init);

496 497 498 499 500 501
int fib_nh_init(struct net *net, struct fib_nh *nh,
		struct fib_config *cfg, int nh_weight,
		struct netlink_ext_ack *extack)
{
	int err = -ENOMEM;

502 503
	nh->fib_nh_family = AF_INET;

504 505 506 507
	nh->nh_pcpu_rth_output = alloc_percpu(struct rtable __rcu *);
	if (!nh->nh_pcpu_rth_output)
		goto err_out;

508 509 510 511
	err = fib_nh_common_init(&nh->nh_common, cfg->fc_encap,
				 cfg->fc_encap_type, cfg, GFP_KERNEL, extack);
	if (err)
		goto init_failure;
512

D
David Ahern 已提交
513
	nh->fib_nh_oif = cfg->fc_oif;
514 515
	if (cfg->fc_gw_family == AF_INET) {
		nh->fib_nh_gw4 = cfg->fc_gw4;
516
		nh->fib_nh_gw_family = AF_INET;
517
	}
D
David Ahern 已提交
518
	nh->fib_nh_flags = cfg->fc_flags;
519 520 521 522 523 524 525

#ifdef CONFIG_IP_ROUTE_CLASSID
	nh->nh_tclassid = cfg->fc_flow;
	if (nh->nh_tclassid)
		net->ipv4.fib_num_tclassid_users++;
#endif
#ifdef CONFIG_IP_ROUTE_MULTIPATH
D
David Ahern 已提交
526
	nh->fib_nh_weight = nh_weight;
527 528 529
#endif
	return 0;

530
init_failure:
531 532 533 534 535 536
	rt_fibinfo_free_cpus(nh->nh_pcpu_rth_output);
	nh->nh_pcpu_rth_output = NULL;
err_out:
	return err;
}

L
Linus Torvalds 已提交
537 538
#ifdef CONFIG_IP_ROUTE_MULTIPATH

539 540
/* Count the rtnexthop entries in a multipath attribute of @remaining
 * bytes.  Returns the count, or 0 (with an extack message) when bytes are
 * left over after the last well-formed entry.
 */
static int fib_count_nexthops(struct rtnexthop *rtnh, int remaining,
			      struct netlink_ext_ack *extack)
{
	int nhs = 0;

	while (rtnh_ok(rtnh, remaining)) {
		nhs++;
		rtnh = rtnh_next(rtnh, &remaining);
	}

	/* leftover implies invalid nexthop configuration, discard it */
	if (remaining > 0) {
		NL_SET_ERR_MSG(extack,
			       "Invalid nexthop configuration - extra data after nexthops");
		nhs = 0;
	}

	return nhs;
}

559
static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh,
560 561
		       int remaining, struct fib_config *cfg,
		       struct netlink_ext_ack *extack)
L
Linus Torvalds 已提交
562
{
563 564
	struct net *net = fi->fib_net;
	struct fib_config fib_cfg;
565 566
	int ret;

L
Linus Torvalds 已提交
567
	change_nexthops(fi) {
568 569
		int attrlen;

570 571
		memset(&fib_cfg, 0, sizeof(fib_cfg));

572 573 574
		if (!rtnh_ok(rtnh, remaining)) {
			NL_SET_ERR_MSG(extack,
				       "Invalid nexthop configuration - extra data after nexthop");
L
Linus Torvalds 已提交
575
			return -EINVAL;
576
		}
577

578 579 580
		if (rtnh->rtnh_flags & (RTNH_F_DEAD | RTNH_F_LINKDOWN)) {
			NL_SET_ERR_MSG(extack,
				       "Invalid flags for nexthop - can not contain DEAD or LINKDOWN");
581
			return -EINVAL;
582
		}
583

584 585
		fib_cfg.fc_flags = (cfg->fc_flags & ~0xFF) | rtnh->rtnh_flags;
		fib_cfg.fc_oif = rtnh->rtnh_ifindex;
586 587 588 589 590 591

		attrlen = rtnh_attrlen(rtnh);
		if (attrlen > 0) {
			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);

			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
592 593 594 595
			if (nla) {
				fib_cfg.fc_gw_family = AF_INET;
				fib_cfg.fc_gw4 = nla_get_in_addr(nla);
			}
596

597
			nla = nla_find(attrs, attrlen, RTA_FLOW);
598 599 600 601 602 603 604
			if (nla)
				fib_cfg.fc_flow = nla_get_u32(nla);

			fib_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
			nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
			if (nla)
				fib_cfg.fc_encap_type = nla_get_u16(nla);
L
Linus Torvalds 已提交
605
		}
606

607 608 609 610 611
		ret = fib_nh_init(net, nexthop_nh, &fib_cfg,
				  rtnh->rtnh_hops + 1, extack);
		if (ret)
			goto errout;

612
		rtnh = rtnh_next(rtnh, &remaining);
L
Linus Torvalds 已提交
613
	} endfor_nexthops(fi);
614

615
	ret = -EINVAL;
D
David Ahern 已提交
616
	if (cfg->fc_oif && fi->fib_nh->fib_nh_oif != cfg->fc_oif) {
617 618 619 620
		NL_SET_ERR_MSG(extack,
			       "Nexthop device index does not match RTA_OIF");
		goto errout;
	}
621 622 623 624 625 626 627 628
	if (cfg->fc_gw_family) {
		if (cfg->fc_gw_family != fi->fib_nh->fib_nh_gw_family ||
		    (cfg->fc_gw_family == AF_INET &&
		     fi->fib_nh->fib_nh_gw4 != cfg->fc_gw4)) {
			NL_SET_ERR_MSG(extack,
				       "Nexthop gateway does not match RTA_GATEWAY");
			goto errout;
		}
629 630 631 632 633 634 635 636 637
	}
#ifdef CONFIG_IP_ROUTE_CLASSID
	if (cfg->fc_flow && fi->fib_nh->nh_tclassid != cfg->fc_flow) {
		NL_SET_ERR_MSG(extack,
			       "Nexthop class id does not match RTA_FLOW");
		goto errout;
	}
#endif
	ret = 0;
638 639
errout:
	return ret;
L
Linus Torvalds 已提交
640 641
}

P
Peter Nørlund 已提交
642 643 644 645 646 647 648 649 650 651
/* Recompute fib_nh_upper_bound for every nexthop of a multipath route.
 *
 * Nexthops that are dead, or link-down on a device for which
 * ip_ignore_linkdown() is true, get an upper bound of -1.  The others
 * get the rounded value of (cumulative weight << 31) / total weight,
 * minus 1.  Routes with fewer than two nexthops are left untouched.
 */
static void fib_rebalance(struct fib_info *fi)
{
	int total;
	int w;

	if (fi->fib_nhs < 2)
		return;

	/* First pass: total weight of the usable nexthops. */
	total = 0;
	for_nexthops(fi) {
		if (nh->fib_nh_flags & RTNH_F_DEAD)
			continue;

		if (ip_ignore_linkdown(nh->fib_nh_dev) &&
		    nh->fib_nh_flags & RTNH_F_LINKDOWN)
			continue;

		total += nh->fib_nh_weight;
	} endfor_nexthops(fi);

	/* Second pass: assign cumulative upper bounds. */
	w = 0;
	change_nexthops(fi) {
		int upper_bound;

		if (nexthop_nh->fib_nh_flags & RTNH_F_DEAD) {
			upper_bound = -1;
		} else if (ip_ignore_linkdown(nexthop_nh->fib_nh_dev) &&
			   nexthop_nh->fib_nh_flags & RTNH_F_LINKDOWN) {
			upper_bound = -1;
		} else {
			w += nexthop_nh->fib_nh_weight;
			upper_bound = DIV_ROUND_CLOSEST_ULL((u64)w << 31,
							    total) - 1;
		}

		atomic_set(&nexthop_nh->fib_nh_upper_bound, upper_bound);
	} endfor_nexthops(fi);
}
#else /* CONFIG_IP_ROUTE_MULTIPATH */

682 683 684 685 686 687 688 689 690
/* Stub used when multipath support is compiled out: always rejects the
 * request with -EINVAL and an explanatory extack message.
 */
static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh,
		       int remaining, struct fib_config *cfg,
		       struct netlink_ext_ack *extack)
{
	NL_SET_ERR_MSG(extack, "Multipath support not enabled in kernel");

	return -EINVAL;
}

/* Nothing to rebalance without multipath support. */
#define fib_rebalance(fi) do { } while (0)

#endif /* CONFIG_IP_ROUTE_MULTIPATH */
L
Linus Torvalds 已提交
694

695
/* Compare the encap described by @encap_type/@encap with the lwtunnel
 * state of @nh.
 *
 * Returns 0 when @encap_type is LWTUNNEL_ENCAP_NONE or when building the
 * temporary state fails; otherwise returns the result of
 * lwtunnel_cmp_encap().  The temporary state is always freed.
 */
static int fib_encap_match(u16 encap_type,
			   struct nlattr *encap,
			   const struct fib_nh *nh,
			   const struct fib_config *cfg,
			   struct netlink_ext_ack *extack)
{
	struct lwtunnel_state *lwtstate;
	int ret, result = 0;

	if (encap_type == LWTUNNEL_ENCAP_NONE)
		return 0;

	ret = lwtunnel_build_state(encap_type, encap, AF_INET,
				   cfg, &lwtstate, extack);
	if (!ret) {
		result = lwtunnel_cmp_encap(lwtstate, nh->fib_nh_lws);
		lwtstate_free(lwtstate);
	}

	return result;
}

717 718
int fib_nh_match(struct fib_config *cfg, struct fib_info *fi,
		 struct netlink_ext_ack *extack)
L
Linus Torvalds 已提交
719 720
{
#ifdef CONFIG_IP_ROUTE_MULTIPATH
721 722
	struct rtnexthop *rtnh;
	int remaining;
L
Linus Torvalds 已提交
723 724
#endif

725
	if (cfg->fc_priority && cfg->fc_priority != fi->fib_priority)
L
Linus Torvalds 已提交
726 727
		return 1;

728
	if (cfg->fc_oif || cfg->fc_gw_family) {
729
		if (cfg->fc_encap) {
730 731 732
			if (fib_encap_match(cfg->fc_encap_type, cfg->fc_encap,
					    fi->fib_nh, cfg, extack))
				return 1;
733
		}
734 735 736 737 738
#ifdef CONFIG_IP_ROUTE_CLASSID
		if (cfg->fc_flow &&
		    cfg->fc_flow != fi->fib_nh->nh_tclassid)
			return 1;
#endif
739 740 741 742 743 744 745 746 747 748
		if ((cfg->fc_oif && cfg->fc_oif != fi->fib_nh->fib_nh_oif) ||
		    (cfg->fc_gw_family &&
		     cfg->fc_gw_family != fi->fib_nh->fib_nh_gw_family))
			return 1;

		if (cfg->fc_gw_family == AF_INET &&
		    cfg->fc_gw4 != fi->fib_nh->fib_nh_gw4)
			return 1;

		return 0;
L
Linus Torvalds 已提交
749 750 751
	}

#ifdef CONFIG_IP_ROUTE_MULTIPATH
752
	if (!cfg->fc_mp)
L
Linus Torvalds 已提交
753
		return 0;
754 755 756

	rtnh = cfg->fc_mp;
	remaining = cfg->fc_mp_len;
757

L
Linus Torvalds 已提交
758
	for_nexthops(fi) {
759
		int attrlen;
L
Linus Torvalds 已提交
760

761
		if (!rtnh_ok(rtnh, remaining))
L
Linus Torvalds 已提交
762
			return -EINVAL;
763

D
David Ahern 已提交
764
		if (rtnh->rtnh_ifindex && rtnh->rtnh_ifindex != nh->fib_nh_oif)
L
Linus Torvalds 已提交
765
			return 1;
766 767

		attrlen = rtnh_attrlen(rtnh);
768
		if (attrlen > 0) {
769 770 771
			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);

			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
D
David Ahern 已提交
772
			if (nla && nla_get_in_addr(nla) != nh->fib_nh_gw4)
L
Linus Torvalds 已提交
773
				return 1;
774
#ifdef CONFIG_IP_ROUTE_CLASSID
775 776
			nla = nla_find(attrs, attrlen, RTA_FLOW);
			if (nla && nla_get_u32(nla) != nh->nh_tclassid)
L
Linus Torvalds 已提交
777 778 779
				return 1;
#endif
		}
780 781

		rtnh = rtnh_next(rtnh, &remaining);
L
Linus Torvalds 已提交
782 783 784 785 786
	} endfor_nexthops(fi);
#endif
	return 0;
}

787 788 789 790 791 792 793 794 795 796
/* Check whether every metric listed in cfg->fc_mx equals the value stored
 * in @fi.  Returns true when @cfg carries no metrics or all of them
 * match; false on the first mismatch, on a type above RTAX_MAX, or on a
 * non-CC_ALGO metric whose payload is not exactly a u32.
 */
bool fib_metrics_match(struct fib_config *cfg, struct fib_info *fi)
{
	struct nlattr *nla;
	int remaining;

	if (!cfg->fc_mx)
		return true;

	nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
		int type = nla_type(nla);
		u32 fi_val, val;

		if (!type)
			continue;
		if (type > RTAX_MAX)
			return false;

		if (type == RTAX_CC_ALGO) {
			char tmp[TCP_CA_NAME_MAX];
			bool ecn_ca = false;

			/* The algorithm is given by name; compare by key. */
			nla_strlcpy(tmp, nla, sizeof(tmp));
			val = tcp_ca_get_key_by_name(fi->fib_net, tmp, &ecn_ca);
		} else {
			if (nla_len(nla) != sizeof(u32))
				return false;
			val = nla_get_u32(nla);
		}

		fi_val = fi->fib_metrics->metrics[type - 1];
		/* DST_FEATURE_ECN_CA is masked out of the stored features
		 * before comparing.
		 */
		if (type == RTAX_FEATURES)
			fi_val &= ~DST_FEATURE_ECN_CA;

		if (fi_val != val)
			return false;
	}

	return true;
}

L
Linus Torvalds 已提交
827 828

/*
 * Picture
 * -------
 *
 * The semantics of a nexthop are very messy for historical reasons.
 * We have to take into account that:
 * a) the gateway can actually be a local interface address,
 *    so that a gatewayed route is direct.
 * b) the gateway must be an on-link address, possibly
 *    described not by an ifaddr, but also by a direct route.
 * c) If both gateway and interface are specified, they should not
 *    contradict.
 * d) If we use tunnel routes, the gateway could be not on-link.
 *
 * An attempt to reconcile all of these (alas, self-contradictory)
 * conditions results in pretty ugly and hairy code with obscure logic.
 *
 * I chose to generalize it instead, so that the size
 * of the code does not increase practically, but it becomes
 * much more general.
 * Every prefix is assigned a "scope" value: "host" is a local address,
 * "link" is a direct route,
 * [ ... "site" ... "interior" ... ]
 * and "universe" is a true gateway route with global meaning.
 *
 * Every prefix refers to a set of "nexthop"s (gw, oif),
 * where gw must have narrower scope. This recursion stops
 * when gw has LOCAL scope or if the "nexthop" is declared ONLINK,
 * which means that gw is forced to be on link.
 *
 * The code is still hairy, but now it is apparently logically
 * consistent and very flexible. E.g. as a by-product it allows
 * independent exterior and interior routing processes
 * to co-exist in peace.
 *
 * Normally it looks as follows.
 *
 * {universe prefix}  -> (gw, oif) [scope link]
 *		  |
 *		  |-> {link prefix} -> (gw, oif) [scope local]
 *					|
 *					|-> {local prefix} (terminal node)
 */
871 872
static int fib_check_nh(struct fib_config *cfg, struct fib_nh *nh,
			struct netlink_ext_ack *extack)
L
Linus Torvalds 已提交
873
{
874
	int err = 0;
875
	struct net *net;
E
Eric Dumazet 已提交
876
	struct net_device *dev;
L
Linus Torvalds 已提交
877

878
	net = cfg->fc_nlinfo.nl_net;
D
David Ahern 已提交
879
	if (nh->fib_nh_gw4) {
L
Linus Torvalds 已提交
880 881
		struct fib_result res;

D
David Ahern 已提交
882
		if (nh->fib_nh_flags & RTNH_F_ONLINK) {
D
David Ahern 已提交
883
			unsigned int addr_type;
L
Linus Torvalds 已提交
884

885 886 887
			if (cfg->fc_scope >= RT_SCOPE_LINK) {
				NL_SET_ERR_MSG(extack,
					       "Nexthop has invalid scope");
L
Linus Torvalds 已提交
888
				return -EINVAL;
889
			}
D
David Ahern 已提交
890
			dev = __dev_get_by_index(net, nh->fib_nh_oif);
891 892
			if (!dev) {
				NL_SET_ERR_MSG(extack, "Nexthop device required for onlink");
L
Linus Torvalds 已提交
893
				return -ENODEV;
894
			}
895 896 897
			if (!(dev->flags & IFF_UP)) {
				NL_SET_ERR_MSG(extack,
					       "Nexthop device is not up");
L
Linus Torvalds 已提交
898
				return -ENETDOWN;
899
			}
D
David Ahern 已提交
900 901
			addr_type = inet_addr_type_dev_table(net, dev,
							     nh->fib_nh_gw4);
902 903 904
			if (addr_type != RTN_UNICAST) {
				NL_SET_ERR_MSG(extack,
					       "Nexthop has invalid gateway");
D
David Ahern 已提交
905
				return -EINVAL;
906
			}
907
			if (!netif_carrier_ok(dev))
D
David Ahern 已提交
908 909
				nh->fib_nh_flags |= RTNH_F_LINKDOWN;
			nh->fib_nh_dev = dev;
L
Linus Torvalds 已提交
910
			dev_hold(dev);
D
David Ahern 已提交
911
			nh->fib_nh_scope = RT_SCOPE_LINK;
L
Linus Torvalds 已提交
912 913
			return 0;
		}
E
Eric Dumazet 已提交
914
		rcu_read_lock();
L
Linus Torvalds 已提交
915
		{
916
			struct fib_table *tbl = NULL;
D
David S. Miller 已提交
917
			struct flowi4 fl4 = {
D
David Ahern 已提交
918
				.daddr = nh->fib_nh_gw4,
D
David S. Miller 已提交
919
				.flowi4_scope = cfg->fc_scope + 1,
D
David Ahern 已提交
920
				.flowi4_oif = nh->fib_nh_oif,
921
				.flowi4_iif = LOOPBACK_IFINDEX,
922
			};
L
Linus Torvalds 已提交
923 924

			/* It is not necessary, but requires a bit of thinking */
D
David S. Miller 已提交
925 926
			if (fl4.flowi4_scope < RT_SCOPE_LINK)
				fl4.flowi4_scope = RT_SCOPE_LINK;
927 928 929 930 931 932

			if (cfg->fc_table)
				tbl = fib_get_table(net, cfg->fc_table);

			if (tbl)
				err = fib_table_lookup(tbl, &fl4, &res,
933 934
						       FIB_LOOKUP_IGNORE_LINKSTATE |
						       FIB_LOOKUP_NOREF);
D
David Ahern 已提交
935 936 937 938 939 940

			/* on error or if no table given do full lookup. This
			 * is needed for example when nexthops are in the local
			 * table rather than the given table
			 */
			if (!tbl || err) {
941 942
				err = fib_lookup(net, &fl4, &res,
						 FIB_LOOKUP_IGNORE_LINKSTATE);
D
David Ahern 已提交
943 944
			}

E
Eric Dumazet 已提交
945
			if (err) {
946 947
				NL_SET_ERR_MSG(extack,
					       "Nexthop has invalid gateway");
E
Eric Dumazet 已提交
948
				rcu_read_unlock();
L
Linus Torvalds 已提交
949
				return err;
E
Eric Dumazet 已提交
950
			}
L
Linus Torvalds 已提交
951 952
		}
		err = -EINVAL;
953 954
		if (res.type != RTN_UNICAST && res.type != RTN_LOCAL) {
			NL_SET_ERR_MSG(extack, "Nexthop has invalid gateway");
L
Linus Torvalds 已提交
955
			goto out;
956
		}
D
David Ahern 已提交
957 958 959
		nh->fib_nh_scope = res.scope;
		nh->fib_nh_oif = FIB_RES_OIF(res);
		nh->fib_nh_dev = dev = FIB_RES_DEV(res);
960 961 962
		if (!dev) {
			NL_SET_ERR_MSG(extack,
				       "No egress device for nexthop gateway");
L
Linus Torvalds 已提交
963
			goto out;
964
		}
E
Eric Dumazet 已提交
965
		dev_hold(dev);
966
		if (!netif_carrier_ok(dev))
D
David Ahern 已提交
967
			nh->fib_nh_flags |= RTNH_F_LINKDOWN;
968
		err = (dev->flags & IFF_UP) ? 0 : -ENETDOWN;
L
Linus Torvalds 已提交
969 970 971
	} else {
		struct in_device *in_dev;

D
David Ahern 已提交
972
		if (nh->fib_nh_flags & (RTNH_F_PERVASIVE | RTNH_F_ONLINK)) {
973 974
			NL_SET_ERR_MSG(extack,
				       "Invalid flags for nexthop - PERVASIVE and ONLINK can not be set");
L
Linus Torvalds 已提交
975
			return -EINVAL;
976
		}
977 978
		rcu_read_lock();
		err = -ENODEV;
D
David Ahern 已提交
979
		in_dev = inetdev_by_index(net, nh->fib_nh_oif);
980
		if (!in_dev)
981 982
			goto out;
		err = -ENETDOWN;
983 984
		if (!(in_dev->dev->flags & IFF_UP)) {
			NL_SET_ERR_MSG(extack, "Device for nexthop is not up");
985
			goto out;
986
		}
D
David Ahern 已提交
987 988 989 990 991
		nh->fib_nh_dev = in_dev->dev;
		dev_hold(nh->fib_nh_dev);
		nh->fib_nh_scope = RT_SCOPE_HOST;
		if (!netif_carrier_ok(nh->fib_nh_dev))
			nh->fib_nh_flags |= RTNH_F_LINKDOWN;
992
		err = 0;
L
Linus Torvalds 已提交
993
	}
994 995 996
out:
	rcu_read_unlock();
	return err;
L
Linus Torvalds 已提交
997 998
}

A
Al Viro 已提交
999
/* Map a preferred source address to a bucket index of fib_info_laddrhash.
 *
 * The address is XOR-folded with itself shifted right by 7 and by 14 bits
 * and the result is masked with (fib_info_hash_size - 1).
 */
static inline unsigned int fib_laddr_hashfn(__be32 val)
{
	const u32 addr = (__force u32)val;
	u32 folded = addr;

	folded ^= addr >> 7;
	folded ^= addr >> 14;

	return folded & (fib_info_hash_size - 1);
}

1008
static struct hlist_head *fib_info_hash_alloc(int bytes)
L
Linus Torvalds 已提交
1009 1010
{
	if (bytes <= PAGE_SIZE)
1011
		return kzalloc(bytes, GFP_KERNEL);
L
Linus Torvalds 已提交
1012 1013
	else
		return (struct hlist_head *)
E
Eric Dumazet 已提交
1014 1015
			__get_free_pages(GFP_KERNEL | __GFP_ZERO,
					 get_order(bytes));
L
Linus Torvalds 已提交
1016 1017
}

1018
static void fib_info_hash_free(struct hlist_head *hash, int bytes)
L
Linus Torvalds 已提交
1019 1020 1021 1022 1023 1024 1025 1026 1027 1028
{
	if (!hash)
		return;

	if (bytes <= PAGE_SIZE)
		kfree(hash);
	else
		free_pages((unsigned long) hash, get_order(bytes));
}

1029 1030 1031
static void fib_info_hash_move(struct hlist_head *new_info_hash,
			       struct hlist_head *new_laddrhash,
			       unsigned int new_size)
L
Linus Torvalds 已提交
1032
{
1033
	struct hlist_head *old_info_hash, *old_laddrhash;
1034
	unsigned int old_size = fib_info_hash_size;
1035
	unsigned int i, bytes;
L
Linus Torvalds 已提交
1036

1037
	spin_lock_bh(&fib_info_lock);
1038 1039
	old_info_hash = fib_info_hash;
	old_laddrhash = fib_info_laddrhash;
1040
	fib_info_hash_size = new_size;
L
Linus Torvalds 已提交
1041 1042 1043

	for (i = 0; i < old_size; i++) {
		struct hlist_head *head = &fib_info_hash[i];
1044
		struct hlist_node *n;
L
Linus Torvalds 已提交
1045 1046
		struct fib_info *fi;

1047
		hlist_for_each_entry_safe(fi, n, head, fib_hash) {
L
Linus Torvalds 已提交
1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059
			struct hlist_head *dest;
			unsigned int new_hash;

			new_hash = fib_info_hashfn(fi);
			dest = &new_info_hash[new_hash];
			hlist_add_head(&fi->fib_hash, dest);
		}
	}
	fib_info_hash = new_info_hash;

	for (i = 0; i < old_size; i++) {
		struct hlist_head *lhead = &fib_info_laddrhash[i];
1060
		struct hlist_node *n;
L
Linus Torvalds 已提交
1061 1062
		struct fib_info *fi;

1063
		hlist_for_each_entry_safe(fi, n, lhead, fib_lhash) {
L
Linus Torvalds 已提交
1064 1065 1066 1067 1068 1069 1070 1071 1072 1073
			struct hlist_head *ldest;
			unsigned int new_hash;

			new_hash = fib_laddr_hashfn(fi->fib_prefsrc);
			ldest = &new_laddrhash[new_hash];
			hlist_add_head(&fi->fib_lhash, ldest);
		}
	}
	fib_info_laddrhash = new_laddrhash;

1074
	spin_unlock_bh(&fib_info_lock);
1075 1076

	bytes = old_size * sizeof(struct hlist_head *);
1077 1078
	fib_info_hash_free(old_info_hash, bytes);
	fib_info_hash_free(old_laddrhash, bytes);
L
Linus Torvalds 已提交
1079 1080
}

1081 1082
__be32 fib_info_update_nh_saddr(struct net *net, struct fib_nh *nh)
{
D
David Ahern 已提交
1083 1084
	nh->nh_saddr = inet_select_addr(nh->fib_nh_dev,
					nh->fib_nh_gw4,
1085
					nh->nh_parent->fib_scope);
1086 1087 1088 1089 1090
	nh->nh_saddr_genid = atomic_read(&net->ipv4.dev_addr_genid);

	return nh->nh_saddr;
}

1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105
/* Return the source address to use for lookup result @res.
 *
 * An explicit prefsrc on the fib_info wins.  Otherwise the address cached
 * in the selected nexthop is used when its generation id still matches
 * the namespace's dev_addr_genid; if not, it is refreshed through
 * fib_info_update_nh_saddr().
 */
__be32 fib_result_prefsrc(struct net *net, struct fib_result *res)
{
	const __be32 prefsrc = res->fi->fib_prefsrc;
	struct fib_nh *nh;

	if (prefsrc)
		return prefsrc;

	nh = container_of(res->nhc, struct fib_nh, nh_common);
	if (nh->nh_saddr_genid != atomic_read(&net->ipv4.dev_addr_genid))
		return fib_info_update_nh_saddr(net, nh);

	return nh->nh_saddr;
}

1106 1107 1108 1109
/* Check that @fib_prefsrc may be used as preferred source for @cfg.
 *
 * A local route whose destination equals the prefsrc is always accepted.
 * In every other case the address must resolve to RTN_LOCAL, looked up
 * first in the route's own table (with RT_TABLE_MAIN replaced by
 * RT_TABLE_LOCAL) and then, if that was a different table, in
 * RT_TABLE_LOCAL.
 */
static bool fib_valid_prefsrc(struct fib_config *cfg, __be32 fib_prefsrc)
{
	u32 tb_id;

	if (cfg->fc_type == RTN_LOCAL && cfg->fc_dst &&
	    fib_prefsrc == cfg->fc_dst)
		return true;

	tb_id = cfg->fc_table;
	if (tb_id == RT_TABLE_MAIN)
		tb_id = RT_TABLE_LOCAL;

	if (inet_addr_type_table(cfg->fc_nlinfo.nl_net, fib_prefsrc,
				 tb_id) == RTN_LOCAL)
		return true;

	/* Nothing else to consult if the local table was already searched. */
	if (tb_id == RT_TABLE_LOCAL)
		return false;

	return inet_addr_type_table(cfg->fc_nlinfo.nl_net, fib_prefsrc,
				    RT_TABLE_LOCAL) == RTN_LOCAL;
}

1130 1131
struct fib_info *fib_create_info(struct fib_config *cfg,
				 struct netlink_ext_ack *extack)
L
Linus Torvalds 已提交
1132 1133 1134 1135 1136
{
	int err;
	struct fib_info *fi = NULL;
	struct fib_info *ofi;
	int nhs = 1;
1137
	struct net *net = cfg->fc_nlinfo.nl_net;
L
Linus Torvalds 已提交
1138

1139 1140 1141
	if (cfg->fc_type > RTN_MAX)
		goto err_inval;

L
Linus Torvalds 已提交
1142
	/* Fast check to catch the most weird cases */
1143 1144
	if (fib_props[cfg->fc_type].scope > cfg->fc_scope) {
		NL_SET_ERR_MSG(extack, "Invalid scope");
L
Linus Torvalds 已提交
1145
		goto err_inval;
1146
	}
L
Linus Torvalds 已提交
1147

1148 1149 1150
	if (cfg->fc_flags & (RTNH_F_DEAD | RTNH_F_LINKDOWN)) {
		NL_SET_ERR_MSG(extack,
			       "Invalid rtm_flags - can not contain DEAD or LINKDOWN");
1151
		goto err_inval;
1152
	}
1153

L
Linus Torvalds 已提交
1154
#ifdef CONFIG_IP_ROUTE_MULTIPATH
1155
	if (cfg->fc_mp) {
1156
		nhs = fib_count_nexthops(cfg->fc_mp, cfg->fc_mp_len, extack);
L
Linus Torvalds 已提交
1157 1158 1159 1160 1161 1162
		if (nhs == 0)
			goto err_inval;
	}
#endif

	err = -ENOBUFS;
1163 1164
	if (fib_info_cnt >= fib_info_hash_size) {
		unsigned int new_size = fib_info_hash_size << 1;
L
Linus Torvalds 已提交
1165 1166 1167 1168 1169
		struct hlist_head *new_info_hash;
		struct hlist_head *new_laddrhash;
		unsigned int bytes;

		if (!new_size)
1170
			new_size = 16;
L
Linus Torvalds 已提交
1171
		bytes = new_size * sizeof(struct hlist_head *);
1172 1173
		new_info_hash = fib_info_hash_alloc(bytes);
		new_laddrhash = fib_info_hash_alloc(bytes);
L
Linus Torvalds 已提交
1174
		if (!new_info_hash || !new_laddrhash) {
1175 1176
			fib_info_hash_free(new_info_hash, bytes);
			fib_info_hash_free(new_laddrhash, bytes);
1177
		} else
1178
			fib_info_hash_move(new_info_hash, new_laddrhash, new_size);
L
Linus Torvalds 已提交
1179

1180
		if (!fib_info_hash_size)
L
Linus Torvalds 已提交
1181 1182 1183
			goto failure;
	}

1184
	fi = kzalloc(struct_size(fi, fib_nh, nhs), GFP_KERNEL);
1185
	if (!fi)
L
Linus Torvalds 已提交
1186
		goto failure;
1187
	fi->fib_metrics = ip_fib_metrics_init(fi->fib_net, cfg->fc_mx,
1188
					      cfg->fc_mx_len, extack);
1189 1190 1191 1192
	if (unlikely(IS_ERR(fi->fib_metrics))) {
		err = PTR_ERR(fi->fib_metrics);
		kfree(fi);
		return ERR_PTR(err);
1193
	}
1194

1195
	fib_info_cnt++;
1196
	fi->fib_net = net;
1197
	fi->fib_protocol = cfg->fc_protocol;
1198
	fi->fib_scope = cfg->fc_scope;
1199 1200 1201
	fi->fib_flags = cfg->fc_flags;
	fi->fib_priority = cfg->fc_priority;
	fi->fib_prefsrc = cfg->fc_prefsrc;
E
Eric Dumazet 已提交
1202
	fi->fib_type = cfg->fc_type;
1203
	fi->fib_tb_id = cfg->fc_table;
L
Linus Torvalds 已提交
1204 1205 1206

	fi->fib_nhs = nhs;
	change_nexthops(fi) {
1207
		nexthop_nh->nh_parent = fi;
L
Linus Torvalds 已提交
1208 1209
	} endfor_nexthops(fi)

1210
	if (cfg->fc_mp)
1211
		err = fib_get_nhs(fi, cfg->fc_mp, cfg->fc_mp_len, cfg, extack);
1212 1213
	else
		err = fib_nh_init(net, fi->fib_nh, cfg, 1, extack);
1214

1215 1216
	if (err != 0)
		goto failure;
L
Linus Torvalds 已提交
1217

1218
	if (fib_props[cfg->fc_type].error) {
1219
		if (cfg->fc_gw_family || cfg->fc_oif || cfg->fc_mp) {
1220 1221
			NL_SET_ERR_MSG(extack,
				       "Gateway, device and multipath can not be specified for this route type");
L
Linus Torvalds 已提交
1222
			goto err_inval;
1223
		}
L
Linus Torvalds 已提交
1224
		goto link_it;
1225 1226 1227 1228 1229 1230 1231 1232 1233
	} else {
		switch (cfg->fc_type) {
		case RTN_UNICAST:
		case RTN_LOCAL:
		case RTN_BROADCAST:
		case RTN_ANYCAST:
		case RTN_MULTICAST:
			break;
		default:
1234
			NL_SET_ERR_MSG(extack, "Invalid route type");
1235 1236
			goto err_inval;
		}
L
Linus Torvalds 已提交
1237 1238
	}

1239 1240
	if (cfg->fc_scope > RT_SCOPE_HOST) {
		NL_SET_ERR_MSG(extack, "Invalid scope");
L
Linus Torvalds 已提交
1241
		goto err_inval;
1242
	}
L
Linus Torvalds 已提交
1243

1244
	if (cfg->fc_scope == RT_SCOPE_HOST) {
L
Linus Torvalds 已提交
1245 1246 1247
		struct fib_nh *nh = fi->fib_nh;

		/* Local address is added. */
1248 1249 1250
		if (nhs != 1) {
			NL_SET_ERR_MSG(extack,
				       "Route with host scope can not have multiple nexthops");
1251
			goto err_inval;
1252
		}
1253
		if (nh->fib_nh_gw_family) {
1254 1255
			NL_SET_ERR_MSG(extack,
				       "Route with host scope can not have a gateway");
L
Linus Torvalds 已提交
1256
			goto err_inval;
1257
		}
D
David Ahern 已提交
1258 1259
		nh->fib_nh_scope = RT_SCOPE_NOWHERE;
		nh->fib_nh_dev = dev_get_by_index(net, fi->fib_nh->fib_nh_oif);
L
Linus Torvalds 已提交
1260
		err = -ENODEV;
D
David Ahern 已提交
1261
		if (!nh->fib_nh_dev)
L
Linus Torvalds 已提交
1262 1263
			goto failure;
	} else {
1264 1265
		int linkdown = 0;

L
Linus Torvalds 已提交
1266
		change_nexthops(fi) {
1267
			err = fib_check_nh(cfg, nexthop_nh, extack);
E
Eric Dumazet 已提交
1268
			if (err != 0)
L
Linus Torvalds 已提交
1269
				goto failure;
D
David Ahern 已提交
1270
			if (nexthop_nh->fib_nh_flags & RTNH_F_LINKDOWN)
1271
				linkdown++;
L
Linus Torvalds 已提交
1272
		} endfor_nexthops(fi)
1273 1274
		if (linkdown == fi->fib_nhs)
			fi->fib_flags |= RTNH_F_LINKDOWN;
L
Linus Torvalds 已提交
1275 1276
	}

1277 1278
	if (fi->fib_prefsrc && !fib_valid_prefsrc(cfg, fi->fib_prefsrc)) {
		NL_SET_ERR_MSG(extack, "Invalid prefsrc address");
1279
		goto err_inval;
1280
	}
L
Linus Torvalds 已提交
1281

1282
	change_nexthops(fi) {
1283
		fib_info_update_nh_saddr(net, nexthop_nh);
1284 1285
	} endfor_nexthops(fi)

P
Peter Nørlund 已提交
1286 1287
	fib_rebalance(fi);

L
Linus Torvalds 已提交
1288
link_it:
E
Eric Dumazet 已提交
1289 1290
	ofi = fib_find_info(fi);
	if (ofi) {
L
Linus Torvalds 已提交
1291 1292 1293 1294 1295 1296 1297
		fi->fib_dead = 1;
		free_fib_info(fi);
		ofi->fib_treeref++;
		return ofi;
	}

	fi->fib_treeref++;
1298
	refcount_set(&fi->fib_clntref, 1);
1299
	spin_lock_bh(&fib_info_lock);
L
Linus Torvalds 已提交
1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311
	hlist_add_head(&fi->fib_hash,
		       &fib_info_hash[fib_info_hashfn(fi)]);
	if (fi->fib_prefsrc) {
		struct hlist_head *head;

		head = &fib_info_laddrhash[fib_laddr_hashfn(fi->fib_prefsrc)];
		hlist_add_head(&fi->fib_lhash, head);
	}
	change_nexthops(fi) {
		struct hlist_head *head;
		unsigned int hash;

D
David Ahern 已提交
1312
		if (!nexthop_nh->fib_nh_dev)
L
Linus Torvalds 已提交
1313
			continue;
D
David Ahern 已提交
1314
		hash = fib_devindex_hashfn(nexthop_nh->fib_nh_dev->ifindex);
L
Linus Torvalds 已提交
1315
		head = &fib_info_devhash[hash];
1316
		hlist_add_head(&nexthop_nh->nh_hash, head);
L
Linus Torvalds 已提交
1317
	} endfor_nexthops(fi)
1318
	spin_unlock_bh(&fib_info_lock);
L
Linus Torvalds 已提交
1319 1320 1321 1322 1323 1324
	return fi;

err_inval:
	err = -EINVAL;

failure:
1325
	if (fi) {
L
Linus Torvalds 已提交
1326 1327 1328
		fi->fib_dead = 1;
		free_fib_info(fi);
	}
1329 1330

	return ERR_PTR(err);
L
Linus Torvalds 已提交
1331 1332
}

D
David Ahern 已提交
1333 1334
/* Describe nexthop @nhc in a netlink message.
 *
 * ORs the DEAD, LINKDOWN, ONLINK and OFFLOAD bits of the nexthop into
 * *@flags (LINKDOWN additionally sets DEAD when the device's address
 * family is configured to ignore link-down nexthops), then appends
 * RTA_GATEWAY, RTA_OIF (unless @skip_oif) and the lwtunnel encap
 * attributes to @skb as applicable.
 *
 * Returns 0 on success or -EMSGSIZE when an attribute does not fit.
 */
int fib_nexthop_info(struct sk_buff *skb, const struct fib_nh_common *nhc,
		     unsigned int *flags, bool skip_oif)
{
	*flags |= nhc->nhc_flags & RTNH_F_DEAD;

	if (nhc->nhc_flags & RTNH_F_LINKDOWN) {
		bool ignore = false;

		*flags |= RTNH_F_LINKDOWN;

		rcu_read_lock();
		if (nhc->nhc_family == AF_INET)
			ignore = ip_ignore_linkdown(nhc->nhc_dev);
		else if (nhc->nhc_family == AF_INET6)
			ignore = ip6_ignore_linkdown(nhc->nhc_dev);
		rcu_read_unlock();

		if (ignore)
			*flags |= RTNH_F_DEAD;
	}

	if (nhc->nhc_gw_family == AF_INET) {
		if (nla_put_in_addr(skb, RTA_GATEWAY, nhc->nhc_gw.ipv4))
			return -EMSGSIZE;
	} else if (nhc->nhc_gw_family == AF_INET6) {
		if (nla_put_in6_addr(skb, RTA_GATEWAY, &nhc->nhc_gw.ipv6) < 0)
			return -EMSGSIZE;
	}

	*flags |= nhc->nhc_flags & (RTNH_F_ONLINK | RTNH_F_OFFLOAD);

	if (!skip_oif && nhc->nhc_dev) {
		if (nla_put_u32(skb, RTA_OIF, nhc->nhc_dev->ifindex))
			return -EMSGSIZE;
	}

	if (nhc->nhc_lwtstate) {
		if (lwtunnel_fill_encap(skb, nhc->nhc_lwtstate) < 0)
			return -EMSGSIZE;
	}

	return 0;
}
EXPORT_SYMBOL_GPL(fib_nexthop_info);
1385

D
David Ahern 已提交
1386 1387 1388
#if IS_ENABLED(CONFIG_IP_ROUTE_MULTIPATH) || IS_ENABLED(CONFIG_IPV6)
/* Append one struct rtnexthop entry for @nhc to @skb.
 *
 * The header is reserved first, the nexthop attributes are added behind
 * it through fib_nexthop_info() (without RTA_OIF, since the ifindex is
 * carried in the header), and finally the header's flags and total
 * length are filled in.
 *
 * Returns 0 on success or -EMSGSIZE when @skb has no room left.
 */
int fib_add_nexthop(struct sk_buff *skb, const struct fib_nh_common *nhc,
		    int nh_weight)
{
	const struct net_device *dev = nhc->nhc_dev;
	unsigned int nh_flags = 0;
	struct rtnexthop *hdr;

	hdr = nla_reserve_nohdr(skb, sizeof(*hdr));
	if (!hdr)
		return -EMSGSIZE;

	hdr->rtnh_hops = nh_weight - 1;
	if (dev)
		hdr->rtnh_ifindex = dev->ifindex;
	else
		hdr->rtnh_ifindex = 0;

	if (fib_nexthop_info(skb, nhc, &nh_flags, true) < 0)
		return -EMSGSIZE;

	hdr->rtnh_flags = nh_flags;

	/* length of rtnetlink header + attributes */
	hdr->rtnh_len = nlmsg_get_pos(skb) - (void *)hdr;

	return 0;
}
EXPORT_SYMBOL_GPL(fib_add_nexthop);
#endif
1416

1417
#ifdef CONFIG_IP_ROUTE_MULTIPATH
/* Emit an RTA_MULTIPATH nest holding one entry per nexthop of @fi.
 *
 * With CONFIG_IP_ROUTE_CLASSID, a nexthop with a non-zero nh_tclassid
 * also gets an RTA_FLOW attribute.  On failure the nest is left open.
 *
 * Returns 0 on success or -EMSGSIZE when @skb has no room left.
 */
static int fib_add_multipath(struct sk_buff *skb, struct fib_info *fi)
{
	struct nlattr *nest = nla_nest_start(skb, RTA_MULTIPATH);

	if (!nest)
		return -EMSGSIZE;

	for_nexthops(fi) {
		if (fib_add_nexthop(skb, &nh->nh_common, nh->fib_nh_weight) < 0)
			return -EMSGSIZE;
#ifdef CONFIG_IP_ROUTE_CLASSID
		if (nh->nh_tclassid &&
		    nla_put_u32(skb, RTA_FLOW, nh->nh_tclassid))
			return -EMSGSIZE;
#endif
	} endfor_nexthops(fi);

	nla_nest_end(skb, nest);

	return 0;
}
#else
/* Without multipath support there is nothing to emit. */
static int fib_add_multipath(struct sk_buff *skb, struct fib_info *fi)
{
	return 0;
}
#endif

1450
/* Fill @skb with one rtnetlink route message describing @fi.
 *
 * The rtmsg header is built from the arguments and from @fi, followed by
 * the RTA_TABLE, RTA_DST (non-zero prefix length only), RTA_PRIORITY,
 * metrics and RTA_PREFSRC attributes.  A single-nexthop route has its
 * nexthop attributes placed directly in the message and rtm_flags
 * replaced by the flags reported by fib_nexthop_info(); any other route
 * goes through fib_add_multipath().
 *
 * Returns 0 on success.  If anything does not fit, the partially built
 * message is cancelled and -EMSGSIZE is returned.
 */
int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event,
		  u32 tb_id, u8 type, __be32 dst, int dst_len, u8 tos,
		  struct fib_info *fi, unsigned int flags)
{
	struct nlmsghdr *hdr;
	struct rtmsg *rtm;

	hdr = nlmsg_put(skb, portid, seq, event, sizeof(*rtm), flags);
	if (!hdr)
		return -EMSGSIZE;

	rtm = nlmsg_data(hdr);
	rtm->rtm_family = AF_INET;
	rtm->rtm_dst_len = dst_len;
	rtm->rtm_src_len = 0;
	rtm->rtm_tos = tos;
	/* Table ids of 256 and above are only carried in RTA_TABLE. */
	rtm->rtm_table = tb_id < 256 ? tb_id : RT_TABLE_COMPAT;
	rtm->rtm_type = type;
	rtm->rtm_flags = fi->fib_flags;
	rtm->rtm_scope = fi->fib_scope;
	rtm->rtm_protocol = fi->fib_protocol;

	if (nla_put_u32(skb, RTA_TABLE, tb_id))
		goto cancel;

	if (rtm->rtm_dst_len && nla_put_in_addr(skb, RTA_DST, dst))
		goto cancel;

	if (fi->fib_priority &&
	    nla_put_u32(skb, RTA_PRIORITY, fi->fib_priority))
		goto cancel;

	if (rtnetlink_put_metrics(skb, fi->fib_metrics->metrics) < 0)
		goto cancel;

	if (fi->fib_prefsrc &&
	    nla_put_in_addr(skb, RTA_PREFSRC, fi->fib_prefsrc))
		goto cancel;

	if (fi->fib_nhs != 1) {
		if (fib_add_multipath(skb, fi) < 0)
			goto cancel;
	} else {
		struct fib_nh *nh = &fi->fib_nh[0];
		unsigned int nh_flags = 0;

		if (fib_nexthop_info(skb, &nh->nh_common, &nh_flags, false) < 0)
			goto cancel;

		rtm->rtm_flags = nh_flags;
#ifdef CONFIG_IP_ROUTE_CLASSID
		if (nh->nh_tclassid &&
		    nla_put_u32(skb, RTA_FLOW, nh->nh_tclassid))
			goto cancel;
#endif
	}

	nlmsg_end(skb, hdr);
	return 0;

cancel:
	nlmsg_cancel(skb, hdr);
	return -EMSGSIZE;
}

/*
E
Eric Dumazet 已提交
1516 1517 1518 1519
 * Update FIB if:
 * - local address disappeared -> we must delete all the entries
 *   referring to it.
 * - device went down -> we must shutdown all nexthops going via it.
L
Linus Torvalds 已提交
1520
 */
1521
int fib_sync_down_addr(struct net_device *dev, __be32 local)
L
Linus Torvalds 已提交
1522 1523
{
	int ret = 0;
D
Denis V. Lunev 已提交
1524 1525
	unsigned int hash = fib_laddr_hashfn(local);
	struct hlist_head *head = &fib_info_laddrhash[hash];
1526 1527
	struct net *net = dev_net(dev);
	int tb_id = l3mdev_fib_table(dev);
D
Denis V. Lunev 已提交
1528
	struct fib_info *fi;
L
Linus Torvalds 已提交
1529

1530
	if (!fib_info_laddrhash || local == 0)
D
Denis V. Lunev 已提交
1531
		return 0;
L
Linus Torvalds 已提交
1532

1533
	hlist_for_each_entry(fi, head, fib_lhash) {
1534 1535
		if (!net_eq(fi->fib_net, net) ||
		    fi->fib_tb_id != tb_id)
1536
			continue;
D
Denis V. Lunev 已提交
1537 1538 1539
		if (fi->fib_prefsrc == local) {
			fi->fib_flags |= RTNH_F_DEAD;
			ret++;
L
Linus Torvalds 已提交
1540 1541
		}
	}
D
Denis V. Lunev 已提交
1542 1543 1544
	return ret;
}

D
David Ahern 已提交
1545
static int call_fib_nh_notifiers(struct fib_nh *nh,
1546 1547
				 enum fib_event_type event_type)
{
D
David Ahern 已提交
1548
	bool ignore_link_down = ip_ignore_linkdown(nh->fib_nh_dev);
1549
	struct fib_nh_notifier_info info = {
D
David Ahern 已提交
1550
		.fib_nh = nh,
1551 1552 1553 1554
	};

	switch (event_type) {
	case FIB_EVENT_NH_ADD:
D
David Ahern 已提交
1555
		if (nh->fib_nh_flags & RTNH_F_DEAD)
1556
			break;
D
David Ahern 已提交
1557
		if (ignore_link_down && nh->fib_nh_flags & RTNH_F_LINKDOWN)
1558
			break;
D
David Ahern 已提交
1559
		return call_fib4_notifiers(dev_net(nh->fib_nh_dev), event_type,
1560
					   &info.info);
1561
	case FIB_EVENT_NH_DEL:
D
David Ahern 已提交
1562 1563 1564
		if ((ignore_link_down && nh->fib_nh_flags & RTNH_F_LINKDOWN) ||
		    (nh->fib_nh_flags & RTNH_F_DEAD))
			return call_fib4_notifiers(dev_net(nh->fib_nh_dev),
1565
						   event_type, &info.info);
1566 1567 1568 1569 1570 1571 1572
	default:
		break;
	}

	return NOTIFY_DONE;
}

1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617
/* Update the PMTU of the exceptions of @nh after a device MTU change.
 *
 * An unlocked exception takes the new MTU when:
 * - the new MTU of the first hop is smaller than the PMTU, or
 * - the old MTU was equal to the PMTU, i.e. it was what limited discovery
 *   of larger MTUs on the path; with that limit changed, larger MTUs can
 *   be discovered again.
 * A locked exception (PMTU below the minimal accepted PMTU) is handled
 * specially:
 * - if the new MTU is greater than the PMTU, nothing changes
 * - otherwise the exception is unlocked and takes the new MTU
 */
static void nh_update_mtu(struct fib_nh *nh, u32 new, u32 orig)
{
	struct fnhe_hash_bucket *buckets;
	struct fib_nh_exception *fnhe;
	int slot;

	buckets = rcu_dereference_protected(nh->nh_exceptions, 1);
	if (!buckets)
		return;

	for (slot = 0; slot < FNHE_HASH_SIZE; slot++) {
		for (fnhe = rcu_dereference_protected(buckets[slot].chain, 1);
		     fnhe;
		     fnhe = rcu_dereference_protected(fnhe->fnhe_next, 1)) {
			if (fnhe->fnhe_mtu_locked) {
				if (new > fnhe->fnhe_pmtu)
					continue;
				fnhe->fnhe_mtu_locked = false;
			} else if (new >= fnhe->fnhe_pmtu &&
				   orig != fnhe->fnhe_pmtu) {
				continue;
			}

			fnhe->fnhe_pmtu = new;
		}
	}
}

/* Propagate an MTU change of @dev to the route exceptions of every
 * nexthop that uses it.  @orig_mtu is the MTU the device had before.
 */
void fib_sync_mtu(struct net_device *dev, u32 orig_mtu)
{
	struct hlist_head *bucket;
	struct fib_nh *nh;

	bucket = &fib_info_devhash[fib_devindex_hashfn(dev->ifindex)];

	hlist_for_each_entry(nh, bucket, nh_hash) {
		/* The bucket may hold nexthops of other devices as well. */
		if (nh->fib_nh_dev != dev)
			continue;

		nh_update_mtu(nh, dev->mtu, orig_mtu);
	}
}

1623 1624 1625 1626 1627 1628 1629
/* Event              force Flags           Description
 * NETDEV_CHANGE      0     LINKDOWN        Carrier OFF, not for scope host
 * NETDEV_DOWN        0     LINKDOWN|DEAD   Link down, not for scope host
 * NETDEV_DOWN        1     LINKDOWN|DEAD   Last address removed
 * NETDEV_UNREGISTER  1     LINKDOWN|DEAD   Device removed
 *
 * Flags the nexthops going via @dev according to the table above and,
 * once all nexthops of a fib_info are dead, the fib_info itself.
 * Returns the number of fib_info entries that were flagged as a whole.
 */
int fib_sync_down_dev(struct net_device *dev, unsigned long event, bool force)
{
	/* Without @force, nexthops with scope RT_SCOPE_NOWHERE are left alone;
	 * -1 is a value the scope comparison below is meant never to match.
	 */
	const int skip_scope = force ? -1 : RT_SCOPE_NOWHERE;
	struct fib_info *last_fi = NULL;
	unsigned int down_flags = 0;
	struct hlist_head *bucket;
	struct fib_nh *nh;
	int marked = 0;

	/* Flags implied by the event, identical for nexthops and fib_info. */
	switch (event) {
	case NETDEV_DOWN:
	case NETDEV_UNREGISTER:
		down_flags = RTNH_F_DEAD | RTNH_F_LINKDOWN;
		break;
	case NETDEV_CHANGE:
		down_flags = RTNH_F_LINKDOWN;
		break;
	default:
		break;
	}

	bucket = &fib_info_devhash[fib_devindex_hashfn(dev->ifindex)];

	hlist_for_each_entry(nh, bucket, nh_hash) {
		struct fib_info *fi = nh->nh_parent;
		int dead = 0;

		BUG_ON(!fi->fib_nhs);

		/* Skip other devices, and a fib_info that was just handled. */
		if (nh->fib_nh_dev != dev || fi == last_fi)
			continue;
		last_fi = fi;

		change_nexthops(fi) {
			if (nexthop_nh->fib_nh_flags & RTNH_F_DEAD) {
				dead++;
			} else if (nexthop_nh->fib_nh_dev == dev &&
				   nexthop_nh->fib_nh_scope != skip_scope) {
				nexthop_nh->fib_nh_flags |= down_flags;
				call_fib_nh_notifiers(nexthop_nh,
						      FIB_EVENT_NH_DEL);
				dead++;
			}
#ifdef CONFIG_IP_ROUTE_MULTIPATH
			/* An unregistering device takes the whole route down. */
			if (event == NETDEV_UNREGISTER &&
			    nexthop_nh->fib_nh_dev == dev) {
				dead = fi->fib_nhs;
				break;
			}
#endif
		} endfor_nexthops(fi)

		if (dead == fi->fib_nhs) {
			fi->fib_flags |= down_flags;
			marked++;
		}

		fib_rebalance(fi);
	}

	return marked;
}

1695
/* Must be invoked inside of an RCU protected region.  */
1696
static void fib_select_default(const struct flowi4 *flp, struct fib_result *res)
1697 1698
{
	struct fib_info *fi = NULL, *last_resort = NULL;
1699
	struct hlist_head *fa_head = res->fa_head;
1700
	struct fib_table *tb = res->table;
1701
	u8 slen = 32 - res->prefixlen;
1702
	int order = -1, last_idx = -1;
1703 1704 1705
	struct fib_alias *fa, *fa1 = NULL;
	u32 last_prio = res->fi->fib_priority;
	u8 last_tos = 0;
1706

1707
	hlist_for_each_entry_rcu(fa, fa_head, fa_list) {
1708 1709
		struct fib_info *next_fi = fa->fa_info;

1710 1711
		if (fa->fa_slen != slen)
			continue;
1712 1713
		if (fa->fa_tos && fa->fa_tos != flp->flowi4_tos)
			continue;
1714 1715
		if (fa->tb_id != tb->tb_id)
			continue;
1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726
		if (next_fi->fib_priority > last_prio &&
		    fa->fa_tos == last_tos) {
			if (last_tos)
				continue;
			break;
		}
		if (next_fi->fib_flags & RTNH_F_DEAD)
			continue;
		last_tos = fa->fa_tos;
		last_prio = next_fi->fib_priority;

1727
		if (next_fi->fib_scope != res->scope ||
1728 1729
		    fa->fa_type != RTN_UNICAST)
			continue;
D
David Ahern 已提交
1730 1731
		if (!next_fi->fib_nh[0].fib_nh_gw4 ||
		    next_fi->fib_nh[0].fib_nh_scope != RT_SCOPE_LINK)
1732 1733 1734 1735
			continue;

		fib_alias_accessed(fa);

1736
		if (!fi) {
1737 1738
			if (next_fi != res->fi)
				break;
1739
			fa1 = fa;
1740
		} else if (!fib_detect_death(fi, order, &last_resort,
1741
					     &last_idx, fa1->fa_default)) {
1742
			fib_result_assign(res, fi);
1743
			fa1->fa_default = order;
1744 1745 1746 1747 1748 1749
			goto out;
		}
		fi = next_fi;
		order++;
	}

1750
	if (order <= 0 || !fi) {
1751 1752
		if (fa1)
			fa1->fa_default = -1;
1753 1754 1755 1756
		goto out;
	}

	if (!fib_detect_death(fi, order, &last_resort, &last_idx,
1757
			      fa1->fa_default)) {
1758
		fib_result_assign(res, fi);
1759
		fa1->fa_default = order;
1760 1761 1762 1763 1764
		goto out;
	}

	if (last_idx >= 0)
		fib_result_assign(res, last_resort);
1765
	fa1->fa_default = last_idx;
1766
out:
1767
	return;
1768 1769
}

L
Linus Torvalds 已提交
1770
/*
E
Eric Dumazet 已提交
1771 1772
 * Dead device goes up. We wake up dead nexthops.
 * It takes sense only on multipath routes.
L
Linus Torvalds 已提交
1773
 */
1774
int fib_sync_up(struct net_device *dev, unsigned int nh_flags)
L
Linus Torvalds 已提交
1775 1776 1777 1778 1779 1780 1781
{
	struct fib_info *prev_fi;
	unsigned int hash;
	struct hlist_head *head;
	struct fib_nh *nh;
	int ret;

E
Eric Dumazet 已提交
1782
	if (!(dev->flags & IFF_UP))
L
Linus Torvalds 已提交
1783 1784
		return 0;

1785 1786 1787 1788 1789 1790 1791
	if (nh_flags & RTNH_F_DEAD) {
		unsigned int flags = dev_get_flags(dev);

		if (flags & (IFF_RUNNING | IFF_LOWER_UP))
			nh_flags |= RTNH_F_LINKDOWN;
	}

L
Linus Torvalds 已提交
1792 1793 1794 1795 1796
	prev_fi = NULL;
	hash = fib_devindex_hashfn(dev->ifindex);
	head = &fib_info_devhash[hash];
	ret = 0;

1797
	hlist_for_each_entry(nh, head, nh_hash) {
L
Linus Torvalds 已提交
1798 1799 1800 1801
		struct fib_info *fi = nh->nh_parent;
		int alive;

		BUG_ON(!fi->fib_nhs);
D
David Ahern 已提交
1802
		if (nh->fib_nh_dev != dev || fi == prev_fi)
L
Linus Torvalds 已提交
1803 1804 1805 1806 1807
			continue;

		prev_fi = fi;
		alive = 0;
		change_nexthops(fi) {
D
David Ahern 已提交
1808
			if (!(nexthop_nh->fib_nh_flags & nh_flags)) {
L
Linus Torvalds 已提交
1809 1810 1811
				alive++;
				continue;
			}
D
David Ahern 已提交
1812 1813
			if (!nexthop_nh->fib_nh_dev ||
			    !(nexthop_nh->fib_nh_dev->flags & IFF_UP))
L
Linus Torvalds 已提交
1814
				continue;
D
David Ahern 已提交
1815
			if (nexthop_nh->fib_nh_dev != dev ||
1816
			    !__in_dev_get_rtnl(dev))
L
Linus Torvalds 已提交
1817 1818
				continue;
			alive++;
D
David Ahern 已提交
1819
			nexthop_nh->fib_nh_flags &= ~nh_flags;
1820
			call_fib_nh_notifiers(nexthop_nh, FIB_EVENT_NH_ADD);
L
Linus Torvalds 已提交
1821 1822 1823
		} endfor_nexthops(fi)

		if (alive > 0) {
1824
			fi->fib_flags &= ~nh_flags;
L
Linus Torvalds 已提交
1825 1826
			ret++;
		}
P
Peter Nørlund 已提交
1827 1828

		fib_rebalance(fi);
L
Linus Torvalds 已提交
1829 1830 1831 1832 1833
	}

	return ret;
}

1834
#ifdef CONFIG_IP_ROUTE_MULTIPATH
1835 1836 1837 1838
static bool fib_good_nh(const struct fib_nh *nh)
{
	int state = NUD_REACHABLE;

D
David Ahern 已提交
1839
	if (nh->fib_nh_scope == RT_SCOPE_LINK) {
1840 1841 1842 1843
		struct neighbour *n;

		rcu_read_lock_bh();

D
David Ahern 已提交
1844 1845
		n = __ipv4_neigh_lookup_noref(nh->fib_nh_dev,
					      (__force u32)nh->fib_nh_gw4);
1846 1847 1848 1849 1850 1851 1852 1853
		if (n)
			state = n->nud_state;

		rcu_read_unlock_bh();
	}

	return !!(state & NUD_VALID);
}
1854

P
Peter Nørlund 已提交
1855
void fib_select_multipath(struct fib_result *res, int hash)
L
Linus Torvalds 已提交
1856 1857
{
	struct fib_info *fi = res->fi;
1858 1859
	struct net *net = fi->fib_net;
	bool first = false;
L
Linus Torvalds 已提交
1860

1861
	change_nexthops(fi) {
1862
		if (net->ipv4.sysctl_fib_multipath_use_neigh) {
1863
			if (!fib_good_nh(nexthop_nh))
1864 1865 1866
				continue;
			if (!first) {
				res->nh_sel = nhsel;
1867
				res->nhc = &nexthop_nh->nh_common;
1868 1869 1870 1871
				first = true;
			}
		}

1872
		if (hash > atomic_read(&nexthop_nh->fib_nh_upper_bound))
P
Peter Nørlund 已提交
1873
			continue;
L
Linus Torvalds 已提交
1874

1875
		res->nh_sel = nhsel;
1876
		res->nhc = &nexthop_nh->nh_common;
1877
		return;
L
Linus Torvalds 已提交
1878 1879 1880
	} endfor_nexthops(fi);
}
#endif
1881 1882

void fib_select_path(struct net *net, struct fib_result *res,
1883
		     struct flowi4 *fl4, const struct sk_buff *skb)
1884
{
D
David Ahern 已提交
1885 1886
	if (fl4->flowi4_oif && !(fl4->flowi4_flags & FLOWI_FLAG_SKIP_NH_OIF))
		goto check_saddr;
1887

1888
#ifdef CONFIG_IP_ROUTE_MULTIPATH
D
David Ahern 已提交
1889
	if (res->fi->fib_nhs > 1) {
1890
		int h = fib_multipath_hash(net, fl4, skb, NULL);
1891

1892
		fib_select_multipath(res, h);
1893 1894 1895 1896 1897
	}
	else
#endif
	if (!res->prefixlen &&
	    res->table->tb_num_default > 1 &&
D
David Ahern 已提交
1898
	    res->type == RTN_UNICAST)
1899 1900
		fib_select_default(fl4, res);

D
David Ahern 已提交
1901
check_saddr:
1902
	if (!fl4->saddr)
1903
		fl4->saddr = fib_result_prefsrc(net, res);
1904
}