fib_semantics.c 38.7 KB
Newer Older
L
Linus Torvalds 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		IPv4 Forwarding Information Base: semantics.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */

16
#include <linux/uaccess.h>
L
Linus Torvalds 已提交
17 18 19 20 21 22 23 24 25 26 27
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/jiffies.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
28
#include <linux/inetdevice.h>
L
Linus Torvalds 已提交
29 30 31 32 33
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/proc_fs.h>
#include <linux/skbuff.h>
#include <linux/init.h>
34
#include <linux/slab.h>
L
Linus Torvalds 已提交
35

36
#include <net/arp.h>
L
Linus Torvalds 已提交
37 38 39 40 41 42
#include <net/ip.h>
#include <net/protocol.h>
#include <net/route.h>
#include <net/tcp.h>
#include <net/sock.h>
#include <net/ip_fib.h>
43
#include <net/netlink.h>
44
#include <net/nexthop.h>
45
#include <net/lwtunnel.h>
L
Linus Torvalds 已提交
46 47 48

#include "fib_lookup.h"

49
/*
 * Held while the fib_info hash tables in this file are modified (see
 * fib_release_info() and fib_info_hash_move()) and while
 * ip_fib_check_default() walks fib_info_devhash.
 */
static DEFINE_SPINLOCK(fib_info_lock);
L
Linus Torvalds 已提交
50 51
/* fib_info objects, bucketed by fib_info_hashfn() and linked via fib_hash */
static struct hlist_head *fib_info_hash;
/*
 * fib_info objects bucketed by fib_laddr_hashfn(fib_prefsrc) and linked via
 * fib_lhash; fib_release_info() only unlinks entries with a non-zero
 * fib_prefsrc from this table.
 */
static struct hlist_head *fib_info_laddrhash;
52
/*
 * Number of buckets in fib_info_hash and fib_info_laddrhash.  The hash
 * functions use (fib_info_hash_size - 1) as a bit mask.
 */
static unsigned int fib_info_hash_size;
L
Linus Torvalds 已提交
53 54 55 56 57 58 59 60
/* Incremented in fib_create_info(), decremented in free_fib_info() */
static unsigned int fib_info_cnt;

/*
 * Fixed-size table of nexthops (linked via nh_hash), indexed by
 * fib_devindex_hashfn() of an interface index.
 */
#define DEVINDEX_HASHBITS 8
#define DEVINDEX_HASHSIZE (1U << DEVINDEX_HASHBITS)
static struct hlist_head fib_info_devhash[DEVINDEX_HASHSIZE];

#ifdef CONFIG_IP_ROUTE_MULTIPATH

E
Eric Dumazet 已提交
61 62 63 64 65 66 67 68 69 70 71
/*
 * Open a read-only walk over the nexthops of @fi.  The macro opens a new
 * scope and declares 'nhsel' (index) and 'nh' (const pointer to the current
 * nexthop); the loop body follows the macro and the scope must be closed
 * with endfor_nexthops().
 */
#define for_nexthops(fi) {						\
	int nhsel; const struct fib_nh *nh;				\
	for (nhsel = 0, nh = (fi)->fib_nh;				\
	     nhsel < (fi)->fib_nhs;					\
	     nh++, nhsel++)

/*
 * Same walk as for_nexthops(), but the cursor is the writable
 * 'nexthop_nh' (constness of (fi)->fib_nh is cast away).  Must be closed
 * with endfor_nexthops().
 */
#define change_nexthops(fi) {						\
	int nhsel; struct fib_nh *nexthop_nh;				\
	for (nhsel = 0,	nexthop_nh = (struct fib_nh *)((fi)->fib_nh);	\
	     nhsel < (fi)->fib_nhs;					\
	     nexthop_nh++, nhsel++)
L
Linus Torvalds 已提交
72 73 74 75 76

#else /* CONFIG_IP_ROUTE_MULTIPATH */

/* Hope, that gcc will optimize it to get rid of dummy loop */

E
Eric Dumazet 已提交
77 78 79
/*
 * Non-multipath variant: the body runs exactly once with 'nh' pointing at
 * (fi)->fib_nh.  Must be closed with endfor_nexthops().
 */
#define for_nexthops(fi) {						\
	int nhsel; const struct fib_nh *nh = (fi)->fib_nh;		\
	for (nhsel = 0; nhsel < 1; nhsel++)
L
Linus Torvalds 已提交
80

E
Eric Dumazet 已提交
81 82 83 84
/*
 * Non-multipath variant: the body runs exactly once with the writable
 * 'nexthop_nh' pointing at (fi)->fib_nh.  Must be closed with
 * endfor_nexthops().
 */
#define change_nexthops(fi) {						\
	int nhsel;							\
	struct fib_nh *nexthop_nh = (struct fib_nh *)((fi)->fib_nh);	\
	for (nhsel = 0; nhsel < 1; nhsel++)
L
Linus Torvalds 已提交
85 86 87 88 89 90

#endif /* CONFIG_IP_ROUTE_MULTIPATH */

/* Closes the scope opened by for_nexthops() / change_nexthops() */
#define endfor_nexthops(fi) }


91
const struct fib_prop fib_props[RTN_MAX + 1] = {
E
Eric Dumazet 已提交
92
	[RTN_UNSPEC] = {
L
Linus Torvalds 已提交
93 94
		.error	= 0,
		.scope	= RT_SCOPE_NOWHERE,
E
Eric Dumazet 已提交
95 96
	},
	[RTN_UNICAST] = {
L
Linus Torvalds 已提交
97 98
		.error	= 0,
		.scope	= RT_SCOPE_UNIVERSE,
E
Eric Dumazet 已提交
99 100
	},
	[RTN_LOCAL] = {
L
Linus Torvalds 已提交
101 102
		.error	= 0,
		.scope	= RT_SCOPE_HOST,
E
Eric Dumazet 已提交
103 104
	},
	[RTN_BROADCAST] = {
L
Linus Torvalds 已提交
105 106
		.error	= 0,
		.scope	= RT_SCOPE_LINK,
E
Eric Dumazet 已提交
107 108
	},
	[RTN_ANYCAST] = {
L
Linus Torvalds 已提交
109 110
		.error	= 0,
		.scope	= RT_SCOPE_LINK,
E
Eric Dumazet 已提交
111 112
	},
	[RTN_MULTICAST] = {
L
Linus Torvalds 已提交
113 114
		.error	= 0,
		.scope	= RT_SCOPE_UNIVERSE,
E
Eric Dumazet 已提交
115 116
	},
	[RTN_BLACKHOLE] = {
L
Linus Torvalds 已提交
117 118
		.error	= -EINVAL,
		.scope	= RT_SCOPE_UNIVERSE,
E
Eric Dumazet 已提交
119 120
	},
	[RTN_UNREACHABLE] = {
L
Linus Torvalds 已提交
121 122
		.error	= -EHOSTUNREACH,
		.scope	= RT_SCOPE_UNIVERSE,
E
Eric Dumazet 已提交
123 124
	},
	[RTN_PROHIBIT] = {
L
Linus Torvalds 已提交
125 126
		.error	= -EACCES,
		.scope	= RT_SCOPE_UNIVERSE,
E
Eric Dumazet 已提交
127 128
	},
	[RTN_THROW] = {
L
Linus Torvalds 已提交
129 130
		.error	= -EAGAIN,
		.scope	= RT_SCOPE_UNIVERSE,
E
Eric Dumazet 已提交
131 132
	},
	[RTN_NAT] = {
L
Linus Torvalds 已提交
133 134
		.error	= -EINVAL,
		.scope	= RT_SCOPE_NOWHERE,
E
Eric Dumazet 已提交
135 136
	},
	[RTN_XRESOLVE] = {
L
Linus Torvalds 已提交
137 138
		.error	= -EINVAL,
		.scope	= RT_SCOPE_NOWHERE,
E
Eric Dumazet 已提交
139
	},
L
Linus Torvalds 已提交
140 141
};

142 143 144 145 146 147 148 149 150 151 152 153 154 155 156
/* Free the cached route stored in *@rtp, if there is one. */
static void rt_fibinfo_free(struct rtable __rcu **rtp)
{
	struct rtable *cached = rcu_dereference_protected(*rtp, 1);

	/*
	 * *rtp is deliberately left untouched (no RCU_INIT_POINTER(*rtp,
	 * NULL)): an RCU grace period has already elapsed before
	 * free_fib_info_rcu() gets here.
	 */
	if (cached)
		dst_free(&cached->dst);
}

157 158
static void free_nh_exceptions(struct fib_nh *nh)
{
159
	struct fnhe_hash_bucket *hash;
160 161
	int i;

162 163 164
	hash = rcu_dereference_protected(nh->nh_exceptions, 1);
	if (!hash)
		return;
165 166 167
	for (i = 0; i < FNHE_HASH_SIZE; i++) {
		struct fib_nh_exception *fnhe;

E
Eric Dumazet 已提交
168
		fnhe = rcu_dereference_protected(hash[i].chain, 1);
169 170 171
		while (fnhe) {
			struct fib_nh_exception *next;
			
E
Eric Dumazet 已提交
172
			next = rcu_dereference_protected(fnhe->fnhe_next, 1);
173

174 175
			rt_fibinfo_free(&fnhe->fnhe_rth_input);
			rt_fibinfo_free(&fnhe->fnhe_rth_output);
176

177 178 179 180 181 182 183 184
			kfree(fnhe);

			fnhe = next;
		}
	}
	kfree(hash);
}

185
/*
 * Free the route cached in every possible CPU's slot of @rtp, then the
 * per-cpu area itself.  A NULL @rtp is a no-op.
 */
static void rt_fibinfo_free_cpus(struct rtable __rcu * __percpu *rtp)
{
	int cpu;

	if (!rtp)
		return;

	for_each_possible_cpu(cpu)
		rt_fibinfo_free(per_cpu_ptr(rtp, cpu));

	free_percpu(rtp);
}

L
Linus Torvalds 已提交
202
/* Release a nexthop info record */
203 204 205 206
static void free_fib_info_rcu(struct rcu_head *head)
{
	struct fib_info *fi = container_of(head, struct fib_info, rcu);

207 208 209
	change_nexthops(fi) {
		if (nexthop_nh->nh_dev)
			dev_put(nexthop_nh->nh_dev);
210
		lwtstate_put(nexthop_nh->nh_lwtstate);
211
		free_nh_exceptions(nexthop_nh);
212 213
		rt_fibinfo_free_cpus(nexthop_nh->nh_pcpu_rth_output);
		rt_fibinfo_free(&nexthop_nh->nh_rth_input);
214 215
	} endfor_nexthops(fi);

216 217 218 219
	if (fi->fib_metrics != (u32 *) dst_default_metrics)
		kfree(fi->fib_metrics);
	kfree(fi);
}
L
Linus Torvalds 已提交
220 221 222 223

void free_fib_info(struct fib_info *fi)
{
	if (fi->fib_dead == 0) {
J
Joe Perches 已提交
224
		pr_warn("Freeing alive fib_info %p\n", fi);
L
Linus Torvalds 已提交
225 226 227
		return;
	}
	fib_info_cnt--;
228 229 230
#ifdef CONFIG_IP_ROUTE_CLASSID
	change_nexthops(fi) {
		if (nexthop_nh->nh_tclassid)
231
			fi->fib_net->ipv4.fib_num_tclassid_users--;
232 233
	} endfor_nexthops(fi);
#endif
234
	call_rcu(&fi->rcu, free_fib_info_rcu);
L
Linus Torvalds 已提交
235
}
I
Ido Schimmel 已提交
236
EXPORT_SYMBOL_GPL(free_fib_info);
L
Linus Torvalds 已提交
237 238 239

void fib_release_info(struct fib_info *fi)
{
240
	spin_lock_bh(&fib_info_lock);
L
Linus Torvalds 已提交
241 242 243 244 245
	if (fi && --fi->fib_treeref == 0) {
		hlist_del(&fi->fib_hash);
		if (fi->fib_prefsrc)
			hlist_del(&fi->fib_lhash);
		change_nexthops(fi) {
246
			if (!nexthop_nh->nh_dev)
L
Linus Torvalds 已提交
247
				continue;
248
			hlist_del(&nexthop_nh->nh_hash);
L
Linus Torvalds 已提交
249 250 251 252
		} endfor_nexthops(fi)
		fi->fib_dead = 1;
		fib_info_put(fi);
	}
253
	spin_unlock_bh(&fib_info_lock);
L
Linus Torvalds 已提交
254 255
}

E
Eric Dumazet 已提交
256
static inline int nh_comp(const struct fib_info *fi, const struct fib_info *ofi)
L
Linus Torvalds 已提交
257 258 259 260 261 262 263 264 265 266
{
	const struct fib_nh *onh = ofi->fib_nh;

	for_nexthops(fi) {
		if (nh->nh_oif != onh->nh_oif ||
		    nh->nh_gw  != onh->nh_gw ||
		    nh->nh_scope != onh->nh_scope ||
#ifdef CONFIG_IP_ROUTE_MULTIPATH
		    nh->nh_weight != onh->nh_weight ||
#endif
267
#ifdef CONFIG_IP_ROUTE_CLASSID
L
Linus Torvalds 已提交
268 269
		    nh->nh_tclassid != onh->nh_tclassid ||
#endif
270
		    lwtunnel_cmp_encap(nh->nh_lwtstate, onh->nh_lwtstate) ||
271
		    ((nh->nh_flags ^ onh->nh_flags) & ~RTNH_COMPARE_MASK))
L
Linus Torvalds 已提交
272 273 274 275 276 277
			return -1;
		onh++;
	} endfor_nexthops(fi);
	return 0;
}

278 279 280 281 282 283 284 285 286
/*
 * Fold @val into a bucket index below DEVINDEX_HASHSIZE by XOR-ing it with
 * itself shifted right by one and by two hash widths.
 */
static inline unsigned int fib_devindex_hashfn(unsigned int val)
{
	unsigned int folded = val;

	folded ^= val >> DEVINDEX_HASHBITS;
	folded ^= val >> (DEVINDEX_HASHBITS * 2);

	return folded & (DEVINDEX_HASHSIZE - 1);
}

L
Linus Torvalds 已提交
287 288
static inline unsigned int fib_info_hashfn(const struct fib_info *fi)
{
289
	unsigned int mask = (fib_info_hash_size - 1);
L
Linus Torvalds 已提交
290 291
	unsigned int val = fi->fib_nhs;

292
	val ^= (fi->fib_protocol << 8) | fi->fib_scope;
A
Al Viro 已提交
293
	val ^= (__force u32)fi->fib_prefsrc;
L
Linus Torvalds 已提交
294
	val ^= fi->fib_priority;
295 296 297
	for_nexthops(fi) {
		val ^= fib_devindex_hashfn(nh->nh_oif);
	} endfor_nexthops(fi)
L
Linus Torvalds 已提交
298 299 300 301 302 303 304 305 306 307 308 309 310

	return (val ^ (val >> 7) ^ (val >> 12)) & mask;
}

static struct fib_info *fib_find_info(const struct fib_info *nfi)
{
	struct hlist_head *head;
	struct fib_info *fi;
	unsigned int hash;

	hash = fib_info_hashfn(nfi);
	head = &fib_info_hash[hash];

311
	hlist_for_each_entry(fi, head, fib_hash) {
O
Octavian Purdila 已提交
312
		if (!net_eq(fi->fib_net, nfi->fib_net))
313
			continue;
L
Linus Torvalds 已提交
314 315 316
		if (fi->fib_nhs != nfi->fib_nhs)
			continue;
		if (nfi->fib_protocol == fi->fib_protocol &&
317
		    nfi->fib_scope == fi->fib_scope &&
L
Linus Torvalds 已提交
318 319
		    nfi->fib_prefsrc == fi->fib_prefsrc &&
		    nfi->fib_priority == fi->fib_priority &&
E
Eric Dumazet 已提交
320
		    nfi->fib_type == fi->fib_type &&
L
Linus Torvalds 已提交
321
		    memcmp(nfi->fib_metrics, fi->fib_metrics,
E
Eric Dumazet 已提交
322
			   sizeof(u32) * RTAX_MAX) == 0 &&
323
		    !((nfi->fib_flags ^ fi->fib_flags) & ~RTNH_COMPARE_MASK) &&
L
Linus Torvalds 已提交
324 325 326 327 328 329 330 331
		    (nfi->fib_nhs == 0 || nh_comp(fi, nfi) == 0))
			return fi;
	}

	return NULL;
}

/* Check, that the gateway is already configured.
E
Eric Dumazet 已提交
332
 * Used only by redirect accept routine.
L
Linus Torvalds 已提交
333
 */
334
int ip_fib_check_default(__be32 gw, struct net_device *dev)
L
Linus Torvalds 已提交
335 336 337 338 339
{
	struct hlist_head *head;
	struct fib_nh *nh;
	unsigned int hash;

340
	spin_lock(&fib_info_lock);
L
Linus Torvalds 已提交
341 342 343

	hash = fib_devindex_hashfn(dev->ifindex);
	head = &fib_info_devhash[hash];
344
	hlist_for_each_entry(nh, head, nh_hash) {
L
Linus Torvalds 已提交
345 346
		if (nh->nh_dev == dev &&
		    nh->nh_gw == gw &&
E
Eric Dumazet 已提交
347
		    !(nh->nh_flags & RTNH_F_DEAD)) {
348
			spin_unlock(&fib_info_lock);
L
Linus Torvalds 已提交
349 350 351 352
			return 0;
		}
	}

353
	spin_unlock(&fib_info_lock);
L
Linus Torvalds 已提交
354 355 356 357

	return -1;
}

358 359 360 361 362 363
static inline size_t fib_nlmsg_size(struct fib_info *fi)
{
	size_t payload = NLMSG_ALIGN(sizeof(struct rtmsg))
			 + nla_total_size(4) /* RTA_TABLE */
			 + nla_total_size(4) /* RTA_DST */
			 + nla_total_size(4) /* RTA_PRIORITY */
364 365
			 + nla_total_size(4) /* RTA_PREFSRC */
			 + nla_total_size(TCP_CA_NAME_MAX); /* RTAX_CC_ALGO */
366 367 368 369 370

	/* space for nested metrics */
	payload += nla_total_size((RTAX_MAX * nla_total_size(4)));

	if (fi->fib_nhs) {
371
		size_t nh_encapsize = 0;
372 373 374 375 376 377 378 379
		/* Also handles the special case fib_nhs == 1 */

		/* each nexthop is packed in an attribute */
		size_t nhsize = nla_total_size(sizeof(struct rtnexthop));

		/* may contain flow and gateway attribute */
		nhsize += 2 * nla_total_size(4);

380 381 382 383 384 385 386 387 388 389 390
		/* grab encap info */
		for_nexthops(fi) {
			if (nh->nh_lwtstate) {
				/* RTA_ENCAP_TYPE */
				nh_encapsize += lwtunnel_get_encap_size(
						nh->nh_lwtstate);
				/* RTA_ENCAP */
				nh_encapsize +=  nla_total_size(2);
			}
		} endfor_nexthops(fi);

391
		/* all nexthops are packed in a nested attribute */
392 393 394
		payload += nla_total_size((fi->fib_nhs * nhsize) +
					  nh_encapsize);

395 396 397 398 399
	}

	return payload;
}

A
Al Viro 已提交
400
/*
 * Build a route message for @event describing alias @fa of prefix
 * @key/@dst_len in table @tb_id and send it to the RTNLGRP_IPV4_ROUTE
 * group.  If the message cannot be allocated or filled, the error is
 * reported on the group through rtnl_set_sk_err() instead.
 */
void rtmsg_fib(int event, __be32 key, struct fib_alias *fa,
	       int dst_len, u32 tb_id, const struct nl_info *info,
	       unsigned int nlm_flags)
{
	u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
	struct sk_buff *skb;
	int err;

	skb = nlmsg_new(fib_nlmsg_size(fa->fa_info), GFP_KERNEL);
	if (!skb) {
		rtnl_set_sk_err(info->nl_net, RTNLGRP_IPV4_ROUTE, -ENOBUFS);
		return;
	}

	err = fib_dump_info(skb, info->portid, seq, event, tb_id,
			    fa->fa_type, key, dst_len,
			    fa->fa_tos, fa->fa_info, nlm_flags);
	if (err < 0) {
		/* -EMSGSIZE implies BUG in fib_nlmsg_size() */
		WARN_ON(err == -EMSGSIZE);
		kfree_skb(skb);
		rtnl_set_sk_err(info->nl_net, RTNLGRP_IPV4_ROUTE, err);
		return;
	}

	rtnl_notify(skb, info->nl_net, info->portid, RTNLGRP_IPV4_ROUTE,
		    info->nlh, GFP_KERNEL);
}

429 430 431
static int fib_detect_death(struct fib_info *fi, int order,
			    struct fib_info **last_resort, int *last_idx,
			    int dflt)
L
Linus Torvalds 已提交
432 433 434 435 436 437 438 439
{
	struct neighbour *n;
	int state = NUD_NONE;

	n = neigh_lookup(&arp_tbl, &fi->fib_nh[0].nh_gw, fi->fib_dev);
	if (n) {
		state = n->nud_state;
		neigh_release(n);
440 441
	} else {
		return 0;
L
Linus Torvalds 已提交
442
	}
443
	if (state == NUD_REACHABLE)
L
Linus Torvalds 已提交
444
		return 0;
E
Eric Dumazet 已提交
445
	if ((state & NUD_VALID) && order != dflt)
L
Linus Torvalds 已提交
446
		return 0;
E
Eric Dumazet 已提交
447
	if ((state & NUD_VALID) ||
448
	    (*last_idx < 0 && order > dflt && state != NUD_INCOMPLETE)) {
L
Linus Torvalds 已提交
449 450 451 452 453 454 455 456
		*last_resort = fi;
		*last_idx = order;
	}
	return 1;
}

#ifdef CONFIG_IP_ROUTE_MULTIPATH

457
/*
 * Count the rtnexthop entries in a buffer of @remaining bytes starting at
 * @rtnh.  Returns 0 when bytes are left over after the last valid entry.
 */
static int fib_count_nexthops(struct rtnexthop *rtnh, int remaining)
{
	int count;

	for (count = 0; rtnh_ok(rtnh, remaining); count++)
		rtnh = rtnh_next(rtnh, &remaining);

	/* leftover implies invalid nexthop configuration, discard it */
	if (remaining > 0)
		return 0;

	return count;
}

470 471
static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh,
		       int remaining, struct fib_config *cfg)
L
Linus Torvalds 已提交
472
{
473 474
	int ret;

L
Linus Torvalds 已提交
475
	change_nexthops(fi) {
476 477 478
		int attrlen;

		if (!rtnh_ok(rtnh, remaining))
L
Linus Torvalds 已提交
479
			return -EINVAL;
480

481 482 483
		if (rtnh->rtnh_flags & (RTNH_F_DEAD | RTNH_F_LINKDOWN))
			return -EINVAL;

484 485 486 487
		nexthop_nh->nh_flags =
			(cfg->fc_flags & ~0xFF) | rtnh->rtnh_flags;
		nexthop_nh->nh_oif = rtnh->rtnh_ifindex;
		nexthop_nh->nh_weight = rtnh->rtnh_hops + 1;
488 489 490 491 492 493

		attrlen = rtnh_attrlen(rtnh);
		if (attrlen > 0) {
			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);

			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
494
			nexthop_nh->nh_gw = nla ? nla_get_in_addr(nla) : 0;
495
#ifdef CONFIG_IP_ROUTE_CLASSID
496
			nla = nla_find(attrs, attrlen, RTA_FLOW);
497
			nexthop_nh->nh_tclassid = nla ? nla_get_u32(nla) : 0;
498
			if (nexthop_nh->nh_tclassid)
499
				fi->fib_net->ipv4.fib_num_tclassid_users++;
L
Linus Torvalds 已提交
500
#endif
501 502 503 504 505 506 507 508 509
			nla = nla_find(attrs, attrlen, RTA_ENCAP);
			if (nla) {
				struct lwtunnel_state *lwtstate;
				struct nlattr *nla_entype;

				nla_entype = nla_find(attrs, attrlen,
						      RTA_ENCAP_TYPE);
				if (!nla_entype)
					goto err_inval;
510 511

				ret = lwtunnel_build_state(nla_get_u16(
512
							   nla_entype),
513 514
							   nla,  AF_INET, cfg,
							   &lwtstate);
515 516
				if (ret)
					goto errout;
517 518
				nexthop_nh->nh_lwtstate =
					lwtstate_get(lwtstate);
519
			}
L
Linus Torvalds 已提交
520
		}
521 522

		rtnh = rtnh_next(rtnh, &remaining);
L
Linus Torvalds 已提交
523
	} endfor_nexthops(fi);
524

L
Linus Torvalds 已提交
525
	return 0;
526 527 528 529 530 531

err_inval:
	ret = -EINVAL;

errout:
	return ret;
L
Linus Torvalds 已提交
532 533
}

P
Peter Nørlund 已提交
534 535 536 537 538 539 540 541 542 543 544 545 546 547
/*
 * Recompute nh_upper_bound of every nexthop of @fi from the weights.
 *
 * Nexthops that are RTNH_F_DEAD, or RTNH_F_LINKDOWN on a device configured
 * to ignore routes with link down, take no share and get an upper bound of
 * -1.  Every other nexthop gets the running sum of weights scaled to the
 * 31-bit range, minus one.  Nothing is done for fewer than two nexthops.
 */
static void fib_rebalance(struct fib_info *fi)
{
	struct in_device *idev;
	int weight_sum = 0;
	int running = 0;

	if (fi->fib_nhs < 2)
		return;

	/* first pass: total weight of the usable nexthops */
	for_nexthops(fi) {
		if (nh->nh_flags & RTNH_F_DEAD)
			continue;

		idev = __in_dev_get_rtnl(nh->nh_dev);
		if (idev &&
		    IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(idev) &&
		    (nh->nh_flags & RTNH_F_LINKDOWN))
			continue;

		weight_sum += nh->nh_weight;
	} endfor_nexthops(fi);

	/* second pass: hand out cumulative upper bounds */
	change_nexthops(fi) {
		int bound = -1;
		bool unusable;

		idev = __in_dev_get_rtnl(nexthop_nh->nh_dev);

		unusable = (nexthop_nh->nh_flags & RTNH_F_DEAD) ||
			   (idev &&
			    IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(idev) &&
			    (nexthop_nh->nh_flags & RTNH_F_LINKDOWN));
		if (!unusable) {
			running += nexthop_nh->nh_weight;
			bound = DIV_ROUND_CLOSEST_ULL((u64)running << 31,
						      weight_sum) - 1;
		}

		atomic_set(&nexthop_nh->nh_upper_bound, bound);
	} endfor_nexthops(fi);
}

/* Add the weight of nexthop @nh to the accumulated weight of @fi */
static inline void fib_add_weight(struct fib_info *fi,
				  const struct fib_nh *nh)
{
	fi->fib_weight += nh->nh_weight;
}

#else /* CONFIG_IP_ROUTE_MULTIPATH */

/* Without multipath support both helpers expand to empty statements */
#define fib_rebalance(fi) do { } while (0)
#define fib_add_weight(fi, nh) do { } while (0)

#endif /* CONFIG_IP_ROUTE_MULTIPATH */
L
Linus Torvalds 已提交
592

593
/*
 * Compare the encapsulation described by @encap_type/@encap with the
 * lwtunnel state of @nh by building a temporary state from the attributes.
 *
 * Returns the result of lwtunnel_cmp_encap().  Returns 0 when @encap_type
 * is LWTUNNEL_ENCAP_NONE or when the temporary state cannot be built.
 */
static int fib_encap_match(u16 encap_type,
			   struct nlattr *encap,
			   const struct fib_nh *nh,
			   const struct fib_config *cfg)
{
	struct lwtunnel_state *candidate;
	int differs;

	if (encap_type == LWTUNNEL_ENCAP_NONE)
		return 0;

	if (lwtunnel_build_state(encap_type, encap,
				 AF_INET, cfg, &candidate))
		return 0;

	differs = lwtunnel_cmp_encap(candidate, nh->nh_lwtstate);
	lwtstate_free(candidate);

	return differs;
}

614
int fib_nh_match(struct fib_config *cfg, struct fib_info *fi)
L
Linus Torvalds 已提交
615 616
{
#ifdef CONFIG_IP_ROUTE_MULTIPATH
617 618
	struct rtnexthop *rtnh;
	int remaining;
L
Linus Torvalds 已提交
619 620
#endif

621
	if (cfg->fc_priority && cfg->fc_priority != fi->fib_priority)
L
Linus Torvalds 已提交
622 623
		return 1;

624
	if (cfg->fc_oif || cfg->fc_gw) {
625
		if (cfg->fc_encap) {
626 627
			if (fib_encap_match(cfg->fc_encap_type,
					    cfg->fc_encap, fi->fib_nh, cfg))
628 629
			    return 1;
		}
630 631
		if ((!cfg->fc_oif || cfg->fc_oif == fi->fib_nh->nh_oif) &&
		    (!cfg->fc_gw  || cfg->fc_gw == fi->fib_nh->nh_gw))
L
Linus Torvalds 已提交
632 633 634 635 636
			return 0;
		return 1;
	}

#ifdef CONFIG_IP_ROUTE_MULTIPATH
637
	if (!cfg->fc_mp)
L
Linus Torvalds 已提交
638
		return 0;
639 640 641

	rtnh = cfg->fc_mp;
	remaining = cfg->fc_mp_len;
642

L
Linus Torvalds 已提交
643
	for_nexthops(fi) {
644
		int attrlen;
L
Linus Torvalds 已提交
645

646
		if (!rtnh_ok(rtnh, remaining))
L
Linus Torvalds 已提交
647
			return -EINVAL;
648 649

		if (rtnh->rtnh_ifindex && rtnh->rtnh_ifindex != nh->nh_oif)
L
Linus Torvalds 已提交
650
			return 1;
651 652

		attrlen = rtnh_attrlen(rtnh);
653
		if (attrlen > 0) {
654 655 656
			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);

			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
657
			if (nla && nla_get_in_addr(nla) != nh->nh_gw)
L
Linus Torvalds 已提交
658
				return 1;
659
#ifdef CONFIG_IP_ROUTE_CLASSID
660 661
			nla = nla_find(attrs, attrlen, RTA_FLOW);
			if (nla && nla_get_u32(nla) != nh->nh_tclassid)
L
Linus Torvalds 已提交
662 663 664
				return 1;
#endif
		}
665 666

		rtnh = rtnh_next(rtnh, &remaining);
L
Linus Torvalds 已提交
667 668 669 670 671 672 673
	} endfor_nexthops(fi);
#endif
	return 0;
}


/*
E
Eric Dumazet 已提交
674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714
 * Picture
 * -------
 *
 * Semantics of nexthop is very messy for historical reasons.
 * We have to take into account, that:
 * a) gateway can be actually local interface address,
 *    so that gatewayed route is direct.
 * b) gateway must be on-link address, possibly
 *    described not by an ifaddr, but also by a direct route.
 * c) If both gateway and interface are specified, they should not
 *    contradict.
 * d) If we use tunnel routes, gateway could be not on-link.
 *
 * Attempt to reconcile all of these (alas, self-contradictory) conditions
 * results in pretty ugly and hairy code with obscure logic.
 *
 * I chose to generalize it instead, so that the size
 * of code does not increase practically, but it becomes
 * much more general.
 * Every prefix is assigned a "scope" value: "host" is local address,
 * "link" is direct route,
 * [ ... "site" ... "interior" ... ]
 * and "universe" is true gateway route with global meaning.
 *
 * Every prefix refers to a set of "nexthop"s (gw, oif),
 * where gw must have narrower scope. This recursion stops
 * when gw has LOCAL scope or if "nexthop" is declared ONLINK,
 * which means that gw is forced to be on link.
 *
 * Code is still hairy, but now it is apparently logically
 * consistent and very flexible. E.g. as a by-product it allows
 * independent exterior and interior routing processes to
 * co-exist in peace.
 *
 * Normally it looks as following.
 *
 * {universe prefix}  -> (gw, oif) [scope link]
 *		  |
 *		  |-> {link prefix} -> (gw, oif) [scope local]
 *					|
 *					|-> {local prefix} (terminal node)
L
Linus Torvalds 已提交
715
 */
716 717
static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi,
			struct fib_nh *nh)
L
Linus Torvalds 已提交
718
{
719
	int err = 0;
720
	struct net *net;
E
Eric Dumazet 已提交
721
	struct net_device *dev;
L
Linus Torvalds 已提交
722

723
	net = cfg->fc_nlinfo.nl_net;
L
Linus Torvalds 已提交
724 725 726
	if (nh->nh_gw) {
		struct fib_result res;

E
Eric Dumazet 已提交
727
		if (nh->nh_flags & RTNH_F_ONLINK) {
D
David Ahern 已提交
728
			unsigned int addr_type;
L
Linus Torvalds 已提交
729

730
			if (cfg->fc_scope >= RT_SCOPE_LINK)
L
Linus Torvalds 已提交
731
				return -EINVAL;
E
Eric Dumazet 已提交
732 733
			dev = __dev_get_by_index(net, nh->nh_oif);
			if (!dev)
L
Linus Torvalds 已提交
734
				return -ENODEV;
E
Eric Dumazet 已提交
735
			if (!(dev->flags & IFF_UP))
L
Linus Torvalds 已提交
736
				return -ENETDOWN;
D
David Ahern 已提交
737 738 739
			addr_type = inet_addr_type_dev_table(net, dev, nh->nh_gw);
			if (addr_type != RTN_UNICAST)
				return -EINVAL;
740 741
			if (!netif_carrier_ok(dev))
				nh->nh_flags |= RTNH_F_LINKDOWN;
L
Linus Torvalds 已提交
742 743 744 745 746
			nh->nh_dev = dev;
			dev_hold(dev);
			nh->nh_scope = RT_SCOPE_LINK;
			return 0;
		}
E
Eric Dumazet 已提交
747
		rcu_read_lock();
L
Linus Torvalds 已提交
748
		{
749
			struct fib_table *tbl = NULL;
D
David S. Miller 已提交
750 751 752 753
			struct flowi4 fl4 = {
				.daddr = nh->nh_gw,
				.flowi4_scope = cfg->fc_scope + 1,
				.flowi4_oif = nh->nh_oif,
754
				.flowi4_iif = LOOPBACK_IFINDEX,
755
			};
L
Linus Torvalds 已提交
756 757

			/* It is not necessary, but requires a bit of thinking */
D
David S. Miller 已提交
758 759
			if (fl4.flowi4_scope < RT_SCOPE_LINK)
				fl4.flowi4_scope = RT_SCOPE_LINK;
760 761 762 763 764 765

			if (cfg->fc_table)
				tbl = fib_get_table(net, cfg->fc_table);

			if (tbl)
				err = fib_table_lookup(tbl, &fl4, &res,
766 767
						       FIB_LOOKUP_IGNORE_LINKSTATE |
						       FIB_LOOKUP_NOREF);
D
David Ahern 已提交
768 769 770 771 772 773

			/* on error or if no table given do full lookup. This
			 * is needed for example when nexthops are in the local
			 * table rather than the given table
			 */
			if (!tbl || err) {
774 775
				err = fib_lookup(net, &fl4, &res,
						 FIB_LOOKUP_IGNORE_LINKSTATE);
D
David Ahern 已提交
776 777
			}

E
Eric Dumazet 已提交
778 779
			if (err) {
				rcu_read_unlock();
L
Linus Torvalds 已提交
780
				return err;
E
Eric Dumazet 已提交
781
			}
L
Linus Torvalds 已提交
782 783 784 785 786 787
		}
		err = -EINVAL;
		if (res.type != RTN_UNICAST && res.type != RTN_LOCAL)
			goto out;
		nh->nh_scope = res.scope;
		nh->nh_oif = FIB_RES_OIF(res);
E
Eric Dumazet 已提交
788 789
		nh->nh_dev = dev = FIB_RES_DEV(res);
		if (!dev)
L
Linus Torvalds 已提交
790
			goto out;
E
Eric Dumazet 已提交
791
		dev_hold(dev);
792 793
		if (!netif_carrier_ok(dev))
			nh->nh_flags |= RTNH_F_LINKDOWN;
794
		err = (dev->flags & IFF_UP) ? 0 : -ENETDOWN;
L
Linus Torvalds 已提交
795 796 797
	} else {
		struct in_device *in_dev;

E
Eric Dumazet 已提交
798
		if (nh->nh_flags & (RTNH_F_PERVASIVE | RTNH_F_ONLINK))
L
Linus Torvalds 已提交
799 800
			return -EINVAL;

801 802
		rcu_read_lock();
		err = -ENODEV;
803
		in_dev = inetdev_by_index(net, nh->nh_oif);
804
		if (!in_dev)
805 806 807 808
			goto out;
		err = -ENETDOWN;
		if (!(in_dev->dev->flags & IFF_UP))
			goto out;
L
Linus Torvalds 已提交
809 810 811
		nh->nh_dev = in_dev->dev;
		dev_hold(nh->nh_dev);
		nh->nh_scope = RT_SCOPE_HOST;
812 813
		if (!netif_carrier_ok(nh->nh_dev))
			nh->nh_flags |= RTNH_F_LINKDOWN;
814
		err = 0;
L
Linus Torvalds 已提交
815
	}
816 817 818
out:
	rcu_read_unlock();
	return err;
L
Linus Torvalds 已提交
819 820
}

A
Al Viro 已提交
821
/*
 * Bucket index of address @val in fib_info_laddrhash: the raw 32-bit value
 * XOR-ed with itself shifted right by 7 and by 14, masked with the current
 * table size.
 */
static inline unsigned int fib_laddr_hashfn(__be32 val)
{
	const unsigned int mask = fib_info_hash_size - 1;
	const u32 raw = (__force u32)val;

	return (raw ^ (raw >> 7) ^ (raw >> 14)) & mask;
}

830
static struct hlist_head *fib_info_hash_alloc(int bytes)
L
Linus Torvalds 已提交
831 832
{
	if (bytes <= PAGE_SIZE)
833
		return kzalloc(bytes, GFP_KERNEL);
L
Linus Torvalds 已提交
834 835
	else
		return (struct hlist_head *)
E
Eric Dumazet 已提交
836 837
			__get_free_pages(GFP_KERNEL | __GFP_ZERO,
					 get_order(bytes));
L
Linus Torvalds 已提交
838 839
}

840
static void fib_info_hash_free(struct hlist_head *hash, int bytes)
L
Linus Torvalds 已提交
841 842 843 844 845 846 847 848 849 850
{
	if (!hash)
		return;

	if (bytes <= PAGE_SIZE)
		kfree(hash);
	else
		free_pages((unsigned long) hash, get_order(bytes));
}

851 852 853
static void fib_info_hash_move(struct hlist_head *new_info_hash,
			       struct hlist_head *new_laddrhash,
			       unsigned int new_size)
L
Linus Torvalds 已提交
854
{
855
	struct hlist_head *old_info_hash, *old_laddrhash;
856
	unsigned int old_size = fib_info_hash_size;
857
	unsigned int i, bytes;
L
Linus Torvalds 已提交
858

859
	spin_lock_bh(&fib_info_lock);
860 861
	old_info_hash = fib_info_hash;
	old_laddrhash = fib_info_laddrhash;
862
	fib_info_hash_size = new_size;
L
Linus Torvalds 已提交
863 864 865

	for (i = 0; i < old_size; i++) {
		struct hlist_head *head = &fib_info_hash[i];
866
		struct hlist_node *n;
L
Linus Torvalds 已提交
867 868
		struct fib_info *fi;

869
		hlist_for_each_entry_safe(fi, n, head, fib_hash) {
L
Linus Torvalds 已提交
870 871 872 873 874 875 876 877 878 879 880 881
			struct hlist_head *dest;
			unsigned int new_hash;

			new_hash = fib_info_hashfn(fi);
			dest = &new_info_hash[new_hash];
			hlist_add_head(&fi->fib_hash, dest);
		}
	}
	fib_info_hash = new_info_hash;

	for (i = 0; i < old_size; i++) {
		struct hlist_head *lhead = &fib_info_laddrhash[i];
882
		struct hlist_node *n;
L
Linus Torvalds 已提交
883 884
		struct fib_info *fi;

885
		hlist_for_each_entry_safe(fi, n, lhead, fib_lhash) {
L
Linus Torvalds 已提交
886 887 888 889 890 891 892 893 894 895
			struct hlist_head *ldest;
			unsigned int new_hash;

			new_hash = fib_laddr_hashfn(fi->fib_prefsrc);
			ldest = &new_laddrhash[new_hash];
			hlist_add_head(&fi->fib_lhash, ldest);
		}
	}
	fib_info_laddrhash = new_laddrhash;

896
	spin_unlock_bh(&fib_info_lock);
897 898

	bytes = old_size * sizeof(struct hlist_head *);
899 900
	fib_info_hash_free(old_info_hash, bytes);
	fib_info_hash_free(old_laddrhash, bytes);
L
Linus Torvalds 已提交
901 902
}

903 904 905 906
__be32 fib_info_update_nh_saddr(struct net *net, struct fib_nh *nh)
{
	nh->nh_saddr = inet_select_addr(nh->nh_dev,
					nh->nh_gw,
907
					nh->nh_parent->fib_scope);
908 909 910 911 912
	nh->nh_saddr_genid = atomic_read(&net->ipv4.dev_addr_genid);

	return nh->nh_saddr;
}

913 914 915 916
/*
 * Decide whether @fib_prefsrc is acceptable as preferred source for the
 * route described by @cfg.
 *
 * A local route whose non-zero destination equals @fib_prefsrc is always
 * accepted.  Otherwise the address must classify as RTN_LOCAL, either in
 * the route's table (the main table is replaced by the local table) or, as
 * a fallback, in the local table.
 */
static bool fib_valid_prefsrc(struct fib_config *cfg, __be32 fib_prefsrc)
{
	struct net *net = cfg->fc_nlinfo.nl_net;
	u32 table;
	int type;

	if (cfg->fc_type == RTN_LOCAL && cfg->fc_dst &&
	    fib_prefsrc == cfg->fc_dst)
		return true;

	table = cfg->fc_table;
	if (table == RT_TABLE_MAIN)
		table = RT_TABLE_LOCAL;

	type = inet_addr_type_table(net, fib_prefsrc, table);
	if (type != RTN_LOCAL && table != RT_TABLE_LOCAL)
		type = inet_addr_type_table(net, fib_prefsrc, RT_TABLE_LOCAL);

	return type == RTN_LOCAL;
}

937 938 939
static int
fib_convert_metrics(struct fib_info *fi, const struct fib_config *cfg)
{
940
	bool ecn_ca = false;
941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959
	struct nlattr *nla;
	int remaining;

	if (!cfg->fc_mx)
		return 0;

	nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
		int type = nla_type(nla);
		u32 val;

		if (!type)
			continue;
		if (type > RTAX_MAX)
			return -EINVAL;

		if (type == RTAX_CC_ALGO) {
			char tmp[TCP_CA_NAME_MAX];

			nla_strlcpy(tmp, nla, sizeof(tmp));
960
			val = tcp_ca_get_key_by_name(tmp, &ecn_ca);
961 962 963 964 965 966 967 968 969
			if (val == TCP_CA_UNSPEC)
				return -EINVAL;
		} else {
			val = nla_get_u32(nla);
		}
		if (type == RTAX_ADVMSS && val > 65535 - 40)
			val = 65535 - 40;
		if (type == RTAX_MTU && val > 65535 - 15)
			val = 65535 - 15;
970 971
		if (type == RTAX_HOPLIMIT && val > 255)
			val = 255;
972 973
		if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK))
			return -EINVAL;
974 975 976
		fi->fib_metrics[type - 1] = val;
	}

977 978 979
	if (ecn_ca)
		fi->fib_metrics[RTAX_FEATURES - 1] |= DST_FEATURE_ECN_CA;

980 981 982
	return 0;
}

983
struct fib_info *fib_create_info(struct fib_config *cfg)
L
Linus Torvalds 已提交
984 985 986 987 988
{
	int err;
	struct fib_info *fi = NULL;
	struct fib_info *ofi;
	int nhs = 1;
989
	struct net *net = cfg->fc_nlinfo.nl_net;
L
Linus Torvalds 已提交
990

991 992 993
	if (cfg->fc_type > RTN_MAX)
		goto err_inval;

L
Linus Torvalds 已提交
994
	/* Fast check to catch the most weird cases */
995
	if (fib_props[cfg->fc_type].scope > cfg->fc_scope)
L
Linus Torvalds 已提交
996 997
		goto err_inval;

998 999 1000
	if (cfg->fc_flags & (RTNH_F_DEAD | RTNH_F_LINKDOWN))
		goto err_inval;

L
Linus Torvalds 已提交
1001
#ifdef CONFIG_IP_ROUTE_MULTIPATH
1002 1003
	if (cfg->fc_mp) {
		nhs = fib_count_nexthops(cfg->fc_mp, cfg->fc_mp_len);
L
Linus Torvalds 已提交
1004 1005 1006 1007 1008 1009
		if (nhs == 0)
			goto err_inval;
	}
#endif

	err = -ENOBUFS;
1010 1011
	if (fib_info_cnt >= fib_info_hash_size) {
		unsigned int new_size = fib_info_hash_size << 1;
L
Linus Torvalds 已提交
1012 1013 1014 1015 1016
		struct hlist_head *new_info_hash;
		struct hlist_head *new_laddrhash;
		unsigned int bytes;

		if (!new_size)
1017
			new_size = 16;
L
Linus Torvalds 已提交
1018
		bytes = new_size * sizeof(struct hlist_head *);
1019 1020
		new_info_hash = fib_info_hash_alloc(bytes);
		new_laddrhash = fib_info_hash_alloc(bytes);
L
Linus Torvalds 已提交
1021
		if (!new_info_hash || !new_laddrhash) {
1022 1023
			fib_info_hash_free(new_info_hash, bytes);
			fib_info_hash_free(new_laddrhash, bytes);
1024
		} else
1025
			fib_info_hash_move(new_info_hash, new_laddrhash, new_size);
L
Linus Torvalds 已提交
1026

1027
		if (!fib_info_hash_size)
L
Linus Torvalds 已提交
1028 1029 1030
			goto failure;
	}

1031
	fi = kzalloc(sizeof(*fi)+nhs*sizeof(struct fib_nh), GFP_KERNEL);
1032
	if (!fi)
L
Linus Torvalds 已提交
1033
		goto failure;
1034
	fib_info_cnt++;
1035 1036 1037 1038 1039 1040
	if (cfg->fc_mx) {
		fi->fib_metrics = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
		if (!fi->fib_metrics)
			goto failure;
	} else
		fi->fib_metrics = (u32 *) dst_default_metrics;
L
Linus Torvalds 已提交
1041

1042
	fi->fib_net = net;
1043
	fi->fib_protocol = cfg->fc_protocol;
1044
	fi->fib_scope = cfg->fc_scope;
1045 1046 1047
	fi->fib_flags = cfg->fc_flags;
	fi->fib_priority = cfg->fc_priority;
	fi->fib_prefsrc = cfg->fc_prefsrc;
E
Eric Dumazet 已提交
1048
	fi->fib_type = cfg->fc_type;
1049
	fi->fib_tb_id = cfg->fc_table;
L
Linus Torvalds 已提交
1050 1051 1052

	fi->fib_nhs = nhs;
	change_nexthops(fi) {
1053
		nexthop_nh->nh_parent = fi;
E
Eric Dumazet 已提交
1054
		nexthop_nh->nh_pcpu_rth_output = alloc_percpu(struct rtable __rcu *);
1055 1056
		if (!nexthop_nh->nh_pcpu_rth_output)
			goto failure;
L
Linus Torvalds 已提交
1057 1058
	} endfor_nexthops(fi)

1059 1060 1061
	err = fib_convert_metrics(fi, cfg);
	if (err)
		goto failure;
L
Linus Torvalds 已提交
1062

1063
	if (cfg->fc_mp) {
L
Linus Torvalds 已提交
1064
#ifdef CONFIG_IP_ROUTE_MULTIPATH
1065 1066
		err = fib_get_nhs(fi, cfg->fc_mp, cfg->fc_mp_len, cfg);
		if (err != 0)
L
Linus Torvalds 已提交
1067
			goto failure;
1068
		if (cfg->fc_oif && fi->fib_nh->nh_oif != cfg->fc_oif)
L
Linus Torvalds 已提交
1069
			goto err_inval;
1070
		if (cfg->fc_gw && fi->fib_nh->nh_gw != cfg->fc_gw)
L
Linus Torvalds 已提交
1071
			goto err_inval;
1072
#ifdef CONFIG_IP_ROUTE_CLASSID
1073
		if (cfg->fc_flow && fi->fib_nh->nh_tclassid != cfg->fc_flow)
L
Linus Torvalds 已提交
1074 1075 1076 1077 1078 1079 1080
			goto err_inval;
#endif
#else
		goto err_inval;
#endif
	} else {
		struct fib_nh *nh = fi->fib_nh;
1081

1082 1083 1084 1085 1086
		if (cfg->fc_encap) {
			struct lwtunnel_state *lwtstate;

			if (cfg->fc_encap_type == LWTUNNEL_ENCAP_NONE)
				goto err_inval;
1087
			err = lwtunnel_build_state(cfg->fc_encap_type,
1088 1089
						   cfg->fc_encap, AF_INET, cfg,
						   &lwtstate);
1090 1091 1092
			if (err)
				goto failure;

1093
			nh->nh_lwtstate = lwtstate_get(lwtstate);
1094
		}
1095 1096 1097
		nh->nh_oif = cfg->fc_oif;
		nh->nh_gw = cfg->fc_gw;
		nh->nh_flags = cfg->fc_flags;
1098
#ifdef CONFIG_IP_ROUTE_CLASSID
1099
		nh->nh_tclassid = cfg->fc_flow;
1100
		if (nh->nh_tclassid)
1101
			fi->fib_net->ipv4.fib_num_tclassid_users++;
L
Linus Torvalds 已提交
1102 1103 1104 1105 1106 1107
#endif
#ifdef CONFIG_IP_ROUTE_MULTIPATH
		nh->nh_weight = 1;
#endif
	}

1108 1109
	if (fib_props[cfg->fc_type].error) {
		if (cfg->fc_gw || cfg->fc_oif || cfg->fc_mp)
L
Linus Torvalds 已提交
1110 1111
			goto err_inval;
		goto link_it;
1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122
	} else {
		switch (cfg->fc_type) {
		case RTN_UNICAST:
		case RTN_LOCAL:
		case RTN_BROADCAST:
		case RTN_ANYCAST:
		case RTN_MULTICAST:
			break;
		default:
			goto err_inval;
		}
L
Linus Torvalds 已提交
1123 1124
	}

1125
	if (cfg->fc_scope > RT_SCOPE_HOST)
L
Linus Torvalds 已提交
1126 1127
		goto err_inval;

1128
	if (cfg->fc_scope == RT_SCOPE_HOST) {
L
Linus Torvalds 已提交
1129 1130 1131 1132 1133 1134
		struct fib_nh *nh = fi->fib_nh;

		/* Local address is added. */
		if (nhs != 1 || nh->nh_gw)
			goto err_inval;
		nh->nh_scope = RT_SCOPE_NOWHERE;
1135
		nh->nh_dev = dev_get_by_index(net, fi->fib_nh->nh_oif);
L
Linus Torvalds 已提交
1136
		err = -ENODEV;
1137
		if (!nh->nh_dev)
L
Linus Torvalds 已提交
1138 1139
			goto failure;
	} else {
1140 1141
		int linkdown = 0;

L
Linus Torvalds 已提交
1142
		change_nexthops(fi) {
E
Eric Dumazet 已提交
1143 1144
			err = fib_check_nh(cfg, fi, nexthop_nh);
			if (err != 0)
L
Linus Torvalds 已提交
1145
				goto failure;
1146 1147
			if (nexthop_nh->nh_flags & RTNH_F_LINKDOWN)
				linkdown++;
L
Linus Torvalds 已提交
1148
		} endfor_nexthops(fi)
1149 1150
		if (linkdown == fi->fib_nhs)
			fi->fib_flags |= RTNH_F_LINKDOWN;
L
Linus Torvalds 已提交
1151 1152
	}

1153 1154
	if (fi->fib_prefsrc && !fib_valid_prefsrc(cfg, fi->fib_prefsrc))
		goto err_inval;
L
Linus Torvalds 已提交
1155

1156
	change_nexthops(fi) {
1157
		fib_info_update_nh_saddr(net, nexthop_nh);
P
Peter Nørlund 已提交
1158
		fib_add_weight(fi, nexthop_nh);
1159 1160
	} endfor_nexthops(fi)

P
Peter Nørlund 已提交
1161 1162
	fib_rebalance(fi);

L
Linus Torvalds 已提交
1163
link_it:
E
Eric Dumazet 已提交
1164 1165
	ofi = fib_find_info(fi);
	if (ofi) {
L
Linus Torvalds 已提交
1166 1167 1168 1169 1170 1171 1172 1173
		fi->fib_dead = 1;
		free_fib_info(fi);
		ofi->fib_treeref++;
		return ofi;
	}

	fi->fib_treeref++;
	atomic_inc(&fi->fib_clntref);
1174
	spin_lock_bh(&fib_info_lock);
L
Linus Torvalds 已提交
1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186
	hlist_add_head(&fi->fib_hash,
		       &fib_info_hash[fib_info_hashfn(fi)]);
	if (fi->fib_prefsrc) {
		struct hlist_head *head;

		head = &fib_info_laddrhash[fib_laddr_hashfn(fi->fib_prefsrc)];
		hlist_add_head(&fi->fib_lhash, head);
	}
	change_nexthops(fi) {
		struct hlist_head *head;
		unsigned int hash;

1187
		if (!nexthop_nh->nh_dev)
L
Linus Torvalds 已提交
1188
			continue;
1189
		hash = fib_devindex_hashfn(nexthop_nh->nh_dev->ifindex);
L
Linus Torvalds 已提交
1190
		head = &fib_info_devhash[hash];
1191
		hlist_add_head(&nexthop_nh->nh_hash, head);
L
Linus Torvalds 已提交
1192
	} endfor_nexthops(fi)
1193
	spin_unlock_bh(&fib_info_lock);
L
Linus Torvalds 已提交
1194 1195 1196 1197 1198 1199
	return fi;

err_inval:
	err = -EINVAL;

failure:
1200
	if (fi) {
L
Linus Torvalds 已提交
1201 1202 1203
		fi->fib_dead = 1;
		free_fib_info(fi);
	}
1204 1205

	return ERR_PTR(err);
L
Linus Torvalds 已提交
1206 1207
}

1208
/*
 * Append a route message describing @fi to @skb.
 *
 * A struct rtmsg header is emitted first, followed by RTA_TABLE and the
 * optional RTA_DST, RTA_PRIORITY, metrics and RTA_PREFSRC attributes.
 * A single-nexthop route then carries its gateway/oif/flow/encap inline;
 * with CONFIG_IP_ROUTE_MULTIPATH a route with several nexthops carries
 * them nested in RTA_MULTIPATH.
 *
 * Returns 0 on success.  If the message does not fit, anything added is
 * cancelled and -EMSGSIZE is returned.
 */
int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event,
		  u32 tb_id, u8 type, __be32 dst, int dst_len, u8 tos,
		  struct fib_info *fi, unsigned int flags)
{
	struct nlmsghdr *nlh;
	struct rtmsg *rtm;

	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*rtm), flags);
	if (!nlh)
		return -EMSGSIZE;

	rtm = nlmsg_data(nlh);
	rtm->rtm_family = AF_INET;
	rtm->rtm_dst_len = dst_len;
	rtm->rtm_src_len = 0;
	rtm->rtm_tos = tos;
	/* Table ids that do not fit the header field go in RTA_TABLE only. */
	rtm->rtm_table = tb_id < 256 ? tb_id : RT_TABLE_COMPAT;
	rtm->rtm_type = type;
	rtm->rtm_flags = fi->fib_flags;
	rtm->rtm_scope = fi->fib_scope;
	rtm->rtm_protocol = fi->fib_protocol;

	if (nla_put_u32(skb, RTA_TABLE, tb_id))
		goto nla_put_failure;

	if (rtm->rtm_dst_len && nla_put_in_addr(skb, RTA_DST, dst))
		goto nla_put_failure;

	if (fi->fib_priority &&
	    nla_put_u32(skb, RTA_PRIORITY, fi->fib_priority))
		goto nla_put_failure;

	if (rtnetlink_put_metrics(skb, fi->fib_metrics) < 0)
		goto nla_put_failure;

	if (fi->fib_prefsrc &&
	    nla_put_in_addr(skb, RTA_PREFSRC, fi->fib_prefsrc))
		goto nla_put_failure;

	if (fi->fib_nhs == 1) {
		struct fib_nh *only_nh = fi->fib_nh;

		if (only_nh->nh_gw &&
		    nla_put_in_addr(skb, RTA_GATEWAY, only_nh->nh_gw))
			goto nla_put_failure;

		if (only_nh->nh_oif &&
		    nla_put_u32(skb, RTA_OIF, only_nh->nh_oif))
			goto nla_put_failure;

		/*
		 * Report a link-down nexthop as dead when its device is
		 * configured to ignore routes with link down.
		 */
		if (only_nh->nh_flags & RTNH_F_LINKDOWN) {
			struct in_device *in_dev;

			in_dev = __in_dev_get_rtnl(only_nh->nh_dev);
			if (in_dev &&
			    IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev))
				rtm->rtm_flags |= RTNH_F_DEAD;
		}
#ifdef CONFIG_IP_ROUTE_CLASSID
		if (only_nh->nh_tclassid &&
		    nla_put_u32(skb, RTA_FLOW, only_nh->nh_tclassid))
			goto nla_put_failure;
#endif
		if (only_nh->nh_lwtstate &&
		    lwtunnel_fill_encap(skb, only_nh->nh_lwtstate) < 0)
			goto nla_put_failure;
	}
#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (fi->fib_nhs > 1) {
		struct nlattr *mp = nla_nest_start(skb, RTA_MULTIPATH);
		struct rtnexthop *rtnh;

		if (!mp)
			goto nla_put_failure;

		for_nexthops(fi) {
			rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
			if (!rtnh)
				goto nla_put_failure;

			rtnh->rtnh_flags = nh->nh_flags & 0xFF;
			if (nh->nh_flags & RTNH_F_LINKDOWN) {
				struct in_device *in_dev;

				in_dev = __in_dev_get_rtnl(nh->nh_dev);
				if (in_dev &&
				    IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev))
					rtnh->rtnh_flags |= RTNH_F_DEAD;
			}
			rtnh->rtnh_hops = nh->nh_weight - 1;
			rtnh->rtnh_ifindex = nh->nh_oif;

			if (nh->nh_gw &&
			    nla_put_in_addr(skb, RTA_GATEWAY, nh->nh_gw))
				goto nla_put_failure;
#ifdef CONFIG_IP_ROUTE_CLASSID
			if (nh->nh_tclassid &&
			    nla_put_u32(skb, RTA_FLOW, nh->nh_tclassid))
				goto nla_put_failure;
#endif
			if (nh->nh_lwtstate &&
			    lwtunnel_fill_encap(skb, nh->nh_lwtstate) < 0)
				goto nla_put_failure;

			/* length of rtnetlink header + attributes */
			rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *) rtnh;
		} endfor_nexthops(fi);

		nla_nest_end(skb, mp);
	}
#endif
	nlmsg_end(skb, nlh);
	return 0;

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}

/*
E
Eric Dumazet 已提交
1325 1326 1327 1328
 * Update FIB if:
 * - local address disappeared -> we must delete all the entries
 *   referring to it.
 * - device went down -> we must shutdown all nexthops going via it.
L
Linus Torvalds 已提交
1329
 */
1330
int fib_sync_down_addr(struct net_device *dev, __be32 local)
L
Linus Torvalds 已提交
1331 1332
{
	int ret = 0;
D
Denis V. Lunev 已提交
1333 1334
	unsigned int hash = fib_laddr_hashfn(local);
	struct hlist_head *head = &fib_info_laddrhash[hash];
1335 1336
	struct net *net = dev_net(dev);
	int tb_id = l3mdev_fib_table(dev);
D
Denis V. Lunev 已提交
1337
	struct fib_info *fi;
L
Linus Torvalds 已提交
1338

1339
	if (!fib_info_laddrhash || local == 0)
D
Denis V. Lunev 已提交
1340
		return 0;
L
Linus Torvalds 已提交
1341

1342
	hlist_for_each_entry(fi, head, fib_lhash) {
1343 1344
		if (!net_eq(fi->fib_net, net) ||
		    fi->fib_tb_id != tb_id)
1345
			continue;
D
Denis V. Lunev 已提交
1346 1347 1348
		if (fi->fib_prefsrc == local) {
			fi->fib_flags |= RTNH_F_DEAD;
			ret++;
L
Linus Torvalds 已提交
1349 1350
		}
	}
D
Denis V. Lunev 已提交
1351 1352 1353
	return ret;
}

1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383
static int call_fib_nh_notifiers(struct fib_nh *fib_nh,
				 enum fib_event_type event_type)
{
	struct in_device *in_dev = __in_dev_get_rtnl(fib_nh->nh_dev);
	struct fib_nh_notifier_info info = {
		.fib_nh = fib_nh,
	};

	switch (event_type) {
	case FIB_EVENT_NH_ADD:
		if (fib_nh->nh_flags & RTNH_F_DEAD)
			break;
		if (IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev) &&
		    fib_nh->nh_flags & RTNH_F_LINKDOWN)
			break;
		return call_fib_notifiers(dev_net(fib_nh->nh_dev), event_type,
					  &info.info);
	case FIB_EVENT_NH_DEL:
		if ((IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev) &&
		     fib_nh->nh_flags & RTNH_F_LINKDOWN) ||
		    (fib_nh->nh_flags & RTNH_F_DEAD))
			return call_fib_notifiers(dev_net(fib_nh->nh_dev),
						  event_type, &info.info);
	default:
		break;
	}

	return NOTIFY_DONE;
}

1384 1385 1386 1387 1388 1389 1390
/* Event              force Flags           Description
 * NETDEV_CHANGE      0     LINKDOWN        Carrier OFF, not for scope host
 * NETDEV_DOWN        0     LINKDOWN|DEAD   Link down, not for scope host
 * NETDEV_DOWN        1     LINKDOWN|DEAD   Last address removed
 * NETDEV_UNREGISTER  1     LINKDOWN|DEAD   Device removed
 *
 * Walks every nexthop hashed under @dev, flags the affected nexthops,
 * and flags the owning fib_info as well once all of its nexthops are
 * down.  Returns the number of fib_info entries that became fully down.
 */
int fib_sync_down_dev(struct net_device *dev, unsigned long event, bool force)
{
	unsigned int hash = fib_devindex_hashfn(dev->ifindex);
	struct hlist_head *bucket = &fib_info_devhash[hash];
	/* With @force no nexthop scope is exempt (-1 matches none). */
	int exempt_scope = force ? -1 : RT_SCOPE_NOWHERE;
	struct fib_info *last_fi = NULL;
	unsigned int down_flags;
	struct fib_nh *nh;
	int ret = 0;

	/* Flags implied by the event; see the table above. */
	switch (event) {
	case NETDEV_DOWN:
	case NETDEV_UNREGISTER:
		down_flags = RTNH_F_DEAD | RTNH_F_LINKDOWN;
		break;
	case NETDEV_CHANGE:
		down_flags = RTNH_F_LINKDOWN;
		break;
	default:
		down_flags = 0;
		break;
	}

	hlist_for_each_entry(nh, bucket, nh_hash) {
		struct fib_info *fi = nh->nh_parent;
		int dead = 0;

		BUG_ON(!fi->fib_nhs);
		/* Handle each fib_info once, however many nexthops hit. */
		if (nh->nh_dev != dev || fi == last_fi)
			continue;
		last_fi = fi;

		change_nexthops(fi) {
			if (nexthop_nh->nh_flags & RTNH_F_DEAD) {
				dead++;
			} else if (nexthop_nh->nh_dev == dev &&
				   nexthop_nh->nh_scope != exempt_scope) {
				nexthop_nh->nh_flags |= down_flags;
				call_fib_nh_notifiers(nexthop_nh,
						      FIB_EVENT_NH_DEL);
				dead++;
			}
#ifdef CONFIG_IP_ROUTE_MULTIPATH
			/* A vanished device takes the whole route with it. */
			if (event == NETDEV_UNREGISTER &&
			    nexthop_nh->nh_dev == dev) {
				dead = fi->fib_nhs;
				break;
			}
#endif
		} endfor_nexthops(fi)

		if (dead == fi->fib_nhs) {
			fi->fib_flags |= down_flags;
			ret++;
		}

		fib_rebalance(fi);
	}

	return ret;
}

1456
/* Must be invoked inside of an RCU protected region.  */
1457
static void fib_select_default(const struct flowi4 *flp, struct fib_result *res)
1458 1459
{
	struct fib_info *fi = NULL, *last_resort = NULL;
1460
	struct hlist_head *fa_head = res->fa_head;
1461
	struct fib_table *tb = res->table;
1462
	u8 slen = 32 - res->prefixlen;
1463
	int order = -1, last_idx = -1;
1464 1465 1466
	struct fib_alias *fa, *fa1 = NULL;
	u32 last_prio = res->fi->fib_priority;
	u8 last_tos = 0;
1467

1468
	hlist_for_each_entry_rcu(fa, fa_head, fa_list) {
1469 1470
		struct fib_info *next_fi = fa->fa_info;

1471 1472
		if (fa->fa_slen != slen)
			continue;
1473 1474
		if (fa->fa_tos && fa->fa_tos != flp->flowi4_tos)
			continue;
1475 1476
		if (fa->tb_id != tb->tb_id)
			continue;
1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487
		if (next_fi->fib_priority > last_prio &&
		    fa->fa_tos == last_tos) {
			if (last_tos)
				continue;
			break;
		}
		if (next_fi->fib_flags & RTNH_F_DEAD)
			continue;
		last_tos = fa->fa_tos;
		last_prio = next_fi->fib_priority;

1488
		if (next_fi->fib_scope != res->scope ||
1489 1490 1491 1492 1493 1494 1495 1496
		    fa->fa_type != RTN_UNICAST)
			continue;
		if (!next_fi->fib_nh[0].nh_gw ||
		    next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
			continue;

		fib_alias_accessed(fa);

1497
		if (!fi) {
1498 1499
			if (next_fi != res->fi)
				break;
1500
			fa1 = fa;
1501
		} else if (!fib_detect_death(fi, order, &last_resort,
1502
					     &last_idx, fa1->fa_default)) {
1503
			fib_result_assign(res, fi);
1504
			fa1->fa_default = order;
1505 1506 1507 1508 1509 1510
			goto out;
		}
		fi = next_fi;
		order++;
	}

1511
	if (order <= 0 || !fi) {
1512 1513
		if (fa1)
			fa1->fa_default = -1;
1514 1515 1516 1517
		goto out;
	}

	if (!fib_detect_death(fi, order, &last_resort, &last_idx,
1518
			      fa1->fa_default)) {
1519
		fib_result_assign(res, fi);
1520
		fa1->fa_default = order;
1521 1522 1523 1524 1525
		goto out;
	}

	if (last_idx >= 0)
		fib_result_assign(res, last_resort);
1526
	fa1->fa_default = last_idx;
1527
out:
1528
	return;
1529 1530
}

L
Linus Torvalds 已提交
1531
/*
E
Eric Dumazet 已提交
1532 1533
 * Dead device goes up. We wake up dead nexthops.
 * It takes sense only on multipath routes.
L
Linus Torvalds 已提交
1534
 */
1535
int fib_sync_up(struct net_device *dev, unsigned int nh_flags)
L
Linus Torvalds 已提交
1536 1537 1538 1539 1540 1541 1542
{
	struct fib_info *prev_fi;
	unsigned int hash;
	struct hlist_head *head;
	struct fib_nh *nh;
	int ret;

E
Eric Dumazet 已提交
1543
	if (!(dev->flags & IFF_UP))
L
Linus Torvalds 已提交
1544 1545
		return 0;

1546 1547 1548 1549 1550 1551 1552
	if (nh_flags & RTNH_F_DEAD) {
		unsigned int flags = dev_get_flags(dev);

		if (flags & (IFF_RUNNING | IFF_LOWER_UP))
			nh_flags |= RTNH_F_LINKDOWN;
	}

L
Linus Torvalds 已提交
1553 1554 1555 1556 1557
	prev_fi = NULL;
	hash = fib_devindex_hashfn(dev->ifindex);
	head = &fib_info_devhash[hash];
	ret = 0;

1558
	hlist_for_each_entry(nh, head, nh_hash) {
L
Linus Torvalds 已提交
1559 1560 1561 1562 1563 1564 1565 1566 1567 1568
		struct fib_info *fi = nh->nh_parent;
		int alive;

		BUG_ON(!fi->fib_nhs);
		if (nh->nh_dev != dev || fi == prev_fi)
			continue;

		prev_fi = fi;
		alive = 0;
		change_nexthops(fi) {
1569
			if (!(nexthop_nh->nh_flags & nh_flags)) {
L
Linus Torvalds 已提交
1570 1571 1572
				alive++;
				continue;
			}
1573
			if (!nexthop_nh->nh_dev ||
E
Eric Dumazet 已提交
1574
			    !(nexthop_nh->nh_dev->flags & IFF_UP))
L
Linus Torvalds 已提交
1575
				continue;
1576 1577
			if (nexthop_nh->nh_dev != dev ||
			    !__in_dev_get_rtnl(dev))
L
Linus Torvalds 已提交
1578 1579
				continue;
			alive++;
1580
			nexthop_nh->nh_flags &= ~nh_flags;
1581
			call_fib_nh_notifiers(nexthop_nh, FIB_EVENT_NH_ADD);
L
Linus Torvalds 已提交
1582 1583 1584
		} endfor_nexthops(fi)

		if (alive > 0) {
1585
			fi->fib_flags &= ~nh_flags;
L
Linus Torvalds 已提交
1586 1587
			ret++;
		}
P
Peter Nørlund 已提交
1588 1589

		fib_rebalance(fi);
L
Linus Torvalds 已提交
1590 1591 1592 1593 1594
	}

	return ret;
}

1595
#ifdef CONFIG_IP_ROUTE_MULTIPATH
1596 1597 1598 1599 1600 1601 1602 1603 1604
static bool fib_good_nh(const struct fib_nh *nh)
{
	int state = NUD_REACHABLE;

	if (nh->nh_scope == RT_SCOPE_LINK) {
		struct neighbour *n;

		rcu_read_lock_bh();

1605 1606
		n = __ipv4_neigh_lookup_noref(nh->nh_dev,
					      (__force u32)nh->nh_gw);
1607 1608 1609 1610 1611 1612 1613 1614
		if (n)
			state = n->nud_state;

		rcu_read_unlock_bh();
	}

	return !!(state & NUD_VALID);
}
1615

P
Peter Nørlund 已提交
1616
void fib_select_multipath(struct fib_result *res, int hash)
L
Linus Torvalds 已提交
1617 1618
{
	struct fib_info *fi = res->fi;
1619 1620
	struct net *net = fi->fib_net;
	bool first = false;
L
Linus Torvalds 已提交
1621

P
Peter Nørlund 已提交
1622 1623 1624
	for_nexthops(fi) {
		if (hash > atomic_read(&nh->nh_upper_bound))
			continue;
L
Linus Torvalds 已提交
1625

1626 1627 1628 1629 1630 1631 1632 1633 1634
		if (!net->ipv4.sysctl_fib_multipath_use_neigh ||
		    fib_good_nh(nh)) {
			res->nh_sel = nhsel;
			return;
		}
		if (!first) {
			res->nh_sel = nhsel;
			first = true;
		}
L
Linus Torvalds 已提交
1635 1636 1637
	} endfor_nexthops(fi);
}
#endif
1638 1639

void fib_select_path(struct net *net, struct fib_result *res,
1640
		     struct flowi4 *fl4, const struct sk_buff *skb)
1641
{
1642 1643 1644 1645 1646
	bool oif_check;

	oif_check = (fl4->flowi4_oif == 0 ||
		     fl4->flowi4_flags & FLOWI_FLAG_SKIP_NH_OIF);

1647
#ifdef CONFIG_IP_ROUTE_MULTIPATH
1648
	if (res->fi->fib_nhs > 1 && oif_check) {
1649
		int h = fib_multipath_hash(res->fi, fl4, skb);
1650

1651
		fib_select_multipath(res, h);
1652 1653 1654 1655 1656
	}
	else
#endif
	if (!res->prefixlen &&
	    res->table->tb_num_default > 1 &&
1657
	    res->type == RTN_UNICAST && oif_check)
1658 1659 1660 1661 1662 1663
		fib_select_default(fl4, res);

	if (!fl4->saddr)
		fl4->saddr = FIB_RES_PREFSRC(net, *res);
}
EXPORT_SYMBOL_GPL(fib_select_path);