fib_semantics.c 28.8 KB
Newer Older
L
Linus Torvalds 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		IPv4 Forwarding Information Base: semantics.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */

#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/jiffies.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
29
#include <linux/inetdevice.h>
L
Linus Torvalds 已提交
30 31 32 33 34
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/proc_fs.h>
#include <linux/skbuff.h>
#include <linux/init.h>
35
#include <linux/slab.h>
L
Linus Torvalds 已提交
36

37
#include <net/arp.h>
L
Linus Torvalds 已提交
38 39 40 41 42 43
#include <net/ip.h>
#include <net/protocol.h>
#include <net/route.h>
#include <net/tcp.h>
#include <net/sock.h>
#include <net/ip_fib.h>
44
#include <net/netlink.h>
45
#include <net/nexthop.h>
L
Linus Torvalds 已提交
46 47 48

#include "fib_lookup.h"

49
/* Taken around every lookup in, and update of, the hash tables below. */
static DEFINE_SPINLOCK(fib_info_lock);

/* fib_info objects, bucketed by fib_info_hashfn(). */
static struct hlist_head *fib_info_hash;
/* fib_info objects with a preferred source, bucketed by fib_laddr_hashfn(). */
static struct hlist_head *fib_info_laddrhash;
/* Bucket count shared by the two tables above. */
static unsigned int fib_info_hash_size;
/* Number of fib_info objects currently accounted for. */
static unsigned int fib_info_cnt;

/* Nexthops, bucketed by fib_devindex_hashfn() of their device ifindex. */
#define DEVINDEX_HASHBITS 8
#define DEVINDEX_HASHSIZE (1U << DEVINDEX_HASHBITS)
static struct hlist_head fib_info_devhash[DEVINDEX_HASHSIZE];

#ifdef CONFIG_IP_ROUTE_MULTIPATH

static DEFINE_SPINLOCK(fib_multipath_lock);

/*
 * Nexthop iteration helpers.  Each macro opens a brace scope, which
 * endfor_nexthops() closes, and declares the loop state visible to the
 * loop body:
 *   nhsel      - index of the current nexthop
 *   nh         - read-only cursor (for_nexthops)
 *   nexthop_nh - writable cursor (change_nexthops)
 */
#define for_nexthops(fi) {						\
	int nhsel = 0;							\
	const struct fib_nh *nh = (fi)->fib_nh;				\
	for (; nhsel < (fi)->fib_nhs; nh++, nhsel++)

#define change_nexthops(fi) {						\
	int nhsel = 0;							\
	struct fib_nh *nexthop_nh = (struct fib_nh *)((fi)->fib_nh);	\
	for (; nhsel < (fi)->fib_nhs; nexthop_nh++, nhsel++)

#else /* CONFIG_IP_ROUTE_MULTIPATH */

/*
 * Without multipath only the first nexthop is visited, so the loop runs
 * exactly once.  Hope, that gcc will optimize it to get rid of dummy loop.
 */
#define for_nexthops(fi) {						\
	const struct fib_nh *nh = (fi)->fib_nh;				\
	int nhsel;							\
	for (nhsel = 0; nhsel < 1; nhsel++)

#define change_nexthops(fi) {						\
	struct fib_nh *nexthop_nh = (struct fib_nh *)((fi)->fib_nh);	\
	int nhsel;							\
	for (nhsel = 0; nhsel < 1; nhsel++)

#endif /* CONFIG_IP_ROUTE_MULTIPATH */

/* Closes the scope opened by for_nexthops() / change_nexthops(). */
#define endfor_nexthops(fi) }


93
const struct fib_prop fib_props[RTN_MAX + 1] = {
E
Eric Dumazet 已提交
94
	[RTN_UNSPEC] = {
L
Linus Torvalds 已提交
95 96
		.error	= 0,
		.scope	= RT_SCOPE_NOWHERE,
E
Eric Dumazet 已提交
97 98
	},
	[RTN_UNICAST] = {
L
Linus Torvalds 已提交
99 100
		.error	= 0,
		.scope	= RT_SCOPE_UNIVERSE,
E
Eric Dumazet 已提交
101 102
	},
	[RTN_LOCAL] = {
L
Linus Torvalds 已提交
103 104
		.error	= 0,
		.scope	= RT_SCOPE_HOST,
E
Eric Dumazet 已提交
105 106
	},
	[RTN_BROADCAST] = {
L
Linus Torvalds 已提交
107 108
		.error	= 0,
		.scope	= RT_SCOPE_LINK,
E
Eric Dumazet 已提交
109 110
	},
	[RTN_ANYCAST] = {
L
Linus Torvalds 已提交
111 112
		.error	= 0,
		.scope	= RT_SCOPE_LINK,
E
Eric Dumazet 已提交
113 114
	},
	[RTN_MULTICAST] = {
L
Linus Torvalds 已提交
115 116
		.error	= 0,
		.scope	= RT_SCOPE_UNIVERSE,
E
Eric Dumazet 已提交
117 118
	},
	[RTN_BLACKHOLE] = {
L
Linus Torvalds 已提交
119 120
		.error	= -EINVAL,
		.scope	= RT_SCOPE_UNIVERSE,
E
Eric Dumazet 已提交
121 122
	},
	[RTN_UNREACHABLE] = {
L
Linus Torvalds 已提交
123 124
		.error	= -EHOSTUNREACH,
		.scope	= RT_SCOPE_UNIVERSE,
E
Eric Dumazet 已提交
125 126
	},
	[RTN_PROHIBIT] = {
L
Linus Torvalds 已提交
127 128
		.error	= -EACCES,
		.scope	= RT_SCOPE_UNIVERSE,
E
Eric Dumazet 已提交
129 130
	},
	[RTN_THROW] = {
L
Linus Torvalds 已提交
131 132
		.error	= -EAGAIN,
		.scope	= RT_SCOPE_UNIVERSE,
E
Eric Dumazet 已提交
133 134
	},
	[RTN_NAT] = {
L
Linus Torvalds 已提交
135 136
		.error	= -EINVAL,
		.scope	= RT_SCOPE_NOWHERE,
E
Eric Dumazet 已提交
137 138
	},
	[RTN_XRESOLVE] = {
L
Linus Torvalds 已提交
139 140
		.error	= -EINVAL,
		.scope	= RT_SCOPE_NOWHERE,
E
Eric Dumazet 已提交
141
	},
L
Linus Torvalds 已提交
142 143 144 145 146
};


/* Release a nexthop info record */

E
Eric Dumazet 已提交
147 148 149 150
static void free_fib_info_rcu(struct rcu_head *head)
{
	struct fib_info *fi = container_of(head, struct fib_info, rcu);

151 152
	if (fi->fib_metrics != (u32 *) dst_default_metrics)
		kfree(fi->fib_metrics);
E
Eric Dumazet 已提交
153 154 155
	kfree(fi);
}

L
Linus Torvalds 已提交
156 157 158
void free_fib_info(struct fib_info *fi)
{
	if (fi->fib_dead == 0) {
E
Eric Dumazet 已提交
159
		pr_warning("Freeing alive fib_info %p\n", fi);
L
Linus Torvalds 已提交
160 161 162
		return;
	}
	change_nexthops(fi) {
163 164 165
		if (nexthop_nh->nh_dev)
			dev_put(nexthop_nh->nh_dev);
		nexthop_nh->nh_dev = NULL;
L
Linus Torvalds 已提交
166 167
	} endfor_nexthops(fi);
	fib_info_cnt--;
168
	release_net(fi->fib_net);
E
Eric Dumazet 已提交
169
	call_rcu(&fi->rcu, free_fib_info_rcu);
L
Linus Torvalds 已提交
170 171 172 173
}

void fib_release_info(struct fib_info *fi)
{
174
	spin_lock_bh(&fib_info_lock);
L
Linus Torvalds 已提交
175 176 177 178 179
	if (fi && --fi->fib_treeref == 0) {
		hlist_del(&fi->fib_hash);
		if (fi->fib_prefsrc)
			hlist_del(&fi->fib_lhash);
		change_nexthops(fi) {
180
			if (!nexthop_nh->nh_dev)
L
Linus Torvalds 已提交
181
				continue;
182
			hlist_del(&nexthop_nh->nh_hash);
L
Linus Torvalds 已提交
183 184 185 186
		} endfor_nexthops(fi)
		fi->fib_dead = 1;
		fib_info_put(fi);
	}
187
	spin_unlock_bh(&fib_info_lock);
L
Linus Torvalds 已提交
188 189
}

E
Eric Dumazet 已提交
190
static inline int nh_comp(const struct fib_info *fi, const struct fib_info *ofi)
L
Linus Torvalds 已提交
191 192 193 194 195 196 197 198 199 200
{
	const struct fib_nh *onh = ofi->fib_nh;

	for_nexthops(fi) {
		if (nh->nh_oif != onh->nh_oif ||
		    nh->nh_gw  != onh->nh_gw ||
		    nh->nh_scope != onh->nh_scope ||
#ifdef CONFIG_IP_ROUTE_MULTIPATH
		    nh->nh_weight != onh->nh_weight ||
#endif
201
#ifdef CONFIG_IP_ROUTE_CLASSID
L
Linus Torvalds 已提交
202 203
		    nh->nh_tclassid != onh->nh_tclassid ||
#endif
E
Eric Dumazet 已提交
204
		    ((nh->nh_flags ^ onh->nh_flags) & ~RTNH_F_DEAD))
L
Linus Torvalds 已提交
205 206 207 208 209 210
			return -1;
		onh++;
	} endfor_nexthops(fi);
	return 0;
}

211 212 213 214 215 216 217 218 219
/*
 * Fold @val into a bucket index for fib_info_devhash by XOR-ing it with
 * itself shifted right by one and by two hash widths, then masking to
 * DEVINDEX_HASHSIZE buckets.
 */
static inline unsigned int fib_devindex_hashfn(unsigned int val)
{
	unsigned int folded = val;

	folded ^= val >> DEVINDEX_HASHBITS;
	folded ^= val >> (DEVINDEX_HASHBITS * 2);

	return folded & (DEVINDEX_HASHSIZE - 1);
}

L
Linus Torvalds 已提交
220 221
static inline unsigned int fib_info_hashfn(const struct fib_info *fi)
{
222
	unsigned int mask = (fib_info_hash_size - 1);
L
Linus Torvalds 已提交
223 224 225
	unsigned int val = fi->fib_nhs;

	val ^= fi->fib_protocol;
A
Al Viro 已提交
226
	val ^= (__force u32)fi->fib_prefsrc;
L
Linus Torvalds 已提交
227
	val ^= fi->fib_priority;
228 229 230
	for_nexthops(fi) {
		val ^= fib_devindex_hashfn(nh->nh_oif);
	} endfor_nexthops(fi)
L
Linus Torvalds 已提交
231 232 233 234 235 236 237 238 239 240 241 242 243 244 245

	return (val ^ (val >> 7) ^ (val >> 12)) & mask;
}

/*
 * Look up an already hashed fib_info that is equivalent to @nfi.
 *
 * Two entries are equivalent when they belong to the same namespace and
 * agree on nexthop count, protocol, preferred source, priority, the
 * complete metrics array, flags (ignoring RTNH_F_DEAD) and, if there are
 * nexthops, on every nexthop as judged by nh_comp().
 *
 * Returns the matching entry, or NULL when there is none.
 */
static struct fib_info *fib_find_info(const struct fib_info *nfi)
{
	struct hlist_head *head;
	struct hlist_node *node;
	struct fib_info *fi;
	unsigned int hash;

	hash = fib_info_hashfn(nfi);
	head = &fib_info_hash[hash];

	hlist_for_each_entry(fi, node, head, fib_hash) {
		if (!net_eq(fi->fib_net, nfi->fib_net))
			continue;
		if (fi->fib_nhs != nfi->fib_nhs)
			continue;
		/*
		 * fib_metrics is a pointer to RTAX_MAX u32 slots, so the
		 * length has to be spelled out: sizeof(fi->fib_metrics)
		 * would only cover pointer-size bytes of the array.
		 */
		if (nfi->fib_protocol == fi->fib_protocol &&
		    nfi->fib_prefsrc == fi->fib_prefsrc &&
		    nfi->fib_priority == fi->fib_priority &&
		    memcmp(nfi->fib_metrics, fi->fib_metrics,
			   sizeof(u32) * RTAX_MAX) == 0 &&
		    ((nfi->fib_flags ^ fi->fib_flags) & ~RTNH_F_DEAD) == 0 &&
		    (nfi->fib_nhs == 0 || nh_comp(fi, nfi) == 0))
			return fi;
	}

	return NULL;
}

/* Check, that the gateway is already configured.
E
Eric Dumazet 已提交
264
 * Used only by redirect accept routine.
L
Linus Torvalds 已提交
265
 */
266
int ip_fib_check_default(__be32 gw, struct net_device *dev)
L
Linus Torvalds 已提交
267 268 269 270 271 272
{
	struct hlist_head *head;
	struct hlist_node *node;
	struct fib_nh *nh;
	unsigned int hash;

273
	spin_lock(&fib_info_lock);
L
Linus Torvalds 已提交
274 275 276 277 278 279

	hash = fib_devindex_hashfn(dev->ifindex);
	head = &fib_info_devhash[hash];
	hlist_for_each_entry(nh, node, head, nh_hash) {
		if (nh->nh_dev == dev &&
		    nh->nh_gw == gw &&
E
Eric Dumazet 已提交
280
		    !(nh->nh_flags & RTNH_F_DEAD)) {
281
			spin_unlock(&fib_info_lock);
L
Linus Torvalds 已提交
282 283 284 285
			return 0;
		}
	}

286
	spin_unlock(&fib_info_lock);
L
Linus Torvalds 已提交
287 288 289 290

	return -1;
}

291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317
/*
 * Upper bound on the netlink message size needed to describe a route
 * that uses @fi: the rtmsg header, four 4-byte attributes, a nested
 * metrics attribute and, when there are nexthops, one nested attribute
 * holding all of them.
 */
static inline size_t fib_nlmsg_size(struct fib_info *fi)
{
	size_t payload;

	payload = NLMSG_ALIGN(sizeof(struct rtmsg));
	payload += nla_total_size(4);	/* RTA_TABLE */
	payload += nla_total_size(4);	/* RTA_DST */
	payload += nla_total_size(4);	/* RTA_PRIORITY */
	payload += nla_total_size(4);	/* RTA_PREFSRC */

	/* space for nested metrics */
	payload += nla_total_size((RTAX_MAX * nla_total_size(4)));

	/* Also handles the special case fib_nhs == 1 */
	if (fi->fib_nhs) {
		/*
		 * each nexthop is packed in an attribute and may contain
		 * a flow and a gateway attribute
		 */
		size_t per_nh = nla_total_size(sizeof(struct rtnexthop));

		per_nh += 2 * nla_total_size(4);

		/* all nexthops are packed in a nested attribute */
		payload += nla_total_size(fi->fib_nhs * per_nh);
	}

	return payload;
}

A
Al Viro 已提交
318
/*
 * Build a route message for @fa with fib_dump_info() and hand it to
 * rtnl_notify() for the RTNLGRP_IPV4_ROUTE group.  If the message cannot
 * be allocated or filled in, the error is reported through
 * rtnl_set_sk_err() instead.
 */
void rtmsg_fib(int event, __be32 key, struct fib_alias *fa,
	       int dst_len, u32 tb_id, struct nl_info *info,
	       unsigned int nlm_flags)
{
	struct sk_buff *skb;
	u32 seq = 0;
	int err;

	if (info->nlh)
		seq = info->nlh->nlmsg_seq;

	skb = nlmsg_new(fib_nlmsg_size(fa->fa_info), GFP_KERNEL);
	if (!skb) {
		err = -ENOBUFS;
		goto errout;
	}

	err = fib_dump_info(skb, info->pid, seq, event, tb_id,
			    fa->fa_type, fa->fa_scope, key, dst_len,
			    fa->fa_tos, fa->fa_info, nlm_flags);
	if (err < 0) {
		/* -EMSGSIZE implies BUG in fib_nlmsg_size() */
		WARN_ON(err == -EMSGSIZE);
		kfree_skb(skb);
		goto errout;
	}

	rtnl_notify(skb, info->nl_net, info->pid, RTNLGRP_IPV4_ROUTE,
		    info->nlh, GFP_KERNEL);
	return;

errout:
	if (err < 0)
		rtnl_set_sk_err(info->nl_net, RTNLGRP_IPV4_ROUTE, err);
}

/*
 * Walk the alias list @fah (may be NULL) and return the first alias
 * whose TOS is not greater than @tos and which either has a
 * fib_priority of at least @prio or a TOS strictly below @tos.
 * Returns NULL when no alias qualifies.
 */
struct fib_alias *fib_find_alias(struct list_head *fah, u8 tos, u32 prio)
{
	struct fib_alias *fa;

	if (!fah)
		return NULL;

	list_for_each_entry(fa, fah, fa_list) {
		if (fa->fa_tos > tos)
			continue;
		if (fa->fa_info->fib_priority >= prio || fa->fa_tos < tos)
			return fa;
	}

	return NULL;
}

int fib_detect_death(struct fib_info *fi, int order,
366
		     struct fib_info **last_resort, int *last_idx, int dflt)
L
Linus Torvalds 已提交
367 368 369 370 371 372 373 374 375
{
	struct neighbour *n;
	int state = NUD_NONE;

	n = neigh_lookup(&arp_tbl, &fi->fib_nh[0].nh_gw, fi->fib_dev);
	if (n) {
		state = n->nud_state;
		neigh_release(n);
	}
376
	if (state == NUD_REACHABLE)
L
Linus Torvalds 已提交
377
		return 0;
E
Eric Dumazet 已提交
378
	if ((state & NUD_VALID) && order != dflt)
L
Linus Torvalds 已提交
379
		return 0;
E
Eric Dumazet 已提交
380 381
	if ((state & NUD_VALID) ||
	    (*last_idx < 0 && order > dflt)) {
L
Linus Torvalds 已提交
382 383 384 385 386 387 388 389
		*last_resort = fi;
		*last_idx = order;
	}
	return 1;
}

#ifdef CONFIG_IP_ROUTE_MULTIPATH

390
/*
 * Count the rtnexthop records found in @remaining bytes starting at
 * @rtnh.  Returns 0 if bytes are left over once no further record
 * passes rtnh_ok().
 */
static int fib_count_nexthops(struct rtnexthop *rtnh, int remaining)
{
	int count;

	for (count = 0; rtnh_ok(rtnh, remaining); count++)
		rtnh = rtnh_next(rtnh, &remaining);

	/* leftover implies invalid nexthop configuration, discard it */
	if (remaining > 0)
		return 0;

	return count;
}

403 404
static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh,
		       int remaining, struct fib_config *cfg)
L
Linus Torvalds 已提交
405 406
{
	change_nexthops(fi) {
407 408 409
		int attrlen;

		if (!rtnh_ok(rtnh, remaining))
L
Linus Torvalds 已提交
410
			return -EINVAL;
411

412 413 414 415
		nexthop_nh->nh_flags =
			(cfg->fc_flags & ~0xFF) | rtnh->rtnh_flags;
		nexthop_nh->nh_oif = rtnh->rtnh_ifindex;
		nexthop_nh->nh_weight = rtnh->rtnh_hops + 1;
416 417 418 419 420 421

		attrlen = rtnh_attrlen(rtnh);
		if (attrlen > 0) {
			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);

			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
422
			nexthop_nh->nh_gw = nla ? nla_get_be32(nla) : 0;
423
#ifdef CONFIG_IP_ROUTE_CLASSID
424
			nla = nla_find(attrs, attrlen, RTA_FLOW);
425
			nexthop_nh->nh_tclassid = nla ? nla_get_u32(nla) : 0;
L
Linus Torvalds 已提交
426 427
#endif
		}
428 429

		rtnh = rtnh_next(rtnh, &remaining);
L
Linus Torvalds 已提交
430
	} endfor_nexthops(fi);
431

L
Linus Torvalds 已提交
432 433 434 435 436
	return 0;
}

#endif

437
int fib_nh_match(struct fib_config *cfg, struct fib_info *fi)
L
Linus Torvalds 已提交
438 439
{
#ifdef CONFIG_IP_ROUTE_MULTIPATH
440 441
	struct rtnexthop *rtnh;
	int remaining;
L
Linus Torvalds 已提交
442 443
#endif

444
	if (cfg->fc_priority && cfg->fc_priority != fi->fib_priority)
L
Linus Torvalds 已提交
445 446
		return 1;

447 448 449
	if (cfg->fc_oif || cfg->fc_gw) {
		if ((!cfg->fc_oif || cfg->fc_oif == fi->fib_nh->nh_oif) &&
		    (!cfg->fc_gw  || cfg->fc_gw == fi->fib_nh->nh_gw))
L
Linus Torvalds 已提交
450 451 452 453 454
			return 0;
		return 1;
	}

#ifdef CONFIG_IP_ROUTE_MULTIPATH
455
	if (cfg->fc_mp == NULL)
L
Linus Torvalds 已提交
456
		return 0;
457 458 459

	rtnh = cfg->fc_mp;
	remaining = cfg->fc_mp_len;
460

L
Linus Torvalds 已提交
461
	for_nexthops(fi) {
462
		int attrlen;
L
Linus Torvalds 已提交
463

464
		if (!rtnh_ok(rtnh, remaining))
L
Linus Torvalds 已提交
465
			return -EINVAL;
466 467

		if (rtnh->rtnh_ifindex && rtnh->rtnh_ifindex != nh->nh_oif)
L
Linus Torvalds 已提交
468
			return 1;
469 470 471 472 473 474

		attrlen = rtnh_attrlen(rtnh);
		if (attrlen < 0) {
			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);

			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
475
			if (nla && nla_get_be32(nla) != nh->nh_gw)
L
Linus Torvalds 已提交
476
				return 1;
477
#ifdef CONFIG_IP_ROUTE_CLASSID
478 479
			nla = nla_find(attrs, attrlen, RTA_FLOW);
			if (nla && nla_get_u32(nla) != nh->nh_tclassid)
L
Linus Torvalds 已提交
480 481 482
				return 1;
#endif
		}
483 484

		rtnh = rtnh_next(rtnh, &remaining);
L
Linus Torvalds 已提交
485 486 487 488 489 490 491
	} endfor_nexthops(fi);
#endif
	return 0;
}


/*
 * Picture
 * -------
 *
 * The semantics of a nexthop are very messy for historical reasons.
 * We have to take into account that:
 * a) the gateway can actually be a local interface address,
 *    so that a gatewayed route is direct.
 * b) the gateway must be an on-link address, possibly
 *    described not by an ifaddr, but by a direct route.
 * c) If both gateway and interface are specified, they should not
 *    contradict each other.
 * d) If we use tunnel routes, the gateway may not be on-link.
 *
 * Attempting to reconcile all of these (alas, self-contradictory)
 * conditions results in pretty ugly and hairy code with obscure logic.
 *
 * I chose to generalize it instead, so that the size
 * of the code does not practically increase, but it becomes
 * much more general.
 * Every prefix is assigned a "scope" value: "host" is a local address,
 * "link" is a direct route,
 * [ ... "site" ... "interior" ... ]
 * and "universe" is a true gateway route with global meaning.
 *
 * Every prefix refers to a set of "nexthop"s (gw, oif),
 * where gw must have narrower scope. This recursion stops
 * when gw has LOCAL scope or if the "nexthop" is declared ONLINK,
 * which means that gw is forced to be on link.
 *
 * The code is still hairy, but now it is apparently logically
 * consistent and very flexible. E.g. as a by-product it allows
 * independent exterior and interior routing processes
 * to co-exist in peace.
 *
 * Normally it looks as follows.
 *
 * {universe prefix}  -> (gw, oif) [scope link]
 *		  |
 *		  |-> {link prefix} -> (gw, oif) [scope local]
 *					|
 *					|-> {local prefix} (terminal node)
 */
534 535
static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi,
			struct fib_nh *nh)
L
Linus Torvalds 已提交
536 537
{
	int err;
538
	struct net *net;
E
Eric Dumazet 已提交
539
	struct net_device *dev;
L
Linus Torvalds 已提交
540

541
	net = cfg->fc_nlinfo.nl_net;
L
Linus Torvalds 已提交
542 543 544
	if (nh->nh_gw) {
		struct fib_result res;

E
Eric Dumazet 已提交
545
		if (nh->nh_flags & RTNH_F_ONLINK) {
L
Linus Torvalds 已提交
546

547
			if (cfg->fc_scope >= RT_SCOPE_LINK)
L
Linus Torvalds 已提交
548
				return -EINVAL;
549
			if (inet_addr_type(net, nh->nh_gw) != RTN_UNICAST)
L
Linus Torvalds 已提交
550
				return -EINVAL;
E
Eric Dumazet 已提交
551 552
			dev = __dev_get_by_index(net, nh->nh_oif);
			if (!dev)
L
Linus Torvalds 已提交
553
				return -ENODEV;
E
Eric Dumazet 已提交
554
			if (!(dev->flags & IFF_UP))
L
Linus Torvalds 已提交
555 556 557 558 559 560
				return -ENETDOWN;
			nh->nh_dev = dev;
			dev_hold(dev);
			nh->nh_scope = RT_SCOPE_LINK;
			return 0;
		}
E
Eric Dumazet 已提交
561
		rcu_read_lock();
L
Linus Torvalds 已提交
562
		{
563
			struct flowi fl = {
564 565
				.fl4_dst = nh->nh_gw,
				.fl4_scope = cfg->fc_scope + 1,
566 567
				.oif = nh->nh_oif,
			};
L
Linus Torvalds 已提交
568 569 570 571

			/* It is not necessary, but requires a bit of thinking */
			if (fl.fl4_scope < RT_SCOPE_LINK)
				fl.fl4_scope = RT_SCOPE_LINK;
E
Eric Dumazet 已提交
572
			err = fib_lookup(net, &fl, &res);
E
Eric Dumazet 已提交
573 574
			if (err) {
				rcu_read_unlock();
L
Linus Torvalds 已提交
575
				return err;
E
Eric Dumazet 已提交
576
			}
L
Linus Torvalds 已提交
577 578 579 580 581 582
		}
		err = -EINVAL;
		if (res.type != RTN_UNICAST && res.type != RTN_LOCAL)
			goto out;
		nh->nh_scope = res.scope;
		nh->nh_oif = FIB_RES_OIF(res);
E
Eric Dumazet 已提交
583 584
		nh->nh_dev = dev = FIB_RES_DEV(res);
		if (!dev)
L
Linus Torvalds 已提交
585
			goto out;
E
Eric Dumazet 已提交
586
		dev_hold(dev);
587
		err = (dev->flags & IFF_UP) ? 0 : -ENETDOWN;
L
Linus Torvalds 已提交
588 589 590
	} else {
		struct in_device *in_dev;

E
Eric Dumazet 已提交
591
		if (nh->nh_flags & (RTNH_F_PERVASIVE | RTNH_F_ONLINK))
L
Linus Torvalds 已提交
592 593
			return -EINVAL;

594 595
		rcu_read_lock();
		err = -ENODEV;
596
		in_dev = inetdev_by_index(net, nh->nh_oif);
L
Linus Torvalds 已提交
597
		if (in_dev == NULL)
598 599 600 601
			goto out;
		err = -ENETDOWN;
		if (!(in_dev->dev->flags & IFF_UP))
			goto out;
L
Linus Torvalds 已提交
602 603 604
		nh->nh_dev = in_dev->dev;
		dev_hold(nh->nh_dev);
		nh->nh_scope = RT_SCOPE_HOST;
605
		err = 0;
L
Linus Torvalds 已提交
606
	}
607 608 609
out:
	rcu_read_unlock();
	return err;
L
Linus Torvalds 已提交
610 611
}

A
Al Viro 已提交
612
/*
 * Bucket index of local address @val in fib_info_laddrhash: the raw
 * 32-bit value XOR-ed with itself shifted right by 7 and by 14, masked
 * with the current table size.
 */
static inline unsigned int fib_laddr_hashfn(__be32 val)
{
	u32 addr = (__force u32)val;

	return (addr ^ (addr >> 7) ^ (addr >> 14)) & (fib_info_hash_size - 1);
}

621
static struct hlist_head *fib_info_hash_alloc(int bytes)
L
Linus Torvalds 已提交
622 623
{
	if (bytes <= PAGE_SIZE)
624
		return kzalloc(bytes, GFP_KERNEL);
L
Linus Torvalds 已提交
625 626
	else
		return (struct hlist_head *)
E
Eric Dumazet 已提交
627 628
			__get_free_pages(GFP_KERNEL | __GFP_ZERO,
					 get_order(bytes));
L
Linus Torvalds 已提交
629 630
}

631
static void fib_info_hash_free(struct hlist_head *hash, int bytes)
L
Linus Torvalds 已提交
632 633 634 635 636 637 638 639 640 641
{
	if (!hash)
		return;

	if (bytes <= PAGE_SIZE)
		kfree(hash);
	else
		free_pages((unsigned long) hash, get_order(bytes));
}

642 643 644
static void fib_info_hash_move(struct hlist_head *new_info_hash,
			       struct hlist_head *new_laddrhash,
			       unsigned int new_size)
L
Linus Torvalds 已提交
645
{
646
	struct hlist_head *old_info_hash, *old_laddrhash;
647
	unsigned int old_size = fib_info_hash_size;
648
	unsigned int i, bytes;
L
Linus Torvalds 已提交
649

650
	spin_lock_bh(&fib_info_lock);
651 652
	old_info_hash = fib_info_hash;
	old_laddrhash = fib_info_laddrhash;
653
	fib_info_hash_size = new_size;
L
Linus Torvalds 已提交
654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690

	for (i = 0; i < old_size; i++) {
		struct hlist_head *head = &fib_info_hash[i];
		struct hlist_node *node, *n;
		struct fib_info *fi;

		hlist_for_each_entry_safe(fi, node, n, head, fib_hash) {
			struct hlist_head *dest;
			unsigned int new_hash;

			hlist_del(&fi->fib_hash);

			new_hash = fib_info_hashfn(fi);
			dest = &new_info_hash[new_hash];
			hlist_add_head(&fi->fib_hash, dest);
		}
	}
	fib_info_hash = new_info_hash;

	for (i = 0; i < old_size; i++) {
		struct hlist_head *lhead = &fib_info_laddrhash[i];
		struct hlist_node *node, *n;
		struct fib_info *fi;

		hlist_for_each_entry_safe(fi, node, n, lhead, fib_lhash) {
			struct hlist_head *ldest;
			unsigned int new_hash;

			hlist_del(&fi->fib_lhash);

			new_hash = fib_laddr_hashfn(fi->fib_prefsrc);
			ldest = &new_laddrhash[new_hash];
			hlist_add_head(&fi->fib_lhash, ldest);
		}
	}
	fib_info_laddrhash = new_laddrhash;

691
	spin_unlock_bh(&fib_info_lock);
692 693

	bytes = old_size * sizeof(struct hlist_head *);
694 695
	fib_info_hash_free(old_info_hash, bytes);
	fib_info_hash_free(old_laddrhash, bytes);
L
Linus Torvalds 已提交
696 697
}

698
struct fib_info *fib_create_info(struct fib_config *cfg)
L
Linus Torvalds 已提交
699 700 701 702 703
{
	int err;
	struct fib_info *fi = NULL;
	struct fib_info *ofi;
	int nhs = 1;
704
	struct net *net = cfg->fc_nlinfo.nl_net;
L
Linus Torvalds 已提交
705

706 707 708
	if (cfg->fc_type > RTN_MAX)
		goto err_inval;

L
Linus Torvalds 已提交
709
	/* Fast check to catch the most weird cases */
710
	if (fib_props[cfg->fc_type].scope > cfg->fc_scope)
L
Linus Torvalds 已提交
711 712 713
		goto err_inval;

#ifdef CONFIG_IP_ROUTE_MULTIPATH
714 715
	if (cfg->fc_mp) {
		nhs = fib_count_nexthops(cfg->fc_mp, cfg->fc_mp_len);
L
Linus Torvalds 已提交
716 717 718 719 720 721
		if (nhs == 0)
			goto err_inval;
	}
#endif

	err = -ENOBUFS;
722 723
	if (fib_info_cnt >= fib_info_hash_size) {
		unsigned int new_size = fib_info_hash_size << 1;
L
Linus Torvalds 已提交
724 725 726 727 728 729 730
		struct hlist_head *new_info_hash;
		struct hlist_head *new_laddrhash;
		unsigned int bytes;

		if (!new_size)
			new_size = 1;
		bytes = new_size * sizeof(struct hlist_head *);
731 732
		new_info_hash = fib_info_hash_alloc(bytes);
		new_laddrhash = fib_info_hash_alloc(bytes);
L
Linus Torvalds 已提交
733
		if (!new_info_hash || !new_laddrhash) {
734 735
			fib_info_hash_free(new_info_hash, bytes);
			fib_info_hash_free(new_laddrhash, bytes);
736
		} else
737
			fib_info_hash_move(new_info_hash, new_laddrhash, new_size);
L
Linus Torvalds 已提交
738

739
		if (!fib_info_hash_size)
L
Linus Torvalds 已提交
740 741 742
			goto failure;
	}

743
	fi = kzalloc(sizeof(*fi)+nhs*sizeof(struct fib_nh), GFP_KERNEL);
L
Linus Torvalds 已提交
744 745
	if (fi == NULL)
		goto failure;
746 747 748 749 750 751
	if (cfg->fc_mx) {
		fi->fib_metrics = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
		if (!fi->fib_metrics)
			goto failure;
	} else
		fi->fib_metrics = (u32 *) dst_default_metrics;
L
Linus Torvalds 已提交
752 753
	fib_info_cnt++;

754
	fi->fib_net = hold_net(net);
755 756 757 758
	fi->fib_protocol = cfg->fc_protocol;
	fi->fib_flags = cfg->fc_flags;
	fi->fib_priority = cfg->fc_priority;
	fi->fib_prefsrc = cfg->fc_prefsrc;
L
Linus Torvalds 已提交
759 760 761

	fi->fib_nhs = nhs;
	change_nexthops(fi) {
762
		nexthop_nh->nh_parent = fi;
L
Linus Torvalds 已提交
763 764
	} endfor_nexthops(fi)

765 766 767 768 769
	if (cfg->fc_mx) {
		struct nlattr *nla;
		int remaining;

		nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
770
			int type = nla_type(nla);
771 772 773

			if (type) {
				if (type > RTAX_MAX)
L
Linus Torvalds 已提交
774
					goto err_inval;
775
				fi->fib_metrics[type - 1] = nla_get_u32(nla);
L
Linus Torvalds 已提交
776 777 778 779
			}
		}
	}

780
	if (cfg->fc_mp) {
L
Linus Torvalds 已提交
781
#ifdef CONFIG_IP_ROUTE_MULTIPATH
782 783
		err = fib_get_nhs(fi, cfg->fc_mp, cfg->fc_mp_len, cfg);
		if (err != 0)
L
Linus Torvalds 已提交
784
			goto failure;
785
		if (cfg->fc_oif && fi->fib_nh->nh_oif != cfg->fc_oif)
L
Linus Torvalds 已提交
786
			goto err_inval;
787
		if (cfg->fc_gw && fi->fib_nh->nh_gw != cfg->fc_gw)
L
Linus Torvalds 已提交
788
			goto err_inval;
789
#ifdef CONFIG_IP_ROUTE_CLASSID
790
		if (cfg->fc_flow && fi->fib_nh->nh_tclassid != cfg->fc_flow)
L
Linus Torvalds 已提交
791 792 793 794 795 796 797
			goto err_inval;
#endif
#else
		goto err_inval;
#endif
	} else {
		struct fib_nh *nh = fi->fib_nh;
798 799 800 801

		nh->nh_oif = cfg->fc_oif;
		nh->nh_gw = cfg->fc_gw;
		nh->nh_flags = cfg->fc_flags;
802
#ifdef CONFIG_IP_ROUTE_CLASSID
803
		nh->nh_tclassid = cfg->fc_flow;
L
Linus Torvalds 已提交
804 805 806 807 808 809
#endif
#ifdef CONFIG_IP_ROUTE_MULTIPATH
		nh->nh_weight = 1;
#endif
	}

810 811
	if (fib_props[cfg->fc_type].error) {
		if (cfg->fc_gw || cfg->fc_oif || cfg->fc_mp)
L
Linus Torvalds 已提交
812 813
			goto err_inval;
		goto link_it;
814 815 816 817 818 819 820 821 822 823 824
	} else {
		switch (cfg->fc_type) {
		case RTN_UNICAST:
		case RTN_LOCAL:
		case RTN_BROADCAST:
		case RTN_ANYCAST:
		case RTN_MULTICAST:
			break;
		default:
			goto err_inval;
		}
L
Linus Torvalds 已提交
825 826
	}

827
	if (cfg->fc_scope > RT_SCOPE_HOST)
L
Linus Torvalds 已提交
828 829
		goto err_inval;

830
	if (cfg->fc_scope == RT_SCOPE_HOST) {
L
Linus Torvalds 已提交
831 832 833 834 835 836
		struct fib_nh *nh = fi->fib_nh;

		/* Local address is added. */
		if (nhs != 1 || nh->nh_gw)
			goto err_inval;
		nh->nh_scope = RT_SCOPE_NOWHERE;
837
		nh->nh_dev = dev_get_by_index(net, fi->fib_nh->nh_oif);
L
Linus Torvalds 已提交
838 839 840 841 842
		err = -ENODEV;
		if (nh->nh_dev == NULL)
			goto failure;
	} else {
		change_nexthops(fi) {
E
Eric Dumazet 已提交
843 844
			err = fib_check_nh(cfg, fi, nexthop_nh);
			if (err != 0)
L
Linus Torvalds 已提交
845 846 847 848 849
				goto failure;
		} endfor_nexthops(fi)
	}

	if (fi->fib_prefsrc) {
850 851
		if (cfg->fc_type != RTN_LOCAL || !cfg->fc_dst ||
		    fi->fib_prefsrc != cfg->fc_dst)
852
			if (inet_addr_type(net, fi->fib_prefsrc) != RTN_LOCAL)
L
Linus Torvalds 已提交
853 854 855 856
				goto err_inval;
	}

link_it:
E
Eric Dumazet 已提交
857 858
	ofi = fib_find_info(fi);
	if (ofi) {
L
Linus Torvalds 已提交
859 860 861 862 863 864 865 866
		fi->fib_dead = 1;
		free_fib_info(fi);
		ofi->fib_treeref++;
		return ofi;
	}

	fi->fib_treeref++;
	atomic_inc(&fi->fib_clntref);
867
	spin_lock_bh(&fib_info_lock);
L
Linus Torvalds 已提交
868 869 870 871 872 873 874 875 876 877 878 879
	hlist_add_head(&fi->fib_hash,
		       &fib_info_hash[fib_info_hashfn(fi)]);
	if (fi->fib_prefsrc) {
		struct hlist_head *head;

		head = &fib_info_laddrhash[fib_laddr_hashfn(fi->fib_prefsrc)];
		hlist_add_head(&fi->fib_lhash, head);
	}
	change_nexthops(fi) {
		struct hlist_head *head;
		unsigned int hash;

880
		if (!nexthop_nh->nh_dev)
L
Linus Torvalds 已提交
881
			continue;
882
		hash = fib_devindex_hashfn(nexthop_nh->nh_dev->ifindex);
L
Linus Torvalds 已提交
883
		head = &fib_info_devhash[hash];
884
		hlist_add_head(&nexthop_nh->nh_hash, head);
L
Linus Torvalds 已提交
885
	} endfor_nexthops(fi)
886
	spin_unlock_bh(&fib_info_lock);
L
Linus Torvalds 已提交
887 888 889 890 891 892
	return fi;

err_inval:
	err = -EINVAL;

failure:
893
	if (fi) {
L
Linus Torvalds 已提交
894 895 896
		fi->fib_dead = 1;
		free_fib_info(fi);
	}
897 898

	return ERR_PTR(err);
L
Linus Torvalds 已提交
899 900 901 902
}

/* Find appropriate source address to this destination */

A
Al Viro 已提交
903
__be32 __fib_res_prefsrc(struct fib_result *res)
L
Linus Torvalds 已提交
904 905 906 907
{
	return inet_select_addr(FIB_RES_DEV(*res), FIB_RES_GW(*res), res->scope);
}

908
/*
 * Append one route message describing @fi to @skb: an rtmsg header
 * followed by table, destination, priority, metrics and preferred source
 * attributes, then either the single nexthop's gateway/oif/flow
 * attributes or, with CONFIG_IP_ROUTE_MULTIPATH and more than one
 * nexthop, a nested RTA_MULTIPATH attribute.
 *
 * Returns the result of nlmsg_end() on success.  If anything does not
 * fit, the partial message is cancelled and -EMSGSIZE is returned.
 * Note that the NLA_PUT*() macros jump to nla_put_failure on their own.
 */
int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
		  u32 tb_id, u8 type, u8 scope, __be32 dst, int dst_len, u8 tos,
		  struct fib_info *fi, unsigned int flags)
{
	struct nlmsghdr *nlh;
	struct rtmsg *rtm;

	nlh = nlmsg_put(skb, pid, seq, event, sizeof(*rtm), flags);
	if (!nlh)
		return -EMSGSIZE;

	rtm = nlmsg_data(nlh);
	rtm->rtm_family = AF_INET;
	rtm->rtm_dst_len = dst_len;
	rtm->rtm_src_len = 0;
	rtm->rtm_tos = tos;
	/* ids of 256 and above are carried only in RTA_TABLE */
	rtm->rtm_table = (tb_id < 256) ? tb_id : RT_TABLE_COMPAT;
	NLA_PUT_U32(skb, RTA_TABLE, tb_id);
	rtm->rtm_type = type;
	rtm->rtm_flags = fi->fib_flags;
	rtm->rtm_scope = scope;
	rtm->rtm_protocol = fi->fib_protocol;

	if (rtm->rtm_dst_len)
		NLA_PUT_BE32(skb, RTA_DST, dst);

	if (fi->fib_priority)
		NLA_PUT_U32(skb, RTA_PRIORITY, fi->fib_priority);

	if (rtnetlink_put_metrics(skb, fi->fib_metrics) < 0)
		goto nla_put_failure;

	if (fi->fib_prefsrc)
		NLA_PUT_BE32(skb, RTA_PREFSRC, fi->fib_prefsrc);

	if (fi->fib_nhs == 1) {
		const struct fib_nh *only = fi->fib_nh;

		if (only->nh_gw)
			NLA_PUT_BE32(skb, RTA_GATEWAY, only->nh_gw);

		if (only->nh_oif)
			NLA_PUT_U32(skb, RTA_OIF, only->nh_oif);
#ifdef CONFIG_IP_ROUTE_CLASSID
		if (only->nh_tclassid)
			NLA_PUT_U32(skb, RTA_FLOW, only->nh_tclassid);
#endif
	}
#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (fi->fib_nhs > 1) {
		struct nlattr *nest;

		nest = nla_nest_start(skb, RTA_MULTIPATH);
		if (!nest)
			goto nla_put_failure;

		for_nexthops(fi) {
			struct rtnexthop *rtnh;

			rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
			if (!rtnh)
				goto nla_put_failure;

			rtnh->rtnh_flags = nh->nh_flags & 0xFF;
			rtnh->rtnh_hops = nh->nh_weight - 1;
			rtnh->rtnh_ifindex = nh->nh_oif;

			if (nh->nh_gw)
				NLA_PUT_BE32(skb, RTA_GATEWAY, nh->nh_gw);
#ifdef CONFIG_IP_ROUTE_CLASSID
			if (nh->nh_tclassid)
				NLA_PUT_U32(skb, RTA_FLOW, nh->nh_tclassid);
#endif
			/* length of rtnetlink header + attributes */
			rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *) rtnh;
		} endfor_nexthops(fi);

		nla_nest_end(skb, nest);
	}
#endif
	return nlmsg_end(skb, nlh);

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}

/*
E
Eric Dumazet 已提交
996 997 998 999
 * Update FIB if:
 * - local address disappeared -> we must delete all the entries
 *   referring to it.
 * - device went down -> we must shut down all nexthops going via it.
L
Linus Torvalds 已提交
1000
 */
1001
/*
 * fib_sync_down_addr - flag routes that prefer a vanished local address.
 * @net:   namespace whose routes are to be examined
 * @local: the local address that disappeared
 *
 * Walks the bucket of the local-address hash that @local maps to and sets
 * RTNH_F_DEAD on every fib_info in @net whose fib_prefsrc equals @local.
 *
 * Returns the number of fib_info entries that were flagged, or 0 when the
 * hash table has not been allocated or @local is 0.
 */
int fib_sync_down_addr(struct net *net, __be32 local)
{
	struct hlist_head *head;
	struct hlist_node *node;
	struct fib_info *fi;
	int ret = 0;

	if (fib_info_laddrhash == NULL || local == 0)
		return 0;

	/*
	 * Only index the table once it is known to exist; forming the
	 * bucket address from a NULL base is not valid pointer arithmetic.
	 */
	head = &fib_info_laddrhash[fib_laddr_hashfn(local)];

	hlist_for_each_entry(fi, node, head, fib_lhash) {
		/* The hash is shared, so skip entries of other namespaces. */
		if (!net_eq(fi->fib_net, net))
			continue;
		if (fi->fib_prefsrc == local) {
			fi->fib_flags |= RTNH_F_DEAD;
			ret++;
		}
	}

	return ret;
}

int fib_sync_down_dev(struct net_device *dev, int force)
{
	int ret = 0;
	int scope = RT_SCOPE_NOWHERE;
	struct fib_info *prev_fi = NULL;
	unsigned int hash = fib_devindex_hashfn(dev->ifindex);
	struct hlist_head *head = &fib_info_devhash[hash];
	struct hlist_node *node;
	struct fib_nh *nh;
L
Linus Torvalds 已提交
1032

D
Denis V. Lunev 已提交
1033 1034
	if (force)
		scope = -1;
L
Linus Torvalds 已提交
1035

D
Denis V. Lunev 已提交
1036 1037 1038
	hlist_for_each_entry(nh, node, head, nh_hash) {
		struct fib_info *fi = nh->nh_parent;
		int dead;
L
Linus Torvalds 已提交
1039

D
Denis V. Lunev 已提交
1040 1041 1042 1043 1044 1045
		BUG_ON(!fi->fib_nhs);
		if (nh->nh_dev != dev || fi == prev_fi)
			continue;
		prev_fi = fi;
		dead = 0;
		change_nexthops(fi) {
E
Eric Dumazet 已提交
1046
			if (nexthop_nh->nh_flags & RTNH_F_DEAD)
D
Denis V. Lunev 已提交
1047
				dead++;
1048 1049 1050
			else if (nexthop_nh->nh_dev == dev &&
				 nexthop_nh->nh_scope != scope) {
				nexthop_nh->nh_flags |= RTNH_F_DEAD;
L
Linus Torvalds 已提交
1051
#ifdef CONFIG_IP_ROUTE_MULTIPATH
D
Denis V. Lunev 已提交
1052
				spin_lock_bh(&fib_multipath_lock);
1053 1054
				fi->fib_power -= nexthop_nh->nh_power;
				nexthop_nh->nh_power = 0;
D
Denis V. Lunev 已提交
1055
				spin_unlock_bh(&fib_multipath_lock);
L
Linus Torvalds 已提交
1056
#endif
D
Denis V. Lunev 已提交
1057 1058
				dead++;
			}
L
Linus Torvalds 已提交
1059
#ifdef CONFIG_IP_ROUTE_MULTIPATH
1060
			if (force > 1 && nexthop_nh->nh_dev == dev) {
D
Denis V. Lunev 已提交
1061 1062
				dead = fi->fib_nhs;
				break;
L
Linus Torvalds 已提交
1063
			}
D
Denis V. Lunev 已提交
1064 1065 1066 1067 1068
#endif
		} endfor_nexthops(fi)
		if (dead == fi->fib_nhs) {
			fi->fib_flags |= RTNH_F_DEAD;
			ret++;
L
Linus Torvalds 已提交
1069 1070 1071 1072 1073 1074
		}
	}

	return ret;
}

1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127
/*
 * fib_select_default - pick among equivalent default-route candidates.
 * @res: lookup result; res->fi may be replaced and res->table->tb_default
 *       is updated with the index of the chosen candidate (or -1).
 *
 * Candidates are the unicast aliases on res->fa_head that share res->scope,
 * do not exceed the priority of res->fi, and whose first nexthop has a
 * gateway with link scope.  Each candidate is handed to fib_detect_death();
 * the first one for which it returns 0 is selected.  If none qualifies, the
 * last-resort entry recorded by fib_detect_death() is used when there is
 * one.
 *
 * Must be invoked inside of an RCU protected region.
 */
void fib_select_default(struct fib_result *res)
{
	struct list_head *fa_head = res->fa_head;
	struct fib_table *tb = res->table;
	struct fib_info *cand = NULL;
	struct fib_info *fallback = NULL;
	int fallback_idx = -1;
	int order = -1;
	struct fib_alias *fa;

	list_for_each_entry_rcu(fa, fa_head, fa_list) {
		struct fib_info *next_fi = fa->fa_info;

		if (fa->fa_scope != res->scope)
			continue;
		if (fa->fa_type != RTN_UNICAST)
			continue;

		if (next_fi->fib_priority > res->fi->fib_priority)
			break;

		if (!next_fi->fib_nh[0].nh_gw)
			continue;
		if (next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
			continue;

		fib_alias_accessed(fa);

		if (cand == NULL) {
			/* The first usable alias has to be the looked-up one. */
			if (next_fi != res->fi)
				break;
		} else if (!fib_detect_death(cand, order, &fallback,
					     &fallback_idx, tb->tb_default)) {
			fib_result_assign(res, cand);
			tb->tb_default = order;
			return;
		}

		cand = next_fi;
		order++;
	}

	/* Fewer than two candidates were seen: nothing to choose between. */
	if (order <= 0 || cand == NULL) {
		tb->tb_default = -1;
		return;
	}

	/* The final candidate has not been checked by the loop yet. */
	if (!fib_detect_death(cand, order, &fallback, &fallback_idx,
			      tb->tb_default)) {
		fib_result_assign(res, cand);
		tb->tb_default = order;
		return;
	}

	if (fallback_idx >= 0)
		fib_result_assign(res, fallback);
	tb->tb_default = fallback_idx;
}

L
Linus Torvalds 已提交
1131 1132 1133
#ifdef CONFIG_IP_ROUTE_MULTIPATH

/*
E
Eric Dumazet 已提交
1134 1135
 * A dead device comes back up. We wake up its dead nexthops.
 * It makes sense only on multipath routes.
L
Linus Torvalds 已提交
1136 1137 1138 1139 1140 1141 1142 1143 1144 1145
 */
int fib_sync_up(struct net_device *dev)
{
	struct fib_info *prev_fi;
	unsigned int hash;
	struct hlist_head *head;
	struct hlist_node *node;
	struct fib_nh *nh;
	int ret;

E
Eric Dumazet 已提交
1146
	if (!(dev->flags & IFF_UP))
L
Linus Torvalds 已提交
1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164
		return 0;

	prev_fi = NULL;
	hash = fib_devindex_hashfn(dev->ifindex);
	head = &fib_info_devhash[hash];
	ret = 0;

	hlist_for_each_entry(nh, node, head, nh_hash) {
		struct fib_info *fi = nh->nh_parent;
		int alive;

		BUG_ON(!fi->fib_nhs);
		if (nh->nh_dev != dev || fi == prev_fi)
			continue;

		prev_fi = fi;
		alive = 0;
		change_nexthops(fi) {
E
Eric Dumazet 已提交
1165
			if (!(nexthop_nh->nh_flags & RTNH_F_DEAD)) {
L
Linus Torvalds 已提交
1166 1167 1168
				alive++;
				continue;
			}
1169
			if (nexthop_nh->nh_dev == NULL ||
E
Eric Dumazet 已提交
1170
			    !(nexthop_nh->nh_dev->flags & IFF_UP))
L
Linus Torvalds 已提交
1171
				continue;
1172 1173
			if (nexthop_nh->nh_dev != dev ||
			    !__in_dev_get_rtnl(dev))
L
Linus Torvalds 已提交
1174 1175 1176
				continue;
			alive++;
			spin_lock_bh(&fib_multipath_lock);
1177 1178
			nexthop_nh->nh_power = 0;
			nexthop_nh->nh_flags &= ~RTNH_F_DEAD;
L
Linus Torvalds 已提交
1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191
			spin_unlock_bh(&fib_multipath_lock);
		} endfor_nexthops(fi)

		if (alive > 0) {
			fi->fib_flags &= ~RTNH_F_DEAD;
			ret++;
		}
	}

	return ret;
}

/*
E
Eric Dumazet 已提交
1192 1193
 * The algorithm is suboptimal, but it provides really
 * fair weighted route distribution.
L
Linus Torvalds 已提交
1194 1195 1196 1197 1198 1199 1200 1201 1202 1203
 */
/*
 * fib_select_multipath - choose one nexthop of a multipath route.
 * @flp: flow being routed (not consulted by this function)
 * @res: lookup result; res->nh_sel receives the index of the chosen nexthop
 *
 * Each live nexthop holds a budget (nh_power) that is refilled from its
 * nh_weight whenever the route's total budget (fib_power) is exhausted.
 * Every selection spends one unit from the chosen nexthop and from the
 * total.  All budget accounting is done under fib_multipath_lock.
 *
 * res->nh_sel is set to 0 when no live nexthop with remaining budget is
 * found.
 */
void fib_select_multipath(const struct flowi *flp, struct fib_result *res)
{
	struct fib_info *fi = res->fi;
	int w;

	spin_lock_bh(&fib_multipath_lock);

	if (fi->fib_power <= 0) {
		int total = 0;

		/* Budget used up: refill every live nexthop from its weight. */
		change_nexthops(fi) {
			if (nexthop_nh->nh_flags & RTNH_F_DEAD)
				continue;
			total += nexthop_nh->nh_weight;
			nexthop_nh->nh_power = nexthop_nh->nh_weight;
		} endfor_nexthops(fi);

		fi->fib_power = total;
		if (total <= 0) {
			spin_unlock_bh(&fib_multipath_lock);
			/* Race condition: route has just become dead. */
			res->nh_sel = 0;
			return;
		}
	}

	/*
	 * w should be a random number in [0..fi->fib_power-1];
	 * jiffies is a pretty bad approximation of one.
	 */
	w = jiffies % fi->fib_power;

	change_nexthops(fi) {
		if (nexthop_nh->nh_flags & RTNH_F_DEAD)
			continue;
		if (!nexthop_nh->nh_power)
			continue;

		w -= nexthop_nh->nh_power;
		if (w > 0)
			continue;

		/* This nexthop's budget covers w: spend one unit on it. */
		nexthop_nh->nh_power--;
		fi->fib_power--;
		res->nh_sel = nhsel;
		spin_unlock_bh(&fib_multipath_lock);
		return;
	} endfor_nexthops(fi);

	/* Race condition: route has just become dead. */
	res->nh_sel = 0;
	spin_unlock_bh(&fib_multipath_lock);
}
#endif