fib_semantics.c 29.0 KB
Newer Older
L
Linus Torvalds 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		IPv4 Forwarding Information Base: semantics.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */

#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/jiffies.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
29
#include <linux/inetdevice.h>
L
Linus Torvalds 已提交
30 31 32 33 34
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/proc_fs.h>
#include <linux/skbuff.h>
#include <linux/init.h>
35
#include <linux/slab.h>
L
Linus Torvalds 已提交
36

37
#include <net/arp.h>
L
Linus Torvalds 已提交
38 39 40 41 42 43
#include <net/ip.h>
#include <net/protocol.h>
#include <net/route.h>
#include <net/tcp.h>
#include <net/sock.h>
#include <net/ip_fib.h>
44
#include <net/netlink.h>
45
#include <net/nexthop.h>
L
Linus Torvalds 已提交
46 47 48

#include "fib_lookup.h"

49
/* Taken around every update and walk of the fib_info hash tables in this file. */
static DEFINE_SPINLOCK(fib_info_lock);

/* fib_info records, bucketed by fib_info_hashfn(). */
static struct hlist_head *fib_info_hash;
/* fib_info records that carry a preferred source, bucketed by fib_laddr_hashfn(). */
static struct hlist_head *fib_info_laddrhash;
/* Bucket count shared by the two tables above. */
static unsigned int fib_info_hash_size;
/* Incremented by fib_create_info(), decremented by free_fib_info(). */
static unsigned int fib_info_cnt;

#define DEVINDEX_HASHBITS 8
#define DEVINDEX_HASHSIZE (1U << DEVINDEX_HASHBITS)
/* Nexthops, bucketed by fib_devindex_hashfn() of their device ifindex. */
static struct hlist_head fib_info_devhash[DEVINDEX_HASHSIZE];

#ifdef CONFIG_IP_ROUTE_MULTIPATH

static DEFINE_SPINLOCK(fib_multipath_lock);

/*
 * Nexthop iteration helpers.
 *
 * for_nexthops(fi) opens a scope and loops over every nexthop of @fi with a
 * read-only cursor "nh" and its index "nhsel".  change_nexthops(fi) does
 * the same with a writable cursor "nexthop_nh".  Both leave a brace open,
 * so every use has to be terminated with endfor_nexthops(fi).
 */
#define for_nexthops(fi) {						\
	int nhsel; const struct fib_nh *nh;				\
	for (nhsel = 0, nh = (fi)->fib_nh;				\
	     nhsel < (fi)->fib_nhs;					\
	     nh++, nhsel++)

#define change_nexthops(fi) {						\
	int nhsel; struct fib_nh *nexthop_nh;				\
	for (nhsel = 0,	nexthop_nh = (struct fib_nh *)((fi)->fib_nh);	\
	     nhsel < (fi)->fib_nhs;					\
	     nexthop_nh++, nhsel++)

#else /* CONFIG_IP_ROUTE_MULTIPATH */

/*
 * Without multipath only the first nexthop is visited.  The one-pass loop
 * is kept so that the body may still use "continue"; the compiler is
 * expected to optimise the dummy loop away.
 */
#define for_nexthops(fi) {						\
	int nhsel; const struct fib_nh *nh = (fi)->fib_nh;		\
	for (nhsel = 0; nhsel < 1; nhsel++)

#define change_nexthops(fi) {						\
	int nhsel;							\
	struct fib_nh *nexthop_nh = (struct fib_nh *)((fi)->fib_nh);	\
	for (nhsel = 0; nhsel < 1; nhsel++)

#endif /* CONFIG_IP_ROUTE_MULTIPATH */

/* Closes the scope opened by for_nexthops() / change_nexthops(). */
#define endfor_nexthops(fi) }


93
const struct fib_prop fib_props[RTN_MAX + 1] = {
E
Eric Dumazet 已提交
94
	[RTN_UNSPEC] = {
L
Linus Torvalds 已提交
95 96
		.error	= 0,
		.scope	= RT_SCOPE_NOWHERE,
E
Eric Dumazet 已提交
97 98
	},
	[RTN_UNICAST] = {
L
Linus Torvalds 已提交
99 100
		.error	= 0,
		.scope	= RT_SCOPE_UNIVERSE,
E
Eric Dumazet 已提交
101 102
	},
	[RTN_LOCAL] = {
L
Linus Torvalds 已提交
103 104
		.error	= 0,
		.scope	= RT_SCOPE_HOST,
E
Eric Dumazet 已提交
105 106
	},
	[RTN_BROADCAST] = {
L
Linus Torvalds 已提交
107 108
		.error	= 0,
		.scope	= RT_SCOPE_LINK,
E
Eric Dumazet 已提交
109 110
	},
	[RTN_ANYCAST] = {
L
Linus Torvalds 已提交
111 112
		.error	= 0,
		.scope	= RT_SCOPE_LINK,
E
Eric Dumazet 已提交
113 114
	},
	[RTN_MULTICAST] = {
L
Linus Torvalds 已提交
115 116
		.error	= 0,
		.scope	= RT_SCOPE_UNIVERSE,
E
Eric Dumazet 已提交
117 118
	},
	[RTN_BLACKHOLE] = {
L
Linus Torvalds 已提交
119 120
		.error	= -EINVAL,
		.scope	= RT_SCOPE_UNIVERSE,
E
Eric Dumazet 已提交
121 122
	},
	[RTN_UNREACHABLE] = {
L
Linus Torvalds 已提交
123 124
		.error	= -EHOSTUNREACH,
		.scope	= RT_SCOPE_UNIVERSE,
E
Eric Dumazet 已提交
125 126
	},
	[RTN_PROHIBIT] = {
L
Linus Torvalds 已提交
127 128
		.error	= -EACCES,
		.scope	= RT_SCOPE_UNIVERSE,
E
Eric Dumazet 已提交
129 130
	},
	[RTN_THROW] = {
L
Linus Torvalds 已提交
131 132
		.error	= -EAGAIN,
		.scope	= RT_SCOPE_UNIVERSE,
E
Eric Dumazet 已提交
133 134
	},
	[RTN_NAT] = {
L
Linus Torvalds 已提交
135 136
		.error	= -EINVAL,
		.scope	= RT_SCOPE_NOWHERE,
E
Eric Dumazet 已提交
137 138
	},
	[RTN_XRESOLVE] = {
L
Linus Torvalds 已提交
139 140
		.error	= -EINVAL,
		.scope	= RT_SCOPE_NOWHERE,
E
Eric Dumazet 已提交
141
	},
L
Linus Torvalds 已提交
142 143 144 145 146
};


/* Release a nexthop info record */

E
Eric Dumazet 已提交
147 148 149 150
static void free_fib_info_rcu(struct rcu_head *head)
{
	struct fib_info *fi = container_of(head, struct fib_info, rcu);

151 152
	if (fi->fib_metrics != (u32 *) dst_default_metrics)
		kfree(fi->fib_metrics);
E
Eric Dumazet 已提交
153 154 155
	kfree(fi);
}

L
Linus Torvalds 已提交
156 157 158
void free_fib_info(struct fib_info *fi)
{
	if (fi->fib_dead == 0) {
E
Eric Dumazet 已提交
159
		pr_warning("Freeing alive fib_info %p\n", fi);
L
Linus Torvalds 已提交
160 161 162
		return;
	}
	change_nexthops(fi) {
163 164 165
		if (nexthop_nh->nh_dev)
			dev_put(nexthop_nh->nh_dev);
		nexthop_nh->nh_dev = NULL;
L
Linus Torvalds 已提交
166 167
	} endfor_nexthops(fi);
	fib_info_cnt--;
168
	release_net(fi->fib_net);
E
Eric Dumazet 已提交
169
	call_rcu(&fi->rcu, free_fib_info_rcu);
L
Linus Torvalds 已提交
170 171 172 173
}

void fib_release_info(struct fib_info *fi)
{
174
	spin_lock_bh(&fib_info_lock);
L
Linus Torvalds 已提交
175 176 177 178 179
	if (fi && --fi->fib_treeref == 0) {
		hlist_del(&fi->fib_hash);
		if (fi->fib_prefsrc)
			hlist_del(&fi->fib_lhash);
		change_nexthops(fi) {
180
			if (!nexthop_nh->nh_dev)
L
Linus Torvalds 已提交
181
				continue;
182
			hlist_del(&nexthop_nh->nh_hash);
L
Linus Torvalds 已提交
183 184 185 186
		} endfor_nexthops(fi)
		fi->fib_dead = 1;
		fib_info_put(fi);
	}
187
	spin_unlock_bh(&fib_info_lock);
L
Linus Torvalds 已提交
188 189
}

E
Eric Dumazet 已提交
190
static inline int nh_comp(const struct fib_info *fi, const struct fib_info *ofi)
L
Linus Torvalds 已提交
191 192 193 194 195 196 197 198 199 200
{
	const struct fib_nh *onh = ofi->fib_nh;

	for_nexthops(fi) {
		if (nh->nh_oif != onh->nh_oif ||
		    nh->nh_gw  != onh->nh_gw ||
		    nh->nh_scope != onh->nh_scope ||
#ifdef CONFIG_IP_ROUTE_MULTIPATH
		    nh->nh_weight != onh->nh_weight ||
#endif
201
#ifdef CONFIG_IP_ROUTE_CLASSID
L
Linus Torvalds 已提交
202 203
		    nh->nh_tclassid != onh->nh_tclassid ||
#endif
E
Eric Dumazet 已提交
204
		    ((nh->nh_flags ^ onh->nh_flags) & ~RTNH_F_DEAD))
L
Linus Torvalds 已提交
205 206 207 208 209 210
			return -1;
		onh++;
	} endfor_nexthops(fi);
	return 0;
}

211 212 213 214 215 216 217 218 219
/*
 * Fold @val (a device ifindex at the call sites in this file) into a
 * bucket index of fib_info_devhash[].
 */
static inline unsigned int fib_devindex_hashfn(unsigned int val)
{
	unsigned int folded = val;

	folded ^= val >> DEVINDEX_HASHBITS;
	folded ^= val >> (DEVINDEX_HASHBITS * 2);

	return folded & (DEVINDEX_HASHSIZE - 1);
}

L
Linus Torvalds 已提交
220 221
static inline unsigned int fib_info_hashfn(const struct fib_info *fi)
{
222
	unsigned int mask = (fib_info_hash_size - 1);
L
Linus Torvalds 已提交
223 224 225
	unsigned int val = fi->fib_nhs;

	val ^= fi->fib_protocol;
A
Al Viro 已提交
226
	val ^= (__force u32)fi->fib_prefsrc;
L
Linus Torvalds 已提交
227
	val ^= fi->fib_priority;
228 229 230
	for_nexthops(fi) {
		val ^= fib_devindex_hashfn(nh->nh_oif);
	} endfor_nexthops(fi)
L
Linus Torvalds 已提交
231 232 233 234 235 236 237 238 239 240 241 242 243 244 245

	return (val ^ (val >> 7) ^ (val >> 12)) & mask;
}

/*
 * Look for an already hashed fib_info equivalent to @nfi: same namespace,
 * nexthop count, protocol, preferred source, priority, metrics, flags
 * (ignoring RTNH_F_DEAD) and nexthops.  Returns it, or NULL if none exists.
 */
static struct fib_info *fib_find_info(const struct fib_info *nfi)
{
	struct hlist_head *bucket = &fib_info_hash[fib_info_hashfn(nfi)];
	struct hlist_node *pos;
	struct fib_info *cand;

	hlist_for_each_entry(cand, pos, bucket, fib_hash) {
		if (!net_eq(cand->fib_net, nfi->fib_net))
			continue;
		if (cand->fib_nhs != nfi->fib_nhs)
			continue;
		if (nfi->fib_protocol != cand->fib_protocol ||
		    nfi->fib_prefsrc != cand->fib_prefsrc ||
		    nfi->fib_priority != cand->fib_priority)
			continue;
		if (memcmp(nfi->fib_metrics, cand->fib_metrics,
			   sizeof(u32) * RTAX_MAX) != 0)
			continue;
		if ((nfi->fib_flags ^ cand->fib_flags) & ~RTNH_F_DEAD)
			continue;
		if (nfi->fib_nhs == 0 || nh_comp(cand, nfi) == 0)
			return cand;
	}

	return NULL;
}

/* Check, that the gateway is already configured.
E
Eric Dumazet 已提交
264
 * Used only by redirect accept routine.
L
Linus Torvalds 已提交
265
 */
266
int ip_fib_check_default(__be32 gw, struct net_device *dev)
L
Linus Torvalds 已提交
267 268 269 270 271 272
{
	struct hlist_head *head;
	struct hlist_node *node;
	struct fib_nh *nh;
	unsigned int hash;

273
	spin_lock(&fib_info_lock);
L
Linus Torvalds 已提交
274 275 276 277 278 279

	hash = fib_devindex_hashfn(dev->ifindex);
	head = &fib_info_devhash[hash];
	hlist_for_each_entry(nh, node, head, nh_hash) {
		if (nh->nh_dev == dev &&
		    nh->nh_gw == gw &&
E
Eric Dumazet 已提交
280
		    !(nh->nh_flags & RTNH_F_DEAD)) {
281
			spin_unlock(&fib_info_lock);
L
Linus Torvalds 已提交
282 283 284 285
			return 0;
		}
	}

286
	spin_unlock(&fib_info_lock);
L
Linus Torvalds 已提交
287 288 289 290

	return -1;
}

291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317
/*
 * Upper bound on the netlink payload fib_dump_info() may emit for @fi.
 * rtmsg_fib() uses it to size the skb.
 */
static inline size_t fib_nlmsg_size(struct fib_info *fi)
{
	size_t payload = NLMSG_ALIGN(sizeof(struct rtmsg));
	size_t nhsize;

	payload += nla_total_size(4);	/* RTA_TABLE */
	payload += nla_total_size(4);	/* RTA_DST */
	payload += nla_total_size(4);	/* RTA_PRIORITY */
	payload += nla_total_size(4);	/* RTA_PREFSRC */

	/* space for nested metrics */
	payload += nla_total_size((RTAX_MAX * nla_total_size(4)));

	if (!fi->fib_nhs)
		return payload;

	/*
	 * Also covers fib_nhs == 1.  Each nexthop is packed in an attribute
	 * that may in turn contain a flow and a gateway attribute, and all
	 * nexthops are packed in one nested attribute.
	 */
	nhsize = nla_total_size(sizeof(struct rtnexthop));
	nhsize += 2 * nla_total_size(4);
	payload += nla_total_size(fi->fib_nhs * nhsize);

	return payload;
}

A
Al Viro 已提交
318
/*
 * Build a route message describing @fa for @event and hand it to
 * rtnl_notify() for the RTNLGRP_IPV4_ROUTE group.  If the message cannot
 * be allocated or filled, the error is recorded with rtnl_set_sk_err().
 */
void rtmsg_fib(int event, __be32 key, struct fib_alias *fa,
	       int dst_len, u32 tb_id, struct nl_info *info,
	       unsigned int nlm_flags)
{
	u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
	struct sk_buff *skb;
	int err = -ENOBUFS;

	skb = nlmsg_new(fib_nlmsg_size(fa->fa_info), GFP_KERNEL);
	if (skb != NULL) {
		err = fib_dump_info(skb, info->pid, seq, event, tb_id,
				    fa->fa_type, fa->fa_scope, key, dst_len,
				    fa->fa_tos, fa->fa_info, nlm_flags);
		if (err >= 0) {
			rtnl_notify(skb, info->nl_net, info->pid,
				    RTNLGRP_IPV4_ROUTE, info->nlh, GFP_KERNEL);
			return;
		}

		/* -EMSGSIZE implies BUG in fib_nlmsg_size() */
		WARN_ON(err == -EMSGSIZE);
		kfree_skb(skb);
	}

	if (err < 0)
		rtnl_set_sk_err(info->nl_net, RTNLGRP_IPV4_ROUTE, err);
}

/*
 * Walk the alias list @fah (may be NULL) and return the first entry whose
 * TOS is not greater than @tos and that either has a fib_priority value of
 * at least @prio or a TOS strictly lower than @tos.  Returns NULL when
 * there is no such entry.
 */
struct fib_alias *fib_find_alias(struct list_head *fah, u8 tos, u32 prio)
{
	struct fib_alias *fa;

	if (!fah)
		return NULL;

	list_for_each_entry(fa, fah, fa_list) {
		if (fa->fa_tos > tos)
			continue;
		if (fa->fa_info->fib_priority >= prio || fa->fa_tos < tos)
			return fa;
	}

	return NULL;
}

int fib_detect_death(struct fib_info *fi, int order,
366
		     struct fib_info **last_resort, int *last_idx, int dflt)
L
Linus Torvalds 已提交
367 368 369 370 371 372 373 374 375
{
	struct neighbour *n;
	int state = NUD_NONE;

	n = neigh_lookup(&arp_tbl, &fi->fib_nh[0].nh_gw, fi->fib_dev);
	if (n) {
		state = n->nud_state;
		neigh_release(n);
	}
376
	if (state == NUD_REACHABLE)
L
Linus Torvalds 已提交
377
		return 0;
E
Eric Dumazet 已提交
378
	if ((state & NUD_VALID) && order != dflt)
L
Linus Torvalds 已提交
379
		return 0;
E
Eric Dumazet 已提交
380 381
	if ((state & NUD_VALID) ||
	    (*last_idx < 0 && order > dflt)) {
L
Linus Torvalds 已提交
382 383 384 385 386 387 388 389
		*last_resort = fi;
		*last_idx = order;
	}
	return 1;
}

#ifdef CONFIG_IP_ROUTE_MULTIPATH

390
/*
 * Count the nexthop entries in the @remaining bytes starting at @rtnh.
 * Returns 0 when bytes are left over after the last well-formed entry.
 */
static int fib_count_nexthops(struct rtnexthop *rtnh, int remaining)
{
	int count;

	for (count = 0; rtnh_ok(rtnh, remaining); count++)
		rtnh = rtnh_next(rtnh, &remaining);

	/* leftover implies invalid nexthop configuration, discard it */
	if (remaining > 0)
		return 0;

	return count;
}

403 404
static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh,
		       int remaining, struct fib_config *cfg)
L
Linus Torvalds 已提交
405 406
{
	change_nexthops(fi) {
407 408 409
		int attrlen;

		if (!rtnh_ok(rtnh, remaining))
L
Linus Torvalds 已提交
410
			return -EINVAL;
411

412 413 414 415
		nexthop_nh->nh_flags =
			(cfg->fc_flags & ~0xFF) | rtnh->rtnh_flags;
		nexthop_nh->nh_oif = rtnh->rtnh_ifindex;
		nexthop_nh->nh_weight = rtnh->rtnh_hops + 1;
416 417 418 419 420 421

		attrlen = rtnh_attrlen(rtnh);
		if (attrlen > 0) {
			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);

			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
422
			nexthop_nh->nh_gw = nla ? nla_get_be32(nla) : 0;
423
#ifdef CONFIG_IP_ROUTE_CLASSID
424
			nla = nla_find(attrs, attrlen, RTA_FLOW);
425
			nexthop_nh->nh_tclassid = nla ? nla_get_u32(nla) : 0;
L
Linus Torvalds 已提交
426 427
#endif
		}
428 429

		rtnh = rtnh_next(rtnh, &remaining);
L
Linus Torvalds 已提交
430
	} endfor_nexthops(fi);
431

L
Linus Torvalds 已提交
432 433 434 435 436
	return 0;
}

#endif

437
int fib_nh_match(struct fib_config *cfg, struct fib_info *fi)
L
Linus Torvalds 已提交
438 439
{
#ifdef CONFIG_IP_ROUTE_MULTIPATH
440 441
	struct rtnexthop *rtnh;
	int remaining;
L
Linus Torvalds 已提交
442 443
#endif

444
	if (cfg->fc_priority && cfg->fc_priority != fi->fib_priority)
L
Linus Torvalds 已提交
445 446
		return 1;

447 448 449
	if (cfg->fc_oif || cfg->fc_gw) {
		if ((!cfg->fc_oif || cfg->fc_oif == fi->fib_nh->nh_oif) &&
		    (!cfg->fc_gw  || cfg->fc_gw == fi->fib_nh->nh_gw))
L
Linus Torvalds 已提交
450 451 452 453 454
			return 0;
		return 1;
	}

#ifdef CONFIG_IP_ROUTE_MULTIPATH
455
	if (cfg->fc_mp == NULL)
L
Linus Torvalds 已提交
456
		return 0;
457 458 459

	rtnh = cfg->fc_mp;
	remaining = cfg->fc_mp_len;
460

L
Linus Torvalds 已提交
461
	for_nexthops(fi) {
462
		int attrlen;
L
Linus Torvalds 已提交
463

464
		if (!rtnh_ok(rtnh, remaining))
L
Linus Torvalds 已提交
465
			return -EINVAL;
466 467

		if (rtnh->rtnh_ifindex && rtnh->rtnh_ifindex != nh->nh_oif)
L
Linus Torvalds 已提交
468
			return 1;
469 470 471 472 473 474

		attrlen = rtnh_attrlen(rtnh);
		if (attrlen < 0) {
			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);

			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
475
			if (nla && nla_get_be32(nla) != nh->nh_gw)
L
Linus Torvalds 已提交
476
				return 1;
477
#ifdef CONFIG_IP_ROUTE_CLASSID
478 479
			nla = nla_find(attrs, attrlen, RTA_FLOW);
			if (nla && nla_get_u32(nla) != nh->nh_tclassid)
L
Linus Torvalds 已提交
480 481 482
				return 1;
#endif
		}
483 484

		rtnh = rtnh_next(rtnh, &remaining);
L
Linus Torvalds 已提交
485 486 487 488 489 490 491
	} endfor_nexthops(fi);
#endif
	return 0;
}


/*
 * Picture
 * -------
 *
 * Semantics of nexthop is very messy for historical reasons.
 * We have to take into account that:
 * a) gateway can actually be a local interface address,
 *    so that a gatewayed route is direct.
 * b) gateway must be an on-link address, possibly
 *    described not by an ifaddr, but also by a direct route.
 * c) If both gateway and interface are specified, they should not
 *    contradict.
 * d) If we use tunnel routes, gateway could be not on-link.
 *
 * Attempt to reconcile all of these (alas, self-contradictory) conditions
 * results in pretty ugly and hairy code with obscure logic.
 *
 * I chose to generalize it instead, so that the size
 * of code does not increase practically, but it becomes
 * much more general.
 * Every prefix is assigned a "scope" value: "host" is local address,
 * "link" is direct route,
 * [ ... "site" ... "interior" ... ]
 * and "universe" is true gateway route with global meaning.
 *
 * Every prefix refers to a set of "nexthop"s (gw, oif),
 * where gw must have narrower scope. This recursion stops
 * when gw has LOCAL scope or if "nexthop" is declared ONLINK,
 * which means that gw is forced to be on link.
 *
 * Code is still hairy, but now it is apparently logically
 * consistent and very flexible. E.g. as a by-product it allows
 * independent exterior and interior routing processes
 * to co-exist in peace.
 *
 * Normally it looks as follows.
 *
 * {universe prefix}  -> (gw, oif) [scope link]
 *		  |
 *		  |-> {link prefix} -> (gw, oif) [scope local]
 *					|
 *					|-> {local prefix} (terminal node)
 */
534 535
static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi,
			struct fib_nh *nh)
L
Linus Torvalds 已提交
536 537
{
	int err;
538
	struct net *net;
E
Eric Dumazet 已提交
539
	struct net_device *dev;
L
Linus Torvalds 已提交
540

541
	net = cfg->fc_nlinfo.nl_net;
L
Linus Torvalds 已提交
542 543 544
	if (nh->nh_gw) {
		struct fib_result res;

E
Eric Dumazet 已提交
545
		if (nh->nh_flags & RTNH_F_ONLINK) {
L
Linus Torvalds 已提交
546

547
			if (cfg->fc_scope >= RT_SCOPE_LINK)
L
Linus Torvalds 已提交
548
				return -EINVAL;
549
			if (inet_addr_type(net, nh->nh_gw) != RTN_UNICAST)
L
Linus Torvalds 已提交
550
				return -EINVAL;
E
Eric Dumazet 已提交
551 552
			dev = __dev_get_by_index(net, nh->nh_oif);
			if (!dev)
L
Linus Torvalds 已提交
553
				return -ENODEV;
E
Eric Dumazet 已提交
554
			if (!(dev->flags & IFF_UP))
L
Linus Torvalds 已提交
555 556 557 558 559 560
				return -ENETDOWN;
			nh->nh_dev = dev;
			dev_hold(dev);
			nh->nh_scope = RT_SCOPE_LINK;
			return 0;
		}
E
Eric Dumazet 已提交
561
		rcu_read_lock();
L
Linus Torvalds 已提交
562
		{
D
David S. Miller 已提交
563 564 565 566
			struct flowi4 fl4 = {
				.daddr = nh->nh_gw,
				.flowi4_scope = cfg->fc_scope + 1,
				.flowi4_oif = nh->nh_oif,
567
			};
L
Linus Torvalds 已提交
568 569

			/* It is not necessary, but requires a bit of thinking */
D
David S. Miller 已提交
570 571 572
			if (fl4.flowi4_scope < RT_SCOPE_LINK)
				fl4.flowi4_scope = RT_SCOPE_LINK;
			err = fib_lookup(net, &fl4, &res);
E
Eric Dumazet 已提交
573 574
			if (err) {
				rcu_read_unlock();
L
Linus Torvalds 已提交
575
				return err;
E
Eric Dumazet 已提交
576
			}
L
Linus Torvalds 已提交
577 578 579 580 581 582
		}
		err = -EINVAL;
		if (res.type != RTN_UNICAST && res.type != RTN_LOCAL)
			goto out;
		nh->nh_scope = res.scope;
		nh->nh_oif = FIB_RES_OIF(res);
E
Eric Dumazet 已提交
583 584
		nh->nh_dev = dev = FIB_RES_DEV(res);
		if (!dev)
L
Linus Torvalds 已提交
585
			goto out;
E
Eric Dumazet 已提交
586
		dev_hold(dev);
587
		err = (dev->flags & IFF_UP) ? 0 : -ENETDOWN;
L
Linus Torvalds 已提交
588 589 590
	} else {
		struct in_device *in_dev;

E
Eric Dumazet 已提交
591
		if (nh->nh_flags & (RTNH_F_PERVASIVE | RTNH_F_ONLINK))
L
Linus Torvalds 已提交
592 593
			return -EINVAL;

594 595
		rcu_read_lock();
		err = -ENODEV;
596
		in_dev = inetdev_by_index(net, nh->nh_oif);
L
Linus Torvalds 已提交
597
		if (in_dev == NULL)
598 599 600 601
			goto out;
		err = -ENETDOWN;
		if (!(in_dev->dev->flags & IFF_UP))
			goto out;
L
Linus Torvalds 已提交
602 603 604
		nh->nh_dev = in_dev->dev;
		dev_hold(nh->nh_dev);
		nh->nh_scope = RT_SCOPE_HOST;
605
		err = 0;
L
Linus Torvalds 已提交
606
	}
607 608 609
out:
	rcu_read_unlock();
	return err;
L
Linus Torvalds 已提交
610 611
}

A
Al Viro 已提交
612
/* Bucket index of the address @val in fib_info_laddrhash[]. */
static inline unsigned int fib_laddr_hashfn(__be32 val)
{
	const unsigned int mask = fib_info_hash_size - 1;
	u32 raw = (__force u32)val;

	return (raw ^ (raw >> 7) ^ (raw >> 14)) & mask;
}

621
static struct hlist_head *fib_info_hash_alloc(int bytes)
L
Linus Torvalds 已提交
622 623
{
	if (bytes <= PAGE_SIZE)
624
		return kzalloc(bytes, GFP_KERNEL);
L
Linus Torvalds 已提交
625 626
	else
		return (struct hlist_head *)
E
Eric Dumazet 已提交
627 628
			__get_free_pages(GFP_KERNEL | __GFP_ZERO,
					 get_order(bytes));
L
Linus Torvalds 已提交
629 630
}

631
static void fib_info_hash_free(struct hlist_head *hash, int bytes)
L
Linus Torvalds 已提交
632 633 634 635 636 637 638 639 640 641
{
	if (!hash)
		return;

	if (bytes <= PAGE_SIZE)
		kfree(hash);
	else
		free_pages((unsigned long) hash, get_order(bytes));
}

642 643 644
static void fib_info_hash_move(struct hlist_head *new_info_hash,
			       struct hlist_head *new_laddrhash,
			       unsigned int new_size)
L
Linus Torvalds 已提交
645
{
646
	struct hlist_head *old_info_hash, *old_laddrhash;
647
	unsigned int old_size = fib_info_hash_size;
648
	unsigned int i, bytes;
L
Linus Torvalds 已提交
649

650
	spin_lock_bh(&fib_info_lock);
651 652
	old_info_hash = fib_info_hash;
	old_laddrhash = fib_info_laddrhash;
653
	fib_info_hash_size = new_size;
L
Linus Torvalds 已提交
654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690

	for (i = 0; i < old_size; i++) {
		struct hlist_head *head = &fib_info_hash[i];
		struct hlist_node *node, *n;
		struct fib_info *fi;

		hlist_for_each_entry_safe(fi, node, n, head, fib_hash) {
			struct hlist_head *dest;
			unsigned int new_hash;

			hlist_del(&fi->fib_hash);

			new_hash = fib_info_hashfn(fi);
			dest = &new_info_hash[new_hash];
			hlist_add_head(&fi->fib_hash, dest);
		}
	}
	fib_info_hash = new_info_hash;

	for (i = 0; i < old_size; i++) {
		struct hlist_head *lhead = &fib_info_laddrhash[i];
		struct hlist_node *node, *n;
		struct fib_info *fi;

		hlist_for_each_entry_safe(fi, node, n, lhead, fib_lhash) {
			struct hlist_head *ldest;
			unsigned int new_hash;

			hlist_del(&fi->fib_lhash);

			new_hash = fib_laddr_hashfn(fi->fib_prefsrc);
			ldest = &new_laddrhash[new_hash];
			hlist_add_head(&fi->fib_lhash, ldest);
		}
	}
	fib_info_laddrhash = new_laddrhash;

691
	spin_unlock_bh(&fib_info_lock);
692 693

	bytes = old_size * sizeof(struct hlist_head *);
694 695
	fib_info_hash_free(old_info_hash, bytes);
	fib_info_hash_free(old_laddrhash, bytes);
L
Linus Torvalds 已提交
696 697
}

698 699 700 701 702 703 704 705 706 707
/*
 * Recompute the source address cached in @nh from its device, gateway and
 * configured scope, stamp it with the current address generation of @net
 * and return it.
 */
__be32 fib_info_update_nh_saddr(struct net *net, struct fib_nh *nh)
{
	__be32 saddr;

	saddr = inet_select_addr(nh->nh_dev, nh->nh_gw, nh->nh_cfg_scope);
	nh->nh_saddr = saddr;
	nh->nh_saddr_genid = atomic_read(&net->ipv4.dev_addr_genid);

	return nh->nh_saddr;
}

708
struct fib_info *fib_create_info(struct fib_config *cfg)
L
Linus Torvalds 已提交
709 710 711 712 713
{
	int err;
	struct fib_info *fi = NULL;
	struct fib_info *ofi;
	int nhs = 1;
714
	struct net *net = cfg->fc_nlinfo.nl_net;
L
Linus Torvalds 已提交
715

716 717 718
	if (cfg->fc_type > RTN_MAX)
		goto err_inval;

L
Linus Torvalds 已提交
719
	/* Fast check to catch the most weird cases */
720
	if (fib_props[cfg->fc_type].scope > cfg->fc_scope)
L
Linus Torvalds 已提交
721 722 723
		goto err_inval;

#ifdef CONFIG_IP_ROUTE_MULTIPATH
724 725
	if (cfg->fc_mp) {
		nhs = fib_count_nexthops(cfg->fc_mp, cfg->fc_mp_len);
L
Linus Torvalds 已提交
726 727 728 729 730 731
		if (nhs == 0)
			goto err_inval;
	}
#endif

	err = -ENOBUFS;
732 733
	if (fib_info_cnt >= fib_info_hash_size) {
		unsigned int new_size = fib_info_hash_size << 1;
L
Linus Torvalds 已提交
734 735 736 737 738 739 740
		struct hlist_head *new_info_hash;
		struct hlist_head *new_laddrhash;
		unsigned int bytes;

		if (!new_size)
			new_size = 1;
		bytes = new_size * sizeof(struct hlist_head *);
741 742
		new_info_hash = fib_info_hash_alloc(bytes);
		new_laddrhash = fib_info_hash_alloc(bytes);
L
Linus Torvalds 已提交
743
		if (!new_info_hash || !new_laddrhash) {
744 745
			fib_info_hash_free(new_info_hash, bytes);
			fib_info_hash_free(new_laddrhash, bytes);
746
		} else
747
			fib_info_hash_move(new_info_hash, new_laddrhash, new_size);
L
Linus Torvalds 已提交
748

749
		if (!fib_info_hash_size)
L
Linus Torvalds 已提交
750 751 752
			goto failure;
	}

753
	fi = kzalloc(sizeof(*fi)+nhs*sizeof(struct fib_nh), GFP_KERNEL);
L
Linus Torvalds 已提交
754 755
	if (fi == NULL)
		goto failure;
756 757 758 759 760 761
	if (cfg->fc_mx) {
		fi->fib_metrics = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
		if (!fi->fib_metrics)
			goto failure;
	} else
		fi->fib_metrics = (u32 *) dst_default_metrics;
L
Linus Torvalds 已提交
762 763
	fib_info_cnt++;

764
	fi->fib_net = hold_net(net);
765 766 767 768
	fi->fib_protocol = cfg->fc_protocol;
	fi->fib_flags = cfg->fc_flags;
	fi->fib_priority = cfg->fc_priority;
	fi->fib_prefsrc = cfg->fc_prefsrc;
L
Linus Torvalds 已提交
769 770 771

	fi->fib_nhs = nhs;
	change_nexthops(fi) {
772
		nexthop_nh->nh_parent = fi;
L
Linus Torvalds 已提交
773 774
	} endfor_nexthops(fi)

775 776 777 778 779
	if (cfg->fc_mx) {
		struct nlattr *nla;
		int remaining;

		nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
780
			int type = nla_type(nla);
781 782 783

			if (type) {
				if (type > RTAX_MAX)
L
Linus Torvalds 已提交
784
					goto err_inval;
785
				fi->fib_metrics[type - 1] = nla_get_u32(nla);
L
Linus Torvalds 已提交
786 787 788 789
			}
		}
	}

790
	if (cfg->fc_mp) {
L
Linus Torvalds 已提交
791
#ifdef CONFIG_IP_ROUTE_MULTIPATH
792 793
		err = fib_get_nhs(fi, cfg->fc_mp, cfg->fc_mp_len, cfg);
		if (err != 0)
L
Linus Torvalds 已提交
794
			goto failure;
795
		if (cfg->fc_oif && fi->fib_nh->nh_oif != cfg->fc_oif)
L
Linus Torvalds 已提交
796
			goto err_inval;
797
		if (cfg->fc_gw && fi->fib_nh->nh_gw != cfg->fc_gw)
L
Linus Torvalds 已提交
798
			goto err_inval;
799
#ifdef CONFIG_IP_ROUTE_CLASSID
800
		if (cfg->fc_flow && fi->fib_nh->nh_tclassid != cfg->fc_flow)
L
Linus Torvalds 已提交
801 802 803 804 805 806 807
			goto err_inval;
#endif
#else
		goto err_inval;
#endif
	} else {
		struct fib_nh *nh = fi->fib_nh;
808 809 810 811

		nh->nh_oif = cfg->fc_oif;
		nh->nh_gw = cfg->fc_gw;
		nh->nh_flags = cfg->fc_flags;
812
#ifdef CONFIG_IP_ROUTE_CLASSID
813
		nh->nh_tclassid = cfg->fc_flow;
L
Linus Torvalds 已提交
814 815 816 817 818 819
#endif
#ifdef CONFIG_IP_ROUTE_MULTIPATH
		nh->nh_weight = 1;
#endif
	}

820 821
	if (fib_props[cfg->fc_type].error) {
		if (cfg->fc_gw || cfg->fc_oif || cfg->fc_mp)
L
Linus Torvalds 已提交
822 823
			goto err_inval;
		goto link_it;
824 825 826 827 828 829 830 831 832 833 834
	} else {
		switch (cfg->fc_type) {
		case RTN_UNICAST:
		case RTN_LOCAL:
		case RTN_BROADCAST:
		case RTN_ANYCAST:
		case RTN_MULTICAST:
			break;
		default:
			goto err_inval;
		}
L
Linus Torvalds 已提交
835 836
	}

837
	if (cfg->fc_scope > RT_SCOPE_HOST)
L
Linus Torvalds 已提交
838 839
		goto err_inval;

840
	if (cfg->fc_scope == RT_SCOPE_HOST) {
L
Linus Torvalds 已提交
841 842 843 844 845 846
		struct fib_nh *nh = fi->fib_nh;

		/* Local address is added. */
		if (nhs != 1 || nh->nh_gw)
			goto err_inval;
		nh->nh_scope = RT_SCOPE_NOWHERE;
847
		nh->nh_dev = dev_get_by_index(net, fi->fib_nh->nh_oif);
L
Linus Torvalds 已提交
848 849 850 851 852
		err = -ENODEV;
		if (nh->nh_dev == NULL)
			goto failure;
	} else {
		change_nexthops(fi) {
E
Eric Dumazet 已提交
853 854
			err = fib_check_nh(cfg, fi, nexthop_nh);
			if (err != 0)
L
Linus Torvalds 已提交
855 856 857 858 859
				goto failure;
		} endfor_nexthops(fi)
	}

	if (fi->fib_prefsrc) {
860 861
		if (cfg->fc_type != RTN_LOCAL || !cfg->fc_dst ||
		    fi->fib_prefsrc != cfg->fc_dst)
862
			if (inet_addr_type(net, fi->fib_prefsrc) != RTN_LOCAL)
L
Linus Torvalds 已提交
863 864 865
				goto err_inval;
	}

866
	change_nexthops(fi) {
867
		nexthop_nh->nh_cfg_scope = cfg->fc_scope;
868
		fib_info_update_nh_saddr(net, nexthop_nh);
869 870
	} endfor_nexthops(fi)

L
Linus Torvalds 已提交
871
link_it:
E
Eric Dumazet 已提交
872 873
	ofi = fib_find_info(fi);
	if (ofi) {
L
Linus Torvalds 已提交
874 875 876 877 878 879 880 881
		fi->fib_dead = 1;
		free_fib_info(fi);
		ofi->fib_treeref++;
		return ofi;
	}

	fi->fib_treeref++;
	atomic_inc(&fi->fib_clntref);
882
	spin_lock_bh(&fib_info_lock);
L
Linus Torvalds 已提交
883 884 885 886 887 888 889 890 891 892 893 894
	hlist_add_head(&fi->fib_hash,
		       &fib_info_hash[fib_info_hashfn(fi)]);
	if (fi->fib_prefsrc) {
		struct hlist_head *head;

		head = &fib_info_laddrhash[fib_laddr_hashfn(fi->fib_prefsrc)];
		hlist_add_head(&fi->fib_lhash, head);
	}
	change_nexthops(fi) {
		struct hlist_head *head;
		unsigned int hash;

895
		if (!nexthop_nh->nh_dev)
L
Linus Torvalds 已提交
896
			continue;
897
		hash = fib_devindex_hashfn(nexthop_nh->nh_dev->ifindex);
L
Linus Torvalds 已提交
898
		head = &fib_info_devhash[hash];
899
		hlist_add_head(&nexthop_nh->nh_hash, head);
L
Linus Torvalds 已提交
900
	} endfor_nexthops(fi)
901
	spin_unlock_bh(&fib_info_lock);
L
Linus Torvalds 已提交
902 903 904 905 906 907
	return fi;

err_inval:
	err = -EINVAL;

failure:
908
	if (fi) {
L
Linus Torvalds 已提交
909 910 911
		fi->fib_dead = 1;
		free_fib_info(fi);
	}
912 913

	return ERR_PTR(err);
L
Linus Torvalds 已提交
914 915
}

916
/*
 * Append to @skb a route netlink message of type @event describing the
 * route (@dst/@dst_len, @tos, @type, @scope, table @tb_id) that uses @fi.
 *
 * Returns the result of nlmsg_end() on success.  Returns -EMSGSIZE when
 * the message does not fit; in that case the partial message is cancelled.
 */
int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
		  u32 tb_id, u8 type, u8 scope, __be32 dst, int dst_len, u8 tos,
		  struct fib_info *fi, unsigned int flags)
{
	struct nlmsghdr *nlh;
	struct rtmsg *rtm;

	nlh = nlmsg_put(skb, pid, seq, event, sizeof(*rtm), flags);
	if (!nlh)
		return -EMSGSIZE;

	/* Fixed header first, attributes afterwards. */
	rtm = nlmsg_data(nlh);
	rtm->rtm_family = AF_INET;
	rtm->rtm_dst_len = dst_len;
	rtm->rtm_src_len = 0;
	rtm->rtm_tos = tos;
	/* Table ids of 256 and above are reported as RT_TABLE_COMPAT here. */
	rtm->rtm_table = (tb_id < 256) ? tb_id : RT_TABLE_COMPAT;
	rtm->rtm_type = type;
	rtm->rtm_flags = fi->fib_flags;
	rtm->rtm_scope = scope;
	rtm->rtm_protocol = fi->fib_protocol;

	/* The NLA_PUT*() macros jump to nla_put_failure when out of room. */
	NLA_PUT_U32(skb, RTA_TABLE, tb_id);

	if (rtm->rtm_dst_len)
		NLA_PUT_BE32(skb, RTA_DST, dst);

	if (fi->fib_priority)
		NLA_PUT_U32(skb, RTA_PRIORITY, fi->fib_priority);

	if (rtnetlink_put_metrics(skb, fi->fib_metrics) < 0)
		goto nla_put_failure;

	if (fi->fib_prefsrc)
		NLA_PUT_BE32(skb, RTA_PREFSRC, fi->fib_prefsrc);

	if (fi->fib_nhs == 1) {
		const struct fib_nh *only = &fi->fib_nh[0];

		if (only->nh_gw)
			NLA_PUT_BE32(skb, RTA_GATEWAY, only->nh_gw);

		if (only->nh_oif)
			NLA_PUT_U32(skb, RTA_OIF, only->nh_oif);
#ifdef CONFIG_IP_ROUTE_CLASSID
		if (only->nh_tclassid)
			NLA_PUT_U32(skb, RTA_FLOW, only->nh_tclassid);
#endif
	}
#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (fi->fib_nhs > 1) {
		struct rtnexthop *rtnh;
		struct nlattr *nest;

		nest = nla_nest_start(skb, RTA_MULTIPATH);
		if (nest == NULL)
			goto nla_put_failure;

		for_nexthops(fi) {
			rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
			if (rtnh == NULL)
				goto nla_put_failure;

			rtnh->rtnh_flags = nh->nh_flags & 0xFF;
			rtnh->rtnh_hops = nh->nh_weight - 1;
			rtnh->rtnh_ifindex = nh->nh_oif;

			if (nh->nh_gw)
				NLA_PUT_BE32(skb, RTA_GATEWAY, nh->nh_gw);
#ifdef CONFIG_IP_ROUTE_CLASSID
			if (nh->nh_tclassid)
				NLA_PUT_U32(skb, RTA_FLOW, nh->nh_tclassid);
#endif
			/* length of rtnetlink header + attributes */
			rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *) rtnh;
		} endfor_nexthops(fi);

		nla_nest_end(skb, nest);
	}
#endif
	return nlmsg_end(skb, nlh);

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}

/*
 * Update FIB if:
 * - local address disappeared -> we must delete all the entries
 *   referring to it.
 * - device went down -> we must shutdown all nexthops going via it.
 */
/*
 * A local address went away: flag as dead every fib_info in the local
 * address hash bucket for "local" that belongs to "net" and uses "local"
 * as its preferred source address.
 *
 * Returns the number of fib_info entries that were flagged RTNH_F_DEAD,
 * or 0 when the local-address hash table has not been allocated or
 * "local" is 0.
 */
int fib_sync_down_addr(struct net *net, __be32 local)
{
	struct hlist_head *head;
	struct hlist_node *node;
	struct fib_info *fi;
	unsigned int hash;
	int ret = 0;

	if (fib_info_laddrhash == NULL || local == 0)
		return 0;

	/*
	 * Only index the table once it is known to exist; forming
	 * &fib_info_laddrhash[hash] from a NULL base is not valid C.
	 */
	hash = fib_laddr_hashfn(local);
	head = &fib_info_laddrhash[hash];

	hlist_for_each_entry(fi, node, head, fib_lhash) {
		/* The bucket is shared; skip entries of other namespaces. */
		if (!net_eq(fi->fib_net, net))
			continue;
		if (fi->fib_prefsrc == local) {
			fi->fib_flags |= RTNH_F_DEAD;
			ret++;
		}
	}
	return ret;
}

int fib_sync_down_dev(struct net_device *dev, int force)
{
	int ret = 0;
	int scope = RT_SCOPE_NOWHERE;
	struct fib_info *prev_fi = NULL;
	unsigned int hash = fib_devindex_hashfn(dev->ifindex);
	struct hlist_head *head = &fib_info_devhash[hash];
	struct hlist_node *node;
	struct fib_nh *nh;
L
Linus Torvalds 已提交
1040

D
Denis V. Lunev 已提交
1041 1042
	if (force)
		scope = -1;
L
Linus Torvalds 已提交
1043

D
Denis V. Lunev 已提交
1044 1045 1046
	hlist_for_each_entry(nh, node, head, nh_hash) {
		struct fib_info *fi = nh->nh_parent;
		int dead;
L
Linus Torvalds 已提交
1047

D
Denis V. Lunev 已提交
1048 1049 1050 1051 1052 1053
		BUG_ON(!fi->fib_nhs);
		if (nh->nh_dev != dev || fi == prev_fi)
			continue;
		prev_fi = fi;
		dead = 0;
		change_nexthops(fi) {
E
Eric Dumazet 已提交
1054
			if (nexthop_nh->nh_flags & RTNH_F_DEAD)
D
Denis V. Lunev 已提交
1055
				dead++;
1056 1057 1058
			else if (nexthop_nh->nh_dev == dev &&
				 nexthop_nh->nh_scope != scope) {
				nexthop_nh->nh_flags |= RTNH_F_DEAD;
L
Linus Torvalds 已提交
1059
#ifdef CONFIG_IP_ROUTE_MULTIPATH
D
Denis V. Lunev 已提交
1060
				spin_lock_bh(&fib_multipath_lock);
1061 1062
				fi->fib_power -= nexthop_nh->nh_power;
				nexthop_nh->nh_power = 0;
D
Denis V. Lunev 已提交
1063
				spin_unlock_bh(&fib_multipath_lock);
L
Linus Torvalds 已提交
1064
#endif
D
Denis V. Lunev 已提交
1065 1066
				dead++;
			}
L
Linus Torvalds 已提交
1067
#ifdef CONFIG_IP_ROUTE_MULTIPATH
1068
			if (force > 1 && nexthop_nh->nh_dev == dev) {
D
Denis V. Lunev 已提交
1069 1070
				dead = fi->fib_nhs;
				break;
L
Linus Torvalds 已提交
1071
			}
D
Denis V. Lunev 已提交
1072 1073 1074 1075 1076
#endif
		} endfor_nexthops(fi)
		if (dead == fi->fib_nhs) {
			fi->fib_flags |= RTNH_F_DEAD;
			ret++;
L
Linus Torvalds 已提交
1077 1078 1079 1080 1081 1082
		}
	}

	return ret;
}

1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135
/*
 * Pick a default route among the aliases on res->fa_head and record the
 * choice in "res" and in the table's tb_default index.
 *
 * Only aliases with the scope of "res", type RTN_UNICAST, and a first
 * nexthop that has a gateway with link scope are considered. The walk
 * stops at the first alias whose fib_priority value exceeds that of the
 * current result, or if the first candidate is not the current result.
 *
 * A candidate is taken as soon as fib_detect_death() returns 0 for it;
 * otherwise the last_resort entry reported by fib_detect_death() is used
 * when its index is valid.
 *
 * Must be invoked inside of an RCU protected region.
 */
void fib_select_default(struct fib_result *res)
{
	struct list_head *alias_list = res->fa_head;
	struct fib_table *table = res->table;
	struct fib_info *candidate = NULL;
	struct fib_info *fallback = NULL;
	struct fib_alias *alias;
	int fallback_idx = -1;
	int idx = -1;

	list_for_each_entry_rcu(alias, alias_list, fa_list) {
		struct fib_info *next_fi = alias->fa_info;

		if (alias->fa_scope != res->scope)
			continue;
		if (alias->fa_type != RTN_UNICAST)
			continue;

		if (next_fi->fib_priority > res->fi->fib_priority)
			break;

		if (!next_fi->fib_nh[0].nh_gw)
			continue;
		if (next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
			continue;

		fib_alias_accessed(alias);

		if (candidate == NULL) {
			/* The first candidate has to be the current result. */
			if (next_fi != res->fi)
				break;
		} else if (!fib_detect_death(candidate, idx, &fallback,
					     &fallback_idx,
					     table->tb_default)) {
			fib_result_assign(res, candidate);
			table->tb_default = idx;
			return;
		}

		candidate = next_fi;
		idx++;
	}

	/* Fewer than two candidates seen: nothing to choose between. */
	if (idx <= 0 || candidate == NULL) {
		table->tb_default = -1;
		return;
	}

	/* The last candidate has not been checked inside the loop. */
	if (!fib_detect_death(candidate, idx, &fallback, &fallback_idx,
			      table->tb_default)) {
		fib_result_assign(res, candidate);
		table->tb_default = idx;
		return;
	}

	if (fallback_idx >= 0)
		fib_result_assign(res, fallback);
	table->tb_default = fallback_idx;
}

L
Linus Torvalds 已提交
1139 1140 1141
#ifdef CONFIG_IP_ROUTE_MULTIPATH

/*
E
Eric Dumazet 已提交
1142 1143
 * Dead device goes up. We wake up dead nexthops.
 * It takes sense only on multipath routes.
L
Linus Torvalds 已提交
1144 1145 1146 1147 1148 1149 1150 1151 1152 1153
 */
int fib_sync_up(struct net_device *dev)
{
	struct fib_info *prev_fi;
	unsigned int hash;
	struct hlist_head *head;
	struct hlist_node *node;
	struct fib_nh *nh;
	int ret;

E
Eric Dumazet 已提交
1154
	if (!(dev->flags & IFF_UP))
L
Linus Torvalds 已提交
1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172
		return 0;

	prev_fi = NULL;
	hash = fib_devindex_hashfn(dev->ifindex);
	head = &fib_info_devhash[hash];
	ret = 0;

	hlist_for_each_entry(nh, node, head, nh_hash) {
		struct fib_info *fi = nh->nh_parent;
		int alive;

		BUG_ON(!fi->fib_nhs);
		if (nh->nh_dev != dev || fi == prev_fi)
			continue;

		prev_fi = fi;
		alive = 0;
		change_nexthops(fi) {
E
Eric Dumazet 已提交
1173
			if (!(nexthop_nh->nh_flags & RTNH_F_DEAD)) {
L
Linus Torvalds 已提交
1174 1175 1176
				alive++;
				continue;
			}
1177
			if (nexthop_nh->nh_dev == NULL ||
E
Eric Dumazet 已提交
1178
			    !(nexthop_nh->nh_dev->flags & IFF_UP))
L
Linus Torvalds 已提交
1179
				continue;
1180 1181
			if (nexthop_nh->nh_dev != dev ||
			    !__in_dev_get_rtnl(dev))
L
Linus Torvalds 已提交
1182 1183 1184
				continue;
			alive++;
			spin_lock_bh(&fib_multipath_lock);
1185 1186
			nexthop_nh->nh_power = 0;
			nexthop_nh->nh_flags &= ~RTNH_F_DEAD;
L
Linus Torvalds 已提交
1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199
			spin_unlock_bh(&fib_multipath_lock);
		} endfor_nexthops(fi)

		if (alive > 0) {
			fi->fib_flags &= ~RTNH_F_DEAD;
			ret++;
		}
	}

	return ret;
}

/*
E
Eric Dumazet 已提交
1200 1201
 * The algorithm is suboptimal, but it provides really
 * fair weighted route distribution.
L
Linus Torvalds 已提交
1202
 */
1203
void fib_select_multipath(struct fib_result *res)
L
Linus Torvalds 已提交
1204 1205 1206 1207 1208 1209 1210 1211
{
	struct fib_info *fi = res->fi;
	int w;

	spin_lock_bh(&fib_multipath_lock);
	if (fi->fib_power <= 0) {
		int power = 0;
		change_nexthops(fi) {
E
Eric Dumazet 已提交
1212
			if (!(nexthop_nh->nh_flags & RTNH_F_DEAD)) {
1213 1214
				power += nexthop_nh->nh_weight;
				nexthop_nh->nh_power = nexthop_nh->nh_weight;
L
Linus Torvalds 已提交
1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227
			}
		} endfor_nexthops(fi);
		fi->fib_power = power;
		if (power <= 0) {
			spin_unlock_bh(&fib_multipath_lock);
			/* Race condition: route has just become dead. */
			res->nh_sel = 0;
			return;
		}
	}


	/* w should be random number [0..fi->fib_power-1],
E
Eric Dumazet 已提交
1228
	 * it is pretty bad approximation.
L
Linus Torvalds 已提交
1229 1230 1231 1232 1233
	 */

	w = jiffies % fi->fib_power;

	change_nexthops(fi) {
E
Eric Dumazet 已提交
1234
		if (!(nexthop_nh->nh_flags & RTNH_F_DEAD) &&
1235
		    nexthop_nh->nh_power) {
E
Eric Dumazet 已提交
1236 1237
			w -= nexthop_nh->nh_power;
			if (w <= 0) {
1238
				nexthop_nh->nh_power--;
L
Linus Torvalds 已提交
1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251
				fi->fib_power--;
				res->nh_sel = nhsel;
				spin_unlock_bh(&fib_multipath_lock);
				return;
			}
		}
	} endfor_nexthops(fi);

	/* Race condition: route has just become dead. */
	res->nh_sel = 0;
	spin_unlock_bh(&fib_multipath_lock);
}
#endif