fib_semantics.c 30.0 KB
Newer Older
L
Linus Torvalds 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		IPv4 Forwarding Information Base: semantics.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */

#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/jiffies.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
29
#include <linux/inetdevice.h>
L
Linus Torvalds 已提交
30 31 32 33 34
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/proc_fs.h>
#include <linux/skbuff.h>
#include <linux/init.h>
35
#include <linux/slab.h>
L
Linus Torvalds 已提交
36

37
#include <net/arp.h>
L
Linus Torvalds 已提交
38 39 40 41 42 43
#include <net/ip.h>
#include <net/protocol.h>
#include <net/route.h>
#include <net/tcp.h>
#include <net/sock.h>
#include <net/ip_fib.h>
44
#include <net/netlink.h>
45
#include <net/nexthop.h>
L
Linus Torvalds 已提交
46 47 48

#include "fib_lookup.h"

49
/*
 * Bookkeeping for all fib_info records in this file.
 *
 * fib_info_lock is taken around every insertion into, removal from and
 * rehash of the tables below.
 */
static DEFINE_SPINLOCK(fib_info_lock);

/* fib_info records, bucketed by fib_info_hashfn() */
static struct hlist_head *fib_info_hash;

/* fib_info records that have a preferred source, bucketed by
 * fib_laddr_hashfn(fib_prefsrc)
 */
static struct hlist_head *fib_info_laddrhash;

/* Number of buckets in each of the two tables above */
static unsigned int fib_hash_size;

/* Incremented in fib_create_info(), decremented in free_fib_info() */
static unsigned int fib_info_cnt;

/* Nexthops, bucketed by fib_devindex_hashfn() of the device ifindex */
#define DEVINDEX_HASHBITS 8
#define DEVINDEX_HASHSIZE (1U << DEVINDEX_HASHBITS)
static struct hlist_head fib_info_devhash[DEVINDEX_HASHSIZE];

/*
 * Nexthop iteration helpers.
 *
 * for_nexthops(fi) and change_nexthops(fi) open a brace, declare the
 * cursor variables and start a for loop; the caller supplies the loop
 * body and must close the construct with endfor_nexthops(fi).
 *
 *   for_nexthops:    "nhsel" (index) and read-only "nh"
 *   change_nexthops: "nhsel" (index) and writable "nexthop_nh"
 */
#ifdef CONFIG_IP_ROUTE_MULTIPATH

static DEFINE_SPINLOCK(fib_multipath_lock);

#define for_nexthops(fi) {						\
	int nhsel;							\
	const struct fib_nh *nh;					\
	for (nhsel = 0, nh = (fi)->fib_nh;				\
	     nhsel < (fi)->fib_nhs;					\
	     nh++, nhsel++)

#define change_nexthops(fi) {						\
	int nhsel;							\
	struct fib_nh *nexthop_nh;					\
	for (nhsel = 0,	nexthop_nh = (struct fib_nh *)((fi)->fib_nh);	\
	     nhsel < (fi)->fib_nhs;					\
	     nexthop_nh++, nhsel++)

#else /* CONFIG_IP_ROUTE_MULTIPATH */

/*
 * Without multipath the loop body runs exactly once, on the first
 * nexthop.  The hope is that gcc optimizes the dummy loop away.
 */

#define for_nexthops(fi) {						\
	int nhsel;							\
	const struct fib_nh *nh = (fi)->fib_nh;				\
	for (nhsel = 0; nhsel < 1; nhsel++)

#define change_nexthops(fi) {						\
	int nhsel;							\
	struct fib_nh *nexthop_nh = (struct fib_nh *)((fi)->fib_nh);	\
	for (nhsel = 0; nhsel < 1; nhsel++)

#endif /* CONFIG_IP_ROUTE_MULTIPATH */

#define endfor_nexthops(fi) }


93
static const struct
L
Linus Torvalds 已提交
94 95 96
{
	int	error;
	u8	scope;
97
} fib_props[RTN_MAX + 1] = {
E
Eric Dumazet 已提交
98
	[RTN_UNSPEC] = {
L
Linus Torvalds 已提交
99 100
		.error	= 0,
		.scope	= RT_SCOPE_NOWHERE,
E
Eric Dumazet 已提交
101 102
	},
	[RTN_UNICAST] = {
L
Linus Torvalds 已提交
103 104
		.error	= 0,
		.scope	= RT_SCOPE_UNIVERSE,
E
Eric Dumazet 已提交
105 106
	},
	[RTN_LOCAL] = {
L
Linus Torvalds 已提交
107 108
		.error	= 0,
		.scope	= RT_SCOPE_HOST,
E
Eric Dumazet 已提交
109 110
	},
	[RTN_BROADCAST] = {
L
Linus Torvalds 已提交
111 112
		.error	= 0,
		.scope	= RT_SCOPE_LINK,
E
Eric Dumazet 已提交
113 114
	},
	[RTN_ANYCAST] = {
L
Linus Torvalds 已提交
115 116
		.error	= 0,
		.scope	= RT_SCOPE_LINK,
E
Eric Dumazet 已提交
117 118
	},
	[RTN_MULTICAST] = {
L
Linus Torvalds 已提交
119 120
		.error	= 0,
		.scope	= RT_SCOPE_UNIVERSE,
E
Eric Dumazet 已提交
121 122
	},
	[RTN_BLACKHOLE] = {
L
Linus Torvalds 已提交
123 124
		.error	= -EINVAL,
		.scope	= RT_SCOPE_UNIVERSE,
E
Eric Dumazet 已提交
125 126
	},
	[RTN_UNREACHABLE] = {
L
Linus Torvalds 已提交
127 128
		.error	= -EHOSTUNREACH,
		.scope	= RT_SCOPE_UNIVERSE,
E
Eric Dumazet 已提交
129 130
	},
	[RTN_PROHIBIT] = {
L
Linus Torvalds 已提交
131 132
		.error	= -EACCES,
		.scope	= RT_SCOPE_UNIVERSE,
E
Eric Dumazet 已提交
133 134
	},
	[RTN_THROW] = {
L
Linus Torvalds 已提交
135 136
		.error	= -EAGAIN,
		.scope	= RT_SCOPE_UNIVERSE,
E
Eric Dumazet 已提交
137 138
	},
	[RTN_NAT] = {
L
Linus Torvalds 已提交
139 140
		.error	= -EINVAL,
		.scope	= RT_SCOPE_NOWHERE,
E
Eric Dumazet 已提交
141 142
	},
	[RTN_XRESOLVE] = {
L
Linus Torvalds 已提交
143 144
		.error	= -EINVAL,
		.scope	= RT_SCOPE_NOWHERE,
E
Eric Dumazet 已提交
145
	},
L
Linus Torvalds 已提交
146 147 148 149 150
};


/* Release a nexthop info record */

E
Eric Dumazet 已提交
151 152 153 154
static void free_fib_info_rcu(struct rcu_head *head)
{
	struct fib_info *fi = container_of(head, struct fib_info, rcu);

155 156
	if (fi->fib_metrics != (u32 *) dst_default_metrics)
		kfree(fi->fib_metrics);
E
Eric Dumazet 已提交
157 158 159
	kfree(fi);
}

L
Linus Torvalds 已提交
160 161 162
void free_fib_info(struct fib_info *fi)
{
	if (fi->fib_dead == 0) {
E
Eric Dumazet 已提交
163
		pr_warning("Freeing alive fib_info %p\n", fi);
L
Linus Torvalds 已提交
164 165 166
		return;
	}
	change_nexthops(fi) {
167 168 169
		if (nexthop_nh->nh_dev)
			dev_put(nexthop_nh->nh_dev);
		nexthop_nh->nh_dev = NULL;
L
Linus Torvalds 已提交
170 171
	} endfor_nexthops(fi);
	fib_info_cnt--;
172
	release_net(fi->fib_net);
E
Eric Dumazet 已提交
173
	call_rcu(&fi->rcu, free_fib_info_rcu);
L
Linus Torvalds 已提交
174 175 176 177
}

void fib_release_info(struct fib_info *fi)
{
178
	spin_lock_bh(&fib_info_lock);
L
Linus Torvalds 已提交
179 180 181 182 183
	if (fi && --fi->fib_treeref == 0) {
		hlist_del(&fi->fib_hash);
		if (fi->fib_prefsrc)
			hlist_del(&fi->fib_lhash);
		change_nexthops(fi) {
184
			if (!nexthop_nh->nh_dev)
L
Linus Torvalds 已提交
185
				continue;
186
			hlist_del(&nexthop_nh->nh_hash);
L
Linus Torvalds 已提交
187 188 189 190
		} endfor_nexthops(fi)
		fi->fib_dead = 1;
		fib_info_put(fi);
	}
191
	spin_unlock_bh(&fib_info_lock);
L
Linus Torvalds 已提交
192 193
}

E
Eric Dumazet 已提交
194
static inline int nh_comp(const struct fib_info *fi, const struct fib_info *ofi)
L
Linus Torvalds 已提交
195 196 197 198 199 200 201 202 203 204
{
	const struct fib_nh *onh = ofi->fib_nh;

	for_nexthops(fi) {
		if (nh->nh_oif != onh->nh_oif ||
		    nh->nh_gw  != onh->nh_gw ||
		    nh->nh_scope != onh->nh_scope ||
#ifdef CONFIG_IP_ROUTE_MULTIPATH
		    nh->nh_weight != onh->nh_weight ||
#endif
205
#ifdef CONFIG_IP_ROUTE_CLASSID
L
Linus Torvalds 已提交
206 207
		    nh->nh_tclassid != onh->nh_tclassid ||
#endif
E
Eric Dumazet 已提交
208
		    ((nh->nh_flags ^ onh->nh_flags) & ~RTNH_F_DEAD))
L
Linus Torvalds 已提交
209 210 211 212 213 214
			return -1;
		onh++;
	} endfor_nexthops(fi);
	return 0;
}

215 216 217 218 219 220 221 222 223
/*
 * Fold @val (a device ifindex at the call sites in this file) into a
 * bucket index for fib_info_devhash[] by XOR-ing together its three
 * lowest DEVINDEX_HASHBITS-wide groups.
 */
static inline unsigned int fib_devindex_hashfn(unsigned int val)
{
	unsigned int hash = val;

	hash ^= val >> DEVINDEX_HASHBITS;
	hash ^= val >> (DEVINDEX_HASHBITS * 2);

	return hash & (DEVINDEX_HASHSIZE - 1);
}

L
Linus Torvalds 已提交
224 225 226 227 228 229
static inline unsigned int fib_info_hashfn(const struct fib_info *fi)
{
	unsigned int mask = (fib_hash_size - 1);
	unsigned int val = fi->fib_nhs;

	val ^= fi->fib_protocol;
A
Al Viro 已提交
230
	val ^= (__force u32)fi->fib_prefsrc;
L
Linus Torvalds 已提交
231
	val ^= fi->fib_priority;
232 233 234
	for_nexthops(fi) {
		val ^= fib_devindex_hashfn(nh->nh_oif);
	} endfor_nexthops(fi)
L
Linus Torvalds 已提交
235 236 237 238 239 240 241 242 243 244 245 246 247 248 249

	return (val ^ (val >> 7) ^ (val >> 12)) & mask;
}

/*
 * Look for an already registered fib_info equivalent to @nfi.
 *
 * Two records are equivalent when they belong to the same namespace and
 * agree on nexthop count, protocol, preferred source, priority, the
 * whole metrics array, all flags except RTNH_F_DEAD, and (when there are
 * nexthops) on every nexthop as judged by nh_comp().
 *
 * Returns the matching record, or NULL if there is none.
 */
static struct fib_info *fib_find_info(const struct fib_info *nfi)
{
	struct hlist_head *head;
	struct hlist_node *node;
	struct fib_info *fi;
	unsigned int hash;

	hash = fib_info_hashfn(nfi);
	head = &fib_info_hash[hash];

	hlist_for_each_entry(fi, node, head, fib_hash) {
		if (!net_eq(fi->fib_net, nfi->fib_net))
			continue;
		if (fi->fib_nhs != nfi->fib_nhs)
			continue;
		/*
		 * fib_metrics is a pointer to RTAX_MAX u32 values, so the
		 * array length must be spelled out: sizeof(fi->fib_metrics)
		 * would only cover the size of the pointer and compare just
		 * the first one or two metrics.
		 */
		if (nfi->fib_protocol == fi->fib_protocol &&
		    nfi->fib_prefsrc == fi->fib_prefsrc &&
		    nfi->fib_priority == fi->fib_priority &&
		    memcmp(nfi->fib_metrics, fi->fib_metrics,
			   sizeof(u32) * RTAX_MAX) == 0 &&
		    ((nfi->fib_flags ^ fi->fib_flags) & ~RTNH_F_DEAD) == 0 &&
		    (nfi->fib_nhs == 0 || nh_comp(fi, nfi) == 0))
			return fi;
	}

	return NULL;
}

/* Check, that the gateway is already configured.
E
Eric Dumazet 已提交
268
 * Used only by redirect accept routine.
L
Linus Torvalds 已提交
269
 */
270
/*
 * Search the device hash bucket of @dev for a nexthop that uses @dev,
 * has gateway @gw and is not flagged RTNH_F_DEAD.
 *
 * Returns 0 if such a nexthop exists, -1 otherwise.  The search runs
 * under fib_info_lock.
 */
int ip_fib_check_default(__be32 gw, struct net_device *dev)
{
	struct hlist_head *bucket;
	struct hlist_node *pos;
	struct fib_nh *nh;
	int found = -1;

	spin_lock(&fib_info_lock);

	bucket = &fib_info_devhash[fib_devindex_hashfn(dev->ifindex)];
	hlist_for_each_entry(nh, pos, bucket, nh_hash) {
		if (nh->nh_dev != dev || nh->nh_gw != gw)
			continue;
		if (nh->nh_flags & RTNH_F_DEAD)
			continue;
		found = 0;
		break;
	}

	spin_unlock(&fib_info_lock);

	return found;
}

295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321
/*
 * Upper bound on the netlink payload needed to describe a route that
 * uses @fi; rtmsg_fib() uses it to size the skb handed to
 * fib_dump_info().
 */
static inline size_t fib_nlmsg_size(struct fib_info *fi)
{
	size_t payload;
	size_t per_nh;

	/* fixed header plus the four 32-bit top-level attributes */
	payload = NLMSG_ALIGN(sizeof(struct rtmsg));
	payload += nla_total_size(4);	/* RTA_TABLE */
	payload += nla_total_size(4);	/* RTA_DST */
	payload += nla_total_size(4);	/* RTA_PRIORITY */
	payload += nla_total_size(4);	/* RTA_PREFSRC */

	/* space for nested metrics */
	payload += nla_total_size((RTAX_MAX * nla_total_size(4)));

	if (!fi->fib_nhs)
		return payload;

	/*
	 * Each nexthop is packed in an attribute and may carry a flow and
	 * a gateway attribute; all nexthops sit in one nested attribute.
	 * The same estimate is used for the special case fib_nhs == 1.
	 */
	per_nh = nla_total_size(sizeof(struct rtnexthop));
	per_nh += 2 * nla_total_size(4);
	payload += nla_total_size(fi->fib_nhs * per_nh);

	return payload;
}

A
Al Viro 已提交
322
/*
 * Build a route message for @fa with fib_dump_info() and hand it to
 * rtnl_notify() for group RTNLGRP_IPV4_ROUTE.
 *
 * If the skb cannot be allocated (-ENOBUFS) or the message cannot be
 * built, the error is reported through rtnl_set_sk_err() instead.
 */
void rtmsg_fib(int event, __be32 key, struct fib_alias *fa,
	       int dst_len, u32 tb_id, struct nl_info *info,
	       unsigned int nlm_flags)
{
	u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
	struct sk_buff *skb;
	int err;

	skb = nlmsg_new(fib_nlmsg_size(fa->fa_info), GFP_KERNEL);
	if (!skb) {
		rtnl_set_sk_err(info->nl_net, RTNLGRP_IPV4_ROUTE, -ENOBUFS);
		return;
	}

	err = fib_dump_info(skb, info->pid, seq, event, tb_id,
			    fa->fa_type, fa->fa_scope, key, dst_len,
			    fa->fa_tos, fa->fa_info, nlm_flags);
	if (err < 0) {
		/* -EMSGSIZE implies BUG in fib_nlmsg_size() */
		WARN_ON(err == -EMSGSIZE);
		kfree_skb(skb);
		rtnl_set_sk_err(info->nl_net, RTNLGRP_IPV4_ROUTE, err);
		return;
	}

	rtnl_notify(skb, info->nl_net, info->pid, RTNLGRP_IPV4_ROUTE,
		    info->nlh, GFP_KERNEL);
}

/* Walk the alias list @fah and return the first entry whose TOS is not
 * greater than @tos and which either has a TOS below @tos or a
 * fib_priority of at least @prio.  Returns NULL when @fah is NULL or no
 * entry qualifies.
 */
struct fib_alias *fib_find_alias(struct list_head *fah, u8 tos, u32 prio)
{
	struct fib_alias *fa;

	if (!fah)
		return NULL;

	list_for_each_entry(fa, fah, fa_list) {
		if (fa->fa_tos > tos)
			continue;
		if (fa->fa_info->fib_priority >= prio || fa->fa_tos < tos)
			return fa;
	}

	return NULL;
}

int fib_detect_death(struct fib_info *fi, int order,
370
		     struct fib_info **last_resort, int *last_idx, int dflt)
L
Linus Torvalds 已提交
371 372 373 374 375 376 377 378 379
{
	struct neighbour *n;
	int state = NUD_NONE;

	n = neigh_lookup(&arp_tbl, &fi->fib_nh[0].nh_gw, fi->fib_dev);
	if (n) {
		state = n->nud_state;
		neigh_release(n);
	}
380
	if (state == NUD_REACHABLE)
L
Linus Torvalds 已提交
381
		return 0;
E
Eric Dumazet 已提交
382
	if ((state & NUD_VALID) && order != dflt)
L
Linus Torvalds 已提交
383
		return 0;
E
Eric Dumazet 已提交
384 385
	if ((state & NUD_VALID) ||
	    (*last_idx < 0 && order > dflt)) {
L
Linus Torvalds 已提交
386 387 388 389 390 391 392 393
		*last_resort = fi;
		*last_idx = order;
	}
	return 1;
}

#ifdef CONFIG_IP_ROUTE_MULTIPATH

394
/*
 * Count the rtnexthop entries in a buffer of @remaining bytes starting
 * at @rtnh.  Returns 0 if bytes are left over after the last well-formed
 * entry, which marks the configuration as invalid.
 */
static int fib_count_nexthops(struct rtnexthop *rtnh, int remaining)
{
	int count;

	for (count = 0; rtnh_ok(rtnh, remaining); count++)
		rtnh = rtnh_next(rtnh, &remaining);

	/* leftover implies invalid nexthop configuration, discard it */
	if (remaining > 0)
		return 0;

	return count;
}

407 408
static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh,
		       int remaining, struct fib_config *cfg)
L
Linus Torvalds 已提交
409 410
{
	change_nexthops(fi) {
411 412 413
		int attrlen;

		if (!rtnh_ok(rtnh, remaining))
L
Linus Torvalds 已提交
414
			return -EINVAL;
415

416 417 418 419
		nexthop_nh->nh_flags =
			(cfg->fc_flags & ~0xFF) | rtnh->rtnh_flags;
		nexthop_nh->nh_oif = rtnh->rtnh_ifindex;
		nexthop_nh->nh_weight = rtnh->rtnh_hops + 1;
420 421 422 423 424 425

		attrlen = rtnh_attrlen(rtnh);
		if (attrlen > 0) {
			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);

			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
426
			nexthop_nh->nh_gw = nla ? nla_get_be32(nla) : 0;
427
#ifdef CONFIG_IP_ROUTE_CLASSID
428
			nla = nla_find(attrs, attrlen, RTA_FLOW);
429
			nexthop_nh->nh_tclassid = nla ? nla_get_u32(nla) : 0;
L
Linus Torvalds 已提交
430 431
#endif
		}
432 433

		rtnh = rtnh_next(rtnh, &remaining);
L
Linus Torvalds 已提交
434
	} endfor_nexthops(fi);
435

L
Linus Torvalds 已提交
436 437 438 439 440
	return 0;
}

#endif

441
int fib_nh_match(struct fib_config *cfg, struct fib_info *fi)
L
Linus Torvalds 已提交
442 443
{
#ifdef CONFIG_IP_ROUTE_MULTIPATH
444 445
	struct rtnexthop *rtnh;
	int remaining;
L
Linus Torvalds 已提交
446 447
#endif

448
	if (cfg->fc_priority && cfg->fc_priority != fi->fib_priority)
L
Linus Torvalds 已提交
449 450
		return 1;

451 452 453
	if (cfg->fc_oif || cfg->fc_gw) {
		if ((!cfg->fc_oif || cfg->fc_oif == fi->fib_nh->nh_oif) &&
		    (!cfg->fc_gw  || cfg->fc_gw == fi->fib_nh->nh_gw))
L
Linus Torvalds 已提交
454 455 456 457 458
			return 0;
		return 1;
	}

#ifdef CONFIG_IP_ROUTE_MULTIPATH
459
	if (cfg->fc_mp == NULL)
L
Linus Torvalds 已提交
460
		return 0;
461 462 463

	rtnh = cfg->fc_mp;
	remaining = cfg->fc_mp_len;
464

L
Linus Torvalds 已提交
465
	for_nexthops(fi) {
466
		int attrlen;
L
Linus Torvalds 已提交
467

468
		if (!rtnh_ok(rtnh, remaining))
L
Linus Torvalds 已提交
469
			return -EINVAL;
470 471

		if (rtnh->rtnh_ifindex && rtnh->rtnh_ifindex != nh->nh_oif)
L
Linus Torvalds 已提交
472
			return 1;
473 474 475 476 477 478

		attrlen = rtnh_attrlen(rtnh);
		if (attrlen < 0) {
			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);

			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
479
			if (nla && nla_get_be32(nla) != nh->nh_gw)
L
Linus Torvalds 已提交
480
				return 1;
481
#ifdef CONFIG_IP_ROUTE_CLASSID
482 483
			nla = nla_find(attrs, attrlen, RTA_FLOW);
			if (nla && nla_get_u32(nla) != nh->nh_tclassid)
L
Linus Torvalds 已提交
484 485 486
				return 1;
#endif
		}
487 488

		rtnh = rtnh_next(rtnh, &remaining);
L
Linus Torvalds 已提交
489 490 491 492 493 494 495
	} endfor_nexthops(fi);
#endif
	return 0;
}


/*
E
Eric Dumazet 已提交
496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536
 * Picture
 * -------
 *
 * Semantics of nexthop is very messy by historical reasons.
 * We have to take into account, that:
 * a) gateway can be actually local interface address,
 *    so that gatewayed route is direct.
 * b) gateway must be on-link address, possibly
 *    described not by an ifaddr, but also by a direct route.
 * c) If both gateway and interface are specified, they should not
 *    contradict.
 * d) If we use tunnel routes, gateway could be not on-link.
 *
 * Attempt to reconcile all of these (alas, self-contradictory) conditions
 * results in pretty ugly and hairy code with obscure logic.
 *
 * I chose to generalize it instead, so that the size
 * of code does not increase practically, but it becomes
 * much more general.
 * Every prefix is assigned a "scope" value: "host" is local address,
 * "link" is direct route,
 * [ ... "site" ... "interior" ... ]
 * and "universe" is true gateway route with global meaning.
 *
 * Every prefix refers to a set of "nexthop"s (gw, oif),
 * where gw must have narrower scope. This recursion stops
 * when gw has LOCAL scope or if "nexthop" is declared ONLINK,
 * which means that gw is forced to be on link.
 *
 * Code is still hairy, but now it is apparently logically
 * consistent and very flexible. F.e. as by-product it allows
 * to co-exists in peace independent exterior and interior
 * routing processes.
 *
 * Normally it looks as following.
 *
 * {universe prefix}  -> (gw, oif) [scope link]
 *		  |
 *		  |-> {link prefix} -> (gw, oif) [scope local]
 *					|
 *					|-> {local prefix} (terminal node)
L
Linus Torvalds 已提交
537
 */
538 539
static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi,
			struct fib_nh *nh)
L
Linus Torvalds 已提交
540 541
{
	int err;
542
	struct net *net;
E
Eric Dumazet 已提交
543
	struct net_device *dev;
L
Linus Torvalds 已提交
544

545
	net = cfg->fc_nlinfo.nl_net;
L
Linus Torvalds 已提交
546 547 548
	if (nh->nh_gw) {
		struct fib_result res;

E
Eric Dumazet 已提交
549
		if (nh->nh_flags & RTNH_F_ONLINK) {
L
Linus Torvalds 已提交
550

551
			if (cfg->fc_scope >= RT_SCOPE_LINK)
L
Linus Torvalds 已提交
552
				return -EINVAL;
553
			if (inet_addr_type(net, nh->nh_gw) != RTN_UNICAST)
L
Linus Torvalds 已提交
554
				return -EINVAL;
E
Eric Dumazet 已提交
555 556
			dev = __dev_get_by_index(net, nh->nh_oif);
			if (!dev)
L
Linus Torvalds 已提交
557
				return -ENODEV;
E
Eric Dumazet 已提交
558
			if (!(dev->flags & IFF_UP))
L
Linus Torvalds 已提交
559 560 561 562 563 564
				return -ENETDOWN;
			nh->nh_dev = dev;
			dev_hold(dev);
			nh->nh_scope = RT_SCOPE_LINK;
			return 0;
		}
E
Eric Dumazet 已提交
565
		rcu_read_lock();
L
Linus Torvalds 已提交
566
		{
567
			struct flowi fl = {
568 569
				.fl4_dst = nh->nh_gw,
				.fl4_scope = cfg->fc_scope + 1,
570 571
				.oif = nh->nh_oif,
			};
L
Linus Torvalds 已提交
572 573 574 575

			/* It is not necessary, but requires a bit of thinking */
			if (fl.fl4_scope < RT_SCOPE_LINK)
				fl.fl4_scope = RT_SCOPE_LINK;
E
Eric Dumazet 已提交
576
			err = fib_lookup(net, &fl, &res);
E
Eric Dumazet 已提交
577 578
			if (err) {
				rcu_read_unlock();
L
Linus Torvalds 已提交
579
				return err;
E
Eric Dumazet 已提交
580
			}
L
Linus Torvalds 已提交
581 582 583 584 585 586
		}
		err = -EINVAL;
		if (res.type != RTN_UNICAST && res.type != RTN_LOCAL)
			goto out;
		nh->nh_scope = res.scope;
		nh->nh_oif = FIB_RES_OIF(res);
E
Eric Dumazet 已提交
587 588
		nh->nh_dev = dev = FIB_RES_DEV(res);
		if (!dev)
L
Linus Torvalds 已提交
589
			goto out;
E
Eric Dumazet 已提交
590
		dev_hold(dev);
591
		err = (dev->flags & IFF_UP) ? 0 : -ENETDOWN;
L
Linus Torvalds 已提交
592 593 594
	} else {
		struct in_device *in_dev;

E
Eric Dumazet 已提交
595
		if (nh->nh_flags & (RTNH_F_PERVASIVE | RTNH_F_ONLINK))
L
Linus Torvalds 已提交
596 597
			return -EINVAL;

598 599
		rcu_read_lock();
		err = -ENODEV;
600
		in_dev = inetdev_by_index(net, nh->nh_oif);
L
Linus Torvalds 已提交
601
		if (in_dev == NULL)
602 603 604 605
			goto out;
		err = -ENETDOWN;
		if (!(in_dev->dev->flags & IFF_UP))
			goto out;
L
Linus Torvalds 已提交
606 607 608
		nh->nh_dev = in_dev->dev;
		dev_hold(nh->nh_dev);
		nh->nh_scope = RT_SCOPE_HOST;
609
		err = 0;
L
Linus Torvalds 已提交
610
	}
611 612 613
out:
	rcu_read_unlock();
	return err;
L
Linus Torvalds 已提交
614 615
}

A
Al Viro 已提交
616
/*
 * Bucket index for fib_info_laddrhash[]: XOR the raw address bits with
 * themselves shifted right by 7 and 14, masked with fib_hash_size - 1.
 */
static inline unsigned int fib_laddr_hashfn(__be32 val)
{
	u32 addr = (__force u32)val;
	u32 mixed = addr ^ (addr >> 7) ^ (addr >> 14);

	return mixed & (fib_hash_size - 1);
}

static struct hlist_head *fib_hash_alloc(int bytes)
{
	if (bytes <= PAGE_SIZE)
628
		return kzalloc(bytes, GFP_KERNEL);
L
Linus Torvalds 已提交
629 630
	else
		return (struct hlist_head *)
E
Eric Dumazet 已提交
631 632
			__get_free_pages(GFP_KERNEL | __GFP_ZERO,
					 get_order(bytes));
L
Linus Torvalds 已提交
633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649
}

/*
 * Release a table obtained from fib_hash_alloc(@bytes).  A NULL @hash
 * is ignored.
 */
static void fib_hash_free(struct hlist_head *hash, int bytes)
{
	if (!hash)
		return;

	if (bytes > PAGE_SIZE) {
		free_pages((unsigned long) hash, get_order(bytes));
		return;
	}

	kfree(hash);
}

static void fib_hash_move(struct hlist_head *new_info_hash,
			  struct hlist_head *new_laddrhash,
			  unsigned int new_size)
{
650
	struct hlist_head *old_info_hash, *old_laddrhash;
L
Linus Torvalds 已提交
651
	unsigned int old_size = fib_hash_size;
652
	unsigned int i, bytes;
L
Linus Torvalds 已提交
653

654
	spin_lock_bh(&fib_info_lock);
655 656
	old_info_hash = fib_info_hash;
	old_laddrhash = fib_info_laddrhash;
L
Linus Torvalds 已提交
657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694
	fib_hash_size = new_size;

	for (i = 0; i < old_size; i++) {
		struct hlist_head *head = &fib_info_hash[i];
		struct hlist_node *node, *n;
		struct fib_info *fi;

		hlist_for_each_entry_safe(fi, node, n, head, fib_hash) {
			struct hlist_head *dest;
			unsigned int new_hash;

			hlist_del(&fi->fib_hash);

			new_hash = fib_info_hashfn(fi);
			dest = &new_info_hash[new_hash];
			hlist_add_head(&fi->fib_hash, dest);
		}
	}
	fib_info_hash = new_info_hash;

	for (i = 0; i < old_size; i++) {
		struct hlist_head *lhead = &fib_info_laddrhash[i];
		struct hlist_node *node, *n;
		struct fib_info *fi;

		hlist_for_each_entry_safe(fi, node, n, lhead, fib_lhash) {
			struct hlist_head *ldest;
			unsigned int new_hash;

			hlist_del(&fi->fib_lhash);

			new_hash = fib_laddr_hashfn(fi->fib_prefsrc);
			ldest = &new_laddrhash[new_hash];
			hlist_add_head(&fi->fib_lhash, ldest);
		}
	}
	fib_info_laddrhash = new_laddrhash;

695
	spin_unlock_bh(&fib_info_lock);
696 697 698 699

	bytes = old_size * sizeof(struct hlist_head *);
	fib_hash_free(old_info_hash, bytes);
	fib_hash_free(old_laddrhash, bytes);
L
Linus Torvalds 已提交
700 701
}

702
struct fib_info *fib_create_info(struct fib_config *cfg)
L
Linus Torvalds 已提交
703 704 705 706 707
{
	int err;
	struct fib_info *fi = NULL;
	struct fib_info *ofi;
	int nhs = 1;
708
	struct net *net = cfg->fc_nlinfo.nl_net;
L
Linus Torvalds 已提交
709 710

	/* Fast check to catch the most weird cases */
711
	if (fib_props[cfg->fc_type].scope > cfg->fc_scope)
L
Linus Torvalds 已提交
712 713 714
		goto err_inval;

#ifdef CONFIG_IP_ROUTE_MULTIPATH
715 716
	if (cfg->fc_mp) {
		nhs = fib_count_nexthops(cfg->fc_mp, cfg->fc_mp_len);
L
Linus Torvalds 已提交
717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736
		if (nhs == 0)
			goto err_inval;
	}
#endif

	err = -ENOBUFS;
	if (fib_info_cnt >= fib_hash_size) {
		unsigned int new_size = fib_hash_size << 1;
		struct hlist_head *new_info_hash;
		struct hlist_head *new_laddrhash;
		unsigned int bytes;

		if (!new_size)
			new_size = 1;
		bytes = new_size * sizeof(struct hlist_head *);
		new_info_hash = fib_hash_alloc(bytes);
		new_laddrhash = fib_hash_alloc(bytes);
		if (!new_info_hash || !new_laddrhash) {
			fib_hash_free(new_info_hash, bytes);
			fib_hash_free(new_laddrhash, bytes);
737
		} else
L
Linus Torvalds 已提交
738 739 740 741 742 743
			fib_hash_move(new_info_hash, new_laddrhash, new_size);

		if (!fib_hash_size)
			goto failure;
	}

744
	fi = kzalloc(sizeof(*fi)+nhs*sizeof(struct fib_nh), GFP_KERNEL);
L
Linus Torvalds 已提交
745 746
	if (fi == NULL)
		goto failure;
747 748 749 750 751 752
	if (cfg->fc_mx) {
		fi->fib_metrics = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
		if (!fi->fib_metrics)
			goto failure;
	} else
		fi->fib_metrics = (u32 *) dst_default_metrics;
L
Linus Torvalds 已提交
753 754
	fib_info_cnt++;

755
	fi->fib_net = hold_net(net);
756 757 758 759
	fi->fib_protocol = cfg->fc_protocol;
	fi->fib_flags = cfg->fc_flags;
	fi->fib_priority = cfg->fc_priority;
	fi->fib_prefsrc = cfg->fc_prefsrc;
L
Linus Torvalds 已提交
760 761 762

	fi->fib_nhs = nhs;
	change_nexthops(fi) {
763
		nexthop_nh->nh_parent = fi;
L
Linus Torvalds 已提交
764 765
	} endfor_nexthops(fi)

766 767 768 769 770
	if (cfg->fc_mx) {
		struct nlattr *nla;
		int remaining;

		nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
771
			int type = nla_type(nla);
772 773 774

			if (type) {
				if (type > RTAX_MAX)
L
Linus Torvalds 已提交
775
					goto err_inval;
776
				fi->fib_metrics[type - 1] = nla_get_u32(nla);
L
Linus Torvalds 已提交
777 778 779 780
			}
		}
	}

781
	if (cfg->fc_mp) {
L
Linus Torvalds 已提交
782
#ifdef CONFIG_IP_ROUTE_MULTIPATH
783 784
		err = fib_get_nhs(fi, cfg->fc_mp, cfg->fc_mp_len, cfg);
		if (err != 0)
L
Linus Torvalds 已提交
785
			goto failure;
786
		if (cfg->fc_oif && fi->fib_nh->nh_oif != cfg->fc_oif)
L
Linus Torvalds 已提交
787
			goto err_inval;
788
		if (cfg->fc_gw && fi->fib_nh->nh_gw != cfg->fc_gw)
L
Linus Torvalds 已提交
789
			goto err_inval;
790
#ifdef CONFIG_IP_ROUTE_CLASSID
791
		if (cfg->fc_flow && fi->fib_nh->nh_tclassid != cfg->fc_flow)
L
Linus Torvalds 已提交
792 793 794 795 796 797 798
			goto err_inval;
#endif
#else
		goto err_inval;
#endif
	} else {
		struct fib_nh *nh = fi->fib_nh;
799 800 801 802

		nh->nh_oif = cfg->fc_oif;
		nh->nh_gw = cfg->fc_gw;
		nh->nh_flags = cfg->fc_flags;
803
#ifdef CONFIG_IP_ROUTE_CLASSID
804
		nh->nh_tclassid = cfg->fc_flow;
L
Linus Torvalds 已提交
805 806 807 808 809 810
#endif
#ifdef CONFIG_IP_ROUTE_MULTIPATH
		nh->nh_weight = 1;
#endif
	}

811 812
	if (fib_props[cfg->fc_type].error) {
		if (cfg->fc_gw || cfg->fc_oif || cfg->fc_mp)
L
Linus Torvalds 已提交
813 814 815 816
			goto err_inval;
		goto link_it;
	}

817
	if (cfg->fc_scope > RT_SCOPE_HOST)
L
Linus Torvalds 已提交
818 819
		goto err_inval;

820
	if (cfg->fc_scope == RT_SCOPE_HOST) {
L
Linus Torvalds 已提交
821 822 823 824 825 826
		struct fib_nh *nh = fi->fib_nh;

		/* Local address is added. */
		if (nhs != 1 || nh->nh_gw)
			goto err_inval;
		nh->nh_scope = RT_SCOPE_NOWHERE;
827
		nh->nh_dev = dev_get_by_index(net, fi->fib_nh->nh_oif);
L
Linus Torvalds 已提交
828 829 830 831 832
		err = -ENODEV;
		if (nh->nh_dev == NULL)
			goto failure;
	} else {
		change_nexthops(fi) {
E
Eric Dumazet 已提交
833 834
			err = fib_check_nh(cfg, fi, nexthop_nh);
			if (err != 0)
L
Linus Torvalds 已提交
835 836 837 838 839
				goto failure;
		} endfor_nexthops(fi)
	}

	if (fi->fib_prefsrc) {
840 841
		if (cfg->fc_type != RTN_LOCAL || !cfg->fc_dst ||
		    fi->fib_prefsrc != cfg->fc_dst)
842
			if (inet_addr_type(net, fi->fib_prefsrc) != RTN_LOCAL)
L
Linus Torvalds 已提交
843 844 845 846
				goto err_inval;
	}

link_it:
E
Eric Dumazet 已提交
847 848
	ofi = fib_find_info(fi);
	if (ofi) {
L
Linus Torvalds 已提交
849 850 851 852 853 854 855 856
		fi->fib_dead = 1;
		free_fib_info(fi);
		ofi->fib_treeref++;
		return ofi;
	}

	fi->fib_treeref++;
	atomic_inc(&fi->fib_clntref);
857
	spin_lock_bh(&fib_info_lock);
L
Linus Torvalds 已提交
858 859 860 861 862 863 864 865 866 867 868 869
	hlist_add_head(&fi->fib_hash,
		       &fib_info_hash[fib_info_hashfn(fi)]);
	if (fi->fib_prefsrc) {
		struct hlist_head *head;

		head = &fib_info_laddrhash[fib_laddr_hashfn(fi->fib_prefsrc)];
		hlist_add_head(&fi->fib_lhash, head);
	}
	change_nexthops(fi) {
		struct hlist_head *head;
		unsigned int hash;

870
		if (!nexthop_nh->nh_dev)
L
Linus Torvalds 已提交
871
			continue;
872
		hash = fib_devindex_hashfn(nexthop_nh->nh_dev->ifindex);
L
Linus Torvalds 已提交
873
		head = &fib_info_devhash[hash];
874
		hlist_add_head(&nexthop_nh->nh_hash, head);
L
Linus Torvalds 已提交
875
	} endfor_nexthops(fi)
876
	spin_unlock_bh(&fib_info_lock);
L
Linus Torvalds 已提交
877 878 879 880 881 882
	return fi;

err_inval:
	err = -EINVAL;

failure:
883
	if (fi) {
L
Linus Torvalds 已提交
884 885 886
		fi->fib_dead = 1;
		free_fib_info(fi);
	}
887 888

	return ERR_PTR(err);
L
Linus Torvalds 已提交
889 890
}

891
/* Note! fib_semantic_match intentionally uses  RCU list functions. */
892 893 894
int fib_semantic_match(struct fib_table *tb, struct list_head *head,
		       const struct flowi *flp, struct fib_result *res,
		       int prefixlen, int fib_flags)
L
Linus Torvalds 已提交
895 896 897 898
{
	struct fib_alias *fa;
	int nh_sel = 0;

899
	list_for_each_entry_rcu(fa, head, fa_list) {
L
Linus Torvalds 已提交
900 901 902 903 904 905 906 907 908
		int err;

		if (fa->fa_tos &&
		    fa->fa_tos != flp->fl4_tos)
			continue;

		if (fa->fa_scope < flp->fl4_scope)
			continue;

909
		fib_alias_accessed(fa);
L
Linus Torvalds 已提交
910 911 912 913 914 915 916 917 918 919 920 921 922 923 924

		err = fib_props[fa->fa_type].error;
		if (err == 0) {
			struct fib_info *fi = fa->fa_info;

			if (fi->fib_flags & RTNH_F_DEAD)
				continue;

			switch (fa->fa_type) {
			case RTN_UNICAST:
			case RTN_LOCAL:
			case RTN_BROADCAST:
			case RTN_ANYCAST:
			case RTN_MULTICAST:
				for_nexthops(fi) {
E
Eric Dumazet 已提交
925
					if (nh->nh_flags & RTNH_F_DEAD)
L
Linus Torvalds 已提交
926 927 928 929 930 931 932 933 934 935
						continue;
					if (!flp->oif || flp->oif == nh->nh_oif)
						break;
				}
#ifdef CONFIG_IP_ROUTE_MULTIPATH
				if (nhsel < fi->fib_nhs) {
					nh_sel = nhsel;
					goto out_fill_res;
				}
#else
E
Eric Dumazet 已提交
936
				if (nhsel < 1)
L
Linus Torvalds 已提交
937 938 939 940 941 942
					goto out_fill_res;
#endif
				endfor_nexthops(fi);
				continue;

			default:
E
Eric Dumazet 已提交
943 944
				pr_warning("fib_semantic_match bad type %#x\n",
					   fa->fa_type);
L
Linus Torvalds 已提交
945
				return -EINVAL;
946
			}
L
Linus Torvalds 已提交
947 948 949 950 951 952 953 954 955 956 957
		}
		return err;
	}
	return 1;

out_fill_res:
	res->prefixlen = prefixlen;
	res->nh_sel = nh_sel;
	res->type = fa->fa_type;
	res->scope = fa->fa_scope;
	res->fi = fa->fa_info;
958 959
	res->table = tb;
	res->fa_head = head;
E
Eric Dumazet 已提交
960 961
	if (!(fib_flags & FIB_LOOKUP_NOREF))
		atomic_inc(&res->fi->fib_clntref);
L
Linus Torvalds 已提交
962 963 964 965 966
	return 0;
}

/* Find appropriate source address to this destination */

A
Al Viro 已提交
967
__be32 __fib_res_prefsrc(struct fib_result *res)
L
Linus Torvalds 已提交
968 969 970 971
{
	return inet_select_addr(FIB_RES_DEV(*res), FIB_RES_GW(*res), res->scope);
}

972
/*
 * Append one route message describing (@dst/@dst_len, @tos, @type,
 * @scope, @fi) in table @tb_id to @skb.
 *
 * Attributes are emitted in this order: RTA_TABLE, RTA_DST (non-zero
 * prefix length only), RTA_PRIORITY (non-zero only), metrics,
 * RTA_PREFSRC (if set), then either the single nexthop's RTA_GATEWAY /
 * RTA_OIF / RTA_FLOW or, with CONFIG_IP_ROUTE_MULTIPATH and more than
 * one nexthop, a nested RTA_MULTIPATH list.
 *
 * Returns the value of nlmsg_end() on success.  If the message does not
 * fit, it is cancelled and -EMSGSIZE is returned.
 */
int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
		  u32 tb_id, u8 type, u8 scope, __be32 dst, int dst_len, u8 tos,
		  struct fib_info *fi, unsigned int flags)
{
	struct nlmsghdr *nlh;
	struct rtmsg *rtm;

	nlh = nlmsg_put(skb, pid, seq, event, sizeof(*rtm), flags);
	if (!nlh)
		return -EMSGSIZE;

	/* fixed route header */
	rtm = nlmsg_data(nlh);
	rtm->rtm_family = AF_INET;
	rtm->rtm_dst_len = dst_len;
	rtm->rtm_src_len = 0;
	rtm->rtm_tos = tos;
	/* Table ids of 256 and above are reported as RT_TABLE_COMPAT. */
	rtm->rtm_table = (tb_id < 256) ? tb_id : RT_TABLE_COMPAT;
	rtm->rtm_type = type;
	rtm->rtm_flags = fi->fib_flags;
	rtm->rtm_scope = scope;
	rtm->rtm_protocol = fi->fib_protocol;

	/* The NLA_PUT_* macros jump to nla_put_failure when out of room. */
	NLA_PUT_U32(skb, RTA_TABLE, tb_id);

	if (rtm->rtm_dst_len)
		NLA_PUT_BE32(skb, RTA_DST, dst);

	if (fi->fib_priority)
		NLA_PUT_U32(skb, RTA_PRIORITY, fi->fib_priority);

	if (rtnetlink_put_metrics(skb, fi->fib_metrics) < 0)
		goto nla_put_failure;

	if (fi->fib_prefsrc)
		NLA_PUT_BE32(skb, RTA_PREFSRC, fi->fib_prefsrc);

	if (fi->fib_nhs == 1) {
		const struct fib_nh *only = fi->fib_nh;

		if (only->nh_gw)
			NLA_PUT_BE32(skb, RTA_GATEWAY, only->nh_gw);

		if (only->nh_oif)
			NLA_PUT_U32(skb, RTA_OIF, only->nh_oif);
#ifdef CONFIG_IP_ROUTE_CLASSID
		if (only->nh_tclassid)
			NLA_PUT_U32(skb, RTA_FLOW, only->nh_tclassid);
#endif
	}
#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (fi->fib_nhs > 1) {
		struct nlattr *nest;
		struct rtnexthop *rtnh;

		nest = nla_nest_start(skb, RTA_MULTIPATH);
		if (!nest)
			goto nla_put_failure;

		for_nexthops(fi) {
			rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
			if (!rtnh)
				goto nla_put_failure;

			rtnh->rtnh_flags = nh->nh_flags & 0xFF;
			rtnh->rtnh_hops = nh->nh_weight - 1;
			rtnh->rtnh_ifindex = nh->nh_oif;

			if (nh->nh_gw)
				NLA_PUT_BE32(skb, RTA_GATEWAY, nh->nh_gw);
#ifdef CONFIG_IP_ROUTE_CLASSID
			if (nh->nh_tclassid)
				NLA_PUT_U32(skb, RTA_FLOW, nh->nh_tclassid);
#endif
			/* length of rtnetlink header + attributes */
			rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *) rtnh;
		} endfor_nexthops(fi);

		nla_nest_end(skb, nest);
	}
#endif
	return nlmsg_end(skb, nlh);

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}

/*
E
Eric Dumazet 已提交
1060 1061 1062 1063
 * Update FIB if:
 * - local address disappeared -> we must delete all the entries
 *   referring to it.
 * - device went down -> we must shutdown all nexthops going via it.
L
Linus Torvalds 已提交
1064
 */
1065
int fib_sync_down_addr(struct net *net, __be32 local)
L
Linus Torvalds 已提交
1066 1067
{
	int ret = 0;
D
Denis V. Lunev 已提交
1068 1069 1070 1071
	unsigned int hash = fib_laddr_hashfn(local);
	struct hlist_head *head = &fib_info_laddrhash[hash];
	struct hlist_node *node;
	struct fib_info *fi;
L
Linus Torvalds 已提交
1072

D
Denis V. Lunev 已提交
1073 1074
	if (fib_info_laddrhash == NULL || local == 0)
		return 0;
L
Linus Torvalds 已提交
1075

D
Denis V. Lunev 已提交
1076
	hlist_for_each_entry(fi, node, head, fib_lhash) {
O
Octavian Purdila 已提交
1077
		if (!net_eq(fi->fib_net, net))
1078
			continue;
D
Denis V. Lunev 已提交
1079 1080 1081
		if (fi->fib_prefsrc == local) {
			fi->fib_flags |= RTNH_F_DEAD;
			ret++;
L
Linus Torvalds 已提交
1082 1083
		}
	}
D
Denis V. Lunev 已提交
1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095
	return ret;
}

int fib_sync_down_dev(struct net_device *dev, int force)
{
	int ret = 0;
	int scope = RT_SCOPE_NOWHERE;
	struct fib_info *prev_fi = NULL;
	unsigned int hash = fib_devindex_hashfn(dev->ifindex);
	struct hlist_head *head = &fib_info_devhash[hash];
	struct hlist_node *node;
	struct fib_nh *nh;
L
Linus Torvalds 已提交
1096

D
Denis V. Lunev 已提交
1097 1098
	if (force)
		scope = -1;
L
Linus Torvalds 已提交
1099

D
Denis V. Lunev 已提交
1100 1101 1102
	hlist_for_each_entry(nh, node, head, nh_hash) {
		struct fib_info *fi = nh->nh_parent;
		int dead;
L
Linus Torvalds 已提交
1103

D
Denis V. Lunev 已提交
1104 1105 1106 1107 1108 1109
		BUG_ON(!fi->fib_nhs);
		if (nh->nh_dev != dev || fi == prev_fi)
			continue;
		prev_fi = fi;
		dead = 0;
		change_nexthops(fi) {
E
Eric Dumazet 已提交
1110
			if (nexthop_nh->nh_flags & RTNH_F_DEAD)
D
Denis V. Lunev 已提交
1111
				dead++;
1112 1113 1114
			else if (nexthop_nh->nh_dev == dev &&
				 nexthop_nh->nh_scope != scope) {
				nexthop_nh->nh_flags |= RTNH_F_DEAD;
L
Linus Torvalds 已提交
1115
#ifdef CONFIG_IP_ROUTE_MULTIPATH
D
Denis V. Lunev 已提交
1116
				spin_lock_bh(&fib_multipath_lock);
1117 1118
				fi->fib_power -= nexthop_nh->nh_power;
				nexthop_nh->nh_power = 0;
D
Denis V. Lunev 已提交
1119
				spin_unlock_bh(&fib_multipath_lock);
L
Linus Torvalds 已提交
1120
#endif
D
Denis V. Lunev 已提交
1121 1122
				dead++;
			}
L
Linus Torvalds 已提交
1123
#ifdef CONFIG_IP_ROUTE_MULTIPATH
1124
			if (force > 1 && nexthop_nh->nh_dev == dev) {
D
Denis V. Lunev 已提交
1125 1126
				dead = fi->fib_nhs;
				break;
L
Linus Torvalds 已提交
1127
			}
D
Denis V. Lunev 已提交
1128 1129 1130 1131 1132
#endif
		} endfor_nexthops(fi)
		if (dead == fi->fib_nhs) {
			fi->fib_flags |= RTNH_F_DEAD;
			ret++;
L
Linus Torvalds 已提交
1133 1134 1135 1136 1137 1138
		}
	}

	return ret;
}

1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194
/*
 * Choose which of the routes sharing the alias list of the current
 * lookup result should actually be used, and store the choice in @res
 * and in the table's tb_default index.
 *
 * Must be invoked inside of an RCU protected region.  The read-side
 * critical section belongs to the caller: this function does not take
 * rcu_read_lock() and therefore must not call rcu_read_unlock() on any
 * of its exit paths, otherwise the caller's RCU nesting is unbalanced.
 */
void fib_select_default(struct fib_result *res)
{
	struct fib_info *fi = NULL, *last_resort = NULL;
	struct list_head *fa_head = res->fa_head;
	struct fib_table *tb = res->table;
	int order = -1, last_idx = -1;
	struct fib_alias *fa;

	list_for_each_entry_rcu(fa, fa_head, fa_list) {
		struct fib_info *next_fi = fa->fa_info;

		/* Only unicast aliases of the same scope as the result count. */
		if (fa->fa_scope != res->scope ||
		    fa->fa_type != RTN_UNICAST)
			continue;

		/* Stop once the priority value exceeds the result's. */
		if (next_fi->fib_priority > res->fi->fib_priority)
			break;

		/* Candidates need a gateway on a link-scope first nexthop. */
		if (!next_fi->fib_nh[0].nh_gw ||
		    next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
			continue;

		fib_alias_accessed(fa);

		if (fi == NULL) {
			/* The first candidate has to be the current result. */
			if (next_fi != res->fi)
				break;
		} else if (!fib_detect_death(fi, order, &last_resort,
					     &last_idx, tb->tb_default)) {
			fib_result_assign(res, fi);
			tb->tb_default = order;
			return;
		}
		fi = next_fi;
		order++;
	}

	/* Zero or one candidate: nothing to choose between. */
	if (order <= 0 || fi == NULL) {
		tb->tb_default = -1;
		return;
	}

	/* The last candidate was not yet checked inside the loop. */
	if (!fib_detect_death(fi, order, &last_resort, &last_idx,
			      tb->tb_default)) {
		fib_result_assign(res, fi);
		tb->tb_default = order;
		return;
	}

	/* Fall back to the entry fib_detect_death() recorded, if any. */
	if (last_idx >= 0)
		fib_result_assign(res, last_resort);
	tb->tb_default = last_idx;
}

L
Linus Torvalds 已提交
1195 1196 1197
#ifdef CONFIG_IP_ROUTE_MULTIPATH

/*
E
Eric Dumazet 已提交
1198 1199
 * Dead device goes up. We wake up dead nexthops.
 * It takes sense only on multipath routes.
L
Linus Torvalds 已提交
1200 1201 1202 1203 1204 1205 1206 1207 1208 1209
 */
int fib_sync_up(struct net_device *dev)
{
	struct fib_info *prev_fi;
	unsigned int hash;
	struct hlist_head *head;
	struct hlist_node *node;
	struct fib_nh *nh;
	int ret;

E
Eric Dumazet 已提交
1210
	if (!(dev->flags & IFF_UP))
L
Linus Torvalds 已提交
1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228
		return 0;

	prev_fi = NULL;
	hash = fib_devindex_hashfn(dev->ifindex);
	head = &fib_info_devhash[hash];
	ret = 0;

	hlist_for_each_entry(nh, node, head, nh_hash) {
		struct fib_info *fi = nh->nh_parent;
		int alive;

		BUG_ON(!fi->fib_nhs);
		if (nh->nh_dev != dev || fi == prev_fi)
			continue;

		prev_fi = fi;
		alive = 0;
		change_nexthops(fi) {
E
Eric Dumazet 已提交
1229
			if (!(nexthop_nh->nh_flags & RTNH_F_DEAD)) {
L
Linus Torvalds 已提交
1230 1231 1232
				alive++;
				continue;
			}
1233
			if (nexthop_nh->nh_dev == NULL ||
E
Eric Dumazet 已提交
1234
			    !(nexthop_nh->nh_dev->flags & IFF_UP))
L
Linus Torvalds 已提交
1235
				continue;
1236 1237
			if (nexthop_nh->nh_dev != dev ||
			    !__in_dev_get_rtnl(dev))
L
Linus Torvalds 已提交
1238 1239 1240
				continue;
			alive++;
			spin_lock_bh(&fib_multipath_lock);
1241 1242
			nexthop_nh->nh_power = 0;
			nexthop_nh->nh_flags &= ~RTNH_F_DEAD;
L
Linus Torvalds 已提交
1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255
			spin_unlock_bh(&fib_multipath_lock);
		} endfor_nexthops(fi)

		if (alive > 0) {
			fi->fib_flags &= ~RTNH_F_DEAD;
			ret++;
		}
	}

	return ret;
}

/*
 * The algorithm is suboptimal, but it provides really
 * fair weighted route distribution.
 *
 * Picks one nexthop of res->fi and stores its index in res->nh_sel.
 * Every live nexthop holds a budget (nh_power) that is refilled from
 * its weight once the total (fib_power) is used up; each selection
 * takes one unit from the chosen nexthop and from the total.  When no
 * live nexthop can be picked, res->nh_sel is set to 0.
 *
 * @flp is not used by this implementation.
 */
void fib_select_multipath(const struct flowi *flp, struct fib_result *res)
{
	struct fib_info *fi = res->fi;
	int w;

	spin_lock_bh(&fib_multipath_lock);

	if (fi->fib_power <= 0) {
		/* Budget exhausted: refill every live nexthop from its weight. */
		int total = 0;

		change_nexthops(fi) {
			if (nexthop_nh->nh_flags & RTNH_F_DEAD)
				continue;
			total += nexthop_nh->nh_weight;
			nexthop_nh->nh_power = nexthop_nh->nh_weight;
		} endfor_nexthops(fi);

		fi->fib_power = total;
		if (total <= 0) {
			spin_unlock_bh(&fib_multipath_lock);
			/* Race condition: route has just become dead. */
			res->nh_sel = 0;
			return;
		}
	}

	/* w should be random number [0..fi->fib_power-1],
	 * it is pretty bad approximation.
	 */
	w = jiffies % fi->fib_power;

	change_nexthops(fi) {
		if ((nexthop_nh->nh_flags & RTNH_F_DEAD) ||
		    !nexthop_nh->nh_power)
			continue;

		w -= nexthop_nh->nh_power;
		if (w > 0)
			continue;

		/* This nexthop covers the drawn value: charge it and stop. */
		nexthop_nh->nh_power--;
		fi->fib_power--;
		res->nh_sel = nhsel;
		spin_unlock_bh(&fib_multipath_lock);
		return;
	} endfor_nexthops(fi);

	/* Race condition: route has just become dead. */
	res->nh_sel = 0;
	spin_unlock_bh(&fib_multipath_lock);
}
#endif