fib_frontend.c 24.8 KB
Newer Older
L
Linus Torvalds 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		IPv4 Forwarding Information Base: FIB frontend.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */

#include <linux/module.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/bitops.h>
20
#include <linux/capability.h>
L
Linus Torvalds 已提交
21 22 23 24 25 26 27 28 29
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
30
#include <linux/inetdevice.h>
L
Linus Torvalds 已提交
31
#include <linux/netdevice.h>
32
#include <linux/if_addr.h>
L
Linus Torvalds 已提交
33 34 35
#include <linux/if_arp.h>
#include <linux/skbuff.h>
#include <linux/init.h>
36
#include <linux/list.h>
37
#include <linux/slab.h>
L
Linus Torvalds 已提交
38 39 40 41 42 43 44 45

#include <net/ip.h>
#include <net/protocol.h>
#include <net/route.h>
#include <net/tcp.h>
#include <net/sock.h>
#include <net/arp.h>
#include <net/ip_fib.h>
46
#include <net/rtnetlink.h>
L
Linus Torvalds 已提交
47 48 49

#ifndef CONFIG_IP_MULTIPLE_TABLES

50
/*
 * Create the local and main tables for @net and publish them in the
 * per-namespace table hash (variant built without
 * CONFIG_IP_MULTIPLE_TABLES).
 *
 * Returns 0 on success, -ENOMEM if fib_hash_table() fails for either table.
 */
static int __net_init fib4_rules_init(struct net *net)
{
	struct fib_table *local_tb;
	struct fib_table *main_tb;

	local_tb = fib_hash_table(RT_TABLE_LOCAL);
	if (!local_tb)
		return -ENOMEM;

	main_tb = fib_hash_table(RT_TABLE_MAIN);
	if (!main_tb) {
		/* Nothing has been hashed yet; just drop the first table. */
		kfree(local_tb);
		return -ENOMEM;
	}

	hlist_add_head_rcu(&local_tb->tb_hlist,
			   &net->ipv4.fib_table_hash[TABLE_LOCAL_INDEX]);
	hlist_add_head_rcu(&main_tb->tb_hlist,
			   &net->ipv4.fib_table_hash[TABLE_MAIN_INDEX]);
	return 0;
}
72
#else
L
Linus Torvalds 已提交
73

74
/*
 * Return the table identified by @id in @net, creating and hashing a new
 * one when it does not exist yet. An @id of 0 selects RT_TABLE_MAIN.
 *
 * Returns NULL if a new table is needed and fib_hash_table() fails.
 */
struct fib_table *fib_new_table(struct net *net, u32 id)
{
	struct fib_table *tb;
	unsigned int slot;

	if (!id)
		id = RT_TABLE_MAIN;

	tb = fib_get_table(net, id);
	if (!tb) {
		tb = fib_hash_table(id);
		if (tb) {
			slot = id & (FIB_TABLE_HASHSZ - 1);
			hlist_add_head_rcu(&tb->tb_hlist,
					   &net->ipv4.fib_table_hash[slot]);
		}
	}
	return tb;
}

93
/*
 * Look up the table identified by @id in the table hash of @net.
 * An @id of 0 selects RT_TABLE_MAIN. The hash chain is walked under
 * rcu_read_lock(); the lock is dropped before returning.
 *
 * Returns the matching table, or NULL when none is hashed under @id.
 */
struct fib_table *fib_get_table(struct net *net, u32 id)
{
	struct fib_table *found = NULL;
	struct fib_table *tb;
	struct hlist_node *node;
	struct hlist_head *head;

	if (!id)
		id = RT_TABLE_MAIN;
	head = &net->ipv4.fib_table_hash[id & (FIB_TABLE_HASHSZ - 1)];

	rcu_read_lock();
	hlist_for_each_entry_rcu(tb, node, head, tb_hlist) {
		if (tb->tb_id == id) {
			found = tb;
			break;
		}
	}
	rcu_read_unlock();

	return found;
}
L
Linus Torvalds 已提交
115 116
#endif /* CONFIG_IP_MULTIPLE_TABLES */

117 118
void fib_select_default(struct net *net,
			const struct flowi *flp, struct fib_result *res)
119 120 121 122 123 124 125 126
{
	struct fib_table *tb;
	int table = RT_TABLE_MAIN;
#ifdef CONFIG_IP_MULTIPLE_TABLES
	if (res->r == NULL || res->r->action != FR_ACT_TO_TBL)
		return;
	table = res->r->table;
#endif
127
	tb = fib_get_table(net, table);
128
	if (FIB_RES_GW(*res) && FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
129
		fib_table_select_default(tb, flp, res);
130 131
}

132
static void fib_flush(struct net *net)
L
Linus Torvalds 已提交
133 134 135
{
	int flushed = 0;
	struct fib_table *tb;
136
	struct hlist_node *node;
137
	struct hlist_head *head;
138
	unsigned int h;
L
Linus Torvalds 已提交
139

140
	for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
141 142
		head = &net->ipv4.fib_table_hash[h];
		hlist_for_each_entry(tb, node, head, tb_hlist)
143
			flushed += fib_table_flush(tb);
L
Linus Torvalds 已提交
144 145 146
	}

	if (flushed)
147
		rt_cache_flush(net, -1);
L
Linus Torvalds 已提交
148 149
}

E
Eric Dumazet 已提交
150 151 152 153 154 155
/**
 * __ip_dev_find - find the first device with a given source address.
 * @net: the net namespace
 * @addr: the source address
 * @devref: if true, take a reference on the found device
 *
156
 * If a caller uses devref=false, it should be protected by RCU, or RTNL
L
Linus Torvalds 已提交
157
 */
E
Eric Dumazet 已提交
158
struct net_device *__ip_dev_find(struct net *net, __be32 addr, bool devref)
L
Linus Torvalds 已提交
159
{
E
Eric Dumazet 已提交
160 161 162 163 164 165 166 167 168
	struct flowi fl = {
		.nl_u = {
			.ip4_u = {
				.daddr = addr
			}
		},
		.flags = FLOWI_FLAG_MATCH_ANY_IIF
	};
	struct fib_result res = { 0 };
L
Linus Torvalds 已提交
169 170
	struct net_device *dev = NULL;

E
Eric Dumazet 已提交
171 172 173
	rcu_read_lock();
	if (fib_lookup(net, &fl, &res)) {
		rcu_read_unlock();
L
Linus Torvalds 已提交
174
		return NULL;
E
Eric Dumazet 已提交
175
	}
L
Linus Torvalds 已提交
176 177 178 179
	if (res.type != RTN_LOCAL)
		goto out;
	dev = FIB_RES_DEV(res);

E
Eric Dumazet 已提交
180
	if (dev && devref)
L
Linus Torvalds 已提交
181 182
		dev_hold(dev);
out:
E
Eric Dumazet 已提交
183
	rcu_read_unlock();
L
Linus Torvalds 已提交
184 185
	return dev;
}
E
Eric Dumazet 已提交
186
EXPORT_SYMBOL(__ip_dev_find);
L
Linus Torvalds 已提交
187

188 189 190 191
/*
 * Find address type as if only "dev" was present in the system. If
 * on_dev is NULL then all interfaces are taken into consideration.
 */
192 193
static inline unsigned __inet_dev_addr_type(struct net *net,
					    const struct net_device *dev,
194
					    __be32 addr)
L
Linus Torvalds 已提交
195 196 197 198
{
	struct flowi		fl = { .nl_u = { .ip4_u = { .daddr = addr } } };
	struct fib_result	res;
	unsigned ret = RTN_BROADCAST;
199
	struct fib_table *local_table;
L
Linus Torvalds 已提交
200

201
	if (ipv4_is_zeronet(addr) || ipv4_is_lbcast(addr))
L
Linus Torvalds 已提交
202
		return RTN_BROADCAST;
203
	if (ipv4_is_multicast(addr))
L
Linus Torvalds 已提交
204 205 206 207 208
		return RTN_MULTICAST;

#ifdef CONFIG_IP_MULTIPLE_TABLES
	res.r = NULL;
#endif
209

210
	local_table = fib_get_table(net, RT_TABLE_LOCAL);
211
	if (local_table) {
L
Linus Torvalds 已提交
212
		ret = RTN_UNICAST;
E
Eric Dumazet 已提交
213 214
		rcu_read_lock();
		if (!fib_table_lookup(local_table, &fl, &res, FIB_LOOKUP_NOREF)) {
215 216
			if (!dev || dev == res.fi->fib_dev)
				ret = res.type;
L
Linus Torvalds 已提交
217
		}
E
Eric Dumazet 已提交
218
		rcu_read_unlock();
L
Linus Torvalds 已提交
219 220 221 222
	}
	return ret;
}

223
/*
 * Classify @addr within @net, taking all interfaces into account
 * (no device filter is passed to __inet_dev_addr_type()).
 */
unsigned int inet_addr_type(struct net *net, __be32 addr)
{
	return __inet_dev_addr_type(net, NULL, addr);
}
EXPORT_SYMBOL(inet_addr_type);
228

229 230
/*
 * Classify @addr within @net as if @dev were the only interface;
 * see __inet_dev_addr_type().
 */
unsigned int inet_dev_addr_type(struct net *net, const struct net_device *dev,
				__be32 addr)
{
	return __inet_dev_addr_type(net, dev, addr);
}
EXPORT_SYMBOL(inet_dev_addr_type);
235

L
Linus Torvalds 已提交
236
/* Given (packet source, input interface) and optional (dst, oif, tos):
E
Eric Dumazet 已提交
237 238 239 240 241
 * - (main) check, that source is valid i.e. not broadcast or our local
 *   address.
 * - figure out what "logical" interface this packet arrived
 *   and calculate "specific destination" address.
 * - check, that packet arrived from expected physical interface.
E
Eric Dumazet 已提交
242
 * called with rcu_read_lock()
L
Linus Torvalds 已提交
243
 */
244
int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif,
J
jamal 已提交
245 246
			struct net_device *dev, __be32 *spec_dst,
			u32 *itag, u32 mark)
L
Linus Torvalds 已提交
247 248
{
	struct in_device *in_dev;
E
Eric Dumazet 已提交
249 250 251 252 253 254 255 256 257 258 259
	struct flowi fl = {
		.nl_u = {
			.ip4_u = {
				.daddr = src,
				.saddr = dst,
				.tos = tos
			}
		},
		.mark = mark,
		.iif = oif
	};
L
Linus Torvalds 已提交
260
	struct fib_result res;
261
	int no_addr, rpf, accept_local;
262
	bool dev_match;
L
Linus Torvalds 已提交
263
	int ret;
264
	struct net *net;
L
Linus Torvalds 已提交
265

266
	no_addr = rpf = accept_local = 0;
267
	in_dev = __in_dev_get_rcu(dev);
L
Linus Torvalds 已提交
268 269 270
	if (in_dev) {
		no_addr = in_dev->ifa_list == NULL;
		rpf = IN_DEV_RPFILTER(in_dev);
271
		accept_local = IN_DEV_ACCEPT_LOCAL(in_dev);
272 273
		if (mark && !IN_DEV_SRC_VMARK(in_dev))
			fl.mark = 0;
L
Linus Torvalds 已提交
274 275 276 277 278
	}

	if (in_dev == NULL)
		goto e_inval;

279
	net = dev_net(dev);
280
	if (fib_lookup(net, &fl, &res))
L
Linus Torvalds 已提交
281
		goto last_resort;
282 283
	if (res.type != RTN_UNICAST) {
		if (res.type != RTN_LOCAL || !accept_local)
E
Eric Dumazet 已提交
284
			goto e_inval;
285
	}
L
Linus Torvalds 已提交
286 287
	*spec_dst = FIB_RES_PREFSRC(res);
	fib_combine_itag(itag, &res);
288 289
	dev_match = false;

L
Linus Torvalds 已提交
290
#ifdef CONFIG_IP_ROUTE_MULTIPATH
291 292 293 294 295 296 297 298
	for (ret = 0; ret < res.fi->fib_nhs; ret++) {
		struct fib_nh *nh = &res.fi->fib_nh[ret];

		if (nh->nh_dev == dev) {
			dev_match = true;
			break;
		}
	}
L
Linus Torvalds 已提交
299 300
#else
	if (FIB_RES_DEV(res) == dev)
301
		dev_match = true;
L
Linus Torvalds 已提交
302
#endif
303
	if (dev_match) {
L
Linus Torvalds 已提交
304 305 306 307 308
		ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST;
		return ret;
	}
	if (no_addr)
		goto last_resort;
309
	if (rpf == 1)
310
		goto e_rpf;
L
Linus Torvalds 已提交
311 312 313
	fl.oif = dev->ifindex;

	ret = 0;
314
	if (fib_lookup(net, &fl, &res) == 0) {
L
Linus Torvalds 已提交
315 316 317 318 319 320 321 322 323
		if (res.type == RTN_UNICAST) {
			*spec_dst = FIB_RES_PREFSRC(res);
			ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST;
		}
	}
	return ret;

last_resort:
	if (rpf)
324
		goto e_rpf;
L
Linus Torvalds 已提交
325 326 327 328 329 330
	*spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
	*itag = 0;
	return 0;

e_inval:
	return -EINVAL;
331 332
e_rpf:
	return -EXDEV;
L
Linus Torvalds 已提交
333 334
}

A
Al Viro 已提交
335
static inline __be32 sk_extract_addr(struct sockaddr *addr)
336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351
{
	return ((struct sockaddr_in *) addr)->sin_addr.s_addr;
}

/*
 * Write one 4-byte attribute of the given @type and @value at byte offset
 * @len inside the buffer starting at @mx.
 *
 * Returns the offset just past the new attribute, i.e. @len advanced by
 * nla_total_size(4). The caller must provide enough room.
 */
static int put_rtax(struct nlattr *mx, int len, int type, u32 value)
{
	struct nlattr *attr = (struct nlattr *) ((char *) mx + len);
	u32 *payload;

	attr->nla_type = type;
	attr->nla_len = nla_attr_size(4);

	payload = (u32 *) nla_data(attr);
	*payload = value;

	return len + nla_total_size(4);
}

352
static int rtentry_to_fib_config(struct net *net, int cmd, struct rtentry *rt,
353 354
				 struct fib_config *cfg)
{
355
	__be32 addr;
356 357 358
	int plen;

	memset(cfg, 0, sizeof(*cfg));
359
	cfg->fc_nlinfo.nl_net = net;
360 361 362 363 364 365 366 367 368 369 370 371 372 373 374

	if (rt->rt_dst.sa_family != AF_INET)
		return -EAFNOSUPPORT;

	/*
	 * Check mask for validity:
	 * a) it must be contiguous.
	 * b) destination must have all host bits clear.
	 * c) if application forgot to set correct family (AF_INET),
	 *    reject request unless it is absolutely clear i.e.
	 *    both family and mask are zero.
	 */
	plen = 32;
	addr = sk_extract_addr(&rt->rt_dst);
	if (!(rt->rt_flags & RTF_HOST)) {
A
Al Viro 已提交
375
		__be32 mask = sk_extract_addr(&rt->rt_genmask);
376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419

		if (rt->rt_genmask.sa_family != AF_INET) {
			if (mask || rt->rt_genmask.sa_family)
				return -EAFNOSUPPORT;
		}

		if (bad_mask(mask, addr))
			return -EINVAL;

		plen = inet_mask_len(mask);
	}

	cfg->fc_dst_len = plen;
	cfg->fc_dst = addr;

	if (cmd != SIOCDELRT) {
		cfg->fc_nlflags = NLM_F_CREATE;
		cfg->fc_protocol = RTPROT_BOOT;
	}

	if (rt->rt_metric)
		cfg->fc_priority = rt->rt_metric - 1;

	if (rt->rt_flags & RTF_REJECT) {
		cfg->fc_scope = RT_SCOPE_HOST;
		cfg->fc_type = RTN_UNREACHABLE;
		return 0;
	}

	cfg->fc_scope = RT_SCOPE_NOWHERE;
	cfg->fc_type = RTN_UNICAST;

	if (rt->rt_dev) {
		char *colon;
		struct net_device *dev;
		char devname[IFNAMSIZ];

		if (copy_from_user(devname, rt->rt_dev, IFNAMSIZ-1))
			return -EFAULT;

		devname[IFNAMSIZ-1] = 0;
		colon = strchr(devname, ':');
		if (colon)
			*colon = 0;
420
		dev = __dev_get_by_name(net, devname);
421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442
		if (!dev)
			return -ENODEV;
		cfg->fc_oif = dev->ifindex;
		if (colon) {
			struct in_ifaddr *ifa;
			struct in_device *in_dev = __in_dev_get_rtnl(dev);
			if (!in_dev)
				return -ENODEV;
			*colon = ':';
			for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next)
				if (strcmp(ifa->ifa_label, devname) == 0)
					break;
			if (ifa == NULL)
				return -ENODEV;
			cfg->fc_prefsrc = ifa->ifa_local;
		}
	}

	addr = sk_extract_addr(&rt->rt_gateway);
	if (rt->rt_gateway.sa_family == AF_INET && addr) {
		cfg->fc_gw = addr;
		if (rt->rt_flags & RTF_GATEWAY &&
443
		    inet_addr_type(net, addr) == RTN_UNICAST)
444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460
			cfg->fc_scope = RT_SCOPE_UNIVERSE;
	}

	if (cmd == SIOCDELRT)
		return 0;

	if (rt->rt_flags & RTF_GATEWAY && !cfg->fc_gw)
		return -EINVAL;

	if (cfg->fc_scope == RT_SCOPE_NOWHERE)
		cfg->fc_scope = RT_SCOPE_LINK;

	if (rt->rt_flags & (RTF_MTU | RTF_WINDOW | RTF_IRTT)) {
		struct nlattr *mx;
		int len = 0;

		mx = kzalloc(3 * nla_total_size(4), GFP_KERNEL);
461
		if (mx == NULL)
462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479
			return -ENOMEM;

		if (rt->rt_flags & RTF_MTU)
			len = put_rtax(mx, len, RTAX_ADVMSS, rt->rt_mtu - 40);

		if (rt->rt_flags & RTF_WINDOW)
			len = put_rtax(mx, len, RTAX_WINDOW, rt->rt_window);

		if (rt->rt_flags & RTF_IRTT)
			len = put_rtax(mx, len, RTAX_RTT, rt->rt_irtt << 3);

		cfg->fc_mx = mx;
		cfg->fc_mx_len = len;
	}

	return 0;
}

L
Linus Torvalds 已提交
480
/*
E
Eric Dumazet 已提交
481 482
 * Handle IP routing ioctl calls.
 * These are used to manipulate the routing tables
L
Linus Torvalds 已提交
483
 */
484
int ip_rt_ioctl(struct net *net, unsigned int cmd, void __user *arg)
L
Linus Torvalds 已提交
485
{
486 487
	struct fib_config cfg;
	struct rtentry rt;
L
Linus Torvalds 已提交
488 489 490 491 492 493 494
	int err;

	switch (cmd) {
	case SIOCADDRT:		/* Add a route */
	case SIOCDELRT:		/* Delete a route */
		if (!capable(CAP_NET_ADMIN))
			return -EPERM;
495 496

		if (copy_from_user(&rt, arg, sizeof(rt)))
L
Linus Torvalds 已提交
497
			return -EFAULT;
498

L
Linus Torvalds 已提交
499
		rtnl_lock();
500
		err = rtentry_to_fib_config(net, cmd, &rt, &cfg);
L
Linus Torvalds 已提交
501
		if (err == 0) {
502 503
			struct fib_table *tb;

L
Linus Torvalds 已提交
504
			if (cmd == SIOCDELRT) {
505
				tb = fib_get_table(net, cfg.fc_table);
L
Linus Torvalds 已提交
506
				if (tb)
507
					err = fib_table_delete(tb, &cfg);
508 509
				else
					err = -ESRCH;
L
Linus Torvalds 已提交
510
			} else {
511
				tb = fib_new_table(net, cfg.fc_table);
L
Linus Torvalds 已提交
512
				if (tb)
513
					err = fib_table_insert(tb, &cfg);
514 515
				else
					err = -ENOBUFS;
L
Linus Torvalds 已提交
516
			}
517 518 519

			/* allocated by rtentry_to_fib_config() */
			kfree(cfg.fc_mx);
L
Linus Torvalds 已提交
520 521 522 523 524 525 526
		}
		rtnl_unlock();
		return err;
	}
	return -EINVAL;
}

E
Eric Dumazet 已提交
527
const struct nla_policy rtm_ipv4_policy[RTA_MAX + 1] = {
528 529 530 531 532 533 534 535
	[RTA_DST]		= { .type = NLA_U32 },
	[RTA_SRC]		= { .type = NLA_U32 },
	[RTA_IIF]		= { .type = NLA_U32 },
	[RTA_OIF]		= { .type = NLA_U32 },
	[RTA_GATEWAY]		= { .type = NLA_U32 },
	[RTA_PRIORITY]		= { .type = NLA_U32 },
	[RTA_PREFSRC]		= { .type = NLA_U32 },
	[RTA_METRICS]		= { .type = NLA_NESTED },
536
	[RTA_MULTIPATH]		= { .len = sizeof(struct rtnexthop) },
537 538 539
	[RTA_FLOW]		= { .type = NLA_U32 },
};

540
static int rtm_to_fib_config(struct net *net, struct sk_buff *skb,
E
Eric Dumazet 已提交
541
			     struct nlmsghdr *nlh, struct fib_config *cfg)
L
Linus Torvalds 已提交
542
{
543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564
	struct nlattr *attr;
	int err, remaining;
	struct rtmsg *rtm;

	err = nlmsg_validate(nlh, sizeof(*rtm), RTA_MAX, rtm_ipv4_policy);
	if (err < 0)
		goto errout;

	memset(cfg, 0, sizeof(*cfg));

	rtm = nlmsg_data(nlh);
	cfg->fc_dst_len = rtm->rtm_dst_len;
	cfg->fc_tos = rtm->rtm_tos;
	cfg->fc_table = rtm->rtm_table;
	cfg->fc_protocol = rtm->rtm_protocol;
	cfg->fc_scope = rtm->rtm_scope;
	cfg->fc_type = rtm->rtm_type;
	cfg->fc_flags = rtm->rtm_flags;
	cfg->fc_nlflags = nlh->nlmsg_flags;

	cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid;
	cfg->fc_nlinfo.nlh = nlh;
565
	cfg->fc_nlinfo.nl_net = net;
566

567 568 569 570 571
	if (cfg->fc_type > RTN_MAX) {
		err = -EINVAL;
		goto errout;
	}

572
	nlmsg_for_each_attr(attr, nlh, sizeof(struct rtmsg), remaining) {
573
		switch (nla_type(attr)) {
574
		case RTA_DST:
575
			cfg->fc_dst = nla_get_be32(attr);
576 577 578 579 580
			break;
		case RTA_OIF:
			cfg->fc_oif = nla_get_u32(attr);
			break;
		case RTA_GATEWAY:
581
			cfg->fc_gw = nla_get_be32(attr);
582 583 584 585 586
			break;
		case RTA_PRIORITY:
			cfg->fc_priority = nla_get_u32(attr);
			break;
		case RTA_PREFSRC:
587
			cfg->fc_prefsrc = nla_get_be32(attr);
588 589 590 591 592 593 594 595 596 597 598 599 600 601 602
			break;
		case RTA_METRICS:
			cfg->fc_mx = nla_data(attr);
			cfg->fc_mx_len = nla_len(attr);
			break;
		case RTA_MULTIPATH:
			cfg->fc_mp = nla_data(attr);
			cfg->fc_mp_len = nla_len(attr);
			break;
		case RTA_FLOW:
			cfg->fc_flow = nla_get_u32(attr);
			break;
		case RTA_TABLE:
			cfg->fc_table = nla_get_u32(attr);
			break;
L
Linus Torvalds 已提交
603 604
		}
	}
605

L
Linus Torvalds 已提交
606
	return 0;
607 608
errout:
	return err;
L
Linus Torvalds 已提交
609 610
}

611
static int inet_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
L
Linus Torvalds 已提交
612
{
613
	struct net *net = sock_net(skb->sk);
614 615 616
	struct fib_config cfg;
	struct fib_table *tb;
	int err;
L
Linus Torvalds 已提交
617

618
	err = rtm_to_fib_config(net, skb, nlh, &cfg);
619 620
	if (err < 0)
		goto errout;
L
Linus Torvalds 已提交
621

622
	tb = fib_get_table(net, cfg.fc_table);
623 624 625 626 627
	if (tb == NULL) {
		err = -ESRCH;
		goto errout;
	}

628
	err = fib_table_delete(tb, &cfg);
629 630
errout:
	return err;
L
Linus Torvalds 已提交
631 632
}

633
static int inet_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
L
Linus Torvalds 已提交
634
{
635
	struct net *net = sock_net(skb->sk);
636 637 638
	struct fib_config cfg;
	struct fib_table *tb;
	int err;
L
Linus Torvalds 已提交
639

640
	err = rtm_to_fib_config(net, skb, nlh, &cfg);
641 642
	if (err < 0)
		goto errout;
L
Linus Torvalds 已提交
643

644
	tb = fib_new_table(net, cfg.fc_table);
645 646 647 648 649
	if (tb == NULL) {
		err = -ENOBUFS;
		goto errout;
	}

650
	err = fib_table_insert(tb, &cfg);
651 652
errout:
	return err;
L
Linus Torvalds 已提交
653 654
}

655
static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
L
Linus Torvalds 已提交
656
{
657
	struct net *net = sock_net(skb->sk);
658 659
	unsigned int h, s_h;
	unsigned int e = 0, s_e;
L
Linus Torvalds 已提交
660
	struct fib_table *tb;
661
	struct hlist_node *node;
662
	struct hlist_head *head;
663
	int dumped = 0;
L
Linus Torvalds 已提交
664

665 666
	if (nlmsg_len(cb->nlh) >= sizeof(struct rtmsg) &&
	    ((struct rtmsg *) nlmsg_data(cb->nlh))->rtm_flags & RTM_F_CLONED)
L
Linus Torvalds 已提交
667 668
		return ip_rt_dump(skb, cb);

669 670 671 672 673
	s_h = cb->args[0];
	s_e = cb->args[1];

	for (h = s_h; h < FIB_TABLE_HASHSZ; h++, s_e = 0) {
		e = 0;
674 675
		head = &net->ipv4.fib_table_hash[h];
		hlist_for_each_entry(tb, node, head, tb_hlist) {
676 677 678 679
			if (e < s_e)
				goto next;
			if (dumped)
				memset(&cb->args[2], 0, sizeof(cb->args) -
680
						 2 * sizeof(cb->args[0]));
681
			if (fib_table_dump(tb, skb, cb) < 0)
682 683 684 685 686
				goto out;
			dumped = 1;
next:
			e++;
		}
L
Linus Torvalds 已提交
687
	}
688 689 690
out:
	cb->args[1] = e;
	cb->args[0] = h;
L
Linus Torvalds 已提交
691 692 693 694 695

	return skb->len;
}

/* Prepare and feed intra-kernel routing request.
E
Eric Dumazet 已提交
696 697 698 699
 * Really, it should be netlink message, but :-( netlink
 * can be not configured, so that we feed it directly
 * to fib engine. It is legal, because all events occur
 * only when netlink is already locked.
L
Linus Torvalds 已提交
700
 */
A
Al Viro 已提交
701
static void fib_magic(int cmd, int type, __be32 dst, int dst_len, struct in_ifaddr *ifa)
L
Linus Torvalds 已提交
702
{
703
	struct net *net = dev_net(ifa->ifa_dev->dev);
704 705 706 707 708 709 710 711 712
	struct fib_table *tb;
	struct fib_config cfg = {
		.fc_protocol = RTPROT_KERNEL,
		.fc_type = type,
		.fc_dst = dst,
		.fc_dst_len = dst_len,
		.fc_prefsrc = ifa->ifa_local,
		.fc_oif = ifa->ifa_dev->dev->ifindex,
		.fc_nlflags = NLM_F_CREATE | NLM_F_APPEND,
713
		.fc_nlinfo = {
714
			.nl_net = net,
715
		},
716
	};
L
Linus Torvalds 已提交
717 718

	if (type == RTN_UNICAST)
719
		tb = fib_new_table(net, RT_TABLE_MAIN);
L
Linus Torvalds 已提交
720
	else
721
		tb = fib_new_table(net, RT_TABLE_LOCAL);
L
Linus Torvalds 已提交
722 723 724 725

	if (tb == NULL)
		return;

726
	cfg.fc_table = tb->tb_id;
L
Linus Torvalds 已提交
727

728 729 730 731
	if (type != RTN_LOCAL)
		cfg.fc_scope = RT_SCOPE_LINK;
	else
		cfg.fc_scope = RT_SCOPE_HOST;
L
Linus Torvalds 已提交
732 733

	if (cmd == RTM_NEWROUTE)
734
		fib_table_insert(tb, &cfg);
L
Linus Torvalds 已提交
735
	else
736
		fib_table_delete(tb, &cfg);
L
Linus Torvalds 已提交
737 738
}

739
void fib_add_ifaddr(struct in_ifaddr *ifa)
L
Linus Torvalds 已提交
740 741 742 743
{
	struct in_device *in_dev = ifa->ifa_dev;
	struct net_device *dev = in_dev->dev;
	struct in_ifaddr *prim = ifa;
A
Al Viro 已提交
744 745
	__be32 mask = ifa->ifa_mask;
	__be32 addr = ifa->ifa_local;
E
Eric Dumazet 已提交
746
	__be32 prefix = ifa->ifa_address & mask;
L
Linus Torvalds 已提交
747

E
Eric Dumazet 已提交
748
	if (ifa->ifa_flags & IFA_F_SECONDARY) {
L
Linus Torvalds 已提交
749 750
		prim = inet_ifa_byprefix(in_dev, prefix, mask);
		if (prim == NULL) {
751
			printk(KERN_WARNING "fib_add_ifaddr: bug: prim == NULL\n");
L
Linus Torvalds 已提交
752 753 754 755 756 757
			return;
		}
	}

	fib_magic(RTM_NEWROUTE, RTN_LOCAL, addr, 32, prim);

E
Eric Dumazet 已提交
758
	if (!(dev->flags & IFF_UP))
L
Linus Torvalds 已提交
759 760 761
		return;

	/* Add broadcast address, if it is explicitly assigned. */
A
Al Viro 已提交
762
	if (ifa->ifa_broadcast && ifa->ifa_broadcast != htonl(0xFFFFFFFF))
L
Linus Torvalds 已提交
763 764
		fib_magic(RTM_NEWROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32, prim);

E
Eric Dumazet 已提交
765
	if (!ipv4_is_zeronet(prefix) && !(ifa->ifa_flags & IFA_F_SECONDARY) &&
L
Linus Torvalds 已提交
766
	    (prefix != addr || ifa->ifa_prefixlen < 32)) {
E
Eric Dumazet 已提交
767 768 769
		fib_magic(RTM_NEWROUTE,
			  dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST,
			  prefix, ifa->ifa_prefixlen, prim);
L
Linus Torvalds 已提交
770 771 772 773

		/* Add network specific broadcasts, when it takes a sense */
		if (ifa->ifa_prefixlen < 31) {
			fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix, 32, prim);
E
Eric Dumazet 已提交
774 775
			fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix | ~mask,
				  32, prim);
L
Linus Torvalds 已提交
776 777 778 779 780 781 782 783 784 785
		}
	}
}

static void fib_del_ifaddr(struct in_ifaddr *ifa)
{
	struct in_device *in_dev = ifa->ifa_dev;
	struct net_device *dev = in_dev->dev;
	struct in_ifaddr *ifa1;
	struct in_ifaddr *prim = ifa;
E
Eric Dumazet 已提交
786 787
	__be32 brd = ifa->ifa_address | ~ifa->ifa_mask;
	__be32 any = ifa->ifa_address & ifa->ifa_mask;
L
Linus Torvalds 已提交
788 789 790 791 792 793
#define LOCAL_OK	1
#define BRD_OK		2
#define BRD0_OK		4
#define BRD1_OK		8
	unsigned ok = 0;

E
Eric Dumazet 已提交
794 795 796 797
	if (!(ifa->ifa_flags & IFA_F_SECONDARY))
		fib_magic(RTM_DELROUTE,
			  dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST,
			  any, ifa->ifa_prefixlen, prim);
L
Linus Torvalds 已提交
798 799 800
	else {
		prim = inet_ifa_byprefix(in_dev, any, ifa->ifa_mask);
		if (prim == NULL) {
801
			printk(KERN_WARNING "fib_del_ifaddr: bug: prim == NULL\n");
L
Linus Torvalds 已提交
802 803 804 805 806
			return;
		}
	}

	/* Deletion is more complicated than add.
E
Eric Dumazet 已提交
807 808 809
	 * We should take care of not to delete too much :-)
	 *
	 * Scan address list to be sure that addresses are really gone.
L
Linus Torvalds 已提交
810 811 812 813 814 815 816 817 818 819 820 821 822
	 */

	for (ifa1 = in_dev->ifa_list; ifa1; ifa1 = ifa1->ifa_next) {
		if (ifa->ifa_local == ifa1->ifa_local)
			ok |= LOCAL_OK;
		if (ifa->ifa_broadcast == ifa1->ifa_broadcast)
			ok |= BRD_OK;
		if (brd == ifa1->ifa_broadcast)
			ok |= BRD1_OK;
		if (any == ifa1->ifa_broadcast)
			ok |= BRD0_OK;
	}

E
Eric Dumazet 已提交
823
	if (!(ok & BRD_OK))
L
Linus Torvalds 已提交
824
		fib_magic(RTM_DELROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32, prim);
E
Eric Dumazet 已提交
825
	if (!(ok & BRD1_OK))
L
Linus Torvalds 已提交
826
		fib_magic(RTM_DELROUTE, RTN_BROADCAST, brd, 32, prim);
E
Eric Dumazet 已提交
827
	if (!(ok & BRD0_OK))
L
Linus Torvalds 已提交
828
		fib_magic(RTM_DELROUTE, RTN_BROADCAST, any, 32, prim);
E
Eric Dumazet 已提交
829
	if (!(ok & LOCAL_OK)) {
L
Linus Torvalds 已提交
830 831 832
		fib_magic(RTM_DELROUTE, RTN_LOCAL, ifa->ifa_local, 32, prim);

		/* Check, that this local address finally disappeared. */
833
		if (inet_addr_type(dev_net(dev), ifa->ifa_local) != RTN_LOCAL) {
L
Linus Torvalds 已提交
834
			/* And the last, but not the least thing.
E
Eric Dumazet 已提交
835 836 837 838 839
			 * We must flush stray FIB entries.
			 *
			 * First of all, we scan fib_info list searching
			 * for stray nexthop entries, then ignite fib_flush.
			 */
840 841
			if (fib_sync_down_addr(dev_net(dev), ifa->ifa_local))
				fib_flush(dev_net(dev));
L
Linus Torvalds 已提交
842 843 844 845 846 847 848 849
		}
	}
#undef LOCAL_OK
#undef BRD_OK
#undef BRD0_OK
#undef BRD1_OK
}

E
Eric Dumazet 已提交
850
static void nl_fib_lookup(struct fib_result_nl *frn, struct fib_table *tb)
851
{
852

853
	struct fib_result       res;
E
Eric Dumazet 已提交
854 855 856 857 858 859 860 861 862 863
	struct flowi            fl = {
		.mark = frn->fl_mark,
		.nl_u = {
			.ip4_u = {
				.daddr = frn->fl_addr,
				.tos = frn->fl_tos,
				.scope = frn->fl_scope
			}
		}
	};
864

865 866 867 868
#ifdef CONFIG_IP_MULTIPLE_TABLES
	res.r = NULL;
#endif

869
	frn->err = -ENOENT;
870 871 872 873
	if (tb) {
		local_bh_disable();

		frn->tb_id = tb->tb_id;
E
Eric Dumazet 已提交
874 875
		rcu_read_lock();
		frn->err = fib_table_lookup(tb, &fl, &res, FIB_LOOKUP_NOREF);
876 877 878 879 880 881 882

		if (!frn->err) {
			frn->prefixlen = res.prefixlen;
			frn->nh_sel = res.nh_sel;
			frn->type = res.type;
			frn->scope = res.scope;
		}
E
Eric Dumazet 已提交
883
		rcu_read_unlock();
884 885 886 887
		local_bh_enable();
	}
}

888
static void nl_fib_input(struct sk_buff *skb)
889
{
890
	struct net *net;
891
	struct fib_result_nl *frn;
892
	struct nlmsghdr *nlh;
893
	struct fib_table *tb;
894
	u32 pid;
895

896
	net = sock_net(skb->sk);
897
	nlh = nlmsg_hdr(skb);
898
	if (skb->len < NLMSG_SPACE(0) || skb->len < nlh->nlmsg_len ||
899
	    nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*frn)))
900
		return;
901 902 903 904 905

	skb = skb_clone(skb, GFP_KERNEL);
	if (skb == NULL)
		return;
	nlh = nlmsg_hdr(skb);
906

907
	frn = (struct fib_result_nl *) NLMSG_DATA(nlh);
908
	tb = fib_get_table(net, frn->tb_id_in);
909 910

	nl_fib_lookup(frn, tb);
911

E
Eric Dumazet 已提交
912 913
	pid = NETLINK_CB(skb).pid;      /* pid of sending process */
	NETLINK_CB(skb).pid = 0;        /* from kernel */
914
	NETLINK_CB(skb).dst_group = 0;  /* unicast */
915
	netlink_unicast(net->ipv4.fibnl, skb, pid, MSG_DONTWAIT);
916
}
917

918
/*
 * Create the kernel-side NETLINK_FIB_LOOKUP socket for @net and remember
 * it in net->ipv4.fibnl. Returns 0, or -EAFNOSUPPORT if the socket cannot
 * be created.
 */
static int __net_init nl_fib_lookup_init(struct net *net)
{
	struct sock *sk = netlink_kernel_create(net, NETLINK_FIB_LOOKUP, 0,
						nl_fib_input, NULL,
						THIS_MODULE);

	if (!sk)
		return -EAFNOSUPPORT;

	net->ipv4.fibnl = sk;
	return 0;
}

static void nl_fib_lookup_exit(struct net *net)
{
931
	netlink_kernel_release(net->ipv4.fibnl);
932
	net->ipv4.fibnl = NULL;
933 934
}

935
/*
 * Take the routes of @dev out of service: sync its nexthops down
 * (flushing the tables if that reports work to do), flush the route
 * cache with the given @delay and notify ARP that the device went down.
 * @force is passed through to fib_sync_down_dev().
 */
static void fib_disable_ip(struct net_device *dev, int force, int delay)
{
	struct net *net = dev_net(dev);

	if (fib_sync_down_dev(dev, force))
		fib_flush(net);
	rt_cache_flush(net, delay);
	arp_ifdown(dev);
}

static int fib_inetaddr_event(struct notifier_block *this, unsigned long event, void *ptr)
{
945
	struct in_ifaddr *ifa = (struct in_ifaddr *)ptr;
946
	struct net_device *dev = ifa->ifa_dev->dev;
L
Linus Torvalds 已提交
947 948 949 950 951

	switch (event) {
	case NETDEV_UP:
		fib_add_ifaddr(ifa);
#ifdef CONFIG_IP_ROUTE_MULTIPATH
952
		fib_sync_up(dev);
L
Linus Torvalds 已提交
953
#endif
954
		rt_cache_flush(dev_net(dev), -1);
L
Linus Torvalds 已提交
955 956 957
		break;
	case NETDEV_DOWN:
		fib_del_ifaddr(ifa);
958
		if (ifa->ifa_dev->ifa_list == NULL) {
L
Linus Torvalds 已提交
959
			/* Last address was deleted from this interface.
E
Eric Dumazet 已提交
960
			 * Disable IP.
L
Linus Torvalds 已提交
961
			 */
962
			fib_disable_ip(dev, 1, 0);
L
Linus Torvalds 已提交
963
		} else {
964
			rt_cache_flush(dev_net(dev), -1);
L
Linus Torvalds 已提交
965 966 967 968 969 970 971 972 973
		}
		break;
	}
	return NOTIFY_DONE;
}

static int fib_netdev_event(struct notifier_block *this, unsigned long event, void *ptr)
{
	struct net_device *dev = ptr;
974
	struct in_device *in_dev = __in_dev_get_rtnl(dev);
L
Linus Torvalds 已提交
975 976

	if (event == NETDEV_UNREGISTER) {
977
		fib_disable_ip(dev, 2, -1);
L
Linus Torvalds 已提交
978 979 980 981 982 983 984 985 986 987 988 989 990 991
		return NOTIFY_DONE;
	}

	if (!in_dev)
		return NOTIFY_DONE;

	switch (event) {
	case NETDEV_UP:
		for_ifa(in_dev) {
			fib_add_ifaddr(ifa);
		} endfor_ifa(in_dev);
#ifdef CONFIG_IP_ROUTE_MULTIPATH
		fib_sync_up(dev);
#endif
992
		rt_cache_flush(dev_net(dev), -1);
L
Linus Torvalds 已提交
993 994
		break;
	case NETDEV_DOWN:
995
		fib_disable_ip(dev, 0, 0);
L
Linus Torvalds 已提交
996 997 998
		break;
	case NETDEV_CHANGEMTU:
	case NETDEV_CHANGE:
999
		rt_cache_flush(dev_net(dev), 0);
L
Linus Torvalds 已提交
1000
		break;
1001 1002 1003
	case NETDEV_UNREGISTER_BATCH:
		rt_cache_flush_batch();
		break;
L
Linus Torvalds 已提交
1004 1005 1006 1007 1008
	}
	return NOTIFY_DONE;
}

static struct notifier_block fib_inetaddr_notifier = {
1009
	.notifier_call = fib_inetaddr_event,
L
Linus Torvalds 已提交
1010 1011 1012
};

static struct notifier_block fib_netdev_notifier = {
1013
	.notifier_call = fib_netdev_event,
L
Linus Torvalds 已提交
1014 1015
};

1016
/*
 * Allocate the zeroed table hash of @net and run fib4_rules_init().
 * Returns 0 on success; on failure the hash is freed again and a negative
 * errno is returned.
 */
static int __net_init ip_fib_net_init(struct net *net)
{
	size_t bytes = sizeof(struct hlist_head) * FIB_TABLE_HASHSZ;
	int err;

	/* Avoid false sharing : Use at least a full cache line */
	bytes = max_t(size_t, bytes, L1_CACHE_BYTES);

	net->ipv4.fib_table_hash = kzalloc(bytes, GFP_KERNEL);
	if (!net->ipv4.fib_table_hash)
		return -ENOMEM;

	err = fib4_rules_init(net);
	if (err < 0) {
		kfree(net->ipv4.fib_table_hash);
		return err;
	}
	return 0;
}
L
Linus Torvalds 已提交
1037

1038
static void ip_fib_net_exit(struct net *net)
1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049
{
	unsigned int i;

#ifdef CONFIG_IP_MULTIPLE_TABLES
	fib4_rules_exit(net);
#endif

	for (i = 0; i < FIB_TABLE_HASHSZ; i++) {
		struct fib_table *tb;
		struct hlist_head *head;
		struct hlist_node *node, *tmp;
1050

1051
		head = &net->ipv4.fib_table_hash[i];
1052 1053
		hlist_for_each_entry_safe(tb, node, tmp, head, tb_hlist) {
			hlist_del(node);
1054
			fib_table_flush(tb);
1055
			fib_free_table(tb);
1056 1057
		}
	}
1058
	kfree(net->ipv4.fib_table_hash);
1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097
}

/*
 * Per-namespace setup: FIB tables, the FIB lookup netlink socket and the
 * proc entries, in that order. If a later step fails, the earlier ones
 * are undone in reverse order and the error is returned.
 */
static int __net_init fib_net_init(struct net *net)
{
	int error;

	error = ip_fib_net_init(net);
	if (error < 0)
		return error;

	error = nl_fib_lookup_init(net);
	if (error < 0)
		goto undo_tables;

	error = fib_proc_init(net);
	if (error < 0)
		goto undo_netlink;

	return error;

undo_netlink:
	nl_fib_lookup_exit(net);
undo_tables:
	ip_fib_net_exit(net);
	return error;
}

/*
 * Per-namespace teardown: removes the proc entries, the FIB lookup
 * netlink socket and the FIB tables, i.e. the reverse of the order in
 * which fib_net_init() sets them up.
 */
static void __net_exit fib_net_exit(struct net *net)
{
	fib_proc_exit(net);
	nl_fib_lookup_exit(net);
	ip_fib_net_exit(net);
}

/* Per-namespace init/exit hooks; registered in ip_fib_init(). */
static struct pernet_operations fib_net_ops = {
	.init = fib_net_init,
	.exit = fib_net_exit,
};

/*
 * Boot-time initialisation of the FIB frontend: register the rtnetlink
 * route handlers, the per-namespace operations and the device/address
 * notifiers, then initialise the fib_hash backend.
 */
void __init ip_fib_init(void)
{
	/* rtnetlink: new/del are doit handlers, get is served by the dump. */
	rtnl_register(PF_INET, RTM_NEWROUTE, inet_rtm_newroute, NULL);
	rtnl_register(PF_INET, RTM_DELROUTE, inet_rtm_delroute, NULL);
	rtnl_register(PF_INET, RTM_GETROUTE, NULL, inet_dump_fib);

	register_pernet_subsys(&fib_net_ops);
	register_netdevice_notifier(&fib_netdev_notifier);
	register_inetaddr_notifier(&fib_inetaddr_notifier);

	fib_hash_init();
}