fib_frontend.c 26.3 KB
Newer Older
L
Linus Torvalds 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		IPv4 Forwarding Information Base: FIB frontend.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */

#include <linux/module.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/bitops.h>
20
#include <linux/capability.h>
L
Linus Torvalds 已提交
21 22 23 24 25 26 27 28 29
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
30
#include <linux/inetdevice.h>
L
Linus Torvalds 已提交
31
#include <linux/netdevice.h>
32
#include <linux/if_addr.h>
L
Linus Torvalds 已提交
33 34 35
#include <linux/if_arp.h>
#include <linux/skbuff.h>
#include <linux/init.h>
36
#include <linux/list.h>
37
#include <linux/slab.h>
L
Linus Torvalds 已提交
38 39 40 41 42 43 44 45

#include <net/ip.h>
#include <net/protocol.h>
#include <net/route.h>
#include <net/tcp.h>
#include <net/sock.h>
#include <net/arp.h>
#include <net/ip_fib.h>
46
#include <net/rtnetlink.h>
L
Linus Torvalds 已提交
47 48 49

#ifndef CONFIG_IP_MULTIPLE_TABLES

50
/*
 * Variant built when CONFIG_IP_MULTIPLE_TABLES is not set: allocate the
 * LOCAL and MAIN tables and link them into the namespace's table hash at
 * their fixed indices.
 *
 * Returns 0 on success, -ENOMEM if either table cannot be allocated.
 */
static int __net_init fib4_rules_init(struct net *net)
{
	struct fib_table *local_table, *main_table;

	local_table = fib_trie_table(RT_TABLE_LOCAL);
	if (local_table == NULL)
		return -ENOMEM;

	main_table  = fib_trie_table(RT_TABLE_MAIN);
	if (main_table == NULL)
		goto fail;

	hlist_add_head_rcu(&local_table->tb_hlist,
				&net->ipv4.fib_table_hash[TABLE_LOCAL_INDEX]);
	hlist_add_head_rcu(&main_table->tb_hlist,
				&net->ipv4.fib_table_hash[TABLE_MAIN_INDEX]);
	return 0;

fail:
	/* Only the local table exists at this point; release it again. */
	kfree(local_table);
	return -ENOMEM;
}
72
#else
L
Linus Torvalds 已提交
73

74
/*
 * Return the FIB table with the given id in @net, creating it and adding
 * it to the table hash when it does not exist yet.  An id of 0 selects
 * RT_TABLE_MAIN.
 *
 * Returns the table, or NULL if a new table could not be allocated.
 */
struct fib_table *fib_new_table(struct net *net, u32 id)
{
	struct fib_table *tb;
	unsigned int h;

	if (id == 0)
		id = RT_TABLE_MAIN;
	tb = fib_get_table(net, id);
	if (tb)
		return tb;

	tb = fib_trie_table(id);
	if (!tb)
		return NULL;
	/* Same bucket computation as fib_get_table(). */
	h = id & (FIB_TABLE_HASHSZ - 1);
	hlist_add_head_rcu(&tb->tb_hlist, &net->ipv4.fib_table_hash[h]);
	return tb;
}

93
/*
 * Look up the FIB table with the given id in @net's table hash.  An id
 * of 0 selects RT_TABLE_MAIN.  The bucket is walked under
 * rcu_read_lock().
 *
 * Returns the matching table, or NULL if none is registered.
 */
struct fib_table *fib_get_table(struct net *net, u32 id)
{
	struct fib_table *tb;
	struct hlist_node *node;
	struct hlist_head *head;
	unsigned int h;

	if (id == 0)
		id = RT_TABLE_MAIN;
	h = id & (FIB_TABLE_HASHSZ - 1);

	rcu_read_lock();
	head = &net->ipv4.fib_table_hash[h];
	hlist_for_each_entry_rcu(tb, node, head, tb_hlist) {
		if (tb->tb_id == id) {
			rcu_read_unlock();
			return tb;
		}
	}
	rcu_read_unlock();
	return NULL;
}
L
Linus Torvalds 已提交
115 116
#endif /* CONFIG_IP_MULTIPLE_TABLES */

117
static void fib_flush(struct net *net)
L
Linus Torvalds 已提交
118 119 120
{
	int flushed = 0;
	struct fib_table *tb;
121
	struct hlist_node *node;
122
	struct hlist_head *head;
123
	unsigned int h;
L
Linus Torvalds 已提交
124

125
	for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
126 127
		head = &net->ipv4.fib_table_hash[h];
		hlist_for_each_entry(tb, node, head, tb_hlist)
128
			flushed += fib_table_flush(tb);
L
Linus Torvalds 已提交
129 130 131
	}

	if (flushed)
132
		rt_cache_flush(net, -1);
L
Linus Torvalds 已提交
133 134
}

135 136 137 138
/*
 * Find address type as if only "dev" was present in the system. If
 * on_dev is NULL then all interfaces are taken into consideration.
 */
139 140
static inline unsigned __inet_dev_addr_type(struct net *net,
					    const struct net_device *dev,
141
					    __be32 addr)
L
Linus Torvalds 已提交
142
{
D
David S. Miller 已提交
143
	struct flowi4		fl4 = { .daddr = addr };
L
Linus Torvalds 已提交
144 145
	struct fib_result	res;
	unsigned ret = RTN_BROADCAST;
146
	struct fib_table *local_table;
L
Linus Torvalds 已提交
147

148
	if (ipv4_is_zeronet(addr) || ipv4_is_lbcast(addr))
L
Linus Torvalds 已提交
149
		return RTN_BROADCAST;
150
	if (ipv4_is_multicast(addr))
L
Linus Torvalds 已提交
151 152 153 154 155
		return RTN_MULTICAST;

#ifdef CONFIG_IP_MULTIPLE_TABLES
	res.r = NULL;
#endif
156

157
	local_table = fib_get_table(net, RT_TABLE_LOCAL);
158
	if (local_table) {
L
Linus Torvalds 已提交
159
		ret = RTN_UNICAST;
E
Eric Dumazet 已提交
160
		rcu_read_lock();
D
David S. Miller 已提交
161
		if (!fib_table_lookup(local_table, &fl4, &res, FIB_LOOKUP_NOREF)) {
162 163
			if (!dev || dev == res.fi->fib_dev)
				ret = res.type;
L
Linus Torvalds 已提交
164
		}
E
Eric Dumazet 已提交
165
		rcu_read_unlock();
L
Linus Torvalds 已提交
166 167 168 169
	}
	return ret;
}

170
/* Address type of @addr in @net, considering all interfaces. */
unsigned int inet_addr_type(struct net *net, __be32 addr)
{
	return __inet_dev_addr_type(net, NULL, addr);
}
EXPORT_SYMBOL(inet_addr_type);
175

176 177
/* Address type of @addr in @net as if only @dev were present. */
unsigned int inet_dev_addr_type(struct net *net, const struct net_device *dev,
				__be32 addr)
{
	return __inet_dev_addr_type(net, dev, addr);
}
EXPORT_SYMBOL(inet_dev_addr_type);
182

L
Linus Torvalds 已提交
183
/* Given (packet source, input interface) and optional (dst, oif, tos):
E
Eric Dumazet 已提交
184 185 186 187 188
 * - (main) check, that source is valid i.e. not broadcast or our local
 *   address.
 * - figure out what "logical" interface this packet arrived
 *   and calculate "specific destination" address.
 * - check, that packet arrived from expected physical interface.
E
Eric Dumazet 已提交
189
 * called with rcu_read_lock()
L
Linus Torvalds 已提交
190
 */
191
int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif,
J
jamal 已提交
192 193
			struct net_device *dev, __be32 *spec_dst,
			u32 *itag, u32 mark)
L
Linus Torvalds 已提交
194 195
{
	struct in_device *in_dev;
D
David S. Miller 已提交
196
	struct flowi4 fl4;
L
Linus Torvalds 已提交
197
	struct fib_result res;
198
	int no_addr, rpf, accept_local;
199
	bool dev_match;
L
Linus Torvalds 已提交
200
	int ret;
201
	struct net *net;
L
Linus Torvalds 已提交
202

D
David S. Miller 已提交
203 204 205 206 207 208 209
	fl4.flowi4_oif = 0;
	fl4.flowi4_iif = oif;
	fl4.flowi4_mark = mark;
	fl4.daddr = src;
	fl4.saddr = dst;
	fl4.flowi4_tos = tos;
	fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
210

211
	no_addr = rpf = accept_local = 0;
212
	in_dev = __in_dev_get_rcu(dev);
L
Linus Torvalds 已提交
213 214 215
	if (in_dev) {
		no_addr = in_dev->ifa_list == NULL;
		rpf = IN_DEV_RPFILTER(in_dev);
216
		accept_local = IN_DEV_ACCEPT_LOCAL(in_dev);
217
		if (mark && !IN_DEV_SRC_VMARK(in_dev))
D
David S. Miller 已提交
218
			fl4.flowi4_mark = 0;
L
Linus Torvalds 已提交
219 220 221 222 223
	}

	if (in_dev == NULL)
		goto e_inval;

224
	net = dev_net(dev);
D
David S. Miller 已提交
225
	if (fib_lookup(net, &fl4, &res))
L
Linus Torvalds 已提交
226
		goto last_resort;
227 228
	if (res.type != RTN_UNICAST) {
		if (res.type != RTN_LOCAL || !accept_local)
E
Eric Dumazet 已提交
229
			goto e_inval;
230
	}
231
	*spec_dst = FIB_RES_PREFSRC(net, res);
L
Linus Torvalds 已提交
232
	fib_combine_itag(itag, &res);
233 234
	dev_match = false;

L
Linus Torvalds 已提交
235
#ifdef CONFIG_IP_ROUTE_MULTIPATH
236 237 238 239 240 241 242 243
	for (ret = 0; ret < res.fi->fib_nhs; ret++) {
		struct fib_nh *nh = &res.fi->fib_nh[ret];

		if (nh->nh_dev == dev) {
			dev_match = true;
			break;
		}
	}
L
Linus Torvalds 已提交
244 245
#else
	if (FIB_RES_DEV(res) == dev)
246
		dev_match = true;
L
Linus Torvalds 已提交
247
#endif
248
	if (dev_match) {
L
Linus Torvalds 已提交
249 250 251 252 253
		ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST;
		return ret;
	}
	if (no_addr)
		goto last_resort;
254
	if (rpf == 1)
255
		goto e_rpf;
D
David S. Miller 已提交
256
	fl4.flowi4_oif = dev->ifindex;
L
Linus Torvalds 已提交
257 258

	ret = 0;
D
David S. Miller 已提交
259
	if (fib_lookup(net, &fl4, &res) == 0) {
L
Linus Torvalds 已提交
260
		if (res.type == RTN_UNICAST) {
261
			*spec_dst = FIB_RES_PREFSRC(net, res);
L
Linus Torvalds 已提交
262 263 264 265 266 267 268
			ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST;
		}
	}
	return ret;

last_resort:
	if (rpf)
269
		goto e_rpf;
L
Linus Torvalds 已提交
270 271 272 273 274 275
	*spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
	*itag = 0;
	return 0;

e_inval:
	return -EINVAL;
276 277
e_rpf:
	return -EXDEV;
L
Linus Torvalds 已提交
278 279
}

A
Al Viro 已提交
280
static inline __be32 sk_extract_addr(struct sockaddr *addr)
281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296
{
	return ((struct sockaddr_in *) addr)->sin_addr.s_addr;
}

/*
 * Write one 4-byte attribute of the given type at byte offset @len inside
 * the buffer starting at @mx and return the offset just past it.
 */
static int put_rtax(struct nlattr *mx, int len, int type, u32 value)
{
	struct nlattr *attr = (struct nlattr *) ((char *) mx + len);
	u32 *payload;

	attr->nla_type = type;
	attr->nla_len = nla_attr_size(4);

	payload = nla_data(attr);
	*payload = value;

	return len + nla_total_size(4);
}

297
static int rtentry_to_fib_config(struct net *net, int cmd, struct rtentry *rt,
298 299
				 struct fib_config *cfg)
{
300
	__be32 addr;
301 302 303
	int plen;

	memset(cfg, 0, sizeof(*cfg));
304
	cfg->fc_nlinfo.nl_net = net;
305 306 307 308 309 310 311 312 313 314 315 316 317 318 319

	if (rt->rt_dst.sa_family != AF_INET)
		return -EAFNOSUPPORT;

	/*
	 * Check mask for validity:
	 * a) it must be contiguous.
	 * b) destination must have all host bits clear.
	 * c) if application forgot to set correct family (AF_INET),
	 *    reject request unless it is absolutely clear i.e.
	 *    both family and mask are zero.
	 */
	plen = 32;
	addr = sk_extract_addr(&rt->rt_dst);
	if (!(rt->rt_flags & RTF_HOST)) {
A
Al Viro 已提交
320
		__be32 mask = sk_extract_addr(&rt->rt_genmask);
321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364

		if (rt->rt_genmask.sa_family != AF_INET) {
			if (mask || rt->rt_genmask.sa_family)
				return -EAFNOSUPPORT;
		}

		if (bad_mask(mask, addr))
			return -EINVAL;

		plen = inet_mask_len(mask);
	}

	cfg->fc_dst_len = plen;
	cfg->fc_dst = addr;

	if (cmd != SIOCDELRT) {
		cfg->fc_nlflags = NLM_F_CREATE;
		cfg->fc_protocol = RTPROT_BOOT;
	}

	if (rt->rt_metric)
		cfg->fc_priority = rt->rt_metric - 1;

	if (rt->rt_flags & RTF_REJECT) {
		cfg->fc_scope = RT_SCOPE_HOST;
		cfg->fc_type = RTN_UNREACHABLE;
		return 0;
	}

	cfg->fc_scope = RT_SCOPE_NOWHERE;
	cfg->fc_type = RTN_UNICAST;

	if (rt->rt_dev) {
		char *colon;
		struct net_device *dev;
		char devname[IFNAMSIZ];

		if (copy_from_user(devname, rt->rt_dev, IFNAMSIZ-1))
			return -EFAULT;

		devname[IFNAMSIZ-1] = 0;
		colon = strchr(devname, ':');
		if (colon)
			*colon = 0;
365
		dev = __dev_get_by_name(net, devname);
366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387
		if (!dev)
			return -ENODEV;
		cfg->fc_oif = dev->ifindex;
		if (colon) {
			struct in_ifaddr *ifa;
			struct in_device *in_dev = __in_dev_get_rtnl(dev);
			if (!in_dev)
				return -ENODEV;
			*colon = ':';
			for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next)
				if (strcmp(ifa->ifa_label, devname) == 0)
					break;
			if (ifa == NULL)
				return -ENODEV;
			cfg->fc_prefsrc = ifa->ifa_local;
		}
	}

	addr = sk_extract_addr(&rt->rt_gateway);
	if (rt->rt_gateway.sa_family == AF_INET && addr) {
		cfg->fc_gw = addr;
		if (rt->rt_flags & RTF_GATEWAY &&
388
		    inet_addr_type(net, addr) == RTN_UNICAST)
389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405
			cfg->fc_scope = RT_SCOPE_UNIVERSE;
	}

	if (cmd == SIOCDELRT)
		return 0;

	if (rt->rt_flags & RTF_GATEWAY && !cfg->fc_gw)
		return -EINVAL;

	if (cfg->fc_scope == RT_SCOPE_NOWHERE)
		cfg->fc_scope = RT_SCOPE_LINK;

	if (rt->rt_flags & (RTF_MTU | RTF_WINDOW | RTF_IRTT)) {
		struct nlattr *mx;
		int len = 0;

		mx = kzalloc(3 * nla_total_size(4), GFP_KERNEL);
406
		if (mx == NULL)
407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424
			return -ENOMEM;

		if (rt->rt_flags & RTF_MTU)
			len = put_rtax(mx, len, RTAX_ADVMSS, rt->rt_mtu - 40);

		if (rt->rt_flags & RTF_WINDOW)
			len = put_rtax(mx, len, RTAX_WINDOW, rt->rt_window);

		if (rt->rt_flags & RTF_IRTT)
			len = put_rtax(mx, len, RTAX_RTT, rt->rt_irtt << 3);

		cfg->fc_mx = mx;
		cfg->fc_mx_len = len;
	}

	return 0;
}

L
Linus Torvalds 已提交
425
/*
E
Eric Dumazet 已提交
426 427
 * Handle IP routing ioctl calls.
 * These are used to manipulate the routing tables
L
Linus Torvalds 已提交
428
 */
429
int ip_rt_ioctl(struct net *net, unsigned int cmd, void __user *arg)
L
Linus Torvalds 已提交
430
{
431 432
	struct fib_config cfg;
	struct rtentry rt;
L
Linus Torvalds 已提交
433 434 435 436 437 438 439
	int err;

	switch (cmd) {
	case SIOCADDRT:		/* Add a route */
	case SIOCDELRT:		/* Delete a route */
		if (!capable(CAP_NET_ADMIN))
			return -EPERM;
440 441

		if (copy_from_user(&rt, arg, sizeof(rt)))
L
Linus Torvalds 已提交
442
			return -EFAULT;
443

L
Linus Torvalds 已提交
444
		rtnl_lock();
445
		err = rtentry_to_fib_config(net, cmd, &rt, &cfg);
L
Linus Torvalds 已提交
446
		if (err == 0) {
447 448
			struct fib_table *tb;

L
Linus Torvalds 已提交
449
			if (cmd == SIOCDELRT) {
450
				tb = fib_get_table(net, cfg.fc_table);
L
Linus Torvalds 已提交
451
				if (tb)
452
					err = fib_table_delete(tb, &cfg);
453 454
				else
					err = -ESRCH;
L
Linus Torvalds 已提交
455
			} else {
456
				tb = fib_new_table(net, cfg.fc_table);
L
Linus Torvalds 已提交
457
				if (tb)
458
					err = fib_table_insert(tb, &cfg);
459 460
				else
					err = -ENOBUFS;
L
Linus Torvalds 已提交
461
			}
462 463 464

			/* allocated by rtentry_to_fib_config() */
			kfree(cfg.fc_mx);
L
Linus Torvalds 已提交
465 466 467 468 469 470 471
		}
		rtnl_unlock();
		return err;
	}
	return -EINVAL;
}

E
Eric Dumazet 已提交
472
const struct nla_policy rtm_ipv4_policy[RTA_MAX + 1] = {
473 474 475 476 477 478 479 480
	[RTA_DST]		= { .type = NLA_U32 },
	[RTA_SRC]		= { .type = NLA_U32 },
	[RTA_IIF]		= { .type = NLA_U32 },
	[RTA_OIF]		= { .type = NLA_U32 },
	[RTA_GATEWAY]		= { .type = NLA_U32 },
	[RTA_PRIORITY]		= { .type = NLA_U32 },
	[RTA_PREFSRC]		= { .type = NLA_U32 },
	[RTA_METRICS]		= { .type = NLA_NESTED },
481
	[RTA_MULTIPATH]		= { .len = sizeof(struct rtnexthop) },
482 483 484
	[RTA_FLOW]		= { .type = NLA_U32 },
};

485
static int rtm_to_fib_config(struct net *net, struct sk_buff *skb,
E
Eric Dumazet 已提交
486
			     struct nlmsghdr *nlh, struct fib_config *cfg)
L
Linus Torvalds 已提交
487
{
488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509
	struct nlattr *attr;
	int err, remaining;
	struct rtmsg *rtm;

	err = nlmsg_validate(nlh, sizeof(*rtm), RTA_MAX, rtm_ipv4_policy);
	if (err < 0)
		goto errout;

	memset(cfg, 0, sizeof(*cfg));

	rtm = nlmsg_data(nlh);
	cfg->fc_dst_len = rtm->rtm_dst_len;
	cfg->fc_tos = rtm->rtm_tos;
	cfg->fc_table = rtm->rtm_table;
	cfg->fc_protocol = rtm->rtm_protocol;
	cfg->fc_scope = rtm->rtm_scope;
	cfg->fc_type = rtm->rtm_type;
	cfg->fc_flags = rtm->rtm_flags;
	cfg->fc_nlflags = nlh->nlmsg_flags;

	cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid;
	cfg->fc_nlinfo.nlh = nlh;
510
	cfg->fc_nlinfo.nl_net = net;
511

512 513 514 515 516
	if (cfg->fc_type > RTN_MAX) {
		err = -EINVAL;
		goto errout;
	}

517
	nlmsg_for_each_attr(attr, nlh, sizeof(struct rtmsg), remaining) {
518
		switch (nla_type(attr)) {
519
		case RTA_DST:
520
			cfg->fc_dst = nla_get_be32(attr);
521 522 523 524 525
			break;
		case RTA_OIF:
			cfg->fc_oif = nla_get_u32(attr);
			break;
		case RTA_GATEWAY:
526
			cfg->fc_gw = nla_get_be32(attr);
527 528 529 530 531
			break;
		case RTA_PRIORITY:
			cfg->fc_priority = nla_get_u32(attr);
			break;
		case RTA_PREFSRC:
532
			cfg->fc_prefsrc = nla_get_be32(attr);
533 534 535 536 537 538 539 540 541 542 543 544 545 546 547
			break;
		case RTA_METRICS:
			cfg->fc_mx = nla_data(attr);
			cfg->fc_mx_len = nla_len(attr);
			break;
		case RTA_MULTIPATH:
			cfg->fc_mp = nla_data(attr);
			cfg->fc_mp_len = nla_len(attr);
			break;
		case RTA_FLOW:
			cfg->fc_flow = nla_get_u32(attr);
			break;
		case RTA_TABLE:
			cfg->fc_table = nla_get_u32(attr);
			break;
L
Linus Torvalds 已提交
548 549
		}
	}
550

L
Linus Torvalds 已提交
551
	return 0;
552 553
errout:
	return err;
L
Linus Torvalds 已提交
554 555
}

556
static int inet_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
L
Linus Torvalds 已提交
557
{
558
	struct net *net = sock_net(skb->sk);
559 560 561
	struct fib_config cfg;
	struct fib_table *tb;
	int err;
L
Linus Torvalds 已提交
562

563
	err = rtm_to_fib_config(net, skb, nlh, &cfg);
564 565
	if (err < 0)
		goto errout;
L
Linus Torvalds 已提交
566

567
	tb = fib_get_table(net, cfg.fc_table);
568 569 570 571 572
	if (tb == NULL) {
		err = -ESRCH;
		goto errout;
	}

573
	err = fib_table_delete(tb, &cfg);
574 575
errout:
	return err;
L
Linus Torvalds 已提交
576 577
}

578
static int inet_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
L
Linus Torvalds 已提交
579
{
580
	struct net *net = sock_net(skb->sk);
581 582 583
	struct fib_config cfg;
	struct fib_table *tb;
	int err;
L
Linus Torvalds 已提交
584

585
	err = rtm_to_fib_config(net, skb, nlh, &cfg);
586 587
	if (err < 0)
		goto errout;
L
Linus Torvalds 已提交
588

589
	tb = fib_new_table(net, cfg.fc_table);
590 591 592 593 594
	if (tb == NULL) {
		err = -ENOBUFS;
		goto errout;
	}

595
	err = fib_table_insert(tb, &cfg);
596 597
errout:
	return err;
L
Linus Torvalds 已提交
598 599
}

600
static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
L
Linus Torvalds 已提交
601
{
602
	struct net *net = sock_net(skb->sk);
603 604
	unsigned int h, s_h;
	unsigned int e = 0, s_e;
L
Linus Torvalds 已提交
605
	struct fib_table *tb;
606
	struct hlist_node *node;
607
	struct hlist_head *head;
608
	int dumped = 0;
L
Linus Torvalds 已提交
609

610 611
	if (nlmsg_len(cb->nlh) >= sizeof(struct rtmsg) &&
	    ((struct rtmsg *) nlmsg_data(cb->nlh))->rtm_flags & RTM_F_CLONED)
L
Linus Torvalds 已提交
612 613
		return ip_rt_dump(skb, cb);

614 615 616 617 618
	s_h = cb->args[0];
	s_e = cb->args[1];

	for (h = s_h; h < FIB_TABLE_HASHSZ; h++, s_e = 0) {
		e = 0;
619 620
		head = &net->ipv4.fib_table_hash[h];
		hlist_for_each_entry(tb, node, head, tb_hlist) {
621 622 623 624
			if (e < s_e)
				goto next;
			if (dumped)
				memset(&cb->args[2], 0, sizeof(cb->args) -
625
						 2 * sizeof(cb->args[0]));
626
			if (fib_table_dump(tb, skb, cb) < 0)
627 628 629 630 631
				goto out;
			dumped = 1;
next:
			e++;
		}
L
Linus Torvalds 已提交
632
	}
633 634 635
out:
	cb->args[1] = e;
	cb->args[0] = h;
L
Linus Torvalds 已提交
636 637 638 639 640

	return skb->len;
}

/* Prepare and feed intra-kernel routing request.
E
Eric Dumazet 已提交
641 642 643 644
 * Really, it should be netlink message, but :-( netlink
 * can be not configured, so that we feed it directly
 * to fib engine. It is legal, because all events occur
 * only when netlink is already locked.
L
Linus Torvalds 已提交
645
 */
A
Al Viro 已提交
646
static void fib_magic(int cmd, int type, __be32 dst, int dst_len, struct in_ifaddr *ifa)
L
Linus Torvalds 已提交
647
{
648
	struct net *net = dev_net(ifa->ifa_dev->dev);
649 650 651 652 653 654 655 656 657
	struct fib_table *tb;
	struct fib_config cfg = {
		.fc_protocol = RTPROT_KERNEL,
		.fc_type = type,
		.fc_dst = dst,
		.fc_dst_len = dst_len,
		.fc_prefsrc = ifa->ifa_local,
		.fc_oif = ifa->ifa_dev->dev->ifindex,
		.fc_nlflags = NLM_F_CREATE | NLM_F_APPEND,
658
		.fc_nlinfo = {
659
			.nl_net = net,
660
		},
661
	};
L
Linus Torvalds 已提交
662 663

	if (type == RTN_UNICAST)
664
		tb = fib_new_table(net, RT_TABLE_MAIN);
L
Linus Torvalds 已提交
665
	else
666
		tb = fib_new_table(net, RT_TABLE_LOCAL);
L
Linus Torvalds 已提交
667 668 669 670

	if (tb == NULL)
		return;

671
	cfg.fc_table = tb->tb_id;
L
Linus Torvalds 已提交
672

673 674 675 676
	if (type != RTN_LOCAL)
		cfg.fc_scope = RT_SCOPE_LINK;
	else
		cfg.fc_scope = RT_SCOPE_HOST;
L
Linus Torvalds 已提交
677 678

	if (cmd == RTM_NEWROUTE)
679
		fib_table_insert(tb, &cfg);
L
Linus Torvalds 已提交
680
	else
681
		fib_table_delete(tb, &cfg);
L
Linus Torvalds 已提交
682 683
}

684
void fib_add_ifaddr(struct in_ifaddr *ifa)
L
Linus Torvalds 已提交
685 686 687 688
{
	struct in_device *in_dev = ifa->ifa_dev;
	struct net_device *dev = in_dev->dev;
	struct in_ifaddr *prim = ifa;
A
Al Viro 已提交
689 690
	__be32 mask = ifa->ifa_mask;
	__be32 addr = ifa->ifa_local;
E
Eric Dumazet 已提交
691
	__be32 prefix = ifa->ifa_address & mask;
L
Linus Torvalds 已提交
692

E
Eric Dumazet 已提交
693
	if (ifa->ifa_flags & IFA_F_SECONDARY) {
L
Linus Torvalds 已提交
694 695
		prim = inet_ifa_byprefix(in_dev, prefix, mask);
		if (prim == NULL) {
696
			printk(KERN_WARNING "fib_add_ifaddr: bug: prim == NULL\n");
L
Linus Torvalds 已提交
697 698 699 700 701 702
			return;
		}
	}

	fib_magic(RTM_NEWROUTE, RTN_LOCAL, addr, 32, prim);

E
Eric Dumazet 已提交
703
	if (!(dev->flags & IFF_UP))
L
Linus Torvalds 已提交
704 705 706
		return;

	/* Add broadcast address, if it is explicitly assigned. */
A
Al Viro 已提交
707
	if (ifa->ifa_broadcast && ifa->ifa_broadcast != htonl(0xFFFFFFFF))
L
Linus Torvalds 已提交
708 709
		fib_magic(RTM_NEWROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32, prim);

E
Eric Dumazet 已提交
710
	if (!ipv4_is_zeronet(prefix) && !(ifa->ifa_flags & IFA_F_SECONDARY) &&
L
Linus Torvalds 已提交
711
	    (prefix != addr || ifa->ifa_prefixlen < 32)) {
E
Eric Dumazet 已提交
712 713 714
		fib_magic(RTM_NEWROUTE,
			  dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST,
			  prefix, ifa->ifa_prefixlen, prim);
L
Linus Torvalds 已提交
715 716 717 718

		/* Add network specific broadcasts, when it takes a sense */
		if (ifa->ifa_prefixlen < 31) {
			fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix, 32, prim);
E
Eric Dumazet 已提交
719 720
			fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix | ~mask,
				  32, prim);
L
Linus Torvalds 已提交
721 722 723 724
		}
	}
}

725 726 727 728 729 730
/* Delete primary or secondary address.
 * Optionally, on secondary address promotion consider the addresses
 * from subnet iprim as deleted, even if they are in device list.
 * In this case the secondary ifa can be in device list.
 */
void fib_del_ifaddr(struct in_ifaddr *ifa, struct in_ifaddr *iprim)
L
Linus Torvalds 已提交
731 732 733 734
{
	struct in_device *in_dev = ifa->ifa_dev;
	struct net_device *dev = in_dev->dev;
	struct in_ifaddr *ifa1;
735
	struct in_ifaddr *prim = ifa, *prim1 = NULL;
E
Eric Dumazet 已提交
736 737
	__be32 brd = ifa->ifa_address | ~ifa->ifa_mask;
	__be32 any = ifa->ifa_address & ifa->ifa_mask;
L
Linus Torvalds 已提交
738 739 740 741 742
#define LOCAL_OK	1
#define BRD_OK		2
#define BRD0_OK		4
#define BRD1_OK		8
	unsigned ok = 0;
743 744 745
	int subnet = 0;		/* Primary network */
	int gone = 1;		/* Address is missing */
	int same_prefsrc = 0;	/* Another primary with same IP */
L
Linus Torvalds 已提交
746

747
	if (ifa->ifa_flags & IFA_F_SECONDARY) {
L
Linus Torvalds 已提交
748 749
		prim = inet_ifa_byprefix(in_dev, any, ifa->ifa_mask);
		if (prim == NULL) {
750
			printk(KERN_WARNING "fib_del_ifaddr: bug: prim == NULL\n");
L
Linus Torvalds 已提交
751 752
			return;
		}
753 754 755 756 757 758 759 760 761 762
		if (iprim && iprim != prim) {
			printk(KERN_WARNING "fib_del_ifaddr: bug: iprim != prim\n");
			return;
		}
	} else if (!ipv4_is_zeronet(any) &&
		   (any != ifa->ifa_local || ifa->ifa_prefixlen < 32)) {
		fib_magic(RTM_DELROUTE,
			  dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST,
			  any, ifa->ifa_prefixlen, prim);
		subnet = 1;
L
Linus Torvalds 已提交
763 764 765
	}

	/* Deletion is more complicated than add.
E
Eric Dumazet 已提交
766 767 768
	 * We should take care of not to delete too much :-)
	 *
	 * Scan address list to be sure that addresses are really gone.
L
Linus Torvalds 已提交
769 770 771
	 */

	for (ifa1 = in_dev->ifa_list; ifa1; ifa1 = ifa1->ifa_next) {
772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814
		if (ifa1 == ifa) {
			/* promotion, keep the IP */
			gone = 0;
			continue;
		}
		/* Ignore IFAs from our subnet */
		if (iprim && ifa1->ifa_mask == iprim->ifa_mask &&
		    inet_ifa_match(ifa1->ifa_address, iprim))
			continue;

		/* Ignore ifa1 if it uses different primary IP (prefsrc) */
		if (ifa1->ifa_flags & IFA_F_SECONDARY) {
			/* Another address from our subnet? */
			if (ifa1->ifa_mask == prim->ifa_mask &&
			    inet_ifa_match(ifa1->ifa_address, prim))
				prim1 = prim;
			else {
				/* We reached the secondaries, so
				 * same_prefsrc should be determined.
				 */
				if (!same_prefsrc)
					continue;
				/* Search new prim1 if ifa1 is not
				 * using the current prim1
				 */
				if (!prim1 ||
				    ifa1->ifa_mask != prim1->ifa_mask ||
				    !inet_ifa_match(ifa1->ifa_address, prim1))
					prim1 = inet_ifa_byprefix(in_dev,
							ifa1->ifa_address,
							ifa1->ifa_mask);
				if (!prim1)
					continue;
				if (prim1->ifa_local != prim->ifa_local)
					continue;
			}
		} else {
			if (prim->ifa_local != ifa1->ifa_local)
				continue;
			prim1 = ifa1;
			if (prim != prim1)
				same_prefsrc = 1;
		}
L
Linus Torvalds 已提交
815 816 817 818 819 820 821 822
		if (ifa->ifa_local == ifa1->ifa_local)
			ok |= LOCAL_OK;
		if (ifa->ifa_broadcast == ifa1->ifa_broadcast)
			ok |= BRD_OK;
		if (brd == ifa1->ifa_broadcast)
			ok |= BRD1_OK;
		if (any == ifa1->ifa_broadcast)
			ok |= BRD0_OK;
823 824 825 826 827 828 829 830 831 832 833 834 835 836 837
		/* primary has network specific broadcasts */
		if (prim1 == ifa1 && ifa1->ifa_prefixlen < 31) {
			__be32 brd1 = ifa1->ifa_address | ~ifa1->ifa_mask;
			__be32 any1 = ifa1->ifa_address & ifa1->ifa_mask;

			if (!ipv4_is_zeronet(any1)) {
				if (ifa->ifa_broadcast == brd1 ||
				    ifa->ifa_broadcast == any1)
					ok |= BRD_OK;
				if (brd == brd1 || brd == any1)
					ok |= BRD1_OK;
				if (any == brd1 || any == any1)
					ok |= BRD0_OK;
			}
		}
L
Linus Torvalds 已提交
838 839
	}

E
Eric Dumazet 已提交
840
	if (!(ok & BRD_OK))
L
Linus Torvalds 已提交
841
		fib_magic(RTM_DELROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32, prim);
842 843 844 845 846 847
	if (subnet && ifa->ifa_prefixlen < 31) {
		if (!(ok & BRD1_OK))
			fib_magic(RTM_DELROUTE, RTN_BROADCAST, brd, 32, prim);
		if (!(ok & BRD0_OK))
			fib_magic(RTM_DELROUTE, RTN_BROADCAST, any, 32, prim);
	}
E
Eric Dumazet 已提交
848
	if (!(ok & LOCAL_OK)) {
L
Linus Torvalds 已提交
849 850 851
		fib_magic(RTM_DELROUTE, RTN_LOCAL, ifa->ifa_local, 32, prim);

		/* Check, that this local address finally disappeared. */
852 853
		if (gone &&
		    inet_addr_type(dev_net(dev), ifa->ifa_local) != RTN_LOCAL) {
L
Linus Torvalds 已提交
854
			/* And the last, but not the least thing.
E
Eric Dumazet 已提交
855 856 857 858 859
			 * We must flush stray FIB entries.
			 *
			 * First of all, we scan fib_info list searching
			 * for stray nexthop entries, then ignite fib_flush.
			 */
860 861
			if (fib_sync_down_addr(dev_net(dev), ifa->ifa_local))
				fib_flush(dev_net(dev));
L
Linus Torvalds 已提交
862 863 864 865 866 867 868 869
		}
	}
#undef LOCAL_OK
#undef BRD_OK
#undef BRD0_OK
#undef BRD1_OK
}

E
Eric Dumazet 已提交
870
static void nl_fib_lookup(struct fib_result_nl *frn, struct fib_table *tb)
871
{
872

873
	struct fib_result       res;
D
David S. Miller 已提交
874 875 876 877 878
	struct flowi4           fl4 = {
		.flowi4_mark = frn->fl_mark,
		.daddr = frn->fl_addr,
		.flowi4_tos = frn->fl_tos,
		.flowi4_scope = frn->fl_scope,
E
Eric Dumazet 已提交
879
	};
880

881 882 883 884
#ifdef CONFIG_IP_MULTIPLE_TABLES
	res.r = NULL;
#endif

885
	frn->err = -ENOENT;
886 887 888 889
	if (tb) {
		local_bh_disable();

		frn->tb_id = tb->tb_id;
E
Eric Dumazet 已提交
890
		rcu_read_lock();
D
David S. Miller 已提交
891
		frn->err = fib_table_lookup(tb, &fl4, &res, FIB_LOOKUP_NOREF);
892 893 894 895 896 897 898

		if (!frn->err) {
			frn->prefixlen = res.prefixlen;
			frn->nh_sel = res.nh_sel;
			frn->type = res.type;
			frn->scope = res.scope;
		}
E
Eric Dumazet 已提交
899
		rcu_read_unlock();
900 901 902 903
		local_bh_enable();
	}
}

904
static void nl_fib_input(struct sk_buff *skb)
905
{
906
	struct net *net;
907
	struct fib_result_nl *frn;
908
	struct nlmsghdr *nlh;
909
	struct fib_table *tb;
910
	u32 pid;
911

912
	net = sock_net(skb->sk);
913
	nlh = nlmsg_hdr(skb);
914
	if (skb->len < NLMSG_SPACE(0) || skb->len < nlh->nlmsg_len ||
915
	    nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*frn)))
916
		return;
917 918 919 920 921

	skb = skb_clone(skb, GFP_KERNEL);
	if (skb == NULL)
		return;
	nlh = nlmsg_hdr(skb);
922

923
	frn = (struct fib_result_nl *) NLMSG_DATA(nlh);
924
	tb = fib_get_table(net, frn->tb_id_in);
925 926

	nl_fib_lookup(frn, tb);
927

E
Eric Dumazet 已提交
928 929
	pid = NETLINK_CB(skb).pid;      /* pid of sending process */
	NETLINK_CB(skb).pid = 0;        /* from kernel */
930
	NETLINK_CB(skb).dst_group = 0;  /* unicast */
931
	netlink_unicast(net->ipv4.fibnl, skb, pid, MSG_DONTWAIT);
932
}
933

934
/*
 * Create the per-namespace NETLINK_FIB_LOOKUP kernel socket and store it
 * in net->ipv4.fibnl.  Returns 0, or -EAFNOSUPPORT if the socket cannot
 * be created.
 */
static int __net_init nl_fib_lookup_init(struct net *net)
{
	struct sock *sk;
	sk = netlink_kernel_create(net, NETLINK_FIB_LOOKUP, 0,
				   nl_fib_input, NULL, THIS_MODULE);
	if (sk == NULL)
		return -EAFNOSUPPORT;
	net->ipv4.fibnl = sk;
	return 0;
}

static void nl_fib_lookup_exit(struct net *net)
{
947
	netlink_kernel_release(net->ipv4.fibnl);
948
	net->ipv4.fibnl = NULL;
949 950
}

951
/*
 * Take IPv4 routing down for @dev: sync the FIB nexthops (@force is
 * passed through to fib_sync_down_dev()), flush the FIB if that reports
 * changes, flush the routing cache with @delay and bring ARP down for
 * the device.
 */
static void fib_disable_ip(struct net_device *dev, int force, int delay)
{
	if (fib_sync_down_dev(dev, force))
		fib_flush(dev_net(dev));
	rt_cache_flush(dev_net(dev), delay);
	arp_ifdown(dev);
}
static int fib_inetaddr_event(struct notifier_block *this, unsigned long event, void *ptr)
{
961
	struct in_ifaddr *ifa = (struct in_ifaddr *)ptr;
962
	struct net_device *dev = ifa->ifa_dev->dev;
963
	struct net *net = dev_net(dev);
L
Linus Torvalds 已提交
964 965 966 967 968

	switch (event) {
	case NETDEV_UP:
		fib_add_ifaddr(ifa);
#ifdef CONFIG_IP_ROUTE_MULTIPATH
969
		fib_sync_up(dev);
L
Linus Torvalds 已提交
970
#endif
971
		atomic_inc(&net->ipv4.dev_addr_genid);
972
		rt_cache_flush(dev_net(dev), -1);
L
Linus Torvalds 已提交
973 974
		break;
	case NETDEV_DOWN:
975
		fib_del_ifaddr(ifa, NULL);
976
		atomic_inc(&net->ipv4.dev_addr_genid);
977
		if (ifa->ifa_dev->ifa_list == NULL) {
L
Linus Torvalds 已提交
978
			/* Last address was deleted from this interface.
E
Eric Dumazet 已提交
979
			 * Disable IP.
L
Linus Torvalds 已提交
980
			 */
981
			fib_disable_ip(dev, 1, 0);
L
Linus Torvalds 已提交
982
		} else {
983
			rt_cache_flush(dev_net(dev), -1);
L
Linus Torvalds 已提交
984 985 986 987 988 989 990 991 992
		}
		break;
	}
	return NOTIFY_DONE;
}

static int fib_netdev_event(struct notifier_block *this, unsigned long event, void *ptr)
{
	struct net_device *dev = ptr;
993
	struct in_device *in_dev = __in_dev_get_rtnl(dev);
994
	struct net *net = dev_net(dev);
L
Linus Torvalds 已提交
995 996

	if (event == NETDEV_UNREGISTER) {
997
		fib_disable_ip(dev, 2, -1);
L
Linus Torvalds 已提交
998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011
		return NOTIFY_DONE;
	}

	if (!in_dev)
		return NOTIFY_DONE;

	switch (event) {
	case NETDEV_UP:
		for_ifa(in_dev) {
			fib_add_ifaddr(ifa);
		} endfor_ifa(in_dev);
#ifdef CONFIG_IP_ROUTE_MULTIPATH
		fib_sync_up(dev);
#endif
1012
		atomic_inc(&net->ipv4.dev_addr_genid);
1013
		rt_cache_flush(dev_net(dev), -1);
L
Linus Torvalds 已提交
1014 1015
		break;
	case NETDEV_DOWN:
1016
		fib_disable_ip(dev, 0, 0);
L
Linus Torvalds 已提交
1017 1018 1019
		break;
	case NETDEV_CHANGEMTU:
	case NETDEV_CHANGE:
1020
		rt_cache_flush(dev_net(dev), 0);
L
Linus Torvalds 已提交
1021
		break;
1022
	case NETDEV_UNREGISTER_BATCH:
1023 1024 1025 1026 1027
		/* The batch unregister is only called on the first
		 * device in the list of devices being unregistered.
		 * Therefore we should not pass dev_net(dev) in here.
		 */
		rt_cache_flush_batch(NULL);
1028
		break;
L
Linus Torvalds 已提交
1029 1030 1031 1032 1033
	}
	return NOTIFY_DONE;
}

static struct notifier_block fib_inetaddr_notifier = {
1034
	.notifier_call = fib_inetaddr_event,
L
Linus Torvalds 已提交
1035 1036 1037
};

static struct notifier_block fib_netdev_notifier = {
1038
	.notifier_call = fib_netdev_event,
L
Linus Torvalds 已提交
1039 1040
};

1041
/*
 * Allocate the namespace's FIB table hash and set up its initial tables
 * through fib4_rules_init().  Returns 0, -ENOMEM if the hash cannot be
 * allocated, or the fib4_rules_init() error (the hash is freed again in
 * that case).
 */
static int __net_init ip_fib_net_init(struct net *net)
{
	int err;
	size_t size = sizeof(struct hlist_head) * FIB_TABLE_HASHSZ;

	/* Avoid false sharing : Use at least a full cache line */
	size = max_t(size_t, size, L1_CACHE_BYTES);

	net->ipv4.fib_table_hash = kzalloc(size, GFP_KERNEL);
	if (net->ipv4.fib_table_hash == NULL)
		return -ENOMEM;

	err = fib4_rules_init(net);
	if (err < 0)
		goto fail;
	return 0;

fail:
	kfree(net->ipv4.fib_table_hash);
	return err;
}
L
Linus Torvalds 已提交
1062

1063
static void ip_fib_net_exit(struct net *net)
1064 1065 1066 1067 1068 1069 1070
{
	unsigned int i;

#ifdef CONFIG_IP_MULTIPLE_TABLES
	fib4_rules_exit(net);
#endif

1071
	rtnl_lock();
1072 1073 1074 1075
	for (i = 0; i < FIB_TABLE_HASHSZ; i++) {
		struct fib_table *tb;
		struct hlist_head *head;
		struct hlist_node *node, *tmp;
1076

1077
		head = &net->ipv4.fib_table_hash[i];
1078 1079
		hlist_for_each_entry_safe(tb, node, tmp, head, tb_hlist) {
			hlist_del(node);
1080
			fib_table_flush(tb);
1081
			fib_free_table(tb);
1082 1083
		}
	}
1084
	rtnl_unlock();
1085
	kfree(net->ipv4.fib_table_hash);
1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124
}

/*
 * Per-namespace setup: FIB tables first, then the lookup netlink socket,
 * then the proc entries.  A failing step unwinds the steps that already
 * succeeded and its error is returned; on success the value returned by
 * fib_proc_init() is passed through.
 */
static int __net_init fib_net_init(struct net *net)
{
	int rc;

	rc = ip_fib_net_init(net);
	if (rc < 0)
		return rc;

	rc = nl_fib_lookup_init(net);
	if (rc < 0)
		goto undo_tables;

	rc = fib_proc_init(net);
	if (rc < 0)
		goto undo_nl;

	return rc;

undo_nl:
	nl_fib_lookup_exit(net);
undo_tables:
	ip_fib_net_exit(net);
	return rc;
}

/*
 * Per-namespace teardown: undo the three setup steps of fib_net_init()
 * in reverse order (proc entries, lookup netlink socket, FIB tables).
 */
static void __net_exit fib_net_exit(struct net *net)
{
	fib_proc_exit(net);
	nl_fib_lookup_exit(net);
	ip_fib_net_exit(net);
}

/* Per-namespace init/exit hooks, registered by ip_fib_init(). */
static struct pernet_operations fib_net_ops = {
	.init = fib_net_init,
	.exit = fib_net_exit,
};

/*
 * Boot-time setup of the IPv4 FIB frontend: register the rtnetlink route
 * handlers, the per-namespace operations and the device/address
 * notifiers, then initialise the trie backend.
 *
 * NOTE(review): the return values of the register_*() calls are not
 * checked here - confirm that a failure at this point is acceptable.
 */
void __init ip_fib_init(void)
{
	rtnl_register(PF_INET, RTM_NEWROUTE, inet_rtm_newroute, NULL);
	rtnl_register(PF_INET, RTM_DELROUTE, inet_rtm_delroute, NULL);
	rtnl_register(PF_INET, RTM_GETROUTE, NULL, inet_dump_fib);

	register_pernet_subsys(&fib_net_ops);
	register_netdevice_notifier(&fib_netdev_notifier);
	register_inetaddr_notifier(&fib_inetaddr_notifier);

	fib_trie_init();
}