fib_frontend.c 23.9 KB
Newer Older
L
Linus Torvalds 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		IPv4 Forwarding Information Base: FIB frontend.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */

#include <linux/module.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/bitops.h>
20
#include <linux/capability.h>
L
Linus Torvalds 已提交
21 22 23 24 25 26 27 28 29
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
30
#include <linux/inetdevice.h>
L
Linus Torvalds 已提交
31
#include <linux/netdevice.h>
32
#include <linux/if_addr.h>
L
Linus Torvalds 已提交
33 34 35
#include <linux/if_arp.h>
#include <linux/skbuff.h>
#include <linux/init.h>
36
#include <linux/list.h>
37
#include <linux/slab.h>
L
Linus Torvalds 已提交
38 39 40 41 42 43 44 45

#include <net/ip.h>
#include <net/protocol.h>
#include <net/route.h>
#include <net/tcp.h>
#include <net/sock.h>
#include <net/arp.h>
#include <net/ip_fib.h>
46
#include <net/rtnetlink.h>
L
Linus Torvalds 已提交
47 48 49

#ifndef CONFIG_IP_MULTIPLE_TABLES

50
/*
 * Create the local and main FIB tables and link them into the
 * per-namespace table hash.
 *
 * Returns 0 on success, -ENOMEM if either table cannot be allocated.
 */
static int __net_init fib4_rules_init(struct net *net)
{
	struct fib_table *local_tb;
	struct fib_table *main_tb;

	local_tb = fib_trie_table(RT_TABLE_LOCAL);
	if (!local_tb)
		return -ENOMEM;

	main_tb = fib_trie_table(RT_TABLE_MAIN);
	if (!main_tb) {
		/* local_tb has not been linked into the hash yet. */
		kfree(local_tb);
		return -ENOMEM;
	}

	hlist_add_head_rcu(&local_tb->tb_hlist,
			   &net->ipv4.fib_table_hash[TABLE_LOCAL_INDEX]);
	hlist_add_head_rcu(&main_tb->tb_hlist,
			   &net->ipv4.fib_table_hash[TABLE_MAIN_INDEX]);
	return 0;
}
72
#else
L
Linus Torvalds 已提交
73

74
/*
 * Return the FIB table with the given id, creating and hashing it if
 * it does not exist yet.  An id of 0 is treated as RT_TABLE_MAIN.
 *
 * Returns NULL if a new table is needed but cannot be allocated.
 */
struct fib_table *fib_new_table(struct net *net, u32 id)
{
	struct fib_table *table;

	if (!id)
		id = RT_TABLE_MAIN;

	table = fib_get_table(net, id);
	if (table != NULL)
		return table;

	table = fib_trie_table(id);
	if (table != NULL) {
		unsigned int slot = id & (FIB_TABLE_HASHSZ - 1);

		hlist_add_head_rcu(&table->tb_hlist,
				   &net->ipv4.fib_table_hash[slot]);
	}
	return table;
}

93
/*
 * Look up an existing FIB table by id in the per-namespace hash.
 * An id of 0 is treated as RT_TABLE_MAIN.
 *
 * Returns the table, or NULL if no table with that id is hashed.
 */
struct fib_table *fib_get_table(struct net *net, u32 id)
{
	struct fib_table *found = NULL;
	struct fib_table *tb;
	struct hlist_node *pos;
	struct hlist_head *bucket;
	unsigned int slot;

	if (!id)
		id = RT_TABLE_MAIN;
	slot = id & (FIB_TABLE_HASHSZ - 1);

	rcu_read_lock();
	bucket = &net->ipv4.fib_table_hash[slot];
	hlist_for_each_entry_rcu(tb, pos, bucket, tb_hlist) {
		if (tb->tb_id == id) {
			found = tb;
			break;
		}
	}
	rcu_read_unlock();

	return found;
}
L
Linus Torvalds 已提交
115 116
#endif /* CONFIG_IP_MULTIPLE_TABLES */

117
static void fib_flush(struct net *net)
L
Linus Torvalds 已提交
118 119 120
{
	int flushed = 0;
	struct fib_table *tb;
121
	struct hlist_node *node;
122
	struct hlist_head *head;
123
	unsigned int h;
L
Linus Torvalds 已提交
124

125
	for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
126 127
		head = &net->ipv4.fib_table_hash[h];
		hlist_for_each_entry(tb, node, head, tb_hlist)
128
			flushed += fib_table_flush(tb);
L
Linus Torvalds 已提交
129 130 131
	}

	if (flushed)
132
		rt_cache_flush(net, -1);
L
Linus Torvalds 已提交
133 134
}

135 136 137 138
/*
 * Find address type as if only "dev" was present in the system. If
 * dev is NULL then all interfaces are taken into consideration.
 */
139 140
static inline unsigned __inet_dev_addr_type(struct net *net,
					    const struct net_device *dev,
141
					    __be32 addr)
L
Linus Torvalds 已提交
142
{
D
David S. Miller 已提交
143
	struct flowi4		fl4 = { .daddr = addr };
L
Linus Torvalds 已提交
144 145
	struct fib_result	res;
	unsigned ret = RTN_BROADCAST;
146
	struct fib_table *local_table;
L
Linus Torvalds 已提交
147

148
	if (ipv4_is_zeronet(addr) || ipv4_is_lbcast(addr))
L
Linus Torvalds 已提交
149
		return RTN_BROADCAST;
150
	if (ipv4_is_multicast(addr))
L
Linus Torvalds 已提交
151 152 153 154 155
		return RTN_MULTICAST;

#ifdef CONFIG_IP_MULTIPLE_TABLES
	res.r = NULL;
#endif
156

157
	local_table = fib_get_table(net, RT_TABLE_LOCAL);
158
	if (local_table) {
L
Linus Torvalds 已提交
159
		ret = RTN_UNICAST;
E
Eric Dumazet 已提交
160
		rcu_read_lock();
D
David S. Miller 已提交
161
		if (!fib_table_lookup(local_table, &fl4, &res, FIB_LOOKUP_NOREF)) {
162 163
			if (!dev || dev == res.fi->fib_dev)
				ret = res.type;
L
Linus Torvalds 已提交
164
		}
E
Eric Dumazet 已提交
165
		rcu_read_unlock();
L
Linus Torvalds 已提交
166 167 168 169
	}
	return ret;
}

170
/*
 * Classify an address taking all interfaces into account
 * (no device restriction is passed to the helper).
 */
unsigned int inet_addr_type(struct net *net, __be32 addr)
{
	const struct net_device *any_dev = NULL;

	return __inet_dev_addr_type(net, any_dev, addr);
}
EXPORT_SYMBOL(inet_addr_type);
175

176 177
/*
 * Classify an address as seen from a single device; thin exported
 * wrapper around __inet_dev_addr_type().
 */
unsigned int inet_dev_addr_type(struct net *net, const struct net_device *dev,
				__be32 addr)
{
	unsigned int type = __inet_dev_addr_type(net, dev, addr);

	return type;
}
EXPORT_SYMBOL(inet_dev_addr_type);
182

L
Linus Torvalds 已提交
183
/* Given (packet source, input interface) and optional (dst, oif, tos):
 * - (main) check, that source is valid i.e. not broadcast or our local
 *   address.
 * - figure out on which "logical" interface this packet arrived
 *   and calculate "specific destination" address.
 * - check, that packet arrived from expected physical interface.
 * called with rcu_read_lock()
 */
191
int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif,
			struct net_device *dev, __be32 *spec_dst,
			u32 *itag, u32 mark)
{
	struct in_device *in_dev;
	struct flowi4 fl4;
	struct fib_result res;
	struct net *net;
	int no_addr, rpf, accept_local;
	bool dev_match = false;
	int ret;

	/* Reverse lookup: the packet source is the lookup destination. */
	fl4.flowi4_oif = 0;
	fl4.flowi4_iif = oif;
	fl4.flowi4_mark = mark;
	fl4.daddr = src;
	fl4.saddr = dst;
	fl4.flowi4_tos = tos;
	fl4.flowi4_scope = RT_SCOPE_UNIVERSE;

	in_dev = __in_dev_get_rcu(dev);
	if (in_dev == NULL)
		return -EINVAL;

	no_addr = in_dev->ifa_list == NULL;
	rpf = IN_DEV_RPFILTER(in_dev);
	accept_local = IN_DEV_ACCEPT_LOCAL(in_dev);
	if (mark && !IN_DEV_SRC_VMARK(in_dev))
		fl4.flowi4_mark = 0;

	net = dev_net(dev);
	if (fib_lookup(net, &fl4, &res))
		goto last_resort;

	/* Only unicast, or local when accept_local is set, is acceptable. */
	if (res.type != RTN_UNICAST &&
	    (res.type != RTN_LOCAL || !accept_local))
		return -EINVAL;

	*spec_dst = FIB_RES_PREFSRC(res);
	fib_combine_itag(itag, &res);

#ifdef CONFIG_IP_ROUTE_MULTIPATH
	/* Any nexthop of the route using the input device is a match. */
	for (ret = 0; ret < res.fi->fib_nhs; ret++) {
		if (res.fi->fib_nh[ret].nh_dev == dev) {
			dev_match = true;
			break;
		}
	}
#else
	if (FIB_RES_DEV(res) == dev)
		dev_match = true;
#endif
	if (dev_match)
		return FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST;

	if (no_addr)
		goto last_resort;
	if (rpf == 1)
		return -EXDEV;

	/* Second lookup, this time with the input device as oif. */
	fl4.flowi4_oif = dev->ifindex;

	ret = 0;
	if (fib_lookup(net, &fl4, &res) == 0 && res.type == RTN_UNICAST) {
		*spec_dst = FIB_RES_PREFSRC(res);
		ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST;
	}
	return ret;

last_resort:
	if (rpf)
		return -EXDEV;
	*spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
	*itag = 0;
	return 0;
}

A
Al Viro 已提交
280
static inline __be32 sk_extract_addr(struct sockaddr *addr)
281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296
{
	return ((struct sockaddr_in *) addr)->sin_addr.s_addr;
}

/*
 * Write one 4-byte attribute of the given type at byte offset @len
 * inside the buffer @mx.
 *
 * Returns the offset just past the attribute written, i.e. the value
 * to pass as @len for the next attribute.
 */
static int put_rtax(struct nlattr *mx, int len, int type, u32 value)
{
	struct nlattr *attr = (struct nlattr *) ((char *) mx + len);
	u32 *payload;

	attr->nla_type = type;
	attr->nla_len = nla_attr_size(4);

	payload = (u32 *) nla_data(attr);
	*payload = value;

	return len + nla_total_size(4);
}

297
static int rtentry_to_fib_config(struct net *net, int cmd, struct rtentry *rt,
298 299
				 struct fib_config *cfg)
{
300
	__be32 addr;
301 302 303
	int plen;

	memset(cfg, 0, sizeof(*cfg));
304
	cfg->fc_nlinfo.nl_net = net;
305 306 307 308 309 310 311 312 313 314 315 316 317 318 319

	if (rt->rt_dst.sa_family != AF_INET)
		return -EAFNOSUPPORT;

	/*
	 * Check mask for validity:
	 * a) it must be contiguous.
	 * b) destination must have all host bits clear.
	 * c) if application forgot to set correct family (AF_INET),
	 *    reject request unless it is absolutely clear i.e.
	 *    both family and mask are zero.
	 */
	plen = 32;
	addr = sk_extract_addr(&rt->rt_dst);
	if (!(rt->rt_flags & RTF_HOST)) {
A
Al Viro 已提交
320
		__be32 mask = sk_extract_addr(&rt->rt_genmask);
321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364

		if (rt->rt_genmask.sa_family != AF_INET) {
			if (mask || rt->rt_genmask.sa_family)
				return -EAFNOSUPPORT;
		}

		if (bad_mask(mask, addr))
			return -EINVAL;

		plen = inet_mask_len(mask);
	}

	cfg->fc_dst_len = plen;
	cfg->fc_dst = addr;

	if (cmd != SIOCDELRT) {
		cfg->fc_nlflags = NLM_F_CREATE;
		cfg->fc_protocol = RTPROT_BOOT;
	}

	if (rt->rt_metric)
		cfg->fc_priority = rt->rt_metric - 1;

	if (rt->rt_flags & RTF_REJECT) {
		cfg->fc_scope = RT_SCOPE_HOST;
		cfg->fc_type = RTN_UNREACHABLE;
		return 0;
	}

	cfg->fc_scope = RT_SCOPE_NOWHERE;
	cfg->fc_type = RTN_UNICAST;

	if (rt->rt_dev) {
		char *colon;
		struct net_device *dev;
		char devname[IFNAMSIZ];

		if (copy_from_user(devname, rt->rt_dev, IFNAMSIZ-1))
			return -EFAULT;

		devname[IFNAMSIZ-1] = 0;
		colon = strchr(devname, ':');
		if (colon)
			*colon = 0;
365
		dev = __dev_get_by_name(net, devname);
366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387
		if (!dev)
			return -ENODEV;
		cfg->fc_oif = dev->ifindex;
		if (colon) {
			struct in_ifaddr *ifa;
			struct in_device *in_dev = __in_dev_get_rtnl(dev);
			if (!in_dev)
				return -ENODEV;
			*colon = ':';
			for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next)
				if (strcmp(ifa->ifa_label, devname) == 0)
					break;
			if (ifa == NULL)
				return -ENODEV;
			cfg->fc_prefsrc = ifa->ifa_local;
		}
	}

	addr = sk_extract_addr(&rt->rt_gateway);
	if (rt->rt_gateway.sa_family == AF_INET && addr) {
		cfg->fc_gw = addr;
		if (rt->rt_flags & RTF_GATEWAY &&
388
		    inet_addr_type(net, addr) == RTN_UNICAST)
389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405
			cfg->fc_scope = RT_SCOPE_UNIVERSE;
	}

	if (cmd == SIOCDELRT)
		return 0;

	if (rt->rt_flags & RTF_GATEWAY && !cfg->fc_gw)
		return -EINVAL;

	if (cfg->fc_scope == RT_SCOPE_NOWHERE)
		cfg->fc_scope = RT_SCOPE_LINK;

	if (rt->rt_flags & (RTF_MTU | RTF_WINDOW | RTF_IRTT)) {
		struct nlattr *mx;
		int len = 0;

		mx = kzalloc(3 * nla_total_size(4), GFP_KERNEL);
406
		if (mx == NULL)
407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424
			return -ENOMEM;

		if (rt->rt_flags & RTF_MTU)
			len = put_rtax(mx, len, RTAX_ADVMSS, rt->rt_mtu - 40);

		if (rt->rt_flags & RTF_WINDOW)
			len = put_rtax(mx, len, RTAX_WINDOW, rt->rt_window);

		if (rt->rt_flags & RTF_IRTT)
			len = put_rtax(mx, len, RTAX_RTT, rt->rt_irtt << 3);

		cfg->fc_mx = mx;
		cfg->fc_mx_len = len;
	}

	return 0;
}

L
Linus Torvalds 已提交
425
/*
 * Handle IP routing ioctl calls.
 * These are used to manipulate the routing tables.
 */
429
int ip_rt_ioctl(struct net *net, unsigned int cmd, void __user *arg)
{
	struct fib_config cfg;
	struct fib_table *tb;
	struct rtentry rt;
	int err;

	/* Only "add route" and "delete route" are handled here. */
	if (cmd != SIOCADDRT && cmd != SIOCDELRT)
		return -EINVAL;

	if (!capable(CAP_NET_ADMIN))
		return -EPERM;

	if (copy_from_user(&rt, arg, sizeof(rt)))
		return -EFAULT;

	rtnl_lock();
	err = rtentry_to_fib_config(net, cmd, &rt, &cfg);
	if (err != 0)
		goto unlock;

	if (cmd == SIOCDELRT) {
		tb = fib_get_table(net, cfg.fc_table);
		err = tb ? fib_table_delete(tb, &cfg) : -ESRCH;
	} else {
		tb = fib_new_table(net, cfg.fc_table);
		err = tb ? fib_table_insert(tb, &cfg) : -ENOBUFS;
	}

	/* allocated by rtentry_to_fib_config() */
	kfree(cfg.fc_mx);

unlock:
	rtnl_unlock();
	return err;
}

E
Eric Dumazet 已提交
472
const struct nla_policy rtm_ipv4_policy[RTA_MAX + 1] = {
473 474 475 476 477 478 479 480
	[RTA_DST]		= { .type = NLA_U32 },
	[RTA_SRC]		= { .type = NLA_U32 },
	[RTA_IIF]		= { .type = NLA_U32 },
	[RTA_OIF]		= { .type = NLA_U32 },
	[RTA_GATEWAY]		= { .type = NLA_U32 },
	[RTA_PRIORITY]		= { .type = NLA_U32 },
	[RTA_PREFSRC]		= { .type = NLA_U32 },
	[RTA_METRICS]		= { .type = NLA_NESTED },
481
	[RTA_MULTIPATH]		= { .len = sizeof(struct rtnexthop) },
482 483 484
	[RTA_FLOW]		= { .type = NLA_U32 },
};

485
static int rtm_to_fib_config(struct net *net, struct sk_buff *skb,
E
Eric Dumazet 已提交
486
			     struct nlmsghdr *nlh, struct fib_config *cfg)
L
Linus Torvalds 已提交
487
{
488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509
	struct nlattr *attr;
	int err, remaining;
	struct rtmsg *rtm;

	err = nlmsg_validate(nlh, sizeof(*rtm), RTA_MAX, rtm_ipv4_policy);
	if (err < 0)
		goto errout;

	memset(cfg, 0, sizeof(*cfg));

	rtm = nlmsg_data(nlh);
	cfg->fc_dst_len = rtm->rtm_dst_len;
	cfg->fc_tos = rtm->rtm_tos;
	cfg->fc_table = rtm->rtm_table;
	cfg->fc_protocol = rtm->rtm_protocol;
	cfg->fc_scope = rtm->rtm_scope;
	cfg->fc_type = rtm->rtm_type;
	cfg->fc_flags = rtm->rtm_flags;
	cfg->fc_nlflags = nlh->nlmsg_flags;

	cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid;
	cfg->fc_nlinfo.nlh = nlh;
510
	cfg->fc_nlinfo.nl_net = net;
511

512 513 514 515 516
	if (cfg->fc_type > RTN_MAX) {
		err = -EINVAL;
		goto errout;
	}

517
	nlmsg_for_each_attr(attr, nlh, sizeof(struct rtmsg), remaining) {
518
		switch (nla_type(attr)) {
519
		case RTA_DST:
520
			cfg->fc_dst = nla_get_be32(attr);
521 522 523 524 525
			break;
		case RTA_OIF:
			cfg->fc_oif = nla_get_u32(attr);
			break;
		case RTA_GATEWAY:
526
			cfg->fc_gw = nla_get_be32(attr);
527 528 529 530 531
			break;
		case RTA_PRIORITY:
			cfg->fc_priority = nla_get_u32(attr);
			break;
		case RTA_PREFSRC:
532
			cfg->fc_prefsrc = nla_get_be32(attr);
533 534 535 536 537 538 539 540 541 542 543 544 545 546 547
			break;
		case RTA_METRICS:
			cfg->fc_mx = nla_data(attr);
			cfg->fc_mx_len = nla_len(attr);
			break;
		case RTA_MULTIPATH:
			cfg->fc_mp = nla_data(attr);
			cfg->fc_mp_len = nla_len(attr);
			break;
		case RTA_FLOW:
			cfg->fc_flow = nla_get_u32(attr);
			break;
		case RTA_TABLE:
			cfg->fc_table = nla_get_u32(attr);
			break;
L
Linus Torvalds 已提交
548 549
		}
	}
550

L
Linus Torvalds 已提交
551
	return 0;
552 553
errout:
	return err;
L
Linus Torvalds 已提交
554 555
}

556
static int inet_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
L
Linus Torvalds 已提交
557
{
558
	struct net *net = sock_net(skb->sk);
559 560 561
	struct fib_config cfg;
	struct fib_table *tb;
	int err;
L
Linus Torvalds 已提交
562

563
	err = rtm_to_fib_config(net, skb, nlh, &cfg);
564 565
	if (err < 0)
		goto errout;
L
Linus Torvalds 已提交
566

567
	tb = fib_get_table(net, cfg.fc_table);
568 569 570 571 572
	if (tb == NULL) {
		err = -ESRCH;
		goto errout;
	}

573
	err = fib_table_delete(tb, &cfg);
574 575
errout:
	return err;
L
Linus Torvalds 已提交
576 577
}

578
static int inet_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
L
Linus Torvalds 已提交
579
{
580
	struct net *net = sock_net(skb->sk);
581 582 583
	struct fib_config cfg;
	struct fib_table *tb;
	int err;
L
Linus Torvalds 已提交
584

585
	err = rtm_to_fib_config(net, skb, nlh, &cfg);
586 587
	if (err < 0)
		goto errout;
L
Linus Torvalds 已提交
588

589
	tb = fib_new_table(net, cfg.fc_table);
590 591 592 593 594
	if (tb == NULL) {
		err = -ENOBUFS;
		goto errout;
	}

595
	err = fib_table_insert(tb, &cfg);
596 597
errout:
	return err;
L
Linus Torvalds 已提交
598 599
}

600
static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
L
Linus Torvalds 已提交
601
{
602
	struct net *net = sock_net(skb->sk);
603 604
	unsigned int h, s_h;
	unsigned int e = 0, s_e;
L
Linus Torvalds 已提交
605
	struct fib_table *tb;
606
	struct hlist_node *node;
607
	struct hlist_head *head;
608
	int dumped = 0;
L
Linus Torvalds 已提交
609

610 611
	if (nlmsg_len(cb->nlh) >= sizeof(struct rtmsg) &&
	    ((struct rtmsg *) nlmsg_data(cb->nlh))->rtm_flags & RTM_F_CLONED)
L
Linus Torvalds 已提交
612 613
		return ip_rt_dump(skb, cb);

614 615 616 617 618
	s_h = cb->args[0];
	s_e = cb->args[1];

	for (h = s_h; h < FIB_TABLE_HASHSZ; h++, s_e = 0) {
		e = 0;
619 620
		head = &net->ipv4.fib_table_hash[h];
		hlist_for_each_entry(tb, node, head, tb_hlist) {
621 622 623 624
			if (e < s_e)
				goto next;
			if (dumped)
				memset(&cb->args[2], 0, sizeof(cb->args) -
625
						 2 * sizeof(cb->args[0]));
626
			if (fib_table_dump(tb, skb, cb) < 0)
627 628 629 630 631
				goto out;
			dumped = 1;
next:
			e++;
		}
L
Linus Torvalds 已提交
632
	}
633 634 635
out:
	cb->args[1] = e;
	cb->args[0] = h;
L
Linus Torvalds 已提交
636 637 638 639 640

	return skb->len;
}

/* Prepare and feed intra-kernel routing request.
 * Really, it should be a netlink message, but :-( netlink
 * may not be configured, so we feed it directly
 * to the fib engine. It is legal, because all events occur
 * only when netlink is already locked.
 */
A
Al Viro 已提交
646
static void fib_magic(int cmd, int type, __be32 dst, int dst_len, struct in_ifaddr *ifa)
{
	struct net *net = dev_net(ifa->ifa_dev->dev);
	struct fib_table *table;
	struct fib_config cfg = {
		.fc_protocol = RTPROT_KERNEL,
		.fc_type = type,
		.fc_dst = dst,
		.fc_dst_len = dst_len,
		.fc_prefsrc = ifa->ifa_local,
		.fc_oif = ifa->ifa_dev->dev->ifindex,
		.fc_nlflags = NLM_F_CREATE | NLM_F_APPEND,
		.fc_nlinfo = {
			.nl_net = net,
		},
	};

	/* Unicast routes go to the main table, everything else to local. */
	table = fib_new_table(net, type == RTN_UNICAST ? RT_TABLE_MAIN
						       : RT_TABLE_LOCAL);
	if (!table)
		return;

	cfg.fc_table = table->tb_id;
	cfg.fc_scope = (type == RTN_LOCAL) ? RT_SCOPE_HOST : RT_SCOPE_LINK;

	/* Any cmd other than RTM_NEWROUTE is treated as a delete. */
	if (cmd != RTM_NEWROUTE)
		fib_table_delete(table, &cfg);
	else
		fib_table_insert(table, &cfg);
}

684
void fib_add_ifaddr(struct in_ifaddr *ifa)
L
Linus Torvalds 已提交
685 686 687 688
{
	struct in_device *in_dev = ifa->ifa_dev;
	struct net_device *dev = in_dev->dev;
	struct in_ifaddr *prim = ifa;
A
Al Viro 已提交
689 690
	__be32 mask = ifa->ifa_mask;
	__be32 addr = ifa->ifa_local;
E
Eric Dumazet 已提交
691
	__be32 prefix = ifa->ifa_address & mask;
L
Linus Torvalds 已提交
692

E
Eric Dumazet 已提交
693
	if (ifa->ifa_flags & IFA_F_SECONDARY) {
L
Linus Torvalds 已提交
694 695
		prim = inet_ifa_byprefix(in_dev, prefix, mask);
		if (prim == NULL) {
696
			printk(KERN_WARNING "fib_add_ifaddr: bug: prim == NULL\n");
L
Linus Torvalds 已提交
697 698 699 700 701 702
			return;
		}
	}

	fib_magic(RTM_NEWROUTE, RTN_LOCAL, addr, 32, prim);

E
Eric Dumazet 已提交
703
	if (!(dev->flags & IFF_UP))
L
Linus Torvalds 已提交
704 705 706
		return;

	/* Add broadcast address, if it is explicitly assigned. */
A
Al Viro 已提交
707
	if (ifa->ifa_broadcast && ifa->ifa_broadcast != htonl(0xFFFFFFFF))
L
Linus Torvalds 已提交
708 709
		fib_magic(RTM_NEWROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32, prim);

E
Eric Dumazet 已提交
710
	if (!ipv4_is_zeronet(prefix) && !(ifa->ifa_flags & IFA_F_SECONDARY) &&
L
Linus Torvalds 已提交
711
	    (prefix != addr || ifa->ifa_prefixlen < 32)) {
E
Eric Dumazet 已提交
712 713 714
		fib_magic(RTM_NEWROUTE,
			  dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST,
			  prefix, ifa->ifa_prefixlen, prim);
L
Linus Torvalds 已提交
715 716 717 718

		/* Add network specific broadcasts, when it takes a sense */
		if (ifa->ifa_prefixlen < 31) {
			fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix, 32, prim);
E
Eric Dumazet 已提交
719 720
			fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix | ~mask,
				  32, prim);
L
Linus Torvalds 已提交
721 722 723 724 725 726 727 728 729 730
		}
	}
}

static void fib_del_ifaddr(struct in_ifaddr *ifa)
{
	struct in_device *in_dev = ifa->ifa_dev;
	struct net_device *dev = in_dev->dev;
	struct in_ifaddr *ifa1;
	struct in_ifaddr *prim = ifa;
E
Eric Dumazet 已提交
731 732
	__be32 brd = ifa->ifa_address | ~ifa->ifa_mask;
	__be32 any = ifa->ifa_address & ifa->ifa_mask;
L
Linus Torvalds 已提交
733 734 735 736 737 738
#define LOCAL_OK	1
#define BRD_OK		2
#define BRD0_OK		4
#define BRD1_OK		8
	unsigned ok = 0;

E
Eric Dumazet 已提交
739 740 741 742
	if (!(ifa->ifa_flags & IFA_F_SECONDARY))
		fib_magic(RTM_DELROUTE,
			  dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST,
			  any, ifa->ifa_prefixlen, prim);
L
Linus Torvalds 已提交
743 744 745
	else {
		prim = inet_ifa_byprefix(in_dev, any, ifa->ifa_mask);
		if (prim == NULL) {
746
			printk(KERN_WARNING "fib_del_ifaddr: bug: prim == NULL\n");
L
Linus Torvalds 已提交
747 748 749 750 751
			return;
		}
	}

	/* Deletion is more complicated than add.
E
Eric Dumazet 已提交
752 753 754
	 * We should take care of not to delete too much :-)
	 *
	 * Scan address list to be sure that addresses are really gone.
L
Linus Torvalds 已提交
755 756 757 758 759 760 761 762 763 764 765 766 767
	 */

	for (ifa1 = in_dev->ifa_list; ifa1; ifa1 = ifa1->ifa_next) {
		if (ifa->ifa_local == ifa1->ifa_local)
			ok |= LOCAL_OK;
		if (ifa->ifa_broadcast == ifa1->ifa_broadcast)
			ok |= BRD_OK;
		if (brd == ifa1->ifa_broadcast)
			ok |= BRD1_OK;
		if (any == ifa1->ifa_broadcast)
			ok |= BRD0_OK;
	}

E
Eric Dumazet 已提交
768
	if (!(ok & BRD_OK))
L
Linus Torvalds 已提交
769
		fib_magic(RTM_DELROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32, prim);
E
Eric Dumazet 已提交
770
	if (!(ok & BRD1_OK))
L
Linus Torvalds 已提交
771
		fib_magic(RTM_DELROUTE, RTN_BROADCAST, brd, 32, prim);
E
Eric Dumazet 已提交
772
	if (!(ok & BRD0_OK))
L
Linus Torvalds 已提交
773
		fib_magic(RTM_DELROUTE, RTN_BROADCAST, any, 32, prim);
E
Eric Dumazet 已提交
774
	if (!(ok & LOCAL_OK)) {
L
Linus Torvalds 已提交
775 776 777
		fib_magic(RTM_DELROUTE, RTN_LOCAL, ifa->ifa_local, 32, prim);

		/* Check, that this local address finally disappeared. */
778
		if (inet_addr_type(dev_net(dev), ifa->ifa_local) != RTN_LOCAL) {
L
Linus Torvalds 已提交
779
			/* And the last, but not the least thing.
E
Eric Dumazet 已提交
780 781 782 783 784
			 * We must flush stray FIB entries.
			 *
			 * First of all, we scan fib_info list searching
			 * for stray nexthop entries, then ignite fib_flush.
			 */
785 786
			if (fib_sync_down_addr(dev_net(dev), ifa->ifa_local))
				fib_flush(dev_net(dev));
L
Linus Torvalds 已提交
787 788 789 790 791 792 793 794
		}
	}
#undef LOCAL_OK
#undef BRD_OK
#undef BRD0_OK
#undef BRD1_OK
}

E
Eric Dumazet 已提交
795
static void nl_fib_lookup(struct fib_result_nl *frn, struct fib_table *tb)
796
{
797

798
	struct fib_result       res;
D
David S. Miller 已提交
799 800 801 802 803
	struct flowi4           fl4 = {
		.flowi4_mark = frn->fl_mark,
		.daddr = frn->fl_addr,
		.flowi4_tos = frn->fl_tos,
		.flowi4_scope = frn->fl_scope,
E
Eric Dumazet 已提交
804
	};
805

806 807 808 809
#ifdef CONFIG_IP_MULTIPLE_TABLES
	res.r = NULL;
#endif

810
	frn->err = -ENOENT;
811 812 813 814
	if (tb) {
		local_bh_disable();

		frn->tb_id = tb->tb_id;
E
Eric Dumazet 已提交
815
		rcu_read_lock();
D
David S. Miller 已提交
816
		frn->err = fib_table_lookup(tb, &fl4, &res, FIB_LOOKUP_NOREF);
817 818 819 820 821 822 823

		if (!frn->err) {
			frn->prefixlen = res.prefixlen;
			frn->nh_sel = res.nh_sel;
			frn->type = res.type;
			frn->scope = res.scope;
		}
E
Eric Dumazet 已提交
824
		rcu_read_unlock();
825 826 827 828
		local_bh_enable();
	}
}

829
static void nl_fib_input(struct sk_buff *skb)
830
{
831
	struct net *net;
832
	struct fib_result_nl *frn;
833
	struct nlmsghdr *nlh;
834
	struct fib_table *tb;
835
	u32 pid;
836

837
	net = sock_net(skb->sk);
838
	nlh = nlmsg_hdr(skb);
839
	if (skb->len < NLMSG_SPACE(0) || skb->len < nlh->nlmsg_len ||
840
	    nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*frn)))
841
		return;
842 843 844 845 846

	skb = skb_clone(skb, GFP_KERNEL);
	if (skb == NULL)
		return;
	nlh = nlmsg_hdr(skb);
847

848
	frn = (struct fib_result_nl *) NLMSG_DATA(nlh);
849
	tb = fib_get_table(net, frn->tb_id_in);
850 851

	nl_fib_lookup(frn, tb);
852

E
Eric Dumazet 已提交
853 854
	pid = NETLINK_CB(skb).pid;      /* pid of sending process */
	NETLINK_CB(skb).pid = 0;        /* from kernel */
855
	NETLINK_CB(skb).dst_group = 0;  /* unicast */
856
	netlink_unicast(net->ipv4.fibnl, skb, pid, MSG_DONTWAIT);
857
}
858

859
/*
 * Create the per-namespace NETLINK_FIB_LOOKUP kernel socket and store
 * it in net->ipv4.fibnl.
 *
 * Returns 0 on success, -EAFNOSUPPORT if the socket cannot be created.
 */
static int __net_init nl_fib_lookup_init(struct net *net)
{
	struct sock *fibnl_sk = netlink_kernel_create(net, NETLINK_FIB_LOOKUP, 0,
						      nl_fib_input, NULL,
						      THIS_MODULE);

	if (!fibnl_sk)
		return -EAFNOSUPPORT;

	net->ipv4.fibnl = fibnl_sk;
	return 0;
}

static void nl_fib_lookup_exit(struct net *net)
{
872
	netlink_kernel_release(net->ipv4.fibnl);
873
	net->ipv4.fibnl = NULL;
874 875
}

876
/*
 * Take IP routing down on a device: sync routes using it, flush the
 * FIB if that reports work to do, flush the route cache with the
 * given delay and bring ARP down on the device.
 */
static void fib_disable_ip(struct net_device *dev, int force, int delay)
{
	int need_flush = fib_sync_down_dev(dev, force);

	if (need_flush)
		fib_flush(dev_net(dev));

	rt_cache_flush(dev_net(dev), delay);
	arp_ifdown(dev);
}

static int fib_inetaddr_event(struct notifier_block *this, unsigned long event, void *ptr)
{
886
	struct in_ifaddr *ifa = (struct in_ifaddr *)ptr;
887
	struct net_device *dev = ifa->ifa_dev->dev;
L
Linus Torvalds 已提交
888 889 890 891 892

	switch (event) {
	case NETDEV_UP:
		fib_add_ifaddr(ifa);
#ifdef CONFIG_IP_ROUTE_MULTIPATH
893
		fib_sync_up(dev);
L
Linus Torvalds 已提交
894
#endif
895
		fib_update_nh_saddrs(dev);
896
		rt_cache_flush(dev_net(dev), -1);
L
Linus Torvalds 已提交
897 898 899
		break;
	case NETDEV_DOWN:
		fib_del_ifaddr(ifa);
900
		fib_update_nh_saddrs(dev);
901
		if (ifa->ifa_dev->ifa_list == NULL) {
L
Linus Torvalds 已提交
902
			/* Last address was deleted from this interface.
E
Eric Dumazet 已提交
903
			 * Disable IP.
L
Linus Torvalds 已提交
904
			 */
905
			fib_disable_ip(dev, 1, 0);
L
Linus Torvalds 已提交
906
		} else {
907
			rt_cache_flush(dev_net(dev), -1);
L
Linus Torvalds 已提交
908 909 910 911 912 913 914 915 916
		}
		break;
	}
	return NOTIFY_DONE;
}

static int fib_netdev_event(struct notifier_block *this, unsigned long event, void *ptr)
{
	struct net_device *dev = ptr;
917
	struct in_device *in_dev = __in_dev_get_rtnl(dev);
L
Linus Torvalds 已提交
918 919

	if (event == NETDEV_UNREGISTER) {
920
		fib_disable_ip(dev, 2, -1);
L
Linus Torvalds 已提交
921 922 923 924 925 926 927 928 929 930 931 932 933 934
		return NOTIFY_DONE;
	}

	if (!in_dev)
		return NOTIFY_DONE;

	switch (event) {
	case NETDEV_UP:
		for_ifa(in_dev) {
			fib_add_ifaddr(ifa);
		} endfor_ifa(in_dev);
#ifdef CONFIG_IP_ROUTE_MULTIPATH
		fib_sync_up(dev);
#endif
935
		rt_cache_flush(dev_net(dev), -1);
L
Linus Torvalds 已提交
936 937
		break;
	case NETDEV_DOWN:
938
		fib_disable_ip(dev, 0, 0);
L
Linus Torvalds 已提交
939 940 941
		break;
	case NETDEV_CHANGEMTU:
	case NETDEV_CHANGE:
942
		rt_cache_flush(dev_net(dev), 0);
L
Linus Torvalds 已提交
943
		break;
944
	case NETDEV_UNREGISTER_BATCH:
945 946 947 948 949
		/* The batch unregister is only called on the first
		 * device in the list of devices being unregistered.
		 * Therefore we should not pass dev_net(dev) in here.
		 */
		rt_cache_flush_batch(NULL);
950
		break;
L
Linus Torvalds 已提交
951 952 953 954 955
	}
	return NOTIFY_DONE;
}

static struct notifier_block fib_inetaddr_notifier = {
956
	.notifier_call = fib_inetaddr_event,
L
Linus Torvalds 已提交
957 958 959
};

static struct notifier_block fib_netdev_notifier = {
960
	.notifier_call = fib_netdev_event,
L
Linus Torvalds 已提交
961 962
};

963
/*
 * Allocate the per-namespace FIB table hash and create the initial
 * tables via fib4_rules_init().
 *
 * Returns 0 on success, -ENOMEM if the hash cannot be allocated, or
 * the fib4_rules_init() error (the hash is freed in that case).
 */
static int __net_init ip_fib_net_init(struct net *net)
{
	size_t hash_bytes = sizeof(struct hlist_head) * FIB_TABLE_HASHSZ;
	int rc;

	/* Avoid false sharing : Use at least a full cache line */
	hash_bytes = max_t(size_t, hash_bytes, L1_CACHE_BYTES);

	net->ipv4.fib_table_hash = kzalloc(hash_bytes, GFP_KERNEL);
	if (!net->ipv4.fib_table_hash)
		return -ENOMEM;

	rc = fib4_rules_init(net);
	if (rc < 0) {
		kfree(net->ipv4.fib_table_hash);
		return rc;
	}

	return 0;
}
L
Linus Torvalds 已提交
984

985
static void ip_fib_net_exit(struct net *net)
986 987 988 989 990 991 992 993 994 995 996
{
	unsigned int i;

#ifdef CONFIG_IP_MULTIPLE_TABLES
	fib4_rules_exit(net);
#endif

	for (i = 0; i < FIB_TABLE_HASHSZ; i++) {
		struct fib_table *tb;
		struct hlist_head *head;
		struct hlist_node *node, *tmp;
997

998
		head = &net->ipv4.fib_table_hash[i];
999 1000
		hlist_for_each_entry_safe(tb, node, tmp, head, tb_hlist) {
			hlist_del(node);
1001
			fib_table_flush(tb);
1002
			fib_free_table(tb);
1003 1004
		}
	}
1005
	kfree(net->ipv4.fib_table_hash);
1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044
}

/*
 * Per-namespace init: FIB tables, then the lookup netlink socket, then
 * the proc entries.  On failure the steps already completed are undone
 * in reverse order and the error is returned.
 */
static int __net_init fib_net_init(struct net *net)
{
	int rc;

	rc = ip_fib_net_init(net);
	if (rc < 0)
		return rc;

	rc = nl_fib_lookup_init(net);
	if (rc < 0)
		goto undo_tables;

	rc = fib_proc_init(net);
	if (rc < 0)
		goto undo_socket;

	return rc;

undo_socket:
	nl_fib_lookup_exit(net);
undo_tables:
	ip_fib_net_exit(net);
	return rc;
}

/* Per-namespace teardown, in the reverse order of fib_net_init(). */
static void __net_exit fib_net_exit(struct net *net)
{
	/* proc entries first, then the lookup socket, then the tables */
	fib_proc_exit(net);
	nl_fib_lookup_exit(net);
	ip_fib_net_exit(net);
}

/* Per-namespace hooks, registered in ip_fib_init(). */
static struct pernet_operations fib_net_ops = {
	.init	= fib_net_init,
	.exit	= fib_net_exit,
};

/*
 * Boot-time setup of the FIB frontend: rtnetlink route handlers,
 * per-namespace operations, device/address notifiers and the trie.
 */
void __init ip_fib_init(void)
{
	/* Route add/delete handlers and the route dump callback. */
	rtnl_register(PF_INET, RTM_NEWROUTE, inet_rtm_newroute, NULL);
	rtnl_register(PF_INET, RTM_DELROUTE, inet_rtm_delroute, NULL);
	rtnl_register(PF_INET, RTM_GETROUTE, NULL, inet_dump_fib);

	/* Per-namespace state, then the event notifiers. */
	register_pernet_subsys(&fib_net_ops);
	register_netdevice_notifier(&fib_netdev_notifier);
	register_inetaddr_notifier(&fib_inetaddr_notifier);

	fib_trie_init();
}