fib_frontend.c 24.6 KB
Newer Older
L
Linus Torvalds 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		IPv4 Forwarding Information Base: FIB frontend.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */

#include <linux/module.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/bitops.h>
20
#include <linux/capability.h>
L
Linus Torvalds 已提交
21 22 23 24 25 26 27 28 29
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
30
#include <linux/inetdevice.h>
L
Linus Torvalds 已提交
31
#include <linux/netdevice.h>
32
#include <linux/if_addr.h>
L
Linus Torvalds 已提交
33 34 35
#include <linux/if_arp.h>
#include <linux/skbuff.h>
#include <linux/init.h>
36
#include <linux/list.h>
37
#include <linux/slab.h>
L
Linus Torvalds 已提交
38 39 40 41 42 43 44 45

#include <net/ip.h>
#include <net/protocol.h>
#include <net/route.h>
#include <net/tcp.h>
#include <net/sock.h>
#include <net/arp.h>
#include <net/ip_fib.h>
46
#include <net/rtnetlink.h>
L
Linus Torvalds 已提交
47 48 49

#ifndef CONFIG_IP_MULTIPLE_TABLES

50
/*
 * Built when CONFIG_IP_MULTIPLE_TABLES is not set: create the local and
 * main tables and link them into the per-namespace table hash.
 *
 * Returns 0 on success, -ENOMEM if either table could not be obtained.
 */
static int __net_init fib4_rules_init(struct net *net)
{
	struct fib_table *local_tb;
	struct fib_table *main_tb;

	local_tb = fib_hash_table(RT_TABLE_LOCAL);
	if (!local_tb)
		return -ENOMEM;

	main_tb = fib_hash_table(RT_TABLE_MAIN);
	if (!main_tb) {
		/* Do not leak the table that was obtained. */
		kfree(local_tb);
		return -ENOMEM;
	}

	hlist_add_head_rcu(&local_tb->tb_hlist,
			   &net->ipv4.fib_table_hash[TABLE_LOCAL_INDEX]);
	hlist_add_head_rcu(&main_tb->tb_hlist,
			   &net->ipv4.fib_table_hash[TABLE_MAIN_INDEX]);
	return 0;
}
72
#else
L
Linus Torvalds 已提交
73

74
/*
 * Return the table with the given id, creating it and linking it into the
 * namespace's table hash if it does not exist yet.  An id of 0 selects
 * RT_TABLE_MAIN.  Returns NULL when a new table cannot be created.
 */
struct fib_table *fib_new_table(struct net *net, u32 id)
{
	struct fib_table *table;
	unsigned int bucket;

	if (!id)
		id = RT_TABLE_MAIN;

	table = fib_get_table(net, id);
	if (!table) {
		table = fib_hash_table(id);
		if (table) {
			bucket = id & (FIB_TABLE_HASHSZ - 1);
			hlist_add_head_rcu(&table->tb_hlist,
					   &net->ipv4.fib_table_hash[bucket]);
		}
	}
	return table;
}

93
/*
 * Look up the table with the given id in the namespace's table hash.
 * An id of 0 selects RT_TABLE_MAIN.  Returns NULL if no such table exists.
 * The hash chain is walked under rcu_read_lock().
 */
struct fib_table *fib_get_table(struct net *net, u32 id)
{
	struct fib_table *found = NULL;
	struct fib_table *cur;
	struct hlist_node *pos;
	struct hlist_head *bucket;
	unsigned int idx;

	if (!id)
		id = RT_TABLE_MAIN;
	idx = id & (FIB_TABLE_HASHSZ - 1);

	rcu_read_lock();
	bucket = &net->ipv4.fib_table_hash[idx];
	hlist_for_each_entry_rcu(cur, pos, bucket, tb_hlist) {
		if (cur->tb_id == id) {
			found = cur;
			break;
		}
	}
	rcu_read_unlock();
	return found;
}
L
Linus Torvalds 已提交
115 116
#endif /* CONFIG_IP_MULTIPLE_TABLES */

117
static void fib_flush(struct net *net)
L
Linus Torvalds 已提交
118 119 120
{
	int flushed = 0;
	struct fib_table *tb;
121
	struct hlist_node *node;
122
	struct hlist_head *head;
123
	unsigned int h;
L
Linus Torvalds 已提交
124

125
	for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
126 127
		head = &net->ipv4.fib_table_hash[h];
		hlist_for_each_entry(tb, node, head, tb_hlist)
128
			flushed += fib_table_flush(tb);
L
Linus Torvalds 已提交
129 130 131
	}

	if (flushed)
132
		rt_cache_flush(net, -1);
L
Linus Torvalds 已提交
133 134
}

E
Eric Dumazet 已提交
135 136 137 138 139 140
/**
 * __ip_dev_find - find the first device with a given source address.
 * @net: the net namespace
 * @addr: the source address
 * @devref: if true, take a reference on the found device
 *
141
 * If a caller uses devref=false, it should be protected by RCU, or RTNL
L
Linus Torvalds 已提交
142
 */
E
Eric Dumazet 已提交
143
struct net_device *__ip_dev_find(struct net *net, __be32 addr, bool devref)
L
Linus Torvalds 已提交
144
{
E
Eric Dumazet 已提交
145
	struct flowi fl = {
146
		.fl4_dst = addr,
E
Eric Dumazet 已提交
147 148
	};
	struct fib_result res = { 0 };
L
Linus Torvalds 已提交
149
	struct net_device *dev = NULL;
150 151 152 153 154
	struct fib_table *local_table;

#ifdef CONFIG_IP_MULTIPLE_TABLES
	res.r = NULL;
#endif
L
Linus Torvalds 已提交
155

E
Eric Dumazet 已提交
156
	rcu_read_lock();
157 158 159
	local_table = fib_get_table(net, RT_TABLE_LOCAL);
	if (!local_table ||
	    fib_table_lookup(local_table, &fl, &res, FIB_LOOKUP_NOREF)) {
E
Eric Dumazet 已提交
160
		rcu_read_unlock();
L
Linus Torvalds 已提交
161
		return NULL;
E
Eric Dumazet 已提交
162
	}
L
Linus Torvalds 已提交
163 164 165 166
	if (res.type != RTN_LOCAL)
		goto out;
	dev = FIB_RES_DEV(res);

E
Eric Dumazet 已提交
167
	if (dev && devref)
L
Linus Torvalds 已提交
168 169
		dev_hold(dev);
out:
E
Eric Dumazet 已提交
170
	rcu_read_unlock();
L
Linus Torvalds 已提交
171 172
	return dev;
}
E
Eric Dumazet 已提交
173
EXPORT_SYMBOL(__ip_dev_find);
L
Linus Torvalds 已提交
174

175 176 177 178
/*
 * Find address type as if only "dev" was present in the system. If
 * on_dev is NULL then all interfaces are taken into consideration.
 */
179 180
static inline unsigned __inet_dev_addr_type(struct net *net,
					    const struct net_device *dev,
181
					    __be32 addr)
L
Linus Torvalds 已提交
182
{
183
	struct flowi		fl = { .fl4_dst = addr };
L
Linus Torvalds 已提交
184 185
	struct fib_result	res;
	unsigned ret = RTN_BROADCAST;
186
	struct fib_table *local_table;
L
Linus Torvalds 已提交
187

188
	if (ipv4_is_zeronet(addr) || ipv4_is_lbcast(addr))
L
Linus Torvalds 已提交
189
		return RTN_BROADCAST;
190
	if (ipv4_is_multicast(addr))
L
Linus Torvalds 已提交
191 192 193 194 195
		return RTN_MULTICAST;

#ifdef CONFIG_IP_MULTIPLE_TABLES
	res.r = NULL;
#endif
196

197
	local_table = fib_get_table(net, RT_TABLE_LOCAL);
198
	if (local_table) {
L
Linus Torvalds 已提交
199
		ret = RTN_UNICAST;
E
Eric Dumazet 已提交
200 201
		rcu_read_lock();
		if (!fib_table_lookup(local_table, &fl, &res, FIB_LOOKUP_NOREF)) {
202 203
			if (!dev || dev == res.fi->fib_dev)
				ret = res.type;
L
Linus Torvalds 已提交
204
		}
E
Eric Dumazet 已提交
205
		rcu_read_unlock();
L
Linus Torvalds 已提交
206 207 208 209
	}
	return ret;
}

210
/* Address type of @addr, taking all interfaces of @net into account. */
unsigned int inet_addr_type(struct net *net, __be32 addr)
{
	const struct net_device *any_dev = NULL;

	return __inet_dev_addr_type(net, any_dev, addr);
}
EXPORT_SYMBOL(inet_addr_type);
215

216 217
/* Address type of @addr as seen when only @dev is considered. */
unsigned int inet_dev_addr_type(struct net *net, const struct net_device *dev,
				__be32 addr)
{
	unsigned int type = __inet_dev_addr_type(net, dev, addr);

	return type;
}
EXPORT_SYMBOL(inet_dev_addr_type);
222

L
Linus Torvalds 已提交
223
/* Given (packet source, input interface) and optional (dst, oif, tos):
E
Eric Dumazet 已提交
224 225 226 227 228
 * - (main) check, that source is valid i.e. not broadcast or our local
 *   address.
 * - figure out what "logical" interface this packet arrived
 *   and calculate "specific destination" address.
 * - check, that packet arrived from expected physical interface.
E
Eric Dumazet 已提交
229
 * called with rcu_read_lock()
L
Linus Torvalds 已提交
230
 */
231
int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif,
J
jamal 已提交
232 233
			struct net_device *dev, __be32 *spec_dst,
			u32 *itag, u32 mark)
L
Linus Torvalds 已提交
234 235
{
	struct in_device *in_dev;
E
Eric Dumazet 已提交
236
	struct flowi fl = {
237 238 239
		.fl4_dst = src,
		.fl4_src = dst,
		.fl4_tos = tos,
E
Eric Dumazet 已提交
240 241 242
		.mark = mark,
		.iif = oif
	};
L
Linus Torvalds 已提交
243
	struct fib_result res;
244
	int no_addr, rpf, accept_local;
245
	bool dev_match;
L
Linus Torvalds 已提交
246
	int ret;
247
	struct net *net;
L
Linus Torvalds 已提交
248

249
	no_addr = rpf = accept_local = 0;
250
	in_dev = __in_dev_get_rcu(dev);
L
Linus Torvalds 已提交
251 252 253
	if (in_dev) {
		no_addr = in_dev->ifa_list == NULL;
		rpf = IN_DEV_RPFILTER(in_dev);
254
		accept_local = IN_DEV_ACCEPT_LOCAL(in_dev);
255 256
		if (mark && !IN_DEV_SRC_VMARK(in_dev))
			fl.mark = 0;
L
Linus Torvalds 已提交
257 258 259 260 261
	}

	if (in_dev == NULL)
		goto e_inval;

262
	net = dev_net(dev);
263
	if (fib_lookup(net, &fl, &res))
L
Linus Torvalds 已提交
264
		goto last_resort;
265 266
	if (res.type != RTN_UNICAST) {
		if (res.type != RTN_LOCAL || !accept_local)
E
Eric Dumazet 已提交
267
			goto e_inval;
268
	}
L
Linus Torvalds 已提交
269 270
	*spec_dst = FIB_RES_PREFSRC(res);
	fib_combine_itag(itag, &res);
271 272
	dev_match = false;

L
Linus Torvalds 已提交
273
#ifdef CONFIG_IP_ROUTE_MULTIPATH
274 275 276 277 278 279 280 281
	for (ret = 0; ret < res.fi->fib_nhs; ret++) {
		struct fib_nh *nh = &res.fi->fib_nh[ret];

		if (nh->nh_dev == dev) {
			dev_match = true;
			break;
		}
	}
L
Linus Torvalds 已提交
282 283
#else
	if (FIB_RES_DEV(res) == dev)
284
		dev_match = true;
L
Linus Torvalds 已提交
285
#endif
286
	if (dev_match) {
L
Linus Torvalds 已提交
287 288 289 290 291
		ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST;
		return ret;
	}
	if (no_addr)
		goto last_resort;
292
	if (rpf == 1)
293
		goto e_rpf;
L
Linus Torvalds 已提交
294 295 296
	fl.oif = dev->ifindex;

	ret = 0;
297
	if (fib_lookup(net, &fl, &res) == 0) {
L
Linus Torvalds 已提交
298 299 300 301 302 303 304 305 306
		if (res.type == RTN_UNICAST) {
			*spec_dst = FIB_RES_PREFSRC(res);
			ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST;
		}
	}
	return ret;

last_resort:
	if (rpf)
307
		goto e_rpf;
L
Linus Torvalds 已提交
308 309 310 311 312 313
	*spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
	*itag = 0;
	return 0;

e_inval:
	return -EINVAL;
314 315
e_rpf:
	return -EXDEV;
L
Linus Torvalds 已提交
316 317
}

A
Al Viro 已提交
318
static inline __be32 sk_extract_addr(struct sockaddr *addr)
319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334
{
	return ((struct sockaddr_in *) addr)->sin_addr.s_addr;
}

/*
 * Write one 4-byte attribute of the given type at byte offset @len inside
 * the buffer @mx and return the offset just past it.  The caller must
 * have sized @mx for every attribute it appends.
 */
static int put_rtax(struct nlattr *mx, int len, int type, u32 value)
{
	struct nlattr *attr = (struct nlattr *) ((char *) mx + len);
	u32 *payload;

	attr->nla_type = type;
	attr->nla_len = nla_attr_size(4);

	payload = nla_data(attr);
	*payload = value;

	return len + nla_total_size(4);
}

335
static int rtentry_to_fib_config(struct net *net, int cmd, struct rtentry *rt,
336 337
				 struct fib_config *cfg)
{
338
	__be32 addr;
339 340 341
	int plen;

	memset(cfg, 0, sizeof(*cfg));
342
	cfg->fc_nlinfo.nl_net = net;
343 344 345 346 347 348 349 350 351 352 353 354 355 356 357

	if (rt->rt_dst.sa_family != AF_INET)
		return -EAFNOSUPPORT;

	/*
	 * Check mask for validity:
	 * a) it must be contiguous.
	 * b) destination must have all host bits clear.
	 * c) if application forgot to set correct family (AF_INET),
	 *    reject request unless it is absolutely clear i.e.
	 *    both family and mask are zero.
	 */
	plen = 32;
	addr = sk_extract_addr(&rt->rt_dst);
	if (!(rt->rt_flags & RTF_HOST)) {
A
Al Viro 已提交
358
		__be32 mask = sk_extract_addr(&rt->rt_genmask);
359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402

		if (rt->rt_genmask.sa_family != AF_INET) {
			if (mask || rt->rt_genmask.sa_family)
				return -EAFNOSUPPORT;
		}

		if (bad_mask(mask, addr))
			return -EINVAL;

		plen = inet_mask_len(mask);
	}

	cfg->fc_dst_len = plen;
	cfg->fc_dst = addr;

	if (cmd != SIOCDELRT) {
		cfg->fc_nlflags = NLM_F_CREATE;
		cfg->fc_protocol = RTPROT_BOOT;
	}

	if (rt->rt_metric)
		cfg->fc_priority = rt->rt_metric - 1;

	if (rt->rt_flags & RTF_REJECT) {
		cfg->fc_scope = RT_SCOPE_HOST;
		cfg->fc_type = RTN_UNREACHABLE;
		return 0;
	}

	cfg->fc_scope = RT_SCOPE_NOWHERE;
	cfg->fc_type = RTN_UNICAST;

	if (rt->rt_dev) {
		char *colon;
		struct net_device *dev;
		char devname[IFNAMSIZ];

		if (copy_from_user(devname, rt->rt_dev, IFNAMSIZ-1))
			return -EFAULT;

		devname[IFNAMSIZ-1] = 0;
		colon = strchr(devname, ':');
		if (colon)
			*colon = 0;
403
		dev = __dev_get_by_name(net, devname);
404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425
		if (!dev)
			return -ENODEV;
		cfg->fc_oif = dev->ifindex;
		if (colon) {
			struct in_ifaddr *ifa;
			struct in_device *in_dev = __in_dev_get_rtnl(dev);
			if (!in_dev)
				return -ENODEV;
			*colon = ':';
			for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next)
				if (strcmp(ifa->ifa_label, devname) == 0)
					break;
			if (ifa == NULL)
				return -ENODEV;
			cfg->fc_prefsrc = ifa->ifa_local;
		}
	}

	addr = sk_extract_addr(&rt->rt_gateway);
	if (rt->rt_gateway.sa_family == AF_INET && addr) {
		cfg->fc_gw = addr;
		if (rt->rt_flags & RTF_GATEWAY &&
426
		    inet_addr_type(net, addr) == RTN_UNICAST)
427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443
			cfg->fc_scope = RT_SCOPE_UNIVERSE;
	}

	if (cmd == SIOCDELRT)
		return 0;

	if (rt->rt_flags & RTF_GATEWAY && !cfg->fc_gw)
		return -EINVAL;

	if (cfg->fc_scope == RT_SCOPE_NOWHERE)
		cfg->fc_scope = RT_SCOPE_LINK;

	if (rt->rt_flags & (RTF_MTU | RTF_WINDOW | RTF_IRTT)) {
		struct nlattr *mx;
		int len = 0;

		mx = kzalloc(3 * nla_total_size(4), GFP_KERNEL);
444
		if (mx == NULL)
445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462
			return -ENOMEM;

		if (rt->rt_flags & RTF_MTU)
			len = put_rtax(mx, len, RTAX_ADVMSS, rt->rt_mtu - 40);

		if (rt->rt_flags & RTF_WINDOW)
			len = put_rtax(mx, len, RTAX_WINDOW, rt->rt_window);

		if (rt->rt_flags & RTF_IRTT)
			len = put_rtax(mx, len, RTAX_RTT, rt->rt_irtt << 3);

		cfg->fc_mx = mx;
		cfg->fc_mx_len = len;
	}

	return 0;
}

L
Linus Torvalds 已提交
463
/*
E
Eric Dumazet 已提交
464 465
 * Handle IP routing ioctl calls.
 * These are used to manipulate the routing tables
L
Linus Torvalds 已提交
466
 */
467
int ip_rt_ioctl(struct net *net, unsigned int cmd, void __user *arg)
L
Linus Torvalds 已提交
468
{
469 470
	struct fib_config cfg;
	struct rtentry rt;
L
Linus Torvalds 已提交
471 472 473 474 475 476 477
	int err;

	switch (cmd) {
	case SIOCADDRT:		/* Add a route */
	case SIOCDELRT:		/* Delete a route */
		if (!capable(CAP_NET_ADMIN))
			return -EPERM;
478 479

		if (copy_from_user(&rt, arg, sizeof(rt)))
L
Linus Torvalds 已提交
480
			return -EFAULT;
481

L
Linus Torvalds 已提交
482
		rtnl_lock();
483
		err = rtentry_to_fib_config(net, cmd, &rt, &cfg);
L
Linus Torvalds 已提交
484
		if (err == 0) {
485 486
			struct fib_table *tb;

L
Linus Torvalds 已提交
487
			if (cmd == SIOCDELRT) {
488
				tb = fib_get_table(net, cfg.fc_table);
L
Linus Torvalds 已提交
489
				if (tb)
490
					err = fib_table_delete(tb, &cfg);
491 492
				else
					err = -ESRCH;
L
Linus Torvalds 已提交
493
			} else {
494
				tb = fib_new_table(net, cfg.fc_table);
L
Linus Torvalds 已提交
495
				if (tb)
496
					err = fib_table_insert(tb, &cfg);
497 498
				else
					err = -ENOBUFS;
L
Linus Torvalds 已提交
499
			}
500 501 502

			/* allocated by rtentry_to_fib_config() */
			kfree(cfg.fc_mx);
L
Linus Torvalds 已提交
503 504 505 506 507 508 509
		}
		rtnl_unlock();
		return err;
	}
	return -EINVAL;
}

E
Eric Dumazet 已提交
510
const struct nla_policy rtm_ipv4_policy[RTA_MAX + 1] = {
511 512 513 514 515 516 517 518
	[RTA_DST]		= { .type = NLA_U32 },
	[RTA_SRC]		= { .type = NLA_U32 },
	[RTA_IIF]		= { .type = NLA_U32 },
	[RTA_OIF]		= { .type = NLA_U32 },
	[RTA_GATEWAY]		= { .type = NLA_U32 },
	[RTA_PRIORITY]		= { .type = NLA_U32 },
	[RTA_PREFSRC]		= { .type = NLA_U32 },
	[RTA_METRICS]		= { .type = NLA_NESTED },
519
	[RTA_MULTIPATH]		= { .len = sizeof(struct rtnexthop) },
520 521 522
	[RTA_FLOW]		= { .type = NLA_U32 },
};

523
static int rtm_to_fib_config(struct net *net, struct sk_buff *skb,
E
Eric Dumazet 已提交
524
			     struct nlmsghdr *nlh, struct fib_config *cfg)
L
Linus Torvalds 已提交
525
{
526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547
	struct nlattr *attr;
	int err, remaining;
	struct rtmsg *rtm;

	err = nlmsg_validate(nlh, sizeof(*rtm), RTA_MAX, rtm_ipv4_policy);
	if (err < 0)
		goto errout;

	memset(cfg, 0, sizeof(*cfg));

	rtm = nlmsg_data(nlh);
	cfg->fc_dst_len = rtm->rtm_dst_len;
	cfg->fc_tos = rtm->rtm_tos;
	cfg->fc_table = rtm->rtm_table;
	cfg->fc_protocol = rtm->rtm_protocol;
	cfg->fc_scope = rtm->rtm_scope;
	cfg->fc_type = rtm->rtm_type;
	cfg->fc_flags = rtm->rtm_flags;
	cfg->fc_nlflags = nlh->nlmsg_flags;

	cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid;
	cfg->fc_nlinfo.nlh = nlh;
548
	cfg->fc_nlinfo.nl_net = net;
549

550 551 552 553 554
	if (cfg->fc_type > RTN_MAX) {
		err = -EINVAL;
		goto errout;
	}

555
	nlmsg_for_each_attr(attr, nlh, sizeof(struct rtmsg), remaining) {
556
		switch (nla_type(attr)) {
557
		case RTA_DST:
558
			cfg->fc_dst = nla_get_be32(attr);
559 560 561 562 563
			break;
		case RTA_OIF:
			cfg->fc_oif = nla_get_u32(attr);
			break;
		case RTA_GATEWAY:
564
			cfg->fc_gw = nla_get_be32(attr);
565 566 567 568 569
			break;
		case RTA_PRIORITY:
			cfg->fc_priority = nla_get_u32(attr);
			break;
		case RTA_PREFSRC:
570
			cfg->fc_prefsrc = nla_get_be32(attr);
571 572 573 574 575 576 577 578 579 580 581 582 583 584 585
			break;
		case RTA_METRICS:
			cfg->fc_mx = nla_data(attr);
			cfg->fc_mx_len = nla_len(attr);
			break;
		case RTA_MULTIPATH:
			cfg->fc_mp = nla_data(attr);
			cfg->fc_mp_len = nla_len(attr);
			break;
		case RTA_FLOW:
			cfg->fc_flow = nla_get_u32(attr);
			break;
		case RTA_TABLE:
			cfg->fc_table = nla_get_u32(attr);
			break;
L
Linus Torvalds 已提交
586 587
		}
	}
588

L
Linus Torvalds 已提交
589
	return 0;
590 591
errout:
	return err;
L
Linus Torvalds 已提交
592 593
}

594
static int inet_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
L
Linus Torvalds 已提交
595
{
596
	struct net *net = sock_net(skb->sk);
597 598 599
	struct fib_config cfg;
	struct fib_table *tb;
	int err;
L
Linus Torvalds 已提交
600

601
	err = rtm_to_fib_config(net, skb, nlh, &cfg);
602 603
	if (err < 0)
		goto errout;
L
Linus Torvalds 已提交
604

605
	tb = fib_get_table(net, cfg.fc_table);
606 607 608 609 610
	if (tb == NULL) {
		err = -ESRCH;
		goto errout;
	}

611
	err = fib_table_delete(tb, &cfg);
612 613
errout:
	return err;
L
Linus Torvalds 已提交
614 615
}

616
static int inet_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
L
Linus Torvalds 已提交
617
{
618
	struct net *net = sock_net(skb->sk);
619 620 621
	struct fib_config cfg;
	struct fib_table *tb;
	int err;
L
Linus Torvalds 已提交
622

623
	err = rtm_to_fib_config(net, skb, nlh, &cfg);
624 625
	if (err < 0)
		goto errout;
L
Linus Torvalds 已提交
626

627
	tb = fib_new_table(net, cfg.fc_table);
628 629 630 631 632
	if (tb == NULL) {
		err = -ENOBUFS;
		goto errout;
	}

633
	err = fib_table_insert(tb, &cfg);
634 635
errout:
	return err;
L
Linus Torvalds 已提交
636 637
}

638
static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
L
Linus Torvalds 已提交
639
{
640
	struct net *net = sock_net(skb->sk);
641 642
	unsigned int h, s_h;
	unsigned int e = 0, s_e;
L
Linus Torvalds 已提交
643
	struct fib_table *tb;
644
	struct hlist_node *node;
645
	struct hlist_head *head;
646
	int dumped = 0;
L
Linus Torvalds 已提交
647

648 649
	if (nlmsg_len(cb->nlh) >= sizeof(struct rtmsg) &&
	    ((struct rtmsg *) nlmsg_data(cb->nlh))->rtm_flags & RTM_F_CLONED)
L
Linus Torvalds 已提交
650 651
		return ip_rt_dump(skb, cb);

652 653 654 655 656
	s_h = cb->args[0];
	s_e = cb->args[1];

	for (h = s_h; h < FIB_TABLE_HASHSZ; h++, s_e = 0) {
		e = 0;
657 658
		head = &net->ipv4.fib_table_hash[h];
		hlist_for_each_entry(tb, node, head, tb_hlist) {
659 660 661 662
			if (e < s_e)
				goto next;
			if (dumped)
				memset(&cb->args[2], 0, sizeof(cb->args) -
663
						 2 * sizeof(cb->args[0]));
664
			if (fib_table_dump(tb, skb, cb) < 0)
665 666 667 668 669
				goto out;
			dumped = 1;
next:
			e++;
		}
L
Linus Torvalds 已提交
670
	}
671 672 673
out:
	cb->args[1] = e;
	cb->args[0] = h;
L
Linus Torvalds 已提交
674 675 676 677 678

	return skb->len;
}

/* Prepare and feed intra-kernel routing request.
E
Eric Dumazet 已提交
679 680 681 682
 * Really, it should be netlink message, but :-( netlink
 * can be not configured, so that we feed it directly
 * to fib engine. It is legal, because all events occur
 * only when netlink is already locked.
L
Linus Torvalds 已提交
683
 */
A
Al Viro 已提交
684
static void fib_magic(int cmd, int type, __be32 dst, int dst_len, struct in_ifaddr *ifa)
L
Linus Torvalds 已提交
685
{
686
	struct net *net = dev_net(ifa->ifa_dev->dev);
687 688 689 690 691 692 693 694 695
	struct fib_table *tb;
	struct fib_config cfg = {
		.fc_protocol = RTPROT_KERNEL,
		.fc_type = type,
		.fc_dst = dst,
		.fc_dst_len = dst_len,
		.fc_prefsrc = ifa->ifa_local,
		.fc_oif = ifa->ifa_dev->dev->ifindex,
		.fc_nlflags = NLM_F_CREATE | NLM_F_APPEND,
696
		.fc_nlinfo = {
697
			.nl_net = net,
698
		},
699
	};
L
Linus Torvalds 已提交
700 701

	if (type == RTN_UNICAST)
702
		tb = fib_new_table(net, RT_TABLE_MAIN);
L
Linus Torvalds 已提交
703
	else
704
		tb = fib_new_table(net, RT_TABLE_LOCAL);
L
Linus Torvalds 已提交
705 706 707 708

	if (tb == NULL)
		return;

709
	cfg.fc_table = tb->tb_id;
L
Linus Torvalds 已提交
710

711 712 713 714
	if (type != RTN_LOCAL)
		cfg.fc_scope = RT_SCOPE_LINK;
	else
		cfg.fc_scope = RT_SCOPE_HOST;
L
Linus Torvalds 已提交
715 716

	if (cmd == RTM_NEWROUTE)
717
		fib_table_insert(tb, &cfg);
L
Linus Torvalds 已提交
718
	else
719
		fib_table_delete(tb, &cfg);
L
Linus Torvalds 已提交
720 721
}

722
void fib_add_ifaddr(struct in_ifaddr *ifa)
L
Linus Torvalds 已提交
723 724 725 726
{
	struct in_device *in_dev = ifa->ifa_dev;
	struct net_device *dev = in_dev->dev;
	struct in_ifaddr *prim = ifa;
A
Al Viro 已提交
727 728
	__be32 mask = ifa->ifa_mask;
	__be32 addr = ifa->ifa_local;
E
Eric Dumazet 已提交
729
	__be32 prefix = ifa->ifa_address & mask;
L
Linus Torvalds 已提交
730

E
Eric Dumazet 已提交
731
	if (ifa->ifa_flags & IFA_F_SECONDARY) {
L
Linus Torvalds 已提交
732 733
		prim = inet_ifa_byprefix(in_dev, prefix, mask);
		if (prim == NULL) {
734
			printk(KERN_WARNING "fib_add_ifaddr: bug: prim == NULL\n");
L
Linus Torvalds 已提交
735 736 737 738 739 740
			return;
		}
	}

	fib_magic(RTM_NEWROUTE, RTN_LOCAL, addr, 32, prim);

E
Eric Dumazet 已提交
741
	if (!(dev->flags & IFF_UP))
L
Linus Torvalds 已提交
742 743 744
		return;

	/* Add broadcast address, if it is explicitly assigned. */
A
Al Viro 已提交
745
	if (ifa->ifa_broadcast && ifa->ifa_broadcast != htonl(0xFFFFFFFF))
L
Linus Torvalds 已提交
746 747
		fib_magic(RTM_NEWROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32, prim);

E
Eric Dumazet 已提交
748
	if (!ipv4_is_zeronet(prefix) && !(ifa->ifa_flags & IFA_F_SECONDARY) &&
L
Linus Torvalds 已提交
749
	    (prefix != addr || ifa->ifa_prefixlen < 32)) {
E
Eric Dumazet 已提交
750 751 752
		fib_magic(RTM_NEWROUTE,
			  dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST,
			  prefix, ifa->ifa_prefixlen, prim);
L
Linus Torvalds 已提交
753 754 755 756

		/* Add network specific broadcasts, when it takes a sense */
		if (ifa->ifa_prefixlen < 31) {
			fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix, 32, prim);
E
Eric Dumazet 已提交
757 758
			fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix | ~mask,
				  32, prim);
L
Linus Torvalds 已提交
759 760 761 762 763 764 765 766 767 768
		}
	}
}

static void fib_del_ifaddr(struct in_ifaddr *ifa)
{
	struct in_device *in_dev = ifa->ifa_dev;
	struct net_device *dev = in_dev->dev;
	struct in_ifaddr *ifa1;
	struct in_ifaddr *prim = ifa;
E
Eric Dumazet 已提交
769 770
	__be32 brd = ifa->ifa_address | ~ifa->ifa_mask;
	__be32 any = ifa->ifa_address & ifa->ifa_mask;
L
Linus Torvalds 已提交
771 772 773 774 775 776
#define LOCAL_OK	1
#define BRD_OK		2
#define BRD0_OK		4
#define BRD1_OK		8
	unsigned ok = 0;

E
Eric Dumazet 已提交
777 778 779 780
	if (!(ifa->ifa_flags & IFA_F_SECONDARY))
		fib_magic(RTM_DELROUTE,
			  dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST,
			  any, ifa->ifa_prefixlen, prim);
L
Linus Torvalds 已提交
781 782 783
	else {
		prim = inet_ifa_byprefix(in_dev, any, ifa->ifa_mask);
		if (prim == NULL) {
784
			printk(KERN_WARNING "fib_del_ifaddr: bug: prim == NULL\n");
L
Linus Torvalds 已提交
785 786 787 788 789
			return;
		}
	}

	/* Deletion is more complicated than add.
E
Eric Dumazet 已提交
790 791 792
	 * We should take care of not to delete too much :-)
	 *
	 * Scan address list to be sure that addresses are really gone.
L
Linus Torvalds 已提交
793 794 795 796 797 798 799 800 801 802 803 804 805
	 */

	for (ifa1 = in_dev->ifa_list; ifa1; ifa1 = ifa1->ifa_next) {
		if (ifa->ifa_local == ifa1->ifa_local)
			ok |= LOCAL_OK;
		if (ifa->ifa_broadcast == ifa1->ifa_broadcast)
			ok |= BRD_OK;
		if (brd == ifa1->ifa_broadcast)
			ok |= BRD1_OK;
		if (any == ifa1->ifa_broadcast)
			ok |= BRD0_OK;
	}

E
Eric Dumazet 已提交
806
	if (!(ok & BRD_OK))
L
Linus Torvalds 已提交
807
		fib_magic(RTM_DELROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32, prim);
E
Eric Dumazet 已提交
808
	if (!(ok & BRD1_OK))
L
Linus Torvalds 已提交
809
		fib_magic(RTM_DELROUTE, RTN_BROADCAST, brd, 32, prim);
E
Eric Dumazet 已提交
810
	if (!(ok & BRD0_OK))
L
Linus Torvalds 已提交
811
		fib_magic(RTM_DELROUTE, RTN_BROADCAST, any, 32, prim);
E
Eric Dumazet 已提交
812
	if (!(ok & LOCAL_OK)) {
L
Linus Torvalds 已提交
813 814 815
		fib_magic(RTM_DELROUTE, RTN_LOCAL, ifa->ifa_local, 32, prim);

		/* Check, that this local address finally disappeared. */
816
		if (inet_addr_type(dev_net(dev), ifa->ifa_local) != RTN_LOCAL) {
L
Linus Torvalds 已提交
817
			/* And the last, but not the least thing.
E
Eric Dumazet 已提交
818 819 820 821 822
			 * We must flush stray FIB entries.
			 *
			 * First of all, we scan fib_info list searching
			 * for stray nexthop entries, then ignite fib_flush.
			 */
823 824
			if (fib_sync_down_addr(dev_net(dev), ifa->ifa_local))
				fib_flush(dev_net(dev));
L
Linus Torvalds 已提交
825 826 827 828 829 830 831 832
		}
	}
#undef LOCAL_OK
#undef BRD_OK
#undef BRD0_OK
#undef BRD1_OK
}

E
Eric Dumazet 已提交
833
static void nl_fib_lookup(struct fib_result_nl *frn, struct fib_table *tb)
834
{
835

836
	struct fib_result       res;
E
Eric Dumazet 已提交
837 838
	struct flowi            fl = {
		.mark = frn->fl_mark,
839 840 841
		.fl4_dst = frn->fl_addr,
		.fl4_tos = frn->fl_tos,
		.fl4_scope = frn->fl_scope,
E
Eric Dumazet 已提交
842
	};
843

844 845 846 847
#ifdef CONFIG_IP_MULTIPLE_TABLES
	res.r = NULL;
#endif

848
	frn->err = -ENOENT;
849 850 851 852
	if (tb) {
		local_bh_disable();

		frn->tb_id = tb->tb_id;
E
Eric Dumazet 已提交
853 854
		rcu_read_lock();
		frn->err = fib_table_lookup(tb, &fl, &res, FIB_LOOKUP_NOREF);
855 856 857 858 859 860 861

		if (!frn->err) {
			frn->prefixlen = res.prefixlen;
			frn->nh_sel = res.nh_sel;
			frn->type = res.type;
			frn->scope = res.scope;
		}
E
Eric Dumazet 已提交
862
		rcu_read_unlock();
863 864 865 866
		local_bh_enable();
	}
}

867
static void nl_fib_input(struct sk_buff *skb)
868
{
869
	struct net *net;
870
	struct fib_result_nl *frn;
871
	struct nlmsghdr *nlh;
872
	struct fib_table *tb;
873
	u32 pid;
874

875
	net = sock_net(skb->sk);
876
	nlh = nlmsg_hdr(skb);
877
	if (skb->len < NLMSG_SPACE(0) || skb->len < nlh->nlmsg_len ||
878
	    nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*frn)))
879
		return;
880 881 882 883 884

	skb = skb_clone(skb, GFP_KERNEL);
	if (skb == NULL)
		return;
	nlh = nlmsg_hdr(skb);
885

886
	frn = (struct fib_result_nl *) NLMSG_DATA(nlh);
887
	tb = fib_get_table(net, frn->tb_id_in);
888 889

	nl_fib_lookup(frn, tb);
890

E
Eric Dumazet 已提交
891 892
	pid = NETLINK_CB(skb).pid;      /* pid of sending process */
	NETLINK_CB(skb).pid = 0;        /* from kernel */
893
	NETLINK_CB(skb).dst_group = 0;  /* unicast */
894
	netlink_unicast(net->ipv4.fibnl, skb, pid, MSG_DONTWAIT);
895
}
896

897
/*
 * Create the per-namespace NETLINK_FIB_LOOKUP kernel socket and store it
 * in net->ipv4.fibnl.  Returns 0, or -EAFNOSUPPORT if creation fails.
 */
static int __net_init nl_fib_lookup_init(struct net *net)
{
	struct sock *nlsk = netlink_kernel_create(net, NETLINK_FIB_LOOKUP, 0,
						  nl_fib_input, NULL,
						  THIS_MODULE);

	if (!nlsk)
		return -EAFNOSUPPORT;

	net->ipv4.fibnl = nlsk;
	return 0;
}

static void nl_fib_lookup_exit(struct net *net)
{
910
	netlink_kernel_release(net->ipv4.fibnl);
911
	net->ipv4.fibnl = NULL;
912 913
}

914
/*
 * Take the routes through @dev down: sync the FIB (flushing it if
 * fib_sync_down_dev() reports changes), flush the routing cache and call
 * arp_ifdown() for the device.
 *
 * @force is passed through to fib_sync_down_dev(), @delay to
 * rt_cache_flush().
 */
static void fib_disable_ip(struct net_device *dev, int force, int delay)
{
	int changed = fib_sync_down_dev(dev, force);

	if (changed)
		fib_flush(dev_net(dev));

	rt_cache_flush(dev_net(dev), delay);
	arp_ifdown(dev);
}

static int fib_inetaddr_event(struct notifier_block *this, unsigned long event, void *ptr)
{
924
	struct in_ifaddr *ifa = (struct in_ifaddr *)ptr;
925
	struct net_device *dev = ifa->ifa_dev->dev;
L
Linus Torvalds 已提交
926 927 928 929 930

	switch (event) {
	case NETDEV_UP:
		fib_add_ifaddr(ifa);
#ifdef CONFIG_IP_ROUTE_MULTIPATH
931
		fib_sync_up(dev);
L
Linus Torvalds 已提交
932
#endif
933
		rt_cache_flush(dev_net(dev), -1);
L
Linus Torvalds 已提交
934 935 936
		break;
	case NETDEV_DOWN:
		fib_del_ifaddr(ifa);
937
		if (ifa->ifa_dev->ifa_list == NULL) {
L
Linus Torvalds 已提交
938
			/* Last address was deleted from this interface.
E
Eric Dumazet 已提交
939
			 * Disable IP.
L
Linus Torvalds 已提交
940
			 */
941
			fib_disable_ip(dev, 1, 0);
L
Linus Torvalds 已提交
942
		} else {
943
			rt_cache_flush(dev_net(dev), -1);
L
Linus Torvalds 已提交
944 945 946 947 948 949 950 951 952
		}
		break;
	}
	return NOTIFY_DONE;
}

static int fib_netdev_event(struct notifier_block *this, unsigned long event, void *ptr)
{
	struct net_device *dev = ptr;
953
	struct in_device *in_dev = __in_dev_get_rtnl(dev);
L
Linus Torvalds 已提交
954 955

	if (event == NETDEV_UNREGISTER) {
956
		fib_disable_ip(dev, 2, -1);
L
Linus Torvalds 已提交
957 958 959 960 961 962 963 964 965 966 967 968 969 970
		return NOTIFY_DONE;
	}

	if (!in_dev)
		return NOTIFY_DONE;

	switch (event) {
	case NETDEV_UP:
		for_ifa(in_dev) {
			fib_add_ifaddr(ifa);
		} endfor_ifa(in_dev);
#ifdef CONFIG_IP_ROUTE_MULTIPATH
		fib_sync_up(dev);
#endif
971
		rt_cache_flush(dev_net(dev), -1);
L
Linus Torvalds 已提交
972 973
		break;
	case NETDEV_DOWN:
974
		fib_disable_ip(dev, 0, 0);
L
Linus Torvalds 已提交
975 976 977
		break;
	case NETDEV_CHANGEMTU:
	case NETDEV_CHANGE:
978
		rt_cache_flush(dev_net(dev), 0);
L
Linus Torvalds 已提交
979
		break;
980
	case NETDEV_UNREGISTER_BATCH:
981 982 983 984 985
		/* The batch unregister is only called on the first
		 * device in the list of devices being unregistered.
		 * Therefore we should not pass dev_net(dev) in here.
		 */
		rt_cache_flush_batch(NULL);
986
		break;
L
Linus Torvalds 已提交
987 988 989 990 991
	}
	return NOTIFY_DONE;
}

static struct notifier_block fib_inetaddr_notifier = {
992
	.notifier_call = fib_inetaddr_event,
L
Linus Torvalds 已提交
993 994 995
};

static struct notifier_block fib_netdev_notifier = {
996
	.notifier_call = fib_netdev_event,
L
Linus Torvalds 已提交
997 998
};

999
/*
 * Allocate the namespace's FIB table hash and run fib4_rules_init().
 * Returns -ENOMEM if the hash cannot be allocated, or the negative error
 * from fib4_rules_init() (the hash is freed again in that case).
 */
static int __net_init ip_fib_net_init(struct net *net)
{
	size_t bytes = sizeof(struct hlist_head) * FIB_TABLE_HASHSZ;
	int rc;

	/* Avoid false sharing : Use at least a full cache line */
	bytes = max_t(size_t, bytes, L1_CACHE_BYTES);

	net->ipv4.fib_table_hash = kzalloc(bytes, GFP_KERNEL);
	if (!net->ipv4.fib_table_hash)
		return -ENOMEM;

	rc = fib4_rules_init(net);
	if (rc < 0) {
		kfree(net->ipv4.fib_table_hash);
		return rc;
	}
	return 0;
}
L
Linus Torvalds 已提交
1020

1021
static void ip_fib_net_exit(struct net *net)
1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032
{
	unsigned int i;

#ifdef CONFIG_IP_MULTIPLE_TABLES
	fib4_rules_exit(net);
#endif

	for (i = 0; i < FIB_TABLE_HASHSZ; i++) {
		struct fib_table *tb;
		struct hlist_head *head;
		struct hlist_node *node, *tmp;
1033

1034
		head = &net->ipv4.fib_table_hash[i];
1035 1036
		hlist_for_each_entry_safe(tb, node, tmp, head, tb_hlist) {
			hlist_del(node);
1037
			fib_table_flush(tb);
1038
			fib_free_table(tb);
1039 1040
		}
	}
1041
	kfree(net->ipv4.fib_table_hash);
1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080
}

/*
 * Per-namespace init: FIB tables, the lookup netlink socket, then the
 * proc entries.  A failing step undoes the steps that came before it.
 */
static int __net_init fib_net_init(struct net *net)
{
	int error;

	error = ip_fib_net_init(net);
	if (error < 0)
		return error;

	error = nl_fib_lookup_init(net);
	if (error < 0)
		goto undo_tables;

	error = fib_proc_init(net);
	if (error < 0)
		goto undo_netlink;

	return error;

undo_netlink:
	nl_fib_lookup_exit(net);
undo_tables:
	ip_fib_net_exit(net);
	return error;
}

/*
 * Per-namespace teardown: undoes fib_net_init() in reverse order
 * (proc entries, lookup netlink socket, FIB tables).
 */
static void __net_exit fib_net_exit(struct net *net)
{
	fib_proc_exit(net);
	nl_fib_lookup_exit(net);
	ip_fib_net_exit(net);
}

/* Per-namespace init/exit hooks, registered by ip_fib_init(). */
static struct pernet_operations fib_net_ops = {
	.init = fib_net_init,
	.exit = fib_net_exit,
};

/*
 * Boot-time setup of the FIB frontend: register the route netlink
 * handlers, the per-namespace operations and the two notifiers, then
 * call fib_hash_init().  Return values of the register calls are not
 * checked.
 */
void __init ip_fib_init(void)
{
	/* rtnetlink: add / delete handlers and the dump callback. */
	rtnl_register(PF_INET, RTM_NEWROUTE, inet_rtm_newroute, NULL);
	rtnl_register(PF_INET, RTM_DELROUTE, inet_rtm_delroute, NULL);
	rtnl_register(PF_INET, RTM_GETROUTE, NULL, inet_dump_fib);

	/* Namespace hooks first, then device and address notifiers. */
	register_pernet_subsys(&fib_net_ops);
	register_netdevice_notifier(&fib_netdev_notifier);
	register_inetaddr_notifier(&fib_inetaddr_notifier);

	fib_hash_init();
}