fib_trie.c 61.7 KB
Newer Older
1 2 3 4 5 6 7 8 9
/*
 *   This program is free software; you can redistribute it and/or
 *   modify it under the terms of the GNU General Public License
 *   as published by the Free Software Foundation; either version
 *   2 of the License, or (at your option) any later version.
 *
 *   Robert Olsson <robert.olsson@its.uu.se> Uppsala Universitet
 *     & Swedish University of Agricultural Sciences.
 *
10
 *   Jens Laas <jens.laas@data.slu.se> Swedish University of
11
 *     Agricultural Sciences.
12
 *
13 14 15
 *   Hans Liss <hans.liss@its.uu.se>  Uppsala Universitet
 *
 * This work is based on the LPC-trie which is originally descibed in:
16
 *
17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41
 * An experimental study of compression methods for dynamic tries
 * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
 * http://www.nada.kth.se/~snilsson/public/papers/dyntrie2/
 *
 *
 * IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
 * IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999
 *
 *
 * Code from fib_hash has been reused which includes the following header:
 *
 *
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		IPv4 FIB: lookup engine and maintenance routines.
 *
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
R
Robert Olsson 已提交
42 43 44 45 46 47 48
 *
 * Substantial contributions to this work comes from:
 *
 *		David S. Miller, <davem@davemloft.net>
 *		Stephen Hemminger <shemminger@osdl.org>
 *		Paul E. McKenney <paulmck@us.ibm.com>
 *		Patrick McHardy <kaber@trash.net>
49 50
 */

J
Jens Låås 已提交
51
#define VERSION "0.409"
52 53 54

#include <asm/uaccess.h>
#include <asm/system.h>
J
Jiri Slaby 已提交
55
#include <linux/bitops.h>
56 57 58 59 60 61 62 63 64
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
S
Stephen Hemminger 已提交
65
#include <linux/inetdevice.h>
66 67 68
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/proc_fs.h>
R
Robert Olsson 已提交
69
#include <linux/rcupdate.h>
70 71 72 73
#include <linux/skbuff.h>
#include <linux/netlink.h>
#include <linux/init.h>
#include <linux/list.h>
74
#include <linux/slab.h>
75
#include <net/net_namespace.h>
76 77 78 79 80 81 82 83
#include <net/ip.h>
#include <net/protocol.h>
#include <net/route.h>
#include <net/tcp.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include "fib_lookup.h"

R
Robert Olsson 已提交
84
#define MAX_STAT_DEPTH 32
85 86 87 88 89 90 91 92

#define KEYLENGTH (8*sizeof(t_key))

typedef unsigned int t_key;

#define T_TNODE 0
#define T_LEAF  1
#define NODE_TYPE_MASK	0x1UL
R
Robert Olsson 已提交
93 94
#define NODE_TYPE(node) ((node)->parent & NODE_TYPE_MASK)

O
Olof Johansson 已提交
95 96
#define IS_TNODE(n) (!(n->parent & T_LEAF))
#define IS_LEAF(n) (n->parent & T_LEAF)
97 98

struct node {
O
Olof Johansson 已提交
99
	unsigned long parent;
100
	t_key key;
101 102 103
};

struct leaf {
O
Olof Johansson 已提交
104
	unsigned long parent;
105
	t_key key;
106
	struct hlist_head list;
R
Robert Olsson 已提交
107
	struct rcu_head rcu;
108 109 110 111
};

struct leaf_info {
	struct hlist_node hlist;
R
Robert Olsson 已提交
112
	struct rcu_head rcu;
113 114 115 116 117
	int plen;
	struct list_head falh;
};

struct tnode {
O
Olof Johansson 已提交
118
	unsigned long parent;
119
	t_key key;
120 121
	unsigned char pos;		/* 2log(KEYLENGTH) bits needed */
	unsigned char bits;		/* 2log(KEYLENGTH) bits needed */
122 123
	unsigned int full_children;	/* KEYLENGTH bits needed */
	unsigned int empty_children;	/* KEYLENGTH bits needed */
124 125 126
	union {
		struct rcu_head rcu;
		struct work_struct work;
J
Jarek Poplawski 已提交
127
		struct tnode *tnode_free;
128
	};
O
Olof Johansson 已提交
129
	struct node *child[0];
130 131 132 133 134 135 136 137 138
};

#ifdef CONFIG_IP_FIB_TRIE_STATS
struct trie_use_stats {
	unsigned int gets;
	unsigned int backtrack;
	unsigned int semantic_match_passed;
	unsigned int semantic_match_miss;
	unsigned int null_node_hit;
139
	unsigned int resize_node_skipped;
140 141 142 143 144 145 146 147 148
};
#endif

struct trie_stat {
	unsigned int totdepth;
	unsigned int maxdepth;
	unsigned int tnodes;
	unsigned int leaves;
	unsigned int nullpointers;
149
	unsigned int prefixes;
R
Robert Olsson 已提交
150
	unsigned int nodesizes[MAX_STAT_DEPTH];
151
};
152 153

struct trie {
O
Olof Johansson 已提交
154
	struct node *trie;
155 156 157 158 159 160
#ifdef CONFIG_IP_FIB_TRIE_STATS
	struct trie_use_stats stats;
#endif
};

static void put_child(struct trie *t, struct tnode *tn, int i, struct node *n);
161 162
static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n,
				  int wasfull);
163
static struct node *resize(struct trie *t, struct tnode *tn);
164 165
static struct tnode *inflate(struct trie *t, struct tnode *tn);
static struct tnode *halve(struct trie *t, struct tnode *tn);
J
Jarek Poplawski 已提交
166 167
/* tnodes to free after resize(); protected by RTNL */
static struct tnode *tnode_free_head;
168 169 170 171 172 173 174 175
static size_t tnode_free_size;

/*
 * synchronize_rcu after call_rcu for that many pages; it should be especially
 * useful before resizing the root node with PREEMPT_NONE configs; the value was
 * obtained experimentally, aiming to avoid visible slowdown.
 */
static const int sync_pages = 128;
176

177
static struct kmem_cache *fn_alias_kmem __read_mostly;
178
static struct kmem_cache *trie_leaf_kmem __read_mostly;
179

S
Stephen Hemminger 已提交
180 181
static inline struct tnode *node_parent(struct node *node)
{
182 183 184 185 186 187
	return (struct tnode *)(node->parent & ~NODE_TYPE_MASK);
}

static inline struct tnode *node_parent_rcu(struct node *node)
{
	struct tnode *ret = node_parent(node);
S
Stephen Hemminger 已提交
188

189 190 191
	return rcu_dereference_check(ret,
				     rcu_read_lock_held() ||
				     lockdep_rtnl_is_held());
S
Stephen Hemminger 已提交
192 193
}

194 195 196
/* Same as rcu_assign_pointer
 * but that macro() assumes that value is a pointer.
 */
S
Stephen Hemminger 已提交
197 198
static inline void node_set_parent(struct node *node, struct tnode *ptr)
{
199 200
	smp_wmb();
	node->parent = (unsigned long)ptr | NODE_TYPE(node);
S
Stephen Hemminger 已提交
201
}
R
Robert Olsson 已提交
202

203 204 205
static inline struct node *tnode_get_child(struct tnode *tn, unsigned int i)
{
	BUG_ON(i >= 1U << tn->bits);
R
Robert Olsson 已提交
206

207 208 209 210
	return tn->child[i];
}

static inline struct node *tnode_get_child_rcu(struct tnode *tn, unsigned int i)
211
{
212
	struct node *ret = tnode_get_child(tn, i);
213

214 215 216
	return rcu_dereference_check(ret,
				     rcu_read_lock_held() ||
				     lockdep_rtnl_is_held());
217 218
}

S
Stephen Hemmigner 已提交
219
static inline int tnode_child_length(const struct tnode *tn)
220
{
O
Olof Johansson 已提交
221
	return 1 << tn->bits;
222 223
}

S
Stephen Hemminger 已提交
224 225 226 227 228
static inline t_key mask_pfx(t_key k, unsigned short l)
{
	return (l == 0) ? 0 : k >> (KEYLENGTH-l) << (KEYLENGTH-l);
}

229 230
static inline t_key tkey_extract_bits(t_key a, int offset, int bits)
{
O
Olof Johansson 已提交
231
	if (offset < KEYLENGTH)
232
		return ((t_key)(a << offset)) >> (KEYLENGTH - bits);
O
Olof Johansson 已提交
233
	else
234 235 236 237 238
		return 0;
}

static inline int tkey_equals(t_key a, t_key b)
{
239
	return a == b;
240 241 242 243
}

static inline int tkey_sub_equals(t_key a, int offset, int bits, t_key b)
{
244 245
	if (bits == 0 || offset >= KEYLENGTH)
		return 1;
O
Olof Johansson 已提交
246 247
	bits = bits > KEYLENGTH ? KEYLENGTH : bits;
	return ((a ^ b) << offset) >> (KEYLENGTH - bits) == 0;
248
}
249 250 251 252 253 254

static inline int tkey_mismatch(t_key a, int offset, t_key b)
{
	t_key diff = a ^ b;
	int i = offset;

255 256 257
	if (!diff)
		return 0;
	while ((diff << i) >> (KEYLENGTH-1) == 0)
258 259 260 261 262
		i++;
	return i;
}

/*
263 264
  To understand this stuff, an understanding of keys and all their bits is
  necessary. Every node in the trie has a key associated with it, but not
265 266 267 268
  all of the bits in that key are significant.

  Consider a node 'n' and its parent 'tp'.

269 270 271 272 273
  If n is a leaf, every bit in its key is significant. Its presence is
  necessitated by path compression, since during a tree traversal (when
  searching for a leaf - unless we are doing an insertion) we will completely
  ignore all skipped bits we encounter. Thus we need to verify, at the end of
  a potentially successful search, that we have indeed been walking the
274 275
  correct key path.

276 277 278 279 280
  Note that we can never "miss" the correct key in the tree if present by
  following the wrong path. Path compression ensures that segments of the key
  that are the same for all keys with a given prefix are skipped, but the
  skipped part *is* identical for each node in the subtrie below the skipped
  bit! trie_insert() in this implementation takes care of that - note the
281 282
  call to tkey_sub_equals() in trie_insert().

283
  if n is an internal node - a 'tnode' here, the various parts of its key
284 285
  have many different meanings.

286
  Example:
287 288 289
  _________________________________________________________________
  | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
  -----------------------------------------------------------------
290
    0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15
291 292 293 294 295 296 297 298 299

  _________________________________________________________________
  | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
  -----------------------------------------------------------------
   16  17  18  19  20  21  22  23  24  25  26  27  28  29  30  31

  tp->pos = 7
  tp->bits = 3
  n->pos = 15
O
Olof Johansson 已提交
300
  n->bits = 4
301

302 303
  First, let's just ignore the bits that come before the parent tp, that is
  the bits from 0 to (tp->pos-1). They are *known* but at this point we do
304 305 306
  not use them for anything.

  The bits from (tp->pos) to (tp->pos + tp->bits - 1) - "N", above - are the
307
  index into the parent's child array. That is, they will be used to find
308 309 310 311 312
  'n' among tp's children.

  The bits from (tp->pos + tp->bits) to (n->pos - 1) - "S" - are skipped bits
  for the node n.

313
  All the bits we have seen so far are significant to the node n. The rest
314 315
  of the bits are really not needed or indeed known in n->key.

316
  The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into
317
  n's child array, and will of course be different for each child.
318

319

320 321 322 323 324
  The rest of the bits, from (n->pos + n->bits) onward, are completely unknown
  at this point.

*/

S
Stephen Hemminger 已提交
325
static inline void check_tnode(const struct tnode *tn)
326
{
S
Stephen Hemminger 已提交
327
	WARN_ON(tn && tn->pos+tn->bits > 32);
328 329
}

330 331
static const int halve_threshold = 25;
static const int inflate_threshold = 50;
332
static const int halve_threshold_root = 15;
J
Jens Låås 已提交
333
static const int inflate_threshold_root = 30;
R
Robert Olsson 已提交
334 335

static void __alias_free_mem(struct rcu_head *head)
336
{
R
Robert Olsson 已提交
337 338
	struct fib_alias *fa = container_of(head, struct fib_alias, rcu);
	kmem_cache_free(fn_alias_kmem, fa);
339 340
}

R
Robert Olsson 已提交
341
static inline void alias_free_mem_rcu(struct fib_alias *fa)
342
{
R
Robert Olsson 已提交
343 344
	call_rcu(&fa->rcu, __alias_free_mem);
}
O
Olof Johansson 已提交
345

R
Robert Olsson 已提交
346 347
static void __leaf_free_rcu(struct rcu_head *head)
{
348 349
	struct leaf *l = container_of(head, struct leaf, rcu);
	kmem_cache_free(trie_leaf_kmem, l);
R
Robert Olsson 已提交
350
}
O
Olof Johansson 已提交
351

352 353 354 355 356
static inline void free_leaf(struct leaf *l)
{
	call_rcu_bh(&l->rcu, __leaf_free_rcu);
}

R
Robert Olsson 已提交
357
static void __leaf_info_free_rcu(struct rcu_head *head)
358
{
R
Robert Olsson 已提交
359
	kfree(container_of(head, struct leaf_info, rcu));
360 361
}

R
Robert Olsson 已提交
362
static inline void free_leaf_info(struct leaf_info *leaf)
363
{
R
Robert Olsson 已提交
364
	call_rcu(&leaf->rcu, __leaf_info_free_rcu);
365 366
}

367
static struct tnode *tnode_alloc(size_t size)
368
{
R
Robert Olsson 已提交
369
	if (size <= PAGE_SIZE)
370
		return kzalloc(size, GFP_KERNEL);
371 372 373
	else
		return __vmalloc(size, GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL);
}
R
Robert Olsson 已提交
374

375 376 377 378
static void __tnode_vfree(struct work_struct *arg)
{
	struct tnode *tn = container_of(arg, struct tnode, work);
	vfree(tn);
379 380
}

R
Robert Olsson 已提交
381
static void __tnode_free_rcu(struct rcu_head *head)
382
{
R
Robert Olsson 已提交
383
	struct tnode *tn = container_of(head, struct tnode, rcu);
384 385
	size_t size = sizeof(struct tnode) +
		      (sizeof(struct node *) << tn->bits);
386 387 388

	if (size <= PAGE_SIZE)
		kfree(tn);
389 390 391 392
	else {
		INIT_WORK(&tn->work, __tnode_vfree);
		schedule_work(&tn->work);
	}
393 394
}

R
Robert Olsson 已提交
395 396
static inline void tnode_free(struct tnode *tn)
{
397 398 399
	if (IS_LEAF(tn))
		free_leaf((struct leaf *) tn);
	else
R
Robert Olsson 已提交
400
		call_rcu(&tn->rcu, __tnode_free_rcu);
R
Robert Olsson 已提交
401 402
}

J
Jarek Poplawski 已提交
403 404 405
static void tnode_free_safe(struct tnode *tn)
{
	BUG_ON(IS_LEAF(tn));
406 407
	tn->tnode_free = tnode_free_head;
	tnode_free_head = tn;
408 409
	tnode_free_size += sizeof(struct tnode) +
			   (sizeof(struct node *) << tn->bits);
J
Jarek Poplawski 已提交
410 411 412 413 414 415 416 417 418 419 420
}

static void tnode_free_flush(void)
{
	struct tnode *tn;

	while ((tn = tnode_free_head)) {
		tnode_free_head = tn->tnode_free;
		tn->tnode_free = NULL;
		tnode_free(tn);
	}
421 422 423 424 425

	if (tnode_free_size >= PAGE_SIZE * sync_pages) {
		tnode_free_size = 0;
		synchronize_rcu();
	}
J
Jarek Poplawski 已提交
426 427
}

R
Robert Olsson 已提交
428 429
static struct leaf *leaf_new(void)
{
430
	struct leaf *l = kmem_cache_alloc(trie_leaf_kmem, GFP_KERNEL);
R
Robert Olsson 已提交
431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447
	if (l) {
		l->parent = T_LEAF;
		INIT_HLIST_HEAD(&l->list);
	}
	return l;
}

static struct leaf_info *leaf_info_new(int plen)
{
	struct leaf_info *li = kmalloc(sizeof(struct leaf_info),  GFP_KERNEL);
	if (li) {
		li->plen = plen;
		INIT_LIST_HEAD(&li->falh);
	}
	return li;
}

448
static struct tnode *tnode_new(t_key key, int pos, int bits)
449
{
450
	size_t sz = sizeof(struct tnode) + (sizeof(struct node *) << bits);
451
	struct tnode *tn = tnode_alloc(sz);
452

O
Olof Johansson 已提交
453
	if (tn) {
R
Robert Olsson 已提交
454
		tn->parent = T_TNODE;
455 456 457 458 459 460
		tn->pos = pos;
		tn->bits = bits;
		tn->key = key;
		tn->full_children = 0;
		tn->empty_children = 1<<bits;
	}
461

462 463
	pr_debug("AT %p s=%u %lu\n", tn, (unsigned int) sizeof(struct tnode),
		 (unsigned long) (sizeof(struct node) << bits));
464 465 466 467 468 469 470 471
	return tn;
}

/*
 * Check whether a tnode 'n' is "full", i.e. it is an internal node
 * and no bits are skipped. See discussion in dyntree paper p. 6
 */

S
Stephen Hemmigner 已提交
472
static inline int tnode_full(const struct tnode *tn, const struct node *n)
473
{
474
	if (n == NULL || IS_LEAF(n))
475 476 477 478 479
		return 0;

	return ((struct tnode *) n)->pos == tn->pos + tn->bits;
}

480 481
static inline void put_child(struct trie *t, struct tnode *tn, int i,
			     struct node *n)
482 483 484 485
{
	tnode_put_child_reorg(tn, i, n, -1);
}

486
 /*
487 488 489 490
  * Add a child at position i overwriting the old value.
  * Update the value of full_children and empty_children.
  */

491 492
static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n,
				  int wasfull)
493
{
R
Robert Olsson 已提交
494
	struct node *chi = tn->child[i];
495 496
	int isfull;

S
Stephen Hemminger 已提交
497 498
	BUG_ON(i >= 1<<tn->bits);

499 500 501 502 503
	/* update emptyChildren */
	if (n == NULL && chi != NULL)
		tn->empty_children++;
	else if (n != NULL && chi == NULL)
		tn->empty_children--;
504

505
	/* update fullChildren */
O
Olof Johansson 已提交
506
	if (wasfull == -1)
507 508 509
		wasfull = tnode_full(tn, chi);

	isfull = tnode_full(tn, n);
510
	if (wasfull && !isfull)
511
		tn->full_children--;
512
	else if (!wasfull && isfull)
513
		tn->full_children++;
O
Olof Johansson 已提交
514

515
	if (n)
S
Stephen Hemminger 已提交
516
		node_set_parent(n, tn);
517

R
Robert Olsson 已提交
518
	rcu_assign_pointer(tn->child[i], n);
519 520
}

J
Jens Låås 已提交
521
#define MAX_WORK 10
522
static struct node *resize(struct trie *t, struct tnode *tn)
523 524
{
	int i;
525
	struct tnode *old_tn;
526 527
	int inflate_threshold_use;
	int halve_threshold_use;
J
Jens Låås 已提交
528
	int max_work;
529

530
	if (!tn)
531 532
		return NULL;

S
Stephen Hemminger 已提交
533 534
	pr_debug("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
		 tn, inflate_threshold, halve_threshold);
535 536 537

	/* No children */
	if (tn->empty_children == tnode_child_length(tn)) {
J
Jarek Poplawski 已提交
538
		tnode_free_safe(tn);
539 540 541 542
		return NULL;
	}
	/* One child */
	if (tn->empty_children == tnode_child_length(tn) - 1)
J
Jens Låås 已提交
543
		goto one_child;
544
	/*
545 546 547 548 549
	 * Double as long as the resulting node has a number of
	 * nonempty nodes that are above the threshold.
	 */

	/*
550 551
	 * From "Implementing a dynamic compressed trie" by Stefan Nilsson of
	 * the Helsinki University of Technology and Matti Tikkanen of Nokia
552
	 * Telecommunications, page 6:
553
	 * "A node is doubled if the ratio of non-empty children to all
554 555
	 * children in the *doubled* node is at least 'high'."
	 *
556 557 558 559 560
	 * 'high' in this instance is the variable 'inflate_threshold'. It
	 * is expressed as a percentage, so we multiply it with
	 * tnode_child_length() and instead of multiplying by 2 (since the
	 * child array will be doubled by inflate()) and multiplying
	 * the left-hand side by 100 (to handle the percentage thing) we
561
	 * multiply the left-hand side by 50.
562 563 564 565
	 *
	 * The left-hand side may look a bit weird: tnode_child_length(tn)
	 * - tn->empty_children is of course the number of non-null children
	 * in the current node. tn->full_children is the number of "full"
566
	 * children, that is non-null tnodes with a skip value of 0.
567
	 * All of those will be doubled in the resulting inflated tnode, so
568
	 * we just count them one extra time here.
569
	 *
570
	 * A clearer way to write this would be:
571
	 *
572
	 * to_be_doubled = tn->full_children;
573
	 * not_to_be_doubled = tnode_child_length(tn) - tn->empty_children -
574 575 576 577
	 *     tn->full_children;
	 *
	 * new_child_length = tnode_child_length(tn) * 2;
	 *
578
	 * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) /
579 580
	 *      new_child_length;
	 * if (new_fill_factor >= inflate_threshold)
581 582 583
	 *
	 * ...and so on, tho it would mess up the while () loop.
	 *
584 585 586
	 * anyway,
	 * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >=
	 *      inflate_threshold
587
	 *
588 589 590
	 * avoid a division:
	 * 100 * (not_to_be_doubled + 2*to_be_doubled) >=
	 *      inflate_threshold * new_child_length
591
	 *
592
	 * expand not_to_be_doubled and to_be_doubled, and shorten:
593
	 * 100 * (tnode_child_length(tn) - tn->empty_children +
O
Olof Johansson 已提交
594
	 *    tn->full_children) >= inflate_threshold * new_child_length
595
	 *
596
	 * expand new_child_length:
597
	 * 100 * (tnode_child_length(tn) - tn->empty_children +
O
Olof Johansson 已提交
598
	 *    tn->full_children) >=
599
	 *      inflate_threshold * tnode_child_length(tn) * 2
600
	 *
601
	 * shorten again:
602
	 * 50 * (tn->full_children + tnode_child_length(tn) -
O
Olof Johansson 已提交
603
	 *    tn->empty_children) >= inflate_threshold *
604
	 *    tnode_child_length(tn)
605
	 *
606 607 608
	 */

	check_tnode(tn);
609

610 611
	/* Keep root node larger  */

J
Jens Låås 已提交
612 613 614 615 616
	if (!node_parent((struct node*) tn)) {
		inflate_threshold_use = inflate_threshold_root;
		halve_threshold_use = halve_threshold_root;
	}
	else {
617
		inflate_threshold_use = inflate_threshold;
J
Jens Låås 已提交
618 619
		halve_threshold_use = halve_threshold;
	}
620

J
Jens Låås 已提交
621 622
	max_work = MAX_WORK;
	while ((tn->full_children > 0 &&  max_work-- &&
623 624 625
		50 * (tn->full_children + tnode_child_length(tn)
		      - tn->empty_children)
		>= inflate_threshold_use * tnode_child_length(tn))) {
626

627 628
		old_tn = tn;
		tn = inflate(t, tn);
629

630 631
		if (IS_ERR(tn)) {
			tn = old_tn;
632 633 634 635 636
#ifdef CONFIG_IP_FIB_TRIE_STATS
			t->stats.resize_node_skipped++;
#endif
			break;
		}
637 638 639 640
	}

	check_tnode(tn);

J
Jens Låås 已提交
641 642 643 644
	/* Return if at least one inflate is run */
	if( max_work != MAX_WORK)
		return (struct node *) tn;

645 646 647 648
	/*
	 * Halve as long as the number of empty children in this
	 * node is above threshold.
	 */
649

J
Jens Låås 已提交
650 651
	max_work = MAX_WORK;
	while (tn->bits > 1 &&  max_work-- &&
652
	       100 * (tnode_child_length(tn) - tn->empty_children) <
653
	       halve_threshold_use * tnode_child_length(tn)) {
654

655 656 657 658
		old_tn = tn;
		tn = halve(t, tn);
		if (IS_ERR(tn)) {
			tn = old_tn;
659 660 661 662 663 664
#ifdef CONFIG_IP_FIB_TRIE_STATS
			t->stats.resize_node_skipped++;
#endif
			break;
		}
	}
665

666

667
	/* Only one child remains */
J
Jens Låås 已提交
668 669
	if (tn->empty_children == tnode_child_length(tn) - 1) {
one_child:
670
		for (i = 0; i < tnode_child_length(tn); i++) {
O
Olof Johansson 已提交
671
			struct node *n;
672

O
Olof Johansson 已提交
673
			n = tn->child[i];
R
Robert Olsson 已提交
674
			if (!n)
O
Olof Johansson 已提交
675 676 677 678
				continue;

			/* compress one level */

S
Stephen Hemminger 已提交
679
			node_set_parent(n, NULL);
J
Jarek Poplawski 已提交
680
			tnode_free_safe(tn);
O
Olof Johansson 已提交
681
			return n;
682
		}
J
Jens Låås 已提交
683
	}
684 685 686
	return (struct node *) tn;
}

687
static struct tnode *inflate(struct trie *t, struct tnode *tn)
688 689 690 691 692
{
	struct tnode *oldtnode = tn;
	int olen = tnode_child_length(tn);
	int i;

S
Stephen Hemminger 已提交
693
	pr_debug("In inflate\n");
694 695 696

	tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1);

S
Stephen Hemminger 已提交
697
	if (!tn)
698
		return ERR_PTR(-ENOMEM);
699 700

	/*
701 702 703
	 * Preallocate and store tnodes before the actual work so we
	 * don't get into an inconsistent state if memory allocation
	 * fails. In case of failure we return the oldnode and  inflate
704 705
	 * of tnode is ignored.
	 */
O
Olof Johansson 已提交
706 707

	for (i = 0; i < olen; i++) {
708
		struct tnode *inode;
709

710
		inode = (struct tnode *) tnode_get_child(oldtnode, i);
711 712 713 714 715
		if (inode &&
		    IS_TNODE(inode) &&
		    inode->pos == oldtnode->pos + oldtnode->bits &&
		    inode->bits > 1) {
			struct tnode *left, *right;
S
Stephen Hemminger 已提交
716
			t_key m = ~0U << (KEYLENGTH - 1) >> inode->pos;
717

718 719
			left = tnode_new(inode->key&(~m), inode->pos + 1,
					 inode->bits - 1);
720 721
			if (!left)
				goto nomem;
O
Olof Johansson 已提交
722

723 724 725
			right = tnode_new(inode->key|m, inode->pos + 1,
					  inode->bits - 1);

726
			if (!right) {
727 728
				tnode_free(left);
				goto nomem;
729
			}
730 731 732 733 734 735

			put_child(t, tn, 2*i, (struct node *) left);
			put_child(t, tn, 2*i+1, (struct node *) right);
		}
	}

O
Olof Johansson 已提交
736
	for (i = 0; i < olen; i++) {
737
		struct tnode *inode;
738
		struct node *node = tnode_get_child(oldtnode, i);
O
Olof Johansson 已提交
739 740
		struct tnode *left, *right;
		int size, j;
741

742 743 744 745 746 747
		/* An empty child */
		if (node == NULL)
			continue;

		/* A leaf or an internal node with skipped bits */

748
		if (IS_LEAF(node) || ((struct tnode *) node)->pos >
749
		   tn->pos + tn->bits - 1) {
750 751 752
			if (tkey_extract_bits(node->key,
					      oldtnode->pos + oldtnode->bits,
					      1) == 0)
753 754 755 756 757 758 759 760 761 762 763 764 765
				put_child(t, tn, 2*i, node);
			else
				put_child(t, tn, 2*i+1, node);
			continue;
		}

		/* An internal node with two children */
		inode = (struct tnode *) node;

		if (inode->bits == 1) {
			put_child(t, tn, 2*i, inode->child[0]);
			put_child(t, tn, 2*i+1, inode->child[1]);

J
Jarek Poplawski 已提交
766
			tnode_free_safe(inode);
O
Olof Johansson 已提交
767
			continue;
768 769
		}

O
Olof Johansson 已提交
770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787
		/* An internal node with more than two children */

		/* We will replace this node 'inode' with two new
		 * ones, 'left' and 'right', each with half of the
		 * original children. The two new nodes will have
		 * a position one bit further down the key and this
		 * means that the "significant" part of their keys
		 * (see the discussion near the top of this file)
		 * will differ by one bit, which will be "0" in
		 * left's key and "1" in right's key. Since we are
		 * moving the key position by one step, the bit that
		 * we are moving away from - the bit at position
		 * (inode->pos) - is the one that will differ between
		 * left and right. So... we synthesize that bit in the
		 * two  new keys.
		 * The mask 'm' below will be a single "one" bit at
		 * the position (inode->pos)
		 */
788

O
Olof Johansson 已提交
789 790 791
		/* Use the old key, but set the new significant
		 *   bit to zero.
		 */
792

O
Olof Johansson 已提交
793 794
		left = (struct tnode *) tnode_get_child(tn, 2*i);
		put_child(t, tn, 2*i, NULL);
795

O
Olof Johansson 已提交
796
		BUG_ON(!left);
797

O
Olof Johansson 已提交
798 799
		right = (struct tnode *) tnode_get_child(tn, 2*i+1);
		put_child(t, tn, 2*i+1, NULL);
800

O
Olof Johansson 已提交
801
		BUG_ON(!right);
802

O
Olof Johansson 已提交
803 804 805 806
		size = tnode_child_length(left);
		for (j = 0; j < size; j++) {
			put_child(t, left, j, inode->child[j]);
			put_child(t, right, j, inode->child[j + size]);
807
		}
O
Olof Johansson 已提交
808 809 810
		put_child(t, tn, 2*i, resize(t, left));
		put_child(t, tn, 2*i+1, resize(t, right));

J
Jarek Poplawski 已提交
811
		tnode_free_safe(inode);
812
	}
J
Jarek Poplawski 已提交
813
	tnode_free_safe(oldtnode);
814
	return tn;
815 816 817 818 819
nomem:
	{
		int size = tnode_child_length(tn);
		int j;

S
Stephen Hemminger 已提交
820
		for (j = 0; j < size; j++)
821 822 823 824
			if (tn->child[j])
				tnode_free((struct tnode *)tn->child[j]);

		tnode_free(tn);
S
Stephen Hemminger 已提交
825

826 827
		return ERR_PTR(-ENOMEM);
	}
828 829
}

830
static struct tnode *halve(struct trie *t, struct tnode *tn)
831 832 833 834 835 836
{
	struct tnode *oldtnode = tn;
	struct node *left, *right;
	int i;
	int olen = tnode_child_length(tn);

S
Stephen Hemminger 已提交
837
	pr_debug("In halve\n");
838 839

	tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1);
840

841 842
	if (!tn)
		return ERR_PTR(-ENOMEM);
843 844

	/*
845 846 847
	 * Preallocate and store tnodes before the actual work so we
	 * don't get into an inconsistent state if memory allocation
	 * fails. In case of failure we return the oldnode and halve
848 849 850
	 * of tnode is ignored.
	 */

O
Olof Johansson 已提交
851
	for (i = 0; i < olen; i += 2) {
852 853
		left = tnode_get_child(oldtnode, i);
		right = tnode_get_child(oldtnode, i+1);
854

855
		/* Two nonempty children */
S
Stephen Hemminger 已提交
856
		if (left && right) {
857
			struct tnode *newn;
S
Stephen Hemminger 已提交
858

859
			newn = tnode_new(left->key, tn->pos + tn->bits, 1);
S
Stephen Hemminger 已提交
860 861

			if (!newn)
862
				goto nomem;
S
Stephen Hemminger 已提交
863

864
			put_child(t, tn, i/2, (struct node *)newn);
865 866 867
		}

	}
868

O
Olof Johansson 已提交
869 870 871
	for (i = 0; i < olen; i += 2) {
		struct tnode *newBinNode;

872 873
		left = tnode_get_child(oldtnode, i);
		right = tnode_get_child(oldtnode, i+1);
874

875 876 877 878 879
		/* At least one of the children is empty */
		if (left == NULL) {
			if (right == NULL)    /* Both are empty */
				continue;
			put_child(t, tn, i/2, right);
O
Olof Johansson 已提交
880
			continue;
S
Stephen Hemminger 已提交
881
		}
O
Olof Johansson 已提交
882 883

		if (right == NULL) {
884
			put_child(t, tn, i/2, left);
O
Olof Johansson 已提交
885 886
			continue;
		}
887

888
		/* Two nonempty children */
O
Olof Johansson 已提交
889 890 891 892 893
		newBinNode = (struct tnode *) tnode_get_child(tn, i/2);
		put_child(t, tn, i/2, NULL);
		put_child(t, newBinNode, 0, left);
		put_child(t, newBinNode, 1, right);
		put_child(t, tn, i/2, resize(t, newBinNode));
894
	}
J
Jarek Poplawski 已提交
895
	tnode_free_safe(oldtnode);
896
	return tn;
897 898 899 900 901
nomem:
	{
		int size = tnode_child_length(tn);
		int j;

S
Stephen Hemminger 已提交
902
		for (j = 0; j < size; j++)
903 904 905 906
			if (tn->child[j])
				tnode_free((struct tnode *)tn->child[j]);

		tnode_free(tn);
S
Stephen Hemminger 已提交
907

908 909
		return ERR_PTR(-ENOMEM);
	}
910 911
}

R
Robert Olsson 已提交
912
/* readside must use rcu_read_lock currently dump routines
R
Robert Olsson 已提交
913 914
 via get_fa_head and dump */

R
Robert Olsson 已提交
915
static struct leaf_info *find_leaf_info(struct leaf *l, int plen)
916
{
R
Robert Olsson 已提交
917
	struct hlist_head *head = &l->list;
918 919 920
	struct hlist_node *node;
	struct leaf_info *li;

R
Robert Olsson 已提交
921
	hlist_for_each_entry_rcu(li, node, head, hlist)
922
		if (li->plen == plen)
923
			return li;
O
Olof Johansson 已提交
924

925 926 927
	return NULL;
}

928
static inline struct list_head *get_fa_head(struct leaf *l, int plen)
929
{
R
Robert Olsson 已提交
930
	struct leaf_info *li = find_leaf_info(l, plen);
931

O
Olof Johansson 已提交
932 933
	if (!li)
		return NULL;
934

O
Olof Johansson 已提交
935
	return &li->falh;
936 937 938 939
}

static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new)
{
940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956
	struct leaf_info *li = NULL, *last = NULL;
	struct hlist_node *node;

	if (hlist_empty(head)) {
		hlist_add_head_rcu(&new->hlist, head);
	} else {
		hlist_for_each_entry(li, node, head, hlist) {
			if (new->plen > li->plen)
				break;

			last = li;
		}
		if (last)
			hlist_add_after_rcu(&last->hlist, &new->hlist);
		else
			hlist_add_before_rcu(&new->hlist, &li->hlist);
	}
957 958
}

R
Robert Olsson 已提交
959 960
/* rcu_read_lock needs to be hold by caller from readside */

961 962 963 964 965 966 967 968
static struct leaf *
fib_find_node(struct trie *t, u32 key)
{
	int pos;
	struct tnode *tn;
	struct node *n;

	pos = 0;
969 970 971
	n = rcu_dereference_check(t->trie,
				  rcu_read_lock_held() ||
				  lockdep_rtnl_is_held());
972 973 974

	while (n != NULL &&  NODE_TYPE(n) == T_TNODE) {
		tn = (struct tnode *) n;
O
Olof Johansson 已提交
975

976
		check_tnode(tn);
O
Olof Johansson 已提交
977

978
		if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
O
Olof Johansson 已提交
979
			pos = tn->pos + tn->bits;
980 981 982 983
			n = tnode_get_child_rcu(tn,
						tkey_extract_bits(key,
								  tn->pos,
								  tn->bits));
O
Olof Johansson 已提交
984
		} else
985 986 987 988
			break;
	}
	/* Case we have found a leaf. Compare prefixes */

O
Olof Johansson 已提交
989 990 991
	if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key))
		return (struct leaf *)n;

992 993 994
	return NULL;
}

995
static void trie_rebalance(struct trie *t, struct tnode *tn)
996 997
{
	int wasfull;
R
Robert Olsson 已提交
998
	t_key cindex, key;
S
Stephen Hemminger 已提交
999
	struct tnode *tp;
1000

R
Robert Olsson 已提交
1001 1002
	key = tn->key;

S
Stephen Hemminger 已提交
1003
	while (tn != NULL && (tp = node_parent((struct node *)tn)) != NULL) {
1004 1005
		cindex = tkey_extract_bits(key, tp->pos, tp->bits);
		wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
1006 1007 1008 1009
		tn = (struct tnode *) resize(t, (struct tnode *)tn);

		tnode_put_child_reorg((struct tnode *)tp, cindex,
				      (struct node *)tn, wasfull);
O
Olof Johansson 已提交
1010

S
Stephen Hemminger 已提交
1011
		tp = node_parent((struct node *) tn);
1012 1013 1014
		if (!tp)
			rcu_assign_pointer(t->trie, (struct node *)tn);

J
Jarek Poplawski 已提交
1015
		tnode_free_flush();
S
Stephen Hemminger 已提交
1016
		if (!tp)
1017
			break;
S
Stephen Hemminger 已提交
1018
		tn = tp;
1019
	}
S
Stephen Hemminger 已提交
1020

1021
	/* Handle last (top) tnode */
1022
	if (IS_TNODE(tn))
1023
		tn = (struct tnode *)resize(t, (struct tnode *)tn);
1024

1025 1026
	rcu_assign_pointer(t->trie, (struct node *)tn);
	tnode_free_flush();
1027 1028
}

R
Robert Olsson 已提交
1029 1030
/* only used from updater-side */

1031
static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
1032 1033 1034 1035 1036 1037
{
	int pos, newpos;
	struct tnode *tp = NULL, *tn = NULL;
	struct node *n;
	struct leaf *l;
	int missbit;
1038
	struct list_head *fa_head = NULL;
1039 1040 1041 1042
	struct leaf_info *li;
	t_key cindex;

	pos = 0;
1043
	n = t->trie;
1044

1045 1046
	/* If we point to NULL, stop. Either the tree is empty and we should
	 * just put a new leaf in if, or we have reached an empty child slot,
1047
	 * and we should just put our new leaf in that.
1048 1049
	 * If we point to a T_TNODE, check if it matches our key. Note that
	 * a T_TNODE might be skipping any number of bits - its 'pos' need
1050 1051
	 * not be the parent's 'pos'+'bits'!
	 *
1052
	 * If it does match the current key, get pos/bits from it, extract
1053 1054 1055 1056
	 * the index from our key, push the T_TNODE and walk the tree.
	 *
	 * If it doesn't, we have to replace it with a new T_TNODE.
	 *
1057 1058 1059
	 * If we point to a T_LEAF, it might or might not have the same key
	 * as we do. If it does, just change the value, update the T_LEAF's
	 * value, and return it.
1060 1061 1062 1063 1064
	 * If it doesn't, we need to replace it with a T_TNODE.
	 */

	while (n != NULL &&  NODE_TYPE(n) == T_TNODE) {
		tn = (struct tnode *) n;
O
Olof Johansson 已提交
1065

1066
		check_tnode(tn);
O
Olof Johansson 已提交
1067

1068
		if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
1069
			tp = tn;
O
Olof Johansson 已提交
1070
			pos = tn->pos + tn->bits;
1071 1072 1073 1074
			n = tnode_get_child(tn,
					    tkey_extract_bits(key,
							      tn->pos,
							      tn->bits));
1075

S
Stephen Hemminger 已提交
1076
			BUG_ON(n && node_parent(n) != tn);
O
Olof Johansson 已提交
1077
		} else
1078 1079 1080 1081 1082 1083
			break;
	}

	/*
	 * n  ----> NULL, LEAF or TNODE
	 *
1084
	 * tp is n's (parent) ----> NULL or TNODE
1085 1086
	 */

O
Olof Johansson 已提交
1087
	BUG_ON(tp && IS_LEAF(tp));
1088 1089 1090

	/* Case 1: n is a leaf. Compare prefixes */

1091
	if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) {
1092
		l = (struct leaf *) n;
1093
		li = leaf_info_new(plen);
O
Olof Johansson 已提交
1094

1095 1096
		if (!li)
			return NULL;
1097 1098 1099 1100 1101 1102 1103

		fa_head = &li->falh;
		insert_leaf_info(&l->list, li);
		goto done;
	}
	l = leaf_new();

1104 1105
	if (!l)
		return NULL;
1106 1107 1108 1109

	l->key = key;
	li = leaf_info_new(plen);

1110
	if (!li) {
1111
		free_leaf(l);
1112
		return NULL;
1113
	}
1114 1115 1116 1117 1118

	fa_head = &li->falh;
	insert_leaf_info(&l->list, li);

	if (t->trie && n == NULL) {
O
Olof Johansson 已提交
1119
		/* Case 2: n is NULL, and will just insert a new leaf */
1120

S
Stephen Hemminger 已提交
1121
		node_set_parent((struct node *)l, tp);
1122

O
Olof Johansson 已提交
1123 1124 1125 1126
		cindex = tkey_extract_bits(key, tp->pos, tp->bits);
		put_child(t, (struct tnode *)tp, cindex, (struct node *)l);
	} else {
		/* Case 3: n is a LEAF or a TNODE and the key doesn't match. */
1127 1128
		/*
		 *  Add a new tnode here
1129 1130 1131 1132
		 *  first tnode need some special handling
		 */

		if (tp)
O
Olof Johansson 已提交
1133
			pos = tp->pos+tp->bits;
1134
		else
O
Olof Johansson 已提交
1135 1136
			pos = 0;

1137
		if (n) {
1138 1139
			newpos = tkey_mismatch(key, pos, n->key);
			tn = tnode_new(n->key, newpos, 1);
O
Olof Johansson 已提交
1140
		} else {
1141
			newpos = 0;
1142
			tn = tnode_new(key, newpos, 1); /* First tnode */
1143 1144
		}

1145
		if (!tn) {
1146
			free_leaf_info(li);
1147
			free_leaf(l);
1148
			return NULL;
O
Olof Johansson 已提交
1149 1150
		}

S
Stephen Hemminger 已提交
1151
		node_set_parent((struct node *)tn, tp);
1152

O
Olof Johansson 已提交
1153
		missbit = tkey_extract_bits(key, newpos, 1);
1154 1155 1156
		put_child(t, tn, missbit, (struct node *)l);
		put_child(t, tn, 1-missbit, n);

1157
		if (tp) {
1158
			cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1159 1160
			put_child(t, (struct tnode *)tp, cindex,
				  (struct node *)tn);
O
Olof Johansson 已提交
1161
		} else {
1162
			rcu_assign_pointer(t->trie, (struct node *)tn);
1163 1164 1165
			tp = tn;
		}
	}
O
Olof Johansson 已提交
1166 1167

	if (tp && tp->pos + tp->bits > 32)
1168 1169 1170
		pr_warning("fib_trie"
			   " tp=%p pos=%d, bits=%d, key=%0x plen=%d\n",
			   tp, tp->pos, tp->bits, key, plen);
O
Olof Johansson 已提交
1171

1172
	/* Rebalance the trie */
R
Robert Olsson 已提交
1173

1174
	trie_rebalance(t, tp);
1175
done:
1176 1177 1178
	return fa_head;
}

1179 1180 1181
/*
 * Caller must hold RTNL.
 */
1182
int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
1183 1184 1185
{
	struct trie *t = (struct trie *) tb->tb_data;
	struct fib_alias *fa, *new_fa;
1186
	struct list_head *fa_head = NULL;
1187
	struct fib_info *fi;
1188 1189
	int plen = cfg->fc_dst_len;
	u8 tos = cfg->fc_tos;
1190 1191 1192 1193 1194 1195 1196
	u32 key, mask;
	int err;
	struct leaf *l;

	if (plen > 32)
		return -EINVAL;

1197
	key = ntohl(cfg->fc_dst);
1198

1199
	pr_debug("Insert table=%u %08x/%d\n", tb->tb_id, key, plen);
1200

O
Olof Johansson 已提交
1201
	mask = ntohl(inet_make_mask(plen));
1202

1203
	if (key & ~mask)
1204 1205 1206 1207
		return -EINVAL;

	key = key & mask;

1208 1209 1210
	fi = fib_create_info(cfg);
	if (IS_ERR(fi)) {
		err = PTR_ERR(fi);
1211
		goto err;
1212
	}
1213 1214

	l = fib_find_node(t, key);
1215
	fa = NULL;
1216

1217
	if (l) {
1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232
		fa_head = get_fa_head(l, plen);
		fa = fib_find_alias(fa_head, tos, fi->fib_priority);
	}

	/* Now fa, if non-NULL, points to the first fib alias
	 * with the same keys [prefix,tos,priority], if such key already
	 * exists or to the node before which we will insert new one.
	 *
	 * If fa is NULL, we will need to allocate a new one and
	 * insert to the head of f.
	 *
	 * If f is NULL, no fib node matched the destination key
	 * and we need to allocate a new one of those as well.
	 */

1233 1234 1235
	if (fa && fa->fa_tos == tos &&
	    fa->fa_info->fib_priority == fi->fib_priority) {
		struct fib_alias *fa_first, *fa_match;
1236 1237

		err = -EEXIST;
1238
		if (cfg->fc_nlflags & NLM_F_EXCL)
1239 1240
			goto out;

1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261
		/* We have 2 goals:
		 * 1. Find exact match for type, scope, fib_info to avoid
		 * duplicate routes
		 * 2. Find next 'fa' (or head), NLM_F_APPEND inserts before it
		 */
		fa_match = NULL;
		fa_first = fa;
		fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
		list_for_each_entry_continue(fa, fa_head, fa_list) {
			if (fa->fa_tos != tos)
				break;
			if (fa->fa_info->fib_priority != fi->fib_priority)
				break;
			if (fa->fa_type == cfg->fc_type &&
			    fa->fa_scope == cfg->fc_scope &&
			    fa->fa_info == fi) {
				fa_match = fa;
				break;
			}
		}

1262
		if (cfg->fc_nlflags & NLM_F_REPLACE) {
1263 1264 1265
			struct fib_info *fi_drop;
			u8 state;

1266 1267 1268 1269
			fa = fa_first;
			if (fa_match) {
				if (fa == fa_match)
					err = 0;
1270
				goto out;
1271
			}
R
Robert Olsson 已提交
1272
			err = -ENOBUFS;
1273
			new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
R
Robert Olsson 已提交
1274 1275
			if (new_fa == NULL)
				goto out;
1276 1277

			fi_drop = fa->fa_info;
R
Robert Olsson 已提交
1278 1279
			new_fa->fa_tos = fa->fa_tos;
			new_fa->fa_info = fi;
1280 1281
			new_fa->fa_type = cfg->fc_type;
			new_fa->fa_scope = cfg->fc_scope;
1282
			state = fa->fa_state;
1283
			new_fa->fa_state = state & ~FA_S_ACCESSED;
1284

R
Robert Olsson 已提交
1285 1286
			list_replace_rcu(&fa->fa_list, &new_fa->fa_list);
			alias_free_mem_rcu(fa);
1287 1288 1289

			fib_release_info(fi_drop);
			if (state & FA_S_ACCESSED)
1290
				rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
1291 1292
			rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen,
				tb->tb_id, &cfg->fc_nlinfo, NLM_F_REPLACE);
1293

O
Olof Johansson 已提交
1294
			goto succeeded;
1295 1296 1297 1298 1299
		}
		/* Error if we find a perfect match which
		 * uses the same scope, type, and nexthop
		 * information.
		 */
1300 1301
		if (fa_match)
			goto out;
1302

1303
		if (!(cfg->fc_nlflags & NLM_F_APPEND))
1304
			fa = fa_first;
1305 1306
	}
	err = -ENOENT;
1307
	if (!(cfg->fc_nlflags & NLM_F_CREATE))
1308 1309 1310
		goto out;

	err = -ENOBUFS;
1311
	new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
1312 1313 1314 1315 1316
	if (new_fa == NULL)
		goto out;

	new_fa->fa_info = fi;
	new_fa->fa_tos = tos;
1317 1318
	new_fa->fa_type = cfg->fc_type;
	new_fa->fa_scope = cfg->fc_scope;
1319 1320 1321 1322 1323
	new_fa->fa_state = 0;
	/*
	 * Insert new entry to the list.
	 */

1324
	if (!fa_head) {
1325 1326 1327
		fa_head = fib_insert_node(t, key, plen);
		if (unlikely(!fa_head)) {
			err = -ENOMEM;
1328
			goto out_free_new_fa;
1329
		}
1330
	}
1331

R
Robert Olsson 已提交
1332 1333
	list_add_tail_rcu(&new_fa->fa_list,
			  (fa ? &fa->fa_list : fa_head));
1334

1335
	rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
1336
	rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id,
1337
		  &cfg->fc_nlinfo, 0);
1338 1339
succeeded:
	return 0;
1340 1341 1342

out_free_new_fa:
	kmem_cache_free(fn_alias_kmem, new_fa);
1343 1344
out:
	fib_release_info(fi);
O
Olof Johansson 已提交
1345
err:
1346 1347 1348
	return err;
}

R
Robert Olsson 已提交
1349
/* should be called with rcu_read_lock */
1350 1351 1352
static int check_leaf(struct trie *t, struct leaf *l,
		      t_key key,  const struct flowi *flp,
		      struct fib_result *res)
1353 1354 1355 1356
{
	struct leaf_info *li;
	struct hlist_head *hhead = &l->list;
	struct hlist_node *node;
1357

R
Robert Olsson 已提交
1358
	hlist_for_each_entry_rcu(li, node, hhead, hlist) {
1359 1360 1361 1362
		int err;
		int plen = li->plen;
		__be32 mask = inet_make_mask(plen);

1363
		if (l->key != (key & ntohl(mask)))
1364 1365
			continue;

1366
		err = fib_semantic_match(&li->falh, flp, res, plen);
1367

1368
#ifdef CONFIG_IP_FIB_TRIE_STATS
1369
		if (err <= 0)
1370
			t->stats.semantic_match_passed++;
1371 1372
		else
			t->stats.semantic_match_miss++;
1373
#endif
1374
		if (err <= 0)
1375
			return err;
1376
	}
1377

1378
	return 1;
1379 1380
}

1381 1382
int fib_table_lookup(struct fib_table *tb, const struct flowi *flp,
		     struct fib_result *res)
1383 1384
{
	struct trie *t = (struct trie *) tb->tb_data;
1385
	int ret;
1386 1387 1388
	struct node *n;
	struct tnode *pn;
	int pos, bits;
O
Olof Johansson 已提交
1389
	t_key key = ntohl(flp->fl4_dst);
1390 1391 1392
	int chopped_off;
	t_key cindex = 0;
	int current_prefix_length = KEYLENGTH;
O
Olof Johansson 已提交
1393 1394 1395 1396
	struct tnode *cn;
	t_key node_prefix, key_prefix, pref_mismatch;
	int mp;

R
Robert Olsson 已提交
1397
	rcu_read_lock();
O
Olof Johansson 已提交
1398

R
Robert Olsson 已提交
1399
	n = rcu_dereference(t->trie);
1400
	if (!n)
1401 1402 1403 1404 1405 1406 1407 1408
		goto failed;

#ifdef CONFIG_IP_FIB_TRIE_STATS
	t->stats.gets++;
#endif

	/* Just a leaf? */
	if (IS_LEAF(n)) {
1409
		ret = check_leaf(t, (struct leaf *)n, key, flp, res);
1410
		goto found;
1411
	}
1412

1413 1414
	pn = (struct tnode *) n;
	chopped_off = 0;
1415

O
Olof Johansson 已提交
1416
	while (pn) {
1417 1418 1419
		pos = pn->pos;
		bits = pn->bits;

1420
		if (!chopped_off)
S
Stephen Hemminger 已提交
1421 1422
			cindex = tkey_extract_bits(mask_pfx(key, current_prefix_length),
						   pos, bits);
1423

1424
		n = tnode_get_child_rcu(pn, cindex);
1425 1426 1427 1428 1429 1430 1431 1432

		if (n == NULL) {
#ifdef CONFIG_IP_FIB_TRIE_STATS
			t->stats.null_node_hit++;
#endif
			goto backtrace;
		}

O
Olof Johansson 已提交
1433
		if (IS_LEAF(n)) {
1434 1435
			ret = check_leaf(t, (struct leaf *)n, key, flp, res);
			if (ret > 0)
O
Olof Johansson 已提交
1436
				goto backtrace;
1437
			goto found;
O
Olof Johansson 已提交
1438 1439 1440
		}

		cn = (struct tnode *)n;
1441

O
Olof Johansson 已提交
1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456
		/*
		 * It's a tnode, and we can do some extra checks here if we
		 * like, to avoid descending into a dead-end branch.
		 * This tnode is in the parent's child array at index
		 * key[p_pos..p_pos+p_bits] but potentially with some bits
		 * chopped off, so in reality the index may be just a
		 * subprefix, padded with zero at the end.
		 * We can also take a look at any skipped bits in this
		 * tnode - everything up to p_pos is supposed to be ok,
		 * and the non-chopped bits of the index (se previous
		 * paragraph) are also guaranteed ok, but the rest is
		 * considered unknown.
		 *
		 * The skipped bits are key[pos+bits..cn->pos].
		 */
1457

O
Olof Johansson 已提交
1458 1459 1460 1461 1462 1463 1464 1465 1466
		/* If current_prefix_length < pos+bits, we are already doing
		 * actual prefix  matching, which means everything from
		 * pos+(bits-chopped_off) onward must be zero along some
		 * branch of this subtree - otherwise there is *no* valid
		 * prefix present. Here we can only check the skipped
		 * bits. Remember, since we have already indexed into the
		 * parent's child array, we know that the bits we chopped of
		 * *are* zero.
		 */
1467

1468 1469
		/* NOTA BENE: Checking only skipped bits
		   for the new node here */
1470

O
Olof Johansson 已提交
1471 1472
		if (current_prefix_length < pos+bits) {
			if (tkey_extract_bits(cn->key, current_prefix_length,
1473 1474
						cn->pos - current_prefix_length)
			    || !(cn->child[0]))
O
Olof Johansson 已提交
1475 1476
				goto backtrace;
		}
1477

O
Olof Johansson 已提交
1478 1479 1480 1481 1482 1483 1484 1485 1486
		/*
		 * If chopped_off=0, the index is fully validated and we
		 * only need to look at the skipped bits for this, the new,
		 * tnode. What we actually want to do is to find out if
		 * these skipped bits match our key perfectly, or if we will
		 * have to count on finding a matching prefix further down,
		 * because if we do, we would like to have some way of
		 * verifying the existence of such a prefix at this point.
		 */
1487

O
Olof Johansson 已提交
1488 1489 1490 1491 1492 1493 1494 1495
		/* The only thing we can do at this point is to verify that
		 * any such matching prefix can indeed be a prefix to our
		 * key, and if the bits in the node we are inspecting that
		 * do not match our key are not ZERO, this cannot be true.
		 * Thus, find out where there is a mismatch (before cn->pos)
		 * and verify that all the mismatching bits are zero in the
		 * new tnode's key.
		 */
1496

1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507
		/*
		 * Note: We aren't very concerned about the piece of
		 * the key that precede pn->pos+pn->bits, since these
		 * have already been checked. The bits after cn->pos
		 * aren't checked since these are by definition
		 * "unknown" at this point. Thus, what we want to see
		 * is if we are about to enter the "prefix matching"
		 * state, and in that case verify that the skipped
		 * bits that will prevail throughout this subtree are
		 * zero, as they have to be if we are to find a
		 * matching prefix.
O
Olof Johansson 已提交
1508 1509
		 */

S
Stephen Hemminger 已提交
1510 1511
		node_prefix = mask_pfx(cn->key, cn->pos);
		key_prefix = mask_pfx(key, cn->pos);
O
Olof Johansson 已提交
1512 1513 1514
		pref_mismatch = key_prefix^node_prefix;
		mp = 0;

1515 1516 1517 1518
		/*
		 * In short: If skipped bits in this node do not match
		 * the search key, enter the "prefix matching"
		 * state.directly.
O
Olof Johansson 已提交
1519 1520 1521 1522
		 */
		if (pref_mismatch) {
			while (!(pref_mismatch & (1<<(KEYLENGTH-1)))) {
				mp++;
1523
				pref_mismatch = pref_mismatch << 1;
O
Olof Johansson 已提交
1524 1525 1526 1527 1528 1529 1530 1531
			}
			key_prefix = tkey_extract_bits(cn->key, mp, cn->pos-mp);

			if (key_prefix != 0)
				goto backtrace;

			if (current_prefix_length >= cn->pos)
				current_prefix_length = mp;
1532
		}
1533

O
Olof Johansson 已提交
1534 1535 1536 1537
		pn = (struct tnode *)n; /* Descend */
		chopped_off = 0;
		continue;

1538 1539 1540 1541
backtrace:
		chopped_off++;

		/* As zero don't change the child key (cindex) */
1542 1543
		while ((chopped_off <= pn->bits)
		       && !(cindex & (1<<(chopped_off-1))))
1544 1545 1546 1547
			chopped_off++;

		/* Decrease current_... with bits chopped off */
		if (current_prefix_length > pn->pos + pn->bits - chopped_off)
1548 1549
			current_prefix_length = pn->pos + pn->bits
				- chopped_off;
O
Olof Johansson 已提交
1550

1551
		/*
1552
		 * Either we do the actual chop off according or if we have
1553 1554 1555
		 * chopped off all bits in this tnode walk up to our parent.
		 */

O
Olof Johansson 已提交
1556
		if (chopped_off <= pn->bits) {
1557
			cindex &= ~(1 << (chopped_off-1));
O
Olof Johansson 已提交
1558
		} else {
1559
			struct tnode *parent = node_parent_rcu((struct node *) pn);
S
Stephen Hemminger 已提交
1560
			if (!parent)
1561
				goto failed;
O
Olof Johansson 已提交
1562

1563
			/* Get Child's index */
S
Stephen Hemminger 已提交
1564 1565
			cindex = tkey_extract_bits(pn->key, parent->pos, parent->bits);
			pn = parent;
1566 1567 1568 1569 1570 1571
			chopped_off = 0;

#ifdef CONFIG_IP_FIB_TRIE_STATS
			t->stats.backtrack++;
#endif
			goto backtrace;
1572
		}
1573 1574
	}
failed:
1575
	ret = 1;
1576
found:
R
Robert Olsson 已提交
1577
	rcu_read_unlock();
1578 1579 1580
	return ret;
}

1581 1582 1583 1584
/*
 * Remove the leaf and return parent.
 */
static void trie_leaf_remove(struct trie *t, struct leaf *l)
1585
{
1586
	struct tnode *tp = node_parent((struct node *) l);
1587

1588
	pr_debug("entering trie_leaf_remove(%p)\n", l);
1589

1590
	if (tp) {
1591
		t_key cindex = tkey_extract_bits(l->key, tp->pos, tp->bits);
1592
		put_child(t, (struct tnode *)tp, cindex, NULL);
1593
		trie_rebalance(t, tp);
O
Olof Johansson 已提交
1594
	} else
R
Robert Olsson 已提交
1595
		rcu_assign_pointer(t->trie, NULL);
1596

1597
	free_leaf(l);
1598 1599
}

1600 1601 1602
/*
 * Caller must hold RTNL.
 */
1603
int fib_table_delete(struct fib_table *tb, struct fib_config *cfg)
1604 1605 1606
{
	struct trie *t = (struct trie *) tb->tb_data;
	u32 key, mask;
1607 1608
	int plen = cfg->fc_dst_len;
	u8 tos = cfg->fc_tos;
1609 1610 1611
	struct fib_alias *fa, *fa_to_delete;
	struct list_head *fa_head;
	struct leaf *l;
O
Olof Johansson 已提交
1612 1613
	struct leaf_info *li;

1614
	if (plen > 32)
1615 1616
		return -EINVAL;

1617
	key = ntohl(cfg->fc_dst);
O
Olof Johansson 已提交
1618
	mask = ntohl(inet_make_mask(plen));
1619

1620
	if (key & ~mask)
1621 1622 1623 1624 1625
		return -EINVAL;

	key = key & mask;
	l = fib_find_node(t, key);

1626
	if (!l)
1627 1628 1629 1630 1631 1632 1633 1634
		return -ESRCH;

	fa_head = get_fa_head(l, plen);
	fa = fib_find_alias(fa_head, tos, 0);

	if (!fa)
		return -ESRCH;

S
Stephen Hemminger 已提交
1635
	pr_debug("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t);
1636 1637

	fa_to_delete = NULL;
1638 1639
	fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
	list_for_each_entry_continue(fa, fa_head, fa_list) {
1640 1641 1642 1643 1644
		struct fib_info *fi = fa->fa_info;

		if (fa->fa_tos != tos)
			break;

1645 1646 1647 1648 1649 1650
		if ((!cfg->fc_type || fa->fa_type == cfg->fc_type) &&
		    (cfg->fc_scope == RT_SCOPE_NOWHERE ||
		     fa->fa_scope == cfg->fc_scope) &&
		    (!cfg->fc_protocol ||
		     fi->fib_protocol == cfg->fc_protocol) &&
		    fib_nh_match(cfg, fi) == 0) {
1651 1652 1653 1654 1655
			fa_to_delete = fa;
			break;
		}
	}

O
Olof Johansson 已提交
1656 1657
	if (!fa_to_delete)
		return -ESRCH;
1658

O
Olof Johansson 已提交
1659
	fa = fa_to_delete;
1660
	rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id,
1661
		  &cfg->fc_nlinfo, 0);
O
Olof Johansson 已提交
1662 1663

	l = fib_find_node(t, key);
R
Robert Olsson 已提交
1664
	li = find_leaf_info(l, plen);
1665

R
Robert Olsson 已提交
1666
	list_del_rcu(&fa->fa_list);
1667

O
Olof Johansson 已提交
1668
	if (list_empty(fa_head)) {
R
Robert Olsson 已提交
1669
		hlist_del_rcu(&li->hlist);
O
Olof Johansson 已提交
1670
		free_leaf_info(li);
R
Robert Olsson 已提交
1671
	}
1672

O
Olof Johansson 已提交
1673
	if (hlist_empty(&l->list))
1674
		trie_leaf_remove(t, l);
1675

O
Olof Johansson 已提交
1676
	if (fa->fa_state & FA_S_ACCESSED)
1677
		rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
1678

R
Robert Olsson 已提交
1679 1680
	fib_release_info(fa->fa_info);
	alias_free_mem_rcu(fa);
O
Olof Johansson 已提交
1681
	return 0;
1682 1683
}

1684
static int trie_flush_list(struct list_head *head)
1685 1686 1687 1688 1689 1690 1691
{
	struct fib_alias *fa, *fa_node;
	int found = 0;

	list_for_each_entry_safe(fa, fa_node, head, fa_list) {
		struct fib_info *fi = fa->fa_info;

R
Robert Olsson 已提交
1692 1693 1694 1695
		if (fi && (fi->fib_flags & RTNH_F_DEAD)) {
			list_del_rcu(&fa->fa_list);
			fib_release_info(fa->fa_info);
			alias_free_mem_rcu(fa);
1696 1697 1698 1699 1700 1701
			found++;
		}
	}
	return found;
}

1702
static int trie_flush_leaf(struct leaf *l)
1703 1704 1705 1706 1707 1708 1709
{
	int found = 0;
	struct hlist_head *lih = &l->list;
	struct hlist_node *node, *tmp;
	struct leaf_info *li = NULL;

	hlist_for_each_entry_safe(li, node, tmp, lih, hlist) {
1710
		found += trie_flush_list(&li->falh);
1711 1712

		if (list_empty(&li->falh)) {
R
Robert Olsson 已提交
1713
			hlist_del_rcu(&li->hlist);
1714 1715 1716 1717 1718 1719
			free_leaf_info(li);
		}
	}
	return found;
}

1720 1721 1722 1723 1724
/*
 * Scan for the next right leaf starting at node p->child[idx]
 * Since we have back pointer, no recursion necessary.
 */
static struct leaf *leaf_walk_rcu(struct tnode *p, struct node *c)
1725
{
1726 1727
	do {
		t_key idx;
1728 1729

		if (c)
1730
			idx = tkey_extract_bits(c->key, p->pos, p->bits) + 1;
1731
		else
1732
			idx = 0;
R
Robert Olsson 已提交
1733

1734 1735
		while (idx < 1u << p->bits) {
			c = tnode_get_child_rcu(p, idx++);
R
Robert Olsson 已提交
1736
			if (!c)
O
Olof Johansson 已提交
1737 1738
				continue;

1739 1740 1741
			if (IS_LEAF(c)) {
				prefetch(p->child[idx]);
				return (struct leaf *) c;
1742
			}
1743 1744 1745 1746

			/* Rescan start scanning in new node */
			p = (struct tnode *) c;
			idx = 0;
1747
		}
1748 1749

		/* Node empty, walk back up to parent */
O
Olof Johansson 已提交
1750
		c = (struct node *) p;
1751 1752 1753 1754 1755 1756 1757
	} while ( (p = node_parent_rcu(c)) != NULL);

	return NULL; /* Root of trie */
}

static struct leaf *trie_firstleaf(struct trie *t)
{
1758 1759 1760
	struct tnode *n = (struct tnode *) rcu_dereference_check(t->trie,
							rcu_read_lock_held() ||
							lockdep_rtnl_is_held());
1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773

	if (!n)
		return NULL;

	if (IS_LEAF(n))          /* trie is just a leaf */
		return (struct leaf *) n;

	return leaf_walk_rcu(n, NULL);
}

static struct leaf *trie_nextleaf(struct leaf *l)
{
	struct node *c = (struct node *) l;
1774
	struct tnode *p = node_parent_rcu(c);
1775 1776 1777 1778 1779

	if (!p)
		return NULL;	/* trie with just one leaf */

	return leaf_walk_rcu(p, c);
1780 1781
}

1782 1783 1784 1785
static struct leaf *trie_leafindex(struct trie *t, int index)
{
	struct leaf *l = trie_firstleaf(t);

S
Stephen Hemminger 已提交
1786
	while (l && index-- > 0)
1787
		l = trie_nextleaf(l);
S
Stephen Hemminger 已提交
1788

1789 1790 1791 1792
	return l;
}


1793 1794 1795
/*
 * Caller must hold RTNL.
 */
1796
int fib_table_flush(struct fib_table *tb)
1797 1798
{
	struct trie *t = (struct trie *) tb->tb_data;
1799
	struct leaf *l, *ll = NULL;
1800
	int found = 0;
1801

1802
	for (l = trie_firstleaf(t); l; l = trie_nextleaf(l)) {
1803
		found += trie_flush_leaf(l);
1804 1805

		if (ll && hlist_empty(&ll->list))
1806
			trie_leaf_remove(t, ll);
1807 1808 1809 1810
		ll = l;
	}

	if (ll && hlist_empty(&ll->list))
1811
		trie_leaf_remove(t, ll);
1812

S
Stephen Hemminger 已提交
1813
	pr_debug("trie_flush found=%d\n", found);
1814 1815 1816
	return found;
}

1817 1818 1819
void fib_table_select_default(struct fib_table *tb,
			      const struct flowi *flp,
			      struct fib_result *res)
1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832
{
	struct trie *t = (struct trie *) tb->tb_data;
	int order, last_idx;
	struct fib_info *fi = NULL;
	struct fib_info *last_resort;
	struct fib_alias *fa = NULL;
	struct list_head *fa_head;
	struct leaf *l;

	last_idx = -1;
	last_resort = NULL;
	order = -1;

R
Robert Olsson 已提交
1833
	rcu_read_lock();
1834

1835
	l = fib_find_node(t, 0);
1836
	if (!l)
1837 1838 1839
		goto out;

	fa_head = get_fa_head(l, 0);
1840
	if (!fa_head)
1841 1842
		goto out;

1843
	if (list_empty(fa_head))
1844 1845
		goto out;

R
Robert Olsson 已提交
1846
	list_for_each_entry_rcu(fa, fa_head, fa_list) {
1847
		struct fib_info *next_fi = fa->fa_info;
O
Olof Johansson 已提交
1848

1849 1850 1851
		if (fa->fa_scope != res->scope ||
		    fa->fa_type != RTN_UNICAST)
			continue;
O
Olof Johansson 已提交
1852

1853 1854 1855 1856 1857 1858
		if (next_fi->fib_priority > res->fi->fib_priority)
			break;
		if (!next_fi->fib_nh[0].nh_gw ||
		    next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
			continue;
		fa->fa_state |= FA_S_ACCESSED;
O
Olof Johansson 已提交
1859

1860 1861 1862 1863
		if (fi == NULL) {
			if (next_fi != res->fi)
				break;
		} else if (!fib_detect_death(fi, order, &last_resort,
1864
					     &last_idx, tb->tb_default)) {
1865
			fib_result_assign(res, fi);
1866
			tb->tb_default = order;
1867 1868 1869 1870 1871 1872
			goto out;
		}
		fi = next_fi;
		order++;
	}
	if (order <= 0 || fi == NULL) {
1873
		tb->tb_default = -1;
1874 1875 1876
		goto out;
	}

1877 1878
	if (!fib_detect_death(fi, order, &last_resort, &last_idx,
				tb->tb_default)) {
1879
		fib_result_assign(res, fi);
1880
		tb->tb_default = order;
1881 1882
		goto out;
	}
1883 1884
	if (last_idx >= 0)
		fib_result_assign(res, last_resort);
1885 1886
	tb->tb_default = last_idx;
out:
R
Robert Olsson 已提交
1887
	rcu_read_unlock();
1888 1889
}

1890 1891
static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah,
			   struct fib_table *tb,
1892 1893 1894 1895
			   struct sk_buff *skb, struct netlink_callback *cb)
{
	int i, s_i;
	struct fib_alias *fa;
A
Al Viro 已提交
1896
	__be32 xkey = htonl(key);
1897

1898
	s_i = cb->args[5];
1899 1900
	i = 0;

R
Robert Olsson 已提交
1901 1902 1903
	/* rcu_read_lock is hold by caller */

	list_for_each_entry_rcu(fa, fah, fa_list) {
1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914
		if (i < s_i) {
			i++;
			continue;
		}

		if (fib_dump_info(skb, NETLINK_CB(cb->skb).pid,
				  cb->nlh->nlmsg_seq,
				  RTM_NEWROUTE,
				  tb->tb_id,
				  fa->fa_type,
				  fa->fa_scope,
1915
				  xkey,
1916 1917
				  plen,
				  fa->fa_tos,
1918
				  fa->fa_info, NLM_F_MULTI) < 0) {
1919
			cb->args[5] = i;
1920
			return -1;
O
Olof Johansson 已提交
1921
		}
1922 1923
		i++;
	}
1924
	cb->args[5] = i;
1925 1926 1927
	return skb->len;
}

1928 1929
static int fn_trie_dump_leaf(struct leaf *l, struct fib_table *tb,
			struct sk_buff *skb, struct netlink_callback *cb)
1930
{
1931 1932 1933
	struct leaf_info *li;
	struct hlist_node *node;
	int i, s_i;
1934

1935
	s_i = cb->args[4];
1936
	i = 0;
1937

1938 1939 1940 1941
	/* rcu_read_lock is hold by caller */
	hlist_for_each_entry_rcu(li, node, &l->list, hlist) {
		if (i < s_i) {
			i++;
1942
			continue;
1943
		}
O
Olof Johansson 已提交
1944

1945
		if (i > s_i)
1946
			cb->args[5] = 0;
1947

1948
		if (list_empty(&li->falh))
1949 1950
			continue;

1951
		if (fn_trie_dump_fa(l->key, li->plen, &li->falh, tb, skb, cb) < 0) {
1952
			cb->args[4] = i;
1953 1954
			return -1;
		}
1955
		i++;
1956
	}
1957

1958
	cb->args[4] = i;
1959 1960 1961
	return skb->len;
}

1962 1963
int fib_table_dump(struct fib_table *tb, struct sk_buff *skb,
		   struct netlink_callback *cb)
1964
{
1965
	struct leaf *l;
1966
	struct trie *t = (struct trie *) tb->tb_data;
1967
	t_key key = cb->args[2];
1968
	int count = cb->args[3];
1969

R
Robert Olsson 已提交
1970
	rcu_read_lock();
1971 1972 1973
	/* Dump starting at last key.
	 * Note: 0.0.0.0/0 (ie default) is first key.
	 */
1974
	if (count == 0)
1975 1976
		l = trie_firstleaf(t);
	else {
1977 1978 1979
		/* Normally, continue from last key, but if that is missing
		 * fallback to using slow rescan
		 */
1980
		l = fib_find_node(t, key);
1981 1982
		if (!l)
			l = trie_leafindex(t, count);
1983
	}
1984

1985 1986
	while (l) {
		cb->args[2] = l->key;
1987
		if (fn_trie_dump_leaf(l, tb, skb, cb) < 0) {
1988
			cb->args[3] = count;
1989 1990
			rcu_read_unlock();
			return -1;
1991
		}
1992

1993
		++count;
1994
		l = trie_nextleaf(l);
1995 1996
		memset(&cb->args[4], 0,
		       sizeof(cb->args) - 4*sizeof(cb->args[0]));
1997
	}
1998
	cb->args[3] = count;
R
Robert Olsson 已提交
1999
	rcu_read_unlock();
2000

2001 2002 2003
	return skb->len;
}

2004 2005
void __init fib_hash_init(void)
{
2006 2007
	fn_alias_kmem = kmem_cache_create("ip_fib_alias",
					  sizeof(struct fib_alias),
2008 2009 2010 2011 2012 2013
					  0, SLAB_PANIC, NULL);

	trie_leaf_kmem = kmem_cache_create("ip_fib_trie",
					   max(sizeof(struct leaf),
					       sizeof(struct leaf_info)),
					   0, SLAB_PANIC, NULL);
2014
}
2015

2016 2017 2018

/* Fix more generic FIB names for init later */
struct fib_table *fib_hash_table(u32 id)
2019 2020 2021 2022 2023 2024 2025 2026 2027 2028
{
	struct fib_table *tb;
	struct trie *t;

	tb = kmalloc(sizeof(struct fib_table) + sizeof(struct trie),
		     GFP_KERNEL);
	if (tb == NULL)
		return NULL;

	tb->tb_id = id;
2029
	tb->tb_default = -1;
2030 2031

	t = (struct trie *) tb->tb_data;
2032
	memset(t, 0, sizeof(*t));
2033 2034

	if (id == RT_TABLE_LOCAL)
2035
		pr_info("IPv4 FIB: Using LC-trie version %s\n", VERSION);
2036 2037 2038 2039

	return tb;
}

2040 2041 2042
#ifdef CONFIG_PROC_FS
/* Depth first Trie walk iterator */
struct fib_trie_iter {
2043
	struct seq_net_private p;
2044
	struct fib_table *tb;
2045 2046 2047 2048
	struct tnode *tnode;
	unsigned index;
	unsigned depth;
};
2049

2050
static struct node *fib_trie_get_next(struct fib_trie_iter *iter)
2051
{
2052 2053 2054
	struct tnode *tn = iter->tnode;
	unsigned cindex = iter->index;
	struct tnode *p;
2055

2056 2057 2058 2059
	/* A single entry routing table */
	if (!tn)
		return NULL;

2060 2061 2062 2063
	pr_debug("get_next iter={node=%p index=%d depth=%d}\n",
		 iter->tnode, iter->index, iter->depth);
rescan:
	while (cindex < (1<<tn->bits)) {
2064
		struct node *n = tnode_get_child_rcu(tn, cindex);
2065

2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077
		if (n) {
			if (IS_LEAF(n)) {
				iter->tnode = tn;
				iter->index = cindex + 1;
			} else {
				/* push down one level */
				iter->tnode = (struct tnode *) n;
				iter->index = 0;
				++iter->depth;
			}
			return n;
		}
2078

2079 2080
		++cindex;
	}
O
Olof Johansson 已提交
2081

2082
	/* Current node exhausted, pop back up */
2083
	p = node_parent_rcu((struct node *)tn);
2084 2085 2086 2087 2088
	if (p) {
		cindex = tkey_extract_bits(tn->key, p->pos, p->bits)+1;
		tn = p;
		--iter->depth;
		goto rescan;
2089
	}
2090 2091 2092

	/* got root? */
	return NULL;
2093 2094
}

2095 2096
static struct node *fib_trie_get_first(struct fib_trie_iter *iter,
				       struct trie *t)
2097
{
2098
	struct node *n;
2099

S
Stephen Hemminger 已提交
2100
	if (!t)
2101 2102 2103
		return NULL;

	n = rcu_dereference(t->trie);
2104
	if (!n)
2105
		return NULL;
2106

2107 2108 2109 2110 2111 2112 2113 2114
	if (IS_TNODE(n)) {
		iter->tnode = (struct tnode *) n;
		iter->index = 0;
		iter->depth = 1;
	} else {
		iter->tnode = NULL;
		iter->index = 0;
		iter->depth = 0;
O
Olof Johansson 已提交
2115
	}
2116 2117

	return n;
2118
}
O
Olof Johansson 已提交
2119

2120 2121 2122 2123
static void trie_collect_stats(struct trie *t, struct trie_stat *s)
{
	struct node *n;
	struct fib_trie_iter iter;
O
Olof Johansson 已提交
2124

2125
	memset(s, 0, sizeof(*s));
O
Olof Johansson 已提交
2126

2127
	rcu_read_lock();
2128
	for (n = fib_trie_get_first(&iter, t); n; n = fib_trie_get_next(&iter)) {
2129
		if (IS_LEAF(n)) {
2130 2131 2132 2133
			struct leaf *l = (struct leaf *)n;
			struct leaf_info *li;
			struct hlist_node *tmp;

2134 2135 2136 2137
			s->leaves++;
			s->totdepth += iter.depth;
			if (iter.depth > s->maxdepth)
				s->maxdepth = iter.depth;
2138 2139 2140

			hlist_for_each_entry_rcu(li, tmp, &l->list, hlist)
				++s->prefixes;
2141 2142 2143 2144 2145
		} else {
			const struct tnode *tn = (const struct tnode *) n;
			int i;

			s->tnodes++;
S
Stephen Hemminger 已提交
2146
			if (tn->bits < MAX_STAT_DEPTH)
R
Robert Olsson 已提交
2147 2148
				s->nodesizes[tn->bits]++;

2149 2150 2151
			for (i = 0; i < (1<<tn->bits); i++)
				if (!tn->child[i])
					s->nullpointers++;
2152 2153
		}
	}
R
Robert Olsson 已提交
2154
	rcu_read_unlock();
2155 2156
}

2157 2158 2159 2160
/*
 *	This outputs /proc/net/fib_triestats
 */
static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat)
2161
{
2162
	unsigned i, max, pointers, bytes, avdepth;
2163

2164 2165 2166 2167
	if (stat->leaves)
		avdepth = stat->totdepth*100 / stat->leaves;
	else
		avdepth = 0;
O
Olof Johansson 已提交
2168

2169 2170
	seq_printf(seq, "\tAver depth:     %u.%02d\n",
		   avdepth / 100, avdepth % 100);
2171
	seq_printf(seq, "\tMax depth:      %u\n", stat->maxdepth);
O
Olof Johansson 已提交
2172

2173 2174
	seq_printf(seq, "\tLeaves:         %u\n", stat->leaves);
	bytes = sizeof(struct leaf) * stat->leaves;
2175 2176 2177 2178

	seq_printf(seq, "\tPrefixes:       %u\n", stat->prefixes);
	bytes += sizeof(struct leaf_info) * stat->prefixes;

2179
	seq_printf(seq, "\tInternal nodes: %u\n\t", stat->tnodes);
2180
	bytes += sizeof(struct tnode) * stat->tnodes;
2181

R
Robert Olsson 已提交
2182 2183
	max = MAX_STAT_DEPTH;
	while (max > 0 && stat->nodesizes[max-1] == 0)
2184
		max--;
2185

2186 2187 2188
	pointers = 0;
	for (i = 1; i <= max; i++)
		if (stat->nodesizes[i] != 0) {
2189
			seq_printf(seq, "  %u: %u",  i, stat->nodesizes[i]);
2190 2191 2192
			pointers += (1<<i) * stat->nodesizes[i];
		}
	seq_putc(seq, '\n');
2193
	seq_printf(seq, "\tPointers: %u\n", pointers);
R
Robert Olsson 已提交
2194

2195
	bytes += sizeof(struct node *) * pointers;
2196 2197
	seq_printf(seq, "Null ptrs: %u\n", stat->nullpointers);
	seq_printf(seq, "Total size: %u  kB\n", (bytes + 1023) / 1024);
2198
}
R
Robert Olsson 已提交
2199

2200
#ifdef CONFIG_IP_FIB_TRIE_STATS
2201 2202 2203 2204
static void trie_show_usage(struct seq_file *seq,
			    const struct trie_use_stats *stats)
{
	seq_printf(seq, "\nCounters:\n---------\n");
2205 2206 2207 2208 2209 2210 2211 2212 2213
	seq_printf(seq, "gets = %u\n", stats->gets);
	seq_printf(seq, "backtracks = %u\n", stats->backtrack);
	seq_printf(seq, "semantic match passed = %u\n",
		   stats->semantic_match_passed);
	seq_printf(seq, "semantic match miss = %u\n",
		   stats->semantic_match_miss);
	seq_printf(seq, "null node hit= %u\n", stats->null_node_hit);
	seq_printf(seq, "skipped node resize = %u\n\n",
		   stats->resize_node_skipped);
2214
}
2215 2216
#endif /*  CONFIG_IP_FIB_TRIE_STATS */

2217
static void fib_table_print(struct seq_file *seq, struct fib_table *tb)
2218
{
2219 2220 2221 2222 2223 2224
	if (tb->tb_id == RT_TABLE_LOCAL)
		seq_puts(seq, "Local:\n");
	else if (tb->tb_id == RT_TABLE_MAIN)
		seq_puts(seq, "Main:\n");
	else
		seq_printf(seq, "Id %d:\n", tb->tb_id);
2225
}
2226

2227

2228 2229
static int fib_triestat_seq_show(struct seq_file *seq, void *v)
{
2230
	struct net *net = (struct net *)seq->private;
2231
	unsigned int h;
2232

2233
	seq_printf(seq,
2234 2235
		   "Basic info: size of leaf:"
		   " %Zd bytes, size of tnode: %Zd bytes.\n",
2236 2237
		   sizeof(struct leaf), sizeof(struct tnode));

2238 2239 2240 2241 2242 2243 2244 2245
	for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
		struct hlist_head *head = &net->ipv4.fib_table_hash[h];
		struct hlist_node *node;
		struct fib_table *tb;

		hlist_for_each_entry_rcu(tb, node, head, tb_hlist) {
			struct trie *t = (struct trie *) tb->tb_data;
			struct trie_stat stat;
2246

2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258
			if (!t)
				continue;

			fib_table_print(seq, tb);

			trie_collect_stats(t, &stat);
			trie_show_stats(seq, &stat);
#ifdef CONFIG_IP_FIB_TRIE_STATS
			trie_show_usage(seq, &t->stats);
#endif
		}
	}
2259

2260
	return 0;
2261 2262
}

2263
static int fib_triestat_seq_open(struct inode *inode, struct file *file)
2264
{
2265
	return single_open_net(inode, file, fib_triestat_seq_show);
2266 2267
}

2268
static const struct file_operations fib_triestat_fops = {
2269 2270 2271 2272
	.owner	= THIS_MODULE,
	.open	= fib_triestat_seq_open,
	.read	= seq_read,
	.llseek	= seq_lseek,
2273
	.release = single_release_net,
2274 2275
};

2276
static struct node *fib_trie_get_idx(struct seq_file *seq, loff_t pos)
2277
{
2278 2279
	struct fib_trie_iter *iter = seq->private;
	struct net *net = seq_file_net(seq);
2280
	loff_t idx = 0;
2281
	unsigned int h;
2282

2283 2284 2285 2286
	for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
		struct hlist_head *head = &net->ipv4.fib_table_hash[h];
		struct hlist_node *node;
		struct fib_table *tb;
2287

2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298
		hlist_for_each_entry_rcu(tb, node, head, tb_hlist) {
			struct node *n;

			for (n = fib_trie_get_first(iter,
						    (struct trie *) tb->tb_data);
			     n; n = fib_trie_get_next(iter))
				if (pos == idx++) {
					iter->tb = tb;
					return n;
				}
		}
2299
	}
2300

2301 2302 2303
	return NULL;
}

2304
static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos)
2305
	__acquires(RCU)
2306
{
2307
	rcu_read_lock();
2308
	return fib_trie_get_idx(seq, *pos);
2309 2310
}

2311
static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2312
{
2313
	struct fib_trie_iter *iter = seq->private;
2314
	struct net *net = seq_file_net(seq);
2315 2316 2317 2318
	struct fib_table *tb = iter->tb;
	struct hlist_node *tb_node;
	unsigned int h;
	struct node *n;
2319

2320
	++*pos;
2321 2322 2323 2324
	/* next node in same table */
	n = fib_trie_get_next(iter);
	if (n)
		return n;
2325

2326 2327 2328 2329 2330 2331 2332 2333
	/* walk rest of this hash chain */
	h = tb->tb_id & (FIB_TABLE_HASHSZ - 1);
	while ( (tb_node = rcu_dereference(tb->tb_hlist.next)) ) {
		tb = hlist_entry(tb_node, struct fib_table, tb_hlist);
		n = fib_trie_get_first(iter, (struct trie *) tb->tb_data);
		if (n)
			goto found;
	}
2334

2335 2336 2337 2338 2339 2340 2341 2342 2343
	/* new hash chain */
	while (++h < FIB_TABLE_HASHSZ) {
		struct hlist_head *head = &net->ipv4.fib_table_hash[h];
		hlist_for_each_entry_rcu(tb, tb_node, head, tb_hlist) {
			n = fib_trie_get_first(iter, (struct trie *) tb->tb_data);
			if (n)
				goto found;
		}
	}
2344
	return NULL;
2345 2346 2347 2348

found:
	iter->tb = tb;
	return n;
2349
}
2350

2351
static void fib_trie_seq_stop(struct seq_file *seq, void *v)
2352
	__releases(RCU)
2353
{
2354 2355
	rcu_read_unlock();
}
O
Olof Johansson 已提交
2356

2357 2358 2359 2360
static void seq_indent(struct seq_file *seq, int n)
{
	while (n-- > 0) seq_puts(seq, "   ");
}
2361

2362
static inline const char *rtn_scope(char *buf, size_t len, enum rt_scope_t s)
2363
{
S
Stephen Hemminger 已提交
2364
	switch (s) {
2365 2366 2367 2368 2369 2370
	case RT_SCOPE_UNIVERSE: return "universe";
	case RT_SCOPE_SITE:	return "site";
	case RT_SCOPE_LINK:	return "link";
	case RT_SCOPE_HOST:	return "host";
	case RT_SCOPE_NOWHERE:	return "nowhere";
	default:
2371
		snprintf(buf, len, "scope=%d", s);
2372 2373 2374
		return buf;
	}
}
2375

2376
static const char *const rtn_type_names[__RTN_MAX] = {
2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389
	[RTN_UNSPEC] = "UNSPEC",
	[RTN_UNICAST] = "UNICAST",
	[RTN_LOCAL] = "LOCAL",
	[RTN_BROADCAST] = "BROADCAST",
	[RTN_ANYCAST] = "ANYCAST",
	[RTN_MULTICAST] = "MULTICAST",
	[RTN_BLACKHOLE] = "BLACKHOLE",
	[RTN_UNREACHABLE] = "UNREACHABLE",
	[RTN_PROHIBIT] = "PROHIBIT",
	[RTN_THROW] = "THROW",
	[RTN_NAT] = "NAT",
	[RTN_XRESOLVE] = "XRESOLVE",
};
2390

2391
static inline const char *rtn_type(char *buf, size_t len, unsigned t)
2392 2393 2394
{
	if (t < __RTN_MAX && rtn_type_names[t])
		return rtn_type_names[t];
2395
	snprintf(buf, len, "type %u", t);
2396
	return buf;
2397 2398
}

2399 2400
/* Pretty print the trie */
static int fib_trie_seq_show(struct seq_file *seq, void *v)
2401
{
2402 2403
	const struct fib_trie_iter *iter = seq->private;
	struct node *n = v;
2404

2405 2406
	if (!node_parent_rcu(n))
		fib_table_print(seq, iter->tb);
2407

2408 2409
	if (IS_TNODE(n)) {
		struct tnode *tn = (struct tnode *) n;
S
Stephen Hemminger 已提交
2410
		__be32 prf = htonl(mask_pfx(tn->key, tn->pos));
O
Olof Johansson 已提交
2411

2412
		seq_indent(seq, iter->depth-1);
2413 2414
		seq_printf(seq, "  +-- %pI4/%d %d %d %d\n",
			   &prf, tn->pos, tn->bits, tn->full_children,
2415
			   tn->empty_children);
2416

2417 2418
	} else {
		struct leaf *l = (struct leaf *) n;
2419 2420
		struct leaf_info *li;
		struct hlist_node *node;
A
Al Viro 已提交
2421
		__be32 val = htonl(l->key);
2422 2423

		seq_indent(seq, iter->depth);
2424
		seq_printf(seq, "  |-- %pI4\n", &val);
2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438

		hlist_for_each_entry_rcu(li, node, &l->list, hlist) {
			struct fib_alias *fa;

			list_for_each_entry_rcu(fa, &li->falh, fa_list) {
				char buf1[32], buf2[32];

				seq_indent(seq, iter->depth+1);
				seq_printf(seq, "  /%d %s %s", li->plen,
					   rtn_scope(buf1, sizeof(buf1),
						     fa->fa_scope),
					   rtn_type(buf2, sizeof(buf2),
						    fa->fa_type));
				if (fa->fa_tos)
2439
					seq_printf(seq, " tos=%d", fa->fa_tos);
2440
				seq_putc(seq, '\n');
2441 2442
			}
		}
2443
	}
2444

2445 2446 2447
	return 0;
}

2448
static const struct seq_operations fib_trie_seq_ops = {
2449 2450 2451 2452
	.start  = fib_trie_seq_start,
	.next   = fib_trie_seq_next,
	.stop   = fib_trie_seq_stop,
	.show   = fib_trie_seq_show,
2453 2454
};

2455
static int fib_trie_seq_open(struct inode *inode, struct file *file)
2456
{
2457 2458
	return seq_open_net(inode, file, &fib_trie_seq_ops,
			    sizeof(struct fib_trie_iter));
2459 2460
}

2461
static const struct file_operations fib_trie_fops = {
2462 2463 2464 2465
	.owner  = THIS_MODULE,
	.open   = fib_trie_seq_open,
	.read   = seq_read,
	.llseek = seq_lseek,
2466
	.release = seq_release_net,
2467 2468
};

2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508
struct fib_route_iter {
	struct seq_net_private p;
	struct trie *main_trie;
	loff_t	pos;
	t_key	key;
};

static struct leaf *fib_route_get_idx(struct fib_route_iter *iter, loff_t pos)
{
	struct leaf *l = NULL;
	struct trie *t = iter->main_trie;

	/* use cache location of last found key */
	if (iter->pos > 0 && pos >= iter->pos && (l = fib_find_node(t, iter->key)))
		pos -= iter->pos;
	else {
		iter->pos = 0;
		l = trie_firstleaf(t);
	}

	while (l && pos-- > 0) {
		iter->pos++;
		l = trie_nextleaf(l);
	}

	if (l)
		iter->key = pos;	/* remember it */
	else
		iter->pos = 0;		/* forget it */

	return l;
}

static void *fib_route_seq_start(struct seq_file *seq, loff_t *pos)
	__acquires(RCU)
{
	struct fib_route_iter *iter = seq->private;
	struct fib_table *tb;

	rcu_read_lock();
2509
	tb = fib_get_table(seq_file_net(seq), RT_TABLE_MAIN);
2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546
	if (!tb)
		return NULL;

	iter->main_trie = (struct trie *) tb->tb_data;
	if (*pos == 0)
		return SEQ_START_TOKEN;
	else
		return fib_route_get_idx(iter, *pos - 1);
}

static void *fib_route_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct fib_route_iter *iter = seq->private;
	struct leaf *l = v;

	++*pos;
	if (v == SEQ_START_TOKEN) {
		iter->pos = 0;
		l = trie_firstleaf(iter->main_trie);
	} else {
		iter->pos++;
		l = trie_nextleaf(l);
	}

	if (l)
		iter->key = l->key;
	else
		iter->pos = 0;
	return l;
}

static void fib_route_seq_stop(struct seq_file *seq, void *v)
	__releases(RCU)
{
	rcu_read_unlock();
}

A
Al Viro 已提交
2547
static unsigned fib_flag_trans(int type, __be32 mask, const struct fib_info *fi)
2548
{
2549 2550 2551 2552
	static unsigned type2flags[RTN_MAX + 1] = {
		[7] = RTF_REJECT, [8] = RTF_REJECT,
	};
	unsigned flags = type2flags[type];
2553

2554 2555
	if (fi && fi->fib_nh->nh_gw)
		flags |= RTF_GATEWAY;
A
Al Viro 已提交
2556
	if (mask == htonl(0xFFFFFFFF))
2557 2558 2559
		flags |= RTF_HOST;
	flags |= RTF_UP;
	return flags;
2560 2561
}

2562 2563 2564 2565 2566 2567 2568
/*
 *	This outputs /proc/net/route.
 *	The format of the file is not supposed to be changed
 * 	and needs to be same as fib_hash output to avoid breaking
 *	legacy utilities
 */
static int fib_route_seq_show(struct seq_file *seq, void *v)
2569
{
2570
	struct leaf *l = v;
2571 2572
	struct leaf_info *li;
	struct hlist_node *node;
2573

2574 2575 2576 2577 2578 2579
	if (v == SEQ_START_TOKEN) {
		seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway "
			   "\tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU"
			   "\tWindow\tIRTT");
		return 0;
	}
2580

2581
	hlist_for_each_entry_rcu(li, node, &l->list, hlist) {
2582
		struct fib_alias *fa;
A
Al Viro 已提交
2583
		__be32 mask, prefix;
O
Olof Johansson 已提交
2584

2585 2586
		mask = inet_make_mask(li->plen);
		prefix = htonl(l->key);
2587

2588
		list_for_each_entry_rcu(fa, &li->falh, fa_list) {
2589
			const struct fib_info *fi = fa->fa_info;
2590
			unsigned flags = fib_flag_trans(fa->fa_type, mask, fi);
2591
			int len;
2592

2593 2594 2595
			if (fa->fa_type == RTN_BROADCAST
			    || fa->fa_type == RTN_MULTICAST)
				continue;
2596

2597
			if (fi)
2598 2599 2600
				seq_printf(seq,
					 "%s\t%08X\t%08X\t%04X\t%d\t%u\t"
					 "%d\t%08X\t%d\t%u\t%u%n",
2601 2602 2603 2604 2605
					 fi->fib_dev ? fi->fib_dev->name : "*",
					 prefix,
					 fi->fib_nh->nh_gw, flags, 0, 0,
					 fi->fib_priority,
					 mask,
2606 2607
					 (fi->fib_advmss ?
					  fi->fib_advmss + 40 : 0),
2608
					 fi->fib_window,
2609
					 fi->fib_rtt >> 3, &len);
2610
			else
2611 2612 2613
				seq_printf(seq,
					 "*\t%08X\t%08X\t%04X\t%d\t%u\t"
					 "%d\t%08X\t%d\t%u\t%u%n",
2614
					 prefix, 0, flags, 0, 0, 0,
2615
					 mask, 0, 0, 0, &len);
2616

2617
			seq_printf(seq, "%*s\n", 127 - len, "");
2618
		}
2619 2620 2621 2622 2623
	}

	return 0;
}

2624
static const struct seq_operations fib_route_seq_ops = {
2625 2626 2627
	.start  = fib_route_seq_start,
	.next   = fib_route_seq_next,
	.stop   = fib_route_seq_stop,
2628
	.show   = fib_route_seq_show,
2629 2630
};

2631
static int fib_route_seq_open(struct inode *inode, struct file *file)
2632
{
2633
	return seq_open_net(inode, file, &fib_route_seq_ops,
2634
			    sizeof(struct fib_route_iter));
2635 2636
}

2637
static const struct file_operations fib_route_fops = {
2638 2639 2640 2641
	.owner  = THIS_MODULE,
	.open   = fib_route_seq_open,
	.read   = seq_read,
	.llseek = seq_lseek,
2642
	.release = seq_release_net,
2643 2644
};

2645
int __net_init fib_proc_init(struct net *net)
2646
{
2647
	if (!proc_net_fops_create(net, "fib_trie", S_IRUGO, &fib_trie_fops))
2648 2649
		goto out1;

2650 2651
	if (!proc_net_fops_create(net, "fib_triestat", S_IRUGO,
				  &fib_triestat_fops))
2652 2653
		goto out2;

2654
	if (!proc_net_fops_create(net, "route", S_IRUGO, &fib_route_fops))
2655 2656
		goto out3;

2657
	return 0;
2658 2659

out3:
2660
	proc_net_remove(net, "fib_triestat");
2661
out2:
2662
	proc_net_remove(net, "fib_trie");
2663 2664
out1:
	return -ENOMEM;
2665 2666
}

2667
void __net_exit fib_proc_exit(struct net *net)
2668
{
2669 2670 2671
	proc_net_remove(net, "fib_trie");
	proc_net_remove(net, "fib_triestat");
	proc_net_remove(net, "route");
2672 2673 2674
}

#endif /* CONFIG_PROC_FS */