fib_trie.c 61.8 KB
Newer Older
1 2 3 4 5 6 7 8 9
/*
 *   This program is free software; you can redistribute it and/or
 *   modify it under the terms of the GNU General Public License
 *   as published by the Free Software Foundation; either version
 *   2 of the License, or (at your option) any later version.
 *
 *   Robert Olsson <robert.olsson@its.uu.se> Uppsala Universitet
 *     & Swedish University of Agricultural Sciences.
 *
10
 *   Jens Laas <jens.laas@data.slu.se> Swedish University of
11
 *     Agricultural Sciences.
12
 *
13 14 15
 *   Hans Liss <hans.liss@its.uu.se>  Uppsala Universitet
 *
 * This work is based on the LPC-trie which is originally descibed in:
16
 *
17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41
 * An experimental study of compression methods for dynamic tries
 * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
 * http://www.nada.kth.se/~snilsson/public/papers/dyntrie2/
 *
 *
 * IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
 * IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999
 *
 *
 * Code from fib_hash has been reused which includes the following header:
 *
 *
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		IPv4 FIB: lookup engine and maintenance routines.
 *
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
R
Robert Olsson 已提交
42 43 44 45 46 47 48
 *
 * Substantial contributions to this work comes from:
 *
 *		David S. Miller, <davem@davemloft.net>
 *		Stephen Hemminger <shemminger@osdl.org>
 *		Paul E. McKenney <paulmck@us.ibm.com>
 *		Patrick McHardy <kaber@trash.net>
49 50
 */

R
Robert Olsson 已提交
51
#define VERSION "0.408"
52 53 54

#include <asm/uaccess.h>
#include <asm/system.h>
J
Jiri Slaby 已提交
55
#include <linux/bitops.h>
56 57 58 59 60 61 62 63 64
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
S
Stephen Hemminger 已提交
65
#include <linux/inetdevice.h>
66 67 68
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/proc_fs.h>
R
Robert Olsson 已提交
69
#include <linux/rcupdate.h>
70 71 72 73
#include <linux/skbuff.h>
#include <linux/netlink.h>
#include <linux/init.h>
#include <linux/list.h>
74
#include <net/net_namespace.h>
75 76 77 78 79 80 81 82
#include <net/ip.h>
#include <net/protocol.h>
#include <net/route.h>
#include <net/tcp.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include "fib_lookup.h"

R
Robert Olsson 已提交
83
#define MAX_STAT_DEPTH 32
84 85 86 87 88 89 90 91

#define KEYLENGTH (8*sizeof(t_key))

typedef unsigned int t_key;

#define T_TNODE 0
#define T_LEAF  1
#define NODE_TYPE_MASK	0x1UL
R
Robert Olsson 已提交
92 93
#define NODE_TYPE(node) ((node)->parent & NODE_TYPE_MASK)

O
Olof Johansson 已提交
94 95
#define IS_TNODE(n) (!(n->parent & T_LEAF))
#define IS_LEAF(n) (n->parent & T_LEAF)
96 97

struct node {
O
Olof Johansson 已提交
98
	unsigned long parent;
99
	t_key key;
100 101 102
};

struct leaf {
O
Olof Johansson 已提交
103
	unsigned long parent;
104
	t_key key;
105
	struct hlist_head list;
R
Robert Olsson 已提交
106
	struct rcu_head rcu;
107 108 109 110
};

struct leaf_info {
	struct hlist_node hlist;
R
Robert Olsson 已提交
111
	struct rcu_head rcu;
112 113 114 115 116
	int plen;
	struct list_head falh;
};

struct tnode {
O
Olof Johansson 已提交
117
	unsigned long parent;
118
	t_key key;
119 120
	unsigned char pos;		/* 2log(KEYLENGTH) bits needed */
	unsigned char bits;		/* 2log(KEYLENGTH) bits needed */
121 122
	unsigned int full_children;	/* KEYLENGTH bits needed */
	unsigned int empty_children;	/* KEYLENGTH bits needed */
123 124 125
	union {
		struct rcu_head rcu;
		struct work_struct work;
J
Jarek Poplawski 已提交
126
		struct tnode *tnode_free;
127
	};
O
Olof Johansson 已提交
128
	struct node *child[0];
129 130 131 132 133 134 135 136 137
};

#ifdef CONFIG_IP_FIB_TRIE_STATS
struct trie_use_stats {
	unsigned int gets;
	unsigned int backtrack;
	unsigned int semantic_match_passed;
	unsigned int semantic_match_miss;
	unsigned int null_node_hit;
138
	unsigned int resize_node_skipped;
139 140 141 142 143 144 145 146 147
};
#endif

struct trie_stat {
	unsigned int totdepth;
	unsigned int maxdepth;
	unsigned int tnodes;
	unsigned int leaves;
	unsigned int nullpointers;
148
	unsigned int prefixes;
R
Robert Olsson 已提交
149
	unsigned int nodesizes[MAX_STAT_DEPTH];
150
};
151 152

struct trie {
O
Olof Johansson 已提交
153
	struct node *trie;
154 155 156 157 158 159
#ifdef CONFIG_IP_FIB_TRIE_STATS
	struct trie_use_stats stats;
#endif
};

static void put_child(struct trie *t, struct tnode *tn, int i, struct node *n);
160 161
static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n,
				  int wasfull);
162
static struct node *resize(struct trie *t, struct tnode *tn);
163 164
static struct tnode *inflate(struct trie *t, struct tnode *tn);
static struct tnode *halve(struct trie *t, struct tnode *tn);
J
Jarek Poplawski 已提交
165 166
/* tnodes to free after resize(); protected by RTNL */
static struct tnode *tnode_free_head;
167

168
static struct kmem_cache *fn_alias_kmem __read_mostly;
169
static struct kmem_cache *trie_leaf_kmem __read_mostly;
170

S
Stephen Hemminger 已提交
171 172
static inline struct tnode *node_parent(struct node *node)
{
173 174 175 176 177 178
	return (struct tnode *)(node->parent & ~NODE_TYPE_MASK);
}

static inline struct tnode *node_parent_rcu(struct node *node)
{
	struct tnode *ret = node_parent(node);
S
Stephen Hemminger 已提交
179 180 181 182

	return rcu_dereference(ret);
}

183 184 185
/* Same as rcu_assign_pointer
 * but that macro() assumes that value is a pointer.
 */
S
Stephen Hemminger 已提交
186 187
static inline void node_set_parent(struct node *node, struct tnode *ptr)
{
188 189
	smp_wmb();
	node->parent = (unsigned long)ptr | NODE_TYPE(node);
S
Stephen Hemminger 已提交
190
}
R
Robert Olsson 已提交
191

192 193 194
static inline struct node *tnode_get_child(struct tnode *tn, unsigned int i)
{
	BUG_ON(i >= 1U << tn->bits);
R
Robert Olsson 已提交
195

196 197 198 199
	return tn->child[i];
}

static inline struct node *tnode_get_child_rcu(struct tnode *tn, unsigned int i)
200
{
201
	struct node *ret = tnode_get_child(tn, i);
202

203
	return rcu_dereference(ret);
204 205
}

S
Stephen Hemmigner 已提交
206
static inline int tnode_child_length(const struct tnode *tn)
207
{
O
Olof Johansson 已提交
208
	return 1 << tn->bits;
209 210
}

S
Stephen Hemminger 已提交
211 212 213 214 215
static inline t_key mask_pfx(t_key k, unsigned short l)
{
	return (l == 0) ? 0 : k >> (KEYLENGTH-l) << (KEYLENGTH-l);
}

216 217
static inline t_key tkey_extract_bits(t_key a, int offset, int bits)
{
O
Olof Johansson 已提交
218
	if (offset < KEYLENGTH)
219
		return ((t_key)(a << offset)) >> (KEYLENGTH - bits);
O
Olof Johansson 已提交
220
	else
221 222 223 224 225
		return 0;
}

static inline int tkey_equals(t_key a, t_key b)
{
226
	return a == b;
227 228 229 230
}

static inline int tkey_sub_equals(t_key a, int offset, int bits, t_key b)
{
231 232
	if (bits == 0 || offset >= KEYLENGTH)
		return 1;
O
Olof Johansson 已提交
233 234
	bits = bits > KEYLENGTH ? KEYLENGTH : bits;
	return ((a ^ b) << offset) >> (KEYLENGTH - bits) == 0;
235
}
236 237 238 239 240 241

static inline int tkey_mismatch(t_key a, int offset, t_key b)
{
	t_key diff = a ^ b;
	int i = offset;

242 243 244
	if (!diff)
		return 0;
	while ((diff << i) >> (KEYLENGTH-1) == 0)
245 246 247 248 249
		i++;
	return i;
}

/*
250 251
  To understand this stuff, an understanding of keys and all their bits is
  necessary. Every node in the trie has a key associated with it, but not
252 253 254 255
  all of the bits in that key are significant.

  Consider a node 'n' and its parent 'tp'.

256 257 258 259 260
  If n is a leaf, every bit in its key is significant. Its presence is
  necessitated by path compression, since during a tree traversal (when
  searching for a leaf - unless we are doing an insertion) we will completely
  ignore all skipped bits we encounter. Thus we need to verify, at the end of
  a potentially successful search, that we have indeed been walking the
261 262
  correct key path.

263 264 265 266 267
  Note that we can never "miss" the correct key in the tree if present by
  following the wrong path. Path compression ensures that segments of the key
  that are the same for all keys with a given prefix are skipped, but the
  skipped part *is* identical for each node in the subtrie below the skipped
  bit! trie_insert() in this implementation takes care of that - note the
268 269
  call to tkey_sub_equals() in trie_insert().

270
  if n is an internal node - a 'tnode' here, the various parts of its key
271 272
  have many different meanings.

273
  Example:
274 275 276
  _________________________________________________________________
  | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
  -----------------------------------------------------------------
277
    0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15
278 279 280 281 282 283 284 285 286

  _________________________________________________________________
  | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
  -----------------------------------------------------------------
   16  17  18  19  20  21  22  23  24  25  26  27  28  29  30  31

  tp->pos = 7
  tp->bits = 3
  n->pos = 15
O
Olof Johansson 已提交
287
  n->bits = 4
288

289 290
  First, let's just ignore the bits that come before the parent tp, that is
  the bits from 0 to (tp->pos-1). They are *known* but at this point we do
291 292 293
  not use them for anything.

  The bits from (tp->pos) to (tp->pos + tp->bits - 1) - "N", above - are the
294
  index into the parent's child array. That is, they will be used to find
295 296 297 298 299
  'n' among tp's children.

  The bits from (tp->pos + tp->bits) to (n->pos - 1) - "S" - are skipped bits
  for the node n.

300
  All the bits we have seen so far are significant to the node n. The rest
301 302
  of the bits are really not needed or indeed known in n->key.

303
  The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into
304
  n's child array, and will of course be different for each child.
305

306

307 308 309 310 311
  The rest of the bits, from (n->pos + n->bits) onward, are completely unknown
  at this point.

*/

S
Stephen Hemminger 已提交
312
static inline void check_tnode(const struct tnode *tn)
313
{
S
Stephen Hemminger 已提交
314
	WARN_ON(tn && tn->pos+tn->bits > 32);
315 316
}

317 318 319 320
static const int halve_threshold = 25;
static const int inflate_threshold = 50;
static const int halve_threshold_root = 8;
static const int inflate_threshold_root = 15;
321

R
Robert Olsson 已提交
322 323

static void __alias_free_mem(struct rcu_head *head)
324
{
R
Robert Olsson 已提交
325 326
	struct fib_alias *fa = container_of(head, struct fib_alias, rcu);
	kmem_cache_free(fn_alias_kmem, fa);
327 328
}

R
Robert Olsson 已提交
329
static inline void alias_free_mem_rcu(struct fib_alias *fa)
330
{
R
Robert Olsson 已提交
331 332
	call_rcu(&fa->rcu, __alias_free_mem);
}
O
Olof Johansson 已提交
333

R
Robert Olsson 已提交
334 335
static void __leaf_free_rcu(struct rcu_head *head)
{
336 337
	struct leaf *l = container_of(head, struct leaf, rcu);
	kmem_cache_free(trie_leaf_kmem, l);
R
Robert Olsson 已提交
338
}
O
Olof Johansson 已提交
339

340 341 342 343 344
static inline void free_leaf(struct leaf *l)
{
	call_rcu_bh(&l->rcu, __leaf_free_rcu);
}

R
Robert Olsson 已提交
345
static void __leaf_info_free_rcu(struct rcu_head *head)
346
{
R
Robert Olsson 已提交
347
	kfree(container_of(head, struct leaf_info, rcu));
348 349
}

R
Robert Olsson 已提交
350
static inline void free_leaf_info(struct leaf_info *leaf)
351
{
R
Robert Olsson 已提交
352
	call_rcu(&leaf->rcu, __leaf_info_free_rcu);
353 354
}

355
static struct tnode *tnode_alloc(size_t size)
356
{
R
Robert Olsson 已提交
357
	if (size <= PAGE_SIZE)
358
		return kzalloc(size, GFP_KERNEL);
359 360 361
	else
		return __vmalloc(size, GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL);
}
R
Robert Olsson 已提交
362

363 364 365 366
static void __tnode_vfree(struct work_struct *arg)
{
	struct tnode *tn = container_of(arg, struct tnode, work);
	vfree(tn);
367 368
}

R
Robert Olsson 已提交
369
static void __tnode_free_rcu(struct rcu_head *head)
370
{
R
Robert Olsson 已提交
371
	struct tnode *tn = container_of(head, struct tnode, rcu);
372 373
	size_t size = sizeof(struct tnode) +
		      (sizeof(struct node *) << tn->bits);
374 375 376

	if (size <= PAGE_SIZE)
		kfree(tn);
377 378 379 380
	else {
		INIT_WORK(&tn->work, __tnode_vfree);
		schedule_work(&tn->work);
	}
381 382
}

R
Robert Olsson 已提交
383 384
static inline void tnode_free(struct tnode *tn)
{
385 386 387
	if (IS_LEAF(tn))
		free_leaf((struct leaf *) tn);
	else
R
Robert Olsson 已提交
388
		call_rcu(&tn->rcu, __tnode_free_rcu);
R
Robert Olsson 已提交
389 390
}

J
Jarek Poplawski 已提交
391 392 393
static void tnode_free_safe(struct tnode *tn)
{
	BUG_ON(IS_LEAF(tn));
394 395
	tn->tnode_free = tnode_free_head;
	tnode_free_head = tn;
J
Jarek Poplawski 已提交
396 397 398 399 400 401 402 403 404 405 406 407 408
}

static void tnode_free_flush(void)
{
	struct tnode *tn;

	while ((tn = tnode_free_head)) {
		tnode_free_head = tn->tnode_free;
		tn->tnode_free = NULL;
		tnode_free(tn);
	}
}

R
Robert Olsson 已提交
409 410
static struct leaf *leaf_new(void)
{
411
	struct leaf *l = kmem_cache_alloc(trie_leaf_kmem, GFP_KERNEL);
R
Robert Olsson 已提交
412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428
	if (l) {
		l->parent = T_LEAF;
		INIT_HLIST_HEAD(&l->list);
	}
	return l;
}

static struct leaf_info *leaf_info_new(int plen)
{
	struct leaf_info *li = kmalloc(sizeof(struct leaf_info),  GFP_KERNEL);
	if (li) {
		li->plen = plen;
		INIT_LIST_HEAD(&li->falh);
	}
	return li;
}

429
static struct tnode *tnode_new(t_key key, int pos, int bits)
430
{
431
	size_t sz = sizeof(struct tnode) + (sizeof(struct node *) << bits);
432
	struct tnode *tn = tnode_alloc(sz);
433

O
Olof Johansson 已提交
434
	if (tn) {
R
Robert Olsson 已提交
435
		tn->parent = T_TNODE;
436 437 438 439 440 441
		tn->pos = pos;
		tn->bits = bits;
		tn->key = key;
		tn->full_children = 0;
		tn->empty_children = 1<<bits;
	}
442

443 444
	pr_debug("AT %p s=%u %lu\n", tn, (unsigned int) sizeof(struct tnode),
		 (unsigned long) (sizeof(struct node) << bits));
445 446 447 448 449 450 451 452
	return tn;
}

/*
 * Check whether a tnode 'n' is "full", i.e. it is an internal node
 * and no bits are skipped. See discussion in dyntree paper p. 6
 */

S
Stephen Hemmigner 已提交
453
static inline int tnode_full(const struct tnode *tn, const struct node *n)
454
{
455
	if (n == NULL || IS_LEAF(n))
456 457 458 459 460
		return 0;

	return ((struct tnode *) n)->pos == tn->pos + tn->bits;
}

461 462
static inline void put_child(struct trie *t, struct tnode *tn, int i,
			     struct node *n)
463 464 465 466
{
	tnode_put_child_reorg(tn, i, n, -1);
}

467
 /*
468 469 470 471
  * Add a child at position i overwriting the old value.
  * Update the value of full_children and empty_children.
  */

472 473
static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n,
				  int wasfull)
474
{
R
Robert Olsson 已提交
475
	struct node *chi = tn->child[i];
476 477
	int isfull;

S
Stephen Hemminger 已提交
478 479
	BUG_ON(i >= 1<<tn->bits);

480 481 482 483 484
	/* update emptyChildren */
	if (n == NULL && chi != NULL)
		tn->empty_children++;
	else if (n != NULL && chi == NULL)
		tn->empty_children--;
485

486
	/* update fullChildren */
O
Olof Johansson 已提交
487
	if (wasfull == -1)
488 489 490
		wasfull = tnode_full(tn, chi);

	isfull = tnode_full(tn, n);
491
	if (wasfull && !isfull)
492
		tn->full_children--;
493
	else if (!wasfull && isfull)
494
		tn->full_children++;
O
Olof Johansson 已提交
495

496
	if (n)
S
Stephen Hemminger 已提交
497
		node_set_parent(n, tn);
498

R
Robert Olsson 已提交
499
	rcu_assign_pointer(tn->child[i], n);
500 501
}

502
static struct node *resize(struct trie *t, struct tnode *tn)
503 504
{
	int i;
505
	int err = 0;
506
	struct tnode *old_tn;
507 508
	int inflate_threshold_use;
	int halve_threshold_use;
R
Robert Olsson 已提交
509
	int max_resize;
510

511
	if (!tn)
512 513
		return NULL;

S
Stephen Hemminger 已提交
514 515
	pr_debug("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
		 tn, inflate_threshold, halve_threshold);
516 517 518

	/* No children */
	if (tn->empty_children == tnode_child_length(tn)) {
J
Jarek Poplawski 已提交
519
		tnode_free_safe(tn);
520 521 522 523 524
		return NULL;
	}
	/* One child */
	if (tn->empty_children == tnode_child_length(tn) - 1)
		for (i = 0; i < tnode_child_length(tn); i++) {
O
Olof Johansson 已提交
525
			struct node *n;
526

O
Olof Johansson 已提交
527
			n = tn->child[i];
R
Robert Olsson 已提交
528
			if (!n)
O
Olof Johansson 已提交
529 530 531
				continue;

			/* compress one level */
S
Stephen Hemminger 已提交
532
			node_set_parent(n, NULL);
J
Jarek Poplawski 已提交
533
			tnode_free_safe(tn);
O
Olof Johansson 已提交
534
			return n;
535
		}
536
	/*
537 538 539 540 541
	 * Double as long as the resulting node has a number of
	 * nonempty nodes that are above the threshold.
	 */

	/*
542 543
	 * From "Implementing a dynamic compressed trie" by Stefan Nilsson of
	 * the Helsinki University of Technology and Matti Tikkanen of Nokia
544
	 * Telecommunications, page 6:
545
	 * "A node is doubled if the ratio of non-empty children to all
546 547
	 * children in the *doubled* node is at least 'high'."
	 *
548 549 550 551 552
	 * 'high' in this instance is the variable 'inflate_threshold'. It
	 * is expressed as a percentage, so we multiply it with
	 * tnode_child_length() and instead of multiplying by 2 (since the
	 * child array will be doubled by inflate()) and multiplying
	 * the left-hand side by 100 (to handle the percentage thing) we
553
	 * multiply the left-hand side by 50.
554 555 556 557
	 *
	 * The left-hand side may look a bit weird: tnode_child_length(tn)
	 * - tn->empty_children is of course the number of non-null children
	 * in the current node. tn->full_children is the number of "full"
558
	 * children, that is non-null tnodes with a skip value of 0.
559
	 * All of those will be doubled in the resulting inflated tnode, so
560
	 * we just count them one extra time here.
561
	 *
562
	 * A clearer way to write this would be:
563
	 *
564
	 * to_be_doubled = tn->full_children;
565
	 * not_to_be_doubled = tnode_child_length(tn) - tn->empty_children -
566 567 568 569
	 *     tn->full_children;
	 *
	 * new_child_length = tnode_child_length(tn) * 2;
	 *
570
	 * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) /
571 572
	 *      new_child_length;
	 * if (new_fill_factor >= inflate_threshold)
573 574 575
	 *
	 * ...and so on, tho it would mess up the while () loop.
	 *
576 577 578
	 * anyway,
	 * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >=
	 *      inflate_threshold
579
	 *
580 581 582
	 * avoid a division:
	 * 100 * (not_to_be_doubled + 2*to_be_doubled) >=
	 *      inflate_threshold * new_child_length
583
	 *
584
	 * expand not_to_be_doubled and to_be_doubled, and shorten:
585
	 * 100 * (tnode_child_length(tn) - tn->empty_children +
O
Olof Johansson 已提交
586
	 *    tn->full_children) >= inflate_threshold * new_child_length
587
	 *
588
	 * expand new_child_length:
589
	 * 100 * (tnode_child_length(tn) - tn->empty_children +
O
Olof Johansson 已提交
590
	 *    tn->full_children) >=
591
	 *      inflate_threshold * tnode_child_length(tn) * 2
592
	 *
593
	 * shorten again:
594
	 * 50 * (tn->full_children + tnode_child_length(tn) -
O
Olof Johansson 已提交
595
	 *    tn->empty_children) >= inflate_threshold *
596
	 *    tnode_child_length(tn)
597
	 *
598 599 600
	 */

	check_tnode(tn);
601

602 603
	/* Keep root node larger  */

S
Stephen Hemminger 已提交
604
	if (!tn->parent)
605
		inflate_threshold_use = inflate_threshold_root;
606
	else
607 608
		inflate_threshold_use = inflate_threshold;

609
	err = 0;
R
Robert Olsson 已提交
610 611
	max_resize = 10;
	while ((tn->full_children > 0 &&  max_resize-- &&
612 613 614
		50 * (tn->full_children + tnode_child_length(tn)
		      - tn->empty_children)
		>= inflate_threshold_use * tnode_child_length(tn))) {
615

616 617
		old_tn = tn;
		tn = inflate(t, tn);
618

619 620
		if (IS_ERR(tn)) {
			tn = old_tn;
621 622 623 624 625
#ifdef CONFIG_IP_FIB_TRIE_STATS
			t->stats.resize_node_skipped++;
#endif
			break;
		}
626 627
	}

R
Robert Olsson 已提交
628 629
	if (max_resize < 0) {
		if (!tn->parent)
630 631 632
			pr_warning("Fix inflate_threshold_root."
				   " Now=%d size=%d bits\n",
				   inflate_threshold_root, tn->bits);
R
Robert Olsson 已提交
633
		else
634 635 636
			pr_warning("Fix inflate_threshold."
				   " Now=%d size=%d bits\n",
				   inflate_threshold, tn->bits);
R
Robert Olsson 已提交
637 638
	}

639 640 641 642 643 644
	check_tnode(tn);

	/*
	 * Halve as long as the number of empty children in this
	 * node is above threshold.
	 */
645

646 647 648

	/* Keep root node larger  */

S
Stephen Hemminger 已提交
649
	if (!tn->parent)
650
		halve_threshold_use = halve_threshold_root;
651
	else
652 653
		halve_threshold_use = halve_threshold;

654
	err = 0;
R
Robert Olsson 已提交
655 656
	max_resize = 10;
	while (tn->bits > 1 &&  max_resize-- &&
657
	       100 * (tnode_child_length(tn) - tn->empty_children) <
658
	       halve_threshold_use * tnode_child_length(tn)) {
659

660 661 662 663
		old_tn = tn;
		tn = halve(t, tn);
		if (IS_ERR(tn)) {
			tn = old_tn;
664 665 666 667 668 669
#ifdef CONFIG_IP_FIB_TRIE_STATS
			t->stats.resize_node_skipped++;
#endif
			break;
		}
	}
670

R
Robert Olsson 已提交
671 672
	if (max_resize < 0) {
		if (!tn->parent)
673 674 675
			pr_warning("Fix halve_threshold_root."
				   " Now=%d size=%d bits\n",
				   halve_threshold_root, tn->bits);
R
Robert Olsson 已提交
676
		else
677 678 679
			pr_warning("Fix halve_threshold."
				   " Now=%d size=%d bits\n",
				   halve_threshold, tn->bits);
R
Robert Olsson 已提交
680
	}
681

682 683 684
	/* Only one child remains */
	if (tn->empty_children == tnode_child_length(tn) - 1)
		for (i = 0; i < tnode_child_length(tn); i++) {
O
Olof Johansson 已提交
685
			struct node *n;
686

O
Olof Johansson 已提交
687
			n = tn->child[i];
R
Robert Olsson 已提交
688
			if (!n)
O
Olof Johansson 已提交
689 690 691 692
				continue;

			/* compress one level */

S
Stephen Hemminger 已提交
693
			node_set_parent(n, NULL);
J
Jarek Poplawski 已提交
694
			tnode_free_safe(tn);
O
Olof Johansson 已提交
695
			return n;
696 697 698 699 700
		}

	return (struct node *) tn;
}

701
static struct tnode *inflate(struct trie *t, struct tnode *tn)
702 703 704 705 706
{
	struct tnode *oldtnode = tn;
	int olen = tnode_child_length(tn);
	int i;

S
Stephen Hemminger 已提交
707
	pr_debug("In inflate\n");
708 709 710

	tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1);

S
Stephen Hemminger 已提交
711
	if (!tn)
712
		return ERR_PTR(-ENOMEM);
713 714

	/*
715 716 717
	 * Preallocate and store tnodes before the actual work so we
	 * don't get into an inconsistent state if memory allocation
	 * fails. In case of failure we return the oldnode and  inflate
718 719
	 * of tnode is ignored.
	 */
O
Olof Johansson 已提交
720 721

	for (i = 0; i < olen; i++) {
722
		struct tnode *inode;
723

724
		inode = (struct tnode *) tnode_get_child(oldtnode, i);
725 726 727 728 729
		if (inode &&
		    IS_TNODE(inode) &&
		    inode->pos == oldtnode->pos + oldtnode->bits &&
		    inode->bits > 1) {
			struct tnode *left, *right;
S
Stephen Hemminger 已提交
730
			t_key m = ~0U << (KEYLENGTH - 1) >> inode->pos;
731

732 733
			left = tnode_new(inode->key&(~m), inode->pos + 1,
					 inode->bits - 1);
734 735
			if (!left)
				goto nomem;
O
Olof Johansson 已提交
736

737 738 739
			right = tnode_new(inode->key|m, inode->pos + 1,
					  inode->bits - 1);

740
			if (!right) {
741 742
				tnode_free(left);
				goto nomem;
743
			}
744 745 746 747 748 749

			put_child(t, tn, 2*i, (struct node *) left);
			put_child(t, tn, 2*i+1, (struct node *) right);
		}
	}

O
Olof Johansson 已提交
750
	for (i = 0; i < olen; i++) {
751
		struct tnode *inode;
752
		struct node *node = tnode_get_child(oldtnode, i);
O
Olof Johansson 已提交
753 754
		struct tnode *left, *right;
		int size, j;
755

756 757 758 759 760 761
		/* An empty child */
		if (node == NULL)
			continue;

		/* A leaf or an internal node with skipped bits */

762
		if (IS_LEAF(node) || ((struct tnode *) node)->pos >
763
		   tn->pos + tn->bits - 1) {
764 765 766
			if (tkey_extract_bits(node->key,
					      oldtnode->pos + oldtnode->bits,
					      1) == 0)
767 768 769 770 771 772 773 774 775 776 777 778 779
				put_child(t, tn, 2*i, node);
			else
				put_child(t, tn, 2*i+1, node);
			continue;
		}

		/* An internal node with two children */
		inode = (struct tnode *) node;

		if (inode->bits == 1) {
			put_child(t, tn, 2*i, inode->child[0]);
			put_child(t, tn, 2*i+1, inode->child[1]);

J
Jarek Poplawski 已提交
780
			tnode_free_safe(inode);
O
Olof Johansson 已提交
781
			continue;
782 783
		}

O
Olof Johansson 已提交
784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801
		/* An internal node with more than two children */

		/* We will replace this node 'inode' with two new
		 * ones, 'left' and 'right', each with half of the
		 * original children. The two new nodes will have
		 * a position one bit further down the key and this
		 * means that the "significant" part of their keys
		 * (see the discussion near the top of this file)
		 * will differ by one bit, which will be "0" in
		 * left's key and "1" in right's key. Since we are
		 * moving the key position by one step, the bit that
		 * we are moving away from - the bit at position
		 * (inode->pos) - is the one that will differ between
		 * left and right. So... we synthesize that bit in the
		 * two  new keys.
		 * The mask 'm' below will be a single "one" bit at
		 * the position (inode->pos)
		 */
802

O
Olof Johansson 已提交
803 804 805
		/* Use the old key, but set the new significant
		 *   bit to zero.
		 */
806

O
Olof Johansson 已提交
807 808
		left = (struct tnode *) tnode_get_child(tn, 2*i);
		put_child(t, tn, 2*i, NULL);
809

O
Olof Johansson 已提交
810
		BUG_ON(!left);
811

O
Olof Johansson 已提交
812 813
		right = (struct tnode *) tnode_get_child(tn, 2*i+1);
		put_child(t, tn, 2*i+1, NULL);
814

O
Olof Johansson 已提交
815
		BUG_ON(!right);
816

O
Olof Johansson 已提交
817 818 819 820
		size = tnode_child_length(left);
		for (j = 0; j < size; j++) {
			put_child(t, left, j, inode->child[j]);
			put_child(t, right, j, inode->child[j + size]);
821
		}
O
Olof Johansson 已提交
822 823 824
		put_child(t, tn, 2*i, resize(t, left));
		put_child(t, tn, 2*i+1, resize(t, right));

J
Jarek Poplawski 已提交
825
		tnode_free_safe(inode);
826
	}
J
Jarek Poplawski 已提交
827
	tnode_free_safe(oldtnode);
828
	return tn;
829 830 831 832 833
nomem:
	{
		int size = tnode_child_length(tn);
		int j;

S
Stephen Hemminger 已提交
834
		for (j = 0; j < size; j++)
835 836 837 838
			if (tn->child[j])
				tnode_free((struct tnode *)tn->child[j]);

		tnode_free(tn);
S
Stephen Hemminger 已提交
839

840 841
		return ERR_PTR(-ENOMEM);
	}
842 843
}

844
static struct tnode *halve(struct trie *t, struct tnode *tn)
845 846 847 848 849 850
{
	struct tnode *oldtnode = tn;
	struct node *left, *right;
	int i;
	int olen = tnode_child_length(tn);

S
Stephen Hemminger 已提交
851
	pr_debug("In halve\n");
852 853

	tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1);
854

855 856
	if (!tn)
		return ERR_PTR(-ENOMEM);
857 858

	/*
859 860 861
	 * Preallocate and store tnodes before the actual work so we
	 * don't get into an inconsistent state if memory allocation
	 * fails. In case of failure we return the oldnode and halve
862 863 864
	 * of tnode is ignored.
	 */

O
Olof Johansson 已提交
865
	for (i = 0; i < olen; i += 2) {
866 867
		left = tnode_get_child(oldtnode, i);
		right = tnode_get_child(oldtnode, i+1);
868

869
		/* Two nonempty children */
S
Stephen Hemminger 已提交
870
		if (left && right) {
871
			struct tnode *newn;
S
Stephen Hemminger 已提交
872

873
			newn = tnode_new(left->key, tn->pos + tn->bits, 1);
S
Stephen Hemminger 已提交
874 875

			if (!newn)
876
				goto nomem;
S
Stephen Hemminger 已提交
877

878
			put_child(t, tn, i/2, (struct node *)newn);
879 880 881
		}

	}
882

O
Olof Johansson 已提交
883 884 885
	for (i = 0; i < olen; i += 2) {
		struct tnode *newBinNode;

886 887
		left = tnode_get_child(oldtnode, i);
		right = tnode_get_child(oldtnode, i+1);
888

889 890 891 892 893
		/* At least one of the children is empty */
		if (left == NULL) {
			if (right == NULL)    /* Both are empty */
				continue;
			put_child(t, tn, i/2, right);
O
Olof Johansson 已提交
894
			continue;
S
Stephen Hemminger 已提交
895
		}
O
Olof Johansson 已提交
896 897

		if (right == NULL) {
898
			put_child(t, tn, i/2, left);
O
Olof Johansson 已提交
899 900
			continue;
		}
901

902
		/* Two nonempty children */
O
Olof Johansson 已提交
903 904 905 906 907
		newBinNode = (struct tnode *) tnode_get_child(tn, i/2);
		put_child(t, tn, i/2, NULL);
		put_child(t, newBinNode, 0, left);
		put_child(t, newBinNode, 1, right);
		put_child(t, tn, i/2, resize(t, newBinNode));
908
	}
J
Jarek Poplawski 已提交
909
	tnode_free_safe(oldtnode);
910
	return tn;
911 912 913 914 915
nomem:
	{
		int size = tnode_child_length(tn);
		int j;

S
Stephen Hemminger 已提交
916
		for (j = 0; j < size; j++)
917 918 919 920
			if (tn->child[j])
				tnode_free((struct tnode *)tn->child[j]);

		tnode_free(tn);
S
Stephen Hemminger 已提交
921

922 923
		return ERR_PTR(-ENOMEM);
	}
924 925
}

R
Robert Olsson 已提交
926
/* readside must use rcu_read_lock currently dump routines
R
Robert Olsson 已提交
927 928
 via get_fa_head and dump */

R
Robert Olsson 已提交
929
static struct leaf_info *find_leaf_info(struct leaf *l, int plen)
930
{
R
Robert Olsson 已提交
931
	struct hlist_head *head = &l->list;
932 933 934
	struct hlist_node *node;
	struct leaf_info *li;

R
Robert Olsson 已提交
935
	hlist_for_each_entry_rcu(li, node, head, hlist)
936
		if (li->plen == plen)
937
			return li;
O
Olof Johansson 已提交
938

939 940 941
	return NULL;
}

942
static inline struct list_head *get_fa_head(struct leaf *l, int plen)
943
{
R
Robert Olsson 已提交
944
	struct leaf_info *li = find_leaf_info(l, plen);
945

O
Olof Johansson 已提交
946 947
	if (!li)
		return NULL;
948

O
Olof Johansson 已提交
949
	return &li->falh;
950 951 952 953
}

static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new)
{
954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970
	struct leaf_info *li = NULL, *last = NULL;
	struct hlist_node *node;

	if (hlist_empty(head)) {
		hlist_add_head_rcu(&new->hlist, head);
	} else {
		hlist_for_each_entry(li, node, head, hlist) {
			if (new->plen > li->plen)
				break;

			last = li;
		}
		if (last)
			hlist_add_after_rcu(&last->hlist, &new->hlist);
		else
			hlist_add_before_rcu(&new->hlist, &li->hlist);
	}
971 972
}

R
Robert Olsson 已提交
973 974
/* rcu_read_lock needs to be hold by caller from readside */

975 976 977 978 979 980 981 982
static struct leaf *
fib_find_node(struct trie *t, u32 key)
{
	int pos;
	struct tnode *tn;
	struct node *n;

	pos = 0;
R
Robert Olsson 已提交
983
	n = rcu_dereference(t->trie);
984 985 986

	while (n != NULL &&  NODE_TYPE(n) == T_TNODE) {
		tn = (struct tnode *) n;
O
Olof Johansson 已提交
987

988
		check_tnode(tn);
O
Olof Johansson 已提交
989

990
		if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
O
Olof Johansson 已提交
991
			pos = tn->pos + tn->bits;
992 993 994 995
			n = tnode_get_child_rcu(tn,
						tkey_extract_bits(key,
								  tn->pos,
								  tn->bits));
O
Olof Johansson 已提交
996
		} else
997 998 999 1000
			break;
	}
	/* Case we have found a leaf. Compare prefixes */

O
Olof Johansson 已提交
1001 1002 1003
	if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key))
		return (struct leaf *)n;

1004 1005 1006
	return NULL;
}

1007
static void trie_rebalance(struct trie *t, struct tnode *tn)
1008 1009
{
	int wasfull;
R
Robert Olsson 已提交
1010
	t_key cindex, key;
S
Stephen Hemminger 已提交
1011
	struct tnode *tp;
1012

R
Robert Olsson 已提交
1013 1014
	key = tn->key;

S
Stephen Hemminger 已提交
1015
	while (tn != NULL && (tp = node_parent((struct node *)tn)) != NULL) {
1016 1017
		cindex = tkey_extract_bits(key, tp->pos, tp->bits);
		wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
1018 1019 1020 1021
		tn = (struct tnode *) resize(t, (struct tnode *)tn);

		tnode_put_child_reorg((struct tnode *)tp, cindex,
				      (struct node *)tn, wasfull);
O
Olof Johansson 已提交
1022

S
Stephen Hemminger 已提交
1023
		tp = node_parent((struct node *) tn);
1024 1025 1026
		if (!tp)
			rcu_assign_pointer(t->trie, (struct node *)tn);

J
Jarek Poplawski 已提交
1027
		tnode_free_flush();
S
Stephen Hemminger 已提交
1028
		if (!tp)
1029
			break;
S
Stephen Hemminger 已提交
1030
		tn = tp;
1031
	}
S
Stephen Hemminger 已提交
1032

1033
	/* Handle last (top) tnode */
1034
	if (IS_TNODE(tn))
1035
		tn = (struct tnode *)resize(t, (struct tnode *)tn);
1036

1037 1038 1039 1040
	rcu_assign_pointer(t->trie, (struct node *)tn);
	tnode_free_flush();

	return;
1041 1042
}

R
Robert Olsson 已提交
1043 1044
/* only used from updater-side */

1045
static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
1046 1047 1048 1049 1050 1051
{
	int pos, newpos;
	struct tnode *tp = NULL, *tn = NULL;
	struct node *n;
	struct leaf *l;
	int missbit;
1052
	struct list_head *fa_head = NULL;
1053 1054 1055 1056
	struct leaf_info *li;
	t_key cindex;

	pos = 0;
1057
	n = t->trie;
1058

1059 1060
	/* If we point to NULL, stop. Either the tree is empty and we should
	 * just put a new leaf in if, or we have reached an empty child slot,
1061
	 * and we should just put our new leaf in that.
1062 1063
	 * If we point to a T_TNODE, check if it matches our key. Note that
	 * a T_TNODE might be skipping any number of bits - its 'pos' need
1064 1065
	 * not be the parent's 'pos'+'bits'!
	 *
1066
	 * If it does match the current key, get pos/bits from it, extract
1067 1068 1069 1070
	 * the index from our key, push the T_TNODE and walk the tree.
	 *
	 * If it doesn't, we have to replace it with a new T_TNODE.
	 *
1071 1072 1073
	 * If we point to a T_LEAF, it might or might not have the same key
	 * as we do. If it does, just change the value, update the T_LEAF's
	 * value, and return it.
1074 1075 1076 1077 1078
	 * If it doesn't, we need to replace it with a T_TNODE.
	 */

	while (n != NULL &&  NODE_TYPE(n) == T_TNODE) {
		tn = (struct tnode *) n;
O
Olof Johansson 已提交
1079

1080
		check_tnode(tn);
O
Olof Johansson 已提交
1081

1082
		if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
1083
			tp = tn;
O
Olof Johansson 已提交
1084
			pos = tn->pos + tn->bits;
1085 1086 1087 1088
			n = tnode_get_child(tn,
					    tkey_extract_bits(key,
							      tn->pos,
							      tn->bits));
1089

S
Stephen Hemminger 已提交
1090
			BUG_ON(n && node_parent(n) != tn);
O
Olof Johansson 已提交
1091
		} else
1092 1093 1094 1095 1096 1097
			break;
	}

	/*
	 * n  ----> NULL, LEAF or TNODE
	 *
1098
	 * tp is n's (parent) ----> NULL or TNODE
1099 1100
	 */

O
Olof Johansson 已提交
1101
	BUG_ON(tp && IS_LEAF(tp));
1102 1103 1104

	/* Case 1: n is a leaf. Compare prefixes */

1105
	if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) {
1106
		l = (struct leaf *) n;
1107
		li = leaf_info_new(plen);
O
Olof Johansson 已提交
1108

1109 1110
		if (!li)
			return NULL;
1111 1112 1113 1114 1115 1116 1117

		fa_head = &li->falh;
		insert_leaf_info(&l->list, li);
		goto done;
	}
	l = leaf_new();

1118 1119
	if (!l)
		return NULL;
1120 1121 1122 1123

	l->key = key;
	li = leaf_info_new(plen);

1124
	if (!li) {
1125
		free_leaf(l);
1126
		return NULL;
1127
	}
1128 1129 1130 1131 1132

	fa_head = &li->falh;
	insert_leaf_info(&l->list, li);

	if (t->trie && n == NULL) {
O
Olof Johansson 已提交
1133
		/* Case 2: n is NULL, and will just insert a new leaf */
1134

S
Stephen Hemminger 已提交
1135
		node_set_parent((struct node *)l, tp);
1136

O
Olof Johansson 已提交
1137 1138 1139 1140
		cindex = tkey_extract_bits(key, tp->pos, tp->bits);
		put_child(t, (struct tnode *)tp, cindex, (struct node *)l);
	} else {
		/* Case 3: n is a LEAF or a TNODE and the key doesn't match. */
1141 1142
		/*
		 *  Add a new tnode here
1143 1144 1145 1146
		 *  first tnode need some special handling
		 */

		if (tp)
O
Olof Johansson 已提交
1147
			pos = tp->pos+tp->bits;
1148
		else
O
Olof Johansson 已提交
1149 1150
			pos = 0;

1151
		if (n) {
1152 1153
			newpos = tkey_mismatch(key, pos, n->key);
			tn = tnode_new(n->key, newpos, 1);
O
Olof Johansson 已提交
1154
		} else {
1155
			newpos = 0;
1156
			tn = tnode_new(key, newpos, 1); /* First tnode */
1157 1158
		}

1159
		if (!tn) {
1160
			free_leaf_info(li);
1161
			free_leaf(l);
1162
			return NULL;
O
Olof Johansson 已提交
1163 1164
		}

S
Stephen Hemminger 已提交
1165
		node_set_parent((struct node *)tn, tp);
1166

O
Olof Johansson 已提交
1167
		missbit = tkey_extract_bits(key, newpos, 1);
1168 1169 1170
		put_child(t, tn, missbit, (struct node *)l);
		put_child(t, tn, 1-missbit, n);

1171
		if (tp) {
1172
			cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1173 1174
			put_child(t, (struct tnode *)tp, cindex,
				  (struct node *)tn);
O
Olof Johansson 已提交
1175
		} else {
1176
			rcu_assign_pointer(t->trie, (struct node *)tn);
1177 1178 1179
			tp = tn;
		}
	}
O
Olof Johansson 已提交
1180 1181

	if (tp && tp->pos + tp->bits > 32)
1182 1183 1184
		pr_warning("fib_trie"
			   " tp=%p pos=%d, bits=%d, key=%0x plen=%d\n",
			   tp, tp->pos, tp->bits, key, plen);
O
Olof Johansson 已提交
1185

1186
	/* Rebalance the trie */
R
Robert Olsson 已提交
1187

1188
	trie_rebalance(t, tp);
1189
done:
1190 1191 1192
	return fa_head;
}

1193 1194 1195
/*
 * Caller must hold RTNL.
 */
1196
static int fn_trie_insert(struct fib_table *tb, struct fib_config *cfg)
1197 1198 1199
{
	struct trie *t = (struct trie *) tb->tb_data;
	struct fib_alias *fa, *new_fa;
1200
	struct list_head *fa_head = NULL;
1201
	struct fib_info *fi;
1202 1203
	int plen = cfg->fc_dst_len;
	u8 tos = cfg->fc_tos;
1204 1205 1206 1207 1208 1209 1210
	u32 key, mask;
	int err;
	struct leaf *l;

	if (plen > 32)
		return -EINVAL;

1211
	key = ntohl(cfg->fc_dst);
1212

1213
	pr_debug("Insert table=%u %08x/%d\n", tb->tb_id, key, plen);
1214

O
Olof Johansson 已提交
1215
	mask = ntohl(inet_make_mask(plen));
1216

1217
	if (key & ~mask)
1218 1219 1220 1221
		return -EINVAL;

	key = key & mask;

1222 1223 1224
	fi = fib_create_info(cfg);
	if (IS_ERR(fi)) {
		err = PTR_ERR(fi);
1225
		goto err;
1226
	}
1227 1228

	l = fib_find_node(t, key);
1229
	fa = NULL;
1230

1231
	if (l) {
1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246
		fa_head = get_fa_head(l, plen);
		fa = fib_find_alias(fa_head, tos, fi->fib_priority);
	}

	/* Now fa, if non-NULL, points to the first fib alias
	 * with the same keys [prefix,tos,priority], if such key already
	 * exists or to the node before which we will insert new one.
	 *
	 * If fa is NULL, we will need to allocate a new one and
	 * insert to the head of f.
	 *
	 * If f is NULL, no fib node matched the destination key
	 * and we need to allocate a new one of those as well.
	 */

1247 1248 1249
	if (fa && fa->fa_tos == tos &&
	    fa->fa_info->fib_priority == fi->fib_priority) {
		struct fib_alias *fa_first, *fa_match;
1250 1251

		err = -EEXIST;
1252
		if (cfg->fc_nlflags & NLM_F_EXCL)
1253 1254
			goto out;

1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275
		/* We have 2 goals:
		 * 1. Find exact match for type, scope, fib_info to avoid
		 * duplicate routes
		 * 2. Find next 'fa' (or head), NLM_F_APPEND inserts before it
		 */
		fa_match = NULL;
		fa_first = fa;
		fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
		list_for_each_entry_continue(fa, fa_head, fa_list) {
			if (fa->fa_tos != tos)
				break;
			if (fa->fa_info->fib_priority != fi->fib_priority)
				break;
			if (fa->fa_type == cfg->fc_type &&
			    fa->fa_scope == cfg->fc_scope &&
			    fa->fa_info == fi) {
				fa_match = fa;
				break;
			}
		}

1276
		if (cfg->fc_nlflags & NLM_F_REPLACE) {
1277 1278 1279
			struct fib_info *fi_drop;
			u8 state;

1280 1281 1282 1283
			fa = fa_first;
			if (fa_match) {
				if (fa == fa_match)
					err = 0;
1284
				goto out;
1285
			}
R
Robert Olsson 已提交
1286
			err = -ENOBUFS;
1287
			new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
R
Robert Olsson 已提交
1288 1289
			if (new_fa == NULL)
				goto out;
1290 1291

			fi_drop = fa->fa_info;
R
Robert Olsson 已提交
1292 1293
			new_fa->fa_tos = fa->fa_tos;
			new_fa->fa_info = fi;
1294 1295
			new_fa->fa_type = cfg->fc_type;
			new_fa->fa_scope = cfg->fc_scope;
1296
			state = fa->fa_state;
1297
			new_fa->fa_state = state & ~FA_S_ACCESSED;
1298

R
Robert Olsson 已提交
1299 1300
			list_replace_rcu(&fa->fa_list, &new_fa->fa_list);
			alias_free_mem_rcu(fa);
1301 1302 1303

			fib_release_info(fi_drop);
			if (state & FA_S_ACCESSED)
1304
				rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
1305 1306
			rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen,
				tb->tb_id, &cfg->fc_nlinfo, NLM_F_REPLACE);
1307

O
Olof Johansson 已提交
1308
			goto succeeded;
1309 1310 1311 1312 1313
		}
		/* Error if we find a perfect match which
		 * uses the same scope, type, and nexthop
		 * information.
		 */
1314 1315
		if (fa_match)
			goto out;
1316

1317
		if (!(cfg->fc_nlflags & NLM_F_APPEND))
1318
			fa = fa_first;
1319 1320
	}
	err = -ENOENT;
1321
	if (!(cfg->fc_nlflags & NLM_F_CREATE))
1322 1323 1324
		goto out;

	err = -ENOBUFS;
1325
	new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
1326 1327 1328 1329 1330
	if (new_fa == NULL)
		goto out;

	new_fa->fa_info = fi;
	new_fa->fa_tos = tos;
1331 1332
	new_fa->fa_type = cfg->fc_type;
	new_fa->fa_scope = cfg->fc_scope;
1333 1334 1335 1336 1337
	new_fa->fa_state = 0;
	/*
	 * Insert new entry to the list.
	 */

1338
	if (!fa_head) {
1339 1340 1341
		fa_head = fib_insert_node(t, key, plen);
		if (unlikely(!fa_head)) {
			err = -ENOMEM;
1342
			goto out_free_new_fa;
1343
		}
1344
	}
1345

R
Robert Olsson 已提交
1346 1347
	list_add_tail_rcu(&new_fa->fa_list,
			  (fa ? &fa->fa_list : fa_head));
1348

1349
	rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
1350
	rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id,
1351
		  &cfg->fc_nlinfo, 0);
1352 1353
succeeded:
	return 0;
1354 1355 1356

out_free_new_fa:
	kmem_cache_free(fn_alias_kmem, new_fa);
1357 1358
out:
	fib_release_info(fi);
O
Olof Johansson 已提交
1359
err:
1360 1361 1362
	return err;
}

R
Robert Olsson 已提交
1363
/* should be called with rcu_read_lock */
1364 1365 1366
static int check_leaf(struct trie *t, struct leaf *l,
		      t_key key,  const struct flowi *flp,
		      struct fib_result *res)
1367 1368 1369 1370
{
	struct leaf_info *li;
	struct hlist_head *hhead = &l->list;
	struct hlist_node *node;
1371

R
Robert Olsson 已提交
1372
	hlist_for_each_entry_rcu(li, node, hhead, hlist) {
1373 1374 1375 1376
		int err;
		int plen = li->plen;
		__be32 mask = inet_make_mask(plen);

1377
		if (l->key != (key & ntohl(mask)))
1378 1379
			continue;

1380
		err = fib_semantic_match(&li->falh, flp, res, plen);
1381

1382
#ifdef CONFIG_IP_FIB_TRIE_STATS
1383
		if (err <= 0)
1384
			t->stats.semantic_match_passed++;
1385 1386
		else
			t->stats.semantic_match_miss++;
1387
#endif
1388
		if (err <= 0)
1389
			return err;
1390
	}
1391

1392
	return 1;
1393 1394
}

1395 1396
static int fn_trie_lookup(struct fib_table *tb, const struct flowi *flp,
			  struct fib_result *res)
1397 1398
{
	struct trie *t = (struct trie *) tb->tb_data;
1399
	int ret;
1400 1401 1402
	struct node *n;
	struct tnode *pn;
	int pos, bits;
O
Olof Johansson 已提交
1403
	t_key key = ntohl(flp->fl4_dst);
1404 1405 1406
	int chopped_off;
	t_key cindex = 0;
	int current_prefix_length = KEYLENGTH;
O
Olof Johansson 已提交
1407 1408 1409 1410
	struct tnode *cn;
	t_key node_prefix, key_prefix, pref_mismatch;
	int mp;

R
Robert Olsson 已提交
1411
	rcu_read_lock();
O
Olof Johansson 已提交
1412

R
Robert Olsson 已提交
1413
	n = rcu_dereference(t->trie);
1414
	if (!n)
1415 1416 1417 1418 1419 1420 1421 1422
		goto failed;

#ifdef CONFIG_IP_FIB_TRIE_STATS
	t->stats.gets++;
#endif

	/* Just a leaf? */
	if (IS_LEAF(n)) {
1423
		ret = check_leaf(t, (struct leaf *)n, key, flp, res);
1424
		goto found;
1425
	}
1426

1427 1428
	pn = (struct tnode *) n;
	chopped_off = 0;
1429

O
Olof Johansson 已提交
1430
	while (pn) {
1431 1432 1433
		pos = pn->pos;
		bits = pn->bits;

1434
		if (!chopped_off)
S
Stephen Hemminger 已提交
1435 1436
			cindex = tkey_extract_bits(mask_pfx(key, current_prefix_length),
						   pos, bits);
1437 1438 1439 1440 1441 1442 1443 1444 1445 1446

		n = tnode_get_child(pn, cindex);

		if (n == NULL) {
#ifdef CONFIG_IP_FIB_TRIE_STATS
			t->stats.null_node_hit++;
#endif
			goto backtrace;
		}

O
Olof Johansson 已提交
1447
		if (IS_LEAF(n)) {
1448 1449
			ret = check_leaf(t, (struct leaf *)n, key, flp, res);
			if (ret > 0)
O
Olof Johansson 已提交
1450
				goto backtrace;
1451
			goto found;
O
Olof Johansson 已提交
1452 1453 1454
		}

		cn = (struct tnode *)n;
1455

O
Olof Johansson 已提交
1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470
		/*
		 * It's a tnode, and we can do some extra checks here if we
		 * like, to avoid descending into a dead-end branch.
		 * This tnode is in the parent's child array at index
		 * key[p_pos..p_pos+p_bits] but potentially with some bits
		 * chopped off, so in reality the index may be just a
		 * subprefix, padded with zero at the end.
		 * We can also take a look at any skipped bits in this
		 * tnode - everything up to p_pos is supposed to be ok,
		 * and the non-chopped bits of the index (se previous
		 * paragraph) are also guaranteed ok, but the rest is
		 * considered unknown.
		 *
		 * The skipped bits are key[pos+bits..cn->pos].
		 */
1471

O
Olof Johansson 已提交
1472 1473 1474 1475 1476 1477 1478 1479 1480
		/* If current_prefix_length < pos+bits, we are already doing
		 * actual prefix  matching, which means everything from
		 * pos+(bits-chopped_off) onward must be zero along some
		 * branch of this subtree - otherwise there is *no* valid
		 * prefix present. Here we can only check the skipped
		 * bits. Remember, since we have already indexed into the
		 * parent's child array, we know that the bits we chopped of
		 * *are* zero.
		 */
1481

1482 1483
		/* NOTA BENE: Checking only skipped bits
		   for the new node here */
1484

O
Olof Johansson 已提交
1485 1486
		if (current_prefix_length < pos+bits) {
			if (tkey_extract_bits(cn->key, current_prefix_length,
1487 1488
						cn->pos - current_prefix_length)
			    || !(cn->child[0]))
O
Olof Johansson 已提交
1489 1490
				goto backtrace;
		}
1491

O
Olof Johansson 已提交
1492 1493 1494 1495 1496 1497 1498 1499 1500
		/*
		 * If chopped_off=0, the index is fully validated and we
		 * only need to look at the skipped bits for this, the new,
		 * tnode. What we actually want to do is to find out if
		 * these skipped bits match our key perfectly, or if we will
		 * have to count on finding a matching prefix further down,
		 * because if we do, we would like to have some way of
		 * verifying the existence of such a prefix at this point.
		 */
1501

O
Olof Johansson 已提交
1502 1503 1504 1505 1506 1507 1508 1509
		/* The only thing we can do at this point is to verify that
		 * any such matching prefix can indeed be a prefix to our
		 * key, and if the bits in the node we are inspecting that
		 * do not match our key are not ZERO, this cannot be true.
		 * Thus, find out where there is a mismatch (before cn->pos)
		 * and verify that all the mismatching bits are zero in the
		 * new tnode's key.
		 */
1510

1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521
		/*
		 * Note: We aren't very concerned about the piece of
		 * the key that precede pn->pos+pn->bits, since these
		 * have already been checked. The bits after cn->pos
		 * aren't checked since these are by definition
		 * "unknown" at this point. Thus, what we want to see
		 * is if we are about to enter the "prefix matching"
		 * state, and in that case verify that the skipped
		 * bits that will prevail throughout this subtree are
		 * zero, as they have to be if we are to find a
		 * matching prefix.
O
Olof Johansson 已提交
1522 1523
		 */

S
Stephen Hemminger 已提交
1524 1525
		node_prefix = mask_pfx(cn->key, cn->pos);
		key_prefix = mask_pfx(key, cn->pos);
O
Olof Johansson 已提交
1526 1527 1528
		pref_mismatch = key_prefix^node_prefix;
		mp = 0;

1529 1530 1531 1532
		/*
		 * In short: If skipped bits in this node do not match
		 * the search key, enter the "prefix matching"
		 * state.directly.
O
Olof Johansson 已提交
1533 1534 1535 1536
		 */
		if (pref_mismatch) {
			while (!(pref_mismatch & (1<<(KEYLENGTH-1)))) {
				mp++;
1537
				pref_mismatch = pref_mismatch << 1;
O
Olof Johansson 已提交
1538 1539 1540 1541 1542 1543 1544 1545
			}
			key_prefix = tkey_extract_bits(cn->key, mp, cn->pos-mp);

			if (key_prefix != 0)
				goto backtrace;

			if (current_prefix_length >= cn->pos)
				current_prefix_length = mp;
1546
		}
1547

O
Olof Johansson 已提交
1548 1549 1550 1551
		pn = (struct tnode *)n; /* Descend */
		chopped_off = 0;
		continue;

1552 1553 1554 1555
backtrace:
		chopped_off++;

		/* As zero don't change the child key (cindex) */
1556 1557
		while ((chopped_off <= pn->bits)
		       && !(cindex & (1<<(chopped_off-1))))
1558 1559 1560 1561
			chopped_off++;

		/* Decrease current_... with bits chopped off */
		if (current_prefix_length > pn->pos + pn->bits - chopped_off)
1562 1563
			current_prefix_length = pn->pos + pn->bits
				- chopped_off;
O
Olof Johansson 已提交
1564

1565
		/*
1566
		 * Either we do the actual chop off according or if we have
1567 1568 1569
		 * chopped off all bits in this tnode walk up to our parent.
		 */

O
Olof Johansson 已提交
1570
		if (chopped_off <= pn->bits) {
1571
			cindex &= ~(1 << (chopped_off-1));
O
Olof Johansson 已提交
1572
		} else {
S
Stephen Hemminger 已提交
1573 1574
			struct tnode *parent = node_parent((struct node *) pn);
			if (!parent)
1575
				goto failed;
O
Olof Johansson 已提交
1576

1577
			/* Get Child's index */
S
Stephen Hemminger 已提交
1578 1579
			cindex = tkey_extract_bits(pn->key, parent->pos, parent->bits);
			pn = parent;
1580 1581 1582 1583 1584 1585
			chopped_off = 0;

#ifdef CONFIG_IP_FIB_TRIE_STATS
			t->stats.backtrack++;
#endif
			goto backtrace;
1586
		}
1587 1588
	}
failed:
1589
	ret = 1;
1590
found:
R
Robert Olsson 已提交
1591
	rcu_read_unlock();
1592 1593 1594
	return ret;
}

1595 1596 1597 1598
/*
 * Remove the leaf and return parent.
 */
static void trie_leaf_remove(struct trie *t, struct leaf *l)
1599
{
1600
	struct tnode *tp = node_parent((struct node *) l);
1601

1602
	pr_debug("entering trie_leaf_remove(%p)\n", l);
1603

1604
	if (tp) {
1605
		t_key cindex = tkey_extract_bits(l->key, tp->pos, tp->bits);
1606
		put_child(t, (struct tnode *)tp, cindex, NULL);
1607
		trie_rebalance(t, tp);
O
Olof Johansson 已提交
1608
	} else
R
Robert Olsson 已提交
1609
		rcu_assign_pointer(t->trie, NULL);
1610

1611
	free_leaf(l);
1612 1613
}

1614 1615 1616
/*
 * Caller must hold RTNL.
 */
1617
static int fn_trie_delete(struct fib_table *tb, struct fib_config *cfg)
1618 1619 1620
{
	struct trie *t = (struct trie *) tb->tb_data;
	u32 key, mask;
1621 1622
	int plen = cfg->fc_dst_len;
	u8 tos = cfg->fc_tos;
1623 1624 1625
	struct fib_alias *fa, *fa_to_delete;
	struct list_head *fa_head;
	struct leaf *l;
O
Olof Johansson 已提交
1626 1627
	struct leaf_info *li;

1628
	if (plen > 32)
1629 1630
		return -EINVAL;

1631
	key = ntohl(cfg->fc_dst);
O
Olof Johansson 已提交
1632
	mask = ntohl(inet_make_mask(plen));
1633

1634
	if (key & ~mask)
1635 1636 1637 1638 1639
		return -EINVAL;

	key = key & mask;
	l = fib_find_node(t, key);

1640
	if (!l)
1641 1642 1643 1644 1645 1646 1647 1648
		return -ESRCH;

	fa_head = get_fa_head(l, plen);
	fa = fib_find_alias(fa_head, tos, 0);

	if (!fa)
		return -ESRCH;

S
Stephen Hemminger 已提交
1649
	pr_debug("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t);
1650 1651

	fa_to_delete = NULL;
1652 1653
	fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
	list_for_each_entry_continue(fa, fa_head, fa_list) {
1654 1655 1656 1657 1658
		struct fib_info *fi = fa->fa_info;

		if (fa->fa_tos != tos)
			break;

1659 1660 1661 1662 1663 1664
		if ((!cfg->fc_type || fa->fa_type == cfg->fc_type) &&
		    (cfg->fc_scope == RT_SCOPE_NOWHERE ||
		     fa->fa_scope == cfg->fc_scope) &&
		    (!cfg->fc_protocol ||
		     fi->fib_protocol == cfg->fc_protocol) &&
		    fib_nh_match(cfg, fi) == 0) {
1665 1666 1667 1668 1669
			fa_to_delete = fa;
			break;
		}
	}

O
Olof Johansson 已提交
1670 1671
	if (!fa_to_delete)
		return -ESRCH;
1672

O
Olof Johansson 已提交
1673
	fa = fa_to_delete;
1674
	rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id,
1675
		  &cfg->fc_nlinfo, 0);
O
Olof Johansson 已提交
1676 1677

	l = fib_find_node(t, key);
R
Robert Olsson 已提交
1678
	li = find_leaf_info(l, plen);
1679

R
Robert Olsson 已提交
1680
	list_del_rcu(&fa->fa_list);
1681

O
Olof Johansson 已提交
1682
	if (list_empty(fa_head)) {
R
Robert Olsson 已提交
1683
		hlist_del_rcu(&li->hlist);
O
Olof Johansson 已提交
1684
		free_leaf_info(li);
R
Robert Olsson 已提交
1685
	}
1686

O
Olof Johansson 已提交
1687
	if (hlist_empty(&l->list))
1688
		trie_leaf_remove(t, l);
1689

O
Olof Johansson 已提交
1690
	if (fa->fa_state & FA_S_ACCESSED)
1691
		rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
1692

R
Robert Olsson 已提交
1693 1694
	fib_release_info(fa->fa_info);
	alias_free_mem_rcu(fa);
O
Olof Johansson 已提交
1695
	return 0;
1696 1697
}

1698
static int trie_flush_list(struct list_head *head)
1699 1700 1701 1702 1703 1704 1705
{
	struct fib_alias *fa, *fa_node;
	int found = 0;

	list_for_each_entry_safe(fa, fa_node, head, fa_list) {
		struct fib_info *fi = fa->fa_info;

R
Robert Olsson 已提交
1706 1707 1708 1709
		if (fi && (fi->fib_flags & RTNH_F_DEAD)) {
			list_del_rcu(&fa->fa_list);
			fib_release_info(fa->fa_info);
			alias_free_mem_rcu(fa);
1710 1711 1712 1713 1714 1715
			found++;
		}
	}
	return found;
}

1716
static int trie_flush_leaf(struct leaf *l)
1717 1718 1719 1720 1721 1722 1723
{
	int found = 0;
	struct hlist_head *lih = &l->list;
	struct hlist_node *node, *tmp;
	struct leaf_info *li = NULL;

	hlist_for_each_entry_safe(li, node, tmp, lih, hlist) {
1724
		found += trie_flush_list(&li->falh);
1725 1726

		if (list_empty(&li->falh)) {
R
Robert Olsson 已提交
1727
			hlist_del_rcu(&li->hlist);
1728 1729 1730 1731 1732 1733
			free_leaf_info(li);
		}
	}
	return found;
}

1734 1735 1736 1737 1738
/*
 * Scan for the next right leaf starting at node p->child[idx]
 * Since we have back pointer, no recursion necessary.
 */
static struct leaf *leaf_walk_rcu(struct tnode *p, struct node *c)
1739
{
1740 1741
	do {
		t_key idx;
1742 1743

		if (c)
1744
			idx = tkey_extract_bits(c->key, p->pos, p->bits) + 1;
1745
		else
1746
			idx = 0;
R
Robert Olsson 已提交
1747

1748 1749
		while (idx < 1u << p->bits) {
			c = tnode_get_child_rcu(p, idx++);
R
Robert Olsson 已提交
1750
			if (!c)
O
Olof Johansson 已提交
1751 1752
				continue;

1753 1754 1755
			if (IS_LEAF(c)) {
				prefetch(p->child[idx]);
				return (struct leaf *) c;
1756
			}
1757 1758 1759 1760

			/* Rescan start scanning in new node */
			p = (struct tnode *) c;
			idx = 0;
1761
		}
1762 1763

		/* Node empty, walk back up to parent */
O
Olof Johansson 已提交
1764
		c = (struct node *) p;
1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791
	} while ( (p = node_parent_rcu(c)) != NULL);

	return NULL; /* Root of trie */
}

static struct leaf *trie_firstleaf(struct trie *t)
{
	struct tnode *n = (struct tnode *) rcu_dereference(t->trie);

	if (!n)
		return NULL;

	if (IS_LEAF(n))          /* trie is just a leaf */
		return (struct leaf *) n;

	return leaf_walk_rcu(n, NULL);
}

static struct leaf *trie_nextleaf(struct leaf *l)
{
	struct node *c = (struct node *) l;
	struct tnode *p = node_parent(c);

	if (!p)
		return NULL;	/* trie with just one leaf */

	return leaf_walk_rcu(p, c);
1792 1793
}

1794 1795 1796 1797
static struct leaf *trie_leafindex(struct trie *t, int index)
{
	struct leaf *l = trie_firstleaf(t);

S
Stephen Hemminger 已提交
1798
	while (l && index-- > 0)
1799
		l = trie_nextleaf(l);
S
Stephen Hemminger 已提交
1800

1801 1802 1803 1804
	return l;
}


1805 1806 1807
/*
 * Caller must hold RTNL.
 */
1808 1809 1810
static int fn_trie_flush(struct fib_table *tb)
{
	struct trie *t = (struct trie *) tb->tb_data;
1811
	struct leaf *l, *ll = NULL;
1812
	int found = 0;
1813

1814
	for (l = trie_firstleaf(t); l; l = trie_nextleaf(l)) {
1815
		found += trie_flush_leaf(l);
1816 1817

		if (ll && hlist_empty(&ll->list))
1818
			trie_leaf_remove(t, ll);
1819 1820 1821 1822
		ll = l;
	}

	if (ll && hlist_empty(&ll->list))
1823
		trie_leaf_remove(t, ll);
1824

S
Stephen Hemminger 已提交
1825
	pr_debug("trie_flush found=%d\n", found);
1826 1827 1828
	return found;
}

1829 1830 1831
static void fn_trie_select_default(struct fib_table *tb,
				   const struct flowi *flp,
				   struct fib_result *res)
1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844
{
	struct trie *t = (struct trie *) tb->tb_data;
	int order, last_idx;
	struct fib_info *fi = NULL;
	struct fib_info *last_resort;
	struct fib_alias *fa = NULL;
	struct list_head *fa_head;
	struct leaf *l;

	last_idx = -1;
	last_resort = NULL;
	order = -1;

R
Robert Olsson 已提交
1845
	rcu_read_lock();
1846

1847
	l = fib_find_node(t, 0);
1848
	if (!l)
1849 1850 1851
		goto out;

	fa_head = get_fa_head(l, 0);
1852
	if (!fa_head)
1853 1854
		goto out;

1855
	if (list_empty(fa_head))
1856 1857
		goto out;

R
Robert Olsson 已提交
1858
	list_for_each_entry_rcu(fa, fa_head, fa_list) {
1859
		struct fib_info *next_fi = fa->fa_info;
O
Olof Johansson 已提交
1860

1861 1862 1863
		if (fa->fa_scope != res->scope ||
		    fa->fa_type != RTN_UNICAST)
			continue;
O
Olof Johansson 已提交
1864

1865 1866 1867 1868 1869 1870
		if (next_fi->fib_priority > res->fi->fib_priority)
			break;
		if (!next_fi->fib_nh[0].nh_gw ||
		    next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
			continue;
		fa->fa_state |= FA_S_ACCESSED;
O
Olof Johansson 已提交
1871

1872 1873 1874 1875
		if (fi == NULL) {
			if (next_fi != res->fi)
				break;
		} else if (!fib_detect_death(fi, order, &last_resort,
1876
					     &last_idx, tb->tb_default)) {
1877
			fib_result_assign(res, fi);
1878
			tb->tb_default = order;
1879 1880 1881 1882 1883 1884
			goto out;
		}
		fi = next_fi;
		order++;
	}
	if (order <= 0 || fi == NULL) {
1885
		tb->tb_default = -1;
1886 1887 1888
		goto out;
	}

1889 1890
	if (!fib_detect_death(fi, order, &last_resort, &last_idx,
				tb->tb_default)) {
1891
		fib_result_assign(res, fi);
1892
		tb->tb_default = order;
1893 1894
		goto out;
	}
1895 1896
	if (last_idx >= 0)
		fib_result_assign(res, last_resort);
1897 1898
	tb->tb_default = last_idx;
out:
R
Robert Olsson 已提交
1899
	rcu_read_unlock();
1900 1901
}

1902 1903
static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah,
			   struct fib_table *tb,
1904 1905 1906 1907
			   struct sk_buff *skb, struct netlink_callback *cb)
{
	int i, s_i;
	struct fib_alias *fa;
A
Al Viro 已提交
1908
	__be32 xkey = htonl(key);
1909

1910
	s_i = cb->args[5];
1911 1912
	i = 0;

R
Robert Olsson 已提交
1913 1914 1915
	/* rcu_read_lock is hold by caller */

	list_for_each_entry_rcu(fa, fah, fa_list) {
1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926
		if (i < s_i) {
			i++;
			continue;
		}

		if (fib_dump_info(skb, NETLINK_CB(cb->skb).pid,
				  cb->nlh->nlmsg_seq,
				  RTM_NEWROUTE,
				  tb->tb_id,
				  fa->fa_type,
				  fa->fa_scope,
1927
				  xkey,
1928 1929
				  plen,
				  fa->fa_tos,
1930
				  fa->fa_info, NLM_F_MULTI) < 0) {
1931
			cb->args[5] = i;
1932
			return -1;
O
Olof Johansson 已提交
1933
		}
1934 1935
		i++;
	}
1936
	cb->args[5] = i;
1937 1938 1939
	return skb->len;
}

1940 1941
static int fn_trie_dump_leaf(struct leaf *l, struct fib_table *tb,
			struct sk_buff *skb, struct netlink_callback *cb)
1942
{
1943 1944 1945
	struct leaf_info *li;
	struct hlist_node *node;
	int i, s_i;
1946

1947
	s_i = cb->args[4];
1948
	i = 0;
1949

1950 1951 1952 1953
	/* rcu_read_lock is hold by caller */
	hlist_for_each_entry_rcu(li, node, &l->list, hlist) {
		if (i < s_i) {
			i++;
1954
			continue;
1955
		}
O
Olof Johansson 已提交
1956

1957
		if (i > s_i)
1958
			cb->args[5] = 0;
1959

1960
		if (list_empty(&li->falh))
1961 1962
			continue;

1963
		if (fn_trie_dump_fa(l->key, li->plen, &li->falh, tb, skb, cb) < 0) {
1964
			cb->args[4] = i;
1965 1966
			return -1;
		}
1967
		i++;
1968
	}
1969

1970
	cb->args[4] = i;
1971 1972 1973
	return skb->len;
}

1974 1975
static int fn_trie_dump(struct fib_table *tb, struct sk_buff *skb,
			struct netlink_callback *cb)
1976
{
1977
	struct leaf *l;
1978
	struct trie *t = (struct trie *) tb->tb_data;
1979
	t_key key = cb->args[2];
1980
	int count = cb->args[3];
1981

R
Robert Olsson 已提交
1982
	rcu_read_lock();
1983 1984 1985
	/* Dump starting at last key.
	 * Note: 0.0.0.0/0 (ie default) is first key.
	 */
1986
	if (count == 0)
1987 1988
		l = trie_firstleaf(t);
	else {
1989 1990 1991
		/* Normally, continue from last key, but if that is missing
		 * fallback to using slow rescan
		 */
1992
		l = fib_find_node(t, key);
1993 1994
		if (!l)
			l = trie_leafindex(t, count);
1995
	}
1996

1997 1998
	while (l) {
		cb->args[2] = l->key;
1999
		if (fn_trie_dump_leaf(l, tb, skb, cb) < 0) {
2000
			cb->args[3] = count;
2001 2002
			rcu_read_unlock();
			return -1;
2003
		}
2004

2005
		++count;
2006
		l = trie_nextleaf(l);
2007 2008
		memset(&cb->args[4], 0,
		       sizeof(cb->args) - 4*sizeof(cb->args[0]));
2009
	}
2010
	cb->args[3] = count;
R
Robert Olsson 已提交
2011
	rcu_read_unlock();
2012

2013 2014 2015
	return skb->len;
}

2016 2017
void __init fib_hash_init(void)
{
2018 2019
	fn_alias_kmem = kmem_cache_create("ip_fib_alias",
					  sizeof(struct fib_alias),
2020 2021 2022 2023 2024 2025
					  0, SLAB_PANIC, NULL);

	trie_leaf_kmem = kmem_cache_create("ip_fib_trie",
					   max(sizeof(struct leaf),
					       sizeof(struct leaf_info)),
					   0, SLAB_PANIC, NULL);
2026
}
2027

2028 2029 2030

/* Fix more generic FIB names for init later */
struct fib_table *fib_hash_table(u32 id)
2031 2032 2033 2034 2035 2036 2037 2038 2039 2040
{
	struct fib_table *tb;
	struct trie *t;

	tb = kmalloc(sizeof(struct fib_table) + sizeof(struct trie),
		     GFP_KERNEL);
	if (tb == NULL)
		return NULL;

	tb->tb_id = id;
2041
	tb->tb_default = -1;
2042 2043 2044 2045 2046 2047 2048 2049
	tb->tb_lookup = fn_trie_lookup;
	tb->tb_insert = fn_trie_insert;
	tb->tb_delete = fn_trie_delete;
	tb->tb_flush = fn_trie_flush;
	tb->tb_select_default = fn_trie_select_default;
	tb->tb_dump = fn_trie_dump;

	t = (struct trie *) tb->tb_data;
2050
	memset(t, 0, sizeof(*t));
2051 2052

	if (id == RT_TABLE_LOCAL)
2053
		pr_info("IPv4 FIB: Using LC-trie version %s\n", VERSION);
2054 2055 2056 2057

	return tb;
}

2058 2059 2060
#ifdef CONFIG_PROC_FS
/* Depth first Trie walk iterator */
struct fib_trie_iter {
2061
	struct seq_net_private p;
2062
	struct fib_table *tb;
2063 2064 2065 2066
	struct tnode *tnode;
	unsigned index;
	unsigned depth;
};
2067

2068
static struct node *fib_trie_get_next(struct fib_trie_iter *iter)
2069
{
2070 2071 2072
	struct tnode *tn = iter->tnode;
	unsigned cindex = iter->index;
	struct tnode *p;
2073

2074 2075 2076 2077
	/* A single entry routing table */
	if (!tn)
		return NULL;

2078 2079 2080 2081
	pr_debug("get_next iter={node=%p index=%d depth=%d}\n",
		 iter->tnode, iter->index, iter->depth);
rescan:
	while (cindex < (1<<tn->bits)) {
2082
		struct node *n = tnode_get_child_rcu(tn, cindex);
2083

2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095
		if (n) {
			if (IS_LEAF(n)) {
				iter->tnode = tn;
				iter->index = cindex + 1;
			} else {
				/* push down one level */
				iter->tnode = (struct tnode *) n;
				iter->index = 0;
				++iter->depth;
			}
			return n;
		}
2096

2097 2098
		++cindex;
	}
O
Olof Johansson 已提交
2099

2100
	/* Current node exhausted, pop back up */
2101
	p = node_parent_rcu((struct node *)tn);
2102 2103 2104 2105 2106
	if (p) {
		cindex = tkey_extract_bits(tn->key, p->pos, p->bits)+1;
		tn = p;
		--iter->depth;
		goto rescan;
2107
	}
2108 2109 2110

	/* got root? */
	return NULL;
2111 2112
}

2113 2114
static struct node *fib_trie_get_first(struct fib_trie_iter *iter,
				       struct trie *t)
2115
{
2116
	struct node *n;
2117

S
Stephen Hemminger 已提交
2118
	if (!t)
2119 2120 2121
		return NULL;

	n = rcu_dereference(t->trie);
2122
	if (!n)
2123
		return NULL;
2124

2125 2126 2127 2128 2129 2130 2131 2132
	if (IS_TNODE(n)) {
		iter->tnode = (struct tnode *) n;
		iter->index = 0;
		iter->depth = 1;
	} else {
		iter->tnode = NULL;
		iter->index = 0;
		iter->depth = 0;
O
Olof Johansson 已提交
2133
	}
2134 2135

	return n;
2136
}
O
Olof Johansson 已提交
2137

2138 2139 2140 2141
static void trie_collect_stats(struct trie *t, struct trie_stat *s)
{
	struct node *n;
	struct fib_trie_iter iter;
O
Olof Johansson 已提交
2142

2143
	memset(s, 0, sizeof(*s));
O
Olof Johansson 已提交
2144

2145
	rcu_read_lock();
2146
	for (n = fib_trie_get_first(&iter, t); n; n = fib_trie_get_next(&iter)) {
2147
		if (IS_LEAF(n)) {
2148 2149 2150 2151
			struct leaf *l = (struct leaf *)n;
			struct leaf_info *li;
			struct hlist_node *tmp;

2152 2153 2154 2155
			s->leaves++;
			s->totdepth += iter.depth;
			if (iter.depth > s->maxdepth)
				s->maxdepth = iter.depth;
2156 2157 2158

			hlist_for_each_entry_rcu(li, tmp, &l->list, hlist)
				++s->prefixes;
2159 2160 2161 2162 2163
		} else {
			const struct tnode *tn = (const struct tnode *) n;
			int i;

			s->tnodes++;
S
Stephen Hemminger 已提交
2164
			if (tn->bits < MAX_STAT_DEPTH)
R
Robert Olsson 已提交
2165 2166
				s->nodesizes[tn->bits]++;

2167 2168 2169
			for (i = 0; i < (1<<tn->bits); i++)
				if (!tn->child[i])
					s->nullpointers++;
2170 2171
		}
	}
R
Robert Olsson 已提交
2172
	rcu_read_unlock();
2173 2174
}

2175 2176 2177 2178
/*
 *	This outputs /proc/net/fib_triestats
 */
static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat)
2179
{
2180
	unsigned i, max, pointers, bytes, avdepth;
2181

2182 2183 2184 2185
	if (stat->leaves)
		avdepth = stat->totdepth*100 / stat->leaves;
	else
		avdepth = 0;
O
Olof Johansson 已提交
2186

2187 2188
	seq_printf(seq, "\tAver depth:     %u.%02d\n",
		   avdepth / 100, avdepth % 100);
2189
	seq_printf(seq, "\tMax depth:      %u\n", stat->maxdepth);
O
Olof Johansson 已提交
2190

2191 2192
	seq_printf(seq, "\tLeaves:         %u\n", stat->leaves);
	bytes = sizeof(struct leaf) * stat->leaves;
2193 2194 2195 2196

	seq_printf(seq, "\tPrefixes:       %u\n", stat->prefixes);
	bytes += sizeof(struct leaf_info) * stat->prefixes;

2197
	seq_printf(seq, "\tInternal nodes: %u\n\t", stat->tnodes);
2198
	bytes += sizeof(struct tnode) * stat->tnodes;
2199

R
Robert Olsson 已提交
2200 2201
	max = MAX_STAT_DEPTH;
	while (max > 0 && stat->nodesizes[max-1] == 0)
2202
		max--;
2203

2204 2205 2206
	pointers = 0;
	for (i = 1; i <= max; i++)
		if (stat->nodesizes[i] != 0) {
2207
			seq_printf(seq, "  %u: %u",  i, stat->nodesizes[i]);
2208 2209 2210
			pointers += (1<<i) * stat->nodesizes[i];
		}
	seq_putc(seq, '\n');
2211
	seq_printf(seq, "\tPointers: %u\n", pointers);
R
Robert Olsson 已提交
2212

2213
	bytes += sizeof(struct node *) * pointers;
2214 2215
	seq_printf(seq, "Null ptrs: %u\n", stat->nullpointers);
	seq_printf(seq, "Total size: %u  kB\n", (bytes + 1023) / 1024);
2216
}
R
Robert Olsson 已提交
2217

2218
#ifdef CONFIG_IP_FIB_TRIE_STATS
2219 2220 2221 2222
static void trie_show_usage(struct seq_file *seq,
			    const struct trie_use_stats *stats)
{
	seq_printf(seq, "\nCounters:\n---------\n");
2223 2224 2225 2226 2227 2228 2229 2230 2231
	seq_printf(seq, "gets = %u\n", stats->gets);
	seq_printf(seq, "backtracks = %u\n", stats->backtrack);
	seq_printf(seq, "semantic match passed = %u\n",
		   stats->semantic_match_passed);
	seq_printf(seq, "semantic match miss = %u\n",
		   stats->semantic_match_miss);
	seq_printf(seq, "null node hit= %u\n", stats->null_node_hit);
	seq_printf(seq, "skipped node resize = %u\n\n",
		   stats->resize_node_skipped);
2232
}
2233 2234
#endif /*  CONFIG_IP_FIB_TRIE_STATS */

2235
static void fib_table_print(struct seq_file *seq, struct fib_table *tb)
2236
{
2237 2238 2239 2240 2241 2242
	if (tb->tb_id == RT_TABLE_LOCAL)
		seq_puts(seq, "Local:\n");
	else if (tb->tb_id == RT_TABLE_MAIN)
		seq_puts(seq, "Main:\n");
	else
		seq_printf(seq, "Id %d:\n", tb->tb_id);
2243
}
2244

2245

2246 2247
static int fib_triestat_seq_show(struct seq_file *seq, void *v)
{
2248
	struct net *net = (struct net *)seq->private;
2249
	unsigned int h;
2250

2251
	seq_printf(seq,
2252 2253
		   "Basic info: size of leaf:"
		   " %Zd bytes, size of tnode: %Zd bytes.\n",
2254 2255
		   sizeof(struct leaf), sizeof(struct tnode));

2256 2257 2258 2259 2260 2261 2262 2263
	for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
		struct hlist_head *head = &net->ipv4.fib_table_hash[h];
		struct hlist_node *node;
		struct fib_table *tb;

		hlist_for_each_entry_rcu(tb, node, head, tb_hlist) {
			struct trie *t = (struct trie *) tb->tb_data;
			struct trie_stat stat;
2264

2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276
			if (!t)
				continue;

			fib_table_print(seq, tb);

			trie_collect_stats(t, &stat);
			trie_show_stats(seq, &stat);
#ifdef CONFIG_IP_FIB_TRIE_STATS
			trie_show_usage(seq, &t->stats);
#endif
		}
	}
2277

2278
	return 0;
2279 2280
}

2281
static int fib_triestat_seq_open(struct inode *inode, struct file *file)
2282
{
2283
	return single_open_net(inode, file, fib_triestat_seq_show);
2284 2285
}

2286
static const struct file_operations fib_triestat_fops = {
2287 2288 2289 2290
	.owner	= THIS_MODULE,
	.open	= fib_triestat_seq_open,
	.read	= seq_read,
	.llseek	= seq_lseek,
2291
	.release = single_release_net,
2292 2293
};

2294
static struct node *fib_trie_get_idx(struct seq_file *seq, loff_t pos)
2295
{
2296 2297
	struct fib_trie_iter *iter = seq->private;
	struct net *net = seq_file_net(seq);
2298
	loff_t idx = 0;
2299
	unsigned int h;
2300

2301 2302 2303 2304
	for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
		struct hlist_head *head = &net->ipv4.fib_table_hash[h];
		struct hlist_node *node;
		struct fib_table *tb;
2305

2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316
		hlist_for_each_entry_rcu(tb, node, head, tb_hlist) {
			struct node *n;

			for (n = fib_trie_get_first(iter,
						    (struct trie *) tb->tb_data);
			     n; n = fib_trie_get_next(iter))
				if (pos == idx++) {
					iter->tb = tb;
					return n;
				}
		}
2317
	}
2318

2319 2320 2321
	return NULL;
}

2322
static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos)
2323
	__acquires(RCU)
2324
{
2325
	rcu_read_lock();
2326
	return fib_trie_get_idx(seq, *pos);
2327 2328
}

2329
static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2330
{
2331
	struct fib_trie_iter *iter = seq->private;
2332
	struct net *net = seq_file_net(seq);
2333 2334 2335 2336
	struct fib_table *tb = iter->tb;
	struct hlist_node *tb_node;
	unsigned int h;
	struct node *n;
2337

2338
	++*pos;
2339 2340 2341 2342
	/* next node in same table */
	n = fib_trie_get_next(iter);
	if (n)
		return n;
2343

2344 2345 2346 2347 2348 2349 2350 2351
	/* walk rest of this hash chain */
	h = tb->tb_id & (FIB_TABLE_HASHSZ - 1);
	while ( (tb_node = rcu_dereference(tb->tb_hlist.next)) ) {
		tb = hlist_entry(tb_node, struct fib_table, tb_hlist);
		n = fib_trie_get_first(iter, (struct trie *) tb->tb_data);
		if (n)
			goto found;
	}
2352

2353 2354 2355 2356 2357 2358 2359 2360 2361
	/* new hash chain */
	while (++h < FIB_TABLE_HASHSZ) {
		struct hlist_head *head = &net->ipv4.fib_table_hash[h];
		hlist_for_each_entry_rcu(tb, tb_node, head, tb_hlist) {
			n = fib_trie_get_first(iter, (struct trie *) tb->tb_data);
			if (n)
				goto found;
		}
	}
2362
	return NULL;
2363 2364 2365 2366

found:
	iter->tb = tb;
	return n;
2367
}
2368

2369
static void fib_trie_seq_stop(struct seq_file *seq, void *v)
2370
	__releases(RCU)
2371
{
2372 2373
	rcu_read_unlock();
}
O
Olof Johansson 已提交
2374

2375 2376 2377 2378
static void seq_indent(struct seq_file *seq, int n)
{
	while (n-- > 0) seq_puts(seq, "   ");
}
2379

2380
static inline const char *rtn_scope(char *buf, size_t len, enum rt_scope_t s)
2381
{
S
Stephen Hemminger 已提交
2382
	switch (s) {
2383 2384 2385 2386 2387 2388
	case RT_SCOPE_UNIVERSE: return "universe";
	case RT_SCOPE_SITE:	return "site";
	case RT_SCOPE_LINK:	return "link";
	case RT_SCOPE_HOST:	return "host";
	case RT_SCOPE_NOWHERE:	return "nowhere";
	default:
2389
		snprintf(buf, len, "scope=%d", s);
2390 2391 2392
		return buf;
	}
}
2393

2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407
static const char *rtn_type_names[__RTN_MAX] = {
	[RTN_UNSPEC] = "UNSPEC",
	[RTN_UNICAST] = "UNICAST",
	[RTN_LOCAL] = "LOCAL",
	[RTN_BROADCAST] = "BROADCAST",
	[RTN_ANYCAST] = "ANYCAST",
	[RTN_MULTICAST] = "MULTICAST",
	[RTN_BLACKHOLE] = "BLACKHOLE",
	[RTN_UNREACHABLE] = "UNREACHABLE",
	[RTN_PROHIBIT] = "PROHIBIT",
	[RTN_THROW] = "THROW",
	[RTN_NAT] = "NAT",
	[RTN_XRESOLVE] = "XRESOLVE",
};
2408

2409
static inline const char *rtn_type(char *buf, size_t len, unsigned t)
2410 2411 2412
{
	if (t < __RTN_MAX && rtn_type_names[t])
		return rtn_type_names[t];
2413
	snprintf(buf, len, "type %u", t);
2414
	return buf;
2415 2416
}

2417 2418
/* Pretty print the trie */
static int fib_trie_seq_show(struct seq_file *seq, void *v)
2419
{
2420 2421
	const struct fib_trie_iter *iter = seq->private;
	struct node *n = v;
2422

2423 2424
	if (!node_parent_rcu(n))
		fib_table_print(seq, iter->tb);
2425

2426 2427
	if (IS_TNODE(n)) {
		struct tnode *tn = (struct tnode *) n;
S
Stephen Hemminger 已提交
2428
		__be32 prf = htonl(mask_pfx(tn->key, tn->pos));
O
Olof Johansson 已提交
2429

2430
		seq_indent(seq, iter->depth-1);
2431 2432
		seq_printf(seq, "  +-- %pI4/%d %d %d %d\n",
			   &prf, tn->pos, tn->bits, tn->full_children,
2433
			   tn->empty_children);
2434

2435 2436
	} else {
		struct leaf *l = (struct leaf *) n;
2437 2438
		struct leaf_info *li;
		struct hlist_node *node;
A
Al Viro 已提交
2439
		__be32 val = htonl(l->key);
2440 2441

		seq_indent(seq, iter->depth);
2442
		seq_printf(seq, "  |-- %pI4\n", &val);
2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456

		hlist_for_each_entry_rcu(li, node, &l->list, hlist) {
			struct fib_alias *fa;

			list_for_each_entry_rcu(fa, &li->falh, fa_list) {
				char buf1[32], buf2[32];

				seq_indent(seq, iter->depth+1);
				seq_printf(seq, "  /%d %s %s", li->plen,
					   rtn_scope(buf1, sizeof(buf1),
						     fa->fa_scope),
					   rtn_type(buf2, sizeof(buf2),
						    fa->fa_type));
				if (fa->fa_tos)
2457
					seq_printf(seq, " tos=%d", fa->fa_tos);
2458
				seq_putc(seq, '\n');
2459 2460
			}
		}
2461
	}
2462

2463 2464 2465
	return 0;
}

2466
static const struct seq_operations fib_trie_seq_ops = {
2467 2468 2469 2470
	.start  = fib_trie_seq_start,
	.next   = fib_trie_seq_next,
	.stop   = fib_trie_seq_stop,
	.show   = fib_trie_seq_show,
2471 2472
};

2473
static int fib_trie_seq_open(struct inode *inode, struct file *file)
2474
{
2475 2476
	return seq_open_net(inode, file, &fib_trie_seq_ops,
			    sizeof(struct fib_trie_iter));
2477 2478
}

2479
static const struct file_operations fib_trie_fops = {
2480 2481 2482 2483
	.owner  = THIS_MODULE,
	.open   = fib_trie_seq_open,
	.read   = seq_read,
	.llseek = seq_lseek,
2484
	.release = seq_release_net,
2485 2486
};

2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526
struct fib_route_iter {
	struct seq_net_private p;
	struct trie *main_trie;
	loff_t	pos;
	t_key	key;
};

static struct leaf *fib_route_get_idx(struct fib_route_iter *iter, loff_t pos)
{
	struct leaf *l = NULL;
	struct trie *t = iter->main_trie;

	/* use cache location of last found key */
	if (iter->pos > 0 && pos >= iter->pos && (l = fib_find_node(t, iter->key)))
		pos -= iter->pos;
	else {
		iter->pos = 0;
		l = trie_firstleaf(t);
	}

	while (l && pos-- > 0) {
		iter->pos++;
		l = trie_nextleaf(l);
	}

	if (l)
		iter->key = pos;	/* remember it */
	else
		iter->pos = 0;		/* forget it */

	return l;
}

static void *fib_route_seq_start(struct seq_file *seq, loff_t *pos)
	__acquires(RCU)
{
	struct fib_route_iter *iter = seq->private;
	struct fib_table *tb;

	rcu_read_lock();
2527
	tb = fib_get_table(seq_file_net(seq), RT_TABLE_MAIN);
2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564
	if (!tb)
		return NULL;

	iter->main_trie = (struct trie *) tb->tb_data;
	if (*pos == 0)
		return SEQ_START_TOKEN;
	else
		return fib_route_get_idx(iter, *pos - 1);
}

static void *fib_route_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct fib_route_iter *iter = seq->private;
	struct leaf *l = v;

	++*pos;
	if (v == SEQ_START_TOKEN) {
		iter->pos = 0;
		l = trie_firstleaf(iter->main_trie);
	} else {
		iter->pos++;
		l = trie_nextleaf(l);
	}

	if (l)
		iter->key = l->key;
	else
		iter->pos = 0;
	return l;
}

static void fib_route_seq_stop(struct seq_file *seq, void *v)
	__releases(RCU)
{
	rcu_read_unlock();
}

A
Al Viro 已提交
2565
static unsigned fib_flag_trans(int type, __be32 mask, const struct fib_info *fi)
2566
{
2567 2568 2569 2570
	static unsigned type2flags[RTN_MAX + 1] = {
		[7] = RTF_REJECT, [8] = RTF_REJECT,
	};
	unsigned flags = type2flags[type];
2571

2572 2573
	if (fi && fi->fib_nh->nh_gw)
		flags |= RTF_GATEWAY;
A
Al Viro 已提交
2574
	if (mask == htonl(0xFFFFFFFF))
2575 2576 2577
		flags |= RTF_HOST;
	flags |= RTF_UP;
	return flags;
2578 2579
}

2580 2581 2582 2583 2584 2585 2586
/*
 *	This outputs /proc/net/route.
 *	The format of the file is not supposed to be changed
 * 	and needs to be same as fib_hash output to avoid breaking
 *	legacy utilities
 */
static int fib_route_seq_show(struct seq_file *seq, void *v)
2587
{
2588
	struct leaf *l = v;
2589 2590
	struct leaf_info *li;
	struct hlist_node *node;
2591

2592 2593 2594 2595 2596 2597
	if (v == SEQ_START_TOKEN) {
		seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway "
			   "\tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU"
			   "\tWindow\tIRTT");
		return 0;
	}
2598

2599
	hlist_for_each_entry_rcu(li, node, &l->list, hlist) {
2600
		struct fib_alias *fa;
A
Al Viro 已提交
2601
		__be32 mask, prefix;
O
Olof Johansson 已提交
2602

2603 2604
		mask = inet_make_mask(li->plen);
		prefix = htonl(l->key);
2605

2606
		list_for_each_entry_rcu(fa, &li->falh, fa_list) {
2607
			const struct fib_info *fi = fa->fa_info;
2608
			unsigned flags = fib_flag_trans(fa->fa_type, mask, fi);
2609
			int len;
2610

2611 2612 2613
			if (fa->fa_type == RTN_BROADCAST
			    || fa->fa_type == RTN_MULTICAST)
				continue;
2614

2615
			if (fi)
2616 2617 2618
				seq_printf(seq,
					 "%s\t%08X\t%08X\t%04X\t%d\t%u\t"
					 "%d\t%08X\t%d\t%u\t%u%n",
2619 2620 2621 2622 2623
					 fi->fib_dev ? fi->fib_dev->name : "*",
					 prefix,
					 fi->fib_nh->nh_gw, flags, 0, 0,
					 fi->fib_priority,
					 mask,
2624 2625
					 (fi->fib_advmss ?
					  fi->fib_advmss + 40 : 0),
2626
					 fi->fib_window,
2627
					 fi->fib_rtt >> 3, &len);
2628
			else
2629 2630 2631
				seq_printf(seq,
					 "*\t%08X\t%08X\t%04X\t%d\t%u\t"
					 "%d\t%08X\t%d\t%u\t%u%n",
2632
					 prefix, 0, flags, 0, 0, 0,
2633
					 mask, 0, 0, 0, &len);
2634

2635
			seq_printf(seq, "%*s\n", 127 - len, "");
2636
		}
2637 2638 2639 2640 2641
	}

	return 0;
}

2642
static const struct seq_operations fib_route_seq_ops = {
2643 2644 2645
	.start  = fib_route_seq_start,
	.next   = fib_route_seq_next,
	.stop   = fib_route_seq_stop,
2646
	.show   = fib_route_seq_show,
2647 2648
};

2649
static int fib_route_seq_open(struct inode *inode, struct file *file)
2650
{
2651
	return seq_open_net(inode, file, &fib_route_seq_ops,
2652
			    sizeof(struct fib_route_iter));
2653 2654
}

2655
static const struct file_operations fib_route_fops = {
2656 2657 2658 2659
	.owner  = THIS_MODULE,
	.open   = fib_route_seq_open,
	.read   = seq_read,
	.llseek = seq_lseek,
2660
	.release = seq_release_net,
2661 2662
};

2663
int __net_init fib_proc_init(struct net *net)
2664
{
2665
	if (!proc_net_fops_create(net, "fib_trie", S_IRUGO, &fib_trie_fops))
2666 2667
		goto out1;

2668 2669
	if (!proc_net_fops_create(net, "fib_triestat", S_IRUGO,
				  &fib_triestat_fops))
2670 2671
		goto out2;

2672
	if (!proc_net_fops_create(net, "route", S_IRUGO, &fib_route_fops))
2673 2674
		goto out3;

2675
	return 0;
2676 2677

out3:
2678
	proc_net_remove(net, "fib_triestat");
2679
out2:
2680
	proc_net_remove(net, "fib_trie");
2681 2682
out1:
	return -ENOMEM;
2683 2684
}

2685
void __net_exit fib_proc_exit(struct net *net)
2686
{
2687 2688 2689
	proc_net_remove(net, "fib_trie");
	proc_net_remove(net, "fib_triestat");
	proc_net_remove(net, "route");
2690 2691 2692
}

#endif /* CONFIG_PROC_FS */