fib_trie.c 61.7 KB
Newer Older
1 2 3 4 5 6 7 8 9
/*
 *   This program is free software; you can redistribute it and/or
 *   modify it under the terms of the GNU General Public License
 *   as published by the Free Software Foundation; either version
 *   2 of the License, or (at your option) any later version.
 *
 *   Robert Olsson <robert.olsson@its.uu.se> Uppsala Universitet
 *     & Swedish University of Agricultural Sciences.
 *
10
 *   Jens Laas <jens.laas@data.slu.se> Swedish University of
11
 *     Agricultural Sciences.
12
 *
13 14
 *   Hans Liss <hans.liss@its.uu.se>  Uppsala Universitet
 *
L
Lucas De Marchi 已提交
15
 * This work is based on the LPC-trie which is originally described in:
16
 *
17 18
 * An experimental study of compression methods for dynamic tries
 * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
19
 * http://www.csc.kth.se/~snilsson/software/dyntrie2/
20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41
 *
 *
 * IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
 * IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999
 *
 *
 * Code from fib_hash has been reused which includes the following header:
 *
 *
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		IPv4 FIB: lookup engine and maintenance routines.
 *
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
R
Robert Olsson 已提交
42 43 44 45 46 47 48
 *
 * Substantial contributions to this work comes from:
 *
 *		David S. Miller, <davem@davemloft.net>
 *		Stephen Hemminger <shemminger@osdl.org>
 *		Paul E. McKenney <paulmck@us.ibm.com>
 *		Patrick McHardy <kaber@trash.net>
49 50
 */

J
Jens Låås 已提交
51
#define VERSION "0.409"
52 53

#include <asm/uaccess.h>
J
Jiri Slaby 已提交
54
#include <linux/bitops.h>
55 56 57 58 59 60 61 62 63
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
S
Stephen Hemminger 已提交
64
#include <linux/inetdevice.h>
65 66 67
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/proc_fs.h>
R
Robert Olsson 已提交
68
#include <linux/rcupdate.h>
69 70 71 72
#include <linux/skbuff.h>
#include <linux/netlink.h>
#include <linux/init.h>
#include <linux/list.h>
73
#include <linux/slab.h>
74
#include <linux/export.h>
75
#include <net/net_namespace.h>
76 77 78 79 80 81 82 83
#include <net/ip.h>
#include <net/protocol.h>
#include <net/route.h>
#include <net/tcp.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include "fib_lookup.h"

R
Robert Olsson 已提交
84
#define MAX_STAT_DEPTH 32
85 86 87 88 89 90 91 92

#define KEYLENGTH (8*sizeof(t_key))

typedef unsigned int t_key;

#define T_TNODE 0
#define T_LEAF  1
#define NODE_TYPE_MASK	0x1UL
R
Robert Olsson 已提交
93 94
#define NODE_TYPE(node) ((node)->parent & NODE_TYPE_MASK)

O
Olof Johansson 已提交
95 96
#define IS_TNODE(n) (!(n->parent & T_LEAF))
#define IS_LEAF(n) (n->parent & T_LEAF)
97

98
struct rt_trie_node {
O
Olof Johansson 已提交
99
	unsigned long parent;
100
	t_key key;
101 102 103
};

struct leaf {
O
Olof Johansson 已提交
104
	unsigned long parent;
105
	t_key key;
106
	struct hlist_head list;
R
Robert Olsson 已提交
107
	struct rcu_head rcu;
108 109 110 111 112
};

struct leaf_info {
	struct hlist_node hlist;
	int plen;
113
	u32 mask_plen; /* ntohl(inet_make_mask(plen)) */
114
	struct list_head falh;
115
	struct rcu_head rcu;
116 117 118
};

struct tnode {
O
Olof Johansson 已提交
119
	unsigned long parent;
120
	t_key key;
121 122
	unsigned char pos;		/* 2log(KEYLENGTH) bits needed */
	unsigned char bits;		/* 2log(KEYLENGTH) bits needed */
123 124
	unsigned int full_children;	/* KEYLENGTH bits needed */
	unsigned int empty_children;	/* KEYLENGTH bits needed */
125 126
	union {
		struct rcu_head rcu;
J
Jarek Poplawski 已提交
127
		struct tnode *tnode_free;
128
	};
E
Eric Dumazet 已提交
129
	struct rt_trie_node __rcu *child[0];
130 131 132 133 134 135 136 137 138
};

#ifdef CONFIG_IP_FIB_TRIE_STATS
struct trie_use_stats {
	unsigned int gets;
	unsigned int backtrack;
	unsigned int semantic_match_passed;
	unsigned int semantic_match_miss;
	unsigned int null_node_hit;
139
	unsigned int resize_node_skipped;
140 141 142 143 144 145 146 147 148
};
#endif

struct trie_stat {
	unsigned int totdepth;
	unsigned int maxdepth;
	unsigned int tnodes;
	unsigned int leaves;
	unsigned int nullpointers;
149
	unsigned int prefixes;
R
Robert Olsson 已提交
150
	unsigned int nodesizes[MAX_STAT_DEPTH];
151
};
152 153

struct trie {
E
Eric Dumazet 已提交
154
	struct rt_trie_node __rcu *trie;
155
#ifdef CONFIG_IP_FIB_TRIE_STATS
156
	struct trie_use_stats __percpu *stats;
157 158 159
#endif
};

160
static void tnode_put_child_reorg(struct tnode *tn, int i, struct rt_trie_node *n,
161
				  int wasfull);
162
static struct rt_trie_node *resize(struct trie *t, struct tnode *tn);
163 164
static struct tnode *inflate(struct trie *t, struct tnode *tn);
static struct tnode *halve(struct trie *t, struct tnode *tn);
J
Jarek Poplawski 已提交
165 166
/* tnodes to free after resize(); protected by RTNL */
static struct tnode *tnode_free_head;
167 168 169 170 171 172 173 174
static size_t tnode_free_size;

/*
 * synchronize_rcu after call_rcu for that many pages; it should be especially
 * useful before resizing the root node with PREEMPT_NONE configs; the value was
 * obtained experimentally, aiming to avoid visible slowdown.
 */
static const int sync_pages = 128;
175

176
static struct kmem_cache *fn_alias_kmem __read_mostly;
177
static struct kmem_cache *trie_leaf_kmem __read_mostly;
178

E
Eric Dumazet 已提交
179 180 181 182
/*
 * caller must hold RTNL
 */
static inline struct tnode *node_parent(const struct rt_trie_node *node)
S
Stephen Hemminger 已提交
183
{
E
Eric Dumazet 已提交
184 185 186 187 188
	unsigned long parent;

	parent = rcu_dereference_index_check(node->parent, lockdep_rtnl_is_held());

	return (struct tnode *)(parent & ~NODE_TYPE_MASK);
189 190
}

E
Eric Dumazet 已提交
191 192 193 194
/*
 * caller must hold RCU read lock or RTNL
 */
static inline struct tnode *node_parent_rcu(const struct rt_trie_node *node)
195
{
E
Eric Dumazet 已提交
196 197 198 199
	unsigned long parent;

	parent = rcu_dereference_index_check(node->parent, rcu_read_lock_held() ||
							   lockdep_rtnl_is_held());
S
Stephen Hemminger 已提交
200

E
Eric Dumazet 已提交
201
	return (struct tnode *)(parent & ~NODE_TYPE_MASK);
S
Stephen Hemminger 已提交
202 203
}

204
/* Same as rcu_assign_pointer
205 206
 * but that macro() assumes that value is a pointer.
 */
207
static inline void node_set_parent(struct rt_trie_node *node, struct tnode *ptr)
S
Stephen Hemminger 已提交
208
{
209 210
	smp_wmb();
	node->parent = (unsigned long)ptr | NODE_TYPE(node);
S
Stephen Hemminger 已提交
211
}
R
Robert Olsson 已提交
212

E
Eric Dumazet 已提交
213 214 215 216
/*
 * caller must hold RTNL
 */
static inline struct rt_trie_node *tnode_get_child(const struct tnode *tn, unsigned int i)
217 218
{
	BUG_ON(i >= 1U << tn->bits);
R
Robert Olsson 已提交
219

E
Eric Dumazet 已提交
220
	return rtnl_dereference(tn->child[i]);
221 222
}

E
Eric Dumazet 已提交
223 224 225 226
/*
 * caller must hold RCU read lock or RTNL
 */
static inline struct rt_trie_node *tnode_get_child_rcu(const struct tnode *tn, unsigned int i)
227
{
E
Eric Dumazet 已提交
228
	BUG_ON(i >= 1U << tn->bits);
229

E
Eric Dumazet 已提交
230
	return rcu_dereference_rtnl(tn->child[i]);
231 232
}

S
Stephen Hemmigner 已提交
233
static inline int tnode_child_length(const struct tnode *tn)
234
{
O
Olof Johansson 已提交
235
	return 1 << tn->bits;
236 237
}

238
static inline t_key mask_pfx(t_key k, unsigned int l)
S
Stephen Hemminger 已提交
239 240 241 242
{
	return (l == 0) ? 0 : k >> (KEYLENGTH-l) << (KEYLENGTH-l);
}

243
static inline t_key tkey_extract_bits(t_key a, unsigned int offset, unsigned int bits)
244
{
O
Olof Johansson 已提交
245
	if (offset < KEYLENGTH)
246
		return ((t_key)(a << offset)) >> (KEYLENGTH - bits);
O
Olof Johansson 已提交
247
	else
248 249 250 251 252
		return 0;
}

static inline int tkey_equals(t_key a, t_key b)
{
253
	return a == b;
254 255 256 257
}

static inline int tkey_sub_equals(t_key a, int offset, int bits, t_key b)
{
258 259
	if (bits == 0 || offset >= KEYLENGTH)
		return 1;
O
Olof Johansson 已提交
260 261
	bits = bits > KEYLENGTH ? KEYLENGTH : bits;
	return ((a ^ b) << offset) >> (KEYLENGTH - bits) == 0;
262
}
263 264 265 266 267 268

static inline int tkey_mismatch(t_key a, int offset, t_key b)
{
	t_key diff = a ^ b;
	int i = offset;

269 270 271
	if (!diff)
		return 0;
	while ((diff << i) >> (KEYLENGTH-1) == 0)
272 273 274 275 276
		i++;
	return i;
}

/*
277 278
  To understand this stuff, an understanding of keys and all their bits is
  necessary. Every node in the trie has a key associated with it, but not
279 280 281 282
  all of the bits in that key are significant.

  Consider a node 'n' and its parent 'tp'.

283 284 285 286 287
  If n is a leaf, every bit in its key is significant. Its presence is
  necessitated by path compression, since during a tree traversal (when
  searching for a leaf - unless we are doing an insertion) we will completely
  ignore all skipped bits we encounter. Thus we need to verify, at the end of
  a potentially successful search, that we have indeed been walking the
288 289
  correct key path.

290 291 292 293 294
  Note that we can never "miss" the correct key in the tree if present by
  following the wrong path. Path compression ensures that segments of the key
  that are the same for all keys with a given prefix are skipped, but the
  skipped part *is* identical for each node in the subtrie below the skipped
  bit! trie_insert() in this implementation takes care of that - note the
295 296
  call to tkey_sub_equals() in trie_insert().

297
  if n is an internal node - a 'tnode' here, the various parts of its key
298 299
  have many different meanings.

300
  Example:
301 302 303
  _________________________________________________________________
  | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
  -----------------------------------------------------------------
304
    0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15
305 306 307 308 309 310 311 312 313

  _________________________________________________________________
  | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
  -----------------------------------------------------------------
   16  17  18  19  20  21  22  23  24  25  26  27  28  29  30  31

  tp->pos = 7
  tp->bits = 3
  n->pos = 15
O
Olof Johansson 已提交
314
  n->bits = 4
315

316 317
  First, let's just ignore the bits that come before the parent tp, that is
  the bits from 0 to (tp->pos-1). They are *known* but at this point we do
318 319 320
  not use them for anything.

  The bits from (tp->pos) to (tp->pos + tp->bits - 1) - "N", above - are the
321
  index into the parent's child array. That is, they will be used to find
322 323 324 325 326
  'n' among tp's children.

  The bits from (tp->pos + tp->bits) to (n->pos - 1) - "S" - are skipped bits
  for the node n.

327
  All the bits we have seen so far are significant to the node n. The rest
328 329
  of the bits are really not needed or indeed known in n->key.

330
  The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into
331
  n's child array, and will of course be different for each child.
332

333

334 335 336 337 338
  The rest of the bits, from (n->pos + n->bits) onward, are completely unknown
  at this point.

*/

S
Stephen Hemminger 已提交
339
static inline void check_tnode(const struct tnode *tn)
340
{
S
Stephen Hemminger 已提交
341
	WARN_ON(tn && tn->pos+tn->bits > 32);
342 343
}

344 345
static const int halve_threshold = 25;
static const int inflate_threshold = 50;
346
static const int halve_threshold_root = 15;
J
Jens Låås 已提交
347
static const int inflate_threshold_root = 30;
R
Robert Olsson 已提交
348 349

static void __alias_free_mem(struct rcu_head *head)
350
{
R
Robert Olsson 已提交
351 352
	struct fib_alias *fa = container_of(head, struct fib_alias, rcu);
	kmem_cache_free(fn_alias_kmem, fa);
353 354
}

R
Robert Olsson 已提交
355
static inline void alias_free_mem_rcu(struct fib_alias *fa)
356
{
R
Robert Olsson 已提交
357 358
	call_rcu(&fa->rcu, __alias_free_mem);
}
O
Olof Johansson 已提交
359

R
Robert Olsson 已提交
360 361
static void __leaf_free_rcu(struct rcu_head *head)
{
362 363
	struct leaf *l = container_of(head, struct leaf, rcu);
	kmem_cache_free(trie_leaf_kmem, l);
R
Robert Olsson 已提交
364
}
O
Olof Johansson 已提交
365

366 367
static inline void free_leaf(struct leaf *l)
{
368
	call_rcu(&l->rcu, __leaf_free_rcu);
369 370
}

R
Robert Olsson 已提交
371
static inline void free_leaf_info(struct leaf_info *leaf)
372
{
373
	kfree_rcu(leaf, rcu);
374 375
}

376
static struct tnode *tnode_alloc(size_t size)
377
{
R
Robert Olsson 已提交
378
	if (size <= PAGE_SIZE)
379
		return kzalloc(size, GFP_KERNEL);
380
	else
381
		return vzalloc(size);
382
}
R
Robert Olsson 已提交
383 384

static void __tnode_free_rcu(struct rcu_head *head)
385
{
R
Robert Olsson 已提交
386
	struct tnode *tn = container_of(head, struct tnode, rcu);
387
	size_t size = sizeof(struct tnode) +
388
		      (sizeof(struct rt_trie_node *) << tn->bits);
389 390 391

	if (size <= PAGE_SIZE)
		kfree(tn);
A
Al Viro 已提交
392 393
	else
		vfree(tn);
394 395
}

R
Robert Olsson 已提交
396 397
static inline void tnode_free(struct tnode *tn)
{
398 399 400
	if (IS_LEAF(tn))
		free_leaf((struct leaf *) tn);
	else
R
Robert Olsson 已提交
401
		call_rcu(&tn->rcu, __tnode_free_rcu);
R
Robert Olsson 已提交
402 403
}

J
Jarek Poplawski 已提交
404 405 406
static void tnode_free_safe(struct tnode *tn)
{
	BUG_ON(IS_LEAF(tn));
407 408
	tn->tnode_free = tnode_free_head;
	tnode_free_head = tn;
409
	tnode_free_size += sizeof(struct tnode) +
410
			   (sizeof(struct rt_trie_node *) << tn->bits);
J
Jarek Poplawski 已提交
411 412 413 414 415 416 417 418 419 420 421
}

static void tnode_free_flush(void)
{
	struct tnode *tn;

	while ((tn = tnode_free_head)) {
		tnode_free_head = tn->tnode_free;
		tn->tnode_free = NULL;
		tnode_free(tn);
	}
422 423 424 425 426

	if (tnode_free_size >= PAGE_SIZE * sync_pages) {
		tnode_free_size = 0;
		synchronize_rcu();
	}
J
Jarek Poplawski 已提交
427 428
}

R
Robert Olsson 已提交
429 430
static struct leaf *leaf_new(void)
{
431
	struct leaf *l = kmem_cache_alloc(trie_leaf_kmem, GFP_KERNEL);
R
Robert Olsson 已提交
432 433 434 435 436 437 438 439 440 441 442 443
	if (l) {
		l->parent = T_LEAF;
		INIT_HLIST_HEAD(&l->list);
	}
	return l;
}

static struct leaf_info *leaf_info_new(int plen)
{
	struct leaf_info *li = kmalloc(sizeof(struct leaf_info),  GFP_KERNEL);
	if (li) {
		li->plen = plen;
444
		li->mask_plen = ntohl(inet_make_mask(plen));
R
Robert Olsson 已提交
445 446 447 448 449
		INIT_LIST_HEAD(&li->falh);
	}
	return li;
}

450
static struct tnode *tnode_new(t_key key, int pos, int bits)
451
{
452
	size_t sz = sizeof(struct tnode) + (sizeof(struct rt_trie_node *) << bits);
453
	struct tnode *tn = tnode_alloc(sz);
454

O
Olof Johansson 已提交
455
	if (tn) {
R
Robert Olsson 已提交
456
		tn->parent = T_TNODE;
457 458 459 460 461 462
		tn->pos = pos;
		tn->bits = bits;
		tn->key = key;
		tn->full_children = 0;
		tn->empty_children = 1<<bits;
	}
463

E
Eric Dumazet 已提交
464
	pr_debug("AT %p s=%zu %zu\n", tn, sizeof(struct tnode),
L
Lin Ming 已提交
465
		 sizeof(struct rt_trie_node *) << bits);
466 467 468 469 470 471 472 473
	return tn;
}

/*
 * Check whether a tnode 'n' is "full", i.e. it is an internal node
 * and no bits are skipped. See discussion in dyntree paper p. 6
 */

474
static inline int tnode_full(const struct tnode *tn, const struct rt_trie_node *n)
475
{
476
	if (n == NULL || IS_LEAF(n))
477 478 479 480 481
		return 0;

	return ((struct tnode *) n)->pos == tn->pos + tn->bits;
}

L
Lin Ming 已提交
482
static inline void put_child(struct tnode *tn, int i,
483
			     struct rt_trie_node *n)
484 485 486 487
{
	tnode_put_child_reorg(tn, i, n, -1);
}

488
 /*
489 490 491 492
  * Add a child at position i overwriting the old value.
  * Update the value of full_children and empty_children.
  */

493
static void tnode_put_child_reorg(struct tnode *tn, int i, struct rt_trie_node *n,
494
				  int wasfull)
495
{
E
Eric Dumazet 已提交
496
	struct rt_trie_node *chi = rtnl_dereference(tn->child[i]);
497 498
	int isfull;

S
Stephen Hemminger 已提交
499 500
	BUG_ON(i >= 1<<tn->bits);

501 502 503 504 505
	/* update emptyChildren */
	if (n == NULL && chi != NULL)
		tn->empty_children++;
	else if (n != NULL && chi == NULL)
		tn->empty_children--;
506

507
	/* update fullChildren */
O
Olof Johansson 已提交
508
	if (wasfull == -1)
509 510 511
		wasfull = tnode_full(tn, chi);

	isfull = tnode_full(tn, n);
512
	if (wasfull && !isfull)
513
		tn->full_children--;
514
	else if (!wasfull && isfull)
515
		tn->full_children++;
O
Olof Johansson 已提交
516

517
	if (n)
S
Stephen Hemminger 已提交
518
		node_set_parent(n, tn);
519

520
	rcu_assign_pointer(tn->child[i], n);
521 522
}

J
Jens Låås 已提交
523
#define MAX_WORK 10
524
static struct rt_trie_node *resize(struct trie *t, struct tnode *tn)
525 526
{
	int i;
527
	struct tnode *old_tn;
528 529
	int inflate_threshold_use;
	int halve_threshold_use;
J
Jens Låås 已提交
530
	int max_work;
531

532
	if (!tn)
533 534
		return NULL;

S
Stephen Hemminger 已提交
535 536
	pr_debug("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
		 tn, inflate_threshold, halve_threshold);
537 538 539

	/* No children */
	if (tn->empty_children == tnode_child_length(tn)) {
J
Jarek Poplawski 已提交
540
		tnode_free_safe(tn);
541 542 543 544
		return NULL;
	}
	/* One child */
	if (tn->empty_children == tnode_child_length(tn) - 1)
J
Jens Låås 已提交
545
		goto one_child;
546
	/*
547 548 549 550 551
	 * Double as long as the resulting node has a number of
	 * nonempty nodes that are above the threshold.
	 */

	/*
552 553
	 * From "Implementing a dynamic compressed trie" by Stefan Nilsson of
	 * the Helsinki University of Technology and Matti Tikkanen of Nokia
554
	 * Telecommunications, page 6:
555
	 * "A node is doubled if the ratio of non-empty children to all
556 557
	 * children in the *doubled* node is at least 'high'."
	 *
558 559 560 561 562
	 * 'high' in this instance is the variable 'inflate_threshold'. It
	 * is expressed as a percentage, so we multiply it with
	 * tnode_child_length() and instead of multiplying by 2 (since the
	 * child array will be doubled by inflate()) and multiplying
	 * the left-hand side by 100 (to handle the percentage thing) we
563
	 * multiply the left-hand side by 50.
564 565 566 567
	 *
	 * The left-hand side may look a bit weird: tnode_child_length(tn)
	 * - tn->empty_children is of course the number of non-null children
	 * in the current node. tn->full_children is the number of "full"
568
	 * children, that is non-null tnodes with a skip value of 0.
569
	 * All of those will be doubled in the resulting inflated tnode, so
570
	 * we just count them one extra time here.
571
	 *
572
	 * A clearer way to write this would be:
573
	 *
574
	 * to_be_doubled = tn->full_children;
575
	 * not_to_be_doubled = tnode_child_length(tn) - tn->empty_children -
576 577 578 579
	 *     tn->full_children;
	 *
	 * new_child_length = tnode_child_length(tn) * 2;
	 *
580
	 * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) /
581 582
	 *      new_child_length;
	 * if (new_fill_factor >= inflate_threshold)
583 584 585
	 *
	 * ...and so on, tho it would mess up the while () loop.
	 *
586 587 588
	 * anyway,
	 * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >=
	 *      inflate_threshold
589
	 *
590 591 592
	 * avoid a division:
	 * 100 * (not_to_be_doubled + 2*to_be_doubled) >=
	 *      inflate_threshold * new_child_length
593
	 *
594
	 * expand not_to_be_doubled and to_be_doubled, and shorten:
595
	 * 100 * (tnode_child_length(tn) - tn->empty_children +
O
Olof Johansson 已提交
596
	 *    tn->full_children) >= inflate_threshold * new_child_length
597
	 *
598
	 * expand new_child_length:
599
	 * 100 * (tnode_child_length(tn) - tn->empty_children +
O
Olof Johansson 已提交
600
	 *    tn->full_children) >=
601
	 *      inflate_threshold * tnode_child_length(tn) * 2
602
	 *
603
	 * shorten again:
604
	 * 50 * (tn->full_children + tnode_child_length(tn) -
O
Olof Johansson 已提交
605
	 *    tn->empty_children) >= inflate_threshold *
606
	 *    tnode_child_length(tn)
607
	 *
608 609 610
	 */

	check_tnode(tn);
611

612 613
	/* Keep root node larger  */

614
	if (!node_parent((struct rt_trie_node *)tn)) {
J
Jens Låås 已提交
615 616
		inflate_threshold_use = inflate_threshold_root;
		halve_threshold_use = halve_threshold_root;
E
Eric Dumazet 已提交
617
	} else {
618
		inflate_threshold_use = inflate_threshold;
J
Jens Låås 已提交
619 620
		halve_threshold_use = halve_threshold;
	}
621

J
Jens Låås 已提交
622 623
	max_work = MAX_WORK;
	while ((tn->full_children > 0 &&  max_work-- &&
624 625 626
		50 * (tn->full_children + tnode_child_length(tn)
		      - tn->empty_children)
		>= inflate_threshold_use * tnode_child_length(tn))) {
627

628 629
		old_tn = tn;
		tn = inflate(t, tn);
630

631 632
		if (IS_ERR(tn)) {
			tn = old_tn;
633
#ifdef CONFIG_IP_FIB_TRIE_STATS
634
			this_cpu_inc(t->stats->resize_node_skipped);
635 636 637
#endif
			break;
		}
638 639 640 641
	}

	check_tnode(tn);

J
Jens Låås 已提交
642
	/* Return if at least one inflate is run */
E
Eric Dumazet 已提交
643
	if (max_work != MAX_WORK)
644
		return (struct rt_trie_node *) tn;
J
Jens Låås 已提交
645

646 647 648 649
	/*
	 * Halve as long as the number of empty children in this
	 * node is above threshold.
	 */
650

J
Jens Låås 已提交
651 652
	max_work = MAX_WORK;
	while (tn->bits > 1 &&  max_work-- &&
653
	       100 * (tnode_child_length(tn) - tn->empty_children) <
654
	       halve_threshold_use * tnode_child_length(tn)) {
655

656 657 658 659
		old_tn = tn;
		tn = halve(t, tn);
		if (IS_ERR(tn)) {
			tn = old_tn;
660
#ifdef CONFIG_IP_FIB_TRIE_STATS
661
			this_cpu_inc(t->stats->resize_node_skipped);
662 663 664 665
#endif
			break;
		}
	}
666

667

668
	/* Only one child remains */
J
Jens Låås 已提交
669 670
	if (tn->empty_children == tnode_child_length(tn) - 1) {
one_child:
671
		for (i = 0; i < tnode_child_length(tn); i++) {
672
			struct rt_trie_node *n;
673

E
Eric Dumazet 已提交
674
			n = rtnl_dereference(tn->child[i]);
R
Robert Olsson 已提交
675
			if (!n)
O
Olof Johansson 已提交
676 677 678 679
				continue;

			/* compress one level */

S
Stephen Hemminger 已提交
680
			node_set_parent(n, NULL);
J
Jarek Poplawski 已提交
681
			tnode_free_safe(tn);
O
Olof Johansson 已提交
682
			return n;
683
		}
J
Jens Låås 已提交
684
	}
685
	return (struct rt_trie_node *) tn;
686 687
}

E
Eric Dumazet 已提交
688 689 690 691 692 693 694 695 696 697 698 699 700 701

static void tnode_clean_free(struct tnode *tn)
{
	int i;
	struct tnode *tofree;

	for (i = 0; i < tnode_child_length(tn); i++) {
		tofree = (struct tnode *)rtnl_dereference(tn->child[i]);
		if (tofree)
			tnode_free(tofree);
	}
	tnode_free(tn);
}

702
static struct tnode *inflate(struct trie *t, struct tnode *tn)
703 704 705 706 707
{
	struct tnode *oldtnode = tn;
	int olen = tnode_child_length(tn);
	int i;

S
Stephen Hemminger 已提交
708
	pr_debug("In inflate\n");
709 710 711

	tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1);

S
Stephen Hemminger 已提交
712
	if (!tn)
713
		return ERR_PTR(-ENOMEM);
714 715

	/*
716 717 718
	 * Preallocate and store tnodes before the actual work so we
	 * don't get into an inconsistent state if memory allocation
	 * fails. In case of failure we return the oldnode and  inflate
719 720
	 * of tnode is ignored.
	 */
O
Olof Johansson 已提交
721 722

	for (i = 0; i < olen; i++) {
723
		struct tnode *inode;
724

725
		inode = (struct tnode *) tnode_get_child(oldtnode, i);
726 727 728 729 730
		if (inode &&
		    IS_TNODE(inode) &&
		    inode->pos == oldtnode->pos + oldtnode->bits &&
		    inode->bits > 1) {
			struct tnode *left, *right;
S
Stephen Hemminger 已提交
731
			t_key m = ~0U << (KEYLENGTH - 1) >> inode->pos;
732

733 734
			left = tnode_new(inode->key&(~m), inode->pos + 1,
					 inode->bits - 1);
735 736
			if (!left)
				goto nomem;
O
Olof Johansson 已提交
737

738 739 740
			right = tnode_new(inode->key|m, inode->pos + 1,
					  inode->bits - 1);

741
			if (!right) {
742 743
				tnode_free(left);
				goto nomem;
744
			}
745

L
Lin Ming 已提交
746 747
			put_child(tn, 2*i, (struct rt_trie_node *) left);
			put_child(tn, 2*i+1, (struct rt_trie_node *) right);
748 749 750
		}
	}

O
Olof Johansson 已提交
751
	for (i = 0; i < olen; i++) {
752
		struct tnode *inode;
753
		struct rt_trie_node *node = tnode_get_child(oldtnode, i);
O
Olof Johansson 已提交
754 755
		struct tnode *left, *right;
		int size, j;
756

757 758 759 760 761 762
		/* An empty child */
		if (node == NULL)
			continue;

		/* A leaf or an internal node with skipped bits */

763
		if (IS_LEAF(node) || ((struct tnode *) node)->pos >
764
		   tn->pos + tn->bits - 1) {
765 766 767
			put_child(tn,
				tkey_extract_bits(node->key, oldtnode->pos, oldtnode->bits + 1),
				node);
768 769 770 771 772 773 774
			continue;
		}

		/* An internal node with two children */
		inode = (struct tnode *) node;

		if (inode->bits == 1) {
L
Lin Ming 已提交
775 776
			put_child(tn, 2*i, rtnl_dereference(inode->child[0]));
			put_child(tn, 2*i+1, rtnl_dereference(inode->child[1]));
777

J
Jarek Poplawski 已提交
778
			tnode_free_safe(inode);
O
Olof Johansson 已提交
779
			continue;
780 781
		}

O
Olof Johansson 已提交
782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799
		/* An internal node with more than two children */

		/* We will replace this node 'inode' with two new
		 * ones, 'left' and 'right', each with half of the
		 * original children. The two new nodes will have
		 * a position one bit further down the key and this
		 * means that the "significant" part of their keys
		 * (see the discussion near the top of this file)
		 * will differ by one bit, which will be "0" in
		 * left's key and "1" in right's key. Since we are
		 * moving the key position by one step, the bit that
		 * we are moving away from - the bit at position
		 * (inode->pos) - is the one that will differ between
		 * left and right. So... we synthesize that bit in the
		 * two  new keys.
		 * The mask 'm' below will be a single "one" bit at
		 * the position (inode->pos)
		 */
800

O
Olof Johansson 已提交
801 802 803
		/* Use the old key, but set the new significant
		 *   bit to zero.
		 */
804

O
Olof Johansson 已提交
805
		left = (struct tnode *) tnode_get_child(tn, 2*i);
L
Lin Ming 已提交
806
		put_child(tn, 2*i, NULL);
807

O
Olof Johansson 已提交
808
		BUG_ON(!left);
809

O
Olof Johansson 已提交
810
		right = (struct tnode *) tnode_get_child(tn, 2*i+1);
L
Lin Ming 已提交
811
		put_child(tn, 2*i+1, NULL);
812

O
Olof Johansson 已提交
813
		BUG_ON(!right);
814

O
Olof Johansson 已提交
815 816
		size = tnode_child_length(left);
		for (j = 0; j < size; j++) {
L
Lin Ming 已提交
817 818
			put_child(left, j, rtnl_dereference(inode->child[j]));
			put_child(right, j, rtnl_dereference(inode->child[j + size]));
819
		}
L
Lin Ming 已提交
820 821
		put_child(tn, 2*i, resize(t, left));
		put_child(tn, 2*i+1, resize(t, right));
O
Olof Johansson 已提交
822

J
Jarek Poplawski 已提交
823
		tnode_free_safe(inode);
824
	}
J
Jarek Poplawski 已提交
825
	tnode_free_safe(oldtnode);
826
	return tn;
827
nomem:
E
Eric Dumazet 已提交
828 829
	tnode_clean_free(tn);
	return ERR_PTR(-ENOMEM);
830 831
}

832
static struct tnode *halve(struct trie *t, struct tnode *tn)
833 834
{
	struct tnode *oldtnode = tn;
835
	struct rt_trie_node *left, *right;
836 837 838
	int i;
	int olen = tnode_child_length(tn);

S
Stephen Hemminger 已提交
839
	pr_debug("In halve\n");
840 841

	tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1);
842

843 844
	if (!tn)
		return ERR_PTR(-ENOMEM);
845 846

	/*
847 848 849
	 * Preallocate and store tnodes before the actual work so we
	 * don't get into an inconsistent state if memory allocation
	 * fails. In case of failure we return the oldnode and halve
850 851 852
	 * of tnode is ignored.
	 */

O
Olof Johansson 已提交
853
	for (i = 0; i < olen; i += 2) {
854 855
		left = tnode_get_child(oldtnode, i);
		right = tnode_get_child(oldtnode, i+1);
856

857
		/* Two nonempty children */
S
Stephen Hemminger 已提交
858
		if (left && right) {
859
			struct tnode *newn;
S
Stephen Hemminger 已提交
860

861
			newn = tnode_new(left->key, tn->pos + tn->bits, 1);
S
Stephen Hemminger 已提交
862 863

			if (!newn)
864
				goto nomem;
S
Stephen Hemminger 已提交
865

L
Lin Ming 已提交
866
			put_child(tn, i/2, (struct rt_trie_node *)newn);
867 868 869
		}

	}
870

O
Olof Johansson 已提交
871 872 873
	for (i = 0; i < olen; i += 2) {
		struct tnode *newBinNode;

874 875
		left = tnode_get_child(oldtnode, i);
		right = tnode_get_child(oldtnode, i+1);
876

877 878 879 880
		/* At least one of the children is empty */
		if (left == NULL) {
			if (right == NULL)    /* Both are empty */
				continue;
L
Lin Ming 已提交
881
			put_child(tn, i/2, right);
O
Olof Johansson 已提交
882
			continue;
S
Stephen Hemminger 已提交
883
		}
O
Olof Johansson 已提交
884 885

		if (right == NULL) {
L
Lin Ming 已提交
886
			put_child(tn, i/2, left);
O
Olof Johansson 已提交
887 888
			continue;
		}
889

890
		/* Two nonempty children */
O
Olof Johansson 已提交
891
		newBinNode = (struct tnode *) tnode_get_child(tn, i/2);
L
Lin Ming 已提交
892 893 894 895
		put_child(tn, i/2, NULL);
		put_child(newBinNode, 0, left);
		put_child(newBinNode, 1, right);
		put_child(tn, i/2, resize(t, newBinNode));
896
	}
J
Jarek Poplawski 已提交
897
	tnode_free_safe(oldtnode);
898
	return tn;
899
nomem:
E
Eric Dumazet 已提交
900 901
	tnode_clean_free(tn);
	return ERR_PTR(-ENOMEM);
902 903
}

R
Robert Olsson 已提交
904
/* readside must use rcu_read_lock currently dump routines
R
Robert Olsson 已提交
905 906
 via get_fa_head and dump */

R
Robert Olsson 已提交
907
static struct leaf_info *find_leaf_info(struct leaf *l, int plen)
908
{
R
Robert Olsson 已提交
909
	struct hlist_head *head = &l->list;
910 911
	struct leaf_info *li;

912
	hlist_for_each_entry_rcu(li, head, hlist)
913
		if (li->plen == plen)
914
			return li;
O
Olof Johansson 已提交
915

916 917 918
	return NULL;
}

919
static inline struct list_head *get_fa_head(struct leaf *l, int plen)
920
{
R
Robert Olsson 已提交
921
	struct leaf_info *li = find_leaf_info(l, plen);
922

O
Olof Johansson 已提交
923 924
	if (!li)
		return NULL;
925

O
Olof Johansson 已提交
926
	return &li->falh;
927 928 929 930
}

static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new)
{
931 932 933 934 935
	struct leaf_info *li = NULL, *last = NULL;

	if (hlist_empty(head)) {
		hlist_add_head_rcu(&new->hlist, head);
	} else {
936
		hlist_for_each_entry(li, head, hlist) {
937 938 939 940 941 942
			if (new->plen > li->plen)
				break;

			last = li;
		}
		if (last)
943
			hlist_add_behind_rcu(&new->hlist, &last->hlist);
944 945 946
		else
			hlist_add_before_rcu(&new->hlist, &li->hlist);
	}
947 948
}

R
Robert Olsson 已提交
949 950
/* rcu_read_lock needs to be hold by caller from readside */

951 952 953 954 955
static struct leaf *
fib_find_node(struct trie *t, u32 key)
{
	int pos;
	struct tnode *tn;
956
	struct rt_trie_node *n;
957 958

	pos = 0;
E
Eric Dumazet 已提交
959
	n = rcu_dereference_rtnl(t->trie);
960 961 962

	while (n != NULL &&  NODE_TYPE(n) == T_TNODE) {
		tn = (struct tnode *) n;
O
Olof Johansson 已提交
963

964
		check_tnode(tn);
O
Olof Johansson 已提交
965

966
		if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
O
Olof Johansson 已提交
967
			pos = tn->pos + tn->bits;
968 969 970 971
			n = tnode_get_child_rcu(tn,
						tkey_extract_bits(key,
								  tn->pos,
								  tn->bits));
O
Olof Johansson 已提交
972
		} else
973 974 975 976
			break;
	}
	/* Case we have found a leaf. Compare prefixes */

O
Olof Johansson 已提交
977 978 979
	if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key))
		return (struct leaf *)n;

980 981 982
	return NULL;
}

983
static void trie_rebalance(struct trie *t, struct tnode *tn)
984 985
{
	int wasfull;
R
Robert Olsson 已提交
986
	t_key cindex, key;
S
Stephen Hemminger 已提交
987
	struct tnode *tp;
988

R
Robert Olsson 已提交
989 990
	key = tn->key;

991
	while (tn != NULL && (tp = node_parent((struct rt_trie_node *)tn)) != NULL) {
992 993
		cindex = tkey_extract_bits(key, tp->pos, tp->bits);
		wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
J
Joe Perches 已提交
994
		tn = (struct tnode *)resize(t, tn);
995

J
Joe Perches 已提交
996
		tnode_put_child_reorg(tp, cindex,
997
				      (struct rt_trie_node *)tn, wasfull);
O
Olof Johansson 已提交
998

999
		tp = node_parent((struct rt_trie_node *) tn);
1000
		if (!tp)
1001
			rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn);
1002

J
Jarek Poplawski 已提交
1003
		tnode_free_flush();
S
Stephen Hemminger 已提交
1004
		if (!tp)
1005
			break;
S
Stephen Hemminger 已提交
1006
		tn = tp;
1007
	}
S
Stephen Hemminger 已提交
1008

1009
	/* Handle last (top) tnode */
1010
	if (IS_TNODE(tn))
J
Joe Perches 已提交
1011
		tn = (struct tnode *)resize(t, tn);
1012

1013
	rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn);
1014
	tnode_free_flush();
1015 1016
}

R
Robert Olsson 已提交
1017 1018
/* only used from updater-side */

1019
static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
1020 1021 1022
{
	int pos, newpos;
	struct tnode *tp = NULL, *tn = NULL;
1023
	struct rt_trie_node *n;
1024 1025
	struct leaf *l;
	int missbit;
1026
	struct list_head *fa_head = NULL;
1027 1028 1029 1030
	struct leaf_info *li;
	t_key cindex;

	pos = 0;
E
Eric Dumazet 已提交
1031
	n = rtnl_dereference(t->trie);
1032

1033 1034
	/* If we point to NULL, stop. Either the tree is empty and we should
	 * just put a new leaf in if, or we have reached an empty child slot,
1035
	 * and we should just put our new leaf in that.
1036 1037
	 * If we point to a T_TNODE, check if it matches our key. Note that
	 * a T_TNODE might be skipping any number of bits - its 'pos' need
1038 1039
	 * not be the parent's 'pos'+'bits'!
	 *
1040
	 * If it does match the current key, get pos/bits from it, extract
1041 1042 1043 1044
	 * the index from our key, push the T_TNODE and walk the tree.
	 *
	 * If it doesn't, we have to replace it with a new T_TNODE.
	 *
1045 1046 1047
	 * If we point to a T_LEAF, it might or might not have the same key
	 * as we do. If it does, just change the value, update the T_LEAF's
	 * value, and return it.
1048 1049 1050 1051 1052
	 * If it doesn't, we need to replace it with a T_TNODE.
	 */

	while (n != NULL &&  NODE_TYPE(n) == T_TNODE) {
		tn = (struct tnode *) n;
O
Olof Johansson 已提交
1053

1054
		check_tnode(tn);
O
Olof Johansson 已提交
1055

1056
		if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
1057
			tp = tn;
O
Olof Johansson 已提交
1058
			pos = tn->pos + tn->bits;
1059 1060 1061 1062
			n = tnode_get_child(tn,
					    tkey_extract_bits(key,
							      tn->pos,
							      tn->bits));
1063

S
Stephen Hemminger 已提交
1064
			BUG_ON(n && node_parent(n) != tn);
O
Olof Johansson 已提交
1065
		} else
1066 1067 1068 1069 1070 1071
			break;
	}

	/*
	 * n  ----> NULL, LEAF or TNODE
	 *
1072
	 * tp is n's (parent) ----> NULL or TNODE
1073 1074
	 */

O
Olof Johansson 已提交
1075
	BUG_ON(tp && IS_LEAF(tp));
1076 1077 1078

	/* Case 1: n is a leaf. Compare prefixes */

1079
	if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) {
1080
		l = (struct leaf *) n;
1081
		li = leaf_info_new(plen);
O
Olof Johansson 已提交
1082

1083 1084
		if (!li)
			return NULL;
1085 1086 1087 1088 1089 1090 1091

		fa_head = &li->falh;
		insert_leaf_info(&l->list, li);
		goto done;
	}
	l = leaf_new();

1092 1093
	if (!l)
		return NULL;
1094 1095 1096 1097

	l->key = key;
	li = leaf_info_new(plen);

1098
	if (!li) {
1099
		free_leaf(l);
1100
		return NULL;
1101
	}
1102 1103 1104 1105 1106

	fa_head = &li->falh;
	insert_leaf_info(&l->list, li);

	if (t->trie && n == NULL) {
O
Olof Johansson 已提交
1107
		/* Case 2: n is NULL, and will just insert a new leaf */
1108

1109
		node_set_parent((struct rt_trie_node *)l, tp);
1110

O
Olof Johansson 已提交
1111
		cindex = tkey_extract_bits(key, tp->pos, tp->bits);
L
Lin Ming 已提交
1112
		put_child(tp, cindex, (struct rt_trie_node *)l);
O
Olof Johansson 已提交
1113 1114
	} else {
		/* Case 3: n is a LEAF or a TNODE and the key doesn't match. */
1115 1116
		/*
		 *  Add a new tnode here
1117 1118 1119
		 *  first tnode need some special handling
		 */

1120
		if (n) {
1121
			pos = tp ? tp->pos+tp->bits : 0;
1122 1123
			newpos = tkey_mismatch(key, pos, n->key);
			tn = tnode_new(n->key, newpos, 1);
O
Olof Johansson 已提交
1124
		} else {
1125
			newpos = 0;
1126
			tn = tnode_new(key, newpos, 1); /* First tnode */
1127 1128
		}

1129
		if (!tn) {
1130
			free_leaf_info(li);
1131
			free_leaf(l);
1132
			return NULL;
O
Olof Johansson 已提交
1133 1134
		}

1135
		node_set_parent((struct rt_trie_node *)tn, tp);
1136

O
Olof Johansson 已提交
1137
		missbit = tkey_extract_bits(key, newpos, 1);
L
Lin Ming 已提交
1138 1139
		put_child(tn, missbit, (struct rt_trie_node *)l);
		put_child(tn, 1-missbit, n);
1140

1141
		if (tp) {
1142
			cindex = tkey_extract_bits(key, tp->pos, tp->bits);
L
Lin Ming 已提交
1143
			put_child(tp, cindex, (struct rt_trie_node *)tn);
O
Olof Johansson 已提交
1144
		} else {
1145
			rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn);
1146
		}
1147 1148

		tp = tn;
1149
	}
O
Olof Johansson 已提交
1150 1151

	if (tp && tp->pos + tp->bits > 32)
J
Joe Perches 已提交
1152 1153
		pr_warn("fib_trie tp=%p pos=%d, bits=%d, key=%0x plen=%d\n",
			tp, tp->pos, tp->bits, key, plen);
O
Olof Johansson 已提交
1154

1155
	/* Rebalance the trie */
R
Robert Olsson 已提交
1156

1157
	trie_rebalance(t, tp);
1158
done:
1159 1160 1161
	return fa_head;
}

1162 1163 1164
/*
 * Caller must hold RTNL.
 */
1165
int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
1166 1167 1168
{
	struct trie *t = (struct trie *) tb->tb_data;
	struct fib_alias *fa, *new_fa;
1169
	struct list_head *fa_head = NULL;
1170
	struct fib_info *fi;
1171 1172
	int plen = cfg->fc_dst_len;
	u8 tos = cfg->fc_tos;
1173 1174 1175 1176 1177 1178 1179
	u32 key, mask;
	int err;
	struct leaf *l;

	if (plen > 32)
		return -EINVAL;

1180
	key = ntohl(cfg->fc_dst);
1181

1182
	pr_debug("Insert table=%u %08x/%d\n", tb->tb_id, key, plen);
1183

O
Olof Johansson 已提交
1184
	mask = ntohl(inet_make_mask(plen));
1185

1186
	if (key & ~mask)
1187 1188 1189 1190
		return -EINVAL;

	key = key & mask;

1191 1192 1193
	fi = fib_create_info(cfg);
	if (IS_ERR(fi)) {
		err = PTR_ERR(fi);
1194
		goto err;
1195
	}
1196 1197

	l = fib_find_node(t, key);
1198
	fa = NULL;
1199

1200
	if (l) {
1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215
		fa_head = get_fa_head(l, plen);
		fa = fib_find_alias(fa_head, tos, fi->fib_priority);
	}

	/* Now fa, if non-NULL, points to the first fib alias
	 * with the same keys [prefix,tos,priority], if such key already
	 * exists or to the node before which we will insert new one.
	 *
	 * If fa is NULL, we will need to allocate a new one and
	 * insert to the head of f.
	 *
	 * If f is NULL, no fib node matched the destination key
	 * and we need to allocate a new one of those as well.
	 */

1216 1217 1218
	if (fa && fa->fa_tos == tos &&
	    fa->fa_info->fib_priority == fi->fib_priority) {
		struct fib_alias *fa_first, *fa_match;
1219 1220

		err = -EEXIST;
1221
		if (cfg->fc_nlflags & NLM_F_EXCL)
1222 1223
			goto out;

1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243
		/* We have 2 goals:
		 * 1. Find exact match for type, scope, fib_info to avoid
		 * duplicate routes
		 * 2. Find next 'fa' (or head), NLM_F_APPEND inserts before it
		 */
		fa_match = NULL;
		fa_first = fa;
		fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
		list_for_each_entry_continue(fa, fa_head, fa_list) {
			if (fa->fa_tos != tos)
				break;
			if (fa->fa_info->fib_priority != fi->fib_priority)
				break;
			if (fa->fa_type == cfg->fc_type &&
			    fa->fa_info == fi) {
				fa_match = fa;
				break;
			}
		}

1244
		if (cfg->fc_nlflags & NLM_F_REPLACE) {
1245 1246 1247
			struct fib_info *fi_drop;
			u8 state;

1248 1249 1250 1251
			fa = fa_first;
			if (fa_match) {
				if (fa == fa_match)
					err = 0;
1252
				goto out;
1253
			}
R
Robert Olsson 已提交
1254
			err = -ENOBUFS;
1255
			new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
R
Robert Olsson 已提交
1256 1257
			if (new_fa == NULL)
				goto out;
1258 1259

			fi_drop = fa->fa_info;
R
Robert Olsson 已提交
1260 1261
			new_fa->fa_tos = fa->fa_tos;
			new_fa->fa_info = fi;
1262
			new_fa->fa_type = cfg->fc_type;
1263
			state = fa->fa_state;
1264
			new_fa->fa_state = state & ~FA_S_ACCESSED;
1265

R
Robert Olsson 已提交
1266 1267
			list_replace_rcu(&fa->fa_list, &new_fa->fa_list);
			alias_free_mem_rcu(fa);
1268 1269 1270

			fib_release_info(fi_drop);
			if (state & FA_S_ACCESSED)
1271
				rt_cache_flush(cfg->fc_nlinfo.nl_net);
1272 1273
			rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen,
				tb->tb_id, &cfg->fc_nlinfo, NLM_F_REPLACE);
1274

O
Olof Johansson 已提交
1275
			goto succeeded;
1276 1277 1278 1279 1280
		}
		/* Error if we find a perfect match which
		 * uses the same scope, type, and nexthop
		 * information.
		 */
1281 1282
		if (fa_match)
			goto out;
1283

1284
		if (!(cfg->fc_nlflags & NLM_F_APPEND))
1285
			fa = fa_first;
1286 1287
	}
	err = -ENOENT;
1288
	if (!(cfg->fc_nlflags & NLM_F_CREATE))
1289 1290 1291
		goto out;

	err = -ENOBUFS;
1292
	new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
1293 1294 1295 1296 1297
	if (new_fa == NULL)
		goto out;

	new_fa->fa_info = fi;
	new_fa->fa_tos = tos;
1298
	new_fa->fa_type = cfg->fc_type;
1299 1300 1301 1302 1303
	new_fa->fa_state = 0;
	/*
	 * Insert new entry to the list.
	 */

1304
	if (!fa_head) {
1305 1306 1307
		fa_head = fib_insert_node(t, key, plen);
		if (unlikely(!fa_head)) {
			err = -ENOMEM;
1308
			goto out_free_new_fa;
1309
		}
1310
	}
1311

1312 1313 1314
	if (!plen)
		tb->tb_num_default++;

R
Robert Olsson 已提交
1315 1316
	list_add_tail_rcu(&new_fa->fa_list,
			  (fa ? &fa->fa_list : fa_head));
1317

1318
	rt_cache_flush(cfg->fc_nlinfo.nl_net);
1319
	rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id,
1320
		  &cfg->fc_nlinfo, 0);
1321 1322
succeeded:
	return 0;
1323 1324 1325

out_free_new_fa:
	kmem_cache_free(fn_alias_kmem, new_fa);
1326 1327
out:
	fib_release_info(fi);
O
Olof Johansson 已提交
1328
err:
1329 1330 1331
	return err;
}

R
Robert Olsson 已提交
1332
/* should be called with rcu_read_lock */
1333
static int check_leaf(struct fib_table *tb, struct trie *t, struct leaf *l,
1334
		      t_key key,  const struct flowi4 *flp,
E
Eric Dumazet 已提交
1335
		      struct fib_result *res, int fib_flags)
1336 1337 1338
{
	struct leaf_info *li;
	struct hlist_head *hhead = &l->list;
1339

1340
	hlist_for_each_entry_rcu(li, hhead, hlist) {
1341
		struct fib_alias *fa;
1342

1343
		if (l->key != (key & li->mask_plen))
1344 1345
			continue;

1346 1347 1348
		list_for_each_entry_rcu(fa, &li->falh, fa_list) {
			struct fib_info *fi = fa->fa_info;
			int nhsel, err;
1349

1350
			if (fa->fa_tos && fa->fa_tos != flp->flowi4_tos)
1351
				continue;
1352 1353
			if (fi->fib_dead)
				continue;
1354
			if (fa->fa_info->fib_scope < flp->flowi4_scope)
1355 1356 1357 1358
				continue;
			fib_alias_accessed(fa);
			err = fib_props[fa->fa_type].error;
			if (err) {
1359
#ifdef CONFIG_IP_FIB_TRIE_STATS
1360
				this_cpu_inc(t->stats->semantic_match_passed);
1361
#endif
1362
				return err;
1363 1364 1365 1366 1367 1368 1369 1370
			}
			if (fi->fib_flags & RTNH_F_DEAD)
				continue;
			for (nhsel = 0; nhsel < fi->fib_nhs; nhsel++) {
				const struct fib_nh *nh = &fi->fib_nh[nhsel];

				if (nh->nh_flags & RTNH_F_DEAD)
					continue;
1371
				if (flp->flowi4_oif && flp->flowi4_oif != nh->nh_oif)
1372 1373 1374
					continue;

#ifdef CONFIG_IP_FIB_TRIE_STATS
1375
				this_cpu_inc(t->stats->semantic_match_passed);
1376
#endif
1377
				res->prefixlen = li->plen;
1378 1379
				res->nh_sel = nhsel;
				res->type = fa->fa_type;
1380
				res->scope = fa->fa_info->fib_scope;
1381 1382 1383 1384
				res->fi = fi;
				res->table = tb;
				res->fa_head = &li->falh;
				if (!(fib_flags & FIB_LOOKUP_NOREF))
1385
					atomic_inc(&fi->fib_clntref);
1386 1387 1388 1389 1390
				return 0;
			}
		}

#ifdef CONFIG_IP_FIB_TRIE_STATS
1391
		this_cpu_inc(t->stats->semantic_match_miss);
1392 1393
#endif
	}
1394

1395
	return 1;
1396 1397
}

1398
int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp,
E
Eric Dumazet 已提交
1399
		     struct fib_result *res, int fib_flags)
1400 1401
{
	struct trie *t = (struct trie *) tb->tb_data;
1402 1403 1404
#ifdef CONFIG_IP_FIB_TRIE_STATS
	struct trie_use_stats __percpu *stats = t->stats;
#endif
1405
	int ret;
1406
	struct rt_trie_node *n;
1407
	struct tnode *pn;
1408
	unsigned int pos, bits;
1409
	t_key key = ntohl(flp->daddr);
1410
	unsigned int chopped_off;
1411
	t_key cindex = 0;
1412
	unsigned int current_prefix_length = KEYLENGTH;
O
Olof Johansson 已提交
1413
	struct tnode *cn;
1414
	t_key pref_mismatch;
O
Olof Johansson 已提交
1415

R
Robert Olsson 已提交
1416
	rcu_read_lock();
O
Olof Johansson 已提交
1417

R
Robert Olsson 已提交
1418
	n = rcu_dereference(t->trie);
1419
	if (!n)
1420 1421 1422
		goto failed;

#ifdef CONFIG_IP_FIB_TRIE_STATS
1423
	this_cpu_inc(stats->gets);
1424 1425 1426 1427
#endif

	/* Just a leaf? */
	if (IS_LEAF(n)) {
1428
		ret = check_leaf(tb, t, (struct leaf *)n, key, flp, res, fib_flags);
1429
		goto found;
1430
	}
1431

1432 1433
	pn = (struct tnode *) n;
	chopped_off = 0;
1434

O
Olof Johansson 已提交
1435
	while (pn) {
1436 1437 1438
		pos = pn->pos;
		bits = pn->bits;

1439
		if (!chopped_off)
S
Stephen Hemminger 已提交
1440 1441
			cindex = tkey_extract_bits(mask_pfx(key, current_prefix_length),
						   pos, bits);
1442

1443
		n = tnode_get_child_rcu(pn, cindex);
1444 1445 1446

		if (n == NULL) {
#ifdef CONFIG_IP_FIB_TRIE_STATS
1447
			this_cpu_inc(stats->null_node_hit);
1448 1449 1450 1451
#endif
			goto backtrace;
		}

O
Olof Johansson 已提交
1452
		if (IS_LEAF(n)) {
1453
			ret = check_leaf(tb, t, (struct leaf *)n, key, flp, res, fib_flags);
1454
			if (ret > 0)
O
Olof Johansson 已提交
1455
				goto backtrace;
1456
			goto found;
O
Olof Johansson 已提交
1457 1458 1459
		}

		cn = (struct tnode *)n;
1460

O
Olof Johansson 已提交
1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475
		/*
		 * It's a tnode, and we can do some extra checks here if we
		 * like, to avoid descending into a dead-end branch.
		 * This tnode is in the parent's child array at index
		 * key[p_pos..p_pos+p_bits] but potentially with some bits
		 * chopped off, so in reality the index may be just a
		 * subprefix, padded with zero at the end.
		 * We can also take a look at any skipped bits in this
		 * tnode - everything up to p_pos is supposed to be ok,
		 * and the non-chopped bits of the index (se previous
		 * paragraph) are also guaranteed ok, but the rest is
		 * considered unknown.
		 *
		 * The skipped bits are key[pos+bits..cn->pos].
		 */
1476

O
Olof Johansson 已提交
1477 1478 1479 1480 1481 1482 1483 1484 1485
		/* If current_prefix_length < pos+bits, we are already doing
		 * actual prefix  matching, which means everything from
		 * pos+(bits-chopped_off) onward must be zero along some
		 * branch of this subtree - otherwise there is *no* valid
		 * prefix present. Here we can only check the skipped
		 * bits. Remember, since we have already indexed into the
		 * parent's child array, we know that the bits we chopped of
		 * *are* zero.
		 */
1486

1487 1488
		/* NOTA BENE: Checking only skipped bits
		   for the new node here */
1489

O
Olof Johansson 已提交
1490 1491
		if (current_prefix_length < pos+bits) {
			if (tkey_extract_bits(cn->key, current_prefix_length,
1492 1493
						cn->pos - current_prefix_length)
			    || !(cn->child[0]))
O
Olof Johansson 已提交
1494 1495
				goto backtrace;
		}
1496

O
Olof Johansson 已提交
1497 1498 1499 1500 1501 1502 1503 1504 1505
		/*
		 * If chopped_off=0, the index is fully validated and we
		 * only need to look at the skipped bits for this, the new,
		 * tnode. What we actually want to do is to find out if
		 * these skipped bits match our key perfectly, or if we will
		 * have to count on finding a matching prefix further down,
		 * because if we do, we would like to have some way of
		 * verifying the existence of such a prefix at this point.
		 */
1506

O
Olof Johansson 已提交
1507 1508 1509 1510 1511 1512 1513 1514
		/* The only thing we can do at this point is to verify that
		 * any such matching prefix can indeed be a prefix to our
		 * key, and if the bits in the node we are inspecting that
		 * do not match our key are not ZERO, this cannot be true.
		 * Thus, find out where there is a mismatch (before cn->pos)
		 * and verify that all the mismatching bits are zero in the
		 * new tnode's key.
		 */
1515

1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526
		/*
		 * Note: We aren't very concerned about the piece of
		 * the key that precede pn->pos+pn->bits, since these
		 * have already been checked. The bits after cn->pos
		 * aren't checked since these are by definition
		 * "unknown" at this point. Thus, what we want to see
		 * is if we are about to enter the "prefix matching"
		 * state, and in that case verify that the skipped
		 * bits that will prevail throughout this subtree are
		 * zero, as they have to be if we are to find a
		 * matching prefix.
O
Olof Johansson 已提交
1527 1528
		 */

1529
		pref_mismatch = mask_pfx(cn->key ^ key, cn->pos);
O
Olof Johansson 已提交
1530

1531 1532 1533 1534
		/*
		 * In short: If skipped bits in this node do not match
		 * the search key, enter the "prefix matching"
		 * state.directly.
O
Olof Johansson 已提交
1535 1536
		 */
		if (pref_mismatch) {
1537 1538
			/* fls(x) = __fls(x) + 1 */
			int mp = KEYLENGTH - __fls(pref_mismatch) - 1;
O
Olof Johansson 已提交
1539

1540
			if (tkey_extract_bits(cn->key, mp, cn->pos - mp) != 0)
O
Olof Johansson 已提交
1541 1542 1543 1544
				goto backtrace;

			if (current_prefix_length >= cn->pos)
				current_prefix_length = mp;
1545
		}
1546

O
Olof Johansson 已提交
1547 1548 1549 1550
		pn = (struct tnode *)n; /* Descend */
		chopped_off = 0;
		continue;

1551 1552 1553 1554
backtrace:
		chopped_off++;

		/* As zero don't change the child key (cindex) */
1555 1556
		while ((chopped_off <= pn->bits)
		       && !(cindex & (1<<(chopped_off-1))))
1557 1558 1559 1560
			chopped_off++;

		/* Decrease current_... with bits chopped off */
		if (current_prefix_length > pn->pos + pn->bits - chopped_off)
1561 1562
			current_prefix_length = pn->pos + pn->bits
				- chopped_off;
O
Olof Johansson 已提交
1563

1564
		/*
1565
		 * Either we do the actual chop off according or if we have
1566 1567 1568
		 * chopped off all bits in this tnode walk up to our parent.
		 */

O
Olof Johansson 已提交
1569
		if (chopped_off <= pn->bits) {
1570
			cindex &= ~(1 << (chopped_off-1));
O
Olof Johansson 已提交
1571
		} else {
1572
			struct tnode *parent = node_parent_rcu((struct rt_trie_node *) pn);
S
Stephen Hemminger 已提交
1573
			if (!parent)
1574
				goto failed;
O
Olof Johansson 已提交
1575

1576
			/* Get Child's index */
S
Stephen Hemminger 已提交
1577 1578
			cindex = tkey_extract_bits(pn->key, parent->pos, parent->bits);
			pn = parent;
1579 1580 1581
			chopped_off = 0;

#ifdef CONFIG_IP_FIB_TRIE_STATS
1582
			this_cpu_inc(stats->backtrack);
1583 1584
#endif
			goto backtrace;
1585
		}
1586 1587
	}
failed:
1588
	ret = 1;
1589
found:
R
Robert Olsson 已提交
1590
	rcu_read_unlock();
1591 1592
	return ret;
}
1593
EXPORT_SYMBOL_GPL(fib_table_lookup);
1594

1595 1596 1597 1598
/*
 * Remove the leaf and return parent.
 */
static void trie_leaf_remove(struct trie *t, struct leaf *l)
1599
{
1600
	struct tnode *tp = node_parent((struct rt_trie_node *) l);
1601

1602
	pr_debug("entering trie_leaf_remove(%p)\n", l);
1603

1604
	if (tp) {
1605
		t_key cindex = tkey_extract_bits(l->key, tp->pos, tp->bits);
L
Lin Ming 已提交
1606
		put_child(tp, cindex, NULL);
1607
		trie_rebalance(t, tp);
O
Olof Johansson 已提交
1608
	} else
1609
		RCU_INIT_POINTER(t->trie, NULL);
1610

1611
	free_leaf(l);
1612 1613
}

1614 1615 1616
/*
 * Caller must hold RTNL.
 */
1617
int fib_table_delete(struct fib_table *tb, struct fib_config *cfg)
1618 1619 1620
{
	struct trie *t = (struct trie *) tb->tb_data;
	u32 key, mask;
1621 1622
	int plen = cfg->fc_dst_len;
	u8 tos = cfg->fc_tos;
1623 1624 1625
	struct fib_alias *fa, *fa_to_delete;
	struct list_head *fa_head;
	struct leaf *l;
O
Olof Johansson 已提交
1626 1627
	struct leaf_info *li;

1628
	if (plen > 32)
1629 1630
		return -EINVAL;

1631
	key = ntohl(cfg->fc_dst);
O
Olof Johansson 已提交
1632
	mask = ntohl(inet_make_mask(plen));
1633

1634
	if (key & ~mask)
1635 1636 1637 1638 1639
		return -EINVAL;

	key = key & mask;
	l = fib_find_node(t, key);

1640
	if (!l)
1641 1642
		return -ESRCH;

1643 1644 1645 1646 1647 1648
	li = find_leaf_info(l, plen);

	if (!li)
		return -ESRCH;

	fa_head = &li->falh;
1649 1650 1651 1652 1653
	fa = fib_find_alias(fa_head, tos, 0);

	if (!fa)
		return -ESRCH;

S
Stephen Hemminger 已提交
1654
	pr_debug("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t);
1655 1656

	fa_to_delete = NULL;
1657 1658
	fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
	list_for_each_entry_continue(fa, fa_head, fa_list) {
1659 1660 1661 1662 1663
		struct fib_info *fi = fa->fa_info;

		if (fa->fa_tos != tos)
			break;

1664 1665
		if ((!cfg->fc_type || fa->fa_type == cfg->fc_type) &&
		    (cfg->fc_scope == RT_SCOPE_NOWHERE ||
1666
		     fa->fa_info->fib_scope == cfg->fc_scope) &&
1667 1668
		    (!cfg->fc_prefsrc ||
		     fi->fib_prefsrc == cfg->fc_prefsrc) &&
1669 1670 1671
		    (!cfg->fc_protocol ||
		     fi->fib_protocol == cfg->fc_protocol) &&
		    fib_nh_match(cfg, fi) == 0) {
1672 1673 1674 1675 1676
			fa_to_delete = fa;
			break;
		}
	}

O
Olof Johansson 已提交
1677 1678
	if (!fa_to_delete)
		return -ESRCH;
1679

O
Olof Johansson 已提交
1680
	fa = fa_to_delete;
1681
	rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id,
1682
		  &cfg->fc_nlinfo, 0);
O
Olof Johansson 已提交
1683

R
Robert Olsson 已提交
1684
	list_del_rcu(&fa->fa_list);
1685

1686 1687 1688
	if (!plen)
		tb->tb_num_default--;

O
Olof Johansson 已提交
1689
	if (list_empty(fa_head)) {
R
Robert Olsson 已提交
1690
		hlist_del_rcu(&li->hlist);
O
Olof Johansson 已提交
1691
		free_leaf_info(li);
R
Robert Olsson 已提交
1692
	}
1693

O
Olof Johansson 已提交
1694
	if (hlist_empty(&l->list))
1695
		trie_leaf_remove(t, l);
1696

O
Olof Johansson 已提交
1697
	if (fa->fa_state & FA_S_ACCESSED)
1698
		rt_cache_flush(cfg->fc_nlinfo.nl_net);
1699

R
Robert Olsson 已提交
1700 1701
	fib_release_info(fa->fa_info);
	alias_free_mem_rcu(fa);
O
Olof Johansson 已提交
1702
	return 0;
1703 1704
}

1705
static int trie_flush_list(struct list_head *head)
1706 1707 1708 1709 1710 1711 1712
{
	struct fib_alias *fa, *fa_node;
	int found = 0;

	list_for_each_entry_safe(fa, fa_node, head, fa_list) {
		struct fib_info *fi = fa->fa_info;

R
Robert Olsson 已提交
1713 1714 1715 1716
		if (fi && (fi->fib_flags & RTNH_F_DEAD)) {
			list_del_rcu(&fa->fa_list);
			fib_release_info(fa->fa_info);
			alias_free_mem_rcu(fa);
1717 1718 1719 1720 1721 1722
			found++;
		}
	}
	return found;
}

1723
static int trie_flush_leaf(struct leaf *l)
1724 1725 1726
{
	int found = 0;
	struct hlist_head *lih = &l->list;
1727
	struct hlist_node *tmp;
1728 1729
	struct leaf_info *li = NULL;

1730
	hlist_for_each_entry_safe(li, tmp, lih, hlist) {
1731
		found += trie_flush_list(&li->falh);
1732 1733

		if (list_empty(&li->falh)) {
R
Robert Olsson 已提交
1734
			hlist_del_rcu(&li->hlist);
1735 1736 1737 1738 1739 1740
			free_leaf_info(li);
		}
	}
	return found;
}

1741 1742 1743 1744
/*
 * Scan for the next right leaf starting at node p->child[idx]
 * Since we have back pointer, no recursion necessary.
 */
1745
static struct leaf *leaf_walk_rcu(struct tnode *p, struct rt_trie_node *c)
1746
{
1747 1748
	do {
		t_key idx;
1749 1750

		if (c)
1751
			idx = tkey_extract_bits(c->key, p->pos, p->bits) + 1;
1752
		else
1753
			idx = 0;
R
Robert Olsson 已提交
1754

1755 1756
		while (idx < 1u << p->bits) {
			c = tnode_get_child_rcu(p, idx++);
R
Robert Olsson 已提交
1757
			if (!c)
O
Olof Johansson 已提交
1758 1759
				continue;

1760
			if (IS_LEAF(c))
1761 1762 1763 1764 1765
				return (struct leaf *) c;

			/* Rescan start scanning in new node */
			p = (struct tnode *) c;
			idx = 0;
1766
		}
1767 1768

		/* Node empty, walk back up to parent */
1769
		c = (struct rt_trie_node *) p;
E
Eric Dumazet 已提交
1770
	} while ((p = node_parent_rcu(c)) != NULL);
1771 1772 1773 1774 1775 1776

	return NULL; /* Root of trie */
}

static struct leaf *trie_firstleaf(struct trie *t)
{
E
Eric Dumazet 已提交
1777
	struct tnode *n = (struct tnode *)rcu_dereference_rtnl(t->trie);
1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789

	if (!n)
		return NULL;

	if (IS_LEAF(n))          /* trie is just a leaf */
		return (struct leaf *) n;

	return leaf_walk_rcu(n, NULL);
}

static struct leaf *trie_nextleaf(struct leaf *l)
{
1790
	struct rt_trie_node *c = (struct rt_trie_node *) l;
1791
	struct tnode *p = node_parent_rcu(c);
1792 1793 1794 1795 1796

	if (!p)
		return NULL;	/* trie with just one leaf */

	return leaf_walk_rcu(p, c);
1797 1798
}

1799 1800 1801 1802
static struct leaf *trie_leafindex(struct trie *t, int index)
{
	struct leaf *l = trie_firstleaf(t);

S
Stephen Hemminger 已提交
1803
	while (l && index-- > 0)
1804
		l = trie_nextleaf(l);
S
Stephen Hemminger 已提交
1805

1806 1807 1808 1809
	return l;
}


1810 1811 1812
/*
 * Caller must hold RTNL.
 */
1813
int fib_table_flush(struct fib_table *tb)
1814 1815
{
	struct trie *t = (struct trie *) tb->tb_data;
1816
	struct leaf *l, *ll = NULL;
1817
	int found = 0;
1818

1819
	for (l = trie_firstleaf(t); l; l = trie_nextleaf(l)) {
1820
		found += trie_flush_leaf(l);
1821 1822

		if (ll && hlist_empty(&ll->list))
1823
			trie_leaf_remove(t, ll);
1824 1825 1826 1827
		ll = l;
	}

	if (ll && hlist_empty(&ll->list))
1828
		trie_leaf_remove(t, ll);
1829

S
Stephen Hemminger 已提交
1830
	pr_debug("trie_flush found=%d\n", found);
1831 1832 1833
	return found;
}

1834 1835
void fib_free_table(struct fib_table *tb)
{
1836 1837 1838 1839 1840
#ifdef CONFIG_IP_FIB_TRIE_STATS
	struct trie *t = (struct trie *)tb->tb_data;

	free_percpu(t->stats);
#endif /* CONFIG_IP_FIB_TRIE_STATS */
1841 1842 1843
	kfree(tb);
}

1844 1845
static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah,
			   struct fib_table *tb,
1846 1847 1848 1849
			   struct sk_buff *skb, struct netlink_callback *cb)
{
	int i, s_i;
	struct fib_alias *fa;
A
Al Viro 已提交
1850
	__be32 xkey = htonl(key);
1851

1852
	s_i = cb->args[5];
1853 1854
	i = 0;

R
Robert Olsson 已提交
1855 1856 1857
	/* rcu_read_lock is hold by caller */

	list_for_each_entry_rcu(fa, fah, fa_list) {
1858 1859 1860 1861 1862
		if (i < s_i) {
			i++;
			continue;
		}

1863
		if (fib_dump_info(skb, NETLINK_CB(cb->skb).portid,
1864 1865 1866 1867
				  cb->nlh->nlmsg_seq,
				  RTM_NEWROUTE,
				  tb->tb_id,
				  fa->fa_type,
1868
				  xkey,
1869 1870
				  plen,
				  fa->fa_tos,
1871
				  fa->fa_info, NLM_F_MULTI) < 0) {
1872
			cb->args[5] = i;
1873
			return -1;
O
Olof Johansson 已提交
1874
		}
1875 1876
		i++;
	}
1877
	cb->args[5] = i;
1878 1879 1880
	return skb->len;
}

1881 1882
static int fn_trie_dump_leaf(struct leaf *l, struct fib_table *tb,
			struct sk_buff *skb, struct netlink_callback *cb)
1883
{
1884 1885
	struct leaf_info *li;
	int i, s_i;
1886

1887
	s_i = cb->args[4];
1888
	i = 0;
1889

1890
	/* rcu_read_lock is hold by caller */
1891
	hlist_for_each_entry_rcu(li, &l->list, hlist) {
1892 1893
		if (i < s_i) {
			i++;
1894
			continue;
1895
		}
O
Olof Johansson 已提交
1896

1897
		if (i > s_i)
1898
			cb->args[5] = 0;
1899

1900
		if (list_empty(&li->falh))
1901 1902
			continue;

1903
		if (fn_trie_dump_fa(l->key, li->plen, &li->falh, tb, skb, cb) < 0) {
1904
			cb->args[4] = i;
1905 1906
			return -1;
		}
1907
		i++;
1908
	}
1909

1910
	cb->args[4] = i;
1911 1912 1913
	return skb->len;
}

1914 1915
int fib_table_dump(struct fib_table *tb, struct sk_buff *skb,
		   struct netlink_callback *cb)
1916
{
1917
	struct leaf *l;
1918
	struct trie *t = (struct trie *) tb->tb_data;
1919
	t_key key = cb->args[2];
1920
	int count = cb->args[3];
1921

R
Robert Olsson 已提交
1922
	rcu_read_lock();
1923 1924 1925
	/* Dump starting at last key.
	 * Note: 0.0.0.0/0 (ie default) is first key.
	 */
1926
	if (count == 0)
1927 1928
		l = trie_firstleaf(t);
	else {
1929 1930 1931
		/* Normally, continue from last key, but if that is missing
		 * fallback to using slow rescan
		 */
1932
		l = fib_find_node(t, key);
1933 1934
		if (!l)
			l = trie_leafindex(t, count);
1935
	}
1936

1937 1938
	while (l) {
		cb->args[2] = l->key;
1939
		if (fn_trie_dump_leaf(l, tb, skb, cb) < 0) {
1940
			cb->args[3] = count;
1941 1942
			rcu_read_unlock();
			return -1;
1943
		}
1944

1945
		++count;
1946
		l = trie_nextleaf(l);
1947 1948
		memset(&cb->args[4], 0,
		       sizeof(cb->args) - 4*sizeof(cb->args[0]));
1949
	}
1950
	cb->args[3] = count;
R
Robert Olsson 已提交
1951
	rcu_read_unlock();
1952

1953 1954 1955
	return skb->len;
}

1956
void __init fib_trie_init(void)
1957
{
1958 1959
	fn_alias_kmem = kmem_cache_create("ip_fib_alias",
					  sizeof(struct fib_alias),
1960 1961 1962 1963 1964 1965
					  0, SLAB_PANIC, NULL);

	trie_leaf_kmem = kmem_cache_create("ip_fib_trie",
					   max(sizeof(struct leaf),
					       sizeof(struct leaf_info)),
					   0, SLAB_PANIC, NULL);
1966
}
1967

1968

1969
struct fib_table *fib_trie_table(u32 id)
1970 1971 1972 1973 1974 1975 1976 1977 1978 1979
{
	struct fib_table *tb;
	struct trie *t;

	tb = kmalloc(sizeof(struct fib_table) + sizeof(struct trie),
		     GFP_KERNEL);
	if (tb == NULL)
		return NULL;

	tb->tb_id = id;
1980
	tb->tb_default = -1;
1981
	tb->tb_num_default = 0;
1982 1983

	t = (struct trie *) tb->tb_data;
1984 1985 1986 1987 1988 1989 1990 1991
	RCU_INIT_POINTER(t->trie, NULL);
#ifdef CONFIG_IP_FIB_TRIE_STATS
	t->stats = alloc_percpu(struct trie_use_stats);
	if (!t->stats) {
		kfree(tb);
		tb = NULL;
	}
#endif
1992 1993 1994 1995

	return tb;
}

1996 1997 1998
#ifdef CONFIG_PROC_FS
/* Depth first Trie walk iterator */
struct fib_trie_iter {
1999
	struct seq_net_private p;
2000
	struct fib_table *tb;
2001
	struct tnode *tnode;
E
Eric Dumazet 已提交
2002 2003
	unsigned int index;
	unsigned int depth;
2004
};
2005

2006
static struct rt_trie_node *fib_trie_get_next(struct fib_trie_iter *iter)
2007
{
2008
	struct tnode *tn = iter->tnode;
E
Eric Dumazet 已提交
2009
	unsigned int cindex = iter->index;
2010
	struct tnode *p;
2011

2012 2013 2014 2015
	/* A single entry routing table */
	if (!tn)
		return NULL;

2016 2017 2018 2019
	pr_debug("get_next iter={node=%p index=%d depth=%d}\n",
		 iter->tnode, iter->index, iter->depth);
rescan:
	while (cindex < (1<<tn->bits)) {
2020
		struct rt_trie_node *n = tnode_get_child_rcu(tn, cindex);
2021

2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033
		if (n) {
			if (IS_LEAF(n)) {
				iter->tnode = tn;
				iter->index = cindex + 1;
			} else {
				/* push down one level */
				iter->tnode = (struct tnode *) n;
				iter->index = 0;
				++iter->depth;
			}
			return n;
		}
2034

2035 2036
		++cindex;
	}
O
Olof Johansson 已提交
2037

2038
	/* Current node exhausted, pop back up */
2039
	p = node_parent_rcu((struct rt_trie_node *)tn);
2040 2041 2042 2043 2044
	if (p) {
		cindex = tkey_extract_bits(tn->key, p->pos, p->bits)+1;
		tn = p;
		--iter->depth;
		goto rescan;
2045
	}
2046 2047 2048

	/* got root? */
	return NULL;
2049 2050
}

2051
static struct rt_trie_node *fib_trie_get_first(struct fib_trie_iter *iter,
2052
				       struct trie *t)
2053
{
2054
	struct rt_trie_node *n;
2055

S
Stephen Hemminger 已提交
2056
	if (!t)
2057 2058 2059
		return NULL;

	n = rcu_dereference(t->trie);
2060
	if (!n)
2061
		return NULL;
2062

2063 2064 2065 2066 2067 2068 2069 2070
	if (IS_TNODE(n)) {
		iter->tnode = (struct tnode *) n;
		iter->index = 0;
		iter->depth = 1;
	} else {
		iter->tnode = NULL;
		iter->index = 0;
		iter->depth = 0;
O
Olof Johansson 已提交
2071
	}
2072 2073

	return n;
2074
}
O
Olof Johansson 已提交
2075

2076 2077
static void trie_collect_stats(struct trie *t, struct trie_stat *s)
{
2078
	struct rt_trie_node *n;
2079
	struct fib_trie_iter iter;
O
Olof Johansson 已提交
2080

2081
	memset(s, 0, sizeof(*s));
O
Olof Johansson 已提交
2082

2083
	rcu_read_lock();
2084
	for (n = fib_trie_get_first(&iter, t); n; n = fib_trie_get_next(&iter)) {
2085
		if (IS_LEAF(n)) {
2086 2087 2088
			struct leaf *l = (struct leaf *)n;
			struct leaf_info *li;

2089 2090 2091 2092
			s->leaves++;
			s->totdepth += iter.depth;
			if (iter.depth > s->maxdepth)
				s->maxdepth = iter.depth;
2093

2094
			hlist_for_each_entry_rcu(li, &l->list, hlist)
2095
				++s->prefixes;
2096 2097 2098 2099 2100
		} else {
			const struct tnode *tn = (const struct tnode *) n;
			int i;

			s->tnodes++;
S
Stephen Hemminger 已提交
2101
			if (tn->bits < MAX_STAT_DEPTH)
R
Robert Olsson 已提交
2102 2103
				s->nodesizes[tn->bits]++;

2104 2105 2106
			for (i = 0; i < (1<<tn->bits); i++)
				if (!tn->child[i])
					s->nullpointers++;
2107 2108
		}
	}
R
Robert Olsson 已提交
2109
	rcu_read_unlock();
2110 2111
}

2112 2113 2114 2115
/*
 *	This outputs /proc/net/fib_triestats
 */
static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat)
2116
{
E
Eric Dumazet 已提交
2117
	unsigned int i, max, pointers, bytes, avdepth;
2118

2119 2120 2121 2122
	if (stat->leaves)
		avdepth = stat->totdepth*100 / stat->leaves;
	else
		avdepth = 0;
O
Olof Johansson 已提交
2123

2124 2125
	seq_printf(seq, "\tAver depth:     %u.%02d\n",
		   avdepth / 100, avdepth % 100);
2126
	seq_printf(seq, "\tMax depth:      %u\n", stat->maxdepth);
O
Olof Johansson 已提交
2127

2128 2129
	seq_printf(seq, "\tLeaves:         %u\n", stat->leaves);
	bytes = sizeof(struct leaf) * stat->leaves;
2130 2131 2132 2133

	seq_printf(seq, "\tPrefixes:       %u\n", stat->prefixes);
	bytes += sizeof(struct leaf_info) * stat->prefixes;

2134
	seq_printf(seq, "\tInternal nodes: %u\n\t", stat->tnodes);
2135
	bytes += sizeof(struct tnode) * stat->tnodes;
2136

R
Robert Olsson 已提交
2137 2138
	max = MAX_STAT_DEPTH;
	while (max > 0 && stat->nodesizes[max-1] == 0)
2139
		max--;
2140

2141
	pointers = 0;
2142
	for (i = 1; i < max; i++)
2143
		if (stat->nodesizes[i] != 0) {
2144
			seq_printf(seq, "  %u: %u",  i, stat->nodesizes[i]);
2145 2146 2147
			pointers += (1<<i) * stat->nodesizes[i];
		}
	seq_putc(seq, '\n');
2148
	seq_printf(seq, "\tPointers: %u\n", pointers);
R
Robert Olsson 已提交
2149

2150
	bytes += sizeof(struct rt_trie_node *) * pointers;
2151 2152
	seq_printf(seq, "Null ptrs: %u\n", stat->nullpointers);
	seq_printf(seq, "Total size: %u  kB\n", (bytes + 1023) / 1024);
2153
}
R
Robert Olsson 已提交
2154

2155
#ifdef CONFIG_IP_FIB_TRIE_STATS
2156
static void trie_show_usage(struct seq_file *seq,
2157
			    const struct trie_use_stats __percpu *stats)
2158
{
2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173
	struct trie_use_stats s = { 0 };
	int cpu;

	/* loop through all of the CPUs and gather up the stats */
	for_each_possible_cpu(cpu) {
		const struct trie_use_stats *pcpu = per_cpu_ptr(stats, cpu);

		s.gets += pcpu->gets;
		s.backtrack += pcpu->backtrack;
		s.semantic_match_passed += pcpu->semantic_match_passed;
		s.semantic_match_miss += pcpu->semantic_match_miss;
		s.null_node_hit += pcpu->null_node_hit;
		s.resize_node_skipped += pcpu->resize_node_skipped;
	}

2174
	seq_printf(seq, "\nCounters:\n---------\n");
2175 2176
	seq_printf(seq, "gets = %u\n", s.gets);
	seq_printf(seq, "backtracks = %u\n", s.backtrack);
2177
	seq_printf(seq, "semantic match passed = %u\n",
2178 2179 2180 2181
		   s.semantic_match_passed);
	seq_printf(seq, "semantic match miss = %u\n", s.semantic_match_miss);
	seq_printf(seq, "null node hit= %u\n", s.null_node_hit);
	seq_printf(seq, "skipped node resize = %u\n\n", s.resize_node_skipped);
2182
}
2183 2184
#endif /*  CONFIG_IP_FIB_TRIE_STATS */

2185
static void fib_table_print(struct seq_file *seq, struct fib_table *tb)
2186
{
2187 2188 2189 2190 2191 2192
	if (tb->tb_id == RT_TABLE_LOCAL)
		seq_puts(seq, "Local:\n");
	else if (tb->tb_id == RT_TABLE_MAIN)
		seq_puts(seq, "Main:\n");
	else
		seq_printf(seq, "Id %d:\n", tb->tb_id);
2193
}
2194

2195

2196 2197
static int fib_triestat_seq_show(struct seq_file *seq, void *v)
{
2198
	struct net *net = (struct net *)seq->private;
2199
	unsigned int h;
2200

2201
	seq_printf(seq,
2202 2203
		   "Basic info: size of leaf:"
		   " %Zd bytes, size of tnode: %Zd bytes.\n",
2204 2205
		   sizeof(struct leaf), sizeof(struct tnode));

2206 2207 2208 2209
	for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
		struct hlist_head *head = &net->ipv4.fib_table_hash[h];
		struct fib_table *tb;

2210
		hlist_for_each_entry_rcu(tb, head, tb_hlist) {
2211 2212
			struct trie *t = (struct trie *) tb->tb_data;
			struct trie_stat stat;
2213

2214 2215 2216 2217 2218 2219 2220 2221
			if (!t)
				continue;

			fib_table_print(seq, tb);

			trie_collect_stats(t, &stat);
			trie_show_stats(seq, &stat);
#ifdef CONFIG_IP_FIB_TRIE_STATS
2222
			trie_show_usage(seq, t->stats);
2223 2224 2225
#endif
		}
	}
2226

2227
	return 0;
2228 2229
}

2230
static int fib_triestat_seq_open(struct inode *inode, struct file *file)
2231
{
2232
	return single_open_net(inode, file, fib_triestat_seq_show);
2233 2234
}

2235
static const struct file_operations fib_triestat_fops = {
2236 2237 2238 2239
	.owner	= THIS_MODULE,
	.open	= fib_triestat_seq_open,
	.read	= seq_read,
	.llseek	= seq_lseek,
2240
	.release = single_release_net,
2241 2242
};

2243
static struct rt_trie_node *fib_trie_get_idx(struct seq_file *seq, loff_t pos)
2244
{
2245 2246
	struct fib_trie_iter *iter = seq->private;
	struct net *net = seq_file_net(seq);
2247
	loff_t idx = 0;
2248
	unsigned int h;
2249

2250 2251 2252
	for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
		struct hlist_head *head = &net->ipv4.fib_table_hash[h];
		struct fib_table *tb;
2253

2254
		hlist_for_each_entry_rcu(tb, head, tb_hlist) {
2255
			struct rt_trie_node *n;
2256 2257 2258 2259 2260 2261 2262 2263 2264

			for (n = fib_trie_get_first(iter,
						    (struct trie *) tb->tb_data);
			     n; n = fib_trie_get_next(iter))
				if (pos == idx++) {
					iter->tb = tb;
					return n;
				}
		}
2265
	}
2266

2267 2268 2269
	return NULL;
}

2270
static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos)
2271
	__acquires(RCU)
2272
{
2273
	rcu_read_lock();
2274
	return fib_trie_get_idx(seq, *pos);
2275 2276
}

2277
static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2278
{
2279
	struct fib_trie_iter *iter = seq->private;
2280
	struct net *net = seq_file_net(seq);
2281 2282 2283
	struct fib_table *tb = iter->tb;
	struct hlist_node *tb_node;
	unsigned int h;
2284
	struct rt_trie_node *n;
2285

2286
	++*pos;
2287 2288 2289 2290
	/* next node in same table */
	n = fib_trie_get_next(iter);
	if (n)
		return n;
2291

2292 2293
	/* walk rest of this hash chain */
	h = tb->tb_id & (FIB_TABLE_HASHSZ - 1);
E
Eric Dumazet 已提交
2294
	while ((tb_node = rcu_dereference(hlist_next_rcu(&tb->tb_hlist)))) {
2295 2296 2297 2298 2299
		tb = hlist_entry(tb_node, struct fib_table, tb_hlist);
		n = fib_trie_get_first(iter, (struct trie *) tb->tb_data);
		if (n)
			goto found;
	}
2300

2301 2302 2303
	/* new hash chain */
	while (++h < FIB_TABLE_HASHSZ) {
		struct hlist_head *head = &net->ipv4.fib_table_hash[h];
2304
		hlist_for_each_entry_rcu(tb, head, tb_hlist) {
2305 2306 2307 2308 2309
			n = fib_trie_get_first(iter, (struct trie *) tb->tb_data);
			if (n)
				goto found;
		}
	}
2310
	return NULL;
2311 2312 2313 2314

found:
	iter->tb = tb;
	return n;
2315
}
2316

2317
static void fib_trie_seq_stop(struct seq_file *seq, void *v)
2318
	__releases(RCU)
2319
{
2320 2321
	rcu_read_unlock();
}
O
Olof Johansson 已提交
2322

2323 2324
static void seq_indent(struct seq_file *seq, int n)
{
E
Eric Dumazet 已提交
2325 2326
	while (n-- > 0)
		seq_puts(seq, "   ");
2327
}
2328

2329
static inline const char *rtn_scope(char *buf, size_t len, enum rt_scope_t s)
2330
{
S
Stephen Hemminger 已提交
2331
	switch (s) {
2332 2333 2334 2335 2336 2337
	case RT_SCOPE_UNIVERSE: return "universe";
	case RT_SCOPE_SITE:	return "site";
	case RT_SCOPE_LINK:	return "link";
	case RT_SCOPE_HOST:	return "host";
	case RT_SCOPE_NOWHERE:	return "nowhere";
	default:
2338
		snprintf(buf, len, "scope=%d", s);
2339 2340 2341
		return buf;
	}
}
2342

2343
static const char *const rtn_type_names[__RTN_MAX] = {
2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356
	[RTN_UNSPEC] = "UNSPEC",
	[RTN_UNICAST] = "UNICAST",
	[RTN_LOCAL] = "LOCAL",
	[RTN_BROADCAST] = "BROADCAST",
	[RTN_ANYCAST] = "ANYCAST",
	[RTN_MULTICAST] = "MULTICAST",
	[RTN_BLACKHOLE] = "BLACKHOLE",
	[RTN_UNREACHABLE] = "UNREACHABLE",
	[RTN_PROHIBIT] = "PROHIBIT",
	[RTN_THROW] = "THROW",
	[RTN_NAT] = "NAT",
	[RTN_XRESOLVE] = "XRESOLVE",
};
2357

E
Eric Dumazet 已提交
2358
static inline const char *rtn_type(char *buf, size_t len, unsigned int t)
2359 2360 2361
{
	if (t < __RTN_MAX && rtn_type_names[t])
		return rtn_type_names[t];
2362
	snprintf(buf, len, "type %u", t);
2363
	return buf;
2364 2365
}

2366 2367
/* Pretty print the trie */
static int fib_trie_seq_show(struct seq_file *seq, void *v)
2368
{
2369
	const struct fib_trie_iter *iter = seq->private;
2370
	struct rt_trie_node *n = v;
2371

2372 2373
	if (!node_parent_rcu(n))
		fib_table_print(seq, iter->tb);
2374

2375 2376
	if (IS_TNODE(n)) {
		struct tnode *tn = (struct tnode *) n;
S
Stephen Hemminger 已提交
2377
		__be32 prf = htonl(mask_pfx(tn->key, tn->pos));
O
Olof Johansson 已提交
2378

2379
		seq_indent(seq, iter->depth-1);
2380 2381
		seq_printf(seq, "  +-- %pI4/%d %d %d %d\n",
			   &prf, tn->pos, tn->bits, tn->full_children,
2382
			   tn->empty_children);
2383

2384 2385
	} else {
		struct leaf *l = (struct leaf *) n;
2386
		struct leaf_info *li;
A
Al Viro 已提交
2387
		__be32 val = htonl(l->key);
2388 2389

		seq_indent(seq, iter->depth);
2390
		seq_printf(seq, "  |-- %pI4\n", &val);
2391

2392
		hlist_for_each_entry_rcu(li, &l->list, hlist) {
2393 2394 2395 2396 2397 2398 2399 2400
			struct fib_alias *fa;

			list_for_each_entry_rcu(fa, &li->falh, fa_list) {
				char buf1[32], buf2[32];

				seq_indent(seq, iter->depth+1);
				seq_printf(seq, "  /%d %s %s", li->plen,
					   rtn_scope(buf1, sizeof(buf1),
2401
						     fa->fa_info->fib_scope),
2402 2403 2404
					   rtn_type(buf2, sizeof(buf2),
						    fa->fa_type));
				if (fa->fa_tos)
2405
					seq_printf(seq, " tos=%d", fa->fa_tos);
2406
				seq_putc(seq, '\n');
2407 2408
			}
		}
2409
	}
2410

2411 2412 2413
	return 0;
}

2414
static const struct seq_operations fib_trie_seq_ops = {
2415 2416 2417 2418
	.start  = fib_trie_seq_start,
	.next   = fib_trie_seq_next,
	.stop   = fib_trie_seq_stop,
	.show   = fib_trie_seq_show,
2419 2420
};

2421
static int fib_trie_seq_open(struct inode *inode, struct file *file)
2422
{
2423 2424
	return seq_open_net(inode, file, &fib_trie_seq_ops,
			    sizeof(struct fib_trie_iter));
2425 2426
}

2427
static const struct file_operations fib_trie_fops = {
2428 2429 2430 2431
	.owner  = THIS_MODULE,
	.open   = fib_trie_seq_open,
	.read   = seq_read,
	.llseek = seq_lseek,
2432
	.release = seq_release_net,
2433 2434
};

2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474
struct fib_route_iter {
	struct seq_net_private p;
	struct trie *main_trie;
	loff_t	pos;
	t_key	key;
};

static struct leaf *fib_route_get_idx(struct fib_route_iter *iter, loff_t pos)
{
	struct leaf *l = NULL;
	struct trie *t = iter->main_trie;

	/* use cache location of last found key */
	if (iter->pos > 0 && pos >= iter->pos && (l = fib_find_node(t, iter->key)))
		pos -= iter->pos;
	else {
		iter->pos = 0;
		l = trie_firstleaf(t);
	}

	while (l && pos-- > 0) {
		iter->pos++;
		l = trie_nextleaf(l);
	}

	if (l)
		iter->key = pos;	/* remember it */
	else
		iter->pos = 0;		/* forget it */

	return l;
}

static void *fib_route_seq_start(struct seq_file *seq, loff_t *pos)
	__acquires(RCU)
{
	struct fib_route_iter *iter = seq->private;
	struct fib_table *tb;

	rcu_read_lock();
2475
	tb = fib_get_table(seq_file_net(seq), RT_TABLE_MAIN);
2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512
	if (!tb)
		return NULL;

	iter->main_trie = (struct trie *) tb->tb_data;
	if (*pos == 0)
		return SEQ_START_TOKEN;
	else
		return fib_route_get_idx(iter, *pos - 1);
}

static void *fib_route_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct fib_route_iter *iter = seq->private;
	struct leaf *l = v;

	++*pos;
	if (v == SEQ_START_TOKEN) {
		iter->pos = 0;
		l = trie_firstleaf(iter->main_trie);
	} else {
		iter->pos++;
		l = trie_nextleaf(l);
	}

	if (l)
		iter->key = l->key;
	else
		iter->pos = 0;
	return l;
}

static void fib_route_seq_stop(struct seq_file *seq, void *v)
	__releases(RCU)
{
	rcu_read_unlock();
}

E
Eric Dumazet 已提交
2513
static unsigned int fib_flag_trans(int type, __be32 mask, const struct fib_info *fi)
2514
{
E
Eric Dumazet 已提交
2515
	unsigned int flags = 0;
2516

E
Eric Dumazet 已提交
2517 2518
	if (type == RTN_UNREACHABLE || type == RTN_PROHIBIT)
		flags = RTF_REJECT;
2519 2520
	if (fi && fi->fib_nh->nh_gw)
		flags |= RTF_GATEWAY;
A
Al Viro 已提交
2521
	if (mask == htonl(0xFFFFFFFF))
2522 2523 2524
		flags |= RTF_HOST;
	flags |= RTF_UP;
	return flags;
2525 2526
}

2527 2528 2529
/*
 *	This outputs /proc/net/route.
 *	The format of the file is not supposed to be changed
E
Eric Dumazet 已提交
2530
 *	and needs to be same as fib_hash output to avoid breaking
2531 2532 2533
 *	legacy utilities
 */
static int fib_route_seq_show(struct seq_file *seq, void *v)
2534
{
2535
	struct leaf *l = v;
2536
	struct leaf_info *li;
2537

2538 2539 2540 2541 2542 2543
	if (v == SEQ_START_TOKEN) {
		seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway "
			   "\tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU"
			   "\tWindow\tIRTT");
		return 0;
	}
2544

2545
	hlist_for_each_entry_rcu(li, &l->list, hlist) {
2546
		struct fib_alias *fa;
A
Al Viro 已提交
2547
		__be32 mask, prefix;
O
Olof Johansson 已提交
2548

2549 2550
		mask = inet_make_mask(li->plen);
		prefix = htonl(l->key);
2551

2552
		list_for_each_entry_rcu(fa, &li->falh, fa_list) {
2553
			const struct fib_info *fi = fa->fa_info;
E
Eric Dumazet 已提交
2554
			unsigned int flags = fib_flag_trans(fa->fa_type, mask, fi);
2555

2556 2557 2558
			if (fa->fa_type == RTN_BROADCAST
			    || fa->fa_type == RTN_MULTICAST)
				continue;
2559

2560 2561
			seq_setwidth(seq, 127);

2562
			if (fi)
2563 2564
				seq_printf(seq,
					 "%s\t%08X\t%08X\t%04X\t%d\t%u\t"
2565
					 "%d\t%08X\t%d\t%u\t%u",
2566 2567 2568 2569 2570
					 fi->fib_dev ? fi->fib_dev->name : "*",
					 prefix,
					 fi->fib_nh->nh_gw, flags, 0, 0,
					 fi->fib_priority,
					 mask,
2571 2572
					 (fi->fib_advmss ?
					  fi->fib_advmss + 40 : 0),
2573
					 fi->fib_window,
2574
					 fi->fib_rtt >> 3);
2575
			else
2576 2577
				seq_printf(seq,
					 "*\t%08X\t%08X\t%04X\t%d\t%u\t"
2578
					 "%d\t%08X\t%d\t%u\t%u",
2579
					 prefix, 0, flags, 0, 0, 0,
2580
					 mask, 0, 0, 0);
2581

2582
			seq_pad(seq, '\n');
2583
		}
2584 2585 2586 2587 2588
	}

	return 0;
}

2589
static const struct seq_operations fib_route_seq_ops = {
2590 2591 2592
	.start  = fib_route_seq_start,
	.next   = fib_route_seq_next,
	.stop   = fib_route_seq_stop,
2593
	.show   = fib_route_seq_show,
2594 2595
};

2596
static int fib_route_seq_open(struct inode *inode, struct file *file)
2597
{
2598
	return seq_open_net(inode, file, &fib_route_seq_ops,
2599
			    sizeof(struct fib_route_iter));
2600 2601
}

2602
static const struct file_operations fib_route_fops = {
2603 2604 2605 2606
	.owner  = THIS_MODULE,
	.open   = fib_route_seq_open,
	.read   = seq_read,
	.llseek = seq_lseek,
2607
	.release = seq_release_net,
2608 2609
};

2610
int __net_init fib_proc_init(struct net *net)
2611
{
2612
	if (!proc_create("fib_trie", S_IRUGO, net->proc_net, &fib_trie_fops))
2613 2614
		goto out1;

2615 2616
	if (!proc_create("fib_triestat", S_IRUGO, net->proc_net,
			 &fib_triestat_fops))
2617 2618
		goto out2;

2619
	if (!proc_create("route", S_IRUGO, net->proc_net, &fib_route_fops))
2620 2621
		goto out3;

2622
	return 0;
2623 2624

out3:
2625
	remove_proc_entry("fib_triestat", net->proc_net);
2626
out2:
2627
	remove_proc_entry("fib_trie", net->proc_net);
2628 2629
out1:
	return -ENOMEM;
2630 2631
}

2632
void __net_exit fib_proc_exit(struct net *net)
2633
{
2634 2635 2636
	remove_proc_entry("fib_trie", net->proc_net);
	remove_proc_entry("fib_triestat", net->proc_net);
	remove_proc_entry("route", net->proc_net);
2637 2638 2639
}

#endif /* CONFIG_PROC_FS */