fib_trie.c 61.5 KB
Newer Older
1 2 3 4 5 6 7 8 9
/*
 *   This program is free software; you can redistribute it and/or
 *   modify it under the terms of the GNU General Public License
 *   as published by the Free Software Foundation; either version
 *   2 of the License, or (at your option) any later version.
 *
 *   Robert Olsson <robert.olsson@its.uu.se> Uppsala Universitet
 *     & Swedish University of Agricultural Sciences.
 *
10
 *   Jens Laas <jens.laas@data.slu.se> Swedish University of
11
 *     Agricultural Sciences.
12
 *
13 14
 *   Hans Liss <hans.liss@its.uu.se>  Uppsala Universitet
 *
L
Lucas De Marchi 已提交
15
 * This work is based on the LPC-trie which is originally described in:
16
 *
17 18
 * An experimental study of compression methods for dynamic tries
 * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
19
 * http://www.csc.kth.se/~snilsson/software/dyntrie2/
20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41
 *
 *
 * IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
 * IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999
 *
 *
 * Code from fib_hash has been reused which includes the following header:
 *
 *
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		IPv4 FIB: lookup engine and maintenance routines.
 *
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
R
Robert Olsson 已提交
42 43 44 45 46 47 48
 *
 * Substantial contributions to this work comes from:
 *
 *		David S. Miller, <davem@davemloft.net>
 *		Stephen Hemminger <shemminger@osdl.org>
 *		Paul E. McKenney <paulmck@us.ibm.com>
 *		Patrick McHardy <kaber@trash.net>
49 50
 */

J
Jens Låås 已提交
51
#define VERSION "0.409"
52 53 54

#include <asm/uaccess.h>
#include <asm/system.h>
J
Jiri Slaby 已提交
55
#include <linux/bitops.h>
56 57 58 59 60 61 62 63 64
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
S
Stephen Hemminger 已提交
65
#include <linux/inetdevice.h>
66 67 68
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/proc_fs.h>
R
Robert Olsson 已提交
69
#include <linux/rcupdate.h>
70 71 72 73
#include <linux/skbuff.h>
#include <linux/netlink.h>
#include <linux/init.h>
#include <linux/list.h>
74
#include <linux/slab.h>
75
#include <net/net_namespace.h>
76 77 78 79 80 81 82 83
#include <net/ip.h>
#include <net/protocol.h>
#include <net/route.h>
#include <net/tcp.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include "fib_lookup.h"

R
Robert Olsson 已提交
84
#define MAX_STAT_DEPTH 32
85 86 87 88 89 90 91 92

#define KEYLENGTH (8*sizeof(t_key))

typedef unsigned int t_key;

#define T_TNODE 0
#define T_LEAF  1
#define NODE_TYPE_MASK	0x1UL
R
Robert Olsson 已提交
93 94
#define NODE_TYPE(node) ((node)->parent & NODE_TYPE_MASK)

O
Olof Johansson 已提交
95 96
#define IS_TNODE(n) (!(n->parent & T_LEAF))
#define IS_LEAF(n) (n->parent & T_LEAF)
97

98
struct rt_trie_node {
O
Olof Johansson 已提交
99
	unsigned long parent;
100
	t_key key;
101 102 103
};

struct leaf {
O
Olof Johansson 已提交
104
	unsigned long parent;
105
	t_key key;
106
	struct hlist_head list;
R
Robert Olsson 已提交
107
	struct rcu_head rcu;
108 109 110 111
};

struct leaf_info {
	struct hlist_node hlist;
R
Robert Olsson 已提交
112
	struct rcu_head rcu;
113 114 115 116 117
	int plen;
	struct list_head falh;
};

struct tnode {
O
Olof Johansson 已提交
118
	unsigned long parent;
119
	t_key key;
120 121
	unsigned char pos;		/* 2log(KEYLENGTH) bits needed */
	unsigned char bits;		/* 2log(KEYLENGTH) bits needed */
122 123
	unsigned int full_children;	/* KEYLENGTH bits needed */
	unsigned int empty_children;	/* KEYLENGTH bits needed */
124 125 126
	union {
		struct rcu_head rcu;
		struct work_struct work;
J
Jarek Poplawski 已提交
127
		struct tnode *tnode_free;
128
	};
E
Eric Dumazet 已提交
129
	struct rt_trie_node __rcu *child[0];
130 131 132 133 134 135 136 137 138
};

#ifdef CONFIG_IP_FIB_TRIE_STATS
struct trie_use_stats {
	unsigned int gets;
	unsigned int backtrack;
	unsigned int semantic_match_passed;
	unsigned int semantic_match_miss;
	unsigned int null_node_hit;
139
	unsigned int resize_node_skipped;
140 141 142 143 144 145 146 147 148
};
#endif

struct trie_stat {
	unsigned int totdepth;
	unsigned int maxdepth;
	unsigned int tnodes;
	unsigned int leaves;
	unsigned int nullpointers;
149
	unsigned int prefixes;
R
Robert Olsson 已提交
150
	unsigned int nodesizes[MAX_STAT_DEPTH];
151
};
152 153

struct trie {
E
Eric Dumazet 已提交
154
	struct rt_trie_node __rcu *trie;
155 156 157 158 159
#ifdef CONFIG_IP_FIB_TRIE_STATS
	struct trie_use_stats stats;
#endif
};

160 161
static void put_child(struct trie *t, struct tnode *tn, int i, struct rt_trie_node *n);
static void tnode_put_child_reorg(struct tnode *tn, int i, struct rt_trie_node *n,
162
				  int wasfull);
163
static struct rt_trie_node *resize(struct trie *t, struct tnode *tn);
164 165
static struct tnode *inflate(struct trie *t, struct tnode *tn);
static struct tnode *halve(struct trie *t, struct tnode *tn);
J
Jarek Poplawski 已提交
166 167
/* tnodes to free after resize(); protected by RTNL */
static struct tnode *tnode_free_head;
168 169 170 171 172 173 174 175
static size_t tnode_free_size;

/*
 * synchronize_rcu after call_rcu for that many pages; it should be especially
 * useful before resizing the root node with PREEMPT_NONE configs; the value was
 * obtained experimentally, aiming to avoid visible slowdown.
 */
static const int sync_pages = 128;
176

177
static struct kmem_cache *fn_alias_kmem __read_mostly;
178
static struct kmem_cache *trie_leaf_kmem __read_mostly;
179

E
Eric Dumazet 已提交
180 181 182 183
/*
 * caller must hold RTNL
 */
static inline struct tnode *node_parent(const struct rt_trie_node *node)
S
Stephen Hemminger 已提交
184
{
E
Eric Dumazet 已提交
185 186 187 188 189
	unsigned long parent;

	parent = rcu_dereference_index_check(node->parent, lockdep_rtnl_is_held());

	return (struct tnode *)(parent & ~NODE_TYPE_MASK);
190 191
}

E
Eric Dumazet 已提交
192 193 194 195
/*
 * caller must hold RCU read lock or RTNL
 */
static inline struct tnode *node_parent_rcu(const struct rt_trie_node *node)
196
{
E
Eric Dumazet 已提交
197 198 199 200
	unsigned long parent;

	parent = rcu_dereference_index_check(node->parent, rcu_read_lock_held() ||
							   lockdep_rtnl_is_held());
S
Stephen Hemminger 已提交
201

E
Eric Dumazet 已提交
202
	return (struct tnode *)(parent & ~NODE_TYPE_MASK);
S
Stephen Hemminger 已提交
203 204
}

205 206 207
/* Same as rcu_assign_pointer
 * but that macro() assumes that value is a pointer.
 */
208
static inline void node_set_parent(struct rt_trie_node *node, struct tnode *ptr)
S
Stephen Hemminger 已提交
209
{
210 211
	smp_wmb();
	node->parent = (unsigned long)ptr | NODE_TYPE(node);
S
Stephen Hemminger 已提交
212
}
R
Robert Olsson 已提交
213

E
Eric Dumazet 已提交
214 215 216 217
/*
 * caller must hold RTNL
 */
static inline struct rt_trie_node *tnode_get_child(const struct tnode *tn, unsigned int i)
218 219
{
	BUG_ON(i >= 1U << tn->bits);
R
Robert Olsson 已提交
220

E
Eric Dumazet 已提交
221
	return rtnl_dereference(tn->child[i]);
222 223
}

E
Eric Dumazet 已提交
224 225 226 227
/*
 * caller must hold RCU read lock or RTNL
 */
static inline struct rt_trie_node *tnode_get_child_rcu(const struct tnode *tn, unsigned int i)
228
{
E
Eric Dumazet 已提交
229
	BUG_ON(i >= 1U << tn->bits);
230

E
Eric Dumazet 已提交
231
	return rcu_dereference_rtnl(tn->child[i]);
232 233
}

S
Stephen Hemmigner 已提交
234
static inline int tnode_child_length(const struct tnode *tn)
235
{
O
Olof Johansson 已提交
236
	return 1 << tn->bits;
237 238
}

239
static inline t_key mask_pfx(t_key k, unsigned int l)
S
Stephen Hemminger 已提交
240 241 242 243
{
	return (l == 0) ? 0 : k >> (KEYLENGTH-l) << (KEYLENGTH-l);
}

244
static inline t_key tkey_extract_bits(t_key a, unsigned int offset, unsigned int bits)
245
{
O
Olof Johansson 已提交
246
	if (offset < KEYLENGTH)
247
		return ((t_key)(a << offset)) >> (KEYLENGTH - bits);
O
Olof Johansson 已提交
248
	else
249 250 251 252 253
		return 0;
}

static inline int tkey_equals(t_key a, t_key b)
{
254
	return a == b;
255 256 257 258
}

static inline int tkey_sub_equals(t_key a, int offset, int bits, t_key b)
{
259 260
	if (bits == 0 || offset >= KEYLENGTH)
		return 1;
O
Olof Johansson 已提交
261 262
	bits = bits > KEYLENGTH ? KEYLENGTH : bits;
	return ((a ^ b) << offset) >> (KEYLENGTH - bits) == 0;
263
}
264 265 266 267 268 269

static inline int tkey_mismatch(t_key a, int offset, t_key b)
{
	t_key diff = a ^ b;
	int i = offset;

270 271 272
	if (!diff)
		return 0;
	while ((diff << i) >> (KEYLENGTH-1) == 0)
273 274 275 276 277
		i++;
	return i;
}

/*
278 279
  To understand this stuff, an understanding of keys and all their bits is
  necessary. Every node in the trie has a key associated with it, but not
280 281 282 283
  all of the bits in that key are significant.

  Consider a node 'n' and its parent 'tp'.

284 285 286 287 288
  If n is a leaf, every bit in its key is significant. Its presence is
  necessitated by path compression, since during a tree traversal (when
  searching for a leaf - unless we are doing an insertion) we will completely
  ignore all skipped bits we encounter. Thus we need to verify, at the end of
  a potentially successful search, that we have indeed been walking the
289 290
  correct key path.

291 292 293 294 295
  Note that we can never "miss" the correct key in the tree if present by
  following the wrong path. Path compression ensures that segments of the key
  that are the same for all keys with a given prefix are skipped, but the
  skipped part *is* identical for each node in the subtrie below the skipped
  bit! trie_insert() in this implementation takes care of that - note the
296 297
  call to tkey_sub_equals() in trie_insert().

298
  if n is an internal node - a 'tnode' here, the various parts of its key
299 300
  have many different meanings.

301
  Example:
302 303 304
  _________________________________________________________________
  | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
  -----------------------------------------------------------------
305
    0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15
306 307 308 309 310 311 312 313 314

  _________________________________________________________________
  | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
  -----------------------------------------------------------------
   16  17  18  19  20  21  22  23  24  25  26  27  28  29  30  31

  tp->pos = 7
  tp->bits = 3
  n->pos = 15
O
Olof Johansson 已提交
315
  n->bits = 4
316

317 318
  First, let's just ignore the bits that come before the parent tp, that is
  the bits from 0 to (tp->pos-1). They are *known* but at this point we do
319 320 321
  not use them for anything.

  The bits from (tp->pos) to (tp->pos + tp->bits - 1) - "N", above - are the
322
  index into the parent's child array. That is, they will be used to find
323 324 325 326 327
  'n' among tp's children.

  The bits from (tp->pos + tp->bits) to (n->pos - 1) - "S" - are skipped bits
  for the node n.

328
  All the bits we have seen so far are significant to the node n. The rest
329 330
  of the bits are really not needed or indeed known in n->key.

331
  The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into
332
  n's child array, and will of course be different for each child.
333

334

335 336 337 338 339
  The rest of the bits, from (n->pos + n->bits) onward, are completely unknown
  at this point.

*/

S
Stephen Hemminger 已提交
340
static inline void check_tnode(const struct tnode *tn)
341
{
S
Stephen Hemminger 已提交
342
	WARN_ON(tn && tn->pos+tn->bits > 32);
343 344
}

345 346
static const int halve_threshold = 25;
static const int inflate_threshold = 50;
347
static const int halve_threshold_root = 15;
J
Jens Låås 已提交
348
static const int inflate_threshold_root = 30;
R
Robert Olsson 已提交
349 350

static void __alias_free_mem(struct rcu_head *head)
351
{
R
Robert Olsson 已提交
352 353
	struct fib_alias *fa = container_of(head, struct fib_alias, rcu);
	kmem_cache_free(fn_alias_kmem, fa);
354 355
}

R
Robert Olsson 已提交
356
static inline void alias_free_mem_rcu(struct fib_alias *fa)
357
{
R
Robert Olsson 已提交
358 359
	call_rcu(&fa->rcu, __alias_free_mem);
}
O
Olof Johansson 已提交
360

R
Robert Olsson 已提交
361 362
static void __leaf_free_rcu(struct rcu_head *head)
{
363 364
	struct leaf *l = container_of(head, struct leaf, rcu);
	kmem_cache_free(trie_leaf_kmem, l);
R
Robert Olsson 已提交
365
}
O
Olof Johansson 已提交
366

367 368 369 370 371
static inline void free_leaf(struct leaf *l)
{
	call_rcu_bh(&l->rcu, __leaf_free_rcu);
}

R
Robert Olsson 已提交
372
static inline void free_leaf_info(struct leaf_info *leaf)
373
{
374
	kfree_rcu(leaf, rcu);
375 376
}

377
static struct tnode *tnode_alloc(size_t size)
378
{
R
Robert Olsson 已提交
379
	if (size <= PAGE_SIZE)
380
		return kzalloc(size, GFP_KERNEL);
381
	else
382
		return vzalloc(size);
383
}
R
Robert Olsson 已提交
384

385 386 387 388
static void __tnode_vfree(struct work_struct *arg)
{
	struct tnode *tn = container_of(arg, struct tnode, work);
	vfree(tn);
389 390
}

R
Robert Olsson 已提交
391
static void __tnode_free_rcu(struct rcu_head *head)
392
{
R
Robert Olsson 已提交
393
	struct tnode *tn = container_of(head, struct tnode, rcu);
394
	size_t size = sizeof(struct tnode) +
395
		      (sizeof(struct rt_trie_node *) << tn->bits);
396 397 398

	if (size <= PAGE_SIZE)
		kfree(tn);
399 400 401 402
	else {
		INIT_WORK(&tn->work, __tnode_vfree);
		schedule_work(&tn->work);
	}
403 404
}

R
Robert Olsson 已提交
405 406
static inline void tnode_free(struct tnode *tn)
{
407 408 409
	if (IS_LEAF(tn))
		free_leaf((struct leaf *) tn);
	else
R
Robert Olsson 已提交
410
		call_rcu(&tn->rcu, __tnode_free_rcu);
R
Robert Olsson 已提交
411 412
}

J
Jarek Poplawski 已提交
413 414 415
static void tnode_free_safe(struct tnode *tn)
{
	BUG_ON(IS_LEAF(tn));
416 417
	tn->tnode_free = tnode_free_head;
	tnode_free_head = tn;
418
	tnode_free_size += sizeof(struct tnode) +
419
			   (sizeof(struct rt_trie_node *) << tn->bits);
J
Jarek Poplawski 已提交
420 421 422 423 424 425 426 427 428 429 430
}

static void tnode_free_flush(void)
{
	struct tnode *tn;

	while ((tn = tnode_free_head)) {
		tnode_free_head = tn->tnode_free;
		tn->tnode_free = NULL;
		tnode_free(tn);
	}
431 432 433 434 435

	if (tnode_free_size >= PAGE_SIZE * sync_pages) {
		tnode_free_size = 0;
		synchronize_rcu();
	}
J
Jarek Poplawski 已提交
436 437
}

R
Robert Olsson 已提交
438 439
static struct leaf *leaf_new(void)
{
440
	struct leaf *l = kmem_cache_alloc(trie_leaf_kmem, GFP_KERNEL);
R
Robert Olsson 已提交
441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457
	if (l) {
		l->parent = T_LEAF;
		INIT_HLIST_HEAD(&l->list);
	}
	return l;
}

static struct leaf_info *leaf_info_new(int plen)
{
	struct leaf_info *li = kmalloc(sizeof(struct leaf_info),  GFP_KERNEL);
	if (li) {
		li->plen = plen;
		INIT_LIST_HEAD(&li->falh);
	}
	return li;
}

458
static struct tnode *tnode_new(t_key key, int pos, int bits)
459
{
460
	size_t sz = sizeof(struct tnode) + (sizeof(struct rt_trie_node *) << bits);
461
	struct tnode *tn = tnode_alloc(sz);
462

O
Olof Johansson 已提交
463
	if (tn) {
R
Robert Olsson 已提交
464
		tn->parent = T_TNODE;
465 466 467 468 469 470
		tn->pos = pos;
		tn->bits = bits;
		tn->key = key;
		tn->full_children = 0;
		tn->empty_children = 1<<bits;
	}
471

E
Eric Dumazet 已提交
472
	pr_debug("AT %p s=%zu %zu\n", tn, sizeof(struct tnode),
473
		 sizeof(struct rt_trie_node) << bits);
474 475 476 477 478 479 480 481
	return tn;
}

/*
 * Check whether a tnode 'n' is "full", i.e. it is an internal node
 * and no bits are skipped. See discussion in dyntree paper p. 6
 */

482
static inline int tnode_full(const struct tnode *tn, const struct rt_trie_node *n)
483
{
484
	if (n == NULL || IS_LEAF(n))
485 486 487 488 489
		return 0;

	return ((struct tnode *) n)->pos == tn->pos + tn->bits;
}

490
static inline void put_child(struct trie *t, struct tnode *tn, int i,
491
			     struct rt_trie_node *n)
492 493 494 495
{
	tnode_put_child_reorg(tn, i, n, -1);
}

496
 /*
497 498 499 500
  * Add a child at position i overwriting the old value.
  * Update the value of full_children and empty_children.
  */

501
static void tnode_put_child_reorg(struct tnode *tn, int i, struct rt_trie_node *n,
502
				  int wasfull)
503
{
E
Eric Dumazet 已提交
504
	struct rt_trie_node *chi = rtnl_dereference(tn->child[i]);
505 506
	int isfull;

S
Stephen Hemminger 已提交
507 508
	BUG_ON(i >= 1<<tn->bits);

509 510 511 512 513
	/* update emptyChildren */
	if (n == NULL && chi != NULL)
		tn->empty_children++;
	else if (n != NULL && chi == NULL)
		tn->empty_children--;
514

515
	/* update fullChildren */
O
Olof Johansson 已提交
516
	if (wasfull == -1)
517 518 519
		wasfull = tnode_full(tn, chi);

	isfull = tnode_full(tn, n);
520
	if (wasfull && !isfull)
521
		tn->full_children--;
522
	else if (!wasfull && isfull)
523
		tn->full_children++;
O
Olof Johansson 已提交
524

525
	if (n)
S
Stephen Hemminger 已提交
526
		node_set_parent(n, tn);
527

R
Robert Olsson 已提交
528
	rcu_assign_pointer(tn->child[i], n);
529 530
}

J
Jens Låås 已提交
531
#define MAX_WORK 10
532
static struct rt_trie_node *resize(struct trie *t, struct tnode *tn)
533 534
{
	int i;
535
	struct tnode *old_tn;
536 537
	int inflate_threshold_use;
	int halve_threshold_use;
J
Jens Låås 已提交
538
	int max_work;
539

540
	if (!tn)
541 542
		return NULL;

S
Stephen Hemminger 已提交
543 544
	pr_debug("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
		 tn, inflate_threshold, halve_threshold);
545 546 547

	/* No children */
	if (tn->empty_children == tnode_child_length(tn)) {
J
Jarek Poplawski 已提交
548
		tnode_free_safe(tn);
549 550 551 552
		return NULL;
	}
	/* One child */
	if (tn->empty_children == tnode_child_length(tn) - 1)
J
Jens Låås 已提交
553
		goto one_child;
554
	/*
555 556 557 558 559
	 * Double as long as the resulting node has a number of
	 * nonempty nodes that are above the threshold.
	 */

	/*
560 561
	 * From "Implementing a dynamic compressed trie" by Stefan Nilsson of
	 * the Helsinki University of Technology and Matti Tikkanen of Nokia
562
	 * Telecommunications, page 6:
563
	 * "A node is doubled if the ratio of non-empty children to all
564 565
	 * children in the *doubled* node is at least 'high'."
	 *
566 567 568 569 570
	 * 'high' in this instance is the variable 'inflate_threshold'. It
	 * is expressed as a percentage, so we multiply it with
	 * tnode_child_length() and instead of multiplying by 2 (since the
	 * child array will be doubled by inflate()) and multiplying
	 * the left-hand side by 100 (to handle the percentage thing) we
571
	 * multiply the left-hand side by 50.
572 573 574 575
	 *
	 * The left-hand side may look a bit weird: tnode_child_length(tn)
	 * - tn->empty_children is of course the number of non-null children
	 * in the current node. tn->full_children is the number of "full"
576
	 * children, that is non-null tnodes with a skip value of 0.
577
	 * All of those will be doubled in the resulting inflated tnode, so
578
	 * we just count them one extra time here.
579
	 *
580
	 * A clearer way to write this would be:
581
	 *
582
	 * to_be_doubled = tn->full_children;
583
	 * not_to_be_doubled = tnode_child_length(tn) - tn->empty_children -
584 585 586 587
	 *     tn->full_children;
	 *
	 * new_child_length = tnode_child_length(tn) * 2;
	 *
588
	 * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) /
589 590
	 *      new_child_length;
	 * if (new_fill_factor >= inflate_threshold)
591 592 593
	 *
	 * ...and so on, tho it would mess up the while () loop.
	 *
594 595 596
	 * anyway,
	 * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >=
	 *      inflate_threshold
597
	 *
598 599 600
	 * avoid a division:
	 * 100 * (not_to_be_doubled + 2*to_be_doubled) >=
	 *      inflate_threshold * new_child_length
601
	 *
602
	 * expand not_to_be_doubled and to_be_doubled, and shorten:
603
	 * 100 * (tnode_child_length(tn) - tn->empty_children +
O
Olof Johansson 已提交
604
	 *    tn->full_children) >= inflate_threshold * new_child_length
605
	 *
606
	 * expand new_child_length:
607
	 * 100 * (tnode_child_length(tn) - tn->empty_children +
O
Olof Johansson 已提交
608
	 *    tn->full_children) >=
609
	 *      inflate_threshold * tnode_child_length(tn) * 2
610
	 *
611
	 * shorten again:
612
	 * 50 * (tn->full_children + tnode_child_length(tn) -
O
Olof Johansson 已提交
613
	 *    tn->empty_children) >= inflate_threshold *
614
	 *    tnode_child_length(tn)
615
	 *
616 617 618
	 */

	check_tnode(tn);
619

620 621
	/* Keep root node larger  */

622
	if (!node_parent((struct rt_trie_node *)tn)) {
J
Jens Låås 已提交
623 624
		inflate_threshold_use = inflate_threshold_root;
		halve_threshold_use = halve_threshold_root;
E
Eric Dumazet 已提交
625
	} else {
626
		inflate_threshold_use = inflate_threshold;
J
Jens Låås 已提交
627 628
		halve_threshold_use = halve_threshold;
	}
629

J
Jens Låås 已提交
630 631
	max_work = MAX_WORK;
	while ((tn->full_children > 0 &&  max_work-- &&
632 633 634
		50 * (tn->full_children + tnode_child_length(tn)
		      - tn->empty_children)
		>= inflate_threshold_use * tnode_child_length(tn))) {
635

636 637
		old_tn = tn;
		tn = inflate(t, tn);
638

639 640
		if (IS_ERR(tn)) {
			tn = old_tn;
641 642 643 644 645
#ifdef CONFIG_IP_FIB_TRIE_STATS
			t->stats.resize_node_skipped++;
#endif
			break;
		}
646 647 648 649
	}

	check_tnode(tn);

J
Jens Låås 已提交
650
	/* Return if at least one inflate is run */
E
Eric Dumazet 已提交
651
	if (max_work != MAX_WORK)
652
		return (struct rt_trie_node *) tn;
J
Jens Låås 已提交
653

654 655 656 657
	/*
	 * Halve as long as the number of empty children in this
	 * node is above threshold.
	 */
658

J
Jens Låås 已提交
659 660
	max_work = MAX_WORK;
	while (tn->bits > 1 &&  max_work-- &&
661
	       100 * (tnode_child_length(tn) - tn->empty_children) <
662
	       halve_threshold_use * tnode_child_length(tn)) {
663

664 665 666 667
		old_tn = tn;
		tn = halve(t, tn);
		if (IS_ERR(tn)) {
			tn = old_tn;
668 669 670 671 672 673
#ifdef CONFIG_IP_FIB_TRIE_STATS
			t->stats.resize_node_skipped++;
#endif
			break;
		}
	}
674

675

676
	/* Only one child remains */
J
Jens Låås 已提交
677 678
	if (tn->empty_children == tnode_child_length(tn) - 1) {
one_child:
679
		for (i = 0; i < tnode_child_length(tn); i++) {
680
			struct rt_trie_node *n;
681

E
Eric Dumazet 已提交
682
			n = rtnl_dereference(tn->child[i]);
R
Robert Olsson 已提交
683
			if (!n)
O
Olof Johansson 已提交
684 685 686 687
				continue;

			/* compress one level */

S
Stephen Hemminger 已提交
688
			node_set_parent(n, NULL);
J
Jarek Poplawski 已提交
689
			tnode_free_safe(tn);
O
Olof Johansson 已提交
690
			return n;
691
		}
J
Jens Låås 已提交
692
	}
693
	return (struct rt_trie_node *) tn;
694 695
}

E
Eric Dumazet 已提交
696 697 698 699 700 701 702 703 704 705 706 707 708 709

static void tnode_clean_free(struct tnode *tn)
{
	int i;
	struct tnode *tofree;

	for (i = 0; i < tnode_child_length(tn); i++) {
		tofree = (struct tnode *)rtnl_dereference(tn->child[i]);
		if (tofree)
			tnode_free(tofree);
	}
	tnode_free(tn);
}

710
static struct tnode *inflate(struct trie *t, struct tnode *tn)
711 712 713 714 715
{
	struct tnode *oldtnode = tn;
	int olen = tnode_child_length(tn);
	int i;

S
Stephen Hemminger 已提交
716
	pr_debug("In inflate\n");
717 718 719

	tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1);

S
Stephen Hemminger 已提交
720
	if (!tn)
721
		return ERR_PTR(-ENOMEM);
722 723

	/*
724 725 726
	 * Preallocate and store tnodes before the actual work so we
	 * don't get into an inconsistent state if memory allocation
	 * fails. In case of failure we return the oldnode and  inflate
727 728
	 * of tnode is ignored.
	 */
O
Olof Johansson 已提交
729 730

	for (i = 0; i < olen; i++) {
731
		struct tnode *inode;
732

733
		inode = (struct tnode *) tnode_get_child(oldtnode, i);
734 735 736 737 738
		if (inode &&
		    IS_TNODE(inode) &&
		    inode->pos == oldtnode->pos + oldtnode->bits &&
		    inode->bits > 1) {
			struct tnode *left, *right;
S
Stephen Hemminger 已提交
739
			t_key m = ~0U << (KEYLENGTH - 1) >> inode->pos;
740

741 742
			left = tnode_new(inode->key&(~m), inode->pos + 1,
					 inode->bits - 1);
743 744
			if (!left)
				goto nomem;
O
Olof Johansson 已提交
745

746 747 748
			right = tnode_new(inode->key|m, inode->pos + 1,
					  inode->bits - 1);

749
			if (!right) {
750 751
				tnode_free(left);
				goto nomem;
752
			}
753

754 755
			put_child(t, tn, 2*i, (struct rt_trie_node *) left);
			put_child(t, tn, 2*i+1, (struct rt_trie_node *) right);
756 757 758
		}
	}

O
Olof Johansson 已提交
759
	for (i = 0; i < olen; i++) {
760
		struct tnode *inode;
761
		struct rt_trie_node *node = tnode_get_child(oldtnode, i);
O
Olof Johansson 已提交
762 763
		struct tnode *left, *right;
		int size, j;
764

765 766 767 768 769 770
		/* An empty child */
		if (node == NULL)
			continue;

		/* A leaf or an internal node with skipped bits */

771
		if (IS_LEAF(node) || ((struct tnode *) node)->pos >
772
		   tn->pos + tn->bits - 1) {
773 774 775
			if (tkey_extract_bits(node->key,
					      oldtnode->pos + oldtnode->bits,
					      1) == 0)
776 777 778 779 780 781 782 783 784 785
				put_child(t, tn, 2*i, node);
			else
				put_child(t, tn, 2*i+1, node);
			continue;
		}

		/* An internal node with two children */
		inode = (struct tnode *) node;

		if (inode->bits == 1) {
E
Eric Dumazet 已提交
786 787
			put_child(t, tn, 2*i, rtnl_dereference(inode->child[0]));
			put_child(t, tn, 2*i+1, rtnl_dereference(inode->child[1]));
788

J
Jarek Poplawski 已提交
789
			tnode_free_safe(inode);
O
Olof Johansson 已提交
790
			continue;
791 792
		}

O
Olof Johansson 已提交
793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810
		/* An internal node with more than two children */

		/* We will replace this node 'inode' with two new
		 * ones, 'left' and 'right', each with half of the
		 * original children. The two new nodes will have
		 * a position one bit further down the key and this
		 * means that the "significant" part of their keys
		 * (see the discussion near the top of this file)
		 * will differ by one bit, which will be "0" in
		 * left's key and "1" in right's key. Since we are
		 * moving the key position by one step, the bit that
		 * we are moving away from - the bit at position
		 * (inode->pos) - is the one that will differ between
		 * left and right. So... we synthesize that bit in the
		 * two  new keys.
		 * The mask 'm' below will be a single "one" bit at
		 * the position (inode->pos)
		 */
811

O
Olof Johansson 已提交
812 813 814
		/* Use the old key, but set the new significant
		 *   bit to zero.
		 */
815

O
Olof Johansson 已提交
816 817
		left = (struct tnode *) tnode_get_child(tn, 2*i);
		put_child(t, tn, 2*i, NULL);
818

O
Olof Johansson 已提交
819
		BUG_ON(!left);
820

O
Olof Johansson 已提交
821 822
		right = (struct tnode *) tnode_get_child(tn, 2*i+1);
		put_child(t, tn, 2*i+1, NULL);
823

O
Olof Johansson 已提交
824
		BUG_ON(!right);
825

O
Olof Johansson 已提交
826 827
		size = tnode_child_length(left);
		for (j = 0; j < size; j++) {
E
Eric Dumazet 已提交
828 829
			put_child(t, left, j, rtnl_dereference(inode->child[j]));
			put_child(t, right, j, rtnl_dereference(inode->child[j + size]));
830
		}
O
Olof Johansson 已提交
831 832 833
		put_child(t, tn, 2*i, resize(t, left));
		put_child(t, tn, 2*i+1, resize(t, right));

J
Jarek Poplawski 已提交
834
		tnode_free_safe(inode);
835
	}
J
Jarek Poplawski 已提交
836
	tnode_free_safe(oldtnode);
837
	return tn;
838
nomem:
E
Eric Dumazet 已提交
839 840
	tnode_clean_free(tn);
	return ERR_PTR(-ENOMEM);
841 842
}

843
static struct tnode *halve(struct trie *t, struct tnode *tn)
844 845
{
	struct tnode *oldtnode = tn;
846
	struct rt_trie_node *left, *right;
847 848 849
	int i;
	int olen = tnode_child_length(tn);

S
Stephen Hemminger 已提交
850
	pr_debug("In halve\n");
851 852

	tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1);
853

854 855
	if (!tn)
		return ERR_PTR(-ENOMEM);
856 857

	/*
858 859 860
	 * Preallocate and store tnodes before the actual work so we
	 * don't get into an inconsistent state if memory allocation
	 * fails. In case of failure we return the oldnode and halve
861 862 863
	 * of tnode is ignored.
	 */

O
Olof Johansson 已提交
864
	for (i = 0; i < olen; i += 2) {
865 866
		left = tnode_get_child(oldtnode, i);
		right = tnode_get_child(oldtnode, i+1);
867

868
		/* Two nonempty children */
S
Stephen Hemminger 已提交
869
		if (left && right) {
870
			struct tnode *newn;
S
Stephen Hemminger 已提交
871

872
			newn = tnode_new(left->key, tn->pos + tn->bits, 1);
S
Stephen Hemminger 已提交
873 874

			if (!newn)
875
				goto nomem;
S
Stephen Hemminger 已提交
876

877
			put_child(t, tn, i/2, (struct rt_trie_node *)newn);
878 879 880
		}

	}
881

O
Olof Johansson 已提交
882 883 884
	for (i = 0; i < olen; i += 2) {
		struct tnode *newBinNode;

885 886
		left = tnode_get_child(oldtnode, i);
		right = tnode_get_child(oldtnode, i+1);
887

888 889 890 891 892
		/* At least one of the children is empty */
		if (left == NULL) {
			if (right == NULL)    /* Both are empty */
				continue;
			put_child(t, tn, i/2, right);
O
Olof Johansson 已提交
893
			continue;
S
Stephen Hemminger 已提交
894
		}
O
Olof Johansson 已提交
895 896

		if (right == NULL) {
897
			put_child(t, tn, i/2, left);
O
Olof Johansson 已提交
898 899
			continue;
		}
900

901
		/* Two nonempty children */
O
Olof Johansson 已提交
902 903 904 905 906
		newBinNode = (struct tnode *) tnode_get_child(tn, i/2);
		put_child(t, tn, i/2, NULL);
		put_child(t, newBinNode, 0, left);
		put_child(t, newBinNode, 1, right);
		put_child(t, tn, i/2, resize(t, newBinNode));
907
	}
J
Jarek Poplawski 已提交
908
	tnode_free_safe(oldtnode);
909
	return tn;
910
nomem:
E
Eric Dumazet 已提交
911 912
	tnode_clean_free(tn);
	return ERR_PTR(-ENOMEM);
913 914
}

R
Robert Olsson 已提交
915
/* readside must use rcu_read_lock currently dump routines
R
Robert Olsson 已提交
916 917
 via get_fa_head and dump */

R
Robert Olsson 已提交
918
static struct leaf_info *find_leaf_info(struct leaf *l, int plen)
919
{
R
Robert Olsson 已提交
920
	struct hlist_head *head = &l->list;
921 922 923
	struct hlist_node *node;
	struct leaf_info *li;

R
Robert Olsson 已提交
924
	hlist_for_each_entry_rcu(li, node, head, hlist)
925
		if (li->plen == plen)
926
			return li;
O
Olof Johansson 已提交
927

928 929 930
	return NULL;
}

931
static inline struct list_head *get_fa_head(struct leaf *l, int plen)
932
{
R
Robert Olsson 已提交
933
	struct leaf_info *li = find_leaf_info(l, plen);
934

O
Olof Johansson 已提交
935 936
	if (!li)
		return NULL;
937

O
Olof Johansson 已提交
938
	return &li->falh;
939 940 941 942
}

static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new)
{
943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959
	struct leaf_info *li = NULL, *last = NULL;
	struct hlist_node *node;

	if (hlist_empty(head)) {
		hlist_add_head_rcu(&new->hlist, head);
	} else {
		hlist_for_each_entry(li, node, head, hlist) {
			if (new->plen > li->plen)
				break;

			last = li;
		}
		if (last)
			hlist_add_after_rcu(&last->hlist, &new->hlist);
		else
			hlist_add_before_rcu(&new->hlist, &li->hlist);
	}
960 961
}

R
Robert Olsson 已提交
962 963
/* rcu_read_lock needs to be hold by caller from readside */

964 965 966 967 968
static struct leaf *
fib_find_node(struct trie *t, u32 key)
{
	int pos;
	struct tnode *tn;
969
	struct rt_trie_node *n;
970 971

	pos = 0;
E
Eric Dumazet 已提交
972
	n = rcu_dereference_rtnl(t->trie);
973 974 975

	while (n != NULL &&  NODE_TYPE(n) == T_TNODE) {
		tn = (struct tnode *) n;
O
Olof Johansson 已提交
976

977
		check_tnode(tn);
O
Olof Johansson 已提交
978

979
		if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
O
Olof Johansson 已提交
980
			pos = tn->pos + tn->bits;
981 982 983 984
			n = tnode_get_child_rcu(tn,
						tkey_extract_bits(key,
								  tn->pos,
								  tn->bits));
O
Olof Johansson 已提交
985
		} else
986 987 988 989
			break;
	}
	/* Case we have found a leaf. Compare prefixes */

O
Olof Johansson 已提交
990 991 992
	if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key))
		return (struct leaf *)n;

993 994 995
	return NULL;
}

996
static void trie_rebalance(struct trie *t, struct tnode *tn)
997 998
{
	int wasfull;
R
Robert Olsson 已提交
999
	t_key cindex, key;
S
Stephen Hemminger 已提交
1000
	struct tnode *tp;
1001

R
Robert Olsson 已提交
1002 1003
	key = tn->key;

1004
	while (tn != NULL && (tp = node_parent((struct rt_trie_node *)tn)) != NULL) {
1005 1006
		cindex = tkey_extract_bits(key, tp->pos, tp->bits);
		wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
1007 1008 1009
		tn = (struct tnode *) resize(t, (struct tnode *)tn);

		tnode_put_child_reorg((struct tnode *)tp, cindex,
1010
				      (struct rt_trie_node *)tn, wasfull);
O
Olof Johansson 已提交
1011

1012
		tp = node_parent((struct rt_trie_node *) tn);
1013
		if (!tp)
1014
			rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn);
1015

J
Jarek Poplawski 已提交
1016
		tnode_free_flush();
S
Stephen Hemminger 已提交
1017
		if (!tp)
1018
			break;
S
Stephen Hemminger 已提交
1019
		tn = tp;
1020
	}
S
Stephen Hemminger 已提交
1021

1022
	/* Handle last (top) tnode */
1023
	if (IS_TNODE(tn))
1024
		tn = (struct tnode *)resize(t, (struct tnode *)tn);
1025

1026
	rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn);
1027
	tnode_free_flush();
1028 1029
}

R
Robert Olsson 已提交
1030 1031
/* only used from updater-side */

1032
static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
1033 1034 1035
{
	int pos, newpos;
	struct tnode *tp = NULL, *tn = NULL;
1036
	struct rt_trie_node *n;
1037 1038
	struct leaf *l;
	int missbit;
1039
	struct list_head *fa_head = NULL;
1040 1041 1042 1043
	struct leaf_info *li;
	t_key cindex;

	pos = 0;
E
Eric Dumazet 已提交
1044
	n = rtnl_dereference(t->trie);
1045

1046 1047
	/* If we point to NULL, stop. Either the tree is empty and we should
	 * just put a new leaf in if, or we have reached an empty child slot,
1048
	 * and we should just put our new leaf in that.
1049 1050
	 * If we point to a T_TNODE, check if it matches our key. Note that
	 * a T_TNODE might be skipping any number of bits - its 'pos' need
1051 1052
	 * not be the parent's 'pos'+'bits'!
	 *
1053
	 * If it does match the current key, get pos/bits from it, extract
1054 1055 1056 1057
	 * the index from our key, push the T_TNODE and walk the tree.
	 *
	 * If it doesn't, we have to replace it with a new T_TNODE.
	 *
1058 1059 1060
	 * If we point to a T_LEAF, it might or might not have the same key
	 * as we do. If it does, just change the value, update the T_LEAF's
	 * value, and return it.
1061 1062 1063 1064 1065
	 * If it doesn't, we need to replace it with a T_TNODE.
	 */

	while (n != NULL &&  NODE_TYPE(n) == T_TNODE) {
		tn = (struct tnode *) n;
O
Olof Johansson 已提交
1066

1067
		check_tnode(tn);
O
Olof Johansson 已提交
1068

1069
		if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
1070
			tp = tn;
O
Olof Johansson 已提交
1071
			pos = tn->pos + tn->bits;
1072 1073 1074 1075
			n = tnode_get_child(tn,
					    tkey_extract_bits(key,
							      tn->pos,
							      tn->bits));
1076

S
Stephen Hemminger 已提交
1077
			BUG_ON(n && node_parent(n) != tn);
O
Olof Johansson 已提交
1078
		} else
1079 1080 1081 1082 1083 1084
			break;
	}

	/*
	 * n  ----> NULL, LEAF or TNODE
	 *
1085
	 * tp is n's (parent) ----> NULL or TNODE
1086 1087
	 */

O
Olof Johansson 已提交
1088
	BUG_ON(tp && IS_LEAF(tp));
1089 1090 1091

	/* Case 1: n is a leaf. Compare prefixes */

1092
	if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) {
1093
		l = (struct leaf *) n;
1094
		li = leaf_info_new(plen);
O
Olof Johansson 已提交
1095

1096 1097
		if (!li)
			return NULL;
1098 1099 1100 1101 1102 1103 1104

		fa_head = &li->falh;
		insert_leaf_info(&l->list, li);
		goto done;
	}
	l = leaf_new();

1105 1106
	if (!l)
		return NULL;
1107 1108 1109 1110

	l->key = key;
	li = leaf_info_new(plen);

1111
	if (!li) {
1112
		free_leaf(l);
1113
		return NULL;
1114
	}
1115 1116 1117 1118 1119

	fa_head = &li->falh;
	insert_leaf_info(&l->list, li);

	if (t->trie && n == NULL) {
O
Olof Johansson 已提交
1120
		/* Case 2: n is NULL, and will just insert a new leaf */
1121

1122
		node_set_parent((struct rt_trie_node *)l, tp);
1123

O
Olof Johansson 已提交
1124
		cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1125
		put_child(t, (struct tnode *)tp, cindex, (struct rt_trie_node *)l);
O
Olof Johansson 已提交
1126 1127
	} else {
		/* Case 3: n is a LEAF or a TNODE and the key doesn't match. */
1128 1129
		/*
		 *  Add a new tnode here
1130 1131 1132 1133
		 *  first tnode need some special handling
		 */

		if (tp)
O
Olof Johansson 已提交
1134
			pos = tp->pos+tp->bits;
1135
		else
O
Olof Johansson 已提交
1136 1137
			pos = 0;

1138
		if (n) {
1139 1140
			newpos = tkey_mismatch(key, pos, n->key);
			tn = tnode_new(n->key, newpos, 1);
O
Olof Johansson 已提交
1141
		} else {
1142
			newpos = 0;
1143
			tn = tnode_new(key, newpos, 1); /* First tnode */
1144 1145
		}

1146
		if (!tn) {
1147
			free_leaf_info(li);
1148
			free_leaf(l);
1149
			return NULL;
O
Olof Johansson 已提交
1150 1151
		}

1152
		node_set_parent((struct rt_trie_node *)tn, tp);
1153

O
Olof Johansson 已提交
1154
		missbit = tkey_extract_bits(key, newpos, 1);
1155
		put_child(t, tn, missbit, (struct rt_trie_node *)l);
1156 1157
		put_child(t, tn, 1-missbit, n);

1158
		if (tp) {
1159
			cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1160
			put_child(t, (struct tnode *)tp, cindex,
1161
				  (struct rt_trie_node *)tn);
O
Olof Johansson 已提交
1162
		} else {
1163
			rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn);
1164 1165 1166
			tp = tn;
		}
	}
O
Olof Johansson 已提交
1167 1168

	if (tp && tp->pos + tp->bits > 32)
1169 1170 1171
		pr_warning("fib_trie"
			   " tp=%p pos=%d, bits=%d, key=%0x plen=%d\n",
			   tp, tp->pos, tp->bits, key, plen);
O
Olof Johansson 已提交
1172

1173
	/* Rebalance the trie */
R
Robert Olsson 已提交
1174

1175
	trie_rebalance(t, tp);
1176
done:
1177 1178 1179
	return fa_head;
}

1180 1181 1182
/*
 * Caller must hold RTNL.
 */
1183
int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
1184 1185 1186
{
	struct trie *t = (struct trie *) tb->tb_data;
	struct fib_alias *fa, *new_fa;
1187
	struct list_head *fa_head = NULL;
1188
	struct fib_info *fi;
1189 1190
	int plen = cfg->fc_dst_len;
	u8 tos = cfg->fc_tos;
1191 1192 1193 1194 1195 1196 1197
	u32 key, mask;
	int err;
	struct leaf *l;

	if (plen > 32)
		return -EINVAL;

1198
	key = ntohl(cfg->fc_dst);
1199

1200
	pr_debug("Insert table=%u %08x/%d\n", tb->tb_id, key, plen);
1201

O
Olof Johansson 已提交
1202
	mask = ntohl(inet_make_mask(plen));
1203

1204
	if (key & ~mask)
1205 1206 1207 1208
		return -EINVAL;

	key = key & mask;

1209 1210 1211
	fi = fib_create_info(cfg);
	if (IS_ERR(fi)) {
		err = PTR_ERR(fi);
1212
		goto err;
1213
	}
1214 1215

	l = fib_find_node(t, key);
1216
	fa = NULL;
1217

1218
	if (l) {
1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233
		fa_head = get_fa_head(l, plen);
		fa = fib_find_alias(fa_head, tos, fi->fib_priority);
	}

	/* Now fa, if non-NULL, points to the first fib alias
	 * with the same keys [prefix,tos,priority], if such key already
	 * exists or to the node before which we will insert new one.
	 *
	 * If fa is NULL, we will need to allocate a new one and
	 * insert to the head of f.
	 *
	 * If f is NULL, no fib node matched the destination key
	 * and we need to allocate a new one of those as well.
	 */

1234 1235 1236
	if (fa && fa->fa_tos == tos &&
	    fa->fa_info->fib_priority == fi->fib_priority) {
		struct fib_alias *fa_first, *fa_match;
1237 1238

		err = -EEXIST;
1239
		if (cfg->fc_nlflags & NLM_F_EXCL)
1240 1241
			goto out;

1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261
		/* We have 2 goals:
		 * 1. Find exact match for type, scope, fib_info to avoid
		 * duplicate routes
		 * 2. Find next 'fa' (or head), NLM_F_APPEND inserts before it
		 */
		fa_match = NULL;
		fa_first = fa;
		fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
		list_for_each_entry_continue(fa, fa_head, fa_list) {
			if (fa->fa_tos != tos)
				break;
			if (fa->fa_info->fib_priority != fi->fib_priority)
				break;
			if (fa->fa_type == cfg->fc_type &&
			    fa->fa_info == fi) {
				fa_match = fa;
				break;
			}
		}

1262
		if (cfg->fc_nlflags & NLM_F_REPLACE) {
1263 1264 1265
			struct fib_info *fi_drop;
			u8 state;

1266 1267 1268 1269
			fa = fa_first;
			if (fa_match) {
				if (fa == fa_match)
					err = 0;
1270
				goto out;
1271
			}
R
Robert Olsson 已提交
1272
			err = -ENOBUFS;
1273
			new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
R
Robert Olsson 已提交
1274 1275
			if (new_fa == NULL)
				goto out;
1276 1277

			fi_drop = fa->fa_info;
R
Robert Olsson 已提交
1278 1279
			new_fa->fa_tos = fa->fa_tos;
			new_fa->fa_info = fi;
1280
			new_fa->fa_type = cfg->fc_type;
1281
			state = fa->fa_state;
1282
			new_fa->fa_state = state & ~FA_S_ACCESSED;
1283

R
Robert Olsson 已提交
1284 1285
			list_replace_rcu(&fa->fa_list, &new_fa->fa_list);
			alias_free_mem_rcu(fa);
1286 1287 1288

			fib_release_info(fi_drop);
			if (state & FA_S_ACCESSED)
1289
				rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
1290 1291
			rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen,
				tb->tb_id, &cfg->fc_nlinfo, NLM_F_REPLACE);
1292

O
Olof Johansson 已提交
1293
			goto succeeded;
1294 1295 1296 1297 1298
		}
		/* Error if we find a perfect match which
		 * uses the same scope, type, and nexthop
		 * information.
		 */
1299 1300
		if (fa_match)
			goto out;
1301

1302
		if (!(cfg->fc_nlflags & NLM_F_APPEND))
1303
			fa = fa_first;
1304 1305
	}
	err = -ENOENT;
1306
	if (!(cfg->fc_nlflags & NLM_F_CREATE))
1307 1308 1309
		goto out;

	err = -ENOBUFS;
1310
	new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
1311 1312 1313 1314 1315
	if (new_fa == NULL)
		goto out;

	new_fa->fa_info = fi;
	new_fa->fa_tos = tos;
1316
	new_fa->fa_type = cfg->fc_type;
1317 1318 1319 1320 1321
	new_fa->fa_state = 0;
	/*
	 * Insert new entry to the list.
	 */

1322
	if (!fa_head) {
1323 1324 1325
		fa_head = fib_insert_node(t, key, plen);
		if (unlikely(!fa_head)) {
			err = -ENOMEM;
1326
			goto out_free_new_fa;
1327
		}
1328
	}
1329

1330 1331 1332
	if (!plen)
		tb->tb_num_default++;

R
Robert Olsson 已提交
1333 1334
	list_add_tail_rcu(&new_fa->fa_list,
			  (fa ? &fa->fa_list : fa_head));
1335

1336
	rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
1337
	rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id,
1338
		  &cfg->fc_nlinfo, 0);
1339 1340
succeeded:
	return 0;
1341 1342 1343

out_free_new_fa:
	kmem_cache_free(fn_alias_kmem, new_fa);
1344 1345
out:
	fib_release_info(fi);
O
Olof Johansson 已提交
1346
err:
1347 1348 1349
	return err;
}

R
Robert Olsson 已提交
1350
/* should be called with rcu_read_lock */
1351
static int check_leaf(struct fib_table *tb, struct trie *t, struct leaf *l,
1352
		      t_key key,  const struct flowi4 *flp,
E
Eric Dumazet 已提交
1353
		      struct fib_result *res, int fib_flags)
1354 1355 1356 1357
{
	struct leaf_info *li;
	struct hlist_head *hhead = &l->list;
	struct hlist_node *node;
1358

R
Robert Olsson 已提交
1359
	hlist_for_each_entry_rcu(li, node, hhead, hlist) {
1360
		struct fib_alias *fa;
1361 1362 1363
		int plen = li->plen;
		__be32 mask = inet_make_mask(plen);

1364
		if (l->key != (key & ntohl(mask)))
1365 1366
			continue;

1367 1368 1369
		list_for_each_entry_rcu(fa, &li->falh, fa_list) {
			struct fib_info *fi = fa->fa_info;
			int nhsel, err;
1370

1371
			if (fa->fa_tos && fa->fa_tos != flp->flowi4_tos)
1372
				continue;
1373
			if (fa->fa_info->fib_scope < flp->flowi4_scope)
1374 1375 1376 1377
				continue;
			fib_alias_accessed(fa);
			err = fib_props[fa->fa_type].error;
			if (err) {
1378
#ifdef CONFIG_IP_FIB_TRIE_STATS
1379
				t->stats.semantic_match_passed++;
1380
#endif
1381
				return err;
1382 1383 1384 1385 1386 1387 1388 1389
			}
			if (fi->fib_flags & RTNH_F_DEAD)
				continue;
			for (nhsel = 0; nhsel < fi->fib_nhs; nhsel++) {
				const struct fib_nh *nh = &fi->fib_nh[nhsel];

				if (nh->nh_flags & RTNH_F_DEAD)
					continue;
1390
				if (flp->flowi4_oif && flp->flowi4_oif != nh->nh_oif)
1391 1392 1393 1394 1395 1396 1397 1398
					continue;

#ifdef CONFIG_IP_FIB_TRIE_STATS
				t->stats.semantic_match_passed++;
#endif
				res->prefixlen = plen;
				res->nh_sel = nhsel;
				res->type = fa->fa_type;
1399
				res->scope = fa->fa_info->fib_scope;
1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410
				res->fi = fi;
				res->table = tb;
				res->fa_head = &li->falh;
				if (!(fib_flags & FIB_LOOKUP_NOREF))
					atomic_inc(&res->fi->fib_clntref);
				return 0;
			}
		}

#ifdef CONFIG_IP_FIB_TRIE_STATS
		t->stats.semantic_match_miss++;
1411 1412
#endif
	}
1413

1414
	return 1;
1415 1416
}

1417
int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp,
E
Eric Dumazet 已提交
1418
		     struct fib_result *res, int fib_flags)
1419 1420
{
	struct trie *t = (struct trie *) tb->tb_data;
1421
	int ret;
1422
	struct rt_trie_node *n;
1423
	struct tnode *pn;
1424
	unsigned int pos, bits;
1425
	t_key key = ntohl(flp->daddr);
1426
	unsigned int chopped_off;
1427
	t_key cindex = 0;
1428
	unsigned int current_prefix_length = KEYLENGTH;
O
Olof Johansson 已提交
1429
	struct tnode *cn;
1430
	t_key pref_mismatch;
O
Olof Johansson 已提交
1431

R
Robert Olsson 已提交
1432
	rcu_read_lock();
O
Olof Johansson 已提交
1433

R
Robert Olsson 已提交
1434
	n = rcu_dereference(t->trie);
1435
	if (!n)
1436 1437 1438 1439 1440 1441 1442 1443
		goto failed;

#ifdef CONFIG_IP_FIB_TRIE_STATS
	t->stats.gets++;
#endif

	/* Just a leaf? */
	if (IS_LEAF(n)) {
1444
		ret = check_leaf(tb, t, (struct leaf *)n, key, flp, res, fib_flags);
1445
		goto found;
1446
	}
1447

1448 1449
	pn = (struct tnode *) n;
	chopped_off = 0;
1450

O
Olof Johansson 已提交
1451
	while (pn) {
1452 1453 1454
		pos = pn->pos;
		bits = pn->bits;

1455
		if (!chopped_off)
S
Stephen Hemminger 已提交
1456 1457
			cindex = tkey_extract_bits(mask_pfx(key, current_prefix_length),
						   pos, bits);
1458

1459
		n = tnode_get_child_rcu(pn, cindex);
1460 1461 1462 1463 1464 1465 1466 1467

		if (n == NULL) {
#ifdef CONFIG_IP_FIB_TRIE_STATS
			t->stats.null_node_hit++;
#endif
			goto backtrace;
		}

O
Olof Johansson 已提交
1468
		if (IS_LEAF(n)) {
1469
			ret = check_leaf(tb, t, (struct leaf *)n, key, flp, res, fib_flags);
1470
			if (ret > 0)
O
Olof Johansson 已提交
1471
				goto backtrace;
1472
			goto found;
O
Olof Johansson 已提交
1473 1474 1475
		}

		cn = (struct tnode *)n;
1476

O
Olof Johansson 已提交
1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491
		/*
		 * It's a tnode, and we can do some extra checks here if we
		 * like, to avoid descending into a dead-end branch.
		 * This tnode is in the parent's child array at index
		 * key[p_pos..p_pos+p_bits] but potentially with some bits
		 * chopped off, so in reality the index may be just a
		 * subprefix, padded with zero at the end.
		 * We can also take a look at any skipped bits in this
		 * tnode - everything up to p_pos is supposed to be ok,
		 * and the non-chopped bits of the index (se previous
		 * paragraph) are also guaranteed ok, but the rest is
		 * considered unknown.
		 *
		 * The skipped bits are key[pos+bits..cn->pos].
		 */
1492

O
Olof Johansson 已提交
1493 1494 1495 1496 1497 1498 1499 1500 1501
		/* If current_prefix_length < pos+bits, we are already doing
		 * actual prefix  matching, which means everything from
		 * pos+(bits-chopped_off) onward must be zero along some
		 * branch of this subtree - otherwise there is *no* valid
		 * prefix present. Here we can only check the skipped
		 * bits. Remember, since we have already indexed into the
		 * parent's child array, we know that the bits we chopped of
		 * *are* zero.
		 */
1502

1503 1504
		/* NOTA BENE: Checking only skipped bits
		   for the new node here */
1505

O
Olof Johansson 已提交
1506 1507
		if (current_prefix_length < pos+bits) {
			if (tkey_extract_bits(cn->key, current_prefix_length,
1508 1509
						cn->pos - current_prefix_length)
			    || !(cn->child[0]))
O
Olof Johansson 已提交
1510 1511
				goto backtrace;
		}
1512

O
Olof Johansson 已提交
1513 1514 1515 1516 1517 1518 1519 1520 1521
		/*
		 * If chopped_off=0, the index is fully validated and we
		 * only need to look at the skipped bits for this, the new,
		 * tnode. What we actually want to do is to find out if
		 * these skipped bits match our key perfectly, or if we will
		 * have to count on finding a matching prefix further down,
		 * because if we do, we would like to have some way of
		 * verifying the existence of such a prefix at this point.
		 */
1522

O
Olof Johansson 已提交
1523 1524 1525 1526 1527 1528 1529 1530
		/* The only thing we can do at this point is to verify that
		 * any such matching prefix can indeed be a prefix to our
		 * key, and if the bits in the node we are inspecting that
		 * do not match our key are not ZERO, this cannot be true.
		 * Thus, find out where there is a mismatch (before cn->pos)
		 * and verify that all the mismatching bits are zero in the
		 * new tnode's key.
		 */
1531

1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542
		/*
		 * Note: We aren't very concerned about the piece of
		 * the key that precede pn->pos+pn->bits, since these
		 * have already been checked. The bits after cn->pos
		 * aren't checked since these are by definition
		 * "unknown" at this point. Thus, what we want to see
		 * is if we are about to enter the "prefix matching"
		 * state, and in that case verify that the skipped
		 * bits that will prevail throughout this subtree are
		 * zero, as they have to be if we are to find a
		 * matching prefix.
O
Olof Johansson 已提交
1543 1544
		 */

1545
		pref_mismatch = mask_pfx(cn->key ^ key, cn->pos);
O
Olof Johansson 已提交
1546

1547 1548 1549 1550
		/*
		 * In short: If skipped bits in this node do not match
		 * the search key, enter the "prefix matching"
		 * state.directly.
O
Olof Johansson 已提交
1551 1552
		 */
		if (pref_mismatch) {
1553
			int mp = KEYLENGTH - fls(pref_mismatch);
O
Olof Johansson 已提交
1554

1555
			if (tkey_extract_bits(cn->key, mp, cn->pos - mp) != 0)
O
Olof Johansson 已提交
1556 1557 1558 1559
				goto backtrace;

			if (current_prefix_length >= cn->pos)
				current_prefix_length = mp;
1560
		}
1561

O
Olof Johansson 已提交
1562 1563 1564 1565
		pn = (struct tnode *)n; /* Descend */
		chopped_off = 0;
		continue;

1566 1567 1568 1569
backtrace:
		chopped_off++;

		/* As zero don't change the child key (cindex) */
1570 1571
		while ((chopped_off <= pn->bits)
		       && !(cindex & (1<<(chopped_off-1))))
1572 1573 1574 1575
			chopped_off++;

		/* Decrease current_... with bits chopped off */
		if (current_prefix_length > pn->pos + pn->bits - chopped_off)
1576 1577
			current_prefix_length = pn->pos + pn->bits
				- chopped_off;
O
Olof Johansson 已提交
1578

1579
		/*
1580
		 * Either we do the actual chop off according or if we have
1581 1582 1583
		 * chopped off all bits in this tnode walk up to our parent.
		 */

O
Olof Johansson 已提交
1584
		if (chopped_off <= pn->bits) {
1585
			cindex &= ~(1 << (chopped_off-1));
O
Olof Johansson 已提交
1586
		} else {
1587
			struct tnode *parent = node_parent_rcu((struct rt_trie_node *) pn);
S
Stephen Hemminger 已提交
1588
			if (!parent)
1589
				goto failed;
O
Olof Johansson 已提交
1590

1591
			/* Get Child's index */
S
Stephen Hemminger 已提交
1592 1593
			cindex = tkey_extract_bits(pn->key, parent->pos, parent->bits);
			pn = parent;
1594 1595 1596 1597 1598 1599
			chopped_off = 0;

#ifdef CONFIG_IP_FIB_TRIE_STATS
			t->stats.backtrack++;
#endif
			goto backtrace;
1600
		}
1601 1602
	}
failed:
1603
	ret = 1;
1604
found:
R
Robert Olsson 已提交
1605
	rcu_read_unlock();
1606 1607 1608
	return ret;
}

1609 1610 1611 1612
/*
 * Remove the leaf and return parent.
 */
static void trie_leaf_remove(struct trie *t, struct leaf *l)
1613
{
1614
	struct tnode *tp = node_parent((struct rt_trie_node *) l);
1615

1616
	pr_debug("entering trie_leaf_remove(%p)\n", l);
1617

1618
	if (tp) {
1619
		t_key cindex = tkey_extract_bits(l->key, tp->pos, tp->bits);
1620
		put_child(t, (struct tnode *)tp, cindex, NULL);
1621
		trie_rebalance(t, tp);
O
Olof Johansson 已提交
1622
	} else
R
Robert Olsson 已提交
1623
		rcu_assign_pointer(t->trie, NULL);
1624

1625
	free_leaf(l);
1626 1627
}

1628 1629 1630
/*
 * Caller must hold RTNL.
 */
1631
int fib_table_delete(struct fib_table *tb, struct fib_config *cfg)
1632 1633 1634
{
	struct trie *t = (struct trie *) tb->tb_data;
	u32 key, mask;
1635 1636
	int plen = cfg->fc_dst_len;
	u8 tos = cfg->fc_tos;
1637 1638 1639
	struct fib_alias *fa, *fa_to_delete;
	struct list_head *fa_head;
	struct leaf *l;
O
Olof Johansson 已提交
1640 1641
	struct leaf_info *li;

1642
	if (plen > 32)
1643 1644
		return -EINVAL;

1645
	key = ntohl(cfg->fc_dst);
O
Olof Johansson 已提交
1646
	mask = ntohl(inet_make_mask(plen));
1647

1648
	if (key & ~mask)
1649 1650 1651 1652 1653
		return -EINVAL;

	key = key & mask;
	l = fib_find_node(t, key);

1654
	if (!l)
1655 1656 1657 1658 1659 1660 1661 1662
		return -ESRCH;

	fa_head = get_fa_head(l, plen);
	fa = fib_find_alias(fa_head, tos, 0);

	if (!fa)
		return -ESRCH;

S
Stephen Hemminger 已提交
1663
	pr_debug("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t);
1664 1665

	fa_to_delete = NULL;
1666 1667
	fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
	list_for_each_entry_continue(fa, fa_head, fa_list) {
1668 1669 1670 1671 1672
		struct fib_info *fi = fa->fa_info;

		if (fa->fa_tos != tos)
			break;

1673 1674
		if ((!cfg->fc_type || fa->fa_type == cfg->fc_type) &&
		    (cfg->fc_scope == RT_SCOPE_NOWHERE ||
1675
		     fa->fa_info->fib_scope == cfg->fc_scope) &&
1676 1677
		    (!cfg->fc_prefsrc ||
		     fi->fib_prefsrc == cfg->fc_prefsrc) &&
1678 1679 1680
		    (!cfg->fc_protocol ||
		     fi->fib_protocol == cfg->fc_protocol) &&
		    fib_nh_match(cfg, fi) == 0) {
1681 1682 1683 1684 1685
			fa_to_delete = fa;
			break;
		}
	}

O
Olof Johansson 已提交
1686 1687
	if (!fa_to_delete)
		return -ESRCH;
1688

O
Olof Johansson 已提交
1689
	fa = fa_to_delete;
1690
	rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id,
1691
		  &cfg->fc_nlinfo, 0);
O
Olof Johansson 已提交
1692 1693

	l = fib_find_node(t, key);
R
Robert Olsson 已提交
1694
	li = find_leaf_info(l, plen);
1695

R
Robert Olsson 已提交
1696
	list_del_rcu(&fa->fa_list);
1697

1698 1699 1700
	if (!plen)
		tb->tb_num_default--;

O
Olof Johansson 已提交
1701
	if (list_empty(fa_head)) {
R
Robert Olsson 已提交
1702
		hlist_del_rcu(&li->hlist);
O
Olof Johansson 已提交
1703
		free_leaf_info(li);
R
Robert Olsson 已提交
1704
	}
1705

O
Olof Johansson 已提交
1706
	if (hlist_empty(&l->list))
1707
		trie_leaf_remove(t, l);
1708

O
Olof Johansson 已提交
1709
	if (fa->fa_state & FA_S_ACCESSED)
1710
		rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
1711

R
Robert Olsson 已提交
1712 1713
	fib_release_info(fa->fa_info);
	alias_free_mem_rcu(fa);
O
Olof Johansson 已提交
1714
	return 0;
1715 1716
}

1717
static int trie_flush_list(struct list_head *head)
1718 1719 1720 1721 1722 1723 1724
{
	struct fib_alias *fa, *fa_node;
	int found = 0;

	list_for_each_entry_safe(fa, fa_node, head, fa_list) {
		struct fib_info *fi = fa->fa_info;

R
Robert Olsson 已提交
1725 1726 1727 1728
		if (fi && (fi->fib_flags & RTNH_F_DEAD)) {
			list_del_rcu(&fa->fa_list);
			fib_release_info(fa->fa_info);
			alias_free_mem_rcu(fa);
1729 1730 1731 1732 1733 1734
			found++;
		}
	}
	return found;
}

1735
static int trie_flush_leaf(struct leaf *l)
1736 1737 1738 1739 1740 1741 1742
{
	int found = 0;
	struct hlist_head *lih = &l->list;
	struct hlist_node *node, *tmp;
	struct leaf_info *li = NULL;

	hlist_for_each_entry_safe(li, node, tmp, lih, hlist) {
1743
		found += trie_flush_list(&li->falh);
1744 1745

		if (list_empty(&li->falh)) {
R
Robert Olsson 已提交
1746
			hlist_del_rcu(&li->hlist);
1747 1748 1749 1750 1751 1752
			free_leaf_info(li);
		}
	}
	return found;
}

1753 1754 1755 1756
/*
 * Scan for the next right leaf starting at node p->child[idx]
 * Since we have back pointer, no recursion necessary.
 */
1757
static struct leaf *leaf_walk_rcu(struct tnode *p, struct rt_trie_node *c)
1758
{
1759 1760
	do {
		t_key idx;
1761 1762

		if (c)
1763
			idx = tkey_extract_bits(c->key, p->pos, p->bits) + 1;
1764
		else
1765
			idx = 0;
R
Robert Olsson 已提交
1766

1767 1768
		while (idx < 1u << p->bits) {
			c = tnode_get_child_rcu(p, idx++);
R
Robert Olsson 已提交
1769
			if (!c)
O
Olof Johansson 已提交
1770 1771
				continue;

1772
			if (IS_LEAF(c)) {
E
Eric Dumazet 已提交
1773
				prefetch(rcu_dereference_rtnl(p->child[idx]));
1774
				return (struct leaf *) c;
1775
			}
1776 1777 1778 1779

			/* Rescan start scanning in new node */
			p = (struct tnode *) c;
			idx = 0;
1780
		}
1781 1782

		/* Node empty, walk back up to parent */
1783
		c = (struct rt_trie_node *) p;
E
Eric Dumazet 已提交
1784
	} while ((p = node_parent_rcu(c)) != NULL);
1785 1786 1787 1788 1789 1790

	return NULL; /* Root of trie */
}

static struct leaf *trie_firstleaf(struct trie *t)
{
E
Eric Dumazet 已提交
1791
	struct tnode *n = (struct tnode *)rcu_dereference_rtnl(t->trie);
1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803

	if (!n)
		return NULL;

	if (IS_LEAF(n))          /* trie is just a leaf */
		return (struct leaf *) n;

	return leaf_walk_rcu(n, NULL);
}

static struct leaf *trie_nextleaf(struct leaf *l)
{
1804
	struct rt_trie_node *c = (struct rt_trie_node *) l;
1805
	struct tnode *p = node_parent_rcu(c);
1806 1807 1808 1809 1810

	if (!p)
		return NULL;	/* trie with just one leaf */

	return leaf_walk_rcu(p, c);
1811 1812
}

1813 1814 1815 1816
static struct leaf *trie_leafindex(struct trie *t, int index)
{
	struct leaf *l = trie_firstleaf(t);

S
Stephen Hemminger 已提交
1817
	while (l && index-- > 0)
1818
		l = trie_nextleaf(l);
S
Stephen Hemminger 已提交
1819

1820 1821 1822 1823
	return l;
}


1824 1825 1826
/*
 * Caller must hold RTNL.
 */
1827
int fib_table_flush(struct fib_table *tb)
1828 1829
{
	struct trie *t = (struct trie *) tb->tb_data;
1830
	struct leaf *l, *ll = NULL;
1831
	int found = 0;
1832

1833
	for (l = trie_firstleaf(t); l; l = trie_nextleaf(l)) {
1834
		found += trie_flush_leaf(l);
1835 1836

		if (ll && hlist_empty(&ll->list))
1837
			trie_leaf_remove(t, ll);
1838 1839 1840 1841
		ll = l;
	}

	if (ll && hlist_empty(&ll->list))
1842
		trie_leaf_remove(t, ll);
1843

S
Stephen Hemminger 已提交
1844
	pr_debug("trie_flush found=%d\n", found);
1845 1846 1847
	return found;
}

1848 1849 1850 1851 1852
void fib_free_table(struct fib_table *tb)
{
	kfree(tb);
}

1853 1854
static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah,
			   struct fib_table *tb,
1855 1856 1857 1858
			   struct sk_buff *skb, struct netlink_callback *cb)
{
	int i, s_i;
	struct fib_alias *fa;
A
Al Viro 已提交
1859
	__be32 xkey = htonl(key);
1860

1861
	s_i = cb->args[5];
1862 1863
	i = 0;

R
Robert Olsson 已提交
1864 1865 1866
	/* rcu_read_lock is hold by caller */

	list_for_each_entry_rcu(fa, fah, fa_list) {
1867 1868 1869 1870 1871 1872 1873 1874 1875 1876
		if (i < s_i) {
			i++;
			continue;
		}

		if (fib_dump_info(skb, NETLINK_CB(cb->skb).pid,
				  cb->nlh->nlmsg_seq,
				  RTM_NEWROUTE,
				  tb->tb_id,
				  fa->fa_type,
1877
				  xkey,
1878 1879
				  plen,
				  fa->fa_tos,
1880
				  fa->fa_info, NLM_F_MULTI) < 0) {
1881
			cb->args[5] = i;
1882
			return -1;
O
Olof Johansson 已提交
1883
		}
1884 1885
		i++;
	}
1886
	cb->args[5] = i;
1887 1888 1889
	return skb->len;
}

1890 1891
static int fn_trie_dump_leaf(struct leaf *l, struct fib_table *tb,
			struct sk_buff *skb, struct netlink_callback *cb)
1892
{
1893 1894 1895
	struct leaf_info *li;
	struct hlist_node *node;
	int i, s_i;
1896

1897
	s_i = cb->args[4];
1898
	i = 0;
1899

1900 1901 1902 1903
	/* rcu_read_lock is hold by caller */
	hlist_for_each_entry_rcu(li, node, &l->list, hlist) {
		if (i < s_i) {
			i++;
1904
			continue;
1905
		}
O
Olof Johansson 已提交
1906

1907
		if (i > s_i)
1908
			cb->args[5] = 0;
1909

1910
		if (list_empty(&li->falh))
1911 1912
			continue;

1913
		if (fn_trie_dump_fa(l->key, li->plen, &li->falh, tb, skb, cb) < 0) {
1914
			cb->args[4] = i;
1915 1916
			return -1;
		}
1917
		i++;
1918
	}
1919

1920
	cb->args[4] = i;
1921 1922 1923
	return skb->len;
}

1924 1925
int fib_table_dump(struct fib_table *tb, struct sk_buff *skb,
		   struct netlink_callback *cb)
1926
{
1927
	struct leaf *l;
1928
	struct trie *t = (struct trie *) tb->tb_data;
1929
	t_key key = cb->args[2];
1930
	int count = cb->args[3];
1931

R
Robert Olsson 已提交
1932
	rcu_read_lock();
1933 1934 1935
	/* Dump starting at last key.
	 * Note: 0.0.0.0/0 (ie default) is first key.
	 */
1936
	if (count == 0)
1937 1938
		l = trie_firstleaf(t);
	else {
1939 1940 1941
		/* Normally, continue from last key, but if that is missing
		 * fallback to using slow rescan
		 */
1942
		l = fib_find_node(t, key);
1943 1944
		if (!l)
			l = trie_leafindex(t, count);
1945
	}
1946

1947 1948
	while (l) {
		cb->args[2] = l->key;
1949
		if (fn_trie_dump_leaf(l, tb, skb, cb) < 0) {
1950
			cb->args[3] = count;
1951 1952
			rcu_read_unlock();
			return -1;
1953
		}
1954

1955
		++count;
1956
		l = trie_nextleaf(l);
1957 1958
		memset(&cb->args[4], 0,
		       sizeof(cb->args) - 4*sizeof(cb->args[0]));
1959
	}
1960
	cb->args[3] = count;
R
Robert Olsson 已提交
1961
	rcu_read_unlock();
1962

1963 1964 1965
	return skb->len;
}

1966
void __init fib_trie_init(void)
1967
{
1968 1969
	fn_alias_kmem = kmem_cache_create("ip_fib_alias",
					  sizeof(struct fib_alias),
1970 1971 1972 1973 1974 1975
					  0, SLAB_PANIC, NULL);

	trie_leaf_kmem = kmem_cache_create("ip_fib_trie",
					   max(sizeof(struct leaf),
					       sizeof(struct leaf_info)),
					   0, SLAB_PANIC, NULL);
1976
}
1977

1978

1979
struct fib_table *fib_trie_table(u32 id)
1980 1981 1982 1983 1984 1985 1986 1987 1988 1989
{
	struct fib_table *tb;
	struct trie *t;

	tb = kmalloc(sizeof(struct fib_table) + sizeof(struct trie),
		     GFP_KERNEL);
	if (tb == NULL)
		return NULL;

	tb->tb_id = id;
1990
	tb->tb_default = -1;
1991
	tb->tb_num_default = 0;
1992 1993

	t = (struct trie *) tb->tb_data;
1994
	memset(t, 0, sizeof(*t));
1995 1996 1997 1998

	return tb;
}

1999 2000 2001
#ifdef CONFIG_PROC_FS
/* Depth first Trie walk iterator */
struct fib_trie_iter {
2002
	struct seq_net_private p;
2003
	struct fib_table *tb;
2004
	struct tnode *tnode;
E
Eric Dumazet 已提交
2005 2006
	unsigned int index;
	unsigned int depth;
2007
};
2008

2009
static struct rt_trie_node *fib_trie_get_next(struct fib_trie_iter *iter)
2010
{
2011
	struct tnode *tn = iter->tnode;
E
Eric Dumazet 已提交
2012
	unsigned int cindex = iter->index;
2013
	struct tnode *p;
2014

2015 2016 2017 2018
	/* A single entry routing table */
	if (!tn)
		return NULL;

2019 2020 2021 2022
	pr_debug("get_next iter={node=%p index=%d depth=%d}\n",
		 iter->tnode, iter->index, iter->depth);
rescan:
	while (cindex < (1<<tn->bits)) {
2023
		struct rt_trie_node *n = tnode_get_child_rcu(tn, cindex);
2024

2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036
		if (n) {
			if (IS_LEAF(n)) {
				iter->tnode = tn;
				iter->index = cindex + 1;
			} else {
				/* push down one level */
				iter->tnode = (struct tnode *) n;
				iter->index = 0;
				++iter->depth;
			}
			return n;
		}
2037

2038 2039
		++cindex;
	}
O
Olof Johansson 已提交
2040

2041
	/* Current node exhausted, pop back up */
2042
	p = node_parent_rcu((struct rt_trie_node *)tn);
2043 2044 2045 2046 2047
	if (p) {
		cindex = tkey_extract_bits(tn->key, p->pos, p->bits)+1;
		tn = p;
		--iter->depth;
		goto rescan;
2048
	}
2049 2050 2051

	/* got root? */
	return NULL;
2052 2053
}

2054
static struct rt_trie_node *fib_trie_get_first(struct fib_trie_iter *iter,
2055
				       struct trie *t)
2056
{
2057
	struct rt_trie_node *n;
2058

S
Stephen Hemminger 已提交
2059
	if (!t)
2060 2061 2062
		return NULL;

	n = rcu_dereference(t->trie);
2063
	if (!n)
2064
		return NULL;
2065

2066 2067 2068 2069 2070 2071 2072 2073
	if (IS_TNODE(n)) {
		iter->tnode = (struct tnode *) n;
		iter->index = 0;
		iter->depth = 1;
	} else {
		iter->tnode = NULL;
		iter->index = 0;
		iter->depth = 0;
O
Olof Johansson 已提交
2074
	}
2075 2076

	return n;
2077
}
O
Olof Johansson 已提交
2078

2079 2080
static void trie_collect_stats(struct trie *t, struct trie_stat *s)
{
2081
	struct rt_trie_node *n;
2082
	struct fib_trie_iter iter;
O
Olof Johansson 已提交
2083

2084
	memset(s, 0, sizeof(*s));
O
Olof Johansson 已提交
2085

2086
	rcu_read_lock();
2087
	for (n = fib_trie_get_first(&iter, t); n; n = fib_trie_get_next(&iter)) {
2088
		if (IS_LEAF(n)) {
2089 2090 2091 2092
			struct leaf *l = (struct leaf *)n;
			struct leaf_info *li;
			struct hlist_node *tmp;

2093 2094 2095 2096
			s->leaves++;
			s->totdepth += iter.depth;
			if (iter.depth > s->maxdepth)
				s->maxdepth = iter.depth;
2097 2098 2099

			hlist_for_each_entry_rcu(li, tmp, &l->list, hlist)
				++s->prefixes;
2100 2101 2102 2103 2104
		} else {
			const struct tnode *tn = (const struct tnode *) n;
			int i;

			s->tnodes++;
S
Stephen Hemminger 已提交
2105
			if (tn->bits < MAX_STAT_DEPTH)
R
Robert Olsson 已提交
2106 2107
				s->nodesizes[tn->bits]++;

2108 2109 2110
			for (i = 0; i < (1<<tn->bits); i++)
				if (!tn->child[i])
					s->nullpointers++;
2111 2112
		}
	}
R
Robert Olsson 已提交
2113
	rcu_read_unlock();
2114 2115
}

2116 2117 2118 2119
/*
 *	This outputs /proc/net/fib_triestats
 */
static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat)
2120
{
E
Eric Dumazet 已提交
2121
	unsigned int i, max, pointers, bytes, avdepth;
2122

2123 2124 2125 2126
	if (stat->leaves)
		avdepth = stat->totdepth*100 / stat->leaves;
	else
		avdepth = 0;
O
Olof Johansson 已提交
2127

2128 2129
	seq_printf(seq, "\tAver depth:     %u.%02d\n",
		   avdepth / 100, avdepth % 100);
2130
	seq_printf(seq, "\tMax depth:      %u\n", stat->maxdepth);
O
Olof Johansson 已提交
2131

2132 2133
	seq_printf(seq, "\tLeaves:         %u\n", stat->leaves);
	bytes = sizeof(struct leaf) * stat->leaves;
2134 2135 2136 2137

	seq_printf(seq, "\tPrefixes:       %u\n", stat->prefixes);
	bytes += sizeof(struct leaf_info) * stat->prefixes;

2138
	seq_printf(seq, "\tInternal nodes: %u\n\t", stat->tnodes);
2139
	bytes += sizeof(struct tnode) * stat->tnodes;
2140

R
Robert Olsson 已提交
2141 2142
	max = MAX_STAT_DEPTH;
	while (max > 0 && stat->nodesizes[max-1] == 0)
2143
		max--;
2144

2145 2146 2147
	pointers = 0;
	for (i = 1; i <= max; i++)
		if (stat->nodesizes[i] != 0) {
2148
			seq_printf(seq, "  %u: %u",  i, stat->nodesizes[i]);
2149 2150 2151
			pointers += (1<<i) * stat->nodesizes[i];
		}
	seq_putc(seq, '\n');
2152
	seq_printf(seq, "\tPointers: %u\n", pointers);
R
Robert Olsson 已提交
2153

2154
	bytes += sizeof(struct rt_trie_node *) * pointers;
2155 2156
	seq_printf(seq, "Null ptrs: %u\n", stat->nullpointers);
	seq_printf(seq, "Total size: %u  kB\n", (bytes + 1023) / 1024);
2157
}
R
Robert Olsson 已提交
2158

2159
#ifdef CONFIG_IP_FIB_TRIE_STATS
2160 2161 2162 2163
static void trie_show_usage(struct seq_file *seq,
			    const struct trie_use_stats *stats)
{
	seq_printf(seq, "\nCounters:\n---------\n");
2164 2165 2166 2167 2168 2169 2170 2171 2172
	seq_printf(seq, "gets = %u\n", stats->gets);
	seq_printf(seq, "backtracks = %u\n", stats->backtrack);
	seq_printf(seq, "semantic match passed = %u\n",
		   stats->semantic_match_passed);
	seq_printf(seq, "semantic match miss = %u\n",
		   stats->semantic_match_miss);
	seq_printf(seq, "null node hit= %u\n", stats->null_node_hit);
	seq_printf(seq, "skipped node resize = %u\n\n",
		   stats->resize_node_skipped);
2173
}
2174 2175
#endif /*  CONFIG_IP_FIB_TRIE_STATS */

2176
static void fib_table_print(struct seq_file *seq, struct fib_table *tb)
2177
{
2178 2179 2180 2181 2182 2183
	if (tb->tb_id == RT_TABLE_LOCAL)
		seq_puts(seq, "Local:\n");
	else if (tb->tb_id == RT_TABLE_MAIN)
		seq_puts(seq, "Main:\n");
	else
		seq_printf(seq, "Id %d:\n", tb->tb_id);
2184
}
2185

2186

2187 2188
static int fib_triestat_seq_show(struct seq_file *seq, void *v)
{
2189
	struct net *net = (struct net *)seq->private;
2190
	unsigned int h;
2191

2192
	seq_printf(seq,
2193 2194
		   "Basic info: size of leaf:"
		   " %Zd bytes, size of tnode: %Zd bytes.\n",
2195 2196
		   sizeof(struct leaf), sizeof(struct tnode));

2197 2198 2199 2200 2201 2202 2203 2204
	for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
		struct hlist_head *head = &net->ipv4.fib_table_hash[h];
		struct hlist_node *node;
		struct fib_table *tb;

		hlist_for_each_entry_rcu(tb, node, head, tb_hlist) {
			struct trie *t = (struct trie *) tb->tb_data;
			struct trie_stat stat;
2205

2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217
			if (!t)
				continue;

			fib_table_print(seq, tb);

			trie_collect_stats(t, &stat);
			trie_show_stats(seq, &stat);
#ifdef CONFIG_IP_FIB_TRIE_STATS
			trie_show_usage(seq, &t->stats);
#endif
		}
	}
2218

2219
	return 0;
2220 2221
}

2222
static int fib_triestat_seq_open(struct inode *inode, struct file *file)
2223
{
2224
	return single_open_net(inode, file, fib_triestat_seq_show);
2225 2226
}

2227
static const struct file_operations fib_triestat_fops = {
2228 2229 2230 2231
	.owner	= THIS_MODULE,
	.open	= fib_triestat_seq_open,
	.read	= seq_read,
	.llseek	= seq_lseek,
2232
	.release = single_release_net,
2233 2234
};

2235
static struct rt_trie_node *fib_trie_get_idx(struct seq_file *seq, loff_t pos)
2236
{
2237 2238
	struct fib_trie_iter *iter = seq->private;
	struct net *net = seq_file_net(seq);
2239
	loff_t idx = 0;
2240
	unsigned int h;
2241

2242 2243 2244 2245
	for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
		struct hlist_head *head = &net->ipv4.fib_table_hash[h];
		struct hlist_node *node;
		struct fib_table *tb;
2246

2247
		hlist_for_each_entry_rcu(tb, node, head, tb_hlist) {
2248
			struct rt_trie_node *n;
2249 2250 2251 2252 2253 2254 2255 2256 2257

			for (n = fib_trie_get_first(iter,
						    (struct trie *) tb->tb_data);
			     n; n = fib_trie_get_next(iter))
				if (pos == idx++) {
					iter->tb = tb;
					return n;
				}
		}
2258
	}
2259

2260 2261 2262
	return NULL;
}

2263
static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos)
2264
	__acquires(RCU)
2265
{
2266
	rcu_read_lock();
2267
	return fib_trie_get_idx(seq, *pos);
2268 2269
}

2270
static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2271
{
2272
	struct fib_trie_iter *iter = seq->private;
2273
	struct net *net = seq_file_net(seq);
2274 2275 2276
	struct fib_table *tb = iter->tb;
	struct hlist_node *tb_node;
	unsigned int h;
2277
	struct rt_trie_node *n;
2278

2279
	++*pos;
2280 2281 2282 2283
	/* next node in same table */
	n = fib_trie_get_next(iter);
	if (n)
		return n;
2284

2285 2286
	/* walk rest of this hash chain */
	h = tb->tb_id & (FIB_TABLE_HASHSZ - 1);
E
Eric Dumazet 已提交
2287
	while ((tb_node = rcu_dereference(hlist_next_rcu(&tb->tb_hlist)))) {
2288 2289 2290 2291 2292
		tb = hlist_entry(tb_node, struct fib_table, tb_hlist);
		n = fib_trie_get_first(iter, (struct trie *) tb->tb_data);
		if (n)
			goto found;
	}
2293

2294 2295 2296 2297 2298 2299 2300 2301 2302
	/* new hash chain */
	while (++h < FIB_TABLE_HASHSZ) {
		struct hlist_head *head = &net->ipv4.fib_table_hash[h];
		hlist_for_each_entry_rcu(tb, tb_node, head, tb_hlist) {
			n = fib_trie_get_first(iter, (struct trie *) tb->tb_data);
			if (n)
				goto found;
		}
	}
2303
	return NULL;
2304 2305 2306 2307

found:
	iter->tb = tb;
	return n;
2308
}
2309

2310
static void fib_trie_seq_stop(struct seq_file *seq, void *v)
2311
	__releases(RCU)
2312
{
2313 2314
	rcu_read_unlock();
}
O
Olof Johansson 已提交
2315

2316 2317
static void seq_indent(struct seq_file *seq, int n)
{
E
Eric Dumazet 已提交
2318 2319
	while (n-- > 0)
		seq_puts(seq, "   ");
2320
}
2321

2322
static inline const char *rtn_scope(char *buf, size_t len, enum rt_scope_t s)
2323
{
S
Stephen Hemminger 已提交
2324
	switch (s) {
2325 2326 2327 2328 2329 2330
	case RT_SCOPE_UNIVERSE: return "universe";
	case RT_SCOPE_SITE:	return "site";
	case RT_SCOPE_LINK:	return "link";
	case RT_SCOPE_HOST:	return "host";
	case RT_SCOPE_NOWHERE:	return "nowhere";
	default:
2331
		snprintf(buf, len, "scope=%d", s);
2332 2333 2334
		return buf;
	}
}
2335

2336
static const char *const rtn_type_names[__RTN_MAX] = {
2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349
	[RTN_UNSPEC] = "UNSPEC",
	[RTN_UNICAST] = "UNICAST",
	[RTN_LOCAL] = "LOCAL",
	[RTN_BROADCAST] = "BROADCAST",
	[RTN_ANYCAST] = "ANYCAST",
	[RTN_MULTICAST] = "MULTICAST",
	[RTN_BLACKHOLE] = "BLACKHOLE",
	[RTN_UNREACHABLE] = "UNREACHABLE",
	[RTN_PROHIBIT] = "PROHIBIT",
	[RTN_THROW] = "THROW",
	[RTN_NAT] = "NAT",
	[RTN_XRESOLVE] = "XRESOLVE",
};
2350

E
Eric Dumazet 已提交
2351
static inline const char *rtn_type(char *buf, size_t len, unsigned int t)
2352 2353 2354
{
	if (t < __RTN_MAX && rtn_type_names[t])
		return rtn_type_names[t];
2355
	snprintf(buf, len, "type %u", t);
2356
	return buf;
2357 2358
}

2359 2360
/* Pretty print the trie */
static int fib_trie_seq_show(struct seq_file *seq, void *v)
2361
{
2362
	const struct fib_trie_iter *iter = seq->private;
2363
	struct rt_trie_node *n = v;
2364

2365 2366
	if (!node_parent_rcu(n))
		fib_table_print(seq, iter->tb);
2367

2368 2369
	if (IS_TNODE(n)) {
		struct tnode *tn = (struct tnode *) n;
S
Stephen Hemminger 已提交
2370
		__be32 prf = htonl(mask_pfx(tn->key, tn->pos));
O
Olof Johansson 已提交
2371

2372
		seq_indent(seq, iter->depth-1);
2373 2374
		seq_printf(seq, "  +-- %pI4/%d %d %d %d\n",
			   &prf, tn->pos, tn->bits, tn->full_children,
2375
			   tn->empty_children);
2376

2377 2378
	} else {
		struct leaf *l = (struct leaf *) n;
2379 2380
		struct leaf_info *li;
		struct hlist_node *node;
A
Al Viro 已提交
2381
		__be32 val = htonl(l->key);
2382 2383

		seq_indent(seq, iter->depth);
2384
		seq_printf(seq, "  |-- %pI4\n", &val);
2385 2386 2387 2388 2389 2390 2391 2392 2393 2394

		hlist_for_each_entry_rcu(li, node, &l->list, hlist) {
			struct fib_alias *fa;

			list_for_each_entry_rcu(fa, &li->falh, fa_list) {
				char buf1[32], buf2[32];

				seq_indent(seq, iter->depth+1);
				seq_printf(seq, "  /%d %s %s", li->plen,
					   rtn_scope(buf1, sizeof(buf1),
2395
						     fa->fa_info->fib_scope),
2396 2397 2398
					   rtn_type(buf2, sizeof(buf2),
						    fa->fa_type));
				if (fa->fa_tos)
2399
					seq_printf(seq, " tos=%d", fa->fa_tos);
2400
				seq_putc(seq, '\n');
2401 2402
			}
		}
2403
	}
2404

2405 2406 2407
	return 0;
}

2408
static const struct seq_operations fib_trie_seq_ops = {
2409 2410 2411 2412
	.start  = fib_trie_seq_start,
	.next   = fib_trie_seq_next,
	.stop   = fib_trie_seq_stop,
	.show   = fib_trie_seq_show,
2413 2414
};

2415
static int fib_trie_seq_open(struct inode *inode, struct file *file)
2416
{
2417 2418
	return seq_open_net(inode, file, &fib_trie_seq_ops,
			    sizeof(struct fib_trie_iter));
2419 2420
}

2421
static const struct file_operations fib_trie_fops = {
2422 2423 2424 2425
	.owner  = THIS_MODULE,
	.open   = fib_trie_seq_open,
	.read   = seq_read,
	.llseek = seq_lseek,
2426
	.release = seq_release_net,
2427 2428
};

2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468
struct fib_route_iter {
	struct seq_net_private p;
	struct trie *main_trie;
	loff_t	pos;
	t_key	key;
};

static struct leaf *fib_route_get_idx(struct fib_route_iter *iter, loff_t pos)
{
	struct leaf *l = NULL;
	struct trie *t = iter->main_trie;

	/* use cache location of last found key */
	if (iter->pos > 0 && pos >= iter->pos && (l = fib_find_node(t, iter->key)))
		pos -= iter->pos;
	else {
		iter->pos = 0;
		l = trie_firstleaf(t);
	}

	while (l && pos-- > 0) {
		iter->pos++;
		l = trie_nextleaf(l);
	}

	if (l)
		iter->key = pos;	/* remember it */
	else
		iter->pos = 0;		/* forget it */

	return l;
}

static void *fib_route_seq_start(struct seq_file *seq, loff_t *pos)
	__acquires(RCU)
{
	struct fib_route_iter *iter = seq->private;
	struct fib_table *tb;

	rcu_read_lock();
2469
	tb = fib_get_table(seq_file_net(seq), RT_TABLE_MAIN);
2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506
	if (!tb)
		return NULL;

	iter->main_trie = (struct trie *) tb->tb_data;
	if (*pos == 0)
		return SEQ_START_TOKEN;
	else
		return fib_route_get_idx(iter, *pos - 1);
}

static void *fib_route_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct fib_route_iter *iter = seq->private;
	struct leaf *l = v;

	++*pos;
	if (v == SEQ_START_TOKEN) {
		iter->pos = 0;
		l = trie_firstleaf(iter->main_trie);
	} else {
		iter->pos++;
		l = trie_nextleaf(l);
	}

	if (l)
		iter->key = l->key;
	else
		iter->pos = 0;
	return l;
}

static void fib_route_seq_stop(struct seq_file *seq, void *v)
	__releases(RCU)
{
	rcu_read_unlock();
}

E
Eric Dumazet 已提交
2507
static unsigned int fib_flag_trans(int type, __be32 mask, const struct fib_info *fi)
2508
{
E
Eric Dumazet 已提交
2509
	unsigned int flags = 0;
2510

E
Eric Dumazet 已提交
2511 2512
	if (type == RTN_UNREACHABLE || type == RTN_PROHIBIT)
		flags = RTF_REJECT;
2513 2514
	if (fi && fi->fib_nh->nh_gw)
		flags |= RTF_GATEWAY;
A
Al Viro 已提交
2515
	if (mask == htonl(0xFFFFFFFF))
2516 2517 2518
		flags |= RTF_HOST;
	flags |= RTF_UP;
	return flags;
2519 2520
}

2521 2522 2523
/*
 *	This outputs /proc/net/route.
 *	The format of the file is not supposed to be changed
E
Eric Dumazet 已提交
2524
 *	and needs to be same as fib_hash output to avoid breaking
2525 2526 2527
 *	legacy utilities
 */
static int fib_route_seq_show(struct seq_file *seq, void *v)
2528
{
2529
	struct leaf *l = v;
2530 2531
	struct leaf_info *li;
	struct hlist_node *node;
2532

2533 2534 2535 2536 2537 2538
	if (v == SEQ_START_TOKEN) {
		seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway "
			   "\tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU"
			   "\tWindow\tIRTT");
		return 0;
	}
2539

2540
	hlist_for_each_entry_rcu(li, node, &l->list, hlist) {
2541
		struct fib_alias *fa;
A
Al Viro 已提交
2542
		__be32 mask, prefix;
O
Olof Johansson 已提交
2543

2544 2545
		mask = inet_make_mask(li->plen);
		prefix = htonl(l->key);
2546

2547
		list_for_each_entry_rcu(fa, &li->falh, fa_list) {
2548
			const struct fib_info *fi = fa->fa_info;
E
Eric Dumazet 已提交
2549
			unsigned int flags = fib_flag_trans(fa->fa_type, mask, fi);
2550
			int len;
2551

2552 2553 2554
			if (fa->fa_type == RTN_BROADCAST
			    || fa->fa_type == RTN_MULTICAST)
				continue;
2555

2556
			if (fi)
2557 2558 2559
				seq_printf(seq,
					 "%s\t%08X\t%08X\t%04X\t%d\t%u\t"
					 "%d\t%08X\t%d\t%u\t%u%n",
2560 2561 2562 2563 2564
					 fi->fib_dev ? fi->fib_dev->name : "*",
					 prefix,
					 fi->fib_nh->nh_gw, flags, 0, 0,
					 fi->fib_priority,
					 mask,
2565 2566
					 (fi->fib_advmss ?
					  fi->fib_advmss + 40 : 0),
2567
					 fi->fib_window,
2568
					 fi->fib_rtt >> 3, &len);
2569
			else
2570 2571 2572
				seq_printf(seq,
					 "*\t%08X\t%08X\t%04X\t%d\t%u\t"
					 "%d\t%08X\t%d\t%u\t%u%n",
2573
					 prefix, 0, flags, 0, 0, 0,
2574
					 mask, 0, 0, 0, &len);
2575

2576
			seq_printf(seq, "%*s\n", 127 - len, "");
2577
		}
2578 2579 2580 2581 2582
	}

	return 0;
}

2583
static const struct seq_operations fib_route_seq_ops = {
2584 2585 2586
	.start  = fib_route_seq_start,
	.next   = fib_route_seq_next,
	.stop   = fib_route_seq_stop,
2587
	.show   = fib_route_seq_show,
2588 2589
};

2590
static int fib_route_seq_open(struct inode *inode, struct file *file)
2591
{
2592
	return seq_open_net(inode, file, &fib_route_seq_ops,
2593
			    sizeof(struct fib_route_iter));
2594 2595
}

2596
static const struct file_operations fib_route_fops = {
2597 2598 2599 2600
	.owner  = THIS_MODULE,
	.open   = fib_route_seq_open,
	.read   = seq_read,
	.llseek = seq_lseek,
2601
	.release = seq_release_net,
2602 2603
};

2604
int __net_init fib_proc_init(struct net *net)
2605
{
2606
	if (!proc_net_fops_create(net, "fib_trie", S_IRUGO, &fib_trie_fops))
2607 2608
		goto out1;

2609 2610
	if (!proc_net_fops_create(net, "fib_triestat", S_IRUGO,
				  &fib_triestat_fops))
2611 2612
		goto out2;

2613
	if (!proc_net_fops_create(net, "route", S_IRUGO, &fib_route_fops))
2614 2615
		goto out3;

2616
	return 0;
2617 2618

out3:
2619
	proc_net_remove(net, "fib_triestat");
2620
out2:
2621
	proc_net_remove(net, "fib_trie");
2622 2623
out1:
	return -ENOMEM;
2624 2625
}

2626
void __net_exit fib_proc_exit(struct net *net)
2627
{
2628 2629 2630
	proc_net_remove(net, "fib_trie");
	proc_net_remove(net, "fib_triestat");
	proc_net_remove(net, "route");
2631 2632 2633
}

#endif /* CONFIG_PROC_FS */