fib_trie.c 60.9 KB
Newer Older
1 2 3 4 5 6 7 8 9
/*
 *   This program is free software; you can redistribute it and/or
 *   modify it under the terms of the GNU General Public License
 *   as published by the Free Software Foundation; either version
 *   2 of the License, or (at your option) any later version.
 *
 *   Robert Olsson <robert.olsson@its.uu.se> Uppsala Universitet
 *     & Swedish University of Agricultural Sciences.
 *
10
 *   Jens Laas <jens.laas@data.slu.se> Swedish University of
11
 *     Agricultural Sciences.
12
 *
13 14
 *   Hans Liss <hans.liss@its.uu.se>  Uppsala Universitet
 *
L
Lucas De Marchi 已提交
15
 * This work is based on the LPC-trie which is originally described in:
16
 *
17 18
 * An experimental study of compression methods for dynamic tries
 * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
19
 * http://www.csc.kth.se/~snilsson/software/dyntrie2/
20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41
 *
 *
 * IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
 * IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999
 *
 *
 * Code from fib_hash has been reused which includes the following header:
 *
 *
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		IPv4 FIB: lookup engine and maintenance routines.
 *
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
R
Robert Olsson 已提交
42 43 44 45 46 47 48
 *
 * Substantial contributions to this work comes from:
 *
 *		David S. Miller, <davem@davemloft.net>
 *		Stephen Hemminger <shemminger@osdl.org>
 *		Paul E. McKenney <paulmck@us.ibm.com>
 *		Patrick McHardy <kaber@trash.net>
49 50
 */

J
Jens Låås 已提交
51
#define VERSION "0.409"
52 53 54

#include <asm/uaccess.h>
#include <asm/system.h>
J
Jiri Slaby 已提交
55
#include <linux/bitops.h>
56 57 58 59 60 61 62 63 64
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
S
Stephen Hemminger 已提交
65
#include <linux/inetdevice.h>
66 67 68
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/proc_fs.h>
R
Robert Olsson 已提交
69
#include <linux/rcupdate.h>
70 71 72 73
#include <linux/skbuff.h>
#include <linux/netlink.h>
#include <linux/init.h>
#include <linux/list.h>
74
#include <linux/slab.h>
75
#include <net/net_namespace.h>
76 77 78 79 80 81 82 83
#include <net/ip.h>
#include <net/protocol.h>
#include <net/route.h>
#include <net/tcp.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include "fib_lookup.h"

R
Robert Olsson 已提交
84
#define MAX_STAT_DEPTH 32
85 86 87 88 89 90 91 92

#define KEYLENGTH (8*sizeof(t_key))

typedef unsigned int t_key;

#define T_TNODE 0
#define T_LEAF  1
#define NODE_TYPE_MASK	0x1UL
R
Robert Olsson 已提交
93 94
#define NODE_TYPE(node) ((node)->parent & NODE_TYPE_MASK)

O
Olof Johansson 已提交
95 96
#define IS_TNODE(n) (!(n->parent & T_LEAF))
#define IS_LEAF(n) (n->parent & T_LEAF)
97

98
struct rt_trie_node {
O
Olof Johansson 已提交
99
	unsigned long parent;
100
	t_key key;
101 102 103
};

struct leaf {
O
Olof Johansson 已提交
104
	unsigned long parent;
105
	t_key key;
106
	struct hlist_head list;
R
Robert Olsson 已提交
107
	struct rcu_head rcu;
108 109 110 111
};

struct leaf_info {
	struct hlist_node hlist;
R
Robert Olsson 已提交
112
	struct rcu_head rcu;
113 114 115 116 117
	int plen;
	struct list_head falh;
};

struct tnode {
O
Olof Johansson 已提交
118
	unsigned long parent;
119
	t_key key;
120 121
	unsigned char pos;		/* 2log(KEYLENGTH) bits needed */
	unsigned char bits;		/* 2log(KEYLENGTH) bits needed */
122 123
	unsigned int full_children;	/* KEYLENGTH bits needed */
	unsigned int empty_children;	/* KEYLENGTH bits needed */
124 125 126
	union {
		struct rcu_head rcu;
		struct work_struct work;
J
Jarek Poplawski 已提交
127
		struct tnode *tnode_free;
128
	};
129
	struct rt_trie_node *child[0];
130 131 132 133 134 135 136 137 138
};

#ifdef CONFIG_IP_FIB_TRIE_STATS
struct trie_use_stats {
	unsigned int gets;
	unsigned int backtrack;
	unsigned int semantic_match_passed;
	unsigned int semantic_match_miss;
	unsigned int null_node_hit;
139
	unsigned int resize_node_skipped;
140 141 142 143 144 145 146 147 148
};
#endif

struct trie_stat {
	unsigned int totdepth;
	unsigned int maxdepth;
	unsigned int tnodes;
	unsigned int leaves;
	unsigned int nullpointers;
149
	unsigned int prefixes;
R
Robert Olsson 已提交
150
	unsigned int nodesizes[MAX_STAT_DEPTH];
151
};
152 153

struct trie {
154
	struct rt_trie_node *trie;
155 156 157 158 159
#ifdef CONFIG_IP_FIB_TRIE_STATS
	struct trie_use_stats stats;
#endif
};

160 161
static void put_child(struct trie *t, struct tnode *tn, int i, struct rt_trie_node *n);
static void tnode_put_child_reorg(struct tnode *tn, int i, struct rt_trie_node *n,
162
				  int wasfull);
163
static struct rt_trie_node *resize(struct trie *t, struct tnode *tn);
164 165
static struct tnode *inflate(struct trie *t, struct tnode *tn);
static struct tnode *halve(struct trie *t, struct tnode *tn);
J
Jarek Poplawski 已提交
166 167
/* tnodes to free after resize(); protected by RTNL */
static struct tnode *tnode_free_head;
168 169 170 171 172 173 174 175
static size_t tnode_free_size;

/*
 * synchronize_rcu after call_rcu for that many pages; it should be especially
 * useful before resizing the root node with PREEMPT_NONE configs; the value was
 * obtained experimentally, aiming to avoid visible slowdown.
 */
static const int sync_pages = 128;
176

177
static struct kmem_cache *fn_alias_kmem __read_mostly;
178
static struct kmem_cache *trie_leaf_kmem __read_mostly;
179

180
static inline struct tnode *node_parent(struct rt_trie_node *node)
S
Stephen Hemminger 已提交
181
{
182 183 184
	return (struct tnode *)(node->parent & ~NODE_TYPE_MASK);
}

185
static inline struct tnode *node_parent_rcu(struct rt_trie_node *node)
186 187
{
	struct tnode *ret = node_parent(node);
S
Stephen Hemminger 已提交
188

E
Eric Dumazet 已提交
189
	return rcu_dereference_rtnl(ret);
S
Stephen Hemminger 已提交
190 191
}

192 193 194
/* Same as rcu_assign_pointer
 * but that macro() assumes that value is a pointer.
 */
195
static inline void node_set_parent(struct rt_trie_node *node, struct tnode *ptr)
S
Stephen Hemminger 已提交
196
{
197 198
	smp_wmb();
	node->parent = (unsigned long)ptr | NODE_TYPE(node);
S
Stephen Hemminger 已提交
199
}
R
Robert Olsson 已提交
200

201
static inline struct rt_trie_node *tnode_get_child(struct tnode *tn, unsigned int i)
202 203
{
	BUG_ON(i >= 1U << tn->bits);
R
Robert Olsson 已提交
204

205 206 207
	return tn->child[i];
}

208
static inline struct rt_trie_node *tnode_get_child_rcu(struct tnode *tn, unsigned int i)
209
{
210
	struct rt_trie_node *ret = tnode_get_child(tn, i);
211

E
Eric Dumazet 已提交
212
	return rcu_dereference_rtnl(ret);
213 214
}

S
Stephen Hemmigner 已提交
215
static inline int tnode_child_length(const struct tnode *tn)
216
{
O
Olof Johansson 已提交
217
	return 1 << tn->bits;
218 219
}

220
static inline t_key mask_pfx(t_key k, unsigned int l)
S
Stephen Hemminger 已提交
221 222 223 224
{
	return (l == 0) ? 0 : k >> (KEYLENGTH-l) << (KEYLENGTH-l);
}

225
static inline t_key tkey_extract_bits(t_key a, unsigned int offset, unsigned int bits)
226
{
O
Olof Johansson 已提交
227
	if (offset < KEYLENGTH)
228
		return ((t_key)(a << offset)) >> (KEYLENGTH - bits);
O
Olof Johansson 已提交
229
	else
230 231 232 233 234
		return 0;
}

static inline int tkey_equals(t_key a, t_key b)
{
235
	return a == b;
236 237 238 239
}

static inline int tkey_sub_equals(t_key a, int offset, int bits, t_key b)
{
240 241
	if (bits == 0 || offset >= KEYLENGTH)
		return 1;
O
Olof Johansson 已提交
242 243
	bits = bits > KEYLENGTH ? KEYLENGTH : bits;
	return ((a ^ b) << offset) >> (KEYLENGTH - bits) == 0;
244
}
245 246 247 248 249 250

static inline int tkey_mismatch(t_key a, int offset, t_key b)
{
	t_key diff = a ^ b;
	int i = offset;

251 252 253
	if (!diff)
		return 0;
	while ((diff << i) >> (KEYLENGTH-1) == 0)
254 255 256 257 258
		i++;
	return i;
}

/*
259 260
  To understand this stuff, an understanding of keys and all their bits is
  necessary. Every node in the trie has a key associated with it, but not
261 262 263 264
  all of the bits in that key are significant.

  Consider a node 'n' and its parent 'tp'.

265 266 267 268 269
  If n is a leaf, every bit in its key is significant. Its presence is
  necessitated by path compression, since during a tree traversal (when
  searching for a leaf - unless we are doing an insertion) we will completely
  ignore all skipped bits we encounter. Thus we need to verify, at the end of
  a potentially successful search, that we have indeed been walking the
270 271
  correct key path.

272 273 274 275 276
  Note that we can never "miss" the correct key in the tree if present by
  following the wrong path. Path compression ensures that segments of the key
  that are the same for all keys with a given prefix are skipped, but the
  skipped part *is* identical for each node in the subtrie below the skipped
  bit! trie_insert() in this implementation takes care of that - note the
277 278
  call to tkey_sub_equals() in trie_insert().

279
  if n is an internal node - a 'tnode' here, the various parts of its key
280 281
  have many different meanings.

282
  Example:
283 284 285
  _________________________________________________________________
  | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
  -----------------------------------------------------------------
286
    0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15
287 288 289 290 291 292 293 294 295

  _________________________________________________________________
  | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
  -----------------------------------------------------------------
   16  17  18  19  20  21  22  23  24  25  26  27  28  29  30  31

  tp->pos = 7
  tp->bits = 3
  n->pos = 15
O
Olof Johansson 已提交
296
  n->bits = 4
297

298 299
  First, let's just ignore the bits that come before the parent tp, that is
  the bits from 0 to (tp->pos-1). They are *known* but at this point we do
300 301 302
  not use them for anything.

  The bits from (tp->pos) to (tp->pos + tp->bits - 1) - "N", above - are the
303
  index into the parent's child array. That is, they will be used to find
304 305 306 307 308
  'n' among tp's children.

  The bits from (tp->pos + tp->bits) to (n->pos - 1) - "S" - are skipped bits
  for the node n.

309
  All the bits we have seen so far are significant to the node n. The rest
310 311
  of the bits are really not needed or indeed known in n->key.

312
  The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into
313
  n's child array, and will of course be different for each child.
314

315

316 317 318 319 320
  The rest of the bits, from (n->pos + n->bits) onward, are completely unknown
  at this point.

*/

S
Stephen Hemminger 已提交
321
static inline void check_tnode(const struct tnode *tn)
322
{
S
Stephen Hemminger 已提交
323
	WARN_ON(tn && tn->pos+tn->bits > 32);
324 325
}

326 327
static const int halve_threshold = 25;
static const int inflate_threshold = 50;
328
static const int halve_threshold_root = 15;
J
Jens Låås 已提交
329
static const int inflate_threshold_root = 30;
R
Robert Olsson 已提交
330 331

static void __alias_free_mem(struct rcu_head *head)
332
{
R
Robert Olsson 已提交
333 334
	struct fib_alias *fa = container_of(head, struct fib_alias, rcu);
	kmem_cache_free(fn_alias_kmem, fa);
335 336
}

R
Robert Olsson 已提交
337
static inline void alias_free_mem_rcu(struct fib_alias *fa)
338
{
R
Robert Olsson 已提交
339 340
	call_rcu(&fa->rcu, __alias_free_mem);
}
O
Olof Johansson 已提交
341

R
Robert Olsson 已提交
342 343
static void __leaf_free_rcu(struct rcu_head *head)
{
344 345
	struct leaf *l = container_of(head, struct leaf, rcu);
	kmem_cache_free(trie_leaf_kmem, l);
R
Robert Olsson 已提交
346
}
O
Olof Johansson 已提交
347

348 349 350 351 352
static inline void free_leaf(struct leaf *l)
{
	call_rcu_bh(&l->rcu, __leaf_free_rcu);
}

R
Robert Olsson 已提交
353
static inline void free_leaf_info(struct leaf_info *leaf)
354
{
355
	kfree_rcu(leaf, rcu);
356 357
}

358
static struct tnode *tnode_alloc(size_t size)
359
{
R
Robert Olsson 已提交
360
	if (size <= PAGE_SIZE)
361
		return kzalloc(size, GFP_KERNEL);
362
	else
363
		return vzalloc(size);
364
}
R
Robert Olsson 已提交
365

366 367 368 369
static void __tnode_vfree(struct work_struct *arg)
{
	struct tnode *tn = container_of(arg, struct tnode, work);
	vfree(tn);
370 371
}

R
Robert Olsson 已提交
372
static void __tnode_free_rcu(struct rcu_head *head)
373
{
R
Robert Olsson 已提交
374
	struct tnode *tn = container_of(head, struct tnode, rcu);
375
	size_t size = sizeof(struct tnode) +
376
		      (sizeof(struct rt_trie_node *) << tn->bits);
377 378 379

	if (size <= PAGE_SIZE)
		kfree(tn);
380 381 382 383
	else {
		INIT_WORK(&tn->work, __tnode_vfree);
		schedule_work(&tn->work);
	}
384 385
}

R
Robert Olsson 已提交
386 387
static inline void tnode_free(struct tnode *tn)
{
388 389 390
	if (IS_LEAF(tn))
		free_leaf((struct leaf *) tn);
	else
R
Robert Olsson 已提交
391
		call_rcu(&tn->rcu, __tnode_free_rcu);
R
Robert Olsson 已提交
392 393
}

J
Jarek Poplawski 已提交
394 395 396
static void tnode_free_safe(struct tnode *tn)
{
	BUG_ON(IS_LEAF(tn));
397 398
	tn->tnode_free = tnode_free_head;
	tnode_free_head = tn;
399
	tnode_free_size += sizeof(struct tnode) +
400
			   (sizeof(struct rt_trie_node *) << tn->bits);
J
Jarek Poplawski 已提交
401 402 403 404 405 406 407 408 409 410 411
}

static void tnode_free_flush(void)
{
	struct tnode *tn;

	while ((tn = tnode_free_head)) {
		tnode_free_head = tn->tnode_free;
		tn->tnode_free = NULL;
		tnode_free(tn);
	}
412 413 414 415 416

	if (tnode_free_size >= PAGE_SIZE * sync_pages) {
		tnode_free_size = 0;
		synchronize_rcu();
	}
J
Jarek Poplawski 已提交
417 418
}

R
Robert Olsson 已提交
419 420
static struct leaf *leaf_new(void)
{
421
	struct leaf *l = kmem_cache_alloc(trie_leaf_kmem, GFP_KERNEL);
R
Robert Olsson 已提交
422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438
	if (l) {
		l->parent = T_LEAF;
		INIT_HLIST_HEAD(&l->list);
	}
	return l;
}

static struct leaf_info *leaf_info_new(int plen)
{
	struct leaf_info *li = kmalloc(sizeof(struct leaf_info),  GFP_KERNEL);
	if (li) {
		li->plen = plen;
		INIT_LIST_HEAD(&li->falh);
	}
	return li;
}

439
static struct tnode *tnode_new(t_key key, int pos, int bits)
440
{
441
	size_t sz = sizeof(struct tnode) + (sizeof(struct rt_trie_node *) << bits);
442
	struct tnode *tn = tnode_alloc(sz);
443

O
Olof Johansson 已提交
444
	if (tn) {
R
Robert Olsson 已提交
445
		tn->parent = T_TNODE;
446 447 448 449 450 451
		tn->pos = pos;
		tn->bits = bits;
		tn->key = key;
		tn->full_children = 0;
		tn->empty_children = 1<<bits;
	}
452

E
Eric Dumazet 已提交
453
	pr_debug("AT %p s=%zu %zu\n", tn, sizeof(struct tnode),
454
		 sizeof(struct rt_trie_node) << bits);
455 456 457 458 459 460 461 462
	return tn;
}

/*
 * Check whether a tnode 'n' is "full", i.e. it is an internal node
 * and no bits are skipped. See discussion in dyntree paper p. 6
 */

463
static inline int tnode_full(const struct tnode *tn, const struct rt_trie_node *n)
464
{
465
	if (n == NULL || IS_LEAF(n))
466 467 468 469 470
		return 0;

	return ((struct tnode *) n)->pos == tn->pos + tn->bits;
}

471
static inline void put_child(struct trie *t, struct tnode *tn, int i,
472
			     struct rt_trie_node *n)
473 474 475 476
{
	tnode_put_child_reorg(tn, i, n, -1);
}

477
 /*
478 479 480 481
  * Add a child at position i overwriting the old value.
  * Update the value of full_children and empty_children.
  */

482
static void tnode_put_child_reorg(struct tnode *tn, int i, struct rt_trie_node *n,
483
				  int wasfull)
484
{
485
	struct rt_trie_node *chi = tn->child[i];
486 487
	int isfull;

S
Stephen Hemminger 已提交
488 489
	BUG_ON(i >= 1<<tn->bits);

490 491 492 493 494
	/* update emptyChildren */
	if (n == NULL && chi != NULL)
		tn->empty_children++;
	else if (n != NULL && chi == NULL)
		tn->empty_children--;
495

496
	/* update fullChildren */
O
Olof Johansson 已提交
497
	if (wasfull == -1)
498 499 500
		wasfull = tnode_full(tn, chi);

	isfull = tnode_full(tn, n);
501
	if (wasfull && !isfull)
502
		tn->full_children--;
503
	else if (!wasfull && isfull)
504
		tn->full_children++;
O
Olof Johansson 已提交
505

506
	if (n)
S
Stephen Hemminger 已提交
507
		node_set_parent(n, tn);
508

R
Robert Olsson 已提交
509
	rcu_assign_pointer(tn->child[i], n);
510 511
}

J
Jens Låås 已提交
512
#define MAX_WORK 10
513
static struct rt_trie_node *resize(struct trie *t, struct tnode *tn)
514 515
{
	int i;
516
	struct tnode *old_tn;
517 518
	int inflate_threshold_use;
	int halve_threshold_use;
J
Jens Låås 已提交
519
	int max_work;
520

521
	if (!tn)
522 523
		return NULL;

S
Stephen Hemminger 已提交
524 525
	pr_debug("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
		 tn, inflate_threshold, halve_threshold);
526 527 528

	/* No children */
	if (tn->empty_children == tnode_child_length(tn)) {
J
Jarek Poplawski 已提交
529
		tnode_free_safe(tn);
530 531 532 533
		return NULL;
	}
	/* One child */
	if (tn->empty_children == tnode_child_length(tn) - 1)
J
Jens Låås 已提交
534
		goto one_child;
535
	/*
536 537 538 539 540
	 * Double as long as the resulting node has a number of
	 * nonempty nodes that are above the threshold.
	 */

	/*
541 542
	 * From "Implementing a dynamic compressed trie" by Stefan Nilsson of
	 * the Helsinki University of Technology and Matti Tikkanen of Nokia
543
	 * Telecommunications, page 6:
544
	 * "A node is doubled if the ratio of non-empty children to all
545 546
	 * children in the *doubled* node is at least 'high'."
	 *
547 548 549 550 551
	 * 'high' in this instance is the variable 'inflate_threshold'. It
	 * is expressed as a percentage, so we multiply it with
	 * tnode_child_length() and instead of multiplying by 2 (since the
	 * child array will be doubled by inflate()) and multiplying
	 * the left-hand side by 100 (to handle the percentage thing) we
552
	 * multiply the left-hand side by 50.
553 554 555 556
	 *
	 * The left-hand side may look a bit weird: tnode_child_length(tn)
	 * - tn->empty_children is of course the number of non-null children
	 * in the current node. tn->full_children is the number of "full"
557
	 * children, that is non-null tnodes with a skip value of 0.
558
	 * All of those will be doubled in the resulting inflated tnode, so
559
	 * we just count them one extra time here.
560
	 *
561
	 * A clearer way to write this would be:
562
	 *
563
	 * to_be_doubled = tn->full_children;
564
	 * not_to_be_doubled = tnode_child_length(tn) - tn->empty_children -
565 566 567 568
	 *     tn->full_children;
	 *
	 * new_child_length = tnode_child_length(tn) * 2;
	 *
569
	 * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) /
570 571
	 *      new_child_length;
	 * if (new_fill_factor >= inflate_threshold)
572 573 574
	 *
	 * ...and so on, tho it would mess up the while () loop.
	 *
575 576 577
	 * anyway,
	 * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >=
	 *      inflate_threshold
578
	 *
579 580 581
	 * avoid a division:
	 * 100 * (not_to_be_doubled + 2*to_be_doubled) >=
	 *      inflate_threshold * new_child_length
582
	 *
583
	 * expand not_to_be_doubled and to_be_doubled, and shorten:
584
	 * 100 * (tnode_child_length(tn) - tn->empty_children +
O
Olof Johansson 已提交
585
	 *    tn->full_children) >= inflate_threshold * new_child_length
586
	 *
587
	 * expand new_child_length:
588
	 * 100 * (tnode_child_length(tn) - tn->empty_children +
O
Olof Johansson 已提交
589
	 *    tn->full_children) >=
590
	 *      inflate_threshold * tnode_child_length(tn) * 2
591
	 *
592
	 * shorten again:
593
	 * 50 * (tn->full_children + tnode_child_length(tn) -
O
Olof Johansson 已提交
594
	 *    tn->empty_children) >= inflate_threshold *
595
	 *    tnode_child_length(tn)
596
	 *
597 598 599
	 */

	check_tnode(tn);
600

601 602
	/* Keep root node larger  */

603
	if (!node_parent((struct rt_trie_node *)tn)) {
J
Jens Låås 已提交
604 605
		inflate_threshold_use = inflate_threshold_root;
		halve_threshold_use = halve_threshold_root;
E
Eric Dumazet 已提交
606
	} else {
607
		inflate_threshold_use = inflate_threshold;
J
Jens Låås 已提交
608 609
		halve_threshold_use = halve_threshold;
	}
610

J
Jens Låås 已提交
611 612
	max_work = MAX_WORK;
	while ((tn->full_children > 0 &&  max_work-- &&
613 614 615
		50 * (tn->full_children + tnode_child_length(tn)
		      - tn->empty_children)
		>= inflate_threshold_use * tnode_child_length(tn))) {
616

617 618
		old_tn = tn;
		tn = inflate(t, tn);
619

620 621
		if (IS_ERR(tn)) {
			tn = old_tn;
622 623 624 625 626
#ifdef CONFIG_IP_FIB_TRIE_STATS
			t->stats.resize_node_skipped++;
#endif
			break;
		}
627 628 629 630
	}

	check_tnode(tn);

J
Jens Låås 已提交
631
	/* Return if at least one inflate is run */
E
Eric Dumazet 已提交
632
	if (max_work != MAX_WORK)
633
		return (struct rt_trie_node *) tn;
J
Jens Låås 已提交
634

635 636 637 638
	/*
	 * Halve as long as the number of empty children in this
	 * node is above threshold.
	 */
639

J
Jens Låås 已提交
640 641
	max_work = MAX_WORK;
	while (tn->bits > 1 &&  max_work-- &&
642
	       100 * (tnode_child_length(tn) - tn->empty_children) <
643
	       halve_threshold_use * tnode_child_length(tn)) {
644

645 646 647 648
		old_tn = tn;
		tn = halve(t, tn);
		if (IS_ERR(tn)) {
			tn = old_tn;
649 650 651 652 653 654
#ifdef CONFIG_IP_FIB_TRIE_STATS
			t->stats.resize_node_skipped++;
#endif
			break;
		}
	}
655

656

657
	/* Only one child remains */
J
Jens Låås 已提交
658 659
	if (tn->empty_children == tnode_child_length(tn) - 1) {
one_child:
660
		for (i = 0; i < tnode_child_length(tn); i++) {
661
			struct rt_trie_node *n;
662

O
Olof Johansson 已提交
663
			n = tn->child[i];
R
Robert Olsson 已提交
664
			if (!n)
O
Olof Johansson 已提交
665 666 667 668
				continue;

			/* compress one level */

S
Stephen Hemminger 已提交
669
			node_set_parent(n, NULL);
J
Jarek Poplawski 已提交
670
			tnode_free_safe(tn);
O
Olof Johansson 已提交
671
			return n;
672
		}
J
Jens Låås 已提交
673
	}
674
	return (struct rt_trie_node *) tn;
675 676
}

677
static struct tnode *inflate(struct trie *t, struct tnode *tn)
678 679 680 681 682
{
	struct tnode *oldtnode = tn;
	int olen = tnode_child_length(tn);
	int i;

S
Stephen Hemminger 已提交
683
	pr_debug("In inflate\n");
684 685 686

	tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1);

S
Stephen Hemminger 已提交
687
	if (!tn)
688
		return ERR_PTR(-ENOMEM);
689 690

	/*
691 692 693
	 * Preallocate and store tnodes before the actual work so we
	 * don't get into an inconsistent state if memory allocation
	 * fails. In case of failure we return the oldnode and  inflate
694 695
	 * of tnode is ignored.
	 */
O
Olof Johansson 已提交
696 697

	for (i = 0; i < olen; i++) {
698
		struct tnode *inode;
699

700
		inode = (struct tnode *) tnode_get_child(oldtnode, i);
701 702 703 704 705
		if (inode &&
		    IS_TNODE(inode) &&
		    inode->pos == oldtnode->pos + oldtnode->bits &&
		    inode->bits > 1) {
			struct tnode *left, *right;
S
Stephen Hemminger 已提交
706
			t_key m = ~0U << (KEYLENGTH - 1) >> inode->pos;
707

708 709
			left = tnode_new(inode->key&(~m), inode->pos + 1,
					 inode->bits - 1);
710 711
			if (!left)
				goto nomem;
O
Olof Johansson 已提交
712

713 714 715
			right = tnode_new(inode->key|m, inode->pos + 1,
					  inode->bits - 1);

716
			if (!right) {
717 718
				tnode_free(left);
				goto nomem;
719
			}
720

721 722
			put_child(t, tn, 2*i, (struct rt_trie_node *) left);
			put_child(t, tn, 2*i+1, (struct rt_trie_node *) right);
723 724 725
		}
	}

O
Olof Johansson 已提交
726
	for (i = 0; i < olen; i++) {
727
		struct tnode *inode;
728
		struct rt_trie_node *node = tnode_get_child(oldtnode, i);
O
Olof Johansson 已提交
729 730
		struct tnode *left, *right;
		int size, j;
731

732 733 734 735 736 737
		/* An empty child */
		if (node == NULL)
			continue;

		/* A leaf or an internal node with skipped bits */

738
		if (IS_LEAF(node) || ((struct tnode *) node)->pos >
739
		   tn->pos + tn->bits - 1) {
740 741 742
			if (tkey_extract_bits(node->key,
					      oldtnode->pos + oldtnode->bits,
					      1) == 0)
743 744 745 746 747 748 749 750 751 752 753 754 755
				put_child(t, tn, 2*i, node);
			else
				put_child(t, tn, 2*i+1, node);
			continue;
		}

		/* An internal node with two children */
		inode = (struct tnode *) node;

		if (inode->bits == 1) {
			put_child(t, tn, 2*i, inode->child[0]);
			put_child(t, tn, 2*i+1, inode->child[1]);

J
Jarek Poplawski 已提交
756
			tnode_free_safe(inode);
O
Olof Johansson 已提交
757
			continue;
758 759
		}

O
Olof Johansson 已提交
760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777
		/* An internal node with more than two children */

		/* We will replace this node 'inode' with two new
		 * ones, 'left' and 'right', each with half of the
		 * original children. The two new nodes will have
		 * a position one bit further down the key and this
		 * means that the "significant" part of their keys
		 * (see the discussion near the top of this file)
		 * will differ by one bit, which will be "0" in
		 * left's key and "1" in right's key. Since we are
		 * moving the key position by one step, the bit that
		 * we are moving away from - the bit at position
		 * (inode->pos) - is the one that will differ between
		 * left and right. So... we synthesize that bit in the
		 * two  new keys.
		 * The mask 'm' below will be a single "one" bit at
		 * the position (inode->pos)
		 */
778

O
Olof Johansson 已提交
779 780 781
		/* Use the old key, but set the new significant
		 *   bit to zero.
		 */
782

O
Olof Johansson 已提交
783 784
		left = (struct tnode *) tnode_get_child(tn, 2*i);
		put_child(t, tn, 2*i, NULL);
785

O
Olof Johansson 已提交
786
		BUG_ON(!left);
787

O
Olof Johansson 已提交
788 789
		right = (struct tnode *) tnode_get_child(tn, 2*i+1);
		put_child(t, tn, 2*i+1, NULL);
790

O
Olof Johansson 已提交
791
		BUG_ON(!right);
792

O
Olof Johansson 已提交
793 794 795 796
		size = tnode_child_length(left);
		for (j = 0; j < size; j++) {
			put_child(t, left, j, inode->child[j]);
			put_child(t, right, j, inode->child[j + size]);
797
		}
O
Olof Johansson 已提交
798 799 800
		put_child(t, tn, 2*i, resize(t, left));
		put_child(t, tn, 2*i+1, resize(t, right));

J
Jarek Poplawski 已提交
801
		tnode_free_safe(inode);
802
	}
J
Jarek Poplawski 已提交
803
	tnode_free_safe(oldtnode);
804
	return tn;
805 806 807 808 809
nomem:
	{
		int size = tnode_child_length(tn);
		int j;

S
Stephen Hemminger 已提交
810
		for (j = 0; j < size; j++)
811 812 813 814
			if (tn->child[j])
				tnode_free((struct tnode *)tn->child[j]);

		tnode_free(tn);
S
Stephen Hemminger 已提交
815

816 817
		return ERR_PTR(-ENOMEM);
	}
818 819
}

820
static struct tnode *halve(struct trie *t, struct tnode *tn)
821 822
{
	struct tnode *oldtnode = tn;
823
	struct rt_trie_node *left, *right;
824 825 826
	int i;
	int olen = tnode_child_length(tn);

S
Stephen Hemminger 已提交
827
	pr_debug("In halve\n");
828 829

	tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1);
830

831 832
	if (!tn)
		return ERR_PTR(-ENOMEM);
833 834

	/*
835 836 837
	 * Preallocate and store tnodes before the actual work so we
	 * don't get into an inconsistent state if memory allocation
	 * fails. In case of failure we return the oldnode and halve
838 839 840
	 * of tnode is ignored.
	 */

O
Olof Johansson 已提交
841
	for (i = 0; i < olen; i += 2) {
842 843
		left = tnode_get_child(oldtnode, i);
		right = tnode_get_child(oldtnode, i+1);
844

845
		/* Two nonempty children */
S
Stephen Hemminger 已提交
846
		if (left && right) {
847
			struct tnode *newn;
S
Stephen Hemminger 已提交
848

849
			newn = tnode_new(left->key, tn->pos + tn->bits, 1);
S
Stephen Hemminger 已提交
850 851

			if (!newn)
852
				goto nomem;
S
Stephen Hemminger 已提交
853

854
			put_child(t, tn, i/2, (struct rt_trie_node *)newn);
855 856 857
		}

	}
858

O
Olof Johansson 已提交
859 860 861
	for (i = 0; i < olen; i += 2) {
		struct tnode *newBinNode;

862 863
		left = tnode_get_child(oldtnode, i);
		right = tnode_get_child(oldtnode, i+1);
864

865 866 867 868 869
		/* At least one of the children is empty */
		if (left == NULL) {
			if (right == NULL)    /* Both are empty */
				continue;
			put_child(t, tn, i/2, right);
O
Olof Johansson 已提交
870
			continue;
S
Stephen Hemminger 已提交
871
		}
O
Olof Johansson 已提交
872 873

		if (right == NULL) {
874
			put_child(t, tn, i/2, left);
O
Olof Johansson 已提交
875 876
			continue;
		}
877

878
		/* Two nonempty children */
O
Olof Johansson 已提交
879 880 881 882 883
		newBinNode = (struct tnode *) tnode_get_child(tn, i/2);
		put_child(t, tn, i/2, NULL);
		put_child(t, newBinNode, 0, left);
		put_child(t, newBinNode, 1, right);
		put_child(t, tn, i/2, resize(t, newBinNode));
884
	}
J
Jarek Poplawski 已提交
885
	tnode_free_safe(oldtnode);
886
	return tn;
887 888 889 890 891
nomem:
	{
		int size = tnode_child_length(tn);
		int j;

S
Stephen Hemminger 已提交
892
		for (j = 0; j < size; j++)
893 894 895 896
			if (tn->child[j])
				tnode_free((struct tnode *)tn->child[j]);

		tnode_free(tn);
S
Stephen Hemminger 已提交
897

898 899
		return ERR_PTR(-ENOMEM);
	}
900 901
}

R
Robert Olsson 已提交
902
/* readside must use rcu_read_lock currently dump routines
R
Robert Olsson 已提交
903 904
 via get_fa_head and dump */

R
Robert Olsson 已提交
905
static struct leaf_info *find_leaf_info(struct leaf *l, int plen)
906
{
R
Robert Olsson 已提交
907
	struct hlist_head *head = &l->list;
908 909 910
	struct hlist_node *node;
	struct leaf_info *li;

R
Robert Olsson 已提交
911
	hlist_for_each_entry_rcu(li, node, head, hlist)
912
		if (li->plen == plen)
913
			return li;
O
Olof Johansson 已提交
914

915 916 917
	return NULL;
}

918
static inline struct list_head *get_fa_head(struct leaf *l, int plen)
919
{
R
Robert Olsson 已提交
920
	struct leaf_info *li = find_leaf_info(l, plen);
921

O
Olof Johansson 已提交
922 923
	if (!li)
		return NULL;
924

O
Olof Johansson 已提交
925
	return &li->falh;
926 927 928 929
}

static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new)
{
930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946
	struct leaf_info *li = NULL, *last = NULL;
	struct hlist_node *node;

	if (hlist_empty(head)) {
		hlist_add_head_rcu(&new->hlist, head);
	} else {
		hlist_for_each_entry(li, node, head, hlist) {
			if (new->plen > li->plen)
				break;

			last = li;
		}
		if (last)
			hlist_add_after_rcu(&last->hlist, &new->hlist);
		else
			hlist_add_before_rcu(&new->hlist, &li->hlist);
	}
947 948
}

R
Robert Olsson 已提交
949 950
/* rcu_read_lock needs to be hold by caller from readside */

951 952 953 954 955
static struct leaf *
fib_find_node(struct trie *t, u32 key)
{
	int pos;
	struct tnode *tn;
956
	struct rt_trie_node *n;
957 958

	pos = 0;
E
Eric Dumazet 已提交
959
	n = rcu_dereference_rtnl(t->trie);
960 961 962

	while (n != NULL &&  NODE_TYPE(n) == T_TNODE) {
		tn = (struct tnode *) n;
O
Olof Johansson 已提交
963

964
		check_tnode(tn);
O
Olof Johansson 已提交
965

966
		if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
O
Olof Johansson 已提交
967
			pos = tn->pos + tn->bits;
968 969 970 971
			n = tnode_get_child_rcu(tn,
						tkey_extract_bits(key,
								  tn->pos,
								  tn->bits));
O
Olof Johansson 已提交
972
		} else
973 974 975 976
			break;
	}
	/* Case we have found a leaf. Compare prefixes */

O
Olof Johansson 已提交
977 978 979
	if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key))
		return (struct leaf *)n;

980 981 982
	return NULL;
}

983
static void trie_rebalance(struct trie *t, struct tnode *tn)
984 985
{
	int wasfull;
R
Robert Olsson 已提交
986
	t_key cindex, key;
S
Stephen Hemminger 已提交
987
	struct tnode *tp;
988

R
Robert Olsson 已提交
989 990
	key = tn->key;

991
	while (tn != NULL && (tp = node_parent((struct rt_trie_node *)tn)) != NULL) {
992 993
		cindex = tkey_extract_bits(key, tp->pos, tp->bits);
		wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
994 995 996
		tn = (struct tnode *) resize(t, (struct tnode *)tn);

		tnode_put_child_reorg((struct tnode *)tp, cindex,
997
				      (struct rt_trie_node *)tn, wasfull);
O
Olof Johansson 已提交
998

999
		tp = node_parent((struct rt_trie_node *) tn);
1000
		if (!tp)
1001
			rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn);
1002

J
Jarek Poplawski 已提交
1003
		tnode_free_flush();
S
Stephen Hemminger 已提交
1004
		if (!tp)
1005
			break;
S
Stephen Hemminger 已提交
1006
		tn = tp;
1007
	}
S
Stephen Hemminger 已提交
1008

1009
	/* Handle last (top) tnode */
1010
	if (IS_TNODE(tn))
1011
		tn = (struct tnode *)resize(t, (struct tnode *)tn);
1012

1013
	rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn);
1014
	tnode_free_flush();
1015 1016
}

R
Robert Olsson 已提交
1017 1018
/* only used from updater-side */

1019
static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
1020 1021 1022
{
	int pos, newpos;
	struct tnode *tp = NULL, *tn = NULL;
1023
	struct rt_trie_node *n;
1024 1025
	struct leaf *l;
	int missbit;
1026
	struct list_head *fa_head = NULL;
1027 1028 1029 1030
	struct leaf_info *li;
	t_key cindex;

	pos = 0;
1031
	n = t->trie;
1032

1033 1034
	/* If we point to NULL, stop. Either the tree is empty and we should
	 * just put a new leaf in if, or we have reached an empty child slot,
1035
	 * and we should just put our new leaf in that.
1036 1037
	 * If we point to a T_TNODE, check if it matches our key. Note that
	 * a T_TNODE might be skipping any number of bits - its 'pos' need
1038 1039
	 * not be the parent's 'pos'+'bits'!
	 *
1040
	 * If it does match the current key, get pos/bits from it, extract
1041 1042 1043 1044
	 * the index from our key, push the T_TNODE and walk the tree.
	 *
	 * If it doesn't, we have to replace it with a new T_TNODE.
	 *
1045 1046 1047
	 * If we point to a T_LEAF, it might or might not have the same key
	 * as we do. If it does, just change the value, update the T_LEAF's
	 * value, and return it.
1048 1049 1050 1051 1052
	 * If it doesn't, we need to replace it with a T_TNODE.
	 */

	while (n != NULL &&  NODE_TYPE(n) == T_TNODE) {
		tn = (struct tnode *) n;
O
Olof Johansson 已提交
1053

1054
		check_tnode(tn);
O
Olof Johansson 已提交
1055

1056
		if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
1057
			tp = tn;
O
Olof Johansson 已提交
1058
			pos = tn->pos + tn->bits;
1059 1060 1061 1062
			n = tnode_get_child(tn,
					    tkey_extract_bits(key,
							      tn->pos,
							      tn->bits));
1063

S
Stephen Hemminger 已提交
1064
			BUG_ON(n && node_parent(n) != tn);
O
Olof Johansson 已提交
1065
		} else
1066 1067 1068 1069 1070 1071
			break;
	}

	/*
	 * n  ----> NULL, LEAF or TNODE
	 *
1072
	 * tp is n's (parent) ----> NULL or TNODE
1073 1074
	 */

O
Olof Johansson 已提交
1075
	BUG_ON(tp && IS_LEAF(tp));
1076 1077 1078

	/* Case 1: n is a leaf. Compare prefixes */

1079
	if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) {
1080
		l = (struct leaf *) n;
1081
		li = leaf_info_new(plen);
O
Olof Johansson 已提交
1082

1083 1084
		if (!li)
			return NULL;
1085 1086 1087 1088 1089 1090 1091

		fa_head = &li->falh;
		insert_leaf_info(&l->list, li);
		goto done;
	}
	l = leaf_new();

1092 1093
	if (!l)
		return NULL;
1094 1095 1096 1097

	l->key = key;
	li = leaf_info_new(plen);

1098
	if (!li) {
1099
		free_leaf(l);
1100
		return NULL;
1101
	}
1102 1103 1104 1105 1106

	fa_head = &li->falh;
	insert_leaf_info(&l->list, li);

	if (t->trie && n == NULL) {
O
Olof Johansson 已提交
1107
		/* Case 2: n is NULL, and will just insert a new leaf */
1108

1109
		node_set_parent((struct rt_trie_node *)l, tp);
1110

O
Olof Johansson 已提交
1111
		cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1112
		put_child(t, (struct tnode *)tp, cindex, (struct rt_trie_node *)l);
O
Olof Johansson 已提交
1113 1114
	} else {
		/* Case 3: n is a LEAF or a TNODE and the key doesn't match. */
1115 1116
		/*
		 *  Add a new tnode here
1117 1118 1119 1120
		 *  first tnode need some special handling
		 */

		if (tp)
O
Olof Johansson 已提交
1121
			pos = tp->pos+tp->bits;
1122
		else
O
Olof Johansson 已提交
1123 1124
			pos = 0;

1125
		if (n) {
1126 1127
			newpos = tkey_mismatch(key, pos, n->key);
			tn = tnode_new(n->key, newpos, 1);
O
Olof Johansson 已提交
1128
		} else {
1129
			newpos = 0;
1130
			tn = tnode_new(key, newpos, 1); /* First tnode */
1131 1132
		}

1133
		if (!tn) {
1134
			free_leaf_info(li);
1135
			free_leaf(l);
1136
			return NULL;
O
Olof Johansson 已提交
1137 1138
		}

1139
		node_set_parent((struct rt_trie_node *)tn, tp);
1140

O
Olof Johansson 已提交
1141
		missbit = tkey_extract_bits(key, newpos, 1);
1142
		put_child(t, tn, missbit, (struct rt_trie_node *)l);
1143 1144
		put_child(t, tn, 1-missbit, n);

1145
		if (tp) {
1146
			cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1147
			put_child(t, (struct tnode *)tp, cindex,
1148
				  (struct rt_trie_node *)tn);
O
Olof Johansson 已提交
1149
		} else {
1150
			rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn);
1151 1152 1153
			tp = tn;
		}
	}
O
Olof Johansson 已提交
1154 1155

	if (tp && tp->pos + tp->bits > 32)
1156 1157 1158
		pr_warning("fib_trie"
			   " tp=%p pos=%d, bits=%d, key=%0x plen=%d\n",
			   tp, tp->pos, tp->bits, key, plen);
O
Olof Johansson 已提交
1159

1160
	/* Rebalance the trie */
R
Robert Olsson 已提交
1161

1162
	trie_rebalance(t, tp);
1163
done:
1164 1165 1166
	return fa_head;
}

1167 1168 1169
/*
 * Caller must hold RTNL.
 */
1170
int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
1171 1172 1173
{
	struct trie *t = (struct trie *) tb->tb_data;
	struct fib_alias *fa, *new_fa;
1174
	struct list_head *fa_head = NULL;
1175
	struct fib_info *fi;
1176 1177
	int plen = cfg->fc_dst_len;
	u8 tos = cfg->fc_tos;
1178 1179 1180 1181 1182 1183 1184
	u32 key, mask;
	int err;
	struct leaf *l;

	if (plen > 32)
		return -EINVAL;

1185
	key = ntohl(cfg->fc_dst);
1186

1187
	pr_debug("Insert table=%u %08x/%d\n", tb->tb_id, key, plen);
1188

O
Olof Johansson 已提交
1189
	mask = ntohl(inet_make_mask(plen));
1190

1191
	if (key & ~mask)
1192 1193 1194 1195
		return -EINVAL;

	key = key & mask;

1196 1197 1198
	fi = fib_create_info(cfg);
	if (IS_ERR(fi)) {
		err = PTR_ERR(fi);
1199
		goto err;
1200
	}
1201 1202

	l = fib_find_node(t, key);
1203
	fa = NULL;
1204

1205
	if (l) {
1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220
		fa_head = get_fa_head(l, plen);
		fa = fib_find_alias(fa_head, tos, fi->fib_priority);
	}

	/* Now fa, if non-NULL, points to the first fib alias
	 * with the same keys [prefix,tos,priority], if such key already
	 * exists or to the node before which we will insert new one.
	 *
	 * If fa is NULL, we will need to allocate a new one and
	 * insert to the head of f.
	 *
	 * If f is NULL, no fib node matched the destination key
	 * and we need to allocate a new one of those as well.
	 */

1221 1222 1223
	if (fa && fa->fa_tos == tos &&
	    fa->fa_info->fib_priority == fi->fib_priority) {
		struct fib_alias *fa_first, *fa_match;
1224 1225

		err = -EEXIST;
1226
		if (cfg->fc_nlflags & NLM_F_EXCL)
1227 1228
			goto out;

1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248
		/* We have 2 goals:
		 * 1. Find exact match for type, scope, fib_info to avoid
		 * duplicate routes
		 * 2. Find next 'fa' (or head), NLM_F_APPEND inserts before it
		 */
		fa_match = NULL;
		fa_first = fa;
		fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
		list_for_each_entry_continue(fa, fa_head, fa_list) {
			if (fa->fa_tos != tos)
				break;
			if (fa->fa_info->fib_priority != fi->fib_priority)
				break;
			if (fa->fa_type == cfg->fc_type &&
			    fa->fa_info == fi) {
				fa_match = fa;
				break;
			}
		}

1249
		if (cfg->fc_nlflags & NLM_F_REPLACE) {
1250 1251 1252
			struct fib_info *fi_drop;
			u8 state;

1253 1254 1255 1256
			fa = fa_first;
			if (fa_match) {
				if (fa == fa_match)
					err = 0;
1257
				goto out;
1258
			}
R
Robert Olsson 已提交
1259
			err = -ENOBUFS;
1260
			new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
R
Robert Olsson 已提交
1261 1262
			if (new_fa == NULL)
				goto out;
1263 1264

			fi_drop = fa->fa_info;
R
Robert Olsson 已提交
1265 1266
			new_fa->fa_tos = fa->fa_tos;
			new_fa->fa_info = fi;
1267
			new_fa->fa_type = cfg->fc_type;
1268
			state = fa->fa_state;
1269
			new_fa->fa_state = state & ~FA_S_ACCESSED;
1270

R
Robert Olsson 已提交
1271 1272
			list_replace_rcu(&fa->fa_list, &new_fa->fa_list);
			alias_free_mem_rcu(fa);
1273 1274 1275

			fib_release_info(fi_drop);
			if (state & FA_S_ACCESSED)
1276
				rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
1277 1278
			rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen,
				tb->tb_id, &cfg->fc_nlinfo, NLM_F_REPLACE);
1279

O
Olof Johansson 已提交
1280
			goto succeeded;
1281 1282 1283 1284 1285
		}
		/* Error if we find a perfect match which
		 * uses the same scope, type, and nexthop
		 * information.
		 */
1286 1287
		if (fa_match)
			goto out;
1288

1289
		if (!(cfg->fc_nlflags & NLM_F_APPEND))
1290
			fa = fa_first;
1291 1292
	}
	err = -ENOENT;
1293
	if (!(cfg->fc_nlflags & NLM_F_CREATE))
1294 1295 1296
		goto out;

	err = -ENOBUFS;
1297
	new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
1298 1299 1300 1301 1302
	if (new_fa == NULL)
		goto out;

	new_fa->fa_info = fi;
	new_fa->fa_tos = tos;
1303
	new_fa->fa_type = cfg->fc_type;
1304 1305 1306 1307 1308
	new_fa->fa_state = 0;
	/*
	 * Insert new entry to the list.
	 */

1309
	if (!fa_head) {
1310 1311 1312
		fa_head = fib_insert_node(t, key, plen);
		if (unlikely(!fa_head)) {
			err = -ENOMEM;
1313
			goto out_free_new_fa;
1314
		}
1315
	}
1316

R
Robert Olsson 已提交
1317 1318
	list_add_tail_rcu(&new_fa->fa_list,
			  (fa ? &fa->fa_list : fa_head));
1319

1320
	rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
1321
	rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id,
1322
		  &cfg->fc_nlinfo, 0);
1323 1324
succeeded:
	return 0;
1325 1326 1327

out_free_new_fa:
	kmem_cache_free(fn_alias_kmem, new_fa);
1328 1329
out:
	fib_release_info(fi);
O
Olof Johansson 已提交
1330
err:
1331 1332 1333
	return err;
}

R
Robert Olsson 已提交
1334
/* should be called with rcu_read_lock */
1335
static int check_leaf(struct fib_table *tb, struct trie *t, struct leaf *l,
1336
		      t_key key,  const struct flowi4 *flp,
E
Eric Dumazet 已提交
1337
		      struct fib_result *res, int fib_flags)
1338 1339 1340 1341
{
	struct leaf_info *li;
	struct hlist_head *hhead = &l->list;
	struct hlist_node *node;
1342

R
Robert Olsson 已提交
1343
	hlist_for_each_entry_rcu(li, node, hhead, hlist) {
1344
		struct fib_alias *fa;
1345 1346 1347
		int plen = li->plen;
		__be32 mask = inet_make_mask(plen);

1348
		if (l->key != (key & ntohl(mask)))
1349 1350
			continue;

1351 1352 1353
		list_for_each_entry_rcu(fa, &li->falh, fa_list) {
			struct fib_info *fi = fa->fa_info;
			int nhsel, err;
1354

1355
			if (fa->fa_tos && fa->fa_tos != flp->flowi4_tos)
1356
				continue;
1357
			if (fa->fa_info->fib_scope < flp->flowi4_scope)
1358 1359 1360 1361
				continue;
			fib_alias_accessed(fa);
			err = fib_props[fa->fa_type].error;
			if (err) {
1362
#ifdef CONFIG_IP_FIB_TRIE_STATS
1363
				t->stats.semantic_match_passed++;
1364
#endif
1365
				return err;
1366 1367 1368 1369 1370 1371 1372 1373
			}
			if (fi->fib_flags & RTNH_F_DEAD)
				continue;
			for (nhsel = 0; nhsel < fi->fib_nhs; nhsel++) {
				const struct fib_nh *nh = &fi->fib_nh[nhsel];

				if (nh->nh_flags & RTNH_F_DEAD)
					continue;
1374
				if (flp->flowi4_oif && flp->flowi4_oif != nh->nh_oif)
1375 1376 1377 1378 1379 1380 1381 1382
					continue;

#ifdef CONFIG_IP_FIB_TRIE_STATS
				t->stats.semantic_match_passed++;
#endif
				res->prefixlen = plen;
				res->nh_sel = nhsel;
				res->type = fa->fa_type;
1383
				res->scope = fa->fa_info->fib_scope;
1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394
				res->fi = fi;
				res->table = tb;
				res->fa_head = &li->falh;
				if (!(fib_flags & FIB_LOOKUP_NOREF))
					atomic_inc(&res->fi->fib_clntref);
				return 0;
			}
		}

#ifdef CONFIG_IP_FIB_TRIE_STATS
		t->stats.semantic_match_miss++;
1395 1396
#endif
	}
1397

1398
	return 1;
1399 1400
}

1401
int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp,
E
Eric Dumazet 已提交
1402
		     struct fib_result *res, int fib_flags)
1403 1404
{
	struct trie *t = (struct trie *) tb->tb_data;
1405
	int ret;
1406
	struct rt_trie_node *n;
1407
	struct tnode *pn;
1408
	unsigned int pos, bits;
1409
	t_key key = ntohl(flp->daddr);
1410
	unsigned int chopped_off;
1411
	t_key cindex = 0;
1412
	unsigned int current_prefix_length = KEYLENGTH;
O
Olof Johansson 已提交
1413
	struct tnode *cn;
1414
	t_key pref_mismatch;
O
Olof Johansson 已提交
1415

R
Robert Olsson 已提交
1416
	rcu_read_lock();
O
Olof Johansson 已提交
1417

R
Robert Olsson 已提交
1418
	n = rcu_dereference(t->trie);
1419
	if (!n)
1420 1421 1422 1423 1424 1425 1426 1427
		goto failed;

#ifdef CONFIG_IP_FIB_TRIE_STATS
	t->stats.gets++;
#endif

	/* Just a leaf? */
	if (IS_LEAF(n)) {
1428
		ret = check_leaf(tb, t, (struct leaf *)n, key, flp, res, fib_flags);
1429
		goto found;
1430
	}
1431

1432 1433
	pn = (struct tnode *) n;
	chopped_off = 0;
1434

O
Olof Johansson 已提交
1435
	while (pn) {
1436 1437 1438
		pos = pn->pos;
		bits = pn->bits;

1439
		if (!chopped_off)
S
Stephen Hemminger 已提交
1440 1441
			cindex = tkey_extract_bits(mask_pfx(key, current_prefix_length),
						   pos, bits);
1442

1443
		n = tnode_get_child_rcu(pn, cindex);
1444 1445 1446 1447 1448 1449 1450 1451

		if (n == NULL) {
#ifdef CONFIG_IP_FIB_TRIE_STATS
			t->stats.null_node_hit++;
#endif
			goto backtrace;
		}

O
Olof Johansson 已提交
1452
		if (IS_LEAF(n)) {
1453
			ret = check_leaf(tb, t, (struct leaf *)n, key, flp, res, fib_flags);
1454
			if (ret > 0)
O
Olof Johansson 已提交
1455
				goto backtrace;
1456
			goto found;
O
Olof Johansson 已提交
1457 1458 1459
		}

		cn = (struct tnode *)n;
1460

O
Olof Johansson 已提交
1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475
		/*
		 * It's a tnode, and we can do some extra checks here if we
		 * like, to avoid descending into a dead-end branch.
		 * This tnode is in the parent's child array at index
		 * key[p_pos..p_pos+p_bits] but potentially with some bits
		 * chopped off, so in reality the index may be just a
		 * subprefix, padded with zero at the end.
		 * We can also take a look at any skipped bits in this
		 * tnode - everything up to p_pos is supposed to be ok,
		 * and the non-chopped bits of the index (se previous
		 * paragraph) are also guaranteed ok, but the rest is
		 * considered unknown.
		 *
		 * The skipped bits are key[pos+bits..cn->pos].
		 */
1476

O
Olof Johansson 已提交
1477 1478 1479 1480 1481 1482 1483 1484 1485
		/* If current_prefix_length < pos+bits, we are already doing
		 * actual prefix  matching, which means everything from
		 * pos+(bits-chopped_off) onward must be zero along some
		 * branch of this subtree - otherwise there is *no* valid
		 * prefix present. Here we can only check the skipped
		 * bits. Remember, since we have already indexed into the
		 * parent's child array, we know that the bits we chopped of
		 * *are* zero.
		 */
1486

1487 1488
		/* NOTA BENE: Checking only skipped bits
		   for the new node here */
1489

O
Olof Johansson 已提交
1490 1491
		if (current_prefix_length < pos+bits) {
			if (tkey_extract_bits(cn->key, current_prefix_length,
1492 1493
						cn->pos - current_prefix_length)
			    || !(cn->child[0]))
O
Olof Johansson 已提交
1494 1495
				goto backtrace;
		}
1496

O
Olof Johansson 已提交
1497 1498 1499 1500 1501 1502 1503 1504 1505
		/*
		 * If chopped_off=0, the index is fully validated and we
		 * only need to look at the skipped bits for this, the new,
		 * tnode. What we actually want to do is to find out if
		 * these skipped bits match our key perfectly, or if we will
		 * have to count on finding a matching prefix further down,
		 * because if we do, we would like to have some way of
		 * verifying the existence of such a prefix at this point.
		 */
1506

O
Olof Johansson 已提交
1507 1508 1509 1510 1511 1512 1513 1514
		/* The only thing we can do at this point is to verify that
		 * any such matching prefix can indeed be a prefix to our
		 * key, and if the bits in the node we are inspecting that
		 * do not match our key are not ZERO, this cannot be true.
		 * Thus, find out where there is a mismatch (before cn->pos)
		 * and verify that all the mismatching bits are zero in the
		 * new tnode's key.
		 */
1515

1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526
		/*
		 * Note: We aren't very concerned about the piece of
		 * the key that precede pn->pos+pn->bits, since these
		 * have already been checked. The bits after cn->pos
		 * aren't checked since these are by definition
		 * "unknown" at this point. Thus, what we want to see
		 * is if we are about to enter the "prefix matching"
		 * state, and in that case verify that the skipped
		 * bits that will prevail throughout this subtree are
		 * zero, as they have to be if we are to find a
		 * matching prefix.
O
Olof Johansson 已提交
1527 1528
		 */

1529
		pref_mismatch = mask_pfx(cn->key ^ key, cn->pos);
O
Olof Johansson 已提交
1530

1531 1532 1533 1534
		/*
		 * In short: If skipped bits in this node do not match
		 * the search key, enter the "prefix matching"
		 * state.directly.
O
Olof Johansson 已提交
1535 1536
		 */
		if (pref_mismatch) {
1537
			int mp = KEYLENGTH - fls(pref_mismatch);
O
Olof Johansson 已提交
1538

1539
			if (tkey_extract_bits(cn->key, mp, cn->pos - mp) != 0)
O
Olof Johansson 已提交
1540 1541 1542 1543
				goto backtrace;

			if (current_prefix_length >= cn->pos)
				current_prefix_length = mp;
1544
		}
1545

O
Olof Johansson 已提交
1546 1547 1548 1549
		pn = (struct tnode *)n; /* Descend */
		chopped_off = 0;
		continue;

1550 1551 1552 1553
backtrace:
		chopped_off++;

		/* As zero don't change the child key (cindex) */
1554 1555
		while ((chopped_off <= pn->bits)
		       && !(cindex & (1<<(chopped_off-1))))
1556 1557 1558 1559
			chopped_off++;

		/* Decrease current_... with bits chopped off */
		if (current_prefix_length > pn->pos + pn->bits - chopped_off)
1560 1561
			current_prefix_length = pn->pos + pn->bits
				- chopped_off;
O
Olof Johansson 已提交
1562

1563
		/*
1564
		 * Either we do the actual chop off according or if we have
1565 1566 1567
		 * chopped off all bits in this tnode walk up to our parent.
		 */

O
Olof Johansson 已提交
1568
		if (chopped_off <= pn->bits) {
1569
			cindex &= ~(1 << (chopped_off-1));
O
Olof Johansson 已提交
1570
		} else {
1571
			struct tnode *parent = node_parent_rcu((struct rt_trie_node *) pn);
S
Stephen Hemminger 已提交
1572
			if (!parent)
1573
				goto failed;
O
Olof Johansson 已提交
1574

1575
			/* Get Child's index */
S
Stephen Hemminger 已提交
1576 1577
			cindex = tkey_extract_bits(pn->key, parent->pos, parent->bits);
			pn = parent;
1578 1579 1580 1581 1582 1583
			chopped_off = 0;

#ifdef CONFIG_IP_FIB_TRIE_STATS
			t->stats.backtrack++;
#endif
			goto backtrace;
1584
		}
1585 1586
	}
failed:
1587
	ret = 1;
1588
found:
R
Robert Olsson 已提交
1589
	rcu_read_unlock();
1590 1591 1592
	return ret;
}

1593 1594 1595 1596
/*
 * Remove the leaf and return parent.
 */
static void trie_leaf_remove(struct trie *t, struct leaf *l)
1597
{
1598
	struct tnode *tp = node_parent((struct rt_trie_node *) l);
1599

1600
	pr_debug("entering trie_leaf_remove(%p)\n", l);
1601

1602
	if (tp) {
1603
		t_key cindex = tkey_extract_bits(l->key, tp->pos, tp->bits);
1604
		put_child(t, (struct tnode *)tp, cindex, NULL);
1605
		trie_rebalance(t, tp);
O
Olof Johansson 已提交
1606
	} else
R
Robert Olsson 已提交
1607
		rcu_assign_pointer(t->trie, NULL);
1608

1609
	free_leaf(l);
1610 1611
}

1612 1613 1614
/*
 * Caller must hold RTNL.
 */
1615
int fib_table_delete(struct fib_table *tb, struct fib_config *cfg)
1616 1617 1618
{
	struct trie *t = (struct trie *) tb->tb_data;
	u32 key, mask;
1619 1620
	int plen = cfg->fc_dst_len;
	u8 tos = cfg->fc_tos;
1621 1622 1623
	struct fib_alias *fa, *fa_to_delete;
	struct list_head *fa_head;
	struct leaf *l;
O
Olof Johansson 已提交
1624 1625
	struct leaf_info *li;

1626
	if (plen > 32)
1627 1628
		return -EINVAL;

1629
	key = ntohl(cfg->fc_dst);
O
Olof Johansson 已提交
1630
	mask = ntohl(inet_make_mask(plen));
1631

1632
	if (key & ~mask)
1633 1634 1635 1636 1637
		return -EINVAL;

	key = key & mask;
	l = fib_find_node(t, key);

1638
	if (!l)
1639 1640 1641 1642 1643 1644 1645 1646
		return -ESRCH;

	fa_head = get_fa_head(l, plen);
	fa = fib_find_alias(fa_head, tos, 0);

	if (!fa)
		return -ESRCH;

S
Stephen Hemminger 已提交
1647
	pr_debug("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t);
1648 1649

	fa_to_delete = NULL;
1650 1651
	fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
	list_for_each_entry_continue(fa, fa_head, fa_list) {
1652 1653 1654 1655 1656
		struct fib_info *fi = fa->fa_info;

		if (fa->fa_tos != tos)
			break;

1657 1658
		if ((!cfg->fc_type || fa->fa_type == cfg->fc_type) &&
		    (cfg->fc_scope == RT_SCOPE_NOWHERE ||
1659
		     fa->fa_info->fib_scope == cfg->fc_scope) &&
1660 1661
		    (!cfg->fc_prefsrc ||
		     fi->fib_prefsrc == cfg->fc_prefsrc) &&
1662 1663 1664
		    (!cfg->fc_protocol ||
		     fi->fib_protocol == cfg->fc_protocol) &&
		    fib_nh_match(cfg, fi) == 0) {
1665 1666 1667 1668 1669
			fa_to_delete = fa;
			break;
		}
	}

O
Olof Johansson 已提交
1670 1671
	if (!fa_to_delete)
		return -ESRCH;
1672

O
Olof Johansson 已提交
1673
	fa = fa_to_delete;
1674
	rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id,
1675
		  &cfg->fc_nlinfo, 0);
O
Olof Johansson 已提交
1676 1677

	l = fib_find_node(t, key);
R
Robert Olsson 已提交
1678
	li = find_leaf_info(l, plen);
1679

R
Robert Olsson 已提交
1680
	list_del_rcu(&fa->fa_list);
1681

O
Olof Johansson 已提交
1682
	if (list_empty(fa_head)) {
R
Robert Olsson 已提交
1683
		hlist_del_rcu(&li->hlist);
O
Olof Johansson 已提交
1684
		free_leaf_info(li);
R
Robert Olsson 已提交
1685
	}
1686

O
Olof Johansson 已提交
1687
	if (hlist_empty(&l->list))
1688
		trie_leaf_remove(t, l);
1689

O
Olof Johansson 已提交
1690
	if (fa->fa_state & FA_S_ACCESSED)
1691
		rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
1692

R
Robert Olsson 已提交
1693 1694
	fib_release_info(fa->fa_info);
	alias_free_mem_rcu(fa);
O
Olof Johansson 已提交
1695
	return 0;
1696 1697
}

1698
static int trie_flush_list(struct list_head *head)
1699 1700 1701 1702 1703 1704 1705
{
	struct fib_alias *fa, *fa_node;
	int found = 0;

	list_for_each_entry_safe(fa, fa_node, head, fa_list) {
		struct fib_info *fi = fa->fa_info;

R
Robert Olsson 已提交
1706 1707 1708 1709
		if (fi && (fi->fib_flags & RTNH_F_DEAD)) {
			list_del_rcu(&fa->fa_list);
			fib_release_info(fa->fa_info);
			alias_free_mem_rcu(fa);
1710 1711 1712 1713 1714 1715
			found++;
		}
	}
	return found;
}

1716
static int trie_flush_leaf(struct leaf *l)
1717 1718 1719 1720 1721 1722 1723
{
	int found = 0;
	struct hlist_head *lih = &l->list;
	struct hlist_node *node, *tmp;
	struct leaf_info *li = NULL;

	hlist_for_each_entry_safe(li, node, tmp, lih, hlist) {
1724
		found += trie_flush_list(&li->falh);
1725 1726

		if (list_empty(&li->falh)) {
R
Robert Olsson 已提交
1727
			hlist_del_rcu(&li->hlist);
1728 1729 1730 1731 1732 1733
			free_leaf_info(li);
		}
	}
	return found;
}

1734 1735 1736 1737
/*
 * Scan for the next right leaf starting at node p->child[idx]
 * Since we have back pointer, no recursion necessary.
 */
1738
static struct leaf *leaf_walk_rcu(struct tnode *p, struct rt_trie_node *c)
1739
{
1740 1741
	do {
		t_key idx;
1742 1743

		if (c)
1744
			idx = tkey_extract_bits(c->key, p->pos, p->bits) + 1;
1745
		else
1746
			idx = 0;
R
Robert Olsson 已提交
1747

1748 1749
		while (idx < 1u << p->bits) {
			c = tnode_get_child_rcu(p, idx++);
R
Robert Olsson 已提交
1750
			if (!c)
O
Olof Johansson 已提交
1751 1752
				continue;

1753 1754 1755
			if (IS_LEAF(c)) {
				prefetch(p->child[idx]);
				return (struct leaf *) c;
1756
			}
1757 1758 1759 1760

			/* Rescan start scanning in new node */
			p = (struct tnode *) c;
			idx = 0;
1761
		}
1762 1763

		/* Node empty, walk back up to parent */
1764
		c = (struct rt_trie_node *) p;
E
Eric Dumazet 已提交
1765
	} while ((p = node_parent_rcu(c)) != NULL);
1766 1767 1768 1769 1770 1771

	return NULL; /* Root of trie */
}

static struct leaf *trie_firstleaf(struct trie *t)
{
E
Eric Dumazet 已提交
1772
	struct tnode *n = (struct tnode *)rcu_dereference_rtnl(t->trie);
1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784

	if (!n)
		return NULL;

	if (IS_LEAF(n))          /* trie is just a leaf */
		return (struct leaf *) n;

	return leaf_walk_rcu(n, NULL);
}

static struct leaf *trie_nextleaf(struct leaf *l)
{
1785
	struct rt_trie_node *c = (struct rt_trie_node *) l;
1786
	struct tnode *p = node_parent_rcu(c);
1787 1788 1789 1790 1791

	if (!p)
		return NULL;	/* trie with just one leaf */

	return leaf_walk_rcu(p, c);
1792 1793
}

1794 1795 1796 1797
static struct leaf *trie_leafindex(struct trie *t, int index)
{
	struct leaf *l = trie_firstleaf(t);

S
Stephen Hemminger 已提交
1798
	while (l && index-- > 0)
1799
		l = trie_nextleaf(l);
S
Stephen Hemminger 已提交
1800

1801 1802 1803 1804
	return l;
}


1805 1806 1807
/*
 * Caller must hold RTNL.
 */
1808
int fib_table_flush(struct fib_table *tb)
1809 1810
{
	struct trie *t = (struct trie *) tb->tb_data;
1811
	struct leaf *l, *ll = NULL;
1812
	int found = 0;
1813

1814
	for (l = trie_firstleaf(t); l; l = trie_nextleaf(l)) {
1815
		found += trie_flush_leaf(l);
1816 1817

		if (ll && hlist_empty(&ll->list))
1818
			trie_leaf_remove(t, ll);
1819 1820 1821 1822
		ll = l;
	}

	if (ll && hlist_empty(&ll->list))
1823
		trie_leaf_remove(t, ll);
1824

S
Stephen Hemminger 已提交
1825
	pr_debug("trie_flush found=%d\n", found);
1826 1827 1828
	return found;
}

1829 1830 1831 1832 1833
void fib_free_table(struct fib_table *tb)
{
	kfree(tb);
}

1834 1835
static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah,
			   struct fib_table *tb,
1836 1837 1838 1839
			   struct sk_buff *skb, struct netlink_callback *cb)
{
	int i, s_i;
	struct fib_alias *fa;
A
Al Viro 已提交
1840
	__be32 xkey = htonl(key);
1841

1842
	s_i = cb->args[5];
1843 1844
	i = 0;

R
Robert Olsson 已提交
1845 1846 1847
	/* rcu_read_lock is hold by caller */

	list_for_each_entry_rcu(fa, fah, fa_list) {
1848 1849 1850 1851 1852 1853 1854 1855 1856 1857
		if (i < s_i) {
			i++;
			continue;
		}

		if (fib_dump_info(skb, NETLINK_CB(cb->skb).pid,
				  cb->nlh->nlmsg_seq,
				  RTM_NEWROUTE,
				  tb->tb_id,
				  fa->fa_type,
1858
				  xkey,
1859 1860
				  plen,
				  fa->fa_tos,
1861
				  fa->fa_info, NLM_F_MULTI) < 0) {
1862
			cb->args[5] = i;
1863
			return -1;
O
Olof Johansson 已提交
1864
		}
1865 1866
		i++;
	}
1867
	cb->args[5] = i;
1868 1869 1870
	return skb->len;
}

1871 1872
static int fn_trie_dump_leaf(struct leaf *l, struct fib_table *tb,
			struct sk_buff *skb, struct netlink_callback *cb)
1873
{
1874 1875 1876
	struct leaf_info *li;
	struct hlist_node *node;
	int i, s_i;
1877

1878
	s_i = cb->args[4];
1879
	i = 0;
1880

1881 1882 1883 1884
	/* rcu_read_lock is hold by caller */
	hlist_for_each_entry_rcu(li, node, &l->list, hlist) {
		if (i < s_i) {
			i++;
1885
			continue;
1886
		}
O
Olof Johansson 已提交
1887

1888
		if (i > s_i)
1889
			cb->args[5] = 0;
1890

1891
		if (list_empty(&li->falh))
1892 1893
			continue;

1894
		if (fn_trie_dump_fa(l->key, li->plen, &li->falh, tb, skb, cb) < 0) {
1895
			cb->args[4] = i;
1896 1897
			return -1;
		}
1898
		i++;
1899
	}
1900

1901
	cb->args[4] = i;
1902 1903 1904
	return skb->len;
}

1905 1906
int fib_table_dump(struct fib_table *tb, struct sk_buff *skb,
		   struct netlink_callback *cb)
1907
{
1908
	struct leaf *l;
1909
	struct trie *t = (struct trie *) tb->tb_data;
1910
	t_key key = cb->args[2];
1911
	int count = cb->args[3];
1912

R
Robert Olsson 已提交
1913
	rcu_read_lock();
1914 1915 1916
	/* Dump starting at last key.
	 * Note: 0.0.0.0/0 (ie default) is first key.
	 */
1917
	if (count == 0)
1918 1919
		l = trie_firstleaf(t);
	else {
1920 1921 1922
		/* Normally, continue from last key, but if that is missing
		 * fallback to using slow rescan
		 */
1923
		l = fib_find_node(t, key);
1924 1925
		if (!l)
			l = trie_leafindex(t, count);
1926
	}
1927

1928 1929
	while (l) {
		cb->args[2] = l->key;
1930
		if (fn_trie_dump_leaf(l, tb, skb, cb) < 0) {
1931
			cb->args[3] = count;
1932 1933
			rcu_read_unlock();
			return -1;
1934
		}
1935

1936
		++count;
1937
		l = trie_nextleaf(l);
1938 1939
		memset(&cb->args[4], 0,
		       sizeof(cb->args) - 4*sizeof(cb->args[0]));
1940
	}
1941
	cb->args[3] = count;
R
Robert Olsson 已提交
1942
	rcu_read_unlock();
1943

1944 1945 1946
	return skb->len;
}

1947
void __init fib_trie_init(void)
1948
{
1949 1950
	fn_alias_kmem = kmem_cache_create("ip_fib_alias",
					  sizeof(struct fib_alias),
1951 1952 1953 1954 1955 1956
					  0, SLAB_PANIC, NULL);

	trie_leaf_kmem = kmem_cache_create("ip_fib_trie",
					   max(sizeof(struct leaf),
					       sizeof(struct leaf_info)),
					   0, SLAB_PANIC, NULL);
1957
}
1958

1959

1960
struct fib_table *fib_trie_table(u32 id)
1961 1962 1963 1964 1965 1966 1967 1968 1969 1970
{
	struct fib_table *tb;
	struct trie *t;

	tb = kmalloc(sizeof(struct fib_table) + sizeof(struct trie),
		     GFP_KERNEL);
	if (tb == NULL)
		return NULL;

	tb->tb_id = id;
1971
	tb->tb_default = -1;
1972 1973

	t = (struct trie *) tb->tb_data;
1974
	memset(t, 0, sizeof(*t));
1975 1976 1977 1978

	return tb;
}

1979 1980 1981
#ifdef CONFIG_PROC_FS
/* Depth first Trie walk iterator */
struct fib_trie_iter {
1982
	struct seq_net_private p;
1983
	struct fib_table *tb;
1984
	struct tnode *tnode;
E
Eric Dumazet 已提交
1985 1986
	unsigned int index;
	unsigned int depth;
1987
};
1988

1989
static struct rt_trie_node *fib_trie_get_next(struct fib_trie_iter *iter)
1990
{
1991
	struct tnode *tn = iter->tnode;
E
Eric Dumazet 已提交
1992
	unsigned int cindex = iter->index;
1993
	struct tnode *p;
1994

1995 1996 1997 1998
	/* A single entry routing table */
	if (!tn)
		return NULL;

1999 2000 2001 2002
	pr_debug("get_next iter={node=%p index=%d depth=%d}\n",
		 iter->tnode, iter->index, iter->depth);
rescan:
	while (cindex < (1<<tn->bits)) {
2003
		struct rt_trie_node *n = tnode_get_child_rcu(tn, cindex);
2004

2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016
		if (n) {
			if (IS_LEAF(n)) {
				iter->tnode = tn;
				iter->index = cindex + 1;
			} else {
				/* push down one level */
				iter->tnode = (struct tnode *) n;
				iter->index = 0;
				++iter->depth;
			}
			return n;
		}
2017

2018 2019
		++cindex;
	}
O
Olof Johansson 已提交
2020

2021
	/* Current node exhausted, pop back up */
2022
	p = node_parent_rcu((struct rt_trie_node *)tn);
2023 2024 2025 2026 2027
	if (p) {
		cindex = tkey_extract_bits(tn->key, p->pos, p->bits)+1;
		tn = p;
		--iter->depth;
		goto rescan;
2028
	}
2029 2030 2031

	/* got root? */
	return NULL;
2032 2033
}

2034
static struct rt_trie_node *fib_trie_get_first(struct fib_trie_iter *iter,
2035
				       struct trie *t)
2036
{
2037
	struct rt_trie_node *n;
2038

S
Stephen Hemminger 已提交
2039
	if (!t)
2040 2041 2042
		return NULL;

	n = rcu_dereference(t->trie);
2043
	if (!n)
2044
		return NULL;
2045

2046 2047 2048 2049 2050 2051 2052 2053
	if (IS_TNODE(n)) {
		iter->tnode = (struct tnode *) n;
		iter->index = 0;
		iter->depth = 1;
	} else {
		iter->tnode = NULL;
		iter->index = 0;
		iter->depth = 0;
O
Olof Johansson 已提交
2054
	}
2055 2056

	return n;
2057
}
O
Olof Johansson 已提交
2058

2059 2060
static void trie_collect_stats(struct trie *t, struct trie_stat *s)
{
2061
	struct rt_trie_node *n;
2062
	struct fib_trie_iter iter;
O
Olof Johansson 已提交
2063

2064
	memset(s, 0, sizeof(*s));
O
Olof Johansson 已提交
2065

2066
	rcu_read_lock();
2067
	for (n = fib_trie_get_first(&iter, t); n; n = fib_trie_get_next(&iter)) {
2068
		if (IS_LEAF(n)) {
2069 2070 2071 2072
			struct leaf *l = (struct leaf *)n;
			struct leaf_info *li;
			struct hlist_node *tmp;

2073 2074 2075 2076
			s->leaves++;
			s->totdepth += iter.depth;
			if (iter.depth > s->maxdepth)
				s->maxdepth = iter.depth;
2077 2078 2079

			hlist_for_each_entry_rcu(li, tmp, &l->list, hlist)
				++s->prefixes;
2080 2081 2082 2083 2084
		} else {
			const struct tnode *tn = (const struct tnode *) n;
			int i;

			s->tnodes++;
S
Stephen Hemminger 已提交
2085
			if (tn->bits < MAX_STAT_DEPTH)
R
Robert Olsson 已提交
2086 2087
				s->nodesizes[tn->bits]++;

2088 2089 2090
			for (i = 0; i < (1<<tn->bits); i++)
				if (!tn->child[i])
					s->nullpointers++;
2091 2092
		}
	}
R
Robert Olsson 已提交
2093
	rcu_read_unlock();
2094 2095
}

2096 2097 2098 2099
/*
 *	This outputs /proc/net/fib_triestats
 */
static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat)
2100
{
E
Eric Dumazet 已提交
2101
	unsigned int i, max, pointers, bytes, avdepth;
2102

2103 2104 2105 2106
	if (stat->leaves)
		avdepth = stat->totdepth*100 / stat->leaves;
	else
		avdepth = 0;
O
Olof Johansson 已提交
2107

2108 2109
	seq_printf(seq, "\tAver depth:     %u.%02d\n",
		   avdepth / 100, avdepth % 100);
2110
	seq_printf(seq, "\tMax depth:      %u\n", stat->maxdepth);
O
Olof Johansson 已提交
2111

2112 2113
	seq_printf(seq, "\tLeaves:         %u\n", stat->leaves);
	bytes = sizeof(struct leaf) * stat->leaves;
2114 2115 2116 2117

	seq_printf(seq, "\tPrefixes:       %u\n", stat->prefixes);
	bytes += sizeof(struct leaf_info) * stat->prefixes;

2118
	seq_printf(seq, "\tInternal nodes: %u\n\t", stat->tnodes);
2119
	bytes += sizeof(struct tnode) * stat->tnodes;
2120

R
Robert Olsson 已提交
2121 2122
	max = MAX_STAT_DEPTH;
	while (max > 0 && stat->nodesizes[max-1] == 0)
2123
		max--;
2124

2125 2126 2127
	pointers = 0;
	for (i = 1; i <= max; i++)
		if (stat->nodesizes[i] != 0) {
2128
			seq_printf(seq, "  %u: %u",  i, stat->nodesizes[i]);
2129 2130 2131
			pointers += (1<<i) * stat->nodesizes[i];
		}
	seq_putc(seq, '\n');
2132
	seq_printf(seq, "\tPointers: %u\n", pointers);
R
Robert Olsson 已提交
2133

2134
	bytes += sizeof(struct rt_trie_node *) * pointers;
2135 2136
	seq_printf(seq, "Null ptrs: %u\n", stat->nullpointers);
	seq_printf(seq, "Total size: %u  kB\n", (bytes + 1023) / 1024);
2137
}
R
Robert Olsson 已提交
2138

2139
#ifdef CONFIG_IP_FIB_TRIE_STATS
2140 2141 2142 2143
static void trie_show_usage(struct seq_file *seq,
			    const struct trie_use_stats *stats)
{
	seq_printf(seq, "\nCounters:\n---------\n");
2144 2145 2146 2147 2148 2149 2150 2151 2152
	seq_printf(seq, "gets = %u\n", stats->gets);
	seq_printf(seq, "backtracks = %u\n", stats->backtrack);
	seq_printf(seq, "semantic match passed = %u\n",
		   stats->semantic_match_passed);
	seq_printf(seq, "semantic match miss = %u\n",
		   stats->semantic_match_miss);
	seq_printf(seq, "null node hit= %u\n", stats->null_node_hit);
	seq_printf(seq, "skipped node resize = %u\n\n",
		   stats->resize_node_skipped);
2153
}
2154 2155
#endif /*  CONFIG_IP_FIB_TRIE_STATS */

2156
static void fib_table_print(struct seq_file *seq, struct fib_table *tb)
2157
{
2158 2159 2160 2161 2162 2163
	if (tb->tb_id == RT_TABLE_LOCAL)
		seq_puts(seq, "Local:\n");
	else if (tb->tb_id == RT_TABLE_MAIN)
		seq_puts(seq, "Main:\n");
	else
		seq_printf(seq, "Id %d:\n", tb->tb_id);
2164
}
2165

2166

2167 2168
static int fib_triestat_seq_show(struct seq_file *seq, void *v)
{
2169
	struct net *net = (struct net *)seq->private;
2170
	unsigned int h;
2171

2172
	seq_printf(seq,
2173 2174
		   "Basic info: size of leaf:"
		   " %Zd bytes, size of tnode: %Zd bytes.\n",
2175 2176
		   sizeof(struct leaf), sizeof(struct tnode));

2177 2178 2179 2180 2181 2182 2183 2184
	for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
		struct hlist_head *head = &net->ipv4.fib_table_hash[h];
		struct hlist_node *node;
		struct fib_table *tb;

		hlist_for_each_entry_rcu(tb, node, head, tb_hlist) {
			struct trie *t = (struct trie *) tb->tb_data;
			struct trie_stat stat;
2185

2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197
			if (!t)
				continue;

			fib_table_print(seq, tb);

			trie_collect_stats(t, &stat);
			trie_show_stats(seq, &stat);
#ifdef CONFIG_IP_FIB_TRIE_STATS
			trie_show_usage(seq, &t->stats);
#endif
		}
	}
2198

2199
	return 0;
2200 2201
}

2202
static int fib_triestat_seq_open(struct inode *inode, struct file *file)
2203
{
2204
	return single_open_net(inode, file, fib_triestat_seq_show);
2205 2206
}

2207
static const struct file_operations fib_triestat_fops = {
2208 2209 2210 2211
	.owner	= THIS_MODULE,
	.open	= fib_triestat_seq_open,
	.read	= seq_read,
	.llseek	= seq_lseek,
2212
	.release = single_release_net,
2213 2214
};

2215
static struct rt_trie_node *fib_trie_get_idx(struct seq_file *seq, loff_t pos)
2216
{
2217 2218
	struct fib_trie_iter *iter = seq->private;
	struct net *net = seq_file_net(seq);
2219
	loff_t idx = 0;
2220
	unsigned int h;
2221

2222 2223 2224 2225
	for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
		struct hlist_head *head = &net->ipv4.fib_table_hash[h];
		struct hlist_node *node;
		struct fib_table *tb;
2226

2227
		hlist_for_each_entry_rcu(tb, node, head, tb_hlist) {
2228
			struct rt_trie_node *n;
2229 2230 2231 2232 2233 2234 2235 2236 2237

			for (n = fib_trie_get_first(iter,
						    (struct trie *) tb->tb_data);
			     n; n = fib_trie_get_next(iter))
				if (pos == idx++) {
					iter->tb = tb;
					return n;
				}
		}
2238
	}
2239

2240 2241 2242
	return NULL;
}

2243
static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos)
2244
	__acquires(RCU)
2245
{
2246
	rcu_read_lock();
2247
	return fib_trie_get_idx(seq, *pos);
2248 2249
}

2250
static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2251
{
2252
	struct fib_trie_iter *iter = seq->private;
2253
	struct net *net = seq_file_net(seq);
2254 2255 2256
	struct fib_table *tb = iter->tb;
	struct hlist_node *tb_node;
	unsigned int h;
2257
	struct rt_trie_node *n;
2258

2259
	++*pos;
2260 2261 2262 2263
	/* next node in same table */
	n = fib_trie_get_next(iter);
	if (n)
		return n;
2264

2265 2266 2267 2268 2269 2270 2271 2272
	/* walk rest of this hash chain */
	h = tb->tb_id & (FIB_TABLE_HASHSZ - 1);
	while ( (tb_node = rcu_dereference(tb->tb_hlist.next)) ) {
		tb = hlist_entry(tb_node, struct fib_table, tb_hlist);
		n = fib_trie_get_first(iter, (struct trie *) tb->tb_data);
		if (n)
			goto found;
	}
2273

2274 2275 2276 2277 2278 2279 2280 2281 2282
	/* new hash chain */
	while (++h < FIB_TABLE_HASHSZ) {
		struct hlist_head *head = &net->ipv4.fib_table_hash[h];
		hlist_for_each_entry_rcu(tb, tb_node, head, tb_hlist) {
			n = fib_trie_get_first(iter, (struct trie *) tb->tb_data);
			if (n)
				goto found;
		}
	}
2283
	return NULL;
2284 2285 2286 2287

found:
	iter->tb = tb;
	return n;
2288
}
2289

2290
static void fib_trie_seq_stop(struct seq_file *seq, void *v)
2291
	__releases(RCU)
2292
{
2293 2294
	rcu_read_unlock();
}
O
Olof Johansson 已提交
2295

2296 2297
static void seq_indent(struct seq_file *seq, int n)
{
E
Eric Dumazet 已提交
2298 2299
	while (n-- > 0)
		seq_puts(seq, "   ");
2300
}
2301

2302
static inline const char *rtn_scope(char *buf, size_t len, enum rt_scope_t s)
2303
{
S
Stephen Hemminger 已提交
2304
	switch (s) {
2305 2306 2307 2308 2309 2310
	case RT_SCOPE_UNIVERSE: return "universe";
	case RT_SCOPE_SITE:	return "site";
	case RT_SCOPE_LINK:	return "link";
	case RT_SCOPE_HOST:	return "host";
	case RT_SCOPE_NOWHERE:	return "nowhere";
	default:
2311
		snprintf(buf, len, "scope=%d", s);
2312 2313 2314
		return buf;
	}
}
2315

2316
static const char *const rtn_type_names[__RTN_MAX] = {
2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329
	[RTN_UNSPEC] = "UNSPEC",
	[RTN_UNICAST] = "UNICAST",
	[RTN_LOCAL] = "LOCAL",
	[RTN_BROADCAST] = "BROADCAST",
	[RTN_ANYCAST] = "ANYCAST",
	[RTN_MULTICAST] = "MULTICAST",
	[RTN_BLACKHOLE] = "BLACKHOLE",
	[RTN_UNREACHABLE] = "UNREACHABLE",
	[RTN_PROHIBIT] = "PROHIBIT",
	[RTN_THROW] = "THROW",
	[RTN_NAT] = "NAT",
	[RTN_XRESOLVE] = "XRESOLVE",
};
2330

E
Eric Dumazet 已提交
2331
static inline const char *rtn_type(char *buf, size_t len, unsigned int t)
2332 2333 2334
{
	if (t < __RTN_MAX && rtn_type_names[t])
		return rtn_type_names[t];
2335
	snprintf(buf, len, "type %u", t);
2336
	return buf;
2337 2338
}

2339 2340
/* Pretty print the trie */
static int fib_trie_seq_show(struct seq_file *seq, void *v)
2341
{
2342
	const struct fib_trie_iter *iter = seq->private;
2343
	struct rt_trie_node *n = v;
2344

2345 2346
	if (!node_parent_rcu(n))
		fib_table_print(seq, iter->tb);
2347

2348 2349
	if (IS_TNODE(n)) {
		struct tnode *tn = (struct tnode *) n;
S
Stephen Hemminger 已提交
2350
		__be32 prf = htonl(mask_pfx(tn->key, tn->pos));
O
Olof Johansson 已提交
2351

2352
		seq_indent(seq, iter->depth-1);
2353 2354
		seq_printf(seq, "  +-- %pI4/%d %d %d %d\n",
			   &prf, tn->pos, tn->bits, tn->full_children,
2355
			   tn->empty_children);
2356

2357 2358
	} else {
		struct leaf *l = (struct leaf *) n;
2359 2360
		struct leaf_info *li;
		struct hlist_node *node;
A
Al Viro 已提交
2361
		__be32 val = htonl(l->key);
2362 2363

		seq_indent(seq, iter->depth);
2364
		seq_printf(seq, "  |-- %pI4\n", &val);
2365 2366 2367 2368 2369 2370 2371 2372 2373 2374

		hlist_for_each_entry_rcu(li, node, &l->list, hlist) {
			struct fib_alias *fa;

			list_for_each_entry_rcu(fa, &li->falh, fa_list) {
				char buf1[32], buf2[32];

				seq_indent(seq, iter->depth+1);
				seq_printf(seq, "  /%d %s %s", li->plen,
					   rtn_scope(buf1, sizeof(buf1),
2375
						     fa->fa_info->fib_scope),
2376 2377 2378
					   rtn_type(buf2, sizeof(buf2),
						    fa->fa_type));
				if (fa->fa_tos)
2379
					seq_printf(seq, " tos=%d", fa->fa_tos);
2380
				seq_putc(seq, '\n');
2381 2382
			}
		}
2383
	}
2384

2385 2386 2387
	return 0;
}

2388
static const struct seq_operations fib_trie_seq_ops = {
2389 2390 2391 2392
	.start  = fib_trie_seq_start,
	.next   = fib_trie_seq_next,
	.stop   = fib_trie_seq_stop,
	.show   = fib_trie_seq_show,
2393 2394
};

2395
static int fib_trie_seq_open(struct inode *inode, struct file *file)
2396
{
2397 2398
	return seq_open_net(inode, file, &fib_trie_seq_ops,
			    sizeof(struct fib_trie_iter));
2399 2400
}

2401
static const struct file_operations fib_trie_fops = {
2402 2403 2404 2405
	.owner  = THIS_MODULE,
	.open   = fib_trie_seq_open,
	.read   = seq_read,
	.llseek = seq_lseek,
2406
	.release = seq_release_net,
2407 2408
};

2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448
struct fib_route_iter {
	struct seq_net_private p;
	struct trie *main_trie;
	loff_t	pos;
	t_key	key;
};

static struct leaf *fib_route_get_idx(struct fib_route_iter *iter, loff_t pos)
{
	struct leaf *l = NULL;
	struct trie *t = iter->main_trie;

	/* use cache location of last found key */
	if (iter->pos > 0 && pos >= iter->pos && (l = fib_find_node(t, iter->key)))
		pos -= iter->pos;
	else {
		iter->pos = 0;
		l = trie_firstleaf(t);
	}

	while (l && pos-- > 0) {
		iter->pos++;
		l = trie_nextleaf(l);
	}

	if (l)
		iter->key = pos;	/* remember it */
	else
		iter->pos = 0;		/* forget it */

	return l;
}

static void *fib_route_seq_start(struct seq_file *seq, loff_t *pos)
	__acquires(RCU)
{
	struct fib_route_iter *iter = seq->private;
	struct fib_table *tb;

	rcu_read_lock();
2449
	tb = fib_get_table(seq_file_net(seq), RT_TABLE_MAIN);
2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486
	if (!tb)
		return NULL;

	iter->main_trie = (struct trie *) tb->tb_data;
	if (*pos == 0)
		return SEQ_START_TOKEN;
	else
		return fib_route_get_idx(iter, *pos - 1);
}

static void *fib_route_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct fib_route_iter *iter = seq->private;
	struct leaf *l = v;

	++*pos;
	if (v == SEQ_START_TOKEN) {
		iter->pos = 0;
		l = trie_firstleaf(iter->main_trie);
	} else {
		iter->pos++;
		l = trie_nextleaf(l);
	}

	if (l)
		iter->key = l->key;
	else
		iter->pos = 0;
	return l;
}

static void fib_route_seq_stop(struct seq_file *seq, void *v)
	__releases(RCU)
{
	rcu_read_unlock();
}

E
Eric Dumazet 已提交
2487
static unsigned int fib_flag_trans(int type, __be32 mask, const struct fib_info *fi)
2488
{
E
Eric Dumazet 已提交
2489
	unsigned int flags = 0;
2490

E
Eric Dumazet 已提交
2491 2492
	if (type == RTN_UNREACHABLE || type == RTN_PROHIBIT)
		flags = RTF_REJECT;
2493 2494
	if (fi && fi->fib_nh->nh_gw)
		flags |= RTF_GATEWAY;
A
Al Viro 已提交
2495
	if (mask == htonl(0xFFFFFFFF))
2496 2497 2498
		flags |= RTF_HOST;
	flags |= RTF_UP;
	return flags;
2499 2500
}

2501 2502 2503
/*
 *	This outputs /proc/net/route.
 *	The format of the file is not supposed to be changed
E
Eric Dumazet 已提交
2504
 *	and needs to be same as fib_hash output to avoid breaking
2505 2506 2507
 *	legacy utilities
 */
static int fib_route_seq_show(struct seq_file *seq, void *v)
2508
{
2509
	struct leaf *l = v;
2510 2511
	struct leaf_info *li;
	struct hlist_node *node;
2512

2513 2514 2515 2516 2517 2518
	if (v == SEQ_START_TOKEN) {
		seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway "
			   "\tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU"
			   "\tWindow\tIRTT");
		return 0;
	}
2519

2520
	hlist_for_each_entry_rcu(li, node, &l->list, hlist) {
2521
		struct fib_alias *fa;
A
Al Viro 已提交
2522
		__be32 mask, prefix;
O
Olof Johansson 已提交
2523

2524 2525
		mask = inet_make_mask(li->plen);
		prefix = htonl(l->key);
2526

2527
		list_for_each_entry_rcu(fa, &li->falh, fa_list) {
2528
			const struct fib_info *fi = fa->fa_info;
E
Eric Dumazet 已提交
2529
			unsigned int flags = fib_flag_trans(fa->fa_type, mask, fi);
2530
			int len;
2531

2532 2533 2534
			if (fa->fa_type == RTN_BROADCAST
			    || fa->fa_type == RTN_MULTICAST)
				continue;
2535

2536
			if (fi)
2537 2538 2539
				seq_printf(seq,
					 "%s\t%08X\t%08X\t%04X\t%d\t%u\t"
					 "%d\t%08X\t%d\t%u\t%u%n",
2540 2541 2542 2543 2544
					 fi->fib_dev ? fi->fib_dev->name : "*",
					 prefix,
					 fi->fib_nh->nh_gw, flags, 0, 0,
					 fi->fib_priority,
					 mask,
2545 2546
					 (fi->fib_advmss ?
					  fi->fib_advmss + 40 : 0),
2547
					 fi->fib_window,
2548
					 fi->fib_rtt >> 3, &len);
2549
			else
2550 2551 2552
				seq_printf(seq,
					 "*\t%08X\t%08X\t%04X\t%d\t%u\t"
					 "%d\t%08X\t%d\t%u\t%u%n",
2553
					 prefix, 0, flags, 0, 0, 0,
2554
					 mask, 0, 0, 0, &len);
2555

2556
			seq_printf(seq, "%*s\n", 127 - len, "");
2557
		}
2558 2559 2560 2561 2562
	}

	return 0;
}

2563
static const struct seq_operations fib_route_seq_ops = {
2564 2565 2566
	.start  = fib_route_seq_start,
	.next   = fib_route_seq_next,
	.stop   = fib_route_seq_stop,
2567
	.show   = fib_route_seq_show,
2568 2569
};

2570
static int fib_route_seq_open(struct inode *inode, struct file *file)
2571
{
2572
	return seq_open_net(inode, file, &fib_route_seq_ops,
2573
			    sizeof(struct fib_route_iter));
2574 2575
}

2576
static const struct file_operations fib_route_fops = {
2577 2578 2579 2580
	.owner  = THIS_MODULE,
	.open   = fib_route_seq_open,
	.read   = seq_read,
	.llseek = seq_lseek,
2581
	.release = seq_release_net,
2582 2583
};

2584
int __net_init fib_proc_init(struct net *net)
2585
{
2586
	if (!proc_net_fops_create(net, "fib_trie", S_IRUGO, &fib_trie_fops))
2587 2588
		goto out1;

2589 2590
	if (!proc_net_fops_create(net, "fib_triestat", S_IRUGO,
				  &fib_triestat_fops))
2591 2592
		goto out2;

2593
	if (!proc_net_fops_create(net, "route", S_IRUGO, &fib_route_fops))
2594 2595
		goto out3;

2596
	return 0;
2597 2598

out3:
2599
	proc_net_remove(net, "fib_triestat");
2600
out2:
2601
	proc_net_remove(net, "fib_trie");
2602 2603
out1:
	return -ENOMEM;
2604 2605
}

2606
void __net_exit fib_proc_exit(struct net *net)
2607
{
2608 2609 2610
	proc_net_remove(net, "fib_trie");
	proc_net_remove(net, "fib_triestat");
	proc_net_remove(net, "route");
2611 2612 2613
}

#endif /* CONFIG_PROC_FS */