fib_trie.c 57.0 KB
Newer Older
1 2 3 4 5 6 7 8 9
/*
 *   This program is free software; you can redistribute it and/or
 *   modify it under the terms of the GNU General Public License
 *   as published by the Free Software Foundation; either version
 *   2 of the License, or (at your option) any later version.
 *
 *   Robert Olsson <robert.olsson@its.uu.se> Uppsala Universitet
 *     & Swedish University of Agricultural Sciences.
 *
10
 *   Jens Laas <jens.laas@data.slu.se> Swedish University of
11
 *     Agricultural Sciences.
12
 *
13 14
 *   Hans Liss <hans.liss@its.uu.se>  Uppsala Universitet
 *
L
Lucas De Marchi 已提交
15
 * This work is based on the LPC-trie which is originally described in:
16
 *
17 18
 * An experimental study of compression methods for dynamic tries
 * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
19
 * http://www.csc.kth.se/~snilsson/software/dyntrie2/
20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41
 *
 *
 * IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
 * IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999
 *
 *
 * Code from fib_hash has been reused which includes the following header:
 *
 *
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		IPv4 FIB: lookup engine and maintenance routines.
 *
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
R
Robert Olsson 已提交
42 43 44 45 46 47 48
 *
 * Substantial contributions to this work comes from:
 *
 *		David S. Miller, <davem@davemloft.net>
 *		Stephen Hemminger <shemminger@osdl.org>
 *		Paul E. McKenney <paulmck@us.ibm.com>
 *		Patrick McHardy <kaber@trash.net>
49 50
 */

J
Jens Låås 已提交
51
#define VERSION "0.409"
52 53

#include <asm/uaccess.h>
J
Jiri Slaby 已提交
54
#include <linux/bitops.h>
55 56 57 58 59 60 61 62 63
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
S
Stephen Hemminger 已提交
64
#include <linux/inetdevice.h>
65 66 67
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/proc_fs.h>
R
Robert Olsson 已提交
68
#include <linux/rcupdate.h>
69 70 71 72
#include <linux/skbuff.h>
#include <linux/netlink.h>
#include <linux/init.h>
#include <linux/list.h>
73
#include <linux/slab.h>
74
#include <linux/export.h>
75
#include <net/net_namespace.h>
76 77 78 79 80 81 82 83
#include <net/ip.h>
#include <net/protocol.h>
#include <net/route.h>
#include <net/tcp.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include "fib_lookup.h"

R
Robert Olsson 已提交
84
#define MAX_STAT_DEPTH 32
85 86 87 88 89

#define KEYLENGTH (8*sizeof(t_key))

typedef unsigned int t_key;

90 91
#define IS_TNODE(n) ((n)->bits)
#define IS_LEAF(n) (!(n)->bits)
R
Robert Olsson 已提交
92

93 94 95
#define get_shift(_kv) (KEYLENGTH - (_kv)->pos - (_kv)->bits)
#define get_index(_key, _kv) (((_key) ^ (_kv)->key) >> get_shift(_kv))

96 97 98 99 100
struct tnode {
	t_key key;
	unsigned char bits;		/* 2log(KEYLENGTH) bits needed */
	unsigned char pos;		/* 2log(KEYLENGTH) bits needed */
	struct tnode __rcu *parent;
101
	struct rcu_head rcu;
A
Alexander Duyck 已提交
102 103 104 105 106 107 108 109 110 111
	union {
		/* The fields in this struct are valid if bits > 0 (TNODE) */
		struct {
			unsigned int full_children;  /* KEYLENGTH bits needed */
			unsigned int empty_children; /* KEYLENGTH bits needed */
			struct tnode __rcu *child[0];
		};
		/* This list pointer if valid if bits == 0 (LEAF) */
		struct hlist_head list;
	};
112 113 114 115 116
};

struct leaf_info {
	struct hlist_node hlist;
	int plen;
117
	u32 mask_plen; /* ntohl(inet_make_mask(plen)) */
118
	struct list_head falh;
119
	struct rcu_head rcu;
120 121 122 123 124 125 126 127 128
};

#ifdef CONFIG_IP_FIB_TRIE_STATS
struct trie_use_stats {
	unsigned int gets;
	unsigned int backtrack;
	unsigned int semantic_match_passed;
	unsigned int semantic_match_miss;
	unsigned int null_node_hit;
129
	unsigned int resize_node_skipped;
130 131 132 133 134 135 136 137 138
};
#endif

struct trie_stat {
	unsigned int totdepth;
	unsigned int maxdepth;
	unsigned int tnodes;
	unsigned int leaves;
	unsigned int nullpointers;
139
	unsigned int prefixes;
R
Robert Olsson 已提交
140
	unsigned int nodesizes[MAX_STAT_DEPTH];
141
};
142 143

struct trie {
A
Alexander Duyck 已提交
144
	struct tnode __rcu *trie;
145
#ifdef CONFIG_IP_FIB_TRIE_STATS
146
	struct trie_use_stats __percpu *stats;
147 148 149
#endif
};

A
Alexander Duyck 已提交
150
static void tnode_put_child_reorg(struct tnode *tn, int i, struct tnode *n,
151
				  int wasfull);
A
Alexander Duyck 已提交
152
static struct tnode *resize(struct trie *t, struct tnode *tn);
153 154
static struct tnode *inflate(struct trie *t, struct tnode *tn);
static struct tnode *halve(struct trie *t, struct tnode *tn);
J
Jarek Poplawski 已提交
155
/* tnodes to free after resize(); protected by RTNL */
156
static struct callback_head *tnode_free_head;
157 158 159 160 161 162 163 164
static size_t tnode_free_size;

/*
 * synchronize_rcu after call_rcu for that many pages; it should be especially
 * useful before resizing the root node with PREEMPT_NONE configs; the value was
 * obtained experimentally, aiming to avoid visible slowdown.
 */
static const int sync_pages = 128;
165

166
static struct kmem_cache *fn_alias_kmem __read_mostly;
167
static struct kmem_cache *trie_leaf_kmem __read_mostly;
168

169 170
/* caller must hold RTNL */
#define node_parent(n) rtnl_dereference((n)->parent)
E
Eric Dumazet 已提交
171

172 173
/* caller must hold RCU read lock or RTNL */
#define node_parent_rcu(n) rcu_dereference_rtnl((n)->parent)
E
Eric Dumazet 已提交
174

175
/* wrapper for rcu_assign_pointer */
A
Alexander Duyck 已提交
176
static inline void node_set_parent(struct tnode *n, struct tnode *tp)
177
{
A
Alexander Duyck 已提交
178 179
	if (n)
		rcu_assign_pointer(n->parent, tp);
S
Stephen Hemminger 已提交
180 181
}

182 183 184 185
#define NODE_INIT_PARENT(n, p) RCU_INIT_POINTER((n)->parent, p)

/* This provides us with the number of children in this node, in the case of a
 * leaf this will return 0 meaning none of the children are accessible.
186
 */
187
static inline int tnode_child_length(const struct tnode *tn)
S
Stephen Hemminger 已提交
188
{
189
	return (1ul << tn->bits) & ~(1ul);
S
Stephen Hemminger 已提交
190
}
R
Robert Olsson 已提交
191

E
Eric Dumazet 已提交
192 193 194
/*
 * caller must hold RTNL
 */
A
Alexander Duyck 已提交
195
static inline struct tnode *tnode_get_child(const struct tnode *tn, unsigned int i)
196
{
197
	BUG_ON(i >= tnode_child_length(tn));
R
Robert Olsson 已提交
198

E
Eric Dumazet 已提交
199
	return rtnl_dereference(tn->child[i]);
200 201
}

E
Eric Dumazet 已提交
202 203 204
/*
 * caller must hold RCU read lock or RTNL
 */
A
Alexander Duyck 已提交
205
static inline struct tnode *tnode_get_child_rcu(const struct tnode *tn, unsigned int i)
206
{
207
	BUG_ON(i >= tnode_child_length(tn));
208

E
Eric Dumazet 已提交
209
	return rcu_dereference_rtnl(tn->child[i]);
210 211
}

212
static inline t_key mask_pfx(t_key k, unsigned int l)
S
Stephen Hemminger 已提交
213 214 215 216
{
	return (l == 0) ? 0 : k >> (KEYLENGTH-l) << (KEYLENGTH-l);
}

217
static inline t_key tkey_extract_bits(t_key a, unsigned int offset, unsigned int bits)
218
{
O
Olof Johansson 已提交
219
	if (offset < KEYLENGTH)
220
		return ((t_key)(a << offset)) >> (KEYLENGTH - bits);
O
Olof Johansson 已提交
221
	else
222 223 224 225
		return 0;
}

/*
226 227
  To understand this stuff, an understanding of keys and all their bits is
  necessary. Every node in the trie has a key associated with it, but not
228 229 230 231
  all of the bits in that key are significant.

  Consider a node 'n' and its parent 'tp'.

232 233 234 235 236
  If n is a leaf, every bit in its key is significant. Its presence is
  necessitated by path compression, since during a tree traversal (when
  searching for a leaf - unless we are doing an insertion) we will completely
  ignore all skipped bits we encounter. Thus we need to verify, at the end of
  a potentially successful search, that we have indeed been walking the
237 238
  correct key path.

239 240 241 242 243
  Note that we can never "miss" the correct key in the tree if present by
  following the wrong path. Path compression ensures that segments of the key
  that are the same for all keys with a given prefix are skipped, but the
  skipped part *is* identical for each node in the subtrie below the skipped
  bit! trie_insert() in this implementation takes care of that - note the
244 245
  call to tkey_sub_equals() in trie_insert().

246
  if n is an internal node - a 'tnode' here, the various parts of its key
247 248
  have many different meanings.

249
  Example:
250 251 252
  _________________________________________________________________
  | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
  -----------------------------------------------------------------
253
    0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15
254 255 256 257 258 259 260 261 262

  _________________________________________________________________
  | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
  -----------------------------------------------------------------
   16  17  18  19  20  21  22  23  24  25  26  27  28  29  30  31

  tp->pos = 7
  tp->bits = 3
  n->pos = 15
O
Olof Johansson 已提交
263
  n->bits = 4
264

265 266
  First, let's just ignore the bits that come before the parent tp, that is
  the bits from 0 to (tp->pos-1). They are *known* but at this point we do
267 268 269
  not use them for anything.

  The bits from (tp->pos) to (tp->pos + tp->bits - 1) - "N", above - are the
270
  index into the parent's child array. That is, they will be used to find
271 272 273 274 275
  'n' among tp's children.

  The bits from (tp->pos + tp->bits) to (n->pos - 1) - "S" - are skipped bits
  for the node n.

276
  All the bits we have seen so far are significant to the node n. The rest
277 278
  of the bits are really not needed or indeed known in n->key.

279
  The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into
280
  n's child array, and will of course be different for each child.
281

282

283 284 285 286 287
  The rest of the bits, from (n->pos + n->bits) onward, are completely unknown
  at this point.

*/

288 289
static const int halve_threshold = 25;
static const int inflate_threshold = 50;
290
static const int halve_threshold_root = 15;
J
Jens Låås 已提交
291
static const int inflate_threshold_root = 30;
R
Robert Olsson 已提交
292 293

static void __alias_free_mem(struct rcu_head *head)
294
{
R
Robert Olsson 已提交
295 296
	struct fib_alias *fa = container_of(head, struct fib_alias, rcu);
	kmem_cache_free(fn_alias_kmem, fa);
297 298
}

R
Robert Olsson 已提交
299
static inline void alias_free_mem_rcu(struct fib_alias *fa)
300
{
R
Robert Olsson 已提交
301 302
	call_rcu(&fa->rcu, __alias_free_mem);
}
O
Olof Johansson 已提交
303

304
#define TNODE_KMALLOC_MAX \
A
Alexander Duyck 已提交
305
	ilog2((PAGE_SIZE - sizeof(struct tnode)) / sizeof(struct tnode *))
O
Olof Johansson 已提交
306

307
static void __node_free_rcu(struct rcu_head *head)
308
{
A
Alexander Duyck 已提交
309
	struct tnode *n = container_of(head, struct tnode, rcu);
310 311 312 313 314 315 316

	if (IS_LEAF(n))
		kmem_cache_free(trie_leaf_kmem, n);
	else if (n->bits <= TNODE_KMALLOC_MAX)
		kfree(n);
	else
		vfree(n);
317 318
}

319 320
#define node_free(n) call_rcu(&n->rcu, __node_free_rcu)

R
Robert Olsson 已提交
321
static inline void free_leaf_info(struct leaf_info *leaf)
322
{
323
	kfree_rcu(leaf, rcu);
324 325
}

326
static struct tnode *tnode_alloc(size_t size)
327
{
R
Robert Olsson 已提交
328
	if (size <= PAGE_SIZE)
329
		return kzalloc(size, GFP_KERNEL);
330
	else
331
		return vzalloc(size);
332
}
R
Robert Olsson 已提交
333

J
Jarek Poplawski 已提交
334 335 336
static void tnode_free_safe(struct tnode *tn)
{
	BUG_ON(IS_LEAF(tn));
337 338
	tn->rcu.next = tnode_free_head;
	tnode_free_head = &tn->rcu;
J
Jarek Poplawski 已提交
339 340 341 342
}

static void tnode_free_flush(void)
{
343 344 345 346 347 348 349
	struct callback_head *head;

	while ((head = tnode_free_head)) {
		struct tnode *tn = container_of(head, struct tnode, rcu);

		tnode_free_head = head->next;
		tnode_free_size += offsetof(struct tnode, child[1 << tn->bits]);
J
Jarek Poplawski 已提交
350

351
		node_free(tn);
J
Jarek Poplawski 已提交
352
	}
353 354 355 356 357

	if (tnode_free_size >= PAGE_SIZE * sync_pages) {
		tnode_free_size = 0;
		synchronize_rcu();
	}
J
Jarek Poplawski 已提交
358 359
}

A
Alexander Duyck 已提交
360
static struct tnode *leaf_new(t_key key)
R
Robert Olsson 已提交
361
{
A
Alexander Duyck 已提交
362
	struct tnode *l = kmem_cache_alloc(trie_leaf_kmem, GFP_KERNEL);
R
Robert Olsson 已提交
363
	if (l) {
364 365 366 367 368 369 370 371 372 373
		l->parent = NULL;
		/* set key and pos to reflect full key value
		 * any trailing zeros in the key should be ignored
		 * as the nodes are searched
		 */
		l->key = key;
		l->pos = KEYLENGTH;
		/* set bits to 0 indicating we are not a tnode */
		l->bits = 0;

R
Robert Olsson 已提交
374 375 376 377 378 379 380 381 382 383
		INIT_HLIST_HEAD(&l->list);
	}
	return l;
}

static struct leaf_info *leaf_info_new(int plen)
{
	struct leaf_info *li = kmalloc(sizeof(struct leaf_info),  GFP_KERNEL);
	if (li) {
		li->plen = plen;
384
		li->mask_plen = ntohl(inet_make_mask(plen));
R
Robert Olsson 已提交
385 386 387 388 389
		INIT_LIST_HEAD(&li->falh);
	}
	return li;
}

390
static struct tnode *tnode_new(t_key key, int pos, int bits)
391
{
392
	size_t sz = offsetof(struct tnode, child[1 << bits]);
393
	struct tnode *tn = tnode_alloc(sz);
394 395 396 397
	unsigned int shift = pos + bits;

	/* verify bits and pos their msb bits clear and values are valid */
	BUG_ON(!bits || (shift > KEYLENGTH));
398

O
Olof Johansson 已提交
399
	if (tn) {
400
		tn->parent = NULL;
401 402
		tn->pos = pos;
		tn->bits = bits;
403
		tn->key = mask_pfx(key, pos);
404 405 406
		tn->full_children = 0;
		tn->empty_children = 1<<bits;
	}
407

E
Eric Dumazet 已提交
408
	pr_debug("AT %p s=%zu %zu\n", tn, sizeof(struct tnode),
A
Alexander Duyck 已提交
409
		 sizeof(struct tnode *) << bits);
410 411 412 413 414 415 416 417
	return tn;
}

/*
 * Check whether a tnode 'n' is "full", i.e. it is an internal node
 * and no bits are skipped. See discussion in dyntree paper p. 6
 */

A
Alexander Duyck 已提交
418
static inline int tnode_full(const struct tnode *tn, const struct tnode *n)
419
{
420
	return n && IS_TNODE(n) && (n->pos == (tn->pos + tn->bits));
421 422
}

L
Lin Ming 已提交
423
static inline void put_child(struct tnode *tn, int i,
A
Alexander Duyck 已提交
424
			     struct tnode *n)
425 426 427 428
{
	tnode_put_child_reorg(tn, i, n, -1);
}

429
 /*
430 431 432 433
  * Add a child at position i overwriting the old value.
  * Update the value of full_children and empty_children.
  */

A
Alexander Duyck 已提交
434
static void tnode_put_child_reorg(struct tnode *tn, int i, struct tnode *n,
435
				  int wasfull)
436
{
A
Alexander Duyck 已提交
437
	struct tnode *chi = rtnl_dereference(tn->child[i]);
438 439
	int isfull;

S
Stephen Hemminger 已提交
440 441
	BUG_ON(i >= 1<<tn->bits);

442 443 444 445 446
	/* update emptyChildren */
	if (n == NULL && chi != NULL)
		tn->empty_children++;
	else if (n != NULL && chi == NULL)
		tn->empty_children--;
447

448
	/* update fullChildren */
O
Olof Johansson 已提交
449
	if (wasfull == -1)
450 451 452
		wasfull = tnode_full(tn, chi);

	isfull = tnode_full(tn, n);
453
	if (wasfull && !isfull)
454
		tn->full_children--;
455
	else if (!wasfull && isfull)
456
		tn->full_children++;
O
Olof Johansson 已提交
457

458
	node_set_parent(n, tn);
459

460
	rcu_assign_pointer(tn->child[i], n);
461 462
}

463 464 465 466 467 468 469 470 471
static void put_child_root(struct tnode *tp, struct trie *t,
			   t_key key, struct tnode *n)
{
	if (tp)
		put_child(tp, get_index(key, tp), n);
	else
		rcu_assign_pointer(t->trie, n);
}

J
Jens Låås 已提交
472
#define MAX_WORK 10
A
Alexander Duyck 已提交
473
static struct tnode *resize(struct trie *t, struct tnode *tn)
474
{
A
Alexander Duyck 已提交
475
	struct tnode *old_tn, *n = NULL;
476 477
	int inflate_threshold_use;
	int halve_threshold_use;
J
Jens Låås 已提交
478
	int max_work;
479

480
	if (!tn)
481 482
		return NULL;

S
Stephen Hemminger 已提交
483 484
	pr_debug("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
		 tn, inflate_threshold, halve_threshold);
485 486

	/* No children */
487 488 489
	if (tn->empty_children > (tnode_child_length(tn) - 1))
		goto no_children;

490
	/* One child */
491
	if (tn->empty_children == (tnode_child_length(tn) - 1))
J
Jens Låås 已提交
492
		goto one_child;
493
	/*
494 495 496 497 498
	 * Double as long as the resulting node has a number of
	 * nonempty nodes that are above the threshold.
	 */

	/*
499 500
	 * From "Implementing a dynamic compressed trie" by Stefan Nilsson of
	 * the Helsinki University of Technology and Matti Tikkanen of Nokia
501
	 * Telecommunications, page 6:
502
	 * "A node is doubled if the ratio of non-empty children to all
503 504
	 * children in the *doubled* node is at least 'high'."
	 *
505 506 507 508 509
	 * 'high' in this instance is the variable 'inflate_threshold'. It
	 * is expressed as a percentage, so we multiply it with
	 * tnode_child_length() and instead of multiplying by 2 (since the
	 * child array will be doubled by inflate()) and multiplying
	 * the left-hand side by 100 (to handle the percentage thing) we
510
	 * multiply the left-hand side by 50.
511 512 513 514
	 *
	 * The left-hand side may look a bit weird: tnode_child_length(tn)
	 * - tn->empty_children is of course the number of non-null children
	 * in the current node. tn->full_children is the number of "full"
515
	 * children, that is non-null tnodes with a skip value of 0.
516
	 * All of those will be doubled in the resulting inflated tnode, so
517
	 * we just count them one extra time here.
518
	 *
519
	 * A clearer way to write this would be:
520
	 *
521
	 * to_be_doubled = tn->full_children;
522
	 * not_to_be_doubled = tnode_child_length(tn) - tn->empty_children -
523 524 525 526
	 *     tn->full_children;
	 *
	 * new_child_length = tnode_child_length(tn) * 2;
	 *
527
	 * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) /
528 529
	 *      new_child_length;
	 * if (new_fill_factor >= inflate_threshold)
530 531 532
	 *
	 * ...and so on, tho it would mess up the while () loop.
	 *
533 534 535
	 * anyway,
	 * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >=
	 *      inflate_threshold
536
	 *
537 538 539
	 * avoid a division:
	 * 100 * (not_to_be_doubled + 2*to_be_doubled) >=
	 *      inflate_threshold * new_child_length
540
	 *
541
	 * expand not_to_be_doubled and to_be_doubled, and shorten:
542
	 * 100 * (tnode_child_length(tn) - tn->empty_children +
O
Olof Johansson 已提交
543
	 *    tn->full_children) >= inflate_threshold * new_child_length
544
	 *
545
	 * expand new_child_length:
546
	 * 100 * (tnode_child_length(tn) - tn->empty_children +
O
Olof Johansson 已提交
547
	 *    tn->full_children) >=
548
	 *      inflate_threshold * tnode_child_length(tn) * 2
549
	 *
550
	 * shorten again:
551
	 * 50 * (tn->full_children + tnode_child_length(tn) -
O
Olof Johansson 已提交
552
	 *    tn->empty_children) >= inflate_threshold *
553
	 *    tnode_child_length(tn)
554
	 *
555 556
	 */

557 558
	/* Keep root node larger  */

559
	if (!node_parent(tn)) {
J
Jens Låås 已提交
560 561
		inflate_threshold_use = inflate_threshold_root;
		halve_threshold_use = halve_threshold_root;
E
Eric Dumazet 已提交
562
	} else {
563
		inflate_threshold_use = inflate_threshold;
J
Jens Låås 已提交
564 565
		halve_threshold_use = halve_threshold;
	}
566

J
Jens Låås 已提交
567 568
	max_work = MAX_WORK;
	while ((tn->full_children > 0 &&  max_work-- &&
569 570 571
		50 * (tn->full_children + tnode_child_length(tn)
		      - tn->empty_children)
		>= inflate_threshold_use * tnode_child_length(tn))) {
572

573 574
		old_tn = tn;
		tn = inflate(t, tn);
575

576 577
		if (IS_ERR(tn)) {
			tn = old_tn;
578
#ifdef CONFIG_IP_FIB_TRIE_STATS
579
			this_cpu_inc(t->stats->resize_node_skipped);
580 581 582
#endif
			break;
		}
583 584
	}

J
Jens Låås 已提交
585
	/* Return if at least one inflate is run */
E
Eric Dumazet 已提交
586
	if (max_work != MAX_WORK)
A
Alexander Duyck 已提交
587
		return tn;
J
Jens Låås 已提交
588

589 590 591 592
	/*
	 * Halve as long as the number of empty children in this
	 * node is above threshold.
	 */
593

J
Jens Låås 已提交
594 595
	max_work = MAX_WORK;
	while (tn->bits > 1 &&  max_work-- &&
596
	       100 * (tnode_child_length(tn) - tn->empty_children) <
597
	       halve_threshold_use * tnode_child_length(tn)) {
598

599 600 601 602
		old_tn = tn;
		tn = halve(t, tn);
		if (IS_ERR(tn)) {
			tn = old_tn;
603
#ifdef CONFIG_IP_FIB_TRIE_STATS
604
			this_cpu_inc(t->stats->resize_node_skipped);
605 606 607 608
#endif
			break;
		}
	}
609

610

611
	/* Only one child remains */
612 613
	if (tn->empty_children == (tnode_child_length(tn) - 1)) {
		unsigned long i;
J
Jens Låås 已提交
614
one_child:
615 616 617 618 619 620 621
		for (i = tnode_child_length(tn); !n && i;)
			n = tnode_get_child(tn, --i);
no_children:
		/* compress one level */
		node_set_parent(n, NULL);
		tnode_free_safe(tn);
		return n;
J
Jens Låås 已提交
622
	}
A
Alexander Duyck 已提交
623
	return tn;
624 625
}

E
Eric Dumazet 已提交
626 627 628

static void tnode_clean_free(struct tnode *tn)
{
A
Alexander Duyck 已提交
629
	struct tnode *tofree;
E
Eric Dumazet 已提交
630 631 632
	int i;

	for (i = 0; i < tnode_child_length(tn); i++) {
633
		tofree = rtnl_dereference(tn->child[i]);
E
Eric Dumazet 已提交
634
		if (tofree)
635
			node_free(tofree);
E
Eric Dumazet 已提交
636
	}
637
	node_free(tn);
E
Eric Dumazet 已提交
638 639
}

A
Alexander Duyck 已提交
640
static struct tnode *inflate(struct trie *t, struct tnode *oldtnode)
641
{
A
Alexander Duyck 已提交
642 643
	int olen = tnode_child_length(oldtnode);
	struct tnode *tn;
644 645
	int i;

S
Stephen Hemminger 已提交
646
	pr_debug("In inflate\n");
647 648 649

	tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1);

S
Stephen Hemminger 已提交
650
	if (!tn)
651
		return ERR_PTR(-ENOMEM);
652 653

	/*
654 655 656
	 * Preallocate and store tnodes before the actual work so we
	 * don't get into an inconsistent state if memory allocation
	 * fails. In case of failure we return the oldnode and  inflate
657 658
	 * of tnode is ignored.
	 */
O
Olof Johansson 已提交
659 660

	for (i = 0; i < olen; i++) {
661
		struct tnode *inode;
662

A
Alexander Duyck 已提交
663 664
		inode = tnode_get_child(oldtnode, i);
		if (tnode_full(oldtnode, inode) && inode->bits > 1) {
665
			struct tnode *left, *right;
S
Stephen Hemminger 已提交
666
			t_key m = ~0U << (KEYLENGTH - 1) >> inode->pos;
667

668 669
			left = tnode_new(inode->key&(~m), inode->pos + 1,
					 inode->bits - 1);
670 671
			if (!left)
				goto nomem;
O
Olof Johansson 已提交
672

673 674 675
			right = tnode_new(inode->key|m, inode->pos + 1,
					  inode->bits - 1);

676
			if (!right) {
677
				node_free(left);
678
				goto nomem;
679
			}
680

A
Alexander Duyck 已提交
681 682
			put_child(tn, 2*i, left);
			put_child(tn, 2*i+1, right);
683 684 685
		}
	}

O
Olof Johansson 已提交
686
	for (i = 0; i < olen; i++) {
A
Alexander Duyck 已提交
687
		struct tnode *inode = tnode_get_child(oldtnode, i);
O
Olof Johansson 已提交
688 689
		struct tnode *left, *right;
		int size, j;
690

691
		/* An empty child */
A
Alexander Duyck 已提交
692
		if (inode == NULL)
693 694 695
			continue;

		/* A leaf or an internal node with skipped bits */
A
Alexander Duyck 已提交
696
		if (!tnode_full(oldtnode, inode)) {
697
			put_child(tn,
A
Alexander Duyck 已提交
698 699
				tkey_extract_bits(inode->key, tn->pos, tn->bits),
				inode);
700 701 702 703 704
			continue;
		}

		/* An internal node with two children */
		if (inode->bits == 1) {
L
Lin Ming 已提交
705 706
			put_child(tn, 2*i, rtnl_dereference(inode->child[0]));
			put_child(tn, 2*i+1, rtnl_dereference(inode->child[1]));
707

J
Jarek Poplawski 已提交
708
			tnode_free_safe(inode);
O
Olof Johansson 已提交
709
			continue;
710 711
		}

O
Olof Johansson 已提交
712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729
		/* An internal node with more than two children */

		/* We will replace this node 'inode' with two new
		 * ones, 'left' and 'right', each with half of the
		 * original children. The two new nodes will have
		 * a position one bit further down the key and this
		 * means that the "significant" part of their keys
		 * (see the discussion near the top of this file)
		 * will differ by one bit, which will be "0" in
		 * left's key and "1" in right's key. Since we are
		 * moving the key position by one step, the bit that
		 * we are moving away from - the bit at position
		 * (inode->pos) - is the one that will differ between
		 * left and right. So... we synthesize that bit in the
		 * two  new keys.
		 * The mask 'm' below will be a single "one" bit at
		 * the position (inode->pos)
		 */
730

O
Olof Johansson 已提交
731 732 733
		/* Use the old key, but set the new significant
		 *   bit to zero.
		 */
734

A
Alexander Duyck 已提交
735
		left = tnode_get_child(tn, 2*i);
L
Lin Ming 已提交
736
		put_child(tn, 2*i, NULL);
737

O
Olof Johansson 已提交
738
		BUG_ON(!left);
739

A
Alexander Duyck 已提交
740
		right = tnode_get_child(tn, 2*i+1);
L
Lin Ming 已提交
741
		put_child(tn, 2*i+1, NULL);
742

O
Olof Johansson 已提交
743
		BUG_ON(!right);
744

O
Olof Johansson 已提交
745 746
		size = tnode_child_length(left);
		for (j = 0; j < size; j++) {
L
Lin Ming 已提交
747 748
			put_child(left, j, rtnl_dereference(inode->child[j]));
			put_child(right, j, rtnl_dereference(inode->child[j + size]));
749
		}
L
Lin Ming 已提交
750 751
		put_child(tn, 2*i, resize(t, left));
		put_child(tn, 2*i+1, resize(t, right));
O
Olof Johansson 已提交
752

J
Jarek Poplawski 已提交
753
		tnode_free_safe(inode);
754
	}
J
Jarek Poplawski 已提交
755
	tnode_free_safe(oldtnode);
756
	return tn;
757
nomem:
E
Eric Dumazet 已提交
758 759
	tnode_clean_free(tn);
	return ERR_PTR(-ENOMEM);
760 761
}

A
Alexander Duyck 已提交
762
static struct tnode *halve(struct trie *t, struct tnode *oldtnode)
763
{
A
Alexander Duyck 已提交
764 765
	int olen = tnode_child_length(oldtnode);
	struct tnode *tn, *left, *right;
766 767
	int i;

S
Stephen Hemminger 已提交
768
	pr_debug("In halve\n");
769 770

	tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1);
771

772 773
	if (!tn)
		return ERR_PTR(-ENOMEM);
774 775

	/*
776 777 778
	 * Preallocate and store tnodes before the actual work so we
	 * don't get into an inconsistent state if memory allocation
	 * fails. In case of failure we return the oldnode and halve
779 780 781
	 * of tnode is ignored.
	 */

O
Olof Johansson 已提交
782
	for (i = 0; i < olen; i += 2) {
783 784
		left = tnode_get_child(oldtnode, i);
		right = tnode_get_child(oldtnode, i+1);
785

786
		/* Two nonempty children */
S
Stephen Hemminger 已提交
787
		if (left && right) {
788
			struct tnode *newn;
S
Stephen Hemminger 已提交
789

790
			newn = tnode_new(left->key, tn->pos + tn->bits, 1);
S
Stephen Hemminger 已提交
791 792

			if (!newn)
793
				goto nomem;
S
Stephen Hemminger 已提交
794

A
Alexander Duyck 已提交
795
			put_child(tn, i/2, newn);
796 797 798
		}

	}
799

O
Olof Johansson 已提交
800 801 802
	for (i = 0; i < olen; i += 2) {
		struct tnode *newBinNode;

803 804
		left = tnode_get_child(oldtnode, i);
		right = tnode_get_child(oldtnode, i+1);
805

806 807 808 809
		/* At least one of the children is empty */
		if (left == NULL) {
			if (right == NULL)    /* Both are empty */
				continue;
L
Lin Ming 已提交
810
			put_child(tn, i/2, right);
O
Olof Johansson 已提交
811
			continue;
S
Stephen Hemminger 已提交
812
		}
O
Olof Johansson 已提交
813 814

		if (right == NULL) {
L
Lin Ming 已提交
815
			put_child(tn, i/2, left);
O
Olof Johansson 已提交
816 817
			continue;
		}
818

819
		/* Two nonempty children */
A
Alexander Duyck 已提交
820
		newBinNode = tnode_get_child(tn, i/2);
L
Lin Ming 已提交
821 822 823 824
		put_child(tn, i/2, NULL);
		put_child(newBinNode, 0, left);
		put_child(newBinNode, 1, right);
		put_child(tn, i/2, resize(t, newBinNode));
825
	}
J
Jarek Poplawski 已提交
826
	tnode_free_safe(oldtnode);
827
	return tn;
828
nomem:
E
Eric Dumazet 已提交
829 830
	tnode_clean_free(tn);
	return ERR_PTR(-ENOMEM);
831 832
}

R
Robert Olsson 已提交
833
/* readside must use rcu_read_lock currently dump routines
R
Robert Olsson 已提交
834 835
 via get_fa_head and dump */

A
Alexander Duyck 已提交
836
static struct leaf_info *find_leaf_info(struct tnode *l, int plen)
837
{
R
Robert Olsson 已提交
838
	struct hlist_head *head = &l->list;
839 840
	struct leaf_info *li;

841
	hlist_for_each_entry_rcu(li, head, hlist)
842
		if (li->plen == plen)
843
			return li;
O
Olof Johansson 已提交
844

845 846 847
	return NULL;
}

A
Alexander Duyck 已提交
848
static inline struct list_head *get_fa_head(struct tnode *l, int plen)
849
{
R
Robert Olsson 已提交
850
	struct leaf_info *li = find_leaf_info(l, plen);
851

O
Olof Johansson 已提交
852 853
	if (!li)
		return NULL;
854

O
Olof Johansson 已提交
855
	return &li->falh;
856 857 858 859
}

static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new)
{
860 861 862 863 864
	struct leaf_info *li = NULL, *last = NULL;

	if (hlist_empty(head)) {
		hlist_add_head_rcu(&new->hlist, head);
	} else {
865
		hlist_for_each_entry(li, head, hlist) {
866 867 868 869 870 871
			if (new->plen > li->plen)
				break;

			last = li;
		}
		if (last)
872
			hlist_add_behind_rcu(&new->hlist, &last->hlist);
873 874 875
		else
			hlist_add_before_rcu(&new->hlist, &li->hlist);
	}
876 877
}

R
Robert Olsson 已提交
878
/* rcu_read_lock needs to be hold by caller from readside */
A
Alexander Duyck 已提交
879
static struct tnode *fib_find_node(struct trie *t, u32 key)
880
{
A
Alexander Duyck 已提交
881
	struct tnode *n = rcu_dereference_rtnl(t->trie);
A
Alexander Duyck 已提交
882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900

	while (n) {
		unsigned long index = get_index(key, n);

		/* This bit of code is a bit tricky but it combines multiple
		 * checks into a single check.  The prefix consists of the
		 * prefix plus zeros for the bits in the cindex. The index
		 * is the difference between the key and this value.  From
		 * this we can actually derive several pieces of data.
		 *   if !(index >> bits)
		 *     we know the value is cindex
		 *   else
		 *     we have a mismatch in skip bits and failed
		 */
		if (index >> n->bits)
			return NULL;

		/* we have found a leaf. Prefixes have already been compared */
		if (IS_LEAF(n))
901 902
			break;

A
Alexander Duyck 已提交
903 904
		n = rcu_dereference_rtnl(n->child[index]);
	}
O
Olof Johansson 已提交
905

A
Alexander Duyck 已提交
906
	return n;
907 908
}

909
static void trie_rebalance(struct trie *t, struct tnode *tn)
910 911
{
	int wasfull;
R
Robert Olsson 已提交
912
	t_key cindex, key;
S
Stephen Hemminger 已提交
913
	struct tnode *tp;
914

R
Robert Olsson 已提交
915 916
	key = tn->key;

917
	while (tn != NULL && (tp = node_parent(tn)) != NULL) {
918 919
		cindex = tkey_extract_bits(key, tp->pos, tp->bits);
		wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
A
Alexander Duyck 已提交
920
		tn = resize(t, tn);
921

A
Alexander Duyck 已提交
922
		tnode_put_child_reorg(tp, cindex, tn, wasfull);
O
Olof Johansson 已提交
923

924
		tp = node_parent(tn);
925
		if (!tp)
A
Alexander Duyck 已提交
926
			rcu_assign_pointer(t->trie, tn);
927

J
Jarek Poplawski 已提交
928
		tnode_free_flush();
S
Stephen Hemminger 已提交
929
		if (!tp)
930
			break;
S
Stephen Hemminger 已提交
931
		tn = tp;
932
	}
S
Stephen Hemminger 已提交
933

934
	/* Handle last (top) tnode */
935
	if (IS_TNODE(tn))
A
Alexander Duyck 已提交
936
		tn = resize(t, tn);
937

A
Alexander Duyck 已提交
938
	rcu_assign_pointer(t->trie, tn);
939
	tnode_free_flush();
940 941
}

R
Robert Olsson 已提交
942 943
/* only used from updater-side */

944
static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
945
{
946
	struct list_head *fa_head = NULL;
947
	struct tnode *l, *n, *tp = NULL;
948 949
	struct leaf_info *li;

950 951 952 953 954
	li = leaf_info_new(plen);
	if (!li)
		return NULL;
	fa_head = &li->falh;

E
Eric Dumazet 已提交
955
	n = rtnl_dereference(t->trie);
956

957 958
	/* If we point to NULL, stop. Either the tree is empty and we should
	 * just put a new leaf in if, or we have reached an empty child slot,
959 960
	 * and we should just put our new leaf in that.
	 *
961 962 963
	 * If we hit a node with a key that does't match then we should stop
	 * and create a new tnode to replace that node and insert ourselves
	 * and the other node into the new tnode.
964
	 */
965 966
	while (n) {
		unsigned long index = get_index(key, n);
967

968 969 970 971 972 973 974 975 976 977 978
		/* This bit of code is a bit tricky but it combines multiple
		 * checks into a single check.  The prefix consists of the
		 * prefix plus zeros for the "bits" in the prefix. The index
		 * is the difference between the key and this value.  From
		 * this we can actually derive several pieces of data.
		 *   if !(index >> bits)
		 *     we know the value is child index
		 *   else
		 *     we have a mismatch in skip bits and failed
		 */
		if (index >> n->bits)
979 980
			break;

981 982 983 984 985 986
		/* we have found a leaf. Prefixes have already been compared */
		if (IS_LEAF(n)) {
			/* Case 1: n is a leaf, and prefixes match*/
			insert_leaf_info(&n->list, li);
			return fa_head;
		}
987

988 989
		tp = n;
		n = rcu_dereference_rtnl(n->child[index]);
990 991
	}

992 993 994
	l = leaf_new(key);
	if (!l) {
		free_leaf_info(li);
995
		return NULL;
996
	}
997 998 999

	insert_leaf_info(&l->list, li);

1000 1001 1002 1003 1004 1005 1006 1007 1008
	/* Case 2: n is a LEAF or a TNODE and the key doesn't match.
	 *
	 *  Add a new tnode here
	 *  first tnode need some special handling
	 *  leaves us in position for handling as case 3
	 */
	if (n) {
		struct tnode *tn;
		int newpos;
1009

1010
		newpos = KEYLENGTH - __fls(n->key ^ key) - 1;
1011

1012
		tn = tnode_new(key, newpos, 1);
1013
		if (!tn) {
1014
			free_leaf_info(li);
1015
			node_free(l);
1016
			return NULL;
O
Olof Johansson 已提交
1017 1018
		}

1019 1020 1021
		/* initialize routes out of node */
		NODE_INIT_PARENT(tn, tp);
		put_child(tn, get_index(key, tn) ^ 1, n);
1022

1023 1024 1025
		/* start adding routes into the node */
		put_child_root(tp, t, key, tn);
		node_set_parent(n, tn);
1026

1027
		/* parent now has a NULL spot where the leaf can go */
1028
		tp = tn;
1029
	}
O
Olof Johansson 已提交
1030

1031 1032 1033 1034 1035 1036 1037 1038
	/* Case 3: n is NULL, and will just insert a new leaf */
	if (tp) {
		NODE_INIT_PARENT(l, tp);
		put_child(tp, get_index(key, tp), l);
		trie_rebalance(t, tp);
	} else {
		rcu_assign_pointer(t->trie, l);
	}
R
Robert Olsson 已提交
1039

1040 1041 1042
	return fa_head;
}

1043 1044 1045
/*
 * Caller must hold RTNL.
 */
1046
int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
1047 1048 1049
{
	struct trie *t = (struct trie *) tb->tb_data;
	struct fib_alias *fa, *new_fa;
1050
	struct list_head *fa_head = NULL;
1051
	struct fib_info *fi;
1052 1053
	int plen = cfg->fc_dst_len;
	u8 tos = cfg->fc_tos;
1054 1055
	u32 key, mask;
	int err;
A
Alexander Duyck 已提交
1056
	struct tnode *l;
1057 1058 1059 1060

	if (plen > 32)
		return -EINVAL;

1061
	key = ntohl(cfg->fc_dst);
1062

1063
	pr_debug("Insert table=%u %08x/%d\n", tb->tb_id, key, plen);
1064

O
Olof Johansson 已提交
1065
	mask = ntohl(inet_make_mask(plen));
1066

1067
	if (key & ~mask)
1068 1069 1070 1071
		return -EINVAL;

	key = key & mask;

1072 1073 1074
	fi = fib_create_info(cfg);
	if (IS_ERR(fi)) {
		err = PTR_ERR(fi);
1075
		goto err;
1076
	}
1077 1078

	l = fib_find_node(t, key);
1079
	fa = NULL;
1080

1081
	if (l) {
1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096
		fa_head = get_fa_head(l, plen);
		fa = fib_find_alias(fa_head, tos, fi->fib_priority);
	}

	/* Now fa, if non-NULL, points to the first fib alias
	 * with the same keys [prefix,tos,priority], if such key already
	 * exists or to the node before which we will insert new one.
	 *
	 * If fa is NULL, we will need to allocate a new one and
	 * insert to the head of f.
	 *
	 * If f is NULL, no fib node matched the destination key
	 * and we need to allocate a new one of those as well.
	 */

1097 1098 1099
	if (fa && fa->fa_tos == tos &&
	    fa->fa_info->fib_priority == fi->fib_priority) {
		struct fib_alias *fa_first, *fa_match;
1100 1101

		err = -EEXIST;
1102
		if (cfg->fc_nlflags & NLM_F_EXCL)
1103 1104
			goto out;

1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124
		/* We have 2 goals:
		 * 1. Find exact match for type, scope, fib_info to avoid
		 * duplicate routes
		 * 2. Find next 'fa' (or head), NLM_F_APPEND inserts before it
		 */
		fa_match = NULL;
		fa_first = fa;
		fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
		list_for_each_entry_continue(fa, fa_head, fa_list) {
			if (fa->fa_tos != tos)
				break;
			if (fa->fa_info->fib_priority != fi->fib_priority)
				break;
			if (fa->fa_type == cfg->fc_type &&
			    fa->fa_info == fi) {
				fa_match = fa;
				break;
			}
		}

1125
		if (cfg->fc_nlflags & NLM_F_REPLACE) {
1126 1127 1128
			struct fib_info *fi_drop;
			u8 state;

1129 1130 1131 1132
			fa = fa_first;
			if (fa_match) {
				if (fa == fa_match)
					err = 0;
1133
				goto out;
1134
			}
R
Robert Olsson 已提交
1135
			err = -ENOBUFS;
1136
			new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
R
Robert Olsson 已提交
1137 1138
			if (new_fa == NULL)
				goto out;
1139 1140

			fi_drop = fa->fa_info;
R
Robert Olsson 已提交
1141 1142
			new_fa->fa_tos = fa->fa_tos;
			new_fa->fa_info = fi;
1143
			new_fa->fa_type = cfg->fc_type;
1144
			state = fa->fa_state;
1145
			new_fa->fa_state = state & ~FA_S_ACCESSED;
1146

R
Robert Olsson 已提交
1147 1148
			list_replace_rcu(&fa->fa_list, &new_fa->fa_list);
			alias_free_mem_rcu(fa);
1149 1150 1151

			fib_release_info(fi_drop);
			if (state & FA_S_ACCESSED)
1152
				rt_cache_flush(cfg->fc_nlinfo.nl_net);
1153 1154
			rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen,
				tb->tb_id, &cfg->fc_nlinfo, NLM_F_REPLACE);
1155

O
Olof Johansson 已提交
1156
			goto succeeded;
1157 1158 1159 1160 1161
		}
		/* Error if we find a perfect match which
		 * uses the same scope, type, and nexthop
		 * information.
		 */
1162 1163
		if (fa_match)
			goto out;
1164

1165
		if (!(cfg->fc_nlflags & NLM_F_APPEND))
1166
			fa = fa_first;
1167 1168
	}
	err = -ENOENT;
1169
	if (!(cfg->fc_nlflags & NLM_F_CREATE))
1170 1171 1172
		goto out;

	err = -ENOBUFS;
1173
	new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
1174 1175 1176 1177 1178
	if (new_fa == NULL)
		goto out;

	new_fa->fa_info = fi;
	new_fa->fa_tos = tos;
1179
	new_fa->fa_type = cfg->fc_type;
1180 1181 1182 1183 1184
	new_fa->fa_state = 0;
	/*
	 * Insert new entry to the list.
	 */

1185
	if (!fa_head) {
1186 1187 1188
		fa_head = fib_insert_node(t, key, plen);
		if (unlikely(!fa_head)) {
			err = -ENOMEM;
1189
			goto out_free_new_fa;
1190
		}
1191
	}
1192

1193 1194 1195
	if (!plen)
		tb->tb_num_default++;

R
Robert Olsson 已提交
1196 1197
	list_add_tail_rcu(&new_fa->fa_list,
			  (fa ? &fa->fa_list : fa_head));
1198

1199
	rt_cache_flush(cfg->fc_nlinfo.nl_net);
1200
	rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id,
1201
		  &cfg->fc_nlinfo, 0);
1202 1203
succeeded:
	return 0;
1204 1205 1206

out_free_new_fa:
	kmem_cache_free(fn_alias_kmem, new_fa);
1207 1208
out:
	fib_release_info(fi);
O
Olof Johansson 已提交
1209
err:
1210 1211 1212
	return err;
}

R
Robert Olsson 已提交
1213
/* should be called with rcu_read_lock */
A
Alexander Duyck 已提交
1214
static int check_leaf(struct fib_table *tb, struct trie *t, struct tnode *l,
1215
		      t_key key,  const struct flowi4 *flp,
E
Eric Dumazet 已提交
1216
		      struct fib_result *res, int fib_flags)
1217 1218 1219
{
	struct leaf_info *li;
	struct hlist_head *hhead = &l->list;
1220

1221
	hlist_for_each_entry_rcu(li, hhead, hlist) {
1222
		struct fib_alias *fa;
1223

1224
		if (l->key != (key & li->mask_plen))
1225 1226
			continue;

1227 1228 1229
		list_for_each_entry_rcu(fa, &li->falh, fa_list) {
			struct fib_info *fi = fa->fa_info;
			int nhsel, err;
1230

1231
			if (fa->fa_tos && fa->fa_tos != flp->flowi4_tos)
1232
				continue;
1233 1234
			if (fi->fib_dead)
				continue;
1235
			if (fa->fa_info->fib_scope < flp->flowi4_scope)
1236 1237 1238
				continue;
			fib_alias_accessed(fa);
			err = fib_props[fa->fa_type].error;
1239
			if (unlikely(err < 0)) {
1240
#ifdef CONFIG_IP_FIB_TRIE_STATS
1241
				this_cpu_inc(t->stats->semantic_match_passed);
1242
#endif
1243
				return err;
1244 1245 1246 1247 1248 1249 1250 1251
			}
			if (fi->fib_flags & RTNH_F_DEAD)
				continue;
			for (nhsel = 0; nhsel < fi->fib_nhs; nhsel++) {
				const struct fib_nh *nh = &fi->fib_nh[nhsel];

				if (nh->nh_flags & RTNH_F_DEAD)
					continue;
1252
				if (flp->flowi4_oif && flp->flowi4_oif != nh->nh_oif)
1253 1254 1255
					continue;

#ifdef CONFIG_IP_FIB_TRIE_STATS
1256
				this_cpu_inc(t->stats->semantic_match_passed);
1257
#endif
1258
				res->prefixlen = li->plen;
1259 1260
				res->nh_sel = nhsel;
				res->type = fa->fa_type;
1261
				res->scope = fi->fib_scope;
1262 1263 1264 1265
				res->fi = fi;
				res->table = tb;
				res->fa_head = &li->falh;
				if (!(fib_flags & FIB_LOOKUP_NOREF))
1266
					atomic_inc(&fi->fib_clntref);
1267 1268 1269 1270 1271
				return 0;
			}
		}

#ifdef CONFIG_IP_FIB_TRIE_STATS
1272
		this_cpu_inc(t->stats->semantic_match_miss);
1273 1274
#endif
	}
1275

1276
	return 1;
1277 1278
}

1279 1280 1281 1282 1283 1284 1285
static inline t_key prefix_mismatch(t_key key, struct tnode *n)
{
	t_key prefix = n->key;

	return (key ^ prefix) & (prefix | -prefix);
}

1286
int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp,
E
Eric Dumazet 已提交
1287
		     struct fib_result *res, int fib_flags)
1288
{
1289
	struct trie *t = (struct trie *)tb->tb_data;
1290 1291 1292
#ifdef CONFIG_IP_FIB_TRIE_STATS
	struct trie_use_stats __percpu *stats = t->stats;
#endif
1293 1294 1295 1296
	const t_key key = ntohl(flp->daddr);
	struct tnode *n, *pn;
	t_key cindex;
	int ret = 1;
O
Olof Johansson 已提交
1297

R
Robert Olsson 已提交
1298
	rcu_read_lock();
O
Olof Johansson 已提交
1299

R
Robert Olsson 已提交
1300
	n = rcu_dereference(t->trie);
1301
	if (!n)
1302 1303 1304
		goto failed;

#ifdef CONFIG_IP_FIB_TRIE_STATS
1305
	this_cpu_inc(stats->gets);
1306 1307
#endif

A
Alexander Duyck 已提交
1308
	pn = n;
1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326
	cindex = 0;

	/* Step 1: Travel to the longest prefix match in the trie */
	for (;;) {
		unsigned long index = get_index(key, n);

		/* This bit of code is a bit tricky but it combines multiple
		 * checks into a single check.  The prefix consists of the
		 * prefix plus zeros for the "bits" in the prefix. The index
		 * is the difference between the key and this value.  From
		 * this we can actually derive several pieces of data.
		 *   if !(index >> bits)
		 *     we know the value is child index
		 *   else
		 *     we have a mismatch in skip bits and failed
		 */
		if (index >> n->bits)
			break;
1327

1328 1329
		/* we have found a leaf. Prefixes have already been compared */
		if (IS_LEAF(n))
1330
			goto found;
1331

1332 1333
		/* only record pn and cindex if we are going to be chopping
		 * bits later.  Otherwise we are just wasting cycles.
O
Olof Johansson 已提交
1334
		 */
1335 1336 1337
		if (index) {
			pn = n;
			cindex = index;
O
Olof Johansson 已提交
1338
		}
1339

1340 1341 1342 1343
		n = rcu_dereference(n->child[index]);
		if (unlikely(!n))
			goto backtrace;
	}
1344

1345 1346 1347 1348
	/* Step 2: Sort out leaves and begin backtracing for longest prefix */
	for (;;) {
		/* record the pointer where our next node pointer is stored */
		struct tnode __rcu **cptr = n->child;
1349

1350 1351 1352
		/* This test verifies that none of the bits that differ
		 * between the key and the prefix exist in the region of
		 * the lsb and higher in the prefix.
O
Olof Johansson 已提交
1353
		 */
1354 1355
		if (unlikely(prefix_mismatch(key, n)))
			goto backtrace;
O
Olof Johansson 已提交
1356

1357 1358 1359
		/* exit out and process leaf */
		if (unlikely(IS_LEAF(n)))
			break;
O
Olof Johansson 已提交
1360

1361 1362 1363
		/* Don't bother recording parent info.  Since we are in
		 * prefix match mode we will have to come back to wherever
		 * we started this traversal anyway
O
Olof Johansson 已提交
1364 1365
		 */

1366
		while ((n = rcu_dereference(*cptr)) == NULL) {
1367 1368
backtrace:
#ifdef CONFIG_IP_FIB_TRIE_STATS
1369 1370
			if (!n)
				this_cpu_inc(stats->null_node_hit);
1371
#endif
1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394
			/* If we are at cindex 0 there are no more bits for
			 * us to strip at this level so we must ascend back
			 * up one level to see if there are any more bits to
			 * be stripped there.
			 */
			while (!cindex) {
				t_key pkey = pn->key;

				pn = node_parent_rcu(pn);
				if (unlikely(!pn))
					goto failed;
#ifdef CONFIG_IP_FIB_TRIE_STATS
				this_cpu_inc(stats->backtrack);
#endif
				/* Get Child's index */
				cindex = get_index(pkey, pn);
			}

			/* strip the least significant bit from the cindex */
			cindex &= cindex - 1;

			/* grab pointer for next child node */
			cptr = &pn->child[cindex];
1395
		}
1396
	}
1397

1398
found:
1399 1400 1401 1402 1403
	/* Step 3: Process the leaf, if that fails fall back to backtracing */
	ret = check_leaf(tb, t, n, key, flp, res, fib_flags);
	if (unlikely(ret > 0))
		goto backtrace;
failed:
R
Robert Olsson 已提交
1404
	rcu_read_unlock();
1405 1406
	return ret;
}
1407
EXPORT_SYMBOL_GPL(fib_table_lookup);
1408

1409 1410 1411
/*
 * Remove the leaf and return parent.
 */
A
Alexander Duyck 已提交
1412
static void trie_leaf_remove(struct trie *t, struct tnode *l)
1413
{
1414
	struct tnode *tp = node_parent(l);
1415

1416
	pr_debug("entering trie_leaf_remove(%p)\n", l);
1417

1418
	if (tp) {
1419
		put_child(tp, get_index(l->key, tp), NULL);
1420
		trie_rebalance(t, tp);
1421
	} else {
1422
		RCU_INIT_POINTER(t->trie, NULL);
1423
	}
1424

1425
	node_free(l);
1426 1427
}

1428 1429 1430
/*
 * Caller must hold RTNL.
 */
1431
int fib_table_delete(struct fib_table *tb, struct fib_config *cfg)
1432 1433 1434
{
	struct trie *t = (struct trie *) tb->tb_data;
	u32 key, mask;
1435 1436
	int plen = cfg->fc_dst_len;
	u8 tos = cfg->fc_tos;
1437 1438
	struct fib_alias *fa, *fa_to_delete;
	struct list_head *fa_head;
A
Alexander Duyck 已提交
1439
	struct tnode *l;
O
Olof Johansson 已提交
1440 1441
	struct leaf_info *li;

1442
	if (plen > 32)
1443 1444
		return -EINVAL;

1445
	key = ntohl(cfg->fc_dst);
O
Olof Johansson 已提交
1446
	mask = ntohl(inet_make_mask(plen));
1447

1448
	if (key & ~mask)
1449 1450 1451 1452 1453
		return -EINVAL;

	key = key & mask;
	l = fib_find_node(t, key);

1454
	if (!l)
1455 1456
		return -ESRCH;

1457 1458 1459 1460 1461 1462
	li = find_leaf_info(l, plen);

	if (!li)
		return -ESRCH;

	fa_head = &li->falh;
1463 1464 1465 1466 1467
	fa = fib_find_alias(fa_head, tos, 0);

	if (!fa)
		return -ESRCH;

S
Stephen Hemminger 已提交
1468
	pr_debug("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t);
1469 1470

	fa_to_delete = NULL;
1471 1472
	fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
	list_for_each_entry_continue(fa, fa_head, fa_list) {
1473 1474 1475 1476 1477
		struct fib_info *fi = fa->fa_info;

		if (fa->fa_tos != tos)
			break;

1478 1479
		if ((!cfg->fc_type || fa->fa_type == cfg->fc_type) &&
		    (cfg->fc_scope == RT_SCOPE_NOWHERE ||
1480
		     fa->fa_info->fib_scope == cfg->fc_scope) &&
1481 1482
		    (!cfg->fc_prefsrc ||
		     fi->fib_prefsrc == cfg->fc_prefsrc) &&
1483 1484 1485
		    (!cfg->fc_protocol ||
		     fi->fib_protocol == cfg->fc_protocol) &&
		    fib_nh_match(cfg, fi) == 0) {
1486 1487 1488 1489 1490
			fa_to_delete = fa;
			break;
		}
	}

O
Olof Johansson 已提交
1491 1492
	if (!fa_to_delete)
		return -ESRCH;
1493

O
Olof Johansson 已提交
1494
	fa = fa_to_delete;
1495
	rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id,
1496
		  &cfg->fc_nlinfo, 0);
O
Olof Johansson 已提交
1497

R
Robert Olsson 已提交
1498
	list_del_rcu(&fa->fa_list);
1499

1500 1501 1502
	if (!plen)
		tb->tb_num_default--;

O
Olof Johansson 已提交
1503
	if (list_empty(fa_head)) {
R
Robert Olsson 已提交
1504
		hlist_del_rcu(&li->hlist);
O
Olof Johansson 已提交
1505
		free_leaf_info(li);
R
Robert Olsson 已提交
1506
	}
1507

O
Olof Johansson 已提交
1508
	if (hlist_empty(&l->list))
1509
		trie_leaf_remove(t, l);
1510

O
Olof Johansson 已提交
1511
	if (fa->fa_state & FA_S_ACCESSED)
1512
		rt_cache_flush(cfg->fc_nlinfo.nl_net);
1513

R
Robert Olsson 已提交
1514 1515
	fib_release_info(fa->fa_info);
	alias_free_mem_rcu(fa);
O
Olof Johansson 已提交
1516
	return 0;
1517 1518
}

1519
static int trie_flush_list(struct list_head *head)
1520 1521 1522 1523 1524 1525 1526
{
	struct fib_alias *fa, *fa_node;
	int found = 0;

	list_for_each_entry_safe(fa, fa_node, head, fa_list) {
		struct fib_info *fi = fa->fa_info;

R
Robert Olsson 已提交
1527 1528 1529 1530
		if (fi && (fi->fib_flags & RTNH_F_DEAD)) {
			list_del_rcu(&fa->fa_list);
			fib_release_info(fa->fa_info);
			alias_free_mem_rcu(fa);
1531 1532 1533 1534 1535 1536
			found++;
		}
	}
	return found;
}

A
Alexander Duyck 已提交
1537
static int trie_flush_leaf(struct tnode *l)
1538 1539 1540
{
	int found = 0;
	struct hlist_head *lih = &l->list;
1541
	struct hlist_node *tmp;
1542 1543
	struct leaf_info *li = NULL;

1544
	hlist_for_each_entry_safe(li, tmp, lih, hlist) {
1545
		found += trie_flush_list(&li->falh);
1546 1547

		if (list_empty(&li->falh)) {
R
Robert Olsson 已提交
1548
			hlist_del_rcu(&li->hlist);
1549 1550 1551 1552 1553 1554
			free_leaf_info(li);
		}
	}
	return found;
}

1555 1556 1557 1558
/*
 * Scan for the next right leaf starting at node p->child[idx]
 * Since we have back pointer, no recursion necessary.
 */
A
Alexander Duyck 已提交
1559
static struct tnode *leaf_walk_rcu(struct tnode *p, struct tnode *c)
1560
{
1561 1562
	do {
		t_key idx;
1563 1564

		if (c)
1565
			idx = tkey_extract_bits(c->key, p->pos, p->bits) + 1;
1566
		else
1567
			idx = 0;
R
Robert Olsson 已提交
1568

1569 1570
		while (idx < 1u << p->bits) {
			c = tnode_get_child_rcu(p, idx++);
R
Robert Olsson 已提交
1571
			if (!c)
O
Olof Johansson 已提交
1572 1573
				continue;

1574
			if (IS_LEAF(c))
A
Alexander Duyck 已提交
1575
				return c;
1576 1577

			/* Rescan start scanning in new node */
A
Alexander Duyck 已提交
1578
			p = c;
1579
			idx = 0;
1580
		}
1581 1582

		/* Node empty, walk back up to parent */
A
Alexander Duyck 已提交
1583
		c = p;
E
Eric Dumazet 已提交
1584
	} while ((p = node_parent_rcu(c)) != NULL);
1585 1586 1587 1588

	return NULL; /* Root of trie */
}

A
Alexander Duyck 已提交
1589
static struct tnode *trie_firstleaf(struct trie *t)
1590
{
A
Alexander Duyck 已提交
1591
	struct tnode *n = rcu_dereference_rtnl(t->trie);
1592 1593 1594 1595 1596

	if (!n)
		return NULL;

	if (IS_LEAF(n))          /* trie is just a leaf */
A
Alexander Duyck 已提交
1597
		return n;
1598 1599 1600 1601

	return leaf_walk_rcu(n, NULL);
}

A
Alexander Duyck 已提交
1602
static struct tnode *trie_nextleaf(struct tnode *l)
1603
{
A
Alexander Duyck 已提交
1604
	struct tnode *p = node_parent_rcu(l);
1605 1606 1607 1608

	if (!p)
		return NULL;	/* trie with just one leaf */

A
Alexander Duyck 已提交
1609
	return leaf_walk_rcu(p, l);
1610 1611
}

A
Alexander Duyck 已提交
1612
static struct tnode *trie_leafindex(struct trie *t, int index)
1613
{
A
Alexander Duyck 已提交
1614
	struct tnode *l = trie_firstleaf(t);
1615

S
Stephen Hemminger 已提交
1616
	while (l && index-- > 0)
1617
		l = trie_nextleaf(l);
S
Stephen Hemminger 已提交
1618

1619 1620 1621 1622
	return l;
}


1623 1624 1625
/*
 * Caller must hold RTNL.
 */
1626
int fib_table_flush(struct fib_table *tb)
1627 1628
{
	struct trie *t = (struct trie *) tb->tb_data;
A
Alexander Duyck 已提交
1629
	struct tnode *l, *ll = NULL;
1630
	int found = 0;
1631

1632
	for (l = trie_firstleaf(t); l; l = trie_nextleaf(l)) {
1633
		found += trie_flush_leaf(l);
1634 1635

		if (ll && hlist_empty(&ll->list))
1636
			trie_leaf_remove(t, ll);
1637 1638 1639 1640
		ll = l;
	}

	if (ll && hlist_empty(&ll->list))
1641
		trie_leaf_remove(t, ll);
1642

S
Stephen Hemminger 已提交
1643
	pr_debug("trie_flush found=%d\n", found);
1644 1645 1646
	return found;
}

1647 1648
void fib_free_table(struct fib_table *tb)
{
1649 1650 1651 1652 1653
#ifdef CONFIG_IP_FIB_TRIE_STATS
	struct trie *t = (struct trie *)tb->tb_data;

	free_percpu(t->stats);
#endif /* CONFIG_IP_FIB_TRIE_STATS */
1654 1655 1656
	kfree(tb);
}

1657 1658
static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah,
			   struct fib_table *tb,
1659 1660 1661 1662
			   struct sk_buff *skb, struct netlink_callback *cb)
{
	int i, s_i;
	struct fib_alias *fa;
A
Al Viro 已提交
1663
	__be32 xkey = htonl(key);
1664

1665
	s_i = cb->args[5];
1666 1667
	i = 0;

R
Robert Olsson 已提交
1668 1669 1670
	/* rcu_read_lock is hold by caller */

	list_for_each_entry_rcu(fa, fah, fa_list) {
1671 1672 1673 1674 1675
		if (i < s_i) {
			i++;
			continue;
		}

1676
		if (fib_dump_info(skb, NETLINK_CB(cb->skb).portid,
1677 1678 1679 1680
				  cb->nlh->nlmsg_seq,
				  RTM_NEWROUTE,
				  tb->tb_id,
				  fa->fa_type,
1681
				  xkey,
1682 1683
				  plen,
				  fa->fa_tos,
1684
				  fa->fa_info, NLM_F_MULTI) < 0) {
1685
			cb->args[5] = i;
1686
			return -1;
O
Olof Johansson 已提交
1687
		}
1688 1689
		i++;
	}
1690
	cb->args[5] = i;
1691 1692 1693
	return skb->len;
}

A
Alexander Duyck 已提交
1694
static int fn_trie_dump_leaf(struct tnode *l, struct fib_table *tb,
1695
			struct sk_buff *skb, struct netlink_callback *cb)
1696
{
1697 1698
	struct leaf_info *li;
	int i, s_i;
1699

1700
	s_i = cb->args[4];
1701
	i = 0;
1702

1703
	/* rcu_read_lock is hold by caller */
1704
	hlist_for_each_entry_rcu(li, &l->list, hlist) {
1705 1706
		if (i < s_i) {
			i++;
1707
			continue;
1708
		}
O
Olof Johansson 已提交
1709

1710
		if (i > s_i)
1711
			cb->args[5] = 0;
1712

1713
		if (list_empty(&li->falh))
1714 1715
			continue;

1716
		if (fn_trie_dump_fa(l->key, li->plen, &li->falh, tb, skb, cb) < 0) {
1717
			cb->args[4] = i;
1718 1719
			return -1;
		}
1720
		i++;
1721
	}
1722

1723
	cb->args[4] = i;
1724 1725 1726
	return skb->len;
}

1727 1728
int fib_table_dump(struct fib_table *tb, struct sk_buff *skb,
		   struct netlink_callback *cb)
1729
{
A
Alexander Duyck 已提交
1730
	struct tnode *l;
1731
	struct trie *t = (struct trie *) tb->tb_data;
1732
	t_key key = cb->args[2];
1733
	int count = cb->args[3];
1734

R
Robert Olsson 已提交
1735
	rcu_read_lock();
1736 1737 1738
	/* Dump starting at last key.
	 * Note: 0.0.0.0/0 (ie default) is first key.
	 */
1739
	if (count == 0)
1740 1741
		l = trie_firstleaf(t);
	else {
1742 1743 1744
		/* Normally, continue from last key, but if that is missing
		 * fallback to using slow rescan
		 */
1745
		l = fib_find_node(t, key);
1746 1747
		if (!l)
			l = trie_leafindex(t, count);
1748
	}
1749

1750 1751
	while (l) {
		cb->args[2] = l->key;
1752
		if (fn_trie_dump_leaf(l, tb, skb, cb) < 0) {
1753
			cb->args[3] = count;
1754 1755
			rcu_read_unlock();
			return -1;
1756
		}
1757

1758
		++count;
1759
		l = trie_nextleaf(l);
1760 1761
		memset(&cb->args[4], 0,
		       sizeof(cb->args) - 4*sizeof(cb->args[0]));
1762
	}
1763
	cb->args[3] = count;
R
Robert Olsson 已提交
1764
	rcu_read_unlock();
1765

1766 1767 1768
	return skb->len;
}

1769
void __init fib_trie_init(void)
1770
{
1771 1772
	fn_alias_kmem = kmem_cache_create("ip_fib_alias",
					  sizeof(struct fib_alias),
1773 1774 1775
					  0, SLAB_PANIC, NULL);

	trie_leaf_kmem = kmem_cache_create("ip_fib_trie",
A
Alexander Duyck 已提交
1776
					   max(sizeof(struct tnode),
1777 1778
					       sizeof(struct leaf_info)),
					   0, SLAB_PANIC, NULL);
1779
}
1780

1781

1782
struct fib_table *fib_trie_table(u32 id)
1783 1784 1785 1786 1787 1788 1789 1790 1791 1792
{
	struct fib_table *tb;
	struct trie *t;

	tb = kmalloc(sizeof(struct fib_table) + sizeof(struct trie),
		     GFP_KERNEL);
	if (tb == NULL)
		return NULL;

	tb->tb_id = id;
1793
	tb->tb_default = -1;
1794
	tb->tb_num_default = 0;
1795 1796

	t = (struct trie *) tb->tb_data;
1797 1798 1799 1800 1801 1802 1803 1804
	RCU_INIT_POINTER(t->trie, NULL);
#ifdef CONFIG_IP_FIB_TRIE_STATS
	t->stats = alloc_percpu(struct trie_use_stats);
	if (!t->stats) {
		kfree(tb);
		tb = NULL;
	}
#endif
1805 1806 1807 1808

	return tb;
}

1809 1810 1811
#ifdef CONFIG_PROC_FS
/* Depth first Trie walk iterator */
struct fib_trie_iter {
1812
	struct seq_net_private p;
1813
	struct fib_table *tb;
1814
	struct tnode *tnode;
E
Eric Dumazet 已提交
1815 1816
	unsigned int index;
	unsigned int depth;
1817
};
1818

A
Alexander Duyck 已提交
1819
static struct tnode *fib_trie_get_next(struct fib_trie_iter *iter)
1820
{
1821
	struct tnode *tn = iter->tnode;
E
Eric Dumazet 已提交
1822
	unsigned int cindex = iter->index;
1823
	struct tnode *p;
1824

1825 1826 1827 1828
	/* A single entry routing table */
	if (!tn)
		return NULL;

1829 1830 1831 1832
	pr_debug("get_next iter={node=%p index=%d depth=%d}\n",
		 iter->tnode, iter->index, iter->depth);
rescan:
	while (cindex < (1<<tn->bits)) {
A
Alexander Duyck 已提交
1833
		struct tnode *n = tnode_get_child_rcu(tn, cindex);
1834

1835 1836 1837 1838 1839 1840
		if (n) {
			if (IS_LEAF(n)) {
				iter->tnode = tn;
				iter->index = cindex + 1;
			} else {
				/* push down one level */
A
Alexander Duyck 已提交
1841
				iter->tnode = n;
1842 1843 1844 1845 1846
				iter->index = 0;
				++iter->depth;
			}
			return n;
		}
1847

1848 1849
		++cindex;
	}
O
Olof Johansson 已提交
1850

1851
	/* Current node exhausted, pop back up */
A
Alexander Duyck 已提交
1852
	p = node_parent_rcu(tn);
1853 1854 1855 1856 1857
	if (p) {
		cindex = tkey_extract_bits(tn->key, p->pos, p->bits)+1;
		tn = p;
		--iter->depth;
		goto rescan;
1858
	}
1859 1860 1861

	/* got root? */
	return NULL;
1862 1863
}

A
Alexander Duyck 已提交
1864
static struct tnode *fib_trie_get_first(struct fib_trie_iter *iter,
1865
				       struct trie *t)
1866
{
A
Alexander Duyck 已提交
1867
	struct tnode *n;
1868

S
Stephen Hemminger 已提交
1869
	if (!t)
1870 1871 1872
		return NULL;

	n = rcu_dereference(t->trie);
1873
	if (!n)
1874
		return NULL;
1875

1876
	if (IS_TNODE(n)) {
A
Alexander Duyck 已提交
1877
		iter->tnode = n;
1878 1879 1880 1881 1882 1883
		iter->index = 0;
		iter->depth = 1;
	} else {
		iter->tnode = NULL;
		iter->index = 0;
		iter->depth = 0;
O
Olof Johansson 已提交
1884
	}
1885 1886

	return n;
1887
}
O
Olof Johansson 已提交
1888

1889 1890
static void trie_collect_stats(struct trie *t, struct trie_stat *s)
{
A
Alexander Duyck 已提交
1891
	struct tnode *n;
1892
	struct fib_trie_iter iter;
O
Olof Johansson 已提交
1893

1894
	memset(s, 0, sizeof(*s));
O
Olof Johansson 已提交
1895

1896
	rcu_read_lock();
1897
	for (n = fib_trie_get_first(&iter, t); n; n = fib_trie_get_next(&iter)) {
1898
		if (IS_LEAF(n)) {
1899 1900
			struct leaf_info *li;

1901 1902 1903 1904
			s->leaves++;
			s->totdepth += iter.depth;
			if (iter.depth > s->maxdepth)
				s->maxdepth = iter.depth;
1905

A
Alexander Duyck 已提交
1906
			hlist_for_each_entry_rcu(li, &n->list, hlist)
1907
				++s->prefixes;
1908 1909 1910 1911
		} else {
			int i;

			s->tnodes++;
A
Alexander Duyck 已提交
1912 1913
			if (n->bits < MAX_STAT_DEPTH)
				s->nodesizes[n->bits]++;
R
Robert Olsson 已提交
1914

A
Alexander Duyck 已提交
1915 1916
			for (i = 0; i < tnode_child_length(n); i++)
				if (!rcu_access_pointer(n->child[i]))
1917
					s->nullpointers++;
1918 1919
		}
	}
R
Robert Olsson 已提交
1920
	rcu_read_unlock();
1921 1922
}

1923 1924 1925 1926
/*
 *	This outputs /proc/net/fib_triestats
 */
static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat)
1927
{
E
Eric Dumazet 已提交
1928
	unsigned int i, max, pointers, bytes, avdepth;
1929

1930 1931 1932 1933
	if (stat->leaves)
		avdepth = stat->totdepth*100 / stat->leaves;
	else
		avdepth = 0;
O
Olof Johansson 已提交
1934

1935 1936
	seq_printf(seq, "\tAver depth:     %u.%02d\n",
		   avdepth / 100, avdepth % 100);
1937
	seq_printf(seq, "\tMax depth:      %u\n", stat->maxdepth);
O
Olof Johansson 已提交
1938

1939
	seq_printf(seq, "\tLeaves:         %u\n", stat->leaves);
A
Alexander Duyck 已提交
1940
	bytes = sizeof(struct tnode) * stat->leaves;
1941 1942 1943 1944

	seq_printf(seq, "\tPrefixes:       %u\n", stat->prefixes);
	bytes += sizeof(struct leaf_info) * stat->prefixes;

1945
	seq_printf(seq, "\tInternal nodes: %u\n\t", stat->tnodes);
1946
	bytes += sizeof(struct tnode) * stat->tnodes;
1947

R
Robert Olsson 已提交
1948 1949
	max = MAX_STAT_DEPTH;
	while (max > 0 && stat->nodesizes[max-1] == 0)
1950
		max--;
1951

1952
	pointers = 0;
1953
	for (i = 1; i < max; i++)
1954
		if (stat->nodesizes[i] != 0) {
1955
			seq_printf(seq, "  %u: %u",  i, stat->nodesizes[i]);
1956 1957 1958
			pointers += (1<<i) * stat->nodesizes[i];
		}
	seq_putc(seq, '\n');
1959
	seq_printf(seq, "\tPointers: %u\n", pointers);
R
Robert Olsson 已提交
1960

A
Alexander Duyck 已提交
1961
	bytes += sizeof(struct tnode *) * pointers;
1962 1963
	seq_printf(seq, "Null ptrs: %u\n", stat->nullpointers);
	seq_printf(seq, "Total size: %u  kB\n", (bytes + 1023) / 1024);
1964
}
R
Robert Olsson 已提交
1965

1966
#ifdef CONFIG_IP_FIB_TRIE_STATS
1967
static void trie_show_usage(struct seq_file *seq,
1968
			    const struct trie_use_stats __percpu *stats)
1969
{
1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984
	struct trie_use_stats s = { 0 };
	int cpu;

	/* loop through all of the CPUs and gather up the stats */
	for_each_possible_cpu(cpu) {
		const struct trie_use_stats *pcpu = per_cpu_ptr(stats, cpu);

		s.gets += pcpu->gets;
		s.backtrack += pcpu->backtrack;
		s.semantic_match_passed += pcpu->semantic_match_passed;
		s.semantic_match_miss += pcpu->semantic_match_miss;
		s.null_node_hit += pcpu->null_node_hit;
		s.resize_node_skipped += pcpu->resize_node_skipped;
	}

1985
	seq_printf(seq, "\nCounters:\n---------\n");
1986 1987
	seq_printf(seq, "gets = %u\n", s.gets);
	seq_printf(seq, "backtracks = %u\n", s.backtrack);
1988
	seq_printf(seq, "semantic match passed = %u\n",
1989 1990 1991 1992
		   s.semantic_match_passed);
	seq_printf(seq, "semantic match miss = %u\n", s.semantic_match_miss);
	seq_printf(seq, "null node hit= %u\n", s.null_node_hit);
	seq_printf(seq, "skipped node resize = %u\n\n", s.resize_node_skipped);
1993
}
1994 1995
#endif /*  CONFIG_IP_FIB_TRIE_STATS */

1996
static void fib_table_print(struct seq_file *seq, struct fib_table *tb)
1997
{
1998 1999 2000 2001 2002 2003
	if (tb->tb_id == RT_TABLE_LOCAL)
		seq_puts(seq, "Local:\n");
	else if (tb->tb_id == RT_TABLE_MAIN)
		seq_puts(seq, "Main:\n");
	else
		seq_printf(seq, "Id %d:\n", tb->tb_id);
2004
}
2005

2006

2007 2008
static int fib_triestat_seq_show(struct seq_file *seq, void *v)
{
2009
	struct net *net = (struct net *)seq->private;
2010
	unsigned int h;
2011

2012
	seq_printf(seq,
2013 2014
		   "Basic info: size of leaf:"
		   " %Zd bytes, size of tnode: %Zd bytes.\n",
A
Alexander Duyck 已提交
2015
		   sizeof(struct tnode), sizeof(struct tnode));
2016

2017 2018 2019 2020
	for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
		struct hlist_head *head = &net->ipv4.fib_table_hash[h];
		struct fib_table *tb;

2021
		hlist_for_each_entry_rcu(tb, head, tb_hlist) {
2022 2023
			struct trie *t = (struct trie *) tb->tb_data;
			struct trie_stat stat;
2024

2025 2026 2027 2028 2029 2030 2031 2032
			if (!t)
				continue;

			fib_table_print(seq, tb);

			trie_collect_stats(t, &stat);
			trie_show_stats(seq, &stat);
#ifdef CONFIG_IP_FIB_TRIE_STATS
2033
			trie_show_usage(seq, t->stats);
2034 2035 2036
#endif
		}
	}
2037

2038
	return 0;
2039 2040
}

2041
static int fib_triestat_seq_open(struct inode *inode, struct file *file)
2042
{
2043
	return single_open_net(inode, file, fib_triestat_seq_show);
2044 2045
}

2046
static const struct file_operations fib_triestat_fops = {
2047 2048 2049 2050
	.owner	= THIS_MODULE,
	.open	= fib_triestat_seq_open,
	.read	= seq_read,
	.llseek	= seq_lseek,
2051
	.release = single_release_net,
2052 2053
};

A
Alexander Duyck 已提交
2054
static struct tnode *fib_trie_get_idx(struct seq_file *seq, loff_t pos)
2055
{
2056 2057
	struct fib_trie_iter *iter = seq->private;
	struct net *net = seq_file_net(seq);
2058
	loff_t idx = 0;
2059
	unsigned int h;
2060

2061 2062 2063
	for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
		struct hlist_head *head = &net->ipv4.fib_table_hash[h];
		struct fib_table *tb;
2064

2065
		hlist_for_each_entry_rcu(tb, head, tb_hlist) {
A
Alexander Duyck 已提交
2066
			struct tnode *n;
2067 2068 2069 2070 2071 2072 2073 2074 2075

			for (n = fib_trie_get_first(iter,
						    (struct trie *) tb->tb_data);
			     n; n = fib_trie_get_next(iter))
				if (pos == idx++) {
					iter->tb = tb;
					return n;
				}
		}
2076
	}
2077

2078 2079 2080
	return NULL;
}

2081
static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos)
2082
	__acquires(RCU)
2083
{
2084
	rcu_read_lock();
2085
	return fib_trie_get_idx(seq, *pos);
2086 2087
}

2088
static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2089
{
2090
	struct fib_trie_iter *iter = seq->private;
2091
	struct net *net = seq_file_net(seq);
2092 2093 2094
	struct fib_table *tb = iter->tb;
	struct hlist_node *tb_node;
	unsigned int h;
A
Alexander Duyck 已提交
2095
	struct tnode *n;
2096

2097
	++*pos;
2098 2099 2100 2101
	/* next node in same table */
	n = fib_trie_get_next(iter);
	if (n)
		return n;
2102

2103 2104
	/* walk rest of this hash chain */
	h = tb->tb_id & (FIB_TABLE_HASHSZ - 1);
E
Eric Dumazet 已提交
2105
	while ((tb_node = rcu_dereference(hlist_next_rcu(&tb->tb_hlist)))) {
2106 2107 2108 2109 2110
		tb = hlist_entry(tb_node, struct fib_table, tb_hlist);
		n = fib_trie_get_first(iter, (struct trie *) tb->tb_data);
		if (n)
			goto found;
	}
2111

2112 2113 2114
	/* new hash chain */
	while (++h < FIB_TABLE_HASHSZ) {
		struct hlist_head *head = &net->ipv4.fib_table_hash[h];
2115
		hlist_for_each_entry_rcu(tb, head, tb_hlist) {
2116 2117 2118 2119 2120
			n = fib_trie_get_first(iter, (struct trie *) tb->tb_data);
			if (n)
				goto found;
		}
	}
2121
	return NULL;
2122 2123 2124 2125

found:
	iter->tb = tb;
	return n;
2126
}
2127

2128
static void fib_trie_seq_stop(struct seq_file *seq, void *v)
2129
	__releases(RCU)
2130
{
2131 2132
	rcu_read_unlock();
}
O
Olof Johansson 已提交
2133

2134 2135
static void seq_indent(struct seq_file *seq, int n)
{
E
Eric Dumazet 已提交
2136 2137
	while (n-- > 0)
		seq_puts(seq, "   ");
2138
}
2139

2140
static inline const char *rtn_scope(char *buf, size_t len, enum rt_scope_t s)
2141
{
S
Stephen Hemminger 已提交
2142
	switch (s) {
2143 2144 2145 2146 2147 2148
	case RT_SCOPE_UNIVERSE: return "universe";
	case RT_SCOPE_SITE:	return "site";
	case RT_SCOPE_LINK:	return "link";
	case RT_SCOPE_HOST:	return "host";
	case RT_SCOPE_NOWHERE:	return "nowhere";
	default:
2149
		snprintf(buf, len, "scope=%d", s);
2150 2151 2152
		return buf;
	}
}
2153

2154
static const char *const rtn_type_names[__RTN_MAX] = {
2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167
	[RTN_UNSPEC] = "UNSPEC",
	[RTN_UNICAST] = "UNICAST",
	[RTN_LOCAL] = "LOCAL",
	[RTN_BROADCAST] = "BROADCAST",
	[RTN_ANYCAST] = "ANYCAST",
	[RTN_MULTICAST] = "MULTICAST",
	[RTN_BLACKHOLE] = "BLACKHOLE",
	[RTN_UNREACHABLE] = "UNREACHABLE",
	[RTN_PROHIBIT] = "PROHIBIT",
	[RTN_THROW] = "THROW",
	[RTN_NAT] = "NAT",
	[RTN_XRESOLVE] = "XRESOLVE",
};
2168

E
Eric Dumazet 已提交
2169
static inline const char *rtn_type(char *buf, size_t len, unsigned int t)
2170 2171 2172
{
	if (t < __RTN_MAX && rtn_type_names[t])
		return rtn_type_names[t];
2173
	snprintf(buf, len, "type %u", t);
2174
	return buf;
2175 2176
}

2177 2178
/* Pretty print the trie */
static int fib_trie_seq_show(struct seq_file *seq, void *v)
2179
{
2180
	const struct fib_trie_iter *iter = seq->private;
A
Alexander Duyck 已提交
2181
	struct tnode *n = v;
2182

2183 2184
	if (!node_parent_rcu(n))
		fib_table_print(seq, iter->tb);
2185

2186
	if (IS_TNODE(n)) {
A
Alexander Duyck 已提交
2187
		__be32 prf = htonl(n->key);
O
Olof Johansson 已提交
2188

A
Alexander Duyck 已提交
2189
		seq_indent(seq, iter->depth - 1);
2190
		seq_printf(seq, "  +-- %pI4/%d %d %d %d\n",
A
Alexander Duyck 已提交
2191 2192
			   &prf, n->pos, n->bits, n->full_children,
			   n->empty_children);
2193
	} else {
2194
		struct leaf_info *li;
A
Alexander Duyck 已提交
2195
		__be32 val = htonl(n->key);
2196 2197

		seq_indent(seq, iter->depth);
2198
		seq_printf(seq, "  |-- %pI4\n", &val);
2199

A
Alexander Duyck 已提交
2200
		hlist_for_each_entry_rcu(li, &n->list, hlist) {
2201 2202 2203 2204 2205 2206 2207 2208
			struct fib_alias *fa;

			list_for_each_entry_rcu(fa, &li->falh, fa_list) {
				char buf1[32], buf2[32];

				seq_indent(seq, iter->depth+1);
				seq_printf(seq, "  /%d %s %s", li->plen,
					   rtn_scope(buf1, sizeof(buf1),
2209
						     fa->fa_info->fib_scope),
2210 2211 2212
					   rtn_type(buf2, sizeof(buf2),
						    fa->fa_type));
				if (fa->fa_tos)
2213
					seq_printf(seq, " tos=%d", fa->fa_tos);
2214
				seq_putc(seq, '\n');
2215 2216
			}
		}
2217
	}
2218

2219 2220 2221
	return 0;
}

2222
static const struct seq_operations fib_trie_seq_ops = {
2223 2224 2225 2226
	.start  = fib_trie_seq_start,
	.next   = fib_trie_seq_next,
	.stop   = fib_trie_seq_stop,
	.show   = fib_trie_seq_show,
2227 2228
};

2229
static int fib_trie_seq_open(struct inode *inode, struct file *file)
2230
{
2231 2232
	return seq_open_net(inode, file, &fib_trie_seq_ops,
			    sizeof(struct fib_trie_iter));
2233 2234
}

2235
static const struct file_operations fib_trie_fops = {
2236 2237 2238 2239
	.owner  = THIS_MODULE,
	.open   = fib_trie_seq_open,
	.read   = seq_read,
	.llseek = seq_lseek,
2240
	.release = seq_release_net,
2241 2242
};

2243 2244 2245 2246 2247 2248 2249
struct fib_route_iter {
	struct seq_net_private p;
	struct trie *main_trie;
	loff_t	pos;
	t_key	key;
};

A
Alexander Duyck 已提交
2250
static struct tnode *fib_route_get_idx(struct fib_route_iter *iter, loff_t pos)
2251
{
A
Alexander Duyck 已提交
2252
	struct tnode *l = NULL;
2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282
	struct trie *t = iter->main_trie;

	/* use cache location of last found key */
	if (iter->pos > 0 && pos >= iter->pos && (l = fib_find_node(t, iter->key)))
		pos -= iter->pos;
	else {
		iter->pos = 0;
		l = trie_firstleaf(t);
	}

	while (l && pos-- > 0) {
		iter->pos++;
		l = trie_nextleaf(l);
	}

	if (l)
		iter->key = pos;	/* remember it */
	else
		iter->pos = 0;		/* forget it */

	return l;
}

static void *fib_route_seq_start(struct seq_file *seq, loff_t *pos)
	__acquires(RCU)
{
	struct fib_route_iter *iter = seq->private;
	struct fib_table *tb;

	rcu_read_lock();
2283
	tb = fib_get_table(seq_file_net(seq), RT_TABLE_MAIN);
2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296
	if (!tb)
		return NULL;

	iter->main_trie = (struct trie *) tb->tb_data;
	if (*pos == 0)
		return SEQ_START_TOKEN;
	else
		return fib_route_get_idx(iter, *pos - 1);
}

static void *fib_route_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct fib_route_iter *iter = seq->private;
A
Alexander Duyck 已提交
2297
	struct tnode *l = v;
2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320

	++*pos;
	if (v == SEQ_START_TOKEN) {
		iter->pos = 0;
		l = trie_firstleaf(iter->main_trie);
	} else {
		iter->pos++;
		l = trie_nextleaf(l);
	}

	if (l)
		iter->key = l->key;
	else
		iter->pos = 0;
	return l;
}

static void fib_route_seq_stop(struct seq_file *seq, void *v)
	__releases(RCU)
{
	rcu_read_unlock();
}

E
Eric Dumazet 已提交
2321
static unsigned int fib_flag_trans(int type, __be32 mask, const struct fib_info *fi)
2322
{
E
Eric Dumazet 已提交
2323
	unsigned int flags = 0;
2324

E
Eric Dumazet 已提交
2325 2326
	if (type == RTN_UNREACHABLE || type == RTN_PROHIBIT)
		flags = RTF_REJECT;
2327 2328
	if (fi && fi->fib_nh->nh_gw)
		flags |= RTF_GATEWAY;
A
Al Viro 已提交
2329
	if (mask == htonl(0xFFFFFFFF))
2330 2331 2332
		flags |= RTF_HOST;
	flags |= RTF_UP;
	return flags;
2333 2334
}

2335 2336 2337
/*
 *	This outputs /proc/net/route.
 *	The format of the file is not supposed to be changed
E
Eric Dumazet 已提交
2338
 *	and needs to be same as fib_hash output to avoid breaking
2339 2340 2341
 *	legacy utilities
 */
static int fib_route_seq_show(struct seq_file *seq, void *v)
2342
{
A
Alexander Duyck 已提交
2343
	struct tnode *l = v;
2344
	struct leaf_info *li;
2345

2346 2347 2348 2349 2350 2351
	if (v == SEQ_START_TOKEN) {
		seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway "
			   "\tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU"
			   "\tWindow\tIRTT");
		return 0;
	}
2352

2353
	hlist_for_each_entry_rcu(li, &l->list, hlist) {
2354
		struct fib_alias *fa;
A
Al Viro 已提交
2355
		__be32 mask, prefix;
O
Olof Johansson 已提交
2356

2357 2358
		mask = inet_make_mask(li->plen);
		prefix = htonl(l->key);
2359

2360
		list_for_each_entry_rcu(fa, &li->falh, fa_list) {
2361
			const struct fib_info *fi = fa->fa_info;
E
Eric Dumazet 已提交
2362
			unsigned int flags = fib_flag_trans(fa->fa_type, mask, fi);
2363

2364 2365 2366
			if (fa->fa_type == RTN_BROADCAST
			    || fa->fa_type == RTN_MULTICAST)
				continue;
2367

2368 2369
			seq_setwidth(seq, 127);

2370
			if (fi)
2371 2372
				seq_printf(seq,
					 "%s\t%08X\t%08X\t%04X\t%d\t%u\t"
2373
					 "%d\t%08X\t%d\t%u\t%u",
2374 2375 2376 2377 2378
					 fi->fib_dev ? fi->fib_dev->name : "*",
					 prefix,
					 fi->fib_nh->nh_gw, flags, 0, 0,
					 fi->fib_priority,
					 mask,
2379 2380
					 (fi->fib_advmss ?
					  fi->fib_advmss + 40 : 0),
2381
					 fi->fib_window,
2382
					 fi->fib_rtt >> 3);
2383
			else
2384 2385
				seq_printf(seq,
					 "*\t%08X\t%08X\t%04X\t%d\t%u\t"
2386
					 "%d\t%08X\t%d\t%u\t%u",
2387
					 prefix, 0, flags, 0, 0, 0,
2388
					 mask, 0, 0, 0);
2389

2390
			seq_pad(seq, '\n');
2391
		}
2392 2393 2394 2395 2396
	}

	return 0;
}

2397
static const struct seq_operations fib_route_seq_ops = {
2398 2399 2400
	.start  = fib_route_seq_start,
	.next   = fib_route_seq_next,
	.stop   = fib_route_seq_stop,
2401
	.show   = fib_route_seq_show,
2402 2403
};

2404
static int fib_route_seq_open(struct inode *inode, struct file *file)
2405
{
2406
	return seq_open_net(inode, file, &fib_route_seq_ops,
2407
			    sizeof(struct fib_route_iter));
2408 2409
}

2410
static const struct file_operations fib_route_fops = {
2411 2412 2413 2414
	.owner  = THIS_MODULE,
	.open   = fib_route_seq_open,
	.read   = seq_read,
	.llseek = seq_lseek,
2415
	.release = seq_release_net,
2416 2417
};

2418
int __net_init fib_proc_init(struct net *net)
2419
{
2420
	if (!proc_create("fib_trie", S_IRUGO, net->proc_net, &fib_trie_fops))
2421 2422
		goto out1;

2423 2424
	if (!proc_create("fib_triestat", S_IRUGO, net->proc_net,
			 &fib_triestat_fops))
2425 2426
		goto out2;

2427
	if (!proc_create("route", S_IRUGO, net->proc_net, &fib_route_fops))
2428 2429
		goto out3;

2430
	return 0;
2431 2432

out3:
2433
	remove_proc_entry("fib_triestat", net->proc_net);
2434
out2:
2435
	remove_proc_entry("fib_trie", net->proc_net);
2436 2437
out1:
	return -ENOMEM;
2438 2439
}

2440
void __net_exit fib_proc_exit(struct net *net)
2441
{
2442 2443 2444
	remove_proc_entry("fib_trie", net->proc_net);
	remove_proc_entry("fib_triestat", net->proc_net);
	remove_proc_entry("route", net->proc_net);
2445 2446 2447
}

#endif /* CONFIG_PROC_FS */