fib_trie.c 56.6 KB
Newer Older
1 2 3 4 5 6 7 8 9
/*
 *   This program is free software; you can redistribute it and/or
 *   modify it under the terms of the GNU General Public License
 *   as published by the Free Software Foundation; either version
 *   2 of the License, or (at your option) any later version.
 *
 *   Robert Olsson <robert.olsson@its.uu.se> Uppsala Universitet
 *     & Swedish University of Agricultural Sciences.
 *
10
 *   Jens Laas <jens.laas@data.slu.se> Swedish University of
11
 *     Agricultural Sciences.
12
 *
13 14
 *   Hans Liss <hans.liss@its.uu.se>  Uppsala Universitet
 *
L
Lucas De Marchi 已提交
15
 * This work is based on the LPC-trie which is originally described in:
16
 *
17 18
 * An experimental study of compression methods for dynamic tries
 * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
19
 * http://www.csc.kth.se/~snilsson/software/dyntrie2/
20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41
 *
 *
 * IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
 * IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999
 *
 *
 * Code from fib_hash has been reused which includes the following header:
 *
 *
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		IPv4 FIB: lookup engine and maintenance routines.
 *
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
R
Robert Olsson 已提交
42 43 44 45 46 47 48
 *
 * Substantial contributions to this work comes from:
 *
 *		David S. Miller, <davem@davemloft.net>
 *		Stephen Hemminger <shemminger@osdl.org>
 *		Paul E. McKenney <paulmck@us.ibm.com>
 *		Patrick McHardy <kaber@trash.net>
49 50
 */

J
Jens Låås 已提交
51
#define VERSION "0.409"
52 53

#include <asm/uaccess.h>
J
Jiri Slaby 已提交
54
#include <linux/bitops.h>
55 56 57 58 59 60 61 62 63
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
S
Stephen Hemminger 已提交
64
#include <linux/inetdevice.h>
65 66 67
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/proc_fs.h>
R
Robert Olsson 已提交
68
#include <linux/rcupdate.h>
69 70 71 72
#include <linux/skbuff.h>
#include <linux/netlink.h>
#include <linux/init.h>
#include <linux/list.h>
73
#include <linux/slab.h>
74
#include <linux/export.h>
75
#include <net/net_namespace.h>
76 77 78 79 80 81 82 83
#include <net/ip.h>
#include <net/protocol.h>
#include <net/route.h>
#include <net/tcp.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include "fib_lookup.h"

R
Robert Olsson 已提交
84
#define MAX_STAT_DEPTH 32
85 86 87 88 89

#define KEYLENGTH (8*sizeof(t_key))

typedef unsigned int t_key;

90 91
#define IS_TNODE(n) ((n)->bits)
#define IS_LEAF(n) (!(n)->bits)
R
Robert Olsson 已提交
92

93
#define get_index(_key, _kv) (((_key) ^ (_kv)->key) >> (_kv)->pos)
94

95 96 97 98 99
struct tnode {
	t_key key;
	unsigned char bits;		/* 2log(KEYLENGTH) bits needed */
	unsigned char pos;		/* 2log(KEYLENGTH) bits needed */
	struct tnode __rcu *parent;
100
	struct rcu_head rcu;
A
Alexander Duyck 已提交
101 102 103 104 105 106 107 108 109 110
	union {
		/* The fields in this struct are valid if bits > 0 (TNODE) */
		struct {
			unsigned int full_children;  /* KEYLENGTH bits needed */
			unsigned int empty_children; /* KEYLENGTH bits needed */
			struct tnode __rcu *child[0];
		};
		/* This list pointer if valid if bits == 0 (LEAF) */
		struct hlist_head list;
	};
111 112 113 114 115
};

struct leaf_info {
	struct hlist_node hlist;
	int plen;
116
	u32 mask_plen; /* ntohl(inet_make_mask(plen)) */
117
	struct list_head falh;
118
	struct rcu_head rcu;
119 120 121 122 123 124 125 126 127
};

#ifdef CONFIG_IP_FIB_TRIE_STATS
struct trie_use_stats {
	unsigned int gets;
	unsigned int backtrack;
	unsigned int semantic_match_passed;
	unsigned int semantic_match_miss;
	unsigned int null_node_hit;
128
	unsigned int resize_node_skipped;
129 130 131 132 133 134 135 136 137
};
#endif

struct trie_stat {
	unsigned int totdepth;
	unsigned int maxdepth;
	unsigned int tnodes;
	unsigned int leaves;
	unsigned int nullpointers;
138
	unsigned int prefixes;
R
Robert Olsson 已提交
139
	unsigned int nodesizes[MAX_STAT_DEPTH];
140
};
141 142

struct trie {
A
Alexander Duyck 已提交
143
	struct tnode __rcu *trie;
144
#ifdef CONFIG_IP_FIB_TRIE_STATS
145
	struct trie_use_stats __percpu *stats;
146 147 148
#endif
};

149
static void resize(struct trie *t, struct tnode *tn);
150 151 152 153 154 155 156 157
static size_t tnode_free_size;

/*
 * synchronize_rcu after call_rcu for that many pages; it should be especially
 * useful before resizing the root node with PREEMPT_NONE configs; the value was
 * obtained experimentally, aiming to avoid visible slowdown.
 */
static const int sync_pages = 128;
158

159
static struct kmem_cache *fn_alias_kmem __read_mostly;
160
static struct kmem_cache *trie_leaf_kmem __read_mostly;
161

162 163
/* caller must hold RTNL */
#define node_parent(n) rtnl_dereference((n)->parent)
E
Eric Dumazet 已提交
164

165 166
/* caller must hold RCU read lock or RTNL */
#define node_parent_rcu(n) rcu_dereference_rtnl((n)->parent)
E
Eric Dumazet 已提交
167

168
/* wrapper for rcu_assign_pointer */
A
Alexander Duyck 已提交
169
static inline void node_set_parent(struct tnode *n, struct tnode *tp)
170
{
A
Alexander Duyck 已提交
171 172
	if (n)
		rcu_assign_pointer(n->parent, tp);
S
Stephen Hemminger 已提交
173 174
}

175 176 177 178
#define NODE_INIT_PARENT(n, p) RCU_INIT_POINTER((n)->parent, p)

/* This provides us with the number of children in this node, in the case of a
 * leaf this will return 0 meaning none of the children are accessible.
179
 */
180
static inline unsigned long tnode_child_length(const struct tnode *tn)
S
Stephen Hemminger 已提交
181
{
182
	return (1ul << tn->bits) & ~(1ul);
S
Stephen Hemminger 已提交
183
}
R
Robert Olsson 已提交
184

185 186 187
/* caller must hold RTNL */
static inline struct tnode *tnode_get_child(const struct tnode *tn,
					    unsigned long i)
188
{
189
	BUG_ON(i >= tnode_child_length(tn));
R
Robert Olsson 已提交
190

E
Eric Dumazet 已提交
191
	return rtnl_dereference(tn->child[i]);
192 193
}

194 195 196
/* caller must hold RCU read lock or RTNL */
static inline struct tnode *tnode_get_child_rcu(const struct tnode *tn,
						unsigned long i)
197
{
198
	BUG_ON(i >= tnode_child_length(tn));
199

E
Eric Dumazet 已提交
200
	return rcu_dereference_rtnl(tn->child[i]);
201 202
}

203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260
/* To understand this stuff, an understanding of keys and all their bits is
 * necessary. Every node in the trie has a key associated with it, but not
 * all of the bits in that key are significant.
 *
 * Consider a node 'n' and its parent 'tp'.
 *
 * If n is a leaf, every bit in its key is significant. Its presence is
 * necessitated by path compression, since during a tree traversal (when
 * searching for a leaf - unless we are doing an insertion) we will completely
 * ignore all skipped bits we encounter. Thus we need to verify, at the end of
 * a potentially successful search, that we have indeed been walking the
 * correct key path.
 *
 * Note that we can never "miss" the correct key in the tree if present by
 * following the wrong path. Path compression ensures that segments of the key
 * that are the same for all keys with a given prefix are skipped, but the
 * skipped part *is* identical for each node in the subtrie below the skipped
 * bit! trie_insert() in this implementation takes care of that.
 *
 * if n is an internal node - a 'tnode' here, the various parts of its key
 * have many different meanings.
 *
 * Example:
 * _________________________________________________________________
 * | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
 * -----------------------------------------------------------------
 *  31  30  29  28  27  26  25  24  23  22  21  20  19  18  17  16
 *
 * _________________________________________________________________
 * | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
 * -----------------------------------------------------------------
 *  15  14  13  12  11  10   9   8   7   6   5   4   3   2   1   0
 *
 * tp->pos = 22
 * tp->bits = 3
 * n->pos = 13
 * n->bits = 4
 *
 * First, let's just ignore the bits that come before the parent tp, that is
 * the bits from (tp->pos + tp->bits) to 31. They are *known* but at this
 * point we do not use them for anything.
 *
 * The bits from (tp->pos) to (tp->pos + tp->bits - 1) - "N", above - are the
 * index into the parent's child array. That is, they will be used to find
 * 'n' among tp's children.
 *
 * The bits from (n->pos + n->bits) to (tn->pos - 1) - "S" - are skipped bits
 * for the node n.
 *
 * All the bits we have seen so far are significant to the node n. The rest
 * of the bits are really not needed or indeed known in n->key.
 *
 * The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into
 * n's child array, and will of course be different for each child.
 *
 * The rest of the bits, from 0 to (n->pos + n->bits), are completely unknown
 * at this point.
 */
261

262 263
static const int halve_threshold = 25;
static const int inflate_threshold = 50;
264
static const int halve_threshold_root = 15;
J
Jens Låås 已提交
265
static const int inflate_threshold_root = 30;
R
Robert Olsson 已提交
266 267

static void __alias_free_mem(struct rcu_head *head)
268
{
R
Robert Olsson 已提交
269 270
	struct fib_alias *fa = container_of(head, struct fib_alias, rcu);
	kmem_cache_free(fn_alias_kmem, fa);
271 272
}

R
Robert Olsson 已提交
273
static inline void alias_free_mem_rcu(struct fib_alias *fa)
274
{
R
Robert Olsson 已提交
275 276
	call_rcu(&fa->rcu, __alias_free_mem);
}
O
Olof Johansson 已提交
277

278
#define TNODE_KMALLOC_MAX \
A
Alexander Duyck 已提交
279
	ilog2((PAGE_SIZE - sizeof(struct tnode)) / sizeof(struct tnode *))
O
Olof Johansson 已提交
280

281
static void __node_free_rcu(struct rcu_head *head)
282
{
A
Alexander Duyck 已提交
283
	struct tnode *n = container_of(head, struct tnode, rcu);
284 285 286 287 288 289 290

	if (IS_LEAF(n))
		kmem_cache_free(trie_leaf_kmem, n);
	else if (n->bits <= TNODE_KMALLOC_MAX)
		kfree(n);
	else
		vfree(n);
291 292
}

293 294
#define node_free(n) call_rcu(&n->rcu, __node_free_rcu)

R
Robert Olsson 已提交
295
static inline void free_leaf_info(struct leaf_info *leaf)
296
{
297
	kfree_rcu(leaf, rcu);
298 299
}

300
static struct tnode *tnode_alloc(size_t size)
301
{
R
Robert Olsson 已提交
302
	if (size <= PAGE_SIZE)
303
		return kzalloc(size, GFP_KERNEL);
304
	else
305
		return vzalloc(size);
306
}
R
Robert Olsson 已提交
307

A
Alexander Duyck 已提交
308
static struct tnode *leaf_new(t_key key)
R
Robert Olsson 已提交
309
{
A
Alexander Duyck 已提交
310
	struct tnode *l = kmem_cache_alloc(trie_leaf_kmem, GFP_KERNEL);
R
Robert Olsson 已提交
311
	if (l) {
312 313 314 315 316 317
		l->parent = NULL;
		/* set key and pos to reflect full key value
		 * any trailing zeros in the key should be ignored
		 * as the nodes are searched
		 */
		l->key = key;
318
		l->pos = 0;
319 320 321
		/* set bits to 0 indicating we are not a tnode */
		l->bits = 0;

R
Robert Olsson 已提交
322 323 324 325 326 327 328 329 330 331
		INIT_HLIST_HEAD(&l->list);
	}
	return l;
}

static struct leaf_info *leaf_info_new(int plen)
{
	struct leaf_info *li = kmalloc(sizeof(struct leaf_info),  GFP_KERNEL);
	if (li) {
		li->plen = plen;
332
		li->mask_plen = ntohl(inet_make_mask(plen));
R
Robert Olsson 已提交
333 334 335 336 337
		INIT_LIST_HEAD(&li->falh);
	}
	return li;
}

338
static struct tnode *tnode_new(t_key key, int pos, int bits)
339
{
340
	size_t sz = offsetof(struct tnode, child[1 << bits]);
341
	struct tnode *tn = tnode_alloc(sz);
342 343 344 345
	unsigned int shift = pos + bits;

	/* verify bits and pos their msb bits clear and values are valid */
	BUG_ON(!bits || (shift > KEYLENGTH));
346

O
Olof Johansson 已提交
347
	if (tn) {
348
		tn->parent = NULL;
349 350
		tn->pos = pos;
		tn->bits = bits;
351
		tn->key = (shift < KEYLENGTH) ? (key >> shift) << shift : 0;
352 353 354
		tn->full_children = 0;
		tn->empty_children = 1<<bits;
	}
355

E
Eric Dumazet 已提交
356
	pr_debug("AT %p s=%zu %zu\n", tn, sizeof(struct tnode),
A
Alexander Duyck 已提交
357
		 sizeof(struct tnode *) << bits);
358 359 360
	return tn;
}

361
/* Check whether a tnode 'n' is "full", i.e. it is an internal node
362 363
 * and no bits are skipped. See discussion in dyntree paper p. 6
 */
A
Alexander Duyck 已提交
364
static inline int tnode_full(const struct tnode *tn, const struct tnode *n)
365
{
366
	return n && ((n->pos + n->bits) == tn->pos) && IS_TNODE(n);
367 368
}

369 370 371 372
/* Add a child at position i overwriting the old value.
 * Update the value of full_children and empty_children.
 */
static void put_child(struct tnode *tn, unsigned long i, struct tnode *n)
373
{
A
Alexander Duyck 已提交
374
	struct tnode *chi = rtnl_dereference(tn->child[i]);
375
	int isfull, wasfull;
376

377
	BUG_ON(i >= tnode_child_length(tn));
S
Stephen Hemminger 已提交
378

379 380 381 382 383
	/* update emptyChildren */
	if (n == NULL && chi != NULL)
		tn->empty_children++;
	else if (n != NULL && chi == NULL)
		tn->empty_children--;
384

385
	/* update fullChildren */
386
	wasfull = tnode_full(tn, chi);
387
	isfull = tnode_full(tn, n);
388

389
	if (wasfull && !isfull)
390
		tn->full_children--;
391
	else if (!wasfull && isfull)
392
		tn->full_children++;
O
Olof Johansson 已提交
393

394
	rcu_assign_pointer(tn->child[i], n);
395 396
}

397 398 399 400 401 402 403 404 405
static void put_child_root(struct tnode *tp, struct trie *t,
			   t_key key, struct tnode *n)
{
	if (tp)
		put_child(tp, get_index(key, tp), n);
	else
		rcu_assign_pointer(t->trie, n);
}

406
static inline void tnode_free_init(struct tnode *tn)
E
Eric Dumazet 已提交
407
{
408 409 410 411 412 413 414 415
	tn->rcu.next = NULL;
}

static inline void tnode_free_append(struct tnode *tn, struct tnode *n)
{
	n->rcu.next = tn->rcu.next;
	tn->rcu.next = &n->rcu;
}
E
Eric Dumazet 已提交
416

417 418 419 420 421 422 423 424 425 426 427 428 429 430 431
static void tnode_free(struct tnode *tn)
{
	struct callback_head *head = &tn->rcu;

	while (head) {
		head = head->next;
		tnode_free_size += offsetof(struct tnode, child[1 << tn->bits]);
		node_free(tn);

		tn = container_of(head, struct tnode, rcu);
	}

	if (tnode_free_size >= PAGE_SIZE * sync_pages) {
		tnode_free_size = 0;
		synchronize_rcu();
E
Eric Dumazet 已提交
432 433 434
	}
}

435
static int inflate(struct trie *t, struct tnode *oldtnode)
436
{
437 438
	struct tnode *inode, *node0, *node1, *tn, *tp;
	unsigned long i, j, k;
439
	t_key m;
440

S
Stephen Hemminger 已提交
441
	pr_debug("In inflate\n");
442

443
	tn = tnode_new(oldtnode->key, oldtnode->pos - 1, oldtnode->bits + 1);
S
Stephen Hemminger 已提交
444
	if (!tn)
445
		return -ENOMEM;
446

447 448 449 450
	/* Assemble all of the pointers in our cluster, in this case that
	 * represents all of the pointers out of our allocated nodes that
	 * point to existing tnodes and the links between our allocated
	 * nodes.
451
	 */
452 453
	for (i = tnode_child_length(oldtnode), m = 1u << tn->pos; i;) {
		inode = tnode_get_child(oldtnode, --i);
454

455
		/* An empty child */
A
Alexander Duyck 已提交
456
		if (inode == NULL)
457 458 459
			continue;

		/* A leaf or an internal node with skipped bits */
A
Alexander Duyck 已提交
460
		if (!tnode_full(oldtnode, inode)) {
461
			put_child(tn, get_index(inode->key, tn), inode);
462 463 464 465 466
			continue;
		}

		/* An internal node with two children */
		if (inode->bits == 1) {
467 468
			put_child(tn, 2 * i + 1, tnode_get_child(inode, 1));
			put_child(tn, 2 * i, tnode_get_child(inode, 0));
O
Olof Johansson 已提交
469
			continue;
470 471
		}

O
Olof Johansson 已提交
472
		/* We will replace this node 'inode' with two new
473
		 * ones, 'node0' and 'node1', each with half of the
O
Olof Johansson 已提交
474 475 476 477 478
		 * original children. The two new nodes will have
		 * a position one bit further down the key and this
		 * means that the "significant" part of their keys
		 * (see the discussion near the top of this file)
		 * will differ by one bit, which will be "0" in
479
		 * node0's key and "1" in node1's key. Since we are
O
Olof Johansson 已提交
480 481
		 * moving the key position by one step, the bit that
		 * we are moving away from - the bit at position
482 483 484
		 * (tn->pos) - is the one that will differ between
		 * node0 and node1. So... we synthesize that bit in the
		 * two new keys.
O
Olof Johansson 已提交
485
		 */
486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502
		node1 = tnode_new(inode->key | m, inode->pos, inode->bits - 1);
		if (!node1)
			goto nomem;
		tnode_free_append(tn, node1);

		node0 = tnode_new(inode->key & ~m, inode->pos, inode->bits - 1);
		if (!node0)
			goto nomem;
		tnode_free_append(tn, node0);

		/* populate child pointers in new nodes */
		for (k = tnode_child_length(inode), j = k / 2; j;) {
			put_child(node1, --j, tnode_get_child(inode, --k));
			put_child(node0, j, tnode_get_child(inode, j));
			put_child(node1, --j, tnode_get_child(inode, --k));
			put_child(node0, j, tnode_get_child(inode, j));
		}
503

504 505 506
		/* link new nodes to parent */
		NODE_INIT_PARENT(node1, tn);
		NODE_INIT_PARENT(node0, tn);
507

508 509 510 511
		/* link parent to nodes */
		put_child(tn, 2 * i + 1, node1);
		put_child(tn, 2 * i, node0);
	}
512

513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529
	/* setup the parent pointer into and out of this node */
	tp = node_parent(oldtnode);
	NODE_INIT_PARENT(tn, tp);
	put_child_root(tp, t, tn->key, tn);

	/* prepare oldtnode to be freed */
	tnode_free_init(oldtnode);

	/* update all child nodes parent pointers to route to us */
	for (i = tnode_child_length(oldtnode); i;) {
		inode = tnode_get_child(oldtnode, --i);

		/* A leaf or an internal node with skipped bits */
		if (!tnode_full(oldtnode, inode)) {
			node_set_parent(inode, tn);
			continue;
		}
530

531 532
		/* drop the node in the old tnode free list */
		tnode_free_append(oldtnode, inode);
533

534 535 536
		/* fetch new nodes */
		node1 = tnode_get_child(tn, 2 * i + 1);
		node0 = tnode_get_child(tn, 2 * i);
537

538 539 540 541 542
		/* bits == 1 then node0 and node1 represent inode's children */
		if (inode->bits == 1) {
			node_set_parent(node1, tn);
			node_set_parent(node0, tn);
			continue;
543
		}
544

545 546 547 548 549 550 551
		/* update parent pointers in child node's children */
		for (k = tnode_child_length(inode), j = k / 2; j;) {
			node_set_parent(tnode_get_child(inode, --k), node1);
			node_set_parent(tnode_get_child(inode, --j), node0);
			node_set_parent(tnode_get_child(inode, --k), node1);
			node_set_parent(tnode_get_child(inode, --j), node0);
		}
O
Olof Johansson 已提交
552

553
		/* resize child nodes */
554 555
		resize(t, node1);
		resize(t, node0);
556
	}
557

558 559
	/* we completed without error, prepare to free old node */
	tnode_free(oldtnode);
560
	return 0;
561
nomem:
562 563
	/* all pointers should be clean so we are done */
	tnode_free(tn);
564
	return -ENOMEM;
565 566
}

567
static int halve(struct trie *t, struct tnode *oldtnode)
568
{
569 570
	struct tnode *tn, *tp, *inode, *node0, *node1;
	unsigned long i;
571

S
Stephen Hemminger 已提交
572
	pr_debug("In halve\n");
573

574
	tn = tnode_new(oldtnode->key, oldtnode->pos + 1, oldtnode->bits - 1);
575
	if (!tn)
576
		return -ENOMEM;
577

578 579 580 581
	/* Assemble all of the pointers in our cluster, in this case that
	 * represents all of the pointers out of our allocated nodes that
	 * point to existing tnodes and the links between our allocated
	 * nodes.
582
	 */
583 584 585
	for (i = tnode_child_length(oldtnode); i;) {
		node1 = tnode_get_child(oldtnode, --i);
		node0 = tnode_get_child(oldtnode, --i);
586

587 588 589 590 591
		/* At least one of the children is empty */
		if (!node1 || !node0) {
			put_child(tn, i / 2, node1 ? : node0);
			continue;
		}
592

593
		/* Two nonempty children */
594 595 596 597
		inode = tnode_new(node0->key, oldtnode->pos, 1);
		if (!inode) {
			tnode_free(tn);
			return -ENOMEM;
598
		}
599
		tnode_free_append(tn, inode);
600

601 602 603 604 605 606 607
		/* initialize pointers out of node */
		put_child(inode, 1, node1);
		put_child(inode, 0, node0);
		NODE_INIT_PARENT(inode, tn);

		/* link parent to node */
		put_child(tn, i / 2, inode);
608
	}
609

610 611 612 613 614
	/* setup the parent pointer out of and back into this node */
	tp = node_parent(oldtnode);
	NODE_INIT_PARENT(tn, tp);
	put_child_root(tp, t, tn->key, tn);

615 616 617
	/* prepare oldtnode to be freed */
	tnode_free_init(oldtnode);

618 619 620
	/* update all of the child parent pointers */
	for (i = tnode_child_length(tn); i;) {
		inode = tnode_get_child(tn, --i);
621

622 623 624
		/* only new tnodes will be considered "full" nodes */
		if (!tnode_full(tn, inode)) {
			node_set_parent(inode, tn);
O
Olof Johansson 已提交
625 626
			continue;
		}
627

628
		/* Two nonempty children */
629 630
		node_set_parent(tnode_get_child(inode, 1), inode);
		node_set_parent(tnode_get_child(inode, 0), inode);
631

632
		/* resize child node */
633
		resize(t, inode);
634
	}
635

636 637
	/* all pointers should be clean so we are done */
	tnode_free(oldtnode);
638 639

	return 0;
640 641
}

642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698
/* From "Implementing a dynamic compressed trie" by Stefan Nilsson of
 * the Helsinki University of Technology and Matti Tikkanen of Nokia
 * Telecommunications, page 6:
 * "A node is doubled if the ratio of non-empty children to all
 * children in the *doubled* node is at least 'high'."
 *
 * 'high' in this instance is the variable 'inflate_threshold'. It
 * is expressed as a percentage, so we multiply it with
 * tnode_child_length() and instead of multiplying by 2 (since the
 * child array will be doubled by inflate()) and multiplying
 * the left-hand side by 100 (to handle the percentage thing) we
 * multiply the left-hand side by 50.
 *
 * The left-hand side may look a bit weird: tnode_child_length(tn)
 * - tn->empty_children is of course the number of non-null children
 * in the current node. tn->full_children is the number of "full"
 * children, that is non-null tnodes with a skip value of 0.
 * All of those will be doubled in the resulting inflated tnode, so
 * we just count them one extra time here.
 *
 * A clearer way to write this would be:
 *
 * to_be_doubled = tn->full_children;
 * not_to_be_doubled = tnode_child_length(tn) - tn->empty_children -
 *     tn->full_children;
 *
 * new_child_length = tnode_child_length(tn) * 2;
 *
 * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) /
 *      new_child_length;
 * if (new_fill_factor >= inflate_threshold)
 *
 * ...and so on, tho it would mess up the while () loop.
 *
 * anyway,
 * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >=
 *      inflate_threshold
 *
 * avoid a division:
 * 100 * (not_to_be_doubled + 2*to_be_doubled) >=
 *      inflate_threshold * new_child_length
 *
 * expand not_to_be_doubled and to_be_doubled, and shorten:
 * 100 * (tnode_child_length(tn) - tn->empty_children +
 *    tn->full_children) >= inflate_threshold * new_child_length
 *
 * expand new_child_length:
 * 100 * (tnode_child_length(tn) - tn->empty_children +
 *    tn->full_children) >=
 *      inflate_threshold * tnode_child_length(tn) * 2
 *
 * shorten again:
 * 50 * (tn->full_children + tnode_child_length(tn) -
 *    tn->empty_children) >= inflate_threshold *
 *    tnode_child_length(tn)
 *
 */
699
static bool should_inflate(const struct tnode *tp, const struct tnode *tn)
700 701 702 703 704
{
	unsigned long used = tnode_child_length(tn);
	unsigned long threshold = used;

	/* Keep root node larger */
705
	threshold *= tp ? inflate_threshold : inflate_threshold_root;
706 707 708 709 710 711
	used += tn->full_children;
	used -= tn->empty_children;

	return tn->pos && ((50 * used) >= threshold);
}

712
static bool should_halve(const struct tnode *tp, const struct tnode *tn)
713 714 715 716 717
{
	unsigned long used = tnode_child_length(tn);
	unsigned long threshold = used;

	/* Keep root node larger */
718
	threshold *= tp ? halve_threshold : halve_threshold_root;
719 720 721 722 723
	used -= tn->empty_children;

	return (tn->bits > 1) && ((100 * used) < threshold);
}

724
#define MAX_WORK 10
725
static void resize(struct trie *t, struct tnode *tn)
726
{
727 728
	struct tnode *tp = node_parent(tn), *n = NULL;
	struct tnode __rcu **cptr;
729 730 731 732 733
	int max_work;

	pr_debug("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
		 tn, inflate_threshold, halve_threshold);

734 735 736 737 738 739 740
	/* track the tnode via the pointer from the parent instead of
	 * doing it ourselves.  This way we can let RCU fully do its
	 * thing without us interfering
	 */
	cptr = tp ? &tp->child[get_index(tn->key, tp)] : &t->trie;
	BUG_ON(tn != rtnl_dereference(*cptr));

741 742 743 744 745 746 747 748
	/* No children */
	if (tn->empty_children > (tnode_child_length(tn) - 1))
		goto no_children;

	/* One child */
	if (tn->empty_children == (tnode_child_length(tn) - 1))
		goto one_child;

749 750
	/* Double as long as the resulting node has a number of
	 * nonempty nodes that are above the threshold.
751 752
	 */
	max_work = MAX_WORK;
753 754
	while (should_inflate(tp, tn) && max_work--) {
		if (inflate(t, tn)) {
755 756 757 758 759
#ifdef CONFIG_IP_FIB_TRIE_STATS
			this_cpu_inc(t->stats->resize_node_skipped);
#endif
			break;
		}
760 761

		tn = rtnl_dereference(*cptr);
762 763 764 765
	}

	/* Return if at least one inflate is run */
	if (max_work != MAX_WORK)
766
		return;
767

768
	/* Halve as long as the number of empty children in this
769 770 771
	 * node is above threshold.
	 */
	max_work = MAX_WORK;
772 773
	while (should_halve(tp, tn) && max_work--) {
		if (halve(t, tn)) {
774 775 776 777 778 779
#ifdef CONFIG_IP_FIB_TRIE_STATS
			this_cpu_inc(t->stats->resize_node_skipped);
#endif
			break;
		}

780 781
		tn = rtnl_dereference(*cptr);
	}
782 783 784 785 786 787 788 789 790

	/* Only one child remains */
	if (tn->empty_children == (tnode_child_length(tn) - 1)) {
		unsigned long i;
one_child:
		for (i = tnode_child_length(tn); !n && i;)
			n = tnode_get_child(tn, --i);
no_children:
		/* compress one level */
791 792 793 794
		put_child_root(tp, t, tn->key, n);
		node_set_parent(n, tp);

		/* drop dead node */
795 796
		tnode_free_init(tn);
		tnode_free(tn);
797 798 799
	}
}

R
Robert Olsson 已提交
800
/* readside must use rcu_read_lock currently dump routines
R
Robert Olsson 已提交
801 802
 via get_fa_head and dump */

A
Alexander Duyck 已提交
803
static struct leaf_info *find_leaf_info(struct tnode *l, int plen)
804
{
R
Robert Olsson 已提交
805
	struct hlist_head *head = &l->list;
806 807
	struct leaf_info *li;

808
	hlist_for_each_entry_rcu(li, head, hlist)
809
		if (li->plen == plen)
810
			return li;
O
Olof Johansson 已提交
811

812 813 814
	return NULL;
}

A
Alexander Duyck 已提交
815
static inline struct list_head *get_fa_head(struct tnode *l, int plen)
816
{
R
Robert Olsson 已提交
817
	struct leaf_info *li = find_leaf_info(l, plen);
818

O
Olof Johansson 已提交
819 820
	if (!li)
		return NULL;
821

O
Olof Johansson 已提交
822
	return &li->falh;
823 824 825 826
}

static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new)
{
827 828 829 830 831
	struct leaf_info *li = NULL, *last = NULL;

	if (hlist_empty(head)) {
		hlist_add_head_rcu(&new->hlist, head);
	} else {
832
		hlist_for_each_entry(li, head, hlist) {
833 834 835 836 837 838
			if (new->plen > li->plen)
				break;

			last = li;
		}
		if (last)
839
			hlist_add_behind_rcu(&new->hlist, &last->hlist);
840 841 842
		else
			hlist_add_before_rcu(&new->hlist, &li->hlist);
	}
843 844
}

R
Robert Olsson 已提交
845
/* rcu_read_lock needs to be hold by caller from readside */
A
Alexander Duyck 已提交
846
static struct tnode *fib_find_node(struct trie *t, u32 key)
847
{
A
Alexander Duyck 已提交
848
	struct tnode *n = rcu_dereference_rtnl(t->trie);
A
Alexander Duyck 已提交
849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867

	while (n) {
		unsigned long index = get_index(key, n);

		/* This bit of code is a bit tricky but it combines multiple
		 * checks into a single check.  The prefix consists of the
		 * prefix plus zeros for the bits in the cindex. The index
		 * is the difference between the key and this value.  From
		 * this we can actually derive several pieces of data.
		 *   if !(index >> bits)
		 *     we know the value is cindex
		 *   else
		 *     we have a mismatch in skip bits and failed
		 */
		if (index >> n->bits)
			return NULL;

		/* we have found a leaf. Prefixes have already been compared */
		if (IS_LEAF(n))
868 869
			break;

A
Alexander Duyck 已提交
870 871
		n = rcu_dereference_rtnl(n->child[index]);
	}
O
Olof Johansson 已提交
872

A
Alexander Duyck 已提交
873
	return n;
874 875
}

876
static void trie_rebalance(struct trie *t, struct tnode *tn)
877
{
S
Stephen Hemminger 已提交
878
	struct tnode *tp;
879

880 881
	while ((tp = node_parent(tn)) != NULL) {
		resize(t, tn);
S
Stephen Hemminger 已提交
882
		tn = tp;
883
	}
S
Stephen Hemminger 已提交
884

885
	/* Handle last (top) tnode */
886
	if (IS_TNODE(tn))
887
		resize(t, tn);
888 889
}

R
Robert Olsson 已提交
890 891
/* only used from updater-side */

892
static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
893
{
894
	struct list_head *fa_head = NULL;
895
	struct tnode *l, *n, *tp = NULL;
896 897
	struct leaf_info *li;

898 899 900 901 902
	li = leaf_info_new(plen);
	if (!li)
		return NULL;
	fa_head = &li->falh;

E
Eric Dumazet 已提交
903
	n = rtnl_dereference(t->trie);
904

905 906
	/* If we point to NULL, stop. Either the tree is empty and we should
	 * just put a new leaf in if, or we have reached an empty child slot,
907 908
	 * and we should just put our new leaf in that.
	 *
909 910 911
	 * If we hit a node with a key that does't match then we should stop
	 * and create a new tnode to replace that node and insert ourselves
	 * and the other node into the new tnode.
912
	 */
913 914
	while (n) {
		unsigned long index = get_index(key, n);
915

916 917 918 919 920 921 922 923 924 925 926
		/* This bit of code is a bit tricky but it combines multiple
		 * checks into a single check.  The prefix consists of the
		 * prefix plus zeros for the "bits" in the prefix. The index
		 * is the difference between the key and this value.  From
		 * this we can actually derive several pieces of data.
		 *   if !(index >> bits)
		 *     we know the value is child index
		 *   else
		 *     we have a mismatch in skip bits and failed
		 */
		if (index >> n->bits)
927 928
			break;

929 930 931 932 933 934
		/* we have found a leaf. Prefixes have already been compared */
		if (IS_LEAF(n)) {
			/* Case 1: n is a leaf, and prefixes match*/
			insert_leaf_info(&n->list, li);
			return fa_head;
		}
935

936 937
		tp = n;
		n = rcu_dereference_rtnl(n->child[index]);
938 939
	}

940 941 942
	l = leaf_new(key);
	if (!l) {
		free_leaf_info(li);
943
		return NULL;
944
	}
945 946 947

	insert_leaf_info(&l->list, li);

948 949 950 951 952 953 954 955
	/* Case 2: n is a LEAF or a TNODE and the key doesn't match.
	 *
	 *  Add a new tnode here
	 *  first tnode need some special handling
	 *  leaves us in position for handling as case 3
	 */
	if (n) {
		struct tnode *tn;
956

957
		tn = tnode_new(key, __fls(key ^ n->key), 1);
958
		if (!tn) {
959
			free_leaf_info(li);
960
			node_free(l);
961
			return NULL;
O
Olof Johansson 已提交
962 963
		}

964 965 966
		/* initialize routes out of node */
		NODE_INIT_PARENT(tn, tp);
		put_child(tn, get_index(key, tn) ^ 1, n);
967

968 969 970
		/* start adding routes into the node */
		put_child_root(tp, t, key, tn);
		node_set_parent(n, tn);
971

972
		/* parent now has a NULL spot where the leaf can go */
973
		tp = tn;
974
	}
O
Olof Johansson 已提交
975

976 977 978 979 980 981 982 983
	/* Case 3: n is NULL, and will just insert a new leaf */
	if (tp) {
		NODE_INIT_PARENT(l, tp);
		put_child(tp, get_index(key, tp), l);
		trie_rebalance(t, tp);
	} else {
		rcu_assign_pointer(t->trie, l);
	}
R
Robert Olsson 已提交
984

985 986 987
	return fa_head;
}

988 989 990
/*
 * Caller must hold RTNL.
 */
991
int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
992 993 994
{
	struct trie *t = (struct trie *) tb->tb_data;
	struct fib_alias *fa, *new_fa;
995
	struct list_head *fa_head = NULL;
996
	struct fib_info *fi;
997 998
	int plen = cfg->fc_dst_len;
	u8 tos = cfg->fc_tos;
999 1000
	u32 key, mask;
	int err;
A
Alexander Duyck 已提交
1001
	struct tnode *l;
1002 1003 1004 1005

	if (plen > 32)
		return -EINVAL;

1006
	key = ntohl(cfg->fc_dst);
1007

1008
	pr_debug("Insert table=%u %08x/%d\n", tb->tb_id, key, plen);
1009

O
Olof Johansson 已提交
1010
	mask = ntohl(inet_make_mask(plen));
1011

1012
	if (key & ~mask)
1013 1014 1015 1016
		return -EINVAL;

	key = key & mask;

1017 1018 1019
	fi = fib_create_info(cfg);
	if (IS_ERR(fi)) {
		err = PTR_ERR(fi);
1020
		goto err;
1021
	}
1022 1023

	l = fib_find_node(t, key);
1024
	fa = NULL;
1025

1026
	if (l) {
1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041
		fa_head = get_fa_head(l, plen);
		fa = fib_find_alias(fa_head, tos, fi->fib_priority);
	}

	/* Now fa, if non-NULL, points to the first fib alias
	 * with the same keys [prefix,tos,priority], if such key already
	 * exists or to the node before which we will insert new one.
	 *
	 * If fa is NULL, we will need to allocate a new one and
	 * insert to the head of f.
	 *
	 * If f is NULL, no fib node matched the destination key
	 * and we need to allocate a new one of those as well.
	 */

1042 1043 1044
	if (fa && fa->fa_tos == tos &&
	    fa->fa_info->fib_priority == fi->fib_priority) {
		struct fib_alias *fa_first, *fa_match;
1045 1046

		err = -EEXIST;
1047
		if (cfg->fc_nlflags & NLM_F_EXCL)
1048 1049
			goto out;

1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069
		/* We have 2 goals:
		 * 1. Find exact match for type, scope, fib_info to avoid
		 * duplicate routes
		 * 2. Find next 'fa' (or head), NLM_F_APPEND inserts before it
		 */
		fa_match = NULL;
		fa_first = fa;
		fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
		list_for_each_entry_continue(fa, fa_head, fa_list) {
			if (fa->fa_tos != tos)
				break;
			if (fa->fa_info->fib_priority != fi->fib_priority)
				break;
			if (fa->fa_type == cfg->fc_type &&
			    fa->fa_info == fi) {
				fa_match = fa;
				break;
			}
		}

1070
		if (cfg->fc_nlflags & NLM_F_REPLACE) {
1071 1072 1073
			struct fib_info *fi_drop;
			u8 state;

1074 1075 1076 1077
			fa = fa_first;
			if (fa_match) {
				if (fa == fa_match)
					err = 0;
1078
				goto out;
1079
			}
R
Robert Olsson 已提交
1080
			err = -ENOBUFS;
1081
			new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
R
Robert Olsson 已提交
1082 1083
			if (new_fa == NULL)
				goto out;
1084 1085

			fi_drop = fa->fa_info;
R
Robert Olsson 已提交
1086 1087
			new_fa->fa_tos = fa->fa_tos;
			new_fa->fa_info = fi;
1088
			new_fa->fa_type = cfg->fc_type;
1089
			state = fa->fa_state;
1090
			new_fa->fa_state = state & ~FA_S_ACCESSED;
1091

R
Robert Olsson 已提交
1092 1093
			list_replace_rcu(&fa->fa_list, &new_fa->fa_list);
			alias_free_mem_rcu(fa);
1094 1095 1096

			fib_release_info(fi_drop);
			if (state & FA_S_ACCESSED)
1097
				rt_cache_flush(cfg->fc_nlinfo.nl_net);
1098 1099
			rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen,
				tb->tb_id, &cfg->fc_nlinfo, NLM_F_REPLACE);
1100

O
Olof Johansson 已提交
1101
			goto succeeded;
1102 1103 1104 1105 1106
		}
		/* Error if we find a perfect match which
		 * uses the same scope, type, and nexthop
		 * information.
		 */
1107 1108
		if (fa_match)
			goto out;
1109

1110
		if (!(cfg->fc_nlflags & NLM_F_APPEND))
1111
			fa = fa_first;
1112 1113
	}
	err = -ENOENT;
1114
	if (!(cfg->fc_nlflags & NLM_F_CREATE))
1115 1116 1117
		goto out;

	err = -ENOBUFS;
1118
	new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
1119 1120 1121 1122 1123
	if (new_fa == NULL)
		goto out;

	new_fa->fa_info = fi;
	new_fa->fa_tos = tos;
1124
	new_fa->fa_type = cfg->fc_type;
1125 1126 1127 1128 1129
	new_fa->fa_state = 0;
	/*
	 * Insert new entry to the list.
	 */

1130
	if (!fa_head) {
1131 1132 1133
		fa_head = fib_insert_node(t, key, plen);
		if (unlikely(!fa_head)) {
			err = -ENOMEM;
1134
			goto out_free_new_fa;
1135
		}
1136
	}
1137

1138 1139 1140
	if (!plen)
		tb->tb_num_default++;

R
Robert Olsson 已提交
1141 1142
	list_add_tail_rcu(&new_fa->fa_list,
			  (fa ? &fa->fa_list : fa_head));
1143

1144
	rt_cache_flush(cfg->fc_nlinfo.nl_net);
1145
	rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id,
1146
		  &cfg->fc_nlinfo, 0);
1147 1148
succeeded:
	return 0;
1149 1150 1151

out_free_new_fa:
	kmem_cache_free(fn_alias_kmem, new_fa);
1152 1153
out:
	fib_release_info(fi);
O
Olof Johansson 已提交
1154
err:
1155 1156 1157
	return err;
}

1158 1159 1160 1161 1162 1163 1164
static inline t_key prefix_mismatch(t_key key, struct tnode *n)
{
	t_key prefix = n->key;

	return (key ^ prefix) & (prefix | -prefix);
}

1165
/* should be called with rcu_read_lock */
1166
int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp,
E
Eric Dumazet 已提交
1167
		     struct fib_result *res, int fib_flags)
1168
{
1169
	struct trie *t = (struct trie *)tb->tb_data;
1170 1171 1172
#ifdef CONFIG_IP_FIB_TRIE_STATS
	struct trie_use_stats __percpu *stats = t->stats;
#endif
1173 1174
	const t_key key = ntohl(flp->daddr);
	struct tnode *n, *pn;
1175
	struct leaf_info *li;
1176
	t_key cindex;
O
Olof Johansson 已提交
1177

R
Robert Olsson 已提交
1178
	n = rcu_dereference(t->trie);
1179
	if (!n)
1180
		return -EAGAIN;
1181 1182

#ifdef CONFIG_IP_FIB_TRIE_STATS
1183
	this_cpu_inc(stats->gets);
1184 1185
#endif

A
Alexander Duyck 已提交
1186
	pn = n;
1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204
	cindex = 0;

	/* Step 1: Travel to the longest prefix match in the trie */
	for (;;) {
		unsigned long index = get_index(key, n);

		/* This bit of code is a bit tricky but it combines multiple
		 * checks into a single check.  The prefix consists of the
		 * prefix plus zeros for the "bits" in the prefix. The index
		 * is the difference between the key and this value.  From
		 * this we can actually derive several pieces of data.
		 *   if !(index >> bits)
		 *     we know the value is child index
		 *   else
		 *     we have a mismatch in skip bits and failed
		 */
		if (index >> n->bits)
			break;
1205

1206 1207
		/* we have found a leaf. Prefixes have already been compared */
		if (IS_LEAF(n))
1208
			goto found;
1209

1210 1211
		/* only record pn and cindex if we are going to be chopping
		 * bits later.  Otherwise we are just wasting cycles.
O
Olof Johansson 已提交
1212
		 */
1213 1214 1215
		if (index) {
			pn = n;
			cindex = index;
O
Olof Johansson 已提交
1216
		}
1217

1218 1219 1220 1221
		n = rcu_dereference(n->child[index]);
		if (unlikely(!n))
			goto backtrace;
	}
1222

1223 1224 1225 1226
	/* Step 2: Sort out leaves and begin backtracing for longest prefix */
	for (;;) {
		/* record the pointer where our next node pointer is stored */
		struct tnode __rcu **cptr = n->child;
1227

1228 1229 1230
		/* This test verifies that none of the bits that differ
		 * between the key and the prefix exist in the region of
		 * the lsb and higher in the prefix.
O
Olof Johansson 已提交
1231
		 */
1232 1233
		if (unlikely(prefix_mismatch(key, n)))
			goto backtrace;
O
Olof Johansson 已提交
1234

1235 1236 1237
		/* exit out and process leaf */
		if (unlikely(IS_LEAF(n)))
			break;
O
Olof Johansson 已提交
1238

1239 1240 1241
		/* Don't bother recording parent info.  Since we are in
		 * prefix match mode we will have to come back to wherever
		 * we started this traversal anyway
O
Olof Johansson 已提交
1242 1243
		 */

1244
		while ((n = rcu_dereference(*cptr)) == NULL) {
1245 1246
backtrace:
#ifdef CONFIG_IP_FIB_TRIE_STATS
1247 1248
			if (!n)
				this_cpu_inc(stats->null_node_hit);
1249
#endif
1250 1251 1252 1253 1254 1255 1256 1257 1258 1259
			/* If we are at cindex 0 there are no more bits for
			 * us to strip at this level so we must ascend back
			 * up one level to see if there are any more bits to
			 * be stripped there.
			 */
			while (!cindex) {
				t_key pkey = pn->key;

				pn = node_parent_rcu(pn);
				if (unlikely(!pn))
1260
					return -EAGAIN;
1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272
#ifdef CONFIG_IP_FIB_TRIE_STATS
				this_cpu_inc(stats->backtrack);
#endif
				/* Get Child's index */
				cindex = get_index(pkey, pn);
			}

			/* strip the least significant bit from the cindex */
			cindex &= cindex - 1;

			/* grab pointer for next child node */
			cptr = &pn->child[cindex];
1273
		}
1274
	}
1275

1276
found:
1277
	/* Step 3: Process the leaf, if that fails fall back to backtracing */
1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333
	hlist_for_each_entry_rcu(li, &n->list, hlist) {
		struct fib_alias *fa;

		if ((key ^ n->key) & li->mask_plen)
			continue;

		list_for_each_entry_rcu(fa, &li->falh, fa_list) {
			struct fib_info *fi = fa->fa_info;
			int nhsel, err;

			if (fa->fa_tos && fa->fa_tos != flp->flowi4_tos)
				continue;
			if (fi->fib_dead)
				continue;
			if (fa->fa_info->fib_scope < flp->flowi4_scope)
				continue;
			fib_alias_accessed(fa);
			err = fib_props[fa->fa_type].error;
			if (unlikely(err < 0)) {
#ifdef CONFIG_IP_FIB_TRIE_STATS
				this_cpu_inc(stats->semantic_match_passed);
#endif
				return err;
			}
			if (fi->fib_flags & RTNH_F_DEAD)
				continue;
			for (nhsel = 0; nhsel < fi->fib_nhs; nhsel++) {
				const struct fib_nh *nh = &fi->fib_nh[nhsel];

				if (nh->nh_flags & RTNH_F_DEAD)
					continue;
				if (flp->flowi4_oif && flp->flowi4_oif != nh->nh_oif)
					continue;

				if (!(fib_flags & FIB_LOOKUP_NOREF))
					atomic_inc(&fi->fib_clntref);

				res->prefixlen = li->plen;
				res->nh_sel = nhsel;
				res->type = fa->fa_type;
				res->scope = fi->fib_scope;
				res->fi = fi;
				res->table = tb;
				res->fa_head = &li->falh;
#ifdef CONFIG_IP_FIB_TRIE_STATS
				this_cpu_inc(stats->semantic_match_passed);
#endif
				return err;
			}
		}

#ifdef CONFIG_IP_FIB_TRIE_STATS
		this_cpu_inc(stats->semantic_match_miss);
#endif
	}
	goto backtrace;
1334
}
1335
EXPORT_SYMBOL_GPL(fib_table_lookup);
1336

1337 1338 1339
/*
 * Remove the leaf and return parent.
 */
A
Alexander Duyck 已提交
1340
static void trie_leaf_remove(struct trie *t, struct tnode *l)
1341
{
1342
	struct tnode *tp = node_parent(l);
1343

1344
	pr_debug("entering trie_leaf_remove(%p)\n", l);
1345

1346
	if (tp) {
1347
		put_child(tp, get_index(l->key, tp), NULL);
1348
		trie_rebalance(t, tp);
1349
	} else {
1350
		RCU_INIT_POINTER(t->trie, NULL);
1351
	}
1352

1353
	node_free(l);
1354 1355
}

1356 1357 1358
/*
 * Caller must hold RTNL.
 */
1359
int fib_table_delete(struct fib_table *tb, struct fib_config *cfg)
1360 1361 1362
{
	struct trie *t = (struct trie *) tb->tb_data;
	u32 key, mask;
1363 1364
	int plen = cfg->fc_dst_len;
	u8 tos = cfg->fc_tos;
1365 1366
	struct fib_alias *fa, *fa_to_delete;
	struct list_head *fa_head;
A
Alexander Duyck 已提交
1367
	struct tnode *l;
O
Olof Johansson 已提交
1368 1369
	struct leaf_info *li;

1370
	if (plen > 32)
1371 1372
		return -EINVAL;

1373
	key = ntohl(cfg->fc_dst);
O
Olof Johansson 已提交
1374
	mask = ntohl(inet_make_mask(plen));
1375

1376
	if (key & ~mask)
1377 1378 1379 1380 1381
		return -EINVAL;

	key = key & mask;
	l = fib_find_node(t, key);

1382
	if (!l)
1383 1384
		return -ESRCH;

1385 1386 1387 1388 1389 1390
	li = find_leaf_info(l, plen);

	if (!li)
		return -ESRCH;

	fa_head = &li->falh;
1391 1392 1393 1394 1395
	fa = fib_find_alias(fa_head, tos, 0);

	if (!fa)
		return -ESRCH;

S
Stephen Hemminger 已提交
1396
	pr_debug("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t);
1397 1398

	fa_to_delete = NULL;
1399 1400
	fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
	list_for_each_entry_continue(fa, fa_head, fa_list) {
1401 1402 1403 1404 1405
		struct fib_info *fi = fa->fa_info;

		if (fa->fa_tos != tos)
			break;

1406 1407
		if ((!cfg->fc_type || fa->fa_type == cfg->fc_type) &&
		    (cfg->fc_scope == RT_SCOPE_NOWHERE ||
1408
		     fa->fa_info->fib_scope == cfg->fc_scope) &&
1409 1410
		    (!cfg->fc_prefsrc ||
		     fi->fib_prefsrc == cfg->fc_prefsrc) &&
1411 1412 1413
		    (!cfg->fc_protocol ||
		     fi->fib_protocol == cfg->fc_protocol) &&
		    fib_nh_match(cfg, fi) == 0) {
1414 1415 1416 1417 1418
			fa_to_delete = fa;
			break;
		}
	}

O
Olof Johansson 已提交
1419 1420
	if (!fa_to_delete)
		return -ESRCH;
1421

O
Olof Johansson 已提交
1422
	fa = fa_to_delete;
1423
	rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id,
1424
		  &cfg->fc_nlinfo, 0);
O
Olof Johansson 已提交
1425

R
Robert Olsson 已提交
1426
	list_del_rcu(&fa->fa_list);
1427

1428 1429 1430
	if (!plen)
		tb->tb_num_default--;

O
Olof Johansson 已提交
1431
	if (list_empty(fa_head)) {
R
Robert Olsson 已提交
1432
		hlist_del_rcu(&li->hlist);
O
Olof Johansson 已提交
1433
		free_leaf_info(li);
R
Robert Olsson 已提交
1434
	}
1435

O
Olof Johansson 已提交
1436
	if (hlist_empty(&l->list))
1437
		trie_leaf_remove(t, l);
1438

O
Olof Johansson 已提交
1439
	if (fa->fa_state & FA_S_ACCESSED)
1440
		rt_cache_flush(cfg->fc_nlinfo.nl_net);
1441

R
Robert Olsson 已提交
1442 1443
	fib_release_info(fa->fa_info);
	alias_free_mem_rcu(fa);
O
Olof Johansson 已提交
1444
	return 0;
1445 1446
}

1447
static int trie_flush_list(struct list_head *head)
1448 1449 1450 1451 1452 1453 1454
{
	struct fib_alias *fa, *fa_node;
	int found = 0;

	list_for_each_entry_safe(fa, fa_node, head, fa_list) {
		struct fib_info *fi = fa->fa_info;

R
Robert Olsson 已提交
1455 1456 1457 1458
		if (fi && (fi->fib_flags & RTNH_F_DEAD)) {
			list_del_rcu(&fa->fa_list);
			fib_release_info(fa->fa_info);
			alias_free_mem_rcu(fa);
1459 1460 1461 1462 1463 1464
			found++;
		}
	}
	return found;
}

A
Alexander Duyck 已提交
1465
static int trie_flush_leaf(struct tnode *l)
1466 1467 1468
{
	int found = 0;
	struct hlist_head *lih = &l->list;
1469
	struct hlist_node *tmp;
1470 1471
	struct leaf_info *li = NULL;

1472
	hlist_for_each_entry_safe(li, tmp, lih, hlist) {
1473
		found += trie_flush_list(&li->falh);
1474 1475

		if (list_empty(&li->falh)) {
R
Robert Olsson 已提交
1476
			hlist_del_rcu(&li->hlist);
1477 1478 1479 1480 1481 1482
			free_leaf_info(li);
		}
	}
	return found;
}

1483 1484 1485 1486
/*
 * Scan for the next right leaf starting at node p->child[idx]
 * Since we have back pointer, no recursion necessary.
 */
A
Alexander Duyck 已提交
1487
static struct tnode *leaf_walk_rcu(struct tnode *p, struct tnode *c)
1488
{
1489
	do {
1490
		unsigned long idx = c ? idx = get_index(c->key, p) + 1 : 0;
R
Robert Olsson 已提交
1491

1492
		while (idx < tnode_child_length(p)) {
1493
			c = tnode_get_child_rcu(p, idx++);
R
Robert Olsson 已提交
1494
			if (!c)
O
Olof Johansson 已提交
1495 1496
				continue;

1497
			if (IS_LEAF(c))
A
Alexander Duyck 已提交
1498
				return c;
1499 1500

			/* Rescan start scanning in new node */
A
Alexander Duyck 已提交
1501
			p = c;
1502
			idx = 0;
1503
		}
1504 1505

		/* Node empty, walk back up to parent */
A
Alexander Duyck 已提交
1506
		c = p;
E
Eric Dumazet 已提交
1507
	} while ((p = node_parent_rcu(c)) != NULL);
1508 1509 1510 1511

	return NULL; /* Root of trie */
}

A
Alexander Duyck 已提交
1512
static struct tnode *trie_firstleaf(struct trie *t)
1513
{
A
Alexander Duyck 已提交
1514
	struct tnode *n = rcu_dereference_rtnl(t->trie);
1515 1516 1517 1518 1519

	if (!n)
		return NULL;

	if (IS_LEAF(n))          /* trie is just a leaf */
A
Alexander Duyck 已提交
1520
		return n;
1521 1522 1523 1524

	return leaf_walk_rcu(n, NULL);
}

A
Alexander Duyck 已提交
1525
static struct tnode *trie_nextleaf(struct tnode *l)
1526
{
A
Alexander Duyck 已提交
1527
	struct tnode *p = node_parent_rcu(l);
1528 1529 1530 1531

	if (!p)
		return NULL;	/* trie with just one leaf */

A
Alexander Duyck 已提交
1532
	return leaf_walk_rcu(p, l);
1533 1534
}

A
Alexander Duyck 已提交
1535
static struct tnode *trie_leafindex(struct trie *t, int index)
1536
{
A
Alexander Duyck 已提交
1537
	struct tnode *l = trie_firstleaf(t);
1538

S
Stephen Hemminger 已提交
1539
	while (l && index-- > 0)
1540
		l = trie_nextleaf(l);
S
Stephen Hemminger 已提交
1541

1542 1543 1544 1545
	return l;
}


1546 1547 1548
/*
 * Caller must hold RTNL.
 */
1549
int fib_table_flush(struct fib_table *tb)
1550 1551
{
	struct trie *t = (struct trie *) tb->tb_data;
A
Alexander Duyck 已提交
1552
	struct tnode *l, *ll = NULL;
1553
	int found = 0;
1554

1555
	for (l = trie_firstleaf(t); l; l = trie_nextleaf(l)) {
1556
		found += trie_flush_leaf(l);
1557 1558

		if (ll && hlist_empty(&ll->list))
1559
			trie_leaf_remove(t, ll);
1560 1561 1562 1563
		ll = l;
	}

	if (ll && hlist_empty(&ll->list))
1564
		trie_leaf_remove(t, ll);
1565

S
Stephen Hemminger 已提交
1566
	pr_debug("trie_flush found=%d\n", found);
1567 1568 1569
	return found;
}

1570 1571
void fib_free_table(struct fib_table *tb)
{
1572 1573 1574 1575 1576
#ifdef CONFIG_IP_FIB_TRIE_STATS
	struct trie *t = (struct trie *)tb->tb_data;

	free_percpu(t->stats);
#endif /* CONFIG_IP_FIB_TRIE_STATS */
1577 1578 1579
	kfree(tb);
}

1580 1581
static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah,
			   struct fib_table *tb,
1582 1583 1584 1585
			   struct sk_buff *skb, struct netlink_callback *cb)
{
	int i, s_i;
	struct fib_alias *fa;
A
Al Viro 已提交
1586
	__be32 xkey = htonl(key);
1587

1588
	s_i = cb->args[5];
1589 1590
	i = 0;

R
Robert Olsson 已提交
1591 1592 1593
	/* rcu_read_lock is hold by caller */

	list_for_each_entry_rcu(fa, fah, fa_list) {
1594 1595 1596 1597 1598
		if (i < s_i) {
			i++;
			continue;
		}

1599
		if (fib_dump_info(skb, NETLINK_CB(cb->skb).portid,
1600 1601 1602 1603
				  cb->nlh->nlmsg_seq,
				  RTM_NEWROUTE,
				  tb->tb_id,
				  fa->fa_type,
1604
				  xkey,
1605 1606
				  plen,
				  fa->fa_tos,
1607
				  fa->fa_info, NLM_F_MULTI) < 0) {
1608
			cb->args[5] = i;
1609
			return -1;
O
Olof Johansson 已提交
1610
		}
1611 1612
		i++;
	}
1613
	cb->args[5] = i;
1614 1615 1616
	return skb->len;
}

A
Alexander Duyck 已提交
1617
static int fn_trie_dump_leaf(struct tnode *l, struct fib_table *tb,
1618
			struct sk_buff *skb, struct netlink_callback *cb)
1619
{
1620 1621
	struct leaf_info *li;
	int i, s_i;
1622

1623
	s_i = cb->args[4];
1624
	i = 0;
1625

1626
	/* rcu_read_lock is hold by caller */
1627
	hlist_for_each_entry_rcu(li, &l->list, hlist) {
1628 1629
		if (i < s_i) {
			i++;
1630
			continue;
1631
		}
O
Olof Johansson 已提交
1632

1633
		if (i > s_i)
1634
			cb->args[5] = 0;
1635

1636
		if (list_empty(&li->falh))
1637 1638
			continue;

1639
		if (fn_trie_dump_fa(l->key, li->plen, &li->falh, tb, skb, cb) < 0) {
1640
			cb->args[4] = i;
1641 1642
			return -1;
		}
1643
		i++;
1644
	}
1645

1646
	cb->args[4] = i;
1647 1648 1649
	return skb->len;
}

1650 1651
int fib_table_dump(struct fib_table *tb, struct sk_buff *skb,
		   struct netlink_callback *cb)
1652
{
A
Alexander Duyck 已提交
1653
	struct tnode *l;
1654
	struct trie *t = (struct trie *) tb->tb_data;
1655
	t_key key = cb->args[2];
1656
	int count = cb->args[3];
1657

R
Robert Olsson 已提交
1658
	rcu_read_lock();
1659 1660 1661
	/* Dump starting at last key.
	 * Note: 0.0.0.0/0 (ie default) is first key.
	 */
1662
	if (count == 0)
1663 1664
		l = trie_firstleaf(t);
	else {
1665 1666 1667
		/* Normally, continue from last key, but if that is missing
		 * fallback to using slow rescan
		 */
1668
		l = fib_find_node(t, key);
1669 1670
		if (!l)
			l = trie_leafindex(t, count);
1671
	}
1672

1673 1674
	while (l) {
		cb->args[2] = l->key;
1675
		if (fn_trie_dump_leaf(l, tb, skb, cb) < 0) {
1676
			cb->args[3] = count;
1677 1678
			rcu_read_unlock();
			return -1;
1679
		}
1680

1681
		++count;
1682
		l = trie_nextleaf(l);
1683 1684
		memset(&cb->args[4], 0,
		       sizeof(cb->args) - 4*sizeof(cb->args[0]));
1685
	}
1686
	cb->args[3] = count;
R
Robert Olsson 已提交
1687
	rcu_read_unlock();
1688

1689 1690 1691
	return skb->len;
}

1692
void __init fib_trie_init(void)
1693
{
1694 1695
	fn_alias_kmem = kmem_cache_create("ip_fib_alias",
					  sizeof(struct fib_alias),
1696 1697 1698
					  0, SLAB_PANIC, NULL);

	trie_leaf_kmem = kmem_cache_create("ip_fib_trie",
A
Alexander Duyck 已提交
1699
					   max(sizeof(struct tnode),
1700 1701
					       sizeof(struct leaf_info)),
					   0, SLAB_PANIC, NULL);
1702
}
1703

1704

1705
struct fib_table *fib_trie_table(u32 id)
1706 1707 1708 1709 1710 1711 1712 1713 1714 1715
{
	struct fib_table *tb;
	struct trie *t;

	tb = kmalloc(sizeof(struct fib_table) + sizeof(struct trie),
		     GFP_KERNEL);
	if (tb == NULL)
		return NULL;

	tb->tb_id = id;
1716
	tb->tb_default = -1;
1717
	tb->tb_num_default = 0;
1718 1719

	t = (struct trie *) tb->tb_data;
1720 1721 1722 1723 1724 1725 1726 1727
	RCU_INIT_POINTER(t->trie, NULL);
#ifdef CONFIG_IP_FIB_TRIE_STATS
	t->stats = alloc_percpu(struct trie_use_stats);
	if (!t->stats) {
		kfree(tb);
		tb = NULL;
	}
#endif
1728 1729 1730 1731

	return tb;
}

1732 1733 1734
#ifdef CONFIG_PROC_FS
/* Depth first Trie walk iterator */
struct fib_trie_iter {
1735
	struct seq_net_private p;
1736
	struct fib_table *tb;
1737
	struct tnode *tnode;
E
Eric Dumazet 已提交
1738 1739
	unsigned int index;
	unsigned int depth;
1740
};
1741

A
Alexander Duyck 已提交
1742
static struct tnode *fib_trie_get_next(struct fib_trie_iter *iter)
1743
{
1744
	unsigned long cindex = iter->index;
1745 1746
	struct tnode *tn = iter->tnode;
	struct tnode *p;
1747

1748 1749 1750 1751
	/* A single entry routing table */
	if (!tn)
		return NULL;

1752 1753 1754
	pr_debug("get_next iter={node=%p index=%d depth=%d}\n",
		 iter->tnode, iter->index, iter->depth);
rescan:
1755
	while (cindex < tnode_child_length(tn)) {
A
Alexander Duyck 已提交
1756
		struct tnode *n = tnode_get_child_rcu(tn, cindex);
1757

1758 1759 1760 1761 1762 1763
		if (n) {
			if (IS_LEAF(n)) {
				iter->tnode = tn;
				iter->index = cindex + 1;
			} else {
				/* push down one level */
A
Alexander Duyck 已提交
1764
				iter->tnode = n;
1765 1766 1767 1768 1769
				iter->index = 0;
				++iter->depth;
			}
			return n;
		}
1770

1771 1772
		++cindex;
	}
O
Olof Johansson 已提交
1773

1774
	/* Current node exhausted, pop back up */
A
Alexander Duyck 已提交
1775
	p = node_parent_rcu(tn);
1776
	if (p) {
1777
		cindex = get_index(tn->key, p) + 1;
1778 1779 1780
		tn = p;
		--iter->depth;
		goto rescan;
1781
	}
1782 1783 1784

	/* got root? */
	return NULL;
1785 1786
}

A
Alexander Duyck 已提交
1787
static struct tnode *fib_trie_get_first(struct fib_trie_iter *iter,
1788
				       struct trie *t)
1789
{
A
Alexander Duyck 已提交
1790
	struct tnode *n;
1791

S
Stephen Hemminger 已提交
1792
	if (!t)
1793 1794 1795
		return NULL;

	n = rcu_dereference(t->trie);
1796
	if (!n)
1797
		return NULL;
1798

1799
	if (IS_TNODE(n)) {
A
Alexander Duyck 已提交
1800
		iter->tnode = n;
1801 1802 1803 1804 1805 1806
		iter->index = 0;
		iter->depth = 1;
	} else {
		iter->tnode = NULL;
		iter->index = 0;
		iter->depth = 0;
O
Olof Johansson 已提交
1807
	}
1808 1809

	return n;
1810
}
O
Olof Johansson 已提交
1811

1812 1813
static void trie_collect_stats(struct trie *t, struct trie_stat *s)
{
A
Alexander Duyck 已提交
1814
	struct tnode *n;
1815
	struct fib_trie_iter iter;
O
Olof Johansson 已提交
1816

1817
	memset(s, 0, sizeof(*s));
O
Olof Johansson 已提交
1818

1819
	rcu_read_lock();
1820
	for (n = fib_trie_get_first(&iter, t); n; n = fib_trie_get_next(&iter)) {
1821
		if (IS_LEAF(n)) {
1822 1823
			struct leaf_info *li;

1824 1825 1826 1827
			s->leaves++;
			s->totdepth += iter.depth;
			if (iter.depth > s->maxdepth)
				s->maxdepth = iter.depth;
1828

A
Alexander Duyck 已提交
1829
			hlist_for_each_entry_rcu(li, &n->list, hlist)
1830
				++s->prefixes;
1831
		} else {
1832
			unsigned long i;
1833 1834

			s->tnodes++;
A
Alexander Duyck 已提交
1835 1836
			if (n->bits < MAX_STAT_DEPTH)
				s->nodesizes[n->bits]++;
R
Robert Olsson 已提交
1837

1838
			for (i = 0; i < tnode_child_length(n); i++) {
A
Alexander Duyck 已提交
1839
				if (!rcu_access_pointer(n->child[i]))
1840
					s->nullpointers++;
1841
			}
1842 1843
		}
	}
R
Robert Olsson 已提交
1844
	rcu_read_unlock();
1845 1846
}

1847 1848 1849 1850
/*
 *	This outputs /proc/net/fib_triestats
 */
static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat)
1851
{
E
Eric Dumazet 已提交
1852
	unsigned int i, max, pointers, bytes, avdepth;
1853

1854 1855 1856 1857
	if (stat->leaves)
		avdepth = stat->totdepth*100 / stat->leaves;
	else
		avdepth = 0;
O
Olof Johansson 已提交
1858

1859 1860
	seq_printf(seq, "\tAver depth:     %u.%02d\n",
		   avdepth / 100, avdepth % 100);
1861
	seq_printf(seq, "\tMax depth:      %u\n", stat->maxdepth);
O
Olof Johansson 已提交
1862

1863
	seq_printf(seq, "\tLeaves:         %u\n", stat->leaves);
A
Alexander Duyck 已提交
1864
	bytes = sizeof(struct tnode) * stat->leaves;
1865 1866 1867 1868

	seq_printf(seq, "\tPrefixes:       %u\n", stat->prefixes);
	bytes += sizeof(struct leaf_info) * stat->prefixes;

1869
	seq_printf(seq, "\tInternal nodes: %u\n\t", stat->tnodes);
1870
	bytes += sizeof(struct tnode) * stat->tnodes;
1871

R
Robert Olsson 已提交
1872 1873
	max = MAX_STAT_DEPTH;
	while (max > 0 && stat->nodesizes[max-1] == 0)
1874
		max--;
1875

1876
	pointers = 0;
1877
	for (i = 1; i < max; i++)
1878
		if (stat->nodesizes[i] != 0) {
1879
			seq_printf(seq, "  %u: %u",  i, stat->nodesizes[i]);
1880 1881 1882
			pointers += (1<<i) * stat->nodesizes[i];
		}
	seq_putc(seq, '\n');
1883
	seq_printf(seq, "\tPointers: %u\n", pointers);
R
Robert Olsson 已提交
1884

A
Alexander Duyck 已提交
1885
	bytes += sizeof(struct tnode *) * pointers;
1886 1887
	seq_printf(seq, "Null ptrs: %u\n", stat->nullpointers);
	seq_printf(seq, "Total size: %u  kB\n", (bytes + 1023) / 1024);
1888
}
R
Robert Olsson 已提交
1889

1890
#ifdef CONFIG_IP_FIB_TRIE_STATS
1891
static void trie_show_usage(struct seq_file *seq,
1892
			    const struct trie_use_stats __percpu *stats)
1893
{
1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908
	struct trie_use_stats s = { 0 };
	int cpu;

	/* loop through all of the CPUs and gather up the stats */
	for_each_possible_cpu(cpu) {
		const struct trie_use_stats *pcpu = per_cpu_ptr(stats, cpu);

		s.gets += pcpu->gets;
		s.backtrack += pcpu->backtrack;
		s.semantic_match_passed += pcpu->semantic_match_passed;
		s.semantic_match_miss += pcpu->semantic_match_miss;
		s.null_node_hit += pcpu->null_node_hit;
		s.resize_node_skipped += pcpu->resize_node_skipped;
	}

1909
	seq_printf(seq, "\nCounters:\n---------\n");
1910 1911
	seq_printf(seq, "gets = %u\n", s.gets);
	seq_printf(seq, "backtracks = %u\n", s.backtrack);
1912
	seq_printf(seq, "semantic match passed = %u\n",
1913 1914 1915 1916
		   s.semantic_match_passed);
	seq_printf(seq, "semantic match miss = %u\n", s.semantic_match_miss);
	seq_printf(seq, "null node hit= %u\n", s.null_node_hit);
	seq_printf(seq, "skipped node resize = %u\n\n", s.resize_node_skipped);
1917
}
1918 1919
#endif /*  CONFIG_IP_FIB_TRIE_STATS */

1920
static void fib_table_print(struct seq_file *seq, struct fib_table *tb)
1921
{
1922 1923 1924 1925 1926 1927
	if (tb->tb_id == RT_TABLE_LOCAL)
		seq_puts(seq, "Local:\n");
	else if (tb->tb_id == RT_TABLE_MAIN)
		seq_puts(seq, "Main:\n");
	else
		seq_printf(seq, "Id %d:\n", tb->tb_id);
1928
}
1929

1930

1931 1932
static int fib_triestat_seq_show(struct seq_file *seq, void *v)
{
1933
	struct net *net = (struct net *)seq->private;
1934
	unsigned int h;
1935

1936
	seq_printf(seq,
1937 1938
		   "Basic info: size of leaf:"
		   " %Zd bytes, size of tnode: %Zd bytes.\n",
A
Alexander Duyck 已提交
1939
		   sizeof(struct tnode), sizeof(struct tnode));
1940

1941 1942 1943 1944
	for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
		struct hlist_head *head = &net->ipv4.fib_table_hash[h];
		struct fib_table *tb;

1945
		hlist_for_each_entry_rcu(tb, head, tb_hlist) {
1946 1947
			struct trie *t = (struct trie *) tb->tb_data;
			struct trie_stat stat;
1948

1949 1950 1951 1952 1953 1954 1955 1956
			if (!t)
				continue;

			fib_table_print(seq, tb);

			trie_collect_stats(t, &stat);
			trie_show_stats(seq, &stat);
#ifdef CONFIG_IP_FIB_TRIE_STATS
1957
			trie_show_usage(seq, t->stats);
1958 1959 1960
#endif
		}
	}
1961

1962
	return 0;
1963 1964
}

1965
static int fib_triestat_seq_open(struct inode *inode, struct file *file)
1966
{
1967
	return single_open_net(inode, file, fib_triestat_seq_show);
1968 1969
}

1970
static const struct file_operations fib_triestat_fops = {
1971 1972 1973 1974
	.owner	= THIS_MODULE,
	.open	= fib_triestat_seq_open,
	.read	= seq_read,
	.llseek	= seq_lseek,
1975
	.release = single_release_net,
1976 1977
};

A
Alexander Duyck 已提交
1978
static struct tnode *fib_trie_get_idx(struct seq_file *seq, loff_t pos)
1979
{
1980 1981
	struct fib_trie_iter *iter = seq->private;
	struct net *net = seq_file_net(seq);
1982
	loff_t idx = 0;
1983
	unsigned int h;
1984

1985 1986 1987
	for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
		struct hlist_head *head = &net->ipv4.fib_table_hash[h];
		struct fib_table *tb;
1988

1989
		hlist_for_each_entry_rcu(tb, head, tb_hlist) {
A
Alexander Duyck 已提交
1990
			struct tnode *n;
1991 1992 1993 1994 1995 1996 1997 1998 1999

			for (n = fib_trie_get_first(iter,
						    (struct trie *) tb->tb_data);
			     n; n = fib_trie_get_next(iter))
				if (pos == idx++) {
					iter->tb = tb;
					return n;
				}
		}
2000
	}
2001

2002 2003 2004
	return NULL;
}

2005
static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos)
2006
	__acquires(RCU)
2007
{
2008
	rcu_read_lock();
2009
	return fib_trie_get_idx(seq, *pos);
2010 2011
}

2012
static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2013
{
2014
	struct fib_trie_iter *iter = seq->private;
2015
	struct net *net = seq_file_net(seq);
2016 2017 2018
	struct fib_table *tb = iter->tb;
	struct hlist_node *tb_node;
	unsigned int h;
A
Alexander Duyck 已提交
2019
	struct tnode *n;
2020

2021
	++*pos;
2022 2023 2024 2025
	/* next node in same table */
	n = fib_trie_get_next(iter);
	if (n)
		return n;
2026

2027 2028
	/* walk rest of this hash chain */
	h = tb->tb_id & (FIB_TABLE_HASHSZ - 1);
E
Eric Dumazet 已提交
2029
	while ((tb_node = rcu_dereference(hlist_next_rcu(&tb->tb_hlist)))) {
2030 2031 2032 2033 2034
		tb = hlist_entry(tb_node, struct fib_table, tb_hlist);
		n = fib_trie_get_first(iter, (struct trie *) tb->tb_data);
		if (n)
			goto found;
	}
2035

2036 2037 2038
	/* new hash chain */
	while (++h < FIB_TABLE_HASHSZ) {
		struct hlist_head *head = &net->ipv4.fib_table_hash[h];
2039
		hlist_for_each_entry_rcu(tb, head, tb_hlist) {
2040 2041 2042 2043 2044
			n = fib_trie_get_first(iter, (struct trie *) tb->tb_data);
			if (n)
				goto found;
		}
	}
2045
	return NULL;
2046 2047 2048 2049

found:
	iter->tb = tb;
	return n;
2050
}
2051

2052
static void fib_trie_seq_stop(struct seq_file *seq, void *v)
2053
	__releases(RCU)
2054
{
2055 2056
	rcu_read_unlock();
}
O
Olof Johansson 已提交
2057

2058 2059
static void seq_indent(struct seq_file *seq, int n)
{
E
Eric Dumazet 已提交
2060 2061
	while (n-- > 0)
		seq_puts(seq, "   ");
2062
}
2063

2064
static inline const char *rtn_scope(char *buf, size_t len, enum rt_scope_t s)
2065
{
S
Stephen Hemminger 已提交
2066
	switch (s) {
2067 2068 2069 2070 2071 2072
	case RT_SCOPE_UNIVERSE: return "universe";
	case RT_SCOPE_SITE:	return "site";
	case RT_SCOPE_LINK:	return "link";
	case RT_SCOPE_HOST:	return "host";
	case RT_SCOPE_NOWHERE:	return "nowhere";
	default:
2073
		snprintf(buf, len, "scope=%d", s);
2074 2075 2076
		return buf;
	}
}
2077

2078
static const char *const rtn_type_names[__RTN_MAX] = {
2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091
	[RTN_UNSPEC] = "UNSPEC",
	[RTN_UNICAST] = "UNICAST",
	[RTN_LOCAL] = "LOCAL",
	[RTN_BROADCAST] = "BROADCAST",
	[RTN_ANYCAST] = "ANYCAST",
	[RTN_MULTICAST] = "MULTICAST",
	[RTN_BLACKHOLE] = "BLACKHOLE",
	[RTN_UNREACHABLE] = "UNREACHABLE",
	[RTN_PROHIBIT] = "PROHIBIT",
	[RTN_THROW] = "THROW",
	[RTN_NAT] = "NAT",
	[RTN_XRESOLVE] = "XRESOLVE",
};
2092

E
Eric Dumazet 已提交
2093
static inline const char *rtn_type(char *buf, size_t len, unsigned int t)
2094 2095 2096
{
	if (t < __RTN_MAX && rtn_type_names[t])
		return rtn_type_names[t];
2097
	snprintf(buf, len, "type %u", t);
2098
	return buf;
2099 2100
}

2101 2102
/* Pretty print the trie */
static int fib_trie_seq_show(struct seq_file *seq, void *v)
2103
{
2104
	const struct fib_trie_iter *iter = seq->private;
A
Alexander Duyck 已提交
2105
	struct tnode *n = v;
2106

2107 2108
	if (!node_parent_rcu(n))
		fib_table_print(seq, iter->tb);
2109

2110
	if (IS_TNODE(n)) {
A
Alexander Duyck 已提交
2111
		__be32 prf = htonl(n->key);
O
Olof Johansson 已提交
2112

2113 2114 2115 2116
		seq_indent(seq, iter->depth-1);
		seq_printf(seq, "  +-- %pI4/%zu %u %u %u\n",
			   &prf, KEYLENGTH - n->pos - n->bits, n->bits,
			   n->full_children, n->empty_children);
2117
	} else {
2118
		struct leaf_info *li;
A
Alexander Duyck 已提交
2119
		__be32 val = htonl(n->key);
2120 2121

		seq_indent(seq, iter->depth);
2122
		seq_printf(seq, "  |-- %pI4\n", &val);
2123

A
Alexander Duyck 已提交
2124
		hlist_for_each_entry_rcu(li, &n->list, hlist) {
2125 2126 2127 2128 2129 2130 2131 2132
			struct fib_alias *fa;

			list_for_each_entry_rcu(fa, &li->falh, fa_list) {
				char buf1[32], buf2[32];

				seq_indent(seq, iter->depth+1);
				seq_printf(seq, "  /%d %s %s", li->plen,
					   rtn_scope(buf1, sizeof(buf1),
2133
						     fa->fa_info->fib_scope),
2134 2135 2136
					   rtn_type(buf2, sizeof(buf2),
						    fa->fa_type));
				if (fa->fa_tos)
2137
					seq_printf(seq, " tos=%d", fa->fa_tos);
2138
				seq_putc(seq, '\n');
2139 2140
			}
		}
2141
	}
2142

2143 2144 2145
	return 0;
}

2146
static const struct seq_operations fib_trie_seq_ops = {
2147 2148 2149 2150
	.start  = fib_trie_seq_start,
	.next   = fib_trie_seq_next,
	.stop   = fib_trie_seq_stop,
	.show   = fib_trie_seq_show,
2151 2152
};

2153
static int fib_trie_seq_open(struct inode *inode, struct file *file)
2154
{
2155 2156
	return seq_open_net(inode, file, &fib_trie_seq_ops,
			    sizeof(struct fib_trie_iter));
2157 2158
}

2159
static const struct file_operations fib_trie_fops = {
2160 2161 2162 2163
	.owner  = THIS_MODULE,
	.open   = fib_trie_seq_open,
	.read   = seq_read,
	.llseek = seq_lseek,
2164
	.release = seq_release_net,
2165 2166
};

2167 2168 2169 2170 2171 2172 2173
struct fib_route_iter {
	struct seq_net_private p;
	struct trie *main_trie;
	loff_t	pos;
	t_key	key;
};

A
Alexander Duyck 已提交
2174
static struct tnode *fib_route_get_idx(struct fib_route_iter *iter, loff_t pos)
2175
{
A
Alexander Duyck 已提交
2176
	struct tnode *l = NULL;
2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206
	struct trie *t = iter->main_trie;

	/* use cache location of last found key */
	if (iter->pos > 0 && pos >= iter->pos && (l = fib_find_node(t, iter->key)))
		pos -= iter->pos;
	else {
		iter->pos = 0;
		l = trie_firstleaf(t);
	}

	while (l && pos-- > 0) {
		iter->pos++;
		l = trie_nextleaf(l);
	}

	if (l)
		iter->key = pos;	/* remember it */
	else
		iter->pos = 0;		/* forget it */

	return l;
}

static void *fib_route_seq_start(struct seq_file *seq, loff_t *pos)
	__acquires(RCU)
{
	struct fib_route_iter *iter = seq->private;
	struct fib_table *tb;

	rcu_read_lock();
2207
	tb = fib_get_table(seq_file_net(seq), RT_TABLE_MAIN);
2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220
	if (!tb)
		return NULL;

	iter->main_trie = (struct trie *) tb->tb_data;
	if (*pos == 0)
		return SEQ_START_TOKEN;
	else
		return fib_route_get_idx(iter, *pos - 1);
}

static void *fib_route_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct fib_route_iter *iter = seq->private;
A
Alexander Duyck 已提交
2221
	struct tnode *l = v;
2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244

	++*pos;
	if (v == SEQ_START_TOKEN) {
		iter->pos = 0;
		l = trie_firstleaf(iter->main_trie);
	} else {
		iter->pos++;
		l = trie_nextleaf(l);
	}

	if (l)
		iter->key = l->key;
	else
		iter->pos = 0;
	return l;
}

static void fib_route_seq_stop(struct seq_file *seq, void *v)
	__releases(RCU)
{
	rcu_read_unlock();
}

E
Eric Dumazet 已提交
2245
static unsigned int fib_flag_trans(int type, __be32 mask, const struct fib_info *fi)
2246
{
E
Eric Dumazet 已提交
2247
	unsigned int flags = 0;
2248

E
Eric Dumazet 已提交
2249 2250
	if (type == RTN_UNREACHABLE || type == RTN_PROHIBIT)
		flags = RTF_REJECT;
2251 2252
	if (fi && fi->fib_nh->nh_gw)
		flags |= RTF_GATEWAY;
A
Al Viro 已提交
2253
	if (mask == htonl(0xFFFFFFFF))
2254 2255 2256
		flags |= RTF_HOST;
	flags |= RTF_UP;
	return flags;
2257 2258
}

2259 2260 2261
/*
 *	This outputs /proc/net/route.
 *	The format of the file is not supposed to be changed
E
Eric Dumazet 已提交
2262
 *	and needs to be same as fib_hash output to avoid breaking
2263 2264 2265
 *	legacy utilities
 */
static int fib_route_seq_show(struct seq_file *seq, void *v)
2266
{
A
Alexander Duyck 已提交
2267
	struct tnode *l = v;
2268
	struct leaf_info *li;
2269

2270 2271 2272 2273 2274 2275
	if (v == SEQ_START_TOKEN) {
		seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway "
			   "\tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU"
			   "\tWindow\tIRTT");
		return 0;
	}
2276

2277
	hlist_for_each_entry_rcu(li, &l->list, hlist) {
2278
		struct fib_alias *fa;
A
Al Viro 已提交
2279
		__be32 mask, prefix;
O
Olof Johansson 已提交
2280

2281 2282
		mask = inet_make_mask(li->plen);
		prefix = htonl(l->key);
2283

2284
		list_for_each_entry_rcu(fa, &li->falh, fa_list) {
2285
			const struct fib_info *fi = fa->fa_info;
E
Eric Dumazet 已提交
2286
			unsigned int flags = fib_flag_trans(fa->fa_type, mask, fi);
2287

2288 2289 2290
			if (fa->fa_type == RTN_BROADCAST
			    || fa->fa_type == RTN_MULTICAST)
				continue;
2291

2292 2293
			seq_setwidth(seq, 127);

2294
			if (fi)
2295 2296
				seq_printf(seq,
					 "%s\t%08X\t%08X\t%04X\t%d\t%u\t"
2297
					 "%d\t%08X\t%d\t%u\t%u",
2298 2299 2300 2301 2302
					 fi->fib_dev ? fi->fib_dev->name : "*",
					 prefix,
					 fi->fib_nh->nh_gw, flags, 0, 0,
					 fi->fib_priority,
					 mask,
2303 2304
					 (fi->fib_advmss ?
					  fi->fib_advmss + 40 : 0),
2305
					 fi->fib_window,
2306
					 fi->fib_rtt >> 3);
2307
			else
2308 2309
				seq_printf(seq,
					 "*\t%08X\t%08X\t%04X\t%d\t%u\t"
2310
					 "%d\t%08X\t%d\t%u\t%u",
2311
					 prefix, 0, flags, 0, 0, 0,
2312
					 mask, 0, 0, 0);
2313

2314
			seq_pad(seq, '\n');
2315
		}
2316 2317 2318 2319 2320
	}

	return 0;
}

2321
static const struct seq_operations fib_route_seq_ops = {
2322 2323 2324
	.start  = fib_route_seq_start,
	.next   = fib_route_seq_next,
	.stop   = fib_route_seq_stop,
2325
	.show   = fib_route_seq_show,
2326 2327
};

2328
static int fib_route_seq_open(struct inode *inode, struct file *file)
2329
{
2330
	return seq_open_net(inode, file, &fib_route_seq_ops,
2331
			    sizeof(struct fib_route_iter));
2332 2333
}

2334
static const struct file_operations fib_route_fops = {
2335 2336 2337 2338
	.owner  = THIS_MODULE,
	.open   = fib_route_seq_open,
	.read   = seq_read,
	.llseek = seq_lseek,
2339
	.release = seq_release_net,
2340 2341
};

2342
int __net_init fib_proc_init(struct net *net)
2343
{
2344
	if (!proc_create("fib_trie", S_IRUGO, net->proc_net, &fib_trie_fops))
2345 2346
		goto out1;

2347 2348
	if (!proc_create("fib_triestat", S_IRUGO, net->proc_net,
			 &fib_triestat_fops))
2349 2350
		goto out2;

2351
	if (!proc_create("route", S_IRUGO, net->proc_net, &fib_route_fops))
2352 2353
		goto out3;

2354
	return 0;
2355 2356

out3:
2357
	remove_proc_entry("fib_triestat", net->proc_net);
2358
out2:
2359
	remove_proc_entry("fib_trie", net->proc_net);
2360 2361
out1:
	return -ENOMEM;
2362 2363
}

2364
void __net_exit fib_proc_exit(struct net *net)
2365
{
2366 2367 2368
	remove_proc_entry("fib_trie", net->proc_net);
	remove_proc_entry("fib_triestat", net->proc_net);
	remove_proc_entry("route", net->proc_net);
2369 2370 2371
}

#endif /* CONFIG_PROC_FS */