fib_trie.c 56.0 KB
Newer Older
1 2 3 4 5 6 7 8 9
/*
 *   This program is free software; you can redistribute it and/or
 *   modify it under the terms of the GNU General Public License
 *   as published by the Free Software Foundation; either version
 *   2 of the License, or (at your option) any later version.
 *
 *   Robert Olsson <robert.olsson@its.uu.se> Uppsala Universitet
 *     & Swedish University of Agricultural Sciences.
 *
10
 *   Jens Laas <jens.laas@data.slu.se> Swedish University of
11
 *     Agricultural Sciences.
12
 *
13 14
 *   Hans Liss <hans.liss@its.uu.se>  Uppsala Universitet
 *
L
Lucas De Marchi 已提交
15
 * This work is based on the LPC-trie which is originally described in:
16
 *
17 18
 * An experimental study of compression methods for dynamic tries
 * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
19
 * http://www.csc.kth.se/~snilsson/software/dyntrie2/
20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41
 *
 *
 * IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
 * IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999
 *
 *
 * Code from fib_hash has been reused which includes the following header:
 *
 *
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		IPv4 FIB: lookup engine and maintenance routines.
 *
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
R
Robert Olsson 已提交
42 43 44 45 46 47 48
 *
 * Substantial contributions to this work comes from:
 *
 *		David S. Miller, <davem@davemloft.net>
 *		Stephen Hemminger <shemminger@osdl.org>
 *		Paul E. McKenney <paulmck@us.ibm.com>
 *		Patrick McHardy <kaber@trash.net>
49 50
 */

J
Jens Låås 已提交
51
#define VERSION "0.409"
52 53

#include <asm/uaccess.h>
J
Jiri Slaby 已提交
54
#include <linux/bitops.h>
55 56 57 58 59 60 61 62 63
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
S
Stephen Hemminger 已提交
64
#include <linux/inetdevice.h>
65 66 67
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/proc_fs.h>
R
Robert Olsson 已提交
68
#include <linux/rcupdate.h>
69 70 71 72
#include <linux/skbuff.h>
#include <linux/netlink.h>
#include <linux/init.h>
#include <linux/list.h>
73
#include <linux/slab.h>
74
#include <linux/export.h>
75
#include <net/net_namespace.h>
76 77 78 79 80 81 82 83
#include <net/ip.h>
#include <net/protocol.h>
#include <net/route.h>
#include <net/tcp.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include "fib_lookup.h"

R
Robert Olsson 已提交
84
#define MAX_STAT_DEPTH 32
85 86 87 88 89

#define KEYLENGTH (8*sizeof(t_key))

typedef unsigned int t_key;

90 91
#define IS_TNODE(n) ((n)->bits)
#define IS_LEAF(n) (!(n)->bits)
R
Robert Olsson 已提交
92

93
#define get_index(_key, _kv) (((_key) ^ (_kv)->key) >> (_kv)->pos)
94

95 96 97 98 99
struct tnode {
	t_key key;
	unsigned char bits;		/* 2log(KEYLENGTH) bits needed */
	unsigned char pos;		/* 2log(KEYLENGTH) bits needed */
	struct tnode __rcu *parent;
100
	struct rcu_head rcu;
A
Alexander Duyck 已提交
101 102 103 104 105 106 107 108 109 110
	union {
		/* The fields in this struct are valid if bits > 0 (TNODE) */
		struct {
			unsigned int full_children;  /* KEYLENGTH bits needed */
			unsigned int empty_children; /* KEYLENGTH bits needed */
			struct tnode __rcu *child[0];
		};
		/* This list pointer if valid if bits == 0 (LEAF) */
		struct hlist_head list;
	};
111 112 113 114 115
};

struct leaf_info {
	struct hlist_node hlist;
	int plen;
116
	u32 mask_plen; /* ntohl(inet_make_mask(plen)) */
117
	struct list_head falh;
118
	struct rcu_head rcu;
119 120 121 122 123 124 125 126 127
};

#ifdef CONFIG_IP_FIB_TRIE_STATS
struct trie_use_stats {
	unsigned int gets;
	unsigned int backtrack;
	unsigned int semantic_match_passed;
	unsigned int semantic_match_miss;
	unsigned int null_node_hit;
128
	unsigned int resize_node_skipped;
129 130 131 132 133 134 135 136 137
};
#endif

struct trie_stat {
	unsigned int totdepth;
	unsigned int maxdepth;
	unsigned int tnodes;
	unsigned int leaves;
	unsigned int nullpointers;
138
	unsigned int prefixes;
R
Robert Olsson 已提交
139
	unsigned int nodesizes[MAX_STAT_DEPTH];
140
};
141 142

struct trie {
A
Alexander Duyck 已提交
143
	struct tnode __rcu *trie;
144
#ifdef CONFIG_IP_FIB_TRIE_STATS
145
	struct trie_use_stats __percpu *stats;
146 147 148
#endif
};

149
static void resize(struct trie *t, struct tnode *tn);
150 151 152 153 154 155 156 157
static size_t tnode_free_size;

/*
 * synchronize_rcu after call_rcu for that many pages; it should be especially
 * useful before resizing the root node with PREEMPT_NONE configs; the value was
 * obtained experimentally, aiming to avoid visible slowdown.
 */
static const int sync_pages = 128;
158

159
static struct kmem_cache *fn_alias_kmem __read_mostly;
160
static struct kmem_cache *trie_leaf_kmem __read_mostly;
161

162 163
/* caller must hold RTNL */
#define node_parent(n) rtnl_dereference((n)->parent)
E
Eric Dumazet 已提交
164

165 166
/* caller must hold RCU read lock or RTNL */
#define node_parent_rcu(n) rcu_dereference_rtnl((n)->parent)
E
Eric Dumazet 已提交
167

168
/* wrapper for rcu_assign_pointer */
A
Alexander Duyck 已提交
169
static inline void node_set_parent(struct tnode *n, struct tnode *tp)
170
{
A
Alexander Duyck 已提交
171 172
	if (n)
		rcu_assign_pointer(n->parent, tp);
S
Stephen Hemminger 已提交
173 174
}

175 176 177 178
#define NODE_INIT_PARENT(n, p) RCU_INIT_POINTER((n)->parent, p)

/* This provides us with the number of children in this node, in the case of a
 * leaf this will return 0 meaning none of the children are accessible.
179
 */
180
static inline unsigned long tnode_child_length(const struct tnode *tn)
S
Stephen Hemminger 已提交
181
{
182
	return (1ul << tn->bits) & ~(1ul);
S
Stephen Hemminger 已提交
183
}
R
Robert Olsson 已提交
184

185 186 187
/* caller must hold RTNL */
static inline struct tnode *tnode_get_child(const struct tnode *tn,
					    unsigned long i)
188
{
189
	BUG_ON(i >= tnode_child_length(tn));
R
Robert Olsson 已提交
190

E
Eric Dumazet 已提交
191
	return rtnl_dereference(tn->child[i]);
192 193
}

194 195 196
/* caller must hold RCU read lock or RTNL */
static inline struct tnode *tnode_get_child_rcu(const struct tnode *tn,
						unsigned long i)
197
{
198
	BUG_ON(i >= tnode_child_length(tn));
199

E
Eric Dumazet 已提交
200
	return rcu_dereference_rtnl(tn->child[i]);
201 202
}

203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260
/* To understand this stuff, an understanding of keys and all their bits is
 * necessary. Every node in the trie has a key associated with it, but not
 * all of the bits in that key are significant.
 *
 * Consider a node 'n' and its parent 'tp'.
 *
 * If n is a leaf, every bit in its key is significant. Its presence is
 * necessitated by path compression, since during a tree traversal (when
 * searching for a leaf - unless we are doing an insertion) we will completely
 * ignore all skipped bits we encounter. Thus we need to verify, at the end of
 * a potentially successful search, that we have indeed been walking the
 * correct key path.
 *
 * Note that we can never "miss" the correct key in the tree if present by
 * following the wrong path. Path compression ensures that segments of the key
 * that are the same for all keys with a given prefix are skipped, but the
 * skipped part *is* identical for each node in the subtrie below the skipped
 * bit! trie_insert() in this implementation takes care of that.
 *
 * if n is an internal node - a 'tnode' here, the various parts of its key
 * have many different meanings.
 *
 * Example:
 * _________________________________________________________________
 * | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
 * -----------------------------------------------------------------
 *  31  30  29  28  27  26  25  24  23  22  21  20  19  18  17  16
 *
 * _________________________________________________________________
 * | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
 * -----------------------------------------------------------------
 *  15  14  13  12  11  10   9   8   7   6   5   4   3   2   1   0
 *
 * tp->pos = 22
 * tp->bits = 3
 * n->pos = 13
 * n->bits = 4
 *
 * First, let's just ignore the bits that come before the parent tp, that is
 * the bits from (tp->pos + tp->bits) to 31. They are *known* but at this
 * point we do not use them for anything.
 *
 * The bits from (tp->pos) to (tp->pos + tp->bits - 1) - "N", above - are the
 * index into the parent's child array. That is, they will be used to find
 * 'n' among tp's children.
 *
 * The bits from (n->pos + n->bits) to (tn->pos - 1) - "S" - are skipped bits
 * for the node n.
 *
 * All the bits we have seen so far are significant to the node n. The rest
 * of the bits are really not needed or indeed known in n->key.
 *
 * The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into
 * n's child array, and will of course be different for each child.
 *
 * The rest of the bits, from 0 to (n->pos + n->bits), are completely unknown
 * at this point.
 */
261

262 263
static const int halve_threshold = 25;
static const int inflate_threshold = 50;
264
static const int halve_threshold_root = 15;
J
Jens Låås 已提交
265
static const int inflate_threshold_root = 30;
R
Robert Olsson 已提交
266 267

static void __alias_free_mem(struct rcu_head *head)
268
{
R
Robert Olsson 已提交
269 270
	struct fib_alias *fa = container_of(head, struct fib_alias, rcu);
	kmem_cache_free(fn_alias_kmem, fa);
271 272
}

R
Robert Olsson 已提交
273
static inline void alias_free_mem_rcu(struct fib_alias *fa)
274
{
R
Robert Olsson 已提交
275 276
	call_rcu(&fa->rcu, __alias_free_mem);
}
O
Olof Johansson 已提交
277

278
#define TNODE_KMALLOC_MAX \
A
Alexander Duyck 已提交
279
	ilog2((PAGE_SIZE - sizeof(struct tnode)) / sizeof(struct tnode *))
O
Olof Johansson 已提交
280

281
static void __node_free_rcu(struct rcu_head *head)
282
{
A
Alexander Duyck 已提交
283
	struct tnode *n = container_of(head, struct tnode, rcu);
284 285 286 287 288 289 290

	if (IS_LEAF(n))
		kmem_cache_free(trie_leaf_kmem, n);
	else if (n->bits <= TNODE_KMALLOC_MAX)
		kfree(n);
	else
		vfree(n);
291 292
}

293 294
#define node_free(n) call_rcu(&n->rcu, __node_free_rcu)

R
Robert Olsson 已提交
295
static inline void free_leaf_info(struct leaf_info *leaf)
296
{
297
	kfree_rcu(leaf, rcu);
298 299
}

300
static struct tnode *tnode_alloc(size_t size)
301
{
R
Robert Olsson 已提交
302
	if (size <= PAGE_SIZE)
303
		return kzalloc(size, GFP_KERNEL);
304
	else
305
		return vzalloc(size);
306
}
R
Robert Olsson 已提交
307

A
Alexander Duyck 已提交
308
static struct tnode *leaf_new(t_key key)
R
Robert Olsson 已提交
309
{
A
Alexander Duyck 已提交
310
	struct tnode *l = kmem_cache_alloc(trie_leaf_kmem, GFP_KERNEL);
R
Robert Olsson 已提交
311
	if (l) {
312 313 314 315 316 317
		l->parent = NULL;
		/* set key and pos to reflect full key value
		 * any trailing zeros in the key should be ignored
		 * as the nodes are searched
		 */
		l->key = key;
318
		l->pos = 0;
319 320 321
		/* set bits to 0 indicating we are not a tnode */
		l->bits = 0;

R
Robert Olsson 已提交
322 323 324 325 326 327 328 329 330 331
		INIT_HLIST_HEAD(&l->list);
	}
	return l;
}

static struct leaf_info *leaf_info_new(int plen)
{
	struct leaf_info *li = kmalloc(sizeof(struct leaf_info),  GFP_KERNEL);
	if (li) {
		li->plen = plen;
332
		li->mask_plen = ntohl(inet_make_mask(plen));
R
Robert Olsson 已提交
333 334 335 336 337
		INIT_LIST_HEAD(&li->falh);
	}
	return li;
}

338
static struct tnode *tnode_new(t_key key, int pos, int bits)
339
{
340
	size_t sz = offsetof(struct tnode, child[1 << bits]);
341
	struct tnode *tn = tnode_alloc(sz);
342 343 344 345
	unsigned int shift = pos + bits;

	/* verify bits and pos their msb bits clear and values are valid */
	BUG_ON(!bits || (shift > KEYLENGTH));
346

O
Olof Johansson 已提交
347
	if (tn) {
348
		tn->parent = NULL;
349 350
		tn->pos = pos;
		tn->bits = bits;
351
		tn->key = (shift < KEYLENGTH) ? (key >> shift) << shift : 0;
352 353 354
		tn->full_children = 0;
		tn->empty_children = 1<<bits;
	}
355

E
Eric Dumazet 已提交
356
	pr_debug("AT %p s=%zu %zu\n", tn, sizeof(struct tnode),
A
Alexander Duyck 已提交
357
		 sizeof(struct tnode *) << bits);
358 359 360
	return tn;
}

361
/* Check whether a tnode 'n' is "full", i.e. it is an internal node
362 363
 * and no bits are skipped. See discussion in dyntree paper p. 6
 */
A
Alexander Duyck 已提交
364
static inline int tnode_full(const struct tnode *tn, const struct tnode *n)
365
{
366
	return n && ((n->pos + n->bits) == tn->pos) && IS_TNODE(n);
367 368
}

369 370 371 372
/* Add a child at position i overwriting the old value.
 * Update the value of full_children and empty_children.
 */
static void put_child(struct tnode *tn, unsigned long i, struct tnode *n)
373
{
A
Alexander Duyck 已提交
374
	struct tnode *chi = rtnl_dereference(tn->child[i]);
375
	int isfull, wasfull;
376

377
	BUG_ON(i >= tnode_child_length(tn));
S
Stephen Hemminger 已提交
378

379 380 381 382 383
	/* update emptyChildren */
	if (n == NULL && chi != NULL)
		tn->empty_children++;
	else if (n != NULL && chi == NULL)
		tn->empty_children--;
384

385
	/* update fullChildren */
386
	wasfull = tnode_full(tn, chi);
387
	isfull = tnode_full(tn, n);
388

389
	if (wasfull && !isfull)
390
		tn->full_children--;
391
	else if (!wasfull && isfull)
392
		tn->full_children++;
O
Olof Johansson 已提交
393

394
	node_set_parent(n, tn);
395

396
	rcu_assign_pointer(tn->child[i], n);
397 398
}

399 400 401 402 403 404 405 406 407
static void put_child_root(struct tnode *tp, struct trie *t,
			   t_key key, struct tnode *n)
{
	if (tp)
		put_child(tp, get_index(key, tp), n);
	else
		rcu_assign_pointer(t->trie, n);
}

408
static inline void tnode_free_init(struct tnode *tn)
E
Eric Dumazet 已提交
409
{
410 411 412 413 414 415 416 417
	tn->rcu.next = NULL;
}

static inline void tnode_free_append(struct tnode *tn, struct tnode *n)
{
	n->rcu.next = tn->rcu.next;
	tn->rcu.next = &n->rcu;
}
E
Eric Dumazet 已提交
418

419 420 421 422 423 424 425 426 427 428 429 430 431 432 433
static void tnode_free(struct tnode *tn)
{
	struct callback_head *head = &tn->rcu;

	while (head) {
		head = head->next;
		tnode_free_size += offsetof(struct tnode, child[1 << tn->bits]);
		node_free(tn);

		tn = container_of(head, struct tnode, rcu);
	}

	if (tnode_free_size >= PAGE_SIZE * sync_pages) {
		tnode_free_size = 0;
		synchronize_rcu();
E
Eric Dumazet 已提交
434 435 436
	}
}

437
static int inflate(struct trie *t, struct tnode *oldtnode)
438
{
439
	unsigned long olen = tnode_child_length(oldtnode);
440
	struct tnode *tp = node_parent(oldtnode);
A
Alexander Duyck 已提交
441
	struct tnode *tn;
442
	unsigned long i;
443
	t_key m;
444

S
Stephen Hemminger 已提交
445
	pr_debug("In inflate\n");
446

447
	tn = tnode_new(oldtnode->key, oldtnode->pos - 1, oldtnode->bits + 1);
S
Stephen Hemminger 已提交
448
	if (!tn)
449
		return -ENOMEM;
450 451

	/*
452 453 454
	 * Preallocate and store tnodes before the actual work so we
	 * don't get into an inconsistent state if memory allocation
	 * fails. In case of failure we return the oldnode and  inflate
455 456
	 * of tnode is ignored.
	 */
457 458
	for (i = 0, m = 1u << tn->pos; i < olen; i++) {
		struct tnode *inode = tnode_get_child(oldtnode, i);
O
Olof Johansson 已提交
459

460
		if (tnode_full(oldtnode, inode) && (inode->bits > 1)) {
461
			struct tnode *left, *right;
462

463
			left = tnode_new(inode->key & ~m, inode->pos,
464
					 inode->bits - 1);
465 466
			if (!left)
				goto nomem;
467
			tnode_free_append(tn, left);
O
Olof Johansson 已提交
468

469
			right = tnode_new(inode->key | m, inode->pos,
470 471
					  inode->bits - 1);

472
			if (!right)
473
				goto nomem;
474
			tnode_free_append(tn, right);
475

A
Alexander Duyck 已提交
476 477
			put_child(tn, 2*i, left);
			put_child(tn, 2*i+1, right);
478 479 480
		}
	}

481 482 483
	/* prepare oldtnode to be freed */
	tnode_free_init(oldtnode);

O
Olof Johansson 已提交
484
	for (i = 0; i < olen; i++) {
A
Alexander Duyck 已提交
485
		struct tnode *inode = tnode_get_child(oldtnode, i);
O
Olof Johansson 已提交
486
		struct tnode *left, *right;
487
		unsigned long size, j;
488

489
		/* An empty child */
A
Alexander Duyck 已提交
490
		if (inode == NULL)
491 492 493
			continue;

		/* A leaf or an internal node with skipped bits */
A
Alexander Duyck 已提交
494
		if (!tnode_full(oldtnode, inode)) {
495
			put_child(tn, get_index(inode->key, tn), inode);
496 497 498
			continue;
		}

499 500 501
		/* drop the node in the old tnode free list */
		tnode_free_append(oldtnode, inode);

502 503
		/* An internal node with two children */
		if (inode->bits == 1) {
L
Lin Ming 已提交
504 505
			put_child(tn, 2*i, rtnl_dereference(inode->child[0]));
			put_child(tn, 2*i+1, rtnl_dereference(inode->child[1]));
O
Olof Johansson 已提交
506
			continue;
507 508
		}

O
Olof Johansson 已提交
509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526
		/* An internal node with more than two children */

		/* We will replace this node 'inode' with two new
		 * ones, 'left' and 'right', each with half of the
		 * original children. The two new nodes will have
		 * a position one bit further down the key and this
		 * means that the "significant" part of their keys
		 * (see the discussion near the top of this file)
		 * will differ by one bit, which will be "0" in
		 * left's key and "1" in right's key. Since we are
		 * moving the key position by one step, the bit that
		 * we are moving away from - the bit at position
		 * (inode->pos) - is the one that will differ between
		 * left and right. So... we synthesize that bit in the
		 * two  new keys.
		 * The mask 'm' below will be a single "one" bit at
		 * the position (inode->pos)
		 */
527

O
Olof Johansson 已提交
528 529 530
		/* Use the old key, but set the new significant
		 *   bit to zero.
		 */
531

A
Alexander Duyck 已提交
532
		left = tnode_get_child(tn, 2*i);
L
Lin Ming 已提交
533
		put_child(tn, 2*i, NULL);
534

O
Olof Johansson 已提交
535
		BUG_ON(!left);
536

A
Alexander Duyck 已提交
537
		right = tnode_get_child(tn, 2*i+1);
L
Lin Ming 已提交
538
		put_child(tn, 2*i+1, NULL);
539

O
Olof Johansson 已提交
540
		BUG_ON(!right);
541

O
Olof Johansson 已提交
542 543
		size = tnode_child_length(left);
		for (j = 0; j < size; j++) {
L
Lin Ming 已提交
544 545
			put_child(left, j, rtnl_dereference(inode->child[j]));
			put_child(right, j, rtnl_dereference(inode->child[j + size]));
546
		}
547 548 549

		put_child(tn, 2 * i, left);
		put_child(tn, 2 * i + 1, right);
O
Olof Johansson 已提交
550

551
		/* resize child nodes */
552 553
		resize(t, left);
		resize(t, right);
554
	}
555 556

	put_child_root(tp, t, tn->key, tn);
557 558 559

	/* we completed without error, prepare to free old node */
	tnode_free(oldtnode);
560
	return 0;
561
nomem:
562 563
	/* all pointers should be clean so we are done */
	tnode_free(tn);
564
	return -ENOMEM;
565 566
}

567
static int halve(struct trie *t, struct tnode *oldtnode)
568
{
569
	unsigned long olen = tnode_child_length(oldtnode);
570
	struct tnode *tp = node_parent(oldtnode);
A
Alexander Duyck 已提交
571
	struct tnode *tn, *left, *right;
572 573
	int i;

S
Stephen Hemminger 已提交
574
	pr_debug("In halve\n");
575

576
	tn = tnode_new(oldtnode->key, oldtnode->pos + 1, oldtnode->bits - 1);
577
	if (!tn)
578
		return -ENOMEM;
579 580

	/*
581 582 583
	 * Preallocate and store tnodes before the actual work so we
	 * don't get into an inconsistent state if memory allocation
	 * fails. In case of failure we return the oldnode and halve
584 585 586
	 * of tnode is ignored.
	 */

O
Olof Johansson 已提交
587
	for (i = 0; i < olen; i += 2) {
588 589
		left = tnode_get_child(oldtnode, i);
		right = tnode_get_child(oldtnode, i+1);
590

591
		/* Two nonempty children */
S
Stephen Hemminger 已提交
592
		if (left && right) {
593
			struct tnode *newn;
S
Stephen Hemminger 已提交
594

595
			newn = tnode_new(left->key, oldtnode->pos, 1);
596
			if (!newn) {
597
				tnode_free(tn);
598 599
				return -ENOMEM;
			}
600
			tnode_free_append(tn, newn);
S
Stephen Hemminger 已提交
601

A
Alexander Duyck 已提交
602
			put_child(tn, i/2, newn);
603 604 605
		}

	}
606

607 608 609
	/* prepare oldtnode to be freed */
	tnode_free_init(oldtnode);

O
Olof Johansson 已提交
610 611 612
	for (i = 0; i < olen; i += 2) {
		struct tnode *newBinNode;

613 614
		left = tnode_get_child(oldtnode, i);
		right = tnode_get_child(oldtnode, i+1);
615

616 617 618 619
		/* At least one of the children is empty */
		if (left == NULL) {
			if (right == NULL)    /* Both are empty */
				continue;
L
Lin Ming 已提交
620
			put_child(tn, i/2, right);
O
Olof Johansson 已提交
621
			continue;
S
Stephen Hemminger 已提交
622
		}
O
Olof Johansson 已提交
623 624

		if (right == NULL) {
L
Lin Ming 已提交
625
			put_child(tn, i/2, left);
O
Olof Johansson 已提交
626 627
			continue;
		}
628

629
		/* Two nonempty children */
A
Alexander Duyck 已提交
630
		newBinNode = tnode_get_child(tn, i/2);
L
Lin Ming 已提交
631 632
		put_child(newBinNode, 0, left);
		put_child(newBinNode, 1, right);
633 634 635

		put_child(tn, i / 2, newBinNode);

636
		/* resize child node */
637
		resize(t, newBinNode);
638
	}
639 640

	put_child_root(tp, t, tn->key, tn);
641 642 643

	/* all pointers should be clean so we are done */
	tnode_free(oldtnode);
644 645

	return 0;
646 647
}

648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704
/* From "Implementing a dynamic compressed trie" by Stefan Nilsson of
 * the Helsinki University of Technology and Matti Tikkanen of Nokia
 * Telecommunications, page 6:
 * "A node is doubled if the ratio of non-empty children to all
 * children in the *doubled* node is at least 'high'."
 *
 * 'high' in this instance is the variable 'inflate_threshold'. It
 * is expressed as a percentage, so we multiply it with
 * tnode_child_length() and instead of multiplying by 2 (since the
 * child array will be doubled by inflate()) and multiplying
 * the left-hand side by 100 (to handle the percentage thing) we
 * multiply the left-hand side by 50.
 *
 * The left-hand side may look a bit weird: tnode_child_length(tn)
 * - tn->empty_children is of course the number of non-null children
 * in the current node. tn->full_children is the number of "full"
 * children, that is non-null tnodes with a skip value of 0.
 * All of those will be doubled in the resulting inflated tnode, so
 * we just count them one extra time here.
 *
 * A clearer way to write this would be:
 *
 * to_be_doubled = tn->full_children;
 * not_to_be_doubled = tnode_child_length(tn) - tn->empty_children -
 *     tn->full_children;
 *
 * new_child_length = tnode_child_length(tn) * 2;
 *
 * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) /
 *      new_child_length;
 * if (new_fill_factor >= inflate_threshold)
 *
 * ...and so on, tho it would mess up the while () loop.
 *
 * anyway,
 * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >=
 *      inflate_threshold
 *
 * avoid a division:
 * 100 * (not_to_be_doubled + 2*to_be_doubled) >=
 *      inflate_threshold * new_child_length
 *
 * expand not_to_be_doubled and to_be_doubled, and shorten:
 * 100 * (tnode_child_length(tn) - tn->empty_children +
 *    tn->full_children) >= inflate_threshold * new_child_length
 *
 * expand new_child_length:
 * 100 * (tnode_child_length(tn) - tn->empty_children +
 *    tn->full_children) >=
 *      inflate_threshold * tnode_child_length(tn) * 2
 *
 * shorten again:
 * 50 * (tn->full_children + tnode_child_length(tn) -
 *    tn->empty_children) >= inflate_threshold *
 *    tnode_child_length(tn)
 *
 */
705
static bool should_inflate(const struct tnode *tp, const struct tnode *tn)
706 707 708 709 710
{
	unsigned long used = tnode_child_length(tn);
	unsigned long threshold = used;

	/* Keep root node larger */
711
	threshold *= tp ? inflate_threshold : inflate_threshold_root;
712 713 714 715 716 717
	used += tn->full_children;
	used -= tn->empty_children;

	return tn->pos && ((50 * used) >= threshold);
}

718
static bool should_halve(const struct tnode *tp, const struct tnode *tn)
719 720 721 722 723
{
	unsigned long used = tnode_child_length(tn);
	unsigned long threshold = used;

	/* Keep root node larger */
724
	threshold *= tp ? halve_threshold : halve_threshold_root;
725 726 727 728 729
	used -= tn->empty_children;

	return (tn->bits > 1) && ((100 * used) < threshold);
}

730
#define MAX_WORK 10
731
static void resize(struct trie *t, struct tnode *tn)
732
{
733 734
	struct tnode *tp = node_parent(tn), *n = NULL;
	struct tnode __rcu **cptr;
735 736 737 738 739
	int max_work;

	pr_debug("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
		 tn, inflate_threshold, halve_threshold);

740 741 742 743 744 745 746
	/* track the tnode via the pointer from the parent instead of
	 * doing it ourselves.  This way we can let RCU fully do its
	 * thing without us interfering
	 */
	cptr = tp ? &tp->child[get_index(tn->key, tp)] : &t->trie;
	BUG_ON(tn != rtnl_dereference(*cptr));

747 748 749 750 751 752 753 754
	/* No children */
	if (tn->empty_children > (tnode_child_length(tn) - 1))
		goto no_children;

	/* One child */
	if (tn->empty_children == (tnode_child_length(tn) - 1))
		goto one_child;

755 756
	/* Double as long as the resulting node has a number of
	 * nonempty nodes that are above the threshold.
757 758
	 */
	max_work = MAX_WORK;
759 760
	while (should_inflate(tp, tn) && max_work--) {
		if (inflate(t, tn)) {
761 762 763 764 765
#ifdef CONFIG_IP_FIB_TRIE_STATS
			this_cpu_inc(t->stats->resize_node_skipped);
#endif
			break;
		}
766 767

		tn = rtnl_dereference(*cptr);
768 769 770 771
	}

	/* Return if at least one inflate is run */
	if (max_work != MAX_WORK)
772
		return;
773

774
	/* Halve as long as the number of empty children in this
775 776 777
	 * node is above threshold.
	 */
	max_work = MAX_WORK;
778 779
	while (should_halve(tp, tn) && max_work--) {
		if (halve(t, tn)) {
780 781 782 783 784 785
#ifdef CONFIG_IP_FIB_TRIE_STATS
			this_cpu_inc(t->stats->resize_node_skipped);
#endif
			break;
		}

786 787
		tn = rtnl_dereference(*cptr);
	}
788 789 790 791 792 793 794 795 796

	/* Only one child remains */
	if (tn->empty_children == (tnode_child_length(tn) - 1)) {
		unsigned long i;
one_child:
		for (i = tnode_child_length(tn); !n && i;)
			n = tnode_get_child(tn, --i);
no_children:
		/* compress one level */
797 798 799 800
		put_child_root(tp, t, tn->key, n);
		node_set_parent(n, tp);

		/* drop dead node */
801 802
		tnode_free_init(tn);
		tnode_free(tn);
803 804 805
	}
}

R
Robert Olsson 已提交
806
/* readside must use rcu_read_lock currently dump routines
R
Robert Olsson 已提交
807 808
 via get_fa_head and dump */

A
Alexander Duyck 已提交
809
static struct leaf_info *find_leaf_info(struct tnode *l, int plen)
810
{
R
Robert Olsson 已提交
811
	struct hlist_head *head = &l->list;
812 813
	struct leaf_info *li;

814
	hlist_for_each_entry_rcu(li, head, hlist)
815
		if (li->plen == plen)
816
			return li;
O
Olof Johansson 已提交
817

818 819 820
	return NULL;
}

A
Alexander Duyck 已提交
821
static inline struct list_head *get_fa_head(struct tnode *l, int plen)
822
{
R
Robert Olsson 已提交
823
	struct leaf_info *li = find_leaf_info(l, plen);
824

O
Olof Johansson 已提交
825 826
	if (!li)
		return NULL;
827

O
Olof Johansson 已提交
828
	return &li->falh;
829 830 831 832
}

static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new)
{
833 834 835 836 837
	struct leaf_info *li = NULL, *last = NULL;

	if (hlist_empty(head)) {
		hlist_add_head_rcu(&new->hlist, head);
	} else {
838
		hlist_for_each_entry(li, head, hlist) {
839 840 841 842 843 844
			if (new->plen > li->plen)
				break;

			last = li;
		}
		if (last)
845
			hlist_add_behind_rcu(&new->hlist, &last->hlist);
846 847 848
		else
			hlist_add_before_rcu(&new->hlist, &li->hlist);
	}
849 850
}

R
Robert Olsson 已提交
851
/* rcu_read_lock needs to be hold by caller from readside */
A
Alexander Duyck 已提交
852
static struct tnode *fib_find_node(struct trie *t, u32 key)
853
{
A
Alexander Duyck 已提交
854
	struct tnode *n = rcu_dereference_rtnl(t->trie);
A
Alexander Duyck 已提交
855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873

	while (n) {
		unsigned long index = get_index(key, n);

		/* This bit of code is a bit tricky but it combines multiple
		 * checks into a single check.  The prefix consists of the
		 * prefix plus zeros for the bits in the cindex. The index
		 * is the difference between the key and this value.  From
		 * this we can actually derive several pieces of data.
		 *   if !(index >> bits)
		 *     we know the value is cindex
		 *   else
		 *     we have a mismatch in skip bits and failed
		 */
		if (index >> n->bits)
			return NULL;

		/* we have found a leaf. Prefixes have already been compared */
		if (IS_LEAF(n))
874 875
			break;

A
Alexander Duyck 已提交
876 877
		n = rcu_dereference_rtnl(n->child[index]);
	}
O
Olof Johansson 已提交
878

A
Alexander Duyck 已提交
879
	return n;
880 881
}

882
static void trie_rebalance(struct trie *t, struct tnode *tn)
883
{
S
Stephen Hemminger 已提交
884
	struct tnode *tp;
885

886 887
	while ((tp = node_parent(tn)) != NULL) {
		resize(t, tn);
S
Stephen Hemminger 已提交
888
		tn = tp;
889
	}
S
Stephen Hemminger 已提交
890

891
	/* Handle last (top) tnode */
892
	if (IS_TNODE(tn))
893
		resize(t, tn);
894 895
}

R
Robert Olsson 已提交
896 897
/* only used from updater-side */

898
static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
899
{
900
	struct list_head *fa_head = NULL;
901
	struct tnode *l, *n, *tp = NULL;
902 903
	struct leaf_info *li;

904 905 906 907 908
	li = leaf_info_new(plen);
	if (!li)
		return NULL;
	fa_head = &li->falh;

E
Eric Dumazet 已提交
909
	n = rtnl_dereference(t->trie);
910

911 912
	/* If we point to NULL, stop. Either the tree is empty and we should
	 * just put a new leaf in if, or we have reached an empty child slot,
913 914
	 * and we should just put our new leaf in that.
	 *
915 916 917
	 * If we hit a node with a key that does't match then we should stop
	 * and create a new tnode to replace that node and insert ourselves
	 * and the other node into the new tnode.
918
	 */
919 920
	while (n) {
		unsigned long index = get_index(key, n);
921

922 923 924 925 926 927 928 929 930 931 932
		/* This bit of code is a bit tricky but it combines multiple
		 * checks into a single check.  The prefix consists of the
		 * prefix plus zeros for the "bits" in the prefix. The index
		 * is the difference between the key and this value.  From
		 * this we can actually derive several pieces of data.
		 *   if !(index >> bits)
		 *     we know the value is child index
		 *   else
		 *     we have a mismatch in skip bits and failed
		 */
		if (index >> n->bits)
933 934
			break;

935 936 937 938 939 940
		/* we have found a leaf. Prefixes have already been compared */
		if (IS_LEAF(n)) {
			/* Case 1: n is a leaf, and prefixes match*/
			insert_leaf_info(&n->list, li);
			return fa_head;
		}
941

942 943
		tp = n;
		n = rcu_dereference_rtnl(n->child[index]);
944 945
	}

946 947 948
	l = leaf_new(key);
	if (!l) {
		free_leaf_info(li);
949
		return NULL;
950
	}
951 952 953

	insert_leaf_info(&l->list, li);

954 955 956 957 958 959 960 961
	/* Case 2: n is a LEAF or a TNODE and the key doesn't match.
	 *
	 *  Add a new tnode here
	 *  first tnode need some special handling
	 *  leaves us in position for handling as case 3
	 */
	if (n) {
		struct tnode *tn;
962

963
		tn = tnode_new(key, __fls(key ^ n->key), 1);
964
		if (!tn) {
965
			free_leaf_info(li);
966
			node_free(l);
967
			return NULL;
O
Olof Johansson 已提交
968 969
		}

970 971 972
		/* initialize routes out of node */
		NODE_INIT_PARENT(tn, tp);
		put_child(tn, get_index(key, tn) ^ 1, n);
973

974 975 976
		/* start adding routes into the node */
		put_child_root(tp, t, key, tn);
		node_set_parent(n, tn);
977

978
		/* parent now has a NULL spot where the leaf can go */
979
		tp = tn;
980
	}
O
Olof Johansson 已提交
981

982 983 984 985 986 987 988 989
	/* Case 3: n is NULL, and will just insert a new leaf */
	if (tp) {
		NODE_INIT_PARENT(l, tp);
		put_child(tp, get_index(key, tp), l);
		trie_rebalance(t, tp);
	} else {
		rcu_assign_pointer(t->trie, l);
	}
R
Robert Olsson 已提交
990

991 992 993
	return fa_head;
}

994 995 996
/*
 * Caller must hold RTNL.
 */
997
int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
998 999 1000
{
	struct trie *t = (struct trie *) tb->tb_data;
	struct fib_alias *fa, *new_fa;
1001
	struct list_head *fa_head = NULL;
1002
	struct fib_info *fi;
1003 1004
	int plen = cfg->fc_dst_len;
	u8 tos = cfg->fc_tos;
1005 1006
	u32 key, mask;
	int err;
A
Alexander Duyck 已提交
1007
	struct tnode *l;
1008 1009 1010 1011

	if (plen > 32)
		return -EINVAL;

1012
	key = ntohl(cfg->fc_dst);
1013

1014
	pr_debug("Insert table=%u %08x/%d\n", tb->tb_id, key, plen);
1015

O
Olof Johansson 已提交
1016
	mask = ntohl(inet_make_mask(plen));
1017

1018
	if (key & ~mask)
1019 1020 1021 1022
		return -EINVAL;

	key = key & mask;

1023 1024 1025
	fi = fib_create_info(cfg);
	if (IS_ERR(fi)) {
		err = PTR_ERR(fi);
1026
		goto err;
1027
	}
1028 1029

	l = fib_find_node(t, key);
1030
	fa = NULL;
1031

1032
	if (l) {
1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047
		fa_head = get_fa_head(l, plen);
		fa = fib_find_alias(fa_head, tos, fi->fib_priority);
	}

	/* Now fa, if non-NULL, points to the first fib alias
	 * with the same keys [prefix,tos,priority], if such key already
	 * exists or to the node before which we will insert new one.
	 *
	 * If fa is NULL, we will need to allocate a new one and
	 * insert to the head of f.
	 *
	 * If f is NULL, no fib node matched the destination key
	 * and we need to allocate a new one of those as well.
	 */

1048 1049 1050
	if (fa && fa->fa_tos == tos &&
	    fa->fa_info->fib_priority == fi->fib_priority) {
		struct fib_alias *fa_first, *fa_match;
1051 1052

		err = -EEXIST;
1053
		if (cfg->fc_nlflags & NLM_F_EXCL)
1054 1055
			goto out;

1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075
		/* We have 2 goals:
		 * 1. Find exact match for type, scope, fib_info to avoid
		 * duplicate routes
		 * 2. Find next 'fa' (or head), NLM_F_APPEND inserts before it
		 */
		fa_match = NULL;
		fa_first = fa;
		fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
		list_for_each_entry_continue(fa, fa_head, fa_list) {
			if (fa->fa_tos != tos)
				break;
			if (fa->fa_info->fib_priority != fi->fib_priority)
				break;
			if (fa->fa_type == cfg->fc_type &&
			    fa->fa_info == fi) {
				fa_match = fa;
				break;
			}
		}

1076
		if (cfg->fc_nlflags & NLM_F_REPLACE) {
1077 1078 1079
			struct fib_info *fi_drop;
			u8 state;

1080 1081 1082 1083
			fa = fa_first;
			if (fa_match) {
				if (fa == fa_match)
					err = 0;
1084
				goto out;
1085
			}
R
Robert Olsson 已提交
1086
			err = -ENOBUFS;
1087
			new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
R
Robert Olsson 已提交
1088 1089
			if (new_fa == NULL)
				goto out;
1090 1091

			fi_drop = fa->fa_info;
R
Robert Olsson 已提交
1092 1093
			new_fa->fa_tos = fa->fa_tos;
			new_fa->fa_info = fi;
1094
			new_fa->fa_type = cfg->fc_type;
1095
			state = fa->fa_state;
1096
			new_fa->fa_state = state & ~FA_S_ACCESSED;
1097

R
Robert Olsson 已提交
1098 1099
			list_replace_rcu(&fa->fa_list, &new_fa->fa_list);
			alias_free_mem_rcu(fa);
1100 1101 1102

			fib_release_info(fi_drop);
			if (state & FA_S_ACCESSED)
1103
				rt_cache_flush(cfg->fc_nlinfo.nl_net);
1104 1105
			rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen,
				tb->tb_id, &cfg->fc_nlinfo, NLM_F_REPLACE);
1106

O
Olof Johansson 已提交
1107
			goto succeeded;
1108 1109 1110 1111 1112
		}
		/* Error if we find a perfect match which
		 * uses the same scope, type, and nexthop
		 * information.
		 */
1113 1114
		if (fa_match)
			goto out;
1115

1116
		if (!(cfg->fc_nlflags & NLM_F_APPEND))
1117
			fa = fa_first;
1118 1119
	}
	err = -ENOENT;
1120
	if (!(cfg->fc_nlflags & NLM_F_CREATE))
1121 1122 1123
		goto out;

	err = -ENOBUFS;
1124
	new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
1125 1126 1127 1128 1129
	if (new_fa == NULL)
		goto out;

	new_fa->fa_info = fi;
	new_fa->fa_tos = tos;
1130
	new_fa->fa_type = cfg->fc_type;
1131 1132 1133 1134 1135
	new_fa->fa_state = 0;
	/*
	 * Insert new entry to the list.
	 */

1136
	if (!fa_head) {
1137 1138 1139
		fa_head = fib_insert_node(t, key, plen);
		if (unlikely(!fa_head)) {
			err = -ENOMEM;
1140
			goto out_free_new_fa;
1141
		}
1142
	}
1143

1144 1145 1146
	if (!plen)
		tb->tb_num_default++;

R
Robert Olsson 已提交
1147 1148
	list_add_tail_rcu(&new_fa->fa_list,
			  (fa ? &fa->fa_list : fa_head));
1149

1150
	rt_cache_flush(cfg->fc_nlinfo.nl_net);
1151
	rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id,
1152
		  &cfg->fc_nlinfo, 0);
1153 1154
succeeded:
	return 0;
1155 1156 1157

out_free_new_fa:
	kmem_cache_free(fn_alias_kmem, new_fa);
1158 1159
out:
	fib_release_info(fi);
O
Olof Johansson 已提交
1160
err:
1161 1162 1163
	return err;
}

1164 1165 1166 1167 1168 1169 1170
static inline t_key prefix_mismatch(t_key key, struct tnode *n)
{
	t_key prefix = n->key;

	return (key ^ prefix) & (prefix | -prefix);
}

1171
/* should be called with rcu_read_lock */
1172
int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp,
E
Eric Dumazet 已提交
1173
		     struct fib_result *res, int fib_flags)
1174
{
1175
	struct trie *t = (struct trie *)tb->tb_data;
1176 1177 1178
#ifdef CONFIG_IP_FIB_TRIE_STATS
	struct trie_use_stats __percpu *stats = t->stats;
#endif
1179 1180
	const t_key key = ntohl(flp->daddr);
	struct tnode *n, *pn;
1181
	struct leaf_info *li;
1182
	t_key cindex;
O
Olof Johansson 已提交
1183

R
Robert Olsson 已提交
1184
	n = rcu_dereference(t->trie);
1185
	if (!n)
1186
		return -EAGAIN;
1187 1188

#ifdef CONFIG_IP_FIB_TRIE_STATS
1189
	this_cpu_inc(stats->gets);
1190 1191
#endif

A
Alexander Duyck 已提交
1192
	pn = n;
1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210
	cindex = 0;

	/* Step 1: Travel to the longest prefix match in the trie */
	for (;;) {
		unsigned long index = get_index(key, n);

		/* This bit of code is a bit tricky but it combines multiple
		 * checks into a single check.  The prefix consists of the
		 * prefix plus zeros for the "bits" in the prefix. The index
		 * is the difference between the key and this value.  From
		 * this we can actually derive several pieces of data.
		 *   if !(index >> bits)
		 *     we know the value is child index
		 *   else
		 *     we have a mismatch in skip bits and failed
		 */
		if (index >> n->bits)
			break;
1211

1212 1213
		/* we have found a leaf. Prefixes have already been compared */
		if (IS_LEAF(n))
1214
			goto found;
1215

1216 1217
		/* only record pn and cindex if we are going to be chopping
		 * bits later.  Otherwise we are just wasting cycles.
O
Olof Johansson 已提交
1218
		 */
1219 1220 1221
		if (index) {
			pn = n;
			cindex = index;
O
Olof Johansson 已提交
1222
		}
1223

1224 1225 1226 1227
		n = rcu_dereference(n->child[index]);
		if (unlikely(!n))
			goto backtrace;
	}
1228

1229 1230 1231 1232
	/* Step 2: Sort out leaves and begin backtracing for longest prefix */
	for (;;) {
		/* record the pointer where our next node pointer is stored */
		struct tnode __rcu **cptr = n->child;
1233

1234 1235 1236
		/* This test verifies that none of the bits that differ
		 * between the key and the prefix exist in the region of
		 * the lsb and higher in the prefix.
O
Olof Johansson 已提交
1237
		 */
1238 1239
		if (unlikely(prefix_mismatch(key, n)))
			goto backtrace;
O
Olof Johansson 已提交
1240

1241 1242 1243
		/* exit out and process leaf */
		if (unlikely(IS_LEAF(n)))
			break;
O
Olof Johansson 已提交
1244

1245 1246 1247
		/* Don't bother recording parent info.  Since we are in
		 * prefix match mode we will have to come back to wherever
		 * we started this traversal anyway
O
Olof Johansson 已提交
1248 1249
		 */

1250
		while ((n = rcu_dereference(*cptr)) == NULL) {
1251 1252
backtrace:
#ifdef CONFIG_IP_FIB_TRIE_STATS
1253 1254
			if (!n)
				this_cpu_inc(stats->null_node_hit);
1255
#endif
1256 1257 1258 1259 1260 1261 1262 1263 1264 1265
			/* If we are at cindex 0 there are no more bits for
			 * us to strip at this level so we must ascend back
			 * up one level to see if there are any more bits to
			 * be stripped there.
			 */
			while (!cindex) {
				t_key pkey = pn->key;

				pn = node_parent_rcu(pn);
				if (unlikely(!pn))
1266
					return -EAGAIN;
1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278
#ifdef CONFIG_IP_FIB_TRIE_STATS
				this_cpu_inc(stats->backtrack);
#endif
				/* Get Child's index */
				cindex = get_index(pkey, pn);
			}

			/* strip the least significant bit from the cindex */
			cindex &= cindex - 1;

			/* grab pointer for next child node */
			cptr = &pn->child[cindex];
1279
		}
1280
	}
1281

1282
found:
1283
	/* Step 3: Process the leaf, if that fails fall back to backtracing */
1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339
	hlist_for_each_entry_rcu(li, &n->list, hlist) {
		struct fib_alias *fa;

		if ((key ^ n->key) & li->mask_plen)
			continue;

		list_for_each_entry_rcu(fa, &li->falh, fa_list) {
			struct fib_info *fi = fa->fa_info;
			int nhsel, err;

			if (fa->fa_tos && fa->fa_tos != flp->flowi4_tos)
				continue;
			if (fi->fib_dead)
				continue;
			if (fa->fa_info->fib_scope < flp->flowi4_scope)
				continue;
			fib_alias_accessed(fa);
			err = fib_props[fa->fa_type].error;
			if (unlikely(err < 0)) {
#ifdef CONFIG_IP_FIB_TRIE_STATS
				this_cpu_inc(stats->semantic_match_passed);
#endif
				return err;
			}
			if (fi->fib_flags & RTNH_F_DEAD)
				continue;
			for (nhsel = 0; nhsel < fi->fib_nhs; nhsel++) {
				const struct fib_nh *nh = &fi->fib_nh[nhsel];

				if (nh->nh_flags & RTNH_F_DEAD)
					continue;
				if (flp->flowi4_oif && flp->flowi4_oif != nh->nh_oif)
					continue;

				if (!(fib_flags & FIB_LOOKUP_NOREF))
					atomic_inc(&fi->fib_clntref);

				res->prefixlen = li->plen;
				res->nh_sel = nhsel;
				res->type = fa->fa_type;
				res->scope = fi->fib_scope;
				res->fi = fi;
				res->table = tb;
				res->fa_head = &li->falh;
#ifdef CONFIG_IP_FIB_TRIE_STATS
				this_cpu_inc(stats->semantic_match_passed);
#endif
				return err;
			}
		}

#ifdef CONFIG_IP_FIB_TRIE_STATS
		this_cpu_inc(stats->semantic_match_miss);
#endif
	}
	goto backtrace;
1340
}
1341
EXPORT_SYMBOL_GPL(fib_table_lookup);
1342

1343 1344 1345
/*
 * Remove the leaf and return parent.
 */
A
Alexander Duyck 已提交
1346
static void trie_leaf_remove(struct trie *t, struct tnode *l)
1347
{
1348
	struct tnode *tp = node_parent(l);
1349

1350
	pr_debug("entering trie_leaf_remove(%p)\n", l);
1351

1352
	if (tp) {
1353
		put_child(tp, get_index(l->key, tp), NULL);
1354
		trie_rebalance(t, tp);
1355
	} else {
1356
		RCU_INIT_POINTER(t->trie, NULL);
1357
	}
1358

1359
	node_free(l);
1360 1361
}

1362 1363 1364
/*
 * Caller must hold RTNL.
 */
1365
int fib_table_delete(struct fib_table *tb, struct fib_config *cfg)
1366 1367 1368
{
	struct trie *t = (struct trie *) tb->tb_data;
	u32 key, mask;
1369 1370
	int plen = cfg->fc_dst_len;
	u8 tos = cfg->fc_tos;
1371 1372
	struct fib_alias *fa, *fa_to_delete;
	struct list_head *fa_head;
A
Alexander Duyck 已提交
1373
	struct tnode *l;
O
Olof Johansson 已提交
1374 1375
	struct leaf_info *li;

1376
	if (plen > 32)
1377 1378
		return -EINVAL;

1379
	key = ntohl(cfg->fc_dst);
O
Olof Johansson 已提交
1380
	mask = ntohl(inet_make_mask(plen));
1381

1382
	if (key & ~mask)
1383 1384 1385 1386 1387
		return -EINVAL;

	key = key & mask;
	l = fib_find_node(t, key);

1388
	if (!l)
1389 1390
		return -ESRCH;

1391 1392 1393 1394 1395 1396
	li = find_leaf_info(l, plen);

	if (!li)
		return -ESRCH;

	fa_head = &li->falh;
1397 1398 1399 1400 1401
	fa = fib_find_alias(fa_head, tos, 0);

	if (!fa)
		return -ESRCH;

S
Stephen Hemminger 已提交
1402
	pr_debug("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t);
1403 1404

	fa_to_delete = NULL;
1405 1406
	fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
	list_for_each_entry_continue(fa, fa_head, fa_list) {
1407 1408 1409 1410 1411
		struct fib_info *fi = fa->fa_info;

		if (fa->fa_tos != tos)
			break;

1412 1413
		if ((!cfg->fc_type || fa->fa_type == cfg->fc_type) &&
		    (cfg->fc_scope == RT_SCOPE_NOWHERE ||
1414
		     fa->fa_info->fib_scope == cfg->fc_scope) &&
1415 1416
		    (!cfg->fc_prefsrc ||
		     fi->fib_prefsrc == cfg->fc_prefsrc) &&
1417 1418 1419
		    (!cfg->fc_protocol ||
		     fi->fib_protocol == cfg->fc_protocol) &&
		    fib_nh_match(cfg, fi) == 0) {
1420 1421 1422 1423 1424
			fa_to_delete = fa;
			break;
		}
	}

O
Olof Johansson 已提交
1425 1426
	if (!fa_to_delete)
		return -ESRCH;
1427

O
Olof Johansson 已提交
1428
	fa = fa_to_delete;
1429
	rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id,
1430
		  &cfg->fc_nlinfo, 0);
O
Olof Johansson 已提交
1431

R
Robert Olsson 已提交
1432
	list_del_rcu(&fa->fa_list);
1433

1434 1435 1436
	if (!plen)
		tb->tb_num_default--;

O
Olof Johansson 已提交
1437
	if (list_empty(fa_head)) {
R
Robert Olsson 已提交
1438
		hlist_del_rcu(&li->hlist);
O
Olof Johansson 已提交
1439
		free_leaf_info(li);
R
Robert Olsson 已提交
1440
	}
1441

O
Olof Johansson 已提交
1442
	if (hlist_empty(&l->list))
1443
		trie_leaf_remove(t, l);
1444

O
Olof Johansson 已提交
1445
	if (fa->fa_state & FA_S_ACCESSED)
1446
		rt_cache_flush(cfg->fc_nlinfo.nl_net);
1447

R
Robert Olsson 已提交
1448 1449
	fib_release_info(fa->fa_info);
	alias_free_mem_rcu(fa);
O
Olof Johansson 已提交
1450
	return 0;
1451 1452
}

1453
static int trie_flush_list(struct list_head *head)
1454 1455 1456 1457 1458 1459 1460
{
	struct fib_alias *fa, *fa_node;
	int found = 0;

	list_for_each_entry_safe(fa, fa_node, head, fa_list) {
		struct fib_info *fi = fa->fa_info;

R
Robert Olsson 已提交
1461 1462 1463 1464
		if (fi && (fi->fib_flags & RTNH_F_DEAD)) {
			list_del_rcu(&fa->fa_list);
			fib_release_info(fa->fa_info);
			alias_free_mem_rcu(fa);
1465 1466 1467 1468 1469 1470
			found++;
		}
	}
	return found;
}

A
Alexander Duyck 已提交
1471
static int trie_flush_leaf(struct tnode *l)
1472 1473 1474
{
	int found = 0;
	struct hlist_head *lih = &l->list;
1475
	struct hlist_node *tmp;
1476 1477
	struct leaf_info *li = NULL;

1478
	hlist_for_each_entry_safe(li, tmp, lih, hlist) {
1479
		found += trie_flush_list(&li->falh);
1480 1481

		if (list_empty(&li->falh)) {
R
Robert Olsson 已提交
1482
			hlist_del_rcu(&li->hlist);
1483 1484 1485 1486 1487 1488
			free_leaf_info(li);
		}
	}
	return found;
}

1489 1490 1491 1492
/*
 * Scan for the next right leaf starting at node p->child[idx]
 * Since we have back pointer, no recursion necessary.
 */
A
Alexander Duyck 已提交
1493
static struct tnode *leaf_walk_rcu(struct tnode *p, struct tnode *c)
1494
{
1495
	do {
1496
		unsigned long idx = c ? idx = get_index(c->key, p) + 1 : 0;
R
Robert Olsson 已提交
1497

1498
		while (idx < tnode_child_length(p)) {
1499
			c = tnode_get_child_rcu(p, idx++);
R
Robert Olsson 已提交
1500
			if (!c)
O
Olof Johansson 已提交
1501 1502
				continue;

1503
			if (IS_LEAF(c))
A
Alexander Duyck 已提交
1504
				return c;
1505 1506

			/* Rescan start scanning in new node */
A
Alexander Duyck 已提交
1507
			p = c;
1508
			idx = 0;
1509
		}
1510 1511

		/* Node empty, walk back up to parent */
A
Alexander Duyck 已提交
1512
		c = p;
E
Eric Dumazet 已提交
1513
	} while ((p = node_parent_rcu(c)) != NULL);
1514 1515 1516 1517

	return NULL; /* Root of trie */
}

A
Alexander Duyck 已提交
1518
static struct tnode *trie_firstleaf(struct trie *t)
1519
{
A
Alexander Duyck 已提交
1520
	struct tnode *n = rcu_dereference_rtnl(t->trie);
1521 1522 1523 1524 1525

	if (!n)
		return NULL;

	if (IS_LEAF(n))          /* trie is just a leaf */
A
Alexander Duyck 已提交
1526
		return n;
1527 1528 1529 1530

	return leaf_walk_rcu(n, NULL);
}

A
Alexander Duyck 已提交
1531
static struct tnode *trie_nextleaf(struct tnode *l)
1532
{
A
Alexander Duyck 已提交
1533
	struct tnode *p = node_parent_rcu(l);
1534 1535 1536 1537

	if (!p)
		return NULL;	/* trie with just one leaf */

A
Alexander Duyck 已提交
1538
	return leaf_walk_rcu(p, l);
1539 1540
}

A
Alexander Duyck 已提交
1541
static struct tnode *trie_leafindex(struct trie *t, int index)
1542
{
A
Alexander Duyck 已提交
1543
	struct tnode *l = trie_firstleaf(t);
1544

S
Stephen Hemminger 已提交
1545
	while (l && index-- > 0)
1546
		l = trie_nextleaf(l);
S
Stephen Hemminger 已提交
1547

1548 1549 1550 1551
	return l;
}


1552 1553 1554
/*
 * Caller must hold RTNL.
 */
1555
int fib_table_flush(struct fib_table *tb)
1556 1557
{
	struct trie *t = (struct trie *) tb->tb_data;
A
Alexander Duyck 已提交
1558
	struct tnode *l, *ll = NULL;
1559
	int found = 0;
1560

1561
	for (l = trie_firstleaf(t); l; l = trie_nextleaf(l)) {
1562
		found += trie_flush_leaf(l);
1563 1564

		if (ll && hlist_empty(&ll->list))
1565
			trie_leaf_remove(t, ll);
1566 1567 1568 1569
		ll = l;
	}

	if (ll && hlist_empty(&ll->list))
1570
		trie_leaf_remove(t, ll);
1571

S
Stephen Hemminger 已提交
1572
	pr_debug("trie_flush found=%d\n", found);
1573 1574 1575
	return found;
}

1576 1577
void fib_free_table(struct fib_table *tb)
{
1578 1579 1580 1581 1582
#ifdef CONFIG_IP_FIB_TRIE_STATS
	struct trie *t = (struct trie *)tb->tb_data;

	free_percpu(t->stats);
#endif /* CONFIG_IP_FIB_TRIE_STATS */
1583 1584 1585
	kfree(tb);
}

1586 1587
static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah,
			   struct fib_table *tb,
1588 1589 1590 1591
			   struct sk_buff *skb, struct netlink_callback *cb)
{
	int i, s_i;
	struct fib_alias *fa;
A
Al Viro 已提交
1592
	__be32 xkey = htonl(key);
1593

1594
	s_i = cb->args[5];
1595 1596
	i = 0;

R
Robert Olsson 已提交
1597 1598 1599
	/* rcu_read_lock is hold by caller */

	list_for_each_entry_rcu(fa, fah, fa_list) {
1600 1601 1602 1603 1604
		if (i < s_i) {
			i++;
			continue;
		}

1605
		if (fib_dump_info(skb, NETLINK_CB(cb->skb).portid,
1606 1607 1608 1609
				  cb->nlh->nlmsg_seq,
				  RTM_NEWROUTE,
				  tb->tb_id,
				  fa->fa_type,
1610
				  xkey,
1611 1612
				  plen,
				  fa->fa_tos,
1613
				  fa->fa_info, NLM_F_MULTI) < 0) {
1614
			cb->args[5] = i;
1615
			return -1;
O
Olof Johansson 已提交
1616
		}
1617 1618
		i++;
	}
1619
	cb->args[5] = i;
1620 1621 1622
	return skb->len;
}

A
Alexander Duyck 已提交
1623
static int fn_trie_dump_leaf(struct tnode *l, struct fib_table *tb,
1624
			struct sk_buff *skb, struct netlink_callback *cb)
1625
{
1626 1627
	struct leaf_info *li;
	int i, s_i;
1628

1629
	s_i = cb->args[4];
1630
	i = 0;
1631

1632
	/* rcu_read_lock is hold by caller */
1633
	hlist_for_each_entry_rcu(li, &l->list, hlist) {
1634 1635
		if (i < s_i) {
			i++;
1636
			continue;
1637
		}
O
Olof Johansson 已提交
1638

1639
		if (i > s_i)
1640
			cb->args[5] = 0;
1641

1642
		if (list_empty(&li->falh))
1643 1644
			continue;

1645
		if (fn_trie_dump_fa(l->key, li->plen, &li->falh, tb, skb, cb) < 0) {
1646
			cb->args[4] = i;
1647 1648
			return -1;
		}
1649
		i++;
1650
	}
1651

1652
	cb->args[4] = i;
1653 1654 1655
	return skb->len;
}

1656 1657
int fib_table_dump(struct fib_table *tb, struct sk_buff *skb,
		   struct netlink_callback *cb)
1658
{
A
Alexander Duyck 已提交
1659
	struct tnode *l;
1660
	struct trie *t = (struct trie *) tb->tb_data;
1661
	t_key key = cb->args[2];
1662
	int count = cb->args[3];
1663

R
Robert Olsson 已提交
1664
	rcu_read_lock();
1665 1666 1667
	/* Dump starting at last key.
	 * Note: 0.0.0.0/0 (ie default) is first key.
	 */
1668
	if (count == 0)
1669 1670
		l = trie_firstleaf(t);
	else {
1671 1672 1673
		/* Normally, continue from last key, but if that is missing
		 * fallback to using slow rescan
		 */
1674
		l = fib_find_node(t, key);
1675 1676
		if (!l)
			l = trie_leafindex(t, count);
1677
	}
1678

1679 1680
	while (l) {
		cb->args[2] = l->key;
1681
		if (fn_trie_dump_leaf(l, tb, skb, cb) < 0) {
1682
			cb->args[3] = count;
1683 1684
			rcu_read_unlock();
			return -1;
1685
		}
1686

1687
		++count;
1688
		l = trie_nextleaf(l);
1689 1690
		memset(&cb->args[4], 0,
		       sizeof(cb->args) - 4*sizeof(cb->args[0]));
1691
	}
1692
	cb->args[3] = count;
R
Robert Olsson 已提交
1693
	rcu_read_unlock();
1694

1695 1696 1697
	return skb->len;
}

1698
void __init fib_trie_init(void)
1699
{
1700 1701
	fn_alias_kmem = kmem_cache_create("ip_fib_alias",
					  sizeof(struct fib_alias),
1702 1703 1704
					  0, SLAB_PANIC, NULL);

	trie_leaf_kmem = kmem_cache_create("ip_fib_trie",
A
Alexander Duyck 已提交
1705
					   max(sizeof(struct tnode),
1706 1707
					       sizeof(struct leaf_info)),
					   0, SLAB_PANIC, NULL);
1708
}
1709

1710

1711
struct fib_table *fib_trie_table(u32 id)
1712 1713 1714 1715 1716 1717 1718 1719 1720 1721
{
	struct fib_table *tb;
	struct trie *t;

	tb = kmalloc(sizeof(struct fib_table) + sizeof(struct trie),
		     GFP_KERNEL);
	if (tb == NULL)
		return NULL;

	tb->tb_id = id;
1722
	tb->tb_default = -1;
1723
	tb->tb_num_default = 0;
1724 1725

	t = (struct trie *) tb->tb_data;
1726 1727 1728 1729 1730 1731 1732 1733
	RCU_INIT_POINTER(t->trie, NULL);
#ifdef CONFIG_IP_FIB_TRIE_STATS
	t->stats = alloc_percpu(struct trie_use_stats);
	if (!t->stats) {
		kfree(tb);
		tb = NULL;
	}
#endif
1734 1735 1736 1737

	return tb;
}

1738 1739 1740
#ifdef CONFIG_PROC_FS
/* Depth first Trie walk iterator */
struct fib_trie_iter {
1741
	struct seq_net_private p;
1742
	struct fib_table *tb;
1743
	struct tnode *tnode;
E
Eric Dumazet 已提交
1744 1745
	unsigned int index;
	unsigned int depth;
1746
};
1747

A
Alexander Duyck 已提交
1748
static struct tnode *fib_trie_get_next(struct fib_trie_iter *iter)
1749
{
1750
	unsigned long cindex = iter->index;
1751 1752
	struct tnode *tn = iter->tnode;
	struct tnode *p;
1753

1754 1755 1756 1757
	/* A single entry routing table */
	if (!tn)
		return NULL;

1758 1759 1760
	pr_debug("get_next iter={node=%p index=%d depth=%d}\n",
		 iter->tnode, iter->index, iter->depth);
rescan:
1761
	while (cindex < tnode_child_length(tn)) {
A
Alexander Duyck 已提交
1762
		struct tnode *n = tnode_get_child_rcu(tn, cindex);
1763

1764 1765 1766 1767 1768 1769
		if (n) {
			if (IS_LEAF(n)) {
				iter->tnode = tn;
				iter->index = cindex + 1;
			} else {
				/* push down one level */
A
Alexander Duyck 已提交
1770
				iter->tnode = n;
1771 1772 1773 1774 1775
				iter->index = 0;
				++iter->depth;
			}
			return n;
		}
1776

1777 1778
		++cindex;
	}
O
Olof Johansson 已提交
1779

1780
	/* Current node exhausted, pop back up */
A
Alexander Duyck 已提交
1781
	p = node_parent_rcu(tn);
1782
	if (p) {
1783
		cindex = get_index(tn->key, p) + 1;
1784 1785 1786
		tn = p;
		--iter->depth;
		goto rescan;
1787
	}
1788 1789 1790

	/* got root? */
	return NULL;
1791 1792
}

A
Alexander Duyck 已提交
1793
static struct tnode *fib_trie_get_first(struct fib_trie_iter *iter,
1794
				       struct trie *t)
1795
{
A
Alexander Duyck 已提交
1796
	struct tnode *n;
1797

S
Stephen Hemminger 已提交
1798
	if (!t)
1799 1800 1801
		return NULL;

	n = rcu_dereference(t->trie);
1802
	if (!n)
1803
		return NULL;
1804

1805
	if (IS_TNODE(n)) {
A
Alexander Duyck 已提交
1806
		iter->tnode = n;
1807 1808 1809 1810 1811 1812
		iter->index = 0;
		iter->depth = 1;
	} else {
		iter->tnode = NULL;
		iter->index = 0;
		iter->depth = 0;
O
Olof Johansson 已提交
1813
	}
1814 1815

	return n;
1816
}
O
Olof Johansson 已提交
1817

1818 1819
static void trie_collect_stats(struct trie *t, struct trie_stat *s)
{
A
Alexander Duyck 已提交
1820
	struct tnode *n;
1821
	struct fib_trie_iter iter;
O
Olof Johansson 已提交
1822

1823
	memset(s, 0, sizeof(*s));
O
Olof Johansson 已提交
1824

1825
	rcu_read_lock();
1826
	for (n = fib_trie_get_first(&iter, t); n; n = fib_trie_get_next(&iter)) {
1827
		if (IS_LEAF(n)) {
1828 1829
			struct leaf_info *li;

1830 1831 1832 1833
			s->leaves++;
			s->totdepth += iter.depth;
			if (iter.depth > s->maxdepth)
				s->maxdepth = iter.depth;
1834

A
Alexander Duyck 已提交
1835
			hlist_for_each_entry_rcu(li, &n->list, hlist)
1836
				++s->prefixes;
1837
		} else {
1838
			unsigned long i;
1839 1840

			s->tnodes++;
A
Alexander Duyck 已提交
1841 1842
			if (n->bits < MAX_STAT_DEPTH)
				s->nodesizes[n->bits]++;
R
Robert Olsson 已提交
1843

1844
			for (i = 0; i < tnode_child_length(n); i++) {
A
Alexander Duyck 已提交
1845
				if (!rcu_access_pointer(n->child[i]))
1846
					s->nullpointers++;
1847
			}
1848 1849
		}
	}
R
Robert Olsson 已提交
1850
	rcu_read_unlock();
1851 1852
}

1853 1854 1855 1856
/*
 *	This outputs /proc/net/fib_triestats
 */
static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat)
1857
{
E
Eric Dumazet 已提交
1858
	unsigned int i, max, pointers, bytes, avdepth;
1859

1860 1861 1862 1863
	if (stat->leaves)
		avdepth = stat->totdepth*100 / stat->leaves;
	else
		avdepth = 0;
O
Olof Johansson 已提交
1864

1865 1866
	seq_printf(seq, "\tAver depth:     %u.%02d\n",
		   avdepth / 100, avdepth % 100);
1867
	seq_printf(seq, "\tMax depth:      %u\n", stat->maxdepth);
O
Olof Johansson 已提交
1868

1869
	seq_printf(seq, "\tLeaves:         %u\n", stat->leaves);
A
Alexander Duyck 已提交
1870
	bytes = sizeof(struct tnode) * stat->leaves;
1871 1872 1873 1874

	seq_printf(seq, "\tPrefixes:       %u\n", stat->prefixes);
	bytes += sizeof(struct leaf_info) * stat->prefixes;

1875
	seq_printf(seq, "\tInternal nodes: %u\n\t", stat->tnodes);
1876
	bytes += sizeof(struct tnode) * stat->tnodes;
1877

R
Robert Olsson 已提交
1878 1879
	max = MAX_STAT_DEPTH;
	while (max > 0 && stat->nodesizes[max-1] == 0)
1880
		max--;
1881

1882
	pointers = 0;
1883
	for (i = 1; i < max; i++)
1884
		if (stat->nodesizes[i] != 0) {
1885
			seq_printf(seq, "  %u: %u",  i, stat->nodesizes[i]);
1886 1887 1888
			pointers += (1<<i) * stat->nodesizes[i];
		}
	seq_putc(seq, '\n');
1889
	seq_printf(seq, "\tPointers: %u\n", pointers);
R
Robert Olsson 已提交
1890

A
Alexander Duyck 已提交
1891
	bytes += sizeof(struct tnode *) * pointers;
1892 1893
	seq_printf(seq, "Null ptrs: %u\n", stat->nullpointers);
	seq_printf(seq, "Total size: %u  kB\n", (bytes + 1023) / 1024);
1894
}
R
Robert Olsson 已提交
1895

1896
#ifdef CONFIG_IP_FIB_TRIE_STATS
1897
static void trie_show_usage(struct seq_file *seq,
1898
			    const struct trie_use_stats __percpu *stats)
1899
{
1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914
	struct trie_use_stats s = { 0 };
	int cpu;

	/* loop through all of the CPUs and gather up the stats */
	for_each_possible_cpu(cpu) {
		const struct trie_use_stats *pcpu = per_cpu_ptr(stats, cpu);

		s.gets += pcpu->gets;
		s.backtrack += pcpu->backtrack;
		s.semantic_match_passed += pcpu->semantic_match_passed;
		s.semantic_match_miss += pcpu->semantic_match_miss;
		s.null_node_hit += pcpu->null_node_hit;
		s.resize_node_skipped += pcpu->resize_node_skipped;
	}

1915
	seq_printf(seq, "\nCounters:\n---------\n");
1916 1917
	seq_printf(seq, "gets = %u\n", s.gets);
	seq_printf(seq, "backtracks = %u\n", s.backtrack);
1918
	seq_printf(seq, "semantic match passed = %u\n",
1919 1920 1921 1922
		   s.semantic_match_passed);
	seq_printf(seq, "semantic match miss = %u\n", s.semantic_match_miss);
	seq_printf(seq, "null node hit= %u\n", s.null_node_hit);
	seq_printf(seq, "skipped node resize = %u\n\n", s.resize_node_skipped);
1923
}
1924 1925
#endif /*  CONFIG_IP_FIB_TRIE_STATS */

1926
static void fib_table_print(struct seq_file *seq, struct fib_table *tb)
1927
{
1928 1929 1930 1931 1932 1933
	if (tb->tb_id == RT_TABLE_LOCAL)
		seq_puts(seq, "Local:\n");
	else if (tb->tb_id == RT_TABLE_MAIN)
		seq_puts(seq, "Main:\n");
	else
		seq_printf(seq, "Id %d:\n", tb->tb_id);
1934
}
1935

1936

1937 1938
static int fib_triestat_seq_show(struct seq_file *seq, void *v)
{
1939
	struct net *net = (struct net *)seq->private;
1940
	unsigned int h;
1941

1942
	seq_printf(seq,
1943 1944
		   "Basic info: size of leaf:"
		   " %Zd bytes, size of tnode: %Zd bytes.\n",
A
Alexander Duyck 已提交
1945
		   sizeof(struct tnode), sizeof(struct tnode));
1946

1947 1948 1949 1950
	for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
		struct hlist_head *head = &net->ipv4.fib_table_hash[h];
		struct fib_table *tb;

1951
		hlist_for_each_entry_rcu(tb, head, tb_hlist) {
1952 1953
			struct trie *t = (struct trie *) tb->tb_data;
			struct trie_stat stat;
1954

1955 1956 1957 1958 1959 1960 1961 1962
			if (!t)
				continue;

			fib_table_print(seq, tb);

			trie_collect_stats(t, &stat);
			trie_show_stats(seq, &stat);
#ifdef CONFIG_IP_FIB_TRIE_STATS
1963
			trie_show_usage(seq, t->stats);
1964 1965 1966
#endif
		}
	}
1967

1968
	return 0;
1969 1970
}

1971
static int fib_triestat_seq_open(struct inode *inode, struct file *file)
1972
{
1973
	return single_open_net(inode, file, fib_triestat_seq_show);
1974 1975
}

1976
static const struct file_operations fib_triestat_fops = {
1977 1978 1979 1980
	.owner	= THIS_MODULE,
	.open	= fib_triestat_seq_open,
	.read	= seq_read,
	.llseek	= seq_lseek,
1981
	.release = single_release_net,
1982 1983
};

A
Alexander Duyck 已提交
1984
static struct tnode *fib_trie_get_idx(struct seq_file *seq, loff_t pos)
1985
{
1986 1987
	struct fib_trie_iter *iter = seq->private;
	struct net *net = seq_file_net(seq);
1988
	loff_t idx = 0;
1989
	unsigned int h;
1990

1991 1992 1993
	for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
		struct hlist_head *head = &net->ipv4.fib_table_hash[h];
		struct fib_table *tb;
1994

1995
		hlist_for_each_entry_rcu(tb, head, tb_hlist) {
A
Alexander Duyck 已提交
1996
			struct tnode *n;
1997 1998 1999 2000 2001 2002 2003 2004 2005

			for (n = fib_trie_get_first(iter,
						    (struct trie *) tb->tb_data);
			     n; n = fib_trie_get_next(iter))
				if (pos == idx++) {
					iter->tb = tb;
					return n;
				}
		}
2006
	}
2007

2008 2009 2010
	return NULL;
}

2011
static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos)
2012
	__acquires(RCU)
2013
{
2014
	rcu_read_lock();
2015
	return fib_trie_get_idx(seq, *pos);
2016 2017
}

2018
static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2019
{
2020
	struct fib_trie_iter *iter = seq->private;
2021
	struct net *net = seq_file_net(seq);
2022 2023 2024
	struct fib_table *tb = iter->tb;
	struct hlist_node *tb_node;
	unsigned int h;
A
Alexander Duyck 已提交
2025
	struct tnode *n;
2026

2027
	++*pos;
2028 2029 2030 2031
	/* next node in same table */
	n = fib_trie_get_next(iter);
	if (n)
		return n;
2032

2033 2034
	/* walk rest of this hash chain */
	h = tb->tb_id & (FIB_TABLE_HASHSZ - 1);
E
Eric Dumazet 已提交
2035
	while ((tb_node = rcu_dereference(hlist_next_rcu(&tb->tb_hlist)))) {
2036 2037 2038 2039 2040
		tb = hlist_entry(tb_node, struct fib_table, tb_hlist);
		n = fib_trie_get_first(iter, (struct trie *) tb->tb_data);
		if (n)
			goto found;
	}
2041

2042 2043 2044
	/* new hash chain */
	while (++h < FIB_TABLE_HASHSZ) {
		struct hlist_head *head = &net->ipv4.fib_table_hash[h];
2045
		hlist_for_each_entry_rcu(tb, head, tb_hlist) {
2046 2047 2048 2049 2050
			n = fib_trie_get_first(iter, (struct trie *) tb->tb_data);
			if (n)
				goto found;
		}
	}
2051
	return NULL;
2052 2053 2054 2055

found:
	iter->tb = tb;
	return n;
2056
}
2057

2058
static void fib_trie_seq_stop(struct seq_file *seq, void *v)
2059
	__releases(RCU)
2060
{
2061 2062
	rcu_read_unlock();
}
O
Olof Johansson 已提交
2063

2064 2065
static void seq_indent(struct seq_file *seq, int n)
{
E
Eric Dumazet 已提交
2066 2067
	while (n-- > 0)
		seq_puts(seq, "   ");
2068
}
2069

2070
static inline const char *rtn_scope(char *buf, size_t len, enum rt_scope_t s)
2071
{
S
Stephen Hemminger 已提交
2072
	switch (s) {
2073 2074 2075 2076 2077 2078
	case RT_SCOPE_UNIVERSE: return "universe";
	case RT_SCOPE_SITE:	return "site";
	case RT_SCOPE_LINK:	return "link";
	case RT_SCOPE_HOST:	return "host";
	case RT_SCOPE_NOWHERE:	return "nowhere";
	default:
2079
		snprintf(buf, len, "scope=%d", s);
2080 2081 2082
		return buf;
	}
}
2083

2084
static const char *const rtn_type_names[__RTN_MAX] = {
2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097
	[RTN_UNSPEC] = "UNSPEC",
	[RTN_UNICAST] = "UNICAST",
	[RTN_LOCAL] = "LOCAL",
	[RTN_BROADCAST] = "BROADCAST",
	[RTN_ANYCAST] = "ANYCAST",
	[RTN_MULTICAST] = "MULTICAST",
	[RTN_BLACKHOLE] = "BLACKHOLE",
	[RTN_UNREACHABLE] = "UNREACHABLE",
	[RTN_PROHIBIT] = "PROHIBIT",
	[RTN_THROW] = "THROW",
	[RTN_NAT] = "NAT",
	[RTN_XRESOLVE] = "XRESOLVE",
};
2098

E
Eric Dumazet 已提交
2099
static inline const char *rtn_type(char *buf, size_t len, unsigned int t)
2100 2101 2102
{
	if (t < __RTN_MAX && rtn_type_names[t])
		return rtn_type_names[t];
2103
	snprintf(buf, len, "type %u", t);
2104
	return buf;
2105 2106
}

2107 2108
/* Pretty print the trie */
static int fib_trie_seq_show(struct seq_file *seq, void *v)
2109
{
2110
	const struct fib_trie_iter *iter = seq->private;
A
Alexander Duyck 已提交
2111
	struct tnode *n = v;
2112

2113 2114
	if (!node_parent_rcu(n))
		fib_table_print(seq, iter->tb);
2115

2116
	if (IS_TNODE(n)) {
A
Alexander Duyck 已提交
2117
		__be32 prf = htonl(n->key);
O
Olof Johansson 已提交
2118

2119 2120 2121 2122
		seq_indent(seq, iter->depth-1);
		seq_printf(seq, "  +-- %pI4/%zu %u %u %u\n",
			   &prf, KEYLENGTH - n->pos - n->bits, n->bits,
			   n->full_children, n->empty_children);
2123
	} else {
2124
		struct leaf_info *li;
A
Alexander Duyck 已提交
2125
		__be32 val = htonl(n->key);
2126 2127

		seq_indent(seq, iter->depth);
2128
		seq_printf(seq, "  |-- %pI4\n", &val);
2129

A
Alexander Duyck 已提交
2130
		hlist_for_each_entry_rcu(li, &n->list, hlist) {
2131 2132 2133 2134 2135 2136 2137 2138
			struct fib_alias *fa;

			list_for_each_entry_rcu(fa, &li->falh, fa_list) {
				char buf1[32], buf2[32];

				seq_indent(seq, iter->depth+1);
				seq_printf(seq, "  /%d %s %s", li->plen,
					   rtn_scope(buf1, sizeof(buf1),
2139
						     fa->fa_info->fib_scope),
2140 2141 2142
					   rtn_type(buf2, sizeof(buf2),
						    fa->fa_type));
				if (fa->fa_tos)
2143
					seq_printf(seq, " tos=%d", fa->fa_tos);
2144
				seq_putc(seq, '\n');
2145 2146
			}
		}
2147
	}
2148

2149 2150 2151
	return 0;
}

2152
static const struct seq_operations fib_trie_seq_ops = {
2153 2154 2155 2156
	.start  = fib_trie_seq_start,
	.next   = fib_trie_seq_next,
	.stop   = fib_trie_seq_stop,
	.show   = fib_trie_seq_show,
2157 2158
};

2159
static int fib_trie_seq_open(struct inode *inode, struct file *file)
2160
{
2161 2162
	return seq_open_net(inode, file, &fib_trie_seq_ops,
			    sizeof(struct fib_trie_iter));
2163 2164
}

2165
static const struct file_operations fib_trie_fops = {
2166 2167 2168 2169
	.owner  = THIS_MODULE,
	.open   = fib_trie_seq_open,
	.read   = seq_read,
	.llseek = seq_lseek,
2170
	.release = seq_release_net,
2171 2172
};

2173 2174 2175 2176 2177 2178 2179
struct fib_route_iter {
	struct seq_net_private p;
	struct trie *main_trie;
	loff_t	pos;
	t_key	key;
};

A
Alexander Duyck 已提交
2180
static struct tnode *fib_route_get_idx(struct fib_route_iter *iter, loff_t pos)
2181
{
A
Alexander Duyck 已提交
2182
	struct tnode *l = NULL;
2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212
	struct trie *t = iter->main_trie;

	/* use cache location of last found key */
	if (iter->pos > 0 && pos >= iter->pos && (l = fib_find_node(t, iter->key)))
		pos -= iter->pos;
	else {
		iter->pos = 0;
		l = trie_firstleaf(t);
	}

	while (l && pos-- > 0) {
		iter->pos++;
		l = trie_nextleaf(l);
	}

	if (l)
		iter->key = pos;	/* remember it */
	else
		iter->pos = 0;		/* forget it */

	return l;
}

static void *fib_route_seq_start(struct seq_file *seq, loff_t *pos)
	__acquires(RCU)
{
	struct fib_route_iter *iter = seq->private;
	struct fib_table *tb;

	rcu_read_lock();
2213
	tb = fib_get_table(seq_file_net(seq), RT_TABLE_MAIN);
2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226
	if (!tb)
		return NULL;

	iter->main_trie = (struct trie *) tb->tb_data;
	if (*pos == 0)
		return SEQ_START_TOKEN;
	else
		return fib_route_get_idx(iter, *pos - 1);
}

static void *fib_route_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct fib_route_iter *iter = seq->private;
A
Alexander Duyck 已提交
2227
	struct tnode *l = v;
2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250

	++*pos;
	if (v == SEQ_START_TOKEN) {
		iter->pos = 0;
		l = trie_firstleaf(iter->main_trie);
	} else {
		iter->pos++;
		l = trie_nextleaf(l);
	}

	if (l)
		iter->key = l->key;
	else
		iter->pos = 0;
	return l;
}

static void fib_route_seq_stop(struct seq_file *seq, void *v)
	__releases(RCU)
{
	rcu_read_unlock();
}

E
Eric Dumazet 已提交
2251
static unsigned int fib_flag_trans(int type, __be32 mask, const struct fib_info *fi)
2252
{
E
Eric Dumazet 已提交
2253
	unsigned int flags = 0;
2254

E
Eric Dumazet 已提交
2255 2256
	if (type == RTN_UNREACHABLE || type == RTN_PROHIBIT)
		flags = RTF_REJECT;
2257 2258
	if (fi && fi->fib_nh->nh_gw)
		flags |= RTF_GATEWAY;
A
Al Viro 已提交
2259
	if (mask == htonl(0xFFFFFFFF))
2260 2261 2262
		flags |= RTF_HOST;
	flags |= RTF_UP;
	return flags;
2263 2264
}

2265 2266 2267
/*
 *	This outputs /proc/net/route.
 *	The format of the file is not supposed to be changed
E
Eric Dumazet 已提交
2268
 *	and needs to be same as fib_hash output to avoid breaking
2269 2270 2271
 *	legacy utilities
 */
static int fib_route_seq_show(struct seq_file *seq, void *v)
2272
{
A
Alexander Duyck 已提交
2273
	struct tnode *l = v;
2274
	struct leaf_info *li;
2275

2276 2277 2278 2279 2280 2281
	if (v == SEQ_START_TOKEN) {
		seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway "
			   "\tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU"
			   "\tWindow\tIRTT");
		return 0;
	}
2282

2283
	hlist_for_each_entry_rcu(li, &l->list, hlist) {
2284
		struct fib_alias *fa;
A
Al Viro 已提交
2285
		__be32 mask, prefix;
O
Olof Johansson 已提交
2286

2287 2288
		mask = inet_make_mask(li->plen);
		prefix = htonl(l->key);
2289

2290
		list_for_each_entry_rcu(fa, &li->falh, fa_list) {
2291
			const struct fib_info *fi = fa->fa_info;
E
Eric Dumazet 已提交
2292
			unsigned int flags = fib_flag_trans(fa->fa_type, mask, fi);
2293

2294 2295 2296
			if (fa->fa_type == RTN_BROADCAST
			    || fa->fa_type == RTN_MULTICAST)
				continue;
2297

2298 2299
			seq_setwidth(seq, 127);

2300
			if (fi)
2301 2302
				seq_printf(seq,
					 "%s\t%08X\t%08X\t%04X\t%d\t%u\t"
2303
					 "%d\t%08X\t%d\t%u\t%u",
2304 2305 2306 2307 2308
					 fi->fib_dev ? fi->fib_dev->name : "*",
					 prefix,
					 fi->fib_nh->nh_gw, flags, 0, 0,
					 fi->fib_priority,
					 mask,
2309 2310
					 (fi->fib_advmss ?
					  fi->fib_advmss + 40 : 0),
2311
					 fi->fib_window,
2312
					 fi->fib_rtt >> 3);
2313
			else
2314 2315
				seq_printf(seq,
					 "*\t%08X\t%08X\t%04X\t%d\t%u\t"
2316
					 "%d\t%08X\t%d\t%u\t%u",
2317
					 prefix, 0, flags, 0, 0, 0,
2318
					 mask, 0, 0, 0);
2319

2320
			seq_pad(seq, '\n');
2321
		}
2322 2323 2324 2325 2326
	}

	return 0;
}

2327
static const struct seq_operations fib_route_seq_ops = {
2328 2329 2330
	.start  = fib_route_seq_start,
	.next   = fib_route_seq_next,
	.stop   = fib_route_seq_stop,
2331
	.show   = fib_route_seq_show,
2332 2333
};

2334
static int fib_route_seq_open(struct inode *inode, struct file *file)
2335
{
2336
	return seq_open_net(inode, file, &fib_route_seq_ops,
2337
			    sizeof(struct fib_route_iter));
2338 2339
}

2340
static const struct file_operations fib_route_fops = {
2341 2342 2343 2344
	.owner  = THIS_MODULE,
	.open   = fib_route_seq_open,
	.read   = seq_read,
	.llseek = seq_lseek,
2345
	.release = seq_release_net,
2346 2347
};

2348
int __net_init fib_proc_init(struct net *net)
2349
{
2350
	if (!proc_create("fib_trie", S_IRUGO, net->proc_net, &fib_trie_fops))
2351 2352
		goto out1;

2353 2354
	if (!proc_create("fib_triestat", S_IRUGO, net->proc_net,
			 &fib_triestat_fops))
2355 2356
		goto out2;

2357
	if (!proc_create("route", S_IRUGO, net->proc_net, &fib_route_fops))
2358 2359
		goto out3;

2360
	return 0;
2361 2362

out3:
2363
	remove_proc_entry("fib_triestat", net->proc_net);
2364
out2:
2365
	remove_proc_entry("fib_trie", net->proc_net);
2366 2367
out1:
	return -ENOMEM;
2368 2369
}

2370
void __net_exit fib_proc_exit(struct net *net)
2371
{
2372 2373 2374
	remove_proc_entry("fib_trie", net->proc_net);
	remove_proc_entry("fib_triestat", net->proc_net);
	remove_proc_entry("route", net->proc_net);
2375 2376 2377
}

#endif /* CONFIG_PROC_FS */