fib_trie.c 56.5 KB
Newer Older
1 2 3 4 5 6 7 8 9
/*
 *   This program is free software; you can redistribute it and/or
 *   modify it under the terms of the GNU General Public License
 *   as published by the Free Software Foundation; either version
 *   2 of the License, or (at your option) any later version.
 *
 *   Robert Olsson <robert.olsson@its.uu.se> Uppsala Universitet
 *     & Swedish University of Agricultural Sciences.
 *
10
 *   Jens Laas <jens.laas@data.slu.se> Swedish University of
11
 *     Agricultural Sciences.
12
 *
13 14
 *   Hans Liss <hans.liss@its.uu.se>  Uppsala Universitet
 *
L
Lucas De Marchi 已提交
15
 * This work is based on the LPC-trie which is originally described in:
16
 *
17 18
 * An experimental study of compression methods for dynamic tries
 * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
19
 * http://www.csc.kth.se/~snilsson/software/dyntrie2/
20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41
 *
 *
 * IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
 * IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999
 *
 *
 * Code from fib_hash has been reused which includes the following header:
 *
 *
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		IPv4 FIB: lookup engine and maintenance routines.
 *
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
R
Robert Olsson 已提交
42 43 44 45 46 47 48
 *
 * Substantial contributions to this work comes from:
 *
 *		David S. Miller, <davem@davemloft.net>
 *		Stephen Hemminger <shemminger@osdl.org>
 *		Paul E. McKenney <paulmck@us.ibm.com>
 *		Patrick McHardy <kaber@trash.net>
49 50
 */

J
Jens Låås 已提交
51
#define VERSION "0.409"
52 53

#include <asm/uaccess.h>
J
Jiri Slaby 已提交
54
#include <linux/bitops.h>
55 56 57 58 59 60 61 62 63
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
S
Stephen Hemminger 已提交
64
#include <linux/inetdevice.h>
65 66 67
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/proc_fs.h>
R
Robert Olsson 已提交
68
#include <linux/rcupdate.h>
69 70 71 72
#include <linux/skbuff.h>
#include <linux/netlink.h>
#include <linux/init.h>
#include <linux/list.h>
73
#include <linux/slab.h>
74
#include <linux/export.h>
75
#include <net/net_namespace.h>
76 77 78 79 80 81 82 83
#include <net/ip.h>
#include <net/protocol.h>
#include <net/route.h>
#include <net/tcp.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include "fib_lookup.h"

R
Robert Olsson 已提交
84
#define MAX_STAT_DEPTH 32
85 86 87 88 89

#define KEYLENGTH (8*sizeof(t_key))

typedef unsigned int t_key;

90 91
#define IS_TNODE(n) ((n)->bits)
#define IS_LEAF(n) (!(n)->bits)
R
Robert Olsson 已提交
92

93
#define get_index(_key, _kv) (((_key) ^ (_kv)->key) >> (_kv)->pos)
94

95 96 97 98 99
struct tnode {
	t_key key;
	unsigned char bits;		/* 2log(KEYLENGTH) bits needed */
	unsigned char pos;		/* 2log(KEYLENGTH) bits needed */
	struct tnode __rcu *parent;
100
	struct rcu_head rcu;
A
Alexander Duyck 已提交
101 102 103 104 105 106 107 108 109 110
	union {
		/* The fields in this struct are valid if bits > 0 (TNODE) */
		struct {
			unsigned int full_children;  /* KEYLENGTH bits needed */
			unsigned int empty_children; /* KEYLENGTH bits needed */
			struct tnode __rcu *child[0];
		};
		/* This list pointer if valid if bits == 0 (LEAF) */
		struct hlist_head list;
	};
111 112 113 114 115
};

struct leaf_info {
	struct hlist_node hlist;
	int plen;
116
	u32 mask_plen; /* ntohl(inet_make_mask(plen)) */
117
	struct list_head falh;
118
	struct rcu_head rcu;
119 120 121 122 123 124 125 126 127
};

#ifdef CONFIG_IP_FIB_TRIE_STATS
struct trie_use_stats {
	unsigned int gets;
	unsigned int backtrack;
	unsigned int semantic_match_passed;
	unsigned int semantic_match_miss;
	unsigned int null_node_hit;
128
	unsigned int resize_node_skipped;
129 130 131 132 133 134 135 136 137
};
#endif

struct trie_stat {
	unsigned int totdepth;
	unsigned int maxdepth;
	unsigned int tnodes;
	unsigned int leaves;
	unsigned int nullpointers;
138
	unsigned int prefixes;
R
Robert Olsson 已提交
139
	unsigned int nodesizes[MAX_STAT_DEPTH];
140
};
141 142

struct trie {
A
Alexander Duyck 已提交
143
	struct tnode __rcu *trie;
144
#ifdef CONFIG_IP_FIB_TRIE_STATS
145
	struct trie_use_stats __percpu *stats;
146 147 148
#endif
};

A
Alexander Duyck 已提交
149
static void tnode_put_child_reorg(struct tnode *tn, int i, struct tnode *n,
150
				  int wasfull);
A
Alexander Duyck 已提交
151
static struct tnode *resize(struct trie *t, struct tnode *tn);
152 153
static struct tnode *inflate(struct trie *t, struct tnode *tn);
static struct tnode *halve(struct trie *t, struct tnode *tn);
J
Jarek Poplawski 已提交
154
/* tnodes to free after resize(); protected by RTNL */
155
static struct callback_head *tnode_free_head;
156 157 158 159 160 161 162 163
static size_t tnode_free_size;

/*
 * synchronize_rcu after call_rcu for that many pages; it should be especially
 * useful before resizing the root node with PREEMPT_NONE configs; the value was
 * obtained experimentally, aiming to avoid visible slowdown.
 */
static const int sync_pages = 128;
164

165
static struct kmem_cache *fn_alias_kmem __read_mostly;
166
static struct kmem_cache *trie_leaf_kmem __read_mostly;
167

168 169
/* caller must hold RTNL */
#define node_parent(n) rtnl_dereference((n)->parent)
E
Eric Dumazet 已提交
170

171 172
/* caller must hold RCU read lock or RTNL */
#define node_parent_rcu(n) rcu_dereference_rtnl((n)->parent)
E
Eric Dumazet 已提交
173

174
/* wrapper for rcu_assign_pointer */
A
Alexander Duyck 已提交
175
static inline void node_set_parent(struct tnode *n, struct tnode *tp)
176
{
A
Alexander Duyck 已提交
177 178
	if (n)
		rcu_assign_pointer(n->parent, tp);
S
Stephen Hemminger 已提交
179 180
}

181 182 183 184
#define NODE_INIT_PARENT(n, p) RCU_INIT_POINTER((n)->parent, p)

/* This provides us with the number of children in this node, in the case of a
 * leaf this will return 0 meaning none of the children are accessible.
185
 */
186
static inline int tnode_child_length(const struct tnode *tn)
S
Stephen Hemminger 已提交
187
{
188
	return (1ul << tn->bits) & ~(1ul);
S
Stephen Hemminger 已提交
189
}
R
Robert Olsson 已提交
190

E
Eric Dumazet 已提交
191 192 193
/*
 * caller must hold RTNL
 */
A
Alexander Duyck 已提交
194
static inline struct tnode *tnode_get_child(const struct tnode *tn, unsigned int i)
195
{
196
	BUG_ON(i >= tnode_child_length(tn));
R
Robert Olsson 已提交
197

E
Eric Dumazet 已提交
198
	return rtnl_dereference(tn->child[i]);
199 200
}

E
Eric Dumazet 已提交
201 202 203
/*
 * caller must hold RCU read lock or RTNL
 */
A
Alexander Duyck 已提交
204
static inline struct tnode *tnode_get_child_rcu(const struct tnode *tn, unsigned int i)
205
{
206
	BUG_ON(i >= tnode_child_length(tn));
207

E
Eric Dumazet 已提交
208
	return rcu_dereference_rtnl(tn->child[i]);
209 210
}

211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268
/* To understand this stuff, an understanding of keys and all their bits is
 * necessary. Every node in the trie has a key associated with it, but not
 * all of the bits in that key are significant.
 *
 * Consider a node 'n' and its parent 'tp'.
 *
 * If n is a leaf, every bit in its key is significant. Its presence is
 * necessitated by path compression, since during a tree traversal (when
 * searching for a leaf - unless we are doing an insertion) we will completely
 * ignore all skipped bits we encounter. Thus we need to verify, at the end of
 * a potentially successful search, that we have indeed been walking the
 * correct key path.
 *
 * Note that we can never "miss" the correct key in the tree if present by
 * following the wrong path. Path compression ensures that segments of the key
 * that are the same for all keys with a given prefix are skipped, but the
 * skipped part *is* identical for each node in the subtrie below the skipped
 * bit! trie_insert() in this implementation takes care of that.
 *
 * if n is an internal node - a 'tnode' here, the various parts of its key
 * have many different meanings.
 *
 * Example:
 * _________________________________________________________________
 * | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
 * -----------------------------------------------------------------
 *  31  30  29  28  27  26  25  24  23  22  21  20  19  18  17  16
 *
 * _________________________________________________________________
 * | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
 * -----------------------------------------------------------------
 *  15  14  13  12  11  10   9   8   7   6   5   4   3   2   1   0
 *
 * tp->pos = 22
 * tp->bits = 3
 * n->pos = 13
 * n->bits = 4
 *
 * First, let's just ignore the bits that come before the parent tp, that is
 * the bits from (tp->pos + tp->bits) to 31. They are *known* but at this
 * point we do not use them for anything.
 *
 * The bits from (tp->pos) to (tp->pos + tp->bits - 1) - "N", above - are the
 * index into the parent's child array. That is, they will be used to find
 * 'n' among tp's children.
 *
 * The bits from (n->pos + n->bits) to (tn->pos - 1) - "S" - are skipped bits
 * for the node n.
 *
 * All the bits we have seen so far are significant to the node n. The rest
 * of the bits are really not needed or indeed known in n->key.
 *
 * The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into
 * n's child array, and will of course be different for each child.
 *
 * The rest of the bits, from 0 to (n->pos + n->bits), are completely unknown
 * at this point.
 */
269

270 271
static const int halve_threshold = 25;
static const int inflate_threshold = 50;
272
static const int halve_threshold_root = 15;
J
Jens Låås 已提交
273
static const int inflate_threshold_root = 30;
R
Robert Olsson 已提交
274 275

static void __alias_free_mem(struct rcu_head *head)
276
{
R
Robert Olsson 已提交
277 278
	struct fib_alias *fa = container_of(head, struct fib_alias, rcu);
	kmem_cache_free(fn_alias_kmem, fa);
279 280
}

R
Robert Olsson 已提交
281
static inline void alias_free_mem_rcu(struct fib_alias *fa)
282
{
R
Robert Olsson 已提交
283 284
	call_rcu(&fa->rcu, __alias_free_mem);
}
O
Olof Johansson 已提交
285

286
#define TNODE_KMALLOC_MAX \
A
Alexander Duyck 已提交
287
	ilog2((PAGE_SIZE - sizeof(struct tnode)) / sizeof(struct tnode *))
O
Olof Johansson 已提交
288

289
static void __node_free_rcu(struct rcu_head *head)
290
{
A
Alexander Duyck 已提交
291
	struct tnode *n = container_of(head, struct tnode, rcu);
292 293 294 295 296 297 298

	if (IS_LEAF(n))
		kmem_cache_free(trie_leaf_kmem, n);
	else if (n->bits <= TNODE_KMALLOC_MAX)
		kfree(n);
	else
		vfree(n);
299 300
}

301 302
#define node_free(n) call_rcu(&n->rcu, __node_free_rcu)

R
Robert Olsson 已提交
303
static inline void free_leaf_info(struct leaf_info *leaf)
304
{
305
	kfree_rcu(leaf, rcu);
306 307
}

308
static struct tnode *tnode_alloc(size_t size)
309
{
R
Robert Olsson 已提交
310
	if (size <= PAGE_SIZE)
311
		return kzalloc(size, GFP_KERNEL);
312
	else
313
		return vzalloc(size);
314
}
R
Robert Olsson 已提交
315

J
Jarek Poplawski 已提交
316 317 318
static void tnode_free_safe(struct tnode *tn)
{
	BUG_ON(IS_LEAF(tn));
319 320
	tn->rcu.next = tnode_free_head;
	tnode_free_head = &tn->rcu;
J
Jarek Poplawski 已提交
321 322 323 324
}

static void tnode_free_flush(void)
{
325 326 327 328 329 330 331
	struct callback_head *head;

	while ((head = tnode_free_head)) {
		struct tnode *tn = container_of(head, struct tnode, rcu);

		tnode_free_head = head->next;
		tnode_free_size += offsetof(struct tnode, child[1 << tn->bits]);
J
Jarek Poplawski 已提交
332

333
		node_free(tn);
J
Jarek Poplawski 已提交
334
	}
335 336 337 338 339

	if (tnode_free_size >= PAGE_SIZE * sync_pages) {
		tnode_free_size = 0;
		synchronize_rcu();
	}
J
Jarek Poplawski 已提交
340 341
}

A
Alexander Duyck 已提交
342
static struct tnode *leaf_new(t_key key)
R
Robert Olsson 已提交
343
{
A
Alexander Duyck 已提交
344
	struct tnode *l = kmem_cache_alloc(trie_leaf_kmem, GFP_KERNEL);
R
Robert Olsson 已提交
345
	if (l) {
346 347 348 349 350 351
		l->parent = NULL;
		/* set key and pos to reflect full key value
		 * any trailing zeros in the key should be ignored
		 * as the nodes are searched
		 */
		l->key = key;
352
		l->pos = 0;
353 354 355
		/* set bits to 0 indicating we are not a tnode */
		l->bits = 0;

R
Robert Olsson 已提交
356 357 358 359 360 361 362 363 364 365
		INIT_HLIST_HEAD(&l->list);
	}
	return l;
}

static struct leaf_info *leaf_info_new(int plen)
{
	struct leaf_info *li = kmalloc(sizeof(struct leaf_info),  GFP_KERNEL);
	if (li) {
		li->plen = plen;
366
		li->mask_plen = ntohl(inet_make_mask(plen));
R
Robert Olsson 已提交
367 368 369 370 371
		INIT_LIST_HEAD(&li->falh);
	}
	return li;
}

372
static struct tnode *tnode_new(t_key key, int pos, int bits)
373
{
374
	size_t sz = offsetof(struct tnode, child[1 << bits]);
375
	struct tnode *tn = tnode_alloc(sz);
376 377 378 379
	unsigned int shift = pos + bits;

	/* verify bits and pos their msb bits clear and values are valid */
	BUG_ON(!bits || (shift > KEYLENGTH));
380

O
Olof Johansson 已提交
381
	if (tn) {
382
		tn->parent = NULL;
383 384
		tn->pos = pos;
		tn->bits = bits;
385
		tn->key = (shift < KEYLENGTH) ? (key >> shift) << shift : 0;
386 387 388
		tn->full_children = 0;
		tn->empty_children = 1<<bits;
	}
389

E
Eric Dumazet 已提交
390
	pr_debug("AT %p s=%zu %zu\n", tn, sizeof(struct tnode),
A
Alexander Duyck 已提交
391
		 sizeof(struct tnode *) << bits);
392 393 394
	return tn;
}

395
/* Check whether a tnode 'n' is "full", i.e. it is an internal node
396 397
 * and no bits are skipped. See discussion in dyntree paper p. 6
 */
A
Alexander Duyck 已提交
398
static inline int tnode_full(const struct tnode *tn, const struct tnode *n)
399
{
400
	return n && ((n->pos + n->bits) == tn->pos) && IS_TNODE(n);
401 402
}

L
Lin Ming 已提交
403
static inline void put_child(struct tnode *tn, int i,
A
Alexander Duyck 已提交
404
			     struct tnode *n)
405 406 407 408
{
	tnode_put_child_reorg(tn, i, n, -1);
}

409
 /*
410 411 412 413
  * Add a child at position i overwriting the old value.
  * Update the value of full_children and empty_children.
  */

A
Alexander Duyck 已提交
414
static void tnode_put_child_reorg(struct tnode *tn, int i, struct tnode *n,
415
				  int wasfull)
416
{
A
Alexander Duyck 已提交
417
	struct tnode *chi = rtnl_dereference(tn->child[i]);
418 419
	int isfull;

S
Stephen Hemminger 已提交
420 421
	BUG_ON(i >= 1<<tn->bits);

422 423 424 425 426
	/* update emptyChildren */
	if (n == NULL && chi != NULL)
		tn->empty_children++;
	else if (n != NULL && chi == NULL)
		tn->empty_children--;
427

428
	/* update fullChildren */
O
Olof Johansson 已提交
429
	if (wasfull == -1)
430 431 432
		wasfull = tnode_full(tn, chi);

	isfull = tnode_full(tn, n);
433
	if (wasfull && !isfull)
434
		tn->full_children--;
435
	else if (!wasfull && isfull)
436
		tn->full_children++;
O
Olof Johansson 已提交
437

438
	node_set_parent(n, tn);
439

440
	rcu_assign_pointer(tn->child[i], n);
441 442
}

443 444 445 446 447 448 449 450 451
static void put_child_root(struct tnode *tp, struct trie *t,
			   t_key key, struct tnode *n)
{
	if (tp)
		put_child(tp, get_index(key, tp), n);
	else
		rcu_assign_pointer(t->trie, n);
}

J
Jens Låås 已提交
452
#define MAX_WORK 10
A
Alexander Duyck 已提交
453
static struct tnode *resize(struct trie *t, struct tnode *tn)
454
{
A
Alexander Duyck 已提交
455
	struct tnode *old_tn, *n = NULL;
456 457
	int inflate_threshold_use;
	int halve_threshold_use;
J
Jens Låås 已提交
458
	int max_work;
459

460
	if (!tn)
461 462
		return NULL;

S
Stephen Hemminger 已提交
463 464
	pr_debug("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
		 tn, inflate_threshold, halve_threshold);
465 466

	/* No children */
467 468 469
	if (tn->empty_children > (tnode_child_length(tn) - 1))
		goto no_children;

470
	/* One child */
471
	if (tn->empty_children == (tnode_child_length(tn) - 1))
J
Jens Låås 已提交
472
		goto one_child;
473
	/*
474 475 476 477 478
	 * Double as long as the resulting node has a number of
	 * nonempty nodes that are above the threshold.
	 */

	/*
479 480
	 * From "Implementing a dynamic compressed trie" by Stefan Nilsson of
	 * the Helsinki University of Technology and Matti Tikkanen of Nokia
481
	 * Telecommunications, page 6:
482
	 * "A node is doubled if the ratio of non-empty children to all
483 484
	 * children in the *doubled* node is at least 'high'."
	 *
485 486 487 488 489
	 * 'high' in this instance is the variable 'inflate_threshold'. It
	 * is expressed as a percentage, so we multiply it with
	 * tnode_child_length() and instead of multiplying by 2 (since the
	 * child array will be doubled by inflate()) and multiplying
	 * the left-hand side by 100 (to handle the percentage thing) we
490
	 * multiply the left-hand side by 50.
491 492 493 494
	 *
	 * The left-hand side may look a bit weird: tnode_child_length(tn)
	 * - tn->empty_children is of course the number of non-null children
	 * in the current node. tn->full_children is the number of "full"
495
	 * children, that is non-null tnodes with a skip value of 0.
496
	 * All of those will be doubled in the resulting inflated tnode, so
497
	 * we just count them one extra time here.
498
	 *
499
	 * A clearer way to write this would be:
500
	 *
501
	 * to_be_doubled = tn->full_children;
502
	 * not_to_be_doubled = tnode_child_length(tn) - tn->empty_children -
503 504 505 506
	 *     tn->full_children;
	 *
	 * new_child_length = tnode_child_length(tn) * 2;
	 *
507
	 * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) /
508 509
	 *      new_child_length;
	 * if (new_fill_factor >= inflate_threshold)
510 511 512
	 *
	 * ...and so on, tho it would mess up the while () loop.
	 *
513 514 515
	 * anyway,
	 * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >=
	 *      inflate_threshold
516
	 *
517 518 519
	 * avoid a division:
	 * 100 * (not_to_be_doubled + 2*to_be_doubled) >=
	 *      inflate_threshold * new_child_length
520
	 *
521
	 * expand not_to_be_doubled and to_be_doubled, and shorten:
522
	 * 100 * (tnode_child_length(tn) - tn->empty_children +
O
Olof Johansson 已提交
523
	 *    tn->full_children) >= inflate_threshold * new_child_length
524
	 *
525
	 * expand new_child_length:
526
	 * 100 * (tnode_child_length(tn) - tn->empty_children +
O
Olof Johansson 已提交
527
	 *    tn->full_children) >=
528
	 *      inflate_threshold * tnode_child_length(tn) * 2
529
	 *
530
	 * shorten again:
531
	 * 50 * (tn->full_children + tnode_child_length(tn) -
O
Olof Johansson 已提交
532
	 *    tn->empty_children) >= inflate_threshold *
533
	 *    tnode_child_length(tn)
534
	 *
535 536
	 */

537 538
	/* Keep root node larger  */

539
	if (!node_parent(tn)) {
J
Jens Låås 已提交
540 541
		inflate_threshold_use = inflate_threshold_root;
		halve_threshold_use = halve_threshold_root;
E
Eric Dumazet 已提交
542
	} else {
543
		inflate_threshold_use = inflate_threshold;
J
Jens Låås 已提交
544 545
		halve_threshold_use = halve_threshold;
	}
546

J
Jens Låås 已提交
547 548
	max_work = MAX_WORK;
	while ((tn->full_children > 0 &&  max_work-- &&
549 550 551
		50 * (tn->full_children + tnode_child_length(tn)
		      - tn->empty_children)
		>= inflate_threshold_use * tnode_child_length(tn))) {
552

553 554
		old_tn = tn;
		tn = inflate(t, tn);
555

556 557
		if (IS_ERR(tn)) {
			tn = old_tn;
558
#ifdef CONFIG_IP_FIB_TRIE_STATS
559
			this_cpu_inc(t->stats->resize_node_skipped);
560 561 562
#endif
			break;
		}
563 564
	}

J
Jens Låås 已提交
565
	/* Return if at least one inflate is run */
E
Eric Dumazet 已提交
566
	if (max_work != MAX_WORK)
A
Alexander Duyck 已提交
567
		return tn;
J
Jens Låås 已提交
568

569 570 571 572
	/*
	 * Halve as long as the number of empty children in this
	 * node is above threshold.
	 */
573

J
Jens Låås 已提交
574 575
	max_work = MAX_WORK;
	while (tn->bits > 1 &&  max_work-- &&
576
	       100 * (tnode_child_length(tn) - tn->empty_children) <
577
	       halve_threshold_use * tnode_child_length(tn)) {
578

579 580 581 582
		old_tn = tn;
		tn = halve(t, tn);
		if (IS_ERR(tn)) {
			tn = old_tn;
583
#ifdef CONFIG_IP_FIB_TRIE_STATS
584
			this_cpu_inc(t->stats->resize_node_skipped);
585 586 587 588
#endif
			break;
		}
	}
589

590

591
	/* Only one child remains */
592 593
	if (tn->empty_children == (tnode_child_length(tn) - 1)) {
		unsigned long i;
J
Jens Låås 已提交
594
one_child:
595 596 597 598 599 600 601
		for (i = tnode_child_length(tn); !n && i;)
			n = tnode_get_child(tn, --i);
no_children:
		/* compress one level */
		node_set_parent(n, NULL);
		tnode_free_safe(tn);
		return n;
J
Jens Låås 已提交
602
	}
A
Alexander Duyck 已提交
603
	return tn;
604 605
}

E
Eric Dumazet 已提交
606 607 608

static void tnode_clean_free(struct tnode *tn)
{
A
Alexander Duyck 已提交
609
	struct tnode *tofree;
E
Eric Dumazet 已提交
610 611 612
	int i;

	for (i = 0; i < tnode_child_length(tn); i++) {
613
		tofree = rtnl_dereference(tn->child[i]);
E
Eric Dumazet 已提交
614
		if (tofree)
615
			node_free(tofree);
E
Eric Dumazet 已提交
616
	}
617
	node_free(tn);
E
Eric Dumazet 已提交
618 619
}

A
Alexander Duyck 已提交
620
static struct tnode *inflate(struct trie *t, struct tnode *oldtnode)
621
{
A
Alexander Duyck 已提交
622 623
	int olen = tnode_child_length(oldtnode);
	struct tnode *tn;
624
	t_key m;
625 626
	int i;

S
Stephen Hemminger 已提交
627
	pr_debug("In inflate\n");
628

629
	tn = tnode_new(oldtnode->key, oldtnode->pos - 1, oldtnode->bits + 1);
630

S
Stephen Hemminger 已提交
631
	if (!tn)
632
		return ERR_PTR(-ENOMEM);
633 634

	/*
635 636 637
	 * Preallocate and store tnodes before the actual work so we
	 * don't get into an inconsistent state if memory allocation
	 * fails. In case of failure we return the oldnode and  inflate
638 639
	 * of tnode is ignored.
	 */
640 641
	for (i = 0, m = 1u << tn->pos; i < olen; i++) {
		struct tnode *inode = tnode_get_child(oldtnode, i);
O
Olof Johansson 已提交
642

643
		if (tnode_full(oldtnode, inode) && (inode->bits > 1)) {
644
			struct tnode *left, *right;
645

646
			left = tnode_new(inode->key & ~m, inode->pos,
647
					 inode->bits - 1);
648 649
			if (!left)
				goto nomem;
O
Olof Johansson 已提交
650

651
			right = tnode_new(inode->key | m, inode->pos,
652 653
					  inode->bits - 1);

654
			if (!right) {
655
				node_free(left);
656
				goto nomem;
657
			}
658

A
Alexander Duyck 已提交
659 660
			put_child(tn, 2*i, left);
			put_child(tn, 2*i+1, right);
661 662 663
		}
	}

O
Olof Johansson 已提交
664
	for (i = 0; i < olen; i++) {
A
Alexander Duyck 已提交
665
		struct tnode *inode = tnode_get_child(oldtnode, i);
O
Olof Johansson 已提交
666 667
		struct tnode *left, *right;
		int size, j;
668

669
		/* An empty child */
A
Alexander Duyck 已提交
670
		if (inode == NULL)
671 672 673
			continue;

		/* A leaf or an internal node with skipped bits */
A
Alexander Duyck 已提交
674
		if (!tnode_full(oldtnode, inode)) {
675
			put_child(tn, get_index(inode->key, tn), inode);
676 677 678 679 680
			continue;
		}

		/* An internal node with two children */
		if (inode->bits == 1) {
L
Lin Ming 已提交
681 682
			put_child(tn, 2*i, rtnl_dereference(inode->child[0]));
			put_child(tn, 2*i+1, rtnl_dereference(inode->child[1]));
683

J
Jarek Poplawski 已提交
684
			tnode_free_safe(inode);
O
Olof Johansson 已提交
685
			continue;
686 687
		}

O
Olof Johansson 已提交
688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705
		/* An internal node with more than two children */

		/* We will replace this node 'inode' with two new
		 * ones, 'left' and 'right', each with half of the
		 * original children. The two new nodes will have
		 * a position one bit further down the key and this
		 * means that the "significant" part of their keys
		 * (see the discussion near the top of this file)
		 * will differ by one bit, which will be "0" in
		 * left's key and "1" in right's key. Since we are
		 * moving the key position by one step, the bit that
		 * we are moving away from - the bit at position
		 * (inode->pos) - is the one that will differ between
		 * left and right. So... we synthesize that bit in the
		 * two  new keys.
		 * The mask 'm' below will be a single "one" bit at
		 * the position (inode->pos)
		 */
706

O
Olof Johansson 已提交
707 708 709
		/* Use the old key, but set the new significant
		 *   bit to zero.
		 */
710

A
Alexander Duyck 已提交
711
		left = tnode_get_child(tn, 2*i);
L
Lin Ming 已提交
712
		put_child(tn, 2*i, NULL);
713

O
Olof Johansson 已提交
714
		BUG_ON(!left);
715

A
Alexander Duyck 已提交
716
		right = tnode_get_child(tn, 2*i+1);
L
Lin Ming 已提交
717
		put_child(tn, 2*i+1, NULL);
718

O
Olof Johansson 已提交
719
		BUG_ON(!right);
720

O
Olof Johansson 已提交
721 722
		size = tnode_child_length(left);
		for (j = 0; j < size; j++) {
L
Lin Ming 已提交
723 724
			put_child(left, j, rtnl_dereference(inode->child[j]));
			put_child(right, j, rtnl_dereference(inode->child[j + size]));
725
		}
L
Lin Ming 已提交
726 727
		put_child(tn, 2*i, resize(t, left));
		put_child(tn, 2*i+1, resize(t, right));
O
Olof Johansson 已提交
728

J
Jarek Poplawski 已提交
729
		tnode_free_safe(inode);
730
	}
J
Jarek Poplawski 已提交
731
	tnode_free_safe(oldtnode);
732
	return tn;
733
nomem:
E
Eric Dumazet 已提交
734 735
	tnode_clean_free(tn);
	return ERR_PTR(-ENOMEM);
736 737
}

A
Alexander Duyck 已提交
738
static struct tnode *halve(struct trie *t, struct tnode *oldtnode)
739
{
A
Alexander Duyck 已提交
740 741
	int olen = tnode_child_length(oldtnode);
	struct tnode *tn, *left, *right;
742 743
	int i;

S
Stephen Hemminger 已提交
744
	pr_debug("In halve\n");
745

746
	tn = tnode_new(oldtnode->key, oldtnode->pos + 1, oldtnode->bits - 1);
747

748 749
	if (!tn)
		return ERR_PTR(-ENOMEM);
750 751

	/*
752 753 754
	 * Preallocate and store tnodes before the actual work so we
	 * don't get into an inconsistent state if memory allocation
	 * fails. In case of failure we return the oldnode and halve
755 756 757
	 * of tnode is ignored.
	 */

O
Olof Johansson 已提交
758
	for (i = 0; i < olen; i += 2) {
759 760
		left = tnode_get_child(oldtnode, i);
		right = tnode_get_child(oldtnode, i+1);
761

762
		/* Two nonempty children */
S
Stephen Hemminger 已提交
763
		if (left && right) {
764
			struct tnode *newn;
S
Stephen Hemminger 已提交
765

766
			newn = tnode_new(left->key, oldtnode->pos, 1);
S
Stephen Hemminger 已提交
767 768

			if (!newn)
769
				goto nomem;
S
Stephen Hemminger 已提交
770

A
Alexander Duyck 已提交
771
			put_child(tn, i/2, newn);
772 773 774
		}

	}
775

O
Olof Johansson 已提交
776 777 778
	for (i = 0; i < olen; i += 2) {
		struct tnode *newBinNode;

779 780
		left = tnode_get_child(oldtnode, i);
		right = tnode_get_child(oldtnode, i+1);
781

782 783 784 785
		/* At least one of the children is empty */
		if (left == NULL) {
			if (right == NULL)    /* Both are empty */
				continue;
L
Lin Ming 已提交
786
			put_child(tn, i/2, right);
O
Olof Johansson 已提交
787
			continue;
S
Stephen Hemminger 已提交
788
		}
O
Olof Johansson 已提交
789 790

		if (right == NULL) {
L
Lin Ming 已提交
791
			put_child(tn, i/2, left);
O
Olof Johansson 已提交
792 793
			continue;
		}
794

795
		/* Two nonempty children */
A
Alexander Duyck 已提交
796
		newBinNode = tnode_get_child(tn, i/2);
L
Lin Ming 已提交
797 798 799 800
		put_child(tn, i/2, NULL);
		put_child(newBinNode, 0, left);
		put_child(newBinNode, 1, right);
		put_child(tn, i/2, resize(t, newBinNode));
801
	}
J
Jarek Poplawski 已提交
802
	tnode_free_safe(oldtnode);
803
	return tn;
804
nomem:
E
Eric Dumazet 已提交
805 806
	tnode_clean_free(tn);
	return ERR_PTR(-ENOMEM);
807 808
}

R
Robert Olsson 已提交
809
/* readside must use rcu_read_lock currently dump routines
R
Robert Olsson 已提交
810 811
 via get_fa_head and dump */

A
Alexander Duyck 已提交
812
static struct leaf_info *find_leaf_info(struct tnode *l, int plen)
813
{
R
Robert Olsson 已提交
814
	struct hlist_head *head = &l->list;
815 816
	struct leaf_info *li;

817
	hlist_for_each_entry_rcu(li, head, hlist)
818
		if (li->plen == plen)
819
			return li;
O
Olof Johansson 已提交
820

821 822 823
	return NULL;
}

A
Alexander Duyck 已提交
824
static inline struct list_head *get_fa_head(struct tnode *l, int plen)
825
{
R
Robert Olsson 已提交
826
	struct leaf_info *li = find_leaf_info(l, plen);
827

O
Olof Johansson 已提交
828 829
	if (!li)
		return NULL;
830

O
Olof Johansson 已提交
831
	return &li->falh;
832 833 834 835
}

static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new)
{
836 837 838 839 840
	struct leaf_info *li = NULL, *last = NULL;

	if (hlist_empty(head)) {
		hlist_add_head_rcu(&new->hlist, head);
	} else {
841
		hlist_for_each_entry(li, head, hlist) {
842 843 844 845 846 847
			if (new->plen > li->plen)
				break;

			last = li;
		}
		if (last)
848
			hlist_add_behind_rcu(&new->hlist, &last->hlist);
849 850 851
		else
			hlist_add_before_rcu(&new->hlist, &li->hlist);
	}
852 853
}

R
Robert Olsson 已提交
854
/* rcu_read_lock needs to be hold by caller from readside */
A
Alexander Duyck 已提交
855
static struct tnode *fib_find_node(struct trie *t, u32 key)
856
{
A
Alexander Duyck 已提交
857
	struct tnode *n = rcu_dereference_rtnl(t->trie);
A
Alexander Duyck 已提交
858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876

	while (n) {
		unsigned long index = get_index(key, n);

		/* This bit of code is a bit tricky but it combines multiple
		 * checks into a single check.  The prefix consists of the
		 * prefix plus zeros for the bits in the cindex. The index
		 * is the difference between the key and this value.  From
		 * this we can actually derive several pieces of data.
		 *   if !(index >> bits)
		 *     we know the value is cindex
		 *   else
		 *     we have a mismatch in skip bits and failed
		 */
		if (index >> n->bits)
			return NULL;

		/* we have found a leaf. Prefixes have already been compared */
		if (IS_LEAF(n))
877 878
			break;

A
Alexander Duyck 已提交
879 880
		n = rcu_dereference_rtnl(n->child[index]);
	}
O
Olof Johansson 已提交
881

A
Alexander Duyck 已提交
882
	return n;
883 884
}

885
static void trie_rebalance(struct trie *t, struct tnode *tn)
886 887
{
	int wasfull;
R
Robert Olsson 已提交
888
	t_key cindex, key;
S
Stephen Hemminger 已提交
889
	struct tnode *tp;
890

R
Robert Olsson 已提交
891 892
	key = tn->key;

893
	while (tn != NULL && (tp = node_parent(tn)) != NULL) {
894
		cindex = get_index(key, tp);
895
		wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
A
Alexander Duyck 已提交
896
		tn = resize(t, tn);
897

A
Alexander Duyck 已提交
898
		tnode_put_child_reorg(tp, cindex, tn, wasfull);
O
Olof Johansson 已提交
899

900
		tp = node_parent(tn);
901
		if (!tp)
A
Alexander Duyck 已提交
902
			rcu_assign_pointer(t->trie, tn);
903

J
Jarek Poplawski 已提交
904
		tnode_free_flush();
S
Stephen Hemminger 已提交
905
		if (!tp)
906
			break;
S
Stephen Hemminger 已提交
907
		tn = tp;
908
	}
S
Stephen Hemminger 已提交
909

910
	/* Handle last (top) tnode */
911
	if (IS_TNODE(tn))
A
Alexander Duyck 已提交
912
		tn = resize(t, tn);
913

A
Alexander Duyck 已提交
914
	rcu_assign_pointer(t->trie, tn);
915
	tnode_free_flush();
916 917
}

R
Robert Olsson 已提交
918 919
/* only used from updater-side */

920
static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
921
{
922
	struct list_head *fa_head = NULL;
923
	struct tnode *l, *n, *tp = NULL;
924 925
	struct leaf_info *li;

926 927 928 929 930
	li = leaf_info_new(plen);
	if (!li)
		return NULL;
	fa_head = &li->falh;

E
Eric Dumazet 已提交
931
	n = rtnl_dereference(t->trie);
932

933 934
	/* If we point to NULL, stop. Either the tree is empty and we should
	 * just put a new leaf in if, or we have reached an empty child slot,
935 936
	 * and we should just put our new leaf in that.
	 *
937 938 939
	 * If we hit a node with a key that does't match then we should stop
	 * and create a new tnode to replace that node and insert ourselves
	 * and the other node into the new tnode.
940
	 */
941 942
	while (n) {
		unsigned long index = get_index(key, n);
943

944 945 946 947 948 949 950 951 952 953 954
		/* This bit of code is a bit tricky but it combines multiple
		 * checks into a single check.  The prefix consists of the
		 * prefix plus zeros for the "bits" in the prefix. The index
		 * is the difference between the key and this value.  From
		 * this we can actually derive several pieces of data.
		 *   if !(index >> bits)
		 *     we know the value is child index
		 *   else
		 *     we have a mismatch in skip bits and failed
		 */
		if (index >> n->bits)
955 956
			break;

957 958 959 960 961 962
		/* we have found a leaf. Prefixes have already been compared */
		if (IS_LEAF(n)) {
			/* Case 1: n is a leaf, and prefixes match*/
			insert_leaf_info(&n->list, li);
			return fa_head;
		}
963

964 965
		tp = n;
		n = rcu_dereference_rtnl(n->child[index]);
966 967
	}

968 969 970
	l = leaf_new(key);
	if (!l) {
		free_leaf_info(li);
971
		return NULL;
972
	}
973 974 975

	insert_leaf_info(&l->list, li);

976 977 978 979 980 981 982 983
	/* Case 2: n is a LEAF or a TNODE and the key doesn't match.
	 *
	 *  Add a new tnode here
	 *  first tnode need some special handling
	 *  leaves us in position for handling as case 3
	 */
	if (n) {
		struct tnode *tn;
984

985
		tn = tnode_new(key, __fls(key ^ n->key), 1);
986
		if (!tn) {
987
			free_leaf_info(li);
988
			node_free(l);
989
			return NULL;
O
Olof Johansson 已提交
990 991
		}

992 993 994
		/* initialize routes out of node */
		NODE_INIT_PARENT(tn, tp);
		put_child(tn, get_index(key, tn) ^ 1, n);
995

996 997 998
		/* start adding routes into the node */
		put_child_root(tp, t, key, tn);
		node_set_parent(n, tn);
999

1000
		/* parent now has a NULL spot where the leaf can go */
1001
		tp = tn;
1002
	}
O
Olof Johansson 已提交
1003

1004 1005 1006 1007 1008 1009 1010 1011
	/* Case 3: n is NULL, and will just insert a new leaf */
	if (tp) {
		NODE_INIT_PARENT(l, tp);
		put_child(tp, get_index(key, tp), l);
		trie_rebalance(t, tp);
	} else {
		rcu_assign_pointer(t->trie, l);
	}
R
Robert Olsson 已提交
1012

1013 1014 1015
	return fa_head;
}

1016 1017 1018
/*
 * Caller must hold RTNL.
 */
1019
int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
1020 1021 1022
{
	struct trie *t = (struct trie *) tb->tb_data;
	struct fib_alias *fa, *new_fa;
1023
	struct list_head *fa_head = NULL;
1024
	struct fib_info *fi;
1025 1026
	int plen = cfg->fc_dst_len;
	u8 tos = cfg->fc_tos;
1027 1028
	u32 key, mask;
	int err;
A
Alexander Duyck 已提交
1029
	struct tnode *l;
1030 1031 1032 1033

	if (plen > 32)
		return -EINVAL;

1034
	key = ntohl(cfg->fc_dst);
1035

1036
	pr_debug("Insert table=%u %08x/%d\n", tb->tb_id, key, plen);
1037

O
Olof Johansson 已提交
1038
	mask = ntohl(inet_make_mask(plen));
1039

1040
	if (key & ~mask)
1041 1042 1043 1044
		return -EINVAL;

	key = key & mask;

1045 1046 1047
	fi = fib_create_info(cfg);
	if (IS_ERR(fi)) {
		err = PTR_ERR(fi);
1048
		goto err;
1049
	}
1050 1051

	l = fib_find_node(t, key);
1052
	fa = NULL;
1053

1054
	if (l) {
1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069
		fa_head = get_fa_head(l, plen);
		fa = fib_find_alias(fa_head, tos, fi->fib_priority);
	}

	/* Now fa, if non-NULL, points to the first fib alias
	 * with the same keys [prefix,tos,priority], if such key already
	 * exists or to the node before which we will insert new one.
	 *
	 * If fa is NULL, we will need to allocate a new one and
	 * insert to the head of f.
	 *
	 * If f is NULL, no fib node matched the destination key
	 * and we need to allocate a new one of those as well.
	 */

1070 1071 1072
	if (fa && fa->fa_tos == tos &&
	    fa->fa_info->fib_priority == fi->fib_priority) {
		struct fib_alias *fa_first, *fa_match;
1073 1074

		err = -EEXIST;
1075
		if (cfg->fc_nlflags & NLM_F_EXCL)
1076 1077
			goto out;

1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097
		/* We have 2 goals:
		 * 1. Find exact match for type, scope, fib_info to avoid
		 * duplicate routes
		 * 2. Find next 'fa' (or head), NLM_F_APPEND inserts before it
		 */
		fa_match = NULL;
		fa_first = fa;
		fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
		list_for_each_entry_continue(fa, fa_head, fa_list) {
			if (fa->fa_tos != tos)
				break;
			if (fa->fa_info->fib_priority != fi->fib_priority)
				break;
			if (fa->fa_type == cfg->fc_type &&
			    fa->fa_info == fi) {
				fa_match = fa;
				break;
			}
		}

1098
		if (cfg->fc_nlflags & NLM_F_REPLACE) {
1099 1100 1101
			struct fib_info *fi_drop;
			u8 state;

1102 1103 1104 1105
			fa = fa_first;
			if (fa_match) {
				if (fa == fa_match)
					err = 0;
1106
				goto out;
1107
			}
R
Robert Olsson 已提交
1108
			err = -ENOBUFS;
1109
			new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
R
Robert Olsson 已提交
1110 1111
			if (new_fa == NULL)
				goto out;
1112 1113

			fi_drop = fa->fa_info;
R
Robert Olsson 已提交
1114 1115
			new_fa->fa_tos = fa->fa_tos;
			new_fa->fa_info = fi;
1116
			new_fa->fa_type = cfg->fc_type;
1117
			state = fa->fa_state;
1118
			new_fa->fa_state = state & ~FA_S_ACCESSED;
1119

R
Robert Olsson 已提交
1120 1121
			list_replace_rcu(&fa->fa_list, &new_fa->fa_list);
			alias_free_mem_rcu(fa);
1122 1123 1124

			fib_release_info(fi_drop);
			if (state & FA_S_ACCESSED)
1125
				rt_cache_flush(cfg->fc_nlinfo.nl_net);
1126 1127
			rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen,
				tb->tb_id, &cfg->fc_nlinfo, NLM_F_REPLACE);
1128

O
Olof Johansson 已提交
1129
			goto succeeded;
1130 1131 1132 1133 1134
		}
		/* Error if we find a perfect match which
		 * uses the same scope, type, and nexthop
		 * information.
		 */
1135 1136
		if (fa_match)
			goto out;
1137

1138
		if (!(cfg->fc_nlflags & NLM_F_APPEND))
1139
			fa = fa_first;
1140 1141
	}
	err = -ENOENT;
1142
	if (!(cfg->fc_nlflags & NLM_F_CREATE))
1143 1144 1145
		goto out;

	err = -ENOBUFS;
1146
	new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
1147 1148 1149 1150 1151
	if (new_fa == NULL)
		goto out;

	new_fa->fa_info = fi;
	new_fa->fa_tos = tos;
1152
	new_fa->fa_type = cfg->fc_type;
1153 1154 1155 1156 1157
	new_fa->fa_state = 0;
	/*
	 * Insert new entry to the list.
	 */

1158
	if (!fa_head) {
1159 1160 1161
		fa_head = fib_insert_node(t, key, plen);
		if (unlikely(!fa_head)) {
			err = -ENOMEM;
1162
			goto out_free_new_fa;
1163
		}
1164
	}
1165

1166 1167 1168
	if (!plen)
		tb->tb_num_default++;

R
Robert Olsson 已提交
1169 1170
	list_add_tail_rcu(&new_fa->fa_list,
			  (fa ? &fa->fa_list : fa_head));
1171

1172
	rt_cache_flush(cfg->fc_nlinfo.nl_net);
1173
	rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id,
1174
		  &cfg->fc_nlinfo, 0);
1175 1176
succeeded:
	return 0;
1177 1178 1179

out_free_new_fa:
	kmem_cache_free(fn_alias_kmem, new_fa);
1180 1181
out:
	fib_release_info(fi);
O
Olof Johansson 已提交
1182
err:
1183 1184 1185
	return err;
}

R
Robert Olsson 已提交
1186
/* should be called with rcu_read_lock */
A
Alexander Duyck 已提交
1187
static int check_leaf(struct fib_table *tb, struct trie *t, struct tnode *l,
1188
		      t_key key,  const struct flowi4 *flp,
E
Eric Dumazet 已提交
1189
		      struct fib_result *res, int fib_flags)
1190 1191 1192
{
	struct leaf_info *li;
	struct hlist_head *hhead = &l->list;
1193

1194
	hlist_for_each_entry_rcu(li, hhead, hlist) {
1195
		struct fib_alias *fa;
1196

1197
		if (l->key != (key & li->mask_plen))
1198 1199
			continue;

1200 1201 1202
		list_for_each_entry_rcu(fa, &li->falh, fa_list) {
			struct fib_info *fi = fa->fa_info;
			int nhsel, err;
1203

1204
			if (fa->fa_tos && fa->fa_tos != flp->flowi4_tos)
1205
				continue;
1206 1207
			if (fi->fib_dead)
				continue;
1208
			if (fa->fa_info->fib_scope < flp->flowi4_scope)
1209 1210 1211
				continue;
			fib_alias_accessed(fa);
			err = fib_props[fa->fa_type].error;
1212
			if (unlikely(err < 0)) {
1213
#ifdef CONFIG_IP_FIB_TRIE_STATS
1214
				this_cpu_inc(t->stats->semantic_match_passed);
1215
#endif
1216
				return err;
1217 1218 1219 1220 1221 1222 1223 1224
			}
			if (fi->fib_flags & RTNH_F_DEAD)
				continue;
			for (nhsel = 0; nhsel < fi->fib_nhs; nhsel++) {
				const struct fib_nh *nh = &fi->fib_nh[nhsel];

				if (nh->nh_flags & RTNH_F_DEAD)
					continue;
1225
				if (flp->flowi4_oif && flp->flowi4_oif != nh->nh_oif)
1226 1227 1228
					continue;

#ifdef CONFIG_IP_FIB_TRIE_STATS
1229
				this_cpu_inc(t->stats->semantic_match_passed);
1230
#endif
1231
				res->prefixlen = li->plen;
1232 1233
				res->nh_sel = nhsel;
				res->type = fa->fa_type;
1234
				res->scope = fi->fib_scope;
1235 1236 1237 1238
				res->fi = fi;
				res->table = tb;
				res->fa_head = &li->falh;
				if (!(fib_flags & FIB_LOOKUP_NOREF))
1239
					atomic_inc(&fi->fib_clntref);
1240 1241 1242 1243 1244
				return 0;
			}
		}

#ifdef CONFIG_IP_FIB_TRIE_STATS
1245
		this_cpu_inc(t->stats->semantic_match_miss);
1246 1247
#endif
	}
1248

1249
	return 1;
1250 1251
}

1252 1253 1254 1255 1256 1257 1258
static inline t_key prefix_mismatch(t_key key, struct tnode *n)
{
	t_key prefix = n->key;

	return (key ^ prefix) & (prefix | -prefix);
}

1259
int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp,
E
Eric Dumazet 已提交
1260
		     struct fib_result *res, int fib_flags)
1261
{
1262
	struct trie *t = (struct trie *)tb->tb_data;
1263 1264 1265
#ifdef CONFIG_IP_FIB_TRIE_STATS
	struct trie_use_stats __percpu *stats = t->stats;
#endif
1266 1267 1268 1269
	const t_key key = ntohl(flp->daddr);
	struct tnode *n, *pn;
	t_key cindex;
	int ret = 1;
O
Olof Johansson 已提交
1270

R
Robert Olsson 已提交
1271
	rcu_read_lock();
O
Olof Johansson 已提交
1272

R
Robert Olsson 已提交
1273
	n = rcu_dereference(t->trie);
1274
	if (!n)
1275 1276 1277
		goto failed;

#ifdef CONFIG_IP_FIB_TRIE_STATS
1278
	this_cpu_inc(stats->gets);
1279 1280
#endif

A
Alexander Duyck 已提交
1281
	pn = n;
1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299
	cindex = 0;

	/* Step 1: Travel to the longest prefix match in the trie */
	for (;;) {
		unsigned long index = get_index(key, n);

		/* This bit of code is a bit tricky but it combines multiple
		 * checks into a single check.  The prefix consists of the
		 * prefix plus zeros for the "bits" in the prefix. The index
		 * is the difference between the key and this value.  From
		 * this we can actually derive several pieces of data.
		 *   if !(index >> bits)
		 *     we know the value is child index
		 *   else
		 *     we have a mismatch in skip bits and failed
		 */
		if (index >> n->bits)
			break;
1300

1301 1302
		/* we have found a leaf. Prefixes have already been compared */
		if (IS_LEAF(n))
1303
			goto found;
1304

1305 1306
		/* only record pn and cindex if we are going to be chopping
		 * bits later.  Otherwise we are just wasting cycles.
O
Olof Johansson 已提交
1307
		 */
1308 1309 1310
		if (index) {
			pn = n;
			cindex = index;
O
Olof Johansson 已提交
1311
		}
1312

1313 1314 1315 1316
		n = rcu_dereference(n->child[index]);
		if (unlikely(!n))
			goto backtrace;
	}
1317

1318 1319 1320 1321
	/* Step 2: Sort out leaves and begin backtracing for longest prefix */
	for (;;) {
		/* record the pointer where our next node pointer is stored */
		struct tnode __rcu **cptr = n->child;
1322

1323 1324 1325
		/* This test verifies that none of the bits that differ
		 * between the key and the prefix exist in the region of
		 * the lsb and higher in the prefix.
O
Olof Johansson 已提交
1326
		 */
1327 1328
		if (unlikely(prefix_mismatch(key, n)))
			goto backtrace;
O
Olof Johansson 已提交
1329

1330 1331 1332
		/* exit out and process leaf */
		if (unlikely(IS_LEAF(n)))
			break;
O
Olof Johansson 已提交
1333

1334 1335 1336
		/* Don't bother recording parent info.  Since we are in
		 * prefix match mode we will have to come back to wherever
		 * we started this traversal anyway
O
Olof Johansson 已提交
1337 1338
		 */

1339
		while ((n = rcu_dereference(*cptr)) == NULL) {
1340 1341
backtrace:
#ifdef CONFIG_IP_FIB_TRIE_STATS
1342 1343
			if (!n)
				this_cpu_inc(stats->null_node_hit);
1344
#endif
1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367
			/* If we are at cindex 0 there are no more bits for
			 * us to strip at this level so we must ascend back
			 * up one level to see if there are any more bits to
			 * be stripped there.
			 */
			while (!cindex) {
				t_key pkey = pn->key;

				pn = node_parent_rcu(pn);
				if (unlikely(!pn))
					goto failed;
#ifdef CONFIG_IP_FIB_TRIE_STATS
				this_cpu_inc(stats->backtrack);
#endif
				/* Get Child's index */
				cindex = get_index(pkey, pn);
			}

			/* strip the least significant bit from the cindex */
			cindex &= cindex - 1;

			/* grab pointer for next child node */
			cptr = &pn->child[cindex];
1368
		}
1369
	}
1370

1371
found:
1372 1373 1374 1375 1376
	/* Step 3: Process the leaf, if that fails fall back to backtracing */
	ret = check_leaf(tb, t, n, key, flp, res, fib_flags);
	if (unlikely(ret > 0))
		goto backtrace;
failed:
R
Robert Olsson 已提交
1377
	rcu_read_unlock();
1378 1379
	return ret;
}
1380
EXPORT_SYMBOL_GPL(fib_table_lookup);
1381

1382 1383 1384
/*
 * Remove the leaf and return parent.
 */
A
Alexander Duyck 已提交
1385
static void trie_leaf_remove(struct trie *t, struct tnode *l)
1386
{
1387
	struct tnode *tp = node_parent(l);
1388

1389
	pr_debug("entering trie_leaf_remove(%p)\n", l);
1390

1391
	if (tp) {
1392
		put_child(tp, get_index(l->key, tp), NULL);
1393
		trie_rebalance(t, tp);
1394
	} else {
1395
		RCU_INIT_POINTER(t->trie, NULL);
1396
	}
1397

1398
	node_free(l);
1399 1400
}

1401 1402 1403
/*
 * Caller must hold RTNL.
 */
1404
int fib_table_delete(struct fib_table *tb, struct fib_config *cfg)
1405 1406 1407
{
	struct trie *t = (struct trie *) tb->tb_data;
	u32 key, mask;
1408 1409
	int plen = cfg->fc_dst_len;
	u8 tos = cfg->fc_tos;
1410 1411
	struct fib_alias *fa, *fa_to_delete;
	struct list_head *fa_head;
A
Alexander Duyck 已提交
1412
	struct tnode *l;
O
Olof Johansson 已提交
1413 1414
	struct leaf_info *li;

1415
	if (plen > 32)
1416 1417
		return -EINVAL;

1418
	key = ntohl(cfg->fc_dst);
O
Olof Johansson 已提交
1419
	mask = ntohl(inet_make_mask(plen));
1420

1421
	if (key & ~mask)
1422 1423 1424 1425 1426
		return -EINVAL;

	key = key & mask;
	l = fib_find_node(t, key);

1427
	if (!l)
1428 1429
		return -ESRCH;

1430 1431 1432 1433 1434 1435
	li = find_leaf_info(l, plen);

	if (!li)
		return -ESRCH;

	fa_head = &li->falh;
1436 1437 1438 1439 1440
	fa = fib_find_alias(fa_head, tos, 0);

	if (!fa)
		return -ESRCH;

S
Stephen Hemminger 已提交
1441
	pr_debug("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t);
1442 1443

	fa_to_delete = NULL;
1444 1445
	fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
	list_for_each_entry_continue(fa, fa_head, fa_list) {
1446 1447 1448 1449 1450
		struct fib_info *fi = fa->fa_info;

		if (fa->fa_tos != tos)
			break;

1451 1452
		if ((!cfg->fc_type || fa->fa_type == cfg->fc_type) &&
		    (cfg->fc_scope == RT_SCOPE_NOWHERE ||
1453
		     fa->fa_info->fib_scope == cfg->fc_scope) &&
1454 1455
		    (!cfg->fc_prefsrc ||
		     fi->fib_prefsrc == cfg->fc_prefsrc) &&
1456 1457 1458
		    (!cfg->fc_protocol ||
		     fi->fib_protocol == cfg->fc_protocol) &&
		    fib_nh_match(cfg, fi) == 0) {
1459 1460 1461 1462 1463
			fa_to_delete = fa;
			break;
		}
	}

O
Olof Johansson 已提交
1464 1465
	if (!fa_to_delete)
		return -ESRCH;
1466

O
Olof Johansson 已提交
1467
	fa = fa_to_delete;
1468
	rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id,
1469
		  &cfg->fc_nlinfo, 0);
O
Olof Johansson 已提交
1470

R
Robert Olsson 已提交
1471
	list_del_rcu(&fa->fa_list);
1472

1473 1474 1475
	if (!plen)
		tb->tb_num_default--;

O
Olof Johansson 已提交
1476
	if (list_empty(fa_head)) {
R
Robert Olsson 已提交
1477
		hlist_del_rcu(&li->hlist);
O
Olof Johansson 已提交
1478
		free_leaf_info(li);
R
Robert Olsson 已提交
1479
	}
1480

O
Olof Johansson 已提交
1481
	if (hlist_empty(&l->list))
1482
		trie_leaf_remove(t, l);
1483

O
Olof Johansson 已提交
1484
	if (fa->fa_state & FA_S_ACCESSED)
1485
		rt_cache_flush(cfg->fc_nlinfo.nl_net);
1486

R
Robert Olsson 已提交
1487 1488
	fib_release_info(fa->fa_info);
	alias_free_mem_rcu(fa);
O
Olof Johansson 已提交
1489
	return 0;
1490 1491
}

1492
static int trie_flush_list(struct list_head *head)
1493 1494 1495 1496 1497 1498 1499
{
	struct fib_alias *fa, *fa_node;
	int found = 0;

	list_for_each_entry_safe(fa, fa_node, head, fa_list) {
		struct fib_info *fi = fa->fa_info;

R
Robert Olsson 已提交
1500 1501 1502 1503
		if (fi && (fi->fib_flags & RTNH_F_DEAD)) {
			list_del_rcu(&fa->fa_list);
			fib_release_info(fa->fa_info);
			alias_free_mem_rcu(fa);
1504 1505 1506 1507 1508 1509
			found++;
		}
	}
	return found;
}

A
Alexander Duyck 已提交
1510
static int trie_flush_leaf(struct tnode *l)
1511 1512 1513
{
	int found = 0;
	struct hlist_head *lih = &l->list;
1514
	struct hlist_node *tmp;
1515 1516
	struct leaf_info *li = NULL;

1517
	hlist_for_each_entry_safe(li, tmp, lih, hlist) {
1518
		found += trie_flush_list(&li->falh);
1519 1520

		if (list_empty(&li->falh)) {
R
Robert Olsson 已提交
1521
			hlist_del_rcu(&li->hlist);
1522 1523 1524 1525 1526 1527
			free_leaf_info(li);
		}
	}
	return found;
}

1528 1529 1530 1531
/*
 * Scan for the next right leaf starting at node p->child[idx]
 * Since we have back pointer, no recursion necessary.
 */
A
Alexander Duyck 已提交
1532
static struct tnode *leaf_walk_rcu(struct tnode *p, struct tnode *c)
1533
{
1534
	do {
1535
		t_key idx = c ? idx = get_index(c->key, p) + 1 : 0;
R
Robert Olsson 已提交
1536

1537 1538
		while (idx < 1u << p->bits) {
			c = tnode_get_child_rcu(p, idx++);
R
Robert Olsson 已提交
1539
			if (!c)
O
Olof Johansson 已提交
1540 1541
				continue;

1542
			if (IS_LEAF(c))
A
Alexander Duyck 已提交
1543
				return c;
1544 1545

			/* Rescan start scanning in new node */
A
Alexander Duyck 已提交
1546
			p = c;
1547
			idx = 0;
1548
		}
1549 1550

		/* Node empty, walk back up to parent */
A
Alexander Duyck 已提交
1551
		c = p;
E
Eric Dumazet 已提交
1552
	} while ((p = node_parent_rcu(c)) != NULL);
1553 1554 1555 1556

	return NULL; /* Root of trie */
}

A
Alexander Duyck 已提交
1557
static struct tnode *trie_firstleaf(struct trie *t)
1558
{
A
Alexander Duyck 已提交
1559
	struct tnode *n = rcu_dereference_rtnl(t->trie);
1560 1561 1562 1563 1564

	if (!n)
		return NULL;

	if (IS_LEAF(n))          /* trie is just a leaf */
A
Alexander Duyck 已提交
1565
		return n;
1566 1567 1568 1569

	return leaf_walk_rcu(n, NULL);
}

A
Alexander Duyck 已提交
1570
static struct tnode *trie_nextleaf(struct tnode *l)
1571
{
A
Alexander Duyck 已提交
1572
	struct tnode *p = node_parent_rcu(l);
1573 1574 1575 1576

	if (!p)
		return NULL;	/* trie with just one leaf */

A
Alexander Duyck 已提交
1577
	return leaf_walk_rcu(p, l);
1578 1579
}

A
Alexander Duyck 已提交
1580
static struct tnode *trie_leafindex(struct trie *t, int index)
1581
{
A
Alexander Duyck 已提交
1582
	struct tnode *l = trie_firstleaf(t);
1583

S
Stephen Hemminger 已提交
1584
	while (l && index-- > 0)
1585
		l = trie_nextleaf(l);
S
Stephen Hemminger 已提交
1586

1587 1588 1589 1590
	return l;
}


1591 1592 1593
/*
 * Caller must hold RTNL.
 */
1594
int fib_table_flush(struct fib_table *tb)
1595 1596
{
	struct trie *t = (struct trie *) tb->tb_data;
A
Alexander Duyck 已提交
1597
	struct tnode *l, *ll = NULL;
1598
	int found = 0;
1599

1600
	for (l = trie_firstleaf(t); l; l = trie_nextleaf(l)) {
1601
		found += trie_flush_leaf(l);
1602 1603

		if (ll && hlist_empty(&ll->list))
1604
			trie_leaf_remove(t, ll);
1605 1606 1607 1608
		ll = l;
	}

	if (ll && hlist_empty(&ll->list))
1609
		trie_leaf_remove(t, ll);
1610

S
Stephen Hemminger 已提交
1611
	pr_debug("trie_flush found=%d\n", found);
1612 1613 1614
	return found;
}

1615 1616
void fib_free_table(struct fib_table *tb)
{
1617 1618 1619 1620 1621
#ifdef CONFIG_IP_FIB_TRIE_STATS
	struct trie *t = (struct trie *)tb->tb_data;

	free_percpu(t->stats);
#endif /* CONFIG_IP_FIB_TRIE_STATS */
1622 1623 1624
	kfree(tb);
}

1625 1626
static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah,
			   struct fib_table *tb,
1627 1628 1629 1630
			   struct sk_buff *skb, struct netlink_callback *cb)
{
	int i, s_i;
	struct fib_alias *fa;
A
Al Viro 已提交
1631
	__be32 xkey = htonl(key);
1632

1633
	s_i = cb->args[5];
1634 1635
	i = 0;

R
Robert Olsson 已提交
1636 1637 1638
	/* rcu_read_lock is hold by caller */

	list_for_each_entry_rcu(fa, fah, fa_list) {
1639 1640 1641 1642 1643
		if (i < s_i) {
			i++;
			continue;
		}

1644
		if (fib_dump_info(skb, NETLINK_CB(cb->skb).portid,
1645 1646 1647 1648
				  cb->nlh->nlmsg_seq,
				  RTM_NEWROUTE,
				  tb->tb_id,
				  fa->fa_type,
1649
				  xkey,
1650 1651
				  plen,
				  fa->fa_tos,
1652
				  fa->fa_info, NLM_F_MULTI) < 0) {
1653
			cb->args[5] = i;
1654
			return -1;
O
Olof Johansson 已提交
1655
		}
1656 1657
		i++;
	}
1658
	cb->args[5] = i;
1659 1660 1661
	return skb->len;
}

A
Alexander Duyck 已提交
1662
static int fn_trie_dump_leaf(struct tnode *l, struct fib_table *tb,
1663
			struct sk_buff *skb, struct netlink_callback *cb)
1664
{
1665 1666
	struct leaf_info *li;
	int i, s_i;
1667

1668
	s_i = cb->args[4];
1669
	i = 0;
1670

1671
	/* rcu_read_lock is hold by caller */
1672
	hlist_for_each_entry_rcu(li, &l->list, hlist) {
1673 1674
		if (i < s_i) {
			i++;
1675
			continue;
1676
		}
O
Olof Johansson 已提交
1677

1678
		if (i > s_i)
1679
			cb->args[5] = 0;
1680

1681
		if (list_empty(&li->falh))
1682 1683
			continue;

1684
		if (fn_trie_dump_fa(l->key, li->plen, &li->falh, tb, skb, cb) < 0) {
1685
			cb->args[4] = i;
1686 1687
			return -1;
		}
1688
		i++;
1689
	}
1690

1691
	cb->args[4] = i;
1692 1693 1694
	return skb->len;
}

1695 1696
int fib_table_dump(struct fib_table *tb, struct sk_buff *skb,
		   struct netlink_callback *cb)
1697
{
A
Alexander Duyck 已提交
1698
	struct tnode *l;
1699
	struct trie *t = (struct trie *) tb->tb_data;
1700
	t_key key = cb->args[2];
1701
	int count = cb->args[3];
1702

R
Robert Olsson 已提交
1703
	rcu_read_lock();
1704 1705 1706
	/* Dump starting at last key.
	 * Note: 0.0.0.0/0 (ie default) is first key.
	 */
1707
	if (count == 0)
1708 1709
		l = trie_firstleaf(t);
	else {
1710 1711 1712
		/* Normally, continue from last key, but if that is missing
		 * fallback to using slow rescan
		 */
1713
		l = fib_find_node(t, key);
1714 1715
		if (!l)
			l = trie_leafindex(t, count);
1716
	}
1717

1718 1719
	while (l) {
		cb->args[2] = l->key;
1720
		if (fn_trie_dump_leaf(l, tb, skb, cb) < 0) {
1721
			cb->args[3] = count;
1722 1723
			rcu_read_unlock();
			return -1;
1724
		}
1725

1726
		++count;
1727
		l = trie_nextleaf(l);
1728 1729
		memset(&cb->args[4], 0,
		       sizeof(cb->args) - 4*sizeof(cb->args[0]));
1730
	}
1731
	cb->args[3] = count;
R
Robert Olsson 已提交
1732
	rcu_read_unlock();
1733

1734 1735 1736
	return skb->len;
}

1737
void __init fib_trie_init(void)
1738
{
1739 1740
	fn_alias_kmem = kmem_cache_create("ip_fib_alias",
					  sizeof(struct fib_alias),
1741 1742 1743
					  0, SLAB_PANIC, NULL);

	trie_leaf_kmem = kmem_cache_create("ip_fib_trie",
A
Alexander Duyck 已提交
1744
					   max(sizeof(struct tnode),
1745 1746
					       sizeof(struct leaf_info)),
					   0, SLAB_PANIC, NULL);
1747
}
1748

1749

1750
struct fib_table *fib_trie_table(u32 id)
1751 1752 1753 1754 1755 1756 1757 1758 1759 1760
{
	struct fib_table *tb;
	struct trie *t;

	tb = kmalloc(sizeof(struct fib_table) + sizeof(struct trie),
		     GFP_KERNEL);
	if (tb == NULL)
		return NULL;

	tb->tb_id = id;
1761
	tb->tb_default = -1;
1762
	tb->tb_num_default = 0;
1763 1764

	t = (struct trie *) tb->tb_data;
1765 1766 1767 1768 1769 1770 1771 1772
	RCU_INIT_POINTER(t->trie, NULL);
#ifdef CONFIG_IP_FIB_TRIE_STATS
	t->stats = alloc_percpu(struct trie_use_stats);
	if (!t->stats) {
		kfree(tb);
		tb = NULL;
	}
#endif
1773 1774 1775 1776

	return tb;
}

1777 1778 1779
#ifdef CONFIG_PROC_FS
/* Depth first Trie walk iterator */
struct fib_trie_iter {
1780
	struct seq_net_private p;
1781
	struct fib_table *tb;
1782
	struct tnode *tnode;
E
Eric Dumazet 已提交
1783 1784
	unsigned int index;
	unsigned int depth;
1785
};
1786

A
Alexander Duyck 已提交
1787
static struct tnode *fib_trie_get_next(struct fib_trie_iter *iter)
1788
{
1789
	struct tnode *tn = iter->tnode;
E
Eric Dumazet 已提交
1790
	unsigned int cindex = iter->index;
1791
	struct tnode *p;
1792

1793 1794 1795 1796
	/* A single entry routing table */
	if (!tn)
		return NULL;

1797 1798 1799 1800
	pr_debug("get_next iter={node=%p index=%d depth=%d}\n",
		 iter->tnode, iter->index, iter->depth);
rescan:
	while (cindex < (1<<tn->bits)) {
A
Alexander Duyck 已提交
1801
		struct tnode *n = tnode_get_child_rcu(tn, cindex);
1802

1803 1804 1805 1806 1807 1808
		if (n) {
			if (IS_LEAF(n)) {
				iter->tnode = tn;
				iter->index = cindex + 1;
			} else {
				/* push down one level */
A
Alexander Duyck 已提交
1809
				iter->tnode = n;
1810 1811 1812 1813 1814
				iter->index = 0;
				++iter->depth;
			}
			return n;
		}
1815

1816 1817
		++cindex;
	}
O
Olof Johansson 已提交
1818

1819
	/* Current node exhausted, pop back up */
A
Alexander Duyck 已提交
1820
	p = node_parent_rcu(tn);
1821
	if (p) {
1822
		cindex = get_index(tn->key, p) + 1;
1823 1824 1825
		tn = p;
		--iter->depth;
		goto rescan;
1826
	}
1827 1828 1829

	/* got root? */
	return NULL;
1830 1831
}

A
Alexander Duyck 已提交
1832
static struct tnode *fib_trie_get_first(struct fib_trie_iter *iter,
1833
				       struct trie *t)
1834
{
A
Alexander Duyck 已提交
1835
	struct tnode *n;
1836

S
Stephen Hemminger 已提交
1837
	if (!t)
1838 1839 1840
		return NULL;

	n = rcu_dereference(t->trie);
1841
	if (!n)
1842
		return NULL;
1843

1844
	if (IS_TNODE(n)) {
A
Alexander Duyck 已提交
1845
		iter->tnode = n;
1846 1847 1848 1849 1850 1851
		iter->index = 0;
		iter->depth = 1;
	} else {
		iter->tnode = NULL;
		iter->index = 0;
		iter->depth = 0;
O
Olof Johansson 已提交
1852
	}
1853 1854

	return n;
1855
}
O
Olof Johansson 已提交
1856

1857 1858
static void trie_collect_stats(struct trie *t, struct trie_stat *s)
{
A
Alexander Duyck 已提交
1859
	struct tnode *n;
1860
	struct fib_trie_iter iter;
O
Olof Johansson 已提交
1861

1862
	memset(s, 0, sizeof(*s));
O
Olof Johansson 已提交
1863

1864
	rcu_read_lock();
1865
	for (n = fib_trie_get_first(&iter, t); n; n = fib_trie_get_next(&iter)) {
1866
		if (IS_LEAF(n)) {
1867 1868
			struct leaf_info *li;

1869 1870 1871 1872
			s->leaves++;
			s->totdepth += iter.depth;
			if (iter.depth > s->maxdepth)
				s->maxdepth = iter.depth;
1873

A
Alexander Duyck 已提交
1874
			hlist_for_each_entry_rcu(li, &n->list, hlist)
1875
				++s->prefixes;
1876 1877 1878 1879
		} else {
			int i;

			s->tnodes++;
A
Alexander Duyck 已提交
1880 1881
			if (n->bits < MAX_STAT_DEPTH)
				s->nodesizes[n->bits]++;
R
Robert Olsson 已提交
1882

A
Alexander Duyck 已提交
1883 1884
			for (i = 0; i < tnode_child_length(n); i++)
				if (!rcu_access_pointer(n->child[i]))
1885
					s->nullpointers++;
1886 1887
		}
	}
R
Robert Olsson 已提交
1888
	rcu_read_unlock();
1889 1890
}

1891 1892 1893 1894
/*
 *	This outputs /proc/net/fib_triestats
 */
static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat)
1895
{
E
Eric Dumazet 已提交
1896
	unsigned int i, max, pointers, bytes, avdepth;
1897

1898 1899 1900 1901
	if (stat->leaves)
		avdepth = stat->totdepth*100 / stat->leaves;
	else
		avdepth = 0;
O
Olof Johansson 已提交
1902

1903 1904
	seq_printf(seq, "\tAver depth:     %u.%02d\n",
		   avdepth / 100, avdepth % 100);
1905
	seq_printf(seq, "\tMax depth:      %u\n", stat->maxdepth);
O
Olof Johansson 已提交
1906

1907
	seq_printf(seq, "\tLeaves:         %u\n", stat->leaves);
A
Alexander Duyck 已提交
1908
	bytes = sizeof(struct tnode) * stat->leaves;
1909 1910 1911 1912

	seq_printf(seq, "\tPrefixes:       %u\n", stat->prefixes);
	bytes += sizeof(struct leaf_info) * stat->prefixes;

1913
	seq_printf(seq, "\tInternal nodes: %u\n\t", stat->tnodes);
1914
	bytes += sizeof(struct tnode) * stat->tnodes;
1915

R
Robert Olsson 已提交
1916 1917
	max = MAX_STAT_DEPTH;
	while (max > 0 && stat->nodesizes[max-1] == 0)
1918
		max--;
1919

1920
	pointers = 0;
1921
	for (i = 1; i < max; i++)
1922
		if (stat->nodesizes[i] != 0) {
1923
			seq_printf(seq, "  %u: %u",  i, stat->nodesizes[i]);
1924 1925 1926
			pointers += (1<<i) * stat->nodesizes[i];
		}
	seq_putc(seq, '\n');
1927
	seq_printf(seq, "\tPointers: %u\n", pointers);
R
Robert Olsson 已提交
1928

A
Alexander Duyck 已提交
1929
	bytes += sizeof(struct tnode *) * pointers;
1930 1931
	seq_printf(seq, "Null ptrs: %u\n", stat->nullpointers);
	seq_printf(seq, "Total size: %u  kB\n", (bytes + 1023) / 1024);
1932
}
R
Robert Olsson 已提交
1933

1934
#ifdef CONFIG_IP_FIB_TRIE_STATS
1935
static void trie_show_usage(struct seq_file *seq,
1936
			    const struct trie_use_stats __percpu *stats)
1937
{
1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952
	struct trie_use_stats s = { 0 };
	int cpu;

	/* loop through all of the CPUs and gather up the stats */
	for_each_possible_cpu(cpu) {
		const struct trie_use_stats *pcpu = per_cpu_ptr(stats, cpu);

		s.gets += pcpu->gets;
		s.backtrack += pcpu->backtrack;
		s.semantic_match_passed += pcpu->semantic_match_passed;
		s.semantic_match_miss += pcpu->semantic_match_miss;
		s.null_node_hit += pcpu->null_node_hit;
		s.resize_node_skipped += pcpu->resize_node_skipped;
	}

1953
	seq_printf(seq, "\nCounters:\n---------\n");
1954 1955
	seq_printf(seq, "gets = %u\n", s.gets);
	seq_printf(seq, "backtracks = %u\n", s.backtrack);
1956
	seq_printf(seq, "semantic match passed = %u\n",
1957 1958 1959 1960
		   s.semantic_match_passed);
	seq_printf(seq, "semantic match miss = %u\n", s.semantic_match_miss);
	seq_printf(seq, "null node hit= %u\n", s.null_node_hit);
	seq_printf(seq, "skipped node resize = %u\n\n", s.resize_node_skipped);
1961
}
1962 1963
#endif /*  CONFIG_IP_FIB_TRIE_STATS */

1964
static void fib_table_print(struct seq_file *seq, struct fib_table *tb)
1965
{
1966 1967 1968 1969 1970 1971
	if (tb->tb_id == RT_TABLE_LOCAL)
		seq_puts(seq, "Local:\n");
	else if (tb->tb_id == RT_TABLE_MAIN)
		seq_puts(seq, "Main:\n");
	else
		seq_printf(seq, "Id %d:\n", tb->tb_id);
1972
}
1973

1974

1975 1976
static int fib_triestat_seq_show(struct seq_file *seq, void *v)
{
1977
	struct net *net = (struct net *)seq->private;
1978
	unsigned int h;
1979

1980
	seq_printf(seq,
1981 1982
		   "Basic info: size of leaf:"
		   " %Zd bytes, size of tnode: %Zd bytes.\n",
A
Alexander Duyck 已提交
1983
		   sizeof(struct tnode), sizeof(struct tnode));
1984

1985 1986 1987 1988
	for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
		struct hlist_head *head = &net->ipv4.fib_table_hash[h];
		struct fib_table *tb;

1989
		hlist_for_each_entry_rcu(tb, head, tb_hlist) {
1990 1991
			struct trie *t = (struct trie *) tb->tb_data;
			struct trie_stat stat;
1992

1993 1994 1995 1996 1997 1998 1999 2000
			if (!t)
				continue;

			fib_table_print(seq, tb);

			trie_collect_stats(t, &stat);
			trie_show_stats(seq, &stat);
#ifdef CONFIG_IP_FIB_TRIE_STATS
2001
			trie_show_usage(seq, t->stats);
2002 2003 2004
#endif
		}
	}
2005

2006
	return 0;
2007 2008
}

2009
static int fib_triestat_seq_open(struct inode *inode, struct file *file)
2010
{
2011
	return single_open_net(inode, file, fib_triestat_seq_show);
2012 2013
}

2014
static const struct file_operations fib_triestat_fops = {
2015 2016 2017 2018
	.owner	= THIS_MODULE,
	.open	= fib_triestat_seq_open,
	.read	= seq_read,
	.llseek	= seq_lseek,
2019
	.release = single_release_net,
2020 2021
};

A
Alexander Duyck 已提交
2022
static struct tnode *fib_trie_get_idx(struct seq_file *seq, loff_t pos)
2023
{
2024 2025
	struct fib_trie_iter *iter = seq->private;
	struct net *net = seq_file_net(seq);
2026
	loff_t idx = 0;
2027
	unsigned int h;
2028

2029 2030 2031
	for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
		struct hlist_head *head = &net->ipv4.fib_table_hash[h];
		struct fib_table *tb;
2032

2033
		hlist_for_each_entry_rcu(tb, head, tb_hlist) {
A
Alexander Duyck 已提交
2034
			struct tnode *n;
2035 2036 2037 2038 2039 2040 2041 2042 2043

			for (n = fib_trie_get_first(iter,
						    (struct trie *) tb->tb_data);
			     n; n = fib_trie_get_next(iter))
				if (pos == idx++) {
					iter->tb = tb;
					return n;
				}
		}
2044
	}
2045

2046 2047 2048
	return NULL;
}

2049
static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos)
2050
	__acquires(RCU)
2051
{
2052
	rcu_read_lock();
2053
	return fib_trie_get_idx(seq, *pos);
2054 2055
}

2056
static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2057
{
2058
	struct fib_trie_iter *iter = seq->private;
2059
	struct net *net = seq_file_net(seq);
2060 2061 2062
	struct fib_table *tb = iter->tb;
	struct hlist_node *tb_node;
	unsigned int h;
A
Alexander Duyck 已提交
2063
	struct tnode *n;
2064

2065
	++*pos;
2066 2067 2068 2069
	/* next node in same table */
	n = fib_trie_get_next(iter);
	if (n)
		return n;
2070

2071 2072
	/* walk rest of this hash chain */
	h = tb->tb_id & (FIB_TABLE_HASHSZ - 1);
E
Eric Dumazet 已提交
2073
	while ((tb_node = rcu_dereference(hlist_next_rcu(&tb->tb_hlist)))) {
2074 2075 2076 2077 2078
		tb = hlist_entry(tb_node, struct fib_table, tb_hlist);
		n = fib_trie_get_first(iter, (struct trie *) tb->tb_data);
		if (n)
			goto found;
	}
2079

2080 2081 2082
	/* new hash chain */
	while (++h < FIB_TABLE_HASHSZ) {
		struct hlist_head *head = &net->ipv4.fib_table_hash[h];
2083
		hlist_for_each_entry_rcu(tb, head, tb_hlist) {
2084 2085 2086 2087 2088
			n = fib_trie_get_first(iter, (struct trie *) tb->tb_data);
			if (n)
				goto found;
		}
	}
2089
	return NULL;
2090 2091 2092 2093

found:
	iter->tb = tb;
	return n;
2094
}
2095

2096
static void fib_trie_seq_stop(struct seq_file *seq, void *v)
2097
	__releases(RCU)
2098
{
2099 2100
	rcu_read_unlock();
}
O
Olof Johansson 已提交
2101

2102 2103
static void seq_indent(struct seq_file *seq, int n)
{
E
Eric Dumazet 已提交
2104 2105
	while (n-- > 0)
		seq_puts(seq, "   ");
2106
}
2107

2108
static inline const char *rtn_scope(char *buf, size_t len, enum rt_scope_t s)
2109
{
S
Stephen Hemminger 已提交
2110
	switch (s) {
2111 2112 2113 2114 2115 2116
	case RT_SCOPE_UNIVERSE: return "universe";
	case RT_SCOPE_SITE:	return "site";
	case RT_SCOPE_LINK:	return "link";
	case RT_SCOPE_HOST:	return "host";
	case RT_SCOPE_NOWHERE:	return "nowhere";
	default:
2117
		snprintf(buf, len, "scope=%d", s);
2118 2119 2120
		return buf;
	}
}
2121

2122
static const char *const rtn_type_names[__RTN_MAX] = {
2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135
	[RTN_UNSPEC] = "UNSPEC",
	[RTN_UNICAST] = "UNICAST",
	[RTN_LOCAL] = "LOCAL",
	[RTN_BROADCAST] = "BROADCAST",
	[RTN_ANYCAST] = "ANYCAST",
	[RTN_MULTICAST] = "MULTICAST",
	[RTN_BLACKHOLE] = "BLACKHOLE",
	[RTN_UNREACHABLE] = "UNREACHABLE",
	[RTN_PROHIBIT] = "PROHIBIT",
	[RTN_THROW] = "THROW",
	[RTN_NAT] = "NAT",
	[RTN_XRESOLVE] = "XRESOLVE",
};
2136

E
Eric Dumazet 已提交
2137
static inline const char *rtn_type(char *buf, size_t len, unsigned int t)
2138 2139 2140
{
	if (t < __RTN_MAX && rtn_type_names[t])
		return rtn_type_names[t];
2141
	snprintf(buf, len, "type %u", t);
2142
	return buf;
2143 2144
}

2145 2146
/* Pretty print the trie */
static int fib_trie_seq_show(struct seq_file *seq, void *v)
2147
{
2148
	const struct fib_trie_iter *iter = seq->private;
A
Alexander Duyck 已提交
2149
	struct tnode *n = v;
2150

2151 2152
	if (!node_parent_rcu(n))
		fib_table_print(seq, iter->tb);
2153

2154
	if (IS_TNODE(n)) {
A
Alexander Duyck 已提交
2155
		__be32 prf = htonl(n->key);
O
Olof Johansson 已提交
2156

2157 2158 2159 2160
		seq_indent(seq, iter->depth-1);
		seq_printf(seq, "  +-- %pI4/%zu %u %u %u\n",
			   &prf, KEYLENGTH - n->pos - n->bits, n->bits,
			   n->full_children, n->empty_children);
2161
	} else {
2162
		struct leaf_info *li;
A
Alexander Duyck 已提交
2163
		__be32 val = htonl(n->key);
2164 2165

		seq_indent(seq, iter->depth);
2166
		seq_printf(seq, "  |-- %pI4\n", &val);
2167

A
Alexander Duyck 已提交
2168
		hlist_for_each_entry_rcu(li, &n->list, hlist) {
2169 2170 2171 2172 2173 2174 2175 2176
			struct fib_alias *fa;

			list_for_each_entry_rcu(fa, &li->falh, fa_list) {
				char buf1[32], buf2[32];

				seq_indent(seq, iter->depth+1);
				seq_printf(seq, "  /%d %s %s", li->plen,
					   rtn_scope(buf1, sizeof(buf1),
2177
						     fa->fa_info->fib_scope),
2178 2179 2180
					   rtn_type(buf2, sizeof(buf2),
						    fa->fa_type));
				if (fa->fa_tos)
2181
					seq_printf(seq, " tos=%d", fa->fa_tos);
2182
				seq_putc(seq, '\n');
2183 2184
			}
		}
2185
	}
2186

2187 2188 2189
	return 0;
}

2190
static const struct seq_operations fib_trie_seq_ops = {
2191 2192 2193 2194
	.start  = fib_trie_seq_start,
	.next   = fib_trie_seq_next,
	.stop   = fib_trie_seq_stop,
	.show   = fib_trie_seq_show,
2195 2196
};

2197
static int fib_trie_seq_open(struct inode *inode, struct file *file)
2198
{
2199 2200
	return seq_open_net(inode, file, &fib_trie_seq_ops,
			    sizeof(struct fib_trie_iter));
2201 2202
}

2203
static const struct file_operations fib_trie_fops = {
2204 2205 2206 2207
	.owner  = THIS_MODULE,
	.open   = fib_trie_seq_open,
	.read   = seq_read,
	.llseek = seq_lseek,
2208
	.release = seq_release_net,
2209 2210
};

2211 2212 2213 2214 2215 2216 2217
struct fib_route_iter {
	struct seq_net_private p;
	struct trie *main_trie;
	loff_t	pos;
	t_key	key;
};

A
Alexander Duyck 已提交
2218
static struct tnode *fib_route_get_idx(struct fib_route_iter *iter, loff_t pos)
2219
{
A
Alexander Duyck 已提交
2220
	struct tnode *l = NULL;
2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250
	struct trie *t = iter->main_trie;

	/* use cache location of last found key */
	if (iter->pos > 0 && pos >= iter->pos && (l = fib_find_node(t, iter->key)))
		pos -= iter->pos;
	else {
		iter->pos = 0;
		l = trie_firstleaf(t);
	}

	while (l && pos-- > 0) {
		iter->pos++;
		l = trie_nextleaf(l);
	}

	if (l)
		iter->key = pos;	/* remember it */
	else
		iter->pos = 0;		/* forget it */

	return l;
}

static void *fib_route_seq_start(struct seq_file *seq, loff_t *pos)
	__acquires(RCU)
{
	struct fib_route_iter *iter = seq->private;
	struct fib_table *tb;

	rcu_read_lock();
2251
	tb = fib_get_table(seq_file_net(seq), RT_TABLE_MAIN);
2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264
	if (!tb)
		return NULL;

	iter->main_trie = (struct trie *) tb->tb_data;
	if (*pos == 0)
		return SEQ_START_TOKEN;
	else
		return fib_route_get_idx(iter, *pos - 1);
}

static void *fib_route_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct fib_route_iter *iter = seq->private;
A
Alexander Duyck 已提交
2265
	struct tnode *l = v;
2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288

	++*pos;
	if (v == SEQ_START_TOKEN) {
		iter->pos = 0;
		l = trie_firstleaf(iter->main_trie);
	} else {
		iter->pos++;
		l = trie_nextleaf(l);
	}

	if (l)
		iter->key = l->key;
	else
		iter->pos = 0;
	return l;
}

static void fib_route_seq_stop(struct seq_file *seq, void *v)
	__releases(RCU)
{
	rcu_read_unlock();
}

E
Eric Dumazet 已提交
2289
static unsigned int fib_flag_trans(int type, __be32 mask, const struct fib_info *fi)
2290
{
E
Eric Dumazet 已提交
2291
	unsigned int flags = 0;
2292

E
Eric Dumazet 已提交
2293 2294
	if (type == RTN_UNREACHABLE || type == RTN_PROHIBIT)
		flags = RTF_REJECT;
2295 2296
	if (fi && fi->fib_nh->nh_gw)
		flags |= RTF_GATEWAY;
A
Al Viro 已提交
2297
	if (mask == htonl(0xFFFFFFFF))
2298 2299 2300
		flags |= RTF_HOST;
	flags |= RTF_UP;
	return flags;
2301 2302
}

2303 2304 2305
/*
 *	This outputs /proc/net/route.
 *	The format of the file is not supposed to be changed
E
Eric Dumazet 已提交
2306
 *	and needs to be same as fib_hash output to avoid breaking
2307 2308 2309
 *	legacy utilities
 */
static int fib_route_seq_show(struct seq_file *seq, void *v)
2310
{
A
Alexander Duyck 已提交
2311
	struct tnode *l = v;
2312
	struct leaf_info *li;
2313

2314 2315 2316 2317 2318 2319
	if (v == SEQ_START_TOKEN) {
		seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway "
			   "\tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU"
			   "\tWindow\tIRTT");
		return 0;
	}
2320

2321
	hlist_for_each_entry_rcu(li, &l->list, hlist) {
2322
		struct fib_alias *fa;
A
Al Viro 已提交
2323
		__be32 mask, prefix;
O
Olof Johansson 已提交
2324

2325 2326
		mask = inet_make_mask(li->plen);
		prefix = htonl(l->key);
2327

2328
		list_for_each_entry_rcu(fa, &li->falh, fa_list) {
2329
			const struct fib_info *fi = fa->fa_info;
E
Eric Dumazet 已提交
2330
			unsigned int flags = fib_flag_trans(fa->fa_type, mask, fi);
2331

2332 2333 2334
			if (fa->fa_type == RTN_BROADCAST
			    || fa->fa_type == RTN_MULTICAST)
				continue;
2335

2336 2337
			seq_setwidth(seq, 127);

2338
			if (fi)
2339 2340
				seq_printf(seq,
					 "%s\t%08X\t%08X\t%04X\t%d\t%u\t"
2341
					 "%d\t%08X\t%d\t%u\t%u",
2342 2343 2344 2345 2346
					 fi->fib_dev ? fi->fib_dev->name : "*",
					 prefix,
					 fi->fib_nh->nh_gw, flags, 0, 0,
					 fi->fib_priority,
					 mask,
2347 2348
					 (fi->fib_advmss ?
					  fi->fib_advmss + 40 : 0),
2349
					 fi->fib_window,
2350
					 fi->fib_rtt >> 3);
2351
			else
2352 2353
				seq_printf(seq,
					 "*\t%08X\t%08X\t%04X\t%d\t%u\t"
2354
					 "%d\t%08X\t%d\t%u\t%u",
2355
					 prefix, 0, flags, 0, 0, 0,
2356
					 mask, 0, 0, 0);
2357

2358
			seq_pad(seq, '\n');
2359
		}
2360 2361 2362 2363 2364
	}

	return 0;
}

2365
static const struct seq_operations fib_route_seq_ops = {
2366 2367 2368
	.start  = fib_route_seq_start,
	.next   = fib_route_seq_next,
	.stop   = fib_route_seq_stop,
2369
	.show   = fib_route_seq_show,
2370 2371
};

2372
static int fib_route_seq_open(struct inode *inode, struct file *file)
2373
{
2374
	return seq_open_net(inode, file, &fib_route_seq_ops,
2375
			    sizeof(struct fib_route_iter));
2376 2377
}

2378
static const struct file_operations fib_route_fops = {
2379 2380 2381 2382
	.owner  = THIS_MODULE,
	.open   = fib_route_seq_open,
	.read   = seq_read,
	.llseek = seq_lseek,
2383
	.release = seq_release_net,
2384 2385
};

2386
int __net_init fib_proc_init(struct net *net)
2387
{
2388
	if (!proc_create("fib_trie", S_IRUGO, net->proc_net, &fib_trie_fops))
2389 2390
		goto out1;

2391 2392
	if (!proc_create("fib_triestat", S_IRUGO, net->proc_net,
			 &fib_triestat_fops))
2393 2394
		goto out2;

2395
	if (!proc_create("route", S_IRUGO, net->proc_net, &fib_route_fops))
2396 2397
		goto out3;

2398
	return 0;
2399 2400

out3:
2401
	remove_proc_entry("fib_triestat", net->proc_net);
2402
out2:
2403
	remove_proc_entry("fib_trie", net->proc_net);
2404 2405
out1:
	return -ENOMEM;
2406 2407
}

2408
void __net_exit fib_proc_exit(struct net *net)
2409
{
2410 2411 2412
	remove_proc_entry("fib_trie", net->proc_net);
	remove_proc_entry("fib_triestat", net->proc_net);
	remove_proc_entry("route", net->proc_net);
2413 2414 2415
}

#endif /* CONFIG_PROC_FS */