fib_trie.c 57.9 KB
Newer Older
1 2 3 4 5 6 7 8 9
/*
 *   This program is free software; you can redistribute it and/or
 *   modify it under the terms of the GNU General Public License
 *   as published by the Free Software Foundation; either version
 *   2 of the License, or (at your option) any later version.
 *
 *   Robert Olsson <robert.olsson@its.uu.se> Uppsala Universitet
 *     & Swedish University of Agricultural Sciences.
 *
10
 *   Jens Laas <jens.laas@data.slu.se> Swedish University of
11
 *     Agricultural Sciences.
12
 *
13 14
 *   Hans Liss <hans.liss@its.uu.se>  Uppsala Universitet
 *
L
Lucas De Marchi 已提交
15
 * This work is based on the LPC-trie which is originally described in:
16
 *
17 18
 * An experimental study of compression methods for dynamic tries
 * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
19
 * http://www.csc.kth.se/~snilsson/software/dyntrie2/
20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41
 *
 *
 * IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
 * IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999
 *
 *
 * Code from fib_hash has been reused which includes the following header:
 *
 *
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		IPv4 FIB: lookup engine and maintenance routines.
 *
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
R
Robert Olsson 已提交
42 43 44 45 46 47 48
 *
 * Substantial contributions to this work comes from:
 *
 *		David S. Miller, <davem@davemloft.net>
 *		Stephen Hemminger <shemminger@osdl.org>
 *		Paul E. McKenney <paulmck@us.ibm.com>
 *		Patrick McHardy <kaber@trash.net>
49 50
 */

J
Jens Låås 已提交
51
#define VERSION "0.409"
52 53

#include <asm/uaccess.h>
J
Jiri Slaby 已提交
54
#include <linux/bitops.h>
55 56 57 58 59 60 61 62 63
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
S
Stephen Hemminger 已提交
64
#include <linux/inetdevice.h>
65 66 67
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/proc_fs.h>
R
Robert Olsson 已提交
68
#include <linux/rcupdate.h>
69 70 71 72
#include <linux/skbuff.h>
#include <linux/netlink.h>
#include <linux/init.h>
#include <linux/list.h>
73
#include <linux/slab.h>
74
#include <linux/export.h>
75
#include <net/net_namespace.h>
76 77 78 79 80 81 82 83
#include <net/ip.h>
#include <net/protocol.h>
#include <net/route.h>
#include <net/tcp.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include "fib_lookup.h"

R
Robert Olsson 已提交
84
#define MAX_STAT_DEPTH 32
85 86 87 88 89

#define KEYLENGTH (8*sizeof(t_key))

typedef unsigned int t_key;

90 91
#define IS_TNODE(n) ((n)->bits)
#define IS_LEAF(n) (!(n)->bits)
R
Robert Olsson 已提交
92

93 94 95
#define get_shift(_kv) (KEYLENGTH - (_kv)->pos - (_kv)->bits)
#define get_index(_key, _kv) (((_key) ^ (_kv)->key) >> get_shift(_kv))

96 97 98 99 100
struct tnode {
	t_key key;
	unsigned char bits;		/* 2log(KEYLENGTH) bits needed */
	unsigned char pos;		/* 2log(KEYLENGTH) bits needed */
	struct tnode __rcu *parent;
101
	struct rcu_head rcu;
A
Alexander Duyck 已提交
102 103 104 105 106 107 108 109 110 111
	union {
		/* The fields in this struct are valid if bits > 0 (TNODE) */
		struct {
			unsigned int full_children;  /* KEYLENGTH bits needed */
			unsigned int empty_children; /* KEYLENGTH bits needed */
			struct tnode __rcu *child[0];
		};
		/* This list pointer if valid if bits == 0 (LEAF) */
		struct hlist_head list;
	};
112 113 114 115 116
};

struct leaf_info {
	struct hlist_node hlist;
	int plen;
117
	u32 mask_plen; /* ntohl(inet_make_mask(plen)) */
118
	struct list_head falh;
119
	struct rcu_head rcu;
120 121 122 123 124 125 126 127 128
};

#ifdef CONFIG_IP_FIB_TRIE_STATS
struct trie_use_stats {
	unsigned int gets;
	unsigned int backtrack;
	unsigned int semantic_match_passed;
	unsigned int semantic_match_miss;
	unsigned int null_node_hit;
129
	unsigned int resize_node_skipped;
130 131 132 133 134 135 136 137 138
};
#endif

struct trie_stat {
	unsigned int totdepth;
	unsigned int maxdepth;
	unsigned int tnodes;
	unsigned int leaves;
	unsigned int nullpointers;
139
	unsigned int prefixes;
R
Robert Olsson 已提交
140
	unsigned int nodesizes[MAX_STAT_DEPTH];
141
};
142 143

struct trie {
A
Alexander Duyck 已提交
144
	struct tnode __rcu *trie;
145
#ifdef CONFIG_IP_FIB_TRIE_STATS
146
	struct trie_use_stats __percpu *stats;
147 148 149
#endif
};

A
Alexander Duyck 已提交
150
static void tnode_put_child_reorg(struct tnode *tn, int i, struct tnode *n,
151
				  int wasfull);
A
Alexander Duyck 已提交
152
static struct tnode *resize(struct trie *t, struct tnode *tn);
153 154
static struct tnode *inflate(struct trie *t, struct tnode *tn);
static struct tnode *halve(struct trie *t, struct tnode *tn);
J
Jarek Poplawski 已提交
155
/* tnodes to free after resize(); protected by RTNL */
156
static struct callback_head *tnode_free_head;
157 158 159 160 161 162 163 164
static size_t tnode_free_size;

/*
 * synchronize_rcu after call_rcu for that many pages; it should be especially
 * useful before resizing the root node with PREEMPT_NONE configs; the value was
 * obtained experimentally, aiming to avoid visible slowdown.
 */
static const int sync_pages = 128;
165

166
static struct kmem_cache *fn_alias_kmem __read_mostly;
167
static struct kmem_cache *trie_leaf_kmem __read_mostly;
168

169 170
/* caller must hold RTNL */
#define node_parent(n) rtnl_dereference((n)->parent)
E
Eric Dumazet 已提交
171

172 173
/* caller must hold RCU read lock or RTNL */
#define node_parent_rcu(n) rcu_dereference_rtnl((n)->parent)
E
Eric Dumazet 已提交
174

175
/* wrapper for rcu_assign_pointer */
A
Alexander Duyck 已提交
176
static inline void node_set_parent(struct tnode *n, struct tnode *tp)
177
{
A
Alexander Duyck 已提交
178 179
	if (n)
		rcu_assign_pointer(n->parent, tp);
S
Stephen Hemminger 已提交
180 181
}

182 183 184 185
#define NODE_INIT_PARENT(n, p) RCU_INIT_POINTER((n)->parent, p)

/* This provides us with the number of children in this node, in the case of a
 * leaf this will return 0 meaning none of the children are accessible.
186
 */
187
static inline int tnode_child_length(const struct tnode *tn)
S
Stephen Hemminger 已提交
188
{
189
	return (1ul << tn->bits) & ~(1ul);
S
Stephen Hemminger 已提交
190
}
R
Robert Olsson 已提交
191

E
Eric Dumazet 已提交
192 193 194
/*
 * caller must hold RTNL
 */
A
Alexander Duyck 已提交
195
static inline struct tnode *tnode_get_child(const struct tnode *tn, unsigned int i)
196
{
197
	BUG_ON(i >= tnode_child_length(tn));
R
Robert Olsson 已提交
198

E
Eric Dumazet 已提交
199
	return rtnl_dereference(tn->child[i]);
200 201
}

E
Eric Dumazet 已提交
202 203 204
/*
 * caller must hold RCU read lock or RTNL
 */
A
Alexander Duyck 已提交
205
static inline struct tnode *tnode_get_child_rcu(const struct tnode *tn, unsigned int i)
206
{
207
	BUG_ON(i >= tnode_child_length(tn));
208

E
Eric Dumazet 已提交
209
	return rcu_dereference_rtnl(tn->child[i]);
210 211
}

212
static inline t_key mask_pfx(t_key k, unsigned int l)
S
Stephen Hemminger 已提交
213 214 215 216
{
	return (l == 0) ? 0 : k >> (KEYLENGTH-l) << (KEYLENGTH-l);
}

217
static inline t_key tkey_extract_bits(t_key a, unsigned int offset, unsigned int bits)
218
{
O
Olof Johansson 已提交
219
	if (offset < KEYLENGTH)
220
		return ((t_key)(a << offset)) >> (KEYLENGTH - bits);
O
Olof Johansson 已提交
221
	else
222 223 224 225 226
		return 0;
}

static inline int tkey_equals(t_key a, t_key b)
{
227
	return a == b;
228 229 230 231
}

static inline int tkey_sub_equals(t_key a, int offset, int bits, t_key b)
{
232 233
	if (bits == 0 || offset >= KEYLENGTH)
		return 1;
O
Olof Johansson 已提交
234 235
	bits = bits > KEYLENGTH ? KEYLENGTH : bits;
	return ((a ^ b) << offset) >> (KEYLENGTH - bits) == 0;
236
}
237 238 239 240 241 242

static inline int tkey_mismatch(t_key a, int offset, t_key b)
{
	t_key diff = a ^ b;
	int i = offset;

243 244 245
	if (!diff)
		return 0;
	while ((diff << i) >> (KEYLENGTH-1) == 0)
246 247 248 249 250
		i++;
	return i;
}

/*
251 252
  To understand this stuff, an understanding of keys and all their bits is
  necessary. Every node in the trie has a key associated with it, but not
253 254 255 256
  all of the bits in that key are significant.

  Consider a node 'n' and its parent 'tp'.

257 258 259 260 261
  If n is a leaf, every bit in its key is significant. Its presence is
  necessitated by path compression, since during a tree traversal (when
  searching for a leaf - unless we are doing an insertion) we will completely
  ignore all skipped bits we encounter. Thus we need to verify, at the end of
  a potentially successful search, that we have indeed been walking the
262 263
  correct key path.

264 265 266 267 268
  Note that we can never "miss" the correct key in the tree if present by
  following the wrong path. Path compression ensures that segments of the key
  that are the same for all keys with a given prefix are skipped, but the
  skipped part *is* identical for each node in the subtrie below the skipped
  bit! trie_insert() in this implementation takes care of that - note the
269 270
  call to tkey_sub_equals() in trie_insert().

271
  if n is an internal node - a 'tnode' here, the various parts of its key
272 273
  have many different meanings.

274
  Example:
275 276 277
  _________________________________________________________________
  | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
  -----------------------------------------------------------------
278
    0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15
279 280 281 282 283 284 285 286 287

  _________________________________________________________________
  | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
  -----------------------------------------------------------------
   16  17  18  19  20  21  22  23  24  25  26  27  28  29  30  31

  tp->pos = 7
  tp->bits = 3
  n->pos = 15
O
Olof Johansson 已提交
288
  n->bits = 4
289

290 291
  First, let's just ignore the bits that come before the parent tp, that is
  the bits from 0 to (tp->pos-1). They are *known* but at this point we do
292 293 294
  not use them for anything.

  The bits from (tp->pos) to (tp->pos + tp->bits - 1) - "N", above - are the
295
  index into the parent's child array. That is, they will be used to find
296 297 298 299 300
  'n' among tp's children.

  The bits from (tp->pos + tp->bits) to (n->pos - 1) - "S" - are skipped bits
  for the node n.

301
  All the bits we have seen so far are significant to the node n. The rest
302 303
  of the bits are really not needed or indeed known in n->key.

304
  The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into
305
  n's child array, and will of course be different for each child.
306

307

308 309 310 311 312
  The rest of the bits, from (n->pos + n->bits) onward, are completely unknown
  at this point.

*/

313 314
static const int halve_threshold = 25;
static const int inflate_threshold = 50;
315
static const int halve_threshold_root = 15;
J
Jens Låås 已提交
316
static const int inflate_threshold_root = 30;
R
Robert Olsson 已提交
317 318

static void __alias_free_mem(struct rcu_head *head)
319
{
R
Robert Olsson 已提交
320 321
	struct fib_alias *fa = container_of(head, struct fib_alias, rcu);
	kmem_cache_free(fn_alias_kmem, fa);
322 323
}

R
Robert Olsson 已提交
324
static inline void alias_free_mem_rcu(struct fib_alias *fa)
325
{
R
Robert Olsson 已提交
326 327
	call_rcu(&fa->rcu, __alias_free_mem);
}
O
Olof Johansson 已提交
328

329
#define TNODE_KMALLOC_MAX \
A
Alexander Duyck 已提交
330
	ilog2((PAGE_SIZE - sizeof(struct tnode)) / sizeof(struct tnode *))
O
Olof Johansson 已提交
331

332
static void __node_free_rcu(struct rcu_head *head)
333
{
A
Alexander Duyck 已提交
334
	struct tnode *n = container_of(head, struct tnode, rcu);
335 336 337 338 339 340 341

	if (IS_LEAF(n))
		kmem_cache_free(trie_leaf_kmem, n);
	else if (n->bits <= TNODE_KMALLOC_MAX)
		kfree(n);
	else
		vfree(n);
342 343
}

344 345
#define node_free(n) call_rcu(&n->rcu, __node_free_rcu)

R
Robert Olsson 已提交
346
static inline void free_leaf_info(struct leaf_info *leaf)
347
{
348
	kfree_rcu(leaf, rcu);
349 350
}

351
static struct tnode *tnode_alloc(size_t size)
352
{
R
Robert Olsson 已提交
353
	if (size <= PAGE_SIZE)
354
		return kzalloc(size, GFP_KERNEL);
355
	else
356
		return vzalloc(size);
357
}
R
Robert Olsson 已提交
358

J
Jarek Poplawski 已提交
359 360 361
static void tnode_free_safe(struct tnode *tn)
{
	BUG_ON(IS_LEAF(tn));
362 363
	tn->rcu.next = tnode_free_head;
	tnode_free_head = &tn->rcu;
J
Jarek Poplawski 已提交
364 365 366 367
}

static void tnode_free_flush(void)
{
368 369 370 371 372 373 374
	struct callback_head *head;

	while ((head = tnode_free_head)) {
		struct tnode *tn = container_of(head, struct tnode, rcu);

		tnode_free_head = head->next;
		tnode_free_size += offsetof(struct tnode, child[1 << tn->bits]);
J
Jarek Poplawski 已提交
375

376
		node_free(tn);
J
Jarek Poplawski 已提交
377
	}
378 379 380 381 382

	if (tnode_free_size >= PAGE_SIZE * sync_pages) {
		tnode_free_size = 0;
		synchronize_rcu();
	}
J
Jarek Poplawski 已提交
383 384
}

A
Alexander Duyck 已提交
385
static struct tnode *leaf_new(t_key key)
R
Robert Olsson 已提交
386
{
A
Alexander Duyck 已提交
387
	struct tnode *l = kmem_cache_alloc(trie_leaf_kmem, GFP_KERNEL);
R
Robert Olsson 已提交
388
	if (l) {
389 390 391 392 393 394 395 396 397 398
		l->parent = NULL;
		/* set key and pos to reflect full key value
		 * any trailing zeros in the key should be ignored
		 * as the nodes are searched
		 */
		l->key = key;
		l->pos = KEYLENGTH;
		/* set bits to 0 indicating we are not a tnode */
		l->bits = 0;

R
Robert Olsson 已提交
399 400 401 402 403 404 405 406 407 408
		INIT_HLIST_HEAD(&l->list);
	}
	return l;
}

static struct leaf_info *leaf_info_new(int plen)
{
	struct leaf_info *li = kmalloc(sizeof(struct leaf_info),  GFP_KERNEL);
	if (li) {
		li->plen = plen;
409
		li->mask_plen = ntohl(inet_make_mask(plen));
R
Robert Olsson 已提交
410 411 412 413 414
		INIT_LIST_HEAD(&li->falh);
	}
	return li;
}

415
static struct tnode *tnode_new(t_key key, int pos, int bits)
416
{
417
	size_t sz = offsetof(struct tnode, child[1 << bits]);
418
	struct tnode *tn = tnode_alloc(sz);
419 420 421 422
	unsigned int shift = pos + bits;

	/* verify bits and pos their msb bits clear and values are valid */
	BUG_ON(!bits || (shift > KEYLENGTH));
423

O
Olof Johansson 已提交
424
	if (tn) {
425
		tn->parent = NULL;
426 427
		tn->pos = pos;
		tn->bits = bits;
428
		tn->key = mask_pfx(key, pos);
429 430 431
		tn->full_children = 0;
		tn->empty_children = 1<<bits;
	}
432

E
Eric Dumazet 已提交
433
	pr_debug("AT %p s=%zu %zu\n", tn, sizeof(struct tnode),
A
Alexander Duyck 已提交
434
		 sizeof(struct tnode *) << bits);
435 436 437 438 439 440 441 442
	return tn;
}

/*
 * Check whether a tnode 'n' is "full", i.e. it is an internal node
 * and no bits are skipped. See discussion in dyntree paper p. 6
 */

A
Alexander Duyck 已提交
443
static inline int tnode_full(const struct tnode *tn, const struct tnode *n)
444
{
445
	return n && IS_TNODE(n) && (n->pos == (tn->pos + tn->bits));
446 447
}

L
Lin Ming 已提交
448
static inline void put_child(struct tnode *tn, int i,
A
Alexander Duyck 已提交
449
			     struct tnode *n)
450 451 452 453
{
	tnode_put_child_reorg(tn, i, n, -1);
}

454
 /*
455 456 457 458
  * Add a child at position i overwriting the old value.
  * Update the value of full_children and empty_children.
  */

A
Alexander Duyck 已提交
459
static void tnode_put_child_reorg(struct tnode *tn, int i, struct tnode *n,
460
				  int wasfull)
461
{
A
Alexander Duyck 已提交
462
	struct tnode *chi = rtnl_dereference(tn->child[i]);
463 464
	int isfull;

S
Stephen Hemminger 已提交
465 466
	BUG_ON(i >= 1<<tn->bits);

467 468 469 470 471
	/* update emptyChildren */
	if (n == NULL && chi != NULL)
		tn->empty_children++;
	else if (n != NULL && chi == NULL)
		tn->empty_children--;
472

473
	/* update fullChildren */
O
Olof Johansson 已提交
474
	if (wasfull == -1)
475 476 477
		wasfull = tnode_full(tn, chi);

	isfull = tnode_full(tn, n);
478
	if (wasfull && !isfull)
479
		tn->full_children--;
480
	else if (!wasfull && isfull)
481
		tn->full_children++;
O
Olof Johansson 已提交
482

483
	node_set_parent(n, tn);
484

485
	rcu_assign_pointer(tn->child[i], n);
486 487
}

J
Jens Låås 已提交
488
#define MAX_WORK 10
A
Alexander Duyck 已提交
489
static struct tnode *resize(struct trie *t, struct tnode *tn)
490
{
A
Alexander Duyck 已提交
491
	struct tnode *old_tn, *n = NULL;
492 493
	int inflate_threshold_use;
	int halve_threshold_use;
J
Jens Låås 已提交
494
	int max_work;
495

496
	if (!tn)
497 498
		return NULL;

S
Stephen Hemminger 已提交
499 500
	pr_debug("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
		 tn, inflate_threshold, halve_threshold);
501 502

	/* No children */
503 504 505
	if (tn->empty_children > (tnode_child_length(tn) - 1))
		goto no_children;

506
	/* One child */
507
	if (tn->empty_children == (tnode_child_length(tn) - 1))
J
Jens Låås 已提交
508
		goto one_child;
509
	/*
510 511 512 513 514
	 * Double as long as the resulting node has a number of
	 * nonempty nodes that are above the threshold.
	 */

	/*
515 516
	 * From "Implementing a dynamic compressed trie" by Stefan Nilsson of
	 * the Helsinki University of Technology and Matti Tikkanen of Nokia
517
	 * Telecommunications, page 6:
518
	 * "A node is doubled if the ratio of non-empty children to all
519 520
	 * children in the *doubled* node is at least 'high'."
	 *
521 522 523 524 525
	 * 'high' in this instance is the variable 'inflate_threshold'. It
	 * is expressed as a percentage, so we multiply it with
	 * tnode_child_length() and instead of multiplying by 2 (since the
	 * child array will be doubled by inflate()) and multiplying
	 * the left-hand side by 100 (to handle the percentage thing) we
526
	 * multiply the left-hand side by 50.
527 528 529 530
	 *
	 * The left-hand side may look a bit weird: tnode_child_length(tn)
	 * - tn->empty_children is of course the number of non-null children
	 * in the current node. tn->full_children is the number of "full"
531
	 * children, that is non-null tnodes with a skip value of 0.
532
	 * All of those will be doubled in the resulting inflated tnode, so
533
	 * we just count them one extra time here.
534
	 *
535
	 * A clearer way to write this would be:
536
	 *
537
	 * to_be_doubled = tn->full_children;
538
	 * not_to_be_doubled = tnode_child_length(tn) - tn->empty_children -
539 540 541 542
	 *     tn->full_children;
	 *
	 * new_child_length = tnode_child_length(tn) * 2;
	 *
543
	 * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) /
544 545
	 *      new_child_length;
	 * if (new_fill_factor >= inflate_threshold)
546 547 548
	 *
	 * ...and so on, tho it would mess up the while () loop.
	 *
549 550 551
	 * anyway,
	 * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >=
	 *      inflate_threshold
552
	 *
553 554 555
	 * avoid a division:
	 * 100 * (not_to_be_doubled + 2*to_be_doubled) >=
	 *      inflate_threshold * new_child_length
556
	 *
557
	 * expand not_to_be_doubled and to_be_doubled, and shorten:
558
	 * 100 * (tnode_child_length(tn) - tn->empty_children +
O
Olof Johansson 已提交
559
	 *    tn->full_children) >= inflate_threshold * new_child_length
560
	 *
561
	 * expand new_child_length:
562
	 * 100 * (tnode_child_length(tn) - tn->empty_children +
O
Olof Johansson 已提交
563
	 *    tn->full_children) >=
564
	 *      inflate_threshold * tnode_child_length(tn) * 2
565
	 *
566
	 * shorten again:
567
	 * 50 * (tn->full_children + tnode_child_length(tn) -
O
Olof Johansson 已提交
568
	 *    tn->empty_children) >= inflate_threshold *
569
	 *    tnode_child_length(tn)
570
	 *
571 572
	 */

573 574
	/* Keep root node larger  */

575
	if (!node_parent(tn)) {
J
Jens Låås 已提交
576 577
		inflate_threshold_use = inflate_threshold_root;
		halve_threshold_use = halve_threshold_root;
E
Eric Dumazet 已提交
578
	} else {
579
		inflate_threshold_use = inflate_threshold;
J
Jens Låås 已提交
580 581
		halve_threshold_use = halve_threshold;
	}
582

J
Jens Låås 已提交
583 584
	max_work = MAX_WORK;
	while ((tn->full_children > 0 &&  max_work-- &&
585 586 587
		50 * (tn->full_children + tnode_child_length(tn)
		      - tn->empty_children)
		>= inflate_threshold_use * tnode_child_length(tn))) {
588

589 590
		old_tn = tn;
		tn = inflate(t, tn);
591

592 593
		if (IS_ERR(tn)) {
			tn = old_tn;
594
#ifdef CONFIG_IP_FIB_TRIE_STATS
595
			this_cpu_inc(t->stats->resize_node_skipped);
596 597 598
#endif
			break;
		}
599 600
	}

J
Jens Låås 已提交
601
	/* Return if at least one inflate is run */
E
Eric Dumazet 已提交
602
	if (max_work != MAX_WORK)
A
Alexander Duyck 已提交
603
		return tn;
J
Jens Låås 已提交
604

605 606 607 608
	/*
	 * Halve as long as the number of empty children in this
	 * node is above threshold.
	 */
609

J
Jens Låås 已提交
610 611
	max_work = MAX_WORK;
	while (tn->bits > 1 &&  max_work-- &&
612
	       100 * (tnode_child_length(tn) - tn->empty_children) <
613
	       halve_threshold_use * tnode_child_length(tn)) {
614

615 616 617 618
		old_tn = tn;
		tn = halve(t, tn);
		if (IS_ERR(tn)) {
			tn = old_tn;
619
#ifdef CONFIG_IP_FIB_TRIE_STATS
620
			this_cpu_inc(t->stats->resize_node_skipped);
621 622 623 624
#endif
			break;
		}
	}
625

626

627
	/* Only one child remains */
628 629
	if (tn->empty_children == (tnode_child_length(tn) - 1)) {
		unsigned long i;
J
Jens Låås 已提交
630
one_child:
631 632 633 634 635 636 637
		for (i = tnode_child_length(tn); !n && i;)
			n = tnode_get_child(tn, --i);
no_children:
		/* compress one level */
		node_set_parent(n, NULL);
		tnode_free_safe(tn);
		return n;
J
Jens Låås 已提交
638
	}
A
Alexander Duyck 已提交
639
	return tn;
640 641
}

E
Eric Dumazet 已提交
642 643 644

static void tnode_clean_free(struct tnode *tn)
{
A
Alexander Duyck 已提交
645
	struct tnode *tofree;
E
Eric Dumazet 已提交
646 647 648
	int i;

	for (i = 0; i < tnode_child_length(tn); i++) {
649
		tofree = rtnl_dereference(tn->child[i]);
E
Eric Dumazet 已提交
650
		if (tofree)
651
			node_free(tofree);
E
Eric Dumazet 已提交
652
	}
653
	node_free(tn);
E
Eric Dumazet 已提交
654 655
}

A
Alexander Duyck 已提交
656
static struct tnode *inflate(struct trie *t, struct tnode *oldtnode)
657
{
A
Alexander Duyck 已提交
658 659
	int olen = tnode_child_length(oldtnode);
	struct tnode *tn;
660 661
	int i;

S
Stephen Hemminger 已提交
662
	pr_debug("In inflate\n");
663 664 665

	tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1);

S
Stephen Hemminger 已提交
666
	if (!tn)
667
		return ERR_PTR(-ENOMEM);
668 669

	/*
670 671 672
	 * Preallocate and store tnodes before the actual work so we
	 * don't get into an inconsistent state if memory allocation
	 * fails. In case of failure we return the oldnode and  inflate
673 674
	 * of tnode is ignored.
	 */
O
Olof Johansson 已提交
675 676

	for (i = 0; i < olen; i++) {
677
		struct tnode *inode;
678

A
Alexander Duyck 已提交
679 680
		inode = tnode_get_child(oldtnode, i);
		if (tnode_full(oldtnode, inode) && inode->bits > 1) {
681
			struct tnode *left, *right;
S
Stephen Hemminger 已提交
682
			t_key m = ~0U << (KEYLENGTH - 1) >> inode->pos;
683

684 685
			left = tnode_new(inode->key&(~m), inode->pos + 1,
					 inode->bits - 1);
686 687
			if (!left)
				goto nomem;
O
Olof Johansson 已提交
688

689 690 691
			right = tnode_new(inode->key|m, inode->pos + 1,
					  inode->bits - 1);

692
			if (!right) {
693
				node_free(left);
694
				goto nomem;
695
			}
696

A
Alexander Duyck 已提交
697 698
			put_child(tn, 2*i, left);
			put_child(tn, 2*i+1, right);
699 700 701
		}
	}

O
Olof Johansson 已提交
702
	for (i = 0; i < olen; i++) {
A
Alexander Duyck 已提交
703
		struct tnode *inode = tnode_get_child(oldtnode, i);
O
Olof Johansson 已提交
704 705
		struct tnode *left, *right;
		int size, j;
706

707
		/* An empty child */
A
Alexander Duyck 已提交
708
		if (inode == NULL)
709 710 711
			continue;

		/* A leaf or an internal node with skipped bits */
A
Alexander Duyck 已提交
712
		if (!tnode_full(oldtnode, inode)) {
713
			put_child(tn,
A
Alexander Duyck 已提交
714 715
				tkey_extract_bits(inode->key, tn->pos, tn->bits),
				inode);
716 717 718 719 720
			continue;
		}

		/* An internal node with two children */
		if (inode->bits == 1) {
L
Lin Ming 已提交
721 722
			put_child(tn, 2*i, rtnl_dereference(inode->child[0]));
			put_child(tn, 2*i+1, rtnl_dereference(inode->child[1]));
723

J
Jarek Poplawski 已提交
724
			tnode_free_safe(inode);
O
Olof Johansson 已提交
725
			continue;
726 727
		}

O
Olof Johansson 已提交
728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745
		/* An internal node with more than two children */

		/* We will replace this node 'inode' with two new
		 * ones, 'left' and 'right', each with half of the
		 * original children. The two new nodes will have
		 * a position one bit further down the key and this
		 * means that the "significant" part of their keys
		 * (see the discussion near the top of this file)
		 * will differ by one bit, which will be "0" in
		 * left's key and "1" in right's key. Since we are
		 * moving the key position by one step, the bit that
		 * we are moving away from - the bit at position
		 * (inode->pos) - is the one that will differ between
		 * left and right. So... we synthesize that bit in the
		 * two  new keys.
		 * The mask 'm' below will be a single "one" bit at
		 * the position (inode->pos)
		 */
746

O
Olof Johansson 已提交
747 748 749
		/* Use the old key, but set the new significant
		 *   bit to zero.
		 */
750

A
Alexander Duyck 已提交
751
		left = tnode_get_child(tn, 2*i);
L
Lin Ming 已提交
752
		put_child(tn, 2*i, NULL);
753

O
Olof Johansson 已提交
754
		BUG_ON(!left);
755

A
Alexander Duyck 已提交
756
		right = tnode_get_child(tn, 2*i+1);
L
Lin Ming 已提交
757
		put_child(tn, 2*i+1, NULL);
758

O
Olof Johansson 已提交
759
		BUG_ON(!right);
760

O
Olof Johansson 已提交
761 762
		size = tnode_child_length(left);
		for (j = 0; j < size; j++) {
L
Lin Ming 已提交
763 764
			put_child(left, j, rtnl_dereference(inode->child[j]));
			put_child(right, j, rtnl_dereference(inode->child[j + size]));
765
		}
L
Lin Ming 已提交
766 767
		put_child(tn, 2*i, resize(t, left));
		put_child(tn, 2*i+1, resize(t, right));
O
Olof Johansson 已提交
768

J
Jarek Poplawski 已提交
769
		tnode_free_safe(inode);
770
	}
J
Jarek Poplawski 已提交
771
	tnode_free_safe(oldtnode);
772
	return tn;
773
nomem:
E
Eric Dumazet 已提交
774 775
	tnode_clean_free(tn);
	return ERR_PTR(-ENOMEM);
776 777
}

A
Alexander Duyck 已提交
778
static struct tnode *halve(struct trie *t, struct tnode *oldtnode)
779
{
A
Alexander Duyck 已提交
780 781
	int olen = tnode_child_length(oldtnode);
	struct tnode *tn, *left, *right;
782 783
	int i;

S
Stephen Hemminger 已提交
784
	pr_debug("In halve\n");
785 786

	tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1);
787

788 789
	if (!tn)
		return ERR_PTR(-ENOMEM);
790 791

	/*
792 793 794
	 * Preallocate and store tnodes before the actual work so we
	 * don't get into an inconsistent state if memory allocation
	 * fails. In case of failure we return the oldnode and halve
795 796 797
	 * of tnode is ignored.
	 */

O
Olof Johansson 已提交
798
	for (i = 0; i < olen; i += 2) {
799 800
		left = tnode_get_child(oldtnode, i);
		right = tnode_get_child(oldtnode, i+1);
801

802
		/* Two nonempty children */
S
Stephen Hemminger 已提交
803
		if (left && right) {
804
			struct tnode *newn;
S
Stephen Hemminger 已提交
805

806
			newn = tnode_new(left->key, tn->pos + tn->bits, 1);
S
Stephen Hemminger 已提交
807 808

			if (!newn)
809
				goto nomem;
S
Stephen Hemminger 已提交
810

A
Alexander Duyck 已提交
811
			put_child(tn, i/2, newn);
812 813 814
		}

	}
815

O
Olof Johansson 已提交
816 817 818
	for (i = 0; i < olen; i += 2) {
		struct tnode *newBinNode;

819 820
		left = tnode_get_child(oldtnode, i);
		right = tnode_get_child(oldtnode, i+1);
821

822 823 824 825
		/* At least one of the children is empty */
		if (left == NULL) {
			if (right == NULL)    /* Both are empty */
				continue;
L
Lin Ming 已提交
826
			put_child(tn, i/2, right);
O
Olof Johansson 已提交
827
			continue;
S
Stephen Hemminger 已提交
828
		}
O
Olof Johansson 已提交
829 830

		if (right == NULL) {
L
Lin Ming 已提交
831
			put_child(tn, i/2, left);
O
Olof Johansson 已提交
832 833
			continue;
		}
834

835
		/* Two nonempty children */
A
Alexander Duyck 已提交
836
		newBinNode = tnode_get_child(tn, i/2);
L
Lin Ming 已提交
837 838 839 840
		put_child(tn, i/2, NULL);
		put_child(newBinNode, 0, left);
		put_child(newBinNode, 1, right);
		put_child(tn, i/2, resize(t, newBinNode));
841
	}
J
Jarek Poplawski 已提交
842
	tnode_free_safe(oldtnode);
843
	return tn;
844
nomem:
E
Eric Dumazet 已提交
845 846
	tnode_clean_free(tn);
	return ERR_PTR(-ENOMEM);
847 848
}

R
Robert Olsson 已提交
849
/* readside must use rcu_read_lock currently dump routines
R
Robert Olsson 已提交
850 851
 via get_fa_head and dump */

A
Alexander Duyck 已提交
852
static struct leaf_info *find_leaf_info(struct tnode *l, int plen)
853
{
R
Robert Olsson 已提交
854
	struct hlist_head *head = &l->list;
855 856
	struct leaf_info *li;

857
	hlist_for_each_entry_rcu(li, head, hlist)
858
		if (li->plen == plen)
859
			return li;
O
Olof Johansson 已提交
860

861 862 863
	return NULL;
}

A
Alexander Duyck 已提交
864
static inline struct list_head *get_fa_head(struct tnode *l, int plen)
865
{
R
Robert Olsson 已提交
866
	struct leaf_info *li = find_leaf_info(l, plen);
867

O
Olof Johansson 已提交
868 869
	if (!li)
		return NULL;
870

O
Olof Johansson 已提交
871
	return &li->falh;
872 873 874 875
}

static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new)
{
876 877 878 879 880
	struct leaf_info *li = NULL, *last = NULL;

	if (hlist_empty(head)) {
		hlist_add_head_rcu(&new->hlist, head);
	} else {
881
		hlist_for_each_entry(li, head, hlist) {
882 883 884 885 886 887
			if (new->plen > li->plen)
				break;

			last = li;
		}
		if (last)
888
			hlist_add_behind_rcu(&new->hlist, &last->hlist);
889 890 891
		else
			hlist_add_before_rcu(&new->hlist, &li->hlist);
	}
892 893
}

R
Robert Olsson 已提交
894
/* rcu_read_lock needs to be hold by caller from readside */
A
Alexander Duyck 已提交
895
static struct tnode *fib_find_node(struct trie *t, u32 key)
896
{
A
Alexander Duyck 已提交
897
	struct tnode *n = rcu_dereference_rtnl(t->trie);
A
Alexander Duyck 已提交
898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916

	while (n) {
		unsigned long index = get_index(key, n);

		/* This bit of code is a bit tricky but it combines multiple
		 * checks into a single check.  The prefix consists of the
		 * prefix plus zeros for the bits in the cindex. The index
		 * is the difference between the key and this value.  From
		 * this we can actually derive several pieces of data.
		 *   if !(index >> bits)
		 *     we know the value is cindex
		 *   else
		 *     we have a mismatch in skip bits and failed
		 */
		if (index >> n->bits)
			return NULL;

		/* we have found a leaf. Prefixes have already been compared */
		if (IS_LEAF(n))
917 918
			break;

A
Alexander Duyck 已提交
919 920
		n = rcu_dereference_rtnl(n->child[index]);
	}
O
Olof Johansson 已提交
921

A
Alexander Duyck 已提交
922
	return n;
923 924
}

925
static void trie_rebalance(struct trie *t, struct tnode *tn)
926 927
{
	int wasfull;
R
Robert Olsson 已提交
928
	t_key cindex, key;
S
Stephen Hemminger 已提交
929
	struct tnode *tp;
930

R
Robert Olsson 已提交
931 932
	key = tn->key;

933
	while (tn != NULL && (tp = node_parent(tn)) != NULL) {
934 935
		cindex = tkey_extract_bits(key, tp->pos, tp->bits);
		wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
A
Alexander Duyck 已提交
936
		tn = resize(t, tn);
937

A
Alexander Duyck 已提交
938
		tnode_put_child_reorg(tp, cindex, tn, wasfull);
O
Olof Johansson 已提交
939

940
		tp = node_parent(tn);
941
		if (!tp)
A
Alexander Duyck 已提交
942
			rcu_assign_pointer(t->trie, tn);
943

J
Jarek Poplawski 已提交
944
		tnode_free_flush();
S
Stephen Hemminger 已提交
945
		if (!tp)
946
			break;
S
Stephen Hemminger 已提交
947
		tn = tp;
948
	}
S
Stephen Hemminger 已提交
949

950
	/* Handle last (top) tnode */
951
	if (IS_TNODE(tn))
A
Alexander Duyck 已提交
952
		tn = resize(t, tn);
953

A
Alexander Duyck 已提交
954
	rcu_assign_pointer(t->trie, tn);
955
	tnode_free_flush();
956 957
}

R
Robert Olsson 已提交
958 959
/* only used from updater-side */

960
static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
961 962 963
{
	int pos, newpos;
	struct tnode *tp = NULL, *tn = NULL;
A
Alexander Duyck 已提交
964 965
	struct tnode *n;
	struct tnode *l;
966
	int missbit;
967
	struct list_head *fa_head = NULL;
968 969 970 971
	struct leaf_info *li;
	t_key cindex;

	pos = 0;
E
Eric Dumazet 已提交
972
	n = rtnl_dereference(t->trie);
973

974 975
	/* If we point to NULL, stop. Either the tree is empty and we should
	 * just put a new leaf in if, or we have reached an empty child slot,
976
	 * and we should just put our new leaf in that.
977 978
	 * If we point to a T_TNODE, check if it matches our key. Note that
	 * a T_TNODE might be skipping any number of bits - its 'pos' need
979 980
	 * not be the parent's 'pos'+'bits'!
	 *
981
	 * If it does match the current key, get pos/bits from it, extract
982 983 984 985
	 * the index from our key, push the T_TNODE and walk the tree.
	 *
	 * If it doesn't, we have to replace it with a new T_TNODE.
	 *
986 987 988
	 * If we point to a T_LEAF, it might or might not have the same key
	 * as we do. If it does, just change the value, update the T_LEAF's
	 * value, and return it.
989 990 991
	 * If it doesn't, we need to replace it with a T_TNODE.
	 */

992
	while (n && IS_TNODE(n)) {
A
Alexander Duyck 已提交
993 994 995 996
		if (tkey_sub_equals(n->key, pos, n->pos-pos, key)) {
			tp = n;
			pos = n->pos + n->bits;
			n = tnode_get_child(n,
997
					    tkey_extract_bits(key,
A
Alexander Duyck 已提交
998 999
							      n->pos,
							      n->bits));
1000

A
Alexander Duyck 已提交
1001
			BUG_ON(n && node_parent(n) != tp);
O
Olof Johansson 已提交
1002
		} else
1003 1004 1005 1006 1007 1008
			break;
	}

	/*
	 * n  ----> NULL, LEAF or TNODE
	 *
1009
	 * tp is n's (parent) ----> NULL or TNODE
1010 1011
	 */

O
Olof Johansson 已提交
1012
	BUG_ON(tp && IS_LEAF(tp));
1013 1014 1015

	/* Case 1: n is a leaf. Compare prefixes */

1016
	if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) {
1017
		li = leaf_info_new(plen);
O
Olof Johansson 已提交
1018

1019 1020
		if (!li)
			return NULL;
1021 1022

		fa_head = &li->falh;
A
Alexander Duyck 已提交
1023
		insert_leaf_info(&n->list, li);
1024 1025
		goto done;
	}
1026
	l = leaf_new(key);
1027

1028 1029
	if (!l)
		return NULL;
1030 1031 1032

	li = leaf_info_new(plen);

1033
	if (!li) {
1034
		node_free(l);
1035
		return NULL;
1036
	}
1037 1038 1039 1040 1041

	fa_head = &li->falh;
	insert_leaf_info(&l->list, li);

	if (t->trie && n == NULL) {
O
Olof Johansson 已提交
1042
		/* Case 2: n is NULL, and will just insert a new leaf */
1043

A
Alexander Duyck 已提交
1044
		node_set_parent(l, tp);
1045

O
Olof Johansson 已提交
1046
		cindex = tkey_extract_bits(key, tp->pos, tp->bits);
A
Alexander Duyck 已提交
1047
		put_child(tp, cindex, l);
O
Olof Johansson 已提交
1048 1049
	} else {
		/* Case 3: n is a LEAF or a TNODE and the key doesn't match. */
1050 1051
		/*
		 *  Add a new tnode here
1052 1053 1054
		 *  first tnode need some special handling
		 */

1055
		if (n) {
1056
			pos = tp ? tp->pos+tp->bits : 0;
1057 1058
			newpos = tkey_mismatch(key, pos, n->key);
			tn = tnode_new(n->key, newpos, 1);
O
Olof Johansson 已提交
1059
		} else {
1060
			newpos = 0;
1061
			tn = tnode_new(key, newpos, 1); /* First tnode */
1062 1063
		}

1064
		if (!tn) {
1065
			free_leaf_info(li);
1066
			node_free(l);
1067
			return NULL;
O
Olof Johansson 已提交
1068 1069
		}

A
Alexander Duyck 已提交
1070
		node_set_parent(tn, tp);
1071

O
Olof Johansson 已提交
1072
		missbit = tkey_extract_bits(key, newpos, 1);
A
Alexander Duyck 已提交
1073
		put_child(tn, missbit, l);
L
Lin Ming 已提交
1074
		put_child(tn, 1-missbit, n);
1075

1076
		if (tp) {
1077
			cindex = tkey_extract_bits(key, tp->pos, tp->bits);
A
Alexander Duyck 已提交
1078
			put_child(tp, cindex, tn);
O
Olof Johansson 已提交
1079
		} else {
A
Alexander Duyck 已提交
1080
			rcu_assign_pointer(t->trie, tn);
1081
		}
1082 1083

		tp = tn;
1084
	}
O
Olof Johansson 已提交
1085 1086

	if (tp && tp->pos + tp->bits > 32)
J
Joe Perches 已提交
1087 1088
		pr_warn("fib_trie tp=%p pos=%d, bits=%d, key=%0x plen=%d\n",
			tp, tp->pos, tp->bits, key, plen);
O
Olof Johansson 已提交
1089

1090
	/* Rebalance the trie */
R
Robert Olsson 已提交
1091

1092
	trie_rebalance(t, tp);
1093
done:
1094 1095 1096
	return fa_head;
}

1097 1098 1099
/*
 * Caller must hold RTNL.
 */
1100
int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
1101 1102 1103
{
	struct trie *t = (struct trie *) tb->tb_data;
	struct fib_alias *fa, *new_fa;
1104
	struct list_head *fa_head = NULL;
1105
	struct fib_info *fi;
1106 1107
	int plen = cfg->fc_dst_len;
	u8 tos = cfg->fc_tos;
1108 1109
	u32 key, mask;
	int err;
A
Alexander Duyck 已提交
1110
	struct tnode *l;
1111 1112 1113 1114

	if (plen > 32)
		return -EINVAL;

1115
	key = ntohl(cfg->fc_dst);
1116

1117
	pr_debug("Insert table=%u %08x/%d\n", tb->tb_id, key, plen);
1118

O
Olof Johansson 已提交
1119
	mask = ntohl(inet_make_mask(plen));
1120

1121
	if (key & ~mask)
1122 1123 1124 1125
		return -EINVAL;

	key = key & mask;

1126 1127 1128
	fi = fib_create_info(cfg);
	if (IS_ERR(fi)) {
		err = PTR_ERR(fi);
1129
		goto err;
1130
	}
1131 1132

	l = fib_find_node(t, key);
1133
	fa = NULL;
1134

1135
	if (l) {
1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150
		fa_head = get_fa_head(l, plen);
		fa = fib_find_alias(fa_head, tos, fi->fib_priority);
	}

	/* Now fa, if non-NULL, points to the first fib alias
	 * with the same keys [prefix,tos,priority], if such key already
	 * exists or to the node before which we will insert new one.
	 *
	 * If fa is NULL, we will need to allocate a new one and
	 * insert to the head of f.
	 *
	 * If f is NULL, no fib node matched the destination key
	 * and we need to allocate a new one of those as well.
	 */

1151 1152 1153
	if (fa && fa->fa_tos == tos &&
	    fa->fa_info->fib_priority == fi->fib_priority) {
		struct fib_alias *fa_first, *fa_match;
1154 1155

		err = -EEXIST;
1156
		if (cfg->fc_nlflags & NLM_F_EXCL)
1157 1158
			goto out;

1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178
		/* We have 2 goals:
		 * 1. Find exact match for type, scope, fib_info to avoid
		 * duplicate routes
		 * 2. Find next 'fa' (or head), NLM_F_APPEND inserts before it
		 */
		fa_match = NULL;
		fa_first = fa;
		fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
		list_for_each_entry_continue(fa, fa_head, fa_list) {
			if (fa->fa_tos != tos)
				break;
			if (fa->fa_info->fib_priority != fi->fib_priority)
				break;
			if (fa->fa_type == cfg->fc_type &&
			    fa->fa_info == fi) {
				fa_match = fa;
				break;
			}
		}

1179
		if (cfg->fc_nlflags & NLM_F_REPLACE) {
1180 1181 1182
			struct fib_info *fi_drop;
			u8 state;

1183 1184 1185 1186
			fa = fa_first;
			if (fa_match) {
				if (fa == fa_match)
					err = 0;
1187
				goto out;
1188
			}
R
Robert Olsson 已提交
1189
			err = -ENOBUFS;
1190
			new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
R
Robert Olsson 已提交
1191 1192
			if (new_fa == NULL)
				goto out;
1193 1194

			fi_drop = fa->fa_info;
R
Robert Olsson 已提交
1195 1196
			new_fa->fa_tos = fa->fa_tos;
			new_fa->fa_info = fi;
1197
			new_fa->fa_type = cfg->fc_type;
1198
			state = fa->fa_state;
1199
			new_fa->fa_state = state & ~FA_S_ACCESSED;
1200

R
Robert Olsson 已提交
1201 1202
			list_replace_rcu(&fa->fa_list, &new_fa->fa_list);
			alias_free_mem_rcu(fa);
1203 1204 1205

			fib_release_info(fi_drop);
			if (state & FA_S_ACCESSED)
1206
				rt_cache_flush(cfg->fc_nlinfo.nl_net);
1207 1208
			rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen,
				tb->tb_id, &cfg->fc_nlinfo, NLM_F_REPLACE);
1209

O
Olof Johansson 已提交
1210
			goto succeeded;
1211 1212 1213 1214 1215
		}
		/* Error if we find a perfect match which
		 * uses the same scope, type, and nexthop
		 * information.
		 */
1216 1217
		if (fa_match)
			goto out;
1218

1219
		if (!(cfg->fc_nlflags & NLM_F_APPEND))
1220
			fa = fa_first;
1221 1222
	}
	err = -ENOENT;
1223
	if (!(cfg->fc_nlflags & NLM_F_CREATE))
1224 1225 1226
		goto out;

	err = -ENOBUFS;
1227
	new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
1228 1229 1230 1231 1232
	if (new_fa == NULL)
		goto out;

	new_fa->fa_info = fi;
	new_fa->fa_tos = tos;
1233
	new_fa->fa_type = cfg->fc_type;
1234 1235 1236 1237 1238
	new_fa->fa_state = 0;
	/*
	 * Insert new entry to the list.
	 */

1239
	if (!fa_head) {
1240 1241 1242
		fa_head = fib_insert_node(t, key, plen);
		if (unlikely(!fa_head)) {
			err = -ENOMEM;
1243
			goto out_free_new_fa;
1244
		}
1245
	}
1246

1247 1248 1249
	if (!plen)
		tb->tb_num_default++;

R
Robert Olsson 已提交
1250 1251
	list_add_tail_rcu(&new_fa->fa_list,
			  (fa ? &fa->fa_list : fa_head));
1252

1253
	rt_cache_flush(cfg->fc_nlinfo.nl_net);
1254
	rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id,
1255
		  &cfg->fc_nlinfo, 0);
1256 1257
succeeded:
	return 0;
1258 1259 1260

out_free_new_fa:
	kmem_cache_free(fn_alias_kmem, new_fa);
1261 1262
out:
	fib_release_info(fi);
O
Olof Johansson 已提交
1263
err:
1264 1265 1266
	return err;
}

R
Robert Olsson 已提交
1267
/* should be called with rcu_read_lock */
A
Alexander Duyck 已提交
1268
static int check_leaf(struct fib_table *tb, struct trie *t, struct tnode *l,
1269
		      t_key key,  const struct flowi4 *flp,
E
Eric Dumazet 已提交
1270
		      struct fib_result *res, int fib_flags)
1271 1272 1273
{
	struct leaf_info *li;
	struct hlist_head *hhead = &l->list;
1274

1275
	hlist_for_each_entry_rcu(li, hhead, hlist) {
1276
		struct fib_alias *fa;
1277

1278
		if (l->key != (key & li->mask_plen))
1279 1280
			continue;

1281 1282 1283
		list_for_each_entry_rcu(fa, &li->falh, fa_list) {
			struct fib_info *fi = fa->fa_info;
			int nhsel, err;
1284

1285
			if (fa->fa_tos && fa->fa_tos != flp->flowi4_tos)
1286
				continue;
1287 1288
			if (fi->fib_dead)
				continue;
1289
			if (fa->fa_info->fib_scope < flp->flowi4_scope)
1290 1291 1292
				continue;
			fib_alias_accessed(fa);
			err = fib_props[fa->fa_type].error;
1293
			if (unlikely(err < 0)) {
1294
#ifdef CONFIG_IP_FIB_TRIE_STATS
1295
				this_cpu_inc(t->stats->semantic_match_passed);
1296
#endif
1297
				return err;
1298 1299 1300 1301 1302 1303 1304 1305
			}
			if (fi->fib_flags & RTNH_F_DEAD)
				continue;
			for (nhsel = 0; nhsel < fi->fib_nhs; nhsel++) {
				const struct fib_nh *nh = &fi->fib_nh[nhsel];

				if (nh->nh_flags & RTNH_F_DEAD)
					continue;
1306
				if (flp->flowi4_oif && flp->flowi4_oif != nh->nh_oif)
1307 1308 1309
					continue;

#ifdef CONFIG_IP_FIB_TRIE_STATS
1310
				this_cpu_inc(t->stats->semantic_match_passed);
1311
#endif
1312
				res->prefixlen = li->plen;
1313 1314
				res->nh_sel = nhsel;
				res->type = fa->fa_type;
1315
				res->scope = fi->fib_scope;
1316 1317 1318 1319
				res->fi = fi;
				res->table = tb;
				res->fa_head = &li->falh;
				if (!(fib_flags & FIB_LOOKUP_NOREF))
1320
					atomic_inc(&fi->fib_clntref);
1321 1322 1323 1324 1325
				return 0;
			}
		}

#ifdef CONFIG_IP_FIB_TRIE_STATS
1326
		this_cpu_inc(t->stats->semantic_match_miss);
1327 1328
#endif
	}
1329

1330
	return 1;
1331 1332
}

1333 1334 1335 1336 1337 1338 1339
static inline t_key prefix_mismatch(t_key key, struct tnode *n)
{
	t_key prefix = n->key;

	return (key ^ prefix) & (prefix | -prefix);
}

1340
int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp,
E
Eric Dumazet 已提交
1341
		     struct fib_result *res, int fib_flags)
1342
{
1343
	struct trie *t = (struct trie *)tb->tb_data;
1344 1345 1346
#ifdef CONFIG_IP_FIB_TRIE_STATS
	struct trie_use_stats __percpu *stats = t->stats;
#endif
1347 1348 1349 1350
	const t_key key = ntohl(flp->daddr);
	struct tnode *n, *pn;
	t_key cindex;
	int ret = 1;
O
Olof Johansson 已提交
1351

R
Robert Olsson 已提交
1352
	rcu_read_lock();
O
Olof Johansson 已提交
1353

R
Robert Olsson 已提交
1354
	n = rcu_dereference(t->trie);
1355
	if (!n)
1356 1357 1358
		goto failed;

#ifdef CONFIG_IP_FIB_TRIE_STATS
1359
	this_cpu_inc(stats->gets);
1360 1361
#endif

A
Alexander Duyck 已提交
1362
	pn = n;
1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380
	cindex = 0;

	/* Step 1: Travel to the longest prefix match in the trie */
	for (;;) {
		unsigned long index = get_index(key, n);

		/* This bit of code is a bit tricky but it combines multiple
		 * checks into a single check.  The prefix consists of the
		 * prefix plus zeros for the "bits" in the prefix. The index
		 * is the difference between the key and this value.  From
		 * this we can actually derive several pieces of data.
		 *   if !(index >> bits)
		 *     we know the value is child index
		 *   else
		 *     we have a mismatch in skip bits and failed
		 */
		if (index >> n->bits)
			break;
1381

1382 1383
		/* we have found a leaf. Prefixes have already been compared */
		if (IS_LEAF(n))
1384
			goto found;
1385

1386 1387
		/* only record pn and cindex if we are going to be chopping
		 * bits later.  Otherwise we are just wasting cycles.
O
Olof Johansson 已提交
1388
		 */
1389 1390 1391
		if (index) {
			pn = n;
			cindex = index;
O
Olof Johansson 已提交
1392
		}
1393

1394 1395 1396 1397
		n = rcu_dereference(n->child[index]);
		if (unlikely(!n))
			goto backtrace;
	}
1398

1399 1400 1401 1402
	/* Step 2: Sort out leaves and begin backtracing for longest prefix */
	for (;;) {
		/* record the pointer where our next node pointer is stored */
		struct tnode __rcu **cptr = n->child;
1403

1404 1405 1406
		/* This test verifies that none of the bits that differ
		 * between the key and the prefix exist in the region of
		 * the lsb and higher in the prefix.
O
Olof Johansson 已提交
1407
		 */
1408 1409
		if (unlikely(prefix_mismatch(key, n)))
			goto backtrace;
O
Olof Johansson 已提交
1410

1411 1412 1413
		/* exit out and process leaf */
		if (unlikely(IS_LEAF(n)))
			break;
O
Olof Johansson 已提交
1414

1415 1416 1417
		/* Don't bother recording parent info.  Since we are in
		 * prefix match mode we will have to come back to wherever
		 * we started this traversal anyway
O
Olof Johansson 已提交
1418 1419
		 */

1420
		while ((n = rcu_dereference(*cptr)) == NULL) {
1421 1422
backtrace:
#ifdef CONFIG_IP_FIB_TRIE_STATS
1423 1424
			if (!n)
				this_cpu_inc(stats->null_node_hit);
1425
#endif
1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448
			/* If we are at cindex 0 there are no more bits for
			 * us to strip at this level so we must ascend back
			 * up one level to see if there are any more bits to
			 * be stripped there.
			 */
			while (!cindex) {
				t_key pkey = pn->key;

				pn = node_parent_rcu(pn);
				if (unlikely(!pn))
					goto failed;
#ifdef CONFIG_IP_FIB_TRIE_STATS
				this_cpu_inc(stats->backtrack);
#endif
				/* Get Child's index */
				cindex = get_index(pkey, pn);
			}

			/* strip the least significant bit from the cindex */
			cindex &= cindex - 1;

			/* grab pointer for next child node */
			cptr = &pn->child[cindex];
1449
		}
1450
	}
1451

1452
found:
1453 1454 1455 1456 1457
	/* Step 3: Process the leaf, if that fails fall back to backtracing */
	ret = check_leaf(tb, t, n, key, flp, res, fib_flags);
	if (unlikely(ret > 0))
		goto backtrace;
failed:
R
Robert Olsson 已提交
1458
	rcu_read_unlock();
1459 1460
	return ret;
}
1461
EXPORT_SYMBOL_GPL(fib_table_lookup);
1462

1463 1464 1465
/*
 * Remove the leaf and return parent.
 */
A
Alexander Duyck 已提交
1466
static void trie_leaf_remove(struct trie *t, struct tnode *l)
1467
{
1468
	struct tnode *tp = node_parent(l);
1469

1470
	pr_debug("entering trie_leaf_remove(%p)\n", l);
1471

1472
	if (tp) {
1473
		t_key cindex = tkey_extract_bits(l->key, tp->pos, tp->bits);
L
Lin Ming 已提交
1474
		put_child(tp, cindex, NULL);
1475
		trie_rebalance(t, tp);
O
Olof Johansson 已提交
1476
	} else
1477
		RCU_INIT_POINTER(t->trie, NULL);
1478

1479
	node_free(l);
1480 1481
}

1482 1483 1484
/*
 * Caller must hold RTNL.
 */
1485
int fib_table_delete(struct fib_table *tb, struct fib_config *cfg)
1486 1487 1488
{
	struct trie *t = (struct trie *) tb->tb_data;
	u32 key, mask;
1489 1490
	int plen = cfg->fc_dst_len;
	u8 tos = cfg->fc_tos;
1491 1492
	struct fib_alias *fa, *fa_to_delete;
	struct list_head *fa_head;
A
Alexander Duyck 已提交
1493
	struct tnode *l;
O
Olof Johansson 已提交
1494 1495
	struct leaf_info *li;

1496
	if (plen > 32)
1497 1498
		return -EINVAL;

1499
	key = ntohl(cfg->fc_dst);
O
Olof Johansson 已提交
1500
	mask = ntohl(inet_make_mask(plen));
1501

1502
	if (key & ~mask)
1503 1504 1505 1506 1507
		return -EINVAL;

	key = key & mask;
	l = fib_find_node(t, key);

1508
	if (!l)
1509 1510
		return -ESRCH;

1511 1512 1513 1514 1515 1516
	li = find_leaf_info(l, plen);

	if (!li)
		return -ESRCH;

	fa_head = &li->falh;
1517 1518 1519 1520 1521
	fa = fib_find_alias(fa_head, tos, 0);

	if (!fa)
		return -ESRCH;

S
Stephen Hemminger 已提交
1522
	pr_debug("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t);
1523 1524

	fa_to_delete = NULL;
1525 1526
	fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
	list_for_each_entry_continue(fa, fa_head, fa_list) {
1527 1528 1529 1530 1531
		struct fib_info *fi = fa->fa_info;

		if (fa->fa_tos != tos)
			break;

1532 1533
		if ((!cfg->fc_type || fa->fa_type == cfg->fc_type) &&
		    (cfg->fc_scope == RT_SCOPE_NOWHERE ||
1534
		     fa->fa_info->fib_scope == cfg->fc_scope) &&
1535 1536
		    (!cfg->fc_prefsrc ||
		     fi->fib_prefsrc == cfg->fc_prefsrc) &&
1537 1538 1539
		    (!cfg->fc_protocol ||
		     fi->fib_protocol == cfg->fc_protocol) &&
		    fib_nh_match(cfg, fi) == 0) {
1540 1541 1542 1543 1544
			fa_to_delete = fa;
			break;
		}
	}

O
Olof Johansson 已提交
1545 1546
	if (!fa_to_delete)
		return -ESRCH;
1547

O
Olof Johansson 已提交
1548
	fa = fa_to_delete;
1549
	rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id,
1550
		  &cfg->fc_nlinfo, 0);
O
Olof Johansson 已提交
1551

R
Robert Olsson 已提交
1552
	list_del_rcu(&fa->fa_list);
1553

1554 1555 1556
	if (!plen)
		tb->tb_num_default--;

O
Olof Johansson 已提交
1557
	if (list_empty(fa_head)) {
R
Robert Olsson 已提交
1558
		hlist_del_rcu(&li->hlist);
O
Olof Johansson 已提交
1559
		free_leaf_info(li);
R
Robert Olsson 已提交
1560
	}
1561

O
Olof Johansson 已提交
1562
	if (hlist_empty(&l->list))
1563
		trie_leaf_remove(t, l);
1564

O
Olof Johansson 已提交
1565
	if (fa->fa_state & FA_S_ACCESSED)
1566
		rt_cache_flush(cfg->fc_nlinfo.nl_net);
1567

R
Robert Olsson 已提交
1568 1569
	fib_release_info(fa->fa_info);
	alias_free_mem_rcu(fa);
O
Olof Johansson 已提交
1570
	return 0;
1571 1572
}

1573
static int trie_flush_list(struct list_head *head)
1574 1575 1576 1577 1578 1579 1580
{
	struct fib_alias *fa, *fa_node;
	int found = 0;

	list_for_each_entry_safe(fa, fa_node, head, fa_list) {
		struct fib_info *fi = fa->fa_info;

R
Robert Olsson 已提交
1581 1582 1583 1584
		if (fi && (fi->fib_flags & RTNH_F_DEAD)) {
			list_del_rcu(&fa->fa_list);
			fib_release_info(fa->fa_info);
			alias_free_mem_rcu(fa);
1585 1586 1587 1588 1589 1590
			found++;
		}
	}
	return found;
}

A
Alexander Duyck 已提交
1591
static int trie_flush_leaf(struct tnode *l)
1592 1593 1594
{
	int found = 0;
	struct hlist_head *lih = &l->list;
1595
	struct hlist_node *tmp;
1596 1597
	struct leaf_info *li = NULL;

1598
	hlist_for_each_entry_safe(li, tmp, lih, hlist) {
1599
		found += trie_flush_list(&li->falh);
1600 1601

		if (list_empty(&li->falh)) {
R
Robert Olsson 已提交
1602
			hlist_del_rcu(&li->hlist);
1603 1604 1605 1606 1607 1608
			free_leaf_info(li);
		}
	}
	return found;
}

1609 1610 1611 1612
/*
 * Scan for the next right leaf starting at node p->child[idx]
 * Since we have back pointer, no recursion necessary.
 */
A
Alexander Duyck 已提交
1613
static struct tnode *leaf_walk_rcu(struct tnode *p, struct tnode *c)
1614
{
1615 1616
	do {
		t_key idx;
1617 1618

		if (c)
1619
			idx = tkey_extract_bits(c->key, p->pos, p->bits) + 1;
1620
		else
1621
			idx = 0;
R
Robert Olsson 已提交
1622

1623 1624
		while (idx < 1u << p->bits) {
			c = tnode_get_child_rcu(p, idx++);
R
Robert Olsson 已提交
1625
			if (!c)
O
Olof Johansson 已提交
1626 1627
				continue;

1628
			if (IS_LEAF(c))
A
Alexander Duyck 已提交
1629
				return c;
1630 1631

			/* Rescan start scanning in new node */
A
Alexander Duyck 已提交
1632
			p = c;
1633
			idx = 0;
1634
		}
1635 1636

		/* Node empty, walk back up to parent */
A
Alexander Duyck 已提交
1637
		c = p;
E
Eric Dumazet 已提交
1638
	} while ((p = node_parent_rcu(c)) != NULL);
1639 1640 1641 1642

	return NULL; /* Root of trie */
}

A
Alexander Duyck 已提交
1643
static struct tnode *trie_firstleaf(struct trie *t)
1644
{
A
Alexander Duyck 已提交
1645
	struct tnode *n = rcu_dereference_rtnl(t->trie);
1646 1647 1648 1649 1650

	if (!n)
		return NULL;

	if (IS_LEAF(n))          /* trie is just a leaf */
A
Alexander Duyck 已提交
1651
		return n;
1652 1653 1654 1655

	return leaf_walk_rcu(n, NULL);
}

A
Alexander Duyck 已提交
1656
static struct tnode *trie_nextleaf(struct tnode *l)
1657
{
A
Alexander Duyck 已提交
1658
	struct tnode *p = node_parent_rcu(l);
1659 1660 1661 1662

	if (!p)
		return NULL;	/* trie with just one leaf */

A
Alexander Duyck 已提交
1663
	return leaf_walk_rcu(p, l);
1664 1665
}

A
Alexander Duyck 已提交
1666
static struct tnode *trie_leafindex(struct trie *t, int index)
1667
{
A
Alexander Duyck 已提交
1668
	struct tnode *l = trie_firstleaf(t);
1669

S
Stephen Hemminger 已提交
1670
	while (l && index-- > 0)
1671
		l = trie_nextleaf(l);
S
Stephen Hemminger 已提交
1672

1673 1674 1675 1676
	return l;
}


1677 1678 1679
/*
 * Caller must hold RTNL.
 */
1680
int fib_table_flush(struct fib_table *tb)
1681 1682
{
	struct trie *t = (struct trie *) tb->tb_data;
A
Alexander Duyck 已提交
1683
	struct tnode *l, *ll = NULL;
1684
	int found = 0;
1685

1686
	for (l = trie_firstleaf(t); l; l = trie_nextleaf(l)) {
1687
		found += trie_flush_leaf(l);
1688 1689

		if (ll && hlist_empty(&ll->list))
1690
			trie_leaf_remove(t, ll);
1691 1692 1693 1694
		ll = l;
	}

	if (ll && hlist_empty(&ll->list))
1695
		trie_leaf_remove(t, ll);
1696

S
Stephen Hemminger 已提交
1697
	pr_debug("trie_flush found=%d\n", found);
1698 1699 1700
	return found;
}

1701 1702
void fib_free_table(struct fib_table *tb)
{
1703 1704 1705 1706 1707
#ifdef CONFIG_IP_FIB_TRIE_STATS
	struct trie *t = (struct trie *)tb->tb_data;

	free_percpu(t->stats);
#endif /* CONFIG_IP_FIB_TRIE_STATS */
1708 1709 1710
	kfree(tb);
}

1711 1712
static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah,
			   struct fib_table *tb,
1713 1714 1715 1716
			   struct sk_buff *skb, struct netlink_callback *cb)
{
	int i, s_i;
	struct fib_alias *fa;
A
Al Viro 已提交
1717
	__be32 xkey = htonl(key);
1718

1719
	s_i = cb->args[5];
1720 1721
	i = 0;

R
Robert Olsson 已提交
1722 1723 1724
	/* rcu_read_lock is hold by caller */

	list_for_each_entry_rcu(fa, fah, fa_list) {
1725 1726 1727 1728 1729
		if (i < s_i) {
			i++;
			continue;
		}

1730
		if (fib_dump_info(skb, NETLINK_CB(cb->skb).portid,
1731 1732 1733 1734
				  cb->nlh->nlmsg_seq,
				  RTM_NEWROUTE,
				  tb->tb_id,
				  fa->fa_type,
1735
				  xkey,
1736 1737
				  plen,
				  fa->fa_tos,
1738
				  fa->fa_info, NLM_F_MULTI) < 0) {
1739
			cb->args[5] = i;
1740
			return -1;
O
Olof Johansson 已提交
1741
		}
1742 1743
		i++;
	}
1744
	cb->args[5] = i;
1745 1746 1747
	return skb->len;
}

A
Alexander Duyck 已提交
1748
static int fn_trie_dump_leaf(struct tnode *l, struct fib_table *tb,
1749
			struct sk_buff *skb, struct netlink_callback *cb)
1750
{
1751 1752
	struct leaf_info *li;
	int i, s_i;
1753

1754
	s_i = cb->args[4];
1755
	i = 0;
1756

1757
	/* rcu_read_lock is hold by caller */
1758
	hlist_for_each_entry_rcu(li, &l->list, hlist) {
1759 1760
		if (i < s_i) {
			i++;
1761
			continue;
1762
		}
O
Olof Johansson 已提交
1763

1764
		if (i > s_i)
1765
			cb->args[5] = 0;
1766

1767
		if (list_empty(&li->falh))
1768 1769
			continue;

1770
		if (fn_trie_dump_fa(l->key, li->plen, &li->falh, tb, skb, cb) < 0) {
1771
			cb->args[4] = i;
1772 1773
			return -1;
		}
1774
		i++;
1775
	}
1776

1777
	cb->args[4] = i;
1778 1779 1780
	return skb->len;
}

1781 1782
int fib_table_dump(struct fib_table *tb, struct sk_buff *skb,
		   struct netlink_callback *cb)
1783
{
A
Alexander Duyck 已提交
1784
	struct tnode *l;
1785
	struct trie *t = (struct trie *) tb->tb_data;
1786
	t_key key = cb->args[2];
1787
	int count = cb->args[3];
1788

R
Robert Olsson 已提交
1789
	rcu_read_lock();
1790 1791 1792
	/* Dump starting at last key.
	 * Note: 0.0.0.0/0 (ie default) is first key.
	 */
1793
	if (count == 0)
1794 1795
		l = trie_firstleaf(t);
	else {
1796 1797 1798
		/* Normally, continue from last key, but if that is missing
		 * fallback to using slow rescan
		 */
1799
		l = fib_find_node(t, key);
1800 1801
		if (!l)
			l = trie_leafindex(t, count);
1802
	}
1803

1804 1805
	while (l) {
		cb->args[2] = l->key;
1806
		if (fn_trie_dump_leaf(l, tb, skb, cb) < 0) {
1807
			cb->args[3] = count;
1808 1809
			rcu_read_unlock();
			return -1;
1810
		}
1811

1812
		++count;
1813
		l = trie_nextleaf(l);
1814 1815
		memset(&cb->args[4], 0,
		       sizeof(cb->args) - 4*sizeof(cb->args[0]));
1816
	}
1817
	cb->args[3] = count;
R
Robert Olsson 已提交
1818
	rcu_read_unlock();
1819

1820 1821 1822
	return skb->len;
}

1823
void __init fib_trie_init(void)
1824
{
1825 1826
	fn_alias_kmem = kmem_cache_create("ip_fib_alias",
					  sizeof(struct fib_alias),
1827 1828 1829
					  0, SLAB_PANIC, NULL);

	trie_leaf_kmem = kmem_cache_create("ip_fib_trie",
A
Alexander Duyck 已提交
1830
					   max(sizeof(struct tnode),
1831 1832
					       sizeof(struct leaf_info)),
					   0, SLAB_PANIC, NULL);
1833
}
1834

1835

1836
struct fib_table *fib_trie_table(u32 id)
1837 1838 1839 1840 1841 1842 1843 1844 1845 1846
{
	struct fib_table *tb;
	struct trie *t;

	tb = kmalloc(sizeof(struct fib_table) + sizeof(struct trie),
		     GFP_KERNEL);
	if (tb == NULL)
		return NULL;

	tb->tb_id = id;
1847
	tb->tb_default = -1;
1848
	tb->tb_num_default = 0;
1849 1850

	t = (struct trie *) tb->tb_data;
1851 1852 1853 1854 1855 1856 1857 1858
	RCU_INIT_POINTER(t->trie, NULL);
#ifdef CONFIG_IP_FIB_TRIE_STATS
	t->stats = alloc_percpu(struct trie_use_stats);
	if (!t->stats) {
		kfree(tb);
		tb = NULL;
	}
#endif
1859 1860 1861 1862

	return tb;
}

1863 1864 1865
#ifdef CONFIG_PROC_FS
/* Depth first Trie walk iterator */
struct fib_trie_iter {
1866
	struct seq_net_private p;
1867
	struct fib_table *tb;
1868
	struct tnode *tnode;
E
Eric Dumazet 已提交
1869 1870
	unsigned int index;
	unsigned int depth;
1871
};
1872

A
Alexander Duyck 已提交
1873
static struct tnode *fib_trie_get_next(struct fib_trie_iter *iter)
1874
{
1875
	struct tnode *tn = iter->tnode;
E
Eric Dumazet 已提交
1876
	unsigned int cindex = iter->index;
1877
	struct tnode *p;
1878

1879 1880 1881 1882
	/* A single entry routing table */
	if (!tn)
		return NULL;

1883 1884 1885 1886
	pr_debug("get_next iter={node=%p index=%d depth=%d}\n",
		 iter->tnode, iter->index, iter->depth);
rescan:
	while (cindex < (1<<tn->bits)) {
A
Alexander Duyck 已提交
1887
		struct tnode *n = tnode_get_child_rcu(tn, cindex);
1888

1889 1890 1891 1892 1893 1894
		if (n) {
			if (IS_LEAF(n)) {
				iter->tnode = tn;
				iter->index = cindex + 1;
			} else {
				/* push down one level */
A
Alexander Duyck 已提交
1895
				iter->tnode = n;
1896 1897 1898 1899 1900
				iter->index = 0;
				++iter->depth;
			}
			return n;
		}
1901

1902 1903
		++cindex;
	}
O
Olof Johansson 已提交
1904

1905
	/* Current node exhausted, pop back up */
A
Alexander Duyck 已提交
1906
	p = node_parent_rcu(tn);
1907 1908 1909 1910 1911
	if (p) {
		cindex = tkey_extract_bits(tn->key, p->pos, p->bits)+1;
		tn = p;
		--iter->depth;
		goto rescan;
1912
	}
1913 1914 1915

	/* got root? */
	return NULL;
1916 1917
}

A
Alexander Duyck 已提交
1918
static struct tnode *fib_trie_get_first(struct fib_trie_iter *iter,
1919
				       struct trie *t)
1920
{
A
Alexander Duyck 已提交
1921
	struct tnode *n;
1922

S
Stephen Hemminger 已提交
1923
	if (!t)
1924 1925 1926
		return NULL;

	n = rcu_dereference(t->trie);
1927
	if (!n)
1928
		return NULL;
1929

1930
	if (IS_TNODE(n)) {
A
Alexander Duyck 已提交
1931
		iter->tnode = n;
1932 1933 1934 1935 1936 1937
		iter->index = 0;
		iter->depth = 1;
	} else {
		iter->tnode = NULL;
		iter->index = 0;
		iter->depth = 0;
O
Olof Johansson 已提交
1938
	}
1939 1940

	return n;
1941
}
O
Olof Johansson 已提交
1942

1943 1944
static void trie_collect_stats(struct trie *t, struct trie_stat *s)
{
A
Alexander Duyck 已提交
1945
	struct tnode *n;
1946
	struct fib_trie_iter iter;
O
Olof Johansson 已提交
1947

1948
	memset(s, 0, sizeof(*s));
O
Olof Johansson 已提交
1949

1950
	rcu_read_lock();
1951
	for (n = fib_trie_get_first(&iter, t); n; n = fib_trie_get_next(&iter)) {
1952
		if (IS_LEAF(n)) {
1953 1954
			struct leaf_info *li;

1955 1956 1957 1958
			s->leaves++;
			s->totdepth += iter.depth;
			if (iter.depth > s->maxdepth)
				s->maxdepth = iter.depth;
1959

A
Alexander Duyck 已提交
1960
			hlist_for_each_entry_rcu(li, &n->list, hlist)
1961
				++s->prefixes;
1962 1963 1964 1965
		} else {
			int i;

			s->tnodes++;
A
Alexander Duyck 已提交
1966 1967
			if (n->bits < MAX_STAT_DEPTH)
				s->nodesizes[n->bits]++;
R
Robert Olsson 已提交
1968

A
Alexander Duyck 已提交
1969 1970
			for (i = 0; i < tnode_child_length(n); i++)
				if (!rcu_access_pointer(n->child[i]))
1971
					s->nullpointers++;
1972 1973
		}
	}
R
Robert Olsson 已提交
1974
	rcu_read_unlock();
1975 1976
}

1977 1978 1979 1980
/*
 *	This outputs /proc/net/fib_triestats
 */
static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat)
1981
{
E
Eric Dumazet 已提交
1982
	unsigned int i, max, pointers, bytes, avdepth;
1983

1984 1985 1986 1987
	if (stat->leaves)
		avdepth = stat->totdepth*100 / stat->leaves;
	else
		avdepth = 0;
O
Olof Johansson 已提交
1988

1989 1990
	seq_printf(seq, "\tAver depth:     %u.%02d\n",
		   avdepth / 100, avdepth % 100);
1991
	seq_printf(seq, "\tMax depth:      %u\n", stat->maxdepth);
O
Olof Johansson 已提交
1992

1993
	seq_printf(seq, "\tLeaves:         %u\n", stat->leaves);
A
Alexander Duyck 已提交
1994
	bytes = sizeof(struct tnode) * stat->leaves;
1995 1996 1997 1998

	seq_printf(seq, "\tPrefixes:       %u\n", stat->prefixes);
	bytes += sizeof(struct leaf_info) * stat->prefixes;

1999
	seq_printf(seq, "\tInternal nodes: %u\n\t", stat->tnodes);
2000
	bytes += sizeof(struct tnode) * stat->tnodes;
2001

R
Robert Olsson 已提交
2002 2003
	max = MAX_STAT_DEPTH;
	while (max > 0 && stat->nodesizes[max-1] == 0)
2004
		max--;
2005

2006
	pointers = 0;
2007
	for (i = 1; i < max; i++)
2008
		if (stat->nodesizes[i] != 0) {
2009
			seq_printf(seq, "  %u: %u",  i, stat->nodesizes[i]);
2010 2011 2012
			pointers += (1<<i) * stat->nodesizes[i];
		}
	seq_putc(seq, '\n');
2013
	seq_printf(seq, "\tPointers: %u\n", pointers);
R
Robert Olsson 已提交
2014

A
Alexander Duyck 已提交
2015
	bytes += sizeof(struct tnode *) * pointers;
2016 2017
	seq_printf(seq, "Null ptrs: %u\n", stat->nullpointers);
	seq_printf(seq, "Total size: %u  kB\n", (bytes + 1023) / 1024);
2018
}
R
Robert Olsson 已提交
2019

2020
#ifdef CONFIG_IP_FIB_TRIE_STATS
2021
static void trie_show_usage(struct seq_file *seq,
2022
			    const struct trie_use_stats __percpu *stats)
2023
{
2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038
	struct trie_use_stats s = { 0 };
	int cpu;

	/* loop through all of the CPUs and gather up the stats */
	for_each_possible_cpu(cpu) {
		const struct trie_use_stats *pcpu = per_cpu_ptr(stats, cpu);

		s.gets += pcpu->gets;
		s.backtrack += pcpu->backtrack;
		s.semantic_match_passed += pcpu->semantic_match_passed;
		s.semantic_match_miss += pcpu->semantic_match_miss;
		s.null_node_hit += pcpu->null_node_hit;
		s.resize_node_skipped += pcpu->resize_node_skipped;
	}

2039
	seq_printf(seq, "\nCounters:\n---------\n");
2040 2041
	seq_printf(seq, "gets = %u\n", s.gets);
	seq_printf(seq, "backtracks = %u\n", s.backtrack);
2042
	seq_printf(seq, "semantic match passed = %u\n",
2043 2044 2045 2046
		   s.semantic_match_passed);
	seq_printf(seq, "semantic match miss = %u\n", s.semantic_match_miss);
	seq_printf(seq, "null node hit= %u\n", s.null_node_hit);
	seq_printf(seq, "skipped node resize = %u\n\n", s.resize_node_skipped);
2047
}
2048 2049
#endif /*  CONFIG_IP_FIB_TRIE_STATS */

2050
static void fib_table_print(struct seq_file *seq, struct fib_table *tb)
2051
{
2052 2053 2054 2055 2056 2057
	if (tb->tb_id == RT_TABLE_LOCAL)
		seq_puts(seq, "Local:\n");
	else if (tb->tb_id == RT_TABLE_MAIN)
		seq_puts(seq, "Main:\n");
	else
		seq_printf(seq, "Id %d:\n", tb->tb_id);
2058
}
2059

2060

2061 2062
static int fib_triestat_seq_show(struct seq_file *seq, void *v)
{
2063
	struct net *net = (struct net *)seq->private;
2064
	unsigned int h;
2065

2066
	seq_printf(seq,
2067 2068
		   "Basic info: size of leaf:"
		   " %Zd bytes, size of tnode: %Zd bytes.\n",
A
Alexander Duyck 已提交
2069
		   sizeof(struct tnode), sizeof(struct tnode));
2070

2071 2072 2073 2074
	for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
		struct hlist_head *head = &net->ipv4.fib_table_hash[h];
		struct fib_table *tb;

2075
		hlist_for_each_entry_rcu(tb, head, tb_hlist) {
2076 2077
			struct trie *t = (struct trie *) tb->tb_data;
			struct trie_stat stat;
2078

2079 2080 2081 2082 2083 2084 2085 2086
			if (!t)
				continue;

			fib_table_print(seq, tb);

			trie_collect_stats(t, &stat);
			trie_show_stats(seq, &stat);
#ifdef CONFIG_IP_FIB_TRIE_STATS
2087
			trie_show_usage(seq, t->stats);
2088 2089 2090
#endif
		}
	}
2091

2092
	return 0;
2093 2094
}

2095
static int fib_triestat_seq_open(struct inode *inode, struct file *file)
2096
{
2097
	return single_open_net(inode, file, fib_triestat_seq_show);
2098 2099
}

2100
static const struct file_operations fib_triestat_fops = {
2101 2102 2103 2104
	.owner	= THIS_MODULE,
	.open	= fib_triestat_seq_open,
	.read	= seq_read,
	.llseek	= seq_lseek,
2105
	.release = single_release_net,
2106 2107
};

A
Alexander Duyck 已提交
2108
static struct tnode *fib_trie_get_idx(struct seq_file *seq, loff_t pos)
2109
{
2110 2111
	struct fib_trie_iter *iter = seq->private;
	struct net *net = seq_file_net(seq);
2112
	loff_t idx = 0;
2113
	unsigned int h;
2114

2115 2116 2117
	for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
		struct hlist_head *head = &net->ipv4.fib_table_hash[h];
		struct fib_table *tb;
2118

2119
		hlist_for_each_entry_rcu(tb, head, tb_hlist) {
A
Alexander Duyck 已提交
2120
			struct tnode *n;
2121 2122 2123 2124 2125 2126 2127 2128 2129

			for (n = fib_trie_get_first(iter,
						    (struct trie *) tb->tb_data);
			     n; n = fib_trie_get_next(iter))
				if (pos == idx++) {
					iter->tb = tb;
					return n;
				}
		}
2130
	}
2131

2132 2133 2134
	return NULL;
}

2135
static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos)
2136
	__acquires(RCU)
2137
{
2138
	rcu_read_lock();
2139
	return fib_trie_get_idx(seq, *pos);
2140 2141
}

2142
static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2143
{
2144
	struct fib_trie_iter *iter = seq->private;
2145
	struct net *net = seq_file_net(seq);
2146 2147 2148
	struct fib_table *tb = iter->tb;
	struct hlist_node *tb_node;
	unsigned int h;
A
Alexander Duyck 已提交
2149
	struct tnode *n;
2150

2151
	++*pos;
2152 2153 2154 2155
	/* next node in same table */
	n = fib_trie_get_next(iter);
	if (n)
		return n;
2156

2157 2158
	/* walk rest of this hash chain */
	h = tb->tb_id & (FIB_TABLE_HASHSZ - 1);
E
Eric Dumazet 已提交
2159
	while ((tb_node = rcu_dereference(hlist_next_rcu(&tb->tb_hlist)))) {
2160 2161 2162 2163 2164
		tb = hlist_entry(tb_node, struct fib_table, tb_hlist);
		n = fib_trie_get_first(iter, (struct trie *) tb->tb_data);
		if (n)
			goto found;
	}
2165

2166 2167 2168
	/* new hash chain */
	while (++h < FIB_TABLE_HASHSZ) {
		struct hlist_head *head = &net->ipv4.fib_table_hash[h];
2169
		hlist_for_each_entry_rcu(tb, head, tb_hlist) {
2170 2171 2172 2173 2174
			n = fib_trie_get_first(iter, (struct trie *) tb->tb_data);
			if (n)
				goto found;
		}
	}
2175
	return NULL;
2176 2177 2178 2179

found:
	iter->tb = tb;
	return n;
2180
}
2181

2182
static void fib_trie_seq_stop(struct seq_file *seq, void *v)
2183
	__releases(RCU)
2184
{
2185 2186
	rcu_read_unlock();
}
O
Olof Johansson 已提交
2187

2188 2189
static void seq_indent(struct seq_file *seq, int n)
{
E
Eric Dumazet 已提交
2190 2191
	while (n-- > 0)
		seq_puts(seq, "   ");
2192
}
2193

2194
static inline const char *rtn_scope(char *buf, size_t len, enum rt_scope_t s)
2195
{
S
Stephen Hemminger 已提交
2196
	switch (s) {
2197 2198 2199 2200 2201 2202
	case RT_SCOPE_UNIVERSE: return "universe";
	case RT_SCOPE_SITE:	return "site";
	case RT_SCOPE_LINK:	return "link";
	case RT_SCOPE_HOST:	return "host";
	case RT_SCOPE_NOWHERE:	return "nowhere";
	default:
2203
		snprintf(buf, len, "scope=%d", s);
2204 2205 2206
		return buf;
	}
}
2207

2208
static const char *const rtn_type_names[__RTN_MAX] = {
2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221
	[RTN_UNSPEC] = "UNSPEC",
	[RTN_UNICAST] = "UNICAST",
	[RTN_LOCAL] = "LOCAL",
	[RTN_BROADCAST] = "BROADCAST",
	[RTN_ANYCAST] = "ANYCAST",
	[RTN_MULTICAST] = "MULTICAST",
	[RTN_BLACKHOLE] = "BLACKHOLE",
	[RTN_UNREACHABLE] = "UNREACHABLE",
	[RTN_PROHIBIT] = "PROHIBIT",
	[RTN_THROW] = "THROW",
	[RTN_NAT] = "NAT",
	[RTN_XRESOLVE] = "XRESOLVE",
};
2222

E
Eric Dumazet 已提交
2223
static inline const char *rtn_type(char *buf, size_t len, unsigned int t)
2224 2225 2226
{
	if (t < __RTN_MAX && rtn_type_names[t])
		return rtn_type_names[t];
2227
	snprintf(buf, len, "type %u", t);
2228
	return buf;
2229 2230
}

2231 2232
/* Pretty print the trie */
static int fib_trie_seq_show(struct seq_file *seq, void *v)
2233
{
2234
	const struct fib_trie_iter *iter = seq->private;
A
Alexander Duyck 已提交
2235
	struct tnode *n = v;
2236

2237 2238
	if (!node_parent_rcu(n))
		fib_table_print(seq, iter->tb);
2239

2240
	if (IS_TNODE(n)) {
A
Alexander Duyck 已提交
2241
		__be32 prf = htonl(n->key);
O
Olof Johansson 已提交
2242

A
Alexander Duyck 已提交
2243
		seq_indent(seq, iter->depth - 1);
2244
		seq_printf(seq, "  +-- %pI4/%d %d %d %d\n",
A
Alexander Duyck 已提交
2245 2246
			   &prf, n->pos, n->bits, n->full_children,
			   n->empty_children);
2247
	} else {
2248
		struct leaf_info *li;
A
Alexander Duyck 已提交
2249
		__be32 val = htonl(n->key);
2250 2251

		seq_indent(seq, iter->depth);
2252
		seq_printf(seq, "  |-- %pI4\n", &val);
2253

A
Alexander Duyck 已提交
2254
		hlist_for_each_entry_rcu(li, &n->list, hlist) {
2255 2256 2257 2258 2259 2260 2261 2262
			struct fib_alias *fa;

			list_for_each_entry_rcu(fa, &li->falh, fa_list) {
				char buf1[32], buf2[32];

				seq_indent(seq, iter->depth+1);
				seq_printf(seq, "  /%d %s %s", li->plen,
					   rtn_scope(buf1, sizeof(buf1),
2263
						     fa->fa_info->fib_scope),
2264 2265 2266
					   rtn_type(buf2, sizeof(buf2),
						    fa->fa_type));
				if (fa->fa_tos)
2267
					seq_printf(seq, " tos=%d", fa->fa_tos);
2268
				seq_putc(seq, '\n');
2269 2270
			}
		}
2271
	}
2272

2273 2274 2275
	return 0;
}

2276
static const struct seq_operations fib_trie_seq_ops = {
2277 2278 2279 2280
	.start  = fib_trie_seq_start,
	.next   = fib_trie_seq_next,
	.stop   = fib_trie_seq_stop,
	.show   = fib_trie_seq_show,
2281 2282
};

2283
static int fib_trie_seq_open(struct inode *inode, struct file *file)
2284
{
2285 2286
	return seq_open_net(inode, file, &fib_trie_seq_ops,
			    sizeof(struct fib_trie_iter));
2287 2288
}

2289
static const struct file_operations fib_trie_fops = {
2290 2291 2292 2293
	.owner  = THIS_MODULE,
	.open   = fib_trie_seq_open,
	.read   = seq_read,
	.llseek = seq_lseek,
2294
	.release = seq_release_net,
2295 2296
};

2297 2298 2299 2300 2301 2302 2303
struct fib_route_iter {
	struct seq_net_private p;
	struct trie *main_trie;
	loff_t	pos;
	t_key	key;
};

A
Alexander Duyck 已提交
2304
static struct tnode *fib_route_get_idx(struct fib_route_iter *iter, loff_t pos)
2305
{
A
Alexander Duyck 已提交
2306
	struct tnode *l = NULL;
2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336
	struct trie *t = iter->main_trie;

	/* use cache location of last found key */
	if (iter->pos > 0 && pos >= iter->pos && (l = fib_find_node(t, iter->key)))
		pos -= iter->pos;
	else {
		iter->pos = 0;
		l = trie_firstleaf(t);
	}

	while (l && pos-- > 0) {
		iter->pos++;
		l = trie_nextleaf(l);
	}

	if (l)
		iter->key = pos;	/* remember it */
	else
		iter->pos = 0;		/* forget it */

	return l;
}

static void *fib_route_seq_start(struct seq_file *seq, loff_t *pos)
	__acquires(RCU)
{
	struct fib_route_iter *iter = seq->private;
	struct fib_table *tb;

	rcu_read_lock();
2337
	tb = fib_get_table(seq_file_net(seq), RT_TABLE_MAIN);
2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350
	if (!tb)
		return NULL;

	iter->main_trie = (struct trie *) tb->tb_data;
	if (*pos == 0)
		return SEQ_START_TOKEN;
	else
		return fib_route_get_idx(iter, *pos - 1);
}

static void *fib_route_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct fib_route_iter *iter = seq->private;
A
Alexander Duyck 已提交
2351
	struct tnode *l = v;
2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374

	++*pos;
	if (v == SEQ_START_TOKEN) {
		iter->pos = 0;
		l = trie_firstleaf(iter->main_trie);
	} else {
		iter->pos++;
		l = trie_nextleaf(l);
	}

	if (l)
		iter->key = l->key;
	else
		iter->pos = 0;
	return l;
}

static void fib_route_seq_stop(struct seq_file *seq, void *v)
	__releases(RCU)
{
	rcu_read_unlock();
}

E
Eric Dumazet 已提交
2375
static unsigned int fib_flag_trans(int type, __be32 mask, const struct fib_info *fi)
2376
{
E
Eric Dumazet 已提交
2377
	unsigned int flags = 0;
2378

E
Eric Dumazet 已提交
2379 2380
	if (type == RTN_UNREACHABLE || type == RTN_PROHIBIT)
		flags = RTF_REJECT;
2381 2382
	if (fi && fi->fib_nh->nh_gw)
		flags |= RTF_GATEWAY;
A
Al Viro 已提交
2383
	if (mask == htonl(0xFFFFFFFF))
2384 2385 2386
		flags |= RTF_HOST;
	flags |= RTF_UP;
	return flags;
2387 2388
}

2389 2390 2391
/*
 *	This outputs /proc/net/route.
 *	The format of the file is not supposed to be changed
E
Eric Dumazet 已提交
2392
 *	and needs to be same as fib_hash output to avoid breaking
2393 2394 2395
 *	legacy utilities
 */
static int fib_route_seq_show(struct seq_file *seq, void *v)
2396
{
A
Alexander Duyck 已提交
2397
	struct tnode *l = v;
2398
	struct leaf_info *li;
2399

2400 2401 2402 2403 2404 2405
	if (v == SEQ_START_TOKEN) {
		seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway "
			   "\tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU"
			   "\tWindow\tIRTT");
		return 0;
	}
2406

2407
	hlist_for_each_entry_rcu(li, &l->list, hlist) {
2408
		struct fib_alias *fa;
A
Al Viro 已提交
2409
		__be32 mask, prefix;
O
Olof Johansson 已提交
2410

2411 2412
		mask = inet_make_mask(li->plen);
		prefix = htonl(l->key);
2413

2414
		list_for_each_entry_rcu(fa, &li->falh, fa_list) {
2415
			const struct fib_info *fi = fa->fa_info;
E
Eric Dumazet 已提交
2416
			unsigned int flags = fib_flag_trans(fa->fa_type, mask, fi);
2417

2418 2419 2420
			if (fa->fa_type == RTN_BROADCAST
			    || fa->fa_type == RTN_MULTICAST)
				continue;
2421

2422 2423
			seq_setwidth(seq, 127);

2424
			if (fi)
2425 2426
				seq_printf(seq,
					 "%s\t%08X\t%08X\t%04X\t%d\t%u\t"
2427
					 "%d\t%08X\t%d\t%u\t%u",
2428 2429 2430 2431 2432
					 fi->fib_dev ? fi->fib_dev->name : "*",
					 prefix,
					 fi->fib_nh->nh_gw, flags, 0, 0,
					 fi->fib_priority,
					 mask,
2433 2434
					 (fi->fib_advmss ?
					  fi->fib_advmss + 40 : 0),
2435
					 fi->fib_window,
2436
					 fi->fib_rtt >> 3);
2437
			else
2438 2439
				seq_printf(seq,
					 "*\t%08X\t%08X\t%04X\t%d\t%u\t"
2440
					 "%d\t%08X\t%d\t%u\t%u",
2441
					 prefix, 0, flags, 0, 0, 0,
2442
					 mask, 0, 0, 0);
2443

2444
			seq_pad(seq, '\n');
2445
		}
2446 2447 2448 2449 2450
	}

	return 0;
}

2451
static const struct seq_operations fib_route_seq_ops = {
2452 2453 2454
	.start  = fib_route_seq_start,
	.next   = fib_route_seq_next,
	.stop   = fib_route_seq_stop,
2455
	.show   = fib_route_seq_show,
2456 2457
};

2458
static int fib_route_seq_open(struct inode *inode, struct file *file)
2459
{
2460
	return seq_open_net(inode, file, &fib_route_seq_ops,
2461
			    sizeof(struct fib_route_iter));
2462 2463
}

2464
static const struct file_operations fib_route_fops = {
2465 2466 2467 2468
	.owner  = THIS_MODULE,
	.open   = fib_route_seq_open,
	.read   = seq_read,
	.llseek = seq_lseek,
2469
	.release = seq_release_net,
2470 2471
};

2472
int __net_init fib_proc_init(struct net *net)
2473
{
2474
	if (!proc_create("fib_trie", S_IRUGO, net->proc_net, &fib_trie_fops))
2475 2476
		goto out1;

2477 2478
	if (!proc_create("fib_triestat", S_IRUGO, net->proc_net,
			 &fib_triestat_fops))
2479 2480
		goto out2;

2481
	if (!proc_create("route", S_IRUGO, net->proc_net, &fib_route_fops))
2482 2483
		goto out3;

2484
	return 0;
2485 2486

out3:
2487
	remove_proc_entry("fib_triestat", net->proc_net);
2488
out2:
2489
	remove_proc_entry("fib_trie", net->proc_net);
2490 2491
out1:
	return -ENOMEM;
2492 2493
}

2494
void __net_exit fib_proc_exit(struct net *net)
2495
{
2496 2497 2498
	remove_proc_entry("fib_trie", net->proc_net);
	remove_proc_entry("fib_triestat", net->proc_net);
	remove_proc_entry("route", net->proc_net);
2499 2500 2501
}

#endif /* CONFIG_PROC_FS */