fib_trie.c 60.8 KB
Newer Older
1 2 3 4 5 6 7 8 9
/*
 *   This program is free software; you can redistribute it and/or
 *   modify it under the terms of the GNU General Public License
 *   as published by the Free Software Foundation; either version
 *   2 of the License, or (at your option) any later version.
 *
 *   Robert Olsson <robert.olsson@its.uu.se> Uppsala Universitet
 *     & Swedish University of Agricultural Sciences.
 *
10
 *   Jens Laas <jens.laas@data.slu.se> Swedish University of
11
 *     Agricultural Sciences.
12
 *
13 14
 *   Hans Liss <hans.liss@its.uu.se>  Uppsala Universitet
 *
L
Lucas De Marchi 已提交
15
 * This work is based on the LPC-trie which is originally described in:
16
 *
17 18
 * An experimental study of compression methods for dynamic tries
 * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
19
 * http://www.csc.kth.se/~snilsson/software/dyntrie2/
20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41
 *
 *
 * IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
 * IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999
 *
 *
 * Code from fib_hash has been reused which includes the following header:
 *
 *
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		IPv4 FIB: lookup engine and maintenance routines.
 *
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
R
Robert Olsson 已提交
42 43 44 45 46 47 48
 *
 * Substantial contributions to this work comes from:
 *
 *		David S. Miller, <davem@davemloft.net>
 *		Stephen Hemminger <shemminger@osdl.org>
 *		Paul E. McKenney <paulmck@us.ibm.com>
 *		Patrick McHardy <kaber@trash.net>
49 50
 */

J
Jens Låås 已提交
51
#define VERSION "0.409"
52 53

#include <asm/uaccess.h>
J
Jiri Slaby 已提交
54
#include <linux/bitops.h>
55 56 57 58 59 60 61 62 63
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
S
Stephen Hemminger 已提交
64
#include <linux/inetdevice.h>
65 66 67
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/proc_fs.h>
R
Robert Olsson 已提交
68
#include <linux/rcupdate.h>
69 70 71 72
#include <linux/skbuff.h>
#include <linux/netlink.h>
#include <linux/init.h>
#include <linux/list.h>
73
#include <linux/slab.h>
74
#include <linux/export.h>
75
#include <net/net_namespace.h>
76 77 78 79 80 81 82 83
#include <net/ip.h>
#include <net/protocol.h>
#include <net/route.h>
#include <net/tcp.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include "fib_lookup.h"

R
Robert Olsson 已提交
84
#define MAX_STAT_DEPTH 32
85 86 87 88 89 90 91 92

#define KEYLENGTH (8*sizeof(t_key))

typedef unsigned int t_key;

#define T_TNODE 0
#define T_LEAF  1
#define NODE_TYPE_MASK	0x1UL
R
Robert Olsson 已提交
93 94
#define NODE_TYPE(node) ((node)->parent & NODE_TYPE_MASK)

O
Olof Johansson 已提交
95 96
#define IS_TNODE(n) (!(n->parent & T_LEAF))
#define IS_LEAF(n) (n->parent & T_LEAF)
97

98
struct rt_trie_node {
O
Olof Johansson 已提交
99
	unsigned long parent;
100
	t_key key;
101 102 103
};

struct leaf {
O
Olof Johansson 已提交
104
	unsigned long parent;
105
	t_key key;
106
	struct hlist_head list;
R
Robert Olsson 已提交
107
	struct rcu_head rcu;
108 109 110 111 112
};

struct leaf_info {
	struct hlist_node hlist;
	int plen;
113
	u32 mask_plen; /* ntohl(inet_make_mask(plen)) */
114
	struct list_head falh;
115
	struct rcu_head rcu;
116 117 118
};

struct tnode {
O
Olof Johansson 已提交
119
	unsigned long parent;
120
	t_key key;
121 122
	unsigned char pos;		/* 2log(KEYLENGTH) bits needed */
	unsigned char bits;		/* 2log(KEYLENGTH) bits needed */
123 124
	unsigned int full_children;	/* KEYLENGTH bits needed */
	unsigned int empty_children;	/* KEYLENGTH bits needed */
125 126
	union {
		struct rcu_head rcu;
J
Jarek Poplawski 已提交
127
		struct tnode *tnode_free;
128
	};
E
Eric Dumazet 已提交
129
	struct rt_trie_node __rcu *child[0];
130 131 132 133 134 135 136 137 138
};

#ifdef CONFIG_IP_FIB_TRIE_STATS
struct trie_use_stats {
	unsigned int gets;
	unsigned int backtrack;
	unsigned int semantic_match_passed;
	unsigned int semantic_match_miss;
	unsigned int null_node_hit;
139
	unsigned int resize_node_skipped;
140 141 142 143 144 145 146 147 148
};
#endif

struct trie_stat {
	unsigned int totdepth;
	unsigned int maxdepth;
	unsigned int tnodes;
	unsigned int leaves;
	unsigned int nullpointers;
149
	unsigned int prefixes;
R
Robert Olsson 已提交
150
	unsigned int nodesizes[MAX_STAT_DEPTH];
151
};
152 153

struct trie {
E
Eric Dumazet 已提交
154
	struct rt_trie_node __rcu *trie;
155 156 157 158 159
#ifdef CONFIG_IP_FIB_TRIE_STATS
	struct trie_use_stats stats;
#endif
};

160
static void tnode_put_child_reorg(struct tnode *tn, int i, struct rt_trie_node *n,
161
				  int wasfull);
162
static struct rt_trie_node *resize(struct trie *t, struct tnode *tn);
163 164
static struct tnode *inflate(struct trie *t, struct tnode *tn);
static struct tnode *halve(struct trie *t, struct tnode *tn);
J
Jarek Poplawski 已提交
165 166
/* tnodes to free after resize(); protected by RTNL */
static struct tnode *tnode_free_head;
167 168 169 170 171 172 173 174
static size_t tnode_free_size;

/*
 * synchronize_rcu after call_rcu for that many pages; it should be especially
 * useful before resizing the root node with PREEMPT_NONE configs; the value was
 * obtained experimentally, aiming to avoid visible slowdown.
 */
static const int sync_pages = 128;
175

176
static struct kmem_cache *fn_alias_kmem __read_mostly;
177
static struct kmem_cache *trie_leaf_kmem __read_mostly;
178

E
Eric Dumazet 已提交
179 180 181 182
/*
 * caller must hold RTNL
 */
static inline struct tnode *node_parent(const struct rt_trie_node *node)
S
Stephen Hemminger 已提交
183
{
E
Eric Dumazet 已提交
184 185 186 187 188
	unsigned long parent;

	parent = rcu_dereference_index_check(node->parent, lockdep_rtnl_is_held());

	return (struct tnode *)(parent & ~NODE_TYPE_MASK);
189 190
}

E
Eric Dumazet 已提交
191 192 193 194
/*
 * caller must hold RCU read lock or RTNL
 */
static inline struct tnode *node_parent_rcu(const struct rt_trie_node *node)
195
{
E
Eric Dumazet 已提交
196 197 198 199
	unsigned long parent;

	parent = rcu_dereference_index_check(node->parent, rcu_read_lock_held() ||
							   lockdep_rtnl_is_held());
S
Stephen Hemminger 已提交
200

E
Eric Dumazet 已提交
201
	return (struct tnode *)(parent & ~NODE_TYPE_MASK);
S
Stephen Hemminger 已提交
202 203
}

204
/* Same as rcu_assign_pointer
205 206
 * but that macro() assumes that value is a pointer.
 */
207
static inline void node_set_parent(struct rt_trie_node *node, struct tnode *ptr)
S
Stephen Hemminger 已提交
208
{
209 210
	smp_wmb();
	node->parent = (unsigned long)ptr | NODE_TYPE(node);
S
Stephen Hemminger 已提交
211
}
R
Robert Olsson 已提交
212

E
Eric Dumazet 已提交
213 214 215 216
/*
 * caller must hold RTNL
 */
static inline struct rt_trie_node *tnode_get_child(const struct tnode *tn, unsigned int i)
217 218
{
	BUG_ON(i >= 1U << tn->bits);
R
Robert Olsson 已提交
219

E
Eric Dumazet 已提交
220
	return rtnl_dereference(tn->child[i]);
221 222
}

E
Eric Dumazet 已提交
223 224 225 226
/*
 * caller must hold RCU read lock or RTNL
 */
static inline struct rt_trie_node *tnode_get_child_rcu(const struct tnode *tn, unsigned int i)
227
{
E
Eric Dumazet 已提交
228
	BUG_ON(i >= 1U << tn->bits);
229

E
Eric Dumazet 已提交
230
	return rcu_dereference_rtnl(tn->child[i]);
231 232
}

S
Stephen Hemmigner 已提交
233
static inline int tnode_child_length(const struct tnode *tn)
234
{
O
Olof Johansson 已提交
235
	return 1 << tn->bits;
236 237
}

238
static inline t_key mask_pfx(t_key k, unsigned int l)
S
Stephen Hemminger 已提交
239 240 241 242
{
	return (l == 0) ? 0 : k >> (KEYLENGTH-l) << (KEYLENGTH-l);
}

243
static inline t_key tkey_extract_bits(t_key a, unsigned int offset, unsigned int bits)
244
{
O
Olof Johansson 已提交
245
	if (offset < KEYLENGTH)
246
		return ((t_key)(a << offset)) >> (KEYLENGTH - bits);
O
Olof Johansson 已提交
247
	else
248 249 250 251 252
		return 0;
}

static inline int tkey_equals(t_key a, t_key b)
{
253
	return a == b;
254 255 256 257
}

static inline int tkey_sub_equals(t_key a, int offset, int bits, t_key b)
{
258 259
	if (bits == 0 || offset >= KEYLENGTH)
		return 1;
O
Olof Johansson 已提交
260 261
	bits = bits > KEYLENGTH ? KEYLENGTH : bits;
	return ((a ^ b) << offset) >> (KEYLENGTH - bits) == 0;
262
}
263 264 265 266 267 268

static inline int tkey_mismatch(t_key a, int offset, t_key b)
{
	t_key diff = a ^ b;
	int i = offset;

269 270 271
	if (!diff)
		return 0;
	while ((diff << i) >> (KEYLENGTH-1) == 0)
272 273 274 275 276
		i++;
	return i;
}

/*
277 278
  To understand this stuff, an understanding of keys and all their bits is
  necessary. Every node in the trie has a key associated with it, but not
279 280 281 282
  all of the bits in that key are significant.

  Consider a node 'n' and its parent 'tp'.

283 284 285 286 287
  If n is a leaf, every bit in its key is significant. Its presence is
  necessitated by path compression, since during a tree traversal (when
  searching for a leaf - unless we are doing an insertion) we will completely
  ignore all skipped bits we encounter. Thus we need to verify, at the end of
  a potentially successful search, that we have indeed been walking the
288 289
  correct key path.

290 291 292 293 294
  Note that we can never "miss" the correct key in the tree if present by
  following the wrong path. Path compression ensures that segments of the key
  that are the same for all keys with a given prefix are skipped, but the
  skipped part *is* identical for each node in the subtrie below the skipped
  bit! trie_insert() in this implementation takes care of that - note the
295 296
  call to tkey_sub_equals() in trie_insert().

297
  if n is an internal node - a 'tnode' here, the various parts of its key
298 299
  have many different meanings.

300
  Example:
301 302 303
  _________________________________________________________________
  | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
  -----------------------------------------------------------------
304
    0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15
305 306 307 308 309 310 311 312 313

  _________________________________________________________________
  | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
  -----------------------------------------------------------------
   16  17  18  19  20  21  22  23  24  25  26  27  28  29  30  31

  tp->pos = 7
  tp->bits = 3
  n->pos = 15
O
Olof Johansson 已提交
314
  n->bits = 4
315

316 317
  First, let's just ignore the bits that come before the parent tp, that is
  the bits from 0 to (tp->pos-1). They are *known* but at this point we do
318 319 320
  not use them for anything.

  The bits from (tp->pos) to (tp->pos + tp->bits - 1) - "N", above - are the
321
  index into the parent's child array. That is, they will be used to find
322 323 324 325 326
  'n' among tp's children.

  The bits from (tp->pos + tp->bits) to (n->pos - 1) - "S" - are skipped bits
  for the node n.

327
  All the bits we have seen so far are significant to the node n. The rest
328 329
  of the bits are really not needed or indeed known in n->key.

330
  The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into
331
  n's child array, and will of course be different for each child.
332

333

334 335 336 337 338
  The rest of the bits, from (n->pos + n->bits) onward, are completely unknown
  at this point.

*/

S
Stephen Hemminger 已提交
339
static inline void check_tnode(const struct tnode *tn)
340
{
S
Stephen Hemminger 已提交
341
	WARN_ON(tn && tn->pos+tn->bits > 32);
342 343
}

344 345
static const int halve_threshold = 25;
static const int inflate_threshold = 50;
346
static const int halve_threshold_root = 15;
J
Jens Låås 已提交
347
static const int inflate_threshold_root = 30;
R
Robert Olsson 已提交
348 349

static void __alias_free_mem(struct rcu_head *head)
350
{
R
Robert Olsson 已提交
351 352
	struct fib_alias *fa = container_of(head, struct fib_alias, rcu);
	kmem_cache_free(fn_alias_kmem, fa);
353 354
}

R
Robert Olsson 已提交
355
static inline void alias_free_mem_rcu(struct fib_alias *fa)
356
{
R
Robert Olsson 已提交
357 358
	call_rcu(&fa->rcu, __alias_free_mem);
}
O
Olof Johansson 已提交
359

R
Robert Olsson 已提交
360 361
static void __leaf_free_rcu(struct rcu_head *head)
{
362 363
	struct leaf *l = container_of(head, struct leaf, rcu);
	kmem_cache_free(trie_leaf_kmem, l);
R
Robert Olsson 已提交
364
}
O
Olof Johansson 已提交
365

366 367
static inline void free_leaf(struct leaf *l)
{
368
	call_rcu(&l->rcu, __leaf_free_rcu);
369 370
}

R
Robert Olsson 已提交
371
static inline void free_leaf_info(struct leaf_info *leaf)
372
{
373
	kfree_rcu(leaf, rcu);
374 375
}

376
static struct tnode *tnode_alloc(size_t size)
377
{
R
Robert Olsson 已提交
378
	if (size <= PAGE_SIZE)
379
		return kzalloc(size, GFP_KERNEL);
380
	else
381
		return vzalloc(size);
382
}
R
Robert Olsson 已提交
383 384

static void __tnode_free_rcu(struct rcu_head *head)
385
{
R
Robert Olsson 已提交
386
	struct tnode *tn = container_of(head, struct tnode, rcu);
387
	size_t size = sizeof(struct tnode) +
388
		      (sizeof(struct rt_trie_node *) << tn->bits);
389 390 391

	if (size <= PAGE_SIZE)
		kfree(tn);
A
Al Viro 已提交
392 393
	else
		vfree(tn);
394 395
}

R
Robert Olsson 已提交
396 397
static inline void tnode_free(struct tnode *tn)
{
398 399 400
	if (IS_LEAF(tn))
		free_leaf((struct leaf *) tn);
	else
R
Robert Olsson 已提交
401
		call_rcu(&tn->rcu, __tnode_free_rcu);
R
Robert Olsson 已提交
402 403
}

J
Jarek Poplawski 已提交
404 405 406
static void tnode_free_safe(struct tnode *tn)
{
	BUG_ON(IS_LEAF(tn));
407 408
	tn->tnode_free = tnode_free_head;
	tnode_free_head = tn;
409
	tnode_free_size += sizeof(struct tnode) +
410
			   (sizeof(struct rt_trie_node *) << tn->bits);
J
Jarek Poplawski 已提交
411 412 413 414 415 416 417 418 419 420 421
}

static void tnode_free_flush(void)
{
	struct tnode *tn;

	while ((tn = tnode_free_head)) {
		tnode_free_head = tn->tnode_free;
		tn->tnode_free = NULL;
		tnode_free(tn);
	}
422 423 424 425 426

	if (tnode_free_size >= PAGE_SIZE * sync_pages) {
		tnode_free_size = 0;
		synchronize_rcu();
	}
J
Jarek Poplawski 已提交
427 428
}

R
Robert Olsson 已提交
429 430
static struct leaf *leaf_new(void)
{
431
	struct leaf *l = kmem_cache_alloc(trie_leaf_kmem, GFP_KERNEL);
R
Robert Olsson 已提交
432 433 434 435 436 437 438 439 440 441 442 443
	if (l) {
		l->parent = T_LEAF;
		INIT_HLIST_HEAD(&l->list);
	}
	return l;
}

static struct leaf_info *leaf_info_new(int plen)
{
	struct leaf_info *li = kmalloc(sizeof(struct leaf_info),  GFP_KERNEL);
	if (li) {
		li->plen = plen;
444
		li->mask_plen = ntohl(inet_make_mask(plen));
R
Robert Olsson 已提交
445 446 447 448 449
		INIT_LIST_HEAD(&li->falh);
	}
	return li;
}

450
static struct tnode *tnode_new(t_key key, int pos, int bits)
451
{
452
	size_t sz = sizeof(struct tnode) + (sizeof(struct rt_trie_node *) << bits);
453
	struct tnode *tn = tnode_alloc(sz);
454

O
Olof Johansson 已提交
455
	if (tn) {
R
Robert Olsson 已提交
456
		tn->parent = T_TNODE;
457 458 459 460 461 462
		tn->pos = pos;
		tn->bits = bits;
		tn->key = key;
		tn->full_children = 0;
		tn->empty_children = 1<<bits;
	}
463

E
Eric Dumazet 已提交
464
	pr_debug("AT %p s=%zu %zu\n", tn, sizeof(struct tnode),
L
Lin Ming 已提交
465
		 sizeof(struct rt_trie_node *) << bits);
466 467 468 469 470 471 472 473
	return tn;
}

/*
 * Check whether a tnode 'n' is "full", i.e. it is an internal node
 * and no bits are skipped. See discussion in dyntree paper p. 6
 */

474
static inline int tnode_full(const struct tnode *tn, const struct rt_trie_node *n)
475
{
476
	if (n == NULL || IS_LEAF(n))
477 478 479 480 481
		return 0;

	return ((struct tnode *) n)->pos == tn->pos + tn->bits;
}

L
Lin Ming 已提交
482
static inline void put_child(struct tnode *tn, int i,
483
			     struct rt_trie_node *n)
484 485 486 487
{
	tnode_put_child_reorg(tn, i, n, -1);
}

488
 /*
489 490 491 492
  * Add a child at position i overwriting the old value.
  * Update the value of full_children and empty_children.
  */

493
static void tnode_put_child_reorg(struct tnode *tn, int i, struct rt_trie_node *n,
494
				  int wasfull)
495
{
E
Eric Dumazet 已提交
496
	struct rt_trie_node *chi = rtnl_dereference(tn->child[i]);
497 498
	int isfull;

S
Stephen Hemminger 已提交
499 500
	BUG_ON(i >= 1<<tn->bits);

501 502 503 504 505
	/* update emptyChildren */
	if (n == NULL && chi != NULL)
		tn->empty_children++;
	else if (n != NULL && chi == NULL)
		tn->empty_children--;
506

507
	/* update fullChildren */
O
Olof Johansson 已提交
508
	if (wasfull == -1)
509 510 511
		wasfull = tnode_full(tn, chi);

	isfull = tnode_full(tn, n);
512
	if (wasfull && !isfull)
513
		tn->full_children--;
514
	else if (!wasfull && isfull)
515
		tn->full_children++;
O
Olof Johansson 已提交
516

517
	if (n)
S
Stephen Hemminger 已提交
518
		node_set_parent(n, tn);
519

520
	rcu_assign_pointer(tn->child[i], n);
521 522
}

J
Jens Låås 已提交
523
#define MAX_WORK 10
524
static struct rt_trie_node *resize(struct trie *t, struct tnode *tn)
525 526
{
	int i;
527
	struct tnode *old_tn;
528 529
	int inflate_threshold_use;
	int halve_threshold_use;
J
Jens Låås 已提交
530
	int max_work;
531

532
	if (!tn)
533 534
		return NULL;

S
Stephen Hemminger 已提交
535 536
	pr_debug("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
		 tn, inflate_threshold, halve_threshold);
537 538 539

	/* No children */
	if (tn->empty_children == tnode_child_length(tn)) {
J
Jarek Poplawski 已提交
540
		tnode_free_safe(tn);
541 542 543 544
		return NULL;
	}
	/* One child */
	if (tn->empty_children == tnode_child_length(tn) - 1)
J
Jens Låås 已提交
545
		goto one_child;
546
	/*
547 548 549 550 551
	 * Double as long as the resulting node has a number of
	 * nonempty nodes that are above the threshold.
	 */

	/*
552 553
	 * From "Implementing a dynamic compressed trie" by Stefan Nilsson of
	 * the Helsinki University of Technology and Matti Tikkanen of Nokia
554
	 * Telecommunications, page 6:
555
	 * "A node is doubled if the ratio of non-empty children to all
556 557
	 * children in the *doubled* node is at least 'high'."
	 *
558 559 560 561 562
	 * 'high' in this instance is the variable 'inflate_threshold'. It
	 * is expressed as a percentage, so we multiply it with
	 * tnode_child_length() and instead of multiplying by 2 (since the
	 * child array will be doubled by inflate()) and multiplying
	 * the left-hand side by 100 (to handle the percentage thing) we
563
	 * multiply the left-hand side by 50.
564 565 566 567
	 *
	 * The left-hand side may look a bit weird: tnode_child_length(tn)
	 * - tn->empty_children is of course the number of non-null children
	 * in the current node. tn->full_children is the number of "full"
568
	 * children, that is non-null tnodes with a skip value of 0.
569
	 * All of those will be doubled in the resulting inflated tnode, so
570
	 * we just count them one extra time here.
571
	 *
572
	 * A clearer way to write this would be:
573
	 *
574
	 * to_be_doubled = tn->full_children;
575
	 * not_to_be_doubled = tnode_child_length(tn) - tn->empty_children -
576 577 578 579
	 *     tn->full_children;
	 *
	 * new_child_length = tnode_child_length(tn) * 2;
	 *
580
	 * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) /
581 582
	 *      new_child_length;
	 * if (new_fill_factor >= inflate_threshold)
583 584 585
	 *
	 * ...and so on, tho it would mess up the while () loop.
	 *
586 587 588
	 * anyway,
	 * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >=
	 *      inflate_threshold
589
	 *
590 591 592
	 * avoid a division:
	 * 100 * (not_to_be_doubled + 2*to_be_doubled) >=
	 *      inflate_threshold * new_child_length
593
	 *
594
	 * expand not_to_be_doubled and to_be_doubled, and shorten:
595
	 * 100 * (tnode_child_length(tn) - tn->empty_children +
O
Olof Johansson 已提交
596
	 *    tn->full_children) >= inflate_threshold * new_child_length
597
	 *
598
	 * expand new_child_length:
599
	 * 100 * (tnode_child_length(tn) - tn->empty_children +
O
Olof Johansson 已提交
600
	 *    tn->full_children) >=
601
	 *      inflate_threshold * tnode_child_length(tn) * 2
602
	 *
603
	 * shorten again:
604
	 * 50 * (tn->full_children + tnode_child_length(tn) -
O
Olof Johansson 已提交
605
	 *    tn->empty_children) >= inflate_threshold *
606
	 *    tnode_child_length(tn)
607
	 *
608 609 610
	 */

	check_tnode(tn);
611

612 613
	/* Keep root node larger  */

614
	if (!node_parent((struct rt_trie_node *)tn)) {
J
Jens Låås 已提交
615 616
		inflate_threshold_use = inflate_threshold_root;
		halve_threshold_use = halve_threshold_root;
E
Eric Dumazet 已提交
617
	} else {
618
		inflate_threshold_use = inflate_threshold;
J
Jens Låås 已提交
619 620
		halve_threshold_use = halve_threshold;
	}
621

J
Jens Låås 已提交
622 623
	max_work = MAX_WORK;
	while ((tn->full_children > 0 &&  max_work-- &&
624 625 626
		50 * (tn->full_children + tnode_child_length(tn)
		      - tn->empty_children)
		>= inflate_threshold_use * tnode_child_length(tn))) {
627

628 629
		old_tn = tn;
		tn = inflate(t, tn);
630

631 632
		if (IS_ERR(tn)) {
			tn = old_tn;
633 634 635 636 637
#ifdef CONFIG_IP_FIB_TRIE_STATS
			t->stats.resize_node_skipped++;
#endif
			break;
		}
638 639 640 641
	}

	check_tnode(tn);

J
Jens Låås 已提交
642
	/* Return if at least one inflate is run */
E
Eric Dumazet 已提交
643
	if (max_work != MAX_WORK)
644
		return (struct rt_trie_node *) tn;
J
Jens Låås 已提交
645

646 647 648 649
	/*
	 * Halve as long as the number of empty children in this
	 * node is above threshold.
	 */
650

J
Jens Låås 已提交
651 652
	max_work = MAX_WORK;
	while (tn->bits > 1 &&  max_work-- &&
653
	       100 * (tnode_child_length(tn) - tn->empty_children) <
654
	       halve_threshold_use * tnode_child_length(tn)) {
655

656 657 658 659
		old_tn = tn;
		tn = halve(t, tn);
		if (IS_ERR(tn)) {
			tn = old_tn;
660 661 662 663 664 665
#ifdef CONFIG_IP_FIB_TRIE_STATS
			t->stats.resize_node_skipped++;
#endif
			break;
		}
	}
666

667

668
	/* Only one child remains */
J
Jens Låås 已提交
669 670
	if (tn->empty_children == tnode_child_length(tn) - 1) {
one_child:
671
		for (i = 0; i < tnode_child_length(tn); i++) {
672
			struct rt_trie_node *n;
673

E
Eric Dumazet 已提交
674
			n = rtnl_dereference(tn->child[i]);
R
Robert Olsson 已提交
675
			if (!n)
O
Olof Johansson 已提交
676 677 678 679
				continue;

			/* compress one level */

S
Stephen Hemminger 已提交
680
			node_set_parent(n, NULL);
J
Jarek Poplawski 已提交
681
			tnode_free_safe(tn);
O
Olof Johansson 已提交
682
			return n;
683
		}
J
Jens Låås 已提交
684
	}
685
	return (struct rt_trie_node *) tn;
686 687
}

E
Eric Dumazet 已提交
688 689 690 691 692 693 694 695 696 697 698 699 700 701

static void tnode_clean_free(struct tnode *tn)
{
	int i;
	struct tnode *tofree;

	for (i = 0; i < tnode_child_length(tn); i++) {
		tofree = (struct tnode *)rtnl_dereference(tn->child[i]);
		if (tofree)
			tnode_free(tofree);
	}
	tnode_free(tn);
}

702
static struct tnode *inflate(struct trie *t, struct tnode *tn)
703 704 705 706 707
{
	struct tnode *oldtnode = tn;
	int olen = tnode_child_length(tn);
	int i;

S
Stephen Hemminger 已提交
708
	pr_debug("In inflate\n");
709 710 711

	tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1);

S
Stephen Hemminger 已提交
712
	if (!tn)
713
		return ERR_PTR(-ENOMEM);
714 715

	/*
716 717 718
	 * Preallocate and store tnodes before the actual work so we
	 * don't get into an inconsistent state if memory allocation
	 * fails. In case of failure we return the oldnode and  inflate
719 720
	 * of tnode is ignored.
	 */
O
Olof Johansson 已提交
721 722

	for (i = 0; i < olen; i++) {
723
		struct tnode *inode;
724

725
		inode = (struct tnode *) tnode_get_child(oldtnode, i);
726 727 728 729 730
		if (inode &&
		    IS_TNODE(inode) &&
		    inode->pos == oldtnode->pos + oldtnode->bits &&
		    inode->bits > 1) {
			struct tnode *left, *right;
S
Stephen Hemminger 已提交
731
			t_key m = ~0U << (KEYLENGTH - 1) >> inode->pos;
732

733 734
			left = tnode_new(inode->key&(~m), inode->pos + 1,
					 inode->bits - 1);
735 736
			if (!left)
				goto nomem;
O
Olof Johansson 已提交
737

738 739 740
			right = tnode_new(inode->key|m, inode->pos + 1,
					  inode->bits - 1);

741
			if (!right) {
742 743
				tnode_free(left);
				goto nomem;
744
			}
745

L
Lin Ming 已提交
746 747
			put_child(tn, 2*i, (struct rt_trie_node *) left);
			put_child(tn, 2*i+1, (struct rt_trie_node *) right);
748 749 750
		}
	}

O
Olof Johansson 已提交
751
	for (i = 0; i < olen; i++) {
752
		struct tnode *inode;
753
		struct rt_trie_node *node = tnode_get_child(oldtnode, i);
O
Olof Johansson 已提交
754 755
		struct tnode *left, *right;
		int size, j;
756

757 758 759 760 761 762
		/* An empty child */
		if (node == NULL)
			continue;

		/* A leaf or an internal node with skipped bits */

763
		if (IS_LEAF(node) || ((struct tnode *) node)->pos >
764
		   tn->pos + tn->bits - 1) {
765 766 767
			put_child(tn,
				tkey_extract_bits(node->key, oldtnode->pos, oldtnode->bits + 1),
				node);
768 769 770 771 772 773 774
			continue;
		}

		/* An internal node with two children */
		inode = (struct tnode *) node;

		if (inode->bits == 1) {
L
Lin Ming 已提交
775 776
			put_child(tn, 2*i, rtnl_dereference(inode->child[0]));
			put_child(tn, 2*i+1, rtnl_dereference(inode->child[1]));
777

J
Jarek Poplawski 已提交
778
			tnode_free_safe(inode);
O
Olof Johansson 已提交
779
			continue;
780 781
		}

O
Olof Johansson 已提交
782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799
		/* An internal node with more than two children */

		/* We will replace this node 'inode' with two new
		 * ones, 'left' and 'right', each with half of the
		 * original children. The two new nodes will have
		 * a position one bit further down the key and this
		 * means that the "significant" part of their keys
		 * (see the discussion near the top of this file)
		 * will differ by one bit, which will be "0" in
		 * left's key and "1" in right's key. Since we are
		 * moving the key position by one step, the bit that
		 * we are moving away from - the bit at position
		 * (inode->pos) - is the one that will differ between
		 * left and right. So... we synthesize that bit in the
		 * two  new keys.
		 * The mask 'm' below will be a single "one" bit at
		 * the position (inode->pos)
		 */
800

O
Olof Johansson 已提交
801 802 803
		/* Use the old key, but set the new significant
		 *   bit to zero.
		 */
804

O
Olof Johansson 已提交
805
		left = (struct tnode *) tnode_get_child(tn, 2*i);
L
Lin Ming 已提交
806
		put_child(tn, 2*i, NULL);
807

O
Olof Johansson 已提交
808
		BUG_ON(!left);
809

O
Olof Johansson 已提交
810
		right = (struct tnode *) tnode_get_child(tn, 2*i+1);
L
Lin Ming 已提交
811
		put_child(tn, 2*i+1, NULL);
812

O
Olof Johansson 已提交
813
		BUG_ON(!right);
814

O
Olof Johansson 已提交
815 816
		size = tnode_child_length(left);
		for (j = 0; j < size; j++) {
L
Lin Ming 已提交
817 818
			put_child(left, j, rtnl_dereference(inode->child[j]));
			put_child(right, j, rtnl_dereference(inode->child[j + size]));
819
		}
L
Lin Ming 已提交
820 821
		put_child(tn, 2*i, resize(t, left));
		put_child(tn, 2*i+1, resize(t, right));
O
Olof Johansson 已提交
822

J
Jarek Poplawski 已提交
823
		tnode_free_safe(inode);
824
	}
J
Jarek Poplawski 已提交
825
	tnode_free_safe(oldtnode);
826
	return tn;
827
nomem:
E
Eric Dumazet 已提交
828 829
	tnode_clean_free(tn);
	return ERR_PTR(-ENOMEM);
830 831
}

832
static struct tnode *halve(struct trie *t, struct tnode *tn)
833 834
{
	struct tnode *oldtnode = tn;
835
	struct rt_trie_node *left, *right;
836 837 838
	int i;
	int olen = tnode_child_length(tn);

S
Stephen Hemminger 已提交
839
	pr_debug("In halve\n");
840 841

	tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1);
842

843 844
	if (!tn)
		return ERR_PTR(-ENOMEM);
845 846

	/*
847 848 849
	 * Preallocate and store tnodes before the actual work so we
	 * don't get into an inconsistent state if memory allocation
	 * fails. In case of failure we return the oldnode and halve
850 851 852
	 * of tnode is ignored.
	 */

O
Olof Johansson 已提交
853
	for (i = 0; i < olen; i += 2) {
854 855
		left = tnode_get_child(oldtnode, i);
		right = tnode_get_child(oldtnode, i+1);
856

857
		/* Two nonempty children */
S
Stephen Hemminger 已提交
858
		if (left && right) {
859
			struct tnode *newn;
S
Stephen Hemminger 已提交
860

861
			newn = tnode_new(left->key, tn->pos + tn->bits, 1);
S
Stephen Hemminger 已提交
862 863

			if (!newn)
864
				goto nomem;
S
Stephen Hemminger 已提交
865

L
Lin Ming 已提交
866
			put_child(tn, i/2, (struct rt_trie_node *)newn);
867 868 869
		}

	}
870

O
Olof Johansson 已提交
871 872 873
	for (i = 0; i < olen; i += 2) {
		struct tnode *newBinNode;

874 875
		left = tnode_get_child(oldtnode, i);
		right = tnode_get_child(oldtnode, i+1);
876

877 878 879 880
		/* At least one of the children is empty */
		if (left == NULL) {
			if (right == NULL)    /* Both are empty */
				continue;
L
Lin Ming 已提交
881
			put_child(tn, i/2, right);
O
Olof Johansson 已提交
882
			continue;
S
Stephen Hemminger 已提交
883
		}
O
Olof Johansson 已提交
884 885

		if (right == NULL) {
L
Lin Ming 已提交
886
			put_child(tn, i/2, left);
O
Olof Johansson 已提交
887 888
			continue;
		}
889

890
		/* Two nonempty children */
O
Olof Johansson 已提交
891
		newBinNode = (struct tnode *) tnode_get_child(tn, i/2);
L
Lin Ming 已提交
892 893 894 895
		put_child(tn, i/2, NULL);
		put_child(newBinNode, 0, left);
		put_child(newBinNode, 1, right);
		put_child(tn, i/2, resize(t, newBinNode));
896
	}
J
Jarek Poplawski 已提交
897
	tnode_free_safe(oldtnode);
898
	return tn;
899
nomem:
E
Eric Dumazet 已提交
900 901
	tnode_clean_free(tn);
	return ERR_PTR(-ENOMEM);
902 903
}

R
Robert Olsson 已提交
904
/* readside must use rcu_read_lock currently dump routines
R
Robert Olsson 已提交
905 906
 via get_fa_head and dump */

R
Robert Olsson 已提交
907
static struct leaf_info *find_leaf_info(struct leaf *l, int plen)
908
{
R
Robert Olsson 已提交
909
	struct hlist_head *head = &l->list;
910 911
	struct leaf_info *li;

912
	hlist_for_each_entry_rcu(li, head, hlist)
913
		if (li->plen == plen)
914
			return li;
O
Olof Johansson 已提交
915

916 917 918
	return NULL;
}

919
static inline struct list_head *get_fa_head(struct leaf *l, int plen)
920
{
R
Robert Olsson 已提交
921
	struct leaf_info *li = find_leaf_info(l, plen);
922

O
Olof Johansson 已提交
923 924
	if (!li)
		return NULL;
925

O
Olof Johansson 已提交
926
	return &li->falh;
927 928 929 930
}

static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new)
{
931 932 933 934 935
	struct leaf_info *li = NULL, *last = NULL;

	if (hlist_empty(head)) {
		hlist_add_head_rcu(&new->hlist, head);
	} else {
936
		hlist_for_each_entry(li, head, hlist) {
937 938 939 940 941 942
			if (new->plen > li->plen)
				break;

			last = li;
		}
		if (last)
943
			hlist_add_behind_rcu(&new->hlist, &last->hlist);
944 945 946
		else
			hlist_add_before_rcu(&new->hlist, &li->hlist);
	}
947 948
}

R
Robert Olsson 已提交
949 950
/* rcu_read_lock needs to be hold by caller from readside */

951 952 953 954 955
static struct leaf *
fib_find_node(struct trie *t, u32 key)
{
	int pos;
	struct tnode *tn;
956
	struct rt_trie_node *n;
957 958

	pos = 0;
E
Eric Dumazet 已提交
959
	n = rcu_dereference_rtnl(t->trie);
960 961 962

	while (n != NULL &&  NODE_TYPE(n) == T_TNODE) {
		tn = (struct tnode *) n;
O
Olof Johansson 已提交
963

964
		check_tnode(tn);
O
Olof Johansson 已提交
965

966
		if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
O
Olof Johansson 已提交
967
			pos = tn->pos + tn->bits;
968 969 970 971
			n = tnode_get_child_rcu(tn,
						tkey_extract_bits(key,
								  tn->pos,
								  tn->bits));
O
Olof Johansson 已提交
972
		} else
973 974 975 976
			break;
	}
	/* Case we have found a leaf. Compare prefixes */

O
Olof Johansson 已提交
977 978 979
	if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key))
		return (struct leaf *)n;

980 981 982
	return NULL;
}

983
static void trie_rebalance(struct trie *t, struct tnode *tn)
984 985
{
	int wasfull;
R
Robert Olsson 已提交
986
	t_key cindex, key;
S
Stephen Hemminger 已提交
987
	struct tnode *tp;
988

R
Robert Olsson 已提交
989 990
	key = tn->key;

991
	while (tn != NULL && (tp = node_parent((struct rt_trie_node *)tn)) != NULL) {
992 993
		cindex = tkey_extract_bits(key, tp->pos, tp->bits);
		wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
J
Joe Perches 已提交
994
		tn = (struct tnode *)resize(t, tn);
995

J
Joe Perches 已提交
996
		tnode_put_child_reorg(tp, cindex,
997
				      (struct rt_trie_node *)tn, wasfull);
O
Olof Johansson 已提交
998

999
		tp = node_parent((struct rt_trie_node *) tn);
1000
		if (!tp)
1001
			rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn);
1002

J
Jarek Poplawski 已提交
1003
		tnode_free_flush();
S
Stephen Hemminger 已提交
1004
		if (!tp)
1005
			break;
S
Stephen Hemminger 已提交
1006
		tn = tp;
1007
	}
S
Stephen Hemminger 已提交
1008

1009
	/* Handle last (top) tnode */
1010
	if (IS_TNODE(tn))
J
Joe Perches 已提交
1011
		tn = (struct tnode *)resize(t, tn);
1012

1013
	rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn);
1014
	tnode_free_flush();
1015 1016
}

R
Robert Olsson 已提交
1017 1018
/* only used from updater-side */

1019
static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
1020 1021 1022
{
	int pos, newpos;
	struct tnode *tp = NULL, *tn = NULL;
1023
	struct rt_trie_node *n;
1024 1025
	struct leaf *l;
	int missbit;
1026
	struct list_head *fa_head = NULL;
1027 1028 1029 1030
	struct leaf_info *li;
	t_key cindex;

	pos = 0;
E
Eric Dumazet 已提交
1031
	n = rtnl_dereference(t->trie);
1032

1033 1034
	/* If we point to NULL, stop. Either the tree is empty and we should
	 * just put a new leaf in if, or we have reached an empty child slot,
1035
	 * and we should just put our new leaf in that.
1036 1037
	 * If we point to a T_TNODE, check if it matches our key. Note that
	 * a T_TNODE might be skipping any number of bits - its 'pos' need
1038 1039
	 * not be the parent's 'pos'+'bits'!
	 *
1040
	 * If it does match the current key, get pos/bits from it, extract
1041 1042 1043 1044
	 * the index from our key, push the T_TNODE and walk the tree.
	 *
	 * If it doesn't, we have to replace it with a new T_TNODE.
	 *
1045 1046 1047
	 * If we point to a T_LEAF, it might or might not have the same key
	 * as we do. If it does, just change the value, update the T_LEAF's
	 * value, and return it.
1048 1049 1050 1051 1052
	 * If it doesn't, we need to replace it with a T_TNODE.
	 */

	while (n != NULL &&  NODE_TYPE(n) == T_TNODE) {
		tn = (struct tnode *) n;
O
Olof Johansson 已提交
1053

1054
		check_tnode(tn);
O
Olof Johansson 已提交
1055

1056
		if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
1057
			tp = tn;
O
Olof Johansson 已提交
1058
			pos = tn->pos + tn->bits;
1059 1060 1061 1062
			n = tnode_get_child(tn,
					    tkey_extract_bits(key,
							      tn->pos,
							      tn->bits));
1063

S
Stephen Hemminger 已提交
1064
			BUG_ON(n && node_parent(n) != tn);
O
Olof Johansson 已提交
1065
		} else
1066 1067 1068 1069 1070 1071
			break;
	}

	/*
	 * n  ----> NULL, LEAF or TNODE
	 *
1072
	 * tp is n's (parent) ----> NULL or TNODE
1073 1074
	 */

O
Olof Johansson 已提交
1075
	BUG_ON(tp && IS_LEAF(tp));
1076 1077 1078

	/* Case 1: n is a leaf. Compare prefixes */

1079
	if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) {
1080
		l = (struct leaf *) n;
1081
		li = leaf_info_new(plen);
O
Olof Johansson 已提交
1082

1083 1084
		if (!li)
			return NULL;
1085 1086 1087 1088 1089 1090 1091

		fa_head = &li->falh;
		insert_leaf_info(&l->list, li);
		goto done;
	}
	l = leaf_new();

1092 1093
	if (!l)
		return NULL;
1094 1095 1096 1097

	l->key = key;
	li = leaf_info_new(plen);

1098
	if (!li) {
1099
		free_leaf(l);
1100
		return NULL;
1101
	}
1102 1103 1104 1105 1106

	fa_head = &li->falh;
	insert_leaf_info(&l->list, li);

	if (t->trie && n == NULL) {
O
Olof Johansson 已提交
1107
		/* Case 2: n is NULL, and will just insert a new leaf */
1108

1109
		node_set_parent((struct rt_trie_node *)l, tp);
1110

O
Olof Johansson 已提交
1111
		cindex = tkey_extract_bits(key, tp->pos, tp->bits);
L
Lin Ming 已提交
1112
		put_child(tp, cindex, (struct rt_trie_node *)l);
O
Olof Johansson 已提交
1113 1114
	} else {
		/* Case 3: n is a LEAF or a TNODE and the key doesn't match. */
1115 1116
		/*
		 *  Add a new tnode here
1117 1118 1119
		 *  first tnode need some special handling
		 */

1120
		if (n) {
1121
			pos = tp ? tp->pos+tp->bits : 0;
1122 1123
			newpos = tkey_mismatch(key, pos, n->key);
			tn = tnode_new(n->key, newpos, 1);
O
Olof Johansson 已提交
1124
		} else {
1125
			newpos = 0;
1126
			tn = tnode_new(key, newpos, 1); /* First tnode */
1127 1128
		}

1129
		if (!tn) {
1130
			free_leaf_info(li);
1131
			free_leaf(l);
1132
			return NULL;
O
Olof Johansson 已提交
1133 1134
		}

1135
		node_set_parent((struct rt_trie_node *)tn, tp);
1136

O
Olof Johansson 已提交
1137
		missbit = tkey_extract_bits(key, newpos, 1);
L
Lin Ming 已提交
1138 1139
		put_child(tn, missbit, (struct rt_trie_node *)l);
		put_child(tn, 1-missbit, n);
1140

1141
		if (tp) {
1142
			cindex = tkey_extract_bits(key, tp->pos, tp->bits);
L
Lin Ming 已提交
1143
			put_child(tp, cindex, (struct rt_trie_node *)tn);
O
Olof Johansson 已提交
1144
		} else {
1145
			rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn);
1146 1147 1148
			tp = tn;
		}
	}
O
Olof Johansson 已提交
1149 1150

	if (tp && tp->pos + tp->bits > 32)
J
Joe Perches 已提交
1151 1152
		pr_warn("fib_trie tp=%p pos=%d, bits=%d, key=%0x plen=%d\n",
			tp, tp->pos, tp->bits, key, plen);
O
Olof Johansson 已提交
1153

1154
	/* Rebalance the trie */
R
Robert Olsson 已提交
1155

1156
	trie_rebalance(t, tp);
1157
done:
1158 1159 1160
	return fa_head;
}

1161 1162 1163
/*
 * Caller must hold RTNL.
 */
1164
int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
1165 1166 1167
{
	struct trie *t = (struct trie *) tb->tb_data;
	struct fib_alias *fa, *new_fa;
1168
	struct list_head *fa_head = NULL;
1169
	struct fib_info *fi;
1170 1171
	int plen = cfg->fc_dst_len;
	u8 tos = cfg->fc_tos;
1172 1173 1174 1175 1176 1177 1178
	u32 key, mask;
	int err;
	struct leaf *l;

	if (plen > 32)
		return -EINVAL;

1179
	key = ntohl(cfg->fc_dst);
1180

1181
	pr_debug("Insert table=%u %08x/%d\n", tb->tb_id, key, plen);
1182

O
Olof Johansson 已提交
1183
	mask = ntohl(inet_make_mask(plen));
1184

1185
	if (key & ~mask)
1186 1187 1188 1189
		return -EINVAL;

	key = key & mask;

1190 1191 1192
	fi = fib_create_info(cfg);
	if (IS_ERR(fi)) {
		err = PTR_ERR(fi);
1193
		goto err;
1194
	}
1195 1196

	l = fib_find_node(t, key);
1197
	fa = NULL;
1198

1199
	if (l) {
1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214
		fa_head = get_fa_head(l, plen);
		fa = fib_find_alias(fa_head, tos, fi->fib_priority);
	}

	/* Now fa, if non-NULL, points to the first fib alias
	 * with the same keys [prefix,tos,priority], if such key already
	 * exists or to the node before which we will insert new one.
	 *
	 * If fa is NULL, we will need to allocate a new one and
	 * insert to the head of f.
	 *
	 * If f is NULL, no fib node matched the destination key
	 * and we need to allocate a new one of those as well.
	 */

1215 1216 1217
	if (fa && fa->fa_tos == tos &&
	    fa->fa_info->fib_priority == fi->fib_priority) {
		struct fib_alias *fa_first, *fa_match;
1218 1219

		err = -EEXIST;
1220
		if (cfg->fc_nlflags & NLM_F_EXCL)
1221 1222
			goto out;

1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242
		/* We have 2 goals:
		 * 1. Find exact match for type, scope, fib_info to avoid
		 * duplicate routes
		 * 2. Find next 'fa' (or head), NLM_F_APPEND inserts before it
		 */
		fa_match = NULL;
		fa_first = fa;
		fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
		list_for_each_entry_continue(fa, fa_head, fa_list) {
			if (fa->fa_tos != tos)
				break;
			if (fa->fa_info->fib_priority != fi->fib_priority)
				break;
			if (fa->fa_type == cfg->fc_type &&
			    fa->fa_info == fi) {
				fa_match = fa;
				break;
			}
		}

1243
		if (cfg->fc_nlflags & NLM_F_REPLACE) {
1244 1245 1246
			struct fib_info *fi_drop;
			u8 state;

1247 1248 1249 1250
			fa = fa_first;
			if (fa_match) {
				if (fa == fa_match)
					err = 0;
1251
				goto out;
1252
			}
R
Robert Olsson 已提交
1253
			err = -ENOBUFS;
1254
			new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
R
Robert Olsson 已提交
1255 1256
			if (new_fa == NULL)
				goto out;
1257 1258

			fi_drop = fa->fa_info;
R
Robert Olsson 已提交
1259 1260
			new_fa->fa_tos = fa->fa_tos;
			new_fa->fa_info = fi;
1261
			new_fa->fa_type = cfg->fc_type;
1262
			state = fa->fa_state;
1263
			new_fa->fa_state = state & ~FA_S_ACCESSED;
1264

R
Robert Olsson 已提交
1265 1266
			list_replace_rcu(&fa->fa_list, &new_fa->fa_list);
			alias_free_mem_rcu(fa);
1267 1268 1269

			fib_release_info(fi_drop);
			if (state & FA_S_ACCESSED)
1270
				rt_cache_flush(cfg->fc_nlinfo.nl_net);
1271 1272
			rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen,
				tb->tb_id, &cfg->fc_nlinfo, NLM_F_REPLACE);
1273

O
Olof Johansson 已提交
1274
			goto succeeded;
1275 1276 1277 1278 1279
		}
		/* Error if we find a perfect match which
		 * uses the same scope, type, and nexthop
		 * information.
		 */
1280 1281
		if (fa_match)
			goto out;
1282

1283
		if (!(cfg->fc_nlflags & NLM_F_APPEND))
1284
			fa = fa_first;
1285 1286
	}
	err = -ENOENT;
1287
	if (!(cfg->fc_nlflags & NLM_F_CREATE))
1288 1289 1290
		goto out;

	err = -ENOBUFS;
1291
	new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
1292 1293 1294 1295 1296
	if (new_fa == NULL)
		goto out;

	new_fa->fa_info = fi;
	new_fa->fa_tos = tos;
1297
	new_fa->fa_type = cfg->fc_type;
1298 1299 1300 1301 1302
	new_fa->fa_state = 0;
	/*
	 * Insert new entry to the list.
	 */

1303
	if (!fa_head) {
1304 1305 1306
		fa_head = fib_insert_node(t, key, plen);
		if (unlikely(!fa_head)) {
			err = -ENOMEM;
1307
			goto out_free_new_fa;
1308
		}
1309
	}
1310

1311 1312 1313
	if (!plen)
		tb->tb_num_default++;

R
Robert Olsson 已提交
1314 1315
	list_add_tail_rcu(&new_fa->fa_list,
			  (fa ? &fa->fa_list : fa_head));
1316

1317
	rt_cache_flush(cfg->fc_nlinfo.nl_net);
1318
	rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id,
1319
		  &cfg->fc_nlinfo, 0);
1320 1321
succeeded:
	return 0;
1322 1323 1324

out_free_new_fa:
	kmem_cache_free(fn_alias_kmem, new_fa);
1325 1326
out:
	fib_release_info(fi);
O
Olof Johansson 已提交
1327
err:
1328 1329 1330
	return err;
}

R
Robert Olsson 已提交
1331
/* should be called with rcu_read_lock */
1332
static int check_leaf(struct fib_table *tb, struct trie *t, struct leaf *l,
1333
		      t_key key,  const struct flowi4 *flp,
E
Eric Dumazet 已提交
1334
		      struct fib_result *res, int fib_flags)
1335 1336 1337
{
	struct leaf_info *li;
	struct hlist_head *hhead = &l->list;
1338

1339
	hlist_for_each_entry_rcu(li, hhead, hlist) {
1340
		struct fib_alias *fa;
1341

1342
		if (l->key != (key & li->mask_plen))
1343 1344
			continue;

1345 1346 1347
		list_for_each_entry_rcu(fa, &li->falh, fa_list) {
			struct fib_info *fi = fa->fa_info;
			int nhsel, err;
1348

1349
			if (fa->fa_tos && fa->fa_tos != flp->flowi4_tos)
1350
				continue;
1351 1352
			if (fi->fib_dead)
				continue;
1353
			if (fa->fa_info->fib_scope < flp->flowi4_scope)
1354 1355 1356 1357
				continue;
			fib_alias_accessed(fa);
			err = fib_props[fa->fa_type].error;
			if (err) {
1358
#ifdef CONFIG_IP_FIB_TRIE_STATS
1359
				t->stats.semantic_match_passed++;
1360
#endif
1361
				return err;
1362 1363 1364 1365 1366 1367 1368 1369
			}
			if (fi->fib_flags & RTNH_F_DEAD)
				continue;
			for (nhsel = 0; nhsel < fi->fib_nhs; nhsel++) {
				const struct fib_nh *nh = &fi->fib_nh[nhsel];

				if (nh->nh_flags & RTNH_F_DEAD)
					continue;
1370
				if (flp->flowi4_oif && flp->flowi4_oif != nh->nh_oif)
1371 1372 1373 1374 1375
					continue;

#ifdef CONFIG_IP_FIB_TRIE_STATS
				t->stats.semantic_match_passed++;
#endif
1376
				res->prefixlen = li->plen;
1377 1378
				res->nh_sel = nhsel;
				res->type = fa->fa_type;
1379
				res->scope = fa->fa_info->fib_scope;
1380 1381 1382 1383
				res->fi = fi;
				res->table = tb;
				res->fa_head = &li->falh;
				if (!(fib_flags & FIB_LOOKUP_NOREF))
1384
					atomic_inc(&fi->fib_clntref);
1385 1386 1387 1388 1389 1390
				return 0;
			}
		}

#ifdef CONFIG_IP_FIB_TRIE_STATS
		t->stats.semantic_match_miss++;
1391 1392
#endif
	}
1393

1394
	return 1;
1395 1396
}

1397
int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp,
E
Eric Dumazet 已提交
1398
		     struct fib_result *res, int fib_flags)
1399 1400
{
	struct trie *t = (struct trie *) tb->tb_data;
1401
	int ret;
1402
	struct rt_trie_node *n;
1403
	struct tnode *pn;
1404
	unsigned int pos, bits;
1405
	t_key key = ntohl(flp->daddr);
1406
	unsigned int chopped_off;
1407
	t_key cindex = 0;
1408
	unsigned int current_prefix_length = KEYLENGTH;
O
Olof Johansson 已提交
1409
	struct tnode *cn;
1410
	t_key pref_mismatch;
O
Olof Johansson 已提交
1411

R
Robert Olsson 已提交
1412
	rcu_read_lock();
O
Olof Johansson 已提交
1413

R
Robert Olsson 已提交
1414
	n = rcu_dereference(t->trie);
1415
	if (!n)
1416 1417 1418 1419 1420 1421 1422 1423
		goto failed;

#ifdef CONFIG_IP_FIB_TRIE_STATS
	t->stats.gets++;
#endif

	/* Just a leaf? */
	if (IS_LEAF(n)) {
1424
		ret = check_leaf(tb, t, (struct leaf *)n, key, flp, res, fib_flags);
1425
		goto found;
1426
	}
1427

1428 1429
	pn = (struct tnode *) n;
	chopped_off = 0;
1430

O
Olof Johansson 已提交
1431
	while (pn) {
1432 1433 1434
		pos = pn->pos;
		bits = pn->bits;

1435
		if (!chopped_off)
S
Stephen Hemminger 已提交
1436 1437
			cindex = tkey_extract_bits(mask_pfx(key, current_prefix_length),
						   pos, bits);
1438

1439
		n = tnode_get_child_rcu(pn, cindex);
1440 1441 1442 1443 1444 1445 1446 1447

		if (n == NULL) {
#ifdef CONFIG_IP_FIB_TRIE_STATS
			t->stats.null_node_hit++;
#endif
			goto backtrace;
		}

O
Olof Johansson 已提交
1448
		if (IS_LEAF(n)) {
1449
			ret = check_leaf(tb, t, (struct leaf *)n, key, flp, res, fib_flags);
1450
			if (ret > 0)
O
Olof Johansson 已提交
1451
				goto backtrace;
1452
			goto found;
O
Olof Johansson 已提交
1453 1454 1455
		}

		cn = (struct tnode *)n;
1456

O
Olof Johansson 已提交
1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471
		/*
		 * It's a tnode, and we can do some extra checks here if we
		 * like, to avoid descending into a dead-end branch.
		 * This tnode is in the parent's child array at index
		 * key[p_pos..p_pos+p_bits] but potentially with some bits
		 * chopped off, so in reality the index may be just a
		 * subprefix, padded with zero at the end.
		 * We can also take a look at any skipped bits in this
		 * tnode - everything up to p_pos is supposed to be ok,
		 * and the non-chopped bits of the index (se previous
		 * paragraph) are also guaranteed ok, but the rest is
		 * considered unknown.
		 *
		 * The skipped bits are key[pos+bits..cn->pos].
		 */
1472

O
Olof Johansson 已提交
1473 1474 1475 1476 1477 1478 1479 1480 1481
		/* If current_prefix_length < pos+bits, we are already doing
		 * actual prefix  matching, which means everything from
		 * pos+(bits-chopped_off) onward must be zero along some
		 * branch of this subtree - otherwise there is *no* valid
		 * prefix present. Here we can only check the skipped
		 * bits. Remember, since we have already indexed into the
		 * parent's child array, we know that the bits we chopped of
		 * *are* zero.
		 */
1482

1483 1484
		/* NOTA BENE: Checking only skipped bits
		   for the new node here */
1485

O
Olof Johansson 已提交
1486 1487
		if (current_prefix_length < pos+bits) {
			if (tkey_extract_bits(cn->key, current_prefix_length,
1488 1489
						cn->pos - current_prefix_length)
			    || !(cn->child[0]))
O
Olof Johansson 已提交
1490 1491
				goto backtrace;
		}
1492

O
Olof Johansson 已提交
1493 1494 1495 1496 1497 1498 1499 1500 1501
		/*
		 * If chopped_off=0, the index is fully validated and we
		 * only need to look at the skipped bits for this, the new,
		 * tnode. What we actually want to do is to find out if
		 * these skipped bits match our key perfectly, or if we will
		 * have to count on finding a matching prefix further down,
		 * because if we do, we would like to have some way of
		 * verifying the existence of such a prefix at this point.
		 */
1502

O
Olof Johansson 已提交
1503 1504 1505 1506 1507 1508 1509 1510
		/* The only thing we can do at this point is to verify that
		 * any such matching prefix can indeed be a prefix to our
		 * key, and if the bits in the node we are inspecting that
		 * do not match our key are not ZERO, this cannot be true.
		 * Thus, find out where there is a mismatch (before cn->pos)
		 * and verify that all the mismatching bits are zero in the
		 * new tnode's key.
		 */
1511

1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522
		/*
		 * Note: We aren't very concerned about the piece of
		 * the key that precede pn->pos+pn->bits, since these
		 * have already been checked. The bits after cn->pos
		 * aren't checked since these are by definition
		 * "unknown" at this point. Thus, what we want to see
		 * is if we are about to enter the "prefix matching"
		 * state, and in that case verify that the skipped
		 * bits that will prevail throughout this subtree are
		 * zero, as they have to be if we are to find a
		 * matching prefix.
O
Olof Johansson 已提交
1523 1524
		 */

1525
		pref_mismatch = mask_pfx(cn->key ^ key, cn->pos);
O
Olof Johansson 已提交
1526

1527 1528 1529 1530
		/*
		 * In short: If skipped bits in this node do not match
		 * the search key, enter the "prefix matching"
		 * state.directly.
O
Olof Johansson 已提交
1531 1532
		 */
		if (pref_mismatch) {
1533 1534
			/* fls(x) = __fls(x) + 1 */
			int mp = KEYLENGTH - __fls(pref_mismatch) - 1;
O
Olof Johansson 已提交
1535

1536
			if (tkey_extract_bits(cn->key, mp, cn->pos - mp) != 0)
O
Olof Johansson 已提交
1537 1538 1539 1540
				goto backtrace;

			if (current_prefix_length >= cn->pos)
				current_prefix_length = mp;
1541
		}
1542

O
Olof Johansson 已提交
1543 1544 1545 1546
		pn = (struct tnode *)n; /* Descend */
		chopped_off = 0;
		continue;

1547 1548 1549 1550
backtrace:
		chopped_off++;

		/* As zero don't change the child key (cindex) */
1551 1552
		while ((chopped_off <= pn->bits)
		       && !(cindex & (1<<(chopped_off-1))))
1553 1554 1555 1556
			chopped_off++;

		/* Decrease current_... with bits chopped off */
		if (current_prefix_length > pn->pos + pn->bits - chopped_off)
1557 1558
			current_prefix_length = pn->pos + pn->bits
				- chopped_off;
O
Olof Johansson 已提交
1559

1560
		/*
1561
		 * Either we do the actual chop off according or if we have
1562 1563 1564
		 * chopped off all bits in this tnode walk up to our parent.
		 */

O
Olof Johansson 已提交
1565
		if (chopped_off <= pn->bits) {
1566
			cindex &= ~(1 << (chopped_off-1));
O
Olof Johansson 已提交
1567
		} else {
1568
			struct tnode *parent = node_parent_rcu((struct rt_trie_node *) pn);
S
Stephen Hemminger 已提交
1569
			if (!parent)
1570
				goto failed;
O
Olof Johansson 已提交
1571

1572
			/* Get Child's index */
S
Stephen Hemminger 已提交
1573 1574
			cindex = tkey_extract_bits(pn->key, parent->pos, parent->bits);
			pn = parent;
1575 1576 1577 1578 1579 1580
			chopped_off = 0;

#ifdef CONFIG_IP_FIB_TRIE_STATS
			t->stats.backtrack++;
#endif
			goto backtrace;
1581
		}
1582 1583
	}
failed:
1584
	ret = 1;
1585
found:
R
Robert Olsson 已提交
1586
	rcu_read_unlock();
1587 1588
	return ret;
}
1589
EXPORT_SYMBOL_GPL(fib_table_lookup);
1590

1591 1592 1593 1594
/*
 * Remove the leaf and return parent.
 */
static void trie_leaf_remove(struct trie *t, struct leaf *l)
1595
{
1596
	struct tnode *tp = node_parent((struct rt_trie_node *) l);
1597

1598
	pr_debug("entering trie_leaf_remove(%p)\n", l);
1599

1600
	if (tp) {
1601
		t_key cindex = tkey_extract_bits(l->key, tp->pos, tp->bits);
L
Lin Ming 已提交
1602
		put_child(tp, cindex, NULL);
1603
		trie_rebalance(t, tp);
O
Olof Johansson 已提交
1604
	} else
1605
		RCU_INIT_POINTER(t->trie, NULL);
1606

1607
	free_leaf(l);
1608 1609
}

1610 1611 1612
/*
 * Caller must hold RTNL.
 */
1613
int fib_table_delete(struct fib_table *tb, struct fib_config *cfg)
1614 1615 1616
{
	struct trie *t = (struct trie *) tb->tb_data;
	u32 key, mask;
1617 1618
	int plen = cfg->fc_dst_len;
	u8 tos = cfg->fc_tos;
1619 1620 1621
	struct fib_alias *fa, *fa_to_delete;
	struct list_head *fa_head;
	struct leaf *l;
O
Olof Johansson 已提交
1622 1623
	struct leaf_info *li;

1624
	if (plen > 32)
1625 1626
		return -EINVAL;

1627
	key = ntohl(cfg->fc_dst);
O
Olof Johansson 已提交
1628
	mask = ntohl(inet_make_mask(plen));
1629

1630
	if (key & ~mask)
1631 1632 1633 1634 1635
		return -EINVAL;

	key = key & mask;
	l = fib_find_node(t, key);

1636
	if (!l)
1637 1638
		return -ESRCH;

1639 1640 1641 1642 1643 1644
	li = find_leaf_info(l, plen);

	if (!li)
		return -ESRCH;

	fa_head = &li->falh;
1645 1646 1647 1648 1649
	fa = fib_find_alias(fa_head, tos, 0);

	if (!fa)
		return -ESRCH;

S
Stephen Hemminger 已提交
1650
	pr_debug("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t);
1651 1652

	fa_to_delete = NULL;
1653 1654
	fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
	list_for_each_entry_continue(fa, fa_head, fa_list) {
1655 1656 1657 1658 1659
		struct fib_info *fi = fa->fa_info;

		if (fa->fa_tos != tos)
			break;

1660 1661
		if ((!cfg->fc_type || fa->fa_type == cfg->fc_type) &&
		    (cfg->fc_scope == RT_SCOPE_NOWHERE ||
1662
		     fa->fa_info->fib_scope == cfg->fc_scope) &&
1663 1664
		    (!cfg->fc_prefsrc ||
		     fi->fib_prefsrc == cfg->fc_prefsrc) &&
1665 1666 1667
		    (!cfg->fc_protocol ||
		     fi->fib_protocol == cfg->fc_protocol) &&
		    fib_nh_match(cfg, fi) == 0) {
1668 1669 1670 1671 1672
			fa_to_delete = fa;
			break;
		}
	}

O
Olof Johansson 已提交
1673 1674
	if (!fa_to_delete)
		return -ESRCH;
1675

O
Olof Johansson 已提交
1676
	fa = fa_to_delete;
1677
	rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id,
1678
		  &cfg->fc_nlinfo, 0);
O
Olof Johansson 已提交
1679

R
Robert Olsson 已提交
1680
	list_del_rcu(&fa->fa_list);
1681

1682 1683 1684
	if (!plen)
		tb->tb_num_default--;

O
Olof Johansson 已提交
1685
	if (list_empty(fa_head)) {
R
Robert Olsson 已提交
1686
		hlist_del_rcu(&li->hlist);
O
Olof Johansson 已提交
1687
		free_leaf_info(li);
R
Robert Olsson 已提交
1688
	}
1689

O
Olof Johansson 已提交
1690
	if (hlist_empty(&l->list))
1691
		trie_leaf_remove(t, l);
1692

O
Olof Johansson 已提交
1693
	if (fa->fa_state & FA_S_ACCESSED)
1694
		rt_cache_flush(cfg->fc_nlinfo.nl_net);
1695

R
Robert Olsson 已提交
1696 1697
	fib_release_info(fa->fa_info);
	alias_free_mem_rcu(fa);
O
Olof Johansson 已提交
1698
	return 0;
1699 1700
}

1701
static int trie_flush_list(struct list_head *head)
1702 1703 1704 1705 1706 1707 1708
{
	struct fib_alias *fa, *fa_node;
	int found = 0;

	list_for_each_entry_safe(fa, fa_node, head, fa_list) {
		struct fib_info *fi = fa->fa_info;

R
Robert Olsson 已提交
1709 1710 1711 1712
		if (fi && (fi->fib_flags & RTNH_F_DEAD)) {
			list_del_rcu(&fa->fa_list);
			fib_release_info(fa->fa_info);
			alias_free_mem_rcu(fa);
1713 1714 1715 1716 1717 1718
			found++;
		}
	}
	return found;
}

1719
static int trie_flush_leaf(struct leaf *l)
1720 1721 1722
{
	int found = 0;
	struct hlist_head *lih = &l->list;
1723
	struct hlist_node *tmp;
1724 1725
	struct leaf_info *li = NULL;

1726
	hlist_for_each_entry_safe(li, tmp, lih, hlist) {
1727
		found += trie_flush_list(&li->falh);
1728 1729

		if (list_empty(&li->falh)) {
R
Robert Olsson 已提交
1730
			hlist_del_rcu(&li->hlist);
1731 1732 1733 1734 1735 1736
			free_leaf_info(li);
		}
	}
	return found;
}

1737 1738 1739 1740
/*
 * Scan for the next right leaf starting at node p->child[idx]
 * Since we have back pointer, no recursion necessary.
 */
1741
static struct leaf *leaf_walk_rcu(struct tnode *p, struct rt_trie_node *c)
1742
{
1743 1744
	do {
		t_key idx;
1745 1746

		if (c)
1747
			idx = tkey_extract_bits(c->key, p->pos, p->bits) + 1;
1748
		else
1749
			idx = 0;
R
Robert Olsson 已提交
1750

1751 1752
		while (idx < 1u << p->bits) {
			c = tnode_get_child_rcu(p, idx++);
R
Robert Olsson 已提交
1753
			if (!c)
O
Olof Johansson 已提交
1754 1755
				continue;

1756
			if (IS_LEAF(c))
1757 1758 1759 1760 1761
				return (struct leaf *) c;

			/* Rescan start scanning in new node */
			p = (struct tnode *) c;
			idx = 0;
1762
		}
1763 1764

		/* Node empty, walk back up to parent */
1765
		c = (struct rt_trie_node *) p;
E
Eric Dumazet 已提交
1766
	} while ((p = node_parent_rcu(c)) != NULL);
1767 1768 1769 1770 1771 1772

	return NULL; /* Root of trie */
}

static struct leaf *trie_firstleaf(struct trie *t)
{
E
Eric Dumazet 已提交
1773
	struct tnode *n = (struct tnode *)rcu_dereference_rtnl(t->trie);
1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785

	if (!n)
		return NULL;

	if (IS_LEAF(n))          /* trie is just a leaf */
		return (struct leaf *) n;

	return leaf_walk_rcu(n, NULL);
}

static struct leaf *trie_nextleaf(struct leaf *l)
{
1786
	struct rt_trie_node *c = (struct rt_trie_node *) l;
1787
	struct tnode *p = node_parent_rcu(c);
1788 1789 1790 1791 1792

	if (!p)
		return NULL;	/* trie with just one leaf */

	return leaf_walk_rcu(p, c);
1793 1794
}

1795 1796 1797 1798
static struct leaf *trie_leafindex(struct trie *t, int index)
{
	struct leaf *l = trie_firstleaf(t);

S
Stephen Hemminger 已提交
1799
	while (l && index-- > 0)
1800
		l = trie_nextleaf(l);
S
Stephen Hemminger 已提交
1801

1802 1803 1804 1805
	return l;
}


1806 1807 1808
/*
 * Caller must hold RTNL.
 */
1809
int fib_table_flush(struct fib_table *tb)
1810 1811
{
	struct trie *t = (struct trie *) tb->tb_data;
1812
	struct leaf *l, *ll = NULL;
1813
	int found = 0;
1814

1815
	for (l = trie_firstleaf(t); l; l = trie_nextleaf(l)) {
1816
		found += trie_flush_leaf(l);
1817 1818

		if (ll && hlist_empty(&ll->list))
1819
			trie_leaf_remove(t, ll);
1820 1821 1822 1823
		ll = l;
	}

	if (ll && hlist_empty(&ll->list))
1824
		trie_leaf_remove(t, ll);
1825

S
Stephen Hemminger 已提交
1826
	pr_debug("trie_flush found=%d\n", found);
1827 1828 1829
	return found;
}

1830 1831 1832 1833 1834
void fib_free_table(struct fib_table *tb)
{
	kfree(tb);
}

1835 1836
static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah,
			   struct fib_table *tb,
1837 1838 1839 1840
			   struct sk_buff *skb, struct netlink_callback *cb)
{
	int i, s_i;
	struct fib_alias *fa;
A
Al Viro 已提交
1841
	__be32 xkey = htonl(key);
1842

1843
	s_i = cb->args[5];
1844 1845
	i = 0;

R
Robert Olsson 已提交
1846 1847 1848
	/* rcu_read_lock is hold by caller */

	list_for_each_entry_rcu(fa, fah, fa_list) {
1849 1850 1851 1852 1853
		if (i < s_i) {
			i++;
			continue;
		}

1854
		if (fib_dump_info(skb, NETLINK_CB(cb->skb).portid,
1855 1856 1857 1858
				  cb->nlh->nlmsg_seq,
				  RTM_NEWROUTE,
				  tb->tb_id,
				  fa->fa_type,
1859
				  xkey,
1860 1861
				  plen,
				  fa->fa_tos,
1862
				  fa->fa_info, NLM_F_MULTI) < 0) {
1863
			cb->args[5] = i;
1864
			return -1;
O
Olof Johansson 已提交
1865
		}
1866 1867
		i++;
	}
1868
	cb->args[5] = i;
1869 1870 1871
	return skb->len;
}

1872 1873
static int fn_trie_dump_leaf(struct leaf *l, struct fib_table *tb,
			struct sk_buff *skb, struct netlink_callback *cb)
1874
{
1875 1876
	struct leaf_info *li;
	int i, s_i;
1877

1878
	s_i = cb->args[4];
1879
	i = 0;
1880

1881
	/* rcu_read_lock is hold by caller */
1882
	hlist_for_each_entry_rcu(li, &l->list, hlist) {
1883 1884
		if (i < s_i) {
			i++;
1885
			continue;
1886
		}
O
Olof Johansson 已提交
1887

1888
		if (i > s_i)
1889
			cb->args[5] = 0;
1890

1891
		if (list_empty(&li->falh))
1892 1893
			continue;

1894
		if (fn_trie_dump_fa(l->key, li->plen, &li->falh, tb, skb, cb) < 0) {
1895
			cb->args[4] = i;
1896 1897
			return -1;
		}
1898
		i++;
1899
	}
1900

1901
	cb->args[4] = i;
1902 1903 1904
	return skb->len;
}

1905 1906
int fib_table_dump(struct fib_table *tb, struct sk_buff *skb,
		   struct netlink_callback *cb)
1907
{
1908
	struct leaf *l;
1909
	struct trie *t = (struct trie *) tb->tb_data;
1910
	t_key key = cb->args[2];
1911
	int count = cb->args[3];
1912

R
Robert Olsson 已提交
1913
	rcu_read_lock();
1914 1915 1916
	/* Dump starting at last key.
	 * Note: 0.0.0.0/0 (ie default) is first key.
	 */
1917
	if (count == 0)
1918 1919
		l = trie_firstleaf(t);
	else {
1920 1921 1922
		/* Normally, continue from last key, but if that is missing
		 * fallback to using slow rescan
		 */
1923
		l = fib_find_node(t, key);
1924 1925
		if (!l)
			l = trie_leafindex(t, count);
1926
	}
1927

1928 1929
	while (l) {
		cb->args[2] = l->key;
1930
		if (fn_trie_dump_leaf(l, tb, skb, cb) < 0) {
1931
			cb->args[3] = count;
1932 1933
			rcu_read_unlock();
			return -1;
1934
		}
1935

1936
		++count;
1937
		l = trie_nextleaf(l);
1938 1939
		memset(&cb->args[4], 0,
		       sizeof(cb->args) - 4*sizeof(cb->args[0]));
1940
	}
1941
	cb->args[3] = count;
R
Robert Olsson 已提交
1942
	rcu_read_unlock();
1943

1944 1945 1946
	return skb->len;
}

1947
void __init fib_trie_init(void)
1948
{
1949 1950
	fn_alias_kmem = kmem_cache_create("ip_fib_alias",
					  sizeof(struct fib_alias),
1951 1952 1953 1954 1955 1956
					  0, SLAB_PANIC, NULL);

	trie_leaf_kmem = kmem_cache_create("ip_fib_trie",
					   max(sizeof(struct leaf),
					       sizeof(struct leaf_info)),
					   0, SLAB_PANIC, NULL);
1957
}
1958

1959

1960
struct fib_table *fib_trie_table(u32 id)
1961 1962 1963 1964 1965 1966 1967 1968 1969 1970
{
	struct fib_table *tb;
	struct trie *t;

	tb = kmalloc(sizeof(struct fib_table) + sizeof(struct trie),
		     GFP_KERNEL);
	if (tb == NULL)
		return NULL;

	tb->tb_id = id;
1971
	tb->tb_default = -1;
1972
	tb->tb_num_default = 0;
1973 1974

	t = (struct trie *) tb->tb_data;
1975
	memset(t, 0, sizeof(*t));
1976 1977 1978 1979

	return tb;
}

1980 1981 1982
#ifdef CONFIG_PROC_FS
/* Depth first Trie walk iterator */
struct fib_trie_iter {
1983
	struct seq_net_private p;
1984
	struct fib_table *tb;
1985
	struct tnode *tnode;
E
Eric Dumazet 已提交
1986 1987
	unsigned int index;
	unsigned int depth;
1988
};
1989

1990
static struct rt_trie_node *fib_trie_get_next(struct fib_trie_iter *iter)
1991
{
1992
	struct tnode *tn = iter->tnode;
E
Eric Dumazet 已提交
1993
	unsigned int cindex = iter->index;
1994
	struct tnode *p;
1995

1996 1997 1998 1999
	/* A single entry routing table */
	if (!tn)
		return NULL;

2000 2001 2002 2003
	pr_debug("get_next iter={node=%p index=%d depth=%d}\n",
		 iter->tnode, iter->index, iter->depth);
rescan:
	while (cindex < (1<<tn->bits)) {
2004
		struct rt_trie_node *n = tnode_get_child_rcu(tn, cindex);
2005

2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017
		if (n) {
			if (IS_LEAF(n)) {
				iter->tnode = tn;
				iter->index = cindex + 1;
			} else {
				/* push down one level */
				iter->tnode = (struct tnode *) n;
				iter->index = 0;
				++iter->depth;
			}
			return n;
		}
2018

2019 2020
		++cindex;
	}
O
Olof Johansson 已提交
2021

2022
	/* Current node exhausted, pop back up */
2023
	p = node_parent_rcu((struct rt_trie_node *)tn);
2024 2025 2026 2027 2028
	if (p) {
		cindex = tkey_extract_bits(tn->key, p->pos, p->bits)+1;
		tn = p;
		--iter->depth;
		goto rescan;
2029
	}
2030 2031 2032

	/* got root? */
	return NULL;
2033 2034
}

2035
static struct rt_trie_node *fib_trie_get_first(struct fib_trie_iter *iter,
2036
				       struct trie *t)
2037
{
2038
	struct rt_trie_node *n;
2039

S
Stephen Hemminger 已提交
2040
	if (!t)
2041 2042 2043
		return NULL;

	n = rcu_dereference(t->trie);
2044
	if (!n)
2045
		return NULL;
2046

2047 2048 2049 2050 2051 2052 2053 2054
	if (IS_TNODE(n)) {
		iter->tnode = (struct tnode *) n;
		iter->index = 0;
		iter->depth = 1;
	} else {
		iter->tnode = NULL;
		iter->index = 0;
		iter->depth = 0;
O
Olof Johansson 已提交
2055
	}
2056 2057

	return n;
2058
}
O
Olof Johansson 已提交
2059

2060 2061
static void trie_collect_stats(struct trie *t, struct trie_stat *s)
{
2062
	struct rt_trie_node *n;
2063
	struct fib_trie_iter iter;
O
Olof Johansson 已提交
2064

2065
	memset(s, 0, sizeof(*s));
O
Olof Johansson 已提交
2066

2067
	rcu_read_lock();
2068
	for (n = fib_trie_get_first(&iter, t); n; n = fib_trie_get_next(&iter)) {
2069
		if (IS_LEAF(n)) {
2070 2071 2072
			struct leaf *l = (struct leaf *)n;
			struct leaf_info *li;

2073 2074 2075 2076
			s->leaves++;
			s->totdepth += iter.depth;
			if (iter.depth > s->maxdepth)
				s->maxdepth = iter.depth;
2077

2078
			hlist_for_each_entry_rcu(li, &l->list, hlist)
2079
				++s->prefixes;
2080 2081 2082 2083 2084
		} else {
			const struct tnode *tn = (const struct tnode *) n;
			int i;

			s->tnodes++;
S
Stephen Hemminger 已提交
2085
			if (tn->bits < MAX_STAT_DEPTH)
R
Robert Olsson 已提交
2086 2087
				s->nodesizes[tn->bits]++;

2088 2089 2090
			for (i = 0; i < (1<<tn->bits); i++)
				if (!tn->child[i])
					s->nullpointers++;
2091 2092
		}
	}
R
Robert Olsson 已提交
2093
	rcu_read_unlock();
2094 2095
}

2096 2097 2098 2099
/*
 *	This outputs /proc/net/fib_triestats
 */
static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat)
2100
{
E
Eric Dumazet 已提交
2101
	unsigned int i, max, pointers, bytes, avdepth;
2102

2103 2104 2105 2106
	if (stat->leaves)
		avdepth = stat->totdepth*100 / stat->leaves;
	else
		avdepth = 0;
O
Olof Johansson 已提交
2107

2108 2109
	seq_printf(seq, "\tAver depth:     %u.%02d\n",
		   avdepth / 100, avdepth % 100);
2110
	seq_printf(seq, "\tMax depth:      %u\n", stat->maxdepth);
O
Olof Johansson 已提交
2111

2112 2113
	seq_printf(seq, "\tLeaves:         %u\n", stat->leaves);
	bytes = sizeof(struct leaf) * stat->leaves;
2114 2115 2116 2117

	seq_printf(seq, "\tPrefixes:       %u\n", stat->prefixes);
	bytes += sizeof(struct leaf_info) * stat->prefixes;

2118
	seq_printf(seq, "\tInternal nodes: %u\n\t", stat->tnodes);
2119
	bytes += sizeof(struct tnode) * stat->tnodes;
2120

R
Robert Olsson 已提交
2121 2122
	max = MAX_STAT_DEPTH;
	while (max > 0 && stat->nodesizes[max-1] == 0)
2123
		max--;
2124

2125
	pointers = 0;
2126
	for (i = 1; i < max; i++)
2127
		if (stat->nodesizes[i] != 0) {
2128
			seq_printf(seq, "  %u: %u",  i, stat->nodesizes[i]);
2129 2130 2131
			pointers += (1<<i) * stat->nodesizes[i];
		}
	seq_putc(seq, '\n');
2132
	seq_printf(seq, "\tPointers: %u\n", pointers);
R
Robert Olsson 已提交
2133

2134
	bytes += sizeof(struct rt_trie_node *) * pointers;
2135 2136
	seq_printf(seq, "Null ptrs: %u\n", stat->nullpointers);
	seq_printf(seq, "Total size: %u  kB\n", (bytes + 1023) / 1024);
2137
}
R
Robert Olsson 已提交
2138

2139
#ifdef CONFIG_IP_FIB_TRIE_STATS
2140 2141 2142 2143
static void trie_show_usage(struct seq_file *seq,
			    const struct trie_use_stats *stats)
{
	seq_printf(seq, "\nCounters:\n---------\n");
2144 2145 2146 2147 2148 2149 2150 2151 2152
	seq_printf(seq, "gets = %u\n", stats->gets);
	seq_printf(seq, "backtracks = %u\n", stats->backtrack);
	seq_printf(seq, "semantic match passed = %u\n",
		   stats->semantic_match_passed);
	seq_printf(seq, "semantic match miss = %u\n",
		   stats->semantic_match_miss);
	seq_printf(seq, "null node hit= %u\n", stats->null_node_hit);
	seq_printf(seq, "skipped node resize = %u\n\n",
		   stats->resize_node_skipped);
2153
}
2154 2155
#endif /*  CONFIG_IP_FIB_TRIE_STATS */

2156
static void fib_table_print(struct seq_file *seq, struct fib_table *tb)
2157
{
2158 2159 2160 2161 2162 2163
	if (tb->tb_id == RT_TABLE_LOCAL)
		seq_puts(seq, "Local:\n");
	else if (tb->tb_id == RT_TABLE_MAIN)
		seq_puts(seq, "Main:\n");
	else
		seq_printf(seq, "Id %d:\n", tb->tb_id);
2164
}
2165

2166

2167 2168
static int fib_triestat_seq_show(struct seq_file *seq, void *v)
{
2169
	struct net *net = (struct net *)seq->private;
2170
	unsigned int h;
2171

2172
	seq_printf(seq,
2173 2174
		   "Basic info: size of leaf:"
		   " %Zd bytes, size of tnode: %Zd bytes.\n",
2175 2176
		   sizeof(struct leaf), sizeof(struct tnode));

2177 2178 2179 2180
	for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
		struct hlist_head *head = &net->ipv4.fib_table_hash[h];
		struct fib_table *tb;

2181
		hlist_for_each_entry_rcu(tb, head, tb_hlist) {
2182 2183
			struct trie *t = (struct trie *) tb->tb_data;
			struct trie_stat stat;
2184

2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196
			if (!t)
				continue;

			fib_table_print(seq, tb);

			trie_collect_stats(t, &stat);
			trie_show_stats(seq, &stat);
#ifdef CONFIG_IP_FIB_TRIE_STATS
			trie_show_usage(seq, &t->stats);
#endif
		}
	}
2197

2198
	return 0;
2199 2200
}

2201
static int fib_triestat_seq_open(struct inode *inode, struct file *file)
2202
{
2203
	return single_open_net(inode, file, fib_triestat_seq_show);
2204 2205
}

2206
static const struct file_operations fib_triestat_fops = {
2207 2208 2209 2210
	.owner	= THIS_MODULE,
	.open	= fib_triestat_seq_open,
	.read	= seq_read,
	.llseek	= seq_lseek,
2211
	.release = single_release_net,
2212 2213
};

2214
static struct rt_trie_node *fib_trie_get_idx(struct seq_file *seq, loff_t pos)
2215
{
2216 2217
	struct fib_trie_iter *iter = seq->private;
	struct net *net = seq_file_net(seq);
2218
	loff_t idx = 0;
2219
	unsigned int h;
2220

2221 2222 2223
	for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
		struct hlist_head *head = &net->ipv4.fib_table_hash[h];
		struct fib_table *tb;
2224

2225
		hlist_for_each_entry_rcu(tb, head, tb_hlist) {
2226
			struct rt_trie_node *n;
2227 2228 2229 2230 2231 2232 2233 2234 2235

			for (n = fib_trie_get_first(iter,
						    (struct trie *) tb->tb_data);
			     n; n = fib_trie_get_next(iter))
				if (pos == idx++) {
					iter->tb = tb;
					return n;
				}
		}
2236
	}
2237

2238 2239 2240
	return NULL;
}

2241
static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos)
2242
	__acquires(RCU)
2243
{
2244
	rcu_read_lock();
2245
	return fib_trie_get_idx(seq, *pos);
2246 2247
}

2248
static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2249
{
2250
	struct fib_trie_iter *iter = seq->private;
2251
	struct net *net = seq_file_net(seq);
2252 2253 2254
	struct fib_table *tb = iter->tb;
	struct hlist_node *tb_node;
	unsigned int h;
2255
	struct rt_trie_node *n;
2256

2257
	++*pos;
2258 2259 2260 2261
	/* next node in same table */
	n = fib_trie_get_next(iter);
	if (n)
		return n;
2262

2263 2264
	/* walk rest of this hash chain */
	h = tb->tb_id & (FIB_TABLE_HASHSZ - 1);
E
Eric Dumazet 已提交
2265
	while ((tb_node = rcu_dereference(hlist_next_rcu(&tb->tb_hlist)))) {
2266 2267 2268 2269 2270
		tb = hlist_entry(tb_node, struct fib_table, tb_hlist);
		n = fib_trie_get_first(iter, (struct trie *) tb->tb_data);
		if (n)
			goto found;
	}
2271

2272 2273 2274
	/* new hash chain */
	while (++h < FIB_TABLE_HASHSZ) {
		struct hlist_head *head = &net->ipv4.fib_table_hash[h];
2275
		hlist_for_each_entry_rcu(tb, head, tb_hlist) {
2276 2277 2278 2279 2280
			n = fib_trie_get_first(iter, (struct trie *) tb->tb_data);
			if (n)
				goto found;
		}
	}
2281
	return NULL;
2282 2283 2284 2285

found:
	iter->tb = tb;
	return n;
2286
}
2287

2288
static void fib_trie_seq_stop(struct seq_file *seq, void *v)
2289
	__releases(RCU)
2290
{
2291 2292
	rcu_read_unlock();
}
O
Olof Johansson 已提交
2293

2294 2295
static void seq_indent(struct seq_file *seq, int n)
{
E
Eric Dumazet 已提交
2296 2297
	while (n-- > 0)
		seq_puts(seq, "   ");
2298
}
2299

2300
static inline const char *rtn_scope(char *buf, size_t len, enum rt_scope_t s)
2301
{
S
Stephen Hemminger 已提交
2302
	switch (s) {
2303 2304 2305 2306 2307 2308
	case RT_SCOPE_UNIVERSE: return "universe";
	case RT_SCOPE_SITE:	return "site";
	case RT_SCOPE_LINK:	return "link";
	case RT_SCOPE_HOST:	return "host";
	case RT_SCOPE_NOWHERE:	return "nowhere";
	default:
2309
		snprintf(buf, len, "scope=%d", s);
2310 2311 2312
		return buf;
	}
}
2313

2314
static const char *const rtn_type_names[__RTN_MAX] = {
2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327
	[RTN_UNSPEC] = "UNSPEC",
	[RTN_UNICAST] = "UNICAST",
	[RTN_LOCAL] = "LOCAL",
	[RTN_BROADCAST] = "BROADCAST",
	[RTN_ANYCAST] = "ANYCAST",
	[RTN_MULTICAST] = "MULTICAST",
	[RTN_BLACKHOLE] = "BLACKHOLE",
	[RTN_UNREACHABLE] = "UNREACHABLE",
	[RTN_PROHIBIT] = "PROHIBIT",
	[RTN_THROW] = "THROW",
	[RTN_NAT] = "NAT",
	[RTN_XRESOLVE] = "XRESOLVE",
};
2328

E
Eric Dumazet 已提交
2329
static inline const char *rtn_type(char *buf, size_t len, unsigned int t)
2330 2331 2332
{
	if (t < __RTN_MAX && rtn_type_names[t])
		return rtn_type_names[t];
2333
	snprintf(buf, len, "type %u", t);
2334
	return buf;
2335 2336
}

2337 2338
/* Pretty print the trie */
static int fib_trie_seq_show(struct seq_file *seq, void *v)
2339
{
2340
	const struct fib_trie_iter *iter = seq->private;
2341
	struct rt_trie_node *n = v;
2342

2343 2344
	if (!node_parent_rcu(n))
		fib_table_print(seq, iter->tb);
2345

2346 2347
	if (IS_TNODE(n)) {
		struct tnode *tn = (struct tnode *) n;
S
Stephen Hemminger 已提交
2348
		__be32 prf = htonl(mask_pfx(tn->key, tn->pos));
O
Olof Johansson 已提交
2349

2350
		seq_indent(seq, iter->depth-1);
2351 2352
		seq_printf(seq, "  +-- %pI4/%d %d %d %d\n",
			   &prf, tn->pos, tn->bits, tn->full_children,
2353
			   tn->empty_children);
2354

2355 2356
	} else {
		struct leaf *l = (struct leaf *) n;
2357
		struct leaf_info *li;
A
Al Viro 已提交
2358
		__be32 val = htonl(l->key);
2359 2360

		seq_indent(seq, iter->depth);
2361
		seq_printf(seq, "  |-- %pI4\n", &val);
2362

2363
		hlist_for_each_entry_rcu(li, &l->list, hlist) {
2364 2365 2366 2367 2368 2369 2370 2371
			struct fib_alias *fa;

			list_for_each_entry_rcu(fa, &li->falh, fa_list) {
				char buf1[32], buf2[32];

				seq_indent(seq, iter->depth+1);
				seq_printf(seq, "  /%d %s %s", li->plen,
					   rtn_scope(buf1, sizeof(buf1),
2372
						     fa->fa_info->fib_scope),
2373 2374 2375
					   rtn_type(buf2, sizeof(buf2),
						    fa->fa_type));
				if (fa->fa_tos)
2376
					seq_printf(seq, " tos=%d", fa->fa_tos);
2377
				seq_putc(seq, '\n');
2378 2379
			}
		}
2380
	}
2381

2382 2383 2384
	return 0;
}

2385
static const struct seq_operations fib_trie_seq_ops = {
2386 2387 2388 2389
	.start  = fib_trie_seq_start,
	.next   = fib_trie_seq_next,
	.stop   = fib_trie_seq_stop,
	.show   = fib_trie_seq_show,
2390 2391
};

2392
static int fib_trie_seq_open(struct inode *inode, struct file *file)
2393
{
2394 2395
	return seq_open_net(inode, file, &fib_trie_seq_ops,
			    sizeof(struct fib_trie_iter));
2396 2397
}

2398
static const struct file_operations fib_trie_fops = {
2399 2400 2401 2402
	.owner  = THIS_MODULE,
	.open   = fib_trie_seq_open,
	.read   = seq_read,
	.llseek = seq_lseek,
2403
	.release = seq_release_net,
2404 2405
};

2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445
struct fib_route_iter {
	struct seq_net_private p;
	struct trie *main_trie;
	loff_t	pos;
	t_key	key;
};

static struct leaf *fib_route_get_idx(struct fib_route_iter *iter, loff_t pos)
{
	struct leaf *l = NULL;
	struct trie *t = iter->main_trie;

	/* use cache location of last found key */
	if (iter->pos > 0 && pos >= iter->pos && (l = fib_find_node(t, iter->key)))
		pos -= iter->pos;
	else {
		iter->pos = 0;
		l = trie_firstleaf(t);
	}

	while (l && pos-- > 0) {
		iter->pos++;
		l = trie_nextleaf(l);
	}

	if (l)
		iter->key = pos;	/* remember it */
	else
		iter->pos = 0;		/* forget it */

	return l;
}

static void *fib_route_seq_start(struct seq_file *seq, loff_t *pos)
	__acquires(RCU)
{
	struct fib_route_iter *iter = seq->private;
	struct fib_table *tb;

	rcu_read_lock();
2446
	tb = fib_get_table(seq_file_net(seq), RT_TABLE_MAIN);
2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483
	if (!tb)
		return NULL;

	iter->main_trie = (struct trie *) tb->tb_data;
	if (*pos == 0)
		return SEQ_START_TOKEN;
	else
		return fib_route_get_idx(iter, *pos - 1);
}

static void *fib_route_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct fib_route_iter *iter = seq->private;
	struct leaf *l = v;

	++*pos;
	if (v == SEQ_START_TOKEN) {
		iter->pos = 0;
		l = trie_firstleaf(iter->main_trie);
	} else {
		iter->pos++;
		l = trie_nextleaf(l);
	}

	if (l)
		iter->key = l->key;
	else
		iter->pos = 0;
	return l;
}

static void fib_route_seq_stop(struct seq_file *seq, void *v)
	__releases(RCU)
{
	rcu_read_unlock();
}

E
Eric Dumazet 已提交
2484
static unsigned int fib_flag_trans(int type, __be32 mask, const struct fib_info *fi)
2485
{
E
Eric Dumazet 已提交
2486
	unsigned int flags = 0;
2487

E
Eric Dumazet 已提交
2488 2489
	if (type == RTN_UNREACHABLE || type == RTN_PROHIBIT)
		flags = RTF_REJECT;
2490 2491
	if (fi && fi->fib_nh->nh_gw)
		flags |= RTF_GATEWAY;
A
Al Viro 已提交
2492
	if (mask == htonl(0xFFFFFFFF))
2493 2494 2495
		flags |= RTF_HOST;
	flags |= RTF_UP;
	return flags;
2496 2497
}

2498 2499 2500
/*
 *	This outputs /proc/net/route.
 *	The format of the file is not supposed to be changed
E
Eric Dumazet 已提交
2501
 *	and needs to be same as fib_hash output to avoid breaking
2502 2503 2504
 *	legacy utilities
 */
static int fib_route_seq_show(struct seq_file *seq, void *v)
2505
{
2506
	struct leaf *l = v;
2507
	struct leaf_info *li;
2508

2509 2510 2511 2512 2513 2514
	if (v == SEQ_START_TOKEN) {
		seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway "
			   "\tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU"
			   "\tWindow\tIRTT");
		return 0;
	}
2515

2516
	hlist_for_each_entry_rcu(li, &l->list, hlist) {
2517
		struct fib_alias *fa;
A
Al Viro 已提交
2518
		__be32 mask, prefix;
O
Olof Johansson 已提交
2519

2520 2521
		mask = inet_make_mask(li->plen);
		prefix = htonl(l->key);
2522

2523
		list_for_each_entry_rcu(fa, &li->falh, fa_list) {
2524
			const struct fib_info *fi = fa->fa_info;
E
Eric Dumazet 已提交
2525
			unsigned int flags = fib_flag_trans(fa->fa_type, mask, fi);
2526

2527 2528 2529
			if (fa->fa_type == RTN_BROADCAST
			    || fa->fa_type == RTN_MULTICAST)
				continue;
2530

2531 2532
			seq_setwidth(seq, 127);

2533
			if (fi)
2534 2535
				seq_printf(seq,
					 "%s\t%08X\t%08X\t%04X\t%d\t%u\t"
2536
					 "%d\t%08X\t%d\t%u\t%u",
2537 2538 2539 2540 2541
					 fi->fib_dev ? fi->fib_dev->name : "*",
					 prefix,
					 fi->fib_nh->nh_gw, flags, 0, 0,
					 fi->fib_priority,
					 mask,
2542 2543
					 (fi->fib_advmss ?
					  fi->fib_advmss + 40 : 0),
2544
					 fi->fib_window,
2545
					 fi->fib_rtt >> 3);
2546
			else
2547 2548
				seq_printf(seq,
					 "*\t%08X\t%08X\t%04X\t%d\t%u\t"
2549
					 "%d\t%08X\t%d\t%u\t%u",
2550
					 prefix, 0, flags, 0, 0, 0,
2551
					 mask, 0, 0, 0);
2552

2553
			seq_pad(seq, '\n');
2554
		}
2555 2556 2557 2558 2559
	}

	return 0;
}

2560
static const struct seq_operations fib_route_seq_ops = {
2561 2562 2563
	.start  = fib_route_seq_start,
	.next   = fib_route_seq_next,
	.stop   = fib_route_seq_stop,
2564
	.show   = fib_route_seq_show,
2565 2566
};

2567
static int fib_route_seq_open(struct inode *inode, struct file *file)
2568
{
2569
	return seq_open_net(inode, file, &fib_route_seq_ops,
2570
			    sizeof(struct fib_route_iter));
2571 2572
}

2573
static const struct file_operations fib_route_fops = {
2574 2575 2576 2577
	.owner  = THIS_MODULE,
	.open   = fib_route_seq_open,
	.read   = seq_read,
	.llseek = seq_lseek,
2578
	.release = seq_release_net,
2579 2580
};

2581
int __net_init fib_proc_init(struct net *net)
2582
{
2583
	if (!proc_create("fib_trie", S_IRUGO, net->proc_net, &fib_trie_fops))
2584 2585
		goto out1;

2586 2587
	if (!proc_create("fib_triestat", S_IRUGO, net->proc_net,
			 &fib_triestat_fops))
2588 2589
		goto out2;

2590
	if (!proc_create("route", S_IRUGO, net->proc_net, &fib_route_fops))
2591 2592
		goto out3;

2593
	return 0;
2594 2595

out3:
2596
	remove_proc_entry("fib_triestat", net->proc_net);
2597
out2:
2598
	remove_proc_entry("fib_trie", net->proc_net);
2599 2600
out1:
	return -ENOMEM;
2601 2602
}

2603
void __net_exit fib_proc_exit(struct net *net)
2604
{
2605 2606 2607
	remove_proc_entry("fib_trie", net->proc_net);
	remove_proc_entry("fib_triestat", net->proc_net);
	remove_proc_entry("route", net->proc_net);
2608 2609 2610
}

#endif /* CONFIG_PROC_FS */