fib_trie.c 60.8 KB
Newer Older
1 2 3 4 5 6 7 8 9
/*
 *   This program is free software; you can redistribute it and/or
 *   modify it under the terms of the GNU General Public License
 *   as published by the Free Software Foundation; either version
 *   2 of the License, or (at your option) any later version.
 *
 *   Robert Olsson <robert.olsson@its.uu.se> Uppsala Universitet
 *     & Swedish University of Agricultural Sciences.
 *
10
 *   Jens Laas <jens.laas@data.slu.se> Swedish University of
11
 *     Agricultural Sciences.
12
 *
13 14
 *   Hans Liss <hans.liss@its.uu.se>  Uppsala Universitet
 *
L
Lucas De Marchi 已提交
15
 * This work is based on the LPC-trie which is originally described in:
16
 *
17 18
 * An experimental study of compression methods for dynamic tries
 * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
19
 * http://www.csc.kth.se/~snilsson/software/dyntrie2/
20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41
 *
 *
 * IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
 * IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999
 *
 *
 * Code from fib_hash has been reused which includes the following header:
 *
 *
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		IPv4 FIB: lookup engine and maintenance routines.
 *
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
R
Robert Olsson 已提交
42 43 44 45 46 47 48
 *
 * Substantial contributions to this work comes from:
 *
 *		David S. Miller, <davem@davemloft.net>
 *		Stephen Hemminger <shemminger@osdl.org>
 *		Paul E. McKenney <paulmck@us.ibm.com>
 *		Patrick McHardy <kaber@trash.net>
49 50
 */

J
Jens Låås 已提交
51
#define VERSION "0.409"
52 53

#include <asm/uaccess.h>
J
Jiri Slaby 已提交
54
#include <linux/bitops.h>
55 56 57 58 59 60 61 62 63
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
S
Stephen Hemminger 已提交
64
#include <linux/inetdevice.h>
65 66 67
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/proc_fs.h>
R
Robert Olsson 已提交
68
#include <linux/rcupdate.h>
69 70 71 72
#include <linux/skbuff.h>
#include <linux/netlink.h>
#include <linux/init.h>
#include <linux/list.h>
73
#include <linux/slab.h>
74
#include <linux/export.h>
75
#include <net/net_namespace.h>
76 77 78 79 80 81 82 83
#include <net/ip.h>
#include <net/protocol.h>
#include <net/route.h>
#include <net/tcp.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include "fib_lookup.h"

R
Robert Olsson 已提交
84
#define MAX_STAT_DEPTH 32
85 86 87 88 89 90 91 92

#define KEYLENGTH (8*sizeof(t_key))

typedef unsigned int t_key;

#define T_TNODE 0
#define T_LEAF  1
#define NODE_TYPE_MASK	0x1UL
R
Robert Olsson 已提交
93 94
#define NODE_TYPE(node) ((node)->parent & NODE_TYPE_MASK)

O
Olof Johansson 已提交
95 96
#define IS_TNODE(n) (!(n->parent & T_LEAF))
#define IS_LEAF(n) (n->parent & T_LEAF)
97

98
struct rt_trie_node {
O
Olof Johansson 已提交
99
	unsigned long parent;
100
	t_key key;
101 102 103
};

struct leaf {
O
Olof Johansson 已提交
104
	unsigned long parent;
105
	t_key key;
106
	struct hlist_head list;
R
Robert Olsson 已提交
107
	struct rcu_head rcu;
108 109 110 111 112
};

struct leaf_info {
	struct hlist_node hlist;
	int plen;
113
	u32 mask_plen; /* ntohl(inet_make_mask(plen)) */
114
	struct list_head falh;
115
	struct rcu_head rcu;
116 117 118
};

struct tnode {
O
Olof Johansson 已提交
119
	unsigned long parent;
120
	t_key key;
121 122
	unsigned char pos;		/* 2log(KEYLENGTH) bits needed */
	unsigned char bits;		/* 2log(KEYLENGTH) bits needed */
123 124
	unsigned int full_children;	/* KEYLENGTH bits needed */
	unsigned int empty_children;	/* KEYLENGTH bits needed */
125 126
	union {
		struct rcu_head rcu;
J
Jarek Poplawski 已提交
127
		struct tnode *tnode_free;
128
	};
E
Eric Dumazet 已提交
129
	struct rt_trie_node __rcu *child[0];
130 131 132 133 134 135 136 137 138
};

#ifdef CONFIG_IP_FIB_TRIE_STATS
struct trie_use_stats {
	unsigned int gets;
	unsigned int backtrack;
	unsigned int semantic_match_passed;
	unsigned int semantic_match_miss;
	unsigned int null_node_hit;
139
	unsigned int resize_node_skipped;
140 141 142 143 144 145 146 147 148
};
#endif

struct trie_stat {
	unsigned int totdepth;
	unsigned int maxdepth;
	unsigned int tnodes;
	unsigned int leaves;
	unsigned int nullpointers;
149
	unsigned int prefixes;
R
Robert Olsson 已提交
150
	unsigned int nodesizes[MAX_STAT_DEPTH];
151
};
152 153

struct trie {
E
Eric Dumazet 已提交
154
	struct rt_trie_node __rcu *trie;
155 156 157 158 159
#ifdef CONFIG_IP_FIB_TRIE_STATS
	struct trie_use_stats stats;
#endif
};

160
static void tnode_put_child_reorg(struct tnode *tn, int i, struct rt_trie_node *n,
161
				  int wasfull);
162
static struct rt_trie_node *resize(struct trie *t, struct tnode *tn);
163 164
static struct tnode *inflate(struct trie *t, struct tnode *tn);
static struct tnode *halve(struct trie *t, struct tnode *tn);
J
Jarek Poplawski 已提交
165 166
/* tnodes to free after resize(); protected by RTNL */
static struct tnode *tnode_free_head;
167 168 169 170 171 172 173 174
static size_t tnode_free_size;

/*
 * synchronize_rcu after call_rcu for that many pages; it should be especially
 * useful before resizing the root node with PREEMPT_NONE configs; the value was
 * obtained experimentally, aiming to avoid visible slowdown.
 */
static const int sync_pages = 128;
175

176
static struct kmem_cache *fn_alias_kmem __read_mostly;
177
static struct kmem_cache *trie_leaf_kmem __read_mostly;
178

E
Eric Dumazet 已提交
179 180 181 182
/*
 * caller must hold RTNL
 */
static inline struct tnode *node_parent(const struct rt_trie_node *node)
S
Stephen Hemminger 已提交
183
{
E
Eric Dumazet 已提交
184 185 186 187 188
	unsigned long parent;

	parent = rcu_dereference_index_check(node->parent, lockdep_rtnl_is_held());

	return (struct tnode *)(parent & ~NODE_TYPE_MASK);
189 190
}

E
Eric Dumazet 已提交
191 192 193 194
/*
 * caller must hold RCU read lock or RTNL
 */
static inline struct tnode *node_parent_rcu(const struct rt_trie_node *node)
195
{
E
Eric Dumazet 已提交
196 197 198 199
	unsigned long parent;

	parent = rcu_dereference_index_check(node->parent, rcu_read_lock_held() ||
							   lockdep_rtnl_is_held());
S
Stephen Hemminger 已提交
200

E
Eric Dumazet 已提交
201
	return (struct tnode *)(parent & ~NODE_TYPE_MASK);
S
Stephen Hemminger 已提交
202 203
}

204
/* Same as rcu_assign_pointer
205 206
 * but that macro() assumes that value is a pointer.
 */
207
static inline void node_set_parent(struct rt_trie_node *node, struct tnode *ptr)
S
Stephen Hemminger 已提交
208
{
209 210
	smp_wmb();
	node->parent = (unsigned long)ptr | NODE_TYPE(node);
S
Stephen Hemminger 已提交
211
}
R
Robert Olsson 已提交
212

E
Eric Dumazet 已提交
213 214 215 216
/*
 * caller must hold RTNL
 */
static inline struct rt_trie_node *tnode_get_child(const struct tnode *tn, unsigned int i)
217 218
{
	BUG_ON(i >= 1U << tn->bits);
R
Robert Olsson 已提交
219

E
Eric Dumazet 已提交
220
	return rtnl_dereference(tn->child[i]);
221 222
}

E
Eric Dumazet 已提交
223 224 225 226
/*
 * caller must hold RCU read lock or RTNL
 */
static inline struct rt_trie_node *tnode_get_child_rcu(const struct tnode *tn, unsigned int i)
227
{
E
Eric Dumazet 已提交
228
	BUG_ON(i >= 1U << tn->bits);
229

E
Eric Dumazet 已提交
230
	return rcu_dereference_rtnl(tn->child[i]);
231 232
}

S
Stephen Hemmigner 已提交
233
static inline int tnode_child_length(const struct tnode *tn)
234
{
O
Olof Johansson 已提交
235
	return 1 << tn->bits;
236 237
}

238
static inline t_key mask_pfx(t_key k, unsigned int l)
S
Stephen Hemminger 已提交
239 240 241 242
{
	return (l == 0) ? 0 : k >> (KEYLENGTH-l) << (KEYLENGTH-l);
}

243
static inline t_key tkey_extract_bits(t_key a, unsigned int offset, unsigned int bits)
244
{
O
Olof Johansson 已提交
245
	if (offset < KEYLENGTH)
246
		return ((t_key)(a << offset)) >> (KEYLENGTH - bits);
O
Olof Johansson 已提交
247
	else
248 249 250 251 252
		return 0;
}

static inline int tkey_equals(t_key a, t_key b)
{
253
	return a == b;
254 255 256 257
}

static inline int tkey_sub_equals(t_key a, int offset, int bits, t_key b)
{
258 259
	if (bits == 0 || offset >= KEYLENGTH)
		return 1;
O
Olof Johansson 已提交
260 261
	bits = bits > KEYLENGTH ? KEYLENGTH : bits;
	return ((a ^ b) << offset) >> (KEYLENGTH - bits) == 0;
262
}
263 264 265 266 267 268

static inline int tkey_mismatch(t_key a, int offset, t_key b)
{
	t_key diff = a ^ b;
	int i = offset;

269 270 271
	if (!diff)
		return 0;
	while ((diff << i) >> (KEYLENGTH-1) == 0)
272 273 274 275 276
		i++;
	return i;
}

/*
277 278
  To understand this stuff, an understanding of keys and all their bits is
  necessary. Every node in the trie has a key associated with it, but not
279 280 281 282
  all of the bits in that key are significant.

  Consider a node 'n' and its parent 'tp'.

283 284 285 286 287
  If n is a leaf, every bit in its key is significant. Its presence is
  necessitated by path compression, since during a tree traversal (when
  searching for a leaf - unless we are doing an insertion) we will completely
  ignore all skipped bits we encounter. Thus we need to verify, at the end of
  a potentially successful search, that we have indeed been walking the
288 289
  correct key path.

290 291 292 293 294
  Note that we can never "miss" the correct key in the tree if present by
  following the wrong path. Path compression ensures that segments of the key
  that are the same for all keys with a given prefix are skipped, but the
  skipped part *is* identical for each node in the subtrie below the skipped
  bit! trie_insert() in this implementation takes care of that - note the
295 296
  call to tkey_sub_equals() in trie_insert().

297
  if n is an internal node - a 'tnode' here, the various parts of its key
298 299
  have many different meanings.

300
  Example:
301 302 303
  _________________________________________________________________
  | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
  -----------------------------------------------------------------
304
    0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15
305 306 307 308 309 310 311 312 313

  _________________________________________________________________
  | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
  -----------------------------------------------------------------
   16  17  18  19  20  21  22  23  24  25  26  27  28  29  30  31

  tp->pos = 7
  tp->bits = 3
  n->pos = 15
O
Olof Johansson 已提交
314
  n->bits = 4
315

316 317
  First, let's just ignore the bits that come before the parent tp, that is
  the bits from 0 to (tp->pos-1). They are *known* but at this point we do
318 319 320
  not use them for anything.

  The bits from (tp->pos) to (tp->pos + tp->bits - 1) - "N", above - are the
321
  index into the parent's child array. That is, they will be used to find
322 323 324 325 326
  'n' among tp's children.

  The bits from (tp->pos + tp->bits) to (n->pos - 1) - "S" - are skipped bits
  for the node n.

327
  All the bits we have seen so far are significant to the node n. The rest
328 329
  of the bits are really not needed or indeed known in n->key.

330
  The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into
331
  n's child array, and will of course be different for each child.
332

333

334 335 336 337 338
  The rest of the bits, from (n->pos + n->bits) onward, are completely unknown
  at this point.

*/

S
Stephen Hemminger 已提交
339
static inline void check_tnode(const struct tnode *tn)
340
{
S
Stephen Hemminger 已提交
341
	WARN_ON(tn && tn->pos+tn->bits > 32);
342 343
}

344 345
static const int halve_threshold = 25;
static const int inflate_threshold = 50;
346
static const int halve_threshold_root = 15;
J
Jens Låås 已提交
347
static const int inflate_threshold_root = 30;
R
Robert Olsson 已提交
348 349

static void __alias_free_mem(struct rcu_head *head)
350
{
R
Robert Olsson 已提交
351 352
	struct fib_alias *fa = container_of(head, struct fib_alias, rcu);
	kmem_cache_free(fn_alias_kmem, fa);
353 354
}

R
Robert Olsson 已提交
355
static inline void alias_free_mem_rcu(struct fib_alias *fa)
356
{
R
Robert Olsson 已提交
357 358
	call_rcu(&fa->rcu, __alias_free_mem);
}
O
Olof Johansson 已提交
359

R
Robert Olsson 已提交
360 361
static void __leaf_free_rcu(struct rcu_head *head)
{
362 363
	struct leaf *l = container_of(head, struct leaf, rcu);
	kmem_cache_free(trie_leaf_kmem, l);
R
Robert Olsson 已提交
364
}
O
Olof Johansson 已提交
365

366 367
static inline void free_leaf(struct leaf *l)
{
368
	call_rcu(&l->rcu, __leaf_free_rcu);
369 370
}

R
Robert Olsson 已提交
371
static inline void free_leaf_info(struct leaf_info *leaf)
372
{
373
	kfree_rcu(leaf, rcu);
374 375
}

376
static struct tnode *tnode_alloc(size_t size)
377
{
R
Robert Olsson 已提交
378
	if (size <= PAGE_SIZE)
379
		return kzalloc(size, GFP_KERNEL);
380
	else
381
		return vzalloc(size);
382
}
R
Robert Olsson 已提交
383 384

static void __tnode_free_rcu(struct rcu_head *head)
385
{
R
Robert Olsson 已提交
386
	struct tnode *tn = container_of(head, struct tnode, rcu);
387
	size_t size = sizeof(struct tnode) +
388
		      (sizeof(struct rt_trie_node *) << tn->bits);
389 390 391

	if (size <= PAGE_SIZE)
		kfree(tn);
A
Al Viro 已提交
392 393
	else
		vfree(tn);
394 395
}

R
Robert Olsson 已提交
396 397
static inline void tnode_free(struct tnode *tn)
{
398 399 400
	if (IS_LEAF(tn))
		free_leaf((struct leaf *) tn);
	else
R
Robert Olsson 已提交
401
		call_rcu(&tn->rcu, __tnode_free_rcu);
R
Robert Olsson 已提交
402 403
}

J
Jarek Poplawski 已提交
404 405 406
static void tnode_free_safe(struct tnode *tn)
{
	BUG_ON(IS_LEAF(tn));
407 408
	tn->tnode_free = tnode_free_head;
	tnode_free_head = tn;
409
	tnode_free_size += sizeof(struct tnode) +
410
			   (sizeof(struct rt_trie_node *) << tn->bits);
J
Jarek Poplawski 已提交
411 412 413 414 415 416 417 418 419 420 421
}

static void tnode_free_flush(void)
{
	struct tnode *tn;

	while ((tn = tnode_free_head)) {
		tnode_free_head = tn->tnode_free;
		tn->tnode_free = NULL;
		tnode_free(tn);
	}
422 423 424 425 426

	if (tnode_free_size >= PAGE_SIZE * sync_pages) {
		tnode_free_size = 0;
		synchronize_rcu();
	}
J
Jarek Poplawski 已提交
427 428
}

R
Robert Olsson 已提交
429 430
static struct leaf *leaf_new(void)
{
431
	struct leaf *l = kmem_cache_alloc(trie_leaf_kmem, GFP_KERNEL);
R
Robert Olsson 已提交
432 433 434 435 436 437 438 439 440 441 442 443
	if (l) {
		l->parent = T_LEAF;
		INIT_HLIST_HEAD(&l->list);
	}
	return l;
}

static struct leaf_info *leaf_info_new(int plen)
{
	struct leaf_info *li = kmalloc(sizeof(struct leaf_info),  GFP_KERNEL);
	if (li) {
		li->plen = plen;
444
		li->mask_plen = ntohl(inet_make_mask(plen));
R
Robert Olsson 已提交
445 446 447 448 449
		INIT_LIST_HEAD(&li->falh);
	}
	return li;
}

450
static struct tnode *tnode_new(t_key key, int pos, int bits)
451
{
452
	size_t sz = sizeof(struct tnode) + (sizeof(struct rt_trie_node *) << bits);
453
	struct tnode *tn = tnode_alloc(sz);
454

O
Olof Johansson 已提交
455
	if (tn) {
R
Robert Olsson 已提交
456
		tn->parent = T_TNODE;
457 458 459 460 461 462
		tn->pos = pos;
		tn->bits = bits;
		tn->key = key;
		tn->full_children = 0;
		tn->empty_children = 1<<bits;
	}
463

E
Eric Dumazet 已提交
464
	pr_debug("AT %p s=%zu %zu\n", tn, sizeof(struct tnode),
L
Lin Ming 已提交
465
		 sizeof(struct rt_trie_node *) << bits);
466 467 468 469 470 471 472 473
	return tn;
}

/*
 * Check whether a tnode 'n' is "full", i.e. it is an internal node
 * and no bits are skipped. See discussion in dyntree paper p. 6
 */

474
static inline int tnode_full(const struct tnode *tn, const struct rt_trie_node *n)
475
{
476
	if (n == NULL || IS_LEAF(n))
477 478 479 480 481
		return 0;

	return ((struct tnode *) n)->pos == tn->pos + tn->bits;
}

L
Lin Ming 已提交
482
static inline void put_child(struct tnode *tn, int i,
483
			     struct rt_trie_node *n)
484 485 486 487
{
	tnode_put_child_reorg(tn, i, n, -1);
}

488
 /*
489 490 491 492
  * Add a child at position i overwriting the old value.
  * Update the value of full_children and empty_children.
  */

493
static void tnode_put_child_reorg(struct tnode *tn, int i, struct rt_trie_node *n,
494
				  int wasfull)
495
{
E
Eric Dumazet 已提交
496
	struct rt_trie_node *chi = rtnl_dereference(tn->child[i]);
497 498
	int isfull;

S
Stephen Hemminger 已提交
499 500
	BUG_ON(i >= 1<<tn->bits);

501 502 503 504 505
	/* update emptyChildren */
	if (n == NULL && chi != NULL)
		tn->empty_children++;
	else if (n != NULL && chi == NULL)
		tn->empty_children--;
506

507
	/* update fullChildren */
O
Olof Johansson 已提交
508
	if (wasfull == -1)
509 510 511
		wasfull = tnode_full(tn, chi);

	isfull = tnode_full(tn, n);
512
	if (wasfull && !isfull)
513
		tn->full_children--;
514
	else if (!wasfull && isfull)
515
		tn->full_children++;
O
Olof Johansson 已提交
516

517
	if (n)
S
Stephen Hemminger 已提交
518
		node_set_parent(n, tn);
519

520
	rcu_assign_pointer(tn->child[i], n);
521 522
}

J
Jens Låås 已提交
523
#define MAX_WORK 10
524
static struct rt_trie_node *resize(struct trie *t, struct tnode *tn)
525 526
{
	int i;
527
	struct tnode *old_tn;
528 529
	int inflate_threshold_use;
	int halve_threshold_use;
J
Jens Låås 已提交
530
	int max_work;
531

532
	if (!tn)
533 534
		return NULL;

S
Stephen Hemminger 已提交
535 536
	pr_debug("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
		 tn, inflate_threshold, halve_threshold);
537 538 539

	/* No children */
	if (tn->empty_children == tnode_child_length(tn)) {
J
Jarek Poplawski 已提交
540
		tnode_free_safe(tn);
541 542 543 544
		return NULL;
	}
	/* One child */
	if (tn->empty_children == tnode_child_length(tn) - 1)
J
Jens Låås 已提交
545
		goto one_child;
546
	/*
547 548 549 550 551
	 * Double as long as the resulting node has a number of
	 * nonempty nodes that are above the threshold.
	 */

	/*
552 553
	 * From "Implementing a dynamic compressed trie" by Stefan Nilsson of
	 * the Helsinki University of Technology and Matti Tikkanen of Nokia
554
	 * Telecommunications, page 6:
555
	 * "A node is doubled if the ratio of non-empty children to all
556 557
	 * children in the *doubled* node is at least 'high'."
	 *
558 559 560 561 562
	 * 'high' in this instance is the variable 'inflate_threshold'. It
	 * is expressed as a percentage, so we multiply it with
	 * tnode_child_length() and instead of multiplying by 2 (since the
	 * child array will be doubled by inflate()) and multiplying
	 * the left-hand side by 100 (to handle the percentage thing) we
563
	 * multiply the left-hand side by 50.
564 565 566 567
	 *
	 * The left-hand side may look a bit weird: tnode_child_length(tn)
	 * - tn->empty_children is of course the number of non-null children
	 * in the current node. tn->full_children is the number of "full"
568
	 * children, that is non-null tnodes with a skip value of 0.
569
	 * All of those will be doubled in the resulting inflated tnode, so
570
	 * we just count them one extra time here.
571
	 *
572
	 * A clearer way to write this would be:
573
	 *
574
	 * to_be_doubled = tn->full_children;
575
	 * not_to_be_doubled = tnode_child_length(tn) - tn->empty_children -
576 577 578 579
	 *     tn->full_children;
	 *
	 * new_child_length = tnode_child_length(tn) * 2;
	 *
580
	 * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) /
581 582
	 *      new_child_length;
	 * if (new_fill_factor >= inflate_threshold)
583 584 585
	 *
	 * ...and so on, tho it would mess up the while () loop.
	 *
586 587 588
	 * anyway,
	 * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >=
	 *      inflate_threshold
589
	 *
590 591 592
	 * avoid a division:
	 * 100 * (not_to_be_doubled + 2*to_be_doubled) >=
	 *      inflate_threshold * new_child_length
593
	 *
594
	 * expand not_to_be_doubled and to_be_doubled, and shorten:
595
	 * 100 * (tnode_child_length(tn) - tn->empty_children +
O
Olof Johansson 已提交
596
	 *    tn->full_children) >= inflate_threshold * new_child_length
597
	 *
598
	 * expand new_child_length:
599
	 * 100 * (tnode_child_length(tn) - tn->empty_children +
O
Olof Johansson 已提交
600
	 *    tn->full_children) >=
601
	 *      inflate_threshold * tnode_child_length(tn) * 2
602
	 *
603
	 * shorten again:
604
	 * 50 * (tn->full_children + tnode_child_length(tn) -
O
Olof Johansson 已提交
605
	 *    tn->empty_children) >= inflate_threshold *
606
	 *    tnode_child_length(tn)
607
	 *
608 609 610
	 */

	check_tnode(tn);
611

612 613
	/* Keep root node larger  */

614
	if (!node_parent((struct rt_trie_node *)tn)) {
J
Jens Låås 已提交
615 616
		inflate_threshold_use = inflate_threshold_root;
		halve_threshold_use = halve_threshold_root;
E
Eric Dumazet 已提交
617
	} else {
618
		inflate_threshold_use = inflate_threshold;
J
Jens Låås 已提交
619 620
		halve_threshold_use = halve_threshold;
	}
621

J
Jens Låås 已提交
622 623
	max_work = MAX_WORK;
	while ((tn->full_children > 0 &&  max_work-- &&
624 625 626
		50 * (tn->full_children + tnode_child_length(tn)
		      - tn->empty_children)
		>= inflate_threshold_use * tnode_child_length(tn))) {
627

628 629
		old_tn = tn;
		tn = inflate(t, tn);
630

631 632
		if (IS_ERR(tn)) {
			tn = old_tn;
633 634 635 636 637
#ifdef CONFIG_IP_FIB_TRIE_STATS
			t->stats.resize_node_skipped++;
#endif
			break;
		}
638 639 640 641
	}

	check_tnode(tn);

J
Jens Låås 已提交
642
	/* Return if at least one inflate is run */
E
Eric Dumazet 已提交
643
	if (max_work != MAX_WORK)
644
		return (struct rt_trie_node *) tn;
J
Jens Låås 已提交
645

646 647 648 649
	/*
	 * Halve as long as the number of empty children in this
	 * node is above threshold.
	 */
650

J
Jens Låås 已提交
651 652
	max_work = MAX_WORK;
	while (tn->bits > 1 &&  max_work-- &&
653
	       100 * (tnode_child_length(tn) - tn->empty_children) <
654
	       halve_threshold_use * tnode_child_length(tn)) {
655

656 657 658 659
		old_tn = tn;
		tn = halve(t, tn);
		if (IS_ERR(tn)) {
			tn = old_tn;
660 661 662 663 664 665
#ifdef CONFIG_IP_FIB_TRIE_STATS
			t->stats.resize_node_skipped++;
#endif
			break;
		}
	}
666

667

668
	/* Only one child remains */
J
Jens Låås 已提交
669 670
	if (tn->empty_children == tnode_child_length(tn) - 1) {
one_child:
671
		for (i = 0; i < tnode_child_length(tn); i++) {
672
			struct rt_trie_node *n;
673

E
Eric Dumazet 已提交
674
			n = rtnl_dereference(tn->child[i]);
R
Robert Olsson 已提交
675
			if (!n)
O
Olof Johansson 已提交
676 677 678 679
				continue;

			/* compress one level */

S
Stephen Hemminger 已提交
680
			node_set_parent(n, NULL);
J
Jarek Poplawski 已提交
681
			tnode_free_safe(tn);
O
Olof Johansson 已提交
682
			return n;
683
		}
J
Jens Låås 已提交
684
	}
685
	return (struct rt_trie_node *) tn;
686 687
}

E
Eric Dumazet 已提交
688 689 690 691 692 693 694 695 696 697 698 699 700 701

static void tnode_clean_free(struct tnode *tn)
{
	int i;
	struct tnode *tofree;

	for (i = 0; i < tnode_child_length(tn); i++) {
		tofree = (struct tnode *)rtnl_dereference(tn->child[i]);
		if (tofree)
			tnode_free(tofree);
	}
	tnode_free(tn);
}

702
static struct tnode *inflate(struct trie *t, struct tnode *tn)
703 704 705 706 707
{
	struct tnode *oldtnode = tn;
	int olen = tnode_child_length(tn);
	int i;

S
Stephen Hemminger 已提交
708
	pr_debug("In inflate\n");
709 710 711

	tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1);

S
Stephen Hemminger 已提交
712
	if (!tn)
713
		return ERR_PTR(-ENOMEM);
714 715

	/*
716 717 718
	 * Preallocate and store tnodes before the actual work so we
	 * don't get into an inconsistent state if memory allocation
	 * fails. In case of failure we return the oldnode and  inflate
719 720
	 * of tnode is ignored.
	 */
O
Olof Johansson 已提交
721 722

	for (i = 0; i < olen; i++) {
723
		struct tnode *inode;
724

725
		inode = (struct tnode *) tnode_get_child(oldtnode, i);
726 727 728 729 730
		if (inode &&
		    IS_TNODE(inode) &&
		    inode->pos == oldtnode->pos + oldtnode->bits &&
		    inode->bits > 1) {
			struct tnode *left, *right;
S
Stephen Hemminger 已提交
731
			t_key m = ~0U << (KEYLENGTH - 1) >> inode->pos;
732

733 734
			left = tnode_new(inode->key&(~m), inode->pos + 1,
					 inode->bits - 1);
735 736
			if (!left)
				goto nomem;
O
Olof Johansson 已提交
737

738 739 740
			right = tnode_new(inode->key|m, inode->pos + 1,
					  inode->bits - 1);

741
			if (!right) {
742 743
				tnode_free(left);
				goto nomem;
744
			}
745

L
Lin Ming 已提交
746 747
			put_child(tn, 2*i, (struct rt_trie_node *) left);
			put_child(tn, 2*i+1, (struct rt_trie_node *) right);
748 749 750
		}
	}

O
Olof Johansson 已提交
751
	for (i = 0; i < olen; i++) {
752
		struct tnode *inode;
753
		struct rt_trie_node *node = tnode_get_child(oldtnode, i);
O
Olof Johansson 已提交
754 755
		struct tnode *left, *right;
		int size, j;
756

757 758 759 760 761 762
		/* An empty child */
		if (node == NULL)
			continue;

		/* A leaf or an internal node with skipped bits */

763
		if (IS_LEAF(node) || ((struct tnode *) node)->pos >
764
		   tn->pos + tn->bits - 1) {
765 766 767
			put_child(tn,
				tkey_extract_bits(node->key, oldtnode->pos, oldtnode->bits + 1),
				node);
768 769 770 771 772 773 774
			continue;
		}

		/* An internal node with two children */
		inode = (struct tnode *) node;

		if (inode->bits == 1) {
L
Lin Ming 已提交
775 776
			put_child(tn, 2*i, rtnl_dereference(inode->child[0]));
			put_child(tn, 2*i+1, rtnl_dereference(inode->child[1]));
777

J
Jarek Poplawski 已提交
778
			tnode_free_safe(inode);
O
Olof Johansson 已提交
779
			continue;
780 781
		}

O
Olof Johansson 已提交
782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799
		/* An internal node with more than two children */

		/* We will replace this node 'inode' with two new
		 * ones, 'left' and 'right', each with half of the
		 * original children. The two new nodes will have
		 * a position one bit further down the key and this
		 * means that the "significant" part of their keys
		 * (see the discussion near the top of this file)
		 * will differ by one bit, which will be "0" in
		 * left's key and "1" in right's key. Since we are
		 * moving the key position by one step, the bit that
		 * we are moving away from - the bit at position
		 * (inode->pos) - is the one that will differ between
		 * left and right. So... we synthesize that bit in the
		 * two  new keys.
		 * The mask 'm' below will be a single "one" bit at
		 * the position (inode->pos)
		 */
800

O
Olof Johansson 已提交
801 802 803
		/* Use the old key, but set the new significant
		 *   bit to zero.
		 */
804

O
Olof Johansson 已提交
805
		left = (struct tnode *) tnode_get_child(tn, 2*i);
L
Lin Ming 已提交
806
		put_child(tn, 2*i, NULL);
807

O
Olof Johansson 已提交
808
		BUG_ON(!left);
809

O
Olof Johansson 已提交
810
		right = (struct tnode *) tnode_get_child(tn, 2*i+1);
L
Lin Ming 已提交
811
		put_child(tn, 2*i+1, NULL);
812

O
Olof Johansson 已提交
813
		BUG_ON(!right);
814

O
Olof Johansson 已提交
815 816
		size = tnode_child_length(left);
		for (j = 0; j < size; j++) {
L
Lin Ming 已提交
817 818
			put_child(left, j, rtnl_dereference(inode->child[j]));
			put_child(right, j, rtnl_dereference(inode->child[j + size]));
819
		}
L
Lin Ming 已提交
820 821
		put_child(tn, 2*i, resize(t, left));
		put_child(tn, 2*i+1, resize(t, right));
O
Olof Johansson 已提交
822

J
Jarek Poplawski 已提交
823
		tnode_free_safe(inode);
824
	}
J
Jarek Poplawski 已提交
825
	tnode_free_safe(oldtnode);
826
	return tn;
827
nomem:
E
Eric Dumazet 已提交
828 829
	tnode_clean_free(tn);
	return ERR_PTR(-ENOMEM);
830 831
}

832
static struct tnode *halve(struct trie *t, struct tnode *tn)
833 834
{
	struct tnode *oldtnode = tn;
835
	struct rt_trie_node *left, *right;
836 837 838
	int i;
	int olen = tnode_child_length(tn);

S
Stephen Hemminger 已提交
839
	pr_debug("In halve\n");
840 841

	tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1);
842

843 844
	if (!tn)
		return ERR_PTR(-ENOMEM);
845 846

	/*
847 848 849
	 * Preallocate and store tnodes before the actual work so we
	 * don't get into an inconsistent state if memory allocation
	 * fails. In case of failure we return the oldnode and halve
850 851 852
	 * of tnode is ignored.
	 */

O
Olof Johansson 已提交
853
	for (i = 0; i < olen; i += 2) {
854 855
		left = tnode_get_child(oldtnode, i);
		right = tnode_get_child(oldtnode, i+1);
856

857
		/* Two nonempty children */
S
Stephen Hemminger 已提交
858
		if (left && right) {
859
			struct tnode *newn;
S
Stephen Hemminger 已提交
860

861
			newn = tnode_new(left->key, tn->pos + tn->bits, 1);
S
Stephen Hemminger 已提交
862 863

			if (!newn)
864
				goto nomem;
S
Stephen Hemminger 已提交
865

L
Lin Ming 已提交
866
			put_child(tn, i/2, (struct rt_trie_node *)newn);
867 868 869
		}

	}
870

O
Olof Johansson 已提交
871 872 873
	for (i = 0; i < olen; i += 2) {
		struct tnode *newBinNode;

874 875
		left = tnode_get_child(oldtnode, i);
		right = tnode_get_child(oldtnode, i+1);
876

877 878 879 880
		/* At least one of the children is empty */
		if (left == NULL) {
			if (right == NULL)    /* Both are empty */
				continue;
L
Lin Ming 已提交
881
			put_child(tn, i/2, right);
O
Olof Johansson 已提交
882
			continue;
S
Stephen Hemminger 已提交
883
		}
O
Olof Johansson 已提交
884 885

		if (right == NULL) {
L
Lin Ming 已提交
886
			put_child(tn, i/2, left);
O
Olof Johansson 已提交
887 888
			continue;
		}
889

890
		/* Two nonempty children */
O
Olof Johansson 已提交
891
		newBinNode = (struct tnode *) tnode_get_child(tn, i/2);
L
Lin Ming 已提交
892 893 894 895
		put_child(tn, i/2, NULL);
		put_child(newBinNode, 0, left);
		put_child(newBinNode, 1, right);
		put_child(tn, i/2, resize(t, newBinNode));
896
	}
J
Jarek Poplawski 已提交
897
	tnode_free_safe(oldtnode);
898
	return tn;
899
nomem:
E
Eric Dumazet 已提交
900 901
	tnode_clean_free(tn);
	return ERR_PTR(-ENOMEM);
902 903
}

R
Robert Olsson 已提交
904
/* readside must use rcu_read_lock currently dump routines
R
Robert Olsson 已提交
905 906
 via get_fa_head and dump */

R
Robert Olsson 已提交
907
static struct leaf_info *find_leaf_info(struct leaf *l, int plen)
908
{
R
Robert Olsson 已提交
909
	struct hlist_head *head = &l->list;
910 911
	struct leaf_info *li;

912
	hlist_for_each_entry_rcu(li, head, hlist)
913
		if (li->plen == plen)
914
			return li;
O
Olof Johansson 已提交
915

916 917 918
	return NULL;
}

919
static inline struct list_head *get_fa_head(struct leaf *l, int plen)
920
{
R
Robert Olsson 已提交
921
	struct leaf_info *li = find_leaf_info(l, plen);
922

O
Olof Johansson 已提交
923 924
	if (!li)
		return NULL;
925

O
Olof Johansson 已提交
926
	return &li->falh;
927 928 929 930
}

static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new)
{
931 932 933 934 935
	struct leaf_info *li = NULL, *last = NULL;

	if (hlist_empty(head)) {
		hlist_add_head_rcu(&new->hlist, head);
	} else {
936
		hlist_for_each_entry(li, head, hlist) {
937 938 939 940 941 942
			if (new->plen > li->plen)
				break;

			last = li;
		}
		if (last)
943
			hlist_add_behind_rcu(&new->hlist, &last->hlist);
944 945 946
		else
			hlist_add_before_rcu(&new->hlist, &li->hlist);
	}
947 948
}

R
Robert Olsson 已提交
949 950
/* rcu_read_lock needs to be hold by caller from readside */

951 952 953 954 955
static struct leaf *
fib_find_node(struct trie *t, u32 key)
{
	int pos;
	struct tnode *tn;
956
	struct rt_trie_node *n;
957 958

	pos = 0;
E
Eric Dumazet 已提交
959
	n = rcu_dereference_rtnl(t->trie);
960 961 962

	while (n != NULL &&  NODE_TYPE(n) == T_TNODE) {
		tn = (struct tnode *) n;
O
Olof Johansson 已提交
963

964
		check_tnode(tn);
O
Olof Johansson 已提交
965

966
		if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
O
Olof Johansson 已提交
967
			pos = tn->pos + tn->bits;
968 969 970 971
			n = tnode_get_child_rcu(tn,
						tkey_extract_bits(key,
								  tn->pos,
								  tn->bits));
O
Olof Johansson 已提交
972
		} else
973 974 975 976
			break;
	}
	/* Case we have found a leaf. Compare prefixes */

O
Olof Johansson 已提交
977 978 979
	if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key))
		return (struct leaf *)n;

980 981 982
	return NULL;
}

983
static void trie_rebalance(struct trie *t, struct tnode *tn)
984 985
{
	int wasfull;
R
Robert Olsson 已提交
986
	t_key cindex, key;
S
Stephen Hemminger 已提交
987
	struct tnode *tp;
988

R
Robert Olsson 已提交
989 990
	key = tn->key;

991
	while (tn != NULL && (tp = node_parent((struct rt_trie_node *)tn)) != NULL) {
992 993
		cindex = tkey_extract_bits(key, tp->pos, tp->bits);
		wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
J
Joe Perches 已提交
994
		tn = (struct tnode *)resize(t, tn);
995

J
Joe Perches 已提交
996
		tnode_put_child_reorg(tp, cindex,
997
				      (struct rt_trie_node *)tn, wasfull);
O
Olof Johansson 已提交
998

999
		tp = node_parent((struct rt_trie_node *) tn);
1000
		if (!tp)
1001
			rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn);
1002

J
Jarek Poplawski 已提交
1003
		tnode_free_flush();
S
Stephen Hemminger 已提交
1004
		if (!tp)
1005
			break;
S
Stephen Hemminger 已提交
1006
		tn = tp;
1007
	}
S
Stephen Hemminger 已提交
1008

1009
	/* Handle last (top) tnode */
1010
	if (IS_TNODE(tn))
J
Joe Perches 已提交
1011
		tn = (struct tnode *)resize(t, tn);
1012

1013
	rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn);
1014
	tnode_free_flush();
1015 1016
}

R
Robert Olsson 已提交
1017 1018
/* only used from updater-side */

1019
static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
1020 1021 1022
{
	int pos, newpos;
	struct tnode *tp = NULL, *tn = NULL;
1023
	struct rt_trie_node *n;
1024 1025
	struct leaf *l;
	int missbit;
1026
	struct list_head *fa_head = NULL;
1027 1028 1029 1030
	struct leaf_info *li;
	t_key cindex;

	pos = 0;
E
Eric Dumazet 已提交
1031
	n = rtnl_dereference(t->trie);
1032

1033 1034
	/* If we point to NULL, stop. Either the tree is empty and we should
	 * just put a new leaf in if, or we have reached an empty child slot,
1035
	 * and we should just put our new leaf in that.
1036 1037
	 * If we point to a T_TNODE, check if it matches our key. Note that
	 * a T_TNODE might be skipping any number of bits - its 'pos' need
1038 1039
	 * not be the parent's 'pos'+'bits'!
	 *
1040
	 * If it does match the current key, get pos/bits from it, extract
1041 1042 1043 1044
	 * the index from our key, push the T_TNODE and walk the tree.
	 *
	 * If it doesn't, we have to replace it with a new T_TNODE.
	 *
1045 1046 1047
	 * If we point to a T_LEAF, it might or might not have the same key
	 * as we do. If it does, just change the value, update the T_LEAF's
	 * value, and return it.
1048 1049 1050 1051 1052
	 * If it doesn't, we need to replace it with a T_TNODE.
	 */

	while (n != NULL &&  NODE_TYPE(n) == T_TNODE) {
		tn = (struct tnode *) n;
O
Olof Johansson 已提交
1053

1054
		check_tnode(tn);
O
Olof Johansson 已提交
1055

1056
		if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
1057
			tp = tn;
O
Olof Johansson 已提交
1058
			pos = tn->pos + tn->bits;
1059 1060 1061 1062
			n = tnode_get_child(tn,
					    tkey_extract_bits(key,
							      tn->pos,
							      tn->bits));
1063

S
Stephen Hemminger 已提交
1064
			BUG_ON(n && node_parent(n) != tn);
O
Olof Johansson 已提交
1065
		} else
1066 1067 1068 1069 1070 1071
			break;
	}

	/*
	 * n  ----> NULL, LEAF or TNODE
	 *
1072
	 * tp is n's (parent) ----> NULL or TNODE
1073 1074
	 */

O
Olof Johansson 已提交
1075
	BUG_ON(tp && IS_LEAF(tp));
1076 1077 1078

	/* Case 1: n is a leaf. Compare prefixes */

1079
	if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) {
1080
		l = (struct leaf *) n;
1081
		li = leaf_info_new(plen);
O
Olof Johansson 已提交
1082

1083 1084
		if (!li)
			return NULL;
1085 1086 1087 1088 1089 1090 1091

		fa_head = &li->falh;
		insert_leaf_info(&l->list, li);
		goto done;
	}
	l = leaf_new();

1092 1093
	if (!l)
		return NULL;
1094 1095 1096 1097

	l->key = key;
	li = leaf_info_new(plen);

1098
	if (!li) {
1099
		free_leaf(l);
1100
		return NULL;
1101
	}
1102 1103 1104 1105 1106

	fa_head = &li->falh;
	insert_leaf_info(&l->list, li);

	if (t->trie && n == NULL) {
O
Olof Johansson 已提交
1107
		/* Case 2: n is NULL, and will just insert a new leaf */
1108

1109
		node_set_parent((struct rt_trie_node *)l, tp);
1110

O
Olof Johansson 已提交
1111
		cindex = tkey_extract_bits(key, tp->pos, tp->bits);
L
Lin Ming 已提交
1112
		put_child(tp, cindex, (struct rt_trie_node *)l);
O
Olof Johansson 已提交
1113 1114
	} else {
		/* Case 3: n is a LEAF or a TNODE and the key doesn't match. */
1115 1116
		/*
		 *  Add a new tnode here
1117 1118 1119
		 *  first tnode need some special handling
		 */

1120
		if (n) {
1121
			pos = tp ? tp->pos+tp->bits : 0;
1122 1123
			newpos = tkey_mismatch(key, pos, n->key);
			tn = tnode_new(n->key, newpos, 1);
O
Olof Johansson 已提交
1124
		} else {
1125
			newpos = 0;
1126
			tn = tnode_new(key, newpos, 1); /* First tnode */
1127 1128
		}

1129
		if (!tn) {
1130
			free_leaf_info(li);
1131
			free_leaf(l);
1132
			return NULL;
O
Olof Johansson 已提交
1133 1134
		}

1135
		node_set_parent((struct rt_trie_node *)tn, tp);
1136

O
Olof Johansson 已提交
1137
		missbit = tkey_extract_bits(key, newpos, 1);
L
Lin Ming 已提交
1138 1139
		put_child(tn, missbit, (struct rt_trie_node *)l);
		put_child(tn, 1-missbit, n);
1140

1141
		if (tp) {
1142
			cindex = tkey_extract_bits(key, tp->pos, tp->bits);
L
Lin Ming 已提交
1143
			put_child(tp, cindex, (struct rt_trie_node *)tn);
O
Olof Johansson 已提交
1144
		} else {
1145
			rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn);
1146
		}
1147 1148

		tp = tn;
1149
	}
O
Olof Johansson 已提交
1150 1151

	if (tp && tp->pos + tp->bits > 32)
J
Joe Perches 已提交
1152 1153
		pr_warn("fib_trie tp=%p pos=%d, bits=%d, key=%0x plen=%d\n",
			tp, tp->pos, tp->bits, key, plen);
O
Olof Johansson 已提交
1154

1155
	/* Rebalance the trie */
R
Robert Olsson 已提交
1156

1157
	trie_rebalance(t, tp);
1158
done:
1159 1160 1161
	return fa_head;
}

1162 1163 1164
/*
 * Caller must hold RTNL.
 */
1165
int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
1166 1167 1168
{
	struct trie *t = (struct trie *) tb->tb_data;
	struct fib_alias *fa, *new_fa;
1169
	struct list_head *fa_head = NULL;
1170
	struct fib_info *fi;
1171 1172
	int plen = cfg->fc_dst_len;
	u8 tos = cfg->fc_tos;
1173 1174 1175 1176 1177 1178 1179
	u32 key, mask;
	int err;
	struct leaf *l;

	if (plen > 32)
		return -EINVAL;

1180
	key = ntohl(cfg->fc_dst);
1181

1182
	pr_debug("Insert table=%u %08x/%d\n", tb->tb_id, key, plen);
1183

O
Olof Johansson 已提交
1184
	mask = ntohl(inet_make_mask(plen));
1185

1186
	if (key & ~mask)
1187 1188 1189 1190
		return -EINVAL;

	key = key & mask;

1191 1192 1193
	fi = fib_create_info(cfg);
	if (IS_ERR(fi)) {
		err = PTR_ERR(fi);
1194
		goto err;
1195
	}
1196 1197

	l = fib_find_node(t, key);
1198
	fa = NULL;
1199

1200
	if (l) {
1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215
		fa_head = get_fa_head(l, plen);
		fa = fib_find_alias(fa_head, tos, fi->fib_priority);
	}

	/* Now fa, if non-NULL, points to the first fib alias
	 * with the same keys [prefix,tos,priority], if such key already
	 * exists or to the node before which we will insert new one.
	 *
	 * If fa is NULL, we will need to allocate a new one and
	 * insert to the head of f.
	 *
	 * If f is NULL, no fib node matched the destination key
	 * and we need to allocate a new one of those as well.
	 */

1216 1217 1218
	if (fa && fa->fa_tos == tos &&
	    fa->fa_info->fib_priority == fi->fib_priority) {
		struct fib_alias *fa_first, *fa_match;
1219 1220

		err = -EEXIST;
1221
		if (cfg->fc_nlflags & NLM_F_EXCL)
1222 1223
			goto out;

1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243
		/* We have 2 goals:
		 * 1. Find exact match for type, scope, fib_info to avoid
		 * duplicate routes
		 * 2. Find next 'fa' (or head), NLM_F_APPEND inserts before it
		 */
		fa_match = NULL;
		fa_first = fa;
		fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
		list_for_each_entry_continue(fa, fa_head, fa_list) {
			if (fa->fa_tos != tos)
				break;
			if (fa->fa_info->fib_priority != fi->fib_priority)
				break;
			if (fa->fa_type == cfg->fc_type &&
			    fa->fa_info == fi) {
				fa_match = fa;
				break;
			}
		}

1244
		if (cfg->fc_nlflags & NLM_F_REPLACE) {
1245 1246 1247
			struct fib_info *fi_drop;
			u8 state;

1248 1249 1250 1251
			fa = fa_first;
			if (fa_match) {
				if (fa == fa_match)
					err = 0;
1252
				goto out;
1253
			}
R
Robert Olsson 已提交
1254
			err = -ENOBUFS;
1255
			new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
R
Robert Olsson 已提交
1256 1257
			if (new_fa == NULL)
				goto out;
1258 1259

			fi_drop = fa->fa_info;
R
Robert Olsson 已提交
1260 1261
			new_fa->fa_tos = fa->fa_tos;
			new_fa->fa_info = fi;
1262
			new_fa->fa_type = cfg->fc_type;
1263
			state = fa->fa_state;
1264
			new_fa->fa_state = state & ~FA_S_ACCESSED;
1265

R
Robert Olsson 已提交
1266 1267
			list_replace_rcu(&fa->fa_list, &new_fa->fa_list);
			alias_free_mem_rcu(fa);
1268 1269 1270

			fib_release_info(fi_drop);
			if (state & FA_S_ACCESSED)
1271
				rt_cache_flush(cfg->fc_nlinfo.nl_net);
1272 1273
			rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen,
				tb->tb_id, &cfg->fc_nlinfo, NLM_F_REPLACE);
1274

O
Olof Johansson 已提交
1275
			goto succeeded;
1276 1277 1278 1279 1280
		}
		/* Error if we find a perfect match which
		 * uses the same scope, type, and nexthop
		 * information.
		 */
1281 1282
		if (fa_match)
			goto out;
1283

1284
		if (!(cfg->fc_nlflags & NLM_F_APPEND))
1285
			fa = fa_first;
1286 1287
	}
	err = -ENOENT;
1288
	if (!(cfg->fc_nlflags & NLM_F_CREATE))
1289 1290 1291
		goto out;

	err = -ENOBUFS;
1292
	new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
1293 1294 1295 1296 1297
	if (new_fa == NULL)
		goto out;

	new_fa->fa_info = fi;
	new_fa->fa_tos = tos;
1298
	new_fa->fa_type = cfg->fc_type;
1299 1300 1301 1302 1303
	new_fa->fa_state = 0;
	/*
	 * Insert new entry to the list.
	 */

1304
	if (!fa_head) {
1305 1306 1307
		fa_head = fib_insert_node(t, key, plen);
		if (unlikely(!fa_head)) {
			err = -ENOMEM;
1308
			goto out_free_new_fa;
1309
		}
1310
	}
1311

1312 1313 1314
	if (!plen)
		tb->tb_num_default++;

R
Robert Olsson 已提交
1315 1316
	list_add_tail_rcu(&new_fa->fa_list,
			  (fa ? &fa->fa_list : fa_head));
1317

1318
	rt_cache_flush(cfg->fc_nlinfo.nl_net);
1319
	rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id,
1320
		  &cfg->fc_nlinfo, 0);
1321 1322
succeeded:
	return 0;
1323 1324 1325

out_free_new_fa:
	kmem_cache_free(fn_alias_kmem, new_fa);
1326 1327
out:
	fib_release_info(fi);
O
Olof Johansson 已提交
1328
err:
1329 1330 1331
	return err;
}

R
Robert Olsson 已提交
1332
/* should be called with rcu_read_lock */
1333
static int check_leaf(struct fib_table *tb, struct trie *t, struct leaf *l,
1334
		      t_key key,  const struct flowi4 *flp,
E
Eric Dumazet 已提交
1335
		      struct fib_result *res, int fib_flags)
1336 1337 1338
{
	struct leaf_info *li;
	struct hlist_head *hhead = &l->list;
1339

1340
	hlist_for_each_entry_rcu(li, hhead, hlist) {
1341
		struct fib_alias *fa;
1342

1343
		if (l->key != (key & li->mask_plen))
1344 1345
			continue;

1346 1347 1348
		list_for_each_entry_rcu(fa, &li->falh, fa_list) {
			struct fib_info *fi = fa->fa_info;
			int nhsel, err;
1349

1350
			if (fa->fa_tos && fa->fa_tos != flp->flowi4_tos)
1351
				continue;
1352 1353
			if (fi->fib_dead)
				continue;
1354
			if (fa->fa_info->fib_scope < flp->flowi4_scope)
1355 1356 1357 1358
				continue;
			fib_alias_accessed(fa);
			err = fib_props[fa->fa_type].error;
			if (err) {
1359
#ifdef CONFIG_IP_FIB_TRIE_STATS
1360
				t->stats.semantic_match_passed++;
1361
#endif
1362
				return err;
1363 1364 1365 1366 1367 1368 1369 1370
			}
			if (fi->fib_flags & RTNH_F_DEAD)
				continue;
			for (nhsel = 0; nhsel < fi->fib_nhs; nhsel++) {
				const struct fib_nh *nh = &fi->fib_nh[nhsel];

				if (nh->nh_flags & RTNH_F_DEAD)
					continue;
1371
				if (flp->flowi4_oif && flp->flowi4_oif != nh->nh_oif)
1372 1373 1374 1375 1376
					continue;

#ifdef CONFIG_IP_FIB_TRIE_STATS
				t->stats.semantic_match_passed++;
#endif
1377
				res->prefixlen = li->plen;
1378 1379
				res->nh_sel = nhsel;
				res->type = fa->fa_type;
1380
				res->scope = fa->fa_info->fib_scope;
1381 1382 1383 1384
				res->fi = fi;
				res->table = tb;
				res->fa_head = &li->falh;
				if (!(fib_flags & FIB_LOOKUP_NOREF))
1385
					atomic_inc(&fi->fib_clntref);
1386 1387 1388 1389 1390 1391
				return 0;
			}
		}

#ifdef CONFIG_IP_FIB_TRIE_STATS
		t->stats.semantic_match_miss++;
1392 1393
#endif
	}
1394

1395
	return 1;
1396 1397
}

1398
int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp,
E
Eric Dumazet 已提交
1399
		     struct fib_result *res, int fib_flags)
1400 1401
{
	struct trie *t = (struct trie *) tb->tb_data;
1402
	int ret;
1403
	struct rt_trie_node *n;
1404
	struct tnode *pn;
1405
	unsigned int pos, bits;
1406
	t_key key = ntohl(flp->daddr);
1407
	unsigned int chopped_off;
1408
	t_key cindex = 0;
1409
	unsigned int current_prefix_length = KEYLENGTH;
O
Olof Johansson 已提交
1410
	struct tnode *cn;
1411
	t_key pref_mismatch;
O
Olof Johansson 已提交
1412

R
Robert Olsson 已提交
1413
	rcu_read_lock();
O
Olof Johansson 已提交
1414

R
Robert Olsson 已提交
1415
	n = rcu_dereference(t->trie);
1416
	if (!n)
1417 1418 1419 1420 1421 1422 1423 1424
		goto failed;

#ifdef CONFIG_IP_FIB_TRIE_STATS
	t->stats.gets++;
#endif

	/* Just a leaf? */
	if (IS_LEAF(n)) {
1425
		ret = check_leaf(tb, t, (struct leaf *)n, key, flp, res, fib_flags);
1426
		goto found;
1427
	}
1428

1429 1430
	pn = (struct tnode *) n;
	chopped_off = 0;
1431

O
Olof Johansson 已提交
1432
	while (pn) {
1433 1434 1435
		pos = pn->pos;
		bits = pn->bits;

1436
		if (!chopped_off)
S
Stephen Hemminger 已提交
1437 1438
			cindex = tkey_extract_bits(mask_pfx(key, current_prefix_length),
						   pos, bits);
1439

1440
		n = tnode_get_child_rcu(pn, cindex);
1441 1442 1443 1444 1445 1446 1447 1448

		if (n == NULL) {
#ifdef CONFIG_IP_FIB_TRIE_STATS
			t->stats.null_node_hit++;
#endif
			goto backtrace;
		}

O
Olof Johansson 已提交
1449
		if (IS_LEAF(n)) {
1450
			ret = check_leaf(tb, t, (struct leaf *)n, key, flp, res, fib_flags);
1451
			if (ret > 0)
O
Olof Johansson 已提交
1452
				goto backtrace;
1453
			goto found;
O
Olof Johansson 已提交
1454 1455 1456
		}

		cn = (struct tnode *)n;
1457

O
Olof Johansson 已提交
1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472
		/*
		 * It's a tnode, and we can do some extra checks here if we
		 * like, to avoid descending into a dead-end branch.
		 * This tnode is in the parent's child array at index
		 * key[p_pos..p_pos+p_bits] but potentially with some bits
		 * chopped off, so in reality the index may be just a
		 * subprefix, padded with zero at the end.
		 * We can also take a look at any skipped bits in this
		 * tnode - everything up to p_pos is supposed to be ok,
		 * and the non-chopped bits of the index (se previous
		 * paragraph) are also guaranteed ok, but the rest is
		 * considered unknown.
		 *
		 * The skipped bits are key[pos+bits..cn->pos].
		 */
1473

O
Olof Johansson 已提交
1474 1475 1476 1477 1478 1479 1480 1481 1482
		/* If current_prefix_length < pos+bits, we are already doing
		 * actual prefix  matching, which means everything from
		 * pos+(bits-chopped_off) onward must be zero along some
		 * branch of this subtree - otherwise there is *no* valid
		 * prefix present. Here we can only check the skipped
		 * bits. Remember, since we have already indexed into the
		 * parent's child array, we know that the bits we chopped of
		 * *are* zero.
		 */
1483

1484 1485
		/* NOTA BENE: Checking only skipped bits
		   for the new node here */
1486

O
Olof Johansson 已提交
1487 1488
		if (current_prefix_length < pos+bits) {
			if (tkey_extract_bits(cn->key, current_prefix_length,
1489 1490
						cn->pos - current_prefix_length)
			    || !(cn->child[0]))
O
Olof Johansson 已提交
1491 1492
				goto backtrace;
		}
1493

O
Olof Johansson 已提交
1494 1495 1496 1497 1498 1499 1500 1501 1502
		/*
		 * If chopped_off=0, the index is fully validated and we
		 * only need to look at the skipped bits for this, the new,
		 * tnode. What we actually want to do is to find out if
		 * these skipped bits match our key perfectly, or if we will
		 * have to count on finding a matching prefix further down,
		 * because if we do, we would like to have some way of
		 * verifying the existence of such a prefix at this point.
		 */
1503

O
Olof Johansson 已提交
1504 1505 1506 1507 1508 1509 1510 1511
		/* The only thing we can do at this point is to verify that
		 * any such matching prefix can indeed be a prefix to our
		 * key, and if the bits in the node we are inspecting that
		 * do not match our key are not ZERO, this cannot be true.
		 * Thus, find out where there is a mismatch (before cn->pos)
		 * and verify that all the mismatching bits are zero in the
		 * new tnode's key.
		 */
1512

1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523
		/*
		 * Note: We aren't very concerned about the piece of
		 * the key that precede pn->pos+pn->bits, since these
		 * have already been checked. The bits after cn->pos
		 * aren't checked since these are by definition
		 * "unknown" at this point. Thus, what we want to see
		 * is if we are about to enter the "prefix matching"
		 * state, and in that case verify that the skipped
		 * bits that will prevail throughout this subtree are
		 * zero, as they have to be if we are to find a
		 * matching prefix.
O
Olof Johansson 已提交
1524 1525
		 */

1526
		pref_mismatch = mask_pfx(cn->key ^ key, cn->pos);
O
Olof Johansson 已提交
1527

1528 1529 1530 1531
		/*
		 * In short: If skipped bits in this node do not match
		 * the search key, enter the "prefix matching"
		 * state.directly.
O
Olof Johansson 已提交
1532 1533
		 */
		if (pref_mismatch) {
1534 1535
			/* fls(x) = __fls(x) + 1 */
			int mp = KEYLENGTH - __fls(pref_mismatch) - 1;
O
Olof Johansson 已提交
1536

1537
			if (tkey_extract_bits(cn->key, mp, cn->pos - mp) != 0)
O
Olof Johansson 已提交
1538 1539 1540 1541
				goto backtrace;

			if (current_prefix_length >= cn->pos)
				current_prefix_length = mp;
1542
		}
1543

O
Olof Johansson 已提交
1544 1545 1546 1547
		pn = (struct tnode *)n; /* Descend */
		chopped_off = 0;
		continue;

1548 1549 1550 1551
backtrace:
		chopped_off++;

		/* As zero don't change the child key (cindex) */
1552 1553
		while ((chopped_off <= pn->bits)
		       && !(cindex & (1<<(chopped_off-1))))
1554 1555 1556 1557
			chopped_off++;

		/* Decrease current_... with bits chopped off */
		if (current_prefix_length > pn->pos + pn->bits - chopped_off)
1558 1559
			current_prefix_length = pn->pos + pn->bits
				- chopped_off;
O
Olof Johansson 已提交
1560

1561
		/*
1562
		 * Either we do the actual chop off according or if we have
1563 1564 1565
		 * chopped off all bits in this tnode walk up to our parent.
		 */

O
Olof Johansson 已提交
1566
		if (chopped_off <= pn->bits) {
1567
			cindex &= ~(1 << (chopped_off-1));
O
Olof Johansson 已提交
1568
		} else {
1569
			struct tnode *parent = node_parent_rcu((struct rt_trie_node *) pn);
S
Stephen Hemminger 已提交
1570
			if (!parent)
1571
				goto failed;
O
Olof Johansson 已提交
1572

1573
			/* Get Child's index */
S
Stephen Hemminger 已提交
1574 1575
			cindex = tkey_extract_bits(pn->key, parent->pos, parent->bits);
			pn = parent;
1576 1577 1578 1579 1580 1581
			chopped_off = 0;

#ifdef CONFIG_IP_FIB_TRIE_STATS
			t->stats.backtrack++;
#endif
			goto backtrace;
1582
		}
1583 1584
	}
failed:
1585
	ret = 1;
1586
found:
R
Robert Olsson 已提交
1587
	rcu_read_unlock();
1588 1589
	return ret;
}
1590
EXPORT_SYMBOL_GPL(fib_table_lookup);
1591

1592 1593 1594 1595
/*
 * Remove the leaf and return parent.
 */
static void trie_leaf_remove(struct trie *t, struct leaf *l)
1596
{
1597
	struct tnode *tp = node_parent((struct rt_trie_node *) l);
1598

1599
	pr_debug("entering trie_leaf_remove(%p)\n", l);
1600

1601
	if (tp) {
1602
		t_key cindex = tkey_extract_bits(l->key, tp->pos, tp->bits);
L
Lin Ming 已提交
1603
		put_child(tp, cindex, NULL);
1604
		trie_rebalance(t, tp);
O
Olof Johansson 已提交
1605
	} else
1606
		RCU_INIT_POINTER(t->trie, NULL);
1607

1608
	free_leaf(l);
1609 1610
}

1611 1612 1613
/*
 * Caller must hold RTNL.
 */
1614
int fib_table_delete(struct fib_table *tb, struct fib_config *cfg)
1615 1616 1617
{
	struct trie *t = (struct trie *) tb->tb_data;
	u32 key, mask;
1618 1619
	int plen = cfg->fc_dst_len;
	u8 tos = cfg->fc_tos;
1620 1621 1622
	struct fib_alias *fa, *fa_to_delete;
	struct list_head *fa_head;
	struct leaf *l;
O
Olof Johansson 已提交
1623 1624
	struct leaf_info *li;

1625
	if (plen > 32)
1626 1627
		return -EINVAL;

1628
	key = ntohl(cfg->fc_dst);
O
Olof Johansson 已提交
1629
	mask = ntohl(inet_make_mask(plen));
1630

1631
	if (key & ~mask)
1632 1633 1634 1635 1636
		return -EINVAL;

	key = key & mask;
	l = fib_find_node(t, key);

1637
	if (!l)
1638 1639
		return -ESRCH;

1640 1641 1642 1643 1644 1645
	li = find_leaf_info(l, plen);

	if (!li)
		return -ESRCH;

	fa_head = &li->falh;
1646 1647 1648 1649 1650
	fa = fib_find_alias(fa_head, tos, 0);

	if (!fa)
		return -ESRCH;

S
Stephen Hemminger 已提交
1651
	pr_debug("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t);
1652 1653

	fa_to_delete = NULL;
1654 1655
	fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
	list_for_each_entry_continue(fa, fa_head, fa_list) {
1656 1657 1658 1659 1660
		struct fib_info *fi = fa->fa_info;

		if (fa->fa_tos != tos)
			break;

1661 1662
		if ((!cfg->fc_type || fa->fa_type == cfg->fc_type) &&
		    (cfg->fc_scope == RT_SCOPE_NOWHERE ||
1663
		     fa->fa_info->fib_scope == cfg->fc_scope) &&
1664 1665
		    (!cfg->fc_prefsrc ||
		     fi->fib_prefsrc == cfg->fc_prefsrc) &&
1666 1667 1668
		    (!cfg->fc_protocol ||
		     fi->fib_protocol == cfg->fc_protocol) &&
		    fib_nh_match(cfg, fi) == 0) {
1669 1670 1671 1672 1673
			fa_to_delete = fa;
			break;
		}
	}

O
Olof Johansson 已提交
1674 1675
	if (!fa_to_delete)
		return -ESRCH;
1676

O
Olof Johansson 已提交
1677
	fa = fa_to_delete;
1678
	rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id,
1679
		  &cfg->fc_nlinfo, 0);
O
Olof Johansson 已提交
1680

R
Robert Olsson 已提交
1681
	list_del_rcu(&fa->fa_list);
1682

1683 1684 1685
	if (!plen)
		tb->tb_num_default--;

O
Olof Johansson 已提交
1686
	if (list_empty(fa_head)) {
R
Robert Olsson 已提交
1687
		hlist_del_rcu(&li->hlist);
O
Olof Johansson 已提交
1688
		free_leaf_info(li);
R
Robert Olsson 已提交
1689
	}
1690

O
Olof Johansson 已提交
1691
	if (hlist_empty(&l->list))
1692
		trie_leaf_remove(t, l);
1693

O
Olof Johansson 已提交
1694
	if (fa->fa_state & FA_S_ACCESSED)
1695
		rt_cache_flush(cfg->fc_nlinfo.nl_net);
1696

R
Robert Olsson 已提交
1697 1698
	fib_release_info(fa->fa_info);
	alias_free_mem_rcu(fa);
O
Olof Johansson 已提交
1699
	return 0;
1700 1701
}

1702
static int trie_flush_list(struct list_head *head)
1703 1704 1705 1706 1707 1708 1709
{
	struct fib_alias *fa, *fa_node;
	int found = 0;

	list_for_each_entry_safe(fa, fa_node, head, fa_list) {
		struct fib_info *fi = fa->fa_info;

R
Robert Olsson 已提交
1710 1711 1712 1713
		if (fi && (fi->fib_flags & RTNH_F_DEAD)) {
			list_del_rcu(&fa->fa_list);
			fib_release_info(fa->fa_info);
			alias_free_mem_rcu(fa);
1714 1715 1716 1717 1718 1719
			found++;
		}
	}
	return found;
}

1720
static int trie_flush_leaf(struct leaf *l)
1721 1722 1723
{
	int found = 0;
	struct hlist_head *lih = &l->list;
1724
	struct hlist_node *tmp;
1725 1726
	struct leaf_info *li = NULL;

1727
	hlist_for_each_entry_safe(li, tmp, lih, hlist) {
1728
		found += trie_flush_list(&li->falh);
1729 1730

		if (list_empty(&li->falh)) {
R
Robert Olsson 已提交
1731
			hlist_del_rcu(&li->hlist);
1732 1733 1734 1735 1736 1737
			free_leaf_info(li);
		}
	}
	return found;
}

1738 1739 1740 1741
/*
 * Scan for the next right leaf starting at node p->child[idx]
 * Since we have back pointer, no recursion necessary.
 */
1742
static struct leaf *leaf_walk_rcu(struct tnode *p, struct rt_trie_node *c)
1743
{
1744 1745
	do {
		t_key idx;
1746 1747

		if (c)
1748
			idx = tkey_extract_bits(c->key, p->pos, p->bits) + 1;
1749
		else
1750
			idx = 0;
R
Robert Olsson 已提交
1751

1752 1753
		while (idx < 1u << p->bits) {
			c = tnode_get_child_rcu(p, idx++);
R
Robert Olsson 已提交
1754
			if (!c)
O
Olof Johansson 已提交
1755 1756
				continue;

1757
			if (IS_LEAF(c))
1758 1759 1760 1761 1762
				return (struct leaf *) c;

			/* Rescan start scanning in new node */
			p = (struct tnode *) c;
			idx = 0;
1763
		}
1764 1765

		/* Node empty, walk back up to parent */
1766
		c = (struct rt_trie_node *) p;
E
Eric Dumazet 已提交
1767
	} while ((p = node_parent_rcu(c)) != NULL);
1768 1769 1770 1771 1772 1773

	return NULL; /* Root of trie */
}

static struct leaf *trie_firstleaf(struct trie *t)
{
E
Eric Dumazet 已提交
1774
	struct tnode *n = (struct tnode *)rcu_dereference_rtnl(t->trie);
1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786

	if (!n)
		return NULL;

	if (IS_LEAF(n))          /* trie is just a leaf */
		return (struct leaf *) n;

	return leaf_walk_rcu(n, NULL);
}

static struct leaf *trie_nextleaf(struct leaf *l)
{
1787
	struct rt_trie_node *c = (struct rt_trie_node *) l;
1788
	struct tnode *p = node_parent_rcu(c);
1789 1790 1791 1792 1793

	if (!p)
		return NULL;	/* trie with just one leaf */

	return leaf_walk_rcu(p, c);
1794 1795
}

1796 1797 1798 1799
static struct leaf *trie_leafindex(struct trie *t, int index)
{
	struct leaf *l = trie_firstleaf(t);

S
Stephen Hemminger 已提交
1800
	while (l && index-- > 0)
1801
		l = trie_nextleaf(l);
S
Stephen Hemminger 已提交
1802

1803 1804 1805 1806
	return l;
}


1807 1808 1809
/*
 * Caller must hold RTNL.
 */
1810
int fib_table_flush(struct fib_table *tb)
1811 1812
{
	struct trie *t = (struct trie *) tb->tb_data;
1813
	struct leaf *l, *ll = NULL;
1814
	int found = 0;
1815

1816
	for (l = trie_firstleaf(t); l; l = trie_nextleaf(l)) {
1817
		found += trie_flush_leaf(l);
1818 1819

		if (ll && hlist_empty(&ll->list))
1820
			trie_leaf_remove(t, ll);
1821 1822 1823 1824
		ll = l;
	}

	if (ll && hlist_empty(&ll->list))
1825
		trie_leaf_remove(t, ll);
1826

S
Stephen Hemminger 已提交
1827
	pr_debug("trie_flush found=%d\n", found);
1828 1829 1830
	return found;
}

1831 1832 1833 1834 1835
void fib_free_table(struct fib_table *tb)
{
	kfree(tb);
}

1836 1837
static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah,
			   struct fib_table *tb,
1838 1839 1840 1841
			   struct sk_buff *skb, struct netlink_callback *cb)
{
	int i, s_i;
	struct fib_alias *fa;
A
Al Viro 已提交
1842
	__be32 xkey = htonl(key);
1843

1844
	s_i = cb->args[5];
1845 1846
	i = 0;

R
Robert Olsson 已提交
1847 1848 1849
	/* rcu_read_lock is hold by caller */

	list_for_each_entry_rcu(fa, fah, fa_list) {
1850 1851 1852 1853 1854
		if (i < s_i) {
			i++;
			continue;
		}

1855
		if (fib_dump_info(skb, NETLINK_CB(cb->skb).portid,
1856 1857 1858 1859
				  cb->nlh->nlmsg_seq,
				  RTM_NEWROUTE,
				  tb->tb_id,
				  fa->fa_type,
1860
				  xkey,
1861 1862
				  plen,
				  fa->fa_tos,
1863
				  fa->fa_info, NLM_F_MULTI) < 0) {
1864
			cb->args[5] = i;
1865
			return -1;
O
Olof Johansson 已提交
1866
		}
1867 1868
		i++;
	}
1869
	cb->args[5] = i;
1870 1871 1872
	return skb->len;
}

1873 1874
static int fn_trie_dump_leaf(struct leaf *l, struct fib_table *tb,
			struct sk_buff *skb, struct netlink_callback *cb)
1875
{
1876 1877
	struct leaf_info *li;
	int i, s_i;
1878

1879
	s_i = cb->args[4];
1880
	i = 0;
1881

1882
	/* rcu_read_lock is hold by caller */
1883
	hlist_for_each_entry_rcu(li, &l->list, hlist) {
1884 1885
		if (i < s_i) {
			i++;
1886
			continue;
1887
		}
O
Olof Johansson 已提交
1888

1889
		if (i > s_i)
1890
			cb->args[5] = 0;
1891

1892
		if (list_empty(&li->falh))
1893 1894
			continue;

1895
		if (fn_trie_dump_fa(l->key, li->plen, &li->falh, tb, skb, cb) < 0) {
1896
			cb->args[4] = i;
1897 1898
			return -1;
		}
1899
		i++;
1900
	}
1901

1902
	cb->args[4] = i;
1903 1904 1905
	return skb->len;
}

1906 1907
int fib_table_dump(struct fib_table *tb, struct sk_buff *skb,
		   struct netlink_callback *cb)
1908
{
1909
	struct leaf *l;
1910
	struct trie *t = (struct trie *) tb->tb_data;
1911
	t_key key = cb->args[2];
1912
	int count = cb->args[3];
1913

R
Robert Olsson 已提交
1914
	rcu_read_lock();
1915 1916 1917
	/* Dump starting at last key.
	 * Note: 0.0.0.0/0 (ie default) is first key.
	 */
1918
	if (count == 0)
1919 1920
		l = trie_firstleaf(t);
	else {
1921 1922 1923
		/* Normally, continue from last key, but if that is missing
		 * fallback to using slow rescan
		 */
1924
		l = fib_find_node(t, key);
1925 1926
		if (!l)
			l = trie_leafindex(t, count);
1927
	}
1928

1929 1930
	while (l) {
		cb->args[2] = l->key;
1931
		if (fn_trie_dump_leaf(l, tb, skb, cb) < 0) {
1932
			cb->args[3] = count;
1933 1934
			rcu_read_unlock();
			return -1;
1935
		}
1936

1937
		++count;
1938
		l = trie_nextleaf(l);
1939 1940
		memset(&cb->args[4], 0,
		       sizeof(cb->args) - 4*sizeof(cb->args[0]));
1941
	}
1942
	cb->args[3] = count;
R
Robert Olsson 已提交
1943
	rcu_read_unlock();
1944

1945 1946 1947
	return skb->len;
}

1948
void __init fib_trie_init(void)
1949
{
1950 1951
	fn_alias_kmem = kmem_cache_create("ip_fib_alias",
					  sizeof(struct fib_alias),
1952 1953 1954 1955 1956 1957
					  0, SLAB_PANIC, NULL);

	trie_leaf_kmem = kmem_cache_create("ip_fib_trie",
					   max(sizeof(struct leaf),
					       sizeof(struct leaf_info)),
					   0, SLAB_PANIC, NULL);
1958
}
1959

1960

1961
struct fib_table *fib_trie_table(u32 id)
1962 1963 1964 1965 1966 1967 1968 1969 1970 1971
{
	struct fib_table *tb;
	struct trie *t;

	tb = kmalloc(sizeof(struct fib_table) + sizeof(struct trie),
		     GFP_KERNEL);
	if (tb == NULL)
		return NULL;

	tb->tb_id = id;
1972
	tb->tb_default = -1;
1973
	tb->tb_num_default = 0;
1974 1975

	t = (struct trie *) tb->tb_data;
1976
	memset(t, 0, sizeof(*t));
1977 1978 1979 1980

	return tb;
}

1981 1982 1983
#ifdef CONFIG_PROC_FS
/* Depth first Trie walk iterator */
struct fib_trie_iter {
1984
	struct seq_net_private p;
1985
	struct fib_table *tb;
1986
	struct tnode *tnode;
E
Eric Dumazet 已提交
1987 1988
	unsigned int index;
	unsigned int depth;
1989
};
1990

1991
static struct rt_trie_node *fib_trie_get_next(struct fib_trie_iter *iter)
1992
{
1993
	struct tnode *tn = iter->tnode;
E
Eric Dumazet 已提交
1994
	unsigned int cindex = iter->index;
1995
	struct tnode *p;
1996

1997 1998 1999 2000
	/* A single entry routing table */
	if (!tn)
		return NULL;

2001 2002 2003 2004
	pr_debug("get_next iter={node=%p index=%d depth=%d}\n",
		 iter->tnode, iter->index, iter->depth);
rescan:
	while (cindex < (1<<tn->bits)) {
2005
		struct rt_trie_node *n = tnode_get_child_rcu(tn, cindex);
2006

2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018
		if (n) {
			if (IS_LEAF(n)) {
				iter->tnode = tn;
				iter->index = cindex + 1;
			} else {
				/* push down one level */
				iter->tnode = (struct tnode *) n;
				iter->index = 0;
				++iter->depth;
			}
			return n;
		}
2019

2020 2021
		++cindex;
	}
O
Olof Johansson 已提交
2022

2023
	/* Current node exhausted, pop back up */
2024
	p = node_parent_rcu((struct rt_trie_node *)tn);
2025 2026 2027 2028 2029
	if (p) {
		cindex = tkey_extract_bits(tn->key, p->pos, p->bits)+1;
		tn = p;
		--iter->depth;
		goto rescan;
2030
	}
2031 2032 2033

	/* got root? */
	return NULL;
2034 2035
}

2036
static struct rt_trie_node *fib_trie_get_first(struct fib_trie_iter *iter,
2037
				       struct trie *t)
2038
{
2039
	struct rt_trie_node *n;
2040

S
Stephen Hemminger 已提交
2041
	if (!t)
2042 2043 2044
		return NULL;

	n = rcu_dereference(t->trie);
2045
	if (!n)
2046
		return NULL;
2047

2048 2049 2050 2051 2052 2053 2054 2055
	if (IS_TNODE(n)) {
		iter->tnode = (struct tnode *) n;
		iter->index = 0;
		iter->depth = 1;
	} else {
		iter->tnode = NULL;
		iter->index = 0;
		iter->depth = 0;
O
Olof Johansson 已提交
2056
	}
2057 2058

	return n;
2059
}
O
Olof Johansson 已提交
2060

2061 2062
static void trie_collect_stats(struct trie *t, struct trie_stat *s)
{
2063
	struct rt_trie_node *n;
2064
	struct fib_trie_iter iter;
O
Olof Johansson 已提交
2065

2066
	memset(s, 0, sizeof(*s));
O
Olof Johansson 已提交
2067

2068
	rcu_read_lock();
2069
	for (n = fib_trie_get_first(&iter, t); n; n = fib_trie_get_next(&iter)) {
2070
		if (IS_LEAF(n)) {
2071 2072 2073
			struct leaf *l = (struct leaf *)n;
			struct leaf_info *li;

2074 2075 2076 2077
			s->leaves++;
			s->totdepth += iter.depth;
			if (iter.depth > s->maxdepth)
				s->maxdepth = iter.depth;
2078

2079
			hlist_for_each_entry_rcu(li, &l->list, hlist)
2080
				++s->prefixes;
2081 2082 2083 2084 2085
		} else {
			const struct tnode *tn = (const struct tnode *) n;
			int i;

			s->tnodes++;
S
Stephen Hemminger 已提交
2086
			if (tn->bits < MAX_STAT_DEPTH)
R
Robert Olsson 已提交
2087 2088
				s->nodesizes[tn->bits]++;

2089 2090 2091
			for (i = 0; i < (1<<tn->bits); i++)
				if (!tn->child[i])
					s->nullpointers++;
2092 2093
		}
	}
R
Robert Olsson 已提交
2094
	rcu_read_unlock();
2095 2096
}

2097 2098 2099 2100
/*
 *	This outputs /proc/net/fib_triestats
 */
static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat)
2101
{
E
Eric Dumazet 已提交
2102
	unsigned int i, max, pointers, bytes, avdepth;
2103

2104 2105 2106 2107
	if (stat->leaves)
		avdepth = stat->totdepth*100 / stat->leaves;
	else
		avdepth = 0;
O
Olof Johansson 已提交
2108

2109 2110
	seq_printf(seq, "\tAver depth:     %u.%02d\n",
		   avdepth / 100, avdepth % 100);
2111
	seq_printf(seq, "\tMax depth:      %u\n", stat->maxdepth);
O
Olof Johansson 已提交
2112

2113 2114
	seq_printf(seq, "\tLeaves:         %u\n", stat->leaves);
	bytes = sizeof(struct leaf) * stat->leaves;
2115 2116 2117 2118

	seq_printf(seq, "\tPrefixes:       %u\n", stat->prefixes);
	bytes += sizeof(struct leaf_info) * stat->prefixes;

2119
	seq_printf(seq, "\tInternal nodes: %u\n\t", stat->tnodes);
2120
	bytes += sizeof(struct tnode) * stat->tnodes;
2121

R
Robert Olsson 已提交
2122 2123
	max = MAX_STAT_DEPTH;
	while (max > 0 && stat->nodesizes[max-1] == 0)
2124
		max--;
2125

2126
	pointers = 0;
2127
	for (i = 1; i < max; i++)
2128
		if (stat->nodesizes[i] != 0) {
2129
			seq_printf(seq, "  %u: %u",  i, stat->nodesizes[i]);
2130 2131 2132
			pointers += (1<<i) * stat->nodesizes[i];
		}
	seq_putc(seq, '\n');
2133
	seq_printf(seq, "\tPointers: %u\n", pointers);
R
Robert Olsson 已提交
2134

2135
	bytes += sizeof(struct rt_trie_node *) * pointers;
2136 2137
	seq_printf(seq, "Null ptrs: %u\n", stat->nullpointers);
	seq_printf(seq, "Total size: %u  kB\n", (bytes + 1023) / 1024);
2138
}
R
Robert Olsson 已提交
2139

2140
#ifdef CONFIG_IP_FIB_TRIE_STATS
2141 2142 2143 2144
static void trie_show_usage(struct seq_file *seq,
			    const struct trie_use_stats *stats)
{
	seq_printf(seq, "\nCounters:\n---------\n");
2145 2146 2147 2148 2149 2150 2151 2152 2153
	seq_printf(seq, "gets = %u\n", stats->gets);
	seq_printf(seq, "backtracks = %u\n", stats->backtrack);
	seq_printf(seq, "semantic match passed = %u\n",
		   stats->semantic_match_passed);
	seq_printf(seq, "semantic match miss = %u\n",
		   stats->semantic_match_miss);
	seq_printf(seq, "null node hit= %u\n", stats->null_node_hit);
	seq_printf(seq, "skipped node resize = %u\n\n",
		   stats->resize_node_skipped);
2154
}
2155 2156
#endif /*  CONFIG_IP_FIB_TRIE_STATS */

2157
static void fib_table_print(struct seq_file *seq, struct fib_table *tb)
2158
{
2159 2160 2161 2162 2163 2164
	if (tb->tb_id == RT_TABLE_LOCAL)
		seq_puts(seq, "Local:\n");
	else if (tb->tb_id == RT_TABLE_MAIN)
		seq_puts(seq, "Main:\n");
	else
		seq_printf(seq, "Id %d:\n", tb->tb_id);
2165
}
2166

2167

2168 2169
static int fib_triestat_seq_show(struct seq_file *seq, void *v)
{
2170
	struct net *net = (struct net *)seq->private;
2171
	unsigned int h;
2172

2173
	seq_printf(seq,
2174 2175
		   "Basic info: size of leaf:"
		   " %Zd bytes, size of tnode: %Zd bytes.\n",
2176 2177
		   sizeof(struct leaf), sizeof(struct tnode));

2178 2179 2180 2181
	for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
		struct hlist_head *head = &net->ipv4.fib_table_hash[h];
		struct fib_table *tb;

2182
		hlist_for_each_entry_rcu(tb, head, tb_hlist) {
2183 2184
			struct trie *t = (struct trie *) tb->tb_data;
			struct trie_stat stat;
2185

2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197
			if (!t)
				continue;

			fib_table_print(seq, tb);

			trie_collect_stats(t, &stat);
			trie_show_stats(seq, &stat);
#ifdef CONFIG_IP_FIB_TRIE_STATS
			trie_show_usage(seq, &t->stats);
#endif
		}
	}
2198

2199
	return 0;
2200 2201
}

2202
static int fib_triestat_seq_open(struct inode *inode, struct file *file)
2203
{
2204
	return single_open_net(inode, file, fib_triestat_seq_show);
2205 2206
}

2207
static const struct file_operations fib_triestat_fops = {
2208 2209 2210 2211
	.owner	= THIS_MODULE,
	.open	= fib_triestat_seq_open,
	.read	= seq_read,
	.llseek	= seq_lseek,
2212
	.release = single_release_net,
2213 2214
};

2215
static struct rt_trie_node *fib_trie_get_idx(struct seq_file *seq, loff_t pos)
2216
{
2217 2218
	struct fib_trie_iter *iter = seq->private;
	struct net *net = seq_file_net(seq);
2219
	loff_t idx = 0;
2220
	unsigned int h;
2221

2222 2223 2224
	for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
		struct hlist_head *head = &net->ipv4.fib_table_hash[h];
		struct fib_table *tb;
2225

2226
		hlist_for_each_entry_rcu(tb, head, tb_hlist) {
2227
			struct rt_trie_node *n;
2228 2229 2230 2231 2232 2233 2234 2235 2236

			for (n = fib_trie_get_first(iter,
						    (struct trie *) tb->tb_data);
			     n; n = fib_trie_get_next(iter))
				if (pos == idx++) {
					iter->tb = tb;
					return n;
				}
		}
2237
	}
2238

2239 2240 2241
	return NULL;
}

2242
static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos)
2243
	__acquires(RCU)
2244
{
2245
	rcu_read_lock();
2246
	return fib_trie_get_idx(seq, *pos);
2247 2248
}

2249
static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2250
{
2251
	struct fib_trie_iter *iter = seq->private;
2252
	struct net *net = seq_file_net(seq);
2253 2254 2255
	struct fib_table *tb = iter->tb;
	struct hlist_node *tb_node;
	unsigned int h;
2256
	struct rt_trie_node *n;
2257

2258
	++*pos;
2259 2260 2261 2262
	/* next node in same table */
	n = fib_trie_get_next(iter);
	if (n)
		return n;
2263

2264 2265
	/* walk rest of this hash chain */
	h = tb->tb_id & (FIB_TABLE_HASHSZ - 1);
E
Eric Dumazet 已提交
2266
	while ((tb_node = rcu_dereference(hlist_next_rcu(&tb->tb_hlist)))) {
2267 2268 2269 2270 2271
		tb = hlist_entry(tb_node, struct fib_table, tb_hlist);
		n = fib_trie_get_first(iter, (struct trie *) tb->tb_data);
		if (n)
			goto found;
	}
2272

2273 2274 2275
	/* new hash chain */
	while (++h < FIB_TABLE_HASHSZ) {
		struct hlist_head *head = &net->ipv4.fib_table_hash[h];
2276
		hlist_for_each_entry_rcu(tb, head, tb_hlist) {
2277 2278 2279 2280 2281
			n = fib_trie_get_first(iter, (struct trie *) tb->tb_data);
			if (n)
				goto found;
		}
	}
2282
	return NULL;
2283 2284 2285 2286

found:
	iter->tb = tb;
	return n;
2287
}
2288

2289
static void fib_trie_seq_stop(struct seq_file *seq, void *v)
2290
	__releases(RCU)
2291
{
2292 2293
	rcu_read_unlock();
}
O
Olof Johansson 已提交
2294

2295 2296
static void seq_indent(struct seq_file *seq, int n)
{
E
Eric Dumazet 已提交
2297 2298
	while (n-- > 0)
		seq_puts(seq, "   ");
2299
}
2300

2301
static inline const char *rtn_scope(char *buf, size_t len, enum rt_scope_t s)
2302
{
S
Stephen Hemminger 已提交
2303
	switch (s) {
2304 2305 2306 2307 2308 2309
	case RT_SCOPE_UNIVERSE: return "universe";
	case RT_SCOPE_SITE:	return "site";
	case RT_SCOPE_LINK:	return "link";
	case RT_SCOPE_HOST:	return "host";
	case RT_SCOPE_NOWHERE:	return "nowhere";
	default:
2310
		snprintf(buf, len, "scope=%d", s);
2311 2312 2313
		return buf;
	}
}
2314

2315
static const char *const rtn_type_names[__RTN_MAX] = {
2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328
	[RTN_UNSPEC] = "UNSPEC",
	[RTN_UNICAST] = "UNICAST",
	[RTN_LOCAL] = "LOCAL",
	[RTN_BROADCAST] = "BROADCAST",
	[RTN_ANYCAST] = "ANYCAST",
	[RTN_MULTICAST] = "MULTICAST",
	[RTN_BLACKHOLE] = "BLACKHOLE",
	[RTN_UNREACHABLE] = "UNREACHABLE",
	[RTN_PROHIBIT] = "PROHIBIT",
	[RTN_THROW] = "THROW",
	[RTN_NAT] = "NAT",
	[RTN_XRESOLVE] = "XRESOLVE",
};
2329

E
Eric Dumazet 已提交
2330
static inline const char *rtn_type(char *buf, size_t len, unsigned int t)
2331 2332 2333
{
	if (t < __RTN_MAX && rtn_type_names[t])
		return rtn_type_names[t];
2334
	snprintf(buf, len, "type %u", t);
2335
	return buf;
2336 2337
}

2338 2339
/* Pretty print the trie */
static int fib_trie_seq_show(struct seq_file *seq, void *v)
2340
{
2341
	const struct fib_trie_iter *iter = seq->private;
2342
	struct rt_trie_node *n = v;
2343

2344 2345
	if (!node_parent_rcu(n))
		fib_table_print(seq, iter->tb);
2346

2347 2348
	if (IS_TNODE(n)) {
		struct tnode *tn = (struct tnode *) n;
S
Stephen Hemminger 已提交
2349
		__be32 prf = htonl(mask_pfx(tn->key, tn->pos));
O
Olof Johansson 已提交
2350

2351
		seq_indent(seq, iter->depth-1);
2352 2353
		seq_printf(seq, "  +-- %pI4/%d %d %d %d\n",
			   &prf, tn->pos, tn->bits, tn->full_children,
2354
			   tn->empty_children);
2355

2356 2357
	} else {
		struct leaf *l = (struct leaf *) n;
2358
		struct leaf_info *li;
A
Al Viro 已提交
2359
		__be32 val = htonl(l->key);
2360 2361

		seq_indent(seq, iter->depth);
2362
		seq_printf(seq, "  |-- %pI4\n", &val);
2363

2364
		hlist_for_each_entry_rcu(li, &l->list, hlist) {
2365 2366 2367 2368 2369 2370 2371 2372
			struct fib_alias *fa;

			list_for_each_entry_rcu(fa, &li->falh, fa_list) {
				char buf1[32], buf2[32];

				seq_indent(seq, iter->depth+1);
				seq_printf(seq, "  /%d %s %s", li->plen,
					   rtn_scope(buf1, sizeof(buf1),
2373
						     fa->fa_info->fib_scope),
2374 2375 2376
					   rtn_type(buf2, sizeof(buf2),
						    fa->fa_type));
				if (fa->fa_tos)
2377
					seq_printf(seq, " tos=%d", fa->fa_tos);
2378
				seq_putc(seq, '\n');
2379 2380
			}
		}
2381
	}
2382

2383 2384 2385
	return 0;
}

2386
static const struct seq_operations fib_trie_seq_ops = {
2387 2388 2389 2390
	.start  = fib_trie_seq_start,
	.next   = fib_trie_seq_next,
	.stop   = fib_trie_seq_stop,
	.show   = fib_trie_seq_show,
2391 2392
};

2393
static int fib_trie_seq_open(struct inode *inode, struct file *file)
2394
{
2395 2396
	return seq_open_net(inode, file, &fib_trie_seq_ops,
			    sizeof(struct fib_trie_iter));
2397 2398
}

2399
static const struct file_operations fib_trie_fops = {
2400 2401 2402 2403
	.owner  = THIS_MODULE,
	.open   = fib_trie_seq_open,
	.read   = seq_read,
	.llseek = seq_lseek,
2404
	.release = seq_release_net,
2405 2406
};

2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446
struct fib_route_iter {
	struct seq_net_private p;
	struct trie *main_trie;
	loff_t	pos;
	t_key	key;
};

static struct leaf *fib_route_get_idx(struct fib_route_iter *iter, loff_t pos)
{
	struct leaf *l = NULL;
	struct trie *t = iter->main_trie;

	/* use cache location of last found key */
	if (iter->pos > 0 && pos >= iter->pos && (l = fib_find_node(t, iter->key)))
		pos -= iter->pos;
	else {
		iter->pos = 0;
		l = trie_firstleaf(t);
	}

	while (l && pos-- > 0) {
		iter->pos++;
		l = trie_nextleaf(l);
	}

	if (l)
		iter->key = pos;	/* remember it */
	else
		iter->pos = 0;		/* forget it */

	return l;
}

static void *fib_route_seq_start(struct seq_file *seq, loff_t *pos)
	__acquires(RCU)
{
	struct fib_route_iter *iter = seq->private;
	struct fib_table *tb;

	rcu_read_lock();
2447
	tb = fib_get_table(seq_file_net(seq), RT_TABLE_MAIN);
2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484
	if (!tb)
		return NULL;

	iter->main_trie = (struct trie *) tb->tb_data;
	if (*pos == 0)
		return SEQ_START_TOKEN;
	else
		return fib_route_get_idx(iter, *pos - 1);
}

static void *fib_route_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct fib_route_iter *iter = seq->private;
	struct leaf *l = v;

	++*pos;
	if (v == SEQ_START_TOKEN) {
		iter->pos = 0;
		l = trie_firstleaf(iter->main_trie);
	} else {
		iter->pos++;
		l = trie_nextleaf(l);
	}

	if (l)
		iter->key = l->key;
	else
		iter->pos = 0;
	return l;
}

static void fib_route_seq_stop(struct seq_file *seq, void *v)
	__releases(RCU)
{
	rcu_read_unlock();
}

E
Eric Dumazet 已提交
2485
static unsigned int fib_flag_trans(int type, __be32 mask, const struct fib_info *fi)
2486
{
E
Eric Dumazet 已提交
2487
	unsigned int flags = 0;
2488

E
Eric Dumazet 已提交
2489 2490
	if (type == RTN_UNREACHABLE || type == RTN_PROHIBIT)
		flags = RTF_REJECT;
2491 2492
	if (fi && fi->fib_nh->nh_gw)
		flags |= RTF_GATEWAY;
A
Al Viro 已提交
2493
	if (mask == htonl(0xFFFFFFFF))
2494 2495 2496
		flags |= RTF_HOST;
	flags |= RTF_UP;
	return flags;
2497 2498
}

2499 2500 2501
/*
 *	This outputs /proc/net/route.
 *	The format of the file is not supposed to be changed
E
Eric Dumazet 已提交
2502
 *	and needs to be same as fib_hash output to avoid breaking
2503 2504 2505
 *	legacy utilities
 */
static int fib_route_seq_show(struct seq_file *seq, void *v)
2506
{
2507
	struct leaf *l = v;
2508
	struct leaf_info *li;
2509

2510 2511 2512 2513 2514 2515
	if (v == SEQ_START_TOKEN) {
		seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway "
			   "\tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU"
			   "\tWindow\tIRTT");
		return 0;
	}
2516

2517
	hlist_for_each_entry_rcu(li, &l->list, hlist) {
2518
		struct fib_alias *fa;
A
Al Viro 已提交
2519
		__be32 mask, prefix;
O
Olof Johansson 已提交
2520

2521 2522
		mask = inet_make_mask(li->plen);
		prefix = htonl(l->key);
2523

2524
		list_for_each_entry_rcu(fa, &li->falh, fa_list) {
2525
			const struct fib_info *fi = fa->fa_info;
E
Eric Dumazet 已提交
2526
			unsigned int flags = fib_flag_trans(fa->fa_type, mask, fi);
2527

2528 2529 2530
			if (fa->fa_type == RTN_BROADCAST
			    || fa->fa_type == RTN_MULTICAST)
				continue;
2531

2532 2533
			seq_setwidth(seq, 127);

2534
			if (fi)
2535 2536
				seq_printf(seq,
					 "%s\t%08X\t%08X\t%04X\t%d\t%u\t"
2537
					 "%d\t%08X\t%d\t%u\t%u",
2538 2539 2540 2541 2542
					 fi->fib_dev ? fi->fib_dev->name : "*",
					 prefix,
					 fi->fib_nh->nh_gw, flags, 0, 0,
					 fi->fib_priority,
					 mask,
2543 2544
					 (fi->fib_advmss ?
					  fi->fib_advmss + 40 : 0),
2545
					 fi->fib_window,
2546
					 fi->fib_rtt >> 3);
2547
			else
2548 2549
				seq_printf(seq,
					 "*\t%08X\t%08X\t%04X\t%d\t%u\t"
2550
					 "%d\t%08X\t%d\t%u\t%u",
2551
					 prefix, 0, flags, 0, 0, 0,
2552
					 mask, 0, 0, 0);
2553

2554
			seq_pad(seq, '\n');
2555
		}
2556 2557 2558 2559 2560
	}

	return 0;
}

2561
static const struct seq_operations fib_route_seq_ops = {
2562 2563 2564
	.start  = fib_route_seq_start,
	.next   = fib_route_seq_next,
	.stop   = fib_route_seq_stop,
2565
	.show   = fib_route_seq_show,
2566 2567
};

2568
static int fib_route_seq_open(struct inode *inode, struct file *file)
2569
{
2570
	return seq_open_net(inode, file, &fib_route_seq_ops,
2571
			    sizeof(struct fib_route_iter));
2572 2573
}

2574
static const struct file_operations fib_route_fops = {
2575 2576 2577 2578
	.owner  = THIS_MODULE,
	.open   = fib_route_seq_open,
	.read   = seq_read,
	.llseek = seq_lseek,
2579
	.release = seq_release_net,
2580 2581
};

2582
int __net_init fib_proc_init(struct net *net)
2583
{
2584
	if (!proc_create("fib_trie", S_IRUGO, net->proc_net, &fib_trie_fops))
2585 2586
		goto out1;

2587 2588
	if (!proc_create("fib_triestat", S_IRUGO, net->proc_net,
			 &fib_triestat_fops))
2589 2590
		goto out2;

2591
	if (!proc_create("route", S_IRUGO, net->proc_net, &fib_route_fops))
2592 2593
		goto out3;

2594
	return 0;
2595 2596

out3:
2597
	remove_proc_entry("fib_triestat", net->proc_net);
2598
out2:
2599
	remove_proc_entry("fib_trie", net->proc_net);
2600 2601
out1:
	return -ENOMEM;
2602 2603
}

2604
void __net_exit fib_proc_exit(struct net *net)
2605
{
2606 2607 2608
	remove_proc_entry("fib_trie", net->proc_net);
	remove_proc_entry("fib_triestat", net->proc_net);
	remove_proc_entry("route", net->proc_net);
2609 2610 2611
}

#endif /* CONFIG_PROC_FS */