fib_trie.c 60.9 KB
Newer Older
1 2 3 4 5 6 7 8 9
/*
 *   This program is free software; you can redistribute it and/or
 *   modify it under the terms of the GNU General Public License
 *   as published by the Free Software Foundation; either version
 *   2 of the License, or (at your option) any later version.
 *
 *   Robert Olsson <robert.olsson@its.uu.se> Uppsala Universitet
 *     & Swedish University of Agricultural Sciences.
 *
10
 *   Jens Laas <jens.laas@data.slu.se> Swedish University of
11
 *     Agricultural Sciences.
12
 *
13 14
 *   Hans Liss <hans.liss@its.uu.se>  Uppsala Universitet
 *
L
Lucas De Marchi 已提交
15
 * This work is based on the LPC-trie which is originally described in:
16
 *
17 18
 * An experimental study of compression methods for dynamic tries
 * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
19
 * http://www.csc.kth.se/~snilsson/software/dyntrie2/
20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41
 *
 *
 * IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
 * IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999
 *
 *
 * Code from fib_hash has been reused which includes the following header:
 *
 *
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		IPv4 FIB: lookup engine and maintenance routines.
 *
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
R
Robert Olsson 已提交
42 43 44 45 46 47 48
 *
 * Substantial contributions to this work comes from:
 *
 *		David S. Miller, <davem@davemloft.net>
 *		Stephen Hemminger <shemminger@osdl.org>
 *		Paul E. McKenney <paulmck@us.ibm.com>
 *		Patrick McHardy <kaber@trash.net>
49 50
 */

J
Jens Låås 已提交
51
#define VERSION "0.409"
52 53

#include <asm/uaccess.h>
J
Jiri Slaby 已提交
54
#include <linux/bitops.h>
55 56 57 58 59 60 61 62 63
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
S
Stephen Hemminger 已提交
64
#include <linux/inetdevice.h>
65 66 67
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/proc_fs.h>
R
Robert Olsson 已提交
68
#include <linux/rcupdate.h>
69 70 71 72
#include <linux/skbuff.h>
#include <linux/netlink.h>
#include <linux/init.h>
#include <linux/list.h>
73
#include <linux/slab.h>
74
#include <linux/export.h>
75
#include <net/net_namespace.h>
76 77 78 79 80 81 82 83
#include <net/ip.h>
#include <net/protocol.h>
#include <net/route.h>
#include <net/tcp.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include "fib_lookup.h"

R
Robert Olsson 已提交
84
#define MAX_STAT_DEPTH 32
85 86 87 88 89 90 91 92

#define KEYLENGTH (8*sizeof(t_key))

typedef unsigned int t_key;

#define T_TNODE 0
#define T_LEAF  1
#define NODE_TYPE_MASK	0x1UL
R
Robert Olsson 已提交
93 94
#define NODE_TYPE(node) ((node)->parent & NODE_TYPE_MASK)

O
Olof Johansson 已提交
95 96
#define IS_TNODE(n) (!(n->parent & T_LEAF))
#define IS_LEAF(n) (n->parent & T_LEAF)
97

98
struct rt_trie_node {
O
Olof Johansson 已提交
99
	unsigned long parent;
100
	t_key key;
101 102 103
};

struct leaf {
O
Olof Johansson 已提交
104
	unsigned long parent;
105
	t_key key;
106
	struct hlist_head list;
R
Robert Olsson 已提交
107
	struct rcu_head rcu;
108 109 110 111 112
};

struct leaf_info {
	struct hlist_node hlist;
	int plen;
113
	u32 mask_plen; /* ntohl(inet_make_mask(plen)) */
114
	struct list_head falh;
115
	struct rcu_head rcu;
116 117 118
};

struct tnode {
O
Olof Johansson 已提交
119
	unsigned long parent;
120
	t_key key;
121 122
	unsigned char pos;		/* 2log(KEYLENGTH) bits needed */
	unsigned char bits;		/* 2log(KEYLENGTH) bits needed */
123 124
	unsigned int full_children;	/* KEYLENGTH bits needed */
	unsigned int empty_children;	/* KEYLENGTH bits needed */
125 126
	union {
		struct rcu_head rcu;
J
Jarek Poplawski 已提交
127
		struct tnode *tnode_free;
128
	};
E
Eric Dumazet 已提交
129
	struct rt_trie_node __rcu *child[0];
130 131 132 133 134 135 136 137 138
};

#ifdef CONFIG_IP_FIB_TRIE_STATS
struct trie_use_stats {
	unsigned int gets;
	unsigned int backtrack;
	unsigned int semantic_match_passed;
	unsigned int semantic_match_miss;
	unsigned int null_node_hit;
139
	unsigned int resize_node_skipped;
140 141 142 143 144 145 146 147 148
};
#endif

struct trie_stat {
	unsigned int totdepth;
	unsigned int maxdepth;
	unsigned int tnodes;
	unsigned int leaves;
	unsigned int nullpointers;
149
	unsigned int prefixes;
R
Robert Olsson 已提交
150
	unsigned int nodesizes[MAX_STAT_DEPTH];
151
};
152 153

struct trie {
E
Eric Dumazet 已提交
154
	struct rt_trie_node __rcu *trie;
155 156 157 158 159
#ifdef CONFIG_IP_FIB_TRIE_STATS
	struct trie_use_stats stats;
#endif
};

160
static void tnode_put_child_reorg(struct tnode *tn, int i, struct rt_trie_node *n,
161
				  int wasfull);
162
static struct rt_trie_node *resize(struct trie *t, struct tnode *tn);
163 164
static struct tnode *inflate(struct trie *t, struct tnode *tn);
static struct tnode *halve(struct trie *t, struct tnode *tn);
J
Jarek Poplawski 已提交
165 166
/* tnodes to free after resize(); protected by RTNL */
static struct tnode *tnode_free_head;
167 168 169 170 171 172 173 174
static size_t tnode_free_size;

/*
 * synchronize_rcu after call_rcu for that many pages; it should be especially
 * useful before resizing the root node with PREEMPT_NONE configs; the value was
 * obtained experimentally, aiming to avoid visible slowdown.
 */
static const int sync_pages = 128;
175

176
static struct kmem_cache *fn_alias_kmem __read_mostly;
177
static struct kmem_cache *trie_leaf_kmem __read_mostly;
178

E
Eric Dumazet 已提交
179 180 181 182
/*
 * caller must hold RTNL
 */
static inline struct tnode *node_parent(const struct rt_trie_node *node)
S
Stephen Hemminger 已提交
183
{
E
Eric Dumazet 已提交
184 185 186 187 188
	unsigned long parent;

	parent = rcu_dereference_index_check(node->parent, lockdep_rtnl_is_held());

	return (struct tnode *)(parent & ~NODE_TYPE_MASK);
189 190
}

E
Eric Dumazet 已提交
191 192 193 194
/*
 * caller must hold RCU read lock or RTNL
 */
static inline struct tnode *node_parent_rcu(const struct rt_trie_node *node)
195
{
E
Eric Dumazet 已提交
196 197 198 199
	unsigned long parent;

	parent = rcu_dereference_index_check(node->parent, rcu_read_lock_held() ||
							   lockdep_rtnl_is_held());
S
Stephen Hemminger 已提交
200

E
Eric Dumazet 已提交
201
	return (struct tnode *)(parent & ~NODE_TYPE_MASK);
S
Stephen Hemminger 已提交
202 203
}

204
/* Same as rcu_assign_pointer
205 206
 * but that macro() assumes that value is a pointer.
 */
207
static inline void node_set_parent(struct rt_trie_node *node, struct tnode *ptr)
S
Stephen Hemminger 已提交
208
{
209 210
	smp_wmb();
	node->parent = (unsigned long)ptr | NODE_TYPE(node);
S
Stephen Hemminger 已提交
211
}
R
Robert Olsson 已提交
212

E
Eric Dumazet 已提交
213 214 215 216
/*
 * caller must hold RTNL
 */
static inline struct rt_trie_node *tnode_get_child(const struct tnode *tn, unsigned int i)
217 218
{
	BUG_ON(i >= 1U << tn->bits);
R
Robert Olsson 已提交
219

E
Eric Dumazet 已提交
220
	return rtnl_dereference(tn->child[i]);
221 222
}

E
Eric Dumazet 已提交
223 224 225 226
/*
 * caller must hold RCU read lock or RTNL
 */
static inline struct rt_trie_node *tnode_get_child_rcu(const struct tnode *tn, unsigned int i)
227
{
E
Eric Dumazet 已提交
228
	BUG_ON(i >= 1U << tn->bits);
229

E
Eric Dumazet 已提交
230
	return rcu_dereference_rtnl(tn->child[i]);
231 232
}

S
Stephen Hemmigner 已提交
233
static inline int tnode_child_length(const struct tnode *tn)
234
{
O
Olof Johansson 已提交
235
	return 1 << tn->bits;
236 237
}

238
static inline t_key mask_pfx(t_key k, unsigned int l)
S
Stephen Hemminger 已提交
239 240 241 242
{
	return (l == 0) ? 0 : k >> (KEYLENGTH-l) << (KEYLENGTH-l);
}

243
static inline t_key tkey_extract_bits(t_key a, unsigned int offset, unsigned int bits)
244
{
O
Olof Johansson 已提交
245
	if (offset < KEYLENGTH)
246
		return ((t_key)(a << offset)) >> (KEYLENGTH - bits);
O
Olof Johansson 已提交
247
	else
248 249 250 251 252
		return 0;
}

static inline int tkey_equals(t_key a, t_key b)
{
253
	return a == b;
254 255 256 257
}

static inline int tkey_sub_equals(t_key a, int offset, int bits, t_key b)
{
258 259
	if (bits == 0 || offset >= KEYLENGTH)
		return 1;
O
Olof Johansson 已提交
260 261
	bits = bits > KEYLENGTH ? KEYLENGTH : bits;
	return ((a ^ b) << offset) >> (KEYLENGTH - bits) == 0;
262
}
263 264 265 266 267 268

static inline int tkey_mismatch(t_key a, int offset, t_key b)
{
	t_key diff = a ^ b;
	int i = offset;

269 270 271
	if (!diff)
		return 0;
	while ((diff << i) >> (KEYLENGTH-1) == 0)
272 273 274 275 276
		i++;
	return i;
}

/*
277 278
  To understand this stuff, an understanding of keys and all their bits is
  necessary. Every node in the trie has a key associated with it, but not
279 280 281 282
  all of the bits in that key are significant.

  Consider a node 'n' and its parent 'tp'.

283 284 285 286 287
  If n is a leaf, every bit in its key is significant. Its presence is
  necessitated by path compression, since during a tree traversal (when
  searching for a leaf - unless we are doing an insertion) we will completely
  ignore all skipped bits we encounter. Thus we need to verify, at the end of
  a potentially successful search, that we have indeed been walking the
288 289
  correct key path.

290 291 292 293 294
  Note that we can never "miss" the correct key in the tree if present by
  following the wrong path. Path compression ensures that segments of the key
  that are the same for all keys with a given prefix are skipped, but the
  skipped part *is* identical for each node in the subtrie below the skipped
  bit! trie_insert() in this implementation takes care of that - note the
295 296
  call to tkey_sub_equals() in trie_insert().

297
  if n is an internal node - a 'tnode' here, the various parts of its key
298 299
  have many different meanings.

300
  Example:
301 302 303
  _________________________________________________________________
  | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
  -----------------------------------------------------------------
304
    0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15
305 306 307 308 309 310 311 312 313

  _________________________________________________________________
  | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
  -----------------------------------------------------------------
   16  17  18  19  20  21  22  23  24  25  26  27  28  29  30  31

  tp->pos = 7
  tp->bits = 3
  n->pos = 15
O
Olof Johansson 已提交
314
  n->bits = 4
315

316 317
  First, let's just ignore the bits that come before the parent tp, that is
  the bits from 0 to (tp->pos-1). They are *known* but at this point we do
318 319 320
  not use them for anything.

  The bits from (tp->pos) to (tp->pos + tp->bits - 1) - "N", above - are the
321
  index into the parent's child array. That is, they will be used to find
322 323 324 325 326
  'n' among tp's children.

  The bits from (tp->pos + tp->bits) to (n->pos - 1) - "S" - are skipped bits
  for the node n.

327
  All the bits we have seen so far are significant to the node n. The rest
328 329
  of the bits are really not needed or indeed known in n->key.

330
  The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into
331
  n's child array, and will of course be different for each child.
332

333

334 335 336 337 338
  The rest of the bits, from (n->pos + n->bits) onward, are completely unknown
  at this point.

*/

S
Stephen Hemminger 已提交
339
static inline void check_tnode(const struct tnode *tn)
340
{
S
Stephen Hemminger 已提交
341
	WARN_ON(tn && tn->pos+tn->bits > 32);
342 343
}

344 345
static const int halve_threshold = 25;
static const int inflate_threshold = 50;
346
static const int halve_threshold_root = 15;
J
Jens Låås 已提交
347
static const int inflate_threshold_root = 30;
R
Robert Olsson 已提交
348 349

static void __alias_free_mem(struct rcu_head *head)
350
{
R
Robert Olsson 已提交
351 352
	struct fib_alias *fa = container_of(head, struct fib_alias, rcu);
	kmem_cache_free(fn_alias_kmem, fa);
353 354
}

R
Robert Olsson 已提交
355
static inline void alias_free_mem_rcu(struct fib_alias *fa)
356
{
R
Robert Olsson 已提交
357 358
	call_rcu(&fa->rcu, __alias_free_mem);
}
O
Olof Johansson 已提交
359

R
Robert Olsson 已提交
360 361
static void __leaf_free_rcu(struct rcu_head *head)
{
362 363
	struct leaf *l = container_of(head, struct leaf, rcu);
	kmem_cache_free(trie_leaf_kmem, l);
R
Robert Olsson 已提交
364
}
O
Olof Johansson 已提交
365

366 367
static inline void free_leaf(struct leaf *l)
{
368
	call_rcu(&l->rcu, __leaf_free_rcu);
369 370
}

R
Robert Olsson 已提交
371
static inline void free_leaf_info(struct leaf_info *leaf)
372
{
373
	kfree_rcu(leaf, rcu);
374 375
}

376
static struct tnode *tnode_alloc(size_t size)
377
{
R
Robert Olsson 已提交
378
	if (size <= PAGE_SIZE)
379
		return kzalloc(size, GFP_KERNEL);
380
	else
381
		return vzalloc(size);
382
}
R
Robert Olsson 已提交
383 384

static void __tnode_free_rcu(struct rcu_head *head)
385
{
R
Robert Olsson 已提交
386
	struct tnode *tn = container_of(head, struct tnode, rcu);
387
	size_t size = sizeof(struct tnode) +
388
		      (sizeof(struct rt_trie_node *) << tn->bits);
389 390 391

	if (size <= PAGE_SIZE)
		kfree(tn);
A
Al Viro 已提交
392 393
	else
		vfree(tn);
394 395
}

R
Robert Olsson 已提交
396 397
static inline void tnode_free(struct tnode *tn)
{
398 399 400
	if (IS_LEAF(tn))
		free_leaf((struct leaf *) tn);
	else
R
Robert Olsson 已提交
401
		call_rcu(&tn->rcu, __tnode_free_rcu);
R
Robert Olsson 已提交
402 403
}

J
Jarek Poplawski 已提交
404 405 406
static void tnode_free_safe(struct tnode *tn)
{
	BUG_ON(IS_LEAF(tn));
407 408
	tn->tnode_free = tnode_free_head;
	tnode_free_head = tn;
409
	tnode_free_size += sizeof(struct tnode) +
410
			   (sizeof(struct rt_trie_node *) << tn->bits);
J
Jarek Poplawski 已提交
411 412 413 414 415 416 417 418 419 420 421
}

static void tnode_free_flush(void)
{
	struct tnode *tn;

	while ((tn = tnode_free_head)) {
		tnode_free_head = tn->tnode_free;
		tn->tnode_free = NULL;
		tnode_free(tn);
	}
422 423 424 425 426

	if (tnode_free_size >= PAGE_SIZE * sync_pages) {
		tnode_free_size = 0;
		synchronize_rcu();
	}
J
Jarek Poplawski 已提交
427 428
}

R
Robert Olsson 已提交
429 430
static struct leaf *leaf_new(void)
{
431
	struct leaf *l = kmem_cache_alloc(trie_leaf_kmem, GFP_KERNEL);
R
Robert Olsson 已提交
432 433 434 435 436 437 438 439 440 441 442 443
	if (l) {
		l->parent = T_LEAF;
		INIT_HLIST_HEAD(&l->list);
	}
	return l;
}

static struct leaf_info *leaf_info_new(int plen)
{
	struct leaf_info *li = kmalloc(sizeof(struct leaf_info),  GFP_KERNEL);
	if (li) {
		li->plen = plen;
444
		li->mask_plen = ntohl(inet_make_mask(plen));
R
Robert Olsson 已提交
445 446 447 448 449
		INIT_LIST_HEAD(&li->falh);
	}
	return li;
}

450
static struct tnode *tnode_new(t_key key, int pos, int bits)
451
{
452
	size_t sz = sizeof(struct tnode) + (sizeof(struct rt_trie_node *) << bits);
453
	struct tnode *tn = tnode_alloc(sz);
454

O
Olof Johansson 已提交
455
	if (tn) {
R
Robert Olsson 已提交
456
		tn->parent = T_TNODE;
457 458 459 460 461 462
		tn->pos = pos;
		tn->bits = bits;
		tn->key = key;
		tn->full_children = 0;
		tn->empty_children = 1<<bits;
	}
463

E
Eric Dumazet 已提交
464
	pr_debug("AT %p s=%zu %zu\n", tn, sizeof(struct tnode),
L
Lin Ming 已提交
465
		 sizeof(struct rt_trie_node *) << bits);
466 467 468 469 470 471 472 473
	return tn;
}

/*
 * Check whether a tnode 'n' is "full", i.e. it is an internal node
 * and no bits are skipped. See discussion in dyntree paper p. 6
 */

474
static inline int tnode_full(const struct tnode *tn, const struct rt_trie_node *n)
475
{
476
	if (n == NULL || IS_LEAF(n))
477 478 479 480 481
		return 0;

	return ((struct tnode *) n)->pos == tn->pos + tn->bits;
}

L
Lin Ming 已提交
482
static inline void put_child(struct tnode *tn, int i,
483
			     struct rt_trie_node *n)
484 485 486 487
{
	tnode_put_child_reorg(tn, i, n, -1);
}

488
 /*
489 490 491 492
  * Add a child at position i overwriting the old value.
  * Update the value of full_children and empty_children.
  */

493
static void tnode_put_child_reorg(struct tnode *tn, int i, struct rt_trie_node *n,
494
				  int wasfull)
495
{
E
Eric Dumazet 已提交
496
	struct rt_trie_node *chi = rtnl_dereference(tn->child[i]);
497 498
	int isfull;

S
Stephen Hemminger 已提交
499 500
	BUG_ON(i >= 1<<tn->bits);

501 502 503 504 505
	/* update emptyChildren */
	if (n == NULL && chi != NULL)
		tn->empty_children++;
	else if (n != NULL && chi == NULL)
		tn->empty_children--;
506

507
	/* update fullChildren */
O
Olof Johansson 已提交
508
	if (wasfull == -1)
509 510 511
		wasfull = tnode_full(tn, chi);

	isfull = tnode_full(tn, n);
512
	if (wasfull && !isfull)
513
		tn->full_children--;
514
	else if (!wasfull && isfull)
515
		tn->full_children++;
O
Olof Johansson 已提交
516

517
	if (n)
S
Stephen Hemminger 已提交
518
		node_set_parent(n, tn);
519

520
	rcu_assign_pointer(tn->child[i], n);
521 522
}

J
Jens Låås 已提交
523
#define MAX_WORK 10
524
static struct rt_trie_node *resize(struct trie *t, struct tnode *tn)
525 526
{
	int i;
527
	struct tnode *old_tn;
528 529
	int inflate_threshold_use;
	int halve_threshold_use;
J
Jens Låås 已提交
530
	int max_work;
531

532
	if (!tn)
533 534
		return NULL;

S
Stephen Hemminger 已提交
535 536
	pr_debug("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
		 tn, inflate_threshold, halve_threshold);
537 538 539

	/* No children */
	if (tn->empty_children == tnode_child_length(tn)) {
J
Jarek Poplawski 已提交
540
		tnode_free_safe(tn);
541 542 543 544
		return NULL;
	}
	/* One child */
	if (tn->empty_children == tnode_child_length(tn) - 1)
J
Jens Låås 已提交
545
		goto one_child;
546
	/*
547 548 549 550 551
	 * Double as long as the resulting node has a number of
	 * nonempty nodes that are above the threshold.
	 */

	/*
552 553
	 * From "Implementing a dynamic compressed trie" by Stefan Nilsson of
	 * the Helsinki University of Technology and Matti Tikkanen of Nokia
554
	 * Telecommunications, page 6:
555
	 * "A node is doubled if the ratio of non-empty children to all
556 557
	 * children in the *doubled* node is at least 'high'."
	 *
558 559 560 561 562
	 * 'high' in this instance is the variable 'inflate_threshold'. It
	 * is expressed as a percentage, so we multiply it with
	 * tnode_child_length() and instead of multiplying by 2 (since the
	 * child array will be doubled by inflate()) and multiplying
	 * the left-hand side by 100 (to handle the percentage thing) we
563
	 * multiply the left-hand side by 50.
564 565 566 567
	 *
	 * The left-hand side may look a bit weird: tnode_child_length(tn)
	 * - tn->empty_children is of course the number of non-null children
	 * in the current node. tn->full_children is the number of "full"
568
	 * children, that is non-null tnodes with a skip value of 0.
569
	 * All of those will be doubled in the resulting inflated tnode, so
570
	 * we just count them one extra time here.
571
	 *
572
	 * A clearer way to write this would be:
573
	 *
574
	 * to_be_doubled = tn->full_children;
575
	 * not_to_be_doubled = tnode_child_length(tn) - tn->empty_children -
576 577 578 579
	 *     tn->full_children;
	 *
	 * new_child_length = tnode_child_length(tn) * 2;
	 *
580
	 * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) /
581 582
	 *      new_child_length;
	 * if (new_fill_factor >= inflate_threshold)
583 584 585
	 *
	 * ...and so on, tho it would mess up the while () loop.
	 *
586 587 588
	 * anyway,
	 * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >=
	 *      inflate_threshold
589
	 *
590 591 592
	 * avoid a division:
	 * 100 * (not_to_be_doubled + 2*to_be_doubled) >=
	 *      inflate_threshold * new_child_length
593
	 *
594
	 * expand not_to_be_doubled and to_be_doubled, and shorten:
595
	 * 100 * (tnode_child_length(tn) - tn->empty_children +
O
Olof Johansson 已提交
596
	 *    tn->full_children) >= inflate_threshold * new_child_length
597
	 *
598
	 * expand new_child_length:
599
	 * 100 * (tnode_child_length(tn) - tn->empty_children +
O
Olof Johansson 已提交
600
	 *    tn->full_children) >=
601
	 *      inflate_threshold * tnode_child_length(tn) * 2
602
	 *
603
	 * shorten again:
604
	 * 50 * (tn->full_children + tnode_child_length(tn) -
O
Olof Johansson 已提交
605
	 *    tn->empty_children) >= inflate_threshold *
606
	 *    tnode_child_length(tn)
607
	 *
608 609 610
	 */

	check_tnode(tn);
611

612 613
	/* Keep root node larger  */

614
	if (!node_parent((struct rt_trie_node *)tn)) {
J
Jens Låås 已提交
615 616
		inflate_threshold_use = inflate_threshold_root;
		halve_threshold_use = halve_threshold_root;
E
Eric Dumazet 已提交
617
	} else {
618
		inflate_threshold_use = inflate_threshold;
J
Jens Låås 已提交
619 620
		halve_threshold_use = halve_threshold;
	}
621

J
Jens Låås 已提交
622 623
	max_work = MAX_WORK;
	while ((tn->full_children > 0 &&  max_work-- &&
624 625 626
		50 * (tn->full_children + tnode_child_length(tn)
		      - tn->empty_children)
		>= inflate_threshold_use * tnode_child_length(tn))) {
627

628 629
		old_tn = tn;
		tn = inflate(t, tn);
630

631 632
		if (IS_ERR(tn)) {
			tn = old_tn;
633 634 635 636 637
#ifdef CONFIG_IP_FIB_TRIE_STATS
			t->stats.resize_node_skipped++;
#endif
			break;
		}
638 639 640 641
	}

	check_tnode(tn);

J
Jens Låås 已提交
642
	/* Return if at least one inflate is run */
E
Eric Dumazet 已提交
643
	if (max_work != MAX_WORK)
644
		return (struct rt_trie_node *) tn;
J
Jens Låås 已提交
645

646 647 648 649
	/*
	 * Halve as long as the number of empty children in this
	 * node is above threshold.
	 */
650

J
Jens Låås 已提交
651 652
	max_work = MAX_WORK;
	while (tn->bits > 1 &&  max_work-- &&
653
	       100 * (tnode_child_length(tn) - tn->empty_children) <
654
	       halve_threshold_use * tnode_child_length(tn)) {
655

656 657 658 659
		old_tn = tn;
		tn = halve(t, tn);
		if (IS_ERR(tn)) {
			tn = old_tn;
660 661 662 663 664 665
#ifdef CONFIG_IP_FIB_TRIE_STATS
			t->stats.resize_node_skipped++;
#endif
			break;
		}
	}
666

667

668
	/* Only one child remains */
J
Jens Låås 已提交
669 670
	if (tn->empty_children == tnode_child_length(tn) - 1) {
one_child:
671
		for (i = 0; i < tnode_child_length(tn); i++) {
672
			struct rt_trie_node *n;
673

E
Eric Dumazet 已提交
674
			n = rtnl_dereference(tn->child[i]);
R
Robert Olsson 已提交
675
			if (!n)
O
Olof Johansson 已提交
676 677 678 679
				continue;

			/* compress one level */

S
Stephen Hemminger 已提交
680
			node_set_parent(n, NULL);
J
Jarek Poplawski 已提交
681
			tnode_free_safe(tn);
O
Olof Johansson 已提交
682
			return n;
683
		}
J
Jens Låås 已提交
684
	}
685
	return (struct rt_trie_node *) tn;
686 687
}

E
Eric Dumazet 已提交
688 689 690 691 692 693 694 695 696 697 698 699 700 701

static void tnode_clean_free(struct tnode *tn)
{
	int i;
	struct tnode *tofree;

	for (i = 0; i < tnode_child_length(tn); i++) {
		tofree = (struct tnode *)rtnl_dereference(tn->child[i]);
		if (tofree)
			tnode_free(tofree);
	}
	tnode_free(tn);
}

702
static struct tnode *inflate(struct trie *t, struct tnode *tn)
703 704 705 706 707
{
	struct tnode *oldtnode = tn;
	int olen = tnode_child_length(tn);
	int i;

S
Stephen Hemminger 已提交
708
	pr_debug("In inflate\n");
709 710 711

	tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1);

S
Stephen Hemminger 已提交
712
	if (!tn)
713
		return ERR_PTR(-ENOMEM);
714 715

	/*
716 717 718
	 * Preallocate and store tnodes before the actual work so we
	 * don't get into an inconsistent state if memory allocation
	 * fails. In case of failure we return the oldnode and  inflate
719 720
	 * of tnode is ignored.
	 */
O
Olof Johansson 已提交
721 722

	for (i = 0; i < olen; i++) {
723
		struct tnode *inode;
724

725
		inode = (struct tnode *) tnode_get_child(oldtnode, i);
726 727 728 729 730
		if (inode &&
		    IS_TNODE(inode) &&
		    inode->pos == oldtnode->pos + oldtnode->bits &&
		    inode->bits > 1) {
			struct tnode *left, *right;
S
Stephen Hemminger 已提交
731
			t_key m = ~0U << (KEYLENGTH - 1) >> inode->pos;
732

733 734
			left = tnode_new(inode->key&(~m), inode->pos + 1,
					 inode->bits - 1);
735 736
			if (!left)
				goto nomem;
O
Olof Johansson 已提交
737

738 739 740
			right = tnode_new(inode->key|m, inode->pos + 1,
					  inode->bits - 1);

741
			if (!right) {
742 743
				tnode_free(left);
				goto nomem;
744
			}
745

L
Lin Ming 已提交
746 747
			put_child(tn, 2*i, (struct rt_trie_node *) left);
			put_child(tn, 2*i+1, (struct rt_trie_node *) right);
748 749 750
		}
	}

O
Olof Johansson 已提交
751
	for (i = 0; i < olen; i++) {
752
		struct tnode *inode;
753
		struct rt_trie_node *node = tnode_get_child(oldtnode, i);
O
Olof Johansson 已提交
754 755
		struct tnode *left, *right;
		int size, j;
756

757 758 759 760 761 762
		/* An empty child */
		if (node == NULL)
			continue;

		/* A leaf or an internal node with skipped bits */

763
		if (IS_LEAF(node) || ((struct tnode *) node)->pos >
764
		   tn->pos + tn->bits - 1) {
765 766 767
			if (tkey_extract_bits(node->key,
					      oldtnode->pos + oldtnode->bits,
					      1) == 0)
L
Lin Ming 已提交
768
				put_child(tn, 2*i, node);
769
			else
L
Lin Ming 已提交
770
				put_child(tn, 2*i+1, node);
771 772 773 774 775 776 777
			continue;
		}

		/* An internal node with two children */
		inode = (struct tnode *) node;

		if (inode->bits == 1) {
L
Lin Ming 已提交
778 779
			put_child(tn, 2*i, rtnl_dereference(inode->child[0]));
			put_child(tn, 2*i+1, rtnl_dereference(inode->child[1]));
780

J
Jarek Poplawski 已提交
781
			tnode_free_safe(inode);
O
Olof Johansson 已提交
782
			continue;
783 784
		}

O
Olof Johansson 已提交
785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802
		/* An internal node with more than two children */

		/* We will replace this node 'inode' with two new
		 * ones, 'left' and 'right', each with half of the
		 * original children. The two new nodes will have
		 * a position one bit further down the key and this
		 * means that the "significant" part of their keys
		 * (see the discussion near the top of this file)
		 * will differ by one bit, which will be "0" in
		 * left's key and "1" in right's key. Since we are
		 * moving the key position by one step, the bit that
		 * we are moving away from - the bit at position
		 * (inode->pos) - is the one that will differ between
		 * left and right. So... we synthesize that bit in the
		 * two  new keys.
		 * The mask 'm' below will be a single "one" bit at
		 * the position (inode->pos)
		 */
803

O
Olof Johansson 已提交
804 805 806
		/* Use the old key, but set the new significant
		 *   bit to zero.
		 */
807

O
Olof Johansson 已提交
808
		left = (struct tnode *) tnode_get_child(tn, 2*i);
L
Lin Ming 已提交
809
		put_child(tn, 2*i, NULL);
810

O
Olof Johansson 已提交
811
		BUG_ON(!left);
812

O
Olof Johansson 已提交
813
		right = (struct tnode *) tnode_get_child(tn, 2*i+1);
L
Lin Ming 已提交
814
		put_child(tn, 2*i+1, NULL);
815

O
Olof Johansson 已提交
816
		BUG_ON(!right);
817

O
Olof Johansson 已提交
818 819
		size = tnode_child_length(left);
		for (j = 0; j < size; j++) {
L
Lin Ming 已提交
820 821
			put_child(left, j, rtnl_dereference(inode->child[j]));
			put_child(right, j, rtnl_dereference(inode->child[j + size]));
822
		}
L
Lin Ming 已提交
823 824
		put_child(tn, 2*i, resize(t, left));
		put_child(tn, 2*i+1, resize(t, right));
O
Olof Johansson 已提交
825

J
Jarek Poplawski 已提交
826
		tnode_free_safe(inode);
827
	}
J
Jarek Poplawski 已提交
828
	tnode_free_safe(oldtnode);
829
	return tn;
830
nomem:
E
Eric Dumazet 已提交
831 832
	tnode_clean_free(tn);
	return ERR_PTR(-ENOMEM);
833 834
}

835
static struct tnode *halve(struct trie *t, struct tnode *tn)
836 837
{
	struct tnode *oldtnode = tn;
838
	struct rt_trie_node *left, *right;
839 840 841
	int i;
	int olen = tnode_child_length(tn);

S
Stephen Hemminger 已提交
842
	pr_debug("In halve\n");
843 844

	tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1);
845

846 847
	if (!tn)
		return ERR_PTR(-ENOMEM);
848 849

	/*
850 851 852
	 * Preallocate and store tnodes before the actual work so we
	 * don't get into an inconsistent state if memory allocation
	 * fails. In case of failure we return the oldnode and halve
853 854 855
	 * of tnode is ignored.
	 */

O
Olof Johansson 已提交
856
	for (i = 0; i < olen; i += 2) {
857 858
		left = tnode_get_child(oldtnode, i);
		right = tnode_get_child(oldtnode, i+1);
859

860
		/* Two nonempty children */
S
Stephen Hemminger 已提交
861
		if (left && right) {
862
			struct tnode *newn;
S
Stephen Hemminger 已提交
863

864
			newn = tnode_new(left->key, tn->pos + tn->bits, 1);
S
Stephen Hemminger 已提交
865 866

			if (!newn)
867
				goto nomem;
S
Stephen Hemminger 已提交
868

L
Lin Ming 已提交
869
			put_child(tn, i/2, (struct rt_trie_node *)newn);
870 871 872
		}

	}
873

O
Olof Johansson 已提交
874 875 876
	for (i = 0; i < olen; i += 2) {
		struct tnode *newBinNode;

877 878
		left = tnode_get_child(oldtnode, i);
		right = tnode_get_child(oldtnode, i+1);
879

880 881 882 883
		/* At least one of the children is empty */
		if (left == NULL) {
			if (right == NULL)    /* Both are empty */
				continue;
L
Lin Ming 已提交
884
			put_child(tn, i/2, right);
O
Olof Johansson 已提交
885
			continue;
S
Stephen Hemminger 已提交
886
		}
O
Olof Johansson 已提交
887 888

		if (right == NULL) {
L
Lin Ming 已提交
889
			put_child(tn, i/2, left);
O
Olof Johansson 已提交
890 891
			continue;
		}
892

893
		/* Two nonempty children */
O
Olof Johansson 已提交
894
		newBinNode = (struct tnode *) tnode_get_child(tn, i/2);
L
Lin Ming 已提交
895 896 897 898
		put_child(tn, i/2, NULL);
		put_child(newBinNode, 0, left);
		put_child(newBinNode, 1, right);
		put_child(tn, i/2, resize(t, newBinNode));
899
	}
J
Jarek Poplawski 已提交
900
	tnode_free_safe(oldtnode);
901
	return tn;
902
nomem:
E
Eric Dumazet 已提交
903 904
	tnode_clean_free(tn);
	return ERR_PTR(-ENOMEM);
905 906
}

R
Robert Olsson 已提交
907
/* readside must use rcu_read_lock currently dump routines
R
Robert Olsson 已提交
908 909
 via get_fa_head and dump */

R
Robert Olsson 已提交
910
static struct leaf_info *find_leaf_info(struct leaf *l, int plen)
911
{
R
Robert Olsson 已提交
912
	struct hlist_head *head = &l->list;
913 914
	struct leaf_info *li;

915
	hlist_for_each_entry_rcu(li, head, hlist)
916
		if (li->plen == plen)
917
			return li;
O
Olof Johansson 已提交
918

919 920 921
	return NULL;
}

922
static inline struct list_head *get_fa_head(struct leaf *l, int plen)
923
{
R
Robert Olsson 已提交
924
	struct leaf_info *li = find_leaf_info(l, plen);
925

O
Olof Johansson 已提交
926 927
	if (!li)
		return NULL;
928

O
Olof Johansson 已提交
929
	return &li->falh;
930 931 932 933
}

static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new)
{
934 935 936 937 938
	struct leaf_info *li = NULL, *last = NULL;

	if (hlist_empty(head)) {
		hlist_add_head_rcu(&new->hlist, head);
	} else {
939
		hlist_for_each_entry(li, head, hlist) {
940 941 942 943 944 945 946 947 948 949
			if (new->plen > li->plen)
				break;

			last = li;
		}
		if (last)
			hlist_add_after_rcu(&last->hlist, &new->hlist);
		else
			hlist_add_before_rcu(&new->hlist, &li->hlist);
	}
950 951
}

R
Robert Olsson 已提交
952 953
/* rcu_read_lock needs to be hold by caller from readside */

954 955 956 957 958
static struct leaf *
fib_find_node(struct trie *t, u32 key)
{
	int pos;
	struct tnode *tn;
959
	struct rt_trie_node *n;
960 961

	pos = 0;
E
Eric Dumazet 已提交
962
	n = rcu_dereference_rtnl(t->trie);
963 964 965

	while (n != NULL &&  NODE_TYPE(n) == T_TNODE) {
		tn = (struct tnode *) n;
O
Olof Johansson 已提交
966

967
		check_tnode(tn);
O
Olof Johansson 已提交
968

969
		if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
O
Olof Johansson 已提交
970
			pos = tn->pos + tn->bits;
971 972 973 974
			n = tnode_get_child_rcu(tn,
						tkey_extract_bits(key,
								  tn->pos,
								  tn->bits));
O
Olof Johansson 已提交
975
		} else
976 977 978 979
			break;
	}
	/* Case we have found a leaf. Compare prefixes */

O
Olof Johansson 已提交
980 981 982
	if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key))
		return (struct leaf *)n;

983 984 985
	return NULL;
}

986
static void trie_rebalance(struct trie *t, struct tnode *tn)
987 988
{
	int wasfull;
R
Robert Olsson 已提交
989
	t_key cindex, key;
S
Stephen Hemminger 已提交
990
	struct tnode *tp;
991

R
Robert Olsson 已提交
992 993
	key = tn->key;

994
	while (tn != NULL && (tp = node_parent((struct rt_trie_node *)tn)) != NULL) {
995 996
		cindex = tkey_extract_bits(key, tp->pos, tp->bits);
		wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
J
Joe Perches 已提交
997
		tn = (struct tnode *)resize(t, tn);
998

J
Joe Perches 已提交
999
		tnode_put_child_reorg(tp, cindex,
1000
				      (struct rt_trie_node *)tn, wasfull);
O
Olof Johansson 已提交
1001

1002
		tp = node_parent((struct rt_trie_node *) tn);
1003
		if (!tp)
1004
			rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn);
1005

J
Jarek Poplawski 已提交
1006
		tnode_free_flush();
S
Stephen Hemminger 已提交
1007
		if (!tp)
1008
			break;
S
Stephen Hemminger 已提交
1009
		tn = tp;
1010
	}
S
Stephen Hemminger 已提交
1011

1012
	/* Handle last (top) tnode */
1013
	if (IS_TNODE(tn))
J
Joe Perches 已提交
1014
		tn = (struct tnode *)resize(t, tn);
1015

1016
	rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn);
1017
	tnode_free_flush();
1018 1019
}

R
Robert Olsson 已提交
1020 1021
/* only used from updater-side */

1022
static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
1023 1024 1025
{
	int pos, newpos;
	struct tnode *tp = NULL, *tn = NULL;
1026
	struct rt_trie_node *n;
1027 1028
	struct leaf *l;
	int missbit;
1029
	struct list_head *fa_head = NULL;
1030 1031 1032 1033
	struct leaf_info *li;
	t_key cindex;

	pos = 0;
E
Eric Dumazet 已提交
1034
	n = rtnl_dereference(t->trie);
1035

1036 1037
	/* If we point to NULL, stop. Either the tree is empty and we should
	 * just put a new leaf in if, or we have reached an empty child slot,
1038
	 * and we should just put our new leaf in that.
1039 1040
	 * If we point to a T_TNODE, check if it matches our key. Note that
	 * a T_TNODE might be skipping any number of bits - its 'pos' need
1041 1042
	 * not be the parent's 'pos'+'bits'!
	 *
1043
	 * If it does match the current key, get pos/bits from it, extract
1044 1045 1046 1047
	 * the index from our key, push the T_TNODE and walk the tree.
	 *
	 * If it doesn't, we have to replace it with a new T_TNODE.
	 *
1048 1049 1050
	 * If we point to a T_LEAF, it might or might not have the same key
	 * as we do. If it does, just change the value, update the T_LEAF's
	 * value, and return it.
1051 1052 1053 1054 1055
	 * If it doesn't, we need to replace it with a T_TNODE.
	 */

	while (n != NULL &&  NODE_TYPE(n) == T_TNODE) {
		tn = (struct tnode *) n;
O
Olof Johansson 已提交
1056

1057
		check_tnode(tn);
O
Olof Johansson 已提交
1058

1059
		if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
1060
			tp = tn;
O
Olof Johansson 已提交
1061
			pos = tn->pos + tn->bits;
1062 1063 1064 1065
			n = tnode_get_child(tn,
					    tkey_extract_bits(key,
							      tn->pos,
							      tn->bits));
1066

S
Stephen Hemminger 已提交
1067
			BUG_ON(n && node_parent(n) != tn);
O
Olof Johansson 已提交
1068
		} else
1069 1070 1071 1072 1073 1074
			break;
	}

	/*
	 * n  ----> NULL, LEAF or TNODE
	 *
1075
	 * tp is n's (parent) ----> NULL or TNODE
1076 1077
	 */

O
Olof Johansson 已提交
1078
	BUG_ON(tp && IS_LEAF(tp));
1079 1080 1081

	/* Case 1: n is a leaf. Compare prefixes */

1082
	if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) {
1083
		l = (struct leaf *) n;
1084
		li = leaf_info_new(plen);
O
Olof Johansson 已提交
1085

1086 1087
		if (!li)
			return NULL;
1088 1089 1090 1091 1092 1093 1094

		fa_head = &li->falh;
		insert_leaf_info(&l->list, li);
		goto done;
	}
	l = leaf_new();

1095 1096
	if (!l)
		return NULL;
1097 1098 1099 1100

	l->key = key;
	li = leaf_info_new(plen);

1101
	if (!li) {
1102
		free_leaf(l);
1103
		return NULL;
1104
	}
1105 1106 1107 1108 1109

	fa_head = &li->falh;
	insert_leaf_info(&l->list, li);

	if (t->trie && n == NULL) {
O
Olof Johansson 已提交
1110
		/* Case 2: n is NULL, and will just insert a new leaf */
1111

1112
		node_set_parent((struct rt_trie_node *)l, tp);
1113

O
Olof Johansson 已提交
1114
		cindex = tkey_extract_bits(key, tp->pos, tp->bits);
L
Lin Ming 已提交
1115
		put_child(tp, cindex, (struct rt_trie_node *)l);
O
Olof Johansson 已提交
1116 1117
	} else {
		/* Case 3: n is a LEAF or a TNODE and the key doesn't match. */
1118 1119
		/*
		 *  Add a new tnode here
1120 1121 1122 1123
		 *  first tnode need some special handling
		 */

		if (tp)
O
Olof Johansson 已提交
1124
			pos = tp->pos+tp->bits;
1125
		else
O
Olof Johansson 已提交
1126 1127
			pos = 0;

1128
		if (n) {
1129 1130
			newpos = tkey_mismatch(key, pos, n->key);
			tn = tnode_new(n->key, newpos, 1);
O
Olof Johansson 已提交
1131
		} else {
1132
			newpos = 0;
1133
			tn = tnode_new(key, newpos, 1); /* First tnode */
1134 1135
		}

1136
		if (!tn) {
1137
			free_leaf_info(li);
1138
			free_leaf(l);
1139
			return NULL;
O
Olof Johansson 已提交
1140 1141
		}

1142
		node_set_parent((struct rt_trie_node *)tn, tp);
1143

O
Olof Johansson 已提交
1144
		missbit = tkey_extract_bits(key, newpos, 1);
L
Lin Ming 已提交
1145 1146
		put_child(tn, missbit, (struct rt_trie_node *)l);
		put_child(tn, 1-missbit, n);
1147

1148
		if (tp) {
1149
			cindex = tkey_extract_bits(key, tp->pos, tp->bits);
L
Lin Ming 已提交
1150
			put_child(tp, cindex, (struct rt_trie_node *)tn);
O
Olof Johansson 已提交
1151
		} else {
1152
			rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn);
1153 1154 1155
			tp = tn;
		}
	}
O
Olof Johansson 已提交
1156 1157

	if (tp && tp->pos + tp->bits > 32)
J
Joe Perches 已提交
1158 1159
		pr_warn("fib_trie tp=%p pos=%d, bits=%d, key=%0x plen=%d\n",
			tp, tp->pos, tp->bits, key, plen);
O
Olof Johansson 已提交
1160

1161
	/* Rebalance the trie */
R
Robert Olsson 已提交
1162

1163
	trie_rebalance(t, tp);
1164
done:
1165 1166 1167
	return fa_head;
}

1168 1169 1170
/*
 * Caller must hold RTNL.
 */
1171
int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
1172 1173 1174
{
	struct trie *t = (struct trie *) tb->tb_data;
	struct fib_alias *fa, *new_fa;
1175
	struct list_head *fa_head = NULL;
1176
	struct fib_info *fi;
1177 1178
	int plen = cfg->fc_dst_len;
	u8 tos = cfg->fc_tos;
1179 1180 1181 1182 1183 1184 1185
	u32 key, mask;
	int err;
	struct leaf *l;

	if (plen > 32)
		return -EINVAL;

1186
	key = ntohl(cfg->fc_dst);
1187

1188
	pr_debug("Insert table=%u %08x/%d\n", tb->tb_id, key, plen);
1189

O
Olof Johansson 已提交
1190
	mask = ntohl(inet_make_mask(plen));
1191

1192
	if (key & ~mask)
1193 1194 1195 1196
		return -EINVAL;

	key = key & mask;

1197 1198 1199
	fi = fib_create_info(cfg);
	if (IS_ERR(fi)) {
		err = PTR_ERR(fi);
1200
		goto err;
1201
	}
1202 1203

	l = fib_find_node(t, key);
1204
	fa = NULL;
1205

1206
	if (l) {
1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221
		fa_head = get_fa_head(l, plen);
		fa = fib_find_alias(fa_head, tos, fi->fib_priority);
	}

	/* Now fa, if non-NULL, points to the first fib alias
	 * with the same keys [prefix,tos,priority], if such key already
	 * exists or to the node before which we will insert new one.
	 *
	 * If fa is NULL, we will need to allocate a new one and
	 * insert to the head of f.
	 *
	 * If f is NULL, no fib node matched the destination key
	 * and we need to allocate a new one of those as well.
	 */

1222 1223 1224
	if (fa && fa->fa_tos == tos &&
	    fa->fa_info->fib_priority == fi->fib_priority) {
		struct fib_alias *fa_first, *fa_match;
1225 1226

		err = -EEXIST;
1227
		if (cfg->fc_nlflags & NLM_F_EXCL)
1228 1229
			goto out;

1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249
		/* We have 2 goals:
		 * 1. Find exact match for type, scope, fib_info to avoid
		 * duplicate routes
		 * 2. Find next 'fa' (or head), NLM_F_APPEND inserts before it
		 */
		fa_match = NULL;
		fa_first = fa;
		fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
		list_for_each_entry_continue(fa, fa_head, fa_list) {
			if (fa->fa_tos != tos)
				break;
			if (fa->fa_info->fib_priority != fi->fib_priority)
				break;
			if (fa->fa_type == cfg->fc_type &&
			    fa->fa_info == fi) {
				fa_match = fa;
				break;
			}
		}

1250
		if (cfg->fc_nlflags & NLM_F_REPLACE) {
1251 1252 1253
			struct fib_info *fi_drop;
			u8 state;

1254 1255 1256 1257
			fa = fa_first;
			if (fa_match) {
				if (fa == fa_match)
					err = 0;
1258
				goto out;
1259
			}
R
Robert Olsson 已提交
1260
			err = -ENOBUFS;
1261
			new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
R
Robert Olsson 已提交
1262 1263
			if (new_fa == NULL)
				goto out;
1264 1265

			fi_drop = fa->fa_info;
R
Robert Olsson 已提交
1266 1267
			new_fa->fa_tos = fa->fa_tos;
			new_fa->fa_info = fi;
1268
			new_fa->fa_type = cfg->fc_type;
1269
			state = fa->fa_state;
1270
			new_fa->fa_state = state & ~FA_S_ACCESSED;
1271

R
Robert Olsson 已提交
1272 1273
			list_replace_rcu(&fa->fa_list, &new_fa->fa_list);
			alias_free_mem_rcu(fa);
1274 1275 1276

			fib_release_info(fi_drop);
			if (state & FA_S_ACCESSED)
1277
				rt_cache_flush(cfg->fc_nlinfo.nl_net);
1278 1279
			rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen,
				tb->tb_id, &cfg->fc_nlinfo, NLM_F_REPLACE);
1280

O
Olof Johansson 已提交
1281
			goto succeeded;
1282 1283 1284 1285 1286
		}
		/* Error if we find a perfect match which
		 * uses the same scope, type, and nexthop
		 * information.
		 */
1287 1288
		if (fa_match)
			goto out;
1289

1290
		if (!(cfg->fc_nlflags & NLM_F_APPEND))
1291
			fa = fa_first;
1292 1293
	}
	err = -ENOENT;
1294
	if (!(cfg->fc_nlflags & NLM_F_CREATE))
1295 1296 1297
		goto out;

	err = -ENOBUFS;
1298
	new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
1299 1300 1301 1302 1303
	if (new_fa == NULL)
		goto out;

	new_fa->fa_info = fi;
	new_fa->fa_tos = tos;
1304
	new_fa->fa_type = cfg->fc_type;
1305 1306 1307 1308 1309
	new_fa->fa_state = 0;
	/*
	 * Insert new entry to the list.
	 */

1310
	if (!fa_head) {
1311 1312 1313
		fa_head = fib_insert_node(t, key, plen);
		if (unlikely(!fa_head)) {
			err = -ENOMEM;
1314
			goto out_free_new_fa;
1315
		}
1316
	}
1317

1318 1319 1320
	if (!plen)
		tb->tb_num_default++;

R
Robert Olsson 已提交
1321 1322
	list_add_tail_rcu(&new_fa->fa_list,
			  (fa ? &fa->fa_list : fa_head));
1323

1324
	rt_cache_flush(cfg->fc_nlinfo.nl_net);
1325
	rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id,
1326
		  &cfg->fc_nlinfo, 0);
1327 1328
succeeded:
	return 0;
1329 1330 1331

out_free_new_fa:
	kmem_cache_free(fn_alias_kmem, new_fa);
1332 1333
out:
	fib_release_info(fi);
O
Olof Johansson 已提交
1334
err:
1335 1336 1337
	return err;
}

R
Robert Olsson 已提交
1338
/* should be called with rcu_read_lock */
1339
static int check_leaf(struct fib_table *tb, struct trie *t, struct leaf *l,
1340
		      t_key key,  const struct flowi4 *flp,
E
Eric Dumazet 已提交
1341
		      struct fib_result *res, int fib_flags)
1342 1343 1344
{
	struct leaf_info *li;
	struct hlist_head *hhead = &l->list;
1345

1346
	hlist_for_each_entry_rcu(li, hhead, hlist) {
1347
		struct fib_alias *fa;
1348

1349
		if (l->key != (key & li->mask_plen))
1350 1351
			continue;

1352 1353 1354
		list_for_each_entry_rcu(fa, &li->falh, fa_list) {
			struct fib_info *fi = fa->fa_info;
			int nhsel, err;
1355

1356
			if (fa->fa_tos && fa->fa_tos != flp->flowi4_tos)
1357
				continue;
1358 1359
			if (fi->fib_dead)
				continue;
1360
			if (fa->fa_info->fib_scope < flp->flowi4_scope)
1361 1362 1363 1364
				continue;
			fib_alias_accessed(fa);
			err = fib_props[fa->fa_type].error;
			if (err) {
1365
#ifdef CONFIG_IP_FIB_TRIE_STATS
1366
				t->stats.semantic_match_passed++;
1367
#endif
1368
				return err;
1369 1370 1371 1372 1373 1374 1375 1376
			}
			if (fi->fib_flags & RTNH_F_DEAD)
				continue;
			for (nhsel = 0; nhsel < fi->fib_nhs; nhsel++) {
				const struct fib_nh *nh = &fi->fib_nh[nhsel];

				if (nh->nh_flags & RTNH_F_DEAD)
					continue;
1377
				if (flp->flowi4_oif && flp->flowi4_oif != nh->nh_oif)
1378 1379 1380 1381 1382
					continue;

#ifdef CONFIG_IP_FIB_TRIE_STATS
				t->stats.semantic_match_passed++;
#endif
1383
				res->prefixlen = li->plen;
1384 1385
				res->nh_sel = nhsel;
				res->type = fa->fa_type;
1386
				res->scope = fa->fa_info->fib_scope;
1387 1388 1389 1390
				res->fi = fi;
				res->table = tb;
				res->fa_head = &li->falh;
				if (!(fib_flags & FIB_LOOKUP_NOREF))
1391
					atomic_inc(&fi->fib_clntref);
1392 1393 1394 1395 1396 1397
				return 0;
			}
		}

#ifdef CONFIG_IP_FIB_TRIE_STATS
		t->stats.semantic_match_miss++;
1398 1399
#endif
	}
1400

1401
	return 1;
1402 1403
}

1404
int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp,
E
Eric Dumazet 已提交
1405
		     struct fib_result *res, int fib_flags)
1406 1407
{
	struct trie *t = (struct trie *) tb->tb_data;
1408
	int ret;
1409
	struct rt_trie_node *n;
1410
	struct tnode *pn;
1411
	unsigned int pos, bits;
1412
	t_key key = ntohl(flp->daddr);
1413
	unsigned int chopped_off;
1414
	t_key cindex = 0;
1415
	unsigned int current_prefix_length = KEYLENGTH;
O
Olof Johansson 已提交
1416
	struct tnode *cn;
1417
	t_key pref_mismatch;
O
Olof Johansson 已提交
1418

R
Robert Olsson 已提交
1419
	rcu_read_lock();
O
Olof Johansson 已提交
1420

R
Robert Olsson 已提交
1421
	n = rcu_dereference(t->trie);
1422
	if (!n)
1423 1424 1425 1426 1427 1428 1429 1430
		goto failed;

#ifdef CONFIG_IP_FIB_TRIE_STATS
	t->stats.gets++;
#endif

	/* Just a leaf? */
	if (IS_LEAF(n)) {
1431
		ret = check_leaf(tb, t, (struct leaf *)n, key, flp, res, fib_flags);
1432
		goto found;
1433
	}
1434

1435 1436
	pn = (struct tnode *) n;
	chopped_off = 0;
1437

O
Olof Johansson 已提交
1438
	while (pn) {
1439 1440 1441
		pos = pn->pos;
		bits = pn->bits;

1442
		if (!chopped_off)
S
Stephen Hemminger 已提交
1443 1444
			cindex = tkey_extract_bits(mask_pfx(key, current_prefix_length),
						   pos, bits);
1445

1446
		n = tnode_get_child_rcu(pn, cindex);
1447 1448 1449 1450 1451 1452 1453 1454

		if (n == NULL) {
#ifdef CONFIG_IP_FIB_TRIE_STATS
			t->stats.null_node_hit++;
#endif
			goto backtrace;
		}

O
Olof Johansson 已提交
1455
		if (IS_LEAF(n)) {
1456
			ret = check_leaf(tb, t, (struct leaf *)n, key, flp, res, fib_flags);
1457
			if (ret > 0)
O
Olof Johansson 已提交
1458
				goto backtrace;
1459
			goto found;
O
Olof Johansson 已提交
1460 1461 1462
		}

		cn = (struct tnode *)n;
1463

O
Olof Johansson 已提交
1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478
		/*
		 * It's a tnode, and we can do some extra checks here if we
		 * like, to avoid descending into a dead-end branch.
		 * This tnode is in the parent's child array at index
		 * key[p_pos..p_pos+p_bits] but potentially with some bits
		 * chopped off, so in reality the index may be just a
		 * subprefix, padded with zero at the end.
		 * We can also take a look at any skipped bits in this
		 * tnode - everything up to p_pos is supposed to be ok,
		 * and the non-chopped bits of the index (se previous
		 * paragraph) are also guaranteed ok, but the rest is
		 * considered unknown.
		 *
		 * The skipped bits are key[pos+bits..cn->pos].
		 */
1479

O
Olof Johansson 已提交
1480 1481 1482 1483 1484 1485 1486 1487 1488
		/* If current_prefix_length < pos+bits, we are already doing
		 * actual prefix  matching, which means everything from
		 * pos+(bits-chopped_off) onward must be zero along some
		 * branch of this subtree - otherwise there is *no* valid
		 * prefix present. Here we can only check the skipped
		 * bits. Remember, since we have already indexed into the
		 * parent's child array, we know that the bits we chopped of
		 * *are* zero.
		 */
1489

1490 1491
		/* NOTA BENE: Checking only skipped bits
		   for the new node here */
1492

O
Olof Johansson 已提交
1493 1494
		if (current_prefix_length < pos+bits) {
			if (tkey_extract_bits(cn->key, current_prefix_length,
1495 1496
						cn->pos - current_prefix_length)
			    || !(cn->child[0]))
O
Olof Johansson 已提交
1497 1498
				goto backtrace;
		}
1499

O
Olof Johansson 已提交
1500 1501 1502 1503 1504 1505 1506 1507 1508
		/*
		 * If chopped_off=0, the index is fully validated and we
		 * only need to look at the skipped bits for this, the new,
		 * tnode. What we actually want to do is to find out if
		 * these skipped bits match our key perfectly, or if we will
		 * have to count on finding a matching prefix further down,
		 * because if we do, we would like to have some way of
		 * verifying the existence of such a prefix at this point.
		 */
1509

O
Olof Johansson 已提交
1510 1511 1512 1513 1514 1515 1516 1517
		/* The only thing we can do at this point is to verify that
		 * any such matching prefix can indeed be a prefix to our
		 * key, and if the bits in the node we are inspecting that
		 * do not match our key are not ZERO, this cannot be true.
		 * Thus, find out where there is a mismatch (before cn->pos)
		 * and verify that all the mismatching bits are zero in the
		 * new tnode's key.
		 */
1518

1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529
		/*
		 * Note: We aren't very concerned about the piece of
		 * the key that precede pn->pos+pn->bits, since these
		 * have already been checked. The bits after cn->pos
		 * aren't checked since these are by definition
		 * "unknown" at this point. Thus, what we want to see
		 * is if we are about to enter the "prefix matching"
		 * state, and in that case verify that the skipped
		 * bits that will prevail throughout this subtree are
		 * zero, as they have to be if we are to find a
		 * matching prefix.
O
Olof Johansson 已提交
1530 1531
		 */

1532
		pref_mismatch = mask_pfx(cn->key ^ key, cn->pos);
O
Olof Johansson 已提交
1533

1534 1535 1536 1537
		/*
		 * In short: If skipped bits in this node do not match
		 * the search key, enter the "prefix matching"
		 * state.directly.
O
Olof Johansson 已提交
1538 1539
		 */
		if (pref_mismatch) {
1540 1541
			/* fls(x) = __fls(x) + 1 */
			int mp = KEYLENGTH - __fls(pref_mismatch) - 1;
O
Olof Johansson 已提交
1542

1543
			if (tkey_extract_bits(cn->key, mp, cn->pos - mp) != 0)
O
Olof Johansson 已提交
1544 1545 1546 1547
				goto backtrace;

			if (current_prefix_length >= cn->pos)
				current_prefix_length = mp;
1548
		}
1549

O
Olof Johansson 已提交
1550 1551 1552 1553
		pn = (struct tnode *)n; /* Descend */
		chopped_off = 0;
		continue;

1554 1555 1556 1557
backtrace:
		chopped_off++;

		/* As zero don't change the child key (cindex) */
1558 1559
		while ((chopped_off <= pn->bits)
		       && !(cindex & (1<<(chopped_off-1))))
1560 1561 1562 1563
			chopped_off++;

		/* Decrease current_... with bits chopped off */
		if (current_prefix_length > pn->pos + pn->bits - chopped_off)
1564 1565
			current_prefix_length = pn->pos + pn->bits
				- chopped_off;
O
Olof Johansson 已提交
1566

1567
		/*
1568
		 * Either we do the actual chop off according or if we have
1569 1570 1571
		 * chopped off all bits in this tnode walk up to our parent.
		 */

O
Olof Johansson 已提交
1572
		if (chopped_off <= pn->bits) {
1573
			cindex &= ~(1 << (chopped_off-1));
O
Olof Johansson 已提交
1574
		} else {
1575
			struct tnode *parent = node_parent_rcu((struct rt_trie_node *) pn);
S
Stephen Hemminger 已提交
1576
			if (!parent)
1577
				goto failed;
O
Olof Johansson 已提交
1578

1579
			/* Get Child's index */
S
Stephen Hemminger 已提交
1580 1581
			cindex = tkey_extract_bits(pn->key, parent->pos, parent->bits);
			pn = parent;
1582 1583 1584 1585 1586 1587
			chopped_off = 0;

#ifdef CONFIG_IP_FIB_TRIE_STATS
			t->stats.backtrack++;
#endif
			goto backtrace;
1588
		}
1589 1590
	}
failed:
1591
	ret = 1;
1592
found:
R
Robert Olsson 已提交
1593
	rcu_read_unlock();
1594 1595
	return ret;
}
1596
EXPORT_SYMBOL_GPL(fib_table_lookup);
1597

1598 1599 1600 1601
/*
 * Remove the leaf and return parent.
 */
static void trie_leaf_remove(struct trie *t, struct leaf *l)
1602
{
1603
	struct tnode *tp = node_parent((struct rt_trie_node *) l);
1604

1605
	pr_debug("entering trie_leaf_remove(%p)\n", l);
1606

1607
	if (tp) {
1608
		t_key cindex = tkey_extract_bits(l->key, tp->pos, tp->bits);
L
Lin Ming 已提交
1609
		put_child(tp, cindex, NULL);
1610
		trie_rebalance(t, tp);
O
Olof Johansson 已提交
1611
	} else
1612
		RCU_INIT_POINTER(t->trie, NULL);
1613

1614
	free_leaf(l);
1615 1616
}

1617 1618 1619
/*
 * Caller must hold RTNL.
 */
1620
int fib_table_delete(struct fib_table *tb, struct fib_config *cfg)
1621 1622 1623
{
	struct trie *t = (struct trie *) tb->tb_data;
	u32 key, mask;
1624 1625
	int plen = cfg->fc_dst_len;
	u8 tos = cfg->fc_tos;
1626 1627 1628
	struct fib_alias *fa, *fa_to_delete;
	struct list_head *fa_head;
	struct leaf *l;
O
Olof Johansson 已提交
1629 1630
	struct leaf_info *li;

1631
	if (plen > 32)
1632 1633
		return -EINVAL;

1634
	key = ntohl(cfg->fc_dst);
O
Olof Johansson 已提交
1635
	mask = ntohl(inet_make_mask(plen));
1636

1637
	if (key & ~mask)
1638 1639 1640 1641 1642
		return -EINVAL;

	key = key & mask;
	l = fib_find_node(t, key);

1643
	if (!l)
1644 1645
		return -ESRCH;

1646 1647 1648 1649 1650 1651
	li = find_leaf_info(l, plen);

	if (!li)
		return -ESRCH;

	fa_head = &li->falh;
1652 1653 1654 1655 1656
	fa = fib_find_alias(fa_head, tos, 0);

	if (!fa)
		return -ESRCH;

S
Stephen Hemminger 已提交
1657
	pr_debug("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t);
1658 1659

	fa_to_delete = NULL;
1660 1661
	fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
	list_for_each_entry_continue(fa, fa_head, fa_list) {
1662 1663 1664 1665 1666
		struct fib_info *fi = fa->fa_info;

		if (fa->fa_tos != tos)
			break;

1667 1668
		if ((!cfg->fc_type || fa->fa_type == cfg->fc_type) &&
		    (cfg->fc_scope == RT_SCOPE_NOWHERE ||
1669
		     fa->fa_info->fib_scope == cfg->fc_scope) &&
1670 1671
		    (!cfg->fc_prefsrc ||
		     fi->fib_prefsrc == cfg->fc_prefsrc) &&
1672 1673 1674
		    (!cfg->fc_protocol ||
		     fi->fib_protocol == cfg->fc_protocol) &&
		    fib_nh_match(cfg, fi) == 0) {
1675 1676 1677 1678 1679
			fa_to_delete = fa;
			break;
		}
	}

O
Olof Johansson 已提交
1680 1681
	if (!fa_to_delete)
		return -ESRCH;
1682

O
Olof Johansson 已提交
1683
	fa = fa_to_delete;
1684
	rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id,
1685
		  &cfg->fc_nlinfo, 0);
O
Olof Johansson 已提交
1686

R
Robert Olsson 已提交
1687
	list_del_rcu(&fa->fa_list);
1688

1689 1690 1691
	if (!plen)
		tb->tb_num_default--;

O
Olof Johansson 已提交
1692
	if (list_empty(fa_head)) {
R
Robert Olsson 已提交
1693
		hlist_del_rcu(&li->hlist);
O
Olof Johansson 已提交
1694
		free_leaf_info(li);
R
Robert Olsson 已提交
1695
	}
1696

O
Olof Johansson 已提交
1697
	if (hlist_empty(&l->list))
1698
		trie_leaf_remove(t, l);
1699

O
Olof Johansson 已提交
1700
	if (fa->fa_state & FA_S_ACCESSED)
1701
		rt_cache_flush(cfg->fc_nlinfo.nl_net);
1702

R
Robert Olsson 已提交
1703 1704
	fib_release_info(fa->fa_info);
	alias_free_mem_rcu(fa);
O
Olof Johansson 已提交
1705
	return 0;
1706 1707
}

1708
static int trie_flush_list(struct list_head *head)
1709 1710 1711 1712 1713 1714 1715
{
	struct fib_alias *fa, *fa_node;
	int found = 0;

	list_for_each_entry_safe(fa, fa_node, head, fa_list) {
		struct fib_info *fi = fa->fa_info;

R
Robert Olsson 已提交
1716 1717 1718 1719
		if (fi && (fi->fib_flags & RTNH_F_DEAD)) {
			list_del_rcu(&fa->fa_list);
			fib_release_info(fa->fa_info);
			alias_free_mem_rcu(fa);
1720 1721 1722 1723 1724 1725
			found++;
		}
	}
	return found;
}

1726
static int trie_flush_leaf(struct leaf *l)
1727 1728 1729
{
	int found = 0;
	struct hlist_head *lih = &l->list;
1730
	struct hlist_node *tmp;
1731 1732
	struct leaf_info *li = NULL;

1733
	hlist_for_each_entry_safe(li, tmp, lih, hlist) {
1734
		found += trie_flush_list(&li->falh);
1735 1736

		if (list_empty(&li->falh)) {
R
Robert Olsson 已提交
1737
			hlist_del_rcu(&li->hlist);
1738 1739 1740 1741 1742 1743
			free_leaf_info(li);
		}
	}
	return found;
}

1744 1745 1746 1747
/*
 * Scan for the next right leaf starting at node p->child[idx]
 * Since we have back pointer, no recursion necessary.
 */
1748
static struct leaf *leaf_walk_rcu(struct tnode *p, struct rt_trie_node *c)
1749
{
1750 1751
	do {
		t_key idx;
1752 1753

		if (c)
1754
			idx = tkey_extract_bits(c->key, p->pos, p->bits) + 1;
1755
		else
1756
			idx = 0;
R
Robert Olsson 已提交
1757

1758 1759
		while (idx < 1u << p->bits) {
			c = tnode_get_child_rcu(p, idx++);
R
Robert Olsson 已提交
1760
			if (!c)
O
Olof Johansson 已提交
1761 1762
				continue;

1763
			if (IS_LEAF(c))
1764 1765 1766 1767 1768
				return (struct leaf *) c;

			/* Rescan start scanning in new node */
			p = (struct tnode *) c;
			idx = 0;
1769
		}
1770 1771

		/* Node empty, walk back up to parent */
1772
		c = (struct rt_trie_node *) p;
E
Eric Dumazet 已提交
1773
	} while ((p = node_parent_rcu(c)) != NULL);
1774 1775 1776 1777 1778 1779

	return NULL; /* Root of trie */
}

static struct leaf *trie_firstleaf(struct trie *t)
{
E
Eric Dumazet 已提交
1780
	struct tnode *n = (struct tnode *)rcu_dereference_rtnl(t->trie);
1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792

	if (!n)
		return NULL;

	if (IS_LEAF(n))          /* trie is just a leaf */
		return (struct leaf *) n;

	return leaf_walk_rcu(n, NULL);
}

static struct leaf *trie_nextleaf(struct leaf *l)
{
1793
	struct rt_trie_node *c = (struct rt_trie_node *) l;
1794
	struct tnode *p = node_parent_rcu(c);
1795 1796 1797 1798 1799

	if (!p)
		return NULL;	/* trie with just one leaf */

	return leaf_walk_rcu(p, c);
1800 1801
}

1802 1803 1804 1805
static struct leaf *trie_leafindex(struct trie *t, int index)
{
	struct leaf *l = trie_firstleaf(t);

S
Stephen Hemminger 已提交
1806
	while (l && index-- > 0)
1807
		l = trie_nextleaf(l);
S
Stephen Hemminger 已提交
1808

1809 1810 1811 1812
	return l;
}


1813 1814 1815
/*
 * Caller must hold RTNL.
 */
1816
int fib_table_flush(struct fib_table *tb)
1817 1818
{
	struct trie *t = (struct trie *) tb->tb_data;
1819
	struct leaf *l, *ll = NULL;
1820
	int found = 0;
1821

1822
	for (l = trie_firstleaf(t); l; l = trie_nextleaf(l)) {
1823
		found += trie_flush_leaf(l);
1824 1825

		if (ll && hlist_empty(&ll->list))
1826
			trie_leaf_remove(t, ll);
1827 1828 1829 1830
		ll = l;
	}

	if (ll && hlist_empty(&ll->list))
1831
		trie_leaf_remove(t, ll);
1832

S
Stephen Hemminger 已提交
1833
	pr_debug("trie_flush found=%d\n", found);
1834 1835 1836
	return found;
}

1837 1838 1839 1840 1841
void fib_free_table(struct fib_table *tb)
{
	kfree(tb);
}

1842 1843
static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah,
			   struct fib_table *tb,
1844 1845 1846 1847
			   struct sk_buff *skb, struct netlink_callback *cb)
{
	int i, s_i;
	struct fib_alias *fa;
A
Al Viro 已提交
1848
	__be32 xkey = htonl(key);
1849

1850
	s_i = cb->args[5];
1851 1852
	i = 0;

R
Robert Olsson 已提交
1853 1854 1855
	/* rcu_read_lock is hold by caller */

	list_for_each_entry_rcu(fa, fah, fa_list) {
1856 1857 1858 1859 1860
		if (i < s_i) {
			i++;
			continue;
		}

1861
		if (fib_dump_info(skb, NETLINK_CB(cb->skb).portid,
1862 1863 1864 1865
				  cb->nlh->nlmsg_seq,
				  RTM_NEWROUTE,
				  tb->tb_id,
				  fa->fa_type,
1866
				  xkey,
1867 1868
				  plen,
				  fa->fa_tos,
1869
				  fa->fa_info, NLM_F_MULTI) < 0) {
1870
			cb->args[5] = i;
1871
			return -1;
O
Olof Johansson 已提交
1872
		}
1873 1874
		i++;
	}
1875
	cb->args[5] = i;
1876 1877 1878
	return skb->len;
}

1879 1880
static int fn_trie_dump_leaf(struct leaf *l, struct fib_table *tb,
			struct sk_buff *skb, struct netlink_callback *cb)
1881
{
1882 1883
	struct leaf_info *li;
	int i, s_i;
1884

1885
	s_i = cb->args[4];
1886
	i = 0;
1887

1888
	/* rcu_read_lock is hold by caller */
1889
	hlist_for_each_entry_rcu(li, &l->list, hlist) {
1890 1891
		if (i < s_i) {
			i++;
1892
			continue;
1893
		}
O
Olof Johansson 已提交
1894

1895
		if (i > s_i)
1896
			cb->args[5] = 0;
1897

1898
		if (list_empty(&li->falh))
1899 1900
			continue;

1901
		if (fn_trie_dump_fa(l->key, li->plen, &li->falh, tb, skb, cb) < 0) {
1902
			cb->args[4] = i;
1903 1904
			return -1;
		}
1905
		i++;
1906
	}
1907

1908
	cb->args[4] = i;
1909 1910 1911
	return skb->len;
}

1912 1913
int fib_table_dump(struct fib_table *tb, struct sk_buff *skb,
		   struct netlink_callback *cb)
1914
{
1915
	struct leaf *l;
1916
	struct trie *t = (struct trie *) tb->tb_data;
1917
	t_key key = cb->args[2];
1918
	int count = cb->args[3];
1919

R
Robert Olsson 已提交
1920
	rcu_read_lock();
1921 1922 1923
	/* Dump starting at last key.
	 * Note: 0.0.0.0/0 (ie default) is first key.
	 */
1924
	if (count == 0)
1925 1926
		l = trie_firstleaf(t);
	else {
1927 1928 1929
		/* Normally, continue from last key, but if that is missing
		 * fallback to using slow rescan
		 */
1930
		l = fib_find_node(t, key);
1931 1932
		if (!l)
			l = trie_leafindex(t, count);
1933
	}
1934

1935 1936
	while (l) {
		cb->args[2] = l->key;
1937
		if (fn_trie_dump_leaf(l, tb, skb, cb) < 0) {
1938
			cb->args[3] = count;
1939 1940
			rcu_read_unlock();
			return -1;
1941
		}
1942

1943
		++count;
1944
		l = trie_nextleaf(l);
1945 1946
		memset(&cb->args[4], 0,
		       sizeof(cb->args) - 4*sizeof(cb->args[0]));
1947
	}
1948
	cb->args[3] = count;
R
Robert Olsson 已提交
1949
	rcu_read_unlock();
1950

1951 1952 1953
	return skb->len;
}

1954
void __init fib_trie_init(void)
1955
{
1956 1957
	fn_alias_kmem = kmem_cache_create("ip_fib_alias",
					  sizeof(struct fib_alias),
1958 1959 1960 1961 1962 1963
					  0, SLAB_PANIC, NULL);

	trie_leaf_kmem = kmem_cache_create("ip_fib_trie",
					   max(sizeof(struct leaf),
					       sizeof(struct leaf_info)),
					   0, SLAB_PANIC, NULL);
1964
}
1965

1966

1967
struct fib_table *fib_trie_table(u32 id)
1968 1969 1970 1971 1972 1973 1974 1975 1976 1977
{
	struct fib_table *tb;
	struct trie *t;

	tb = kmalloc(sizeof(struct fib_table) + sizeof(struct trie),
		     GFP_KERNEL);
	if (tb == NULL)
		return NULL;

	tb->tb_id = id;
1978
	tb->tb_default = -1;
1979
	tb->tb_num_default = 0;
1980 1981

	t = (struct trie *) tb->tb_data;
1982
	memset(t, 0, sizeof(*t));
1983 1984 1985 1986

	return tb;
}

1987 1988 1989
#ifdef CONFIG_PROC_FS
/* Depth first Trie walk iterator */
struct fib_trie_iter {
1990
	struct seq_net_private p;
1991
	struct fib_table *tb;
1992
	struct tnode *tnode;
E
Eric Dumazet 已提交
1993 1994
	unsigned int index;
	unsigned int depth;
1995
};
1996

1997
static struct rt_trie_node *fib_trie_get_next(struct fib_trie_iter *iter)
1998
{
1999
	struct tnode *tn = iter->tnode;
E
Eric Dumazet 已提交
2000
	unsigned int cindex = iter->index;
2001
	struct tnode *p;
2002

2003 2004 2005 2006
	/* A single entry routing table */
	if (!tn)
		return NULL;

2007 2008 2009 2010
	pr_debug("get_next iter={node=%p index=%d depth=%d}\n",
		 iter->tnode, iter->index, iter->depth);
rescan:
	while (cindex < (1<<tn->bits)) {
2011
		struct rt_trie_node *n = tnode_get_child_rcu(tn, cindex);
2012

2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024
		if (n) {
			if (IS_LEAF(n)) {
				iter->tnode = tn;
				iter->index = cindex + 1;
			} else {
				/* push down one level */
				iter->tnode = (struct tnode *) n;
				iter->index = 0;
				++iter->depth;
			}
			return n;
		}
2025

2026 2027
		++cindex;
	}
O
Olof Johansson 已提交
2028

2029
	/* Current node exhausted, pop back up */
2030
	p = node_parent_rcu((struct rt_trie_node *)tn);
2031 2032 2033 2034 2035
	if (p) {
		cindex = tkey_extract_bits(tn->key, p->pos, p->bits)+1;
		tn = p;
		--iter->depth;
		goto rescan;
2036
	}
2037 2038 2039

	/* got root? */
	return NULL;
2040 2041
}

2042
static struct rt_trie_node *fib_trie_get_first(struct fib_trie_iter *iter,
2043
				       struct trie *t)
2044
{
2045
	struct rt_trie_node *n;
2046

S
Stephen Hemminger 已提交
2047
	if (!t)
2048 2049 2050
		return NULL;

	n = rcu_dereference(t->trie);
2051
	if (!n)
2052
		return NULL;
2053

2054 2055 2056 2057 2058 2059 2060 2061
	if (IS_TNODE(n)) {
		iter->tnode = (struct tnode *) n;
		iter->index = 0;
		iter->depth = 1;
	} else {
		iter->tnode = NULL;
		iter->index = 0;
		iter->depth = 0;
O
Olof Johansson 已提交
2062
	}
2063 2064

	return n;
2065
}
O
Olof Johansson 已提交
2066

2067 2068
static void trie_collect_stats(struct trie *t, struct trie_stat *s)
{
2069
	struct rt_trie_node *n;
2070
	struct fib_trie_iter iter;
O
Olof Johansson 已提交
2071

2072
	memset(s, 0, sizeof(*s));
O
Olof Johansson 已提交
2073

2074
	rcu_read_lock();
2075
	for (n = fib_trie_get_first(&iter, t); n; n = fib_trie_get_next(&iter)) {
2076
		if (IS_LEAF(n)) {
2077 2078 2079
			struct leaf *l = (struct leaf *)n;
			struct leaf_info *li;

2080 2081 2082 2083
			s->leaves++;
			s->totdepth += iter.depth;
			if (iter.depth > s->maxdepth)
				s->maxdepth = iter.depth;
2084

2085
			hlist_for_each_entry_rcu(li, &l->list, hlist)
2086
				++s->prefixes;
2087 2088 2089 2090 2091
		} else {
			const struct tnode *tn = (const struct tnode *) n;
			int i;

			s->tnodes++;
S
Stephen Hemminger 已提交
2092
			if (tn->bits < MAX_STAT_DEPTH)
R
Robert Olsson 已提交
2093 2094
				s->nodesizes[tn->bits]++;

2095 2096 2097
			for (i = 0; i < (1<<tn->bits); i++)
				if (!tn->child[i])
					s->nullpointers++;
2098 2099
		}
	}
R
Robert Olsson 已提交
2100
	rcu_read_unlock();
2101 2102
}

2103 2104 2105 2106
/*
 *	This outputs /proc/net/fib_triestats
 */
static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat)
2107
{
E
Eric Dumazet 已提交
2108
	unsigned int i, max, pointers, bytes, avdepth;
2109

2110 2111 2112 2113
	if (stat->leaves)
		avdepth = stat->totdepth*100 / stat->leaves;
	else
		avdepth = 0;
O
Olof Johansson 已提交
2114

2115 2116
	seq_printf(seq, "\tAver depth:     %u.%02d\n",
		   avdepth / 100, avdepth % 100);
2117
	seq_printf(seq, "\tMax depth:      %u\n", stat->maxdepth);
O
Olof Johansson 已提交
2118

2119 2120
	seq_printf(seq, "\tLeaves:         %u\n", stat->leaves);
	bytes = sizeof(struct leaf) * stat->leaves;
2121 2122 2123 2124

	seq_printf(seq, "\tPrefixes:       %u\n", stat->prefixes);
	bytes += sizeof(struct leaf_info) * stat->prefixes;

2125
	seq_printf(seq, "\tInternal nodes: %u\n\t", stat->tnodes);
2126
	bytes += sizeof(struct tnode) * stat->tnodes;
2127

R
Robert Olsson 已提交
2128 2129
	max = MAX_STAT_DEPTH;
	while (max > 0 && stat->nodesizes[max-1] == 0)
2130
		max--;
2131

2132
	pointers = 0;
2133
	for (i = 1; i < max; i++)
2134
		if (stat->nodesizes[i] != 0) {
2135
			seq_printf(seq, "  %u: %u",  i, stat->nodesizes[i]);
2136 2137 2138
			pointers += (1<<i) * stat->nodesizes[i];
		}
	seq_putc(seq, '\n');
2139
	seq_printf(seq, "\tPointers: %u\n", pointers);
R
Robert Olsson 已提交
2140

2141
	bytes += sizeof(struct rt_trie_node *) * pointers;
2142 2143
	seq_printf(seq, "Null ptrs: %u\n", stat->nullpointers);
	seq_printf(seq, "Total size: %u  kB\n", (bytes + 1023) / 1024);
2144
}
R
Robert Olsson 已提交
2145

2146
#ifdef CONFIG_IP_FIB_TRIE_STATS
2147 2148 2149 2150
static void trie_show_usage(struct seq_file *seq,
			    const struct trie_use_stats *stats)
{
	seq_printf(seq, "\nCounters:\n---------\n");
2151 2152 2153 2154 2155 2156 2157 2158 2159
	seq_printf(seq, "gets = %u\n", stats->gets);
	seq_printf(seq, "backtracks = %u\n", stats->backtrack);
	seq_printf(seq, "semantic match passed = %u\n",
		   stats->semantic_match_passed);
	seq_printf(seq, "semantic match miss = %u\n",
		   stats->semantic_match_miss);
	seq_printf(seq, "null node hit= %u\n", stats->null_node_hit);
	seq_printf(seq, "skipped node resize = %u\n\n",
		   stats->resize_node_skipped);
2160
}
2161 2162
#endif /*  CONFIG_IP_FIB_TRIE_STATS */

2163
static void fib_table_print(struct seq_file *seq, struct fib_table *tb)
2164
{
2165 2166 2167 2168 2169 2170
	if (tb->tb_id == RT_TABLE_LOCAL)
		seq_puts(seq, "Local:\n");
	else if (tb->tb_id == RT_TABLE_MAIN)
		seq_puts(seq, "Main:\n");
	else
		seq_printf(seq, "Id %d:\n", tb->tb_id);
2171
}
2172

2173

2174 2175
static int fib_triestat_seq_show(struct seq_file *seq, void *v)
{
2176
	struct net *net = (struct net *)seq->private;
2177
	unsigned int h;
2178

2179
	seq_printf(seq,
2180 2181
		   "Basic info: size of leaf:"
		   " %Zd bytes, size of tnode: %Zd bytes.\n",
2182 2183
		   sizeof(struct leaf), sizeof(struct tnode));

2184 2185 2186 2187
	for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
		struct hlist_head *head = &net->ipv4.fib_table_hash[h];
		struct fib_table *tb;

2188
		hlist_for_each_entry_rcu(tb, head, tb_hlist) {
2189 2190
			struct trie *t = (struct trie *) tb->tb_data;
			struct trie_stat stat;
2191

2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203
			if (!t)
				continue;

			fib_table_print(seq, tb);

			trie_collect_stats(t, &stat);
			trie_show_stats(seq, &stat);
#ifdef CONFIG_IP_FIB_TRIE_STATS
			trie_show_usage(seq, &t->stats);
#endif
		}
	}
2204

2205
	return 0;
2206 2207
}

2208
static int fib_triestat_seq_open(struct inode *inode, struct file *file)
2209
{
2210
	return single_open_net(inode, file, fib_triestat_seq_show);
2211 2212
}

2213
static const struct file_operations fib_triestat_fops = {
2214 2215 2216 2217
	.owner	= THIS_MODULE,
	.open	= fib_triestat_seq_open,
	.read	= seq_read,
	.llseek	= seq_lseek,
2218
	.release = single_release_net,
2219 2220
};

2221
static struct rt_trie_node *fib_trie_get_idx(struct seq_file *seq, loff_t pos)
2222
{
2223 2224
	struct fib_trie_iter *iter = seq->private;
	struct net *net = seq_file_net(seq);
2225
	loff_t idx = 0;
2226
	unsigned int h;
2227

2228 2229 2230
	for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
		struct hlist_head *head = &net->ipv4.fib_table_hash[h];
		struct fib_table *tb;
2231

2232
		hlist_for_each_entry_rcu(tb, head, tb_hlist) {
2233
			struct rt_trie_node *n;
2234 2235 2236 2237 2238 2239 2240 2241 2242

			for (n = fib_trie_get_first(iter,
						    (struct trie *) tb->tb_data);
			     n; n = fib_trie_get_next(iter))
				if (pos == idx++) {
					iter->tb = tb;
					return n;
				}
		}
2243
	}
2244

2245 2246 2247
	return NULL;
}

2248
static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos)
2249
	__acquires(RCU)
2250
{
2251
	rcu_read_lock();
2252
	return fib_trie_get_idx(seq, *pos);
2253 2254
}

2255
static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2256
{
2257
	struct fib_trie_iter *iter = seq->private;
2258
	struct net *net = seq_file_net(seq);
2259 2260 2261
	struct fib_table *tb = iter->tb;
	struct hlist_node *tb_node;
	unsigned int h;
2262
	struct rt_trie_node *n;
2263

2264
	++*pos;
2265 2266 2267 2268
	/* next node in same table */
	n = fib_trie_get_next(iter);
	if (n)
		return n;
2269

2270 2271
	/* walk rest of this hash chain */
	h = tb->tb_id & (FIB_TABLE_HASHSZ - 1);
E
Eric Dumazet 已提交
2272
	while ((tb_node = rcu_dereference(hlist_next_rcu(&tb->tb_hlist)))) {
2273 2274 2275 2276 2277
		tb = hlist_entry(tb_node, struct fib_table, tb_hlist);
		n = fib_trie_get_first(iter, (struct trie *) tb->tb_data);
		if (n)
			goto found;
	}
2278

2279 2280 2281
	/* new hash chain */
	while (++h < FIB_TABLE_HASHSZ) {
		struct hlist_head *head = &net->ipv4.fib_table_hash[h];
2282
		hlist_for_each_entry_rcu(tb, head, tb_hlist) {
2283 2284 2285 2286 2287
			n = fib_trie_get_first(iter, (struct trie *) tb->tb_data);
			if (n)
				goto found;
		}
	}
2288
	return NULL;
2289 2290 2291 2292

found:
	iter->tb = tb;
	return n;
2293
}
2294

2295
static void fib_trie_seq_stop(struct seq_file *seq, void *v)
2296
	__releases(RCU)
2297
{
2298 2299
	rcu_read_unlock();
}
O
Olof Johansson 已提交
2300

2301 2302
static void seq_indent(struct seq_file *seq, int n)
{
E
Eric Dumazet 已提交
2303 2304
	while (n-- > 0)
		seq_puts(seq, "   ");
2305
}
2306

2307
static inline const char *rtn_scope(char *buf, size_t len, enum rt_scope_t s)
2308
{
S
Stephen Hemminger 已提交
2309
	switch (s) {
2310 2311 2312 2313 2314 2315
	case RT_SCOPE_UNIVERSE: return "universe";
	case RT_SCOPE_SITE:	return "site";
	case RT_SCOPE_LINK:	return "link";
	case RT_SCOPE_HOST:	return "host";
	case RT_SCOPE_NOWHERE:	return "nowhere";
	default:
2316
		snprintf(buf, len, "scope=%d", s);
2317 2318 2319
		return buf;
	}
}
2320

2321
static const char *const rtn_type_names[__RTN_MAX] = {
2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334
	[RTN_UNSPEC] = "UNSPEC",
	[RTN_UNICAST] = "UNICAST",
	[RTN_LOCAL] = "LOCAL",
	[RTN_BROADCAST] = "BROADCAST",
	[RTN_ANYCAST] = "ANYCAST",
	[RTN_MULTICAST] = "MULTICAST",
	[RTN_BLACKHOLE] = "BLACKHOLE",
	[RTN_UNREACHABLE] = "UNREACHABLE",
	[RTN_PROHIBIT] = "PROHIBIT",
	[RTN_THROW] = "THROW",
	[RTN_NAT] = "NAT",
	[RTN_XRESOLVE] = "XRESOLVE",
};
2335

E
Eric Dumazet 已提交
2336
static inline const char *rtn_type(char *buf, size_t len, unsigned int t)
2337 2338 2339
{
	if (t < __RTN_MAX && rtn_type_names[t])
		return rtn_type_names[t];
2340
	snprintf(buf, len, "type %u", t);
2341
	return buf;
2342 2343
}

2344 2345
/* Pretty print the trie */
static int fib_trie_seq_show(struct seq_file *seq, void *v)
2346
{
2347
	const struct fib_trie_iter *iter = seq->private;
2348
	struct rt_trie_node *n = v;
2349

2350 2351
	if (!node_parent_rcu(n))
		fib_table_print(seq, iter->tb);
2352

2353 2354
	if (IS_TNODE(n)) {
		struct tnode *tn = (struct tnode *) n;
S
Stephen Hemminger 已提交
2355
		__be32 prf = htonl(mask_pfx(tn->key, tn->pos));
O
Olof Johansson 已提交
2356

2357
		seq_indent(seq, iter->depth-1);
2358 2359
		seq_printf(seq, "  +-- %pI4/%d %d %d %d\n",
			   &prf, tn->pos, tn->bits, tn->full_children,
2360
			   tn->empty_children);
2361

2362 2363
	} else {
		struct leaf *l = (struct leaf *) n;
2364
		struct leaf_info *li;
A
Al Viro 已提交
2365
		__be32 val = htonl(l->key);
2366 2367

		seq_indent(seq, iter->depth);
2368
		seq_printf(seq, "  |-- %pI4\n", &val);
2369

2370
		hlist_for_each_entry_rcu(li, &l->list, hlist) {
2371 2372 2373 2374 2375 2376 2377 2378
			struct fib_alias *fa;

			list_for_each_entry_rcu(fa, &li->falh, fa_list) {
				char buf1[32], buf2[32];

				seq_indent(seq, iter->depth+1);
				seq_printf(seq, "  /%d %s %s", li->plen,
					   rtn_scope(buf1, sizeof(buf1),
2379
						     fa->fa_info->fib_scope),
2380 2381 2382
					   rtn_type(buf2, sizeof(buf2),
						    fa->fa_type));
				if (fa->fa_tos)
2383
					seq_printf(seq, " tos=%d", fa->fa_tos);
2384
				seq_putc(seq, '\n');
2385 2386
			}
		}
2387
	}
2388

2389 2390 2391
	return 0;
}

2392
static const struct seq_operations fib_trie_seq_ops = {
2393 2394 2395 2396
	.start  = fib_trie_seq_start,
	.next   = fib_trie_seq_next,
	.stop   = fib_trie_seq_stop,
	.show   = fib_trie_seq_show,
2397 2398
};

2399
static int fib_trie_seq_open(struct inode *inode, struct file *file)
2400
{
2401 2402
	return seq_open_net(inode, file, &fib_trie_seq_ops,
			    sizeof(struct fib_trie_iter));
2403 2404
}

2405
static const struct file_operations fib_trie_fops = {
2406 2407 2408 2409
	.owner  = THIS_MODULE,
	.open   = fib_trie_seq_open,
	.read   = seq_read,
	.llseek = seq_lseek,
2410
	.release = seq_release_net,
2411 2412
};

2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452
struct fib_route_iter {
	struct seq_net_private p;
	struct trie *main_trie;
	loff_t	pos;
	t_key	key;
};

static struct leaf *fib_route_get_idx(struct fib_route_iter *iter, loff_t pos)
{
	struct leaf *l = NULL;
	struct trie *t = iter->main_trie;

	/* use cache location of last found key */
	if (iter->pos > 0 && pos >= iter->pos && (l = fib_find_node(t, iter->key)))
		pos -= iter->pos;
	else {
		iter->pos = 0;
		l = trie_firstleaf(t);
	}

	while (l && pos-- > 0) {
		iter->pos++;
		l = trie_nextleaf(l);
	}

	if (l)
		iter->key = pos;	/* remember it */
	else
		iter->pos = 0;		/* forget it */

	return l;
}

static void *fib_route_seq_start(struct seq_file *seq, loff_t *pos)
	__acquires(RCU)
{
	struct fib_route_iter *iter = seq->private;
	struct fib_table *tb;

	rcu_read_lock();
2453
	tb = fib_get_table(seq_file_net(seq), RT_TABLE_MAIN);
2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490
	if (!tb)
		return NULL;

	iter->main_trie = (struct trie *) tb->tb_data;
	if (*pos == 0)
		return SEQ_START_TOKEN;
	else
		return fib_route_get_idx(iter, *pos - 1);
}

static void *fib_route_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct fib_route_iter *iter = seq->private;
	struct leaf *l = v;

	++*pos;
	if (v == SEQ_START_TOKEN) {
		iter->pos = 0;
		l = trie_firstleaf(iter->main_trie);
	} else {
		iter->pos++;
		l = trie_nextleaf(l);
	}

	if (l)
		iter->key = l->key;
	else
		iter->pos = 0;
	return l;
}

static void fib_route_seq_stop(struct seq_file *seq, void *v)
	__releases(RCU)
{
	rcu_read_unlock();
}

E
Eric Dumazet 已提交
2491
static unsigned int fib_flag_trans(int type, __be32 mask, const struct fib_info *fi)
2492
{
E
Eric Dumazet 已提交
2493
	unsigned int flags = 0;
2494

E
Eric Dumazet 已提交
2495 2496
	if (type == RTN_UNREACHABLE || type == RTN_PROHIBIT)
		flags = RTF_REJECT;
2497 2498
	if (fi && fi->fib_nh->nh_gw)
		flags |= RTF_GATEWAY;
A
Al Viro 已提交
2499
	if (mask == htonl(0xFFFFFFFF))
2500 2501 2502
		flags |= RTF_HOST;
	flags |= RTF_UP;
	return flags;
2503 2504
}

2505 2506 2507
/*
 *	This outputs /proc/net/route.
 *	The format of the file is not supposed to be changed
E
Eric Dumazet 已提交
2508
 *	and needs to be same as fib_hash output to avoid breaking
2509 2510 2511
 *	legacy utilities
 */
static int fib_route_seq_show(struct seq_file *seq, void *v)
2512
{
2513
	struct leaf *l = v;
2514
	struct leaf_info *li;
2515

2516 2517 2518 2519 2520 2521
	if (v == SEQ_START_TOKEN) {
		seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway "
			   "\tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU"
			   "\tWindow\tIRTT");
		return 0;
	}
2522

2523
	hlist_for_each_entry_rcu(li, &l->list, hlist) {
2524
		struct fib_alias *fa;
A
Al Viro 已提交
2525
		__be32 mask, prefix;
O
Olof Johansson 已提交
2526

2527 2528
		mask = inet_make_mask(li->plen);
		prefix = htonl(l->key);
2529

2530
		list_for_each_entry_rcu(fa, &li->falh, fa_list) {
2531
			const struct fib_info *fi = fa->fa_info;
E
Eric Dumazet 已提交
2532
			unsigned int flags = fib_flag_trans(fa->fa_type, mask, fi);
2533
			int len;
2534

2535 2536 2537
			if (fa->fa_type == RTN_BROADCAST
			    || fa->fa_type == RTN_MULTICAST)
				continue;
2538

2539
			if (fi)
2540 2541 2542
				seq_printf(seq,
					 "%s\t%08X\t%08X\t%04X\t%d\t%u\t"
					 "%d\t%08X\t%d\t%u\t%u%n",
2543 2544 2545 2546 2547
					 fi->fib_dev ? fi->fib_dev->name : "*",
					 prefix,
					 fi->fib_nh->nh_gw, flags, 0, 0,
					 fi->fib_priority,
					 mask,
2548 2549
					 (fi->fib_advmss ?
					  fi->fib_advmss + 40 : 0),
2550
					 fi->fib_window,
2551
					 fi->fib_rtt >> 3, &len);
2552
			else
2553 2554 2555
				seq_printf(seq,
					 "*\t%08X\t%08X\t%04X\t%d\t%u\t"
					 "%d\t%08X\t%d\t%u\t%u%n",
2556
					 prefix, 0, flags, 0, 0, 0,
2557
					 mask, 0, 0, 0, &len);
2558

2559
			seq_printf(seq, "%*s\n", 127 - len, "");
2560
		}
2561 2562 2563 2564 2565
	}

	return 0;
}

2566
static const struct seq_operations fib_route_seq_ops = {
2567 2568 2569
	.start  = fib_route_seq_start,
	.next   = fib_route_seq_next,
	.stop   = fib_route_seq_stop,
2570
	.show   = fib_route_seq_show,
2571 2572
};

2573
static int fib_route_seq_open(struct inode *inode, struct file *file)
2574
{
2575
	return seq_open_net(inode, file, &fib_route_seq_ops,
2576
			    sizeof(struct fib_route_iter));
2577 2578
}

2579
static const struct file_operations fib_route_fops = {
2580 2581 2582 2583
	.owner  = THIS_MODULE,
	.open   = fib_route_seq_open,
	.read   = seq_read,
	.llseek = seq_lseek,
2584
	.release = seq_release_net,
2585 2586
};

2587
int __net_init fib_proc_init(struct net *net)
2588
{
2589
	if (!proc_create("fib_trie", S_IRUGO, net->proc_net, &fib_trie_fops))
2590 2591
		goto out1;

2592 2593
	if (!proc_create("fib_triestat", S_IRUGO, net->proc_net,
			 &fib_triestat_fops))
2594 2595
		goto out2;

2596
	if (!proc_create("route", S_IRUGO, net->proc_net, &fib_route_fops))
2597 2598
		goto out3;

2599
	return 0;
2600 2601

out3:
2602
	remove_proc_entry("fib_triestat", net->proc_net);
2603
out2:
2604
	remove_proc_entry("fib_trie", net->proc_net);
2605 2606
out1:
	return -ENOMEM;
2607 2608
}

2609
void __net_exit fib_proc_exit(struct net *net)
2610
{
2611 2612 2613
	remove_proc_entry("fib_trie", net->proc_net);
	remove_proc_entry("fib_triestat", net->proc_net);
	remove_proc_entry("route", net->proc_net);
2614 2615 2616
}

#endif /* CONFIG_PROC_FS */