/*
 *		INETPEER - Storage for long-lived information about peers
 *
 *  This source is covered by the GNU GPL, the same as all kernel sources.
 *
 *  Authors:	Andrey V. Savochkin <saw@msu.ru>
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/interrupt.h>
#include <linux/spinlock.h>
#include <linux/random.h>
#include <linux/timer.h>
#include <linux/time.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/net.h>
#include <net/ip.h>
#include <net/inetpeer.h>
#include <net/secure_seq.h>

/*
 *  Theory of operations.
 *  We keep one entry for each peer IP address.  Each node contains long-lived
 *  information about the peer which doesn't depend on routes.
 *  When this note was written, that information consisted only of the ID
 *  field for the next outgoing IP packet.  This field is incremented with
 *  each packet, as encoded in the inet_getid() function
 *  (include/net/inetpeer.h).
 *  At the time of writing, the IP packet identifier is generated to be
 *  unpredictable using this code only for packets subject (actually or
 *  potentially) to defragmentation, i.e. DF packets smaller than the PMTU
 *  use a constant ID and do not use this code (see ip_select_ident() in
 *  include/net/ip.h).
 *
 *  Route cache entries hold references to our nodes.
 *  New cache entries get references via lookup by destination IP address in
 *  the avl tree.  The reference is grabbed only when it's needed i.e. only
 *  when we try to output IP packet which needs an unpredictable ID (see
 *  __ip_select_ident() in net/ipv4/route.c).
 *  Nodes are removed only when their reference counter drops to 0.
 *  Once that has happened, the node may be removed when a sufficient amount
 *  of time has passed since its last use.  The less-recently-used entry can
 *  also be removed if the pool is overloaded, i.e. if the total number of
 *  entries is greater than or equal to the threshold.
 *
 *  Node pool is organised as an AVL tree.
 *  Such an implementation has been chosen not just for fun.  It's a way to
 *  prevent easy and efficient DoS attacks by creating hash collisions.  A huge
 *  amount of long living nodes in a single hash slot would significantly delay
 *  lookups performed with disabled BHs.
 *
 *  Serialisation issues.
 *  1.  Nodes may appear in the tree only with the pool lock held.
 *  2.  Nodes may disappear from the tree only with the pool lock held
 *      AND reference count being 0.
 *  3.  The per-family entry count (inet_peer_base.total) is modified under
 *      the pool lock.
 *  4.  struct inet_peer fields modification:
 *		avl_left, avl_right, avl_parent, avl_height: pool lock
 *		refcnt: atomically against modifications on other CPU;
 *		   usually under some other lock to prevent node disappearing
 *		daddr: unchangeable
 *		ip_id_count: atomic value (no lock needed)
 */

/*
 * Slab cache for struct inet_peer objects.  Created in inet_initpeers();
 * inet_getpeer() allocates from it and inetpeer_free_rcu() frees back to it.
 */
static struct kmem_cache *peer_cachep __read_mostly;

/*
 * Height of the AVL subtree rooted at node x.  Both the argument and the
 * whole expansion are parenthesised so that any pointer expression (such as
 * p + 1 or &node) can be passed without operator-precedence surprises.
 */
#define node_height(x) ((x)->avl_height)

#define peer_avl_empty ((struct inet_peer *)&peer_fake_node)
E
Eric Dumazet 已提交
72
#define peer_avl_empty_rcu ((struct inet_peer __rcu __force *)&peer_fake_node)
E
Eric Dumazet 已提交
73
static const struct inet_peer peer_fake_node = {
E
Eric Dumazet 已提交
74 75
	.avl_left	= peer_avl_empty_rcu,
	.avl_right	= peer_avl_empty_rcu,
L
Linus Torvalds 已提交
76 77
	.avl_height	= 0
};
E
Eric Dumazet 已提交
78

79
struct inet_peer_base {
E
Eric Dumazet 已提交
80
	struct inet_peer __rcu *root;
E
Eric Dumazet 已提交
81
	seqlock_t	lock;
E
Eric Dumazet 已提交
82
	int		total;
83 84 85
};

static struct inet_peer_base v4_peers = {
E
Eric Dumazet 已提交
86
	.root		= peer_avl_empty_rcu,
E
Eric Dumazet 已提交
87
	.lock		= __SEQLOCK_UNLOCKED(v4_peers.lock),
E
Eric Dumazet 已提交
88 89
	.total		= 0,
};
90 91 92

static struct inet_peer_base v6_peers = {
	.root		= peer_avl_empty_rcu,
E
Eric Dumazet 已提交
93
	.lock		= __SEQLOCK_UNLOCKED(v6_peers.lock),
94 95 96
	.total		= 0,
};

L
Linus Torvalds 已提交
97 98 99
/*
 * Upper bound on the tree depth handled here: it sizes the stack of link
 * pointers in inet_getpeer() and caps the number of links the lockless
 * lookup_rcu() will follow before giving up.
 */
#define PEER_MAXDEPTH 40 /* sufficient for about 2^27 nodes */

/*
 * Tunables.  Exported for sysctl_net_ipv4.
 *
 * inet_peer_gc() uses all three: at or above inet_peer_threshold entries an
 * unreferenced entry is collected regardless of age; below it the required
 * idle time shrinks from inet_peer_maxttl towards inet_peer_minttl as the
 * pool fills up.  inet_initpeers() lowers the threshold on small machines.
 */
int inet_peer_threshold __read_mostly = 65536 + 128;	/* start to throw entries more
					 * aggressively at this stage */
int inet_peer_minttl __read_mostly = 120 * HZ;	/* TTL under high load: 120 sec */
int inet_peer_maxttl __read_mostly = 10 * 60 * HZ;	/* usual time to live: 10 min */

/*
 * One-time initialisation of the peer storage.
 * Called from ip_output.c:ip_init
 *
 * Scales inet_peer_threshold down on machines with little RAM and creates
 * the slab cache (peer_cachep) that struct inet_peer objects come from.
 */
void __init inet_initpeers(void)
{
	struct sysinfo si;

	/* Use the straight interface to information about memory. */
	si_meminfo(&si);
	/* The values below were suggested by Alexey Kuznetsov
	 * <kuznet@ms2.inr.ac.ru>.  I don't have any opinion about the values
	 * myself.  --SAW
	 *
	 * The three checks are independent, so the shifts accumulate: a
	 * machine at or below the smallest limit ends up with 1/16 of the
	 * default threshold.
	 */
	if (si.totalram <= (32768*1024)/PAGE_SIZE)
		inet_peer_threshold >>= 1; /* max pool size about 1MB on IA32 */
	if (si.totalram <= (16384*1024)/PAGE_SIZE)
		inet_peer_threshold >>= 1; /* about 512KB */
	if (si.totalram <= (8192*1024)/PAGE_SIZE)
		inet_peer_threshold >>= 2; /* about 128KB */

	/* The result is not checked: SLAB_PANIC is passed, which per the slab
	 * API makes a creation failure fatal instead of returning NULL.
	 */
	peer_cachep = kmem_cache_create("inet_peer_cache",
			sizeof(struct inet_peer),
			0, SLAB_HWCACHE_ALIGN | SLAB_PANIC,
			NULL);
}

static int addr_compare(const struct inetpeer_addr *a,
			const struct inetpeer_addr *b)
133 134 135 136
{
	int i, n = (a->family == AF_INET ? 1 : 4);

	for (i = 0; i < n; i++) {
137
		if (a->addr.a6[i] == b->addr.a6[i])
138
			continue;
E
Eric Dumazet 已提交
139
		if ((__force u32)a->addr.a6[i] < (__force u32)b->addr.a6[i])
140 141 142 143 144 145 146
			return -1;
		return 1;
	}

	return 0;
}

E
Eric Dumazet 已提交
147 148 149
/*
 * Writer-side dereference of an __rcu pointer X belonging to pool BASE.
 * The lockdep expression documents (and, with lockdep enabled, checks) that
 * the spinlock inside BASE's seqlock is held.
 */
#define rcu_deref_locked(X, BASE)				\
	rcu_dereference_protected(X, lockdep_is_held(&(BASE)->lock.lock))

/*
 * Locked descent from the root of _base looking for _daddr.
 *
 * Called with local BH disabled and the pool lock held.
 *
 * This is a macro rather than a function because it communicates through a
 * variable named stackptr that must exist in the caller's scope: stackptr is
 * reset to _stack, then the address of every link followed on the way down
 * (starting with &_base->root) is pushed through it.  On exit stackptr[-1]
 * is the link that holds the result.  No bound check is made on _stack;
 * callers pass an array of PEER_MAXDEPTH slots.
 *
 * Evaluates to the matching node, or to peer_avl_empty when the address is
 * not in the tree.  No reference is taken.  The arguments are evaluated more
 * than once, so they must be free of side effects.
 */
#define lookup(_daddr, _stack, _base)				\
({								\
	struct inet_peer *u;					\
	struct inet_peer __rcu **v;				\
								\
	stackptr = (_stack);					\
	*stackptr++ = &(_base)->root;				\
	for (u = rcu_deref_locked((_base)->root, _base);	\
	     u != peer_avl_empty; ) {				\
		int cmp = addr_compare((_daddr), &u->daddr);	\
		if (cmp == 0)					\
			break;					\
		if (cmp == -1)					\
			v = &u->avl_left;			\
		else						\
			v = &u->avl_right;			\
		*stackptr++ = v;				\
		u = rcu_deref_locked(*v, _base);		\
	}							\
	u;							\
})

/*
176
 * Called with rcu_read_lock()
E
Eric Dumazet 已提交
177 178 179 180 181
 * Because we hold no lock against a writer, its quite possible we fall
 * in an endless loop.
 * But every pointer we follow is guaranteed to be valid thanks to RCU.
 * We exit from this function if number of links exceeds PEER_MAXDEPTH
 */
182
static struct inet_peer *lookup_rcu(const struct inetpeer_addr *daddr,
E
Eric Dumazet 已提交
183
				    struct inet_peer_base *base)
E
Eric Dumazet 已提交
184
{
185
	struct inet_peer *u = rcu_dereference(base->root);
E
Eric Dumazet 已提交
186 187 188
	int count = 0;

	while (u != peer_avl_empty) {
189 190
		int cmp = addr_compare(daddr, &u->daddr);
		if (cmp == 0) {
191
			/* Before taking a reference, check if this entry was
E
Eric Dumazet 已提交
192
			 * deleted (refcnt=-1)
193
			 */
E
Eric Dumazet 已提交
194
			if (!atomic_add_unless(&u->refcnt, 1, -1))
E
Eric Dumazet 已提交
195 196 197
				u = NULL;
			return u;
		}
198
		if (cmp == -1)
199
			u = rcu_dereference(u->avl_left);
E
Eric Dumazet 已提交
200
		else
201
			u = rcu_dereference(u->avl_right);
E
Eric Dumazet 已提交
202 203 204 205 206 207 208
		if (unlikely(++count == PEER_MAXDEPTH))
			break;
	}
	return NULL;
}

/*
 * Starting at the left child of start, follow right links until a node with
 * no right child is reached, i.e. find the rightmost node of start's left
 * subtree.  unlink_from_pool() uses it to pick the node that takes the place
 * of a removed node which has a left subtree.
 *
 * Called with local BH disabled and the pool lock held.
 *
 * Like lookup(), this pushes onto the caller's stackptr: first
 * &start->avl_left, then the address of every right link followed.
 * Evaluates to the node found.  start is evaluated twice.
 */
#define lookup_rightempty(start, base)				\
({								\
	struct inet_peer *u;					\
	struct inet_peer __rcu **v;				\
	*stackptr++ = &(start)->avl_left;			\
	v = &(start)->avl_left;					\
	for (u = rcu_deref_locked(*v, base);			\
	     u->avl_right != peer_avl_empty_rcu; ) {		\
		v = &u->avl_right;				\
		*stackptr++ = v;				\
		u = rcu_deref_locked(*v, base);			\
	}							\
	u;							\
})

/* Called with local BH disabled and the pool lock held.
L
Linus Torvalds 已提交
225
 * Variable names are the proof of operation correctness.
E
Eric Dumazet 已提交
226 227
 * Look into mm/map_avl.c for more detail description of the ideas.
 */
E
Eric Dumazet 已提交
228
static void peer_avl_rebalance(struct inet_peer __rcu **stack[],
229 230
			       struct inet_peer __rcu ***stackend,
			       struct inet_peer_base *base)
L
Linus Torvalds 已提交
231
{
E
Eric Dumazet 已提交
232 233
	struct inet_peer __rcu **nodep;
	struct inet_peer *node, *l, *r;
L
Linus Torvalds 已提交
234 235 236 237
	int lh, rh;

	while (stackend > stack) {
		nodep = *--stackend;
E
Eric Dumazet 已提交
238 239 240
		node = rcu_deref_locked(*nodep, base);
		l = rcu_deref_locked(node->avl_left, base);
		r = rcu_deref_locked(node->avl_right, base);
L
Linus Torvalds 已提交
241 242 243 244 245
		lh = node_height(l);
		rh = node_height(r);
		if (lh > rh + 1) { /* l: RH+2 */
			struct inet_peer *ll, *lr, *lrl, *lrr;
			int lrh;
E
Eric Dumazet 已提交
246 247
			ll = rcu_deref_locked(l->avl_left, base);
			lr = rcu_deref_locked(l->avl_right, base);
L
Linus Torvalds 已提交
248 249
			lrh = node_height(lr);
			if (lrh <= node_height(ll)) {	/* ll: RH+1 */
E
Eric Dumazet 已提交
250 251
				RCU_INIT_POINTER(node->avl_left, lr);	/* lr: RH or RH+1 */
				RCU_INIT_POINTER(node->avl_right, r);	/* r: RH */
L
Linus Torvalds 已提交
252
				node->avl_height = lrh + 1; /* RH+1 or RH+2 */
E
Eric Dumazet 已提交
253 254
				RCU_INIT_POINTER(l->avl_left, ll);       /* ll: RH+1 */
				RCU_INIT_POINTER(l->avl_right, node);	/* node: RH+1 or RH+2 */
L
Linus Torvalds 已提交
255
				l->avl_height = node->avl_height + 1;
E
Eric Dumazet 已提交
256
				RCU_INIT_POINTER(*nodep, l);
L
Linus Torvalds 已提交
257
			} else { /* ll: RH, lr: RH+1 */
E
Eric Dumazet 已提交
258 259
				lrl = rcu_deref_locked(lr->avl_left, base);/* lrl: RH or RH-1 */
				lrr = rcu_deref_locked(lr->avl_right, base);/* lrr: RH or RH-1 */
E
Eric Dumazet 已提交
260 261
				RCU_INIT_POINTER(node->avl_left, lrr);	/* lrr: RH or RH-1 */
				RCU_INIT_POINTER(node->avl_right, r);	/* r: RH */
L
Linus Torvalds 已提交
262
				node->avl_height = rh + 1; /* node: RH+1 */
E
Eric Dumazet 已提交
263 264
				RCU_INIT_POINTER(l->avl_left, ll);	/* ll: RH */
				RCU_INIT_POINTER(l->avl_right, lrl);	/* lrl: RH or RH-1 */
L
Linus Torvalds 已提交
265
				l->avl_height = rh + 1;	/* l: RH+1 */
E
Eric Dumazet 已提交
266 267
				RCU_INIT_POINTER(lr->avl_left, l);	/* l: RH+1 */
				RCU_INIT_POINTER(lr->avl_right, node);	/* node: RH+1 */
L
Linus Torvalds 已提交
268
				lr->avl_height = rh + 2;
E
Eric Dumazet 已提交
269
				RCU_INIT_POINTER(*nodep, lr);
L
Linus Torvalds 已提交
270 271 272 273
			}
		} else if (rh > lh + 1) { /* r: LH+2 */
			struct inet_peer *rr, *rl, *rlr, *rll;
			int rlh;
E
Eric Dumazet 已提交
274 275
			rr = rcu_deref_locked(r->avl_right, base);
			rl = rcu_deref_locked(r->avl_left, base);
L
Linus Torvalds 已提交
276 277
			rlh = node_height(rl);
			if (rlh <= node_height(rr)) {	/* rr: LH+1 */
E
Eric Dumazet 已提交
278 279
				RCU_INIT_POINTER(node->avl_right, rl);	/* rl: LH or LH+1 */
				RCU_INIT_POINTER(node->avl_left, l);	/* l: LH */
L
Linus Torvalds 已提交
280
				node->avl_height = rlh + 1; /* LH+1 or LH+2 */
E
Eric Dumazet 已提交
281 282
				RCU_INIT_POINTER(r->avl_right, rr);	/* rr: LH+1 */
				RCU_INIT_POINTER(r->avl_left, node);	/* node: LH+1 or LH+2 */
L
Linus Torvalds 已提交
283
				r->avl_height = node->avl_height + 1;
E
Eric Dumazet 已提交
284
				RCU_INIT_POINTER(*nodep, r);
L
Linus Torvalds 已提交
285
			} else { /* rr: RH, rl: RH+1 */
E
Eric Dumazet 已提交
286 287
				rlr = rcu_deref_locked(rl->avl_right, base);/* rlr: LH or LH-1 */
				rll = rcu_deref_locked(rl->avl_left, base);/* rll: LH or LH-1 */
E
Eric Dumazet 已提交
288 289
				RCU_INIT_POINTER(node->avl_right, rll);	/* rll: LH or LH-1 */
				RCU_INIT_POINTER(node->avl_left, l);	/* l: LH */
L
Linus Torvalds 已提交
290
				node->avl_height = lh + 1; /* node: LH+1 */
E
Eric Dumazet 已提交
291 292
				RCU_INIT_POINTER(r->avl_right, rr);	/* rr: LH */
				RCU_INIT_POINTER(r->avl_left, rlr);	/* rlr: LH or LH-1 */
L
Linus Torvalds 已提交
293
				r->avl_height = lh + 1;	/* r: LH+1 */
E
Eric Dumazet 已提交
294 295
				RCU_INIT_POINTER(rl->avl_right, r);	/* r: LH+1 */
				RCU_INIT_POINTER(rl->avl_left, node);	/* node: LH+1 */
L
Linus Torvalds 已提交
296
				rl->avl_height = lh + 2;
E
Eric Dumazet 已提交
297
				RCU_INIT_POINTER(*nodep, rl);
L
Linus Torvalds 已提交
298 299 300 301 302 303 304
			}
		} else {
			node->avl_height = (lh > rh ? lh : rh) + 1;
		}
	}
}

E
Eric Dumazet 已提交
305
/*
 * Insert new node n as a leaf at the position found by the preceding
 * lookup(), then rebalance the path above it.
 *
 * Called with local BH disabled and the pool lock held.
 *
 * Relies on two variables in the caller's scope: stack (the array handed to
 * lookup()) and stackptr (left by lookup() one past the link where the
 * search ended).  n is fully initialised before rcu_assign_pointer()
 * publishes it, because lockless readers may reach it immediately after.
 * n is evaluated several times.
 */
#define link_to_pool(n, base)					\
do {								\
	(n)->avl_height = 1;					\
	(n)->avl_left = peer_avl_empty_rcu;			\
	(n)->avl_right = peer_avl_empty_rcu;			\
	/* lockless readers can catch us now */			\
	rcu_assign_pointer(**--stackptr, (n));			\
	peer_avl_rebalance(stack, stackptr, base);		\
} while (0)

/*
 * RCU callback queued by unlink_from_pool() through call_rcu(): hands the
 * inet_peer that embeds head back to the slab cache.
 */
static void inetpeer_free_rcu(struct rcu_head *head)
{
	kmem_cache_free(peer_cachep, container_of(head, struct inet_peer, rcu));
}

/*
 * Remove p from the tree of base, rebalance, and queue p for freeing after
 * an RCU grace period.
 *
 * p:     node to remove; it must be in the tree (BUG() otherwise)
 * base:  pool p belongs to
 * stack: scratch array of PEER_MAXDEPTH link pointers; its previous
 *        contents are overwritten by the lookup done here
 *
 * Called from inet_peer_gc(), i.e. with the pool lock held.
 */
static void unlink_from_pool(struct inet_peer *p, struct inet_peer_base *base,
			     struct inet_peer __rcu **stack[PEER_MAXDEPTH])
{
	struct inet_peer __rcu ***stackptr, ***delp;

	/* Re-walk the tree to record the path to p in stack/stackptr. */
	if (lookup(&p->daddr, stack, base) != p)
		BUG();
	delp = stackptr - 1; /* *delp[0] == p */
	if (p->avl_left == peer_avl_empty_rcu) {
		/* No left subtree: the right child simply replaces p. */
		*delp[0] = p->avl_right;
		--stackptr;
	} else {
		/* look for a node to insert instead of p */
		struct inet_peer *t;
		t = lookup_rightempty(p, base);
		BUG_ON(rcu_deref_locked(*stackptr[-1], base) != t);
		/* Detach t: its left child takes t's old position. */
		**--stackptr = t->avl_left;
		/* t is removed, t->daddr > x->daddr for any
		 * x in p->avl_left subtree.
		 * Put t in the old place of p. */
		RCU_INIT_POINTER(*delp[0], t);
		t->avl_left = p->avl_left;
		t->avl_right = p->avl_right;
		t->avl_height = p->avl_height;
		BUG_ON(delp[1] != &p->avl_left);
		/* The recorded path went through p's left link; it now has
		 * to go through t's, since t sits where p was.
		 */
		delp[1] = &t->avl_left; /* was &p->avl_left */
	}
	peer_avl_rebalance(stack, stackptr, base);
	base->total--;
	call_rcu(&p->rcu, inetpeer_free_rcu);
}

static struct inet_peer_base *family_to_base(int family)
{
E
Eric Dumazet 已提交
355
	return family == AF_INET ? &v4_peers : &v6_peers;
356 357
}

E
Eric Dumazet 已提交
358 359 360 361
/*
 * Perform garbage collection on all items stacked during a lookup.
 *
 * base:     pool being collected; its lock is held by the caller
 * stack:    array filled in by a lookup() that ended at peer_avl_empty
 * stackptr: stack pointer as left by that lookup()
 *
 * Only the nodes on the recorded search path are examined.  A node is
 * collected when its refcnt is 0 and it has been idle for at least ttl
 * jiffies, where ttl is 0 once the pool holds inet_peer_threshold entries
 * or more, and otherwise shrinks from inet_peer_maxttl towards
 * inet_peer_minttl as the pool fills up.
 *
 * Returns the number of entries removed.  The tree may have been
 * restructured and stack overwritten, so the caller must redo its lookup
 * before using the stack again.
 */
static int inet_peer_gc(struct inet_peer_base *base,
			struct inet_peer __rcu **stack[PEER_MAXDEPTH],
			struct inet_peer __rcu ***stackptr)
{
	struct inet_peer *p, *gchead = NULL;
	__u32 delta, ttl;
	int cnt = 0;

	if (base->total >= inet_peer_threshold)
		ttl = 0; /* be aggressive */
	else
		ttl = inet_peer_maxttl
				- (inet_peer_maxttl - inet_peer_minttl) / HZ *
					base->total / inet_peer_threshold * HZ;
	stackptr--; /* last stack slot is peer_avl_empty */
	while (stackptr > stack) {
		stackptr--;
		p = rcu_deref_locked(**stackptr, base);
		if (atomic_read(&p->refcnt) == 0) {
			/* Read dtime only after having seen refcnt == 0;
			 * inet_putpeer() stores dtime before dropping refcnt.
			 */
			smp_rmb();
			delta = (__u32)jiffies - p->dtime;
			/* The cmpxchg 0 -> -1 marks the entry deleted and
			 * fails if someone took a reference meanwhile.
			 */
			if (delta >= ttl &&
			    atomic_cmpxchg(&p->refcnt, 0, -1) == 0) {
				p->gc_next = gchead;
				gchead = p;
			}
		}
	}
	/* Unlink in a second pass: unlink_from_pool() reuses stack for its
	 * own lookup, which would clobber the path being scanned above.
	 */
	while ((p = gchead) != NULL) {
		gchead = p->gc_next;
		cnt++;
		unlink_from_pool(p, base, stack);
	}
	return cnt;
}

struct inet_peer *inet_getpeer(const struct inetpeer_addr *daddr, int create)
L
Linus Torvalds 已提交
396
{
E
Eric Dumazet 已提交
397
	struct inet_peer __rcu **stack[PEER_MAXDEPTH], ***stackptr;
398
	struct inet_peer_base *base = family_to_base(daddr->family);
399
	struct inet_peer *p;
E
Eric Dumazet 已提交
400
	unsigned int sequence;
E
Eric Dumazet 已提交
401
	int invalidated, gccnt = 0;
L
Linus Torvalds 已提交
402

E
Eric Dumazet 已提交
403
	/* Attempt a lockless lookup first.
E
Eric Dumazet 已提交
404 405
	 * Because of a concurrent writer, we might not find an existing entry.
	 */
406
	rcu_read_lock();
E
Eric Dumazet 已提交
407
	sequence = read_seqbegin(&base->lock);
E
Eric Dumazet 已提交
408
	p = lookup_rcu(daddr, base);
E
Eric Dumazet 已提交
409
	invalidated = read_seqretry(&base->lock, sequence);
410
	rcu_read_unlock();
E
Eric Dumazet 已提交
411

E
Eric Dumazet 已提交
412
	if (p)
E
Eric Dumazet 已提交
413
		return p;
L
Linus Torvalds 已提交
414

E
Eric Dumazet 已提交
415 416 417 418
	/* If no writer did a change during our lookup, we can return early. */
	if (!create && !invalidated)
		return NULL;

E
Eric Dumazet 已提交
419 420 421
	/* retry an exact lookup, taking the lock before.
	 * At least, nodes should be hot in our cache.
	 */
E
Eric Dumazet 已提交
422
	write_seqlock_bh(&base->lock);
E
Eric Dumazet 已提交
423
relookup:
424
	p = lookup(daddr, stack, base);
L
Linus Torvalds 已提交
425
	if (p != peer_avl_empty) {
E
Eric Dumazet 已提交
426
		atomic_inc(&p->refcnt);
E
Eric Dumazet 已提交
427
		write_sequnlock_bh(&base->lock);
E
Eric Dumazet 已提交
428 429 430 431 432 433
		return p;
	}
	if (!gccnt) {
		gccnt = inet_peer_gc(base, stack, stackptr);
		if (gccnt && create)
			goto relookup;
L
Linus Torvalds 已提交
434
	}
E
Eric Dumazet 已提交
435 436
	p = create ? kmem_cache_alloc(peer_cachep, GFP_ATOMIC) : NULL;
	if (p) {
437
		p->daddr = *daddr;
E
Eric Dumazet 已提交
438 439
		atomic_set(&p->refcnt, 1);
		atomic_set(&p->rid, 0);
440 441 442 443
		atomic_set(&p->ip_id_count,
				(daddr->family == AF_INET) ?
					secure_ip_id(daddr->addr.a4) :
					secure_ipv6_id(daddr->addr.a6));
E
Eric Dumazet 已提交
444
		p->tcp_ts_stamp = 0;
445
		p->metrics[RTAX_LOCK-1] = INETPEER_METRICS_NEW;
446 447
		p->rate_tokens = 0;
		p->rate_last = 0;
448
		p->pmtu_expires = 0;
H
Hiroaki SHIMODA 已提交
449
		p->pmtu_orig = 0;
450
		memset(&p->redirect_learned, 0, sizeof(p->redirect_learned));
E
Eric Dumazet 已提交
451 452 453


		/* Link the node. */
454 455
		link_to_pool(p, base);
		base->total++;
E
Eric Dumazet 已提交
456
	}
E
Eric Dumazet 已提交
457
	write_sequnlock_bh(&base->lock);
L
Linus Torvalds 已提交
458 459 460

	return p;
}
461
EXPORT_SYMBOL_GPL(inet_getpeer);
462

463 464
void inet_putpeer(struct inet_peer *p)
{
E
Eric Dumazet 已提交
465
	p->dtime = (__u32)jiffies;
E
Eric Dumazet 已提交
466
	smp_mb__before_atomic_dec();
E
Eric Dumazet 已提交
467
	atomic_dec(&p->refcnt);
468
}
469
EXPORT_SYMBOL_GPL(inet_putpeer);
470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510

/*
 *	Check transmit rate limitation for given message.
 *	The rate information is held in the inet_peer entries now.
 *	This function is generic and could be used for other purposes
 *	too. It uses a Token bucket filter as suggested by Alexey Kuznetsov.
 *
 *	Note that the same inet_peer fields are modified by functions in
 *	route.c too, but these work for packet destinations while xrlim_allow
 *	works for icmp destinations. This means the rate limiting information
 *	for one "ip object" is shared - and these ICMPs are twice limited:
 *	by source and by destination.
 *
 *	RFC 1812: 4.3.2.8 SHOULD be able to limit error message rate
 *			  SHOULD allow setting of rate limits
 *
 * 	Shared between ICMPv4 and ICMPv6.
 */
#define XRLIM_BURST_FACTOR 6
bool inet_peer_xrlim_allow(struct inet_peer *peer, int timeout)
{
	unsigned long now, token;
	bool rc = false;

	if (!peer)
		return true;

	token = peer->rate_tokens;
	now = jiffies;
	token += now - peer->rate_last;
	peer->rate_last = now;
	if (token > XRLIM_BURST_FACTOR * timeout)
		token = XRLIM_BURST_FACTOR * timeout;
	if (token >= timeout) {
		token -= timeout;
		rc = true;
	}
	peer->rate_tokens = token;
	return rc;
}
EXPORT_SYMBOL(inet_peer_xrlim_allow);