ip_vs_ctl.c 93.4 KB
Newer Older
L
Linus Torvalds 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
/*
 * IPVS         An implementation of the IP virtual server support for the
 *              LINUX operating system.  IPVS is now implemented as a module
 *              over the NetFilter framework. IPVS can be used to build a
 *              high-performance and highly available server based on a
 *              cluster of servers.
 *
 * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
 *              Peter Kese <peter.kese@ijs.si>
 *              Julian Anastasov <ja@ssi.bg>
 *
 *              This program is free software; you can redistribute it and/or
 *              modify it under the terms of the GNU General Public License
 *              as published by the Free Software Foundation; either version
 *              2 of the License, or (at your option) any later version.
 *
 * Changes:
 *
 */

H
Hannes Eder 已提交
21 22 23
#define KMSG_COMPONENT "IPVS"
#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt

L
Linus Torvalds 已提交
24 25 26
#include <linux/module.h>
#include <linux/init.h>
#include <linux/types.h>
27
#include <linux/capability.h>
L
Linus Torvalds 已提交
28 29 30 31 32 33
#include <linux/fs.h>
#include <linux/sysctl.h>
#include <linux/proc_fs.h>
#include <linux/workqueue.h>
#include <linux/swap.h>
#include <linux/seq_file.h>
34
#include <linux/slab.h>
L
Linus Torvalds 已提交
35 36 37

#include <linux/netfilter.h>
#include <linux/netfilter_ipv4.h>
38
#include <linux/mutex.h>
L
Linus Torvalds 已提交
39

40
#include <net/net_namespace.h>
41
#include <linux/nsproxy.h>
L
Linus Torvalds 已提交
42
#include <net/ip.h>
43 44 45 46
#ifdef CONFIG_IP_VS_IPV6
#include <net/ipv6.h>
#include <net/ip6_route.h>
#endif
47
#include <net/route.h>
L
Linus Torvalds 已提交
48
#include <net/sock.h>
49
#include <net/genetlink.h>
L
Linus Torvalds 已提交
50 51 52 53 54 55

#include <asm/uaccess.h>

#include <net/ip_vs.h>

/* Mutex serializing IPVS sockopts; [gs]etsockopt may sleep. */
56
static DEFINE_MUTEX(__ip_vs_mutex);
L
Linus Torvalds 已提交
57 58 59 60 61 62 63 64 65 66 67 68

/* sysctl variables */

#ifdef CONFIG_IP_VS_DEBUG
/* IPVS debug verbosity; static storage, so it starts out as 0. */
static int sysctl_ip_vs_debug_level;

/* Return the current value of the IPVS debug level. */
int ip_vs_get_debug_level(void)
{
	const int level = sysctl_ip_vs_debug_level;

	return level;
}
#endif

69 70

/*  Protos */
J
Julian Anastasov 已提交
71
static void __ip_vs_del_service(struct ip_vs_service *svc, bool cleanup);
72 73


74 75
#ifdef CONFIG_IP_VS_IPV6
/* Taken from rt6_fill_node() in net/ipv6/route.c, is there a better way? */
76 77
static bool __ip_vs_addr_is_local_v6(struct net *net,
				     const struct in6_addr *addr)
78
{
79 80
	struct flowi6 fl6 = {
		.daddr = *addr,
81
	};
82 83
	struct dst_entry *dst = ip6_route_output(net, NULL, &fl6);
	bool is_local;
84

85
	is_local = !dst->error && dst->dev && (dst->dev->flags & IFF_LOOPBACK);
86

87 88
	dst_release(dst);
	return is_local;
89 90
}
#endif
91 92

#ifdef CONFIG_SYSCTL
L
Linus Torvalds 已提交
93
/*
94 95
 *	update_defense_level is called from keventd and from sysctl,
 *	so it needs to protect itself from softirqs
L
Linus Torvalds 已提交
96
 */
97
static void update_defense_level(struct netns_ipvs *ipvs)
L
Linus Torvalds 已提交
98 99 100 101 102 103 104 105 106 107 108 109 110 111 112
{
	struct sysinfo i;
	static int old_secure_tcp = 0;
	int availmem;
	int nomem;
	int to_change = -1;

	/* we only count free and buffered memory (in pages) */
	si_meminfo(&i);
	availmem = i.freeram + i.bufferram;
	/* however in linux 2.5 the i.bufferram is total page cache size,
	   we need adjust it */
	/* si_swapinfo(&i); */
	/* availmem = availmem - (i.totalswap - i.freeswap); */

113
	nomem = (availmem < ipvs->sysctl_amemthresh);
L
Linus Torvalds 已提交
114

115 116
	local_bh_disable();

L
Linus Torvalds 已提交
117
	/* drop_entry */
118 119
	spin_lock(&ipvs->dropentry_lock);
	switch (ipvs->sysctl_drop_entry) {
L
Linus Torvalds 已提交
120
	case 0:
121
		atomic_set(&ipvs->dropentry, 0);
L
Linus Torvalds 已提交
122 123 124
		break;
	case 1:
		if (nomem) {
125 126
			atomic_set(&ipvs->dropentry, 1);
			ipvs->sysctl_drop_entry = 2;
L
Linus Torvalds 已提交
127
		} else {
128
			atomic_set(&ipvs->dropentry, 0);
L
Linus Torvalds 已提交
129 130 131 132
		}
		break;
	case 2:
		if (nomem) {
133
			atomic_set(&ipvs->dropentry, 1);
L
Linus Torvalds 已提交
134
		} else {
135 136
			atomic_set(&ipvs->dropentry, 0);
			ipvs->sysctl_drop_entry = 1;
L
Linus Torvalds 已提交
137 138 139
		};
		break;
	case 3:
140
		atomic_set(&ipvs->dropentry, 1);
L
Linus Torvalds 已提交
141 142
		break;
	}
143
	spin_unlock(&ipvs->dropentry_lock);
L
Linus Torvalds 已提交
144 145

	/* drop_packet */
146 147
	spin_lock(&ipvs->droppacket_lock);
	switch (ipvs->sysctl_drop_packet) {
L
Linus Torvalds 已提交
148
	case 0:
149
		ipvs->drop_rate = 0;
L
Linus Torvalds 已提交
150 151 152
		break;
	case 1:
		if (nomem) {
153 154 155 156
			ipvs->drop_rate = ipvs->drop_counter
				= ipvs->sysctl_amemthresh /
				(ipvs->sysctl_amemthresh-availmem);
			ipvs->sysctl_drop_packet = 2;
L
Linus Torvalds 已提交
157
		} else {
158
			ipvs->drop_rate = 0;
L
Linus Torvalds 已提交
159 160 161 162
		}
		break;
	case 2:
		if (nomem) {
163 164 165
			ipvs->drop_rate = ipvs->drop_counter
				= ipvs->sysctl_amemthresh /
				(ipvs->sysctl_amemthresh-availmem);
L
Linus Torvalds 已提交
166
		} else {
167 168
			ipvs->drop_rate = 0;
			ipvs->sysctl_drop_packet = 1;
L
Linus Torvalds 已提交
169 170 171
		}
		break;
	case 3:
172
		ipvs->drop_rate = ipvs->sysctl_am_droprate;
L
Linus Torvalds 已提交
173 174
		break;
	}
175
	spin_unlock(&ipvs->droppacket_lock);
L
Linus Torvalds 已提交
176 177

	/* secure_tcp */
178 179
	spin_lock(&ipvs->securetcp_lock);
	switch (ipvs->sysctl_secure_tcp) {
L
Linus Torvalds 已提交
180 181 182 183 184 185 186 187
	case 0:
		if (old_secure_tcp >= 2)
			to_change = 0;
		break;
	case 1:
		if (nomem) {
			if (old_secure_tcp < 2)
				to_change = 1;
188
			ipvs->sysctl_secure_tcp = 2;
L
Linus Torvalds 已提交
189 190 191 192 193 194 195 196 197 198 199 200
		} else {
			if (old_secure_tcp >= 2)
				to_change = 0;
		}
		break;
	case 2:
		if (nomem) {
			if (old_secure_tcp < 2)
				to_change = 1;
		} else {
			if (old_secure_tcp >= 2)
				to_change = 0;
201
			ipvs->sysctl_secure_tcp = 1;
L
Linus Torvalds 已提交
202 203 204 205 206 207 208
		}
		break;
	case 3:
		if (old_secure_tcp < 2)
			to_change = 1;
		break;
	}
209
	old_secure_tcp = ipvs->sysctl_secure_tcp;
L
Linus Torvalds 已提交
210
	if (to_change >= 0)
211
		ip_vs_protocol_timeout_change(ipvs,
212 213
					      ipvs->sysctl_secure_tcp > 1);
	spin_unlock(&ipvs->securetcp_lock);
214 215

	local_bh_enable();
L
Linus Torvalds 已提交
216 217 218 219 220 221 222 223
}


/*
 *	Timer for checking the defense
 */
#define DEFENSE_TIMER_PERIOD	1*HZ

D
David Howells 已提交
224
static void defense_work_handler(struct work_struct *work)
L
Linus Torvalds 已提交
225
{
226 227
	struct netns_ipvs *ipvs =
		container_of(work, struct netns_ipvs, defense_work.work);
228 229

	update_defense_level(ipvs);
230
	if (atomic_read(&ipvs->dropentry))
231 232
		ip_vs_random_dropentry(ipvs->net);
	schedule_delayed_work(&ipvs->defense_work, DEFENSE_TIMER_PERIOD);
L
Linus Torvalds 已提交
233
}
234
#endif
L
Linus Torvalds 已提交
235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256

/* Pin this module; hands back whatever try_module_get() reports. */
int ip_vs_use_count_inc(void)
{
	int got = try_module_get(THIS_MODULE);

	return got;
}

/* Release a module reference via module_put(). */
void ip_vs_use_count_dec(void)
{
	module_put(THIS_MODULE);
}


/*
 *	Hash table: for virtual service lookups
 *
 *	The bucket count is a power of two so that a hash value can be
 *	reduced to a bucket index with a simple mask.
 */
#define IP_VS_SVC_TAB_BITS	8
#define IP_VS_SVC_TAB_SIZE	(1 << IP_VS_SVC_TAB_BITS)
#define IP_VS_SVC_TAB_MASK	(IP_VS_SVC_TAB_SIZE - 1)

/* the service table hashed by <protocol, addr, port> */
J
Julian Anastasov 已提交
257
static struct hlist_head ip_vs_svc_table[IP_VS_SVC_TAB_SIZE];
L
Linus Torvalds 已提交
258
/* the service table hashed by fwmark */
J
Julian Anastasov 已提交
259
static struct hlist_head ip_vs_svc_fwm_table[IP_VS_SVC_TAB_SIZE];
L
Linus Torvalds 已提交
260 261 262 263 264


/*
 *	Returns hash value for virtual service
 */
265 266
static inline unsigned int
ip_vs_svc_hashkey(struct net *net, int af, unsigned int proto,
267
		  const union nf_inet_addr *addr, __be16 port)
L
Linus Torvalds 已提交
268
{
269
	register unsigned int porth = ntohs(port);
270
	__be32 addr_fold = addr->ip;
271
	__u32 ahash;
L
Linus Torvalds 已提交
272

273 274 275 276 277
#ifdef CONFIG_IP_VS_IPV6
	if (af == AF_INET6)
		addr_fold = addr->ip6[0]^addr->ip6[1]^
			    addr->ip6[2]^addr->ip6[3];
#endif
278 279
	ahash = ntohl(addr_fold);
	ahash ^= ((size_t) net >> 8);
280

281 282
	return (proto ^ ahash ^ (porth >> IP_VS_SVC_TAB_BITS) ^ porth) &
	       IP_VS_SVC_TAB_MASK;
L
Linus Torvalds 已提交
283 284 285 286 287
}

/*
 *	Returns hash value of fwmark for virtual service lookup
 */
288
static inline unsigned int ip_vs_svc_fwm_hashkey(struct net *net, __u32 fwmark)
L
Linus Torvalds 已提交
289
{
290
	return (((size_t)net>>8) ^ fwmark) & IP_VS_SVC_TAB_MASK;
L
Linus Torvalds 已提交
291 292 293
}

/*
294
 *	Hashes a service in the ip_vs_svc_table by <netns,proto,addr,port>
L
Linus Torvalds 已提交
295 296 297 298 299
 *	or in the ip_vs_svc_fwm_table by fwmark.
 *	Should be called with locked tables.
 */
static int ip_vs_svc_hash(struct ip_vs_service *svc)
{
300
	unsigned int hash;
L
Linus Torvalds 已提交
301 302

	if (svc->flags & IP_VS_SVC_F_HASHED) {
303 304
		pr_err("%s(): request for already hashed, called from %pF\n",
		       __func__, __builtin_return_address(0));
L
Linus Torvalds 已提交
305 306 307 308 309
		return 0;
	}

	if (svc->fwmark == 0) {
		/*
310
		 *  Hash it by <netns,protocol,addr,port> in ip_vs_svc_table
L
Linus Torvalds 已提交
311
		 */
312 313
		hash = ip_vs_svc_hashkey(svc->net, svc->af, svc->protocol,
					 &svc->addr, svc->port);
J
Julian Anastasov 已提交
314
		hlist_add_head_rcu(&svc->s_list, &ip_vs_svc_table[hash]);
L
Linus Torvalds 已提交
315 316
	} else {
		/*
317
		 *  Hash it by fwmark in svc_fwm_table
L
Linus Torvalds 已提交
318
		 */
319
		hash = ip_vs_svc_fwm_hashkey(svc->net, svc->fwmark);
J
Julian Anastasov 已提交
320
		hlist_add_head_rcu(&svc->f_list, &ip_vs_svc_fwm_table[hash]);
L
Linus Torvalds 已提交
321 322 323 324 325 326 327 328 329 330
	}

	svc->flags |= IP_VS_SVC_F_HASHED;
	/* increase its refcnt because it is referenced by the svc table */
	atomic_inc(&svc->refcnt);
	return 1;
}


/*
 *	Unhashes a service from svc_table / svc_fwm_table.
 *	Should be called with locked tables.
 *
 *	Returns 1 when the service was unlinked, 0 when it was not
 *	flagged as hashed (an error is logged in that case).
 */
static int ip_vs_svc_unhash(struct ip_vs_service *svc)
{
	if (!(svc->flags & IP_VS_SVC_F_HASHED)) {
		pr_err("%s(): request for unhash flagged, called from %pF\n",
		       __func__, __builtin_return_address(0));
		return 0;
	}

	/* the fwmark decides which of the two tables holds the service */
	if (svc->fwmark != 0)
		hlist_del_rcu(&svc->f_list);
	else
		hlist_del_rcu(&svc->s_list);

	svc->flags &= ~IP_VS_SVC_F_HASHED;
	/* give back the reference taken when the service was hashed */
	atomic_dec(&svc->refcnt);
	return 1;
}


/*
357
 *	Get service by {netns, proto,addr,port} in the service table.
L
Linus Torvalds 已提交
358
 */
359
static inline struct ip_vs_service *
360 361
__ip_vs_service_find(struct net *net, int af, __u16 protocol,
		     const union nf_inet_addr *vaddr, __be16 vport)
L
Linus Torvalds 已提交
362
{
363
	unsigned int hash;
L
Linus Torvalds 已提交
364 365 366
	struct ip_vs_service *svc;

	/* Check for "full" addressed entries */
367
	hash = ip_vs_svc_hashkey(net, af, protocol, vaddr, vport);
L
Linus Torvalds 已提交
368

J
Julian Anastasov 已提交
369
	hlist_for_each_entry_rcu(svc, &ip_vs_svc_table[hash], s_list) {
370 371
		if ((svc->af == af)
		    && ip_vs_addr_equal(af, &svc->addr, vaddr)
L
Linus Torvalds 已提交
372
		    && (svc->port == vport)
373 374
		    && (svc->protocol == protocol)
		    && net_eq(svc->net, net)) {
L
Linus Torvalds 已提交
375 376 377 378 379 380 381 382 383 384 385 386
			/* HIT */
			return svc;
		}
	}

	return NULL;
}


/*
 *	Get service by {fwmark} in the service table.
 */
387
static inline struct ip_vs_service *
388
__ip_vs_svc_fwm_find(struct net *net, int af, __u32 fwmark)
L
Linus Torvalds 已提交
389
{
390
	unsigned int hash;
L
Linus Torvalds 已提交
391 392 393
	struct ip_vs_service *svc;

	/* Check for fwmark addressed entries */
394
	hash = ip_vs_svc_fwm_hashkey(net, fwmark);
L
Linus Torvalds 已提交
395

J
Julian Anastasov 已提交
396
	hlist_for_each_entry_rcu(svc, &ip_vs_svc_fwm_table[hash], f_list) {
397 398
		if (svc->fwmark == fwmark && svc->af == af
		    && net_eq(svc->net, net)) {
L
Linus Torvalds 已提交
399 400 401 402 403 404 405 406
			/* HIT */
			return svc;
		}
	}

	return NULL;
}

J
Julian Anastasov 已提交
407
/* Find service, called under RCU lock */
L
Linus Torvalds 已提交
408
struct ip_vs_service *
J
Julian Anastasov 已提交
409 410
ip_vs_service_find(struct net *net, int af, __u32 fwmark, __u16 protocol,
		   const union nf_inet_addr *vaddr, __be16 vport)
L
Linus Torvalds 已提交
411 412
{
	struct ip_vs_service *svc;
413
	struct netns_ipvs *ipvs = net_ipvs(net);
414

L
Linus Torvalds 已提交
415 416 417
	/*
	 *	Check the table hashed by fwmark first
	 */
418 419 420 421 422
	if (fwmark) {
		svc = __ip_vs_svc_fwm_find(net, af, fwmark);
		if (svc)
			goto out;
	}
L
Linus Torvalds 已提交
423 424 425 426 427

	/*
	 *	Check the table hashed by <protocol,addr,port>
	 *	for "full" addressed entries
	 */
428
	svc = __ip_vs_service_find(net, af, protocol, vaddr, vport);
L
Linus Torvalds 已提交
429 430 431

	if (svc == NULL
	    && protocol == IPPROTO_TCP
432
	    && atomic_read(&ipvs->ftpsvc_counter)
L
Linus Torvalds 已提交
433 434 435 436 437
	    && (vport == FTPDATA || ntohs(vport) >= PROT_SOCK)) {
		/*
		 * Check if ftp service entry exists, the packet
		 * might belong to FTP data connections.
		 */
438
		svc = __ip_vs_service_find(net, af, protocol, vaddr, FTPPORT);
L
Linus Torvalds 已提交
439 440 441
	}

	if (svc == NULL
442
	    && atomic_read(&ipvs->nullsvc_counter)) {
L
Linus Torvalds 已提交
443 444 445
		/*
		 * Check if the catch-all port (port zero) exists
		 */
446
		svc = __ip_vs_service_find(net, af, protocol, vaddr, 0);
L
Linus Torvalds 已提交
447 448 449
	}

  out:
450 451 452 453
	IP_VS_DBG_BUF(9, "lookup service: fwm %u %s %s:%u %s\n",
		      fwmark, ip_vs_proto_name(protocol),
		      IP_VS_DBG_ADDR(af, vaddr), ntohs(vport),
		      svc ? "hit" : "not hit");
L
Linus Torvalds 已提交
454 455 456 457 458 459 460 461 462

	return svc;
}


static inline void
__ip_vs_bind_svc(struct ip_vs_dest *dest, struct ip_vs_service *svc)
{
	atomic_inc(&svc->refcnt);
463
	rcu_assign_pointer(dest->svc, svc);
L
Linus Torvalds 已提交
464 465
}

J
Julian Anastasov 已提交
466 467 468 469 470 471 472
/*
 * Release the per-cpu statistics of @svc and then the service itself.
 * free_percpu() ignores a NULL pointer, so no guard is needed; this also
 * matches how ip_vs_dest_free() releases its per-cpu statistics.
 */
static void ip_vs_service_free(struct ip_vs_service *svc)
{
	free_percpu(svc->stats.cpustats);
	kfree(svc);
}

473
static void ip_vs_service_rcu_free(struct rcu_head *head)
L
Linus Torvalds 已提交
474
{
475 476 477 478 479
	struct ip_vs_service *svc;

	svc = container_of(head, struct ip_vs_service, rcu_head);
	ip_vs_service_free(svc);
}
L
Linus Torvalds 已提交
480

481 482
/*
 * Drop one reference on @svc.  When it was the last one the service is
 * freed: through call_rcu() if @do_delay is set, immediately otherwise.
 */
static void __ip_vs_svc_put(struct ip_vs_service *svc, bool do_delay)
{
	if (!atomic_dec_and_test(&svc->refcnt))
		return;

	IP_VS_DBG_BUF(3, "Removing service %u/%s:%u\n",
		      svc->fwmark,
		      IP_VS_DBG_ADDR(svc->af, &svc->addr),
		      ntohs(svc->port));

	if (!do_delay) {
		ip_vs_service_free(svc);
		return;
	}
	call_rcu(&svc->rcu_head, ip_vs_service_rcu_free);
}


/*
 *	Returns hash value for real service
 */
499
static inline unsigned int ip_vs_rs_hashkey(int af,
500 501
					    const union nf_inet_addr *addr,
					    __be16 port)
L
Linus Torvalds 已提交
502
{
503
	register unsigned int porth = ntohs(port);
504 505 506 507 508 509 510
	__be32 addr_fold = addr->ip;

#ifdef CONFIG_IP_VS_IPV6
	if (af == AF_INET6)
		addr_fold = addr->ip6[0]^addr->ip6[1]^
			    addr->ip6[2]^addr->ip6[3];
#endif
L
Linus Torvalds 已提交
511

512
	return (ntohl(addr_fold)^(porth>>IP_VS_RTAB_BITS)^porth)
L
Linus Torvalds 已提交
513 514 515
		& IP_VS_RTAB_MASK;
}

516 517
/* Hash ip_vs_dest in rs_table by <proto,addr,port>. */
static void ip_vs_rs_hash(struct netns_ipvs *ipvs, struct ip_vs_dest *dest)
L
Linus Torvalds 已提交
518
{
519
	unsigned int hash;
L
Linus Torvalds 已提交
520

521 522
	if (dest->in_rs_table)
		return;
L
Linus Torvalds 已提交
523 524 525 526 527

	/*
	 *	Hash by proto,addr,port,
	 *	which are the parameters of the real service.
	 */
528 529
	hash = ip_vs_rs_hashkey(dest->af, &dest->addr, dest->port);

530 531
	hlist_add_head_rcu(&dest->d_list, &ipvs->rs_table[hash]);
	dest->in_rs_table = 1;
L
Linus Torvalds 已提交
532 533
}

534 535
/* Unhash ip_vs_dest from rs_table. */
static void ip_vs_rs_unhash(struct ip_vs_dest *dest)
L
Linus Torvalds 已提交
536 537
{
	/*
538
	 * Remove it from the rs_table table.
L
Linus Torvalds 已提交
539
	 */
540 541 542
	if (dest->in_rs_table) {
		hlist_del_rcu(&dest->d_list);
		dest->in_rs_table = 0;
L
Linus Torvalds 已提交
543 544 545
	}
}

546 547 548
/*
 * Check if real service by <proto,addr,port> is present.
 * An entry whose vfwmark is set matches regardless of its protocol.
 * Takes the RCU read lock itself for the duration of the walk.
 */
bool ip_vs_has_real_service(struct net *net, int af, __u16 protocol,
			    const union nf_inet_addr *daddr, __be16 dport)
{
	struct netns_ipvs *ipvs = net_ipvs(net);
	struct ip_vs_dest *dest;
	unsigned int hash;
	bool found = false;

	/* Check for "full" addressed entries */
	hash = ip_vs_rs_hashkey(af, daddr, dport);

	rcu_read_lock();
	hlist_for_each_entry_rcu(dest, &ipvs->rs_table[hash], d_list) {
		if (dest->port != dport || dest->af != af)
			continue;
		if (!ip_vs_addr_equal(af, &dest->addr, daddr))
			continue;
		if (dest->protocol == protocol || dest->vfwmark) {
			/* HIT */
			found = true;
			break;
		}
	}
	rcu_read_unlock();

	return found;
}

J
Julian Anastasov 已提交
573 574
/* Lookup destination by {addr,port} in the given service
 * Called under RCU lock.
L
Linus Torvalds 已提交
575 576
 */
static struct ip_vs_dest *
577 578
ip_vs_lookup_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr,
		  __be16 dport)
L
Linus Torvalds 已提交
579 580 581 582 583 584
{
	struct ip_vs_dest *dest;

	/*
	 * Find the destination for the given service
	 */
J
Julian Anastasov 已提交
585
	list_for_each_entry_rcu(dest, &svc->destinations, n_list) {
586 587 588
		if ((dest->af == svc->af)
		    && ip_vs_addr_equal(svc->af, &dest->addr, daddr)
		    && (dest->port == dport)) {
L
Linus Torvalds 已提交
589 590 591 592 593 594 595 596
			/* HIT */
			return dest;
		}
	}

	return NULL;
}

597 598
/*
 * Find destination by {daddr,dport,vaddr,protocol}
J
Julian Anastasov 已提交
599
 * Created to be used in ip_vs_process_message() in
600 601 602
 * the backup synchronization daemon. It finds the
 * destination to be bound to the received connection
 * on the backup.
J
Julian Anastasov 已提交
603
 * Called under RCU lock, no refcnt is returned.
604
 */
605 606
struct ip_vs_dest *ip_vs_find_dest(struct net  *net, int af,
				   const union nf_inet_addr *daddr,
607 608
				   __be16 dport,
				   const union nf_inet_addr *vaddr,
609 610
				   __be16 vport, __u16 protocol, __u32 fwmark,
				   __u32 flags)
611 612 613
{
	struct ip_vs_dest *dest;
	struct ip_vs_service *svc;
614
	__be16 port = dport;
615

J
Julian Anastasov 已提交
616
	svc = ip_vs_service_find(net, af, fwmark, protocol, vaddr, vport);
617 618
	if (!svc)
		return NULL;
619 620 621 622 623
	if (fwmark && (flags & IP_VS_CONN_F_FWD_MASK) != IP_VS_CONN_F_MASQ)
		port = 0;
	dest = ip_vs_lookup_dest(svc, daddr, port);
	if (!dest)
		dest = ip_vs_lookup_dest(svc, daddr, port ^ dport);
624 625
	return dest;
}
L
Linus Torvalds 已提交
626

627 628 629 630 631 632 633 634 635 636 637
/* RCU callback: release the cached dst and free its ip_vs_dest_dst. */
void ip_vs_dest_dst_rcu_free(struct rcu_head *head)
{
	struct ip_vs_dest_dst *dest_dst;

	dest_dst = container_of(head, struct ip_vs_dest_dst, rcu_head);
	dst_release(dest_dst->dst_cache);
	kfree(dest_dst);
}

/* Release dest_dst and dst_cache for dest in user context */
638 639
static void __ip_vs_dst_cache_reset(struct ip_vs_dest *dest)
{
640
	struct ip_vs_dest_dst *old;
641

642 643 644 645 646
	old = rcu_dereference_protected(dest->dest_dst, 1);
	if (old) {
		RCU_INIT_POINTER(dest->dest_dst, NULL);
		call_rcu(&old->rcu_head, ip_vs_dest_dst_rcu_free);
	}
647 648
}

L
Linus Torvalds 已提交
649 650 651 652 653 654 655 656 657 658 659
/*
 *  Lookup dest by {svc,addr,port} in the destination trash.
 *  The destination trash is used to hold the destinations that are removed
 *  from the service table but are still referenced by some conn entries.
 *  The reason to add the destination trash is when the dest is temporary
 *  down (either by administrator or by monitor program), the dest can be
 *  picked back from the trash, the remaining connections to the dest can
 *  continue, and the counting information of the dest is also useful for
 *  scheduling.
 */
static struct ip_vs_dest *
660 661
ip_vs_trash_get_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr,
		     __be16 dport)
L
Linus Torvalds 已提交
662
{
J
Julian Anastasov 已提交
663
	struct ip_vs_dest *dest;
H
Hans Schillstrom 已提交
664
	struct netns_ipvs *ipvs = net_ipvs(svc->net);
L
Linus Torvalds 已提交
665 666 667 668

	/*
	 * Find the destination in trash
	 */
J
Julian Anastasov 已提交
669 670
	spin_lock_bh(&ipvs->dest_trash_lock);
	list_for_each_entry(dest, &ipvs->dest_trash, t_list) {
671 672 673 674 675 676 677 678
		IP_VS_DBG_BUF(3, "Destination %u/%s:%u still in trash, "
			      "dest->refcnt=%d\n",
			      dest->vfwmark,
			      IP_VS_DBG_ADDR(svc->af, &dest->addr),
			      ntohs(dest->port),
			      atomic_read(&dest->refcnt));
		if (dest->af == svc->af &&
		    ip_vs_addr_equal(svc->af, &dest->addr, daddr) &&
L
Linus Torvalds 已提交
679 680 681 682
		    dest->port == dport &&
		    dest->vfwmark == svc->fwmark &&
		    dest->protocol == svc->protocol &&
		    (svc->fwmark ||
683
		     (ip_vs_addr_equal(svc->af, &dest->vaddr, &svc->addr) &&
L
Linus Torvalds 已提交
684 685
		      dest->vport == svc->port))) {
			/* HIT */
J
Julian Anastasov 已提交
686 687 688
			list_del(&dest->t_list);
			ip_vs_dest_hold(dest);
			goto out;
L
Linus Torvalds 已提交
689 690 691
		}
	}

J
Julian Anastasov 已提交
692 693 694 695 696 697
	dest = NULL;

out:
	spin_unlock_bh(&ipvs->dest_trash_lock);

	return dest;
L
Linus Torvalds 已提交
698 699
}

J
Julian Anastasov 已提交
700 701
static void ip_vs_dest_free(struct ip_vs_dest *dest)
{
702 703
	struct ip_vs_service *svc = rcu_dereference_protected(dest->svc, 1);

J
Julian Anastasov 已提交
704
	__ip_vs_dst_cache_reset(dest);
705
	__ip_vs_svc_put(svc, false);
J
Julian Anastasov 已提交
706
	free_percpu(dest->stats.cpustats);
707
	ip_vs_dest_put_and_free(dest);
J
Julian Anastasov 已提交
708
}
L
Linus Torvalds 已提交
709 710 711 712 713 714 715 716

/*
 *  Clean up all the destinations in the trash
 *  Called by the ip_vs_control_cleanup()
 *
 *  When the ip_vs_control_clearup is activated by ipvs module exit,
 *  the service tables must have been flushed and all the connections
 *  are expired, and the refcnt of each destination in the trash must
J
Julian Anastasov 已提交
717
 *  be 0, so we simply release them here.
L
Linus Torvalds 已提交
718
 */
H
Hans Schillstrom 已提交
719
static void ip_vs_trash_cleanup(struct net *net)
L
Linus Torvalds 已提交
720 721
{
	struct ip_vs_dest *dest, *nxt;
H
Hans Schillstrom 已提交
722
	struct netns_ipvs *ipvs = net_ipvs(net);
L
Linus Torvalds 已提交
723

J
Julian Anastasov 已提交
724 725 726 727 728
	del_timer_sync(&ipvs->dest_trash_timer);
	/* No need to use dest_trash_lock */
	list_for_each_entry_safe(dest, nxt, &ipvs->dest_trash, t_list) {
		list_del(&dest->t_list);
		ip_vs_dest_free(dest);
L
Linus Torvalds 已提交
729 730 731
	}
}

732 733 734 735 736 737 738 739 740 741 742 743 744
/*
 * Fill @dst with the counters of @src relative to the zero point saved
 * in src->ustats0, plus the estimator values, all under src->lock.
 */
static void
ip_vs_copy_stats(struct ip_vs_stats_user *dst, struct ip_vs_stats *src)
{
/* reported value = running counter - counter at the last zeroing */
#define IP_VS_SHOW_STATS_COUNTER(c) dst->c = src->ustats.c - src->ustats0.c

	spin_lock_bh(&src->lock);

	IP_VS_SHOW_STATS_COUNTER(conns);
	IP_VS_SHOW_STATS_COUNTER(inpkts);
	IP_VS_SHOW_STATS_COUNTER(outpkts);
	IP_VS_SHOW_STATS_COUNTER(inbytes);
	IP_VS_SHOW_STATS_COUNTER(outbytes);

	/* rates are read from the estimator */
	ip_vs_read_estimator(dst, src);

	spin_unlock_bh(&src->lock);
}
L
Linus Torvalds 已提交
749 750 751 752 753

/*
 * "Zero" @stats without touching the running counters: the current
 * values are remembered in ustats0 as the new zero point and the
 * estimator is reset, all under stats->lock.
 */
static void
ip_vs_zero_stats(struct ip_vs_stats *stats)
{
/* remember the current counter as the new zero point */
#define IP_VS_ZERO_STATS_COUNTER(c) stats->ustats0.c = stats->ustats.c

	spin_lock_bh(&stats->lock);

	/* get current counters as zero point, rates are zeroed */
	IP_VS_ZERO_STATS_COUNTER(conns);
	IP_VS_ZERO_STATS_COUNTER(inpkts);
	IP_VS_ZERO_STATS_COUNTER(outpkts);
	IP_VS_ZERO_STATS_COUNTER(inbytes);
	IP_VS_ZERO_STATS_COUNTER(outbytes);

	ip_vs_zero_estimator(stats);

	spin_unlock_bh(&stats->lock);
}

/*
 *	Update a destination in the given service
 */
static void
774 775
__ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest,
		    struct ip_vs_dest_user_kern *udest, int add)
L
Linus Torvalds 已提交
776
{
777
	struct netns_ipvs *ipvs = net_ipvs(svc->net);
778
	struct ip_vs_service *old_svc;
J
Julian Anastasov 已提交
779
	struct ip_vs_scheduler *sched;
L
Linus Torvalds 已提交
780 781 782 783
	int conn_flags;

	/* set the weight and the flags */
	atomic_set(&dest->weight, udest->weight);
784 785
	conn_flags = udest->conn_flags & IP_VS_CONN_F_DEST_MASK;
	conn_flags |= IP_VS_CONN_F_INACTIVE;
L
Linus Torvalds 已提交
786 787

	/* set the IP_VS_CONN_F_NOOUTPUT flag if not masquerading/NAT */
788
	if ((conn_flags & IP_VS_CONN_F_FWD_MASK) != IP_VS_CONN_F_MASQ) {
L
Linus Torvalds 已提交
789 790 791
		conn_flags |= IP_VS_CONN_F_NOOUTPUT;
	} else {
		/*
792
		 *    Put the real service in rs_table if not present.
L
Linus Torvalds 已提交
793 794
		 *    For now only for NAT!
		 */
795
		ip_vs_rs_hash(ipvs, dest);
L
Linus Torvalds 已提交
796 797 798 799
	}
	atomic_set(&dest->conn_flags, conn_flags);

	/* bind the service */
800 801
	old_svc = rcu_dereference_protected(dest->svc, 1);
	if (!old_svc) {
L
Linus Torvalds 已提交
802 803
		__ip_vs_bind_svc(dest, svc);
	} else {
804
		if (old_svc != svc) {
L
Linus Torvalds 已提交
805 806
			ip_vs_zero_stats(&dest->stats);
			__ip_vs_bind_svc(dest, svc);
807
			__ip_vs_svc_put(old_svc, true);
L
Linus Torvalds 已提交
808 809 810 811 812 813 814 815 816 817
		}
	}

	/* set the dest status flags */
	dest->flags |= IP_VS_DEST_F_AVAILABLE;

	if (udest->u_threshold == 0 || udest->u_threshold > dest->u_threshold)
		dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
	dest->u_threshold = udest->u_threshold;
	dest->l_threshold = udest->l_threshold;
818

819
	spin_lock_bh(&dest->dst_lock);
820
	__ip_vs_dst_cache_reset(dest);
821
	spin_unlock_bh(&dest->dst_lock);
822

J
Julian Anastasov 已提交
823
	sched = rcu_dereference_protected(svc->scheduler, 1);
824
	if (add) {
J
Julian Anastasov 已提交
825
		ip_vs_start_estimator(svc->net, &dest->stats);
J
Julian Anastasov 已提交
826
		list_add_rcu(&dest->n_list, &svc->destinations);
827
		svc->num_dests++;
J
Julian Anastasov 已提交
828 829
		if (sched->add_dest)
			sched->add_dest(svc, dest);
830
	} else {
J
Julian Anastasov 已提交
831 832
		if (sched->upd_dest)
			sched->upd_dest(svc, dest);
833
	}
L
Linus Torvalds 已提交
834 835 836 837 838 839 840
}


/*
 *	Create a destination for the given service
 */
static int
841
ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest,
L
Linus Torvalds 已提交
842 843 844
	       struct ip_vs_dest **dest_p)
{
	struct ip_vs_dest *dest;
845
	unsigned int atype, i;
L
Linus Torvalds 已提交
846 847 848

	EnterFunction(2);

849 850 851
#ifdef CONFIG_IP_VS_IPV6
	if (svc->af == AF_INET6) {
		atype = ipv6_addr_type(&udest->addr.in6);
852 853
		if ((!(atype & IPV6_ADDR_UNICAST) ||
			atype & IPV6_ADDR_LINKLOCAL) &&
854
			!__ip_vs_addr_is_local_v6(svc->net, &udest->addr.in6))
855 856 857 858
			return -EINVAL;
	} else
#endif
	{
859
		atype = inet_addr_type(svc->net, udest->addr.ip);
860 861 862
		if (atype != RTN_LOCAL && atype != RTN_UNICAST)
			return -EINVAL;
	}
L
Linus Torvalds 已提交
863

864
	dest = kzalloc(sizeof(struct ip_vs_dest), GFP_KERNEL);
865
	if (dest == NULL)
L
Linus Torvalds 已提交
866
		return -ENOMEM;
867

868
	dest->stats.cpustats = alloc_percpu(struct ip_vs_cpu_stats);
869
	if (!dest->stats.cpustats)
870
		goto err_alloc;
L
Linus Torvalds 已提交
871

872 873 874 875 876 877
	for_each_possible_cpu(i) {
		struct ip_vs_cpu_stats *ip_vs_dest_stats;
		ip_vs_dest_stats = per_cpu_ptr(dest->stats.cpustats, i);
		u64_stats_init(&ip_vs_dest_stats->syncp);
	}

878
	dest->af = svc->af;
L
Linus Torvalds 已提交
879
	dest->protocol = svc->protocol;
880
	dest->vaddr = svc->addr;
L
Linus Torvalds 已提交
881 882
	dest->vport = svc->port;
	dest->vfwmark = svc->fwmark;
883
	ip_vs_addr_copy(svc->af, &dest->addr, &udest->addr);
L
Linus Torvalds 已提交
884 885 886 887 888
	dest->port = udest->port;

	atomic_set(&dest->activeconns, 0);
	atomic_set(&dest->inactconns, 0);
	atomic_set(&dest->persistconns, 0);
889
	atomic_set(&dest->refcnt, 1);
L
Linus Torvalds 已提交
890

891
	INIT_HLIST_NODE(&dest->d_list);
L
Linus Torvalds 已提交
892 893
	spin_lock_init(&dest->dst_lock);
	spin_lock_init(&dest->stats.lock);
894
	__ip_vs_update_dest(svc, dest, udest, 1);
L
Linus Torvalds 已提交
895 896 897 898 899

	*dest_p = dest;

	LeaveFunction(2);
	return 0;
900 901 902 903

err_alloc:
	kfree(dest);
	return -ENOMEM;
L
Linus Torvalds 已提交
904 905 906 907 908 909 910
}


/*
 *	Add a destination into an existing service
 */
static int
911
ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
L
Linus Torvalds 已提交
912 913
{
	struct ip_vs_dest *dest;
914
	union nf_inet_addr daddr;
A
Al Viro 已提交
915
	__be16 dport = udest->port;
L
Linus Torvalds 已提交
916 917 918 919 920
	int ret;

	EnterFunction(2);

	if (udest->weight < 0) {
921
		pr_err("%s(): server weight less than zero\n", __func__);
L
Linus Torvalds 已提交
922 923 924 925
		return -ERANGE;
	}

	if (udest->l_threshold > udest->u_threshold) {
926 927
		pr_err("%s(): lower threshold is higher than upper threshold\n",
			__func__);
L
Linus Torvalds 已提交
928 929 930
		return -ERANGE;
	}

931 932
	ip_vs_addr_copy(svc->af, &daddr, &udest->addr);

J
Julian Anastasov 已提交
933 934
	/* We use function that requires RCU lock */
	rcu_read_lock();
935
	dest = ip_vs_lookup_dest(svc, &daddr, dport);
J
Julian Anastasov 已提交
936
	rcu_read_unlock();
937

L
Linus Torvalds 已提交
938
	if (dest != NULL) {
939
		IP_VS_DBG(1, "%s(): dest already exists\n", __func__);
L
Linus Torvalds 已提交
940 941 942 943 944 945 946
		return -EEXIST;
	}

	/*
	 * Check if the dest already exists in the trash and
	 * is from the same service
	 */
947 948
	dest = ip_vs_trash_get_dest(svc, &daddr, dport);

L
Linus Torvalds 已提交
949
	if (dest != NULL) {
950 951 952 953 954 955 956 957
		IP_VS_DBG_BUF(3, "Get destination %s:%u from trash, "
			      "dest->refcnt=%d, service %u/%s:%u\n",
			      IP_VS_DBG_ADDR(svc->af, &daddr), ntohs(dport),
			      atomic_read(&dest->refcnt),
			      dest->vfwmark,
			      IP_VS_DBG_ADDR(svc->af, &dest->vaddr),
			      ntohs(dest->vport));

958 959 960
		__ip_vs_update_dest(svc, dest, udest, 1);
		ret = 0;
	} else {
L
Linus Torvalds 已提交
961
		/*
962
		 * Allocate and initialize the dest structure
L
Linus Torvalds 已提交
963
		 */
964
		ret = ip_vs_new_dest(svc, udest, &dest);
L
Linus Torvalds 已提交
965 966 967
	}
	LeaveFunction(2);

968
	return ret;
L
Linus Torvalds 已提交
969 970 971 972 973 974 975
}


/*
 *	Edit a destination in the given service
 */
static int
976
ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
L
Linus Torvalds 已提交
977 978
{
	struct ip_vs_dest *dest;
979
	union nf_inet_addr daddr;
A
Al Viro 已提交
980
	__be16 dport = udest->port;
L
Linus Torvalds 已提交
981 982 983 984

	EnterFunction(2);

	if (udest->weight < 0) {
985
		pr_err("%s(): server weight less than zero\n", __func__);
L
Linus Torvalds 已提交
986 987 988 989
		return -ERANGE;
	}

	if (udest->l_threshold > udest->u_threshold) {
990 991
		pr_err("%s(): lower threshold is higher than upper threshold\n",
			__func__);
L
Linus Torvalds 已提交
992 993 994
		return -ERANGE;
	}

995 996
	ip_vs_addr_copy(svc->af, &daddr, &udest->addr);

J
Julian Anastasov 已提交
997 998
	/* We use function that requires RCU lock */
	rcu_read_lock();
999
	dest = ip_vs_lookup_dest(svc, &daddr, dport);
J
Julian Anastasov 已提交
1000
	rcu_read_unlock();
1001

L
Linus Torvalds 已提交
1002
	if (dest == NULL) {
1003
		IP_VS_DBG(1, "%s(): dest doesn't exist\n", __func__);
L
Linus Torvalds 已提交
1004 1005 1006
		return -ENOENT;
	}

1007
	__ip_vs_update_dest(svc, dest, udest, 0);
L
Linus Torvalds 已提交
1008 1009 1010 1011 1012 1013 1014 1015
	LeaveFunction(2);

	return 0;
}

/*
 *	Delete a destination (must be already unlinked from the service)
 */
J
Julian Anastasov 已提交
1016 1017
static void __ip_vs_del_dest(struct net *net, struct ip_vs_dest *dest,
			     bool cleanup)
L
Linus Torvalds 已提交
1018
{
1019 1020
	struct netns_ipvs *ipvs = net_ipvs(net);

1021
	ip_vs_stop_estimator(net, &dest->stats);
L
Linus Torvalds 已提交
1022 1023 1024 1025 1026 1027

	/*
	 *  Remove it from the d-linked list with the real services.
	 */
	ip_vs_rs_unhash(dest);

J
Julian Anastasov 已提交
1028 1029 1030 1031 1032 1033
	spin_lock_bh(&ipvs->dest_trash_lock);
	IP_VS_DBG_BUF(3, "Moving dest %s:%u into trash, dest->refcnt=%d\n",
		      IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port),
		      atomic_read(&dest->refcnt));
	if (list_empty(&ipvs->dest_trash) && !cleanup)
		mod_timer(&ipvs->dest_trash_timer,
1034
			  jiffies + (IP_VS_DEST_TRASH_PERIOD >> 1));
J
Julian Anastasov 已提交
1035 1036
	/* dest lives in trash without reference */
	list_add(&dest->t_list, &ipvs->dest_trash);
1037
	dest->idle_start = 0;
J
Julian Anastasov 已提交
1038 1039
	spin_unlock_bh(&ipvs->dest_trash_lock);
	ip_vs_dest_put(dest);
L
Linus Torvalds 已提交
1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054
}


/*
 *	Unlink a destination from the given service
 */
static void __ip_vs_unlink_dest(struct ip_vs_service *svc,
				struct ip_vs_dest *dest,
				int svcupd)
{
	dest->flags &= ~IP_VS_DEST_F_AVAILABLE;

	/*
	 *  Remove it from the d-linked destination list.
	 */
J
Julian Anastasov 已提交
1055
	list_del_rcu(&dest->n_list);
L
Linus Torvalds 已提交
1056
	svc->num_dests--;
1057

J
Julian Anastasov 已提交
1058 1059
	if (svcupd) {
		struct ip_vs_scheduler *sched;
1060

J
Julian Anastasov 已提交
1061 1062 1063 1064
		sched = rcu_dereference_protected(svc->scheduler, 1);
		if (sched->del_dest)
			sched->del_dest(svc, dest);
	}
L
Linus Torvalds 已提交
1065 1066 1067 1068 1069 1070 1071
}


/*
 *	Delete a destination server in the given service
 */
static int
1072
ip_vs_del_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
L
Linus Torvalds 已提交
1073 1074
{
	struct ip_vs_dest *dest;
A
Al Viro 已提交
1075
	__be16 dport = udest->port;
L
Linus Torvalds 已提交
1076 1077 1078

	EnterFunction(2);

J
Julian Anastasov 已提交
1079 1080
	/* We use function that requires RCU lock */
	rcu_read_lock();
1081
	dest = ip_vs_lookup_dest(svc, &udest->addr, dport);
J
Julian Anastasov 已提交
1082
	rcu_read_unlock();
1083

L
Linus Torvalds 已提交
1084
	if (dest == NULL) {
1085
		IP_VS_DBG(1, "%s(): destination not found!\n", __func__);
L
Linus Torvalds 已提交
1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096
		return -ENOENT;
	}

	/*
	 *	Unlink dest from the service
	 */
	__ip_vs_unlink_dest(svc, dest, 1);

	/*
	 *	Delete the destination
	 */
J
Julian Anastasov 已提交
1097
	__ip_vs_del_dest(svc->net, dest, false);
L
Linus Torvalds 已提交
1098 1099 1100 1101 1102 1103

	LeaveFunction(2);

	return 0;
}

J
Julian Anastasov 已提交
1104 1105 1106 1107 1108
static void ip_vs_dest_trash_expire(unsigned long data)
{
	struct net *net = (struct net *) data;
	struct netns_ipvs *ipvs = net_ipvs(net);
	struct ip_vs_dest *dest, *next;
1109
	unsigned long now = jiffies;
J
Julian Anastasov 已提交
1110 1111 1112 1113 1114

	spin_lock(&ipvs->dest_trash_lock);
	list_for_each_entry_safe(dest, next, &ipvs->dest_trash, t_list) {
		if (atomic_read(&dest->refcnt) > 0)
			continue;
1115 1116 1117 1118 1119 1120 1121 1122
		if (dest->idle_start) {
			if (time_before(now, dest->idle_start +
					     IP_VS_DEST_TRASH_PERIOD))
				continue;
		} else {
			dest->idle_start = max(1UL, now);
			continue;
		}
J
Julian Anastasov 已提交
1123 1124
		IP_VS_DBG_BUF(3, "Removing destination %u/%s:%u from trash\n",
			      dest->vfwmark,
1125
			      IP_VS_DBG_ADDR(dest->af, &dest->addr),
J
Julian Anastasov 已提交
1126 1127 1128 1129 1130 1131
			      ntohs(dest->port));
		list_del(&dest->t_list);
		ip_vs_dest_free(dest);
	}
	if (!list_empty(&ipvs->dest_trash))
		mod_timer(&ipvs->dest_trash_timer,
1132
			  jiffies + (IP_VS_DEST_TRASH_PERIOD >> 1));
J
Julian Anastasov 已提交
1133 1134
	spin_unlock(&ipvs->dest_trash_lock);
}
L
Linus Torvalds 已提交
1135 1136 1137 1138 1139

/*
 *	Add a service into the service hash table
 */
static int
1140
ip_vs_add_service(struct net *net, struct ip_vs_service_user_kern *u,
1141
		  struct ip_vs_service **svc_p)
L
Linus Torvalds 已提交
1142
{
1143
	int ret = 0, i;
L
Linus Torvalds 已提交
1144
	struct ip_vs_scheduler *sched = NULL;
1145
	struct ip_vs_pe *pe = NULL;
L
Linus Torvalds 已提交
1146
	struct ip_vs_service *svc = NULL;
1147
	struct netns_ipvs *ipvs = net_ipvs(net);
L
Linus Torvalds 已提交
1148 1149 1150 1151 1152 1153 1154

	/* increase the module use count */
	ip_vs_use_count_inc();

	/* Lookup the scheduler by 'u->sched_name' */
	sched = ip_vs_scheduler_get(u->sched_name);
	if (sched == NULL) {
1155
		pr_info("Scheduler module ip_vs_%s not found\n", u->sched_name);
L
Linus Torvalds 已提交
1156
		ret = -ENOENT;
1157
		goto out_err;
L
Linus Torvalds 已提交
1158 1159
	}

1160
	if (u->pe_name && *u->pe_name) {
1161
		pe = ip_vs_pe_getbyname(u->pe_name);
1162 1163 1164 1165 1166 1167 1168 1169
		if (pe == NULL) {
			pr_info("persistence engine module ip_vs_pe_%s "
				"not found\n", u->pe_name);
			ret = -ENOENT;
			goto out_err;
		}
	}

1170
#ifdef CONFIG_IP_VS_IPV6
1171 1172 1173 1174 1175 1176 1177
	if (u->af == AF_INET6) {
		__u32 plen = (__force __u32) u->netmask;

		if (plen < 1 || plen > 128) {
			ret = -EINVAL;
			goto out_err;
		}
1178 1179 1180
	}
#endif

1181
	svc = kzalloc(sizeof(struct ip_vs_service), GFP_KERNEL);
L
Linus Torvalds 已提交
1182
	if (svc == NULL) {
1183
		IP_VS_DBG(1, "%s(): no memory\n", __func__);
L
Linus Torvalds 已提交
1184 1185 1186
		ret = -ENOMEM;
		goto out_err;
	}
1187
	svc->stats.cpustats = alloc_percpu(struct ip_vs_cpu_stats);
J
Julia Lawall 已提交
1188 1189
	if (!svc->stats.cpustats) {
		ret = -ENOMEM;
1190
		goto out_err;
J
Julia Lawall 已提交
1191
	}
L
Linus Torvalds 已提交
1192

1193 1194 1195 1196 1197 1198 1199
	for_each_possible_cpu(i) {
		struct ip_vs_cpu_stats *ip_vs_stats;
		ip_vs_stats = per_cpu_ptr(svc->stats.cpustats, i);
		u64_stats_init(&ip_vs_stats->syncp);
	}


L
Linus Torvalds 已提交
1200 1201 1202
	/* I'm the first user of the service */
	atomic_set(&svc->refcnt, 0);

1203
	svc->af = u->af;
L
Linus Torvalds 已提交
1204
	svc->protocol = u->protocol;
1205
	ip_vs_addr_copy(svc->af, &svc->addr, &u->addr);
L
Linus Torvalds 已提交
1206 1207 1208 1209 1210
	svc->port = u->port;
	svc->fwmark = u->fwmark;
	svc->flags = u->flags;
	svc->timeout = u->timeout * HZ;
	svc->netmask = u->netmask;
1211
	svc->net = net;
L
Linus Torvalds 已提交
1212 1213

	INIT_LIST_HEAD(&svc->destinations);
1214
	spin_lock_init(&svc->sched_lock);
L
Linus Torvalds 已提交
1215 1216 1217 1218 1219 1220 1221 1222
	spin_lock_init(&svc->stats.lock);

	/* Bind the scheduler */
	ret = ip_vs_bind_scheduler(svc, sched);
	if (ret)
		goto out_err;
	sched = NULL;

1223
	/* Bind the ct retriever */
J
Julian Anastasov 已提交
1224
	RCU_INIT_POINTER(svc->pe, pe);
1225 1226
	pe = NULL;

L
Linus Torvalds 已提交
1227 1228
	/* Update the virtual service counters */
	if (svc->port == FTPPORT)
1229
		atomic_inc(&ipvs->ftpsvc_counter);
L
Linus Torvalds 已提交
1230
	else if (svc->port == 0)
1231
		atomic_inc(&ipvs->nullsvc_counter);
L
Linus Torvalds 已提交
1232

1233
	ip_vs_start_estimator(net, &svc->stats);
1234 1235 1236

	/* Count only IPv4 services for old get/setsockopt interface */
	if (svc->af == AF_INET)
1237
		ipvs->num_services++;
L
Linus Torvalds 已提交
1238 1239 1240 1241 1242

	/* Hash the service into the service table */
	ip_vs_svc_hash(svc);

	*svc_p = svc;
1243 1244
	/* Now there is a service - full throttle */
	ipvs->enable = 1;
L
Linus Torvalds 已提交
1245 1246
	return 0;

1247

1248
 out_err:
L
Linus Torvalds 已提交
1249
	if (svc != NULL) {
J
Julian Anastasov 已提交
1250 1251
		ip_vs_unbind_scheduler(svc, sched);
		ip_vs_service_free(svc);
L
Linus Torvalds 已提交
1252 1253
	}
	ip_vs_scheduler_put(sched);
1254
	ip_vs_pe_put(pe);
L
Linus Torvalds 已提交
1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266

	/* decrease the module use count */
	ip_vs_use_count_dec();

	return ret;
}


/*
 *	Edit a service and bind it with a new scheduler
 */
static int
1267
ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u)
L
Linus Torvalds 已提交
1268 1269
{
	struct ip_vs_scheduler *sched, *old_sched;
1270
	struct ip_vs_pe *pe = NULL, *old_pe = NULL;
L
Linus Torvalds 已提交
1271 1272 1273 1274 1275 1276 1277
	int ret = 0;

	/*
	 * Lookup the scheduler, by 'u->sched_name'
	 */
	sched = ip_vs_scheduler_get(u->sched_name);
	if (sched == NULL) {
1278
		pr_info("Scheduler module ip_vs_%s not found\n", u->sched_name);
L
Linus Torvalds 已提交
1279 1280 1281 1282
		return -ENOENT;
	}
	old_sched = sched;

1283
	if (u->pe_name && *u->pe_name) {
1284
		pe = ip_vs_pe_getbyname(u->pe_name);
1285 1286 1287 1288 1289 1290 1291 1292 1293
		if (pe == NULL) {
			pr_info("persistence engine module ip_vs_pe_%s "
				"not found\n", u->pe_name);
			ret = -ENOENT;
			goto out;
		}
		old_pe = pe;
	}

1294
#ifdef CONFIG_IP_VS_IPV6
1295 1296 1297 1298 1299 1300 1301
	if (u->af == AF_INET6) {
		__u32 plen = (__force __u32) u->netmask;

		if (plen < 1 || plen > 128) {
			ret = -EINVAL;
			goto out;
		}
1302 1303 1304
	}
#endif

J
Julian Anastasov 已提交
1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315
	old_sched = rcu_dereference_protected(svc->scheduler, 1);
	if (sched != old_sched) {
		/* Bind the new scheduler */
		ret = ip_vs_bind_scheduler(svc, sched);
		if (ret) {
			old_sched = sched;
			goto out;
		}
		/* Unbind the old scheduler on success */
		ip_vs_unbind_scheduler(svc, old_sched);
	}
L
Linus Torvalds 已提交
1316 1317 1318 1319 1320 1321 1322 1323

	/*
	 * Set the flags and timeout value
	 */
	svc->flags = u->flags | IP_VS_SVC_F_HASHED;
	svc->timeout = u->timeout * HZ;
	svc->netmask = u->netmask;

J
Julian Anastasov 已提交
1324 1325 1326
	old_pe = rcu_dereference_protected(svc->pe, 1);
	if (pe != old_pe)
		rcu_assign_pointer(svc->pe, pe);
L
Linus Torvalds 已提交
1327

H
Hans Schillstrom 已提交
1328
out:
1329
	ip_vs_scheduler_put(old_sched);
1330
	ip_vs_pe_put(old_pe);
L
Linus Torvalds 已提交
1331 1332 1333 1334 1335 1336 1337 1338
	return ret;
}

/*
 *	Delete a service from the service list
 *	- The service must be unlinked, unlocked and not referenced!
 *	- We are called under _bh lock
 */
J
Julian Anastasov 已提交
1339
static void __ip_vs_del_service(struct ip_vs_service *svc, bool cleanup)
L
Linus Torvalds 已提交
1340 1341 1342
{
	struct ip_vs_dest *dest, *nxt;
	struct ip_vs_scheduler *old_sched;
1343
	struct ip_vs_pe *old_pe;
1344
	struct netns_ipvs *ipvs = net_ipvs(svc->net);
1345 1346

	pr_info("%s: enter\n", __func__);
L
Linus Torvalds 已提交
1347

1348 1349
	/* Count only IPv4 services for old get/setsockopt interface */
	if (svc->af == AF_INET)
1350
		ipvs->num_services--;
1351

1352
	ip_vs_stop_estimator(svc->net, &svc->stats);
L
Linus Torvalds 已提交
1353 1354

	/* Unbind scheduler */
J
Julian Anastasov 已提交
1355 1356
	old_sched = rcu_dereference_protected(svc->scheduler, 1);
	ip_vs_unbind_scheduler(svc, old_sched);
1357
	ip_vs_scheduler_put(old_sched);
L
Linus Torvalds 已提交
1358

J
Julian Anastasov 已提交
1359 1360
	/* Unbind persistence engine, keep svc->pe */
	old_pe = rcu_dereference_protected(svc->pe, 1);
1361 1362
	ip_vs_pe_put(old_pe);

L
Linus Torvalds 已提交
1363 1364 1365 1366 1367
	/*
	 *    Unlink the whole destination list
	 */
	list_for_each_entry_safe(dest, nxt, &svc->destinations, n_list) {
		__ip_vs_unlink_dest(svc, dest, 0);
J
Julian Anastasov 已提交
1368
		__ip_vs_del_dest(svc->net, dest, cleanup);
L
Linus Torvalds 已提交
1369 1370 1371 1372 1373 1374
	}

	/*
	 *    Update the virtual service counters
	 */
	if (svc->port == FTPPORT)
1375
		atomic_dec(&ipvs->ftpsvc_counter);
L
Linus Torvalds 已提交
1376
	else if (svc->port == 0)
1377
		atomic_dec(&ipvs->nullsvc_counter);
L
Linus Torvalds 已提交
1378 1379 1380 1381

	/*
	 *    Free the service if nobody refers to it
	 */
1382
	__ip_vs_svc_put(svc, true);
L
Linus Torvalds 已提交
1383 1384 1385 1386 1387 1388

	/* decrease the module use count */
	ip_vs_use_count_dec();
}

/*
1389
 * Unlink a service from list and try to delete it if its refcnt reached 0
L
Linus Torvalds 已提交
1390
 */
J
Julian Anastasov 已提交
1391
static void ip_vs_unlink_service(struct ip_vs_service *svc, bool cleanup)
L
Linus Torvalds 已提交
1392
{
J
Julian Anastasov 已提交
1393 1394
	/* Hold svc to avoid double release from dest_trash */
	atomic_inc(&svc->refcnt);
L
Linus Torvalds 已提交
1395 1396 1397 1398 1399
	/*
	 * Unhash it from the service table
	 */
	ip_vs_svc_unhash(svc);

J
Julian Anastasov 已提交
1400
	__ip_vs_del_service(svc, cleanup);
1401 1402 1403 1404 1405 1406 1407 1408 1409
}

/*
 *	Delete a service from the service list
 */
static int ip_vs_del_service(struct ip_vs_service *svc)
{
	if (svc == NULL)
		return -EEXIST;
J
Julian Anastasov 已提交
1410
	ip_vs_unlink_service(svc, false);
L
Linus Torvalds 已提交
1411 1412 1413 1414 1415 1416 1417 1418

	return 0;
}


/*
 *	Flush all the virtual services
 */
J
Julian Anastasov 已提交
1419
static int ip_vs_flush(struct net *net, bool cleanup)
L
Linus Torvalds 已提交
1420 1421
{
	int idx;
J
Julian Anastasov 已提交
1422 1423
	struct ip_vs_service *svc;
	struct hlist_node *n;
L
Linus Torvalds 已提交
1424 1425

	/*
1426
	 * Flush the service table hashed by <netns,protocol,addr,port>
L
Linus Torvalds 已提交
1427 1428
	 */
	for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
J
Julian Anastasov 已提交
1429 1430
		hlist_for_each_entry_safe(svc, n, &ip_vs_svc_table[idx],
					  s_list) {
1431
			if (net_eq(svc->net, net))
J
Julian Anastasov 已提交
1432
				ip_vs_unlink_service(svc, cleanup);
L
Linus Torvalds 已提交
1433 1434 1435 1436 1437 1438 1439
		}
	}

	/*
	 * Flush the service table hashed by fwmark
	 */
	for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
J
Julian Anastasov 已提交
1440 1441
		hlist_for_each_entry_safe(svc, n, &ip_vs_svc_fwm_table[idx],
					  f_list) {
1442
			if (net_eq(svc->net, net))
J
Julian Anastasov 已提交
1443
				ip_vs_unlink_service(svc, cleanup);
L
Linus Torvalds 已提交
1444 1445 1446 1447 1448 1449
		}
	}

	return 0;
}

1450 1451 1452 1453
/*
 *	Delete service by {netns} in the service table.
 *	Called by __ip_vs_cleanup()
 */
1454
void ip_vs_service_net_cleanup(struct net *net)
1455 1456 1457 1458
{
	EnterFunction(2);
	/* Check for "full" addressed entries */
	mutex_lock(&__ip_vs_mutex);
J
Julian Anastasov 已提交
1459
	ip_vs_flush(net, true);
1460 1461 1462
	mutex_unlock(&__ip_vs_mutex);
	LeaveFunction(2);
}
1463 1464

/* Put all references for device (dst_cache) */
1465
static inline void
1466
ip_vs_forget_dev(struct ip_vs_dest *dest, struct net_device *dev)
1467
{
1468 1469
	struct ip_vs_dest_dst *dest_dst;

1470
	spin_lock_bh(&dest->dst_lock);
1471 1472
	dest_dst = rcu_dereference_protected(dest->dest_dst, 1);
	if (dest_dst && dest_dst->dst_cache->dev == dev) {
1473 1474 1475 1476 1477
		IP_VS_DBG_BUF(3, "Reset dev:%s dest %s:%u ,dest->refcnt=%d\n",
			      dev->name,
			      IP_VS_DBG_ADDR(dest->af, &dest->addr),
			      ntohs(dest->port),
			      atomic_read(&dest->refcnt));
1478
		__ip_vs_dst_cache_reset(dest);
1479 1480 1481 1482
	}
	spin_unlock_bh(&dest->dst_lock);

}
1483 1484
/* Netdev event receiver
 * Currently only NETDEV_DOWN is handled to release refs to cached dsts
1485 1486
 */
static int ip_vs_dst_event(struct notifier_block *this, unsigned long event,
1487
			   void *ptr)
1488
{
1489
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
1490
	struct net *net = dev_net(dev);
1491
	struct netns_ipvs *ipvs = net_ipvs(net);
1492 1493 1494 1495
	struct ip_vs_service *svc;
	struct ip_vs_dest *dest;
	unsigned int idx;

1496
	if (event != NETDEV_DOWN || !ipvs)
1497 1498 1499 1500 1501
		return NOTIFY_DONE;
	IP_VS_DBG(3, "%s() dev=%s\n", __func__, dev->name);
	EnterFunction(2);
	mutex_lock(&__ip_vs_mutex);
	for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
J
Julian Anastasov 已提交
1502
		hlist_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
1503 1504 1505
			if (net_eq(svc->net, net)) {
				list_for_each_entry(dest, &svc->destinations,
						    n_list) {
1506
					ip_vs_forget_dev(dest, dev);
1507 1508 1509 1510
				}
			}
		}

J
Julian Anastasov 已提交
1511
		hlist_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
1512 1513 1514
			if (net_eq(svc->net, net)) {
				list_for_each_entry(dest, &svc->destinations,
						    n_list) {
1515
					ip_vs_forget_dev(dest, dev);
1516 1517 1518 1519 1520 1521
				}
			}

		}
	}

J
Julian Anastasov 已提交
1522 1523
	spin_lock_bh(&ipvs->dest_trash_lock);
	list_for_each_entry(dest, &ipvs->dest_trash, t_list) {
1524
		ip_vs_forget_dev(dest, dev);
1525
	}
J
Julian Anastasov 已提交
1526
	spin_unlock_bh(&ipvs->dest_trash_lock);
1527 1528 1529 1530
	mutex_unlock(&__ip_vs_mutex);
	LeaveFunction(2);
	return NOTIFY_DONE;
}
L
Linus Torvalds 已提交
1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545

/*
 *	Zero counters in a service or all services
 */
static int ip_vs_zero_service(struct ip_vs_service *svc)
{
	struct ip_vs_dest *dest;

	list_for_each_entry(dest, &svc->destinations, n_list) {
		ip_vs_zero_stats(&dest->stats);
	}
	ip_vs_zero_stats(&svc->stats);
	return 0;
}

1546
static int ip_vs_zero_all(struct net *net)
L
Linus Torvalds 已提交
1547 1548 1549 1550 1551
{
	int idx;
	struct ip_vs_service *svc;

	for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
J
Julian Anastasov 已提交
1552
		hlist_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
1553 1554
			if (net_eq(svc->net, net))
				ip_vs_zero_service(svc);
L
Linus Torvalds 已提交
1555 1556 1557 1558
		}
	}

	for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
J
Julian Anastasov 已提交
1559
		hlist_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
1560 1561
			if (net_eq(svc->net, net))
				ip_vs_zero_service(svc);
L
Linus Torvalds 已提交
1562 1563 1564
		}
	}

J
Julian Anastasov 已提交
1565
	ip_vs_zero_stats(&net_ipvs(net)->tot_stats);
L
Linus Torvalds 已提交
1566 1567 1568
	return 0;
}

1569
#ifdef CONFIG_SYSCTL
1570 1571 1572 1573

static int zero;
static int three = 3;

L
Linus Torvalds 已提交
1574
static int
1575
proc_do_defense_mode(struct ctl_table *table, int write,
L
Linus Torvalds 已提交
1576 1577
		     void __user *buffer, size_t *lenp, loff_t *ppos)
{
1578
	struct net *net = current->nsproxy->net_ns;
L
Linus Torvalds 已提交
1579 1580 1581 1582
	int *valp = table->data;
	int val = *valp;
	int rc;

1583
	rc = proc_dointvec(table, write, buffer, lenp, ppos);
L
Linus Torvalds 已提交
1584 1585 1586 1587 1588
	if (write && (*valp != val)) {
		if ((*valp < 0) || (*valp > 3)) {
			/* Restore the correct value */
			*valp = val;
		} else {
1589
			update_defense_level(net_ipvs(net));
L
Linus Torvalds 已提交
1590 1591 1592 1593 1594 1595
		}
	}
	return rc;
}

static int
1596
proc_do_sync_threshold(struct ctl_table *table, int write,
L
Linus Torvalds 已提交
1597 1598 1599 1600 1601 1602 1603 1604 1605
		       void __user *buffer, size_t *lenp, loff_t *ppos)
{
	int *valp = table->data;
	int val[2];
	int rc;

	/* backup the value first */
	memcpy(val, valp, sizeof(val));

1606
	rc = proc_dointvec(table, write, buffer, lenp, ppos);
1607 1608
	if (write && (valp[0] < 0 || valp[1] < 0 ||
	    (valp[0] >= valp[1] && valp[1]))) {
L
Linus Torvalds 已提交
1609 1610 1611 1612 1613 1614
		/* Restore the correct value */
		memcpy(valp, val, sizeof(val));
	}
	return rc;
}

1615
static int
1616
proc_do_sync_mode(struct ctl_table *table, int write,
1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627
		     void __user *buffer, size_t *lenp, loff_t *ppos)
{
	int *valp = table->data;
	int val = *valp;
	int rc;

	rc = proc_dointvec(table, write, buffer, lenp, ppos);
	if (write && (*valp != val)) {
		if ((*valp < 0) || (*valp > 1)) {
			/* Restore the correct value */
			*valp = val;
1628 1629 1630 1631 1632 1633
		}
	}
	return rc;
}

static int
1634
proc_do_sync_ports(struct ctl_table *table, int write,
1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645
		   void __user *buffer, size_t *lenp, loff_t *ppos)
{
	int *valp = table->data;
	int val = *valp;
	int rc;

	rc = proc_dointvec(table, write, buffer, lenp, ppos);
	if (write && (*valp != val)) {
		if (*valp < 1 || !is_power_of_2(*valp)) {
			/* Restore the correct value */
			*valp = val;
1646 1647 1648 1649
		}
	}
	return rc;
}
L
Linus Torvalds 已提交
1650 1651 1652

/*
 *	IPVS sysctl table (under the /proc/sys/net/ipv4/vs/)
1653
 *	Do not change order or insert new entries without
1654
 *	align with netns init in ip_vs_control_net_init()
L
Linus Torvalds 已提交
1655 1656 1657 1658 1659 1660 1661
 */

static struct ctl_table vs_vars[] = {
	{
		.procname	= "amemthresh",
		.maxlen		= sizeof(int),
		.mode		= 0644,
A
Alexey Dobriyan 已提交
1662
		.proc_handler	= proc_dointvec,
L
Linus Torvalds 已提交
1663 1664 1665 1666 1667
	},
	{
		.procname	= "am_droprate",
		.maxlen		= sizeof(int),
		.mode		= 0644,
A
Alexey Dobriyan 已提交
1668
		.proc_handler	= proc_dointvec,
L
Linus Torvalds 已提交
1669 1670 1671 1672 1673
	},
	{
		.procname	= "drop_entry",
		.maxlen		= sizeof(int),
		.mode		= 0644,
A
Alexey Dobriyan 已提交
1674
		.proc_handler	= proc_do_defense_mode,
L
Linus Torvalds 已提交
1675 1676 1677 1678 1679
	},
	{
		.procname	= "drop_packet",
		.maxlen		= sizeof(int),
		.mode		= 0644,
A
Alexey Dobriyan 已提交
1680
		.proc_handler	= proc_do_defense_mode,
L
Linus Torvalds 已提交
1681
	},
1682 1683 1684 1685 1686 1687 1688 1689
#ifdef CONFIG_IP_VS_NFCT
	{
		.procname	= "conntrack",
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
#endif
L
Linus Torvalds 已提交
1690 1691 1692 1693
	{
		.procname	= "secure_tcp",
		.maxlen		= sizeof(int),
		.mode		= 0644,
A
Alexey Dobriyan 已提交
1694
		.proc_handler	= proc_do_defense_mode,
L
Linus Torvalds 已提交
1695
	},
1696 1697 1698 1699 1700 1701
	{
		.procname	= "snat_reroute",
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
1702 1703 1704 1705 1706 1707
	{
		.procname	= "sync_version",
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_do_sync_mode,
	},
1708 1709 1710 1711 1712 1713
	{
		.procname	= "sync_ports",
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_do_sync_ports,
	},
1714 1715 1716 1717 1718 1719
	{
		.procname	= "sync_persist_mode",
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
P
Pablo Neira Ayuso 已提交
1720 1721
	{
		.procname	= "sync_qlen_max",
1722
		.maxlen		= sizeof(unsigned long),
P
Pablo Neira Ayuso 已提交
1723
		.mode		= 0644,
1724
		.proc_handler	= proc_doulongvec_minmax,
P
Pablo Neira Ayuso 已提交
1725 1726 1727 1728 1729 1730 1731
	},
	{
		.procname	= "sync_sock_size",
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743
	{
		.procname	= "cache_bypass",
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "expire_nodest_conn",
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
A
Alexander Frolkin 已提交
1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755
	{
		.procname	= "sloppy_tcp",
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "sloppy_sctp",
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768
	{
		.procname	= "expire_quiescent_template",
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "sync_threshold",
		.maxlen		=
			sizeof(((struct netns_ipvs *)0)->sysctl_sync_threshold),
		.mode		= 0644,
		.proc_handler	= proc_do_sync_threshold,
	},
1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782
	{
		.procname	= "sync_refresh_period",
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "sync_retries",
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= &zero,
		.extra2		= &three,
	},
1783 1784 1785 1786 1787 1788
	{
		.procname	= "nat_icmp_send",
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
1789 1790 1791 1792 1793 1794
	{
		.procname	= "pmtu_disc",
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
1795 1796 1797 1798 1799 1800
	{
		.procname	= "backup_only",
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
1801 1802 1803 1804 1805 1806 1807 1808
#ifdef CONFIG_IP_VS_DEBUG
	{
		.procname	= "debug_level",
		.data		= &sysctl_ip_vs_debug_level,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
L
Linus Torvalds 已提交
1809
#endif
1810
	{ }
L
Linus Torvalds 已提交
1811 1812
};

1813
#endif
L
Linus Torvalds 已提交
1814 1815 1816 1817

#ifdef CONFIG_PROC_FS

struct ip_vs_iter {
1818
	struct seq_net_private p;  /* Do not move this, netns depends upon it*/
J
Julian Anastasov 已提交
1819
	struct hlist_head *table;
L
Linus Torvalds 已提交
1820 1821 1822 1823 1824 1825 1826
	int bucket;
};

/*
 *	Write the contents of the VS rule table to a PROCfs file.
 *	(It is kept just for backward compatibility)
 */
1827
static inline const char *ip_vs_fwd_name(unsigned int flags)
L
Linus Torvalds 已提交
1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844
{
	switch (flags & IP_VS_CONN_F_FWD_MASK) {
	case IP_VS_CONN_F_LOCALNODE:
		return "Local";
	case IP_VS_CONN_F_TUNNEL:
		return "Tunnel";
	case IP_VS_CONN_F_DROUTE:
		return "Route";
	default:
		return "Masq";
	}
}


/* Get the Nth entry in the two lists
 *
 * Counts only services that belong to the seq file's netns, scanning the
 * <protocol,addr,port> table first and the fwmark table second. On a hit
 * the iterator's table and bucket are recorded. Returns NULL when fewer
 * than @pos + 1 matching services exist.
 */
static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos)
{
	struct net *net = seq_file_net(seq);
	struct ip_vs_iter *iter = seq->private;
	struct ip_vs_service *svc;
	int bucket;

	/* look in hash by protocol */
	for (bucket = 0; bucket < IP_VS_SVC_TAB_SIZE; bucket++) {
		hlist_for_each_entry_rcu(svc, &ip_vs_svc_table[bucket],
					 s_list) {
			if (!net_eq(svc->net, net))
				continue;
			if (pos-- != 0)
				continue;
			iter->table = ip_vs_svc_table;
			iter->bucket = bucket;
			return svc;
		}
	}

	/* keep looking in fwmark */
	for (bucket = 0; bucket < IP_VS_SVC_TAB_SIZE; bucket++) {
		hlist_for_each_entry_rcu(svc, &ip_vs_svc_fwm_table[bucket],
					 f_list) {
			if (!net_eq(svc->net, net))
				continue;
			if (pos-- != 0)
				continue;
			iter->table = ip_vs_svc_fwm_table;
			iter->bucket = bucket;
			return svc;
		}
	}

	return NULL;
}

/*
 * seq_file ->start(): takes the RCU read lock (released in ->stop()) and
 * returns SEQ_START_TOKEN for position 0, else the (*pos - 1)-th service.
 */
static void *ip_vs_info_seq_start(struct seq_file *seq, loff_t *pos)
	__acquires(RCU)
{
	rcu_read_lock();

	if (!*pos)
		return SEQ_START_TOKEN;

	return ip_vs_info_array(seq, *pos - 1);
}


/*
 * seq_file ->next(): advance to the service after @v.
 *
 * Follows the current hash chain first, then the remaining buckets of the
 * <protocol,addr,port> table, then the fwmark table. Returns NULL after
 * the last bucket of the fwmark table.
 */
static void *ip_vs_info_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct ip_vs_iter *iter;
	struct ip_vs_service *svc;
	struct hlist_node *node;

	++*pos;
	if (v == SEQ_START_TOKEN)
		return ip_vs_info_array(seq,0);

	svc = v;
	iter = seq->private;

	if (iter->table == ip_vs_svc_table) {
		/* next service in table hashed by protocol */
		node = rcu_dereference(hlist_next_rcu(&svc->s_list));
		if (node)
			return hlist_entry(node, struct ip_vs_service, s_list);

		/* first entry of the next non-empty bucket */
		while (++iter->bucket < IP_VS_SVC_TAB_SIZE) {
			hlist_for_each_entry_rcu(svc,
						 &ip_vs_svc_table[iter->bucket],
						 s_list) {
				return svc;
			}
		}

		/* table exhausted: restart from bucket 0 of the fwmark table */
		iter->table = ip_vs_svc_fwm_table;
		iter->bucket = -1;
	} else {
		/* next service in hashed by fwmark */
		node = rcu_dereference(hlist_next_rcu(&svc->f_list));
		if (node)
			return hlist_entry(node, struct ip_vs_service, f_list);
	}

	while (++iter->bucket < IP_VS_SVC_TAB_SIZE) {
		hlist_for_each_entry_rcu(svc,
					 &ip_vs_svc_fwm_table[iter->bucket],
					 f_list)
			return svc;
	}

	return NULL;
}

static void ip_vs_info_seq_stop(struct seq_file *seq, void *v)
1933
	__releases(RCU)
L
Linus Torvalds 已提交
1934
{
J
Julian Anastasov 已提交
1935
	rcu_read_unlock();
L
Linus Torvalds 已提交
1936 1937 1938 1939 1940 1941 1942 1943
}


static int ip_vs_info_seq_show(struct seq_file *seq, void *v)
{
	if (v == SEQ_START_TOKEN) {
		seq_printf(seq,
			"IP Virtual Server version %d.%d.%d (size=%d)\n",
1944
			NVERSION(IP_VS_VERSION_CODE), ip_vs_conn_tab_size);
L
Linus Torvalds 已提交
1945 1946 1947 1948 1949 1950 1951 1952
		seq_puts(seq,
			 "Prot LocalAddress:Port Scheduler Flags\n");
		seq_puts(seq,
			 "  -> RemoteAddress:Port Forward Weight ActiveConn InActConn\n");
	} else {
		const struct ip_vs_service *svc = v;
		const struct ip_vs_iter *iter = seq->private;
		const struct ip_vs_dest *dest;
J
Julian Anastasov 已提交
1953
		struct ip_vs_scheduler *sched = rcu_dereference(svc->scheduler);
L
Linus Torvalds 已提交
1954

1955 1956 1957
		if (iter->table == ip_vs_svc_table) {
#ifdef CONFIG_IP_VS_IPV6
			if (svc->af == AF_INET6)
H
Harvey Harrison 已提交
1958
				seq_printf(seq, "%s  [%pI6]:%04X %s ",
1959
					   ip_vs_proto_name(svc->protocol),
1960
					   &svc->addr.in6,
1961
					   ntohs(svc->port),
J
Julian Anastasov 已提交
1962
					   sched->name);
1963 1964
			else
#endif
N
Nick Chalk 已提交
1965
				seq_printf(seq, "%s  %08X:%04X %s %s ",
1966 1967 1968
					   ip_vs_proto_name(svc->protocol),
					   ntohl(svc->addr.ip),
					   ntohs(svc->port),
J
Julian Anastasov 已提交
1969
					   sched->name,
N
Nick Chalk 已提交
1970
					   (svc->flags & IP_VS_SVC_F_ONEPACKET)?"ops ":"");
1971
		} else {
N
Nick Chalk 已提交
1972
			seq_printf(seq, "FWM  %08X %s %s",
J
Julian Anastasov 已提交
1973
				   svc->fwmark, sched->name,
N
Nick Chalk 已提交
1974
				   (svc->flags & IP_VS_SVC_F_ONEPACKET)?"ops ":"");
1975
		}
L
Linus Torvalds 已提交
1976 1977 1978 1979 1980 1981 1982 1983

		if (svc->flags & IP_VS_SVC_F_PERSISTENT)
			seq_printf(seq, "persistent %d %08X\n",
				svc->timeout,
				ntohl(svc->netmask));
		else
			seq_putc(seq, '\n');

J
Julian Anastasov 已提交
1984
		list_for_each_entry_rcu(dest, &svc->destinations, n_list) {
1985 1986 1987
#ifdef CONFIG_IP_VS_IPV6
			if (dest->af == AF_INET6)
				seq_printf(seq,
H
Harvey Harrison 已提交
1988
					   "  -> [%pI6]:%04X"
1989
					   "      %-7s %-6d %-10d %-10d\n",
1990
					   &dest->addr.in6,
1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007
					   ntohs(dest->port),
					   ip_vs_fwd_name(atomic_read(&dest->conn_flags)),
					   atomic_read(&dest->weight),
					   atomic_read(&dest->activeconns),
					   atomic_read(&dest->inactconns));
			else
#endif
				seq_printf(seq,
					   "  -> %08X:%04X      "
					   "%-7s %-6d %-10d %-10d\n",
					   ntohl(dest->addr.ip),
					   ntohs(dest->port),
					   ip_vs_fwd_name(atomic_read(&dest->conn_flags)),
					   atomic_read(&dest->weight),
					   atomic_read(&dest->activeconns),
					   atomic_read(&dest->inactconns));

L
Linus Torvalds 已提交
2008 2009 2010 2011 2012
		}
	}
	return 0;
}

2013
static const struct seq_operations ip_vs_info_seq_ops = {
L
Linus Torvalds 已提交
2014 2015 2016 2017 2018 2019 2020 2021
	.start = ip_vs_info_seq_start,
	.next  = ip_vs_info_seq_next,
	.stop  = ip_vs_info_seq_stop,
	.show  = ip_vs_info_seq_show,
};

static int ip_vs_info_open(struct inode *inode, struct file *file)
{
2022
	return seq_open_net(inode, file, &ip_vs_info_seq_ops,
2023
			sizeof(struct ip_vs_iter));
L
Linus Torvalds 已提交
2024 2025
}

2026
static const struct file_operations ip_vs_info_fops = {
L
Linus Torvalds 已提交
2027 2028 2029 2030
	.owner	 = THIS_MODULE,
	.open    = ip_vs_info_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
2031
	.release = seq_release_net,
L
Linus Torvalds 已提交
2032 2033 2034 2035
};

static int ip_vs_stats_show(struct seq_file *seq, void *v)
{
2036
	struct net *net = seq_file_single_net(seq);
2037
	struct ip_vs_stats_user show;
L
Linus Torvalds 已提交
2038 2039 2040 2041 2042 2043 2044

/*               01234567 01234567 01234567 0123456701234567 0123456701234567 */
	seq_puts(seq,
		 "   Total Incoming Outgoing         Incoming         Outgoing\n");
	seq_printf(seq,
		   "   Conns  Packets  Packets            Bytes            Bytes\n");

2045 2046 2047 2048 2049
	ip_vs_copy_stats(&show, &net_ipvs(net)->tot_stats);
	seq_printf(seq, "%8X %8X %8X %16LX %16LX\n\n", show.conns,
		   show.inpkts, show.outpkts,
		   (unsigned long long) show.inbytes,
		   (unsigned long long) show.outbytes);
L
Linus Torvalds 已提交
2050 2051 2052 2053

/*                 01234567 01234567 01234567 0123456701234567 0123456701234567 */
	seq_puts(seq,
		   " Conns/s   Pkts/s   Pkts/s          Bytes/s          Bytes/s\n");
2054 2055 2056
	seq_printf(seq, "%8X %8X %8X %16X %16X\n",
			show.cps, show.inpps, show.outpps,
			show.inbps, show.outbps);
L
Linus Torvalds 已提交
2057 2058 2059 2060 2061 2062

	return 0;
}

static int ip_vs_stats_seq_open(struct inode *inode, struct file *file)
{
2063
	return single_open_net(inode, file, ip_vs_stats_show);
L
Linus Torvalds 已提交
2064 2065
}

2066
static const struct file_operations ip_vs_stats_fops = {
L
Linus Torvalds 已提交
2067 2068 2069 2070
	.owner = THIS_MODULE,
	.open = ip_vs_stats_seq_open,
	.read = seq_read,
	.llseek = seq_lseek,
2071
	.release = single_release_net,
L
Linus Torvalds 已提交
2072 2073
};

2074 2075 2076
static int ip_vs_stats_percpu_show(struct seq_file *seq, void *v)
{
	struct net *net = seq_file_single_net(seq);
J
Julian Anastasov 已提交
2077
	struct ip_vs_stats *tot_stats = &net_ipvs(net)->tot_stats;
2078
	struct ip_vs_cpu_stats __percpu *cpustats = tot_stats->cpustats;
J
Julian Anastasov 已提交
2079
	struct ip_vs_stats_user rates;
2080 2081 2082 2083 2084 2085 2086 2087 2088
	int i;

/*               01234567 01234567 01234567 0123456701234567 0123456701234567 */
	seq_puts(seq,
		 "       Total Incoming Outgoing         Incoming         Outgoing\n");
	seq_printf(seq,
		   "CPU    Conns  Packets  Packets            Bytes            Bytes\n");

	for_each_possible_cpu(i) {
J
Julian Anastasov 已提交
2089 2090 2091 2092 2093
		struct ip_vs_cpu_stats *u = per_cpu_ptr(cpustats, i);
		unsigned int start;
		__u64 inbytes, outbytes;

		do {
2094
			start = u64_stats_fetch_begin_irq(&u->syncp);
J
Julian Anastasov 已提交
2095 2096
			inbytes = u->ustats.inbytes;
			outbytes = u->ustats.outbytes;
2097
		} while (u64_stats_fetch_retry_irq(&u->syncp, start));
J
Julian Anastasov 已提交
2098

2099
		seq_printf(seq, "%3X %8X %8X %8X %16LX %16LX\n",
J
Julian Anastasov 已提交
2100 2101 2102
			   i, u->ustats.conns, u->ustats.inpkts,
			   u->ustats.outpkts, (__u64)inbytes,
			   (__u64)outbytes);
2103 2104 2105
	}

	spin_lock_bh(&tot_stats->lock);
J
Julian Anastasov 已提交
2106

2107 2108 2109 2110 2111 2112
	seq_printf(seq, "  ~ %8X %8X %8X %16LX %16LX\n\n",
		   tot_stats->ustats.conns, tot_stats->ustats.inpkts,
		   tot_stats->ustats.outpkts,
		   (unsigned long long) tot_stats->ustats.inbytes,
		   (unsigned long long) tot_stats->ustats.outbytes);

J
Julian Anastasov 已提交
2113 2114 2115 2116
	ip_vs_read_estimator(&rates, tot_stats);

	spin_unlock_bh(&tot_stats->lock);

2117 2118 2119 2120
/*                 01234567 01234567 01234567 0123456701234567 0123456701234567 */
	seq_puts(seq,
		   "     Conns/s   Pkts/s   Pkts/s          Bytes/s          Bytes/s\n");
	seq_printf(seq, "    %8X %8X %8X %16X %16X\n",
J
Julian Anastasov 已提交
2121 2122 2123 2124 2125
			rates.cps,
			rates.inpps,
			rates.outpps,
			rates.inbps,
			rates.outbps);
2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139

	return 0;
}

/* open() handler for the per-CPU stats file: single-shot seq_file */
static int ip_vs_stats_percpu_seq_open(struct inode *inode, struct file *file)
{
	int err = single_open_net(inode, file, ip_vs_stats_percpu_show);

	return err;
}

static const struct file_operations ip_vs_stats_percpu_fops = {
	.owner = THIS_MODULE,
	.open = ip_vs_stats_percpu_seq_open,
	.read = seq_read,
	.llseek = seq_lseek,
2140
	.release = single_release_net,
2141
};
L
Linus Torvalds 已提交
2142 2143 2144 2145 2146
#endif

/*
 *	Set timeout values for tcp tcpfin udp in the timeout_table.
 */
2147
static int ip_vs_set_timeout(struct net *net, struct ip_vs_timeout_user *u)
L
Linus Torvalds 已提交
2148
{
2149
#if defined(CONFIG_IP_VS_PROTO_TCP) || defined(CONFIG_IP_VS_PROTO_UDP)
2150
	struct ip_vs_proto_data *pd;
2151
#endif
2152

L
Linus Torvalds 已提交
2153 2154 2155 2156 2157 2158 2159
	IP_VS_DBG(2, "Setting timeout tcp:%d tcpfin:%d udp:%d\n",
		  u->tcp_timeout,
		  u->tcp_fin_timeout,
		  u->udp_timeout);

#ifdef CONFIG_IP_VS_PROTO_TCP
	if (u->tcp_timeout) {
2160 2161
		pd = ip_vs_proto_data_get(net, IPPROTO_TCP);
		pd->timeout_table[IP_VS_TCP_S_ESTABLISHED]
L
Linus Torvalds 已提交
2162 2163 2164 2165
			= u->tcp_timeout * HZ;
	}

	if (u->tcp_fin_timeout) {
2166 2167
		pd = ip_vs_proto_data_get(net, IPPROTO_TCP);
		pd->timeout_table[IP_VS_TCP_S_FIN_WAIT]
L
Linus Torvalds 已提交
2168 2169 2170 2171 2172 2173
			= u->tcp_fin_timeout * HZ;
	}
#endif

#ifdef CONFIG_IP_VS_PROTO_UDP
	if (u->udp_timeout) {
2174 2175
		pd = ip_vs_proto_data_get(net, IPPROTO_UDP);
		pd->timeout_table[IP_VS_UDP_S_NORMAL]
L
Linus Torvalds 已提交
2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190
			= u->udp_timeout * HZ;
	}
#endif
	return 0;
}


/*
 * Index of a set command in set_arglen[], and the argument lengths each
 * kind of set command must be called with. The macro argument is
 * parenthesized so that expression arguments expand correctly.
 */
#define SET_CMDID(cmd)		((cmd) - IP_VS_BASE_CTL)
#define SERVICE_ARG_LEN		(sizeof(struct ip_vs_service_user))
#define SVCDEST_ARG_LEN		(sizeof(struct ip_vs_service_user) +	\
				 sizeof(struct ip_vs_dest_user))
#define TIMEOUT_ARG_LEN		(sizeof(struct ip_vs_timeout_user))
#define DAEMON_ARG_LEN		(sizeof(struct ip_vs_daemon_user))
#define MAX_ARG_LEN		SVCDEST_ARG_LEN

2191
static const unsigned char set_arglen[SET_CMDID(IP_VS_SO_SET_MAX)+1] = {
L
Linus Torvalds 已提交
2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204
	[SET_CMDID(IP_VS_SO_SET_ADD)]		= SERVICE_ARG_LEN,
	[SET_CMDID(IP_VS_SO_SET_EDIT)]		= SERVICE_ARG_LEN,
	[SET_CMDID(IP_VS_SO_SET_DEL)]		= SERVICE_ARG_LEN,
	[SET_CMDID(IP_VS_SO_SET_FLUSH)]		= 0,
	[SET_CMDID(IP_VS_SO_SET_ADDDEST)]	= SVCDEST_ARG_LEN,
	[SET_CMDID(IP_VS_SO_SET_DELDEST)]	= SVCDEST_ARG_LEN,
	[SET_CMDID(IP_VS_SO_SET_EDITDEST)]	= SVCDEST_ARG_LEN,
	[SET_CMDID(IP_VS_SO_SET_TIMEOUT)]	= TIMEOUT_ARG_LEN,
	[SET_CMDID(IP_VS_SO_SET_STARTDAEMON)]	= DAEMON_ARG_LEN,
	[SET_CMDID(IP_VS_SO_SET_STOPDAEMON)]	= DAEMON_ARG_LEN,
	[SET_CMDID(IP_VS_SO_SET_ZERO)]		= SERVICE_ARG_LEN,
};

2205 2206 2207
static void ip_vs_copy_usvc_compat(struct ip_vs_service_user_kern *usvc,
				  struct ip_vs_service_user *usvc_compat)
{
2208 2209
	memset(usvc, 0, sizeof(*usvc));

2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226
	usvc->af		= AF_INET;
	usvc->protocol		= usvc_compat->protocol;
	usvc->addr.ip		= usvc_compat->addr;
	usvc->port		= usvc_compat->port;
	usvc->fwmark		= usvc_compat->fwmark;

	/* Deep copy of sched_name is not needed here */
	usvc->sched_name	= usvc_compat->sched_name;

	usvc->flags		= usvc_compat->flags;
	usvc->timeout		= usvc_compat->timeout;
	usvc->netmask		= usvc_compat->netmask;
}

static void ip_vs_copy_udest_compat(struct ip_vs_dest_user_kern *udest,
				   struct ip_vs_dest_user *udest_compat)
{
2227 2228
	memset(udest, 0, sizeof(*udest));

2229 2230 2231 2232 2233 2234 2235 2236
	udest->addr.ip		= udest_compat->addr;
	udest->port		= udest_compat->port;
	udest->conn_flags	= udest_compat->conn_flags;
	udest->weight		= udest_compat->weight;
	udest->u_threshold	= udest_compat->u_threshold;
	udest->l_threshold	= udest_compat->l_threshold;
}

L
Linus Torvalds 已提交
2237 2238 2239
/*
 * setsockopt() handler for the IPVS sockopt control interface.
 *
 * @sk:   socket the option was set on; selects the net namespace
 * @cmd:  one of the IP_VS_SO_SET_* commands
 * @user: userspace argument buffer
 * @len:  length of @user; must equal set_arglen[] for @cmd
 *
 * Requires CAP_NET_ADMIN in the namespace's user_ns. Sync daemon commands
 * run under ipvs->sync_mutex, all others under __ip_vs_mutex.
 *
 * Strings embedded in the argument come straight from userspace, so they
 * are checked for NUL termination before being used as C strings.
 *
 * Returns 0 on success or a negative errno.
 */
static int
do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
{
	struct net *net = sock_net(sk);
	int ret;
	unsigned char arg[MAX_ARG_LEN];
	struct ip_vs_service_user *usvc_compat;
	struct ip_vs_service_user_kern usvc;
	struct ip_vs_service *svc;
	struct ip_vs_dest_user *udest_compat;
	struct ip_vs_dest_user_kern udest;
	struct netns_ipvs *ipvs = net_ipvs(net);

	if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
		return -EPERM;

	if (cmd < IP_VS_BASE_CTL || cmd > IP_VS_SO_SET_MAX)
		return -EINVAL;
	/* len is unsigned, so only the upper bound can be violated */
	if (len > MAX_ARG_LEN)
		return -EINVAL;
	if (len != set_arglen[SET_CMDID(cmd)]) {
		pr_err("set_ctl: len %u != %u\n",
		       len, set_arglen[SET_CMDID(cmd)]);
		return -EINVAL;
	}

	if (copy_from_user(arg, user, len) != 0)
		return -EFAULT;

	/* increase the module use count */
	ip_vs_use_count_inc();

	/* Handle daemons since they have another lock */
	if (cmd == IP_VS_SO_SET_STARTDAEMON ||
	    cmd == IP_VS_SO_SET_STOPDAEMON) {
		struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg;

		/* The interface name is handed on as a C string: reject it
		 * if userspace did not terminate it inside the buffer.
		 */
		if (cmd == IP_VS_SO_SET_STARTDAEMON &&
		    strnlen(dm->mcast_ifn, sizeof(dm->mcast_ifn)) ==
		    sizeof(dm->mcast_ifn)) {
			ret = -EINVAL;
			goto out_dec;
		}

		if (mutex_lock_interruptible(&ipvs->sync_mutex)) {
			ret = -ERESTARTSYS;
			goto out_dec;
		}
		if (cmd == IP_VS_SO_SET_STARTDAEMON)
			ret = start_sync_thread(net, dm->state, dm->mcast_ifn,
						dm->syncid);
		else
			ret = stop_sync_thread(net, dm->state);
		mutex_unlock(&ipvs->sync_mutex);
		goto out_dec;
	}

	if (mutex_lock_interruptible(&__ip_vs_mutex)) {
		ret = -ERESTARTSYS;
		goto out_dec;
	}

	if (cmd == IP_VS_SO_SET_FLUSH) {
		/* Flush the virtual service */
		ret = ip_vs_flush(net, false);
		goto out_unlock;
	} else if (cmd == IP_VS_SO_SET_TIMEOUT) {
		/* Set timeout values for (tcp tcpfin udp) */
		ret = ip_vs_set_timeout(net, (struct ip_vs_timeout_user *)arg);
		goto out_unlock;
	}

	usvc_compat = (struct ip_vs_service_user *)arg;
	udest_compat = (struct ip_vs_dest_user *)(usvc_compat + 1);

	/* We only use the new structs internally, so copy userspace compat
	 * structs to extended internal versions */
	ip_vs_copy_usvc_compat(&usvc, usvc_compat);
	ip_vs_copy_udest_compat(&udest, udest_compat);

	if (cmd == IP_VS_SO_SET_ZERO) {
		/* if no service address is set, zero counters in all */
		if (!usvc.fwmark && !usvc.addr.ip && !usvc.port) {
			ret = ip_vs_zero_all(net);
			goto out_unlock;
		}
	}

	/* ADD and EDIT pass the scheduler name on as a C string: reject a
	 * name that is not NUL-terminated within its fixed-size field.
	 */
	if ((cmd == IP_VS_SO_SET_ADD || cmd == IP_VS_SO_SET_EDIT) &&
	    strnlen(usvc.sched_name, IP_VS_SCHEDNAME_MAXLEN) ==
	    IP_VS_SCHEDNAME_MAXLEN) {
		ret = -EINVAL;
		goto out_unlock;
	}

	/* Check for valid protocol: TCP or UDP or SCTP, even for fwmark!=0 */
	if (usvc.protocol != IPPROTO_TCP && usvc.protocol != IPPROTO_UDP &&
	    usvc.protocol != IPPROTO_SCTP) {
		/* sched_name may be unterminated here; bound the print */
		pr_err("set_ctl: invalid protocol: %d %pI4:%d %.*s\n",
		       usvc.protocol, &usvc.addr.ip,
		       ntohs(usvc.port),
		       (int)IP_VS_SCHEDNAME_MAXLEN, usvc.sched_name);
		ret = -EFAULT;
		goto out_unlock;
	}

	/* Lookup the exact service by <protocol, addr, port> or fwmark */
	rcu_read_lock();
	if (usvc.fwmark == 0)
		svc = __ip_vs_service_find(net, usvc.af, usvc.protocol,
					   &usvc.addr, usvc.port);
	else
		svc = __ip_vs_svc_fwm_find(net, usvc.af, usvc.fwmark);
	rcu_read_unlock();

	/* Every command except ADD needs an existing, matching service */
	if (cmd != IP_VS_SO_SET_ADD
	    && (svc == NULL || svc->protocol != usvc.protocol)) {
		ret = -ESRCH;
		goto out_unlock;
	}

	switch (cmd) {
	case IP_VS_SO_SET_ADD:
		if (svc != NULL)
			ret = -EEXIST;
		else
			ret = ip_vs_add_service(net, &usvc, &svc);
		break;
	case IP_VS_SO_SET_EDIT:
		ret = ip_vs_edit_service(svc, &usvc);
		break;
	case IP_VS_SO_SET_DEL:
		ret = ip_vs_del_service(svc);
		if (!ret)
			goto out_unlock;
		break;
	case IP_VS_SO_SET_ZERO:
		ret = ip_vs_zero_service(svc);
		break;
	case IP_VS_SO_SET_ADDDEST:
		ret = ip_vs_add_dest(svc, &udest);
		break;
	case IP_VS_SO_SET_EDITDEST:
		ret = ip_vs_edit_dest(svc, &udest);
		break;
	case IP_VS_SO_SET_DELDEST:
		ret = ip_vs_del_dest(svc, &udest);
		break;
	default:
		ret = -EINVAL;
	}

  out_unlock:
	mutex_unlock(&__ip_vs_mutex);
  out_dec:
	/* decrease the module use count */
	ip_vs_use_count_dec();

	return ret;
}


static void
ip_vs_copy_service(struct ip_vs_service_entry *dst, struct ip_vs_service *src)
{
J
Julian Anastasov 已提交
2387 2388 2389
	struct ip_vs_scheduler *sched;

	sched = rcu_dereference_protected(src->scheduler, 1);
L
Linus Torvalds 已提交
2390
	dst->protocol = src->protocol;
2391
	dst->addr = src->addr.ip;
L
Linus Torvalds 已提交
2392 2393
	dst->port = src->port;
	dst->fwmark = src->fwmark;
J
Julian Anastasov 已提交
2394
	strlcpy(dst->sched_name, sched->name, sizeof(dst->sched_name));
L
Linus Torvalds 已提交
2395 2396 2397 2398 2399 2400 2401 2402
	dst->flags = src->flags;
	dst->timeout = src->timeout / HZ;
	dst->netmask = src->netmask;
	dst->num_dests = src->num_dests;
	ip_vs_copy_stats(&dst->stats, &src->stats);
}

static inline int
2403 2404
__ip_vs_get_service_entries(struct net *net,
			    const struct ip_vs_get_services *get,
L
Linus Torvalds 已提交
2405 2406 2407 2408 2409 2410 2411 2412
			    struct ip_vs_get_services __user *uptr)
{
	int idx, count=0;
	struct ip_vs_service *svc;
	struct ip_vs_service_entry entry;
	int ret = 0;

	for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
J
Julian Anastasov 已提交
2413
		hlist_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
2414
			/* Only expose IPv4 entries to old interface */
2415
			if (svc->af != AF_INET || !net_eq(svc->net, net))
2416 2417
				continue;

L
Linus Torvalds 已提交
2418 2419
			if (count >= get->num_services)
				goto out;
P
pageexec 已提交
2420
			memset(&entry, 0, sizeof(entry));
L
Linus Torvalds 已提交
2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431
			ip_vs_copy_service(&entry, svc);
			if (copy_to_user(&uptr->entrytable[count],
					 &entry, sizeof(entry))) {
				ret = -EFAULT;
				goto out;
			}
			count++;
		}
	}

	for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
J
Julian Anastasov 已提交
2432
		hlist_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
2433
			/* Only expose IPv4 entries to old interface */
2434
			if (svc->af != AF_INET || !net_eq(svc->net, net))
2435 2436
				continue;

L
Linus Torvalds 已提交
2437 2438
			if (count >= get->num_services)
				goto out;
P
pageexec 已提交
2439
			memset(&entry, 0, sizeof(entry));
L
Linus Torvalds 已提交
2440 2441 2442 2443 2444 2445 2446 2447 2448
			ip_vs_copy_service(&entry, svc);
			if (copy_to_user(&uptr->entrytable[count],
					 &entry, sizeof(entry))) {
				ret = -EFAULT;
				goto out;
			}
			count++;
		}
	}
H
Hans Schillstrom 已提交
2449
out:
L
Linus Torvalds 已提交
2450 2451 2452 2453
	return ret;
}

static inline int
2454
__ip_vs_get_dest_entries(struct net *net, const struct ip_vs_get_dests *get,
L
Linus Torvalds 已提交
2455 2456 2457
			 struct ip_vs_get_dests __user *uptr)
{
	struct ip_vs_service *svc;
2458
	union nf_inet_addr addr = { .ip = get->addr };
L
Linus Torvalds 已提交
2459 2460
	int ret = 0;

J
Julian Anastasov 已提交
2461
	rcu_read_lock();
L
Linus Torvalds 已提交
2462
	if (get->fwmark)
2463
		svc = __ip_vs_svc_fwm_find(net, AF_INET, get->fwmark);
L
Linus Torvalds 已提交
2464
	else
2465
		svc = __ip_vs_service_find(net, AF_INET, get->protocol, &addr,
2466
					   get->port);
J
Julian Anastasov 已提交
2467
	rcu_read_unlock();
2468

L
Linus Torvalds 已提交
2469 2470 2471 2472 2473
	if (svc) {
		int count = 0;
		struct ip_vs_dest *dest;
		struct ip_vs_dest_entry entry;

2474
		memset(&entry, 0, sizeof(entry));
L
Linus Torvalds 已提交
2475 2476 2477 2478
		list_for_each_entry(dest, &svc->destinations, n_list) {
			if (count >= get->num_dests)
				break;

2479
			entry.addr = dest->addr.ip;
L
Linus Torvalds 已提交
2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501
			entry.port = dest->port;
			entry.conn_flags = atomic_read(&dest->conn_flags);
			entry.weight = atomic_read(&dest->weight);
			entry.u_threshold = dest->u_threshold;
			entry.l_threshold = dest->l_threshold;
			entry.activeconns = atomic_read(&dest->activeconns);
			entry.inactconns = atomic_read(&dest->inactconns);
			entry.persistconns = atomic_read(&dest->persistconns);
			ip_vs_copy_stats(&entry.stats, &dest->stats);
			if (copy_to_user(&uptr->entrytable[count],
					 &entry, sizeof(entry))) {
				ret = -EFAULT;
				break;
			}
			count++;
		}
	} else
		ret = -ESRCH;
	return ret;
}

static inline void
2502
__ip_vs_get_timeouts(struct net *net, struct ip_vs_timeout_user *u)
L
Linus Torvalds 已提交
2503
{
2504
#if defined(CONFIG_IP_VS_PROTO_TCP) || defined(CONFIG_IP_VS_PROTO_UDP)
2505
	struct ip_vs_proto_data *pd;
2506
#endif
2507

2508 2509
	memset(u, 0, sizeof (*u));

L
Linus Torvalds 已提交
2510
#ifdef CONFIG_IP_VS_PROTO_TCP
2511 2512 2513
	pd = ip_vs_proto_data_get(net, IPPROTO_TCP);
	u->tcp_timeout = pd->timeout_table[IP_VS_TCP_S_ESTABLISHED] / HZ;
	u->tcp_fin_timeout = pd->timeout_table[IP_VS_TCP_S_FIN_WAIT] / HZ;
L
Linus Torvalds 已提交
2514 2515
#endif
#ifdef CONFIG_IP_VS_PROTO_UDP
2516
	pd = ip_vs_proto_data_get(net, IPPROTO_UDP);
L
Linus Torvalds 已提交
2517
	u->udp_timeout =
2518
			pd->timeout_table[IP_VS_UDP_S_NORMAL] / HZ;
L
Linus Torvalds 已提交
2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530
#endif
}


/*
 * Index of a get command in get_arglen[], and the minimum argument length
 * of each get command. The macro argument is parenthesized so that
 * expression arguments expand correctly.
 */
#define GET_CMDID(cmd)		((cmd) - IP_VS_BASE_CTL)
#define GET_INFO_ARG_LEN	(sizeof(struct ip_vs_getinfo))
#define GET_SERVICES_ARG_LEN	(sizeof(struct ip_vs_get_services))
#define GET_SERVICE_ARG_LEN	(sizeof(struct ip_vs_service_entry))
#define GET_DESTS_ARG_LEN	(sizeof(struct ip_vs_get_dests))
#define GET_TIMEOUT_ARG_LEN	(sizeof(struct ip_vs_timeout_user))
#define GET_DAEMON_ARG_LEN	(sizeof(struct ip_vs_daemon_user) * 2)

2531
static const unsigned char get_arglen[GET_CMDID(IP_VS_SO_GET_MAX)+1] = {
L
Linus Torvalds 已提交
2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545
	[GET_CMDID(IP_VS_SO_GET_VERSION)]	= 64,
	[GET_CMDID(IP_VS_SO_GET_INFO)]		= GET_INFO_ARG_LEN,
	[GET_CMDID(IP_VS_SO_GET_SERVICES)]	= GET_SERVICES_ARG_LEN,
	[GET_CMDID(IP_VS_SO_GET_SERVICE)]	= GET_SERVICE_ARG_LEN,
	[GET_CMDID(IP_VS_SO_GET_DESTS)]		= GET_DESTS_ARG_LEN,
	[GET_CMDID(IP_VS_SO_GET_TIMEOUT)]	= GET_TIMEOUT_ARG_LEN,
	[GET_CMDID(IP_VS_SO_GET_DAEMON)]	= GET_DAEMON_ARG_LEN,
};

/*
 * getsockopt() handler for the IPVS sockopt control interface.
 *
 * @sk:   socket the option was read on; selects the net namespace
 * @cmd:  one of the IP_VS_SO_GET_* commands
 * @user: userspace buffer, used for both the request and the reply
 * @len:  in: size of @user, must be at least get_arglen[] for @cmd;
 *        out: updated only by IP_VS_SO_GET_VERSION
 *
 * Requires CAP_NET_ADMIN in the namespace's user_ns. The daemon query runs
 * under ipvs->sync_mutex, all other commands under __ip_vs_mutex.
 *
 * For the variable-sized replies (services, dests) the expected buffer
 * size is computed in 64 bits so that a large user-supplied entry count
 * cannot wrap or truncate into a value that matches *len.
 *
 * Returns 0 on success or a negative errno.
 */
static int
do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
{
	unsigned char arg[128];
	int ret = 0;
	unsigned int copylen;
	struct net *net = sock_net(sk);
	struct netns_ipvs *ipvs;

	/* Check net before net_ipvs() dereferences it */
	BUG_ON(!net);
	ipvs = net_ipvs(net);

	if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
		return -EPERM;

	if (cmd < IP_VS_BASE_CTL || cmd > IP_VS_SO_GET_MAX)
		return -EINVAL;

	if (*len < get_arglen[GET_CMDID(cmd)]) {
		pr_err("get_ctl: len %u < %u\n",
		       *len, get_arglen[GET_CMDID(cmd)]);
		return -EINVAL;
	}

	copylen = get_arglen[GET_CMDID(cmd)];
	if (copylen > 128)
		return -EINVAL;

	if (copy_from_user(arg, user, copylen) != 0)
		return -EFAULT;
	/*
	 * Handle daemons first since it has its own locking
	 */
	if (cmd == IP_VS_SO_GET_DAEMON) {
		struct ip_vs_daemon_user d[2];

		memset(&d, 0, sizeof(d));
		if (mutex_lock_interruptible(&ipvs->sync_mutex))
			return -ERESTARTSYS;

		/* d[0] describes the master daemon, d[1] the backup daemon */
		if (ipvs->sync_state & IP_VS_STATE_MASTER) {
			d[0].state = IP_VS_STATE_MASTER;
			strlcpy(d[0].mcast_ifn, ipvs->master_mcast_ifn,
				sizeof(d[0].mcast_ifn));
			d[0].syncid = ipvs->master_syncid;
		}
		if (ipvs->sync_state & IP_VS_STATE_BACKUP) {
			d[1].state = IP_VS_STATE_BACKUP;
			strlcpy(d[1].mcast_ifn, ipvs->backup_mcast_ifn,
				sizeof(d[1].mcast_ifn));
			d[1].syncid = ipvs->backup_syncid;
		}
		if (copy_to_user(user, &d, sizeof(d)) != 0)
			ret = -EFAULT;
		mutex_unlock(&ipvs->sync_mutex);
		return ret;
	}

	if (mutex_lock_interruptible(&__ip_vs_mutex))
		return -ERESTARTSYS;

	switch (cmd) {
	case IP_VS_SO_GET_VERSION:
	{
		char buf[64];

		/* Bounded formatting: output can never overrun buf */
		scnprintf(buf, sizeof(buf),
			  "IP Virtual Server version %d.%d.%d (size=%d)",
			  NVERSION(IP_VS_VERSION_CODE), ip_vs_conn_tab_size);
		if (copy_to_user(user, buf, strlen(buf)+1) != 0) {
			ret = -EFAULT;
			goto out;
		}
		*len = strlen(buf)+1;
	}
	break;

	case IP_VS_SO_GET_INFO:
	{
		struct ip_vs_getinfo info;
		info.version = IP_VS_VERSION_CODE;
		info.size = ip_vs_conn_tab_size;
		info.num_services = ipvs->num_services;
		if (copy_to_user(user, &info, sizeof(info)) != 0)
			ret = -EFAULT;
	}
	break;

	case IP_VS_SO_GET_SERVICES:
	{
		struct ip_vs_get_services *get;
		unsigned long long size;

		get = (struct ip_vs_get_services *)arg;
		/* 64-bit math: num_services is user controlled */
		size = sizeof(*get) +
			(unsigned long long)sizeof(struct ip_vs_service_entry) *
			get->num_services;
		if ((unsigned long long)*len != size) {
			pr_err("length: %u != %llu\n", *len, size);
			ret = -EINVAL;
			goto out;
		}
		ret = __ip_vs_get_service_entries(net, get, user);
	}
	break;

	case IP_VS_SO_GET_SERVICE:
	{
		struct ip_vs_service_entry *entry;
		struct ip_vs_service *svc;
		union nf_inet_addr addr;

		entry = (struct ip_vs_service_entry *)arg;
		addr.ip = entry->addr;
		rcu_read_lock();
		if (entry->fwmark)
			svc = __ip_vs_svc_fwm_find(net, AF_INET, entry->fwmark);
		else
			svc = __ip_vs_service_find(net, AF_INET,
						   entry->protocol, &addr,
						   entry->port);
		rcu_read_unlock();
		if (svc) {
			ip_vs_copy_service(entry, svc);
			if (copy_to_user(user, entry, sizeof(*entry)) != 0)
				ret = -EFAULT;
		} else
			ret = -ESRCH;
	}
	break;

	case IP_VS_SO_GET_DESTS:
	{
		struct ip_vs_get_dests *get;
		unsigned long long size;

		get = (struct ip_vs_get_dests *)arg;
		/* 64-bit math: num_dests is user controlled */
		size = sizeof(*get) +
			(unsigned long long)sizeof(struct ip_vs_dest_entry) *
			get->num_dests;
		if ((unsigned long long)*len != size) {
			pr_err("length: %u != %llu\n", *len, size);
			ret = -EINVAL;
			goto out;
		}
		ret = __ip_vs_get_dest_entries(net, get, user);
	}
	break;

	case IP_VS_SO_GET_TIMEOUT:
	{
		struct ip_vs_timeout_user t;

		__ip_vs_get_timeouts(net, &t);
		if (copy_to_user(user, &t, sizeof(t)) != 0)
			ret = -EFAULT;
	}
	break;

	default:
		ret = -EINVAL;
	}

out:
	mutex_unlock(&__ip_vs_mutex);
	return ret;
}


static struct nf_sockopt_ops ip_vs_sockopts = {
	.pf		= PF_INET,
	.set_optmin	= IP_VS_BASE_CTL,
	.set_optmax	= IP_VS_SO_SET_MAX+1,
	.set		= do_ip_vs_set_ctl,
	.get_optmin	= IP_VS_BASE_CTL,
	.get_optmax	= IP_VS_SO_GET_MAX+1,
	.get		= do_ip_vs_get_ctl,
2713
	.owner		= THIS_MODULE,
L
Linus Torvalds 已提交
2714 2715
};

2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726
/*
 * Generic Netlink interface
 */

/* IPVS genetlink family */
static struct genl_family ip_vs_genl_family = {
	.id		= GENL_ID_GENERATE,
	.hdrsize	= 0,
	.name		= IPVS_GENL_NAME,
	.version	= IPVS_GENL_VERSION,
	.maxattr	= IPVS_CMD_MAX,
2727
	.netnsok        = true,         /* Make ipvsadm to work on netns */
2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757
};

/* Validation policy for the top-level (IPVS_CMD_ATTR_*) attributes */
static const struct nla_policy ip_vs_cmd_policy[IPVS_CMD_ATTR_MAX + 1] = {
	/* nested containers */
	[IPVS_CMD_ATTR_SERVICE]		= { .type = NLA_NESTED },
	[IPVS_CMD_ATTR_DEST]		= { .type = NLA_NESTED },
	[IPVS_CMD_ATTR_DAEMON]		= { .type = NLA_NESTED },

	/* timeouts */
	[IPVS_CMD_ATTR_TIMEOUT_TCP]	= { .type = NLA_U32 },
	[IPVS_CMD_ATTR_TIMEOUT_TCP_FIN]	= { .type = NLA_U32 },
	[IPVS_CMD_ATTR_TIMEOUT_UDP]	= { .type = NLA_U32 },
};

/* Validation policy for attributes nested inside IPVS_CMD_ATTR_DAEMON */
static const struct nla_policy ip_vs_daemon_policy[IPVS_DAEMON_ATTR_MAX + 1] = {
	[IPVS_DAEMON_ATTR_STATE]	= { .type = NLA_U32 },
	[IPVS_DAEMON_ATTR_SYNC_ID]	= { .type = NLA_U32 },
	[IPVS_DAEMON_ATTR_MCAST_IFN]	= {
		.type	= NLA_NUL_STRING,
		.len	= IP_VS_IFNAME_MAXLEN,
	},
};

/* Policy used for attributes in nested attribute IPVS_CMD_ATTR_SERVICE */
static const struct nla_policy ip_vs_svc_policy[IPVS_SVC_ATTR_MAX + 1] = {
	[IPVS_SVC_ATTR_AF]		= { .type = NLA_U16 },
	[IPVS_SVC_ATTR_PROTOCOL]	= { .type = NLA_U16 },
	[IPVS_SVC_ATTR_ADDR]		= { .type = NLA_BINARY,
					    .len = sizeof(union nf_inet_addr) },
	[IPVS_SVC_ATTR_PORT]		= { .type = NLA_U16 },
	[IPVS_SVC_ATTR_FWMARK]		= { .type = NLA_U32 },
	[IPVS_SVC_ATTR_SCHED_NAME]	= { .type = NLA_NUL_STRING,
					    .len = IP_VS_SCHEDNAME_MAXLEN },
2758 2759
	[IPVS_SVC_ATTR_PE_NAME]		= { .type = NLA_NUL_STRING,
					    .len = IP_VS_PENAME_MAXLEN },
2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784
	[IPVS_SVC_ATTR_FLAGS]		= { .type = NLA_BINARY,
					    .len = sizeof(struct ip_vs_flags) },
	[IPVS_SVC_ATTR_TIMEOUT]		= { .type = NLA_U32 },
	[IPVS_SVC_ATTR_NETMASK]		= { .type = NLA_U32 },
	[IPVS_SVC_ATTR_STATS]		= { .type = NLA_NESTED },
};

/* Validation policy for attributes nested inside IPVS_CMD_ATTR_DEST */
static const struct nla_policy ip_vs_dest_policy[IPVS_DEST_ATTR_MAX + 1] = {
	/* destination identity */
	[IPVS_DEST_ATTR_ADDR]		= {
		.type	= NLA_BINARY,
		.len	= sizeof(union nf_inet_addr),
	},
	[IPVS_DEST_ATTR_PORT]		= { .type = NLA_U16 },

	/* forwarding parameters */
	[IPVS_DEST_ATTR_FWD_METHOD]	= { .type = NLA_U32 },
	[IPVS_DEST_ATTR_WEIGHT]		= { .type = NLA_U32 },
	[IPVS_DEST_ATTR_U_THRESH]	= { .type = NLA_U32 },
	[IPVS_DEST_ATTR_L_THRESH]	= { .type = NLA_U32 },

	/* connection counters and statistics */
	[IPVS_DEST_ATTR_ACTIVE_CONNS]	= { .type = NLA_U32 },
	[IPVS_DEST_ATTR_INACT_CONNS]	= { .type = NLA_U32 },
	[IPVS_DEST_ATTR_PERSIST_CONNS]	= { .type = NLA_U32 },
	[IPVS_DEST_ATTR_STATS]		= { .type = NLA_NESTED },
};

static int ip_vs_genl_fill_stats(struct sk_buff *skb, int container_type,
				 struct ip_vs_stats *stats)
{
2785
	struct ip_vs_stats_user ustats;
2786 2787 2788 2789
	struct nlattr *nl_stats = nla_nest_start(skb, container_type);
	if (!nl_stats)
		return -EMSGSIZE;

2790
	ip_vs_copy_stats(&ustats, stats);
2791

D
David S. Miller 已提交
2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802
	if (nla_put_u32(skb, IPVS_STATS_ATTR_CONNS, ustats.conns) ||
	    nla_put_u32(skb, IPVS_STATS_ATTR_INPKTS, ustats.inpkts) ||
	    nla_put_u32(skb, IPVS_STATS_ATTR_OUTPKTS, ustats.outpkts) ||
	    nla_put_u64(skb, IPVS_STATS_ATTR_INBYTES, ustats.inbytes) ||
	    nla_put_u64(skb, IPVS_STATS_ATTR_OUTBYTES, ustats.outbytes) ||
	    nla_put_u32(skb, IPVS_STATS_ATTR_CPS, ustats.cps) ||
	    nla_put_u32(skb, IPVS_STATS_ATTR_INPPS, ustats.inpps) ||
	    nla_put_u32(skb, IPVS_STATS_ATTR_OUTPPS, ustats.outpps) ||
	    nla_put_u32(skb, IPVS_STATS_ATTR_INBPS, ustats.inbps) ||
	    nla_put_u32(skb, IPVS_STATS_ATTR_OUTBPS, ustats.outbps))
		goto nla_put_failure;
2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814
	nla_nest_end(skb, nl_stats);

	return 0;

nla_put_failure:
	nla_nest_cancel(skb, nl_stats);
	return -EMSGSIZE;
}

static int ip_vs_genl_fill_service(struct sk_buff *skb,
				   struct ip_vs_service *svc)
{
J
Julian Anastasov 已提交
2815
	struct ip_vs_scheduler *sched;
2816
	struct ip_vs_pe *pe;
2817 2818 2819 2820 2821 2822 2823 2824
	struct nlattr *nl_service;
	struct ip_vs_flags flags = { .flags = svc->flags,
				     .mask = ~0 };

	nl_service = nla_nest_start(skb, IPVS_CMD_ATTR_SERVICE);
	if (!nl_service)
		return -EMSGSIZE;

D
David S. Miller 已提交
2825 2826
	if (nla_put_u16(skb, IPVS_SVC_ATTR_AF, svc->af))
		goto nla_put_failure;
2827
	if (svc->fwmark) {
D
David S. Miller 已提交
2828 2829
		if (nla_put_u32(skb, IPVS_SVC_ATTR_FWMARK, svc->fwmark))
			goto nla_put_failure;
2830
	} else {
D
David S. Miller 已提交
2831 2832
		if (nla_put_u16(skb, IPVS_SVC_ATTR_PROTOCOL, svc->protocol) ||
		    nla_put(skb, IPVS_SVC_ATTR_ADDR, sizeof(svc->addr), &svc->addr) ||
2833
		    nla_put_be16(skb, IPVS_SVC_ATTR_PORT, svc->port))
D
David S. Miller 已提交
2834
			goto nla_put_failure;
2835 2836
	}

J
Julian Anastasov 已提交
2837
	sched = rcu_dereference_protected(svc->scheduler, 1);
2838
	pe = rcu_dereference_protected(svc->pe, 1);
J
Julian Anastasov 已提交
2839
	if (nla_put_string(skb, IPVS_SVC_ATTR_SCHED_NAME, sched->name) ||
2840
	    (pe && nla_put_string(skb, IPVS_SVC_ATTR_PE_NAME, pe->name)) ||
D
David S. Miller 已提交
2841 2842
	    nla_put(skb, IPVS_SVC_ATTR_FLAGS, sizeof(flags), &flags) ||
	    nla_put_u32(skb, IPVS_SVC_ATTR_TIMEOUT, svc->timeout / HZ) ||
2843
	    nla_put_be32(skb, IPVS_SVC_ATTR_NETMASK, svc->netmask))
D
David S. Miller 已提交
2844
		goto nla_put_failure;
2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862
	if (ip_vs_genl_fill_stats(skb, IPVS_SVC_ATTR_STATS, &svc->stats))
		goto nla_put_failure;

	nla_nest_end(skb, nl_service);

	return 0;

nla_put_failure:
	nla_nest_cancel(skb, nl_service);
	return -EMSGSIZE;
}

static int ip_vs_genl_dump_service(struct sk_buff *skb,
				   struct ip_vs_service *svc,
				   struct netlink_callback *cb)
{
	void *hdr;

2863
	hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884
			  &ip_vs_genl_family, NLM_F_MULTI,
			  IPVS_CMD_NEW_SERVICE);
	if (!hdr)
		return -EMSGSIZE;

	if (ip_vs_genl_fill_service(skb, svc) < 0)
		goto nla_put_failure;

	return genlmsg_end(skb, hdr);

nla_put_failure:
	genlmsg_cancel(skb, hdr);
	return -EMSGSIZE;
}

static int ip_vs_genl_dump_services(struct sk_buff *skb,
				    struct netlink_callback *cb)
{
	int idx = 0, i;
	int start = cb->args[0];
	struct ip_vs_service *svc;
2885
	struct net *net = skb_sknet(skb);
2886 2887 2888

	mutex_lock(&__ip_vs_mutex);
	for (i = 0; i < IP_VS_SVC_TAB_SIZE; i++) {
J
Julian Anastasov 已提交
2889
		hlist_for_each_entry(svc, &ip_vs_svc_table[i], s_list) {
2890
			if (++idx <= start || !net_eq(svc->net, net))
2891 2892 2893 2894 2895 2896 2897 2898 2899
				continue;
			if (ip_vs_genl_dump_service(skb, svc, cb) < 0) {
				idx--;
				goto nla_put_failure;
			}
		}
	}

	for (i = 0; i < IP_VS_SVC_TAB_SIZE; i++) {
J
Julian Anastasov 已提交
2900
		hlist_for_each_entry(svc, &ip_vs_svc_fwm_table[i], f_list) {
2901
			if (++idx <= start || !net_eq(svc->net, net))
2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916
				continue;
			if (ip_vs_genl_dump_service(skb, svc, cb) < 0) {
				idx--;
				goto nla_put_failure;
			}
		}
	}

nla_put_failure:
	mutex_unlock(&__ip_vs_mutex);
	cb->args[0] = idx;

	return skb->len;
}

2917 2918
static int ip_vs_genl_parse_service(struct net *net,
				    struct ip_vs_service_user_kern *usvc,
2919 2920
				    struct nlattr *nla, int full_entry,
				    struct ip_vs_service **ret_svc)
2921 2922 2923
{
	struct nlattr *attrs[IPVS_SVC_ATTR_MAX + 1];
	struct nlattr *nla_af, *nla_port, *nla_fwmark, *nla_protocol, *nla_addr;
2924
	struct ip_vs_service *svc;
2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939

	/* Parse mandatory identifying service fields first */
	if (nla == NULL ||
	    nla_parse_nested(attrs, IPVS_SVC_ATTR_MAX, nla, ip_vs_svc_policy))
		return -EINVAL;

	nla_af		= attrs[IPVS_SVC_ATTR_AF];
	nla_protocol	= attrs[IPVS_SVC_ATTR_PROTOCOL];
	nla_addr	= attrs[IPVS_SVC_ATTR_ADDR];
	nla_port	= attrs[IPVS_SVC_ATTR_PORT];
	nla_fwmark	= attrs[IPVS_SVC_ATTR_FWMARK];

	if (!(nla_af && (nla_fwmark || (nla_port && nla_protocol && nla_addr))))
		return -EINVAL;

S
Simon Horman 已提交
2940 2941
	memset(usvc, 0, sizeof(*usvc));

2942
	usvc->af = nla_get_u16(nla_af);
2943 2944 2945 2946 2947
#ifdef CONFIG_IP_VS_IPV6
	if (usvc->af != AF_INET && usvc->af != AF_INET6)
#else
	if (usvc->af != AF_INET)
#endif
2948 2949 2950 2951 2952 2953 2954 2955
		return -EAFNOSUPPORT;

	if (nla_fwmark) {
		usvc->protocol = IPPROTO_TCP;
		usvc->fwmark = nla_get_u32(nla_fwmark);
	} else {
		usvc->protocol = nla_get_u16(nla_protocol);
		nla_memcpy(&usvc->addr, nla_addr, sizeof(usvc->addr));
2956
		usvc->port = nla_get_be16(nla_port);
2957 2958 2959
		usvc->fwmark = 0;
	}

J
Julian Anastasov 已提交
2960
	rcu_read_lock();
2961
	if (usvc->fwmark)
2962
		svc = __ip_vs_svc_fwm_find(net, usvc->af, usvc->fwmark);
2963
	else
2964
		svc = __ip_vs_service_find(net, usvc->af, usvc->protocol,
2965
					   &usvc->addr, usvc->port);
J
Julian Anastasov 已提交
2966
	rcu_read_unlock();
2967 2968
	*ret_svc = svc;

2969 2970
	/* If a full entry was requested, check for the additional fields */
	if (full_entry) {
2971
		struct nlattr *nla_sched, *nla_flags, *nla_pe, *nla_timeout,
2972 2973 2974 2975
			      *nla_netmask;
		struct ip_vs_flags flags;

		nla_sched = attrs[IPVS_SVC_ATTR_SCHED_NAME];
2976
		nla_pe = attrs[IPVS_SVC_ATTR_PE_NAME];
2977 2978 2979 2980 2981 2982 2983 2984 2985 2986
		nla_flags = attrs[IPVS_SVC_ATTR_FLAGS];
		nla_timeout = attrs[IPVS_SVC_ATTR_TIMEOUT];
		nla_netmask = attrs[IPVS_SVC_ATTR_NETMASK];

		if (!(nla_sched && nla_flags && nla_timeout && nla_netmask))
			return -EINVAL;

		nla_memcpy(&flags, nla_flags, sizeof(flags));

		/* prefill flags from service if it already exists */
2987
		if (svc)
2988 2989 2990 2991 2992
			usvc->flags = svc->flags;

		/* set new flags from userland */
		usvc->flags = (usvc->flags & ~flags.mask) |
			      (flags.flags & flags.mask);
2993
		usvc->sched_name = nla_data(nla_sched);
2994
		usvc->pe_name = nla_pe ? nla_data(nla_pe) : NULL;
2995
		usvc->timeout = nla_get_u32(nla_timeout);
2996
		usvc->netmask = nla_get_be32(nla_netmask);
2997 2998 2999 3000 3001
	}

	return 0;
}

3002 3003
static struct ip_vs_service *ip_vs_genl_find_service(struct net *net,
						     struct nlattr *nla)
3004
{
3005
	struct ip_vs_service_user_kern usvc;
3006
	struct ip_vs_service *svc;
3007 3008
	int ret;

3009
	ret = ip_vs_genl_parse_service(net, &usvc, nla, 0, &svc);
3010
	return ret ? ERR_PTR(ret) : svc;
3011 3012 3013 3014 3015 3016 3017 3018 3019 3020
}

static int ip_vs_genl_fill_dest(struct sk_buff *skb, struct ip_vs_dest *dest)
{
	struct nlattr *nl_dest;

	nl_dest = nla_nest_start(skb, IPVS_CMD_ATTR_DEST);
	if (!nl_dest)
		return -EMSGSIZE;

D
David S. Miller 已提交
3021
	if (nla_put(skb, IPVS_DEST_ATTR_ADDR, sizeof(dest->addr), &dest->addr) ||
3022
	    nla_put_be16(skb, IPVS_DEST_ATTR_PORT, dest->port) ||
D
David S. Miller 已提交
3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036
	    nla_put_u32(skb, IPVS_DEST_ATTR_FWD_METHOD,
			(atomic_read(&dest->conn_flags) &
			 IP_VS_CONN_F_FWD_MASK)) ||
	    nla_put_u32(skb, IPVS_DEST_ATTR_WEIGHT,
			atomic_read(&dest->weight)) ||
	    nla_put_u32(skb, IPVS_DEST_ATTR_U_THRESH, dest->u_threshold) ||
	    nla_put_u32(skb, IPVS_DEST_ATTR_L_THRESH, dest->l_threshold) ||
	    nla_put_u32(skb, IPVS_DEST_ATTR_ACTIVE_CONNS,
			atomic_read(&dest->activeconns)) ||
	    nla_put_u32(skb, IPVS_DEST_ATTR_INACT_CONNS,
			atomic_read(&dest->inactconns)) ||
	    nla_put_u32(skb, IPVS_DEST_ATTR_PERSIST_CONNS,
			atomic_read(&dest->persistconns)))
		goto nla_put_failure;
3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053
	if (ip_vs_genl_fill_stats(skb, IPVS_DEST_ATTR_STATS, &dest->stats))
		goto nla_put_failure;

	nla_nest_end(skb, nl_dest);

	return 0;

nla_put_failure:
	nla_nest_cancel(skb, nl_dest);
	return -EMSGSIZE;
}

static int ip_vs_genl_dump_dest(struct sk_buff *skb, struct ip_vs_dest *dest,
				struct netlink_callback *cb)
{
	void *hdr;

3054
	hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077
			  &ip_vs_genl_family, NLM_F_MULTI,
			  IPVS_CMD_NEW_DEST);
	if (!hdr)
		return -EMSGSIZE;

	if (ip_vs_genl_fill_dest(skb, dest) < 0)
		goto nla_put_failure;

	return genlmsg_end(skb, hdr);

nla_put_failure:
	genlmsg_cancel(skb, hdr);
	return -EMSGSIZE;
}

static int ip_vs_genl_dump_dests(struct sk_buff *skb,
				 struct netlink_callback *cb)
{
	int idx = 0;
	int start = cb->args[0];
	struct ip_vs_service *svc;
	struct ip_vs_dest *dest;
	struct nlattr *attrs[IPVS_CMD_ATTR_MAX + 1];
3078
	struct net *net = skb_sknet(skb);
3079 3080 3081 3082 3083 3084 3085 3086

	mutex_lock(&__ip_vs_mutex);

	/* Try to find the service for which to dump destinations */
	if (nlmsg_parse(cb->nlh, GENL_HDRLEN, attrs,
			IPVS_CMD_ATTR_MAX, ip_vs_cmd_policy))
		goto out_err;

3087

3088
	svc = ip_vs_genl_find_service(net, attrs[IPVS_CMD_ATTR_SERVICE]);
3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110
	if (IS_ERR(svc) || svc == NULL)
		goto out_err;

	/* Dump the destinations */
	list_for_each_entry(dest, &svc->destinations, n_list) {
		if (++idx <= start)
			continue;
		if (ip_vs_genl_dump_dest(skb, dest, cb) < 0) {
			idx--;
			goto nla_put_failure;
		}
	}

nla_put_failure:
	cb->args[0] = idx;

out_err:
	mutex_unlock(&__ip_vs_mutex);

	return skb->len;
}

3111
static int ip_vs_genl_parse_dest(struct ip_vs_dest_user_kern *udest,
3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127
				 struct nlattr *nla, int full_entry)
{
	struct nlattr *attrs[IPVS_DEST_ATTR_MAX + 1];
	struct nlattr *nla_addr, *nla_port;

	/* Parse mandatory identifying destination fields first */
	if (nla == NULL ||
	    nla_parse_nested(attrs, IPVS_DEST_ATTR_MAX, nla, ip_vs_dest_policy))
		return -EINVAL;

	nla_addr	= attrs[IPVS_DEST_ATTR_ADDR];
	nla_port	= attrs[IPVS_DEST_ATTR_PORT];

	if (!(nla_addr && nla_port))
		return -EINVAL;

S
Simon Horman 已提交
3128 3129
	memset(udest, 0, sizeof(*udest));

3130
	nla_memcpy(&udest->addr, nla_addr, sizeof(udest->addr));
3131
	udest->port = nla_get_be16(nla_port);
3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155

	/* If a full entry was requested, check for the additional fields */
	if (full_entry) {
		struct nlattr *nla_fwd, *nla_weight, *nla_u_thresh,
			      *nla_l_thresh;

		nla_fwd		= attrs[IPVS_DEST_ATTR_FWD_METHOD];
		nla_weight	= attrs[IPVS_DEST_ATTR_WEIGHT];
		nla_u_thresh	= attrs[IPVS_DEST_ATTR_U_THRESH];
		nla_l_thresh	= attrs[IPVS_DEST_ATTR_L_THRESH];

		if (!(nla_fwd && nla_weight && nla_u_thresh && nla_l_thresh))
			return -EINVAL;

		udest->conn_flags = nla_get_u32(nla_fwd)
				    & IP_VS_CONN_F_FWD_MASK;
		udest->weight = nla_get_u32(nla_weight);
		udest->u_threshold = nla_get_u32(nla_u_thresh);
		udest->l_threshold = nla_get_u32(nla_l_thresh);
	}

	return 0;
}

3156 3157
/*
 * Add a nested IPVS_CMD_ATTR_DAEMON attribute carrying the sync daemon's
 * @state, multicast interface name @mcast_ifn and @syncid to @skb.
 *
 * Returns 0 on success or -EMSGSIZE if the attributes did not fit, in which
 * case the nest is cancelled.
 */
static int ip_vs_genl_fill_daemon(struct sk_buff *skb, __u32 state,
				  const char *mcast_ifn, __u32 syncid)
{
	struct nlattr *nest = nla_nest_start(skb, IPVS_CMD_ATTR_DAEMON);

	if (!nest)
		return -EMSGSIZE;

	if (nla_put_u32(skb, IPVS_DAEMON_ATTR_STATE, state))
		goto cancel;
	if (nla_put_string(skb, IPVS_DAEMON_ATTR_MCAST_IFN, mcast_ifn))
		goto cancel;
	if (nla_put_u32(skb, IPVS_DAEMON_ATTR_SYNC_ID, syncid))
		goto cancel;

	nla_nest_end(skb, nest);
	return 0;

cancel:
	nla_nest_cancel(skb, nest);
	return -EMSGSIZE;
}

3178 3179
/*
 * Append one NLM_F_MULTI IPVS_CMD_NEW_DAEMON message describing a sync
 * daemon (@state, @mcast_ifn, @syncid) to the dump buffer @skb.
 *
 * Returns the result of genlmsg_end() on success, or -EMSGSIZE when the
 * header or the daemon attributes could not be added.
 */
static int ip_vs_genl_dump_daemon(struct sk_buff *skb, __u32 state,
				  const char *mcast_ifn, __u32 syncid,
				  struct netlink_callback *cb)
{
	void *msg_head = genlmsg_put(skb, NETLINK_CB(cb->skb).portid,
				     cb->nlh->nlmsg_seq, &ip_vs_genl_family,
				     NLM_F_MULTI, IPVS_CMD_NEW_DAEMON);

	if (!msg_head)
		return -EMSGSIZE;

	if (ip_vs_genl_fill_daemon(skb, state, mcast_ifn, syncid)) {
		/* Roll back the header we just reserved */
		genlmsg_cancel(skb, msg_head);
		return -EMSGSIZE;
	}

	return genlmsg_end(skb, msg_head);
}

static int ip_vs_genl_dump_daemons(struct sk_buff *skb,
				   struct netlink_callback *cb)
{
3202
	struct net *net = skb_sknet(skb);
3203 3204
	struct netns_ipvs *ipvs = net_ipvs(net);

3205
	mutex_lock(&ipvs->sync_mutex);
3206
	if ((ipvs->sync_state & IP_VS_STATE_MASTER) && !cb->args[0]) {
3207
		if (ip_vs_genl_dump_daemon(skb, IP_VS_STATE_MASTER,
3208 3209
					   ipvs->master_mcast_ifn,
					   ipvs->master_syncid, cb) < 0)
3210 3211 3212 3213 3214
			goto nla_put_failure;

		cb->args[0] = 1;
	}

3215
	if ((ipvs->sync_state & IP_VS_STATE_BACKUP) && !cb->args[1]) {
3216
		if (ip_vs_genl_dump_daemon(skb, IP_VS_STATE_BACKUP,
3217 3218
					   ipvs->backup_mcast_ifn,
					   ipvs->backup_syncid, cb) < 0)
3219 3220 3221 3222 3223 3224
			goto nla_put_failure;

		cb->args[1] = 1;
	}

nla_put_failure:
3225
	mutex_unlock(&ipvs->sync_mutex);
3226 3227 3228 3229

	return skb->len;
}

3230
static int ip_vs_genl_new_daemon(struct net *net, struct nlattr **attrs)
3231 3232 3233 3234 3235 3236
{
	if (!(attrs[IPVS_DAEMON_ATTR_STATE] &&
	      attrs[IPVS_DAEMON_ATTR_MCAST_IFN] &&
	      attrs[IPVS_DAEMON_ATTR_SYNC_ID]))
		return -EINVAL;

3237 3238
	return start_sync_thread(net,
				 nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE]),
3239 3240 3241 3242
				 nla_data(attrs[IPVS_DAEMON_ATTR_MCAST_IFN]),
				 nla_get_u32(attrs[IPVS_DAEMON_ATTR_SYNC_ID]));
}

3243
static int ip_vs_genl_del_daemon(struct net *net, struct nlattr **attrs)
3244 3245 3246 3247
{
	if (!attrs[IPVS_DAEMON_ATTR_STATE])
		return -EINVAL;

3248 3249
	return stop_sync_thread(net,
				nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE]));
3250 3251
}

3252
static int ip_vs_genl_set_config(struct net *net, struct nlattr **attrs)
3253 3254 3255
{
	struct ip_vs_timeout_user t;

3256
	__ip_vs_get_timeouts(net, &t);
3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267

	if (attrs[IPVS_CMD_ATTR_TIMEOUT_TCP])
		t.tcp_timeout = nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_TCP]);

	if (attrs[IPVS_CMD_ATTR_TIMEOUT_TCP_FIN])
		t.tcp_fin_timeout =
			nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_TCP_FIN]);

	if (attrs[IPVS_CMD_ATTR_TIMEOUT_UDP])
		t.udp_timeout = nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_UDP]);

3268
	return ip_vs_set_timeout(net, &t);
3269 3270
}

3271
static int ip_vs_genl_set_daemon(struct sk_buff *skb, struct genl_info *info)
3272 3273
{
	int ret = 0, cmd;
3274
	struct net *net;
3275
	struct netns_ipvs *ipvs;
3276

3277
	net = skb_sknet(skb);
3278
	ipvs = net_ipvs(net);
3279 3280
	cmd = info->genlhdr->cmd;

3281
	if (cmd == IPVS_CMD_NEW_DAEMON || cmd == IPVS_CMD_DEL_DAEMON) {
3282 3283
		struct nlattr *daemon_attrs[IPVS_DAEMON_ATTR_MAX + 1];

3284
		mutex_lock(&ipvs->sync_mutex);
3285 3286 3287 3288 3289 3290 3291 3292 3293
		if (!info->attrs[IPVS_CMD_ATTR_DAEMON] ||
		    nla_parse_nested(daemon_attrs, IPVS_DAEMON_ATTR_MAX,
				     info->attrs[IPVS_CMD_ATTR_DAEMON],
				     ip_vs_daemon_policy)) {
			ret = -EINVAL;
			goto out;
		}

		if (cmd == IPVS_CMD_NEW_DAEMON)
3294
			ret = ip_vs_genl_new_daemon(net, daemon_attrs);
3295
		else
3296
			ret = ip_vs_genl_del_daemon(net, daemon_attrs);
3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317
out:
		mutex_unlock(&ipvs->sync_mutex);
	}
	return ret;
}

static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info)
{
	struct ip_vs_service *svc = NULL;
	struct ip_vs_service_user_kern usvc;
	struct ip_vs_dest_user_kern udest;
	int ret = 0, cmd;
	int need_full_svc = 0, need_full_dest = 0;
	struct net *net;

	net = skb_sknet(skb);
	cmd = info->genlhdr->cmd;

	mutex_lock(&__ip_vs_mutex);

	if (cmd == IPVS_CMD_FLUSH) {
J
Julian Anastasov 已提交
3318
		ret = ip_vs_flush(net, false);
3319 3320 3321
		goto out;
	} else if (cmd == IPVS_CMD_SET_CONFIG) {
		ret = ip_vs_genl_set_config(net, info->attrs);
3322 3323 3324
		goto out;
	} else if (cmd == IPVS_CMD_ZERO &&
		   !info->attrs[IPVS_CMD_ATTR_SERVICE]) {
3325
		ret = ip_vs_zero_all(net);
3326 3327 3328 3329 3330 3331 3332 3333 3334
		goto out;
	}

	/* All following commands require a service argument, so check if we
	 * received a valid one. We need a full service specification when
	 * adding / editing a service. Only identifying members otherwise. */
	if (cmd == IPVS_CMD_NEW_SERVICE || cmd == IPVS_CMD_SET_SERVICE)
		need_full_svc = 1;

3335
	ret = ip_vs_genl_parse_service(net, &usvc,
3336
				       info->attrs[IPVS_CMD_ATTR_SERVICE],
3337
				       need_full_svc, &svc);
3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364
	if (ret)
		goto out;

	/* Unless we're adding a new service, the service must already exist */
	if ((cmd != IPVS_CMD_NEW_SERVICE) && (svc == NULL)) {
		ret = -ESRCH;
		goto out;
	}

	/* Destination commands require a valid destination argument. For
	 * adding / editing a destination, we need a full destination
	 * specification. */
	if (cmd == IPVS_CMD_NEW_DEST || cmd == IPVS_CMD_SET_DEST ||
	    cmd == IPVS_CMD_DEL_DEST) {
		if (cmd != IPVS_CMD_DEL_DEST)
			need_full_dest = 1;

		ret = ip_vs_genl_parse_dest(&udest,
					    info->attrs[IPVS_CMD_ATTR_DEST],
					    need_full_dest);
		if (ret)
			goto out;
	}

	switch (cmd) {
	case IPVS_CMD_NEW_SERVICE:
		if (svc == NULL)
3365
			ret = ip_vs_add_service(net, &usvc, &svc);
3366 3367 3368 3369 3370 3371 3372 3373
		else
			ret = -EEXIST;
		break;
	case IPVS_CMD_SET_SERVICE:
		ret = ip_vs_edit_service(svc, &usvc);
		break;
	case IPVS_CMD_DEL_SERVICE:
		ret = ip_vs_del_service(svc);
3374
		/* do not use svc, it can be freed */
3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402
		break;
	case IPVS_CMD_NEW_DEST:
		ret = ip_vs_add_dest(svc, &udest);
		break;
	case IPVS_CMD_SET_DEST:
		ret = ip_vs_edit_dest(svc, &udest);
		break;
	case IPVS_CMD_DEL_DEST:
		ret = ip_vs_del_dest(svc, &udest);
		break;
	case IPVS_CMD_ZERO:
		ret = ip_vs_zero_service(svc);
		break;
	default:
		ret = -EINVAL;
	}

out:
	mutex_unlock(&__ip_vs_mutex);

	return ret;
}

static int ip_vs_genl_get_cmd(struct sk_buff *skb, struct genl_info *info)
{
	struct sk_buff *msg;
	void *reply;
	int ret, cmd, reply_cmd;
3403
	struct net *net;
3404

3405
	net = skb_sknet(skb);
3406 3407 3408 3409 3410 3411 3412 3413 3414
	cmd = info->genlhdr->cmd;

	if (cmd == IPVS_CMD_GET_SERVICE)
		reply_cmd = IPVS_CMD_NEW_SERVICE;
	else if (cmd == IPVS_CMD_GET_INFO)
		reply_cmd = IPVS_CMD_SET_INFO;
	else if (cmd == IPVS_CMD_GET_CONFIG)
		reply_cmd = IPVS_CMD_SET_CONFIG;
	else {
3415
		pr_err("unknown Generic Netlink command\n");
3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433
		return -EINVAL;
	}

	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
	if (!msg)
		return -ENOMEM;

	mutex_lock(&__ip_vs_mutex);

	reply = genlmsg_put_reply(msg, info, &ip_vs_genl_family, 0, reply_cmd);
	if (reply == NULL)
		goto nla_put_failure;

	switch (cmd) {
	case IPVS_CMD_GET_SERVICE:
	{
		struct ip_vs_service *svc;

3434 3435
		svc = ip_vs_genl_find_service(net,
					      info->attrs[IPVS_CMD_ATTR_SERVICE]);
3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454
		if (IS_ERR(svc)) {
			ret = PTR_ERR(svc);
			goto out_err;
		} else if (svc) {
			ret = ip_vs_genl_fill_service(msg, svc);
			if (ret)
				goto nla_put_failure;
		} else {
			ret = -ESRCH;
			goto out_err;
		}

		break;
	}

	case IPVS_CMD_GET_CONFIG:
	{
		struct ip_vs_timeout_user t;

3455
		__ip_vs_get_timeouts(net, &t);
3456
#ifdef CONFIG_IP_VS_PROTO_TCP
D
David S. Miller 已提交
3457 3458 3459 3460 3461
		if (nla_put_u32(msg, IPVS_CMD_ATTR_TIMEOUT_TCP,
				t.tcp_timeout) ||
		    nla_put_u32(msg, IPVS_CMD_ATTR_TIMEOUT_TCP_FIN,
				t.tcp_fin_timeout))
			goto nla_put_failure;
3462 3463
#endif
#ifdef CONFIG_IP_VS_PROTO_UDP
D
David S. Miller 已提交
3464 3465
		if (nla_put_u32(msg, IPVS_CMD_ATTR_TIMEOUT_UDP, t.udp_timeout))
			goto nla_put_failure;
3466 3467 3468 3469 3470 3471
#endif

		break;
	}

	case IPVS_CMD_GET_INFO:
D
David S. Miller 已提交
3472 3473 3474 3475 3476
		if (nla_put_u32(msg, IPVS_INFO_ATTR_VERSION,
				IP_VS_VERSION_CODE) ||
		    nla_put_u32(msg, IPVS_INFO_ATTR_CONN_TAB_SIZE,
				ip_vs_conn_tab_size))
			goto nla_put_failure;
3477 3478 3479 3480
		break;
	}

	genlmsg_end(msg, reply);
J
Johannes Berg 已提交
3481
	ret = genlmsg_reply(msg, info);
3482 3483 3484
	goto out;

nla_put_failure:
3485
	pr_err("not enough space in Netlink message\n");
3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496
	ret = -EMSGSIZE;

out_err:
	nlmsg_free(msg);
out:
	mutex_unlock(&__ip_vs_mutex);

	return ret;
}


3497
static const struct genl_ops ip_vs_genl_ops[] = {
3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550
	{
		.cmd	= IPVS_CMD_NEW_SERVICE,
		.flags	= GENL_ADMIN_PERM,
		.policy	= ip_vs_cmd_policy,
		.doit	= ip_vs_genl_set_cmd,
	},
	{
		.cmd	= IPVS_CMD_SET_SERVICE,
		.flags	= GENL_ADMIN_PERM,
		.policy	= ip_vs_cmd_policy,
		.doit	= ip_vs_genl_set_cmd,
	},
	{
		.cmd	= IPVS_CMD_DEL_SERVICE,
		.flags	= GENL_ADMIN_PERM,
		.policy	= ip_vs_cmd_policy,
		.doit	= ip_vs_genl_set_cmd,
	},
	{
		.cmd	= IPVS_CMD_GET_SERVICE,
		.flags	= GENL_ADMIN_PERM,
		.doit	= ip_vs_genl_get_cmd,
		.dumpit	= ip_vs_genl_dump_services,
		.policy	= ip_vs_cmd_policy,
	},
	{
		.cmd	= IPVS_CMD_NEW_DEST,
		.flags	= GENL_ADMIN_PERM,
		.policy	= ip_vs_cmd_policy,
		.doit	= ip_vs_genl_set_cmd,
	},
	{
		.cmd	= IPVS_CMD_SET_DEST,
		.flags	= GENL_ADMIN_PERM,
		.policy	= ip_vs_cmd_policy,
		.doit	= ip_vs_genl_set_cmd,
	},
	{
		.cmd	= IPVS_CMD_DEL_DEST,
		.flags	= GENL_ADMIN_PERM,
		.policy	= ip_vs_cmd_policy,
		.doit	= ip_vs_genl_set_cmd,
	},
	{
		.cmd	= IPVS_CMD_GET_DEST,
		.flags	= GENL_ADMIN_PERM,
		.policy	= ip_vs_cmd_policy,
		.dumpit	= ip_vs_genl_dump_dests,
	},
	{
		.cmd	= IPVS_CMD_NEW_DAEMON,
		.flags	= GENL_ADMIN_PERM,
		.policy	= ip_vs_cmd_policy,
3551
		.doit	= ip_vs_genl_set_daemon,
3552 3553 3554 3555 3556
	},
	{
		.cmd	= IPVS_CMD_DEL_DAEMON,
		.flags	= GENL_ADMIN_PERM,
		.policy	= ip_vs_cmd_policy,
3557
		.doit	= ip_vs_genl_set_daemon,
3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594
	},
	{
		.cmd	= IPVS_CMD_GET_DAEMON,
		.flags	= GENL_ADMIN_PERM,
		.dumpit	= ip_vs_genl_dump_daemons,
	},
	{
		.cmd	= IPVS_CMD_SET_CONFIG,
		.flags	= GENL_ADMIN_PERM,
		.policy	= ip_vs_cmd_policy,
		.doit	= ip_vs_genl_set_cmd,
	},
	{
		.cmd	= IPVS_CMD_GET_CONFIG,
		.flags	= GENL_ADMIN_PERM,
		.doit	= ip_vs_genl_get_cmd,
	},
	{
		.cmd	= IPVS_CMD_GET_INFO,
		.flags	= GENL_ADMIN_PERM,
		.doit	= ip_vs_genl_get_cmd,
	},
	{
		.cmd	= IPVS_CMD_ZERO,
		.flags	= GENL_ADMIN_PERM,
		.policy	= ip_vs_cmd_policy,
		.doit	= ip_vs_genl_set_cmd,
	},
	{
		.cmd	= IPVS_CMD_FLUSH,
		.flags	= GENL_ADMIN_PERM,
		.doit	= ip_vs_genl_set_cmd,
	},
};

/*
 * Register the IPVS Generic Netlink family together with ip_vs_genl_ops.
 * Returns the result of genl_register_family_with_ops().
 */
static int __init ip_vs_genl_register(void)
{
	int err;

	err = genl_register_family_with_ops(&ip_vs_genl_family,
					    ip_vs_genl_ops);
	return err;
}

/* Unregister the IPVS Generic Netlink family; the result is not checked. */
static void ip_vs_genl_unregister(void)
{
	genl_unregister_family(&ip_vs_genl_family);
}

/* End of Generic Netlink interface definitions */

3606 3607 3608
/*
 * per netns init/exit func.
 */
3609
#ifdef CONFIG_SYSCTL
C
Claudiu Ghioc 已提交
3610
/*
 * Per-netns sysctl setup.
 *
 * Initializes the defense-related locks, assigns sysctl defaults, points
 * each ctl_table slot at the corresponding field of this namespace's
 * netns_ipvs and registers the table under "net/ipv4/vs".  init_net uses
 * vs_vars directly; every other namespace gets its own kmemdup()ed copy.
 * On success the estimator for the total stats is started and the defense
 * work is scheduled.
 *
 * The slot assignments below are positional and must stay in the same order
 * as the entries of vs_vars.
 *
 * Returns 0 on success or -ENOMEM if the table copy or the sysctl
 * registration fails.
 */
static int __net_init ip_vs_control_net_init_sysctl(struct net *net)
{
	struct netns_ipvs *ipvs = net_ipvs(net);
	struct ctl_table *table = vs_vars;
	int slot = 0;

	atomic_set(&ipvs->dropentry, 0);
	spin_lock_init(&ipvs->dropentry_lock);
	spin_lock_init(&ipvs->droppacket_lock);
	spin_lock_init(&ipvs->securetcp_lock);

	if (!net_eq(net, &init_net)) {
		table = kmemdup(vs_vars, sizeof(vs_vars), GFP_KERNEL);
		if (table == NULL)
			return -ENOMEM;

		/* Don't export sysctls to unprivileged users */
		if (net->user_ns != &init_user_ns)
			table[0].procname = NULL;
	}

	/* Initialize sysctl defaults and wire up the table slots */
	ipvs->sysctl_amemthresh = 1024;
	table[slot++].data = &ipvs->sysctl_amemthresh;

	ipvs->sysctl_am_droprate = 10;
	table[slot++].data = &ipvs->sysctl_am_droprate;

	table[slot++].data = &ipvs->sysctl_drop_entry;
	table[slot++].data = &ipvs->sysctl_drop_packet;
#ifdef CONFIG_IP_VS_NFCT
	table[slot++].data = &ipvs->sysctl_conntrack;
#endif
	table[slot++].data = &ipvs->sysctl_secure_tcp;

	ipvs->sysctl_snat_reroute = 1;
	table[slot++].data = &ipvs->sysctl_snat_reroute;

	ipvs->sysctl_sync_ver = 1;
	table[slot++].data = &ipvs->sysctl_sync_ver;

	ipvs->sysctl_sync_ports = 1;
	table[slot++].data = &ipvs->sysctl_sync_ports;

	table[slot++].data = &ipvs->sysctl_sync_persist_mode;

	ipvs->sysctl_sync_qlen_max = nr_free_buffer_pages() / 32;
	table[slot++].data = &ipvs->sysctl_sync_qlen_max;

	ipvs->sysctl_sync_sock_size = 0;
	table[slot++].data = &ipvs->sysctl_sync_sock_size;

	table[slot++].data = &ipvs->sysctl_cache_bypass;
	table[slot++].data = &ipvs->sysctl_expire_nodest_conn;
	table[slot++].data = &ipvs->sysctl_sloppy_tcp;
	table[slot++].data = &ipvs->sysctl_sloppy_sctp;
	table[slot++].data = &ipvs->sysctl_expire_quiescent_template;

	/* sync_threshold is a two-element array: threshold and period */
	ipvs->sysctl_sync_threshold[0] = DEFAULT_SYNC_THRESHOLD;
	ipvs->sysctl_sync_threshold[1] = DEFAULT_SYNC_PERIOD;
	table[slot].data = &ipvs->sysctl_sync_threshold;
	table[slot++].maxlen = sizeof(ipvs->sysctl_sync_threshold);

	ipvs->sysctl_sync_refresh_period = DEFAULT_SYNC_REFRESH_PERIOD;
	table[slot++].data = &ipvs->sysctl_sync_refresh_period;

	ipvs->sysctl_sync_retries = clamp_t(int, DEFAULT_SYNC_RETRIES, 0, 3);
	table[slot++].data = &ipvs->sysctl_sync_retries;

	table[slot++].data = &ipvs->sysctl_nat_icmp_send;

	ipvs->sysctl_pmtu_disc = 1;
	table[slot++].data = &ipvs->sysctl_pmtu_disc;

	table[slot++].data = &ipvs->sysctl_backup_only;

	ipvs->sysctl_hdr = register_net_sysctl(net, "net/ipv4/vs", table);
	if (ipvs->sysctl_hdr == NULL) {
		/* Only the per-netns copy was allocated by us */
		if (!net_eq(net, &init_net))
			kfree(table);
		return -ENOMEM;
	}

	ip_vs_start_estimator(net, &ipvs->tot_stats);
	ipvs->sysctl_tbl = table;

	/* Schedule defense work */
	INIT_DELAYED_WORK(&ipvs->defense_work, defense_work_handler);
	schedule_delayed_work(&ipvs->defense_work, DEFENSE_TIMER_PERIOD);

	return 0;
}

C
Claudiu Ghioc 已提交
3688
/*
 * Per-netns sysctl teardown: stop the defense work, unregister the sysctl
 * table, stop the total-stats estimator and release the table copy that
 * ip_vs_control_net_init_sysctl() allocated for non-initial namespaces.
 */
static void __net_exit ip_vs_control_net_cleanup_sysctl(struct net *net)
{
	struct netns_ipvs *ipvs = net_ipvs(net);

	cancel_delayed_work_sync(&ipvs->defense_work);
	cancel_work_sync(&ipvs->defense_work.work);
	unregister_net_sysctl_table(ipvs->sysctl_hdr);
	ip_vs_stop_estimator(net, &ipvs->tot_stats);

	/*
	 * init_net uses the static vs_vars table; every other namespace got
	 * a kmemdup()ed copy at init time which must be freed here, after
	 * it has been unregistered, or it leaks on every netns teardown.
	 */
	if (!net_eq(net, &init_net))
		kfree(ipvs->sysctl_tbl);
}

#else

C
Claudiu Ghioc 已提交
3700 3701
/* Without CONFIG_SYSCTL there is nothing to set up or tear down. */
static int __net_init ip_vs_control_net_init_sysctl(struct net *net) { return 0; }
static void __net_exit ip_vs_control_net_cleanup_sysctl(struct net *net) { }
3702

3703
#endif
3704

3705 3706 3707 3708
/*
 * Netdevice notifier that forwards events to ip_vs_dst_event(); registered
 * in ip_vs_control_init() and unregistered in ip_vs_control_cleanup().
 */
static struct notifier_block ip_vs_dst_notifier = {
	.notifier_call = ip_vs_dst_event,
};

3709
/*
 * Per-netns initialization of the IPVS control state: real-server hash
 * table, destination trash list and timer, service counters, per-cpu total
 * statistics, the procfs entries and the sysctl table.
 *
 * Returns 0 on success or -ENOMEM if the per-cpu stats cannot be allocated
 * or the sysctl setup fails.  On failure everything set up here is undone.
 */
int __net_init ip_vs_control_net_init(struct net *net)
{
	int i, idx;
	struct netns_ipvs *ipvs = net_ipvs(net);

	/* Initialize rs_table */
	for (idx = 0; idx < IP_VS_RTAB_SIZE; idx++)
		INIT_HLIST_HEAD(&ipvs->rs_table[idx]);

	INIT_LIST_HEAD(&ipvs->dest_trash);
	spin_lock_init(&ipvs->dest_trash_lock);
	setup_timer(&ipvs->dest_trash_timer, ip_vs_dest_trash_expire,
		    (unsigned long) net);
	atomic_set(&ipvs->ftpsvc_counter, 0);
	atomic_set(&ipvs->nullsvc_counter, 0);

	/* procfs stats */
	ipvs->tot_stats.cpustats = alloc_percpu(struct ip_vs_cpu_stats);
	if (!ipvs->tot_stats.cpustats)
		return -ENOMEM;

	for_each_possible_cpu(i) {
		struct ip_vs_cpu_stats *ipvs_tot_stats;
		ipvs_tot_stats = per_cpu_ptr(ipvs->tot_stats.cpustats, i);
		u64_stats_init(&ipvs_tot_stats->syncp);
	}

	spin_lock_init(&ipvs->tot_stats.lock);

	proc_create("ip_vs", 0, net->proc_net, &ip_vs_info_fops);
	proc_create("ip_vs_stats", 0, net->proc_net, &ip_vs_stats_fops);
	proc_create("ip_vs_stats_percpu", 0, net->proc_net,
		    &ip_vs_stats_percpu_fops);

	if (ip_vs_control_net_init_sysctl(net))
		goto err;

	return 0;

err:
	/*
	 * Undo the proc entries as well, in the same order as
	 * ip_vs_control_net_cleanup(); otherwise they stay registered in a
	 * namespace whose IPVS state was never fully set up.
	 */
	remove_proc_entry("ip_vs_stats_percpu", net->proc_net);
	remove_proc_entry("ip_vs_stats", net->proc_net);
	remove_proc_entry("ip_vs", net->proc_net);
	free_percpu(ipvs->tot_stats.cpustats);
	return -ENOMEM;
}

3753
/*
 * Per-netns teardown of the IPVS control state: empty the destination
 * trash, tear down sysctl, remove the procfs entries (in reverse order of
 * their creation) and free the per-cpu total statistics.
 */
void __net_exit ip_vs_control_net_cleanup(struct net *net)
{
	struct netns_ipvs *ns_ipvs = net_ipvs(net);

	ip_vs_trash_cleanup(net);
	ip_vs_control_net_cleanup_sysctl(net);

	/* procfs */
	remove_proc_entry("ip_vs_stats_percpu", net->proc_net);
	remove_proc_entry("ip_vs_stats", net->proc_net);
	remove_proc_entry("ip_vs", net->proc_net);

	free_percpu(ns_ipvs->tot_stats.cpustats);
}

3765
/*
 * Register the two userspace control interfaces: the sockopt handlers and
 * the Generic Netlink family.
 *
 * Returns 0 on success or the error of the failing registration.  If the
 * netlink registration fails, the sockopt registration is rolled back.
 */
int __init ip_vs_register_nl_ioctl(void)
{
	int err;

	err = nf_register_sockopt(&ip_vs_sockopts);
	if (err) {
		pr_err("cannot register sockopt.\n");
		return err;
	}

	err = ip_vs_genl_register();
	if (err) {
		pr_err("cannot register Generic Netlink interface.\n");
		nf_unregister_sockopt(&ip_vs_sockopts);
		return err;
	}

	return 0;
}

3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800
/*
 * Unregister the userspace control interfaces in reverse order of
 * ip_vs_register_nl_ioctl(): Generic Netlink first, then the sockopts.
 */
void ip_vs_unregister_nl_ioctl(void)
{
	ip_vs_genl_unregister();
	nf_unregister_sockopt(&ip_vs_sockopts);
}

/*
 * Global (non per-netns) initialization: set up the heads of both service
 * hash tables and register the netdevice notifier.
 *
 * Returns 0 on success or the negative error from
 * register_netdevice_notifier().
 */
int __init ip_vs_control_init(void)
{
	int bucket;
	int err;

	EnterFunction(2);

	/* Initialize svc_table, ip_vs_svc_fwm_table */
	for (bucket = 0; bucket < IP_VS_SVC_TAB_SIZE; bucket++) {
		INIT_HLIST_HEAD(&ip_vs_svc_table[bucket]);
		INIT_HLIST_HEAD(&ip_vs_svc_fwm_table[bucket]);
	}

	/* Barrier kept from the original code, which itself questions it */
	smp_wmb();	/* Do we really need it now ? */

	err = register_netdevice_notifier(&ip_vs_dst_notifier);
	if (err < 0)
		return err;

	LeaveFunction(2);
	return 0;
}

L
Linus Torvalds 已提交
3817 3818 3819 3820

void ip_vs_control_cleanup(void)
{
	EnterFunction(2);
3821
	unregister_netdevice_notifier(&ip_vs_dst_notifier);
L
Linus Torvalds 已提交
3822 3823
	LeaveFunction(2);
}