ip_vs_ctl.c 98.2 KB
Newer Older
L
Linus Torvalds 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
/*
 * IPVS         An implementation of the IP virtual server support for the
 *              LINUX operating system.  IPVS is now implemented as a module
 *              over the NetFilter framework. IPVS can be used to build a
 *              high-performance and highly available server based on a
 *              cluster of servers.
 *
 * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
 *              Peter Kese <peter.kese@ijs.si>
 *              Julian Anastasov <ja@ssi.bg>
 *
 *              This program is free software; you can redistribute it and/or
 *              modify it under the terms of the GNU General Public License
 *              as published by the Free Software Foundation; either version
 *              2 of the License, or (at your option) any later version.
 *
 * Changes:
 *
 */

H
Hannes Eder 已提交
21 22 23
#define KMSG_COMPONENT "IPVS"
#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt

L
Linus Torvalds 已提交
24 25 26
#include <linux/module.h>
#include <linux/init.h>
#include <linux/types.h>
27
#include <linux/capability.h>
L
Linus Torvalds 已提交
28 29 30 31 32 33
#include <linux/fs.h>
#include <linux/sysctl.h>
#include <linux/proc_fs.h>
#include <linux/workqueue.h>
#include <linux/swap.h>
#include <linux/seq_file.h>
34
#include <linux/slab.h>
L
Linus Torvalds 已提交
35 36 37

#include <linux/netfilter.h>
#include <linux/netfilter_ipv4.h>
38
#include <linux/mutex.h>
L
Linus Torvalds 已提交
39

40
#include <net/net_namespace.h>
41
#include <linux/nsproxy.h>
L
Linus Torvalds 已提交
42
#include <net/ip.h>
43 44 45 46
#ifdef CONFIG_IP_VS_IPV6
#include <net/ipv6.h>
#include <net/ip6_route.h>
#endif
47
#include <net/route.h>
L
Linus Torvalds 已提交
48
#include <net/sock.h>
49
#include <net/genetlink.h>
L
Linus Torvalds 已提交
50 51 52 53 54 55

#include <asm/uaccess.h>

#include <net/ip_vs.h>

/* Mutex serializing IPVS sockopts; it may be held across sleeps because [gs]etsockopt may sleep. */
56
static DEFINE_MUTEX(__ip_vs_mutex);
L
Linus Torvalds 已提交
57 58 59 60 61 62 63 64 65 66 67 68

/* sysctl variables */

#ifdef CONFIG_IP_VS_DEBUG
static int sysctl_ip_vs_debug_level = 0;

/* Accessor for the file-local debug level sysctl variable. */
int ip_vs_get_debug_level(void)
{
	int level = sysctl_ip_vs_debug_level;

	return level;
}
#endif

69 70

/*  Protos */
J
Julian Anastasov 已提交
71
static void __ip_vs_del_service(struct ip_vs_service *svc, bool cleanup);
72 73


74 75
#ifdef CONFIG_IP_VS_IPV6
/* Taken from rt6_fill_node() in net/ipv6/route.c, is there a better way? */
76 77
static bool __ip_vs_addr_is_local_v6(struct net *net,
				     const struct in6_addr *addr)
78
{
79 80
	struct flowi6 fl6 = {
		.daddr = *addr,
81
	};
82 83
	struct dst_entry *dst = ip6_route_output(net, NULL, &fl6);
	bool is_local;
84

85
	is_local = !dst->error && dst->dev && (dst->dev->flags & IFF_LOOPBACK);
86

87 88
	dst_release(dst);
	return is_local;
89 90
}
#endif
91 92

#ifdef CONFIG_SYSCTL
L
Linus Torvalds 已提交
93
/*
94 95
 *	update_defense_level is called from keventd and from sysctl,
 *	so it needs to protect itself from softirqs
L
Linus Torvalds 已提交
96
 */
97
static void update_defense_level(struct netns_ipvs *ipvs)
L
Linus Torvalds 已提交
98 99 100 101 102 103 104 105 106 107 108 109 110 111 112
{
	struct sysinfo i;
	static int old_secure_tcp = 0;
	int availmem;
	int nomem;
	int to_change = -1;

	/* we only count free and buffered memory (in pages) */
	si_meminfo(&i);
	availmem = i.freeram + i.bufferram;
	/* however in linux 2.5 the i.bufferram is total page cache size,
	   we need adjust it */
	/* si_swapinfo(&i); */
	/* availmem = availmem - (i.totalswap - i.freeswap); */

113
	nomem = (availmem < ipvs->sysctl_amemthresh);
L
Linus Torvalds 已提交
114

115 116
	local_bh_disable();

L
Linus Torvalds 已提交
117
	/* drop_entry */
118 119
	spin_lock(&ipvs->dropentry_lock);
	switch (ipvs->sysctl_drop_entry) {
L
Linus Torvalds 已提交
120
	case 0:
121
		atomic_set(&ipvs->dropentry, 0);
L
Linus Torvalds 已提交
122 123 124
		break;
	case 1:
		if (nomem) {
125 126
			atomic_set(&ipvs->dropentry, 1);
			ipvs->sysctl_drop_entry = 2;
L
Linus Torvalds 已提交
127
		} else {
128
			atomic_set(&ipvs->dropentry, 0);
L
Linus Torvalds 已提交
129 130 131 132
		}
		break;
	case 2:
		if (nomem) {
133
			atomic_set(&ipvs->dropentry, 1);
L
Linus Torvalds 已提交
134
		} else {
135 136
			atomic_set(&ipvs->dropentry, 0);
			ipvs->sysctl_drop_entry = 1;
L
Linus Torvalds 已提交
137 138 139
		};
		break;
	case 3:
140
		atomic_set(&ipvs->dropentry, 1);
L
Linus Torvalds 已提交
141 142
		break;
	}
143
	spin_unlock(&ipvs->dropentry_lock);
L
Linus Torvalds 已提交
144 145

	/* drop_packet */
146 147
	spin_lock(&ipvs->droppacket_lock);
	switch (ipvs->sysctl_drop_packet) {
L
Linus Torvalds 已提交
148
	case 0:
149
		ipvs->drop_rate = 0;
L
Linus Torvalds 已提交
150 151 152
		break;
	case 1:
		if (nomem) {
153 154 155 156
			ipvs->drop_rate = ipvs->drop_counter
				= ipvs->sysctl_amemthresh /
				(ipvs->sysctl_amemthresh-availmem);
			ipvs->sysctl_drop_packet = 2;
L
Linus Torvalds 已提交
157
		} else {
158
			ipvs->drop_rate = 0;
L
Linus Torvalds 已提交
159 160 161 162
		}
		break;
	case 2:
		if (nomem) {
163 164 165
			ipvs->drop_rate = ipvs->drop_counter
				= ipvs->sysctl_amemthresh /
				(ipvs->sysctl_amemthresh-availmem);
L
Linus Torvalds 已提交
166
		} else {
167 168
			ipvs->drop_rate = 0;
			ipvs->sysctl_drop_packet = 1;
L
Linus Torvalds 已提交
169 170 171
		}
		break;
	case 3:
172
		ipvs->drop_rate = ipvs->sysctl_am_droprate;
L
Linus Torvalds 已提交
173 174
		break;
	}
175
	spin_unlock(&ipvs->droppacket_lock);
L
Linus Torvalds 已提交
176 177

	/* secure_tcp */
178 179
	spin_lock(&ipvs->securetcp_lock);
	switch (ipvs->sysctl_secure_tcp) {
L
Linus Torvalds 已提交
180 181 182 183 184 185 186 187
	case 0:
		if (old_secure_tcp >= 2)
			to_change = 0;
		break;
	case 1:
		if (nomem) {
			if (old_secure_tcp < 2)
				to_change = 1;
188
			ipvs->sysctl_secure_tcp = 2;
L
Linus Torvalds 已提交
189 190 191 192 193 194 195 196 197 198 199 200
		} else {
			if (old_secure_tcp >= 2)
				to_change = 0;
		}
		break;
	case 2:
		if (nomem) {
			if (old_secure_tcp < 2)
				to_change = 1;
		} else {
			if (old_secure_tcp >= 2)
				to_change = 0;
201
			ipvs->sysctl_secure_tcp = 1;
L
Linus Torvalds 已提交
202 203 204 205 206 207 208
		}
		break;
	case 3:
		if (old_secure_tcp < 2)
			to_change = 1;
		break;
	}
209
	old_secure_tcp = ipvs->sysctl_secure_tcp;
L
Linus Torvalds 已提交
210
	if (to_change >= 0)
211
		ip_vs_protocol_timeout_change(ipvs,
212 213
					      ipvs->sysctl_secure_tcp > 1);
	spin_unlock(&ipvs->securetcp_lock);
214 215

	local_bh_enable();
L
Linus Torvalds 已提交
216 217 218 219 220 221 222 223
}


/*
 *	Timer for checking the defense
 */
#define DEFENSE_TIMER_PERIOD	1*HZ

D
David Howells 已提交
224
static void defense_work_handler(struct work_struct *work)
L
Linus Torvalds 已提交
225
{
226 227
	struct netns_ipvs *ipvs =
		container_of(work, struct netns_ipvs, defense_work.work);
228 229

	update_defense_level(ipvs);
230
	if (atomic_read(&ipvs->dropentry))
231 232
		ip_vs_random_dropentry(ipvs->net);
	schedule_delayed_work(&ipvs->defense_work, DEFENSE_TIMER_PERIOD);
L
Linus Torvalds 已提交
233
}
234
#endif
L
Linus Torvalds 已提交
235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256

/*
 * Try to take a reference on this module.
 * Returns the result of try_module_get(THIS_MODULE).
 */
int ip_vs_use_count_inc(void)
{
	int got = try_module_get(THIS_MODULE);

	return got;
}

/* Drop a module reference; counterpart of ip_vs_use_count_inc(). */
void ip_vs_use_count_dec(void)
{
	module_put(THIS_MODULE);
}


/*
 *	Hash table: for virtual service lookups
 */
#define IP_VS_SVC_TAB_BITS 8
#define IP_VS_SVC_TAB_SIZE (1 << IP_VS_SVC_TAB_BITS)
#define IP_VS_SVC_TAB_MASK (IP_VS_SVC_TAB_SIZE - 1)

/* the service table hashed by <protocol, addr, port> */
J
Julian Anastasov 已提交
257
static struct hlist_head ip_vs_svc_table[IP_VS_SVC_TAB_SIZE];
L
Linus Torvalds 已提交
258
/* the service table hashed by fwmark */
J
Julian Anastasov 已提交
259
static struct hlist_head ip_vs_svc_fwm_table[IP_VS_SVC_TAB_SIZE];
L
Linus Torvalds 已提交
260 261 262 263 264


/*
 *	Returns hash value for virtual service
 */
265 266
static inline unsigned int
ip_vs_svc_hashkey(struct net *net, int af, unsigned int proto,
267
		  const union nf_inet_addr *addr, __be16 port)
L
Linus Torvalds 已提交
268
{
269
	register unsigned int porth = ntohs(port);
270
	__be32 addr_fold = addr->ip;
271
	__u32 ahash;
L
Linus Torvalds 已提交
272

273 274 275 276 277
#ifdef CONFIG_IP_VS_IPV6
	if (af == AF_INET6)
		addr_fold = addr->ip6[0]^addr->ip6[1]^
			    addr->ip6[2]^addr->ip6[3];
#endif
278 279
	ahash = ntohl(addr_fold);
	ahash ^= ((size_t) net >> 8);
280

281 282
	return (proto ^ ahash ^ (porth >> IP_VS_SVC_TAB_BITS) ^ porth) &
	       IP_VS_SVC_TAB_MASK;
L
Linus Torvalds 已提交
283 284 285 286 287
}

/*
 *	Returns hash value of fwmark for virtual service lookup
 */
288
static inline unsigned int ip_vs_svc_fwm_hashkey(struct net *net, __u32 fwmark)
L
Linus Torvalds 已提交
289
{
290
	return (((size_t)net>>8) ^ fwmark) & IP_VS_SVC_TAB_MASK;
L
Linus Torvalds 已提交
291 292 293
}

/*
294
 *	Hashes a service in the ip_vs_svc_table by <netns,proto,addr,port>
L
Linus Torvalds 已提交
295 296 297 298 299
 *	or in the ip_vs_svc_fwm_table by fwmark.
 *	Should be called with locked tables.
 */
static int ip_vs_svc_hash(struct ip_vs_service *svc)
{
300
	unsigned int hash;
L
Linus Torvalds 已提交
301 302

	if (svc->flags & IP_VS_SVC_F_HASHED) {
303 304
		pr_err("%s(): request for already hashed, called from %pF\n",
		       __func__, __builtin_return_address(0));
L
Linus Torvalds 已提交
305 306 307 308 309
		return 0;
	}

	if (svc->fwmark == 0) {
		/*
310
		 *  Hash it by <netns,protocol,addr,port> in ip_vs_svc_table
L
Linus Torvalds 已提交
311
		 */
312 313
		hash = ip_vs_svc_hashkey(svc->net, svc->af, svc->protocol,
					 &svc->addr, svc->port);
J
Julian Anastasov 已提交
314
		hlist_add_head_rcu(&svc->s_list, &ip_vs_svc_table[hash]);
L
Linus Torvalds 已提交
315 316
	} else {
		/*
317
		 *  Hash it by fwmark in svc_fwm_table
L
Linus Torvalds 已提交
318
		 */
319
		hash = ip_vs_svc_fwm_hashkey(svc->net, svc->fwmark);
J
Julian Anastasov 已提交
320
		hlist_add_head_rcu(&svc->f_list, &ip_vs_svc_fwm_table[hash]);
L
Linus Torvalds 已提交
321 322 323 324 325 326 327 328 329 330
	}

	svc->flags |= IP_VS_SVC_F_HASHED;
	/* increase its refcnt because it is referenced by the svc table */
	atomic_inc(&svc->refcnt);
	return 1;
}


/*
 *	Unhashes a service from svc_table / svc_fwm_table.
 *	Should be called with locked tables.
 *
 *	Returns 1 when the service was unlinked, 0 (after logging an error)
 *	when it was not flagged as hashed.
 */
static int ip_vs_svc_unhash(struct ip_vs_service *svc)
{
	if (!(svc->flags & IP_VS_SVC_F_HASHED)) {
		pr_err("%s(): request for unhash flagged, called from %pF\n",
		       __func__, __builtin_return_address(0));
		return 0;
	}

	/* the list node that was used depends on how the service was keyed */
	if (svc->fwmark)
		hlist_del_rcu(&svc->f_list);	/* svc_fwm_table */
	else
		hlist_del_rcu(&svc->s_list);	/* svc_table */

	svc->flags &= ~IP_VS_SVC_F_HASHED;
	/* drop the reference the table was holding */
	atomic_dec(&svc->refcnt);
	return 1;
}


/*
357
 *	Get service by {netns, proto,addr,port} in the service table.
L
Linus Torvalds 已提交
358
 */
359
static inline struct ip_vs_service *
360 361
__ip_vs_service_find(struct net *net, int af, __u16 protocol,
		     const union nf_inet_addr *vaddr, __be16 vport)
L
Linus Torvalds 已提交
362
{
363
	unsigned int hash;
L
Linus Torvalds 已提交
364 365 366
	struct ip_vs_service *svc;

	/* Check for "full" addressed entries */
367
	hash = ip_vs_svc_hashkey(net, af, protocol, vaddr, vport);
L
Linus Torvalds 已提交
368

J
Julian Anastasov 已提交
369
	hlist_for_each_entry_rcu(svc, &ip_vs_svc_table[hash], s_list) {
370 371
		if ((svc->af == af)
		    && ip_vs_addr_equal(af, &svc->addr, vaddr)
L
Linus Torvalds 已提交
372
		    && (svc->port == vport)
373 374
		    && (svc->protocol == protocol)
		    && net_eq(svc->net, net)) {
L
Linus Torvalds 已提交
375 376 377 378 379 380 381 382 383 384 385 386
			/* HIT */
			return svc;
		}
	}

	return NULL;
}


/*
 *	Get service by {fwmark} in the service table.
 */
387
static inline struct ip_vs_service *
388
__ip_vs_svc_fwm_find(struct net *net, int af, __u32 fwmark)
L
Linus Torvalds 已提交
389
{
390
	unsigned int hash;
L
Linus Torvalds 已提交
391 392 393
	struct ip_vs_service *svc;

	/* Check for fwmark addressed entries */
394
	hash = ip_vs_svc_fwm_hashkey(net, fwmark);
L
Linus Torvalds 已提交
395

J
Julian Anastasov 已提交
396
	hlist_for_each_entry_rcu(svc, &ip_vs_svc_fwm_table[hash], f_list) {
397 398
		if (svc->fwmark == fwmark && svc->af == af
		    && net_eq(svc->net, net)) {
L
Linus Torvalds 已提交
399 400 401 402 403 404 405 406
			/* HIT */
			return svc;
		}
	}

	return NULL;
}

J
Julian Anastasov 已提交
407
/* Find service, called under RCU lock */
L
Linus Torvalds 已提交
408
struct ip_vs_service *
J
Julian Anastasov 已提交
409 410
ip_vs_service_find(struct net *net, int af, __u32 fwmark, __u16 protocol,
		   const union nf_inet_addr *vaddr, __be16 vport)
L
Linus Torvalds 已提交
411 412
{
	struct ip_vs_service *svc;
413
	struct netns_ipvs *ipvs = net_ipvs(net);
414

L
Linus Torvalds 已提交
415 416 417
	/*
	 *	Check the table hashed by fwmark first
	 */
418 419 420 421 422
	if (fwmark) {
		svc = __ip_vs_svc_fwm_find(net, af, fwmark);
		if (svc)
			goto out;
	}
L
Linus Torvalds 已提交
423 424 425 426 427

	/*
	 *	Check the table hashed by <protocol,addr,port>
	 *	for "full" addressed entries
	 */
428
	svc = __ip_vs_service_find(net, af, protocol, vaddr, vport);
L
Linus Torvalds 已提交
429 430 431

	if (svc == NULL
	    && protocol == IPPROTO_TCP
432
	    && atomic_read(&ipvs->ftpsvc_counter)
L
Linus Torvalds 已提交
433 434 435 436 437
	    && (vport == FTPDATA || ntohs(vport) >= PROT_SOCK)) {
		/*
		 * Check if ftp service entry exists, the packet
		 * might belong to FTP data connections.
		 */
438
		svc = __ip_vs_service_find(net, af, protocol, vaddr, FTPPORT);
L
Linus Torvalds 已提交
439 440 441
	}

	if (svc == NULL
442
	    && atomic_read(&ipvs->nullsvc_counter)) {
L
Linus Torvalds 已提交
443 444 445
		/*
		 * Check if the catch-all port (port zero) exists
		 */
446
		svc = __ip_vs_service_find(net, af, protocol, vaddr, 0);
L
Linus Torvalds 已提交
447 448 449
	}

  out:
450 451 452 453
	IP_VS_DBG_BUF(9, "lookup service: fwm %u %s %s:%u %s\n",
		      fwmark, ip_vs_proto_name(protocol),
		      IP_VS_DBG_ADDR(af, vaddr), ntohs(vport),
		      svc ? "hit" : "not hit");
L
Linus Torvalds 已提交
454 455 456 457 458 459 460 461 462

	return svc;
}


static inline void
__ip_vs_bind_svc(struct ip_vs_dest *dest, struct ip_vs_service *svc)
{
	atomic_inc(&svc->refcnt);
463
	rcu_assign_pointer(dest->svc, svc);
L
Linus Torvalds 已提交
464 465
}

J
Julian Anastasov 已提交
466 467
static void ip_vs_service_free(struct ip_vs_service *svc)
{
468
	free_percpu(svc->stats.cpustats);
J
Julian Anastasov 已提交
469 470 471
	kfree(svc);
}

472
static void ip_vs_service_rcu_free(struct rcu_head *head)
L
Linus Torvalds 已提交
473
{
474 475 476 477 478
	struct ip_vs_service *svc;

	svc = container_of(head, struct ip_vs_service, rcu_head);
	ip_vs_service_free(svc);
}
L
Linus Torvalds 已提交
479

480 481
/*
 * Drop one reference on @svc.  When it was the last one the service is
 * freed: through call_rcu() if @do_delay is true, immediately otherwise.
 */
static void __ip_vs_svc_put(struct ip_vs_service *svc, bool do_delay)
{
	/* other references remain: nothing more to do */
	if (!atomic_dec_and_test(&svc->refcnt))
		return;

	IP_VS_DBG_BUF(3, "Removing service %u/%s:%u\n",
		      svc->fwmark,
		      IP_VS_DBG_ADDR(svc->af, &svc->addr),
		      ntohs(svc->port));

	if (do_delay)
		call_rcu(&svc->rcu_head, ip_vs_service_rcu_free);
	else
		ip_vs_service_free(svc);
}


/*
 *	Returns hash value for real service
 */
498
static inline unsigned int ip_vs_rs_hashkey(int af,
499 500
					    const union nf_inet_addr *addr,
					    __be16 port)
L
Linus Torvalds 已提交
501
{
502
	register unsigned int porth = ntohs(port);
503 504 505 506 507 508 509
	__be32 addr_fold = addr->ip;

#ifdef CONFIG_IP_VS_IPV6
	if (af == AF_INET6)
		addr_fold = addr->ip6[0]^addr->ip6[1]^
			    addr->ip6[2]^addr->ip6[3];
#endif
L
Linus Torvalds 已提交
510

511
	return (ntohl(addr_fold)^(porth>>IP_VS_RTAB_BITS)^porth)
L
Linus Torvalds 已提交
512 513 514
		& IP_VS_RTAB_MASK;
}

515 516
/* Hash ip_vs_dest in rs_table by <proto,addr,port>. */
static void ip_vs_rs_hash(struct netns_ipvs *ipvs, struct ip_vs_dest *dest)
L
Linus Torvalds 已提交
517
{
518
	unsigned int hash;
L
Linus Torvalds 已提交
519

520 521
	if (dest->in_rs_table)
		return;
L
Linus Torvalds 已提交
522 523 524 525 526

	/*
	 *	Hash by proto,addr,port,
	 *	which are the parameters of the real service.
	 */
527 528
	hash = ip_vs_rs_hashkey(dest->af, &dest->addr, dest->port);

529 530
	hlist_add_head_rcu(&dest->d_list, &ipvs->rs_table[hash]);
	dest->in_rs_table = 1;
L
Linus Torvalds 已提交
531 532
}

533 534
/* Unhash ip_vs_dest from rs_table. */
static void ip_vs_rs_unhash(struct ip_vs_dest *dest)
L
Linus Torvalds 已提交
535 536
{
	/*
537
	 * Remove it from the rs_table table.
L
Linus Torvalds 已提交
538
	 */
539 540 541
	if (dest->in_rs_table) {
		hlist_del_rcu(&dest->d_list);
		dest->in_rs_table = 0;
L
Linus Torvalds 已提交
542 543 544
	}
}

545 546 547
/* Check if real service by <proto,addr,port> is present
 *
 * Scans one rs_table bucket under rcu_read_lock().  An entry matches when
 * port, address family and address are equal and either its protocol
 * equals @protocol or it carries a non-zero vfwmark.
 */
bool ip_vs_has_real_service(struct net *net, int af, __u16 protocol,
			    const union nf_inet_addr *daddr, __be16 dport)
{
	struct netns_ipvs *ipvs = net_ipvs(net);
	struct ip_vs_dest *dest;
	unsigned int bucket;
	bool found = false;

	/* Check for "full" addressed entries */
	bucket = ip_vs_rs_hashkey(af, daddr, dport);

	rcu_read_lock();
	hlist_for_each_entry_rcu(dest, &ipvs->rs_table[bucket], d_list) {
		if (dest->port == dport &&
		    dest->af == af &&
		    ip_vs_addr_equal(af, &dest->addr, daddr) &&
		    (dest->protocol == protocol || dest->vfwmark)) {
			/* HIT */
			found = true;
			break;
		}
	}
	rcu_read_unlock();

	return found;
}

J
Julian Anastasov 已提交
572 573
/* Lookup destination by {addr,port} in the given service
 * Called under RCU lock.
L
Linus Torvalds 已提交
574 575
 */
static struct ip_vs_dest *
576 577
ip_vs_lookup_dest(struct ip_vs_service *svc, int dest_af,
		  const union nf_inet_addr *daddr, __be16 dport)
L
Linus Torvalds 已提交
578 579 580 581 582 583
{
	struct ip_vs_dest *dest;

	/*
	 * Find the destination for the given service
	 */
J
Julian Anastasov 已提交
584
	list_for_each_entry_rcu(dest, &svc->destinations, n_list) {
585 586 587
		if ((dest->af == dest_af) &&
		    ip_vs_addr_equal(dest_af, &dest->addr, daddr) &&
		    (dest->port == dport)) {
L
Linus Torvalds 已提交
588 589 590 591 592 593 594 595
			/* HIT */
			return dest;
		}
	}

	return NULL;
}

596 597
/*
 * Find destination by {daddr,dport,vaddr,protocol}
J
Julian Anastasov 已提交
598
 * Created to be used in ip_vs_process_message() in
599 600 601
 * the backup synchronization daemon. It finds the
 * destination to be bound to the received connection
 * on the backup.
J
Julian Anastasov 已提交
602
 * Called under RCU lock, no refcnt is returned.
603
 */
604
struct ip_vs_dest *ip_vs_find_dest(struct net  *net, int svc_af, int dest_af,
605
				   const union nf_inet_addr *daddr,
606 607
				   __be16 dport,
				   const union nf_inet_addr *vaddr,
608 609
				   __be16 vport, __u16 protocol, __u32 fwmark,
				   __u32 flags)
610 611 612
{
	struct ip_vs_dest *dest;
	struct ip_vs_service *svc;
613
	__be16 port = dport;
614

615
	svc = ip_vs_service_find(net, svc_af, fwmark, protocol, vaddr, vport);
616 617
	if (!svc)
		return NULL;
618 619
	if (fwmark && (flags & IP_VS_CONN_F_FWD_MASK) != IP_VS_CONN_F_MASQ)
		port = 0;
620
	dest = ip_vs_lookup_dest(svc, dest_af, daddr, port);
621
	if (!dest)
622
		dest = ip_vs_lookup_dest(svc, dest_af, daddr, port ^ dport);
623 624
	return dest;
}
L
Linus Torvalds 已提交
625

626 627 628 629 630 631 632 633 634 635 636
/*
 * RCU callback: recover the ip_vs_dest_dst that embeds @head, release the
 * route it caches and free the structure.
 */
void ip_vs_dest_dst_rcu_free(struct rcu_head *head)
{
	struct ip_vs_dest_dst *dd;

	dd = container_of(head, struct ip_vs_dest_dst, rcu_head);
	dst_release(dd->dst_cache);
	kfree(dd);
}

/* Release dest_dst and dst_cache for dest in user context */
637 638
static void __ip_vs_dst_cache_reset(struct ip_vs_dest *dest)
{
639
	struct ip_vs_dest_dst *old;
640

641 642 643 644 645
	old = rcu_dereference_protected(dest->dest_dst, 1);
	if (old) {
		RCU_INIT_POINTER(dest->dest_dst, NULL);
		call_rcu(&old->rcu_head, ip_vs_dest_dst_rcu_free);
	}
646 647
}

L
Linus Torvalds 已提交
648 649 650 651 652 653 654 655 656 657 658
/*
 *  Lookup dest by {svc,addr,port} in the destination trash.
 *  The destination trash is used to hold the destinations that are removed
 *  from the service table but are still referenced by some conn entries.
 *  The reason to add the destination trash is when the dest is temporary
 *  down (either by administrator or by monitor program), the dest can be
 *  picked back from the trash, the remaining connections to the dest can
 *  continue, and the counting information of the dest is also useful for
 *  scheduling.
 */
static struct ip_vs_dest *
659 660
ip_vs_trash_get_dest(struct ip_vs_service *svc, int dest_af,
		     const union nf_inet_addr *daddr, __be16 dport)
L
Linus Torvalds 已提交
661
{
J
Julian Anastasov 已提交
662
	struct ip_vs_dest *dest;
H
Hans Schillstrom 已提交
663
	struct netns_ipvs *ipvs = net_ipvs(svc->net);
L
Linus Torvalds 已提交
664 665 666 667

	/*
	 * Find the destination in trash
	 */
J
Julian Anastasov 已提交
668 669
	spin_lock_bh(&ipvs->dest_trash_lock);
	list_for_each_entry(dest, &ipvs->dest_trash, t_list) {
670 671 672
		IP_VS_DBG_BUF(3, "Destination %u/%s:%u still in trash, "
			      "dest->refcnt=%d\n",
			      dest->vfwmark,
673
			      IP_VS_DBG_ADDR(dest->af, &dest->addr),
674 675
			      ntohs(dest->port),
			      atomic_read(&dest->refcnt));
676 677
		if (dest->af == dest_af &&
		    ip_vs_addr_equal(dest_af, &dest->addr, daddr) &&
L
Linus Torvalds 已提交
678 679 680 681
		    dest->port == dport &&
		    dest->vfwmark == svc->fwmark &&
		    dest->protocol == svc->protocol &&
		    (svc->fwmark ||
682
		     (ip_vs_addr_equal(svc->af, &dest->vaddr, &svc->addr) &&
L
Linus Torvalds 已提交
683 684
		      dest->vport == svc->port))) {
			/* HIT */
J
Julian Anastasov 已提交
685 686 687
			list_del(&dest->t_list);
			ip_vs_dest_hold(dest);
			goto out;
L
Linus Torvalds 已提交
688 689 690
		}
	}

J
Julian Anastasov 已提交
691 692 693 694 695 696
	dest = NULL;

out:
	spin_unlock_bh(&ipvs->dest_trash_lock);

	return dest;
L
Linus Torvalds 已提交
697 698
}

J
Julian Anastasov 已提交
699 700
static void ip_vs_dest_free(struct ip_vs_dest *dest)
{
701 702
	struct ip_vs_service *svc = rcu_dereference_protected(dest->svc, 1);

J
Julian Anastasov 已提交
703
	__ip_vs_dst_cache_reset(dest);
704
	__ip_vs_svc_put(svc, false);
J
Julian Anastasov 已提交
705
	free_percpu(dest->stats.cpustats);
706
	ip_vs_dest_put_and_free(dest);
J
Julian Anastasov 已提交
707
}
L
Linus Torvalds 已提交
708 709 710 711 712 713 714 715

/*
 *  Clean up all the destinations in the trash
 *  Called by the ip_vs_control_cleanup()
 *
 *  When the ip_vs_control_clearup is activated by ipvs module exit,
 *  the service tables must have been flushed and all the connections
 *  are expired, and the refcnt of each destination in the trash must
J
Julian Anastasov 已提交
716
 *  be 0, so we simply release them here.
L
Linus Torvalds 已提交
717
 */
H
Hans Schillstrom 已提交
718
static void ip_vs_trash_cleanup(struct net *net)
L
Linus Torvalds 已提交
719 720
{
	struct ip_vs_dest *dest, *nxt;
H
Hans Schillstrom 已提交
721
	struct netns_ipvs *ipvs = net_ipvs(net);
L
Linus Torvalds 已提交
722

J
Julian Anastasov 已提交
723 724 725 726 727
	del_timer_sync(&ipvs->dest_trash_timer);
	/* No need to use dest_trash_lock */
	list_for_each_entry_safe(dest, nxt, &ipvs->dest_trash, t_list) {
		list_del(&dest->t_list);
		ip_vs_dest_free(dest);
L
Linus Torvalds 已提交
728 729 730
	}
}

731
static void
732
ip_vs_copy_stats(struct ip_vs_kstats *dst, struct ip_vs_stats *src)
733
{
734
#define IP_VS_SHOW_STATS_COUNTER(c) dst->c = src->kstats.c - src->kstats0.c
735 736 737 738 739 740 741 742 743

	spin_lock_bh(&src->lock);

	IP_VS_SHOW_STATS_COUNTER(conns);
	IP_VS_SHOW_STATS_COUNTER(inpkts);
	IP_VS_SHOW_STATS_COUNTER(outpkts);
	IP_VS_SHOW_STATS_COUNTER(inbytes);
	IP_VS_SHOW_STATS_COUNTER(outbytes);

J
Julian Anastasov 已提交
744
	ip_vs_read_estimator(dst, src);
745 746 747

	spin_unlock_bh(&src->lock);
}
L
Linus Torvalds 已提交
748

749 750 751 752 753 754 755 756 757 758 759 760 761 762 763
/*
 * Convert kernel statistics @src into the user-visible layout @dst.
 * The connection, packet and rate fields are narrowed with an explicit
 * (u32) cast; inbytes and outbytes are assigned without a cast.
 */
static void
ip_vs_export_stats_user(struct ip_vs_stats_user *dst, struct ip_vs_kstats *src)
{
	/* counters */
	dst->conns = (u32)src->conns;
	dst->inpkts = (u32)src->inpkts;
	dst->outpkts = (u32)src->outpkts;
	dst->inbytes = src->inbytes;
	dst->outbytes = src->outbytes;

	/* rates */
	dst->cps = (u32)src->cps;
	dst->inpps = (u32)src->inpps;
	dst->outpps = (u32)src->outpps;
	dst->inbps = (u32)src->inbps;
	dst->outbps = (u32)src->outbps;
}

L
Linus Torvalds 已提交
764 765 766 767
/*
 * "Zero" @stats without touching the live counters: the current counter
 * values are saved as the new zero point in stats->kstats0 and the
 * estimator is reset, all under stats->lock with bottom halves disabled.
 */
static void
ip_vs_zero_stats(struct ip_vs_stats *stats)
{
/* remember the current counter value as the new zero point */
#define IP_VS_ZERO_STATS_COUNTER(c) stats->kstats0.c = stats->kstats.c

	spin_lock_bh(&stats->lock);

	/* get current counters as zero point, rates are zeroed */
	IP_VS_ZERO_STATS_COUNTER(conns);
	IP_VS_ZERO_STATS_COUNTER(inpkts);
	IP_VS_ZERO_STATS_COUNTER(outpkts);
	IP_VS_ZERO_STATS_COUNTER(inbytes);
	IP_VS_ZERO_STATS_COUNTER(outbytes);

	ip_vs_zero_estimator(stats);

	spin_unlock_bh(&stats->lock);
}

/*
 *	Update a destination in the given service
 */
static void
788 789
__ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest,
		    struct ip_vs_dest_user_kern *udest, int add)
L
Linus Torvalds 已提交
790
{
791
	struct netns_ipvs *ipvs = net_ipvs(svc->net);
792
	struct ip_vs_service *old_svc;
J
Julian Anastasov 已提交
793
	struct ip_vs_scheduler *sched;
L
Linus Torvalds 已提交
794 795
	int conn_flags;

796 797 798 799 800 801
	/* We cannot modify an address and change the address family */
	BUG_ON(!add && udest->af != dest->af);

	if (add && udest->af != svc->af)
		ipvs->mixed_address_family_dests++;

L
Linus Torvalds 已提交
802 803
	/* set the weight and the flags */
	atomic_set(&dest->weight, udest->weight);
804 805
	conn_flags = udest->conn_flags & IP_VS_CONN_F_DEST_MASK;
	conn_flags |= IP_VS_CONN_F_INACTIVE;
L
Linus Torvalds 已提交
806 807

	/* set the IP_VS_CONN_F_NOOUTPUT flag if not masquerading/NAT */
808
	if ((conn_flags & IP_VS_CONN_F_FWD_MASK) != IP_VS_CONN_F_MASQ) {
L
Linus Torvalds 已提交
809 810 811
		conn_flags |= IP_VS_CONN_F_NOOUTPUT;
	} else {
		/*
812
		 *    Put the real service in rs_table if not present.
L
Linus Torvalds 已提交
813 814
		 *    For now only for NAT!
		 */
815
		ip_vs_rs_hash(ipvs, dest);
L
Linus Torvalds 已提交
816 817 818 819
	}
	atomic_set(&dest->conn_flags, conn_flags);

	/* bind the service */
820 821
	old_svc = rcu_dereference_protected(dest->svc, 1);
	if (!old_svc) {
L
Linus Torvalds 已提交
822 823
		__ip_vs_bind_svc(dest, svc);
	} else {
824
		if (old_svc != svc) {
L
Linus Torvalds 已提交
825 826
			ip_vs_zero_stats(&dest->stats);
			__ip_vs_bind_svc(dest, svc);
827
			__ip_vs_svc_put(old_svc, true);
L
Linus Torvalds 已提交
828 829 830 831 832 833 834 835 836 837
		}
	}

	/* set the dest status flags */
	dest->flags |= IP_VS_DEST_F_AVAILABLE;

	if (udest->u_threshold == 0 || udest->u_threshold > dest->u_threshold)
		dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
	dest->u_threshold = udest->u_threshold;
	dest->l_threshold = udest->l_threshold;
838

839 840
	dest->af = udest->af;

841
	spin_lock_bh(&dest->dst_lock);
842
	__ip_vs_dst_cache_reset(dest);
843
	spin_unlock_bh(&dest->dst_lock);
844

845
	if (add) {
J
Julian Anastasov 已提交
846
		ip_vs_start_estimator(svc->net, &dest->stats);
J
Julian Anastasov 已提交
847
		list_add_rcu(&dest->n_list, &svc->destinations);
848
		svc->num_dests++;
849 850
		sched = rcu_dereference_protected(svc->scheduler, 1);
		if (sched && sched->add_dest)
J
Julian Anastasov 已提交
851
			sched->add_dest(svc, dest);
852
	} else {
853 854
		sched = rcu_dereference_protected(svc->scheduler, 1);
		if (sched && sched->upd_dest)
J
Julian Anastasov 已提交
855
			sched->upd_dest(svc, dest);
856
	}
L
Linus Torvalds 已提交
857 858 859 860 861 862 863
}


/*
 *	Create a destination for the given service
 */
static int
864
ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest,
L
Linus Torvalds 已提交
865 866 867
	       struct ip_vs_dest **dest_p)
{
	struct ip_vs_dest *dest;
868
	unsigned int atype, i;
L
Linus Torvalds 已提交
869 870 871

	EnterFunction(2);

872
#ifdef CONFIG_IP_VS_IPV6
873
	if (udest->af == AF_INET6) {
874
		atype = ipv6_addr_type(&udest->addr.in6);
875 876
		if ((!(atype & IPV6_ADDR_UNICAST) ||
			atype & IPV6_ADDR_LINKLOCAL) &&
877
			!__ip_vs_addr_is_local_v6(svc->net, &udest->addr.in6))
878 879 880 881
			return -EINVAL;
	} else
#endif
	{
882
		atype = inet_addr_type(svc->net, udest->addr.ip);
883 884 885
		if (atype != RTN_LOCAL && atype != RTN_UNICAST)
			return -EINVAL;
	}
L
Linus Torvalds 已提交
886

887
	dest = kzalloc(sizeof(struct ip_vs_dest), GFP_KERNEL);
888
	if (dest == NULL)
L
Linus Torvalds 已提交
889
		return -ENOMEM;
890

891
	dest->stats.cpustats = alloc_percpu(struct ip_vs_cpu_stats);
892
	if (!dest->stats.cpustats)
893
		goto err_alloc;
L
Linus Torvalds 已提交
894

895 896 897 898 899 900
	for_each_possible_cpu(i) {
		struct ip_vs_cpu_stats *ip_vs_dest_stats;
		ip_vs_dest_stats = per_cpu_ptr(dest->stats.cpustats, i);
		u64_stats_init(&ip_vs_dest_stats->syncp);
	}

901
	dest->af = udest->af;
L
Linus Torvalds 已提交
902
	dest->protocol = svc->protocol;
903
	dest->vaddr = svc->addr;
L
Linus Torvalds 已提交
904 905
	dest->vport = svc->port;
	dest->vfwmark = svc->fwmark;
906
	ip_vs_addr_copy(udest->af, &dest->addr, &udest->addr);
L
Linus Torvalds 已提交
907 908 909 910 911
	dest->port = udest->port;

	atomic_set(&dest->activeconns, 0);
	atomic_set(&dest->inactconns, 0);
	atomic_set(&dest->persistconns, 0);
912
	atomic_set(&dest->refcnt, 1);
L
Linus Torvalds 已提交
913

914
	INIT_HLIST_NODE(&dest->d_list);
L
Linus Torvalds 已提交
915 916
	spin_lock_init(&dest->dst_lock);
	spin_lock_init(&dest->stats.lock);
917
	__ip_vs_update_dest(svc, dest, udest, 1);
L
Linus Torvalds 已提交
918 919 920 921 922

	*dest_p = dest;

	LeaveFunction(2);
	return 0;
923 924 925 926

err_alloc:
	kfree(dest);
	return -ENOMEM;
L
Linus Torvalds 已提交
927 928 929 930 931 932 933
}


/*
 *	Add a destination into an existing service
 */
static int
934
ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
L
Linus Torvalds 已提交
935 936
{
	struct ip_vs_dest *dest;
937
	union nf_inet_addr daddr;
A
Al Viro 已提交
938
	__be16 dport = udest->port;
L
Linus Torvalds 已提交
939 940 941 942 943
	int ret;

	EnterFunction(2);

	if (udest->weight < 0) {
944
		pr_err("%s(): server weight less than zero\n", __func__);
L
Linus Torvalds 已提交
945 946 947 948
		return -ERANGE;
	}

	if (udest->l_threshold > udest->u_threshold) {
949 950
		pr_err("%s(): lower threshold is higher than upper threshold\n",
			__func__);
L
Linus Torvalds 已提交
951 952 953
		return -ERANGE;
	}

954
	ip_vs_addr_copy(udest->af, &daddr, &udest->addr);
955

J
Julian Anastasov 已提交
956 957
	/* We use function that requires RCU lock */
	rcu_read_lock();
958
	dest = ip_vs_lookup_dest(svc, udest->af, &daddr, dport);
J
Julian Anastasov 已提交
959
	rcu_read_unlock();
960

L
Linus Torvalds 已提交
961
	if (dest != NULL) {
962
		IP_VS_DBG(1, "%s(): dest already exists\n", __func__);
L
Linus Torvalds 已提交
963 964 965 966 967 968 969
		return -EEXIST;
	}

	/*
	 * Check if the dest already exists in the trash and
	 * is from the same service
	 */
970
	dest = ip_vs_trash_get_dest(svc, udest->af, &daddr, dport);
971

L
Linus Torvalds 已提交
972
	if (dest != NULL) {
973 974
		IP_VS_DBG_BUF(3, "Get destination %s:%u from trash, "
			      "dest->refcnt=%d, service %u/%s:%u\n",
975
			      IP_VS_DBG_ADDR(udest->af, &daddr), ntohs(dport),
976 977 978 979 980
			      atomic_read(&dest->refcnt),
			      dest->vfwmark,
			      IP_VS_DBG_ADDR(svc->af, &dest->vaddr),
			      ntohs(dest->vport));

981 982 983
		__ip_vs_update_dest(svc, dest, udest, 1);
		ret = 0;
	} else {
L
Linus Torvalds 已提交
984
		/*
985
		 * Allocate and initialize the dest structure
L
Linus Torvalds 已提交
986
		 */
987
		ret = ip_vs_new_dest(svc, udest, &dest);
L
Linus Torvalds 已提交
988 989 990
	}
	LeaveFunction(2);

991
	return ret;
L
Linus Torvalds 已提交
992 993 994 995 996 997 998
}


/*
 *	Edit a destination in the given service
 */
static int
999
ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
L
Linus Torvalds 已提交
1000 1001
{
	struct ip_vs_dest *dest;
1002
	union nf_inet_addr daddr;
A
Al Viro 已提交
1003
	__be16 dport = udest->port;
L
Linus Torvalds 已提交
1004 1005 1006 1007

	EnterFunction(2);

	if (udest->weight < 0) {
1008
		pr_err("%s(): server weight less than zero\n", __func__);
L
Linus Torvalds 已提交
1009 1010 1011 1012
		return -ERANGE;
	}

	if (udest->l_threshold > udest->u_threshold) {
1013 1014
		pr_err("%s(): lower threshold is higher than upper threshold\n",
			__func__);
L
Linus Torvalds 已提交
1015 1016 1017
		return -ERANGE;
	}

1018
	ip_vs_addr_copy(udest->af, &daddr, &udest->addr);
1019

J
Julian Anastasov 已提交
1020 1021
	/* We use function that requires RCU lock */
	rcu_read_lock();
1022
	dest = ip_vs_lookup_dest(svc, udest->af, &daddr, dport);
J
Julian Anastasov 已提交
1023
	rcu_read_unlock();
1024

L
Linus Torvalds 已提交
1025
	if (dest == NULL) {
1026
		IP_VS_DBG(1, "%s(): dest doesn't exist\n", __func__);
L
Linus Torvalds 已提交
1027 1028 1029
		return -ENOENT;
	}

1030
	__ip_vs_update_dest(svc, dest, udest, 0);
L
Linus Torvalds 已提交
1031 1032 1033 1034 1035 1036 1037 1038
	LeaveFunction(2);

	return 0;
}

/*
 *	Delete a destination (must be already unlinked from the service)
 */
J
Julian Anastasov 已提交
1039 1040
static void __ip_vs_del_dest(struct net *net, struct ip_vs_dest *dest,
			     bool cleanup)
L
Linus Torvalds 已提交
1041
{
1042 1043
	struct netns_ipvs *ipvs = net_ipvs(net);

1044
	ip_vs_stop_estimator(net, &dest->stats);
L
Linus Torvalds 已提交
1045 1046 1047 1048 1049 1050

	/*
	 *  Remove it from the d-linked list with the real services.
	 */
	ip_vs_rs_unhash(dest);

J
Julian Anastasov 已提交
1051 1052 1053 1054 1055 1056
	spin_lock_bh(&ipvs->dest_trash_lock);
	IP_VS_DBG_BUF(3, "Moving dest %s:%u into trash, dest->refcnt=%d\n",
		      IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port),
		      atomic_read(&dest->refcnt));
	if (list_empty(&ipvs->dest_trash) && !cleanup)
		mod_timer(&ipvs->dest_trash_timer,
1057
			  jiffies + (IP_VS_DEST_TRASH_PERIOD >> 1));
J
Julian Anastasov 已提交
1058 1059
	/* dest lives in trash without reference */
	list_add(&dest->t_list, &ipvs->dest_trash);
1060
	dest->idle_start = 0;
J
Julian Anastasov 已提交
1061 1062
	spin_unlock_bh(&ipvs->dest_trash_lock);
	ip_vs_dest_put(dest);
L
Linus Torvalds 已提交
1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077
}


/*
 *	Unlink a destination from the given service
 */
static void __ip_vs_unlink_dest(struct ip_vs_service *svc,
				struct ip_vs_dest *dest,
				int svcupd)
{
	dest->flags &= ~IP_VS_DEST_F_AVAILABLE;

	/*
	 *  Remove it from the d-linked destination list.
	 */
J
Julian Anastasov 已提交
1078
	list_del_rcu(&dest->n_list);
L
Linus Torvalds 已提交
1079
	svc->num_dests--;
1080

1081 1082 1083
	if (dest->af != svc->af)
		net_ipvs(svc->net)->mixed_address_family_dests--;

J
Julian Anastasov 已提交
1084 1085
	if (svcupd) {
		struct ip_vs_scheduler *sched;
1086

J
Julian Anastasov 已提交
1087
		sched = rcu_dereference_protected(svc->scheduler, 1);
1088
		if (sched && sched->del_dest)
J
Julian Anastasov 已提交
1089 1090
			sched->del_dest(svc, dest);
	}
L
Linus Torvalds 已提交
1091 1092 1093 1094 1095 1096 1097
}


/*
 *	Delete a destination server in the given service
 */
static int
1098
ip_vs_del_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
L
Linus Torvalds 已提交
1099 1100
{
	struct ip_vs_dest *dest;
A
Al Viro 已提交
1101
	__be16 dport = udest->port;
L
Linus Torvalds 已提交
1102 1103 1104

	EnterFunction(2);

J
Julian Anastasov 已提交
1105 1106
	/* We use function that requires RCU lock */
	rcu_read_lock();
1107
	dest = ip_vs_lookup_dest(svc, udest->af, &udest->addr, dport);
J
Julian Anastasov 已提交
1108
	rcu_read_unlock();
1109

L
Linus Torvalds 已提交
1110
	if (dest == NULL) {
1111
		IP_VS_DBG(1, "%s(): destination not found!\n", __func__);
L
Linus Torvalds 已提交
1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122
		return -ENOENT;
	}

	/*
	 *	Unlink dest from the service
	 */
	__ip_vs_unlink_dest(svc, dest, 1);

	/*
	 *	Delete the destination
	 */
J
Julian Anastasov 已提交
1123
	__ip_vs_del_dest(svc->net, dest, false);
L
Linus Torvalds 已提交
1124 1125 1126 1127 1128 1129

	LeaveFunction(2);

	return 0;
}

J
Julian Anastasov 已提交
1130 1131 1132 1133 1134
static void ip_vs_dest_trash_expire(unsigned long data)
{
	struct net *net = (struct net *) data;
	struct netns_ipvs *ipvs = net_ipvs(net);
	struct ip_vs_dest *dest, *next;
1135
	unsigned long now = jiffies;
J
Julian Anastasov 已提交
1136 1137 1138 1139 1140

	spin_lock(&ipvs->dest_trash_lock);
	list_for_each_entry_safe(dest, next, &ipvs->dest_trash, t_list) {
		if (atomic_read(&dest->refcnt) > 0)
			continue;
1141 1142 1143 1144 1145 1146 1147 1148
		if (dest->idle_start) {
			if (time_before(now, dest->idle_start +
					     IP_VS_DEST_TRASH_PERIOD))
				continue;
		} else {
			dest->idle_start = max(1UL, now);
			continue;
		}
J
Julian Anastasov 已提交
1149 1150
		IP_VS_DBG_BUF(3, "Removing destination %u/%s:%u from trash\n",
			      dest->vfwmark,
1151
			      IP_VS_DBG_ADDR(dest->af, &dest->addr),
J
Julian Anastasov 已提交
1152 1153 1154 1155 1156 1157
			      ntohs(dest->port));
		list_del(&dest->t_list);
		ip_vs_dest_free(dest);
	}
	if (!list_empty(&ipvs->dest_trash))
		mod_timer(&ipvs->dest_trash_timer,
1158
			  jiffies + (IP_VS_DEST_TRASH_PERIOD >> 1));
J
Julian Anastasov 已提交
1159 1160
	spin_unlock(&ipvs->dest_trash_lock);
}
L
Linus Torvalds 已提交
1161 1162 1163 1164 1165

/*
 *	Add a service into the service hash table
 */
static int
1166
ip_vs_add_service(struct net *net, struct ip_vs_service_user_kern *u,
1167
		  struct ip_vs_service **svc_p)
L
Linus Torvalds 已提交
1168
{
1169
	int ret = 0, i;
L
Linus Torvalds 已提交
1170
	struct ip_vs_scheduler *sched = NULL;
1171
	struct ip_vs_pe *pe = NULL;
L
Linus Torvalds 已提交
1172
	struct ip_vs_service *svc = NULL;
1173
	struct netns_ipvs *ipvs = net_ipvs(net);
L
Linus Torvalds 已提交
1174 1175 1176 1177 1178

	/* increase the module use count */
	ip_vs_use_count_inc();

	/* Lookup the scheduler by 'u->sched_name' */
1179 1180 1181 1182 1183 1184 1185 1186
	if (strcmp(u->sched_name, "none")) {
		sched = ip_vs_scheduler_get(u->sched_name);
		if (!sched) {
			pr_info("Scheduler module ip_vs_%s not found\n",
				u->sched_name);
			ret = -ENOENT;
			goto out_err;
		}
L
Linus Torvalds 已提交
1187 1188
	}

1189
	if (u->pe_name && *u->pe_name) {
1190
		pe = ip_vs_pe_getbyname(u->pe_name);
1191 1192 1193 1194 1195 1196 1197 1198
		if (pe == NULL) {
			pr_info("persistence engine module ip_vs_pe_%s "
				"not found\n", u->pe_name);
			ret = -ENOENT;
			goto out_err;
		}
	}

1199
#ifdef CONFIG_IP_VS_IPV6
1200 1201 1202 1203 1204 1205 1206
	if (u->af == AF_INET6) {
		__u32 plen = (__force __u32) u->netmask;

		if (plen < 1 || plen > 128) {
			ret = -EINVAL;
			goto out_err;
		}
1207 1208 1209
	}
#endif

1210
	svc = kzalloc(sizeof(struct ip_vs_service), GFP_KERNEL);
L
Linus Torvalds 已提交
1211
	if (svc == NULL) {
1212
		IP_VS_DBG(1, "%s(): no memory\n", __func__);
L
Linus Torvalds 已提交
1213 1214 1215
		ret = -ENOMEM;
		goto out_err;
	}
1216
	svc->stats.cpustats = alloc_percpu(struct ip_vs_cpu_stats);
J
Julia Lawall 已提交
1217 1218
	if (!svc->stats.cpustats) {
		ret = -ENOMEM;
1219
		goto out_err;
J
Julia Lawall 已提交
1220
	}
L
Linus Torvalds 已提交
1221

1222 1223 1224 1225 1226 1227 1228
	for_each_possible_cpu(i) {
		struct ip_vs_cpu_stats *ip_vs_stats;
		ip_vs_stats = per_cpu_ptr(svc->stats.cpustats, i);
		u64_stats_init(&ip_vs_stats->syncp);
	}


L
Linus Torvalds 已提交
1229 1230 1231
	/* I'm the first user of the service */
	atomic_set(&svc->refcnt, 0);

1232
	svc->af = u->af;
L
Linus Torvalds 已提交
1233
	svc->protocol = u->protocol;
1234
	ip_vs_addr_copy(svc->af, &svc->addr, &u->addr);
L
Linus Torvalds 已提交
1235 1236 1237 1238 1239
	svc->port = u->port;
	svc->fwmark = u->fwmark;
	svc->flags = u->flags;
	svc->timeout = u->timeout * HZ;
	svc->netmask = u->netmask;
1240
	svc->net = net;
L
Linus Torvalds 已提交
1241 1242

	INIT_LIST_HEAD(&svc->destinations);
1243
	spin_lock_init(&svc->sched_lock);
L
Linus Torvalds 已提交
1244 1245 1246
	spin_lock_init(&svc->stats.lock);

	/* Bind the scheduler */
1247 1248 1249 1250 1251 1252
	if (sched) {
		ret = ip_vs_bind_scheduler(svc, sched);
		if (ret)
			goto out_err;
		sched = NULL;
	}
L
Linus Torvalds 已提交
1253

1254
	/* Bind the ct retriever */
J
Julian Anastasov 已提交
1255
	RCU_INIT_POINTER(svc->pe, pe);
1256 1257
	pe = NULL;

L
Linus Torvalds 已提交
1258 1259
	/* Update the virtual service counters */
	if (svc->port == FTPPORT)
1260
		atomic_inc(&ipvs->ftpsvc_counter);
L
Linus Torvalds 已提交
1261
	else if (svc->port == 0)
1262
		atomic_inc(&ipvs->nullsvc_counter);
L
Linus Torvalds 已提交
1263

1264
	ip_vs_start_estimator(net, &svc->stats);
1265 1266 1267

	/* Count only IPv4 services for old get/setsockopt interface */
	if (svc->af == AF_INET)
1268
		ipvs->num_services++;
L
Linus Torvalds 已提交
1269 1270 1271 1272 1273

	/* Hash the service into the service table */
	ip_vs_svc_hash(svc);

	*svc_p = svc;
1274 1275
	/* Now there is a service - full throttle */
	ipvs->enable = 1;
L
Linus Torvalds 已提交
1276 1277
	return 0;

1278

1279
 out_err:
L
Linus Torvalds 已提交
1280
	if (svc != NULL) {
J
Julian Anastasov 已提交
1281 1282
		ip_vs_unbind_scheduler(svc, sched);
		ip_vs_service_free(svc);
L
Linus Torvalds 已提交
1283 1284
	}
	ip_vs_scheduler_put(sched);
1285
	ip_vs_pe_put(pe);
L
Linus Torvalds 已提交
1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297

	/* decrease the module use count */
	ip_vs_use_count_dec();

	return ret;
}


/*
 *	Edit a service and bind it with a new scheduler
 */
static int
1298
ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u)
L
Linus Torvalds 已提交
1299
{
1300
	struct ip_vs_scheduler *sched = NULL, *old_sched;
1301
	struct ip_vs_pe *pe = NULL, *old_pe = NULL;
L
Linus Torvalds 已提交
1302 1303 1304 1305 1306
	int ret = 0;

	/*
	 * Lookup the scheduler, by 'u->sched_name'
	 */
1307 1308 1309 1310 1311 1312 1313
	if (strcmp(u->sched_name, "none")) {
		sched = ip_vs_scheduler_get(u->sched_name);
		if (!sched) {
			pr_info("Scheduler module ip_vs_%s not found\n",
				u->sched_name);
			return -ENOENT;
		}
L
Linus Torvalds 已提交
1314 1315 1316
	}
	old_sched = sched;

1317
	if (u->pe_name && *u->pe_name) {
1318
		pe = ip_vs_pe_getbyname(u->pe_name);
1319 1320 1321 1322 1323 1324 1325 1326 1327
		if (pe == NULL) {
			pr_info("persistence engine module ip_vs_pe_%s "
				"not found\n", u->pe_name);
			ret = -ENOENT;
			goto out;
		}
		old_pe = pe;
	}

1328
#ifdef CONFIG_IP_VS_IPV6
1329 1330 1331 1332 1333 1334 1335
	if (u->af == AF_INET6) {
		__u32 plen = (__force __u32) u->netmask;

		if (plen < 1 || plen > 128) {
			ret = -EINVAL;
			goto out;
		}
1336 1337 1338
	}
#endif

J
Julian Anastasov 已提交
1339 1340
	old_sched = rcu_dereference_protected(svc->scheduler, 1);
	if (sched != old_sched) {
1341 1342 1343 1344 1345 1346
		if (old_sched) {
			ip_vs_unbind_scheduler(svc, old_sched);
			RCU_INIT_POINTER(svc->scheduler, NULL);
			/* Wait all svc->sched_data users */
			synchronize_rcu();
		}
J
Julian Anastasov 已提交
1347
		/* Bind the new scheduler */
1348 1349 1350 1351 1352 1353
		if (sched) {
			ret = ip_vs_bind_scheduler(svc, sched);
			if (ret) {
				ip_vs_scheduler_put(sched);
				goto out;
			}
J
Julian Anastasov 已提交
1354 1355
		}
	}
L
Linus Torvalds 已提交
1356 1357 1358 1359 1360 1361 1362 1363

	/*
	 * Set the flags and timeout value
	 */
	svc->flags = u->flags | IP_VS_SVC_F_HASHED;
	svc->timeout = u->timeout * HZ;
	svc->netmask = u->netmask;

J
Julian Anastasov 已提交
1364 1365 1366
	old_pe = rcu_dereference_protected(svc->pe, 1);
	if (pe != old_pe)
		rcu_assign_pointer(svc->pe, pe);
L
Linus Torvalds 已提交
1367

H
Hans Schillstrom 已提交
1368
out:
1369
	ip_vs_scheduler_put(old_sched);
1370
	ip_vs_pe_put(old_pe);
L
Linus Torvalds 已提交
1371 1372 1373 1374 1375 1376 1377 1378
	return ret;
}

/*
 *	Delete a service from the service list
 *	- The service must be unlinked, unlocked and not referenced!
 *	- We are called under _bh lock
 */
J
Julian Anastasov 已提交
1379
static void __ip_vs_del_service(struct ip_vs_service *svc, bool cleanup)
L
Linus Torvalds 已提交
1380 1381 1382
{
	struct ip_vs_dest *dest, *nxt;
	struct ip_vs_scheduler *old_sched;
1383
	struct ip_vs_pe *old_pe;
1384
	struct netns_ipvs *ipvs = net_ipvs(svc->net);
1385 1386

	pr_info("%s: enter\n", __func__);
L
Linus Torvalds 已提交
1387

1388 1389
	/* Count only IPv4 services for old get/setsockopt interface */
	if (svc->af == AF_INET)
1390
		ipvs->num_services--;
1391

1392
	ip_vs_stop_estimator(svc->net, &svc->stats);
L
Linus Torvalds 已提交
1393 1394

	/* Unbind scheduler */
J
Julian Anastasov 已提交
1395 1396
	old_sched = rcu_dereference_protected(svc->scheduler, 1);
	ip_vs_unbind_scheduler(svc, old_sched);
1397
	ip_vs_scheduler_put(old_sched);
L
Linus Torvalds 已提交
1398

J
Julian Anastasov 已提交
1399 1400
	/* Unbind persistence engine, keep svc->pe */
	old_pe = rcu_dereference_protected(svc->pe, 1);
1401 1402
	ip_vs_pe_put(old_pe);

L
Linus Torvalds 已提交
1403 1404 1405 1406 1407
	/*
	 *    Unlink the whole destination list
	 */
	list_for_each_entry_safe(dest, nxt, &svc->destinations, n_list) {
		__ip_vs_unlink_dest(svc, dest, 0);
J
Julian Anastasov 已提交
1408
		__ip_vs_del_dest(svc->net, dest, cleanup);
L
Linus Torvalds 已提交
1409 1410 1411 1412 1413 1414
	}

	/*
	 *    Update the virtual service counters
	 */
	if (svc->port == FTPPORT)
1415
		atomic_dec(&ipvs->ftpsvc_counter);
L
Linus Torvalds 已提交
1416
	else if (svc->port == 0)
1417
		atomic_dec(&ipvs->nullsvc_counter);
L
Linus Torvalds 已提交
1418 1419 1420 1421

	/*
	 *    Free the service if nobody refers to it
	 */
1422
	__ip_vs_svc_put(svc, true);
L
Linus Torvalds 已提交
1423 1424 1425 1426 1427 1428

	/* decrease the module use count */
	ip_vs_use_count_dec();
}

/*
1429
 * Unlink a service from list and try to delete it if its refcnt reached 0
L
Linus Torvalds 已提交
1430
 */
J
Julian Anastasov 已提交
1431
static void ip_vs_unlink_service(struct ip_vs_service *svc, bool cleanup)
L
Linus Torvalds 已提交
1432
{
J
Julian Anastasov 已提交
1433 1434
	/* Hold svc to avoid double release from dest_trash */
	atomic_inc(&svc->refcnt);
L
Linus Torvalds 已提交
1435 1436 1437 1438 1439
	/*
	 * Unhash it from the service table
	 */
	ip_vs_svc_unhash(svc);

J
Julian Anastasov 已提交
1440
	__ip_vs_del_service(svc, cleanup);
1441 1442 1443 1444 1445 1446 1447 1448 1449
}

/*
 *	Delete a service from the service list
 */
static int ip_vs_del_service(struct ip_vs_service *svc)
{
	if (svc == NULL)
		return -EEXIST;
J
Julian Anastasov 已提交
1450
	ip_vs_unlink_service(svc, false);
L
Linus Torvalds 已提交
1451 1452 1453 1454 1455 1456 1457 1458

	return 0;
}


/*
 *	Flush all the virtual services
 */
J
Julian Anastasov 已提交
1459
static int ip_vs_flush(struct net *net, bool cleanup)
L
Linus Torvalds 已提交
1460 1461
{
	int idx;
J
Julian Anastasov 已提交
1462 1463
	struct ip_vs_service *svc;
	struct hlist_node *n;
L
Linus Torvalds 已提交
1464 1465

	/*
1466
	 * Flush the service table hashed by <netns,protocol,addr,port>
L
Linus Torvalds 已提交
1467 1468
	 */
	for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
J
Julian Anastasov 已提交
1469 1470
		hlist_for_each_entry_safe(svc, n, &ip_vs_svc_table[idx],
					  s_list) {
1471
			if (net_eq(svc->net, net))
J
Julian Anastasov 已提交
1472
				ip_vs_unlink_service(svc, cleanup);
L
Linus Torvalds 已提交
1473 1474 1475 1476 1477 1478 1479
		}
	}

	/*
	 * Flush the service table hashed by fwmark
	 */
	for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
J
Julian Anastasov 已提交
1480 1481
		hlist_for_each_entry_safe(svc, n, &ip_vs_svc_fwm_table[idx],
					  f_list) {
1482
			if (net_eq(svc->net, net))
J
Julian Anastasov 已提交
1483
				ip_vs_unlink_service(svc, cleanup);
L
Linus Torvalds 已提交
1484 1485 1486 1487 1488 1489
		}
	}

	return 0;
}

1490 1491 1492 1493
/*
 *	Delete service by {netns} in the service table.
 *	Called by __ip_vs_cleanup()
 */
1494
void ip_vs_service_net_cleanup(struct net *net)
1495 1496 1497 1498
{
	EnterFunction(2);
	/* Check for "full" addressed entries */
	mutex_lock(&__ip_vs_mutex);
J
Julian Anastasov 已提交
1499
	ip_vs_flush(net, true);
1500 1501 1502
	mutex_unlock(&__ip_vs_mutex);
	LeaveFunction(2);
}
1503 1504

/* Put all references for device (dst_cache) */
1505
static inline void
1506
ip_vs_forget_dev(struct ip_vs_dest *dest, struct net_device *dev)
1507
{
1508 1509
	struct ip_vs_dest_dst *dest_dst;

1510
	spin_lock_bh(&dest->dst_lock);
1511 1512
	dest_dst = rcu_dereference_protected(dest->dest_dst, 1);
	if (dest_dst && dest_dst->dst_cache->dev == dev) {
1513 1514 1515 1516 1517
		IP_VS_DBG_BUF(3, "Reset dev:%s dest %s:%u ,dest->refcnt=%d\n",
			      dev->name,
			      IP_VS_DBG_ADDR(dest->af, &dest->addr),
			      ntohs(dest->port),
			      atomic_read(&dest->refcnt));
1518
		__ip_vs_dst_cache_reset(dest);
1519 1520 1521 1522
	}
	spin_unlock_bh(&dest->dst_lock);

}
1523 1524
/* Netdev event receiver
 * Currently only NETDEV_DOWN is handled to release refs to cached dsts
1525 1526
 */
static int ip_vs_dst_event(struct notifier_block *this, unsigned long event,
1527
			   void *ptr)
1528
{
1529
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
1530
	struct net *net = dev_net(dev);
1531
	struct netns_ipvs *ipvs = net_ipvs(net);
1532 1533 1534 1535
	struct ip_vs_service *svc;
	struct ip_vs_dest *dest;
	unsigned int idx;

1536
	if (event != NETDEV_DOWN || !ipvs)
1537 1538 1539 1540 1541
		return NOTIFY_DONE;
	IP_VS_DBG(3, "%s() dev=%s\n", __func__, dev->name);
	EnterFunction(2);
	mutex_lock(&__ip_vs_mutex);
	for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
J
Julian Anastasov 已提交
1542
		hlist_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
1543 1544 1545
			if (net_eq(svc->net, net)) {
				list_for_each_entry(dest, &svc->destinations,
						    n_list) {
1546
					ip_vs_forget_dev(dest, dev);
1547 1548 1549 1550
				}
			}
		}

J
Julian Anastasov 已提交
1551
		hlist_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
1552 1553 1554
			if (net_eq(svc->net, net)) {
				list_for_each_entry(dest, &svc->destinations,
						    n_list) {
1555
					ip_vs_forget_dev(dest, dev);
1556 1557 1558 1559 1560 1561
				}
			}

		}
	}

J
Julian Anastasov 已提交
1562 1563
	spin_lock_bh(&ipvs->dest_trash_lock);
	list_for_each_entry(dest, &ipvs->dest_trash, t_list) {
1564
		ip_vs_forget_dev(dest, dev);
1565
	}
J
Julian Anastasov 已提交
1566
	spin_unlock_bh(&ipvs->dest_trash_lock);
1567 1568 1569 1570
	mutex_unlock(&__ip_vs_mutex);
	LeaveFunction(2);
	return NOTIFY_DONE;
}
L
Linus Torvalds 已提交
1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585

/*
 *	Zero counters in a service or all services
 */
static int ip_vs_zero_service(struct ip_vs_service *svc)
{
	struct ip_vs_dest *dest;

	list_for_each_entry(dest, &svc->destinations, n_list) {
		ip_vs_zero_stats(&dest->stats);
	}
	ip_vs_zero_stats(&svc->stats);
	return 0;
}

1586
static int ip_vs_zero_all(struct net *net)
L
Linus Torvalds 已提交
1587 1588 1589 1590 1591
{
	int idx;
	struct ip_vs_service *svc;

	for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
J
Julian Anastasov 已提交
1592
		hlist_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
1593 1594
			if (net_eq(svc->net, net))
				ip_vs_zero_service(svc);
L
Linus Torvalds 已提交
1595 1596 1597 1598
		}
	}

	for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
J
Julian Anastasov 已提交
1599
		hlist_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
1600 1601
			if (net_eq(svc->net, net))
				ip_vs_zero_service(svc);
L
Linus Torvalds 已提交
1602 1603 1604
		}
	}

J
Julian Anastasov 已提交
1605
	ip_vs_zero_stats(&net_ipvs(net)->tot_stats);
L
Linus Torvalds 已提交
1606 1607 1608
	return 0;
}

1609
#ifdef CONFIG_SYSCTL
1610 1611 1612 1613

static int zero;
static int three = 3;

L
Linus Torvalds 已提交
1614
static int
1615
proc_do_defense_mode(struct ctl_table *table, int write,
L
Linus Torvalds 已提交
1616 1617
		     void __user *buffer, size_t *lenp, loff_t *ppos)
{
1618
	struct net *net = current->nsproxy->net_ns;
L
Linus Torvalds 已提交
1619 1620 1621 1622
	int *valp = table->data;
	int val = *valp;
	int rc;

1623
	rc = proc_dointvec(table, write, buffer, lenp, ppos);
L
Linus Torvalds 已提交
1624 1625 1626 1627 1628
	if (write && (*valp != val)) {
		if ((*valp < 0) || (*valp > 3)) {
			/* Restore the correct value */
			*valp = val;
		} else {
1629
			update_defense_level(net_ipvs(net));
L
Linus Torvalds 已提交
1630 1631 1632 1633 1634 1635
		}
	}
	return rc;
}

static int
1636
proc_do_sync_threshold(struct ctl_table *table, int write,
L
Linus Torvalds 已提交
1637 1638 1639 1640 1641 1642 1643 1644 1645
		       void __user *buffer, size_t *lenp, loff_t *ppos)
{
	int *valp = table->data;
	int val[2];
	int rc;

	/* backup the value first */
	memcpy(val, valp, sizeof(val));

1646
	rc = proc_dointvec(table, write, buffer, lenp, ppos);
1647 1648
	if (write && (valp[0] < 0 || valp[1] < 0 ||
	    (valp[0] >= valp[1] && valp[1]))) {
L
Linus Torvalds 已提交
1649 1650 1651 1652 1653 1654
		/* Restore the correct value */
		memcpy(valp, val, sizeof(val));
	}
	return rc;
}

1655
static int
1656
proc_do_sync_mode(struct ctl_table *table, int write,
1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667
		     void __user *buffer, size_t *lenp, loff_t *ppos)
{
	int *valp = table->data;
	int val = *valp;
	int rc;

	rc = proc_dointvec(table, write, buffer, lenp, ppos);
	if (write && (*valp != val)) {
		if ((*valp < 0) || (*valp > 1)) {
			/* Restore the correct value */
			*valp = val;
1668 1669 1670 1671 1672 1673
		}
	}
	return rc;
}

static int
1674
proc_do_sync_ports(struct ctl_table *table, int write,
1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685
		   void __user *buffer, size_t *lenp, loff_t *ppos)
{
	int *valp = table->data;
	int val = *valp;
	int rc;

	rc = proc_dointvec(table, write, buffer, lenp, ppos);
	if (write && (*valp != val)) {
		if (*valp < 1 || !is_power_of_2(*valp)) {
			/* Restore the correct value */
			*valp = val;
1686 1687 1688 1689
		}
	}
	return rc;
}
L
Linus Torvalds 已提交
1690 1691 1692

/*
 *	IPVS sysctl table (under the /proc/sys/net/ipv4/vs/)
1693
 *	Do not change order or insert new entries without
1694
 *	align with netns init in ip_vs_control_net_init()
L
Linus Torvalds 已提交
1695 1696 1697 1698 1699 1700 1701
 */

static struct ctl_table vs_vars[] = {
	{
		.procname	= "amemthresh",
		.maxlen		= sizeof(int),
		.mode		= 0644,
A
Alexey Dobriyan 已提交
1702
		.proc_handler	= proc_dointvec,
L
Linus Torvalds 已提交
1703 1704 1705 1706 1707
	},
	{
		.procname	= "am_droprate",
		.maxlen		= sizeof(int),
		.mode		= 0644,
A
Alexey Dobriyan 已提交
1708
		.proc_handler	= proc_dointvec,
L
Linus Torvalds 已提交
1709 1710 1711 1712 1713
	},
	{
		.procname	= "drop_entry",
		.maxlen		= sizeof(int),
		.mode		= 0644,
A
Alexey Dobriyan 已提交
1714
		.proc_handler	= proc_do_defense_mode,
L
Linus Torvalds 已提交
1715 1716 1717 1718 1719
	},
	{
		.procname	= "drop_packet",
		.maxlen		= sizeof(int),
		.mode		= 0644,
A
Alexey Dobriyan 已提交
1720
		.proc_handler	= proc_do_defense_mode,
L
Linus Torvalds 已提交
1721
	},
1722 1723 1724 1725 1726 1727 1728 1729
#ifdef CONFIG_IP_VS_NFCT
	{
		.procname	= "conntrack",
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
#endif
L
Linus Torvalds 已提交
1730 1731 1732 1733
	{
		.procname	= "secure_tcp",
		.maxlen		= sizeof(int),
		.mode		= 0644,
A
Alexey Dobriyan 已提交
1734
		.proc_handler	= proc_do_defense_mode,
L
Linus Torvalds 已提交
1735
	},
1736 1737 1738 1739 1740 1741
	{
		.procname	= "snat_reroute",
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
1742 1743 1744 1745 1746 1747
	{
		.procname	= "sync_version",
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_do_sync_mode,
	},
1748 1749 1750 1751 1752 1753
	{
		.procname	= "sync_ports",
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_do_sync_ports,
	},
1754 1755 1756 1757 1758 1759
	{
		.procname	= "sync_persist_mode",
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
P
Pablo Neira Ayuso 已提交
1760 1761
	{
		.procname	= "sync_qlen_max",
1762
		.maxlen		= sizeof(unsigned long),
P
Pablo Neira Ayuso 已提交
1763
		.mode		= 0644,
1764
		.proc_handler	= proc_doulongvec_minmax,
P
Pablo Neira Ayuso 已提交
1765 1766 1767 1768 1769 1770 1771
	},
	{
		.procname	= "sync_sock_size",
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783
	{
		.procname	= "cache_bypass",
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "expire_nodest_conn",
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
A
Alexander Frolkin 已提交
1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795
	{
		.procname	= "sloppy_tcp",
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "sloppy_sctp",
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808
	{
		.procname	= "expire_quiescent_template",
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "sync_threshold",
		.maxlen		=
			sizeof(((struct netns_ipvs *)0)->sysctl_sync_threshold),
		.mode		= 0644,
		.proc_handler	= proc_do_sync_threshold,
	},
1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822
	{
		.procname	= "sync_refresh_period",
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "sync_retries",
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= &zero,
		.extra2		= &three,
	},
1823 1824 1825 1826 1827 1828
	{
		.procname	= "nat_icmp_send",
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
1829 1830 1831 1832 1833 1834
	{
		.procname	= "pmtu_disc",
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
1835 1836 1837 1838 1839 1840
	{
		.procname	= "backup_only",
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
1841 1842 1843 1844 1845 1846
	{
		.procname	= "conn_reuse_mode",
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
1847 1848 1849 1850 1851 1852 1853 1854
#ifdef CONFIG_IP_VS_DEBUG
	{
		.procname	= "debug_level",
		.data		= &sysctl_ip_vs_debug_level,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
L
Linus Torvalds 已提交
1855
#endif
1856
	{ }
L
Linus Torvalds 已提交
1857 1858
};

1859
#endif
L
Linus Torvalds 已提交
1860 1861 1862 1863

#ifdef CONFIG_PROC_FS

struct ip_vs_iter {
1864
	struct seq_net_private p;  /* Do not move this, netns depends upon it*/
J
Julian Anastasov 已提交
1865
	struct hlist_head *table;
L
Linus Torvalds 已提交
1866 1867 1868 1869 1870 1871 1872
	int bucket;
};

/*
 *	Write the contents of the VS rule table to a PROCfs file.
 *	(It is kept just for backward compatibility)
 */
1873
static inline const char *ip_vs_fwd_name(unsigned int flags)
L
Linus Torvalds 已提交
1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890
{
	switch (flags & IP_VS_CONN_F_FWD_MASK) {
	case IP_VS_CONN_F_LOCALNODE:
		return "Local";
	case IP_VS_CONN_F_TUNNEL:
		return "Tunnel";
	case IP_VS_CONN_F_DROUTE:
		return "Route";
	default:
		return "Masq";
	}
}


/* Get the Nth entry in the two lists
 *
 * Only services of the seq_file's netns are counted. The protocol table is
 * scanned first, then the fwmark table. On a hit the iterator remembers the
 * table and bucket of the returned service. Returns NULL when fewer than
 * @pos + 1 matching services exist.
 */
static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos)
{
	struct net *net = seq_file_net(seq);
	struct ip_vs_iter *iter = seq->private;
	int idx;
	struct ip_vs_service *svc;

	/* look in hash by protocol */
	for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
		hlist_for_each_entry_rcu(svc, &ip_vs_svc_table[idx], s_list) {
			if (net_eq(svc->net, net) && pos-- == 0) {
				iter->table = ip_vs_svc_table;
				iter->bucket = idx;
				return svc;
			}
		}
	}

	/* keep looking in fwmark */
	for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
		hlist_for_each_entry_rcu(svc, &ip_vs_svc_fwm_table[idx],
					 f_list) {
			if (net_eq(svc->net, net) && pos-- == 0) {
				iter->table = ip_vs_svc_fwm_table;
				iter->bucket = idx;
				return svc;
			}
		}
	}

	return NULL;
}

/*
 * seq_file ->start: takes the RCU read lock (released in ->stop) and returns
 * SEQ_START_TOKEN for position 0, otherwise the service at *pos - 1.
 */
static void *ip_vs_info_seq_start(struct seq_file *seq, loff_t *pos)
	__acquires(RCU)
{
	rcu_read_lock();
	return *pos ? ip_vs_info_array(seq, *pos - 1) : SEQ_START_TOKEN;
}


/*
 * seq_file ->next: advance to the service following @v.
 *
 * The rest of the current bucket is tried first, then the following buckets
 * of the same table; after the protocol table is exhausted the walk
 * continues in the fwmark table. Returns NULL at the end.
 *
 * NOTE(review): unlike ip_vs_info_array() this walk does not compare
 * svc->net with the netns of the seq_file.
 */
static void *ip_vs_info_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct hlist_node *e;
	struct ip_vs_iter *iter;
	struct ip_vs_service *svc;

	++*pos;
	if (v == SEQ_START_TOKEN)
		return ip_vs_info_array(seq, 0);

	svc = v;
	iter = seq->private;

	if (iter->table == ip_vs_svc_table) {
		/* next service in table hashed by protocol */
		e = rcu_dereference(hlist_next_rcu(&svc->s_list));
		if (e)
			return hlist_entry(e, struct ip_vs_service, s_list);

		while (++iter->bucket < IP_VS_SVC_TAB_SIZE) {
			/* returns the first entry of the bucket, if any */
			hlist_for_each_entry_rcu(svc,
						 &ip_vs_svc_table[iter->bucket],
						 s_list) {
				return svc;
			}
		}

		/* -1 so that the pre-increment below starts at bucket 0 */
		iter->table = ip_vs_svc_fwm_table;
		iter->bucket = -1;
		goto scan_fwmark;
	}

	/* next service in hashed by fwmark */
	e = rcu_dereference(hlist_next_rcu(&svc->f_list));
	if (e)
		return hlist_entry(e, struct ip_vs_service, f_list);

 scan_fwmark:
	while (++iter->bucket < IP_VS_SVC_TAB_SIZE) {
		hlist_for_each_entry_rcu(svc,
					 &ip_vs_svc_fwm_table[iter->bucket],
					 f_list)
			return svc;
	}

	return NULL;
}

static void ip_vs_info_seq_stop(struct seq_file *seq, void *v)
1979
	__releases(RCU)
L
Linus Torvalds 已提交
1980
{
J
Julian Anastasov 已提交
1981
	rcu_read_unlock();
L
Linus Torvalds 已提交
1982 1983 1984 1985 1986 1987 1988 1989
}


static int ip_vs_info_seq_show(struct seq_file *seq, void *v)
{
	if (v == SEQ_START_TOKEN) {
		seq_printf(seq,
			"IP Virtual Server version %d.%d.%d (size=%d)\n",
1990
			NVERSION(IP_VS_VERSION_CODE), ip_vs_conn_tab_size);
L
Linus Torvalds 已提交
1991 1992 1993 1994 1995 1996 1997 1998
		seq_puts(seq,
			 "Prot LocalAddress:Port Scheduler Flags\n");
		seq_puts(seq,
			 "  -> RemoteAddress:Port Forward Weight ActiveConn InActConn\n");
	} else {
		const struct ip_vs_service *svc = v;
		const struct ip_vs_iter *iter = seq->private;
		const struct ip_vs_dest *dest;
J
Julian Anastasov 已提交
1999
		struct ip_vs_scheduler *sched = rcu_dereference(svc->scheduler);
2000
		char *sched_name = sched ? sched->name : "none";
L
Linus Torvalds 已提交
2001

2002 2003 2004
		if (iter->table == ip_vs_svc_table) {
#ifdef CONFIG_IP_VS_IPV6
			if (svc->af == AF_INET6)
H
Harvey Harrison 已提交
2005
				seq_printf(seq, "%s  [%pI6]:%04X %s ",
2006
					   ip_vs_proto_name(svc->protocol),
2007
					   &svc->addr.in6,
2008
					   ntohs(svc->port),
2009
					   sched_name);
2010 2011
			else
#endif
N
Nick Chalk 已提交
2012
				seq_printf(seq, "%s  %08X:%04X %s %s ",
2013 2014 2015
					   ip_vs_proto_name(svc->protocol),
					   ntohl(svc->addr.ip),
					   ntohs(svc->port),
2016
					   sched_name,
N
Nick Chalk 已提交
2017
					   (svc->flags & IP_VS_SVC_F_ONEPACKET)?"ops ":"");
2018
		} else {
N
Nick Chalk 已提交
2019
			seq_printf(seq, "FWM  %08X %s %s",
2020
				   svc->fwmark, sched_name,
N
Nick Chalk 已提交
2021
				   (svc->flags & IP_VS_SVC_F_ONEPACKET)?"ops ":"");
2022
		}
L
Linus Torvalds 已提交
2023 2024 2025 2026 2027 2028 2029 2030

		if (svc->flags & IP_VS_SVC_F_PERSISTENT)
			seq_printf(seq, "persistent %d %08X\n",
				svc->timeout,
				ntohl(svc->netmask));
		else
			seq_putc(seq, '\n');

J
Julian Anastasov 已提交
2031
		list_for_each_entry_rcu(dest, &svc->destinations, n_list) {
2032 2033 2034
#ifdef CONFIG_IP_VS_IPV6
			if (dest->af == AF_INET6)
				seq_printf(seq,
H
Harvey Harrison 已提交
2035
					   "  -> [%pI6]:%04X"
2036
					   "      %-7s %-6d %-10d %-10d\n",
2037
					   &dest->addr.in6,
2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054
					   ntohs(dest->port),
					   ip_vs_fwd_name(atomic_read(&dest->conn_flags)),
					   atomic_read(&dest->weight),
					   atomic_read(&dest->activeconns),
					   atomic_read(&dest->inactconns));
			else
#endif
				seq_printf(seq,
					   "  -> %08X:%04X      "
					   "%-7s %-6d %-10d %-10d\n",
					   ntohl(dest->addr.ip),
					   ntohs(dest->port),
					   ip_vs_fwd_name(atomic_read(&dest->conn_flags)),
					   atomic_read(&dest->weight),
					   atomic_read(&dest->activeconns),
					   atomic_read(&dest->inactconns));

L
Linus Torvalds 已提交
2055 2056 2057 2058 2059
		}
	}
	return 0;
}

2060
static const struct seq_operations ip_vs_info_seq_ops = {
L
Linus Torvalds 已提交
2061 2062 2063 2064 2065 2066 2067 2068
	.start = ip_vs_info_seq_start,
	.next  = ip_vs_info_seq_next,
	.stop  = ip_vs_info_seq_stop,
	.show  = ip_vs_info_seq_show,
};

static int ip_vs_info_open(struct inode *inode, struct file *file)
{
2069
	return seq_open_net(inode, file, &ip_vs_info_seq_ops,
2070
			sizeof(struct ip_vs_iter));
L
Linus Torvalds 已提交
2071 2072
}

2073
static const struct file_operations ip_vs_info_fops = {
L
Linus Torvalds 已提交
2074 2075 2076 2077
	.owner	 = THIS_MODULE,
	.open    = ip_vs_info_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
2078
	.release = seq_release_net,
L
Linus Torvalds 已提交
2079 2080 2081 2082
};

static int ip_vs_stats_show(struct seq_file *seq, void *v)
{
2083
	struct net *net = seq_file_single_net(seq);
2084
	struct ip_vs_kstats show;
L
Linus Torvalds 已提交
2085 2086 2087 2088 2089 2090 2091

/*               01234567 01234567 01234567 0123456701234567 0123456701234567 */
	seq_puts(seq,
		 "   Total Incoming Outgoing         Incoming         Outgoing\n");
	seq_printf(seq,
		   "   Conns  Packets  Packets            Bytes            Bytes\n");

2092
	ip_vs_copy_stats(&show, &net_ipvs(net)->tot_stats);
2093 2094 2095 2096 2097 2098 2099 2100
	seq_printf(seq, "%8LX %8LX %8LX %16LX %16LX\n\n",
		   (unsigned long long)show.conns,
		   (unsigned long long)show.inpkts,
		   (unsigned long long)show.outpkts,
		   (unsigned long long)show.inbytes,
		   (unsigned long long)show.outbytes);

/*                01234567 01234567 01234567 0123456701234567 0123456701234567*/
L
Linus Torvalds 已提交
2101
	seq_puts(seq,
2102 2103 2104 2105 2106 2107 2108
		 " Conns/s   Pkts/s   Pkts/s          Bytes/s          Bytes/s\n");
	seq_printf(seq, "%8LX %8LX %8LX %16LX %16LX\n",
		   (unsigned long long)show.cps,
		   (unsigned long long)show.inpps,
		   (unsigned long long)show.outpps,
		   (unsigned long long)show.inbps,
		   (unsigned long long)show.outbps);
L
Linus Torvalds 已提交
2109 2110 2111 2112 2113 2114

	return 0;
}

static int ip_vs_stats_seq_open(struct inode *inode, struct file *file)
{
2115
	return single_open_net(inode, file, ip_vs_stats_show);
L
Linus Torvalds 已提交
2116 2117
}

2118
static const struct file_operations ip_vs_stats_fops = {
L
Linus Torvalds 已提交
2119 2120 2121 2122
	.owner = THIS_MODULE,
	.open = ip_vs_stats_seq_open,
	.read = seq_read,
	.llseek = seq_lseek,
2123
	.release = single_release_net,
L
Linus Torvalds 已提交
2124 2125
};

2126 2127 2128
static int ip_vs_stats_percpu_show(struct seq_file *seq, void *v)
{
	struct net *net = seq_file_single_net(seq);
J
Julian Anastasov 已提交
2129
	struct ip_vs_stats *tot_stats = &net_ipvs(net)->tot_stats;
2130
	struct ip_vs_cpu_stats __percpu *cpustats = tot_stats->cpustats;
2131
	struct ip_vs_kstats kstats;
2132 2133 2134 2135 2136 2137 2138 2139 2140
	int i;

/*               01234567 01234567 01234567 0123456701234567 0123456701234567 */
	seq_puts(seq,
		 "       Total Incoming Outgoing         Incoming         Outgoing\n");
	seq_printf(seq,
		   "CPU    Conns  Packets  Packets            Bytes            Bytes\n");

	for_each_possible_cpu(i) {
J
Julian Anastasov 已提交
2141 2142
		struct ip_vs_cpu_stats *u = per_cpu_ptr(cpustats, i);
		unsigned int start;
2143
		u64 conns, inpkts, outpkts, inbytes, outbytes;
J
Julian Anastasov 已提交
2144 2145

		do {
2146
			start = u64_stats_fetch_begin_irq(&u->syncp);
2147 2148 2149 2150 2151
			conns = u->cnt.conns;
			inpkts = u->cnt.inpkts;
			outpkts = u->cnt.outpkts;
			inbytes = u->cnt.inbytes;
			outbytes = u->cnt.outbytes;
2152
		} while (u64_stats_fetch_retry_irq(&u->syncp, start));
J
Julian Anastasov 已提交
2153

2154 2155 2156 2157
		seq_printf(seq, "%3X %8LX %8LX %8LX %16LX %16LX\n",
			   i, (u64)conns, (u64)inpkts,
			   (u64)outpkts, (u64)inbytes,
			   (u64)outbytes);
2158 2159
	}

2160
	ip_vs_copy_stats(&kstats, tot_stats);
J
Julian Anastasov 已提交
2161

2162 2163 2164 2165 2166 2167
	seq_printf(seq, "  ~ %8LX %8LX %8LX %16LX %16LX\n\n",
		   (unsigned long long)kstats.conns,
		   (unsigned long long)kstats.inpkts,
		   (unsigned long long)kstats.outpkts,
		   (unsigned long long)kstats.inbytes,
		   (unsigned long long)kstats.outbytes);
J
Julian Anastasov 已提交
2168

2169
/*                ... 01234567 01234567 01234567 0123456701234567 0123456701234567 */
2170
	seq_puts(seq,
2171 2172 2173 2174 2175 2176 2177
		 "     Conns/s   Pkts/s   Pkts/s          Bytes/s          Bytes/s\n");
	seq_printf(seq, "    %8LX %8LX %8LX %16LX %16LX\n",
		   kstats.cps,
		   kstats.inpps,
		   kstats.outpps,
		   kstats.inbps,
		   kstats.outbps);
2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191

	return 0;
}

/*
 * ->open handler of ip_vs_stats_percpu_fops: opens a single-record,
 * net-namespace aware seq_file whose content is produced by
 * ip_vs_stats_percpu_show().  Returns whatever single_open_net() returns.
 */
static int ip_vs_stats_percpu_seq_open(struct inode *inode, struct file *file)
{
	return single_open_net(inode, file, ip_vs_stats_percpu_show);
}

static const struct file_operations ip_vs_stats_percpu_fops = {
	.owner = THIS_MODULE,
	.open = ip_vs_stats_percpu_seq_open,
	.read = seq_read,
	.llseek = seq_lseek,
2192
	.release = single_release_net,
2193
};
L
Linus Torvalds 已提交
2194 2195 2196 2197 2198
#endif

/*
 *	Set timeout values for tcp tcpfin udp in the timeout_table.
 */
2199
static int ip_vs_set_timeout(struct net *net, struct ip_vs_timeout_user *u)
L
Linus Torvalds 已提交
2200
{
2201
#if defined(CONFIG_IP_VS_PROTO_TCP) || defined(CONFIG_IP_VS_PROTO_UDP)
2202
	struct ip_vs_proto_data *pd;
2203
#endif
2204

L
Linus Torvalds 已提交
2205 2206 2207 2208 2209 2210 2211
	IP_VS_DBG(2, "Setting timeout tcp:%d tcpfin:%d udp:%d\n",
		  u->tcp_timeout,
		  u->tcp_fin_timeout,
		  u->udp_timeout);

#ifdef CONFIG_IP_VS_PROTO_TCP
	if (u->tcp_timeout) {
2212 2213
		pd = ip_vs_proto_data_get(net, IPPROTO_TCP);
		pd->timeout_table[IP_VS_TCP_S_ESTABLISHED]
L
Linus Torvalds 已提交
2214 2215 2216 2217
			= u->tcp_timeout * HZ;
	}

	if (u->tcp_fin_timeout) {
2218 2219
		pd = ip_vs_proto_data_get(net, IPPROTO_TCP);
		pd->timeout_table[IP_VS_TCP_S_FIN_WAIT]
L
Linus Torvalds 已提交
2220 2221 2222 2223 2224 2225
			= u->tcp_fin_timeout * HZ;
	}
#endif

#ifdef CONFIG_IP_VS_PROTO_UDP
	if (u->udp_timeout) {
2226 2227
		pd = ip_vs_proto_data_get(net, IPPROTO_UDP);
		pd->timeout_table[IP_VS_UDP_S_NORMAL]
L
Linus Torvalds 已提交
2228 2229 2230 2231 2232 2233
			= u->udp_timeout * HZ;
	}
#endif
	return 0;
}

2234
/* Zero-based index of a sockopt command; argument parenthesized for safety */
#define CMDID(cmd)		((cmd) - IP_VS_BASE_CTL)
L
Linus Torvalds 已提交
2235

2236 2237 2238
/*
 * Argument layout for the *DEST set commands (see set_arglen): a service
 * description immediately followed by a destination description.
 */
struct ip_vs_svcdest_user {
	struct ip_vs_service_user	s;	/* service part */
	struct ip_vs_dest_user		d;	/* destination part */
};

2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268
/*
 * Exact argument length expected for each set command, indexed by
 * CMDID(cmd).  Commands without an initializer here have length 0.
 * Entries are unsigned char, so every listed structure must be smaller
 * than 256 bytes.
 */
static const unsigned char set_arglen[CMDID(IP_VS_SO_SET_MAX) + 1] = {
	[CMDID(IP_VS_SO_SET_ADD)]         = sizeof(struct ip_vs_service_user),
	[CMDID(IP_VS_SO_SET_EDIT)]        = sizeof(struct ip_vs_service_user),
	[CMDID(IP_VS_SO_SET_DEL)]         = sizeof(struct ip_vs_service_user),
	[CMDID(IP_VS_SO_SET_ADDDEST)]     = sizeof(struct ip_vs_svcdest_user),
	[CMDID(IP_VS_SO_SET_DELDEST)]     = sizeof(struct ip_vs_svcdest_user),
	[CMDID(IP_VS_SO_SET_EDITDEST)]    = sizeof(struct ip_vs_svcdest_user),
	[CMDID(IP_VS_SO_SET_TIMEOUT)]     = sizeof(struct ip_vs_timeout_user),
	[CMDID(IP_VS_SO_SET_STARTDAEMON)] = sizeof(struct ip_vs_daemon_user),
	[CMDID(IP_VS_SO_SET_STOPDAEMON)]  = sizeof(struct ip_vs_daemon_user),
	[CMDID(IP_VS_SO_SET_ZERO)]        = sizeof(struct ip_vs_service_user),
};

/*
 * Never instantiated for its members: this union exists only so that
 * sizeof() yields the size of the largest set-command argument.  It has
 * one member per entry of the set_arglen table.
 */
union ip_vs_set_arglen {
	struct ip_vs_service_user	field_IP_VS_SO_SET_ADD;
	struct ip_vs_service_user	field_IP_VS_SO_SET_EDIT;
	struct ip_vs_service_user	field_IP_VS_SO_SET_DEL;
	struct ip_vs_svcdest_user	field_IP_VS_SO_SET_ADDDEST;
	struct ip_vs_svcdest_user	field_IP_VS_SO_SET_DELDEST;
	struct ip_vs_svcdest_user	field_IP_VS_SO_SET_EDITDEST;
	struct ip_vs_timeout_user	field_IP_VS_SO_SET_TIMEOUT;
	struct ip_vs_daemon_user	field_IP_VS_SO_SET_STARTDAEMON;
	struct ip_vs_daemon_user	field_IP_VS_SO_SET_STOPDAEMON;
	struct ip_vs_service_user	field_IP_VS_SO_SET_ZERO;
};

/* Size of the on-stack buffer that receives a set-command argument */
#define MAX_SET_ARGLEN	sizeof(union ip_vs_set_arglen)

2269 2270 2271
static void ip_vs_copy_usvc_compat(struct ip_vs_service_user_kern *usvc,
				  struct ip_vs_service_user *usvc_compat)
{
2272 2273
	memset(usvc, 0, sizeof(*usvc));

2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290
	usvc->af		= AF_INET;
	usvc->protocol		= usvc_compat->protocol;
	usvc->addr.ip		= usvc_compat->addr;
	usvc->port		= usvc_compat->port;
	usvc->fwmark		= usvc_compat->fwmark;

	/* Deep copy of sched_name is not needed here */
	usvc->sched_name	= usvc_compat->sched_name;

	usvc->flags		= usvc_compat->flags;
	usvc->timeout		= usvc_compat->timeout;
	usvc->netmask		= usvc_compat->netmask;
}

static void ip_vs_copy_udest_compat(struct ip_vs_dest_user_kern *udest,
				   struct ip_vs_dest_user *udest_compat)
{
2291 2292
	memset(udest, 0, sizeof(*udest));

2293 2294 2295 2296 2297 2298
	udest->addr.ip		= udest_compat->addr;
	udest->port		= udest_compat->port;
	udest->conn_flags	= udest_compat->conn_flags;
	udest->weight		= udest_compat->weight;
	udest->u_threshold	= udest_compat->u_threshold;
	udest->l_threshold	= udest_compat->l_threshold;
2299
	udest->af		= AF_INET;
2300 2301
}

L
Linus Torvalds 已提交
2302 2303 2304
/*
 * setsockopt() entry point for the IPVS control commands.
 *
 * @sk:   socket the option was set on; selects the network namespace
 * @cmd:  one of the IP_VS_SO_SET_* commands
 * @user: userspace argument, whose layout depends on @cmd
 * @len:  length of @user; must equal set_arglen[CMDID(cmd)] exactly
 *
 * The caller needs CAP_NET_ADMIN in the user namespace owning the socket's
 * network namespace.  Daemon start/stop is serialized by ipvs->sync_mutex,
 * every other command by __ip_vs_mutex.
 *
 * Strings supplied by userspace (sync daemon interface name, scheduler
 * name) are checked for NUL termination before they are used as C strings.
 *
 * Returns 0 on success or a negative errno.
 */
static int
do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
{
	struct net *net = sock_net(sk);
	int ret;
	unsigned char arg[MAX_SET_ARGLEN];
	struct ip_vs_service_user *usvc_compat;
	struct ip_vs_service_user_kern usvc;
	struct ip_vs_service *svc;
	struct ip_vs_dest_user *udest_compat;
	struct ip_vs_dest_user_kern udest;
	struct netns_ipvs *ipvs = net_ipvs(net);

	/* set_arglen[] stores lengths in an unsigned char */
	BUILD_BUG_ON(sizeof(arg) > 255);
	if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
		return -EPERM;

	if (cmd < IP_VS_BASE_CTL || cmd > IP_VS_SO_SET_MAX)
		return -EINVAL;
	if (len != set_arglen[CMDID(cmd)]) {
		IP_VS_DBG(1, "set_ctl: len %u != %u\n",
			  len, set_arglen[CMDID(cmd)]);
		return -EINVAL;
	}

	/*
	 * Service-only commands copy in less than sizeof(arg), yet the dest
	 * part of the buffer is still converted below; zero it so that no
	 * uninitialized stack bytes are read.
	 */
	memset(arg, 0, sizeof(arg));
	if (copy_from_user(arg, user, len) != 0)
		return -EFAULT;

	/* increase the module use count */
	ip_vs_use_count_inc();

	/* Handle daemons since they have another lock */
	if (cmd == IP_VS_SO_SET_STARTDAEMON ||
	    cmd == IP_VS_SO_SET_STOPDAEMON) {
		struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg;

		/* The interface name is used as a C string when starting */
		if (cmd == IP_VS_SO_SET_STARTDAEMON &&
		    strnlen(dm->mcast_ifn, IP_VS_IFNAME_MAXLEN) ==
		    IP_VS_IFNAME_MAXLEN) {
			ret = -EINVAL;
			goto out_dec;
		}

		mutex_lock(&ipvs->sync_mutex);
		if (cmd == IP_VS_SO_SET_STARTDAEMON)
			ret = start_sync_thread(net, dm->state, dm->mcast_ifn,
						dm->syncid);
		else
			ret = stop_sync_thread(net, dm->state);
		mutex_unlock(&ipvs->sync_mutex);
		goto out_dec;
	}

	mutex_lock(&__ip_vs_mutex);
	if (cmd == IP_VS_SO_SET_FLUSH) {
		/* Flush the virtual service */
		ret = ip_vs_flush(net, false);
		goto out_unlock;
	} else if (cmd == IP_VS_SO_SET_TIMEOUT) {
		/* Set timeout values for (tcp tcpfin udp) */
		ret = ip_vs_set_timeout(net, (struct ip_vs_timeout_user *)arg);
		goto out_unlock;
	}

	usvc_compat = (struct ip_vs_service_user *)arg;
	udest_compat = (struct ip_vs_dest_user *)(usvc_compat + 1);

	/* We only use the new structs internally, so copy userspace compat
	 * structs to extended internal versions */
	ip_vs_copy_usvc_compat(&usvc, usvc_compat);
	ip_vs_copy_udest_compat(&udest, udest_compat);

	if (cmd == IP_VS_SO_SET_ZERO) {
		/* if no service address is set, zero counters in all */
		if (!usvc.fwmark && !usvc.addr.ip && !usvc.port) {
			ret = ip_vs_zero_all(net);
			goto out_unlock;
		}
	}

	/* The scheduler name is passed on by ADD and EDIT as a C string */
	if ((cmd == IP_VS_SO_SET_ADD || cmd == IP_VS_SO_SET_EDIT) &&
	    strnlen(usvc.sched_name, IP_VS_SCHEDNAME_MAXLEN) ==
	    IP_VS_SCHEDNAME_MAXLEN) {
		ret = -EINVAL;
		goto out_unlock;
	}

	/* Check for valid protocol: TCP or UDP or SCTP, even for fwmark!=0 */
	if (usvc.protocol != IPPROTO_TCP && usvc.protocol != IPPROTO_UDP &&
	    usvc.protocol != IPPROTO_SCTP) {
		/* sched_name is not validated for every command: don't print */
		pr_err("set_ctl: invalid protocol: %d %pI4:%d\n",
		       usvc.protocol, &usvc.addr.ip,
		       ntohs(usvc.port));
		ret = -EFAULT;
		goto out_unlock;
	}

	/* Lookup the exact service by <protocol, addr, port> or fwmark */
	rcu_read_lock();
	if (usvc.fwmark == 0)
		svc = __ip_vs_service_find(net, usvc.af, usvc.protocol,
					   &usvc.addr, usvc.port);
	else
		svc = __ip_vs_svc_fwm_find(net, usvc.af, usvc.fwmark);
	rcu_read_unlock();

	/* Every command except ADD operates on an existing service */
	if (cmd != IP_VS_SO_SET_ADD
	    && (svc == NULL || svc->protocol != usvc.protocol)) {
		ret = -ESRCH;
		goto out_unlock;
	}

	switch (cmd) {
	case IP_VS_SO_SET_ADD:
		if (svc != NULL)
			ret = -EEXIST;
		else
			ret = ip_vs_add_service(net, &usvc, &svc);
		break;
	case IP_VS_SO_SET_EDIT:
		ret = ip_vs_edit_service(svc, &usvc);
		break;
	case IP_VS_SO_SET_DEL:
		ret = ip_vs_del_service(svc);
		if (!ret)
			goto out_unlock;
		break;
	case IP_VS_SO_SET_ZERO:
		ret = ip_vs_zero_service(svc);
		break;
	case IP_VS_SO_SET_ADDDEST:
		ret = ip_vs_add_dest(svc, &udest);
		break;
	case IP_VS_SO_SET_EDITDEST:
		ret = ip_vs_edit_dest(svc, &udest);
		break;
	case IP_VS_SO_SET_DELDEST:
		ret = ip_vs_del_dest(svc, &udest);
		break;
	default:
		ret = -EINVAL;
	}

  out_unlock:
	mutex_unlock(&__ip_vs_mutex);
  out_dec:
	/* decrease the module use count */
	ip_vs_use_count_dec();

	return ret;
}


static void
ip_vs_copy_service(struct ip_vs_service_entry *dst, struct ip_vs_service *src)
{
J
Julian Anastasov 已提交
2444
	struct ip_vs_scheduler *sched;
2445
	struct ip_vs_kstats kstats;
2446
	char *sched_name;
J
Julian Anastasov 已提交
2447 2448

	sched = rcu_dereference_protected(src->scheduler, 1);
2449
	sched_name = sched ? sched->name : "none";
L
Linus Torvalds 已提交
2450
	dst->protocol = src->protocol;
2451
	dst->addr = src->addr.ip;
L
Linus Torvalds 已提交
2452 2453
	dst->port = src->port;
	dst->fwmark = src->fwmark;
2454
	strlcpy(dst->sched_name, sched_name, sizeof(dst->sched_name));
L
Linus Torvalds 已提交
2455 2456 2457 2458
	dst->flags = src->flags;
	dst->timeout = src->timeout / HZ;
	dst->netmask = src->netmask;
	dst->num_dests = src->num_dests;
2459 2460
	ip_vs_copy_stats(&kstats, &src->stats);
	ip_vs_export_stats_user(&dst->stats, &kstats);
L
Linus Torvalds 已提交
2461 2462 2463
}

static inline int
2464 2465
__ip_vs_get_service_entries(struct net *net,
			    const struct ip_vs_get_services *get,
L
Linus Torvalds 已提交
2466 2467 2468 2469 2470 2471 2472 2473
			    struct ip_vs_get_services __user *uptr)
{
	int idx, count=0;
	struct ip_vs_service *svc;
	struct ip_vs_service_entry entry;
	int ret = 0;

	for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
J
Julian Anastasov 已提交
2474
		hlist_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
2475
			/* Only expose IPv4 entries to old interface */
2476
			if (svc->af != AF_INET || !net_eq(svc->net, net))
2477 2478
				continue;

L
Linus Torvalds 已提交
2479 2480
			if (count >= get->num_services)
				goto out;
P
pageexec 已提交
2481
			memset(&entry, 0, sizeof(entry));
L
Linus Torvalds 已提交
2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492
			ip_vs_copy_service(&entry, svc);
			if (copy_to_user(&uptr->entrytable[count],
					 &entry, sizeof(entry))) {
				ret = -EFAULT;
				goto out;
			}
			count++;
		}
	}

	for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
J
Julian Anastasov 已提交
2493
		hlist_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
2494
			/* Only expose IPv4 entries to old interface */
2495
			if (svc->af != AF_INET || !net_eq(svc->net, net))
2496 2497
				continue;

L
Linus Torvalds 已提交
2498 2499
			if (count >= get->num_services)
				goto out;
P
pageexec 已提交
2500
			memset(&entry, 0, sizeof(entry));
L
Linus Torvalds 已提交
2501 2502 2503 2504 2505 2506 2507 2508 2509
			ip_vs_copy_service(&entry, svc);
			if (copy_to_user(&uptr->entrytable[count],
					 &entry, sizeof(entry))) {
				ret = -EFAULT;
				goto out;
			}
			count++;
		}
	}
H
Hans Schillstrom 已提交
2510
out:
L
Linus Torvalds 已提交
2511 2512 2513 2514
	return ret;
}

static inline int
2515
__ip_vs_get_dest_entries(struct net *net, const struct ip_vs_get_dests *get,
L
Linus Torvalds 已提交
2516 2517 2518
			 struct ip_vs_get_dests __user *uptr)
{
	struct ip_vs_service *svc;
2519
	union nf_inet_addr addr = { .ip = get->addr };
L
Linus Torvalds 已提交
2520 2521
	int ret = 0;

J
Julian Anastasov 已提交
2522
	rcu_read_lock();
L
Linus Torvalds 已提交
2523
	if (get->fwmark)
2524
		svc = __ip_vs_svc_fwm_find(net, AF_INET, get->fwmark);
L
Linus Torvalds 已提交
2525
	else
2526
		svc = __ip_vs_service_find(net, AF_INET, get->protocol, &addr,
2527
					   get->port);
J
Julian Anastasov 已提交
2528
	rcu_read_unlock();
2529

L
Linus Torvalds 已提交
2530 2531 2532 2533
	if (svc) {
		int count = 0;
		struct ip_vs_dest *dest;
		struct ip_vs_dest_entry entry;
2534
		struct ip_vs_kstats kstats;
L
Linus Torvalds 已提交
2535

2536
		memset(&entry, 0, sizeof(entry));
L
Linus Torvalds 已提交
2537 2538 2539 2540
		list_for_each_entry(dest, &svc->destinations, n_list) {
			if (count >= get->num_dests)
				break;

2541 2542 2543 2544 2545 2546
			/* Cannot expose heterogeneous members via sockopt
			 * interface
			 */
			if (dest->af != svc->af)
				continue;

2547
			entry.addr = dest->addr.ip;
L
Linus Torvalds 已提交
2548 2549 2550 2551 2552 2553 2554 2555
			entry.port = dest->port;
			entry.conn_flags = atomic_read(&dest->conn_flags);
			entry.weight = atomic_read(&dest->weight);
			entry.u_threshold = dest->u_threshold;
			entry.l_threshold = dest->l_threshold;
			entry.activeconns = atomic_read(&dest->activeconns);
			entry.inactconns = atomic_read(&dest->inactconns);
			entry.persistconns = atomic_read(&dest->persistconns);
2556 2557
			ip_vs_copy_stats(&kstats, &dest->stats);
			ip_vs_export_stats_user(&entry.stats, &kstats);
L
Linus Torvalds 已提交
2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570
			if (copy_to_user(&uptr->entrytable[count],
					 &entry, sizeof(entry))) {
				ret = -EFAULT;
				break;
			}
			count++;
		}
	} else
		ret = -ESRCH;
	return ret;
}

static inline void
2571
__ip_vs_get_timeouts(struct net *net, struct ip_vs_timeout_user *u)
L
Linus Torvalds 已提交
2572
{
2573
#if defined(CONFIG_IP_VS_PROTO_TCP) || defined(CONFIG_IP_VS_PROTO_UDP)
2574
	struct ip_vs_proto_data *pd;
2575
#endif
2576

2577 2578
	memset(u, 0, sizeof (*u));

L
Linus Torvalds 已提交
2579
#ifdef CONFIG_IP_VS_PROTO_TCP
2580 2581 2582
	pd = ip_vs_proto_data_get(net, IPPROTO_TCP);
	u->tcp_timeout = pd->timeout_table[IP_VS_TCP_S_ESTABLISHED] / HZ;
	u->tcp_fin_timeout = pd->timeout_table[IP_VS_TCP_S_FIN_WAIT] / HZ;
L
Linus Torvalds 已提交
2583 2584
#endif
#ifdef CONFIG_IP_VS_PROTO_UDP
2585
	pd = ip_vs_proto_data_get(net, IPPROTO_UDP);
L
Linus Torvalds 已提交
2586
	u->udp_timeout =
2587
			pd->timeout_table[IP_VS_UDP_S_NORMAL] / HZ;
L
Linus Torvalds 已提交
2588 2589 2590
#endif
}

2591 2592 2593 2594 2595 2596 2597 2598 2599
/*
 * Minimum argument length for each get command, indexed by CMDID(cmd);
 * this many bytes are copied in from userspace before the command is
 * dispatched.  Commands without an initializer here have length 0.
 * Entries are unsigned char, so every listed size must be below 256.
 */
static const unsigned char get_arglen[CMDID(IP_VS_SO_GET_MAX) + 1] = {
	[CMDID(IP_VS_SO_GET_VERSION)]  = 64,
	[CMDID(IP_VS_SO_GET_INFO)]     = sizeof(struct ip_vs_getinfo),
	[CMDID(IP_VS_SO_GET_SERVICES)] = sizeof(struct ip_vs_get_services),
	[CMDID(IP_VS_SO_GET_SERVICE)]  = sizeof(struct ip_vs_service_entry),
	[CMDID(IP_VS_SO_GET_DESTS)]    = sizeof(struct ip_vs_get_dests),
	[CMDID(IP_VS_SO_GET_TIMEOUT)]  = sizeof(struct ip_vs_timeout_user),
	[CMDID(IP_VS_SO_GET_DAEMON)]   = 2 * sizeof(struct ip_vs_daemon_user),
};
L
Linus Torvalds 已提交
2600

2601 2602 2603 2604 2605 2606 2607 2608
/*
 * Never instantiated for its members: this union exists only so that
 * sizeof() yields the size of the largest get-command argument.  It has
 * one member per entry of the get_arglen table.
 */
union ip_vs_get_arglen {
	char				field_IP_VS_SO_GET_VERSION[64];
	struct ip_vs_getinfo		field_IP_VS_SO_GET_INFO;
	struct ip_vs_get_services	field_IP_VS_SO_GET_SERVICES;
	struct ip_vs_service_entry	field_IP_VS_SO_GET_SERVICE;
	struct ip_vs_get_dests		field_IP_VS_SO_GET_DESTS;
	struct ip_vs_timeout_user	field_IP_VS_SO_GET_TIMEOUT;
	struct ip_vs_daemon_user	field_IP_VS_SO_GET_DAEMON[2];
};

/* Size of the on-stack buffer that receives a get-command argument */
#define MAX_GET_ARGLEN	sizeof(union ip_vs_get_arglen)

L
Linus Torvalds 已提交
2613 2614 2615
/*
 * getsockopt() entry point for the IPVS control commands.
 *
 * @sk:   socket the option was queried on; selects the network namespace
 * @cmd:  one of the IP_VS_SO_GET_* commands
 * @user: userspace buffer; holds the request on entry and the reply on exit
 * @len:  in: size of @user, which must be at least get_arglen[CMDID(cmd)];
 *        out: updated only by IP_VS_SO_GET_VERSION
 *
 * The caller needs CAP_NET_ADMIN in the user namespace owning the socket's
 * network namespace.  IP_VS_SO_GET_DAEMON is serialized by
 * ipvs->sync_mutex, every other command by __ip_vs_mutex.
 *
 * For the variable-sized replies (GET_SERVICES, GET_DESTS) *len must match
 * the size implied by the requested entry count exactly; that size is
 * computed in 64 bits so a huge count cannot wrap and falsely match.
 *
 * Returns 0 on success or a negative errno.
 */
static int
do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
{
	unsigned char arg[MAX_GET_ARGLEN];
	int ret = 0;
	unsigned int copylen;
	struct net *net = sock_net(sk);
	struct netns_ipvs *ipvs;

	BUG_ON(!net);
	/* Only touch net after the sanity check above */
	ipvs = net_ipvs(net);

	/* get_arglen[] stores lengths in an unsigned char */
	BUILD_BUG_ON(sizeof(arg) > 255);
	if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
		return -EPERM;

	if (cmd < IP_VS_BASE_CTL || cmd > IP_VS_SO_GET_MAX)
		return -EINVAL;

	copylen = get_arglen[CMDID(cmd)];
	if (*len < (int) copylen) {
		IP_VS_DBG(1, "get_ctl: len %d < %u\n", *len, copylen);
		return -EINVAL;
	}

	if (copy_from_user(arg, user, copylen) != 0)
		return -EFAULT;
	/*
	 * Handle daemons first since it has its own locking
	 */
	if (cmd == IP_VS_SO_GET_DAEMON) {
		struct ip_vs_daemon_user d[2];

		/* d[0] reports the master state, d[1] the backup state */
		memset(&d, 0, sizeof(d));
		mutex_lock(&ipvs->sync_mutex);
		if (ipvs->sync_state & IP_VS_STATE_MASTER) {
			d[0].state = IP_VS_STATE_MASTER;
			strlcpy(d[0].mcast_ifn, ipvs->master_mcast_ifn,
				sizeof(d[0].mcast_ifn));
			d[0].syncid = ipvs->master_syncid;
		}
		if (ipvs->sync_state & IP_VS_STATE_BACKUP) {
			d[1].state = IP_VS_STATE_BACKUP;
			strlcpy(d[1].mcast_ifn, ipvs->backup_mcast_ifn,
				sizeof(d[1].mcast_ifn));
			d[1].syncid = ipvs->backup_syncid;
		}
		if (copy_to_user(user, &d, sizeof(d)) != 0)
			ret = -EFAULT;
		mutex_unlock(&ipvs->sync_mutex);
		return ret;
	}

	mutex_lock(&__ip_vs_mutex);
	switch (cmd) {
	case IP_VS_SO_GET_VERSION:
	{
		char buf[64];

		snprintf(buf, sizeof(buf),
			 "IP Virtual Server version %d.%d.%d (size=%d)",
			 NVERSION(IP_VS_VERSION_CODE), ip_vs_conn_tab_size);
		if (copy_to_user(user, buf, strlen(buf)+1) != 0) {
			ret = -EFAULT;
			goto out;
		}
		*len = strlen(buf)+1;
	}
	break;

	case IP_VS_SO_GET_INFO:
	{
		struct ip_vs_getinfo info;
		info.version = IP_VS_VERSION_CODE;
		info.size = ip_vs_conn_tab_size;
		info.num_services = ipvs->num_services;
		if (copy_to_user(user, &info, sizeof(info)) != 0)
			ret = -EFAULT;
	}
	break;

	case IP_VS_SO_GET_SERVICES:
	{
		struct ip_vs_get_services *get;
		u64 size;

		get = (struct ip_vs_get_services *)arg;
		/* 64-bit math: num_services is user controlled */
		size = sizeof(*get) +
			(u64)sizeof(struct ip_vs_service_entry) *
			get->num_services;
		/* *len is known to be positive here (>= copylen) */
		if ((u64)*len != size) {
			pr_err("length: %d != %llu\n",
			       *len, (unsigned long long)size);
			ret = -EINVAL;
			goto out;
		}
		ret = __ip_vs_get_service_entries(net, get, user);
	}
	break;

	case IP_VS_SO_GET_SERVICE:
	{
		struct ip_vs_service_entry *entry;
		struct ip_vs_service *svc;
		union nf_inet_addr addr;

		entry = (struct ip_vs_service_entry *)arg;
		addr.ip = entry->addr;
		rcu_read_lock();
		if (entry->fwmark)
			svc = __ip_vs_svc_fwm_find(net, AF_INET, entry->fwmark);
		else
			svc = __ip_vs_service_find(net, AF_INET,
						   entry->protocol, &addr,
						   entry->port);
		rcu_read_unlock();
		if (svc) {
			ip_vs_copy_service(entry, svc);
			if (copy_to_user(user, entry, sizeof(*entry)) != 0)
				ret = -EFAULT;
		} else
			ret = -ESRCH;
	}
	break;

	case IP_VS_SO_GET_DESTS:
	{
		struct ip_vs_get_dests *get;
		u64 size;

		get = (struct ip_vs_get_dests *)arg;
		/* 64-bit math: num_dests is user controlled */
		size = sizeof(*get) +
			(u64)sizeof(struct ip_vs_dest_entry) * get->num_dests;
		/* *len is known to be positive here (>= copylen) */
		if ((u64)*len != size) {
			pr_err("length: %d != %llu\n",
			       *len, (unsigned long long)size);
			ret = -EINVAL;
			goto out;
		}
		ret = __ip_vs_get_dest_entries(net, get, user);
	}
	break;

	case IP_VS_SO_GET_TIMEOUT:
	{
		struct ip_vs_timeout_user t;

		__ip_vs_get_timeouts(net, &t);
		if (copy_to_user(user, &t, sizeof(t)) != 0)
			ret = -EFAULT;
	}
	break;

	default:
		ret = -EINVAL;
	}

out:
	mutex_unlock(&__ip_vs_mutex);
	return ret;
}


static struct nf_sockopt_ops ip_vs_sockopts = {
	.pf		= PF_INET,
	.set_optmin	= IP_VS_BASE_CTL,
	.set_optmax	= IP_VS_SO_SET_MAX+1,
	.set		= do_ip_vs_set_ctl,
	.get_optmin	= IP_VS_BASE_CTL,
	.get_optmax	= IP_VS_SO_GET_MAX+1,
	.get		= do_ip_vs_get_ctl,
2778
	.owner		= THIS_MODULE,
L
Linus Torvalds 已提交
2779 2780
};

2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791
/*
 * Generic Netlink interface
 */

/* IPVS genetlink family */
static struct genl_family ip_vs_genl_family = {
	.id		= GENL_ID_GENERATE,
	.hdrsize	= 0,
	.name		= IPVS_GENL_NAME,
	.version	= IPVS_GENL_VERSION,
	.maxattr	= IPVS_CMD_MAX,
2792
	.netnsok        = true,         /* Make ipvsadm to work on netns */
2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822
};

/*
 * Policy used for first-level command attributes: three nested containers
 * (service, destination, daemon) and three u32 timeout values.
 */
static const struct nla_policy ip_vs_cmd_policy[IPVS_CMD_ATTR_MAX + 1] = {
	[IPVS_CMD_ATTR_SERVICE]		= { .type = NLA_NESTED },
	[IPVS_CMD_ATTR_DEST]		= { .type = NLA_NESTED },
	[IPVS_CMD_ATTR_DAEMON]		= { .type = NLA_NESTED },
	[IPVS_CMD_ATTR_TIMEOUT_TCP]	= { .type = NLA_U32 },
	[IPVS_CMD_ATTR_TIMEOUT_TCP_FIN]	= { .type = NLA_U32 },
	[IPVS_CMD_ATTR_TIMEOUT_UDP]	= { .type = NLA_U32 },
};

/*
 * Policy used for attributes in nested attribute IPVS_CMD_ATTR_DAEMON.
 * The multicast interface name is a NUL-terminated string with .len set
 * to IP_VS_IFNAME_MAXLEN.
 */
static const struct nla_policy ip_vs_daemon_policy[IPVS_DAEMON_ATTR_MAX + 1] = {
	[IPVS_DAEMON_ATTR_STATE]	= { .type = NLA_U32 },
	[IPVS_DAEMON_ATTR_MCAST_IFN]	= { .type = NLA_NUL_STRING,
					    .len = IP_VS_IFNAME_MAXLEN },
	[IPVS_DAEMON_ATTR_SYNC_ID]	= { .type = NLA_U32 },
};

/* Policy used for attributes in nested attribute IPVS_CMD_ATTR_SERVICE */
static const struct nla_policy ip_vs_svc_policy[IPVS_SVC_ATTR_MAX + 1] = {
	[IPVS_SVC_ATTR_AF]		= { .type = NLA_U16 },
	[IPVS_SVC_ATTR_PROTOCOL]	= { .type = NLA_U16 },
	[IPVS_SVC_ATTR_ADDR]		= { .type = NLA_BINARY,
					    .len = sizeof(union nf_inet_addr) },
	[IPVS_SVC_ATTR_PORT]		= { .type = NLA_U16 },
	[IPVS_SVC_ATTR_FWMARK]		= { .type = NLA_U32 },
	[IPVS_SVC_ATTR_SCHED_NAME]	= { .type = NLA_NUL_STRING,
					    .len = IP_VS_SCHEDNAME_MAXLEN },
2823 2824
	[IPVS_SVC_ATTR_PE_NAME]		= { .type = NLA_NUL_STRING,
					    .len = IP_VS_PENAME_MAXLEN },
2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844
	[IPVS_SVC_ATTR_FLAGS]		= { .type = NLA_BINARY,
					    .len = sizeof(struct ip_vs_flags) },
	[IPVS_SVC_ATTR_TIMEOUT]		= { .type = NLA_U32 },
	[IPVS_SVC_ATTR_NETMASK]		= { .type = NLA_U32 },
	[IPVS_SVC_ATTR_STATS]		= { .type = NLA_NESTED },
};

/*
 * Policy used for attributes in nested attribute IPVS_CMD_ATTR_DEST.
 * The address travels as a binary blob with .len set to the size of
 * union nf_inet_addr.
 */
static const struct nla_policy ip_vs_dest_policy[IPVS_DEST_ATTR_MAX + 1] = {
	/* Destination identity */
	[IPVS_DEST_ATTR_ADDR]		= { .type = NLA_BINARY,
					    .len = sizeof(union nf_inet_addr) },
	[IPVS_DEST_ATTR_PORT]		= { .type = NLA_U16 },
	[IPVS_DEST_ATTR_ADDR_FAMILY]	= { .type = NLA_U16 },

	/* Parameters */
	[IPVS_DEST_ATTR_FWD_METHOD]	= { .type = NLA_U32 },
	[IPVS_DEST_ATTR_WEIGHT]		= { .type = NLA_U32 },
	[IPVS_DEST_ATTR_U_THRESH]	= { .type = NLA_U32 },
	[IPVS_DEST_ATTR_L_THRESH]	= { .type = NLA_U32 },

	/* Connection counters and statistics */
	[IPVS_DEST_ATTR_ACTIVE_CONNS]	= { .type = NLA_U32 },
	[IPVS_DEST_ATTR_INACT_CONNS]	= { .type = NLA_U32 },
	[IPVS_DEST_ATTR_PERSIST_CONNS]	= { .type = NLA_U32 },
	[IPVS_DEST_ATTR_STATS]		= { .type = NLA_NESTED },
};

static int ip_vs_genl_fill_stats(struct sk_buff *skb, int container_type,
2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877
				 struct ip_vs_kstats *kstats)
{
	struct nlattr *nl_stats = nla_nest_start(skb, container_type);

	if (!nl_stats)
		return -EMSGSIZE;

	if (nla_put_u32(skb, IPVS_STATS_ATTR_CONNS, (u32)kstats->conns) ||
	    nla_put_u32(skb, IPVS_STATS_ATTR_INPKTS, (u32)kstats->inpkts) ||
	    nla_put_u32(skb, IPVS_STATS_ATTR_OUTPKTS, (u32)kstats->outpkts) ||
	    nla_put_u64(skb, IPVS_STATS_ATTR_INBYTES, kstats->inbytes) ||
	    nla_put_u64(skb, IPVS_STATS_ATTR_OUTBYTES, kstats->outbytes) ||
	    nla_put_u32(skb, IPVS_STATS_ATTR_CPS, (u32)kstats->cps) ||
	    nla_put_u32(skb, IPVS_STATS_ATTR_INPPS, (u32)kstats->inpps) ||
	    nla_put_u32(skb, IPVS_STATS_ATTR_OUTPPS, (u32)kstats->outpps) ||
	    nla_put_u32(skb, IPVS_STATS_ATTR_INBPS, (u32)kstats->inbps) ||
	    nla_put_u32(skb, IPVS_STATS_ATTR_OUTBPS, (u32)kstats->outbps))
		goto nla_put_failure;
	nla_nest_end(skb, nl_stats);

	return 0;

nla_put_failure:
	nla_nest_cancel(skb, nl_stats);
	return -EMSGSIZE;
}

static int ip_vs_genl_fill_stats64(struct sk_buff *skb, int container_type,
				   struct ip_vs_kstats *kstats)
2878 2879
{
	struct nlattr *nl_stats = nla_nest_start(skb, container_type);
2880

2881 2882 2883
	if (!nl_stats)
		return -EMSGSIZE;

2884 2885 2886 2887 2888 2889 2890 2891 2892 2893
	if (nla_put_u64(skb, IPVS_STATS_ATTR_CONNS, kstats->conns) ||
	    nla_put_u64(skb, IPVS_STATS_ATTR_INPKTS, kstats->inpkts) ||
	    nla_put_u64(skb, IPVS_STATS_ATTR_OUTPKTS, kstats->outpkts) ||
	    nla_put_u64(skb, IPVS_STATS_ATTR_INBYTES, kstats->inbytes) ||
	    nla_put_u64(skb, IPVS_STATS_ATTR_OUTBYTES, kstats->outbytes) ||
	    nla_put_u64(skb, IPVS_STATS_ATTR_CPS, kstats->cps) ||
	    nla_put_u64(skb, IPVS_STATS_ATTR_INPPS, kstats->inpps) ||
	    nla_put_u64(skb, IPVS_STATS_ATTR_OUTPPS, kstats->outpps) ||
	    nla_put_u64(skb, IPVS_STATS_ATTR_INBPS, kstats->inbps) ||
	    nla_put_u64(skb, IPVS_STATS_ATTR_OUTBPS, kstats->outbps))
D
David S. Miller 已提交
2894
		goto nla_put_failure;
2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906
	nla_nest_end(skb, nl_stats);

	return 0;

nla_put_failure:
	nla_nest_cancel(skb, nl_stats);
	return -EMSGSIZE;
}

static int ip_vs_genl_fill_service(struct sk_buff *skb,
				   struct ip_vs_service *svc)
{
J
Julian Anastasov 已提交
2907
	struct ip_vs_scheduler *sched;
2908
	struct ip_vs_pe *pe;
2909 2910 2911
	struct nlattr *nl_service;
	struct ip_vs_flags flags = { .flags = svc->flags,
				     .mask = ~0 };
2912
	struct ip_vs_kstats kstats;
2913
	char *sched_name;
2914 2915 2916 2917 2918

	nl_service = nla_nest_start(skb, IPVS_CMD_ATTR_SERVICE);
	if (!nl_service)
		return -EMSGSIZE;

D
David S. Miller 已提交
2919 2920
	if (nla_put_u16(skb, IPVS_SVC_ATTR_AF, svc->af))
		goto nla_put_failure;
2921
	if (svc->fwmark) {
D
David S. Miller 已提交
2922 2923
		if (nla_put_u32(skb, IPVS_SVC_ATTR_FWMARK, svc->fwmark))
			goto nla_put_failure;
2924
	} else {
D
David S. Miller 已提交
2925 2926
		if (nla_put_u16(skb, IPVS_SVC_ATTR_PROTOCOL, svc->protocol) ||
		    nla_put(skb, IPVS_SVC_ATTR_ADDR, sizeof(svc->addr), &svc->addr) ||
2927
		    nla_put_be16(skb, IPVS_SVC_ATTR_PORT, svc->port))
D
David S. Miller 已提交
2928
			goto nla_put_failure;
2929 2930
	}

J
Julian Anastasov 已提交
2931
	sched = rcu_dereference_protected(svc->scheduler, 1);
2932
	sched_name = sched ? sched->name : "none";
2933
	pe = rcu_dereference_protected(svc->pe, 1);
2934
	if (nla_put_string(skb, IPVS_SVC_ATTR_SCHED_NAME, sched_name) ||
2935
	    (pe && nla_put_string(skb, IPVS_SVC_ATTR_PE_NAME, pe->name)) ||
D
David S. Miller 已提交
2936 2937
	    nla_put(skb, IPVS_SVC_ATTR_FLAGS, sizeof(flags), &flags) ||
	    nla_put_u32(skb, IPVS_SVC_ATTR_TIMEOUT, svc->timeout / HZ) ||
2938
	    nla_put_be32(skb, IPVS_SVC_ATTR_NETMASK, svc->netmask))
D
David S. Miller 已提交
2939
		goto nla_put_failure;
2940 2941 2942 2943
	ip_vs_copy_stats(&kstats, &svc->stats);
	if (ip_vs_genl_fill_stats(skb, IPVS_SVC_ATTR_STATS, &kstats))
		goto nla_put_failure;
	if (ip_vs_genl_fill_stats64(skb, IPVS_SVC_ATTR_STATS64, &kstats))
2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960
		goto nla_put_failure;

	nla_nest_end(skb, nl_service);

	return 0;

nla_put_failure:
	nla_nest_cancel(skb, nl_service);
	return -EMSGSIZE;
}

static int ip_vs_genl_dump_service(struct sk_buff *skb,
				   struct ip_vs_service *svc,
				   struct netlink_callback *cb)
{
	void *hdr;

2961
	hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
2962 2963 2964 2965 2966 2967 2968 2969
			  &ip_vs_genl_family, NLM_F_MULTI,
			  IPVS_CMD_NEW_SERVICE);
	if (!hdr)
		return -EMSGSIZE;

	if (ip_vs_genl_fill_service(skb, svc) < 0)
		goto nla_put_failure;

2970 2971
	genlmsg_end(skb, hdr);
	return 0;
2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983

nla_put_failure:
	genlmsg_cancel(skb, hdr);
	return -EMSGSIZE;
}

static int ip_vs_genl_dump_services(struct sk_buff *skb,
				    struct netlink_callback *cb)
{
	int idx = 0, i;
	int start = cb->args[0];
	struct ip_vs_service *svc;
2984
	struct net *net = skb_sknet(skb);
2985 2986 2987

	mutex_lock(&__ip_vs_mutex);
	for (i = 0; i < IP_VS_SVC_TAB_SIZE; i++) {
J
Julian Anastasov 已提交
2988
		hlist_for_each_entry(svc, &ip_vs_svc_table[i], s_list) {
2989
			if (++idx <= start || !net_eq(svc->net, net))
2990 2991 2992 2993 2994 2995 2996 2997 2998
				continue;
			if (ip_vs_genl_dump_service(skb, svc, cb) < 0) {
				idx--;
				goto nla_put_failure;
			}
		}
	}

	for (i = 0; i < IP_VS_SVC_TAB_SIZE; i++) {
J
Julian Anastasov 已提交
2999
		hlist_for_each_entry(svc, &ip_vs_svc_fwm_table[i], f_list) {
3000
			if (++idx <= start || !net_eq(svc->net, net))
3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015
				continue;
			if (ip_vs_genl_dump_service(skb, svc, cb) < 0) {
				idx--;
				goto nla_put_failure;
			}
		}
	}

nla_put_failure:
	mutex_unlock(&__ip_vs_mutex);
	cb->args[0] = idx;

	return skb->len;
}

3016 3017
static int ip_vs_genl_parse_service(struct net *net,
				    struct ip_vs_service_user_kern *usvc,
3018 3019
				    struct nlattr *nla, int full_entry,
				    struct ip_vs_service **ret_svc)
3020 3021 3022
{
	struct nlattr *attrs[IPVS_SVC_ATTR_MAX + 1];
	struct nlattr *nla_af, *nla_port, *nla_fwmark, *nla_protocol, *nla_addr;
3023
	struct ip_vs_service *svc;
3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038

	/* Parse mandatory identifying service fields first */
	if (nla == NULL ||
	    nla_parse_nested(attrs, IPVS_SVC_ATTR_MAX, nla, ip_vs_svc_policy))
		return -EINVAL;

	nla_af		= attrs[IPVS_SVC_ATTR_AF];
	nla_protocol	= attrs[IPVS_SVC_ATTR_PROTOCOL];
	nla_addr	= attrs[IPVS_SVC_ATTR_ADDR];
	nla_port	= attrs[IPVS_SVC_ATTR_PORT];
	nla_fwmark	= attrs[IPVS_SVC_ATTR_FWMARK];

	if (!(nla_af && (nla_fwmark || (nla_port && nla_protocol && nla_addr))))
		return -EINVAL;

S
Simon Horman 已提交
3039 3040
	memset(usvc, 0, sizeof(*usvc));

3041
	usvc->af = nla_get_u16(nla_af);
3042 3043 3044 3045 3046
#ifdef CONFIG_IP_VS_IPV6
	if (usvc->af != AF_INET && usvc->af != AF_INET6)
#else
	if (usvc->af != AF_INET)
#endif
3047 3048 3049 3050 3051 3052 3053 3054
		return -EAFNOSUPPORT;

	if (nla_fwmark) {
		usvc->protocol = IPPROTO_TCP;
		usvc->fwmark = nla_get_u32(nla_fwmark);
	} else {
		usvc->protocol = nla_get_u16(nla_protocol);
		nla_memcpy(&usvc->addr, nla_addr, sizeof(usvc->addr));
3055
		usvc->port = nla_get_be16(nla_port);
3056 3057 3058
		usvc->fwmark = 0;
	}

J
Julian Anastasov 已提交
3059
	rcu_read_lock();
3060
	if (usvc->fwmark)
3061
		svc = __ip_vs_svc_fwm_find(net, usvc->af, usvc->fwmark);
3062
	else
3063
		svc = __ip_vs_service_find(net, usvc->af, usvc->protocol,
3064
					   &usvc->addr, usvc->port);
J
Julian Anastasov 已提交
3065
	rcu_read_unlock();
3066 3067
	*ret_svc = svc;

3068 3069
	/* If a full entry was requested, check for the additional fields */
	if (full_entry) {
3070
		struct nlattr *nla_sched, *nla_flags, *nla_pe, *nla_timeout,
3071 3072 3073 3074
			      *nla_netmask;
		struct ip_vs_flags flags;

		nla_sched = attrs[IPVS_SVC_ATTR_SCHED_NAME];
3075
		nla_pe = attrs[IPVS_SVC_ATTR_PE_NAME];
3076 3077 3078 3079 3080 3081 3082 3083 3084 3085
		nla_flags = attrs[IPVS_SVC_ATTR_FLAGS];
		nla_timeout = attrs[IPVS_SVC_ATTR_TIMEOUT];
		nla_netmask = attrs[IPVS_SVC_ATTR_NETMASK];

		if (!(nla_sched && nla_flags && nla_timeout && nla_netmask))
			return -EINVAL;

		nla_memcpy(&flags, nla_flags, sizeof(flags));

		/* prefill flags from service if it already exists */
3086
		if (svc)
3087 3088 3089 3090 3091
			usvc->flags = svc->flags;

		/* set new flags from userland */
		usvc->flags = (usvc->flags & ~flags.mask) |
			      (flags.flags & flags.mask);
3092
		usvc->sched_name = nla_data(nla_sched);
3093
		usvc->pe_name = nla_pe ? nla_data(nla_pe) : NULL;
3094
		usvc->timeout = nla_get_u32(nla_timeout);
3095
		usvc->netmask = nla_get_be32(nla_netmask);
3096 3097 3098 3099 3100
	}

	return 0;
}

3101 3102
static struct ip_vs_service *ip_vs_genl_find_service(struct net *net,
						     struct nlattr *nla)
3103
{
3104
	struct ip_vs_service_user_kern usvc;
3105
	struct ip_vs_service *svc;
3106 3107
	int ret;

3108
	ret = ip_vs_genl_parse_service(net, &usvc, nla, 0, &svc);
3109
	return ret ? ERR_PTR(ret) : svc;
3110 3111 3112 3113 3114
}

static int ip_vs_genl_fill_dest(struct sk_buff *skb, struct ip_vs_dest *dest)
{
	struct nlattr *nl_dest;
3115
	struct ip_vs_kstats kstats;
3116 3117 3118 3119 3120

	nl_dest = nla_nest_start(skb, IPVS_CMD_ATTR_DEST);
	if (!nl_dest)
		return -EMSGSIZE;

D
David S. Miller 已提交
3121
	if (nla_put(skb, IPVS_DEST_ATTR_ADDR, sizeof(dest->addr), &dest->addr) ||
3122
	    nla_put_be16(skb, IPVS_DEST_ATTR_PORT, dest->port) ||
D
David S. Miller 已提交
3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134
	    nla_put_u32(skb, IPVS_DEST_ATTR_FWD_METHOD,
			(atomic_read(&dest->conn_flags) &
			 IP_VS_CONN_F_FWD_MASK)) ||
	    nla_put_u32(skb, IPVS_DEST_ATTR_WEIGHT,
			atomic_read(&dest->weight)) ||
	    nla_put_u32(skb, IPVS_DEST_ATTR_U_THRESH, dest->u_threshold) ||
	    nla_put_u32(skb, IPVS_DEST_ATTR_L_THRESH, dest->l_threshold) ||
	    nla_put_u32(skb, IPVS_DEST_ATTR_ACTIVE_CONNS,
			atomic_read(&dest->activeconns)) ||
	    nla_put_u32(skb, IPVS_DEST_ATTR_INACT_CONNS,
			atomic_read(&dest->inactconns)) ||
	    nla_put_u32(skb, IPVS_DEST_ATTR_PERSIST_CONNS,
3135 3136
			atomic_read(&dest->persistconns)) ||
	    nla_put_u16(skb, IPVS_DEST_ATTR_ADDR_FAMILY, dest->af))
D
David S. Miller 已提交
3137
		goto nla_put_failure;
3138 3139 3140 3141
	ip_vs_copy_stats(&kstats, &dest->stats);
	if (ip_vs_genl_fill_stats(skb, IPVS_DEST_ATTR_STATS, &kstats))
		goto nla_put_failure;
	if (ip_vs_genl_fill_stats64(skb, IPVS_DEST_ATTR_STATS64, &kstats))
3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157
		goto nla_put_failure;

	nla_nest_end(skb, nl_dest);

	return 0;

nla_put_failure:
	nla_nest_cancel(skb, nl_dest);
	return -EMSGSIZE;
}

static int ip_vs_genl_dump_dest(struct sk_buff *skb, struct ip_vs_dest *dest,
				struct netlink_callback *cb)
{
	void *hdr;

3158
	hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
3159 3160 3161 3162 3163 3164 3165 3166
			  &ip_vs_genl_family, NLM_F_MULTI,
			  IPVS_CMD_NEW_DEST);
	if (!hdr)
		return -EMSGSIZE;

	if (ip_vs_genl_fill_dest(skb, dest) < 0)
		goto nla_put_failure;

3167 3168
	genlmsg_end(skb, hdr);
	return 0;
3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182

nla_put_failure:
	genlmsg_cancel(skb, hdr);
	return -EMSGSIZE;
}

static int ip_vs_genl_dump_dests(struct sk_buff *skb,
				 struct netlink_callback *cb)
{
	int idx = 0;
	int start = cb->args[0];
	struct ip_vs_service *svc;
	struct ip_vs_dest *dest;
	struct nlattr *attrs[IPVS_CMD_ATTR_MAX + 1];
3183
	struct net *net = skb_sknet(skb);
3184 3185 3186 3187 3188 3189 3190 3191

	mutex_lock(&__ip_vs_mutex);

	/* Try to find the service for which to dump destinations */
	if (nlmsg_parse(cb->nlh, GENL_HDRLEN, attrs,
			IPVS_CMD_ATTR_MAX, ip_vs_cmd_policy))
		goto out_err;

3192

3193
	svc = ip_vs_genl_find_service(net, attrs[IPVS_CMD_ATTR_SERVICE]);
3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215
	if (IS_ERR(svc) || svc == NULL)
		goto out_err;

	/* Dump the destinations */
	list_for_each_entry(dest, &svc->destinations, n_list) {
		if (++idx <= start)
			continue;
		if (ip_vs_genl_dump_dest(skb, dest, cb) < 0) {
			idx--;
			goto nla_put_failure;
		}
	}

nla_put_failure:
	cb->args[0] = idx;

out_err:
	mutex_unlock(&__ip_vs_mutex);

	return skb->len;
}

3216
static int ip_vs_genl_parse_dest(struct ip_vs_dest_user_kern *udest,
3217 3218 3219 3220
				 struct nlattr *nla, int full_entry)
{
	struct nlattr *attrs[IPVS_DEST_ATTR_MAX + 1];
	struct nlattr *nla_addr, *nla_port;
3221
	struct nlattr *nla_addr_family;
3222 3223 3224 3225 3226 3227 3228 3229

	/* Parse mandatory identifying destination fields first */
	if (nla == NULL ||
	    nla_parse_nested(attrs, IPVS_DEST_ATTR_MAX, nla, ip_vs_dest_policy))
		return -EINVAL;

	nla_addr	= attrs[IPVS_DEST_ATTR_ADDR];
	nla_port	= attrs[IPVS_DEST_ATTR_PORT];
3230
	nla_addr_family	= attrs[IPVS_DEST_ATTR_ADDR_FAMILY];
3231 3232 3233 3234

	if (!(nla_addr && nla_port))
		return -EINVAL;

S
Simon Horman 已提交
3235 3236
	memset(udest, 0, sizeof(*udest));

3237
	nla_memcpy(&udest->addr, nla_addr, sizeof(udest->addr));
3238
	udest->port = nla_get_be16(nla_port);
3239

3240 3241 3242 3243 3244
	if (nla_addr_family)
		udest->af = nla_get_u16(nla_addr_family);
	else
		udest->af = 0;

3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267
	/* If a full entry was requested, check for the additional fields */
	if (full_entry) {
		struct nlattr *nla_fwd, *nla_weight, *nla_u_thresh,
			      *nla_l_thresh;

		nla_fwd		= attrs[IPVS_DEST_ATTR_FWD_METHOD];
		nla_weight	= attrs[IPVS_DEST_ATTR_WEIGHT];
		nla_u_thresh	= attrs[IPVS_DEST_ATTR_U_THRESH];
		nla_l_thresh	= attrs[IPVS_DEST_ATTR_L_THRESH];

		if (!(nla_fwd && nla_weight && nla_u_thresh && nla_l_thresh))
			return -EINVAL;

		udest->conn_flags = nla_get_u32(nla_fwd)
				    & IP_VS_CONN_F_FWD_MASK;
		udest->weight = nla_get_u32(nla_weight);
		udest->u_threshold = nla_get_u32(nla_u_thresh);
		udest->l_threshold = nla_get_u32(nla_l_thresh);
	}

	return 0;
}

3268 3269
/*
 * Describe one sync daemon (state, multicast interface name, sync id)
 * inside an IPVS_CMD_ATTR_DAEMON nest on @skb.
 *
 * Returns 0 on success, -EMSGSIZE (with the nest cancelled) otherwise.
 */
static int ip_vs_genl_fill_daemon(struct sk_buff *skb, __u32 state,
				  const char *mcast_ifn, __u32 syncid)
{
	struct nlattr *nest = nla_nest_start(skb, IPVS_CMD_ATTR_DAEMON);

	if (!nest)
		return -EMSGSIZE;

	if (nla_put_u32(skb, IPVS_DAEMON_ATTR_STATE, state))
		goto cancel;
	if (nla_put_string(skb, IPVS_DAEMON_ATTR_MCAST_IFN, mcast_ifn))
		goto cancel;
	if (nla_put_u32(skb, IPVS_DAEMON_ATTR_SYNC_ID, syncid))
		goto cancel;

	nla_nest_end(skb, nest);
	return 0;

cancel:
	nla_nest_cancel(skb, nest);
	return -EMSGSIZE;
}

3290 3291
/*
 * Append one IPVS_CMD_NEW_DAEMON multipart message describing a sync
 * daemon to the dump buffer @skb.
 *
 * Returns 0 on success, -EMSGSIZE if the header or the attributes do
 * not fit (the partial message is cancelled).
 */
static int ip_vs_genl_dump_daemon(struct sk_buff *skb, __u32 state,
				  const char *mcast_ifn, __u32 syncid,
				  struct netlink_callback *cb)
{
	void *msg_hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid,
				    cb->nlh->nlmsg_seq, &ip_vs_genl_family,
				    NLM_F_MULTI, IPVS_CMD_NEW_DAEMON);

	if (!msg_hdr)
		return -EMSGSIZE;

	if (ip_vs_genl_fill_daemon(skb, state, mcast_ifn, syncid)) {
		genlmsg_cancel(skb, msg_hdr);
		return -EMSGSIZE;
	}

	genlmsg_end(skb, msg_hdr);
	return 0;
}

static int ip_vs_genl_dump_daemons(struct sk_buff *skb,
				   struct netlink_callback *cb)
{
3315
	struct net *net = skb_sknet(skb);
3316 3317
	struct netns_ipvs *ipvs = net_ipvs(net);

3318
	mutex_lock(&ipvs->sync_mutex);
3319
	if ((ipvs->sync_state & IP_VS_STATE_MASTER) && !cb->args[0]) {
3320
		if (ip_vs_genl_dump_daemon(skb, IP_VS_STATE_MASTER,
3321 3322
					   ipvs->master_mcast_ifn,
					   ipvs->master_syncid, cb) < 0)
3323 3324 3325 3326 3327
			goto nla_put_failure;

		cb->args[0] = 1;
	}

3328
	if ((ipvs->sync_state & IP_VS_STATE_BACKUP) && !cb->args[1]) {
3329
		if (ip_vs_genl_dump_daemon(skb, IP_VS_STATE_BACKUP,
3330 3331
					   ipvs->backup_mcast_ifn,
					   ipvs->backup_syncid, cb) < 0)
3332 3333 3334 3335 3336 3337
			goto nla_put_failure;

		cb->args[1] = 1;
	}

nla_put_failure:
3338
	mutex_unlock(&ipvs->sync_mutex);
3339 3340 3341 3342

	return skb->len;
}

3343
static int ip_vs_genl_new_daemon(struct net *net, struct nlattr **attrs)
3344 3345 3346 3347 3348 3349
{
	if (!(attrs[IPVS_DAEMON_ATTR_STATE] &&
	      attrs[IPVS_DAEMON_ATTR_MCAST_IFN] &&
	      attrs[IPVS_DAEMON_ATTR_SYNC_ID]))
		return -EINVAL;

3350 3351 3352 3353 3354 3355
	/* The synchronization protocol is incompatible with mixed family
	 * services
	 */
	if (net_ipvs(net)->mixed_address_family_dests > 0)
		return -EINVAL;

3356 3357
	return start_sync_thread(net,
				 nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE]),
3358 3359 3360 3361
				 nla_data(attrs[IPVS_DAEMON_ATTR_MCAST_IFN]),
				 nla_get_u32(attrs[IPVS_DAEMON_ATTR_SYNC_ID]));
}

3362
static int ip_vs_genl_del_daemon(struct net *net, struct nlattr **attrs)
3363 3364 3365 3366
{
	if (!attrs[IPVS_DAEMON_ATTR_STATE])
		return -EINVAL;

3367 3368
	return stop_sync_thread(net,
				nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE]));
3369 3370
}

3371
static int ip_vs_genl_set_config(struct net *net, struct nlattr **attrs)
3372 3373 3374
{
	struct ip_vs_timeout_user t;

3375
	__ip_vs_get_timeouts(net, &t);
3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386

	if (attrs[IPVS_CMD_ATTR_TIMEOUT_TCP])
		t.tcp_timeout = nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_TCP]);

	if (attrs[IPVS_CMD_ATTR_TIMEOUT_TCP_FIN])
		t.tcp_fin_timeout =
			nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_TCP_FIN]);

	if (attrs[IPVS_CMD_ATTR_TIMEOUT_UDP])
		t.udp_timeout = nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_UDP]);

3387
	return ip_vs_set_timeout(net, &t);
3388 3389
}

3390
static int ip_vs_genl_set_daemon(struct sk_buff *skb, struct genl_info *info)
3391 3392
{
	int ret = 0, cmd;
3393
	struct net *net;
3394
	struct netns_ipvs *ipvs;
3395

3396
	net = skb_sknet(skb);
3397
	ipvs = net_ipvs(net);
3398 3399
	cmd = info->genlhdr->cmd;

3400
	if (cmd == IPVS_CMD_NEW_DAEMON || cmd == IPVS_CMD_DEL_DAEMON) {
3401 3402
		struct nlattr *daemon_attrs[IPVS_DAEMON_ATTR_MAX + 1];

3403
		mutex_lock(&ipvs->sync_mutex);
3404 3405 3406 3407 3408 3409 3410 3411 3412
		if (!info->attrs[IPVS_CMD_ATTR_DAEMON] ||
		    nla_parse_nested(daemon_attrs, IPVS_DAEMON_ATTR_MAX,
				     info->attrs[IPVS_CMD_ATTR_DAEMON],
				     ip_vs_daemon_policy)) {
			ret = -EINVAL;
			goto out;
		}

		if (cmd == IPVS_CMD_NEW_DAEMON)
3413
			ret = ip_vs_genl_new_daemon(net, daemon_attrs);
3414
		else
3415
			ret = ip_vs_genl_del_daemon(net, daemon_attrs);
3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436
out:
		mutex_unlock(&ipvs->sync_mutex);
	}
	return ret;
}

static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info)
{
	struct ip_vs_service *svc = NULL;
	struct ip_vs_service_user_kern usvc;
	struct ip_vs_dest_user_kern udest;
	int ret = 0, cmd;
	int need_full_svc = 0, need_full_dest = 0;
	struct net *net;

	net = skb_sknet(skb);
	cmd = info->genlhdr->cmd;

	mutex_lock(&__ip_vs_mutex);

	if (cmd == IPVS_CMD_FLUSH) {
J
Julian Anastasov 已提交
3437
		ret = ip_vs_flush(net, false);
3438 3439 3440
		goto out;
	} else if (cmd == IPVS_CMD_SET_CONFIG) {
		ret = ip_vs_genl_set_config(net, info->attrs);
3441 3442 3443
		goto out;
	} else if (cmd == IPVS_CMD_ZERO &&
		   !info->attrs[IPVS_CMD_ATTR_SERVICE]) {
3444
		ret = ip_vs_zero_all(net);
3445 3446 3447 3448 3449 3450 3451 3452 3453
		goto out;
	}

	/* All following commands require a service argument, so check if we
	 * received a valid one. We need a full service specification when
	 * adding / editing a service. Only identifying members otherwise. */
	if (cmd == IPVS_CMD_NEW_SERVICE || cmd == IPVS_CMD_SET_SERVICE)
		need_full_svc = 1;

3454
	ret = ip_vs_genl_parse_service(net, &usvc,
3455
				       info->attrs[IPVS_CMD_ATTR_SERVICE],
3456
				       need_full_svc, &svc);
3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478
	if (ret)
		goto out;

	/* Unless we're adding a new service, the service must already exist */
	if ((cmd != IPVS_CMD_NEW_SERVICE) && (svc == NULL)) {
		ret = -ESRCH;
		goto out;
	}

	/* Destination commands require a valid destination argument. For
	 * adding / editing a destination, we need a full destination
	 * specification. */
	if (cmd == IPVS_CMD_NEW_DEST || cmd == IPVS_CMD_SET_DEST ||
	    cmd == IPVS_CMD_DEL_DEST) {
		if (cmd != IPVS_CMD_DEL_DEST)
			need_full_dest = 1;

		ret = ip_vs_genl_parse_dest(&udest,
					    info->attrs[IPVS_CMD_ATTR_DEST],
					    need_full_dest);
		if (ret)
			goto out;
3479 3480 3481 3482 3483 3484 3485 3486 3487

		/* Old protocols did not allow the user to specify address
		 * family, so we set it to zero instead.  We also didn't
		 * allow heterogeneous pools in the old code, so it's safe
		 * to assume that this will have the same address family as
		 * the service.
		 */
		if (udest.af == 0)
			udest.af = svc->af;
3488

3489
		if (udest.af != svc->af && cmd != IPVS_CMD_DEL_DEST) {
3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507
			/* The synchronization protocol is incompatible
			 * with mixed family services
			 */
			if (net_ipvs(net)->sync_state) {
				ret = -EINVAL;
				goto out;
			}

			/* Which connection types do we support? */
			switch (udest.conn_flags) {
			case IP_VS_CONN_F_TUNNEL:
				/* We are able to forward this */
				break;
			default:
				ret = -EINVAL;
				goto out;
			}
		}
3508 3509 3510 3511 3512
	}

	switch (cmd) {
	case IPVS_CMD_NEW_SERVICE:
		if (svc == NULL)
3513
			ret = ip_vs_add_service(net, &usvc, &svc);
3514 3515 3516 3517 3518 3519 3520 3521
		else
			ret = -EEXIST;
		break;
	case IPVS_CMD_SET_SERVICE:
		ret = ip_vs_edit_service(svc, &usvc);
		break;
	case IPVS_CMD_DEL_SERVICE:
		ret = ip_vs_del_service(svc);
3522
		/* do not use svc, it can be freed */
3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550
		break;
	case IPVS_CMD_NEW_DEST:
		ret = ip_vs_add_dest(svc, &udest);
		break;
	case IPVS_CMD_SET_DEST:
		ret = ip_vs_edit_dest(svc, &udest);
		break;
	case IPVS_CMD_DEL_DEST:
		ret = ip_vs_del_dest(svc, &udest);
		break;
	case IPVS_CMD_ZERO:
		ret = ip_vs_zero_service(svc);
		break;
	default:
		ret = -EINVAL;
	}

out:
	mutex_unlock(&__ip_vs_mutex);

	return ret;
}

static int ip_vs_genl_get_cmd(struct sk_buff *skb, struct genl_info *info)
{
	struct sk_buff *msg;
	void *reply;
	int ret, cmd, reply_cmd;
3551
	struct net *net;
3552

3553
	net = skb_sknet(skb);
3554 3555 3556 3557 3558 3559 3560 3561 3562
	cmd = info->genlhdr->cmd;

	if (cmd == IPVS_CMD_GET_SERVICE)
		reply_cmd = IPVS_CMD_NEW_SERVICE;
	else if (cmd == IPVS_CMD_GET_INFO)
		reply_cmd = IPVS_CMD_SET_INFO;
	else if (cmd == IPVS_CMD_GET_CONFIG)
		reply_cmd = IPVS_CMD_SET_CONFIG;
	else {
3563
		pr_err("unknown Generic Netlink command\n");
3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581
		return -EINVAL;
	}

	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
	if (!msg)
		return -ENOMEM;

	mutex_lock(&__ip_vs_mutex);

	reply = genlmsg_put_reply(msg, info, &ip_vs_genl_family, 0, reply_cmd);
	if (reply == NULL)
		goto nla_put_failure;

	switch (cmd) {
	case IPVS_CMD_GET_SERVICE:
	{
		struct ip_vs_service *svc;

3582 3583
		svc = ip_vs_genl_find_service(net,
					      info->attrs[IPVS_CMD_ATTR_SERVICE]);
3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602
		if (IS_ERR(svc)) {
			ret = PTR_ERR(svc);
			goto out_err;
		} else if (svc) {
			ret = ip_vs_genl_fill_service(msg, svc);
			if (ret)
				goto nla_put_failure;
		} else {
			ret = -ESRCH;
			goto out_err;
		}

		break;
	}

	case IPVS_CMD_GET_CONFIG:
	{
		struct ip_vs_timeout_user t;

3603
		__ip_vs_get_timeouts(net, &t);
3604
#ifdef CONFIG_IP_VS_PROTO_TCP
D
David S. Miller 已提交
3605 3606 3607 3608 3609
		if (nla_put_u32(msg, IPVS_CMD_ATTR_TIMEOUT_TCP,
				t.tcp_timeout) ||
		    nla_put_u32(msg, IPVS_CMD_ATTR_TIMEOUT_TCP_FIN,
				t.tcp_fin_timeout))
			goto nla_put_failure;
3610 3611
#endif
#ifdef CONFIG_IP_VS_PROTO_UDP
D
David S. Miller 已提交
3612 3613
		if (nla_put_u32(msg, IPVS_CMD_ATTR_TIMEOUT_UDP, t.udp_timeout))
			goto nla_put_failure;
3614 3615 3616 3617 3618 3619
#endif

		break;
	}

	case IPVS_CMD_GET_INFO:
D
David S. Miller 已提交
3620 3621 3622 3623 3624
		if (nla_put_u32(msg, IPVS_INFO_ATTR_VERSION,
				IP_VS_VERSION_CODE) ||
		    nla_put_u32(msg, IPVS_INFO_ATTR_CONN_TAB_SIZE,
				ip_vs_conn_tab_size))
			goto nla_put_failure;
3625 3626 3627 3628
		break;
	}

	genlmsg_end(msg, reply);
J
Johannes Berg 已提交
3629
	ret = genlmsg_reply(msg, info);
3630 3631 3632
	goto out;

nla_put_failure:
3633
	pr_err("not enough space in Netlink message\n");
3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644
	ret = -EMSGSIZE;

out_err:
	nlmsg_free(msg);
out:
	mutex_unlock(&__ip_vs_mutex);

	return ret;
}


3645
static const struct genl_ops ip_vs_genl_ops[] = {
3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698
	{
		.cmd	= IPVS_CMD_NEW_SERVICE,
		.flags	= GENL_ADMIN_PERM,
		.policy	= ip_vs_cmd_policy,
		.doit	= ip_vs_genl_set_cmd,
	},
	{
		.cmd	= IPVS_CMD_SET_SERVICE,
		.flags	= GENL_ADMIN_PERM,
		.policy	= ip_vs_cmd_policy,
		.doit	= ip_vs_genl_set_cmd,
	},
	{
		.cmd	= IPVS_CMD_DEL_SERVICE,
		.flags	= GENL_ADMIN_PERM,
		.policy	= ip_vs_cmd_policy,
		.doit	= ip_vs_genl_set_cmd,
	},
	{
		.cmd	= IPVS_CMD_GET_SERVICE,
		.flags	= GENL_ADMIN_PERM,
		.doit	= ip_vs_genl_get_cmd,
		.dumpit	= ip_vs_genl_dump_services,
		.policy	= ip_vs_cmd_policy,
	},
	{
		.cmd	= IPVS_CMD_NEW_DEST,
		.flags	= GENL_ADMIN_PERM,
		.policy	= ip_vs_cmd_policy,
		.doit	= ip_vs_genl_set_cmd,
	},
	{
		.cmd	= IPVS_CMD_SET_DEST,
		.flags	= GENL_ADMIN_PERM,
		.policy	= ip_vs_cmd_policy,
		.doit	= ip_vs_genl_set_cmd,
	},
	{
		.cmd	= IPVS_CMD_DEL_DEST,
		.flags	= GENL_ADMIN_PERM,
		.policy	= ip_vs_cmd_policy,
		.doit	= ip_vs_genl_set_cmd,
	},
	{
		.cmd	= IPVS_CMD_GET_DEST,
		.flags	= GENL_ADMIN_PERM,
		.policy	= ip_vs_cmd_policy,
		.dumpit	= ip_vs_genl_dump_dests,
	},
	{
		.cmd	= IPVS_CMD_NEW_DAEMON,
		.flags	= GENL_ADMIN_PERM,
		.policy	= ip_vs_cmd_policy,
3699
		.doit	= ip_vs_genl_set_daemon,
3700 3701 3702 3703 3704
	},
	{
		.cmd	= IPVS_CMD_DEL_DAEMON,
		.flags	= GENL_ADMIN_PERM,
		.policy	= ip_vs_cmd_policy,
3705
		.doit	= ip_vs_genl_set_daemon,
3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742
	},
	{
		.cmd	= IPVS_CMD_GET_DAEMON,
		.flags	= GENL_ADMIN_PERM,
		.dumpit	= ip_vs_genl_dump_daemons,
	},
	{
		.cmd	= IPVS_CMD_SET_CONFIG,
		.flags	= GENL_ADMIN_PERM,
		.policy	= ip_vs_cmd_policy,
		.doit	= ip_vs_genl_set_cmd,
	},
	{
		.cmd	= IPVS_CMD_GET_CONFIG,
		.flags	= GENL_ADMIN_PERM,
		.doit	= ip_vs_genl_get_cmd,
	},
	{
		.cmd	= IPVS_CMD_GET_INFO,
		.flags	= GENL_ADMIN_PERM,
		.doit	= ip_vs_genl_get_cmd,
	},
	{
		.cmd	= IPVS_CMD_ZERO,
		.flags	= GENL_ADMIN_PERM,
		.policy	= ip_vs_cmd_policy,
		.doit	= ip_vs_genl_set_cmd,
	},
	{
		.cmd	= IPVS_CMD_FLUSH,
		.flags	= GENL_ADMIN_PERM,
		.doit	= ip_vs_genl_set_cmd,
	},
};

/*
 * Register the IPVS Generic Netlink family together with
 * ip_vs_genl_ops.  Returns the result of the registration call.
 */
static int __init ip_vs_genl_register(void)
{
	int ret;

	ret = genl_register_family_with_ops(&ip_vs_genl_family,
					    ip_vs_genl_ops);
	return ret;
}

/* Unregister the IPVS Generic Netlink family. */
static void ip_vs_genl_unregister(void)
{
	genl_unregister_family(&ip_vs_genl_family);
}

/* End of Generic Netlink interface definitions */

3754 3755 3756
/*
 * per netns init/exit func.
 */
3757
#ifdef CONFIG_SYSCTL
C
Claudiu Ghioc 已提交
3758
/*
 * Per-netns sysctl setup (CONFIG_SYSCTL build).
 *
 * init_net uses vs_vars directly; every other netns gets a kmemdup()'ed
 * copy.  Each table slot is pointed at the matching ipvs->sysctl_*
 * field, in table order, and the non-zero defaults are set on the way.
 * The table is then registered under "net/ipv4/vs", the total-stats
 * estimator is started and the defense work is scheduled.
 *
 * Returns 0 on success, -ENOMEM if the table copy or the registration
 * fails.
 */
static int __net_init ip_vs_control_net_init_sysctl(struct net *net)
{
	struct netns_ipvs *ipvs = net_ipvs(net);
	struct ctl_table *tbl;
	int slot;

	atomic_set(&ipvs->dropentry, 0);
	spin_lock_init(&ipvs->dropentry_lock);
	spin_lock_init(&ipvs->droppacket_lock);
	spin_lock_init(&ipvs->securetcp_lock);

	if (net_eq(net, &init_net)) {
		tbl = vs_vars;
	} else {
		tbl = kmemdup(vs_vars, sizeof(vs_vars), GFP_KERNEL);
		if (!tbl)
			return -ENOMEM;

		/* Don't export sysctls to unprivileged users */
		if (net->user_ns != &init_user_ns)
			tbl[0].procname = NULL;
	}

	/* Initialize sysctl defaults; slot order must follow the table */
	slot = 0;

	ipvs->sysctl_amemthresh = 1024;
	tbl[slot++].data = &ipvs->sysctl_amemthresh;

	ipvs->sysctl_am_droprate = 10;
	tbl[slot++].data = &ipvs->sysctl_am_droprate;

	tbl[slot++].data = &ipvs->sysctl_drop_entry;
	tbl[slot++].data = &ipvs->sysctl_drop_packet;
#ifdef CONFIG_IP_VS_NFCT
	tbl[slot++].data = &ipvs->sysctl_conntrack;
#endif
	tbl[slot++].data = &ipvs->sysctl_secure_tcp;

	ipvs->sysctl_snat_reroute = 1;
	tbl[slot++].data = &ipvs->sysctl_snat_reroute;

	ipvs->sysctl_sync_ver = 1;
	tbl[slot++].data = &ipvs->sysctl_sync_ver;

	ipvs->sysctl_sync_ports = 1;
	tbl[slot++].data = &ipvs->sysctl_sync_ports;

	tbl[slot++].data = &ipvs->sysctl_sync_persist_mode;

	ipvs->sysctl_sync_qlen_max = nr_free_buffer_pages() / 32;
	tbl[slot++].data = &ipvs->sysctl_sync_qlen_max;

	ipvs->sysctl_sync_sock_size = 0;
	tbl[slot++].data = &ipvs->sysctl_sync_sock_size;

	tbl[slot++].data = &ipvs->sysctl_cache_bypass;
	tbl[slot++].data = &ipvs->sysctl_expire_nodest_conn;
	tbl[slot++].data = &ipvs->sysctl_sloppy_tcp;
	tbl[slot++].data = &ipvs->sysctl_sloppy_sctp;
	tbl[slot++].data = &ipvs->sysctl_expire_quiescent_template;

	/* sync_threshold is a two-element array: set data and maxlen */
	ipvs->sysctl_sync_threshold[0] = DEFAULT_SYNC_THRESHOLD;
	ipvs->sysctl_sync_threshold[1] = DEFAULT_SYNC_PERIOD;
	tbl[slot].data = &ipvs->sysctl_sync_threshold;
	tbl[slot++].maxlen = sizeof(ipvs->sysctl_sync_threshold);

	ipvs->sysctl_sync_refresh_period = DEFAULT_SYNC_REFRESH_PERIOD;
	tbl[slot++].data = &ipvs->sysctl_sync_refresh_period;

	ipvs->sysctl_sync_retries = clamp_t(int, DEFAULT_SYNC_RETRIES, 0, 3);
	tbl[slot++].data = &ipvs->sysctl_sync_retries;

	tbl[slot++].data = &ipvs->sysctl_nat_icmp_send;

	ipvs->sysctl_pmtu_disc = 1;
	tbl[slot++].data = &ipvs->sysctl_pmtu_disc;

	tbl[slot++].data = &ipvs->sysctl_backup_only;

	ipvs->sysctl_conn_reuse_mode = 1;
	tbl[slot++].data = &ipvs->sysctl_conn_reuse_mode;

	ipvs->sysctl_hdr = register_net_sysctl(net, "net/ipv4/vs", tbl);
	if (!ipvs->sysctl_hdr) {
		/* Only the per-netns copy was allocated here */
		if (!net_eq(net, &init_net))
			kfree(tbl);
		return -ENOMEM;
	}

	ip_vs_start_estimator(net, &ipvs->tot_stats);
	ipvs->sysctl_tbl = tbl;

	/* Schedule defense work */
	INIT_DELAYED_WORK(&ipvs->defense_work, defense_work_handler);
	schedule_delayed_work(&ipvs->defense_work, DEFENSE_TIMER_PERIOD);

	return 0;
}

C
Claudiu Ghioc 已提交
3838
/*
 * Per-netns sysctl teardown (CONFIG_SYSCTL build): cancel the defense
 * work, unregister the sysctl table, stop the total-stats estimator and
 * free the table copy that non-init namespaces own.
 */
static void __net_exit ip_vs_control_net_cleanup_sysctl(struct net *net)
{
	struct netns_ipvs *ipvs = net_ipvs(net);

	cancel_delayed_work_sync(&ipvs->defense_work);
	cancel_work_sync(&ipvs->defense_work.work);

	unregister_net_sysctl_table(ipvs->sysctl_hdr);
	ip_vs_stop_estimator(net, &ipvs->tot_stats);

	/* init_net uses the static vs_vars table, nothing to free there */
	if (net_eq(net, &init_net))
		return;

	kfree(ipvs->sysctl_tbl);
}

#else

C
Claudiu Ghioc 已提交
3853 3854
/* !CONFIG_SYSCTL: nothing to set up, always succeeds. */
static int __net_init ip_vs_control_net_init_sysctl(struct net *net)
{
	return 0;
}
/* !CONFIG_SYSCTL: nothing to tear down. */
static void __net_exit ip_vs_control_net_cleanup_sysctl(struct net *net)
{
}
3855

3856
#endif
3857

3858 3859 3860 3861
/*
 * Notifier block that routes events to ip_vs_dst_event().  It is
 * registered with register_netdevice_notifier() in ip_vs_control_init()
 * and unregistered in ip_vs_control_cleanup().
 */
static struct notifier_block ip_vs_dst_notifier = {
	.notifier_call = ip_vs_dst_event,
};

3862
/*
 * Per-netns control-plane initialisation: real-server hash table, dest
 * trash list and timer, service counters, per-cpu total statistics, the
 * three /proc/net entries and finally the sysctl state.
 *
 * Returns 0 on success, -ENOMEM if the per-cpu stats cannot be
 * allocated or the sysctl setup fails.  On sysctl failure the /proc
 * entries created here are removed again before the stats are freed,
 * matching ip_vs_control_net_cleanup().
 */
int __net_init ip_vs_control_net_init(struct net *net)
{
	int i, idx;
	struct netns_ipvs *ipvs = net_ipvs(net);

	/* Initialize rs_table */
	for (idx = 0; idx < IP_VS_RTAB_SIZE; idx++)
		INIT_HLIST_HEAD(&ipvs->rs_table[idx]);

	INIT_LIST_HEAD(&ipvs->dest_trash);
	spin_lock_init(&ipvs->dest_trash_lock);
	setup_timer(&ipvs->dest_trash_timer, ip_vs_dest_trash_expire,
		    (unsigned long) net);
	atomic_set(&ipvs->ftpsvc_counter, 0);
	atomic_set(&ipvs->nullsvc_counter, 0);

	/* procfs stats */
	ipvs->tot_stats.cpustats = alloc_percpu(struct ip_vs_cpu_stats);
	if (!ipvs->tot_stats.cpustats)
		return -ENOMEM;

	for_each_possible_cpu(i) {
		struct ip_vs_cpu_stats *ipvs_tot_stats;
		ipvs_tot_stats = per_cpu_ptr(ipvs->tot_stats.cpustats, i);
		u64_stats_init(&ipvs_tot_stats->syncp);
	}

	spin_lock_init(&ipvs->tot_stats.lock);

	proc_create("ip_vs", 0, net->proc_net, &ip_vs_info_fops);
	proc_create("ip_vs_stats", 0, net->proc_net, &ip_vs_stats_fops);
	proc_create("ip_vs_stats_percpu", 0, net->proc_net,
		    &ip_vs_stats_percpu_fops);

	if (ip_vs_control_net_init_sysctl(net))
		goto err;

	return 0;

err:
	/* Do not leave /proc files behind once the per-cpu stats are
	 * gone; remove them in reverse creation order, as the regular
	 * cleanup path does.
	 */
	remove_proc_entry("ip_vs_stats_percpu", net->proc_net);
	remove_proc_entry("ip_vs_stats", net->proc_net);
	remove_proc_entry("ip_vs", net->proc_net);
	free_percpu(ipvs->tot_stats.cpustats);
	return -ENOMEM;
}

3906
/*
 * Per-netns control-plane teardown: clean the dest trash, tear down the
 * sysctl state, remove the three /proc/net entries and free the per-cpu
 * total statistics.
 */
void __net_exit ip_vs_control_net_cleanup(struct net *net)
{
	struct netns_ipvs *ipvs = net_ipvs(net);

	ip_vs_trash_cleanup(net);
	ip_vs_control_net_cleanup_sysctl(net);

	/* /proc entries go away in reverse creation order */
	remove_proc_entry("ip_vs_stats_percpu", net->proc_net);
	remove_proc_entry("ip_vs_stats", net->proc_net);
	remove_proc_entry("ip_vs", net->proc_net);

	free_percpu(ipvs->tot_stats.cpustats);
}

3918
/*
 * Register the two userspace control interfaces: the sockopt interface
 * first, then the Generic Netlink family.  If the latter fails, the
 * sockopt registration is undone.
 *
 * Returns 0 on success or the error of the failing registration.
 */
int __init ip_vs_register_nl_ioctl(void)
{
	int ret;

	ret = nf_register_sockopt(&ip_vs_sockopts);
	if (ret) {
		pr_err("cannot register sockopt.\n");
		return ret;
	}

	ret = ip_vs_genl_register();
	if (ret) {
		pr_err("cannot register Generic Netlink interface.\n");
		nf_unregister_sockopt(&ip_vs_sockopts);
		return ret;
	}

	return 0;
}

3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953
/*
 * Unregister the userspace control interfaces in the reverse order of
 * ip_vs_register_nl_ioctl(): Generic Netlink first, then sockopt.
 */
void ip_vs_unregister_nl_ioctl(void)
{
	ip_vs_genl_unregister();
	nf_unregister_sockopt(&ip_vs_sockopts);
}

/*
 * Module-wide control-plane initialisation: initialise every bucket of
 * the two global service hash tables and register the netdevice
 * notifier.
 *
 * Returns 0 on success or the negative error from
 * register_netdevice_notifier().
 */
int __init ip_vs_control_init(void)
{
	int bucket;
	int err;

	EnterFunction(2);

	/* Initialize svc_table, ip_vs_svc_fwm_table */
	for (bucket = 0; bucket < IP_VS_SVC_TAB_SIZE; bucket++) {
		INIT_HLIST_HEAD(&ip_vs_svc_table[bucket]);
		INIT_HLIST_HEAD(&ip_vs_svc_fwm_table[bucket]);
	}

	smp_wmb();	/* Do we really need it now ? */

	err = register_netdevice_notifier(&ip_vs_dst_notifier);
	if (err < 0)
		return err;

	LeaveFunction(2);
	return 0;
}

L
Linus Torvalds 已提交
3970 3971 3972 3973

void ip_vs_control_cleanup(void)
{
	EnterFunction(2);
3974
	unregister_netdevice_notifier(&ip_vs_dst_notifier);
L
Linus Torvalds 已提交
3975 3976
	LeaveFunction(2);
}