ip_vs_proto_udp.c 11.4 KB
Newer Older
L
Linus Torvalds 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
/*
 * ip_vs_proto_udp.c:	UDP load balancing support for IPVS
 *
 * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
 *              Julian Anastasov <ja@ssi.bg>
 *
 *              This program is free software; you can redistribute it and/or
 *              modify it under the terms of the GNU General Public License
 *              as published by the Free Software Foundation; either version
 *              2 of the License, or (at your option) any later version.
 *
 * Changes:
 *
 */

16 17
#include <linux/in.h>
#include <linux/ip.h>
L
Linus Torvalds 已提交
18
#include <linux/kernel.h>
19
#include <linux/netfilter.h>
L
Linus Torvalds 已提交
20
#include <linux/netfilter_ipv4.h>
21
#include <linux/udp.h>
L
Linus Torvalds 已提交
22 23

#include <net/ip_vs.h>
24
#include <net/ip.h>
L
Linus Torvalds 已提交
25 26

static struct ip_vs_conn *
27 28 29
udp_conn_in_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp,
		const struct ip_vs_iphdr *iph, unsigned int proto_off,
		int inverse)
L
Linus Torvalds 已提交
30 31
{
	struct ip_vs_conn *cp;
A
Al Viro 已提交
32
	__be16 _ports[2], *pptr;
L
Linus Torvalds 已提交
33 34 35 36 37 38

	pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports);
	if (pptr == NULL)
		return NULL;

	if (likely(!inverse)) {
39 40 41
		cp = ip_vs_conn_in_get(af, iph->protocol,
				       &iph->saddr, pptr[0],
				       &iph->daddr, pptr[1]);
L
Linus Torvalds 已提交
42
	} else {
43 44 45
		cp = ip_vs_conn_in_get(af, iph->protocol,
				       &iph->daddr, pptr[1],
				       &iph->saddr, pptr[0]);
L
Linus Torvalds 已提交
46 47 48 49 50 51 52
	}

	return cp;
}


static struct ip_vs_conn *
53 54 55
udp_conn_out_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp,
		 const struct ip_vs_iphdr *iph, unsigned int proto_off,
		 int inverse)
L
Linus Torvalds 已提交
56 57
{
	struct ip_vs_conn *cp;
A
Al Viro 已提交
58
	__be16 _ports[2], *pptr;
L
Linus Torvalds 已提交
59

60
	pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports);
L
Linus Torvalds 已提交
61 62 63 64
	if (pptr == NULL)
		return NULL;

	if (likely(!inverse)) {
65 66 67
		cp = ip_vs_conn_out_get(af, iph->protocol,
					&iph->saddr, pptr[0],
					&iph->daddr, pptr[1]);
L
Linus Torvalds 已提交
68
	} else {
69 70 71
		cp = ip_vs_conn_out_get(af, iph->protocol,
					&iph->daddr, pptr[1],
					&iph->saddr, pptr[0]);
L
Linus Torvalds 已提交
72 73 74 75 76 77 78
	}

	return cp;
}


static int
79
udp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
L
Linus Torvalds 已提交
80 81 82 83
		  int *verdict, struct ip_vs_conn **cpp)
{
	struct ip_vs_service *svc;
	struct udphdr _udph, *uh;
84
	struct ip_vs_iphdr iph;
L
Linus Torvalds 已提交
85

86
	ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
87 88

	uh = skb_header_pointer(skb, iph.len, sizeof(_udph), &_udph);
L
Linus Torvalds 已提交
89 90 91 92 93
	if (uh == NULL) {
		*verdict = NF_DROP;
		return 0;
	}

94
	svc = ip_vs_service_get(af, skb->mark, iph.protocol,
95 96
				&iph.daddr, uh->dest);
	if (svc) {
L
Linus Torvalds 已提交
97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122
		if (ip_vs_todrop()) {
			/*
			 * It seems that we are very loaded.
			 * We have to drop this packet :(
			 */
			ip_vs_service_put(svc);
			*verdict = NF_DROP;
			return 0;
		}

		/*
		 * Let the virtual server select a real server for the
		 * incoming connection, and create a connection entry.
		 */
		*cpp = ip_vs_schedule(svc, skb);
		if (!*cpp) {
			*verdict = ip_vs_leave(svc, skb, pp);
			return 0;
		}
		ip_vs_service_put(svc);
	}
	return 1;
}


static inline void
123 124 125
udp_fast_csum_update(int af, struct udphdr *uhdr,
		     const union nf_inet_addr *oldip,
		     const union nf_inet_addr *newip,
A
Al Viro 已提交
126
		     __be16 oldport, __be16 newport)
L
Linus Torvalds 已提交
127
{
128 129 130 131 132 133 134 135 136 137 138 139
#ifdef CONFIG_IP_VS_IPV6
	if (af == AF_INET6)
		uhdr->check =
			csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6,
					 ip_vs_check_diff2(oldport, newport,
						~csum_unfold(uhdr->check))));
	else
#endif
		uhdr->check =
			csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip,
					 ip_vs_check_diff2(oldport, newport,
						~csum_unfold(uhdr->check))));
L
Linus Torvalds 已提交
140
	if (!uhdr->check)
141
		uhdr->check = CSUM_MANGLED_0;
L
Linus Torvalds 已提交
142 143 144
}

static int
145
udp_snat_handler(struct sk_buff *skb,
L
Linus Torvalds 已提交
146 147 148
		 struct ip_vs_protocol *pp, struct ip_vs_conn *cp)
{
	struct udphdr *udph;
149 150 151 152 153 154 155 156
	unsigned int udphoff;

#ifdef CONFIG_IP_VS_IPV6
	if (cp->af == AF_INET6)
		udphoff = sizeof(struct ipv6hdr);
	else
#endif
		udphoff = ip_hdrlen(skb);
L
Linus Torvalds 已提交
157 158

	/* csum_check requires unshared skb */
159
	if (!skb_make_writable(skb, udphoff+sizeof(*udph)))
L
Linus Torvalds 已提交
160 161 162 163
		return 0;

	if (unlikely(cp->app != NULL)) {
		/* Some checks before mangling */
164
		if (pp->csum_check && !pp->csum_check(cp->af, skb, pp))
L
Linus Torvalds 已提交
165 166 167 168 169
			return 0;

		/*
		 *	Call application helper if needed
		 */
170
		if (!ip_vs_app_pkt_out(cp, skb))
L
Linus Torvalds 已提交
171 172 173
			return 0;
	}

174
	udph = (void *)skb_network_header(skb) + udphoff;
L
Linus Torvalds 已提交
175 176 177 178 179 180 181
	udph->source = cp->vport;

	/*
	 *	Adjust UDP checksums
	 */
	if (!cp->app && (udph->check != 0)) {
		/* Only port and addr are changed, do fast csum update */
182
		udp_fast_csum_update(cp->af, udph, &cp->daddr, &cp->vaddr,
L
Linus Torvalds 已提交
183
				     cp->dport, cp->vport);
184 185
		if (skb->ip_summed == CHECKSUM_COMPLETE)
			skb->ip_summed = CHECKSUM_NONE;
L
Linus Torvalds 已提交
186 187 188
	} else {
		/* full checksum calculation */
		udph->check = 0;
189
		skb->csum = skb_checksum(skb, udphoff, skb->len - udphoff, 0);
190 191 192 193 194 195 196 197 198 199 200 201 202
#ifdef CONFIG_IP_VS_IPV6
		if (cp->af == AF_INET6)
			udph->check = csum_ipv6_magic(&cp->vaddr.in6,
						      &cp->caddr.in6,
						      skb->len - udphoff,
						      cp->protocol, skb->csum);
		else
#endif
			udph->check = csum_tcpudp_magic(cp->vaddr.ip,
							cp->caddr.ip,
							skb->len - udphoff,
							cp->protocol,
							skb->csum);
L
Linus Torvalds 已提交
203
		if (udph->check == 0)
204
			udph->check = CSUM_MANGLED_0;
L
Linus Torvalds 已提交
205 206 207 208 209 210 211 212 213
		IP_VS_DBG(11, "O-pkt: %s O-csum=%d (+%zd)\n",
			  pp->name, udph->check,
			  (char*)&(udph->check) - (char*)udph);
	}
	return 1;
}


static int
214
udp_dnat_handler(struct sk_buff *skb,
L
Linus Torvalds 已提交
215 216 217
		 struct ip_vs_protocol *pp, struct ip_vs_conn *cp)
{
	struct udphdr *udph;
218 219 220 221 222 223 224 225
	unsigned int udphoff;

#ifdef CONFIG_IP_VS_IPV6
	if (cp->af == AF_INET6)
		udphoff = sizeof(struct ipv6hdr);
	else
#endif
		udphoff = ip_hdrlen(skb);
L
Linus Torvalds 已提交
226 227

	/* csum_check requires unshared skb */
228
	if (!skb_make_writable(skb, udphoff+sizeof(*udph)))
L
Linus Torvalds 已提交
229 230 231 232
		return 0;

	if (unlikely(cp->app != NULL)) {
		/* Some checks before mangling */
233
		if (pp->csum_check && !pp->csum_check(cp->af, skb, pp))
L
Linus Torvalds 已提交
234 235 236 237 238 239
			return 0;

		/*
		 *	Attempt ip_vs_app call.
		 *	It will fix ip_vs_conn
		 */
240
		if (!ip_vs_app_pkt_in(cp, skb))
L
Linus Torvalds 已提交
241 242 243
			return 0;
	}

244
	udph = (void *)skb_network_header(skb) + udphoff;
L
Linus Torvalds 已提交
245 246 247 248 249 250 251
	udph->dest = cp->dport;

	/*
	 *	Adjust UDP checksums
	 */
	if (!cp->app && (udph->check != 0)) {
		/* Only port and addr are changed, do fast csum update */
252
		udp_fast_csum_update(cp->af, udph, &cp->vaddr, &cp->daddr,
L
Linus Torvalds 已提交
253
				     cp->vport, cp->dport);
254 255
		if (skb->ip_summed == CHECKSUM_COMPLETE)
			skb->ip_summed = CHECKSUM_NONE;
L
Linus Torvalds 已提交
256 257 258
	} else {
		/* full checksum calculation */
		udph->check = 0;
259
		skb->csum = skb_checksum(skb, udphoff, skb->len - udphoff, 0);
260 261 262 263 264 265 266 267 268 269 270 271 272
#ifdef CONFIG_IP_VS_IPV6
		if (cp->af == AF_INET6)
			udph->check = csum_ipv6_magic(&cp->caddr.in6,
						      &cp->daddr.in6,
						      skb->len - udphoff,
						      cp->protocol, skb->csum);
		else
#endif
			udph->check = csum_tcpudp_magic(cp->caddr.ip,
							cp->daddr.ip,
							skb->len - udphoff,
							cp->protocol,
							skb->csum);
L
Linus Torvalds 已提交
273
		if (udph->check == 0)
274
			udph->check = CSUM_MANGLED_0;
275
		skb->ip_summed = CHECKSUM_UNNECESSARY;
L
Linus Torvalds 已提交
276 277 278 279 280 281
	}
	return 1;
}


static int
282
udp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp)
L
Linus Torvalds 已提交
283 284
{
	struct udphdr _udph, *uh;
285 286 287 288 289 290 291 292
	unsigned int udphoff;

#ifdef CONFIG_IP_VS_IPV6
	if (af == AF_INET6)
		udphoff = sizeof(struct ipv6hdr);
	else
#endif
		udphoff = ip_hdrlen(skb);
L
Linus Torvalds 已提交
293 294 295 296 297 298 299 300 301 302

	uh = skb_header_pointer(skb, udphoff, sizeof(_udph), &_udph);
	if (uh == NULL)
		return 0;

	if (uh->check != 0) {
		switch (skb->ip_summed) {
		case CHECKSUM_NONE:
			skb->csum = skb_checksum(skb, udphoff,
						 skb->len - udphoff, 0);
303
		case CHECKSUM_COMPLETE:
304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325
#ifdef CONFIG_IP_VS_IPV6
			if (af == AF_INET6) {
				if (csum_ipv6_magic(&ipv6_hdr(skb)->saddr,
						    &ipv6_hdr(skb)->daddr,
						    skb->len - udphoff,
						    ipv6_hdr(skb)->nexthdr,
						    skb->csum)) {
					IP_VS_DBG_RL_PKT(0, pp, skb, 0,
							 "Failed checksum for");
					return 0;
				}
			} else
#endif
				if (csum_tcpudp_magic(ip_hdr(skb)->saddr,
						      ip_hdr(skb)->daddr,
						      skb->len - udphoff,
						      ip_hdr(skb)->protocol,
						      skb->csum)) {
					IP_VS_DBG_RL_PKT(0, pp, skb, 0,
							 "Failed checksum for");
					return 0;
				}
L
Linus Torvalds 已提交
326 327
			break;
		default:
328
			/* No need to checksum. */
L
Linus Torvalds 已提交
329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347
			break;
		}
	}
	return 1;
}


/*
 *	Note: the caller guarantees that only one of register_app,
 *	unregister_app or app_conn_bind is called each time.
 */

/*
 * Application helper hash table: 2^UDP_APP_TAB_BITS (16) buckets, indexed
 * by udp_app_hashkey() of the helper's port.
 */
#define	UDP_APP_TAB_BITS	4
#define	UDP_APP_TAB_SIZE	(1 << UDP_APP_TAB_BITS)
#define	UDP_APP_TAB_MASK	(UDP_APP_TAB_SIZE - 1)

/* Bucket heads; entries are linked through ip_vs_app.p_list. */
static struct list_head udp_apps[UDP_APP_TAB_SIZE];
/* Taken around every walk or modification of udp_apps in this file. */
static DEFINE_SPINLOCK(udp_app_lock);

A
Al Viro 已提交
348
/*
 * Map a port to a bucket index in udp_apps[]: the raw 16-bit value is
 * XORed with itself shifted right by UDP_APP_TAB_BITS, then masked down
 * to the table size.  The port is hashed as stored, without byte-order
 * conversion.
 */
static inline __u16 udp_app_hashkey(__be16 port)
{
	u16 raw = (__force u16)port;

	return ((raw >> UDP_APP_TAB_BITS) ^ raw) & UDP_APP_TAB_MASK;
}


static int udp_register_app(struct ip_vs_app *inc)
{
	struct ip_vs_app *i;
A
Al Viro 已提交
358 359
	__u16 hash;
	__be16 port = inc->port;
L
Linus Torvalds 已提交
360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412
	int ret = 0;

	hash = udp_app_hashkey(port);


	spin_lock_bh(&udp_app_lock);
	list_for_each_entry(i, &udp_apps[hash], p_list) {
		if (i->port == port) {
			ret = -EEXIST;
			goto out;
		}
	}
	list_add(&inc->p_list, &udp_apps[hash]);
	atomic_inc(&ip_vs_protocol_udp.appcnt);

  out:
	spin_unlock_bh(&udp_app_lock);
	return ret;
}


/*
 * Remove application helper incarnation @inc from the UDP app hash table
 * and decrement the protocol's appcnt, both under udp_app_lock.
 * No check is made here that @inc is actually on a list.
 */
static void
udp_unregister_app(struct ip_vs_app *inc)
{
	spin_lock_bh(&udp_app_lock);
	atomic_dec(&ip_vs_protocol_udp.appcnt);
	list_del(&inc->p_list);
	spin_unlock_bh(&udp_app_lock);
}


/*
 * Bind connection @cp to the application helper registered for its
 * virtual port, if there is one.
 *
 * Only connections using the IP_VS_CONN_F_MASQ forwarding method are
 * considered.  Returns 0 when nothing is bound (wrong forwarding method,
 * no helper on cp->vport, or ip_vs_app_inc_get() failed); otherwise
 * returns the result of the helper's init_conn() hook, or 0 if it has none.
 */
static int udp_app_conn_bind(struct ip_vs_conn *cp)
{
	struct ip_vs_app *app;
	int bucket;
	int rc = 0;

	/* Default binding: bind app only for NAT */
	if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ)
		return 0;

	/* Lookup application incarnations and bind the right one */
	bucket = udp_app_hashkey(cp->vport);

	spin_lock(&udp_app_lock);
	list_for_each_entry(app, &udp_apps[bucket], p_list) {
		if (app->port != cp->vport)
			continue;

		/* Could not take a reference: give up, still locked. */
		if (unlikely(!ip_vs_app_inc_get(app)))
			break;

		/* The lock is dropped before calling into the helper. */
		spin_unlock(&udp_app_lock);

		IP_VS_DBG(9, "%s: Binding conn %u.%u.%u.%u:%u->"
			  "%u.%u.%u.%u:%u to app %s on port %u\n",
			  __func__,
			  NIPQUAD(cp->caddr.ip), ntohs(cp->cport),
			  NIPQUAD(cp->vaddr.ip), ntohs(cp->vport),
			  app->name, ntohs(app->port));
		cp->app = app;
		if (app->init_conn)
			rc = app->init_conn(app, cp);
		return rc;
	}
	spin_unlock(&udp_app_lock);

	return rc;
}


/*
 * Per-state timeouts, indexed by UDP state and expressed as multiples of
 * HZ.  Installed as pp->timeout_table by udp_init().
 */
static int udp_timeouts[IP_VS_UDP_S_LAST+1] = {
	[IP_VS_UDP_S_NORMAL]		=	5*60*HZ,
	[IP_VS_UDP_S_LAST]		=	2*HZ,
};

/*
 * Printable names for the UDP states; the IP_VS_UDP_S_LAST slot is a
 * sentinel that udp_state_name() never hands out.
 */
static char * udp_state_name_table[IP_VS_UDP_S_LAST+1] = {
	[IP_VS_UDP_S_NORMAL]		=	"UDP",
	[IP_VS_UDP_S_LAST]		=	"BUG!",
};


/*
 * Set the timeout of the state named @sname to @to in the protocol's
 * timeout table.  Thin wrapper that passes the UDP state-name table and
 * state count to ip_vs_set_state_timeout() and returns its result.
 */
static int
udp_set_state_timeout(struct ip_vs_protocol *pp, char *sname, int to)
{
	return ip_vs_set_state_timeout(pp->timeout_table, IP_VS_UDP_S_LAST,
				       udp_state_name_table, sname, to);
}

/*
 * Return a printable name for UDP connection state @state.
 *
 * Returns "ERR!" for any state outside [0, IP_VS_UDP_S_LAST), "?" for an
 * in-range state that has no name in udp_state_name_table, and the table
 * entry otherwise.  The returned string must not be modified or freed.
 */
static const char * udp_state_name(int state)
{
	const char *name;

	/*
	 * @state is a signed int, so reject negative values as well as
	 * too-large ones before using it as an array index.
	 */
	if (state < 0 || state >= IP_VS_UDP_S_LAST)
		return "ERR!";

	name = udp_state_name_table[state];
	return name ? name : "?";
}

/*
 * State transition hook for UDP.  No per-packet state machine is run:
 * the connection timeout is simply reset to the IP_VS_UDP_S_NORMAL entry
 * of the protocol's timeout table.  @direction and @skb are not used.
 * Always returns 1.
 */
static int
udp_state_transition(struct ip_vs_conn *cp, int direction,
		     const struct sk_buff *skb,
		     struct ip_vs_protocol *pp)
{
	cp->timeout = pp->timeout_table[IP_VS_UDP_S_NORMAL];
	return 1;
}

/*
 * Protocol init hook: initialise the application helper hash table and
 * point the protocol at the default UDP timeout table.
 */
static void udp_init(struct ip_vs_protocol *pp)
{
	IP_VS_INIT_HASH_TABLE(udp_apps);
	pp->timeout_table = udp_timeouts;
}

/* Protocol exit hook: intentionally empty, nothing to tear down here. */
static void udp_exit(struct ip_vs_protocol *pp)
{
}


struct ip_vs_protocol ip_vs_protocol_udp = {
	.name =			"UDP",
	.protocol =		IPPROTO_UDP,
478
	.num_states =		IP_VS_UDP_S_LAST,
L
Linus Torvalds 已提交
479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496
	.dont_defrag =		0,
	.init =			udp_init,
	.exit =			udp_exit,
	.conn_schedule =	udp_conn_schedule,
	.conn_in_get =		udp_conn_in_get,
	.conn_out_get =		udp_conn_out_get,
	.snat_handler =		udp_snat_handler,
	.dnat_handler =		udp_dnat_handler,
	.csum_check =		udp_csum_check,
	.state_transition =	udp_state_transition,
	.state_name =		udp_state_name,
	.register_app =		udp_register_app,
	.unregister_app =	udp_unregister_app,
	.app_conn_bind =	udp_app_conn_bind,
	.debug_packet =		ip_vs_tcpudp_debug_packet,
	.timeout_change =	NULL,
	.set_state_timeout =	udp_set_state_timeout,
};