svcsock.c 43.3 KB
Newer Older
L
Linus Torvalds 已提交
1 2 3 4 5 6 7
/*
 * linux/net/sunrpc/svcsock.c
 *
 * These are the RPC server socket internals.
 *
 * The server scheduling algorithm does not always distribute the load
 * evenly when servicing a single client. May need to modify the
 * svc_xprt_enqueue procedure...
 *
 * TCP support is largely untested and may be a little slow. The problem
 * is that we currently do two separate recvfrom's, one for the 4-byte
 * record length, and the second for the actual record. This could possibly
 * be improved by always reading a minimum size of around 100 bytes and
 * tucking any superfluous bytes away in a temporary store. Still, that
 * leaves write requests out in the rain. An alternative may be to peek at
 * the first skb in the queue, and if it matches the next TCP sequence
 * number, to extract the record marker. Yuck.
 *
 * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de>
 */

22
#include <linux/kernel.h>
L
Linus Torvalds 已提交
23
#include <linux/sched.h>
24
#include <linux/module.h>
L
Linus Torvalds 已提交
25 26 27 28 29 30
#include <linux/errno.h>
#include <linux/fcntl.h>
#include <linux/net.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/udp.h>
31
#include <linux/tcp.h>
L
Linus Torvalds 已提交
32 33 34 35
#include <linux/unistd.h>
#include <linux/slab.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
36
#include <linux/file.h>
37
#include <linux/freezer.h>
L
Linus Torvalds 已提交
38 39 40
#include <net/sock.h>
#include <net/checksum.h>
#include <net/ip.h>
41
#include <net/ipv6.h>
42
#include <net/tcp.h>
43
#include <net/tcp_states.h>
L
Linus Torvalds 已提交
44 45
#include <asm/uaccess.h>
#include <asm/ioctls.h>
46
#include <trace/events/skb.h>
L
Linus Torvalds 已提交
47 48

#include <linux/sunrpc/types.h>
49
#include <linux/sunrpc/clnt.h>
L
Linus Torvalds 已提交
50
#include <linux/sunrpc/xdr.h>
51
#include <linux/sunrpc/msg_prot.h>
L
Linus Torvalds 已提交
52 53
#include <linux/sunrpc/svcsock.h>
#include <linux/sunrpc/stats.h>
54
#include <linux/sunrpc/xprt.h>
L
Linus Torvalds 已提交
55

56 57
#include "sunrpc.h"

58
#define RPCDBG_FACILITY	RPCDBG_SVCXPRT
L
Linus Torvalds 已提交
59 60 61


static struct svc_sock *svc_setup_socket(struct svc_serv *, struct socket *,
62
					 int flags);
63
static void		svc_udp_data_ready(struct sock *);
L
Linus Torvalds 已提交
64 65
static int		svc_udp_recvfrom(struct svc_rqst *);
static int		svc_udp_sendto(struct svc_rqst *);
66
static void		svc_sock_detach(struct svc_xprt *);
67
static void		svc_tcp_sock_detach(struct svc_xprt *);
68
static void		svc_sock_free(struct svc_xprt *);
L
Linus Torvalds 已提交
69

70
static struct svc_xprt *svc_create_socket(struct svc_serv *, int,
71 72
					  struct net *, struct sockaddr *,
					  int, int);
73
#if defined(CONFIG_SUNRPC_BACKCHANNEL)
74 75 76 77
static struct svc_xprt *svc_bc_create_socket(struct svc_serv *, int,
					     struct net *, struct sockaddr *,
					     int, int);
static void svc_bc_sock_free(struct svc_xprt *xprt);
78
#endif /* CONFIG_SUNRPC_BACKCHANNEL */
79

80 81 82 83
#ifdef CONFIG_DEBUG_LOCK_ALLOC
static struct lock_class_key svc_key[2];
static struct lock_class_key svc_slock_key[2];

84
static void svc_reclassify_socket(struct socket *sock)
85 86
{
	struct sock *sk = sock->sk;
87

88
	if (WARN_ON_ONCE(!sock_allow_reclassification(sk)))
89 90
		return;

91 92 93
	switch (sk->sk_family) {
	case AF_INET:
		sock_lock_init_class_and_name(sk, "slock-AF_INET-NFSD",
94 95 96
					      &svc_slock_key[0],
					      "sk_xprt.xpt_lock-AF_INET-NFSD",
					      &svc_key[0]);
97 98 99 100
		break;

	case AF_INET6:
		sock_lock_init_class_and_name(sk, "slock-AF_INET6-NFSD",
101 102 103
					      &svc_slock_key[1],
					      "sk_xprt.xpt_lock-AF_INET6-NFSD",
					      &svc_key[1]);
104 105 106 107 108 109 110
		break;

	default:
		BUG();
	}
}
#else
111
/* No-op variant used when CONFIG_DEBUG_LOCK_ALLOC is not defined. */
static void svc_reclassify_socket(struct socket *sock)
{
}
#endif

L
Linus Torvalds 已提交
116 117 118
/*
 * Release an skbuff after use
 */
119
static void svc_release_skb(struct svc_rqst *rqstp)
L
Linus Torvalds 已提交
120
{
121
	struct sk_buff *skb = rqstp->rq_xprt_ctxt;
L
Linus Torvalds 已提交
122 123

	if (skb) {
124 125
		struct svc_sock *svsk =
			container_of(rqstp->rq_xprt, struct svc_sock, sk_xprt);
126
		rqstp->rq_xprt_ctxt = NULL;
L
Linus Torvalds 已提交
127 128

		dprintk("svc: service %p, releasing skb %p\n", rqstp, skb);
129
		skb_free_datagram_locked(svsk->sk_sk, skb);
L
Linus Torvalds 已提交
130 131 132
	}
}

133 134 135 136
/*
 * Large enough to hold either an IPv4 or an IPv6 packet-info payload;
 * used only to size the control-message buffers below.
 */
union svc_pktinfo_u {
	struct in_pktinfo pkti;
	struct in6_pktinfo pkti6;
};
/* Control-message space needed for one packet-info cmsg of either family. */
#define SVC_PKTINFO_SPACE \
	CMSG_SPACE(sizeof(union svc_pktinfo_u))
139 140 141

static void svc_set_cmsg_data(struct svc_rqst *rqstp, struct cmsghdr *cmh)
{
142 143 144
	struct svc_sock *svsk =
		container_of(rqstp->rq_xprt, struct svc_sock, sk_xprt);
	switch (svsk->sk_sk->sk_family) {
145 146 147 148 149 150
	case AF_INET: {
			struct in_pktinfo *pki = CMSG_DATA(cmh);

			cmh->cmsg_level = SOL_IP;
			cmh->cmsg_type = IP_PKTINFO;
			pki->ipi_ifindex = 0;
151 152
			pki->ipi_spec_dst.s_addr =
				 svc_daddr_in(rqstp)->sin_addr.s_addr;
153 154 155
			cmh->cmsg_len = CMSG_LEN(sizeof(*pki));
		}
		break;
156

157 158
	case AF_INET6: {
			struct in6_pktinfo *pki = CMSG_DATA(cmh);
159
			struct sockaddr_in6 *daddr = svc_daddr_in6(rqstp);
160 161 162

			cmh->cmsg_level = SOL_IPV6;
			cmh->cmsg_type = IPV6_PKTINFO;
163
			pki->ipi6_ifindex = daddr->sin6_scope_id;
A
Alexey Dobriyan 已提交
164
			pki->ipi6_addr = daddr->sin6_addr;
165 166 167 168 169 170
			cmh->cmsg_len = CMSG_LEN(sizeof(*pki));
		}
		break;
	}
}

L
Linus Torvalds 已提交
171
/*
172
 * send routine intended to be shared by the fore- and back-channel
L
Linus Torvalds 已提交
173
 */
174 175 176
int svc_send_common(struct socket *sock, struct xdr_buf *xdr,
		    struct page *headpage, unsigned long headoffset,
		    struct page *tailpage, unsigned long tailoffset)
L
Linus Torvalds 已提交
177 178 179 180 181 182
{
	int		result;
	int		size;
	struct page	**ppage = xdr->pages;
	size_t		base = xdr->page_base;
	unsigned int	pglen = xdr->page_len;
183
	unsigned int	flags = MSG_MORE | MSG_SENDPAGE_NOTLAST;
184 185
	int		slen;
	int		len = 0;
L
Linus Torvalds 已提交
186 187 188 189 190 191

	slen = xdr->len;

	/* send head */
	if (slen == xdr->head[0].iov_len)
		flags = 0;
192
	len = kernel_sendpage(sock, headpage, headoffset,
193
				  xdr->head[0].iov_len, flags);
L
Linus Torvalds 已提交
194 195 196 197 198 199 200 201 202 203 204
	if (len != xdr->head[0].iov_len)
		goto out;
	slen -= xdr->head[0].iov_len;
	if (slen == 0)
		goto out;

	/* send page data */
	size = PAGE_SIZE - base < pglen ? PAGE_SIZE - base : pglen;
	while (pglen > 0) {
		if (slen == size)
			flags = 0;
205
		result = kernel_sendpage(sock, *ppage, base, size, flags);
L
Linus Torvalds 已提交
206 207 208 209 210 211 212 213 214 215
		if (result > 0)
			len += result;
		if (result != size)
			goto out;
		slen -= size;
		pglen -= size;
		size = PAGE_SIZE < pglen ? PAGE_SIZE : pglen;
		base = 0;
		ppage++;
	}
216

L
Linus Torvalds 已提交
217 218
	/* send tail */
	if (xdr->tail[0].iov_len) {
219 220
		result = kernel_sendpage(sock, tailpage, tailoffset,
				   xdr->tail[0].iov_len, 0);
L
Linus Torvalds 已提交
221 222 223
		if (result > 0)
			len += result;
	}
224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258

out:
	return len;
}


/*
 * Generic sendto routine
 */
static int svc_sendto(struct svc_rqst *rqstp, struct xdr_buf *xdr)
{
	struct svc_sock	*svsk =
		container_of(rqstp->rq_xprt, struct svc_sock, sk_xprt);
	struct socket	*sock = svsk->sk_sock;
	union {
		struct cmsghdr	hdr;
		long		all[SVC_PKTINFO_SPACE / sizeof(long)];
	} buffer;
	struct cmsghdr *cmh = &buffer.hdr;
	int		len = 0;
	unsigned long tailoff;
	unsigned long headoff;
	RPC_IFDEBUG(char buf[RPC_MAX_ADDRBUFLEN]);

	if (rqstp->rq_prot == IPPROTO_UDP) {
		struct msghdr msg = {
			.msg_name	= &rqstp->rq_addr,
			.msg_namelen	= rqstp->rq_addrlen,
			.msg_control	= cmh,
			.msg_controllen	= sizeof(buffer),
			.msg_flags	= MSG_MORE,
		};

		svc_set_cmsg_data(rqstp, cmh);

259
		if (sock_sendmsg(sock, &msg) < 0)
260 261 262 263 264 265 266 267
			goto out;
	}

	tailoff = ((unsigned long)xdr->tail[0].iov_base) & (PAGE_SIZE-1);
	headoff = 0;
	len = svc_send_common(sock, xdr, rqstp->rq_respages[0], headoff,
			       rqstp->rq_respages[0], tailoff);

L
Linus Torvalds 已提交
268
out:
269
	dprintk("svc: socket %p sendto([%p %Zu... ], %d) = %d (addr %s)\n",
270
		svsk, xdr->head[0].iov_base, xdr->head[0].iov_len,
271
		xdr->len, len, svc_print_addr(rqstp, buf, sizeof(buf)));
L
Linus Torvalds 已提交
272 273 274 275

	return len;
}

276 277 278
/*
 * Report socket names for nfsdfs
 */
279
static int svc_one_sock_name(struct svc_sock *svsk, char *buf, int remaining)
280
{
C
Chuck Lever 已提交
281 282 283
	const struct sock *sk = svsk->sk_sk;
	const char *proto_name = sk->sk_protocol == IPPROTO_UDP ?
							"udp" : "tcp";
284 285
	int len;

C
Chuck Lever 已提交
286
	switch (sk->sk_family) {
287 288
	case PF_INET:
		len = snprintf(buf, remaining, "ipv4 %s %pI4 %d\n",
C
Chuck Lever 已提交
289
				proto_name,
E
Eric Dumazet 已提交
290 291
				&inet_sk(sk)->inet_rcv_saddr,
				inet_sk(sk)->inet_num);
292
		break;
293
#if IS_ENABLED(CONFIG_IPV6)
294 295
	case PF_INET6:
		len = snprintf(buf, remaining, "ipv6 %s %pI6 %d\n",
C
Chuck Lever 已提交
296
				proto_name,
297
				&sk->sk_v6_rcv_saddr,
E
Eric Dumazet 已提交
298
				inet_sk(sk)->inet_num);
299
		break;
300
#endif
301
	default:
302
		len = snprintf(buf, remaining, "*unknown-%d*\n",
C
Chuck Lever 已提交
303
				sk->sk_family);
304
	}
305 306 307 308

	if (len >= remaining) {
		*buf = '\0';
		return -ENAMETOOLONG;
309 310 311 312
	}
	return len;
}

L
Linus Torvalds 已提交
313 314 315
/*
 * Generic recvfrom routine.
 */
316 317
static int svc_recvfrom(struct svc_rqst *rqstp, struct kvec *iov, int nr,
			int buflen)
L
Linus Torvalds 已提交
318
{
319 320
	struct svc_sock *svsk =
		container_of(rqstp->rq_xprt, struct svc_sock, sk_xprt);
321 322 323 324
	struct msghdr msg = {
		.msg_flags	= MSG_DONTWAIT,
	};
	int len;
L
Linus Torvalds 已提交
325

326 327
	rqstp->rq_xprt_hlen = 0;

328
	clear_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
329 330
	len = kernel_recvmsg(svsk->sk_sock, &msg, iov, nr, buflen,
				msg.msg_flags);
331 332 333 334 335
	/* If we read a full record, then assume there may be more
	 * data to read (stream based sockets only!)
	 */
	if (len == buflen)
		set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
L
Linus Torvalds 已提交
336 337

	dprintk("svc: socket %p recvfrom(%p, %Zu) = %d\n",
338
		svsk, iov[0].iov_base, iov[0].iov_len, len);
L
Linus Torvalds 已提交
339 340 341
	return len;
}

342 343 344 345 346
static int svc_partial_recvfrom(struct svc_rqst *rqstp,
				struct kvec *iov, int nr,
				int buflen, unsigned int base)
{
	size_t save_iovlen;
347
	void *save_iovbase;
348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368
	unsigned int i;
	int ret;

	if (base == 0)
		return svc_recvfrom(rqstp, iov, nr, buflen);

	for (i = 0; i < nr; i++) {
		if (iov[i].iov_len > base)
			break;
		base -= iov[i].iov_len;
	}
	save_iovlen = iov[i].iov_len;
	save_iovbase = iov[i].iov_base;
	iov[i].iov_len -= base;
	iov[i].iov_base += base;
	ret = svc_recvfrom(rqstp, &iov[i], nr - i, buflen);
	iov[i].iov_len = save_iovlen;
	iov[i].iov_base = save_iovbase;
	return ret;
}

L
Linus Torvalds 已提交
369 370 371
/*
 * Set socket snd and rcv buffer lengths
 */
372 373
static void svc_sock_setbufsize(struct socket *sock, unsigned int snd,
				unsigned int rcv)
L
Linus Torvalds 已提交
374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390
{
#if 0
	mm_segment_t	oldfs;
	oldfs = get_fs(); set_fs(KERNEL_DS);
	sock_setsockopt(sock, SOL_SOCKET, SO_SNDBUF,
			(char*)&snd, sizeof(snd));
	sock_setsockopt(sock, SOL_SOCKET, SO_RCVBUF,
			(char*)&rcv, sizeof(rcv));
#else
	/* sock_setsockopt limits use to sysctl_?mem_max,
	 * which isn't acceptable.  Until that is made conditional
	 * on not having CAP_SYS_RESOURCE or similar, we go direct...
	 * DaveM said I could!
	 */
	lock_sock(sock->sk);
	sock->sk->sk_sndbuf = snd * 2;
	sock->sk->sk_rcvbuf = rcv * 2;
391
	sock->sk->sk_write_space(sock->sk);
L
Linus Torvalds 已提交
392 393 394
	release_sock(sock->sk);
#endif
}
395 396 397 398 399 400

/*
 * Report whether the request's peer address uses a privileged port,
 * as decided by svc_port_is_privileged().
 */
static int svc_sock_secure_port(struct svc_rqst *rqstp)
{
	const struct sockaddr *peer = svc_addr(rqstp);

	return svc_port_is_privileged(peer);
}

401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425
/*
 * NULL-tolerant wrapper around waitqueue_active(): a missing wait queue
 * is reported as having no waiters.
 */
static bool sunrpc_waitqueue_active(wait_queue_head_t *wq)
{
	/*
	 * There should normally be a memory barrier here--see
	 * wq_has_sleeper().
	 *
	 * It appears that isn't currently necessary, though, basically
	 * because callers all appear to have sufficient memory barriers
	 * between the time the relevant change is made and the
	 * time they call these callbacks.
	 *
	 * The nfsd code itself doesn't actually explicitly wait on
	 * these waitqueues, but it may wait on them for example in
	 * sendpage() or sendmsg() calls.  (And those may be the only
	 * places, since it uses nonblocking reads.)
	 *
	 * Maybe we should add the memory barriers anyway, but these are
	 * hot paths so we'd need to be convinced there's no significant
	 * penalty.
	 */
	return wq ? waitqueue_active(wq) : false;
}

L
Linus Torvalds 已提交
426 427 428
/*
 * INET callback when data has been received on the socket.
 */
429
static void svc_udp_data_ready(struct sock *sk)
L
Linus Torvalds 已提交
430
{
431
	struct svc_sock	*svsk = (struct svc_sock *)sk->sk_user_data;
432
	wait_queue_head_t *wq = sk_sleep(sk);
L
Linus Torvalds 已提交
433

434
	if (svsk) {
435 436
		dprintk("svc: socket %p(inet %p), busy=%d\n",
			svsk, sk,
437 438
			test_bit(XPT_BUSY, &svsk->sk_xprt.xpt_flags));
		set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
439
		svc_xprt_enqueue(&svsk->sk_xprt);
440
	}
441
	if (sunrpc_waitqueue_active(wq))
442
		wake_up_interruptible(wq);
L
Linus Torvalds 已提交
443 444 445 446 447
}

/*
 * INET callback when space is newly available on the socket.
 */
448
static void svc_write_space(struct sock *sk)
L
Linus Torvalds 已提交
449 450
{
	struct svc_sock	*svsk = (struct svc_sock *)(sk->sk_user_data);
451
	wait_queue_head_t *wq = sk_sleep(sk);
L
Linus Torvalds 已提交
452 453 454

	if (svsk) {
		dprintk("svc: socket %p(inet %p), write_space busy=%d\n",
455
			svsk, sk, test_bit(XPT_BUSY, &svsk->sk_xprt.xpt_flags));
456
		svc_xprt_enqueue(&svsk->sk_xprt);
L
Linus Torvalds 已提交
457 458
	}

459
	if (sunrpc_waitqueue_active(wq)) {
460
		dprintk("RPC svc_write_space: someone sleeping on %p\n",
L
Linus Torvalds 已提交
461
		       svsk);
462
		wake_up_interruptible(wq);
L
Linus Torvalds 已提交
463 464 465
	}
}

466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482
/*
 * Decide whether a TCP transport has room to send another reply.
 *
 * Listeners always report space.  Otherwise space is available when the
 * stream write space covers the bytes already reserved on the transport
 * plus one maximum-sized message, or when nothing is reserved and the
 * socket's minimum write space is zero.
 *
 * Returns 1 if there is space; otherwise sets SOCK_NOSPACE on the
 * socket and returns 0.
 */
static int svc_tcp_has_wspace(struct svc_xprt *xprt)
{
	struct svc_sock *svsk =	container_of(xprt, struct svc_sock, sk_xprt);
	struct svc_serv *serv = svsk->sk_xprt.xpt_server;
	int needed;

	if (test_bit(XPT_LISTENER, &xprt->xpt_flags))
		return 1;

	needed = atomic_read(&xprt->xpt_reserved) + serv->sv_max_mesg;
	if (sk_stream_wspace(svsk->sk_sk) >= needed)
		return 1;
	if (sk_stream_min_wspace(svsk->sk_sk) == 0 &&
	    atomic_read(&xprt->xpt_reserved) == 0)
		return 1;

	set_bit(SOCK_NOSPACE, &svsk->sk_sock->flags);
	return 0;
}

483 484
static void svc_tcp_write_space(struct sock *sk)
{
485
	struct svc_sock *svsk = (struct svc_sock *)(sk->sk_user_data);
486 487
	struct socket *sock = sk->sk_socket;

488 489 490
	if (!sk_stream_is_writeable(sk) || !sock)
		return;
	if (!svsk || svc_tcp_has_wspace(&svsk->sk_xprt))
491 492 493 494
		clear_bit(SOCK_NOSPACE, &sock->flags);
	svc_write_space(sk);
}

495 496 497 498 499 500 501 502
/*
 * Re-evaluate write space for @xprt and clear SOCK_NOSPACE on its
 * socket when svc_tcp_has_wspace() reports that there is room.
 */
static void svc_tcp_adjust_wspace(struct svc_xprt *xprt)
{
	struct svc_sock *svsk;

	if (!svc_tcp_has_wspace(xprt))
		return;

	svsk = container_of(xprt, struct svc_sock, sk_xprt);
	clear_bit(SOCK_NOSPACE, &svsk->sk_sock->flags);
}

503 504 505 506 507 508 509
/*
 * See net/ipv6/ip_sockglue.c : ip_cmsg_recv_pktinfo
 */
static int svc_udp_get_dest_address4(struct svc_rqst *rqstp,
				     struct cmsghdr *cmh)
{
	struct in_pktinfo *pki = CMSG_DATA(cmh);
510 511
	struct sockaddr_in *daddr = svc_daddr_in(rqstp);

512 513
	if (cmh->cmsg_type != IP_PKTINFO)
		return 0;
514 515 516

	daddr->sin_family = AF_INET;
	daddr->sin_addr.s_addr = pki->ipi_spec_dst.s_addr;
517 518 519 520
	return 1;
}

/*
521
 * See net/ipv6/datagram.c : ip6_datagram_recv_ctl
522 523 524 525 526
 */
static int svc_udp_get_dest_address6(struct svc_rqst *rqstp,
				     struct cmsghdr *cmh)
{
	struct in6_pktinfo *pki = CMSG_DATA(cmh);
527 528
	struct sockaddr_in6 *daddr = svc_daddr_in6(rqstp);

529 530
	if (cmh->cmsg_type != IPV6_PKTINFO)
		return 0;
531 532

	daddr->sin6_family = AF_INET6;
A
Alexey Dobriyan 已提交
533
	daddr->sin6_addr = pki->ipi6_addr;
534
	daddr->sin6_scope_id = pki->ipi6_ifindex;
535 536 537
	return 1;
}

538 539 540 541 542 543 544
/*
 * Copy the UDP datagram's destination address to the rqstp structure.
 * The 'destination' address in this case is the address to which the
 * peer sent the datagram, i.e. our local address. For multihomed
 * hosts, this can change from msg to msg. Note that only the IP
 * address changes, the port number should remain the same.
 */
545 546
static int svc_udp_get_dest_address(struct svc_rqst *rqstp,
				    struct cmsghdr *cmh)
547
{
548 549 550 551 552
	switch (cmh->cmsg_level) {
	case SOL_IP:
		return svc_udp_get_dest_address4(rqstp, cmh);
	case SOL_IPV6:
		return svc_udp_get_dest_address6(rqstp, cmh);
553
	}
554 555

	return 0;
556 557
}

L
Linus Torvalds 已提交
558 559 560
/*
 * Receive a datagram from a UDP socket.
 */
561
static int svc_udp_recvfrom(struct svc_rqst *rqstp)
L
Linus Torvalds 已提交
562
{
563 564
	struct svc_sock	*svsk =
		container_of(rqstp->rq_xprt, struct svc_sock, sk_xprt);
565
	struct svc_serv	*serv = svsk->sk_xprt.xpt_server;
L
Linus Torvalds 已提交
566
	struct sk_buff	*skb;
567 568 569 570 571
	union {
		struct cmsghdr	hdr;
		long		all[SVC_PKTINFO_SPACE / sizeof(long)];
	} buffer;
	struct cmsghdr *cmh = &buffer.hdr;
572 573 574 575 576 577
	struct msghdr msg = {
		.msg_name = svc_addr(rqstp),
		.msg_control = cmh,
		.msg_controllen = sizeof(buffer),
		.msg_flags = MSG_DONTWAIT,
	};
578 579
	size_t len;
	int err;
L
Linus Torvalds 已提交
580

581
	if (test_and_clear_bit(XPT_CHNGBUF, &svsk->sk_xprt.xpt_flags))
L
Linus Torvalds 已提交
582 583 584
	    /* udp sockets need large rcvbuf as all pending
	     * requests are still in that buffer.  sndbuf must
	     * also be large enough that there is enough space
585 586 587 588
	     * for one reply per thread.  We count all threads
	     * rather than threads in a particular pool, which
	     * provides an upper bound on the number of threads
	     * which will access the socket.
L
Linus Torvalds 已提交
589 590
	     */
	    svc_sock_setbufsize(svsk->sk_sock,
591 592
				(serv->sv_nrthreads+3) * serv->sv_max_mesg,
				(serv->sv_nrthreads+3) * serv->sv_max_mesg);
L
Linus Torvalds 已提交
593

594
	clear_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
595 596 597 598 599 600 601 602 603 604
	skb = NULL;
	err = kernel_recvmsg(svsk->sk_sock, &msg, NULL,
			     0, 0, MSG_PEEK | MSG_DONTWAIT);
	if (err >= 0)
		skb = skb_recv_datagram(svsk->sk_sk, 0, 1, &err);

	if (skb == NULL) {
		if (err != -EAGAIN) {
			/* possibly an icmp error */
			dprintk("svc: recvfrom returned error %d\n", -err);
605
			set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
L
Linus Torvalds 已提交
606
		}
607
		return 0;
L
Linus Torvalds 已提交
608
	}
609 610
	len = svc_addr_len(svc_addr(rqstp));
	rqstp->rq_addrlen = len;
611 612
	if (skb->tstamp.tv64 == 0) {
		skb->tstamp = ktime_get_real();
613
		/* Don't enable netstamp, sunrpc doesn't
L
Linus Torvalds 已提交
614 615
		   need that much accuracy */
	}
616
	svsk->sk_sk->sk_stamp = skb->tstamp;
617
	set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); /* there may be more data... */
L
Linus Torvalds 已提交
618

619
	len  = skb->len;
L
Linus Torvalds 已提交
620 621
	rqstp->rq_arg.len = len;

622
	rqstp->rq_prot = IPPROTO_UDP;
623

624
	if (!svc_udp_get_dest_address(rqstp, cmh)) {
625 626
		net_warn_ratelimited("svc: received unknown control message %d/%d; dropping RPC reply datagram\n",
				     cmh->cmsg_level, cmh->cmsg_type);
J
J. Bruce Fields 已提交
627
		goto out_free;
628
	}
629
	rqstp->rq_daddrlen = svc_addr_len(svc_daddr(rqstp));
L
Linus Torvalds 已提交
630 631 632 633 634 635 636

	if (skb_is_nonlinear(skb)) {
		/* we have to copy */
		local_bh_disable();
		if (csum_partial_copy_to_xdr(&rqstp->rq_arg, skb)) {
			local_bh_enable();
			/* checksum error */
637
			goto out_free;
L
Linus Torvalds 已提交
638 639
		}
		local_bh_enable();
640
		skb_free_datagram_locked(svsk->sk_sk, skb);
L
Linus Torvalds 已提交
641 642
	} else {
		/* we can use it in-place */
643
		rqstp->rq_arg.head[0].iov_base = skb->data;
L
Linus Torvalds 已提交
644
		rqstp->rq_arg.head[0].iov_len = len;
645 646
		if (skb_checksum_complete(skb))
			goto out_free;
647
		rqstp->rq_xprt_ctxt = skb;
L
Linus Torvalds 已提交
648 649 650 651 652 653
	}

	rqstp->rq_arg.page_base = 0;
	if (len <= rqstp->rq_arg.head[0].iov_len) {
		rqstp->rq_arg.head[0].iov_len = len;
		rqstp->rq_arg.page_len = 0;
654
		rqstp->rq_respages = rqstp->rq_pages+1;
L
Linus Torvalds 已提交
655 656
	} else {
		rqstp->rq_arg.page_len = len - rqstp->rq_arg.head[0].iov_len;
657
		rqstp->rq_respages = rqstp->rq_pages + 1 +
658
			DIV_ROUND_UP(rqstp->rq_arg.page_len, PAGE_SIZE);
L
Linus Torvalds 已提交
659
	}
660
	rqstp->rq_next_page = rqstp->rq_respages+1;
L
Linus Torvalds 已提交
661 662 663 664 665

	if (serv->sv_stats)
		serv->sv_stats->netudpcnt++;

	return len;
J
J. Bruce Fields 已提交
666 667 668 669
out_free:
	trace_kfree_skb(skb, svc_udp_recvfrom);
	skb_free_datagram_locked(svsk->sk_sk, skb);
	return 0;
L
Linus Torvalds 已提交
670 671 672 673 674 675 676 677 678 679 680 681 682 683 684
}

/*
 * Send the reply held in rqstp->rq_res over UDP.
 *
 * A result of -ECONNREFUSED is taken to be an ICMP error left over
 * from an earlier request, so the send is attempted exactly once more
 * and the result of that second attempt is returned.
 */
static int
svc_udp_sendto(struct svc_rqst *rqstp)
{
	struct xdr_buf	*reply = &rqstp->rq_res;
	int		sent = svc_sendto(rqstp, reply);

	if (sent != -ECONNREFUSED)
		return sent;

	/* ICMP error on earlier request. */
	return svc_sendto(rqstp, reply);
}

T
Tom Tucker 已提交
685 686 687 688
/* Reply-header preparation hook for UDP: intentionally does nothing. */
static void svc_udp_prep_reply_hdr(struct svc_rqst *rqstp)
{
}

689 690 691
static int svc_udp_has_wspace(struct svc_xprt *xprt)
{
	struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt);
692
	struct svc_serv	*serv = xprt->xpt_server;
693 694 695 696 697 698 699
	unsigned long required;

	/*
	 * Set the SOCK_NOSPACE flag before checking the available
	 * sock space.
	 */
	set_bit(SOCK_NOSPACE, &svsk->sk_sock->flags);
T
Tom Tucker 已提交
700
	required = atomic_read(&svsk->sk_xprt.xpt_reserved) + serv->sv_max_mesg;
701 702 703 704 705 706
	if (required*2 > sock_wspace(svsk->sk_sk))
		return 0;
	clear_bit(SOCK_NOSPACE, &svsk->sk_sock->flags);
	return 1;
}

707 708 709 710 711 712
/*
 * Accept hook for UDP transports.  Calling it is treated as a bug:
 * the function hits BUG() unconditionally.
 */
static struct svc_xprt *svc_udp_accept(struct svc_xprt *xprt)
{
	BUG();
	return NULL;
}

713
static struct svc_xprt *svc_udp_create(struct svc_serv *serv,
714
				       struct net *net,
715 716 717
				       struct sockaddr *sa, int salen,
				       int flags)
{
718
	return svc_create_socket(serv, IPPROTO_UDP, net, sa, salen, flags);
719 720
}

721
static struct svc_xprt_ops svc_udp_ops = {
722
	.xpo_create = svc_udp_create,
723 724
	.xpo_recvfrom = svc_udp_recvfrom,
	.xpo_sendto = svc_udp_sendto,
725
	.xpo_release_rqst = svc_release_skb,
726 727
	.xpo_detach = svc_sock_detach,
	.xpo_free = svc_sock_free,
T
Tom Tucker 已提交
728
	.xpo_prep_reply_hdr = svc_udp_prep_reply_hdr,
729
	.xpo_has_wspace = svc_udp_has_wspace,
730
	.xpo_accept = svc_udp_accept,
731
	.xpo_secure_port = svc_sock_secure_port,
732 733 734 735
};

static struct svc_xprt_class svc_udp_class = {
	.xcl_name = "udp",
736
	.xcl_owner = THIS_MODULE,
737
	.xcl_ops = &svc_udp_ops,
738
	.xcl_max_payload = RPCSVC_MAXPAYLOAD_UDP,
739
	.xcl_ident = XPRT_TRANSPORT_UDP,
740 741
};

742
static void svc_udp_init(struct svc_sock *svsk, struct svc_serv *serv)
L
Linus Torvalds 已提交
743
{
744
	int err, level, optname, one = 1;
745

746 747
	svc_xprt_init(sock_net(svsk->sk_sock->sk), &svc_udp_class,
		      &svsk->sk_xprt, serv);
748
	clear_bit(XPT_CACHE_AUTH, &svsk->sk_xprt.xpt_flags);
L
Linus Torvalds 已提交
749 750 751 752
	svsk->sk_sk->sk_data_ready = svc_udp_data_ready;
	svsk->sk_sk->sk_write_space = svc_write_space;

	/* initialise setting must have enough space to
753
	 * receive and respond to one request.
L
Linus Torvalds 已提交
754 755 756
	 * svc_udp_recvfrom will re-adjust if necessary
	 */
	svc_sock_setbufsize(svsk->sk_sock,
757 758
			    3 * svsk->sk_xprt.xpt_server->sv_max_mesg,
			    3 * svsk->sk_xprt.xpt_server->sv_max_mesg);
L
Linus Torvalds 已提交
759

760 761
	/* data might have come in before data_ready set up */
	set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
762
	set_bit(XPT_CHNGBUF, &svsk->sk_xprt.xpt_flags);
763 764

	/* make sure we get destination address info */
765 766 767 768 769 770 771 772 773 774 775 776 777 778 779
	switch (svsk->sk_sk->sk_family) {
	case AF_INET:
		level = SOL_IP;
		optname = IP_PKTINFO;
		break;
	case AF_INET6:
		level = SOL_IPV6;
		optname = IPV6_RECVPKTINFO;
		break;
	default:
		BUG();
	}
	err = kernel_setsockopt(svsk->sk_sock, level, optname,
					(char *)&one, sizeof(one));
	dprintk("svc: kernel_setsockopt returned %d\n", err);
L
Linus Torvalds 已提交
780 781 782 783 784 785
}

/*
 * A data_ready event on a listening socket means there's a connection
 * pending. Do not use state_change as a substitute for it.
 */
786
static void svc_tcp_listen_data_ready(struct sock *sk)
L
Linus Torvalds 已提交
787
{
788
	struct svc_sock	*svsk = (struct svc_sock *)sk->sk_user_data;
789
	wait_queue_head_t *wq;
L
Linus Torvalds 已提交
790 791

	dprintk("svc: socket %p TCP (listen) state change %d\n",
792
		sk, sk->sk_state);
L
Linus Torvalds 已提交
793

794 795 796 797 798 799 800 801 802 803 804 805
	/*
	 * This callback may called twice when a new connection
	 * is established as a child socket inherits everything
	 * from a parent LISTEN socket.
	 * 1) data_ready method of the parent socket will be called
	 *    when one of child sockets become ESTABLISHED.
	 * 2) data_ready method of the child socket may be called
	 *    when it receives data before the socket is accepted.
	 * In case of 2, we should ignore it silently.
	 */
	if (sk->sk_state == TCP_LISTEN) {
		if (svsk) {
806
			set_bit(XPT_CONN, &svsk->sk_xprt.xpt_flags);
807
			svc_xprt_enqueue(&svsk->sk_xprt);
808 809
		} else
			printk("svc: socket %p: no user data\n", sk);
L
Linus Torvalds 已提交
810
	}
811

812
	wq = sk_sleep(sk);
813
	if (sunrpc_waitqueue_active(wq))
814
		wake_up_interruptible_all(wq);
L
Linus Torvalds 已提交
815 816 817 818 819
}

/*
 * A state change on a connected socket means it's dying or dead.
 */
820
static void svc_tcp_state_change(struct sock *sk)
L
Linus Torvalds 已提交
821
{
822
	struct svc_sock	*svsk = (struct svc_sock *)sk->sk_user_data;
823
	wait_queue_head_t *wq = sk_sleep(sk);
L
Linus Torvalds 已提交
824 825

	dprintk("svc: socket %p TCP (connected) state change %d (svsk %p)\n",
826
		sk, sk->sk_state, sk->sk_user_data);
L
Linus Torvalds 已提交
827

828
	if (!svsk)
L
Linus Torvalds 已提交
829
		printk("svc: socket %p: no user data\n", sk);
830
	else {
831
		set_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags);
832
		svc_xprt_enqueue(&svsk->sk_xprt);
L
Linus Torvalds 已提交
833
	}
834
	if (sunrpc_waitqueue_active(wq))
835
		wake_up_interruptible_all(wq);
L
Linus Torvalds 已提交
836 837
}

838
static void svc_tcp_data_ready(struct sock *sk)
L
Linus Torvalds 已提交
839
{
840
	struct svc_sock *svsk = (struct svc_sock *)sk->sk_user_data;
841
	wait_queue_head_t *wq = sk_sleep(sk);
L
Linus Torvalds 已提交
842 843

	dprintk("svc: socket %p TCP data ready (svsk %p)\n",
844 845
		sk, sk->sk_user_data);
	if (svsk) {
846
		set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
847
		svc_xprt_enqueue(&svsk->sk_xprt);
848
	}
849
	if (sunrpc_waitqueue_active(wq))
850
		wake_up_interruptible(wq);
L
Linus Torvalds 已提交
851 852 853 854 855
}

/*
 * Accept a TCP connection
 */
856
static struct svc_xprt *svc_tcp_accept(struct svc_xprt *xprt)
L
Linus Torvalds 已提交
857
{
858
	struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt);
859 860
	struct sockaddr_storage addr;
	struct sockaddr	*sin = (struct sockaddr *) &addr;
861
	struct svc_serv	*serv = svsk->sk_xprt.xpt_server;
L
Linus Torvalds 已提交
862 863 864 865
	struct socket	*sock = svsk->sk_sock;
	struct socket	*newsock;
	struct svc_sock	*newsvsk;
	int		err, slen;
866
	RPC_IFDEBUG(char buf[RPC_MAX_ADDRBUFLEN]);
L
Linus Torvalds 已提交
867 868 869

	dprintk("svc: tcp_accept %p sock %p\n", svsk, sock);
	if (!sock)
870
		return NULL;
L
Linus Torvalds 已提交
871

872
	clear_bit(XPT_CONN, &svsk->sk_xprt.xpt_flags);
873 874
	err = kernel_accept(sock, &newsock, O_NONBLOCK);
	if (err < 0) {
L
Linus Torvalds 已提交
875 876 877
		if (err == -ENOMEM)
			printk(KERN_WARNING "%s: no more sockets!\n",
			       serv->sv_name);
878 879 880
		else if (err != -EAGAIN)
			net_warn_ratelimited("%s: accept failed (err %d)!\n",
					     serv->sv_name, -err);
881
		return NULL;
L
Linus Torvalds 已提交
882
	}
883
	set_bit(XPT_CONN, &svsk->sk_xprt.xpt_flags);
L
Linus Torvalds 已提交
884

885
	err = kernel_getpeername(newsock, sin, &slen);
L
Linus Torvalds 已提交
886
	if (err < 0) {
887 888
		net_warn_ratelimited("%s: peername failed (err %d)!\n",
				     serv->sv_name, -err);
L
Linus Torvalds 已提交
889 890 891 892
		goto failed;		/* aborted connection or whatever */
	}

	/* Ideally, we would want to reject connections from unauthorized
893 894
	 * hosts here, but when we get encryption, the IP of the host won't
	 * tell us anything.  For now just warn about unpriv connections.
L
Linus Torvalds 已提交
895
	 */
896
	if (!svc_port_is_privileged(sin)) {
897
		dprintk("%s: connect from unprivileged port: %s\n",
898
			serv->sv_name,
899
			__svc_print_addr(sin, buf, sizeof(buf)));
L
Linus Torvalds 已提交
900
	}
901
	dprintk("%s: connect from %s\n", serv->sv_name,
902
		__svc_print_addr(sin, buf, sizeof(buf)));
L
Linus Torvalds 已提交
903 904 905 906 907 908

	/* make sure that a write doesn't block forever when
	 * low on memory
	 */
	newsock->sk->sk_sndtimeo = HZ*30;

909 910 911
	newsvsk = svc_setup_socket(serv, newsock,
				 (SVC_SOCK_ANONYMOUS | SVC_SOCK_TEMPORARY));
	if (IS_ERR(newsvsk))
L
Linus Torvalds 已提交
912
		goto failed;
913
	svc_xprt_set_remote(&newsvsk->sk_xprt, sin, slen);
914 915 916 917 918
	err = kernel_getsockname(newsock, sin, &slen);
	if (unlikely(err < 0)) {
		dprintk("svc_tcp_accept: kernel_getsockname error %d\n", -err);
		slen = offsetof(struct sockaddr, sa_data);
	}
919
	svc_xprt_set_local(&newsvsk->sk_xprt, sin, slen);
920

921 922 923 924
	if (sock_is_loopback(newsock->sk))
		set_bit(XPT_LOCAL, &newsvsk->sk_xprt.xpt_flags);
	else
		clear_bit(XPT_LOCAL, &newsvsk->sk_xprt.xpt_flags);
925 926 927 928 929 930 931 932 933 934
	if (serv->sv_stats)
		serv->sv_stats->nettcpconn++;

	return &newsvsk->sk_xprt;

failed:
	sock_release(newsock);
	return NULL;
}

935 936 937 938
static unsigned int svc_tcp_restore_pages(struct svc_sock *svsk, struct svc_rqst *rqstp)
{
	unsigned int i, len, npages;

939
	if (svsk->sk_datalen == 0)
940
		return 0;
941
	len = svsk->sk_datalen;
942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957
	npages = (len + PAGE_SIZE - 1) >> PAGE_SHIFT;
	for (i = 0; i < npages; i++) {
		if (rqstp->rq_pages[i] != NULL)
			put_page(rqstp->rq_pages[i]);
		BUG_ON(svsk->sk_pages[i] == NULL);
		rqstp->rq_pages[i] = svsk->sk_pages[i];
		svsk->sk_pages[i] = NULL;
	}
	rqstp->rq_arg.head[0].iov_base = page_address(rqstp->rq_pages[0]);
	return len;
}

static void svc_tcp_save_pages(struct svc_sock *svsk, struct svc_rqst *rqstp)
{
	unsigned int i, len, npages;

958
	if (svsk->sk_datalen == 0)
959
		return;
960
	len = svsk->sk_datalen;
961 962 963 964 965 966 967 968 969 970 971
	npages = (len + PAGE_SIZE - 1) >> PAGE_SHIFT;
	for (i = 0; i < npages; i++) {
		svsk->sk_pages[i] = rqstp->rq_pages[i];
		rqstp->rq_pages[i] = NULL;
	}
}

static void svc_tcp_clear_pages(struct svc_sock *svsk)
{
	unsigned int i, len, npages;

972
	if (svsk->sk_datalen == 0)
973
		goto out;
974
	len = svsk->sk_datalen;
975 976
	npages = (len + PAGE_SIZE - 1) >> PAGE_SHIFT;
	for (i = 0; i < npages; i++) {
977 978 979 980
		if (svsk->sk_pages[i] == NULL) {
			WARN_ON_ONCE(1);
			continue;
		}
981 982 983 984 985
		put_page(svsk->sk_pages[i]);
		svsk->sk_pages[i] = NULL;
	}
out:
	svsk->sk_tcplen = 0;
986
	svsk->sk_datalen = 0;
987 988
}

L
Linus Torvalds 已提交
989
/*
990
 * Receive fragment record header.
991
 * If we haven't gotten the record length yet, get the next four bytes.
L
Linus Torvalds 已提交
992
 */
993
static int svc_tcp_recv_record(struct svc_sock *svsk, struct svc_rqst *rqstp)
L
Linus Torvalds 已提交
994
{
995
	struct svc_serv	*serv = svsk->sk_xprt.xpt_server;
996
	unsigned int want;
997
	int len;
L
Linus Torvalds 已提交
998

999
	if (svsk->sk_tcplen < sizeof(rpc_fraghdr)) {
L
Linus Torvalds 已提交
1000 1001
		struct kvec	iov;

1002
		want = sizeof(rpc_fraghdr) - svsk->sk_tcplen;
L
Linus Torvalds 已提交
1003 1004 1005 1006 1007 1008 1009
		iov.iov_base = ((char *) &svsk->sk_reclen) + svsk->sk_tcplen;
		iov.iov_len  = want;
		if ((len = svc_recvfrom(rqstp, &iov, 1, want)) < 0)
			goto error;
		svsk->sk_tcplen += len;

		if (len < want) {
1010 1011
			dprintk("svc: short recvfrom while reading record "
				"length (%d of %d)\n", len, want);
1012
			return -EAGAIN;
L
Linus Torvalds 已提交
1013 1014
		}

1015
		dprintk("svc: TCP record, %d bytes\n", svc_sock_reclen(svsk));
1016 1017
		if (svc_sock_reclen(svsk) + svsk->sk_datalen >
							serv->sv_max_mesg) {
J
J. Bruce Fields 已提交
1018 1019
			net_notice_ratelimited("RPC: fragment too large: %d\n",
					svc_sock_reclen(svsk));
L
Linus Torvalds 已提交
1020 1021 1022 1023
			goto err_delete;
		}
	}

1024
	return svc_sock_reclen(svsk);
1025 1026
error:
	dprintk("RPC: TCP recv_record got %d\n", len);
1027
	return len;
1028
err_delete:
1029 1030 1031 1032
	set_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags);
	return -EAGAIN;
}

1033
static int receive_cb_reply(struct svc_sock *svsk, struct svc_rqst *rqstp)
1034
{
1035
	struct rpc_xprt *bc_xprt = svsk->sk_xprt.xpt_bc_xprt;
1036
	struct rpc_rqst *req = NULL;
1037 1038
	struct kvec *src, *dst;
	__be32 *p = (__be32 *)rqstp->rq_arg.head[0].iov_base;
1039 1040
	__be32 xid;
	__be32 calldir;
1041 1042 1043 1044

	xid = *p++;
	calldir = *p;

1045
	if (!bc_xprt)
1046
		return -EAGAIN;
1047 1048 1049 1050
	spin_lock_bh(&bc_xprt->transport_lock);
	req = xprt_lookup_rqst(bc_xprt, xid);
	if (!req)
		goto unlock_notfound;
1051 1052 1053 1054 1055 1056 1057 1058 1059 1060

	memcpy(&req->rq_private_buf, &req->rq_rcv_buf, sizeof(struct xdr_buf));
	/*
	 * XXX!: cheating for now!  Only copying HEAD.
	 * But we know this is good enough for now (in fact, for any
	 * callback reply in the forseeable future).
	 */
	dst = &req->rq_private_buf.head[0];
	src = &rqstp->rq_arg.head[0];
	if (dst->iov_len < src->iov_len)
1061
		goto unlock_eagain; /* whatever; just giving up. */
1062
	memcpy(dst->iov_base, src->iov_base, src->iov_len);
1063
	xprt_complete_rqst(req->rq_task, rqstp->rq_arg.len);
1064
	rqstp->rq_arg.len = 0;
1065
	spin_unlock_bh(&bc_xprt->transport_lock);
1066
	return 0;
1067 1068 1069 1070 1071 1072 1073 1074 1075
unlock_notfound:
	printk(KERN_NOTICE
		"%s: Got unrecognized reply: "
		"calldir 0x%x xpt_bc_xprt %p xid %08x\n",
		__func__, ntohl(calldir),
		bc_xprt, ntohl(xid));
unlock_eagain:
	spin_unlock_bh(&bc_xprt->transport_lock);
	return -EAGAIN;
1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089
}

/*
 * Describe enough whole pages to cover @len bytes as kvecs: entry i of
 * @vec points at the address of pages[i] and spans PAGE_SIZE bytes.
 *
 * Returns the number of kvec entries filled in (0 when @len <= 0).
 */
static int copy_pages_to_kvecs(struct kvec *vec, struct page **pages, int len)
{
	int filled;
	int covered;

	for (filled = 0, covered = 0; covered < len;
	     filled++, covered += PAGE_SIZE) {
		vec[filled].iov_base = page_address(pages[filled]);
		vec[filled].iov_len = PAGE_SIZE;
	}

	return filled;
}

1092 1093 1094 1095 1096 1097 1098 1099 1100
/*
 * A whole fragment has been received: log it and clear the
 * per-fragment state, so that sk_tcplen is again smaller than the
 * record marker and svc_tcp_recv_record() will read a new marker.
 */
static void svc_tcp_fragment_received(struct svc_sock *svsk)
{
	/* Log before resetting: the message reads the current marker. */
	dprintk("svc: TCP %s record (%d bytes)\n",
		svc_sock_final_rec(svsk) ? "final" : "nonfinal",
		svc_sock_reclen(svsk));

	svsk->sk_reclen = 0;
	svsk->sk_tcplen = 0;
}
1101

1102 1103 1104 1105 1106 1107 1108 1109 1110 1111
/*
 * Receive data from a TCP socket.
 */
static int svc_tcp_recvfrom(struct svc_rqst *rqstp)
{
	struct svc_sock	*svsk =
		container_of(rqstp->rq_xprt, struct svc_sock, sk_xprt);
	struct svc_serv	*serv = svsk->sk_xprt.xpt_server;
	int		len;
	struct kvec *vec;
1112
	unsigned int want, base;
1113 1114
	__be32 *p;
	__be32 calldir;
1115
	int pnum;
1116 1117 1118 1119 1120 1121 1122 1123 1124 1125

	dprintk("svc: tcp_recv %p data %d conn %d close %d\n",
		svsk, test_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags),
		test_bit(XPT_CONN, &svsk->sk_xprt.xpt_flags),
		test_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags));

	len = svc_tcp_recv_record(svsk, rqstp);
	if (len < 0)
		goto error;

1126
	base = svc_tcp_restore_pages(svsk, rqstp);
1127
	want = svc_sock_reclen(svsk) - (svsk->sk_tcplen - sizeof(rpc_fraghdr));
1128

1129
	vec = rqstp->rq_vec;
1130

1131
	pnum = copy_pages_to_kvecs(&vec[0], &rqstp->rq_pages[0],
1132
						svsk->sk_datalen + want);
1133

1134
	rqstp->rq_respages = &rqstp->rq_pages[pnum];
1135
	rqstp->rq_next_page = rqstp->rq_respages + 1;
L
Linus Torvalds 已提交
1136 1137

	/* Now receive data */
1138
	len = svc_partial_recvfrom(rqstp, vec, pnum, want, base);
1139
	if (len >= 0) {
1140
		svsk->sk_tcplen += len;
1141 1142
		svsk->sk_datalen += len;
	}
1143
	if (len != want || !svc_sock_final_rec(svsk)) {
1144
		svc_tcp_save_pages(svsk, rqstp);
1145
		if (len < 0 && len != -EAGAIN)
1146
			goto err_delete;
1147 1148 1149
		if (len == want)
			svc_tcp_fragment_received(svsk);
		else
J
J. Bruce Fields 已提交
1150 1151
			dprintk("svc: incomplete TCP record (%d of %d)\n",
				(int)(svsk->sk_tcplen - sizeof(rpc_fraghdr)),
1152
				svc_sock_reclen(svsk));
1153 1154
		goto err_noclose;
	}
L
Linus Torvalds 已提交
1155

1156
	if (svsk->sk_datalen < 8) {
1157
		svsk->sk_datalen = 0;
1158
		goto err_delete; /* client is nuts. */
1159
	}
1160

1161
	rqstp->rq_arg.len = svsk->sk_datalen;
L
Linus Torvalds 已提交
1162
	rqstp->rq_arg.page_base = 0;
1163 1164
	if (rqstp->rq_arg.len <= rqstp->rq_arg.head[0].iov_len) {
		rqstp->rq_arg.head[0].iov_len = rqstp->rq_arg.len;
L
Linus Torvalds 已提交
1165
		rqstp->rq_arg.page_len = 0;
1166 1167
	} else
		rqstp->rq_arg.page_len = rqstp->rq_arg.len - rqstp->rq_arg.head[0].iov_len;
L
Linus Torvalds 已提交
1168

1169
	rqstp->rq_xprt_ctxt   = NULL;
L
Linus Torvalds 已提交
1170
	rqstp->rq_prot	      = IPPROTO_TCP;
1171 1172 1173 1174
	if (test_bit(XPT_LOCAL, &svsk->sk_xprt.xpt_flags))
		set_bit(RQ_LOCAL, &rqstp->rq_flags);
	else
		clear_bit(RQ_LOCAL, &rqstp->rq_flags);
L
Linus Torvalds 已提交
1175

1176 1177
	p = (__be32 *)rqstp->rq_arg.head[0].iov_base;
	calldir = p[1];
1178
	if (calldir)
1179 1180
		len = receive_cb_reply(svsk, rqstp);

L
Linus Torvalds 已提交
1181
	/* Reset TCP read info */
1182
	svsk->sk_datalen = 0;
1183
	svc_tcp_fragment_received(svsk);
1184

1185 1186
	if (len < 0)
		goto error;
L
Linus Torvalds 已提交
1187

1188
	svc_xprt_copy_addrs(rqstp, &svsk->sk_xprt);
L
Linus Torvalds 已提交
1189 1190 1191
	if (serv->sv_stats)
		serv->sv_stats->nettcpcnt++;

1192
	return rqstp->rq_arg.len;
L
Linus Torvalds 已提交
1193

1194
error:
1195
	if (len != -EAGAIN)
1196
		goto err_delete;
1197
	dprintk("RPC: TCP recvfrom got EAGAIN\n");
1198
	return 0;
1199
err_delete:
1200 1201 1202 1203
	printk(KERN_NOTICE "%s: recvfrom returned errno %d\n",
	       svsk->sk_xprt.xpt_server->sv_name, -len);
	set_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags);
err_noclose:
1204
	return 0;	/* record not complete */
L
Linus Torvalds 已提交
1205 1206 1207 1208 1209
}

/*
 * Send out data on TCP socket.
 */
1210
static int svc_tcp_sendto(struct svc_rqst *rqstp)
L
Linus Torvalds 已提交
1211 1212 1213
{
	struct xdr_buf	*xbufp = &rqstp->rq_res;
	int sent;
1214
	__be32 reclen;
L
Linus Torvalds 已提交
1215 1216 1217 1218 1219 1220 1221 1222 1223 1224

	/* Set up the first element of the reply kvec.
	 * Any other kvecs that may be in use have been taken
	 * care of by the server implementation itself.
	 */
	reclen = htonl(0x80000000|((xbufp->len ) - 4));
	memcpy(xbufp->head[0].iov_base, &reclen, 4);

	sent = svc_sendto(rqstp, &rqstp->rq_res);
	if (sent != xbufp->len) {
1225 1226 1227
		printk(KERN_NOTICE
		       "rpc-srv/tcp: %s: %s %d when sending %d bytes "
		       "- shutting down socket\n",
1228
		       rqstp->rq_xprt->xpt_server->sv_name,
L
Linus Torvalds 已提交
1229 1230
		       (sent<0)?"got error":"sent only",
		       sent, xbufp->len);
1231
		set_bit(XPT_CLOSE, &rqstp->rq_xprt->xpt_flags);
1232
		svc_xprt_enqueue(rqstp->rq_xprt);
L
Linus Torvalds 已提交
1233 1234 1235 1236 1237
		sent = -EAGAIN;
	}
	return sent;
}

T
Tom Tucker 已提交
1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248
/*
 * Set up the response header.  TCP replies start with a 4-byte record
 * length field; a zero placeholder is put into the reply head here and
 * svc_tcp_sendto() overwrites it with the real marker.
 */
static void svc_tcp_prep_reply_hdr(struct svc_rqst *rqstp)
{
	/* tcp needs a space for the record length... */
	svc_putnl(&rqstp->rq_res.head[0], 0);
}

1249
static struct svc_xprt *svc_tcp_create(struct svc_serv *serv,
1250
				       struct net *net,
1251 1252 1253
				       struct sockaddr *sa, int salen,
				       int flags)
{
1254
	return svc_create_socket(serv, IPPROTO_TCP, net, sa, salen, flags);
1255 1256
}

1257
#if defined(CONFIG_SUNRPC_BACKCHANNEL)
1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279
static struct svc_xprt *svc_bc_create_socket(struct svc_serv *, int,
					     struct net *, struct sockaddr *,
					     int, int);
static void svc_bc_sock_free(struct svc_xprt *xprt);

/*
 * xpo_create method for the "tcp-bc" transport class: build a
 * backchannel transport via svc_bc_create_socket() with the protocol
 * fixed to IPPROTO_TCP.
 */
static struct svc_xprt *svc_bc_tcp_create(struct svc_serv *serv,
					  struct net *net,
					  struct sockaddr *sa, int salen,
					  int flags)
{
	const int protocol = IPPROTO_TCP;

	return svc_bc_create_socket(serv, protocol, net, sa, salen, flags);
}

/*
 * xpo_detach method for the "tcp-bc" transport class.  Deliberately
 * empty: this function performs no work on detach.
 */
static void svc_bc_tcp_sock_detach(struct svc_xprt *xprt)
{
}

static struct svc_xprt_ops svc_tcp_bc_ops = {
	.xpo_create = svc_bc_tcp_create,
	.xpo_detach = svc_bc_tcp_sock_detach,
	.xpo_free = svc_bc_sock_free,
	.xpo_prep_reply_hdr = svc_tcp_prep_reply_hdr,
1280
	.xpo_secure_port = svc_sock_secure_port,
1281 1282 1283 1284 1285 1286 1287 1288
};

/* Transport class registered under the name "tcp-bc". */
static struct svc_xprt_class svc_tcp_bc_class = {
	.xcl_name		= "tcp-bc",
	.xcl_owner		= THIS_MODULE,
	.xcl_ops		= &svc_tcp_bc_ops,
	.xcl_max_payload	= RPCSVC_MAXPAYLOAD_TCP,
};
1289 1290 1291 1292 1293 1294 1295 1296 1297 1298

/* Register the "tcp-bc" backchannel transport class. */
static void svc_init_bc_xprt_sock(void)
{
	svc_reg_xprt_class(&svc_tcp_bc_class);
}

/* Unregister the "tcp-bc" backchannel transport class. */
static void svc_cleanup_bc_xprt_sock(void)
{
	svc_unreg_xprt_class(&svc_tcp_bc_class);
}
1299
#else /* CONFIG_SUNRPC_BACKCHANNEL */
1300 1301 1302 1303 1304 1305 1306
/* No-op variant used when CONFIG_SUNRPC_BACKCHANNEL is not defined. */
static void svc_init_bc_xprt_sock(void)
{
}

/* No-op variant used when CONFIG_SUNRPC_BACKCHANNEL is not defined. */
static void svc_cleanup_bc_xprt_sock(void)
{
}
1307
#endif /* CONFIG_SUNRPC_BACKCHANNEL */
1308

1309
static struct svc_xprt_ops svc_tcp_ops = {
1310
	.xpo_create = svc_tcp_create,
1311 1312
	.xpo_recvfrom = svc_tcp_recvfrom,
	.xpo_sendto = svc_tcp_sendto,
1313
	.xpo_release_rqst = svc_release_skb,
1314
	.xpo_detach = svc_tcp_sock_detach,
1315
	.xpo_free = svc_sock_free,
T
Tom Tucker 已提交
1316
	.xpo_prep_reply_hdr = svc_tcp_prep_reply_hdr,
1317
	.xpo_has_wspace = svc_tcp_has_wspace,
1318
	.xpo_accept = svc_tcp_accept,
1319
	.xpo_secure_port = svc_sock_secure_port,
1320
	.xpo_adjust_wspace = svc_tcp_adjust_wspace,
1321 1322 1323 1324
};

static struct svc_xprt_class svc_tcp_class = {
	.xcl_name = "tcp",
1325
	.xcl_owner = THIS_MODULE,
1326
	.xcl_ops = &svc_tcp_ops,
1327
	.xcl_max_payload = RPCSVC_MAXPAYLOAD_TCP,
1328
	.xcl_ident = XPRT_TRANSPORT_TCP,
1329 1330 1331 1332 1333 1334
};

/*
 * Register the socket-based transport classes: TCP first, then UDP,
 * then the backchannel class (a no-op without
 * CONFIG_SUNRPC_BACKCHANNEL).
 */
void svc_init_xprt_sock(void)
{
	svc_reg_xprt_class(&svc_tcp_class);
	svc_reg_xprt_class(&svc_udp_class);

	svc_init_bc_xprt_sock();
}

/*
 * Unregister the transport classes registered by svc_init_xprt_sock(),
 * in the same order: TCP, UDP, then the backchannel class.
 */
void svc_cleanup_xprt_sock(void)
{
	svc_unreg_xprt_class(&svc_tcp_class);
	svc_unreg_xprt_class(&svc_udp_class);

	svc_cleanup_bc_xprt_sock();
}

1345
static void svc_tcp_init(struct svc_sock *svsk, struct svc_serv *serv)
L
Linus Torvalds 已提交
1346 1347 1348
{
	struct sock	*sk = svsk->sk_sk;

1349 1350
	svc_xprt_init(sock_net(svsk->sk_sock->sk), &svc_tcp_class,
		      &svsk->sk_xprt, serv);
1351
	set_bit(XPT_CACHE_AUTH, &svsk->sk_xprt.xpt_flags);
L
Linus Torvalds 已提交
1352 1353
	if (sk->sk_state == TCP_LISTEN) {
		dprintk("setting up TCP socket for listening\n");
1354
		set_bit(XPT_LISTENER, &svsk->sk_xprt.xpt_flags);
L
Linus Torvalds 已提交
1355
		sk->sk_data_ready = svc_tcp_listen_data_ready;
1356
		set_bit(XPT_CONN, &svsk->sk_xprt.xpt_flags);
L
Linus Torvalds 已提交
1357 1358 1359 1360
	} else {
		dprintk("setting up TCP socket for reading\n");
		sk->sk_state_change = svc_tcp_state_change;
		sk->sk_data_ready = svc_tcp_data_ready;
1361
		sk->sk_write_space = svc_tcp_write_space;
L
Linus Torvalds 已提交
1362 1363 1364

		svsk->sk_reclen = 0;
		svsk->sk_tcplen = 0;
1365
		svsk->sk_datalen = 0;
1366
		memset(&svsk->sk_pages[0], 0, sizeof(svsk->sk_pages));
L
Linus Torvalds 已提交
1367

1368
		tcp_sk(sk)->nonagle |= TCP_NAGLE_OFF;
L
Linus Torvalds 已提交
1369

1370
		set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
1371
		if (sk->sk_state != TCP_ESTABLISHED)
1372
			set_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags);
L
Linus Torvalds 已提交
1373 1374 1375
	}
}

1376
void svc_sock_update_bufs(struct svc_serv *serv)
L
Linus Torvalds 已提交
1377 1378 1379 1380 1381
{
	/*
	 * The number of server threads has changed. Update
	 * rcvbuf and sndbuf accordingly on all sockets
	 */
1382
	struct svc_sock *svsk;
L
Linus Torvalds 已提交
1383 1384

	spin_lock_bh(&serv->sv_lock);
1385
	list_for_each_entry(svsk, &serv->sv_permsocks, sk_xprt.xpt_list)
1386
		set_bit(XPT_CHNGBUF, &svsk->sk_xprt.xpt_flags);
L
Linus Torvalds 已提交
1387 1388
	spin_unlock_bh(&serv->sv_lock);
}
1389
EXPORT_SYMBOL_GPL(svc_sock_update_bufs);
L
Linus Torvalds 已提交
1390 1391 1392 1393

/*
 * Initialize socket for RPC use and create svc_sock struct
 */
1394 1395
static struct svc_sock *svc_setup_socket(struct svc_serv *serv,
						struct socket *sock,
1396
						int flags)
L
Linus Torvalds 已提交
1397 1398 1399
{
	struct svc_sock	*svsk;
	struct sock	*inet;
1400
	int		pmap_register = !(flags & SVC_SOCK_ANONYMOUS);
1401
	int		err = 0;
L
Linus Torvalds 已提交
1402 1403

	dprintk("svc: svc_setup_socket %p\n", sock);
1404 1405 1406
	svsk = kzalloc(sizeof(*svsk), GFP_KERNEL);
	if (!svsk)
		return ERR_PTR(-ENOMEM);
L
Linus Torvalds 已提交
1407 1408 1409 1410

	inet = sock->sk;

	/* Register socket with portmapper */
1411 1412
	if (pmap_register)
		err = svc_register(serv, sock_net(sock->sk), inet->sk_family,
1413
				     inet->sk_protocol,
E
Eric Dumazet 已提交
1414
				     ntohs(inet_sk(inet)->inet_sport));
L
Linus Torvalds 已提交
1415

1416
	if (err < 0) {
L
Linus Torvalds 已提交
1417
		kfree(svsk);
1418
		return ERR_PTR(err);
L
Linus Torvalds 已提交
1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429
	}

	inet->sk_user_data = svsk;
	svsk->sk_sock = sock;
	svsk->sk_sk = inet;
	svsk->sk_ostate = inet->sk_state_change;
	svsk->sk_odata = inet->sk_data_ready;
	svsk->sk_owspace = inet->sk_write_space;

	/* Initialize the socket */
	if (sock->type == SOCK_DGRAM)
1430
		svc_udp_init(svsk, serv);
1431 1432 1433 1434 1435 1436
	else {
		/* initialise setting must have enough space to
		 * receive and respond to one request.
		 */
		svc_sock_setbufsize(svsk->sk_sock, 4 * serv->sv_max_mesg,
					4 * serv->sv_max_mesg);
1437
		svc_tcp_init(svsk, serv);
1438
	}
L
Linus Torvalds 已提交
1439 1440 1441 1442 1443 1444 1445

	dprintk("svc: svc_setup_socket created %p (inet %p)\n",
				svsk, svsk->sk_sk);

	return svsk;
}

1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461
bool svc_alien_sock(struct net *net, int fd)
{
	int err;
	struct socket *sock = sockfd_lookup(fd, &err);
	bool ret = false;

	if (!sock)
		goto out;
	if (sock_net(sock->sk) != net)
		ret = true;
	sockfd_put(sock);
out:
	return ret;
}
EXPORT_SYMBOL_GPL(svc_alien_sock);

1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474
/**
 * svc_addsock - add a listener socket to an RPC service
 * @serv: pointer to RPC service to which to add a new listener
 * @fd: file descriptor of the new listener
 * @name_return: pointer to buffer to fill in with name of listener
 * @len: size of the buffer
 *
 * Fills in socket name and returns positive length of name if successful.
 * Name is terminated with '\n'.  On error, returns a negative errno
 * value.
 */
int svc_addsock(struct svc_serv *serv, const int fd, char *name_return,
		const size_t len)
1475 1476 1477 1478
{
	int err = 0;
	struct socket *so = sockfd_lookup(fd, &err);
	struct svc_sock *svsk = NULL;
J
J. Bruce Fields 已提交
1479 1480 1481
	struct sockaddr_storage addr;
	struct sockaddr *sin = (struct sockaddr *)&addr;
	int salen;
1482 1483 1484

	if (!so)
		return err;
J
J. Bruce Fields 已提交
1485
	err = -EAFNOSUPPORT;
1486
	if ((so->sk->sk_family != PF_INET) && (so->sk->sk_family != PF_INET6))
J
J. Bruce Fields 已提交
1487 1488 1489
		goto out;
	err =  -EPROTONOSUPPORT;
	if (so->sk->sk_protocol != IPPROTO_TCP &&
1490
	    so->sk->sk_protocol != IPPROTO_UDP)
J
J. Bruce Fields 已提交
1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502
		goto out;
	err = -EISCONN;
	if (so->state > SS_UNCONNECTED)
		goto out;
	err = -ENOENT;
	if (!try_module_get(THIS_MODULE))
		goto out;
	svsk = svc_setup_socket(serv, so, SVC_SOCK_DEFAULTS);
	if (IS_ERR(svsk)) {
		module_put(THIS_MODULE);
		err = PTR_ERR(svsk);
		goto out;
1503
	}
J
J. Bruce Fields 已提交
1504 1505
	if (kernel_getsockname(svsk->sk_sock, sin, &salen) == 0)
		svc_xprt_set_local(&svsk->sk_xprt, sin, salen);
1506
	svc_add_new_perm_xprt(serv, &svsk->sk_xprt);
1507
	return svc_one_sock_name(svsk, name_return, len);
J
J. Bruce Fields 已提交
1508 1509 1510
out:
	sockfd_put(so);
	return err;
1511 1512 1513
}
EXPORT_SYMBOL_GPL(svc_addsock);

L
Linus Torvalds 已提交
1514 1515 1516
/*
 * Create socket for RPC service.
 */
1517 1518
static struct svc_xprt *svc_create_socket(struct svc_serv *serv,
					  int protocol,
1519
					  struct net *net,
1520 1521
					  struct sockaddr *sin, int len,
					  int flags)
L
Linus Torvalds 已提交
1522 1523 1524 1525 1526
{
	struct svc_sock	*svsk;
	struct socket	*sock;
	int		error;
	int		type;
1527 1528 1529
	struct sockaddr_storage addr;
	struct sockaddr *newsin = (struct sockaddr *)&addr;
	int		newlen;
1530 1531
	int		family;
	int		val;
1532
	RPC_IFDEBUG(char buf[RPC_MAX_ADDRBUFLEN]);
L
Linus Torvalds 已提交
1533

1534 1535
	dprintk("svc: svc_create_socket(%s, %d, %s)\n",
			serv->sv_program->pg_name, protocol,
1536
			__svc_print_addr(sin, buf, sizeof(buf)));
L
Linus Torvalds 已提交
1537 1538 1539 1540

	if (protocol != IPPROTO_UDP && protocol != IPPROTO_TCP) {
		printk(KERN_WARNING "svc: only UDP and TCP "
				"sockets supported\n");
1541
		return ERR_PTR(-EINVAL);
L
Linus Torvalds 已提交
1542
	}
1543

L
Linus Torvalds 已提交
1544
	type = (protocol == IPPROTO_UDP)? SOCK_DGRAM : SOCK_STREAM;
1545 1546 1547 1548 1549 1550 1551 1552 1553 1554
	switch (sin->sa_family) {
	case AF_INET6:
		family = PF_INET6;
		break;
	case AF_INET:
		family = PF_INET;
		break;
	default:
		return ERR_PTR(-EINVAL);
	}
L
Linus Torvalds 已提交
1555

1556
	error = __sock_create(net, family, type, protocol, &sock, 1);
1557
	if (error < 0)
1558
		return ERR_PTR(error);
L
Linus Torvalds 已提交
1559

1560 1561
	svc_reclassify_socket(sock);

1562 1563 1564 1565 1566 1567 1568 1569 1570 1571
	/*
	 * If this is an PF_INET6 listener, we want to avoid
	 * getting requests from IPv4 remotes.  Those should
	 * be shunted to a PF_INET listener via rpcbind.
	 */
	val = 1;
	if (family == PF_INET6)
		kernel_setsockopt(sock, SOL_IPV6, IPV6_V6ONLY,
					(char *)&val, sizeof(val));

1572
	if (type == SOCK_STREAM)
1573
		sock->sk->sk_reuse = SK_CAN_REUSE; /* allow address reuse */
1574
	error = kernel_bind(sock, sin, len);
1575 1576
	if (error < 0)
		goto bummer;
L
Linus Torvalds 已提交
1577

1578 1579 1580 1581 1582
	newlen = len;
	error = kernel_getsockname(sock, newsin, &newlen);
	if (error < 0)
		goto bummer;

L
Linus Torvalds 已提交
1583
	if (protocol == IPPROTO_TCP) {
1584
		if ((error = kernel_listen(sock, 64)) < 0)
L
Linus Torvalds 已提交
1585 1586 1587
			goto bummer;
	}

1588
	svsk = svc_setup_socket(serv, sock, flags);
J
J. Bruce Fields 已提交
1589 1590 1591
	if (IS_ERR(svsk)) {
		error = PTR_ERR(svsk);
		goto bummer;
1592
	}
J
J. Bruce Fields 已提交
1593 1594
	svc_xprt_set_local(&svsk->sk_xprt, newsin, newlen);
	return (struct svc_xprt *)svsk;
L
Linus Torvalds 已提交
1595 1596 1597
bummer:
	dprintk("svc: svc_create_socket error = %d\n", -error);
	sock_release(sock);
1598
	return ERR_PTR(error);
L
Linus Torvalds 已提交
1599 1600
}

1601 1602 1603 1604 1605 1606 1607 1608
/*
 * Detach the svc_sock from the socket so that no
 * more callbacks occur.
 */
static void svc_sock_detach(struct svc_xprt *xprt)
{
	struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt);
	struct sock *sk = svsk->sk_sk;
1609
	wait_queue_head_t *wq;
1610 1611 1612 1613 1614 1615 1616

	dprintk("svc: svc_sock_detach(%p)\n", svsk);

	/* put back the old socket callbacks */
	sk->sk_state_change = svsk->sk_ostate;
	sk->sk_data_ready = svsk->sk_odata;
	sk->sk_write_space = svsk->sk_owspace;
1617

1618
	wq = sk_sleep(sk);
1619
	if (sunrpc_waitqueue_active(wq))
1620
		wake_up_interruptible(wq);
1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633
}

/*
 * Disconnect the socket, and reset the callbacks
 */
static void svc_tcp_sock_detach(struct svc_xprt *xprt)
{
	struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt);

	dprintk("svc: svc_tcp_sock_detach(%p)\n", svsk);

	svc_sock_detach(xprt);

1634 1635
	if (!test_bit(XPT_LISTENER, &xprt->xpt_flags)) {
		svc_tcp_clear_pages(svsk);
1636
		kernel_sock_shutdown(svsk->sk_sock, SHUT_RDWR);
1637
	}
1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653
}

/*
 * Free the svc_sock's socket resources and the svc_sock itself.
 */
static void svc_sock_free(struct svc_xprt *xprt)
{
	struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt);
	dprintk("svc: svc_sock_free(%p)\n", svsk);

	if (svsk->sk_sock->file)
		sockfd_put(svsk->sk_sock);
	else
		sock_release(svsk->sk_sock);
	kfree(svsk);
}
1654

1655
#if defined(CONFIG_SUNRPC_BACKCHANNEL)
1656
/*
1657
 * Create a back channel svc_xprt which shares the fore channel socket.
1658
 */
1659 1660 1661 1662 1663
static struct svc_xprt *svc_bc_create_socket(struct svc_serv *serv,
					     int protocol,
					     struct net *net,
					     struct sockaddr *sin, int len,
					     int flags)
1664 1665
{
	struct svc_sock *svsk;
1666 1667 1668 1669 1670 1671 1672
	struct svc_xprt *xprt;

	if (protocol != IPPROTO_TCP) {
		printk(KERN_WARNING "svc: only TCP sockets"
			" supported on shared back channel\n");
		return ERR_PTR(-EINVAL);
	}
1673 1674 1675

	svsk = kzalloc(sizeof(*svsk), GFP_KERNEL);
	if (!svsk)
1676
		return ERR_PTR(-ENOMEM);
1677 1678

	xprt = &svsk->sk_xprt;
1679
	svc_xprt_init(net, &svc_tcp_bc_class, xprt, serv);
1680

1681
	serv->sv_bc_xprt = xprt;
1682

1683 1684 1685 1686
	return xprt;
}

/*
1687
 * Free a back channel svc_sock.
1688
 */
1689
static void svc_bc_sock_free(struct svc_xprt *xprt)
1690
{
1691
	if (xprt)
1692 1693
		kfree(container_of(xprt, struct svc_sock, sk_xprt));
}
1694
#endif /* CONFIG_SUNRPC_BACKCHANNEL */