/*
 * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the BSD-type
 * license below:
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 *      Redistributions of source code must retain the above copyright
 *      notice, this list of conditions and the following disclaimer.
 *
 *      Redistributions in binary form must reproduce the above
 *      copyright notice, this list of conditions and the following
 *      disclaimer in the documentation and/or other materials provided
 *      with the distribution.
 *
 *      Neither the name of the Network Appliance, Inc. nor the names of
 *      its contributors may be used to endorse or promote products
 *      derived from this software without specific prior written
 *      permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * rpc_rdma.c
 *
 * This file contains the guts of the RPC/RDMA protocol: it handles
 * marshaling and unmarshaling of RPC/RDMA messages, and it is where
 * the interface to the Linux RPC framework lives.
 */

#include "xprt_rdma.h"

#include <linux/highmem.h>

#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
# define RPCDBG_FACILITY	RPCDBG_TRANS
#endif

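/* Transfer modes for a call or reply (cf. transfertypes[] below):
 * rpcrdma_noch:	message is sent entirely inline
 * rpcrdma_readch:	part of the call is moved via a read chunk
 * rpcrdma_areadch:	the entire call is moved via a read chunk
 * rpcrdma_writech:	bulk reply data returns via a write chunk
 * rpcrdma_replych:	the entire reply returns via a reply chunk
 */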
enum rpcrdma_chunktype {
	rpcrdma_noch = 0,
	rpcrdma_readch,
	rpcrdma_areadch,
	rpcrdma_writech,
	rpcrdma_replych
};

#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
static const char transfertypes[][12] = {
	"pure inline",	/* no chunks */
	" read chunk",	/* some argument via rdma read */
	"*read chunk",	/* entire request via rdma read */
	"write chunk",	/* some result via rdma write */
	"reply chunk"	/* entire reply via rdma write */
};
#endif

/* The client can send a request inline as long as the RPCRDMA header
 * plus the RPC call fit under the transport's inline limit. If the
 * combined call message size exceeds that limit, the client must use
 * the read chunk list for this operation.
 */
static bool rpcrdma_args_inline(struct rpc_rqst *rqst)
{
	unsigned int callsize = RPCRDMA_HDRLEN_MIN + rqst->rq_snd_buf.len;

	return callsize <= RPCRDMA_INLINE_WRITE_THRESHOLD(rqst);
}

/* The client can't know how large the actual reply will be. Thus it
 * plans for the largest possible reply for that particular ULP
 * operation. If the maximum combined reply message size exceeds that
 * limit, the client must provide a write list or a reply chunk for
 * this request.
 */
static bool rpcrdma_results_inline(struct rpc_rqst *rqst)
{
	unsigned int repsize = RPCRDMA_HDRLEN_MIN + rqst->rq_rcv_buf.buflen;

	return repsize <= RPCRDMA_INLINE_READ_THRESHOLD(rqst);
}

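/* Returns the number of tail bytes to transmit inline, or zero when
 * the tail holds nothing but an XDR pad. If xdr_write_pages() placed
 * a pad at the front of the tail, slide the real tail content forward
 * so that it begins immediately after the head.
 */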
static int
rpcrdma_tail_pullup(struct xdr_buf *buf)
{
	size_t tlen = buf->tail[0].iov_len;
	size_t skip = tlen & 3;

	/* Do not include the tail if it is only an XDR pad */
	if (tlen < 4)
		return 0;

	/* xdr_write_pages() adds a pad at the beginning of the tail
	 * if the content in "buf->pages" is unaligned. Force the
	 * tail's actual content to land at the next XDR position
	 * after the head instead.
	 */
	if (skip) {
		unsigned char *src, *dst;
		unsigned int count;

		src = buf->tail[0].iov_base;
		dst = buf->head[0].iov_base;
		dst += buf->head[0].iov_len;

		src += skip;
		tlen -= skip;

		dprintk("RPC:       %s: skip=%zu, memmove(%p, %p, %zu)\n",
			__func__, skip, dst, src, tlen);

		for (count = tlen; count; count--)
			*dst++ = *src++;
	}

	return tlen;
}

/*
 * Chunk assembly from upper layer xdr_buf.
 *
 * Prepare the passed-in xdr_buf into representation as RPC/RDMA chunk
 * elements. Segments are then coalesced when registered, if possible
 * within the selected memreg mode.
 *
 * Returns positive number of segments converted, or a negative errno.
 */

static int
rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos,
	enum rpcrdma_chunktype type, struct rpcrdma_mr_seg *seg, int nsegs)
{
	int len, n = 0, p;
	int page_base;
	struct page **ppages;

	if (pos == 0 && xdrbuf->head[0].iov_len) {
		seg[n].mr_page = NULL;
		seg[n].mr_offset = xdrbuf->head[0].iov_base;
		seg[n].mr_len = xdrbuf->head[0].iov_len;
		++n;
	}

	len = xdrbuf->page_len;
	ppages = xdrbuf->pages + (xdrbuf->page_base >> PAGE_SHIFT);
	page_base = xdrbuf->page_base & ~PAGE_MASK;
	p = 0;
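	/* Build one segment per page (or partial page) of the page list */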
	while (len && n < nsegs) {
		if (!ppages[p]) {
			/* alloc the pagelist for receiving buffer */
			ppages[p] = alloc_page(GFP_ATOMIC);
			if (!ppages[p])
				return -ENOMEM;
		}
		seg[n].mr_page = ppages[p];
		seg[n].mr_offset = (void *)(unsigned long) page_base;
		seg[n].mr_len = min_t(u32, PAGE_SIZE - page_base, len);
		if (seg[n].mr_len > PAGE_SIZE)
			return -EIO;
		len -= seg[n].mr_len;
		++n;
		++p;
		page_base = 0;	/* page offset only applies to first page */
	}

	/* Message overflows the seg array */
	if (len && n == nsegs)
		return -EIO;

	/* When encoding the read list, the tail is always sent inline */
	if (type == rpcrdma_readch)
		return n;

	if (xdrbuf->tail[0].iov_len) {
		/* the rpcrdma protocol allows us to omit any trailing
		 * xdr pad bytes, saving the server an RDMA operation. */
		if (xdrbuf->tail[0].iov_len < 4 && xprt_rdma_pad_optimize)
			return n;
		if (n == nsegs)
			/* Tail remains, but we're out of segments */
			return -EIO;
		seg[n].mr_page = NULL;
		seg[n].mr_offset = xdrbuf->tail[0].iov_base;
		seg[n].mr_len = xdrbuf->tail[0].iov_len;
		++n;
	}

	return n;
}

/*
 * Create read/write chunk lists, and reply chunks, for RDMA
 *
 *   Assume check against THRESHOLD has been done, and chunks are required.
 *   Assume only encoding one list entry for read|write chunks. The NFSv3
 *     protocol is simple enough to allow this as it only has a single "bulk
 *     result" in each procedure - complicated NFSv4 COMPOUNDs are not. (The
 *     RDMA/Sessions NFSv4 proposal addresses this for future v4 revs.)
 *
 * When used for a single reply chunk (which is a special write
 * chunk used for the entire reply, rather than just the data), it
 * is used primarily for READDIR and READLINK which would otherwise
 * be severely size-limited by a small rdma inline read max. The server
 * response will come back as an RDMA Write, followed by a message
 * of type RDMA_NOMSG carrying the xid and length. As a result, reply
 * chunks do not provide data alignment, however they do not require
 * "fixup" (moving the response to the upper layer buffer) either.
 *
 * Encoding key for single-list chunks (HLOO = Handle32 Length32 Offset64):
 *
 *  Read chunklist (a linked list):
 *   N elements, position P (same P for all chunks of same arg!):
 *    1 - PHLOO - 1 - PHLOO - ... - 1 - PHLOO - 0
 *
 *  Write chunklist (a list of (one) counted array):
 *   N elements:
 *    1 - N - HLOO - HLOO - ... - HLOO - 0
 *
 *  Reply chunk (a counted array):
 *   N elements:
 *    1 - N - HLOO - HLOO - ... - HLOO
 *
 * Returns positive RPC/RDMA header size, or negative errno.
 */

static ssize_t
rpcrdma_create_chunks(struct rpc_rqst *rqst, struct xdr_buf *target,
		struct rpcrdma_msg *headerp, enum rpcrdma_chunktype type)
{
	struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
	struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(rqst->rq_xprt);
	int n, nsegs, nchunks = 0;
	unsigned int pos;
	struct rpcrdma_mr_seg *seg = req->rl_segments;
	struct rpcrdma_read_chunk *cur_rchunk = NULL;
	struct rpcrdma_write_array *warray = NULL;
	struct rpcrdma_write_chunk *cur_wchunk = NULL;
	__be32 *iptr = headerp->rm_body.rm_chunks;
	int (*map)(struct rpcrdma_xprt *, struct rpcrdma_mr_seg *, int, bool);

	if (type == rpcrdma_readch || type == rpcrdma_areadch) {
		/* a read chunk - server will RDMA Read our memory */
		cur_rchunk = (struct rpcrdma_read_chunk *) iptr;
	} else {
		/* a write or reply chunk - server will RDMA Write our memory */
		*iptr++ = xdr_zero;	/* encode a NULL read chunk list */
		if (type == rpcrdma_replych)
			*iptr++ = xdr_zero;	/* a NULL write chunk list */
		warray = (struct rpcrdma_write_array *) iptr;
		cur_wchunk = (struct rpcrdma_write_chunk *) (warray + 1);
	}

	if (type == rpcrdma_replych || type == rpcrdma_areadch)
		pos = 0;
	else
		pos = target->head[0].iov_len;

	nsegs = rpcrdma_convert_iovs(target, pos, type, seg, RPCRDMA_MAX_SEGS);
	if (nsegs < 0)
		return nsegs;

	map = r_xprt->rx_ia.ri_ops->ro_map;
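	/* Register the segments, encoding one chunk element per
	 * registration, until all segments have been consumed.
	 */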
	do {
		n = map(r_xprt, seg, nsegs, cur_wchunk != NULL);
		if (n <= 0)
			goto out;
		if (cur_rchunk) {	/* read */
			cur_rchunk->rc_discrim = xdr_one;
			/* all read chunks have the same "position" */
			cur_rchunk->rc_position = cpu_to_be32(pos);
			cur_rchunk->rc_target.rs_handle =
						cpu_to_be32(seg->mr_rkey);
			cur_rchunk->rc_target.rs_length =
						cpu_to_be32(seg->mr_len);
			xdr_encode_hyper(
					(__be32 *)&cur_rchunk->rc_target.rs_offset,
					seg->mr_base);
			dprintk("RPC:       %s: read chunk "
				"elem %d@0x%llx:0x%x pos %u (%s)\n", __func__,
				seg->mr_len, (unsigned long long)seg->mr_base,
				seg->mr_rkey, pos, n < nsegs ? "more" : "last");
			cur_rchunk++;
			r_xprt->rx_stats.read_chunk_count++;
		} else {		/* write/reply */
			cur_wchunk->wc_target.rs_handle =
						cpu_to_be32(seg->mr_rkey);
			cur_wchunk->wc_target.rs_length =
						cpu_to_be32(seg->mr_len);
			xdr_encode_hyper(
					(__be32 *)&cur_wchunk->wc_target.rs_offset,
					seg->mr_base);
			dprintk("RPC:       %s: %s chunk "
				"elem %d@0x%llx:0x%x (%s)\n", __func__,
				(type == rpcrdma_replych) ? "reply" : "write",
				seg->mr_len, (unsigned long long)seg->mr_base,
				seg->mr_rkey, n < nsegs ? "more" : "last");
			cur_wchunk++;
			if (type == rpcrdma_replych)
				r_xprt->rx_stats.reply_chunk_count++;
			else
				r_xprt->rx_stats.write_chunk_count++;
			r_xprt->rx_stats.total_rdma_request += seg->mr_len;
		}
		nchunks++;
		seg   += n;
		nsegs -= n;
	} while (nsegs);

	/* success. all failures return above */
	req->rl_nchunks = nchunks;

	/*
	 * finish off header. If write, marshal discrim and nchunks.
	 */
	if (cur_rchunk) {
		iptr = (__be32 *) cur_rchunk;
		*iptr++ = xdr_zero;	/* finish the read chunk list */
		*iptr++ = xdr_zero;	/* encode a NULL write chunk list */
		*iptr++ = xdr_zero;	/* encode a NULL reply chunk */
	} else {
		warray->wc_discrim = xdr_one;
		warray->wc_nchunks = cpu_to_be32(nchunks);
		iptr = (__be32 *) cur_wchunk;
		if (type == rpcrdma_writech) {
			*iptr++ = xdr_zero; /* finish the write chunk list */
			*iptr++ = xdr_zero; /* encode a NULL reply chunk */
		}
	}

	/*
	 * Return header size.
	 */
	return (unsigned char *)iptr - (unsigned char *)headerp;

out:
	for (pos = 0; nchunks--;)
		pos += r_xprt->rx_ia.ri_ops->ro_unmap(r_xprt,
						      &req->rl_segments[pos]);
	return n;
}

/*
 * Copy write data inline.
 * This function is used for "small" requests. Data which is passed
 * to RPC via iovecs (or page list) is copied directly into the
 * pre-registered memory buffer for this request. For small amounts
 * of data, this is efficient. The cutoff value is tunable.
 */
static void rpcrdma_inline_pullup(struct rpc_rqst *rqst)
{
	int i, npages, curlen;
	int copy_len;
	unsigned char *srcp, *destp;
	struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(rqst->rq_xprt);
	int page_base;
	struct page **ppages;

	destp = rqst->rq_svec[0].iov_base;
	curlen = rqst->rq_svec[0].iov_len;
	destp += curlen;

	dprintk("RPC:       %s: destp 0x%p len %d hdrlen %d\n",
		__func__, destp, rqst->rq_slen, curlen);

	copy_len = rqst->rq_snd_buf.page_len;

	if (rqst->rq_snd_buf.tail[0].iov_len) {
		curlen = rqst->rq_snd_buf.tail[0].iov_len;
		if (destp + copy_len != rqst->rq_snd_buf.tail[0].iov_base) {
			memmove(destp + copy_len,
				rqst->rq_snd_buf.tail[0].iov_base, curlen);
			r_xprt->rx_stats.pullup_copy_count += curlen;
		}
		dprintk("RPC:       %s: tail destp 0x%p len %d\n",
			__func__, destp + copy_len, curlen);
		rqst->rq_svec[0].iov_len += curlen;
	}
	r_xprt->rx_stats.pullup_copy_count += copy_len;

	page_base = rqst->rq_snd_buf.page_base;
	ppages = rqst->rq_snd_buf.pages + (page_base >> PAGE_SHIFT);
	page_base &= ~PAGE_MASK;
	npages = PAGE_ALIGN(page_base+copy_len) >> PAGE_SHIFT;
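	/* Copy the page list into the send buffer, one page at a time */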
	for (i = 0; copy_len && i < npages; i++) {
		curlen = PAGE_SIZE - page_base;
		if (curlen > copy_len)
			curlen = copy_len;
		dprintk("RPC:       %s: page %d destp 0x%p len %d curlen %d\n",
			__func__, i, destp, copy_len, curlen);
		srcp = kmap_atomic(ppages[i]);
		memcpy(destp, srcp+page_base, curlen);
		kunmap_atomic(srcp);
		rqst->rq_svec[0].iov_len += curlen;
		destp += curlen;
		copy_len -= curlen;
		page_base = 0;
	}
	/* header now contains entire send message */
}

/*
 * Marshal a request: the primary job of this routine is to choose
 * the transfer modes. See comments below.
 *
 * Uses multiple RDMA IOVs for a request:
 *  [0] -- RPC RDMA header, which uses memory from the *start* of the
 *         preregistered buffer that already holds the RPC data in
 *         its middle.
 *  [1] -- the RPC header/data, marshaled by RPC and the NFS protocol.
 *  [2] -- optional padding.
 *  [3] -- if padded, header only in [1] and data here.
 *
 * Returns zero on success, otherwise a negative errno.
 */

int
rpcrdma_marshal_req(struct rpc_rqst *rqst)
{
	struct rpc_xprt *xprt = rqst->rq_xprt;
	struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
	struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
	char *base;
	size_t rpclen;
	ssize_t hdrlen;
	enum rpcrdma_chunktype rtype, wtype;
	struct rpcrdma_msg *headerp;

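	/* Backchannel replies are marshaled by their own routine */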
#if defined(CONFIG_SUNRPC_BACKCHANNEL)
	if (test_bit(RPC_BC_PA_IN_USE, &rqst->rq_bc_pa_state))
		return rpcrdma_bc_marshal_reply(rqst);
#endif

	/*
	 * rpclen gets amount of data in first buffer, which is the
	 * pre-registered buffer.
	 */
	base = rqst->rq_svec[0].iov_base;
	rpclen = rqst->rq_svec[0].iov_len;

	headerp = rdmab_to_msg(req->rl_rdmabuf);
	/* don't byte-swap XID, it's already done in request */
	headerp->rm_xid = rqst->rq_xid;
	headerp->rm_vers = rpcrdma_version;
	headerp->rm_credit = cpu_to_be32(r_xprt->rx_buf.rb_max_requests);
	headerp->rm_type = rdma_msg;

	/*
	 * Chunks needed for results?
	 *
	 * o Read ops return data as write chunk(s), header as inline.
	 * o If the expected result is under the inline threshold, all ops
	 *   return as inline.
	 * o Large non-read ops return as a single reply chunk.
	 */
	if (rqst->rq_rcv_buf.flags & XDRBUF_READ)
		wtype = rpcrdma_writech;
	else if (rpcrdma_results_inline(rqst))
		wtype = rpcrdma_noch;
	else
		wtype = rpcrdma_replych;

	/*
	 * Chunks needed for arguments?
	 *
	 * o If the total request is under the inline threshold, all ops
	 *   are sent as inline.
	 * o Large write ops transmit data as read chunk(s), header as
	 *   inline.
	 * o Large non-write ops are sent with the entire message as a
	 *   single read chunk (protocol 0-position special case).
	 *
	 * This assumes that the upper layer does not present a request
	 * that both has a data payload, and whose non-data arguments
	 * by themselves are larger than the inline threshold.
	 */
	if (rpcrdma_args_inline(rqst)) {
		rtype = rpcrdma_noch;
	} else if (rqst->rq_snd_buf.flags & XDRBUF_WRITE) {
		rtype = rpcrdma_readch;
	} else {
		r_xprt->rx_stats.nomsg_call_count++;
		headerp->rm_type = htonl(RDMA_NOMSG);
		rtype = rpcrdma_areadch;
		rpclen = 0;
	}

	/* The following simplification is not true forever */
	if (rtype != rpcrdma_noch && wtype == rpcrdma_replych)
		wtype = rpcrdma_noch;
	if (rtype != rpcrdma_noch && wtype != rpcrdma_noch) {
		dprintk("RPC:       %s: cannot marshal multiple chunk lists\n",
			__func__);
		return -EIO;
	}

	hdrlen = RPCRDMA_HDRLEN_MIN;

	/*
	 * Pull up any extra send data into the preregistered buffer.
	 * When padding is in use and applies to the transfer, insert
	 * it and change the message type.
	 */
	if (rtype == rpcrdma_noch) {

		rpcrdma_inline_pullup(rqst);

		headerp->rm_body.rm_nochunks.rm_empty[0] = xdr_zero;
		headerp->rm_body.rm_nochunks.rm_empty[1] = xdr_zero;
		headerp->rm_body.rm_nochunks.rm_empty[2] = xdr_zero;
		/* new length after pullup */
		rpclen = rqst->rq_svec[0].iov_len;
	} else if (rtype == rpcrdma_readch)
		rpclen += rpcrdma_tail_pullup(&rqst->rq_snd_buf);
	if (rtype != rpcrdma_noch) {
		hdrlen = rpcrdma_create_chunks(rqst, &rqst->rq_snd_buf,
					       headerp, rtype);
		wtype = rtype;	/* simplify dprintk */

	} else if (wtype != rpcrdma_noch) {
		hdrlen = rpcrdma_create_chunks(rqst, &rqst->rq_rcv_buf,
					       headerp, wtype);
	}
	if (hdrlen < 0)
		return hdrlen;

	dprintk("RPC:       %s: %s: hdrlen %zd rpclen %zd"
		" headerp 0x%p base 0x%p lkey 0x%x\n",
		__func__, transfertypes[wtype], hdrlen, rpclen,
		headerp, base, rdmab_lkey(req->rl_rdmabuf));

	/*
	 * initialize send_iov's - normally only two: rdma chunk header and
	 * single preregistered RPC header buffer, but if padding is present,
	 * then use a preregistered (and zeroed) pad buffer between the RPC
	 * header and any write data. In all non-rdma cases, any following
	 * data has been copied into the RPC header buffer.
	 */
	req->rl_send_iov[0].addr = rdmab_addr(req->rl_rdmabuf);
	req->rl_send_iov[0].length = hdrlen;
	req->rl_send_iov[0].lkey = rdmab_lkey(req->rl_rdmabuf);

	req->rl_niovs = 1;
	if (rtype == rpcrdma_areadch)
		return 0;

	req->rl_send_iov[1].addr = rdmab_addr(req->rl_sendbuf);
	req->rl_send_iov[1].length = rpclen;
	req->rl_send_iov[1].lkey = rdmab_lkey(req->rl_sendbuf);

	req->rl_niovs = 2;
	return 0;
}

/*
 * Chase down a received write or reply chunklist to get length
 * RDMA'd by server. See map at rpcrdma_create_chunks()! :-)
 */
static int
rpcrdma_count_chunks(struct rpcrdma_rep *rep, unsigned int max, int wrchunk, __be32 **iptrp)
{
	unsigned int i, total_len;
	struct rpcrdma_write_chunk *cur_wchunk;
	char *base = (char *)rdmab_to_msg(rep->rr_rdmabuf);

	i = be32_to_cpu(**iptrp);
	if (i > max)
		return -1;
	cur_wchunk = (struct rpcrdma_write_chunk *) (*iptrp + 1);
	total_len = 0;
	while (i--) {
		struct rpcrdma_segment *seg = &cur_wchunk->wc_target;
		ifdebug(FACILITY) {
			u64 off;
			xdr_decode_hyper((__be32 *)&seg->rs_offset, &off);
			dprintk("RPC:       %s: chunk %d@0x%llx:0x%x\n",
				__func__,
				be32_to_cpu(seg->rs_length),
				(unsigned long long)off,
				be32_to_cpu(seg->rs_handle));
		}
		total_len += be32_to_cpu(seg->rs_length);
		++cur_wchunk;
	}
	/* check and adjust for properly terminated write chunk */
	if (wrchunk) {
		__be32 *w = (__be32 *) cur_wchunk;
		if (*w++ != xdr_zero)
			return -1;
		cur_wchunk = (struct rpcrdma_write_chunk *) w;
	}
	if ((char *)cur_wchunk > base + rep->rr_len)
		return -1;

	*iptrp = (__be32 *) cur_wchunk;
	return total_len;
}

/*
 * Scatter inline received data back into provided iov's.
 */
static void
rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len, int pad)
{
	int i, npages, curlen, olen;
	char *destp;
	struct page **ppages;
	int page_base;

	curlen = rqst->rq_rcv_buf.head[0].iov_len;
	if (curlen > copy_len) {	/* write chunk header fixup */
		curlen = copy_len;
		rqst->rq_rcv_buf.head[0].iov_len = curlen;
	}

	dprintk("RPC:       %s: srcp 0x%p len %d hdrlen %d\n",
		__func__, srcp, copy_len, curlen);

	/* Shift pointer for first receive segment only */
	rqst->rq_rcv_buf.head[0].iov_base = srcp;
	srcp += curlen;
	copy_len -= curlen;

	olen = copy_len;
	i = 0;
	rpcx_to_rdmax(rqst->rq_xprt)->rx_stats.fixup_copy_count += olen;
	page_base = rqst->rq_rcv_buf.page_base;
	ppages = rqst->rq_rcv_buf.pages + (page_base >> PAGE_SHIFT);
	page_base &= ~PAGE_MASK;

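	/* Scatter the remaining inline data into the receive page list */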
	if (copy_len && rqst->rq_rcv_buf.page_len) {
		npages = PAGE_ALIGN(page_base +
			rqst->rq_rcv_buf.page_len) >> PAGE_SHIFT;
		for (; i < npages; i++) {
			curlen = PAGE_SIZE - page_base;
			if (curlen > copy_len)
				curlen = copy_len;
			dprintk("RPC:       %s: page %d"
				" srcp 0x%p len %d curlen %d\n",
				__func__, i, srcp, copy_len, curlen);
			destp = kmap_atomic(ppages[i]);
			memcpy(destp + page_base, srcp, curlen);
			flush_dcache_page(ppages[i]);
			kunmap_atomic(destp);
			srcp += curlen;
			copy_len -= curlen;
			if (copy_len == 0)
				break;
			page_base = 0;
		}
	}

	if (copy_len && rqst->rq_rcv_buf.tail[0].iov_len) {
		curlen = copy_len;
		if (curlen > rqst->rq_rcv_buf.tail[0].iov_len)
			curlen = rqst->rq_rcv_buf.tail[0].iov_len;
		if (rqst->rq_rcv_buf.tail[0].iov_base != srcp)
			memmove(rqst->rq_rcv_buf.tail[0].iov_base, srcp, curlen);
		dprintk("RPC:       %s: tail srcp 0x%p len %d curlen %d\n",
			__func__, srcp, copy_len, curlen);
		rqst->rq_rcv_buf.tail[0].iov_len = curlen;
		copy_len -= curlen; ++i;
	} else
		rqst->rq_rcv_buf.tail[0].iov_len = 0;

	if (pad) {
		/* implicit padding on terminal chunk */
		unsigned char *p = rqst->rq_rcv_buf.tail[0].iov_base;
		while (pad--)
			p[rqst->rq_rcv_buf.tail[0].iov_len++] = 0;
	}

	if (copy_len)
		dprintk("RPC:       %s: %d bytes in"
			" %d extra segments (%d lost)\n",
			__func__, olen, i, copy_len);

	/* TBD avoid a warning from call_decode() */
	rqst->rq_private_buf = rqst->rq_rcv_buf;
}

void
rpcrdma_connect_worker(struct work_struct *work)
{
	struct rpcrdma_ep *ep =
		container_of(work, struct rpcrdma_ep, rep_connect_worker.work);
	struct rpcrdma_xprt *r_xprt =
		container_of(ep, struct rpcrdma_xprt, rx_ep);
	struct rpc_xprt *xprt = &r_xprt->rx_xprt;

	spin_lock_bh(&xprt->transport_lock);
	if (++xprt->connect_cookie == 0)	/* maintain a reserved value */
		++xprt->connect_cookie;
	if (ep->rep_connected > 0) {
		if (!xprt_test_and_set_connected(xprt))
			xprt_wake_pending_tasks(xprt, 0);
	} else {
		if (xprt_test_and_clear_connected(xprt))
			xprt_wake_pending_tasks(xprt, -ENOTCONN);
	}
	spin_unlock_bh(&xprt->transport_lock);
}

#if defined(CONFIG_SUNRPC_BACKCHANNEL)
/* By convention, backchannel calls arrive via rdma_msg type
 * messages, and never populate the chunk lists. This makes
 * the RPC/RDMA header small and fixed in size, so it is
 * straightforward to check the RPC header's direction field.
 */
static bool
rpcrdma_is_bcall(struct rpcrdma_msg *headerp)
{
	__be32 *p = (__be32 *)headerp;

	if (headerp->rm_type != rdma_msg)
		return false;
	if (headerp->rm_body.rm_chunks[0] != xdr_zero)
		return false;
	if (headerp->rm_body.rm_chunks[1] != xdr_zero)
		return false;
	if (headerp->rm_body.rm_chunks[2] != xdr_zero)
		return false;

	/* sanity */
	if (p[7] != headerp->rm_xid)
		return false;
	/* call direction */
	if (p[8] != cpu_to_be32(RPC_CALL))
		return false;

	return true;
}
#endif	/* CONFIG_SUNRPC_BACKCHANNEL */

/*
 * This function is called when an async event is posted to
 * the connection which changes the connection state. All it
 * does at this point is mark the connection up/down, the rpc
 * timers do the rest.
 */
void
rpcrdma_conn_func(struct rpcrdma_ep *ep)
{
	schedule_delayed_work(&ep->rep_connect_worker, 0);
}

/* Process received RPC/RDMA messages.
 *
 * Errors must result in the RPC task either being awakened, or
 * allowed to timeout, to discover the errors at that time.
 */
void
rpcrdma_reply_handler(struct rpcrdma_rep *rep)
{
	struct rpcrdma_msg *headerp;
	struct rpcrdma_req *req;
	struct rpc_rqst *rqst;
	struct rpcrdma_xprt *r_xprt = rep->rr_rxprt;
	struct rpc_xprt *xprt = &r_xprt->rx_xprt;
	__be32 *iptr;
	int rdmalen, status;
	unsigned long cwnd;
	u32 credits;

	dprintk("RPC:       %s: incoming rep %p\n", __func__, rep);

	if (rep->rr_len == RPCRDMA_BAD_LEN)
		goto out_badstatus;
	if (rep->rr_len < RPCRDMA_HDRLEN_MIN)
		goto out_shortreply;

	headerp = rdmab_to_msg(rep->rr_rdmabuf);
	if (headerp->rm_vers != rpcrdma_version)
		goto out_badversion;
#if defined(CONFIG_SUNRPC_BACKCHANNEL)
	if (rpcrdma_is_bcall(headerp))
		goto out_bcall;
#endif

	/* Match incoming rpcrdma_rep to an rpcrdma_req to
	 * get context for handling any incoming chunks.
	 */
	spin_lock_bh(&xprt->transport_lock);
	rqst = xprt_lookup_rqst(xprt, headerp->rm_xid);
	if (!rqst)
		goto out_nomatch;

	req = rpcr_to_rdmar(rqst);
	if (req->rl_reply)
		goto out_duplicate;

	/* Sanity checking has passed. We are now committed
	 * to complete this transaction.
	 */
	list_del_init(&rqst->rq_list);
	spin_unlock_bh(&xprt->transport_lock);
	dprintk("RPC:       %s: reply 0x%p completes request 0x%p\n"
		"                   RPC request 0x%p xid 0x%08x\n",
			__func__, rep, req, rqst,
			be32_to_cpu(headerp->rm_xid));

	/* from here on, the reply is no longer an orphan */
	req->rl_reply = rep;
	xprt->reestablish_timeout = 0;

	/* check for expected message types */
	/* The order of some of these tests is important. */
	switch (headerp->rm_type) {
	case rdma_msg:
		/* never expect read chunks */
		/* never expect reply chunks (two ways to check) */
		/* never expect write chunks without having offered RDMA */
		if (headerp->rm_body.rm_chunks[0] != xdr_zero ||
		    (headerp->rm_body.rm_chunks[1] == xdr_zero &&
		     headerp->rm_body.rm_chunks[2] != xdr_zero) ||
		    (headerp->rm_body.rm_chunks[1] != xdr_zero &&
		     req->rl_nchunks == 0))
			goto badheader;
		if (headerp->rm_body.rm_chunks[1] != xdr_zero) {
			/* count any expected write chunks in read reply */
			/* start at write chunk array count */
			iptr = &headerp->rm_body.rm_chunks[2];
			rdmalen = rpcrdma_count_chunks(rep,
						req->rl_nchunks, 1, &iptr);
			/* check for validity, and no reply chunk after */
			if (rdmalen < 0 || *iptr++ != xdr_zero)
				goto badheader;
			rep->rr_len -=
			    ((unsigned char *)iptr - (unsigned char *)headerp);
			status = rep->rr_len + rdmalen;
			r_xprt->rx_stats.total_rdma_reply += rdmalen;
			/* special case - last chunk may omit padding */
			if (rdmalen &= 3) {
				rdmalen = 4 - rdmalen;
				status += rdmalen;
			}
		} else {
			/* else ordinary inline */
			rdmalen = 0;
			iptr = (__be32 *)((unsigned char *)headerp +
							RPCRDMA_HDRLEN_MIN);
			rep->rr_len -= RPCRDMA_HDRLEN_MIN;
			status = rep->rr_len;
		}
		/* Fix up the rpc results for upper layer */
		rpcrdma_inline_fixup(rqst, (char *)iptr, rep->rr_len, rdmalen);
		break;

	case rdma_nomsg:
		/* never expect read or write chunks, always reply chunks */
		if (headerp->rm_body.rm_chunks[0] != xdr_zero ||
		    headerp->rm_body.rm_chunks[1] != xdr_zero ||
		    headerp->rm_body.rm_chunks[2] != xdr_one ||
		    req->rl_nchunks == 0)
			goto badheader;
		iptr = (__be32 *)((unsigned char *)headerp +
							RPCRDMA_HDRLEN_MIN);
		rdmalen = rpcrdma_count_chunks(rep, req->rl_nchunks, 0, &iptr);
		if (rdmalen < 0)
			goto badheader;
		r_xprt->rx_stats.total_rdma_reply += rdmalen;
		/* Reply chunk buffer already is the reply vector - no fixup. */
		status = rdmalen;
		break;

badheader:
	default:
		dprintk("%s: invalid rpcrdma reply header (type %d):"
				" chunks[012] == %d %d %d"
				" expected chunks <= %d\n",
				__func__, be32_to_cpu(headerp->rm_type),
				headerp->rm_body.rm_chunks[0],
				headerp->rm_body.rm_chunks[1],
				headerp->rm_body.rm_chunks[2],
				req->rl_nchunks);
		status = -EIO;
		r_xprt->rx_stats.bad_reply_count++;
		break;
	}

	/* Invalidate and flush the data payloads before waking the
	 * waiting application. This guarantees the memory region is
	 * properly fenced from the server before the application
	 * accesses the data. It also ensures proper send flow
	 * control: waking the next RPC waits until this RPC has
	 * relinquished all its Send Queue entries.
	 */
	if (req->rl_nchunks)
		r_xprt->rx_ia.ri_ops->ro_unmap_sync(r_xprt, req);

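	/* Apply the credit value advertised in the reply, clamped to a
	 * sane range, to the transport's congestion window.
	 */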
	credits = be32_to_cpu(headerp->rm_credit);
	if (credits == 0)
		credits = 1;	/* don't deadlock */
	else if (credits > r_xprt->rx_buf.rb_max_requests)
		credits = r_xprt->rx_buf.rb_max_requests;

	spin_lock_bh(&xprt->transport_lock);
	cwnd = xprt->cwnd;
	xprt->cwnd = credits << RPC_CWNDSHIFT;
	if (xprt->cwnd > cwnd)
		xprt_release_rqst_cong(rqst->rq_task);

	xprt_complete_rqst(rqst->rq_task, status);
	spin_unlock_bh(&xprt->transport_lock);
	dprintk("RPC:       %s: xprt_complete_rqst(0x%p, 0x%p, %d)\n",
			__func__, xprt, rqst, status);
	return;

out_badstatus:
	rpcrdma_recv_buffer_put(rep);
	if (r_xprt->rx_ep.rep_connected == 1) {
		r_xprt->rx_ep.rep_connected = -EIO;
		rpcrdma_conn_func(&r_xprt->rx_ep);
	}
	return;

#if defined(CONFIG_SUNRPC_BACKCHANNEL)
out_bcall:
	rpcrdma_bc_receive_call(r_xprt, rep);
	return;
#endif

out_shortreply:
	dprintk("RPC:       %s: short/invalid reply\n", __func__);
	goto repost;

out_badversion:
	dprintk("RPC:       %s: invalid version %d\n",
		__func__, be32_to_cpu(headerp->rm_vers));
	goto repost;

out_nomatch:
	spin_unlock_bh(&xprt->transport_lock);
	dprintk("RPC:       %s: no match for incoming xid 0x%08x len %d\n",
		__func__, be32_to_cpu(headerp->rm_xid),
		rep->rr_len);
	goto repost;

out_duplicate:
	spin_unlock_bh(&xprt->transport_lock);
	dprintk("RPC:       %s: "
		"duplicate reply %p to RPC request %p: xid 0x%08x\n",
		__func__, rep, req, be32_to_cpu(headerp->rm_xid));

repost:
	r_xprt->rx_stats.bad_reply_count++;
	if (rpcrdma_ep_post_recv(&r_xprt->rx_ia, &r_xprt->rx_ep, rep))
		rpcrdma_recv_buffer_put(rep);
}