/*
 * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the BSD-type
 * license below:
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 *      Redistributions of source code must retain the above copyright
 *      notice, this list of conditions and the following disclaimer.
 *
 *      Redistributions in binary form must reproduce the above
 *      copyright notice, this list of conditions and the following
 *      disclaimer in the documentation and/or other materials provided
 *      with the distribution.
 *
 *      Neither the name of the Network Appliance, Inc. nor the names of
 *      its contributors may be used to endorse or promote products
 *      derived from this software without specific prior written
 *      permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * rpc_rdma.c
 *
 * This file contains the guts of the RPC RDMA protocol, and
 * does marshaling/unmarshaling, etc. It is also where interfacing
 * to the Linux RPC framework lives.
 */

#include "xprt_rdma.h"

#include <linux/highmem.h>

#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
# define RPCDBG_FACILITY	RPCDBG_TRANS
#endif

enum rpcrdma_chunktype {
	rpcrdma_noch = 0,
	rpcrdma_readch,
	rpcrdma_areadch,
	rpcrdma_writech,
	rpcrdma_replych
};

#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
static const char transfertypes[][12] = {
	"pure inline",	/* no chunks */
	" read chunk",	/* some argument via rdma read */
	"*read chunk",	/* entire request via rdma read */
	"write chunk",	/* some result via rdma write */
	"reply chunk"	/* entire reply via rdma write */
};
#endif

/* The client can send a request inline as long as the RPCRDMA header
 * plus the RPC call fit under the transport's inline limit. If the
 * combined call message size exceeds that limit, the client must use
 * the read chunk list for this operation.
 */
static bool rpcrdma_args_inline(struct rpc_rqst *rqst)
{
	unsigned int callsize = RPCRDMA_HDRLEN_MIN + rqst->rq_snd_buf.len;

	return callsize <= RPCRDMA_INLINE_WRITE_THRESHOLD(rqst);
}

/* The client can't know how large the actual reply will be. Thus it
 * plans for the largest possible reply for that particular ULP
 * operation. If the maximum combined reply message size exceeds that
 * limit, the client must provide a write list or a reply chunk for
 * this request.
 */
static bool rpcrdma_results_inline(struct rpc_rqst *rqst)
{
	unsigned int repsize = RPCRDMA_HDRLEN_MIN + rqst->rq_rcv_buf.buflen;

	return repsize <= RPCRDMA_INLINE_READ_THRESHOLD(rqst);
}
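
/* Illustrative arithmetic only (the thresholds are per-connection and
 * tunable): with a 1024-byte inline threshold and the 28-byte fixed
 * RPC/RDMA header (RPCRDMA_HDRLEN_MIN), a call whose rq_snd_buf.len is
 * at most 996 bytes can be sent inline, and a reply whose maximum size
 * is at most 996 bytes needs neither a write list nor a reply chunk.
 */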

static int
rpcrdma_tail_pullup(struct xdr_buf *buf)
{
	size_t tlen = buf->tail[0].iov_len;
	size_t skip = tlen & 3;

	/* Do not include the tail if it is only an XDR pad */
	if (tlen < 4)
		return 0;

	/* xdr_write_pages() adds a pad at the beginning of the tail
	 * if the content in "buf->pages" is unaligned. Force the
	 * tail's actual content to land at the next XDR position
	 * after the head instead.
	 */
	if (skip) {
		unsigned char *src, *dst;
		unsigned int count;

		src = buf->tail[0].iov_base;
		dst = buf->head[0].iov_base;
		dst += buf->head[0].iov_len;

		src += skip;
		tlen -= skip;

		dprintk("RPC:       %s: skip=%zu, memmove(%p, %p, %zu)\n",
			__func__, skip, dst, src, tlen);

		for (count = tlen; count; count--)
			*dst++ = *src++;
	}

	return tlen;
}

/*
 * Chunk assembly from upper layer xdr_buf.
 *
 * Prepare the passed-in xdr_buf into representation as RPC/RDMA chunk
 * elements. Segments are then coalesced when registered, if possible
 * within the selected memreg mode.
 *
 * Returns positive number of segments converted, or a negative errno.
 */

static int
rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos,
	enum rpcrdma_chunktype type, struct rpcrdma_mr_seg *seg, int nsegs)
{
	int len, n = 0, p;
	int page_base;
	struct page **ppages;

	if (pos == 0 && xdrbuf->head[0].iov_len) {
		seg[n].mr_page = NULL;
		seg[n].mr_offset = xdrbuf->head[0].iov_base;
		seg[n].mr_len = xdrbuf->head[0].iov_len;
		++n;
	}

	len = xdrbuf->page_len;
	ppages = xdrbuf->pages + (xdrbuf->page_base >> PAGE_SHIFT);
	page_base = xdrbuf->page_base & ~PAGE_MASK;
	p = 0;
	while (len && n < nsegs) {
		if (!ppages[p]) {
			/* alloc the pagelist for receiving buffer */
			ppages[p] = alloc_page(GFP_ATOMIC);
			if (!ppages[p])
				return -ENOMEM;
		}
		seg[n].mr_page = ppages[p];
		seg[n].mr_offset = (void *)(unsigned long) page_base;
		seg[n].mr_len = min_t(u32, PAGE_SIZE - page_base, len);
		if (seg[n].mr_len > PAGE_SIZE)
			return -EIO;
		len -= seg[n].mr_len;
		++n;
		++p;
		page_base = 0;	/* page offset only applies to first page */
	}

	/* Message overflows the seg array */
	if (len && n == nsegs)
		return -EIO;

	/* When encoding the read list, the tail is always sent inline */
	if (type == rpcrdma_readch)
		return n;

	if (xdrbuf->tail[0].iov_len) {
		/* the rpcrdma protocol allows us to omit any trailing
		 * xdr pad bytes, saving the server an RDMA operation. */
		if (xdrbuf->tail[0].iov_len < 4 && xprt_rdma_pad_optimize)
			return n;
		if (n == nsegs)
			/* Tail remains, but we're out of segments */
			return -EIO;
		seg[n].mr_page = NULL;
		seg[n].mr_offset = xdrbuf->tail[0].iov_base;
		seg[n].mr_len = xdrbuf->tail[0].iov_len;
		++n;
	}

	return n;
}
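
/* Illustrative example (sizes are hypothetical, assuming 4 KiB pages):
 * converting an NFS WRITE payload for a read chunk, with pos = 128 (the
 * head is sent inline) and 8192 bytes of page data starting at byte 100
 * of the first page, yields three segments: 3996 bytes at pages[0]+100,
 * 4096 bytes at pages[1], and 100 bytes at pages[2].  The tail is not
 * converted because, for read chunks, it is always sent inline.
 */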

/*
 * Create read/write chunk lists, and reply chunks, for RDMA
 *
 *   Assume check against THRESHOLD has been done, and chunks are required.
 *   Assume only encoding one list entry for read|write chunks. The NFSv3
 *     protocol is simple enough to allow this as it only has a single "bulk
 *     result" in each procedure - complicated NFSv4 COMPOUNDs are not. (The
 *     RDMA/Sessions NFSv4 proposal addresses this for future v4 revs.)
 *
 * When used for a single reply chunk (which is a special write
 * chunk used for the entire reply, rather than just the data), it
 * is used primarily for READDIR and READLINK which would otherwise
 * be severely size-limited by a small rdma inline read max. The server
 * response will come back as an RDMA Write, followed by a message
 * of type RDMA_NOMSG carrying the xid and length. As a result, reply
 * chunks do not provide data alignment, however they do not require
 * "fixup" (moving the response to the upper layer buffer) either.
 *
 * Encoding key for single-list chunks (HLOO = Handle32 Length32 Offset64):
 *
 *  Read chunklist (a linked list):
 *   N elements, position P (same P for all chunks of same arg!):
 *    1 - PHLOO - 1 - PHLOO - ... - 1 - PHLOO - 0
 *
 *  Write chunklist (a list of (one) counted array):
 *   N elements:
 *    1 - N - HLOO - HLOO - ... - HLOO - 0
 *
 *  Reply chunk (a counted array):
 *   N elements:
 *    1 - N - HLOO - HLOO - ... - HLOO
 *
 * Returns positive RPC/RDMA header size, or negative errno.
 */
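
/* Illustrative wire example (all values hypothetical): a single read
 * chunk covering one 8192-byte segment at XDR position 36, with RDMA
 * handle 0x1234 and offset 0x10000, is encoded after the fixed header
 * as:
 *
 *   1, 36, 0x1234, 8192, 0x0000000000010000, 0, 0, 0
 *
 * i.e. one list item (discriminator 1), its PHLOO element, then the
 * read list terminator and NULL write and reply chunk lists.
 */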

static ssize_t
rpcrdma_create_chunks(struct rpc_rqst *rqst, struct xdr_buf *target,
		struct rpcrdma_msg *headerp, enum rpcrdma_chunktype type)
{
	struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
	struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(rqst->rq_xprt);
	int n, nsegs, nchunks = 0;
	unsigned int pos;
	struct rpcrdma_mr_seg *seg = req->rl_segments;
	struct rpcrdma_read_chunk *cur_rchunk = NULL;
	struct rpcrdma_write_array *warray = NULL;
	struct rpcrdma_write_chunk *cur_wchunk = NULL;
	__be32 *iptr = headerp->rm_body.rm_chunks;
	int (*map)(struct rpcrdma_xprt *, struct rpcrdma_mr_seg *, int, bool);

	if (type == rpcrdma_readch || type == rpcrdma_areadch) {
		/* a read chunk - server will RDMA Read our memory */
		cur_rchunk = (struct rpcrdma_read_chunk *) iptr;
	} else {
		/* a write or reply chunk - server will RDMA Write our memory */
		*iptr++ = xdr_zero;	/* encode a NULL read chunk list */
		if (type == rpcrdma_replych)
			*iptr++ = xdr_zero;	/* a NULL write chunk list */
		warray = (struct rpcrdma_write_array *) iptr;
		cur_wchunk = (struct rpcrdma_write_chunk *) (warray + 1);
	}

	if (type == rpcrdma_replych || type == rpcrdma_areadch)
		pos = 0;
	else
		pos = target->head[0].iov_len;

	nsegs = rpcrdma_convert_iovs(target, pos, type, seg, RPCRDMA_MAX_SEGS);
	if (nsegs < 0)
		return nsegs;

	map = r_xprt->rx_ia.ri_ops->ro_map;
	do {
		n = map(r_xprt, seg, nsegs, cur_wchunk != NULL);
		if (n <= 0)
			goto out;
		if (cur_rchunk) {	/* read */
			cur_rchunk->rc_discrim = xdr_one;
			/* all read chunks have the same "position" */
			cur_rchunk->rc_position = cpu_to_be32(pos);
			cur_rchunk->rc_target.rs_handle =
						cpu_to_be32(seg->mr_rkey);
			cur_rchunk->rc_target.rs_length =
						cpu_to_be32(seg->mr_len);
			xdr_encode_hyper(
					(__be32 *)&cur_rchunk->rc_target.rs_offset,
					seg->mr_base);
			dprintk("RPC:       %s: read chunk "
				"elem %d@0x%llx:0x%x pos %u (%s)\n", __func__,
				seg->mr_len, (unsigned long long)seg->mr_base,
				seg->mr_rkey, pos, n < nsegs ? "more" : "last");
			cur_rchunk++;
			r_xprt->rx_stats.read_chunk_count++;
		} else {		/* write/reply */
			cur_wchunk->wc_target.rs_handle =
						cpu_to_be32(seg->mr_rkey);
			cur_wchunk->wc_target.rs_length =
						cpu_to_be32(seg->mr_len);
			xdr_encode_hyper(
					(__be32 *)&cur_wchunk->wc_target.rs_offset,
					seg->mr_base);
			dprintk("RPC:       %s: %s chunk "
				"elem %d@0x%llx:0x%x (%s)\n", __func__,
				(type == rpcrdma_replych) ? "reply" : "write",
				seg->mr_len, (unsigned long long)seg->mr_base,
				seg->mr_rkey, n < nsegs ? "more" : "last");
			cur_wchunk++;
			if (type == rpcrdma_replych)
				r_xprt->rx_stats.reply_chunk_count++;
			else
				r_xprt->rx_stats.write_chunk_count++;
			r_xprt->rx_stats.total_rdma_request += seg->mr_len;
		}
		nchunks++;
		seg   += n;
		nsegs -= n;
	} while (nsegs);

	/* success. all failures return above */
	req->rl_nchunks = nchunks;

	/*
	 * finish off header. If write, marshal discrim and nchunks.
	 */
	if (cur_rchunk) {
		iptr = (__be32 *) cur_rchunk;
		*iptr++ = xdr_zero;	/* finish the read chunk list */
		*iptr++ = xdr_zero;	/* encode a NULL write chunk list */
		*iptr++ = xdr_zero;	/* encode a NULL reply chunk */
	} else {
		warray->wc_discrim = xdr_one;
		warray->wc_nchunks = cpu_to_be32(nchunks);
		iptr = (__be32 *) cur_wchunk;
		if (type == rpcrdma_writech) {
			*iptr++ = xdr_zero; /* finish the write chunk list */
			*iptr++ = xdr_zero; /* encode a NULL reply chunk */
		}
	}

	/*
	 * Return header size.
	 */
	return (unsigned char *)iptr - (unsigned char *)headerp;

out:
	for (pos = 0; nchunks--;)
		pos += r_xprt->rx_ia.ri_ops->ro_unmap(r_xprt,
						      &req->rl_segments[pos]);
	return n;
}

/*
 * Copy write data inline.
 * This function is used for "small" requests. Data which is passed
 * to RPC via iovecs (or page list) is copied directly into the
 * pre-registered memory buffer for this request. For small amounts
 * of data, this is efficient. The cutoff value is tunable.
 */
static void rpcrdma_inline_pullup(struct rpc_rqst *rqst)
{
	int i, npages, curlen;
	int copy_len;
	unsigned char *srcp, *destp;
	struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(rqst->rq_xprt);
	int page_base;
	struct page **ppages;

	destp = rqst->rq_svec[0].iov_base;
	curlen = rqst->rq_svec[0].iov_len;
	destp += curlen;

	dprintk("RPC:       %s: destp 0x%p len %d hdrlen %d\n",
		__func__, destp, rqst->rq_slen, curlen);

	copy_len = rqst->rq_snd_buf.page_len;

	if (rqst->rq_snd_buf.tail[0].iov_len) {
		curlen = rqst->rq_snd_buf.tail[0].iov_len;
		if (destp + copy_len != rqst->rq_snd_buf.tail[0].iov_base) {
			memmove(destp + copy_len,
				rqst->rq_snd_buf.tail[0].iov_base, curlen);
			r_xprt->rx_stats.pullup_copy_count += curlen;
		}
		dprintk("RPC:       %s: tail destp 0x%p len %d\n",
			__func__, destp + copy_len, curlen);
		rqst->rq_svec[0].iov_len += curlen;
	}
	r_xprt->rx_stats.pullup_copy_count += copy_len;

	page_base = rqst->rq_snd_buf.page_base;
	ppages = rqst->rq_snd_buf.pages + (page_base >> PAGE_SHIFT);
	page_base &= ~PAGE_MASK;
	npages = PAGE_ALIGN(page_base+copy_len) >> PAGE_SHIFT;
	for (i = 0; copy_len && i < npages; i++) {
		curlen = PAGE_SIZE - page_base;
		if (curlen > copy_len)
			curlen = copy_len;
		dprintk("RPC:       %s: page %d destp 0x%p len %d curlen %d\n",
			__func__, i, destp, copy_len, curlen);
		srcp = kmap_atomic(ppages[i]);
		memcpy(destp, srcp+page_base, curlen);
		kunmap_atomic(srcp);
		rqst->rq_svec[0].iov_len += curlen;
		destp += curlen;
		copy_len -= curlen;
		page_base = 0;
	}
	/* header now contains entire send message */
}

/*
 * Marshal a request: the primary job of this routine is to choose
 * the transfer modes. See comments below.
 *
 * Uses multiple RDMA IOVs for a request:
 *  [0] -- RPC RDMA header, which uses memory from the *start* of the
 *         preregistered buffer that already holds the RPC data in
 *         its middle.
 *  [1] -- the RPC header/data, marshaled by RPC and the NFS protocol.
 *  [2] -- optional padding.
 *  [3] -- if padded, header only in [1] and data here.
 *
 * Returns zero on success, otherwise a negative errno.
 */

int
rpcrdma_marshal_req(struct rpc_rqst *rqst)
{
	struct rpc_xprt *xprt = rqst->rq_xprt;
	struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
	struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
	char *base;
	size_t rpclen;
	ssize_t hdrlen;
	enum rpcrdma_chunktype rtype, wtype;
	struct rpcrdma_msg *headerp;

	/*
	 * rpclen gets amount of data in first buffer, which is the
	 * pre-registered buffer.
	 */
	base = rqst->rq_svec[0].iov_base;
	rpclen = rqst->rq_svec[0].iov_len;

	headerp = rdmab_to_msg(req->rl_rdmabuf);
	/* don't byte-swap XID, it's already done in request */
	headerp->rm_xid = rqst->rq_xid;
	headerp->rm_vers = rpcrdma_version;
	headerp->rm_credit = cpu_to_be32(r_xprt->rx_buf.rb_max_requests);
	headerp->rm_type = rdma_msg;

	/*
	 * Chunks needed for results?
	 *
	 * o Read ops return data as write chunk(s), header as inline.
	 * o If the expected result is under the inline threshold, all ops
	 *   return as inline.
	 * o Large non-read ops return as a single reply chunk.
	 */
	if (rqst->rq_rcv_buf.flags & XDRBUF_READ)
		wtype = rpcrdma_writech;
	else if (rpcrdma_results_inline(rqst))
		wtype = rpcrdma_noch;
	else
		wtype = rpcrdma_replych;

	/*
	 * Chunks needed for arguments?
	 *
	 * o If the total request is under the inline threshold, all ops
	 *   are sent as inline.
	 * o Large write ops transmit data as read chunk(s), header as
	 *   inline.
	 * o Large non-write ops are sent with the entire message as a
	 *   single read chunk (protocol 0-position special case).
	 *
	 * This assumes that the upper layer does not present a request
	 * that both has a data payload, and whose non-data arguments
	 * by themselves are larger than the inline threshold.
	 */
	if (rpcrdma_args_inline(rqst)) {
		rtype = rpcrdma_noch;
	} else if (rqst->rq_snd_buf.flags & XDRBUF_WRITE) {
		rtype = rpcrdma_readch;
	} else {
		r_xprt->rx_stats.nomsg_call_count++;
		headerp->rm_type = htonl(RDMA_NOMSG);
		rtype = rpcrdma_areadch;
		rpclen = 0;
	}

	/* The following simplification is not true forever */
	if (rtype != rpcrdma_noch && wtype == rpcrdma_replych)
		wtype = rpcrdma_noch;
	if (rtype != rpcrdma_noch && wtype != rpcrdma_noch) {
		dprintk("RPC:       %s: cannot marshal multiple chunk lists\n",
			__func__);
		return -EIO;
	}

	hdrlen = RPCRDMA_HDRLEN_MIN;

	/*
	 * Pull up any extra send data into the preregistered buffer.
	 * When padding is in use and applies to the transfer, insert
	 * it and change the message type.
	 */
	if (rtype == rpcrdma_noch) {

		rpcrdma_inline_pullup(rqst);

		headerp->rm_body.rm_nochunks.rm_empty[0] = xdr_zero;
		headerp->rm_body.rm_nochunks.rm_empty[1] = xdr_zero;
		headerp->rm_body.rm_nochunks.rm_empty[2] = xdr_zero;
		/* new length after pullup */
		rpclen = rqst->rq_svec[0].iov_len;
	} else if (rtype == rpcrdma_readch)
		rpclen += rpcrdma_tail_pullup(&rqst->rq_snd_buf);
	if (rtype != rpcrdma_noch) {
		hdrlen = rpcrdma_create_chunks(rqst, &rqst->rq_snd_buf,
					       headerp, rtype);
		wtype = rtype;	/* simplify dprintk */

	} else if (wtype != rpcrdma_noch) {
		hdrlen = rpcrdma_create_chunks(rqst, &rqst->rq_rcv_buf,
					       headerp, wtype);
	}
	if (hdrlen < 0)
		return hdrlen;

	dprintk("RPC:       %s: %s: hdrlen %zd rpclen %zd"
		" headerp 0x%p base 0x%p lkey 0x%x\n",
		__func__, transfertypes[wtype], hdrlen, rpclen,
		headerp, base, rdmab_lkey(req->rl_rdmabuf));

	/*
	 * initialize send_iov's - normally only two: rdma chunk header and
	 * single preregistered RPC header buffer, but if padding is present,
	 * then use a preregistered (and zeroed) pad buffer between the RPC
	 * header and any write data. In all non-rdma cases, any following
	 * data has been copied into the RPC header buffer.
	 */
	req->rl_send_iov[0].addr = rdmab_addr(req->rl_rdmabuf);
	req->rl_send_iov[0].length = hdrlen;
	req->rl_send_iov[0].lkey = rdmab_lkey(req->rl_rdmabuf);

	req->rl_niovs = 1;
	if (rtype == rpcrdma_areadch)
		return 0;

	req->rl_send_iov[1].addr = rdmab_addr(req->rl_sendbuf);
	req->rl_send_iov[1].length = rpclen;
	req->rl_send_iov[1].lkey = rdmab_lkey(req->rl_sendbuf);

	req->rl_niovs = 2;
	return 0;
}
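
/* Illustrative result: a fully inline call ends up as two send SGEs,
 * rl_send_iov[0] for the 28-byte RPC/RDMA header in rl_rdmabuf and
 * rl_send_iov[1] for the pulled-up RPC message in rl_sendbuf; an
 * RDMA_NOMSG call (rpcrdma_areadch) posts only the header SGE and the
 * server pulls the call body over with RDMA Read.
 */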

/*
 * Chase down a received write or reply chunklist to get length
 * RDMA'd by server. See map at rpcrdma_create_chunks()! :-)
 */
static int
rpcrdma_count_chunks(struct rpcrdma_rep *rep, unsigned int max, int wrchunk, __be32 **iptrp)
{
	unsigned int i, total_len;
	struct rpcrdma_write_chunk *cur_wchunk;
	char *base = (char *)rdmab_to_msg(rep->rr_rdmabuf);

	i = be32_to_cpu(**iptrp);
	if (i > max)
		return -1;
	cur_wchunk = (struct rpcrdma_write_chunk *) (*iptrp + 1);
	total_len = 0;
	while (i--) {
		struct rpcrdma_segment *seg = &cur_wchunk->wc_target;
		ifdebug(FACILITY) {
			u64 off;
			xdr_decode_hyper((__be32 *)&seg->rs_offset, &off);
			dprintk("RPC:       %s: chunk %d@0x%llx:0x%x\n",
				__func__,
				be32_to_cpu(seg->rs_length),
				(unsigned long long)off,
				be32_to_cpu(seg->rs_handle));
		}
		total_len += be32_to_cpu(seg->rs_length);
		++cur_wchunk;
	}
	/* check and adjust for properly terminated write chunk */
	if (wrchunk) {
		__be32 *w = (__be32 *) cur_wchunk;
		if (*w++ != xdr_zero)
			return -1;
		cur_wchunk = (struct rpcrdma_write_chunk *) w;
	}
	if ((char *)cur_wchunk > base + rep->rr_len)
		return -1;

	*iptrp = (__be32 *) cur_wchunk;
	return total_len;
}

/*
 * Scatter inline received data back into provided iov's.
 */
static void
rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len, int pad)
{
	int i, npages, curlen, olen;
	char *destp;
	struct page **ppages;
	int page_base;

	curlen = rqst->rq_rcv_buf.head[0].iov_len;
	if (curlen > copy_len) {	/* write chunk header fixup */
		curlen = copy_len;
		rqst->rq_rcv_buf.head[0].iov_len = curlen;
	}

	dprintk("RPC:       %s: srcp 0x%p len %d hdrlen %d\n",
		__func__, srcp, copy_len, curlen);

	/* Shift pointer for first receive segment only */
	rqst->rq_rcv_buf.head[0].iov_base = srcp;
	srcp += curlen;
	copy_len -= curlen;

	olen = copy_len;
	i = 0;
	rpcx_to_rdmax(rqst->rq_xprt)->rx_stats.fixup_copy_count += olen;
	page_base = rqst->rq_rcv_buf.page_base;
	ppages = rqst->rq_rcv_buf.pages + (page_base >> PAGE_SHIFT);
	page_base &= ~PAGE_MASK;

	if (copy_len && rqst->rq_rcv_buf.page_len) {
		npages = PAGE_ALIGN(page_base +
			rqst->rq_rcv_buf.page_len) >> PAGE_SHIFT;
		for (; i < npages; i++) {
			curlen = PAGE_SIZE - page_base;
			if (curlen > copy_len)
				curlen = copy_len;
			dprintk("RPC:       %s: page %d"
				" srcp 0x%p len %d curlen %d\n",
				__func__, i, srcp, copy_len, curlen);
			destp = kmap_atomic(ppages[i]);
			memcpy(destp + page_base, srcp, curlen);
			flush_dcache_page(ppages[i]);
			kunmap_atomic(destp);
			srcp += curlen;
			copy_len -= curlen;
			if (copy_len == 0)
				break;
			page_base = 0;
		}
	}

	if (copy_len && rqst->rq_rcv_buf.tail[0].iov_len) {
		curlen = copy_len;
		if (curlen > rqst->rq_rcv_buf.tail[0].iov_len)
			curlen = rqst->rq_rcv_buf.tail[0].iov_len;
		if (rqst->rq_rcv_buf.tail[0].iov_base != srcp)
			memmove(rqst->rq_rcv_buf.tail[0].iov_base, srcp, curlen);
		dprintk("RPC:       %s: tail srcp 0x%p len %d curlen %d\n",
			__func__, srcp, copy_len, curlen);
		rqst->rq_rcv_buf.tail[0].iov_len = curlen;
		copy_len -= curlen; ++i;
	} else
		rqst->rq_rcv_buf.tail[0].iov_len = 0;

	if (pad) {
		/* implicit padding on terminal chunk */
		unsigned char *p = rqst->rq_rcv_buf.tail[0].iov_base;
		while (pad--)
			p[rqst->rq_rcv_buf.tail[0].iov_len++] = 0;
	}

	if (copy_len)
		dprintk("RPC:       %s: %d bytes in"
			" %d extra segments (%d lost)\n",
			__func__, olen, i, copy_len);

	/* TBD avoid a warning from call_decode() */
	rqst->rq_private_buf = rqst->rq_rcv_buf;
}
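
/* Illustrative flow: the receive buffer is not copied wholesale; the
 * head iovec is simply re-pointed at the start of the inline data,
 * the remaining bytes are copied into the reply's page list and then
 * its tail, and "pad" zero bytes are appended to the tail to round
 * out a terminal chunk whose length is not a multiple of four.
 */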

void
rpcrdma_connect_worker(struct work_struct *work)
{
	struct rpcrdma_ep *ep =
		container_of(work, struct rpcrdma_ep, rep_connect_worker.work);
	struct rpcrdma_xprt *r_xprt =
		container_of(ep, struct rpcrdma_xprt, rx_ep);
	struct rpc_xprt *xprt = &r_xprt->rx_xprt;

	spin_lock_bh(&xprt->transport_lock);
	if (++xprt->connect_cookie == 0)	/* maintain a reserved value */
		++xprt->connect_cookie;
	if (ep->rep_connected > 0) {
		if (!xprt_test_and_set_connected(xprt))
			xprt_wake_pending_tasks(xprt, 0);
	} else {
		if (xprt_test_and_clear_connected(xprt))
			xprt_wake_pending_tasks(xprt, -ENOTCONN);
	}
	spin_unlock_bh(&xprt->transport_lock);
}

/*
 * This function is called when an async event is posted to
 * the connection which changes the connection state. All it
 * does at this point is mark the connection up/down, the rpc
 * timers do the rest.
 */
void
rpcrdma_conn_func(struct rpcrdma_ep *ep)
{
	schedule_delayed_work(&ep->rep_connect_worker, 0);
}

/*
 * Called as a tasklet to do req/reply match and complete a request
 * Errors must result in the RPC task either being awakened, or
 * allowed to timeout, to discover the errors at that time.
 */
void
rpcrdma_reply_handler(struct rpcrdma_rep *rep)
{
	struct rpcrdma_msg *headerp;
	struct rpcrdma_req *req;
	struct rpc_rqst *rqst;
	struct rpcrdma_xprt *r_xprt = rep->rr_rxprt;
	struct rpc_xprt *xprt = &r_xprt->rx_xprt;
	__be32 *iptr;
	int rdmalen, status;
	unsigned long cwnd;
	u32 credits;

	dprintk("RPC:       %s: incoming rep %p\n", __func__, rep);

	if (rep->rr_len == RPCRDMA_BAD_LEN)
		goto out_badstatus;
	if (rep->rr_len < RPCRDMA_HDRLEN_MIN)
		goto out_shortreply;

	headerp = rdmab_to_msg(rep->rr_rdmabuf);
	if (headerp->rm_vers != rpcrdma_version)
		goto out_badversion;

	/* Get XID and try for a match. */
	spin_lock(&xprt->transport_lock);
	rqst = xprt_lookup_rqst(xprt, headerp->rm_xid);
	if (!rqst)
		goto out_nomatch;

	/* get request object */
	req = rpcr_to_rdmar(rqst);
	if (req->rl_reply)
		goto out_duplicate;

	dprintk("RPC:       %s: reply 0x%p completes request 0x%p\n"
		"                   RPC request 0x%p xid 0x%08x\n",
			__func__, rep, req, rqst,
			be32_to_cpu(headerp->rm_xid));

	/* from here on, the reply is no longer an orphan */
	req->rl_reply = rep;
	xprt->reestablish_timeout = 0;

	/* check for expected message types */
	/* The order of some of these tests is important. */
	switch (headerp->rm_type) {
	case rdma_msg:
		/* never expect read chunks */
		/* never expect reply chunks (two ways to check) */
		/* never expect write chunks without having offered RDMA */
		if (headerp->rm_body.rm_chunks[0] != xdr_zero ||
		    (headerp->rm_body.rm_chunks[1] == xdr_zero &&
		     headerp->rm_body.rm_chunks[2] != xdr_zero) ||
		    (headerp->rm_body.rm_chunks[1] != xdr_zero &&
		     req->rl_nchunks == 0))
			goto badheader;
		if (headerp->rm_body.rm_chunks[1] != xdr_zero) {
			/* count any expected write chunks in read reply */
			/* start at write chunk array count */
			iptr = &headerp->rm_body.rm_chunks[2];
			rdmalen = rpcrdma_count_chunks(rep,
						req->rl_nchunks, 1, &iptr);
			/* check for validity, and no reply chunk after */
			if (rdmalen < 0 || *iptr++ != xdr_zero)
				goto badheader;
			rep->rr_len -=
			    ((unsigned char *)iptr - (unsigned char *)headerp);
			status = rep->rr_len + rdmalen;
			r_xprt->rx_stats.total_rdma_reply += rdmalen;
			/* special case - last chunk may omit padding */
			if (rdmalen &= 3) {
				rdmalen = 4 - rdmalen;
				status += rdmalen;
			}
		} else {
			/* else ordinary inline */
			rdmalen = 0;
			iptr = (__be32 *)((unsigned char *)headerp +
							RPCRDMA_HDRLEN_MIN);
			rep->rr_len -= RPCRDMA_HDRLEN_MIN;
			status = rep->rr_len;
		}
		/* Fix up the rpc results for upper layer */
		rpcrdma_inline_fixup(rqst, (char *)iptr, rep->rr_len, rdmalen);
		break;

	case rdma_nomsg:
		/* never expect read or write chunks, always reply chunks */
		if (headerp->rm_body.rm_chunks[0] != xdr_zero ||
		    headerp->rm_body.rm_chunks[1] != xdr_zero ||
		    headerp->rm_body.rm_chunks[2] != xdr_one ||
		    req->rl_nchunks == 0)
			goto badheader;
		iptr = (__be32 *)((unsigned char *)headerp +
							RPCRDMA_HDRLEN_MIN);
		rdmalen = rpcrdma_count_chunks(rep, req->rl_nchunks, 0, &iptr);
		if (rdmalen < 0)
			goto badheader;
		r_xprt->rx_stats.total_rdma_reply += rdmalen;
		/* Reply chunk buffer already is the reply vector - no fixup. */
		status = rdmalen;
		break;

badheader:
	default:
		dprintk("%s: invalid rpcrdma reply header (type %d):"
				" chunks[012] == %d %d %d"
				" expected chunks <= %d\n",
				__func__, be32_to_cpu(headerp->rm_type),
				headerp->rm_body.rm_chunks[0],
				headerp->rm_body.rm_chunks[1],
				headerp->rm_body.rm_chunks[2],
				req->rl_nchunks);
		status = -EIO;
		r_xprt->rx_stats.bad_reply_count++;
		break;
	}

	credits = be32_to_cpu(headerp->rm_credit);
	if (credits == 0)
		credits = 1;	/* don't deadlock */
	else if (credits > r_xprt->rx_buf.rb_max_requests)
		credits = r_xprt->rx_buf.rb_max_requests;

	cwnd = xprt->cwnd;
	xprt->cwnd = credits << RPC_CWNDSHIFT;
	if (xprt->cwnd > cwnd)
		xprt_release_rqst_cong(rqst->rq_task);

	xprt_complete_rqst(rqst->rq_task, status);
	spin_unlock(&xprt->transport_lock);
	dprintk("RPC:       %s: xprt_complete_rqst(0x%p, 0x%p, %d)\n",
			__func__, xprt, rqst, status);
	return;

out_badstatus:
	rpcrdma_recv_buffer_put(rep);
	if (r_xprt->rx_ep.rep_connected == 1) {
		r_xprt->rx_ep.rep_connected = -EIO;
		rpcrdma_conn_func(&r_xprt->rx_ep);
	}
	return;

out_shortreply:
	dprintk("RPC:       %s: short/invalid reply\n", __func__);
	goto repost;

out_badversion:
	dprintk("RPC:       %s: invalid version %d\n",
		__func__, be32_to_cpu(headerp->rm_vers));
	goto repost;

out_nomatch:
	spin_unlock(&xprt->transport_lock);
	dprintk("RPC:       %s: no match for incoming xid 0x%08x len %d\n",
		__func__, be32_to_cpu(headerp->rm_xid),
		rep->rr_len);
	goto repost;

out_duplicate:
	spin_unlock(&xprt->transport_lock);
	dprintk("RPC:       %s: "
		"duplicate reply %p to RPC request %p: xid 0x%08x\n",
		__func__, rep, req, be32_to_cpu(headerp->rm_xid));

repost:
	r_xprt->rx_stats.bad_reply_count++;
	if (rpcrdma_ep_post_recv(&r_xprt->rx_ia, &r_xprt->rx_ep, rep))
		rpcrdma_recv_buffer_put(rep);
}