/*
 * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the BSD-type
 * license below:
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 *      Redistributions of source code must retain the above copyright
 *      notice, this list of conditions and the following disclaimer.
 *
 *      Redistributions in binary form must reproduce the above
 *      copyright notice, this list of conditions and the following
 *      disclaimer in the documentation and/or other materials provided
 *      with the distribution.
 *
 *      Neither the name of the Network Appliance, Inc. nor the names of
 *      its contributors may be used to endorse or promote products
 *      derived from this software without specific prior written
 *      permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * rpc_rdma.c
 *
 * This file contains the guts of the RPC RDMA protocol, and
 * does marshaling/unmarshaling, etc. It is also where the interface
 * to the Linux RPC framework lives.
 */

#include "xprt_rdma.h"

#include <linux/highmem.h>

#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
# define RPCDBG_FACILITY	RPCDBG_TRANS
#endif

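/*
 * How a request or reply crosses the transport: entirely inline, or
 * with argument or result data carried in read, write, or reply
 * chunks (see transfertypes[] below for the corresponding names).
 */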
enum rpcrdma_chunktype {
	rpcrdma_noch = 0,
	rpcrdma_readch,
	rpcrdma_areadch,
	rpcrdma_writech,
	rpcrdma_replych
};

#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
static const char transfertypes[][12] = {
	"pure inline",	/* no chunks */
	" read chunk",	/* some argument via rdma read */
	"*read chunk",	/* entire request via rdma read */
	"write chunk",	/* some result via rdma write */
	"reply chunk"	/* entire reply via rdma write */
};
#endif

/* The client can send a request inline as long as the RPCRDMA header
 * plus the RPC call fit under the transport's inline limit. If the
 * combined call message size exceeds that limit, the client must use
 * the read chunk list for this operation.
 */
static bool rpcrdma_args_inline(struct rpc_rqst *rqst)
{
	unsigned int callsize = RPCRDMA_HDRLEN_MIN + rqst->rq_snd_buf.len;

	return callsize <= RPCRDMA_INLINE_WRITE_THRESHOLD(rqst);
}

/* The client can't know how large the actual reply will be. Thus it
 * plans for the largest possible reply for that particular ULP
 * operation. If the maximum combined reply message size exceeds that
 * limit, the client must provide a write list or a reply chunk for
 * this request.
 */
static bool rpcrdma_results_inline(struct rpc_rqst *rqst)
{
	unsigned int repsize = RPCRDMA_HDRLEN_MIN + rqst->rq_rcv_buf.buflen;

	return repsize <= RPCRDMA_INLINE_READ_THRESHOLD(rqst);
}

/*
 * Chunk assembly from upper layer xdr_buf.
 *
 * Convert the passed-in xdr_buf into a set of RPC/RDMA chunk
 * elements (segments). Segments are then coalesced when registered,
 * if possible within the selected memreg mode.
 *
 * Returns positive number of segments converted, or a negative errno.
 */

static int
rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos,
	enum rpcrdma_chunktype type, struct rpcrdma_mr_seg *seg, int nsegs)
{
	int len, n = 0, p;
	int page_base;
	struct page **ppages;

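	/* If the chunk starts at XDR position zero, head[0] is included
	 * as the first segment. */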
	if (pos == 0 && xdrbuf->head[0].iov_len) {
		seg[n].mr_page = NULL;
		seg[n].mr_offset = xdrbuf->head[0].iov_base;
		seg[n].mr_len = xdrbuf->head[0].iov_len;
		++n;
	}

	len = xdrbuf->page_len;
	ppages = xdrbuf->pages + (xdrbuf->page_base >> PAGE_SHIFT);
	page_base = xdrbuf->page_base & ~PAGE_MASK;
	p = 0;
	while (len && n < nsegs) {
		if (!ppages[p]) {
			/* alloc the pagelist for receiving buffer */
			ppages[p] = alloc_page(GFP_ATOMIC);
			if (!ppages[p])
				return -ENOMEM;
		}
		seg[n].mr_page = ppages[p];
		seg[n].mr_offset = (void *)(unsigned long) page_base;
		seg[n].mr_len = min_t(u32, PAGE_SIZE - page_base, len);
		if (seg[n].mr_len > PAGE_SIZE)
			return -EIO;
		len -= seg[n].mr_len;
		++n;
		++p;
		page_base = 0;	/* page offset only applies to first page */
	}

	/* Message overflows the seg array */
	if (len && n == nsegs)
		return -EIO;

	if (xdrbuf->tail[0].iov_len) {
		/* the rpcrdma protocol allows us to omit any trailing
		 * xdr pad bytes, saving the server an RDMA operation. */
		if (xdrbuf->tail[0].iov_len < 4 && xprt_rdma_pad_optimize)
			return n;
		if (n == nsegs)
			/* Tail remains, but we're out of segments */
			return -EIO;
		seg[n].mr_page = NULL;
		seg[n].mr_offset = xdrbuf->tail[0].iov_base;
		seg[n].mr_len = xdrbuf->tail[0].iov_len;
		++n;
	}

	return n;
}

/*
 * Create read/write chunk lists, and reply chunks, for RDMA
 *
 *   Assume check against THRESHOLD has been done, and chunks are required.
 *   Assume only one list entry is encoded per read|write chunk list. The
 *     NFSv3 protocol is simple enough to allow this, as it has only a
 *     single "bulk result" in each procedure; complicated NFSv4 COMPOUNDs
 *     are not. (The RDMA/Sessions NFSv4 proposal addresses this for
 *     future v4 revs.)
 *
 * A single reply chunk (a special write chunk covering the entire
 * reply, rather than just the data) is used primarily for READDIR and
 * READLINK, which would otherwise be severely size-limited by a small
 * rdma inline read max. The server response will come back as an RDMA
 * Write, followed by a message of type RDMA_NOMSG carrying the xid and
 * length. As a result, reply chunks do not provide data alignment;
 * however, they do not require "fixup" (moving the response to the
 * upper layer buffer) either.
 *
 * Encoding key for single-list chunks (HLOO = Handle32 Length32 Offset64):
 *
 *  Read chunklist (a linked list):
 *   N elements, position P (same P for all chunks of same arg!):
 *    1 - PHLOO - 1 - PHLOO - ... - 1 - PHLOO - 0
 *
 *  Write chunklist (a list of (one) counted array):
 *   N elements:
 *    1 - N - HLOO - HLOO - ... - HLOO - 0
 *
 *  Reply chunk (a counted array):
 *   N elements:
 *    1 - N - HLOO - HLOO - ... - HLOO
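 *
 *  For example, a read chunk list with two segments of the same
 *  argument, both at XDR position 36, would be encoded as:
 *    1 - 36,H1,L1,O1 - 1 - 36,H2,L2,O2 - 0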
 *
 * Returns positive RPC/RDMA header size, or negative errno.
 */

static ssize_t
rpcrdma_create_chunks(struct rpc_rqst *rqst, struct xdr_buf *target,
		struct rpcrdma_msg *headerp, enum rpcrdma_chunktype type)
{
	struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
	struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(rqst->rq_xprt);
	int n, nsegs, nchunks = 0;
	unsigned int pos;
	struct rpcrdma_mr_seg *seg = req->rl_segments;
	struct rpcrdma_read_chunk *cur_rchunk = NULL;
	struct rpcrdma_write_array *warray = NULL;
	struct rpcrdma_write_chunk *cur_wchunk = NULL;
	__be32 *iptr = headerp->rm_body.rm_chunks;
	int (*map)(struct rpcrdma_xprt *, struct rpcrdma_mr_seg *, int, bool);

	if (type == rpcrdma_readch || type == rpcrdma_areadch) {
		/* a read chunk - server will RDMA Read our memory */
		cur_rchunk = (struct rpcrdma_read_chunk *) iptr;
	} else {
		/* a write or reply chunk - server will RDMA Write our memory */
		*iptr++ = xdr_zero;	/* encode a NULL read chunk list */
		if (type == rpcrdma_replych)
			*iptr++ = xdr_zero;	/* a NULL write chunk list */
		warray = (struct rpcrdma_write_array *) iptr;
		cur_wchunk = (struct rpcrdma_write_chunk *) (warray + 1);
	}

	if (type == rpcrdma_replych || type == rpcrdma_areadch)
		pos = 0;
	else
		pos = target->head[0].iov_len;

	nsegs = rpcrdma_convert_iovs(target, pos, type, seg, RPCRDMA_MAX_SEGS);
	if (nsegs < 0)
		return nsegs;

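	/* Register the segments with the selected memory registration
	 * strategy; each ro_map call may coalesce and consume more than
	 * one segment. */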
	map = r_xprt->rx_ia.ri_ops->ro_map;
	do {
		n = map(r_xprt, seg, nsegs, cur_wchunk != NULL);
		if (n <= 0)
			goto out;
		if (cur_rchunk) {	/* read */
			cur_rchunk->rc_discrim = xdr_one;
			/* all read chunks have the same "position" */
			cur_rchunk->rc_position = cpu_to_be32(pos);
			cur_rchunk->rc_target.rs_handle =
						cpu_to_be32(seg->mr_rkey);
			cur_rchunk->rc_target.rs_length =
						cpu_to_be32(seg->mr_len);
			xdr_encode_hyper(
					(__be32 *)&cur_rchunk->rc_target.rs_offset,
					seg->mr_base);
			dprintk("RPC:       %s: read chunk "
				"elem %d@0x%llx:0x%x pos %u (%s)\n", __func__,
				seg->mr_len, (unsigned long long)seg->mr_base,
				seg->mr_rkey, pos, n < nsegs ? "more" : "last");
			cur_rchunk++;
			r_xprt->rx_stats.read_chunk_count++;
		} else {		/* write/reply */
			cur_wchunk->wc_target.rs_handle =
						cpu_to_be32(seg->mr_rkey);
			cur_wchunk->wc_target.rs_length =
						cpu_to_be32(seg->mr_len);
			xdr_encode_hyper(
					(__be32 *)&cur_wchunk->wc_target.rs_offset,
					seg->mr_base);
			dprintk("RPC:       %s: %s chunk "
				"elem %d@0x%llx:0x%x (%s)\n", __func__,
				(type == rpcrdma_replych) ? "reply" : "write",
				seg->mr_len, (unsigned long long)seg->mr_base,
				seg->mr_rkey, n < nsegs ? "more" : "last");
			cur_wchunk++;
			if (type == rpcrdma_replych)
				r_xprt->rx_stats.reply_chunk_count++;
			else
				r_xprt->rx_stats.write_chunk_count++;
			r_xprt->rx_stats.total_rdma_request += seg->mr_len;
		}
		nchunks++;
		seg   += n;
		nsegs -= n;
	} while (nsegs);

	/* success. all failures return above */
	req->rl_nchunks = nchunks;

	/*
	 * finish off header. If write, marshal discrim and nchunks.
	 */
	if (cur_rchunk) {
		iptr = (__be32 *) cur_rchunk;
		*iptr++ = xdr_zero;	/* finish the read chunk list */
		*iptr++ = xdr_zero;	/* encode a NULL write chunk list */
		*iptr++ = xdr_zero;	/* encode a NULL reply chunk */
	} else {
		warray->wc_discrim = xdr_one;
		warray->wc_nchunks = cpu_to_be32(nchunks);
		iptr = (__be32 *) cur_wchunk;
		if (type == rpcrdma_writech) {
			*iptr++ = xdr_zero; /* finish the write chunk list */
			*iptr++ = xdr_zero; /* encode a NULL reply chunk */
		}
	}

	/*
	 * Return header size.
	 */
	return (unsigned char *)iptr - (unsigned char *)headerp;

out:
	for (pos = 0; nchunks--;)
		pos += r_xprt->rx_ia.ri_ops->ro_unmap(r_xprt,
						      &req->rl_segments[pos]);
	return n;
}

/*
 * Copy write data inline.
 * This function is used for "small" requests. Data which is passed
 * to RPC via iovecs (or page list) is copied directly into the
 * pre-registered memory buffer for this request. For small amounts
 * of data, this is efficient. The cutoff value is tunable.
 */
static void rpcrdma_inline_pullup(struct rpc_rqst *rqst)
{
	int i, npages, curlen;
	int copy_len;
	unsigned char *srcp, *destp;
	struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(rqst->rq_xprt);
	int page_base;
	struct page **ppages;

	destp = rqst->rq_svec[0].iov_base;
	curlen = rqst->rq_svec[0].iov_len;
	destp += curlen;

	dprintk("RPC:       %s: destp 0x%p len %d hdrlen %d\n",
		__func__, destp, rqst->rq_slen, curlen);

	copy_len = rqst->rq_snd_buf.page_len;

	if (rqst->rq_snd_buf.tail[0].iov_len) {
		curlen = rqst->rq_snd_buf.tail[0].iov_len;
		if (destp + copy_len != rqst->rq_snd_buf.tail[0].iov_base) {
			memmove(destp + copy_len,
				rqst->rq_snd_buf.tail[0].iov_base, curlen);
			r_xprt->rx_stats.pullup_copy_count += curlen;
		}
		dprintk("RPC:       %s: tail destp 0x%p len %d\n",
			__func__, destp + copy_len, curlen);
		rqst->rq_svec[0].iov_len += curlen;
	}
	r_xprt->rx_stats.pullup_copy_count += copy_len;

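	/* Copy the send buffer's page list into the pre-registered
	 * buffer, directly after the RPC header data already there. */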
	page_base = rqst->rq_snd_buf.page_base;
	ppages = rqst->rq_snd_buf.pages + (page_base >> PAGE_SHIFT);
	page_base &= ~PAGE_MASK;
	npages = PAGE_ALIGN(page_base+copy_len) >> PAGE_SHIFT;
	for (i = 0; copy_len && i < npages; i++) {
		curlen = PAGE_SIZE - page_base;
		if (curlen > copy_len)
			curlen = copy_len;
		dprintk("RPC:       %s: page %d destp 0x%p len %d curlen %d\n",
			__func__, i, destp, copy_len, curlen);
		srcp = kmap_atomic(ppages[i]);
		memcpy(destp, srcp+page_base, curlen);
		kunmap_atomic(srcp);
		rqst->rq_svec[0].iov_len += curlen;
		destp += curlen;
		copy_len -= curlen;
		page_base = 0;
	}
	/* header now contains entire send message */
}

/*
 * Marshal a request: the primary job of this routine is to choose
 * the transfer modes. See comments below.
 *
 * Uses multiple RDMA IOVs for a request:
 *  [0] -- RPC RDMA header, which uses memory from the *start* of the
 *         preregistered buffer that already holds the RPC data in
 *         its middle.
 *  [1] -- the RPC header/data, marshaled by RPC and the NFS protocol.
 *  [2] -- optional padding.
 *  [3] -- if padded, header only in [1] and data here.
 *
 * Returns zero on success, otherwise a negative errno.
 */

int
rpcrdma_marshal_req(struct rpc_rqst *rqst)
{
	struct rpc_xprt *xprt = rqst->rq_xprt;
	struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
	struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
	char *base;
	size_t rpclen;
	ssize_t hdrlen;
	enum rpcrdma_chunktype rtype, wtype;
	struct rpcrdma_msg *headerp;

	/*
	 * rpclen gets amount of data in first buffer, which is the
	 * pre-registered buffer.
	 */
	base = rqst->rq_svec[0].iov_base;
	rpclen = rqst->rq_svec[0].iov_len;

	headerp = rdmab_to_msg(req->rl_rdmabuf);
	/* don't byte-swap XID, it's already done in request */
	headerp->rm_xid = rqst->rq_xid;
	headerp->rm_vers = rpcrdma_version;
	headerp->rm_credit = cpu_to_be32(r_xprt->rx_buf.rb_max_requests);
	headerp->rm_type = rdma_msg;

	/*
	 * Chunks needed for results?
	 *
	 * o If the expected result is under the inline threshold, all ops
	 *   return as inline (but see later).
	 * o Large non-read ops return as a single reply chunk.
	 * o Large read ops return data as write chunk(s), header as inline.
	 *
	 * Note: the NFS code sending down multiple result segments implies
	 * the op is one of read, readdir[plus], readlink or NFSv4 getacl.
	 */

	/*
	 * This code can handle read chunks, write chunks OR reply
	 * chunks -- only one type. If the request is too big to fit
	 * inline, then we will choose read chunks. If the request is
	 * a READ, then use write chunks to separate the file data
	 * into pages; otherwise use reply chunks.
	 */
	if (rpcrdma_results_inline(rqst))
		wtype = rpcrdma_noch;
	else if (rqst->rq_rcv_buf.page_len == 0)
		wtype = rpcrdma_replych;
	else if (rqst->rq_rcv_buf.flags & XDRBUF_READ)
		wtype = rpcrdma_writech;
	else
		wtype = rpcrdma_replych;

	/*
	 * Chunks needed for arguments?
	 *
	 * o If the total request is under the inline threshold, all ops
	 *   are sent as inline.
	 * o Large non-write ops are sent with the entire message as a
	 *   single read chunk (protocol 0-position special case).
	 * o Large write ops transmit data as read chunk(s), header as
	 *   inline.
	 *
	 * Note: the NFS code sending down multiple argument segments
	 * implies the op is a write.
	 * TBD check NFSv4 setacl
	 */
	if (rpcrdma_args_inline(rqst))
		rtype = rpcrdma_noch;
	else if (rqst->rq_snd_buf.page_len == 0)
		rtype = rpcrdma_areadch;
	else
		rtype = rpcrdma_readch;

	/* The following simplification is not true forever */
	if (rtype != rpcrdma_noch && wtype == rpcrdma_replych)
		wtype = rpcrdma_noch;
	if (rtype != rpcrdma_noch && wtype != rpcrdma_noch) {
		dprintk("RPC:       %s: cannot marshal multiple chunk lists\n",
			__func__);
		return -EIO;
	}

	hdrlen = RPCRDMA_HDRLEN_MIN;

	/*
	 * Pull up any extra send data into the preregistered buffer.
	 * When padding is in use and applies to the transfer, insert
	 * it and change the message type.
	 */
	if (rtype == rpcrdma_noch) {

		rpcrdma_inline_pullup(rqst);

		headerp->rm_body.rm_nochunks.rm_empty[0] = xdr_zero;
		headerp->rm_body.rm_nochunks.rm_empty[1] = xdr_zero;
		headerp->rm_body.rm_nochunks.rm_empty[2] = xdr_zero;
		/* new length after pullup */
		rpclen = rqst->rq_svec[0].iov_len;
		/* Currently we try to not actually use read inline.
		 * Reply chunks have the desirable property that
		 * they land, packed, directly in the target buffers
		 * without headers, so they require no fixup. The
		 * additional RDMA Write op sends the same amount
		 * of data, streams on-the-wire and adds no overhead
		 * on receive. Therefore, we request a reply chunk
		 * for non-writes wherever feasible and efficient.
		 */
		if (wtype == rpcrdma_noch)
			wtype = rpcrdma_replych;
	}

	if (rtype != rpcrdma_noch) {
		hdrlen = rpcrdma_create_chunks(rqst, &rqst->rq_snd_buf,
					       headerp, rtype);
		wtype = rtype;	/* simplify dprintk */

	} else if (wtype != rpcrdma_noch) {
		hdrlen = rpcrdma_create_chunks(rqst, &rqst->rq_rcv_buf,
					       headerp, wtype);
	}
	if (hdrlen < 0)
		return hdrlen;

	dprintk("RPC:       %s: %s: hdrlen %zd rpclen %zd"
		" headerp 0x%p base 0x%p lkey 0x%x\n",
		__func__, transfertypes[wtype], hdrlen, rpclen,
		headerp, base, rdmab_lkey(req->rl_rdmabuf));

	/*
	 * initialize send_iov's - normally only two: rdma chunk header and
	 * single preregistered RPC header buffer, but if padding is present,
	 * then use a preregistered (and zeroed) pad buffer between the RPC
	 * header and any write data. In all non-rdma cases, any following
	 * data has been copied into the RPC header buffer.
	 */
	req->rl_send_iov[0].addr = rdmab_addr(req->rl_rdmabuf);
	req->rl_send_iov[0].length = hdrlen;
	req->rl_send_iov[0].lkey = rdmab_lkey(req->rl_rdmabuf);

	req->rl_send_iov[1].addr = rdmab_addr(req->rl_sendbuf);
	req->rl_send_iov[1].length = rpclen;
	req->rl_send_iov[1].lkey = rdmab_lkey(req->rl_sendbuf);

	req->rl_niovs = 2;
	return 0;
}

/*
 * Chase down a received write or reply chunklist to get length
 * RDMA'd by server. See map at rpcrdma_create_chunks()! :-)
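 * On return, *iptrp is advanced past the decoded chunk list.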
 */
static int
rpcrdma_count_chunks(struct rpcrdma_rep *rep, unsigned int max, int wrchunk, __be32 **iptrp)
{
	unsigned int i, total_len;
	struct rpcrdma_write_chunk *cur_wchunk;
	char *base = (char *)rdmab_to_msg(rep->rr_rdmabuf);

	i = be32_to_cpu(**iptrp);
	if (i > max)
		return -1;
	cur_wchunk = (struct rpcrdma_write_chunk *) (*iptrp + 1);
	total_len = 0;
	while (i--) {
		struct rpcrdma_segment *seg = &cur_wchunk->wc_target;
		ifdebug(FACILITY) {
			u64 off;
			xdr_decode_hyper((__be32 *)&seg->rs_offset, &off);
			dprintk("RPC:       %s: chunk %d@0x%llx:0x%x\n",
				__func__,
				be32_to_cpu(seg->rs_length),
				(unsigned long long)off,
				be32_to_cpu(seg->rs_handle));
		}
		total_len += be32_to_cpu(seg->rs_length);
		++cur_wchunk;
	}
	/* check and adjust for properly terminated write chunk */
	if (wrchunk) {
		__be32 *w = (__be32 *) cur_wchunk;
		if (*w++ != xdr_zero)
			return -1;
		cur_wchunk = (struct rpcrdma_write_chunk *) w;
	}
	if ((char *)cur_wchunk > base + rep->rr_len)
		return -1;

	*iptrp = (__be32 *) cur_wchunk;
	return total_len;
}

/*
 * Scatter inline received data back into provided iov's.
 */
static void
rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len, int pad)
{
	int i, npages, curlen, olen;
	char *destp;
	struct page **ppages;
	int page_base;

	curlen = rqst->rq_rcv_buf.head[0].iov_len;
	if (curlen > copy_len) {	/* write chunk header fixup */
		curlen = copy_len;
		rqst->rq_rcv_buf.head[0].iov_len = curlen;
	}

	dprintk("RPC:       %s: srcp 0x%p len %d hdrlen %d\n",
		__func__, srcp, copy_len, curlen);

	/* Shift pointer for first receive segment only */
	rqst->rq_rcv_buf.head[0].iov_base = srcp;
	srcp += curlen;
	copy_len -= curlen;

	olen = copy_len;
	i = 0;
	rpcx_to_rdmax(rqst->rq_xprt)->rx_stats.fixup_copy_count += olen;
	page_base = rqst->rq_rcv_buf.page_base;
	ppages = rqst->rq_rcv_buf.pages + (page_base >> PAGE_SHIFT);
	page_base &= ~PAGE_MASK;

	if (copy_len && rqst->rq_rcv_buf.page_len) {
		npages = PAGE_ALIGN(page_base +
			rqst->rq_rcv_buf.page_len) >> PAGE_SHIFT;
		for (; i < npages; i++) {
			curlen = PAGE_SIZE - page_base;
			if (curlen > copy_len)
				curlen = copy_len;
			dprintk("RPC:       %s: page %d"
				" srcp 0x%p len %d curlen %d\n",
				__func__, i, srcp, copy_len, curlen);
			destp = kmap_atomic(ppages[i]);
			memcpy(destp + page_base, srcp, curlen);
			flush_dcache_page(ppages[i]);
			kunmap_atomic(destp);
			srcp += curlen;
			copy_len -= curlen;
			if (copy_len == 0)
				break;
			page_base = 0;
		}
	}

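	/* Any remaining inline data is copied into the tail iovec. */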
	if (copy_len && rqst->rq_rcv_buf.tail[0].iov_len) {
		curlen = copy_len;
		if (curlen > rqst->rq_rcv_buf.tail[0].iov_len)
			curlen = rqst->rq_rcv_buf.tail[0].iov_len;
		if (rqst->rq_rcv_buf.tail[0].iov_base != srcp)
			memmove(rqst->rq_rcv_buf.tail[0].iov_base, srcp, curlen);
		dprintk("RPC:       %s: tail srcp 0x%p len %d curlen %d\n",
			__func__, srcp, copy_len, curlen);
		rqst->rq_rcv_buf.tail[0].iov_len = curlen;
		copy_len -= curlen; ++i;
	} else
		rqst->rq_rcv_buf.tail[0].iov_len = 0;

	if (pad) {
		/* implicit padding on terminal chunk */
		unsigned char *p = rqst->rq_rcv_buf.tail[0].iov_base;
		while (pad--)
			p[rqst->rq_rcv_buf.tail[0].iov_len++] = 0;
	}

	if (copy_len)
		dprintk("RPC:       %s: %d bytes in"
			" %d extra segments (%d lost)\n",
			__func__, olen, i, copy_len);

	/* TBD avoid a warning from call_decode() */
	rqst->rq_private_buf = rqst->rq_rcv_buf;
}

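/*
 * Process a deferred connection state change: bring the generic
 * transport's connected state in line with the RDMA endpoint and
 * wake any RPC tasks waiting on the transport.
 */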
void
rpcrdma_connect_worker(struct work_struct *work)
{
	struct rpcrdma_ep *ep =
		container_of(work, struct rpcrdma_ep, rep_connect_worker.work);
	struct rpcrdma_xprt *r_xprt =
		container_of(ep, struct rpcrdma_xprt, rx_ep);
	struct rpc_xprt *xprt = &r_xprt->rx_xprt;

	spin_lock_bh(&xprt->transport_lock);
	if (++xprt->connect_cookie == 0)	/* maintain a reserved value */
		++xprt->connect_cookie;
	if (ep->rep_connected > 0) {
		if (!xprt_test_and_set_connected(xprt))
			xprt_wake_pending_tasks(xprt, 0);
	} else {
		if (xprt_test_and_clear_connected(xprt))
			xprt_wake_pending_tasks(xprt, -ENOTCONN);
	}
	spin_unlock_bh(&xprt->transport_lock);
}

/*
 * This function is called when an async event is posted to
 * the connection which changes the connection state. All it
 * does at this point is mark the connection up or down; the RPC
 * timers do the rest.
 */
void
rpcrdma_conn_func(struct rpcrdma_ep *ep)
{
	schedule_delayed_work(&ep->rep_connect_worker, 0);
}

/*
 * Called as a tasklet to do req/reply match and complete a request.
 * Errors must result in the RPC task either being awakened, or
 * allowed to timeout, to discover the errors at that time.
 */
void
rpcrdma_reply_handler(struct rpcrdma_rep *rep)
{
	struct rpcrdma_msg *headerp;
	struct rpcrdma_req *req;
	struct rpc_rqst *rqst;
	struct rpcrdma_xprt *r_xprt = rep->rr_rxprt;
	struct rpc_xprt *xprt = &r_xprt->rx_xprt;
	__be32 *iptr;
	int rdmalen, status;
	unsigned long cwnd;
	u32 credits;

	/* Check status. If bad, signal disconnect and return rep to pool */
	if (rep->rr_len == ~0U) {
		rpcrdma_recv_buffer_put(rep);
		if (r_xprt->rx_ep.rep_connected == 1) {
			r_xprt->rx_ep.rep_connected = -EIO;
			rpcrdma_conn_func(&r_xprt->rx_ep);
		}
		return;
	}
	if (rep->rr_len < RPCRDMA_HDRLEN_MIN) {
		dprintk("RPC:       %s: short/invalid reply\n", __func__);
		goto repost;
	}
	headerp = rdmab_to_msg(rep->rr_rdmabuf);
	if (headerp->rm_vers != rpcrdma_version) {
		dprintk("RPC:       %s: invalid version %d\n",
			__func__, be32_to_cpu(headerp->rm_vers));
		goto repost;
	}

	/* Get XID and try for a match. */
	spin_lock(&xprt->transport_lock);
	rqst = xprt_lookup_rqst(xprt, headerp->rm_xid);
	if (rqst == NULL) {
		spin_unlock(&xprt->transport_lock);
		dprintk("RPC:       %s: reply 0x%p failed "
			"to match any request xid 0x%08x len %d\n",
			__func__, rep, be32_to_cpu(headerp->rm_xid),
			rep->rr_len);
repost:
		r_xprt->rx_stats.bad_reply_count++;
		if (rpcrdma_ep_post_recv(&r_xprt->rx_ia, &r_xprt->rx_ep, rep))
			rpcrdma_recv_buffer_put(rep);

		return;
	}

	/* get request object */
	req = rpcr_to_rdmar(rqst);
	if (req->rl_reply) {
		spin_unlock(&xprt->transport_lock);
		dprintk("RPC:       %s: duplicate reply 0x%p to RPC "
			"request 0x%p: xid 0x%08x\n", __func__, rep, req,
			be32_to_cpu(headerp->rm_xid));
		goto repost;
	}

	dprintk("RPC:       %s: reply 0x%p completes request 0x%p\n"
		"                   RPC request 0x%p xid 0x%08x\n",
			__func__, rep, req, rqst,
			be32_to_cpu(headerp->rm_xid));

	/* from here on, the reply is no longer an orphan */
	req->rl_reply = rep;
	xprt->reestablish_timeout = 0;

	/* check for expected message types */
	/* The order of some of these tests is important. */
	switch (headerp->rm_type) {
	case rdma_msg:
		/* never expect read chunks */
		/* never expect reply chunks (two ways to check) */
		/* never expect write chunks without having offered RDMA */
		if (headerp->rm_body.rm_chunks[0] != xdr_zero ||
		    (headerp->rm_body.rm_chunks[1] == xdr_zero &&
		     headerp->rm_body.rm_chunks[2] != xdr_zero) ||
		    (headerp->rm_body.rm_chunks[1] != xdr_zero &&
		     req->rl_nchunks == 0))
			goto badheader;
		if (headerp->rm_body.rm_chunks[1] != xdr_zero) {
			/* count any expected write chunks in read reply */
			/* start at write chunk array count */
			iptr = &headerp->rm_body.rm_chunks[2];
			rdmalen = rpcrdma_count_chunks(rep,
						req->rl_nchunks, 1, &iptr);
			/* check for validity, and no reply chunk after */
			if (rdmalen < 0 || *iptr++ != xdr_zero)
				goto badheader;
			rep->rr_len -=
			    ((unsigned char *)iptr - (unsigned char *)headerp);
			status = rep->rr_len + rdmalen;
			r_xprt->rx_stats.total_rdma_reply += rdmalen;
			/* special case - last chunk may omit padding */
			if (rdmalen &= 3) {
				rdmalen = 4 - rdmalen;
				status += rdmalen;
			}
		} else {
			/* else ordinary inline */
			rdmalen = 0;
			iptr = (__be32 *)((unsigned char *)headerp +
							RPCRDMA_HDRLEN_MIN);
			rep->rr_len -= RPCRDMA_HDRLEN_MIN;
			status = rep->rr_len;
		}
		/* Fix up the rpc results for upper layer */
		rpcrdma_inline_fixup(rqst, (char *)iptr, rep->rr_len, rdmalen);
		break;

	case rdma_nomsg:
		/* never expect read or write chunks, always reply chunks */
		if (headerp->rm_body.rm_chunks[0] != xdr_zero ||
		    headerp->rm_body.rm_chunks[1] != xdr_zero ||
		    headerp->rm_body.rm_chunks[2] != xdr_one ||
		    req->rl_nchunks == 0)
			goto badheader;
		iptr = (__be32 *)((unsigned char *)headerp +
							RPCRDMA_HDRLEN_MIN);
		rdmalen = rpcrdma_count_chunks(rep, req->rl_nchunks, 0, &iptr);
		if (rdmalen < 0)
			goto badheader;
		r_xprt->rx_stats.total_rdma_reply += rdmalen;
		/* Reply chunk buffer already is the reply vector - no fixup. */
		status = rdmalen;
		break;

badheader:
	default:
		dprintk("%s: invalid rpcrdma reply header (type %d):"
				" chunks[012] == %d %d %d"
				" expected chunks <= %d\n",
				__func__, be32_to_cpu(headerp->rm_type),
				headerp->rm_body.rm_chunks[0],
				headerp->rm_body.rm_chunks[1],
				headerp->rm_body.rm_chunks[2],
				req->rl_nchunks);
		status = -EIO;
		r_xprt->rx_stats.bad_reply_count++;
		break;
	}

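	/* Update the credit grant. The server's rm_credit field caps how
	 * many requests may be outstanding; clamp it to at least one and
	 * at most rb_max_requests, then refresh the congestion window. */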
	credits = be32_to_cpu(headerp->rm_credit);
	if (credits == 0)
		credits = 1;	/* don't deadlock */
	else if (credits > r_xprt->rx_buf.rb_max_requests)
		credits = r_xprt->rx_buf.rb_max_requests;

	cwnd = xprt->cwnd;
	xprt->cwnd = credits << RPC_CWNDSHIFT;
	if (xprt->cwnd > cwnd)
		xprt_release_rqst_cong(rqst->rq_task);

	dprintk("RPC:       %s: xprt_complete_rqst(0x%p, 0x%p, %d)\n",
			__func__, xprt, rqst, status);
	xprt_complete_rqst(rqst->rq_task, status);
	spin_unlock(&xprt->transport_lock);
}