// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2015, 2017 Oracle.  All rights reserved.
 * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
 */

/* Lightweight memory registration using Fast Registration Work
 * Requests (FRWR).
 *
 * FRWR features ordered asynchronous registration and invalidation
 * of arbitrarily-sized memory regions. This is the fastest and safest
 * but most complex memory registration mode.
 */

/* Normal operation
 *
 * A Memory Region is prepared for RDMA Read or Write using a FAST_REG
 * Work Request (frwr_map). When the RDMA operation is finished, this
 * Memory Region is invalidated using a LOCAL_INV Work Request
 * (frwr_unmap_async and frwr_unmap_sync).
 *
 * Typically FAST_REG Work Requests are not signaled, and neither are
 * RDMA Send Work Requests (with the exception of signaling occasionally
 * to prevent provider work queue overflows). This greatly reduces HCA
 * interrupt workload.
 */

/* Transport recovery
 *
 * frwr_map and frwr_unmap_* cannot run at the same time the transport
 * connect worker is running. The connect worker holds the transport
 * send lock, just as ->send_request does. This prevents frwr_map and
 * the connect worker from running concurrently. When a connection is
 * closed, the Receive completion queue is drained before allowing
 * the connect worker to get control. This prevents frwr_unmap and the
 * connect worker from running concurrently.
 *
 * When the underlying transport disconnects, MRs that are in flight
 * are flushed and are likely unusable. Thus all flushed MRs are
 * destroyed. New MRs are created on demand.
 */

#include <linux/sunrpc/rpc_rdma.h>
#include <linux/sunrpc/svc_rdma.h>

#include "xprt_rdma.h"
#include <trace/events/rpcrdma.h>

/* Debug facility used by dprintk() in this file */
#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
# define RPCDBG_FACILITY	RPCDBG_TRANS
#endif

53 54
/**
 * frwr_is_supported - Check if device supports FRWR
55
 * @device: interface adapter to check
56 57 58
 *
 * Returns true if device supports FRWR, otherwise false
 */
59
bool frwr_is_supported(struct ib_device *device)
60
{
61
	struct ib_device_attr *attrs = &device->attrs;
62 63 64 65 66 67 68 69 70

	if (!(attrs->device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS))
		goto out_not_supported;
	if (attrs->max_fast_reg_page_list_len == 0)
		goto out_not_supported;
	return true;

out_not_supported:
	pr_info("rpcrdma: 'frwr' mode is not supported by device %s\n",
71
		device->name);
72 73 74
	return false;
}

75 76 77 78 79 80
/**
 * frwr_release_mr - Destroy one MR
 * @mr: MR allocated by frwr_init_mr
 *
 */
void frwr_release_mr(struct rpcrdma_mr *mr)
81 82 83 84 85
{
	int rc;

	rc = ib_dereg_mr(mr->frwr.fr_mr);
	if (rc)
86
		trace_xprtrdma_frwr_dereg(mr, rc);
87 88 89 90
	kfree(mr->mr_sg);
	kfree(mr);
}

91
static void frwr_mr_recycle(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr *mr)
92 93 94
{
	trace_xprtrdma_mr_recycle(mr);

95
	if (mr->mr_dir != DMA_NONE) {
96
		trace_xprtrdma_mr_unmap(mr);
97
		ib_dma_unmap_sg(r_xprt->rx_ia.ri_id->device,
98
				mr->mr_sg, mr->mr_nents, mr->mr_dir);
99
		mr->mr_dir = DMA_NONE;
100 101
	}

102
	spin_lock(&r_xprt->rx_buf.rb_lock);
103 104
	list_del(&mr->mr_all);
	r_xprt->rx_stats.mrs_recycled++;
105
	spin_unlock(&r_xprt->rx_buf.rb_lock);
106 107

	frwr_release_mr(mr);
108 109
}

110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135
/* MRs are dynamically allocated, so simply clean up and release the MR.
 * A replacement MR will subsequently be allocated on demand.
 */
static void
frwr_mr_recycle_worker(struct work_struct *work)
{
	struct rpcrdma_mr *mr = container_of(work, struct rpcrdma_mr,
					     mr_recycle);

	frwr_mr_recycle(mr->mr_xprt, mr);
}

/* frwr_recycle - Discard MRs
 * @req: request to reset
 *
 * Called after a reconnect. There is no way to tell whether these
 * MRs are still in flight, so every MR registered for @req is
 * released rather than reused.
 */
void frwr_recycle(struct rpcrdma_req *req)
{
	struct rpcrdma_mr *cur;

	for (;;) {
		cur = rpcrdma_mr_pop(&req->rl_registered);
		if (!cur)
			break;
		frwr_mr_recycle(cur->mr_xprt, cur);
	}
}

136 137 138 139 140 141 142 143 144 145 146 147
/* frwr_reset - Place MRs back on the free list
 * @req: request to reset
 *
 * Used after a failed marshal. For FRWR, this means the MRs
 * don't have to be fully released and recreated.
 *
 * NB: This is safe only as long as none of @req's MRs are
 * involved with an ongoing asynchronous FAST_REG or LOCAL_INV
 * Work Request.
 */
void frwr_reset(struct rpcrdma_req *req)
{
C
Chuck Lever 已提交
148
	struct rpcrdma_mr *mr;
149

C
Chuck Lever 已提交
150
	while ((mr = rpcrdma_mr_pop(&req->rl_registered)))
151
		rpcrdma_mr_put(mr);
152 153
}

154 155 156 157 158 159 160 161 162
/**
 * frwr_init_mr - Initialize one MR
 * @ia: interface adapter
 * @mr: generic MR to prepare for FRWR
 *
 * Returns zero if successful. Otherwise a negative errno
 * is returned.
 */
int frwr_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mr *mr)
163
{
164
	unsigned int depth = ia->ri_max_frwr_depth;
C
Chuck Lever 已提交
165 166
	struct scatterlist *sg;
	struct ib_mr *frmr;
167 168
	int rc;

169 170 171
	/* NB: ib_alloc_mr and device drivers typically allocate
	 *     memory with GFP_KERNEL.
	 */
C
Chuck Lever 已提交
172 173
	frmr = ib_alloc_mr(ia->ri_pd, ia->ri_mrtype, depth);
	if (IS_ERR(frmr))
174 175
		goto out_mr_err;

176
	sg = kcalloc(depth, sizeof(*sg), GFP_NOFS);
C
Chuck Lever 已提交
177
	if (!sg)
178 179
		goto out_list_err;

C
Chuck Lever 已提交
180
	mr->frwr.fr_mr = frmr;
181
	mr->mr_dir = DMA_NONE;
182
	INIT_LIST_HEAD(&mr->mr_list);
183
	INIT_WORK(&mr->mr_recycle, frwr_mr_recycle_worker);
C
Chuck Lever 已提交
184 185 186 187
	init_completion(&mr->frwr.fr_linv_done);

	sg_init_table(sg, depth);
	mr->mr_sg = sg;
188 189 190
	return 0;

out_mr_err:
C
Chuck Lever 已提交
191
	rc = PTR_ERR(frmr);
192
	trace_xprtrdma_frwr_alloc(mr, rc);
193 194 195
	return rc;

out_list_err:
C
Chuck Lever 已提交
196 197
	ib_dereg_mr(frmr);
	return -ENOMEM;
198 199
}

200 201 202 203 204 205
/**
 * frwr_open - Prepare an endpoint for use with FRWR
 * @ia: interface adapter this endpoint will use
 * @ep: endpoint to prepare
 *
 * On success, sets:
206 207
 *	ep->rep_attr.cap.max_send_wr
 *	ep->rep_attr.cap.max_recv_wr
208
 *	ep->rep_max_requests
209 210 211 212 213
 *	ia->ri_max_segs
 *
 * And these FRWR-related fields:
 *	ia->ri_max_frwr_depth
 *	ia->ri_mrtype
214 215
 *
 * On failure, a negative errno is returned.
216
 */
217
int frwr_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep)
C
Chuck Lever 已提交
218
{
219
	struct ib_device_attr *attrs = &ia->ri_id->device->attrs;
220
	int max_qp_wr, depth, delta;
C
Chuck Lever 已提交
221

C
Chuck Lever 已提交
222 223 224 225
	ia->ri_mrtype = IB_MR_TYPE_MEM_REG;
	if (attrs->device_cap_flags & IB_DEVICE_SG_GAPS_REG)
		ia->ri_mrtype = IB_MR_TYPE_SG_GAPS;

C
Chuck Lever 已提交
226 227 228 229 230 231 232 233 234 235 236
	/* Quirk: Some devices advertise a large max_fast_reg_page_list_len
	 * capability, but perform optimally when the MRs are not larger
	 * than a page.
	 */
	if (attrs->max_sge_rd > 1)
		ia->ri_max_frwr_depth = attrs->max_sge_rd;
	else
		ia->ri_max_frwr_depth = attrs->max_fast_reg_page_list_len;
	if (ia->ri_max_frwr_depth > RPCRDMA_MAX_DATA_SEGS)
		ia->ri_max_frwr_depth = RPCRDMA_MAX_DATA_SEGS;
	dprintk("RPC:       %s: max FR page list depth = %u\n",
237 238 239 240 241 242 243 244 245
		__func__, ia->ri_max_frwr_depth);

	/* Add room for frwr register and invalidate WRs.
	 * 1. FRWR reg WR for head
	 * 2. FRWR invalidate WR for head
	 * 3. N FRWR reg WRs for pagelist
	 * 4. N FRWR invalidate WRs for pagelist
	 * 5. FRWR reg WR for tail
	 * 6. FRWR invalidate WR for tail
C
Chuck Lever 已提交
246 247 248 249
	 * 7. The RDMA_SEND WR
	 */
	depth = 7;

250
	/* Calculate N if the device max FRWR depth is smaller than
C
Chuck Lever 已提交
251 252
	 * RPCRDMA_MAX_DATA_SEGS.
	 */
253 254
	if (ia->ri_max_frwr_depth < RPCRDMA_MAX_DATA_SEGS) {
		delta = RPCRDMA_MAX_DATA_SEGS - ia->ri_max_frwr_depth;
C
Chuck Lever 已提交
255
		do {
256 257
			depth += 2; /* FRWR reg + invalidate */
			delta -= ia->ri_max_frwr_depth;
C
Chuck Lever 已提交
258 259 260
		} while (delta > 0);
	}

261
	max_qp_wr = ia->ri_id->device->attrs.max_qp_wr;
262 263 264 265
	max_qp_wr -= RPCRDMA_BACKWARD_WRS;
	max_qp_wr -= 1;
	if (max_qp_wr < RPCRDMA_MIN_SLOT_TABLE)
		return -ENOMEM;
266 267 268
	if (ep->rep_max_requests > max_qp_wr)
		ep->rep_max_requests = max_qp_wr;
	ep->rep_attr.cap.max_send_wr = ep->rep_max_requests * depth;
269
	if (ep->rep_attr.cap.max_send_wr > max_qp_wr) {
270 271
		ep->rep_max_requests = max_qp_wr / depth;
		if (!ep->rep_max_requests)
C
Chuck Lever 已提交
272
			return -EINVAL;
273
		ep->rep_attr.cap.max_send_wr = ep->rep_max_requests * depth;
C
Chuck Lever 已提交
274
	}
275 276
	ep->rep_attr.cap.max_send_wr += RPCRDMA_BACKWARD_WRS;
	ep->rep_attr.cap.max_send_wr += 1; /* for ib_drain_sq */
277
	ep->rep_attr.cap.max_recv_wr = ep->rep_max_requests;
278 279
	ep->rep_attr.cap.max_recv_wr += RPCRDMA_BACKWARD_WRS;
	ep->rep_attr.cap.max_recv_wr += 1; /* for ib_drain_rq */
C
Chuck Lever 已提交
280

281 282
	ia->ri_max_segs =
		DIV_ROUND_UP(RPCRDMA_MAX_DATA_SEGS, ia->ri_max_frwr_depth);
283 284 285 286
	/* Reply chunks require segments for head and tail buffers */
	ia->ri_max_segs += 2;
	if (ia->ri_max_segs > RPCRDMA_MAX_HDR_SEGS)
		ia->ri_max_segs = RPCRDMA_MAX_HDR_SEGS;
C
Chuck Lever 已提交
287 288 289
	return 0;
}

290 291 292 293 294 295 296
/**
 * frwr_maxpages - Compute size of largest payload
 * @r_xprt: transport
 *
 * Returns maximum size of an RPC message, in pages.
 *
 * FRWR mode conveys a list of pages per chunk segment. The
297 298
 * maximum length of that list is the FRWR page list depth.
 */
299
size_t frwr_maxpages(struct rpcrdma_xprt *r_xprt)
300 301 302 303
{
	struct rpcrdma_ia *ia = &r_xprt->rx_ia;

	return min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS,
304
		     (ia->ri_max_segs - 2) * ia->ri_max_frwr_depth);
305 306
}

307 308 309 310 311 312
/**
 * frwr_map - Register a memory region
 * @r_xprt: controlling transport
 * @seg: memory region co-ordinates
 * @nsegs: number of segments remaining
 * @writing: true when RDMA Write will be used
313
 * @xid: XID of RPC using the registered memory
314
 * @mr: MR to fill in
315 316
 *
 * Prepare a REG_MR Work Request to register a memory region
317
 * for remote access via RDMA READ or RDMA WRITE.
318 319
 *
 * Returns the next segment or a negative errno pointer.
320
 * On success, @mr is filled in.
321
 */
322 323
struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt,
				struct rpcrdma_mr_seg *seg,
C
Chuck Lever 已提交
324
				int nsegs, bool writing, __be32 xid,
325
				struct rpcrdma_mr *mr)
326 327
{
	struct rpcrdma_ia *ia = &r_xprt->rx_ia;
328
	struct ib_reg_wr *reg_wr;
329
	struct ib_mr *ibmr;
330
	int i, n;
331 332
	u8 key;

333 334
	if (nsegs > ia->ri_max_frwr_depth)
		nsegs = ia->ri_max_frwr_depth;
335 336
	for (i = 0; i < nsegs;) {
		if (seg->mr_page)
C
Chuck Lever 已提交
337
			sg_set_page(&mr->mr_sg[i],
338 339 340 341
				    seg->mr_page,
				    seg->mr_len,
				    offset_in_page(seg->mr_offset));
		else
C
Chuck Lever 已提交
342
			sg_set_buf(&mr->mr_sg[i], seg->mr_offset,
343 344
				   seg->mr_len);

345 346
		++seg;
		++i;
347
		if (ia->ri_mrtype == IB_MR_TYPE_SG_GAPS)
C
Chuck Lever 已提交
348
			continue;
349 350 351 352
		if ((i < nsegs && offset_in_page(seg->mr_offset)) ||
		    offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
			break;
	}
C
Chuck Lever 已提交
353
	mr->mr_dir = rpcrdma_data_dir(writing);
354

355 356
	mr->mr_nents =
		ib_dma_map_sg(ia->ri_id->device, mr->mr_sg, i, mr->mr_dir);
C
Chuck Lever 已提交
357
	if (!mr->mr_nents)
358 359
		goto out_dmamap_err;

C
Chuck Lever 已提交
360
	ibmr = mr->frwr.fr_mr;
C
Chuck Lever 已提交
361 362
	n = ib_map_mr_sg(ibmr, mr->mr_sg, mr->mr_nents, NULL, PAGE_SIZE);
	if (unlikely(n != mr->mr_nents))
363
		goto out_mapmr_err;
364

365
	ibmr->iova &= 0x00000000ffffffff;
C
Chuck Lever 已提交
366
	ibmr->iova |= ((u64)be32_to_cpu(xid)) << 32;
C
Chuck Lever 已提交
367 368
	key = (u8)(ibmr->rkey & 0x000000FF);
	ib_update_fast_reg_key(ibmr, ++key);
369

C
Chuck Lever 已提交
370
	reg_wr = &mr->frwr.fr_regwr;
C
Chuck Lever 已提交
371 372
	reg_wr->mr = ibmr;
	reg_wr->key = ibmr->rkey;
373 374 375
	reg_wr->access = writing ?
			 IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE :
			 IB_ACCESS_REMOTE_READ;
376

C
Chuck Lever 已提交
377 378 379
	mr->mr_handle = ibmr->rkey;
	mr->mr_length = ibmr->length;
	mr->mr_offset = ibmr->iova;
380
	trace_xprtrdma_mr_map(mr);
381

382
	return seg;
383 384

out_dmamap_err:
385
	mr->mr_dir = DMA_NONE;
386
	trace_xprtrdma_frwr_sgerr(mr, i);
387
	return ERR_PTR(-EIO);
388 389

out_mapmr_err:
390
	trace_xprtrdma_frwr_maperr(mr, n);
391
	return ERR_PTR(-EIO);
392
}
393

C
Chuck Lever 已提交
394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410
/**
 * frwr_wc_fastreg - Invoked by RDMA provider for a flushed FastReg WC
 * @cq:	completion queue (ignored)
 * @wc:	completed WR
 *
 */
static void frwr_wc_fastreg(struct ib_cq *cq, struct ib_wc *wc)
{
	struct ib_cqe *cqe = wc->wr_cqe;
	struct rpcrdma_frwr *frwr =
		container_of(cqe, struct rpcrdma_frwr, fr_cqe);

	/* WARNING: Only wr_cqe and status are reliable at this point */
	trace_xprtrdma_wc_fastreg(wc, frwr);
	/* The MR will get recycled when the associated req is retransmitted */
}

411 412 413 414
/**
 * frwr_send - post Send WR containing the RPC Call message
 * @ia: interface adapter
 * @req: Prepared RPC Call
415
 *
416
 * For FRWR, chain any FastReg WRs to the Send WR. Only a
417 418
 * single ib_post_send call is needed to register memory
 * and then post the Send WR.
419 420
 *
 * Returns the result of ib_post_send.
421
 */
422
int frwr_send(struct rpcrdma_ia *ia, struct rpcrdma_req *req)
423
{
424
	struct ib_send_wr *post_wr;
425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443
	struct rpcrdma_mr *mr;

	post_wr = &req->rl_sendctx->sc_wr;
	list_for_each_entry(mr, &req->rl_registered, mr_list) {
		struct rpcrdma_frwr *frwr;

		frwr = &mr->frwr;

		frwr->fr_cqe.done = frwr_wc_fastreg;
		frwr->fr_regwr.wr.next = post_wr;
		frwr->fr_regwr.wr.wr_cqe = &frwr->fr_cqe;
		frwr->fr_regwr.wr.num_sge = 0;
		frwr->fr_regwr.wr.opcode = IB_WR_REG_MR;
		frwr->fr_regwr.wr.send_flags = 0;

		post_wr = &frwr->fr_regwr.wr;
	}

	/* If ib_post_send fails, the next ->send_request for
444
	 * @req will queue these MRs for recovery.
445
	 */
446
	return ib_post_send(ia->ri_id->qp, post_wr, NULL);
447 448
}

449 450 451 452 453
/**
 * frwr_reminv - handle a remotely invalidated mr on the @mrs list
 * @rep: Received reply
 * @mrs: list of MRs to check
 *
454
 */
455
void frwr_reminv(struct rpcrdma_rep *rep, struct list_head *mrs)
456
{
C
Chuck Lever 已提交
457
	struct rpcrdma_mr *mr;
458

C
Chuck Lever 已提交
459 460
	list_for_each_entry(mr, mrs, mr_list)
		if (mr->mr_handle == rep->rr_inv_rkey) {
461
			list_del_init(&mr->mr_list);
462
			trace_xprtrdma_mr_remoteinv(mr);
463
			rpcrdma_mr_put(mr);
464 465 466 467
			break;	/* only one invalidated MR per RPC */
		}
}

C
Chuck Lever 已提交
468 469 470 471 472
static void __frwr_release_mr(struct ib_wc *wc, struct rpcrdma_mr *mr)
{
	if (wc->status != IB_WC_SUCCESS)
		rpcrdma_mr_recycle(mr);
	else
473
		rpcrdma_mr_put(mr);
C
Chuck Lever 已提交
474 475
}

476
/**
C
Chuck Lever 已提交
477 478 479
 * frwr_wc_localinv - Invoked by RDMA provider for a LOCAL_INV WC
 * @cq:	completion queue (ignored)
 * @wc:	completed WR
480
 *
C
Chuck Lever 已提交
481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497
 */
static void frwr_wc_localinv(struct ib_cq *cq, struct ib_wc *wc)
{
	struct ib_cqe *cqe = wc->wr_cqe;
	struct rpcrdma_frwr *frwr =
		container_of(cqe, struct rpcrdma_frwr, fr_cqe);
	struct rpcrdma_mr *mr = container_of(frwr, struct rpcrdma_mr, frwr);

	/* WARNING: Only wr_cqe and status are reliable at this point */
	trace_xprtrdma_wc_li(wc, frwr);
	__frwr_release_mr(wc, mr);
}

/**
 * frwr_wc_localinv_wake - Invoked by RDMA provider for a LOCAL_INV WC
 * @cq:	completion queue (ignored)
 * @wc:	completed WR
498
 *
C
Chuck Lever 已提交
499
 * Awaken anyone waiting for an MR to finish being fenced.
500
 */
C
Chuck Lever 已提交
501 502 503 504 505 506 507 508 509 510
static void frwr_wc_localinv_wake(struct ib_cq *cq, struct ib_wc *wc)
{
	struct ib_cqe *cqe = wc->wr_cqe;
	struct rpcrdma_frwr *frwr =
		container_of(cqe, struct rpcrdma_frwr, fr_cqe);
	struct rpcrdma_mr *mr = container_of(frwr, struct rpcrdma_mr, frwr);

	/* WARNING: Only wr_cqe and status are reliable at this point */
	trace_xprtrdma_wc_li_wake(wc, frwr);
	__frwr_release_mr(wc, mr);
511
	complete(&frwr->fr_linv_done);
C
Chuck Lever 已提交
512 513 514 515 516 517 518 519
}

/**
 * frwr_unmap_sync - invalidate memory regions that were registered for @req
 * @r_xprt: controlling transport instance
 * @req: rpcrdma_req with a non-empty list of MRs to process
 *
 * Sleeps until it is safe for the host CPU to access the previously mapped
520 521 522 523
 * memory regions. This guarantees that registered MRs are properly fenced
 * from the server before the RPC consumer accesses the data in them. It
 * also ensures proper Send flow control: waking the next RPC waits until
 * this RPC has relinquished all its Send Queue entries.
C
Chuck Lever 已提交
524 525
 */
void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
526
{
527 528
	struct ib_send_wr *first, **prev, *last;
	const struct ib_send_wr *bad_wr;
529
	struct rpcrdma_frwr *frwr;
C
Chuck Lever 已提交
530
	struct rpcrdma_mr *mr;
C
Chuck Lever 已提交
531
	int rc;
532

533
	/* ORDER: Invalidate all of the MRs first
534 535 536 537
	 *
	 * Chain the LOCAL_INV Work Requests and post them with
	 * a single ib_post_send() call.
	 */
538
	frwr = NULL;
C
Chuck Lever 已提交
539
	prev = &first;
C
Chuck Lever 已提交
540
	while ((mr = rpcrdma_mr_pop(&req->rl_registered))) {
C
Chuck Lever 已提交
541

542
		trace_xprtrdma_mr_localinv(mr);
C
Chuck Lever 已提交
543
		r_xprt->rx_stats.local_inv_needed++;
C
Chuck Lever 已提交
544

C
Chuck Lever 已提交
545
		frwr = &mr->frwr;
546 547
		frwr->fr_cqe.done = frwr_wc_localinv;
		last = &frwr->fr_invwr;
C
Chuck Lever 已提交
548
		last->next = NULL;
549
		last->wr_cqe = &frwr->fr_cqe;
C
Chuck Lever 已提交
550 551
		last->sg_list = NULL;
		last->num_sge = 0;
C
Chuck Lever 已提交
552
		last->opcode = IB_WR_LOCAL_INV;
C
Chuck Lever 已提交
553
		last->send_flags = IB_SEND_SIGNALED;
C
Chuck Lever 已提交
554
		last->ex.invalidate_rkey = mr->mr_handle;
555

C
Chuck Lever 已提交
556 557
		*prev = last;
		prev = &last->next;
558 559 560 561 562 563
	}

	/* Strong send queue ordering guarantees that when the
	 * last WR in the chain completes, all WRs in the chain
	 * are complete.
	 */
564 565
	frwr->fr_cqe.done = frwr_wc_localinv_wake;
	reinit_completion(&frwr->fr_linv_done);
566

567 568 569 570
	/* Transport disconnect drains the receive CQ before it
	 * replaces the QP. The RPC reply handler won't call us
	 * unless ri_id->qp is a valid pointer.
	 */
571
	bad_wr = NULL;
C
Chuck Lever 已提交
572 573
	rc = ib_post_send(r_xprt->rx_ia.ri_id->qp, first, &bad_wr);
	trace_xprtrdma_post_send(req, rc);
574

C
Chuck Lever 已提交
575 576 577
	/* The final LOCAL_INV WR in the chain is supposed to
	 * do the wake. If it was never posted, the wake will
	 * not happen, so don't wait in that case.
578
	 */
C
Chuck Lever 已提交
579 580 581 582
	if (bad_wr != first)
		wait_for_completion(&frwr->fr_linv_done);
	if (!rc)
		return;
583

C
Chuck Lever 已提交
584
	/* Recycle MRs in the LOCAL_INV chain that did not get posted.
585
	 */
586
	while (bad_wr) {
587 588
		frwr = container_of(bad_wr, struct rpcrdma_frwr,
				    fr_invwr);
C
Chuck Lever 已提交
589
		mr = container_of(frwr, struct rpcrdma_mr, frwr);
590
		bad_wr = bad_wr->next;
591

592 593
		list_del_init(&mr->mr_list);
		rpcrdma_mr_recycle(mr);
594
	}
595
}
596 597 598 599 600 601 602 603 604 605 606 607 608

/**
 * frwr_wc_localinv_done - Invoked by RDMA provider for a signaled LOCAL_INV WC
 * @cq:	completion queue (ignored)
 * @wc:	completed WR
 *
 */
static void frwr_wc_localinv_done(struct ib_cq *cq, struct ib_wc *wc)
{
	struct ib_cqe *cqe = wc->wr_cqe;
	struct rpcrdma_frwr *frwr =
		container_of(cqe, struct rpcrdma_frwr, fr_cqe);
	struct rpcrdma_mr *mr = container_of(frwr, struct rpcrdma_mr, frwr);
609
	struct rpcrdma_rep *rep = mr->mr_req->rl_reply;
610 611 612 613

	/* WARNING: Only wr_cqe and status are reliable at this point */
	trace_xprtrdma_wc_li_done(wc, frwr);
	__frwr_release_mr(wc, mr);
614 615 616 617

	/* Ensure @rep is generated before __frwr_release_mr */
	smp_rmb();
	rpcrdma_complete_rqst(rep);
618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642
}

/**
 * frwr_unmap_async - invalidate memory regions that were registered for @req
 * @r_xprt: controlling transport instance
 * @req: rpcrdma_req with a non-empty list of MRs to process
 *
 * This guarantees that registered MRs are properly fenced from the
 * server before the RPC consumer accesses the data in them. It also
 * ensures proper Send flow control: waking the next RPC waits until
 * this RPC has relinquished all its Send Queue entries.
 */
void frwr_unmap_async(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
{
	struct ib_send_wr *first, *last, **prev;
	const struct ib_send_wr *bad_wr;
	struct rpcrdma_frwr *frwr;
	struct rpcrdma_mr *mr;
	int rc;

	/* Chain the LOCAL_INV Work Requests and post them with
	 * a single ib_post_send() call.
	 */
	frwr = NULL;
	prev = &first;
C
Chuck Lever 已提交
643
	while ((mr = rpcrdma_mr_pop(&req->rl_registered))) {
644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695

		trace_xprtrdma_mr_localinv(mr);
		r_xprt->rx_stats.local_inv_needed++;

		frwr = &mr->frwr;
		frwr->fr_cqe.done = frwr_wc_localinv;
		last = &frwr->fr_invwr;
		last->next = NULL;
		last->wr_cqe = &frwr->fr_cqe;
		last->sg_list = NULL;
		last->num_sge = 0;
		last->opcode = IB_WR_LOCAL_INV;
		last->send_flags = IB_SEND_SIGNALED;
		last->ex.invalidate_rkey = mr->mr_handle;

		*prev = last;
		prev = &last->next;
	}

	/* Strong send queue ordering guarantees that when the
	 * last WR in the chain completes, all WRs in the chain
	 * are complete. The last completion will wake up the
	 * RPC waiter.
	 */
	frwr->fr_cqe.done = frwr_wc_localinv_done;

	/* Transport disconnect drains the receive CQ before it
	 * replaces the QP. The RPC reply handler won't call us
	 * unless ri_id->qp is a valid pointer.
	 */
	bad_wr = NULL;
	rc = ib_post_send(r_xprt->rx_ia.ri_id->qp, first, &bad_wr);
	trace_xprtrdma_post_send(req, rc);
	if (!rc)
		return;

	/* Recycle MRs in the LOCAL_INV chain that did not get posted.
	 */
	while (bad_wr) {
		frwr = container_of(bad_wr, struct rpcrdma_frwr, fr_invwr);
		mr = container_of(frwr, struct rpcrdma_mr, frwr);
		bad_wr = bad_wr->next;

		rpcrdma_mr_recycle(mr);
	}

	/* The final LOCAL_INV WR in the chain is supposed to
	 * do the wake. If it was never posted, the wake will
	 * not happen, so wake here in that case.
	 */
	rpcrdma_complete_rqst(req->rl_reply);
}