// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
/*
 * Copyright (c) 2014-2017 Oracle.  All rights reserved.
 * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the BSD-type
 * license below:
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 *      Redistributions of source code must retain the above copyright
 *      notice, this list of conditions and the following disclaimer.
 *
 *      Redistributions in binary form must reproduce the above
 *      copyright notice, this list of conditions and the following
 *      disclaimer in the documentation and/or other materials provided
 *      with the distribution.
 *
 *      Neither the name of the Network Appliance, Inc. nor the names of
 *      its contributors may be used to endorse or promote products
 *      derived from this software without specific prior written
 *      permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * verbs.c
 *
 * Encapsulates the major functions managing:
 *  o adapters
 *  o endpoints
 *  o connections
 *  o buffer memory
 */

#include <linux/interrupt.h>
#include <linux/slab.h>
#include <linux/sunrpc/addr.h>
#include <linux/sunrpc/svc_rdma.h>

#include <asm-generic/barrier.h>
#include <asm/bitops.h>

#include <rdma/ib_cm.h>

#include "xprt_rdma.h"
#include <trace/events/rpcrdma.h>

/*
 * Globals/Macros
 */

#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
# define RPCDBG_FACILITY	RPCDBG_TRANS
#endif

/*
 * internal functions
 */
static void rpcrdma_sendctx_put_locked(struct rpcrdma_sendctx *sc);
static void rpcrdma_mrs_create(struct rpcrdma_xprt *r_xprt);
static void rpcrdma_mrs_destroy(struct rpcrdma_buffer *buf);
static int rpcrdma_create_rep(struct rpcrdma_xprt *r_xprt, bool temp);
static void rpcrdma_dma_unmap_regbuf(struct rpcrdma_regbuf *rb);

struct workqueue_struct *rpcrdma_receive_wq __read_mostly;

int
rpcrdma_alloc_wq(void)
{
	struct workqueue_struct *recv_wq;

	recv_wq = alloc_workqueue("xprtrdma_receive",
				  WQ_MEM_RECLAIM | WQ_HIGHPRI,
				  0);
	if (!recv_wq)
		return -ENOMEM;

	rpcrdma_receive_wq = recv_wq;
	return 0;
}

void
rpcrdma_destroy_wq(void)
{
	struct workqueue_struct *wq;

	if (rpcrdma_receive_wq) {
		wq = rpcrdma_receive_wq;
		rpcrdma_receive_wq = NULL;
		destroy_workqueue(wq);
	}
}

/**
 * rpcrdma_disconnect_worker - Force a disconnect
 * @work: endpoint to be disconnected
 *
 * Provider callbacks can possibly run in an IRQ context. This function
 * is invoked in a worker thread to guarantee that disconnect wake-up
 * calls are always done in process context.
 */
static void
rpcrdma_disconnect_worker(struct work_struct *work)
{
	struct rpcrdma_ep *ep = container_of(work, struct rpcrdma_ep,
					     rep_disconnect_worker.work);
	struct rpcrdma_xprt *r_xprt =
		container_of(ep, struct rpcrdma_xprt, rx_ep);

	xprt_force_disconnect(&r_xprt->rx_xprt);
}

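/* Handle QP asynchronous errors reported by the RDMA provider. These
 * upcalls may run in IRQ context, so the disconnect itself is deferred
 * to rep_disconnect_worker.
 */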
static void
rpcrdma_qp_async_error_upcall(struct ib_event *event, void *context)
{
	struct rpcrdma_ep *ep = context;
	struct rpcrdma_xprt *r_xprt = container_of(ep, struct rpcrdma_xprt,
						   rx_ep);

	trace_xprtrdma_qp_error(r_xprt, event);
	pr_err("rpcrdma: %s on device %s ep %p\n",
	       ib_event_msg(event->event), event->device->name, context);

	if (ep->rep_connected == 1) {
		ep->rep_connected = -EIO;
		schedule_delayed_work(&ep->rep_disconnect_worker, 0);
		wake_up_all(&ep->rep_connect_wait);
	}
}

/**
 * rpcrdma_wc_send - Invoked by RDMA provider for each polled Send WC
 * @cq:	completion queue (ignored)
 * @wc:	completed WR
 *
 */
static void
rpcrdma_wc_send(struct ib_cq *cq, struct ib_wc *wc)
{
	struct ib_cqe *cqe = wc->wr_cqe;
	struct rpcrdma_sendctx *sc =
		container_of(cqe, struct rpcrdma_sendctx, sc_cqe);

	/* WARNING: Only wr_cqe and status are reliable at this point */
	trace_xprtrdma_wc_send(sc, wc);
	if (wc->status != IB_WC_SUCCESS && wc->status != IB_WC_WR_FLUSH_ERR)
		pr_err("rpcrdma: Send: %s (%u/0x%x)\n",
		       ib_wc_status_msg(wc->status),
		       wc->status, wc->vendor_err);

	rpcrdma_sendctx_put_locked(sc);
}

/**
 * rpcrdma_wc_receive - Invoked by RDMA provider for each polled Receive WC
 * @cq:	completion queue (ignored)
 * @wc:	completed WR
 *
 */
static void
rpcrdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc)
{
	struct ib_cqe *cqe = wc->wr_cqe;
	struct rpcrdma_rep *rep = container_of(cqe, struct rpcrdma_rep,
					       rr_cqe);

	/* WARNING: Only wr_id and status are reliable at this point */
	trace_xprtrdma_wc_receive(wc);
	if (wc->status != IB_WC_SUCCESS)
		goto out_fail;

	/* status == SUCCESS means all fields in wc are trustworthy */
	rpcrdma_set_xdrlen(&rep->rr_hdrbuf, wc->byte_len);
	rep->rr_wc_flags = wc->wc_flags;
	rep->rr_inv_rkey = wc->ex.invalidate_rkey;

	ib_dma_sync_single_for_cpu(rdmab_device(rep->rr_rdmabuf),
				   rdmab_addr(rep->rr_rdmabuf),
				   wc->byte_len, DMA_FROM_DEVICE);

out_schedule:
	rpcrdma_reply_handler(rep);
	return;

out_fail:
	if (wc->status != IB_WC_WR_FLUSH_ERR)
		pr_err("rpcrdma: Recv: %s (%u/0x%x)\n",
		       ib_wc_status_msg(wc->status),
		       wc->status, wc->vendor_err);
	rpcrdma_set_xdrlen(&rep->rr_hdrbuf, 0);
	goto out_schedule;
}

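/* Parse the peer's CM private message, if present, and clamp this
 * transport's inline thresholds to the buffer sizes it advertises.
 */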
static void
rpcrdma_update_connect_private(struct rpcrdma_xprt *r_xprt,
			       struct rdma_conn_param *param)
{
	struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data;
	const struct rpcrdma_connect_private *pmsg = param->private_data;
	unsigned int rsize, wsize;

	/* Default settings for RPC-over-RDMA Version One */
	r_xprt->rx_ia.ri_implicit_roundup = xprt_rdma_pad_optimize;
	rsize = RPCRDMA_V1_DEF_INLINE_SIZE;
	wsize = RPCRDMA_V1_DEF_INLINE_SIZE;

	if (pmsg &&
	    pmsg->cp_magic == rpcrdma_cmp_magic &&
	    pmsg->cp_version == RPCRDMA_CMP_VERSION) {
		r_xprt->rx_ia.ri_implicit_roundup = true;
		rsize = rpcrdma_decode_buffer_size(pmsg->cp_send_size);
		wsize = rpcrdma_decode_buffer_size(pmsg->cp_recv_size);
	}

	if (rsize < cdata->inline_rsize)
		cdata->inline_rsize = rsize;
	if (wsize < cdata->inline_wsize)
		cdata->inline_wsize = wsize;
	dprintk("RPC:       %s: max send %u, max recv %u\n",
		__func__, cdata->inline_wsize, cdata->inline_rsize);
	rpcrdma_set_max_header_sizes(r_xprt);
}

/**
 * rpcrdma_cm_event_handler - Handle RDMA CM events
 * @id: rdma_cm_id on which an event has occurred
 * @event: details of the event
 *
 * Called with @id's mutex held. Returns 1 if caller should
 * destroy @id, otherwise 0.
 */
static int
rpcrdma_cm_event_handler(struct rdma_cm_id *id, struct rdma_cm_event *event)
{
	struct rpcrdma_xprt *r_xprt = id->context;
	struct rpcrdma_ia *ia = &r_xprt->rx_ia;
	struct rpcrdma_ep *ep = &r_xprt->rx_ep;
	struct rpc_xprt *xprt = &r_xprt->rx_xprt;

	might_sleep();

	trace_xprtrdma_cm_event(r_xprt, event);
	switch (event->event) {
	case RDMA_CM_EVENT_ADDR_RESOLVED:
	case RDMA_CM_EVENT_ROUTE_RESOLVED:
		ia->ri_async_rc = 0;
		complete(&ia->ri_done);
		return 0;
	case RDMA_CM_EVENT_ADDR_ERROR:
		ia->ri_async_rc = -EPROTO;
		complete(&ia->ri_done);
		return 0;
	case RDMA_CM_EVENT_ROUTE_ERROR:
		ia->ri_async_rc = -ENETUNREACH;
		complete(&ia->ri_done);
		return 0;
	case RDMA_CM_EVENT_DEVICE_REMOVAL:
#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
		pr_info("rpcrdma: removing device %s for %s:%s\n",
			ia->ri_device->name,
			rpcrdma_addrstr(r_xprt), rpcrdma_portstr(r_xprt));
#endif
		set_bit(RPCRDMA_IAF_REMOVING, &ia->ri_flags);
		ep->rep_connected = -ENODEV;
		xprt_force_disconnect(xprt);
		wait_for_completion(&ia->ri_remove_done);

		ia->ri_id = NULL;
		ia->ri_device = NULL;
		/* Return 1 to ensure the core destroys the id. */
		return 1;
	case RDMA_CM_EVENT_ESTABLISHED:
		++xprt->connect_cookie;
		ep->rep_connected = 1;
		rpcrdma_update_connect_private(r_xprt, &event->param.conn);
		wake_up_all(&ep->rep_connect_wait);
		break;
	case RDMA_CM_EVENT_CONNECT_ERROR:
		ep->rep_connected = -ENOTCONN;
		goto disconnected;
	case RDMA_CM_EVENT_UNREACHABLE:
		ep->rep_connected = -ENETUNREACH;
		goto disconnected;
	case RDMA_CM_EVENT_REJECTED:
		dprintk("rpcrdma: connection to %s:%s rejected: %s\n",
			rpcrdma_addrstr(r_xprt), rpcrdma_portstr(r_xprt),
			rdma_reject_msg(id, event->status));
		ep->rep_connected = -ECONNREFUSED;
		if (event->status == IB_CM_REJ_STALE_CONN)
			ep->rep_connected = -EAGAIN;
		goto disconnected;
	case RDMA_CM_EVENT_DISCONNECTED:
		++xprt->connect_cookie;
		ep->rep_connected = -ECONNABORTED;
disconnected:
		xprt_force_disconnect(xprt);
		wake_up_all(&ep->rep_connect_wait);
		break;
	default:
		break;
	}

	dprintk("RPC:       %s: %s:%s on %s/%s: %s\n", __func__,
		rpcrdma_addrstr(r_xprt), rpcrdma_portstr(r_xprt),
		ia->ri_device->name, ia->ri_ops->ro_displayname,
		rdma_event_msg(event->event));
	return 0;
}

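/* Create a new rdma_cm_id for this transport, then synchronously
 * resolve the server's address and route. Both resolution steps wait
 * on completions signalled by rpcrdma_cm_event_handler().
 */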
static struct rdma_cm_id *
rpcrdma_create_id(struct rpcrdma_xprt *xprt, struct rpcrdma_ia *ia)
{
	unsigned long wtimeout = msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1;
	struct rdma_cm_id *id;
	int rc;

	trace_xprtrdma_conn_start(xprt);

	init_completion(&ia->ri_done);
	init_completion(&ia->ri_remove_done);

	id = rdma_create_id(xprt->rx_xprt.xprt_net, rpcrdma_cm_event_handler,
			    xprt, RDMA_PS_TCP, IB_QPT_RC);
	if (IS_ERR(id)) {
		rc = PTR_ERR(id);
		dprintk("RPC:       %s: rdma_create_id() failed %i\n",
			__func__, rc);
		return id;
	}

	ia->ri_async_rc = -ETIMEDOUT;
	rc = rdma_resolve_addr(id, NULL,
			       (struct sockaddr *)&xprt->rx_xprt.addr,
			       RDMA_RESOLVE_TIMEOUT);
	if (rc) {
		dprintk("RPC:       %s: rdma_resolve_addr() failed %i\n",
			__func__, rc);
		goto out;
	}
	rc = wait_for_completion_interruptible_timeout(&ia->ri_done, wtimeout);
	if (rc < 0) {
		trace_xprtrdma_conn_tout(xprt);
		goto out;
	}

	rc = ia->ri_async_rc;
	if (rc)
		goto out;

	ia->ri_async_rc = -ETIMEDOUT;
	rc = rdma_resolve_route(id, RDMA_RESOLVE_TIMEOUT);
	if (rc) {
		dprintk("RPC:       %s: rdma_resolve_route() failed %i\n",
			__func__, rc);
		goto out;
	}
	rc = wait_for_completion_interruptible_timeout(&ia->ri_done, wtimeout);
	if (rc < 0) {
		trace_xprtrdma_conn_tout(xprt);
		goto out;
	}
	rc = ia->ri_async_rc;
	if (rc)
		goto out;

	return id;

out:
	rdma_destroy_id(id);
	return ERR_PTR(rc);
}

/*
 * Exported functions.
 */

/**
 * rpcrdma_ia_open - Open and initialize an Interface Adapter.
 * @xprt: transport with IA to (re)initialize
 *
 * Returns 0 on success, negative errno if an appropriate
 * Interface Adapter could not be found and opened.
 */
int
rpcrdma_ia_open(struct rpcrdma_xprt *xprt)
{
	struct rpcrdma_ia *ia = &xprt->rx_ia;
	int rc;

	ia->ri_id = rpcrdma_create_id(xprt, ia);
	if (IS_ERR(ia->ri_id)) {
		rc = PTR_ERR(ia->ri_id);
		goto out_err;
	}
	ia->ri_device = ia->ri_id->device;

	ia->ri_pd = ib_alloc_pd(ia->ri_device, 0);
	if (IS_ERR(ia->ri_pd)) {
		rc = PTR_ERR(ia->ri_pd);
		pr_err("rpcrdma: ib_alloc_pd() returned %d\n", rc);
		goto out_err;
	}

	switch (xprt_rdma_memreg_strategy) {
	case RPCRDMA_FRWR:
		if (frwr_is_supported(ia)) {
			ia->ri_ops = &rpcrdma_frwr_memreg_ops;
			break;
		}
		/*FALLTHROUGH*/
	case RPCRDMA_MTHCAFMR:
		if (fmr_is_supported(ia)) {
			ia->ri_ops = &rpcrdma_fmr_memreg_ops;
			break;
		}
		/*FALLTHROUGH*/
	default:
		pr_err("rpcrdma: Device %s does not support memreg mode %d\n",
		       ia->ri_device->name, xprt_rdma_memreg_strategy);
		rc = -EINVAL;
		goto out_err;
	}

	return 0;

out_err:
	rpcrdma_ia_close(ia);
	return rc;
}

/**
 * rpcrdma_ia_remove - Handle device driver unload
 * @ia: interface adapter being removed
 *
 * Divest transport H/W resources associated with this adapter,
 * but allow it to be restored later.
 */
void
rpcrdma_ia_remove(struct rpcrdma_ia *ia)
{
	struct rpcrdma_xprt *r_xprt = container_of(ia, struct rpcrdma_xprt,
						   rx_ia);
	struct rpcrdma_ep *ep = &r_xprt->rx_ep;
	struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
	struct rpcrdma_req *req;
	struct rpcrdma_rep *rep;

	cancel_delayed_work_sync(&buf->rb_refresh_worker);

	/* This is similar to rpcrdma_ep_destroy, but:
	 * - Don't cancel the connect worker.
	 * - Don't call rpcrdma_ep_disconnect, which waits
	 *   for another conn upcall, which will deadlock.
	 * - rdma_disconnect is unneeded, the underlying
	 *   connection is already gone.
	 */
	if (ia->ri_id->qp) {
		ib_drain_qp(ia->ri_id->qp);
		rdma_destroy_qp(ia->ri_id);
		ia->ri_id->qp = NULL;
	}
	ib_free_cq(ep->rep_attr.recv_cq);
	ep->rep_attr.recv_cq = NULL;
	ib_free_cq(ep->rep_attr.send_cq);
	ep->rep_attr.send_cq = NULL;

	/* The ULP is responsible for ensuring all DMA
	 * mappings and MRs are gone.
	 */
	list_for_each_entry(rep, &buf->rb_recv_bufs, rr_list)
		rpcrdma_dma_unmap_regbuf(rep->rr_rdmabuf);
	list_for_each_entry(req, &buf->rb_allreqs, rl_all) {
		rpcrdma_dma_unmap_regbuf(req->rl_rdmabuf);
		rpcrdma_dma_unmap_regbuf(req->rl_sendbuf);
		rpcrdma_dma_unmap_regbuf(req->rl_recvbuf);
	}
	rpcrdma_mrs_destroy(buf);
	ib_dealloc_pd(ia->ri_pd);
	ia->ri_pd = NULL;

	/* Allow waiters to continue */
	complete(&ia->ri_remove_done);

	trace_xprtrdma_remove(r_xprt);
}

/**
 * rpcrdma_ia_close - Clean up/close an IA.
 * @ia: interface adapter to close
 *
 */
void
rpcrdma_ia_close(struct rpcrdma_ia *ia)
{
	if (ia->ri_id != NULL && !IS_ERR(ia->ri_id)) {
		if (ia->ri_id->qp)
			rdma_destroy_qp(ia->ri_id);
		rdma_destroy_id(ia->ri_id);
	}
	ia->ri_id = NULL;
	ia->ri_device = NULL;

	/* If the pd is still busy, xprtrdma missed freeing a resource */
	if (ia->ri_pd && !IS_ERR(ia->ri_pd))
		ib_dealloc_pd(ia->ri_pd);
	ia->ri_pd = NULL;
}

/*
 * Create unconnected endpoint.
 */
int
rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
		  struct rpcrdma_create_data_internal *cdata)
{
	struct rpcrdma_connect_private *pmsg = &ep->rep_cm_private;
	struct ib_cq *sendcq, *recvcq;
	unsigned int max_sge;
	int rc;

	max_sge = min_t(unsigned int, ia->ri_device->attrs.max_send_sge,
			RPCRDMA_MAX_SEND_SGES);
	if (max_sge < RPCRDMA_MIN_SEND_SGES) {
		pr_warn("rpcrdma: HCA provides only %d send SGEs\n", max_sge);
		return -ENOMEM;
	}
	ia->ri_max_send_sges = max_sge;

	rc = ia->ri_ops->ro_open(ia, ep, cdata);
	if (rc)
		return rc;

	ep->rep_attr.event_handler = rpcrdma_qp_async_error_upcall;
	ep->rep_attr.qp_context = ep;
	ep->rep_attr.srq = NULL;
	ep->rep_attr.cap.max_send_sge = max_sge;
	ep->rep_attr.cap.max_recv_sge = 1;
	ep->rep_attr.cap.max_inline_data = 0;
	ep->rep_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
	ep->rep_attr.qp_type = IB_QPT_RC;
	ep->rep_attr.port_num = ~0;

	dprintk("RPC:       %s: requested max: dtos: send %d recv %d; "
		"iovs: send %d recv %d\n",
		__func__,
		ep->rep_attr.cap.max_send_wr,
		ep->rep_attr.cap.max_recv_wr,
		ep->rep_attr.cap.max_send_sge,
		ep->rep_attr.cap.max_recv_sge);

	/* set trigger for requesting send completion */
	ep->rep_send_batch = min_t(unsigned int, RPCRDMA_MAX_SEND_BATCH,
				   cdata->max_requests >> 2);
	ep->rep_send_count = ep->rep_send_batch;
	init_waitqueue_head(&ep->rep_connect_wait);
	INIT_DELAYED_WORK(&ep->rep_disconnect_worker,
			  rpcrdma_disconnect_worker);

	sendcq = ib_alloc_cq(ia->ri_device, NULL,
			     ep->rep_attr.cap.max_send_wr + 1,
			     1, IB_POLL_WORKQUEUE);
	if (IS_ERR(sendcq)) {
		rc = PTR_ERR(sendcq);
		dprintk("RPC:       %s: failed to create send CQ: %i\n",
			__func__, rc);
		goto out1;
	}

	recvcq = ib_alloc_cq(ia->ri_device, NULL,
			     ep->rep_attr.cap.max_recv_wr + 1,
			     0, IB_POLL_WORKQUEUE);
	if (IS_ERR(recvcq)) {
		rc = PTR_ERR(recvcq);
		dprintk("RPC:       %s: failed to create recv CQ: %i\n",
			__func__, rc);
		goto out2;
	}

	ep->rep_attr.send_cq = sendcq;
	ep->rep_attr.recv_cq = recvcq;

	/* Initialize cma parameters */
	memset(&ep->rep_remote_cma, 0, sizeof(ep->rep_remote_cma));

	/* Prepare RDMA-CM private message */
	pmsg->cp_magic = rpcrdma_cmp_magic;
	pmsg->cp_version = RPCRDMA_CMP_VERSION;
	pmsg->cp_flags |= ia->ri_ops->ro_send_w_inv_ok;
	pmsg->cp_send_size = rpcrdma_encode_buffer_size(cdata->inline_wsize);
	pmsg->cp_recv_size = rpcrdma_encode_buffer_size(cdata->inline_rsize);
	ep->rep_remote_cma.private_data = pmsg;
	ep->rep_remote_cma.private_data_len = sizeof(*pmsg);

	/* Client offers RDMA Read but does not initiate */
	ep->rep_remote_cma.initiator_depth = 0;
	ep->rep_remote_cma.responder_resources =
		min_t(int, U8_MAX, ia->ri_device->attrs.max_qp_rd_atom);

	/* Limit transport retries so client can detect server
	 * GID changes quickly. RPC layer handles re-establishing
	 * transport connection and retransmission.
	 */
	ep->rep_remote_cma.retry_count = 6;

	/* RPC-over-RDMA handles its own flow control. In addition,
	 * make all RNR NAKs visible so we know that RPC-over-RDMA
	 * flow control is working correctly (no NAKs should be seen).
	 */
	ep->rep_remote_cma.flow_control = 0;
	ep->rep_remote_cma.rnr_retry_count = 0;

	return 0;

out2:
	ib_free_cq(sendcq);
out1:
	return rc;
}

/*
 * rpcrdma_ep_destroy
 *
 * Disconnect and destroy endpoint. After this, the only
 * valid operations on the ep are to free it (if dynamically
 * allocated) or re-create it.
 */
void
rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
{
	cancel_delayed_work_sync(&ep->rep_disconnect_worker);

	if (ia->ri_id && ia->ri_id->qp) {
		rpcrdma_ep_disconnect(ep, ia);
		rdma_destroy_qp(ia->ri_id);
		ia->ri_id->qp = NULL;
	}

	if (ep->rep_attr.recv_cq)
		ib_free_cq(ep->rep_attr.recv_cq);
	if (ep->rep_attr.send_cq)
		ib_free_cq(ep->rep_attr.send_cq);
}

/* Re-establish a connection after a device removal event.
 * Unlike a normal reconnection, a fresh PD and a new set
 * of MRs and buffers is needed.
 */
static int
rpcrdma_ep_recreate_xprt(struct rpcrdma_xprt *r_xprt,
			 struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
{
	int rc, err;

	trace_xprtrdma_reinsert(r_xprt);

	rc = -EHOSTUNREACH;
	if (rpcrdma_ia_open(r_xprt))
		goto out1;

	rc = -ENOMEM;
	err = rpcrdma_ep_create(ep, ia, &r_xprt->rx_data);
	if (err) {
		pr_err("rpcrdma: rpcrdma_ep_create returned %d\n", err);
		goto out2;
	}

	rc = -ENETUNREACH;
	err = rdma_create_qp(ia->ri_id, ia->ri_pd, &ep->rep_attr);
	if (err) {
		pr_err("rpcrdma: rdma_create_qp returned %d\n", err);
		goto out3;
	}

	rpcrdma_mrs_create(r_xprt);
	return 0;

out3:
	rpcrdma_ep_destroy(ep, ia);
out2:
	rpcrdma_ia_close(ia);
out1:
	return rc;
}

static int
rpcrdma_ep_reconnect(struct rpcrdma_xprt *r_xprt, struct rpcrdma_ep *ep,
		     struct rpcrdma_ia *ia)
{
	struct rdma_cm_id *id, *old;
	int err, rc;

	trace_xprtrdma_reconnect(r_xprt);

	rpcrdma_ep_disconnect(ep, ia);

	rc = -EHOSTUNREACH;
	id = rpcrdma_create_id(r_xprt, ia);
	if (IS_ERR(id))
		goto out;

	/* As long as the new ID points to the same device as the
	 * old ID, we can reuse the transport's existing PD and all
	 * previously allocated MRs. Also, the same device means
	 * the transport's previous DMA mappings are still valid.
	 *
	 * This is a sanity check only. There should be no way these
	 * point to two different devices here.
	 */
	old = id;
	rc = -ENETUNREACH;
	if (ia->ri_device != id->device) {
		pr_err("rpcrdma: can't reconnect on different device!\n");
		goto out_destroy;
	}

	err = rdma_create_qp(id, ia->ri_pd, &ep->rep_attr);
	if (err) {
		dprintk("RPC:       %s: rdma_create_qp returned %d\n",
			__func__, err);
		goto out_destroy;
	}

	/* Atomically replace the transport's ID and QP. */
	rc = 0;
	old = ia->ri_id;
	ia->ri_id = id;
	rdma_destroy_qp(old);

out_destroy:
	rdma_destroy_id(old);
out:
	return rc;
}

/*
 * Connect unconnected endpoint.
 */
int
rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
{
	struct rpcrdma_xprt *r_xprt = container_of(ia, struct rpcrdma_xprt,
						   rx_ia);
	struct rpc_xprt *xprt = &r_xprt->rx_xprt;
	int rc;

retry:
	switch (ep->rep_connected) {
	case 0:
		dprintk("RPC:       %s: connecting...\n", __func__);
		rc = rdma_create_qp(ia->ri_id, ia->ri_pd, &ep->rep_attr);
		if (rc) {
			dprintk("RPC:       %s: rdma_create_qp failed %i\n",
				__func__, rc);
			rc = -ENETUNREACH;
			goto out_noupdate;
		}
		break;
	case -ENODEV:
		rc = rpcrdma_ep_recreate_xprt(r_xprt, ep, ia);
		if (rc)
			goto out_noupdate;
		break;
	default:
		rc = rpcrdma_ep_reconnect(r_xprt, ep, ia);
		if (rc)
			goto out;
	}

	ep->rep_connected = 0;
	xprt_clear_connected(xprt);

	rpcrdma_post_recvs(r_xprt, true);

	rc = rdma_connect(ia->ri_id, &ep->rep_remote_cma);
	if (rc) {
		dprintk("RPC:       %s: rdma_connect() failed with %i\n",
				__func__, rc);
		goto out;
	}

	wait_event_interruptible(ep->rep_connect_wait, ep->rep_connected != 0);
	if (ep->rep_connected <= 0) {
		if (ep->rep_connected == -EAGAIN)
			goto retry;
		rc = ep->rep_connected;
		goto out;
	}

	dprintk("RPC:       %s: connected\n", __func__);

out:
	if (rc)
		ep->rep_connected = rc;

out_noupdate:
	return rc;
}

/*
 * rpcrdma_ep_disconnect
 *
 * This is separate from destroy to facilitate the ability
 * to reconnect without recreating the endpoint.
 *
 * This call is not reentrant, and must not be made in parallel
 * on the same endpoint.
 */
void
rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
{
	int rc;

	rc = rdma_disconnect(ia->ri_id);
	if (!rc)
		/* returns without wait if not connected */
		wait_event_interruptible(ep->rep_connect_wait,
							ep->rep_connected != 1);
	else
		ep->rep_connected = rc;
	trace_xprtrdma_disconnect(container_of(ep, struct rpcrdma_xprt,
					       rx_ep), rc);

	ib_drain_qp(ia->ri_id->qp);
}

/* Fixed-size circular FIFO queue. This implementation is wait-free and
 * lock-free.
 *
 * Consumer is the code path that posts Sends. This path dequeues a
 * sendctx for use by a Send operation. Multiple consumer threads
 * are serialized by the RPC transport lock, which allows only one
 * ->send_request call at a time.
 *
 * Producer is the code path that handles Send completions. This path
 * enqueues a sendctx that has been completed. Multiple producer
 * threads are serialized by the ib_poll_cq() function.
 */

/* rpcrdma_sendctxs_destroy() assumes caller has already quiesced
 * queue activity, and ib_drain_qp has flushed all remaining Send
 * requests.
 */
static void rpcrdma_sendctxs_destroy(struct rpcrdma_buffer *buf)
{
	unsigned long i;

	for (i = 0; i <= buf->rb_sc_last; i++)
		kfree(buf->rb_sc_ctxs[i]);
	kfree(buf->rb_sc_ctxs);
}

static struct rpcrdma_sendctx *rpcrdma_sendctx_create(struct rpcrdma_ia *ia)
{
	struct rpcrdma_sendctx *sc;

	sc = kzalloc(sizeof(*sc) +
		     ia->ri_max_send_sges * sizeof(struct ib_sge),
		     GFP_KERNEL);
	if (!sc)
		return NULL;

	sc->sc_wr.wr_cqe = &sc->sc_cqe;
	sc->sc_wr.sg_list = sc->sc_sges;
	sc->sc_wr.opcode = IB_WR_SEND;
	sc->sc_cqe.done = rpcrdma_wc_send;
	return sc;
}

static int rpcrdma_sendctxs_create(struct rpcrdma_xprt *r_xprt)
{
	struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
	struct rpcrdma_sendctx *sc;
	unsigned long i;

	/* Maximum number of concurrent outstanding Send WRs. Capping
	 * the circular queue size stops Send Queue overflow by causing
	 * the ->send_request call to fail temporarily before too many
	 * Sends are posted.
	 */
	i = buf->rb_max_requests + RPCRDMA_MAX_BC_REQUESTS;
	dprintk("RPC:       %s: allocating %lu send_ctxs\n", __func__, i);
	buf->rb_sc_ctxs = kcalloc(i, sizeof(sc), GFP_KERNEL);
	if (!buf->rb_sc_ctxs)
		return -ENOMEM;

	buf->rb_sc_last = i - 1;
	for (i = 0; i <= buf->rb_sc_last; i++) {
		sc = rpcrdma_sendctx_create(&r_xprt->rx_ia);
		if (!sc)
			goto out_destroy;

		sc->sc_xprt = r_xprt;
		buf->rb_sc_ctxs[i] = sc;
	}
	buf->rb_flags = 0;

	return 0;

out_destroy:
	rpcrdma_sendctxs_destroy(buf);
	return -ENOMEM;
}

/* The sendctx queue is not guaranteed to have a size that is a
 * power of two, thus the helpers in circ_buf.h cannot be used.
 * The other option is to use modulus (%), which can be expensive.
 */
static unsigned long rpcrdma_sendctx_next(struct rpcrdma_buffer *buf,
					  unsigned long item)
{
	return likely(item < buf->rb_sc_last) ? item + 1 : 0;
}

/**
 * rpcrdma_sendctx_get_locked - Acquire a send context
 * @buf: transport buffers from which to acquire an unused context
 *
 * Returns pointer to a free send completion context; or NULL if
 * the queue is empty.
 *
 * Usage: Called to acquire an SGE array before preparing a Send WR.
 *
 * The caller serializes calls to this function (per rpcrdma_buffer),
 * and provides an effective memory barrier that flushes the new value
 * of rb_sc_head.
 */
struct rpcrdma_sendctx *rpcrdma_sendctx_get_locked(struct rpcrdma_buffer *buf)
{
	struct rpcrdma_xprt *r_xprt;
	struct rpcrdma_sendctx *sc;
	unsigned long next_head;

	next_head = rpcrdma_sendctx_next(buf, buf->rb_sc_head);

	if (next_head == READ_ONCE(buf->rb_sc_tail))
		goto out_emptyq;

	/* ORDER: item must be accessed _before_ head is updated */
	sc = buf->rb_sc_ctxs[next_head];

	/* Releasing the lock in the caller acts as a memory
	 * barrier that flushes rb_sc_head.
	 */
	buf->rb_sc_head = next_head;

	return sc;

out_emptyq:
	/* The queue is "empty" if there have not been enough Send
	 * completions recently. This is a sign the Send Queue is
	 * backing up. Cause the caller to pause and try again.
	 */
	set_bit(RPCRDMA_BUF_F_EMPTY_SCQ, &buf->rb_flags);
	r_xprt = container_of(buf, struct rpcrdma_xprt, rx_buf);
	r_xprt->rx_stats.empty_sendctx_q++;
	return NULL;
}

/**
 * rpcrdma_sendctx_put_locked - Release a send context
 * @sc: send context to release
 *
 * Usage: Called from Send completion to return a sendctxt
 * to the queue.
 *
 * The caller serializes calls to this function (per rpcrdma_buffer).
 */
static void
rpcrdma_sendctx_put_locked(struct rpcrdma_sendctx *sc)
{
	struct rpcrdma_buffer *buf = &sc->sc_xprt->rx_buf;
	unsigned long next_tail;

	/* Unmap SGEs of previously completed but unsignaled
	 * Sends by walking up the queue until @sc is found.
	 */
	next_tail = buf->rb_sc_tail;
	do {
		next_tail = rpcrdma_sendctx_next(buf, next_tail);

		/* ORDER: item must be accessed _before_ tail is updated */
		rpcrdma_unmap_sendctx(buf->rb_sc_ctxs[next_tail]);

	} while (buf->rb_sc_ctxs[next_tail] != sc);

	/* Paired with READ_ONCE */
	smp_store_release(&buf->rb_sc_tail, next_tail);

	if (test_and_clear_bit(RPCRDMA_BUF_F_EMPTY_SCQ, &buf->rb_flags)) {
		smp_mb__after_atomic();
		xprt_write_space(&sc->sc_xprt->rx_xprt);
	}
}

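/* Allocate a batch of MRs for this transport and make them available
 * on the buffer's rb_mrs free list.
 */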
static void
rpcrdma_mrs_create(struct rpcrdma_xprt *r_xprt)
{
	struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
	struct rpcrdma_ia *ia = &r_xprt->rx_ia;
	unsigned int count;
	LIST_HEAD(free);
	LIST_HEAD(all);

	for (count = 0; count < ia->ri_max_segs; count++) {
		struct rpcrdma_mr *mr;
		int rc;

		mr = kzalloc(sizeof(*mr), GFP_KERNEL);
		if (!mr)
			break;

		rc = ia->ri_ops->ro_init_mr(ia, mr);
		if (rc) {
			kfree(mr);
			break;
		}

		mr->mr_xprt = r_xprt;

		list_add(&mr->mr_list, &free);
		list_add(&mr->mr_all, &all);
	}

	spin_lock(&buf->rb_mrlock);
	list_splice(&free, &buf->rb_mrs);
	list_splice(&all, &buf->rb_all);
	r_xprt->rx_stats.mrs_allocated += count;
	spin_unlock(&buf->rb_mrlock);
	trace_xprtrdma_createmrs(r_xprt, count);

	xprt_write_space(&r_xprt->rx_xprt);
}

static void
rpcrdma_mr_refresh_worker(struct work_struct *work)
{
	struct rpcrdma_buffer *buf = container_of(work, struct rpcrdma_buffer,
						  rb_refresh_worker.work);
	struct rpcrdma_xprt *r_xprt = container_of(buf, struct rpcrdma_xprt,
						   rx_buf);

	rpcrdma_mrs_create(r_xprt);
}

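/* Allocate an rpcrdma_req and the regbuf that backs its transport
 * header, and add the new req to the buffer's rb_allreqs list.
 */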
struct rpcrdma_req *
rpcrdma_create_req(struct rpcrdma_xprt *r_xprt)
{
	struct rpcrdma_buffer *buffer = &r_xprt->rx_buf;
	struct rpcrdma_regbuf *rb;
	struct rpcrdma_req *req;

	req = kzalloc(sizeof(*req), GFP_KERNEL);
	if (req == NULL)
		return ERR_PTR(-ENOMEM);

	rb = rpcrdma_alloc_regbuf(RPCRDMA_HDRBUF_SIZE,
				  DMA_TO_DEVICE, GFP_KERNEL);
	if (IS_ERR(rb)) {
		kfree(req);
		return ERR_PTR(-ENOMEM);
	}
	req->rl_rdmabuf = rb;
	xdr_buf_init(&req->rl_hdrbuf, rb->rg_base, rdmab_length(rb));
	req->rl_buffer = buffer;
	INIT_LIST_HEAD(&req->rl_registered);

	spin_lock(&buffer->rb_reqslock);
	list_add(&req->rl_all, &buffer->rb_allreqs);
	spin_unlock(&buffer->rb_reqslock);
	return req;
}

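/* Allocate an rpcrdma_rep and its Receive buffer, and add it to the
 * buffer's rb_recv_bufs list.
 */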
static int
rpcrdma_create_rep(struct rpcrdma_xprt *r_xprt, bool temp)
{
	struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data;
	struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
	struct rpcrdma_rep *rep;
	int rc;

	rc = -ENOMEM;
	rep = kzalloc(sizeof(*rep), GFP_KERNEL);
	if (rep == NULL)
		goto out;

	rep->rr_rdmabuf = rpcrdma_alloc_regbuf(cdata->inline_rsize,
					       DMA_FROM_DEVICE, GFP_KERNEL);
	if (IS_ERR(rep->rr_rdmabuf)) {
		rc = PTR_ERR(rep->rr_rdmabuf);
		goto out_free;
	}
	xdr_buf_init(&rep->rr_hdrbuf, rep->rr_rdmabuf->rg_base,
		     rdmab_length(rep->rr_rdmabuf));

	rep->rr_cqe.done = rpcrdma_wc_receive;
	rep->rr_rxprt = r_xprt;
	INIT_WORK(&rep->rr_work, rpcrdma_deferred_completion);
	rep->rr_recv_wr.next = NULL;
	rep->rr_recv_wr.wr_cqe = &rep->rr_cqe;
	rep->rr_recv_wr.sg_list = &rep->rr_rdmabuf->rg_iov;
	rep->rr_recv_wr.num_sge = 1;
	rep->rr_temp = temp;

	spin_lock(&buf->rb_lock);
	list_add(&rep->rr_list, &buf->rb_recv_bufs);
	spin_unlock(&buf->rb_lock);
	return 0;

out_free:
	kfree(rep);
out:
	dprintk("RPC:       %s: reply buffer %d alloc failed\n",
		__func__, rc);
	return rc;
}

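/* Allocate and initialize the transport's buffer pool: MRs, request
 * buffers, and Send contexts. Reply buffers are created later, as
 * Receives are posted.
 */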
int
rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt)
{
	struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
	int i, rc;

	buf->rb_max_requests = r_xprt->rx_data.max_requests;
	buf->rb_bc_srv_max_requests = 0;
	spin_lock_init(&buf->rb_mrlock);
	spin_lock_init(&buf->rb_lock);
	INIT_LIST_HEAD(&buf->rb_mrs);
	INIT_LIST_HEAD(&buf->rb_all);
	INIT_DELAYED_WORK(&buf->rb_refresh_worker,
			  rpcrdma_mr_refresh_worker);

	rpcrdma_mrs_create(r_xprt);

	INIT_LIST_HEAD(&buf->rb_send_bufs);
	INIT_LIST_HEAD(&buf->rb_allreqs);
	spin_lock_init(&buf->rb_reqslock);
	for (i = 0; i < buf->rb_max_requests; i++) {
		struct rpcrdma_req *req;

		req = rpcrdma_create_req(r_xprt);
		if (IS_ERR(req)) {
			dprintk("RPC:       %s: request buffer %d alloc"
				" failed\n", __func__, i);
			rc = PTR_ERR(req);
			goto out;
		}
		list_add(&req->rl_list, &buf->rb_send_bufs);
	}

	buf->rb_credits = 1;
	buf->rb_posted_receives = 0;
	INIT_LIST_HEAD(&buf->rb_recv_bufs);

	rc = rpcrdma_sendctxs_create(r_xprt);
	if (rc)
		goto out;

	return 0;
out:
	rpcrdma_buffer_destroy(buf);
	return rc;
}

static void
rpcrdma_destroy_rep(struct rpcrdma_rep *rep)
{
	rpcrdma_free_regbuf(rep->rr_rdmabuf);
	kfree(rep);
}

void
rpcrdma_destroy_req(struct rpcrdma_req *req)
{
	rpcrdma_free_regbuf(req->rl_recvbuf);
	rpcrdma_free_regbuf(req->rl_sendbuf);
	rpcrdma_free_regbuf(req->rl_rdmabuf);
	kfree(req);
}

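/* Release all MRs that were allocated for this transport. */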
static void
rpcrdma_mrs_destroy(struct rpcrdma_buffer *buf)
{
	struct rpcrdma_xprt *r_xprt = container_of(buf, struct rpcrdma_xprt,
						   rx_buf);
	struct rpcrdma_ia *ia = rdmab_to_ia(buf);
	struct rpcrdma_mr *mr;
	unsigned int count;

	count = 0;
	spin_lock(&buf->rb_mrlock);
	while (!list_empty(&buf->rb_all)) {
		mr = list_entry(buf->rb_all.next, struct rpcrdma_mr, mr_all);
		list_del(&mr->mr_all);

		spin_unlock(&buf->rb_mrlock);

		/* Ensure MW is not on any rl_registered list */
		if (!list_empty(&mr->mr_list))
			list_del(&mr->mr_list);

		ia->ri_ops->ro_release_mr(mr);
		count++;
		spin_lock(&buf->rb_mrlock);
	}
	spin_unlock(&buf->rb_mrlock);
	r_xprt->rx_stats.mrs_allocated = 0;

	dprintk("RPC:       %s: released %u MRs\n", __func__, count);
}

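/* Tear down the transport's buffer pool: reply buffers, request
 * buffers, Send contexts, and MRs.
 */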
void
rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
{
	cancel_delayed_work_sync(&buf->rb_refresh_worker);

	rpcrdma_sendctxs_destroy(buf);

	while (!list_empty(&buf->rb_recv_bufs)) {
		struct rpcrdma_rep *rep;

		rep = list_first_entry(&buf->rb_recv_bufs,
				       struct rpcrdma_rep, rr_list);
		list_del(&rep->rr_list);
		rpcrdma_destroy_rep(rep);
	}

	spin_lock(&buf->rb_reqslock);
	while (!list_empty(&buf->rb_allreqs)) {
		struct rpcrdma_req *req;

		req = list_first_entry(&buf->rb_allreqs,
				       struct rpcrdma_req, rl_all);
		list_del(&req->rl_all);

		spin_unlock(&buf->rb_reqslock);
		rpcrdma_destroy_req(req);
		spin_lock(&buf->rb_reqslock);
	}
	spin_unlock(&buf->rb_reqslock);

	rpcrdma_mrs_destroy(buf);
}

/**
 * rpcrdma_mr_get - Allocate an rpcrdma_mr object
 * @r_xprt: controlling transport
 *
 * Returns an initialized rpcrdma_mr or NULL if no free
 * rpcrdma_mr objects are available.
 */
struct rpcrdma_mr *
rpcrdma_mr_get(struct rpcrdma_xprt *r_xprt)
{
	struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
	struct rpcrdma_mr *mr = NULL;

	spin_lock(&buf->rb_mrlock);
	if (!list_empty(&buf->rb_mrs))
		mr = rpcrdma_mr_pop(&buf->rb_mrs);
	spin_unlock(&buf->rb_mrlock);

	if (!mr)
		goto out_nomrs;
	return mr;

out_nomrs:
	trace_xprtrdma_nomrs(r_xprt);
	if (r_xprt->rx_ep.rep_connected != -ENODEV)
		schedule_delayed_work(&buf->rb_refresh_worker, 0);

	/* Allow the reply handler and refresh worker to run */
	cond_resched();

	return NULL;
}

static void
__rpcrdma_mr_put(struct rpcrdma_buffer *buf, struct rpcrdma_mr *mr)
{
	spin_lock(&buf->rb_mrlock);
	rpcrdma_mr_push(mr, &buf->rb_mrs);
	spin_unlock(&buf->rb_mrlock);
}

/**
 * rpcrdma_mr_put - Release an rpcrdma_mr object
 * @mr: object to release
 *
 */
void
rpcrdma_mr_put(struct rpcrdma_mr *mr)
{
	__rpcrdma_mr_put(&mr->mr_xprt->rx_buf, mr);
}

/**
 * rpcrdma_mr_unmap_and_put - DMA unmap an MR and release it
 * @mr: object to release
 *
 */
void
rpcrdma_mr_unmap_and_put(struct rpcrdma_mr *mr)
{
	struct rpcrdma_xprt *r_xprt = mr->mr_xprt;

	trace_xprtrdma_mr_unmap(mr);
	ib_dma_unmap_sg(r_xprt->rx_ia.ri_device,
			mr->mr_sg, mr->mr_nents, mr->mr_dir);
	__rpcrdma_mr_put(&r_xprt->rx_buf, mr);
}

/**
 * rpcrdma_buffer_get - Get a request buffer
 * @buffers: Buffer pool from which to obtain a buffer
 *
 * Returns a fresh rpcrdma_req, or NULL if none are available.
 */
struct rpcrdma_req *
rpcrdma_buffer_get(struct rpcrdma_buffer *buffers)
{
	struct rpcrdma_req *req;

	spin_lock(&buffers->rb_lock);
	req = list_first_entry_or_null(&buffers->rb_send_bufs,
				       struct rpcrdma_req, rl_list);
	if (req)
		list_del_init(&req->rl_list);
	spin_unlock(&buffers->rb_lock);
	return req;
}

/**
 * rpcrdma_buffer_put - Put request/reply buffers back into pool
 * @req: object to return
 *
 */
void
rpcrdma_buffer_put(struct rpcrdma_req *req)
{
	struct rpcrdma_buffer *buffers = req->rl_buffer;
	struct rpcrdma_rep *rep = req->rl_reply;

	req->rl_reply = NULL;

	spin_lock(&buffers->rb_lock);
	list_add(&req->rl_list, &buffers->rb_send_bufs);
	if (rep) {
		if (!rep->rr_temp) {
			list_add(&rep->rr_list, &buffers->rb_recv_bufs);
			rep = NULL;
		}
	}
	spin_unlock(&buffers->rb_lock);
	if (rep)
		rpcrdma_destroy_rep(rep);
}

/*
 * Put reply buffers back into pool when not attached to
 * request. This happens in error conditions.
 */
void
rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep)
{
	struct rpcrdma_buffer *buffers = &rep->rr_rxprt->rx_buf;

	if (!rep->rr_temp) {
		spin_lock(&buffers->rb_lock);
		list_add(&rep->rr_list, &buffers->rb_recv_bufs);
		spin_unlock(&buffers->rb_lock);
	} else {
		rpcrdma_destroy_rep(rep);
	}
}

/**
 * rpcrdma_alloc_regbuf - allocate and DMA-map memory for SEND/RECV buffers
 * @size: size of buffer to be allocated, in bytes
 * @direction: direction of data movement
 * @flags: GFP flags
 *
 * Returns an ERR_PTR, or a pointer to a regbuf, a buffer that
 * can be persistently DMA-mapped for I/O.
 *
 * xprtrdma uses a regbuf for posting an outgoing RDMA SEND, or for
 * receiving the payload of RDMA RECV operations. During Long Calls
 * or Replies they may be registered externally via ro_map.
 */
struct rpcrdma_regbuf *
rpcrdma_alloc_regbuf(size_t size, enum dma_data_direction direction,
		     gfp_t flags)
{
	struct rpcrdma_regbuf *rb;

	rb = kmalloc(sizeof(*rb) + size, flags);
	if (rb == NULL)
		return ERR_PTR(-ENOMEM);

	rb->rg_device = NULL;
	rb->rg_direction = direction;
	rb->rg_iov.length = size;

	return rb;
}

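/* Typical regbuf life cycle (sketch): a regbuf is allocated with
 * rpcrdma_alloc_regbuf(), DMA-mapped lazily via
 * __rpcrdma_dma_map_regbuf() when it is first needed for I/O, and
 * finally unmapped and released with rpcrdma_free_regbuf().
 */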
/**
 * __rpcrdma_dma_map_regbuf - DMA-map a regbuf
 * @ia: controlling rpcrdma_ia
 * @rb: regbuf to be mapped
 */
bool
__rpcrdma_dma_map_regbuf(struct rpcrdma_ia *ia, struct rpcrdma_regbuf *rb)
{
	struct ib_device *device = ia->ri_device;

	if (rb->rg_direction == DMA_NONE)
		return false;

	rb->rg_iov.addr = ib_dma_map_single(device,
					    (void *)rb->rg_base,
					    rdmab_length(rb),
					    rb->rg_direction);
	if (ib_dma_mapping_error(device, rdmab_addr(rb)))
		return false;

	rb->rg_device = device;
	rb->rg_iov.lkey = ia->ri_pd->local_dma_lkey;
	return true;
}

static void
rpcrdma_dma_unmap_regbuf(struct rpcrdma_regbuf *rb)
{
	if (!rb)
		return;

	if (!rpcrdma_regbuf_is_mapped(rb))
		return;

	ib_dma_unmap_single(rb->rg_device, rdmab_addr(rb),
			    rdmab_length(rb), rb->rg_direction);
	rb->rg_device = NULL;
}

/**
 * rpcrdma_free_regbuf - deregister and free registered buffer
 * @rb: regbuf to be deregistered and freed
 */
void
rpcrdma_free_regbuf(struct rpcrdma_regbuf *rb)
{
	rpcrdma_dma_unmap_regbuf(rb);
	kfree(rb);
}

/*
 * Prepost any receive buffer, then post send.
 *
 * Receive buffer is donated to hardware, reclaimed upon recv completion.
 */
int
rpcrdma_ep_post(struct rpcrdma_ia *ia,
		struct rpcrdma_ep *ep,
		struct rpcrdma_req *req)
{
	struct ib_send_wr *send_wr = &req->rl_sendctx->sc_wr;
	int rc;

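	/* Completions are requested only for every rep_send_batch-th Send,
	 * or when this Send's TX resources must be released promptly; all
	 * other Sends are unsignaled and their contexts are reclaimed later
	 * by rpcrdma_sendctx_put_locked().
	 */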
	if (!ep->rep_send_count ||
	    test_bit(RPCRDMA_REQ_F_TX_RESOURCES, &req->rl_flags)) {
		send_wr->send_flags |= IB_SEND_SIGNALED;
		ep->rep_send_count = ep->rep_send_batch;
	} else {
		send_wr->send_flags &= ~IB_SEND_SIGNALED;
		--ep->rep_send_count;
	}

	rc = ia->ri_ops->ro_send(ia, req);
	trace_xprtrdma_post_send(req, rc);
	if (rc)
		return -ENOTCONN;
	return 0;
}

/**
 * rpcrdma_post_recvs - Maybe post some Receive buffers
 * @r_xprt: controlling transport
 * @temp: when true, allocate temp rpcrdma_rep objects
 *
 */
void
rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, bool temp)
{
	struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
	struct ib_recv_wr *wr, *bad_wr;
	int needed, count, rc;

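	/* Keep roughly enough Receives posted to cover the current credit
	 * grant plus twice the backchannel request limit; bail out if that
	 * many are already posted.
	 */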
	needed = buf->rb_credits + (buf->rb_bc_srv_max_requests << 1);
	if (buf->rb_posted_receives > needed)
		return;
	needed -= buf->rb_posted_receives;

	count = 0;
	wr = NULL;
	while (needed) {
		struct rpcrdma_regbuf *rb;
		struct rpcrdma_rep *rep;

		spin_lock(&buf->rb_lock);
		rep = list_first_entry_or_null(&buf->rb_recv_bufs,
					       struct rpcrdma_rep, rr_list);
		if (likely(rep))
			list_del(&rep->rr_list);
		spin_unlock(&buf->rb_lock);
		if (!rep) {
			if (rpcrdma_create_rep(r_xprt, temp))
				break;
			continue;
		}

		rb = rep->rr_rdmabuf;
		if (!rpcrdma_regbuf_is_mapped(rb)) {
			if (!__rpcrdma_dma_map_regbuf(&r_xprt->rx_ia, rb)) {
				rpcrdma_recv_buffer_put(rep);
				break;
			}
		}

		trace_xprtrdma_post_recv(rep->rr_recv_wr.wr_cqe);
		rep->rr_recv_wr.next = wr;
		wr = &rep->rr_recv_wr;
		++count;
		--needed;
	}
	if (!count)
		return;

	rc = ib_post_recv(r_xprt->rx_ia.ri_id->qp, wr,
			  (const struct ib_recv_wr **)&bad_wr);
	if (rc) {
		for (wr = bad_wr; wr; wr = wr->next) {
			struct rpcrdma_rep *rep;

			rep = container_of(wr, struct rpcrdma_rep, rr_recv_wr);
			rpcrdma_recv_buffer_put(rep);
			--count;
		}
	}
	buf->rb_posted_receives += count;
	trace_xprtrdma_post_recvs(r_xprt, count, rc);
}