/*
 * Copyright (c) 2014 Open Grid Computing, Inc. All rights reserved.
 * Copyright (c) 2005-2007 Network Appliance, Inc. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the BSD-type
 * license below:
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 *      Redistributions of source code must retain the above copyright
 *      notice, this list of conditions and the following disclaimer.
 *
 *      Redistributions in binary form must reproduce the above
 *      copyright notice, this list of conditions and the following
 *      disclaimer in the documentation and/or other materials provided
 *      with the distribution.
 *
 *      Neither the name of the Network Appliance, Inc. nor the names of
 *      its contributors may be used to endorse or promote products
 *      derived from this software without specific prior written
 *      permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * Author: Tom Tucker <tom@opengridcomputing.com>
 */

#include <linux/sunrpc/svc_xprt.h>
#include <linux/sunrpc/debug.h>
#include <linux/sunrpc/rpc_rdma.h>
#include <linux/interrupt.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/workqueue.h>
#include <rdma/ib_verbs.h>
#include <rdma/rdma_cm.h>
#include <linux/sunrpc/svc_rdma.h>
#include <linux/export.h>
#include "xprt_rdma.h"

#define RPCDBG_FACILITY	RPCDBG_SVCXPRT

59
static struct svcxprt_rdma *rdma_create_xprt(struct svc_serv *, int);
60
static struct svc_xprt *svc_rdma_create(struct svc_serv *serv,
61
					struct net *net,
62 63 64 65 66 67 68 69
					struct sockaddr *sa, int salen,
					int flags);
static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt);
static void svc_rdma_release_rqst(struct svc_rqst *);
static void dto_tasklet_func(unsigned long data);
static void svc_rdma_detach(struct svc_xprt *xprt);
static void svc_rdma_free(struct svc_xprt *xprt);
static int svc_rdma_has_wspace(struct svc_xprt *xprt);
70
static int svc_rdma_secure_port(struct svc_rqst *);
71 72 73
static void rq_cq_reap(struct svcxprt_rdma *xprt);
static void sq_cq_reap(struct svcxprt_rdma *xprt);

R
Roel Kluin 已提交
74
static DECLARE_TASKLET(dto_tasklet, dto_tasklet_func, 0UL);
75 76 77 78 79 80 81 82 83 84 85 86 87
static DEFINE_SPINLOCK(dto_lock);
static LIST_HEAD(dto_xprt_q);

static struct svc_xprt_ops svc_rdma_ops = {
	.xpo_create = svc_rdma_create,
	.xpo_recvfrom = svc_rdma_recvfrom,
	.xpo_sendto = svc_rdma_sendto,
	.xpo_release_rqst = svc_rdma_release_rqst,
	.xpo_detach = svc_rdma_detach,
	.xpo_free = svc_rdma_free,
	.xpo_prep_reply_hdr = svc_rdma_prep_reply_hdr,
	.xpo_has_wspace = svc_rdma_has_wspace,
	.xpo_accept = svc_rdma_accept,
88
	.xpo_secure_port = svc_rdma_secure_port,
89 90 91 92 93 94
};

struct svc_xprt_class svc_rdma_class = {
	.xcl_name = "rdma",
	.xcl_owner = THIS_MODULE,
	.xcl_ops = &svc_rdma_ops,
95
	.xcl_max_payload = RPCSVC_MAXPAYLOAD_RDMA,
96
	.xcl_ident = XPRT_TRANSPORT_RDMA,
97 98
};

99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155
#if defined(CONFIG_SUNRPC_BACKCHANNEL)
static struct svc_xprt *svc_rdma_bc_create(struct svc_serv *, struct net *,
					   struct sockaddr *, int, int);
static void svc_rdma_bc_detach(struct svc_xprt *);
static void svc_rdma_bc_free(struct svc_xprt *);

/*
 * Transport method table referenced by svc_rdma_bc_class. Only the
 * create, detach, free, reply-header and secure-port methods are set.
 */
static struct svc_xprt_ops svc_rdma_bc_ops = {
	.xpo_create = svc_rdma_bc_create,
	.xpo_detach = svc_rdma_bc_detach,
	.xpo_free = svc_rdma_bc_free,
	.xpo_prep_reply_hdr = svc_rdma_prep_reply_hdr,
	.xpo_secure_port = svc_rdma_secure_port,
};

/*
 * Transport class descriptor for the "rdma-bc" transport. The maximum
 * payload is 1024 bytes less the minimum RPC/RDMA header length.
 */
struct svc_xprt_class svc_rdma_bc_class = {
	.xcl_name = "rdma-bc",
	.xcl_owner = THIS_MODULE,
	.xcl_ops = &svc_rdma_bc_ops,
	.xcl_max_payload = (1024 - RPCRDMA_HDRLEN_MIN)
};

static struct svc_xprt *svc_rdma_bc_create(struct svc_serv *serv,
					   struct net *net,
					   struct sockaddr *sa, int salen,
					   int flags)
{
	struct svcxprt_rdma *cma_xprt;
	struct svc_xprt *xprt;

	cma_xprt = rdma_create_xprt(serv, 0);
	if (!cma_xprt)
		return ERR_PTR(-ENOMEM);
	xprt = &cma_xprt->sc_xprt;

	svc_xprt_init(net, &svc_rdma_bc_class, xprt, serv);
	serv->sv_bc_xprt = xprt;

	dprintk("svcrdma: %s(%p)\n", __func__, xprt);
	return xprt;
}

/* Detach method for the backchannel transport: only emits a debug message. */
static void svc_rdma_bc_detach(struct svc_xprt *xprt)
{
	dprintk("svcrdma: %s(%p)\n", __func__, xprt);
}

/*
 * Free method for the backchannel transport: release the svcxprt_rdma
 * that embeds @xprt. A NULL @xprt is logged and otherwise ignored.
 */
static void svc_rdma_bc_free(struct svc_xprt *xprt)
{
	dprintk("svcrdma: %s(%p)\n", __func__, xprt);
	if (!xprt)
		return;

	kfree(container_of(xprt, struct svcxprt_rdma, sc_xprt));
}
#endif	/* CONFIG_SUNRPC_BACKCHANNEL */

156 157
/*
 * Allocate one operation context bound to @xprt.
 *
 * Only the xprt back-pointer and the two list heads are initialized;
 * the remaining fields are left as returned by kmalloc().
 * Returns NULL if the allocation fails.
 */
static struct svc_rdma_op_ctxt *alloc_ctxt(struct svcxprt_rdma *xprt,
					   gfp_t flags)
{
	struct svc_rdma_op_ctxt *ctxt;

	ctxt = kmalloc(sizeof(*ctxt), flags);
	if (ctxt) {
		ctxt->xprt = xprt;
		INIT_LIST_HEAD(&ctxt->free);
		INIT_LIST_HEAD(&ctxt->dto_q);
	}
	return ctxt;
}

static bool svc_rdma_prealloc_ctxts(struct svcxprt_rdma *xprt)
{
172
	unsigned int i;
173 174 175 176

	/* Each RPC/RDMA credit can consume a number of send
	 * and receive WQEs. One ctxt is allocated for each.
	 */
177
	i = xprt->sc_sq_depth + xprt->sc_rq_depth;
178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206

	while (i--) {
		struct svc_rdma_op_ctxt *ctxt;

		ctxt = alloc_ctxt(xprt, GFP_KERNEL);
		if (!ctxt) {
			dprintk("svcrdma: No memory for RDMA ctxt\n");
			return false;
		}
		list_add(&ctxt->free, &xprt->sc_ctxts);
	}
	return true;
}

/*
 * Obtain an operation context for @xprt.
 *
 * Takes the first entry from the transport's free list; if the list is
 * empty, falls back to a GFP_NOIO allocation. The returned context has
 * count reset to 0 and frmr reset to NULL.
 *
 * Returns NULL (after a one-time warning) if the list is empty and the
 * fallback allocation fails; sc_ctxt_used is left unchanged in that case.
 */
struct svc_rdma_op_ctxt *svc_rdma_get_context(struct svcxprt_rdma *xprt)
{
	struct svc_rdma_op_ctxt *ctxt = NULL;

	spin_lock_bh(&xprt->sc_ctxt_lock);
	xprt->sc_ctxt_used++;
	if (list_empty(&xprt->sc_ctxts))
		goto out_empty;

	ctxt = list_first_entry(&xprt->sc_ctxts,
				struct svc_rdma_op_ctxt, free);
	list_del_init(&ctxt->free);
	spin_unlock_bh(&xprt->sc_ctxt_lock);

out:
	ctxt->count = 0;
	ctxt->frmr = NULL;
	return ctxt;

out_empty:
	/* Either pre-allocation missed the mark, or send
	 * queue accounting is broken.
	 */
	spin_unlock_bh(&xprt->sc_ctxt_lock);

	ctxt = alloc_ctxt(xprt, GFP_NOIO);
	if (ctxt)
		goto out;

	/* Undo the accounting done above; no context was handed out. */
	spin_lock_bh(&xprt->sc_ctxt_lock);
	xprt->sc_ctxt_used--;
	spin_unlock_bh(&xprt->sc_ctxt_lock);
	WARN_ONCE(1, "svcrdma: empty RDMA ctxt list?\n");
	return NULL;
}

228
void svc_rdma_unmap_dma(struct svc_rdma_op_ctxt *ctxt)
229 230 231 232
{
	struct svcxprt_rdma *xprt = ctxt->xprt;
	int i;
	for (i = 0; i < ctxt->count && ctxt->sge[i].length; i++) {
T
Tom Tucker 已提交
233 234 235 236 237 238 239 240
		/*
		 * Unmap the DMA addr in the SGE if the lkey matches
		 * the sc_dma_lkey, otherwise, ignore it since it is
		 * an FRMR lkey and will be unmapped later when the
		 * last WR that uses it completes.
		 */
		if (ctxt->sge[i].lkey == xprt->sc_dma_lkey) {
			atomic_dec(&xprt->sc_dma_used);
241
			ib_dma_unmap_page(xprt->sc_cm_id->device,
T
Tom Tucker 已提交
242 243 244 245
					    ctxt->sge[i].addr,
					    ctxt->sge[i].length,
					    ctxt->direction);
		}
246 247 248
	}
}

249 250
void svc_rdma_put_context(struct svc_rdma_op_ctxt *ctxt, int free_pages)
{
251
	struct svcxprt_rdma *xprt = ctxt->xprt;
252 253 254 255 256 257
	int i;

	if (free_pages)
		for (i = 0; i < ctxt->count; i++)
			put_page(ctxt->pages[i]);

258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273
	spin_lock_bh(&xprt->sc_ctxt_lock);
	xprt->sc_ctxt_used--;
	list_add(&ctxt->free, &xprt->sc_ctxts);
	spin_unlock_bh(&xprt->sc_ctxt_lock);
}

/*
 * Free every context currently on xprt->sc_ctxts. The list is walked
 * without taking sc_ctxt_lock.
 */
static void svc_rdma_destroy_ctxts(struct svcxprt_rdma *xprt)
{
	while (!list_empty(&xprt->sc_ctxts)) {
		struct svc_rdma_op_ctxt *ctxt;

		ctxt = list_first_entry(&xprt->sc_ctxts,
					struct svc_rdma_op_ctxt, free);
		list_del(&ctxt->free);
		kfree(ctxt);
	}
}

276
/*
 * Allocate one request map with its free-list linkage initialized.
 * Returns NULL if the allocation fails.
 */
static struct svc_rdma_req_map *alloc_req_map(gfp_t flags)
{
	struct svc_rdma_req_map *map;

	map = kmalloc(sizeof(*map), flags);
	if (map)
		INIT_LIST_HEAD(&map->free);
	return map;
}

/*
 * Populate xprt->sc_maps with sc_max_requests request maps.
 *
 * Returns true on success. On allocation failure returns false and
 * leaves the maps allocated so far on xprt->sc_maps.
 */
static bool svc_rdma_prealloc_maps(struct svcxprt_rdma *xprt)
{
	unsigned int i;

	/* One for each receive buffer on this connection. */
	i = xprt->sc_max_requests;

	while (i--) {
		struct svc_rdma_req_map *map;

		map = alloc_req_map(GFP_KERNEL);
		if (!map) {
			dprintk("svcrdma: No memory for request map\n");
			return false;
		}
		list_add(&map->free, &xprt->sc_maps);
	}
	return true;
}

/*
 * Obtain a request map for @xprt.
 *
 * Takes the first entry from the transport's free list; if the list is
 * empty, falls back to a GFP_NOIO allocation. The returned map has its
 * count reset to 0. Returns NULL (after a one-time warning) if the list
 * is empty and the fallback allocation fails.
 */
struct svc_rdma_req_map *svc_rdma_get_req_map(struct svcxprt_rdma *xprt)
{
	struct svc_rdma_req_map *map = NULL;

	spin_lock(&xprt->sc_map_lock);
	if (list_empty(&xprt->sc_maps))
		goto out_empty;

	map = list_first_entry(&xprt->sc_maps,
			       struct svc_rdma_req_map, free);
	list_del_init(&map->free);
	spin_unlock(&xprt->sc_map_lock);

out:
	map->count = 0;
	return map;

out_empty:
	spin_unlock(&xprt->sc_map_lock);

	/* Pre-allocation amount was incorrect */
	map = alloc_req_map(GFP_NOIO);
	if (map)
		goto out;

	WARN_ONCE(1, "svcrdma: empty request map list?\n");
	return NULL;
}

335 336
void svc_rdma_put_req_map(struct svcxprt_rdma *xprt,
			  struct svc_rdma_req_map *map)
337
{
338 339 340 341 342 343 344 345 346 347 348 349 350 351 352
	spin_lock(&xprt->sc_map_lock);
	list_add(&map->free, &xprt->sc_maps);
	spin_unlock(&xprt->sc_map_lock);
}

/*
 * Free every request map currently on xprt->sc_maps. The list is
 * walked without taking sc_map_lock.
 */
static void svc_rdma_destroy_maps(struct svcxprt_rdma *xprt)
{
	while (!list_empty(&xprt->sc_maps)) {
		struct svc_rdma_req_map *map;

		map = list_first_entry(&xprt->sc_maps,
				       struct svc_rdma_req_map, free);
		list_del(&map->free);
		kfree(map);
	}
}

355 356 357 358
/* ib_cq event handler */
static void cq_event_handler(struct ib_event *event, void *context)
{
	struct svc_xprt *xprt = context;
359 360
	dprintk("svcrdma: received CQ event %s (%d), context=%p\n",
		ib_event_msg(event->event), event->event, context);
361 362 363 364 365 366 367 368 369 370 371 372 373 374
	set_bit(XPT_CLOSE, &xprt->xpt_flags);
}

/* QP event handler */
static void qp_event_handler(struct ib_event *event, void *context)
{
	struct svc_xprt *xprt = context;

	switch (event->event) {
	/* These are considered benign events */
	case IB_EVENT_PATH_MIG:
	case IB_EVENT_COMM_EST:
	case IB_EVENT_SQ_DRAINED:
	case IB_EVENT_QP_LAST_WQE_REACHED:
375 376 377
		dprintk("svcrdma: QP event %s (%d) received for QP=%p\n",
			ib_event_msg(event->event), event->event,
			event->element.qp);
378 379 380 381 382 383 384 385
		break;
	/* These are considered fatal events */
	case IB_EVENT_PATH_MIG_ERR:
	case IB_EVENT_QP_FATAL:
	case IB_EVENT_QP_REQ_ERR:
	case IB_EVENT_QP_ACCESS_ERR:
	case IB_EVENT_DEVICE_FATAL:
	default:
386
		dprintk("svcrdma: QP ERROR event %s (%d) received for QP=%p, "
387
			"closing transport\n",
388 389
			ib_event_msg(event->event), event->event,
			event->element.qp);
390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415
		set_bit(XPT_CLOSE, &xprt->xpt_flags);
		break;
	}
}

/*
 * Data Transfer Operation Tasklet
 *
 * Walks the list of transports with I/O pending (dto_xprt_q), removing
 * each entry before processing it. Two bits in sc_flags indicate
 * whether the SQ, the RQ, or both have I/O pending. The dto_lock is an
 * irqsave spinlock that serializes access to the transport list with
 * the RQ and SQ completion handlers.
 *
 * The lock is dropped while a transport's completion queues are reaped.
 * The transport reference taken when the transport was queued is
 * released once both queues have been processed.
 */
static void dto_tasklet_func(unsigned long data)
{
	struct svcxprt_rdma *xprt;
	unsigned long flags;

	spin_lock_irqsave(&dto_lock, flags);
	while (!list_empty(&dto_xprt_q)) {
		xprt = list_entry(dto_xprt_q.next,
				  struct svcxprt_rdma, sc_dto_q);
		list_del_init(&xprt->sc_dto_q);
		spin_unlock_irqrestore(&dto_lock, flags);

		rq_cq_reap(xprt);
		sq_cq_reap(xprt);

		svc_xprt_put(&xprt->sc_xprt);
		spin_lock_irqsave(&dto_lock, flags);
	}
	spin_unlock_irqrestore(&dto_lock, flags);
}

/*
 * Receive Queue Completion Handler
 *
 * Since an RQ completion handler is called on interrupt context, we
 * need to defer the handling of the I/O to a tasklet
 */
static void rq_comp_handler(struct ib_cq *cq, void *cq_context)
{
	struct svcxprt_rdma *xprt = cq_context;
	unsigned long flags;

436 437 438 439
	/* Guard against unconditional flush call for destroyed QP */
	if (atomic_read(&xprt->sc_xprt.xpt_ref.refcount)==0)
		return;

440 441 442 443
	/*
	 * Set the bit regardless of whether or not it's on the list
	 * because it may be on the list already due to an SQ
	 * completion.
444
	 */
445 446 447 448 449 450 451
	set_bit(RDMAXPRT_RQ_PENDING, &xprt->sc_flags);

	/*
	 * If this transport is not already on the DTO transport queue,
	 * add it
	 */
	spin_lock_irqsave(&dto_lock, flags);
452 453
	if (list_empty(&xprt->sc_dto_q)) {
		svc_xprt_get(&xprt->sc_xprt);
454
		list_add_tail(&xprt->sc_dto_q, &dto_xprt_q);
455
	}
456 457 458 459 460 461 462 463 464 465 466
	spin_unlock_irqrestore(&dto_lock, flags);

	/* Tasklet does all the work to avoid irqsave locks. */
	tasklet_schedule(&dto_tasklet);
}

/*
 * rq_cq_reap - Process the RQ CQ.
 *
 * Take all completing WC off the CQE and enqueue the associated DTO
 * context on the dto_q for the transport.
467 468
 *
 * Note that caller must hold a transport reference.
469 470 471 472 473 474 475
 */
static void rq_cq_reap(struct svcxprt_rdma *xprt)
{
	int ret;
	struct ib_wc wc;
	struct svc_rdma_op_ctxt *ctxt = NULL;

476 477 478 479
	if (!test_and_clear_bit(RDMAXPRT_RQ_PENDING, &xprt->sc_flags))
		return;

	ib_req_notify_cq(xprt->sc_rq_cq, IB_CQ_NEXT_COMP);
480 481 482 483 484 485
	atomic_inc(&rdma_stat_rq_poll);

	while ((ret = ib_poll_cq(xprt->sc_rq_cq, 1, &wc)) > 0) {
		ctxt = (struct svc_rdma_op_ctxt *)(unsigned long)wc.wr_id;
		ctxt->wc_status = wc.status;
		ctxt->byte_len = wc.byte_len;
486
		svc_rdma_unmap_dma(ctxt);
487 488
		if (wc.status != IB_WC_SUCCESS) {
			/* Close the transport */
489
			dprintk("svcrdma: transport closing putting ctxt %p\n", ctxt);
490 491
			set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags);
			svc_rdma_put_context(ctxt, 1);
492
			svc_xprt_put(&xprt->sc_xprt);
493 494
			continue;
		}
495
		spin_lock_bh(&xprt->sc_rq_dto_lock);
496
		list_add_tail(&ctxt->dto_q, &xprt->sc_rq_dto_q);
497
		spin_unlock_bh(&xprt->sc_rq_dto_lock);
498
		svc_xprt_put(&xprt->sc_xprt);
499 500 501 502
	}

	if (ctxt)
		atomic_inc(&rdma_stat_rq_prod);
503 504 505 506 507 508 509 510 511

	set_bit(XPT_DATA, &xprt->sc_xprt.xpt_flags);
	/*
	 * If data arrived before established event,
	 * don't enqueue. This defers RPC I/O until the
	 * RDMA connection is complete.
	 */
	if (!test_bit(RDMAXPRT_CONN_PENDING, &xprt->sc_flags))
		svc_xprt_enqueue(&xprt->sc_xprt);
512 513
}

514
/*
515
 * Process a completion context
516 517 518 519
 */
static void process_context(struct svcxprt_rdma *xprt,
			    struct svc_rdma_op_ctxt *ctxt)
{
520 521 522
	struct svc_rdma_op_ctxt *read_hdr;
	int free_pages = 0;

523 524 525 526
	svc_rdma_unmap_dma(ctxt);

	switch (ctxt->wr_op) {
	case IB_WR_SEND:
527
		free_pages = 1;
528 529 530 531 532 533
		break;

	case IB_WR_RDMA_WRITE:
		break;

	case IB_WR_RDMA_READ:
534
	case IB_WR_RDMA_READ_WITH_INV:
S
Steve Wise 已提交
535
		svc_rdma_put_frmr(xprt, ctxt->frmr);
536 537 538 539 540

		if (!test_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags))
			break;

		read_hdr = ctxt->read_hdr;
541
		svc_rdma_put_context(ctxt, 0);
542 543 544 545 546 547 548 549

		spin_lock_bh(&xprt->sc_rq_dto_lock);
		set_bit(XPT_DATA, &xprt->sc_xprt.xpt_flags);
		list_add_tail(&read_hdr->dto_q,
			      &xprt->sc_read_complete_q);
		spin_unlock_bh(&xprt->sc_rq_dto_lock);
		svc_xprt_enqueue(&xprt->sc_xprt);
		return;
550 551

	default:
552 553
		dprintk("svcrdma: unexpected completion opcode=%d\n",
			ctxt->wr_op);
554 555
		break;
	}
556 557

	svc_rdma_put_context(ctxt, free_pages);
558 559
}

560 561
/*
 * Send Queue Completion Handler - potentially called on interrupt context.
562 563
 *
 * Note that caller must hold a transport reference.
564 565 566 567
 */
static void sq_cq_reap(struct svcxprt_rdma *xprt)
{
	struct svc_rdma_op_ctxt *ctxt = NULL;
S
Steve Wise 已提交
568 569
	struct ib_wc wc_a[6];
	struct ib_wc *wc;
570 571 572
	struct ib_cq *cq = xprt->sc_sq_cq;
	int ret;

S
Steve Wise 已提交
573 574
	memset(wc_a, 0, sizeof(wc_a));

575 576 577 578
	if (!test_and_clear_bit(RDMAXPRT_SQ_PENDING, &xprt->sc_flags))
		return;

	ib_req_notify_cq(xprt->sc_sq_cq, IB_CQ_NEXT_COMP);
579
	atomic_inc(&rdma_stat_sq_poll);
S
Steve Wise 已提交
580 581
	while ((ret = ib_poll_cq(cq, ARRAY_SIZE(wc_a), wc_a)) > 0) {
		int i;
582

S
Steve Wise 已提交
583 584 585
		for (i = 0; i < ret; i++) {
			wc = &wc_a[i];
			if (wc->status != IB_WC_SUCCESS) {
586 587
				dprintk("svcrdma: sq wc err status %s (%d)\n",
					ib_wc_status_msg(wc->status),
S
Steve Wise 已提交
588
					wc->status);
589

S
Steve Wise 已提交
590 591 592
				/* Close the transport */
				set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags);
			}
593

S
Steve Wise 已提交
594 595 596 597 598 599 600 601 602 603 604
			/* Decrement used SQ WR count */
			atomic_dec(&xprt->sc_sq_count);
			wake_up(&xprt->sc_send_wait);

			ctxt = (struct svc_rdma_op_ctxt *)
				(unsigned long)wc->wr_id;
			if (ctxt)
				process_context(xprt, ctxt);

			svc_xprt_put(&xprt->sc_xprt);
		}
605 606 607 608 609 610 611 612 613 614 615
	}

	if (ctxt)
		atomic_inc(&rdma_stat_sq_prod);
}

static void sq_comp_handler(struct ib_cq *cq, void *cq_context)
{
	struct svcxprt_rdma *xprt = cq_context;
	unsigned long flags;

616 617 618 619
	/* Guard against unconditional flush call for destroyed QP */
	if (atomic_read(&xprt->sc_xprt.xpt_ref.refcount)==0)
		return;

620 621 622 623
	/*
	 * Set the bit regardless of whether or not it's on the list
	 * because it may be on the list already due to an RQ
	 * completion.
624
	 */
625 626 627 628 629 630 631
	set_bit(RDMAXPRT_SQ_PENDING, &xprt->sc_flags);

	/*
	 * If this transport is not already on the DTO transport queue,
	 * add it
	 */
	spin_lock_irqsave(&dto_lock, flags);
632 633
	if (list_empty(&xprt->sc_dto_q)) {
		svc_xprt_get(&xprt->sc_xprt);
634
		list_add_tail(&xprt->sc_dto_q, &dto_xprt_q);
635
	}
636 637 638 639 640 641 642 643 644 645 646 647 648
	spin_unlock_irqrestore(&dto_lock, flags);

	/* Tasklet does all the work to avoid irqsave locks. */
	tasklet_schedule(&dto_tasklet);
}

static struct svcxprt_rdma *rdma_create_xprt(struct svc_serv *serv,
					     int listener)
{
	struct svcxprt_rdma *cma_xprt = kzalloc(sizeof *cma_xprt, GFP_KERNEL);

	if (!cma_xprt)
		return NULL;
649
	svc_xprt_init(&init_net, &svc_rdma_class, &cma_xprt->sc_xprt, serv);
650 651 652 653
	INIT_LIST_HEAD(&cma_xprt->sc_accept_q);
	INIT_LIST_HEAD(&cma_xprt->sc_dto_q);
	INIT_LIST_HEAD(&cma_xprt->sc_rq_dto_q);
	INIT_LIST_HEAD(&cma_xprt->sc_read_complete_q);
T
Tom Tucker 已提交
654
	INIT_LIST_HEAD(&cma_xprt->sc_frmr_q);
655
	INIT_LIST_HEAD(&cma_xprt->sc_ctxts);
656
	INIT_LIST_HEAD(&cma_xprt->sc_maps);
657 658 659 660
	init_waitqueue_head(&cma_xprt->sc_send_wait);

	spin_lock_init(&cma_xprt->sc_lock);
	spin_lock_init(&cma_xprt->sc_rq_dto_lock);
T
Tom Tucker 已提交
661
	spin_lock_init(&cma_xprt->sc_frmr_q_lock);
662
	spin_lock_init(&cma_xprt->sc_ctxt_lock);
663
	spin_lock_init(&cma_xprt->sc_map_lock);
664

665
	if (listener)
666 667 668 669 670
		set_bit(XPT_LISTENER, &cma_xprt->sc_xprt.xpt_flags);

	return cma_xprt;
}

671
/*
 * Post one receive work request on @xprt's QP.
 *
 * Allocates and DMA-maps enough PAGE_SIZE pages to cover
 * xprt->sc_max_req_size, one SGE per page, using @flags for the page
 * allocations. A transport reference is taken before posting; it is
 * dropped here if ib_post_recv() fails, otherwise rq_cq_reap() drops
 * it when the receive completes.
 *
 * Returns 0 on success, the ib_post_recv() error if posting fails, or
 * -ENOMEM if no context is available, a page cannot be allocated or
 * mapped, or more than xprt->sc_max_sge SGEs would be required.
 */
int svc_rdma_post_recv(struct svcxprt_rdma *xprt, gfp_t flags)
{
	struct ib_recv_wr recv_wr, *bad_recv_wr;
	struct svc_rdma_op_ctxt *ctxt;
	struct page *page;
	dma_addr_t pa;
	int sge_no;
	int buflen;
	int ret;

	/* svc_rdma_get_context() returns NULL when it cannot allocate. */
	ctxt = svc_rdma_get_context(xprt);
	if (!ctxt)
		return -ENOMEM;

	buflen = 0;
	ctxt->direction = DMA_FROM_DEVICE;
	for (sge_no = 0; buflen < xprt->sc_max_req_size; sge_no++) {
		if (sge_no >= xprt->sc_max_sge) {
			pr_err("svcrdma: Too many sges (%d)\n", sge_no);
			goto err_put_ctxt;
		}
		page = alloc_page(flags);
		if (!page)
			goto err_put_ctxt;
		pa = ib_dma_map_page(xprt->sc_cm_id->device,
				     page, 0, PAGE_SIZE,
				     DMA_FROM_DEVICE);
		if (ib_dma_mapping_error(xprt->sc_cm_id->device, pa)) {
			/*
			 * This page is not yet covered by ctxt->count,
			 * so svc_rdma_put_context() will not release
			 * it; drop it here to avoid leaking it.
			 */
			put_page(page);
			goto err_put_ctxt;
		}
		atomic_inc(&xprt->sc_dma_used);
		ctxt->pages[sge_no] = page;
		ctxt->sge[sge_no].addr = pa;
		ctxt->sge[sge_no].length = PAGE_SIZE;
		ctxt->sge[sge_no].lkey = xprt->sc_dma_lkey;
		ctxt->count = sge_no + 1;
		buflen += PAGE_SIZE;
	}
	recv_wr.next = NULL;
	recv_wr.sg_list = &ctxt->sge[0];
	recv_wr.num_sge = ctxt->count;
	recv_wr.wr_id = (u64)(unsigned long)ctxt;

	svc_xprt_get(&xprt->sc_xprt);
	ret = ib_post_recv(xprt->sc_qp, &recv_wr, &bad_recv_wr);
	if (ret) {
		svc_rdma_unmap_dma(ctxt);
		svc_rdma_put_context(ctxt, 1);
		svc_xprt_put(&xprt->sc_xprt);
	}
	return ret;

 err_put_ctxt:
	/* Unmap and release the pages that were fully set up. */
	svc_rdma_unmap_dma(ctxt);
	svc_rdma_put_context(ctxt, 1);
	return -ENOMEM;
}

/*
 * This function handles the CONNECT_REQUEST event on a listening
 * endpoint. It is passed the cma_id for the _new_ connection. The context in
 * this cma_id is inherited from the listening cma_id and is the svc_xprt
 * structure for the listening endpoint.
 *
 * This function creates a new xprt for the new connection and enqueues it on
 * the accept queue for the listent xprt. When the listen thread is kicked, it
 * will call the recvfrom method on the listen xprt which will accept the new
 * connection.
 */
736
static void handle_connect_req(struct rdma_cm_id *new_cma_id, size_t client_ird)
737 738 739
{
	struct svcxprt_rdma *listen_xprt = new_cma_id->context;
	struct svcxprt_rdma *newxprt;
740
	struct sockaddr *sa;
741 742 743 744 745 746 747 748 749 750 751 752

	/* Create a new transport */
	newxprt = rdma_create_xprt(listen_xprt->sc_xprt.xpt_server, 0);
	if (!newxprt) {
		dprintk("svcrdma: failed to create new transport\n");
		return;
	}
	newxprt->sc_cm_id = new_cma_id;
	new_cma_id->context = newxprt;
	dprintk("svcrdma: Creating newxprt=%p, cm_id=%p, listenxprt=%p\n",
		newxprt, newxprt->sc_cm_id, listen_xprt);

753 754 755
	/* Save client advertised inbound read limit for use later in accept. */
	newxprt->sc_ord = client_ird;

756 757 758 759 760 761
	/* Set the local and remote addresses in the transport */
	sa = (struct sockaddr *)&newxprt->sc_cm_id->route.addr.dst_addr;
	svc_xprt_set_remote(&newxprt->sc_xprt, sa, svc_addr_len(sa));
	sa = (struct sockaddr *)&newxprt->sc_cm_id->route.addr.src_addr;
	svc_xprt_set_local(&newxprt->sc_xprt, sa, svc_addr_len(sa));

762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786
	/*
	 * Enqueue the new transport on the accept queue of the listening
	 * transport
	 */
	spin_lock_bh(&listen_xprt->sc_lock);
	list_add_tail(&newxprt->sc_accept_q, &listen_xprt->sc_accept_q);
	spin_unlock_bh(&listen_xprt->sc_lock);

	set_bit(XPT_CONN, &listen_xprt->sc_xprt.xpt_flags);
	svc_xprt_enqueue(&listen_xprt->sc_xprt);
}

/*
 * Handles events generated on the listening endpoint. These events will
 * either be incoming connect requests or adapter removal events.
 *
 * Connect requests are passed to handle_connect_req() together with
 * the initiator depth from the event. Device removal marks the
 * listening transport for close. All other events are only logged.
 *
 * Always returns 0.
 */
static int rdma_listen_handler(struct rdma_cm_id *cma_id,
			       struct rdma_cm_event *event)
{
	struct svcxprt_rdma *xprt = cma_id->context;
	int ret = 0;

	switch (event->event) {
	case RDMA_CM_EVENT_CONNECT_REQUEST:
		dprintk("svcrdma: Connect request on cma_id=%p, xprt = %p, "
			"event = %s (%d)\n", cma_id, cma_id->context,
			rdma_event_msg(event->event), event->event);
		handle_connect_req(cma_id,
				   event->param.conn.initiator_depth);
		break;

	case RDMA_CM_EVENT_ESTABLISHED:
		/* Accept complete */
		dprintk("svcrdma: Connection completed on LISTEN xprt=%p, "
			"cm_id=%p\n", xprt, cma_id);
		break;

	case RDMA_CM_EVENT_DEVICE_REMOVAL:
		dprintk("svcrdma: Device removal xprt=%p, cm_id=%p\n",
			xprt, cma_id);
		if (xprt)
			set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags);
		break;

	default:
		dprintk("svcrdma: Unexpected event on listening endpoint %p, "
			"event = %s (%d)\n", cma_id,
			rdma_event_msg(event->event), event->event);
		break;
	}

	return ret;
}

/*
 * Handles connection manager events on a connected (DTO) endpoint.
 *
 *  - ESTABLISHED: takes a transport reference, clears
 *    RDMAXPRT_CONN_PENDING and enqueues the transport.
 *  - DISCONNECTED / DEVICE_REMOVAL: marks the transport for close,
 *    enqueues it and drops a transport reference.
 *  - anything else: only logged.
 *
 * NOTE(review): the ESTABLISHED branch uses xprt without the NULL check
 * the other branches perform - confirm the context can never be NULL
 * for that event.
 *
 * Always returns 0.
 */
static int rdma_cma_handler(struct rdma_cm_id *cma_id,
			    struct rdma_cm_event *event)
{
	struct svc_xprt *xprt = cma_id->context;
	struct svcxprt_rdma *rdma =
		container_of(xprt, struct svcxprt_rdma, sc_xprt);
	switch (event->event) {
	case RDMA_CM_EVENT_ESTABLISHED:
		/* Accept complete */
		svc_xprt_get(xprt);
		dprintk("svcrdma: Connection completed on DTO xprt=%p, "
			"cm_id=%p\n", xprt, cma_id);
		clear_bit(RDMAXPRT_CONN_PENDING, &rdma->sc_flags);
		svc_xprt_enqueue(xprt);
		break;
	case RDMA_CM_EVENT_DISCONNECTED:
		dprintk("svcrdma: Disconnect on DTO xprt=%p, cm_id=%p\n",
			xprt, cma_id);
		if (xprt) {
			set_bit(XPT_CLOSE, &xprt->xpt_flags);
			svc_xprt_enqueue(xprt);
			svc_xprt_put(xprt);
		}
		break;
	case RDMA_CM_EVENT_DEVICE_REMOVAL:
		dprintk("svcrdma: Device removal cma_id=%p, xprt = %p, "
			"event = %s (%d)\n", cma_id, xprt,
			rdma_event_msg(event->event), event->event);
		if (xprt) {
			set_bit(XPT_CLOSE, &xprt->xpt_flags);
			svc_xprt_enqueue(xprt);
			svc_xprt_put(xprt);
		}
		break;
	default:
		dprintk("svcrdma: Unexpected event on DTO endpoint %p, "
			"event = %s (%d)\n", cma_id,
			rdma_event_msg(event->event), event->event);
		break;
	}
	return 0;
}

/*
 * Create a listening RDMA service endpoint.
 */
static struct svc_xprt *svc_rdma_create(struct svc_serv *serv,
863
					struct net *net,
864 865 866 867 868 869 870 871
					struct sockaddr *sa, int salen,
					int flags)
{
	struct rdma_cm_id *listen_id;
	struct svcxprt_rdma *cma_xprt;
	int ret;

	dprintk("svcrdma: Creating RDMA socket\n");
872 873 874 875
	if (sa->sa_family != AF_INET) {
		dprintk("svcrdma: Address family %d is not supported.\n", sa->sa_family);
		return ERR_PTR(-EAFNOSUPPORT);
	}
876 877
	cma_xprt = rdma_create_xprt(serv, 1);
	if (!cma_xprt)
878
		return ERR_PTR(-ENOMEM);
879

880 881
	listen_id = rdma_create_id(&init_net, rdma_listen_handler, cma_xprt,
				   RDMA_PS_TCP, IB_QPT_RC);
882
	if (IS_ERR(listen_id)) {
883 884 885
		ret = PTR_ERR(listen_id);
		dprintk("svcrdma: rdma_create_id failed = %d\n", ret);
		goto err0;
886
	}
887

888 889 890
	ret = rdma_bind_addr(listen_id, sa);
	if (ret) {
		dprintk("svcrdma: rdma_bind_addr failed = %d\n", ret);
891
		goto err1;
892 893 894 895 896 897
	}
	cma_xprt->sc_cm_id = listen_id;

	ret = rdma_listen(listen_id, RPCRDMA_LISTEN_BACKLOG);
	if (ret) {
		dprintk("svcrdma: rdma_listen failed = %d\n", ret);
898
		goto err1;
899 900 901 902 903 904 905 906 907 908
	}

	/*
	 * We need to use the address from the cm_id in case the
	 * caller specified 0 for the port number.
	 */
	sa = (struct sockaddr *)&cma_xprt->sc_cm_id->route.addr.src_addr;
	svc_xprt_set_local(&cma_xprt->sc_xprt, sa, salen);

	return &cma_xprt->sc_xprt;
909 910 911 912 913 914

 err1:
	rdma_destroy_id(listen_id);
 err0:
	kfree(cma_xprt);
	return ERR_PTR(ret);
915 916
}

T
Tom Tucker 已提交
917 918 919
static struct svc_rdma_fastreg_mr *rdma_alloc_frmr(struct svcxprt_rdma *xprt)
{
	struct ib_mr *mr;
920
	struct scatterlist *sg;
T
Tom Tucker 已提交
921
	struct svc_rdma_fastreg_mr *frmr;
922
	u32 num_sg;
T
Tom Tucker 已提交
923 924 925 926 927

	frmr = kmalloc(sizeof(*frmr), GFP_KERNEL);
	if (!frmr)
		goto err;

928 929
	num_sg = min_t(u32, RPCSVC_MAXPAGES, xprt->sc_frmr_pg_list_len);
	mr = ib_alloc_mr(xprt->sc_pd, IB_MR_TYPE_MEM_REG, num_sg);
930
	if (IS_ERR(mr))
T
Tom Tucker 已提交
931 932
		goto err_free_frmr;

933 934
	sg = kcalloc(RPCSVC_MAXPAGES, sizeof(*sg), GFP_KERNEL);
	if (!sg)
T
Tom Tucker 已提交
935 936
		goto err_free_mr;

937 938
	sg_init_table(sg, RPCSVC_MAXPAGES);

T
Tom Tucker 已提交
939
	frmr->mr = mr;
940
	frmr->sg = sg;
T
Tom Tucker 已提交
941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959
	INIT_LIST_HEAD(&frmr->frmr_list);
	return frmr;

 err_free_mr:
	ib_dereg_mr(mr);
 err_free_frmr:
	kfree(frmr);
 err:
	return ERR_PTR(-ENOMEM);
}

/*
 * Release every FRMR on xprt->sc_frmr_q: its scatterlist, its MR and
 * the descriptor itself. The list is walked without taking
 * sc_frmr_q_lock.
 */
static void rdma_dealloc_frmr_q(struct svcxprt_rdma *xprt)
{
	struct svc_rdma_fastreg_mr *frmr;

	while (!list_empty(&xprt->sc_frmr_q)) {
		frmr = list_entry(xprt->sc_frmr_q.next,
				  struct svc_rdma_fastreg_mr, frmr_list);
		list_del_init(&frmr->frmr_list);
		kfree(frmr->sg);
		ib_dereg_mr(frmr->mr);
		kfree(frmr);
	}
}

/*
 * Obtain an FRMR for @rdma.
 *
 * Reuses the first descriptor on sc_frmr_q (with sg_nents reset to 0)
 * when one is available; otherwise allocates a new one.
 *
 * Returns the descriptor, or the ERR_PTR from rdma_alloc_frmr() when a
 * new one cannot be allocated - callers must test with IS_ERR(), not
 * for NULL.
 */
struct svc_rdma_fastreg_mr *svc_rdma_get_frmr(struct svcxprt_rdma *rdma)
{
	struct svc_rdma_fastreg_mr *frmr = NULL;

	spin_lock_bh(&rdma->sc_frmr_q_lock);
	if (!list_empty(&rdma->sc_frmr_q)) {
		frmr = list_entry(rdma->sc_frmr_q.next,
				  struct svc_rdma_fastreg_mr, frmr_list);
		list_del_init(&frmr->frmr_list);
		frmr->sg_nents = 0;
	}
	spin_unlock_bh(&rdma->sc_frmr_q_lock);
	if (frmr)
		return frmr;

	return rdma_alloc_frmr(rdma);
}

void svc_rdma_put_frmr(struct svcxprt_rdma *rdma,
		       struct svc_rdma_fastreg_mr *frmr)
{
	if (frmr) {
988 989 990
		ib_dma_unmap_sg(rdma->sc_cm_id->device,
				frmr->sg, frmr->sg_nents, frmr->direction);
		atomic_dec(&rdma->sc_dma_used);
T
Tom Tucker 已提交
991
		spin_lock_bh(&rdma->sc_frmr_q_lock);
992
		WARN_ON_ONCE(!list_empty(&frmr->frmr_list));
T
Tom Tucker 已提交
993 994 995 996 997
		list_add(&frmr->frmr_list, &rdma->sc_frmr_q);
		spin_unlock_bh(&rdma->sc_frmr_q_lock);
	}
}

998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013
/*
 * This is the xpo_recvfrom function for listening endpoints. Its
 * purpose is to accept incoming connections. The CMA callback handler
 * has already created a new transport and attached it to the new CMA
 * ID.
 *
 * There is a queue of pending connections hung on the listening
 * transport. This queue contains the new svc_xprt structure. This
 * function takes svc_xprt structures off the accept_q and completes
 * the connection.
 */
static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
{
	struct svcxprt_rdma *listen_rdma;
	struct svcxprt_rdma *newxprt = NULL;
	struct rdma_conn_param conn_param;
1014
	struct ib_cq_init_attr cq_attr = {};
1015
	struct ib_qp_init_attr qp_attr;
1016
	struct ib_device *dev;
1017
	int uninitialized_var(dma_mr_acc);
M
Michael Wang 已提交
1018
	int need_dma_mr = 0;
1019
	unsigned int i;
1020
	int ret = 0;
1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039

	listen_rdma = container_of(xprt, struct svcxprt_rdma, sc_xprt);
	clear_bit(XPT_CONN, &xprt->xpt_flags);
	/* Get the next entry off the accept list */
	spin_lock_bh(&listen_rdma->sc_lock);
	if (!list_empty(&listen_rdma->sc_accept_q)) {
		newxprt = list_entry(listen_rdma->sc_accept_q.next,
				     struct svcxprt_rdma, sc_accept_q);
		list_del_init(&newxprt->sc_accept_q);
	}
	if (!list_empty(&listen_rdma->sc_accept_q))
		set_bit(XPT_CONN, &listen_rdma->sc_xprt.xpt_flags);
	spin_unlock_bh(&listen_rdma->sc_lock);
	if (!newxprt)
		return NULL;

	dprintk("svcrdma: newxprt from accept queue = %p, cm_id=%p\n",
		newxprt, newxprt->sc_cm_id);

1040
	dev = newxprt->sc_cm_id->device;
1041 1042 1043

	/* Qualify the transport resource defaults with the
	 * capabilities of this particular device */
1044
	newxprt->sc_max_sge = min((size_t)dev->attrs.max_sge,
1045
				  (size_t)RPCSVC_MAXPAGES);
1046
	newxprt->sc_max_sge_rd = min_t(size_t, dev->attrs.max_sge_rd,
1047
				       RPCSVC_MAXPAGES);
1048
	newxprt->sc_max_req_size = svcrdma_max_req_size;
1049 1050 1051 1052 1053 1054 1055
	newxprt->sc_max_requests = min_t(u32, dev->attrs.max_qp_wr,
					 svcrdma_max_requests);
	newxprt->sc_max_bc_requests = min_t(u32, dev->attrs.max_qp_wr,
					    svcrdma_max_bc_requests);
	newxprt->sc_rq_depth = newxprt->sc_max_requests +
			       newxprt->sc_max_bc_requests;
	newxprt->sc_sq_depth = RPCRDMA_SQ_DEPTH_MULT * newxprt->sc_rq_depth;
1056

1057 1058
	if (!svc_rdma_prealloc_ctxts(newxprt))
		goto errout;
1059 1060
	if (!svc_rdma_prealloc_maps(newxprt))
		goto errout;
1061

1062 1063 1064 1065
	/*
	 * Limit ORD based on client limit, local device limit, and
	 * configured svcrdma limit.
	 */
1066
	newxprt->sc_ord = min_t(size_t, dev->attrs.max_qp_rd_atom, newxprt->sc_ord);
1067
	newxprt->sc_ord = min_t(size_t,	svcrdma_ord, newxprt->sc_ord);
1068

1069
	newxprt->sc_pd = ib_alloc_pd(dev);
1070 1071 1072 1073
	if (IS_ERR(newxprt->sc_pd)) {
		dprintk("svcrdma: error creating PD for connect request\n");
		goto errout;
	}
1074
	cq_attr.cqe = newxprt->sc_sq_depth;
1075
	newxprt->sc_sq_cq = ib_create_cq(dev,
1076 1077 1078
					 sq_comp_handler,
					 cq_event_handler,
					 newxprt,
1079
					 &cq_attr);
1080 1081 1082 1083
	if (IS_ERR(newxprt->sc_sq_cq)) {
		dprintk("svcrdma: error creating SQ CQ for connect request\n");
		goto errout;
	}
1084
	cq_attr.cqe = newxprt->sc_rq_depth;
1085
	newxprt->sc_rq_cq = ib_create_cq(dev,
1086 1087 1088
					 rq_comp_handler,
					 cq_event_handler,
					 newxprt,
1089
					 &cq_attr);
1090 1091 1092 1093 1094 1095 1096 1097 1098
	if (IS_ERR(newxprt->sc_rq_cq)) {
		dprintk("svcrdma: error creating RQ CQ for connect request\n");
		goto errout;
	}

	memset(&qp_attr, 0, sizeof qp_attr);
	qp_attr.event_handler = qp_event_handler;
	qp_attr.qp_context = &newxprt->sc_xprt;
	qp_attr.cap.max_send_wr = newxprt->sc_sq_depth;
1099
	qp_attr.cap.max_recv_wr = newxprt->sc_rq_depth;
1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112
	qp_attr.cap.max_send_sge = newxprt->sc_max_sge;
	qp_attr.cap.max_recv_sge = newxprt->sc_max_sge;
	qp_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
	qp_attr.qp_type = IB_QPT_RC;
	qp_attr.send_cq = newxprt->sc_sq_cq;
	qp_attr.recv_cq = newxprt->sc_rq_cq;
	dprintk("svcrdma: newxprt->sc_cm_id=%p, newxprt->sc_pd=%p\n"
		"    cm_id->device=%p, sc_pd->device=%p\n"
		"    cap.max_send_wr = %d\n"
		"    cap.max_recv_wr = %d\n"
		"    cap.max_send_sge = %d\n"
		"    cap.max_recv_sge = %d\n",
		newxprt->sc_cm_id, newxprt->sc_pd,
1113
		dev, newxprt->sc_pd->device,
1114 1115 1116 1117 1118 1119 1120
		qp_attr.cap.max_send_wr,
		qp_attr.cap.max_recv_wr,
		qp_attr.cap.max_send_sge,
		qp_attr.cap.max_recv_sge);

	ret = rdma_create_qp(newxprt->sc_cm_id, newxprt->sc_pd, &qp_attr);
	if (ret) {
1121 1122
		dprintk("svcrdma: failed to create QP, ret=%d\n", ret);
		goto errout;
1123 1124 1125
	}
	newxprt->sc_qp = newxprt->sc_cm_id->qp;

1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147
	/*
	 * Use the most secure set of MR resources based on the
	 * transport type and available memory management features in
	 * the device. Here's the table implemented below:
	 *
	 *		Fast	Global	DMA	Remote WR
	 *		Reg	LKEY	MR	Access
	 *		Sup'd	Sup'd	Needed	Needed
	 *
	 * IWARP	N	N	Y	Y
	 *		N	Y	Y	Y
	 *		Y	N	Y	N
	 *		Y	Y	N	-
	 *
	 * IB		N	N	Y	N
	 *		N	Y	N	-
	 *		Y	N	Y	N
	 *		Y	Y	N	-
	 *
	 * NB:	iWARP requires remote write access for the data sink
	 *	of an RDMA_READ. IB does not.
	 */
1148
	newxprt->sc_reader = rdma_read_chunk_lcl;
1149
	if (dev->attrs.device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS) {
1150
		newxprt->sc_frmr_pg_list_len =
1151
			dev->attrs.max_fast_reg_page_list_len;
1152
		newxprt->sc_dev_caps |= SVCRDMA_DEVCAP_FAST_REG;
1153
		newxprt->sc_reader = rdma_read_chunk_frmr;
1154 1155 1156 1157 1158
	}

	/*
	 * Determine if a DMA MR is required and if so, what privs are required
	 */
1159 1160
	if (!rdma_protocol_iwarp(dev, newxprt->sc_cm_id->port_num) &&
	    !rdma_ib_or_roce(dev, newxprt->sc_cm_id->port_num))
1161
		goto errout;
M
Michael Wang 已提交
1162 1163

	if (!(newxprt->sc_dev_caps & SVCRDMA_DEVCAP_FAST_REG) ||
1164
	    !(dev->attrs.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY)) {
M
Michael Wang 已提交
1165 1166
		need_dma_mr = 1;
		dma_mr_acc = IB_ACCESS_LOCAL_WRITE;
1167
		if (rdma_protocol_iwarp(dev, newxprt->sc_cm_id->port_num) &&
M
Michael Wang 已提交
1168 1169
		    !(newxprt->sc_dev_caps & SVCRDMA_DEVCAP_FAST_REG))
			dma_mr_acc |= IB_ACCESS_REMOTE_WRITE;
1170 1171
	}

1172
	if (rdma_protocol_iwarp(dev, newxprt->sc_cm_id->port_num))
M
Michael Wang 已提交
1173 1174
		newxprt->sc_dev_caps |= SVCRDMA_DEVCAP_READ_W_INV;

1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186
	/* Create the DMA MR if needed, otherwise, use the DMA LKEY */
	if (need_dma_mr) {
		/* Register all of physical memory */
		newxprt->sc_phys_mr =
			ib_get_dma_mr(newxprt->sc_pd, dma_mr_acc);
		if (IS_ERR(newxprt->sc_phys_mr)) {
			dprintk("svcrdma: Failed to create DMA MR ret=%d\n",
				ret);
			goto errout;
		}
		newxprt->sc_dma_lkey = newxprt->sc_phys_mr->lkey;
	} else
1187
		newxprt->sc_dma_lkey = dev->local_dma_lkey;
1188

1189
	/* Post receive buffers */
1190
	for (i = 0; i < newxprt->sc_rq_depth; i++) {
1191
		ret = svc_rdma_post_recv(newxprt, GFP_KERNEL);
1192 1193 1194 1195 1196 1197 1198 1199 1200
		if (ret) {
			dprintk("svcrdma: failure posting receive buffers\n");
			goto errout;
		}
	}

	/* Swap out the handler */
	newxprt->sc_cm_id->event_handler = rdma_cma_handler;

1201 1202 1203 1204 1205 1206 1207
	/*
	 * Arm the CQs for the SQ and RQ before accepting so we can't
	 * miss the first message
	 */
	ib_req_notify_cq(newxprt->sc_sq_cq, IB_CQ_NEXT_COMP);
	ib_req_notify_cq(newxprt->sc_rq_cq, IB_CQ_NEXT_COMP);

1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221
	/* Accept Connection */
	set_bit(RDMAXPRT_CONN_PENDING, &newxprt->sc_flags);
	memset(&conn_param, 0, sizeof conn_param);
	conn_param.responder_resources = 0;
	conn_param.initiator_depth = newxprt->sc_ord;
	ret = rdma_accept(newxprt->sc_cm_id, &conn_param);
	if (ret) {
		dprintk("svcrdma: failed to accept new connection, ret=%d\n",
		       ret);
		goto errout;
	}

	dprintk("svcrdma: new connection %p accepted with the following "
		"attributes:\n"
H
Harvey Harrison 已提交
1222
		"    local_ip        : %pI4\n"
1223
		"    local_port	     : %d\n"
H
Harvey Harrison 已提交
1224
		"    remote_ip       : %pI4\n"
1225 1226
		"    remote_port     : %d\n"
		"    max_sge         : %d\n"
1227
		"    max_sge_rd      : %d\n"
1228 1229 1230 1231
		"    sq_depth        : %d\n"
		"    max_requests    : %d\n"
		"    ord             : %d\n",
		newxprt,
H
Harvey Harrison 已提交
1232 1233
		&((struct sockaddr_in *)&newxprt->sc_cm_id->
			 route.addr.src_addr)->sin_addr.s_addr,
1234 1235
		ntohs(((struct sockaddr_in *)&newxprt->sc_cm_id->
		       route.addr.src_addr)->sin_port),
H
Harvey Harrison 已提交
1236 1237
		&((struct sockaddr_in *)&newxprt->sc_cm_id->
			 route.addr.dst_addr)->sin_addr.s_addr,
1238 1239 1240
		ntohs(((struct sockaddr_in *)&newxprt->sc_cm_id->
		       route.addr.dst_addr)->sin_port),
		newxprt->sc_max_sge,
1241
		newxprt->sc_max_sge_rd,
1242 1243 1244 1245 1246 1247 1248 1249
		newxprt->sc_sq_depth,
		newxprt->sc_max_requests,
		newxprt->sc_ord);

	return &newxprt->sc_xprt;

 errout:
	dprintk("svcrdma: failure accepting new connection rc=%d.\n", ret);
1250 1251
	/* Take a reference in case the DTO handler runs */
	svc_xprt_get(&newxprt->sc_xprt);
1252
	if (newxprt->sc_qp && !IS_ERR(newxprt->sc_qp))
1253
		ib_destroy_qp(newxprt->sc_qp);
1254
	rdma_destroy_id(newxprt->sc_cm_id);
1255 1256
	/* This call to put will destroy the transport */
	svc_xprt_put(&newxprt->sc_xprt);
1257 1258 1259 1260 1261 1262 1263
	return NULL;
}

/* No-op: this transport performs no work when a request is released. */
static void svc_rdma_release_rqst(struct svc_rqst *rqstp)
{
}

1264
/*
1265
 * When connected, an svc_xprt has at least two references:
1266 1267 1268 1269 1270 1271 1272 1273
 *
 * - A reference held by the cm_id between the ESTABLISHED and
 *   DISCONNECTED events. If the remote peer disconnected first, this
 *   reference could be gone.
 *
 * - A reference held by the svc_recv code that called this function
 *   as part of close processing.
 *
1274
 * At a minimum one references should still be held.
1275
 */
1276 1277 1278 1279 1280
static void svc_rdma_detach(struct svc_xprt *xprt)
{
	struct svcxprt_rdma *rdma =
		container_of(xprt, struct svcxprt_rdma, sc_xprt);
	dprintk("svc: svc_rdma_detach(%p)\n", xprt);
1281 1282

	/* Disconnect and flush posted WQE */
1283 1284 1285
	rdma_disconnect(rdma->sc_cm_id);
}

1286
static void __svc_rdma_free(struct work_struct *work)
1287
{
1288 1289
	struct svcxprt_rdma *rdma =
		container_of(work, struct svcxprt_rdma, sc_work);
1290 1291 1292
	struct svc_xprt *xprt = &rdma->sc_xprt;

	dprintk("svcrdma: %s(%p)\n", __func__, rdma);
1293

1294
	/* We should only be called from kref_put */
1295
	if (atomic_read(&xprt->xpt_ref.refcount) != 0)
1296
		pr_err("svcrdma: sc_xprt still in use? (%d)\n",
1297
		       atomic_read(&xprt->xpt_ref.refcount));
1298

1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324
	/*
	 * Destroy queued, but not processed read completions. Note
	 * that this cleanup has to be done before destroying the
	 * cm_id because the device ptr is needed to unmap the dma in
	 * svc_rdma_put_context.
	 */
	while (!list_empty(&rdma->sc_read_complete_q)) {
		struct svc_rdma_op_ctxt *ctxt;
		ctxt = list_entry(rdma->sc_read_complete_q.next,
				  struct svc_rdma_op_ctxt,
				  dto_q);
		list_del_init(&ctxt->dto_q);
		svc_rdma_put_context(ctxt, 1);
	}

	/* Destroy queued, but not processed recv completions */
	while (!list_empty(&rdma->sc_rq_dto_q)) {
		struct svc_rdma_op_ctxt *ctxt;
		ctxt = list_entry(rdma->sc_rq_dto_q.next,
				  struct svc_rdma_op_ctxt,
				  dto_q);
		list_del_init(&ctxt->dto_q);
		svc_rdma_put_context(ctxt, 1);
	}

	/* Warn if we leaked a resource or under-referenced */
1325
	if (rdma->sc_ctxt_used != 0)
1326
		pr_err("svcrdma: ctxt still in use? (%d)\n",
1327
		       rdma->sc_ctxt_used);
1328 1329 1330
	if (atomic_read(&rdma->sc_dma_used) != 0)
		pr_err("svcrdma: dma still in use? (%d)\n",
		       atomic_read(&rdma->sc_dma_used));
1331

1332 1333 1334 1335 1336 1337
	/* Final put of backchannel client transport */
	if (xprt->xpt_bc_xprt) {
		xprt_put(xprt->xpt_bc_xprt);
		xprt->xpt_bc_xprt = NULL;
	}

T
Tom Tucker 已提交
1338
	rdma_dealloc_frmr_q(rdma);
1339
	svc_rdma_destroy_ctxts(rdma);
1340
	svc_rdma_destroy_maps(rdma);
T
Tom Tucker 已提交
1341

1342 1343 1344 1345
	/* Destroy the QP if present (not a listener) */
	if (rdma->sc_qp && !IS_ERR(rdma->sc_qp))
		ib_destroy_qp(rdma->sc_qp);

1346 1347
	if (rdma->sc_sq_cq && !IS_ERR(rdma->sc_sq_cq))
		ib_destroy_cq(rdma->sc_sq_cq);
1348

1349 1350
	if (rdma->sc_rq_cq && !IS_ERR(rdma->sc_rq_cq))
		ib_destroy_cq(rdma->sc_rq_cq);
1351

1352 1353
	if (rdma->sc_phys_mr && !IS_ERR(rdma->sc_phys_mr))
		ib_dereg_mr(rdma->sc_phys_mr);
1354

1355 1356
	if (rdma->sc_pd && !IS_ERR(rdma->sc_pd))
		ib_dealloc_pd(rdma->sc_pd);
1357

1358 1359 1360
	/* Destroy the CM ID */
	rdma_destroy_id(rdma->sc_cm_id);

1361
	kfree(rdma);
1362 1363
}

1364 1365 1366 1367 1368
static void svc_rdma_free(struct svc_xprt *xprt)
{
	struct svcxprt_rdma *rdma =
		container_of(xprt, struct svcxprt_rdma, sc_xprt);
	INIT_WORK(&rdma->sc_work, __svc_rdma_free);
1369
	queue_work(svc_rdma_wq, &rdma->sc_work);
1370 1371
}

1372 1373 1374 1375 1376 1377
static int svc_rdma_has_wspace(struct svc_xprt *xprt)
{
	struct svcxprt_rdma *rdma =
		container_of(xprt, struct svcxprt_rdma, sc_xprt);

	/*
S
Steve Wise 已提交
1378
	 * If there are already waiters on the SQ,
1379 1380 1381 1382 1383 1384 1385 1386 1387
	 * return false.
	 */
	if (waitqueue_active(&rdma->sc_send_wait))
		return 0;

	/* Otherwise return true. */
	return 1;
}

/* Always returns 1: every request on this transport is reported as secure. */
static int svc_rdma_secure_port(struct svc_rqst *rqstp)
{
	return 1;
}

1393 1394
int svc_rdma_send(struct svcxprt_rdma *xprt, struct ib_send_wr *wr)
{
1395 1396 1397
	struct ib_send_wr *bad_wr, *n_wr;
	int wr_count;
	int i;
1398 1399 1400
	int ret;

	if (test_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags))
1401
		return -ENOTCONN;
1402

1403 1404 1405 1406
	wr_count = 1;
	for (n_wr = wr->next; n_wr; n_wr = n_wr->next)
		wr_count++;

1407 1408 1409
	/* If the SQ is full, wait until an SQ entry is available */
	while (1) {
		spin_lock_bh(&xprt->sc_lock);
1410
		if (xprt->sc_sq_depth < atomic_read(&xprt->sc_sq_count) + wr_count) {
1411 1412
			spin_unlock_bh(&xprt->sc_lock);
			atomic_inc(&rdma_stat_sq_starve);
1413 1414

			/* See if we can opportunistically reap SQ WR to make room */
1415 1416 1417 1418 1419 1420
			sq_cq_reap(xprt);

			/* Wait until SQ WR available if SQ still full */
			wait_event(xprt->sc_send_wait,
				   atomic_read(&xprt->sc_sq_count) <
				   xprt->sc_sq_depth);
1421
			if (test_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags))
1422
				return -ENOTCONN;
1423 1424
			continue;
		}
1425 1426 1427 1428 1429 1430
		/* Take a transport ref for each WR posted */
		for (i = 0; i < wr_count; i++)
			svc_xprt_get(&xprt->sc_xprt);

		/* Bump used SQ WR count and post */
		atomic_add(wr_count, &xprt->sc_sq_count);
1431
		ret = ib_post_send(xprt->sc_qp, wr, &bad_wr);
1432 1433 1434 1435 1436
		if (ret) {
			set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags);
			atomic_sub(wr_count, &xprt->sc_sq_count);
			for (i = 0; i < wr_count; i ++)
				svc_xprt_put(&xprt->sc_xprt);
1437 1438 1439 1440
			dprintk("svcrdma: failed to post SQ WR rc=%d, "
			       "sc_sq_count=%d, sc_sq_depth=%d\n",
			       ret, atomic_read(&xprt->sc_sq_count),
			       xprt->sc_sq_depth);
1441
		}
1442
		spin_unlock_bh(&xprt->sc_lock);
1443 1444
		if (ret)
			wake_up(&xprt->sc_send_wait);
1445 1446 1447 1448 1449
		break;
	}
	return ret;
}

1450 1451
void svc_rdma_send_error(struct svcxprt_rdma *xprt, struct rpcrdma_msg *rmsgp,
			 enum rpcrdma_errcode err)
1452 1453 1454 1455
{
	struct ib_send_wr err_wr;
	struct page *p;
	struct svc_rdma_op_ctxt *ctxt;
1456
	__be32 *va;
1457 1458 1459
	int length;
	int ret;

1460 1461 1462
	p = alloc_page(GFP_KERNEL);
	if (!p)
		return;
1463 1464 1465 1466 1467
	va = page_address(p);

	/* XDR encode error */
	length = svc_rdma_xdr_encode_error(xprt, rmsgp, err, va);

1468 1469 1470 1471 1472
	ctxt = svc_rdma_get_context(xprt);
	ctxt->direction = DMA_FROM_DEVICE;
	ctxt->count = 1;
	ctxt->pages[0] = p;

1473
	/* Prepare SGE for local address */
1474 1475 1476
	ctxt->sge[0].addr = ib_dma_map_page(xprt->sc_cm_id->device,
					    p, 0, length, DMA_FROM_DEVICE);
	if (ib_dma_mapping_error(xprt->sc_cm_id->device, ctxt->sge[0].addr)) {
1477
		put_page(p);
1478
		svc_rdma_put_context(ctxt, 1);
1479 1480 1481
		return;
	}
	atomic_inc(&xprt->sc_dma_used);
1482 1483
	ctxt->sge[0].lkey = xprt->sc_dma_lkey;
	ctxt->sge[0].length = length;
1484 1485 1486 1487 1488

	/* Prepare SEND WR */
	memset(&err_wr, 0, sizeof err_wr);
	ctxt->wr_op = IB_WR_SEND;
	err_wr.wr_id = (unsigned long)ctxt;
1489
	err_wr.sg_list = ctxt->sge;
1490 1491 1492 1493 1494 1495 1496
	err_wr.num_sge = 1;
	err_wr.opcode = IB_WR_SEND;
	err_wr.send_flags = IB_SEND_SIGNALED;

	/* Post It */
	ret = svc_rdma_send(xprt, &err_wr);
	if (ret) {
1497 1498
		dprintk("svcrdma: Error %d posting send for protocol error\n",
			ret);
1499
		svc_rdma_unmap_dma(ctxt);
1500 1501 1502
		svc_rdma_put_context(ctxt, 1);
	}
}