/*
 * Copyright (c) 2014 Open Grid Computing, Inc. All rights reserved.
 * Copyright (c) 2005-2007 Network Appliance, Inc. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the BSD-type
 * license below:
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 *      Redistributions of source code must retain the above copyright
 *      notice, this list of conditions and the following disclaimer.
 *
 *      Redistributions in binary form must reproduce the above
 *      copyright notice, this list of conditions and the following
 *      disclaimer in the documentation and/or other materials provided
 *      with the distribution.
 *
 *      Neither the name of the Network Appliance, Inc. nor the names of
 *      its contributors may be used to endorse or promote products
 *      derived from this software without specific prior written
 *      permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * Author: Tom Tucker <tom@opengridcomputing.com>
 */

#include <linux/sunrpc/svc_xprt.h>
#include <linux/sunrpc/debug.h>
#include <linux/sunrpc/rpc_rdma.h>
46
#include <linux/interrupt.h>
47
#include <linux/sched.h>
48
#include <linux/slab.h>
49
#include <linux/spinlock.h>
50
#include <linux/workqueue.h>
51 52 53
#include <rdma/ib_verbs.h>
#include <rdma/rdma_cm.h>
#include <linux/sunrpc/svc_rdma.h>
54
#include <linux/export.h>
55
#include "xprt_rdma.h"
56 57 58

#define RPCDBG_FACILITY	RPCDBG_SVCXPRT

59
/* Forward declarations for helpers and transport methods defined below. */
static struct svcxprt_rdma *rdma_create_xprt(struct svc_serv *, int);
static struct svc_xprt *svc_rdma_create(struct svc_serv *serv,
					struct net *net,
					struct sockaddr *sa, int salen,
					int flags);
static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt);
static void svc_rdma_release_rqst(struct svc_rqst *);
static void dto_tasklet_func(unsigned long data);
static void svc_rdma_detach(struct svc_xprt *xprt);
static void svc_rdma_free(struct svc_xprt *xprt);
static int svc_rdma_has_wspace(struct svc_xprt *xprt);
static int svc_rdma_secure_port(struct svc_rqst *);
static void rq_cq_reap(struct svcxprt_rdma *xprt);
static void sq_cq_reap(struct svcxprt_rdma *xprt);

R
Roel Kluin 已提交
74
/*
 * One tasklet services every transport queued on dto_xprt_q; dto_lock
 * serializes access to that list between the completion handlers and
 * dto_tasklet_func().
 */
static DECLARE_TASKLET(dto_tasklet, dto_tasklet_func, 0UL);
static DEFINE_SPINLOCK(dto_lock);
static LIST_HEAD(dto_xprt_q);

static struct svc_xprt_ops svc_rdma_ops = {
	.xpo_create = svc_rdma_create,
	.xpo_recvfrom = svc_rdma_recvfrom,
	.xpo_sendto = svc_rdma_sendto,
	.xpo_release_rqst = svc_rdma_release_rqst,
	.xpo_detach = svc_rdma_detach,
	.xpo_free = svc_rdma_free,
	.xpo_prep_reply_hdr = svc_rdma_prep_reply_hdr,
	.xpo_has_wspace = svc_rdma_has_wspace,
	.xpo_accept = svc_rdma_accept,
88
	.xpo_secure_port = svc_rdma_secure_port,
89 90 91 92 93 94
};

struct svc_xprt_class svc_rdma_class = {
	.xcl_name = "rdma",
	.xcl_owner = THIS_MODULE,
	.xcl_ops = &svc_rdma_ops,
95
	.xcl_max_payload = RPCSVC_MAXPAYLOAD_RDMA,
96
	.xcl_ident = XPRT_TRANSPORT_RDMA,
97 98
};

99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155
#if defined(CONFIG_SUNRPC_BACKCHANNEL)
/* Forward declarations for the backchannel transport methods below. */
static struct svc_xprt *svc_rdma_bc_create(struct svc_serv *, struct net *,
					   struct sockaddr *, int, int);
static void svc_rdma_bc_detach(struct svc_xprt *);
static void svc_rdma_bc_free(struct svc_xprt *);

/*
 * Method table for the backchannel transport class. Only create,
 * detach, free, reply-header preparation and the secure-port check
 * are provided.
 */
static struct svc_xprt_ops svc_rdma_bc_ops = {
	.xpo_create = svc_rdma_bc_create,
	.xpo_detach = svc_rdma_bc_detach,
	.xpo_free = svc_rdma_bc_free,
	.xpo_prep_reply_hdr = svc_rdma_prep_reply_hdr,
	.xpo_secure_port = svc_rdma_secure_port,
};

/*
 * Transport class descriptor for the "rdma-bc" backchannel transport.
 * The maximum payload is 1024 bytes less the minimum RPC/RDMA header.
 */
struct svc_xprt_class svc_rdma_bc_class = {
	.xcl_name = "rdma-bc",
	.xcl_owner = THIS_MODULE,
	.xcl_ops = &svc_rdma_bc_ops,
	.xcl_max_payload = (1024 - RPCRDMA_HDRLEN_MIN)
};

static struct svc_xprt *svc_rdma_bc_create(struct svc_serv *serv,
					   struct net *net,
					   struct sockaddr *sa, int salen,
					   int flags)
{
	struct svcxprt_rdma *cma_xprt;
	struct svc_xprt *xprt;

	cma_xprt = rdma_create_xprt(serv, 0);
	if (!cma_xprt)
		return ERR_PTR(-ENOMEM);
	xprt = &cma_xprt->sc_xprt;

	svc_xprt_init(net, &svc_rdma_bc_class, xprt, serv);
	serv->sv_bc_xprt = xprt;

	dprintk("svcrdma: %s(%p)\n", __func__, xprt);
	return xprt;
}

/* Backchannel detach method: only emits a debug message. */
static void svc_rdma_bc_detach(struct svc_xprt *xprt)
{
	dprintk("svcrdma: %s(%p)\n", __func__, xprt);
}

/*
 * Backchannel free method: releases the svcxprt_rdma that embeds
 * @xprt. A NULL @xprt is logged and otherwise ignored.
 */
static void svc_rdma_bc_free(struct svc_xprt *xprt)
{
	struct svcxprt_rdma *rdma;

	rdma = container_of(xprt, struct svcxprt_rdma, sc_xprt);
	dprintk("svcrdma: %s(%p)\n", __func__, xprt);
	if (!xprt)
		return;
	kfree(rdma);
}
#endif	/* CONFIG_SUNRPC_BACKCHANNEL */

156 157
/*
 * Allocate one operation context bound to @xprt.
 *
 * Only the back-pointer and the two list heads are initialized; all
 * other fields are left as returned by kmalloc().
 *
 * Returns the new context, or NULL if the allocation fails.
 */
static struct svc_rdma_op_ctxt *alloc_ctxt(struct svcxprt_rdma *xprt,
					   gfp_t flags)
{
	struct svc_rdma_op_ctxt *ctxt;

	ctxt = kmalloc(sizeof(*ctxt), flags);
	if (ctxt) {
		ctxt->xprt = xprt;
		INIT_LIST_HEAD(&ctxt->free);
		INIT_LIST_HEAD(&ctxt->dto_q);
	}
	return ctxt;
}

static bool svc_rdma_prealloc_ctxts(struct svcxprt_rdma *xprt)
{
172
	unsigned int i;
173 174 175 176

	/* Each RPC/RDMA credit can consume a number of send
	 * and receive WQEs. One ctxt is allocated for each.
	 */
177
	i = xprt->sc_sq_depth + xprt->sc_rq_depth;
178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206

	while (i--) {
		struct svc_rdma_op_ctxt *ctxt;

		ctxt = alloc_ctxt(xprt, GFP_KERNEL);
		if (!ctxt) {
			dprintk("svcrdma: No memory for RDMA ctxt\n");
			return false;
		}
		list_add(&ctxt->free, &xprt->sc_ctxts);
	}
	return true;
}

/*
 * Obtain an operation context for @xprt.
 *
 * Takes the first entry of xprt->sc_ctxts under sc_ctxt_lock; if the
 * list is empty, falls back to a fresh GFP_NOIO allocation. The
 * returned context has count reset to 0 and frmr reset to NULL.
 * sc_ctxt_used is incremented for every context handed out.
 *
 * Returns the context, or NULL (after a one-time warning) if the list
 * is empty and the fallback allocation fails.
 */
struct svc_rdma_op_ctxt *svc_rdma_get_context(struct svcxprt_rdma *xprt)
{
	struct svc_rdma_op_ctxt *ctxt = NULL;

	spin_lock_bh(&xprt->sc_ctxt_lock);
	xprt->sc_ctxt_used++;
	if (list_empty(&xprt->sc_ctxts))
		goto out_empty;

	ctxt = list_first_entry(&xprt->sc_ctxts,
				struct svc_rdma_op_ctxt, free);
	list_del_init(&ctxt->free);
	spin_unlock_bh(&xprt->sc_ctxt_lock);

out:
	ctxt->count = 0;
	ctxt->frmr = NULL;
	return ctxt;

out_empty:
	/* Either pre-allocation missed the mark, or send
	 * queue accounting is broken.
	 */
	spin_unlock_bh(&xprt->sc_ctxt_lock);

	ctxt = alloc_ctxt(xprt, GFP_NOIO);
	if (ctxt)
		goto out;

	/* Undo the optimistic accounting done above. */
	spin_lock_bh(&xprt->sc_ctxt_lock);
	xprt->sc_ctxt_used--;
	spin_unlock_bh(&xprt->sc_ctxt_lock);
	WARN_ONCE(1, "svcrdma: empty RDMA ctxt list?\n");
	return NULL;
}

228
void svc_rdma_unmap_dma(struct svc_rdma_op_ctxt *ctxt)
229 230 231 232
{
	struct svcxprt_rdma *xprt = ctxt->xprt;
	int i;
	for (i = 0; i < ctxt->count && ctxt->sge[i].length; i++) {
T
Tom Tucker 已提交
233 234
		/*
		 * Unmap the DMA addr in the SGE if the lkey matches
C
Christoph Hellwig 已提交
235
		 * the local_dma_lkey, otherwise, ignore it since it is
T
Tom Tucker 已提交
236 237 238
		 * an FRMR lkey and will be unmapped later when the
		 * last WR that uses it completes.
		 */
C
Christoph Hellwig 已提交
239
		if (ctxt->sge[i].lkey == xprt->sc_pd->local_dma_lkey) {
T
Tom Tucker 已提交
240
			atomic_dec(&xprt->sc_dma_used);
241
			ib_dma_unmap_page(xprt->sc_cm_id->device,
T
Tom Tucker 已提交
242 243 244 245
					    ctxt->sge[i].addr,
					    ctxt->sge[i].length,
					    ctxt->direction);
		}
246 247 248
	}
}

249 250
void svc_rdma_put_context(struct svc_rdma_op_ctxt *ctxt, int free_pages)
{
251
	struct svcxprt_rdma *xprt = ctxt->xprt;
252 253 254 255 256 257
	int i;

	if (free_pages)
		for (i = 0; i < ctxt->count; i++)
			put_page(ctxt->pages[i]);

258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273
	spin_lock_bh(&xprt->sc_ctxt_lock);
	xprt->sc_ctxt_used--;
	list_add(&ctxt->free, &xprt->sc_ctxts);
	spin_unlock_bh(&xprt->sc_ctxt_lock);
}

/*
 * Free every context currently on xprt->sc_ctxts.
 *
 * No lock is taken here; presumably the caller guarantees exclusive
 * access to the transport at teardown -- confirm against callers.
 */
static void svc_rdma_destroy_ctxts(struct svcxprt_rdma *xprt)
{
	while (!list_empty(&xprt->sc_ctxts)) {
		struct svc_rdma_op_ctxt *ctxt;

		ctxt = list_first_entry(&xprt->sc_ctxts,
					struct svc_rdma_op_ctxt, free);
		list_del(&ctxt->free);
		kfree(ctxt);
	}
}

276
/*
 * Allocate one request map with its free-list linkage initialized.
 *
 * Returns the new map, or NULL if the allocation fails.
 */
static struct svc_rdma_req_map *alloc_req_map(gfp_t flags)
{
	struct svc_rdma_req_map *map;

	map = kmalloc(sizeof(*map), flags);
	if (map)
		INIT_LIST_HEAD(&map->free);
	return map;
}

/*
 * Pre-populate xprt->sc_maps with sc_max_requests request maps.
 *
 * Returns true on success. On the first allocation failure it returns
 * false; maps already added remain on xprt->sc_maps.
 */
static bool svc_rdma_prealloc_maps(struct svcxprt_rdma *xprt)
{
	unsigned int i;

	/* One for each receive buffer on this connection. */
	i = xprt->sc_max_requests;

	while (i--) {
		struct svc_rdma_req_map *map;

		map = alloc_req_map(GFP_KERNEL);
		if (!map) {
			dprintk("svcrdma: No memory for request map\n");
			return false;
		}
		list_add(&map->free, &xprt->sc_maps);
	}
	return true;
}

/*
 * Obtain a request map for @xprt.
 *
 * Takes the first entry of xprt->sc_maps under sc_map_lock; if the
 * list is empty, falls back to a fresh GFP_NOIO allocation. The
 * returned map has count reset to 0.
 *
 * Returns the map, or NULL (after a one-time warning) if the list is
 * empty and the fallback allocation fails.
 */
struct svc_rdma_req_map *svc_rdma_get_req_map(struct svcxprt_rdma *xprt)
{
	struct svc_rdma_req_map *map = NULL;

	spin_lock(&xprt->sc_map_lock);
	if (list_empty(&xprt->sc_maps))
		goto out_empty;

	map = list_first_entry(&xprt->sc_maps,
			       struct svc_rdma_req_map, free);
	list_del_init(&map->free);
	spin_unlock(&xprt->sc_map_lock);

out:
	map->count = 0;
	return map;

out_empty:
	spin_unlock(&xprt->sc_map_lock);

	/* Pre-allocation amount was incorrect */
	map = alloc_req_map(GFP_NOIO);
	if (map)
		goto out;

	WARN_ONCE(1, "svcrdma: empty request map list?\n");
	return NULL;
}

335 336
void svc_rdma_put_req_map(struct svcxprt_rdma *xprt,
			  struct svc_rdma_req_map *map)
337
{
338 339 340 341 342 343 344 345 346 347 348 349 350 351 352
	spin_lock(&xprt->sc_map_lock);
	list_add(&map->free, &xprt->sc_maps);
	spin_unlock(&xprt->sc_map_lock);
}

/*
 * Free every request map currently on xprt->sc_maps.
 *
 * No lock is taken here; presumably the caller guarantees exclusive
 * access to the transport at teardown -- confirm against callers.
 */
static void svc_rdma_destroy_maps(struct svcxprt_rdma *xprt)
{
	while (!list_empty(&xprt->sc_maps)) {
		struct svc_rdma_req_map *map;

		map = list_first_entry(&xprt->sc_maps,
				       struct svc_rdma_req_map, free);
		list_del(&map->free);
		kfree(map);
	}
}

355 356 357 358
/* ib_cq event handler */
static void cq_event_handler(struct ib_event *event, void *context)
{
	struct svc_xprt *xprt = context;
359 360
	dprintk("svcrdma: received CQ event %s (%d), context=%p\n",
		ib_event_msg(event->event), event->event, context);
361 362 363 364 365 366 367 368 369 370 371 372 373 374
	set_bit(XPT_CLOSE, &xprt->xpt_flags);
}

/* QP event handler */
static void qp_event_handler(struct ib_event *event, void *context)
{
	struct svc_xprt *xprt = context;

	switch (event->event) {
	/* These are considered benign events */
	case IB_EVENT_PATH_MIG:
	case IB_EVENT_COMM_EST:
	case IB_EVENT_SQ_DRAINED:
	case IB_EVENT_QP_LAST_WQE_REACHED:
375 376 377
		dprintk("svcrdma: QP event %s (%d) received for QP=%p\n",
			ib_event_msg(event->event), event->event,
			event->element.qp);
378 379 380 381 382 383 384 385
		break;
	/* These are considered fatal events */
	case IB_EVENT_PATH_MIG_ERR:
	case IB_EVENT_QP_FATAL:
	case IB_EVENT_QP_REQ_ERR:
	case IB_EVENT_QP_ACCESS_ERR:
	case IB_EVENT_DEVICE_FATAL:
	default:
386
		dprintk("svcrdma: QP ERROR event %s (%d) received for QP=%p, "
387
			"closing transport\n",
388 389
			ib_event_msg(event->event), event->event,
			event->element.qp);
390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415
		set_bit(XPT_CLOSE, &xprt->xpt_flags);
		break;
	}
}

/*
 * Data Transfer Operation Tasklet
 *
 * Walks a list of transports with I/O pending, removing entries as
 * they are added to the server's I/O pending list. Two bits indicate
 * if SQ, RQ, or both have I/O pending. The dto_lock is an irqsave
 * spinlock that serializes access to the transport list with the RQ
 * and SQ interrupt handlers.
 *
 * dto_lock is dropped while a transport's completion queues are
 * reaped. The transport reference taken by the completion handler
 * when it queued the transport is released once both queues have
 * been reaped.
 */
static void dto_tasklet_func(unsigned long data)
{
	struct svcxprt_rdma *xprt;
	unsigned long flags;

	spin_lock_irqsave(&dto_lock, flags);
	while (!list_empty(&dto_xprt_q)) {
		xprt = list_entry(dto_xprt_q.next,
				  struct svcxprt_rdma, sc_dto_q);
		list_del_init(&xprt->sc_dto_q);
		spin_unlock_irqrestore(&dto_lock, flags);

		rq_cq_reap(xprt);
		sq_cq_reap(xprt);

		svc_xprt_put(&xprt->sc_xprt);
		spin_lock_irqsave(&dto_lock, flags);
	}
	spin_unlock_irqrestore(&dto_lock, flags);
}

/*
 * Receive Queue Completion Handler
 *
 * Since an RQ completion handler is called on interrupt context, we
 * need to defer the handling of the I/O to a tasklet
 */
static void rq_comp_handler(struct ib_cq *cq, void *cq_context)
{
	struct svcxprt_rdma *xprt = cq_context;
	unsigned long flags;

436 437 438 439
	/* Guard against unconditional flush call for destroyed QP */
	if (atomic_read(&xprt->sc_xprt.xpt_ref.refcount)==0)
		return;

440 441 442 443
	/*
	 * Set the bit regardless of whether or not it's on the list
	 * because it may be on the list already due to an SQ
	 * completion.
444
	 */
445 446 447 448 449 450 451
	set_bit(RDMAXPRT_RQ_PENDING, &xprt->sc_flags);

	/*
	 * If this transport is not already on the DTO transport queue,
	 * add it
	 */
	spin_lock_irqsave(&dto_lock, flags);
452 453
	if (list_empty(&xprt->sc_dto_q)) {
		svc_xprt_get(&xprt->sc_xprt);
454
		list_add_tail(&xprt->sc_dto_q, &dto_xprt_q);
455
	}
456 457 458 459 460 461 462 463 464 465 466
	spin_unlock_irqrestore(&dto_lock, flags);

	/* Tasklet does all the work to avoid irqsave locks. */
	tasklet_schedule(&dto_tasklet);
}

/*
 * rq_cq_reap - Process the RQ CQ.
 *
 * Take all completing WC off the CQE and enqueue the associated DTO
 * context on the dto_q for the transport.
467 468
 *
 * Note that caller must hold a transport reference.
469 470 471 472 473 474 475
 */
static void rq_cq_reap(struct svcxprt_rdma *xprt)
{
	int ret;
	struct ib_wc wc;
	struct svc_rdma_op_ctxt *ctxt = NULL;

476 477 478 479
	if (!test_and_clear_bit(RDMAXPRT_RQ_PENDING, &xprt->sc_flags))
		return;

	ib_req_notify_cq(xprt->sc_rq_cq, IB_CQ_NEXT_COMP);
480 481 482 483 484 485
	atomic_inc(&rdma_stat_rq_poll);

	while ((ret = ib_poll_cq(xprt->sc_rq_cq, 1, &wc)) > 0) {
		ctxt = (struct svc_rdma_op_ctxt *)(unsigned long)wc.wr_id;
		ctxt->wc_status = wc.status;
		ctxt->byte_len = wc.byte_len;
486
		svc_rdma_unmap_dma(ctxt);
487 488
		if (wc.status != IB_WC_SUCCESS) {
			/* Close the transport */
489
			dprintk("svcrdma: transport closing putting ctxt %p\n", ctxt);
490 491
			set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags);
			svc_rdma_put_context(ctxt, 1);
492
			svc_xprt_put(&xprt->sc_xprt);
493 494
			continue;
		}
495
		spin_lock_bh(&xprt->sc_rq_dto_lock);
496
		list_add_tail(&ctxt->dto_q, &xprt->sc_rq_dto_q);
497
		spin_unlock_bh(&xprt->sc_rq_dto_lock);
498
		svc_xprt_put(&xprt->sc_xprt);
499 500 501 502
	}

	if (ctxt)
		atomic_inc(&rdma_stat_rq_prod);
503 504 505 506 507 508 509 510 511

	set_bit(XPT_DATA, &xprt->sc_xprt.xpt_flags);
	/*
	 * If data arrived before established event,
	 * don't enqueue. This defers RPC I/O until the
	 * RDMA connection is complete.
	 */
	if (!test_bit(RDMAXPRT_CONN_PENDING, &xprt->sc_flags))
		svc_xprt_enqueue(&xprt->sc_xprt);
512 513
}

514
/*
515
 * Process a completion context
516 517 518 519
 */
static void process_context(struct svcxprt_rdma *xprt,
			    struct svc_rdma_op_ctxt *ctxt)
{
520 521 522
	struct svc_rdma_op_ctxt *read_hdr;
	int free_pages = 0;

523 524 525 526
	svc_rdma_unmap_dma(ctxt);

	switch (ctxt->wr_op) {
	case IB_WR_SEND:
527
		free_pages = 1;
528 529 530 531 532 533
		break;

	case IB_WR_RDMA_WRITE:
		break;

	case IB_WR_RDMA_READ:
534
	case IB_WR_RDMA_READ_WITH_INV:
S
Steve Wise 已提交
535
		svc_rdma_put_frmr(xprt, ctxt->frmr);
536 537 538 539 540

		if (!test_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags))
			break;

		read_hdr = ctxt->read_hdr;
541
		svc_rdma_put_context(ctxt, 0);
542 543 544 545 546 547 548 549

		spin_lock_bh(&xprt->sc_rq_dto_lock);
		set_bit(XPT_DATA, &xprt->sc_xprt.xpt_flags);
		list_add_tail(&read_hdr->dto_q,
			      &xprt->sc_read_complete_q);
		spin_unlock_bh(&xprt->sc_rq_dto_lock);
		svc_xprt_enqueue(&xprt->sc_xprt);
		return;
550 551

	default:
552 553
		dprintk("svcrdma: unexpected completion opcode=%d\n",
			ctxt->wr_op);
554 555
		break;
	}
556 557

	svc_rdma_put_context(ctxt, free_pages);
558 559
}

560 561
/*
 * Send Queue Completion Handler - potentially called on interrupt context.
562 563
 *
 * Note that caller must hold a transport reference.
564 565 566 567
 */
static void sq_cq_reap(struct svcxprt_rdma *xprt)
{
	struct svc_rdma_op_ctxt *ctxt = NULL;
S
Steve Wise 已提交
568 569
	struct ib_wc wc_a[6];
	struct ib_wc *wc;
570 571 572
	struct ib_cq *cq = xprt->sc_sq_cq;
	int ret;

S
Steve Wise 已提交
573 574
	memset(wc_a, 0, sizeof(wc_a));

575 576 577 578
	if (!test_and_clear_bit(RDMAXPRT_SQ_PENDING, &xprt->sc_flags))
		return;

	ib_req_notify_cq(xprt->sc_sq_cq, IB_CQ_NEXT_COMP);
579
	atomic_inc(&rdma_stat_sq_poll);
S
Steve Wise 已提交
580 581
	while ((ret = ib_poll_cq(cq, ARRAY_SIZE(wc_a), wc_a)) > 0) {
		int i;
582

S
Steve Wise 已提交
583 584 585
		for (i = 0; i < ret; i++) {
			wc = &wc_a[i];
			if (wc->status != IB_WC_SUCCESS) {
586 587
				dprintk("svcrdma: sq wc err status %s (%d)\n",
					ib_wc_status_msg(wc->status),
S
Steve Wise 已提交
588
					wc->status);
589

S
Steve Wise 已提交
590 591 592
				/* Close the transport */
				set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags);
			}
593

S
Steve Wise 已提交
594 595 596 597 598 599 600 601 602 603 604
			/* Decrement used SQ WR count */
			atomic_dec(&xprt->sc_sq_count);
			wake_up(&xprt->sc_send_wait);

			ctxt = (struct svc_rdma_op_ctxt *)
				(unsigned long)wc->wr_id;
			if (ctxt)
				process_context(xprt, ctxt);

			svc_xprt_put(&xprt->sc_xprt);
		}
605 606 607 608 609 610 611 612 613 614 615
	}

	if (ctxt)
		atomic_inc(&rdma_stat_sq_prod);
}

static void sq_comp_handler(struct ib_cq *cq, void *cq_context)
{
	struct svcxprt_rdma *xprt = cq_context;
	unsigned long flags;

616 617 618 619
	/* Guard against unconditional flush call for destroyed QP */
	if (atomic_read(&xprt->sc_xprt.xpt_ref.refcount)==0)
		return;

620 621 622 623
	/*
	 * Set the bit regardless of whether or not it's on the list
	 * because it may be on the list already due to an RQ
	 * completion.
624
	 */
625 626 627 628 629 630 631
	set_bit(RDMAXPRT_SQ_PENDING, &xprt->sc_flags);

	/*
	 * If this transport is not already on the DTO transport queue,
	 * add it
	 */
	spin_lock_irqsave(&dto_lock, flags);
632 633
	if (list_empty(&xprt->sc_dto_q)) {
		svc_xprt_get(&xprt->sc_xprt);
634
		list_add_tail(&xprt->sc_dto_q, &dto_xprt_q);
635
	}
636 637 638 639 640 641 642 643 644 645 646 647 648
	spin_unlock_irqrestore(&dto_lock, flags);

	/* Tasklet does all the work to avoid irqsave locks. */
	tasklet_schedule(&dto_tasklet);
}

static struct svcxprt_rdma *rdma_create_xprt(struct svc_serv *serv,
					     int listener)
{
	struct svcxprt_rdma *cma_xprt = kzalloc(sizeof *cma_xprt, GFP_KERNEL);

	if (!cma_xprt)
		return NULL;
649
	svc_xprt_init(&init_net, &svc_rdma_class, &cma_xprt->sc_xprt, serv);
650 651 652 653
	INIT_LIST_HEAD(&cma_xprt->sc_accept_q);
	INIT_LIST_HEAD(&cma_xprt->sc_dto_q);
	INIT_LIST_HEAD(&cma_xprt->sc_rq_dto_q);
	INIT_LIST_HEAD(&cma_xprt->sc_read_complete_q);
T
Tom Tucker 已提交
654
	INIT_LIST_HEAD(&cma_xprt->sc_frmr_q);
655
	INIT_LIST_HEAD(&cma_xprt->sc_ctxts);
656
	INIT_LIST_HEAD(&cma_xprt->sc_maps);
657 658 659 660
	init_waitqueue_head(&cma_xprt->sc_send_wait);

	spin_lock_init(&cma_xprt->sc_lock);
	spin_lock_init(&cma_xprt->sc_rq_dto_lock);
T
Tom Tucker 已提交
661
	spin_lock_init(&cma_xprt->sc_frmr_q_lock);
662
	spin_lock_init(&cma_xprt->sc_ctxt_lock);
663
	spin_lock_init(&cma_xprt->sc_map_lock);
664

665
	if (listener)
666 667 668 669 670
		set_bit(XPT_LISTENER, &cma_xprt->sc_xprt.xpt_flags);

	return cma_xprt;
}

671
/*
 * Post one receive work request on @xprt's QP.
 *
 * Allocates and DMA-maps enough pages (one SGE per page) to cover
 * sc_max_req_size bytes, then posts them as a single receive WR whose
 * wr_id is the operation context. A transport reference is taken
 * before posting and dropped again if the post fails.
 *
 * @flags: allocation flags used for the receive pages.
 *
 * Returns 0 on success, -ENOMEM if a context, page or DMA mapping
 * could not be obtained or more than sc_max_sge SGEs would be needed,
 * or the error returned by ib_post_recv().
 */
int svc_rdma_post_recv(struct svcxprt_rdma *xprt, gfp_t flags)
{
	struct ib_recv_wr recv_wr, *bad_recv_wr;
	struct svc_rdma_op_ctxt *ctxt;
	struct page *page;
	dma_addr_t pa;
	int sge_no;
	int buflen;
	int ret;

	ctxt = svc_rdma_get_context(xprt);
	/* svc_rdma_get_context() returns NULL when it cannot allocate. */
	if (!ctxt)
		return -ENOMEM;
	buflen = 0;
	ctxt->direction = DMA_FROM_DEVICE;
	for (sge_no = 0; buflen < xprt->sc_max_req_size; sge_no++) {
		if (sge_no >= xprt->sc_max_sge) {
			pr_err("svcrdma: Too many sges (%d)\n", sge_no);
			goto err_put_ctxt;
		}
		page = alloc_page(flags);
		if (!page)
			goto err_put_ctxt;
		ctxt->pages[sge_no] = page;
		pa = ib_dma_map_page(xprt->sc_cm_id->device,
				     page, 0, PAGE_SIZE,
				     DMA_FROM_DEVICE);
		if (ib_dma_mapping_error(xprt->sc_cm_id->device, pa)) {
			/*
			 * ctxt->count does not cover this page yet, so
			 * the cleanup below would not release it.
			 */
			put_page(page);
			goto err_put_ctxt;
		}
		atomic_inc(&xprt->sc_dma_used);
		ctxt->sge[sge_no].addr = pa;
		ctxt->sge[sge_no].length = PAGE_SIZE;
		ctxt->sge[sge_no].lkey = xprt->sc_pd->local_dma_lkey;
		ctxt->count = sge_no + 1;
		buflen += PAGE_SIZE;
	}
	recv_wr.next = NULL;
	recv_wr.sg_list = &ctxt->sge[0];
	recv_wr.num_sge = ctxt->count;
	recv_wr.wr_id = (u64)(unsigned long)ctxt;

	svc_xprt_get(&xprt->sc_xprt);
	ret = ib_post_recv(xprt->sc_qp, &recv_wr, &bad_recv_wr);
	if (ret) {
		svc_rdma_unmap_dma(ctxt);
		svc_rdma_put_context(ctxt, 1);
		svc_xprt_put(&xprt->sc_xprt);
	}
	return ret;

 err_put_ctxt:
	svc_rdma_unmap_dma(ctxt);
	svc_rdma_put_context(ctxt, 1);
	return -ENOMEM;
}

725 726 727 728 729 730 731 732 733 734 735 736 737 738 739
/*
 * Post a replacement receive buffer on @xprt.
 *
 * If the post fails, the failure is logged, the transport is marked
 * for close (XPT_CLOSE) and -ENOTCONN is returned regardless of the
 * underlying error. Returns 0 on success.
 */
int svc_rdma_repost_recv(struct svcxprt_rdma *xprt, gfp_t flags)
{
	int rc = svc_rdma_post_recv(xprt, flags);

	if (!rc)
		return 0;

	pr_err("svcrdma: could not post a receive buffer, err=%d.\n",
	       rc);
	pr_err("svcrdma: closing transport %p.\n", xprt);
	set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags);
	return -ENOTCONN;
}

740 741 742 743 744 745 746 747 748 749 750
/*
 * This function handles the CONNECT_REQUEST event on a listening
 * endpoint. It is passed the cma_id for the _new_ connection. The context in
 * this cma_id is inherited from the listening cma_id and is the svc_xprt
 * structure for the listening endpoint.
 *
 * This function creates a new xprt for the new connection and enqueues it on
 * the accept queue for the listent xprt. When the listen thread is kicked, it
 * will call the recvfrom method on the listen xprt which will accept the new
 * connection.
 */
751
static void handle_connect_req(struct rdma_cm_id *new_cma_id, size_t client_ird)
752 753 754
{
	struct svcxprt_rdma *listen_xprt = new_cma_id->context;
	struct svcxprt_rdma *newxprt;
755
	struct sockaddr *sa;
756 757 758 759 760 761 762 763 764 765 766 767

	/* Create a new transport */
	newxprt = rdma_create_xprt(listen_xprt->sc_xprt.xpt_server, 0);
	if (!newxprt) {
		dprintk("svcrdma: failed to create new transport\n");
		return;
	}
	newxprt->sc_cm_id = new_cma_id;
	new_cma_id->context = newxprt;
	dprintk("svcrdma: Creating newxprt=%p, cm_id=%p, listenxprt=%p\n",
		newxprt, newxprt->sc_cm_id, listen_xprt);

768 769 770
	/* Save client advertised inbound read limit for use later in accept. */
	newxprt->sc_ord = client_ird;

771 772 773 774 775 776
	/* Set the local and remote addresses in the transport */
	sa = (struct sockaddr *)&newxprt->sc_cm_id->route.addr.dst_addr;
	svc_xprt_set_remote(&newxprt->sc_xprt, sa, svc_addr_len(sa));
	sa = (struct sockaddr *)&newxprt->sc_cm_id->route.addr.src_addr;
	svc_xprt_set_local(&newxprt->sc_xprt, sa, svc_addr_len(sa));

777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801
	/*
	 * Enqueue the new transport on the accept queue of the listening
	 * transport
	 */
	spin_lock_bh(&listen_xprt->sc_lock);
	list_add_tail(&newxprt->sc_accept_q, &listen_xprt->sc_accept_q);
	spin_unlock_bh(&listen_xprt->sc_lock);

	set_bit(XPT_CONN, &listen_xprt->sc_xprt.xpt_flags);
	svc_xprt_enqueue(&listen_xprt->sc_xprt);
}

/*
 * Handles events generated on the listening endpoint. These events will
 * be either incoming connect requests or adapter removal events.
 *
 * Connect requests are handed to handle_connect_req() together with
 * the peer's advertised initiator depth. Device removal marks the
 * listening transport for close. All other events are only logged.
 *
 * Always returns 0.
 */
static int rdma_listen_handler(struct rdma_cm_id *cma_id,
			       struct rdma_cm_event *event)
{
	struct svcxprt_rdma *xprt = cma_id->context;
	int ret = 0;

	switch (event->event) {
	case RDMA_CM_EVENT_CONNECT_REQUEST:
		dprintk("svcrdma: Connect request on cma_id=%p, xprt = %p, "
			"event = %s (%d)\n", cma_id, cma_id->context,
			rdma_event_msg(event->event), event->event);
		handle_connect_req(cma_id,
				   event->param.conn.initiator_depth);
		break;

	case RDMA_CM_EVENT_ESTABLISHED:
		/* Accept complete */
		dprintk("svcrdma: Connection completed on LISTEN xprt=%p, "
			"cm_id=%p\n", xprt, cma_id);
		break;

	case RDMA_CM_EVENT_DEVICE_REMOVAL:
		dprintk("svcrdma: Device removal xprt=%p, cm_id=%p\n",
			xprt, cma_id);
		if (xprt)
			set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags);
		break;

	default:
		dprintk("svcrdma: Unexpected event on listening endpoint %p, "
			"event = %s (%d)\n", cma_id,
			rdma_event_msg(event->event), event->event);
		break;
	}

	return ret;
}

/*
 * Connection manager event handler for data-transfer endpoints.
 *
 * ESTABLISHED takes a transport reference, clears
 * RDMAXPRT_CONN_PENDING and enqueues the transport. DISCONNECTED and
 * DEVICE_REMOVAL mark the transport for close, enqueue it and drop a
 * reference. All other events are only logged.
 *
 * NOTE(review): cma_id->context is read here as a struct svc_xprt *,
 * while handle_connect_req() stores a struct svcxprt_rdma * there.
 * This is only equivalent if sc_xprt is the first member of
 * struct svcxprt_rdma -- confirm.
 *
 * Always returns 0.
 */
static int rdma_cma_handler(struct rdma_cm_id *cma_id,
			    struct rdma_cm_event *event)
{
	struct svc_xprt *xprt = cma_id->context;
	struct svcxprt_rdma *rdma =
		container_of(xprt, struct svcxprt_rdma, sc_xprt);
	switch (event->event) {
	case RDMA_CM_EVENT_ESTABLISHED:
		/* Accept complete */
		svc_xprt_get(xprt);
		dprintk("svcrdma: Connection completed on DTO xprt=%p, "
			"cm_id=%p\n", xprt, cma_id);
		clear_bit(RDMAXPRT_CONN_PENDING, &rdma->sc_flags);
		svc_xprt_enqueue(xprt);
		break;
	case RDMA_CM_EVENT_DISCONNECTED:
		dprintk("svcrdma: Disconnect on DTO xprt=%p, cm_id=%p\n",
			xprt, cma_id);
		if (xprt) {
			set_bit(XPT_CLOSE, &xprt->xpt_flags);
			svc_xprt_enqueue(xprt);
			svc_xprt_put(xprt);
		}
		break;
	case RDMA_CM_EVENT_DEVICE_REMOVAL:
		dprintk("svcrdma: Device removal cma_id=%p, xprt = %p, "
			"event = %s (%d)\n", cma_id, xprt,
			rdma_event_msg(event->event), event->event);
		if (xprt) {
			set_bit(XPT_CLOSE, &xprt->xpt_flags);
			svc_xprt_enqueue(xprt);
			svc_xprt_put(xprt);
		}
		break;
	default:
		dprintk("svcrdma: Unexpected event on DTO endpoint %p, "
			"event = %s (%d)\n", cma_id,
			rdma_event_msg(event->event), event->event);
		break;
	}
	return 0;
}

/*
 * Create a listening RDMA service endpoint.
 */
static struct svc_xprt *svc_rdma_create(struct svc_serv *serv,
878
					struct net *net,
879 880 881 882 883 884 885 886
					struct sockaddr *sa, int salen,
					int flags)
{
	struct rdma_cm_id *listen_id;
	struct svcxprt_rdma *cma_xprt;
	int ret;

	dprintk("svcrdma: Creating RDMA socket\n");
887 888 889 890
	if (sa->sa_family != AF_INET) {
		dprintk("svcrdma: Address family %d is not supported.\n", sa->sa_family);
		return ERR_PTR(-EAFNOSUPPORT);
	}
891 892
	cma_xprt = rdma_create_xprt(serv, 1);
	if (!cma_xprt)
893
		return ERR_PTR(-ENOMEM);
894

895 896
	listen_id = rdma_create_id(&init_net, rdma_listen_handler, cma_xprt,
				   RDMA_PS_TCP, IB_QPT_RC);
897
	if (IS_ERR(listen_id)) {
898 899 900
		ret = PTR_ERR(listen_id);
		dprintk("svcrdma: rdma_create_id failed = %d\n", ret);
		goto err0;
901
	}
902

903 904 905
	ret = rdma_bind_addr(listen_id, sa);
	if (ret) {
		dprintk("svcrdma: rdma_bind_addr failed = %d\n", ret);
906
		goto err1;
907 908 909 910 911 912
	}
	cma_xprt->sc_cm_id = listen_id;

	ret = rdma_listen(listen_id, RPCRDMA_LISTEN_BACKLOG);
	if (ret) {
		dprintk("svcrdma: rdma_listen failed = %d\n", ret);
913
		goto err1;
914 915 916 917 918 919 920 921 922 923
	}

	/*
	 * We need to use the address from the cm_id in case the
	 * caller specified 0 for the port number.
	 */
	sa = (struct sockaddr *)&cma_xprt->sc_cm_id->route.addr.src_addr;
	svc_xprt_set_local(&cma_xprt->sc_xprt, sa, salen);

	return &cma_xprt->sc_xprt;
924 925 926 927 928 929

 err1:
	rdma_destroy_id(listen_id);
 err0:
	kfree(cma_xprt);
	return ERR_PTR(ret);
930 931
}

T
Tom Tucker 已提交
932 933 934
static struct svc_rdma_fastreg_mr *rdma_alloc_frmr(struct svcxprt_rdma *xprt)
{
	struct ib_mr *mr;
935
	struct scatterlist *sg;
T
Tom Tucker 已提交
936
	struct svc_rdma_fastreg_mr *frmr;
937
	u32 num_sg;
T
Tom Tucker 已提交
938 939 940 941 942

	frmr = kmalloc(sizeof(*frmr), GFP_KERNEL);
	if (!frmr)
		goto err;

943 944
	num_sg = min_t(u32, RPCSVC_MAXPAGES, xprt->sc_frmr_pg_list_len);
	mr = ib_alloc_mr(xprt->sc_pd, IB_MR_TYPE_MEM_REG, num_sg);
945
	if (IS_ERR(mr))
T
Tom Tucker 已提交
946 947
		goto err_free_frmr;

948 949
	sg = kcalloc(RPCSVC_MAXPAGES, sizeof(*sg), GFP_KERNEL);
	if (!sg)
T
Tom Tucker 已提交
950 951
		goto err_free_mr;

952 953
	sg_init_table(sg, RPCSVC_MAXPAGES);

T
Tom Tucker 已提交
954
	frmr->mr = mr;
955
	frmr->sg = sg;
T
Tom Tucker 已提交
956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974
	INIT_LIST_HEAD(&frmr->frmr_list);
	return frmr;

 err_free_mr:
	ib_dereg_mr(mr);
 err_free_frmr:
	kfree(frmr);
 err:
	return ERR_PTR(-ENOMEM);
}

/*
 * Release every FRMR descriptor on xprt->sc_frmr_q: its scatterlist,
 * its memory region and the descriptor itself.
 *
 * sc_frmr_q_lock is not taken here; presumably the caller guarantees
 * exclusive access at teardown -- confirm against callers.
 */
static void rdma_dealloc_frmr_q(struct svcxprt_rdma *xprt)
{
	struct svc_rdma_fastreg_mr *frmr;

	while (!list_empty(&xprt->sc_frmr_q)) {
		frmr = list_entry(xprt->sc_frmr_q.next,
				  struct svc_rdma_fastreg_mr, frmr_list);
		list_del_init(&frmr->frmr_list);
		kfree(frmr->sg);
		ib_dereg_mr(frmr->mr);
		kfree(frmr);
	}
}

/*
 * Obtain an FRMR descriptor for @rdma.
 *
 * Reuses the first entry of sc_frmr_q (with sg_nents reset to 0) when
 * one is available; otherwise allocates a new descriptor.
 *
 * Returns the descriptor, or the ERR_PTR returned by
 * rdma_alloc_frmr(). Callers must test the result with IS_ERR(), not
 * against NULL.
 */
struct svc_rdma_fastreg_mr *svc_rdma_get_frmr(struct svcxprt_rdma *rdma)
{
	struct svc_rdma_fastreg_mr *frmr = NULL;

	spin_lock_bh(&rdma->sc_frmr_q_lock);
	if (!list_empty(&rdma->sc_frmr_q)) {
		frmr = list_entry(rdma->sc_frmr_q.next,
				  struct svc_rdma_fastreg_mr, frmr_list);
		list_del_init(&frmr->frmr_list);
		frmr->sg_nents = 0;
	}
	spin_unlock_bh(&rdma->sc_frmr_q_lock);
	if (frmr)
		return frmr;

	return rdma_alloc_frmr(rdma);
}

void svc_rdma_put_frmr(struct svcxprt_rdma *rdma,
		       struct svc_rdma_fastreg_mr *frmr)
{
	if (frmr) {
1003 1004 1005
		ib_dma_unmap_sg(rdma->sc_cm_id->device,
				frmr->sg, frmr->sg_nents, frmr->direction);
		atomic_dec(&rdma->sc_dma_used);
T
Tom Tucker 已提交
1006
		spin_lock_bh(&rdma->sc_frmr_q_lock);
1007
		WARN_ON_ONCE(!list_empty(&frmr->frmr_list));
T
Tom Tucker 已提交
1008 1009 1010 1011 1012
		list_add(&frmr->frmr_list, &rdma->sc_frmr_q);
		spin_unlock_bh(&rdma->sc_frmr_q_lock);
	}
}

1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028
/*
 * This is the xpo_recvfrom function for listening endpoints. Its
 * purpose is to accept incoming connections. The CMA callback handler
 * has already created a new transport and attached it to the new CMA
 * ID.
 *
 * There is a queue of pending connections hung on the listening
 * transport. This queue contains the new svc_xprt structure. This
 * function takes svc_xprt structures off the accept_q and completes
 * the connection.
 */
static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
{
	struct svcxprt_rdma *listen_rdma;
	struct svcxprt_rdma *newxprt = NULL;
	struct rdma_conn_param conn_param;
1029
	struct ib_cq_init_attr cq_attr = {};
1030
	struct ib_qp_init_attr qp_attr;
1031
	struct ib_device *dev;
1032
	unsigned int i;
1033
	int ret = 0;
1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052

	listen_rdma = container_of(xprt, struct svcxprt_rdma, sc_xprt);
	clear_bit(XPT_CONN, &xprt->xpt_flags);
	/* Get the next entry off the accept list */
	spin_lock_bh(&listen_rdma->sc_lock);
	if (!list_empty(&listen_rdma->sc_accept_q)) {
		newxprt = list_entry(listen_rdma->sc_accept_q.next,
				     struct svcxprt_rdma, sc_accept_q);
		list_del_init(&newxprt->sc_accept_q);
	}
	if (!list_empty(&listen_rdma->sc_accept_q))
		set_bit(XPT_CONN, &listen_rdma->sc_xprt.xpt_flags);
	spin_unlock_bh(&listen_rdma->sc_lock);
	if (!newxprt)
		return NULL;

	dprintk("svcrdma: newxprt from accept queue = %p, cm_id=%p\n",
		newxprt, newxprt->sc_cm_id);

1053
	dev = newxprt->sc_cm_id->device;
1054 1055 1056

	/* Qualify the transport resource defaults with the
	 * capabilities of this particular device */
1057
	newxprt->sc_max_sge = min((size_t)dev->attrs.max_sge,
1058
				  (size_t)RPCSVC_MAXPAGES);
1059
	newxprt->sc_max_sge_rd = min_t(size_t, dev->attrs.max_sge_rd,
1060
				       RPCSVC_MAXPAGES);
1061
	newxprt->sc_max_req_size = svcrdma_max_req_size;
1062 1063 1064 1065 1066 1067 1068
	newxprt->sc_max_requests = min_t(u32, dev->attrs.max_qp_wr,
					 svcrdma_max_requests);
	newxprt->sc_max_bc_requests = min_t(u32, dev->attrs.max_qp_wr,
					    svcrdma_max_bc_requests);
	newxprt->sc_rq_depth = newxprt->sc_max_requests +
			       newxprt->sc_max_bc_requests;
	newxprt->sc_sq_depth = RPCRDMA_SQ_DEPTH_MULT * newxprt->sc_rq_depth;
1069

1070 1071
	if (!svc_rdma_prealloc_ctxts(newxprt))
		goto errout;
1072 1073
	if (!svc_rdma_prealloc_maps(newxprt))
		goto errout;
1074

1075 1076 1077 1078
	/*
	 * Limit ORD based on client limit, local device limit, and
	 * configured svcrdma limit.
	 */
1079
	newxprt->sc_ord = min_t(size_t, dev->attrs.max_qp_rd_atom, newxprt->sc_ord);
1080
	newxprt->sc_ord = min_t(size_t,	svcrdma_ord, newxprt->sc_ord);
1081

1082
	newxprt->sc_pd = ib_alloc_pd(dev);
1083 1084 1085 1086
	if (IS_ERR(newxprt->sc_pd)) {
		dprintk("svcrdma: error creating PD for connect request\n");
		goto errout;
	}
1087
	cq_attr.cqe = newxprt->sc_sq_depth;
1088
	newxprt->sc_sq_cq = ib_create_cq(dev,
1089 1090 1091
					 sq_comp_handler,
					 cq_event_handler,
					 newxprt,
1092
					 &cq_attr);
1093 1094 1095 1096
	if (IS_ERR(newxprt->sc_sq_cq)) {
		dprintk("svcrdma: error creating SQ CQ for connect request\n");
		goto errout;
	}
1097
	cq_attr.cqe = newxprt->sc_rq_depth;
1098
	newxprt->sc_rq_cq = ib_create_cq(dev,
1099 1100 1101
					 rq_comp_handler,
					 cq_event_handler,
					 newxprt,
1102
					 &cq_attr);
1103 1104 1105 1106 1107 1108 1109 1110 1111
	if (IS_ERR(newxprt->sc_rq_cq)) {
		dprintk("svcrdma: error creating RQ CQ for connect request\n");
		goto errout;
	}

	memset(&qp_attr, 0, sizeof qp_attr);
	qp_attr.event_handler = qp_event_handler;
	qp_attr.qp_context = &newxprt->sc_xprt;
	qp_attr.cap.max_send_wr = newxprt->sc_sq_depth;
1112
	qp_attr.cap.max_recv_wr = newxprt->sc_rq_depth;
1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125
	qp_attr.cap.max_send_sge = newxprt->sc_max_sge;
	qp_attr.cap.max_recv_sge = newxprt->sc_max_sge;
	qp_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
	qp_attr.qp_type = IB_QPT_RC;
	qp_attr.send_cq = newxprt->sc_sq_cq;
	qp_attr.recv_cq = newxprt->sc_rq_cq;
	dprintk("svcrdma: newxprt->sc_cm_id=%p, newxprt->sc_pd=%p\n"
		"    cm_id->device=%p, sc_pd->device=%p\n"
		"    cap.max_send_wr = %d\n"
		"    cap.max_recv_wr = %d\n"
		"    cap.max_send_sge = %d\n"
		"    cap.max_recv_sge = %d\n",
		newxprt->sc_cm_id, newxprt->sc_pd,
1126
		dev, newxprt->sc_pd->device,
1127 1128 1129 1130 1131 1132 1133
		qp_attr.cap.max_send_wr,
		qp_attr.cap.max_recv_wr,
		qp_attr.cap.max_send_sge,
		qp_attr.cap.max_recv_sge);

	ret = rdma_create_qp(newxprt->sc_cm_id, newxprt->sc_pd, &qp_attr);
	if (ret) {
1134 1135
		dprintk("svcrdma: failed to create QP, ret=%d\n", ret);
		goto errout;
1136 1137 1138
	}
	newxprt->sc_qp = newxprt->sc_cm_id->qp;

1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160
	/*
	 * Use the most secure set of MR resources based on the
	 * transport type and available memory management features in
	 * the device. Here's the table implemented below:
	 *
	 *		Fast	Global	DMA	Remote WR
	 *		Reg	LKEY	MR	Access
	 *		Sup'd	Sup'd	Needed	Needed
	 *
	 * IWARP	N	N	Y	Y
	 *		N	Y	Y	Y
	 *		Y	N	Y	N
	 *		Y	Y	N	-
	 *
	 * IB		N	N	Y	N
	 *		N	Y	N	-
	 *		Y	N	Y	N
	 *		Y	Y	N	-
	 *
	 * NB:	iWARP requires remote write access for the data sink
	 *	of an RDMA_READ. IB does not.
	 */
1161
	newxprt->sc_reader = rdma_read_chunk_lcl;
1162
	if (dev->attrs.device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS) {
1163
		newxprt->sc_frmr_pg_list_len =
1164
			dev->attrs.max_fast_reg_page_list_len;
1165
		newxprt->sc_dev_caps |= SVCRDMA_DEVCAP_FAST_REG;
1166
		newxprt->sc_reader = rdma_read_chunk_frmr;
1167 1168 1169 1170 1171
	}

	/*
	 * Determine if a DMA MR is required and if so, what privs are required
	 */
1172 1173
	if (!rdma_protocol_iwarp(dev, newxprt->sc_cm_id->port_num) &&
	    !rdma_ib_or_roce(dev, newxprt->sc_cm_id->port_num))
1174
		goto errout;
M
Michael Wang 已提交
1175

1176
	if (rdma_protocol_iwarp(dev, newxprt->sc_cm_id->port_num))
M
Michael Wang 已提交
1177 1178
		newxprt->sc_dev_caps |= SVCRDMA_DEVCAP_READ_W_INV;

1179
	/* Post receive buffers */
1180
	for (i = 0; i < newxprt->sc_rq_depth; i++) {
1181
		ret = svc_rdma_post_recv(newxprt, GFP_KERNEL);
1182 1183 1184 1185 1186 1187 1188 1189 1190
		if (ret) {
			dprintk("svcrdma: failure posting receive buffers\n");
			goto errout;
		}
	}

	/* Swap out the handler */
	newxprt->sc_cm_id->event_handler = rdma_cma_handler;

1191 1192 1193 1194 1195 1196 1197
	/*
	 * Arm the CQs for the SQ and RQ before accepting so we can't
	 * miss the first message
	 */
	ib_req_notify_cq(newxprt->sc_sq_cq, IB_CQ_NEXT_COMP);
	ib_req_notify_cq(newxprt->sc_rq_cq, IB_CQ_NEXT_COMP);

1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211
	/* Accept Connection */
	set_bit(RDMAXPRT_CONN_PENDING, &newxprt->sc_flags);
	memset(&conn_param, 0, sizeof conn_param);
	conn_param.responder_resources = 0;
	conn_param.initiator_depth = newxprt->sc_ord;
	ret = rdma_accept(newxprt->sc_cm_id, &conn_param);
	if (ret) {
		dprintk("svcrdma: failed to accept new connection, ret=%d\n",
		       ret);
		goto errout;
	}

	dprintk("svcrdma: new connection %p accepted with the following "
		"attributes:\n"
H
Harvey Harrison 已提交
1212
		"    local_ip        : %pI4\n"
1213
		"    local_port	     : %d\n"
H
Harvey Harrison 已提交
1214
		"    remote_ip       : %pI4\n"
1215 1216
		"    remote_port     : %d\n"
		"    max_sge         : %d\n"
1217
		"    max_sge_rd      : %d\n"
1218 1219 1220 1221
		"    sq_depth        : %d\n"
		"    max_requests    : %d\n"
		"    ord             : %d\n",
		newxprt,
H
Harvey Harrison 已提交
1222 1223
		&((struct sockaddr_in *)&newxprt->sc_cm_id->
			 route.addr.src_addr)->sin_addr.s_addr,
1224 1225
		ntohs(((struct sockaddr_in *)&newxprt->sc_cm_id->
		       route.addr.src_addr)->sin_port),
H
Harvey Harrison 已提交
1226 1227
		&((struct sockaddr_in *)&newxprt->sc_cm_id->
			 route.addr.dst_addr)->sin_addr.s_addr,
1228 1229 1230
		ntohs(((struct sockaddr_in *)&newxprt->sc_cm_id->
		       route.addr.dst_addr)->sin_port),
		newxprt->sc_max_sge,
1231
		newxprt->sc_max_sge_rd,
1232 1233 1234 1235 1236 1237 1238 1239
		newxprt->sc_sq_depth,
		newxprt->sc_max_requests,
		newxprt->sc_ord);

	return &newxprt->sc_xprt;

 errout:
	dprintk("svcrdma: failure accepting new connection rc=%d.\n", ret);
1240 1241
	/* Take a reference in case the DTO handler runs */
	svc_xprt_get(&newxprt->sc_xprt);
1242
	if (newxprt->sc_qp && !IS_ERR(newxprt->sc_qp))
1243
		ib_destroy_qp(newxprt->sc_qp);
1244
	rdma_destroy_id(newxprt->sc_cm_id);
1245 1246
	/* This call to put will destroy the transport */
	svc_xprt_put(&newxprt->sc_xprt);
1247 1248 1249 1250 1251 1252 1253
	return NULL;
}

/*
 * Intentionally empty: nothing is done with @rqstp.
 * NOTE(review): presumably a transport-ops hook this transport has no
 * work for -- confirm against the ops table that references it.
 */
static void svc_rdma_release_rqst(struct svc_rqst *rqstp)
{
}

1254
/*
1255
 * When connected, an svc_xprt has at least two references:
1256 1257 1258 1259 1260 1261 1262 1263
 *
 * - A reference held by the cm_id between the ESTABLISHED and
 *   DISCONNECTED events. If the remote peer disconnected first, this
 *   reference could be gone.
 *
 * - A reference held by the svc_recv code that called this function
 *   as part of close processing.
 *
1264
 * At a minimum one references should still be held.
1265
 */
1266 1267 1268 1269 1270
static void svc_rdma_detach(struct svc_xprt *xprt)
{
	struct svcxprt_rdma *rdma =
		container_of(xprt, struct svcxprt_rdma, sc_xprt);
	dprintk("svc: svc_rdma_detach(%p)\n", xprt);
1271 1272

	/* Disconnect and flush posted WQE */
1273 1274 1275
	rdma_disconnect(rdma->sc_cm_id);
}

1276
static void __svc_rdma_free(struct work_struct *work)
1277
{
1278 1279
	struct svcxprt_rdma *rdma =
		container_of(work, struct svcxprt_rdma, sc_work);
1280 1281 1282
	struct svc_xprt *xprt = &rdma->sc_xprt;

	dprintk("svcrdma: %s(%p)\n", __func__, rdma);
1283

1284
	/* We should only be called from kref_put */
1285
	if (atomic_read(&xprt->xpt_ref.refcount) != 0)
1286
		pr_err("svcrdma: sc_xprt still in use? (%d)\n",
1287
		       atomic_read(&xprt->xpt_ref.refcount));
1288

1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314
	/*
	 * Destroy queued, but not processed read completions. Note
	 * that this cleanup has to be done before destroying the
	 * cm_id because the device ptr is needed to unmap the dma in
	 * svc_rdma_put_context.
	 */
	while (!list_empty(&rdma->sc_read_complete_q)) {
		struct svc_rdma_op_ctxt *ctxt;
		ctxt = list_entry(rdma->sc_read_complete_q.next,
				  struct svc_rdma_op_ctxt,
				  dto_q);
		list_del_init(&ctxt->dto_q);
		svc_rdma_put_context(ctxt, 1);
	}

	/* Destroy queued, but not processed recv completions */
	while (!list_empty(&rdma->sc_rq_dto_q)) {
		struct svc_rdma_op_ctxt *ctxt;
		ctxt = list_entry(rdma->sc_rq_dto_q.next,
				  struct svc_rdma_op_ctxt,
				  dto_q);
		list_del_init(&ctxt->dto_q);
		svc_rdma_put_context(ctxt, 1);
	}

	/* Warn if we leaked a resource or under-referenced */
1315
	if (rdma->sc_ctxt_used != 0)
1316
		pr_err("svcrdma: ctxt still in use? (%d)\n",
1317
		       rdma->sc_ctxt_used);
1318 1319 1320
	if (atomic_read(&rdma->sc_dma_used) != 0)
		pr_err("svcrdma: dma still in use? (%d)\n",
		       atomic_read(&rdma->sc_dma_used));
1321

1322 1323 1324 1325 1326 1327
	/* Final put of backchannel client transport */
	if (xprt->xpt_bc_xprt) {
		xprt_put(xprt->xpt_bc_xprt);
		xprt->xpt_bc_xprt = NULL;
	}

T
Tom Tucker 已提交
1328
	rdma_dealloc_frmr_q(rdma);
1329
	svc_rdma_destroy_ctxts(rdma);
1330
	svc_rdma_destroy_maps(rdma);
T
Tom Tucker 已提交
1331

1332 1333 1334 1335
	/* Destroy the QP if present (not a listener) */
	if (rdma->sc_qp && !IS_ERR(rdma->sc_qp))
		ib_destroy_qp(rdma->sc_qp);

1336 1337
	if (rdma->sc_sq_cq && !IS_ERR(rdma->sc_sq_cq))
		ib_destroy_cq(rdma->sc_sq_cq);
1338

1339 1340
	if (rdma->sc_rq_cq && !IS_ERR(rdma->sc_rq_cq))
		ib_destroy_cq(rdma->sc_rq_cq);
1341

1342 1343
	if (rdma->sc_pd && !IS_ERR(rdma->sc_pd))
		ib_dealloc_pd(rdma->sc_pd);
1344

1345 1346 1347
	/* Destroy the CM ID */
	rdma_destroy_id(rdma->sc_cm_id);

1348
	kfree(rdma);
1349 1350
}

1351 1352 1353 1354 1355
static void svc_rdma_free(struct svc_xprt *xprt)
{
	struct svcxprt_rdma *rdma =
		container_of(xprt, struct svcxprt_rdma, sc_xprt);
	INIT_WORK(&rdma->sc_work, __svc_rdma_free);
1356
	queue_work(svc_rdma_wq, &rdma->sc_work);
1357 1358
}

1359 1360 1361 1362 1363 1364
/*
 * Report whether the transport has write space.
 *
 * Returns 0 if there are already waiters on the SQ (sc_send_wait is
 * active), otherwise 1.
 */
static int svc_rdma_has_wspace(struct svc_xprt *xprt)
{
	struct svcxprt_rdma *rdma =
		container_of(xprt, struct svcxprt_rdma, sc_xprt);

	return !waitqueue_active(&rdma->sc_send_wait);
}

1375 1376 1377 1378 1379
/*
 * Unconditionally returns 1; @rqstp is not examined.
 * NOTE(review): presumably this means every request is treated as
 * arriving from a secure port -- confirm against the caller.
 */
static int svc_rdma_secure_port(struct svc_rqst *rqstp)
{
	return 1;
}

1380 1381
int svc_rdma_send(struct svcxprt_rdma *xprt, struct ib_send_wr *wr)
{
1382 1383 1384
	struct ib_send_wr *bad_wr, *n_wr;
	int wr_count;
	int i;
1385 1386 1387
	int ret;

	if (test_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags))
1388
		return -ENOTCONN;
1389

1390 1391 1392 1393
	wr_count = 1;
	for (n_wr = wr->next; n_wr; n_wr = n_wr->next)
		wr_count++;

1394 1395 1396
	/* If the SQ is full, wait until an SQ entry is available */
	while (1) {
		spin_lock_bh(&xprt->sc_lock);
1397
		if (xprt->sc_sq_depth < atomic_read(&xprt->sc_sq_count) + wr_count) {
1398 1399
			spin_unlock_bh(&xprt->sc_lock);
			atomic_inc(&rdma_stat_sq_starve);
1400 1401

			/* See if we can opportunistically reap SQ WR to make room */
1402 1403 1404 1405 1406 1407
			sq_cq_reap(xprt);

			/* Wait until SQ WR available if SQ still full */
			wait_event(xprt->sc_send_wait,
				   atomic_read(&xprt->sc_sq_count) <
				   xprt->sc_sq_depth);
1408
			if (test_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags))
1409
				return -ENOTCONN;
1410 1411
			continue;
		}
1412 1413 1414 1415 1416 1417
		/* Take a transport ref for each WR posted */
		for (i = 0; i < wr_count; i++)
			svc_xprt_get(&xprt->sc_xprt);

		/* Bump used SQ WR count and post */
		atomic_add(wr_count, &xprt->sc_sq_count);
1418
		ret = ib_post_send(xprt->sc_qp, wr, &bad_wr);
1419 1420 1421 1422 1423
		if (ret) {
			set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags);
			atomic_sub(wr_count, &xprt->sc_sq_count);
			for (i = 0; i < wr_count; i ++)
				svc_xprt_put(&xprt->sc_xprt);
1424 1425 1426 1427
			dprintk("svcrdma: failed to post SQ WR rc=%d, "
			       "sc_sq_count=%d, sc_sq_depth=%d\n",
			       ret, atomic_read(&xprt->sc_sq_count),
			       xprt->sc_sq_depth);
1428
		}
1429
		spin_unlock_bh(&xprt->sc_lock);
1430 1431
		if (ret)
			wake_up(&xprt->sc_send_wait);
1432 1433 1434 1435
		break;
	}
	return ret;
}