iser_memory.c 22.4 KB
Newer Older
1 2
/*
 * Copyright (c) 2004, 2005, 2006 Voltaire, Inc. All rights reserved.
3
 * Copyright (c) 2013-2014 Mellanox Technologies. All rights reserved.
4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *	- Redistributions of source code must retain the above
 *	  copyright notice, this list of conditions and the following
 *	  disclaimer.
 *
 *	- Redistributions in binary form must reproduce the above
 *	  copyright notice, this list of conditions and the following
 *	  disclaimer in the documentation and/or other materials
 *	  provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/mm.h>
A
Al Viro 已提交
37
#include <linux/highmem.h>
38 39 40 41 42
#include <linux/scatterlist.h>

#include "iscsi_iser.h"

#define ISER_KMALLOC_THRESHOLD 0x20000 /* 128K - kmalloc limit */
43

44 45 46
/**
 * iser_start_rdma_unaligned_sg
 */
47
static int iser_start_rdma_unaligned_sg(struct iscsi_iser_task *iser_task,
48 49
					struct iser_data_buf *data,
					struct iser_data_buf *data_copy,
50
					enum iser_data_dir cmd_dir)
51
{
S
Sagi Grimberg 已提交
52
	struct ib_device *dev = iser_task->iser_conn->ib_conn.device->ib_device;
53 54
	struct scatterlist *sgl = (struct scatterlist *)data->buf;
	struct scatterlist *sg;
55
	char *mem = NULL;
56 57 58 59 60
	unsigned long  cmd_data_len = 0;
	int dma_nents, i;

	for_each_sg(sgl, sg, data->size, i)
		cmd_data_len += ib_sg_dma_len(dev, sg);
61 62

	if (cmd_data_len > ISER_KMALLOC_THRESHOLD)
O
Or Gerlitz 已提交
63
		mem = (void *)__get_free_pages(GFP_ATOMIC,
64
		      ilog2(roundup_pow_of_two(cmd_data_len)) - PAGE_SHIFT);
65
	else
O
Or Gerlitz 已提交
66
		mem = kmalloc(cmd_data_len, GFP_ATOMIC);
67 68 69

	if (mem == NULL) {
		iser_err("Failed to allocate mem size %d %d for copying sglist\n",
70
			 data->size, (int)cmd_data_len);
71 72 73 74 75 76 77
		return -ENOMEM;
	}

	if (cmd_dir == ISER_DIR_OUT) {
		/* copy the unaligned sg the buffer which is used for RDMA */
		char *p, *from;

78
		sgl = (struct scatterlist *)data->buf;
J
Jens Axboe 已提交
79 80
		p = mem;
		for_each_sg(sgl, sg, data->size, i) {
81
			from = kmap_atomic(sg_page(sg));
82
			memcpy(p,
J
Jens Axboe 已提交
83 84
			       from + sg->offset,
			       sg->length);
85
			kunmap_atomic(from);
J
Jens Axboe 已提交
86
			p += sg->length;
87 88 89
		}
	}

90 91 92 93
	sg_init_one(&data_copy->sg_single, mem, cmd_data_len);
	data_copy->buf = &data_copy->sg_single;
	data_copy->size = 1;
	data_copy->copy_buf = mem;
94

95
	dma_nents = ib_dma_map_sg(dev, &data_copy->sg_single, 1,
96 97
				  (cmd_dir == ISER_DIR_OUT) ?
				  DMA_TO_DEVICE : DMA_FROM_DEVICE);
98 99
	BUG_ON(dma_nents == 0);

100 101 102
	data_copy->dma_nents = dma_nents;
	data_copy->data_len = cmd_data_len;

103 104 105 106 107 108
	return 0;
}

/**
 * iser_finalize_rdma_unaligned_sg
 */
109

110
void iser_finalize_rdma_unaligned_sg(struct iscsi_iser_task *iser_task,
111 112 113
				     struct iser_data_buf *data,
				     struct iser_data_buf *data_copy,
				     enum iser_data_dir cmd_dir)
114
{
115
	struct ib_device *dev;
116 117
	unsigned long  cmd_data_len;

S
Sagi Grimberg 已提交
118
	dev = iser_task->iser_conn->ib_conn.device->ib_device;
119

120
	ib_dma_unmap_sg(dev, &data_copy->sg_single, 1,
121 122
			(cmd_dir == ISER_DIR_OUT) ?
			DMA_TO_DEVICE : DMA_FROM_DEVICE);
123 124 125

	if (cmd_dir == ISER_DIR_IN) {
		char *mem;
J
Jens Axboe 已提交
126
		struct scatterlist *sgl, *sg;
127 128 129 130 131
		unsigned char *p, *to;
		unsigned int sg_size;
		int i;

		/* copy back read RDMA to unaligned sg */
132
		mem = data_copy->copy_buf;
133

134 135
		sgl = (struct scatterlist *)data->buf;
		sg_size = data->size;
136

J
Jens Axboe 已提交
137 138
		p = mem;
		for_each_sg(sgl, sg, sg_size, i) {
139
			to = kmap_atomic(sg_page(sg));
J
Jens Axboe 已提交
140
			memcpy(to + sg->offset,
141
			       p,
J
Jens Axboe 已提交
142
			       sg->length);
143
			kunmap_atomic(to);
J
Jens Axboe 已提交
144
			p += sg->length;
145 146 147
		}
	}

148
	cmd_data_len = data->data_len;
149 150

	if (cmd_data_len > ISER_KMALLOC_THRESHOLD)
151
		free_pages((unsigned long)data_copy->copy_buf,
152
			   ilog2(roundup_pow_of_two(cmd_data_len)) - PAGE_SHIFT);
153
	else
154
		kfree(data_copy->copy_buf);
155

156
	data_copy->copy_buf = NULL;
157 158
}

159 160
/*
 * True when @addr has no bits set outside MASK_4K. The argument is
 * parenthesized so the cast applies to the whole expression passed in.
 */
#define IS_4K_ALIGNED(addr)	((((unsigned long)(addr)) & ~MASK_4K) == 0)

161 162 163 164 165 166 167 168 169 170 171 172
/**
 * iser_sg_to_page_vec - Translates scatterlist entries to physical addresses
 * and returns the length of resulting physical address array (may be less than
 * the original due to possible compaction).
 *
 * we build a "page vec" under the assumption that the SG meets the RDMA
 * alignment requirements. Other then the first and last SG elements, all
 * the "internal" elements can be compacted into a list whose elements are
 * dma addresses of physical pages. The code supports also the weird case
 * where --few fragments of the same page-- are present in the SG as
 * consecutive elements. Also, it handles one entry SG.
 */
173

174
static int iser_sg_to_page_vec(struct iser_data_buf *data,
175 176
			       struct ib_device *ibdev, u64 *pages,
			       int *offset, int *data_size)
177
{
178 179
	struct scatterlist *sg, *sgl = (struct scatterlist *)data->buf;
	u64 start_addr, end_addr, page, chunk_start = 0;
180
	unsigned long total_sz = 0;
181 182
	unsigned int dma_len;
	int i, new_chunk, cur_page, last_ent = data->dma_nents - 1;
183 184

	/* compute the offset of first element */
185
	*offset = (u64) sgl[0].offset & ~MASK_4K;
186

187 188
	new_chunk = 1;
	cur_page  = 0;
J
Jens Axboe 已提交
189
	for_each_sg(sgl, sg, data->dma_nents, i) {
190 191 192 193 194
		start_addr = ib_sg_dma_address(ibdev, sg);
		if (new_chunk)
			chunk_start = start_addr;
		dma_len = ib_sg_dma_len(ibdev, sg);
		end_addr = start_addr + dma_len;
195
		total_sz += dma_len;
196

197 198 199 200
		/* collect page fragments until aligned or end of SG list */
		if (!IS_4K_ALIGNED(end_addr) && i < last_ent) {
			new_chunk = 0;
			continue;
201
		}
202 203 204 205 206 207 208
		new_chunk = 1;

		/* address of the first page in the contiguous chunk;
		   masking relevant for the very first SG entry,
		   which might be unaligned */
		page = chunk_start & MASK_4K;
		do {
209
			pages[cur_page++] = page;
210
			page += SIZE_4K;
211
		} while (page < end_addr);
212
	}
213

214 215 216
	*data_size = total_sz;
	iser_dbg("page_vec->data_size:%d cur_page %d\n",
		 *data_size, cur_page);
217 218 219 220 221 222 223 224 225 226
	return cur_page;
}


/**
 * iser_data_buf_aligned_len - Tries to determine the maximal correctly aligned
 * for RDMA sub-list of a scatter-gather list of memory buffers, and  returns
 * the number of entries which are aligned correctly. Supports the case where
 * consecutive SG elements are actually fragments of the same physcial page.
 */
227 228
static int iser_data_buf_aligned_len(struct iser_data_buf *data,
				      struct ib_device *ibdev)
229
{
230 231 232 233 234 235
	struct scatterlist *sgl, *sg, *next_sg = NULL;
	u64 start_addr, end_addr;
	int i, ret_len, start_check = 0;

	if (data->dma_nents == 1)
		return 1;
236

J
Jens Axboe 已提交
237
	sgl = (struct scatterlist *)data->buf;
238
	start_addr  = ib_sg_dma_address(ibdev, sgl);
239

J
Jens Axboe 已提交
240
	for_each_sg(sgl, sg, data->dma_nents, i) {
241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258
		if (start_check && !IS_4K_ALIGNED(start_addr))
			break;

		next_sg = sg_next(sg);
		if (!next_sg)
			break;

		end_addr    = start_addr + ib_sg_dma_len(ibdev, sg);
		start_addr  = ib_sg_dma_address(ibdev, next_sg);

		if (end_addr == start_addr) {
			start_check = 0;
			continue;
		} else
			start_check = 1;

		if (!IS_4K_ALIGNED(end_addr))
			break;
259
	}
260
	ret_len = (next_sg) ? i : i+1;
261 262 263 264 265
	iser_dbg("Found %d aligned entries out of %d in sg:0x%p\n",
		 ret_len, data->dma_nents, data);
	return ret_len;
}

266 267
static void iser_data_buf_dump(struct iser_data_buf *data,
			       struct ib_device *ibdev)
268
{
J
Jens Axboe 已提交
269 270
	struct scatterlist *sgl = (struct scatterlist *)data->buf;
	struct scatterlist *sg;
271 272
	int i;

J
Jens Axboe 已提交
273
	for_each_sg(sgl, sg, data->dma_nents, i)
274
		iser_dbg("sg[%d] dma_addr:0x%lX page:0x%p "
E
Erez Zilber 已提交
275
			 "off:0x%x sz:0x%x dma_len:0x%x\n",
J
Jens Axboe 已提交
276
			 i, (unsigned long)ib_sg_dma_address(ibdev, sg),
J
Jens Axboe 已提交
277
			 sg_page(sg), sg->offset,
J
Jens Axboe 已提交
278
			 sg->length, ib_sg_dma_len(ibdev, sg));
279 280 281 282 283 284 285 286 287 288 289 290 291
}

/* Log the summary of @page_vec followed by one line per page address. */
static void iser_dump_page_vec(struct iser_page_vec *page_vec)
{
	int idx = 0;

	iser_err("page vec length %d data size %d\n",
		 page_vec->length, page_vec->data_size);

	while (idx < page_vec->length) {
		iser_err("%d %lx\n", idx,
			 (unsigned long)page_vec->pages[idx]);
		idx++;
	}
}

static void iser_page_vec_build(struct iser_data_buf *data,
292 293
				struct iser_page_vec *page_vec,
				struct ib_device *ibdev)
294 295 296 297 298 299 300
{
	int page_vec_len = 0;

	page_vec->length = 0;
	page_vec->offset = 0;

	iser_dbg("Translating sg sz: %d\n", data->dma_nents);
301 302 303 304
	page_vec_len = iser_sg_to_page_vec(data, ibdev, page_vec->pages,
					   &page_vec->offset,
					   &page_vec->data_size);
	iser_dbg("sg len %d page_vec_len %d\n", data->dma_nents, page_vec_len);
305 306 307

	page_vec->length = page_vec_len;

308
	if (page_vec_len * SIZE_4K < page_vec->data_size) {
309
		iser_err("page_vec too short to hold this SG\n");
310
		iser_data_buf_dump(data, ibdev);
311 312 313 314 315
		iser_dump_page_vec(page_vec);
		BUG();
	}
}

316 317 318 319
int iser_dma_map_task_data(struct iscsi_iser_task *iser_task,
			    struct iser_data_buf *data,
			    enum iser_data_dir iser_dir,
			    enum dma_data_direction dma_dir)
320
{
321
	struct ib_device *dev;
322

323
	iser_task->dir[iser_dir] = 1;
S
Sagi Grimberg 已提交
324
	dev = iser_task->iser_conn->ib_conn.device->ib_device;
325

326
	data->dma_nents = ib_dma_map_sg(dev, data->buf, data->size, dma_dir);
327 328 329 330 331 332 333
	if (data->dma_nents == 0) {
		iser_err("dma_map_sg failed!!!\n");
		return -EINVAL;
	}
	return 0;
}

334 335
void iser_dma_unmap_task_data(struct iscsi_iser_task *iser_task,
			      struct iser_data_buf *data)
336
{
337
	struct ib_device *dev;
338

S
Sagi Grimberg 已提交
339
	dev = iser_task->iser_conn->ib_conn.device->ib_device;
340
	ib_dma_unmap_sg(dev, data->buf, data->size, DMA_FROM_DEVICE);
341 342
}

343 344
static int fall_to_bounce_buf(struct iscsi_iser_task *iser_task,
			      struct ib_device *ibdev,
345 346
			      struct iser_data_buf *mem,
			      struct iser_data_buf *mem_copy,
347 348 349
			      enum iser_data_dir cmd_dir,
			      int aligned_len)
{
350
	struct iscsi_conn    *iscsi_conn = iser_task->iser_conn->iscsi_conn;
351 352 353 354 355 356 357 358 359

	iscsi_conn->fmr_unalign_cnt++;
	iser_warn("rdma alignment violation (%d/%d aligned) or FMR not supported\n",
		  aligned_len, mem->size);

	if (iser_debug_level > 0)
		iser_data_buf_dump(mem, ibdev);

	/* unmap the command data before accessing it */
360
	iser_dma_unmap_task_data(iser_task, mem);
361 362 363

	/* allocate copy buf, if we are writing, copy the */
	/* unaligned scatterlist, dma map the copy        */
364 365
	if (iser_start_rdma_unaligned_sg(iser_task, mem, mem_copy, cmd_dir) != 0)
		return -ENOMEM;
366 367 368 369

	return 0;
}

370
/**
371 372
 * iser_reg_rdma_mem_fmr - Registers memory intended for RDMA,
 * using FMR (if possible) obtaining rkey and va
373 374 375
 *
 * returns 0 on success, errno code on failure
 */
376 377
int iser_reg_rdma_mem_fmr(struct iscsi_iser_task *iser_task,
			  enum iser_data_dir cmd_dir)
378
{
S
Sagi Grimberg 已提交
379 380
	struct ib_conn *ib_conn = &iser_task->iser_conn->ib_conn;
	struct iser_device   *device = ib_conn->device;
381
	struct ib_device     *ibdev = device->ib_device;
382
	struct iser_data_buf *mem = &iser_task->data[cmd_dir];
383 384 385
	struct iser_regd_buf *regd_buf;
	int aligned_len;
	int err;
E
Erez Zilber 已提交
386
	int i;
387
	struct scatterlist *sg;
388

389
	regd_buf = &iser_task->rdma_regd[cmd_dir];
390

391
	aligned_len = iser_data_buf_aligned_len(mem, ibdev);
392
	if (aligned_len != mem->dma_nents) {
393 394
		err = fall_to_bounce_buf(iser_task, ibdev, mem,
					 &iser_task->data_copy[cmd_dir],
395 396 397 398 399
					 cmd_dir, aligned_len);
		if (err) {
			iser_err("failed to allocate bounce buffer\n");
			return err;
		}
400
		mem = &iser_task->data_copy[cmd_dir];
401 402
	}

403 404 405 406 407 408
	/* if there a single dma entry, FMR is not needed */
	if (mem->dma_nents == 1) {
		sg = (struct scatterlist *)mem->buf;

		regd_buf->reg.lkey = device->mr->lkey;
		regd_buf->reg.rkey = device->mr->rkey;
409 410
		regd_buf->reg.len  = ib_sg_dma_len(ibdev, &sg[0]);
		regd_buf->reg.va   = ib_sg_dma_address(ibdev, &sg[0]);
411
		regd_buf->reg.is_mr = 0;
412 413 414 415 416 417 418 419

		iser_dbg("PHYSICAL Mem.register: lkey: 0x%08X rkey: 0x%08X  "
			 "va: 0x%08lX sz: %ld]\n",
			 (unsigned int)regd_buf->reg.lkey,
			 (unsigned int)regd_buf->reg.rkey,
			 (unsigned long)regd_buf->reg.va,
			 (unsigned long)regd_buf->reg.len);
	} else { /* use FMR for multiple dma entries */
S
Sagi Grimberg 已提交
420 421
		iser_page_vec_build(mem, ib_conn->fmr.page_vec, ibdev);
		err = iser_reg_page_vec(ib_conn, ib_conn->fmr.page_vec,
422
					&regd_buf->reg);
423
		if (err && err != -EAGAIN) {
424
			iser_data_buf_dump(mem, ibdev);
425 426 427
			iser_err("mem->dma_nents = %d (dlength = 0x%x)\n",
				 mem->dma_nents,
				 ntoh24(iser_task->desc.iscsi_header.dlength));
428
			iser_err("page_vec: data_size = 0x%x, length = %d, offset = 0x%x\n",
S
Sagi Grimberg 已提交
429 430 431 432
				 ib_conn->fmr.page_vec->data_size,
				 ib_conn->fmr.page_vec->length,
				 ib_conn->fmr.page_vec->offset);
			for (i = 0; i < ib_conn->fmr.page_vec->length; i++)
433
				iser_err("page_vec[%d] = 0x%llx\n", i,
S
Sagi Grimberg 已提交
434
					 (unsigned long long)ib_conn->fmr.page_vec->pages[i]);
E
Erez Zilber 已提交
435
		}
436 437
		if (err)
			return err;
E
Erez Zilber 已提交
438
	}
439 440
	return 0;
}
441

442 443 444 445
static inline void
iser_set_dif_domain(struct scsi_cmnd *sc, struct ib_sig_attrs *sig_attrs,
		    struct ib_sig_domain *domain)
{
446
	domain->sig_type = IB_SIG_TYPE_T10_DIF;
447 448
	domain->sig.dif.pi_interval = sc->device->sector_size;
	domain->sig.dif.ref_tag = scsi_get_lba(sc) & 0xffffffff;
449 450 451 452 453 454 455 456 457 458
	/*
	 * At the moment we hard code those, but in the future
	 * we will take them from sc.
	 */
	domain->sig.dif.apptag_check_mask = 0xffff;
	domain->sig.dif.app_escape = true;
	domain->sig.dif.ref_escape = true;
	if (scsi_get_prot_type(sc) == SCSI_PROT_DIF_TYPE1 ||
	    scsi_get_prot_type(sc) == SCSI_PROT_DIF_TYPE2)
		domain->sig.dif.ref_remap = true;
459
};
460 461 462 463 464 465 466

static int
iser_set_sig_attrs(struct scsi_cmnd *sc, struct ib_sig_attrs *sig_attrs)
{
	switch (scsi_get_prot_op(sc)) {
	case SCSI_PROT_WRITE_INSERT:
	case SCSI_PROT_READ_STRIP:
467
		sig_attrs->mem.sig_type = IB_SIG_TYPE_NONE;
468
		iser_set_dif_domain(sc, sig_attrs, &sig_attrs->wire);
469 470 471 472
		sig_attrs->wire.sig.dif.bg_type = IB_T10DIF_CRC;
		break;
	case SCSI_PROT_READ_INSERT:
	case SCSI_PROT_WRITE_STRIP:
473
		sig_attrs->wire.sig_type = IB_SIG_TYPE_NONE;
474
		iser_set_dif_domain(sc, sig_attrs, &sig_attrs->mem);
475 476 477 478 479
		/*
		 * At the moment we use this modparam to tell what is
		 * the memory bg_type, in the future we will take it
		 * from sc.
		 */
480 481
		sig_attrs->mem.sig.dif.bg_type = iser_pi_guard ? IB_T10DIF_CSUM :
						 IB_T10DIF_CRC;
482 483 484
		break;
	case SCSI_PROT_READ_PASS:
	case SCSI_PROT_WRITE_PASS:
485
		iser_set_dif_domain(sc, sig_attrs, &sig_attrs->wire);
486
		sig_attrs->wire.sig.dif.bg_type = IB_T10DIF_CRC;
487
		iser_set_dif_domain(sc, sig_attrs, &sig_attrs->mem);
488 489 490 491 492
		/*
		 * At the moment we use this modparam to tell what is
		 * the memory bg_type, in the future we will take it
		 * from sc.
		 */
493 494
		sig_attrs->mem.sig.dif.bg_type = iser_pi_guard ? IB_T10DIF_CSUM :
						 IB_T10DIF_CRC;
495 496 497 498 499 500
		break;
	default:
		iser_err("Unsupported PI operation %d\n",
			 scsi_get_prot_op(sc));
		return -EINVAL;
	}
501

502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526
	return 0;
}

/*
 * Choose which protection fields are checked for @sc's protection type and
 * store the selection in *@mask. Type 0 leaves *@mask untouched.
 *
 * Returns 0 on success, -EINVAL for an unsupported protection type.
 */
static int
iser_set_prot_checks(struct scsi_cmnd *sc, u8 *mask)
{
	switch (scsi_get_prot_type(sc)) {
	case SCSI_PROT_DIF_TYPE0:
		return 0;
	case SCSI_PROT_DIF_TYPE1:
	case SCSI_PROT_DIF_TYPE2:
		*mask = ISER_CHECK_GUARD | ISER_CHECK_REFTAG;
		return 0;
	case SCSI_PROT_DIF_TYPE3:
		*mask = ISER_CHECK_GUARD;
		return 0;
	default:
		break;
	}

	iser_err("Unsupported protection type %d\n",
		 scsi_get_prot_type(sc));
	return -EINVAL;
}

527 528 529 530 531 532 533 534 535 536 537 538 539 540
/*
 * Prepare @inv_wr as a local-invalidate work request for @mr's current rkey,
 * then advance @mr to a fresh rkey via ib_inc_rkey()/ib_update_fast_reg_key().
 */
static void
iser_inv_rkey(struct ib_send_wr *inv_wr, struct ib_mr *mr)
{
	u32 next_rkey;

	memset(inv_wr, 0, sizeof(*inv_wr));
	inv_wr->wr_id = ISER_FASTREG_LI_WRID;
	inv_wr->opcode = IB_WR_LOCAL_INV;
	/* the request carries the key that is about to be replaced */
	inv_wr->ex.invalidate_rkey = mr->rkey;

	next_rkey = ib_inc_rkey(mr->rkey);
	ib_update_fast_reg_key(mr, next_rkey);
}

541 542 543 544 545
static int
iser_reg_sig_mr(struct iscsi_iser_task *iser_task,
		struct fast_reg_descriptor *desc, struct ib_sge *data_sge,
		struct ib_sge *prot_sge, struct ib_sge *sig_sge)
{
S
Sagi Grimberg 已提交
546
	struct ib_conn *ib_conn = &iser_task->iser_conn->ib_conn;
547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562
	struct iser_pi_context *pi_ctx = desc->pi_ctx;
	struct ib_send_wr sig_wr, inv_wr;
	struct ib_send_wr *bad_wr, *wr = NULL;
	struct ib_sig_attrs sig_attrs;
	int ret;

	memset(&sig_attrs, 0, sizeof(sig_attrs));
	ret = iser_set_sig_attrs(iser_task->sc, &sig_attrs);
	if (ret)
		goto err;

	ret = iser_set_prot_checks(iser_task->sc, &sig_attrs.check_mask);
	if (ret)
		goto err;

	if (!(desc->reg_indicators & ISER_SIG_KEY_VALID)) {
563
		iser_inv_rkey(&inv_wr, pi_ctx->sig_mr);
564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584
		wr = &inv_wr;
	}

	memset(&sig_wr, 0, sizeof(sig_wr));
	sig_wr.opcode = IB_WR_REG_SIG_MR;
	sig_wr.wr_id = ISER_FASTREG_LI_WRID;
	sig_wr.sg_list = data_sge;
	sig_wr.num_sge = 1;
	sig_wr.wr.sig_handover.sig_attrs = &sig_attrs;
	sig_wr.wr.sig_handover.sig_mr = pi_ctx->sig_mr;
	if (scsi_prot_sg_count(iser_task->sc))
		sig_wr.wr.sig_handover.prot = prot_sge;
	sig_wr.wr.sig_handover.access_flags = IB_ACCESS_LOCAL_WRITE |
					      IB_ACCESS_REMOTE_READ |
					      IB_ACCESS_REMOTE_WRITE;

	if (!wr)
		wr = &sig_wr;
	else
		wr->next = &sig_wr;

S
Sagi Grimberg 已提交
585
	ret = ib_post_send(ib_conn->qp, wr, &bad_wr);
586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607
	if (ret) {
		iser_err("reg_sig_mr failed, ret:%d\n", ret);
		goto err;
	}
	desc->reg_indicators &= ~ISER_SIG_KEY_VALID;

	sig_sge->lkey = pi_ctx->sig_mr->lkey;
	sig_sge->addr = 0;
	sig_sge->length = data_sge->length + prot_sge->length;
	if (scsi_get_prot_op(iser_task->sc) == SCSI_PROT_WRITE_INSERT ||
	    scsi_get_prot_op(iser_task->sc) == SCSI_PROT_READ_STRIP) {
		sig_sge->length += (data_sge->length /
				   iser_task->sc->device->sector_size) * 8;
	}

	iser_dbg("sig_sge: addr: 0x%llx  length: %u lkey: 0x%x\n",
		 sig_sge->addr, sig_sge->length,
		 sig_sge->lkey);
err:
	return ret;
}

608
static int iser_fast_reg_mr(struct iscsi_iser_task *iser_task,
609
			    struct iser_regd_buf *regd_buf,
610
			    struct iser_data_buf *mem,
611
			    enum iser_reg_indicator ind,
612
			    struct ib_sge *sge)
613
{
614
	struct fast_reg_descriptor *desc = regd_buf->reg.mem_h;
S
Sagi Grimberg 已提交
615 616
	struct ib_conn *ib_conn = &iser_task->iser_conn->ib_conn;
	struct iser_device *device = ib_conn->device;
617
	struct ib_device *ibdev = device->ib_device;
618 619
	struct ib_mr *mr;
	struct ib_fast_reg_page_list *frpl;
620 621
	struct ib_send_wr fastreg_wr, inv_wr;
	struct ib_send_wr *bad_wr, *wr = NULL;
622 623 624 625 626 627 628 629 630 631 632 633 634 635 636
	int ret, offset, size, plen;

	/* if there a single dma entry, dma mr suffices */
	if (mem->dma_nents == 1) {
		struct scatterlist *sg = (struct scatterlist *)mem->buf;

		sge->lkey = device->mr->lkey;
		sge->addr   = ib_sg_dma_address(ibdev, &sg[0]);
		sge->length  = ib_sg_dma_len(ibdev, &sg[0]);

		iser_dbg("Single DMA entry: lkey=0x%x, addr=0x%llx, length=0x%x\n",
			 sge->lkey, sge->addr, sge->length);
		return 0;
	}

637 638 639 640 641 642 643 644 645
	if (ind == ISER_DATA_KEY_VALID) {
		mr = desc->data_mr;
		frpl = desc->data_frpl;
	} else {
		mr = desc->pi_ctx->prot_mr;
		frpl = desc->pi_ctx->prot_frpl;
	}

	plen = iser_sg_to_page_vec(mem, device->ib_device, frpl->page_list,
646 647 648 649 650
				   &offset, &size);
	if (plen * SIZE_4K < size) {
		iser_err("fast reg page_list too short to hold this SG\n");
		return -EINVAL;
	}
651

652
	if (!(desc->reg_indicators & ind)) {
653
		iser_inv_rkey(&inv_wr, mr);
654 655 656 657 658
		wr = &inv_wr;
	}

	/* Prepare FASTREG WR */
	memset(&fastreg_wr, 0, sizeof(fastreg_wr));
659
	fastreg_wr.wr_id = ISER_FASTREG_LI_WRID;
660
	fastreg_wr.opcode = IB_WR_FAST_REG_MR;
661 662
	fastreg_wr.wr.fast_reg.iova_start = frpl->page_list[0] + offset;
	fastreg_wr.wr.fast_reg.page_list = frpl;
663
	fastreg_wr.wr.fast_reg.page_list_len = plen;
664
	fastreg_wr.wr.fast_reg.page_shift = SHIFT_4K;
665
	fastreg_wr.wr.fast_reg.length = size;
666
	fastreg_wr.wr.fast_reg.rkey = mr->rkey;
667 668 669 670
	fastreg_wr.wr.fast_reg.access_flags = (IB_ACCESS_LOCAL_WRITE  |
					       IB_ACCESS_REMOTE_WRITE |
					       IB_ACCESS_REMOTE_READ);

671
	if (!wr)
672
		wr = &fastreg_wr;
673
	else
674 675
		wr->next = &fastreg_wr;

S
Sagi Grimberg 已提交
676
	ret = ib_post_send(ib_conn->qp, wr, &bad_wr);
677 678 679 680
	if (ret) {
		iser_err("fast registration failed, ret:%d\n", ret);
		return ret;
	}
681
	desc->reg_indicators &= ~ind;
682

683 684
	sge->lkey = mr->lkey;
	sge->addr = frpl->page_list[0] + offset;
685
	sge->length = size;
686 687 688 689 690

	return ret;
}

/**
691
 * iser_reg_rdma_mem_fastreg - Registers memory intended for RDMA,
692 693 694 695
 * using Fast Registration WR (if possible) obtaining rkey and va
 *
 * returns 0 on success, errno code on failure
 */
696 697
int iser_reg_rdma_mem_fastreg(struct iscsi_iser_task *iser_task,
			      enum iser_data_dir cmd_dir)
698
{
S
Sagi Grimberg 已提交
699 700
	struct ib_conn *ib_conn = &iser_task->iser_conn->ib_conn;
	struct iser_device *device = ib_conn->device;
701 702 703
	struct ib_device *ibdev = device->ib_device;
	struct iser_data_buf *mem = &iser_task->data[cmd_dir];
	struct iser_regd_buf *regd_buf = &iser_task->rdma_regd[cmd_dir];
704 705
	struct fast_reg_descriptor *desc = NULL;
	struct ib_sge data_sge;
706 707 708 709 710
	int err, aligned_len;
	unsigned long flags;

	aligned_len = iser_data_buf_aligned_len(mem, ibdev);
	if (aligned_len != mem->dma_nents) {
711 712
		err = fall_to_bounce_buf(iser_task, ibdev, mem,
					 &iser_task->data_copy[cmd_dir],
713 714 715 716 717 718 719 720
					 cmd_dir, aligned_len);
		if (err) {
			iser_err("failed to allocate bounce buffer\n");
			return err;
		}
		mem = &iser_task->data_copy[cmd_dir];
	}

721 722
	if (mem->dma_nents != 1 ||
	    scsi_get_prot_op(iser_task->sc) != SCSI_PROT_NORMAL) {
S
Sagi Grimberg 已提交
723 724
		spin_lock_irqsave(&ib_conn->lock, flags);
		desc = list_first_entry(&ib_conn->fastreg.pool,
725 726
					struct fast_reg_descriptor, list);
		list_del(&desc->list);
S
Sagi Grimberg 已提交
727
		spin_unlock_irqrestore(&ib_conn->lock, flags);
728 729
		regd_buf->reg.mem_h = desc;
	}
730

731 732
	err = iser_fast_reg_mr(iser_task, regd_buf, mem,
			       ISER_DATA_KEY_VALID, &data_sge);
733 734 735
	if (err)
		goto err_reg;

736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771
	if (scsi_get_prot_op(iser_task->sc) != SCSI_PROT_NORMAL) {
		struct ib_sge prot_sge, sig_sge;

		memset(&prot_sge, 0, sizeof(prot_sge));
		if (scsi_prot_sg_count(iser_task->sc)) {
			mem = &iser_task->prot[cmd_dir];
			aligned_len = iser_data_buf_aligned_len(mem, ibdev);
			if (aligned_len != mem->dma_nents) {
				err = fall_to_bounce_buf(iser_task, ibdev, mem,
							 &iser_task->prot_copy[cmd_dir],
							 cmd_dir, aligned_len);
				if (err) {
					iser_err("failed to allocate bounce buffer\n");
					return err;
				}
				mem = &iser_task->prot_copy[cmd_dir];
			}

			err = iser_fast_reg_mr(iser_task, regd_buf, mem,
					       ISER_PROT_KEY_VALID, &prot_sge);
			if (err)
				goto err_reg;
		}

		err = iser_reg_sig_mr(iser_task, desc, &data_sge,
				      &prot_sge, &sig_sge);
		if (err) {
			iser_err("Failed to register signature mr\n");
			return err;
		}
		desc->reg_indicators |= ISER_FASTREG_PROTECTED;

		regd_buf->reg.lkey = sig_sge.lkey;
		regd_buf->reg.rkey = desc->pi_ctx->sig_mr->rkey;
		regd_buf->reg.va = sig_sge.addr;
		regd_buf->reg.len = sig_sge.length;
772 773
		regd_buf->reg.is_mr = 1;
	} else {
774 775 776 777 778 779 780
		if (desc) {
			regd_buf->reg.rkey = desc->data_mr->rkey;
			regd_buf->reg.is_mr = 1;
		} else {
			regd_buf->reg.rkey = device->mr->rkey;
			regd_buf->reg.is_mr = 0;
		}
781

782 783 784 785
		regd_buf->reg.lkey = data_sge.lkey;
		regd_buf->reg.va = data_sge.addr;
		regd_buf->reg.len = data_sge.length;
	}
786

787 788
	return 0;
err_reg:
789
	if (desc) {
S
Sagi Grimberg 已提交
790 791 792
		spin_lock_irqsave(&ib_conn->lock, flags);
		list_add_tail(&desc->list, &ib_conn->fastreg.pool);
		spin_unlock_irqrestore(&ib_conn->lock, flags);
793 794
	}

795 796
	return err;
}