/*
 * Copyright (c) 2004, 2005, 2006 Voltaire, Inc. All rights reserved.
 * Copyright (c) 2013-2014 Mellanox Technologies. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *	- Redistributions of source code must retain the above
 *	  copyright notice, this list of conditions and the following
 *	  disclaimer.
 *
 *	- Redistributions in binary form must reproduce the above
 *	  copyright notice, this list of conditions and the following
 *	  disclaimer in the documentation and/or other materials
 *	  provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/scatterlist.h>

#include "iscsi_iser.h"

#define ISER_KMALLOC_THRESHOLD 0x20000 /* 128K - kmalloc limit */
/**
 * iser_start_rdma_unaligned_sg - allocate a contiguous bounce buffer for
 * an SG list that violates the RDMA alignment requirements, copy the
 * payload into it for ISER_DIR_OUT, and DMA-map the single-entry copy
 */
static int iser_start_rdma_unaligned_sg(struct iscsi_iser_task *iser_task,
					struct iser_data_buf *data,
					struct iser_data_buf *data_copy,
					enum iser_data_dir cmd_dir)
{
	struct ib_device *dev = iser_task->iser_conn->ib_conn.device->ib_device;
	struct scatterlist *sgl = (struct scatterlist *)data->buf;
	struct scatterlist *sg;
	char *mem = NULL;
	unsigned long  cmd_data_len = 0;
	int dma_nents, i;

	for_each_sg(sgl, sg, data->size, i)
		cmd_data_len += ib_sg_dma_len(dev, sg);

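	/* large buffers are served by whole pages (order rounded up to a
	 * power of two); smaller ones by kmalloc */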
	if (cmd_data_len > ISER_KMALLOC_THRESHOLD)
		mem = (void *)__get_free_pages(GFP_ATOMIC,
		      ilog2(roundup_pow_of_two(cmd_data_len)) - PAGE_SHIFT);
	else
		mem = kmalloc(cmd_data_len, GFP_ATOMIC);

	if (mem == NULL) {
		iser_err("Failed to allocate mem size %d %d for copying sglist\n",
			 data->size, (int)cmd_data_len);
		return -ENOMEM;
	}

	if (cmd_dir == ISER_DIR_OUT) {
		/* copy the unaligned sg into the buffer which is used for RDMA */
		char *p, *from;

		sgl = (struct scatterlist *)data->buf;
		p = mem;
		for_each_sg(sgl, sg, data->size, i) {
			from = kmap_atomic(sg_page(sg));
			memcpy(p,
			       from + sg->offset,
			       sg->length);
			kunmap_atomic(from);
			p += sg->length;
		}
	}

	sg_init_one(&data_copy->sg_single, mem, cmd_data_len);
	data_copy->buf = &data_copy->sg_single;
	data_copy->size = 1;
	data_copy->copy_buf = mem;

	dma_nents = ib_dma_map_sg(dev, &data_copy->sg_single, 1,
				  (cmd_dir == ISER_DIR_OUT) ?
				  DMA_TO_DEVICE : DMA_FROM_DEVICE);
	BUG_ON(dma_nents == 0);

	data_copy->dma_nents = dma_nents;
	data_copy->data_len = cmd_data_len;

	return 0;
}

/**
 * iser_finalize_rdma_unaligned_sg - DMA-unmap the bounce buffer, copy the
 * received payload back to the original SG list for ISER_DIR_IN, and free
 * the bounce buffer
 */
void iser_finalize_rdma_unaligned_sg(struct iscsi_iser_task *iser_task,
				     struct iser_data_buf *data,
				     struct iser_data_buf *data_copy,
				     enum iser_data_dir cmd_dir)
{
	struct ib_device *dev;
	unsigned long  cmd_data_len;

	dev = iser_task->iser_conn->ib_conn.device->ib_device;

	ib_dma_unmap_sg(dev, &data_copy->sg_single, 1,
			(cmd_dir == ISER_DIR_OUT) ?
			DMA_TO_DEVICE : DMA_FROM_DEVICE);

	if (cmd_dir == ISER_DIR_IN) {
		char *mem;
		struct scatterlist *sgl, *sg;
		unsigned char *p, *to;
		unsigned int sg_size;
		int i;

		/* copy back read RDMA to unaligned sg */
		mem = data_copy->copy_buf;

		sgl = (struct scatterlist *)data->buf;
		sg_size = data->size;

		p = mem;
		for_each_sg(sgl, sg, sg_size, i) {
			to = kmap_atomic(sg_page(sg));
			memcpy(to + sg->offset,
			       p,
			       sg->length);
			kunmap_atomic(to);
			p += sg->length;
		}
	}

	cmd_data_len = data->data_len;

	if (cmd_data_len > ISER_KMALLOC_THRESHOLD)
		free_pages((unsigned long)data_copy->copy_buf,
			   ilog2(roundup_pow_of_two(cmd_data_len)) - PAGE_SHIFT);
	else
		kfree(data_copy->copy_buf);

	data_copy->copy_buf = NULL;
}

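/* an address is 4K-aligned when its low bits (~MASK_4K) are all zero */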
#define IS_4K_ALIGNED(addr)	((((unsigned long)addr) & ~MASK_4K) == 0)

/**
 * iser_sg_to_page_vec - Translates scatterlist entries to physical addresses
 * and returns the length of the resulting physical address array (may be less
 * than the original due to possible compaction).
 *
 * we build a "page vec" under the assumption that the SG meets the RDMA
 * alignment requirements. Other than the first and last SG elements, all
 * the "internal" elements can be compacted into a list whose elements are
 * dma addresses of physical pages. The code also supports the weird case
 * where --few fragments of the same page-- are present in the SG as
 * consecutive elements. Also, it handles one entry SG.
 */
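/*
 * Illustrative example (assumed 4K pages): three DMA-mapped entries
 * covering 0x10000-0x10fff, 0x11000-0x11fff and 0x12000-0x127ff each
 * end 4K-aligned or terminate the list, so the resulting page vector is
 * { 0x10000, 0x11000, 0x12000 } with *offset = 0 and *data_size = 0x2800.
 */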
static int iser_sg_to_page_vec(struct iser_data_buf *data,
			       struct ib_device *ibdev, u64 *pages,
			       int *offset, int *data_size)
{
	struct scatterlist *sg, *sgl = (struct scatterlist *)data->buf;
	u64 start_addr, end_addr, page, chunk_start = 0;
	unsigned long total_sz = 0;
	unsigned int dma_len;
	int i, new_chunk, cur_page, last_ent = data->dma_nents - 1;

	/* compute the offset of first element */
	*offset = (u64) sgl[0].offset & ~MASK_4K;

	new_chunk = 1;
	cur_page  = 0;
	for_each_sg(sgl, sg, data->dma_nents, i) {
		start_addr = ib_sg_dma_address(ibdev, sg);
		if (new_chunk)
			chunk_start = start_addr;
		dma_len = ib_sg_dma_len(ibdev, sg);
		end_addr = start_addr + dma_len;
		total_sz += dma_len;

		/* collect page fragments until aligned or end of SG list */
		if (!IS_4K_ALIGNED(end_addr) && i < last_ent) {
			new_chunk = 0;
			continue;
		}
		new_chunk = 1;

		/* address of the first page in the contiguous chunk;
		   masking relevant for the very first SG entry,
		   which might be unaligned */
		page = chunk_start & MASK_4K;
		do {
			pages[cur_page++] = page;
			page += SIZE_4K;
		} while (page < end_addr);
	}

	*data_size = total_sz;
	iser_dbg("page_vec->data_size:%d cur_page %d\n",
		 *data_size, cur_page);
	return cur_page;
}


/**
 * iser_data_buf_aligned_len - Tries to determine the maximal sub-list of a
 * scatter-gather list of memory buffers that is correctly aligned for RDMA,
 * and returns the number of entries which are aligned correctly. Supports
 * the case where consecutive SG elements are actually fragments of the same
 * physical page.
 */
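/*
 * Illustrative note: an SG element that ends on a non-4K boundary and is
 * not continued by the next element terminates the aligned prefix, and
 * the caller falls back to a bounce buffer.
 */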
static int iser_data_buf_aligned_len(struct iser_data_buf *data,
				      struct ib_device *ibdev)
{
	struct scatterlist *sgl, *sg, *next_sg = NULL;
	u64 start_addr, end_addr;
	int i, ret_len, start_check = 0;

	if (data->dma_nents == 1)
		return 1;

	sgl = (struct scatterlist *)data->buf;
	start_addr  = ib_sg_dma_address(ibdev, sgl);

	for_each_sg(sgl, sg, data->dma_nents, i) {
		if (start_check && !IS_4K_ALIGNED(start_addr))
			break;

		next_sg = sg_next(sg);
		if (!next_sg)
			break;

		end_addr    = start_addr + ib_sg_dma_len(ibdev, sg);
		start_addr  = ib_sg_dma_address(ibdev, next_sg);

		if (end_addr == start_addr) {
			start_check = 0;
			continue;
		} else
			start_check = 1;

		if (!IS_4K_ALIGNED(end_addr))
			break;
	}
	ret_len = (next_sg) ? i : i+1;
	iser_dbg("Found %d aligned entries out of %d in sg:0x%p\n",
		 ret_len, data->dma_nents, data);
	return ret_len;
}

static void iser_data_buf_dump(struct iser_data_buf *data,
			       struct ib_device *ibdev)
{
	struct scatterlist *sgl = (struct scatterlist *)data->buf;
	struct scatterlist *sg;
	int i;

	for_each_sg(sgl, sg, data->dma_nents, i)
		iser_dbg("sg[%d] dma_addr:0x%lX page:0x%p "
			 "off:0x%x sz:0x%x dma_len:0x%x\n",
			 i, (unsigned long)ib_sg_dma_address(ibdev, sg),
			 sg_page(sg), sg->offset,
			 sg->length, ib_sg_dma_len(ibdev, sg));
}

static void iser_dump_page_vec(struct iser_page_vec *page_vec)
{
	int i;

	iser_err("page vec length %d data size %d\n",
		 page_vec->length, page_vec->data_size);
	for (i = 0; i < page_vec->length; i++)
		iser_err("%d %lx\n",i,(unsigned long)page_vec->pages[i]);
}

static void iser_page_vec_build(struct iser_data_buf *data,
				struct iser_page_vec *page_vec,
				struct ib_device *ibdev)
{
	int page_vec_len = 0;

	page_vec->length = 0;
	page_vec->offset = 0;

	iser_dbg("Translating sg sz: %d\n", data->dma_nents);
	page_vec_len = iser_sg_to_page_vec(data, ibdev, page_vec->pages,
					   &page_vec->offset,
					   &page_vec->data_size);
	iser_dbg("sg len %d page_vec_len %d\n", data->dma_nents, page_vec_len);

	page_vec->length = page_vec_len;

	if (page_vec_len * SIZE_4K < page_vec->data_size) {
		iser_err("page_vec too short to hold this SG\n");
		iser_data_buf_dump(data, ibdev);
		iser_dump_page_vec(page_vec);
		BUG();
	}
}

int iser_dma_map_task_data(struct iscsi_iser_task *iser_task,
			    struct iser_data_buf *data,
			    enum iser_data_dir iser_dir,
			    enum dma_data_direction dma_dir)
{
	struct ib_device *dev;

	iser_task->dir[iser_dir] = 1;
	dev = iser_task->iser_conn->ib_conn.device->ib_device;

	data->dma_nents = ib_dma_map_sg(dev, data->buf, data->size, dma_dir);
	if (data->dma_nents == 0) {
		iser_err("dma_map_sg failed!!!\n");
		return -EINVAL;
	}
	return 0;
}

void iser_dma_unmap_task_data(struct iscsi_iser_task *iser_task,
			      struct iser_data_buf *data)
{
	struct ib_device *dev;

	dev = iser_task->iser_conn->ib_conn.device->ib_device;
	ib_dma_unmap_sg(dev, data->buf, data->size, DMA_FROM_DEVICE);
}

static int fall_to_bounce_buf(struct iscsi_iser_task *iser_task,
			      struct ib_device *ibdev,
			      struct iser_data_buf *mem,
			      struct iser_data_buf *mem_copy,
			      enum iser_data_dir cmd_dir,
			      int aligned_len)
{
	struct iscsi_conn    *iscsi_conn = iser_task->iser_conn->iscsi_conn;

	iscsi_conn->fmr_unalign_cnt++;
	iser_warn("rdma alignment violation (%d/%d aligned) or FMR not supported\n",
		  aligned_len, mem->size);

	if (iser_debug_level > 0)
		iser_data_buf_dump(mem, ibdev);

	/* unmap the command data before accessing it */
	iser_dma_unmap_task_data(iser_task, mem);

	/* allocate the copy buffer; if we are writing, copy the unaligned
	 * scatterlist into it, then dma map the copy */
	if (iser_start_rdma_unaligned_sg(iser_task, mem, mem_copy, cmd_dir) != 0)
		return -ENOMEM;

	return 0;
}

/**
 * iser_reg_rdma_mem_fmr - Registers memory intended for RDMA,
 * using FMR (if possible), obtaining rkey and va
 *
 * returns 0 on success, errno code on failure
 */
int iser_reg_rdma_mem_fmr(struct iscsi_iser_task *iser_task,
			  enum iser_data_dir cmd_dir)
{
	struct ib_conn *ib_conn = &iser_task->iser_conn->ib_conn;
	struct iser_device   *device = ib_conn->device;
	struct ib_device     *ibdev = device->ib_device;
	struct iser_data_buf *mem = &iser_task->data[cmd_dir];
	struct iser_regd_buf *regd_buf;
	int aligned_len;
	int err;
	int i;
	struct scatterlist *sg;

	regd_buf = &iser_task->rdma_regd[cmd_dir];

	aligned_len = iser_data_buf_aligned_len(mem, ibdev);
	if (aligned_len != mem->dma_nents) {
		err = fall_to_bounce_buf(iser_task, ibdev, mem,
					 &iser_task->data_copy[cmd_dir],
					 cmd_dir, aligned_len);
		if (err) {
			iser_err("failed to allocate bounce buffer\n");
			return err;
		}
		mem = &iser_task->data_copy[cmd_dir];
	}

	/* if there is a single dma entry, FMR is not needed */
	if (mem->dma_nents == 1) {
		sg = (struct scatterlist *)mem->buf;

		regd_buf->reg.lkey = device->mr->lkey;
		regd_buf->reg.rkey = device->mr->rkey;
		regd_buf->reg.len  = ib_sg_dma_len(ibdev, &sg[0]);
		regd_buf->reg.va   = ib_sg_dma_address(ibdev, &sg[0]);

		iser_dbg("PHYSICAL Mem.register: lkey: 0x%08X rkey: 0x%08X  "
			 "va: 0x%08lX sz: %ld]\n",
			 (unsigned int)regd_buf->reg.lkey,
			 (unsigned int)regd_buf->reg.rkey,
			 (unsigned long)regd_buf->reg.va,
			 (unsigned long)regd_buf->reg.len);
	} else { /* use FMR for multiple dma entries */
		iser_page_vec_build(mem, ib_conn->fmr.page_vec, ibdev);
		err = iser_reg_page_vec(ib_conn, ib_conn->fmr.page_vec,
					&regd_buf->reg);
		if (err && err != -EAGAIN) {
			iser_data_buf_dump(mem, ibdev);
			iser_err("mem->dma_nents = %d (dlength = 0x%x)\n",
				 mem->dma_nents,
				 ntoh24(iser_task->desc.iscsi_header.dlength));
			iser_err("page_vec: data_size = 0x%x, length = %d, offset = 0x%x\n",
				 ib_conn->fmr.page_vec->data_size,
				 ib_conn->fmr.page_vec->length,
				 ib_conn->fmr.page_vec->offset);
			for (i = 0; i < ib_conn->fmr.page_vec->length; i++)
				iser_err("page_vec[%d] = 0x%llx\n", i,
					 (unsigned long long)ib_conn->fmr.page_vec->pages[i]);
		}
		if (err)
			return err;
	}
	return 0;
}

static inline void
iser_set_dif_domain(struct scsi_cmnd *sc, struct ib_sig_attrs *sig_attrs,
		    struct ib_sig_domain *domain)
{
	domain->sig_type = IB_SIG_TYPE_T10_DIF;
	domain->sig.dif.pi_interval = sc->device->sector_size;
	domain->sig.dif.ref_tag = scsi_get_lba(sc) & 0xffffffff;
	/*
	 * At the moment we hard code those, but in the future
	 * we will take them from sc.
	 */
	domain->sig.dif.apptag_check_mask = 0xffff;
	domain->sig.dif.app_escape = true;
	domain->sig.dif.ref_escape = true;
	if (scsi_get_prot_type(sc) == SCSI_PROT_DIF_TYPE1 ||
	    scsi_get_prot_type(sc) == SCSI_PROT_DIF_TYPE2)
		domain->sig.dif.ref_remap = true;
}

static int
iser_set_sig_attrs(struct scsi_cmnd *sc, struct ib_sig_attrs *sig_attrs)
{
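	/*
	 * Map the SCSI protection op onto signature domains: INSERT/STRIP
	 * ops carry protection information on one side only (wire or
	 * memory), while PASS ops carry and verify it on both sides.
	 */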
	switch (scsi_get_prot_op(sc)) {
	case SCSI_PROT_WRITE_INSERT:
	case SCSI_PROT_READ_STRIP:
		sig_attrs->mem.sig_type = IB_SIG_TYPE_NONE;
		iser_set_dif_domain(sc, sig_attrs, &sig_attrs->wire);
		sig_attrs->wire.sig.dif.bg_type = IB_T10DIF_CRC;
		break;
	case SCSI_PROT_READ_INSERT:
	case SCSI_PROT_WRITE_STRIP:
		sig_attrs->wire.sig_type = IB_SIG_TYPE_NONE;
		iser_set_dif_domain(sc, sig_attrs, &sig_attrs->mem);
		/*
		 * At the moment we use this modparam to tell what is
		 * the memory bg_type, in the future we will take it
		 * from sc.
		 */
		sig_attrs->mem.sig.dif.bg_type = iser_pi_guard ? IB_T10DIF_CSUM :
						 IB_T10DIF_CRC;
		break;
	case SCSI_PROT_READ_PASS:
	case SCSI_PROT_WRITE_PASS:
		iser_set_dif_domain(sc, sig_attrs, &sig_attrs->wire);
		sig_attrs->wire.sig.dif.bg_type = IB_T10DIF_CRC;
		iser_set_dif_domain(sc, sig_attrs, &sig_attrs->mem);
		/*
		 * At the moment we use this modparam to tell what is
		 * the memory bg_type, in the future we will take it
		 * from sc.
		 */
		sig_attrs->mem.sig.dif.bg_type = iser_pi_guard ? IB_T10DIF_CSUM :
						 IB_T10DIF_CRC;
		break;
	default:
		iser_err("Unsupported PI operation %d\n",
			 scsi_get_prot_op(sc));
		return -EINVAL;
	}

	return 0;
}

static int
iser_set_prot_checks(struct scsi_cmnd *sc, u8 *mask)
{
	switch (scsi_get_prot_type(sc)) {
	case SCSI_PROT_DIF_TYPE0:
		break;
	case SCSI_PROT_DIF_TYPE1:
	case SCSI_PROT_DIF_TYPE2:
		*mask = ISER_CHECK_GUARD | ISER_CHECK_REFTAG;
		break;
	case SCSI_PROT_DIF_TYPE3:
		*mask = ISER_CHECK_GUARD;
		break;
	default:
		iser_err("Unsupported protection type %d\n",
			 scsi_get_prot_type(sc));
		return -EINVAL;
	}

	return 0;
}

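/*
 * iser_inv_rkey - prepare a LOCAL_INV work request that invalidates the
 * MR's current rkey, then advance the key portion so the next fast
 * registration publishes a fresh rkey.
 */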
static void
iser_inv_rkey(struct ib_send_wr *inv_wr, struct ib_mr *mr)
{
	u32 rkey;

	memset(inv_wr, 0, sizeof(*inv_wr));
	inv_wr->opcode = IB_WR_LOCAL_INV;
	inv_wr->wr_id = ISER_FASTREG_LI_WRID;
	inv_wr->ex.invalidate_rkey = mr->rkey;

	rkey = ib_inc_rkey(mr->rkey);
	ib_update_fast_reg_key(mr, rkey);
}

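/*
 * iser_reg_sig_mr - chain an optional LOCAL_INV with a REG_SIG_MR work
 * request that binds the data (and, if present, protection) SGEs under
 * a single signature-enabled MR described by sig_sge.
 */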
static int
iser_reg_sig_mr(struct iscsi_iser_task *iser_task,
		struct fast_reg_descriptor *desc, struct ib_sge *data_sge,
		struct ib_sge *prot_sge, struct ib_sge *sig_sge)
{
	struct ib_conn *ib_conn = &iser_task->iser_conn->ib_conn;
	struct iser_pi_context *pi_ctx = desc->pi_ctx;
	struct ib_send_wr sig_wr, inv_wr;
	struct ib_send_wr *bad_wr, *wr = NULL;
	struct ib_sig_attrs sig_attrs;
	int ret;

	memset(&sig_attrs, 0, sizeof(sig_attrs));
	ret = iser_set_sig_attrs(iser_task->sc, &sig_attrs);
	if (ret)
		goto err;

	ret = iser_set_prot_checks(iser_task->sc, &sig_attrs.check_mask);
	if (ret)
		goto err;

	if (!(desc->reg_indicators & ISER_SIG_KEY_VALID)) {
		iser_inv_rkey(&inv_wr, pi_ctx->sig_mr);
		wr = &inv_wr;
	}

	memset(&sig_wr, 0, sizeof(sig_wr));
	sig_wr.opcode = IB_WR_REG_SIG_MR;
	sig_wr.wr_id = ISER_FASTREG_LI_WRID;
	sig_wr.sg_list = data_sge;
	sig_wr.num_sge = 1;
	sig_wr.wr.sig_handover.sig_attrs = &sig_attrs;
	sig_wr.wr.sig_handover.sig_mr = pi_ctx->sig_mr;
	if (scsi_prot_sg_count(iser_task->sc))
		sig_wr.wr.sig_handover.prot = prot_sge;
	sig_wr.wr.sig_handover.access_flags = IB_ACCESS_LOCAL_WRITE |
					      IB_ACCESS_REMOTE_READ |
					      IB_ACCESS_REMOTE_WRITE;

	if (!wr)
		wr = &sig_wr;
	else
		wr->next = &sig_wr;

	ret = ib_post_send(ib_conn->qp, wr, &bad_wr);
	if (ret) {
		iser_err("reg_sig_mr failed, ret:%d\n", ret);
		goto err;
	}
	desc->reg_indicators &= ~ISER_SIG_KEY_VALID;

	sig_sge->lkey = pi_ctx->sig_mr->lkey;
	sig_sge->addr = 0;
	sig_sge->length = data_sge->length + prot_sge->length;
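	/* wire-side PI adds an 8-byte DIF tuple per sector to the
	 * registered length */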
	if (scsi_get_prot_op(iser_task->sc) == SCSI_PROT_WRITE_INSERT ||
	    scsi_get_prot_op(iser_task->sc) == SCSI_PROT_READ_STRIP) {
		sig_sge->length += (data_sge->length /
				   iser_task->sc->device->sector_size) * 8;
	}

	iser_dbg("sig_sge: addr: 0x%llx  length: %u lkey: 0x%x\n",
		 sig_sge->addr, sig_sge->length,
		 sig_sge->lkey);
err:
	return ret;
}

static int iser_fast_reg_mr(struct iscsi_iser_task *iser_task,
			    struct iser_regd_buf *regd_buf,
			    struct iser_data_buf *mem,
			    enum iser_reg_indicator ind,
			    struct ib_sge *sge)
{
	struct fast_reg_descriptor *desc = regd_buf->reg.mem_h;
	struct ib_conn *ib_conn = &iser_task->iser_conn->ib_conn;
	struct iser_device *device = ib_conn->device;
	struct ib_device *ibdev = device->ib_device;
	struct ib_mr *mr;
	struct ib_fast_reg_page_list *frpl;
	struct ib_send_wr fastreg_wr, inv_wr;
	struct ib_send_wr *bad_wr, *wr = NULL;
	int ret, offset, size, plen;

	/* if there is a single dma entry, dma mr suffices */
	if (mem->dma_nents == 1) {
		struct scatterlist *sg = (struct scatterlist *)mem->buf;

		sge->lkey = device->mr->lkey;
		sge->addr   = ib_sg_dma_address(ibdev, &sg[0]);
		sge->length  = ib_sg_dma_len(ibdev, &sg[0]);

		iser_dbg("Single DMA entry: lkey=0x%x, addr=0x%llx, length=0x%x\n",
			 sge->lkey, sge->addr, sge->length);
		return 0;
	}

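	/* data and protection each use their own MR and fast-reg page
	 * list; 'ind' selects which pair to register */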
	if (ind == ISER_DATA_KEY_VALID) {
		mr = desc->data_mr;
		frpl = desc->data_frpl;
	} else {
		mr = desc->pi_ctx->prot_mr;
		frpl = desc->pi_ctx->prot_frpl;
	}

	plen = iser_sg_to_page_vec(mem, device->ib_device, frpl->page_list,
				   &offset, &size);
	if (plen * SIZE_4K < size) {
		iser_err("fast reg page_list too short to hold this SG\n");
		return -EINVAL;
	}

	if (!(desc->reg_indicators & ind)) {
		iser_inv_rkey(&inv_wr, mr);
		wr = &inv_wr;
	}

	/* Prepare FASTREG WR */
	memset(&fastreg_wr, 0, sizeof(fastreg_wr));
	fastreg_wr.wr_id = ISER_FASTREG_LI_WRID;
	fastreg_wr.opcode = IB_WR_FAST_REG_MR;
	fastreg_wr.wr.fast_reg.iova_start = frpl->page_list[0] + offset;
	fastreg_wr.wr.fast_reg.page_list = frpl;
	fastreg_wr.wr.fast_reg.page_list_len = plen;
	fastreg_wr.wr.fast_reg.page_shift = SHIFT_4K;
	fastreg_wr.wr.fast_reg.length = size;
	fastreg_wr.wr.fast_reg.rkey = mr->rkey;
	fastreg_wr.wr.fast_reg.access_flags = (IB_ACCESS_LOCAL_WRITE  |
					       IB_ACCESS_REMOTE_WRITE |
					       IB_ACCESS_REMOTE_READ);

	if (!wr)
		wr = &fastreg_wr;
	else
		wr->next = &fastreg_wr;

	ret = ib_post_send(ib_conn->qp, wr, &bad_wr);
	if (ret) {
		iser_err("fast registration failed, ret:%d\n", ret);
		return ret;
	}
	desc->reg_indicators &= ~ind;

	sge->lkey = mr->lkey;
	sge->addr = frpl->page_list[0] + offset;
	sge->length = size;

	return ret;
}

/**
 * iser_reg_rdma_mem_fastreg - Registers memory intended for RDMA,
 * using Fast Registration WR (if possible) obtaining rkey and va
 *
 * returns 0 on success, errno code on failure
 */
int iser_reg_rdma_mem_fastreg(struct iscsi_iser_task *iser_task,
			      enum iser_data_dir cmd_dir)
{
	struct ib_conn *ib_conn = &iser_task->iser_conn->ib_conn;
	struct iser_device *device = ib_conn->device;
	struct ib_device *ibdev = device->ib_device;
	struct iser_data_buf *mem = &iser_task->data[cmd_dir];
	struct iser_regd_buf *regd_buf = &iser_task->rdma_regd[cmd_dir];
	struct fast_reg_descriptor *desc = NULL;
	struct ib_sge data_sge;
	int err, aligned_len;
	unsigned long flags;

	aligned_len = iser_data_buf_aligned_len(mem, ibdev);
	if (aligned_len != mem->dma_nents) {
		err = fall_to_bounce_buf(iser_task, ibdev, mem,
					 &iser_task->data_copy[cmd_dir],
					 cmd_dir, aligned_len);
		if (err) {
			iser_err("failed to allocate bounce buffer\n");
			return err;
		}
		mem = &iser_task->data_copy[cmd_dir];
	}

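	/*
	 * A fast-reg descriptor is taken from the connection pool whenever
	 * the global DMA MR does not suffice: multiple DMA entries or any
	 * protection-enabled command.
	 */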
	if (mem->dma_nents != 1 ||
	    scsi_get_prot_op(iser_task->sc) != SCSI_PROT_NORMAL) {
		spin_lock_irqsave(&ib_conn->lock, flags);
		desc = list_first_entry(&ib_conn->fastreg.pool,
					struct fast_reg_descriptor, list);
		list_del(&desc->list);
		spin_unlock_irqrestore(&ib_conn->lock, flags);
		regd_buf->reg.mem_h = desc;
	}

	err = iser_fast_reg_mr(iser_task, regd_buf, mem,
			       ISER_DATA_KEY_VALID, &data_sge);
	if (err)
		goto err_reg;

	if (scsi_get_prot_op(iser_task->sc) != SCSI_PROT_NORMAL) {
		struct ib_sge prot_sge, sig_sge;

		memset(&prot_sge, 0, sizeof(prot_sge));
		if (scsi_prot_sg_count(iser_task->sc)) {
			mem = &iser_task->prot[cmd_dir];
			aligned_len = iser_data_buf_aligned_len(mem, ibdev);
			if (aligned_len != mem->dma_nents) {
				err = fall_to_bounce_buf(iser_task, ibdev, mem,
							 &iser_task->prot_copy[cmd_dir],
							 cmd_dir, aligned_len);
				if (err) {
					iser_err("failed to allocate bounce buffer\n");
					return err;
				}
				mem = &iser_task->prot_copy[cmd_dir];
			}

			err = iser_fast_reg_mr(iser_task, regd_buf, mem,
					       ISER_PROT_KEY_VALID, &prot_sge);
			if (err)
				goto err_reg;
		}

		err = iser_reg_sig_mr(iser_task, desc, &data_sge,
				      &prot_sge, &sig_sge);
		if (err) {
			iser_err("Failed to register signature mr\n");
			return err;
		}
		desc->reg_indicators |= ISER_FASTREG_PROTECTED;

		regd_buf->reg.lkey = sig_sge.lkey;
		regd_buf->reg.rkey = desc->pi_ctx->sig_mr->rkey;
		regd_buf->reg.va = sig_sge.addr;
		regd_buf->reg.len = sig_sge.length;
	} else {
		if (desc)
			regd_buf->reg.rkey = desc->data_mr->rkey;
		else
			regd_buf->reg.rkey = device->mr->rkey;

		regd_buf->reg.lkey = data_sge.lkey;
		regd_buf->reg.va = data_sge.addr;
		regd_buf->reg.len = data_sge.length;
	}

	return 0;
err_reg:
	if (desc) {
		spin_lock_irqsave(&ib_conn->lock, flags);
		list_add_tail(&desc->list, &ib_conn->fastreg.pool);
		spin_unlock_irqrestore(&ib_conn->lock, flags);
	}

	return err;
}