/*
 * Copyright (c) 2007, 2017 Oracle and/or its affiliates. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 */
#include <linux/pagemap.h>
#include <linux/slab.h>
#include <linux/rbtree.h>
#include <linux/dma-mapping.h> /* for DMA_*_DEVICE */

#include "rds.h"

/*
 * XXX
 *  - build with sparse
 *  - should we detect duplicate keys on a socket?  hmm.
 *  - an rdma is an mlock, apply rlimit?
 */

/*
 * get the number of pages by looking at the page indices that the start and
 * end addresses fall in.
 *
 * Returns 0 if the vec is invalid.  It is invalid if the number of bytes
 * causes the address to wrap or overflows an unsigned int.  This comes
 * from being stored in the 'length' member of 'struct scatterlist'.
 */
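/*
 * Worked example (illustrative, assuming 4 KiB pages): addr = 0x1ff8 and
 * bytes = 0x10 straddle the page boundary at 0x2000, so
 * ((0x2008 + 0xfff) >> 12) - (0x1ff8 >> 12) = 3 - 1 = 2 pages.
 */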
static unsigned int rds_pages_in_vec(struct rds_iovec *vec)
{
	if ((vec->addr + vec->bytes <= vec->addr) ||
	    (vec->bytes > (u64)UINT_MAX))
		return 0;

	return ((vec->addr + vec->bytes + PAGE_SIZE - 1) >> PAGE_SHIFT) -
		(vec->addr >> PAGE_SHIFT);
}

static struct rds_mr *rds_mr_tree_walk(struct rb_root *root, u64 key,
				       struct rds_mr *insert)
{
	struct rb_node **p = &root->rb_node;
	struct rb_node *parent = NULL;
	struct rds_mr *mr;

	while (*p) {
		parent = *p;
		mr = rb_entry(parent, struct rds_mr, r_rb_node);

		if (key < mr->r_key)
			p = &(*p)->rb_left;
		else if (key > mr->r_key)
			p = &(*p)->rb_right;
		else
			return mr;
	}

	if (insert) {
		rb_link_node(&insert->r_rb_node, parent, p);
		rb_insert_color(&insert->r_rb_node, root);
		refcount_inc(&insert->r_refcount);
	}
	return NULL;
}

/*
 * Destroy the transport-specific part of a MR.
 */
static void rds_destroy_mr(struct rds_mr *mr)
{
	struct rds_sock *rs = mr->r_sock;
	void *trans_private = NULL;
	unsigned long flags;

	rdsdebug("RDS: destroy mr key is %x refcnt %u\n",
			mr->r_key, refcount_read(&mr->r_refcount));

	if (test_and_set_bit(RDS_MR_DEAD, &mr->r_state))
		return;

	spin_lock_irqsave(&rs->rs_rdma_lock, flags);
	if (!RB_EMPTY_NODE(&mr->r_rb_node))
		rb_erase(&mr->r_rb_node, &rs->rs_rdma_keys);
	trans_private = mr->r_trans_private;
	mr->r_trans_private = NULL;
	spin_unlock_irqrestore(&rs->rs_rdma_lock, flags);

	if (trans_private)
		mr->r_trans->free_mr(trans_private, mr->r_invalidate);
}

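/*
 * Invoked from rds_mr_put() (see rds.h) when the last reference is
 * dropped: tear down the transport-specific state, then free the MR.
 */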
void __rds_put_mr_final(struct rds_mr *mr)
{
	rds_destroy_mr(mr);
	kfree(mr);
}

/*
 * By the time this is called we can't have any more ioctls called on
 * the socket so we don't need to worry about racing with others.
 */
void rds_rdma_drop_keys(struct rds_sock *rs)
{
	struct rds_mr *mr;
	struct rb_node *node;
	unsigned long flags;

	/* Release any MRs associated with this socket */
	spin_lock_irqsave(&rs->rs_rdma_lock, flags);
	while ((node = rb_first(&rs->rs_rdma_keys))) {
		mr = rb_entry(node, struct rds_mr, r_rb_node);
		if (mr->r_trans == rs->rs_transport)
			mr->r_invalidate = 0;
		rb_erase(&mr->r_rb_node, &rs->rs_rdma_keys);
		RB_CLEAR_NODE(&mr->r_rb_node);
		spin_unlock_irqrestore(&rs->rs_rdma_lock, flags);
		rds_destroy_mr(mr);
		rds_mr_put(mr);
		spin_lock_irqsave(&rs->rs_rdma_lock, flags);
	}
	spin_unlock_irqrestore(&rs->rs_rdma_lock, flags);

	if (rs->rs_transport && rs->rs_transport->flush_mrs)
		rs->rs_transport->flush_mrs();
}

/*
 * Helper function to pin user pages.  Returns the number of pages pinned
 * on success; if only part of the range could be pinned, the partial pins
 * are released and -EFAULT is returned.
 */
static int rds_pin_pages(unsigned long user_addr, unsigned int nr_pages,
			struct page **pages, int write)
{
	int ret;

	ret = get_user_pages_fast(user_addr, nr_pages, write, pages);

	if (ret >= 0 && ret < nr_pages) {
		while (ret--)
			put_page(pages[ret]);
		ret = -EFAULT;
	}

	return ret;
}

static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args,
			  u64 *cookie_ret, struct rds_mr **mr_ret,
			  struct rds_conn_path *cp)
{
	struct rds_mr *mr = NULL, *found;
	unsigned int nr_pages;
	struct page **pages = NULL;
	struct scatterlist *sg;
	void *trans_private;
	unsigned long flags;
	rds_rdma_cookie_t cookie;
	unsigned int nents;
	long i;
	int ret;

	if (ipv6_addr_any(&rs->rs_bound_addr) || !rs->rs_transport) {
		ret = -ENOTCONN; /* XXX not a great errno */
		goto out;
	}

	if (!rs->rs_transport->get_mr) {
		ret = -EOPNOTSUPP;
		goto out;
	}

	nr_pages = rds_pages_in_vec(&args->vec);
	if (nr_pages == 0) {
		ret = -EINVAL;
		goto out;
	}

	/* Restrict the size of mr irrespective of underlying transport
	 * To account for unaligned mr regions, subtract one from nr_pages
	 */
	if ((nr_pages - 1) > (RDS_MAX_MSG_SIZE >> PAGE_SHIFT)) {
		ret = -EMSGSIZE;
		goto out;
	}

	rdsdebug("RDS: get_mr addr %llx len %llu nr_pages %u\n",
		args->vec.addr, args->vec.bytes, nr_pages);

	/* XXX clamp nr_pages to limit the size of this alloc? */
	pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL);
	if (!pages) {
		ret = -ENOMEM;
		goto out;
	}

	mr = kzalloc(sizeof(struct rds_mr), GFP_KERNEL);
	if (!mr) {
		ret = -ENOMEM;
		goto out;
	}

	refcount_set(&mr->r_refcount, 1);
	RB_CLEAR_NODE(&mr->r_rb_node);
	mr->r_trans = rs->rs_transport;
	mr->r_sock = rs;

	if (args->flags & RDS_RDMA_USE_ONCE)
		mr->r_use_once = 1;
	if (args->flags & RDS_RDMA_INVALIDATE)
		mr->r_invalidate = 1;
	if (args->flags & RDS_RDMA_READWRITE)
		mr->r_write = 1;

	/*
	 * Pin the pages that make up the user buffer and transfer the page
	 * pointers to the mr's sg array.  We check to see if we've mapped
	 * the whole region after transferring the partial page references
	 * to the sg array so that we can have one page ref cleanup path.
	 *
	 * For now we have no flag that tells us whether the mapping is
	 * r/o or r/w. We need to assume r/w, or we'll do a lot of RDMA to
	 * the zero page.
	 */
	ret = rds_pin_pages(args->vec.addr, nr_pages, pages, 1);
	if (ret < 0)
		goto out;

	nents = ret;
	sg = kcalloc(nents, sizeof(*sg), GFP_KERNEL);
	if (!sg) {
		ret = -ENOMEM;
		goto out;
	}
	WARN_ON(!nents);
	sg_init_table(sg, nents);

	/* Stick all pages into the scatterlist */
	for (i = 0 ; i < nents; i++)
		sg_set_page(&sg[i], pages[i], PAGE_SIZE, 0);

	rdsdebug("RDS: trans_private nents is %u\n", nents);

	/* Obtain a transport specific MR. If this succeeds, the
	 * s/g list is now owned by the MR.
	 * Note that dma_map() implies that pending writes are
	 * flushed to RAM, so no dma_sync is needed here. */
	trans_private = rs->rs_transport->get_mr(sg, nents, rs,
						 &mr->r_key,
						 cp ? cp->cp_conn : NULL);

	if (IS_ERR(trans_private)) {
		for (i = 0 ; i < nents; i++)
			put_page(sg_page(&sg[i]));
		kfree(sg);
		ret = PTR_ERR(trans_private);
		goto out;
	}

	mr->r_trans_private = trans_private;

	rdsdebug("RDS: get_mr put_user key is %x cookie_addr %p\n",
	       mr->r_key, (void *)(unsigned long) args->cookie_addr);

	/* The user may pass us an unaligned address, but we can only
	 * map page aligned regions. So we keep the offset, and build
	 * a 64bit cookie containing <R_Key, offset> and pass that
	 * around. */
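	/* rds_rdma_make_cookie() (see rds.h) packs the R_Key into the low
	 * 32 bits and the page offset into the upper 32 bits. */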
	cookie = rds_rdma_make_cookie(mr->r_key, args->vec.addr & ~PAGE_MASK);
	if (cookie_ret)
		*cookie_ret = cookie;

	if (args->cookie_addr && put_user(cookie, (u64 __user *)(unsigned long) args->cookie_addr)) {
		ret = -EFAULT;
		goto out;
	}

	/* Inserting the new MR into the rbtree bumps its
	 * reference count. */
	spin_lock_irqsave(&rs->rs_rdma_lock, flags);
	found = rds_mr_tree_walk(&rs->rs_rdma_keys, mr->r_key, mr);
	spin_unlock_irqrestore(&rs->rs_rdma_lock, flags);

	BUG_ON(found && found != mr);

	rdsdebug("RDS: get_mr key is %x\n", mr->r_key);
	if (mr_ret) {
		refcount_inc(&mr->r_refcount);
		*mr_ret = mr;
	}

	ret = 0;
out:
	kfree(pages);
	if (mr)
		rds_mr_put(mr);
	return ret;
}

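/*
 * Illustrative sketch only (option and struct names as defined by the
 * matching uapi rds.h): userspace reaches this path via setsockopt(),
 * roughly
 *
 *	struct rds_get_mr_args marg = {
 *		.vec         = { .addr = (u64)(unsigned long)buf, .bytes = len },
 *		.cookie_addr = (u64)(unsigned long)&cookie,
 *		.flags       = RDS_RDMA_USE_ONCE,
 *	};
 *	setsockopt(fd, SOL_RDS, RDS_GET_MR, &marg, sizeof(marg));
 *
 * The cookie written back encodes <R_Key, offset> and is what the peer
 * then uses in RDS_CMSG_RDMA_ARGS / RDS_CMSG_RDMA_DEST.
 */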
int rds_get_mr(struct rds_sock *rs, char __user *optval, int optlen)
{
	struct rds_get_mr_args args;

	if (optlen != sizeof(struct rds_get_mr_args))
		return -EINVAL;

	if (copy_from_user(&args, (struct rds_get_mr_args __user *)optval,
			   sizeof(struct rds_get_mr_args)))
		return -EFAULT;

	return __rds_rdma_map(rs, &args, NULL, NULL, NULL);
}

int rds_get_mr_for_dest(struct rds_sock *rs, char __user *optval, int optlen)
{
	struct rds_get_mr_for_dest_args args;
	struct rds_get_mr_args new_args;

	if (optlen != sizeof(struct rds_get_mr_for_dest_args))
		return -EINVAL;

	if (copy_from_user(&args, (struct rds_get_mr_for_dest_args __user *)optval,
			   sizeof(struct rds_get_mr_for_dest_args)))
		return -EFAULT;

	/*
	 * Initially, just behave like get_mr().
	 * TODO: Implement get_mr as wrapper around this
	 *	 and deprecate it.
	 */
	new_args.vec = args.vec;
	new_args.cookie_addr = args.cookie_addr;
	new_args.flags = args.flags;

	return __rds_rdma_map(rs, &new_args, NULL, NULL, NULL);
}

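/*
 * Illustrative sketch only (names as defined by the matching uapi rds.h):
 * a registration is released with the RDS_FREE_MR socket option, roughly
 *
 *	struct rds_free_mr_args farg = {
 *		.cookie = cookie,
 *		.flags  = RDS_RDMA_INVALIDATE,
 *	};
 *	setsockopt(fd, SOL_RDS, RDS_FREE_MR, &farg, sizeof(farg));
 *
 * A cookie of 0 flushes all unused MRs, as handled below.
 */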
/*
 * Free the MR indicated by the given R_Key
 */
int rds_free_mr(struct rds_sock *rs, char __user *optval, int optlen)
{
	struct rds_free_mr_args args;
	struct rds_mr *mr;
	unsigned long flags;

	if (optlen != sizeof(struct rds_free_mr_args))
		return -EINVAL;

	if (copy_from_user(&args, (struct rds_free_mr_args __user *)optval,
			   sizeof(struct rds_free_mr_args)))
		return -EFAULT;

	/* Special case - a null cookie means flush all unused MRs */
	if (args.cookie == 0) {
		if (!rs->rs_transport || !rs->rs_transport->flush_mrs)
			return -EINVAL;
		rs->rs_transport->flush_mrs();
		return 0;
	}

	/* Look up the MR given its R_key and remove it from the rbtree
	 * so nobody else finds it.
	 * This should also prevent races with rds_rdma_unuse.
	 */
	spin_lock_irqsave(&rs->rs_rdma_lock, flags);
	mr = rds_mr_tree_walk(&rs->rs_rdma_keys, rds_rdma_cookie_key(args.cookie), NULL);
	if (mr) {
		rb_erase(&mr->r_rb_node, &rs->rs_rdma_keys);
		RB_CLEAR_NODE(&mr->r_rb_node);
		if (args.flags & RDS_RDMA_INVALIDATE)
			mr->r_invalidate = 1;
	}
	spin_unlock_irqrestore(&rs->rs_rdma_lock, flags);

	if (!mr)
		return -EINVAL;

	/*
	 * call rds_destroy_mr() ourselves so that we're sure it's done by the time
	 * we return.  If we let rds_mr_put() do it it might not happen until
	 * someone else drops their ref.
	 */
	rds_destroy_mr(mr);
	rds_mr_put(mr);
	return 0;
}

/*
 * This is called when we receive an extension header that
 * tells us this MR was used. It allows us to implement
 * use_once semantics
 */
void rds_rdma_unuse(struct rds_sock *rs, u32 r_key, int force)
{
	struct rds_mr *mr;
	unsigned long flags;
	int zot_me = 0;

	spin_lock_irqsave(&rs->rs_rdma_lock, flags);
	mr = rds_mr_tree_walk(&rs->rs_rdma_keys, r_key, NULL);
	if (!mr) {
		pr_debug("rds: trying to unuse MR with unknown r_key %u!\n",
			 r_key);
		spin_unlock_irqrestore(&rs->rs_rdma_lock, flags);
		return;
	}

	if (mr->r_use_once || force) {
		rb_erase(&mr->r_rb_node, &rs->rs_rdma_keys);
		RB_CLEAR_NODE(&mr->r_rb_node);
		zot_me = 1;
	}
	spin_unlock_irqrestore(&rs->rs_rdma_lock, flags);

	/* May have to issue a dma_sync on this memory region.
	 * Note we could avoid this if the operation was a RDMA READ,
	 * but at this point we can't tell. */
	if (mr->r_trans->sync_mr)
		mr->r_trans->sync_mr(mr->r_trans_private, DMA_FROM_DEVICE);

	/* If the MR was marked as invalidate, this will
	 * trigger an async flush. */
	if (zot_me) {
		rds_destroy_mr(mr);
		rds_mr_put(mr);
	}
}

void rds_rdma_free_op(struct rm_rdma_op *ro)
{
	unsigned int i;

	for (i = 0; i < ro->op_nents; i++) {
		struct page *page = sg_page(&ro->op_sg[i]);

		/* Mark page dirty if it was possibly modified, which
		 * is the case for a RDMA_READ which copies from remote
		 * to local memory */
		if (!ro->op_write) {
			WARN_ON(!page->mapping && irqs_disabled());
			set_page_dirty(page);
		}
		put_page(page);
	}

	kfree(ro->op_notifier);
	ro->op_notifier = NULL;
	ro->op_active = 0;
}

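/*
 * Tear down the single-page atomic op: release the pinned page and the
 * notifier, mirroring rds_rdma_free_op() above.
 */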
void rds_atomic_free_op(struct rm_atomic_op *ao)
{
	struct page *page = sg_page(ao->op_sg);

	/* Mark page dirty if it was possibly modified, which
	 * is the case for a RDMA_READ which copies from remote
	 * to local memory */
	set_page_dirty(page);
	put_page(page);

	kfree(ao->op_notifier);
	ao->op_notifier = NULL;
	ao->op_active = 0;
}


/*
 * Count the number of pages needed to describe an incoming iovec array.
 */
static int rds_rdma_pages(struct rds_iovec iov[], int nr_iovecs)
{
	int tot_pages = 0;
	unsigned int nr_pages;
	unsigned int i;

	/* figure out the number of pages in the vector */
	for (i = 0; i < nr_iovecs; i++) {
		nr_pages = rds_pages_in_vec(&iov[i]);
		if (nr_pages == 0)
			return -EINVAL;

		tot_pages += nr_pages;

		/*
		 * nr_pages for one entry is limited to (UINT_MAX>>PAGE_SHIFT)+1,
		 * so tot_pages cannot overflow without first going negative.
		 */
		if (tot_pages < 0)
			return -EINVAL;
	}

	return tot_pages;
}

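/*
 * Compute how many bytes of scatterlist an incoming RDMA request will
 * need, and stash a kernel copy of the user's iovec array in @iov so
 * rds_cmsg_rdma_args() can reuse it without a second copy_from_user().
 */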
int rds_rdma_extra_size(struct rds_rdma_args *args,
			struct rds_iov_vector *iov)
{
	struct rds_iovec *vec;
	struct rds_iovec __user *local_vec;
	int tot_pages = 0;
	unsigned int nr_pages;
	unsigned int i;

	local_vec = (struct rds_iovec __user *)(unsigned long) args->local_vec_addr;

	if (args->nr_local == 0)
		return -EINVAL;

	iov->iov = kcalloc(args->nr_local,
			   sizeof(struct rds_iovec),
			   GFP_KERNEL);
	if (!iov->iov)
		return -ENOMEM;

	vec = &iov->iov[0];

	if (copy_from_user(vec, local_vec, args->nr_local *
			   sizeof(struct rds_iovec)))
		return -EFAULT;
	iov->len = args->nr_local;

	/* figure out the number of pages in the vector */
	for (i = 0; i < args->nr_local; i++, vec++) {
		nr_pages = rds_pages_in_vec(vec);
		if (nr_pages == 0)
			return -EINVAL;

		tot_pages += nr_pages;

		/*
		 * nr_pages for one entry is limited to (UINT_MAX>>PAGE_SHIFT)+1,
		 * so tot_pages cannot overflow without first going negative.
		 */
		if (tot_pages < 0)
			return -EINVAL;
	}

	return tot_pages * sizeof(struct scatterlist);
}

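/*
 * Illustrative sketch only (field names as defined by the matching uapi
 * rds.h): the initiator attaches the RDMA descriptor as sendmsg()
 * ancillary data, roughly
 *
 *	struct rds_rdma_args rargs = {
 *		.cookie         = remote_cookie,
 *		.remote_vec     = { .addr = remote_off, .bytes = len },
 *		.local_vec_addr = (u64)(unsigned long)iov,
 *		.nr_local       = niov,
 *		.flags          = RDS_RDMA_READWRITE | RDS_RDMA_NOTIFY_ME,
 *		.user_token     = token,
 *	};
 *
 * carried in a cmsg with cmsg_level == SOL_RDS and cmsg_type ==
 * RDS_CMSG_RDMA_ARGS, where remote_cookie came from the peer's
 * RDS_GET_MR call.
 */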
/*
 * The application asks for an RDMA transfer.
 * Extract all arguments and set up the rdma_op.
 */
int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
		       struct cmsghdr *cmsg,
		       struct rds_iov_vector *vec)
{
	struct rds_rdma_args *args;
	struct rm_rdma_op *op = &rm->rdma;
	int nr_pages;
	unsigned int nr_bytes;
	struct page **pages = NULL;
	struct rds_iovec *iovs;
	unsigned int i, j;
	int ret = 0;

	if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct rds_rdma_args))
	    || rm->rdma.op_active)
		return -EINVAL;

	args = CMSG_DATA(cmsg);

	if (ipv6_addr_any(&rs->rs_bound_addr)) {
		ret = -ENOTCONN; /* XXX not a great errno */
		goto out_ret;
	}

	if (args->nr_local > UIO_MAXIOV) {
		ret = -EMSGSIZE;
		goto out_ret;
	}

	if (vec->len != args->nr_local) {
		ret = -EINVAL;
		goto out_ret;
	}

	iovs = vec->iov;

	nr_pages = rds_rdma_pages(iovs, args->nr_local);
	if (nr_pages < 0) {
		ret = -EINVAL;
		goto out_ret;
	}

	pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL);
	if (!pages) {
		ret = -ENOMEM;
		goto out_ret;
	}

	op->op_write = !!(args->flags & RDS_RDMA_READWRITE);
	op->op_fence = !!(args->flags & RDS_RDMA_FENCE);
	op->op_notify = !!(args->flags & RDS_RDMA_NOTIFY_ME);
	op->op_silent = !!(args->flags & RDS_RDMA_SILENT);
	op->op_active = 1;
	op->op_recverr = rs->rs_recverr;
	WARN_ON(!nr_pages);
	op->op_sg = rds_message_alloc_sgs(rm, nr_pages, &ret);
	if (!op->op_sg)
		goto out_pages;

	if (op->op_notify || op->op_recverr) {
		/* We allocate an uninitialized notifier here, because
		 * we don't want to do that in the completion handler. We
		 * would have to use GFP_ATOMIC there, and don't want to deal
		 * with failed allocations.
		 */
		op->op_notifier = kmalloc(sizeof(struct rds_notifier), GFP_KERNEL);
		if (!op->op_notifier) {
			ret = -ENOMEM;
			goto out_pages;
		}
		op->op_notifier->n_user_token = args->user_token;
		op->op_notifier->n_status = RDS_RDMA_SUCCESS;

		/* Enable rdma notification on data operation for composite
		 * rds messages and make sure notification is enabled only
		 * for the data operation which follows it so that application
		 * gets notified only after full message gets delivered.
		 */
		if (rm->data.op_sg) {
			rm->rdma.op_notify = 0;
			rm->data.op_notify = !!(args->flags & RDS_RDMA_NOTIFY_ME);
		}
	}

	/* The cookie contains the R_Key of the remote memory region, and
	 * optionally an offset into it. This is how we implement RDMA into
	 * unaligned memory.
	 * When setting up the RDMA, we need to add that offset to the
	 * destination address (which is really an offset into the MR)
	 * FIXME: We may want to move this into ib_rdma.c
	 */
	op->op_rkey = rds_rdma_cookie_key(args->cookie);
	op->op_remote_addr = args->remote_vec.addr + rds_rdma_cookie_offset(args->cookie);

	nr_bytes = 0;

	rdsdebug("RDS: rdma prepare nr_local %llu rva %llx rkey %x\n",
	       (unsigned long long)args->nr_local,
	       (unsigned long long)args->remote_vec.addr,
	       op->op_rkey);

	for (i = 0; i < args->nr_local; i++) {
		struct rds_iovec *iov = &iovs[i];
		/* don't need to check, rds_rdma_pages() verified nr will be +nonzero */
		unsigned int nr = rds_pages_in_vec(iov);

		rs->rs_user_addr = iov->addr;
		rs->rs_user_bytes = iov->bytes;

		/* If it's a WRITE operation, we want to pin the pages for reading.
		 * If it's a READ operation, we need to pin the pages for writing.
		 */
		ret = rds_pin_pages(iov->addr, nr, pages, !op->op_write);
		if (ret < 0)
			goto out_pages;
		else
			ret = 0;

		rdsdebug("RDS: nr_bytes %u nr %u iov->bytes %llu iov->addr %llx\n",
			 nr_bytes, nr, iov->bytes, iov->addr);

		nr_bytes += iov->bytes;

		for (j = 0; j < nr; j++) {
			unsigned int offset = iov->addr & ~PAGE_MASK;
			struct scatterlist *sg;

			sg = &op->op_sg[op->op_nents + j];
			sg_set_page(sg, pages[j],
					min_t(unsigned int, iov->bytes, PAGE_SIZE - offset),
					offset);

			rdsdebug("RDS: sg->offset %x sg->len %x iov->addr %llx iov->bytes %llu\n",
			       sg->offset, sg->length, iov->addr, iov->bytes);

			iov->addr += sg->length;
			iov->bytes -= sg->length;
		}

		op->op_nents += nr;
	}

	if (nr_bytes > args->remote_vec.bytes) {
		rdsdebug("RDS nr_bytes %u remote_bytes %u do not match\n",
				nr_bytes,
				(unsigned int) args->remote_vec.bytes);
		ret = -EINVAL;
		goto out_pages;
	}
	op->op_bytes = nr_bytes;

out_pages:
	kfree(pages);
out_ret:
	if (ret)
		rds_rdma_free_op(op);
	else
		rds_stats_inc(s_send_rdma);

	return ret;
}

/*
 * The application wants us to pass an RDMA destination (aka MR)
 * to the remote
 */
int rds_cmsg_rdma_dest(struct rds_sock *rs, struct rds_message *rm,
			  struct cmsghdr *cmsg)
{
	unsigned long flags;
	struct rds_mr *mr;
	u32 r_key;
	int err = 0;

	if (cmsg->cmsg_len < CMSG_LEN(sizeof(rds_rdma_cookie_t)) ||
	    rm->m_rdma_cookie != 0)
		return -EINVAL;

	memcpy(&rm->m_rdma_cookie, CMSG_DATA(cmsg), sizeof(rm->m_rdma_cookie));

	/* We are reusing a previously mapped MR here. Most likely, the
	 * application has written to the buffer, so we need to explicitly
	 * flush those writes to RAM. Otherwise the HCA may not see them
	 * when doing a DMA from that buffer.
	 */
	r_key = rds_rdma_cookie_key(rm->m_rdma_cookie);

	spin_lock_irqsave(&rs->rs_rdma_lock, flags);
	mr = rds_mr_tree_walk(&rs->rs_rdma_keys, r_key, NULL);
	if (!mr)
		err = -EINVAL;	/* invalid r_key */
	else
		refcount_inc(&mr->r_refcount);
	spin_unlock_irqrestore(&rs->rs_rdma_lock, flags);

	if (mr) {
		mr->r_trans->sync_mr(mr->r_trans_private, DMA_TO_DEVICE);
		rm->rdma.op_rdma_mr = mr;
	}
	return err;
}

/*
 * The application passes us an address range it wants to enable RDMA
 * to/from. We map the area, and save the <R_Key,offset> pair
 * in rm->m_rdma_cookie. This causes it to be sent along to the peer
 * in an extension header.
 */
int rds_cmsg_rdma_map(struct rds_sock *rs, struct rds_message *rm,
			  struct cmsghdr *cmsg)
{
	if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct rds_get_mr_args)) ||
	    rm->m_rdma_cookie != 0)
		return -EINVAL;

	return __rds_rdma_map(rs, CMSG_DATA(cmsg), &rm->m_rdma_cookie,
			      &rm->rdma.op_rdma_mr, rm->m_conn_path);
}

/*
 * Fill in rds_message for an atomic request.
 */
int rds_cmsg_atomic(struct rds_sock *rs, struct rds_message *rm,
		    struct cmsghdr *cmsg)
{
	struct page *page = NULL;
	struct rds_atomic_args *args;
	int ret = 0;

	if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct rds_atomic_args))
	 || rm->atomic.op_active)
		return -EINVAL;

	args = CMSG_DATA(cmsg);

	/* Nonmasked & masked cmsg ops converted to masked hw ops */
	switch (cmsg->cmsg_type) {
	case RDS_CMSG_ATOMIC_FADD:
		rm->atomic.op_type = RDS_ATOMIC_TYPE_FADD;
		rm->atomic.op_m_fadd.add = args->fadd.add;
		rm->atomic.op_m_fadd.nocarry_mask = 0;
		break;
	case RDS_CMSG_MASKED_ATOMIC_FADD:
		rm->atomic.op_type = RDS_ATOMIC_TYPE_FADD;
		rm->atomic.op_m_fadd.add = args->m_fadd.add;
		rm->atomic.op_m_fadd.nocarry_mask = args->m_fadd.nocarry_mask;
		break;
	case RDS_CMSG_ATOMIC_CSWP:
		rm->atomic.op_type = RDS_ATOMIC_TYPE_CSWP;
		rm->atomic.op_m_cswp.compare = args->cswp.compare;
		rm->atomic.op_m_cswp.swap = args->cswp.swap;
		rm->atomic.op_m_cswp.compare_mask = ~0;
		rm->atomic.op_m_cswp.swap_mask = ~0;
		break;
	case RDS_CMSG_MASKED_ATOMIC_CSWP:
		rm->atomic.op_type = RDS_ATOMIC_TYPE_CSWP;
		rm->atomic.op_m_cswp.compare = args->m_cswp.compare;
		rm->atomic.op_m_cswp.swap = args->m_cswp.swap;
		rm->atomic.op_m_cswp.compare_mask = args->m_cswp.compare_mask;
		rm->atomic.op_m_cswp.swap_mask = args->m_cswp.swap_mask;
		break;
	default:
		BUG(); /* should never happen */
	}

	rm->atomic.op_notify = !!(args->flags & RDS_RDMA_NOTIFY_ME);
	rm->atomic.op_silent = !!(args->flags & RDS_RDMA_SILENT);
	rm->atomic.op_active = 1;
	rm->atomic.op_recverr = rs->rs_recverr;
	rm->atomic.op_sg = rds_message_alloc_sgs(rm, 1, &ret);
	if (!rm->atomic.op_sg)
		goto err;

	/* verify 8 byte-aligned */
	if (args->local_addr & 0x7) {
		ret = -EFAULT;
		goto err;
	}

	ret = rds_pin_pages(args->local_addr, 1, &page, 1);
	if (ret != 1)
		goto err;
	ret = 0;

	sg_set_page(rm->atomic.op_sg, page, 8, offset_in_page(args->local_addr));

	if (rm->atomic.op_notify || rm->atomic.op_recverr) {
		/* We allocate an uninitialized notifier here, because
		 * we don't want to do that in the completion handler. We
		 * would have to use GFP_ATOMIC there, and don't want to deal
		 * with failed allocations.
		 */
		rm->atomic.op_notifier = kmalloc(sizeof(*rm->atomic.op_notifier), GFP_KERNEL);
		if (!rm->atomic.op_notifier) {
			ret = -ENOMEM;
			goto err;
		}

		rm->atomic.op_notifier->n_user_token = args->user_token;
		rm->atomic.op_notifier->n_status = RDS_RDMA_SUCCESS;
	}

	rm->atomic.op_rkey = rds_rdma_cookie_key(args->cookie);
	rm->atomic.op_remote_addr = args->remote_addr + rds_rdma_cookie_offset(args->cookie);

	return ret;
err:
	if (page)
		put_page(page);
	rm->atomic.op_active = 0;
	kfree(rm->atomic.op_notifier);

	return ret;
}