ib_rdma.c 16.6 KB
Newer Older
1
/*
 * Copyright (c) 2006, 2018 Oracle and/or its affiliates. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 */
#include <linux/kernel.h>
34
#include <linux/slab.h>
C
Chris Mason 已提交
35
#include <linux/rculist.h>
36
#include <linux/llist.h>
37

38
#include "rds_single_path.h"
39 40 41
#include "ib_mr.h"

/*
 * Workqueue that runs the per-pool delayed flush work.  It is allocated in
 * rds_ib_mr_init() and destroyed in rds_ib_mr_exit().
 */
struct workqueue_struct *rds_ib_mr_wq;
42

43 44
/*
 * Per-CPU flag word.  Bit CLEAN_LIST_BUSY_BIT is set by rds_ib_reuse_mr()
 * for the duration of its llist_del_first() on a pool's clean_list, and
 * wait_clean_list_grace() spins until that bit is clear on every online CPU.
 */
static DEFINE_PER_CPU(unsigned long, clean_list_grace);
#define CLEAN_LIST_BUSY_BIT 0
45 46 47 48 49 50

/*
 * Look up the IB device that currently owns @ipaddr.
 *
 * Walks every device on rds_ib_devices and every address on each device's
 * ipaddr_list under rcu_read_lock().  On a match the device's refcount is
 * incremented before the RCU read section ends.
 *
 * Returns the matching device with an extra reference held, or NULL when no
 * device lists @ipaddr.
 */
static struct rds_ib_device *rds_ib_get_device(__be32 ipaddr)
{
	struct rds_ib_device *found = NULL;
	struct rds_ib_device *dev;
	struct rds_ib_ipaddr *entry;

	rcu_read_lock();
	list_for_each_entry_rcu(dev, &rds_ib_devices, list) {
		list_for_each_entry_rcu(entry, &dev->ipaddr_list, list) {
			if (entry->ipaddr != ipaddr)
				continue;

			/* take the reference while still inside the RCU section */
			refcount_inc(&dev->refcount);
			found = dev;
			goto unlock;
		}
	}
unlock:
	rcu_read_unlock();

	return found;
}

/*
 * Record @ipaddr as belonging to @rds_ibdev.
 *
 * Allocates a new rds_ib_ipaddr entry and appends it to the device's
 * ipaddr_list (RCU list add, done under the device spinlock).
 *
 * Returns 0 on success or -ENOMEM if the entry cannot be allocated.
 */
static int rds_ib_add_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr)
{
	struct rds_ib_ipaddr *entry = kmalloc(sizeof(*entry), GFP_KERNEL);

	if (!entry)
		return -ENOMEM;

	entry->ipaddr = ipaddr;

	spin_lock_irq(&rds_ibdev->spinlock);
	list_add_tail_rcu(&entry->list, &rds_ibdev->ipaddr_list);
	spin_unlock_irq(&rds_ibdev->spinlock);

	return 0;
}

/*
 * Remove the first entry matching @ipaddr from @rds_ibdev's ipaddr_list.
 *
 * The entry is unlinked with list_del_rcu() under the device spinlock and
 * released with kfree_rcu() after the lock is dropped.  Nothing happens if
 * the address is not on the list.
 */
static void rds_ib_remove_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr)
{
	struct rds_ib_ipaddr *victim = NULL;
	struct rds_ib_ipaddr *cur;

	spin_lock_irq(&rds_ibdev->spinlock);
	list_for_each_entry_rcu(cur, &rds_ibdev->ipaddr_list, list) {
		if (cur->ipaddr != ipaddr)
			continue;

		list_del_rcu(&cur->list);
		victim = cur;
		break;
	}
	spin_unlock_irq(&rds_ibdev->spinlock);

	if (!victim)
		return;

	kfree_rcu(victim, rcu);
}

103 104
int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev,
			 struct in6_addr *ipaddr)
105 106 107
{
	struct rds_ib_device *rds_ibdev_old;

108
	rds_ibdev_old = rds_ib_get_device(ipaddr->s6_addr32[3]);
109
	if (!rds_ibdev_old)
110
		return rds_ib_add_ipaddr(rds_ibdev, ipaddr->s6_addr32[3]);
111 112

	if (rds_ibdev_old != rds_ibdev) {
113
		rds_ib_remove_ipaddr(rds_ibdev_old, ipaddr->s6_addr32[3]);
114
		rds_ib_dev_put(rds_ibdev_old);
115
		return rds_ib_add_ipaddr(rds_ibdev, ipaddr->s6_addr32[3]);
116
	}
117
	rds_ib_dev_put(rds_ibdev_old);
118

119
	return 0;
120 121
}

122
void rds_ib_add_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn)
123 124 125 126 127 128 129 130 131
{
	struct rds_ib_connection *ic = conn->c_transport_data;

	/* conn was previously on the nodev_conns_list */
	spin_lock_irq(&ib_nodev_conns_lock);
	BUG_ON(list_empty(&ib_nodev_conns));
	BUG_ON(list_empty(&ic->ib_node));
	list_del(&ic->ib_node);

132
	spin_lock(&rds_ibdev->spinlock);
133
	list_add_tail(&ic->ib_node, &rds_ibdev->conn_list);
134
	spin_unlock(&rds_ibdev->spinlock);
135
	spin_unlock_irq(&ib_nodev_conns_lock);
136 137

	ic->rds_ibdev = rds_ibdev;
138
	refcount_inc(&rds_ibdev->refcount);
139 140
}

141
void rds_ib_remove_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn)
142
{
143
	struct rds_ib_connection *ic = conn->c_transport_data;
144

145 146
	/* place conn on nodev_conns_list */
	spin_lock(&ib_nodev_conns_lock);
147

148 149 150 151 152 153 154 155 156 157
	spin_lock_irq(&rds_ibdev->spinlock);
	BUG_ON(list_empty(&ic->ib_node));
	list_del(&ic->ib_node);
	spin_unlock_irq(&rds_ibdev->spinlock);

	list_add_tail(&ic->ib_node, &ib_nodev_conns);

	spin_unlock(&ib_nodev_conns_lock);

	ic->rds_ibdev = NULL;
158
	rds_ib_dev_put(rds_ibdev);
159 160
}

161
void rds_ib_destroy_nodev_conns(void)
162 163 164 165 166
{
	struct rds_ib_connection *ic, *_ic;
	LIST_HEAD(tmp_list);

	/* avoid calling conn_destroy with irqs off */
167 168 169
	spin_lock_irq(&ib_nodev_conns_lock);
	list_splice(&ib_nodev_conns, &tmp_list);
	spin_unlock_irq(&ib_nodev_conns_lock);
170

A
Andy Grover 已提交
171
	list_for_each_entry_safe(ic, _ic, &tmp_list, ib_node)
172 173 174 175 176
		rds_conn_destroy(ic->conn);
}

void rds_ib_get_mr_info(struct rds_ib_device *rds_ibdev, struct rds_info_rdma_connection *iinfo)
{
177
	struct rds_ib_mr_pool *pool_1m = rds_ibdev->mr_1m_pool;
178

179 180
	iinfo->rdma_mr_max = pool_1m->max_items;
	iinfo->rdma_mr_size = pool_1m->fmr_attr.max_pages;
181 182
}

K
Ka-Cheong Poon 已提交
183
#if IS_ENABLED(CONFIG_IPV6)
184 185 186 187 188 189 190 191
/*
 * IPv6 counterpart of rds_ib_get_mr_info(): fill the MR fields of @iinfo6
 * from @rds_ibdev's 1M pool (max_items and fmr_attr.max_pages).
 */
void rds6_ib_get_mr_info(struct rds_ib_device *rds_ibdev,
			 struct rds6_info_rdma_connection *iinfo6)
{
	struct rds_ib_mr_pool *pool = rds_ibdev->mr_1m_pool;

	iinfo6->rdma_mr_max = pool->max_items;
	iinfo6->rdma_mr_size = pool->fmr_attr.max_pages;
}
K
Ka-Cheong Poon 已提交
192
#endif
193

194
struct rds_ib_mr *rds_ib_reuse_mr(struct rds_ib_mr_pool *pool)
195 196
{
	struct rds_ib_mr *ibmr = NULL;
197
	struct llist_node *ret;
198
	unsigned long *flag;
199

200
	preempt_disable();
201
	flag = this_cpu_ptr(&clean_list_grace);
202
	set_bit(CLEAN_LIST_BUSY_BIT, flag);
203
	ret = llist_del_first(&pool->clean_list);
204
	if (ret) {
205
		ibmr = llist_entry(ret, struct rds_ib_mr, llnode);
206 207 208 209 210
		if (pool->pool_type == RDS_IB_MR_8K_POOL)
			rds_ib_stats_inc(s_ib_rdma_mr_8k_reused);
		else
			rds_ib_stats_inc(s_ib_rdma_mr_1m_reused);
	}
211

212 213
	clear_bit(CLEAN_LIST_BUSY_BIT, flag);
	preempt_enable();
214 215 216
	return ibmr;
}

217 218 219 220 221 222 223 224 225 226 227 228
/*
 * Busy-wait until CLEAN_LIST_BUSY_BIT is clear in the clean_list_grace word
 * of every online CPU, visiting the CPUs one after another.
 */
static inline void wait_clean_list_grace(void)
{
	int cpu;

	for_each_online_cpu(cpu) {
		unsigned long *busy = &per_cpu(clean_list_grace, cpu);

		while (test_bit(CLEAN_LIST_BUSY_BIT, busy))
			cpu_relax();
	}
}

229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245
/*
 * Sync the scatterlist of the MR in @trans_private.
 *
 * DMA_FROM_DEVICE syncs the mapping for the CPU, DMA_TO_DEVICE syncs it for
 * the device; any other @direction is ignored.  The sync calls themselves
 * are always issued with DMA_BIDIRECTIONAL.
 */
void rds_ib_sync_mr(void *trans_private, int direction)
{
	struct rds_ib_mr *mr = trans_private;
	struct rds_ib_device *dev = mr->device;

	if (direction == DMA_FROM_DEVICE)
		ib_dma_sync_sg_for_cpu(dev->dev, mr->sg,
				       mr->sg_dma_len, DMA_BIDIRECTIONAL);
	else if (direction == DMA_TO_DEVICE)
		ib_dma_sync_sg_for_device(dev->dev, mr->sg,
					  mr->sg_dma_len, DMA_BIDIRECTIONAL);
}

246
void __rds_ib_teardown_mr(struct rds_ib_mr *ibmr)
247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265
{
	struct rds_ib_device *rds_ibdev = ibmr->device;

	if (ibmr->sg_dma_len) {
		ib_dma_unmap_sg(rds_ibdev->dev,
				ibmr->sg, ibmr->sg_len,
				DMA_BIDIRECTIONAL);
		ibmr->sg_dma_len = 0;
	}

	/* Release the s/g list */
	if (ibmr->sg_len) {
		unsigned int i;

		for (i = 0; i < ibmr->sg_len; ++i) {
			struct page *page = sg_page(&ibmr->sg[i]);

			/* FIXME we need a way to tell a r/w MR
			 * from a r/o MR */
266
			WARN_ON(!page->mapping && irqs_disabled());
267 268 269 270 271 272 273 274 275 276
			set_page_dirty(page);
			put_page(page);
		}
		kfree(ibmr->sg);

		ibmr->sg = NULL;
		ibmr->sg_len = 0;
	}
}

277
void rds_ib_teardown_mr(struct rds_ib_mr *ibmr)
278 279 280 281 282
{
	unsigned int pinned = ibmr->sg_len;

	__rds_ib_teardown_mr(ibmr);
	if (pinned) {
283
		struct rds_ib_mr_pool *pool = ibmr->pool;
284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299

		atomic_sub(pinned, &pool->free_pinned);
	}
}

/*
 * Number of MRs a flush should try to free: the pool's current item_count
 * when @free_all is set, otherwise 0.
 */
static inline unsigned int rds_ib_flush_goal(struct rds_ib_mr_pool *pool, int free_all)
{
	unsigned int nr_items = atomic_read(&pool->item_count);

	return free_all ? nr_items : 0;
}

300
/*
301
 * given an llist of mrs, put them all into the list_head for more processing
302
 */
W
Wengang Wang 已提交
303 304
static unsigned int llist_append_to_list(struct llist_head *llist,
					 struct list_head *list)
305 306
{
	struct rds_ib_mr *ibmr;
307 308
	struct llist_node *node;
	struct llist_node *next;
W
Wengang Wang 已提交
309
	unsigned int count = 0;
310 311 312 313 314

	node = llist_del_all(llist);
	while (node) {
		next = node->next;
		ibmr = llist_entry(node, struct rds_ib_mr, llnode);
315
		list_add_tail(&ibmr->unmap_list, list);
316
		node = next;
W
Wengang Wang 已提交
317
		count++;
318
	}
W
Wengang Wang 已提交
319
	return count;
320 321 322
}

/*
323 324 325
 * this takes a list head of mrs and turns it into linked llist nodes
 * of clusters.  Each cluster has linked llist nodes of
 * MR_CLUSTER_SIZE mrs that are ready for reuse.
326
 */
327 328 329 330
static void list_to_llist_nodes(struct rds_ib_mr_pool *pool,
				struct list_head *list,
				struct llist_node **nodes_head,
				struct llist_node **nodes_tail)
331 332
{
	struct rds_ib_mr *ibmr;
333 334
	struct llist_node *cur = NULL;
	struct llist_node **next = nodes_head;
335 336

	list_for_each_entry(ibmr, list, unmap_list) {
337 338 339
		cur = &ibmr->llnode;
		*next = cur;
		next = &cur->next;
340
	}
341 342
	*next = NULL;
	*nodes_tail = cur;
343 344
}

345 346 347 348 349 350
/*
 * Flush our pool of MRs.
 * At a minimum, all currently unused MRs are unmapped.
 * If the number of MRs allocated exceeds the limit, we also try
 * to free as many MRs as needed to get back to this limit.
 */
351 352
int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool,
			 int free_all, struct rds_ib_mr **ibmr_ret)
353
{
354
	struct rds_ib_mr *ibmr;
355 356
	struct llist_node *clean_nodes;
	struct llist_node *clean_tail;
357 358
	LIST_HEAD(unmap_list);
	unsigned long unpinned = 0;
W
Wengang Wang 已提交
359
	unsigned int nfreed = 0, dirty_to_clean = 0, free_goal;
360

361 362 363 364
	if (pool->pool_type == RDS_IB_MR_8K_POOL)
		rds_ib_stats_inc(s_ib_rdma_mr_8k_pool_flush);
	else
		rds_ib_stats_inc(s_ib_rdma_mr_1m_pool_flush);
365

366 367
	if (ibmr_ret) {
		DEFINE_WAIT(wait);
368
		while (!mutex_trylock(&pool->flush_lock)) {
369
			ibmr = rds_ib_reuse_mr(pool);
370 371 372 373 374 375 376 377
			if (ibmr) {
				*ibmr_ret = ibmr;
				finish_wait(&pool->flush_wait, &wait);
				goto out_nolock;
			}

			prepare_to_wait(&pool->flush_wait, &wait,
					TASK_UNINTERRUPTIBLE);
378
			if (llist_empty(&pool->clean_list))
379 380
				schedule();

381
			ibmr = rds_ib_reuse_mr(pool);
382 383 384 385 386 387 388 389 390 391 392
			if (ibmr) {
				*ibmr_ret = ibmr;
				finish_wait(&pool->flush_wait, &wait);
				goto out_nolock;
			}
		}
		finish_wait(&pool->flush_wait, &wait);
	} else
		mutex_lock(&pool->flush_lock);

	if (ibmr_ret) {
393
		ibmr = rds_ib_reuse_mr(pool);
394 395 396 397 398
		if (ibmr) {
			*ibmr_ret = ibmr;
			goto out;
		}
	}
399 400

	/* Get the list of all MRs to be dropped. Ordering matters -
401 402
	 * we want to put drop_list ahead of free_list.
	 */
W
Wengang Wang 已提交
403 404
	dirty_to_clean = llist_append_to_list(&pool->drop_list, &unmap_list);
	dirty_to_clean += llist_append_to_list(&pool->free_list, &unmap_list);
405
	if (free_all)
406
		llist_append_to_list(&pool->clean_list, &unmap_list);
407 408 409 410 411 412

	free_goal = rds_ib_flush_goal(pool, free_all);

	if (list_empty(&unmap_list))
		goto out;

413 414 415 416
	if (pool->use_fastreg)
		rds_ib_unreg_frmr(&unmap_list, &nfreed, &unpinned, free_goal);
	else
		rds_ib_unreg_fmr(&unmap_list, &nfreed, &unpinned, free_goal);
417

418 419 420
	if (!list_empty(&unmap_list)) {
		/* we have to make sure that none of the things we're about
		 * to put on the clean list would race with other cpus trying
421
		 * to pull items off.  The llist would explode if we managed to
422
		 * remove something from the clean list and then add it back again
423
		 * while another CPU was spinning on that same item in llist_del_first.
424
		 *
425
		 * This is pretty unlikely, but just in case  wait for an llist grace period
426 427 428 429
		 * here before adding anything back into the clean list.
		 */
		wait_clean_list_grace();

430
		list_to_llist_nodes(pool, &unmap_list, &clean_nodes, &clean_tail);
431
		if (ibmr_ret)
432
			*ibmr_ret = llist_entry(clean_nodes, struct rds_ib_mr, llnode);
433

434 435 436
		/* more than one entry in llist nodes */
		if (clean_nodes->next)
			llist_add_batch(clean_nodes->next, clean_tail, &pool->clean_list);
437 438

	}
439 440

	atomic_sub(unpinned, &pool->free_pinned);
W
Wengang Wang 已提交
441
	atomic_sub(dirty_to_clean, &pool->dirty_count);
442 443 444 445
	atomic_sub(nfreed, &pool->item_count);

out:
	mutex_unlock(&pool->flush_lock);
446 447 448
	if (waitqueue_active(&pool->flush_wait))
		wake_up(&pool->flush_wait);
out_nolock:
449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486
	return 0;
}

/*
 * Obtain an MR from @pool, or reserve room for a new one.
 *
 * Each round first tries to reuse a clean MR.  Failing that, item_count is
 * bumped; if the pool is still within max_items the reservation is kept and
 * NULL is returned.  Otherwise the bump is undone and the pool is flushed in
 * the hope of getting an MR back.  After the third over-limit round the
 * "depleted" statistic is bumped and the function gives up.
 *
 * Returns a reusable MR, NULL when a slot was reserved in item_count, or
 * ERR_PTR(-EAGAIN) when the pool is depleted.
 */
struct rds_ib_mr *rds_ib_try_reuse_ibmr(struct rds_ib_mr_pool *pool)
{
	int rounds = 0;

	for (;;) {
		struct rds_ib_mr *mr = rds_ib_reuse_mr(pool);

		if (mr)
			return mr;

		/* room left in the pool: keep the reservation */
		if (atomic_inc_return(&pool->item_count) <= pool->max_items)
			return NULL;

		atomic_dec(&pool->item_count);

		if (++rounds > 2) {
			if (pool->pool_type == RDS_IB_MR_8K_POOL)
				rds_ib_stats_inc(s_ib_rdma_mr_8k_pool_depleted);
			else
				rds_ib_stats_inc(s_ib_rdma_mr_1m_pool_depleted);
			return ERR_PTR(-EAGAIN);
		}

		/* We do have some empty MRs. Flush them out. */
		if (pool->pool_type == RDS_IB_MR_8K_POOL)
			rds_ib_stats_inc(s_ib_rdma_mr_8k_pool_wait);
		else
			rds_ib_stats_inc(s_ib_rdma_mr_1m_pool_wait);

		rds_ib_flush_mr_pool(pool, 0, &mr);
		if (mr)
			return mr;
	}
}

static void rds_ib_mr_pool_flush_worker(struct work_struct *work)
{
491
	struct rds_ib_mr_pool *pool = container_of(work, struct rds_ib_mr_pool, flush_worker.work);
492

493
	rds_ib_flush_mr_pool(pool, 0, NULL);
494 495 496 497 498
}

void rds_ib_free_mr(void *trans_private, int invalidate)
{
	struct rds_ib_mr *ibmr = trans_private;
499
	struct rds_ib_mr_pool *pool = ibmr->pool;
500 501 502 503 504
	struct rds_ib_device *rds_ibdev = ibmr->device;

	rdsdebug("RDS/IB: free_mr nents %u\n", ibmr->sg_len);

	/* Return it to the pool's free list */
505 506 507 508
	if (rds_ibdev->use_fastreg)
		rds_ib_free_frmr_list(ibmr);
	else
		rds_ib_free_fmr_list(ibmr);
509 510 511 512 513

	atomic_add(ibmr->sg_len, &pool->free_pinned);
	atomic_inc(&pool->dirty_count);

	/* If we've pinned too many pages, request a flush */
514
	if (atomic_read(&pool->free_pinned) >= pool->max_free_pinned ||
515
	    atomic_read(&pool->dirty_count) >= pool->max_items / 5)
516
		queue_delayed_work(rds_ib_mr_wq, &pool->flush_worker, 10);
517 518 519

	if (invalidate) {
		if (likely(!in_interrupt())) {
520
			rds_ib_flush_mr_pool(pool, 0, NULL);
521 522
		} else {
			/* We get here if the user created a MR marked
523 524
			 * as use_once and invalidate at the same time.
			 */
525
			queue_delayed_work(rds_ib_mr_wq,
526
					   &pool->flush_worker, 10);
527 528
		}
	}
529 530

	rds_ib_dev_put(rds_ibdev);
531 532 533 534 535 536
}

void rds_ib_flush_mrs(void)
{
	struct rds_ib_device *rds_ibdev;

537
	down_read(&rds_ib_devices_lock);
538
	list_for_each_entry(rds_ibdev, &rds_ib_devices, list) {
539 540
		if (rds_ibdev->mr_8k_pool)
			rds_ib_flush_mr_pool(rds_ibdev->mr_8k_pool, 0, NULL);
541

542 543
		if (rds_ibdev->mr_1m_pool)
			rds_ib_flush_mr_pool(rds_ibdev->mr_1m_pool, 0, NULL);
544
	}
545
	up_read(&rds_ib_devices_lock);
546 547 548
}

/*
 * Register the scatterlist @sg (@nents entries) as an MR on the IB device
 * that owns the socket's bound address (last 32-bit word of rs_bound_addr).
 *
 * @conn may be NULL; when present its transport data is passed to the
 * fastreg registration path.  The key is returned through @key_ret by the
 * registration helper.
 *
 * Returns the MR on success; in that case the device reference taken by the
 * lookup is kept.  Returns ERR_PTR(-ENODEV) if no device owns the address or
 * the device lacks either MR pool, or the ERR_PTR from the registration
 * helper (after logging a warning); on those paths the device reference is
 * dropped again.
 */
void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents,
		    struct rds_sock *rs, u32 *key_ret,
		    struct rds_connection *conn)
{
	struct rds_ib_connection *ic;
	struct rds_ib_device *dev;
	struct rds_ib_mr *mr;
	int err;

	dev = rds_ib_get_device(rs->rs_bound_addr.s6_addr32[3]);
	if (!dev)
		return ERR_PTR(-ENODEV);

	ic = conn ? conn->c_transport_data : NULL;

	if (!dev->mr_8k_pool || !dev->mr_1m_pool) {
		err = -ENODEV;
		goto put_dev;
	}

	if (dev->use_fastreg)
		mr = rds_ib_reg_frmr(dev, ic, sg, nents, key_ret);
	else
		mr = rds_ib_reg_fmr(dev, sg, nents, key_ret);

	if (!IS_ERR(mr))
		return mr;

	err = PTR_ERR(mr);
	pr_warn("RDS/IB: rds_ib_get_mr failed (errno=%d)\n", err);

put_dev:
	rds_ib_dev_put(dev);

	return ERR_PTR(err);
}
588

589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617
/*
 * Tear down @pool and free it.
 *
 * Any pending or running delayed flush is cancelled and waited for, then a
 * free_all flush is run.  A warning is emitted if the pool still accounts
 * for items or pinned pages afterwards.
 */
void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *pool)
{
	/* make sure the flush worker is not running or queued any more */
	cancel_delayed_work_sync(&pool->flush_worker);
	/* free_all flush: drains drop, free and clean lists */
	rds_ib_flush_mr_pool(pool, 1, NULL);
	WARN_ON(atomic_read(&pool->item_count));
	WARN_ON(atomic_read(&pool->free_pinned));
	kfree(pool);
}

/*
 * Allocate and initialise an MR pool of type @pool_type for @rds_ibdev.
 *
 * RDS_IB_MR_1M_POOL selects the 1M message size and the device's max_1m_mrs
 * limit; any other value is treated as RDS_IB_MR_8K_POOL.  The remaining
 * limits and FMR attributes are derived from those and from the device.
 *
 * Returns the new pool or ERR_PTR(-ENOMEM).
 */
struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *rds_ibdev,
					     int pool_type)
{
	struct rds_ib_mr_pool *pool = kzalloc(sizeof(*pool), GFP_KERNEL);

	if (!pool)
		return ERR_PTR(-ENOMEM);

	pool->pool_type = pool_type;
	pool->use_fastreg = rds_ibdev->use_fastreg;

	init_llist_head(&pool->free_list);
	init_llist_head(&pool->drop_list);
	init_llist_head(&pool->clean_list);
	mutex_init(&pool->flush_lock);
	init_waitqueue_head(&pool->flush_wait);
	INIT_DELAYED_WORK(&pool->flush_worker, rds_ib_mr_pool_flush_worker);

	/* the +1 on max_pages allows for unaligned MRs */
	if (pool_type == RDS_IB_MR_1M_POOL) {
		pool->fmr_attr.max_pages = RDS_MR_1M_MSG_SIZE + 1;
		pool->max_items = rds_ibdev->max_1m_mrs;
	} else {
		/* pool_type == RDS_IB_MR_8K_POOL */
		pool->fmr_attr.max_pages = RDS_MR_8K_MSG_SIZE + 1;
		pool->max_items = rds_ibdev->max_8k_mrs;
	}

	pool->fmr_attr.max_maps = rds_ibdev->fmr_max_remaps;
	pool->fmr_attr.page_shift = PAGE_SHIFT;
	pool->max_free_pinned = pool->max_items * pool->fmr_attr.max_pages / 4;
	pool->max_items_soft = rds_ibdev->max_mrs * 3 / 4;

	return pool;
}

int rds_ib_mr_init(void)
{
636
	rds_ib_mr_wq = alloc_workqueue("rds_mr_flushd", WQ_MEM_RECLAIM, 0);
637 638 639 640 641 642 643 644 645 646 647 648 649
	if (!rds_ib_mr_wq)
		return -ENOMEM;
	return 0;
}

/*
 * Destroy the MR flush workqueue allocated by rds_ib_mr_init().
 *
 * By the time this is called all the IB devices should have been torn down
 * and had their pools freed.  As each pool is freed its work struct is waited
 * on, so the pool flushing work queue should be idle by the time we get here.
 */
void rds_ib_mr_exit(void)
{
	destroy_workqueue(rds_ib_mr_wq);
}