// SPDX-License-Identifier: GPL-2.0
/*
 *  Shared Memory Communications over RDMA (SMC-R) and RoCE
 *
 *  Basic Transport Functions exploiting Infiniband API
 *
 *  Copyright IBM Corp. 2016
 *
 *  Author(s):  Ursula Braun <ubraun@linux.vnet.ibm.com>
 */

#include <linux/socket.h>
#include <linux/if_vlan.h>
#include <linux/random.h>
#include <linux/workqueue.h>
#include <net/tcp.h>
#include <net/sock.h>
#include <rdma/ib_verbs.h>
#include <rdma/ib_cache.h>

#include "smc.h"
#include "smc_clc.h"
#include "smc_core.h"
#include "smc_ib.h"
#include "smc_wr.h"
#include "smc_llc.h"
#include "smc_cdc.h"
#include "smc_close.h"
#include "smc_ism.h"

#define SMC_LGR_NUM_INCR		256
#define SMC_LGR_FREE_DELAY_SERV		(600 * HZ)
#define SMC_LGR_FREE_DELAY_CLNT		(SMC_LGR_FREE_DELAY_SERV + 10 * HZ)
#define SMC_LGR_FREE_DELAY_FAST		(8 * HZ)

static struct smc_lgr_list smc_lgr_list = {	/* established link groups */
	.lock = __SPIN_LOCK_UNLOCKED(smc_lgr_list.lock),
	.list = LIST_HEAD_INIT(smc_lgr_list.list),
	.num = 0,
};

static void smc_buf_free(struct smc_link_group *lgr, bool is_rmb,
			 struct smc_buf_desc *buf_desc);

static void smc_lgr_schedule_free_work(struct smc_link_group *lgr)
{
	/* client link group creation always follows the server link group
	 * creation. For client use a somewhat higher removal delay time,
	 * otherwise there is a risk of out-of-sync link groups.
	 */
	mod_delayed_work(system_wq, &lgr->free_work,
			 (!lgr->is_smcd && lgr->role == SMC_CLNT) ?
			 SMC_LGR_FREE_DELAY_CLNT : SMC_LGR_FREE_DELAY_SERV);
}

void smc_lgr_schedule_free_work_fast(struct smc_link_group *lgr)
{
	mod_delayed_work(system_wq, &lgr->free_work, SMC_LGR_FREE_DELAY_FAST);
}

/* Register connection's alert token in our lookup structure.
 * To use rbtrees we have to implement our own insert core.
 * Requires @conns_lock
 * @conn	connection to register
 */
static void smc_lgr_add_alert_token(struct smc_connection *conn)
{
	struct rb_node **link, *parent = NULL;
	u32 token = conn->alert_token_local;

	link = &conn->lgr->conns_all.rb_node;
	while (*link) {
		struct smc_connection *cur = rb_entry(*link,
					struct smc_connection, alert_node);

		parent = *link;
		if (cur->alert_token_local > token)
			link = &parent->rb_left;
		else
			link = &parent->rb_right;
	}
	/* Put the new node there */
	rb_link_node(&conn->alert_node, parent, link);
	rb_insert_color(&conn->alert_node, &conn->lgr->conns_all);
}

/* Register connection in link group by assigning an alert token
 * registered in a search tree.
 * Requires @conns_lock
 * Note that '0' is a reserved value and not assigned.
 */
static void smc_lgr_register_conn(struct smc_connection *conn)
{
	struct smc_sock *smc = container_of(conn, struct smc_sock, conn);
	static atomic_t nexttoken = ATOMIC_INIT(0);

	/* find a new alert_token_local value not yet used by some connection
	 * in this link group
	 */
	sock_hold(&smc->sk); /* sock_put in smc_lgr_unregister_conn() */
	while (!conn->alert_token_local) {
		conn->alert_token_local = atomic_inc_return(&nexttoken);
		if (smc_lgr_find_conn(conn->alert_token_local, conn->lgr))
			conn->alert_token_local = 0;
	}
	smc_lgr_add_alert_token(conn);
	conn->lgr->conns_num++;
}

/* Unregister connection and reset the alert token of the given connection
 */
static void __smc_lgr_unregister_conn(struct smc_connection *conn)
{
	struct smc_sock *smc = container_of(conn, struct smc_sock, conn);
	struct smc_link_group *lgr = conn->lgr;

	rb_erase(&conn->alert_node, &lgr->conns_all);
	lgr->conns_num--;
	conn->alert_token_local = 0;
	sock_put(&smc->sk); /* sock_hold in smc_lgr_register_conn() */
}

/* Unregister connection from lgr
 */
static void smc_lgr_unregister_conn(struct smc_connection *conn)
{
	struct smc_link_group *lgr = conn->lgr;

	if (!lgr)
		return;
	write_lock_bh(&lgr->conns_lock);
	if (conn->alert_token_local)
		__smc_lgr_unregister_conn(conn);
	write_unlock_bh(&lgr->conns_lock);
}

/* Send delete link, either as client to request the initiation
 * of the DELETE LINK sequence from server; or as server to
 * initiate the delete processing. See smc_llc_rx_delete_link().
 */
static int smc_link_send_delete(struct smc_link *lnk)
{
	if (lnk->state == SMC_LNK_ACTIVE &&
	    !smc_llc_send_delete_link(lnk, SMC_LLC_REQ, true)) {
		smc_llc_link_deleting(lnk);
		return 0;
	}
	return -ENOTCONN;
}

static void smc_lgr_free(struct smc_link_group *lgr);

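/* delayed link group freeing: bail out if connections showed up again,
 * otherwise try a graceful DELETE LINK exchange before freeing the lgr
 */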
static void smc_lgr_free_work(struct work_struct *work)
{
	struct smc_link_group *lgr = container_of(to_delayed_work(work),
						  struct smc_link_group,
						  free_work);
	bool conns;

	spin_lock_bh(&smc_lgr_list.lock);
	read_lock_bh(&lgr->conns_lock);
	conns = RB_EMPTY_ROOT(&lgr->conns_all);
	read_unlock_bh(&lgr->conns_lock);
	if (!conns) { /* number of lgr connections is no longer zero */
		spin_unlock_bh(&smc_lgr_list.lock);
		return;
	}
	if (!list_empty(&lgr->list))
		list_del_init(&lgr->list); /* remove from smc_lgr_list */
	spin_unlock_bh(&smc_lgr_list.lock);

	if (!lgr->is_smcd && !lgr->terminating)	{
		struct smc_link *lnk = &lgr->lnk[SMC_SINGLE_LINK];

		/* try to send del link msg, on error free lgr immediately */
		if (lnk->state == SMC_LNK_ACTIVE &&
		    !smc_link_send_delete(lnk)) {
			/* reschedule in case we never receive a response */
			smc_lgr_schedule_free_work(lgr);
			return;
		}
	}

	if (!delayed_work_pending(&lgr->free_work)) {
		struct smc_link *lnk = &lgr->lnk[SMC_SINGLE_LINK];

		if (!lgr->is_smcd && lnk->state != SMC_LNK_INACTIVE)
			smc_llc_link_inactive(lnk);
		if (lgr->is_smcd)
			smc_ism_signal_shutdown(lgr);
		smc_lgr_free(lgr);
	}
}

/* create a new SMC link group */
static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini)
{
	struct smc_link_group *lgr;
	struct smc_link *lnk;
	u8 rndvec[3];
	int rc = 0;
	int i;

	if (ini->is_smcd && ini->vlan_id) {
		if (smc_ism_get_vlan(ini->ism_dev, ini->vlan_id)) {
			rc = SMC_CLC_DECL_ISMVLANERR;
			goto out;
		}
	}

	lgr = kzalloc(sizeof(*lgr), GFP_KERNEL);
	if (!lgr) {
		rc = SMC_CLC_DECL_MEM;
		goto out;
	}
	lgr->is_smcd = ini->is_smcd;
	lgr->sync_err = 0;
	lgr->vlan_id = ini->vlan_id;
	rwlock_init(&lgr->sndbufs_lock);
	rwlock_init(&lgr->rmbs_lock);
	rwlock_init(&lgr->conns_lock);
	for (i = 0; i < SMC_RMBE_SIZES; i++) {
		INIT_LIST_HEAD(&lgr->sndbufs[i]);
		INIT_LIST_HEAD(&lgr->rmbs[i]);
	}
	smc_lgr_list.num += SMC_LGR_NUM_INCR;
	memcpy(&lgr->id, (u8 *)&smc_lgr_list.num, SMC_LGR_ID_SIZE);
	INIT_DELAYED_WORK(&lgr->free_work, smc_lgr_free_work);
	lgr->conns_all = RB_ROOT;
	if (ini->is_smcd) {
		/* SMC-D specific settings */
		lgr->peer_gid = ini->ism_gid;
		lgr->smcd = ini->ism_dev;
	} else {
		/* SMC-R specific settings */
		lgr->role = smc->listen_smc ? SMC_SERV : SMC_CLNT;
		memcpy(lgr->peer_systemid, ini->ib_lcl->id_for_peer,
		       SMC_SYSTEMID_LEN);

		lnk = &lgr->lnk[SMC_SINGLE_LINK];
		/* initialize link */
		lnk->state = SMC_LNK_ACTIVATING;
		lnk->link_id = SMC_SINGLE_LINK;
		lnk->smcibdev = ini->ib_dev;
		lnk->ibport = ini->ib_port;
		lnk->path_mtu =
			ini->ib_dev->pattr[ini->ib_port - 1].active_mtu;
		if (!ini->ib_dev->initialized)
			smc_ib_setup_per_ibdev(ini->ib_dev);
		get_random_bytes(rndvec, sizeof(rndvec));
		lnk->psn_initial = rndvec[0] + (rndvec[1] << 8) +
			(rndvec[2] << 16);
		rc = smc_ib_determine_gid(lnk->smcibdev, lnk->ibport,
					  ini->vlan_id, lnk->gid,
					  &lnk->sgid_index);
		if (rc)
			goto free_lgr;
		rc = smc_llc_link_init(lnk);
		if (rc)
			goto free_lgr;
		rc = smc_wr_alloc_link_mem(lnk);
		if (rc)
			goto clear_llc_lnk;
		rc = smc_ib_create_protection_domain(lnk);
		if (rc)
			goto free_link_mem;
		rc = smc_ib_create_queue_pair(lnk);
		if (rc)
			goto dealloc_pd;
		rc = smc_wr_create_link(lnk);
		if (rc)
			goto destroy_qp;
	}
	smc->conn.lgr = lgr;
	spin_lock_bh(&smc_lgr_list.lock);
	list_add(&lgr->list, &smc_lgr_list.list);
	spin_unlock_bh(&smc_lgr_list.lock);
	return 0;

destroy_qp:
	smc_ib_destroy_queue_pair(lnk);
dealloc_pd:
	smc_ib_dealloc_protection_domain(lnk);
free_link_mem:
	smc_wr_free_link_mem(lnk);
clear_llc_lnk:
	smc_llc_link_clear(lnk);
free_lgr:
	kfree(lgr);
out:
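	/* map any negative errno to an SMC CLC decline reason code */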
	if (rc < 0) {
		if (rc == -ENOMEM)
			rc = SMC_CLC_DECL_MEM;
		else
			rc = SMC_CLC_DECL_INTERR;
	}
	return rc;
}

static void smc_buf_unuse(struct smc_connection *conn,
			  struct smc_link_group *lgr)
{
	if (conn->sndbuf_desc)
		conn->sndbuf_desc->used = 0;
	if (conn->rmb_desc) {
		if (!conn->rmb_desc->regerr) {
			if (!lgr->is_smcd) {
				/* unregister rmb with peer */
				smc_llc_do_delete_rkey(
						&lgr->lnk[SMC_SINGLE_LINK],
						conn->rmb_desc);
			}
			conn->rmb_desc->used = 0;
		} else {
			/* buf registration failed, reuse not possible */
			write_lock_bh(&lgr->rmbs_lock);
			list_del(&conn->rmb_desc->list);
			write_unlock_bh(&lgr->rmbs_lock);

			smc_buf_free(lgr, true, conn->rmb_desc);
		}
	}
}

/* remove a finished connection from its link group */
void smc_conn_free(struct smc_connection *conn)
{
	struct smc_link_group *lgr = conn->lgr;

	if (!lgr)
		return;
	if (lgr->is_smcd) {
		smc_ism_unset_conn(conn);
		tasklet_kill(&conn->rx_tsklet);
	} else {
		smc_cdc_tx_dismiss_slots(conn);
	}
	smc_lgr_unregister_conn(conn);
	smc_buf_unuse(conn, lgr);		/* allow buffer reuse */
	conn->lgr = NULL;

	if (!lgr->conns_num)
		smc_lgr_schedule_free_work(lgr);
}

static void smc_link_clear(struct smc_link *lnk)
{
	lnk->peer_qpn = 0;
	smc_llc_link_clear(lnk);
	smc_ib_modify_qp_reset(lnk);
	smc_wr_free_link(lnk);
	smc_ib_destroy_queue_pair(lnk);
	smc_ib_dealloc_protection_domain(lnk);
	smc_wr_free_link_mem(lnk);
}

static void smcr_buf_free(struct smc_link_group *lgr, bool is_rmb,
			  struct smc_buf_desc *buf_desc)
{
	struct smc_link *lnk = &lgr->lnk[SMC_SINGLE_LINK];

	if (is_rmb) {
		if (buf_desc->mr_rx[SMC_SINGLE_LINK])
			smc_ib_put_memory_region(
					buf_desc->mr_rx[SMC_SINGLE_LINK]);
		smc_ib_buf_unmap_sg(lnk->smcibdev, buf_desc,
				    DMA_FROM_DEVICE);
	} else {
		smc_ib_buf_unmap_sg(lnk->smcibdev, buf_desc,
				    DMA_TO_DEVICE);
	}
	sg_free_table(&buf_desc->sgt[SMC_SINGLE_LINK]);
	if (buf_desc->pages)
		__free_pages(buf_desc->pages, buf_desc->order);
	kfree(buf_desc);
}

static void smcd_buf_free(struct smc_link_group *lgr, bool is_dmb,
			  struct smc_buf_desc *buf_desc)
{
	if (is_dmb) {
		/* restore original buf len */
		buf_desc->len += sizeof(struct smcd_cdc_msg);
		smc_ism_unregister_dmb(lgr->smcd, buf_desc);
	} else {
		kfree(buf_desc->cpu_addr);
	}
	kfree(buf_desc);
}

static void smc_buf_free(struct smc_link_group *lgr, bool is_rmb,
			 struct smc_buf_desc *buf_desc)
{
	if (lgr->is_smcd)
		smcd_buf_free(lgr, is_rmb, buf_desc);
	else
		smcr_buf_free(lgr, is_rmb, buf_desc);
}

static void __smc_lgr_free_bufs(struct smc_link_group *lgr, bool is_rmb)
{
	struct smc_buf_desc *buf_desc, *bf_desc;
	struct list_head *buf_list;
	int i;

	for (i = 0; i < SMC_RMBE_SIZES; i++) {
		if (is_rmb)
			buf_list = &lgr->rmbs[i];
		else
			buf_list = &lgr->sndbufs[i];
		list_for_each_entry_safe(buf_desc, bf_desc, buf_list,
					 list) {
			list_del(&buf_desc->list);
			smc_buf_free(lgr, is_rmb, buf_desc);
		}
	}
}

static void smc_lgr_free_bufs(struct smc_link_group *lgr)
{
	/* free send buffers */
	__smc_lgr_free_bufs(lgr, false);
	/* free rmbs */
	__smc_lgr_free_bufs(lgr, true);
}

/* remove a link group */
static void smc_lgr_free(struct smc_link_group *lgr)
{
	smc_lgr_free_bufs(lgr);
	if (lgr->is_smcd)
		smc_ism_put_vlan(lgr->smcd, lgr->vlan_id);
	else
		smc_link_clear(&lgr->lnk[SMC_SINGLE_LINK]);
	kfree(lgr);
}

void smc_lgr_forget(struct smc_link_group *lgr)
{
	spin_lock_bh(&smc_lgr_list.lock);
	/* do not use this link group for new connections */
	if (!list_empty(&lgr->list))
		list_del_init(&lgr->list);
	spin_unlock_bh(&smc_lgr_list.lock);
}

/* terminate link group abnormally */
static void __smc_lgr_terminate(struct smc_link_group *lgr)
{
	struct smc_connection *conn;
	struct smc_sock *smc;
	struct rb_node *node;

	if (lgr->terminating)
		return;	/* lgr already terminating */
	lgr->terminating = 1;
	if (!list_empty(&lgr->list)) /* forget lgr */
		list_del_init(&lgr->list);
	if (!lgr->is_smcd)
		smc_llc_link_inactive(&lgr->lnk[SMC_SINGLE_LINK]);

	write_lock_bh(&lgr->conns_lock);
	node = rb_first(&lgr->conns_all);
	while (node) {
		conn = rb_entry(node, struct smc_connection, alert_node);
		smc = container_of(conn, struct smc_sock, conn);
		sock_hold(&smc->sk); /* sock_put in close work */
		conn->local_tx_ctrl.conn_state_flags.peer_conn_abort = 1;
		__smc_lgr_unregister_conn(conn);
		conn->lgr = NULL;
		write_unlock_bh(&lgr->conns_lock);
		if (!schedule_work(&conn->close_work))
			sock_put(&smc->sk);
		write_lock_bh(&lgr->conns_lock);
		node = rb_first(&lgr->conns_all);
	}
	write_unlock_bh(&lgr->conns_lock);
	if (!lgr->is_smcd)
		wake_up(&lgr->lnk[SMC_SINGLE_LINK].wr_reg_wait);
	smc_lgr_schedule_free_work(lgr);
}

void smc_lgr_terminate(struct smc_link_group *lgr)
{
	spin_lock_bh(&smc_lgr_list.lock);
	__smc_lgr_terminate(lgr);
	spin_unlock_bh(&smc_lgr_list.lock);
}

/* Called when IB port is terminated */
void smc_port_terminate(struct smc_ib_device *smcibdev, u8 ibport)
{
	struct smc_link_group *lgr, *l;

	spin_lock_bh(&smc_lgr_list.lock);
	list_for_each_entry_safe(lgr, l, &smc_lgr_list.list, list) {
		if (!lgr->is_smcd &&
		    lgr->lnk[SMC_SINGLE_LINK].smcibdev == smcibdev &&
		    lgr->lnk[SMC_SINGLE_LINK].ibport == ibport)
			__smc_lgr_terminate(lgr);
	}
	spin_unlock_bh(&smc_lgr_list.lock);
}

/* Called when SMC-D device is terminated or peer is lost */
void smc_smcd_terminate(struct smcd_dev *dev, u64 peer_gid, unsigned short vlan)
{
	struct smc_link_group *lgr, *l;
	LIST_HEAD(lgr_free_list);

	/* run common cleanup function and build free list */
	spin_lock_bh(&smc_lgr_list.lock);
	list_for_each_entry_safe(lgr, l, &smc_lgr_list.list, list) {
		if (lgr->is_smcd && lgr->smcd == dev &&
		    (!peer_gid || lgr->peer_gid == peer_gid) &&
		    (vlan == VLAN_VID_MASK || lgr->vlan_id == vlan)) {
			__smc_lgr_terminate(lgr);
			list_move(&lgr->list, &lgr_free_list);
		}
	}
	spin_unlock_bh(&smc_lgr_list.lock);

	/* cancel the regular free workers and actually free lgrs */
	list_for_each_entry_safe(lgr, l, &lgr_free_list, list) {
		list_del_init(&lgr->list);
		cancel_delayed_work_sync(&lgr->free_work);
		if (!peer_gid && vlan == VLAN_VID_MASK) /* dev terminated? */
			smc_ism_signal_shutdown(lgr);
		smc_lgr_free(lgr);
	}
}

/* Determine vlan of internal TCP socket.
 * The determined vlan id is stored in ini->vlan_id.
 */
int smc_vlan_by_tcpsk(struct socket *clcsock, struct smc_init_info *ini)
{
	struct dst_entry *dst = sk_dst_get(clcsock->sk);
	struct net_device *ndev;
	int i, nest_lvl, rc = 0;

	ini->vlan_id = 0;
	if (!dst) {
		rc = -ENOTCONN;
		goto out;
	}
	if (!dst->dev) {
		rc = -ENODEV;
		goto out_rel;
	}

	ndev = dst->dev;
	if (is_vlan_dev(ndev)) {
		ini->vlan_id = vlan_dev_vlan_id(ndev);
		goto out_rel;
	}

	rtnl_lock();
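	/* dst->dev is no vlan device itself; walk its lower devices to find
	 * a stacked vlan device and take the vlan id from there
	 */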
	nest_lvl = dev_get_nest_level(ndev);
	for (i = 0; i < nest_lvl; i++) {
		struct list_head *lower = &ndev->adj_list.lower;

		if (list_empty(lower))
			break;
		lower = lower->next;
		ndev = (struct net_device *)netdev_lower_get_next(ndev, &lower);
		if (is_vlan_dev(ndev)) {
			ini->vlan_id = vlan_dev_vlan_id(ndev);
			break;
		}
	}
	rtnl_unlock();

out_rel:
	dst_release(dst);
out:
	return rc;
}

static bool smcr_lgr_match(struct smc_link_group *lgr,
			   struct smc_clc_msg_local *lcl,
			   enum smc_lgr_role role, u32 clcqpn)
{
	return !memcmp(lgr->peer_systemid, lcl->id_for_peer,
		       SMC_SYSTEMID_LEN) &&
		!memcmp(lgr->lnk[SMC_SINGLE_LINK].peer_gid, &lcl->gid,
			SMC_GID_SIZE) &&
		!memcmp(lgr->lnk[SMC_SINGLE_LINK].peer_mac, lcl->mac,
			sizeof(lcl->mac)) &&
		lgr->role == role &&
		(lgr->role == SMC_SERV ||
		 lgr->lnk[SMC_SINGLE_LINK].peer_qpn == clcqpn);
}

static bool smcd_lgr_match(struct smc_link_group *lgr,
			   struct smcd_dev *smcismdev, u64 peer_gid)
{
	return lgr->peer_gid == peer_gid && lgr->smcd == smcismdev;
}

/* create a new SMC connection (and a new link group if necessary) */
int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini)
{
	struct smc_connection *conn = &smc->conn;
	struct smc_link_group *lgr;
	enum smc_lgr_role role;
	int rc = 0;

	ini->cln_first_contact = SMC_FIRST_CONTACT;
	role = smc->listen_smc ? SMC_SERV : SMC_CLNT;
	if (role == SMC_CLNT && ini->srv_first_contact)
		/* create new link group as well */
		goto create;

	/* determine if an existing link group can be reused */
	spin_lock_bh(&smc_lgr_list.lock);
	list_for_each_entry(lgr, &smc_lgr_list.list, list) {
		write_lock_bh(&lgr->conns_lock);
		if ((ini->is_smcd ?
		     smcd_lgr_match(lgr, ini->ism_dev, ini->ism_gid) :
		     smcr_lgr_match(lgr, ini->ib_lcl, role, ini->ib_clcqpn)) &&
		    !lgr->sync_err &&
		    lgr->vlan_id == ini->vlan_id &&
		    (role == SMC_CLNT ||
		     lgr->conns_num < SMC_RMBS_PER_LGR_MAX)) {
			/* link group found */
			ini->cln_first_contact = SMC_REUSE_CONTACT;
			conn->lgr = lgr;
			smc_lgr_register_conn(conn); /* add smc conn to lgr */
			if (delayed_work_pending(&lgr->free_work))
				cancel_delayed_work(&lgr->free_work);
			write_unlock_bh(&lgr->conns_lock);
			break;
		}
		write_unlock_bh(&lgr->conns_lock);
	}
	spin_unlock_bh(&smc_lgr_list.lock);

	if (role == SMC_CLNT && !ini->srv_first_contact &&
	    ini->cln_first_contact == SMC_FIRST_CONTACT) {
		/* Server reuses a link group, but Client wants to start
		 * a new one
		 * send out_of_sync decline, reason synchr. error
		 */
		return SMC_CLC_DECL_SYNCERR;
	}

create:
	if (ini->cln_first_contact == SMC_FIRST_CONTACT) {
		rc = smc_lgr_create(smc, ini);
		if (rc)
			goto out;
		smc_lgr_register_conn(conn); /* add smc conn to lgr */
	}
	conn->local_tx_ctrl.common.type = SMC_CDC_MSG_TYPE;
	conn->local_tx_ctrl.len = SMC_WR_TX_SIZE;
	conn->urg_state = SMC_URG_READ;
	if (ini->is_smcd) {
		conn->rx_off = sizeof(struct smcd_cdc_msg);
		smcd_cdc_rx_init(conn); /* init tasklet for this conn */
	}
#ifndef KERNEL_HAS_ATOMIC64
	spin_lock_init(&conn->acurs_lock);
#endif

out:
	return rc;
}

/* convert the RMB size into the compressed notation - minimum 16K.
 * In contrast to plain ilog2, this rounds towards the next power of 2,
 * so the socket application gets at least its desired sndbuf / rcvbuf size.
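 * Example: 16KB -> 0, 24KB -> 1 (rounded up to 32KB), 64KB -> 2.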
 */
static u8 smc_compress_bufsize(int size)
{
	u8 compressed;

	if (size <= SMC_BUF_MIN_SIZE)
		return 0;

	size = (size - 1) >> 14;
	compressed = ilog2(size) + 1;
	if (compressed >= SMC_RMBE_SIZES)
		compressed = SMC_RMBE_SIZES - 1;
	return compressed;
}

/* convert the RMB size from compressed notation into integer */
int smc_uncompress_bufsize(u8 compressed)
{
	u32 size;

	size = 0x00000001 << (((int)compressed) + 14);
	return (int)size;
}

/* try to reuse a sndbuf or rmb description slot for a certain
 * buffer size; if not available, return NULL
 */
static struct smc_buf_desc *smc_buf_get_slot(int compressed_bufsize,
					     rwlock_t *lock,
					     struct list_head *buf_list)
{
	struct smc_buf_desc *buf_slot;

	read_lock_bh(lock);
	list_for_each_entry(buf_slot, buf_list, list) {
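		/* cmpxchg claims a free slot atomically (used 0 -> 1), so two
		 * connections can never grab the same buffer descriptor
		 */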
		if (cmpxchg(&buf_slot->used, 0, 1) == 0) {
			read_unlock_bh(lock);
			return buf_slot;
		}
	}
	read_unlock_bh(lock);
	return NULL;
}

/* one of the conditions for announcing a receiver's current window size is
 * that it "results in a minimum increase in the window size of 10% of the
 * receive buffer space" [RFC7609]
 */
static inline int smc_rmb_wnd_update_limit(int rmbe_size)
{
	return min_t(int, rmbe_size / 10, SOCK_MIN_SNDBUF / 2);
}

static struct smc_buf_desc *smcr_new_buf_create(struct smc_link_group *lgr,
						bool is_rmb, int bufsize)
{
	struct smc_buf_desc *buf_desc;
	struct smc_link *lnk;
	int rc;

	/* try to alloc a new buffer */
	buf_desc = kzalloc(sizeof(*buf_desc), GFP_KERNEL);
	if (!buf_desc)
		return ERR_PTR(-ENOMEM);

	buf_desc->order = get_order(bufsize);
	buf_desc->pages = alloc_pages(GFP_KERNEL | __GFP_NOWARN |
				      __GFP_NOMEMALLOC | __GFP_COMP |
				      __GFP_NORETRY | __GFP_ZERO,
				      buf_desc->order);
	if (!buf_desc->pages) {
		kfree(buf_desc);
		return ERR_PTR(-EAGAIN);
	}
	buf_desc->cpu_addr = (void *)page_address(buf_desc->pages);

	/* build the sg table from the pages */
	lnk = &lgr->lnk[SMC_SINGLE_LINK];
	rc = sg_alloc_table(&buf_desc->sgt[SMC_SINGLE_LINK], 1,
			    GFP_KERNEL);
	if (rc) {
		smc_buf_free(lgr, is_rmb, buf_desc);
		return ERR_PTR(rc);
	}
	sg_set_buf(buf_desc->sgt[SMC_SINGLE_LINK].sgl,
		   buf_desc->cpu_addr, bufsize);

	/* map sg table to DMA address */
	rc = smc_ib_buf_map_sg(lnk->smcibdev, buf_desc,
			       is_rmb ? DMA_FROM_DEVICE : DMA_TO_DEVICE);
	/* SMC protocol depends on mapping to one DMA address only */
	if (rc != 1)  {
		smc_buf_free(lgr, is_rmb, buf_desc);
		return ERR_PTR(-EAGAIN);
	}

	/* create a new memory region for the RMB */
	if (is_rmb) {
		rc = smc_ib_get_memory_region(lnk->roce_pd,
					      IB_ACCESS_REMOTE_WRITE |
					      IB_ACCESS_LOCAL_WRITE,
					      buf_desc);
		if (rc) {
			smc_buf_free(lgr, is_rmb, buf_desc);
			return ERR_PTR(rc);
		}
	}

	buf_desc->len = bufsize;
	return buf_desc;
}

#define SMCD_DMBE_SIZES		7 /* 0 -> 16KB, 1 -> 32KB, .. 6 -> 1MB */

static struct smc_buf_desc *smcd_new_buf_create(struct smc_link_group *lgr,
						bool is_dmb, int bufsize)
{
	struct smc_buf_desc *buf_desc;
	int rc;

	if (smc_compress_bufsize(bufsize) > SMCD_DMBE_SIZES)
		return ERR_PTR(-EAGAIN);

	/* try to alloc a new DMB */
	buf_desc = kzalloc(sizeof(*buf_desc), GFP_KERNEL);
	if (!buf_desc)
		return ERR_PTR(-ENOMEM);
	if (is_dmb) {
		rc = smc_ism_register_dmb(lgr, bufsize, buf_desc);
		if (rc) {
			kfree(buf_desc);
			return ERR_PTR(-EAGAIN);
		}
		buf_desc->pages = virt_to_page(buf_desc->cpu_addr);
		/* CDC header stored in buf. So, pretend it was smaller */
		buf_desc->len = bufsize - sizeof(struct smcd_cdc_msg);
	} else {
		buf_desc->cpu_addr = kzalloc(bufsize, GFP_KERNEL |
					     __GFP_NOWARN | __GFP_NORETRY |
					     __GFP_NOMEMALLOC);
		if (!buf_desc->cpu_addr) {
			kfree(buf_desc);
			return ERR_PTR(-EAGAIN);
		}
		buf_desc->len = bufsize;
	}
	return buf_desc;
}

static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb)
{
	struct smc_buf_desc *buf_desc = ERR_PTR(-ENOMEM);
	struct smc_connection *conn = &smc->conn;
	struct smc_link_group *lgr = conn->lgr;
	struct list_head *buf_list;
	int bufsize, bufsize_short;
	int sk_buf_size;
	rwlock_t *lock;

	if (is_rmb)
		/* use socket recv buffer size (w/o overhead) as start value */
		sk_buf_size = smc->sk.sk_rcvbuf / 2;
	else
		/* use socket send buffer size (w/o overhead) as start value */
		sk_buf_size = smc->sk.sk_sndbuf / 2;

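	/* start with the compressed size matching the requested buffer and
	 * retry with smaller sizes if allocation or mapping fails
	 */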
	for (bufsize_short = smc_compress_bufsize(sk_buf_size);
	     bufsize_short >= 0; bufsize_short--) {

		if (is_rmb) {
			lock = &lgr->rmbs_lock;
			buf_list = &lgr->rmbs[bufsize_short];
		} else {
			lock = &lgr->sndbufs_lock;
			buf_list = &lgr->sndbufs[bufsize_short];
850
		}
851
		bufsize = smc_uncompress_bufsize(bufsize_short);
852 853 854
		if ((1 << get_order(bufsize)) > SG_MAX_SINGLE_ALLOC)
			continue;

855
		/* check for reusable slot in the link group */
856
		buf_desc = smc_buf_get_slot(bufsize_short, lock, buf_list);
857 858
		if (buf_desc) {
			memset(buf_desc->cpu_addr, 0, bufsize);
U
		}
861

862 863 864 865 866
		if (is_smcd)
			buf_desc = smcd_new_buf_create(lgr, is_rmb, bufsize);
		else
			buf_desc = smcr_new_buf_create(lgr, is_rmb, bufsize);

867 868 869
		if (PTR_ERR(buf_desc) == -ENOMEM)
			break;
		if (IS_ERR(buf_desc))
870
			continue;
871

872 873 874 875 876
		buf_desc->used = 1;
		write_lock_bh(lock);
		list_add(&buf_desc->list, buf_list);
		write_unlock_bh(lock);
		break; /* found */
U
878

879
	if (IS_ERR(buf_desc))
880 881 882 883
		return -ENOMEM;

	if (is_rmb) {
		conn->rmb_desc = buf_desc;
884 885
		conn->rmbe_size_short = bufsize_short;
		smc->sk.sk_rcvbuf = bufsize * 2;
886
		atomic_set(&conn->bytes_to_rcv, 0);
887 888
		conn->rmbe_update_limit =
			smc_rmb_wnd_update_limit(buf_desc->len);
889 890
		if (is_smcd)
			smc_ism_set_conn(conn); /* map RMB/smcd_dev to conn */
U
892 893 894
		conn->sndbuf_desc = buf_desc;
		smc->sk.sk_sndbuf = bufsize * 2;
		atomic_set(&conn->sndbuf_space, bufsize);
U
896 897 898
	return 0;
}

899 900 901 902
void smc_sndbuf_sync_sg_for_cpu(struct smc_connection *conn)
{
	struct smc_link_group *lgr = conn->lgr;

903 904
	if (!conn->lgr || conn->lgr->is_smcd)
		return;
905 906 907 908 909 910 911 912
	smc_ib_sync_sg_for_cpu(lgr->lnk[SMC_SINGLE_LINK].smcibdev,
			       conn->sndbuf_desc, DMA_TO_DEVICE);
}

void smc_sndbuf_sync_sg_for_device(struct smc_connection *conn)
{
	struct smc_link_group *lgr = conn->lgr;

913 914
	if (!conn->lgr || conn->lgr->is_smcd)
		return;
915 916 917 918 919 920 921 922
	smc_ib_sync_sg_for_device(lgr->lnk[SMC_SINGLE_LINK].smcibdev,
				  conn->sndbuf_desc, DMA_TO_DEVICE);
}

void smc_rmb_sync_sg_for_cpu(struct smc_connection *conn)
{
	struct smc_link_group *lgr = conn->lgr;

923 924
	if (!conn->lgr || conn->lgr->is_smcd)
		return;
925 926 927 928 929 930 931 932
	smc_ib_sync_sg_for_cpu(lgr->lnk[SMC_SINGLE_LINK].smcibdev,
			       conn->rmb_desc, DMA_FROM_DEVICE);
}

void smc_rmb_sync_sg_for_device(struct smc_connection *conn)
{
	struct smc_link_group *lgr = conn->lgr;

933 934
	if (!conn->lgr || conn->lgr->is_smcd)
		return;
935 936 937 938
	smc_ib_sync_sg_for_device(lgr->lnk[SMC_SINGLE_LINK].smcibdev,
				  conn->rmb_desc, DMA_FROM_DEVICE);
}

/* create the send and receive buffer for an SMC socket;
 * receive buffers are called RMBs;
 * (even though the SMC protocol allows more than one RMB-element per RMB,
 * the Linux implementation uses just one RMB-element per RMB, i.e. uses an
 * extra RMB for every connection in a link group)
 */
int smc_buf_create(struct smc_sock *smc, bool is_smcd)
{
	int rc;

	/* create send buffer */
	rc = __smc_buf_create(smc, is_smcd, false);
	if (rc)
		return rc;
	/* create rmb */
	rc = __smc_buf_create(smc, is_smcd, true);
	if (rc)
		smc_buf_free(smc->conn.lgr, false, smc->conn.sndbuf_desc);
	return rc;
}

static inline int smc_rmb_reserve_rtoken_idx(struct smc_link_group *lgr)
{
	int i;

	for_each_clear_bit(i, lgr->rtokens_used_mask, SMC_RMBS_PER_LGR_MAX) {
		if (!test_and_set_bit(i, lgr->rtokens_used_mask))
			return i;
	}
	return -ENOSPC;
}

/* add a new rtoken from peer */
int smc_rtoken_add(struct smc_link_group *lgr, __be64 nw_vaddr, __be32 nw_rkey)
{
	u64 dma_addr = be64_to_cpu(nw_vaddr);
	u32 rkey = ntohl(nw_rkey);
	int i;

	for (i = 0; i < SMC_RMBS_PER_LGR_MAX; i++) {
		if ((lgr->rtokens[i][SMC_SINGLE_LINK].rkey == rkey) &&
		    (lgr->rtokens[i][SMC_SINGLE_LINK].dma_addr == dma_addr) &&
		    test_bit(i, lgr->rtokens_used_mask)) {
			/* already in list */
			return i;
		}
	}
	i = smc_rmb_reserve_rtoken_idx(lgr);
	if (i < 0)
		return i;
	lgr->rtokens[i][SMC_SINGLE_LINK].rkey = rkey;
	lgr->rtokens[i][SMC_SINGLE_LINK].dma_addr = dma_addr;
	return i;
}

/* delete an rtoken */
int smc_rtoken_delete(struct smc_link_group *lgr, __be32 nw_rkey)
{
	u32 rkey = ntohl(nw_rkey);
	int i;

	for (i = 0; i < SMC_RMBS_PER_LGR_MAX; i++) {
		if (lgr->rtokens[i][SMC_SINGLE_LINK].rkey == rkey &&
		    test_bit(i, lgr->rtokens_used_mask)) {
			lgr->rtokens[i][SMC_SINGLE_LINK].rkey = 0;
			lgr->rtokens[i][SMC_SINGLE_LINK].dma_addr = 0;

			clear_bit(i, lgr->rtokens_used_mask);
			return 0;
		}
	}
	return -ENOENT;
}

/* save rkey and dma_addr received from peer during clc handshake */
int smc_rmb_rtoken_handling(struct smc_connection *conn,
			    struct smc_clc_msg_accept_confirm *clc)
{
	conn->rtoken_idx = smc_rtoken_add(conn->lgr, clc->rmb_dma_addr,
					  clc->rmb_rkey);
	if (conn->rtoken_idx < 0)
		return conn->rtoken_idx;
	return 0;
}

/* Called (from smc_exit) when module is removed */
void smc_core_exit(void)
{
	struct smc_link_group *lgr, *lg;
	LIST_HEAD(lgr_freeing_list);

	spin_lock_bh(&smc_lgr_list.lock);
	if (!list_empty(&smc_lgr_list.list))
		list_splice_init(&smc_lgr_list.list, &lgr_freeing_list);
	spin_unlock_bh(&smc_lgr_list.lock);
	list_for_each_entry_safe(lgr, lg, &lgr_freeing_list, list) {
		list_del_init(&lgr->list);
		if (!lgr->is_smcd) {
			struct smc_link *lnk = &lgr->lnk[SMC_SINGLE_LINK];

			if (lnk->state == SMC_LNK_ACTIVE)
				smc_llc_send_delete_link(lnk, SMC_LLC_REQ,
							 false);
			smc_llc_link_inactive(lnk);
		}
		cancel_delayed_work_sync(&lgr->free_work);
		if (lgr->is_smcd)
			smc_ism_signal_shutdown(lgr);
		smc_lgr_free(lgr); /* free link group */
	}
}