// SPDX-License-Identifier: GPL-2.0
/*
 *  Shared Memory Communications over RDMA (SMC-R) and RoCE
 *
 *  Basic Transport Functions exploiting Infiniband API
 *
 *  Copyright IBM Corp. 2016
 *
 *  Author(s):  Ursula Braun <ubraun@linux.vnet.ibm.com>
 */

#include <linux/socket.h>
#include <linux/if_vlan.h>
#include <linux/random.h>
#include <linux/workqueue.h>
#include <net/tcp.h>
#include <net/sock.h>
#include <rdma/ib_verbs.h>
#include <rdma/ib_cache.h>

#include "smc.h"
#include "smc_clc.h"
#include "smc_core.h"
#include "smc_ib.h"
#include "smc_wr.h"
#include "smc_llc.h"
#include "smc_cdc.h"
#include "smc_close.h"
#include "smc_ism.h"

#define SMC_LGR_NUM_INCR		256
#define SMC_LGR_FREE_DELAY_SERV		(600 * HZ)
#define SMC_LGR_FREE_DELAY_CLNT		(SMC_LGR_FREE_DELAY_SERV + 10 * HZ)
#define SMC_LGR_FREE_DELAY_FAST		(8 * HZ)

static struct smc_lgr_list smc_lgr_list = {	/* established link groups */
	.lock = __SPIN_LOCK_UNLOCKED(smc_lgr_list.lock),
	.list = LIST_HEAD_INIT(smc_lgr_list.list),
	.num = 0,
};

static void smc_buf_free(struct smc_link_group *lgr, bool is_rmb,
			 struct smc_buf_desc *buf_desc);

/* return head of link group list and its lock for a given link group */
static inline struct list_head *smc_lgr_list_head(struct smc_link_group *lgr,
						  spinlock_t **lgr_lock)
{
	if (lgr->is_smcd) {
		*lgr_lock = &lgr->smcd->lgr_lock;
		return &lgr->smcd->lgr_list;
	}

	*lgr_lock = &smc_lgr_list.lock;
	return &smc_lgr_list.list;
}

static void smc_lgr_schedule_free_work(struct smc_link_group *lgr)
{
	/* client link group creation always follows the server link group
	 * creation. For client use a somewhat higher removal delay time,
	 * otherwise there is a risk of out-of-sync link groups.
	 */
	mod_delayed_work(system_wq, &lgr->free_work,
			 (!lgr->is_smcd && lgr->role == SMC_CLNT) ?
			 SMC_LGR_FREE_DELAY_CLNT : SMC_LGR_FREE_DELAY_SERV);
}

void smc_lgr_schedule_free_work_fast(struct smc_link_group *lgr)
{
	mod_delayed_work(system_wq, &lgr->free_work, SMC_LGR_FREE_DELAY_FAST);
}

/* Register connection's alert token in our lookup structure.
 * To use rbtrees we have to implement our own insert core.
 * Requires @conns_lock
 * @conn	connection to register
 */
static void smc_lgr_add_alert_token(struct smc_connection *conn)
{
	struct rb_node **link, *parent = NULL;
	u32 token = conn->alert_token_local;

	link = &conn->lgr->conns_all.rb_node;
	while (*link) {
		struct smc_connection *cur = rb_entry(*link,
					struct smc_connection, alert_node);

		parent = *link;
		if (cur->alert_token_local > token)
			link = &parent->rb_left;
		else
			link = &parent->rb_right;
	}
	/* Put the new node there */
	rb_link_node(&conn->alert_node, parent, link);
	rb_insert_color(&conn->alert_node, &conn->lgr->conns_all);
}

/* Register connection in link group by assigning an alert token
 * registered in a search tree.
 * Requires @conns_lock
 * Note that '0' is a reserved value and not assigned.
 */
static void smc_lgr_register_conn(struct smc_connection *conn)
{
	struct smc_sock *smc = container_of(conn, struct smc_sock, conn);
	static atomic_t nexttoken = ATOMIC_INIT(0);

	/* find a new alert_token_local value not yet used by some connection
	 * in this link group
	 */
	sock_hold(&smc->sk); /* sock_put in smc_lgr_unregister_conn() */
	while (!conn->alert_token_local) {
		conn->alert_token_local = atomic_inc_return(&nexttoken);
		if (smc_lgr_find_conn(conn->alert_token_local, conn->lgr))
			conn->alert_token_local = 0;
	}
	smc_lgr_add_alert_token(conn);
	conn->lgr->conns_num++;
}

/* Unregister connection and reset the alert token of the given connection
 */
static void __smc_lgr_unregister_conn(struct smc_connection *conn)
{
	struct smc_sock *smc = container_of(conn, struct smc_sock, conn);
	struct smc_link_group *lgr = conn->lgr;

	rb_erase(&conn->alert_node, &lgr->conns_all);
	lgr->conns_num--;
	conn->alert_token_local = 0;
	sock_put(&smc->sk); /* sock_hold in smc_lgr_register_conn() */
}

/* Unregister connection from lgr
 */
static void smc_lgr_unregister_conn(struct smc_connection *conn)
{
	struct smc_link_group *lgr = conn->lgr;

	if (!lgr)
		return;
	write_lock_bh(&lgr->conns_lock);
	if (conn->alert_token_local) {
		__smc_lgr_unregister_conn(conn);
	}
	write_unlock_bh(&lgr->conns_lock);
}

/* Send delete link, either as client to request the initiation
 * of the DELETE LINK sequence from server; or as server to
 * initiate the delete processing. See smc_llc_rx_delete_link().
 */
static int smc_link_send_delete(struct smc_link *lnk)
{
	if (lnk->state == SMC_LNK_ACTIVE &&
	    !smc_llc_send_delete_link(lnk, SMC_LLC_REQ, true)) {
		smc_llc_link_deleting(lnk);
		return 0;
	}
	return -ENOTCONN;
}

static void smc_lgr_free(struct smc_link_group *lgr);

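/* delayed link group freeing: bail out if connections showed up in the
 * meantime, otherwise unlink the lgr; for SMC-R try to announce the removal
 * to the peer via DELETE LINK first and reschedule until the link is
 * inactive, then free the link group
 */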
static void smc_lgr_free_work(struct work_struct *work)
{
	struct smc_link_group *lgr = container_of(to_delayed_work(work),
						  struct smc_link_group,
						  free_work);
	spinlock_t *lgr_lock;
	bool conns;

	smc_lgr_list_head(lgr, &lgr_lock);
	spin_lock_bh(lgr_lock);
	read_lock_bh(&lgr->conns_lock);
	conns = RB_EMPTY_ROOT(&lgr->conns_all);
	read_unlock_bh(&lgr->conns_lock);
	if (!conns) { /* number of lgr connections is no longer zero */
		spin_unlock_bh(lgr_lock);
		return;
	}
	list_del_init(&lgr->list); /* remove from smc_lgr_list */
	spin_unlock_bh(lgr_lock);

	if (!lgr->is_smcd && !lgr->terminating)	{
		struct smc_link *lnk = &lgr->lnk[SMC_SINGLE_LINK];

		/* try to send del link msg, on error free lgr immediately */
		if (lnk->state == SMC_LNK_ACTIVE &&
		    !smc_link_send_delete(lnk)) {
			/* reschedule in case we never receive a response */
			smc_lgr_schedule_free_work(lgr);
			return;
		}
	}

	if (!delayed_work_pending(&lgr->free_work)) {
		struct smc_link *lnk = &lgr->lnk[SMC_SINGLE_LINK];

		if (!lgr->is_smcd && lnk->state != SMC_LNK_INACTIVE)
			smc_llc_link_inactive(lnk);
		if (lgr->is_smcd)
			smc_ism_signal_shutdown(lgr);
		smc_lgr_free(lgr);
	}
}

/* create a new SMC link group */
static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini)
{
	struct smc_link_group *lgr;
	struct list_head *lgr_list;
	struct smc_link *lnk;
	spinlock_t *lgr_lock;
	u8 rndvec[3];
	int rc = 0;
	int i;

	if (ini->is_smcd && ini->vlan_id) {
		if (smc_ism_get_vlan(ini->ism_dev, ini->vlan_id)) {
			rc = SMC_CLC_DECL_ISMVLANERR;
			goto out;
		}
	}

	lgr = kzalloc(sizeof(*lgr), GFP_KERNEL);
	if (!lgr) {
		rc = SMC_CLC_DECL_MEM;
		goto ism_put_vlan;
	}
	lgr->is_smcd = ini->is_smcd;
	lgr->sync_err = 0;
	lgr->vlan_id = ini->vlan_id;
	rwlock_init(&lgr->sndbufs_lock);
	rwlock_init(&lgr->rmbs_lock);
	rwlock_init(&lgr->conns_lock);
	for (i = 0; i < SMC_RMBE_SIZES; i++) {
		INIT_LIST_HEAD(&lgr->sndbufs[i]);
		INIT_LIST_HEAD(&lgr->rmbs[i]);
	}
	smc_lgr_list.num += SMC_LGR_NUM_INCR;
	memcpy(&lgr->id, (u8 *)&smc_lgr_list.num, SMC_LGR_ID_SIZE);
	INIT_DELAYED_WORK(&lgr->free_work, smc_lgr_free_work);
	lgr->conns_all = RB_ROOT;
	if (ini->is_smcd) {
		/* SMC-D specific settings */
		get_device(&ini->ism_dev->dev);
		lgr->peer_gid = ini->ism_gid;
		lgr->smcd = ini->ism_dev;
		lgr_list = &ini->ism_dev->lgr_list;
		lgr_lock = &lgr->smcd->lgr_lock;
	} else {
		/* SMC-R specific settings */
		get_device(&ini->ib_dev->ibdev->dev);
		lgr->role = smc->listen_smc ? SMC_SERV : SMC_CLNT;
		memcpy(lgr->peer_systemid, ini->ib_lcl->id_for_peer,
		       SMC_SYSTEMID_LEN);

		lnk = &lgr->lnk[SMC_SINGLE_LINK];
		/* initialize link */
		lnk->state = SMC_LNK_ACTIVATING;
		lnk->link_id = SMC_SINGLE_LINK;
		lnk->smcibdev = ini->ib_dev;
		lnk->ibport = ini->ib_port;
		lgr_list = &smc_lgr_list.list;
		lgr_lock = &smc_lgr_list.lock;
		lnk->path_mtu =
			ini->ib_dev->pattr[ini->ib_port - 1].active_mtu;
		if (!ini->ib_dev->initialized)
			smc_ib_setup_per_ibdev(ini->ib_dev);
		get_random_bytes(rndvec, sizeof(rndvec));
		lnk->psn_initial = rndvec[0] + (rndvec[1] << 8) +
			(rndvec[2] << 16);
		rc = smc_ib_determine_gid(lnk->smcibdev, lnk->ibport,
					  ini->vlan_id, lnk->gid,
					  &lnk->sgid_index);
		if (rc)
			goto free_lgr;
		rc = smc_llc_link_init(lnk);
		if (rc)
			goto free_lgr;
		rc = smc_wr_alloc_link_mem(lnk);
		if (rc)
			goto clear_llc_lnk;
		rc = smc_ib_create_protection_domain(lnk);
		if (rc)
			goto free_link_mem;
		rc = smc_ib_create_queue_pair(lnk);
		if (rc)
			goto dealloc_pd;
		rc = smc_wr_create_link(lnk);
		if (rc)
			goto destroy_qp;
	}
	smc->conn.lgr = lgr;
	spin_lock_bh(lgr_lock);
	list_add(&lgr->list, lgr_list);
	spin_unlock_bh(lgr_lock);
	return 0;

destroy_qp:
	smc_ib_destroy_queue_pair(lnk);
dealloc_pd:
	smc_ib_dealloc_protection_domain(lnk);
free_link_mem:
	smc_wr_free_link_mem(lnk);
clear_llc_lnk:
	smc_llc_link_clear(lnk);
free_lgr:
	kfree(lgr);
ism_put_vlan:
	if (ini->is_smcd && ini->vlan_id)
		smc_ism_put_vlan(ini->ism_dev, ini->vlan_id);
out:
	if (rc < 0) {
		if (rc == -ENOMEM)
			rc = SMC_CLC_DECL_MEM;
		else
			rc = SMC_CLC_DECL_INTERR;
	}
	return rc;
}

static void smc_buf_unuse(struct smc_connection *conn,
			  struct smc_link_group *lgr)
{
	if (conn->sndbuf_desc)
		conn->sndbuf_desc->used = 0;
	if (conn->rmb_desc) {
		if (!conn->rmb_desc->regerr) {
			if (!lgr->is_smcd) {
				/* unregister rmb with peer */
				smc_llc_do_delete_rkey(
						&lgr->lnk[SMC_SINGLE_LINK],
						conn->rmb_desc);
			}
			conn->rmb_desc->used = 0;
		} else {
			/* buf registration failed, reuse not possible */
			write_lock_bh(&lgr->rmbs_lock);
			list_del(&conn->rmb_desc->list);
			write_unlock_bh(&lgr->rmbs_lock);

			smc_buf_free(lgr, true, conn->rmb_desc);
		}
	}
}

/* remove a finished connection from its link group */
void smc_conn_free(struct smc_connection *conn)
{
	struct smc_link_group *lgr = conn->lgr;

	if (!lgr)
		return;
	if (lgr->is_smcd) {
		smc_ism_unset_conn(conn);
		tasklet_kill(&conn->rx_tsklet);
	} else {
		smc_cdc_tx_dismiss_slots(conn);
	}
	smc_lgr_unregister_conn(conn);
	smc_buf_unuse(conn, lgr);		/* allow buffer reuse */
	conn->lgr = NULL;

	if (!lgr->conns_num)
		smc_lgr_schedule_free_work(lgr);
}

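/* release all LLC, work request and RDMA resources of a single link */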
static void smc_link_clear(struct smc_link *lnk)
{
	lnk->peer_qpn = 0;
	smc_llc_link_clear(lnk);
	smc_ib_modify_qp_reset(lnk);
	smc_wr_free_link(lnk);
	smc_ib_destroy_queue_pair(lnk);
	smc_ib_dealloc_protection_domain(lnk);
	smc_wr_free_link_mem(lnk);
}

static void smcr_buf_free(struct smc_link_group *lgr, bool is_rmb,
			  struct smc_buf_desc *buf_desc)
{
	struct smc_link *lnk = &lgr->lnk[SMC_SINGLE_LINK];

	if (is_rmb) {
		if (buf_desc->mr_rx[SMC_SINGLE_LINK])
			smc_ib_put_memory_region(
					buf_desc->mr_rx[SMC_SINGLE_LINK]);
		smc_ib_buf_unmap_sg(lnk->smcibdev, buf_desc,
				    DMA_FROM_DEVICE);
	} else {
		smc_ib_buf_unmap_sg(lnk->smcibdev, buf_desc,
				    DMA_TO_DEVICE);
	}
	sg_free_table(&buf_desc->sgt[SMC_SINGLE_LINK]);
	if (buf_desc->pages)
		__free_pages(buf_desc->pages, buf_desc->order);
	kfree(buf_desc);
}

static void smcd_buf_free(struct smc_link_group *lgr, bool is_dmb,
			  struct smc_buf_desc *buf_desc)
{
	if (is_dmb) {
		/* restore original buf len */
		buf_desc->len += sizeof(struct smcd_cdc_msg);
		smc_ism_unregister_dmb(lgr->smcd, buf_desc);
	} else {
		kfree(buf_desc->cpu_addr);
	}
	kfree(buf_desc);
}

static void smc_buf_free(struct smc_link_group *lgr, bool is_rmb,
			 struct smc_buf_desc *buf_desc)
{
	if (lgr->is_smcd)
		smcd_buf_free(lgr, is_rmb, buf_desc);
	else
		smcr_buf_free(lgr, is_rmb, buf_desc);
}

static void __smc_lgr_free_bufs(struct smc_link_group *lgr, bool is_rmb)
{
	struct smc_buf_desc *buf_desc, *bf_desc;
	struct list_head *buf_list;
	int i;

	for (i = 0; i < SMC_RMBE_SIZES; i++) {
		if (is_rmb)
			buf_list = &lgr->rmbs[i];
		else
			buf_list = &lgr->sndbufs[i];
		list_for_each_entry_safe(buf_desc, bf_desc, buf_list,
					 list) {
			list_del(&buf_desc->list);
			smc_buf_free(lgr, is_rmb, buf_desc);
		}
	}
}

static void smc_lgr_free_bufs(struct smc_link_group *lgr)
{
	/* free send buffers */
	__smc_lgr_free_bufs(lgr, false);
	/* free rmbs */
	__smc_lgr_free_bufs(lgr, true);
}

/* remove a link group */
static void smc_lgr_free(struct smc_link_group *lgr)
{
	smc_lgr_free_bufs(lgr);
	if (lgr->is_smcd) {
		smc_ism_put_vlan(lgr->smcd, lgr->vlan_id);
		put_device(&lgr->smcd->dev);
	} else {
		smc_link_clear(&lgr->lnk[SMC_SINGLE_LINK]);
		put_device(&lgr->lnk[SMC_SINGLE_LINK].smcibdev->ibdev->dev);
	}
	kfree(lgr);
}

void smc_lgr_forget(struct smc_link_group *lgr)
{
	struct list_head *lgr_list;
	spinlock_t *lgr_lock;

	lgr_list = smc_lgr_list_head(lgr, &lgr_lock);
	spin_lock_bh(lgr_lock);
	/* do not use this link group for new connections */
	if (!list_empty(lgr_list))
		list_del_init(lgr_list);
	spin_unlock_bh(lgr_lock);
}

/* terminate link group */
static void __smc_lgr_terminate(struct smc_link_group *lgr)
{
	struct smc_connection *conn;
	struct smc_sock *smc;
	struct rb_node *node;

	if (lgr->terminating)
		return;	/* lgr already terminating */
	lgr->terminating = 1;
	if (!lgr->is_smcd)
		smc_llc_link_inactive(&lgr->lnk[SMC_SINGLE_LINK]);

	write_lock_bh(&lgr->conns_lock);
	node = rb_first(&lgr->conns_all);
	while (node) {
		conn = rb_entry(node, struct smc_connection, alert_node);
		smc = container_of(conn, struct smc_sock, conn);
		sock_hold(&smc->sk); /* sock_put in close work */
		conn->killed = 1;
		conn->local_tx_ctrl.conn_state_flags.peer_conn_abort = 1;
		__smc_lgr_unregister_conn(conn);
		conn->lgr = NULL;
		write_unlock_bh(&lgr->conns_lock);
		if (!schedule_work(&conn->close_work))
			sock_put(&smc->sk);
		write_lock_bh(&lgr->conns_lock);
		node = rb_first(&lgr->conns_all);
	}
	write_unlock_bh(&lgr->conns_lock);
	if (!lgr->is_smcd)
		wake_up(&lgr->lnk[SMC_SINGLE_LINK].wr_reg_wait);
	smc_lgr_schedule_free_work(lgr);
}

/* unlink and terminate link group */
void smc_lgr_terminate(struct smc_link_group *lgr)
{
	spinlock_t *lgr_lock;

	smc_lgr_list_head(lgr, &lgr_lock);
	spin_lock_bh(lgr_lock);
	if (lgr->terminating) {
		spin_unlock_bh(lgr_lock);
		return;	/* lgr already terminating */
	}
	list_del_init(&lgr->list);
	spin_unlock_bh(lgr_lock);
	__smc_lgr_terminate(lgr);
}

/* Called when IB port is terminated */
void smc_port_terminate(struct smc_ib_device *smcibdev, u8 ibport)
{
	struct smc_link_group *lgr, *l;
	LIST_HEAD(lgr_free_list);

	spin_lock_bh(&smc_lgr_list.lock);
	list_for_each_entry_safe(lgr, l, &smc_lgr_list.list, list) {
		if (!lgr->is_smcd &&
		    lgr->lnk[SMC_SINGLE_LINK].smcibdev == smcibdev &&
		    lgr->lnk[SMC_SINGLE_LINK].ibport == ibport)
			list_move(&lgr->list, &lgr_free_list);
	}
	spin_unlock_bh(&smc_lgr_list.lock);

	list_for_each_entry_safe(lgr, l, &lgr_free_list, list) {
		list_del_init(&lgr->list);
		__smc_lgr_terminate(lgr);
	}
}

/* Called when SMC-D device is terminated or peer is lost */
void smc_smcd_terminate(struct smcd_dev *dev, u64 peer_gid, unsigned short vlan)
{
	struct smc_link_group *lgr, *l;
	LIST_HEAD(lgr_free_list);

	/* run common cleanup function and build free list */
	spin_lock_bh(&dev->lgr_lock);
	list_for_each_entry_safe(lgr, l, &dev->lgr_list, list) {
		if ((!peer_gid || lgr->peer_gid == peer_gid) &&
		    (vlan == VLAN_VID_MASK || lgr->vlan_id == vlan)) {
			list_move(&lgr->list, &lgr_free_list);
		}
	}
	spin_unlock_bh(&dev->lgr_lock);

	/* cancel the regular free workers and actually free lgrs */
	list_for_each_entry_safe(lgr, l, &lgr_free_list, list) {
		list_del_init(&lgr->list);
		__smc_lgr_terminate(lgr);
		cancel_delayed_work_sync(&lgr->free_work);
		if (!peer_gid && vlan == VLAN_VID_MASK) /* dev terminated? */
			smc_ism_signal_shutdown(lgr);
		smc_lgr_free(lgr);
	}
}

/* Determine vlan of internal TCP socket.
 * The determined vlan id is stored in ini->vlan_id.
 */
int smc_vlan_by_tcpsk(struct socket *clcsock, struct smc_init_info *ini)
{
	struct dst_entry *dst = sk_dst_get(clcsock->sk);
	struct net_device *ndev;
	int i, nest_lvl, rc = 0;

	ini->vlan_id = 0;
	if (!dst) {
		rc = -ENOTCONN;
		goto out;
	}
	if (!dst->dev) {
		rc = -ENODEV;
		goto out_rel;
	}

	ndev = dst->dev;
	if (is_vlan_dev(ndev)) {
		ini->vlan_id = vlan_dev_vlan_id(ndev);
		goto out_rel;
	}

	rtnl_lock();
	nest_lvl = dev_get_nest_level(ndev);
	for (i = 0; i < nest_lvl; i++) {
		struct list_head *lower = &ndev->adj_list.lower;

		if (list_empty(lower))
			break;
		lower = lower->next;
		ndev = (struct net_device *)netdev_lower_get_next(ndev, &lower);
		if (is_vlan_dev(ndev)) {
			ini->vlan_id = vlan_dev_vlan_id(ndev);
			break;
		}
	}
	rtnl_unlock();

out_rel:
	dst_release(dst);
out:
	return rc;
}

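/* check whether an existing SMC-R link group can be reused for a peer:
 * peer system id, gid and mac of the single link must match; a client
 * additionally only reuses the link group tied to the negotiated QP (clcqpn)
 */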
static bool smcr_lgr_match(struct smc_link_group *lgr,
			   struct smc_clc_msg_local *lcl,
			   enum smc_lgr_role role, u32 clcqpn)
{
	return !memcmp(lgr->peer_systemid, lcl->id_for_peer,
		       SMC_SYSTEMID_LEN) &&
		!memcmp(lgr->lnk[SMC_SINGLE_LINK].peer_gid, &lcl->gid,
			SMC_GID_SIZE) &&
		!memcmp(lgr->lnk[SMC_SINGLE_LINK].peer_mac, lcl->mac,
			sizeof(lcl->mac)) &&
		lgr->role == role &&
		(lgr->role == SMC_SERV ||
		 lgr->lnk[SMC_SINGLE_LINK].peer_qpn == clcqpn);
}

static bool smcd_lgr_match(struct smc_link_group *lgr,
			   struct smcd_dev *smcismdev, u64 peer_gid)
{
	return lgr->peer_gid == peer_gid && lgr->smcd == smcismdev;
}

/* create a new SMC connection (and a new link group if necessary) */
int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini)
{
	struct smc_connection *conn = &smc->conn;
	struct list_head *lgr_list;
	struct smc_link_group *lgr;
	enum smc_lgr_role role;
	spinlock_t *lgr_lock;
	int rc = 0;

	lgr_list = ini->is_smcd ? &ini->ism_dev->lgr_list : &smc_lgr_list.list;
	lgr_lock = ini->is_smcd ? &ini->ism_dev->lgr_lock : &smc_lgr_list.lock;
	ini->cln_first_contact = SMC_FIRST_CONTACT;
	role = smc->listen_smc ? SMC_SERV : SMC_CLNT;
	if (role == SMC_CLNT && ini->srv_first_contact)
		/* create new link group as well */
		goto create;

	/* determine if an existing link group can be reused */
	spin_lock_bh(lgr_lock);
	list_for_each_entry(lgr, lgr_list, list) {
		write_lock_bh(&lgr->conns_lock);
		if ((ini->is_smcd ?
		     smcd_lgr_match(lgr, ini->ism_dev, ini->ism_gid) :
		     smcr_lgr_match(lgr, ini->ib_lcl, role, ini->ib_clcqpn)) &&
		    !lgr->sync_err &&
		    lgr->vlan_id == ini->vlan_id &&
		    (role == SMC_CLNT ||
		     lgr->conns_num < SMC_RMBS_PER_LGR_MAX)) {
			/* link group found */
			ini->cln_first_contact = SMC_REUSE_CONTACT;
			conn->lgr = lgr;
			smc_lgr_register_conn(conn); /* add smc conn to lgr */
			if (delayed_work_pending(&lgr->free_work))
				cancel_delayed_work(&lgr->free_work);
			write_unlock_bh(&lgr->conns_lock);
			break;
		}
		write_unlock_bh(&lgr->conns_lock);
	}
	spin_unlock_bh(lgr_lock);

	if (role == SMC_CLNT && !ini->srv_first_contact &&
	    ini->cln_first_contact == SMC_FIRST_CONTACT) {
		/* Server reuses a link group, but Client wants to start
		 * a new one: send an out_of_sync decline, reason
		 * synchronization error
		 */
		return SMC_CLC_DECL_SYNCERR;
	}

create:
	if (ini->cln_first_contact == SMC_FIRST_CONTACT) {
		rc = smc_lgr_create(smc, ini);
		if (rc)
			goto out;
		lgr = conn->lgr;
		write_lock_bh(&lgr->conns_lock);
		smc_lgr_register_conn(conn); /* add smc conn to lgr */
		write_unlock_bh(&lgr->conns_lock);
	}
	conn->local_tx_ctrl.common.type = SMC_CDC_MSG_TYPE;
	conn->local_tx_ctrl.len = SMC_WR_TX_SIZE;
	conn->urg_state = SMC_URG_READ;
	if (ini->is_smcd) {
		conn->rx_off = sizeof(struct smcd_cdc_msg);
		smcd_cdc_rx_init(conn); /* init tasklet for this conn */
	}
#ifndef KERNEL_HAS_ATOMIC64
	spin_lock_init(&conn->acurs_lock);
#endif

out:
	return rc;
}

/* convert the RMB size into the compressed notation - minimum 16K.
 * In contrast to plain ilog2, this rounds towards the next power of 2,
 * so the socket application gets at least its desired sndbuf / rcvbuf size.
 */
static u8 smc_compress_bufsize(int size)
{
	u8 compressed;

	if (size <= SMC_BUF_MIN_SIZE)
		return 0;

	size = (size - 1) >> 14;
	compressed = ilog2(size) + 1;
	if (compressed >= SMC_RMBE_SIZES)
		compressed = SMC_RMBE_SIZES - 1;
	return compressed;
}

/* convert the RMB size from compressed notation into integer */
int smc_uncompress_bufsize(u8 compressed)
{
	u32 size;

	size = 0x00000001 << (((int)compressed) + 14);
	return (int)size;
}
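
/* Illustrative example (assuming SMC_BUF_MIN_SIZE is 16KB): a requested
 * size of 17000 bytes compresses to 1 and uncompresses back to 32768, so
 * the socket never gets less buffer space than it asked for; values beyond
 * the largest supported size are capped at SMC_RMBE_SIZES - 1.
 */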

/* try to reuse a sndbuf or rmb description slot for a certain
 * buffer size; if not available, return NULL
 */
static struct smc_buf_desc *smc_buf_get_slot(int compressed_bufsize,
					     rwlock_t *lock,
					     struct list_head *buf_list)
{
	struct smc_buf_desc *buf_slot;

	read_lock_bh(lock);
	list_for_each_entry(buf_slot, buf_list, list) {
		if (cmpxchg(&buf_slot->used, 0, 1) == 0) {
			read_unlock_bh(lock);
			return buf_slot;
		}
	}
	read_unlock_bh(lock);
	return NULL;
}

/* one of the conditions for announcing a receiver's current window size is
 * that it "results in a minimum increase in the window size of 10% of the
 * receive buffer space" [RFC7609]
 */
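/* e.g. for a 64KB RMB the 10% rule yields 6553 bytes; the result is further
 * capped at SOCK_MIN_SNDBUF / 2 (illustrative figures only)
 */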
static inline int smc_rmb_wnd_update_limit(int rmbe_size)
{
	return min_t(int, rmbe_size / 10, SOCK_MIN_SNDBUF / 2);
}

static struct smc_buf_desc *smcr_new_buf_create(struct smc_link_group *lgr,
						bool is_rmb, int bufsize)
{
	struct smc_buf_desc *buf_desc;
	struct smc_link *lnk;
	int rc;

	/* try to alloc a new buffer */
	buf_desc = kzalloc(sizeof(*buf_desc), GFP_KERNEL);
	if (!buf_desc)
		return ERR_PTR(-ENOMEM);

	buf_desc->order = get_order(bufsize);
	buf_desc->pages = alloc_pages(GFP_KERNEL | __GFP_NOWARN |
				      __GFP_NOMEMALLOC | __GFP_COMP |
				      __GFP_NORETRY | __GFP_ZERO,
				      buf_desc->order);
	if (!buf_desc->pages) {
		kfree(buf_desc);
		return ERR_PTR(-EAGAIN);
	}
	buf_desc->cpu_addr = (void *)page_address(buf_desc->pages);

	/* build the sg table from the pages */
	lnk = &lgr->lnk[SMC_SINGLE_LINK];
	rc = sg_alloc_table(&buf_desc->sgt[SMC_SINGLE_LINK], 1,
			    GFP_KERNEL);
	if (rc) {
		smc_buf_free(lgr, is_rmb, buf_desc);
		return ERR_PTR(rc);
	}
	sg_set_buf(buf_desc->sgt[SMC_SINGLE_LINK].sgl,
		   buf_desc->cpu_addr, bufsize);

	/* map sg table to DMA address */
	rc = smc_ib_buf_map_sg(lnk->smcibdev, buf_desc,
			       is_rmb ? DMA_FROM_DEVICE : DMA_TO_DEVICE);
	/* SMC protocol depends on mapping to one DMA address only */
	if (rc != 1)  {
		smc_buf_free(lgr, is_rmb, buf_desc);
		return ERR_PTR(-EAGAIN);
	}

	/* create a new memory region for the RMB */
	if (is_rmb) {
		rc = smc_ib_get_memory_region(lnk->roce_pd,
					      IB_ACCESS_REMOTE_WRITE |
					      IB_ACCESS_LOCAL_WRITE,
					      buf_desc);
		if (rc) {
			smc_buf_free(lgr, is_rmb, buf_desc);
			return ERR_PTR(rc);
		}
	}

	buf_desc->len = bufsize;
	return buf_desc;
}

#define SMCD_DMBE_SIZES		7 /* 0 -> 16KB, 1 -> 32KB, .. 6 -> 1MB */

static struct smc_buf_desc *smcd_new_buf_create(struct smc_link_group *lgr,
						bool is_dmb, int bufsize)
{
	struct smc_buf_desc *buf_desc;
	int rc;

	if (smc_compress_bufsize(bufsize) > SMCD_DMBE_SIZES)
		return ERR_PTR(-EAGAIN);

	/* try to alloc a new DMB */
	buf_desc = kzalloc(sizeof(*buf_desc), GFP_KERNEL);
	if (!buf_desc)
		return ERR_PTR(-ENOMEM);
	if (is_dmb) {
		rc = smc_ism_register_dmb(lgr, bufsize, buf_desc);
		if (rc) {
			kfree(buf_desc);
			return ERR_PTR(-EAGAIN);
		}
		buf_desc->pages = virt_to_page(buf_desc->cpu_addr);
		/* CDC header stored in buf. So, pretend it was smaller */
		buf_desc->len = bufsize - sizeof(struct smcd_cdc_msg);
	} else {
		buf_desc->cpu_addr = kzalloc(bufsize, GFP_KERNEL |
					     __GFP_NOWARN | __GFP_NORETRY |
					     __GFP_NOMEMALLOC);
		if (!buf_desc->cpu_addr) {
			kfree(buf_desc);
			return ERR_PTR(-EAGAIN);
		}
		buf_desc->len = bufsize;
	}
	return buf_desc;
}

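/* allocate (or reuse) a send buffer or RMB for a connection: start from the
 * socket buffer size and work down through the compressed sizes until a
 * slot can be reused or a new buffer can be allocated
 */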
static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb)
{
	struct smc_buf_desc *buf_desc = ERR_PTR(-ENOMEM);
	struct smc_connection *conn = &smc->conn;
	struct smc_link_group *lgr = conn->lgr;
	struct list_head *buf_list;
	int bufsize, bufsize_short;
	int sk_buf_size;
	rwlock_t *lock;

	if (is_rmb)
		/* use socket recv buffer size (w/o overhead) as start value */
		sk_buf_size = smc->sk.sk_rcvbuf / 2;
	else
		/* use socket send buffer size (w/o overhead) as start value */
		sk_buf_size = smc->sk.sk_sndbuf / 2;

	for (bufsize_short = smc_compress_bufsize(sk_buf_size);
	     bufsize_short >= 0; bufsize_short--) {

		if (is_rmb) {
			lock = &lgr->rmbs_lock;
			buf_list = &lgr->rmbs[bufsize_short];
		} else {
			lock = &lgr->sndbufs_lock;
			buf_list = &lgr->sndbufs[bufsize_short];
		}
		bufsize = smc_uncompress_bufsize(bufsize_short);
		if ((1 << get_order(bufsize)) > SG_MAX_SINGLE_ALLOC)
			continue;

		/* check for reusable slot in the link group */
		buf_desc = smc_buf_get_slot(bufsize_short, lock, buf_list);
		if (buf_desc) {
			memset(buf_desc->cpu_addr, 0, bufsize);
			break; /* found reusable slot */
		}

		if (is_smcd)
			buf_desc = smcd_new_buf_create(lgr, is_rmb, bufsize);
		else
			buf_desc = smcr_new_buf_create(lgr, is_rmb, bufsize);

		if (PTR_ERR(buf_desc) == -ENOMEM)
			break;
		if (IS_ERR(buf_desc))
			continue;

		buf_desc->used = 1;
		write_lock_bh(lock);
		list_add(&buf_desc->list, buf_list);
		write_unlock_bh(lock);
		break; /* found */
	}

	if (IS_ERR(buf_desc))
		return -ENOMEM;

	if (is_rmb) {
		conn->rmb_desc = buf_desc;
		conn->rmbe_size_short = bufsize_short;
		smc->sk.sk_rcvbuf = bufsize * 2;
		atomic_set(&conn->bytes_to_rcv, 0);
		conn->rmbe_update_limit =
			smc_rmb_wnd_update_limit(buf_desc->len);
		if (is_smcd)
			smc_ism_set_conn(conn); /* map RMB/smcd_dev to conn */
	} else {
		conn->sndbuf_desc = buf_desc;
		smc->sk.sk_sndbuf = bufsize * 2;
		atomic_set(&conn->sndbuf_space, bufsize);
	}
	return 0;
}

void smc_sndbuf_sync_sg_for_cpu(struct smc_connection *conn)
{
	struct smc_link_group *lgr = conn->lgr;

	if (!conn->lgr || conn->lgr->is_smcd)
		return;
	smc_ib_sync_sg_for_cpu(lgr->lnk[SMC_SINGLE_LINK].smcibdev,
			       conn->sndbuf_desc, DMA_TO_DEVICE);
}

void smc_sndbuf_sync_sg_for_device(struct smc_connection *conn)
{
	struct smc_link_group *lgr = conn->lgr;

	if (!conn->lgr || conn->lgr->is_smcd)
		return;
	smc_ib_sync_sg_for_device(lgr->lnk[SMC_SINGLE_LINK].smcibdev,
				  conn->sndbuf_desc, DMA_TO_DEVICE);
}

void smc_rmb_sync_sg_for_cpu(struct smc_connection *conn)
{
	struct smc_link_group *lgr = conn->lgr;

	if (!conn->lgr || conn->lgr->is_smcd)
		return;
	smc_ib_sync_sg_for_cpu(lgr->lnk[SMC_SINGLE_LINK].smcibdev,
			       conn->rmb_desc, DMA_FROM_DEVICE);
}

void smc_rmb_sync_sg_for_device(struct smc_connection *conn)
{
	struct smc_link_group *lgr = conn->lgr;

	if (!conn->lgr || conn->lgr->is_smcd)
		return;
	smc_ib_sync_sg_for_device(lgr->lnk[SMC_SINGLE_LINK].smcibdev,
				  conn->rmb_desc, DMA_FROM_DEVICE);
}

/* create the send and receive buffer for an SMC socket;
 * receive buffers are called RMBs;
 * (even though the SMC protocol allows more than one RMB-element per RMB,
 * the Linux implementation uses just one RMB-element per RMB, i.e. it uses
 * an extra RMB for every connection in a link group)
 */
int smc_buf_create(struct smc_sock *smc, bool is_smcd)
{
	int rc;

	/* create send buffer */
	rc = __smc_buf_create(smc, is_smcd, false);
	if (rc)
		return rc;
	/* create rmb */
	rc = __smc_buf_create(smc, is_smcd, true);
	if (rc)
		smc_buf_free(smc->conn.lgr, false, smc->conn.sndbuf_desc);
	return rc;
}

static inline int smc_rmb_reserve_rtoken_idx(struct smc_link_group *lgr)
{
	int i;

	for_each_clear_bit(i, lgr->rtokens_used_mask, SMC_RMBS_PER_LGR_MAX) {
		if (!test_and_set_bit(i, lgr->rtokens_used_mask))
			return i;
	}
	return -ENOSPC;
}

/* add a new rtoken from peer */
int smc_rtoken_add(struct smc_link_group *lgr, __be64 nw_vaddr, __be32 nw_rkey)
{
	u64 dma_addr = be64_to_cpu(nw_vaddr);
	u32 rkey = ntohl(nw_rkey);
	int i;

	for (i = 0; i < SMC_RMBS_PER_LGR_MAX; i++) {
		if ((lgr->rtokens[i][SMC_SINGLE_LINK].rkey == rkey) &&
		    (lgr->rtokens[i][SMC_SINGLE_LINK].dma_addr == dma_addr) &&
		    test_bit(i, lgr->rtokens_used_mask)) {
			/* already in list */
			return i;
		}
	}
	i = smc_rmb_reserve_rtoken_idx(lgr);
	if (i < 0)
		return i;
	lgr->rtokens[i][SMC_SINGLE_LINK].rkey = rkey;
	lgr->rtokens[i][SMC_SINGLE_LINK].dma_addr = dma_addr;
	return i;
}

/* delete an rtoken */
int smc_rtoken_delete(struct smc_link_group *lgr, __be32 nw_rkey)
{
	u32 rkey = ntohl(nw_rkey);
	int i;

	for (i = 0; i < SMC_RMBS_PER_LGR_MAX; i++) {
		if (lgr->rtokens[i][SMC_SINGLE_LINK].rkey == rkey &&
		    test_bit(i, lgr->rtokens_used_mask)) {
			lgr->rtokens[i][SMC_SINGLE_LINK].rkey = 0;
			lgr->rtokens[i][SMC_SINGLE_LINK].dma_addr = 0;

			clear_bit(i, lgr->rtokens_used_mask);
			return 0;
		}
	}
	return -ENOENT;
}

/* save rkey and dma_addr received from peer during clc handshake */
int smc_rmb_rtoken_handling(struct smc_connection *conn,
			    struct smc_clc_msg_accept_confirm *clc)
{
	conn->rtoken_idx = smc_rtoken_add(conn->lgr, clc->rmb_dma_addr,
					  clc->rmb_rkey);
	if (conn->rtoken_idx < 0)
		return conn->rtoken_idx;
	return 0;
}

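/* mark all SMC-R ports and SMC-D devices as going away so they are no
 * longer picked for new link groups or connections
 */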
static void smc_core_going_away(void)
{
	struct smc_ib_device *smcibdev;
	struct smcd_dev *smcd;

	spin_lock(&smc_ib_devices.lock);
	list_for_each_entry(smcibdev, &smc_ib_devices.list, list) {
		int i;

		for (i = 0; i < SMC_MAX_PORTS; i++)
			set_bit(i, smcibdev->ports_going_away);
	}
	spin_unlock(&smc_ib_devices.lock);

	spin_lock(&smcd_dev_list.lock);
	list_for_each_entry(smcd, &smcd_dev_list.list, list) {
		smcd->going_away = 1;
	}
	spin_unlock(&smcd_dev_list.lock);
}

/* Called (from smc_exit) when module is removed */
void smc_core_exit(void)
{
	struct smc_link_group *lgr, *lg;
	LIST_HEAD(lgr_freeing_list);
	struct smcd_dev *smcd;

	smc_core_going_away();

	spin_lock_bh(&smc_lgr_list.lock);
	list_splice_init(&smc_lgr_list.list, &lgr_freeing_list);
	spin_unlock_bh(&smc_lgr_list.lock);

	spin_lock(&smcd_dev_list.lock);
	list_for_each_entry(smcd, &smcd_dev_list.list, list)
		list_splice_init(&smcd->lgr_list, &lgr_freeing_list);
	spin_unlock(&smcd_dev_list.lock);

	list_for_each_entry_safe(lgr, lg, &lgr_freeing_list, list) {
		list_del_init(&lgr->list);
		if (!lgr->is_smcd) {
			struct smc_link *lnk = &lgr->lnk[SMC_SINGLE_LINK];

			if (lnk->state == SMC_LNK_ACTIVE)
				smc_llc_send_delete_link(lnk, SMC_LLC_REQ,
							 false);
			smc_llc_link_inactive(lnk);
		}
		cancel_delayed_work_sync(&lgr->free_work);
		if (lgr->is_smcd)
			smc_ism_signal_shutdown(lgr);
		smc_lgr_free(lgr); /* free link group */
	}
}