smc_core.h 16.0 KB
Newer Older
1
/* SPDX-License-Identifier: GPL-2.0 */
2 3 4 5 6 7 8 9 10 11 12 13 14
/*
 * Shared Memory Communications over RDMA (SMC-R) and RoCE
 *
 *  Definitions for SMC Connections, Link Groups and Links
 *
 *  Copyright IBM Corp. 2016
 *
 *  Author(s):  Ursula Braun <ubraun@linux.vnet.ibm.com>
 */

#ifndef _SMC_CORE_H
#define _SMC_CORE_H

15
#include <linux/atomic.h>
16 17
#include <linux/smc.h>
#include <linux/pci.h>
18
#include <rdma/ib_verbs.h>
19
#include <net/genetlink.h>
20 21 22 23

#include "smc.h"
#include "smc_ib.h"

U
Ursula Braun 已提交
24 25
#define SMC_RMBS_PER_LGR_MAX	255	/* max. # of RMBs per link group */

26 27 28
struct smc_lgr_list {			/* list of link group definition */
	struct list_head	list;
	spinlock_t		lock;	/* protects list of link groups */
29
	u32			num;	/* unique link group number */
30 31 32 33 34 35 36
};

/* Role of the local side within a link group; stored in the SMC-R part of
 * struct smc_link_group.
 */
enum smc_lgr_role {		/* possible roles of a link group */
	SMC_CLNT,	/* client */
	SMC_SERV	/* server */
};

37
/* Life-cycle state of a single link, kept in smc_link.state.
 * smc_link_usable() and smc_link_active() test against these values.
 */
enum smc_link_state {			/* possible states of a link */
	SMC_LNK_UNUSED,		/* link is unused */
	SMC_LNK_INACTIVE,	/* link is inactive */
	SMC_LNK_ACTIVATING,	/* link is being activated */
	SMC_LNK_ACTIVE,		/* link is active */
};

44 45 46 47 48 49
#define SMC_WR_BUF_SIZE		48	/* size of work request buffer */

/* Raw payload buffer of one work request. struct smc_link keeps vectors of
 * these for the send side (wr_tx_bufs) and the receive side (wr_rx_bufs).
 */
struct smc_wr_buf {
	u8	raw[SMC_WR_BUF_SIZE];	/* payload bytes */
};

50 51 52 53 54 55 56 57
#define SMC_WR_REG_MR_WAIT_TIME	(5 * HZ)/* wait time for ib_wr_reg_mr result */

/* Progress of a memory region registration request; tracked per link in
 * smc_link.wr_reg_state.
 */
enum smc_wr_reg_state {
	POSTED,		/* ib_wr_reg_mr request posted */
	CONFIRMED,	/* ib_wr_reg_mr response: successful */
	FAILED		/* ib_wr_reg_mr response: failure */
};

58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75
/* Scatter/gather entries available to one RDMA write: a fixed array of
 * SMC_IB_MAX_SEND_SGE struct ib_sge elements.
 */
struct smc_rdma_sge {				/* sges for RDMA writes */
	struct ib_sge		wr_tx_rdma_sge[SMC_IB_MAX_SEND_SGE];
};

#define SMC_MAX_RDMA_WRITES	2		/* max. # of RDMA writes per
						 * message send
						 */

/* One struct smc_rdma_sge for each of the SMC_MAX_RDMA_WRITES RDMA writes
 * that a single message send may need.
 */
struct smc_rdma_sges {				/* sges per message send */
	struct smc_rdma_sge	tx_rdma_sge[SMC_MAX_RDMA_WRITES];
};

/* RDMA work requests belonging to one message send; one struct ib_rdma_wr
 * per possible RDMA write (SMC_MAX_RDMA_WRITES).
 */
struct smc_rdma_wr {				/* work requests per message
						 * send
						 */
	struct ib_rdma_wr	wr_tx_rdma[SMC_MAX_RDMA_WRITES];
};

76 77
#define SMC_LGR_ID_SIZE		4

78 79 80
struct smc_link {
	struct smc_ib_device	*smcibdev;	/* ib-device */
	u8			ibport;		/* port - values 1 | 2 */
81 82 83
	struct ib_pd		*roce_pd;	/* IB protection domain,
						 * unique for every RoCE QP
						 */
84 85
	struct ib_qp		*roce_qp;	/* IB queue pair */
	struct ib_qp_attr	qp_attr;	/* IB queue pair attributes */
86 87 88 89

	struct smc_wr_buf	*wr_tx_bufs;	/* WR send payload buffers */
	struct ib_send_wr	*wr_tx_ibs;	/* WR send meta data */
	struct ib_sge		*wr_tx_sges;	/* WR send gather meta data */
90 91
	struct smc_rdma_sges	*wr_tx_rdma_sges;/*RDMA WRITE gather meta data*/
	struct smc_rdma_wr	*wr_tx_rdmas;	/* WR RDMA WRITE */
92
	struct smc_wr_tx_pend	*wr_tx_pends;	/* WR send waiting for CQE */
93
	struct completion	*wr_tx_compl;	/* WR send CQE completion */
94 95 96 97 98 99
	/* above four vectors have wr_tx_cnt elements and use the same index */
	dma_addr_t		wr_tx_dma_addr;	/* DMA address of wr_tx_bufs */
	atomic_long_t		wr_tx_id;	/* seq # of last sent WR */
	unsigned long		*wr_tx_mask;	/* bit mask of used indexes */
	u32			wr_tx_cnt;	/* number of WR send buffers */
	wait_queue_head_t	wr_tx_wait;	/* wait for free WR send buf */
100
	atomic_t		wr_tx_refcnt;	/* tx refs to link */
101 102 103 104 105 106 107 108

	struct smc_wr_buf	*wr_rx_bufs;	/* WR recv payload buffers */
	struct ib_recv_wr	*wr_rx_ibs;	/* WR recv meta data */
	struct ib_sge		*wr_rx_sges;	/* WR recv scatter meta data */
	/* above three vectors have wr_rx_cnt elements and use the same index */
	dma_addr_t		wr_rx_dma_addr;	/* DMA address of wr_rx_bufs */
	u64			wr_rx_id;	/* seq # of last recv WR */
	u32			wr_rx_cnt;	/* number of WR recv buffers */
109
	unsigned long		wr_rx_tstamp;	/* jiffies when last buf rx */
110

111 112
	struct ib_reg_wr	wr_reg;		/* WR register memory region */
	wait_queue_head_t	wr_reg_wait;	/* wait for wr_reg result */
113
	atomic_t		wr_reg_refcnt;	/* reg refs to link */
114 115
	enum smc_wr_reg_state	wr_reg_state;	/* state of wr_reg request */

116 117
	u8			gid[SMC_GID_SIZE];/* gid matching used vlan id*/
	u8			sgid_index;	/* gid index for vlan id      */
118 119 120 121 122 123
	u32			peer_qpn;	/* QP number of peer */
	enum ib_mtu		path_mtu;	/* used mtu */
	enum ib_mtu		peer_mtu;	/* mtu size of peer */
	u32			psn_initial;	/* QP tx initial packet seqno */
	u32			peer_psn;	/* QP rx initial packet seqno */
	u8			peer_mac[ETH_ALEN];	/* = gid[8:10||13:15] */
124
	u8			peer_gid[SMC_GID_SIZE];	/* gid of peer*/
U
Ursula Braun 已提交
125
	u8			link_id;	/* unique # within link group */
126
	u8			link_uid[SMC_LGR_ID_SIZE]; /* unique lnk id */
127
	u8			peer_link_uid[SMC_LGR_ID_SIZE]; /* peer uid */
128
	u8			link_idx;	/* index in lgr link array */
K
Karsten Graul 已提交
129
	u8			link_is_asym;	/* is link asymmetric? */
130
	struct smc_link_group	*lgr;		/* parent link group */
131
	struct work_struct	link_down_wrk;	/* wrk to bring link down */
132 133
	char			ibname[IB_DEVICE_NAME_MAX]; /* ib device name */
	int			ndev_ifidx; /* network device ifindex */
134 135

	enum smc_link_state	state;		/* state of link */
136 137 138
	struct delayed_work	llc_testlink_wrk; /* testlink worker */
	struct completion	llc_testlink_resp; /* wait for rx of testlink */
	int			llc_testlink_time; /* testlink interval */
139
	atomic_t		conn_cnt; /* connections on this link */
140 141 142 143 144
};

/* The SMC protocol allows up to 8 links per link group. The per-link arrays
 * in this file (e.g. smc_link_group.lnk[] and the smc_buf_desc SMC-R arrays)
 * are sized for SMC_LINKS_PER_LGR_MAX links.
 * SMC_SINGLE_LINK is link array index 0.
 */
#define SMC_LINKS_PER_LGR_MAX	3
#define SMC_SINGLE_LINK		0

U
Ursula Braun 已提交
148 149 150 151
/* tx/rx buffer list element for sndbufs list and rmbs list of a lgr */
struct smc_buf_desc {
	struct list_head	list;
	void			*cpu_addr;	/* virtual address of buffer */
152
	struct page		*pages;
153
	int			len;		/* length of buffer */
U
Ursula Braun 已提交
154
	u32			used;		/* currently used / unused */
155 156
	union {
		struct { /* SMC-R */
157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172
			struct sg_table	sgt[SMC_LINKS_PER_LGR_MAX];
					/* virtual buffer */
			struct ib_mr	*mr_rx[SMC_LINKS_PER_LGR_MAX];
					/* for rmb only: memory region
					 * incl. rkey provided to peer
					 */
			u32		order;	/* allocation order */

			u8		is_conf_rkey;
					/* confirm_rkey done */
			u8		is_reg_mr[SMC_LINKS_PER_LGR_MAX];
					/* mem region registered */
			u8		is_map_ib[SMC_LINKS_PER_LGR_MAX];
					/* mem region mapped to lnk */
			u8		is_reg_err;
					/* buffer registration err */
173 174
		};
		struct { /* SMC-D */
175 176 177 178 179 180
			unsigned short	sba_idx;
					/* SBA index number */
			u64		token;
					/* DMB token number */
			dma_addr_t	dma_addr;
					/* DMA address */
181 182
		};
	};
U
Ursula Braun 已提交
183 184
};

185 186 187 188 189
/* Remote token: the address and rkey needed to reach a peer RMB.
 * struct smc_link_group stores one per RMB and link in rtokens[][].
 */
struct smc_rtoken {				/* address/key of remote RMB */
	u64			dma_addr;	/* address of the remote RMB */
	u32			rkey;		/* key of the remote RMB */
};

190 191 192 193 194 195
#define SMC_BUF_MIN_SIZE	16384	/* minimum size of an RMB */
#define SMC_RMBE_SIZES		16	/* number of distinct RMBE sizes */
/* Theoretically, the RFC states that the largest size would be 512K,
 * i.e. compressed value 5 and thus 6 sizes (0..5), despite
 * struct smc_clc_msg_accept_confirm.rmbe_size being a 4 bit value (0..15).
 * SMC_RMBE_SIZES also sizes the sndbufs[] and rmbs[] list arrays of
 * struct smc_link_group.
 */
U
Ursula Braun 已提交
196

197 198
struct smcd_dev;

199 200 201 202 203 204 205 206
/* Redundancy state of an SMC-R link group, kept in smc_link_group.type. */
enum smc_lgr_type {				/* redundancy state of lgr */
	SMC_LGR_NONE,			/* no active links, lgr to be deleted */
	SMC_LGR_SINGLE,			/* 1 active RNIC on each peer */
	SMC_LGR_SYMMETRIC,		/* 2 active RNICs on each peer */
	SMC_LGR_ASYMMETRIC_PEER,	/* local has 2, peer 1 active RNICs */
	SMC_LGR_ASYMMETRIC_LOCAL,	/* local has 1, peer 2 active RNICs */
};

207 208 209 210 211 212 213 214 215 216 217 218 219 220
/* Kind of LLC flow recorded in struct smc_llc_flow.type.
 * NOTE(review): the explicit even values presumably mirror LLC message type
 * codes defined elsewhere - confirm before renumbering.
 */
enum smc_llc_flowtype {
	SMC_LLC_FLOW_NONE	= 0,	/* no flow */
	SMC_LLC_FLOW_ADD_LINK	= 2,
	SMC_LLC_FLOW_DEL_LINK	= 4,
	SMC_LLC_FLOW_RKEY	= 6,
};

struct smc_llc_qentry;

/* One LLC flow: its type plus a pointer to an associated queue entry.
 * struct smc_link_group holds two of these, llc_flow_lcl and llc_flow_rmt.
 */
struct smc_llc_flow {
	enum smc_llc_flowtype type;	/* kind of flow */
	struct smc_llc_qentry *qentry;	/* queue entry belonging to the flow */
};

221 222 223 224 225 226
struct smc_link_group {
	struct list_head	list;
	struct rb_root		conns_all;	/* connection tree */
	rwlock_t		conns_lock;	/* protects conns_all */
	unsigned int		conns_num;	/* current # of connections */
	unsigned short		vlan_id;	/* vlan id of link group */
U
Ursula Braun 已提交
227 228

	struct list_head	sndbufs[SMC_RMBE_SIZES];/* tx buffers */
229
	struct mutex		sndbufs_lock;	/* protects tx buffers */
U
Ursula Braun 已提交
230
	struct list_head	rmbs[SMC_RMBE_SIZES];	/* rx buffers */
231
	struct mutex		rmbs_lock;	/* protects rx buffers */
232

U
Ursula Braun 已提交
233
	u8			id[SMC_LGR_ID_SIZE];	/* unique lgr id */
234
	struct delayed_work	free_work;	/* delayed freeing of an lgr */
235
	struct work_struct	terminate_work;	/* abnormal lgr termination */
236
	struct workqueue_struct	*tx_wq;		/* wq for conn. tx workers */
237 238
	u8			sync_err : 1;	/* lgr no longer fits to peer */
	u8			terminating : 1;/* lgr is terminating */
U
Ursula Braun 已提交
239
	u8			freeing : 1;	/* lgr is being freed */
240 241

	bool			is_smcd;	/* SMC-R or SMC-D */
242 243 244 245 246
	u8			smc_version;
	u8			negotiated_eid[SMC_MAX_EID_LEN];
	u8			peer_os;	/* peer operating system */
	u8			peer_smc_release;
	u8			peer_hostname[SMC_MAX_HOSTNAME_LEN];
247 248 249 250 251 252 253 254 255 256 257
	union {
		struct { /* SMC-R */
			enum smc_lgr_role	role;
						/* client or server */
			struct smc_link		lnk[SMC_LINKS_PER_LGR_MAX];
						/* smc link */
			char			peer_systemid[SMC_SYSTEMID_LEN];
						/* unique system_id of peer */
			struct smc_rtoken	rtokens[SMC_RMBS_PER_LGR_MAX]
						[SMC_LINKS_PER_LGR_MAX];
						/* remote addr/key pairs */
258
			DECLARE_BITMAP(rtokens_used_mask, SMC_RMBS_PER_LGR_MAX);
259
						/* used rtoken elements */
260
			u8			next_link_id;
261 262
			enum smc_lgr_type	type;
						/* redundancy state */
263 264
			u8			pnet_id[SMC_MAX_PNETID_LEN + 1];
						/* pnet id of this lgr */
265 266 267 268
			struct list_head	llc_event_q;
						/* queue for llc events */
			spinlock_t		llc_event_q_lock;
						/* protects llc_event_q */
269 270
			struct mutex		llc_conf_mutex;
						/* protects lgr reconfig. */
271
			struct work_struct	llc_add_link_work;
272
			struct work_struct	llc_del_link_work;
273 274
			struct work_struct	llc_event_work;
						/* llc event worker */
275
			wait_queue_head_t	llc_flow_waiter;
276
						/* w4 next llc event */
277 278
			wait_queue_head_t	llc_msg_waiter;
						/* w4 next llc msg */
279 280 281 282 283 284 285 286
			struct smc_llc_flow	llc_flow_lcl;
						/* llc local control field */
			struct smc_llc_flow	llc_flow_rmt;
						/* llc remote control field */
			struct smc_llc_qentry	*delayed_event;
						/* arrived when flow active */
			spinlock_t		llc_flow_lock;
						/* protects llc flow */
287 288
			int			llc_testlink_time;
						/* link keep alive time */
289 290
			u32			llc_termination_rsn;
						/* rsn code for termination */
291 292 293
			u8			nexthop_mac[ETH_ALEN];
			u8			uses_gateway;
			__be32			saddr;
294 295 296 297 298 299
		};
		struct { /* SMC-D */
			u64			peer_gid;
						/* Peer GID (remote) */
			struct smcd_dev		*smcd;
						/* ISM device for VLAN reg. */
300 301
			u8			peer_shutdown : 1;
						/* peer triggered shutdownn */
302 303
		};
	};
304 305
};

306 307
struct smc_clc_msg_local;

308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332
#define GID_LIST_SIZE	2	/* capacity of smc_gidlist.list */

/* Small fixed-capacity list of GIDs, each SMC_GID_SIZE bytes long. */
struct smc_gidlist {
	u8			len;	/* presumably # of valid entries in
					 * list - TODO confirm against users
					 */
	u8			list[GID_LIST_SIZE][SMC_GID_SIZE];
};

/* SMC-Rv2 part of struct smc_init_info (embedded there as member smcrv2).
 * The field groups below are labelled as inputs or outputs depending on
 * which inputs are set.
 */
struct smc_init_info_smcrv2 {
	/* Input fields */
	__be32			saddr;
	struct sock		*clc_sk;
	__be32			daddr;

	/* Output fields when saddr is set */
	struct smc_ib_device	*ib_dev_v2;
	u8			ib_port_v2;
	u8			ib_gid_v2[SMC_GID_SIZE];

	/* Additional output fields when clc_sk and daddr is set as well */
	u8			uses_gateway;
	u8			nexthop_mac[ETH_ALEN];

	struct smc_gidlist	gidlist;
};

333 334
struct smc_init_info {
	u8			is_smcd;
335 336
	u8			smc_type_v1;
	u8			smc_type_v2;
337 338
	u8			first_contact_peer;
	u8			first_contact_local;
339
	unsigned short		vlan_id;
340
	u32			rc;
341
	u8			negotiated_eid[SMC_MAX_EID_LEN];
342 343
	/* SMC-R */
	struct smc_clc_msg_local *ib_lcl;
344 345
	u8			smcr_version;
	u8			check_smcrv2;
346 347 348
	u8			peer_gid[SMC_GID_SIZE];
	u8			peer_mac[ETH_ALEN];
	u8			peer_systemid[SMC_SYSTEMID_LEN];
349 350 351 352
	struct smc_ib_device	*ib_dev;
	u8			ib_gid[SMC_GID_SIZE];
	u8			ib_port;
	u32			ib_clcqpn;
353
	struct smc_init_info_smcrv2 smcrv2;
354
	/* SMC-D */
355 356
	u64			ism_peer_gid[SMC_MAX_ISM_DEVS + 1];
	struct smcd_dev		*ism_dev[SMC_MAX_ISM_DEVS + 1];
357
	u16			ism_chid[SMC_MAX_ISM_DEVS + 1];
358
	u8			ism_offered_cnt; /* # of ISM devices offered */
359
	u8			ism_selected;    /* index of selected ISM dev*/
360
	u8			smcd_version;
361 362
};

363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395
/* Look up the connection that owns the given local alert token.
 * The link group's conns_all rbtree is descended by hand, comparing each
 * node's alert_token_local with the wanted token.
 * Requires @conns_lock
 * @token	alert token to search for
 * @lgr		link group to search in
 * Returns the matching connection, or NULL if no node carries the token.
 */
static inline struct smc_connection *smc_lgr_find_conn(
	u32 token, struct smc_link_group *lgr)
{
	struct rb_node *pos = lgr->conns_all.rb_node;
	struct smc_connection *conn;

	while (pos) {
		conn = rb_entry(pos, struct smc_connection, alert_node);
		if (conn->alert_token_local == token)
			return conn;
		/* go left if this node's token is larger, right otherwise */
		if (conn->alert_token_local > token)
			pos = pos->rb_left;
		else
			pos = pos->rb_right;
	}

	return NULL;
}

396 397 398 399 400 401 402 403
/* Tell whether a link may be used: every state except SMC_LNK_UNUSED and
 * SMC_LNK_INACTIVE counts as usable.
 */
static inline bool smc_link_usable(struct smc_link *lnk)
{
	switch (lnk->state) {
	case SMC_LNK_UNUSED:
	case SMC_LNK_INACTIVE:
		return false;
	default:
		return true;
	}
}

404 405 406 407 408
/* returns true only if the specified link is in state SMC_LNK_ACTIVE */
static inline bool smc_link_active(struct smc_link *lnk)
{
	if (lnk->state != SMC_LNK_ACTIVE)
		return false;
	return true;
}

409 410 411 412 413 414 415 416 417 418 419 420 421
/* Format a raw GID as eight colon-separated groups of four hex digits.
 * @buf		output string; must have room for at least 40 bytes
 *		(39 characters plus the terminating NUL)
 * @gid_raw	raw GID; 16 bytes are read and each pair of bytes is
 *		interpreted as one big-endian 16-bit group
 *
 * The groups are assembled byte by byte instead of dereferencing @gid_raw
 * through a __be16 pointer, so no alignment of @gid_raw is required.
 */
static inline void smc_gid_be16_convert(__u8 *buf, u8 *gid_raw)
{
	unsigned int grp[8];
	int i;

	for (i = 0; i < 8; i++)
		grp[i] = ((unsigned int)gid_raw[2 * i] << 8) |
			 gid_raw[2 * i + 1];

	sprintf((char *)buf, "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x",
		grp[0], grp[1], grp[2], grp[3],
		grp[4], grp[5], grp[6], grp[7]);
}

422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447
/* PCI identification data of a device, filled in by smc_set_pci_values(). */
struct smc_pci_dev {
	__u32		pci_fid;	/* only set when CONFIG_S390 is enabled */
	__u16		pci_pchid;	/* only set when CONFIG_S390 is enabled */
	__u16		pci_vendor;	/* copied from pci_dev->vendor */
	__u16		pci_device;	/* copied from pci_dev->device */
	__u8		pci_id[SMC_PCI_ID_STR_LEN];	/* pci_name() string */
};

/* Copy the identifying values of @pci_dev into @smc_dev.
 * @pci_dev	PCI device to read vendor, device and name from
 * @smc_dev	destination; pci_fid and pci_pchid are written only when
 *		CONFIG_S390 is enabled and are otherwise left as they were
 */
static inline void smc_set_pci_values(struct pci_dev *pci_dev,
				      struct smc_pci_dev *smc_dev)
{
#if IS_ENABLED(CONFIG_S390)
	struct zpci_dev *zpci = to_zpci(pci_dev);

	/* Set s390 specific PCI information */
	smc_dev->pci_fid = zpci->fid;
	smc_dev->pci_pchid = zpci->pchid;
#endif
	snprintf(smc_dev->pci_id, sizeof(smc_dev->pci_id), "%s",
		 pci_name(pci_dev));
	smc_dev->pci_device = pci_dev->device;
	smc_dev->pci_vendor = pci_dev->vendor;
}

U
Ursula Braun 已提交
448 449 450
struct smc_sock;
struct smc_clc_msg_accept_confirm;

451
void smc_lgr_cleanup_early(struct smc_connection *conn);
452
void smc_lgr_terminate_sched(struct smc_link_group *lgr);
453
void smcr_port_add(struct smc_ib_device *smcibdev, u8 ibport);
454
void smcr_port_err(struct smc_ib_device *smcibdev, u8 ibport);
H
Hans Wippel 已提交
455 456
void smc_smcd_terminate(struct smcd_dev *dev, u64 peer_gid,
			unsigned short vlan);
457
void smc_smcd_terminate_all(struct smcd_dev *dev);
458
void smc_smcr_terminate_all(struct smc_ib_device *smcibdev);
459
int smc_buf_create(struct smc_sock *smc, bool is_smcd);
460
int smc_uncompress_bufsize(u8 compressed);
461
int smc_rmb_rtoken_handling(struct smc_connection *conn, struct smc_link *link,
462
			    struct smc_clc_msg_accept_confirm *clc);
463 464
int smc_rtoken_add(struct smc_link *lnk, __be64 nw_vaddr, __be32 nw_rkey);
int smc_rtoken_delete(struct smc_link *lnk, __be32 nw_rkey);
465 466 467 468
void smc_rtoken_set(struct smc_link_group *lgr, int link_idx, int link_idx_new,
		    __be32 nw_rkey_known, __be64 nw_vaddr, __be32 nw_rkey);
void smc_rtoken_set2(struct smc_link_group *lgr, int rtok_idx, int link_id,
		     __be64 nw_vaddr, __be32 nw_rkey);
469 470 471 472
void smc_sndbuf_sync_sg_for_cpu(struct smc_connection *conn);
void smc_sndbuf_sync_sg_for_device(struct smc_connection *conn);
void smc_rmb_sync_sg_for_cpu(struct smc_connection *conn);
void smc_rmb_sync_sg_for_device(struct smc_connection *conn);
473
int smc_vlan_by_tcpsk(struct socket *clcsock, struct smc_init_info *ini);
474

475
void smc_conn_free(struct smc_connection *conn);
476
int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini);
477
void smc_lgr_schedule_free_work_fast(struct smc_link_group *lgr);
478
int smc_core_init(void);
479
void smc_core_exit(void);
480

481 482
int smcr_link_init(struct smc_link_group *lgr, struct smc_link *lnk,
		   u8 link_idx, struct smc_init_info *ini);
483
void smcr_link_clear(struct smc_link *lnk, bool log);
484 485
void smc_switch_link_and_count(struct smc_connection *conn,
			       struct smc_link *to_lnk);
486 487
int smcr_buf_map_lgr(struct smc_link *lnk);
int smcr_buf_reg_lgr(struct smc_link *lnk);
K
Karsten Graul 已提交
488 489 490
void smcr_lgr_set_type(struct smc_link_group *lgr, enum smc_lgr_type new_type);
void smcr_lgr_set_type_asym(struct smc_link_group *lgr,
			    enum smc_lgr_type new_type, int asym_lnk_idx);
491
int smcr_link_reg_rmb(struct smc_link *link, struct smc_buf_desc *rmb_desc);
492 493
struct smc_link *smc_switch_conns(struct smc_link_group *lgr,
				  struct smc_link *from_lnk, bool is_dev_err);
494 495
void smcr_link_down_cond(struct smc_link *lnk);
void smcr_link_down_cond_sched(struct smc_link *lnk);
496
int smc_nl_get_sys_info(struct sk_buff *skb, struct netlink_callback *cb);
497
int smcr_nl_get_lgr(struct sk_buff *skb, struct netlink_callback *cb);
498
int smcr_nl_get_link(struct sk_buff *skb, struct netlink_callback *cb);
499
int smcd_nl_get_lgr(struct sk_buff *skb, struct netlink_callback *cb);
500

501 502
static inline struct smc_link_group *smc_get_lgr(struct smc_link *link)
{
503
	return link->lgr;
504
}
505
#endif