提交 eace92e3 编写于 作者: David S. Miller

Merge branch 'smc-get-rid-of-unsafe_global_rkey'

Ursula Braun says:

====================
net/smc: get rid of unsafe_global_rkey

The smc code uses the unsafe_global_rkey, exposing all memory for
remote reads and writes once a connection is established.
Here is now a patch series to get rid of unsafe_global_rkey usage.
Main idea is to switch to SG-logic and separate memory regions for RMBs.
====================
Signed-off-by: David S. Miller <davem@davemloft.net>
...@@ -8,10 +8,6 @@ config SMC ...@@ -8,10 +8,6 @@ config SMC
The Linux implementation of the SMC-R solution is designed as The Linux implementation of the SMC-R solution is designed as
a separate socket family SMC. a separate socket family SMC.
Warning: SMC will expose all memory for remote reads and writes
once a connection is established. Don't enable this option except
for tightly controlled lab environment.
Select this option if you want to run SMC socket applications Select this option if you want to run SMC socket applications
config SMC_DIAG config SMC_DIAG
......
...@@ -338,6 +338,12 @@ static int smc_clnt_conf_first_link(struct smc_sock *smc, union ib_gid *gid) ...@@ -338,6 +338,12 @@ static int smc_clnt_conf_first_link(struct smc_sock *smc, union ib_gid *gid)
return SMC_CLC_DECL_INTERR; return SMC_CLC_DECL_INTERR;
smc_wr_remember_qp_attr(link); smc_wr_remember_qp_attr(link);
rc = smc_wr_reg_send(link,
smc->conn.rmb_desc->mr_rx[SMC_SINGLE_LINK]);
if (rc)
return SMC_CLC_DECL_INTERR;
/* send CONFIRM LINK response over RoCE fabric */ /* send CONFIRM LINK response over RoCE fabric */
rc = smc_llc_send_confirm_link(link, rc = smc_llc_send_confirm_link(link,
link->smcibdev->mac[link->ibport - 1], link->smcibdev->mac[link->ibport - 1],
...@@ -430,12 +436,8 @@ static int smc_connect_rdma(struct smc_sock *smc) ...@@ -430,12 +436,8 @@ static int smc_connect_rdma(struct smc_sock *smc)
smc_conn_save_peer_info(smc, &aclc); smc_conn_save_peer_info(smc, &aclc);
rc = smc_sndbuf_create(smc); /* create send buffer and rmb */
if (rc) { rc = smc_buf_create(smc);
reason_code = SMC_CLC_DECL_MEM;
goto decline_rdma_unlock;
}
rc = smc_rmb_create(smc);
if (rc) { if (rc) {
reason_code = SMC_CLC_DECL_MEM; reason_code = SMC_CLC_DECL_MEM;
goto decline_rdma_unlock; goto decline_rdma_unlock;
...@@ -459,7 +461,20 @@ static int smc_connect_rdma(struct smc_sock *smc) ...@@ -459,7 +461,20 @@ static int smc_connect_rdma(struct smc_sock *smc)
reason_code = SMC_CLC_DECL_INTERR; reason_code = SMC_CLC_DECL_INTERR;
goto decline_rdma_unlock; goto decline_rdma_unlock;
} }
} else {
struct smc_buf_desc *buf_desc = smc->conn.rmb_desc;
if (!buf_desc->reused) {
/* register memory region for new rmb */
rc = smc_wr_reg_send(link,
buf_desc->mr_rx[SMC_SINGLE_LINK]);
if (rc) {
reason_code = SMC_CLC_DECL_INTERR;
goto decline_rdma_unlock;
}
}
} }
smc_rmb_sync_sg_for_device(&smc->conn);
rc = smc_clc_send_confirm(smc); rc = smc_clc_send_confirm(smc);
if (rc) if (rc)
...@@ -692,6 +707,12 @@ static int smc_serv_conf_first_link(struct smc_sock *smc) ...@@ -692,6 +707,12 @@ static int smc_serv_conf_first_link(struct smc_sock *smc)
int rc; int rc;
link = &lgr->lnk[SMC_SINGLE_LINK]; link = &lgr->lnk[SMC_SINGLE_LINK];
rc = smc_wr_reg_send(link,
smc->conn.rmb_desc->mr_rx[SMC_SINGLE_LINK]);
if (rc)
return SMC_CLC_DECL_INTERR;
/* send CONFIRM LINK request to client over the RoCE fabric */ /* send CONFIRM LINK request to client over the RoCE fabric */
rc = smc_llc_send_confirm_link(link, rc = smc_llc_send_confirm_link(link,
link->smcibdev->mac[link->ibport - 1], link->smcibdev->mac[link->ibport - 1],
...@@ -779,11 +800,6 @@ static void smc_listen_work(struct work_struct *work) ...@@ -779,11 +800,6 @@ static void smc_listen_work(struct work_struct *work)
mutex_lock(&smc_create_lgr_pending); mutex_lock(&smc_create_lgr_pending);
local_contact = smc_conn_create(new_smc, peeraddr.sin_addr.s_addr, local_contact = smc_conn_create(new_smc, peeraddr.sin_addr.s_addr,
smcibdev, ibport, &pclc.lcl, 0); smcibdev, ibport, &pclc.lcl, 0);
if (local_contact == SMC_REUSE_CONTACT)
/* lock no longer needed, free it due to following
* smc_clc_wait_msg() call
*/
mutex_unlock(&smc_create_lgr_pending);
if (local_contact < 0) { if (local_contact < 0) {
rc = local_contact; rc = local_contact;
if (rc == -ENOMEM) if (rc == -ENOMEM)
...@@ -794,12 +810,8 @@ static void smc_listen_work(struct work_struct *work) ...@@ -794,12 +810,8 @@ static void smc_listen_work(struct work_struct *work)
} }
link = &new_smc->conn.lgr->lnk[SMC_SINGLE_LINK]; link = &new_smc->conn.lgr->lnk[SMC_SINGLE_LINK];
rc = smc_sndbuf_create(new_smc); /* create send buffer and rmb */
if (rc) { rc = smc_buf_create(new_smc);
reason_code = SMC_CLC_DECL_MEM;
goto decline_rdma;
}
rc = smc_rmb_create(new_smc);
if (rc) { if (rc) {
reason_code = SMC_CLC_DECL_MEM; reason_code = SMC_CLC_DECL_MEM;
goto decline_rdma; goto decline_rdma;
...@@ -808,6 +820,21 @@ static void smc_listen_work(struct work_struct *work) ...@@ -808,6 +820,21 @@ static void smc_listen_work(struct work_struct *work)
smc_close_init(new_smc); smc_close_init(new_smc);
smc_rx_init(new_smc); smc_rx_init(new_smc);
if (local_contact != SMC_FIRST_CONTACT) {
struct smc_buf_desc *buf_desc = new_smc->conn.rmb_desc;
if (!buf_desc->reused) {
/* register memory region for new rmb */
rc = smc_wr_reg_send(link,
buf_desc->mr_rx[SMC_SINGLE_LINK]);
if (rc) {
reason_code = SMC_CLC_DECL_INTERR;
goto decline_rdma;
}
}
}
smc_rmb_sync_sg_for_device(&new_smc->conn);
rc = smc_clc_send_accept(new_smc, local_contact); rc = smc_clc_send_accept(new_smc, local_contact);
if (rc) if (rc)
goto out_err; goto out_err;
...@@ -853,8 +880,7 @@ static void smc_listen_work(struct work_struct *work) ...@@ -853,8 +880,7 @@ static void smc_listen_work(struct work_struct *work)
if (newsmcsk->sk_state == SMC_INIT) if (newsmcsk->sk_state == SMC_INIT)
newsmcsk->sk_state = SMC_ACTIVE; newsmcsk->sk_state = SMC_ACTIVE;
enqueue: enqueue:
if (local_contact == SMC_FIRST_CONTACT) mutex_unlock(&smc_create_lgr_pending);
mutex_unlock(&smc_create_lgr_pending);
lock_sock_nested(&lsmc->sk, SINGLE_DEPTH_NESTING); lock_sock_nested(&lsmc->sk, SINGLE_DEPTH_NESTING);
if (lsmc->sk.sk_state == SMC_LISTEN) { if (lsmc->sk.sk_state == SMC_LISTEN) {
smc_accept_enqueue(&lsmc->sk, newsmcsk); smc_accept_enqueue(&lsmc->sk, newsmcsk);
......
...@@ -204,13 +204,13 @@ int smc_clc_send_confirm(struct smc_sock *smc) ...@@ -204,13 +204,13 @@ int smc_clc_send_confirm(struct smc_sock *smc)
memcpy(&cclc.lcl.mac, &link->smcibdev->mac[link->ibport - 1], ETH_ALEN); memcpy(&cclc.lcl.mac, &link->smcibdev->mac[link->ibport - 1], ETH_ALEN);
hton24(cclc.qpn, link->roce_qp->qp_num); hton24(cclc.qpn, link->roce_qp->qp_num);
cclc.rmb_rkey = cclc.rmb_rkey =
htonl(conn->rmb_desc->rkey[SMC_SINGLE_LINK]); htonl(conn->rmb_desc->mr_rx[SMC_SINGLE_LINK]->rkey);
cclc.conn_idx = 1; /* for now: 1 RMB = 1 RMBE */ cclc.conn_idx = 1; /* for now: 1 RMB = 1 RMBE */
cclc.rmbe_alert_token = htonl(conn->alert_token_local); cclc.rmbe_alert_token = htonl(conn->alert_token_local);
cclc.qp_mtu = min(link->path_mtu, link->peer_mtu); cclc.qp_mtu = min(link->path_mtu, link->peer_mtu);
cclc.rmbe_size = conn->rmbe_size_short; cclc.rmbe_size = conn->rmbe_size_short;
cclc.rmb_dma_addr = cclc.rmb_dma_addr = cpu_to_be64(
cpu_to_be64((u64)conn->rmb_desc->dma_addr[SMC_SINGLE_LINK]); (u64)sg_dma_address(conn->rmb_desc->sgt[SMC_SINGLE_LINK].sgl));
hton24(cclc.psn, link->psn_initial); hton24(cclc.psn, link->psn_initial);
memcpy(cclc.trl.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)); memcpy(cclc.trl.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER));
...@@ -256,13 +256,13 @@ int smc_clc_send_accept(struct smc_sock *new_smc, int srv_first_contact) ...@@ -256,13 +256,13 @@ int smc_clc_send_accept(struct smc_sock *new_smc, int srv_first_contact)
memcpy(&aclc.lcl.mac, link->smcibdev->mac[link->ibport - 1], ETH_ALEN); memcpy(&aclc.lcl.mac, link->smcibdev->mac[link->ibport - 1], ETH_ALEN);
hton24(aclc.qpn, link->roce_qp->qp_num); hton24(aclc.qpn, link->roce_qp->qp_num);
aclc.rmb_rkey = aclc.rmb_rkey =
htonl(conn->rmb_desc->rkey[SMC_SINGLE_LINK]); htonl(conn->rmb_desc->mr_rx[SMC_SINGLE_LINK]->rkey);
aclc.conn_idx = 1; /* as long as 1 RMB = 1 RMBE */ aclc.conn_idx = 1; /* as long as 1 RMB = 1 RMBE */
aclc.rmbe_alert_token = htonl(conn->alert_token_local); aclc.rmbe_alert_token = htonl(conn->alert_token_local);
aclc.qp_mtu = link->path_mtu; aclc.qp_mtu = link->path_mtu;
aclc.rmbe_size = conn->rmbe_size_short, aclc.rmbe_size = conn->rmbe_size_short,
aclc.rmb_dma_addr = aclc.rmb_dma_addr = cpu_to_be64(
cpu_to_be64((u64)conn->rmb_desc->dma_addr[SMC_SINGLE_LINK]); (u64)sg_dma_address(conn->rmb_desc->sgt[SMC_SINGLE_LINK].sgl));
hton24(aclc.psn, link->psn_initial); hton24(aclc.psn, link->psn_initial);
memcpy(aclc.trl.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)); memcpy(aclc.trl.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER));
......
...@@ -175,7 +175,6 @@ static int smc_lgr_create(struct smc_sock *smc, __be32 peer_in_addr, ...@@ -175,7 +175,6 @@ static int smc_lgr_create(struct smc_sock *smc, __be32 peer_in_addr,
rc = smc_wr_alloc_link_mem(lnk); rc = smc_wr_alloc_link_mem(lnk);
if (rc) if (rc)
goto free_lgr; goto free_lgr;
init_waitqueue_head(&lnk->wr_tx_wait);
rc = smc_ib_create_protection_domain(lnk); rc = smc_ib_create_protection_domain(lnk);
if (rc) if (rc)
goto free_link_mem; goto free_link_mem;
...@@ -207,17 +206,14 @@ static int smc_lgr_create(struct smc_sock *smc, __be32 peer_in_addr, ...@@ -207,17 +206,14 @@ static int smc_lgr_create(struct smc_sock *smc, __be32 peer_in_addr,
return rc; return rc;
} }
static void smc_sndbuf_unuse(struct smc_connection *conn) static void smc_buf_unuse(struct smc_connection *conn)
{ {
if (conn->sndbuf_desc) { if (conn->sndbuf_desc) {
conn->sndbuf_desc->used = 0; conn->sndbuf_desc->used = 0;
conn->sndbuf_size = 0; conn->sndbuf_size = 0;
} }
}
static void smc_rmb_unuse(struct smc_connection *conn)
{
if (conn->rmb_desc) { if (conn->rmb_desc) {
conn->rmb_desc->reused = true;
conn->rmb_desc->used = 0; conn->rmb_desc->used = 0;
conn->rmbe_size = 0; conn->rmbe_size = 0;
} }
...@@ -232,8 +228,7 @@ void smc_conn_free(struct smc_connection *conn) ...@@ -232,8 +228,7 @@ void smc_conn_free(struct smc_connection *conn)
return; return;
smc_cdc_tx_dismiss_slots(conn); smc_cdc_tx_dismiss_slots(conn);
smc_lgr_unregister_conn(conn); smc_lgr_unregister_conn(conn);
smc_rmb_unuse(conn); smc_buf_unuse(conn);
smc_sndbuf_unuse(conn);
} }
static void smc_link_clear(struct smc_link *lnk) static void smc_link_clear(struct smc_link *lnk)
...@@ -246,48 +241,57 @@ static void smc_link_clear(struct smc_link *lnk) ...@@ -246,48 +241,57 @@ static void smc_link_clear(struct smc_link *lnk)
smc_wr_free_link_mem(lnk); smc_wr_free_link_mem(lnk);
} }
static void smc_lgr_free_sndbufs(struct smc_link_group *lgr) static void smc_buf_free(struct smc_buf_desc *buf_desc, struct smc_link *lnk,
bool is_rmb)
{ {
struct smc_buf_desc *sndbuf_desc, *bf_desc; if (is_rmb) {
int i; if (buf_desc->mr_rx[SMC_SINGLE_LINK])
smc_ib_put_memory_region(
for (i = 0; i < SMC_RMBE_SIZES; i++) { buf_desc->mr_rx[SMC_SINGLE_LINK]);
list_for_each_entry_safe(sndbuf_desc, bf_desc, &lgr->sndbufs[i], smc_ib_buf_unmap_sg(lnk->smcibdev, buf_desc,
list) { DMA_FROM_DEVICE);
list_del(&sndbuf_desc->list); } else {
smc_ib_buf_unmap(lgr->lnk[SMC_SINGLE_LINK].smcibdev, smc_ib_buf_unmap_sg(lnk->smcibdev, buf_desc,
smc_uncompress_bufsize(i), DMA_TO_DEVICE);
sndbuf_desc, DMA_TO_DEVICE);
kfree(sndbuf_desc->cpu_addr);
kfree(sndbuf_desc);
}
} }
sg_free_table(&buf_desc->sgt[SMC_SINGLE_LINK]);
if (buf_desc->cpu_addr)
free_pages((unsigned long)buf_desc->cpu_addr, buf_desc->order);
kfree(buf_desc);
} }
static void smc_lgr_free_rmbs(struct smc_link_group *lgr) static void __smc_lgr_free_bufs(struct smc_link_group *lgr, bool is_rmb)
{ {
struct smc_buf_desc *rmb_desc, *bf_desc;
struct smc_link *lnk = &lgr->lnk[SMC_SINGLE_LINK]; struct smc_link *lnk = &lgr->lnk[SMC_SINGLE_LINK];
struct smc_buf_desc *buf_desc, *bf_desc;
struct list_head *buf_list;
int i; int i;
for (i = 0; i < SMC_RMBE_SIZES; i++) { for (i = 0; i < SMC_RMBE_SIZES; i++) {
list_for_each_entry_safe(rmb_desc, bf_desc, &lgr->rmbs[i], if (is_rmb)
buf_list = &lgr->rmbs[i];
else
buf_list = &lgr->sndbufs[i];
list_for_each_entry_safe(buf_desc, bf_desc, buf_list,
list) { list) {
list_del(&rmb_desc->list); list_del(&buf_desc->list);
smc_ib_buf_unmap(lnk->smcibdev, smc_buf_free(buf_desc, lnk, is_rmb);
smc_uncompress_bufsize(i),
rmb_desc, DMA_FROM_DEVICE);
kfree(rmb_desc->cpu_addr);
kfree(rmb_desc);
} }
} }
} }
static void smc_lgr_free_bufs(struct smc_link_group *lgr)
{
/* free send buffers */
__smc_lgr_free_bufs(lgr, false);
/* free rmbs */
__smc_lgr_free_bufs(lgr, true);
}
/* remove a link group */ /* remove a link group */
void smc_lgr_free(struct smc_link_group *lgr) void smc_lgr_free(struct smc_link_group *lgr)
{ {
smc_lgr_free_rmbs(lgr); smc_lgr_free_bufs(lgr);
smc_lgr_free_sndbufs(lgr);
smc_link_clear(&lgr->lnk[SMC_SINGLE_LINK]); smc_link_clear(&lgr->lnk[SMC_SINGLE_LINK]);
kfree(lgr); kfree(lgr);
} }
...@@ -452,45 +456,25 @@ int smc_conn_create(struct smc_sock *smc, __be32 peer_in_addr, ...@@ -452,45 +456,25 @@ int smc_conn_create(struct smc_sock *smc, __be32 peer_in_addr,
return rc ? rc : local_contact; return rc ? rc : local_contact;
} }
/* try to reuse a sndbuf description slot of the sndbufs list for a certain /* try to reuse a sndbuf or rmb description slot for a certain
* buf_size; if not available, return NULL * buffer size; if not available, return NULL
*/ */
static inline static inline
struct smc_buf_desc *smc_sndbuf_get_slot(struct smc_link_group *lgr, struct smc_buf_desc *smc_buf_get_slot(struct smc_link_group *lgr,
int compressed_bufsize) int compressed_bufsize,
rwlock_t *lock,
struct list_head *buf_list)
{ {
struct smc_buf_desc *sndbuf_slot; struct smc_buf_desc *buf_slot;
read_lock_bh(&lgr->sndbufs_lock);
list_for_each_entry(sndbuf_slot, &lgr->sndbufs[compressed_bufsize],
list) {
if (cmpxchg(&sndbuf_slot->used, 0, 1) == 0) {
read_unlock_bh(&lgr->sndbufs_lock);
return sndbuf_slot;
}
}
read_unlock_bh(&lgr->sndbufs_lock);
return NULL;
}
/* try to reuse an rmb description slot of the rmbs list for a certain read_lock_bh(lock);
* rmbe_size; if not available, return NULL list_for_each_entry(buf_slot, buf_list, list) {
*/ if (cmpxchg(&buf_slot->used, 0, 1) == 0) {
static inline read_unlock_bh(lock);
struct smc_buf_desc *smc_rmb_get_slot(struct smc_link_group *lgr, return buf_slot;
int compressed_bufsize)
{
struct smc_buf_desc *rmb_slot;
read_lock_bh(&lgr->rmbs_lock);
list_for_each_entry(rmb_slot, &lgr->rmbs[compressed_bufsize],
list) {
if (cmpxchg(&rmb_slot->used, 0, 1) == 0) {
read_unlock_bh(&lgr->rmbs_lock);
return rmb_slot;
} }
} }
read_unlock_bh(&lgr->rmbs_lock); read_unlock_bh(lock);
return NULL; return NULL;
} }
...@@ -503,136 +487,186 @@ static inline int smc_rmb_wnd_update_limit(int rmbe_size) ...@@ -503,136 +487,186 @@ static inline int smc_rmb_wnd_update_limit(int rmbe_size)
return min_t(int, rmbe_size / 10, SOCK_MIN_SNDBUF / 2); return min_t(int, rmbe_size / 10, SOCK_MIN_SNDBUF / 2);
} }
/* create the tx buffer for an SMC socket */ static struct smc_buf_desc *smc_new_buf_create(struct smc_link_group *lgr,
int smc_sndbuf_create(struct smc_sock *smc) bool is_rmb, int bufsize)
{ {
struct smc_connection *conn = &smc->conn; struct smc_buf_desc *buf_desc;
struct smc_link_group *lgr = conn->lgr; struct smc_link *lnk;
int tmp_bufsize, tmp_bufsize_short;
struct smc_buf_desc *sndbuf_desc;
int rc; int rc;
/* use socket send buffer size (w/o overhead) as start value */ /* try to alloc a new buffer */
for (tmp_bufsize_short = smc_compress_bufsize(smc->sk.sk_sndbuf / 2); buf_desc = kzalloc(sizeof(*buf_desc), GFP_KERNEL);
tmp_bufsize_short >= 0; tmp_bufsize_short--) { if (!buf_desc)
tmp_bufsize = smc_uncompress_bufsize(tmp_bufsize_short); return ERR_PTR(-ENOMEM);
/* check for reusable sndbuf_slot in the link group */
sndbuf_desc = smc_sndbuf_get_slot(lgr, tmp_bufsize_short); buf_desc->cpu_addr =
if (sndbuf_desc) { (void *)__get_free_pages(GFP_KERNEL | __GFP_NOWARN |
memset(sndbuf_desc->cpu_addr, 0, tmp_bufsize); __GFP_NOMEMALLOC |
break; /* found reusable slot */ __GFP_NORETRY | __GFP_ZERO,
} get_order(bufsize));
/* try to alloc a new send buffer */ if (!buf_desc->cpu_addr) {
sndbuf_desc = kzalloc(sizeof(*sndbuf_desc), GFP_KERNEL); kfree(buf_desc);
if (!sndbuf_desc) return ERR_PTR(-EAGAIN);
break; /* give up with -ENOMEM */ }
sndbuf_desc->cpu_addr = kzalloc(tmp_bufsize, buf_desc->order = get_order(bufsize);
GFP_KERNEL | __GFP_NOWARN |
__GFP_NOMEMALLOC | /* build the sg table from the pages */
__GFP_NORETRY); lnk = &lgr->lnk[SMC_SINGLE_LINK];
if (!sndbuf_desc->cpu_addr) { rc = sg_alloc_table(&buf_desc->sgt[SMC_SINGLE_LINK], 1,
kfree(sndbuf_desc); GFP_KERNEL);
sndbuf_desc = NULL; if (rc) {
/* if send buffer allocation has failed, smc_buf_free(buf_desc, lnk, is_rmb);
* try a smaller one return ERR_PTR(rc);
*/ }
continue; sg_set_buf(buf_desc->sgt[SMC_SINGLE_LINK].sgl,
} buf_desc->cpu_addr, bufsize);
rc = smc_ib_buf_map(lgr->lnk[SMC_SINGLE_LINK].smcibdev,
tmp_bufsize, sndbuf_desc, /* map sg table to DMA address */
DMA_TO_DEVICE); rc = smc_ib_buf_map_sg(lnk->smcibdev, buf_desc,
is_rmb ? DMA_FROM_DEVICE : DMA_TO_DEVICE);
/* SMC protocol depends on mapping to one DMA address only */
if (rc != 1) {
smc_buf_free(buf_desc, lnk, is_rmb);
return ERR_PTR(-EAGAIN);
}
/* create a new memory region for the RMB */
if (is_rmb) {
rc = smc_ib_get_memory_region(lnk->roce_pd,
IB_ACCESS_REMOTE_WRITE |
IB_ACCESS_LOCAL_WRITE,
buf_desc);
if (rc) { if (rc) {
kfree(sndbuf_desc->cpu_addr); smc_buf_free(buf_desc, lnk, is_rmb);
kfree(sndbuf_desc); return ERR_PTR(rc);
sndbuf_desc = NULL;
continue; /* if mapping failed, try smaller one */
} }
sndbuf_desc->used = 1;
write_lock_bh(&lgr->sndbufs_lock);
list_add(&sndbuf_desc->list,
&lgr->sndbufs[tmp_bufsize_short]);
write_unlock_bh(&lgr->sndbufs_lock);
break;
}
if (sndbuf_desc && sndbuf_desc->cpu_addr) {
conn->sndbuf_desc = sndbuf_desc;
conn->sndbuf_size = tmp_bufsize;
smc->sk.sk_sndbuf = tmp_bufsize * 2;
atomic_set(&conn->sndbuf_space, tmp_bufsize);
return 0;
} else {
return -ENOMEM;
} }
return buf_desc;
} }
/* create the RMB for an SMC socket (even though the SMC protocol static int __smc_buf_create(struct smc_sock *smc, bool is_rmb)
* allows more than one RMB-element per RMB, the Linux implementation
* uses just one RMB-element per RMB, i.e. uses an extra RMB for every
* connection in a link group
*/
int smc_rmb_create(struct smc_sock *smc)
{ {
struct smc_connection *conn = &smc->conn; struct smc_connection *conn = &smc->conn;
struct smc_link_group *lgr = conn->lgr; struct smc_link_group *lgr = conn->lgr;
int tmp_bufsize, tmp_bufsize_short; struct smc_buf_desc *buf_desc = NULL;
struct smc_buf_desc *rmb_desc; struct list_head *buf_list;
int rc; int bufsize, bufsize_short;
int sk_buf_size;
rwlock_t *lock;
if (is_rmb)
/* use socket recv buffer size (w/o overhead) as start value */
sk_buf_size = smc->sk.sk_rcvbuf / 2;
else
/* use socket send buffer size (w/o overhead) as start value */
sk_buf_size = smc->sk.sk_sndbuf / 2;
for (bufsize_short = smc_compress_bufsize(smc->sk.sk_sndbuf / 2);
bufsize_short >= 0; bufsize_short--) {
if (is_rmb) {
lock = &lgr->rmbs_lock;
buf_list = &lgr->rmbs[bufsize_short];
} else {
lock = &lgr->sndbufs_lock;
buf_list = &lgr->sndbufs[bufsize_short];
}
bufsize = smc_uncompress_bufsize(bufsize_short);
if ((1 << get_order(bufsize)) > SG_MAX_SINGLE_ALLOC)
continue;
/* use socket recv buffer size (w/o overhead) as start value */ /* check for reusable slot in the link group */
for (tmp_bufsize_short = smc_compress_bufsize(smc->sk.sk_rcvbuf / 2); buf_desc = smc_buf_get_slot(lgr, bufsize_short, lock, buf_list);
tmp_bufsize_short >= 0; tmp_bufsize_short--) { if (buf_desc) {
tmp_bufsize = smc_uncompress_bufsize(tmp_bufsize_short); memset(buf_desc->cpu_addr, 0, bufsize);
/* check for reusable rmb_slot in the link group */
rmb_desc = smc_rmb_get_slot(lgr, tmp_bufsize_short);
if (rmb_desc) {
memset(rmb_desc->cpu_addr, 0, tmp_bufsize);
break; /* found reusable slot */ break; /* found reusable slot */
} }
/* try to alloc a new RMB */
rmb_desc = kzalloc(sizeof(*rmb_desc), GFP_KERNEL); buf_desc = smc_new_buf_create(lgr, is_rmb, bufsize);
if (!rmb_desc) if (PTR_ERR(buf_desc) == -ENOMEM)
break; /* give up with -ENOMEM */ break;
rmb_desc->cpu_addr = kzalloc(tmp_bufsize, if (IS_ERR(buf_desc))
GFP_KERNEL | __GFP_NOWARN |
__GFP_NOMEMALLOC |
__GFP_NORETRY);
if (!rmb_desc->cpu_addr) {
kfree(rmb_desc);
rmb_desc = NULL;
/* if RMB allocation has failed,
* try a smaller one
*/
continue; continue;
}
rc = smc_ib_buf_map(lgr->lnk[SMC_SINGLE_LINK].smcibdev, buf_desc->used = 1;
tmp_bufsize, rmb_desc, write_lock_bh(lock);
DMA_FROM_DEVICE); list_add(&buf_desc->list, buf_list);
if (rc) { write_unlock_bh(lock);
kfree(rmb_desc->cpu_addr); break; /* found */
kfree(rmb_desc);
rmb_desc = NULL;
continue; /* if mapping failed, try smaller one */
}
rmb_desc->rkey[SMC_SINGLE_LINK] =
lgr->lnk[SMC_SINGLE_LINK].roce_pd->unsafe_global_rkey;
rmb_desc->used = 1;
write_lock_bh(&lgr->rmbs_lock);
list_add(&rmb_desc->list,
&lgr->rmbs[tmp_bufsize_short]);
write_unlock_bh(&lgr->rmbs_lock);
break;
} }
if (rmb_desc && rmb_desc->cpu_addr) {
conn->rmb_desc = rmb_desc; if (IS_ERR(buf_desc))
conn->rmbe_size = tmp_bufsize; return -ENOMEM;
conn->rmbe_size_short = tmp_bufsize_short;
smc->sk.sk_rcvbuf = tmp_bufsize * 2; if (is_rmb) {
conn->rmb_desc = buf_desc;
conn->rmbe_size = bufsize;
conn->rmbe_size_short = bufsize_short;
smc->sk.sk_rcvbuf = bufsize * 2;
atomic_set(&conn->bytes_to_rcv, 0); atomic_set(&conn->bytes_to_rcv, 0);
conn->rmbe_update_limit = smc_rmb_wnd_update_limit(tmp_bufsize); conn->rmbe_update_limit = smc_rmb_wnd_update_limit(bufsize);
return 0;
} else { } else {
return -ENOMEM; conn->sndbuf_desc = buf_desc;
conn->sndbuf_size = bufsize;
smc->sk.sk_sndbuf = bufsize * 2;
atomic_set(&conn->sndbuf_space, bufsize);
} }
return 0;
}
void smc_sndbuf_sync_sg_for_cpu(struct smc_connection *conn)
{
struct smc_link_group *lgr = conn->lgr;
smc_ib_sync_sg_for_cpu(lgr->lnk[SMC_SINGLE_LINK].smcibdev,
conn->sndbuf_desc, DMA_TO_DEVICE);
}
void smc_sndbuf_sync_sg_for_device(struct smc_connection *conn)
{
struct smc_link_group *lgr = conn->lgr;
smc_ib_sync_sg_for_device(lgr->lnk[SMC_SINGLE_LINK].smcibdev,
conn->sndbuf_desc, DMA_TO_DEVICE);
}
void smc_rmb_sync_sg_for_cpu(struct smc_connection *conn)
{
struct smc_link_group *lgr = conn->lgr;
smc_ib_sync_sg_for_cpu(lgr->lnk[SMC_SINGLE_LINK].smcibdev,
conn->rmb_desc, DMA_FROM_DEVICE);
}
void smc_rmb_sync_sg_for_device(struct smc_connection *conn)
{
struct smc_link_group *lgr = conn->lgr;
smc_ib_sync_sg_for_device(lgr->lnk[SMC_SINGLE_LINK].smcibdev,
conn->rmb_desc, DMA_FROM_DEVICE);
}
/* create the send and receive buffer for an SMC socket;
* receive buffers are called RMBs;
* (even though the SMC protocol allows more than one RMB-element per RMB,
* the Linux implementation uses just one RMB-element per RMB, i.e. uses an
* extra RMB for every connection in a link group
*/
int smc_buf_create(struct smc_sock *smc)
{
int rc;
/* create send buffer */
rc = __smc_buf_create(smc, false);
if (rc)
return rc;
/* create rmb */
rc = __smc_buf_create(smc, true);
if (rc)
smc_buf_free(smc->conn.sndbuf_desc,
&smc->conn.lgr->lnk[SMC_SINGLE_LINK], false);
return rc;
} }
static inline int smc_rmb_reserve_rtoken_idx(struct smc_link_group *lgr) static inline int smc_rmb_reserve_rtoken_idx(struct smc_link_group *lgr)
......
...@@ -37,6 +37,14 @@ struct smc_wr_buf { ...@@ -37,6 +37,14 @@ struct smc_wr_buf {
u8 raw[SMC_WR_BUF_SIZE]; u8 raw[SMC_WR_BUF_SIZE];
}; };
#define SMC_WR_REG_MR_WAIT_TIME (5 * HZ)/* wait time for ib_wr_reg_mr result */
enum smc_wr_reg_state {
POSTED, /* ib_wr_reg_mr request posted */
CONFIRMED, /* ib_wr_reg_mr response: successful */
FAILED /* ib_wr_reg_mr response: failure */
};
struct smc_link { struct smc_link {
struct smc_ib_device *smcibdev; /* ib-device */ struct smc_ib_device *smcibdev; /* ib-device */
u8 ibport; /* port - values 1 | 2 */ u8 ibport; /* port - values 1 | 2 */
...@@ -65,6 +73,10 @@ struct smc_link { ...@@ -65,6 +73,10 @@ struct smc_link {
u64 wr_rx_id; /* seq # of last recv WR */ u64 wr_rx_id; /* seq # of last recv WR */
u32 wr_rx_cnt; /* number of WR recv buffers */ u32 wr_rx_cnt; /* number of WR recv buffers */
struct ib_reg_wr wr_reg; /* WR register memory region */
wait_queue_head_t wr_reg_wait; /* wait for wr_reg result */
enum smc_wr_reg_state wr_reg_state; /* state of wr_reg request */
union ib_gid gid; /* gid matching used vlan id */ union ib_gid gid; /* gid matching used vlan id */
u32 peer_qpn; /* QP number of peer */ u32 peer_qpn; /* QP number of peer */
enum ib_mtu path_mtu; /* used mtu */ enum ib_mtu path_mtu; /* used mtu */
...@@ -90,14 +102,15 @@ struct smc_link { ...@@ -90,14 +102,15 @@ struct smc_link {
/* tx/rx buffer list element for sndbufs list and rmbs list of a lgr */ /* tx/rx buffer list element for sndbufs list and rmbs list of a lgr */
struct smc_buf_desc { struct smc_buf_desc {
struct list_head list; struct list_head list;
u64 dma_addr[SMC_LINKS_PER_LGR_MAX];
/* mapped address of buffer */
void *cpu_addr; /* virtual address of buffer */ void *cpu_addr; /* virtual address of buffer */
u32 rkey[SMC_LINKS_PER_LGR_MAX]; struct sg_table sgt[SMC_LINKS_PER_LGR_MAX];/* virtual buffer */
/* for rmb only: struct ib_mr *mr_rx[SMC_LINKS_PER_LGR_MAX];
* rkey provided to peer /* for rmb only: memory region
* incl. rkey provided to peer
*/ */
u32 order; /* allocation order */
u32 used; /* currently used / unused */ u32 used; /* currently used / unused */
bool reused; /* new created / reused */
}; };
struct smc_rtoken { /* address/key of remote RMB */ struct smc_rtoken { /* address/key of remote RMB */
...@@ -173,9 +186,11 @@ struct smc_clc_msg_accept_confirm; ...@@ -173,9 +186,11 @@ struct smc_clc_msg_accept_confirm;
void smc_lgr_free(struct smc_link_group *lgr); void smc_lgr_free(struct smc_link_group *lgr);
void smc_lgr_terminate(struct smc_link_group *lgr); void smc_lgr_terminate(struct smc_link_group *lgr);
int smc_sndbuf_create(struct smc_sock *smc); int smc_buf_create(struct smc_sock *smc);
int smc_rmb_create(struct smc_sock *smc);
int smc_rmb_rtoken_handling(struct smc_connection *conn, int smc_rmb_rtoken_handling(struct smc_connection *conn,
struct smc_clc_msg_accept_confirm *clc); struct smc_clc_msg_accept_confirm *clc);
void smc_sndbuf_sync_sg_for_cpu(struct smc_connection *conn);
void smc_sndbuf_sync_sg_for_device(struct smc_connection *conn);
void smc_rmb_sync_sg_for_cpu(struct smc_connection *conn);
void smc_rmb_sync_sg_for_device(struct smc_connection *conn);
#endif #endif
...@@ -13,6 +13,7 @@ ...@@ -13,6 +13,7 @@
#include <linux/random.h> #include <linux/random.h>
#include <linux/workqueue.h> #include <linux/workqueue.h>
#include <linux/scatterlist.h>
#include <rdma/ib_verbs.h> #include <rdma/ib_verbs.h>
#include "smc_pnet.h" #include "smc_pnet.h"
...@@ -192,8 +193,7 @@ int smc_ib_create_protection_domain(struct smc_link *lnk) ...@@ -192,8 +193,7 @@ int smc_ib_create_protection_domain(struct smc_link *lnk)
{ {
int rc; int rc;
lnk->roce_pd = ib_alloc_pd(lnk->smcibdev->ibdev, lnk->roce_pd = ib_alloc_pd(lnk->smcibdev->ibdev, 0);
IB_PD_UNSAFE_GLOBAL_RKEY);
rc = PTR_ERR_OR_ZERO(lnk->roce_pd); rc = PTR_ERR_OR_ZERO(lnk->roce_pd);
if (IS_ERR(lnk->roce_pd)) if (IS_ERR(lnk->roce_pd))
lnk->roce_pd = NULL; lnk->roce_pd = NULL;
...@@ -232,10 +232,10 @@ int smc_ib_create_queue_pair(struct smc_link *lnk) ...@@ -232,10 +232,10 @@ int smc_ib_create_queue_pair(struct smc_link *lnk)
.recv_cq = lnk->smcibdev->roce_cq_recv, .recv_cq = lnk->smcibdev->roce_cq_recv,
.srq = NULL, .srq = NULL,
.cap = { .cap = {
.max_send_wr = SMC_WR_BUF_CNT,
/* include unsolicited rdma_writes as well, /* include unsolicited rdma_writes as well,
* there are max. 2 RDMA_WRITE per 1 WR_SEND * there are max. 2 RDMA_WRITE per 1 WR_SEND
*/ */
.max_send_wr = SMC_WR_BUF_CNT * 3,
.max_recv_wr = SMC_WR_BUF_CNT * 3, .max_recv_wr = SMC_WR_BUF_CNT * 3,
.max_send_sge = SMC_IB_MAX_SEND_SGE, .max_send_sge = SMC_IB_MAX_SEND_SGE,
.max_recv_sge = 1, .max_recv_sge = 1,
...@@ -254,33 +254,117 @@ int smc_ib_create_queue_pair(struct smc_link *lnk) ...@@ -254,33 +254,117 @@ int smc_ib_create_queue_pair(struct smc_link *lnk)
return rc; return rc;
} }
/* map a new TX or RX buffer to DMA */ void smc_ib_put_memory_region(struct ib_mr *mr)
int smc_ib_buf_map(struct smc_ib_device *smcibdev, int buf_size,
struct smc_buf_desc *buf_slot,
enum dma_data_direction data_direction)
{ {
int rc = 0; ib_dereg_mr(mr);
}
if (buf_slot->dma_addr[SMC_SINGLE_LINK]) static int smc_ib_map_mr_sg(struct smc_buf_desc *buf_slot)
return rc; /* already mapped */ {
buf_slot->dma_addr[SMC_SINGLE_LINK] = unsigned int offset = 0;
ib_dma_map_single(smcibdev->ibdev, buf_slot->cpu_addr, int sg_num;
buf_size, data_direction);
if (ib_dma_mapping_error(smcibdev->ibdev, /* map the largest prefix of a dma mapped SG list */
buf_slot->dma_addr[SMC_SINGLE_LINK])) sg_num = ib_map_mr_sg(buf_slot->mr_rx[SMC_SINGLE_LINK],
rc = -EIO; buf_slot->sgt[SMC_SINGLE_LINK].sgl,
return rc; buf_slot->sgt[SMC_SINGLE_LINK].orig_nents,
&offset, PAGE_SIZE);
return sg_num;
}
/* Allocate a memory region and map the dma mapped SG list of buf_slot */
int smc_ib_get_memory_region(struct ib_pd *pd, int access_flags,
struct smc_buf_desc *buf_slot)
{
if (buf_slot->mr_rx[SMC_SINGLE_LINK])
return 0; /* already done */
buf_slot->mr_rx[SMC_SINGLE_LINK] =
ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG, 1 << buf_slot->order);
if (IS_ERR(buf_slot->mr_rx[SMC_SINGLE_LINK])) {
int rc;
rc = PTR_ERR(buf_slot->mr_rx[SMC_SINGLE_LINK]);
buf_slot->mr_rx[SMC_SINGLE_LINK] = NULL;
return rc;
}
if (smc_ib_map_mr_sg(buf_slot) != 1)
return -EINVAL;
return 0;
} }
void smc_ib_buf_unmap(struct smc_ib_device *smcibdev, int buf_size, /* synchronize buffer usage for cpu access */
void smc_ib_sync_sg_for_cpu(struct smc_ib_device *smcibdev,
struct smc_buf_desc *buf_slot,
enum dma_data_direction data_direction)
{
struct scatterlist *sg;
unsigned int i;
/* for now there is just one DMA address */
for_each_sg(buf_slot->sgt[SMC_SINGLE_LINK].sgl, sg,
buf_slot->sgt[SMC_SINGLE_LINK].nents, i) {
if (!sg_dma_len(sg))
break;
ib_dma_sync_single_for_cpu(smcibdev->ibdev,
sg_dma_address(sg),
sg_dma_len(sg),
data_direction);
}
}
/* synchronize buffer usage for device access */
void smc_ib_sync_sg_for_device(struct smc_ib_device *smcibdev,
struct smc_buf_desc *buf_slot,
enum dma_data_direction data_direction)
{
struct scatterlist *sg;
unsigned int i;
/* for now there is just one DMA address */
for_each_sg(buf_slot->sgt[SMC_SINGLE_LINK].sgl, sg,
buf_slot->sgt[SMC_SINGLE_LINK].nents, i) {
if (!sg_dma_len(sg))
break;
ib_dma_sync_single_for_device(smcibdev->ibdev,
sg_dma_address(sg),
sg_dma_len(sg),
data_direction);
}
}
/* Map a new TX or RX buffer SG-table to DMA */
int smc_ib_buf_map_sg(struct smc_ib_device *smcibdev,
struct smc_buf_desc *buf_slot, struct smc_buf_desc *buf_slot,
enum dma_data_direction data_direction) enum dma_data_direction data_direction)
{ {
if (!buf_slot->dma_addr[SMC_SINGLE_LINK]) int mapped_nents;
mapped_nents = ib_dma_map_sg(smcibdev->ibdev,
buf_slot->sgt[SMC_SINGLE_LINK].sgl,
buf_slot->sgt[SMC_SINGLE_LINK].orig_nents,
data_direction);
if (!mapped_nents)
return -ENOMEM;
return mapped_nents;
}
void smc_ib_buf_unmap_sg(struct smc_ib_device *smcibdev,
struct smc_buf_desc *buf_slot,
enum dma_data_direction data_direction)
{
if (!buf_slot->sgt[SMC_SINGLE_LINK].sgl->dma_address)
return; /* already unmapped */ return; /* already unmapped */
ib_dma_unmap_single(smcibdev->ibdev, *buf_slot->dma_addr, buf_size,
data_direction); ib_dma_unmap_sg(smcibdev->ibdev,
buf_slot->dma_addr[SMC_SINGLE_LINK] = 0; buf_slot->sgt[SMC_SINGLE_LINK].sgl,
buf_slot->sgt[SMC_SINGLE_LINK].orig_nents,
data_direction);
buf_slot->sgt[SMC_SINGLE_LINK].sgl->dma_address = 0;
} }
static int smc_ib_fill_gid_and_mac(struct smc_ib_device *smcibdev, u8 ibport) static int smc_ib_fill_gid_and_mac(struct smc_ib_device *smcibdev, u8 ibport)
......
...@@ -51,12 +51,12 @@ int smc_ib_register_client(void) __init; ...@@ -51,12 +51,12 @@ int smc_ib_register_client(void) __init;
void smc_ib_unregister_client(void); void smc_ib_unregister_client(void);
bool smc_ib_port_active(struct smc_ib_device *smcibdev, u8 ibport); bool smc_ib_port_active(struct smc_ib_device *smcibdev, u8 ibport);
int smc_ib_remember_port_attr(struct smc_ib_device *smcibdev, u8 ibport); int smc_ib_remember_port_attr(struct smc_ib_device *smcibdev, u8 ibport);
int smc_ib_buf_map(struct smc_ib_device *smcibdev, int buf_size, int smc_ib_buf_map_sg(struct smc_ib_device *smcibdev,
struct smc_buf_desc *buf_slot,
enum dma_data_direction data_direction);
void smc_ib_buf_unmap(struct smc_ib_device *smcibdev, int bufsize,
struct smc_buf_desc *buf_slot, struct smc_buf_desc *buf_slot,
enum dma_data_direction data_direction); enum dma_data_direction data_direction);
void smc_ib_buf_unmap_sg(struct smc_ib_device *smcibdev,
struct smc_buf_desc *buf_slot,
enum dma_data_direction data_direction);
void smc_ib_dealloc_protection_domain(struct smc_link *lnk); void smc_ib_dealloc_protection_domain(struct smc_link *lnk);
int smc_ib_create_protection_domain(struct smc_link *lnk); int smc_ib_create_protection_domain(struct smc_link *lnk);
void smc_ib_destroy_queue_pair(struct smc_link *lnk); void smc_ib_destroy_queue_pair(struct smc_link *lnk);
...@@ -65,6 +65,13 @@ int smc_ib_ready_link(struct smc_link *lnk); ...@@ -65,6 +65,13 @@ int smc_ib_ready_link(struct smc_link *lnk);
int smc_ib_modify_qp_rts(struct smc_link *lnk); int smc_ib_modify_qp_rts(struct smc_link *lnk);
int smc_ib_modify_qp_reset(struct smc_link *lnk); int smc_ib_modify_qp_reset(struct smc_link *lnk);
long smc_ib_setup_per_ibdev(struct smc_ib_device *smcibdev); long smc_ib_setup_per_ibdev(struct smc_ib_device *smcibdev);
int smc_ib_get_memory_region(struct ib_pd *pd, int access_flags,
struct smc_buf_desc *buf_slot);
void smc_ib_put_memory_region(struct ib_mr *mr);
void smc_ib_sync_sg_for_cpu(struct smc_ib_device *smcibdev,
struct smc_buf_desc *buf_slot,
enum dma_data_direction data_direction);
void smc_ib_sync_sg_for_device(struct smc_ib_device *smcibdev,
struct smc_buf_desc *buf_slot,
enum dma_data_direction data_direction);
#endif #endif
...@@ -170,6 +170,7 @@ int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg, size_t len, ...@@ -170,6 +170,7 @@ int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg, size_t len,
copylen, conn->rmbe_size - cons.count); copylen, conn->rmbe_size - cons.count);
chunk_len_sum = chunk_len; chunk_len_sum = chunk_len;
chunk_off = cons.count; chunk_off = cons.count;
smc_rmb_sync_sg_for_cpu(conn);
for (chunk = 0; chunk < 2; chunk++) { for (chunk = 0; chunk < 2; chunk++) {
if (!(flags & MSG_TRUNC)) { if (!(flags & MSG_TRUNC)) {
rc = memcpy_to_msg(msg, rcvbuf_base + chunk_off, rc = memcpy_to_msg(msg, rcvbuf_base + chunk_off,
...@@ -177,6 +178,7 @@ int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg, size_t len, ...@@ -177,6 +178,7 @@ int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg, size_t len,
if (rc) { if (rc) {
if (!read_done) if (!read_done)
read_done = -EFAULT; read_done = -EFAULT;
smc_rmb_sync_sg_for_device(conn);
goto out; goto out;
} }
} }
...@@ -190,6 +192,7 @@ int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg, size_t len, ...@@ -190,6 +192,7 @@ int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg, size_t len,
chunk_len_sum += chunk_len; chunk_len_sum += chunk_len;
chunk_off = 0; /* modulo offset in recv ring buffer */ chunk_off = 0; /* modulo offset in recv ring buffer */
} }
smc_rmb_sync_sg_for_device(conn);
/* update cursors */ /* update cursors */
if (!(flags & MSG_PEEK)) { if (!(flags & MSG_PEEK)) {
......
...@@ -174,10 +174,12 @@ int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len) ...@@ -174,10 +174,12 @@ int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len)
copylen, conn->sndbuf_size - tx_cnt_prep); copylen, conn->sndbuf_size - tx_cnt_prep);
chunk_len_sum = chunk_len; chunk_len_sum = chunk_len;
chunk_off = tx_cnt_prep; chunk_off = tx_cnt_prep;
smc_sndbuf_sync_sg_for_cpu(conn);
for (chunk = 0; chunk < 2; chunk++) { for (chunk = 0; chunk < 2; chunk++) {
rc = memcpy_from_msg(sndbuf_base + chunk_off, rc = memcpy_from_msg(sndbuf_base + chunk_off,
msg, chunk_len); msg, chunk_len);
if (rc) { if (rc) {
smc_sndbuf_sync_sg_for_device(conn);
if (send_done) if (send_done)
return send_done; return send_done;
goto out_err; goto out_err;
...@@ -192,6 +194,7 @@ int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len) ...@@ -192,6 +194,7 @@ int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len)
chunk_len_sum += chunk_len; chunk_len_sum += chunk_len;
chunk_off = 0; /* modulo offset in send ring buffer */ chunk_off = 0; /* modulo offset in send ring buffer */
} }
smc_sndbuf_sync_sg_for_device(conn);
/* update cursors */ /* update cursors */
smc_curs_add(conn->sndbuf_size, &prep, copylen); smc_curs_add(conn->sndbuf_size, &prep, copylen);
smc_curs_write(&conn->tx_curs_prep, smc_curs_write(&conn->tx_curs_prep,
...@@ -277,6 +280,7 @@ static int smc_tx_rdma_writes(struct smc_connection *conn) ...@@ -277,6 +280,7 @@ static int smc_tx_rdma_writes(struct smc_connection *conn)
struct smc_link_group *lgr = conn->lgr; struct smc_link_group *lgr = conn->lgr;
int to_send, rmbespace; int to_send, rmbespace;
struct smc_link *link; struct smc_link *link;
dma_addr_t dma_addr;
int num_sges; int num_sges;
int rc; int rc;
...@@ -334,12 +338,11 @@ static int smc_tx_rdma_writes(struct smc_connection *conn) ...@@ -334,12 +338,11 @@ static int smc_tx_rdma_writes(struct smc_connection *conn)
src_len = conn->sndbuf_size - sent.count; src_len = conn->sndbuf_size - sent.count;
} }
src_len_sum = src_len; src_len_sum = src_len;
dma_addr = sg_dma_address(conn->sndbuf_desc->sgt[SMC_SINGLE_LINK].sgl);
for (dstchunk = 0; dstchunk < 2; dstchunk++) { for (dstchunk = 0; dstchunk < 2; dstchunk++) {
num_sges = 0; num_sges = 0;
for (srcchunk = 0; srcchunk < 2; srcchunk++) { for (srcchunk = 0; srcchunk < 2; srcchunk++) {
sges[srcchunk].addr = sges[srcchunk].addr = dma_addr + src_off;
conn->sndbuf_desc->dma_addr[SMC_SINGLE_LINK] +
src_off;
sges[srcchunk].length = src_len; sges[srcchunk].length = src_len;
sges[srcchunk].lkey = link->roce_pd->local_dma_lkey; sges[srcchunk].lkey = link->roce_pd->local_dma_lkey;
num_sges++; num_sges++;
......
...@@ -68,6 +68,16 @@ static inline void smc_wr_tx_process_cqe(struct ib_wc *wc) ...@@ -68,6 +68,16 @@ static inline void smc_wr_tx_process_cqe(struct ib_wc *wc)
int i; int i;
link = wc->qp->qp_context; link = wc->qp->qp_context;
if (wc->opcode == IB_WC_REG_MR) {
if (wc->status)
link->wr_reg_state = FAILED;
else
link->wr_reg_state = CONFIRMED;
wake_up(&link->wr_reg_wait);
return;
}
pnd_snd_idx = smc_wr_tx_find_pending_index(link, wc->wr_id); pnd_snd_idx = smc_wr_tx_find_pending_index(link, wc->wr_id);
if (pnd_snd_idx == link->wr_tx_cnt) if (pnd_snd_idx == link->wr_tx_cnt)
return; return;
...@@ -243,6 +253,52 @@ int smc_wr_tx_send(struct smc_link *link, struct smc_wr_tx_pend_priv *priv) ...@@ -243,6 +253,52 @@ int smc_wr_tx_send(struct smc_link *link, struct smc_wr_tx_pend_priv *priv)
return rc; return rc;
} }
/* Register a memory region and wait for result.
 *
 * Posts the link's prepared REG_MR work request for @mr on the link's QP
 * and sleeps (interruptibly, bounded by SMC_WR_REG_MR_WAIT_TIME) until
 * link->wr_reg_state leaves POSTED.
 *
 * Returns 0 when the registration was confirmed, the ib_post_send() error
 * if posting failed, -EINTR if the wait was interrupted, -EIO if the
 * registration failed, and -EPIPE on timeout (the link group is
 * terminated in that case) or if the state is still POSTED after wakeup.
 */
int smc_wr_reg_send(struct smc_link *link, struct ib_mr *mr)
{
	struct smc_link_group *lgr;
	struct ib_send_wr *bad_wr;
	int rc;

	ib_req_notify_cq(link->smcibdev->roce_cq_send,
			 IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS);

	/* mark the request as outstanding before it is handed to the QP */
	link->wr_reg_state = POSTED;
	link->wr_reg.mr = mr;
	link->wr_reg.key = mr->rkey;
	link->wr_reg.wr.wr_id = (u64)(uintptr_t)mr;

	bad_wr = &link->wr_reg.wr;
	rc = ib_post_send(link->roce_qp, &link->wr_reg.wr, &bad_wr);
	WARN_ON(bad_wr != &link->wr_reg.wr);
	if (rc)
		return rc;

	rc = wait_event_interruptible_timeout(link->wr_reg_wait,
					      (link->wr_reg_state != POSTED),
					      SMC_WR_REG_MR_WAIT_TIME);
	if (rc == -ERESTARTSYS)
		return -EINTR;
	if (rc == 0) {
		/* timeout - terminate connections */
		lgr = container_of(link, struct smc_link_group,
				   lnk[SMC_SINGLE_LINK]);
		smc_lgr_terminate(lgr);
		return -EPIPE;
	}

	/* woken up in time: translate the final registration state */
	switch (link->wr_reg_state) {
	case CONFIRMED:
		return 0;
	case FAILED:
		return -EIO;
	case POSTED:
		return -EPIPE;
	}
	return rc;
}
void smc_wr_tx_dismiss_slots(struct smc_link *link, u8 wr_rx_hdr_type, void smc_wr_tx_dismiss_slots(struct smc_link *link, u8 wr_rx_hdr_type,
smc_wr_tx_filter filter, smc_wr_tx_filter filter,
smc_wr_tx_dismisser dismisser, smc_wr_tx_dismisser dismisser,
...@@ -458,6 +514,11 @@ static void smc_wr_init_sge(struct smc_link *lnk) ...@@ -458,6 +514,11 @@ static void smc_wr_init_sge(struct smc_link *lnk)
lnk->wr_rx_ibs[i].sg_list = &lnk->wr_rx_sges[i]; lnk->wr_rx_ibs[i].sg_list = &lnk->wr_rx_sges[i];
lnk->wr_rx_ibs[i].num_sge = 1; lnk->wr_rx_ibs[i].num_sge = 1;
} }
lnk->wr_reg.wr.next = NULL;
lnk->wr_reg.wr.num_sge = 0;
lnk->wr_reg.wr.send_flags = IB_SEND_SIGNALED;
lnk->wr_reg.wr.opcode = IB_WR_REG_MR;
lnk->wr_reg.access = IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE;
} }
void smc_wr_free_link(struct smc_link *lnk) void smc_wr_free_link(struct smc_link *lnk)
...@@ -602,6 +663,8 @@ int smc_wr_create_link(struct smc_link *lnk) ...@@ -602,6 +663,8 @@ int smc_wr_create_link(struct smc_link *lnk)
smc_wr_init_sge(lnk); smc_wr_init_sge(lnk);
memset(lnk->wr_tx_mask, 0, memset(lnk->wr_tx_mask, 0,
BITS_TO_LONGS(SMC_WR_BUF_CNT) * sizeof(*lnk->wr_tx_mask)); BITS_TO_LONGS(SMC_WR_BUF_CNT) * sizeof(*lnk->wr_tx_mask));
init_waitqueue_head(&lnk->wr_tx_wait);
init_waitqueue_head(&lnk->wr_reg_wait);
return rc; return rc;
dma_unmap: dma_unmap:
......
...@@ -102,5 +102,6 @@ void smc_wr_tx_dismiss_slots(struct smc_link *lnk, u8 wr_rx_hdr_type, ...@@ -102,5 +102,6 @@ void smc_wr_tx_dismiss_slots(struct smc_link *lnk, u8 wr_rx_hdr_type,
int smc_wr_rx_register_handler(struct smc_wr_rx_handler *handler); int smc_wr_rx_register_handler(struct smc_wr_rx_handler *handler);
int smc_wr_rx_post_init(struct smc_link *link); int smc_wr_rx_post_init(struct smc_link *link);
void smc_wr_rx_cq_handler(struct ib_cq *ib_cq, void *cq_context); void smc_wr_rx_cq_handler(struct ib_cq *ib_cq, void *cq_context);
int smc_wr_reg_send(struct smc_link *link, struct ib_mr *mr);
#endif /* SMC_WR_H */ #endif /* SMC_WR_H */
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册