diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 173a7c2f96368ee6c6da6bbcf0ca6ff7e1150ce8..8c431385b2721285509e7a0182d3002a201f1592 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -98,7 +98,8 @@ struct tcp_options_received { tstamp_ok : 1, /* TIMESTAMP seen on SYN packet */ dsack : 1, /* D-SACK is scheduled */ wscale_ok : 1, /* Wscale seen on SYN packet */ - sack_ok : 4, /* SACK seen on SYN packet */ + sack_ok : 3, /* SACK seen on SYN packet */ + smc_ok : 1, /* SMC seen on SYN packet */ snd_wscale : 4, /* Window scaling received from sender */ rcv_wscale : 4; /* Window scaling to send to receiver */ u8 num_sacks; /* Number of SACK blocks */ @@ -110,6 +111,9 @@ static inline void tcp_clear_options(struct tcp_options_received *rx_opt) { rx_opt->tstamp_ok = rx_opt->sack_ok = 0; rx_opt->wscale_ok = rx_opt->snd_wscale = 0; +#if IS_ENABLED(CONFIG_SMC) + rx_opt->smc_ok = 0; +#endif } /* This is the max number of SACKS that we'll generate and process. It's safe @@ -229,7 +233,8 @@ struct tcp_sock { syn_fastopen_ch:1, /* Active TFO re-enabling probe */ syn_data_acked:1,/* data in SYN is acked by SYN-ACK */ save_syn:1, /* Save headers of SYN packet */ - is_cwnd_limited:1;/* forward progress limited by snd_cwnd? */ + is_cwnd_limited:1,/* forward progress limited by snd_cwnd? */ + syn_smc:1; /* SYN includes SMC */ u32 tlp_high_seq; /* snd_nxt at the time of TLP retransmit. */ /* RTT measurement */ diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h index 425752f768d2f1a0efb13964204e07f27609e9db..c49938d1481ae6b8bb38a8e14cb448413fc7ad1e 100644 --- a/include/net/inet_sock.h +++ b/include/net/inet_sock.h @@ -92,7 +92,8 @@ struct inet_request_sock { wscale_ok : 1, ecn_ok : 1, acked : 1, - no_srccheck: 1; + no_srccheck: 1, + smc_ok : 1; kmemcheck_bitfield_end(flags); u32 ir_mark; union { diff --git a/include/net/tcp.h b/include/net/tcp.h index 2392f74074e71af6a6f8113f1ba9c13c7ee5c1bc..285bc82dea410b22ac585ee65daff5cbac7c3fc7 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -191,6 +191,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo); * experimental options. See draft-ietf-tcpm-experimental-options-00.txt */ #define TCPOPT_FASTOPEN_MAGIC 0xF989 +#define TCPOPT_SMC_MAGIC 0xE2D4C3D9 /* * TCP option lengths @@ -203,6 +204,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo); #define TCPOLEN_MD5SIG 18 #define TCPOLEN_FASTOPEN_BASE 2 #define TCPOLEN_EXP_FASTOPEN_BASE 4 +#define TCPOLEN_EXP_SMC_BASE 6 /* But this is what stacks really send out. */ #define TCPOLEN_TSTAMP_ALIGNED 12 @@ -213,6 +215,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo); #define TCPOLEN_SACK_PERBLOCK 8 #define TCPOLEN_MD5SIG_ALIGNED 20 #define TCPOLEN_MSS_ALIGNED 4 +#define TCPOLEN_EXP_SMC_BASE_ALIGNED 8 /* Flags in tp->nonagle */ #define TCP_NAGLE_OFF 1 /* Nagle's algo is disabled */ @@ -2108,4 +2111,8 @@ static inline bool tcp_bpf_ca_needs_ecn(struct sock *sk) { return (tcp_call_bpf(sk, BPF_SOCK_OPS_NEEDS_ECN) == 1); } + +#if IS_ENABLED(CONFIG_SMC) +extern struct static_key_false tcp_have_smc; +#endif #endif /* _TCP_H */ diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 8f36277e82e9dbea750ce66b73018a81b30b5156..f6e1c00e300eeedcfe2ff0f4f2a4e1d997cd315d 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -270,6 +270,7 @@ #include #include #include +#include #include #include @@ -302,6 +303,11 @@ EXPORT_SYMBOL(sysctl_tcp_wmem); atomic_long_t tcp_memory_allocated; /* Current allocated memory. */ EXPORT_SYMBOL(tcp_memory_allocated); +#if IS_ENABLED(CONFIG_SMC) +DEFINE_STATIC_KEY_FALSE(tcp_have_smc); +EXPORT_SYMBOL(tcp_have_smc); +#endif + /* * Current number of TCP sockets. */ diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 893286db4623773afe75f4e0cfea7ad0d24ea2fe..337f6011528a7d4c3ab7fdcc0623496cfefafc71 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -76,6 +76,8 @@ #include #include #include +#include +#include int sysctl_tcp_fack __read_mostly; int sysctl_tcp_max_reordering __read_mostly = 300; @@ -3737,6 +3739,21 @@ static void tcp_parse_fastopen_option(int len, const unsigned char *cookie, foc->exp = exp_opt; } +static void smc_parse_options(const struct tcphdr *th, + struct tcp_options_received *opt_rx, + const unsigned char *ptr, + int opsize) +{ +#if IS_ENABLED(CONFIG_SMC) + if (static_branch_unlikely(&tcp_have_smc)) { + if (th->syn && !(opsize & 1) && + opsize >= TCPOLEN_EXP_SMC_BASE && + get_unaligned_be32(ptr) == TCPOPT_SMC_MAGIC) + opt_rx->smc_ok = 1; + } +#endif +} + /* Look for tcp options. Normally only called on SYN and SYNACK packets. * But, this can also be called on packets in the established flow when * the fast version below fails. @@ -3844,6 +3861,9 @@ void tcp_parse_options(const struct net *net, tcp_parse_fastopen_option(opsize - TCPOLEN_EXP_FASTOPEN_BASE, ptr + 2, th->syn, foc, true); + else + smc_parse_options(th, opt_rx, ptr, + opsize); break; } @@ -5598,6 +5618,16 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack, return false; } +static void smc_check_reset_syn(struct tcp_sock *tp) +{ +#if IS_ENABLED(CONFIG_SMC) + if (static_branch_unlikely(&tcp_have_smc)) { + if (tp->syn_smc && !tp->rx_opt.smc_ok) + tp->syn_smc = 0; + } +#endif +} + static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, const struct tcphdr *th) { @@ -5704,6 +5734,8 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, * is initialized. */ tp->copied_seq = tp->rcv_nxt; + smc_check_reset_syn(tp); + smp_mb(); tcp_finish_connect(sk, skb); @@ -6157,6 +6189,9 @@ static void tcp_openreq_init(struct request_sock *req, ireq->ir_rmt_port = tcp_hdr(skb)->source; ireq->ir_num = ntohs(tcp_hdr(skb)->dest); ireq->ir_mark = inet_request_mark(sk, skb); +#if IS_ENABLED(CONFIG_SMC) + ireq->smc_ok = rx_opt->smc_ok; +#endif } struct request_sock *inet_reqsk_alloc(const struct request_sock_ops *ops, diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index a952357054f4ddfbb98746ebcf323d1c45f7e951..056009f1c14f13ac4af987d0a7451f32dbde0023 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -416,6 +417,21 @@ void tcp_ca_openreq_child(struct sock *sk, const struct dst_entry *dst) } EXPORT_SYMBOL_GPL(tcp_ca_openreq_child); +static void smc_check_reset_syn_req(struct tcp_sock *oldtp, + struct request_sock *req, + struct tcp_sock *newtp) +{ +#if IS_ENABLED(CONFIG_SMC) + struct inet_request_sock *ireq; + + if (static_branch_unlikely(&tcp_have_smc)) { + ireq = inet_rsk(req); + if (oldtp->syn_smc && !ireq->smc_ok) + newtp->syn_smc = 0; + } +#endif +} + /* This is not only more efficient than what we used to do, it eliminates * a lot of code duplication between IPv4/IPv6 SYN recv processing. -DaveM * @@ -433,6 +449,9 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, struct tcp_request_sock *treq = tcp_rsk(req); struct inet_connection_sock *newicsk = inet_csk(newsk); struct tcp_sock *newtp = tcp_sk(newsk); + struct tcp_sock *oldtp = tcp_sk(sk); + + smc_check_reset_syn_req(oldtp, req, newtp); /* Now setup tcp_sock */ newtp->pred_flags = 0; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 1f01f4c9c738484822aee7a6810d90857e6a72dc..c8fc512e0bbb48f7d36e159e8aae56ec70a24498 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -41,6 +41,7 @@ #include #include #include +#include #include @@ -422,6 +423,22 @@ static inline bool tcp_urg_mode(const struct tcp_sock *tp) #define OPTION_MD5 (1 << 2) #define OPTION_WSCALE (1 << 3) #define OPTION_FAST_OPEN_COOKIE (1 << 8) +#define OPTION_SMC (1 << 9) + +static void smc_options_write(__be32 *ptr, u16 *options) +{ +#if IS_ENABLED(CONFIG_SMC) + if (static_branch_unlikely(&tcp_have_smc)) { + if (unlikely(OPTION_SMC & *options)) { + *ptr++ = htonl((TCPOPT_NOP << 24) | + (TCPOPT_NOP << 16) | + (TCPOPT_EXP << 8) | + (TCPOLEN_EXP_SMC_BASE)); + *ptr++ = htonl(TCPOPT_SMC_MAGIC); + } + } +#endif +} struct tcp_out_options { u16 options; /* bit field of OPTION_* */ @@ -540,6 +557,41 @@ static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp, } ptr += (len + 3) >> 2; } + + smc_options_write(ptr, &options); +} + +static void smc_set_option(const struct tcp_sock *tp, + struct tcp_out_options *opts, + unsigned int *remaining) +{ +#if IS_ENABLED(CONFIG_SMC) + if (static_branch_unlikely(&tcp_have_smc)) { + if (tp->syn_smc) { + if (*remaining >= TCPOLEN_EXP_SMC_BASE_ALIGNED) { + opts->options |= OPTION_SMC; + *remaining -= TCPOLEN_EXP_SMC_BASE_ALIGNED; + } + } + } +#endif +} + +static void smc_set_option_cond(const struct tcp_sock *tp, + const struct inet_request_sock *ireq, + struct tcp_out_options *opts, + unsigned int *remaining) +{ +#if IS_ENABLED(CONFIG_SMC) + if (static_branch_unlikely(&tcp_have_smc)) { + if (tp->syn_smc && ireq->smc_ok) { + if (*remaining >= TCPOLEN_EXP_SMC_BASE_ALIGNED) { + opts->options |= OPTION_SMC; + *remaining -= TCPOLEN_EXP_SMC_BASE_ALIGNED; + } + } + } +#endif } /* Compute TCP options for SYN packets. This is not the final @@ -607,11 +659,14 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb, } } + smc_set_option(tp, opts, &remaining); + return MAX_TCP_OPTION_SPACE - remaining; } /* Set up TCP options for SYN-ACKs. */ -static unsigned int tcp_synack_options(struct request_sock *req, +static unsigned int tcp_synack_options(const struct sock *sk, + struct request_sock *req, unsigned int mss, struct sk_buff *skb, struct tcp_out_options *opts, const struct tcp_md5sig_key *md5, @@ -667,6 +722,8 @@ static unsigned int tcp_synack_options(struct request_sock *req, } } + smc_set_option_cond(tcp_sk(sk), ireq, opts, &remaining); + return MAX_TCP_OPTION_SPACE - remaining; } @@ -3195,8 +3252,8 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, md5 = tcp_rsk(req)->af_specific->req_md5_lookup(sk, req_to_sk(req)); #endif skb_set_hash(skb, tcp_rsk(req)->txhash, PKT_HASH_TYPE_L4); - tcp_header_size = tcp_synack_options(req, mss, skb, &opts, md5, foc) + - sizeof(*th); + tcp_header_size = tcp_synack_options(sk, req, mss, skb, &opts, md5, + foc) + sizeof(*th); skb_push(skb, tcp_header_size); skb_reset_transport_header(skb); diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 745f145d4c4d43316f3cb30083066456497fec3a..6451c5013e06a93d75c4bd2a9ad95f2ab0873f53 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -390,6 +390,12 @@ static int smc_connect_rdma(struct smc_sock *smc) int rc = 0; u8 ibport; + if (!tcp_sk(smc->clcsock->sk)->syn_smc) { + /* peer has not signalled SMC-capability */ + smc->use_fallback = true; + goto out_connected; + } + /* IPSec connections opt out of SMC-R optimizations */ if (using_ipsec(smc)) { reason_code = SMC_CLC_DECL_IPSEC; @@ -555,6 +561,7 @@ static int smc_connect(struct socket *sock, struct sockaddr *addr, } smc_copy_sock_settings_to_clc(smc); + tcp_sk(smc->clcsock->sk)->syn_smc = 1; rc = kernel_connect(smc->clcsock, addr, alen, flags); if (rc) goto out; @@ -759,6 +766,12 @@ static void smc_listen_work(struct work_struct *work) u8 prefix_len; u8 ibport; + /* check if peer is smc capable */ + if (!tcp_sk(newclcsock->sk)->syn_smc) { + new_smc->use_fallback = true; + goto out_connected; + } + /* do inband token exchange - *wait for and receive SMC Proposal CLC message */ @@ -808,7 +821,7 @@ static void smc_listen_work(struct work_struct *work) rc = local_contact; if (rc == -ENOMEM) reason_code = SMC_CLC_DECL_MEM;/* insufficient memory*/ - goto decline_rdma; + goto decline_rdma_unlock; } link = &new_smc->conn.lgr->lnk[SMC_SINGLE_LINK]; @@ -816,7 +829,7 @@ static void smc_listen_work(struct work_struct *work) rc = smc_buf_create(new_smc); if (rc) { reason_code = SMC_CLC_DECL_MEM; - goto decline_rdma; + goto decline_rdma_unlock; } smc_close_init(new_smc); @@ -831,7 +844,7 @@ static void smc_listen_work(struct work_struct *work) buf_desc->mr_rx[SMC_SINGLE_LINK]); if (rc) { reason_code = SMC_CLC_DECL_INTERR; - goto decline_rdma; + goto decline_rdma_unlock; } } } @@ -839,15 +852,15 @@ static void smc_listen_work(struct work_struct *work) rc = smc_clc_send_accept(new_smc, local_contact); if (rc) - goto out_err; + goto out_err_unlock; /* receive SMC Confirm CLC message */ reason_code = smc_clc_wait_msg(new_smc, &cclc, sizeof(cclc), SMC_CLC_CONFIRM); if (reason_code < 0) - goto out_err; + goto out_err_unlock; if (reason_code > 0) - goto decline_rdma; + goto decline_rdma_unlock; smc_conn_save_peer_info(new_smc, &cclc); if (local_contact == SMC_FIRST_CONTACT) smc_link_save_peer_info(link, &cclc); @@ -855,34 +868,34 @@ static void smc_listen_work(struct work_struct *work) rc = smc_rmb_rtoken_handling(&new_smc->conn, &cclc); if (rc) { reason_code = SMC_CLC_DECL_INTERR; - goto decline_rdma; + goto decline_rdma_unlock; } if (local_contact == SMC_FIRST_CONTACT) { rc = smc_ib_ready_link(link); if (rc) { reason_code = SMC_CLC_DECL_INTERR; - goto decline_rdma; + goto decline_rdma_unlock; } /* QP confirmation over RoCE fabric */ reason_code = smc_serv_conf_first_link(new_smc); if (reason_code < 0) { /* peer is not aware of a problem */ rc = reason_code; - goto out_err; + goto out_err_unlock; } if (reason_code > 0) - goto decline_rdma; + goto decline_rdma_unlock; } smc_tx_init(new_smc); + mutex_unlock(&smc_create_lgr_pending); out_connected: sk_refcnt_debug_inc(newsmcsk); if (newsmcsk->sk_state == SMC_INIT) newsmcsk->sk_state = SMC_ACTIVE; enqueue: - mutex_unlock(&smc_create_lgr_pending); lock_sock_nested(&lsmc->sk, SINGLE_DEPTH_NESTING); if (lsmc->sk.sk_state == SMC_LISTEN) { smc_accept_enqueue(&lsmc->sk, newsmcsk); @@ -896,6 +909,8 @@ static void smc_listen_work(struct work_struct *work) sock_put(&lsmc->sk); /* sock_hold in smc_tcp_listen_work */ return; +decline_rdma_unlock: + mutex_unlock(&smc_create_lgr_pending); decline_rdma: /* RDMA setup failed, switch back to TCP */ smc_conn_free(&new_smc->conn); @@ -907,6 +922,8 @@ static void smc_listen_work(struct work_struct *work) } goto out_connected; +out_err_unlock: + mutex_unlock(&smc_create_lgr_pending); out_err: newsmcsk->sk_state = SMC_CLOSED; smc_conn_free(&new_smc->conn); @@ -963,6 +980,7 @@ static int smc_listen(struct socket *sock, int backlog) * them to the clc socket -- copy smc socket options to clc socket */ smc_copy_sock_settings_to_clc(smc); + tcp_sk(smc->clcsock->sk)->syn_smc = 1; rc = kernel_listen(smc->clcsock, backlog); if (rc) @@ -1405,6 +1423,7 @@ static int __init smc_init(void) goto out_sock; } + static_branch_enable(&tcp_have_smc); return 0; out_sock: @@ -1429,6 +1448,7 @@ static void __exit smc_exit(void) list_del_init(&lgr->list); smc_lgr_free(lgr); /* free link group */ } + static_branch_disable(&tcp_have_smc); smc_ib_unregister_client(); sock_unregister(PF_SMC); proto_unregister(&smc_proto);