diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 649489f825a5ec9d91802e2b429dc04da95343e8..86913eb5cfa0a9dfb7033f2fecf475f4008cc6dd 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -7,12 +7,11 @@ * applicable with RoCE-cards only * * Initial restrictions: - * - IPv6 support postponed * - support for alternate links postponed * - partial support for non-blocking sockets only * - support for urgent data postponed * - * Copyright IBM Corp. 2016 + * Copyright IBM Corp. 2016, 2018 * * Author(s): Ursula Braun * based on prototype from Frank Blaschka @@ -64,6 +63,10 @@ static struct smc_hashinfo smc_v4_hashinfo = { .lock = __RW_LOCK_UNLOCKED(smc_v4_hashinfo.lock), }; +static struct smc_hashinfo smc_v6_hashinfo = { + .lock = __RW_LOCK_UNLOCKED(smc_v6_hashinfo.lock), +}; + int smc_hash_sk(struct sock *sk) { struct smc_hashinfo *h = sk->sk_prot->h.smc_hash; @@ -103,6 +106,18 @@ struct proto smc_proto = { }; EXPORT_SYMBOL_GPL(smc_proto); +struct proto smc_proto6 = { + .name = "SMC6", + .owner = THIS_MODULE, + .keepalive = smc_set_keepalive, + .hash = smc_hash_sk, + .unhash = smc_unhash_sk, + .obj_size = sizeof(struct smc_sock), + .h.smc_hash = &smc_v6_hashinfo, + .slab_flags = SLAB_TYPESAFE_BY_RCU, +}; +EXPORT_SYMBOL_GPL(smc_proto6); + static int smc_release(struct socket *sock) { struct sock *sk = sock->sk; @@ -159,19 +174,22 @@ static void smc_destruct(struct sock *sk) sk_refcnt_debug_dec(sk); } -static struct sock *smc_sock_alloc(struct net *net, struct socket *sock) +static struct sock *smc_sock_alloc(struct net *net, struct socket *sock, + int protocol) { struct smc_sock *smc; + struct proto *prot; struct sock *sk; - sk = sk_alloc(net, PF_SMC, GFP_KERNEL, &smc_proto, 0); + prot = (protocol == SMCPROTO_SMC6) ? &smc_proto6 : &smc_proto; + sk = sk_alloc(net, PF_SMC, GFP_KERNEL, prot, 0); if (!sk) return NULL; sock_init_data(sock, sk); /* sets sk_refcnt to 1 */ sk->sk_state = SMC_INIT; sk->sk_destruct = smc_destruct; - sk->sk_protocol = SMCPROTO_SMC; + sk->sk_protocol = protocol; smc = smc_sk(sk); INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work); INIT_LIST_HEAD(&smc->accept_q); @@ -198,10 +216,13 @@ static int smc_bind(struct socket *sock, struct sockaddr *uaddr, goto out; rc = -EAFNOSUPPORT; + if (addr->sin_family != AF_INET && + addr->sin_family != AF_INET6 && + addr->sin_family != AF_UNSPEC) + goto out; /* accept AF_UNSPEC (mapped to AF_INET) only if s_addr is INADDR_ANY */ - if ((addr->sin_family != AF_INET) && - ((addr->sin_family != AF_UNSPEC) || - (addr->sin_addr.s_addr != htonl(INADDR_ANY)))) + if (addr->sin_family == AF_UNSPEC && + addr->sin_addr.s_addr != htonl(INADDR_ANY)) goto out; lock_sock(sk); @@ -529,7 +550,7 @@ static int smc_connect(struct socket *sock, struct sockaddr *addr, /* separate smc parameter checking to be safe */ if (alen < sizeof(addr->sa_family)) goto out_err; - if (addr->sa_family != AF_INET) + if (addr->sa_family != AF_INET && addr->sa_family != AF_INET6) goto out_err; lock_sock(sk); @@ -571,7 +592,7 @@ static int smc_clcsock_accept(struct smc_sock *lsmc, struct smc_sock **new_smc) int rc; release_sock(lsk); - new_sk = smc_sock_alloc(sock_net(lsk), NULL); + new_sk = smc_sock_alloc(sock_net(lsk), NULL, lsk->sk_protocol); if (!new_sk) { rc = -ENOMEM; lsk->sk_err = ENOMEM; @@ -767,8 +788,6 @@ static void smc_listen_work(struct work_struct *work) struct smc_link *link; int reason_code = 0; int rc = 0; - __be32 subnet; - u8 prefix_len; u8 ibport; /* check if peer is smc capable */ @@ -803,17 +822,11 @@ static void smc_listen_work(struct work_struct *work) goto decline_rdma; } - /* determine subnet and mask from internal TCP socket */ - rc = smc_clc_netinfo_by_tcpsk(newclcsock, &subnet, &prefix_len); - if (rc) { - reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */ - goto decline_rdma; - } - pclc = (struct smc_clc_msg_proposal *)&buf; pclc_prfx = smc_clc_proposal_get_prefix(pclc); - if (pclc_prfx->outgoing_subnet != subnet || - pclc_prfx->prefix_len != prefix_len) { + + rc = smc_clc_prfx_match(newclcsock, pclc_prfx); + if (rc) { reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */ goto decline_rdma; } @@ -1375,6 +1388,7 @@ static const struct proto_ops smc_sock_ops = { static int smc_create(struct net *net, struct socket *sock, int protocol, int kern) { + int family = (protocol == SMCPROTO_SMC6) ? PF_INET6 : PF_INET; struct smc_sock *smc; struct sock *sk; int rc; @@ -1384,20 +1398,20 @@ static int smc_create(struct net *net, struct socket *sock, int protocol, goto out; rc = -EPROTONOSUPPORT; - if ((protocol != IPPROTO_IP) && (protocol != IPPROTO_TCP)) + if (protocol != SMCPROTO_SMC && protocol != SMCPROTO_SMC6) goto out; rc = -ENOBUFS; sock->ops = &smc_sock_ops; - sk = smc_sock_alloc(net, sock); + sk = smc_sock_alloc(net, sock, protocol); if (!sk) goto out; /* create internal TCP socket for CLC handshake and fallback */ smc = smc_sk(sk); smc->use_fallback = false; /* assume rdma capability first */ - rc = sock_create_kern(net, PF_INET, SOCK_STREAM, - IPPROTO_TCP, &smc->clcsock); + rc = sock_create_kern(net, family, SOCK_STREAM, IPPROTO_TCP, + &smc->clcsock); if (rc) { sk_common_release(sk); goto out; @@ -1437,16 +1451,23 @@ static int __init smc_init(void) rc = proto_register(&smc_proto, 1); if (rc) { - pr_err("%s: proto_register fails with %d\n", __func__, rc); + pr_err("%s: proto_register(v4) fails with %d\n", __func__, rc); goto out_pnet; } + rc = proto_register(&smc_proto6, 1); + if (rc) { + pr_err("%s: proto_register(v6) fails with %d\n", __func__, rc); + goto out_proto; + } + rc = sock_register(&smc_sock_family_ops); if (rc) { pr_err("%s: sock_register fails with %d\n", __func__, rc); - goto out_proto; + goto out_proto6; } INIT_HLIST_HEAD(&smc_v4_hashinfo.ht); + INIT_HLIST_HEAD(&smc_v6_hashinfo.ht); rc = smc_ib_register_client(); if (rc) { @@ -1459,6 +1480,8 @@ static int __init smc_init(void) out_sock: sock_unregister(PF_SMC); +out_proto6: + proto_unregister(&smc_proto6); out_proto: proto_unregister(&smc_proto); out_pnet: @@ -1483,6 +1506,7 @@ static void __exit smc_exit(void) static_branch_disable(&tcp_have_smc); smc_ib_unregister_client(); sock_unregister(PF_SMC); + proto_unregister(&smc_proto6); proto_unregister(&smc_proto); smc_pnet_exit(); } diff --git a/net/smc/smc.h b/net/smc/smc.h index 268cdf11533c8af55832ff00d4f0568c101e788e..e4829a2f46baf8fb7c129ba4ffeca24c5189134e 100644 --- a/net/smc/smc.h +++ b/net/smc/smc.h @@ -18,11 +18,13 @@ #include "smc_ib.h" -#define SMCPROTO_SMC 0 /* SMC protocol */ +#define SMCPROTO_SMC 0 /* SMC protocol, IPv4 */ +#define SMCPROTO_SMC6 1 /* SMC protocol, IPv6 */ #define SMC_MAX_PORTS 2 /* Max # of ports */ extern struct proto smc_proto; +extern struct proto smc_proto6; #ifdef ATOMIC64_INIT #define KERNEL_HAS_ATOMIC64 diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c index 874c5a75d6dd1754127aef5c72296321f564c25a..64fbc3230e6c456c355e8dd24e9394301fff18e3 100644 --- a/net/smc/smc_clc.c +++ b/net/smc/smc_clc.c @@ -5,7 +5,7 @@ * CLC (connection layer control) handshake over initial TCP socket to * prepare for RDMA traffic * - * Copyright IBM Corp. 2016 + * Copyright IBM Corp. 2016, 2018 * * Author(s): Ursula Braun */ @@ -15,6 +15,7 @@ #include #include +#include #include #include @@ -74,15 +75,67 @@ static bool smc_clc_msg_hdr_valid(struct smc_clc_msg_hdr *clcm) return true; } -/* determine subnet and mask of internal TCP socket */ -int smc_clc_netinfo_by_tcpsk(struct socket *clcsock, - __be32 *subnet, u8 *prefix_len) +/* find ipv4 addr on device and get the prefix len, fill CLC proposal msg */ +static int smc_clc_prfx_set4_rcu(struct dst_entry *dst, __be32 ipv4, + struct smc_clc_msg_proposal_prefix *prop) +{ + struct in_device *in_dev = __in_dev_get_rcu(dst->dev); + + if (!in_dev) + return -ENODEV; + for_ifa(in_dev) { + if (!inet_ifa_match(ipv4, ifa)) + continue; + prop->prefix_len = inet_mask_len(ifa->ifa_mask); + prop->outgoing_subnet = ifa->ifa_address & ifa->ifa_mask; + /* prop->ipv6_prefixes_cnt = 0; already done by memset before */ + return 0; + } endfor_ifa(in_dev); + return -ENOENT; +} + +/* fill CLC proposal msg with ipv6 prefixes from device */ +static int smc_clc_prfx_set6_rcu(struct dst_entry *dst, + struct smc_clc_msg_proposal_prefix *prop, + struct smc_clc_ipv6_prefix *ipv6_prfx) +{ +#if IS_ENABLED(CONFIG_IPV6) + struct inet6_dev *in6_dev = __in6_dev_get(dst->dev); + struct inet6_ifaddr *ifa; + int cnt = 0; + + if (!in6_dev) + return -ENODEV; + /* use a maximum of 8 IPv6 prefixes from device */ + list_for_each_entry(ifa, &in6_dev->addr_list, if_list) { + if (ipv6_addr_type(&ifa->addr) & IPV6_ADDR_LINKLOCAL) + continue; + ipv6_addr_prefix(&ipv6_prfx[cnt].prefix, + &ifa->addr, ifa->prefix_len); + ipv6_prfx[cnt].prefix_len = ifa->prefix_len; + cnt++; + if (cnt == SMC_CLC_MAX_V6_PREFIX) + break; + } + prop->ipv6_prefixes_cnt = cnt; + if (cnt) + return 0; +#endif + return -ENOENT; +} + +/* retrieve and set prefixes in CLC proposal msg */ +static int smc_clc_prfx_set(struct socket *clcsock, + struct smc_clc_msg_proposal_prefix *prop, + struct smc_clc_ipv6_prefix *ipv6_prfx) { struct dst_entry *dst = sk_dst_get(clcsock->sk); - struct in_device *in_dev; - struct sockaddr_in addr; + struct sockaddr_storage addrs; + struct sockaddr_in6 *addr6; + struct sockaddr_in *addr; int rc = -ENOENT; + memset(prop, 0, sizeof(*prop)); if (!dst) { rc = -ENOTCONN; goto out; @@ -91,22 +144,97 @@ int smc_clc_netinfo_by_tcpsk(struct socket *clcsock, rc = -ENODEV; goto out_rel; } - /* get address to which the internal TCP socket is bound */ - kernel_getsockname(clcsock, (struct sockaddr *)&addr); - /* analyze IPv4 specific data of net_device belonging to TCP socket */ + kernel_getsockname(clcsock, (struct sockaddr *)&addrs); + /* analyze IP specific data of net_device belonging to TCP socket */ + addr6 = (struct sockaddr_in6 *)&addrs; rcu_read_lock(); - in_dev = __in_dev_get_rcu(dst->dev); + if (addrs.ss_family == PF_INET) { + /* IPv4 */ + addr = (struct sockaddr_in *)&addrs; + rc = smc_clc_prfx_set4_rcu(dst, addr->sin_addr.s_addr, prop); + } else if (ipv6_addr_v4mapped(&addr6->sin6_addr)) { + /* mapped IPv4 address - peer is IPv4 only */ + rc = smc_clc_prfx_set4_rcu(dst, addr6->sin6_addr.s6_addr32[3], + prop); + } else { + /* IPv6 */ + rc = smc_clc_prfx_set6_rcu(dst, prop, ipv6_prfx); + } + rcu_read_unlock(); +out_rel: + dst_release(dst); +out: + return rc; +} + +/* match ipv4 addrs of dev against addr in CLC proposal */ +static int smc_clc_prfx_match4_rcu(struct net_device *dev, + struct smc_clc_msg_proposal_prefix *prop) +{ + struct in_device *in_dev = __in_dev_get_rcu(dev); + + if (!in_dev) + return -ENODEV; for_ifa(in_dev) { - if (!inet_ifa_match(addr.sin_addr.s_addr, ifa)) - continue; - *prefix_len = inet_mask_len(ifa->ifa_mask); - *subnet = ifa->ifa_address & ifa->ifa_mask; - rc = 0; - break; + if (prop->prefix_len == inet_mask_len(ifa->ifa_mask) && + inet_ifa_match(prop->outgoing_subnet, ifa)) + return 0; } endfor_ifa(in_dev); - rcu_read_unlock(); + return -ENOENT; +} + +/* match ipv6 addrs of dev against addrs in CLC proposal */ +static int smc_clc_prfx_match6_rcu(struct net_device *dev, + struct smc_clc_msg_proposal_prefix *prop) +{ +#if IS_ENABLED(CONFIG_IPV6) + struct inet6_dev *in6_dev = __in6_dev_get(dev); + struct smc_clc_ipv6_prefix *ipv6_prfx; + struct inet6_ifaddr *ifa; + int i, max; + + if (!in6_dev) + return -ENODEV; + /* ipv6 prefix list starts behind smc_clc_msg_proposal_prefix */ + ipv6_prfx = (struct smc_clc_ipv6_prefix *)((u8 *)prop + sizeof(*prop)); + max = min_t(u8, prop->ipv6_prefixes_cnt, SMC_CLC_MAX_V6_PREFIX); + list_for_each_entry(ifa, &in6_dev->addr_list, if_list) { + if (ipv6_addr_type(&ifa->addr) & IPV6_ADDR_LINKLOCAL) + continue; + for (i = 0; i < max; i++) { + if (ifa->prefix_len == ipv6_prfx[i].prefix_len && + ipv6_prefix_equal(&ifa->addr, &ipv6_prfx[i].prefix, + ifa->prefix_len)) + return 0; + } + } +#endif + return -ENOENT; +} + +/* check if proposed prefixes match one of our device prefixes */ +int smc_clc_prfx_match(struct socket *clcsock, + struct smc_clc_msg_proposal_prefix *prop) +{ + struct dst_entry *dst = sk_dst_get(clcsock->sk); + int rc; + + if (!dst) { + rc = -ENOTCONN; + goto out; + } + if (!dst->dev) { + rc = -ENODEV; + goto out_rel; + } + rcu_read_lock(); + if (!prop->ipv6_prefixes_cnt) + rc = smc_clc_prfx_match4_rcu(dst->dev, prop); + else + rc = smc_clc_prfx_match6_rcu(dst->dev, prop); + rcu_read_unlock(); out_rel: dst_release(dst); out: @@ -232,16 +360,24 @@ int smc_clc_send_proposal(struct smc_sock *smc, struct smc_ib_device *smcibdev, u8 ibport) { + struct smc_clc_ipv6_prefix ipv6_prfx[SMC_CLC_MAX_V6_PREFIX]; struct smc_clc_msg_proposal_prefix pclc_prfx; struct smc_clc_msg_proposal pclc; struct smc_clc_msg_trail trl; + int len, i, plen, rc; int reason_code = 0; - struct kvec vec[3]; + struct kvec vec[4]; struct msghdr msg; - int len, plen, rc; + + /* retrieve ip prefixes for CLC proposal msg */ + rc = smc_clc_prfx_set(smc->clcsock, &pclc_prfx, ipv6_prfx); + if (rc) + return SMC_CLC_DECL_CNFERR; /* configuration error */ /* send SMC Proposal CLC message */ - plen = sizeof(pclc) + sizeof(pclc_prfx) + sizeof(trl); + plen = sizeof(pclc) + sizeof(pclc_prfx) + + (pclc_prfx.ipv6_prefixes_cnt * sizeof(ipv6_prfx[0])) + + sizeof(trl); memset(&pclc, 0, sizeof(pclc)); memcpy(pclc.hdr.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)); pclc.hdr.type = SMC_CLC_PROPOSAL; @@ -252,23 +388,22 @@ int smc_clc_send_proposal(struct smc_sock *smc, memcpy(&pclc.lcl.mac, &smcibdev->mac[ibport - 1], ETH_ALEN); pclc.iparea_offset = htons(0); - memset(&pclc_prfx, 0, sizeof(pclc_prfx)); - /* determine subnet and mask from internal TCP socket */ - rc = smc_clc_netinfo_by_tcpsk(smc->clcsock, &pclc_prfx.outgoing_subnet, - &pclc_prfx.prefix_len); - if (rc) - return SMC_CLC_DECL_CNFERR; /* configuration error */ - pclc_prfx.ipv6_prefixes_cnt = 0; memcpy(trl.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)); memset(&msg, 0, sizeof(msg)); - vec[0].iov_base = &pclc; - vec[0].iov_len = sizeof(pclc); - vec[1].iov_base = &pclc_prfx; - vec[1].iov_len = sizeof(pclc_prfx); - vec[2].iov_base = &trl; - vec[2].iov_len = sizeof(trl); + i = 0; + vec[i].iov_base = &pclc; + vec[i++].iov_len = sizeof(pclc); + vec[i].iov_base = &pclc_prfx; + vec[i++].iov_len = sizeof(pclc_prfx); + if (pclc_prfx.ipv6_prefixes_cnt > 0) { + vec[i].iov_base = &ipv6_prfx[0]; + vec[i++].iov_len = pclc_prfx.ipv6_prefixes_cnt * + sizeof(ipv6_prfx[0]); + } + vec[i].iov_base = &trl; + vec[i++].iov_len = sizeof(trl); /* due to the few bytes needed for clc-handshake this cannot block */ - len = kernel_sendmsg(smc->clcsock, &msg, vec, 3, plen); + len = kernel_sendmsg(smc->clcsock, &msg, vec, i, plen); if (len < sizeof(pclc)) { if (len >= 0) { reason_code = -ENETUNREACH; diff --git a/net/smc/smc_clc.h b/net/smc/smc_clc.h index 20e048beac303638f85b5b6590b2bce6d469e182..63bf1dc2c1f9731d97374e68ed7f804b3bfaba2f 100644 --- a/net/smc/smc_clc.h +++ b/net/smc/smc_clc.h @@ -60,10 +60,15 @@ struct smc_clc_msg_local { /* header2 of clc messages */ u8 mac[6]; /* mac of ib_device port */ }; +#define SMC_CLC_MAX_V6_PREFIX 8 + +/* Struct would be 4 byte aligned, but it is used in an array that is sent + * to peers and must conform to RFC7609, hence we need to use packed here. + */ struct smc_clc_ipv6_prefix { - u8 prefix[4]; + struct in6_addr prefix; u8 prefix_len; -} __packed; +} __packed; /* format defined in RFC7609 */ struct smc_clc_msg_proposal_prefix { /* prefix part of clc proposal message*/ __be32 outgoing_subnet; /* subnet mask */ @@ -79,9 +84,11 @@ struct smc_clc_msg_proposal { /* clc proposal message sent by Linux */ } __aligned(4); #define SMC_CLC_PROPOSAL_MAX_OFFSET 0x28 -#define SMC_CLC_PROPOSAL_MAX_PREFIX (8 * sizeof(struct smc_clc_ipv6_prefix)) +#define SMC_CLC_PROPOSAL_MAX_PREFIX (SMC_CLC_MAX_V6_PREFIX * \ + sizeof(struct smc_clc_ipv6_prefix)) #define SMC_CLC_MAX_LEN (sizeof(struct smc_clc_msg_proposal) + \ SMC_CLC_PROPOSAL_MAX_OFFSET + \ + sizeof(struct smc_clc_msg_proposal_prefix) + \ SMC_CLC_PROPOSAL_MAX_PREFIX + \ sizeof(struct smc_clc_msg_trail)) @@ -122,8 +129,8 @@ smc_clc_proposal_get_prefix(struct smc_clc_msg_proposal *pclc) ((u8 *)pclc + sizeof(*pclc) + ntohs(pclc->iparea_offset)); } -int smc_clc_netinfo_by_tcpsk(struct socket *clcsock, __be32 *subnet, - u8 *prefix_len); +int smc_clc_prfx_match(struct socket *clcsock, + struct smc_clc_msg_proposal_prefix *prop); int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen, u8 expected_type); int smc_clc_send_decline(struct smc_sock *smc, u32 peer_diag_info);