提交 e43b76ab 编写于 作者: J Jakub Kicinski

Merge branch 'tcp-receive-path-optimizations'

Eric Dumazet says:

====================
tcp: receive path optimizations

This series aims to reduce cache line misses in RX path.

I am still working on better cache locality in tcp_sock but
this will wait few more weeks.
====================

Link: https://lore.kernel.org/r/20211025164825.259415-1-eric.dumazet@gmail.comSigned-off-by: NJakub Kicinski <kuba@kernel.org>
......@@ -282,7 +282,6 @@ struct ipv6_pinfo {
__be32 rcv_flowinfo;
__u32 dst_cookie;
__u32 rx_dst_cookie;
struct ipv6_mc_socklist __rcu *ipv6_mc_list;
struct ipv6_ac_socklist *ipv6_ac_list;
......
......@@ -130,7 +130,8 @@ static inline void skb_mark_napi_id(struct sk_buff *skb,
static inline void sk_mark_napi_id(struct sock *sk, const struct sk_buff *skb)
{
#ifdef CONFIG_NET_RX_BUSY_POLL
WRITE_ONCE(sk->sk_napi_id, skb->napi_id);
if (unlikely(READ_ONCE(sk->sk_napi_id) != skb->napi_id))
WRITE_ONCE(sk->sk_napi_id, skb->napi_id);
#endif
sk_rx_queue_set(sk, skb);
}
......
......@@ -207,11 +207,10 @@ struct inet_sock {
__be32 inet_saddr;
__s16 uc_ttl;
__u16 cmsg_flags;
struct ip_options_rcu __rcu *inet_opt;
__be16 inet_sport;
__u16 inet_id;
struct ip_options_rcu __rcu *inet_opt;
int rx_dst_ifindex;
__u8 tos;
__u8 min_ttl;
__u8 mc_ttl;
......
......@@ -24,6 +24,7 @@
#include <linux/skbuff.h>
#include <linux/jhash.h>
#include <linux/sockptr.h>
#include <linux/static_key.h>
#include <net/inet_sock.h>
#include <net/route.h>
......@@ -750,6 +751,7 @@ void ip_cmsg_recv_offset(struct msghdr *msg, struct sock *sk,
struct sk_buff *skb, int tlen, int offset);
int ip_cmsg_send(struct sock *sk, struct msghdr *msg,
struct ipcm_cookie *ipc, bool allow_ipv6);
DECLARE_STATIC_KEY_FALSE(ip4_min_ttl);
int ip_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval,
unsigned int optlen);
int ip_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
......
......@@ -1092,6 +1092,7 @@ struct in6_addr *fl6_update_dst(struct flowi6 *fl6,
/*
* socket options (ipv6_sockglue.c)
*/
DECLARE_STATIC_KEY_FALSE(ip6_min_hopcount);
int ipv6_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval,
unsigned int optlen);
......
......@@ -259,6 +259,8 @@ struct bpf_local_storage;
* @sk_rcvbuf: size of receive buffer in bytes
* @sk_wq: sock wait queue and async head
* @sk_rx_dst: receive input route used by early demux
* @sk_rx_dst_ifindex: ifindex for @sk_rx_dst
* @sk_rx_dst_cookie: cookie for @sk_rx_dst
* @sk_dst_cache: destination cache
* @sk_dst_pending_confirm: need to confirm neighbour
* @sk_policy: flow policy
......@@ -430,6 +432,9 @@ struct sock {
struct xfrm_policy __rcu *sk_policy[2];
#endif
struct dst_entry *sk_rx_dst;
int sk_rx_dst_ifindex;
u32 sk_rx_dst_cookie;
struct dst_entry __rcu *sk_dst_cache;
atomic_t sk_omem_alloc;
int sk_sndbuf;
......@@ -1911,10 +1916,8 @@ static inline void sk_rx_queue_set(struct sock *sk, const struct sk_buff *skb)
if (skb_rx_queue_recorded(skb)) {
u16 rx_queue = skb_get_rx_queue(skb);
if (WARN_ON_ONCE(rx_queue == NO_QUEUE_MAPPING))
return;
sk->sk_rx_queue_mapping = rx_queue;
if (unlikely(READ_ONCE(sk->sk_rx_queue_mapping) != rx_queue))
WRITE_ONCE(sk->sk_rx_queue_mapping, rx_queue);
}
#endif
}
......@@ -1922,15 +1925,19 @@ static inline void sk_rx_queue_set(struct sock *sk, const struct sk_buff *skb)
static inline void sk_rx_queue_clear(struct sock *sk)
{
#ifdef CONFIG_SOCK_RX_QUEUE_MAPPING
sk->sk_rx_queue_mapping = NO_QUEUE_MAPPING;
WRITE_ONCE(sk->sk_rx_queue_mapping, NO_QUEUE_MAPPING);
#endif
}
static inline int sk_rx_queue_get(const struct sock *sk)
{
#ifdef CONFIG_SOCK_RX_QUEUE_MAPPING
if (sk && sk->sk_rx_queue_mapping != NO_QUEUE_MAPPING)
return sk->sk_rx_queue_mapping;
if (sk) {
int res = READ_ONCE(sk->sk_rx_queue_mapping);
if (res != NO_QUEUE_MAPPING)
return res;
}
#endif
return -1;
......
......@@ -886,6 +886,8 @@ static int compat_ip_mcast_join_leave(struct sock *sk, int optname,
return ip_mc_leave_group(sk, &mreq);
}
DEFINE_STATIC_KEY_FALSE(ip4_min_ttl);
static int do_ip_setsockopt(struct sock *sk, int level, int optname,
sockptr_t optval, unsigned int optlen)
{
......@@ -1352,7 +1354,14 @@ static int do_ip_setsockopt(struct sock *sk, int level, int optname,
goto e_inval;
if (val < 0 || val > 255)
goto e_inval;
inet->min_ttl = val;
if (val)
static_branch_enable(&ip4_min_ttl);
/* tcp_v4_err() and tcp_v4_rcv() might read min_ttl
* while we are changint it.
*/
WRITE_ONCE(inet->min_ttl, val);
break;
default:
......
......@@ -508,9 +508,12 @@ int tcp_v4_err(struct sk_buff *skb, u32 info)
if (sk->sk_state == TCP_CLOSE)
goto out;
if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
goto out;
if (static_branch_unlikely(&ip4_min_ttl)) {
/* min_ttl can be changed concurrently from do_ip_setsockopt() */
if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
goto out;
}
}
tp = tcp_sk(sk);
......@@ -1703,7 +1706,7 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
sock_rps_save_rxhash(sk, skb);
sk_mark_napi_id(sk, skb);
if (dst) {
if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
if (sk->sk_rx_dst_ifindex != skb->skb_iif ||
!INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check,
dst, 0)) {
dst_release(dst);
......@@ -1788,7 +1791,7 @@ int tcp_v4_early_demux(struct sk_buff *skb)
if (dst)
dst = dst_check(dst, 0);
if (dst &&
inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
sk->sk_rx_dst_ifindex == skb->skb_iif)
skb_dst_set_noref(skb, dst);
}
}
......@@ -2068,9 +2071,13 @@ int tcp_v4_rcv(struct sk_buff *skb)
return 0;
}
}
if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
goto discard_and_relse;
if (static_branch_unlikely(&ip4_min_ttl)) {
/* min_ttl can be changed concurrently from do_ip_setsockopt() */
if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
goto discard_and_relse;
}
}
if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
......@@ -2195,7 +2202,7 @@ void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
if (dst && dst_hold_safe(dst)) {
sk->sk_rx_dst = dst;
inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
sk->sk_rx_dst_ifindex = skb->skb_iif;
}
}
EXPORT_SYMBOL(inet_sk_rx_dst_set);
......
......@@ -55,6 +55,8 @@
struct ip6_ra_chain *ip6_ra_chain;
DEFINE_RWLOCK(ip6_ra_lock);
DEFINE_STATIC_KEY_FALSE(ip6_min_hopcount);
int ip6_ra_control(struct sock *sk, int sel)
{
struct ip6_ra_chain *ra, *new_ra, **rap;
......@@ -950,7 +952,14 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname,
goto e_inval;
if (val < 0 || val > 255)
goto e_inval;
np->min_hopcount = val;
if (val)
static_branch_enable(&ip6_min_hopcount);
/* tcp_v6_err() and tcp_v6_rcv() might read min_hopcount
* while we are changing it.
*/
WRITE_ONCE(np->min_hopcount, val);
retv = 0;
break;
case IPV6_DONTFRAG:
......
......@@ -108,8 +108,8 @@ static void inet6_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
const struct rt6_info *rt = (const struct rt6_info *)dst;
sk->sk_rx_dst = dst;
inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
tcp_inet6_sk(sk)->rx_dst_cookie = rt6_get_cookie(rt);
sk->sk_rx_dst_ifindex = skb->skb_iif;
sk->sk_rx_dst_cookie = rt6_get_cookie(rt);
}
}
......@@ -414,9 +414,12 @@ static int tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
if (sk->sk_state == TCP_CLOSE)
goto out;
if (ipv6_hdr(skb)->hop_limit < tcp_inet6_sk(sk)->min_hopcount) {
__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
goto out;
if (static_branch_unlikely(&ip6_min_hopcount)) {
/* min_hopcount can be changed concurrently from do_ipv6_setsockopt() */
if (ipv6_hdr(skb)->hop_limit < READ_ONCE(tcp_inet6_sk(sk)->min_hopcount)) {
__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
goto out;
}
}
tp = tcp_sk(sk);
......@@ -569,7 +572,7 @@ static int tcp_v6_send_synack(const struct sock *sk, struct dst_entry *dst,
static void tcp_v6_reqsk_destructor(struct request_sock *req)
{
kfree(inet_rsk(req)->ipv6_opt);
kfree_skb(inet_rsk(req)->pktopts);
consume_skb(inet_rsk(req)->pktopts);
}
#ifdef CONFIG_TCP_MD5SIG
......@@ -1509,9 +1512,9 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)
sock_rps_save_rxhash(sk, skb);
sk_mark_napi_id(sk, skb);
if (dst) {
if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
if (sk->sk_rx_dst_ifindex != skb->skb_iif ||
INDIRECT_CALL_1(dst->ops->check, ip6_dst_check,
dst, np->rx_dst_cookie) == NULL) {
dst, sk->sk_rx_dst_cookie) == NULL) {
dst_release(dst);
sk->sk_rx_dst = NULL;
}
......@@ -1591,7 +1594,7 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)
}
}
kfree_skb(opt_skb);
consume_skb(opt_skb);
return 0;
}
......@@ -1726,9 +1729,13 @@ INDIRECT_CALLABLE_SCOPE int tcp_v6_rcv(struct sk_buff *skb)
return 0;
}
}
if (hdr->hop_limit < tcp_inet6_sk(sk)->min_hopcount) {
__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
goto discard_and_relse;
if (static_branch_unlikely(&ip6_min_hopcount)) {
/* min_hopcount can be changed concurrently from do_ipv6_setsockopt() */
if (hdr->hop_limit < READ_ONCE(tcp_inet6_sk(sk)->min_hopcount)) {
__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
goto discard_and_relse;
}
}
if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb))
......@@ -1872,9 +1879,9 @@ INDIRECT_CALLABLE_SCOPE void tcp_v6_early_demux(struct sk_buff *skb)
struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);
if (dst)
dst = dst_check(dst, tcp_inet6_sk(sk)->rx_dst_cookie);
dst = dst_check(dst, sk->sk_rx_dst_cookie);
if (dst &&
inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
sk->sk_rx_dst_ifindex == skb->skb_iif)
skb_dst_set_noref(skb, dst);
}
}
......
......@@ -884,7 +884,7 @@ static void udp6_sk_rx_dst_set(struct sock *sk, struct dst_entry *dst)
if (udp_sk_rx_dst_set(sk, dst)) {
const struct rt6_info *rt = (const struct rt6_info *)dst;
inet6_sk(sk)->rx_dst_cookie = rt6_get_cookie(rt);
sk->sk_rx_dst_cookie = rt6_get_cookie(rt);
}
}
......@@ -1073,7 +1073,7 @@ INDIRECT_CALLABLE_SCOPE void udp_v6_early_demux(struct sk_buff *skb)
dst = READ_ONCE(sk->sk_rx_dst);
if (dst)
dst = dst_check(dst, inet6_sk(sk)->rx_dst_cookie);
dst = dst_check(dst, sk->sk_rx_dst_cookie);
if (dst) {
/* set noref for now.
* any place which wants to hold dst has to call
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册