diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index eaee2c8d4c00add74b162bbcdeb555934586cf24..b1c6500e7a8df4d7377b291e9afc09363e66cd17 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -856,12 +856,21 @@ ip_dynaddr - BOOLEAN ip_early_demux - BOOLEAN Optimize input packet processing down to one demux for certain kinds of local sockets. Currently we only do this - for established TCP sockets. + for established TCP and connected UDP sockets. It may add an additional cost for pure routing workloads that reduces overall throughput, in such case you should disable it. Default: 1 +tcp_early_demux - BOOLEAN + Enable early demux for established TCP sockets. + Default: 1 + +udp_early_demux - BOOLEAN + Enable early demux for connected UDP sockets. Disable this if + your system could experience more unconnected load. + Default: 1 + icmp_echo_ignore_all - BOOLEAN If set non-zero, then the kernel will ignore all ICMP ECHO requests sent to it. diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index a0e89190a3e9240a80e7c307c0d3f482ea6168f4..cd686c4fb32dc5409a08f818d48228bffa6f6778 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -95,6 +95,8 @@ struct netns_ipv4 { /* Shall we try to damage output packets if routing dev changes? */ int sysctl_ip_dynaddr; int sysctl_ip_early_demux; + int sysctl_tcp_early_demux; + int sysctl_udp_early_demux; int sysctl_fwmark_reflect; int sysctl_tcp_fwmark_accept; diff --git a/include/net/protocol.h b/include/net/protocol.h index bf36ca34af7ad255b9eb821cbed0a70abad993f5..65ba335b0e7e66bb7f1b4bd279d31e616e0dd31e 100644 --- a/include/net/protocol.h +++ b/include/net/protocol.h @@ -40,6 +40,7 @@ /* This is used to register protocols. */ struct net_protocol { void (*early_demux)(struct sk_buff *skb); + void (*early_demux_handler)(struct sk_buff *skb); int (*handler)(struct sk_buff *skb); void (*err_handler)(struct sk_buff *skb, u32 info); unsigned int no_policy:1, @@ -54,7 +55,7 @@ struct net_protocol { #if IS_ENABLED(CONFIG_IPV6) struct inet6_protocol { void (*early_demux)(struct sk_buff *skb); - + void (*early_demux_handler)(struct sk_buff *skb); int (*handler)(struct sk_buff *skb); void (*err_handler)(struct sk_buff *skb, @@ -92,12 +93,12 @@ struct inet_protosw { #define INET_PROTOSW_PERMANENT 0x02 /* Permanent protocols are unremovable. */ #define INET_PROTOSW_ICSK 0x04 /* Is this an inet_connection_sock? */ -extern const struct net_protocol __rcu *inet_protos[MAX_INET_PROTOS]; +extern struct net_protocol __rcu *inet_protos[MAX_INET_PROTOS]; extern const struct net_offload __rcu *inet_offloads[MAX_INET_PROTOS]; extern const struct net_offload __rcu *inet6_offloads[MAX_INET_PROTOS]; #if IS_ENABLED(CONFIG_IPV6) -extern const struct inet6_protocol __rcu *inet6_protos[MAX_INET_PROTOS]; +extern struct inet6_protocol __rcu *inet6_protos[MAX_INET_PROTOS]; #endif int inet_add_protocol(const struct net_protocol *prot, unsigned char num); diff --git a/include/net/udp.h b/include/net/udp.h index c9d8b8e848e05c2e7228f287f88ccdb57b2e10c2..3391dbd739595a76150453c28468ce8bb55530f8 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -372,4 +372,5 @@ void udp_encap_enable(void); #if IS_ENABLED(CONFIG_IPV6) void udpv6_encap_enable(void); #endif + #endif /* _UDP_H */ diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 6b1fc6e4278ef4f1cba58412977918af31d73e62..d1a11707a12682fcd70f22f6df77087b779a5826 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1599,8 +1599,9 @@ static const struct net_protocol igmp_protocol = { }; #endif -static const struct net_protocol tcp_protocol = { +static struct net_protocol tcp_protocol = { .early_demux = tcp_v4_early_demux, + .early_demux_handler = tcp_v4_early_demux, .handler = tcp_v4_rcv, .err_handler = tcp_v4_err, .no_policy = 1, @@ -1608,8 +1609,9 @@ static const struct net_protocol tcp_protocol = { .icmp_strict_tag_validation = 1, }; -static const struct net_protocol udp_protocol = { +static struct net_protocol udp_protocol = { .early_demux = udp_v4_early_demux, + .early_demux_handler = udp_v4_early_demux, .handler = udp_rcv, .err_handler = udp_err, .no_policy = 1, @@ -1720,6 +1722,8 @@ static __net_init int inet_init_net(struct net *net) net->ipv4.sysctl_ip_default_ttl = IPDEFTTL; net->ipv4.sysctl_ip_dynaddr = 0; net->ipv4.sysctl_ip_early_demux = 1; + net->ipv4.sysctl_udp_early_demux = 1; + net->ipv4.sysctl_tcp_early_demux = 1; #ifdef CONFIG_SYSCTL net->ipv4.sysctl_ip_prot_sock = PROT_SOCK; #endif diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index d6feabb0351607f282e1f78f159c0ccb88bcec96..fa2dc8f692c631f1ff7fe814c3ee27f0de2a41d8 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -313,6 +313,7 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) const struct iphdr *iph = ip_hdr(skb); struct rtable *rt; struct net_device *dev = skb->dev; + void (*edemux)(struct sk_buff *skb); /* if ingress device is enslaved to an L3 master device pass the * skb to its handler for processing @@ -329,8 +330,8 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) int protocol = iph->protocol; ipprot = rcu_dereference(inet_protos[protocol]); - if (ipprot && ipprot->early_demux) { - ipprot->early_demux(skb); + if (ipprot && (edemux = READ_ONCE(ipprot->early_demux))) { + edemux(skb); /* must reload iph, skb->head might have changed */ iph = ip_hdr(skb); } diff --git a/net/ipv4/protocol.c b/net/ipv4/protocol.c index 4b7c0ec65251ef40577a2d5e360fcbaed391a566..32a691b7ce2c7e79eab6491b52457a11e666f7d3 100644 --- a/net/ipv4/protocol.c +++ b/net/ipv4/protocol.c @@ -28,7 +28,7 @@ #include #include -const struct net_protocol __rcu *inet_protos[MAX_INET_PROTOS] __read_mostly; +struct net_protocol __rcu *inet_protos[MAX_INET_PROTOS] __read_mostly; const struct net_offload __rcu *inet_offloads[MAX_INET_PROTOS] __read_mostly; EXPORT_SYMBOL(inet_offloads); diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 711c3e2e17b1a46925deb99a2e22916610388033..6fb25693c00b92cbf881a13b06f2276b288853b1 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -24,6 +24,7 @@ #include #include #include +#include static int zero; static int one = 1; @@ -294,6 +295,58 @@ static int proc_tcp_fastopen_key(struct ctl_table *ctl, int write, return ret; } +static void proc_configure_early_demux(int enabled, int protocol) +{ + struct net_protocol *ipprot; +#if IS_ENABLED(CONFIG_IPV6) + struct inet6_protocol *ip6prot; +#endif + + ipprot = rcu_dereference(inet_protos[protocol]); + if (ipprot) + ipprot->early_demux = enabled ? ipprot->early_demux_handler : + NULL; + +#if IS_ENABLED(CONFIG_IPV6) + ip6prot = rcu_dereference(inet6_protos[protocol]); + if (ip6prot) + ip6prot->early_demux = enabled ? ip6prot->early_demux_handler : + NULL; +#endif +} + +static int proc_tcp_early_demux(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int ret = 0; + + ret = proc_dointvec(table, write, buffer, lenp, ppos); + + if (write && !ret) { + int enabled = init_net.ipv4.sysctl_tcp_early_demux; + + proc_configure_early_demux(enabled, IPPROTO_TCP); + } + + return ret; +} + +static int proc_udp_early_demux(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int ret = 0; + + ret = proc_dointvec(table, write, buffer, lenp, ppos); + + if (write && !ret) { + int enabled = init_net.ipv4.sysctl_udp_early_demux; + + proc_configure_early_demux(enabled, IPPROTO_UDP); + } + + return ret; +} + static struct ctl_table ipv4_table[] = { { .procname = "tcp_timestamps", @@ -749,6 +802,20 @@ static struct ctl_table ipv4_net_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, + { + .procname = "udp_early_demux", + .data = &init_net.ipv4.sysctl_udp_early_demux, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_udp_early_demux + }, + { + .procname = "tcp_early_demux", + .data = &init_net.ipv4.sysctl_tcp_early_demux, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_tcp_early_demux + }, { .procname = "ip_default_ttl", .data = &init_net.ipv4.sysctl_ip_default_ttl, diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c index aacfb4bce1533b3f3b38e1173c18cb1bb6b33099..b04539dd4629d2b71b5db27c4a64a89151b2d5d7 100644 --- a/net/ipv6/ip6_input.c +++ b/net/ipv6/ip6_input.c @@ -49,6 +49,8 @@ int ip6_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { + void (*edemux)(struct sk_buff *skb); + /* if ingress device is enslaved to an L3 master device pass the * skb to its handler for processing */ @@ -60,8 +62,8 @@ int ip6_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) const struct inet6_protocol *ipprot; ipprot = rcu_dereference(inet6_protos[ipv6_hdr(skb)->nexthdr]); - if (ipprot && ipprot->early_demux) - ipprot->early_demux(skb); + if (ipprot && (edemux = READ_ONCE(ipprot->early_demux))) + edemux(skb); } if (!skb_valid_dst(skb)) ip6_route_input(skb); diff --git a/net/ipv6/protocol.c b/net/ipv6/protocol.c index e3770abe688a3a9059456fe9195adbfcdfb73157..b5d54d4f995c0f4bade2e3f1c4def9616252ca55 100644 --- a/net/ipv6/protocol.c +++ b/net/ipv6/protocol.c @@ -26,7 +26,7 @@ #include #if IS_ENABLED(CONFIG_IPV6) -const struct inet6_protocol __rcu *inet6_protos[MAX_INET_PROTOS] __read_mostly; +struct inet6_protocol __rcu *inet6_protos[MAX_INET_PROTOS] __read_mostly; EXPORT_SYMBOL(inet6_protos); int inet6_add_protocol(const struct inet6_protocol *prot, unsigned char protocol) diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 0f08d718a00238b228d859d2c0a1dab10db57125..031a8c019f7a740cffe9d1703d9d87992268b028 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1925,8 +1925,9 @@ struct proto tcpv6_prot = { .diag_destroy = tcp_abort, }; -static const struct inet6_protocol tcpv6_protocol = { +static struct inet6_protocol tcpv6_protocol = { .early_demux = tcp_v6_early_demux, + .early_demux_handler = tcp_v6_early_demux, .handler = tcp_v6_rcv, .err_handler = tcp_v6_err, .flags = INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL, diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index b793ed1d2a36960291b60de44b3cffea20861415..fd4b1c98a47230b94641c31fe3213b3dff6ac915 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -1436,8 +1436,9 @@ int compat_udpv6_getsockopt(struct sock *sk, int level, int optname, } #endif -static const struct inet6_protocol udpv6_protocol = { +static struct inet6_protocol udpv6_protocol = { .early_demux = udp_v6_early_demux, + .early_demux_handler = udp_v6_early_demux, .handler = udpv6_rcv, .err_handler = udpv6_err, .flags = INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL,