diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 9e36738c1fe164fa75cb6c4a0802773925f73b9a..8e4fcac4df72f7f4188df410e90e3706998dc738 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -183,6 +183,7 @@ struct netns_ipv4 { unsigned int sysctl_tcp_fastopen_blackhole_timeout; atomic_t tfo_active_disable_times; unsigned long tfo_active_disable_stamp; + int sysctl_tcp_reflect_tos; int sysctl_udp_wmem_min; int sysctl_udp_rmem_min; diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 54023a46db04720ac343549113dadd7fa6da9082..3e5f4f2e705e84937112aed32df60b6a9d7e8127 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -1329,6 +1329,15 @@ static struct ctl_table ipv4_net_table[] = { .extra1 = SYSCTL_ZERO, .extra2 = &comp_sack_nr_max, }, + { + .procname = "tcp_reflect_tos", + .data = &init_net.ipv4.sysctl_tcp_reflect_tos, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, { .procname = "udp_rmem_min", .data = &init_net.ipv4.sysctl_udp_rmem_min, diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index c4c7ad4c8b5a5aacf9837d8947da3c8cfc473105..ace48b2790ffaf41bf3a84f0d3b4496b760ac30a 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -972,6 +972,7 @@ static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst, struct flowi4 fl4; int err = -1; struct sk_buff *skb; + u8 tos; /* First, grab a route. */ if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL) @@ -979,6 +980,9 @@ static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst, skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb); + tos = sock_net(sk)->ipv4.sysctl_tcp_reflect_tos ? + tcp_rsk(req)->syn_tos : inet_sk(sk)->tos; + if (skb) { __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr); @@ -986,7 +990,7 @@ static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst, err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr, ireq->ir_rmt_addr, rcu_dereference(ireq->ireq_opt), - inet_sk(sk)->tos); + tos & ~INET_ECN_MASK); rcu_read_unlock(); err = net_xmit_eval(err); } @@ -1531,6 +1535,10 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen; newinet->inet_id = prandom_u32(); + /* Set ToS of the new socket based upon the value of incoming SYN. */ + if (sock_net(sk)->ipv4.sysctl_tcp_reflect_tos) + newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK; + if (!dst) { dst = inet_csk_route_child_sock(sk, newsk, req); if (!dst) diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 04efa3ee80ef42fa04fd1c672e5d91bfc3f98ea3..862058dce6d047f5c5e78a67284dff8403c22312 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -510,6 +510,7 @@ static int tcp_v6_send_synack(const struct sock *sk, struct dst_entry *dst, struct flowi6 *fl6 = &fl->u.ip6; struct sk_buff *skb; int err = -ENOMEM; + u8 tclass; /* First, grab a route. */ if (!dst && (dst = inet6_csk_route_req(sk, fl6, req, @@ -528,9 +529,12 @@ static int tcp_v6_send_synack(const struct sock *sk, struct dst_entry *dst, rcu_read_lock(); opt = ireq->ipv6_opt; + tclass = sock_net(sk)->ipv4.sysctl_tcp_reflect_tos ? + tcp_rsk(req)->syn_tos : np->tclass; if (!opt) opt = rcu_dereference(np->opt); - err = ip6_xmit(sk, skb, fl6, sk->sk_mark, opt, np->tclass, + err = ip6_xmit(sk, skb, fl6, sk->sk_mark, opt, + tclass & ~INET_ECN_MASK, sk->sk_priority); rcu_read_unlock(); err = net_xmit_eval(err); @@ -1310,6 +1314,10 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff * if (np->repflow) newnp->flow_label = ip6_flowlabel(ipv6_hdr(skb)); + /* Set ToS of the new socket based upon the value of incoming SYN. */ + if (sock_net(sk)->ipv4.sysctl_tcp_reflect_tos) + newnp->tclass = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK; + /* Clone native IPv6 options from listening socket (if any) Yes, keeping reference count would be much more clever,