diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 56ff2952edafbc3a26616359b5ae56318fcb8db0..2f87377e9af70b8b714f842170e0ba3ac9ffddf8 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -134,6 +134,7 @@ struct tcp_request_sock { * FastOpen it's the seq# * after data-in-SYN. */ + u8 syn_tos; }; static inline struct tcp_request_sock *tcp_rsk(const struct request_sock *req) diff --git a/include/net/ip.h b/include/net/ip.h index b09c48d862cc1077b2d98924348096eae917d7c3..0f72bf8c0cbffa8b4dfad36f448ed4f94da7eb3c 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -151,7 +151,7 @@ int igmp_mc_init(void); int ip_build_and_send_pkt(struct sk_buff *skb, const struct sock *sk, __be32 saddr, __be32 daddr, - struct ip_options_rcu *opt); + struct ip_options_rcu *opt, u8 tos); int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev); void ip_list_rcv(struct list_head *head, struct packet_type *pt, diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 9e36738c1fe164fa75cb6c4a0802773925f73b9a..8e4fcac4df72f7f4188df410e90e3706998dc738 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -183,6 +183,7 @@ struct netns_ipv4 { unsigned int sysctl_tcp_fastopen_blackhole_timeout; atomic_t tfo_active_disable_times; unsigned long tfo_active_disable_stamp; + int sysctl_tcp_reflect_tos; int sysctl_udp_wmem_min; int sysctl_udp_rmem_min; diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index d8f3751a512b6b3b532db77e377fc8d97649ea76..bb3d70664dde0521091147bc40baf546b2b80c94 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -495,7 +495,8 @@ static int dccp_v4_send_response(const struct sock *sk, struct request_sock *req rcu_read_lock(); err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr, ireq->ir_rmt_addr, - rcu_dereference(ireq->ireq_opt)); + rcu_dereference(ireq->ireq_opt), + inet_sk(sk)->tos); rcu_read_unlock(); err = net_xmit_eval(err); } @@ -537,7 +538,8 @@ static void dccp_v4_ctl_send_reset(const struct sock *sk, struct sk_buff *rxskb) local_bh_disable(); bh_lock_sock(ctl_sk); err = ip_build_and_send_pkt(skb, ctl_sk, - rxiph->daddr, rxiph->saddr, NULL); + rxiph->daddr, rxiph->saddr, NULL, + inet_sk(ctl_sk)->tos); bh_unlock_sock(ctl_sk); if (net_xmit_eval(err) == 0) { diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index b931d0b02e49cf9b8f05690cc62f5d687a674463..5fb536ff51f01d25c2a241b4b3a28b5b94a480e6 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -142,7 +142,8 @@ static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst) * */ int ip_build_and_send_pkt(struct sk_buff *skb, const struct sock *sk, - __be32 saddr, __be32 daddr, struct ip_options_rcu *opt) + __be32 saddr, __be32 daddr, struct ip_options_rcu *opt, + u8 tos) { struct inet_sock *inet = inet_sk(sk); struct rtable *rt = skb_rtable(skb); @@ -155,7 +156,7 @@ int ip_build_and_send_pkt(struct sk_buff *skb, const struct sock *sk, iph = ip_hdr(skb); iph->version = 4; iph->ihl = 5; - iph->tos = inet->tos; + iph->tos = tos; iph->ttl = ip_select_ttl(inet, &rt->dst); iph->daddr = (opt && opt->opt.srr ? opt->opt.faddr : daddr); iph->saddr = saddr; diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index f0794f0232bae749244fff35d8b96b1f561a5e87..c375c126f436f160ccd656dff6b477f05363f73e 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -286,11 +286,10 @@ struct request_sock *cookie_tcp_reqsk_alloc(const struct request_sock_ops *ops, struct sock *sk, struct sk_buff *skb) { + struct tcp_request_sock *treq; struct request_sock *req; #ifdef CONFIG_MPTCP - struct tcp_request_sock *treq; - if (sk_is_mptcp(sk)) ops = &mptcp_subflow_request_sock_ops; #endif @@ -299,8 +298,9 @@ struct request_sock *cookie_tcp_reqsk_alloc(const struct request_sock_ops *ops, if (!req) return NULL; -#if IS_ENABLED(CONFIG_MPTCP) treq = tcp_rsk(req); + treq->syn_tos = TCP_SKB_CB(skb)->ip_dsfield; +#if IS_ENABLED(CONFIG_MPTCP) treq->is_mptcp = sk_is_mptcp(sk); if (treq->is_mptcp) { int err = mptcp_subflow_init_cookie_req(req, sk, skb); diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 54023a46db04720ac343549113dadd7fa6da9082..3e5f4f2e705e84937112aed32df60b6a9d7e8127 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -1329,6 +1329,15 @@ static struct ctl_table ipv4_net_table[] = { .extra1 = SYSCTL_ZERO, .extra2 = &comp_sack_nr_max, }, + { + .procname = "tcp_reflect_tos", + .data = &init_net.ipv4.sysctl_tcp_reflect_tos, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, { .procname = "udp_rmem_min", .data = &init_net.ipv4.sysctl_udp_rmem_min, diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 4337841faeff9723ecb755026069dfe4b63a384f..3658ad84f0c6252658dd5174eae31dcde4b34942 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -6834,6 +6834,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, tcp_rsk(req)->snt_isn = isn; tcp_rsk(req)->txhash = net_tx_rndhash(); + tcp_rsk(req)->syn_tos = TCP_SKB_CB(skb)->ip_dsfield; tcp_openreq_init_rwin(req, sk, dst); sk_rx_queue_set(req_to_sk(req), skb); if (!want_cookie) { diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index af27cfa9d8d38be29a8d7d355c9aa7429e10973c..ace48b2790ffaf41bf3a84f0d3b4496b760ac30a 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -972,6 +972,7 @@ static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst, struct flowi4 fl4; int err = -1; struct sk_buff *skb; + u8 tos; /* First, grab a route. */ if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL) @@ -979,13 +980,17 @@ static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst, skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb); + tos = sock_net(sk)->ipv4.sysctl_tcp_reflect_tos ? + tcp_rsk(req)->syn_tos : inet_sk(sk)->tos; + if (skb) { __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr); rcu_read_lock(); err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr, ireq->ir_rmt_addr, - rcu_dereference(ireq->ireq_opt)); + rcu_dereference(ireq->ireq_opt), + tos & ~INET_ECN_MASK); rcu_read_unlock(); err = net_xmit_eval(err); } @@ -1530,6 +1535,10 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen; newinet->inet_id = prandom_u32(); + /* Set ToS of the new socket based upon the value of incoming SYN. */ + if (sock_net(sk)->ipv4.sysctl_tcp_reflect_tos) + newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK; + if (!dst) { dst = inet_csk_route_child_sock(sk, newsk, req); if (!dst) diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 04efa3ee80ef42fa04fd1c672e5d91bfc3f98ea3..862058dce6d047f5c5e78a67284dff8403c22312 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -510,6 +510,7 @@ static int tcp_v6_send_synack(const struct sock *sk, struct dst_entry *dst, struct flowi6 *fl6 = &fl->u.ip6; struct sk_buff *skb; int err = -ENOMEM; + u8 tclass; /* First, grab a route. */ if (!dst && (dst = inet6_csk_route_req(sk, fl6, req, @@ -528,9 +529,12 @@ static int tcp_v6_send_synack(const struct sock *sk, struct dst_entry *dst, rcu_read_lock(); opt = ireq->ipv6_opt; + tclass = sock_net(sk)->ipv4.sysctl_tcp_reflect_tos ? + tcp_rsk(req)->syn_tos : np->tclass; if (!opt) opt = rcu_dereference(np->opt); - err = ip6_xmit(sk, skb, fl6, sk->sk_mark, opt, np->tclass, + err = ip6_xmit(sk, skb, fl6, sk->sk_mark, opt, + tclass & ~INET_ECN_MASK, sk->sk_priority); rcu_read_unlock(); err = net_xmit_eval(err); @@ -1310,6 +1314,10 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff * if (np->repflow) newnp->flow_label = ip6_flowlabel(ipv6_hdr(skb)); + /* Set ToS of the new socket based upon the value of incoming SYN. */ + if (sock_net(sk)->ipv4.sysctl_tcp_reflect_tos) + newnp->tclass = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK; + /* Clone native IPv6 options from listening socket (if any) Yes, keeping reference count would be much more clever,