提交 ac8f1710 编写于 作者: W Wei Wang 提交者: David S. Miller

tcp: reflect tos value received in SYN to the socket

This commit adds a new TCP feature to reflect the tos value received in
SYN, and send it out on the SYN-ACK, and eventually set the tos value of
the established socket with this reflected tos value. This provides a
way to set the traffic class/QoS level for all traffic in the same
connection to be the same as the incoming SYN request. It could be
useful in data centers to provide equivalent QoS according to the
incoming request.
This feature is guarded by /proc/sys/net/ipv4/tcp_reflect_tos, and is by
default turned off.
Signed-off-by: NWei Wang <weiwan@google.com>
Signed-off-by: NEric Dumazet <edumazet@google.com>
Signed-off-by: NDavid S. Miller <davem@davemloft.net>
上级 de033b7d
...@@ -183,6 +183,7 @@ struct netns_ipv4 { ...@@ -183,6 +183,7 @@ struct netns_ipv4 {
unsigned int sysctl_tcp_fastopen_blackhole_timeout; unsigned int sysctl_tcp_fastopen_blackhole_timeout;
atomic_t tfo_active_disable_times; atomic_t tfo_active_disable_times;
unsigned long tfo_active_disable_stamp; unsigned long tfo_active_disable_stamp;
int sysctl_tcp_reflect_tos;
int sysctl_udp_wmem_min; int sysctl_udp_wmem_min;
int sysctl_udp_rmem_min; int sysctl_udp_rmem_min;
......
...@@ -1329,6 +1329,15 @@ static struct ctl_table ipv4_net_table[] = { ...@@ -1329,6 +1329,15 @@ static struct ctl_table ipv4_net_table[] = {
.extra1 = SYSCTL_ZERO, .extra1 = SYSCTL_ZERO,
.extra2 = &comp_sack_nr_max, .extra2 = &comp_sack_nr_max,
}, },
{
.procname = "tcp_reflect_tos",
.data = &init_net.ipv4.sysctl_tcp_reflect_tos,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = SYSCTL_ZERO,
.extra2 = SYSCTL_ONE,
},
{ {
.procname = "udp_rmem_min", .procname = "udp_rmem_min",
.data = &init_net.ipv4.sysctl_udp_rmem_min, .data = &init_net.ipv4.sysctl_udp_rmem_min,
......
...@@ -972,6 +972,7 @@ static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst, ...@@ -972,6 +972,7 @@ static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
struct flowi4 fl4; struct flowi4 fl4;
int err = -1; int err = -1;
struct sk_buff *skb; struct sk_buff *skb;
u8 tos;
/* First, grab a route. */ /* First, grab a route. */
if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL) if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
...@@ -979,6 +980,9 @@ static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst, ...@@ -979,6 +980,9 @@ static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb); skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);
tos = sock_net(sk)->ipv4.sysctl_tcp_reflect_tos ?
tcp_rsk(req)->syn_tos : inet_sk(sk)->tos;
if (skb) { if (skb) {
__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr); __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
...@@ -986,7 +990,7 @@ static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst, ...@@ -986,7 +990,7 @@ static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr, err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
ireq->ir_rmt_addr, ireq->ir_rmt_addr,
rcu_dereference(ireq->ireq_opt), rcu_dereference(ireq->ireq_opt),
inet_sk(sk)->tos); tos & ~INET_ECN_MASK);
rcu_read_unlock(); rcu_read_unlock();
err = net_xmit_eval(err); err = net_xmit_eval(err);
} }
...@@ -1531,6 +1535,10 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, ...@@ -1531,6 +1535,10 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen; inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
newinet->inet_id = prandom_u32(); newinet->inet_id = prandom_u32();
/* Set ToS of the new socket based upon the value of incoming SYN. */
if (sock_net(sk)->ipv4.sysctl_tcp_reflect_tos)
newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;
if (!dst) { if (!dst) {
dst = inet_csk_route_child_sock(sk, newsk, req); dst = inet_csk_route_child_sock(sk, newsk, req);
if (!dst) if (!dst)
......
...@@ -510,6 +510,7 @@ static int tcp_v6_send_synack(const struct sock *sk, struct dst_entry *dst, ...@@ -510,6 +510,7 @@ static int tcp_v6_send_synack(const struct sock *sk, struct dst_entry *dst,
struct flowi6 *fl6 = &fl->u.ip6; struct flowi6 *fl6 = &fl->u.ip6;
struct sk_buff *skb; struct sk_buff *skb;
int err = -ENOMEM; int err = -ENOMEM;
u8 tclass;
/* First, grab a route. */ /* First, grab a route. */
if (!dst && (dst = inet6_csk_route_req(sk, fl6, req, if (!dst && (dst = inet6_csk_route_req(sk, fl6, req,
...@@ -528,9 +529,12 @@ static int tcp_v6_send_synack(const struct sock *sk, struct dst_entry *dst, ...@@ -528,9 +529,12 @@ static int tcp_v6_send_synack(const struct sock *sk, struct dst_entry *dst,
rcu_read_lock(); rcu_read_lock();
opt = ireq->ipv6_opt; opt = ireq->ipv6_opt;
tclass = sock_net(sk)->ipv4.sysctl_tcp_reflect_tos ?
tcp_rsk(req)->syn_tos : np->tclass;
if (!opt) if (!opt)
opt = rcu_dereference(np->opt); opt = rcu_dereference(np->opt);
err = ip6_xmit(sk, skb, fl6, sk->sk_mark, opt, np->tclass, err = ip6_xmit(sk, skb, fl6, sk->sk_mark, opt,
tclass & ~INET_ECN_MASK,
sk->sk_priority); sk->sk_priority);
rcu_read_unlock(); rcu_read_unlock();
err = net_xmit_eval(err); err = net_xmit_eval(err);
...@@ -1310,6 +1314,10 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff * ...@@ -1310,6 +1314,10 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff *
if (np->repflow) if (np->repflow)
newnp->flow_label = ip6_flowlabel(ipv6_hdr(skb)); newnp->flow_label = ip6_flowlabel(ipv6_hdr(skb));
/* Set ToS of the new socket based upon the value of incoming SYN. */
if (sock_net(sk)->ipv4.sysctl_tcp_reflect_tos)
newnp->tclass = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;
/* Clone native IPv6 options from listening socket (if any) /* Clone native IPv6 options from listening socket (if any)
Yes, keeping reference count would be much more clever, Yes, keeping reference count would be much more clever,
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册