Merge branch 'tcp-eor'

Martin KaFai Lau says: ==================== tcp: Make use of MSG_EOR in tcp_sendmsg v4: ~ Do not set eor bit in do_tcp_sendpages() since there is no way to pass MSG_EOR from the userland now. ~ Avoid rmw by testing MSG_EOR first in tcp_sendmsg(). ~ Move TCP_SKB_CB(skb)->eor test to a new helper tcp_skb_can_collapse_to() (suggested by Soheil). ~ Add some packetdrill tests. v3: ~ Separate EOR marking from the SKBTX_ANY_TSTAMP logic. ~ Move the eor bit test back to the loop in tcp_sendmsg and tcp_sendpage because there could be >1 threads doing sendmsg. ~ Thanks to Eric Dumazet's suggestions on v2. ~ The TCP timestamp bug fixes are separated into other threads. v2: ~ Rework based on the recent work "add TX timestamping via cmsg" by Soheil Hassas Yeganeh <soheil.kdev@gmail.com> ~ This version takes the MSG_EOR bit as a signal of end-of-response-message and leave the selective timestamping job to the cmsg ~ Changes based on the v1 feedback (like avoid unlikely check in a loop and adding tcp_sendpage support) ~ The first 3 patches are bug fixes. The fixes in this series depend on the newly introduced txstamp_ack in net-next. I will make relevant patches against net after getting some feedback. ~ The test results are based on the recently posted net fix: "tcp: Fix SOF_TIMESTAMPING_TX_ACK when handling dup acks" One potential use case is to use MSG_EOR with SOF_TIMESTAMPING_TX_ACK to get a more accurate TCP ack timestamping on application protocol with multiple outgoing response messages (e.g. HTTP2). One of our use case is at the webserver. The webserver tracks the HTTP2 response latency by measuring when the webserver sends the first byte to the socket till the TCP ACK of the last byte is received. In the cases where we don't have client side measurement, measuring from the server side is the only option. In the cases we have the client side measurement, the server side data can also be used to justify/cross-check-with the client side data. ==================== Signed-off-by: N David S. Miller <davem@davemloft.net>

Merge branch 'tcp-eor'
Martin KaFai Lau says: ==================== tcp: Make use of MSG_EOR in tcp_sendmsg v4: ~ Do not set eor bit in do_tcp_sendpages() since there is no way to pass MSG_EOR from the userland now. ~ Avoid rmw by testing MSG_EOR first in tcp_sendmsg(). ~ Move TCP_SKB_CB(skb)->eor test to a new helper tcp_skb_can_collapse_to() (suggested by Soheil). ~ Add some packetdrill tests. v3: ~ Separate EOR marking from the SKBTX_ANY_TSTAMP logic. ~ Move the eor bit test back to the loop in tcp_sendmsg and tcp_sendpage because there could be >1 threads doing sendmsg. ~ Thanks to Eric Dumazet's suggestions on v2. ~ The TCP timestamp bug fixes are separated into other threads. v2: ~ Rework based on the recent work "add TX timestamping via cmsg" by Soheil Hassas Yeganeh <soheil.kdev@gmail.com> ~ This version takes the MSG_EOR bit as a signal of end-of-response-message and leave the selective timestamping job to the cmsg ~ Changes based on the v1 feedback (like avoid unlikely check in a loop and adding tcp_sendpage support) ~ The first 3 patches are bug fixes. The fixes in this series depend on the newly introduced txstamp_ack in net-next. I will make relevant patches against net after getting some feedback. ~ The test results are based on the recently posted net fix: "tcp: Fix SOF_TIMESTAMPING_TX_ACK when handling dup acks" One potential use case is to use MSG_EOR with SOF_TIMESTAMPING_TX_ACK to get a more accurate TCP ack timestamping on application protocol with multiple outgoing response messages (e.g. HTTP2). One of our use case is at the webserver. The webserver tracks the HTTP2 response latency by measuring when the webserver sends the first byte to the socket till the TCP ACK of the last byte is received. In the cases where we don't have client side measurement, measuring from the server side is the only option. In the cases we have the client side measurement, the server side data can also be used to justify/cross-check-with the client side data. ==================== Signed-off-by: N David S. Miller <davem@davemloft.net>
f345c9a5 · David S. Miller · 2a9e8438 · a166140e · f345c9a5 · f345c9a5
隐藏空白更改
内联并排

Showing with 29 addition and 3 deletion

include/net/tcp.h include/net/tcp.h +7 -1

net/ipv4/tcp.c net/ipv4/tcp.c +5 -2

net/ipv4/tcp_input.c net/ipv4/tcp_input.c +4 -0

net/ipv4/tcp_output.c net/ipv4/tcp_output.c +13 -0

未找到文件。
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -761,7 +761,8 @@ struct tcp_skb_cb {
 	__u8		ip_dsfield;	/* IPv4 tos or IPv6 dsfield	*/
 	__u8		txstamp_ack:1,	/* Record TX timestamp for ack? */
-			unused:7;
+			eor:1,		/* Is skb MSG_EOR marked? */
+			unused:6;
 	__u32		ack_seq;	/* Sequence number ACK'd	*/
 	union {
 		struct inet_skb_parm	h4;
@@ -808,6 +809,11 @@ static inline int tcp_skb_mss(const struct sk_buff *skb)
 	return TCP_SKB_CB(skb)->tcp_gso_size;
 }
+static inline bool tcp_skb_can_collapse_to(const struct sk_buff *skb)
+{
+	return likely(!TCP_SKB_CB(skb)->eor);
+}
 /* Events passed to congestion control interface */
 enum tcp_ca_event {
 	CA_EVENT_TX_START,	/* first transmit when no packets in flight */

--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -909,7 +909,8 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset,
 		int copy, i;
 		bool can_coalesce;
-		if (!tcp_send_head(sk) || (copy = size_goal - skb->len) <= 0) {
+		if (!tcp_send_head(sk) || (copy = size_goal - skb->len) <= 0 ||
+		    !tcp_skb_can_collapse_to(skb)) {
 new_segment:
 			if (!sk_stream_memory_free(sk))
 				goto wait_for_sndbuf;
@@ -1157,7 +1158,7 @@ int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
 			copy = max - skb->len;
 		}
-		if (copy <= 0) {
+		if (copy <= 0 || !tcp_skb_can_collapse_to(skb)) {
 new_segment:
 			/* Allocate new segment. If the interface is SG,
 			 * allocate skb fitting to single page.
@@ -1251,6 +1252,8 @@ int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
 		copied += copy;
 		if (!msg_data_left(msg)) {
 			tcp_tx_timestamp(sk, sockc.tsflags, skb);
+			if (unlikely(flags & MSG_EOR))
+				TCP_SKB_CB(skb)->eor = 1;
 			goto out;
 		}

--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -1303,6 +1303,7 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb,
 	}
 	TCP_SKB_CB(prev)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
+	TCP_SKB_CB(prev)->eor = TCP_SKB_CB(skb)->eor;
 	if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
 		TCP_SKB_CB(prev)->end_seq++;
@@ -1368,6 +1369,9 @@ static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb,
 	if ((TCP_SKB_CB(prev)->sacked & TCPCB_TAGBITS) != TCPCB_SACKED_ACKED)
 		goto fallback;
+	if (!tcp_skb_can_collapse_to(prev))
+		goto fallback;
 	in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq) &&
 		  !before(end_seq, TCP_SKB_CB(skb)->end_seq);

--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1134,6 +1134,12 @@ static void tcp_fragment_tstamp(struct sk_buff *skb, struct sk_buff *skb2)
 	}
 }
+static void tcp_skb_fragment_eor(struct sk_buff *skb, struct sk_buff *skb2)
+{
+	TCP_SKB_CB(skb2)->eor = TCP_SKB_CB(skb)->eor;
+	TCP_SKB_CB(skb)->eor = 0;
+}
 /* Function to create two new TCP segments.  Shrinks the given segment
 * to the specified size and appends a new segment with the rest of the
 * packet to the list.  This won't be called frequently, I hope.
@@ -1179,6 +1185,7 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,
 	TCP_SKB_CB(skb)->tcp_flags = flags & ~(TCPHDR_FIN | TCPHDR_PSH);
 	TCP_SKB_CB(buff)->tcp_flags = flags;
 	TCP_SKB_CB(buff)->sacked = TCP_SKB_CB(skb)->sacked;
+	tcp_skb_fragment_eor(skb, buff);
 	if (!skb_shinfo(skb)->nr_frags && skb->ip_summed != CHECKSUM_PARTIAL) {
 		/* Copy and checksum data tail into the new buffer. */
@@ -1739,6 +1746,8 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
 	/* This packet was never sent out yet, so no SACK bits. */
 	TCP_SKB_CB(buff)->sacked = 0;
+	tcp_skb_fragment_eor(skb, buff);
 	buff->ip_summed = skb->ip_summed = CHECKSUM_PARTIAL;
 	skb_split(skb, buff, len);
 	tcp_fragment_tstamp(skb, buff);
@@ -2499,6 +2508,7 @@ static void tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb)
 	 * packet counting does not break.
 	 */
 	TCP_SKB_CB(skb)->sacked |= TCP_SKB_CB(next_skb)->sacked & TCPCB_EVER_RETRANS;
+	TCP_SKB_CB(skb)->eor = TCP_SKB_CB(next_skb)->eor;
 	/* changed transmit queue under us so clear hints */
 	tcp_clear_retrans_hints_partial(tp);
@@ -2550,6 +2560,9 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to,
 		if (!tcp_can_collapse(sk, skb))
 			break;
+		if (!tcp_skb_can_collapse_to(to))
+			break;
 		space -= skb->len;
 		if (first) {