提交 c134ecb8 编写于 作者: M Martin KaFai Lau 提交者: David S. Miller

tcp: Make use of MSG_EOR in tcp_sendmsg

This patch adds an eor bit to the TCP_SKB_CB.  When MSG_EOR
is passed to tcp_sendmsg, the eor bit will be set at the skb
containing the last byte of the userland's msg.  The eor bit
will prevent data from appending to that skb in the future.

The change in do_tcp_sendpages is to honor the eor set
during the previous tcp_sendmsg(MSG_EOR) call.

This patch handles the tcp_sendmsg case.  The followup patches
will handle other skb coalescing and fragment cases.

One potential use case is to use MSG_EOR with
SOF_TIMESTAMPING_TX_ACK to get a more accurate
TCP ack timestamping on application protocol with
multiple outgoing response messages (e.g. HTTP2).

Packetdrill script for testing:
~~~~~~
+0 `sysctl -q -w net.ipv4.tcp_min_tso_segs=10`
+0 `sysctl -q -w net.ipv4.tcp_no_metrics_save=1`
+0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+0 bind(3, ..., ...) = 0
+0 listen(3, 1) = 0

0.100 < S 0:0(0) win 32792 <mss 1460,sackOK,nop,nop,nop,wscale 7>
0.100 > S. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 7>
0.200 < . 1:1(0) ack 1 win 257
0.200 accept(3, ..., ...) = 4
+0 setsockopt(4, SOL_TCP, TCP_NODELAY, [1], 4) = 0

0.200 write(4, ..., 14600) = 14600
0.200 sendto(4, ..., 730, MSG_EOR, ..., ...) = 730
0.200 sendto(4, ..., 730, MSG_EOR, ..., ...) = 730

0.200 > .  1:7301(7300) ack 1
0.200 > P. 7301:14601(7300) ack 1

0.300 < . 1:1(0) ack 14601 win 257
0.300 > P. 14601:15331(730) ack 1
0.300 > P. 15331:16061(730) ack 1

0.400 < . 1:1(0) ack 16061 win 257
0.400 close(4) = 0
0.400 > F. 16061:16061(0) ack 1
0.400 < F. 1:1(0) ack 16062 win 257
0.400 > . 16062:16062(0) ack 2
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Neal Cardwell <ncardwell@google.com>
Cc: Soheil Hassas Yeganeh <soheil@google.com>
Cc: Willem de Bruijn <willemb@google.com>
Cc: Yuchung Cheng <ycheng@google.com>
Suggested-by: Eric Dumazet <edumazet@google.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
上级 2a9e8438
...@@ -761,7 +761,8 @@ struct tcp_skb_cb { ...@@ -761,7 +761,8 @@ struct tcp_skb_cb {
__u8 ip_dsfield; /* IPv4 tos or IPv6 dsfield */ __u8 ip_dsfield; /* IPv4 tos or IPv6 dsfield */
__u8 txstamp_ack:1, /* Record TX timestamp for ack? */ __u8 txstamp_ack:1, /* Record TX timestamp for ack? */
unused:7; eor:1, /* Is skb MSG_EOR marked? */
unused:6;
__u32 ack_seq; /* Sequence number ACK'd */ __u32 ack_seq; /* Sequence number ACK'd */
union { union {
struct inet_skb_parm h4; struct inet_skb_parm h4;
...@@ -808,6 +809,11 @@ static inline int tcp_skb_mss(const struct sk_buff *skb) ...@@ -808,6 +809,11 @@ static inline int tcp_skb_mss(const struct sk_buff *skb)
return TCP_SKB_CB(skb)->tcp_gso_size; return TCP_SKB_CB(skb)->tcp_gso_size;
} }
/*
 * Tell whether further data may still be appended to @skb.
 *
 * Returns true when the eor bit in the skb's TCP control block is clear,
 * false when the skb has been MSG_EOR marked. The unmarked case is hinted
 * to the compiler as the expected one.
 */
static inline bool tcp_skb_can_collapse_to(const struct sk_buff *skb)
{
	if (likely(!TCP_SKB_CB(skb)->eor))
		return true;

	return false;
}
/* Events passed to congestion control interface */ /* Events passed to congestion control interface */
enum tcp_ca_event { enum tcp_ca_event {
CA_EVENT_TX_START, /* first transmit when no packets in flight */ CA_EVENT_TX_START, /* first transmit when no packets in flight */
......
...@@ -909,7 +909,8 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset, ...@@ -909,7 +909,8 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset,
int copy, i; int copy, i;
bool can_coalesce; bool can_coalesce;
if (!tcp_send_head(sk) || (copy = size_goal - skb->len) <= 0) { if (!tcp_send_head(sk) || (copy = size_goal - skb->len) <= 0 ||
!tcp_skb_can_collapse_to(skb)) {
new_segment: new_segment:
if (!sk_stream_memory_free(sk)) if (!sk_stream_memory_free(sk))
goto wait_for_sndbuf; goto wait_for_sndbuf;
...@@ -1157,7 +1158,7 @@ int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) ...@@ -1157,7 +1158,7 @@ int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
copy = max - skb->len; copy = max - skb->len;
} }
if (copy <= 0) { if (copy <= 0 || !tcp_skb_can_collapse_to(skb)) {
new_segment: new_segment:
/* Allocate new segment. If the interface is SG, /* Allocate new segment. If the interface is SG,
* allocate skb fitting to single page. * allocate skb fitting to single page.
...@@ -1251,6 +1252,8 @@ int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) ...@@ -1251,6 +1252,8 @@ int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
copied += copy; copied += copy;
if (!msg_data_left(msg)) { if (!msg_data_left(msg)) {
tcp_tx_timestamp(sk, sockc.tsflags, skb); tcp_tx_timestamp(sk, sockc.tsflags, skb);
if (unlikely(flags & MSG_EOR))
TCP_SKB_CB(skb)->eor = 1;
goto out; goto out;
} }
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册