diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 0bd6520329f6fd04420c9e1772277b948931ac99..10869906cc574949c6957790d72274a94a70ebc7 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -454,9 +454,15 @@ enum { * all frags to avoid possible bad checksum */ SKBFL_SHARED_FRAG = BIT(1), + + /* segment contains only zerocopy data and should not be + * charged to the kernel memory. + */ + SKBFL_PURE_ZEROCOPY = BIT(2), }; #define SKBFL_ZEROCOPY_FRAG (SKBFL_ZEROCOPY_ENABLE | SKBFL_SHARED_FRAG) +#define SKBFL_ALL_ZEROCOPY (SKBFL_ZEROCOPY_FRAG | SKBFL_PURE_ZEROCOPY) /* * The callback notifies userspace to release buffers when skb DMA is done in @@ -1464,6 +1470,17 @@ static inline struct ubuf_info *skb_zcopy(struct sk_buff *skb) return is_zcopy ? skb_uarg(skb) : NULL; } +static inline bool skb_zcopy_pure(const struct sk_buff *skb) +{ + return skb_shinfo(skb)->flags & SKBFL_PURE_ZEROCOPY; +} + +static inline bool skb_pure_zcopy_same(const struct sk_buff *skb1, + const struct sk_buff *skb2) +{ + return skb_zcopy_pure(skb1) == skb_zcopy_pure(skb2); +} + static inline void net_zcopy_get(struct ubuf_info *uarg) { refcount_inc(&uarg->refcnt); @@ -1528,7 +1545,7 @@ static inline void skb_zcopy_clear(struct sk_buff *skb, bool zerocopy_success) if (!skb_zcopy_is_nouarg(skb)) uarg->callback(skb, uarg, zerocopy_success); - skb_shinfo(skb)->flags &= ~SKBFL_ZEROCOPY_FRAG; + skb_shinfo(skb)->flags &= ~SKBFL_ALL_ZEROCOPY; } } diff --git a/include/net/sock.h b/include/net/sock.h index 620de053002d71d42a73c7a429d9163f0c7e8669..b32906e1ab55527b5418f203d3de05853863f166 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1603,13 +1603,6 @@ static inline void sk_mem_uncharge(struct sock *sk, int size) __sk_mem_reclaim(sk, SK_RECLAIM_CHUNK); } -static inline void sk_wmem_free_skb(struct sock *sk, struct sk_buff *skb) -{ - sk_wmem_queued_add(sk, -skb->truesize); - sk_mem_uncharge(sk, skb->truesize); - __kfree_skb(skb); -} - static inline void sock_release_ownership(struct sock *sk) { if (sk->sk_lock.owned) { diff --git a/include/net/tcp.h b/include/net/tcp.h index 8e8c5922a7b0c0a268a4c5a53d223489a10ca0bd..af91f370432efbbde63367ee6bb2b9dfcd8afa6e 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -290,6 +290,16 @@ static inline bool tcp_out_of_memory(struct sock *sk) return false; } +static inline void tcp_wmem_free_skb(struct sock *sk, struct sk_buff *skb) +{ + sk_wmem_queued_add(sk, -skb->truesize); + if (!skb_zcopy_pure(skb)) + sk_mem_uncharge(sk, skb->truesize); + else + sk_mem_uncharge(sk, SKB_TRUESIZE(MAX_TCP_HEADER)); + __kfree_skb(skb); +} + void sk_forced_mem_schedule(struct sock *sk, int size); bool tcp_check_oom(struct sock *sk, int shift); @@ -967,7 +977,8 @@ static inline bool tcp_skb_can_collapse(const struct sk_buff *to, const struct sk_buff *from) { return likely(tcp_skb_can_collapse_to(to) && - mptcp_skb_can_collapse(to, from)); + mptcp_skb_can_collapse(to, from) && + skb_pure_zcopy_same(to, from)); } /* Events passed to congestion control interface */ @@ -1875,7 +1886,7 @@ static inline void tcp_rtx_queue_unlink_and_free(struct sk_buff *skb, struct soc { list_del(&skb->tcp_tsorted_anchor); tcp_rtx_queue_unlink(skb, sk); - sk_wmem_free_skb(sk, skb); + tcp_wmem_free_skb(sk, skb); } static inline void tcp_push_pending_frames(struct sock *sk) diff --git a/net/core/datagram.c b/net/core/datagram.c index 15ab9ffb27fe999c9038d67d822de715d1e61baa..ee290776c661d02fd65aaa38ed9d5b5c2b6bb946 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -646,7 +646,8 @@ int __zerocopy_sg_from_iter(struct sock *sk, struct sk_buff *skb, skb->truesize += truesize; if (sk && sk->sk_type == SOCK_STREAM) { sk_wmem_queued_add(sk, truesize); - sk_mem_charge(sk, truesize); + if (!skb_zcopy_pure(skb)) + sk_mem_charge(sk, truesize); } else { refcount_add(truesize, &skb->sk->sk_wmem_alloc); } diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 67a9188d8a49c83ce722f68d3064e9ba0545ea37..29e617d8d7fb254e63d478b754e01deca08ac1b5 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -3433,8 +3433,9 @@ static inline void skb_split_no_header(struct sk_buff *skb, void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len) { int pos = skb_headlen(skb); + const int zc_flags = SKBFL_SHARED_FRAG | SKBFL_PURE_ZEROCOPY; - skb_shinfo(skb1)->flags |= skb_shinfo(skb)->flags & SKBFL_SHARED_FRAG; + skb_shinfo(skb1)->flags |= skb_shinfo(skb)->flags & zc_flags; skb_zerocopy_clone(skb1, skb, 0); if (len < pos) /* Split line is inside header. */ skb_split_inside_header(skb, skb1, len, pos); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index a7b1138d619cd478a67ebaa28f451a4f8c860520..2561c14a6e639e9f327c8fd1ef7a3293b9e9baf4 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -863,6 +863,7 @@ struct sk_buff *tcp_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp, if (likely(skb)) { bool mem_scheduled; + skb->truesize = SKB_TRUESIZE(size + MAX_TCP_HEADER); if (force_schedule) { mem_scheduled = true; sk_forced_mem_schedule(sk, skb->truesize); @@ -932,7 +933,7 @@ void tcp_remove_empty_skb(struct sock *sk) tcp_unlink_write_queue(skb, sk); if (tcp_write_queue_empty(sk)) tcp_chrono_stop(sk, TCP_CHRONO_BUSY); - sk_wmem_free_skb(sk, skb); + tcp_wmem_free_skb(sk, skb); } } @@ -1319,6 +1320,15 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size) copy = min_t(int, copy, pfrag->size - pfrag->offset); + /* skb changing from pure zc to mixed, must charge zc */ + if (unlikely(skb_zcopy_pure(skb))) { + if (!sk_wmem_schedule(sk, skb->data_len)) + goto wait_for_space; + + sk_mem_charge(sk, skb->data_len); + skb_shinfo(skb)->flags &= ~SKBFL_PURE_ZEROCOPY; + } + if (!sk_wmem_schedule(sk, copy)) goto wait_for_space; @@ -1339,8 +1349,16 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size) } pfrag->offset += copy; } else { - if (!sk_wmem_schedule(sk, copy)) - goto wait_for_space; + /* First append to a fragless skb builds initial + * pure zerocopy skb + */ + if (!skb->len) + skb_shinfo(skb)->flags |= SKBFL_PURE_ZEROCOPY; + + if (!skb_zcopy_pure(skb)) { + if (!sk_wmem_schedule(sk, copy)) + goto wait_for_space; + } err = skb_zerocopy_iter_stream(sk, skb, msg, copy, uarg); if (err == -EMSGSIZE || err == -EEXIST) { @@ -2893,7 +2911,7 @@ static void tcp_rtx_queue_purge(struct sock *sk) * list_del(&skb->tcp_tsorted_anchor) */ tcp_rtx_queue_unlink(skb, sk); - sk_wmem_free_skb(sk, skb); + tcp_wmem_free_skb(sk, skb); } } @@ -2904,7 +2922,7 @@ void tcp_write_queue_purge(struct sock *sk) tcp_chrono_stop(sk, TCP_CHRONO_BUSY); while ((skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) { tcp_skb_tsorted_anchor_cleanup(skb); - sk_wmem_free_skb(sk, skb); + tcp_wmem_free_skb(sk, skb); } tcp_rtx_queue_purge(sk); INIT_LIST_HEAD(&tcp_sk(sk)->tsorted_sent_queue); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 6867e5db3e352d35b94b212db8b72f88672e3d1d..287b57aadc3741776ec29b96e27bd76d5c8b7405 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1677,7 +1677,8 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len) if (delta_truesize) { skb->truesize -= delta_truesize; sk_wmem_queued_add(sk, -delta_truesize); - sk_mem_uncharge(sk, delta_truesize); + if (!skb_zcopy_pure(skb)) + sk_mem_uncharge(sk, delta_truesize); } /* Any change of skb->len requires recalculation of tso factor. */ @@ -2295,7 +2296,9 @@ static bool tcp_can_coalesce_send_queue_head(struct sock *sk, int len) if (len <= skb->len) break; - if (unlikely(TCP_SKB_CB(skb)->eor) || tcp_has_tx_tstamp(skb)) + if (unlikely(TCP_SKB_CB(skb)->eor) || + tcp_has_tx_tstamp(skb) || + !skb_pure_zcopy_same(skb, next)) return false; len -= skb->len; @@ -2412,7 +2415,7 @@ static int tcp_mtu_probe(struct sock *sk) TCP_SKB_CB(nskb)->eor = TCP_SKB_CB(skb)->eor; tcp_skb_collapse_tstamp(nskb, skb); tcp_unlink_write_queue(skb, sk); - sk_wmem_free_skb(sk, skb); + tcp_wmem_free_skb(sk, skb); } else { TCP_SKB_CB(nskb)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags & ~(TCPHDR_FIN|TCPHDR_PSH);