提交 8a75e30e 编写于 作者: J Jakub Kicinski

Merge branch 'accurate-memory-charging-for-msg_zerocopy'

Talal Ahmad says:

====================
Accurate Memory Charging For MSG_ZEROCOPY

This series improves the accuracy of msg_zerocopy memory accounting.
At present, when msg_zerocopy is used memory is charged twice for the
data - once when user space allocates it, and then again within
__zerocopy_sg_from_iter. The memory charging in the kernel is excessive
because data is held in user pages and is never actually copied to skb
fragments. This leads to incorrectly inflated memory statistics for
programs passing MSG_ZEROCOPY.

We reduce this inaccuracy by introducing the notion of "pure" zerocopy
SKBs - where all the frags in the SKB are backed by pinned userspace
pages, and none are backed by copied pages. For such SKBs, tracked via
the new SKBFL_PURE_ZEROCOPY flag, we elide sk_mem_charge/uncharge
calls, leading to more accurate accounting.

However, SKBs can also be coalesced by the stack at present,
potentially leading to "impure" SKBs. We restrict this coalescing so
it can only happen within the sendmsg() system call itself, for the
most recently allocated SKB. While this can lead to a small degree of
double-charging of memory, this case does not arise often in practice
for workloads that set MSG_ZEROCOPY.

Testing verified that memory usage in the kernel is lowered.
Instrumentation with counters also showed that accounting at the time
of charging and uncharging is balanced.
====================

Link: https://lore.kernel.org/r/20211030020542.3870542-1-mailtalalahmad@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
...@@ -454,9 +454,15 @@ enum { ...@@ -454,9 +454,15 @@ enum {
* all frags to avoid possible bad checksum * all frags to avoid possible bad checksum
*/ */
SKBFL_SHARED_FRAG = BIT(1), SKBFL_SHARED_FRAG = BIT(1),
/* segment contains only zerocopy data and should not be
* charged to the kernel memory.
*/
SKBFL_PURE_ZEROCOPY = BIT(2),
}; };
#define SKBFL_ZEROCOPY_FRAG (SKBFL_ZEROCOPY_ENABLE | SKBFL_SHARED_FRAG) #define SKBFL_ZEROCOPY_FRAG (SKBFL_ZEROCOPY_ENABLE | SKBFL_SHARED_FRAG)
#define SKBFL_ALL_ZEROCOPY (SKBFL_ZEROCOPY_FRAG | SKBFL_PURE_ZEROCOPY)
/* /*
* The callback notifies userspace to release buffers when skb DMA is done in * The callback notifies userspace to release buffers when skb DMA is done in
...@@ -1464,6 +1470,17 @@ static inline struct ubuf_info *skb_zcopy(struct sk_buff *skb) ...@@ -1464,6 +1470,17 @@ static inline struct ubuf_info *skb_zcopy(struct sk_buff *skb)
return is_zcopy ? skb_uarg(skb) : NULL; return is_zcopy ? skb_uarg(skb) : NULL;
} }
/* Return true when the shared-info flags of @skb include
 * SKBFL_PURE_ZEROCOPY, i.e. the skb is marked as containing only
 * zerocopy data that should not be charged to kernel memory.
 */
static inline bool skb_zcopy_pure(const struct sk_buff *skb)
{
/* The masked flag value is converted to bool by the return type. */
return skb_shinfo(skb)->flags & SKBFL_PURE_ZEROCOPY;
}
/* Return true when @skb1 and @skb2 agree on their pure-zerocopy status:
 * either both have SKBFL_PURE_ZEROCOPY set or neither does.
 */
static inline bool skb_pure_zcopy_same(const struct sk_buff *skb1,
const struct sk_buff *skb2)
{
/* Compare the two bool results, not the raw flag words. */
return skb_zcopy_pure(skb1) == skb_zcopy_pure(skb2);
}
static inline void net_zcopy_get(struct ubuf_info *uarg) static inline void net_zcopy_get(struct ubuf_info *uarg)
{ {
refcount_inc(&uarg->refcnt); refcount_inc(&uarg->refcnt);
...@@ -1528,7 +1545,7 @@ static inline void skb_zcopy_clear(struct sk_buff *skb, bool zerocopy_success) ...@@ -1528,7 +1545,7 @@ static inline void skb_zcopy_clear(struct sk_buff *skb, bool zerocopy_success)
if (!skb_zcopy_is_nouarg(skb)) if (!skb_zcopy_is_nouarg(skb))
uarg->callback(skb, uarg, zerocopy_success); uarg->callback(skb, uarg, zerocopy_success);
skb_shinfo(skb)->flags &= ~SKBFL_ZEROCOPY_FRAG; skb_shinfo(skb)->flags &= ~SKBFL_ALL_ZEROCOPY;
} }
} }
......
...@@ -1603,13 +1603,6 @@ static inline void sk_mem_uncharge(struct sock *sk, int size) ...@@ -1603,13 +1603,6 @@ static inline void sk_mem_uncharge(struct sock *sk, int size)
__sk_mem_reclaim(sk, SK_RECLAIM_CHUNK); __sk_mem_reclaim(sk, SK_RECLAIM_CHUNK);
} }
/* Undo the write-memory accounting of @skb on @sk and free the skb.
 * skb->truesize is subtracted through sk_wmem_queued_add() and the same
 * amount is handed to sk_mem_uncharge(), regardless of any zerocopy flags.
 */
static inline void sk_wmem_free_skb(struct sock *sk, struct sk_buff *skb)
{
sk_wmem_queued_add(sk, -skb->truesize);
sk_mem_uncharge(sk, skb->truesize);
/* Freed last: both accounting calls above read skb->truesize. */
__kfree_skb(skb);
}
static inline void sock_release_ownership(struct sock *sk) static inline void sock_release_ownership(struct sock *sk)
{ {
if (sk->sk_lock.owned) { if (sk->sk_lock.owned) {
......
...@@ -290,6 +290,16 @@ static inline bool tcp_out_of_memory(struct sock *sk) ...@@ -290,6 +290,16 @@ static inline bool tcp_out_of_memory(struct sock *sk)
return false; return false;
} }
/* Undo the write-memory accounting of @skb on @sk and free the skb,
 * taking the pure-zerocopy marking into account.
 *
 * The queued-bytes counter is always reduced by the full skb->truesize.
 * The amount passed to sk_mem_uncharge() depends on skb_zcopy_pure():
 * the full truesize for an skb that is not pure zerocopy, and only
 * SKB_TRUESIZE(MAX_TCP_HEADER) for one that is.
 */
static inline void tcp_wmem_free_skb(struct sock *sk, struct sk_buff *skb)
{
sk_wmem_queued_add(sk, -skb->truesize);
if (!skb_zcopy_pure(skb))
sk_mem_uncharge(sk, skb->truesize);
else
/* NOTE(review): presumably this matches the amount that was charged
 * for a pure zerocopy skb when it was allocated - confirm against the
 * allocation path.
 */
sk_mem_uncharge(sk, SKB_TRUESIZE(MAX_TCP_HEADER));
/* Freed last: the accounting calls above read skb->truesize and flags. */
__kfree_skb(skb);
}
void sk_forced_mem_schedule(struct sock *sk, int size); void sk_forced_mem_schedule(struct sock *sk, int size);
bool tcp_check_oom(struct sock *sk, int shift); bool tcp_check_oom(struct sock *sk, int shift);
...@@ -967,7 +977,8 @@ static inline bool tcp_skb_can_collapse(const struct sk_buff *to, ...@@ -967,7 +977,8 @@ static inline bool tcp_skb_can_collapse(const struct sk_buff *to,
const struct sk_buff *from) const struct sk_buff *from)
{ {
return likely(tcp_skb_can_collapse_to(to) && return likely(tcp_skb_can_collapse_to(to) &&
mptcp_skb_can_collapse(to, from)); mptcp_skb_can_collapse(to, from) &&
skb_pure_zcopy_same(to, from));
} }
/* Events passed to congestion control interface */ /* Events passed to congestion control interface */
...@@ -1875,7 +1886,7 @@ static inline void tcp_rtx_queue_unlink_and_free(struct sk_buff *skb, struct soc ...@@ -1875,7 +1886,7 @@ static inline void tcp_rtx_queue_unlink_and_free(struct sk_buff *skb, struct soc
{ {
list_del(&skb->tcp_tsorted_anchor); list_del(&skb->tcp_tsorted_anchor);
tcp_rtx_queue_unlink(skb, sk); tcp_rtx_queue_unlink(skb, sk);
sk_wmem_free_skb(sk, skb); tcp_wmem_free_skb(sk, skb);
} }
static inline void tcp_push_pending_frames(struct sock *sk) static inline void tcp_push_pending_frames(struct sock *sk)
......
...@@ -646,7 +646,8 @@ int __zerocopy_sg_from_iter(struct sock *sk, struct sk_buff *skb, ...@@ -646,7 +646,8 @@ int __zerocopy_sg_from_iter(struct sock *sk, struct sk_buff *skb,
skb->truesize += truesize; skb->truesize += truesize;
if (sk && sk->sk_type == SOCK_STREAM) { if (sk && sk->sk_type == SOCK_STREAM) {
sk_wmem_queued_add(sk, truesize); sk_wmem_queued_add(sk, truesize);
sk_mem_charge(sk, truesize); if (!skb_zcopy_pure(skb))
sk_mem_charge(sk, truesize);
} else { } else {
refcount_add(truesize, &skb->sk->sk_wmem_alloc); refcount_add(truesize, &skb->sk->sk_wmem_alloc);
} }
......
...@@ -3433,8 +3433,9 @@ static inline void skb_split_no_header(struct sk_buff *skb, ...@@ -3433,8 +3433,9 @@ static inline void skb_split_no_header(struct sk_buff *skb,
void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len) void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len)
{ {
int pos = skb_headlen(skb); int pos = skb_headlen(skb);
const int zc_flags = SKBFL_SHARED_FRAG | SKBFL_PURE_ZEROCOPY;
skb_shinfo(skb1)->flags |= skb_shinfo(skb)->flags & SKBFL_SHARED_FRAG; skb_shinfo(skb1)->flags |= skb_shinfo(skb)->flags & zc_flags;
skb_zerocopy_clone(skb1, skb, 0); skb_zerocopy_clone(skb1, skb, 0);
if (len < pos) /* Split line is inside header. */ if (len < pos) /* Split line is inside header. */
skb_split_inside_header(skb, skb1, len, pos); skb_split_inside_header(skb, skb1, len, pos);
......
...@@ -863,6 +863,7 @@ struct sk_buff *tcp_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp, ...@@ -863,6 +863,7 @@ struct sk_buff *tcp_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp,
if (likely(skb)) { if (likely(skb)) {
bool mem_scheduled; bool mem_scheduled;
skb->truesize = SKB_TRUESIZE(size + MAX_TCP_HEADER);
if (force_schedule) { if (force_schedule) {
mem_scheduled = true; mem_scheduled = true;
sk_forced_mem_schedule(sk, skb->truesize); sk_forced_mem_schedule(sk, skb->truesize);
...@@ -932,7 +933,7 @@ void tcp_remove_empty_skb(struct sock *sk) ...@@ -932,7 +933,7 @@ void tcp_remove_empty_skb(struct sock *sk)
tcp_unlink_write_queue(skb, sk); tcp_unlink_write_queue(skb, sk);
if (tcp_write_queue_empty(sk)) if (tcp_write_queue_empty(sk))
tcp_chrono_stop(sk, TCP_CHRONO_BUSY); tcp_chrono_stop(sk, TCP_CHRONO_BUSY);
sk_wmem_free_skb(sk, skb); tcp_wmem_free_skb(sk, skb);
} }
} }
...@@ -1319,6 +1320,15 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size) ...@@ -1319,6 +1320,15 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
copy = min_t(int, copy, pfrag->size - pfrag->offset); copy = min_t(int, copy, pfrag->size - pfrag->offset);
/* skb changing from pure zc to mixed, must charge zc */
if (unlikely(skb_zcopy_pure(skb))) {
if (!sk_wmem_schedule(sk, skb->data_len))
goto wait_for_space;
sk_mem_charge(sk, skb->data_len);
skb_shinfo(skb)->flags &= ~SKBFL_PURE_ZEROCOPY;
}
if (!sk_wmem_schedule(sk, copy)) if (!sk_wmem_schedule(sk, copy))
goto wait_for_space; goto wait_for_space;
...@@ -1339,8 +1349,16 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size) ...@@ -1339,8 +1349,16 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
} }
pfrag->offset += copy; pfrag->offset += copy;
} else { } else {
if (!sk_wmem_schedule(sk, copy)) /* First append to a fragless skb builds initial
goto wait_for_space; * pure zerocopy skb
*/
if (!skb->len)
skb_shinfo(skb)->flags |= SKBFL_PURE_ZEROCOPY;
if (!skb_zcopy_pure(skb)) {
if (!sk_wmem_schedule(sk, copy))
goto wait_for_space;
}
err = skb_zerocopy_iter_stream(sk, skb, msg, copy, uarg); err = skb_zerocopy_iter_stream(sk, skb, msg, copy, uarg);
if (err == -EMSGSIZE || err == -EEXIST) { if (err == -EMSGSIZE || err == -EEXIST) {
...@@ -2893,7 +2911,7 @@ static void tcp_rtx_queue_purge(struct sock *sk) ...@@ -2893,7 +2911,7 @@ static void tcp_rtx_queue_purge(struct sock *sk)
* list_del(&skb->tcp_tsorted_anchor) * list_del(&skb->tcp_tsorted_anchor)
*/ */
tcp_rtx_queue_unlink(skb, sk); tcp_rtx_queue_unlink(skb, sk);
sk_wmem_free_skb(sk, skb); tcp_wmem_free_skb(sk, skb);
} }
} }
...@@ -2904,7 +2922,7 @@ void tcp_write_queue_purge(struct sock *sk) ...@@ -2904,7 +2922,7 @@ void tcp_write_queue_purge(struct sock *sk)
tcp_chrono_stop(sk, TCP_CHRONO_BUSY); tcp_chrono_stop(sk, TCP_CHRONO_BUSY);
while ((skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) { while ((skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
tcp_skb_tsorted_anchor_cleanup(skb); tcp_skb_tsorted_anchor_cleanup(skb);
sk_wmem_free_skb(sk, skb); tcp_wmem_free_skb(sk, skb);
} }
tcp_rtx_queue_purge(sk); tcp_rtx_queue_purge(sk);
INIT_LIST_HEAD(&tcp_sk(sk)->tsorted_sent_queue); INIT_LIST_HEAD(&tcp_sk(sk)->tsorted_sent_queue);
......
...@@ -1677,7 +1677,8 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len) ...@@ -1677,7 +1677,8 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
if (delta_truesize) { if (delta_truesize) {
skb->truesize -= delta_truesize; skb->truesize -= delta_truesize;
sk_wmem_queued_add(sk, -delta_truesize); sk_wmem_queued_add(sk, -delta_truesize);
sk_mem_uncharge(sk, delta_truesize); if (!skb_zcopy_pure(skb))
sk_mem_uncharge(sk, delta_truesize);
} }
/* Any change of skb->len requires recalculation of tso factor. */ /* Any change of skb->len requires recalculation of tso factor. */
...@@ -2295,7 +2296,9 @@ static bool tcp_can_coalesce_send_queue_head(struct sock *sk, int len) ...@@ -2295,7 +2296,9 @@ static bool tcp_can_coalesce_send_queue_head(struct sock *sk, int len)
if (len <= skb->len) if (len <= skb->len)
break; break;
if (unlikely(TCP_SKB_CB(skb)->eor) || tcp_has_tx_tstamp(skb)) if (unlikely(TCP_SKB_CB(skb)->eor) ||
tcp_has_tx_tstamp(skb) ||
!skb_pure_zcopy_same(skb, next))
return false; return false;
len -= skb->len; len -= skb->len;
...@@ -2412,7 +2415,7 @@ static int tcp_mtu_probe(struct sock *sk) ...@@ -2412,7 +2415,7 @@ static int tcp_mtu_probe(struct sock *sk)
TCP_SKB_CB(nskb)->eor = TCP_SKB_CB(skb)->eor; TCP_SKB_CB(nskb)->eor = TCP_SKB_CB(skb)->eor;
tcp_skb_collapse_tstamp(nskb, skb); tcp_skb_collapse_tstamp(nskb, skb);
tcp_unlink_write_queue(skb, sk); tcp_unlink_write_queue(skb, sk);
sk_wmem_free_skb(sk, skb); tcp_wmem_free_skb(sk, skb);
} else { } else {
TCP_SKB_CB(nskb)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags & TCP_SKB_CB(nskb)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags &
~(TCPHDR_FIN|TCPHDR_PSH); ~(TCPHDR_FIN|TCPHDR_PSH);
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册