提交 e370a723 编写于 作者: E Eric Dumazet 提交者: David S. Miller

af_unix: improve STREAM behavior with fragmented memory

unix_stream_sendmsg() currently uses order-2 allocations,
and we had numerous reports this can fail.

The __GFP_REPEAT flag present in sock_alloc_send_pskb() is
not helping.

This patch extends the work done in commit eb6a2481
("af_unix: reduce high order page allocations") for
datagram sockets.

This opens the possibility of zero-copy I/O (splice() and
friends).

The trick is to not use skb_pull() anymore in recvmsg() path,
and instead add a @consumed field in UNIXCB() to track amount
of already read payload in the skb.

There is a performance regression for large sends
because of extra page allocations that will be addressed
in a follow-up patch, allowing sock_alloc_send_pskb()
to attempt high order page allocations.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
上级 149479d0
...@@ -35,6 +35,7 @@ struct unix_skb_parms { ...@@ -35,6 +35,7 @@ struct unix_skb_parms {
#ifdef CONFIG_SECURITY_NETWORK #ifdef CONFIG_SECURITY_NETWORK
u32 secid; /* Security ID */ u32 secid; /* Security ID */
#endif #endif
u32 consumed;
}; };
#define UNIXCB(skb) (*(struct unix_skb_parms *)&((skb)->cb)) #define UNIXCB(skb) (*(struct unix_skb_parms *)&((skb)->cb))
......
...@@ -1596,6 +1596,10 @@ static int unix_dgram_sendmsg(struct kiocb *kiocb, struct socket *sock, ...@@ -1596,6 +1596,10 @@ static int unix_dgram_sendmsg(struct kiocb *kiocb, struct socket *sock,
return err; return err;
} }
/* We use paged skbs for stream sockets, and limit occupancy to 32768
 * bytes, and a minimum of a full page.
 *
 * unix_stream_sendmsg() caps each skb at SKB_MAX_HEAD(0) plus this value,
 * so this is the upper bound on the paged part of one stream skb.
 */
#define UNIX_SKB_FRAGS_SZ (PAGE_SIZE << get_order(32768))
static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock, static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock,
struct msghdr *msg, size_t len) struct msghdr *msg, size_t len)
...@@ -1609,6 +1613,7 @@ static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock, ...@@ -1609,6 +1613,7 @@ static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock,
struct scm_cookie tmp_scm; struct scm_cookie tmp_scm;
bool fds_sent = false; bool fds_sent = false;
int max_level; int max_level;
int data_len;
if (NULL == siocb->scm) if (NULL == siocb->scm)
siocb->scm = &tmp_scm; siocb->scm = &tmp_scm;
...@@ -1635,40 +1640,21 @@ static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock, ...@@ -1635,40 +1640,21 @@ static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock,
goto pipe_err; goto pipe_err;
while (sent < len) { while (sent < len) {
/* size = len - sent;
* Optimisation for the fact that under 0.01% of X
* messages typically need breaking up.
*/
size = len-sent;
/* Keep two messages in the pipe so it schedules better */ /* Keep two messages in the pipe so it schedules better */
if (size > ((sk->sk_sndbuf >> 1) - 64)) size = min_t(int, size, (sk->sk_sndbuf >> 1) - 64);
size = (sk->sk_sndbuf >> 1) - 64;
if (size > SKB_MAX_ALLOC) /* allow fallback to order-0 allocations */
size = SKB_MAX_ALLOC; size = min_t(int, size, SKB_MAX_HEAD(0) + UNIX_SKB_FRAGS_SZ);
/*
* Grab a buffer
*/
skb = sock_alloc_send_skb(sk, size, msg->msg_flags&MSG_DONTWAIT, data_len = max_t(int, 0, size - SKB_MAX_HEAD(0));
&err);
if (skb == NULL) skb = sock_alloc_send_pskb(sk, size - data_len, data_len,
msg->msg_flags & MSG_DONTWAIT, &err);
if (!skb)
goto out_err; goto out_err;
/*
* If you pass two values to the sock_alloc_send_skb
* it tries to grab the large buffer with GFP_NOFS
* (which can fail easily), and if it fails grab the
* fallback size buffer which is under a page and will
* succeed. [Alan]
*/
size = min_t(int, size, skb_tailroom(skb));
/* Only send the fds in the first buffer */ /* Only send the fds in the first buffer */
err = unix_scm_to_skb(siocb->scm, skb, !fds_sent); err = unix_scm_to_skb(siocb->scm, skb, !fds_sent);
if (err < 0) { if (err < 0) {
...@@ -1678,7 +1664,10 @@ static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock, ...@@ -1678,7 +1664,10 @@ static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock,
max_level = err + 1; max_level = err + 1;
fds_sent = true; fds_sent = true;
err = memcpy_fromiovec(skb_put(skb, size), msg->msg_iov, size); skb_put(skb, size - data_len);
skb->data_len = data_len;
skb->len = size;
err = skb_copy_datagram_from_iovec(skb, 0, msg->msg_iov, 0, size);
if (err) { if (err) {
kfree_skb(skb); kfree_skb(skb);
goto out_err; goto out_err;
...@@ -1890,6 +1879,11 @@ static long unix_stream_data_wait(struct sock *sk, long timeo, ...@@ -1890,6 +1879,11 @@ static long unix_stream_data_wait(struct sock *sk, long timeo,
return timeo; return timeo;
} }
/*
 * unix_skb_len - payload still waiting to be read from a stream skb.
 * @skb: buffer queued on a unix socket receive queue
 *
 * The receive path no longer trims the skb as data is read; it advances
 * UNIXCB(skb).consumed instead. The remaining length is therefore the
 * total skb length minus the amount already consumed.
 *
 * Returns the number of unread bytes in @skb.
 */
static unsigned int unix_skb_len(const struct sk_buff *skb)
{
	const unsigned int total = skb->len;
	const unsigned int already_read = UNIXCB(skb).consumed;

	return total - already_read;
}
static int unix_stream_recvmsg(struct kiocb *iocb, struct socket *sock, static int unix_stream_recvmsg(struct kiocb *iocb, struct socket *sock,
struct msghdr *msg, size_t size, struct msghdr *msg, size_t size,
int flags) int flags)
...@@ -1977,8 +1971,8 @@ static int unix_stream_recvmsg(struct kiocb *iocb, struct socket *sock, ...@@ -1977,8 +1971,8 @@ static int unix_stream_recvmsg(struct kiocb *iocb, struct socket *sock,
} }
skip = sk_peek_offset(sk, flags); skip = sk_peek_offset(sk, flags);
while (skip >= skb->len) { while (skip >= unix_skb_len(skb)) {
skip -= skb->len; skip -= unix_skb_len(skb);
last = skb; last = skb;
skb = skb_peek_next(skb, &sk->sk_receive_queue); skb = skb_peek_next(skb, &sk->sk_receive_queue);
if (!skb) if (!skb)
...@@ -2005,8 +1999,9 @@ static int unix_stream_recvmsg(struct kiocb *iocb, struct socket *sock, ...@@ -2005,8 +1999,9 @@ static int unix_stream_recvmsg(struct kiocb *iocb, struct socket *sock,
sunaddr = NULL; sunaddr = NULL;
} }
chunk = min_t(unsigned int, skb->len - skip, size); chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size);
if (memcpy_toiovec(msg->msg_iov, skb->data + skip, chunk)) { if (skb_copy_datagram_iovec(skb, UNIXCB(skb).consumed + skip,
msg->msg_iov, chunk)) {
if (copied == 0) if (copied == 0)
copied = -EFAULT; copied = -EFAULT;
break; break;
...@@ -2016,14 +2011,14 @@ static int unix_stream_recvmsg(struct kiocb *iocb, struct socket *sock, ...@@ -2016,14 +2011,14 @@ static int unix_stream_recvmsg(struct kiocb *iocb, struct socket *sock,
/* Mark read part of skb as used */ /* Mark read part of skb as used */
if (!(flags & MSG_PEEK)) { if (!(flags & MSG_PEEK)) {
skb_pull(skb, chunk); UNIXCB(skb).consumed += chunk;
sk_peek_offset_bwd(sk, chunk); sk_peek_offset_bwd(sk, chunk);
if (UNIXCB(skb).fp) if (UNIXCB(skb).fp)
unix_detach_fds(siocb->scm, skb); unix_detach_fds(siocb->scm, skb);
if (skb->len) if (unix_skb_len(skb))
break; break;
skb_unlink(skb, &sk->sk_receive_queue); skb_unlink(skb, &sk->sk_receive_queue);
...@@ -2107,7 +2102,7 @@ long unix_inq_len(struct sock *sk) ...@@ -2107,7 +2102,7 @@ long unix_inq_len(struct sock *sk)
if (sk->sk_type == SOCK_STREAM || if (sk->sk_type == SOCK_STREAM ||
sk->sk_type == SOCK_SEQPACKET) { sk->sk_type == SOCK_SEQPACKET) {
skb_queue_walk(&sk->sk_receive_queue, skb) skb_queue_walk(&sk->sk_receive_queue, skb)
amount += skb->len; amount += unix_skb_len(skb);
} else { } else {
skb = skb_peek(&sk->sk_receive_queue); skb = skb_peek(&sk->sk_receive_queue);
if (skb) if (skb)
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册