diff --git a/Documentation/networking/timestamping.txt b/Documentation/networking/timestamping.txt index a5c784c893122788c8fd1d4db270576330b02518..5f0922613f1a8db7963fe3c44d21280d7290d84e 100644 --- a/Documentation/networking/timestamping.txt +++ b/Documentation/networking/timestamping.txt @@ -162,6 +162,27 @@ SOF_TIMESTAMPING_OPT_CMSG: option IP_PKTINFO simultaneously. +SOF_TIMESTAMPING_OPT_TSONLY: + + Applies to transmit timestamps only. Makes the kernel return the + timestamp as a cmsg alongside an empty packet, as opposed to + alongside the original packet. This reduces the amount of memory + charged to the socket's receive budget (SO_RCVBUF) and delivers + the timestamp even if sysctl net.core.tstamp_allow_data is 0. + This option disables SOF_TIMESTAMPING_OPT_CMSG. + + +New applications are encouraged to pass SOF_TIMESTAMPING_OPT_ID to +disambiguate timestamps and SOF_TIMESTAMPING_OPT_TSONLY to operate +regardless of the setting of sysctl net.core.tstamp_allow_data. + +An exception is when a process needs additional cmsg data, for +instance SOL_IP/IP_PKTINFO to detect the egress network interface. +Then pass option SOF_TIMESTAMPING_OPT_CMSG. This option depends on +having access to the contents of the original packet, so cannot be +combined with SOF_TIMESTAMPING_OPT_TSONLY. + + 1.4 Bytestream Timestamps The SO_TIMESTAMPING interface supports timestamping of bytes in a diff --git a/Documentation/networking/timestamping/txtimestamp.c b/Documentation/networking/timestamping/txtimestamp.c index 05694febc2387a914f0aeb90eb4478df65e011b9..8217510d3842b3a7e599a92c7f7502d2bbdcde86 100644 --- a/Documentation/networking/timestamping/txtimestamp.c +++ b/Documentation/networking/timestamping/txtimestamp.c @@ -70,6 +70,7 @@ static int do_ipv6 = 1; static int cfg_payload_len = 10; static bool cfg_show_payload; static bool cfg_do_pktinfo; +static bool cfg_loop_nodata; static uint16_t dest_port = 9000; static struct sockaddr_in daddr; @@ -141,6 +142,9 @@ static void print_payload(char *data, int len) { int i; + if (!len) + return; + if (len > 70) len = 70; @@ -177,6 +181,7 @@ static void __recv_errmsg_cmsg(struct msghdr *msg, int payload_len) struct sock_extended_err *serr = NULL; struct scm_timestamping *tss = NULL; struct cmsghdr *cm; + int batch = 0; for (cm = CMSG_FIRSTHDR(msg); cm && cm->cmsg_len; @@ -209,10 +214,18 @@ static void __recv_errmsg_cmsg(struct msghdr *msg, int payload_len) } else fprintf(stderr, "unknown cmsg %d,%d\n", cm->cmsg_level, cm->cmsg_type); + + if (serr && tss) { + print_timestamp(tss, serr->ee_info, serr->ee_data, + payload_len); + serr = NULL; + tss = NULL; + batch++; + } } - if (serr && tss) - print_timestamp(tss, serr->ee_info, serr->ee_data, payload_len); + if (batch > 1) + fprintf(stderr, "batched %d timestamps\n", batch); } static int recv_errmsg(int fd) @@ -244,7 +257,7 @@ static int recv_errmsg(int fd) if (ret == -1 && errno != EAGAIN) error(1, errno, "recvmsg"); - if (ret > 0) { + if (ret >= 0) { __recv_errmsg_cmsg(&msg, ret); if (cfg_show_payload) print_payload(data, cfg_payload_len); @@ -309,6 +322,9 @@ static void do_test(int family, unsigned int opt) opt |= SOF_TIMESTAMPING_SOFTWARE | SOF_TIMESTAMPING_OPT_CMSG | SOF_TIMESTAMPING_OPT_ID; + if (cfg_loop_nodata) + opt |= SOF_TIMESTAMPING_OPT_TSONLY; + if (setsockopt(fd, SOL_SOCKET, SO_TIMESTAMPING, (char *) &opt, sizeof(opt))) error(1, 0, "setsockopt timestamping"); @@ -378,6 +394,7 @@ static void __attribute__((noreturn)) usage(const char *filepath) " -h: show this message\n" " -I: request PKTINFO\n" " -l N: send N bytes at a time\n" + " -n: set no-payload option\n" " -r: use raw\n" " -R: use raw (IP_HDRINCL)\n" " -p N: connect to port N\n" @@ -392,7 +409,7 @@ static void parse_opt(int argc, char **argv) int proto_count = 0; char c; - while ((c = getopt(argc, argv, "46hIl:p:rRux")) != -1) { + while ((c = getopt(argc, argv, "46hIl:np:rRux")) != -1) { switch (c) { case '4': do_ipv6 = 0; @@ -403,6 +420,9 @@ static void parse_opt(int argc, char **argv) case 'I': cfg_do_pktinfo = true; break; + case 'n': + cfg_loop_nodata = true; + break; case 'r': proto_count++; cfg_proto = SOCK_RAW; diff --git a/Documentation/sysctl/net.txt b/Documentation/sysctl/net.txt index 666594b43cfff9f7a50678e3d74bd29c116d3648..6294b5186ae552b8b2fcb78ac4b8617ee6f8fe38 100644 --- a/Documentation/sysctl/net.txt +++ b/Documentation/sysctl/net.txt @@ -97,6 +97,14 @@ rmem_max The maximum receive socket buffer size in bytes. +tstamp_allow_data +----------------- +Allow processes to receive tx timestamps looped together with the original +packet contents. If disabled, transmit timestamp requests from unprivileged +processes are dropped unless socket option SOF_TIMESTAMPING_OPT_TSONLY is set. +Default: 1 (on) + + wmem_default ------------ diff --git a/include/net/sock.h b/include/net/sock.h index 15341499786cf3dbb5bfafff3174b5131ed99226..511ef7c8889b100e36c26d361c00e72f8b10e972 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2239,6 +2239,7 @@ bool sk_net_capable(const struct sock *sk, int cap); extern __u32 sysctl_wmem_max; extern __u32 sysctl_rmem_max; +extern int sysctl_tstamp_allow_data; extern int sysctl_optmem_max; extern __u32 sysctl_wmem_default; diff --git a/include/uapi/linux/net_tstamp.h b/include/uapi/linux/net_tstamp.h index edbc888ceb51c1ffa66b11c1f58eea0343b80faf..6d1abea9746ea7b7d1128a0c1fd1a3ad5f45b96d 100644 --- a/include/uapi/linux/net_tstamp.h +++ b/include/uapi/linux/net_tstamp.h @@ -24,8 +24,9 @@ enum { SOF_TIMESTAMPING_TX_SCHED = (1<<8), SOF_TIMESTAMPING_TX_ACK = (1<<9), SOF_TIMESTAMPING_OPT_CMSG = (1<<10), + SOF_TIMESTAMPING_OPT_TSONLY = (1<<11), - SOF_TIMESTAMPING_LAST = SOF_TIMESTAMPING_OPT_CMSG, + SOF_TIMESTAMPING_LAST = SOF_TIMESTAMPING_OPT_TSONLY, SOF_TIMESTAMPING_MASK = (SOF_TIMESTAMPING_LAST - 1) | SOF_TIMESTAMPING_LAST }; diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 56db472e9b864e805e0ab36dd73a0404d2fc66d5..a5bff2767f15abe09b5f0d0a3bfecfb5775a4e64 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -74,6 +74,8 @@ #include <asm/uaccess.h> #include <trace/events/skb.h> #include <linux/highmem.h> +#include <linux/capability.h> +#include <linux/user_namespace.h> struct kmem_cache *skbuff_head_cache __read_mostly; static struct kmem_cache *skbuff_fclone_cache __read_mostly; @@ -3690,11 +3692,28 @@ static void __skb_complete_tx_timestamp(struct sk_buff *skb, kfree_skb(skb); } +static bool skb_may_tx_timestamp(struct sock *sk, bool tsonly) +{ + bool ret; + + if (likely(sysctl_tstamp_allow_data || tsonly)) + return true; + + read_lock_bh(&sk->sk_callback_lock); + ret = sk->sk_socket && sk->sk_socket->file && + file_ns_capable(sk->sk_socket->file, &init_user_ns, CAP_NET_RAW); + read_unlock_bh(&sk->sk_callback_lock); + return ret; +} + void skb_complete_tx_timestamp(struct sk_buff *skb, struct skb_shared_hwtstamps *hwtstamps) { struct sock *sk = skb->sk; + if (!skb_may_tx_timestamp(sk, false)) + return; + /* take a reference to prevent skb_orphan() from freeing the socket */ sock_hold(sk); @@ -3710,19 +3729,28 @@ void __skb_tstamp_tx(struct sk_buff *orig_skb, struct sock *sk, int tstype) { struct sk_buff *skb; + bool tsonly = sk->sk_tsflags & SOF_TIMESTAMPING_OPT_TSONLY; - if (!sk) + if (!sk || !skb_may_tx_timestamp(sk, tsonly)) return; - if (hwtstamps) - *skb_hwtstamps(orig_skb) = *hwtstamps; + if (tsonly) + skb = alloc_skb(0, GFP_ATOMIC); else - orig_skb->tstamp = ktime_get_real(); - - skb = skb_clone(orig_skb, GFP_ATOMIC); + skb = skb_clone(orig_skb, GFP_ATOMIC); if (!skb) return; + if (tsonly) { + skb_shinfo(skb)->tx_flags = skb_shinfo(orig_skb)->tx_flags; + skb_shinfo(skb)->tskey = skb_shinfo(orig_skb)->tskey; + } + + if (hwtstamps) + *skb_hwtstamps(skb) = *hwtstamps; + else + skb->tstamp = ktime_get_real(); + __skb_complete_tx_timestamp(skb, sk, tstype); } EXPORT_SYMBOL_GPL(__skb_tstamp_tx); diff --git a/net/core/sock.c b/net/core/sock.c index 1c7a33db1314f3e2a7ded154b9ea498023d2d2e4..93c8b20c91e496648f7f2e5c769c062ca2dd679d 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -325,6 +325,8 @@ __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX; int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512); EXPORT_SYMBOL(sysctl_optmem_max); +int sysctl_tstamp_allow_data __read_mostly = 1; + struct static_key memalloc_socks = STATIC_KEY_INIT_FALSE; EXPORT_SYMBOL_GPL(memalloc_socks); @@ -840,6 +842,7 @@ int sock_setsockopt(struct socket *sock, int level, int optname, ret = -EINVAL; break; } + if (val & SOF_TIMESTAMPING_OPT_ID && !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) { if (sk->sk_protocol == IPPROTO_TCP) { diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index 31baba2a71ce15e49450f69dae81e7d3be1ff3f2..fde21d19e61b3a10c17a7eca093f05de41cabceb 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -321,6 +321,15 @@ static struct ctl_table net_core_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, + { + .procname = "tstamp_allow_data", + .data = &sysctl_tstamp_allow_data, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &one + }, #ifdef CONFIG_RPS { .procname = "rps_sock_flow_entries", diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index db5e0f81ce0a45713357b0e71bd2d794321edfb9..31d8c71986b40e28e5c84f5e62d474a639d3bf91 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -483,7 +483,7 @@ int ip_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len) serr = SKB_EXT_ERR(skb); - if (sin) { + if (sin && skb->len) { sin->sin_family = AF_INET; sin->sin_addr.s_addr = *(__be32 *)(skb_network_header(skb) + serr->addr_offset); @@ -496,8 +496,9 @@ int ip_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len) sin = &errhdr.offender; memset(sin, 0, sizeof(*sin)); - if (serr->ee.ee_origin == SO_EE_ORIGIN_ICMP || - ipv4_pktinfo_prepare_errqueue(sk, skb, serr->ee.ee_origin)) { + if (skb->len && + (serr->ee.ee_origin == SO_EE_ORIGIN_ICMP || + ipv4_pktinfo_prepare_errqueue(sk, skb, serr->ee.ee_origin))) { sin->sin_family = AF_INET; sin->sin_addr.s_addr = ip_hdr(skb)->saddr; if (inet_sk(sk)->cmsg_flags) diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c index 49f5e73db1224549c7f3fc156209a85ed6ce4056..c215be70cac08af78953ea860c5d67cf9d3fa642 100644 --- a/net/ipv6/datagram.c +++ b/net/ipv6/datagram.c @@ -369,7 +369,7 @@ int ipv6_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len) serr = SKB_EXT_ERR(skb); - if (sin) { + if (sin && skb->len) { const unsigned char *nh = skb_network_header(skb); sin->sin6_family = AF_INET6; sin->sin6_flowinfo = 0; @@ -394,8 +394,7 @@ int ipv6_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len) memcpy(&errhdr.ee, &serr->ee, sizeof(struct sock_extended_err)); sin = &errhdr.offender; memset(sin, 0, sizeof(*sin)); - - if (serr->ee.ee_origin != SO_EE_ORIGIN_LOCAL) { + if (serr->ee.ee_origin != SO_EE_ORIGIN_LOCAL && skb->len) { sin->sin6_family = AF_INET6; if (np->rxopt.all) { if (serr->ee.ee_origin != SO_EE_ORIGIN_ICMP && diff --git a/net/rxrpc/ar-error.c b/net/rxrpc/ar-error.c index 74c0fcd36838dfc7326eeddb7726483576da0413..5394b6be46ecd5ebb6c677b745a4848977433f70 100644 --- a/net/rxrpc/ar-error.c +++ b/net/rxrpc/ar-error.c @@ -42,6 +42,11 @@ void rxrpc_UDP_error_report(struct sock *sk) _leave("UDP socket errqueue empty"); return; } + if (!skb->len) { + _leave("UDP empty message"); + kfree_skb(skb); + return; + } rxrpc_new_skb(skb);