提交 a624f93c 编写于 作者: D David S. Miller

Merge branch 'tcp-bbr'

Neal Cardwell says:

====================
tcp: BBR congestion control algorithm

This patch series implements a new TCP congestion control algorithm:
BBR (Bottleneck Bandwidth and RTT). A paper with a detailed
description of BBR will be published in ACM Queue, September-October
2016, as "BBR: Congestion-Based Congestion Control". BBR is widely
deployed in production at Google.

The patch series starts with a set of supporting infrastructure
changes, including a few that extend the congestion control
framework. The last patch adds BBR as a TCP congestion control
module. Please see individual patches for the details.

- v3 -> v4:
 - Updated tcp_bbr.c in "tcp_bbr: add BBR congestion control"
   to use const to qualify all the constant parameters.
   Thanks to Stephen Hemminger.
 - In "tcp_bbr: add BBR congestion control", remove the bbr_rate_kbps()
   function, which had a 64-bit divide that would be problematic on some
   architectures, and just use bbr_rate_bytes_per_sec() directly.
   Thanks to Kenneth Klette Jonassen for suggesting this.
 - In "tcp: switch back to proper tcp_skb_cb size check in tcp_init()",
   switched from sizeof(skb->cb) to FIELD_SIZEOF.
   Thanks to Lance Richardson for suggesting this.
 - Updated "tcp_bbr: add BBR congestion control" commit message with
   performance data, more details about deployment at Google, and
   another reminder to use fq with BBR.
 - Updated tcp_bbr.c in "tcp_bbr: add BBR congestion control"
   to use MODULE_LICENSE("Dual BSD/GPL").

- v2 -> v3: fix another issue caught by build bots:
 - adjust rate_sample struct initialization syntax to allow gcc-4.4 to compile
   the "tcp: track data delivery rate for a TCP connection" patch; also
   adjusted some similar syntax in "tcp_bbr: add BBR congestion control"

- v1 -> v2: fix issues caught by build bots:
 - fix "tcp: export data delivery rate" to use rate64 instead of rate,
   so there is a 64-bit numerator for the do_div call
 - fix conflicting definitions for minmax caused by
   "tcp: use windowed min filter library for TCP min_rtt estimation"
   with a new commit:
   tcp: cdg: rename struct minmax in tcp_cdg.c to avoid a naming conflict
 - fix warning about the use of __packed in
   "tcp: track data delivery rate for a TCP connection",
   which involves the addition of a new commit:
   tcp: switch back to proper tcp_skb_cb size check in tcp_init()
====================
Signed-off-by: NDavid S. Miller <davem@davemloft.net>
......@@ -19,6 +19,7 @@
#include <linux/skbuff.h>
#include <linux/win_minmax.h>
#include <net/sock.h>
#include <net/inet_connection_sock.h>
#include <net/inet_timewait_sock.h>
......@@ -212,7 +213,8 @@ struct tcp_sock {
u8 reord; /* reordering detected */
} rack;
u16 advmss; /* Advertised MSS */
u8 unused;
u8 rate_app_limited:1, /* rate_{delivered,interval_us} limited? */
unused:7;
u8 nonagle : 4,/* Disable Nagle algorithm? */
thin_lto : 1,/* Use linear timeouts for thin streams */
thin_dupack : 1,/* Fast retransmit on first dupack */
......@@ -234,9 +236,7 @@ struct tcp_sock {
u32 mdev_max_us; /* maximal mdev for the last rtt period */
u32 rttvar_us; /* smoothed mdev_max */
u32 rtt_seq; /* sequence number to update rttvar */
struct rtt_meas {
u32 rtt, ts; /* RTT in usec and sampling time in jiffies. */
} rtt_min[3];
struct minmax rtt_min;
u32 packets_out; /* Packets which are "in flight" */
u32 retrans_out; /* Retransmitted packets out */
......@@ -268,6 +268,12 @@ struct tcp_sock {
* receiver in Recovery. */
u32 prr_out; /* Total number of pkts sent during Recovery. */
u32 delivered; /* Total data packets delivered incl. rexmits */
u32 lost; /* Total data packets lost incl. rexmits */
u32 app_limited; /* limited until "delivered" reaches this val */
struct skb_mstamp first_tx_mstamp; /* start of window send phase */
struct skb_mstamp delivered_mstamp; /* time we reached "delivered" */
u32 rate_delivered; /* saved rate sample: packets delivered */
u32 rate_interval_us; /* saved rate sample: time elapsed */
u32 rcv_wnd; /* Current receiver window */
u32 write_seq; /* Tail(+1) of data held in tcp send buffer */
......
/**
* lib/minmax.c: windowed min/max tracker by Kathleen Nichols.
*
*/
#ifndef MINMAX_H
#define MINMAX_H
#include <linux/types.h>
/* A single data point for our parameterized min-max tracker */
struct minmax_sample {
u32 t; /* time measurement was taken */
u32 v; /* value measured */
};
/* State for the parameterized min-max tracker */
struct minmax {
struct minmax_sample s[3];
};
static inline u32 minmax_get(const struct minmax *m)
{
return m->s[0].v;
}
static inline u32 minmax_reset(struct minmax *m, u32 t, u32 meas)
{
struct minmax_sample val = { .t = t, .v = meas };
m->s[2] = m->s[1] = m->s[0] = val;
return m->s[0].v;
}
u32 minmax_running_max(struct minmax *m, u32 win, u32 t, u32 meas);
u32 minmax_running_min(struct minmax *m, u32 win, u32 t, u32 meas);
#endif
......@@ -134,8 +134,8 @@ struct inet_connection_sock {
} icsk_mtup;
u32 icsk_user_timeout;
u64 icsk_ca_priv[64 / sizeof(u64)];
#define ICSK_CA_PRIV_SIZE (8 * sizeof(u64))
u64 icsk_ca_priv[88 / sizeof(u64)];
#define ICSK_CA_PRIV_SIZE (11 * sizeof(u64))
};
#define ICSK_TIME_RETRANS 1 /* Retransmit timer */
......
......@@ -533,6 +533,8 @@ __u32 cookie_v6_init_sequence(const struct sk_buff *skb, __u16 *mss);
#endif
/* tcp_output.c */
u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now,
int min_tso_segs);
void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss,
int nonagle);
bool tcp_may_send_now(struct sock *sk);
......@@ -671,7 +673,7 @@ static inline bool tcp_ca_dst_locked(const struct dst_entry *dst)
/* Minimum RTT in usec. ~0 means not available. */
static inline u32 tcp_min_rtt(const struct tcp_sock *tp)
{
return tp->rtt_min[0].rtt;
return minmax_get(&tp->rtt_min);
}
/* Compute the actual receive window we are currently advertising.
......@@ -763,8 +765,16 @@ struct tcp_skb_cb {
__u32 ack_seq; /* Sequence number ACK'd */
union {
struct {
/* There is space for up to 20 bytes */
__u32 in_flight;/* Bytes in flight when packet sent */
/* There is space for up to 24 bytes */
__u32 in_flight:30,/* Bytes in flight at transmit */
is_app_limited:1, /* cwnd not fully used? */
unused:1;
/* pkts S/ACKed so far upon tx of skb, incl retrans: */
__u32 delivered;
/* start of send pipeline phase */
struct skb_mstamp first_tx_mstamp;
/* when we reached the "delivered" count */
struct skb_mstamp delivered_mstamp;
} tx; /* only used for outgoing skbs */
union {
struct inet_skb_parm h4;
......@@ -860,6 +870,27 @@ struct ack_sample {
u32 in_flight;
};
/* A rate sample measures the number of (original/retransmitted) data
* packets delivered "delivered" over an interval of time "interval_us".
* The tcp_rate.c code fills in the rate sample, and congestion
* control modules that define a cong_control function to run at the end
* of ACK processing can optionally chose to consult this sample when
* setting cwnd and pacing rate.
* A sample is invalid if "delivered" or "interval_us" is negative.
*/
struct rate_sample {
struct skb_mstamp prior_mstamp; /* starting timestamp for interval */
u32 prior_delivered; /* tp->delivered at "prior_mstamp" */
s32 delivered; /* number of packets delivered over interval */
long interval_us; /* time for tp->delivered to incr "delivered" */
long rtt_us; /* RTT of last (S)ACKed packet (or -1) */
int losses; /* number of packets marked lost upon ACK */
u32 acked_sacked; /* number of packets newly (S)ACKed upon ACK */
u32 prior_in_flight; /* in flight before this ACK */
bool is_app_limited; /* is sample from packet with bubble in pipe? */
bool is_retrans; /* is sample from retransmission? */
};
struct tcp_congestion_ops {
struct list_head list;
u32 key;
......@@ -884,6 +915,14 @@ struct tcp_congestion_ops {
u32 (*undo_cwnd)(struct sock *sk);
/* hook for packet ack accounting (optional) */
void (*pkts_acked)(struct sock *sk, const struct ack_sample *sample);
/* suggest number of segments for each skb to transmit (optional) */
u32 (*tso_segs_goal)(struct sock *sk);
/* returns the multiplier used in tcp_sndbuf_expand (optional) */
u32 (*sndbuf_expand)(struct sock *sk);
/* call when packets are delivered to update cwnd and pacing rate,
* after all the ca_state processing. (optional)
*/
void (*cong_control)(struct sock *sk, const struct rate_sample *rs);
/* get info for inet_diag (optional) */
size_t (*get_info)(struct sock *sk, u32 ext, int *attr,
union tcp_cc_info *info);
......@@ -946,6 +985,14 @@ static inline void tcp_ca_event(struct sock *sk, const enum tcp_ca_event event)
icsk->icsk_ca_ops->cwnd_event(sk, event);
}
/* From tcp_rate.c */
void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb);
void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb,
struct rate_sample *rs);
void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost,
struct skb_mstamp *now, struct rate_sample *rs);
void tcp_rate_check_app_limited(struct sock *sk);
/* These functions determine how the current flow behaves in respect of SACK
* handling. SACK is negotiated with the peer, and therefore it can vary
* between different flows.
......
......@@ -124,6 +124,7 @@ enum {
INET_DIAG_PEERS,
INET_DIAG_PAD,
INET_DIAG_MARK,
INET_DIAG_BBRINFO,
__INET_DIAG_MAX,
};
......@@ -157,8 +158,20 @@ struct tcp_dctcp_info {
__u32 dctcp_ab_tot;
};
/* INET_DIAG_BBRINFO */
struct tcp_bbr_info {
/* u64 bw: max-filtered BW (app throughput) estimate in Byte per sec: */
__u32 bbr_bw_lo; /* lower 32 bits of bw */
__u32 bbr_bw_hi; /* upper 32 bits of bw */
__u32 bbr_min_rtt; /* min-filtered RTT in uSec */
__u32 bbr_pacing_gain; /* pacing gain shifted left 8 bits */
__u32 bbr_cwnd_gain; /* cwnd gain shifted left 8 bits */
};
union tcp_cc_info {
struct tcpvegas_info vegas;
struct tcp_dctcp_info dctcp;
struct tcp_bbr_info bbr;
};
#endif /* _UAPI_INET_DIAG_H_ */
......@@ -792,6 +792,8 @@ enum {
TCA_FQ_ORPHAN_MASK, /* mask applied to orphaned skb hashes */
TCA_FQ_LOW_RATE_THRESHOLD, /* per packet delay under this rate */
__TCA_FQ_MAX
};
......
......@@ -167,6 +167,7 @@ struct tcp_info {
__u8 tcpi_backoff;
__u8 tcpi_options;
__u8 tcpi_snd_wscale : 4, tcpi_rcv_wscale : 4;
__u8 tcpi_delivery_rate_app_limited:1;
__u32 tcpi_rto;
__u32 tcpi_ato;
......@@ -211,6 +212,8 @@ struct tcp_info {
__u32 tcpi_min_rtt;
__u32 tcpi_data_segs_in; /* RFC4898 tcpEStatsDataSegsIn */
__u32 tcpi_data_segs_out; /* RFC4898 tcpEStatsDataSegsOut */
__u64 tcpi_delivery_rate;
};
/* for TCP_MD5SIG socket option */
......
......@@ -22,7 +22,7 @@ lib-y := ctype.o string.o vsprintf.o cmdline.o \
sha1.o chacha20.o md5.o irq_regs.o argv_split.o \
flex_proportions.o ratelimit.o show_mem.o \
is_single_threaded.o plist.o decompress.o kobject_uevent.o \
earlycpio.o seq_buf.o nmi_backtrace.o nodemask.o
earlycpio.o seq_buf.o nmi_backtrace.o nodemask.o win_minmax.o
lib-$(CONFIG_MMU) += ioremap.o
lib-$(CONFIG_SMP) += cpumask.o
......
/**
* lib/minmax.c: windowed min/max tracker
*
* Kathleen Nichols' algorithm for tracking the minimum (or maximum)
* value of a data stream over some fixed time interval. (E.g.,
* the minimum RTT over the past five minutes.) It uses constant
* space and constant time per update yet almost always delivers
* the same minimum as an implementation that has to keep all the
* data in the window.
*
* The algorithm keeps track of the best, 2nd best & 3rd best min
* values, maintaining an invariant that the measurement time of
* the n'th best >= n-1'th best. It also makes sure that the three
* values are widely separated in the time window since that bounds
* the worse case error when that data is monotonically increasing
* over the window.
*
* Upon getting a new min, we can forget everything earlier because
* it has no value - the new min is <= everything else in the window
* by definition and it's the most recent. So we restart fresh on
* every new min and overwrites 2nd & 3rd choices. The same property
* holds for 2nd & 3rd best.
*/
#include <linux/module.h>
#include <linux/win_minmax.h>
/* As time advances, update the 1st, 2nd, and 3rd choices. */
static u32 minmax_subwin_update(struct minmax *m, u32 win,
const struct minmax_sample *val)
{
u32 dt = val->t - m->s[0].t;
if (unlikely(dt > win)) {
/*
* Passed entire window without a new val so make 2nd
* choice the new val & 3rd choice the new 2nd choice.
* we may have to iterate this since our 2nd choice
* may also be outside the window (we checked on entry
* that the third choice was in the window).
*/
m->s[0] = m->s[1];
m->s[1] = m->s[2];
m->s[2] = *val;
if (unlikely(val->t - m->s[0].t > win)) {
m->s[0] = m->s[1];
m->s[1] = m->s[2];
m->s[2] = *val;
}
} else if (unlikely(m->s[1].t == m->s[0].t) && dt > win/4) {
/*
* We've passed a quarter of the window without a new val
* so take a 2nd choice from the 2nd quarter of the window.
*/
m->s[2] = m->s[1] = *val;
} else if (unlikely(m->s[2].t == m->s[1].t) && dt > win/2) {
/*
* We've passed half the window without finding a new val
* so take a 3rd choice from the last half of the window
*/
m->s[2] = *val;
}
return m->s[0].v;
}
/* Check if new measurement updates the 1st, 2nd or 3rd choice max. */
u32 minmax_running_max(struct minmax *m, u32 win, u32 t, u32 meas)
{
struct minmax_sample val = { .t = t, .v = meas };
if (unlikely(val.v >= m->s[0].v) || /* found new max? */
unlikely(val.t - m->s[2].t > win)) /* nothing left in window? */
return minmax_reset(m, t, meas); /* forget earlier samples */
if (unlikely(val.v >= m->s[1].v))
m->s[2] = m->s[1] = val;
else if (unlikely(val.v >= m->s[2].v))
m->s[2] = val;
return minmax_subwin_update(m, win, &val);
}
EXPORT_SYMBOL(minmax_running_max);
/* Check if new measurement updates the 1st, 2nd or 3rd choice min. */
u32 minmax_running_min(struct minmax *m, u32 win, u32 t, u32 meas)
{
struct minmax_sample val = { .t = t, .v = meas };
if (unlikely(val.v <= m->s[0].v) || /* found new min? */
unlikely(val.t - m->s[2].t > win)) /* nothing left in window? */
return minmax_reset(m, t, meas); /* forget earlier samples */
if (unlikely(val.v <= m->s[1].v))
m->s[2] = m->s[1] = val;
else if (unlikely(val.v <= m->s[2].v))
m->s[2] = val;
return minmax_subwin_update(m, win, &val);
}
......@@ -640,6 +640,21 @@ config TCP_CONG_CDG
D.A. Hayes and G. Armitage. "Revisiting TCP congestion control using
delay gradients." In Networking 2011. Preprint: http://goo.gl/No3vdg
config TCP_CONG_BBR
tristate "BBR TCP"
default n
---help---
BBR (Bottleneck Bandwidth and RTT) TCP congestion control aims to
maximize network utilization and minimize queues. It builds an explicit
model of the the bottleneck delivery rate and path round-trip
propagation delay. It tolerates packet loss and delay unrelated to
congestion. It can operate over LAN, WAN, cellular, wifi, or cable
modem links. It can coexist with flows that use loss-based congestion
control, and can operate with shallow buffers, deep buffers,
bufferbloat, policers, or AQM schemes that do not provide a delay
signal. It requires the fq ("Fair Queue") pacing packet scheduler.
choice
prompt "Default TCP congestion control"
default DEFAULT_CUBIC
......@@ -674,6 +689,9 @@ choice
config DEFAULT_CDG
bool "CDG" if TCP_CONG_CDG=y
config DEFAULT_BBR
bool "BBR" if TCP_CONG_BBR=y
config DEFAULT_RENO
bool "Reno"
endchoice
......
......@@ -8,7 +8,7 @@ obj-y := route.o inetpeer.o protocol.o \
inet_timewait_sock.o inet_connection_sock.o \
tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o \
tcp_minisocks.o tcp_cong.o tcp_metrics.o tcp_fastopen.o \
tcp_recovery.o \
tcp_rate.o tcp_recovery.o \
tcp_offload.o datagram.o raw.o udp.o udplite.o \
udp_offload.o arp.o icmp.o devinet.o af_inet.o igmp.o \
fib_frontend.o fib_semantics.o fib_trie.o \
......@@ -41,6 +41,7 @@ obj-$(CONFIG_INET_DIAG) += inet_diag.o
obj-$(CONFIG_INET_TCP_DIAG) += tcp_diag.o
obj-$(CONFIG_INET_UDP_DIAG) += udp_diag.o
obj-$(CONFIG_NET_TCPPROBE) += tcp_probe.o
obj-$(CONFIG_TCP_CONG_BBR) += tcp_bbr.o
obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o
obj-$(CONFIG_TCP_CONG_CDG) += tcp_cdg.o
obj-$(CONFIG_TCP_CONG_CUBIC) += tcp_cubic.o
......
......@@ -387,7 +387,7 @@ void tcp_init_sock(struct sock *sk)
icsk->icsk_rto = TCP_TIMEOUT_INIT;
tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT);
tp->rtt_min[0].rtt = ~0U;
minmax_reset(&tp->rtt_min, tcp_time_stamp, ~0U);
/* So many TCP implementations out there (incorrectly) count the
* initial SYN frame in their delayed-ACK and congestion control
......@@ -396,6 +396,9 @@ void tcp_init_sock(struct sock *sk)
*/
tp->snd_cwnd = TCP_INIT_CWND;
/* There's a bubble in the pipe until at least the first ACK. */
tp->app_limited = ~0U;
/* See draft-stevens-tcpca-spec-01 for discussion of the
* initialization of these values.
*/
......@@ -1014,6 +1017,9 @@ int tcp_sendpage(struct sock *sk, struct page *page, int offset,
flags);
lock_sock(sk);
tcp_rate_check_app_limited(sk); /* is sending application-limited? */
res = do_tcp_sendpages(sk, page, offset, size, flags);
release_sock(sk);
return res;
......@@ -1115,6 +1121,8 @@ int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
tcp_rate_check_app_limited(sk); /* is sending application-limited? */
/* Wait for a connection to finish. One exception is TCP Fast Open
* (passive side) where data is allowed to be sent before a connection
* is fully established.
......@@ -2704,7 +2712,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
{
const struct tcp_sock *tp = tcp_sk(sk); /* iff sk_type == SOCK_STREAM */
const struct inet_connection_sock *icsk = inet_csk(sk);
u32 now = tcp_time_stamp;
u32 now = tcp_time_stamp, intv;
unsigned int start;
int notsent_bytes;
u64 rate64;
......@@ -2794,6 +2802,15 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
info->tcpi_min_rtt = tcp_min_rtt(tp);
info->tcpi_data_segs_in = tp->data_segs_in;
info->tcpi_data_segs_out = tp->data_segs_out;
info->tcpi_delivery_rate_app_limited = tp->rate_app_limited ? 1 : 0;
rate = READ_ONCE(tp->rate_delivered);
intv = READ_ONCE(tp->rate_interval_us);
if (rate && intv) {
rate64 = (u64)rate * tp->mss_cache * USEC_PER_SEC;
do_div(rate64, intv);
put_unaligned(rate64, &info->tcpi_delivery_rate);
}
}
EXPORT_SYMBOL_GPL(tcp_get_info);
......@@ -3261,11 +3278,12 @@ static void __init tcp_init_mem(void)
void __init tcp_init(void)
{
unsigned long limit;
int max_rshare, max_wshare, cnt;
unsigned long limit;
unsigned int i;
sock_skb_cb_check_size(sizeof(struct tcp_skb_cb));
BUILD_BUG_ON(sizeof(struct tcp_skb_cb) >
FIELD_SIZEOF(struct sk_buff, cb));
percpu_counter_init(&tcp_sockets_allocated, 0, GFP_KERNEL);
percpu_counter_init(&tcp_orphan_count, 0, GFP_KERNEL);
......
此差异已折叠。
......@@ -56,7 +56,7 @@ MODULE_PARM_DESC(use_shadow, "use shadow window heuristic");
module_param(use_tolerance, bool, 0644);
MODULE_PARM_DESC(use_tolerance, "use loss tolerance heuristic");
struct minmax {
struct cdg_minmax {
union {
struct {
s32 min;
......@@ -74,10 +74,10 @@ enum cdg_state {
};
struct cdg {
struct minmax rtt;
struct minmax rtt_prev;
struct minmax *gradients;
struct minmax gsum;
struct cdg_minmax rtt;
struct cdg_minmax rtt_prev;
struct cdg_minmax *gradients;
struct cdg_minmax gsum;
bool gfilled;
u8 tail;
u8 state;
......@@ -353,7 +353,7 @@ static void tcp_cdg_cwnd_event(struct sock *sk, const enum tcp_ca_event ev)
{
struct cdg *ca = inet_csk_ca(sk);
struct tcp_sock *tp = tcp_sk(sk);
struct minmax *gradients;
struct cdg_minmax *gradients;
switch (ev) {
case CA_EVENT_CWND_RESTART:
......
......@@ -69,7 +69,7 @@ int tcp_register_congestion_control(struct tcp_congestion_ops *ca)
int ret = 0;
/* all algorithms must implement ssthresh and cong_avoid ops */
if (!ca->ssthresh || !ca->cong_avoid) {
if (!ca->ssthresh || !(ca->cong_avoid || ca->cong_control)) {
pr_err("%s does not implement required ops\n", ca->name);
return -EINVAL;
}
......
......@@ -289,6 +289,7 @@ static bool tcp_ecn_rcv_ecn_echo(const struct tcp_sock *tp, const struct tcphdr
static void tcp_sndbuf_expand(struct sock *sk)
{
const struct tcp_sock *tp = tcp_sk(sk);
const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops;
int sndmem, per_mss;
u32 nr_segs;
......@@ -309,7 +310,8 @@ static void tcp_sndbuf_expand(struct sock *sk)
* Cubic needs 1.7 factor, rounded to 2 to include
* extra cushion (application might react slowly to POLLOUT)
*/
sndmem = 2 * nr_segs * per_mss;
sndmem = ca_ops->sndbuf_expand ? ca_ops->sndbuf_expand(sk) : 2;
sndmem *= nr_segs * per_mss;
if (sk->sk_sndbuf < sndmem)
sk->sk_sndbuf = min(sndmem, sysctl_tcp_wmem[2]);
......@@ -899,12 +901,29 @@ static void tcp_verify_retransmit_hint(struct tcp_sock *tp, struct sk_buff *skb)
tp->retransmit_high = TCP_SKB_CB(skb)->end_seq;
}
/* Sum the number of packets on the wire we have marked as lost.
* There are two cases we care about here:
* a) Packet hasn't been marked lost (nor retransmitted),
* and this is the first loss.
* b) Packet has been marked both lost and retransmitted,
* and this means we think it was lost again.
*/
static void tcp_sum_lost(struct tcp_sock *tp, struct sk_buff *skb)
{
__u8 sacked = TCP_SKB_CB(skb)->sacked;
if (!(sacked & TCPCB_LOST) ||
((sacked & TCPCB_LOST) && (sacked & TCPCB_SACKED_RETRANS)))
tp->lost += tcp_skb_pcount(skb);
}
static void tcp_skb_mark_lost(struct tcp_sock *tp, struct sk_buff *skb)
{
if (!(TCP_SKB_CB(skb)->sacked & (TCPCB_LOST|TCPCB_SACKED_ACKED))) {
tcp_verify_retransmit_hint(tp, skb);
tp->lost_out += tcp_skb_pcount(skb);
tcp_sum_lost(tp, skb);
TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
}
}
......@@ -913,6 +932,7 @@ void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp, struct sk_buff *skb)
{
tcp_verify_retransmit_hint(tp, skb);
tcp_sum_lost(tp, skb);
if (!(TCP_SKB_CB(skb)->sacked & (TCPCB_LOST|TCPCB_SACKED_ACKED))) {
tp->lost_out += tcp_skb_pcount(skb);
TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
......@@ -1094,6 +1114,7 @@ struct tcp_sacktag_state {
*/
struct skb_mstamp first_sackt;
struct skb_mstamp last_sackt;
struct rate_sample *rate;
int flag;
};
......@@ -1261,6 +1282,7 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb,
tcp_sacktag_one(sk, state, TCP_SKB_CB(skb)->sacked,
start_seq, end_seq, dup_sack, pcount,
&skb->skb_mstamp);
tcp_rate_skb_delivered(sk, skb, state->rate);
if (skb == tp->lost_skb_hint)
tp->lost_cnt_hint += pcount;
......@@ -1311,6 +1333,9 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb,
tcp_advance_highest_sack(sk, skb);
tcp_skb_collapse_tstamp(prev, skb);
if (unlikely(TCP_SKB_CB(prev)->tx.delivered_mstamp.v64))
TCP_SKB_CB(prev)->tx.delivered_mstamp.v64 = 0;
tcp_unlink_write_queue(skb, sk);
sk_wmem_free_skb(sk, skb);
......@@ -1540,6 +1565,7 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk,
dup_sack,
tcp_skb_pcount(skb),
&skb->skb_mstamp);
tcp_rate_skb_delivered(sk, skb, state->rate);
if (!before(TCP_SKB_CB(skb)->seq,
tcp_highest_sack_seq(tp)))
......@@ -1622,8 +1648,10 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb,
found_dup_sack = tcp_check_dsack(sk, ack_skb, sp_wire,
num_sacks, prior_snd_una);
if (found_dup_sack)
if (found_dup_sack) {
state->flag |= FLAG_DSACKING_ACK;
tp->delivered++; /* A spurious retransmission is delivered */
}
/* Eliminate too old ACKs, but take into
* account more or less fresh ones, they can
......@@ -1890,6 +1918,7 @@ void tcp_enter_loss(struct sock *sk)
struct sk_buff *skb;
bool new_recovery = icsk->icsk_ca_state < TCP_CA_Recovery;
bool is_reneg; /* is receiver reneging on SACKs? */
bool mark_lost;
/* Reduce ssthresh if it has not yet been made inside this window. */
if (icsk->icsk_ca_state <= TCP_CA_Disorder ||
......@@ -1923,8 +1952,12 @@ void tcp_enter_loss(struct sock *sk)
if (skb == tcp_send_head(sk))
break;
mark_lost = (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) ||
is_reneg);
if (mark_lost)
tcp_sum_lost(tp, skb);
TCP_SKB_CB(skb)->sacked &= (~TCPCB_TAGBITS)|TCPCB_SACKED_ACKED;
if (!(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED) || is_reneg) {
if (mark_lost) {
TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_ACKED;
TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
tp->lost_out += tcp_skb_pcount(skb);
......@@ -2503,6 +2536,9 @@ static inline void tcp_end_cwnd_reduction(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
if (inet_csk(sk)->icsk_ca_ops->cong_control)
return;
/* Reset cwnd to ssthresh in CWR or Recovery (unless it's undone) */
if (inet_csk(sk)->icsk_ca_state == TCP_CA_CWR ||
(tp->undo_marker && tp->snd_ssthresh < TCP_INFINITE_SSTHRESH)) {
......@@ -2879,67 +2915,13 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked,
*rexmit = REXMIT_LOST;
}
/* Kathleen Nichols' algorithm for tracking the minimum value of
* a data stream over some fixed time interval. (E.g., the minimum
* RTT over the past five minutes.) It uses constant space and constant
* time per update yet almost always delivers the same minimum as an
* implementation that has to keep all the data in the window.
*
* The algorithm keeps track of the best, 2nd best & 3rd best min
* values, maintaining an invariant that the measurement time of the
* n'th best >= n-1'th best. It also makes sure that the three values
* are widely separated in the time window since that bounds the worse
* case error when that data is monotonically increasing over the window.
*
* Upon getting a new min, we can forget everything earlier because it
* has no value - the new min is <= everything else in the window by
* definition and it's the most recent. So we restart fresh on every new min
* and overwrites 2nd & 3rd choices. The same property holds for 2nd & 3rd
* best.
*/
static void tcp_update_rtt_min(struct sock *sk, u32 rtt_us)
{
const u32 now = tcp_time_stamp, wlen = sysctl_tcp_min_rtt_wlen * HZ;
struct rtt_meas *m = tcp_sk(sk)->rtt_min;
struct rtt_meas rttm = {
.rtt = likely(rtt_us) ? rtt_us : jiffies_to_usecs(1),
.ts = now,
};
u32 elapsed;
/* Check if the new measurement updates the 1st, 2nd, or 3rd choices */
if (unlikely(rttm.rtt <= m[0].rtt))
m[0] = m[1] = m[2] = rttm;
else if (rttm.rtt <= m[1].rtt)
m[1] = m[2] = rttm;
else if (rttm.rtt <= m[2].rtt)
m[2] = rttm;
elapsed = now - m[0].ts;
if (unlikely(elapsed > wlen)) {
/* Passed entire window without a new min so make 2nd choice
* the new min & 3rd choice the new 2nd. So forth and so on.
*/
m[0] = m[1];
m[1] = m[2];
m[2] = rttm;
if (now - m[0].ts > wlen) {
m[0] = m[1];
m[1] = rttm;
if (now - m[0].ts > wlen)
m[0] = rttm;
}
} else if (m[1].ts == m[0].ts && elapsed > wlen / 4) {
/* Passed a quarter of the window without a new min so
* take 2nd choice from the 2nd quarter of the window.
*/
m[2] = m[1] = rttm;
} else if (m[2].ts == m[1].ts && elapsed > wlen / 2) {
/* Passed half the window without a new min so take the 3rd
* choice from the last half of the window.
*/
m[2] = rttm;
}
struct tcp_sock *tp = tcp_sk(sk);
u32 wlen = sysctl_tcp_min_rtt_wlen * HZ;
minmax_running_min(&tp->rtt_min, wlen, tcp_time_stamp,
rtt_us ? : jiffies_to_usecs(1));
}
static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag,
......@@ -3102,10 +3084,11 @@ static void tcp_ack_tstamp(struct sock *sk, struct sk_buff *skb,
*/
static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
u32 prior_snd_una, int *acked,
struct tcp_sacktag_state *sack)
struct tcp_sacktag_state *sack,
struct skb_mstamp *now)
{
const struct inet_connection_sock *icsk = inet_csk(sk);
struct skb_mstamp first_ackt, last_ackt, now;
struct skb_mstamp first_ackt, last_ackt;
struct tcp_sock *tp = tcp_sk(sk);
u32 prior_sacked = tp->sacked_out;
u32 reord = tp->packets_out;
......@@ -3137,7 +3120,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
acked_pcount = tcp_tso_acked(sk, skb);
if (!acked_pcount)
break;
fully_acked = false;
} else {
/* Speedup tcp_unlink_write_queue() and next loop */
......@@ -3173,6 +3155,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
tp->packets_out -= acked_pcount;
pkts_acked += acked_pcount;
tcp_rate_skb_delivered(sk, skb, sack->rate);
/* Initial outgoing SYN's get put onto the write_queue
* just like anything else we transmit. It is not
......@@ -3205,16 +3188,15 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
if (skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED))
flag |= FLAG_SACK_RENEGING;
skb_mstamp_get(&now);
if (likely(first_ackt.v64) && !(flag & FLAG_RETRANS_DATA_ACKED)) {
seq_rtt_us = skb_mstamp_us_delta(&now, &first_ackt);
ca_rtt_us = skb_mstamp_us_delta(&now, &last_ackt);
seq_rtt_us = skb_mstamp_us_delta(now, &first_ackt);
ca_rtt_us = skb_mstamp_us_delta(now, &last_ackt);
}
if (sack->first_sackt.v64) {
sack_rtt_us = skb_mstamp_us_delta(&now, &sack->first_sackt);
ca_rtt_us = skb_mstamp_us_delta(&now, &sack->last_sackt);
sack_rtt_us = skb_mstamp_us_delta(now, &sack->first_sackt);
ca_rtt_us = skb_mstamp_us_delta(now, &sack->last_sackt);
}
sack->rate->rtt_us = ca_rtt_us; /* RTT of last (S)ACKed packet, or -1 */
rtt_update = tcp_ack_update_rtt(sk, flag, seq_rtt_us, sack_rtt_us,
ca_rtt_us);
......@@ -3242,7 +3224,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
tp->fackets_out -= min(pkts_acked, tp->fackets_out);
} else if (skb && rtt_update && sack_rtt_us >= 0 &&
sack_rtt_us > skb_mstamp_us_delta(&now, &skb->skb_mstamp)) {
sack_rtt_us > skb_mstamp_us_delta(now, &skb->skb_mstamp)) {
/* Do not re-arm RTO if the sack RTT is measured from data sent
* after when the head was last (re)transmitted. Otherwise the
* timeout may continue to extend in loss recovery.
......@@ -3333,8 +3315,15 @@ static inline bool tcp_may_raise_cwnd(const struct sock *sk, const int flag)
* information. All transmission or retransmission are delayed afterwards.
*/
static void tcp_cong_control(struct sock *sk, u32 ack, u32 acked_sacked,
int flag)
int flag, const struct rate_sample *rs)
{
const struct inet_connection_sock *icsk = inet_csk(sk);
if (icsk->icsk_ca_ops->cong_control) {
icsk->icsk_ca_ops->cong_control(sk, rs);
return;
}
if (tcp_in_cwnd_reduction(sk)) {
/* Reduce cwnd if state mandates */
tcp_cwnd_reduction(sk, acked_sacked, flag);
......@@ -3579,17 +3568,21 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
struct tcp_sacktag_state sack_state;
struct rate_sample rs = { .prior_delivered = 0 };
u32 prior_snd_una = tp->snd_una;
u32 ack_seq = TCP_SKB_CB(skb)->seq;
u32 ack = TCP_SKB_CB(skb)->ack_seq;
bool is_dupack = false;
u32 prior_fackets;
int prior_packets = tp->packets_out;
u32 prior_delivered = tp->delivered;
u32 delivered = tp->delivered;
u32 lost = tp->lost;
int acked = 0; /* Number of packets newly acked */
int rexmit = REXMIT_NONE; /* Flag to (re)transmit to recover losses */
struct skb_mstamp now;
sack_state.first_sackt.v64 = 0;
sack_state.rate = &rs;
/* We very likely will need to access write queue head. */
prefetchw(sk->sk_write_queue.next);
......@@ -3612,6 +3605,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
if (after(ack, tp->snd_nxt))
goto invalid_ack;
skb_mstamp_get(&now);
if (icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)
tcp_rearm_rto(sk);
......@@ -3622,6 +3617,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
}
prior_fackets = tp->fackets_out;
rs.prior_in_flight = tcp_packets_in_flight(tp);
/* ts_recent update must be made after we are sure that the packet
* is in window.
......@@ -3677,7 +3673,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
/* See if we can take anything off of the retransmit queue. */
flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una, &acked,
&sack_state);
&sack_state, &now);
if (tcp_ack_is_dubious(sk, flag)) {
is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP));
......@@ -3694,7 +3690,10 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
if (icsk->icsk_pending == ICSK_TIME_RETRANS)
tcp_schedule_loss_probe(sk);
tcp_cong_control(sk, ack, tp->delivered - prior_delivered, flag);
delivered = tp->delivered - delivered; /* freshly ACKed or SACKed */
lost = tp->lost - lost; /* freshly marked lost */
tcp_rate_gen(sk, delivered, lost, &now, &rs);
tcp_cong_control(sk, ack, delivered, flag, &rs);
tcp_xmit_recovery(sk, rexmit);
return 1;
......@@ -5993,7 +5992,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
} else
tcp_init_metrics(sk);
tcp_update_pacing_rate(sk);
if (!inet_csk(sk)->icsk_ca_ops->cong_control)
tcp_update_pacing_rate(sk);
/* Prevent spurious tcp_cwnd_restart() on first data packet */
tp->lsndtime = tcp_time_stamp;
......
......@@ -464,7 +464,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
newtp->srtt_us = 0;
newtp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT);
newtp->rtt_min[0].rtt = ~0U;
minmax_reset(&newtp->rtt_min, tcp_time_stamp, ~0U);
newicsk->icsk_rto = TCP_TIMEOUT_INIT;
newtp->packets_out = 0;
......@@ -487,6 +487,9 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
newtp->snd_cwnd = TCP_INIT_CWND;
newtp->snd_cwnd_cnt = 0;
/* There's a bubble in the pipe until at least the first ACK. */
newtp->app_limited = ~0U;
tcp_init_xmit_timers(newsk);
newtp->write_seq = newtp->pushed_seq = treq->snt_isn + 1;
......
......@@ -918,6 +918,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
skb_mstamp_get(&skb->skb_mstamp);
TCP_SKB_CB(skb)->tx.in_flight = TCP_SKB_CB(skb)->end_seq
- tp->snd_una;
tcp_rate_skb_sent(sk, skb);
if (unlikely(skb_cloned(skb)))
skb = pskb_copy(skb, gfp_mask);
......@@ -1213,6 +1214,9 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,
tcp_set_skb_tso_segs(skb, mss_now);
tcp_set_skb_tso_segs(buff, mss_now);
/* Update delivered info for the new segment */
TCP_SKB_CB(buff)->tx = TCP_SKB_CB(skb)->tx;
/* If this packet has been sent out already, we must
* adjust the various packet counters.
*/
......@@ -1358,6 +1362,7 @@ int tcp_mss_to_mtu(struct sock *sk, int mss)
}
return mtu;
}
EXPORT_SYMBOL(tcp_mss_to_mtu);
/* MTU probing init per socket */
void tcp_mtup_init(struct sock *sk)
......@@ -1545,7 +1550,8 @@ static bool tcp_nagle_check(bool partial, const struct tcp_sock *tp,
/* Return how many segs we'd like on a TSO packet,
* to send one TSO packet per ms
*/
static u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now)
u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now,
int min_tso_segs)
{
u32 bytes, segs;
......@@ -1557,10 +1563,23 @@ static u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now)
* This preserves ACK clocking and is consistent
* with tcp_tso_should_defer() heuristic.
*/
segs = max_t(u32, bytes / mss_now, sysctl_tcp_min_tso_segs);
segs = max_t(u32, bytes / mss_now, min_tso_segs);
return min_t(u32, segs, sk->sk_gso_max_segs);
}
EXPORT_SYMBOL(tcp_tso_autosize);
/* Return the number of segments we want in the skb we are transmitting.
* See if congestion control module wants to decide; otherwise, autosize.
*/
static u32 tcp_tso_segs(struct sock *sk, unsigned int mss_now)
{
const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops;
u32 tso_segs = ca_ops->tso_segs_goal ? ca_ops->tso_segs_goal(sk) : 0;
return tso_segs ? :
tcp_tso_autosize(sk, mss_now, sysctl_tcp_min_tso_segs);
}
/* Returns the portion of skb which can be sent right away */
static unsigned int tcp_mss_split_point(const struct sock *sk,
......@@ -2057,7 +2076,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
}
}
max_segs = tcp_tso_autosize(sk, mss_now);
max_segs = tcp_tso_segs(sk, mss_now);
while ((skb = tcp_send_head(sk))) {
unsigned int limit;
......@@ -2774,7 +2793,7 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
last_lost = tp->snd_una;
}
max_segs = tcp_tso_autosize(sk, tcp_current_mss(sk));
max_segs = tcp_tso_segs(sk, tcp_current_mss(sk));
tcp_for_write_queue_from(skb, sk) {
__u8 sacked;
int segs;
......
#include <net/tcp.h>
/* The bandwidth estimator estimates the rate at which the network
* can currently deliver outbound data packets for this flow. At a high
* level, it operates by taking a delivery rate sample for each ACK.
*
* A rate sample records the rate at which the network delivered packets
* for this flow, calculated over the time interval between the transmission
* of a data packet and the acknowledgment of that packet.
*
* Specifically, over the interval between each transmit and corresponding ACK,
* the estimator generates a delivery rate sample. Typically it uses the rate
* at which packets were acknowledged. However, the approach of using only the
* acknowledgment rate faces a challenge under the prevalent ACK decimation or
* compression: packets can temporarily appear to be delivered much quicker
* than the bottleneck rate. Since it is physically impossible to do that in a
* sustained fashion, when the estimator notices that the ACK rate is faster
* than the transmit rate, it uses the latter:
*
* send_rate = #pkts_delivered/(last_snd_time - first_snd_time)
* ack_rate = #pkts_delivered/(last_ack_time - first_ack_time)
* bw = min(send_rate, ack_rate)
*
* Notice the estimator essentially estimates the goodput, not always the
* network bottleneck link rate when the sending or receiving is limited by
* other factors like applications or receiver window limits. The estimator
* deliberately avoids using the inter-packet spacing approach because that
* approach requires a large number of samples and sophisticated filtering.
*
* TCP flows can often be application-limited in request/response workloads.
* The estimator marks a bandwidth sample as application-limited if there
* was some moment during the sampled window of packets when there was no data
* ready to send in the write queue.
*/
/* Snapshot the current delivery information in the skb, to generate
* a rate sample later when the skb is (s)acked in tcp_rate_skb_delivered().
*/
void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb)
{
struct tcp_sock *tp = tcp_sk(sk);
/* In general we need to start delivery rate samples from the
* time we received the most recent ACK, to ensure we include
* the full time the network needs to deliver all in-flight
* packets. If there are no packets in flight yet, then we
* know that any ACKs after now indicate that the network was
* able to deliver those packets completely in the sampling
* interval between now and the next ACK.
*
* Note that we use packets_out instead of tcp_packets_in_flight(tp)
* because the latter is a guess based on RTO and loss-marking
* heuristics. We don't want spurious RTOs or loss markings to cause
* a spuriously small time interval, causing a spuriously high
* bandwidth estimate.
*/
if (!tp->packets_out) {
tp->first_tx_mstamp = skb->skb_mstamp;
tp->delivered_mstamp = skb->skb_mstamp;
}
TCP_SKB_CB(skb)->tx.first_tx_mstamp = tp->first_tx_mstamp;
TCP_SKB_CB(skb)->tx.delivered_mstamp = tp->delivered_mstamp;
TCP_SKB_CB(skb)->tx.delivered = tp->delivered;
TCP_SKB_CB(skb)->tx.is_app_limited = tp->app_limited ? 1 : 0;
}
/* When an skb is sacked or acked, we fill in the rate sample with the (prior)
* delivery information when the skb was last transmitted.
*
* If an ACK (s)acks multiple skbs (e.g., stretched-acks), this function is
* called multiple times. We favor the information from the most recently
* sent skb, i.e., the skb with the highest prior_delivered count.
*/
void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb,
struct rate_sample *rs)
{
struct tcp_sock *tp = tcp_sk(sk);
struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
if (!scb->tx.delivered_mstamp.v64)
return;
if (!rs->prior_delivered ||
after(scb->tx.delivered, rs->prior_delivered)) {
rs->prior_delivered = scb->tx.delivered;
rs->prior_mstamp = scb->tx.delivered_mstamp;
rs->is_app_limited = scb->tx.is_app_limited;
rs->is_retrans = scb->sacked & TCPCB_RETRANS;
/* Find the duration of the "send phase" of this window: */
rs->interval_us = skb_mstamp_us_delta(
&skb->skb_mstamp,
&scb->tx.first_tx_mstamp);
/* Record send time of most recently ACKed packet: */
tp->first_tx_mstamp = skb->skb_mstamp;
}
/* Mark off the skb delivered once it's sacked to avoid being
* used again when it's cumulatively acked. For acked packets
* we don't need to reset since it'll be freed soon.
*/
if (scb->sacked & TCPCB_SACKED_ACKED)
scb->tx.delivered_mstamp.v64 = 0;
}
/* Update the connection delivery information and generate a rate sample. */
void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost,
struct skb_mstamp *now, struct rate_sample *rs)
{
struct tcp_sock *tp = tcp_sk(sk);
u32 snd_us, ack_us;
/* Clear app limited if bubble is acked and gone. */
if (tp->app_limited && after(tp->delivered, tp->app_limited))
tp->app_limited = 0;
/* TODO: there are multiple places throughout tcp_ack() to get
* current time. Refactor the code using a new "tcp_acktag_state"
* to carry current time, flags, stats like "tcp_sacktag_state".
*/
if (delivered)
tp->delivered_mstamp = *now;
rs->acked_sacked = delivered; /* freshly ACKed or SACKed */
rs->losses = lost; /* freshly marked lost */
/* Return an invalid sample if no timing information is available. */
if (!rs->prior_mstamp.v64) {
rs->delivered = -1;
rs->interval_us = -1;
return;
}
rs->delivered = tp->delivered - rs->prior_delivered;
/* Model sending data and receiving ACKs as separate pipeline phases
* for a window. Usually the ACK phase is longer, but with ACK
* compression the send phase can be longer. To be safe we use the
* longer phase.
*/
snd_us = rs->interval_us; /* send phase */
ack_us = skb_mstamp_us_delta(now, &rs->prior_mstamp); /* ack phase */
rs->interval_us = max(snd_us, ack_us);
/* Normally we expect interval_us >= min-rtt.
* Note that rate may still be over-estimated when a spuriously
* retransmistted skb was first (s)acked because "interval_us"
* is under-estimated (up to an RTT). However continuously
* measuring the delivery rate during loss recovery is crucial
* for connections suffer heavy or prolonged losses.
*/
if (unlikely(rs->interval_us < tcp_min_rtt(tp))) {
if (!rs->is_retrans)
pr_debug("tcp rate: %ld %d %u %u %u\n",
rs->interval_us, rs->delivered,
inet_csk(sk)->icsk_ca_state,
tp->rx_opt.sack_ok, tcp_min_rtt(tp));
rs->interval_us = -1;
return;
}
/* Record the last non-app-limited or the highest app-limited bw */
if (!rs->is_app_limited ||
((u64)rs->delivered * tp->rate_interval_us >=
(u64)tp->rate_delivered * rs->interval_us)) {
tp->rate_delivered = rs->delivered;
tp->rate_interval_us = rs->interval_us;
tp->rate_app_limited = rs->is_app_limited;
}
}
/* If a gap is detected between sends, mark the socket application-limited. */
void tcp_rate_check_app_limited(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
if (/* We have less than one packet to send. */
tp->write_seq - tp->snd_nxt < tp->mss_cache &&
/* Nothing in sending host's qdisc queues or NIC tx queue. */
sk_wmem_alloc_get(sk) < SKB_TRUESIZE(1) &&
/* We are not limited by CWND. */
tcp_packets_in_flight(tp) < tp->snd_cwnd &&
/* All lost packets have been retransmitted. */
tp->lost_out <= tp->retrans_out)
tp->app_limited =
(tp->delivered + tcp_packets_in_flight(tp)) ? : 1;
}
......@@ -94,6 +94,7 @@ struct fq_sched_data {
u32 flow_max_rate; /* optional max rate per flow */
u32 flow_plimit; /* max packets per flow */
u32 orphan_mask; /* mask for orphaned skb */
u32 low_rate_threshold;
struct rb_root *fq_root;
u8 rate_enable;
u8 fq_trees_log;
......@@ -433,7 +434,7 @@ static struct sk_buff *fq_dequeue(struct Qdisc *sch)
struct fq_flow_head *head;
struct sk_buff *skb;
struct fq_flow *f;
u32 rate;
u32 rate, plen;
skb = fq_dequeue_head(sch, &q->internal);
if (skb)
......@@ -482,7 +483,7 @@ static struct sk_buff *fq_dequeue(struct Qdisc *sch)
prefetch(&skb->end);
f->credit -= qdisc_pkt_len(skb);
if (f->credit > 0 || !q->rate_enable)
if (!q->rate_enable)
goto out;
/* Do not pace locally generated ack packets */
......@@ -493,8 +494,15 @@ static struct sk_buff *fq_dequeue(struct Qdisc *sch)
if (skb->sk)
rate = min(skb->sk->sk_pacing_rate, rate);
if (rate <= q->low_rate_threshold) {
f->credit = 0;
plen = qdisc_pkt_len(skb);
} else {
plen = max(qdisc_pkt_len(skb), q->quantum);
if (f->credit > 0)
goto out;
}
if (rate != ~0U) {
u32 plen = max(qdisc_pkt_len(skb), q->quantum);
u64 len = (u64)plen * NSEC_PER_SEC;
if (likely(rate))
......@@ -662,6 +670,7 @@ static const struct nla_policy fq_policy[TCA_FQ_MAX + 1] = {
[TCA_FQ_FLOW_MAX_RATE] = { .type = NLA_U32 },
[TCA_FQ_BUCKETS_LOG] = { .type = NLA_U32 },
[TCA_FQ_FLOW_REFILL_DELAY] = { .type = NLA_U32 },
[TCA_FQ_LOW_RATE_THRESHOLD] = { .type = NLA_U32 },
};
static int fq_change(struct Qdisc *sch, struct nlattr *opt)
......@@ -716,6 +725,10 @@ static int fq_change(struct Qdisc *sch, struct nlattr *opt)
if (tb[TCA_FQ_FLOW_MAX_RATE])
q->flow_max_rate = nla_get_u32(tb[TCA_FQ_FLOW_MAX_RATE]);
if (tb[TCA_FQ_LOW_RATE_THRESHOLD])
q->low_rate_threshold =
nla_get_u32(tb[TCA_FQ_LOW_RATE_THRESHOLD]);
if (tb[TCA_FQ_RATE_ENABLE]) {
u32 enable = nla_get_u32(tb[TCA_FQ_RATE_ENABLE]);
......@@ -781,6 +794,7 @@ static int fq_init(struct Qdisc *sch, struct nlattr *opt)
q->fq_root = NULL;
q->fq_trees_log = ilog2(1024);
q->orphan_mask = 1024 - 1;
q->low_rate_threshold = 550000 / 8;
qdisc_watchdog_init(&q->watchdog, sch);
if (opt)
......@@ -811,6 +825,8 @@ static int fq_dump(struct Qdisc *sch, struct sk_buff *skb)
nla_put_u32(skb, TCA_FQ_FLOW_REFILL_DELAY,
jiffies_to_usecs(q->flow_refill_delay)) ||
nla_put_u32(skb, TCA_FQ_ORPHAN_MASK, q->orphan_mask) ||
nla_put_u32(skb, TCA_FQ_LOW_RATE_THRESHOLD,
q->low_rate_threshold) ||
nla_put_u32(skb, TCA_FQ_BUCKETS_LOG, q->fq_trees_log))
goto nla_put_failure;
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册