/*
 *  net/dccp/ccids/ccid3.c
 *
 *  Copyright (c) 2007   The University of Aberdeen, Scotland, UK
 *  Copyright (c) 2005-7 The University of Waikato, Hamilton, New Zealand.
 *  Copyright (c) 2005-7 Ian McDonald <ian.mcdonald@jandi.co.nz>
 *
 *  An implementation of the DCCP protocol
 *
 *  This code has been developed by the University of Waikato WAND
 *  research group. For further information please see http://www.wand.net.nz/
 *
 *  This code also uses code from Lulea University, rereleased as GPL by its
 *  authors:
 *  Copyright (c) 2003 Nils-Erik Mattsson, Joacim Haggmark, Magnus Erixzon
 *
 *  Changes to meet Linux coding standards, to make it meet latest ccid3 draft
 *  and to make it work as a loadable module in the DCCP stack written by
 *  Arnaldo Carvalho de Melo <acme@conectiva.com.br>.
 *
 *  Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@conectiva.com.br>
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */
#include "../dccp.h"
#include "ccid3.h"

#include <asm/unaligned.h>

/*
 * Debug tracing for this file.
 *
 * With CONFIG_IP_DCCP_CCID3_DEBUG set, ccid3_pr_debug() forwards to
 * DCCP_PR_DEBUG(), gated by the ccid3_debug flag. Without it, the macro
 * expands to an empty statement: its arguments are discarded unevaluated
 * (the *_state_name() helpers used in debug calls only exist in debug
 * builds), yet it still behaves as exactly one statement in if/else bodies.
 */
#ifdef CONFIG_IP_DCCP_CCID3_DEBUG
static int ccid3_debug;
#define ccid3_pr_debug(format, a...)	DCCP_PR_DEBUG(ccid3_debug, format, ##a)
#else
#define ccid3_pr_debug(format, a...)	do { } while (0)
#endif

/*
 *	Transmitter Half-Connection Routines
 */
#ifdef CONFIG_IP_DCCP_CCID3_DEBUG
/*
 * Map a sender state to a printable name (debug builds only).
 *
 * The table is read-only. A value without a table entry yields "UNKNOWN"
 * instead of reading past the end of the array or returning a null string.
 */
static const char *ccid3_tx_state_name(enum ccid3_hc_tx_states state)
{
	static const char *const ccid3_state_names[] = {
	[TFRC_SSTATE_NO_SENT]  = "NO_SENT",
	[TFRC_SSTATE_NO_FBACK] = "NO_FBACK",
	[TFRC_SSTATE_FBACK]    = "FBACK",
	[TFRC_SSTATE_TERM]     = "TERM",
	};
	const unsigned int idx = state;

	if (idx >= sizeof(ccid3_state_names) / sizeof(ccid3_state_names[0]) ||
	    !ccid3_state_names[idx])
		return "UNKNOWN";

	return ccid3_state_names[idx];
}
#endif

66 67
static void ccid3_hc_tx_set_state(struct sock *sk,
				  enum ccid3_hc_tx_states state)
68
{
69
	struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk);
70 71 72
	enum ccid3_hc_tx_states oldstate = hctx->ccid3hctx_state;

	ccid3_pr_debug("%s(%p) %-8.8s -> %s\n",
73 74
		       dccp_role(sk), sk, ccid3_tx_state_name(oldstate),
		       ccid3_tx_state_name(state));
75 76 77 78
	WARN_ON(state == oldstate);
	hctx->ccid3hctx_state = state;
}

79
/*
80 81 82 83 84 85
 * Compute the initial sending rate X_init in the manner of RFC 3390:
 *
 *	X_init  =  min(4 * s, max(2 * s, 4380 bytes)) / RTT
 *
 * Note that RFC 3390 uses MSS, RFC 4342 refers to RFC 3390, and rfc3448bis
 * (rev-02) clarifies the use of RFC 3390 with regard to the above formula.
86 87 88 89
 * For consistency with other parts of the code, X_init is scaled by 2^6.
 */
static inline u64 rfc3390_initial_rate(struct sock *sk)
{
90 91 92
	const struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk);
	const __u32 w_init = min_t(__u32, 4 * hctx->ccid3hctx_s,
				   max_t(__u32, 2 * hctx->ccid3hctx_s, 4380));
93

94
	return scaled_div(w_init << 6, hctx->ccid3hctx_rtt);
95 96
}

97
/*
98
 * Recalculate t_ipi and delta (should be called whenever X changes)
99
 */
100
static inline void ccid3_update_send_interval(struct ccid3_hc_tx_sock *hctx)
101
{
102
	/* Calculate new t_ipi = s / X_inst (X_inst is in 64 * bytes/second) */
103 104
	hctx->ccid3hctx_t_ipi = scaled_div32(((u64)hctx->ccid3hctx_s) << 6,
					     hctx->ccid3hctx_x);
105

106
	/* Calculate new delta by delta = min(t_ipi / 2, t_gran / 2) */
107 108
	hctx->ccid3hctx_delta = min_t(u32, hctx->ccid3hctx_t_ipi / 2,
					   TFRC_OPSYS_HALF_TIME_GRAN);
I
Ian McDonald 已提交
109

110
	ccid3_pr_debug("t_ipi=%u, delta=%u, s=%u, X=%u\n",
I
Ian McDonald 已提交
111
		       hctx->ccid3hctx_t_ipi, hctx->ccid3hctx_delta,
112
		       hctx->ccid3hctx_s, (unsigned)(hctx->ccid3hctx_x >> 6));
I
Ian McDonald 已提交
113

114
}
115

116 117 118 119 120 121 122
static u32 ccid3_hc_tx_idle_rtt(struct ccid3_hc_tx_sock *hctx, ktime_t now)
{
	u32 delta = ktime_us_delta(now, hctx->ccid3hctx_t_last_win_count);

	return delta / hctx->ccid3hctx_rtt;
}

123 124 125 126
/**
 * ccid3_hc_tx_update_x  -  Update allowed sending rate X
 * @stamp: most recent time if available - can be left NULL.
 * This function tracks draft rfc3448bis, check there for latest details.
127
 *
128 129 130 131 132
 * Note: X and X_recv are both stored in units of 64 * bytes/second, to support
 *       fine-grained resolution of sending rates. This requires scaling by 2^6
 *       throughout the code. Only X_calc is unscaled (in bytes/second).
 *
 */
133
static void ccid3_hc_tx_update_x(struct sock *sk, ktime_t *stamp)
134

135
{
136
	struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk);
137
	__u64 min_rate = 2 * hctx->ccid3hctx_x_recv;
138
	const  __u64 old_x = hctx->ccid3hctx_x;
139
	ktime_t now = stamp? *stamp : ktime_get_real();
140

141 142
	/*
	 * Handle IDLE periods: do not reduce below RFC3390 initial sending rate
143 144
	 * when idling [RFC 4342, 5.1]. Definition of idling is from rfc3448bis:
	 * a sender is idle if it has not sent anything over a 2-RTT-period.
145 146
	 * For consistency with X and X_recv, min_rate is also scaled by 2^6.
	 */
147
	if (ccid3_hc_tx_idle_rtt(hctx, now) >= 2) {
148 149 150 151
		min_rate = rfc3390_initial_rate(sk);
		min_rate = max(min_rate, 2 * hctx->ccid3hctx_x_recv);
	}

152
	if (hctx->ccid3hctx_p > 0) {
153

154
		hctx->ccid3hctx_x = min(((__u64)hctx->ccid3hctx_x_calc) << 6,
155
					min_rate);
156 157
		hctx->ccid3hctx_x = max(hctx->ccid3hctx_x,
					(((__u64)hctx->ccid3hctx_s) << 6) /
158
								TFRC_T_MBI);
159

160 161
	} else if (ktime_us_delta(now, hctx->ccid3hctx_t_ld)
				- (s64)hctx->ccid3hctx_rtt >= 0) {
162

163 164 165 166 167
		hctx->ccid3hctx_x =
			max(min(2 * hctx->ccid3hctx_x, min_rate),
			    scaled_div(((__u64)hctx->ccid3hctx_s) << 6,
				       hctx->ccid3hctx_rtt));
		hctx->ccid3hctx_t_ld = now;
168
	}
169

I
Ian McDonald 已提交
170
	if (hctx->ccid3hctx_x != old_x) {
171 172 173 174 175
		ccid3_pr_debug("X_prev=%u, X_now=%u, X_calc=%u, "
			       "X_recv=%u\n", (unsigned)(old_x >> 6),
			       (unsigned)(hctx->ccid3hctx_x >> 6),
			       hctx->ccid3hctx_x_calc,
			       (unsigned)(hctx->ccid3hctx_x_recv >> 6));
I
Ian McDonald 已提交
176

177
		ccid3_update_send_interval(hctx);
I
Ian McDonald 已提交
178
	}
179 180
}

181
/*
182 183
 *	Track the mean packet size `s' (cf. RFC 4342, 5.3 and  RFC 3448, 4.1)
 *	@len: DCCP packet payload size in bytes
184 185 186
 */
static inline void ccid3_hc_tx_update_s(struct ccid3_hc_tx_sock *hctx, int len)
{
187 188
	const u16 old_s = hctx->ccid3hctx_s;

189
	hctx->ccid3hctx_s = tfrc_ewma(hctx->ccid3hctx_s, len, 9);
190 191 192

	if (hctx->ccid3hctx_s != old_s)
		ccid3_update_send_interval(hctx);
193 194
}

195
/*
196 197
 *	Update Window Counter using the algorithm from [RFC 4342, 8.1].
 *	The algorithm is not applicable if RTT < 4 microseconds.
198 199
 */
static inline void ccid3_hc_tx_update_win_count(struct ccid3_hc_tx_sock *hctx,
200
						ktime_t now)
201 202 203 204 205 206
{
	u32 quarter_rtts;

	if (unlikely(hctx->ccid3hctx_rtt < 4))	/* avoid divide-by-zero */
		return;

207 208
	quarter_rtts = ktime_us_delta(now, hctx->ccid3hctx_t_last_win_count);
	quarter_rtts /= hctx->ccid3hctx_rtt / 4;
209 210

	if (quarter_rtts > 0) {
211
		hctx->ccid3hctx_t_last_win_count = now;
212 213 214 215 216
		hctx->ccid3hctx_last_win_count	+= min_t(u32, quarter_rtts, 5);
		hctx->ccid3hctx_last_win_count	&= 0xF;		/* mod 16 */
	}
}

217 218 219
static void ccid3_hc_tx_no_feedback_timer(unsigned long data)
{
	struct sock *sk = (struct sock *)data;
220
	struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk);
221
	unsigned long t_nfb = USEC_PER_SEC / 5;
222 223 224 225 226

	bh_lock_sock(sk);
	if (sock_owned_by_user(sk)) {
		/* Try again later. */
		/* XXX: set some sensible MIB */
227
		goto restart_timer;
228 229
	}

230
	ccid3_pr_debug("%s(%p, state=%s) - entry \n", dccp_role(sk), sk,
231
		       ccid3_tx_state_name(hctx->ccid3hctx_state));
232

233 234
	switch (hctx->ccid3hctx_state) {
	case TFRC_SSTATE_NO_FBACK:
235
		/* RFC 3448, 4.4: Halve send rate directly */
236 237 238
		hctx->ccid3hctx_x = max(hctx->ccid3hctx_x / 2,
					(((__u64)hctx->ccid3hctx_s) << 6) /
								    TFRC_T_MBI);
239

240 241
		ccid3_pr_debug("%s(%p, state=%s), updated tx rate to %u "
			       "bytes/s\n", dccp_role(sk), sk,
242
			       ccid3_tx_state_name(hctx->ccid3hctx_state),
243
			       (unsigned)(hctx->ccid3hctx_x >> 6));
244
		/* The value of R is still undefined and so we can not recompute
J
Joe Perches 已提交
245
		 * the timeout value. Keep initial value as per [RFC 4342, 5]. */
246
		t_nfb = TFRC_INITIAL_TIMEOUT;
247
		ccid3_update_send_interval(hctx);
248 249
		break;
	case TFRC_SSTATE_FBACK:
250
		/*
251 252 253 254 255 256 257 258
		 *  Modify the cached value of X_recv [RFC 3448, 4.4]
		 *
		 *  If (p == 0 || X_calc > 2 * X_recv)
		 *    X_recv = max(X_recv / 2, s / (2 * t_mbi));
		 *  Else
		 *    X_recv = X_calc / 4;
		 *
		 *  Note that X_recv is scaled by 2^6 while X_calc is not
259
		 */
260 261 262 263 264 265 266 267 268 269 270 271
		BUG_ON(hctx->ccid3hctx_p && !hctx->ccid3hctx_x_calc);

		if (hctx->ccid3hctx_p == 0 ||
		    (hctx->ccid3hctx_x_calc > (hctx->ccid3hctx_x_recv >> 5))) {

			hctx->ccid3hctx_x_recv =
				max(hctx->ccid3hctx_x_recv / 2,
				    (((__u64)hctx->ccid3hctx_s) << 6) /
							      (2 * TFRC_T_MBI));
		} else {
			hctx->ccid3hctx_x_recv = hctx->ccid3hctx_x_calc;
			hctx->ccid3hctx_x_recv <<= 4;
272
		}
273
		/* Now recalculate X [RFC 3448, 4.3, step (4)] */
274
		ccid3_hc_tx_update_x(sk, NULL);
275 276
		/*
		 * Schedule no feedback timer to expire in
277 278
		 * max(t_RTO, 2 * s/X)  =  max(t_RTO, 2 * t_ipi)
		 * See comments in packet_recv() regarding the value of t_RTO.
279
		 */
280
		t_nfb = max(hctx->ccid3hctx_t_rto, 2 * hctx->ccid3hctx_t_ipi);
281
		break;
282
	case TFRC_SSTATE_NO_SENT:
283
		DCCP_BUG("%s(%p) - Illegal state NO_SENT", dccp_role(sk), sk);
284 285
		/* fall through */
	case TFRC_SSTATE_TERM:
286 287 288
		goto out;
	}

289 290
restart_timer:
	sk_reset_timer(sk, &hctx->ccid3hctx_no_feedback_timer,
291
			   jiffies + usecs_to_jiffies(t_nfb));
292 293 294 295 296
out:
	bh_unlock_sock(sk);
	sock_put(sk);
}

297 298 299 300 301 302
/*
 * returns
 *   > 0: delay (in msecs) that should pass before actually sending
 *   = 0: can send immediately
 *   < 0: error condition; do not send packet
 */
303
static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb)
304 305
{
	struct dccp_sock *dp = dccp_sk(sk);
306
	struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk);
307 308
	ktime_t now = ktime_get_real();
	s64 delay;
309 310

	/*
311 312 313
	 * This function is called only for Data and DataAck packets. Sending
	 * zero-sized Data(Ack)s is theoretically possible, but for congestion
	 * control this case is pathological - ignore it.
314
	 */
315
	if (unlikely(skb->len == 0))
316
		return -EBADMSG;
317 318 319

	switch (hctx->ccid3hctx_state) {
	case TFRC_SSTATE_NO_SENT:
320
		sk_reset_timer(sk, &hctx->ccid3hctx_no_feedback_timer,
321
			       (jiffies +
322
				usecs_to_jiffies(TFRC_INITIAL_TIMEOUT)));
323 324
		hctx->ccid3hctx_last_win_count	 = 0;
		hctx->ccid3hctx_t_last_win_count = now;
325 326

		/* Set t_0 for initial packet */
327
		hctx->ccid3hctx_t_nom = now;
328 329 330 331 332 333 334 335 336 337 338 339

		hctx->ccid3hctx_s = skb->len;

		/*
		 * Use initial RTT sample when available: recommended by erratum
		 * to RFC 4342. This implements the initialisation procedure of
		 * draft rfc3448bis, section 4.2. Remember, X is scaled by 2^6.
		 */
		if (dp->dccps_syn_rtt) {
			ccid3_pr_debug("SYN RTT = %uus\n", dp->dccps_syn_rtt);
			hctx->ccid3hctx_rtt  = dp->dccps_syn_rtt;
			hctx->ccid3hctx_x    = rfc3390_initial_rate(sk);
340
			hctx->ccid3hctx_t_ld = now;
341
		} else {
342 343
			/* Sender does not have RTT sample: X_pps = 1 pkt/sec */
			hctx->ccid3hctx_x = hctx->ccid3hctx_s;
344 345 346 347 348
			hctx->ccid3hctx_x <<= 6;
		}
		ccid3_update_send_interval(hctx);

		ccid3_hc_tx_set_state(sk, TFRC_SSTATE_NO_FBACK);
349 350 351
		break;
	case TFRC_SSTATE_NO_FBACK:
	case TFRC_SSTATE_FBACK:
352
		delay = ktime_us_delta(hctx->ccid3hctx_t_nom, now);
I
Ian McDonald 已提交
353
		ccid3_pr_debug("delay=%ld\n", (long)delay);
354
		/*
355
		 *	Scheduling of packet transmissions [RFC 3448, 4.6]
356 357 358 359 360 361
		 *
		 * if (t_now > t_nom - delta)
		 *       // send the packet now
		 * else
		 *       // send the packet in (t_nom - t_now) milliseconds.
		 */
362
		if (delay - (s64)hctx->ccid3hctx_delta >= 1000)
363
			return (u32)delay / 1000L;
364

365
		ccid3_hc_tx_update_win_count(hctx, now);
366
		break;
367
	case TFRC_SSTATE_TERM:
368
		DCCP_BUG("%s(%p) - Illegal state TERM", dccp_role(sk), sk);
369
		return -EINVAL;
370 371
	}

372 373
	/* prepare to send now (add options etc.) */
	dp->dccps_hc_tx_insert_options = 1;
374 375 376
	DCCP_SKB_CB(skb)->dccpd_ccval = hctx->ccid3hctx_last_win_count;

	/* set the nominal send time for the next following packet */
377 378
	hctx->ccid3hctx_t_nom = ktime_add_us(hctx->ccid3hctx_t_nom,
					     hctx->ccid3hctx_t_ipi);
379
	return 0;
380 381
}

382 383
static void ccid3_hc_tx_packet_sent(struct sock *sk, int more,
				    unsigned int len)
384
{
385
	struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk);
386

387
	ccid3_hc_tx_update_s(hctx, len);
388

389
	if (tfrc_tx_hist_add(&hctx->ccid3hctx_hist, dccp_sk(sk)->dccps_gss))
390
		DCCP_CRIT("packet history - out of memory!");
391 392 393 394
}

static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
{
395
	struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk);
396
	struct ccid3_options_received *opt_recv;
397
	ktime_t now;
398
	unsigned long t_nfb;
399
	u32 pinv, r_sample;
400

401 402 403 404 405 406 407 408 409 410
	/* we are only interested in ACKs */
	if (!(DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_ACK ||
	      DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_DATAACK))
		return;

	opt_recv = &hctx->ccid3hctx_options_received;

	switch (hctx->ccid3hctx_state) {
	case TFRC_SSTATE_NO_FBACK:
	case TFRC_SSTATE_FBACK:
411 412
		now = ktime_get_real();

413
		/* estimate RTT from history if ACK number is valid */
414 415 416
		r_sample = tfrc_tx_hist_rtt(hctx->ccid3hctx_hist,
					    DCCP_SKB_CB(skb)->dccpd_ack_seq, now);
		if (r_sample == 0) {
417 418 419
			DCCP_WARN("%s(%p): %s with bogus ACK-%llu\n", dccp_role(sk), sk,
				  dccp_packet_name(DCCP_SKB_CB(skb)->dccpd_type),
				  (unsigned long long)DCCP_SKB_CB(skb)->dccpd_ack_seq);
420 421 422
			return;
		}

423
		/* Update receive rate in units of 64 * bytes/second */
424 425
		hctx->ccid3hctx_x_recv = opt_recv->ccid3or_receive_rate;
		hctx->ccid3hctx_x_recv <<= 6;
426 427 428

		/* Update loss event rate */
		pinv = opt_recv->ccid3or_loss_event_rate;
429
		if (pinv == ~0U || pinv == 0)	       /* see RFC 4342, 8.5   */
430
			hctx->ccid3hctx_p = 0;
431
		else				       /* can not exceed 100% */
432
			hctx->ccid3hctx_p = 1000000 / pinv;
433
		/*
434
		 * Validate new RTT sample and update moving average
435
		 */
436
		r_sample = dccp_sample_rtt(sk, r_sample);
437
		hctx->ccid3hctx_rtt = tfrc_ewma(hctx->ccid3hctx_rtt, r_sample, 9);
438

439
		if (hctx->ccid3hctx_state == TFRC_SSTATE_NO_FBACK) {
440 441 442
			/*
			 * Larger Initial Windows [RFC 4342, sec. 5]
			 */
443
			hctx->ccid3hctx_x    = rfc3390_initial_rate(sk);
444
			hctx->ccid3hctx_t_ld = now;
445

446
			ccid3_update_send_interval(hctx);
447

448
			ccid3_pr_debug("%s(%p), s=%u, MSS=%u, "
449
				       "R_sample=%uus, X=%u\n", dccp_role(sk),
A
Andrew Morton 已提交
450
				       sk, hctx->ccid3hctx_s,
451
				       dccp_sk(sk)->dccps_mss_cache, r_sample,
452
				       (unsigned)(hctx->ccid3hctx_x >> 6));
453

454 455
			ccid3_hc_tx_set_state(sk, TFRC_SSTATE_FBACK);
		} else {
456

457 458 459 460 461 462
			/* Update sending rate (step 4 of [RFC 3448, 4.3]) */
			if (hctx->ccid3hctx_p > 0)
				hctx->ccid3hctx_x_calc =
					tfrc_calc_x(hctx->ccid3hctx_s,
						    hctx->ccid3hctx_rtt,
						    hctx->ccid3hctx_p);
463
			ccid3_hc_tx_update_x(sk, &now);
464

465
			ccid3_pr_debug("%s(%p), RTT=%uus (sample=%uus), s=%u, "
466 467
				       "p=%u, X_calc=%u, X_recv=%u, X=%u\n",
				       dccp_role(sk),
468
				       sk, hctx->ccid3hctx_rtt, r_sample,
469 470
				       hctx->ccid3hctx_s, hctx->ccid3hctx_p,
				       hctx->ccid3hctx_x_calc,
471
				       (unsigned)(hctx->ccid3hctx_x_recv >> 6),
472
				       (unsigned)(hctx->ccid3hctx_x >> 6));
473 474 475 476 477
		}

		/* unschedule no feedback timer */
		sk_stop_timer(sk, &hctx->ccid3hctx_no_feedback_timer);

478
		/*
479 480
		 * As we have calculated new ipi, delta, t_nom it is possible
		 * that we now can send a packet, so wake up dccp_wait_for_ccid
481 482
		 */
		sk->sk_write_space(sk);
483

484 485 486
		/*
		 * Update timeout interval for the nofeedback timer.
		 * We use a configuration option to increase the lower bound.
487 488
		 * This can help avoid triggering the nofeedback timer too
		 * often ('spinning') on LANs with small RTTs.
489
		 */
490
		hctx->ccid3hctx_t_rto = max_t(u32, 4 * hctx->ccid3hctx_rtt,
491
						   CONFIG_IP_DCCP_CCID3_RTO *
492
						   (USEC_PER_SEC/1000));
493 494
		/*
		 * Schedule no feedback timer to expire in
495
		 * max(t_RTO, 2 * s/X)  =  max(t_RTO, 2 * t_ipi)
496
		 */
497
		t_nfb = max(hctx->ccid3hctx_t_rto, 2 * hctx->ccid3hctx_t_ipi);
498

499
		ccid3_pr_debug("%s(%p), Scheduled no feedback timer to "
500 501
			       "expire in %lu jiffies (%luus)\n",
			       dccp_role(sk),
502 503 504
			       sk, usecs_to_jiffies(t_nfb), t_nfb);

		sk_reset_timer(sk, &hctx->ccid3hctx_no_feedback_timer,
505
				   jiffies + usecs_to_jiffies(t_nfb));
506
		break;
507
	case TFRC_SSTATE_NO_SENT:	/* fall through */
G
Gerrit Renker 已提交
508
	case TFRC_SSTATE_TERM:		/* ignore feedback when closing */
509 510 511 512 513
		break;
	}
}

/*
 * ccid3_hc_tx_parse_options  -  Store one CCID3 option of a received packet
 * @option: option type (TFRC_OPT_*)
 * @len:    length of the option value in bytes
 * @idx:    index passed by the caller; stored for TFRC_OPT_LOSS_INTERVALS
 * @value:  pointer to the (possibly unaligned) option value
 *
 * The cached option values are reset whenever the first option of a packet
 * with a new sequence number (dccps_gsr) is seen. Unknown options are
 * ignored. Returns 0 on success, -EINVAL for a 32-bit option of wrong length.
 */
static int ccid3_hc_tx_parse_options(struct sock *sk, unsigned char option,
				     unsigned char len, u16 idx,
				     unsigned char *value)
{
	const struct dccp_sock *dp = dccp_sk(sk);
	struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk);
	struct ccid3_options_received *opt_recv =
					&hctx->ccid3hctx_options_received;
	__be32 opt_val;
	int rc = 0;

	/* options of a new packet: forget what the previous one carried */
	if (opt_recv->ccid3or_seqno != dp->dccps_gsr) {
		opt_recv->ccid3or_seqno		     = dp->dccps_gsr;
		opt_recv->ccid3or_loss_event_rate    = ~0;
		opt_recv->ccid3or_loss_intervals_idx = 0;
		opt_recv->ccid3or_loss_intervals_len = 0;
		opt_recv->ccid3or_receive_rate	     = 0;
	}

	switch (option) {
	case TFRC_OPT_LOSS_EVENT_RATE:
		if (unlikely(len != 4)) {
			DCCP_WARN("%s(%p), invalid len %d for TFRC_OPT_LOSS_EVENT_RATE\n",
				  dccp_role(sk), sk, len);
			rc = -EINVAL;
			break;
		}
		/* 32-bit value in network byte order */
		opt_val = get_unaligned((__be32 *)value);
		opt_recv->ccid3or_loss_event_rate = ntohl(opt_val);
		ccid3_pr_debug("%s(%p), LOSS_EVENT_RATE=%u\n",
			       dccp_role(sk), sk,
			       opt_recv->ccid3or_loss_event_rate);
		break;
	case TFRC_OPT_LOSS_INTERVALS:
		/* only position and length are recorded, not the contents */
		opt_recv->ccid3or_loss_intervals_idx = idx;
		opt_recv->ccid3or_loss_intervals_len = len;
		ccid3_pr_debug("%s(%p), LOSS_INTERVALS=(%u, %u)\n",
			       dccp_role(sk), sk,
			       opt_recv->ccid3or_loss_intervals_idx,
			       opt_recv->ccid3or_loss_intervals_len);
		break;
	case TFRC_OPT_RECEIVE_RATE:
		if (unlikely(len != 4)) {
			DCCP_WARN("%s(%p), invalid len %d for TFRC_OPT_RECEIVE_RATE\n",
				  dccp_role(sk), sk, len);
			rc = -EINVAL;
			break;
		}
		/* 32-bit value in network byte order */
		opt_val = get_unaligned((__be32 *)value);
		opt_recv->ccid3or_receive_rate = ntohl(opt_val);
		ccid3_pr_debug("%s(%p), RECEIVE_RATE=%u\n",
			       dccp_role(sk), sk,
			       opt_recv->ccid3or_receive_rate);
		break;
	}

	return rc;
}

575
static int ccid3_hc_tx_init(struct ccid *ccid, struct sock *sk)
576
{
577
	struct ccid3_hc_tx_sock *hctx = ccid_priv(ccid);
578 579

	hctx->ccid3hctx_state = TFRC_SSTATE_NO_SENT;
580
	hctx->ccid3hctx_hist = NULL;
581 582
	setup_timer(&hctx->ccid3hctx_no_feedback_timer,
			ccid3_hc_tx_no_feedback_timer, (unsigned long)sk);
583 584 585 586 587 588

	return 0;
}

static void ccid3_hc_tx_exit(struct sock *sk)
{
589
	struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk);
590 591 592 593

	ccid3_hc_tx_set_state(sk, TFRC_SSTATE_TERM);
	sk_stop_timer(sk, &hctx->ccid3hctx_no_feedback_timer);

594
	tfrc_tx_hist_purge(&hctx->ccid3hctx_hist);
595 596
}

597 598
static void ccid3_hc_tx_get_info(struct sock *sk, struct tcp_info *info)
{
599
	struct ccid3_hc_tx_sock *hctx;
600 601 602 603 604

	/* Listen socks doesn't have a private CCID block */
	if (sk->sk_state == DCCP_LISTEN)
		return;

605
	hctx = ccid3_hc_tx_sk(sk);
606 607 608 609 610 611 612
	info->tcpi_rto = hctx->ccid3hctx_t_rto;
	info->tcpi_rtt = hctx->ccid3hctx_rtt;
}

/*
 * getsockopt() handler of the sender half-connection.
 *
 * DCCP_SOCKOPT_CCID_TX_INFO copies ccid3hctx_tfrc to @optval and stores its
 * size in @optlen; @len must be at least that size.
 *
 * Returns 0 on success, -EINVAL for listening sockets or a too small @len,
 * -ENOPROTOOPT for any other @optname, -EFAULT if copying to user space fails.
 */
static int ccid3_hc_tx_getsockopt(struct sock *sk, const int optname, int len,
				  u32 __user *optval, int __user *optlen)
{
	const struct ccid3_hc_tx_sock *hctx;
	const void *src;

	/* Listen socks doesn't have a private CCID block */
	if (sk->sk_state == DCCP_LISTEN)
		return -EINVAL;

	hctx = ccid3_hc_tx_sk(sk);

	if (optname != DCCP_SOCKOPT_CCID_TX_INFO)
		return -ENOPROTOOPT;

	if (len < sizeof(hctx->ccid3hctx_tfrc))
		return -EINVAL;
	len = sizeof(hctx->ccid3hctx_tfrc);
	src = &hctx->ccid3hctx_tfrc;

	if (put_user(len, optlen) || copy_to_user(optval, src, len))
		return -EFAULT;

	return 0;
}

/*
 *	Receiver Half-Connection Routines
 */

/*
 * CCID3 feedback types: the reason why the receiver sends feedback.
 * CCID3_FBACK_NONE is zero so that the value can be tested as a boolean.
 */
enum ccid3_fback_type {
	CCID3_FBACK_NONE         = 0,	/* no feedback to be sent */
	CCID3_FBACK_INITIAL      = 1,	/* first data packet was received */
	CCID3_FBACK_PERIODIC     = 2,	/* regular once-per-RTT feedback */
	CCID3_FBACK_PARAM_CHANGE = 3	/* new loss, or p has increased */
};

#ifdef CONFIG_IP_DCCP_CCID3_DEBUG
/*
 * Map a receiver state to a printable name (debug builds only).
 *
 * The table is read-only. A value without a table entry yields "UNKNOWN"
 * instead of reading past the end of the array or returning a null string.
 */
static const char *ccid3_rx_state_name(enum ccid3_hc_rx_states state)
{
	static const char *const ccid3_rx_state_names[] = {
	[TFRC_RSTATE_NO_DATA] = "NO_DATA",
	[TFRC_RSTATE_DATA]    = "DATA",
	[TFRC_RSTATE_TERM]    = "TERM",
	};
	const unsigned int idx = state;

	if (idx >= sizeof(ccid3_rx_state_names) /
		   sizeof(ccid3_rx_state_names[0]) ||
	    !ccid3_rx_state_names[idx])
		return "UNKNOWN";

	return ccid3_rx_state_names[idx];
}
#endif

663 664
static void ccid3_hc_rx_set_state(struct sock *sk,
				  enum ccid3_hc_rx_states state)
665
{
666
	struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk);
667 668 669
	enum ccid3_hc_rx_states oldstate = hcrx->ccid3hcrx_state;

	ccid3_pr_debug("%s(%p) %-8.8s -> %s\n",
670 671
		       dccp_role(sk), sk, ccid3_rx_state_name(oldstate),
		       ccid3_rx_state_name(state));
672 673 674 675
	WARN_ON(state == oldstate);
	hcrx->ccid3hcrx_state = state;
}

676 677 678
static void ccid3_hc_rx_send_feedback(struct sock *sk,
				      const struct sk_buff *skb,
				      enum ccid3_fback_type fbtype)
679
{
680
	struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk);
681
	struct dccp_sock *dp = dccp_sk(sk);
682
	ktime_t now;
683
	s64 delta = 0;
684

685 686 687
	if (unlikely(hcrx->ccid3hcrx_state == TFRC_RSTATE_TERM))
		return;

688
	now = ktime_get_real();
689

690 691
	switch (fbtype) {
	case CCID3_FBACK_INITIAL:
692
		hcrx->ccid3hcrx_x_recv = 0;
693
		hcrx->ccid3hcrx_pinv   = ~0U;   /* see RFC 4342, 8.5 */
694
		break;
695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715
	case CCID3_FBACK_PARAM_CHANGE:
		/*
		 * When parameters change (new loss or p > p_prev), we do not
		 * have a reliable estimate for R_m of [RFC 3448, 6.2] and so
		 * need to  reuse the previous value of X_recv. However, when
		 * X_recv was 0 (due to early loss), this would kill X down to
		 * s/t_mbi (i.e. one packet in 64 seconds).
		 * To avoid such drastic reduction, we approximate X_recv as
		 * the number of bytes since last feedback.
		 * This is a safe fallback, since X is bounded above by X_calc.
		 */
		if (hcrx->ccid3hcrx_x_recv > 0)
			break;
		/* fall through */
	case CCID3_FBACK_PERIODIC:
		delta = ktime_us_delta(now, hcrx->ccid3hcrx_tstamp_last_feedback);
		if (delta <= 0)
			DCCP_BUG("delta (%ld) <= 0", (long)delta);
		else
			hcrx->ccid3hcrx_x_recv =
				scaled_div32(hcrx->ccid3hcrx_bytes_recv, delta);
716
		break;
717
	default:
718 719 720
		return;
	}

721 722
	ccid3_pr_debug("Interval %ldusec, X_recv=%u, 1/p=%u\n", (long)delta,
		       hcrx->ccid3hcrx_x_recv, hcrx->ccid3hcrx_pinv);
723

724
	hcrx->ccid3hcrx_tstamp_last_feedback = now;
725
	hcrx->ccid3hcrx_last_counter	     = dccp_hdr(skb)->dccph_ccval;
726 727
	hcrx->ccid3hcrx_bytes_recv	     = 0;

728
	dp->dccps_hc_rx_insert_options = 1;
729 730 731
	dccp_send_ack(sk);
}

732
static int ccid3_hc_rx_insert_options(struct sock *sk, struct sk_buff *skb)
733
{
734
	const struct ccid3_hc_rx_sock *hcrx;
735
	__be32 x_recv, pinv;
736

737
	if (!(sk->sk_state == DCCP_OPEN || sk->sk_state == DCCP_PARTOPEN))
738
		return 0;
739

740
	hcrx = ccid3_hc_rx_sk(sk);
741 742

	if (dccp_packet_without_ack(skb))
743 744
		return 0;

745 746
	x_recv = htonl(hcrx->ccid3hcrx_x_recv);
	pinv   = htonl(hcrx->ccid3hcrx_pinv);
747

748
	if (dccp_insert_option(sk, skb, TFRC_OPT_LOSS_EVENT_RATE,
749
			       &pinv, sizeof(pinv)) ||
750
	    dccp_insert_option(sk, skb, TFRC_OPT_RECEIVE_RATE,
751
			       &x_recv, sizeof(x_recv)))
752 753 754
		return -1;

	return 0;
755 756
}

757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796
/** ccid3_first_li  -  Implements [RFC 3448, 6.3.1]
 *
 * Determine the length of the first loss interval via inverse lookup.
 * Assume that X_recv can be computed by the throughput equation
 *		    s
 *	X_recv = --------
 *		 R * fval
 * Find some p such that f(p) = fval; return 1/p (scaled).
 */
static u32 ccid3_first_li(struct sock *sk)
{
	struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk);
	u32 x_recv, p, delta;
	u64 fval;

	if (hcrx->ccid3hcrx_rtt == 0) {
		DCCP_WARN("No RTT estimate available, using fallback RTT\n");
		hcrx->ccid3hcrx_rtt = DCCP_FALLBACK_RTT;
	}

	delta = ktime_to_us(net_timedelta(hcrx->ccid3hcrx_tstamp_last_feedback));
	x_recv = scaled_div32(hcrx->ccid3hcrx_bytes_recv, delta);
	if (x_recv == 0) {		/* would also trigger divide-by-zero */
		DCCP_WARN("X_recv==0\n");
		if ((x_recv = hcrx->ccid3hcrx_x_recv) == 0) {
			DCCP_BUG("stored value of X_recv is zero");
			return ~0U;
		}
	}

	fval = scaled_div(hcrx->ccid3hcrx_s, hcrx->ccid3hcrx_rtt);
	fval = scaled_div32(fval, x_recv);
	p = tfrc_calc_x_reverse_lookup(fval);

	ccid3_pr_debug("%s(%p), receive rate=%u bytes/s, implied "
		       "loss rate=%u\n", dccp_role(sk), sk, x_recv, p);

	return p == 0 ? ~0U : scaled_div(1, p);
}

797
static void ccid3_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb)
798
{
799
	struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk);
800 801 802 803 804 805 806 807 808 809 810 811 812 813 814
	enum ccid3_fback_type do_feedback = CCID3_FBACK_NONE;
	const u32 ndp = dccp_sk(sk)->dccps_options_received.dccpor_ndp;
	const bool is_data_packet = dccp_data_packet(skb);

	if (unlikely(hcrx->ccid3hcrx_state == TFRC_RSTATE_NO_DATA)) {
		if (is_data_packet) {
			const u32 payload = skb->len - dccp_hdr(skb)->dccph_doff * 4;
			do_feedback = CCID3_FBACK_INITIAL;
			ccid3_hc_rx_set_state(sk, TFRC_RSTATE_DATA);
			hcrx->ccid3hcrx_s = payload;
			/*
			 * Not necessary to update ccid3hcrx_bytes_recv here,
			 * since X_recv = 0 for the first feedback packet (cf.
			 * RFC 3448, 6.3) -- gerrit
			 */
I
Ian McDonald 已提交
815
		}
816
		goto update_records;
I
Ian McDonald 已提交
817 818
	}

819 820
	if (tfrc_rx_hist_duplicate(&hcrx->ccid3hcrx_hist, skb))
		return; /* done receiving */
821

822 823 824 825 826 827 828
	if (is_data_packet) {
		const u32 payload = skb->len - dccp_hdr(skb)->dccph_doff * 4;
		/*
		 * Update moving-average of s and the sum of received payload bytes
		 */
		hcrx->ccid3hcrx_s = tfrc_ewma(hcrx->ccid3hcrx_s, payload, 9);
		hcrx->ccid3hcrx_bytes_recv += payload;
829 830
	}

831 832 833
	/*
	 * Handle pending losses and otherwise check for new loss
	 */
834 835 836 837 838 839 840 841
	if (tfrc_rx_hist_loss_pending(&hcrx->ccid3hcrx_hist) &&
	    tfrc_rx_handle_loss(&hcrx->ccid3hcrx_hist,
				&hcrx->ccid3hcrx_li_hist,
				skb, ndp, ccid3_first_li, sk) ) {
		do_feedback = CCID3_FBACK_PARAM_CHANGE;
		goto done_receiving;
	}

842 843
	if (tfrc_rx_hist_new_loss_indicated(&hcrx->ccid3hcrx_hist, skb, ndp))
		goto update_records;
844

845 846 847 848 849
	/*
	 * Handle data packets: RTT sampling and monitoring p
	 */
	if (unlikely(!is_data_packet))
		goto update_records;
850

851
	if (!tfrc_lh_is_initialised(&hcrx->ccid3hcrx_li_hist)) {
852 853 854 855 856 857 858 859
		const u32 sample = tfrc_rx_hist_sample_rtt(&hcrx->ccid3hcrx_hist, skb);
		/*
		 * Empty loss history: no loss so far, hence p stays 0.
		 * Sample RTT values, since an RTT estimate is required for the
		 * computation of p when the first loss occurs; RFC 3448, 6.3.1.
		 */
		if (sample != 0)
			hcrx->ccid3hcrx_rtt = tfrc_ewma(hcrx->ccid3hcrx_rtt, sample, 9);
860 861 862 863 864 865 866

	} else if (tfrc_lh_update_i_mean(&hcrx->ccid3hcrx_li_hist, skb)) {
		/*
		 * Step (3) of [RFC 3448, 6.1]: Recompute I_mean and, if I_mean
		 * has decreased (resp. p has increased), send feedback now.
		 */
		do_feedback = CCID3_FBACK_PARAM_CHANGE;
867 868
	}

869 870 871 872 873
	/*
	 * Check if the periodic once-per-RTT feedback is due; RFC 4342, 10.3
	 */
	if (SUB16(dccp_hdr(skb)->dccph_ccval, hcrx->ccid3hcrx_last_counter) > 3)
		do_feedback = CCID3_FBACK_PERIODIC;
I
Ian McDonald 已提交
874

875 876
update_records:
	tfrc_rx_hist_add_packet(&hcrx->ccid3hcrx_hist, skb, ndp);
877

878
done_receiving:
879 880
	if (do_feedback)
		ccid3_hc_rx_send_feedback(sk, skb, do_feedback);
881 882
}

883
static int ccid3_hc_rx_init(struct ccid *ccid, struct sock *sk)
884
{
885
	struct ccid3_hc_rx_sock *hcrx = ccid_priv(ccid);
886 887

	hcrx->ccid3hcrx_state = TFRC_RSTATE_NO_DATA;
888
	tfrc_lh_init(&hcrx->ccid3hcrx_li_hist);
889
	return tfrc_rx_hist_alloc(&hcrx->ccid3hcrx_hist);
890 891 892 893
}

static void ccid3_hc_rx_exit(struct sock *sk)
{
894
	struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk);
895 896 897

	ccid3_hc_rx_set_state(sk, TFRC_RSTATE_TERM);

898
	tfrc_rx_hist_purge(&hcrx->ccid3hcrx_hist);
899
	tfrc_lh_cleanup(&hcrx->ccid3hcrx_li_hist);
900 901
}

902 903
static void ccid3_hc_rx_get_info(struct sock *sk, struct tcp_info *info)
{
904
	const struct ccid3_hc_rx_sock *hcrx;
905

906 907 908 909
	/* Listen socks doesn't have a private CCID block */
	if (sk->sk_state == DCCP_LISTEN)
		return;

910
	hcrx = ccid3_hc_rx_sk(sk);
911 912 913
	info->tcpi_ca_state = hcrx->ccid3hcrx_state;
	info->tcpi_options  |= TCPI_OPT_TIMESTAMPS;
	info->tcpi_rcv_rtt  = hcrx->ccid3hcrx_rtt;
914 915
}

916 917 918
static int ccid3_hc_rx_getsockopt(struct sock *sk, const int optname, int len,
				  u32 __user *optval, int __user *optlen)
{
919
	const struct ccid3_hc_rx_sock *hcrx;
920
	struct tfrc_rx_info rx_info;
921
	const void *val;
922

923 924 925 926
	/* Listen socks doesn't have a private CCID block */
	if (sk->sk_state == DCCP_LISTEN)
		return -EINVAL;

927
	hcrx = ccid3_hc_rx_sk(sk);
928 929
	switch (optname) {
	case DCCP_SOCKOPT_CCID_RX_INFO:
930
		if (len < sizeof(rx_info))
931
			return -EINVAL;
932 933 934 935 936 937
		rx_info.tfrcrx_x_recv = hcrx->ccid3hcrx_x_recv;
		rx_info.tfrcrx_rtt    = hcrx->ccid3hcrx_rtt;
		rx_info.tfrcrx_p      = hcrx->ccid3hcrx_pinv == 0 ? ~0U :
					   scaled_div(1, hcrx->ccid3hcrx_pinv);
		len = sizeof(rx_info);
		val = &rx_info;
938 939 940 941 942 943 944 945 946 947 948
		break;
	default:
		return -ENOPROTOOPT;
	}

	if (put_user(len, optlen) || copy_to_user(optval, val, len))
		return -EFAULT;

	return 0;
}

949
static struct ccid_operations ccid3 = {
I
Ian McDonald 已提交
950
	.ccid_id		   = DCCPC_CCID3,
951
	.ccid_name		   = "TCP-Friendly Rate Control",
952
	.ccid_owner		   = THIS_MODULE,
953
	.ccid_hc_tx_obj_size	   = sizeof(struct ccid3_hc_tx_sock),
954 955 956 957 958 959
	.ccid_hc_tx_init	   = ccid3_hc_tx_init,
	.ccid_hc_tx_exit	   = ccid3_hc_tx_exit,
	.ccid_hc_tx_send_packet	   = ccid3_hc_tx_send_packet,
	.ccid_hc_tx_packet_sent	   = ccid3_hc_tx_packet_sent,
	.ccid_hc_tx_packet_recv	   = ccid3_hc_tx_packet_recv,
	.ccid_hc_tx_parse_options  = ccid3_hc_tx_parse_options,
960
	.ccid_hc_rx_obj_size	   = sizeof(struct ccid3_hc_rx_sock),
961 962 963 964
	.ccid_hc_rx_init	   = ccid3_hc_rx_init,
	.ccid_hc_rx_exit	   = ccid3_hc_rx_exit,
	.ccid_hc_rx_insert_options = ccid3_hc_rx_insert_options,
	.ccid_hc_rx_packet_recv	   = ccid3_hc_rx_packet_recv,
965 966
	.ccid_hc_rx_get_info	   = ccid3_hc_rx_get_info,
	.ccid_hc_tx_get_info	   = ccid3_hc_tx_get_info,
967 968
	.ccid_hc_rx_getsockopt	   = ccid3_hc_rx_getsockopt,
	.ccid_hc_tx_getsockopt	   = ccid3_hc_tx_getsockopt,
969
};
970

971
#ifdef CONFIG_IP_DCCP_CCID3_DEBUG
972
module_param(ccid3_debug, bool, 0444);
973
MODULE_PARM_DESC(ccid3_debug, "Enable debug messages");
974
#endif
975 976 977

static __init int ccid3_module_init(void)
{
978
	return ccid_register(&ccid3);
979 980 981 982 983 984 985 986 987
}
module_init(ccid3_module_init);

static __exit void ccid3_module_exit(void)
{
	ccid_unregister(&ccid3);
}
module_exit(ccid3_module_exit);

988
MODULE_AUTHOR("Ian McDonald <ian.mcdonald@jandi.co.nz>, "
989
	      "Arnaldo Carvalho de Melo <acme@ghostprotocols.net>");
990 991 992
MODULE_DESCRIPTION("DCCP TFRC CCID3 CCID");
MODULE_LICENSE("GPL");
MODULE_ALIAS("net-dccp-ccid-3");