/*
 *  net/dccp/ccids/ccid3.c
 *
 *  Copyright (c) 2007   The University of Aberdeen, Scotland, UK
 *  Copyright (c) 2005-7 The University of Waikato, Hamilton, New Zealand.
 *  Copyright (c) 2005-7 Ian McDonald <ian.mcdonald@jandi.co.nz>
 *
 *  An implementation of the DCCP protocol
 *
 *  This code has been developed by the University of Waikato WAND
 *  research group. For further information please see http://www.wand.net.nz/
 *
 *  This code also uses code from Lulea University, rereleased as GPL by its
 *  authors:
 *  Copyright (c) 2003 Nils-Erik Mattsson, Joacim Haggmark, Magnus Erixzon
 *
 *  Changes to meet Linux coding standards, to make it meet latest ccid3 draft
 *  and to make it work as a loadable module in the DCCP stack written by
 *  Arnaldo Carvalho de Melo <acme@conectiva.com.br>.
 *
 *  Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@conectiva.com.br>
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */
#include "../dccp.h"
#include "ccid3.h"

G
Gerrit Renker 已提交
40 41
#include <asm/unaligned.h>

42 43 44
/*
 * Debug output helper: with CONFIG_IP_DCCP_CCID3_DEBUG set, messages are
 * routed through DCCP_PR_DEBUG and gated by the ccid3_debug flag; otherwise
 * ccid3_pr_debug() expands to nothing.
 */
#ifdef CONFIG_IP_DCCP_CCID3_DEBUG
static int ccid3_debug;
#define ccid3_pr_debug(format, a...)	DCCP_PR_DEBUG(ccid3_debug, format, ##a)
#else
#define ccid3_pr_debug(format, a...)
#endif

49 50 51
/*
 *	Transmitter Half-Connection Routines
 */
52
#ifdef CONFIG_IP_DCCP_CCID3_DEBUG
/*
 * Map a sender half-connection state to a printable name (debug builds only).
 * @state is used directly as an index into the table, so it must be one of
 * the TFRC_SSTATE_* values listed below.
 */
static const char *ccid3_tx_state_name(enum ccid3_hc_tx_states state)
{
	/* read-only table: neither the pointers nor the strings are modified */
	static const char *const ccid3_state_names[] = {
	[TFRC_SSTATE_NO_SENT]  = "NO_SENT",
	[TFRC_SSTATE_NO_FBACK] = "NO_FBACK",
	[TFRC_SSTATE_FBACK]    = "FBACK",
	[TFRC_SSTATE_TERM]     = "TERM",
	};

	return ccid3_state_names[state];
}
#endif

66 67
static void ccid3_hc_tx_set_state(struct sock *sk,
				  enum ccid3_hc_tx_states state)
68
{
69
	struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk);
70 71 72
	enum ccid3_hc_tx_states oldstate = hctx->ccid3hctx_state;

	ccid3_pr_debug("%s(%p) %-8.8s -> %s\n",
73 74
		       dccp_role(sk), sk, ccid3_tx_state_name(oldstate),
		       ccid3_tx_state_name(state));
75 76 77 78
	WARN_ON(state == oldstate);
	hctx->ccid3hctx_state = state;
}

79
/*
80 81 82 83 84 85
 * Compute the initial sending rate X_init in the manner of RFC 3390:
 *
 *	X_init  =  min(4 * s, max(2 * s, 4380 bytes)) / RTT
 *
 * Note that RFC 3390 uses MSS, RFC 4342 refers to RFC 3390, and rfc3448bis
 * (rev-02) clarifies the use of RFC 3390 with regard to the above formula.
86 87 88 89
 * For consistency with other parts of the code, X_init is scaled by 2^6.
 */
static inline u64 rfc3390_initial_rate(struct sock *sk)
{
90
	const struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk);
91 92
	const __u32 w_init = clamp_t(__u32, 4380U,
			2 * hctx->ccid3hctx_s, 4 * hctx->ccid3hctx_s);
93

94
	return scaled_div(w_init << 6, hctx->ccid3hctx_rtt);
95 96
}

97
/*
98
 * Recalculate t_ipi and delta (should be called whenever X changes)
99
 */
I
Ilpo Järvinen 已提交
100
static void ccid3_update_send_interval(struct ccid3_hc_tx_sock *hctx)
101
{
102
	/* Calculate new t_ipi = s / X_inst (X_inst is in 64 * bytes/second) */
103 104
	hctx->ccid3hctx_t_ipi = scaled_div32(((u64)hctx->ccid3hctx_s) << 6,
					     hctx->ccid3hctx_x);
105

106
	/* Calculate new delta by delta = min(t_ipi / 2, t_gran / 2) */
107 108
	hctx->ccid3hctx_delta = min_t(u32, hctx->ccid3hctx_t_ipi / 2,
					   TFRC_OPSYS_HALF_TIME_GRAN);
I
Ian McDonald 已提交
109

110
	ccid3_pr_debug("t_ipi=%u, delta=%u, s=%u, X=%u\n",
I
Ian McDonald 已提交
111
		       hctx->ccid3hctx_t_ipi, hctx->ccid3hctx_delta,
112
		       hctx->ccid3hctx_s, (unsigned)(hctx->ccid3hctx_x >> 6));
I
Ian McDonald 已提交
113

114
}
115

116 117 118 119 120 121 122
/*
 * Number of whole RTTs elapsed between the last window-counter timestamp
 * (ccid3hctx_t_last_win_count) and @now. The microsecond difference is
 * truncated to 32 bits before the division by ccid3hctx_rtt.
 */
static u32 ccid3_hc_tx_idle_rtt(struct ccid3_hc_tx_sock *hctx, ktime_t now)
{
	const u32 idle_us = ktime_us_delta(now,
					   hctx->ccid3hctx_t_last_win_count);

	return idle_us / hctx->ccid3hctx_rtt;
}

123 124 125 126
/**
 * ccid3_hc_tx_update_x  -  Update allowed sending rate X
 * @stamp: most recent time if available - can be left NULL.
 * This function tracks draft rfc3448bis, check there for latest details.
127
 *
128 129 130 131 132
 * Note: X and X_recv are both stored in units of 64 * bytes/second, to support
 *       fine-grained resolution of sending rates. This requires scaling by 2^6
 *       throughout the code. Only X_calc is unscaled (in bytes/second).
 *
 */
133
static void ccid3_hc_tx_update_x(struct sock *sk, ktime_t *stamp)
134
{
135
	struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk);
136
	__u64 min_rate = 2 * hctx->ccid3hctx_x_recv;
137
	const  __u64 old_x = hctx->ccid3hctx_x;
138
	ktime_t now = stamp ? *stamp : ktime_get_real();
139

140 141
	/*
	 * Handle IDLE periods: do not reduce below RFC3390 initial sending rate
142 143
	 * when idling [RFC 4342, 5.1]. Definition of idling is from rfc3448bis:
	 * a sender is idle if it has not sent anything over a 2-RTT-period.
144 145
	 * For consistency with X and X_recv, min_rate is also scaled by 2^6.
	 */
146
	if (ccid3_hc_tx_idle_rtt(hctx, now) >= 2) {
147 148 149 150
		min_rate = rfc3390_initial_rate(sk);
		min_rate = max(min_rate, 2 * hctx->ccid3hctx_x_recv);
	}

151
	if (hctx->ccid3hctx_p > 0) {
152

153
		hctx->ccid3hctx_x = min(((__u64)hctx->ccid3hctx_x_calc) << 6,
154
					min_rate);
155 156
		hctx->ccid3hctx_x = max(hctx->ccid3hctx_x,
					(((__u64)hctx->ccid3hctx_s) << 6) /
157
								TFRC_T_MBI);
158

159 160
	} else if (ktime_us_delta(now, hctx->ccid3hctx_t_ld)
				- (s64)hctx->ccid3hctx_rtt >= 0) {
161

G
Gerrit Renker 已提交
162 163
		hctx->ccid3hctx_x = min(2 * hctx->ccid3hctx_x, min_rate);
		hctx->ccid3hctx_x = max(hctx->ccid3hctx_x,
164 165 166
			    scaled_div(((__u64)hctx->ccid3hctx_s) << 6,
				       hctx->ccid3hctx_rtt));
		hctx->ccid3hctx_t_ld = now;
167
	}
168

I
Ian McDonald 已提交
169
	if (hctx->ccid3hctx_x != old_x) {
170 171 172 173 174
		ccid3_pr_debug("X_prev=%u, X_now=%u, X_calc=%u, "
			       "X_recv=%u\n", (unsigned)(old_x >> 6),
			       (unsigned)(hctx->ccid3hctx_x >> 6),
			       hctx->ccid3hctx_x_calc,
			       (unsigned)(hctx->ccid3hctx_x_recv >> 6));
I
Ian McDonald 已提交
175

176
		ccid3_update_send_interval(hctx);
I
Ian McDonald 已提交
177
	}
178 179
}

180
/*
181 182
 *	Track the mean packet size `s' (cf. RFC 4342, 5.3 and  RFC 3448, 4.1)
 *	@len: DCCP packet payload size in bytes
183 184 185
 */
static inline void ccid3_hc_tx_update_s(struct ccid3_hc_tx_sock *hctx, int len)
{
186 187
	const u16 old_s = hctx->ccid3hctx_s;

188
	hctx->ccid3hctx_s = tfrc_ewma(hctx->ccid3hctx_s, len, 9);
189 190 191

	if (hctx->ccid3hctx_s != old_s)
		ccid3_update_send_interval(hctx);
192 193
}

194
/*
195
 *	Update Window Counter using the algorithm from [RFC 4342, 8.1].
196
 *	As elsewhere, RTT > 0 is assumed by using dccp_sample_rtt().
197 198
 */
static inline void ccid3_hc_tx_update_win_count(struct ccid3_hc_tx_sock *hctx,
199
						ktime_t now)
200
{
201 202
	u32 delta = ktime_us_delta(now, hctx->ccid3hctx_t_last_win_count),
	    quarter_rtts = (4 * delta) / hctx->ccid3hctx_rtt;
203 204

	if (quarter_rtts > 0) {
205
		hctx->ccid3hctx_t_last_win_count = now;
206
		hctx->ccid3hctx_last_win_count  += min(quarter_rtts, 5U);
207 208 209 210
		hctx->ccid3hctx_last_win_count	&= 0xF;		/* mod 16 */
	}
}

211 212 213
static void ccid3_hc_tx_no_feedback_timer(unsigned long data)
{
	struct sock *sk = (struct sock *)data;
214
	struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk);
215
	unsigned long t_nfb = USEC_PER_SEC / 5;
216 217 218 219 220

	bh_lock_sock(sk);
	if (sock_owned_by_user(sk)) {
		/* Try again later. */
		/* XXX: set some sensible MIB */
221
		goto restart_timer;
222 223
	}

224
	ccid3_pr_debug("%s(%p, state=%s) - entry \n", dccp_role(sk), sk,
225
		       ccid3_tx_state_name(hctx->ccid3hctx_state));
226

227 228 229 230 231 232 233 234 235 236 237 238
	if (hctx->ccid3hctx_state == TFRC_SSTATE_FBACK)
		ccid3_hc_tx_set_state(sk, TFRC_SSTATE_NO_FBACK);
	else if (hctx->ccid3hctx_state != TFRC_SSTATE_NO_FBACK)
		goto out;

	/*
	 * Determine new allowed sending rate X as per draft rfc3448bis-00, 4.4
	 */
	if (hctx->ccid3hctx_t_rto == 0 ||	/* no feedback received yet */
	    hctx->ccid3hctx_p == 0) {

		/* halve send rate directly */
239 240 241
		hctx->ccid3hctx_x = max(hctx->ccid3hctx_x / 2,
					(((__u64)hctx->ccid3hctx_s) << 6) /
								    TFRC_T_MBI);
242
		ccid3_update_send_interval(hctx);
243
	} else {
244
		/*
245
		 *  Modify the cached value of X_recv
246
		 *
247
		 *  If (X_calc > 2 * X_recv)
248 249 250 251 252
		 *    X_recv = max(X_recv / 2, s / (2 * t_mbi));
		 *  Else
		 *    X_recv = X_calc / 4;
		 *
		 *  Note that X_recv is scaled by 2^6 while X_calc is not
253
		 */
254 255
		BUG_ON(hctx->ccid3hctx_p && !hctx->ccid3hctx_x_calc);

256
		if (hctx->ccid3hctx_x_calc > (hctx->ccid3hctx_x_recv >> 5))
257 258 259 260
			hctx->ccid3hctx_x_recv =
				max(hctx->ccid3hctx_x_recv / 2,
				    (((__u64)hctx->ccid3hctx_s) << 6) /
							      (2 * TFRC_T_MBI));
261
		else {
262 263
			hctx->ccid3hctx_x_recv = hctx->ccid3hctx_x_calc;
			hctx->ccid3hctx_x_recv <<= 4;
264
		}
265
		ccid3_hc_tx_update_x(sk, NULL);
266
	}
267 268 269 270 271 272 273 274 275 276 277
	ccid3_pr_debug("Reduced X to %llu/64 bytes/sec\n",
			(unsigned long long)hctx->ccid3hctx_x);

	/*
	 * Set new timeout for the nofeedback timer.
	 * See comments in packet_recv() regarding the value of t_RTO.
	 */
	if (unlikely(hctx->ccid3hctx_t_rto == 0))	/* no feedback yet */
		t_nfb = TFRC_INITIAL_TIMEOUT;
	else
		t_nfb = max(hctx->ccid3hctx_t_rto, 2 * hctx->ccid3hctx_t_ipi);
278

279 280
restart_timer:
	sk_reset_timer(sk, &hctx->ccid3hctx_no_feedback_timer,
281
			   jiffies + usecs_to_jiffies(t_nfb));
282 283 284 285 286
out:
	bh_unlock_sock(sk);
	sock_put(sk);
}

287 288 289 290 291 292
/*
 * returns
 *   > 0: delay (in msecs) that should pass before actually sending
 *   = 0: can send immediately
 *   < 0: error condition; do not send packet
 */
293
static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb)
294 295
{
	struct dccp_sock *dp = dccp_sk(sk);
296
	struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk);
297 298
	ktime_t now = ktime_get_real();
	s64 delay;
299 300

	/*
301 302 303
	 * This function is called only for Data and DataAck packets. Sending
	 * zero-sized Data(Ack)s is theoretically possible, but for congestion
	 * control this case is pathological - ignore it.
304
	 */
305
	if (unlikely(skb->len == 0))
306
		return -EBADMSG;
307 308 309

	switch (hctx->ccid3hctx_state) {
	case TFRC_SSTATE_NO_SENT:
310
		sk_reset_timer(sk, &hctx->ccid3hctx_no_feedback_timer,
311
			       (jiffies +
312
				usecs_to_jiffies(TFRC_INITIAL_TIMEOUT)));
313 314
		hctx->ccid3hctx_last_win_count	 = 0;
		hctx->ccid3hctx_t_last_win_count = now;
315 316

		/* Set t_0 for initial packet */
317
		hctx->ccid3hctx_t_nom = now;
318 319 320 321 322 323 324 325 326 327 328 329

		hctx->ccid3hctx_s = skb->len;

		/*
		 * Use initial RTT sample when available: recommended by erratum
		 * to RFC 4342. This implements the initialisation procedure of
		 * draft rfc3448bis, section 4.2. Remember, X is scaled by 2^6.
		 */
		if (dp->dccps_syn_rtt) {
			ccid3_pr_debug("SYN RTT = %uus\n", dp->dccps_syn_rtt);
			hctx->ccid3hctx_rtt  = dp->dccps_syn_rtt;
			hctx->ccid3hctx_x    = rfc3390_initial_rate(sk);
330
			hctx->ccid3hctx_t_ld = now;
331
		} else {
332 333 334 335 336 337 338 339
			/*
			 * Sender does not have RTT sample:
			 * - set fallback RTT (RFC 4340, 3.4) since a RTT value
			 *   is needed in several parts (e.g.  window counter);
			 * - set sending rate X_pps = 1pps as per RFC 3448, 4.2.
			 */
			hctx->ccid3hctx_rtt = DCCP_FALLBACK_RTT;
			hctx->ccid3hctx_x   = hctx->ccid3hctx_s;
340 341 342 343 344
			hctx->ccid3hctx_x <<= 6;
		}
		ccid3_update_send_interval(hctx);

		ccid3_hc_tx_set_state(sk, TFRC_SSTATE_NO_FBACK);
345 346 347
		break;
	case TFRC_SSTATE_NO_FBACK:
	case TFRC_SSTATE_FBACK:
348
		delay = ktime_us_delta(hctx->ccid3hctx_t_nom, now);
I
Ian McDonald 已提交
349
		ccid3_pr_debug("delay=%ld\n", (long)delay);
350
		/*
351
		 *	Scheduling of packet transmissions [RFC 3448, 4.6]
352 353 354 355 356 357
		 *
		 * if (t_now > t_nom - delta)
		 *       // send the packet now
		 * else
		 *       // send the packet in (t_nom - t_now) milliseconds.
		 */
358
		if (delay - (s64)hctx->ccid3hctx_delta >= 1000)
359
			return (u32)delay / 1000L;
360

361
		ccid3_hc_tx_update_win_count(hctx, now);
362
		break;
363
	case TFRC_SSTATE_TERM:
364
		DCCP_BUG("%s(%p) - Illegal state TERM", dccp_role(sk), sk);
365
		return -EINVAL;
366 367
	}

368 369
	/* prepare to send now (add options etc.) */
	dp->dccps_hc_tx_insert_options = 1;
370 371 372
	DCCP_SKB_CB(skb)->dccpd_ccval = hctx->ccid3hctx_last_win_count;

	/* set the nominal send time for the next following packet */
373 374
	hctx->ccid3hctx_t_nom = ktime_add_us(hctx->ccid3hctx_t_nom,
					     hctx->ccid3hctx_t_ipi);
375
	return 0;
376 377
}

378 379
static void ccid3_hc_tx_packet_sent(struct sock *sk, int more,
				    unsigned int len)
380
{
381
	struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk);
382

383
	ccid3_hc_tx_update_s(hctx, len);
384

385
	if (tfrc_tx_hist_add(&hctx->ccid3hctx_hist, dccp_sk(sk)->dccps_gss))
386
		DCCP_CRIT("packet history - out of memory!");
387 388 389 390
}

static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
{
391
	struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk);
392
	struct ccid3_options_received *opt_recv;
393
	ktime_t now;
394
	unsigned long t_nfb;
395
	u32 pinv, r_sample;
396

397 398 399 400
	/* we are only interested in ACKs */
	if (!(DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_ACK ||
	      DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_DATAACK))
		return;
401 402 403 404
	/* ... and only in the established state */
	if (hctx->ccid3hctx_state != TFRC_SSTATE_FBACK &&
	    hctx->ccid3hctx_state != TFRC_SSTATE_NO_FBACK)
		return;
405 406

	opt_recv = &hctx->ccid3hctx_options_received;
407 408 409 410 411 412 413 414 415 416 417
	now = ktime_get_real();

	/* Estimate RTT from history if ACK number is valid */
	r_sample = tfrc_tx_hist_rtt(hctx->ccid3hctx_hist,
				    DCCP_SKB_CB(skb)->dccpd_ack_seq, now);
	if (r_sample == 0) {
		DCCP_WARN("%s(%p): %s with bogus ACK-%llu\n", dccp_role(sk), sk,
			  dccp_packet_name(DCCP_SKB_CB(skb)->dccpd_type),
			  (unsigned long long)DCCP_SKB_CB(skb)->dccpd_ack_seq);
		return;
	}
418

419 420 421
	/* Update receive rate in units of 64 * bytes/second */
	hctx->ccid3hctx_x_recv = opt_recv->ccid3or_receive_rate;
	hctx->ccid3hctx_x_recv <<= 6;
422

423 424 425 426 427
	/* Update loss event rate (which is scaled by 1e6) */
	pinv = opt_recv->ccid3or_loss_event_rate;
	if (pinv == ~0U || pinv == 0)	       /* see RFC 4342, 8.5   */
		hctx->ccid3hctx_p = 0;
	else				       /* can not exceed 100% */
428
		hctx->ccid3hctx_p = scaled_div(1, pinv);
429 430 431 432 433
	/*
	 * Validate new RTT sample and update moving average
	 */
	r_sample = dccp_sample_rtt(sk, r_sample);
	hctx->ccid3hctx_rtt = tfrc_ewma(hctx->ccid3hctx_rtt, r_sample, 9);
434 435 436
	/*
	 * Update allowed sending rate X as per draft rfc3448bis-00, 4.2/3
	 */
437
	if (hctx->ccid3hctx_state == TFRC_SSTATE_NO_FBACK) {
438
		ccid3_hc_tx_set_state(sk, TFRC_SSTATE_FBACK);
439

440 441 442 443 444 445
		if (hctx->ccid3hctx_t_rto == 0) {
			/*
			 * Initial feedback packet: Larger Initial Windows (4.2)
			 */
			hctx->ccid3hctx_x    = rfc3390_initial_rate(sk);
			hctx->ccid3hctx_t_ld = now;
446

447
			ccid3_update_send_interval(hctx);
448

449 450 451 452 453 454 455 456
			goto done_computing_x;
		} else if (hctx->ccid3hctx_p == 0) {
			/*
			 * First feedback after nofeedback timer expiry (4.3)
			 */
			goto done_computing_x;
		}
	}
457

458 459 460
	/* Update sending rate (step 4 of [RFC 3448, 4.3]) */
	if (hctx->ccid3hctx_p > 0)
		hctx->ccid3hctx_x_calc =
461 462 463
				tfrc_calc_x(hctx->ccid3hctx_s,
					    hctx->ccid3hctx_rtt,
					    hctx->ccid3hctx_p);
464
	ccid3_hc_tx_update_x(sk, &now);
465

466 467
done_computing_x:
	ccid3_pr_debug("%s(%p), RTT=%uus (sample=%uus), s=%u, "
468 469 470 471 472 473 474
			       "p=%u, X_calc=%u, X_recv=%u, X=%u\n",
			       dccp_role(sk),
			       sk, hctx->ccid3hctx_rtt, r_sample,
			       hctx->ccid3hctx_s, hctx->ccid3hctx_p,
			       hctx->ccid3hctx_x_calc,
			       (unsigned)(hctx->ccid3hctx_x_recv >> 6),
			       (unsigned)(hctx->ccid3hctx_x >> 6));
475

476 477
	/* unschedule no feedback timer */
	sk_stop_timer(sk, &hctx->ccid3hctx_no_feedback_timer);
478

479 480 481 482 483
	/*
	 * As we have calculated new ipi, delta, t_nom it is possible
	 * that we now can send a packet, so wake up dccp_wait_for_ccid
	 */
	sk->sk_write_space(sk);
484

485 486 487 488 489 490 491 492 493 494 495 496 497 498
	/*
	 * Update timeout interval for the nofeedback timer.
	 * We use a configuration option to increase the lower bound.
	 * This can help avoid triggering the nofeedback timer too
	 * often ('spinning') on LANs with small RTTs.
	 */
	hctx->ccid3hctx_t_rto = max_t(u32, 4 * hctx->ccid3hctx_rtt,
					   (CONFIG_IP_DCCP_CCID3_RTO *
					    (USEC_PER_SEC / 1000)));
	/*
	 * Schedule no feedback timer to expire in
	 * max(t_RTO, 2 * s/X)  =  max(t_RTO, 2 * t_ipi)
	 */
	t_nfb = max(hctx->ccid3hctx_t_rto, 2 * hctx->ccid3hctx_t_ipi);
499

500 501 502 503
	ccid3_pr_debug("%s(%p), Scheduled no feedback timer to "
		       "expire in %lu jiffies (%luus)\n",
		       dccp_role(sk),
		       sk, usecs_to_jiffies(t_nfb), t_nfb);
504

505 506
	sk_reset_timer(sk, &hctx->ccid3hctx_no_feedback_timer,
			   jiffies + usecs_to_jiffies(t_nfb));
507 508 509
}

/*
 * Store one received CCID3 option in the sender's ccid3hctx_options_received.
 * @option: option type (TFRC_OPT_*); other types are ignored
 * @len:    length of the option value in bytes
 * @idx:    recorded as the loss-intervals index for TFRC_OPT_LOSS_INTERVALS
 * @value:  option payload; read as an unaligned big-endian 32-bit value for
 *          the loss-event-rate and receive-rate options
 *
 * The stored options are reset whenever dccps_gsr differs from the sequence
 * number they were recorded for. Returns 0, or -EINVAL when a 4-byte option
 * has a different length.
 */
static int ccid3_hc_tx_parse_options(struct sock *sk, unsigned char option,
				     unsigned char len, u16 idx,
				     unsigned char *value)
{
	int rc = 0;
	const struct dccp_sock *dp = dccp_sk(sk);
	struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk);
	struct ccid3_options_received *opt_recv;
	__be32 opt_val;

	opt_recv = &hctx->ccid3hctx_options_received;

	/* first option for a new sequence number: start from a clean slate */
	if (opt_recv->ccid3or_seqno != dp->dccps_gsr) {
		opt_recv->ccid3or_seqno		     = dp->dccps_gsr;
		opt_recv->ccid3or_loss_event_rate    = ~0;
		opt_recv->ccid3or_loss_intervals_idx = 0;
		opt_recv->ccid3or_loss_intervals_len = 0;
		opt_recv->ccid3or_receive_rate	     = 0;
	}

	switch (option) {
	case TFRC_OPT_LOSS_EVENT_RATE:
		if (unlikely(len != 4)) {
			DCCP_WARN("%s(%p), invalid len %d "
				  "for TFRC_OPT_LOSS_EVENT_RATE\n",
				  dccp_role(sk), sk, len);
			rc = -EINVAL;
		} else {
			opt_val = get_unaligned((__be32 *)value);
			opt_recv->ccid3or_loss_event_rate = ntohl(opt_val);
			ccid3_pr_debug("%s(%p), LOSS_EVENT_RATE=%u\n",
				       dccp_role(sk), sk,
				       opt_recv->ccid3or_loss_event_rate);
		}
		break;
	case TFRC_OPT_LOSS_INTERVALS:
		opt_recv->ccid3or_loss_intervals_idx = idx;
		opt_recv->ccid3or_loss_intervals_len = len;
		ccid3_pr_debug("%s(%p), LOSS_INTERVALS=(%u, %u)\n",
			       dccp_role(sk), sk,
			       opt_recv->ccid3or_loss_intervals_idx,
			       opt_recv->ccid3or_loss_intervals_len);
		break;
	case TFRC_OPT_RECEIVE_RATE:
		if (unlikely(len != 4)) {
			DCCP_WARN("%s(%p), invalid len %d "
				  "for TFRC_OPT_RECEIVE_RATE\n",
				  dccp_role(sk), sk, len);
			rc = -EINVAL;
		} else {
			opt_val = get_unaligned((__be32 *)value);
			opt_recv->ccid3or_receive_rate = ntohl(opt_val);
			ccid3_pr_debug("%s(%p), RECEIVE_RATE=%u\n",
				       dccp_role(sk), sk,
				       opt_recv->ccid3or_receive_rate);
		}
		break;
	}

	return rc;
}

571
static int ccid3_hc_tx_init(struct ccid *ccid, struct sock *sk)
572
{
573
	struct ccid3_hc_tx_sock *hctx = ccid_priv(ccid);
574 575

	hctx->ccid3hctx_state = TFRC_SSTATE_NO_SENT;
576
	hctx->ccid3hctx_hist = NULL;
577 578
	setup_timer(&hctx->ccid3hctx_no_feedback_timer,
			ccid3_hc_tx_no_feedback_timer, (unsigned long)sk);
579 580 581 582 583 584

	return 0;
}

static void ccid3_hc_tx_exit(struct sock *sk)
{
585
	struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk);
586 587 588 589

	ccid3_hc_tx_set_state(sk, TFRC_SSTATE_TERM);
	sk_stop_timer(sk, &hctx->ccid3hctx_no_feedback_timer);

590
	tfrc_tx_hist_purge(&hctx->ccid3hctx_hist);
591 592
}

593 594
static void ccid3_hc_tx_get_info(struct sock *sk, struct tcp_info *info)
{
595
	struct ccid3_hc_tx_sock *hctx;
596 597 598 599 600

	/* Listen socks doesn't have a private CCID block */
	if (sk->sk_state == DCCP_LISTEN)
		return;

601
	hctx = ccid3_hc_tx_sk(sk);
602 603 604 605 606 607 608
	info->tcpi_rto = hctx->ccid3hctx_t_rto;
	info->tcpi_rtt = hctx->ccid3hctx_rtt;
}

/*
 * Sender-side getsockopt handler.
 * @optname: only DCCP_SOCKOPT_CCID_TX_INFO is supported
 * @len:     size of the user buffer; must hold ccid3hctx_tfrc
 * @optval:  user buffer receiving a copy of ccid3hctx_tfrc
 * @optlen:  user location receiving the number of bytes copied
 *
 * Returns 0 on success, -EINVAL for listening sockets or a too-small buffer,
 * -ENOPROTOOPT for any other @optname, -EFAULT if copying to user space fails.
 */
static int ccid3_hc_tx_getsockopt(struct sock *sk, const int optname, int len,
				  u32 __user *optval, int __user *optlen)
{
	const struct ccid3_hc_tx_sock *hctx;
	const void *val;

	/* Listen socks doesn't have a private CCID block */
	if (sk->sk_state == DCCP_LISTEN)
		return -EINVAL;

	hctx = ccid3_hc_tx_sk(sk);
	switch (optname) {
	case DCCP_SOCKOPT_CCID_TX_INFO:
		if (len < sizeof(hctx->ccid3hctx_tfrc))
			return -EINVAL;
		len = sizeof(hctx->ccid3hctx_tfrc);
		val = &hctx->ccid3hctx_tfrc;
		break;
	default:
		return -ENOPROTOOPT;
	}

	if (put_user(len, optlen) || copy_to_user(optval, val, len))
		return -EFAULT;

	return 0;
}

634
/*
 *	Receiver Half-Connection Routines
 */
637 638 639 640 641 642 643 644 645

/* CCID3 feedback types */
enum ccid3_fback_type {
	/* no feedback is to be sent for the current packet */
	CCID3_FBACK_NONE         = 0,
	/* first data packet seen: X_recv is reported as 0, 1/p as ~0U */
	CCID3_FBACK_INITIAL      = 1,
	/* window counter advanced by more than 3 since the last feedback */
	CCID3_FBACK_PERIODIC     = 2,
	/* a loss was handled, or the updated loss history asks for feedback */
	CCID3_FBACK_PARAM_CHANGE = 3,
};

646
#ifdef CONFIG_IP_DCCP_CCID3_DEBUG
/*
 * Map a receiver half-connection state to a printable name (debug builds
 * only). @state is used directly as an index into the table, so it must be
 * one of the TFRC_RSTATE_* values listed below.
 */
static const char *ccid3_rx_state_name(enum ccid3_hc_rx_states state)
{
	/* read-only table: neither the pointers nor the strings are modified */
	static const char *const ccid3_rx_state_names[] = {
	[TFRC_RSTATE_NO_DATA] = "NO_DATA",
	[TFRC_RSTATE_DATA]    = "DATA",
	[TFRC_RSTATE_TERM]    = "TERM",
	};

	return ccid3_rx_state_names[state];
}
#endif

659 660
static void ccid3_hc_rx_set_state(struct sock *sk,
				  enum ccid3_hc_rx_states state)
661
{
662
	struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk);
663 664 665
	enum ccid3_hc_rx_states oldstate = hcrx->ccid3hcrx_state;

	ccid3_pr_debug("%s(%p) %-8.8s -> %s\n",
666 667
		       dccp_role(sk), sk, ccid3_rx_state_name(oldstate),
		       ccid3_rx_state_name(state));
668 669 670 671
	WARN_ON(state == oldstate);
	hcrx->ccid3hcrx_state = state;
}

672 673 674
static void ccid3_hc_rx_send_feedback(struct sock *sk,
				      const struct sk_buff *skb,
				      enum ccid3_fback_type fbtype)
675
{
676
	struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk);
677
	struct dccp_sock *dp = dccp_sk(sk);
678
	ktime_t now;
679
	s64 delta = 0;
680

681 682 683
	if (unlikely(hcrx->ccid3hcrx_state == TFRC_RSTATE_TERM))
		return;

684
	now = ktime_get_real();
685

686 687
	switch (fbtype) {
	case CCID3_FBACK_INITIAL:
688
		hcrx->ccid3hcrx_x_recv = 0;
689
		hcrx->ccid3hcrx_pinv   = ~0U;   /* see RFC 4342, 8.5 */
690
		break;
691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711
	case CCID3_FBACK_PARAM_CHANGE:
		/*
		 * When parameters change (new loss or p > p_prev), we do not
		 * have a reliable estimate for R_m of [RFC 3448, 6.2] and so
		 * need to  reuse the previous value of X_recv. However, when
		 * X_recv was 0 (due to early loss), this would kill X down to
		 * s/t_mbi (i.e. one packet in 64 seconds).
		 * To avoid such drastic reduction, we approximate X_recv as
		 * the number of bytes since last feedback.
		 * This is a safe fallback, since X is bounded above by X_calc.
		 */
		if (hcrx->ccid3hcrx_x_recv > 0)
			break;
		/* fall through */
	case CCID3_FBACK_PERIODIC:
		delta = ktime_us_delta(now, hcrx->ccid3hcrx_tstamp_last_feedback);
		if (delta <= 0)
			DCCP_BUG("delta (%ld) <= 0", (long)delta);
		else
			hcrx->ccid3hcrx_x_recv =
				scaled_div32(hcrx->ccid3hcrx_bytes_recv, delta);
712
		break;
713
	default:
714 715 716
		return;
	}

717 718
	ccid3_pr_debug("Interval %ldusec, X_recv=%u, 1/p=%u\n", (long)delta,
		       hcrx->ccid3hcrx_x_recv, hcrx->ccid3hcrx_pinv);
719

720
	hcrx->ccid3hcrx_tstamp_last_feedback = now;
721
	hcrx->ccid3hcrx_last_counter	     = dccp_hdr(skb)->dccph_ccval;
722 723
	hcrx->ccid3hcrx_bytes_recv	     = 0;

724
	dp->dccps_hc_rx_insert_options = 1;
725 726 727
	dccp_send_ack(sk);
}

728
static int ccid3_hc_rx_insert_options(struct sock *sk, struct sk_buff *skb)
729
{
730
	const struct ccid3_hc_rx_sock *hcrx;
731
	__be32 x_recv, pinv;
732

733
	if (!(sk->sk_state == DCCP_OPEN || sk->sk_state == DCCP_PARTOPEN))
734
		return 0;
735

736
	hcrx = ccid3_hc_rx_sk(sk);
737 738

	if (dccp_packet_without_ack(skb))
739 740
		return 0;

741 742
	x_recv = htonl(hcrx->ccid3hcrx_x_recv);
	pinv   = htonl(hcrx->ccid3hcrx_pinv);
743

744
	if (dccp_insert_option(sk, skb, TFRC_OPT_LOSS_EVENT_RATE,
745
			       &pinv, sizeof(pinv)) ||
746
	    dccp_insert_option(sk, skb, TFRC_OPT_RECEIVE_RATE,
747
			       &x_recv, sizeof(x_recv)))
748 749 750
		return -1;

	return 0;
751 752
}

753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792
/** ccid3_first_li  -  Implements [RFC 3448, 6.3.1]
 * @sk: socket whose receiver state supplies s, RTT, and the byte counters
 *
 * Determine the length of the first loss interval via inverse lookup.
 * Assume that X_recv can be computed by the throughput equation
 *		    s
 *	X_recv = --------
 *		 R * fval
 * Find some p such that f(p) = fval; return 1/p (scaled).
 *
 * A missing RTT estimate is replaced (and stored) by DCCP_FALLBACK_RTT.
 * Returns ~0U when the lookup yields p == 0, or when neither the freshly
 * measured nor the stored X_recv is non-zero.
 */
static u32 ccid3_first_li(struct sock *sk)
{
	struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk);
	u32 rate, loss_rate, elapsed_us;
	u64 f_target;

	if (hcrx->ccid3hcrx_rtt == 0) {
		DCCP_WARN("No RTT estimate available, using fallback RTT\n");
		hcrx->ccid3hcrx_rtt = DCCP_FALLBACK_RTT;
	}

	/* receive rate measured since the last feedback was sent */
	elapsed_us = ktime_to_us(net_timedelta(hcrx->ccid3hcrx_tstamp_last_feedback));
	rate = scaled_div32(hcrx->ccid3hcrx_bytes_recv, elapsed_us);

	if (rate == 0) {		/* would also trigger divide-by-zero */
		DCCP_WARN("X_recv==0\n");
		/* fall back to the X_recv stored by the last feedback */
		rate = hcrx->ccid3hcrx_x_recv;
		if (rate == 0) {
			DCCP_BUG("stored value of X_recv is zero");
			return ~0U;
		}
	}

	/* fval = s / (R * X_recv), computed in two scaled divisions */
	f_target = scaled_div(hcrx->ccid3hcrx_s, hcrx->ccid3hcrx_rtt);
	f_target = scaled_div32(f_target, rate);
	loss_rate = tfrc_calc_x_reverse_lookup(f_target);

	ccid3_pr_debug("%s(%p), receive rate=%u bytes/s, implied "
		       "loss rate=%u\n", dccp_role(sk), sk, rate, loss_rate);

	if (loss_rate == 0)
		return ~0U;

	return scaled_div(1, loss_rate);
}

793
static void ccid3_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb)
794
{
795
	struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk);
796 797 798 799 800 801 802 803 804 805 806 807 808 809 810
	enum ccid3_fback_type do_feedback = CCID3_FBACK_NONE;
	const u32 ndp = dccp_sk(sk)->dccps_options_received.dccpor_ndp;
	const bool is_data_packet = dccp_data_packet(skb);

	if (unlikely(hcrx->ccid3hcrx_state == TFRC_RSTATE_NO_DATA)) {
		if (is_data_packet) {
			const u32 payload = skb->len - dccp_hdr(skb)->dccph_doff * 4;
			do_feedback = CCID3_FBACK_INITIAL;
			ccid3_hc_rx_set_state(sk, TFRC_RSTATE_DATA);
			hcrx->ccid3hcrx_s = payload;
			/*
			 * Not necessary to update ccid3hcrx_bytes_recv here,
			 * since X_recv = 0 for the first feedback packet (cf.
			 * RFC 3448, 6.3) -- gerrit
			 */
I
Ian McDonald 已提交
811
		}
812
		goto update_records;
I
Ian McDonald 已提交
813 814
	}

815 816
	if (tfrc_rx_hist_duplicate(&hcrx->ccid3hcrx_hist, skb))
		return; /* done receiving */
817

818 819 820 821 822 823 824
	if (is_data_packet) {
		const u32 payload = skb->len - dccp_hdr(skb)->dccph_doff * 4;
		/*
		 * Update moving-average of s and the sum of received payload bytes
		 */
		hcrx->ccid3hcrx_s = tfrc_ewma(hcrx->ccid3hcrx_s, payload, 9);
		hcrx->ccid3hcrx_bytes_recv += payload;
825 826
	}

827 828 829
	/*
	 * Handle pending losses and otherwise check for new loss
	 */
830 831 832 833 834 835 836 837
	if (tfrc_rx_hist_loss_pending(&hcrx->ccid3hcrx_hist) &&
	    tfrc_rx_handle_loss(&hcrx->ccid3hcrx_hist,
				&hcrx->ccid3hcrx_li_hist,
				skb, ndp, ccid3_first_li, sk) ) {
		do_feedback = CCID3_FBACK_PARAM_CHANGE;
		goto done_receiving;
	}

838 839
	if (tfrc_rx_hist_new_loss_indicated(&hcrx->ccid3hcrx_hist, skb, ndp))
		goto update_records;
840

841 842 843 844 845
	/*
	 * Handle data packets: RTT sampling and monitoring p
	 */
	if (unlikely(!is_data_packet))
		goto update_records;
846

847
	if (!tfrc_lh_is_initialised(&hcrx->ccid3hcrx_li_hist)) {
848 849 850 851 852 853 854 855
		const u32 sample = tfrc_rx_hist_sample_rtt(&hcrx->ccid3hcrx_hist, skb);
		/*
		 * Empty loss history: no loss so far, hence p stays 0.
		 * Sample RTT values, since an RTT estimate is required for the
		 * computation of p when the first loss occurs; RFC 3448, 6.3.1.
		 */
		if (sample != 0)
			hcrx->ccid3hcrx_rtt = tfrc_ewma(hcrx->ccid3hcrx_rtt, sample, 9);
856 857 858 859 860 861 862

	} else if (tfrc_lh_update_i_mean(&hcrx->ccid3hcrx_li_hist, skb)) {
		/*
		 * Step (3) of [RFC 3448, 6.1]: Recompute I_mean and, if I_mean
		 * has decreased (resp. p has increased), send feedback now.
		 */
		do_feedback = CCID3_FBACK_PARAM_CHANGE;
863 864
	}

865 866 867 868 869
	/*
	 * Check if the periodic once-per-RTT feedback is due; RFC 4342, 10.3
	 */
	if (SUB16(dccp_hdr(skb)->dccph_ccval, hcrx->ccid3hcrx_last_counter) > 3)
		do_feedback = CCID3_FBACK_PERIODIC;
I
Ian McDonald 已提交
870

871 872
update_records:
	tfrc_rx_hist_add_packet(&hcrx->ccid3hcrx_hist, skb, ndp);
873

874
done_receiving:
875 876
	if (do_feedback)
		ccid3_hc_rx_send_feedback(sk, skb, do_feedback);
877 878
}

879
static int ccid3_hc_rx_init(struct ccid *ccid, struct sock *sk)
880
{
881
	struct ccid3_hc_rx_sock *hcrx = ccid_priv(ccid);
882 883

	hcrx->ccid3hcrx_state = TFRC_RSTATE_NO_DATA;
884
	tfrc_lh_init(&hcrx->ccid3hcrx_li_hist);
885
	return tfrc_rx_hist_alloc(&hcrx->ccid3hcrx_hist);
886 887 888 889
}

static void ccid3_hc_rx_exit(struct sock *sk)
{
890
	struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk);
891 892 893

	ccid3_hc_rx_set_state(sk, TFRC_RSTATE_TERM);

894
	tfrc_rx_hist_purge(&hcrx->ccid3hcrx_hist);
895
	tfrc_lh_cleanup(&hcrx->ccid3hcrx_li_hist);
896 897
}

898 899
static void ccid3_hc_rx_get_info(struct sock *sk, struct tcp_info *info)
{
900
	const struct ccid3_hc_rx_sock *hcrx;
901

902 903 904 905
	/* Listen socks doesn't have a private CCID block */
	if (sk->sk_state == DCCP_LISTEN)
		return;

906
	hcrx = ccid3_hc_rx_sk(sk);
907 908 909
	info->tcpi_ca_state = hcrx->ccid3hcrx_state;
	info->tcpi_options  |= TCPI_OPT_TIMESTAMPS;
	info->tcpi_rcv_rtt  = hcrx->ccid3hcrx_rtt;
910 911
}

912 913 914
/*
 * Receiver-side getsockopt handler.
 * @optname: only DCCP_SOCKOPT_CCID_RX_INFO is supported
 * @len:     size of the user buffer; must hold a struct tfrc_rx_info
 * @optval:  user buffer receiving the struct tfrc_rx_info
 * @optlen:  user location receiving the number of bytes copied
 *
 * Returns 0 on success, -EINVAL for listening sockets or a too-small buffer,
 * -ENOPROTOOPT for any other @optname, -EFAULT if copying to user space fails.
 */
static int ccid3_hc_rx_getsockopt(struct sock *sk, const int optname, int len,
				  u32 __user *optval, int __user *optlen)
{
	const struct ccid3_hc_rx_sock *hcrx;
	struct tfrc_rx_info rx_info;
	const void *val;

	/* Listen socks doesn't have a private CCID block */
	if (sk->sk_state == DCCP_LISTEN)
		return -EINVAL;

	hcrx = ccid3_hc_rx_sk(sk);
	switch (optname) {
	case DCCP_SOCKOPT_CCID_RX_INFO:
		if (len < sizeof(rx_info))
			return -EINVAL;
		rx_info.tfrcrx_x_recv = hcrx->ccid3hcrx_x_recv;
		rx_info.tfrcrx_rtt    = hcrx->ccid3hcrx_rtt;
		/* p is derived from the stored inverse; pinv == 0 maps to ~0U */
		rx_info.tfrcrx_p      = hcrx->ccid3hcrx_pinv == 0 ? ~0U :
					   scaled_div(1, hcrx->ccid3hcrx_pinv);
		len = sizeof(rx_info);
		val = &rx_info;
		break;
	default:
		return -ENOPROTOOPT;
	}

	if (put_user(len, optlen) || copy_to_user(optval, val, len))
		return -EFAULT;

	return 0;
}

945
static struct ccid_operations ccid3 = {
I
Ian McDonald 已提交
946
	.ccid_id		   = DCCPC_CCID3,
947
	.ccid_name		   = "TCP-Friendly Rate Control",
948
	.ccid_owner		   = THIS_MODULE,
949
	.ccid_hc_tx_obj_size	   = sizeof(struct ccid3_hc_tx_sock),
950 951 952 953 954 955
	.ccid_hc_tx_init	   = ccid3_hc_tx_init,
	.ccid_hc_tx_exit	   = ccid3_hc_tx_exit,
	.ccid_hc_tx_send_packet	   = ccid3_hc_tx_send_packet,
	.ccid_hc_tx_packet_sent	   = ccid3_hc_tx_packet_sent,
	.ccid_hc_tx_packet_recv	   = ccid3_hc_tx_packet_recv,
	.ccid_hc_tx_parse_options  = ccid3_hc_tx_parse_options,
956
	.ccid_hc_rx_obj_size	   = sizeof(struct ccid3_hc_rx_sock),
957 958 959 960
	.ccid_hc_rx_init	   = ccid3_hc_rx_init,
	.ccid_hc_rx_exit	   = ccid3_hc_rx_exit,
	.ccid_hc_rx_insert_options = ccid3_hc_rx_insert_options,
	.ccid_hc_rx_packet_recv	   = ccid3_hc_rx_packet_recv,
961 962
	.ccid_hc_rx_get_info	   = ccid3_hc_rx_get_info,
	.ccid_hc_tx_get_info	   = ccid3_hc_tx_get_info,
963 964
	.ccid_hc_rx_getsockopt	   = ccid3_hc_rx_getsockopt,
	.ccid_hc_tx_getsockopt	   = ccid3_hc_tx_getsockopt,
965
};
966

967
/* Expose the ccid3_debug flag as a read-only (0444) module parameter */
#ifdef CONFIG_IP_DCCP_CCID3_DEBUG
module_param(ccid3_debug, bool, 0444);
MODULE_PARM_DESC(ccid3_debug, "Enable debug messages");
#endif
971 972 973

static __init int ccid3_module_init(void)
{
974
	return ccid_register(&ccid3);
975 976 977 978 979 980 981 982 983
}
module_init(ccid3_module_init);

/* Module exit point: unregister the CCID3 operations table */
static __exit void ccid3_module_exit(void)
{
	ccid_unregister(&ccid3);
}
module_exit(ccid3_module_exit);

984
/* Module metadata */
MODULE_AUTHOR("Ian McDonald <ian.mcdonald@jandi.co.nz>, "
	      "Arnaldo Carvalho de Melo <acme@ghostprotocols.net>");
MODULE_DESCRIPTION("DCCP TFRC CCID3 CCID");
MODULE_LICENSE("GPL");
MODULE_ALIAS("net-dccp-ccid-3");