/*
 *  net/dccp/ccids/ccid3.c
 *
 *  Copyright (c) 2007   The University of Aberdeen, Scotland, UK
 *  Copyright (c) 2005-7 The University of Waikato, Hamilton, New Zealand.
 *  Copyright (c) 2005-7 Ian McDonald <ian.mcdonald@jandi.co.nz>
 *
 *  An implementation of the DCCP protocol
 *
 *  This code has been developed by the University of Waikato WAND
 *  research group. For further information please see http://www.wand.net.nz/
 *
 *  This code also uses code from Lulea University, rereleased as GPL by its
 *  authors:
 *  Copyright (c) 2003 Nils-Erik Mattsson, Joacim Haggmark, Magnus Erixzon
 *
 *  Changes to meet Linux coding standards, to make it meet latest ccid3 draft
 *  and to make it work as a loadable module in the DCCP stack written by
 *  Arnaldo Carvalho de Melo <acme@conectiva.com.br>.
 *
 *  Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@conectiva.com.br>
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */
#include "../dccp.h"
#include "ccid3.h"

G
Gerrit Renker 已提交
40 41
#include <asm/unaligned.h>

42 43 44
/*
 * Debug output for CCID-3.
 *
 * With CONFIG_IP_DCCP_CCID3_DEBUG the messages are routed through
 * DCCP_PR_DEBUG and gated by the ccid3_debug flag. Without it the macro
 * must not reference its arguments, because helpers such as the state-name
 * lookups only exist in debug builds.
 */
#ifdef CONFIG_IP_DCCP_CCID3_DEBUG
static int ccid3_debug;
#define ccid3_pr_debug(format, a...)	DCCP_PR_DEBUG(ccid3_debug, format, ##a)
#else
/*
 * Expand to exactly one (empty) statement rather than to nothing, so that
 * "if (cond) ccid3_pr_debug(...); else ..." stays well-formed and does not
 * leave a bare ';' as the body. The arguments are discarded unevaluated.
 */
#define ccid3_pr_debug(format, a...)	do { } while (0)
#endif

49 50 51
/*
 *	Transmitter Half-Connection Routines
 */
52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77
#ifdef CONFIG_IP_DCCP_CCID3_DEBUG
/*
 * ccid3_tx_state_name  -  Printable name of a sender state (debug builds only)
 * @state: sender half-connection state to translate
 *
 * Returns the name registered for @state, or "UNKNOWN" when @state lies
 * outside the table or falls on an index that no designator initialised
 * (such entries are NULL). The result is a string literal and must not be
 * modified or freed.
 */
static const char *ccid3_tx_state_name(enum ccid3_hc_tx_states state)
{
	static const char *const ccid3_state_names[] = {
	[TFRC_SSTATE_NO_SENT]  = "NO_SENT",
	[TFRC_SSTATE_NO_FBACK] = "NO_FBACK",
	[TFRC_SSTATE_FBACK]    = "FBACK",
	[TFRC_SSTATE_TERM]     = "TERM",
	};
	/* unsigned, so that a negative enum value also fails the range check */
	const unsigned int idx = state;

	if (idx >= sizeof(ccid3_state_names) / sizeof(ccid3_state_names[0]) ||
	    !ccid3_state_names[idx])
		return "UNKNOWN";

	return ccid3_state_names[idx];
}
#endif

static void ccid3_hc_tx_set_state(struct sock *sk,
				  enum ccid3_hc_tx_states state)
{
	struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk);
	enum ccid3_hc_tx_states oldstate = hctx->ccid3hctx_state;

	ccid3_pr_debug("%s(%p) %-8.8s -> %s\n",
		       dccp_role(sk), sk, ccid3_tx_state_name(oldstate),
		       ccid3_tx_state_name(state));
	WARN_ON(state == oldstate);
	hctx->ccid3hctx_state = state;
}
78

79
/*
80 81
 * Compute the initial sending rate X_init in the manner of RFC 3390:
 *
82
 *	X_init  =  min(4 * s, max(2 * s, 4380 bytes)) / RTT
83
 *
84 85
 * Note that RFC 3390 uses MSS, RFC 4342 refers to RFC 3390, and rfc3448bis
 * (rev-02) clarifies the use of RFC 3390 with regard to the above formula.
86 87 88 89
 * For consistency with other parts of the code, X_init is scaled by 2^6.
 */
static inline u64 rfc3390_initial_rate(struct sock *sk)
{
90 91 92
	const struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk);
	const __u32 w_init = clamp_t(__u32, 4380U,
			2 * hctx->ccid3hctx_s, 4 * hctx->ccid3hctx_s);
93

94
	return scaled_div(w_init << 6, hctx->ccid3hctx_rtt);
95 96
}

97 98
/*
 * Recalculate t_ipi and delta (should be called whenever X changes)
99
 */
I
Ilpo Järvinen 已提交
100
static void ccid3_update_send_interval(struct ccid3_hc_tx_sock *hctx)
101
{
102 103 104 105 106 107 108 109 110 111 112 113
	/* Calculate new t_ipi = s / X_inst (X_inst is in 64 * bytes/second) */
	hctx->ccid3hctx_t_ipi = scaled_div32(((u64)hctx->ccid3hctx_s) << 6,
					     hctx->ccid3hctx_x);

	/* Calculate new delta by delta = min(t_ipi / 2, t_gran / 2) */
	hctx->ccid3hctx_delta = min_t(u32, hctx->ccid3hctx_t_ipi / 2,
					   TFRC_OPSYS_HALF_TIME_GRAN);

	ccid3_pr_debug("t_ipi=%u, delta=%u, s=%u, X=%u\n",
		       hctx->ccid3hctx_t_ipi, hctx->ccid3hctx_delta,
		       hctx->ccid3hctx_s, (unsigned)(hctx->ccid3hctx_x >> 6));

114
}
115

116 117
/*
 * Number of whole RTTs between the last window-counter timestamp
 * (ccid3hctx_t_last_win_count) and @now. The elapsed microseconds are
 * truncated to 32 bits before the division; the RTT estimate is the divisor
 * and must be non-zero.
 */
static u32 ccid3_hc_tx_idle_rtt(struct ccid3_hc_tx_sock *hctx, ktime_t now)
{
	const u32 elapsed_us = ktime_us_delta(now,
					      hctx->ccid3hctx_t_last_win_count);

	return elapsed_us / hctx->ccid3hctx_rtt;
}

123 124 125 126
/**
 * ccid3_hc_tx_update_x  -  Update allowed sending rate X
 * @stamp: most recent time if available - can be left NULL.
 * This function tracks draft rfc3448bis, check there for latest details.
127
 *
128 129 130 131 132
 * Note: X and X_recv are both stored in units of 64 * bytes/second, to support
 *       fine-grained resolution of sending rates. This requires scaling by 2^6
 *       throughout the code. Only X_calc is unscaled (in bytes/second).
 *
 */
133
static void ccid3_hc_tx_update_x(struct sock *sk, ktime_t *stamp)
134
{
135
	struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk);
136 137
	__u64 min_rate = 2 * hctx->ccid3hctx_x_recv;
	const  __u64 old_x = hctx->ccid3hctx_x;
138
	ktime_t now = stamp ? *stamp : ktime_get_real();
139

140 141
	/*
	 * Handle IDLE periods: do not reduce below RFC3390 initial sending rate
142 143
	 * when idling [RFC 4342, 5.1]. Definition of idling is from rfc3448bis:
	 * a sender is idle if it has not sent anything over a 2-RTT-period.
144 145
	 * For consistency with X and X_recv, min_rate is also scaled by 2^6.
	 */
146
	if (ccid3_hc_tx_idle_rtt(hctx, now) >= 2) {
147
		min_rate = rfc3390_initial_rate(sk);
148
		min_rate = max(min_rate, 2 * hctx->ccid3hctx_x_recv);
149 150
	}

151
	if (hctx->ccid3hctx_p > 0) {
152

153 154 155 156 157
		hctx->ccid3hctx_x = min(((__u64)hctx->ccid3hctx_x_calc) << 6,
					min_rate);
		hctx->ccid3hctx_x = max(hctx->ccid3hctx_x,
					(((__u64)hctx->ccid3hctx_s) << 6) /
								TFRC_T_MBI);
158

159 160
	} else if (ktime_us_delta(now, hctx->ccid3hctx_t_ld)
				- (s64)hctx->ccid3hctx_rtt >= 0) {
161

162 163 164 165 166
		hctx->ccid3hctx_x = min(2 * hctx->ccid3hctx_x, min_rate);
		hctx->ccid3hctx_x = max(hctx->ccid3hctx_x,
			    scaled_div(((__u64)hctx->ccid3hctx_s) << 6,
				       hctx->ccid3hctx_rtt));
		hctx->ccid3hctx_t_ld = now;
167
	}
168

169
	if (hctx->ccid3hctx_x != old_x) {
170 171
		ccid3_pr_debug("X_prev=%u, X_now=%u, X_calc=%u, "
			       "X_recv=%u\n", (unsigned)(old_x >> 6),
172 173 174
			       (unsigned)(hctx->ccid3hctx_x >> 6),
			       hctx->ccid3hctx_x_calc,
			       (unsigned)(hctx->ccid3hctx_x_recv >> 6));
I
Ian McDonald 已提交
175

176
		ccid3_update_send_interval(hctx);
I
Ian McDonald 已提交
177
	}
178 179
}

180
/*
181 182
 *	Track the mean packet size `s' (cf. RFC 4342, 5.3 and  RFC 3448, 4.1)
 *	@len: DCCP packet payload size in bytes
183
 */
184
static inline void ccid3_hc_tx_update_s(struct ccid3_hc_tx_sock *hctx, int len)
185
{
186 187 188 189 190 191
	const u16 old_s = hctx->ccid3hctx_s;

	hctx->ccid3hctx_s = tfrc_ewma(hctx->ccid3hctx_s, len, 9);

	if (hctx->ccid3hctx_s != old_s)
		ccid3_update_send_interval(hctx);
192 193
}

194
/*
195
 *	Update Window Counter using the algorithm from [RFC 4342, 8.1].
196
 *	As elsewhere, RTT > 0 is assumed by using dccp_sample_rtt().
197 198
 */
static inline void ccid3_hc_tx_update_win_count(struct ccid3_hc_tx_sock *hctx,
199
						ktime_t now)
200
{
201 202
	u32 delta = ktime_us_delta(now, hctx->ccid3hctx_t_last_win_count),
	    quarter_rtts = (4 * delta) / hctx->ccid3hctx_rtt;
203 204

	if (quarter_rtts > 0) {
205 206 207
		hctx->ccid3hctx_t_last_win_count = now;
		hctx->ccid3hctx_last_win_count  += min(quarter_rtts, 5U);
		hctx->ccid3hctx_last_win_count	&= 0xF;		/* mod 16 */
208 209 210
	}
}

211 212 213
static void ccid3_hc_tx_no_feedback_timer(unsigned long data)
{
	struct sock *sk = (struct sock *)data;
214
	struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk);
215
	unsigned long t_nfb = USEC_PER_SEC / 5;
216 217 218 219 220

	bh_lock_sock(sk);
	if (sock_owned_by_user(sk)) {
		/* Try again later. */
		/* XXX: set some sensible MIB */
221
		goto restart_timer;
222 223
	}

224 225
	ccid3_pr_debug("%s(%p, state=%s) - entry \n", dccp_role(sk), sk,
		       ccid3_tx_state_name(hctx->ccid3hctx_state));
226

227 228 229
	if (hctx->ccid3hctx_state == TFRC_SSTATE_FBACK)
		ccid3_hc_tx_set_state(sk, TFRC_SSTATE_NO_FBACK);
	else if (hctx->ccid3hctx_state != TFRC_SSTATE_NO_FBACK)
G
Gerrit Renker 已提交
230 231
		goto out;

232 233 234
	/*
	 * Determine new allowed sending rate X as per draft rfc3448bis-00, 4.4
	 */
235 236
	if (hctx->ccid3hctx_t_rto == 0 ||	/* no feedback received yet */
	    hctx->ccid3hctx_p == 0) {
237 238

		/* halve send rate directly */
239 240 241
		hctx->ccid3hctx_x = max(hctx->ccid3hctx_x / 2,
					(((__u64)hctx->ccid3hctx_s) << 6) /
								    TFRC_T_MBI);
242
		ccid3_update_send_interval(hctx);
243
	} else {
244
		/*
245
		 *  Modify the cached value of X_recv
246
		 *
247
		 *  If (X_calc > 2 * X_recv)
248 249 250 251 252
		 *    X_recv = max(X_recv / 2, s / (2 * t_mbi));
		 *  Else
		 *    X_recv = X_calc / 4;
		 *
		 *  Note that X_recv is scaled by 2^6 while X_calc is not
253
		 */
254
		BUG_ON(hctx->ccid3hctx_p && !hctx->ccid3hctx_x_calc);
255

256 257 258 259 260
		if (hctx->ccid3hctx_x_calc > (hctx->ccid3hctx_x_recv >> 5))
			hctx->ccid3hctx_x_recv =
				max(hctx->ccid3hctx_x_recv / 2,
				    (((__u64)hctx->ccid3hctx_s) << 6) /
							      (2 * TFRC_T_MBI));
261
		else {
262 263
			hctx->ccid3hctx_x_recv = hctx->ccid3hctx_x_calc;
			hctx->ccid3hctx_x_recv <<= 4;
264
		}
265
		ccid3_hc_tx_update_x(sk, NULL);
266
	}
267
	ccid3_pr_debug("Reduced X to %llu/64 bytes/sec\n",
268
			(unsigned long long)hctx->ccid3hctx_x);
269 270 271 272 273

	/*
	 * Set new timeout for the nofeedback timer.
	 * See comments in packet_recv() regarding the value of t_RTO.
	 */
274
	if (unlikely(hctx->ccid3hctx_t_rto == 0))	/* no feedback yet */
275 276
		t_nfb = TFRC_INITIAL_TIMEOUT;
	else
277
		t_nfb = max(hctx->ccid3hctx_t_rto, 2 * hctx->ccid3hctx_t_ipi);
278

279
restart_timer:
280
	sk_reset_timer(sk, &hctx->ccid3hctx_no_feedback_timer,
281
			   jiffies + usecs_to_jiffies(t_nfb));
282 283 284 285 286
out:
	bh_unlock_sock(sk);
	sock_put(sk);
}

287 288 289 290 291
/*
 * returns
 *   > 0: delay (in msecs) that should pass before actually sending
 *   = 0: can send immediately
 *   < 0: error condition; do not send packet
292
 */
293
static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb)
294 295
{
	struct dccp_sock *dp = dccp_sk(sk);
296
	struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk);
297 298
	ktime_t now = ktime_get_real();
	s64 delay;
299 300

	/*
301 302 303
	 * This function is called only for Data and DataAck packets. Sending
	 * zero-sized Data(Ack)s is theoretically possible, but for congestion
	 * control this case is pathological - ignore it.
304
	 */
305
	if (unlikely(skb->len == 0))
306
		return -EBADMSG;
307

308 309 310 311
	switch (hctx->ccid3hctx_state) {
	case TFRC_SSTATE_NO_SENT:
		sk_reset_timer(sk, &hctx->ccid3hctx_no_feedback_timer,
			       (jiffies +
312
				usecs_to_jiffies(TFRC_INITIAL_TIMEOUT)));
313 314
		hctx->ccid3hctx_last_win_count	 = 0;
		hctx->ccid3hctx_t_last_win_count = now;
315 316

		/* Set t_0 for initial packet */
317 318 319
		hctx->ccid3hctx_t_nom = now;

		hctx->ccid3hctx_s = skb->len;
320 321 322 323 324 325 326 327

		/*
		 * Use initial RTT sample when available: recommended by erratum
		 * to RFC 4342. This implements the initialisation procedure of
		 * draft rfc3448bis, section 4.2. Remember, X is scaled by 2^6.
		 */
		if (dp->dccps_syn_rtt) {
			ccid3_pr_debug("SYN RTT = %uus\n", dp->dccps_syn_rtt);
328 329 330
			hctx->ccid3hctx_rtt  = dp->dccps_syn_rtt;
			hctx->ccid3hctx_x    = rfc3390_initial_rate(sk);
			hctx->ccid3hctx_t_ld = now;
331
		} else {
332 333 334 335 336 337
			/*
			 * Sender does not have RTT sample:
			 * - set fallback RTT (RFC 4340, 3.4) since a RTT value
			 *   is needed in several parts (e.g.  window counter);
			 * - set sending rate X_pps = 1pps as per RFC 3448, 4.2.
			 */
338 339 340
			hctx->ccid3hctx_rtt = DCCP_FALLBACK_RTT;
			hctx->ccid3hctx_x   = hctx->ccid3hctx_s;
			hctx->ccid3hctx_x <<= 6;
341 342 343
		}
		ccid3_update_send_interval(hctx);

344 345 346 347 348
		ccid3_hc_tx_set_state(sk, TFRC_SSTATE_NO_FBACK);
		break;
	case TFRC_SSTATE_NO_FBACK:
	case TFRC_SSTATE_FBACK:
		delay = ktime_us_delta(hctx->ccid3hctx_t_nom, now);
I
Ian McDonald 已提交
349
		ccid3_pr_debug("delay=%ld\n", (long)delay);
350
		/*
351
		 *	Scheduling of packet transmissions [RFC 3448, 4.6]
352 353 354 355 356 357
		 *
		 * if (t_now > t_nom - delta)
		 *       // send the packet now
		 * else
		 *       // send the packet in (t_nom - t_now) milliseconds.
		 */
358 359
		if (delay - (s64)hctx->ccid3hctx_delta >= 1000)
			return (u32)delay / 1000L;
360

361
		ccid3_hc_tx_update_win_count(hctx, now);
362 363 364 365
		break;
	case TFRC_SSTATE_TERM:
		DCCP_BUG("%s(%p) - Illegal state TERM", dccp_role(sk), sk);
		return -EINVAL;
366 367
	}

368 369
	/* prepare to send now (add options etc.) */
	dp->dccps_hc_tx_insert_options = 1;
370
	DCCP_SKB_CB(skb)->dccpd_ccval = hctx->ccid3hctx_last_win_count;
371 372

	/* set the nominal send time for the next following packet */
373 374 375
	hctx->ccid3hctx_t_nom = ktime_add_us(hctx->ccid3hctx_t_nom,
					     hctx->ccid3hctx_t_ipi);
	return 0;
376 377
}

378 379
static void ccid3_hc_tx_packet_sent(struct sock *sk, int more,
				    unsigned int len)
380
{
381
	struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk);
382

383
	ccid3_hc_tx_update_s(hctx, len);
384

385
	if (tfrc_tx_hist_add(&hctx->ccid3hctx_hist, dccp_sk(sk)->dccps_gss))
386
		DCCP_CRIT("packet history - out of memory!");
387 388 389 390
}

static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
{
391
	struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk);
392
	struct ccid3_options_received *opt_recv;
393
	ktime_t now;
394
	unsigned long t_nfb;
395
	u32 pinv, r_sample;
396

397 398 399 400
	/* we are only interested in ACKs */
	if (!(DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_ACK ||
	      DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_DATAACK))
		return;
401 402 403 404 405 406 407 408 409 410 411 412 413 414 415
	/* ... and only in the established state */
	if (hctx->ccid3hctx_state != TFRC_SSTATE_FBACK &&
	    hctx->ccid3hctx_state != TFRC_SSTATE_NO_FBACK)
		return;

	opt_recv = &hctx->ccid3hctx_options_received;
	now = ktime_get_real();

	/* Estimate RTT from history if ACK number is valid */
	r_sample = tfrc_tx_hist_rtt(hctx->ccid3hctx_hist,
				    DCCP_SKB_CB(skb)->dccpd_ack_seq, now);
	if (r_sample == 0) {
		DCCP_WARN("%s(%p): %s with bogus ACK-%llu\n", dccp_role(sk), sk,
			  dccp_packet_name(DCCP_SKB_CB(skb)->dccpd_type),
			  (unsigned long long)DCCP_SKB_CB(skb)->dccpd_ack_seq);
416
		return;
417
	}
418

419 420 421
	/* Update receive rate in units of 64 * bytes/second */
	hctx->ccid3hctx_x_recv = opt_recv->ccid3or_receive_rate;
	hctx->ccid3hctx_x_recv <<= 6;
422

423 424 425 426 427 428 429 430 431 432 433
	/* Update loss event rate (which is scaled by 1e6) */
	pinv = opt_recv->ccid3or_loss_event_rate;
	if (pinv == ~0U || pinv == 0)	       /* see RFC 4342, 8.5   */
		hctx->ccid3hctx_p = 0;
	else				       /* can not exceed 100% */
		hctx->ccid3hctx_p = scaled_div(1, pinv);
	/*
	 * Validate new RTT sample and update moving average
	 */
	r_sample = dccp_sample_rtt(sk, r_sample);
	hctx->ccid3hctx_rtt = tfrc_ewma(hctx->ccid3hctx_rtt, r_sample, 9);
434 435 436
	/*
	 * Update allowed sending rate X as per draft rfc3448bis-00, 4.2/3
	 */
437 438
	if (hctx->ccid3hctx_state == TFRC_SSTATE_NO_FBACK) {
		ccid3_hc_tx_set_state(sk, TFRC_SSTATE_FBACK);
439

440
		if (hctx->ccid3hctx_t_rto == 0) {
441 442 443
			/*
			 * Initial feedback packet: Larger Initial Windows (4.2)
			 */
444 445
			hctx->ccid3hctx_x    = rfc3390_initial_rate(sk);
			hctx->ccid3hctx_t_ld = now;
446

447
			ccid3_update_send_interval(hctx);
448

449
			goto done_computing_x;
450
		} else if (hctx->ccid3hctx_p == 0) {
451 452 453 454 455 456
			/*
			 * First feedback after nofeedback timer expiry (4.3)
			 */
			goto done_computing_x;
		}
	}
457

458
	/* Update sending rate (step 4 of [RFC 3448, 4.3]) */
459 460 461 462 463
	if (hctx->ccid3hctx_p > 0)
		hctx->ccid3hctx_x_calc =
				tfrc_calc_x(hctx->ccid3hctx_s,
					    hctx->ccid3hctx_rtt,
					    hctx->ccid3hctx_p);
464
	ccid3_hc_tx_update_x(sk, &now);
465

466 467
done_computing_x:
	ccid3_pr_debug("%s(%p), RTT=%uus (sample=%uus), s=%u, "
468
			       "p=%u, X_calc=%u, X_recv=%u, X=%u\n",
469 470 471 472 473 474
			       dccp_role(sk),
			       sk, hctx->ccid3hctx_rtt, r_sample,
			       hctx->ccid3hctx_s, hctx->ccid3hctx_p,
			       hctx->ccid3hctx_x_calc,
			       (unsigned)(hctx->ccid3hctx_x_recv >> 6),
			       (unsigned)(hctx->ccid3hctx_x >> 6));
475

476
	/* unschedule no feedback timer */
477
	sk_stop_timer(sk, &hctx->ccid3hctx_no_feedback_timer);
478

479 480 481 482 483
	/*
	 * As we have calculated new ipi, delta, t_nom it is possible
	 * that we now can send a packet, so wake up dccp_wait_for_ccid
	 */
	sk->sk_write_space(sk);
484

485 486 487 488 489 490
	/*
	 * Update timeout interval for the nofeedback timer.
	 * We use a configuration option to increase the lower bound.
	 * This can help avoid triggering the nofeedback timer too
	 * often ('spinning') on LANs with small RTTs.
	 */
491 492 493
	hctx->ccid3hctx_t_rto = max_t(u32, 4 * hctx->ccid3hctx_rtt,
					   (CONFIG_IP_DCCP_CCID3_RTO *
					    (USEC_PER_SEC / 1000)));
494 495 496 497
	/*
	 * Schedule no feedback timer to expire in
	 * max(t_RTO, 2 * s/X)  =  max(t_RTO, 2 * t_ipi)
	 */
498
	t_nfb = max(hctx->ccid3hctx_t_rto, 2 * hctx->ccid3hctx_t_ipi);
499

500 501
	ccid3_pr_debug("%s(%p), Scheduled no feedback timer to "
		       "expire in %lu jiffies (%luus)\n",
502 503
		       dccp_role(sk),
		       sk, usecs_to_jiffies(t_nfb), t_nfb);
504

505
	sk_reset_timer(sk, &hctx->ccid3hctx_no_feedback_timer,
506
			   jiffies + usecs_to_jiffies(t_nfb));
507 508
}

509 510 511
/*
 * Store one CCID-3 option received from the peer.
 *
 * @option: option type; types other than the three handled below are ignored
 * @len:    length of the option value in bytes
 * @idx:    recorded as-is for the Loss Intervals option
 * @value:  option value; read as a 32-bit big-endian quantity for the two
 *          rate options
 *
 * The cached option set is reset whenever the socket's dccps_gsr differs
 * from the sequence number the cache was last filled for.
 *
 * Returns 0 on success and -EINVAL if a rate option is not 4 bytes long.
 */
static int ccid3_hc_tx_parse_options(struct sock *sk, unsigned char option,
				     unsigned char len, u16 idx,
				     unsigned char *value)
{
	const struct dccp_sock *dp = dccp_sk(sk);
	struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk);
	struct ccid3_options_received *opt_recv =
					&hctx->ccid3hctx_options_received;
	__be32 opt_val;

	if (opt_recv->ccid3or_seqno != dp->dccps_gsr) {
		opt_recv->ccid3or_seqno		     = dp->dccps_gsr;
		opt_recv->ccid3or_loss_event_rate    = ~0;
		opt_recv->ccid3or_loss_intervals_idx = 0;
		opt_recv->ccid3or_loss_intervals_len = 0;
		opt_recv->ccid3or_receive_rate	     = 0;
	}

	switch (option) {
	case TFRC_OPT_LOSS_EVENT_RATE:
		if (unlikely(len != 4)) {
			DCCP_WARN("%s(%p), invalid len %d "
				  "for TFRC_OPT_LOSS_EVENT_RATE\n",
				  dccp_role(sk), sk, len);
			return -EINVAL;
		}
		opt_val = get_unaligned((__be32 *)value);
		opt_recv->ccid3or_loss_event_rate = ntohl(opt_val);
		ccid3_pr_debug("%s(%p), LOSS_EVENT_RATE=%u\n",
			       dccp_role(sk), sk,
			       opt_recv->ccid3or_loss_event_rate);
		break;
	case TFRC_OPT_LOSS_INTERVALS:
		opt_recv->ccid3or_loss_intervals_idx = idx;
		opt_recv->ccid3or_loss_intervals_len = len;
		ccid3_pr_debug("%s(%p), LOSS_INTERVALS=(%u, %u)\n",
			       dccp_role(sk), sk,
			       opt_recv->ccid3or_loss_intervals_idx,
			       opt_recv->ccid3or_loss_intervals_len);
		break;
	case TFRC_OPT_RECEIVE_RATE:
		if (unlikely(len != 4)) {
			DCCP_WARN("%s(%p), invalid len %d "
				  "for TFRC_OPT_RECEIVE_RATE\n",
				  dccp_role(sk), sk, len);
			return -EINVAL;
		}
		opt_val = get_unaligned((__be32 *)value);
		opt_recv->ccid3or_receive_rate = ntohl(opt_val);
		ccid3_pr_debug("%s(%p), RECEIVE_RATE=%u\n",
			       dccp_role(sk), sk,
			       opt_recv->ccid3or_receive_rate);
		break;
	}

	return 0;
}

571
static int ccid3_hc_tx_init(struct ccid *ccid, struct sock *sk)
572
{
573
	struct ccid3_hc_tx_sock *hctx = ccid_priv(ccid);
574

575 576 577 578 579
	hctx->ccid3hctx_state = TFRC_SSTATE_NO_SENT;
	hctx->ccid3hctx_hist = NULL;
	setup_timer(&hctx->ccid3hctx_no_feedback_timer,
			ccid3_hc_tx_no_feedback_timer, (unsigned long)sk);

580 581 582 583 584
	return 0;
}

static void ccid3_hc_tx_exit(struct sock *sk)
{
585
	struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk);
586

587 588 589 590
	ccid3_hc_tx_set_state(sk, TFRC_SSTATE_TERM);
	sk_stop_timer(sk, &hctx->ccid3hctx_no_feedback_timer);

	tfrc_tx_hist_purge(&hctx->ccid3hctx_hist);
591 592
}

593 594
static void ccid3_hc_tx_get_info(struct sock *sk, struct tcp_info *info)
{
595 596 597 598 599 600 601 602 603
	struct ccid3_hc_tx_sock *hctx;

	/* Listen socks doesn't have a private CCID block */
	if (sk->sk_state == DCCP_LISTEN)
		return;

	hctx = ccid3_hc_tx_sk(sk);
	info->tcpi_rto = hctx->ccid3hctx_t_rto;
	info->tcpi_rtt = hctx->ccid3hctx_rtt;
604 605 606 607 608
}

/*
 * Sender-side getsockopt handler.
 *
 * DCCP_SOCKOPT_CCID_TX_INFO copies ccid3hctx_tfrc to @optval and stores its
 * size in @optlen.
 *
 * Returns 0 on success, -EINVAL for listening sockets or when @len is too
 * small, -ENOPROTOOPT for any other @optname, and -EFAULT if writing to
 * user space fails.
 */
static int ccid3_hc_tx_getsockopt(struct sock *sk, const int optname, int len,
				  u32 __user *optval, int __user *optlen)
{
	const struct ccid3_hc_tx_sock *hctx;
	const void *val;

	/* Listen socks doesn't have a private CCID block */
	if (sk->sk_state == DCCP_LISTEN)
		return -EINVAL;

	hctx = ccid3_hc_tx_sk(sk);

	if (optname != DCCP_SOCKOPT_CCID_TX_INFO)
		return -ENOPROTOOPT;

	if (len < sizeof(hctx->ccid3hctx_tfrc))
		return -EINVAL;
	len = sizeof(hctx->ccid3hctx_tfrc);
	val = &hctx->ccid3hctx_tfrc;

	if (put_user(len, optlen) || copy_to_user(optval, val, len))
		return -EFAULT;

	return 0;
}

634
/*
 *	Receiver Half-Connection Routines
 */
637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671

/*
 * CCID3 feedback types: the reason handed to ccid3_hc_rx_send_feedback()
 * by the receiver's packet handler.
 */
enum ccid3_fback_type {
	/* no feedback due; must stay 0 since the value is tested as a boolean */
	CCID3_FBACK_NONE = 0,
	/* first data packet received while in state NO_DATA */
	CCID3_FBACK_INITIAL,
	/* window counter moved more than 3 past the one of the last feedback */
	CCID3_FBACK_PERIODIC,
	/* loss handling or the I_mean update asked for immediate feedback */
	CCID3_FBACK_PARAM_CHANGE
};

#ifdef CONFIG_IP_DCCP_CCID3_DEBUG
/*
 * ccid3_rx_state_name  -  Printable name of a receiver state (debug builds only)
 * @state: receiver half-connection state to translate
 *
 * Returns the name registered for @state, or "UNKNOWN" when @state lies
 * outside the table or falls on an index that no designator initialised
 * (such entries are NULL). The result is a string literal and must not be
 * modified or freed.
 */
static const char *ccid3_rx_state_name(enum ccid3_hc_rx_states state)
{
	static const char *const ccid3_rx_state_names[] = {
	[TFRC_RSTATE_NO_DATA] = "NO_DATA",
	[TFRC_RSTATE_DATA]    = "DATA",
	[TFRC_RSTATE_TERM]    = "TERM",
	};
	/* unsigned, so that a negative enum value also fails the range check */
	const unsigned int idx = state;

	if (idx >= sizeof(ccid3_rx_state_names) /
		   sizeof(ccid3_rx_state_names[0]) ||
	    !ccid3_rx_state_names[idx])
		return "UNKNOWN";

	return ccid3_rx_state_names[idx];
}
#endif

static void ccid3_hc_rx_set_state(struct sock *sk,
				  enum ccid3_hc_rx_states state)
{
	struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk);
	enum ccid3_hc_rx_states oldstate = hcrx->ccid3hcrx_state;

	ccid3_pr_debug("%s(%p) %-8.8s -> %s\n",
		       dccp_role(sk), sk, ccid3_rx_state_name(oldstate),
		       ccid3_rx_state_name(state));
	WARN_ON(state == oldstate);
	hcrx->ccid3hcrx_state = state;
}

672 673 674
static void ccid3_hc_rx_send_feedback(struct sock *sk,
				      const struct sk_buff *skb,
				      enum ccid3_fback_type fbtype)
675
{
676
	struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk);
677 678 679 680 681 682 683 684
	struct dccp_sock *dp = dccp_sk(sk);
	ktime_t now;
	s64 delta = 0;

	if (unlikely(hcrx->ccid3hcrx_state == TFRC_RSTATE_TERM))
		return;

	now = ktime_get_real();
685

686 687
	switch (fbtype) {
	case CCID3_FBACK_INITIAL:
688 689
		hcrx->ccid3hcrx_x_recv = 0;
		hcrx->ccid3hcrx_pinv   = ~0U;   /* see RFC 4342, 8.5 */
690
		break;
691 692 693 694
	case CCID3_FBACK_PARAM_CHANGE:
		/*
		 * When parameters change (new loss or p > p_prev), we do not
		 * have a reliable estimate for R_m of [RFC 3448, 6.2] and so
695 696 697 698 699 700
		 * need to  reuse the previous value of X_recv. However, when
		 * X_recv was 0 (due to early loss), this would kill X down to
		 * s/t_mbi (i.e. one packet in 64 seconds).
		 * To avoid such drastic reduction, we approximate X_recv as
		 * the number of bytes since last feedback.
		 * This is a safe fallback, since X is bounded above by X_calc.
701
		 */
702 703 704
		if (hcrx->ccid3hcrx_x_recv > 0)
			break;
		/* fall through */
705
	case CCID3_FBACK_PERIODIC:
706 707 708 709 710 711
		delta = ktime_us_delta(now, hcrx->ccid3hcrx_tstamp_last_feedback);
		if (delta <= 0)
			DCCP_BUG("delta (%ld) <= 0", (long)delta);
		else
			hcrx->ccid3hcrx_x_recv =
				scaled_div32(hcrx->ccid3hcrx_bytes_recv, delta);
712
		break;
713
	default:
714 715 716
		return;
	}

717 718
	ccid3_pr_debug("Interval %ldusec, X_recv=%u, 1/p=%u\n", (long)delta,
		       hcrx->ccid3hcrx_x_recv, hcrx->ccid3hcrx_pinv);
719

720 721 722
	hcrx->ccid3hcrx_tstamp_last_feedback = now;
	hcrx->ccid3hcrx_last_counter	     = dccp_hdr(skb)->dccph_ccval;
	hcrx->ccid3hcrx_bytes_recv	     = 0;
723

724 725
	dp->dccps_hc_rx_insert_options = 1;
	dccp_send_ack(sk);
726 727
}

728
static int ccid3_hc_rx_insert_options(struct sock *sk, struct sk_buff *skb)
729
{
730
	const struct ccid3_hc_rx_sock *hcrx;
731
	__be32 x_recv, pinv;
732

733
	if (!(sk->sk_state == DCCP_OPEN || sk->sk_state == DCCP_PARTOPEN))
734
		return 0;
735

736 737
	hcrx = ccid3_hc_rx_sk(sk);

738
	if (dccp_packet_without_ack(skb))
739 740
		return 0;

741 742
	x_recv = htonl(hcrx->ccid3hcrx_x_recv);
	pinv   = htonl(hcrx->ccid3hcrx_pinv);
743

744
	if (dccp_insert_option(sk, skb, TFRC_OPT_LOSS_EVENT_RATE,
745
			       &pinv, sizeof(pinv)) ||
746
	    dccp_insert_option(sk, skb, TFRC_OPT_RECEIVE_RATE,
747
			       &x_recv, sizeof(x_recv)))
748 749 750
		return -1;

	return 0;
751 752
}

753 754 755 756 757 758 759 760 761 762 763 764
/** ccid3_first_li  -  Implements [RFC 3448, 6.3.1]
 *
 * Determine the length of the first loss interval via inverse lookup.
 * Assume that X_recv can be computed by the throughput equation
 *		    s
 *	X_recv = --------
 *		 R * fval
 * Find some p such that f(p) = fval; return 1/p (scaled).
 */
static u32 ccid3_first_li(struct sock *sk)
{
	struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk);
765
	u32 x_recv, p, delta;
766 767
	u64 fval;

768 769 770 771
	if (hcrx->ccid3hcrx_rtt == 0) {
		DCCP_WARN("No RTT estimate available, using fallback RTT\n");
		hcrx->ccid3hcrx_rtt = DCCP_FALLBACK_RTT;
	}
772

773 774 775 776 777 778 779 780 781
	delta = ktime_to_us(net_timedelta(hcrx->ccid3hcrx_tstamp_last_feedback));
	x_recv = scaled_div32(hcrx->ccid3hcrx_bytes_recv, delta);
	if (x_recv == 0) {		/* would also trigger divide-by-zero */
		DCCP_WARN("X_recv==0\n");
		if ((x_recv = hcrx->ccid3hcrx_x_recv) == 0) {
			DCCP_BUG("stored value of X_recv is zero");
			return ~0U;
		}
	}
782

783 784
	fval = scaled_div(hcrx->ccid3hcrx_s, hcrx->ccid3hcrx_rtt);
	fval = scaled_div32(fval, x_recv);
785 786 787 788 789
	p = tfrc_calc_x_reverse_lookup(fval);

	ccid3_pr_debug("%s(%p), receive rate=%u bytes/s, implied "
		       "loss rate=%u\n", dccp_role(sk), sk, x_recv, p);

790
	return p == 0 ? ~0U : scaled_div(1, p);
791 792
}

793
static void ccid3_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb)
794
{
795
	struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk);
796
	enum ccid3_fback_type do_feedback = CCID3_FBACK_NONE;
797
	const u64 ndp = dccp_sk(sk)->dccps_options_received.dccpor_ndp;
798 799
	const bool is_data_packet = dccp_data_packet(skb);

800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826
	if (unlikely(hcrx->ccid3hcrx_state == TFRC_RSTATE_NO_DATA)) {
		if (is_data_packet) {
			const u32 payload = skb->len - dccp_hdr(skb)->dccph_doff * 4;
			do_feedback = CCID3_FBACK_INITIAL;
			ccid3_hc_rx_set_state(sk, TFRC_RSTATE_DATA);
			hcrx->ccid3hcrx_s = payload;
			/*
			 * Not necessary to update ccid3hcrx_bytes_recv here,
			 * since X_recv = 0 for the first feedback packet (cf.
			 * RFC 3448, 6.3) -- gerrit
			 */
		}
		goto update_records;
	}

	if (tfrc_rx_hist_duplicate(&hcrx->ccid3hcrx_hist, skb))
		return; /* done receiving */

	if (is_data_packet) {
		const u32 payload = skb->len - dccp_hdr(skb)->dccph_doff * 4;
		/*
		 * Update moving-average of s and the sum of received payload bytes
		 */
		hcrx->ccid3hcrx_s = tfrc_ewma(hcrx->ccid3hcrx_s, payload, 9);
		hcrx->ccid3hcrx_bytes_recv += payload;
	}

827 828 829
	/*
	 * Perform loss detection and handle pending losses
	 */
830 831 832 833 834 835 836 837 838
	if (tfrc_rx_handle_loss(&hcrx->ccid3hcrx_hist, &hcrx->ccid3hcrx_li_hist,
				skb, ndp, ccid3_first_li, sk)) {
		do_feedback = CCID3_FBACK_PARAM_CHANGE;
		goto done_receiving;
	}

	if (tfrc_rx_hist_loss_pending(&hcrx->ccid3hcrx_hist))
		return; /* done receiving */

839
	/*
840
	 * Handle data packets: RTT sampling and monitoring p
841
	 */
842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862
	if (unlikely(!is_data_packet))
		goto update_records;

	if (!tfrc_lh_is_initialised(&hcrx->ccid3hcrx_li_hist)) {
		const u32 sample = tfrc_rx_hist_sample_rtt(&hcrx->ccid3hcrx_hist, skb);
		/*
		 * Empty loss history: no loss so far, hence p stays 0.
		 * Sample RTT values, since an RTT estimate is required for the
		 * computation of p when the first loss occurs; RFC 3448, 6.3.1.
		 */
		if (sample != 0)
			hcrx->ccid3hcrx_rtt = tfrc_ewma(hcrx->ccid3hcrx_rtt, sample, 9);

	} else if (tfrc_lh_update_i_mean(&hcrx->ccid3hcrx_li_hist, skb)) {
		/*
		 * Step (3) of [RFC 3448, 6.1]: Recompute I_mean and, if I_mean
		 * has decreased (resp. p has increased), send feedback now.
		 */
		do_feedback = CCID3_FBACK_PARAM_CHANGE;
	}

863 864 865
	/*
	 * Check if the periodic once-per-RTT feedback is due; RFC 4342, 10.3
	 */
866 867 868 869 870 871 872 873 874
	if (SUB16(dccp_hdr(skb)->dccph_ccval, hcrx->ccid3hcrx_last_counter) > 3)
		do_feedback = CCID3_FBACK_PERIODIC;

update_records:
	tfrc_rx_hist_add_packet(&hcrx->ccid3hcrx_hist, skb, ndp);

done_receiving:
	if (do_feedback)
		ccid3_hc_rx_send_feedback(sk, skb, do_feedback);
875 876
}

877
static int ccid3_hc_rx_init(struct ccid *ccid, struct sock *sk)
878
{
879
	struct ccid3_hc_rx_sock *hcrx = ccid_priv(ccid);
880

881 882 883
	hcrx->ccid3hcrx_state = TFRC_RSTATE_NO_DATA;
	tfrc_lh_init(&hcrx->ccid3hcrx_li_hist);
	return tfrc_rx_hist_alloc(&hcrx->ccid3hcrx_hist);
884 885 886 887
}

static void ccid3_hc_rx_exit(struct sock *sk)
{
888
	struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk);
889

890 891 892 893
	ccid3_hc_rx_set_state(sk, TFRC_RSTATE_TERM);

	tfrc_rx_hist_purge(&hcrx->ccid3hcrx_hist);
	tfrc_lh_cleanup(&hcrx->ccid3hcrx_li_hist);
894 895
}

896 897
static void ccid3_hc_rx_get_info(struct sock *sk, struct tcp_info *info)
{
898 899 900 901 902 903 904 905
	const struct ccid3_hc_rx_sock *hcrx;

	/* Listen socks doesn't have a private CCID block */
	if (sk->sk_state == DCCP_LISTEN)
		return;

	hcrx = ccid3_hc_rx_sk(sk);
	info->tcpi_ca_state = hcrx->ccid3hcrx_state;
906
	info->tcpi_options  |= TCPI_OPT_TIMESTAMPS;
907
	info->tcpi_rcv_rtt  = hcrx->ccid3hcrx_rtt;
908 909
}

910 911 912
/*
 * Receiver-side getsockopt handler.
 *
 * DCCP_SOCKOPT_CCID_RX_INFO fills a struct tfrc_rx_info with X_recv, the RTT
 * and p (derived from the stored 1/p; ~0U when that is 0), copies it to
 * @optval and stores its size in @optlen.
 *
 * Returns 0 on success, -EINVAL for listening sockets or when @len is too
 * small, -ENOPROTOOPT for any other @optname, and -EFAULT if writing to
 * user space fails.
 */
static int ccid3_hc_rx_getsockopt(struct sock *sk, const int optname, int len,
				  u32 __user *optval, int __user *optlen)
{
	const struct ccid3_hc_rx_sock *hcrx;
	struct tfrc_rx_info rx_info;
	const void *val;

	/* Listen socks doesn't have a private CCID block */
	if (sk->sk_state == DCCP_LISTEN)
		return -EINVAL;

	hcrx = ccid3_hc_rx_sk(sk);

	if (optname != DCCP_SOCKOPT_CCID_RX_INFO)
		return -ENOPROTOOPT;

	if (len < sizeof(rx_info))
		return -EINVAL;

	rx_info.tfrcrx_x_recv = hcrx->ccid3hcrx_x_recv;
	rx_info.tfrcrx_rtt    = hcrx->ccid3hcrx_rtt;
	rx_info.tfrcrx_p      = hcrx->ccid3hcrx_pinv == 0 ? ~0U :
				   scaled_div(1, hcrx->ccid3hcrx_pinv);
	len = sizeof(rx_info);
	val = &rx_info;

	if (put_user(len, optlen) || copy_to_user(optval, val, len))
		return -EFAULT;

	return 0;
}

943
static struct ccid_operations ccid3 = {
I
Ian McDonald 已提交
944
	.ccid_id		   = DCCPC_CCID3,
945
	.ccid_name		   = "TCP-Friendly Rate Control",
946
	.ccid_owner		   = THIS_MODULE,
947
	.ccid_hc_tx_obj_size	   = sizeof(struct ccid3_hc_tx_sock),
948 949 950 951 952 953
	.ccid_hc_tx_init	   = ccid3_hc_tx_init,
	.ccid_hc_tx_exit	   = ccid3_hc_tx_exit,
	.ccid_hc_tx_send_packet	   = ccid3_hc_tx_send_packet,
	.ccid_hc_tx_packet_sent	   = ccid3_hc_tx_packet_sent,
	.ccid_hc_tx_packet_recv	   = ccid3_hc_tx_packet_recv,
	.ccid_hc_tx_parse_options  = ccid3_hc_tx_parse_options,
954
	.ccid_hc_rx_obj_size	   = sizeof(struct ccid3_hc_rx_sock),
955 956 957 958
	.ccid_hc_rx_init	   = ccid3_hc_rx_init,
	.ccid_hc_rx_exit	   = ccid3_hc_rx_exit,
	.ccid_hc_rx_insert_options = ccid3_hc_rx_insert_options,
	.ccid_hc_rx_packet_recv	   = ccid3_hc_rx_packet_recv,
959 960
	.ccid_hc_rx_get_info	   = ccid3_hc_rx_get_info,
	.ccid_hc_tx_get_info	   = ccid3_hc_tx_get_info,
961 962
	.ccid_hc_rx_getsockopt	   = ccid3_hc_rx_getsockopt,
	.ccid_hc_tx_getsockopt	   = ccid3_hc_tx_getsockopt,
963
};
964

965
/* Expose the ccid3_debug flag as a module parameter (debug builds only) */
#ifdef CONFIG_IP_DCCP_CCID3_DEBUG
module_param(ccid3_debug, bool, 0644);
MODULE_PARM_DESC(ccid3_debug, "Enable debug messages");
#endif
969 970 971

/* Module load: register the CCID-3 operations; passes on the result */
static __init int ccid3_module_init(void)
{
	return ccid_register(&ccid3);
}
module_init(ccid3_module_init);

/* Module unload: remove the CCID-3 operations again */
static __exit void ccid3_module_exit(void)
{
	ccid_unregister(&ccid3);
}
module_exit(ccid3_module_exit);

982
/* Module metadata */
MODULE_AUTHOR("Ian McDonald <ian.mcdonald@jandi.co.nz>, "
	      "Arnaldo Carvalho de Melo <acme@ghostprotocols.net>");
MODULE_DESCRIPTION("DCCP TFRC CCID3 CCID");
MODULE_LICENSE("GPL");
MODULE_ALIAS("net-dccp-ccid-3");