/*
 *  net/dccp/ccids/ccid3.c
 *
 *  Copyright (c) 2007   The University of Aberdeen, Scotland, UK
 *  Copyright (c) 2005-7 The University of Waikato, Hamilton, New Zealand.
 *  Copyright (c) 2005-7 Ian McDonald <ian.mcdonald@jandi.co.nz>
 *
 *  An implementation of the DCCP protocol
 *
 *  This code has been developed by the University of Waikato WAND
 *  research group. For further information please see http://www.wand.net.nz/
 *
 *  This code also uses code from Lulea University, rereleased as GPL by its
 *  authors:
 *  Copyright (c) 2003 Nils-Erik Mattsson, Joacim Haggmark, Magnus Erixzon
 *
 *  Changes to meet Linux coding standards, to make it meet latest ccid3 draft
 *  and to make it work as a loadable module in the DCCP stack written by
 *  Arnaldo Carvalho de Melo <acme@conectiva.com.br>.
 *
 *  Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@conectiva.com.br>
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */
#include "../dccp.h"
#include "ccid3.h"

G
Gerrit Renker 已提交
40 41
#include <asm/unaligned.h>

42 43 44
#ifdef CONFIG_IP_DCCP_CCID3_DEBUG
/* Switch consulted by every ccid3_pr_debug() call; set via module parameter */
static int ccid3_debug;
#define ccid3_pr_debug(format, a...)	DCCP_PR_DEBUG(ccid3_debug, format, ##a)
#else
/* Debugging compiled out: ccid3_pr_debug() statements expand to nothing */
#define ccid3_pr_debug(format, a...)
#endif

49 50 51
/*
 *	Transmitter Half-Connection Routines
 */
52
#ifdef CONFIG_IP_DCCP_CCID3_DEBUG
/*
 * Map a sender state onto a printable name. Only needed by the debug
 * statements, hence only built when CCID3 debugging is configured.
 * The table is indexed directly by @state without a range check.
 */
static const char *ccid3_tx_state_name(enum ccid3_hc_tx_states state)
{
	/* String literals are read-only: keep both pointers and table const */
	static const char *const ccid3_state_names[] = {
	[TFRC_SSTATE_NO_SENT]  = "NO_SENT",
	[TFRC_SSTATE_NO_FBACK] = "NO_FBACK",
	[TFRC_SSTATE_FBACK]    = "FBACK",
	[TFRC_SSTATE_TERM]     = "TERM",
	};

	return ccid3_state_names[state];
}
#endif

66 67
static void ccid3_hc_tx_set_state(struct sock *sk,
				  enum ccid3_hc_tx_states state)
68
{
69
	struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk);
70 71 72
	enum ccid3_hc_tx_states oldstate = hctx->ccid3hctx_state;

	ccid3_pr_debug("%s(%p) %-8.8s -> %s\n",
73 74
		       dccp_role(sk), sk, ccid3_tx_state_name(oldstate),
		       ccid3_tx_state_name(state));
75 76 77 78
	WARN_ON(state == oldstate);
	hctx->ccid3hctx_state = state;
}

79
/*
80 81 82 83 84 85
 * Compute the initial sending rate X_init in the manner of RFC 3390:
 *
 *	X_init  =  min(4 * s, max(2 * s, 4380 bytes)) / RTT
 *
 * Note that RFC 3390 uses MSS, RFC 4342 refers to RFC 3390, and rfc3448bis
 * (rev-02) clarifies the use of RFC 3390 with regard to the above formula.
86 87 88 89
 * For consistency with other parts of the code, X_init is scaled by 2^6.
 */
static inline u64 rfc3390_initial_rate(struct sock *sk)
{
90 91 92
	const struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk);
	const __u32 w_init = min_t(__u32, 4 * hctx->ccid3hctx_s,
				   max_t(__u32, 2 * hctx->ccid3hctx_s, 4380));
93

94
	return scaled_div(w_init << 6, hctx->ccid3hctx_rtt);
95 96
}

97
/*
98
 * Recalculate t_ipi and delta (should be called whenever X changes)
99
 */
100
static inline void ccid3_update_send_interval(struct ccid3_hc_tx_sock *hctx)
101
{
102
	/* Calculate new t_ipi = s / X_inst (X_inst is in 64 * bytes/second) */
103 104
	hctx->ccid3hctx_t_ipi = scaled_div32(((u64)hctx->ccid3hctx_s) << 6,
					     hctx->ccid3hctx_x);
105

106
	/* Calculate new delta by delta = min(t_ipi / 2, t_gran / 2) */
107 108
	hctx->ccid3hctx_delta = min_t(u32, hctx->ccid3hctx_t_ipi / 2,
					   TFRC_OPSYS_HALF_TIME_GRAN);
I
Ian McDonald 已提交
109

110
	ccid3_pr_debug("t_ipi=%u, delta=%u, s=%u, X=%u\n",
I
Ian McDonald 已提交
111
		       hctx->ccid3hctx_t_ipi, hctx->ccid3hctx_delta,
112
		       hctx->ccid3hctx_s, (unsigned)(hctx->ccid3hctx_x >> 6));
I
Ian McDonald 已提交
113

114
}
115

116 117 118 119 120 121 122
/*
 * Number of whole RTTs that lie between the last window counter update
 * (t_last_win_count) and @now. The RTT estimate in @hctx is the divisor.
 */
static u32 ccid3_hc_tx_idle_rtt(struct ccid3_hc_tx_sock *hctx, ktime_t now)
{
	const u32 elapsed_us = ktime_us_delta(now,
					      hctx->ccid3hctx_t_last_win_count);

	return elapsed_us / hctx->ccid3hctx_rtt;
}

123 124 125 126
/**
 * ccid3_hc_tx_update_x  -  Update allowed sending rate X
 * @stamp: most recent time if available - can be left NULL.
 * This function tracks draft rfc3448bis, check there for latest details.
127
 *
128 129 130 131 132
 * Note: X and X_recv are both stored in units of 64 * bytes/second, to support
 *       fine-grained resolution of sending rates. This requires scaling by 2^6
 *       throughout the code. Only X_calc is unscaled (in bytes/second).
 *
 */
133
static void ccid3_hc_tx_update_x(struct sock *sk, ktime_t *stamp)
134
{
135
	struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk);
136
	__u64 min_rate = 2 * hctx->ccid3hctx_x_recv;
137
	const  __u64 old_x = hctx->ccid3hctx_x;
138
	ktime_t now = stamp ? *stamp : ktime_get_real();
139

140 141
	/*
	 * Handle IDLE periods: do not reduce below RFC3390 initial sending rate
142 143
	 * when idling [RFC 4342, 5.1]. Definition of idling is from rfc3448bis:
	 * a sender is idle if it has not sent anything over a 2-RTT-period.
144 145
	 * For consistency with X and X_recv, min_rate is also scaled by 2^6.
	 */
146
	if (ccid3_hc_tx_idle_rtt(hctx, now) >= 2) {
147 148 149 150
		min_rate = rfc3390_initial_rate(sk);
		min_rate = max(min_rate, 2 * hctx->ccid3hctx_x_recv);
	}

151
	if (hctx->ccid3hctx_p > 0) {
152

153
		hctx->ccid3hctx_x = min(((__u64)hctx->ccid3hctx_x_calc) << 6,
154
					min_rate);
155 156
		hctx->ccid3hctx_x = max(hctx->ccid3hctx_x,
					(((__u64)hctx->ccid3hctx_s) << 6) /
157
								TFRC_T_MBI);
158

159 160
	} else if (ktime_us_delta(now, hctx->ccid3hctx_t_ld)
				- (s64)hctx->ccid3hctx_rtt >= 0) {
161

162 163 164 165 166
		hctx->ccid3hctx_x =
			max(min(2 * hctx->ccid3hctx_x, min_rate),
			    scaled_div(((__u64)hctx->ccid3hctx_s) << 6,
				       hctx->ccid3hctx_rtt));
		hctx->ccid3hctx_t_ld = now;
167
	}
168

I
Ian McDonald 已提交
169
	if (hctx->ccid3hctx_x != old_x) {
170 171 172 173 174
		ccid3_pr_debug("X_prev=%u, X_now=%u, X_calc=%u, "
			       "X_recv=%u\n", (unsigned)(old_x >> 6),
			       (unsigned)(hctx->ccid3hctx_x >> 6),
			       hctx->ccid3hctx_x_calc,
			       (unsigned)(hctx->ccid3hctx_x_recv >> 6));
I
Ian McDonald 已提交
175

176
		ccid3_update_send_interval(hctx);
I
Ian McDonald 已提交
177
	}
178 179
}

180
/*
181 182
 *	Track the mean packet size `s' (cf. RFC 4342, 5.3 and  RFC 3448, 4.1)
 *	@len: DCCP packet payload size in bytes
183 184 185
 */
static inline void ccid3_hc_tx_update_s(struct ccid3_hc_tx_sock *hctx, int len)
{
186 187
	const u16 old_s = hctx->ccid3hctx_s;

188
	hctx->ccid3hctx_s = tfrc_ewma(hctx->ccid3hctx_s, len, 9);
189 190 191

	if (hctx->ccid3hctx_s != old_s)
		ccid3_update_send_interval(hctx);
192 193
}

194
/*
195 196
 *	Update Window Counter using the algorithm from [RFC 4342, 8.1].
 *	The algorithm is not applicable if RTT < 4 microseconds.
197 198
 */
static inline void ccid3_hc_tx_update_win_count(struct ccid3_hc_tx_sock *hctx,
199
						ktime_t now)
200 201 202 203 204 205
{
	u32 quarter_rtts;

	if (unlikely(hctx->ccid3hctx_rtt < 4))	/* avoid divide-by-zero */
		return;

206 207
	quarter_rtts = ktime_us_delta(now, hctx->ccid3hctx_t_last_win_count);
	quarter_rtts /= hctx->ccid3hctx_rtt / 4;
208 209

	if (quarter_rtts > 0) {
210
		hctx->ccid3hctx_t_last_win_count = now;
211 212 213 214 215
		hctx->ccid3hctx_last_win_count	+= min_t(u32, quarter_rtts, 5);
		hctx->ccid3hctx_last_win_count	&= 0xF;		/* mod 16 */
	}
}

216 217 218
static void ccid3_hc_tx_no_feedback_timer(unsigned long data)
{
	struct sock *sk = (struct sock *)data;
219
	struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk);
220
	unsigned long t_nfb = USEC_PER_SEC / 5;
221 222 223 224 225

	bh_lock_sock(sk);
	if (sock_owned_by_user(sk)) {
		/* Try again later. */
		/* XXX: set some sensible MIB */
226
		goto restart_timer;
227 228
	}

229
	ccid3_pr_debug("%s(%p, state=%s) - entry \n", dccp_role(sk), sk,
230
		       ccid3_tx_state_name(hctx->ccid3hctx_state));
231

232 233 234 235 236 237 238 239 240 241 242 243
	if (hctx->ccid3hctx_state == TFRC_SSTATE_FBACK)
		ccid3_hc_tx_set_state(sk, TFRC_SSTATE_NO_FBACK);
	else if (hctx->ccid3hctx_state != TFRC_SSTATE_NO_FBACK)
		goto out;

	/*
	 * Determine new allowed sending rate X as per draft rfc3448bis-00, 4.4
	 */
	if (hctx->ccid3hctx_t_rto == 0 ||	/* no feedback received yet */
	    hctx->ccid3hctx_p == 0) {

		/* halve send rate directly */
244 245 246
		hctx->ccid3hctx_x = max(hctx->ccid3hctx_x / 2,
					(((__u64)hctx->ccid3hctx_s) << 6) /
								    TFRC_T_MBI);
247
		ccid3_update_send_interval(hctx);
248
	} else {
249
		/*
250
		 *  Modify the cached value of X_recv
251
		 *
252
		 *  If (X_calc > 2 * X_recv)
253 254 255 256 257
		 *    X_recv = max(X_recv / 2, s / (2 * t_mbi));
		 *  Else
		 *    X_recv = X_calc / 4;
		 *
		 *  Note that X_recv is scaled by 2^6 while X_calc is not
258
		 */
259 260
		BUG_ON(hctx->ccid3hctx_p && !hctx->ccid3hctx_x_calc);

261
		if (hctx->ccid3hctx_x_calc > (hctx->ccid3hctx_x_recv >> 5))
262 263 264 265
			hctx->ccid3hctx_x_recv =
				max(hctx->ccid3hctx_x_recv / 2,
				    (((__u64)hctx->ccid3hctx_s) << 6) /
							      (2 * TFRC_T_MBI));
266
		else {
267 268
			hctx->ccid3hctx_x_recv = hctx->ccid3hctx_x_calc;
			hctx->ccid3hctx_x_recv <<= 4;
269
		}
270
		ccid3_hc_tx_update_x(sk, NULL);
271
	}
272 273 274 275 276 277 278 279 280 281 282
	ccid3_pr_debug("Reduced X to %llu/64 bytes/sec\n",
			(unsigned long long)hctx->ccid3hctx_x);

	/*
	 * Set new timeout for the nofeedback timer.
	 * See comments in packet_recv() regarding the value of t_RTO.
	 */
	if (unlikely(hctx->ccid3hctx_t_rto == 0))	/* no feedback yet */
		t_nfb = TFRC_INITIAL_TIMEOUT;
	else
		t_nfb = max(hctx->ccid3hctx_t_rto, 2 * hctx->ccid3hctx_t_ipi);
283

284 285
restart_timer:
	sk_reset_timer(sk, &hctx->ccid3hctx_no_feedback_timer,
286
			   jiffies + usecs_to_jiffies(t_nfb));
287 288 289 290 291
out:
	bh_unlock_sock(sk);
	sock_put(sk);
}

292 293 294 295 296 297
/*
 * returns
 *   > 0: delay (in msecs) that should pass before actually sending
 *   = 0: can send immediately
 *   < 0: error condition; do not send packet
 */
298
static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb)
299 300
{
	struct dccp_sock *dp = dccp_sk(sk);
301
	struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk);
302 303
	ktime_t now = ktime_get_real();
	s64 delay;
304 305

	/*
306 307 308
	 * This function is called only for Data and DataAck packets. Sending
	 * zero-sized Data(Ack)s is theoretically possible, but for congestion
	 * control this case is pathological - ignore it.
309
	 */
310
	if (unlikely(skb->len == 0))
311
		return -EBADMSG;
312 313 314

	switch (hctx->ccid3hctx_state) {
	case TFRC_SSTATE_NO_SENT:
315
		sk_reset_timer(sk, &hctx->ccid3hctx_no_feedback_timer,
316
			       (jiffies +
317
				usecs_to_jiffies(TFRC_INITIAL_TIMEOUT)));
318 319
		hctx->ccid3hctx_last_win_count	 = 0;
		hctx->ccid3hctx_t_last_win_count = now;
320 321

		/* Set t_0 for initial packet */
322
		hctx->ccid3hctx_t_nom = now;
323 324 325 326 327 328 329 330 331 332 333 334

		hctx->ccid3hctx_s = skb->len;

		/*
		 * Use initial RTT sample when available: recommended by erratum
		 * to RFC 4342. This implements the initialisation procedure of
		 * draft rfc3448bis, section 4.2. Remember, X is scaled by 2^6.
		 */
		if (dp->dccps_syn_rtt) {
			ccid3_pr_debug("SYN RTT = %uus\n", dp->dccps_syn_rtt);
			hctx->ccid3hctx_rtt  = dp->dccps_syn_rtt;
			hctx->ccid3hctx_x    = rfc3390_initial_rate(sk);
335
			hctx->ccid3hctx_t_ld = now;
336
		} else {
337 338
			/* Sender does not have RTT sample: X_pps = 1 pkt/sec */
			hctx->ccid3hctx_x = hctx->ccid3hctx_s;
339 340 341 342 343
			hctx->ccid3hctx_x <<= 6;
		}
		ccid3_update_send_interval(hctx);

		ccid3_hc_tx_set_state(sk, TFRC_SSTATE_NO_FBACK);
344 345 346
		break;
	case TFRC_SSTATE_NO_FBACK:
	case TFRC_SSTATE_FBACK:
347
		delay = ktime_us_delta(hctx->ccid3hctx_t_nom, now);
I
Ian McDonald 已提交
348
		ccid3_pr_debug("delay=%ld\n", (long)delay);
349
		/*
350
		 *	Scheduling of packet transmissions [RFC 3448, 4.6]
351 352 353 354 355 356
		 *
		 * if (t_now > t_nom - delta)
		 *       // send the packet now
		 * else
		 *       // send the packet in (t_nom - t_now) milliseconds.
		 */
357
		if (delay - (s64)hctx->ccid3hctx_delta >= 1000)
358
			return (u32)delay / 1000L;
359

360
		ccid3_hc_tx_update_win_count(hctx, now);
361
		break;
362
	case TFRC_SSTATE_TERM:
363
		DCCP_BUG("%s(%p) - Illegal state TERM", dccp_role(sk), sk);
364
		return -EINVAL;
365 366
	}

367 368
	/* prepare to send now (add options etc.) */
	dp->dccps_hc_tx_insert_options = 1;
369 370 371
	DCCP_SKB_CB(skb)->dccpd_ccval = hctx->ccid3hctx_last_win_count;

	/* set the nominal send time for the next following packet */
372 373
	hctx->ccid3hctx_t_nom = ktime_add_us(hctx->ccid3hctx_t_nom,
					     hctx->ccid3hctx_t_ipi);
374
	return 0;
375 376
}

377 378
static void ccid3_hc_tx_packet_sent(struct sock *sk, int more,
				    unsigned int len)
379
{
380
	struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk);
381

382
	ccid3_hc_tx_update_s(hctx, len);
383

384
	if (tfrc_tx_hist_add(&hctx->ccid3hctx_hist, dccp_sk(sk)->dccps_gss))
385
		DCCP_CRIT("packet history - out of memory!");
386 387 388 389
}

static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
{
390
	struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk);
391
	struct ccid3_options_received *opt_recv;
392
	ktime_t now;
393
	unsigned long t_nfb;
394
	u32 pinv, r_sample;
395

396 397 398 399
	/* we are only interested in ACKs */
	if (!(DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_ACK ||
	      DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_DATAACK))
		return;
400 401 402 403
	/* ... and only in the established state */
	if (hctx->ccid3hctx_state != TFRC_SSTATE_FBACK &&
	    hctx->ccid3hctx_state != TFRC_SSTATE_NO_FBACK)
		return;
404 405

	opt_recv = &hctx->ccid3hctx_options_received;
406 407 408 409 410 411 412 413 414 415 416
	now = ktime_get_real();

	/* Estimate RTT from history if ACK number is valid */
	r_sample = tfrc_tx_hist_rtt(hctx->ccid3hctx_hist,
				    DCCP_SKB_CB(skb)->dccpd_ack_seq, now);
	if (r_sample == 0) {
		DCCP_WARN("%s(%p): %s with bogus ACK-%llu\n", dccp_role(sk), sk,
			  dccp_packet_name(DCCP_SKB_CB(skb)->dccpd_type),
			  (unsigned long long)DCCP_SKB_CB(skb)->dccpd_ack_seq);
		return;
	}
417

418 419 420
	/* Update receive rate in units of 64 * bytes/second */
	hctx->ccid3hctx_x_recv = opt_recv->ccid3or_receive_rate;
	hctx->ccid3hctx_x_recv <<= 6;
421

422 423 424 425 426
	/* Update loss event rate (which is scaled by 1e6) */
	pinv = opt_recv->ccid3or_loss_event_rate;
	if (pinv == ~0U || pinv == 0)	       /* see RFC 4342, 8.5   */
		hctx->ccid3hctx_p = 0;
	else				       /* can not exceed 100% */
427
		hctx->ccid3hctx_p = scaled_div(1, pinv);
428 429 430 431 432
	/*
	 * Validate new RTT sample and update moving average
	 */
	r_sample = dccp_sample_rtt(sk, r_sample);
	hctx->ccid3hctx_rtt = tfrc_ewma(hctx->ccid3hctx_rtt, r_sample, 9);
433 434 435
	/*
	 * Update allowed sending rate X as per draft rfc3448bis-00, 4.2/3
	 */
436
	if (hctx->ccid3hctx_state == TFRC_SSTATE_NO_FBACK) {
437
		ccid3_hc_tx_set_state(sk, TFRC_SSTATE_FBACK);
438

439 440 441 442 443 444
		if (hctx->ccid3hctx_t_rto == 0) {
			/*
			 * Initial feedback packet: Larger Initial Windows (4.2)
			 */
			hctx->ccid3hctx_x    = rfc3390_initial_rate(sk);
			hctx->ccid3hctx_t_ld = now;
445

446
			ccid3_update_send_interval(hctx);
447

448 449 450 451 452 453 454 455
			goto done_computing_x;
		} else if (hctx->ccid3hctx_p == 0) {
			/*
			 * First feedback after nofeedback timer expiry (4.3)
			 */
			goto done_computing_x;
		}
	}
456

457 458 459
	/* Update sending rate (step 4 of [RFC 3448, 4.3]) */
	if (hctx->ccid3hctx_p > 0)
		hctx->ccid3hctx_x_calc =
460 461 462
				tfrc_calc_x(hctx->ccid3hctx_s,
					    hctx->ccid3hctx_rtt,
					    hctx->ccid3hctx_p);
463
	ccid3_hc_tx_update_x(sk, &now);
464

465 466
done_computing_x:
	ccid3_pr_debug("%s(%p), RTT=%uus (sample=%uus), s=%u, "
467 468 469 470 471 472 473
			       "p=%u, X_calc=%u, X_recv=%u, X=%u\n",
			       dccp_role(sk),
			       sk, hctx->ccid3hctx_rtt, r_sample,
			       hctx->ccid3hctx_s, hctx->ccid3hctx_p,
			       hctx->ccid3hctx_x_calc,
			       (unsigned)(hctx->ccid3hctx_x_recv >> 6),
			       (unsigned)(hctx->ccid3hctx_x >> 6));
474

475 476
	/* unschedule no feedback timer */
	sk_stop_timer(sk, &hctx->ccid3hctx_no_feedback_timer);
477

478 479 480 481 482
	/*
	 * As we have calculated new ipi, delta, t_nom it is possible
	 * that we now can send a packet, so wake up dccp_wait_for_ccid
	 */
	sk->sk_write_space(sk);
483

484 485 486 487 488 489 490 491 492 493 494 495 496 497
	/*
	 * Update timeout interval for the nofeedback timer.
	 * We use a configuration option to increase the lower bound.
	 * This can help avoid triggering the nofeedback timer too
	 * often ('spinning') on LANs with small RTTs.
	 */
	hctx->ccid3hctx_t_rto = max_t(u32, 4 * hctx->ccid3hctx_rtt,
					   (CONFIG_IP_DCCP_CCID3_RTO *
					    (USEC_PER_SEC / 1000)));
	/*
	 * Schedule no feedback timer to expire in
	 * max(t_RTO, 2 * s/X)  =  max(t_RTO, 2 * t_ipi)
	 */
	t_nfb = max(hctx->ccid3hctx_t_rto, 2 * hctx->ccid3hctx_t_ipi);
498

499 500 501 502
	ccid3_pr_debug("%s(%p), Scheduled no feedback timer to "
		       "expire in %lu jiffies (%luus)\n",
		       dccp_role(sk),
		       sk, usecs_to_jiffies(t_nfb), t_nfb);
503

504 505
	sk_reset_timer(sk, &hctx->ccid3hctx_no_feedback_timer,
			   jiffies + usecs_to_jiffies(t_nfb));
506 507 508
}

/*
 * Store a received TFRC option in the sender's ccid3hctx_options_received.
 * @option: option type; types not handled below are ignored
 * @len:    length of the option value in bytes
 * @idx:    recorded as-is for the Loss Intervals option
 * @value:  option value; may be unaligned
 *
 * The stored options are reset first whenever they belong to a sequence
 * number other than the current GSR.
 * Returns 0 on success and -EINVAL if a Loss Event Rate or Receive Rate
 * option does not have a length of exactly 4 bytes.
 */
static int ccid3_hc_tx_parse_options(struct sock *sk, unsigned char option,
				     unsigned char len, u16 idx,
				     unsigned char *value)
{
	int rc = 0;
	const struct dccp_sock *dp = dccp_sk(sk);
	struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk);
	struct ccid3_options_received *opt_recv;
	__be32 opt_val;

	opt_recv = &hctx->ccid3hctx_options_received;

	if (opt_recv->ccid3or_seqno != dp->dccps_gsr) {
		opt_recv->ccid3or_seqno		     = dp->dccps_gsr;
		opt_recv->ccid3or_loss_event_rate    = ~0;
		opt_recv->ccid3or_loss_intervals_idx = 0;
		opt_recv->ccid3or_loss_intervals_len = 0;
		opt_recv->ccid3or_receive_rate	     = 0;
	}

	switch (option) {
	case TFRC_OPT_LOSS_EVENT_RATE:
		if (unlikely(len != 4)) {
			DCCP_WARN("%s(%p), invalid len %d "
				  "for TFRC_OPT_LOSS_EVENT_RATE\n",
				  dccp_role(sk), sk, len);
			rc = -EINVAL;
		} else {
			/* value is in network byte order, possibly unaligned */
			opt_val = get_unaligned((__be32 *)value);
			opt_recv->ccid3or_loss_event_rate = ntohl(opt_val);
			ccid3_pr_debug("%s(%p), LOSS_EVENT_RATE=%u\n",
				       dccp_role(sk), sk,
				       opt_recv->ccid3or_loss_event_rate);
		}
		break;
	case TFRC_OPT_LOSS_INTERVALS:
		opt_recv->ccid3or_loss_intervals_idx = idx;
		opt_recv->ccid3or_loss_intervals_len = len;
		ccid3_pr_debug("%s(%p), LOSS_INTERVALS=(%u, %u)\n",
			       dccp_role(sk), sk,
			       opt_recv->ccid3or_loss_intervals_idx,
			       opt_recv->ccid3or_loss_intervals_len);
		break;
	case TFRC_OPT_RECEIVE_RATE:
		if (unlikely(len != 4)) {
			DCCP_WARN("%s(%p), invalid len %d "
				  "for TFRC_OPT_RECEIVE_RATE\n",
				  dccp_role(sk), sk, len);
			rc = -EINVAL;
		} else {
			/* value is in network byte order, possibly unaligned */
			opt_val = get_unaligned((__be32 *)value);
			opt_recv->ccid3or_receive_rate = ntohl(opt_val);
			ccid3_pr_debug("%s(%p), RECEIVE_RATE=%u\n",
				       dccp_role(sk), sk,
				       opt_recv->ccid3or_receive_rate);
		}
		break;
	}

	return rc;
}

570
static int ccid3_hc_tx_init(struct ccid *ccid, struct sock *sk)
571
{
572
	struct ccid3_hc_tx_sock *hctx = ccid_priv(ccid);
573 574

	hctx->ccid3hctx_state = TFRC_SSTATE_NO_SENT;
575
	hctx->ccid3hctx_hist = NULL;
576 577
	setup_timer(&hctx->ccid3hctx_no_feedback_timer,
			ccid3_hc_tx_no_feedback_timer, (unsigned long)sk);
578 579 580 581 582 583

	return 0;
}

static void ccid3_hc_tx_exit(struct sock *sk)
{
584
	struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk);
585 586 587 588

	ccid3_hc_tx_set_state(sk, TFRC_SSTATE_TERM);
	sk_stop_timer(sk, &hctx->ccid3hctx_no_feedback_timer);

589
	tfrc_tx_hist_purge(&hctx->ccid3hctx_hist);
590 591
}

592 593
static void ccid3_hc_tx_get_info(struct sock *sk, struct tcp_info *info)
{
594
	struct ccid3_hc_tx_sock *hctx;
595 596 597 598 599

	/* Listen socks doesn't have a private CCID block */
	if (sk->sk_state == DCCP_LISTEN)
		return;

600
	hctx = ccid3_hc_tx_sk(sk);
601 602 603 604 605 606 607
	info->tcpi_rto = hctx->ccid3hctx_t_rto;
	info->tcpi_rtt = hctx->ccid3hctx_rtt;
}

/*
 * Sender-side getsockopt handler.
 * @optname: only DCCP_SOCKOPT_CCID_TX_INFO is supported
 * @len:     size of the user buffer at @optval
 * @optval:  user buffer that receives the ccid3hctx_tfrc block
 * @optlen:  user location that receives the number of bytes copied
 *
 * Returns 0 on success, -EINVAL for listening sockets or a too small buffer,
 * -ENOPROTOOPT for any other option and -EFAULT if copying to user space
 * fails.
 */
static int ccid3_hc_tx_getsockopt(struct sock *sk, const int optname, int len,
				  u32 __user *optval, int __user *optlen)
{
	const struct ccid3_hc_tx_sock *hctx;
	const void *val;

	/* Listen socks doesn't have a private CCID block */
	if (sk->sk_state == DCCP_LISTEN)
		return -EINVAL;

	hctx = ccid3_hc_tx_sk(sk);
	switch (optname) {
	case DCCP_SOCKOPT_CCID_TX_INFO:
		if (len < sizeof(hctx->ccid3hctx_tfrc))
			return -EINVAL;
		/* never copy more than the size of the info block */
		len = sizeof(hctx->ccid3hctx_tfrc);
		val = &hctx->ccid3hctx_tfrc;
		break;
	default:
		return -ENOPROTOOPT;
	}

	if (put_user(len, optlen) || copy_to_user(optval, val, len))
		return -EFAULT;

	return 0;
}

633
/*
 *	Receiver Half-Connection Routines
 */
636 637 638 639 640 641 642 643 644

/* CCID3 feedback types: reason why the receiver sends a feedback packet */
enum ccid3_fback_type {
	/* no feedback due; must stay 0 since it is tested as a boolean */
	CCID3_FBACK_NONE	 = 0,
	/* first data packet has been received */
	CCID3_FBACK_INITIAL	 = 1,
	/* window counter advanced by more than 3 since the last feedback */
	CCID3_FBACK_PERIODIC	 = 2,
	/* a loss was handled or the loss interval mean update asked for it */
	CCID3_FBACK_PARAM_CHANGE = 3
};

645
#ifdef CONFIG_IP_DCCP_CCID3_DEBUG
/*
 * Map a receiver state onto a printable name. Only needed by the debug
 * statements, hence only built when CCID3 debugging is configured.
 * The table is indexed directly by @state without a range check.
 */
static const char *ccid3_rx_state_name(enum ccid3_hc_rx_states state)
{
	/* String literals are read-only: keep both pointers and table const */
	static const char *const ccid3_rx_state_names[] = {
	[TFRC_RSTATE_NO_DATA] = "NO_DATA",
	[TFRC_RSTATE_DATA]    = "DATA",
	[TFRC_RSTATE_TERM]    = "TERM",
	};

	return ccid3_rx_state_names[state];
}
#endif

658 659
static void ccid3_hc_rx_set_state(struct sock *sk,
				  enum ccid3_hc_rx_states state)
660
{
661
	struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk);
662 663 664
	enum ccid3_hc_rx_states oldstate = hcrx->ccid3hcrx_state;

	ccid3_pr_debug("%s(%p) %-8.8s -> %s\n",
665 666
		       dccp_role(sk), sk, ccid3_rx_state_name(oldstate),
		       ccid3_rx_state_name(state));
667 668 669 670
	WARN_ON(state == oldstate);
	hcrx->ccid3hcrx_state = state;
}

671 672 673
static void ccid3_hc_rx_send_feedback(struct sock *sk,
				      const struct sk_buff *skb,
				      enum ccid3_fback_type fbtype)
674
{
675
	struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk);
676
	struct dccp_sock *dp = dccp_sk(sk);
677
	ktime_t now;
678
	s64 delta = 0;
679

680 681 682
	if (unlikely(hcrx->ccid3hcrx_state == TFRC_RSTATE_TERM))
		return;

683
	now = ktime_get_real();
684

685 686
	switch (fbtype) {
	case CCID3_FBACK_INITIAL:
687
		hcrx->ccid3hcrx_x_recv = 0;
688
		hcrx->ccid3hcrx_pinv   = ~0U;   /* see RFC 4342, 8.5 */
689
		break;
690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710
	case CCID3_FBACK_PARAM_CHANGE:
		/*
		 * When parameters change (new loss or p > p_prev), we do not
		 * have a reliable estimate for R_m of [RFC 3448, 6.2] and so
		 * need to  reuse the previous value of X_recv. However, when
		 * X_recv was 0 (due to early loss), this would kill X down to
		 * s/t_mbi (i.e. one packet in 64 seconds).
		 * To avoid such drastic reduction, we approximate X_recv as
		 * the number of bytes since last feedback.
		 * This is a safe fallback, since X is bounded above by X_calc.
		 */
		if (hcrx->ccid3hcrx_x_recv > 0)
			break;
		/* fall through */
	case CCID3_FBACK_PERIODIC:
		delta = ktime_us_delta(now, hcrx->ccid3hcrx_tstamp_last_feedback);
		if (delta <= 0)
			DCCP_BUG("delta (%ld) <= 0", (long)delta);
		else
			hcrx->ccid3hcrx_x_recv =
				scaled_div32(hcrx->ccid3hcrx_bytes_recv, delta);
711
		break;
712
	default:
713 714 715
		return;
	}

716 717
	ccid3_pr_debug("Interval %ldusec, X_recv=%u, 1/p=%u\n", (long)delta,
		       hcrx->ccid3hcrx_x_recv, hcrx->ccid3hcrx_pinv);
718

719
	hcrx->ccid3hcrx_tstamp_last_feedback = now;
720
	hcrx->ccid3hcrx_last_counter	     = dccp_hdr(skb)->dccph_ccval;
721 722
	hcrx->ccid3hcrx_bytes_recv	     = 0;

723
	dp->dccps_hc_rx_insert_options = 1;
724 725 726
	dccp_send_ack(sk);
}

727
static int ccid3_hc_rx_insert_options(struct sock *sk, struct sk_buff *skb)
728
{
729
	const struct ccid3_hc_rx_sock *hcrx;
730
	__be32 x_recv, pinv;
731

732
	if (!(sk->sk_state == DCCP_OPEN || sk->sk_state == DCCP_PARTOPEN))
733
		return 0;
734

735
	hcrx = ccid3_hc_rx_sk(sk);
736 737

	if (dccp_packet_without_ack(skb))
738 739
		return 0;

740 741
	x_recv = htonl(hcrx->ccid3hcrx_x_recv);
	pinv   = htonl(hcrx->ccid3hcrx_pinv);
742

743
	if (dccp_insert_option(sk, skb, TFRC_OPT_LOSS_EVENT_RATE,
744
			       &pinv, sizeof(pinv)) ||
745
	    dccp_insert_option(sk, skb, TFRC_OPT_RECEIVE_RATE,
746
			       &x_recv, sizeof(x_recv)))
747 748 749
		return -1;

	return 0;
750 751
}

752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791
/** ccid3_first_li  -  Implements [RFC 3448, 6.3.1]
 *
 * Determine the length of the first loss interval via inverse lookup.
 * Assume that X_recv can be computed by the throughput equation
 *		    s
 *	X_recv = --------
 *		 R * fval
 * Find some p such that f(p) = fval; return 1/p (scaled).
 *
 * X_recv is taken as bytes received since the last feedback divided by the
 * time elapsed since then; if that yields 0 (or no time has elapsed), the
 * stored X_recv is used instead. Returns ~0U when no usable X_recv exists or
 * when the lookup yields p == 0. A missing RTT estimate is replaced (and
 * stored) by DCCP_FALLBACK_RTT.
 */
static u32 ccid3_first_li(struct sock *sk)
{
	struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk);
	u32 x_recv, p, delta;
	u64 fval;

	if (hcrx->ccid3hcrx_rtt == 0) {
		DCCP_WARN("No RTT estimate available, using fallback RTT\n");
		hcrx->ccid3hcrx_rtt = DCCP_FALLBACK_RTT;
	}

	delta = ktime_to_us(net_timedelta(hcrx->ccid3hcrx_tstamp_last_feedback));
	/*
	 * delta is a divisor: never divide by a zero interval, take the
	 * stored-value fallback below instead.
	 */
	x_recv = delta ? scaled_div32(hcrx->ccid3hcrx_bytes_recv, delta) : 0;
	if (x_recv == 0) {		/* would also trigger divide-by-zero */
		DCCP_WARN("X_recv==0\n");
		x_recv = hcrx->ccid3hcrx_x_recv;
		if (x_recv == 0) {
			DCCP_BUG("stored value of X_recv is zero");
			return ~0U;
		}
	}

	/* fval = s / (R * X_recv), computed in two scaled divisions */
	fval = scaled_div(hcrx->ccid3hcrx_s, hcrx->ccid3hcrx_rtt);
	fval = scaled_div32(fval, x_recv);
	p = tfrc_calc_x_reverse_lookup(fval);

	ccid3_pr_debug("%s(%p), receive rate=%u bytes/s, implied "
		       "loss rate=%u\n", dccp_role(sk), sk, x_recv, p);

	return p == 0 ? ~0U : scaled_div(1, p);
}

792
static void ccid3_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb)
793
{
794
	struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk);
795 796 797 798 799 800 801 802 803 804 805 806 807 808 809
	enum ccid3_fback_type do_feedback = CCID3_FBACK_NONE;
	const u32 ndp = dccp_sk(sk)->dccps_options_received.dccpor_ndp;
	const bool is_data_packet = dccp_data_packet(skb);

	if (unlikely(hcrx->ccid3hcrx_state == TFRC_RSTATE_NO_DATA)) {
		if (is_data_packet) {
			const u32 payload = skb->len - dccp_hdr(skb)->dccph_doff * 4;
			do_feedback = CCID3_FBACK_INITIAL;
			ccid3_hc_rx_set_state(sk, TFRC_RSTATE_DATA);
			hcrx->ccid3hcrx_s = payload;
			/*
			 * Not necessary to update ccid3hcrx_bytes_recv here,
			 * since X_recv = 0 for the first feedback packet (cf.
			 * RFC 3448, 6.3) -- gerrit
			 */
I
Ian McDonald 已提交
810
		}
811
		goto update_records;
I
Ian McDonald 已提交
812 813
	}

814 815
	if (tfrc_rx_hist_duplicate(&hcrx->ccid3hcrx_hist, skb))
		return; /* done receiving */
816

817 818 819 820 821 822 823
	if (is_data_packet) {
		const u32 payload = skb->len - dccp_hdr(skb)->dccph_doff * 4;
		/*
		 * Update moving-average of s and the sum of received payload bytes
		 */
		hcrx->ccid3hcrx_s = tfrc_ewma(hcrx->ccid3hcrx_s, payload, 9);
		hcrx->ccid3hcrx_bytes_recv += payload;
824 825
	}

826 827 828
	/*
	 * Handle pending losses and otherwise check for new loss
	 */
829 830 831 832 833 834 835 836
	if (tfrc_rx_hist_loss_pending(&hcrx->ccid3hcrx_hist) &&
	    tfrc_rx_handle_loss(&hcrx->ccid3hcrx_hist,
				&hcrx->ccid3hcrx_li_hist,
				skb, ndp, ccid3_first_li, sk) ) {
		do_feedback = CCID3_FBACK_PARAM_CHANGE;
		goto done_receiving;
	}

837 838
	if (tfrc_rx_hist_new_loss_indicated(&hcrx->ccid3hcrx_hist, skb, ndp))
		goto update_records;
839

840 841 842 843 844
	/*
	 * Handle data packets: RTT sampling and monitoring p
	 */
	if (unlikely(!is_data_packet))
		goto update_records;
845

846
	if (!tfrc_lh_is_initialised(&hcrx->ccid3hcrx_li_hist)) {
847 848 849 850 851 852 853 854
		const u32 sample = tfrc_rx_hist_sample_rtt(&hcrx->ccid3hcrx_hist, skb);
		/*
		 * Empty loss history: no loss so far, hence p stays 0.
		 * Sample RTT values, since an RTT estimate is required for the
		 * computation of p when the first loss occurs; RFC 3448, 6.3.1.
		 */
		if (sample != 0)
			hcrx->ccid3hcrx_rtt = tfrc_ewma(hcrx->ccid3hcrx_rtt, sample, 9);
855 856 857 858 859 860 861

	} else if (tfrc_lh_update_i_mean(&hcrx->ccid3hcrx_li_hist, skb)) {
		/*
		 * Step (3) of [RFC 3448, 6.1]: Recompute I_mean and, if I_mean
		 * has decreased (resp. p has increased), send feedback now.
		 */
		do_feedback = CCID3_FBACK_PARAM_CHANGE;
862 863
	}

864 865 866 867 868
	/*
	 * Check if the periodic once-per-RTT feedback is due; RFC 4342, 10.3
	 */
	if (SUB16(dccp_hdr(skb)->dccph_ccval, hcrx->ccid3hcrx_last_counter) > 3)
		do_feedback = CCID3_FBACK_PERIODIC;
I
Ian McDonald 已提交
869

870 871
update_records:
	tfrc_rx_hist_add_packet(&hcrx->ccid3hcrx_hist, skb, ndp);
872

873
done_receiving:
874 875
	if (do_feedback)
		ccid3_hc_rx_send_feedback(sk, skb, do_feedback);
876 877
}

878
static int ccid3_hc_rx_init(struct ccid *ccid, struct sock *sk)
879
{
880
	struct ccid3_hc_rx_sock *hcrx = ccid_priv(ccid);
881 882

	hcrx->ccid3hcrx_state = TFRC_RSTATE_NO_DATA;
883
	tfrc_lh_init(&hcrx->ccid3hcrx_li_hist);
884
	return tfrc_rx_hist_alloc(&hcrx->ccid3hcrx_hist);
885 886 887 888
}

static void ccid3_hc_rx_exit(struct sock *sk)
{
889
	struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk);
890 891 892

	ccid3_hc_rx_set_state(sk, TFRC_RSTATE_TERM);

893
	tfrc_rx_hist_purge(&hcrx->ccid3hcrx_hist);
894
	tfrc_lh_cleanup(&hcrx->ccid3hcrx_li_hist);
895 896
}

897 898
static void ccid3_hc_rx_get_info(struct sock *sk, struct tcp_info *info)
{
899
	const struct ccid3_hc_rx_sock *hcrx;
900

901 902 903 904
	/* Listen socks doesn't have a private CCID block */
	if (sk->sk_state == DCCP_LISTEN)
		return;

905
	hcrx = ccid3_hc_rx_sk(sk);
906 907 908
	info->tcpi_ca_state = hcrx->ccid3hcrx_state;
	info->tcpi_options  |= TCPI_OPT_TIMESTAMPS;
	info->tcpi_rcv_rtt  = hcrx->ccid3hcrx_rtt;
909 910
}

911 912 913
/*
 * Receiver-side getsockopt handler.
 * @optname: only DCCP_SOCKOPT_CCID_RX_INFO is supported
 * @len:     size of the user buffer at @optval
 * @optval:  user buffer that receives a struct tfrc_rx_info
 * @optlen:  user location that receives the number of bytes copied
 *
 * The reported p is derived from the stored inverse 1/p; a stored value of 0
 * is reported as ~0U.
 * Returns 0 on success, -EINVAL for listening sockets or a too small buffer,
 * -ENOPROTOOPT for any other option and -EFAULT if copying to user space
 * fails.
 */
static int ccid3_hc_rx_getsockopt(struct sock *sk, const int optname, int len,
				  u32 __user *optval, int __user *optlen)
{
	const struct ccid3_hc_rx_sock *hcrx;
	struct tfrc_rx_info rx_info;
	const void *val;

	/* Listen socks doesn't have a private CCID block */
	if (sk->sk_state == DCCP_LISTEN)
		return -EINVAL;

	hcrx = ccid3_hc_rx_sk(sk);
	switch (optname) {
	case DCCP_SOCKOPT_CCID_RX_INFO:
		if (len < sizeof(rx_info))
			return -EINVAL;
		rx_info.tfrcrx_x_recv = hcrx->ccid3hcrx_x_recv;
		rx_info.tfrcrx_rtt    = hcrx->ccid3hcrx_rtt;
		rx_info.tfrcrx_p      = hcrx->ccid3hcrx_pinv == 0 ? ~0U :
					   scaled_div(1, hcrx->ccid3hcrx_pinv);
		/* never copy more than the size of the info block */
		len = sizeof(rx_info);
		val = &rx_info;
		break;
	default:
		return -ENOPROTOOPT;
	}

	if (put_user(len, optlen) || copy_to_user(optval, val, len))
		return -EFAULT;

	return 0;
}

944
static struct ccid_operations ccid3 = {
I
Ian McDonald 已提交
945
	.ccid_id		   = DCCPC_CCID3,
946
	.ccid_name		   = "TCP-Friendly Rate Control",
947
	.ccid_owner		   = THIS_MODULE,
948
	.ccid_hc_tx_obj_size	   = sizeof(struct ccid3_hc_tx_sock),
949 950 951 952 953 954
	.ccid_hc_tx_init	   = ccid3_hc_tx_init,
	.ccid_hc_tx_exit	   = ccid3_hc_tx_exit,
	.ccid_hc_tx_send_packet	   = ccid3_hc_tx_send_packet,
	.ccid_hc_tx_packet_sent	   = ccid3_hc_tx_packet_sent,
	.ccid_hc_tx_packet_recv	   = ccid3_hc_tx_packet_recv,
	.ccid_hc_tx_parse_options  = ccid3_hc_tx_parse_options,
955
	.ccid_hc_rx_obj_size	   = sizeof(struct ccid3_hc_rx_sock),
956 957 958 959
	.ccid_hc_rx_init	   = ccid3_hc_rx_init,
	.ccid_hc_rx_exit	   = ccid3_hc_rx_exit,
	.ccid_hc_rx_insert_options = ccid3_hc_rx_insert_options,
	.ccid_hc_rx_packet_recv	   = ccid3_hc_rx_packet_recv,
960 961
	.ccid_hc_rx_get_info	   = ccid3_hc_rx_get_info,
	.ccid_hc_tx_get_info	   = ccid3_hc_tx_get_info,
962 963
	.ccid_hc_rx_getsockopt	   = ccid3_hc_rx_getsockopt,
	.ccid_hc_tx_getsockopt	   = ccid3_hc_tx_getsockopt,
964
};
965

966
#ifdef CONFIG_IP_DCCP_CCID3_DEBUG
/* Debug switch as module parameter, registered with permissions 0444 */
module_param(ccid3_debug, bool, 0444);
MODULE_PARM_DESC(ccid3_debug, "Enable debug messages");
#endif
970 971 972

static __init int ccid3_module_init(void)
{
973
	return ccid_register(&ccid3);
974 975 976 977 978 979 980 981 982
}
module_init(ccid3_module_init);

/* Module unload hook: unregister the CCID3 operations table. */
static __exit void ccid3_module_exit(void)
{
	ccid_unregister(&ccid3);
}
module_exit(ccid3_module_exit);

983
/* Module metadata; the alias names this module as DCCP CCID number 3 */
MODULE_AUTHOR("Ian McDonald <ian.mcdonald@jandi.co.nz>, "
	      "Arnaldo Carvalho de Melo <acme@ghostprotocols.net>");
MODULE_DESCRIPTION("DCCP TFRC CCID3 CCID");
MODULE_LICENSE("GPL");
MODULE_ALIAS("net-dccp-ccid-3");