cfq-iosched.c 50.4 KB
Newer Older
L
Linus Torvalds 已提交
1 2 3 4 5 6 7 8 9
/*
 *  CFQ, or complete fairness queueing, disk scheduler.
 *
 *  Based on ideas from a previously unfinished io
 *  scheduler (round robin per-process disk scheduling) and Andrea Arcangeli.
 *
 *  Copyright (C) 2003 Jens Axboe <axboe@suse.de>
 */
#include <linux/module.h>
A
Al Viro 已提交
10 11
#include <linux/blkdev.h>
#include <linux/elevator.h>
L
Linus Torvalds 已提交
12 13
#include <linux/hash.h>
#include <linux/rbtree.h>
14
#include <linux/ioprio.h>
L
Linus Torvalds 已提交
15 16 17 18

/*
 * tunables
 */
19 20 21 22
static const int cfq_quantum = 4;		/* max queue in one round of service */
static const int cfq_fifo_expire[2] = { HZ / 4, HZ / 8 };
static const int cfq_back_max = 16 * 1024;	/* maximum backwards seek, in KiB */
static const int cfq_back_penalty = 2;		/* penalty of a backwards seek */
L
Linus Torvalds 已提交
23

24
static const int cfq_slice_sync = HZ / 10;
J
Jens Axboe 已提交
25
static int cfq_slice_async = HZ / 25;
26
static const int cfq_slice_async_rq = 2;
27
static int cfq_slice_idle = HZ / 125;
28 29 30 31 32 33

#define CFQ_IDLE_GRACE		(HZ / 10)
#define CFQ_SLICE_SCALE		(5)

#define CFQ_KEY_ASYNC		(0)

L
Linus Torvalds 已提交
34 35 36 37 38 39 40 41 42
/*
 * for the hash of cfqq inside the cfqd
 */
#define CFQ_QHASH_SHIFT		6
#define CFQ_QHASH_ENTRIES	(1 << CFQ_QHASH_SHIFT)
#define list_entry_qhash(entry)	hlist_entry((entry), struct cfq_queue, cfq_hash)

#define list_entry_cfqq(ptr)	list_entry((ptr), struct cfq_queue, cfq_list)

J
Jens Axboe 已提交
43 44
#define RQ_CIC(rq)		((struct cfq_io_context*)(rq)->elevator_private)
#define RQ_CFQQ(rq)		((rq)->elevator_private2)
L
Linus Torvalds 已提交
45 46 47 48

static kmem_cache_t *cfq_pool;
static kmem_cache_t *cfq_ioc_pool;

49
static DEFINE_PER_CPU(unsigned long, ioc_count);
50 51
static struct completion *ioc_gone;

52 53 54 55
#define CFQ_PRIO_LISTS		IOPRIO_BE_NR
#define cfq_class_idle(cfqq)	((cfqq)->ioprio_class == IOPRIO_CLASS_IDLE)
#define cfq_class_rt(cfqq)	((cfqq)->ioprio_class == IOPRIO_CLASS_RT)

J
Jens Axboe 已提交
56 57 58 59 60 61 62 63 64 65
#define ASYNC			(0)
#define SYNC			(1)

#define cfq_cfqq_dispatched(cfqq)	\
	((cfqq)->on_dispatch[ASYNC] + (cfqq)->on_dispatch[SYNC])

#define cfq_cfqq_class_sync(cfqq)	((cfqq)->key != CFQ_KEY_ASYNC)

#define cfq_cfqq_sync(cfqq)		\
	(cfq_cfqq_class_sync(cfqq) || (cfqq)->on_dispatch[SYNC])
66

67 68
#define sample_valid(samples)	((samples) > 80)

69 70 71
/*
 * Per block device queue structure
 */
L
Linus Torvalds 已提交
72
struct cfq_data {
73 74 75 76 77 78 79 80 81 82 83 84 85 86
	request_queue_t *queue;

	/*
	 * rr list of queues with requests and the count of them
	 */
	struct list_head rr_list[CFQ_PRIO_LISTS];
	struct list_head busy_rr;
	struct list_head cur_rr;
	struct list_head idle_rr;
	unsigned int busy_queues;

	/*
	 * cfqq lookup hash
	 */
L
Linus Torvalds 已提交
87 88
	struct hlist_head *cfq_hash;

89
	int rq_in_driver;
90
	int hw_tag;
L
Linus Torvalds 已提交
91

92 93 94 95 96
	/*
	 * idle window management
	 */
	struct timer_list idle_slice_timer;
	struct work_struct unplug_work;
L
Linus Torvalds 已提交
97

98 99 100 101 102 103
	struct cfq_queue *active_queue;
	struct cfq_io_context *active_cic;
	int cur_prio, cur_end_prio;
	unsigned int dispatch_slice;

	struct timer_list idle_class_timer;
L
Linus Torvalds 已提交
104 105

	sector_t last_sector;
106
	unsigned long last_end_request;
L
Linus Torvalds 已提交
107 108 109 110 111

	/*
	 * tunables, see top of file
	 */
	unsigned int cfq_quantum;
112
	unsigned int cfq_fifo_expire[2];
L
Linus Torvalds 已提交
113 114
	unsigned int cfq_back_penalty;
	unsigned int cfq_back_max;
115 116 117
	unsigned int cfq_slice[2];
	unsigned int cfq_slice_async_rq;
	unsigned int cfq_slice_idle;
118 119

	struct list_head cic_list;
L
Linus Torvalds 已提交
120 121
};

122 123 124
/*
 * Per process-grouping structure
 */
L
Linus Torvalds 已提交
125 126 127 128 129
struct cfq_queue {
	/* reference count */
	atomic_t ref;
	/* parent cfq_data */
	struct cfq_data *cfqd;
130
	/* cfqq lookup hash */
L
Linus Torvalds 已提交
131 132
	struct hlist_node cfq_hash;
	/* hash key */
133
	unsigned int key;
134
	/* member of the rr/busy/cur/idle cfqd list */
L
Linus Torvalds 已提交
135 136 137 138
	struct list_head cfq_list;
	/* sorted list of pending requests */
	struct rb_root sort_list;
	/* if fifo isn't expired, next request to serve */
J
Jens Axboe 已提交
139
	struct request *next_rq;
L
Linus Torvalds 已提交
140 141 142 143 144
	/* requests queued in sort_list */
	int queued[2];
	/* currently allocated requests */
	int allocated[2];
	/* fifo list of requests in sort_list */
145
	struct list_head fifo;
L
Linus Torvalds 已提交
146

147 148 149
	unsigned long slice_start;
	unsigned long slice_end;
	unsigned long slice_left;
L
Linus Torvalds 已提交
150

J
Jens Axboe 已提交
151 152
	/* number of requests that are on the dispatch list */
	int on_dispatch[2];
153 154 155 156 157

	/* io prio of this group */
	unsigned short ioprio, org_ioprio;
	unsigned short ioprio_class, org_ioprio_class;

J
Jens Axboe 已提交
158 159
	/* various state flags, see below */
	unsigned int flags;
L
Linus Torvalds 已提交
160 161
};

J
Jens Axboe 已提交
162 163 164 165 166 167 168 169 170
enum cfqq_state_flags {
	CFQ_CFQQ_FLAG_on_rr = 0,
	CFQ_CFQQ_FLAG_wait_request,
	CFQ_CFQQ_FLAG_must_alloc,
	CFQ_CFQQ_FLAG_must_alloc_slice,
	CFQ_CFQQ_FLAG_must_dispatch,
	CFQ_CFQQ_FLAG_fifo_expire,
	CFQ_CFQQ_FLAG_idle_window,
	CFQ_CFQQ_FLAG_prio_changed,
171
	CFQ_CFQQ_FLAG_queue_new,
J
Jens Axboe 已提交
172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195
};

#define CFQ_CFQQ_FNS(name)						\
static inline void cfq_mark_cfqq_##name(struct cfq_queue *cfqq)		\
{									\
	cfqq->flags |= (1 << CFQ_CFQQ_FLAG_##name);			\
}									\
static inline void cfq_clear_cfqq_##name(struct cfq_queue *cfqq)	\
{									\
	cfqq->flags &= ~(1 << CFQ_CFQQ_FLAG_##name);			\
}									\
static inline int cfq_cfqq_##name(const struct cfq_queue *cfqq)		\
{									\
	return (cfqq->flags & (1 << CFQ_CFQQ_FLAG_##name)) != 0;	\
}

CFQ_CFQQ_FNS(on_rr);
CFQ_CFQQ_FNS(wait_request);
CFQ_CFQQ_FNS(must_alloc);
CFQ_CFQQ_FNS(must_alloc_slice);
CFQ_CFQQ_FNS(must_dispatch);
CFQ_CFQQ_FNS(fifo_expire);
CFQ_CFQQ_FNS(idle_window);
CFQ_CFQQ_FNS(prio_changed);
196
CFQ_CFQQ_FNS(queue_new);
J
Jens Axboe 已提交
197 198 199
#undef CFQ_CFQQ_FNS

static struct cfq_queue *cfq_find_cfq_hash(struct cfq_data *, unsigned int, unsigned short);
J
Jens Axboe 已提交
200
static void cfq_dispatch_insert(request_queue_t *, struct request *);
201
static struct cfq_queue *cfq_get_queue(struct cfq_data *cfqd, unsigned int key, struct task_struct *tsk, gfp_t gfp_mask);
L
Linus Torvalds 已提交
202

A
Andrew Morton 已提交
203 204 205 206 207 208
/*
 * scheduler run of queue, if there are requests pending and no one in the
 * driver that will restart queueing
 */
static inline void cfq_schedule_dispatch(struct cfq_data *cfqd)
{
209
	if (cfqd->busy_queues)
A
Andrew Morton 已提交
210 211 212 213 214 215 216
		kblockd_schedule_work(&cfqd->unplug_work);
}

static int cfq_queue_empty(request_queue_t *q)
{
	struct cfq_data *cfqd = q->elevator->elevator_data;

217
	return !cfqd->busy_queues;
A
Andrew Morton 已提交
218 219
}

220 221
static inline pid_t cfq_queue_pid(struct task_struct *task, int rw)
{
J
Jens Axboe 已提交
222
	if (rw == READ || rw == WRITE_SYNC)
223 224 225 226 227
		return task->pid;

	return CFQ_KEY_ASYNC;
}

L
Linus Torvalds 已提交
228
/*
J
Jens Axboe 已提交
229
 * Lifted from AS - choose which of rq1 and rq2 that is best served now.
L
Linus Torvalds 已提交
230
 * We choose the request that is closest to the head right now. Distance
231
 * behind the head is penalized and only allowed to a certain extent.
L
Linus Torvalds 已提交
232
 */
J
Jens Axboe 已提交
233 234
static struct request *
cfq_choose_req(struct cfq_data *cfqd, struct request *rq1, struct request *rq2)
L
Linus Torvalds 已提交
235 236 237
{
	sector_t last, s1, s2, d1 = 0, d2 = 0;
	unsigned long back_max;
238 239 240
#define CFQ_RQ1_WRAP	0x01 /* request 1 wraps */
#define CFQ_RQ2_WRAP	0x02 /* request 2 wraps */
	unsigned wrap = 0; /* bit mask: requests behind the disk head? */
L
Linus Torvalds 已提交
241

J
Jens Axboe 已提交
242 243 244 245
	if (rq1 == NULL || rq1 == rq2)
		return rq2;
	if (rq2 == NULL)
		return rq1;
J
Jens Axboe 已提交
246

J
Jens Axboe 已提交
247 248 249 250
	if (rq_is_sync(rq1) && !rq_is_sync(rq2))
		return rq1;
	else if (rq_is_sync(rq2) && !rq_is_sync(rq1))
		return rq2;
L
Linus Torvalds 已提交
251

J
Jens Axboe 已提交
252 253
	s1 = rq1->sector;
	s2 = rq2->sector;
L
Linus Torvalds 已提交
254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271

	last = cfqd->last_sector;

	/*
	 * by definition, 1KiB is 2 sectors
	 */
	back_max = cfqd->cfq_back_max * 2;

	/*
	 * Strict one way elevator _except_ in the case where we allow
	 * short backward seeks which are biased as twice the cost of a
	 * similar forward seek.
	 */
	if (s1 >= last)
		d1 = s1 - last;
	else if (s1 + back_max >= last)
		d1 = (last - s1) * cfqd->cfq_back_penalty;
	else
272
		wrap |= CFQ_RQ1_WRAP;
L
Linus Torvalds 已提交
273 274 275 276 277 278

	if (s2 >= last)
		d2 = s2 - last;
	else if (s2 + back_max >= last)
		d2 = (last - s2) * cfqd->cfq_back_penalty;
	else
279
		wrap |= CFQ_RQ2_WRAP;
L
Linus Torvalds 已提交
280 281

	/* Found required data */
282 283 284 285 286 287

	/*
	 * By doing switch() on the bit mask "wrap" we avoid having to
	 * check two variables for all permutations: --> faster!
	 */
	switch (wrap) {
J
Jens Axboe 已提交
288
	case 0: /* common case for CFQ: rq1 and rq2 not wrapped */
289
		if (d1 < d2)
J
Jens Axboe 已提交
290
			return rq1;
291
		else if (d2 < d1)
J
Jens Axboe 已提交
292
			return rq2;
293 294
		else {
			if (s1 >= s2)
J
Jens Axboe 已提交
295
				return rq1;
296
			else
J
Jens Axboe 已提交
297
				return rq2;
298
		}
L
Linus Torvalds 已提交
299

300
	case CFQ_RQ2_WRAP:
J
Jens Axboe 已提交
301
		return rq1;
302
	case CFQ_RQ1_WRAP:
J
Jens Axboe 已提交
303 304
		return rq2;
	case (CFQ_RQ1_WRAP|CFQ_RQ2_WRAP): /* both rqs wrapped */
305 306 307 308 309 310 311 312
	default:
		/*
		 * Since both rqs are wrapped,
		 * start with the one that's further behind head
		 * (--> only *one* back seek required),
		 * since back seek takes more time than forward.
		 */
		if (s1 <= s2)
J
Jens Axboe 已提交
313
			return rq1;
L
Linus Torvalds 已提交
314
		else
J
Jens Axboe 已提交
315
			return rq2;
L
Linus Torvalds 已提交
316 317 318 319 320 321
	}
}

/*
 * would be nice to take fifo expire time into account as well
 */
J
Jens Axboe 已提交
322 323 324
static struct request *
cfq_find_next_rq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
		  struct request *last)
L
Linus Torvalds 已提交
325
{
326 327
	struct rb_node *rbnext = rb_next(&last->rb_node);
	struct rb_node *rbprev = rb_prev(&last->rb_node);
J
Jens Axboe 已提交
328
	struct request *next = NULL, *prev = NULL;
L
Linus Torvalds 已提交
329

330
	BUG_ON(RB_EMPTY_NODE(&last->rb_node));
L
Linus Torvalds 已提交
331 332

	if (rbprev)
J
Jens Axboe 已提交
333
		prev = rb_entry_rq(rbprev);
L
Linus Torvalds 已提交
334

335
	if (rbnext)
J
Jens Axboe 已提交
336
		next = rb_entry_rq(rbnext);
337 338 339
	else {
		rbnext = rb_first(&cfqq->sort_list);
		if (rbnext && rbnext != &last->rb_node)
J
Jens Axboe 已提交
340
			next = rb_entry_rq(rbnext);
341
	}
L
Linus Torvalds 已提交
342

343
	return cfq_choose_req(cfqd, next, prev);
L
Linus Torvalds 已提交
344 345
}

346
static void cfq_resort_rr_list(struct cfq_queue *cfqq, int preempted)
L
Linus Torvalds 已提交
347
{
348
	struct cfq_data *cfqd = cfqq->cfqd;
349
	struct list_head *list;
L
Linus Torvalds 已提交
350

J
Jens Axboe 已提交
351
	BUG_ON(!cfq_cfqq_on_rr(cfqq));
L
Linus Torvalds 已提交
352

353
	list_del(&cfqq->cfq_list);
L
Linus Torvalds 已提交
354

355 356 357 358 359 360 361 362 363 364 365 366
	if (cfq_class_rt(cfqq))
		list = &cfqd->cur_rr;
	else if (cfq_class_idle(cfqq))
		list = &cfqd->idle_rr;
	else {
		/*
		 * if cfqq has requests in flight, don't allow it to be
		 * found in cfq_set_active_queue before it has finished them.
		 * this is done to increase fairness between a process that
		 * has lots of io pending vs one that only generates one
		 * sporadically or synchronously
		 */
J
Jens Axboe 已提交
367
		if (cfq_cfqq_dispatched(cfqq))
368 369 370
			list = &cfqd->busy_rr;
		else
			list = &cfqd->rr_list[cfqq->ioprio];
L
Linus Torvalds 已提交
371 372
	}

373
	/*
374 375 376
	 * If this queue was preempted or is new (never been serviced), let
	 * it be added first for fairness but beind other new queues.
	 * Otherwise, just add to the back  of the list.
377
	 */
378 379 380
	if (preempted || cfq_cfqq_queue_new(cfqq)) {
		struct list_head *n = list;
		struct cfq_queue *__cfqq;
381

382 383 384 385
		while (n->next != list) {
			__cfqq = list_entry_cfqq(n->next);
			if (!cfq_cfqq_queue_new(__cfqq))
				break;
L
Linus Torvalds 已提交
386

387 388
			n = n->next;
		}
L
Linus Torvalds 已提交
389

390
		list = n;
L
Linus Torvalds 已提交
391 392
	}

393
	list_add_tail(&cfqq->cfq_list, list);
L
Linus Torvalds 已提交
394 395 396 397
}

/*
 * add to busy list of queues for service, trying to be fair in ordering
398
 * the pending list according to last request service
L
Linus Torvalds 已提交
399 400
 */
static inline void
401
cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
L
Linus Torvalds 已提交
402
{
J
Jens Axboe 已提交
403 404
	BUG_ON(cfq_cfqq_on_rr(cfqq));
	cfq_mark_cfqq_on_rr(cfqq);
L
Linus Torvalds 已提交
405 406
	cfqd->busy_queues++;

407
	cfq_resort_rr_list(cfqq, 0);
L
Linus Torvalds 已提交
408 409 410 411 412
}

static inline void
cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
{
J
Jens Axboe 已提交
413 414
	BUG_ON(!cfq_cfqq_on_rr(cfqq));
	cfq_clear_cfqq_on_rr(cfqq);
415
	list_del_init(&cfqq->cfq_list);
L
Linus Torvalds 已提交
416 417 418 419 420 421 422 423

	BUG_ON(!cfqd->busy_queues);
	cfqd->busy_queues--;
}

/*
 * rb tree support functions
 */
J
Jens Axboe 已提交
424
static inline void cfq_del_rq_rb(struct request *rq)
L
Linus Torvalds 已提交
425
{
J
Jens Axboe 已提交
426
	struct cfq_queue *cfqq = RQ_CFQQ(rq);
427
	struct cfq_data *cfqd = cfqq->cfqd;
J
Jens Axboe 已提交
428
	const int sync = rq_is_sync(rq);
L
Linus Torvalds 已提交
429

430 431
	BUG_ON(!cfqq->queued[sync]);
	cfqq->queued[sync]--;
L
Linus Torvalds 已提交
432

J
Jens Axboe 已提交
433
	elv_rb_del(&cfqq->sort_list, rq);
L
Linus Torvalds 已提交
434

435
	if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list))
436
		cfq_del_cfqq_rr(cfqd, cfqq);
L
Linus Torvalds 已提交
437 438
}

J
Jens Axboe 已提交
439
static void cfq_add_rq_rb(struct request *rq)
L
Linus Torvalds 已提交
440
{
J
Jens Axboe 已提交
441
	struct cfq_queue *cfqq = RQ_CFQQ(rq);
L
Linus Torvalds 已提交
442
	struct cfq_data *cfqd = cfqq->cfqd;
443
	struct request *__alias;
L
Linus Torvalds 已提交
444

445
	cfqq->queued[rq_is_sync(rq)]++;
L
Linus Torvalds 已提交
446 447 448 449 450

	/*
	 * looks a little odd, but the first insert might return an alias.
	 * if that happens, put the alias on the dispatch list
	 */
451
	while ((__alias = elv_rb_add(&cfqq->sort_list, rq)) != NULL)
J
Jens Axboe 已提交
452
		cfq_dispatch_insert(cfqd->queue, __alias);
L
Linus Torvalds 已提交
453 454 455
}

static inline void
J
Jens Axboe 已提交
456
cfq_reposition_rq_rb(struct cfq_queue *cfqq, struct request *rq)
L
Linus Torvalds 已提交
457
{
458 459
	elv_rb_del(&cfqq->sort_list, rq);
	cfqq->queued[rq_is_sync(rq)]--;
J
Jens Axboe 已提交
460
	cfq_add_rq_rb(rq);
L
Linus Torvalds 已提交
461 462
}

463 464
static struct request *
cfq_find_rq_fmerge(struct cfq_data *cfqd, struct bio *bio)
L
Linus Torvalds 已提交
465
{
466 467 468
	struct task_struct *tsk = current;
	pid_t key = cfq_queue_pid(tsk, bio_data_dir(bio));
	struct cfq_queue *cfqq;
L
Linus Torvalds 已提交
469

470
	cfqq = cfq_find_cfq_hash(cfqd, key, tsk->ioprio);
471 472 473
	if (cfqq) {
		sector_t sector = bio->bi_sector + bio_sectors(bio);

474
		return elv_rb_find(&cfqq->sort_list, sector);
475
	}
L
Linus Torvalds 已提交
476 477 478 479

	return NULL;
}

480
static void cfq_activate_request(request_queue_t *q, struct request *rq)
L
Linus Torvalds 已提交
481
{
482
	struct cfq_data *cfqd = q->elevator->elevator_data;
J
Jens Axboe 已提交
483

484
	cfqd->rq_in_driver++;
485 486 487 488 489 490 491 492 493

	/*
	 * If the depth is larger 1, it really could be queueing. But lets
	 * make the mark a little higher - idling could still be good for
	 * low queueing, and a low queueing number could also just indicate
	 * a SCSI mid layer like behaviour where limit+1 is often seen.
	 */
	if (!cfqd->hw_tag && cfqd->rq_in_driver > 4)
		cfqd->hw_tag = 1;
L
Linus Torvalds 已提交
494 495
}

496
static void cfq_deactivate_request(request_queue_t *q, struct request *rq)
L
Linus Torvalds 已提交
497
{
498 499 500 501
	struct cfq_data *cfqd = q->elevator->elevator_data;

	WARN_ON(!cfqd->rq_in_driver);
	cfqd->rq_in_driver--;
L
Linus Torvalds 已提交
502 503
}

504
static void cfq_remove_request(struct request *rq)
L
Linus Torvalds 已提交
505
{
J
Jens Axboe 已提交
506
	struct cfq_queue *cfqq = RQ_CFQQ(rq);
507

J
Jens Axboe 已提交
508 509
	if (cfqq->next_rq == rq)
		cfqq->next_rq = cfq_find_next_rq(cfqq->cfqd, cfqq, rq);
L
Linus Torvalds 已提交
510

511
	list_del_init(&rq->queuelist);
J
Jens Axboe 已提交
512
	cfq_del_rq_rb(rq);
L
Linus Torvalds 已提交
513 514 515 516 517 518 519 520
}

static int
cfq_merge(request_queue_t *q, struct request **req, struct bio *bio)
{
	struct cfq_data *cfqd = q->elevator->elevator_data;
	struct request *__rq;

521
	__rq = cfq_find_rq_fmerge(cfqd, bio);
522
	if (__rq && elv_rq_merge_ok(__rq, bio)) {
523 524
		*req = __rq;
		return ELEVATOR_FRONT_MERGE;
L
Linus Torvalds 已提交
525 526 527 528 529
	}

	return ELEVATOR_NO_MERGE;
}

530 531
static void cfq_merged_request(request_queue_t *q, struct request *req,
			       int type)
L
Linus Torvalds 已提交
532
{
533
	if (type == ELEVATOR_FRONT_MERGE) {
J
Jens Axboe 已提交
534
		struct cfq_queue *cfqq = RQ_CFQQ(req);
L
Linus Torvalds 已提交
535

J
Jens Axboe 已提交
536
		cfq_reposition_rq_rb(cfqq, req);
L
Linus Torvalds 已提交
537 538 539 540 541 542 543
	}
}

static void
cfq_merged_requests(request_queue_t *q, struct request *rq,
		    struct request *next)
{
544 545 546 547 548 549 550
	/*
	 * reposition in fifo if next is older than rq
	 */
	if (!list_empty(&rq->queuelist) && !list_empty(&next->queuelist) &&
	    time_before(next->start_time, rq->start_time))
		list_move(&rq->queuelist, &next->queuelist);

551
	cfq_remove_request(next);
552 553 554 555 556 557 558 559 560 561 562 563 564 565
}

static inline void
__cfq_set_active_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq)
{
	if (cfqq) {
		/*
		 * stop potential idle class queues waiting service
		 */
		del_timer(&cfqd->idle_class_timer);

		cfqq->slice_start = jiffies;
		cfqq->slice_end = 0;
		cfqq->slice_left = 0;
J
Jens Axboe 已提交
566 567
		cfq_clear_cfqq_must_alloc_slice(cfqq);
		cfq_clear_cfqq_fifo_expire(cfqq);
568 569 570 571 572
	}

	cfqd->active_queue = cfqq;
}

573 574 575 576 577 578 579 580 581 582 583 584
/*
 * current cfqq expired its slice (or was too idle), select new one
 */
static void
__cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq,
		    int preempted)
{
	unsigned long now = jiffies;

	if (cfq_cfqq_wait_request(cfqq))
		del_timer(&cfqd->idle_slice_timer);

585
	if (!preempted && !cfq_cfqq_dispatched(cfqq))
586 587 588 589
		cfq_schedule_dispatch(cfqd);

	cfq_clear_cfqq_must_dispatch(cfqq);
	cfq_clear_cfqq_wait_request(cfqq);
590
	cfq_clear_cfqq_queue_new(cfqq);
591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622

	/*
	 * store what was left of this slice, if the queue idled out
	 * or was preempted
	 */
	if (time_after(cfqq->slice_end, now))
		cfqq->slice_left = cfqq->slice_end - now;
	else
		cfqq->slice_left = 0;

	if (cfq_cfqq_on_rr(cfqq))
		cfq_resort_rr_list(cfqq, preempted);

	if (cfqq == cfqd->active_queue)
		cfqd->active_queue = NULL;

	if (cfqd->active_cic) {
		put_io_context(cfqd->active_cic->ioc);
		cfqd->active_cic = NULL;
	}

	cfqd->dispatch_slice = 0;
}

static inline void cfq_slice_expired(struct cfq_data *cfqd, int preempted)
{
	struct cfq_queue *cfqq = cfqd->active_queue;

	if (cfqq)
		__cfq_slice_expired(cfqd, cfqq, preempted);
}

623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656
/*
 * 0
 * 0,1
 * 0,1,2
 * 0,1,2,3
 * 0,1,2,3,4
 * 0,1,2,3,4,5
 * 0,1,2,3,4,5,6
 * 0,1,2,3,4,5,6,7
 */
static int cfq_get_next_prio_level(struct cfq_data *cfqd)
{
	int prio, wrap;

	prio = -1;
	wrap = 0;
	do {
		int p;

		for (p = cfqd->cur_prio; p <= cfqd->cur_end_prio; p++) {
			if (!list_empty(&cfqd->rr_list[p])) {
				prio = p;
				break;
			}
		}

		if (prio != -1)
			break;
		cfqd->cur_prio = 0;
		if (++cfqd->cur_end_prio == CFQ_PRIO_LISTS) {
			cfqd->cur_end_prio = 0;
			if (wrap)
				break;
			wrap = 1;
L
Linus Torvalds 已提交
657
		}
658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674
	} while (1);

	if (unlikely(prio == -1))
		return -1;

	BUG_ON(prio >= CFQ_PRIO_LISTS);

	list_splice_init(&cfqd->rr_list[prio], &cfqd->cur_rr);

	cfqd->cur_prio = prio + 1;
	if (cfqd->cur_prio > cfqd->cur_end_prio) {
		cfqd->cur_end_prio = cfqd->cur_prio;
		cfqd->cur_prio = 0;
	}
	if (cfqd->cur_end_prio == CFQ_PRIO_LISTS) {
		cfqd->cur_prio = 0;
		cfqd->cur_end_prio = 0;
L
Linus Torvalds 已提交
675 676
	}

677 678 679
	return prio;
}

J
Jens Axboe 已提交
680
static struct cfq_queue *cfq_set_active_queue(struct cfq_data *cfqd)
681
{
682
	struct cfq_queue *cfqq = NULL;
683

684 685 686 687 688 689
	if (!list_empty(&cfqd->cur_rr) || cfq_get_next_prio_level(cfqd) != -1) {
		/*
		 * if current list is non-empty, grab first entry. if it is
		 * empty, get next prio level and grab first entry then if any
		 * are spliced
		 */
690
		cfqq = list_entry_cfqq(cfqd->cur_rr.next);
691 692 693 694 695
	} else if (!list_empty(&cfqd->busy_rr)) {
		/*
		 * If no new queues are available, check if the busy list has
		 * some before falling back to idle io.
		 */
696
		cfqq = list_entry_cfqq(cfqd->busy_rr.next);
697 698 699 700 701 702
	} else if (!list_empty(&cfqd->idle_rr)) {
		/*
		 * if we have idle queues and no rt or be queues had pending
		 * requests, either allow immediate service if the grace period
		 * has passed or arm the idle grace timer
		 */
703 704 705 706 707 708 709 710 711
		unsigned long end = cfqd->last_end_request + CFQ_IDLE_GRACE;

		if (time_after_eq(jiffies, end))
			cfqq = list_entry_cfqq(cfqd->idle_rr.next);
		else
			mod_timer(&cfqd->idle_class_timer, end);
	}

	__cfq_set_active_queue(cfqd, cfqq);
J
Jens Axboe 已提交
712
	return cfqq;
713 714
}

715 716
#define CIC_SEEKY(cic) ((cic)->seek_mean > (128 * 1024))

717 718 719
static int cfq_arm_slice_timer(struct cfq_data *cfqd, struct cfq_queue *cfqq)

{
720
	struct cfq_io_context *cic;
721 722
	unsigned long sl;

723
	WARN_ON(!RB_EMPTY_ROOT(&cfqq->sort_list));
724 725 726 727 728 729 730
	WARN_ON(cfqq != cfqd->active_queue);

	/*
	 * idle is disabled, either manually or by past process history
	 */
	if (!cfqd->cfq_slice_idle)
		return 0;
J
Jens Axboe 已提交
731
	if (!cfq_cfqq_idle_window(cfqq))
732 733 734 735
		return 0;
	/*
	 * task has exited, don't wait
	 */
736 737
	cic = cfqd->active_cic;
	if (!cic || !cic->ioc->task)
738 739
		return 0;

J
Jens Axboe 已提交
740 741
	cfq_mark_cfqq_must_dispatch(cfqq);
	cfq_mark_cfqq_wait_request(cfqq);
742

743
	sl = min(cfqq->slice_end - 1, (unsigned long) cfqd->cfq_slice_idle);
744 745 746 747 748 749

	/*
	 * we don't want to idle for seeks, but we do want to allow
	 * fair distribution of slice time for a process doing back-to-back
	 * seeks. so allow a little bit of time for him to submit a new rq
	 */
750
	if (sample_valid(cic->seek_samples) && CIC_SEEKY(cic))
751
		sl = min(sl, msecs_to_jiffies(2));
752

753
	mod_timer(&cfqd->idle_slice_timer, jiffies + sl);
754
	return 1;
L
Linus Torvalds 已提交
755 756
}

J
Jens Axboe 已提交
757
static void cfq_dispatch_insert(request_queue_t *q, struct request *rq)
L
Linus Torvalds 已提交
758 759
{
	struct cfq_data *cfqd = q->elevator->elevator_data;
J
Jens Axboe 已提交
760
	struct cfq_queue *cfqq = RQ_CFQQ(rq);
761

762 763 764
	cfq_remove_request(rq);
	cfqq->on_dispatch[rq_is_sync(rq)]++;
	elv_dispatch_sort(q, rq);
765 766 767

	rq = list_entry(q->queue_head.prev, struct request, queuelist);
	cfqd->last_sector = rq->sector + rq->nr_sectors;
L
Linus Torvalds 已提交
768 769 770 771 772
}

/*
 * return expired entry, or NULL to just start from scratch in rbtree
 */
J
Jens Axboe 已提交
773
static inline struct request *cfq_check_fifo(struct cfq_queue *cfqq)
L
Linus Torvalds 已提交
774 775
{
	struct cfq_data *cfqd = cfqq->cfqd;
776
	struct request *rq;
777
	int fifo;
L
Linus Torvalds 已提交
778

J
Jens Axboe 已提交
779
	if (cfq_cfqq_fifo_expire(cfqq))
L
Linus Torvalds 已提交
780
		return NULL;
781 782
	if (list_empty(&cfqq->fifo))
		return NULL;
L
Linus Torvalds 已提交
783

784 785
	fifo = cfq_cfqq_class_sync(cfqq);
	rq = rq_entry_fifo(cfqq->fifo.next);
L
Linus Torvalds 已提交
786

787 788 789
	if (time_after(jiffies, rq->start_time + cfqd->cfq_fifo_expire[fifo])) {
		cfq_mark_cfqq_fifo_expire(cfqq);
		return rq;
L
Linus Torvalds 已提交
790 791 792 793 794 795
	}

	return NULL;
}

/*
J
Jens Axboe 已提交
796 797 798
 * Scale schedule slice based on io priority. Use the sync time slice only
 * if a queue is marked sync and has sync io queued. A sync queue with async
 * io only, should not get full sync slice length.
L
Linus Torvalds 已提交
799
 */
800 801 802 803 804 805 806 807 808 809
static inline int
cfq_prio_to_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
{
	const int base_slice = cfqd->cfq_slice[cfq_cfqq_sync(cfqq)];

	WARN_ON(cfqq->ioprio >= IOPRIO_BE_NR);

	return base_slice + (base_slice/CFQ_SLICE_SCALE * (4 - cfqq->ioprio));
}

L
Linus Torvalds 已提交
810
static inline void
811
cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
L
Linus Torvalds 已提交
812
{
813 814
	cfqq->slice_end = cfq_prio_to_slice(cfqd, cfqq) + jiffies;
}
L
Linus Torvalds 已提交
815

816 817 818 819
static inline int
cfq_prio_to_maxrq(struct cfq_data *cfqd, struct cfq_queue *cfqq)
{
	const int base_rq = cfqd->cfq_slice_async_rq;
L
Linus Torvalds 已提交
820

821
	WARN_ON(cfqq->ioprio >= IOPRIO_BE_NR);
L
Linus Torvalds 已提交
822

823
	return 2 * (base_rq + base_rq * (CFQ_PRIO_LISTS - 1 - cfqq->ioprio));
L
Linus Torvalds 已提交
824 825
}

826 827 828
/*
 * get next queue for service
 */
829
static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)
L
Linus Torvalds 已提交
830
{
831
	unsigned long now = jiffies;
L
Linus Torvalds 已提交
832 833
	struct cfq_queue *cfqq;

834 835 836
	cfqq = cfqd->active_queue;
	if (!cfqq)
		goto new_queue;
L
Linus Torvalds 已提交
837

838 839 840
	/*
	 * slice has expired
	 */
J
Jens Axboe 已提交
841 842
	if (!cfq_cfqq_must_dispatch(cfqq) && time_after(now, cfqq->slice_end))
		goto expire;
L
Linus Torvalds 已提交
843

844 845 846 847
	/*
	 * if queue has requests, dispatch one. if not, check if
	 * enough slice is left to wait for one
	 */
848
	if (!RB_EMPTY_ROOT(&cfqq->sort_list))
849
		goto keep_queue;
850 851 852 853
	else if (cfq_cfqq_dispatched(cfqq)) {
		cfqq = NULL;
		goto keep_queue;
	} else if (cfq_cfqq_class_sync(cfqq)) {
854 855 856 857
		if (cfq_arm_slice_timer(cfqd, cfqq))
			return NULL;
	}

J
Jens Axboe 已提交
858
expire:
859
	cfq_slice_expired(cfqd, 0);
J
Jens Axboe 已提交
860 861
new_queue:
	cfqq = cfq_set_active_queue(cfqd);
862
keep_queue:
J
Jens Axboe 已提交
863
	return cfqq;
864 865 866 867 868 869 870 871
}

static int
__cfq_dispatch_requests(struct cfq_data *cfqd, struct cfq_queue *cfqq,
			int max_dispatch)
{
	int dispatched = 0;

872
	BUG_ON(RB_EMPTY_ROOT(&cfqq->sort_list));
873 874

	do {
J
Jens Axboe 已提交
875
		struct request *rq;
L
Linus Torvalds 已提交
876 877

		/*
878
		 * follow expired path, else get first next available
L
Linus Torvalds 已提交
879
		 */
J
Jens Axboe 已提交
880 881
		if ((rq = cfq_check_fifo(cfqq)) == NULL)
			rq = cfqq->next_rq;
882 883 884 885

		/*
		 * finally, insert request into driver dispatch list
		 */
J
Jens Axboe 已提交
886
		cfq_dispatch_insert(cfqd->queue, rq);
L
Linus Torvalds 已提交
887

888 889
		cfqd->dispatch_slice++;
		dispatched++;
L
Linus Torvalds 已提交
890

891
		if (!cfqd->active_cic) {
J
Jens Axboe 已提交
892 893
			atomic_inc(&RQ_CIC(rq)->ioc->refcount);
			cfqd->active_cic = RQ_CIC(rq);
894
		}
L
Linus Torvalds 已提交
895

896
		if (RB_EMPTY_ROOT(&cfqq->sort_list))
897 898 899 900 901
			break;

	} while (dispatched < max_dispatch);

	/*
902
	 * if slice end isn't set yet, set it.
903 904 905 906 907 908 909 910 911 912
	 */
	if (!cfqq->slice_end)
		cfq_set_prio_slice(cfqd, cfqq);

	/*
	 * expire an async queue immediately if it has used up its slice. idle
	 * queue always expire after 1 dispatch round.
	 */
	if ((!cfq_cfqq_sync(cfqq) &&
	    cfqd->dispatch_slice >= cfq_prio_to_maxrq(cfqd, cfqq)) ||
913 914
	    cfq_class_idle(cfqq) ||
	    !cfq_cfqq_idle_window(cfqq))
915 916 917 918 919
		cfq_slice_expired(cfqd, 0);

	return dispatched;
}

920 921 922 923
static int
cfq_forced_dispatch_cfqqs(struct list_head *list)
{
	struct cfq_queue *cfqq, *next;
924
	int dispatched;
925

926
	dispatched = 0;
927
	list_for_each_entry_safe(cfqq, next, list, cfq_list) {
J
Jens Axboe 已提交
928 929
		while (cfqq->next_rq) {
			cfq_dispatch_insert(cfqq->cfqd->queue, cfqq->next_rq);
930 931 932 933
			dispatched++;
		}
		BUG_ON(!list_empty(&cfqq->fifo));
	}
934

935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956
	return dispatched;
}

static int
cfq_forced_dispatch(struct cfq_data *cfqd)
{
	int i, dispatched = 0;

	for (i = 0; i < CFQ_PRIO_LISTS; i++)
		dispatched += cfq_forced_dispatch_cfqqs(&cfqd->rr_list[i]);

	dispatched += cfq_forced_dispatch_cfqqs(&cfqd->busy_rr);
	dispatched += cfq_forced_dispatch_cfqqs(&cfqd->cur_rr);
	dispatched += cfq_forced_dispatch_cfqqs(&cfqd->idle_rr);

	cfq_slice_expired(cfqd, 0);

	BUG_ON(cfqd->busy_queues);

	return dispatched;
}

957
static int
958
cfq_dispatch_requests(request_queue_t *q, int force)
959 960
{
	struct cfq_data *cfqd = q->elevator->elevator_data;
961 962
	struct cfq_queue *cfqq, *prev_cfqq;
	int dispatched;
963 964 965 966

	if (!cfqd->busy_queues)
		return 0;

967 968 969
	if (unlikely(force))
		return cfq_forced_dispatch(cfqd);

970 971 972
	dispatched = 0;
	prev_cfqq = NULL;
	while ((cfqq = cfq_select_queue(cfqd)) != NULL) {
973 974
		int max_dispatch;

975 976 977 978 979 980
		/*
		 * Don't repeat dispatch from the previous queue.
		 */
		if (prev_cfqq == cfqq)
			break;

J
Jens Axboe 已提交
981 982
		cfq_clear_cfqq_must_dispatch(cfqq);
		cfq_clear_cfqq_wait_request(cfqq);
983 984
		del_timer(&cfqd->idle_slice_timer);

985 986 987
		max_dispatch = cfqd->cfq_quantum;
		if (cfq_class_idle(cfqq))
			max_dispatch = 1;
L
Linus Torvalds 已提交
988

989 990 991 992 993 994 995 996 997 998
		dispatched += __cfq_dispatch_requests(cfqd, cfqq, max_dispatch);

		/*
		 * If the dispatch cfqq has idling enabled and is still
		 * the active queue, break out.
		 */
		if (cfq_cfqq_idle_window(cfqq) && cfqd->active_queue)
			break;

		prev_cfqq = cfqq;
L
Linus Torvalds 已提交
999 1000
	}

1001
	return dispatched;
L
Linus Torvalds 已提交
1002 1003 1004
}

/*
J
Jens Axboe 已提交
1005 1006
 * task holds one reference to the queue, dropped when task exits. each rq
 * in-flight on this queue also holds a reference, dropped when rq is freed.
L
Linus Torvalds 已提交
1007 1008 1009 1010 1011
 *
 * queue lock must be held here.
 */
static void cfq_put_queue(struct cfq_queue *cfqq)
{
1012 1013 1014
	struct cfq_data *cfqd = cfqq->cfqd;

	BUG_ON(atomic_read(&cfqq->ref) <= 0);
L
Linus Torvalds 已提交
1015 1016 1017 1018 1019

	if (!atomic_dec_and_test(&cfqq->ref))
		return;

	BUG_ON(rb_first(&cfqq->sort_list));
1020
	BUG_ON(cfqq->allocated[READ] + cfqq->allocated[WRITE]);
J
Jens Axboe 已提交
1021
	BUG_ON(cfq_cfqq_on_rr(cfqq));
L
Linus Torvalds 已提交
1022

1023
	if (unlikely(cfqd->active_queue == cfqq))
J
Jens Axboe 已提交
1024
		__cfq_slice_expired(cfqd, cfqq, 0);
1025

L
Linus Torvalds 已提交
1026 1027 1028 1029 1030 1031 1032 1033
	/*
	 * it's on the empty list and still hashed
	 */
	list_del(&cfqq->cfq_list);
	hlist_del(&cfqq->cfq_hash);
	kmem_cache_free(cfq_pool, cfqq);
}

J
Jens Axboe 已提交
1034
static struct cfq_queue *
J
Jens Axboe 已提交
1035 1036
__cfq_find_cfq_hash(struct cfq_data *cfqd, unsigned int key, unsigned int prio,
		    const int hashval)
L
Linus Torvalds 已提交
1037 1038
{
	struct hlist_head *hash_list = &cfqd->cfq_hash[hashval];
1039 1040
	struct hlist_node *entry;
	struct cfq_queue *__cfqq;
L
Linus Torvalds 已提交
1041

1042
	hlist_for_each_entry(__cfqq, entry, hash_list, cfq_hash) {
A
Al Viro 已提交
1043
		const unsigned short __p = IOPRIO_PRIO_VALUE(__cfqq->org_ioprio_class, __cfqq->org_ioprio);
L
Linus Torvalds 已提交
1044

1045
		if (__cfqq->key == key && (__p == prio || !prio))
L
Linus Torvalds 已提交
1046 1047 1048 1049 1050 1051 1052
			return __cfqq;
	}

	return NULL;
}

static struct cfq_queue *
J
Jens Axboe 已提交
1053
cfq_find_cfq_hash(struct cfq_data *cfqd, unsigned int key, unsigned short prio)
L
Linus Torvalds 已提交
1054
{
J
Jens Axboe 已提交
1055
	return __cfq_find_cfq_hash(cfqd, key, prio, hash_long(key, CFQ_QHASH_SHIFT));
L
Linus Torvalds 已提交
1056 1057
}

1058
static void cfq_free_io_context(struct io_context *ioc)
L
Linus Torvalds 已提交
1059
{
1060
	struct cfq_io_context *__cic;
1061 1062
	struct rb_node *n;
	int freed = 0;
L
Linus Torvalds 已提交
1063

1064 1065 1066
	while ((n = rb_first(&ioc->cic_root)) != NULL) {
		__cic = rb_entry(n, struct cfq_io_context, rb_node);
		rb_erase(&__cic->rb_node, &ioc->cic_root);
1067
		kmem_cache_free(cfq_ioc_pool, __cic);
1068
		freed++;
L
Linus Torvalds 已提交
1069 1070
	}

1071 1072 1073
	elv_ioc_count_mod(ioc_count, -freed);

	if (ioc_gone && !elv_ioc_count_read(ioc_count))
1074
		complete(ioc_gone);
L
Linus Torvalds 已提交
1075 1076
}

1077
static void cfq_exit_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq)
L
Linus Torvalds 已提交
1078
{
1079 1080
	if (unlikely(cfqq == cfqd->active_queue))
		__cfq_slice_expired(cfqd, cfqq, 0);
1081

1082 1083
	cfq_put_queue(cfqq);
}
1084

1085 1086 1087
static void __cfq_exit_single_io_context(struct cfq_data *cfqd,
					 struct cfq_io_context *cic)
{
1088 1089 1090 1091
	list_del_init(&cic->queue_list);
	smp_wmb();
	cic->key = NULL;

1092
	if (cic->cfqq[ASYNC]) {
1093
		cfq_exit_cfqq(cfqd, cic->cfqq[ASYNC]);
1094 1095 1096 1097
		cic->cfqq[ASYNC] = NULL;
	}

	if (cic->cfqq[SYNC]) {
1098
		cfq_exit_cfqq(cfqd, cic->cfqq[SYNC]);
1099 1100
		cic->cfqq[SYNC] = NULL;
	}
1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113
}


/*
 * Called with interrupts disabled
 */
static void cfq_exit_single_io_context(struct cfq_io_context *cic)
{
	struct cfq_data *cfqd = cic->key;

	if (cfqd) {
		request_queue_t *q = cfqd->queue;

1114
		spin_lock_irq(q->queue_lock);
1115
		__cfq_exit_single_io_context(cfqd, cic);
1116
		spin_unlock_irq(q->queue_lock);
1117
	}
L
Linus Torvalds 已提交
1118 1119
}

1120
static void cfq_exit_io_context(struct io_context *ioc)
L
Linus Torvalds 已提交
1121
{
1122
	struct cfq_io_context *__cic;
1123
	struct rb_node *n;
1124

L
Linus Torvalds 已提交
1125 1126 1127
	/*
	 * put the reference this task is holding to the various queues
	 */
1128 1129 1130 1131 1132

	n = rb_first(&ioc->cic_root);
	while (n != NULL) {
		__cic = rb_entry(n, struct cfq_io_context, rb_node);

1133
		cfq_exit_single_io_context(__cic);
1134
		n = rb_next(n);
L
Linus Torvalds 已提交
1135 1136 1137
	}
}

1138
static struct cfq_io_context *
A
Al Viro 已提交
1139
cfq_alloc_io_context(struct cfq_data *cfqd, gfp_t gfp_mask)
L
Linus Torvalds 已提交
1140
{
1141
	struct cfq_io_context *cic;
L
Linus Torvalds 已提交
1142

1143
	cic = kmem_cache_alloc_node(cfq_ioc_pool, gfp_mask, cfqd->queue->node);
L
Linus Torvalds 已提交
1144
	if (cic) {
1145
		memset(cic, 0, sizeof(*cic));
1146
		cic->last_end_request = jiffies;
1147
		INIT_LIST_HEAD(&cic->queue_list);
1148 1149
		cic->dtor = cfq_free_io_context;
		cic->exit = cfq_exit_io_context;
1150
		elv_ioc_count_inc(ioc_count);
L
Linus Torvalds 已提交
1151 1152 1153 1154 1155
	}

	return cic;
}

1156 1157 1158 1159 1160
static void cfq_init_prio_data(struct cfq_queue *cfqq)
{
	struct task_struct *tsk = current;
	int ioprio_class;

J
Jens Axboe 已提交
1161
	if (!cfq_cfqq_prio_changed(cfqq))
1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185
		return;

	ioprio_class = IOPRIO_PRIO_CLASS(tsk->ioprio);
	switch (ioprio_class) {
		default:
			printk(KERN_ERR "cfq: bad prio %x\n", ioprio_class);
		case IOPRIO_CLASS_NONE:
			/*
			 * no prio set, place us in the middle of the BE classes
			 */
			cfqq->ioprio = task_nice_ioprio(tsk);
			cfqq->ioprio_class = IOPRIO_CLASS_BE;
			break;
		case IOPRIO_CLASS_RT:
			cfqq->ioprio = task_ioprio(tsk);
			cfqq->ioprio_class = IOPRIO_CLASS_RT;
			break;
		case IOPRIO_CLASS_BE:
			cfqq->ioprio = task_ioprio(tsk);
			cfqq->ioprio_class = IOPRIO_CLASS_BE;
			break;
		case IOPRIO_CLASS_IDLE:
			cfqq->ioprio_class = IOPRIO_CLASS_IDLE;
			cfqq->ioprio = 7;
J
Jens Axboe 已提交
1186
			cfq_clear_cfqq_idle_window(cfqq);
1187 1188 1189 1190 1191 1192 1193 1194 1195 1196
			break;
	}

	/*
	 * keep track of original prio settings in case we have to temporarily
	 * elevate the priority of this queue
	 */
	cfqq->org_ioprio = cfqq->ioprio;
	cfqq->org_ioprio_class = cfqq->ioprio_class;

J
Jens Axboe 已提交
1197
	if (cfq_cfqq_on_rr(cfqq))
1198 1199
		cfq_resort_rr_list(cfqq, 0);

J
Jens Axboe 已提交
1200
	cfq_clear_cfqq_prio_changed(cfqq);
1201 1202
}

1203
static inline void changed_ioprio(struct cfq_io_context *cic)
1204
{
1205 1206
	struct cfq_data *cfqd = cic->key;
	struct cfq_queue *cfqq;
1207

1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221
	if (unlikely(!cfqd))
		return;

	spin_lock(cfqd->queue->queue_lock);

	cfqq = cic->cfqq[ASYNC];
	if (cfqq) {
		struct cfq_queue *new_cfqq;
		new_cfqq = cfq_get_queue(cfqd, CFQ_KEY_ASYNC, cic->ioc->task,
					 GFP_ATOMIC);
		if (new_cfqq) {
			cic->cfqq[ASYNC] = new_cfqq;
			cfq_put_queue(cfqq);
		}
1222
	}
1223 1224 1225 1226 1227 1228

	cfqq = cic->cfqq[SYNC];
	if (cfqq)
		cfq_mark_cfqq_prio_changed(cfqq);

	spin_unlock(cfqd->queue->queue_lock);
1229 1230
}

1231
static void cfq_ioc_set_ioprio(struct io_context *ioc)
1232
{
1233
	struct cfq_io_context *cic;
1234
	struct rb_node *n;
1235

1236
	ioc->ioprio_changed = 0;
1237

1238 1239 1240
	n = rb_first(&ioc->cic_root);
	while (n != NULL) {
		cic = rb_entry(n, struct cfq_io_context, rb_node);
1241

1242
		changed_ioprio(cic);
1243 1244
		n = rb_next(n);
	}
1245 1246 1247
}

static struct cfq_queue *
1248
cfq_get_queue(struct cfq_data *cfqd, unsigned int key, struct task_struct *tsk,
A
Al Viro 已提交
1249
	      gfp_t gfp_mask)
1250 1251 1252
{
	const int hashval = hash_long(key, CFQ_QHASH_SHIFT);
	struct cfq_queue *cfqq, *new_cfqq = NULL;
1253
	unsigned short ioprio;
1254 1255

retry:
1256
	ioprio = tsk->ioprio;
J
Jens Axboe 已提交
1257
	cfqq = __cfq_find_cfq_hash(cfqd, key, ioprio, hashval);
1258 1259 1260 1261 1262 1263

	if (!cfqq) {
		if (new_cfqq) {
			cfqq = new_cfqq;
			new_cfqq = NULL;
		} else if (gfp_mask & __GFP_WAIT) {
1264 1265 1266 1267 1268 1269
			/*
			 * Inform the allocator of the fact that we will
			 * just repeat this allocation if it fails, to allow
			 * the allocator to do whatever it needs to attempt to
			 * free memory.
			 */
1270
			spin_unlock_irq(cfqd->queue->queue_lock);
1271
			new_cfqq = kmem_cache_alloc_node(cfq_pool, gfp_mask|__GFP_NOFAIL, cfqd->queue->node);
1272 1273 1274
			spin_lock_irq(cfqd->queue->queue_lock);
			goto retry;
		} else {
1275
			cfqq = kmem_cache_alloc_node(cfq_pool, gfp_mask, cfqd->queue->node);
1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293
			if (!cfqq)
				goto out;
		}

		memset(cfqq, 0, sizeof(*cfqq));

		INIT_HLIST_NODE(&cfqq->cfq_hash);
		INIT_LIST_HEAD(&cfqq->cfq_list);
		INIT_LIST_HEAD(&cfqq->fifo);

		cfqq->key = key;
		hlist_add_head(&cfqq->cfq_hash, &cfqd->cfq_hash[hashval]);
		atomic_set(&cfqq->ref, 0);
		cfqq->cfqd = cfqd;
		/*
		 * set ->slice_left to allow preemption for a new process
		 */
		cfqq->slice_left = 2 * cfqd->cfq_slice_idle;
1294
		cfq_mark_cfqq_idle_window(cfqq);
J
Jens Axboe 已提交
1295
		cfq_mark_cfqq_prio_changed(cfqq);
1296
		cfq_mark_cfqq_queue_new(cfqq);
J
Jens Axboe 已提交
1297
		cfq_init_prio_data(cfqq);
1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308
	}

	if (new_cfqq)
		kmem_cache_free(cfq_pool, new_cfqq);

	atomic_inc(&cfqq->ref);
out:
	WARN_ON((gfp_mask & __GFP_WAIT) && !cfqq);
	return cfqq;
}

1309 1310 1311
static void
cfq_drop_dead_cic(struct io_context *ioc, struct cfq_io_context *cic)
{
1312
	WARN_ON(!list_empty(&cic->queue_list));
1313 1314
	rb_erase(&cic->rb_node, &ioc->cic_root);
	kmem_cache_free(cfq_ioc_pool, cic);
1315
	elv_ioc_count_dec(ioc_count);
1316 1317
}

1318 1319 1320
static struct cfq_io_context *
cfq_cic_rb_lookup(struct cfq_data *cfqd, struct io_context *ioc)
{
1321
	struct rb_node *n;
1322
	struct cfq_io_context *cic;
1323
	void *k, *key = cfqd;
1324

1325 1326
restart:
	n = ioc->cic_root.rb_node;
1327 1328
	while (n) {
		cic = rb_entry(n, struct cfq_io_context, rb_node);
1329 1330 1331
		/* ->key must be copied to avoid race with cfq_exit_queue() */
		k = cic->key;
		if (unlikely(!k)) {
1332 1333 1334
			cfq_drop_dead_cic(ioc, cic);
			goto restart;
		}
1335

1336
		if (key < k)
1337
			n = n->rb_left;
1338
		else if (key > k)
1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350
			n = n->rb_right;
		else
			return cic;
	}

	return NULL;
}

static inline void
cfq_cic_link(struct cfq_data *cfqd, struct io_context *ioc,
	     struct cfq_io_context *cic)
{
1351 1352
	struct rb_node **p;
	struct rb_node *parent;
1353
	struct cfq_io_context *__cic;
1354
	void *k;
1355 1356 1357 1358

	cic->ioc = ioc;
	cic->key = cfqd;

1359 1360 1361
restart:
	parent = NULL;
	p = &ioc->cic_root.rb_node;
1362 1363 1364
	while (*p) {
		parent = *p;
		__cic = rb_entry(parent, struct cfq_io_context, rb_node);
1365 1366 1367
		/* ->key must be copied to avoid race with cfq_exit_queue() */
		k = __cic->key;
		if (unlikely(!k)) {
1368
			cfq_drop_dead_cic(ioc, __cic);
1369 1370
			goto restart;
		}
1371

1372
		if (cic->key < k)
1373
			p = &(*p)->rb_left;
1374
		else if (cic->key > k)
1375 1376 1377 1378 1379 1380 1381
			p = &(*p)->rb_right;
		else
			BUG();
	}

	rb_link_node(&cic->rb_node, parent, p);
	rb_insert_color(&cic->rb_node, &ioc->cic_root);
1382 1383

	spin_lock_irq(cfqd->queue->queue_lock);
1384
	list_add(&cic->queue_list, &cfqd->cic_list);
1385
	spin_unlock_irq(cfqd->queue->queue_lock);
1386 1387
}

L
Linus Torvalds 已提交
1388 1389 1390
/*
 * Setup general io context and cfq io context. There can be several cfq
 * io contexts per general io context, if this process is doing io to more
1391
 * than one device managed by cfq.
L
Linus Torvalds 已提交
1392 1393
 */
static struct cfq_io_context *
1394
cfq_get_io_context(struct cfq_data *cfqd, gfp_t gfp_mask)
L
Linus Torvalds 已提交
1395
{
1396
	struct io_context *ioc = NULL;
L
Linus Torvalds 已提交
1397 1398
	struct cfq_io_context *cic;

1399
	might_sleep_if(gfp_mask & __GFP_WAIT);
L
Linus Torvalds 已提交
1400

1401
	ioc = get_io_context(gfp_mask, cfqd->queue->node);
L
Linus Torvalds 已提交
1402 1403 1404
	if (!ioc)
		return NULL;

1405 1406 1407
	cic = cfq_cic_rb_lookup(cfqd, ioc);
	if (cic)
		goto out;
L
Linus Torvalds 已提交
1408

1409 1410 1411
	cic = cfq_alloc_io_context(cfqd, gfp_mask);
	if (cic == NULL)
		goto err;
L
Linus Torvalds 已提交
1412

1413
	cfq_cic_link(cfqd, ioc, cic);
L
Linus Torvalds 已提交
1414
out:
1415 1416 1417 1418
	smp_read_barrier_depends();
	if (unlikely(ioc->ioprio_changed))
		cfq_ioc_set_ioprio(ioc);

L
Linus Torvalds 已提交
1419 1420 1421 1422 1423 1424
	return cic;
err:
	put_io_context(ioc);
	return NULL;
}

1425 1426
static void
cfq_update_io_thinktime(struct cfq_data *cfqd, struct cfq_io_context *cic)
L
Linus Torvalds 已提交
1427
{
1428
	unsigned long elapsed, ttime;
L
Linus Torvalds 已提交
1429

1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441
	/*
	 * if this context already has stuff queued, thinktime is from
	 * last queue not last end
	 */
#if 0
	if (time_after(cic->last_end_request, cic->last_queue))
		elapsed = jiffies - cic->last_end_request;
	else
		elapsed = jiffies - cic->last_queue;
#else
		elapsed = jiffies - cic->last_end_request;
#endif
L
Linus Torvalds 已提交
1442

1443
	ttime = min(elapsed, 2UL * cfqd->cfq_slice_idle);
1444

1445 1446 1447 1448
	cic->ttime_samples = (7*cic->ttime_samples + 256) / 8;
	cic->ttime_total = (7*cic->ttime_total + 256*ttime) / 8;
	cic->ttime_mean = (cic->ttime_total + 128) / cic->ttime_samples;
}
L
Linus Torvalds 已提交
1449

1450 1451
static void
cfq_update_io_seektime(struct cfq_data *cfqd, struct cfq_io_context *cic,
J
Jens Axboe 已提交
1452
		       struct request *rq)
1453 1454 1455 1456
{
	sector_t sdist;
	u64 total;

J
Jens Axboe 已提交
1457 1458
	if (cic->last_request_pos < rq->sector)
		sdist = rq->sector - cic->last_request_pos;
1459
	else
J
Jens Axboe 已提交
1460
		sdist = cic->last_request_pos - rq->sector;
1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476

	/*
	 * Don't allow the seek distance to get too large from the
	 * odd fragment, pagein, etc
	 */
	if (cic->seek_samples <= 60) /* second&third seek */
		sdist = min(sdist, (cic->seek_mean * 4) + 2*1024*1024);
	else
		sdist = min(sdist, (cic->seek_mean * 4)	+ 2*1024*64);

	cic->seek_samples = (7*cic->seek_samples + 256) / 8;
	cic->seek_total = (7*cic->seek_total + (u64)256*sdist) / 8;
	total = cic->seek_total + (cic->seek_samples/2);
	do_div(total, cic->seek_samples);
	cic->seek_mean = (sector_t)total;
}
L
Linus Torvalds 已提交
1477

1478 1479 1480 1481 1482 1483 1484 1485
/*
 * Disable idle window if the process thinks too long or seeks so much that
 * it doesn't matter
 */
static void
cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq,
		       struct cfq_io_context *cic)
{
J
Jens Axboe 已提交
1486
	int enable_idle = cfq_cfqq_idle_window(cfqq);
L
Linus Torvalds 已提交
1487

1488 1489
	if (!cic->ioc->task || !cfqd->cfq_slice_idle ||
	    (cfqd->hw_tag && CIC_SEEKY(cic)))
1490 1491 1492 1493 1494 1495
		enable_idle = 0;
	else if (sample_valid(cic->ttime_samples)) {
		if (cic->ttime_mean > cfqd->cfq_slice_idle)
			enable_idle = 0;
		else
			enable_idle = 1;
L
Linus Torvalds 已提交
1496 1497
	}

J
Jens Axboe 已提交
1498 1499 1500 1501
	if (enable_idle)
		cfq_mark_cfqq_idle_window(cfqq);
	else
		cfq_clear_cfqq_idle_window(cfqq);
1502
}
L
Linus Torvalds 已提交
1503

1504 1505 1506 1507 1508 1509 1510

/*
 * Check if new_cfqq should preempt the currently active queue. Return 0 for
 * no or if we aren't sure, a 1 will cause a preempt.
 */
static int
cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq,
J
Jens Axboe 已提交
1511
		   struct request *rq)
1512 1513 1514 1515 1516 1517 1518
{
	struct cfq_queue *cfqq = cfqd->active_queue;

	if (cfq_class_idle(new_cfqq))
		return 0;

	if (!cfqq)
1519
		return 0;
1520 1521 1522

	if (cfq_class_idle(cfqq))
		return 1;
J
Jens Axboe 已提交
1523
	if (!cfq_cfqq_wait_request(new_cfqq))
1524 1525 1526 1527 1528 1529
		return 0;
	/*
	 * if it doesn't have slice left, forget it
	 */
	if (new_cfqq->slice_left < cfqd->cfq_slice_idle)
		return 0;
J
Jens Axboe 已提交
1530
	if (rq_is_sync(rq) && !cfq_cfqq_sync(cfqq))
1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550
		return 1;

	return 0;
}

/*
 * cfqq preempts the active queue. if we allowed preempt with no slice left,
 * let it have half of its nominal slice.
 */
static void cfq_preempt_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq)
{
	struct cfq_queue *__cfqq, *next;

	list_for_each_entry_safe(__cfqq, next, &cfqd->cur_rr, cfq_list)
		cfq_resort_rr_list(__cfqq, 1);

	if (!cfqq->slice_left)
		cfqq->slice_left = cfq_prio_to_slice(cfqd, cfqq) / 2;

	cfqq->slice_end = cfqq->slice_left + jiffies;
1551
	cfq_slice_expired(cfqd, 1);
1552 1553 1554 1555
	__cfq_set_active_queue(cfqd, cfqq);
}

/*
J
Jens Axboe 已提交
1556
 * Called when a new fs request (rq) is added (to cfqq). Check if there's
1557 1558 1559
 * something we should do about it
 */
static void
J
Jens Axboe 已提交
1560 1561
cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq,
		struct request *rq)
1562
{
J
Jens Axboe 已提交
1563
	struct cfq_io_context *cic = RQ_CIC(rq);
1564

1565
	/*
1566
	 * check if this request is a better next-serve candidate)) {
1567
	 */
J
Jens Axboe 已提交
1568 1569
	cfqq->next_rq = cfq_choose_req(cfqd, cfqq->next_rq, rq);
	BUG_ON(!cfqq->next_rq);
1570

J
Jens Axboe 已提交
1571 1572 1573 1574
	/*
	 * we never wait for an async request and we don't allow preemption
	 * of an async request. so just return early
	 */
J
Jens Axboe 已提交
1575
	if (!rq_is_sync(rq)) {
1576 1577 1578 1579 1580 1581 1582
		/*
		 * sync process issued an async request, if it's waiting
		 * then expire it and kick rq handling.
		 */
		if (cic == cfqd->active_cic &&
		    del_timer(&cfqd->idle_slice_timer)) {
			cfq_slice_expired(cfqd, 0);
1583
			blk_start_queueing(cfqd->queue);
1584
		}
J
Jens Axboe 已提交
1585
		return;
1586
	}
1587

J
Jens Axboe 已提交
1588
	cfq_update_io_thinktime(cfqd, cic);
J
Jens Axboe 已提交
1589
	cfq_update_io_seektime(cfqd, cic, rq);
J
Jens Axboe 已提交
1590 1591 1592
	cfq_update_idle_window(cfqd, cfqq, cic);

	cic->last_queue = jiffies;
J
Jens Axboe 已提交
1593
	cic->last_request_pos = rq->sector + rq->nr_sectors;
1594 1595 1596 1597 1598 1599 1600

	if (cfqq == cfqd->active_queue) {
		/*
		 * if we are waiting for a request for this queue, let it rip
		 * immediately and flag that we must not expire this queue
		 * just now
		 */
J
Jens Axboe 已提交
1601 1602
		if (cfq_cfqq_wait_request(cfqq)) {
			cfq_mark_cfqq_must_dispatch(cfqq);
1603
			del_timer(&cfqd->idle_slice_timer);
1604
			blk_start_queueing(cfqd->queue);
1605
		}
J
Jens Axboe 已提交
1606
	} else if (cfq_should_preempt(cfqd, cfqq, rq)) {
1607 1608 1609 1610 1611 1612
		/*
		 * not the active queue - expire current slice if it is
		 * idle and has expired it's mean thinktime or this new queue
		 * has some old slice time left and is of higher priority
		 */
		cfq_preempt_queue(cfqd, cfqq);
J
Jens Axboe 已提交
1613
		cfq_mark_cfqq_must_dispatch(cfqq);
1614
		blk_start_queueing(cfqd->queue);
1615
	}
L
Linus Torvalds 已提交
1616 1617
}

1618
static void cfq_insert_request(request_queue_t *q, struct request *rq)
L
Linus Torvalds 已提交
1619
{
1620
	struct cfq_data *cfqd = q->elevator->elevator_data;
J
Jens Axboe 已提交
1621
	struct cfq_queue *cfqq = RQ_CFQQ(rq);
1622 1623

	cfq_init_prio_data(cfqq);
L
Linus Torvalds 已提交
1624

J
Jens Axboe 已提交
1625
	cfq_add_rq_rb(rq);
L
Linus Torvalds 已提交
1626

1627 1628 1629
	if (!cfq_cfqq_on_rr(cfqq))
		cfq_add_cfqq_rr(cfqd, cfqq);

1630 1631
	list_add_tail(&rq->queuelist, &cfqq->fifo);

J
Jens Axboe 已提交
1632
	cfq_rq_enqueued(cfqd, cfqq, rq);
L
Linus Torvalds 已提交
1633 1634 1635 1636
}

static void cfq_completed_request(request_queue_t *q, struct request *rq)
{
J
Jens Axboe 已提交
1637
	struct cfq_queue *cfqq = RQ_CFQQ(rq);
1638
	struct cfq_data *cfqd = cfqq->cfqd;
1639
	const int sync = rq_is_sync(rq);
1640
	unsigned long now;
L
Linus Torvalds 已提交
1641

1642
	now = jiffies;
L
Linus Torvalds 已提交
1643

1644 1645 1646 1647
	WARN_ON(!cfqd->rq_in_driver);
	WARN_ON(!cfqq->on_dispatch[sync]);
	cfqd->rq_in_driver--;
	cfqq->on_dispatch[sync]--;
L
Linus Torvalds 已提交
1648

1649 1650
	if (!cfq_class_idle(cfqq))
		cfqd->last_end_request = now;
J
Jens Axboe 已提交
1651

1652 1653
	if (!cfq_cfqq_dispatched(cfqq) && cfq_cfqq_on_rr(cfqq))
		cfq_resort_rr_list(cfqq, 0);
L
Linus Torvalds 已提交
1654

1655
	if (sync)
J
Jens Axboe 已提交
1656
		RQ_CIC(rq)->last_end_request = now;
1657 1658 1659 1660 1661 1662 1663 1664

	/*
	 * If this is the active queue, check if it needs to be expired,
	 * or if we want to idle in case it has no pending requests.
	 */
	if (cfqd->active_queue == cfqq) {
		if (time_after(now, cfqq->slice_end))
			cfq_slice_expired(cfqd, 0);
1665
		else if (sync && RB_EMPTY_ROOT(&cfqq->sort_list)) {
1666 1667 1668 1669
			if (!cfq_arm_slice_timer(cfqd, cfqq))
				cfq_schedule_dispatch(cfqd);
		}
	}
L
Linus Torvalds 已提交
1670 1671
}

1672 1673 1674 1675 1676
/*
 * we temporarily boost lower priority queues if they are holding fs exclusive
 * resources. they are boosted to normal prio (CLASS_BE/4)
 */
static void cfq_prio_boost(struct cfq_queue *cfqq)
L
Linus Torvalds 已提交
1677
{
1678 1679
	const int ioprio_class = cfqq->ioprio_class;
	const int ioprio = cfqq->ioprio;
L
Linus Torvalds 已提交
1680

1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698
	if (has_fs_excl()) {
		/*
		 * boost idle prio on transactions that would lock out other
		 * users of the filesystem
		 */
		if (cfq_class_idle(cfqq))
			cfqq->ioprio_class = IOPRIO_CLASS_BE;
		if (cfqq->ioprio > IOPRIO_NORM)
			cfqq->ioprio = IOPRIO_NORM;
	} else {
		/*
		 * check if we need to unboost the queue
		 */
		if (cfqq->ioprio_class != cfqq->org_ioprio_class)
			cfqq->ioprio_class = cfqq->org_ioprio_class;
		if (cfqq->ioprio != cfqq->org_ioprio)
			cfqq->ioprio = cfqq->org_ioprio;
	}
L
Linus Torvalds 已提交
1699

1700 1701 1702 1703
	/*
	 * refile between round-robin lists if we moved the priority class
	 */
	if ((ioprio_class != cfqq->ioprio_class || ioprio != cfqq->ioprio) &&
J
Jens Axboe 已提交
1704
	    cfq_cfqq_on_rr(cfqq))
1705 1706
		cfq_resort_rr_list(cfqq, 0);
}
L
Linus Torvalds 已提交
1707

1708
static inline int __cfq_may_queue(struct cfq_queue *cfqq)
1709
{
J
Jens Axboe 已提交
1710
	if ((cfq_cfqq_wait_request(cfqq) || cfq_cfqq_must_alloc(cfqq)) &&
A
Andrew Morton 已提交
1711
	    !cfq_cfqq_must_alloc_slice(cfqq)) {
J
Jens Axboe 已提交
1712
		cfq_mark_cfqq_must_alloc_slice(cfqq);
1713
		return ELV_MQUEUE_MUST;
J
Jens Axboe 已提交
1714
	}
L
Linus Torvalds 已提交
1715

1716 1717 1718
	return ELV_MQUEUE_MAY;
}

1719
static int cfq_may_queue(request_queue_t *q, int rw)
1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730
{
	struct cfq_data *cfqd = q->elevator->elevator_data;
	struct task_struct *tsk = current;
	struct cfq_queue *cfqq;

	/*
	 * don't force setup of a queue from here, as a call to may_queue
	 * does not necessarily imply that a request actually will be queued.
	 * so just lookup a possibly existing queue, or return 'may queue'
	 * if that fails
	 */
J
Jens Axboe 已提交
1731
	cfqq = cfq_find_cfq_hash(cfqd, cfq_queue_pid(tsk, rw), tsk->ioprio);
1732 1733 1734 1735
	if (cfqq) {
		cfq_init_prio_data(cfqq);
		cfq_prio_boost(cfqq);

1736
		return __cfq_may_queue(cfqq);
1737 1738 1739
	}

	return ELV_MQUEUE_MAY;
L
Linus Torvalds 已提交
1740 1741 1742 1743 1744 1745 1746
}

/*
 * queue lock held here
 */
static void cfq_put_request(request_queue_t *q, struct request *rq)
{
J
Jens Axboe 已提交
1747
	struct cfq_queue *cfqq = RQ_CFQQ(rq);
L
Linus Torvalds 已提交
1748

J
Jens Axboe 已提交
1749
	if (cfqq) {
1750
		const int rw = rq_data_dir(rq);
L
Linus Torvalds 已提交
1751

1752 1753
		BUG_ON(!cfqq->allocated[rw]);
		cfqq->allocated[rw]--;
L
Linus Torvalds 已提交
1754

J
Jens Axboe 已提交
1755
		put_io_context(RQ_CIC(rq)->ioc);
L
Linus Torvalds 已提交
1756 1757

		rq->elevator_private = NULL;
J
Jens Axboe 已提交
1758
		rq->elevator_private2 = NULL;
L
Linus Torvalds 已提交
1759 1760 1761 1762 1763 1764

		cfq_put_queue(cfqq);
	}
}

/*
1765
 * Allocate cfq data structures associated with this request.
L
Linus Torvalds 已提交
1766
 */
1767
static int
1768
cfq_set_request(request_queue_t *q, struct request *rq, gfp_t gfp_mask)
L
Linus Torvalds 已提交
1769 1770
{
	struct cfq_data *cfqd = q->elevator->elevator_data;
J
Jens Axboe 已提交
1771
	struct task_struct *tsk = current;
L
Linus Torvalds 已提交
1772 1773
	struct cfq_io_context *cic;
	const int rw = rq_data_dir(rq);
J
Jens Axboe 已提交
1774
	pid_t key = cfq_queue_pid(tsk, rw);
1775
	struct cfq_queue *cfqq;
L
Linus Torvalds 已提交
1776
	unsigned long flags;
1777
	int is_sync = key != CFQ_KEY_ASYNC;
L
Linus Torvalds 已提交
1778 1779 1780

	might_sleep_if(gfp_mask & __GFP_WAIT);

1781
	cic = cfq_get_io_context(cfqd, gfp_mask);
1782

L
Linus Torvalds 已提交
1783 1784
	spin_lock_irqsave(q->queue_lock, flags);

1785 1786 1787
	if (!cic)
		goto queue_fail;

1788
	if (!cic->cfqq[is_sync]) {
1789
		cfqq = cfq_get_queue(cfqd, key, tsk, gfp_mask);
1790 1791
		if (!cfqq)
			goto queue_fail;
L
Linus Torvalds 已提交
1792

1793
		cic->cfqq[is_sync] = cfqq;
1794
	} else
1795
		cfqq = cic->cfqq[is_sync];
L
Linus Torvalds 已提交
1796 1797

	cfqq->allocated[rw]++;
J
Jens Axboe 已提交
1798
	cfq_clear_cfqq_must_alloc(cfqq);
1799
	atomic_inc(&cfqq->ref);
L
Linus Torvalds 已提交
1800

J
Jens Axboe 已提交
1801
	spin_unlock_irqrestore(q->queue_lock, flags);
J
Jens Axboe 已提交
1802

J
Jens Axboe 已提交
1803 1804 1805
	rq->elevator_private = cic;
	rq->elevator_private2 = cfqq;
	return 0;
L
Linus Torvalds 已提交
1806

1807 1808 1809
queue_fail:
	if (cic)
		put_io_context(cic->ioc);
1810

J
Jens Axboe 已提交
1811
	cfq_schedule_dispatch(cfqd);
L
Linus Torvalds 已提交
1812 1813 1814 1815
	spin_unlock_irqrestore(q->queue_lock, flags);
	return 1;
}

static void cfq_kick_queue(void *data)
{
	request_queue_t *q = data;
	unsigned long flags;

	spin_lock_irqsave(q->queue_lock, flags);
	blk_start_queueing(q);
	spin_unlock_irqrestore(q->queue_lock, flags);
}

/*
 * Timer running if the active_queue is currently idling inside its time slice
 */
static void cfq_idle_slice_timer(unsigned long data)
{
	struct cfq_data *cfqd = (struct cfq_data *) data;
	struct cfq_queue *cfqq;
	unsigned long flags;

	spin_lock_irqsave(cfqd->queue->queue_lock, flags);

	if ((cfqq = cfqd->active_queue) != NULL) {
		unsigned long now = jiffies;

		/*
		 * expired
		 */
		if (time_after(now, cfqq->slice_end))
			goto expire;

		/*
		 * only expire and reinvoke the request handler if there are
		 * other queues with pending requests
		 */
		if (!cfqd->busy_queues)
			goto out_cont;

		/*
		 * not expired and it has a request pending, let it dispatch
		 */
		if (!RB_EMPTY_ROOT(&cfqq->sort_list)) {
			cfq_mark_cfqq_must_dispatch(cfqq);
			goto out_kick;
		}
	}
expire:
	cfq_slice_expired(cfqd, 0);
out_kick:
	cfq_schedule_dispatch(cfqd);
out_cont:
	spin_unlock_irqrestore(cfqd->queue->queue_lock, flags);
}

/*
 * Timer running if an idle class queue is waiting for service
 */
static void cfq_idle_class_timer(unsigned long data)
{
	struct cfq_data *cfqd = (struct cfq_data *) data;
	unsigned long flags, end;

	spin_lock_irqsave(cfqd->queue->queue_lock, flags);

	/*
	 * race with a non-idle queue, reset timer
	 */
	end = cfqd->last_end_request + CFQ_IDLE_GRACE;
	if (!time_after_eq(jiffies, end))
		mod_timer(&cfqd->idle_class_timer, end);
	else
		cfq_schedule_dispatch(cfqd);

	spin_unlock_irqrestore(cfqd->queue->queue_lock, flags);
}

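/*
 * Stop both idle timers and let blk_sync_queue() wait for any per-queue work
 * still in flight before the elevator data is torn down.
 */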
static void cfq_shutdown_timer_wq(struct cfq_data *cfqd)
{
	del_timer_sync(&cfqd->idle_slice_timer);
	del_timer_sync(&cfqd->idle_class_timer);
	blk_sync_queue(cfqd->queue);
}

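/*
 * Elevator teardown: expire whatever queue is still active, detach every
 * remaining cfq_io_context on this device, then free the hash and cfqd.
 * The timers are shut down a second time after the lock is dropped,
 * presumably because exiting the io contexts can re-arm them.
 */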
static void cfq_exit_queue(elevator_t *e)
{
	struct cfq_data *cfqd = e->elevator_data;
	request_queue_t *q = cfqd->queue;

	cfq_shutdown_timer_wq(cfqd);

	spin_lock_irq(q->queue_lock);

	if (cfqd->active_queue)
		__cfq_slice_expired(cfqd, cfqd->active_queue, 0);

	while (!list_empty(&cfqd->cic_list)) {
		struct cfq_io_context *cic = list_entry(cfqd->cic_list.next,
							struct cfq_io_context,
							queue_list);

		__cfq_exit_single_io_context(cfqd, cic);
	}

	spin_unlock_irq(q->queue_lock);

	cfq_shutdown_timer_wq(cfqd);

	kfree(cfqd->cfq_hash);
	kfree(cfqd);
}

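/*
 * Allocate and initialise the per-device scheduler data: the round-robin
 * lists, the cfqq lookup hash, both idle timers and the unplug work, with
 * the tunables seeded from the module-wide defaults.  The pointer returned
 * here is what the core stores as e->elevator_data (see cfq_exit_queue()).
 */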
static void *cfq_init_queue(request_queue_t *q, elevator_t *e)
{
	struct cfq_data *cfqd;
	int i;

	cfqd = kmalloc_node(sizeof(*cfqd), GFP_KERNEL, q->node);
	if (!cfqd)
		return NULL;

	memset(cfqd, 0, sizeof(*cfqd));

	for (i = 0; i < CFQ_PRIO_LISTS; i++)
		INIT_LIST_HEAD(&cfqd->rr_list[i]);

	INIT_LIST_HEAD(&cfqd->busy_rr);
	INIT_LIST_HEAD(&cfqd->cur_rr);
	INIT_LIST_HEAD(&cfqd->idle_rr);
	INIT_LIST_HEAD(&cfqd->cic_list);

	cfqd->cfq_hash = kmalloc_node(sizeof(struct hlist_head) * CFQ_QHASH_ENTRIES, GFP_KERNEL, q->node);
	if (!cfqd->cfq_hash)
		goto out_free;

	for (i = 0; i < CFQ_QHASH_ENTRIES; i++)
		INIT_HLIST_HEAD(&cfqd->cfq_hash[i]);

	cfqd->queue = q;

	init_timer(&cfqd->idle_slice_timer);
	cfqd->idle_slice_timer.function = cfq_idle_slice_timer;
	cfqd->idle_slice_timer.data = (unsigned long) cfqd;

	init_timer(&cfqd->idle_class_timer);
	cfqd->idle_class_timer.function = cfq_idle_class_timer;
	cfqd->idle_class_timer.data = (unsigned long) cfqd;

	INIT_WORK(&cfqd->unplug_work, cfq_kick_queue, q);

	cfqd->cfq_quantum = cfq_quantum;
	cfqd->cfq_fifo_expire[0] = cfq_fifo_expire[0];
	cfqd->cfq_fifo_expire[1] = cfq_fifo_expire[1];
	cfqd->cfq_back_max = cfq_back_max;
	cfqd->cfq_back_penalty = cfq_back_penalty;
	cfqd->cfq_slice[0] = cfq_slice_async;
	cfqd->cfq_slice[1] = cfq_slice_sync;
	cfqd->cfq_slice_async_rq = cfq_slice_async_rq;
	cfqd->cfq_slice_idle = cfq_slice_idle;

	return cfqd;
out_free:
	kfree(cfqd);
	return NULL;
}

static void cfq_slab_kill(void)
{
	if (cfq_pool)
		kmem_cache_destroy(cfq_pool);
	if (cfq_ioc_pool)
		kmem_cache_destroy(cfq_ioc_pool);
}

static int __init cfq_slab_setup(void)
{
	cfq_pool = kmem_cache_create("cfq_pool", sizeof(struct cfq_queue), 0, 0,
					NULL, NULL);
	if (!cfq_pool)
		goto fail;

	cfq_ioc_pool = kmem_cache_create("cfq_ioc_pool",
			sizeof(struct cfq_io_context), 0, 0, NULL, NULL);
	if (!cfq_ioc_pool)
		goto fail;

	return 0;
fail:
	cfq_slab_kill();
	return -ENOMEM;
}

/*
 * sysfs parts below -->
 */

static ssize_t
cfq_var_show(unsigned int var, char *page)
{
	return sprintf(page, "%d\n", var);
}

static ssize_t
cfq_var_store(unsigned int *var, const char *page, size_t count)
{
	char *p = (char *) page;

	*var = simple_strtoul(p, &p, 10);
	return count;
}

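/*
 * The SHOW_FUNCTION/STORE_FUNCTION macros below stamp out one sysfs
 * show/store pair per tunable; the __CONV argument selects whether the value
 * is converted between jiffies (internal) and milliseconds (user visible).
 */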
#define SHOW_FUNCTION(__FUNC, __VAR, __CONV)				\
static ssize_t __FUNC(elevator_t *e, char *page)			\
{									\
	struct cfq_data *cfqd = e->elevator_data;			\
	unsigned int __data = __VAR;					\
	if (__CONV)							\
		__data = jiffies_to_msecs(__data);			\
	return cfq_var_show(__data, (page));				\
}
SHOW_FUNCTION(cfq_quantum_show, cfqd->cfq_quantum, 0);
SHOW_FUNCTION(cfq_fifo_expire_sync_show, cfqd->cfq_fifo_expire[1], 1);
SHOW_FUNCTION(cfq_fifo_expire_async_show, cfqd->cfq_fifo_expire[0], 1);
SHOW_FUNCTION(cfq_back_seek_max_show, cfqd->cfq_back_max, 0);
SHOW_FUNCTION(cfq_back_seek_penalty_show, cfqd->cfq_back_penalty, 0);
SHOW_FUNCTION(cfq_slice_idle_show, cfqd->cfq_slice_idle, 1);
SHOW_FUNCTION(cfq_slice_sync_show, cfqd->cfq_slice[1], 1);
SHOW_FUNCTION(cfq_slice_async_show, cfqd->cfq_slice[0], 1);
SHOW_FUNCTION(cfq_slice_async_rq_show, cfqd->cfq_slice_async_rq, 0);
#undef SHOW_FUNCTION

#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV)			\
static ssize_t __FUNC(elevator_t *e, const char *page, size_t count)	\
{									\
	struct cfq_data *cfqd = e->elevator_data;			\
	unsigned int __data;						\
	int ret = cfq_var_store(&__data, (page), count);		\
	if (__data < (MIN))						\
		__data = (MIN);						\
	else if (__data > (MAX))					\
		__data = (MAX);						\
	if (__CONV)							\
		*(__PTR) = msecs_to_jiffies(__data);			\
	else								\
		*(__PTR) = __data;					\
	return ret;							\
}
STORE_FUNCTION(cfq_quantum_store, &cfqd->cfq_quantum, 1, UINT_MAX, 0);
STORE_FUNCTION(cfq_fifo_expire_sync_store, &cfqd->cfq_fifo_expire[1], 1, UINT_MAX, 1);
STORE_FUNCTION(cfq_fifo_expire_async_store, &cfqd->cfq_fifo_expire[0], 1, UINT_MAX, 1);
STORE_FUNCTION(cfq_back_seek_max_store, &cfqd->cfq_back_max, 0, UINT_MAX, 0);
STORE_FUNCTION(cfq_back_seek_penalty_store, &cfqd->cfq_back_penalty, 1, UINT_MAX, 0);
STORE_FUNCTION(cfq_slice_idle_store, &cfqd->cfq_slice_idle, 0, UINT_MAX, 1);
STORE_FUNCTION(cfq_slice_sync_store, &cfqd->cfq_slice[1], 1, UINT_MAX, 1);
STORE_FUNCTION(cfq_slice_async_store, &cfqd->cfq_slice[0], 1, UINT_MAX, 1);
STORE_FUNCTION(cfq_slice_async_rq_store, &cfqd->cfq_slice_async_rq, 1, UINT_MAX, 0);
#undef STORE_FUNCTION

#define CFQ_ATTR(name) \
	__ATTR(name, S_IRUGO|S_IWUSR, cfq_##name##_show, cfq_##name##_store)

static struct elv_fs_entry cfq_attrs[] = {
	CFQ_ATTR(quantum),
	CFQ_ATTR(fifo_expire_sync),
	CFQ_ATTR(fifo_expire_async),
	CFQ_ATTR(back_seek_max),
	CFQ_ATTR(back_seek_penalty),
	CFQ_ATTR(slice_sync),
	CFQ_ATTR(slice_async),
	CFQ_ATTR(slice_async_rq),
	CFQ_ATTR(slice_idle),
	__ATTR_NULL
};
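/*
 * With the attribute table above, the tunables typically appear under
 * /sys/block/<device>/queue/iosched/ while "cfq" is the active scheduler,
 * e.g. (device name purely as an example):
 *
 *	echo 8 > /sys/block/sda/queue/iosched/quantum
 *	cat /sys/block/sda/queue/iosched/slice_idle	(value in milliseconds)
 */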

static struct elevator_type iosched_cfq = {
	.ops = {
		.elevator_merge_fn = 		cfq_merge,
		.elevator_merged_fn =		cfq_merged_request,
		.elevator_merge_req_fn =	cfq_merged_requests,
		.elevator_dispatch_fn =		cfq_dispatch_requests,
		.elevator_add_req_fn =		cfq_insert_request,
		.elevator_activate_req_fn =	cfq_activate_request,
		.elevator_deactivate_req_fn =	cfq_deactivate_request,
		.elevator_queue_empty_fn =	cfq_queue_empty,
		.elevator_completed_req_fn =	cfq_completed_request,
		.elevator_former_req_fn =	elv_rb_former_request,
		.elevator_latter_req_fn =	elv_rb_latter_request,
		.elevator_set_req_fn =		cfq_set_request,
		.elevator_put_req_fn =		cfq_put_request,
		.elevator_may_queue_fn =	cfq_may_queue,
		.elevator_init_fn =		cfq_init_queue,
		.elevator_exit_fn =		cfq_exit_queue,
		.trim =				cfq_free_io_context,
	},
	.elevator_attrs =	cfq_attrs,
	.elevator_name =	"cfq",
	.elevator_owner =	THIS_MODULE,
};

static int __init cfq_init(void)
{
	int ret;

	/*
	 * could be 0 on HZ < 1000 setups
	 */
	if (!cfq_slice_async)
		cfq_slice_async = 1;
	if (!cfq_slice_idle)
		cfq_slice_idle = 1;

	if (cfq_slab_setup())
		return -ENOMEM;

	ret = elv_register(&iosched_cfq);
	if (ret)
		cfq_slab_kill();

	return ret;
}

static void __exit cfq_exit(void)
{
	DECLARE_COMPLETION(all_gone);
	elv_unregister(&iosched_cfq);
	ioc_gone = &all_gone;
	/* ioc_gone's update must be visible before reading ioc_count */
	smp_wmb();
	if (elv_ioc_count_read(ioc_count))
		wait_for_completion(ioc_gone);
	synchronize_rcu();
	cfq_slab_kill();
}
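/*
 * Unload ordering, as I read it: unregister the elevator first, then wait on
 * ioc_gone until the last cfq_io_context has been dropped, and let an RCU
 * grace period pass before the slab caches are destroyed so no freed object
 * can still be referenced.
 */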

module_init(cfq_init);
module_exit(cfq_exit);

MODULE_AUTHOR("Jens Axboe");
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Completely Fair Queueing IO scheduler");