/*
 *  CFQ, or complete fairness queueing, disk scheduler.
 *
 *  Based on ideas from a previously unfinished io
 *  scheduler (round robin per-process disk scheduling) and Andrea Arcangeli.
 *
 *  Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
 */
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/elevator.h>
#include <linux/jiffies.h>
#include <linux/rbtree.h>
#include <linux/ioprio.h>
#include <linux/blktrace_api.h>
#include "cfq.h"

/*
 * tunables
 */
/* max queue in one round of service */
static const int cfq_quantum = 8;
static const int cfq_fifo_expire[2] = { HZ / 4, HZ / 8 };
/* maximum backwards seek, in KiB */
static const int cfq_back_max = 16 * 1024;
/* penalty of a backwards seek */
static const int cfq_back_penalty = 2;
static const int cfq_slice_sync = HZ / 10;
static int cfq_slice_async = HZ / 25;
static const int cfq_slice_async_rq = 2;
static int cfq_slice_idle = HZ / 125;
static int cfq_group_idle = HZ / 125;
static const int cfq_target_latency = HZ * 3/10; /* 300 ms */
static const int cfq_hist_divisor = 4;

/*
 * offset from end of service tree
 */
#define CFQ_IDLE_DELAY		(HZ / 5)

/*
 * below this threshold, we consider thinktime immediate
 */
#define CFQ_MIN_TT		(2)

#define CFQ_SLICE_SCALE		(5)
#define CFQ_HW_QUEUE_MIN	(5)
#define CFQ_SERVICE_SHIFT       12

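/*
 * Note on the thresholds below: seek_history is a 32-bit sliding window with
 * one bit per recent request (set when that request was far from the previous
 * one), so CFQQ_SEEKY() flags a queue once more than 1/8 of the last 32
 * samples (i.e. more than 4) were seeks.
 */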
#define CFQQ_SEEK_THR		(sector_t)(8 * 100)
#define CFQQ_CLOSE_THR		(sector_t)(8 * 1024)
#define CFQQ_SECT_THR_NONROT	(sector_t)(2 * 32)
#define CFQQ_SEEKY(cfqq)	(hweight32(cfqq->seek_history) > 32/8)

#define RQ_CIC(rq)		\
	((struct cfq_io_context *) (rq)->elevator_private[0])
#define RQ_CFQQ(rq)		(struct cfq_queue *) ((rq)->elevator_private[1])
#define RQ_CFQG(rq)		(struct cfq_group *) ((rq)->elevator_private[2])

static struct kmem_cache *cfq_pool;
static struct kmem_cache *cfq_ioc_pool;

static DEFINE_PER_CPU(unsigned long, cfq_ioc_count);
static struct completion *ioc_gone;
static DEFINE_SPINLOCK(ioc_gone_lock);

static DEFINE_SPINLOCK(cic_index_lock);
static DEFINE_IDA(cic_index_ida);

#define CFQ_PRIO_LISTS		IOPRIO_BE_NR
#define cfq_class_idle(cfqq)	((cfqq)->ioprio_class == IOPRIO_CLASS_IDLE)
#define cfq_class_rt(cfqq)	((cfqq)->ioprio_class == IOPRIO_CLASS_RT)

#define sample_valid(samples)	((samples) > 80)
#define rb_entry_cfqg(node)	rb_entry((node), struct cfq_group, rb_node)

/*
 * Most of our rbtree usage is for sorting with min extraction, so
 * if we cache the leftmost node we don't have to walk down the tree
 * to find it. Idea borrowed from Ingo Molnar's CFS scheduler. We should
 * move this into the elevator for the rq sorting as well.
 */
struct cfq_rb_root {
	struct rb_root rb;
	struct rb_node *left;
	unsigned count;
	unsigned total_weight;
	u64 min_vdisktime;
};
#define CFQ_RB_ROOT	(struct cfq_rb_root) { .rb = RB_ROOT, .left = NULL, \
			.count = 0, .min_vdisktime = 0, }

/*
 * Per process-grouping structure
 */
struct cfq_queue {
	/* reference count */
	int ref;
	/* various state flags, see below */
	unsigned int flags;
	/* parent cfq_data */
	struct cfq_data *cfqd;
	/* service_tree member */
	struct rb_node rb_node;
	/* service_tree key */
	unsigned long rb_key;
	/* prio tree member */
	struct rb_node p_node;
	/* prio tree root we belong to, if any */
	struct rb_root *p_root;
	/* sorted list of pending requests */
	struct rb_root sort_list;
	/* if fifo isn't expired, next request to serve */
	struct request *next_rq;
	/* requests queued in sort_list */
	int queued[2];
	/* currently allocated requests */
	int allocated[2];
	/* fifo list of requests in sort_list */
	struct list_head fifo;

	/* time when queue got scheduled in to dispatch first request. */
	unsigned long dispatch_start;
	unsigned int allocated_slice;
	unsigned int slice_dispatch;
	/* time when first request from queue completed and slice started. */
	unsigned long slice_start;
	unsigned long slice_end;
	long slice_resid;

	/* pending metadata requests */
	int meta_pending;
	/* number of requests that are on the dispatch list or inside driver */
	int dispatched;

	/* io prio of this group */
	unsigned short ioprio, org_ioprio;
	unsigned short ioprio_class, org_ioprio_class;

	pid_t pid;

	u32 seek_history;
	sector_t last_request_pos;

	struct cfq_rb_root *service_tree;
	struct cfq_queue *new_cfqq;
	struct cfq_group *cfqg;
	/* Number of sectors dispatched from queue in single dispatch round */
	unsigned long nr_sectors;
};

/*
 * First index in the service_trees.
 * IDLE is handled separately, so it has negative index
 */
enum wl_prio_t {
	BE_WORKLOAD = 0,
	RT_WORKLOAD = 1,
	IDLE_WORKLOAD = 2,
	CFQ_PRIO_NR,
};

/*
 * Second index in the service_trees.
 */
enum wl_type_t {
	ASYNC_WORKLOAD = 0,
	SYNC_NOIDLE_WORKLOAD = 1,
	SYNC_WORKLOAD = 2
};

/* This is per cgroup per device grouping structure */
struct cfq_group {
	/* group service_tree member */
	struct rb_node rb_node;

	/* group service_tree key */
	u64 vdisktime;
	unsigned int weight;

	/* number of cfqq currently on this group */
	int nr_cfqq;

	/*
	 * Per group busy queues average. Useful for workload slice calc. We
	 * create the array for each prio class but at run time it is used
	 * only for RT and BE class and slot for IDLE class remains unused.
	 * This is primarily done to avoid confusion and a gcc warning.
	 */
	unsigned int busy_queues_avg[CFQ_PRIO_NR];
	/*
	 * rr lists of queues with requests. We maintain service trees for
	 * RT and BE classes. These trees are subdivided in subclasses
	 * of SYNC, SYNC_NOIDLE and ASYNC based on workload type. For IDLE
	 * class there is no subclassification and all the cfq queues go on
	 * a single tree service_tree_idle.
	 * Counts are embedded in the cfq_rb_root
	 */
	struct cfq_rb_root service_trees[2][3];
	struct cfq_rb_root service_tree_idle;

	unsigned long saved_workload_slice;
	enum wl_type_t saved_workload;
	enum wl_prio_t saved_serving_prio;
	struct blkio_group blkg;
#ifdef CONFIG_CFQ_GROUP_IOSCHED
	struct hlist_node cfqd_node;
	int ref;
#endif
	/* number of requests that are on the dispatch list or inside driver */
	int dispatched;
};

/*
 * Per block device queue structure
 */
struct cfq_data {
	struct request_queue *queue;
	/* Root service tree for cfq_groups */
	struct cfq_rb_root grp_service_tree;
	struct cfq_group root_group;

	/*
	 * The priority currently being served
	 */
	enum wl_prio_t serving_prio;
	enum wl_type_t serving_type;
	unsigned long workload_expires;
	struct cfq_group *serving_group;

	/*
	 * Each priority tree is sorted by next_request position.  These
	 * trees are used when determining if two or more queues are
	 * interleaving requests (see cfq_close_cooperator).
	 */
	struct rb_root prio_trees[CFQ_PRIO_LISTS];

	unsigned int busy_queues;

	int rq_in_driver;
	int rq_in_flight[2];

	/*
	 * queue-depth detection
	 */
	int rq_queued;
	int hw_tag;
	/*
	 * hw_tag can be
	 * -1 => indeterminate, (cfq will behave as if NCQ is present, to allow better detection)
	 *  1 => NCQ is present (hw_tag_est_depth is the estimated max depth)
	 *  0 => no NCQ
	 */
	int hw_tag_est_depth;
	unsigned int hw_tag_samples;

	/*
	 * idle window management
	 */
	struct timer_list idle_slice_timer;
	struct work_struct unplug_work;

	struct cfq_queue *active_queue;
	struct cfq_io_context *active_cic;

	/*
	 * async queue for each priority case
	 */
	struct cfq_queue *async_cfqq[2][IOPRIO_BE_NR];
	struct cfq_queue *async_idle_cfqq;

	sector_t last_position;

	/*
	 * tunables, see top of file
	 */
	unsigned int cfq_quantum;
	unsigned int cfq_fifo_expire[2];
	unsigned int cfq_back_penalty;
	unsigned int cfq_back_max;
	unsigned int cfq_slice[2];
	unsigned int cfq_slice_async_rq;
	unsigned int cfq_slice_idle;
	unsigned int cfq_group_idle;
	unsigned int cfq_latency;

	unsigned int cic_index;
	struct list_head cic_list;

	/*
	 * Fallback dummy cfqq for extreme OOM conditions
	 */
	struct cfq_queue oom_cfqq;

	unsigned long last_delayed_sync;

	/* List of cfq groups being managed on this device*/
	struct hlist_head cfqg_list;
	struct rcu_head rcu;
};

static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd);

static struct cfq_rb_root *service_tree_for(struct cfq_group *cfqg,
					    enum wl_prio_t prio,
					    enum wl_type_t type)
{
	if (!cfqg)
		return NULL;

	if (prio == IDLE_WORKLOAD)
		return &cfqg->service_tree_idle;

	return &cfqg->service_trees[prio][type];
}

enum cfqq_state_flags {
	CFQ_CFQQ_FLAG_on_rr = 0,	/* on round-robin busy list */
	CFQ_CFQQ_FLAG_wait_request,	/* waiting for a request */
	CFQ_CFQQ_FLAG_must_dispatch,	/* must be allowed a dispatch */
	CFQ_CFQQ_FLAG_must_alloc_slice,	/* per-slice must_alloc flag */
	CFQ_CFQQ_FLAG_fifo_expire,	/* FIFO checked in this slice */
	CFQ_CFQQ_FLAG_idle_window,	/* slice idling enabled */
	CFQ_CFQQ_FLAG_prio_changed,	/* task priority has changed */
	CFQ_CFQQ_FLAG_slice_new,	/* no requests dispatched in slice */
	CFQ_CFQQ_FLAG_sync,		/* synchronous queue */
	CFQ_CFQQ_FLAG_coop,		/* cfqq is shared */
	CFQ_CFQQ_FLAG_split_coop,	/* shared cfqq will be split */
	CFQ_CFQQ_FLAG_deep,		/* sync cfqq experienced large depth */
	CFQ_CFQQ_FLAG_wait_busy,	/* Waiting for next request */
};

#define CFQ_CFQQ_FNS(name)						\
static inline void cfq_mark_cfqq_##name(struct cfq_queue *cfqq)		\
{									\
	(cfqq)->flags |= (1 << CFQ_CFQQ_FLAG_##name);			\
}									\
static inline void cfq_clear_cfqq_##name(struct cfq_queue *cfqq)	\
{									\
	(cfqq)->flags &= ~(1 << CFQ_CFQQ_FLAG_##name);			\
}									\
static inline int cfq_cfqq_##name(const struct cfq_queue *cfqq)		\
{									\
	return ((cfqq)->flags & (1 << CFQ_CFQQ_FLAG_##name)) != 0;	\
}

CFQ_CFQQ_FNS(on_rr);
CFQ_CFQQ_FNS(wait_request);
CFQ_CFQQ_FNS(must_dispatch);
CFQ_CFQQ_FNS(must_alloc_slice);
CFQ_CFQQ_FNS(fifo_expire);
CFQ_CFQQ_FNS(idle_window);
CFQ_CFQQ_FNS(prio_changed);
CFQ_CFQQ_FNS(slice_new);
CFQ_CFQQ_FNS(sync);
CFQ_CFQQ_FNS(coop);
CFQ_CFQQ_FNS(split_coop);
CFQ_CFQQ_FNS(deep);
CFQ_CFQQ_FNS(wait_busy);
#undef CFQ_CFQQ_FNS

#ifdef CONFIG_CFQ_GROUP_IOSCHED
#define cfq_log_cfqq(cfqd, cfqq, fmt, args...)	\
	blk_add_trace_msg((cfqd)->queue, "cfq%d%c %s " fmt, (cfqq)->pid, \
			cfq_cfqq_sync((cfqq)) ? 'S' : 'A', \
			blkg_path(&(cfqq)->cfqg->blkg), ##args);

#define cfq_log_cfqg(cfqd, cfqg, fmt, args...)				\
	blk_add_trace_msg((cfqd)->queue, "%s " fmt,			\
				blkg_path(&(cfqg)->blkg), ##args);      \

#else
#define cfq_log_cfqq(cfqd, cfqq, fmt, args...)	\
	blk_add_trace_msg((cfqd)->queue, "cfq%d " fmt, (cfqq)->pid, ##args)
#define cfq_log_cfqg(cfqd, cfqg, fmt, args...)		do {} while (0);
#endif
#define cfq_log(cfqd, fmt, args...)	\
	blk_add_trace_msg((cfqd)->queue, "cfq " fmt, ##args)

/* Traverses through cfq group service trees */
#define for_each_cfqg_st(cfqg, i, j, st) \
	for (i = 0; i <= IDLE_WORKLOAD; i++) \
		for (j = 0, st = i < IDLE_WORKLOAD ? &cfqg->service_trees[i][j]\
			: &cfqg->service_tree_idle; \
			(i < IDLE_WORKLOAD && j <= SYNC_WORKLOAD) || \
			(i == IDLE_WORKLOAD && j == 0); \
			j++, st = i < IDLE_WORKLOAD ? \
			&cfqg->service_trees[i][j]: NULL) \


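/*
 * Note: the iterator above walks the BE and RT service trees for each of the
 * three workload types (ASYNC, SYNC_NOIDLE, SYNC) and then the single idle
 * tree once, so every cfq_rb_root embedded in the group gets visited.
 */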
static inline bool iops_mode(struct cfq_data *cfqd)
{
	/*
	 * If we are not idling on queues and it is a NCQ drive, parallel
	 * execution of requests is on and measuring time is not possible
	 * in most of the cases until and unless we drive shallower queue
	 * depths and that becomes a performance bottleneck. In such cases
	 * switch to start providing fairness in terms of number of IOs.
	 */
	if (!cfqd->cfq_slice_idle && cfqd->hw_tag)
		return true;
	else
		return false;
}

static inline enum wl_prio_t cfqq_prio(struct cfq_queue *cfqq)
{
	if (cfq_class_idle(cfqq))
		return IDLE_WORKLOAD;
	if (cfq_class_rt(cfqq))
		return RT_WORKLOAD;
	return BE_WORKLOAD;
}


static enum wl_type_t cfqq_type(struct cfq_queue *cfqq)
{
	if (!cfq_cfqq_sync(cfqq))
		return ASYNC_WORKLOAD;
	if (!cfq_cfqq_idle_window(cfqq))
		return SYNC_NOIDLE_WORKLOAD;
	return SYNC_WORKLOAD;
}

static inline int cfq_group_busy_queues_wl(enum wl_prio_t wl,
					struct cfq_data *cfqd,
					struct cfq_group *cfqg)
{
	if (wl == IDLE_WORKLOAD)
		return cfqg->service_tree_idle.count;

	return cfqg->service_trees[wl][ASYNC_WORKLOAD].count
		+ cfqg->service_trees[wl][SYNC_NOIDLE_WORKLOAD].count
		+ cfqg->service_trees[wl][SYNC_WORKLOAD].count;
}

static inline int cfqg_busy_async_queues(struct cfq_data *cfqd,
					struct cfq_group *cfqg)
{
	return cfqg->service_trees[RT_WORKLOAD][ASYNC_WORKLOAD].count
		+ cfqg->service_trees[BE_WORKLOAD][ASYNC_WORKLOAD].count;
}

static void cfq_dispatch_insert(struct request_queue *, struct request *);
static struct cfq_queue *cfq_get_queue(struct cfq_data *, bool,
				       struct io_context *, gfp_t);
static struct cfq_io_context *cfq_cic_lookup(struct cfq_data *,
						struct io_context *);

static inline struct cfq_queue *cic_to_cfqq(struct cfq_io_context *cic,
					    bool is_sync)
{
	return cic->cfqq[is_sync];
}

static inline void cic_set_cfqq(struct cfq_io_context *cic,
				struct cfq_queue *cfqq, bool is_sync)
{
	cic->cfqq[is_sync] = cfqq;
}

#define CIC_DEAD_KEY	1ul
#define CIC_DEAD_INDEX_SHIFT	1

static inline void *cfqd_dead_key(struct cfq_data *cfqd)
{
	return (void *)(cfqd->cic_index << CIC_DEAD_INDEX_SHIFT | CIC_DEAD_KEY);
}

static inline struct cfq_data *cic_to_cfqd(struct cfq_io_context *cic)
{
	struct cfq_data *cfqd = cic->key;

	if (unlikely((unsigned long) cfqd & CIC_DEAD_KEY))
		return NULL;

	return cfqd;
}

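/*
 * Note: cic->key normally points at the owning cfq_data. When the cfqd goes
 * away the key is presumably replaced with cfqd_dead_key() above (cic_index
 * shifted up, low bit set), which is why cic_to_cfqd() can detect a dead key
 * simply by testing CIC_DEAD_KEY and return NULL instead of a stale pointer.
 */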
/*
 * We regard a request as SYNC if it's either a read or has the SYNC bit
 * set (in which case it could also be a direct WRITE).
 */
static inline bool cfq_bio_sync(struct bio *bio)
{
	return bio_data_dir(bio) == READ || (bio->bi_rw & REQ_SYNC);
}

/*
 * scheduler run of queue, if there are requests pending and no one in the
 * driver that will restart queueing
 */
static inline void cfq_schedule_dispatch(struct cfq_data *cfqd)
{
	if (cfqd->busy_queues) {
		cfq_log(cfqd, "schedule dispatch");
		kblockd_schedule_work(cfqd->queue, &cfqd->unplug_work);
	}
}

static int cfq_queue_empty(struct request_queue *q)
{
	struct cfq_data *cfqd = q->elevator->elevator_data;

	return !cfqd->rq_queued;
}

/*
 * Scale schedule slice based on io priority. Use the sync time slice only
 * if a queue is marked sync and has sync io queued. A sync queue with async
 * io only, should not get full sync slice length.
 */
static inline int cfq_prio_slice(struct cfq_data *cfqd, bool sync,
				 unsigned short prio)
{
	const int base_slice = cfqd->cfq_slice[sync];

	WARN_ON(prio >= IOPRIO_BE_NR);

	return base_slice + (base_slice/CFQ_SLICE_SCALE * (4 - prio));
}
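/*
 * Note: with CFQ_SLICE_SCALE == 5 each ioprio step moves the slice by
 * base_slice / 5, i.e. 20%: prio 0 gets 1.8x the base slice, prio 4 gets
 * exactly the base slice, and prio 7 gets 0.4x of it.
 */
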
static inline int
cfq_prio_to_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
{
	return cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio);
}

static inline u64 cfq_scale_slice(unsigned long delta, struct cfq_group *cfqg)
{
	u64 d = delta << CFQ_SERVICE_SHIFT;

	d = d * BLKIO_WEIGHT_DEFAULT;
	do_div(d, cfqg->weight);
	return d;
}
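/*
 * Note: cfq_scale_slice() turns a time slice into vdisktime by scaling it
 * with BLKIO_WEIGHT_DEFAULT / weight, so a heavier group accrues vdisktime
 * more slowly and is therefore picked more often by the group service tree.
 */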

static inline u64 max_vdisktime(u64 min_vdisktime, u64 vdisktime)
{
	s64 delta = (s64)(vdisktime - min_vdisktime);
	if (delta > 0)
		min_vdisktime = vdisktime;

	return min_vdisktime;
}

static inline u64 min_vdisktime(u64 min_vdisktime, u64 vdisktime)
{
	s64 delta = (s64)(vdisktime - min_vdisktime);
	if (delta < 0)
		min_vdisktime = vdisktime;

	return min_vdisktime;
}
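/*
 * Note: the two helpers above compare vdisktimes through a signed 64-bit
 * delta, which keeps the comparison correct even if the unsigned values
 * eventually wrap around.
 */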

static void update_min_vdisktime(struct cfq_rb_root *st)
{
	u64 vdisktime = st->min_vdisktime;
	struct cfq_group *cfqg;

	if (st->left) {
		cfqg = rb_entry_cfqg(st->left);
		vdisktime = min_vdisktime(vdisktime, cfqg->vdisktime);
	}

	st->min_vdisktime = max_vdisktime(st->min_vdisktime, vdisktime);
}

/*
 * get averaged number of queues of RT/BE priority.
 * average is updated, with a formula that gives more weight to higher numbers,
 * to quickly follow sudden increases and decrease slowly
 */

static inline unsigned cfq_group_get_avg_queues(struct cfq_data *cfqd,
					struct cfq_group *cfqg, bool rt)
{
	unsigned min_q, max_q;
	unsigned mult  = cfq_hist_divisor - 1;
	unsigned round = cfq_hist_divisor / 2;
	unsigned busy = cfq_group_busy_queues_wl(rt, cfqd, cfqg);

	min_q = min(cfqg->busy_queues_avg[rt], busy);
	max_q = max(cfqg->busy_queues_avg[rt], busy);
	cfqg->busy_queues_avg[rt] = (mult * max_q + min_q + round) /
		cfq_hist_divisor;
	return cfqg->busy_queues_avg[rt];
}

static inline unsigned
cfq_group_slice(struct cfq_data *cfqd, struct cfq_group *cfqg)
{
	struct cfq_rb_root *st = &cfqd->grp_service_tree;

	return cfq_target_latency * cfqg->weight / st->total_weight;
}

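/*
 * Note: a group's slice is its weight-proportional share of the overall
 * 300ms target latency: cfq_target_latency * weight / total busy weight.
 */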
static inline unsigned
cfq_scaled_cfqq_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
{
	unsigned slice = cfq_prio_to_slice(cfqd, cfqq);
	if (cfqd->cfq_latency) {
		/*
		 * interested queues (we consider only the ones with the same
		 * priority class in the cfq group)
		 */
		unsigned iq = cfq_group_get_avg_queues(cfqd, cfqq->cfqg,
						cfq_class_rt(cfqq));
		unsigned sync_slice = cfqd->cfq_slice[1];
		unsigned expect_latency = sync_slice * iq;
		unsigned group_slice = cfq_group_slice(cfqd, cfqq->cfqg);

		if (expect_latency > group_slice) {
			unsigned base_low_slice = 2 * cfqd->cfq_slice_idle;
			/* scale low_slice according to IO priority
			 * and sync vs async */
			unsigned low_slice =
				min(slice, base_low_slice * slice / sync_slice);
			/* the adapted slice value is scaled to fit all iqs
			 * into the target latency */
			slice = max(slice * group_slice / expect_latency,
				    low_slice);
		}
	}
	return slice;
}

static inline void
cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
{
	unsigned slice = cfq_scaled_cfqq_slice(cfqd, cfqq);

	cfqq->slice_start = jiffies;
	cfqq->slice_end = jiffies + slice;
	cfqq->allocated_slice = slice;
	cfq_log_cfqq(cfqd, cfqq, "set_slice=%lu", cfqq->slice_end - jiffies);
}

/*
 * We need to wrap this check in cfq_cfqq_slice_new(), since ->slice_end
 * isn't valid until the first request from the dispatch is activated
 * and the slice time set.
 */
static inline bool cfq_slice_used(struct cfq_queue *cfqq)
{
	if (cfq_cfqq_slice_new(cfqq))
		return false;
	if (time_before(jiffies, cfqq->slice_end))
		return false;

	return true;
}

/*
 *  Lifted from AS - choose which of rq1 and rq2 is best served now.
 * We choose the request that is closest to the head right now. Distance
 * behind the head is penalized and only allowed to a certain extent.
 */
static struct request *
cfq_choose_req(struct cfq_data *cfqd, struct request *rq1, struct request *rq2, sector_t last)
{
	sector_t s1, s2, d1 = 0, d2 = 0;
	unsigned long back_max;
#define CFQ_RQ1_WRAP	0x01 /* request 1 wraps */
#define CFQ_RQ2_WRAP	0x02 /* request 2 wraps */
	unsigned wrap = 0; /* bit mask: requests behind the disk head? */

	if (rq1 == NULL || rq1 == rq2)
		return rq2;
	if (rq2 == NULL)
		return rq1;

	if (rq_is_sync(rq1) && !rq_is_sync(rq2))
		return rq1;
	else if (rq_is_sync(rq2) && !rq_is_sync(rq1))
		return rq2;
	if ((rq1->cmd_flags & REQ_META) && !(rq2->cmd_flags & REQ_META))
		return rq1;
	else if ((rq2->cmd_flags & REQ_META) &&
		 !(rq1->cmd_flags & REQ_META))
		return rq2;

	s1 = blk_rq_pos(rq1);
	s2 = blk_rq_pos(rq2);

	/*
	 * by definition, 1KiB is 2 sectors
	 */
	back_max = cfqd->cfq_back_max * 2;

	/*
	 * Strict one way elevator _except_ in the case where we allow
	 * short backward seeks which are biased as twice the cost of a
	 * similar forward seek.
	 */
	if (s1 >= last)
		d1 = s1 - last;
	else if (s1 + back_max >= last)
		d1 = (last - s1) * cfqd->cfq_back_penalty;
	else
702
		wrap |= CFQ_RQ1_WRAP;
L
Linus Torvalds 已提交
703 704 705 706 707 708

	if (s2 >= last)
		d2 = s2 - last;
	else if (s2 + back_max >= last)
		d2 = (last - s2) * cfqd->cfq_back_penalty;
	else
709
		wrap |= CFQ_RQ2_WRAP;
L
Linus Torvalds 已提交
710 711

	/* Found required data */
712 713 714 715 716 717

	/*
	 * By doing switch() on the bit mask "wrap" we avoid having to
	 * check two variables for all permutations: --> faster!
	 */
	switch (wrap) {
J
	case 0: /* common case for CFQ: rq1 and rq2 not wrapped */
		if (d1 < d2)
			return rq1;
		else if (d2 < d1)
			return rq2;
		else {
			if (s1 >= s2)
				return rq1;
			else
				return rq2;
		}

	case CFQ_RQ2_WRAP:
		return rq1;
	case CFQ_RQ1_WRAP:
		return rq2;
	case (CFQ_RQ1_WRAP|CFQ_RQ2_WRAP): /* both rqs wrapped */
		/*
		 * Since both rqs are wrapped,
		 * start with the one that's further behind head
		 * (--> only *one* back seek required),
		 * since back seek takes more time than forward.
		 */
		if (s1 <= s2)
J
			return rq1;
		else
			return rq2;
	}
}

/*
 * The below is leftmost cache rbtree addon
 */
static struct cfq_queue *cfq_rb_first(struct cfq_rb_root *root)
{
	/* Service tree is empty */
	if (!root->count)
		return NULL;

	if (!root->left)
		root->left = rb_first(&root->rb);

	if (root->left)
		return rb_entry(root->left, struct cfq_queue, rb_node);

	return NULL;
}

static struct cfq_group *cfq_rb_first_group(struct cfq_rb_root *root)
{
	if (!root->left)
		root->left = rb_first(&root->rb);

	if (root->left)
		return rb_entry_cfqg(root->left);

	return NULL;
}

static void rb_erase_init(struct rb_node *n, struct rb_root *root)
{
	rb_erase(n, root);
	RB_CLEAR_NODE(n);
}

static void cfq_rb_erase(struct rb_node *n, struct cfq_rb_root *root)
{
	if (root->left == n)
		root->left = NULL;
	rb_erase_init(n, &root->rb);
	--root->count;
}

/*
 * would be nice to take fifo expire time into account as well
 */
static struct request *
cfq_find_next_rq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
		  struct request *last)
{
	struct rb_node *rbnext = rb_next(&last->rb_node);
	struct rb_node *rbprev = rb_prev(&last->rb_node);
	struct request *next = NULL, *prev = NULL;

	BUG_ON(RB_EMPTY_NODE(&last->rb_node));

	if (rbprev)
		prev = rb_entry_rq(rbprev);

	if (rbnext)
		next = rb_entry_rq(rbnext);
	else {
		rbnext = rb_first(&cfqq->sort_list);
		if (rbnext && rbnext != &last->rb_node)
			next = rb_entry_rq(rbnext);
	}

	return cfq_choose_req(cfqd, next, prev, blk_rq_pos(last));
}

static unsigned long cfq_slice_offset(struct cfq_data *cfqd,
				      struct cfq_queue *cfqq)
{
	/*
	 * just an approximation, should be ok.
	 */
	return (cfqq->cfqg->nr_cfqq - 1) * (cfq_prio_slice(cfqd, 1, 0) -
		       cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio));
}

static inline s64
cfqg_key(struct cfq_rb_root *st, struct cfq_group *cfqg)
{
	return cfqg->vdisktime - st->min_vdisktime;
}

static void
__cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg)
{
	struct rb_node **node = &st->rb.rb_node;
	struct rb_node *parent = NULL;
	struct cfq_group *__cfqg;
	s64 key = cfqg_key(st, cfqg);
	int left = 1;

	while (*node != NULL) {
		parent = *node;
		__cfqg = rb_entry_cfqg(parent);

		if (key < cfqg_key(st, __cfqg))
			node = &parent->rb_left;
		else {
			node = &parent->rb_right;
			left = 0;
		}
	}

	if (left)
		st->left = &cfqg->rb_node;

	rb_link_node(&cfqg->rb_node, parent, node);
	rb_insert_color(&cfqg->rb_node, &st->rb);
}
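/*
 * Note: groups are keyed on the service tree by vdisktime relative to
 * st->min_vdisktime, and st->left caches the leftmost (smallest key) node so
 * the next group to serve is found without walking down the tree.
 */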

static void
cfq_group_service_tree_add(struct cfq_data *cfqd, struct cfq_group *cfqg)
{
	struct cfq_rb_root *st = &cfqd->grp_service_tree;
	struct cfq_group *__cfqg;
	struct rb_node *n;

	cfqg->nr_cfqq++;
	if (!RB_EMPTY_NODE(&cfqg->rb_node))
		return;

	/*
	 * Currently put the group at the end. Later implement something
	 * so that groups get a lesser vtime based on their weights, so that
	 * a group does not lose everything if it was not continuously backlogged.
	 */
	n = rb_last(&st->rb);
	if (n) {
		__cfqg = rb_entry_cfqg(n);
		cfqg->vdisktime = __cfqg->vdisktime + CFQ_IDLE_DELAY;
	} else
		cfqg->vdisktime = st->min_vdisktime;

	__cfq_group_service_tree_add(st, cfqg);
	st->total_weight += cfqg->weight;
}

static void
cfq_group_service_tree_del(struct cfq_data *cfqd, struct cfq_group *cfqg)
{
	struct cfq_rb_root *st = &cfqd->grp_service_tree;

	BUG_ON(cfqg->nr_cfqq < 1);
	cfqg->nr_cfqq--;
	/* If there are other cfq queues under this group, don't delete it */
	if (cfqg->nr_cfqq)
		return;

	cfq_log_cfqg(cfqd, cfqg, "del_from_rr group");
	st->total_weight -= cfqg->weight;
	if (!RB_EMPTY_NODE(&cfqg->rb_node))
		cfq_rb_erase(&cfqg->rb_node, st);
	cfqg->saved_workload_slice = 0;
	cfq_blkiocg_update_dequeue_stats(&cfqg->blkg, 1);
}

static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq)
{
	unsigned int slice_used;

	/*
	 * Queue got expired before even a single request completed or
	 * got expired immediately after first request completion.
	 */
	if (!cfqq->slice_start || cfqq->slice_start == jiffies) {
		/*
		 * Also charge the seek time incurred to the group, otherwise
		 * if there are multiple queues in the group, each can dispatch
		 * a single request on seeky media and cause lots of seek time
		 * and group will never know it.
		 */
		slice_used = max_t(unsigned, (jiffies - cfqq->dispatch_start),
					1);
	} else {
		slice_used = jiffies - cfqq->slice_start;
		if (slice_used > cfqq->allocated_slice)
			slice_used = cfqq->allocated_slice;
	}

	return slice_used;
}

static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
				struct cfq_queue *cfqq)
{
	struct cfq_rb_root *st = &cfqd->grp_service_tree;
	unsigned int used_sl, charge;
	int nr_sync = cfqg->nr_cfqq - cfqg_busy_async_queues(cfqd, cfqg)
			- cfqg->service_tree_idle.count;

	BUG_ON(nr_sync < 0);
	used_sl = charge = cfq_cfqq_slice_usage(cfqq);

	if (iops_mode(cfqd))
		charge = cfqq->slice_dispatch;
	else if (!cfq_cfqq_sync(cfqq) && !nr_sync)
		charge = cfqq->allocated_slice;

	/* Can't update vdisktime while group is on service tree */
	cfq_rb_erase(&cfqg->rb_node, st);
	cfqg->vdisktime += cfq_scale_slice(charge, cfqg);
	__cfq_group_service_tree_add(st, cfqg);

	/* This group is being expired. Save the context */
	if (time_after(cfqd->workload_expires, jiffies)) {
		cfqg->saved_workload_slice = cfqd->workload_expires
						- jiffies;
		cfqg->saved_workload = cfqd->serving_type;
		cfqg->saved_serving_prio = cfqd->serving_prio;
	} else
		cfqg->saved_workload_slice = 0;

	cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu", cfqg->vdisktime,
					st->min_vdisktime);
	cfq_log_cfqq(cfqq->cfqd, cfqq, "sl_used=%u disp=%u charge=%u iops=%u"
			" sect=%u", used_sl, cfqq->slice_dispatch, charge,
			iops_mode(cfqd), cfqq->nr_sectors);
	cfq_blkiocg_update_timeslice_used(&cfqg->blkg, used_sl);
	cfq_blkiocg_set_start_empty_time(&cfqg->blkg);
}

#ifdef CONFIG_CFQ_GROUP_IOSCHED
static inline struct cfq_group *cfqg_of_blkg(struct blkio_group *blkg)
{
	if (blkg)
		return container_of(blkg, struct cfq_group, blkg);
	return NULL;
}

void cfq_update_blkio_group_weight(void *key, struct blkio_group *blkg,
					unsigned int weight)
{
	cfqg_of_blkg(blkg)->weight = weight;
}

static struct cfq_group *
cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
{
	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
	struct cfq_group *cfqg = NULL;
	void *key = cfqd;
	int i, j;
	struct cfq_rb_root *st;
	struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
	unsigned int major, minor;

	cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key));
	if (cfqg && !cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) {
		sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
		cfqg->blkg.dev = MKDEV(major, minor);
		goto done;
	}
	if (cfqg || !create)
		goto done;

	cfqg = kzalloc_node(sizeof(*cfqg), GFP_ATOMIC, cfqd->queue->node);
	if (!cfqg)
		goto done;

	for_each_cfqg_st(cfqg, i, j, st)
		*st = CFQ_RB_ROOT;
	RB_CLEAR_NODE(&cfqg->rb_node);

	/*
	 * Take the initial reference that will be released on destroy
	 * This can be thought of a joint reference by cgroup and
	 * elevator which will be dropped by either elevator exit
	 * or cgroup deletion path depending on who is exiting first.
	 */
	cfqg->ref = 1;

	/*
	 * Add group onto cgroup list. It might happen that bdi->dev is
	 * not initialized yet. Initialize this new group without major
	 * and minor info and this info will be filled in once a new thread
	 * comes for IO. See code above.
	 */
	if (bdi->dev) {
		sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
		cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd,
					MKDEV(major, minor));
	} else
		cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd,
					0);

	cfqg->weight = blkcg_get_weight(blkcg, cfqg->blkg.dev);

	/* Add group on cfqd list */
	hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list);

done:
	return cfqg;
}

/*
 * Search for the cfq group current task belongs to. If create = 1, then also
 * create the cfq group if it does not exist. request_queue lock must be held.
 */
static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd, int create)
{
	struct cgroup *cgroup;
	struct cfq_group *cfqg = NULL;

	rcu_read_lock();
	cgroup = task_cgroup(current, blkio_subsys_id);
	cfqg = cfq_find_alloc_cfqg(cfqd, cgroup, create);
	if (!cfqg && create)
		cfqg = &cfqd->root_group;
	rcu_read_unlock();
	return cfqg;
}

1066 1067
static inline struct cfq_group *cfq_ref_get_cfqg(struct cfq_group *cfqg)
{
1068
	cfqg->ref++;
1069 1070 1071
	return cfqg;
}

1072 1073 1074 1075 1076 1077 1078
static void cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg)
{
	/* Currently, all async queues are mapped to root group */
	if (!cfq_cfqq_sync(cfqq))
		cfqg = &cfqq->cfqd->root_group;

	cfqq->cfqg = cfqg;
1079
	/* cfqq reference on cfqg */
1080
	cfqq->cfqg->ref++;
1081 1082 1083 1084 1085 1086 1087
}

static void cfq_put_cfqg(struct cfq_group *cfqg)
{
	struct cfq_rb_root *st;
	int i, j;

1088 1089 1090
	BUG_ON(cfqg->ref <= 0);
	cfqg->ref--;
	if (cfqg->ref)
1091 1092
		return;
	for_each_cfqg_st(cfqg, i, j, st)
G
Gui Jianfeng 已提交
1093
		BUG_ON(!RB_EMPTY_ROOT(&st->rb));
1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121
	kfree(cfqg);
}

static void cfq_destroy_cfqg(struct cfq_data *cfqd, struct cfq_group *cfqg)
{
	/* Something wrong if we are trying to remove same group twice */
	BUG_ON(hlist_unhashed(&cfqg->cfqd_node));

	hlist_del_init(&cfqg->cfqd_node);

	/*
	 * Put the reference taken at the time of creation so that when all
	 * queues are gone, group can be destroyed.
	 */
	cfq_put_cfqg(cfqg);
}

static void cfq_release_cfq_groups(struct cfq_data *cfqd)
{
	struct hlist_node *pos, *n;
	struct cfq_group *cfqg;

	hlist_for_each_entry_safe(cfqg, pos, n, &cfqd->cfqg_list, cfqd_node) {
		/*
		 * If cgroup removal path got to blk_group first and removed
		 * it from cgroup list, then it will take care of destroying
		 * cfqg also.
		 */
		if (!cfq_blkiocg_del_blkio_group(&cfqg->blkg))
			cfq_destroy_cfqg(cfqd, cfqg);
	}
}

/*
 * Blk cgroup controller notification saying that blkio_group object is being
 * delinked as associated cgroup object is going away. That also means that
 * no new IO will come in this group. So get rid of this group as soon as
 * any pending IO in the group is finished.
 *
 * This function is called under rcu_read_lock(). key is the rcu protected
 * pointer. That means "key" is a valid cfq_data pointer as long as we are rcu
 * read lock.
 *
 * "key" was fetched from blkio_group under blkio_cgroup->lock. That means
 * it should not be NULL as even if elevator was exiting, cgroup deletion
 * path got to it first.
 */
void cfq_unlink_blkio_group(void *key, struct blkio_group *blkg)
{
	unsigned long  flags;
	struct cfq_data *cfqd = key;

	spin_lock_irqsave(cfqd->queue->queue_lock, flags);
	cfq_destroy_cfqg(cfqd, cfqg_of_blkg(blkg));
	spin_unlock_irqrestore(cfqd->queue->queue_lock, flags);
}

#else /* GROUP_IOSCHED */
static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd, int create)
{
	return &cfqd->root_group;
}

static inline struct cfq_group *cfq_ref_get_cfqg(struct cfq_group *cfqg)
{
	return cfqg;
}

static inline void
cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg) {
	cfqq->cfqg = cfqg;
}

static void cfq_release_cfq_groups(struct cfq_data *cfqd) {}
static inline void cfq_put_cfqg(struct cfq_group *cfqg) {}

#endif /* GROUP_IOSCHED */

1172
/*
1173
 * The cfqd->service_trees holds all pending cfq_queue's that have
1174 1175 1176
 * requests waiting to be processed. It is sorted in the order that
 * we will service the queues.
 */
1177
static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
1178
				 bool add_front)
1179
{
1180 1181
	struct rb_node **p, *parent;
	struct cfq_queue *__cfqq;
1182
	unsigned long rb_key;
1183
	struct cfq_rb_root *service_tree;
1184
	int left;
1185
	int new_cfqq = 1;
1186 1187
	int group_changed = 0;

1188
	service_tree = service_tree_for(cfqq->cfqg, cfqq_prio(cfqq),
1189
						cfqq_type(cfqq));
1190 1191
	if (cfq_class_idle(cfqq)) {
		rb_key = CFQ_IDLE_DELAY;
1192
		parent = rb_last(&service_tree->rb);
1193 1194 1195 1196 1197 1198
		if (parent && parent != &cfqq->rb_node) {
			__cfqq = rb_entry(parent, struct cfq_queue, rb_node);
			rb_key += __cfqq->rb_key;
		} else
			rb_key += jiffies;
	} else if (!add_front) {
1199 1200 1201 1202 1203 1204
		/*
		 * Get our rb key offset. Subtract any residual slice
		 * value carried from last service. A negative resid
		 * count indicates slice overrun, and this should position
		 * the next service time further away in the tree.
		 */
1205
		rb_key = cfq_slice_offset(cfqd, cfqq) + jiffies;
1206
		rb_key -= cfqq->slice_resid;
1207
		cfqq->slice_resid = 0;
1208 1209
	} else {
		rb_key = -HZ;
1210
		__cfqq = cfq_rb_first(service_tree);
1211 1212
		rb_key += __cfqq ? __cfqq->rb_key : jiffies;
	}
L
Linus Torvalds 已提交
1213

1214
	if (!RB_EMPTY_NODE(&cfqq->rb_node)) {
1215
		new_cfqq = 0;
1216
		/*
1217
		 * same position, nothing more to do
1218
		 */
1219 1220
		if (rb_key == cfqq->rb_key &&
		    cfqq->service_tree == service_tree)
1221
			return;
L
Linus Torvalds 已提交
1222

1223 1224
		cfq_rb_erase(&cfqq->rb_node, cfqq->service_tree);
		cfqq->service_tree = NULL;
L
Linus Torvalds 已提交
1225
	}
1226

1227
	left = 1;
1228
	parent = NULL;
1229 1230
	cfqq->service_tree = service_tree;
	p = &service_tree->rb.rb_node;
1231
	while (*p) {
1232
		struct rb_node **n;
1233

1234 1235 1236
		parent = *p;
		__cfqq = rb_entry(parent, struct cfq_queue, rb_node);

1237
		/*
1238
		 * sort by key, that represents service time.
1239
		 */
1240
		if (time_before(rb_key, __cfqq->rb_key))
1241
			n = &(*p)->rb_left;
1242
		else {
1243
			n = &(*p)->rb_right;
1244
			left = 0;
1245
		}
1246 1247

		p = n;
1248 1249
	}

1250
	if (left)
1251
		service_tree->left = &cfqq->rb_node;
1252

1253 1254
	cfqq->rb_key = rb_key;
	rb_link_node(&cfqq->rb_node, parent, p);
1255 1256
	rb_insert_color(&cfqq->rb_node, &service_tree->rb);
	service_tree->count++;
1257
	if ((add_front || !new_cfqq) && !group_changed)
1258
		return;
1259
	cfq_group_service_tree_add(cfqd, cfqq->cfqg);
L
Linus Torvalds 已提交
1260 1261
}

1262
static struct cfq_queue *
1263 1264 1265
cfq_prio_tree_lookup(struct cfq_data *cfqd, struct rb_root *root,
		     sector_t sector, struct rb_node **ret_parent,
		     struct rb_node ***rb_link)
1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281
{
	struct rb_node **p, *parent;
	struct cfq_queue *cfqq = NULL;

	parent = NULL;
	p = &root->rb_node;
	while (*p) {
		struct rb_node **n;

		parent = *p;
		cfqq = rb_entry(parent, struct cfq_queue, p_node);

		/*
		 * Sort strictly based on sector.  Smallest to the left,
		 * largest to the right.
		 */
1282
		if (sector > blk_rq_pos(cfqq->next_rq))
1283
			n = &(*p)->rb_right;
1284
		else if (sector < blk_rq_pos(cfqq->next_rq))
1285 1286 1287 1288
			n = &(*p)->rb_left;
		else
			break;
		p = n;
1289
		cfqq = NULL;
1290 1291 1292 1293 1294
	}

	*ret_parent = parent;
	if (rb_link)
		*rb_link = p;
1295
	return cfqq;
1296 1297 1298 1299 1300 1301 1302
}

static void cfq_prio_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq)
{
	struct rb_node **p, *parent;
	struct cfq_queue *__cfqq;

1303 1304 1305 1306
	if (cfqq->p_root) {
		rb_erase(&cfqq->p_node, cfqq->p_root);
		cfqq->p_root = NULL;
	}
1307 1308 1309 1310 1311 1312

	if (cfq_class_idle(cfqq))
		return;
	if (!cfqq->next_rq)
		return;

1313
	cfqq->p_root = &cfqd->prio_trees[cfqq->org_ioprio];
1314 1315
	__cfqq = cfq_prio_tree_lookup(cfqd, cfqq->p_root,
				      blk_rq_pos(cfqq->next_rq), &parent, &p);
1316 1317
	if (!__cfqq) {
		rb_link_node(&cfqq->p_node, parent, p);
1318 1319 1320
		rb_insert_color(&cfqq->p_node, cfqq->p_root);
	} else
		cfqq->p_root = NULL;
1321 1322
}

1323 1324 1325
/*
 * Update cfqq's position in the service tree.
 */
1326
static void cfq_resort_rr_list(struct cfq_data *cfqd, struct cfq_queue *cfqq)
J
Jens Axboe 已提交
1327 1328 1329 1330
{
	/*
	 * Resorting requires the cfqq to be on the RR list already.
	 */
1331
	if (cfq_cfqq_on_rr(cfqq)) {
1332
		cfq_service_tree_add(cfqd, cfqq, 0);
1333 1334
		cfq_prio_tree_add(cfqd, cfqq);
	}
J
Jens Axboe 已提交
1335 1336
}

L
Linus Torvalds 已提交
1337 1338
/*
 * add to busy list of queues for service, trying to be fair in ordering
1339
 * the pending list according to last request service
L
Linus Torvalds 已提交
1340
 */
J
Jens Axboe 已提交
1341
static void cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
L
Linus Torvalds 已提交
1342
{
1343
	cfq_log_cfqq(cfqd, cfqq, "add_to_rr");
J
Jens Axboe 已提交
1344 1345
	BUG_ON(cfq_cfqq_on_rr(cfqq));
	cfq_mark_cfqq_on_rr(cfqq);
L
Linus Torvalds 已提交
1346 1347
	cfqd->busy_queues++;

1348
	cfq_resort_rr_list(cfqd, cfqq);
L
Linus Torvalds 已提交
1349 1350
}

1351 1352 1353 1354
/*
 * Called when the cfqq no longer has requests pending, remove it from
 * the service tree.
 */
J
Jens Axboe 已提交
1355
static void cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
L
Linus Torvalds 已提交
1356
{
1357
	cfq_log_cfqq(cfqd, cfqq, "del_from_rr");
J
Jens Axboe 已提交
1358 1359
	BUG_ON(!cfq_cfqq_on_rr(cfqq));
	cfq_clear_cfqq_on_rr(cfqq);
L
Linus Torvalds 已提交
1360

1361 1362 1363 1364
	if (!RB_EMPTY_NODE(&cfqq->rb_node)) {
		cfq_rb_erase(&cfqq->rb_node, cfqq->service_tree);
		cfqq->service_tree = NULL;
	}
1365 1366 1367 1368
	if (cfqq->p_root) {
		rb_erase(&cfqq->p_node, cfqq->p_root);
		cfqq->p_root = NULL;
	}
1369

1370
	cfq_group_service_tree_del(cfqd, cfqq->cfqg);
L
Linus Torvalds 已提交
1371 1372 1373 1374 1375 1376 1377
	BUG_ON(!cfqd->busy_queues);
	cfqd->busy_queues--;
}

/*
 * rb tree support functions
 */
J
Jens Axboe 已提交
1378
static void cfq_del_rq_rb(struct request *rq)
L
Linus Torvalds 已提交
1379
{
J
Jens Axboe 已提交
1380 1381
	struct cfq_queue *cfqq = RQ_CFQQ(rq);
	const int sync = rq_is_sync(rq);
L
Linus Torvalds 已提交
1382

1383 1384
	BUG_ON(!cfqq->queued[sync]);
	cfqq->queued[sync]--;
L
Linus Torvalds 已提交
1385

J
Jens Axboe 已提交
1386
	elv_rb_del(&cfqq->sort_list, rq);
L
Linus Torvalds 已提交
1387

1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398
	if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list)) {
		/*
		 * Queue will be deleted from service tree when we actually
		 * expire it later. Right now just remove it from prio tree
		 * as it is empty.
		 */
		if (cfqq->p_root) {
			rb_erase(&cfqq->p_node, cfqq->p_root);
			cfqq->p_root = NULL;
		}
	}
L
Linus Torvalds 已提交
1399 1400
}

J
Jens Axboe 已提交
1401
static void cfq_add_rq_rb(struct request *rq)
L
Linus Torvalds 已提交
1402
{
J
Jens Axboe 已提交
1403
	struct cfq_queue *cfqq = RQ_CFQQ(rq);
L
Linus Torvalds 已提交
1404
	struct cfq_data *cfqd = cfqq->cfqd;
1405
	struct request *__alias, *prev;
L
Linus Torvalds 已提交
1406

1407
	cfqq->queued[rq_is_sync(rq)]++;
L
Linus Torvalds 已提交
1408 1409 1410 1411 1412

	/*
	 * looks a little odd, but the first insert might return an alias.
	 * if that happens, put the alias on the dispatch list
	 */
1413
	while ((__alias = elv_rb_add(&cfqq->sort_list, rq)) != NULL)
J
Jens Axboe 已提交
1414
		cfq_dispatch_insert(cfqd->queue, __alias);
1415 1416 1417

	if (!cfq_cfqq_on_rr(cfqq))
		cfq_add_cfqq_rr(cfqd, cfqq);
1418 1419 1420 1421

	/*
	 * check if this request is a better next-serve candidate
	 */
1422
	prev = cfqq->next_rq;
1423
	cfqq->next_rq = cfq_choose_req(cfqd, cfqq->next_rq, rq, cfqd->last_position);
1424 1425 1426 1427 1428 1429 1430

	/*
	 * adjust priority tree position, if ->next_rq changes
	 */
	if (prev != cfqq->next_rq)
		cfq_prio_tree_add(cfqd, cfqq);

1431
	BUG_ON(!cfqq->next_rq);
L
Linus Torvalds 已提交
1432 1433
}

J
Jens Axboe 已提交
1434
static void cfq_reposition_rq_rb(struct cfq_queue *cfqq, struct request *rq)
L
Linus Torvalds 已提交
1435
{
1436 1437
	elv_rb_del(&cfqq->sort_list, rq);
	cfqq->queued[rq_is_sync(rq)]--;
1438 1439
	cfq_blkiocg_update_io_remove_stats(&(RQ_CFQG(rq))->blkg,
					rq_data_dir(rq), rq_is_sync(rq));
J
Jens Axboe 已提交
1440
	cfq_add_rq_rb(rq);
1441
	cfq_blkiocg_update_io_add_stats(&(RQ_CFQG(rq))->blkg,
1442 1443
			&cfqq->cfqd->serving_group->blkg, rq_data_dir(rq),
			rq_is_sync(rq));
L
Linus Torvalds 已提交
1444 1445
}

1446 1447
static struct request *
cfq_find_rq_fmerge(struct cfq_data *cfqd, struct bio *bio)
L
Linus Torvalds 已提交
1448
{
1449
	struct task_struct *tsk = current;
1450
	struct cfq_io_context *cic;
1451
	struct cfq_queue *cfqq;
L
Linus Torvalds 已提交
1452

1453
	cic = cfq_cic_lookup(cfqd, tsk->io_context);
1454 1455 1456 1457
	if (!cic)
		return NULL;

	cfqq = cic_to_cfqq(cic, cfq_bio_sync(bio));
1458 1459 1460
	if (cfqq) {
		sector_t sector = bio->bi_sector + bio_sectors(bio);

1461
		return elv_rb_find(&cfqq->sort_list, sector);
1462
	}
L
Linus Torvalds 已提交
1463 1464 1465 1466

	return NULL;
}

1467
static void cfq_activate_request(struct request_queue *q, struct request *rq)
L
Linus Torvalds 已提交
1468
{
1469
	struct cfq_data *cfqd = q->elevator->elevator_data;
J
Jens Axboe 已提交
1470

1471
	cfqd->rq_in_driver++;
1472
	cfq_log_cfqq(cfqd, RQ_CFQQ(rq), "activate rq, drv=%d",
1473
						cfqd->rq_in_driver);
1474

1475
	cfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq);
L
Linus Torvalds 已提交
1476 1477
}

1478
static void cfq_deactivate_request(struct request_queue *q, struct request *rq)
L
Linus Torvalds 已提交
1479
{
1480 1481
	struct cfq_data *cfqd = q->elevator->elevator_data;

1482 1483
	WARN_ON(!cfqd->rq_in_driver);
	cfqd->rq_in_driver--;
1484
	cfq_log_cfqq(cfqd, RQ_CFQQ(rq), "deactivate rq, drv=%d",
1485
						cfqd->rq_in_driver);
L
Linus Torvalds 已提交
1486 1487
}

1488
static void cfq_remove_request(struct request *rq)
L
Linus Torvalds 已提交
1489
{
J
Jens Axboe 已提交
1490
	struct cfq_queue *cfqq = RQ_CFQQ(rq);
1491

J
Jens Axboe 已提交
1492 1493
	if (cfqq->next_rq == rq)
		cfqq->next_rq = cfq_find_next_rq(cfqq->cfqd, cfqq, rq);
L
Linus Torvalds 已提交
1494

1495
	list_del_init(&rq->queuelist);
J
Jens Axboe 已提交
1496
	cfq_del_rq_rb(rq);
1497

1498
	cfqq->cfqd->rq_queued--;
1499 1500
	cfq_blkiocg_update_io_remove_stats(&(RQ_CFQG(rq))->blkg,
					rq_data_dir(rq), rq_is_sync(rq));
1501
	if (rq->cmd_flags & REQ_META) {
1502 1503 1504
		WARN_ON(!cfqq->meta_pending);
		cfqq->meta_pending--;
	}
L
Linus Torvalds 已提交
1505 1506
}

1507 1508
static int cfq_merge(struct request_queue *q, struct request **req,
		     struct bio *bio)
L
Linus Torvalds 已提交
1509 1510 1511 1512
{
	struct cfq_data *cfqd = q->elevator->elevator_data;
	struct request *__rq;

1513
	__rq = cfq_find_rq_fmerge(cfqd, bio);
1514
	if (__rq && elv_rq_merge_ok(__rq, bio)) {
1515 1516
		*req = __rq;
		return ELEVATOR_FRONT_MERGE;
L
Linus Torvalds 已提交
1517 1518 1519 1520 1521
	}

	return ELEVATOR_NO_MERGE;
}

1522
static void cfq_merged_request(struct request_queue *q, struct request *req,
1523
			       int type)
L
Linus Torvalds 已提交
1524
{
1525
	if (type == ELEVATOR_FRONT_MERGE) {
J
Jens Axboe 已提交
1526
		struct cfq_queue *cfqq = RQ_CFQQ(req);
L
Linus Torvalds 已提交
1527

J
Jens Axboe 已提交
1528
		cfq_reposition_rq_rb(cfqq, req);
L
Linus Torvalds 已提交
1529 1530 1531
	}
}

D
Divyesh Shah 已提交
1532 1533 1534
static void cfq_bio_merged(struct request_queue *q, struct request *req,
				struct bio *bio)
{
1535 1536
	cfq_blkiocg_update_io_merged_stats(&(RQ_CFQG(req))->blkg,
					bio_data_dir(bio), cfq_bio_sync(bio));
D
Divyesh Shah 已提交
1537 1538
}

L
Linus Torvalds 已提交
1539
static void
1540
cfq_merged_requests(struct request_queue *q, struct request *rq,
L
Linus Torvalds 已提交
1541 1542
		    struct request *next)
{
1543
	struct cfq_queue *cfqq = RQ_CFQQ(rq);
1544 1545 1546 1547
	/*
	 * reposition in fifo if next is older than rq
	 */
	if (!list_empty(&rq->queuelist) && !list_empty(&next->queuelist) &&
1548
	    time_before(rq_fifo_time(next), rq_fifo_time(rq))) {
1549
		list_move(&rq->queuelist, &next->queuelist);
1550 1551
		rq_set_fifo_time(rq, rq_fifo_time(next));
	}
1552

1553 1554
	if (cfqq->next_rq == next)
		cfqq->next_rq = rq;
1555
	cfq_remove_request(next);
1556 1557
	cfq_blkiocg_update_io_merged_stats(&(RQ_CFQG(rq))->blkg,
					rq_data_dir(next), rq_is_sync(next));
1558 1559
}

1560
static int cfq_allow_merge(struct request_queue *q, struct request *rq,
1561 1562 1563
			   struct bio *bio)
{
	struct cfq_data *cfqd = q->elevator->elevator_data;
1564
	struct cfq_io_context *cic;
1565 1566 1567
	struct cfq_queue *cfqq;

	/*
1568
	 * Disallow merge of a sync bio into an async request.
1569
	 */
1570
	if (cfq_bio_sync(bio) && !rq_is_sync(rq))
1571
		return false;
1572 1573

	/*
1574 1575
	 * Lookup the cfqq that this bio will be queued with. Allow
	 * merge only if rq is queued there.
1576
	 */
1577
	cic = cfq_cic_lookup(cfqd, current->io_context);
1578
	if (!cic)
1579
		return false;
1580

1581
	cfqq = cic_to_cfqq(cic, cfq_bio_sync(bio));
1582
	return cfqq == RQ_CFQQ(rq);
1583 1584
}

1585 1586 1587
static inline void cfq_del_timer(struct cfq_data *cfqd, struct cfq_queue *cfqq)
{
	del_timer(&cfqd->idle_slice_timer);
1588
	cfq_blkiocg_update_idle_time_stats(&cfqq->cfqg->blkg);
1589 1590
}

J
Jens Axboe 已提交
1591 1592
static void __cfq_set_active_queue(struct cfq_data *cfqd,
				   struct cfq_queue *cfqq)
1593 1594
{
	if (cfqq) {
1595 1596
		cfq_log_cfqq(cfqd, cfqq, "set_active wl_prio:%d wl_type:%d",
				cfqd->serving_prio, cfqd->serving_type);
1597
		cfq_blkiocg_update_avg_queue_size_stats(&cfqq->cfqg->blkg);
1598 1599
		cfqq->slice_start = 0;
		cfqq->dispatch_start = jiffies;
1600
		cfqq->allocated_slice = 0;
1601
		cfqq->slice_end = 0;
1602
		cfqq->slice_dispatch = 0;
1603
		cfqq->nr_sectors = 0;
1604 1605

		cfq_clear_cfqq_wait_request(cfqq);
1606
		cfq_clear_cfqq_must_dispatch(cfqq);
J
Jens Axboe 已提交
1607 1608
		cfq_clear_cfqq_must_alloc_slice(cfqq);
		cfq_clear_cfqq_fifo_expire(cfqq);
1609
		cfq_mark_cfqq_slice_new(cfqq);
1610

1611
		cfq_del_timer(cfqd, cfqq);
1612 1613 1614 1615 1616
	}

	cfqd->active_queue = cfqq;
}

1617 1618 1619 1620 1621
/*
 * current cfqq expired its slice (or was too idle), select new one
 */
static void
__cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq,
1622
		    bool timed_out)
1623
{
1624 1625
	cfq_log_cfqq(cfqd, cfqq, "slice expired t=%d", timed_out);

1626
	if (cfq_cfqq_wait_request(cfqq))
1627
		cfq_del_timer(cfqd, cfqq);
1628 1629

	cfq_clear_cfqq_wait_request(cfqq);
1630
	cfq_clear_cfqq_wait_busy(cfqq);
1631

1632 1633 1634 1635 1636 1637 1638 1639 1640
	/*
	 * If this cfqq is shared between multiple processes, check to
	 * make sure that those processes are still issuing I/Os within
	 * the mean seek distance.  If not, it may be time to break the
	 * queues apart again.
	 */
	if (cfq_cfqq_coop(cfqq) && CFQQ_SEEKY(cfqq))
		cfq_mark_cfqq_split_coop(cfqq);

1641
	/*
1642
	 * store what was left of this slice, if the queue idled/timed out
1643
	 */
1644 1645
	if (timed_out) {
		if (cfq_cfqq_slice_new(cfqq))
1646
			cfqq->slice_resid = cfq_scaled_cfqq_slice(cfqd, cfqq);
1647 1648
		else
			cfqq->slice_resid = cfqq->slice_end - jiffies;
1649 1650
		cfq_log_cfqq(cfqd, cfqq, "resid=%ld", cfqq->slice_resid);
	}
1651

1652
	cfq_group_served(cfqd, cfqq->cfqg, cfqq);
1653

1654 1655 1656
	if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list))
		cfq_del_cfqq_rr(cfqd, cfqq);

1657
	cfq_resort_rr_list(cfqd, cfqq);
1658 1659 1660 1661 1662 1663 1664 1665 1666 1667

	if (cfqq == cfqd->active_queue)
		cfqd->active_queue = NULL;

	if (cfqd->active_cic) {
		put_io_context(cfqd->active_cic->ioc);
		cfqd->active_cic = NULL;
	}
}

1668
static inline void cfq_slice_expired(struct cfq_data *cfqd, bool timed_out)
1669 1670 1671 1672
{
	struct cfq_queue *cfqq = cfqd->active_queue;

	if (cfqq)
1673
		__cfq_slice_expired(cfqd, cfqq, timed_out);
1674 1675
}

1676 1677 1678 1679
/*
 * Get next queue for service. Unless we have a queue preemption,
 * we'll simply select the first cfqq in the service tree.
 */
J
Jens Axboe 已提交
1680
static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd)
1681
{
1682
	struct cfq_rb_root *service_tree =
1683
		service_tree_for(cfqd->serving_group, cfqd->serving_prio,
1684
					cfqd->serving_type);
1685

1686 1687 1688
	if (!cfqd->rq_queued)
		return NULL;

1689 1690 1691
	/* There is nothing to dispatch */
	if (!service_tree)
		return NULL;
1692 1693 1694
	if (RB_EMPTY_ROOT(&service_tree->rb))
		return NULL;
	return cfq_rb_first(service_tree);
J
Jens Axboe 已提交
1695 1696
}

1697 1698
static struct cfq_queue *cfq_get_next_queue_forced(struct cfq_data *cfqd)
{
1699
	struct cfq_group *cfqg;
1700 1701 1702 1703 1704 1705 1706
	struct cfq_queue *cfqq;
	int i, j;
	struct cfq_rb_root *st;

	if (!cfqd->rq_queued)
		return NULL;

1707 1708 1709 1710
	cfqg = cfq_get_next_cfqg(cfqd);
	if (!cfqg)
		return NULL;

1711 1712 1713 1714 1715 1716
	for_each_cfqg_st(cfqg, i, j, st)
		if ((cfqq = cfq_rb_first(st)) != NULL)
			return cfqq;
	return NULL;
}

1717 1718 1719
/*
 * Get and set a new active queue for service.
 */
1720 1721
static struct cfq_queue *cfq_set_active_queue(struct cfq_data *cfqd,
					      struct cfq_queue *cfqq)
J
Jens Axboe 已提交
1722
{
1723
	if (!cfqq)
1724
		cfqq = cfq_get_next_queue(cfqd);
J
Jens Axboe 已提交
1725

1726
	__cfq_set_active_queue(cfqd, cfqq);
J
Jens Axboe 已提交
1727
	return cfqq;
1728 1729
}

1730 1731 1732
static inline sector_t cfq_dist_from_last(struct cfq_data *cfqd,
					  struct request *rq)
{
1733 1734
	if (blk_rq_pos(rq) >= cfqd->last_position)
		return blk_rq_pos(rq) - cfqd->last_position;
1735
	else
1736
		return cfqd->last_position - blk_rq_pos(rq);
1737 1738
}

1739
static inline int cfq_rq_close(struct cfq_data *cfqd, struct cfq_queue *cfqq,
1740
			       struct request *rq)
J
Jens Axboe 已提交
1741
{
1742
	return cfq_dist_from_last(cfqd, rq) <= CFQQ_CLOSE_THR;
J
Jens Axboe 已提交
1743 1744
}

1745 1746 1747
static struct cfq_queue *cfqq_close(struct cfq_data *cfqd,
				    struct cfq_queue *cur_cfqq)
{
1748
	struct rb_root *root = &cfqd->prio_trees[cur_cfqq->org_ioprio];
1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759
	struct rb_node *parent, *node;
	struct cfq_queue *__cfqq;
	sector_t sector = cfqd->last_position;

	if (RB_EMPTY_ROOT(root))
		return NULL;

	/*
	 * First, if we find a request starting at the end of the last
	 * request, choose it.
	 */
1760
	__cfqq = cfq_prio_tree_lookup(cfqd, root, sector, &parent, NULL);
1761 1762 1763 1764 1765 1766 1767 1768
	if (__cfqq)
		return __cfqq;

	/*
	 * If the exact sector wasn't found, the parent of the NULL leaf
	 * will contain the closest sector.
	 */
	__cfqq = rb_entry(parent, struct cfq_queue, p_node);
1769
	if (cfq_rq_close(cfqd, cur_cfqq, __cfqq->next_rq))
		return __cfqq;

	if (blk_rq_pos(__cfqq->next_rq) < sector)
		node = rb_next(&__cfqq->p_node);
	else
		node = rb_prev(&__cfqq->p_node);
	if (!node)
		return NULL;

	__cfqq = rb_entry(node, struct cfq_queue, p_node);
	if (cfq_rq_close(cfqd, cur_cfqq, __cfqq->next_rq))
		return __cfqq;

	return NULL;
}

/*
 * cfqd - obvious
 * cur_cfqq - passed in so that we don't decide that the current queue is
 * 	      closely cooperating with itself.
 *
 * So, basically we're assuming that cur_cfqq has dispatched at least
 * one request, and that cfqd->last_position reflects a position on the disk
 * associated with the I/O issued by cur_cfqq.  I'm not sure this is a valid
 * assumption.
 */
static struct cfq_queue *cfq_close_cooperator(struct cfq_data *cfqd,
					      struct cfq_queue *cur_cfqq)
{
	struct cfq_queue *cfqq;

	if (cfq_class_idle(cur_cfqq))
		return NULL;
	if (!cfq_cfqq_sync(cur_cfqq))
		return NULL;
	if (CFQQ_SEEKY(cur_cfqq))
		return NULL;

	/*
	 * Don't search priority tree if it's the only queue in the group.
	 */
	if (cur_cfqq->cfqg->nr_cfqq == 1)
		return NULL;

	/*
	 * We should notice if some of the queues are cooperating, e.g.
	 * working closely on the same area of the disk. In that case,
	 * we can group them together and don't waste time idling.
	 */
	cfqq = cfqq_close(cfqd, cur_cfqq);
	if (!cfqq)
		return NULL;

	/* If new queue belongs to different cfq_group, don't choose it */
	if (cur_cfqq->cfqg != cfqq->cfqg)
		return NULL;

	/*
	 * It only makes sense to merge sync queues.
	 */
	if (!cfq_cfqq_sync(cfqq))
		return NULL;
	if (CFQQ_SEEKY(cfqq))
		return NULL;

	/*
	 * Do not merge queues of different priority classes
	 */
	if (cfq_class_rt(cfqq) != cfq_class_rt(cur_cfqq))
		return NULL;

	return cfqq;
}

/*
 * Determine whether we should enforce idle window for this queue.
 */

static bool cfq_should_idle(struct cfq_data *cfqd, struct cfq_queue *cfqq)
{
	enum wl_prio_t prio = cfqq_prio(cfqq);
	struct cfq_rb_root *service_tree = cfqq->service_tree;

	BUG_ON(!service_tree);
	BUG_ON(!service_tree->count);

	if (!cfqd->cfq_slice_idle)
		return false;

	/* We never do for idle class queues. */
	if (prio == IDLE_WORKLOAD)
		return false;

	/* We do for queues that were marked with idle window flag. */
	if (cfq_cfqq_idle_window(cfqq) &&
	   !(blk_queue_nonrot(cfqd->queue) && cfqd->hw_tag))
		return true;

	/*
	 * Otherwise, we do only if they are the last ones
	 * in their service tree.
	 */
	if (service_tree->count == 1 && cfq_cfqq_sync(cfqq))
		return true;
	cfq_log_cfqq(cfqd, cfqq, "Not idling. st->count:%d",
			service_tree->count);
	return false;
}

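/*
 * Arm the idle timer for the active queue (or, if queue idling is
 * disabled, for its group), giving a sync task a short window to issue
 * its next request before we move on to another queue.
 */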
static void cfq_arm_slice_timer(struct cfq_data *cfqd)
{
	struct cfq_queue *cfqq = cfqd->active_queue;
	struct cfq_io_context *cic;
	unsigned long sl, group_idle = 0;

	/*
	 * SSD device without seek penalty, disable idling. But only do so
	 * for devices that support queuing, otherwise we still have a problem
	 * with sync vs async workloads.
	 */
	if (blk_queue_nonrot(cfqd->queue) && cfqd->hw_tag)
		return;

	WARN_ON(!RB_EMPTY_ROOT(&cfqq->sort_list));
	WARN_ON(cfq_cfqq_slice_new(cfqq));

	/*
	 * idle is disabled, either manually or by past process history
	 */
	if (!cfq_should_idle(cfqd, cfqq)) {
		/* no queue idling. Check for group idling */
		if (cfqd->cfq_group_idle)
			group_idle = cfqd->cfq_group_idle;
		else
			return;
	}

	/*
	 * still active requests from this queue, don't idle
	 */
	if (cfqq->dispatched)
		return;

	/*
	 * task has exited, don't wait
	 */
	cic = cfqd->active_cic;
	if (!cic || !atomic_read(&cic->ioc->nr_tasks))
		return;

	/*
	 * If our average think time is larger than the remaining time
	 * slice, then don't idle. This avoids overrunning the allotted
	 * time slice.
	 */
	if (sample_valid(cic->ttime_samples) &&
	    (cfqq->slice_end - jiffies < cic->ttime_mean)) {
		cfq_log_cfqq(cfqd, cfqq, "Not idling. think_time:%d",
				cic->ttime_mean);
		return;
	}

	/* There are other queues in the group, don't do group idle */
	if (group_idle && cfqq->cfqg->nr_cfqq > 1)
		return;

	cfq_mark_cfqq_wait_request(cfqq);

	if (group_idle)
		sl = cfqd->cfq_group_idle;
	else
		sl = cfqd->cfq_slice_idle;

	mod_timer(&cfqd->idle_slice_timer, jiffies + sl);
	cfq_blkiocg_update_set_idle_time_stats(&cfqq->cfqg->blkg);
	cfq_log_cfqq(cfqd, cfqq, "arm_idle: %lu group_idle: %d", sl,
			group_idle ? 1 : 0);
}

/*
 * Move request from internal lists to the request queue dispatch list.
 */
static void cfq_dispatch_insert(struct request_queue *q, struct request *rq)
{
	struct cfq_data *cfqd = q->elevator->elevator_data;
	struct cfq_queue *cfqq = RQ_CFQQ(rq);

	cfq_log_cfqq(cfqd, cfqq, "dispatch_insert");

	cfqq->next_rq = cfq_find_next_rq(cfqd, cfqq, rq);
	cfq_remove_request(rq);
	cfqq->dispatched++;
	(RQ_CFQG(rq))->dispatched++;
	elv_dispatch_sort(q, rq);

	cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]++;
	cfqq->nr_sectors += blk_rq_sectors(rq);
	cfq_blkiocg_update_dispatch_stats(&cfqq->cfqg->blkg, blk_rq_bytes(rq),
					rq_data_dir(rq), rq_is_sync(rq));
}

/*
 * return expired entry, or NULL to just start from scratch in rbtree
 */
static struct request *cfq_check_fifo(struct cfq_queue *cfqq)
{
	struct request *rq = NULL;

	if (cfq_cfqq_fifo_expire(cfqq))
		return NULL;

	cfq_mark_cfqq_fifo_expire(cfqq);

	if (list_empty(&cfqq->fifo))
		return NULL;

	rq = rq_entry_fifo(cfqq->fifo.next);
	if (time_before(jiffies, rq_fifo_time(rq)))
		rq = NULL;

	cfq_log_cfqq(cfqq->cfqd, cfqq, "fifo=%p", rq);
	return rq;
}

static inline int
cfq_prio_to_maxrq(struct cfq_data *cfqd, struct cfq_queue *cfqq)
{
	const int base_rq = cfqd->cfq_slice_async_rq;

	WARN_ON(cfqq->ioprio >= IOPRIO_BE_NR);

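	/*
	 * For example, assuming cfq_slice_async_rq is left at its usual
	 * default of 2, an ioprio 4 queue may dispatch up to
	 * 2 * (2 + 2 * (8 - 1 - 4)) = 16 requests in one slice.
	 */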
	return 2 * (base_rq + base_rq * (CFQ_PRIO_LISTS - 1 - cfqq->ioprio));
}

/*
 * Must be called with the queue_lock held.
 */
static int cfqq_process_refs(struct cfq_queue *cfqq)
{
	int process_refs, io_refs;

	io_refs = cfqq->allocated[READ] + cfqq->allocated[WRITE];
	process_refs = cfqq->ref - io_refs;
	BUG_ON(process_refs < 0);
	return process_refs;
}

static void cfq_setup_merge(struct cfq_queue *cfqq, struct cfq_queue *new_cfqq)
{
	int process_refs, new_process_refs;
	struct cfq_queue *__cfqq;

	/*
	 * If there are no process references on the new_cfqq, then it is
	 * unsafe to follow the ->new_cfqq chain as other cfqq's in the
	 * chain may have dropped their last reference (not just their
	 * last process reference).
	 */
	if (!cfqq_process_refs(new_cfqq))
		return;

	/* Avoid a circular list and skip interim queue merges */
	while ((__cfqq = new_cfqq->new_cfqq)) {
		if (__cfqq == cfqq)
			return;
		new_cfqq = __cfqq;
	}

	process_refs = cfqq_process_refs(cfqq);
	new_process_refs = cfqq_process_refs(new_cfqq);
	/*
	 * If the process for the cfqq has gone away, there is no
	 * sense in merging the queues.
	 */
	if (process_refs == 0 || new_process_refs == 0)
		return;

	/*
	 * Merge in the direction of the lesser amount of work.
	 */
	if (new_process_refs >= process_refs) {
		cfqq->new_cfqq = new_cfqq;
		new_cfqq->ref += process_refs;
	} else {
		new_cfqq->new_cfqq = cfqq;
		cfqq->ref += new_process_refs;
	}
}

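/*
 * Among the workload types of the given priority class, pick the one
 * whose first queue has the lowest rb_key, i.e. the one that has been
 * waiting longest for service.
 */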
static enum wl_type_t cfq_choose_wl(struct cfq_data *cfqd,
				struct cfq_group *cfqg, enum wl_prio_t prio)
{
	struct cfq_queue *queue;
	int i;
	bool key_valid = false;
	unsigned long lowest_key = 0;
	enum wl_type_t cur_best = SYNC_NOIDLE_WORKLOAD;

	for (i = 0; i <= SYNC_WORKLOAD; ++i) {
		/* select the one with lowest rb_key */
		queue = cfq_rb_first(service_tree_for(cfqg, prio, i));
		if (queue &&
		    (!key_valid || time_before(queue->rb_key, lowest_key))) {
			lowest_key = queue->rb_key;
			cur_best = i;
			key_valid = true;
		}
	}

	return cur_best;
}

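/*
 * Decide which priority class and workload type this group serves next,
 * and how long that workload keeps the disk before we rotate again.
 */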
static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg)
{
	unsigned slice;
	unsigned count;
	struct cfq_rb_root *st;
	unsigned group_slice;
	enum wl_prio_t original_prio = cfqd->serving_prio;

	/* Choose next priority. RT > BE > IDLE */
	if (cfq_group_busy_queues_wl(RT_WORKLOAD, cfqd, cfqg))
		cfqd->serving_prio = RT_WORKLOAD;
	else if (cfq_group_busy_queues_wl(BE_WORKLOAD, cfqd, cfqg))
		cfqd->serving_prio = BE_WORKLOAD;
	else {
		cfqd->serving_prio = IDLE_WORKLOAD;
		cfqd->workload_expires = jiffies + 1;
		return;
	}

	if (original_prio != cfqd->serving_prio)
		goto new_workload;

	/*
	 * For RT and BE, we have to choose also the type
	 * (SYNC, SYNC_NOIDLE, ASYNC), and to compute a workload
	 * expiration time
	 */
	st = service_tree_for(cfqg, cfqd->serving_prio, cfqd->serving_type);
	count = st->count;

	/*
	 * check workload expiration, and that we still have other queues ready
	 */
	if (count && !time_after(jiffies, cfqd->workload_expires))
		return;

new_workload:
	/* otherwise select new workload type */
	cfqd->serving_type =
		cfq_choose_wl(cfqd, cfqg, cfqd->serving_prio);
	st = service_tree_for(cfqg, cfqd->serving_prio, cfqd->serving_type);
	count = st->count;

	/*
	 * the workload slice is computed as a fraction of target latency
	 * proportional to the number of queues in that workload, over
	 * all the queues in the same priority class
	 */
	group_slice = cfq_group_slice(cfqd, cfqg);

	slice = group_slice * count /
		max_t(unsigned, cfqg->busy_queues_avg[cfqd->serving_prio],
		      cfq_group_busy_queues_wl(cfqd->serving_prio, cfqd, cfqg));
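	/*
	 * As a worked example with assumed numbers: a 120ms group slice
	 * and 3 busy queues in this workload out of 6 in the class gives
	 * a workload slice of 120 * 3 / 6 = 60ms.
	 */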

	if (cfqd->serving_type == ASYNC_WORKLOAD) {
		unsigned int tmp;

		/*
		 * Async queues are currently system wide. Just taking
		 * proportion of queues within same group will lead to higher
		 * async ratio system wide as generally root group is going
		 * to have higher weight. A more accurate thing would be to
		 * calculate system wide async/sync ratio.
		 */
		tmp = cfq_target_latency * cfqg_busy_async_queues(cfqd, cfqg);
		tmp = tmp/cfqd->busy_queues;
		slice = min_t(unsigned, slice, tmp);

		/* async workload slice is scaled down according to
		 * the sync/async slice ratio. */
		slice = slice * cfqd->cfq_slice[0] / cfqd->cfq_slice[1];
	} else
		/* sync workload slice is at least 2 * cfq_slice_idle */
		slice = max(slice, 2 * cfqd->cfq_slice_idle);

	slice = max_t(unsigned, slice, CFQ_MIN_TT);
	cfq_log(cfqd, "workload slice:%d", slice);
	cfqd->workload_expires = jiffies + slice;
}

static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd)
{
	struct cfq_rb_root *st = &cfqd->grp_service_tree;
	struct cfq_group *cfqg;

	if (RB_EMPTY_ROOT(&st->rb))
		return NULL;
	cfqg = cfq_rb_first_group(st);
	update_min_vdisktime(st);
	return cfqg;
}

static void cfq_choose_cfqg(struct cfq_data *cfqd)
{
	struct cfq_group *cfqg = cfq_get_next_cfqg(cfqd);

	cfqd->serving_group = cfqg;

	/* Restore the workload type data */
	if (cfqg->saved_workload_slice) {
		cfqd->workload_expires = jiffies + cfqg->saved_workload_slice;
		cfqd->serving_type = cfqg->saved_workload;
		cfqd->serving_prio = cfqg->saved_serving_prio;
	} else
		cfqd->workload_expires = jiffies - 1;

	choose_service_tree(cfqd, cfqg);
}

/*
 * Select a queue for service. If we have a current active queue,
 * check whether to continue servicing it, or retrieve and set a new one.
 */
static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)
{
	struct cfq_queue *cfqq, *new_cfqq = NULL;

	cfqq = cfqd->active_queue;
	if (!cfqq)
		goto new_queue;

	if (!cfqd->rq_queued)
		return NULL;

	/*
	 * We were waiting for group to get backlogged. Expire the queue
	 */
	if (cfq_cfqq_wait_busy(cfqq) && !RB_EMPTY_ROOT(&cfqq->sort_list))
		goto expire;

	/*
	 * The active queue has run out of time, expire it and select new.
	 */
	if (cfq_slice_used(cfqq) && !cfq_cfqq_must_dispatch(cfqq)) {
		/*
		 * If slice had not expired at the completion of last request
		 * we might not have turned on wait_busy flag. Don't expire
		 * the queue yet. Allow the group to get backlogged.
		 *
		 * The very fact that we have used the slice, that means we
		 * have been idling all along on this queue and it should be
		 * ok to wait for this request to complete.
		 */
		if (cfqq->cfqg->nr_cfqq == 1 && RB_EMPTY_ROOT(&cfqq->sort_list)
		    && cfqq->dispatched && cfq_should_idle(cfqd, cfqq)) {
			cfqq = NULL;
			goto keep_queue;
		} else
			goto check_group_idle;
	}

	/*
	 * The active queue has requests and isn't expired, allow it to
	 * dispatch.
	 */
	if (!RB_EMPTY_ROOT(&cfqq->sort_list))
		goto keep_queue;

	/*
	 * If another queue has a request waiting within our mean seek
	 * distance, let it run.  The expire code will check for close
	 * cooperators and put the close queue at the front of the service
	 * tree.  If possible, merge the expiring queue with the new cfqq.
	 */
	new_cfqq = cfq_close_cooperator(cfqd, cfqq);
	if (new_cfqq) {
		if (!cfqq->new_cfqq)
			cfq_setup_merge(cfqq, new_cfqq);
		goto expire;
	}

	/*
	 * No requests pending. If the active queue still has requests in
	 * flight or is idling for a new request, allow either of these
	 * conditions to happen (or time out) before selecting a new queue.
	 */
	if (timer_pending(&cfqd->idle_slice_timer)) {
		cfqq = NULL;
		goto keep_queue;
	}

	/*
	 * This is a deep seek queue, but the device is much faster than
	 * the queue can deliver, don't idle
	 **/
	if (CFQQ_SEEKY(cfqq) && cfq_cfqq_idle_window(cfqq) &&
	    (cfq_cfqq_slice_new(cfqq) ||
	    (cfqq->slice_end - jiffies > jiffies - cfqq->slice_start))) {
		cfq_clear_cfqq_deep(cfqq);
		cfq_clear_cfqq_idle_window(cfqq);
	}

	if (cfqq->dispatched && cfq_should_idle(cfqd, cfqq)) {
		cfqq = NULL;
		goto keep_queue;
	}

	/*
	 * If group idle is enabled and there are requests dispatched from
	 * this group, wait for requests to complete.
	 */
check_group_idle:
	if (cfqd->cfq_group_idle && cfqq->cfqg->nr_cfqq == 1
	    && cfqq->cfqg->dispatched) {
		cfqq = NULL;
		goto keep_queue;
	}

expire:
	cfq_slice_expired(cfqd, 0);
new_queue:
	/*
	 * Current queue expired. Check if we have to switch to a new
	 * service tree
	 */
	if (!new_cfqq)
		cfq_choose_cfqg(cfqd);

	cfqq = cfq_set_active_queue(cfqd, new_cfqq);
keep_queue:
	return cfqq;
}

static int __cfq_forced_dispatch_cfqq(struct cfq_queue *cfqq)
2306 2307 2308 2309 2310 2311 2312 2313 2314
{
	int dispatched = 0;

	while (cfqq->next_rq) {
		cfq_dispatch_insert(cfqq->cfqd->queue, cfqq->next_rq);
		dispatched++;
	}

	BUG_ON(!list_empty(&cfqq->fifo));

	/* By default cfqq is not expired if it is empty. Do it explicitly */
	__cfq_slice_expired(cfqq->cfqd, cfqq, 0);
	return dispatched;
}

/*
 * Drain our current requests. Used for barriers and when switching
 * io schedulers on-the-fly.
 */
static int cfq_forced_dispatch(struct cfq_data *cfqd)
{
	struct cfq_queue *cfqq;
	int dispatched = 0;

	/* Expire the timeslice of the current active queue first */
	cfq_slice_expired(cfqd, 0);
	while ((cfqq = cfq_get_next_queue_forced(cfqd)) != NULL) {
		__cfq_set_active_queue(cfqd, cfqq);
		dispatched += __cfq_forced_dispatch_cfqq(cfqq);
	}

	BUG_ON(cfqd->busy_queues);

	cfq_log(cfqd, "forced_dispatch=%d", dispatched);
	return dispatched;
}

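/*
 * Estimate whether the remaining slice will already be consumed by the
 * requests this queue has in flight, allowing roughly one idle period
 * per dispatched request.
 */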
static inline bool cfq_slice_used_soon(struct cfq_data *cfqd,
	struct cfq_queue *cfqq)
{
	/* the queue hasn't finished any request, can't estimate */
	if (cfq_cfqq_slice_new(cfqq))
		return true;
	if (time_after(jiffies + cfqd->cfq_slice_idle * cfqq->dispatched,
		cfqq->slice_end))
		return true;

	return false;
}

static bool cfq_may_dispatch(struct cfq_data *cfqd, struct cfq_queue *cfqq)
{
	unsigned int max_dispatch;

	/*
	 * Drain async requests before we start sync IO
	 */
	if (cfq_should_idle(cfqd, cfqq) && cfqd->rq_in_flight[BLK_RW_ASYNC])
		return false;

	/*
	 * If this is an async queue and we have sync IO in flight, let it wait
	 */
	if (cfqd->rq_in_flight[BLK_RW_SYNC] && !cfq_cfqq_sync(cfqq))
		return false;

	max_dispatch = max_t(unsigned int, cfqd->cfq_quantum / 2, 1);
	if (cfq_class_idle(cfqq))
		max_dispatch = 1;

	/*
	 * Does this cfqq already have too much IO in flight?
	 */
	if (cfqq->dispatched >= max_dispatch) {
		/*
		 * idle queue must always only have a single IO in flight
		 */
		if (cfq_class_idle(cfqq))
			return false;

		/*
		 * We have other queues, don't allow more IO from this one
		 */
		if (cfqd->busy_queues > 1 && cfq_slice_used_soon(cfqd, cfqq))
			return false;

		/*
		 * Sole queue user, no limit
		 */
		if (cfqd->busy_queues == 1)
			max_dispatch = -1;
		else
			/*
			 * Normally we start throttling cfqq when cfq_quantum/2
			 * requests have been dispatched. But we can drive
			 * deeper queue depths at the beginning of slice
			 * subjected to upper limit of cfq_quantum.
			 * */
			max_dispatch = cfqd->cfq_quantum;
	}

	/*
	 * Async queues must wait a bit before being allowed dispatch.
	 * We also ramp up the dispatch depth gradually for async IO,
	 * based on the last sync IO we serviced
	 */
	if (!cfq_cfqq_sync(cfqq) && cfqd->cfq_latency) {
		unsigned long last_sync = jiffies - cfqd->last_delayed_sync;
		unsigned int depth;

		depth = last_sync / cfqd->cfq_slice[1];
		if (!depth && !cfqq->dispatched)
			depth = 1;
		if (depth < max_dispatch)
			max_dispatch = depth;
	}

	/*
	 * If we're below the current max, allow a dispatch
	 */
	return cfqq->dispatched < max_dispatch;
}

/*
 * Dispatch a request from cfqq, moving them to the request queue
 * dispatch list.
 */
static bool cfq_dispatch_request(struct cfq_data *cfqd, struct cfq_queue *cfqq)
{
	struct request *rq;

	BUG_ON(RB_EMPTY_ROOT(&cfqq->sort_list));

	if (!cfq_may_dispatch(cfqd, cfqq))
		return false;

	/*
	 * follow expired path, else get first next available
	 */
	rq = cfq_check_fifo(cfqq);
	if (!rq)
		rq = cfqq->next_rq;

	/*
	 * insert request into driver dispatch list
	 */
	cfq_dispatch_insert(cfqd->queue, rq);

	if (!cfqd->active_cic) {
		struct cfq_io_context *cic = RQ_CIC(rq);

		atomic_long_inc(&cic->ioc->refcount);
		cfqd->active_cic = cic;
	}

	return true;
}

/*
 * Find the cfqq that we need to service and move a request from that to the
 * dispatch list
 */
static int cfq_dispatch_requests(struct request_queue *q, int force)
{
	struct cfq_data *cfqd = q->elevator->elevator_data;
	struct cfq_queue *cfqq;

	if (!cfqd->busy_queues)
		return 0;

	if (unlikely(force))
		return cfq_forced_dispatch(cfqd);

	cfqq = cfq_select_queue(cfqd);
	if (!cfqq)
		return 0;

	/*
	 * Dispatch a request from this cfqq, if it is allowed
	 */
	if (!cfq_dispatch_request(cfqd, cfqq))
		return 0;

	cfqq->slice_dispatch++;
	cfq_clear_cfqq_must_dispatch(cfqq);

	/*
	 * expire an async queue immediately if it has used up its slice. idle
	 * queue always expire after 1 dispatch round.
	 */
	if (cfqd->busy_queues > 1 && ((!cfq_cfqq_sync(cfqq) &&
	    cfqq->slice_dispatch >= cfq_prio_to_maxrq(cfqd, cfqq)) ||
	    cfq_class_idle(cfqq))) {
		cfqq->slice_end = jiffies + 1;
		cfq_slice_expired(cfqd, 0);
	}

	cfq_log_cfqq(cfqd, cfqq, "dispatched a request");
	return 1;
}

/*
 * task holds one reference to the queue, dropped when task exits. each rq
 * in-flight on this queue also holds a reference, dropped when rq is freed.
 *
 * Each cfq queue took a reference on the parent group. Drop it now.
 * queue lock must be held here.
 */
static void cfq_put_queue(struct cfq_queue *cfqq)
{
	struct cfq_data *cfqd = cfqq->cfqd;
	struct cfq_group *cfqg;

	BUG_ON(cfqq->ref <= 0);

	cfqq->ref--;
	if (cfqq->ref)
		return;

	cfq_log_cfqq(cfqd, cfqq, "put_queue");
	BUG_ON(rb_first(&cfqq->sort_list));
	BUG_ON(cfqq->allocated[READ] + cfqq->allocated[WRITE]);
	cfqg = cfqq->cfqg;

	if (unlikely(cfqd->active_queue == cfqq)) {
		__cfq_slice_expired(cfqd, cfqq, 0);
		cfq_schedule_dispatch(cfqd);
	}

	BUG_ON(cfq_cfqq_on_rr(cfqq));
	kmem_cache_free(cfq_pool, cfqq);
	cfq_put_cfqg(cfqg);
}

/*
 * Must always be called with the rcu_read_lock() held
 */
static void
__call_for_each_cic(struct io_context *ioc,
		    void (*func)(struct io_context *, struct cfq_io_context *))
{
	struct cfq_io_context *cic;
	struct hlist_node *n;

	hlist_for_each_entry_rcu(cic, n, &ioc->cic_list, cic_list)
		func(ioc, cic);
}

/*
 * Call func for each cic attached to this ioc.
 */
static void
call_for_each_cic(struct io_context *ioc,
		  void (*func)(struct io_context *, struct cfq_io_context *))
{
	rcu_read_lock();
	__call_for_each_cic(ioc, func);
	rcu_read_unlock();
}

static void cfq_cic_free_rcu(struct rcu_head *head)
{
	struct cfq_io_context *cic;

	cic = container_of(head, struct cfq_io_context, rcu_head);

	kmem_cache_free(cfq_ioc_pool, cic);
	elv_ioc_count_dec(cfq_ioc_count);

	if (ioc_gone) {
		/*
		 * CFQ scheduler is exiting, grab exit lock and check
		 * the pending io context count. If it hits zero,
		 * complete ioc_gone and set it back to NULL
		 */
		spin_lock(&ioc_gone_lock);
		if (ioc_gone && !elv_ioc_count_read(cfq_ioc_count)) {
			complete(ioc_gone);
			ioc_gone = NULL;
		}
		spin_unlock(&ioc_gone_lock);
	}
}

static void cfq_cic_free(struct cfq_io_context *cic)
{
	call_rcu(&cic->rcu_head, cfq_cic_free_rcu);
}

static void cic_free_func(struct io_context *ioc, struct cfq_io_context *cic)
{
	unsigned long flags;
	unsigned long dead_key = (unsigned long) cic->key;

	BUG_ON(!(dead_key & CIC_DEAD_KEY));

	spin_lock_irqsave(&ioc->lock, flags);
	radix_tree_delete(&ioc->radix_root, dead_key >> CIC_DEAD_INDEX_SHIFT);
	hlist_del_rcu(&cic->cic_list);
	spin_unlock_irqrestore(&ioc->lock, flags);

	cfq_cic_free(cic);
}

/*
 * Must be called with rcu_read_lock() held or preemption otherwise disabled.
 * Only two callers of this - ->dtor() which is called with the rcu_read_lock(),
 * and ->trim() which is called with the task lock held
 */
static void cfq_free_io_context(struct io_context *ioc)
{
	/*
	 * ioc->refcount is zero here, or we are called from elv_unregister(),
	 * so no more cic's are allowed to be linked into this ioc.  So it
	 * should be ok to iterate over the known list, we will see all cic's
	 * since no new ones are added.
	 */
	__call_for_each_cic(ioc, cic_free_func);
}

static void cfq_put_cooperator(struct cfq_queue *cfqq)
{
	struct cfq_queue *__cfqq, *next;

	/*
	 * If this queue was scheduled to merge with another queue, be
	 * sure to drop the reference taken on that queue (and others in
	 * the merge chain).  See cfq_setup_merge and cfq_merge_cfqqs.
	 */
	__cfqq = cfqq->new_cfqq;
	while (__cfqq) {
		if (__cfqq == cfqq) {
			WARN(1, "cfqq->new_cfqq loop detected\n");
			break;
		}
		next = __cfqq->new_cfqq;
		cfq_put_queue(__cfqq);
		__cfqq = next;
	}
}

static void cfq_exit_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq)
{
	if (unlikely(cfqq == cfqd->active_queue)) {
		__cfq_slice_expired(cfqd, cfqq, 0);
		cfq_schedule_dispatch(cfqd);
	}

	cfq_put_cooperator(cfqq);

	cfq_put_queue(cfqq);
}

static void __cfq_exit_single_io_context(struct cfq_data *cfqd,
					 struct cfq_io_context *cic)
{
	struct io_context *ioc = cic->ioc;

	list_del_init(&cic->queue_list);

	/*
	 * Make sure dead mark is seen for dead queues
	 */
	smp_wmb();
	cic->key = cfqd_dead_key(cfqd);

	if (ioc->ioc_data == cic)
		rcu_assign_pointer(ioc->ioc_data, NULL);

	if (cic->cfqq[BLK_RW_ASYNC]) {
		cfq_exit_cfqq(cfqd, cic->cfqq[BLK_RW_ASYNC]);
		cic->cfqq[BLK_RW_ASYNC] = NULL;
	}

	if (cic->cfqq[BLK_RW_SYNC]) {
		cfq_exit_cfqq(cfqd, cic->cfqq[BLK_RW_SYNC]);
		cic->cfqq[BLK_RW_SYNC] = NULL;
	}
}

static void cfq_exit_single_io_context(struct io_context *ioc,
				       struct cfq_io_context *cic)
{
	struct cfq_data *cfqd = cic_to_cfqd(cic);

	if (cfqd) {
		struct request_queue *q = cfqd->queue;
		unsigned long flags;

		spin_lock_irqsave(q->queue_lock, flags);

		/*
		 * Ensure we get a fresh copy of the ->key to prevent
		 * race between exiting task and queue
		 */
		smp_read_barrier_depends();
		if (cic->key == cfqd)
			__cfq_exit_single_io_context(cfqd, cic);

		spin_unlock_irqrestore(q->queue_lock, flags);
	}
}

/*
 * The process that ioc belongs to has exited, we need to clean up
 * and put the internal structures we have that belongs to that process.
 */
static void cfq_exit_io_context(struct io_context *ioc)
{
	call_for_each_cic(ioc, cfq_exit_single_io_context);
}

static struct cfq_io_context *
cfq_alloc_io_context(struct cfq_data *cfqd, gfp_t gfp_mask)
{
	struct cfq_io_context *cic;

	cic = kmem_cache_alloc_node(cfq_ioc_pool, gfp_mask | __GFP_ZERO,
							cfqd->queue->node);
	if (cic) {
		cic->last_end_request = jiffies;
		INIT_LIST_HEAD(&cic->queue_list);
		INIT_HLIST_NODE(&cic->cic_list);
		cic->dtor = cfq_free_io_context;
		cic->exit = cfq_exit_io_context;
		elv_ioc_count_inc(cfq_ioc_count);
	}

	return cic;
}

static void cfq_init_prio_data(struct cfq_queue *cfqq, struct io_context *ioc)
{
	struct task_struct *tsk = current;
	int ioprio_class;

	if (!cfq_cfqq_prio_changed(cfqq))
		return;

	ioprio_class = IOPRIO_PRIO_CLASS(ioc->ioprio);
	switch (ioprio_class) {
	default:
		printk(KERN_ERR "cfq: bad prio %x\n", ioprio_class);
	case IOPRIO_CLASS_NONE:
		/*
		 * no prio set, inherit CPU scheduling settings
		 */
		cfqq->ioprio = task_nice_ioprio(tsk);
		cfqq->ioprio_class = task_nice_ioclass(tsk);
		break;
	case IOPRIO_CLASS_RT:
		cfqq->ioprio = task_ioprio(ioc);
		cfqq->ioprio_class = IOPRIO_CLASS_RT;
		break;
	case IOPRIO_CLASS_BE:
		cfqq->ioprio = task_ioprio(ioc);
		cfqq->ioprio_class = IOPRIO_CLASS_BE;
		break;
	case IOPRIO_CLASS_IDLE:
		cfqq->ioprio_class = IOPRIO_CLASS_IDLE;
		cfqq->ioprio = 7;
		cfq_clear_cfqq_idle_window(cfqq);
		break;
	}

	/*
	 * keep track of original prio settings in case we have to temporarily
	 * elevate the priority of this queue
	 */
	cfqq->org_ioprio = cfqq->ioprio;
	cfqq->org_ioprio_class = cfqq->ioprio_class;
	cfq_clear_cfqq_prio_changed(cfqq);
}

J
Jens Axboe 已提交
2780
static void changed_ioprio(struct io_context *ioc, struct cfq_io_context *cic)
2781
{
2782
	struct cfq_data *cfqd = cic_to_cfqd(cic);
2783
	struct cfq_queue *cfqq;
2784
	unsigned long flags;
2785

2786 2787 2788
	if (unlikely(!cfqd))
		return;

2789
	spin_lock_irqsave(cfqd->queue->queue_lock, flags);
2790

2791
	cfqq = cic->cfqq[BLK_RW_ASYNC];
2792 2793
	if (cfqq) {
		struct cfq_queue *new_cfqq;
2794 2795
		new_cfqq = cfq_get_queue(cfqd, BLK_RW_ASYNC, cic->ioc,
						GFP_ATOMIC);
2796
		if (new_cfqq) {
2797
			cic->cfqq[BLK_RW_ASYNC] = new_cfqq;
2798 2799
			cfq_put_queue(cfqq);
		}
2800
	}
2801

2802
	cfqq = cic->cfqq[BLK_RW_SYNC];
2803 2804 2805
	if (cfqq)
		cfq_mark_cfqq_prio_changed(cfqq);

2806
	spin_unlock_irqrestore(cfqd->queue->queue_lock, flags);
2807 2808
}

2809
static void cfq_ioc_set_ioprio(struct io_context *ioc)
2810
{
2811
	call_for_each_cic(ioc, changed_ioprio);
2812
	ioc->ioprio_changed = 0;
2813 2814
}

2815
static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
2816
			  pid_t pid, bool is_sync)
2817 2818 2819 2820 2821
{
	RB_CLEAR_NODE(&cfqq->rb_node);
	RB_CLEAR_NODE(&cfqq->p_node);
	INIT_LIST_HEAD(&cfqq->fifo);

2822
	cfqq->ref = 0;
2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834
	cfqq->cfqd = cfqd;

	cfq_mark_cfqq_prio_changed(cfqq);

	if (is_sync) {
		if (!cfq_class_idle(cfqq))
			cfq_mark_cfqq_idle_window(cfqq);
		cfq_mark_cfqq_sync(cfqq);
	}
	cfqq->pid = pid;
}

2835 2836 2837 2838
#ifdef CONFIG_CFQ_GROUP_IOSCHED
static void changed_cgroup(struct io_context *ioc, struct cfq_io_context *cic)
{
	struct cfq_queue *sync_cfqq = cic_to_cfqq(cic, 1);
2839
	struct cfq_data *cfqd = cic_to_cfqd(cic);
2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869
	unsigned long flags;
	struct request_queue *q;

	if (unlikely(!cfqd))
		return;

	q = cfqd->queue;

	spin_lock_irqsave(q->queue_lock, flags);

	if (sync_cfqq) {
		/*
		 * Drop reference to sync queue. A new sync queue will be
		 * assigned in new group upon arrival of a fresh request.
		 */
		cfq_log_cfqq(cfqd, sync_cfqq, "changed cgroup");
		cic_set_cfqq(cic, NULL, 1);
		cfq_put_queue(sync_cfqq);
	}

	spin_unlock_irqrestore(q->queue_lock, flags);
}

static void cfq_ioc_set_cgroup(struct io_context *ioc)
{
	call_for_each_cic(ioc, changed_cgroup);
	ioc->cgroup_changed = 0;
}
#endif  /* CONFIG_CFQ_GROUP_IOSCHED */

2870
static struct cfq_queue *
2871
cfq_find_alloc_queue(struct cfq_data *cfqd, bool is_sync,
2872
		     struct io_context *ioc, gfp_t gfp_mask)
2873 2874
{
	struct cfq_queue *cfqq, *new_cfqq = NULL;
2875
	struct cfq_io_context *cic;
2876
	struct cfq_group *cfqg;
2877 2878

retry:
2879
	cfqg = cfq_get_cfqg(cfqd, 1);
2880
	cic = cfq_cic_lookup(cfqd, ioc);
2881 2882
	/* cic always exists here */
	cfqq = cic_to_cfqq(cic, is_sync);
2883

2884 2885 2886 2887 2888 2889
	/*
	 * Always try a new alloc if we fell back to the OOM cfqq
	 * originally, since it should just be a temporary situation.
	 */
	if (!cfqq || cfqq == &cfqd->oom_cfqq) {
		cfqq = NULL;
2890 2891 2892 2893 2894
		if (new_cfqq) {
			cfqq = new_cfqq;
			new_cfqq = NULL;
		} else if (gfp_mask & __GFP_WAIT) {
			spin_unlock_irq(cfqd->queue->queue_lock);
2895
			new_cfqq = kmem_cache_alloc_node(cfq_pool,
2896
					gfp_mask | __GFP_ZERO,
2897
					cfqd->queue->node);
2898
			spin_lock_irq(cfqd->queue->queue_lock);
2899 2900
			if (new_cfqq)
				goto retry;
2901
		} else {
2902 2903 2904
			cfqq = kmem_cache_alloc_node(cfq_pool,
					gfp_mask | __GFP_ZERO,
					cfqd->queue->node);
2905 2906
		}

2907 2908 2909
		if (cfqq) {
			cfq_init_cfqq(cfqd, cfqq, current->pid, is_sync);
			cfq_init_prio_data(cfqq, ioc);
2910
			cfq_link_cfqq_cfqg(cfqq, cfqg);
2911 2912 2913
			cfq_log_cfqq(cfqd, cfqq, "alloced");
		} else
			cfqq = &cfqd->oom_cfqq;
2914 2915 2916 2917 2918 2919 2920 2921
	}

	if (new_cfqq)
		kmem_cache_free(cfq_pool, new_cfqq);

	return cfqq;
}

2922 2923 2924
static struct cfq_queue **
cfq_async_queue_prio(struct cfq_data *cfqd, int ioprio_class, int ioprio)
{
2925
	switch (ioprio_class) {
2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936
	case IOPRIO_CLASS_RT:
		return &cfqd->async_cfqq[0][ioprio];
	case IOPRIO_CLASS_BE:
		return &cfqd->async_cfqq[1][ioprio];
	case IOPRIO_CLASS_IDLE:
		return &cfqd->async_idle_cfqq;
	default:
		BUG();
	}
}

2937
static struct cfq_queue *
2938
cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct io_context *ioc,
2939 2940
	      gfp_t gfp_mask)
{
2941 2942
	const int ioprio = task_ioprio(ioc);
	const int ioprio_class = task_ioprio_class(ioc);
2943
	struct cfq_queue **async_cfqq = NULL;
2944 2945
	struct cfq_queue *cfqq = NULL;

2946 2947 2948 2949 2950
	if (!is_sync) {
		async_cfqq = cfq_async_queue_prio(cfqd, ioprio_class, ioprio);
		cfqq = *async_cfqq;
	}

2951
	if (!cfqq)
2952
		cfqq = cfq_find_alloc_queue(cfqd, is_sync, ioc, gfp_mask);
2953 2954 2955 2956

	/*
	 * pin the queue now that it's allocated, scheduler exit will prune it
	 */
2957
	if (!is_sync && !(*async_cfqq)) {
2958
		cfqq->ref++;
2959
		*async_cfqq = cfqq;
2960 2961
	}

2962
	cfqq->ref++;
2963 2964 2965
	return cfqq;
}

2966 2967 2968
/*
 * We drop cfq io contexts lazily, so we may find a dead one.
 */
2969
static void
2970 2971
cfq_drop_dead_cic(struct cfq_data *cfqd, struct io_context *ioc,
		  struct cfq_io_context *cic)
2972
{
2973 2974
	unsigned long flags;

2975
	WARN_ON(!list_empty(&cic->queue_list));
2976
	BUG_ON(cic->key != cfqd_dead_key(cfqd));
J
Jens Axboe 已提交
2977

2978 2979
	spin_lock_irqsave(&ioc->lock, flags);

2980
	BUG_ON(ioc->ioc_data == cic);
J
Jens Axboe 已提交
2981

2982
	radix_tree_delete(&ioc->radix_root, cfqd->cic_index);
2983
	hlist_del_rcu(&cic->cic_list);
2984 2985 2986
	spin_unlock_irqrestore(&ioc->lock, flags);

	cfq_cic_free(cic);
2987 2988
}

2989
static struct cfq_io_context *
2990
cfq_cic_lookup(struct cfq_data *cfqd, struct io_context *ioc)
2991 2992
{
	struct cfq_io_context *cic;
2993
	unsigned long flags;
2994

2995 2996 2997
	if (unlikely(!ioc))
		return NULL;

2998 2999
	rcu_read_lock();

J
Jens Axboe 已提交
3000 3001 3002
	/*
	 * we maintain a last-hit cache, to avoid browsing over the tree
	 */
3003
	cic = rcu_dereference(ioc->ioc_data);
3004 3005
	if (cic && cic->key == cfqd) {
		rcu_read_unlock();
J
Jens Axboe 已提交
3006
		return cic;
3007
	}
J
Jens Axboe 已提交
3008

3009
	do {
3010
		cic = radix_tree_lookup(&ioc->radix_root, cfqd->cic_index);
3011 3012 3013
		rcu_read_unlock();
		if (!cic)
			break;
3014
		if (unlikely(cic->key != cfqd)) {
3015
			cfq_drop_dead_cic(cfqd, ioc, cic);
3016
			rcu_read_lock();
3017
			continue;
3018
		}
3019

3020
		spin_lock_irqsave(&ioc->lock, flags);
3021
		rcu_assign_pointer(ioc->ioc_data, cic);
3022
		spin_unlock_irqrestore(&ioc->lock, flags);
3023 3024
		break;
	} while (1);
3025

3026
	return cic;
3027 3028
}

3029 3030 3031 3032 3033
/*
 * Add cic into ioc, using cfqd as the search key. This enables us to lookup
 * the process specific cfq io context when entered from the block layer.
 * Also adds the cic to a per-cfqd list, used when this queue is removed.
 */
J
Jens Axboe 已提交
3034 3035
static int cfq_cic_link(struct cfq_data *cfqd, struct io_context *ioc,
			struct cfq_io_context *cic, gfp_t gfp_mask)
3036
{
3037
	unsigned long flags;
3038
	int ret;
3039

3040 3041 3042 3043
	ret = radix_tree_preload(gfp_mask);
	if (!ret) {
		cic->ioc = ioc;
		cic->key = cfqd;
3044

3045 3046
		spin_lock_irqsave(&ioc->lock, flags);
		ret = radix_tree_insert(&ioc->radix_root,
3047
						cfqd->cic_index, cic);
3048 3049
		if (!ret)
			hlist_add_head_rcu(&cic->cic_list, &ioc->cic_list);
3050
		spin_unlock_irqrestore(&ioc->lock, flags);
3051

3052 3053 3054 3055 3056 3057 3058
		radix_tree_preload_end();

		if (!ret) {
			spin_lock_irqsave(cfqd->queue->queue_lock, flags);
			list_add(&cic->queue_list, &cfqd->cic_list);
			spin_unlock_irqrestore(cfqd->queue->queue_lock, flags);
		}
3059 3060
	}

3061 3062
	if (ret)
		printk(KERN_ERR "cfq: cic link failed!\n");
3063

3064
	return ret;
3065 3066
}

L
Linus Torvalds 已提交
3067 3068 3069
/*
 * Setup general io context and cfq io context. There can be several cfq
 * io contexts per general io context, if this process is doing io to more
3070
 * than one device managed by cfq.
L
Linus Torvalds 已提交
3071 3072
 */
static struct cfq_io_context *
3073
cfq_get_io_context(struct cfq_data *cfqd, gfp_t gfp_mask)
L
Linus Torvalds 已提交
3074
{
3075
	struct io_context *ioc = NULL;
L
Linus Torvalds 已提交
3076 3077
	struct cfq_io_context *cic;

3078
	might_sleep_if(gfp_mask & __GFP_WAIT);
L
Linus Torvalds 已提交
3079

3080
	ioc = get_io_context(gfp_mask, cfqd->queue->node);
L
Linus Torvalds 已提交
3081 3082 3083
	if (!ioc)
		return NULL;

3084
	cic = cfq_cic_lookup(cfqd, ioc);
3085 3086
	if (cic)
		goto out;
L
Linus Torvalds 已提交
3087

3088 3089 3090
	cic = cfq_alloc_io_context(cfqd, gfp_mask);
	if (cic == NULL)
		goto err;
L
Linus Torvalds 已提交
3091

3092 3093 3094
	if (cfq_cic_link(cfqd, ioc, cic, gfp_mask))
		goto err_free;

L
Linus Torvalds 已提交
3095
out:
3096 3097 3098 3099
	smp_read_barrier_depends();
	if (unlikely(ioc->ioprio_changed))
		cfq_ioc_set_ioprio(ioc);

3100 3101 3102 3103
#ifdef CONFIG_CFQ_GROUP_IOSCHED
	if (unlikely(ioc->cgroup_changed))
		cfq_ioc_set_cgroup(ioc);
#endif
L
Linus Torvalds 已提交
3104
	return cic;
3105 3106
err_free:
	cfq_cic_free(cic);
L
Linus Torvalds 已提交
3107 3108 3109 3110 3111
err:
	put_io_context(ioc);
	return NULL;
}

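/*
 * Track the process think time (the gap between its last completion and
 * this new request) as a decaying average, scaled by 256 to keep some
 * precision in integer arithmetic.
 */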
static void
cfq_update_io_thinktime(struct cfq_data *cfqd, struct cfq_io_context *cic)
L
Linus Torvalds 已提交
3114
{
3115 3116
	unsigned long elapsed = jiffies - cic->last_end_request;
	unsigned long ttime = min(elapsed, 2UL * cfqd->cfq_slice_idle);
3117

3118 3119 3120 3121
	cic->ttime_samples = (7*cic->ttime_samples + 256) / 8;
	cic->ttime_total = (7*cic->ttime_total + 256*ttime) / 8;
	cic->ttime_mean = (cic->ttime_total + 128) / cic->ttime_samples;
}
L
Linus Torvalds 已提交
3122

3123
static void
3124
cfq_update_io_seektime(struct cfq_data *cfqd, struct cfq_queue *cfqq,
J
Jens Axboe 已提交
3125
		       struct request *rq)
3126
{
3127
	sector_t sdist = 0;
3128
	sector_t n_sec = blk_rq_sectors(rq);
3129 3130 3131 3132 3133 3134
	if (cfqq->last_request_pos) {
		if (cfqq->last_request_pos < blk_rq_pos(rq))
			sdist = blk_rq_pos(rq) - cfqq->last_request_pos;
		else
			sdist = cfqq->last_request_pos - blk_rq_pos(rq);
	}
3135

3136
	cfqq->seek_history <<= 1;
3137 3138 3139 3140
	if (blk_queue_nonrot(cfqd->queue))
		cfqq->seek_history |= (n_sec < CFQQ_SECT_THR_NONROT);
	else
		cfqq->seek_history |= (sdist > CFQQ_SEEK_THR);
3141
}
L
Linus Torvalds 已提交
3142

3143 3144 3145 3146 3147 3148 3149 3150
/*
 * Disable idle window if the process thinks too long or seeks so much that
 * it doesn't matter
 */
static void
cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq,
		       struct cfq_io_context *cic)
{
3151
	int old_idle, enable_idle;
3152

3153 3154 3155 3156
	/*
	 * Don't idle for async or idle io prio class
	 */
	if (!cfq_cfqq_sync(cfqq) || cfq_class_idle(cfqq))
3157 3158
		return;

3159
	enable_idle = old_idle = cfq_cfqq_idle_window(cfqq);
L
Linus Torvalds 已提交
3160

3161 3162 3163
	if (cfqq->queued[0] + cfqq->queued[1] >= 4)
		cfq_mark_cfqq_deep(cfqq);

3164 3165 3166
	if (cfqq->next_rq && (cfqq->next_rq->cmd_flags & REQ_NOIDLE))
		enable_idle = 0;
	else if (!atomic_read(&cic->ioc->nr_tasks) || !cfqd->cfq_slice_idle ||
3167
	    (!cfq_cfqq_deep(cfqq) && CFQQ_SEEKY(cfqq)))
3168 3169
		enable_idle = 0;
	else if (sample_valid(cic->ttime_samples)) {
3170
		if (cic->ttime_mean > cfqd->cfq_slice_idle)
3171 3172 3173
			enable_idle = 0;
		else
			enable_idle = 1;
L
Linus Torvalds 已提交
3174 3175
	}

3176 3177 3178 3179 3180 3181 3182
	if (old_idle != enable_idle) {
		cfq_log_cfqq(cfqd, cfqq, "idle=%d", enable_idle);
		if (enable_idle)
			cfq_mark_cfqq_idle_window(cfqq);
		else
			cfq_clear_cfqq_idle_window(cfqq);
	}
3183
}
L
Linus Torvalds 已提交
3184

3185 3186 3187 3188
/*
 * Check if new_cfqq should preempt the currently active queue. Return 0 for
 * no or if we aren't sure, a 1 will cause a preempt.
 */
3189
static bool
3190
cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq,
J
Jens Axboe 已提交
3191
		   struct request *rq)
3192
{
J
Jens Axboe 已提交
3193
	struct cfq_queue *cfqq;
3194

J
Jens Axboe 已提交
3195 3196
	cfqq = cfqd->active_queue;
	if (!cfqq)
3197
		return false;
3198

J
Jens Axboe 已提交
3199
	if (cfq_class_idle(new_cfqq))
3200
		return false;
3201 3202

	if (cfq_class_idle(cfqq))
3203
		return true;
3204

3205 3206 3207 3208 3209 3210
	/*
	 * Don't allow a non-RT request to preempt an ongoing RT cfqq timeslice.
	 */
	if (cfq_class_rt(cfqq) && !cfq_class_rt(new_cfqq))
		return false;

3211 3212 3213 3214
	/*
	 * if the new request is sync, but the currently running queue is
	 * not, let the sync request have priority.
	 */
J
Jens Axboe 已提交
3215
	if (rq_is_sync(rq) && !cfq_cfqq_sync(cfqq))
3216
		return true;
3217

3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230
	if (new_cfqq->cfqg != cfqq->cfqg)
		return false;

	if (cfq_slice_used(cfqq))
		return true;

	/* Allow preemption only if we are idling on sync-noidle tree */
	if (cfqd->serving_type == SYNC_NOIDLE_WORKLOAD &&
	    cfqq_type(new_cfqq) == SYNC_NOIDLE_WORKLOAD &&
	    new_cfqq->service_tree->count == 2 &&
	    RB_EMPTY_ROOT(&cfqq->sort_list))
		return true;

3231 3232 3233 3234
	/*
	 * So both queues are sync. Let the new request get disk time if
	 * it's a metadata request and the current queue is doing regular IO.
	 */
3235
	if ((rq->cmd_flags & REQ_META) && !cfqq->meta_pending)
3236
		return true;
3237

3238 3239 3240 3241
	/*
	 * Allow an RT request to pre-empt an ongoing non-RT cfqq timeslice.
	 */
	if (cfq_class_rt(new_cfqq) && !cfq_class_rt(cfqq))
3242
		return true;
3243

3244 3245 3246 3247
	/* An idle queue should not be idle now for some reason */
	if (RB_EMPTY_ROOT(&cfqq->sort_list) && !cfq_should_idle(cfqd, cfqq))
		return true;

3248
	if (!cfqd->active_cic || !cfq_cfqq_wait_request(cfqq))
3249
		return false;
3250 3251 3252 3253 3254

	/*
	 * if this request is as-good as one we would expect from the
	 * current cfqq, let it preempt
	 */
3255
	if (cfq_rq_close(cfqd, cfqq, rq))
3256
		return true;
3257

3258
	return false;
3259 3260 3261 3262 3263 3264 3265 3266
}

/*
 * cfqq preempts the active queue. if we allowed preempt with no slice left,
 * let it have half of its nominal slice.
 */
static void cfq_preempt_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq)
{
3267 3268
	struct cfq_queue *old_cfqq = cfqd->active_queue;

3269
	cfq_log_cfqq(cfqd, cfqq, "preempt");
3270
	cfq_slice_expired(cfqd, 1);
3271

3272 3273 3274 3275 3276 3277 3278
	/*
	 * workload type is changed, don't save slice, otherwise preempt
	 * doesn't happen
	 */
	if (cfqq_type(old_cfqq) != cfqq_type(cfqq))
		cfqq->cfqg->saved_workload_slice = 0;

3279 3280 3281 3282 3283
	/*
	 * Put the new queue at the front of the current list,
	 * so we know that it will be selected next.
	 */
	BUG_ON(!cfq_cfqq_on_rr(cfqq));
3284 3285

	cfq_service_tree_add(cfqd, cfqq, 1);
3286

3287 3288
	cfqq->slice_end = 0;
	cfq_mark_cfqq_slice_new(cfqq);
3289 3290 3291
}

/*
J
Jens Axboe 已提交
3292
 * Called when a new fs request (rq) is added (to cfqq). Check if there's
3293 3294 3295
 * something we should do about it
 */
static void
J
Jens Axboe 已提交
3296 3297
cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq,
		struct request *rq)
3298
{
J
Jens Axboe 已提交
3299
	struct cfq_io_context *cic = RQ_CIC(rq);
3300

3301
	cfqd->rq_queued++;
3302
	if (rq->cmd_flags & REQ_META)
3303 3304
		cfqq->meta_pending++;

J
Jens Axboe 已提交
3305
	cfq_update_io_thinktime(cfqd, cic);
3306
	cfq_update_io_seektime(cfqd, cfqq, rq);
J
Jens Axboe 已提交
3307 3308
	cfq_update_idle_window(cfqd, cfqq, cic);

3309
	cfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq);
3310 3311 3312

	if (cfqq == cfqd->active_queue) {
		/*
3313 3314 3315
		 * Remember that we saw a request from this process, but
		 * don't start queuing just yet. Otherwise we risk seeing lots
		 * of tiny requests, because we disrupt the normal plugging
3316 3317
		 * and merging. If the request is already larger than a single
		 * page, let it rip immediately. For that case we assume that
3318 3319 3320
		 * merging is already done. Ditto for a busy system that
		 * has other work pending, don't risk delaying until the
		 * idle timer unplug to continue working.
3321
		 */
3322
		if (cfq_cfqq_wait_request(cfqq)) {
3323 3324
			if (blk_rq_bytes(rq) > PAGE_CACHE_SIZE ||
			    cfqd->busy_queues > 1) {
3325
				cfq_del_timer(cfqd, cfqq);
3326
				cfq_clear_cfqq_wait_request(cfqq);
3327
				__blk_run_queue(cfqd->queue);
3328
			} else {
3329
				cfq_blkiocg_update_idle_time_stats(
3330
						&cfqq->cfqg->blkg);
3331
				cfq_mark_cfqq_must_dispatch(cfqq);
3332
			}
3333
		}
J
Jens Axboe 已提交
3334
	} else if (cfq_should_preempt(cfqd, cfqq, rq)) {
3335 3336 3337
		/*
		 * not the active queue - expire current slice if it is
		 * idle and has expired its mean thinktime or this new queue
3338 3339
		 * has some old slice time left and is of higher priority or
		 * this new queue is RT and the current one is BE
3340 3341
		 */
		cfq_preempt_queue(cfqd, cfqq);
T
Tejun Heo 已提交
3342
		__blk_run_queue(cfqd->queue);
3343
	}
L
Linus Torvalds 已提交
3344 3345
}

3346
static void cfq_insert_request(struct request_queue *q, struct request *rq)
L
Linus Torvalds 已提交
3347
{
3348
	struct cfq_data *cfqd = q->elevator->elevator_data;
J
Jens Axboe 已提交
3349
	struct cfq_queue *cfqq = RQ_CFQQ(rq);
3350

3351
	cfq_log_cfqq(cfqd, cfqq, "insert_request");
3352
	cfq_init_prio_data(cfqq, RQ_CIC(rq)->ioc);
L
Linus Torvalds 已提交
3353

3354
	rq_set_fifo_time(rq, jiffies + cfqd->cfq_fifo_expire[rq_is_sync(rq)]);
3355
	list_add_tail(&rq->queuelist, &cfqq->fifo);
3356
	cfq_add_rq_rb(rq);
3357
	cfq_blkiocg_update_io_add_stats(&(RQ_CFQG(rq))->blkg,
3358 3359
			&cfqd->serving_group->blkg, rq_data_dir(rq),
			rq_is_sync(rq));
J
Jens Axboe 已提交
3360
	cfq_rq_enqueued(cfqd, cfqq, rq);
L
Linus Torvalds 已提交
3361 3362
}

3363 3364 3365 3366 3367 3368
/*
 * Update hw_tag based on peak queue depth over 50 samples under
 * sufficient load.
 */
static void cfq_update_hw_tag(struct cfq_data *cfqd)
{
S
Shaohua Li 已提交
3369 3370
	struct cfq_queue *cfqq = cfqd->active_queue;

3371 3372
	if (cfqd->rq_in_driver > cfqd->hw_tag_est_depth)
		cfqd->hw_tag_est_depth = cfqd->rq_in_driver;
3373 3374 3375

	if (cfqd->hw_tag == 1)
		return;
3376 3377

	if (cfqd->rq_queued <= CFQ_HW_QUEUE_MIN &&
3378
	    cfqd->rq_in_driver <= CFQ_HW_QUEUE_MIN)
3379 3380
		return;

S
Shaohua Li 已提交
3381 3382 3383 3384 3385 3386 3387
	/*
	 * If active queue hasn't enough requests and can idle, cfq might not
	 * dispatch sufficient requests to hardware. Don't zero hw_tag in this
	 * case
	 */
	if (cfqq && cfq_cfqq_idle_window(cfqq) &&
	    cfqq->dispatched + cfqq->queued[0] + cfqq->queued[1] <
3388
	    CFQ_HW_QUEUE_MIN && cfqd->rq_in_driver < CFQ_HW_QUEUE_MIN)
S
Shaohua Li 已提交
3389 3390
		return;

3391 3392 3393
	if (cfqd->hw_tag_samples++ < 50)
		return;

3394
	if (cfqd->hw_tag_est_depth >= CFQ_HW_QUEUE_MIN)
3395 3396 3397 3398 3399
		cfqd->hw_tag = 1;
	else
		cfqd->hw_tag = 0;
}

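/*
 * Decide whether to keep waiting on the active queue after its last
 * request completed, so that a group with a single, briefly thinking
 * queue does not lose its backlogged status and its share of the disk.
 */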
static bool cfq_should_wait_busy(struct cfq_data *cfqd, struct cfq_queue *cfqq)
{
	struct cfq_io_context *cic = cfqd->active_cic;

3404 3405 3406 3407
	/* If the queue already has requests, don't wait */
	if (!RB_EMPTY_ROOT(&cfqq->sort_list))
		return false;

3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432
	/* If there are other queues in the group, don't wait */
	if (cfqq->cfqg->nr_cfqq > 1)
		return false;

	if (cfq_slice_used(cfqq))
		return true;

	/* if slice left is less than think time, wait busy */
	if (cic && sample_valid(cic->ttime_samples)
	    && (cfqq->slice_end - jiffies < cic->ttime_mean))
		return true;

	/*
	 * If think time is less than a jiffy then ttime_mean=0 and the above
	 * will not be true. It might happen that slice has not expired yet
	 * but will expire soon (4-5 ns) during select_queue(). To cover the
	 * case where think time is less than a jiffy, mark the queue wait
	 * busy if only 1 jiffy is left in the slice.
	 */
	if (cfqq->slice_end - jiffies == 1)
		return true;

	return false;
}

3433
static void cfq_completed_request(struct request_queue *q, struct request *rq)
L
Linus Torvalds 已提交
3434
{
J
Jens Axboe 已提交
3435
	struct cfq_queue *cfqq = RQ_CFQQ(rq);
3436
	struct cfq_data *cfqd = cfqq->cfqd;
3437
	const int sync = rq_is_sync(rq);
3438
	unsigned long now;
L
Linus Torvalds 已提交
3439

3440
	now = jiffies;
3441 3442
	cfq_log_cfqq(cfqd, cfqq, "complete rqnoidle %d",
		     !!(rq->cmd_flags & REQ_NOIDLE));
L
Linus Torvalds 已提交
3443

3444 3445
	cfq_update_hw_tag(cfqd);

3446
	WARN_ON(!cfqd->rq_in_driver);
J
Jens Axboe 已提交
3447
	WARN_ON(!cfqq->dispatched);
3448
	cfqd->rq_in_driver--;
J
Jens Axboe 已提交
3449
	cfqq->dispatched--;
3450
	(RQ_CFQG(rq))->dispatched--;
3451 3452 3453
	cfq_blkiocg_update_completion_stats(&cfqq->cfqg->blkg,
			rq_start_time_ns(rq), rq_io_start_time_ns(rq),
			rq_data_dir(rq), rq_is_sync(rq));
L
Linus Torvalds 已提交
3454

3455
	cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]--;
3456

3457
	if (sync) {
J
Jens Axboe 已提交
3458
		RQ_CIC(rq)->last_end_request = now;
3459 3460
		if (!time_after(rq->start_time + cfqd->cfq_fifo_expire[1], now))
			cfqd->last_delayed_sync = now;
3461
	}
3462 3463 3464 3465 3466 3467

	/*
	 * If this is the active queue, check if it needs to be expired,
	 * or if we want to idle in case it has no pending requests.
	 */
	if (cfqd->active_queue == cfqq) {
3468 3469
		const bool cfqq_empty = RB_EMPTY_ROOT(&cfqq->sort_list);

3470 3471 3472 3473
		if (cfq_cfqq_slice_new(cfqq)) {
			cfq_set_prio_slice(cfqd, cfqq);
			cfq_clear_cfqq_slice_new(cfqq);
		}
3474 3475

		/*
3476 3477
		 * Should we wait for next request to come in before we expire
		 * the queue.
3478
		 */
3479
		if (cfq_should_wait_busy(cfqd, cfqq)) {
3480 3481 3482 3483
			unsigned long extend_sl = cfqd->cfq_slice_idle;
			if (!cfqd->cfq_slice_idle)
				extend_sl = cfqd->cfq_group_idle;
			cfqq->slice_end = jiffies + extend_sl;
3484
			cfq_mark_cfqq_wait_busy(cfqq);
3485
			cfq_log_cfqq(cfqd, cfqq, "will busy wait");
3486 3487
		}

3488
		/*
3489 3490 3491 3492 3493 3494
		 * Idling is not enabled on:
		 * - expired queues
		 * - idle-priority queues
		 * - async queues
		 * - queues with still some requests queued
		 * - when there is a close cooperator
3495
		 */
3496
		if (cfq_slice_used(cfqq) || cfq_class_idle(cfqq))
3497
			cfq_slice_expired(cfqd, 1);
3498 3499
		else if (sync && cfqq_empty &&
			 !cfq_close_cooperator(cfqd, cfqq)) {
3500
			cfq_arm_slice_timer(cfqd);
3501
		}
3502
	}
J
Jens Axboe 已提交
3503

3504
	if (!cfqd->rq_in_driver)
3505
		cfq_schedule_dispatch(cfqd);
L
Linus Torvalds 已提交
3506 3507
}

3508 3509 3510 3511 3512
/*
 * we temporarily boost lower priority queues if they are holding fs exclusive
 * resources. they are boosted to normal prio (CLASS_BE/4)
 */
static void cfq_prio_boost(struct cfq_queue *cfqq)
L
Linus Torvalds 已提交
3513
{
3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524
	if (has_fs_excl()) {
		/*
		 * boost idle prio on transactions that would lock out other
		 * users of the filesystem
		 */
		if (cfq_class_idle(cfqq))
			cfqq->ioprio_class = IOPRIO_CLASS_BE;
		if (cfqq->ioprio > IOPRIO_NORM)
			cfqq->ioprio = IOPRIO_NORM;
	} else {
		/*
3525
		 * unboost the queue (if needed)
3526
		 */
3527 3528
		cfqq->ioprio_class = cfqq->org_ioprio_class;
		cfqq->ioprio = cfqq->org_ioprio;
3529 3530
	}
}
L
Linus Torvalds 已提交
3531

static inline int __cfq_may_queue(struct cfq_queue *cfqq)
{
	if (cfq_cfqq_wait_request(cfqq) && !cfq_cfqq_must_alloc_slice(cfqq)) {
		cfq_mark_cfqq_must_alloc_slice(cfqq);
		return ELV_MQUEUE_MUST;
	}

	return ELV_MQUEUE_MAY;
}

static int cfq_may_queue(struct request_queue *q, int rw)
{
	struct cfq_data *cfqd = q->elevator->elevator_data;
	struct task_struct *tsk = current;
	struct cfq_io_context *cic;
	struct cfq_queue *cfqq;

	/*
	 * don't force setup of a queue from here, as a call to may_queue
	 * does not necessarily imply that a request actually will be queued.
	 * so just lookup a possibly existing queue, or return 'may queue'
	 * if that fails
	 */
	cic = cfq_cic_lookup(cfqd, tsk->io_context);
	if (!cic)
		return ELV_MQUEUE_MAY;

	cfqq = cic_to_cfqq(cic, rw_is_sync(rw));
	if (cfqq) {
		cfq_init_prio_data(cfqq, cic->ioc);
		cfq_prio_boost(cfqq);

		return __cfq_may_queue(cfqq);
	}

	return ELV_MQUEUE_MAY;
}

/*
 * queue lock held here
 */
static void cfq_put_request(struct request *rq)
{
	struct cfq_queue *cfqq = RQ_CFQQ(rq);

	if (cfqq) {
		const int rw = rq_data_dir(rq);

		BUG_ON(!cfqq->allocated[rw]);
		cfqq->allocated[rw]--;

		put_io_context(RQ_CIC(rq)->ioc);

		rq->elevator_private[0] = NULL;
		rq->elevator_private[1] = NULL;

		/* Put down rq reference on cfqg */
		cfq_put_cfqg(RQ_CFQG(rq));
		rq->elevator_private[2] = NULL;

		cfq_put_queue(cfqq);
	}
}

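/*
 * Redirect the cic to cfqq->new_cfqq, mark the new queue as a cooperator
 * and drop our reference on the old queue.  Returns the sync queue the
 * cic now points to.
 */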
static struct cfq_queue *
cfq_merge_cfqqs(struct cfq_data *cfqd, struct cfq_io_context *cic,
		struct cfq_queue *cfqq)
{
	cfq_log_cfqq(cfqd, cfqq, "merging with queue %p", cfqq->new_cfqq);
	cic_set_cfqq(cic, cfqq->new_cfqq, 1);
	cfq_mark_cfqq_coop(cfqq->new_cfqq);
	cfq_put_queue(cfqq);
	return cic_to_cfqq(cic, 1);
}

/*
 * Returns NULL if a new cfqq should be allocated, or the old cfqq if this
 * was the last process referring to said cfqq.
 */
static struct cfq_queue *
split_cfqq(struct cfq_io_context *cic, struct cfq_queue *cfqq)
{
	if (cfqq_process_refs(cfqq) == 1) {
		cfqq->pid = current->pid;
		cfq_clear_cfqq_coop(cfqq);
		cfq_clear_cfqq_split_coop(cfqq);
		return cfqq;
	}

	cic_set_cfqq(cic, NULL, 1);

	cfq_put_cooperator(cfqq);

	cfq_put_queue(cfqq);
	return NULL;
}
/*
 * Allocate cfq data structures associated with this request.
 */
static int
cfq_set_request(struct request_queue *q, struct request *rq, gfp_t gfp_mask)
{
	struct cfq_data *cfqd = q->elevator->elevator_data;
	struct cfq_io_context *cic;
	const int rw = rq_data_dir(rq);
	const bool is_sync = rq_is_sync(rq);
	struct cfq_queue *cfqq;
	unsigned long flags;

	might_sleep_if(gfp_mask & __GFP_WAIT);

	cic = cfq_get_io_context(cfqd, gfp_mask);

	spin_lock_irqsave(q->queue_lock, flags);

	if (!cic)
		goto queue_fail;

new_queue:
	cfqq = cic_to_cfqq(cic, is_sync);
	if (!cfqq || cfqq == &cfqd->oom_cfqq) {
		cfqq = cfq_get_queue(cfqd, is_sync, cic->ioc, gfp_mask);
		cic_set_cfqq(cic, cfqq, is_sync);
	} else {
		/*
		 * If the queue was seeky for too long, break it apart.
		 */
		if (cfq_cfqq_coop(cfqq) && cfq_cfqq_split_coop(cfqq)) {
			cfq_log_cfqq(cfqd, cfqq, "breaking apart cfqq");
			cfqq = split_cfqq(cic, cfqq);
			if (!cfqq)
				goto new_queue;
		}

		/*
		 * Check to see if this queue is scheduled to merge with
		 * another, closely cooperating queue.  The merging of
		 * queues happens here as it must be done in process context.
		 * The reference on new_cfqq was taken in merge_cfqqs.
		 */
		if (cfqq->new_cfqq)
			cfqq = cfq_merge_cfqqs(cfqd, cic, cfqq);
	}

	cfqq->allocated[rw]++;

	spin_unlock_irqrestore(q->queue_lock, flags);

	cfqq->ref++;
	rq->elevator_private[0] = cic;
	rq->elevator_private[1] = cfqq;
	rq->elevator_private[2] = cfq_ref_get_cfqg(cfqq->cfqg);
	return 0;

queue_fail:
	if (cic)
		put_io_context(cic->ioc);

	cfq_schedule_dispatch(cfqd);
	spin_unlock_irqrestore(q->queue_lock, flags);
	cfq_log(cfqd, "set_request fail");
	return 1;
}

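/*
 * Work handler behind cfqd->unplug_work; used by cfq_schedule_dispatch()
 * to (re)run the request queue from process context.
 */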
static void cfq_kick_queue(struct work_struct *work)
{
	struct cfq_data *cfqd =
		container_of(work, struct cfq_data, unplug_work);
	struct request_queue *q = cfqd->queue;

	spin_lock_irq(q->queue_lock);
	__blk_run_queue(cfqd->queue);
	spin_unlock_irq(q->queue_lock);
}

/*
 * Timer running if the active_queue is currently idling inside its time slice
 */
static void cfq_idle_slice_timer(unsigned long data)
{
	struct cfq_data *cfqd = (struct cfq_data *) data;
	struct cfq_queue *cfqq;
	unsigned long flags;
	int timed_out = 1;

	cfq_log(cfqd, "idle timer fired");

	spin_lock_irqsave(cfqd->queue->queue_lock, flags);

	cfqq = cfqd->active_queue;
	if (cfqq) {
		timed_out = 0;

		/*
		 * We saw a request before the queue expired, let it through
		 */
		if (cfq_cfqq_must_dispatch(cfqq))
			goto out_kick;

		/*
		 * expired
		 */
		if (cfq_slice_used(cfqq))
			goto expire;

		/*
		 * only expire and reinvoke request handler, if there are
		 * other queues with pending requests
		 */
		if (!cfqd->busy_queues)
			goto out_cont;

		/*
		 * not expired and it has a request pending, let it dispatch
		 */
		if (!RB_EMPTY_ROOT(&cfqq->sort_list))
			goto out_kick;

		/*
		 * Queue depth flag is reset only when the idle didn't succeed
		 */
		cfq_clear_cfqq_deep(cfqq);
	}
expire:
	cfq_slice_expired(cfqd, timed_out);
out_kick:
	cfq_schedule_dispatch(cfqd);
out_cont:
	spin_unlock_irqrestore(cfqd->queue->queue_lock, flags);
}

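/*
 * Stop the idle slice timer and flush any pending unplug work before
 * the queue is torn down.
 */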
static void cfq_shutdown_timer_wq(struct cfq_data *cfqd)
{
	del_timer_sync(&cfqd->idle_slice_timer);
	cancel_work_sync(&cfqd->unplug_work);
}

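/*
 * Drop the references held on the per-priority async queues and on the
 * async idle queue.
 */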
static void cfq_put_async_queues(struct cfq_data *cfqd)
{
	int i;

	for (i = 0; i < IOPRIO_BE_NR; i++) {
		if (cfqd->async_cfqq[0][i])
			cfq_put_queue(cfqd->async_cfqq[0][i]);
		if (cfqd->async_cfqq[1][i])
			cfq_put_queue(cfqd->async_cfqq[1][i]);
	}

	if (cfqd->async_idle_cfqq)
		cfq_put_queue(cfqd->async_idle_cfqq);
}

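/* RCU callback that finally frees the cfq_data after the grace period */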
static void cfq_cfqd_free(struct rcu_head *head)
{
	kfree(container_of(head, struct cfq_data, rcu));
}

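/*
 * Elevator exit path: expire the active queue, detach all io contexts,
 * drop async queues and group references, then free cfqd via RCU.
 */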
static void cfq_exit_queue(struct elevator_queue *e)
{
	struct cfq_data *cfqd = e->elevator_data;
	struct request_queue *q = cfqd->queue;

	cfq_shutdown_timer_wq(cfqd);

	spin_lock_irq(q->queue_lock);

	if (cfqd->active_queue)
		__cfq_slice_expired(cfqd, cfqd->active_queue, 0);

	while (!list_empty(&cfqd->cic_list)) {
		struct cfq_io_context *cic = list_entry(cfqd->cic_list.next,
							struct cfq_io_context,
							queue_list);

		__cfq_exit_single_io_context(cfqd, cic);
	}

	cfq_put_async_queues(cfqd);
	cfq_release_cfq_groups(cfqd);
	cfq_blkiocg_del_blkio_group(&cfqd->root_group.blkg);

	spin_unlock_irq(q->queue_lock);

	cfq_shutdown_timer_wq(cfqd);

	spin_lock(&cic_index_lock);
	ida_remove(&cic_index_ida, cfqd->cic_index);
	spin_unlock(&cic_index_lock);

	/* Wait for cfqg->blkg->key accessors to exit their grace periods. */
	call_rcu(&cfqd->rcu, cfq_cfqd_free);
}

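/*
 * Allocate a unique index for this cfqd from cic_index_ida; it is
 * released again in cfq_exit_queue().
 */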
static int cfq_alloc_cic_index(void)
{
	int index, error;

	do {
		if (!ida_pre_get(&cic_index_ida, GFP_KERNEL))
			return -ENOMEM;

		spin_lock(&cic_index_lock);
		error = ida_get_new(&cic_index_ida, &index);
		spin_unlock(&cic_index_lock);
		if (error && error != -EAGAIN)
			return error;
	} while (error);

	return index;
}

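/*
 * Elevator init path: set up the root group and service trees, the
 * fallback oom_cfqq, the idle timer and the default tunables.
 */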
static void *cfq_init_queue(struct request_queue *q)
{
	struct cfq_data *cfqd;
	int i, j;
	struct cfq_group *cfqg;
	struct cfq_rb_root *st;

	i = cfq_alloc_cic_index();
	if (i < 0)
		return NULL;

	cfqd = kmalloc_node(sizeof(*cfqd), GFP_KERNEL | __GFP_ZERO, q->node);
	if (!cfqd)
		return NULL;

	/*
	 * Don't need to take queue_lock in this routine, since we are
	 * initializing the ioscheduler, and nobody is using cfqd
	 */
	cfqd->cic_index = i;

	/* Init root service tree */
	cfqd->grp_service_tree = CFQ_RB_ROOT;

	/* Init root group */
	cfqg = &cfqd->root_group;
	for_each_cfqg_st(cfqg, i, j, st)
		*st = CFQ_RB_ROOT;
	RB_CLEAR_NODE(&cfqg->rb_node);

	/* Give preference to root group over other groups */
	cfqg->weight = 2*BLKIO_WEIGHT_DEFAULT;

#ifdef CONFIG_CFQ_GROUP_IOSCHED
	/*
	 * Take a reference to root group which we never drop. This is just
	 * to make sure that cfq_put_cfqg() does not try to kfree root group
	 */
	cfqg->ref = 1;
	rcu_read_lock();
	cfq_blkiocg_add_blkio_group(&blkio_root_cgroup, &cfqg->blkg,
					(void *)cfqd, 0);
	rcu_read_unlock();
#endif
	/*
	 * Not strictly needed (since RB_ROOT just clears the node and we
	 * zeroed cfqd on alloc), but better be safe in case someone decides
	 * to add magic to the rb code
	 */
	for (i = 0; i < CFQ_PRIO_LISTS; i++)
		cfqd->prio_trees[i] = RB_ROOT;

	/*
	 * Our fallback cfqq if cfq_find_alloc_queue() runs into OOM issues.
	 * Grab a permanent reference to it, so that the normal code flow
	 * will not attempt to free it.
	 */
	cfq_init_cfqq(cfqd, &cfqd->oom_cfqq, 1, 0);
	cfqd->oom_cfqq.ref++;
	cfq_link_cfqq_cfqg(&cfqd->oom_cfqq, &cfqd->root_group);

	INIT_LIST_HEAD(&cfqd->cic_list);

	cfqd->queue = q;

	init_timer(&cfqd->idle_slice_timer);
	cfqd->idle_slice_timer.function = cfq_idle_slice_timer;
	cfqd->idle_slice_timer.data = (unsigned long) cfqd;

	INIT_WORK(&cfqd->unplug_work, cfq_kick_queue);

	cfqd->cfq_quantum = cfq_quantum;
	cfqd->cfq_fifo_expire[0] = cfq_fifo_expire[0];
	cfqd->cfq_fifo_expire[1] = cfq_fifo_expire[1];
	cfqd->cfq_back_max = cfq_back_max;
	cfqd->cfq_back_penalty = cfq_back_penalty;
	cfqd->cfq_slice[0] = cfq_slice_async;
	cfqd->cfq_slice[1] = cfq_slice_sync;
	cfqd->cfq_slice_async_rq = cfq_slice_async_rq;
	cfqd->cfq_slice_idle = cfq_slice_idle;
	cfqd->cfq_group_idle = cfq_group_idle;
	cfqd->cfq_latency = 1;
	cfqd->hw_tag = -1;
	/*
	 * we optimistically start assuming sync ops weren't delayed in last
	 * second, in order to have larger depth for async operations.
	 */
	cfqd->last_delayed_sync = jiffies - HZ;
	return cfqd;
}

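/* slab caches for cfq_queue and cfq_io_context objects */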
static void cfq_slab_kill(void)
{
	/*
	 * Caller already ensured that pending RCU callbacks are completed,
	 * so we should have no busy allocations at this point.
	 */
	if (cfq_pool)
		kmem_cache_destroy(cfq_pool);
	if (cfq_ioc_pool)
		kmem_cache_destroy(cfq_ioc_pool);
}

static int __init cfq_slab_setup(void)
{
	cfq_pool = KMEM_CACHE(cfq_queue, 0);
	if (!cfq_pool)
		goto fail;

	cfq_ioc_pool = KMEM_CACHE(cfq_io_context, 0);
	if (!cfq_ioc_pool)
		goto fail;

	return 0;
fail:
	cfq_slab_kill();
	return -ENOMEM;
}

/*
 * sysfs parts below -->
 */
static ssize_t
cfq_var_show(unsigned int var, char *page)
{
	return sprintf(page, "%d\n", var);
}

static ssize_t
cfq_var_store(unsigned int *var, const char *page, size_t count)
{
	char *p = (char *) page;

	*var = simple_strtoul(p, &p, 10);
	return count;
}

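/*
 * SHOW_FUNCTION/STORE_FUNCTION generate one sysfs show/store pair per
 * tunable; __CONV selects jiffies <-> milliseconds conversion.  When cfq
 * is the active elevator these attributes are typically exposed under
 * /sys/block/<dev>/queue/iosched/ (e.g. slice_idle, low_latency).
 */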
#define SHOW_FUNCTION(__FUNC, __VAR, __CONV)				\
static ssize_t __FUNC(struct elevator_queue *e, char *page)		\
{									\
	struct cfq_data *cfqd = e->elevator_data;			\
	unsigned int __data = __VAR;					\
	if (__CONV)							\
		__data = jiffies_to_msecs(__data);			\
	return cfq_var_show(__data, (page));				\
}
SHOW_FUNCTION(cfq_quantum_show, cfqd->cfq_quantum, 0);
SHOW_FUNCTION(cfq_fifo_expire_sync_show, cfqd->cfq_fifo_expire[1], 1);
SHOW_FUNCTION(cfq_fifo_expire_async_show, cfqd->cfq_fifo_expire[0], 1);
SHOW_FUNCTION(cfq_back_seek_max_show, cfqd->cfq_back_max, 0);
SHOW_FUNCTION(cfq_back_seek_penalty_show, cfqd->cfq_back_penalty, 0);
SHOW_FUNCTION(cfq_slice_idle_show, cfqd->cfq_slice_idle, 1);
SHOW_FUNCTION(cfq_group_idle_show, cfqd->cfq_group_idle, 1);
SHOW_FUNCTION(cfq_slice_sync_show, cfqd->cfq_slice[1], 1);
SHOW_FUNCTION(cfq_slice_async_show, cfqd->cfq_slice[0], 1);
SHOW_FUNCTION(cfq_slice_async_rq_show, cfqd->cfq_slice_async_rq, 0);
SHOW_FUNCTION(cfq_low_latency_show, cfqd->cfq_latency, 0);
#undef SHOW_FUNCTION

#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV)			\
static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count)	\
{									\
	struct cfq_data *cfqd = e->elevator_data;			\
	unsigned int __data;						\
	int ret = cfq_var_store(&__data, (page), count);		\
	if (__data < (MIN))						\
		__data = (MIN);						\
	else if (__data > (MAX))					\
		__data = (MAX);						\
	if (__CONV)							\
		*(__PTR) = msecs_to_jiffies(__data);			\
	else								\
		*(__PTR) = __data;					\
	return ret;							\
}
STORE_FUNCTION(cfq_quantum_store, &cfqd->cfq_quantum, 1, UINT_MAX, 0);
STORE_FUNCTION(cfq_fifo_expire_sync_store, &cfqd->cfq_fifo_expire[1], 1,
		UINT_MAX, 1);
STORE_FUNCTION(cfq_fifo_expire_async_store, &cfqd->cfq_fifo_expire[0], 1,
		UINT_MAX, 1);
STORE_FUNCTION(cfq_back_seek_max_store, &cfqd->cfq_back_max, 0, UINT_MAX, 0);
STORE_FUNCTION(cfq_back_seek_penalty_store, &cfqd->cfq_back_penalty, 1,
		UINT_MAX, 0);
STORE_FUNCTION(cfq_slice_idle_store, &cfqd->cfq_slice_idle, 0, UINT_MAX, 1);
STORE_FUNCTION(cfq_group_idle_store, &cfqd->cfq_group_idle, 0, UINT_MAX, 1);
STORE_FUNCTION(cfq_slice_sync_store, &cfqd->cfq_slice[1], 1, UINT_MAX, 1);
STORE_FUNCTION(cfq_slice_async_store, &cfqd->cfq_slice[0], 1, UINT_MAX, 1);
STORE_FUNCTION(cfq_slice_async_rq_store, &cfqd->cfq_slice_async_rq, 1,
		UINT_MAX, 0);
STORE_FUNCTION(cfq_low_latency_store, &cfqd->cfq_latency, 0, 1, 0);
#undef STORE_FUNCTION

#define CFQ_ATTR(name) \
	__ATTR(name, S_IRUGO|S_IWUSR, cfq_##name##_show, cfq_##name##_store)

static struct elv_fs_entry cfq_attrs[] = {
	CFQ_ATTR(quantum),
	CFQ_ATTR(fifo_expire_sync),
	CFQ_ATTR(fifo_expire_async),
	CFQ_ATTR(back_seek_max),
	CFQ_ATTR(back_seek_penalty),
	CFQ_ATTR(slice_sync),
	CFQ_ATTR(slice_async),
	CFQ_ATTR(slice_async_rq),
	CFQ_ATTR(slice_idle),
	CFQ_ATTR(group_idle),
	CFQ_ATTR(low_latency),
	__ATTR_NULL
};

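/* elevator hooks registered with the block layer */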
static struct elevator_type iosched_cfq = {
	.ops = {
		.elevator_merge_fn = 		cfq_merge,
		.elevator_merged_fn =		cfq_merged_request,
		.elevator_merge_req_fn =	cfq_merged_requests,
		.elevator_allow_merge_fn =	cfq_allow_merge,
		.elevator_bio_merged_fn =	cfq_bio_merged,
		.elevator_dispatch_fn =		cfq_dispatch_requests,
		.elevator_add_req_fn =		cfq_insert_request,
		.elevator_activate_req_fn =	cfq_activate_request,
		.elevator_deactivate_req_fn =	cfq_deactivate_request,
		.elevator_queue_empty_fn =	cfq_queue_empty,
		.elevator_completed_req_fn =	cfq_completed_request,
		.elevator_former_req_fn =	elv_rb_former_request,
		.elevator_latter_req_fn =	elv_rb_latter_request,
		.elevator_set_req_fn =		cfq_set_request,
		.elevator_put_req_fn =		cfq_put_request,
		.elevator_may_queue_fn =	cfq_may_queue,
		.elevator_init_fn =		cfq_init_queue,
		.elevator_exit_fn =		cfq_exit_queue,
		.trim =				cfq_free_io_context,
	},
	.elevator_attrs =	cfq_attrs,
	.elevator_name =	"cfq",
	.elevator_owner =	THIS_MODULE,
};

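/* blkio cgroup policy hooks: group unlink and weight updates */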
#ifdef CONFIG_CFQ_GROUP_IOSCHED
static struct blkio_policy_type blkio_policy_cfq = {
	.ops = {
		.blkio_unlink_group_fn =	cfq_unlink_blkio_group,
		.blkio_update_group_weight_fn =	cfq_update_blkio_group_weight,
	},
	.plid = BLKIO_POLICY_PROP,
};
#else
static struct blkio_policy_type blkio_policy_cfq;
#endif

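/*
 * Module init: fix up tunables that would round down to 0 on HZ < 1000,
 * set up the slab caches and register the elevator and blkio policy.
 */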
static int __init cfq_init(void)
{
	/*
	 * could be 0 on HZ < 1000 setups
	 */
	if (!cfq_slice_async)
		cfq_slice_async = 1;
	if (!cfq_slice_idle)
		cfq_slice_idle = 1;

#ifdef CONFIG_CFQ_GROUP_IOSCHED
	if (!cfq_group_idle)
		cfq_group_idle = 1;
#else
	cfq_group_idle = 0;
#endif
	if (cfq_slab_setup())
		return -ENOMEM;

	elv_register(&iosched_cfq);
	blkio_policy_register(&blkio_policy_cfq);

	return 0;
}

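/*
 * Module exit: unregister, then wait for all cfq_io_contexts to be
 * released before destroying the slab caches.
 */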
static void __exit cfq_exit(void)
{
	DECLARE_COMPLETION_ONSTACK(all_gone);
	blkio_policy_unregister(&blkio_policy_cfq);
	elv_unregister(&iosched_cfq);
	ioc_gone = &all_gone;
	/* ioc_gone's update must be visible before reading ioc_count */
	smp_wmb();

	/*
	 * this also protects us from entering cfq_slab_kill() with
	 * pending RCU callbacks
	 */
	if (elv_ioc_count_read(cfq_ioc_count))
		wait_for_completion(&all_gone);
	ida_destroy(&cic_index_ida);
	cfq_slab_kill();
}

module_init(cfq_init);
module_exit(cfq_exit);

MODULE_AUTHOR("Jens Axboe");
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Completely Fair Queueing IO scheduler");
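
/*
 * Usage sketch (assuming a kernel built with CONFIG_IOSCHED_CFQ): the
 * scheduler can typically be selected per device at runtime, e.g.
 *	echo cfq > /sys/block/<dev>/queue/scheduler
 */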