/*
 *  CFQ, or complete fairness queueing, disk scheduler.
 *
 *  Based on ideas from a previously unfinished io
 *  scheduler (round robin per-process disk scheduling) and Andrea Arcangeli.
 *
 *  Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
 */
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/sched/clock.h>
#include <linux/blkdev.h>
#include <linux/elevator.h>
#include <linux/ktime.h>
#include <linux/rbtree.h>
#include <linux/ioprio.h>
#include <linux/blktrace_api.h>
#include <linux/blk-cgroup.h>
#include "blk.h"
#include "blk-wbt.h"

/*
 * tunables
 */
/* max queue in one round of service */
static const int cfq_quantum = 8;
static const u64 cfq_fifo_expire[2] = { NSEC_PER_SEC / 4, NSEC_PER_SEC / 8 };
/* maximum backwards seek, in KiB */
static const int cfq_back_max = 16 * 1024;
/* penalty of a backwards seek */
static const int cfq_back_penalty = 2;
static const u64 cfq_slice_sync = NSEC_PER_SEC / 10;
static u64 cfq_slice_async = NSEC_PER_SEC / 25;
static const int cfq_slice_async_rq = 2;
static u64 cfq_slice_idle = NSEC_PER_SEC / 125;
static u64 cfq_group_idle = NSEC_PER_SEC / 125;
static const u64 cfq_target_latency = (u64)NSEC_PER_SEC * 3/10; /* 300 ms */
static const int cfq_hist_divisor = 4;

/*
 * offset from end of queue service tree for idle class
 */
#define CFQ_IDLE_DELAY		(NSEC_PER_SEC / 5)
/* offset from end of group service tree under time slice mode */
#define CFQ_SLICE_MODE_GROUP_DELAY (NSEC_PER_SEC / 5)
/* offset from end of group service under IOPS mode */
#define CFQ_IOPS_MODE_GROUP_DELAY (HZ / 5)

/*
 * below this threshold, we consider thinktime immediate
 */
#define CFQ_MIN_TT		(2 * NSEC_PER_SEC / HZ)

#define CFQ_SLICE_SCALE		(5)
#define CFQ_HW_QUEUE_MIN	(5)
#define CFQ_SERVICE_SHIFT       12

#define CFQQ_SEEK_THR		(sector_t)(8 * 100)
#define CFQQ_CLOSE_THR		(sector_t)(8 * 1024)
#define CFQQ_SECT_THR_NONROT	(sector_t)(2 * 32)
#define CFQQ_SEEKY(cfqq)	(hweight32(cfqq->seek_history) > 32/8)
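
/*
 * Note (added for clarity): ->seek_history is maintained elsewhere in this
 * file as a 32-sample shift register of "was this request far from the
 * previous one" bits, so CFQQ_SEEKY() flags a queue as seeky once more than
 * 32/8 = 4 of the last 32 requests exceeded the seek threshold.  The
 * thresholds above are in 512-byte sectors, e.g. CFQQ_SEEK_THR is
 * 8 * 100 = 800 sectors = 400 KiB.
 */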

#define RQ_CIC(rq)		icq_to_cic((rq)->elv.icq)
#define RQ_CFQQ(rq)		(struct cfq_queue *) ((rq)->elv.priv[0])
#define RQ_CFQG(rq)		(struct cfq_group *) ((rq)->elv.priv[1])

static struct kmem_cache *cfq_pool;

#define CFQ_PRIO_LISTS		IOPRIO_BE_NR
#define cfq_class_idle(cfqq)	((cfqq)->ioprio_class == IOPRIO_CLASS_IDLE)
#define cfq_class_rt(cfqq)	((cfqq)->ioprio_class == IOPRIO_CLASS_RT)

#define sample_valid(samples)	((samples) > 80)
#define rb_entry_cfqg(node)	rb_entry((node), struct cfq_group, rb_node)

/* blkio-related constants */
#define CFQ_WEIGHT_LEGACY_MIN	10
#define CFQ_WEIGHT_LEGACY_DFL	500
#define CFQ_WEIGHT_LEGACY_MAX	1000

struct cfq_ttime {
	u64 last_end_request;

	u64 ttime_total;
	u64 ttime_mean;
	unsigned long ttime_samples;
};

/*
 * Most of our rbtree usage is for sorting with min extraction, so
 * if we cache the leftmost node we don't have to walk down the tree
 * to find it. Idea borrowed from Ingo Molnar's CFS scheduler. We should
 * move this into the elevator for the rq sorting as well.
 */
struct cfq_rb_root {
	struct rb_root_cached rb;
	struct rb_node *rb_rightmost;
	unsigned count;
	u64 min_vdisktime;
	struct cfq_ttime ttime;
};
#define CFQ_RB_ROOT	(struct cfq_rb_root) { .rb = RB_ROOT_CACHED, \
			.rb_rightmost = NULL,			     \
			.ttime = {.last_end_request = ktime_get_ns(),},}

/*
 * Per process-grouping structure
 */
struct cfq_queue {
	/* reference count */
	int ref;
	/* various state flags, see below */
	unsigned int flags;
	/* parent cfq_data */
	struct cfq_data *cfqd;
	/* service_tree member */
	struct rb_node rb_node;
	/* service_tree key */
	u64 rb_key;
	/* prio tree member */
	struct rb_node p_node;
	/* prio tree root we belong to, if any */
	struct rb_root *p_root;
	/* sorted list of pending requests */
	struct rb_root sort_list;
	/* if fifo isn't expired, next request to serve */
	struct request *next_rq;
	/* requests queued in sort_list */
	int queued[2];
	/* currently allocated requests */
	int allocated[2];
	/* fifo list of requests in sort_list */
	struct list_head fifo;

	/* time when queue got scheduled in to dispatch first request. */
	u64 dispatch_start;
	u64 allocated_slice;
	u64 slice_dispatch;
	/* time when first request from queue completed and slice started. */
	u64 slice_start;
	u64 slice_end;
	s64 slice_resid;

	/* pending priority requests */
	int prio_pending;
	/* number of requests that are on the dispatch list or inside driver */
	int dispatched;

	/* io prio of this group */
	unsigned short ioprio, org_ioprio;
	unsigned short ioprio_class, org_ioprio_class;

	pid_t pid;

	u32 seek_history;
	sector_t last_request_pos;

	struct cfq_rb_root *service_tree;
	struct cfq_queue *new_cfqq;
	struct cfq_group *cfqg;
	/* Number of sectors dispatched from queue in single dispatch round */
	unsigned long nr_sectors;
};

/*
 * First index in the service_trees.
 * IDLE is handled separately, so it has negative index
 */
enum wl_class_t {
	BE_WORKLOAD = 0,
	RT_WORKLOAD = 1,
	IDLE_WORKLOAD = 2,
	CFQ_PRIO_NR,
};

/*
 * Second index in the service_trees.
 */
enum wl_type_t {
	ASYNC_WORKLOAD = 0,
	SYNC_NOIDLE_WORKLOAD = 1,
	SYNC_WORKLOAD = 2
};

struct cfqg_stats {
#ifdef CONFIG_CFQ_GROUP_IOSCHED
	/* number of ios merged */
	struct blkg_rwstat		merged;
	/* total time spent on device in ns, may not be accurate w/ queueing */
	struct blkg_rwstat		service_time;
	/* total time spent waiting in scheduler queue in ns */
	struct blkg_rwstat		wait_time;
	/* number of IOs queued up */
	struct blkg_rwstat		queued;
	/* total disk time and nr sectors dispatched by this group */
	struct blkg_stat		time;
#ifdef CONFIG_DEBUG_BLK_CGROUP
	/* time not charged to this cgroup */
	struct blkg_stat		unaccounted_time;
	/* sum of number of ios queued across all samples */
	struct blkg_stat		avg_queue_size_sum;
	/* count of samples taken for average */
	struct blkg_stat		avg_queue_size_samples;
	/* how many times this group has been removed from service tree */
	struct blkg_stat		dequeue;
	/* total time spent waiting for it to be assigned a timeslice. */
	struct blkg_stat		group_wait_time;
	/* time spent idling for this blkcg_gq */
	struct blkg_stat		idle_time;
	/* total time with empty current active q with other requests queued */
	struct blkg_stat		empty_time;
	/* fields after this shouldn't be cleared on stat reset */
	uint64_t			start_group_wait_time;
	uint64_t			start_idle_time;
	uint64_t			start_empty_time;
	uint16_t			flags;
#endif	/* CONFIG_DEBUG_BLK_CGROUP */
#endif	/* CONFIG_CFQ_GROUP_IOSCHED */
};

/* Per-cgroup data */
struct cfq_group_data {
	/* must be the first member */
	struct blkcg_policy_data cpd;

	unsigned int weight;
	unsigned int leaf_weight;
};

/* This is per cgroup per device grouping structure */
struct cfq_group {
	/* must be the first member */
	struct blkg_policy_data pd;

	/* group service_tree member */
	struct rb_node rb_node;

	/* group service_tree key */
	u64 vdisktime;

	/*
	 * The number of active cfqgs and sum of their weights under this
	 * cfqg.  This covers this cfqg's leaf_weight and all children's
	 * weights, but does not cover weights of further descendants.
	 *
	 * If a cfqg is on the service tree, it's active.  An active cfqg
	 * also activates its parent and contributes to the children_weight
	 * of the parent.
	 */
	int nr_active;
	unsigned int children_weight;

	/*
	 * vfraction is the fraction of vdisktime that the tasks in this
	 * cfqg are entitled to.  This is determined by compounding the
	 * ratios walking up from this cfqg to the root.
	 *
	 * It is in fixed point w/ CFQ_SERVICE_SHIFT and the sum of all
	 * vfractions on a service tree is approximately 1.  The sum may
	 * deviate a bit due to rounding errors and fluctuations caused by
	 * cfqgs entering and leaving the service tree.
	 */
	unsigned int vfraction;

	/*
	 * There are two weights - (internal) weight is the weight of this
	 * cfqg against the sibling cfqgs.  leaf_weight is the weight of
	 * this cfqg against the child cfqgs.  For the root cfqg, both
	 * weights are kept in sync for backward compatibility.
	 */
	unsigned int weight;
	unsigned int new_weight;
	unsigned int dev_weight;

	unsigned int leaf_weight;
	unsigned int new_leaf_weight;
	unsigned int dev_leaf_weight;

	/* number of cfqq currently on this group */
	int nr_cfqq;

	/*
	 * Per group busy queues average. Useful for workload slice calc. We
	 * create the array for each prio class but at run time it is used
	 * only for RT and BE class and slot for IDLE class remains unused.
	 * This is primarily done to avoid confusion and a gcc warning.
	 */
	unsigned int busy_queues_avg[CFQ_PRIO_NR];
	/*
	 * rr lists of queues with requests. We maintain service trees for
	 * RT and BE classes. These trees are subdivided in subclasses
	 * of SYNC, SYNC_NOIDLE and ASYNC based on workload type. For IDLE
	 * class there is no subclassification and all the cfq queues go on
	 * a single tree service_tree_idle.
	 * Counts are embedded in the cfq_rb_root
	 */
	struct cfq_rb_root service_trees[2][3];
	struct cfq_rb_root service_tree_idle;

	u64 saved_wl_slice;
	enum wl_type_t saved_wl_type;
	enum wl_class_t saved_wl_class;

	/* number of requests that are on the dispatch list or inside driver */
	int dispatched;
	struct cfq_ttime ttime;
	struct cfqg_stats stats;	/* stats for this cfqg */

	/* async queue for each priority case */
	struct cfq_queue *async_cfqq[2][IOPRIO_BE_NR];
	struct cfq_queue *async_idle_cfqq;
};

struct cfq_io_cq {
	struct io_cq		icq;		/* must be the first member */
	struct cfq_queue	*cfqq[2];
	struct cfq_ttime	ttime;
	int			ioprio;		/* the current ioprio */
#ifdef CONFIG_CFQ_GROUP_IOSCHED
	uint64_t		blkcg_serial_nr; /* the current blkcg serial */
#endif
};

/*
 * Per block device queue structure
 */
struct cfq_data {
	struct request_queue *queue;
	/* Root service tree for cfq_groups */
	struct cfq_rb_root grp_service_tree;
	struct cfq_group *root_group;

	/*
	 * The priority currently being served
	 */
	enum wl_class_t serving_wl_class;
	enum wl_type_t serving_wl_type;
	u64 workload_expires;
	struct cfq_group *serving_group;

	/*
	 * Each priority tree is sorted by next_request position.  These
	 * trees are used when determining if two or more queues are
	 * interleaving requests (see cfq_close_cooperator).
	 */
	struct rb_root prio_trees[CFQ_PRIO_LISTS];

	unsigned int busy_queues;
	unsigned int busy_sync_queues;

	int rq_in_driver;
	int rq_in_flight[2];

	/*
	 * queue-depth detection
	 */
	int rq_queued;
	int hw_tag;
	/*
	 * hw_tag can be
	 * -1 => indeterminate, (cfq will behave as if NCQ is present, to allow better detection)
	 *  1 => NCQ is present (hw_tag_est_depth is the estimated max depth)
	 *  0 => no NCQ
	 */
	int hw_tag_est_depth;
	unsigned int hw_tag_samples;

	/*
	 * idle window management
	 */
	struct hrtimer idle_slice_timer;
	struct work_struct unplug_work;

	struct cfq_queue *active_queue;
	struct cfq_io_cq *active_cic;

	sector_t last_position;

	/*
	 * tunables, see top of file
	 */
	unsigned int cfq_quantum;
	unsigned int cfq_back_penalty;
	unsigned int cfq_back_max;
	unsigned int cfq_slice_async_rq;
	unsigned int cfq_latency;
	u64 cfq_fifo_expire[2];
	u64 cfq_slice[2];
	u64 cfq_slice_idle;
	u64 cfq_group_idle;
	u64 cfq_target_latency;

	/*
	 * Fallback dummy cfqq for extreme OOM conditions
	 */
	struct cfq_queue oom_cfqq;

	u64 last_delayed_sync;
};

static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd);
static void cfq_put_queue(struct cfq_queue *cfqq);

static struct cfq_rb_root *st_for(struct cfq_group *cfqg,
					    enum wl_class_t class,
					    enum wl_type_t type)
{
	if (!cfqg)
		return NULL;

	if (class == IDLE_WORKLOAD)
		return &cfqg->service_tree_idle;

	return &cfqg->service_trees[class][type];
}

enum cfqq_state_flags {
	CFQ_CFQQ_FLAG_on_rr = 0,	/* on round-robin busy list */
	CFQ_CFQQ_FLAG_wait_request,	/* waiting for a request */
	CFQ_CFQQ_FLAG_must_dispatch,	/* must be allowed a dispatch */
	CFQ_CFQQ_FLAG_must_alloc_slice,	/* per-slice must_alloc flag */
	CFQ_CFQQ_FLAG_fifo_expire,	/* FIFO checked in this slice */
	CFQ_CFQQ_FLAG_idle_window,	/* slice idling enabled */
	CFQ_CFQQ_FLAG_prio_changed,	/* task priority has changed */
	CFQ_CFQQ_FLAG_slice_new,	/* no requests dispatched in slice */
	CFQ_CFQQ_FLAG_sync,		/* synchronous queue */
	CFQ_CFQQ_FLAG_coop,		/* cfqq is shared */
	CFQ_CFQQ_FLAG_split_coop,	/* shared cfqq will be split */
	CFQ_CFQQ_FLAG_deep,		/* sync cfqq experienced large depth */
	CFQ_CFQQ_FLAG_wait_busy,	/* Waiting for next request */
};

#define CFQ_CFQQ_FNS(name)						\
static inline void cfq_mark_cfqq_##name(struct cfq_queue *cfqq)		\
{									\
	(cfqq)->flags |= (1 << CFQ_CFQQ_FLAG_##name);			\
}									\
static inline void cfq_clear_cfqq_##name(struct cfq_queue *cfqq)	\
{									\
	(cfqq)->flags &= ~(1 << CFQ_CFQQ_FLAG_##name);			\
}									\
static inline int cfq_cfqq_##name(const struct cfq_queue *cfqq)		\
{									\
	return ((cfqq)->flags & (1 << CFQ_CFQQ_FLAG_##name)) != 0;	\
}

CFQ_CFQQ_FNS(on_rr);
CFQ_CFQQ_FNS(wait_request);
CFQ_CFQQ_FNS(must_dispatch);
CFQ_CFQQ_FNS(must_alloc_slice);
CFQ_CFQQ_FNS(fifo_expire);
CFQ_CFQQ_FNS(idle_window);
CFQ_CFQQ_FNS(prio_changed);
CFQ_CFQQ_FNS(slice_new);
CFQ_CFQQ_FNS(sync);
CFQ_CFQQ_FNS(coop);
CFQ_CFQQ_FNS(split_coop);
CFQ_CFQQ_FNS(deep);
CFQ_CFQQ_FNS(wait_busy);
#undef CFQ_CFQQ_FNS
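
/*
 * For illustration (added comment): CFQ_CFQQ_FNS(sync) above expands to
 * cfq_mark_cfqq_sync(), cfq_clear_cfqq_sync() and cfq_cfqq_sync(), which
 * set, clear and test the CFQ_CFQQ_FLAG_sync bit in cfqq->flags; the rest
 * of the file is expected to use these generated helpers rather than the
 * raw flag bits.
 */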

#if defined(CONFIG_CFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP)

/* cfqg stats flags */
enum cfqg_stats_flags {
	CFQG_stats_waiting = 0,
	CFQG_stats_idling,
	CFQG_stats_empty,
};

#define CFQG_FLAG_FNS(name)						\
static inline void cfqg_stats_mark_##name(struct cfqg_stats *stats)	\
{									\
	stats->flags |= (1 << CFQG_stats_##name);			\
}									\
static inline void cfqg_stats_clear_##name(struct cfqg_stats *stats)	\
{									\
	stats->flags &= ~(1 << CFQG_stats_##name);			\
}									\
static inline int cfqg_stats_##name(struct cfqg_stats *stats)		\
{									\
	return (stats->flags & (1 << CFQG_stats_##name)) != 0;		\
}									\

CFQG_FLAG_FNS(waiting)
CFQG_FLAG_FNS(idling)
CFQG_FLAG_FNS(empty)
#undef CFQG_FLAG_FNS

/* This should be called with the queue_lock held. */
static void cfqg_stats_update_group_wait_time(struct cfqg_stats *stats)
{
	unsigned long long now;

	if (!cfqg_stats_waiting(stats))
		return;

	now = sched_clock();
	if (time_after64(now, stats->start_group_wait_time))
		blkg_stat_add(&stats->group_wait_time,
			      now - stats->start_group_wait_time);
	cfqg_stats_clear_waiting(stats);
}

/* This should be called with the queue_lock held. */
static void cfqg_stats_set_start_group_wait_time(struct cfq_group *cfqg,
						 struct cfq_group *curr_cfqg)
{
	struct cfqg_stats *stats = &cfqg->stats;

	if (cfqg_stats_waiting(stats))
		return;
	if (cfqg == curr_cfqg)
		return;
	stats->start_group_wait_time = sched_clock();
	cfqg_stats_mark_waiting(stats);
}

/* This should be called with the queue_lock held. */
static void cfqg_stats_end_empty_time(struct cfqg_stats *stats)
{
	unsigned long long now;

	if (!cfqg_stats_empty(stats))
		return;

	now = sched_clock();
	if (time_after64(now, stats->start_empty_time))
		blkg_stat_add(&stats->empty_time,
			      now - stats->start_empty_time);
	cfqg_stats_clear_empty(stats);
}

static void cfqg_stats_update_dequeue(struct cfq_group *cfqg)
{
	blkg_stat_add(&cfqg->stats.dequeue, 1);
}

static void cfqg_stats_set_start_empty_time(struct cfq_group *cfqg)
{
	struct cfqg_stats *stats = &cfqg->stats;

	if (blkg_rwstat_total(&stats->queued))
		return;

	/*
	 * group is already marked empty. This can happen if cfqq got new
	 * request in parent group and moved to this group while being added
	 * to service tree. Just ignore the event and move on.
	 */
	if (cfqg_stats_empty(stats))
		return;

	stats->start_empty_time = sched_clock();
	cfqg_stats_mark_empty(stats);
}

static void cfqg_stats_update_idle_time(struct cfq_group *cfqg)
{
	struct cfqg_stats *stats = &cfqg->stats;

	if (cfqg_stats_idling(stats)) {
		unsigned long long now = sched_clock();

		if (time_after64(now, stats->start_idle_time))
			blkg_stat_add(&stats->idle_time,
				      now - stats->start_idle_time);
		cfqg_stats_clear_idling(stats);
	}
}

static void cfqg_stats_set_start_idle_time(struct cfq_group *cfqg)
{
	struct cfqg_stats *stats = &cfqg->stats;

	BUG_ON(cfqg_stats_idling(stats));

	stats->start_idle_time = sched_clock();
	cfqg_stats_mark_idling(stats);
}

static void cfqg_stats_update_avg_queue_size(struct cfq_group *cfqg)
{
	struct cfqg_stats *stats = &cfqg->stats;

	blkg_stat_add(&stats->avg_queue_size_sum,
		      blkg_rwstat_total(&stats->queued));
	blkg_stat_add(&stats->avg_queue_size_samples, 1);
	cfqg_stats_update_group_wait_time(stats);
}

#else	/* CONFIG_CFQ_GROUP_IOSCHED && CONFIG_DEBUG_BLK_CGROUP */

static inline void cfqg_stats_set_start_group_wait_time(struct cfq_group *cfqg, struct cfq_group *curr_cfqg) { }
static inline void cfqg_stats_end_empty_time(struct cfqg_stats *stats) { }
static inline void cfqg_stats_update_dequeue(struct cfq_group *cfqg) { }
static inline void cfqg_stats_set_start_empty_time(struct cfq_group *cfqg) { }
static inline void cfqg_stats_update_idle_time(struct cfq_group *cfqg) { }
static inline void cfqg_stats_set_start_idle_time(struct cfq_group *cfqg) { }
static inline void cfqg_stats_update_avg_queue_size(struct cfq_group *cfqg) { }

#endif	/* CONFIG_CFQ_GROUP_IOSCHED && CONFIG_DEBUG_BLK_CGROUP */

#ifdef CONFIG_CFQ_GROUP_IOSCHED

static inline struct cfq_group *pd_to_cfqg(struct blkg_policy_data *pd)
{
	return pd ? container_of(pd, struct cfq_group, pd) : NULL;
}

static struct cfq_group_data
*cpd_to_cfqgd(struct blkcg_policy_data *cpd)
{
	return cpd ? container_of(cpd, struct cfq_group_data, cpd) : NULL;
}

static inline struct blkcg_gq *cfqg_to_blkg(struct cfq_group *cfqg)
{
	return pd_to_blkg(&cfqg->pd);
}

static struct blkcg_policy blkcg_policy_cfq;

static inline struct cfq_group *blkg_to_cfqg(struct blkcg_gq *blkg)
{
	return pd_to_cfqg(blkg_to_pd(blkg, &blkcg_policy_cfq));
}

static struct cfq_group_data *blkcg_to_cfqgd(struct blkcg *blkcg)
{
	return cpd_to_cfqgd(blkcg_to_cpd(blkcg, &blkcg_policy_cfq));
}

static inline struct cfq_group *cfqg_parent(struct cfq_group *cfqg)
{
	struct blkcg_gq *pblkg = cfqg_to_blkg(cfqg)->parent;

	return pblkg ? blkg_to_cfqg(pblkg) : NULL;
}

static inline bool cfqg_is_descendant(struct cfq_group *cfqg,
				      struct cfq_group *ancestor)
{
	return cgroup_is_descendant(cfqg_to_blkg(cfqg)->blkcg->css.cgroup,
				    cfqg_to_blkg(ancestor)->blkcg->css.cgroup);
}

static inline void cfqg_get(struct cfq_group *cfqg)
{
	return blkg_get(cfqg_to_blkg(cfqg));
}

static inline void cfqg_put(struct cfq_group *cfqg)
{
	return blkg_put(cfqg_to_blkg(cfqg));
}

#define cfq_log_cfqq(cfqd, cfqq, fmt, args...)	do {			\
	blk_add_cgroup_trace_msg((cfqd)->queue,				\
			cfqg_to_blkg((cfqq)->cfqg)->blkcg,		\
			"cfq%d%c%c " fmt, (cfqq)->pid,			\
			cfq_cfqq_sync((cfqq)) ? 'S' : 'A',		\
			cfqq_type((cfqq)) == SYNC_NOIDLE_WORKLOAD ? 'N' : ' ',\
			  ##args);					\
} while (0)

#define cfq_log_cfqg(cfqd, cfqg, fmt, args...)	do {			\
	blk_add_cgroup_trace_msg((cfqd)->queue,				\
			cfqg_to_blkg(cfqg)->blkcg, fmt, ##args);	\
} while (0)

static inline void cfqg_stats_update_io_add(struct cfq_group *cfqg,
					    struct cfq_group *curr_cfqg,
					    unsigned int op)
{
	blkg_rwstat_add(&cfqg->stats.queued, op, 1);
	cfqg_stats_end_empty_time(&cfqg->stats);
	cfqg_stats_set_start_group_wait_time(cfqg, curr_cfqg);
}

static inline void cfqg_stats_update_timeslice_used(struct cfq_group *cfqg,
			uint64_t time, unsigned long unaccounted_time)
{
	blkg_stat_add(&cfqg->stats.time, time);
#ifdef CONFIG_DEBUG_BLK_CGROUP
	blkg_stat_add(&cfqg->stats.unaccounted_time, unaccounted_time);
#endif
}

static inline void cfqg_stats_update_io_remove(struct cfq_group *cfqg,
					       unsigned int op)
{
	blkg_rwstat_add(&cfqg->stats.queued, op, -1);
}

static inline void cfqg_stats_update_io_merged(struct cfq_group *cfqg,
					       unsigned int op)
{
	blkg_rwstat_add(&cfqg->stats.merged, op, 1);
}

static inline void cfqg_stats_update_completion(struct cfq_group *cfqg,
			uint64_t start_time, uint64_t io_start_time,
			unsigned int op)
{
	struct cfqg_stats *stats = &cfqg->stats;
	unsigned long long now = sched_clock();

	if (time_after64(now, io_start_time))
		blkg_rwstat_add(&stats->service_time, op, now - io_start_time);
	if (time_after64(io_start_time, start_time))
		blkg_rwstat_add(&stats->wait_time, op,
				io_start_time - start_time);
}

/* @stats = 0 */
static void cfqg_stats_reset(struct cfqg_stats *stats)
{
	/* queued stats shouldn't be cleared */
	blkg_rwstat_reset(&stats->merged);
	blkg_rwstat_reset(&stats->service_time);
	blkg_rwstat_reset(&stats->wait_time);
	blkg_stat_reset(&stats->time);
#ifdef CONFIG_DEBUG_BLK_CGROUP
	blkg_stat_reset(&stats->unaccounted_time);
	blkg_stat_reset(&stats->avg_queue_size_sum);
	blkg_stat_reset(&stats->avg_queue_size_samples);
	blkg_stat_reset(&stats->dequeue);
	blkg_stat_reset(&stats->group_wait_time);
	blkg_stat_reset(&stats->idle_time);
	blkg_stat_reset(&stats->empty_time);
#endif
}

/* @to += @from */
static void cfqg_stats_add_aux(struct cfqg_stats *to, struct cfqg_stats *from)
{
	/* queued stats shouldn't be carried over */
	blkg_rwstat_add_aux(&to->merged, &from->merged);
	blkg_rwstat_add_aux(&to->service_time, &from->service_time);
	blkg_rwstat_add_aux(&to->wait_time, &from->wait_time);
	blkg_stat_add_aux(&to->time, &from->time);
#ifdef CONFIG_DEBUG_BLK_CGROUP
	blkg_stat_add_aux(&to->unaccounted_time, &from->unaccounted_time);
	blkg_stat_add_aux(&to->avg_queue_size_sum, &from->avg_queue_size_sum);
	blkg_stat_add_aux(&to->avg_queue_size_samples, &from->avg_queue_size_samples);
	blkg_stat_add_aux(&to->dequeue, &from->dequeue);
	blkg_stat_add_aux(&to->group_wait_time, &from->group_wait_time);
	blkg_stat_add_aux(&to->idle_time, &from->idle_time);
	blkg_stat_add_aux(&to->empty_time, &from->empty_time);
#endif
}

/*
 * Transfer @cfqg's stats to its parent's aux counts so that the ancestors'
 * recursive stats can still account for the amount used by this cfqg after
 * it's gone.
 */
static void cfqg_stats_xfer_dead(struct cfq_group *cfqg)
{
	struct cfq_group *parent = cfqg_parent(cfqg);

	lockdep_assert_held(cfqg_to_blkg(cfqg)->q->queue_lock);

	if (unlikely(!parent))
		return;

	cfqg_stats_add_aux(&parent->stats, &cfqg->stats);
	cfqg_stats_reset(&cfqg->stats);
}

#else	/* CONFIG_CFQ_GROUP_IOSCHED */

static inline struct cfq_group *cfqg_parent(struct cfq_group *cfqg) { return NULL; }
static inline bool cfqg_is_descendant(struct cfq_group *cfqg,
				      struct cfq_group *ancestor)
{
	return true;
}
static inline void cfqg_get(struct cfq_group *cfqg) { }
static inline void cfqg_put(struct cfq_group *cfqg) { }

#define cfq_log_cfqq(cfqd, cfqq, fmt, args...)	\
	blk_add_trace_msg((cfqd)->queue, "cfq%d%c%c " fmt, (cfqq)->pid,	\
			cfq_cfqq_sync((cfqq)) ? 'S' : 'A',		\
			cfqq_type((cfqq)) == SYNC_NOIDLE_WORKLOAD ? 'N' : ' ',\
				##args)
#define cfq_log_cfqg(cfqd, cfqg, fmt, args...)		do {} while (0)

static inline void cfqg_stats_update_io_add(struct cfq_group *cfqg,
			struct cfq_group *curr_cfqg, unsigned int op) { }
static inline void cfqg_stats_update_timeslice_used(struct cfq_group *cfqg,
			uint64_t time, unsigned long unaccounted_time) { }
static inline void cfqg_stats_update_io_remove(struct cfq_group *cfqg,
			unsigned int op) { }
static inline void cfqg_stats_update_io_merged(struct cfq_group *cfqg,
			unsigned int op) { }
static inline void cfqg_stats_update_completion(struct cfq_group *cfqg,
			uint64_t start_time, uint64_t io_start_time,
			unsigned int op) { }

#endif	/* CONFIG_CFQ_GROUP_IOSCHED */

#define cfq_log(cfqd, fmt, args...)	\
	blk_add_trace_msg((cfqd)->queue, "cfq " fmt, ##args)

/* Traverses through cfq group service trees */
#define for_each_cfqg_st(cfqg, i, j, st) \
	for (i = 0; i <= IDLE_WORKLOAD; i++) \
		for (j = 0, st = i < IDLE_WORKLOAD ? &cfqg->service_trees[i][j]\
			: &cfqg->service_tree_idle; \
			(i < IDLE_WORKLOAD && j <= SYNC_WORKLOAD) || \
			(i == IDLE_WORKLOAD && j == 0); \
			j++, st = i < IDLE_WORKLOAD ? \
			&cfqg->service_trees[i][j]: NULL) \

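/*
 * For illustration (added comment): with the enums defined earlier, the
 * for_each_cfqg_st() macro above visits seven trees per group, in order:
 * service_trees[BE_WORKLOAD][ASYNC/SYNC_NOIDLE/SYNC], then
 * service_trees[RT_WORKLOAD][ASYNC/SYNC_NOIDLE/SYNC], and finally the
 * single service_tree_idle for the IDLE class.
 */
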
static inline bool cfq_io_thinktime_big(struct cfq_data *cfqd,
	struct cfq_ttime *ttime, bool group_idle)
{
	u64 slice;

	if (!sample_valid(ttime->ttime_samples))
		return false;
	if (group_idle)
		slice = cfqd->cfq_group_idle;
	else
		slice = cfqd->cfq_slice_idle;
	return ttime->ttime_mean > slice;
}

static inline bool iops_mode(struct cfq_data *cfqd)
{
	/*
	 * If we are not idling on queues and it is a NCQ drive, parallel
	 * execution of requests is on and measuring time is not possible
	 * in most of the cases until and unless we drive shallower queue
	 * depths and that becomes a performance bottleneck. In such cases
	 * switch to start providing fairness in terms of number of IOs.
	 */
	if (!cfqd->cfq_slice_idle && cfqd->hw_tag)
		return true;
	else
		return false;
}

static inline enum wl_class_t cfqq_class(struct cfq_queue *cfqq)
{
	if (cfq_class_idle(cfqq))
		return IDLE_WORKLOAD;
	if (cfq_class_rt(cfqq))
		return RT_WORKLOAD;
	return BE_WORKLOAD;
}

static enum wl_type_t cfqq_type(struct cfq_queue *cfqq)
{
	if (!cfq_cfqq_sync(cfqq))
		return ASYNC_WORKLOAD;
	if (!cfq_cfqq_idle_window(cfqq))
		return SYNC_NOIDLE_WORKLOAD;
	return SYNC_WORKLOAD;
}

static inline int cfq_group_busy_queues_wl(enum wl_class_t wl_class,
					struct cfq_data *cfqd,
					struct cfq_group *cfqg)
{
	if (wl_class == IDLE_WORKLOAD)
		return cfqg->service_tree_idle.count;

	return cfqg->service_trees[wl_class][ASYNC_WORKLOAD].count +
		cfqg->service_trees[wl_class][SYNC_NOIDLE_WORKLOAD].count +
		cfqg->service_trees[wl_class][SYNC_WORKLOAD].count;
}

static inline int cfqg_busy_async_queues(struct cfq_data *cfqd,
					struct cfq_group *cfqg)
{
	return cfqg->service_trees[RT_WORKLOAD][ASYNC_WORKLOAD].count +
		cfqg->service_trees[BE_WORKLOAD][ASYNC_WORKLOAD].count;
}

static void cfq_dispatch_insert(struct request_queue *, struct request *);
static struct cfq_queue *cfq_get_queue(struct cfq_data *cfqd, bool is_sync,
				       struct cfq_io_cq *cic, struct bio *bio);

static inline struct cfq_io_cq *icq_to_cic(struct io_cq *icq)
{
	/* cic->icq is the first member, %NULL will convert to %NULL */
	return container_of(icq, struct cfq_io_cq, icq);
}

static inline struct cfq_io_cq *cfq_cic_lookup(struct cfq_data *cfqd,
					       struct io_context *ioc)
{
	if (ioc)
		return icq_to_cic(ioc_lookup_icq(ioc, cfqd->queue));
	return NULL;
}

static inline struct cfq_queue *cic_to_cfqq(struct cfq_io_cq *cic, bool is_sync)
{
	return cic->cfqq[is_sync];
}

static inline void cic_set_cfqq(struct cfq_io_cq *cic, struct cfq_queue *cfqq,
				bool is_sync)
{
	cic->cfqq[is_sync] = cfqq;
}

static inline struct cfq_data *cic_to_cfqd(struct cfq_io_cq *cic)
{
	return cic->icq.q->elevator->elevator_data;
}

/*
 * scheduler run of queue, if there are requests pending and no one in the
 * driver that will restart queueing
 */
static inline void cfq_schedule_dispatch(struct cfq_data *cfqd)
{
	if (cfqd->busy_queues) {
		cfq_log(cfqd, "schedule dispatch");
		kblockd_schedule_work(&cfqd->unplug_work);
	}
}

/*
 * Scale schedule slice based on io priority. Use the sync time slice only
 * if a queue is marked sync and has sync io queued. A sync queue with async
 * io only, should not get full sync slice length.
 */
static inline u64 cfq_prio_slice(struct cfq_data *cfqd, bool sync,
				 unsigned short prio)
{
	u64 base_slice = cfqd->cfq_slice[sync];
	u64 slice = div_u64(base_slice, CFQ_SLICE_SCALE);

	WARN_ON(prio >= IOPRIO_BE_NR);

	return base_slice + (slice * (4 - prio));
}

static inline u64
cfq_prio_to_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
{
	return cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio);
}
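
/*
 * Worked example (added comment): with the default sync slice of
 * NSEC_PER_SEC / 10 (100 ms) and CFQ_SLICE_SCALE of 5, one priority "step"
 * is 20 ms, so a sync queue gets 100 + 20 * (4 - prio) ms: 180 ms at
 * ioprio 0, 100 ms at ioprio 4 and 40 ms at ioprio 7.
 */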

/**
 * cfqg_scale_charge - scale disk time charge according to cfqg weight
 * @charge: disk time being charged
 * @vfraction: vfraction of the cfqg, fixed point w/ CFQ_SERVICE_SHIFT
 *
 * Scale @charge according to @vfraction, which is in range (0, 1].  The
 * scaling is inversely proportional.
 *
 * scaled = charge / vfraction
 *
 * The result is also in fixed point w/ CFQ_SERVICE_SHIFT.
 */
static inline u64 cfqg_scale_charge(u64 charge,
				    unsigned int vfraction)
{
	u64 c = charge << CFQ_SERVICE_SHIFT;	/* make it fixed point */

	/* charge / vfraction */
	c <<= CFQ_SERVICE_SHIFT;
	return div_u64(c, vfraction);
}
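
/*
 * Worked example (added comment): the double shift above is deliberate.
 * @vfraction carries CFQ_SERVICE_SHIFT bits of fraction and the returned
 * vdisktime delta keeps CFQ_SERVICE_SHIFT bits as well, so the result is
 * (charge << 2 * CFQ_SERVICE_SHIFT) / vfraction.  A group entitled to the
 * whole device (vfraction == 1 << 12) is charged charge << 12; a group
 * entitled to half of it (vfraction == 1 << 11) is charged twice that, so
 * its vdisktime advances twice as fast.
 */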

static inline u64 max_vdisktime(u64 min_vdisktime, u64 vdisktime)
{
	s64 delta = (s64)(vdisktime - min_vdisktime);
	if (delta > 0)
		min_vdisktime = vdisktime;

	return min_vdisktime;
}

static void update_min_vdisktime(struct cfq_rb_root *st)
{
	if (!RB_EMPTY_ROOT(&st->rb.rb_root)) {
		struct cfq_group *cfqg = rb_entry_cfqg(st->rb.rb_leftmost);

		st->min_vdisktime = max_vdisktime(st->min_vdisktime,
						  cfqg->vdisktime);
	}
}

/*
 * get averaged number of queues of RT/BE priority.
 * average is updated, with a formula that gives more weight to higher numbers,
 * to quickly follow sudden increases and decrease slowly
 */
static inline unsigned cfq_group_get_avg_queues(struct cfq_data *cfqd,
					struct cfq_group *cfqg, bool rt)
{
	unsigned min_q, max_q;
	unsigned mult  = cfq_hist_divisor - 1;
	unsigned round = cfq_hist_divisor / 2;
	unsigned busy = cfq_group_busy_queues_wl(rt, cfqd, cfqg);

	min_q = min(cfqg->busy_queues_avg[rt], busy);
	max_q = max(cfqg->busy_queues_avg[rt], busy);
	cfqg->busy_queues_avg[rt] = (mult * max_q + min_q + round) /
		cfq_hist_divisor;
	return cfqg->busy_queues_avg[rt];
}
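
/*
 * Worked example (added comment): with cfq_hist_divisor == 4 the new average
 * is (3 * max + min + 2) / 4, where max/min are taken over the old average
 * and the current busy count.  If the average was 1 and 5 queues suddenly
 * become busy, the next value is (15 + 1 + 2) / 4 = 4, i.e. the estimate
 * ramps up almost immediately; when they go idle again it only decays one
 * step per update ((12 + 0 + 2) / 4 = 3, and so on).
 */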

static inline u64
cfq_group_slice(struct cfq_data *cfqd, struct cfq_group *cfqg)
{
	return cfqd->cfq_target_latency * cfqg->vfraction >> CFQ_SERVICE_SHIFT;
}

static inline u64
cfq_scaled_cfqq_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
{
	u64 slice = cfq_prio_to_slice(cfqd, cfqq);
	if (cfqd->cfq_latency) {
		/*
		 * interested queues (we consider only the ones with the same
		 * priority class in the cfq group)
		 */
		unsigned iq = cfq_group_get_avg_queues(cfqd, cfqq->cfqg,
						cfq_class_rt(cfqq));
		u64 sync_slice = cfqd->cfq_slice[1];
		u64 expect_latency = sync_slice * iq;
		u64 group_slice = cfq_group_slice(cfqd, cfqq->cfqg);

		if (expect_latency > group_slice) {
			u64 base_low_slice = 2 * cfqd->cfq_slice_idle;
			u64 low_slice;

			/* scale low_slice according to IO priority
			 * and sync vs async */
			low_slice = div64_u64(base_low_slice*slice, sync_slice);
			low_slice = min(slice, low_slice);
			/* the adapted slice value is scaled to fit all iqs
			 * into the target latency */
			slice = div64_u64(slice*group_slice, expect_latency);
			slice = max(slice, low_slice);
		}
	}
	return slice;
}
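
/*
 * Worked example (added comment): if the group's slice is 300 ms, six sync
 * queues are busy in the class and the sync base slice is 100 ms, the
 * expected latency is 600 ms, so each queue's slice is roughly halved
 * (slice * 300 / 600), subject to the low_slice floor computed above.
 */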

static inline void
cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
{
	u64 slice = cfq_scaled_cfqq_slice(cfqd, cfqq);
	u64 now = ktime_get_ns();

	cfqq->slice_start = now;
	cfqq->slice_end = now + slice;
	cfqq->allocated_slice = slice;
	cfq_log_cfqq(cfqd, cfqq, "set_slice=%llu", cfqq->slice_end - now);
}

/*
 * We need to wrap this check in cfq_cfqq_slice_new(), since ->slice_end
 * isn't valid until the first request from the dispatch is activated
 * and the slice time set.
 */
static inline bool cfq_slice_used(struct cfq_queue *cfqq)
{
	if (cfq_cfqq_slice_new(cfqq))
		return false;
	if (ktime_get_ns() < cfqq->slice_end)
		return false;

	return true;
}

/*
 * Lifted from AS - choose which of rq1 and rq2 that is best served now.
 * We choose the request that is closest to the head right now. Distance
 * behind the head is penalized and only allowed to a certain extent.
 */
static struct request *
cfq_choose_req(struct cfq_data *cfqd, struct request *rq1, struct request *rq2, sector_t last)
{
	sector_t s1, s2, d1 = 0, d2 = 0;
	unsigned long back_max;
#define CFQ_RQ1_WRAP	0x01 /* request 1 wraps */
#define CFQ_RQ2_WRAP	0x02 /* request 2 wraps */
	unsigned wrap = 0; /* bit mask: requests behind the disk head? */

	if (rq1 == NULL || rq1 == rq2)
		return rq2;
	if (rq2 == NULL)
		return rq1;

	if (rq_is_sync(rq1) != rq_is_sync(rq2))
		return rq_is_sync(rq1) ? rq1 : rq2;

	if ((rq1->cmd_flags ^ rq2->cmd_flags) & REQ_PRIO)
		return rq1->cmd_flags & REQ_PRIO ? rq1 : rq2;

	s1 = blk_rq_pos(rq1);
	s2 = blk_rq_pos(rq2);

	/*
	 * by definition, 1KiB is 2 sectors
	 */
	back_max = cfqd->cfq_back_max * 2;

	/*
	 * Strict one way elevator _except_ in the case where we allow
	 * short backward seeks which are biased as twice the cost of a
	 * similar forward seek.
	 */
	if (s1 >= last)
		d1 = s1 - last;
	else if (s1 + back_max >= last)
		d1 = (last - s1) * cfqd->cfq_back_penalty;
	else
		wrap |= CFQ_RQ1_WRAP;

	if (s2 >= last)
		d2 = s2 - last;
	else if (s2 + back_max >= last)
		d2 = (last - s2) * cfqd->cfq_back_penalty;
	else
		wrap |= CFQ_RQ2_WRAP;

	/* Found required data */

	/*
	 * By doing switch() on the bit mask "wrap" we avoid having to
	 * check two variables for all permutations: --> faster!
	 */
	switch (wrap) {
	case 0: /* common case for CFQ: rq1 and rq2 not wrapped */
		if (d1 < d2)
			return rq1;
		else if (d2 < d1)
			return rq2;
		else {
			if (s1 >= s2)
				return rq1;
			else
				return rq2;
		}

	case CFQ_RQ2_WRAP:
		return rq1;
	case CFQ_RQ1_WRAP:
		return rq2;
	case (CFQ_RQ1_WRAP|CFQ_RQ2_WRAP): /* both rqs wrapped */
	default:
		/*
		 * Since both rqs are wrapped,
		 * start with the one that's further behind head
		 * (--> only *one* back seek required),
		 * since back seek takes more time than forward.
		 */
		if (s1 <= s2)
			return rq1;
		else
			return rq2;
	}
}
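
/*
 * Worked example (added comment): with the default cfq_back_max of 16384 KiB
 * (32768 sectors) and cfq_back_penalty of 2, a request 1000 sectors behind
 * the head gets an effective distance of 2000 sectors, so a request up to
 * 2000 sectors ahead of the head would still be preferred over it.
 */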

static struct cfq_queue *cfq_rb_first(struct cfq_rb_root *root)
{
	/* Service tree is empty */
	if (!root->count)
		return NULL;

	return rb_entry(rb_first_cached(&root->rb), struct cfq_queue, rb_node);
}

static struct cfq_group *cfq_rb_first_group(struct cfq_rb_root *root)
{
	return rb_entry_cfqg(rb_first_cached(&root->rb));
}

static void cfq_rb_erase(struct rb_node *n, struct cfq_rb_root *root)
{
	if (root->rb_rightmost == n)
		root->rb_rightmost = rb_prev(n);

	rb_erase_cached(n, &root->rb);
	RB_CLEAR_NODE(n);

	--root->count;
}

/*
 * would be nice to take fifo expire time into account as well
 */
static struct request *
cfq_find_next_rq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
		  struct request *last)
{
	struct rb_node *rbnext = rb_next(&last->rb_node);
	struct rb_node *rbprev = rb_prev(&last->rb_node);
	struct request *next = NULL, *prev = NULL;

	BUG_ON(RB_EMPTY_NODE(&last->rb_node));

	if (rbprev)
		prev = rb_entry_rq(rbprev);

	if (rbnext)
		next = rb_entry_rq(rbnext);
	else {
		rbnext = rb_first(&cfqq->sort_list);
		if (rbnext && rbnext != &last->rb_node)
			next = rb_entry_rq(rbnext);
	}

	return cfq_choose_req(cfqd, next, prev, blk_rq_pos(last));
}

static u64 cfq_slice_offset(struct cfq_data *cfqd,
			    struct cfq_queue *cfqq)
{
	/*
	 * just an approximation, should be ok.
	 */
	return (cfqq->cfqg->nr_cfqq - 1) * (cfq_prio_slice(cfqd, 1, 0) -
		       cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio));
}

static inline s64
cfqg_key(struct cfq_rb_root *st, struct cfq_group *cfqg)
{
	return cfqg->vdisktime - st->min_vdisktime;
}

static void
__cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg)
{
	struct rb_node **node = &st->rb.rb_root.rb_node;
	struct rb_node *parent = NULL;
	struct cfq_group *__cfqg;
	s64 key = cfqg_key(st, cfqg);
	bool leftmost = true, rightmost = true;

	while (*node != NULL) {
		parent = *node;
		__cfqg = rb_entry_cfqg(parent);

		if (key < cfqg_key(st, __cfqg)) {
			node = &parent->rb_left;
			rightmost = false;
		} else {
			node = &parent->rb_right;
			leftmost = false;
		}
	}

	if (rightmost)
		st->rb_rightmost = &cfqg->rb_node;

	rb_link_node(&cfqg->rb_node, parent, node);
	rb_insert_color_cached(&cfqg->rb_node, &st->rb, leftmost);
}

/*
 * This has to be called only on activation of cfqg
 */
static void
cfq_update_group_weight(struct cfq_group *cfqg)
{
	if (cfqg->new_weight) {
		cfqg->weight = cfqg->new_weight;
		cfqg->new_weight = 0;
	}
}

static void
cfq_update_group_leaf_weight(struct cfq_group *cfqg)
{
	BUG_ON(!RB_EMPTY_NODE(&cfqg->rb_node));

	if (cfqg->new_leaf_weight) {
		cfqg->leaf_weight = cfqg->new_leaf_weight;
		cfqg->new_leaf_weight = 0;
	}
}

static void
cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg)
{
	unsigned int vfr = 1 << CFQ_SERVICE_SHIFT;	/* start with 1 */
	struct cfq_group *pos = cfqg;
	struct cfq_group *parent;
	bool propagate;

	/* add to the service tree */
	BUG_ON(!RB_EMPTY_NODE(&cfqg->rb_node));

	/*
	 * Update leaf_weight.  We cannot update weight at this point
	 * because cfqg might already have been activated and is
	 * contributing its current weight to the parent's child_weight.
	 */
	cfq_update_group_leaf_weight(cfqg);
	__cfq_group_service_tree_add(st, cfqg);

	/*
	 * Activate @cfqg and calculate the portion of vfraction @cfqg is
	 * entitled to.  vfraction is calculated by walking the tree
	 * towards the root calculating the fraction it has at each level.
	 * The compounded ratio is how much vfraction @cfqg owns.
	 *
	 * Start with the proportion the tasks in this cfqg have against
	 * active children cfqgs - its leaf_weight against children_weight.
	 */
	propagate = !pos->nr_active++;
	pos->children_weight += pos->leaf_weight;
	vfr = vfr * pos->leaf_weight / pos->children_weight;

	/*
	 * Compound ->weight walking up the tree.  Both activation and
	 * vfraction calculation are done in the same loop.  Propagation
	 * stops once an already activated node is met.  vfraction
	 * calculation should always continue to the root.
	 */
	while ((parent = cfqg_parent(pos))) {
		if (propagate) {
			cfq_update_group_weight(pos);
			propagate = !parent->nr_active++;
			parent->children_weight += pos->weight;
		}
		vfr = vfr * pos->weight / parent->children_weight;
		pos = parent;
	}

	cfqg->vfraction = max_t(unsigned, vfr, 1);
}
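
/*
 * Worked example (added comment): suppose this cfqg's leaf_weight is 500 and,
 * after activation, its own children_weight is 1000; the first step leaves
 * vfr at 1/2.  If its weight is then 500 out of a parent children_weight of
 * 2000, vfr becomes 1/8, i.e. vfraction ends up at roughly
 * (1 << CFQ_SERVICE_SHIFT) / 8 = 512 in fixed point.
 */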

static inline u64 cfq_get_cfqg_vdisktime_delay(struct cfq_data *cfqd)
{
	if (!iops_mode(cfqd))
		return CFQ_SLICE_MODE_GROUP_DELAY;
	else
		return CFQ_IOPS_MODE_GROUP_DELAY;
}

static void
cfq_group_notify_queue_add(struct cfq_data *cfqd, struct cfq_group *cfqg)
{
	struct cfq_rb_root *st = &cfqd->grp_service_tree;
	struct cfq_group *__cfqg;
	struct rb_node *n;

	cfqg->nr_cfqq++;
	if (!RB_EMPTY_NODE(&cfqg->rb_node))
		return;

	/*
	 * Currently put the group at the end. Later implement something
	 * so that groups get lesser vtime based on their weights, so that
	 * a group does not lose everything if it was not continuously
	 * backlogged.
	 */
	n = st->rb_rightmost;
	if (n) {
		__cfqg = rb_entry_cfqg(n);
		cfqg->vdisktime = __cfqg->vdisktime +
			cfq_get_cfqg_vdisktime_delay(cfqd);
	} else
		cfqg->vdisktime = st->min_vdisktime;
	cfq_group_service_tree_add(st, cfqg);
}

static void
cfq_group_service_tree_del(struct cfq_rb_root *st, struct cfq_group *cfqg)
{
	struct cfq_group *pos = cfqg;
	bool propagate;

	/*
	 * Undo activation from cfq_group_service_tree_add().  Deactivate
	 * @cfqg and propagate deactivation upwards.
	 */
	propagate = !--pos->nr_active;
	pos->children_weight -= pos->leaf_weight;

	while (propagate) {
		struct cfq_group *parent = cfqg_parent(pos);

		/* @pos has 0 nr_active at this point */
		WARN_ON_ONCE(pos->children_weight);
		pos->vfraction = 0;

		if (!parent)
			break;

		propagate = !--parent->nr_active;
		parent->children_weight -= pos->weight;
		pos = parent;
	}

	/* remove from the service tree */
	if (!RB_EMPTY_NODE(&cfqg->rb_node))
		cfq_rb_erase(&cfqg->rb_node, st);
}

static void
cfq_group_notify_queue_del(struct cfq_data *cfqd, struct cfq_group *cfqg)
{
	struct cfq_rb_root *st = &cfqd->grp_service_tree;

	BUG_ON(cfqg->nr_cfqq < 1);
	cfqg->nr_cfqq--;

	/* If there are other cfq queues under this group, don't delete it */
	if (cfqg->nr_cfqq)
		return;

	cfq_log_cfqg(cfqd, cfqg, "del_from_rr group");
	cfq_group_service_tree_del(st, cfqg);
	cfqg->saved_wl_slice = 0;
	cfqg_stats_update_dequeue(cfqg);
}

static inline u64 cfq_cfqq_slice_usage(struct cfq_queue *cfqq,
				       u64 *unaccounted_time)
{
	u64 slice_used;
	u64 now = ktime_get_ns();

	/*
	 * Queue got expired before even a single request completed or
	 * got expired immediately after first request completion.
	 */
	if (!cfqq->slice_start || cfqq->slice_start == now) {
		/*
		 * Also charge the seek time incurred to the group, otherwise
		 * if there are multiple queues in the group, each can dispatch
		 * a single request on seeky media and cause lots of seek time
		 * and group will never know it.
		 */
		slice_used = max_t(u64, (now - cfqq->dispatch_start),
					jiffies_to_nsecs(1));
	} else {
		slice_used = now - cfqq->slice_start;
		if (slice_used > cfqq->allocated_slice) {
			*unaccounted_time = slice_used - cfqq->allocated_slice;
			slice_used = cfqq->allocated_slice;
		}
		if (cfqq->slice_start > cfqq->dispatch_start)
			*unaccounted_time += cfqq->slice_start -
					cfqq->dispatch_start;
	}

	return slice_used;
}

static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
				struct cfq_queue *cfqq)
{
	struct cfq_rb_root *st = &cfqd->grp_service_tree;
	u64 used_sl, charge, unaccounted_sl = 0;
	int nr_sync = cfqg->nr_cfqq - cfqg_busy_async_queues(cfqd, cfqg)
			- cfqg->service_tree_idle.count;
	unsigned int vfr;
	u64 now = ktime_get_ns();

	BUG_ON(nr_sync < 0);
	used_sl = charge = cfq_cfqq_slice_usage(cfqq, &unaccounted_sl);

	if (iops_mode(cfqd))
		charge = cfqq->slice_dispatch;
	else if (!cfq_cfqq_sync(cfqq) && !nr_sync)
		charge = cfqq->allocated_slice;

	/*
	 * Can't update vdisktime while on service tree and cfqg->vfraction
	 * is valid only while on it.  Cache vfr, leave the service tree,
	 * update vdisktime and go back on.  The re-addition to the tree
	 * will also update the weights as necessary.
	 */
	vfr = cfqg->vfraction;
	cfq_group_service_tree_del(st, cfqg);
	cfqg->vdisktime += cfqg_scale_charge(charge, vfr);
	cfq_group_service_tree_add(st, cfqg);

	/* This group is being expired. Save the context */
	if (cfqd->workload_expires > now) {
		cfqg->saved_wl_slice = cfqd->workload_expires - now;
		cfqg->saved_wl_type = cfqd->serving_wl_type;
		cfqg->saved_wl_class = cfqd->serving_wl_class;
	} else
		cfqg->saved_wl_slice = 0;

	cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu", cfqg->vdisktime,
					st->min_vdisktime);
	cfq_log_cfqq(cfqq->cfqd, cfqq,
		     "sl_used=%llu disp=%llu charge=%llu iops=%u sect=%lu",
		     used_sl, cfqq->slice_dispatch, charge,
		     iops_mode(cfqd), cfqq->nr_sectors);
	cfqg_stats_update_timeslice_used(cfqg, used_sl, unaccounted_sl);
	cfqg_stats_set_start_empty_time(cfqg);
}

/**
 * cfq_init_cfqg_base - initialize base part of a cfq_group
 * @cfqg: cfq_group to initialize
 *
 * Initialize the base part which is used whether %CONFIG_CFQ_GROUP_IOSCHED
 * is enabled or not.
 */
static void cfq_init_cfqg_base(struct cfq_group *cfqg)
{
	struct cfq_rb_root *st;
	int i, j;

	for_each_cfqg_st(cfqg, i, j, st)
		*st = CFQ_RB_ROOT;
	RB_CLEAR_NODE(&cfqg->rb_node);

	cfqg->ttime.last_end_request = ktime_get_ns();
}

#ifdef CONFIG_CFQ_GROUP_IOSCHED
static int __cfq_set_weight(struct cgroup_subsys_state *css, u64 val,
			    bool on_dfl, bool reset_dev, bool is_leaf_weight);

static void cfqg_stats_exit(struct cfqg_stats *stats)
{
	blkg_rwstat_exit(&stats->merged);
	blkg_rwstat_exit(&stats->service_time);
	blkg_rwstat_exit(&stats->wait_time);
	blkg_rwstat_exit(&stats->queued);
	blkg_stat_exit(&stats->time);
#ifdef CONFIG_DEBUG_BLK_CGROUP
	blkg_stat_exit(&stats->unaccounted_time);
	blkg_stat_exit(&stats->avg_queue_size_sum);
	blkg_stat_exit(&stats->avg_queue_size_samples);
	blkg_stat_exit(&stats->dequeue);
	blkg_stat_exit(&stats->group_wait_time);
	blkg_stat_exit(&stats->idle_time);
	blkg_stat_exit(&stats->empty_time);
#endif
}

static int cfqg_stats_init(struct cfqg_stats *stats, gfp_t gfp)
{
	if (blkg_rwstat_init(&stats->merged, gfp) ||
	    blkg_rwstat_init(&stats->service_time, gfp) ||
	    blkg_rwstat_init(&stats->wait_time, gfp) ||
	    blkg_rwstat_init(&stats->queued, gfp) ||
	    blkg_stat_init(&stats->time, gfp))
		goto err;

#ifdef CONFIG_DEBUG_BLK_CGROUP
	if (blkg_stat_init(&stats->unaccounted_time, gfp) ||
	    blkg_stat_init(&stats->avg_queue_size_sum, gfp) ||
	    blkg_stat_init(&stats->avg_queue_size_samples, gfp) ||
	    blkg_stat_init(&stats->dequeue, gfp) ||
	    blkg_stat_init(&stats->group_wait_time, gfp) ||
	    blkg_stat_init(&stats->idle_time, gfp) ||
	    blkg_stat_init(&stats->empty_time, gfp))
		goto err;
#endif
	return 0;
err:
	cfqg_stats_exit(stats);
	return -ENOMEM;
}

static struct blkcg_policy_data *cfq_cpd_alloc(gfp_t gfp)
{
	struct cfq_group_data *cgd;

	cgd = kzalloc(sizeof(*cgd), gfp);
	if (!cgd)
		return NULL;
	return &cgd->cpd;
}

static void cfq_cpd_init(struct blkcg_policy_data *cpd)
{
	struct cfq_group_data *cgd = cpd_to_cfqgd(cpd);
	unsigned int weight = cgroup_subsys_on_dfl(io_cgrp_subsys) ?
			      CGROUP_WEIGHT_DFL : CFQ_WEIGHT_LEGACY_DFL;

	if (cpd_to_blkcg(cpd) == &blkcg_root)
		weight *= 2;

	cgd->weight = weight;
	cgd->leaf_weight = weight;
}

static void cfq_cpd_free(struct blkcg_policy_data *cpd)
{
	kfree(cpd_to_cfqgd(cpd));
}

static void cfq_cpd_bind(struct blkcg_policy_data *cpd)
{
	struct blkcg *blkcg = cpd_to_blkcg(cpd);
	bool on_dfl = cgroup_subsys_on_dfl(io_cgrp_subsys);
	unsigned int weight = on_dfl ? CGROUP_WEIGHT_DFL : CFQ_WEIGHT_LEGACY_DFL;

	if (blkcg == &blkcg_root)
		weight *= 2;

	WARN_ON_ONCE(__cfq_set_weight(&blkcg->css, weight, on_dfl, true, false));
	WARN_ON_ONCE(__cfq_set_weight(&blkcg->css, weight, on_dfl, true, true));
}

static struct blkg_policy_data *cfq_pd_alloc(gfp_t gfp, int node)
{
	struct cfq_group *cfqg;

	cfqg = kzalloc_node(sizeof(*cfqg), gfp, node);
	if (!cfqg)
		return NULL;

	cfq_init_cfqg_base(cfqg);
	if (cfqg_stats_init(&cfqg->stats, gfp)) {
		kfree(cfqg);
		return NULL;
	}

	return &cfqg->pd;
}

static void cfq_pd_init(struct blkg_policy_data *pd)
{
	struct cfq_group *cfqg = pd_to_cfqg(pd);
	struct cfq_group_data *cgd = blkcg_to_cfqgd(pd->blkg->blkcg);

	cfqg->weight = cgd->weight;
	cfqg->leaf_weight = cgd->leaf_weight;
}

static void cfq_pd_offline(struct blkg_policy_data *pd)
{
	struct cfq_group *cfqg = pd_to_cfqg(pd);
	int i;

	for (i = 0; i < IOPRIO_BE_NR; i++) {
		if (cfqg->async_cfqq[0][i])
			cfq_put_queue(cfqg->async_cfqq[0][i]);
		if (cfqg->async_cfqq[1][i])
			cfq_put_queue(cfqg->async_cfqq[1][i]);
	}

	if (cfqg->async_idle_cfqq)
		cfq_put_queue(cfqg->async_idle_cfqq);

	/*
	 * @blkg is going offline and will be ignored by
	 * blkg_[rw]stat_recursive_sum().  Transfer stats to the parent so
	 * that they don't get lost.  If IOs complete after this point, the
	 * stats for them will be lost.  Oh well...
	 */
	cfqg_stats_xfer_dead(cfqg);
}

static void cfq_pd_free(struct blkg_policy_data *pd)
{
	struct cfq_group *cfqg = pd_to_cfqg(pd);

	cfqg_stats_exit(&cfqg->stats);
	return kfree(cfqg);
}

static void cfq_pd_reset_stats(struct blkg_policy_data *pd)
{
	struct cfq_group *cfqg = pd_to_cfqg(pd);

	cfqg_stats_reset(&cfqg->stats);
}

static struct cfq_group *cfq_lookup_cfqg(struct cfq_data *cfqd,
					 struct blkcg *blkcg)
{
	struct blkcg_gq *blkg;

	blkg = blkg_lookup(blkcg, cfqd->queue);
	if (likely(blkg))
		return blkg_to_cfqg(blkg);
	return NULL;
}

static void cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg)
{
	cfqq->cfqg = cfqg;
	/* cfqq reference on cfqg */
	cfqg_get(cfqg);
}

1695 1696
static u64 cfqg_prfill_weight_device(struct seq_file *sf,
				     struct blkg_policy_data *pd, int off)
1697
{
1698
	struct cfq_group *cfqg = pd_to_cfqg(pd);
1699 1700

	if (!cfqg->dev_weight)
1701
		return 0;
1702
	return __blkg_prfill_u64(sf, pd, cfqg->dev_weight);
1703 1704
}

1705
static int cfqg_print_weight_device(struct seq_file *sf, void *v)
1706
{
1707 1708 1709
	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
			  cfqg_prfill_weight_device, &blkcg_policy_cfq,
			  0, false);
1710 1711 1712
	return 0;
}

static u64 cfqg_prfill_leaf_weight_device(struct seq_file *sf,
					  struct blkg_policy_data *pd, int off)
{
	struct cfq_group *cfqg = pd_to_cfqg(pd);

	if (!cfqg->dev_leaf_weight)
		return 0;
	return __blkg_prfill_u64(sf, pd, cfqg->dev_leaf_weight);
}

1723
static int cfqg_print_leaf_weight_device(struct seq_file *sf, void *v)
T
Tejun Heo 已提交
1724
{
1725 1726 1727
	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
			  cfqg_prfill_leaf_weight_device, &blkcg_policy_cfq,
			  0, false);
T
Tejun Heo 已提交
1728 1729 1730
	return 0;
}

1731
static int cfq_print_weight(struct seq_file *sf, void *v)
1732
{
1733
	struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
1734 1735
	struct cfq_group_data *cgd = blkcg_to_cfqgd(blkcg);
	unsigned int val = 0;
1736

1737 1738 1739 1740
	if (cgd)
		val = cgd->weight;

	seq_printf(sf, "%u\n", val);
1741 1742 1743
	return 0;
}

1744
static int cfq_print_leaf_weight(struct seq_file *sf, void *v)
T
Tejun Heo 已提交
1745
{
1746
	struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
1747 1748 1749 1750 1751
	struct cfq_group_data *cgd = blkcg_to_cfqgd(blkcg);
	unsigned int val = 0;

	if (cgd)
		val = cgd->leaf_weight;
1752

1753
	seq_printf(sf, "%u\n", val);
T
Tejun Heo 已提交
1754 1755 1756
	return 0;
}

1757 1758
static ssize_t __cfqg_set_weight_device(struct kernfs_open_file *of,
					char *buf, size_t nbytes, loff_t off,
1759
					bool on_dfl, bool is_leaf_weight)
1760
{
1761 1762
	unsigned int min = on_dfl ? CGROUP_WEIGHT_MIN : CFQ_WEIGHT_LEGACY_MIN;
	unsigned int max = on_dfl ? CGROUP_WEIGHT_MAX : CFQ_WEIGHT_LEGACY_MAX;
1763
	struct blkcg *blkcg = css_to_blkcg(of_css(of));
1764
	struct blkg_conf_ctx ctx;
1765
	struct cfq_group *cfqg;
1766
	struct cfq_group_data *cfqgd;
1767
	int ret;
1768
	u64 v;
1769

T
Tejun Heo 已提交
1770
	ret = blkg_conf_prep(blkcg, &blkcg_policy_cfq, buf, &ctx);
1771 1772 1773
	if (ret)
		return ret;

1774 1775 1776 1777 1778 1779 1780 1781 1782
	if (sscanf(ctx.body, "%llu", &v) == 1) {
		/* require "default" on dfl */
		ret = -ERANGE;
		if (!v && on_dfl)
			goto out_finish;
	} else if (!strcmp(strim(ctx.body), "default")) {
		v = 0;
	} else {
		ret = -EINVAL;
1783
		goto out_finish;
1784
	}
1785

1786
	cfqg = blkg_to_cfqg(ctx.blkg);
1787
	cfqgd = blkcg_to_cfqgd(blkcg);
1788

1789
	ret = -ERANGE;
1790
	if (!v || (v >= min && v <= max)) {
T
Tejun Heo 已提交
1791
		if (!is_leaf_weight) {
1792 1793
			cfqg->dev_weight = v;
			cfqg->new_weight = v ?: cfqgd->weight;
T
Tejun Heo 已提交
1794
		} else {
1795 1796
			cfqg->dev_leaf_weight = v;
			cfqg->new_leaf_weight = v ?: cfqgd->leaf_weight;
T
Tejun Heo 已提交
1797
		}
1798 1799
		ret = 0;
	}
1800
out_finish:
1801
	blkg_conf_finish(&ctx);
1802
	return ret ?: nbytes;
1803 1804
}

1805 1806
static ssize_t cfqg_set_weight_device(struct kernfs_open_file *of,
				      char *buf, size_t nbytes, loff_t off)
T
Tejun Heo 已提交
1807
{
1808
	return __cfqg_set_weight_device(of, buf, nbytes, off, false, false);
T
Tejun Heo 已提交
1809 1810
}

1811 1812
static ssize_t cfqg_set_leaf_weight_device(struct kernfs_open_file *of,
					   char *buf, size_t nbytes, loff_t off)
T
Tejun Heo 已提交
1813
{
1814
	return __cfqg_set_weight_device(of, buf, nbytes, off, false, true);
T
Tejun Heo 已提交
1815 1816
}

1817
static int __cfq_set_weight(struct cgroup_subsys_state *css, u64 val,
1818
			    bool on_dfl, bool reset_dev, bool is_leaf_weight)
1819
{
1820 1821
	unsigned int min = on_dfl ? CGROUP_WEIGHT_MIN : CFQ_WEIGHT_LEGACY_MIN;
	unsigned int max = on_dfl ? CGROUP_WEIGHT_MAX : CFQ_WEIGHT_LEGACY_MAX;
1822
	struct blkcg *blkcg = css_to_blkcg(css);
T
Tejun Heo 已提交
1823
	struct blkcg_gq *blkg;
1824
	struct cfq_group_data *cfqgd;
1825
	int ret = 0;
1826

1827 1828
	if (val < min || val > max)
		return -ERANGE;
1829 1830

	spin_lock_irq(&blkcg->lock);
1831
	cfqgd = blkcg_to_cfqgd(blkcg);
1832 1833 1834 1835
	if (!cfqgd) {
		ret = -EINVAL;
		goto out;
	}
T
Tejun Heo 已提交
1836 1837

	if (!is_leaf_weight)
1838
		cfqgd->weight = val;
T
Tejun Heo 已提交
1839
	else
1840
		cfqgd->leaf_weight = val;
1841

1842
	hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) {
1843
		struct cfq_group *cfqg = blkg_to_cfqg(blkg);
1844

T
Tejun Heo 已提交
1845 1846 1847 1848
		if (!cfqg)
			continue;

		if (!is_leaf_weight) {
1849 1850
			if (reset_dev)
				cfqg->dev_weight = 0;
T
Tejun Heo 已提交
1851
			if (!cfqg->dev_weight)
1852
				cfqg->new_weight = cfqgd->weight;
T
Tejun Heo 已提交
1853
		} else {
1854 1855
			if (reset_dev)
				cfqg->dev_leaf_weight = 0;
T
Tejun Heo 已提交
1856
			if (!cfqg->dev_leaf_weight)
1857
				cfqg->new_leaf_weight = cfqgd->leaf_weight;
T
Tejun Heo 已提交
1858
		}
1859 1860
	}

1861
out:
1862
	spin_unlock_irq(&blkcg->lock);
1863
	return ret;
1864 1865
}

1866 1867
static int cfq_set_weight(struct cgroup_subsys_state *css, struct cftype *cft,
			  u64 val)
T
Tejun Heo 已提交
1868
{
1869
	return __cfq_set_weight(css, val, false, false, false);
T
Tejun Heo 已提交
1870 1871
}

1872 1873
static int cfq_set_leaf_weight(struct cgroup_subsys_state *css,
			       struct cftype *cft, u64 val)
T
Tejun Heo 已提交
1874
{
1875
	return __cfq_set_weight(css, val, false, false, true);
T
Tejun Heo 已提交
1876 1877
}

1878
static int cfqg_print_stat(struct seq_file *sf, void *v)
1879
{
1880 1881
	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_stat,
			  &blkcg_policy_cfq, seq_cft(sf)->private, false);
1882 1883 1884
	return 0;
}

1885
static int cfqg_print_rwstat(struct seq_file *sf, void *v)
1886
{
1887 1888
	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_rwstat,
			  &blkcg_policy_cfq, seq_cft(sf)->private, true);
1889 1890 1891
	return 0;
}

1892 1893 1894
static u64 cfqg_prfill_stat_recursive(struct seq_file *sf,
				      struct blkg_policy_data *pd, int off)
{
1895 1896
	u64 sum = blkg_stat_recursive_sum(pd_to_blkg(pd),
					  &blkcg_policy_cfq, off);
1897 1898 1899 1900 1901 1902
	return __blkg_prfill_u64(sf, pd, sum);
}

static u64 cfqg_prfill_rwstat_recursive(struct seq_file *sf,
					struct blkg_policy_data *pd, int off)
{
1903 1904
	struct blkg_rwstat sum = blkg_rwstat_recursive_sum(pd_to_blkg(pd),
							&blkcg_policy_cfq, off);
1905 1906 1907
	return __blkg_prfill_rwstat(sf, pd, &sum);
}

1908
static int cfqg_print_stat_recursive(struct seq_file *sf, void *v)
1909
{
1910 1911 1912
	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
			  cfqg_prfill_stat_recursive, &blkcg_policy_cfq,
			  seq_cft(sf)->private, false);
1913 1914 1915
	return 0;
}

1916
static int cfqg_print_rwstat_recursive(struct seq_file *sf, void *v)
1917
{
1918 1919 1920
	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
			  cfqg_prfill_rwstat_recursive, &blkcg_policy_cfq,
			  seq_cft(sf)->private, true);
1921 1922 1923
	return 0;
}

static u64 cfqg_prfill_sectors(struct seq_file *sf, struct blkg_policy_data *pd,
			       int off)
{
	u64 sum = blkg_rwstat_total(&pd->blkg->stat_bytes);

	return __blkg_prfill_u64(sf, pd, sum >> 9);
}

static int cfqg_print_stat_sectors(struct seq_file *sf, void *v)
{
	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
			  cfqg_prfill_sectors, &blkcg_policy_cfq, 0, false);
	return 0;
}

static u64 cfqg_prfill_sectors_recursive(struct seq_file *sf,
					 struct blkg_policy_data *pd, int off)
{
	struct blkg_rwstat tmp = blkg_rwstat_recursive_sum(pd->blkg, NULL,
					offsetof(struct blkcg_gq, stat_bytes));
	u64 sum = atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_READ]) +
		atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_WRITE]);

	return __blkg_prfill_u64(sf, pd, sum >> 9);
}

static int cfqg_print_stat_sectors_recursive(struct seq_file *sf, void *v)
{
	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
			  cfqg_prfill_sectors_recursive, &blkcg_policy_cfq, 0,
			  false);
	return 0;
}

#ifdef CONFIG_DEBUG_BLK_CGROUP
1959 1960
static u64 cfqg_prfill_avg_queue_size(struct seq_file *sf,
				      struct blkg_policy_data *pd, int off)
1961
{
1962
	struct cfq_group *cfqg = pd_to_cfqg(pd);
1963
	u64 samples = blkg_stat_read(&cfqg->stats.avg_queue_size_samples);
1964 1965 1966
	u64 v = 0;

	if (samples) {
1967
		v = blkg_stat_read(&cfqg->stats.avg_queue_size_sum);
1968
		v = div64_u64(v, samples);
1969
	}
1970
	__blkg_prfill_u64(sf, pd, v);
1971 1972 1973 1974
	return 0;
}

/* print avg_queue_size */
1975
static int cfqg_print_avg_queue_size(struct seq_file *sf, void *v)
1976
{
1977 1978 1979
	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
			  cfqg_prfill_avg_queue_size, &blkcg_policy_cfq,
			  0, false);
1980 1981 1982 1983
	return 0;
}
#endif	/* CONFIG_DEBUG_BLK_CGROUP */

1984
static struct cftype cfq_blkcg_legacy_files[] = {
1985
	/* on root, weight is mapped to leaf_weight */
1986 1987
	{
		.name = "weight_device",
1988
		.flags = CFTYPE_ONLY_ON_ROOT,
1989
		.seq_show = cfqg_print_leaf_weight_device,
1990
		.write = cfqg_set_leaf_weight_device,
1991 1992 1993
	},
	{
		.name = "weight",
1994
		.flags = CFTYPE_ONLY_ON_ROOT,
1995
		.seq_show = cfq_print_leaf_weight,
1996
		.write_u64 = cfq_set_leaf_weight,
1997
	},
T
Tejun Heo 已提交
1998

1999
	/* no such mapping necessary for !roots */
2000 2001
	{
		.name = "weight_device",
2002
		.flags = CFTYPE_NOT_ON_ROOT,
2003
		.seq_show = cfqg_print_weight_device,
2004
		.write = cfqg_set_weight_device,
2005 2006 2007
	},
	{
		.name = "weight",
2008
		.flags = CFTYPE_NOT_ON_ROOT,
2009
		.seq_show = cfq_print_weight,
2010
		.write_u64 = cfq_set_weight,
2011
	},
T
Tejun Heo 已提交
2012 2013 2014

	{
		.name = "leaf_weight_device",
2015
		.seq_show = cfqg_print_leaf_weight_device,
2016
		.write = cfqg_set_leaf_weight_device,
T
Tejun Heo 已提交
2017 2018 2019
	},
	{
		.name = "leaf_weight",
2020
		.seq_show = cfq_print_leaf_weight,
T
Tejun Heo 已提交
2021 2022 2023
		.write_u64 = cfq_set_leaf_weight,
	},

2024
	/* statistics, covers only the tasks in the cfqg */
2025 2026
	{
		.name = "time",
2027
		.private = offsetof(struct cfq_group, stats.time),
2028
		.seq_show = cfqg_print_stat,
2029 2030 2031
	},
	{
		.name = "sectors",
T
Tejun Heo 已提交
2032
		.seq_show = cfqg_print_stat_sectors,
2033 2034 2035
	},
	{
		.name = "io_service_bytes",
2036 2037
		.private = (unsigned long)&blkcg_policy_cfq,
		.seq_show = blkg_print_stat_bytes,
2038 2039 2040
	},
	{
		.name = "io_serviced",
2041 2042
		.private = (unsigned long)&blkcg_policy_cfq,
		.seq_show = blkg_print_stat_ios,
2043 2044 2045
	},
	{
		.name = "io_service_time",
2046
		.private = offsetof(struct cfq_group, stats.service_time),
2047
		.seq_show = cfqg_print_rwstat,
2048 2049 2050
	},
	{
		.name = "io_wait_time",
2051
		.private = offsetof(struct cfq_group, stats.wait_time),
2052
		.seq_show = cfqg_print_rwstat,
2053 2054 2055
	},
	{
		.name = "io_merged",
2056
		.private = offsetof(struct cfq_group, stats.merged),
2057
		.seq_show = cfqg_print_rwstat,
2058 2059 2060
	},
	{
		.name = "io_queued",
2061
		.private = offsetof(struct cfq_group, stats.queued),
2062
		.seq_show = cfqg_print_rwstat,
2063
	},

	/* the same statistics, covering the cfqg and its descendants */
	{
		.name = "time_recursive",
		.private = offsetof(struct cfq_group, stats.time),
2069
		.seq_show = cfqg_print_stat_recursive,
2070 2071 2072
	},
	{
		.name = "sectors_recursive",
T
Tejun Heo 已提交
2073
		.seq_show = cfqg_print_stat_sectors_recursive,
2074 2075 2076
	},
	{
		.name = "io_service_bytes_recursive",
2077 2078
		.private = (unsigned long)&blkcg_policy_cfq,
		.seq_show = blkg_print_stat_bytes_recursive,
2079 2080 2081
	},
	{
		.name = "io_serviced_recursive",
2082 2083
		.private = (unsigned long)&blkcg_policy_cfq,
		.seq_show = blkg_print_stat_ios_recursive,
2084 2085 2086 2087
	},
	{
		.name = "io_service_time_recursive",
		.private = offsetof(struct cfq_group, stats.service_time),
2088
		.seq_show = cfqg_print_rwstat_recursive,
2089 2090 2091 2092
	},
	{
		.name = "io_wait_time_recursive",
		.private = offsetof(struct cfq_group, stats.wait_time),
2093
		.seq_show = cfqg_print_rwstat_recursive,
2094 2095 2096 2097
	},
	{
		.name = "io_merged_recursive",
		.private = offsetof(struct cfq_group, stats.merged),
2098
		.seq_show = cfqg_print_rwstat_recursive,
2099 2100 2101 2102
	},
	{
		.name = "io_queued_recursive",
		.private = offsetof(struct cfq_group, stats.queued),
2103
		.seq_show = cfqg_print_rwstat_recursive,
2104
	},
2105 2106 2107
#ifdef CONFIG_DEBUG_BLK_CGROUP
	{
		.name = "avg_queue_size",
2108
		.seq_show = cfqg_print_avg_queue_size,
2109 2110 2111
	},
	{
		.name = "group_wait_time",
2112
		.private = offsetof(struct cfq_group, stats.group_wait_time),
2113
		.seq_show = cfqg_print_stat,
2114 2115 2116
	},
	{
		.name = "idle_time",
2117
		.private = offsetof(struct cfq_group, stats.idle_time),
2118
		.seq_show = cfqg_print_stat,
2119 2120 2121
	},
	{
		.name = "empty_time",
2122
		.private = offsetof(struct cfq_group, stats.empty_time),
2123
		.seq_show = cfqg_print_stat,
2124 2125 2126
	},
	{
		.name = "dequeue",
2127
		.private = offsetof(struct cfq_group, stats.dequeue),
2128
		.seq_show = cfqg_print_stat,
2129 2130 2131
	},
	{
		.name = "unaccounted_time",
2132
		.private = offsetof(struct cfq_group, stats.unaccounted_time),
2133
		.seq_show = cfqg_print_stat,
2134 2135 2136 2137
	},
#endif	/* CONFIG_DEBUG_BLK_CGROUP */
	{ }	/* terminate */
};

static int cfq_print_weight_on_dfl(struct seq_file *sf, void *v)
{
	struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
	struct cfq_group_data *cgd = blkcg_to_cfqgd(blkcg);

	seq_printf(sf, "default %u\n", cgd->weight);
	blkcg_print_blkgs(sf, blkcg, cfqg_prfill_weight_device,
			  &blkcg_policy_cfq, 0, false);
	return 0;
}

static ssize_t cfq_set_weight_on_dfl(struct kernfs_open_file *of,
				     char *buf, size_t nbytes, loff_t off)
{
	char *endp;
	int ret;
	u64 v;

	buf = strim(buf);

	/* "WEIGHT" or "default WEIGHT" sets the default weight */
	v = simple_strtoull(buf, &endp, 0);
	if (*endp == '\0' || sscanf(buf, "default %llu", &v) == 1) {
		ret = __cfq_set_weight(of_css(of), v, true, false, false);
		return ret ?: nbytes;
	}

	/* "MAJ:MIN WEIGHT" */
	return __cfqg_set_weight_device(of, buf, nbytes, off, true, false);
}

static struct cftype cfq_blkcg_files[] = {
	{
		.name = "weight",
		.flags = CFTYPE_NOT_ON_ROOT,
		.seq_show = cfq_print_weight_on_dfl,
		.write = cfq_set_weight_on_dfl,
	},
	{ }	/* terminate */
};

2180
#else /* GROUP_IOSCHED */
2181 2182
static struct cfq_group *cfq_lookup_cfqg(struct cfq_data *cfqd,
					 struct blkcg *blkcg)
2183
{
2184
	return cfqd->root_group;
2185
}
2186

2187 2188 2189 2190 2191 2192 2193
static inline void
cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg) {
	cfqq->cfqg = cfqg;
}

#endif /* GROUP_IOSCHED */

/*
 * The cfqd->service_trees holds all pending cfq_queue's that have
 * requests waiting to be processed. It is sorted in the order that
 * we will service the queues.
 */
static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
				 bool add_front)
{
	struct rb_node **p, *parent;
	struct cfq_queue *__cfqq;
	u64 rb_key;
	struct cfq_rb_root *st;
	bool leftmost = true;
	int new_cfqq = 1;
	u64 now = ktime_get_ns();

	st = st_for(cfqq->cfqg, cfqq_class(cfqq), cfqq_type(cfqq));
	if (cfq_class_idle(cfqq)) {
		rb_key = CFQ_IDLE_DELAY;
		parent = st->rb_rightmost;
		if (parent && parent != &cfqq->rb_node) {
			__cfqq = rb_entry(parent, struct cfq_queue, rb_node);
			rb_key += __cfqq->rb_key;
		} else
			rb_key += now;
	} else if (!add_front) {
		/*
		 * Get our rb key offset. Subtract any residual slice
		 * value carried from last service. A negative resid
		 * count indicates slice overrun, and this should position
		 * the next service time further away in the tree.
		 */
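		/*
		 * Rough illustration (an assumed example, not a measured
		 * value): a queue that overran its slice by 2ms carries
		 * slice_resid == -2ms, so the subtraction below pushes its
		 * rb_key, and hence its next service slot, 2ms further out.
		 */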
		rb_key = cfq_slice_offset(cfqd, cfqq) + now;
		rb_key -= cfqq->slice_resid;
		cfqq->slice_resid = 0;
	} else {
		rb_key = -NSEC_PER_SEC;
		__cfqq = cfq_rb_first(st);
		rb_key += __cfqq ? __cfqq->rb_key : now;
	}

	if (!RB_EMPTY_NODE(&cfqq->rb_node)) {
		new_cfqq = 0;
		/*
		 * same position, nothing more to do
		 */
		if (rb_key == cfqq->rb_key && cfqq->service_tree == st)
			return;

		cfq_rb_erase(&cfqq->rb_node, cfqq->service_tree);
		cfqq->service_tree = NULL;
	}

	parent = NULL;
	cfqq->service_tree = st;
	p = &st->rb.rb_root.rb_node;
	while (*p) {
		parent = *p;
		__cfqq = rb_entry(parent, struct cfq_queue, rb_node);

		/*
		 * sort by key, that represents service time.
		 */
		if (rb_key < __cfqq->rb_key)
			p = &parent->rb_left;
		else {
			p = &parent->rb_right;
			leftmost = false;
		}
	}

	cfqq->rb_key = rb_key;
	rb_link_node(&cfqq->rb_node, parent, p);
	rb_insert_color_cached(&cfqq->rb_node, &st->rb, leftmost);
	st->count++;
	if (add_front || !new_cfqq)
		return;
	cfq_group_notify_queue_add(cfqd, cfqq->cfqg);
}

static struct cfq_queue *
cfq_prio_tree_lookup(struct cfq_data *cfqd, struct rb_root *root,
		     sector_t sector, struct rb_node **ret_parent,
		     struct rb_node ***rb_link)
{
	struct rb_node **p, *parent;
	struct cfq_queue *cfqq = NULL;

	parent = NULL;
	p = &root->rb_node;
	while (*p) {
		struct rb_node **n;

		parent = *p;
		cfqq = rb_entry(parent, struct cfq_queue, p_node);

		/*
		 * Sort strictly based on sector.  Smallest to the left,
		 * largest to the right.
		 */
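		/*
		 * For illustration: the key is the sector of the queue's
		 * next request, so a lookup can return the queue whose
		 * pending I/O sits closest to a given disk position.
		 */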
		if (sector > blk_rq_pos(cfqq->next_rq))
			n = &(*p)->rb_right;
		else if (sector < blk_rq_pos(cfqq->next_rq))
			n = &(*p)->rb_left;
		else
			break;
		p = n;
		cfqq = NULL;
	}

	*ret_parent = parent;
	if (rb_link)
		*rb_link = p;
	return cfqq;
}

static void cfq_prio_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq)
{
	struct rb_node **p, *parent;
	struct cfq_queue *__cfqq;

	if (cfqq->p_root) {
		rb_erase(&cfqq->p_node, cfqq->p_root);
		cfqq->p_root = NULL;
	}

	if (cfq_class_idle(cfqq))
		return;
	if (!cfqq->next_rq)
		return;

	cfqq->p_root = &cfqd->prio_trees[cfqq->org_ioprio];
	__cfqq = cfq_prio_tree_lookup(cfqd, cfqq->p_root,
				      blk_rq_pos(cfqq->next_rq), &parent, &p);
	if (!__cfqq) {
		rb_link_node(&cfqq->p_node, parent, p);
		rb_insert_color(&cfqq->p_node, cfqq->p_root);
	} else
		cfqq->p_root = NULL;
}

2335 2336 2337
/*
 * Update cfqq's position in the service tree.
 */
2338
static void cfq_resort_rr_list(struct cfq_data *cfqd, struct cfq_queue *cfqq)
J
Jens Axboe 已提交
2339 2340 2341 2342
{
	/*
	 * Resorting requires the cfqq to be on the RR list already.
	 */
2343
	if (cfq_cfqq_on_rr(cfqq)) {
2344
		cfq_service_tree_add(cfqd, cfqq, 0);
2345 2346
		cfq_prio_tree_add(cfqd, cfqq);
	}
J
Jens Axboe 已提交
2347 2348
}

L
Linus Torvalds 已提交
2349 2350
/*
 * add to busy list of queues for service, trying to be fair in ordering
2351
 * the pending list according to last request service
L
Linus Torvalds 已提交
2352
 */
J
Jens Axboe 已提交
2353
static void cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
L
Linus Torvalds 已提交
2354
{
2355
	cfq_log_cfqq(cfqd, cfqq, "add_to_rr");
J
Jens Axboe 已提交
2356 2357
	BUG_ON(cfq_cfqq_on_rr(cfqq));
	cfq_mark_cfqq_on_rr(cfqq);
L
Linus Torvalds 已提交
2358
	cfqd->busy_queues++;
2359 2360
	if (cfq_cfqq_sync(cfqq))
		cfqd->busy_sync_queues++;
L
Linus Torvalds 已提交
2361

2362
	cfq_resort_rr_list(cfqd, cfqq);
L
Linus Torvalds 已提交
2363 2364
}

2365 2366 2367 2368
/*
 * Called when the cfqq no longer has requests pending, remove it from
 * the service tree.
 */
J
Jens Axboe 已提交
2369
static void cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
L
Linus Torvalds 已提交
2370
{
2371
	cfq_log_cfqq(cfqd, cfqq, "del_from_rr");
J
Jens Axboe 已提交
2372 2373
	BUG_ON(!cfq_cfqq_on_rr(cfqq));
	cfq_clear_cfqq_on_rr(cfqq);
L
Linus Torvalds 已提交
2374

2375 2376 2377 2378
	if (!RB_EMPTY_NODE(&cfqq->rb_node)) {
		cfq_rb_erase(&cfqq->rb_node, cfqq->service_tree);
		cfqq->service_tree = NULL;
	}
2379 2380 2381 2382
	if (cfqq->p_root) {
		rb_erase(&cfqq->p_node, cfqq->p_root);
		cfqq->p_root = NULL;
	}
2383

2384
	cfq_group_notify_queue_del(cfqd, cfqq->cfqg);
L
Linus Torvalds 已提交
2385 2386
	BUG_ON(!cfqd->busy_queues);
	cfqd->busy_queues--;
2387 2388
	if (cfq_cfqq_sync(cfqq))
		cfqd->busy_sync_queues--;
L
Linus Torvalds 已提交
2389 2390 2391 2392 2393
}

/*
 * rb tree support functions
 */
J
Jens Axboe 已提交
2394
static void cfq_del_rq_rb(struct request *rq)
L
Linus Torvalds 已提交
2395
{
J
Jens Axboe 已提交
2396 2397
	struct cfq_queue *cfqq = RQ_CFQQ(rq);
	const int sync = rq_is_sync(rq);
L
Linus Torvalds 已提交
2398

2399 2400
	BUG_ON(!cfqq->queued[sync]);
	cfqq->queued[sync]--;
L
Linus Torvalds 已提交
2401

J
Jens Axboe 已提交
2402
	elv_rb_del(&cfqq->sort_list, rq);
L
Linus Torvalds 已提交
2403

2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414
	if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list)) {
		/*
		 * Queue will be deleted from service tree when we actually
		 * expire it later. Right now just remove it from prio tree
		 * as it is empty.
		 */
		if (cfqq->p_root) {
			rb_erase(&cfqq->p_node, cfqq->p_root);
			cfqq->p_root = NULL;
		}
	}
L
Linus Torvalds 已提交
2415 2416
}

J
Jens Axboe 已提交
2417
static void cfq_add_rq_rb(struct request *rq)
L
Linus Torvalds 已提交
2418
{
J
Jens Axboe 已提交
2419
	struct cfq_queue *cfqq = RQ_CFQQ(rq);
L
Linus Torvalds 已提交
2420
	struct cfq_data *cfqd = cfqq->cfqd;
2421
	struct request *prev;
L
Linus Torvalds 已提交
2422

2423
	cfqq->queued[rq_is_sync(rq)]++;
L
Linus Torvalds 已提交
2424

2425
	elv_rb_add(&cfqq->sort_list, rq);
2426 2427 2428

	if (!cfq_cfqq_on_rr(cfqq))
		cfq_add_cfqq_rr(cfqd, cfqq);
2429 2430 2431 2432

	/*
	 * check if this request is a better next-serve candidate
	 */
2433
	prev = cfqq->next_rq;
2434
	cfqq->next_rq = cfq_choose_req(cfqd, cfqq->next_rq, rq, cfqd->last_position);
2435 2436 2437 2438 2439 2440 2441

	/*
	 * adjust priority tree position, if ->next_rq changes
	 */
	if (prev != cfqq->next_rq)
		cfq_prio_tree_add(cfqd, cfqq);

2442
	BUG_ON(!cfqq->next_rq);
L
Linus Torvalds 已提交
2443 2444
}

J
Jens Axboe 已提交
2445
static void cfq_reposition_rq_rb(struct cfq_queue *cfqq, struct request *rq)
L
Linus Torvalds 已提交
2446
{
2447 2448
	elv_rb_del(&cfqq->sort_list, rq);
	cfqq->queued[rq_is_sync(rq)]--;
2449
	cfqg_stats_update_io_remove(RQ_CFQG(rq), rq->cmd_flags);
J
Jens Axboe 已提交
2450
	cfq_add_rq_rb(rq);
2451
	cfqg_stats_update_io_add(RQ_CFQG(rq), cfqq->cfqd->serving_group,
2452
				 rq->cmd_flags);
L
Linus Torvalds 已提交
2453 2454
}

2455 2456
static struct request *
cfq_find_rq_fmerge(struct cfq_data *cfqd, struct bio *bio)
L
Linus Torvalds 已提交
2457
{
2458
	struct task_struct *tsk = current;
2459
	struct cfq_io_cq *cic;
2460
	struct cfq_queue *cfqq;
L
Linus Torvalds 已提交
2461

2462
	cic = cfq_cic_lookup(cfqd, tsk->io_context);
2463 2464 2465
	if (!cic)
		return NULL;

2466
	cfqq = cic_to_cfqq(cic, op_is_sync(bio->bi_opf));
K
Kent Overstreet 已提交
2467 2468
	if (cfqq)
		return elv_rb_find(&cfqq->sort_list, bio_end_sector(bio));
L
Linus Torvalds 已提交
2469 2470 2471 2472

	return NULL;
}

2473
static void cfq_activate_request(struct request_queue *q, struct request *rq)
L
Linus Torvalds 已提交
2474
{
2475
	struct cfq_data *cfqd = q->elevator->elevator_data;
J
Jens Axboe 已提交
2476

2477
	cfqd->rq_in_driver++;
2478
	cfq_log_cfqq(cfqd, RQ_CFQQ(rq), "activate rq, drv=%d",
2479
						cfqd->rq_in_driver);
2480

2481
	cfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq);
L
Linus Torvalds 已提交
2482 2483
}

2484
static void cfq_deactivate_request(struct request_queue *q, struct request *rq)
L
Linus Torvalds 已提交
2485
{
2486 2487
	struct cfq_data *cfqd = q->elevator->elevator_data;

2488 2489
	WARN_ON(!cfqd->rq_in_driver);
	cfqd->rq_in_driver--;
2490
	cfq_log_cfqq(cfqd, RQ_CFQQ(rq), "deactivate rq, drv=%d",
2491
						cfqd->rq_in_driver);
L
Linus Torvalds 已提交
2492 2493
}

2494
static void cfq_remove_request(struct request *rq)
L
Linus Torvalds 已提交
2495
{
J
Jens Axboe 已提交
2496
	struct cfq_queue *cfqq = RQ_CFQQ(rq);
2497

J
Jens Axboe 已提交
2498 2499
	if (cfqq->next_rq == rq)
		cfqq->next_rq = cfq_find_next_rq(cfqq->cfqd, cfqq, rq);
L
Linus Torvalds 已提交
2500

2501
	list_del_init(&rq->queuelist);
J
Jens Axboe 已提交
2502
	cfq_del_rq_rb(rq);
2503

2504
	cfqq->cfqd->rq_queued--;
2505
	cfqg_stats_update_io_remove(RQ_CFQG(rq), rq->cmd_flags);
2506 2507 2508
	if (rq->cmd_flags & REQ_PRIO) {
		WARN_ON(!cfqq->prio_pending);
		cfqq->prio_pending--;
2509
	}
L
Linus Torvalds 已提交
2510 2511
}

2512
static enum elv_merge cfq_merge(struct request_queue *q, struct request **req,
2513
		     struct bio *bio)
L
Linus Torvalds 已提交
2514 2515 2516 2517
{
	struct cfq_data *cfqd = q->elevator->elevator_data;
	struct request *__rq;

2518
	__rq = cfq_find_rq_fmerge(cfqd, bio);
2519
	if (__rq && elv_bio_merge_ok(__rq, bio)) {
2520 2521
		*req = __rq;
		return ELEVATOR_FRONT_MERGE;
L
Linus Torvalds 已提交
2522 2523 2524 2525 2526
	}

	return ELEVATOR_NO_MERGE;
}

2527
static void cfq_merged_request(struct request_queue *q, struct request *req,
2528
			       enum elv_merge type)
L
Linus Torvalds 已提交
2529
{
2530
	if (type == ELEVATOR_FRONT_MERGE) {
J
Jens Axboe 已提交
2531
		struct cfq_queue *cfqq = RQ_CFQQ(req);
L
Linus Torvalds 已提交
2532

J
Jens Axboe 已提交
2533
		cfq_reposition_rq_rb(cfqq, req);
L
Linus Torvalds 已提交
2534 2535 2536
	}
}

D
Divyesh Shah 已提交
2537 2538 2539
static void cfq_bio_merged(struct request_queue *q, struct request *req,
				struct bio *bio)
{
2540
	cfqg_stats_update_io_merged(RQ_CFQG(req), bio->bi_opf);
D
Divyesh Shah 已提交
2541 2542
}

L
Linus Torvalds 已提交
2543
static void
2544
cfq_merged_requests(struct request_queue *q, struct request *rq,
L
Linus Torvalds 已提交
2545 2546
		    struct request *next)
{
2547
	struct cfq_queue *cfqq = RQ_CFQQ(rq);
2548 2549
	struct cfq_data *cfqd = q->elevator->elevator_data;

	/*
	 * reposition in fifo if next is older than rq
	 */
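	/*
	 * In other words, the surviving request inherits the earlier expiry
	 * time below, so the merge does not delay it in the FIFO.
	 */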
	if (!list_empty(&rq->queuelist) && !list_empty(&next->queuelist) &&
	    next->fifo_time < rq->fifo_time &&
	    cfqq == RQ_CFQQ(next)) {
		list_move(&rq->queuelist, &next->queuelist);
		rq->fifo_time = next->fifo_time;
	}

2560 2561
	if (cfqq->next_rq == next)
		cfqq->next_rq = rq;
2562
	cfq_remove_request(next);
2563
	cfqg_stats_update_io_merged(RQ_CFQG(rq), next->cmd_flags);
2564 2565 2566 2567 2568 2569 2570 2571 2572 2573

	cfqq = RQ_CFQQ(next);
	/*
	 * all requests of this queue are merged to other queues, delete it
	 * from the service tree. If it's the active_queue,
	 * cfq_dispatch_requests() will choose to expire it or do idle
	 */
	if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list) &&
	    cfqq != cfqd->active_queue)
		cfq_del_cfqq_rr(cfqd, cfqq);
2574 2575
}

2576 2577
static int cfq_allow_bio_merge(struct request_queue *q, struct request *rq,
			       struct bio *bio)
2578 2579
{
	struct cfq_data *cfqd = q->elevator->elevator_data;
2580
	bool is_sync = op_is_sync(bio->bi_opf);
2581
	struct cfq_io_cq *cic;
2582 2583 2584
	struct cfq_queue *cfqq;

	/*
2585
	 * Disallow merge of a sync bio into an async request.
2586
	 */
2587
	if (is_sync && !rq_is_sync(rq))
2588
		return false;
2589 2590

	/*
T
Tejun Heo 已提交
2591
	 * Lookup the cfqq that this bio will be queued with and allow
2592
	 * merge only if rq is queued there.
T
Tejun Heo 已提交
2593
	 */
2594 2595 2596
	cic = cfq_cic_lookup(cfqd, current->io_context);
	if (!cic)
		return false;
2597

2598
	cfqq = cic_to_cfqq(cic, is_sync);
2599
	return cfqq == RQ_CFQQ(rq);
2600 2601
}

2602 2603 2604 2605 2606 2607
static int cfq_allow_rq_merge(struct request_queue *q, struct request *rq,
			      struct request *next)
{
	return RQ_CFQQ(rq) == RQ_CFQQ(next);
}

2608 2609
static inline void cfq_del_timer(struct cfq_data *cfqd, struct cfq_queue *cfqq)
{
2610
	hrtimer_try_to_cancel(&cfqd->idle_slice_timer);
2611
	cfqg_stats_update_idle_time(cfqq->cfqg);
2612 2613
}

J
Jens Axboe 已提交
2614 2615
static void __cfq_set_active_queue(struct cfq_data *cfqd,
				   struct cfq_queue *cfqq)
2616 2617
{
	if (cfqq) {
2618
		cfq_log_cfqq(cfqd, cfqq, "set_active wl_class:%d wl_type:%d",
2619
				cfqd->serving_wl_class, cfqd->serving_wl_type);
2620
		cfqg_stats_update_avg_queue_size(cfqq->cfqg);
2621
		cfqq->slice_start = 0;
2622
		cfqq->dispatch_start = ktime_get_ns();
2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634
		cfqq->allocated_slice = 0;
		cfqq->slice_end = 0;
		cfqq->slice_dispatch = 0;
		cfqq->nr_sectors = 0;

		cfq_clear_cfqq_wait_request(cfqq);
		cfq_clear_cfqq_must_dispatch(cfqq);
		cfq_clear_cfqq_must_alloc_slice(cfqq);
		cfq_clear_cfqq_fifo_expire(cfqq);
		cfq_mark_cfqq_slice_new(cfqq);

		cfq_del_timer(cfqd, cfqq);
2635 2636 2637 2638 2639
	}

	cfqd->active_queue = cfqq;
}

2640 2641 2642 2643 2644
/*
 * current cfqq expired its slice (or was too idle), select new one
 */
static void
__cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq,
2645
		    bool timed_out)
2646
{
2647 2648
	cfq_log_cfqq(cfqd, cfqq, "slice expired t=%d", timed_out);

2649
	if (cfq_cfqq_wait_request(cfqq))
2650
		cfq_del_timer(cfqd, cfqq);
2651 2652

	cfq_clear_cfqq_wait_request(cfqq);
2653
	cfq_clear_cfqq_wait_busy(cfqq);
2654

2655 2656 2657 2658 2659 2660 2661 2662 2663
	/*
	 * If this cfqq is shared between multiple processes, check to
	 * make sure that those processes are still issuing I/Os within
	 * the mean seek distance.  If not, it may be time to break the
	 * queues apart again.
	 */
	if (cfq_cfqq_coop(cfqq) && CFQQ_SEEKY(cfqq))
		cfq_mark_cfqq_split_coop(cfqq);

2664
	/*
2665
	 * store what was left of this slice, if the queue idled/timed out
2666
	 */
2667 2668
	if (timed_out) {
		if (cfq_cfqq_slice_new(cfqq))
2669
			cfqq->slice_resid = cfq_scaled_cfqq_slice(cfqd, cfqq);
2670
		else
2671
			cfqq->slice_resid = cfqq->slice_end - ktime_get_ns();
2672
		cfq_log_cfqq(cfqd, cfqq, "resid=%lld", cfqq->slice_resid);
2673
	}
2674

2675
	cfq_group_served(cfqd, cfqq->cfqg, cfqq);
2676

2677 2678 2679
	if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list))
		cfq_del_cfqq_rr(cfqd, cfqq);

2680
	cfq_resort_rr_list(cfqd, cfqq);
2681 2682 2683 2684 2685

	if (cfqq == cfqd->active_queue)
		cfqd->active_queue = NULL;

	if (cfqd->active_cic) {
2686
		put_io_context(cfqd->active_cic->icq.ioc);
2687 2688 2689 2690
		cfqd->active_cic = NULL;
	}
}

2691
static inline void cfq_slice_expired(struct cfq_data *cfqd, bool timed_out)
2692 2693 2694 2695
{
	struct cfq_queue *cfqq = cfqd->active_queue;

	if (cfqq)
2696
		__cfq_slice_expired(cfqd, cfqq, timed_out);
2697 2698
}

2699 2700 2701 2702
/*
 * Get next queue for service. Unless we have a queue preemption,
 * we'll simply select the first cfqq in the service tree.
 */
J
Jens Axboe 已提交
2703
static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd)
2704
{
2705 2706
	struct cfq_rb_root *st = st_for(cfqd->serving_group,
			cfqd->serving_wl_class, cfqd->serving_wl_type);
2707

2708 2709 2710
	if (!cfqd->rq_queued)
		return NULL;

2711
	/* There is nothing to dispatch */
2712
	if (!st)
2713
		return NULL;
2714
	if (RB_EMPTY_ROOT(&st->rb.rb_root))
2715
		return NULL;
2716
	return cfq_rb_first(st);
J
Jens Axboe 已提交
2717 2718
}

2719 2720
static struct cfq_queue *cfq_get_next_queue_forced(struct cfq_data *cfqd)
{
2721
	struct cfq_group *cfqg;
2722 2723 2724 2725 2726 2727 2728
	struct cfq_queue *cfqq;
	int i, j;
	struct cfq_rb_root *st;

	if (!cfqd->rq_queued)
		return NULL;

2729 2730 2731 2732
	cfqg = cfq_get_next_cfqg(cfqd);
	if (!cfqg)
		return NULL;

2733 2734 2735
	for_each_cfqg_st(cfqg, i, j, st) {
		cfqq = cfq_rb_first(st);
		if (cfqq)
2736
			return cfqq;
2737
	}
2738 2739 2740
	return NULL;
}

2741 2742 2743
/*
 * Get and set a new active queue for service.
 */
2744 2745
static struct cfq_queue *cfq_set_active_queue(struct cfq_data *cfqd,
					      struct cfq_queue *cfqq)
J
Jens Axboe 已提交
2746
{
2747
	if (!cfqq)
2748
		cfqq = cfq_get_next_queue(cfqd);
J
Jens Axboe 已提交
2749

2750
	__cfq_set_active_queue(cfqd, cfqq);
J
Jens Axboe 已提交
2751
	return cfqq;
2752 2753
}

2754 2755 2756
static inline sector_t cfq_dist_from_last(struct cfq_data *cfqd,
					  struct request *rq)
{
2757 2758
	if (blk_rq_pos(rq) >= cfqd->last_position)
		return blk_rq_pos(rq) - cfqd->last_position;
2759
	else
2760
		return cfqd->last_position - blk_rq_pos(rq);
2761 2762
}

2763
static inline int cfq_rq_close(struct cfq_data *cfqd, struct cfq_queue *cfqq,
2764
			       struct request *rq)
J
Jens Axboe 已提交
2765
{
2766
	return cfq_dist_from_last(cfqd, rq) <= CFQQ_CLOSE_THR;
J
Jens Axboe 已提交
2767 2768
}

2769 2770 2771
static struct cfq_queue *cfqq_close(struct cfq_data *cfqd,
				    struct cfq_queue *cur_cfqq)
{
2772
	struct rb_root *root = &cfqd->prio_trees[cur_cfqq->org_ioprio];
2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783
	struct rb_node *parent, *node;
	struct cfq_queue *__cfqq;
	sector_t sector = cfqd->last_position;

	if (RB_EMPTY_ROOT(root))
		return NULL;

	/*
	 * First, if we find a request starting at the end of the last
	 * request, choose it.
	 */
2784
	__cfqq = cfq_prio_tree_lookup(cfqd, root, sector, &parent, NULL);
2785 2786 2787 2788 2789 2790 2791 2792
	if (__cfqq)
		return __cfqq;

	/*
	 * If the exact sector wasn't found, the parent of the NULL leaf
	 * will contain the closest sector.
	 */
	__cfqq = rb_entry(parent, struct cfq_queue, p_node);
2793
	if (cfq_rq_close(cfqd, cur_cfqq, __cfqq->next_rq))
2794 2795
		return __cfqq;

2796
	if (blk_rq_pos(__cfqq->next_rq) < sector)
2797 2798 2799 2800 2801 2802 2803
		node = rb_next(&__cfqq->p_node);
	else
		node = rb_prev(&__cfqq->p_node);
	if (!node)
		return NULL;

	__cfqq = rb_entry(node, struct cfq_queue, p_node);
2804
	if (cfq_rq_close(cfqd, cur_cfqq, __cfqq->next_rq))
		return __cfqq;

	return NULL;
}

/*
 * cfqd - obvious
 * cur_cfqq - passed in so that we don't decide that the current queue is
 * 	      closely cooperating with itself.
 *
 * So, basically we're assuming that cur_cfqq has dispatched at least
 * one request, and that cfqd->last_position reflects a position on the disk
 * associated with the I/O issued by cur_cfqq.  I'm not sure this is a valid
 * assumption.
 */
static struct cfq_queue *cfq_close_cooperator(struct cfq_data *cfqd,
2821
					      struct cfq_queue *cur_cfqq)
J
Jens Axboe 已提交
2822
{
2823 2824
	struct cfq_queue *cfqq;

2825 2826
	if (cfq_class_idle(cur_cfqq))
		return NULL;
2827 2828 2829 2830 2831
	if (!cfq_cfqq_sync(cur_cfqq))
		return NULL;
	if (CFQQ_SEEKY(cur_cfqq))
		return NULL;

2832 2833 2834 2835 2836 2837
	/*
	 * Don't search priority tree if it's the only queue in the group.
	 */
	if (cur_cfqq->cfqg->nr_cfqq == 1)
		return NULL;

J
Jens Axboe 已提交
2838
	/*
2839 2840 2841
	 * We should notice if some of the queues are cooperating, eg
	 * working closely on the same area of the disk. In that case,
	 * we can group them together and don't waste time idling.
J
Jens Axboe 已提交
2842
	 */
2843 2844 2845 2846
	cfqq = cfqq_close(cfqd, cur_cfqq);
	if (!cfqq)
		return NULL;

2847 2848 2849 2850
	/* If new queue belongs to different cfq_group, don't choose it */
	if (cur_cfqq->cfqg != cfqq->cfqg)
		return NULL;

J
Jeff Moyer 已提交
2851 2852 2853 2854 2855
	/*
	 * It only makes sense to merge sync queues.
	 */
	if (!cfq_cfqq_sync(cfqq))
		return NULL;
2856 2857
	if (CFQQ_SEEKY(cfqq))
		return NULL;
J
Jeff Moyer 已提交
2858

2859 2860 2861 2862 2863 2864
	/*
	 * Do not merge queues of different priority classes
	 */
	if (cfq_class_rt(cfqq) != cfq_class_rt(cur_cfqq))
		return NULL;

2865
	return cfqq;
J
Jens Axboe 已提交
2866 2867
}

2868 2869 2870 2871 2872 2873
/*
 * Determine whether we should enforce idle window for this queue.
 */

static bool cfq_should_idle(struct cfq_data *cfqd, struct cfq_queue *cfqq)
{
2874
	enum wl_class_t wl_class = cfqq_class(cfqq);
2875
	struct cfq_rb_root *st = cfqq->service_tree;
2876

2877 2878
	BUG_ON(!st);
	BUG_ON(!st->count);
2879

2880 2881 2882
	if (!cfqd->cfq_slice_idle)
		return false;

2883
	/* We never do for idle class queues. */
2884
	if (wl_class == IDLE_WORKLOAD)
2885 2886 2887
		return false;

	/* We do for queues that were marked with idle window flag. */
2888 2889
	if (cfq_cfqq_idle_window(cfqq) &&
	   !(blk_queue_nonrot(cfqd->queue) && cfqd->hw_tag))
2890 2891 2892 2893 2894 2895
		return true;

	/*
	 * Otherwise, we do only if they are the last ones
	 * in their service tree.
	 */
2896 2897
	if (st->count == 1 && cfq_cfqq_sync(cfqq) &&
	   !cfq_io_thinktime_big(cfqd, &st->ttime, false))
S
Shaohua Li 已提交
2898
		return true;
2899
	cfq_log_cfqq(cfqd, cfqq, "Not idling. st->count:%d", st->count);
S
Shaohua Li 已提交
2900
	return false;
2901 2902
}

static void cfq_arm_slice_timer(struct cfq_data *cfqd)
{
	struct cfq_queue *cfqq = cfqd->active_queue;
	struct cfq_rb_root *st = cfqq->service_tree;
	struct cfq_io_cq *cic;
	u64 sl, group_idle = 0;
	u64 now = ktime_get_ns();

	/*
	 * SSD device without seek penalty, disable idling. But only do so
	 * for devices that support queuing, otherwise we still have a problem
	 * with sync vs async workloads.
	 */
	if (blk_queue_nonrot(cfqd->queue) && cfqd->hw_tag &&
		!cfqd->cfq_group_idle)
		return;

	WARN_ON(!RB_EMPTY_ROOT(&cfqq->sort_list));
	WARN_ON(cfq_cfqq_slice_new(cfqq));

	/*
	 * idle is disabled, either manually or by past process history
	 */
	if (!cfq_should_idle(cfqd, cfqq)) {
		/* no queue idling. Check for group idling */
		if (cfqd->cfq_group_idle)
			group_idle = cfqd->cfq_group_idle;
		else
			return;
	}

	/*
	 * still active requests from this queue, don't idle
	 */
	if (cfqq->dispatched)
		return;

	/*
	 * task has exited, don't wait
	 */
	cic = cfqd->active_cic;
	if (!cic || !atomic_read(&cic->icq.ioc->active_ref))
		return;

	/*
	 * If our average think time is larger than the remaining time
	 * slice, then don't idle. This avoids overrunning the allotted
	 * time slice.
	 */
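	/*
	 * e.g. with 4ms of slice left but a mean think time of 6ms, idling
	 * would overrun the slice, so it is not worth waiting.
	 */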
	if (sample_valid(cic->ttime.ttime_samples) &&
	    (cfqq->slice_end - now < cic->ttime.ttime_mean)) {
		cfq_log_cfqq(cfqd, cfqq, "Not idling. think_time:%llu",
			     cic->ttime.ttime_mean);
		return;
	}

	/*
	 * There are other queues in the group or this is the only group and
	 * it has too big thinktime, don't do group idle.
	 */
	if (group_idle &&
	    (cfqq->cfqg->nr_cfqq > 1 ||
	     cfq_io_thinktime_big(cfqd, &st->ttime, true)))
		return;

	cfq_mark_cfqq_wait_request(cfqq);

	if (group_idle)
		sl = cfqd->cfq_group_idle;
	else
		sl = cfqd->cfq_slice_idle;

	hrtimer_start(&cfqd->idle_slice_timer, ns_to_ktime(sl),
		      HRTIMER_MODE_REL);
	cfqg_stats_set_start_idle_time(cfqq->cfqg);
	cfq_log_cfqq(cfqd, cfqq, "arm_idle: %llu group_idle: %d", sl,
			group_idle ? 1 : 0);
}

2982 2983 2984
/*
 * Move request from internal lists to the request queue dispatch list.
 */
2985
static void cfq_dispatch_insert(struct request_queue *q, struct request *rq)
L
Linus Torvalds 已提交
2986
{
2987
	struct cfq_data *cfqd = q->elevator->elevator_data;
J
Jens Axboe 已提交
2988
	struct cfq_queue *cfqq = RQ_CFQQ(rq);
2989

2990 2991
	cfq_log_cfqq(cfqd, cfqq, "dispatch_insert");

2992
	cfqq->next_rq = cfq_find_next_rq(cfqd, cfqq, rq);
2993
	cfq_remove_request(rq);
J
Jens Axboe 已提交
2994
	cfqq->dispatched++;
2995
	(RQ_CFQG(rq))->dispatched++;
2996
	elv_dispatch_sort(q, rq);
2997

2998
	cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]++;
2999
	cfqq->nr_sectors += blk_rq_sectors(rq);
L
Linus Torvalds 已提交
3000 3001 3002 3003 3004
}

/*
 * return expired entry, or NULL to just start from scratch in rbtree
 */
J
Jens Axboe 已提交
3005
static struct request *cfq_check_fifo(struct cfq_queue *cfqq)
L
Linus Torvalds 已提交
3006
{
3007
	struct request *rq = NULL;
L
Linus Torvalds 已提交
3008

J
Jens Axboe 已提交
3009
	if (cfq_cfqq_fifo_expire(cfqq))
L
Linus Torvalds 已提交
3010
		return NULL;
3011 3012 3013

	cfq_mark_cfqq_fifo_expire(cfqq);

3014 3015
	if (list_empty(&cfqq->fifo))
		return NULL;
L
Linus Torvalds 已提交
3016

3017
	rq = rq_entry_fifo(cfqq->fifo.next);
3018
	if (ktime_get_ns() < rq->fifo_time)
3019
		rq = NULL;
L
Linus Torvalds 已提交
3020

J
Jens Axboe 已提交
3021
	return rq;
L
Linus Torvalds 已提交
3022 3023
}

static inline int
cfq_prio_to_maxrq(struct cfq_data *cfqd, struct cfq_queue *cfqq)
{
	const int base_rq = cfqd->cfq_slice_async_rq;
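	/*
	 * Rough illustration: the cap below is 2 * base_rq *
	 * (IOPRIO_BE_NR - ioprio), so with the default cfq_slice_async_rq
	 * of 2 an ioprio-0 queue may queue 32 requests per slice while an
	 * ioprio-7 queue is limited to 4.
	 */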

	WARN_ON(cfqq->ioprio >= IOPRIO_BE_NR);

	return 2 * base_rq * (IOPRIO_BE_NR - cfqq->ioprio);
}

/*
 * Must be called with the queue_lock held.
 */
static int cfqq_process_refs(struct cfq_queue *cfqq)
{
	int process_refs, io_refs;

	io_refs = cfqq->allocated[READ] + cfqq->allocated[WRITE];
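	/*
	 * Each allocated request holds a queue reference, so subtracting the
	 * in-flight allocations from the total count leaves, roughly, the
	 * references held by processes.
	 */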
	process_refs = cfqq->ref - io_refs;
	BUG_ON(process_refs < 0);
	return process_refs;
}

static void cfq_setup_merge(struct cfq_queue *cfqq, struct cfq_queue *new_cfqq)
{
	int process_refs, new_process_refs;
	struct cfq_queue *__cfqq;

	/*
	 * If there are no process references on the new_cfqq, then it is
	 * unsafe to follow the ->new_cfqq chain as other cfqq's in the
	 * chain may have dropped their last reference (not just their
	 * last process reference).
	 */
	if (!cfqq_process_refs(new_cfqq))
		return;

	/* Avoid a circular list and skip interim queue merges */
	while ((__cfqq = new_cfqq->new_cfqq)) {
		if (__cfqq == cfqq)
			return;
		new_cfqq = __cfqq;
	}

	process_refs = cfqq_process_refs(cfqq);
	new_process_refs = cfqq_process_refs(new_cfqq);
	/*
	 * If the process for the cfqq has gone away, there is no
	 * sense in merging the queues.
	 */
	if (process_refs == 0 || new_process_refs == 0)
		return;

	/*
	 * Merge in the direction of the lesser amount of work.
	 */
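	/*
	 * "Work" is approximated by process reference counts here: the queue
	 * with fewer process references is redirected into the busier one.
	 */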
	if (new_process_refs >= process_refs) {
		cfqq->new_cfqq = new_cfqq;
		new_cfqq->ref += process_refs;
	} else {
		new_cfqq->new_cfqq = cfqq;
		cfqq->ref += new_process_refs;
	}
}

3089
static enum wl_type_t cfq_choose_wl_type(struct cfq_data *cfqd,
3090
			struct cfq_group *cfqg, enum wl_class_t wl_class)
3091 3092 3093 3094
{
	struct cfq_queue *queue;
	int i;
	bool key_valid = false;
3095
	u64 lowest_key = 0;
3096 3097
	enum wl_type_t cur_best = SYNC_NOIDLE_WORKLOAD;

3098 3099
	for (i = 0; i <= SYNC_WORKLOAD; ++i) {
		/* select the one with lowest rb_key */
3100
		queue = cfq_rb_first(st_for(cfqg, wl_class, i));
3101
		if (queue &&
3102
		    (!key_valid || queue->rb_key < lowest_key)) {
3103 3104 3105 3106 3107 3108 3109 3110 3111
			lowest_key = queue->rb_key;
			cur_best = i;
			key_valid = true;
		}
	}

	return cur_best;
}

3112 3113
static void
choose_wl_class_and_type(struct cfq_data *cfqd, struct cfq_group *cfqg)
3114
{
3115
	u64 slice;
3116
	unsigned count;
3117
	struct cfq_rb_root *st;
3118
	u64 group_slice;
3119
	enum wl_class_t original_class = cfqd->serving_wl_class;
3120
	u64 now = ktime_get_ns();
3121

3122
	/* Choose next priority. RT > BE > IDLE */
3123
	if (cfq_group_busy_queues_wl(RT_WORKLOAD, cfqd, cfqg))
3124
		cfqd->serving_wl_class = RT_WORKLOAD;
3125
	else if (cfq_group_busy_queues_wl(BE_WORKLOAD, cfqd, cfqg))
3126
		cfqd->serving_wl_class = BE_WORKLOAD;
3127
	else {
3128
		cfqd->serving_wl_class = IDLE_WORKLOAD;
3129
		cfqd->workload_expires = now + jiffies_to_nsecs(1);
3130 3131 3132
		return;
	}

3133
	if (original_class != cfqd->serving_wl_class)
3134 3135
		goto new_workload;

3136 3137 3138 3139 3140
	/*
	 * For RT and BE, we have to choose also the type
	 * (SYNC, SYNC_NOIDLE, ASYNC), and to compute a workload
	 * expiration time
	 */
	st = st_for(cfqg, cfqd->serving_wl_class, cfqd->serving_wl_type);
	count = st->count;

	/*
	 * check workload expiration, and that we still have other queues ready
	 */
	if (count && !(now > cfqd->workload_expires))
		return;

new_workload:
	/* otherwise select new workload type */
	cfqd->serving_wl_type = cfq_choose_wl_type(cfqd, cfqg,
					cfqd->serving_wl_class);
	st = st_for(cfqg, cfqd->serving_wl_class, cfqd->serving_wl_type);
	count = st->count;

	/*
	 * the workload slice is computed as a fraction of target latency
	 * proportional to the number of queues in that workload, over
	 * all the queues in the same priority class
	 */
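	/*
	 * Roughly: slice = group_slice * (queues in this workload type) /
	 * (busy queues of this priority class in the group), clamped to at
	 * least CFQ_MIN_TT further down.
	 */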
	group_slice = cfq_group_slice(cfqd, cfqg);

	slice = div_u64(group_slice * count,
		max_t(unsigned, cfqg->busy_queues_avg[cfqd->serving_wl_class],
		      cfq_group_busy_queues_wl(cfqd->serving_wl_class, cfqd,
3167
					cfqg)));
3168

3169
	if (cfqd->serving_wl_type == ASYNC_WORKLOAD) {
3170
		u64 tmp;
3171 3172 3173 3174 3175 3176 3177 3178

		/*
		 * Async queues are currently system wide. Just taking
		 * proportion of queues with-in same group will lead to higher
		 * async ratio system wide as generally root group is going
		 * to have higher weight. A more accurate thing would be to
		 * calculate a system wide async/sync ratio.
		 */
3179 3180
		tmp = cfqd->cfq_target_latency *
			cfqg_busy_async_queues(cfqd, cfqg);
3181 3182
		tmp = div_u64(tmp, cfqd->busy_queues);
		slice = min_t(u64, slice, tmp);
3183

3184 3185
		/* async workload slice is scaled down according to
		 * the sync/async slice ratio. */
3186
		slice = div64_u64(slice*cfqd->cfq_slice[0], cfqd->cfq_slice[1]);
3187
	} else
3188 3189 3190
		/* sync workload slice is at least 2 * cfq_slice_idle */
		slice = max(slice, 2 * cfqd->cfq_slice_idle);

3191 3192 3193
	slice = max_t(u64, slice, CFQ_MIN_TT);
	cfq_log(cfqd, "workload slice:%llu", slice);
	cfqd->workload_expires = now + slice;
3194 3195
}

3196 3197 3198
static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd)
{
	struct cfq_rb_root *st = &cfqd->grp_service_tree;
3199
	struct cfq_group *cfqg;
3200

3201
	if (RB_EMPTY_ROOT(&st->rb.rb_root))
3202
		return NULL;
3203 3204 3205
	cfqg = cfq_rb_first_group(st);
	update_min_vdisktime(st);
	return cfqg;
3206 3207
}

3208 3209
static void cfq_choose_cfqg(struct cfq_data *cfqd)
{
3210
	struct cfq_group *cfqg = cfq_get_next_cfqg(cfqd);
3211
	u64 now = ktime_get_ns();
3212 3213

	cfqd->serving_group = cfqg;
3214 3215

	/* Restore the workload type data */
3216
	if (cfqg->saved_wl_slice) {
3217
		cfqd->workload_expires = now + cfqg->saved_wl_slice;
3218 3219
		cfqd->serving_wl_type = cfqg->saved_wl_type;
		cfqd->serving_wl_class = cfqg->saved_wl_class;
3220
	} else
3221
		cfqd->workload_expires = now - 1;
3222

3223
	choose_wl_class_and_type(cfqd, cfqg);
3224 3225
}

3226
/*
3227 3228
 * Select a queue for service. If we have a current active queue,
 * check whether to continue servicing it, or retrieve and set a new one.
3229
 */
3230
static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)
L
Linus Torvalds 已提交
3231
{
3232
	struct cfq_queue *cfqq, *new_cfqq = NULL;
3233
	u64 now = ktime_get_ns();
L
Linus Torvalds 已提交
3234

3235 3236 3237
	cfqq = cfqd->active_queue;
	if (!cfqq)
		goto new_queue;
L
Linus Torvalds 已提交
3238

3239 3240
	if (!cfqd->rq_queued)
		return NULL;
3241 3242 3243 3244 3245 3246 3247

	/*
	 * We were waiting for group to get backlogged. Expire the queue
	 */
	if (cfq_cfqq_wait_busy(cfqq) && !RB_EMPTY_ROOT(&cfqq->sort_list))
		goto expire;

3248
	/*
J
Jens Axboe 已提交
3249
	 * The active queue has run out of time, expire it and select new.
3250
	 */
3251 3252 3253 3254 3255 3256 3257 3258 3259 3260
	if (cfq_slice_used(cfqq) && !cfq_cfqq_must_dispatch(cfqq)) {
		/*
		 * If slice had not expired at the completion of last request
		 * we might not have turned on wait_busy flag. Don't expire
		 * the queue yet. Allow the group to get backlogged.
		 *
		 * The very fact that we have used the slice, that means we
		 * have been idling all along on this queue and it should be
		 * ok to wait for this request to complete.
		 */
3261 3262 3263
		if (cfqq->cfqg->nr_cfqq == 1 && RB_EMPTY_ROOT(&cfqq->sort_list)
		    && cfqq->dispatched && cfq_should_idle(cfqd, cfqq)) {
			cfqq = NULL;
3264
			goto keep_queue;
3265
		} else
3266
			goto check_group_idle;
3267
	}
L
Linus Torvalds 已提交
3268

3269
	/*
J
Jens Axboe 已提交
3270 3271
	 * The active queue has requests and isn't expired, allow it to
	 * dispatch.
3272
	 */
3273
	if (!RB_EMPTY_ROOT(&cfqq->sort_list))
3274
		goto keep_queue;
J
Jens Axboe 已提交
3275

3276 3277 3278 3279
	/*
	 * If another queue has a request waiting within our mean seek
	 * distance, let it run.  The expire code will check for close
	 * cooperators and put the close queue at the front of the service
J
Jeff Moyer 已提交
3280
	 * tree.  If possible, merge the expiring queue with the new cfqq.
3281
	 */
3282
	new_cfqq = cfq_close_cooperator(cfqd, cfqq);
J
Jeff Moyer 已提交
3283 3284 3285
	if (new_cfqq) {
		if (!cfqq->new_cfqq)
			cfq_setup_merge(cfqq, new_cfqq);
3286
		goto expire;
J
Jeff Moyer 已提交
3287
	}
3288

J
Jens Axboe 已提交
3289 3290 3291 3292 3293
	/*
	 * No requests pending. If the active queue still has requests in
	 * flight or is idling for a new request, allow either of these
	 * conditions to happen (or time out) before selecting a new queue.
	 */
3294
	if (hrtimer_active(&cfqd->idle_slice_timer)) {
3295 3296 3297 3298
		cfqq = NULL;
		goto keep_queue;
	}

3299 3300 3301 3302 3303 3304
	/*
	 * This is a deep seek queue, but the device is much faster than
	 * the queue can deliver, don't idle
	 **/
	if (CFQQ_SEEKY(cfqq) && cfq_cfqq_idle_window(cfqq) &&
	    (cfq_cfqq_slice_new(cfqq) ||
3305
	    (cfqq->slice_end - now > now - cfqq->slice_start))) {
3306 3307 3308 3309
		cfq_clear_cfqq_deep(cfqq);
		cfq_clear_cfqq_idle_window(cfqq);
	}

3310 3311 3312 3313 3314 3315 3316 3317 3318 3319
	if (cfqq->dispatched && cfq_should_idle(cfqd, cfqq)) {
		cfqq = NULL;
		goto keep_queue;
	}

	/*
	 * If group idle is enabled and there are requests dispatched from
	 * this group, wait for requests to complete.
	 */
check_group_idle:
S
Shaohua Li 已提交
3320 3321 3322
	if (cfqd->cfq_group_idle && cfqq->cfqg->nr_cfqq == 1 &&
	    cfqq->cfqg->dispatched &&
	    !cfq_io_thinktime_big(cfqd, &cfqq->cfqg->ttime, true)) {
3323 3324
		cfqq = NULL;
		goto keep_queue;
3325 3326
	}

J
Jens Axboe 已提交
3327
expire:
3328
	cfq_slice_expired(cfqd, 0);
J
Jens Axboe 已提交
3329
new_queue:
3330 3331 3332 3333 3334
	/*
	 * Current queue expired. Check if we have to switch to a new
	 * service tree
	 */
	if (!new_cfqq)
3335
		cfq_choose_cfqg(cfqd);
3336

3337
	cfqq = cfq_set_active_queue(cfqd, new_cfqq);
3338
keep_queue:
J
Jens Axboe 已提交
3339
	return cfqq;
3340 3341
}

J
Jens Axboe 已提交
3342
static int __cfq_forced_dispatch_cfqq(struct cfq_queue *cfqq)
3343 3344 3345 3346 3347 3348 3349 3350 3351
{
	int dispatched = 0;

	while (cfqq->next_rq) {
		cfq_dispatch_insert(cfqq->cfqd->queue, cfqq->next_rq);
		dispatched++;
	}

	BUG_ON(!list_empty(&cfqq->fifo));
3352 3353

	/* By default cfqq is not expired if it is empty. Do it explicitly */
3354
	__cfq_slice_expired(cfqq->cfqd, cfqq, 0);
3355 3356 3357
	return dispatched;
}

3358 3359 3360 3361
/*
 * Drain our current requests. Used for barriers and when switching
 * io schedulers on-the-fly.
 */
3362
static int cfq_forced_dispatch(struct cfq_data *cfqd)
3363
{
3364
	struct cfq_queue *cfqq;
3365
	int dispatched = 0;
3366

3367
	/* Expire the timeslice of the current active queue first */
3368
	cfq_slice_expired(cfqd, 0);
3369 3370
	while ((cfqq = cfq_get_next_queue_forced(cfqd)) != NULL) {
		__cfq_set_active_queue(cfqd, cfqq);
3371
		dispatched += __cfq_forced_dispatch_cfqq(cfqq);
3372
	}
3373 3374 3375

	BUG_ON(cfqd->busy_queues);

3376
	cfq_log(cfqd, "forced_dispatch=%d", dispatched);
3377 3378 3379
	return dispatched;
}

S
Shaohua Li 已提交
3380 3381 3382
static inline bool cfq_slice_used_soon(struct cfq_data *cfqd,
	struct cfq_queue *cfqq)
{
3383 3384
	u64 now = ktime_get_ns();

S
Shaohua Li 已提交
3385 3386
	/* the queue hasn't finished any request, can't estimate */
	if (cfq_cfqq_slice_new(cfqq))
S
Shaohua Li 已提交
3387
		return true;
3388
	if (now + cfqd->cfq_slice_idle * cfqq->dispatched > cfqq->slice_end)
S
Shaohua Li 已提交
3389
		return true;
S
Shaohua Li 已提交
3390

S
Shaohua Li 已提交
3391
	return false;
S
Shaohua Li 已提交
3392 3393
}

3394
static bool cfq_may_dispatch(struct cfq_data *cfqd, struct cfq_queue *cfqq)
3395 3396
{
	unsigned int max_dispatch;
3397

3398 3399 3400
	if (cfq_cfqq_must_dispatch(cfqq))
		return true;

3401 3402 3403
	/*
	 * Drain async requests before we start sync IO
	 */
3404
	if (cfq_should_idle(cfqd, cfqq) && cfqd->rq_in_flight[BLK_RW_ASYNC])
3405
		return false;
3406

3407 3408 3409
	/*
	 * If this is an async queue and we have sync IO in flight, let it wait
	 */
3410
	if (cfqd->rq_in_flight[BLK_RW_SYNC] && !cfq_cfqq_sync(cfqq))
3411
		return false;
3412

S
Shaohua Li 已提交
3413
	max_dispatch = max_t(unsigned int, cfqd->cfq_quantum / 2, 1);
3414 3415
	if (cfq_class_idle(cfqq))
		max_dispatch = 1;
3416

3417 3418 3419 3420
	/*
	 * Does this cfqq already have too much IO in flight?
	 */
	if (cfqq->dispatched >= max_dispatch) {
3421
		bool promote_sync = false;
3422 3423 3424
		/*
		 * idle queue must always only have a single IO in flight
		 */
3425
		if (cfq_class_idle(cfqq))
3426
			return false;
3427

3428
		/*
3429 3430
		 * If there is only one sync queue
		 * we can ignore async queue here and give the sync
3431 3432 3433 3434
		 * queue no dispatch limit. The reason is a sync queue can
		 * preempt async queue, limiting the sync queue doesn't make
		 * sense. This is useful for aiostress test.
		 */
3435 3436
		if (cfq_cfqq_sync(cfqq) && cfqd->busy_sync_queues == 1)
			promote_sync = true;
3437

3438 3439 3440
		/*
		 * We have other queues, don't allow more IO from this one
		 */
3441 3442
		if (cfqd->busy_queues > 1 && cfq_slice_used_soon(cfqd, cfqq) &&
				!promote_sync)
3443
			return false;
3444

3445
		/*
3446
		 * Sole queue user, no limit
3447
		 */
3448
		if (cfqd->busy_queues == 1 || promote_sync)
S
Shaohua Li 已提交
3449 3450 3451 3452 3453 3454 3455 3456 3457
			max_dispatch = -1;
		else
			/*
			 * Normally we start throttling cfqq when cfq_quantum/2
			 * requests have been dispatched. But we can drive
			 * deeper queue depths at the beginning of slice
			 * subjected to upper limit of cfq_quantum.
			 */
			max_dispatch = cfqd->cfq_quantum;
	}

	/*
	 * Async queues must wait a bit before being allowed dispatch.
	 * We also ramp up the dispatch depth gradually for async IO,
	 * based on the last sync IO we serviced
	 */
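	/*
	 * For illustration: if the last delayed sync request completed three
	 * sync-slice lengths ago, the async depth below becomes 3 (unless
	 * max_dispatch is already lower).
	 */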
	if (!cfq_cfqq_sync(cfqq) && cfqd->cfq_latency) {
		u64 last_sync = ktime_get_ns() - cfqd->last_delayed_sync;
		unsigned int depth;

		depth = div64_u64(last_sync, cfqd->cfq_slice[1]);
		if (!depth && !cfqq->dispatched)
			depth = 1;
		if (depth < max_dispatch)
			max_dispatch = depth;
	}

	/*
	 * If we're below the current max, allow a dispatch
	 */
	return cfqq->dispatched < max_dispatch;
}

/*
 * Dispatch a request from cfqq, moving them to the request queue
 * dispatch list.
 */
static bool cfq_dispatch_request(struct cfq_data *cfqd, struct cfq_queue *cfqq)
{
	struct request *rq;

	BUG_ON(RB_EMPTY_ROOT(&cfqq->sort_list));

3492 3493 3494 3495
	rq = cfq_check_fifo(cfqq);
	if (rq)
		cfq_mark_cfqq_must_dispatch(cfqq);

3496 3497 3498 3499 3500 3501 3502 3503
	if (!cfq_may_dispatch(cfqd, cfqq))
		return false;

	/*
	 * follow expired path, else get first next available
	 */
	if (!rq)
		rq = cfqq->next_rq;
	else
		cfq_log_cfqq(cfqq->cfqd, cfqq, "fifo=%p", rq);

	/*
	 * insert request into driver dispatch list
	 */
	cfq_dispatch_insert(cfqd->queue, rq);

	if (!cfqd->active_cic) {
		struct cfq_io_cq *cic = RQ_CIC(rq);

		atomic_long_inc(&cic->icq.ioc->refcount);
		cfqd->active_cic = cic;
	}

	return true;
}

/*
 * Find the cfqq that we need to service and move a request from that to the
 * dispatch list
 */
static int cfq_dispatch_requests(struct request_queue *q, int force)
{
	struct cfq_data *cfqd = q->elevator->elevator_data;
	struct cfq_queue *cfqq;

	if (!cfqd->busy_queues)
		return 0;

	if (unlikely(force))
		return cfq_forced_dispatch(cfqd);

	cfqq = cfq_select_queue(cfqd);
	if (!cfqq)
		return 0;

	/*
	 * Dispatch a request from this cfqq, if it is allowed
	 */
	if (!cfq_dispatch_request(cfqd, cfqq))
		return 0;

	cfqq->slice_dispatch++;
	cfq_clear_cfqq_must_dispatch(cfqq);

	/*
	 * expire an async queue immediately if it has used up its slice. an
	 * idle queue always expires after 1 dispatch round.
	 */
	if (cfqd->busy_queues > 1 && ((!cfq_cfqq_sync(cfqq) &&
	    cfqq->slice_dispatch >= cfq_prio_to_maxrq(cfqd, cfqq)) ||
	    cfq_class_idle(cfqq))) {
		cfqq->slice_end = ktime_get_ns() + 1;
		cfq_slice_expired(cfqd, 0);
	}

	cfq_log_cfqq(cfqd, cfqq, "dispatched a request");
	return 1;
}

/*
 * A task holds one reference to the queue, dropped when the task exits.
 * Each rq in flight on this queue also holds a reference, dropped when the
 * rq is freed.
 *
 * Each cfq queue took a reference on the parent group. Drop it now.
 * queue lock must be held here.
 */
static void cfq_put_queue(struct cfq_queue *cfqq)
{
	struct cfq_data *cfqd = cfqq->cfqd;
	struct cfq_group *cfqg;

	BUG_ON(cfqq->ref <= 0);

	cfqq->ref--;
	if (cfqq->ref)
		return;

	cfq_log_cfqq(cfqd, cfqq, "put_queue");
	BUG_ON(rb_first(&cfqq->sort_list));
	BUG_ON(cfqq->allocated[READ] + cfqq->allocated[WRITE]);
	cfqg = cfqq->cfqg;

	if (unlikely(cfqd->active_queue == cfqq)) {
		__cfq_slice_expired(cfqd, cfqq, 0);
		cfq_schedule_dispatch(cfqd);
	}

	BUG_ON(cfq_cfqq_on_rr(cfqq));
	kmem_cache_free(cfq_pool, cfqq);
	cfqg_put(cfqg);
}

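/*
 * Drop the references this cfqq holds on the queues in its new_cfqq merge
 * chain (set up by cfq_setup_merge), guarding against a loop in the chain.
 */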
static void cfq_put_cooperator(struct cfq_queue *cfqq)
{
	struct cfq_queue *__cfqq, *next;

	/*
	 * If this queue was scheduled to merge with another queue, be
	 * sure to drop the reference taken on that queue (and others in
	 * the merge chain).  See cfq_setup_merge and cfq_merge_cfqqs.
	 */
	__cfqq = cfqq->new_cfqq;
	while (__cfqq) {
		if (__cfqq == cfqq) {
			WARN(1, "cfqq->new_cfqq loop detected\n");
			break;
		}
		next = __cfqq->new_cfqq;
		cfq_put_queue(__cfqq);
		__cfqq = next;
	}
}

static void cfq_exit_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq)
{
	if (unlikely(cfqq == cfqd->active_queue)) {
		__cfq_slice_expired(cfqd, cfqq, 0);
		cfq_schedule_dispatch(cfqd);
	}

	cfq_put_cooperator(cfqq);

	cfq_put_queue(cfqq);
}

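/* icq lifetime hooks: seed the think-time clock and tear down both cfqqs */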
static void cfq_init_icq(struct io_cq *icq)
{
	struct cfq_io_cq *cic = icq_to_cic(icq);

	cic->ttime.last_end_request = ktime_get_ns();
}

static void cfq_exit_icq(struct io_cq *icq)
{
	struct cfq_io_cq *cic = icq_to_cic(icq);
	struct cfq_data *cfqd = cic_to_cfqd(cic);

	if (cic_to_cfqq(cic, false)) {
		cfq_exit_cfqq(cfqd, cic_to_cfqq(cic, false));
		cic_set_cfqq(cic, NULL, false);
	}

	if (cic_to_cfqq(cic, true)) {
		cfq_exit_cfqq(cfqd, cic_to_cfqq(cic, true));
		cic_set_cfqq(cic, NULL, true);
	}
}

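/*
 * (Re)compute cfqq->ioprio and ->ioprio_class from the io_context, falling
 * back to the task's CPU scheduling class when no priority has been set.
 */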
static void cfq_init_prio_data(struct cfq_queue *cfqq, struct cfq_io_cq *cic)
{
	struct task_struct *tsk = current;
	int ioprio_class;

	if (!cfq_cfqq_prio_changed(cfqq))
		return;

	ioprio_class = IOPRIO_PRIO_CLASS(cic->ioprio);
	switch (ioprio_class) {
	default:
		printk(KERN_ERR "cfq: bad prio %x\n", ioprio_class);
	case IOPRIO_CLASS_NONE:
		/*
		 * no prio set, inherit CPU scheduling settings
		 */
		cfqq->ioprio = task_nice_ioprio(tsk);
		cfqq->ioprio_class = task_nice_ioclass(tsk);
		break;
	case IOPRIO_CLASS_RT:
		cfqq->ioprio = IOPRIO_PRIO_DATA(cic->ioprio);
		cfqq->ioprio_class = IOPRIO_CLASS_RT;
		break;
	case IOPRIO_CLASS_BE:
		cfqq->ioprio = IOPRIO_PRIO_DATA(cic->ioprio);
		cfqq->ioprio_class = IOPRIO_CLASS_BE;
		break;
	case IOPRIO_CLASS_IDLE:
		cfqq->ioprio_class = IOPRIO_CLASS_IDLE;
		cfqq->ioprio = 7;
		cfq_clear_cfqq_idle_window(cfqq);
		break;
	}

	/*
	 * keep track of original prio settings in case we have to temporarily
	 * elevate the priority of this queue
	 */
	cfqq->org_ioprio = cfqq->ioprio;
	cfqq->org_ioprio_class = cfqq->ioprio_class;
	cfq_clear_cfqq_prio_changed(cfqq);
}

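/*
 * React to an ioprio change on the io_context: recreate the async queue
 * with the new priority and flag the sync queue for a prio update.
 */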
static void check_ioprio_changed(struct cfq_io_cq *cic, struct bio *bio)
{
	int ioprio = cic->icq.ioc->ioprio;
	struct cfq_data *cfqd = cic_to_cfqd(cic);
	struct cfq_queue *cfqq;

	/*
	 * Check whether ioprio has changed.  The condition may trigger
	 * spuriously on a newly created cic but there's no harm.
	 */
	if (unlikely(!cfqd) || likely(cic->ioprio == ioprio))
		return;

	cfqq = cic_to_cfqq(cic, false);
	if (cfqq) {
		cfq_put_queue(cfqq);
		cfqq = cfq_get_queue(cfqd, BLK_RW_ASYNC, cic, bio);
		cic_set_cfqq(cic, cfqq, false);
	}

	cfqq = cic_to_cfqq(cic, true);
	if (cfqq)
		cfq_mark_cfqq_prio_changed(cfqq);

	cic->ioprio = ioprio;
}

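/* Basic initialization of a freshly allocated cfq_queue. */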
static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
			  pid_t pid, bool is_sync)
{
	RB_CLEAR_NODE(&cfqq->rb_node);
	RB_CLEAR_NODE(&cfqq->p_node);
	INIT_LIST_HEAD(&cfqq->fifo);

	cfqq->ref = 0;
	cfqq->cfqd = cfqd;

	cfq_mark_cfqq_prio_changed(cfqq);

	if (is_sync) {
		if (!cfq_class_idle(cfqq))
			cfq_mark_cfqq_idle_window(cfqq);
		cfq_mark_cfqq_sync(cfqq);
	}
	cfqq->pid = pid;
}

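/*
 * If the bio's blkcg differs from the one this cic last used (tracked via
 * the css serial number), drop both queues so they get reallocated in the
 * new group.
 */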
#ifdef CONFIG_CFQ_GROUP_IOSCHED
static void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio)
{
	struct cfq_data *cfqd = cic_to_cfqd(cic);
	struct cfq_queue *cfqq;
	uint64_t serial_nr;

	rcu_read_lock();
	serial_nr = bio_blkcg(bio)->css.serial_nr;
	rcu_read_unlock();

	/*
	 * Check whether blkcg has changed.  The condition may trigger
	 * spuriously on a newly created cic but there's no harm.
	 */
	if (unlikely(!cfqd) || likely(cic->blkcg_serial_nr == serial_nr))
		return;

	/*
	 * Drop reference to queues.  New queues will be assigned in new
	 * group upon arrival of fresh requests.
	 */
	cfqq = cic_to_cfqq(cic, false);
	if (cfqq) {
		cfq_log_cfqq(cfqd, cfqq, "changed cgroup");
		cic_set_cfqq(cic, NULL, false);
		cfq_put_queue(cfqq);
	}

	cfqq = cic_to_cfqq(cic, true);
	if (cfqq) {
		cfq_log_cfqq(cfqd, cfqq, "changed cgroup");
		cic_set_cfqq(cic, NULL, true);
		cfq_put_queue(cfqq);
	}

	cic->blkcg_serial_nr = serial_nr;
}
#else
static inline void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio)
{
}
#endif  /* CONFIG_CFQ_GROUP_IOSCHED */

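/*
 * Async queues are shared per (class, priority) within a group; return the
 * slot that holds the matching queue pointer.
 */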
static struct cfq_queue **
cfq_async_queue_prio(struct cfq_group *cfqg, int ioprio_class, int ioprio)
{
	switch (ioprio_class) {
	case IOPRIO_CLASS_RT:
		return &cfqg->async_cfqq[0][ioprio];
	case IOPRIO_CLASS_NONE:
		ioprio = IOPRIO_NORM;
		/* fall through */
	case IOPRIO_CLASS_BE:
		return &cfqg->async_cfqq[1][ioprio];
	case IOPRIO_CLASS_IDLE:
		return &cfqg->async_idle_cfqq;
	default:
		BUG();
	}
}

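/*
 * Look up (or allocate) the cfqq this cic/bio should use, falling back to
 * the embedded oom_cfqq when the group lookup or the allocation fails.
 */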
static struct cfq_queue *
cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_io_cq *cic,
	      struct bio *bio)
{
	int ioprio_class = IOPRIO_PRIO_CLASS(cic->ioprio);
	int ioprio = IOPRIO_PRIO_DATA(cic->ioprio);
	struct cfq_queue **async_cfqq = NULL;
	struct cfq_queue *cfqq;
	struct cfq_group *cfqg;

	rcu_read_lock();
	cfqg = cfq_lookup_cfqg(cfqd, bio_blkcg(bio));
	if (!cfqg) {
		cfqq = &cfqd->oom_cfqq;
		goto out;
	}

	if (!is_sync) {
		if (!ioprio_valid(cic->ioprio)) {
			struct task_struct *tsk = current;
			ioprio = task_nice_ioprio(tsk);
			ioprio_class = task_nice_ioclass(tsk);
		}
		async_cfqq = cfq_async_queue_prio(cfqg, ioprio_class, ioprio);
		cfqq = *async_cfqq;
		if (cfqq)
			goto out;
	}

	cfqq = kmem_cache_alloc_node(cfq_pool,
				     GFP_NOWAIT | __GFP_ZERO | __GFP_NOWARN,
				     cfqd->queue->node);
	if (!cfqq) {
		cfqq = &cfqd->oom_cfqq;
		goto out;
	}

	/* cfq_init_cfqq() assumes cfqq->ioprio_class is initialized. */
	cfqq->ioprio_class = IOPRIO_CLASS_NONE;
	cfq_init_cfqq(cfqd, cfqq, current->pid, is_sync);
	cfq_init_prio_data(cfqq, cic);
	cfq_link_cfqq_cfqg(cfqq, cfqg);
	cfq_log_cfqq(cfqd, cfqq, "alloced");

	if (async_cfqq) {
		/* a new async queue is created, pin and remember */
		cfqq->ref++;
		*async_cfqq = cfqq;
	}
out:
	cfqq->ref++;
	rcu_read_unlock();
	return cfqq;
}

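/*
 * Think time is tracked as a decaying average: each sample keeps 7/8 of the
 * previous total and adds the new (capped) gap between request completions,
 * so ttime_mean approximates recent inter-request idle time.
 */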
static void
__cfq_update_io_thinktime(struct cfq_ttime *ttime, u64 slice_idle)
{
	u64 elapsed = ktime_get_ns() - ttime->last_end_request;
	elapsed = min(elapsed, 2UL * slice_idle);

	ttime->ttime_samples = (7*ttime->ttime_samples + 256) / 8;
	ttime->ttime_total = div_u64(7*ttime->ttime_total + 256*elapsed,  8);
	ttime->ttime_mean = div64_ul(ttime->ttime_total + 128,
				     ttime->ttime_samples);
}

static void
cfq_update_io_thinktime(struct cfq_data *cfqd, struct cfq_queue *cfqq,
			struct cfq_io_cq *cic)
{
	if (cfq_cfqq_sync(cfqq)) {
		__cfq_update_io_thinktime(&cic->ttime, cfqd->cfq_slice_idle);
		__cfq_update_io_thinktime(&cfqq->service_tree->ttime,
			cfqd->cfq_slice_idle);
	}
#ifdef CONFIG_CFQ_GROUP_IOSCHED
	__cfq_update_io_thinktime(&cfqq->cfqg->ttime, cfqd->cfq_group_idle);
#endif
}

static void
cfq_update_io_seektime(struct cfq_data *cfqd, struct cfq_queue *cfqq,
		       struct request *rq)
{
	sector_t sdist = 0;
	sector_t n_sec = blk_rq_sectors(rq);
	if (cfqq->last_request_pos) {
		if (cfqq->last_request_pos < blk_rq_pos(rq))
			sdist = blk_rq_pos(rq) - cfqq->last_request_pos;
		else
			sdist = cfqq->last_request_pos - blk_rq_pos(rq);
	}

	cfqq->seek_history <<= 1;
	if (blk_queue_nonrot(cfqd->queue))
		cfqq->seek_history |= (n_sec < CFQQ_SECT_THR_NONROT);
	else
		cfqq->seek_history |= (sdist > CFQQ_SEEK_THR);
}

static inline bool req_noidle(struct request *req)
{
	return req_op(req) == REQ_OP_WRITE &&
		(req->cmd_flags & (REQ_SYNC | REQ_IDLE)) == REQ_SYNC;
}

/*
 * Disable idle window if the process thinks too long or seeks so much that
 * it doesn't matter
 */
static void
cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq,
		       struct cfq_io_cq *cic)
{
	int old_idle, enable_idle;

	/*
	 * Don't idle for async or idle io prio class
	 */
	if (!cfq_cfqq_sync(cfqq) || cfq_class_idle(cfqq))
		return;

	enable_idle = old_idle = cfq_cfqq_idle_window(cfqq);

	if (cfqq->queued[0] + cfqq->queued[1] >= 4)
		cfq_mark_cfqq_deep(cfqq);

	if (cfqq->next_rq && req_noidle(cfqq->next_rq))
		enable_idle = 0;
	else if (!atomic_read(&cic->icq.ioc->active_ref) ||
		 !cfqd->cfq_slice_idle ||
		 (!cfq_cfqq_deep(cfqq) && CFQQ_SEEKY(cfqq)))
		enable_idle = 0;
	else if (sample_valid(cic->ttime.ttime_samples)) {
		if (cic->ttime.ttime_mean > cfqd->cfq_slice_idle)
			enable_idle = 0;
		else
			enable_idle = 1;
	}

	if (old_idle != enable_idle) {
		cfq_log_cfqq(cfqd, cfqq, "idle=%d", enable_idle);
		if (enable_idle)
			cfq_mark_cfqq_idle_window(cfqq);
		else
			cfq_clear_cfqq_idle_window(cfqq);
	}
}

/*
 * Check if new_cfqq should preempt the currently active queue. Return false
 * for no or if we aren't sure; returning true will cause a preempt.
 */
static bool
cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq,
		   struct request *rq)
{
	struct cfq_queue *cfqq;

	cfqq = cfqd->active_queue;
	if (!cfqq)
		return false;

	if (cfq_class_idle(new_cfqq))
		return false;

	if (cfq_class_idle(cfqq))
		return true;

	/*
	 * Don't allow a non-RT request to preempt an ongoing RT cfqq timeslice.
	 */
	if (cfq_class_rt(cfqq) && !cfq_class_rt(new_cfqq))
		return false;

	/*
	 * if the new request is sync, but the currently running queue is
	 * not, let the sync request have priority.
	 */
	if (rq_is_sync(rq) && !cfq_cfqq_sync(cfqq) && !cfq_cfqq_must_dispatch(cfqq))
		return true;

	/*
	 * Treat ancestors of current cgroup the same way as current cgroup.
	 * For anybody else we disallow preemption to guarantee service
	 * fairness among cgroups.
	 */
	if (!cfqg_is_descendant(cfqq->cfqg, new_cfqq->cfqg))
		return false;

	if (cfq_slice_used(cfqq))
		return true;

	/*
	 * Allow an RT request to pre-empt an ongoing non-RT cfqq timeslice.
	 */
	if (cfq_class_rt(new_cfqq) && !cfq_class_rt(cfqq))
		return true;

	WARN_ON_ONCE(cfqq->ioprio_class != new_cfqq->ioprio_class);
	/* Allow preemption only if we are idling on sync-noidle tree */
	if (cfqd->serving_wl_type == SYNC_NOIDLE_WORKLOAD &&
	    cfqq_type(new_cfqq) == SYNC_NOIDLE_WORKLOAD &&
	    RB_EMPTY_ROOT(&cfqq->sort_list))
		return true;

	/*
	 * So both queues are sync. Let the new request get disk time if
	 * it's a metadata request and the current queue is doing regular IO.
	 */
	if ((rq->cmd_flags & REQ_PRIO) && !cfqq->prio_pending)
		return true;

	/* An idle queue should not be idle now for some reason */
	if (RB_EMPTY_ROOT(&cfqq->sort_list) && !cfq_should_idle(cfqd, cfqq))
		return true;

	if (!cfqd->active_cic || !cfq_cfqq_wait_request(cfqq))
		return false;

	/*
	 * if this request is as-good as one we would expect from the
	 * current cfqq, let it preempt
	 */
	if (cfq_rq_close(cfqd, cfqq, rq))
		return true;

	return false;
}

/*
 * cfqq preempts the active queue. if we allowed preempt with no slice left,
 * let it have half of its nominal slice.
 */
static void cfq_preempt_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq)
{
	enum wl_type_t old_type = cfqq_type(cfqd->active_queue);

	cfq_log_cfqq(cfqd, cfqq, "preempt");
	cfq_slice_expired(cfqd, 1);

	/*
	 * workload type is changed, don't save slice, otherwise preempt
	 * doesn't happen
	 */
	if (old_type != cfqq_type(cfqq))
		cfqq->cfqg->saved_wl_slice = 0;

	/*
	 * Put the new queue at the front of the current list,
	 * so we know that it will be selected next.
	 */
	BUG_ON(!cfq_cfqq_on_rr(cfqq));

	cfq_service_tree_add(cfqd, cfqq, 1);

	cfqq->slice_end = 0;
	cfq_mark_cfqq_slice_new(cfqq);
}

/*
 * Called when a new fs request (rq) is added (to cfqq). Check if there's
 * something we should do about it
 */
static void
cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq,
		struct request *rq)
{
	struct cfq_io_cq *cic = RQ_CIC(rq);

	cfqd->rq_queued++;
	if (rq->cmd_flags & REQ_PRIO)
		cfqq->prio_pending++;

	cfq_update_io_thinktime(cfqd, cfqq, cic);
	cfq_update_io_seektime(cfqd, cfqq, rq);
	cfq_update_idle_window(cfqd, cfqq, cic);

	cfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq);

	if (cfqq == cfqd->active_queue) {
		/*
		 * Remember that we saw a request from this process, but
		 * don't start queuing just yet. Otherwise we risk seeing lots
		 * of tiny requests, because we disrupt the normal plugging
		 * and merging. If the request is already larger than a single
		 * page, let it rip immediately. For that case we assume that
		 * merging is already done. Ditto for a busy system that
		 * has other work pending, don't risk delaying until the
		 * idle timer unplug to continue working.
		 */
		if (cfq_cfqq_wait_request(cfqq)) {
			if (blk_rq_bytes(rq) > PAGE_SIZE ||
			    cfqd->busy_queues > 1) {
				cfq_del_timer(cfqd, cfqq);
				cfq_clear_cfqq_wait_request(cfqq);
				__blk_run_queue(cfqd->queue);
			} else {
				cfqg_stats_update_idle_time(cfqq->cfqg);
				cfq_mark_cfqq_must_dispatch(cfqq);
			}
		}
	} else if (cfq_should_preempt(cfqd, cfqq, rq)) {
		/*
		 * not the active queue - expire current slice if it is
		 * idle and has expired its mean thinktime or this new queue
		 * has some old slice time left and is of higher priority or
		 * this new queue is RT and the current one is BE
		 */
		cfq_preempt_queue(cfqd, cfqq);
		__blk_run_queue(cfqd->queue);
	}
}

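/*
 * Add a new request: stamp its FIFO deadline, queue it on the cfqq's FIFO
 * and sort tree, update group stats, then let cfq_rq_enqueued() decide
 * whether to kick the queue or preempt the active one.
 */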
static void cfq_insert_request(struct request_queue *q, struct request *rq)
{
	struct cfq_data *cfqd = q->elevator->elevator_data;
	struct cfq_queue *cfqq = RQ_CFQQ(rq);

	cfq_log_cfqq(cfqd, cfqq, "insert_request");
	cfq_init_prio_data(cfqq, RQ_CIC(rq));

	rq->fifo_time = ktime_get_ns() + cfqd->cfq_fifo_expire[rq_is_sync(rq)];
	list_add_tail(&rq->queuelist, &cfqq->fifo);
	cfq_add_rq_rb(rq);
	cfqg_stats_update_io_add(RQ_CFQG(rq), cfqd->serving_group,
				 rq->cmd_flags);
	cfq_rq_enqueued(cfqd, cfqq, rq);
}

/*
 * Update hw_tag based on peak queue depth over 50 samples under
 * sufficient load.
 */
static void cfq_update_hw_tag(struct cfq_data *cfqd)
{
	struct cfq_queue *cfqq = cfqd->active_queue;

	if (cfqd->rq_in_driver > cfqd->hw_tag_est_depth)
		cfqd->hw_tag_est_depth = cfqd->rq_in_driver;

	if (cfqd->hw_tag == 1)
		return;

	if (cfqd->rq_queued <= CFQ_HW_QUEUE_MIN &&
	    cfqd->rq_in_driver <= CFQ_HW_QUEUE_MIN)
		return;

	/*
	 * If the active queue doesn't have enough requests and can idle,
	 * cfq might not dispatch sufficient requests to hardware. Don't
	 * zero hw_tag in this case.
	 */
	if (cfqq && cfq_cfqq_idle_window(cfqq) &&
	    cfqq->dispatched + cfqq->queued[0] + cfqq->queued[1] <
	    CFQ_HW_QUEUE_MIN && cfqd->rq_in_driver < CFQ_HW_QUEUE_MIN)
		return;

	if (cfqd->hw_tag_samples++ < 50)
		return;

	if (cfqd->hw_tag_est_depth >= CFQ_HW_QUEUE_MIN)
		cfqd->hw_tag = 1;
	else
		cfqd->hw_tag = 0;
}

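/*
 * Decide whether to keep the (now empty) queue busy-waiting for another
 * request instead of expiring it, based on remaining slice and think time.
 */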
static bool cfq_should_wait_busy(struct cfq_data *cfqd, struct cfq_queue *cfqq)
{
	struct cfq_io_cq *cic = cfqd->active_cic;
	u64 now = ktime_get_ns();

	/* If the queue already has requests, don't wait */
	if (!RB_EMPTY_ROOT(&cfqq->sort_list))
		return false;

	/* If there are other queues in the group, don't wait */
	if (cfqq->cfqg->nr_cfqq > 1)
		return false;

	/* the only queue in the group, but think time is big */
	if (cfq_io_thinktime_big(cfqd, &cfqq->cfqg->ttime, true))
		return false;

	if (cfq_slice_used(cfqq))
		return true;

	/* if slice left is less than think time, wait busy */
	if (cic && sample_valid(cic->ttime.ttime_samples)
	    && (cfqq->slice_end - now < cic->ttime.ttime_mean))
		return true;

	/*
	 * If think time is less than a jiffy, then ttime_mean=0 and the
	 * above will not be true. It might happen that the slice has not
	 * expired yet but will expire soon (4-5 ns) during select_queue().
	 * To cover the case where think time is less than a jiffy, mark the
	 * queue wait busy if only 1 jiffy is left in the slice.
	 */
	if (cfqq->slice_end - now <= jiffies_to_nsecs(1))
		return true;

	return false;
}

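/*
 * Completion path: update driver/dispatch counters and think-time stats,
 * and for the active queue decide between busy-waiting, expiring the
 * slice, or arming the idle timer.
 */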
static void cfq_completed_request(struct request_queue *q, struct request *rq)
{
	struct cfq_queue *cfqq = RQ_CFQQ(rq);
	struct cfq_data *cfqd = cfqq->cfqd;
	const int sync = rq_is_sync(rq);
	u64 now = ktime_get_ns();

	cfq_log_cfqq(cfqd, cfqq, "complete rqnoidle %d", req_noidle(rq));

	cfq_update_hw_tag(cfqd);

	WARN_ON(!cfqd->rq_in_driver);
	WARN_ON(!cfqq->dispatched);
	cfqd->rq_in_driver--;
	cfqq->dispatched--;
	(RQ_CFQG(rq))->dispatched--;
	cfqg_stats_update_completion(cfqq->cfqg, rq_start_time_ns(rq),
				     rq_io_start_time_ns(rq), rq->cmd_flags);

	cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]--;

	if (sync) {
		struct cfq_rb_root *st;

		RQ_CIC(rq)->ttime.last_end_request = now;

		if (cfq_cfqq_on_rr(cfqq))
			st = cfqq->service_tree;
		else
			st = st_for(cfqq->cfqg, cfqq_class(cfqq),
					cfqq_type(cfqq));

		st->ttime.last_end_request = now;
		/*
		 * We have to do this check in jiffies since start_time is in
		 * jiffies and it is not trivial to convert to ns. If
		 * cfq_fifo_expire[1] ever comes close to 1 jiffy, this test
		 * will become problematic but so far we are fine (the default
		 * is 128 ms).
		 */
		if (!time_after(rq->start_time +
				  nsecs_to_jiffies(cfqd->cfq_fifo_expire[1]),
				jiffies))
			cfqd->last_delayed_sync = now;
	}

#ifdef CONFIG_CFQ_GROUP_IOSCHED
	cfqq->cfqg->ttime.last_end_request = now;
#endif

	/*
	 * If this is the active queue, check if it needs to be expired,
	 * or if we want to idle in case it has no pending requests.
	 */
	if (cfqd->active_queue == cfqq) {
		const bool cfqq_empty = RB_EMPTY_ROOT(&cfqq->sort_list);

		if (cfq_cfqq_slice_new(cfqq)) {
			cfq_set_prio_slice(cfqd, cfqq);
			cfq_clear_cfqq_slice_new(cfqq);
		}

		/*
		 * Should we wait for the next request to come in before we
		 * expire the queue?
		 */
		if (cfq_should_wait_busy(cfqd, cfqq)) {
			u64 extend_sl = cfqd->cfq_slice_idle;
			if (!cfqd->cfq_slice_idle)
				extend_sl = cfqd->cfq_group_idle;
			cfqq->slice_end = now + extend_sl;
			cfq_mark_cfqq_wait_busy(cfqq);
			cfq_log_cfqq(cfqd, cfqq, "will busy wait");
		}

		/*
		 * Idling is not enabled on:
		 * - expired queues
		 * - idle-priority queues
		 * - async queues
		 * - queues with still some requests queued
		 * - when there is a close cooperator
		 */
		if (cfq_slice_used(cfqq) || cfq_class_idle(cfqq))
			cfq_slice_expired(cfqd, 1);
		else if (sync && cfqq_empty &&
			 !cfq_close_cooperator(cfqd, cfqq)) {
			cfq_arm_slice_timer(cfqd);
		}
	}

	if (!cfqd->rq_in_driver)
		cfq_schedule_dispatch(cfqd);
}

static void cfqq_boost_on_prio(struct cfq_queue *cfqq, unsigned int op)
{
	/*
	 * If REQ_PRIO is set, boost class and prio level, if it's below
	 * BE/NORM. If prio is not set, restore the potentially boosted
	 * class/prio level.
	 */
	if (!(op & REQ_PRIO)) {
		cfqq->ioprio_class = cfqq->org_ioprio_class;
		cfqq->ioprio = cfqq->org_ioprio;
	} else {
		if (cfq_class_idle(cfqq))
			cfqq->ioprio_class = IOPRIO_CLASS_BE;
		if (cfqq->ioprio > IOPRIO_NORM)
			cfqq->ioprio = IOPRIO_NORM;
	}
}

static inline int __cfq_may_queue(struct cfq_queue *cfqq)
{
	if (cfq_cfqq_wait_request(cfqq) && !cfq_cfqq_must_alloc_slice(cfqq)) {
		cfq_mark_cfqq_must_alloc_slice(cfqq);
		return ELV_MQUEUE_MUST;
	}

	return ELV_MQUEUE_MAY;
}

static int cfq_may_queue(struct request_queue *q, unsigned int op)
{
	struct cfq_data *cfqd = q->elevator->elevator_data;
	struct task_struct *tsk = current;
	struct cfq_io_cq *cic;
	struct cfq_queue *cfqq;

	/*
	 * don't force setup of a queue from here, as a call to may_queue
	 * does not necessarily imply that a request actually will be queued.
	 * so just lookup a possibly existing queue, or return 'may queue'
	 * if that fails
	 */
	cic = cfq_cic_lookup(cfqd, tsk->io_context);
	if (!cic)
		return ELV_MQUEUE_MAY;

	cfqq = cic_to_cfqq(cic, op_is_sync(op));
	if (cfqq) {
		cfq_init_prio_data(cfqq, cic);
		cfqq_boost_on_prio(cfqq, op);

		return __cfq_may_queue(cfqq);
	}

	return ELV_MQUEUE_MAY;
}

/*
 * queue lock held here
 */
static void cfq_put_request(struct request *rq)
{
	struct cfq_queue *cfqq = RQ_CFQQ(rq);

	if (cfqq) {
		const int rw = rq_data_dir(rq);

		BUG_ON(!cfqq->allocated[rw]);
		cfqq->allocated[rw]--;

		/* Put down rq reference on cfqg */
		cfqg_put(RQ_CFQG(rq));
		rq->elv.priv[0] = NULL;
		rq->elv.priv[1] = NULL;

		cfq_put_queue(cfqq);
	}
}

static struct cfq_queue *
cfq_merge_cfqqs(struct cfq_data *cfqd, struct cfq_io_cq *cic,
		struct cfq_queue *cfqq)
{
	cfq_log_cfqq(cfqd, cfqq, "merging with queue %p", cfqq->new_cfqq);
	cic_set_cfqq(cic, cfqq->new_cfqq, 1);
	cfq_mark_cfqq_coop(cfqq->new_cfqq);
	cfq_put_queue(cfqq);
	return cic_to_cfqq(cic, 1);
}

/*
 * Returns NULL if a new cfqq should be allocated, or the old cfqq if this
 * was the last process referring to said cfqq.
 */
static struct cfq_queue *
split_cfqq(struct cfq_io_cq *cic, struct cfq_queue *cfqq)
{
	if (cfqq_process_refs(cfqq) == 1) {
		cfqq->pid = current->pid;
		cfq_clear_cfqq_coop(cfqq);
		cfq_clear_cfqq_split_coop(cfqq);
		return cfqq;
	}

	cic_set_cfqq(cic, NULL, 1);

	cfq_put_cooperator(cfqq);

	cfq_put_queue(cfqq);
	return NULL;
}

/*
 * Allocate cfq data structures associated with this request.
 */
static int
cfq_set_request(struct request_queue *q, struct request *rq, struct bio *bio,
		gfp_t gfp_mask)
{
	struct cfq_data *cfqd = q->elevator->elevator_data;
	struct cfq_io_cq *cic = icq_to_cic(rq->elv.icq);
	const int rw = rq_data_dir(rq);
	const bool is_sync = rq_is_sync(rq);
	struct cfq_queue *cfqq;

	spin_lock_irq(q->queue_lock);

	check_ioprio_changed(cic, bio);
	check_blkcg_changed(cic, bio);
new_queue:
	cfqq = cic_to_cfqq(cic, is_sync);
	if (!cfqq || cfqq == &cfqd->oom_cfqq) {
		if (cfqq)
			cfq_put_queue(cfqq);
		cfqq = cfq_get_queue(cfqd, is_sync, cic, bio);
		cic_set_cfqq(cic, cfqq, is_sync);
	} else {
		/*
		 * If the queue was seeky for too long, break it apart.
		 */
		if (cfq_cfqq_coop(cfqq) && cfq_cfqq_split_coop(cfqq)) {
			cfq_log_cfqq(cfqd, cfqq, "breaking apart cfqq");
			cfqq = split_cfqq(cic, cfqq);
			if (!cfqq)
				goto new_queue;
		}

		/*
		 * Check to see if this queue is scheduled to merge with
		 * another, closely cooperating queue.  The merging of
		 * queues happens here as it must be done in process context.
		 * The reference on new_cfqq was taken in merge_cfqqs.
		 */
		if (cfqq->new_cfqq)
			cfqq = cfq_merge_cfqqs(cfqd, cic, cfqq);
	}

	cfqq->allocated[rw]++;

	cfqq->ref++;
	cfqg_get(cfqq->cfqg);
	rq->elv.priv[0] = cfqq;
	rq->elv.priv[1] = cfqq->cfqg;
	spin_unlock_irq(q->queue_lock);

	return 0;
}

static void cfq_kick_queue(struct work_struct *work)
{
	struct cfq_data *cfqd =
		container_of(work, struct cfq_data, unplug_work);
	struct request_queue *q = cfqd->queue;

	spin_lock_irq(q->queue_lock);
	__blk_run_queue(cfqd->queue);
	spin_unlock_irq(q->queue_lock);
}

/*
 * Timer running if the active_queue is currently idling inside its time slice
 */
static enum hrtimer_restart cfq_idle_slice_timer(struct hrtimer *timer)
{
	struct cfq_data *cfqd = container_of(timer, struct cfq_data,
					     idle_slice_timer);
	struct cfq_queue *cfqq;
	unsigned long flags;
	int timed_out = 1;

	cfq_log(cfqd, "idle timer fired");

	spin_lock_irqsave(cfqd->queue->queue_lock, flags);

	cfqq = cfqd->active_queue;
	if (cfqq) {
		timed_out = 0;

		/*
		 * We saw a request before the queue expired, let it through
		 */
		if (cfq_cfqq_must_dispatch(cfqq))
			goto out_kick;

		/*
		 * expired
		 */
		if (cfq_slice_used(cfqq))
			goto expire;

		/*
		 * only expire and reinvoke request handler, if there are
		 * other queues with pending requests
		 */
		if (!cfqd->busy_queues)
			goto out_cont;

		/*
		 * not expired and it has a request pending, let it dispatch
		 */
		if (!RB_EMPTY_ROOT(&cfqq->sort_list))
			goto out_kick;

		/*
		 * Queue depth flag is reset only when the idle didn't succeed
		 */
		cfq_clear_cfqq_deep(cfqq);
	}
expire:
	cfq_slice_expired(cfqd, timed_out);
out_kick:
	cfq_schedule_dispatch(cfqd);
out_cont:
	spin_unlock_irqrestore(cfqd->queue->queue_lock, flags);
	return HRTIMER_NORESTART;
}

static void cfq_shutdown_timer_wq(struct cfq_data *cfqd)
{
	hrtimer_cancel(&cfqd->idle_slice_timer);
	cancel_work_sync(&cfqd->unplug_work);
}

static void cfq_exit_queue(struct elevator_queue *e)
{
	struct cfq_data *cfqd = e->elevator_data;
	struct request_queue *q = cfqd->queue;

	cfq_shutdown_timer_wq(cfqd);

	spin_lock_irq(q->queue_lock);

	if (cfqd->active_queue)
		__cfq_slice_expired(cfqd, cfqd->active_queue, 0);

	spin_unlock_irq(q->queue_lock);

	cfq_shutdown_timer_wq(cfqd);

#ifdef CONFIG_CFQ_GROUP_IOSCHED
	blkcg_deactivate_policy(q, &blkcg_policy_cfq);
#else
	kfree(cfqd->root_group);
#endif
	kfree(cfqd);
}

static int cfq_init_queue(struct request_queue *q, struct elevator_type *e)
{
	struct cfq_data *cfqd;
	struct blkcg_gq *blkg __maybe_unused;
	int i, ret;
	struct elevator_queue *eq;

	eq = elevator_alloc(q, e);
	if (!eq)
		return -ENOMEM;

	cfqd = kzalloc_node(sizeof(*cfqd), GFP_KERNEL, q->node);
	if (!cfqd) {
		kobject_put(&eq->kobj);
		return -ENOMEM;
	}
	eq->elevator_data = cfqd;

	cfqd->queue = q;
	spin_lock_irq(q->queue_lock);
	q->elevator = eq;
	spin_unlock_irq(q->queue_lock);

	/* Init root service tree */
	cfqd->grp_service_tree = CFQ_RB_ROOT;

	/* Init root group and prefer root group over other groups by default */
#ifdef CONFIG_CFQ_GROUP_IOSCHED
	ret = blkcg_activate_policy(q, &blkcg_policy_cfq);
	if (ret)
		goto out_free;

	cfqd->root_group = blkg_to_cfqg(q->root_blkg);
#else
	ret = -ENOMEM;
	cfqd->root_group = kzalloc_node(sizeof(*cfqd->root_group),
					GFP_KERNEL, cfqd->queue->node);
	if (!cfqd->root_group)
		goto out_free;

	cfq_init_cfqg_base(cfqd->root_group);
	cfqd->root_group->weight = 2 * CFQ_WEIGHT_LEGACY_DFL;
	cfqd->root_group->leaf_weight = 2 * CFQ_WEIGHT_LEGACY_DFL;
#endif

	/*
	 * Not strictly needed (since RB_ROOT just clears the node and we
	 * zeroed cfqd on alloc), but better be safe in case someone decides
	 * to add magic to the rb code
	 */
	for (i = 0; i < CFQ_PRIO_LISTS; i++)
		cfqd->prio_trees[i] = RB_ROOT;

	/*
	 * Our fallback cfqq if cfq_get_queue() runs into OOM issues.
	 * Grab a permanent reference to it, so that the normal code flow
	 * will not attempt to free it.  oom_cfqq is linked to root_group
	 * but shouldn't hold a reference as it'll never be unlinked.  Lose
	 * the reference from linking right away.
	 */
	cfq_init_cfqq(cfqd, &cfqd->oom_cfqq, 1, 0);
	cfqd->oom_cfqq.ref++;

	spin_lock_irq(q->queue_lock);
	cfq_link_cfqq_cfqg(&cfqd->oom_cfqq, cfqd->root_group);
	cfqg_put(cfqd->root_group);
	spin_unlock_irq(q->queue_lock);

	hrtimer_init(&cfqd->idle_slice_timer, CLOCK_MONOTONIC,
		     HRTIMER_MODE_REL);
	cfqd->idle_slice_timer.function = cfq_idle_slice_timer;

	INIT_WORK(&cfqd->unplug_work, cfq_kick_queue);

	cfqd->cfq_quantum = cfq_quantum;
	cfqd->cfq_fifo_expire[0] = cfq_fifo_expire[0];
	cfqd->cfq_fifo_expire[1] = cfq_fifo_expire[1];
	cfqd->cfq_back_max = cfq_back_max;
	cfqd->cfq_back_penalty = cfq_back_penalty;
	cfqd->cfq_slice[0] = cfq_slice_async;
	cfqd->cfq_slice[1] = cfq_slice_sync;
	cfqd->cfq_target_latency = cfq_target_latency;
	cfqd->cfq_slice_async_rq = cfq_slice_async_rq;
	cfqd->cfq_slice_idle = cfq_slice_idle;
	cfqd->cfq_group_idle = cfq_group_idle;
	cfqd->cfq_latency = 1;
	cfqd->hw_tag = -1;
	/*
	 * we optimistically start assuming sync ops weren't delayed in last
	 * second, in order to have larger depth for async operations.
	 */
	cfqd->last_delayed_sync = ktime_get_ns() - NSEC_PER_SEC;
	return 0;

out_free:
	kfree(cfqd);
	kobject_put(&eq->kobj);
	return ret;
}

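/*
 * Called once the elevator has been registered on a queue: non-rotational
 * devices default to IOPS mode (no idling), and the writeback throttling
 * default is disabled via wbt_disable_default().
 */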
static void cfq_registered_queue(struct request_queue *q)
{
	struct elevator_queue *e = q->elevator;
	struct cfq_data *cfqd = e->elevator_data;

	/*
	 * Default to IOPS mode with no idling for SSDs
	 */
	if (blk_queue_nonrot(q))
		cfqd->cfq_slice_idle = 0;
	wbt_disable_default(q);
}

/*
 * sysfs parts below -->
 */
static ssize_t
cfq_var_show(unsigned int var, char *page)
{
	return sprintf(page, "%u\n", var);
}

static void
cfq_var_store(unsigned int *var, const char *page)
{
	char *p = (char *) page;

	*var = simple_strtoul(p, &p, 10);
}

#define SHOW_FUNCTION(__FUNC, __VAR, __CONV)				\
static ssize_t __FUNC(struct elevator_queue *e, char *page)		\
{									\
	struct cfq_data *cfqd = e->elevator_data;			\
	u64 __data = __VAR;						\
	if (__CONV)							\
		__data = div_u64(__data, NSEC_PER_MSEC);		\
	return cfq_var_show(__data, (page));				\
}
SHOW_FUNCTION(cfq_quantum_show, cfqd->cfq_quantum, 0);
SHOW_FUNCTION(cfq_fifo_expire_sync_show, cfqd->cfq_fifo_expire[1], 1);
SHOW_FUNCTION(cfq_fifo_expire_async_show, cfqd->cfq_fifo_expire[0], 1);
SHOW_FUNCTION(cfq_back_seek_max_show, cfqd->cfq_back_max, 0);
SHOW_FUNCTION(cfq_back_seek_penalty_show, cfqd->cfq_back_penalty, 0);
SHOW_FUNCTION(cfq_slice_idle_show, cfqd->cfq_slice_idle, 1);
SHOW_FUNCTION(cfq_group_idle_show, cfqd->cfq_group_idle, 1);
SHOW_FUNCTION(cfq_slice_sync_show, cfqd->cfq_slice[1], 1);
SHOW_FUNCTION(cfq_slice_async_show, cfqd->cfq_slice[0], 1);
SHOW_FUNCTION(cfq_slice_async_rq_show, cfqd->cfq_slice_async_rq, 0);
SHOW_FUNCTION(cfq_low_latency_show, cfqd->cfq_latency, 0);
SHOW_FUNCTION(cfq_target_latency_show, cfqd->cfq_target_latency, 1);
#undef SHOW_FUNCTION

#define USEC_SHOW_FUNCTION(__FUNC, __VAR)				\
static ssize_t __FUNC(struct elevator_queue *e, char *page)		\
{									\
	struct cfq_data *cfqd = e->elevator_data;			\
	u64 __data = __VAR;						\
	__data = div_u64(__data, NSEC_PER_USEC);			\
	return cfq_var_show(__data, (page));				\
}
USEC_SHOW_FUNCTION(cfq_slice_idle_us_show, cfqd->cfq_slice_idle);
USEC_SHOW_FUNCTION(cfq_group_idle_us_show, cfqd->cfq_group_idle);
USEC_SHOW_FUNCTION(cfq_slice_sync_us_show, cfqd->cfq_slice[1]);
USEC_SHOW_FUNCTION(cfq_slice_async_us_show, cfqd->cfq_slice[0]);
USEC_SHOW_FUNCTION(cfq_target_latency_us_show, cfqd->cfq_target_latency);
#undef USEC_SHOW_FUNCTION

#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV)			\
static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count)	\
{									\
	struct cfq_data *cfqd = e->elevator_data;			\
	unsigned int __data;						\
	cfq_var_store(&__data, (page));					\
	if (__data < (MIN))						\
		__data = (MIN);						\
	else if (__data > (MAX))					\
		__data = (MAX);						\
	if (__CONV)							\
		*(__PTR) = (u64)__data * NSEC_PER_MSEC;			\
	else								\
		*(__PTR) = __data;					\
	return count;							\
}
STORE_FUNCTION(cfq_quantum_store, &cfqd->cfq_quantum, 1, UINT_MAX, 0);
STORE_FUNCTION(cfq_fifo_expire_sync_store, &cfqd->cfq_fifo_expire[1], 1,
		UINT_MAX, 1);
STORE_FUNCTION(cfq_fifo_expire_async_store, &cfqd->cfq_fifo_expire[0], 1,
		UINT_MAX, 1);
STORE_FUNCTION(cfq_back_seek_max_store, &cfqd->cfq_back_max, 0, UINT_MAX, 0);
STORE_FUNCTION(cfq_back_seek_penalty_store, &cfqd->cfq_back_penalty, 1,
		UINT_MAX, 0);
STORE_FUNCTION(cfq_slice_idle_store, &cfqd->cfq_slice_idle, 0, UINT_MAX, 1);
STORE_FUNCTION(cfq_group_idle_store, &cfqd->cfq_group_idle, 0, UINT_MAX, 1);
STORE_FUNCTION(cfq_slice_sync_store, &cfqd->cfq_slice[1], 1, UINT_MAX, 1);
STORE_FUNCTION(cfq_slice_async_store, &cfqd->cfq_slice[0], 1, UINT_MAX, 1);
STORE_FUNCTION(cfq_slice_async_rq_store, &cfqd->cfq_slice_async_rq, 1,
		UINT_MAX, 0);
STORE_FUNCTION(cfq_low_latency_store, &cfqd->cfq_latency, 0, 1, 0);
STORE_FUNCTION(cfq_target_latency_store, &cfqd->cfq_target_latency, 1, UINT_MAX, 1);
#undef STORE_FUNCTION

#define USEC_STORE_FUNCTION(__FUNC, __PTR, MIN, MAX)			\
static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count)	\
{									\
	struct cfq_data *cfqd = e->elevator_data;			\
	unsigned int __data;						\
	cfq_var_store(&__data, (page));					\
	if (__data < (MIN))						\
		__data = (MIN);						\
	else if (__data > (MAX))					\
		__data = (MAX);						\
	*(__PTR) = (u64)__data * NSEC_PER_USEC;				\
	return count;							\
}
USEC_STORE_FUNCTION(cfq_slice_idle_us_store, &cfqd->cfq_slice_idle, 0, UINT_MAX);
USEC_STORE_FUNCTION(cfq_group_idle_us_store, &cfqd->cfq_group_idle, 0, UINT_MAX);
USEC_STORE_FUNCTION(cfq_slice_sync_us_store, &cfqd->cfq_slice[1], 1, UINT_MAX);
USEC_STORE_FUNCTION(cfq_slice_async_us_store, &cfqd->cfq_slice[0], 1, UINT_MAX);
USEC_STORE_FUNCTION(cfq_target_latency_us_store, &cfqd->cfq_target_latency, 1, UINT_MAX);
#undef USEC_STORE_FUNCTION

#define CFQ_ATTR(name) \
	__ATTR(name, S_IRUGO|S_IWUSR, cfq_##name##_show, cfq_##name##_store)

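/*
 * The tunables below are exported through the elevator's sysfs directory
 * (typically /sys/block/<dev>/queue/iosched/ while cfq is the active
 * scheduler), e.g. quantum, slice_idle and their *_us variants.
 */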
static struct elv_fs_entry cfq_attrs[] = {
	CFQ_ATTR(quantum),
	CFQ_ATTR(fifo_expire_sync),
	CFQ_ATTR(fifo_expire_async),
	CFQ_ATTR(back_seek_max),
	CFQ_ATTR(back_seek_penalty),
	CFQ_ATTR(slice_sync),
	CFQ_ATTR(slice_sync_us),
	CFQ_ATTR(slice_async),
	CFQ_ATTR(slice_async_us),
	CFQ_ATTR(slice_async_rq),
	CFQ_ATTR(slice_idle),
	CFQ_ATTR(slice_idle_us),
	CFQ_ATTR(group_idle),
	CFQ_ATTR(group_idle_us),
	CFQ_ATTR(low_latency),
	CFQ_ATTR(target_latency),
	CFQ_ATTR(target_latency_us),
	__ATTR_NULL
};

static struct elevator_type iosched_cfq = {
	.ops.sq = {
		.elevator_merge_fn =		cfq_merge,
		.elevator_merged_fn =		cfq_merged_request,
		.elevator_merge_req_fn =	cfq_merged_requests,
		.elevator_allow_bio_merge_fn =	cfq_allow_bio_merge,
		.elevator_allow_rq_merge_fn =	cfq_allow_rq_merge,
		.elevator_bio_merged_fn =	cfq_bio_merged,
		.elevator_dispatch_fn =		cfq_dispatch_requests,
		.elevator_add_req_fn =		cfq_insert_request,
		.elevator_activate_req_fn =	cfq_activate_request,
		.elevator_deactivate_req_fn =	cfq_deactivate_request,
		.elevator_completed_req_fn =	cfq_completed_request,
		.elevator_former_req_fn =	elv_rb_former_request,
		.elevator_latter_req_fn =	elv_rb_latter_request,
		.elevator_init_icq_fn =		cfq_init_icq,
		.elevator_exit_icq_fn =		cfq_exit_icq,
		.elevator_set_req_fn =		cfq_set_request,
		.elevator_put_req_fn =		cfq_put_request,
		.elevator_may_queue_fn =	cfq_may_queue,
		.elevator_init_fn =		cfq_init_queue,
		.elevator_exit_fn =		cfq_exit_queue,
		.elevator_registered_fn =	cfq_registered_queue,
	},
	.icq_size	=	sizeof(struct cfq_io_cq),
	.icq_align	=	__alignof__(struct cfq_io_cq),
	.elevator_attrs =	cfq_attrs,
	.elevator_name	=	"cfq",
	.elevator_owner =	THIS_MODULE,
};

#ifdef CONFIG_CFQ_GROUP_IOSCHED
static struct blkcg_policy blkcg_policy_cfq = {
	.dfl_cftypes		= cfq_blkcg_files,
	.legacy_cftypes		= cfq_blkcg_legacy_files,

	.cpd_alloc_fn		= cfq_cpd_alloc,
	.cpd_init_fn		= cfq_cpd_init,
	.cpd_free_fn		= cfq_cpd_free,
	.cpd_bind_fn		= cfq_cpd_bind,

	.pd_alloc_fn		= cfq_pd_alloc,
	.pd_init_fn		= cfq_pd_init,
	.pd_offline_fn		= cfq_pd_offline,
	.pd_free_fn		= cfq_pd_free,
	.pd_reset_stats_fn	= cfq_pd_reset_stats,
};
#endif

static int __init cfq_init(void)
{
	int ret;

#ifdef CONFIG_CFQ_GROUP_IOSCHED
	ret = blkcg_policy_register(&blkcg_policy_cfq);
	if (ret)
		return ret;
#else
	cfq_group_idle = 0;
#endif

	ret = -ENOMEM;
	cfq_pool = KMEM_CACHE(cfq_queue, 0);
	if (!cfq_pool)
		goto err_pol_unreg;

	ret = elv_register(&iosched_cfq);
	if (ret)
		goto err_free_pool;

	return 0;

err_free_pool:
	kmem_cache_destroy(cfq_pool);
err_pol_unreg:
#ifdef CONFIG_CFQ_GROUP_IOSCHED
	blkcg_policy_unregister(&blkcg_policy_cfq);
#endif
	return ret;
}

static void __exit cfq_exit(void)
{
#ifdef CONFIG_CFQ_GROUP_IOSCHED
	blkcg_policy_unregister(&blkcg_policy_cfq);
#endif
	elv_unregister(&iosched_cfq);
	kmem_cache_destroy(cfq_pool);
}

module_init(cfq_init);
module_exit(cfq_exit);

MODULE_AUTHOR("Jens Axboe");
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Completely Fair Queueing IO scheduler");