/*
 *  Block device elevator/IO-scheduler.
 *
 *  Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE
 *
 * 30042000 Jens Axboe <axboe@kernel.dk> :
 *
 * Split the elevator a bit so that it is possible to choose a different
 * one or even write a new "plug in". There are three pieces:
 * - elevator_fn, inserts a new request in the queue list
 * - elevator_merge_fn, decides whether a new buffer can be merged with
 *   an existing request
 * - elevator_dequeue_fn, called when a request is taken off the active list
 *
 * 20082000 Dave Jones <davej@suse.de> :
 * Removed tests for max-bomb-segments, which was breaking elvtune
 *  when run without -bN
 *
 * Jens:
 * - Rework again to work with bio instead of buffer_heads
 * - loose bi_dev comparisons, partition handling is right now
 * - completely modularize elevator setup and teardown
 *
 */
#include <linux/kernel.h>
#include <linux/fs.h>
#include <linux/blkdev.h>
#include <linux/elevator.h>
#include <linux/bio.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/compiler.h>
#include <linux/blktrace_api.h>
#include <linux/hash.h>
#include <linux/uaccess.h>
#include <linux/pm_runtime.h>
#include <linux/blk-cgroup.h>

#include <trace/events/block.h>

#include "blk.h"
#include "blk-mq-sched.h"
#include "blk-wbt.h"

static DEFINE_SPINLOCK(elv_list_lock);
static LIST_HEAD(elv_list);

/*
 * Merge hash stuff.
 */
#define rq_hash_key(rq)		(blk_rq_pos(rq) + blk_rq_sectors(rq))

/*
 * Query io scheduler to see if the current process issuing bio may be
 * merged with rq.
 */
static int elv_iosched_allow_bio_merge(struct request *rq, struct bio *bio)
{
	struct request_queue *q = rq->q;
	struct elevator_queue *e = q->elevator;

	if (e->uses_mq && e->type->ops.mq.allow_merge)
		return e->type->ops.mq.allow_merge(q, rq, bio);
	else if (!e->uses_mq && e->type->ops.sq.elevator_allow_bio_merge_fn)
		return e->type->ops.sq.elevator_allow_bio_merge_fn(q, rq, bio);

	return 1;
}

/*
 * can we safely merge with this request?
 */
bool elv_bio_merge_ok(struct request *rq, struct bio *bio)
{
	if (!blk_rq_merge_ok(rq, bio))
		return false;

	if (!elv_iosched_allow_bio_merge(rq, bio))
		return false;

	return true;
}
EXPORT_SYMBOL(elv_bio_merge_ok);

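/*
 * Match an elevator type against a name: either the canonical
 * elevator_name or, if set, the elevator_alias is accepted.
 */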
static bool elevator_match(const struct elevator_type *e, const char *name)
{
	if (!strcmp(e->elevator_name, name))
		return true;
	if (e->elevator_alias && !strcmp(e->elevator_alias, name))
		return true;

	return false;
}

/*
 * Return the scheduler with name 'name' and with matching 'mq' capability.
 */
static struct elevator_type *elevator_find(const char *name, bool mq)
{
	struct elevator_type *e;

	list_for_each_entry(e, &elv_list, list) {
		if (elevator_match(e, name) && (mq == e->uses_mq))
			return e;
	}

	return NULL;
}

static void elevator_put(struct elevator_type *e)
{
	module_put(e->elevator_owner);
}

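/*
 * Find an elevator type by name and take a reference on the module that
 * provides it.  If it is not registered yet and @try_loading is set, try
 * to load the "<name>-iosched" module and look it up again.
 */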
static struct elevator_type *elevator_get(struct request_queue *q,
					  const char *name, bool try_loading)
{
	struct elevator_type *e;

	spin_lock(&elv_list_lock);

	e = elevator_find(name, q->mq_ops != NULL);
	if (!e && try_loading) {
		spin_unlock(&elv_list_lock);
		request_module("%s-iosched", name);
		spin_lock(&elv_list_lock);
		e = elevator_find(name, q->mq_ops != NULL);
	}

	if (e && !try_module_get(e->elevator_owner))
		e = NULL;

	spin_unlock(&elv_list_lock);
	return e;
}

static char chosen_elevator[ELV_NAME_MAX];

static int __init elevator_setup(char *str)
{
	/*
	 * Be backwards-compatible with previous kernels, so users
	 * won't get the wrong elevator.
	 */
	strncpy(chosen_elevator, str, sizeof(chosen_elevator) - 1);
	return 1;
}

__setup("elevator=", elevator_setup);

/* called during boot to load the elevator chosen by the elevator param */
void __init load_default_elevator_module(void)
{
	struct elevator_type *e;

	if (!chosen_elevator[0])
		return;

	/*
	 * Boot parameter is deprecated, we haven't supported that for MQ.
	 * Only look for non-mq schedulers from here.
	 */
	spin_lock(&elv_list_lock);
	e = elevator_find(chosen_elevator, false);
	spin_unlock(&elv_list_lock);

	if (!e)
		request_module("%s-iosched", chosen_elevator);
}

static struct kobj_type elv_ktype;

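/*
 * Allocate and minimally initialize an elevator_queue bound to type @e.
 * The embedded kobject's release handler (elevator_release) later drops
 * the module reference the caller took on @e.
 */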
struct elevator_queue *elevator_alloc(struct request_queue *q,
				  struct elevator_type *e)
{
	struct elevator_queue *eq;

	eq = kzalloc_node(sizeof(*eq), GFP_KERNEL, q->node);
	if (unlikely(!eq))
		return NULL;

	eq->type = e;
	kobject_init(&eq->kobj, &elv_ktype);
	mutex_init(&eq->sysfs_lock);
	hash_init(eq->hash);
	eq->uses_mq = e->uses_mq;

	return eq;
}
EXPORT_SYMBOL(elevator_alloc);

static void elevator_release(struct kobject *kobj)
{
	struct elevator_queue *e;

	e = container_of(kobj, struct elevator_queue, kobj);
	elevator_put(e->type);
	kfree(e);
}

/*
 * Use the default elevator specified by config boot param for non-mq devices,
 * or by config option.  Don't try to load modules as we could be running off
 * async and request_module() isn't allowed from async.
 */
int elevator_init(struct request_queue *q)
{
	struct elevator_type *e = NULL;
	int err = 0;

	/*
	 * q->sysfs_lock must be held to provide mutual exclusion between
	 * elevator_switch() and here.
	 */
	mutex_lock(&q->sysfs_lock);
	if (unlikely(q->elevator))
		goto out_unlock;

	if (*chosen_elevator) {
		e = elevator_get(q, chosen_elevator, false);
		if (!e)
			printk(KERN_ERR "I/O scheduler %s not found\n",
							chosen_elevator);
	}

	if (!e)
		e = elevator_get(q, CONFIG_DEFAULT_IOSCHED, false);
	if (!e) {
		printk(KERN_ERR
			"Default I/O scheduler not found. Using noop.\n");
		e = elevator_get(q, "noop", false);
	}

	err = e->ops.sq.elevator_init_fn(q, e);
	if (err)
		elevator_put(e);
out_unlock:
	mutex_unlock(&q->sysfs_lock);
	return err;
}

void elevator_exit(struct request_queue *q, struct elevator_queue *e)
{
	mutex_lock(&e->sysfs_lock);
	if (e->uses_mq && e->type->ops.mq.exit_sched)
		blk_mq_exit_sched(q, e);
	else if (!e->uses_mq && e->type->ops.sq.elevator_exit_fn)
		e->type->ops.sq.elevator_exit_fn(e);
	mutex_unlock(&e->sysfs_lock);

	kobject_put(&e->kobj);
}

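/*
 * Merge hash handling: requests are hashed by the sector at which they end
 * (rq_hash_key()), so elv_rqhash_find() can quickly locate a request that a
 * bio could be back merged into.
 */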
static inline void __elv_rqhash_del(struct request *rq)
{
	hash_del(&rq->hash);
	rq->rq_flags &= ~RQF_HASHED;
}

void elv_rqhash_del(struct request_queue *q, struct request *rq)
{
	if (ELV_ON_HASH(rq))
		__elv_rqhash_del(rq);
}
EXPORT_SYMBOL_GPL(elv_rqhash_del);

void elv_rqhash_add(struct request_queue *q, struct request *rq)
{
	struct elevator_queue *e = q->elevator;

	BUG_ON(ELV_ON_HASH(rq));
	hash_add(e->hash, &rq->hash, rq_hash_key(rq));
	rq->rq_flags |= RQF_HASHED;
}
EXPORT_SYMBOL_GPL(elv_rqhash_add);

void elv_rqhash_reposition(struct request_queue *q, struct request *rq)
{
	__elv_rqhash_del(rq);
	elv_rqhash_add(q, rq);
}

struct request *elv_rqhash_find(struct request_queue *q, sector_t offset)
{
	struct elevator_queue *e = q->elevator;
	struct hlist_node *next;
	struct request *rq;

	hash_for_each_possible_safe(e->hash, rq, next, hash, offset) {
		BUG_ON(!ELV_ON_HASH(rq));

		if (unlikely(!rq_mergeable(rq))) {
			__elv_rqhash_del(rq);
			continue;
		}

		if (rq_hash_key(rq) == offset)
			return rq;
	}

	return NULL;
}

/*
 * RB-tree support functions for inserting/lookup/removal of requests
 * in a sorted RB tree.
 */
void elv_rb_add(struct rb_root *root, struct request *rq)
{
	struct rb_node **p = &root->rb_node;
	struct rb_node *parent = NULL;
	struct request *__rq;

	while (*p) {
		parent = *p;
		__rq = rb_entry(parent, struct request, rb_node);

		if (blk_rq_pos(rq) < blk_rq_pos(__rq))
			p = &(*p)->rb_left;
		else if (blk_rq_pos(rq) >= blk_rq_pos(__rq))
			p = &(*p)->rb_right;
	}

	rb_link_node(&rq->rb_node, parent, p);
	rb_insert_color(&rq->rb_node, root);
}
EXPORT_SYMBOL(elv_rb_add);

void elv_rb_del(struct rb_root *root, struct request *rq)
{
	BUG_ON(RB_EMPTY_NODE(&rq->rb_node));
	rb_erase(&rq->rb_node, root);
	RB_CLEAR_NODE(&rq->rb_node);
}
EXPORT_SYMBOL(elv_rb_del);

struct request *elv_rb_find(struct rb_root *root, sector_t sector)
{
	struct rb_node *n = root->rb_node;
	struct request *rq;

	while (n) {
		rq = rb_entry(n, struct request, rb_node);

		if (sector < blk_rq_pos(rq))
			n = n->rb_left;
		else if (sector > blk_rq_pos(rq))
			n = n->rb_right;
		else
			return rq;
	}

	return NULL;
}
EXPORT_SYMBOL(elv_rb_find);

/*
 * Insert rq into dispatch queue of q.  Queue lock must be held on
 * entry.  rq is sorted into the dispatch queue. To be used by
 * specific elevators.
 */
void elv_dispatch_sort(struct request_queue *q, struct request *rq)
{
	sector_t boundary;
	struct list_head *entry;

	if (q->last_merge == rq)
		q->last_merge = NULL;

	elv_rqhash_del(q, rq);

	q->nr_sorted--;

	boundary = q->end_sector;
	list_for_each_prev(entry, &q->queue_head) {
		struct request *pos = list_entry_rq(entry);

		if (req_op(rq) != req_op(pos))
			break;
		if (rq_data_dir(rq) != rq_data_dir(pos))
			break;
		if (pos->rq_flags & (RQF_STARTED | RQF_SOFTBARRIER))
			break;
		if (blk_rq_pos(rq) >= boundary) {
			if (blk_rq_pos(pos) < boundary)
				continue;
		} else {
			if (blk_rq_pos(pos) >= boundary)
				break;
		}
		if (blk_rq_pos(rq) >= blk_rq_pos(pos))
			break;
	}

	list_add(&rq->queuelist, entry);
}
EXPORT_SYMBOL(elv_dispatch_sort);

/*
 * Insert rq into dispatch queue of q.  Queue lock must be held on
 * entry.  rq is added to the back of the dispatch queue. To be used by
 * specific elevators.
 */
void elv_dispatch_add_tail(struct request_queue *q, struct request *rq)
{
	if (q->last_merge == rq)
		q->last_merge = NULL;

	elv_rqhash_del(q, rq);

	q->nr_sorted--;

	q->end_sector = rq_end_sector(rq);
	q->boundary_rq = rq;
	list_add_tail(&rq->queuelist, &q->queue_head);
}
EXPORT_SYMBOL(elv_dispatch_add_tail);

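/*
 * Find a request that @bio may be merged into.  Returns the type of merge
 * (or ELEVATOR_NO_MERGE) and, on success, points *req at the request to
 * merge with.
 */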
enum elv_merge elv_merge(struct request_queue *q, struct request **req,
		struct bio *bio)
{
	struct elevator_queue *e = q->elevator;
	struct request *__rq;

	/*
	 * Levels of merges:
	 * 	nomerges:  No merges at all attempted
	 * 	noxmerges: Only simple one-hit cache try
	 * 	merges:	   All merge tries attempted
	 */
	if (blk_queue_nomerges(q) || !bio_mergeable(bio))
		return ELEVATOR_NO_MERGE;

	/*
	 * First try one-hit cache.
	 */
	if (q->last_merge && elv_bio_merge_ok(q->last_merge, bio)) {
		enum elv_merge ret = blk_try_merge(q->last_merge, bio);

		if (ret != ELEVATOR_NO_MERGE) {
			*req = q->last_merge;
			return ret;
		}
	}

	if (blk_queue_noxmerges(q))
		return ELEVATOR_NO_MERGE;

	/*
	 * See if our hash lookup can find a potential backmerge.
	 */
	__rq = elv_rqhash_find(q, bio->bi_iter.bi_sector);
	if (__rq && elv_bio_merge_ok(__rq, bio)) {
		*req = __rq;
		return ELEVATOR_BACK_MERGE;
	}

	if (e->uses_mq && e->type->ops.mq.request_merge)
		return e->type->ops.mq.request_merge(q, req, bio);
	else if (!e->uses_mq && e->type->ops.sq.elevator_merge_fn)
		return e->type->ops.sq.elevator_merge_fn(q, req, bio);

	return ELEVATOR_NO_MERGE;
}

/*
 * Attempt to do an insertion back merge. Only check for the case where
 * we can append 'rq' to an existing request, so we can throw 'rq' away
 * afterwards.
 *
 * Returns true if we merged, false otherwise
 */
bool elv_attempt_insert_merge(struct request_queue *q, struct request *rq)
{
	struct request *__rq;
	bool ret;

	if (blk_queue_nomerges(q))
		return false;

	/*
	 * First try one-hit cache.
	 */
	if (q->last_merge && blk_attempt_req_merge(q, q->last_merge, rq))
		return true;

	if (blk_queue_noxmerges(q))
		return false;

	ret = false;
	/*
	 * See if our hash lookup can find a potential backmerge.
	 */
	while (1) {
		__rq = elv_rqhash_find(q, blk_rq_pos(rq));
		if (!__rq || !blk_attempt_req_merge(q, __rq, rq))
			break;

		/* The merged request could be merged with others, try again */
		ret = true;
		rq = __rq;
	}

	return ret;
}

void elv_merged_request(struct request_queue *q, struct request *rq,
		enum elv_merge type)
{
	struct elevator_queue *e = q->elevator;

	if (e->uses_mq && e->type->ops.mq.request_merged)
		e->type->ops.mq.request_merged(q, rq, type);
	else if (!e->uses_mq && e->type->ops.sq.elevator_merged_fn)
		e->type->ops.sq.elevator_merged_fn(q, rq, type);

	if (type == ELEVATOR_BACK_MERGE)
		elv_rqhash_reposition(q, rq);

	q->last_merge = rq;
}

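/*
 * Called when @next has been merged into @rq and is about to be freed:
 * reposition @rq in the merge hash and, if @next was sorted, drop it from
 * the hash and the sort count.
 */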
void elv_merge_requests(struct request_queue *q, struct request *rq,
			     struct request *next)
{
	struct elevator_queue *e = q->elevator;
	bool next_sorted = false;

	if (e->uses_mq && e->type->ops.mq.requests_merged)
		e->type->ops.mq.requests_merged(q, rq, next);
	else if (e->type->ops.sq.elevator_merge_req_fn) {
		next_sorted = (__force bool)(next->rq_flags & RQF_SORTED);
		if (next_sorted)
			e->type->ops.sq.elevator_merge_req_fn(q, rq, next);
	}

	elv_rqhash_reposition(q, rq);

	if (next_sorted) {
		elv_rqhash_del(q, next);
		q->nr_sorted--;
	}

	q->last_merge = rq;
}

void elv_bio_merged(struct request_queue *q, struct request *rq,
			struct bio *bio)
{
	struct elevator_queue *e = q->elevator;

	if (WARN_ON_ONCE(e->uses_mq))
		return;

	if (e->type->ops.sq.elevator_bio_merged_fn)
		e->type->ops.sq.elevator_bio_merged_fn(q, rq, bio);
}

#ifdef CONFIG_PM
static void blk_pm_requeue_request(struct request *rq)
{
	if (rq->q->dev && !(rq->rq_flags & RQF_PM))
		rq->q->nr_pending--;
}

static void blk_pm_add_request(struct request_queue *q, struct request *rq)
{
	if (q->dev && !(rq->rq_flags & RQF_PM) && q->nr_pending++ == 0 &&
	    (q->rpm_status == RPM_SUSPENDED || q->rpm_status == RPM_SUSPENDING))
		pm_request_resume(q->dev);
}
#else
static inline void blk_pm_requeue_request(struct request *rq) {}
static inline void blk_pm_add_request(struct request_queue *q,
				      struct request *rq)
{
}
#endif

void elv_requeue_request(struct request_queue *q, struct request *rq)
{
	/*
	 * it already went through dequeue, we need to decrement the
	 * in_flight count again
	 */
	if (blk_account_rq(rq)) {
		q->in_flight[rq_is_sync(rq)]--;
		if (rq->rq_flags & RQF_SORTED)
			elv_deactivate_rq(q, rq);
	}

	rq->rq_flags &= ~RQF_STARTED;

	blk_pm_requeue_request(rq);

	__elv_add_request(q, rq, ELEVATOR_INSERT_REQUEUE);
}

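/*
 * Force the legacy (single-queue) elevator to dispatch all requests it is
 * still holding back onto q->queue_head.
 */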
void elv_drain_elevator(struct request_queue *q)
{
	struct elevator_queue *e = q->elevator;
	static int printed;

	if (WARN_ON_ONCE(e->uses_mq))
		return;

	lockdep_assert_held(q->queue_lock);

	while (e->type->ops.sq.elevator_dispatch_fn(q, 1))
		;
	if (q->nr_sorted && printed++ < 10) {
		printk(KERN_ERR "%s: forced dispatching is broken "
		       "(nr_sorted=%u), please report this\n",
		       q->elevator->type->elevator_name, q->nr_sorted);
	}
}

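/*
 * Insert @rq into the legacy request queue according to @where
 * (ELEVATOR_INSERT_*).  Must be called with the queue lock held.
 */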
void __elv_add_request(struct request_queue *q, struct request *rq, int where)
{
	trace_block_rq_insert(q, rq);

	blk_pm_add_request(q, rq);

	rq->q = q;

	if (rq->rq_flags & RQF_SOFTBARRIER) {
		/* barriers are scheduling boundary, update end_sector */
		if (!blk_rq_is_passthrough(rq)) {
			q->end_sector = rq_end_sector(rq);
			q->boundary_rq = rq;
		}
	} else if (!(rq->rq_flags & RQF_ELVPRIV) &&
		    (where == ELEVATOR_INSERT_SORT ||
		     where == ELEVATOR_INSERT_SORT_MERGE))
		where = ELEVATOR_INSERT_BACK;

	switch (where) {
	case ELEVATOR_INSERT_REQUEUE:
	case ELEVATOR_INSERT_FRONT:
		rq->rq_flags |= RQF_SOFTBARRIER;
		list_add(&rq->queuelist, &q->queue_head);
		break;

	case ELEVATOR_INSERT_BACK:
		rq->rq_flags |= RQF_SOFTBARRIER;
		elv_drain_elevator(q);
		list_add_tail(&rq->queuelist, &q->queue_head);
		/*
		 * We kick the queue here for the following reasons.
		 * - The elevator might have returned NULL previously
		 *   to delay requests and returned them now.  As the
		 *   queue wasn't empty before this request, ll_rw_blk
		 *   won't run the queue on return, resulting in hang.
		 * - Usually, back inserted requests won't be merged
		 *   with anything.  There's no point in delaying queue
		 *   processing.
		 */
		__blk_run_queue(q);
		break;

	case ELEVATOR_INSERT_SORT_MERGE:
		/*
		 * If we succeed in merging this request with one in the
		 * queue already, we are done - rq has now been freed,
		 * so no need to do anything further.
		 */
		if (elv_attempt_insert_merge(q, rq))
			break;
		/* fall through */
	case ELEVATOR_INSERT_SORT:
		BUG_ON(blk_rq_is_passthrough(rq));
		rq->rq_flags |= RQF_SORTED;
		q->nr_sorted++;
		if (rq_mergeable(rq)) {
			elv_rqhash_add(q, rq);
			if (!q->last_merge)
				q->last_merge = rq;
		}

		/*
		 * Some ioscheds (cfq) run q->request_fn directly, so
		 * rq cannot be accessed after calling
		 * elevator_add_req_fn.
		 */
		q->elevator->type->ops.sq.elevator_add_req_fn(q, rq);
		break;

	case ELEVATOR_INSERT_FLUSH:
		rq->rq_flags |= RQF_SOFTBARRIER;
		blk_insert_flush(rq);
		break;
	default:
		printk(KERN_ERR "%s: bad insertion point %d\n",
		       __func__, where);
		BUG();
	}
}
EXPORT_SYMBOL(__elv_add_request);

void elv_add_request(struct request_queue *q, struct request *rq, int where)
{
	unsigned long flags;

	spin_lock_irqsave(q->queue_lock, flags);
	__elv_add_request(q, rq, where);
	spin_unlock_irqrestore(q->queue_lock, flags);
}
EXPORT_SYMBOL(elv_add_request);

struct request *elv_latter_request(struct request_queue *q, struct request *rq)
{
	struct elevator_queue *e = q->elevator;

	if (e->uses_mq && e->type->ops.mq.next_request)
		return e->type->ops.mq.next_request(q, rq);
	else if (!e->uses_mq && e->type->ops.sq.elevator_latter_req_fn)
		return e->type->ops.sq.elevator_latter_req_fn(q, rq);

	return NULL;
}

struct request *elv_former_request(struct request_queue *q, struct request *rq)
{
	struct elevator_queue *e = q->elevator;

	if (e->uses_mq && e->type->ops.mq.former_request)
		return e->type->ops.mq.former_request(q, rq);
	if (!e->uses_mq && e->type->ops.sq.elevator_former_req_fn)
		return e->type->ops.sq.elevator_former_req_fn(q, rq);

	return NULL;
}

int elv_set_request(struct request_queue *q, struct request *rq,
		    struct bio *bio, gfp_t gfp_mask)
{
	struct elevator_queue *e = q->elevator;

	if (WARN_ON_ONCE(e->uses_mq))
		return 0;

	if (e->type->ops.sq.elevator_set_req_fn)
		return e->type->ops.sq.elevator_set_req_fn(q, rq, bio, gfp_mask);
	return 0;
}

void elv_put_request(struct request_queue *q, struct request *rq)
{
	struct elevator_queue *e = q->elevator;

	if (WARN_ON_ONCE(e->uses_mq))
		return;

	if (e->type->ops.sq.elevator_put_req_fn)
		e->type->ops.sq.elevator_put_req_fn(rq);
}

int elv_may_queue(struct request_queue *q, unsigned int op)
{
	struct elevator_queue *e = q->elevator;

	if (WARN_ON_ONCE(e->uses_mq))
		return 0;

	if (e->type->ops.sq.elevator_may_queue_fn)
		return e->type->ops.sq.elevator_may_queue_fn(q, op);

	return ELV_MQUEUE_MAY;
}

void elv_completed_request(struct request_queue *q, struct request *rq)
{
	struct elevator_queue *e = q->elevator;

	if (WARN_ON_ONCE(e->uses_mq))
		return;

	/*
	 * request is released from the driver, io must be done
	 */
	if (blk_account_rq(rq)) {
		q->in_flight[rq_is_sync(rq)]--;
		if ((rq->rq_flags & RQF_SORTED) &&
		    e->type->ops.sq.elevator_completed_req_fn)
			e->type->ops.sq.elevator_completed_req_fn(q, rq);
	}
}

#define to_elv(atr) container_of((atr), struct elv_fs_entry, attr)

static ssize_t
elv_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
{
	struct elv_fs_entry *entry = to_elv(attr);
	struct elevator_queue *e;
	ssize_t error;

	if (!entry->show)
		return -EIO;

	e = container_of(kobj, struct elevator_queue, kobj);
	mutex_lock(&e->sysfs_lock);
	error = e->type ? entry->show(e, page) : -ENOENT;
	mutex_unlock(&e->sysfs_lock);
	return error;
}

static ssize_t
elv_attr_store(struct kobject *kobj, struct attribute *attr,
	       const char *page, size_t length)
{
	struct elv_fs_entry *entry = to_elv(attr);
	struct elevator_queue *e;
	ssize_t error;

	if (!entry->store)
		return -EIO;

	e = container_of(kobj, struct elevator_queue, kobj);
	mutex_lock(&e->sysfs_lock);
	error = e->type ? entry->store(e, page, length) : -ENOENT;
	mutex_unlock(&e->sysfs_lock);
	return error;
}

static const struct sysfs_ops elv_sysfs_ops = {
	.show	= elv_attr_show,
	.store	= elv_attr_store,
};

static struct kobj_type elv_ktype = {
	.sysfs_ops	= &elv_sysfs_ops,
	.release	= elevator_release,
};

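/*
 * Register the active elevator's "iosched" kobject and its sysfs attributes
 * below the queue kobject.  Called with q->sysfs_lock held.
 */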
int elv_register_queue(struct request_queue *q)
{
	struct elevator_queue *e = q->elevator;
	int error;

	lockdep_assert_held(&q->sysfs_lock);

	error = kobject_add(&e->kobj, &q->kobj, "%s", "iosched");
	if (!error) {
		struct elv_fs_entry *attr = e->type->elevator_attrs;
		if (attr) {
			while (attr->attr.name) {
				if (sysfs_create_file(&e->kobj, &attr->attr))
					break;
				attr++;
			}
		}
		kobject_uevent(&e->kobj, KOBJ_ADD);
		e->registered = 1;
		if (!e->uses_mq && e->type->ops.sq.elevator_registered_fn)
			e->type->ops.sq.elevator_registered_fn(q);
	}
	return error;
}

void elv_unregister_queue(struct request_queue *q)
{
	lockdep_assert_held(&q->sysfs_lock);

	if (q) {
		struct elevator_queue *e = q->elevator;

		kobject_uevent(&e->kobj, KOBJ_REMOVE);
		kobject_del(&e->kobj);
		e->registered = 0;
		/* Re-enable throttling in case elevator disabled it */
		wbt_enable_default(q);
	}
}

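/*
 * Register an elevator type so it can be selected for queues.  Registration
 * fails with -EBUSY if a scheduler with the same name and the same mq
 * capability already exists.
 */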
int elv_register(struct elevator_type *e)
{
	char *def = "";

	/* create icq_cache if requested */
	if (e->icq_size) {
		if (WARN_ON(e->icq_size < sizeof(struct io_cq)) ||
		    WARN_ON(e->icq_align < __alignof__(struct io_cq)))
			return -EINVAL;

		snprintf(e->icq_cache_name, sizeof(e->icq_cache_name),
			 "%s_io_cq", e->elevator_name);
		e->icq_cache = kmem_cache_create(e->icq_cache_name, e->icq_size,
						 e->icq_align, 0, NULL);
		if (!e->icq_cache)
			return -ENOMEM;
	}

	/* register, don't allow duplicate names */
	spin_lock(&elv_list_lock);
	if (elevator_find(e->elevator_name, e->uses_mq)) {
		spin_unlock(&elv_list_lock);
		kmem_cache_destroy(e->icq_cache);
		return -EBUSY;
	}
	list_add_tail(&e->list, &elv_list);
	spin_unlock(&elv_list_lock);

	/* print pretty message */
	if (elevator_match(e, chosen_elevator) ||
			(!*chosen_elevator &&
			 elevator_match(e, CONFIG_DEFAULT_IOSCHED)))
				def = " (default)";

	printk(KERN_INFO "io scheduler %s registered%s\n", e->elevator_name,
								def);
	return 0;
}
EXPORT_SYMBOL_GPL(elv_register);

void elv_unregister(struct elevator_type *e)
{
	/* unregister */
	spin_lock(&elv_list_lock);
	list_del_init(&e->list);
	spin_unlock(&elv_list_lock);

	/*
	 * Destroy icq_cache if it exists.  icq's are RCU managed.  Make
	 * sure all RCU operations are complete before proceeding.
	 */
	if (e->icq_cache) {
		rcu_barrier();
		kmem_cache_destroy(e->icq_cache);
		e->icq_cache = NULL;
	}
}
EXPORT_SYMBOL_GPL(elv_unregister);

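/*
 * Switch a blk-mq queue to the scheduler @new_e (NULL means "none").
 * Must be called with q->sysfs_lock held.
 */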
int elevator_switch_mq(struct request_queue *q,
			      struct elevator_type *new_e)
{
	int ret;

	lockdep_assert_held(&q->sysfs_lock);

	if (q->elevator) {
		if (q->elevator->registered)
			elv_unregister_queue(q);
		ioc_clear_queue(q);
		elevator_exit(q, q->elevator);
	}

	ret = blk_mq_init_sched(q, new_e);
	if (ret)
		goto out;

	if (new_e) {
		ret = elv_register_queue(q);
		if (ret) {
			elevator_exit(q, q->elevator);
			goto out;
		}
	}

	if (new_e)
		blk_add_trace_msg(q, "elv switch: %s", new_e->elevator_name);
	else
		blk_add_trace_msg(q, "elv switch: none");

out:
	return ret;
}

/*
 * For blk-mq devices, we default to using mq-deadline, if available, for single
 * queue devices.  If deadline isn't available OR we have multiple queues,
 * default to "none".
 */
int elevator_init_mq(struct request_queue *q)
{
	struct elevator_type *e;
	int err = 0;

	if (q->nr_hw_queues != 1)
		return 0;

	/*
	 * q->sysfs_lock must be held to provide mutual exclusion between
	 * elevator_switch() and here.
	 */
	mutex_lock(&q->sysfs_lock);
	if (unlikely(q->elevator))
		goto out_unlock;

	e = elevator_get(q, "mq-deadline", false);
	if (!e)
		goto out_unlock;

	err = blk_mq_init_sched(q, e);
	if (err)
		elevator_put(e);
out_unlock:
	mutex_unlock(&q->sysfs_lock);
	return err;
}


/*
 * switch to new_e io scheduler. be careful not to introduce deadlocks -
 * we don't free the old io scheduler, before we have allocated what we
 * need for the new one. this way we have a chance of going back to the old
 * one, if the new one fails init for some reason.
 */
static int elevator_switch(struct request_queue *q, struct elevator_type *new_e)
{
	struct elevator_queue *old = q->elevator;
	bool old_registered = false;
	int err;

	lockdep_assert_held(&q->sysfs_lock);

	if (q->mq_ops) {
		blk_mq_freeze_queue(q);
		blk_mq_quiesce_queue(q);

		err = elevator_switch_mq(q, new_e);

		blk_mq_unquiesce_queue(q);
		blk_mq_unfreeze_queue(q);

		return err;
	}

	/*
	 * Turn on BYPASS and drain all requests w/ elevator private data.
	 * Block layer doesn't call into a quiesced elevator - all requests
	 * are directly put on the dispatch list without elevator data
	 * using INSERT_BACK.  All requests have SOFTBARRIER set and no
	 * merge happens either.
	 */
	if (old) {
		old_registered = old->registered;

		blk_queue_bypass_start(q);

		/* unregister and clear all auxiliary data of the old elevator */
		if (old_registered)
			elv_unregister_queue(q);

		ioc_clear_queue(q);
	}

	/* allocate, init and register new elevator */
	err = new_e->ops.sq.elevator_init_fn(q, new_e);
	if (err)
		goto fail_init;

	err = elv_register_queue(q);
	if (err)
		goto fail_register;

	/* done, kill the old one and finish */
	if (old) {
		elevator_exit(q, old);
		blk_queue_bypass_end(q);
	}

	blk_add_trace_msg(q, "elv switch: %s", new_e->elevator_name);

	return 0;

fail_register:
	elevator_exit(q, q->elevator);
fail_init:
	/* switch failed, restore and re-register old elevator */
	if (old) {
		q->elevator = old;
		elv_register_queue(q);
		blk_queue_bypass_end(q);
	}

	return err;
}

/*
 * Switch this queue to the given IO scheduler.
 */
static int __elevator_change(struct request_queue *q, const char *name)
{
	char elevator_name[ELV_NAME_MAX];
	struct elevator_type *e;

	/* Make sure queue is not in the middle of being removed */
	if (!test_bit(QUEUE_FLAG_REGISTERED, &q->queue_flags))
		return -ENOENT;

	/*
	 * Special case for mq, turn off scheduling
	 */
	if (q->mq_ops && !strncmp(name, "none", 4))
		return elevator_switch(q, NULL);

	strlcpy(elevator_name, name, sizeof(elevator_name));
	e = elevator_get(q, strstrip(elevator_name), true);
	if (!e)
		return -EINVAL;

	if (q->elevator && elevator_match(q->elevator->type, elevator_name)) {
		elevator_put(e);
		return 0;
	}

	return elevator_switch(q, e);
}

static inline bool elv_support_iosched(struct request_queue *q)
{
	if (q->mq_ops && q->tag_set && (q->tag_set->flags &
				BLK_MQ_F_NO_SCHED))
		return false;
	return true;
}

ssize_t elv_iosched_store(struct request_queue *q, const char *name,
			  size_t count)
{
	int ret;

	if (!(q->mq_ops || q->request_fn) || !elv_support_iosched(q))
		return count;

	ret = __elevator_change(q, name);
	if (!ret)
		return count;

	return ret;
}

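/*
 * Show all schedulers that are valid for this queue, with the currently
 * active one (or "none") in square brackets.
 */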
ssize_t elv_iosched_show(struct request_queue *q, char *name)
{
	struct elevator_queue *e = q->elevator;
	struct elevator_type *elv = NULL;
	struct elevator_type *__e;
	bool uses_mq = q->mq_ops != NULL;
	int len = 0;

	if (!queue_is_rq_based(q))
		return sprintf(name, "none\n");

	if (!q->elevator)
		len += sprintf(name+len, "[none] ");
	else
		elv = e->type;

	spin_lock(&elv_list_lock);
	list_for_each_entry(__e, &elv_list, list) {
		if (elv && elevator_match(elv, __e->elevator_name) &&
		    (__e->uses_mq == uses_mq)) {
			len += sprintf(name+len, "[%s] ", elv->elevator_name);
			continue;
		}
		if (__e->uses_mq && q->mq_ops && elv_support_iosched(q))
			len += sprintf(name+len, "%s ", __e->elevator_name);
		else if (!__e->uses_mq && !q->mq_ops)
			len += sprintf(name+len, "%s ", __e->elevator_name);
	}
	spin_unlock(&elv_list_lock);

	if (q->mq_ops && q->elevator)
		len += sprintf(name+len, "none");

	len += sprintf(len+name, "\n");
	return len;
}

struct request *elv_rb_former_request(struct request_queue *q,
				      struct request *rq)
{
	struct rb_node *rbprev = rb_prev(&rq->rb_node);

	if (rbprev)
		return rb_entry_rq(rbprev);

	return NULL;
}
EXPORT_SYMBOL(elv_rb_former_request);

struct request *elv_rb_latter_request(struct request_queue *q,
				      struct request *rq)
{
	struct rb_node *rbnext = rb_next(&rq->rb_node);

	if (rbnext)
		return rb_entry_rq(rbnext);

	return NULL;
}
EXPORT_SYMBOL(elv_rb_latter_request);