/*
 *  Block device elevator/IO-scheduler.
 *
 *  Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE
 *
 * 30042000 Jens Axboe <axboe@kernel.dk> :
 *
 * Split the elevator a bit so that it is possible to choose a different
 * one or even write a new "plug in". There are three pieces:
 * - elevator_fn, inserts a new request in the queue list
 * - elevator_merge_fn, decides whether a new buffer can be merged with
 *   an existing request
 * - elevator_dequeue_fn, called when a request is taken off the active list
 *
 * 20082000 Dave Jones <davej@suse.de> :
 * Removed tests for max-bomb-segments, which was breaking elvtune
 *  when run without -bN
 *
 * Jens:
 * - Rework again to work with bio instead of buffer_heads
 * - lose bi_dev comparisons, partition handling is right now
 * - completely modularize elevator setup and teardown
 *
 */
#include <linux/kernel.h>
#include <linux/fs.h>
#include <linux/blkdev.h>
#include <linux/elevator.h>
#include <linux/bio.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/compiler.h>
#include <linux/blktrace_api.h>
#include <linux/hash.h>
#include <linux/uaccess.h>
#include <linux/pm_runtime.h>
#include <linux/blk-cgroup.h>

#include <trace/events/block.h>

#include "blk.h"
#include "blk-mq-sched.h"
#include "blk-wbt.h"

static DEFINE_SPINLOCK(elv_list_lock);
static LIST_HEAD(elv_list);

/*
 * Merge hash stuff.
 */
#define rq_hash_key(rq)		(blk_rq_pos(rq) + blk_rq_sectors(rq))

/*
 * Query io scheduler to see if the current process issuing bio may be
 * merged with rq.
 */
static int elv_iosched_allow_bio_merge(struct request *rq, struct bio *bio)
{
	struct request_queue *q = rq->q;
	struct elevator_queue *e = q->elevator;

	if (e->uses_mq && e->type->ops.mq.allow_merge)
		return e->type->ops.mq.allow_merge(q, rq, bio);
	else if (!e->uses_mq && e->type->ops.sq.elevator_allow_bio_merge_fn)
		return e->type->ops.sq.elevator_allow_bio_merge_fn(q, rq, bio);

	return 1;
}

/*
 * can we safely merge with this request?
 */
bool elv_bio_merge_ok(struct request *rq, struct bio *bio)
{
	if (!blk_rq_merge_ok(rq, bio))
		return false;

	if (!elv_iosched_allow_bio_merge(rq, bio))
		return false;

	return true;
}
EXPORT_SYMBOL(elv_bio_merge_ok);

static struct elevator_type *elevator_find(const char *name)
{
	struct elevator_type *e;

	list_for_each_entry(e, &elv_list, list) {
		if (!strcmp(e->elevator_name, name))
			return e;
	}

	return NULL;
}

static void elevator_put(struct elevator_type *e)
{
	module_put(e->elevator_owner);
}

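/*
 * Look up an elevator type by name and take a reference on its module,
 * optionally asking modprobe for "<name>-iosched" if it isn't registered yet.
 */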
static struct elevator_type *elevator_get(const char *name, bool try_loading)
{
	struct elevator_type *e;

	spin_lock(&elv_list_lock);

	e = elevator_find(name);
	if (!e && try_loading) {
		spin_unlock(&elv_list_lock);
		request_module("%s-iosched", name);
		spin_lock(&elv_list_lock);
		e = elevator_find(name);
	}

	if (e && !try_module_get(e->elevator_owner))
		e = NULL;

	spin_unlock(&elv_list_lock);

	return e;
}

static char chosen_elevator[ELV_NAME_MAX];

static int __init elevator_setup(char *str)
{
	/*
	 * Be backwards-compatible with previous kernels, so users
	 * won't get the wrong elevator.
	 */
	strncpy(chosen_elevator, str, sizeof(chosen_elevator) - 1);
	return 1;
}

__setup("elevator=", elevator_setup);

/* called during boot to load the elevator chosen by the elevator param */
void __init load_default_elevator_module(void)
{
	struct elevator_type *e;

	if (!chosen_elevator[0])
		return;

	spin_lock(&elv_list_lock);
	e = elevator_find(chosen_elevator);
	spin_unlock(&elv_list_lock);

	if (!e)
		request_module("%s-iosched", chosen_elevator);
}

static struct kobj_type elv_ktype;

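/* Allocate and minimally initialize an elevator_queue instance for @q. */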
struct elevator_queue *elevator_alloc(struct request_queue *q,
				  struct elevator_type *e)
{
	struct elevator_queue *eq;

	eq = kzalloc_node(sizeof(*eq), GFP_KERNEL, q->node);
	if (unlikely(!eq))
		return NULL;

	eq->type = e;
	kobject_init(&eq->kobj, &elv_ktype);
	mutex_init(&eq->sysfs_lock);
	hash_init(eq->hash);
	eq->uses_mq = e->uses_mq;

	return eq;
}
EXPORT_SYMBOL(elevator_alloc);

static void elevator_release(struct kobject *kobj)
{
	struct elevator_queue *e;

	e = container_of(kobj, struct elevator_queue, kobj);
	elevator_put(e->type);
	kfree(e);
}

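/*
 * Pick and initialize an I/O scheduler for @q: an explicit @name if given,
 * otherwise the "elevator=" boot parameter for legacy queues, otherwise the
 * built-in defaults (mq-deadline or none for blk-mq, CONFIG_DEFAULT_IOSCHED
 * with a noop fallback for legacy queues).
 */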
int elevator_init(struct request_queue *q, char *name)
{
	struct elevator_type *e = NULL;
	int err;

	/*
	 * q->sysfs_lock must be held to provide mutual exclusion between
	 * elevator_switch() and here.
	 */
	lockdep_assert_held(&q->sysfs_lock);

	if (unlikely(q->elevator))
		return 0;

	INIT_LIST_HEAD(&q->queue_head);
	q->last_merge = NULL;
	q->end_sector = 0;
	q->boundary_rq = NULL;

	if (name) {
		e = elevator_get(name, true);
		if (!e)
			return -EINVAL;
	}

	/*
	 * Use the default elevator specified by config boot param for
	 * non-mq devices, or by config option. Don't try to load modules
	 * as we could be running off async and request_module() isn't
	 * allowed from async.
	 */
	if (!e && !q->mq_ops && *chosen_elevator) {
		e = elevator_get(chosen_elevator, false);
		if (!e)
			printk(KERN_ERR "I/O scheduler %s not found\n",
							chosen_elevator);
	}

	if (!e) {
		/*
		 * For blk-mq devices, we default to using mq-deadline,
		 * if available, for single queue devices. If deadline
		 * isn't available OR we have multiple queues, default
		 * to "none".
		 */
		if (q->mq_ops) {
			if (q->nr_hw_queues == 1)
				e = elevator_get("mq-deadline", false);
			if (!e)
				return 0;
		} else
			e = elevator_get(CONFIG_DEFAULT_IOSCHED, false);

		if (!e) {
			printk(KERN_ERR
				"Default I/O scheduler not found. " \
				"Using noop.\n");
			e = elevator_get("noop", false);
		}
	}

	if (e->uses_mq)
		err = blk_mq_init_sched(q, e);
	else
		err = e->ops.sq.elevator_init_fn(q, e);
	if (err)
		elevator_put(e);
	return err;
}
EXPORT_SYMBOL(elevator_init);

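/* Tear down the scheduler instance and drop the final kobject reference. */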
void elevator_exit(struct request_queue *q, struct elevator_queue *e)
{
	mutex_lock(&e->sysfs_lock);
	if (e->uses_mq && e->type->ops.mq.exit_sched)
		blk_mq_exit_sched(q, e);
	else if (!e->uses_mq && e->type->ops.sq.elevator_exit_fn)
		e->type->ops.sq.elevator_exit_fn(e);
	mutex_unlock(&e->sysfs_lock);

	kobject_put(&e->kobj);
}
EXPORT_SYMBOL(elevator_exit);

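/*
 * Back-merge hash helpers: requests are keyed by the sector immediately
 * following their last sector (rq_hash_key), so a bio starting there can
 * quickly find a request to append to.
 */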
static inline void __elv_rqhash_del(struct request *rq)
{
	hash_del(&rq->hash);
	rq->rq_flags &= ~RQF_HASHED;
}

void elv_rqhash_del(struct request_queue *q, struct request *rq)
{
	if (ELV_ON_HASH(rq))
		__elv_rqhash_del(rq);
}
EXPORT_SYMBOL_GPL(elv_rqhash_del);

void elv_rqhash_add(struct request_queue *q, struct request *rq)
{
	struct elevator_queue *e = q->elevator;

	BUG_ON(ELV_ON_HASH(rq));
	hash_add(e->hash, &rq->hash, rq_hash_key(rq));
	rq->rq_flags |= RQF_HASHED;
}
EXPORT_SYMBOL_GPL(elv_rqhash_add);

void elv_rqhash_reposition(struct request_queue *q, struct request *rq)
{
	__elv_rqhash_del(rq);
	elv_rqhash_add(q, rq);
}

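/*
 * Find a hashed request whose end matches @offset, pruning entries that are
 * no longer mergeable as they are encountered.
 */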
struct request *elv_rqhash_find(struct request_queue *q, sector_t offset)
{
	struct elevator_queue *e = q->elevator;
	struct hlist_node *next;
	struct request *rq;

	hash_for_each_possible_safe(e->hash, rq, next, hash, offset) {
		BUG_ON(!ELV_ON_HASH(rq));

		if (unlikely(!rq_mergeable(rq))) {
			__elv_rqhash_del(rq);
			continue;
		}

		if (rq_hash_key(rq) == offset)
			return rq;
	}

	return NULL;
}

/*
 * RB-tree support functions for inserting/lookup/removal of requests
 * in a sorted RB tree.
 */
void elv_rb_add(struct rb_root *root, struct request *rq)
{
	struct rb_node **p = &root->rb_node;
	struct rb_node *parent = NULL;
	struct request *__rq;

	while (*p) {
		parent = *p;
		__rq = rb_entry(parent, struct request, rb_node);

		if (blk_rq_pos(rq) < blk_rq_pos(__rq))
			p = &(*p)->rb_left;
		else if (blk_rq_pos(rq) >= blk_rq_pos(__rq))
			p = &(*p)->rb_right;
	}

	rb_link_node(&rq->rb_node, parent, p);
	rb_insert_color(&rq->rb_node, root);
}
EXPORT_SYMBOL(elv_rb_add);

void elv_rb_del(struct rb_root *root, struct request *rq)
{
	BUG_ON(RB_EMPTY_NODE(&rq->rb_node));
	rb_erase(&rq->rb_node, root);
	RB_CLEAR_NODE(&rq->rb_node);
}
EXPORT_SYMBOL(elv_rb_del);

struct request *elv_rb_find(struct rb_root *root, sector_t sector)
{
	struct rb_node *n = root->rb_node;
	struct request *rq;

	while (n) {
		rq = rb_entry(n, struct request, rb_node);

		if (sector < blk_rq_pos(rq))
			n = n->rb_left;
		else if (sector > blk_rq_pos(rq))
			n = n->rb_right;
		else
			return rq;
	}

	return NULL;
}
EXPORT_SYMBOL(elv_rb_find);

/*
 * Insert rq into dispatch queue of q.  Queue lock must be held on
 * entry.  rq is sorted into the dispatch queue. To be used by
 * specific elevators.
 */
void elv_dispatch_sort(struct request_queue *q, struct request *rq)
{
	sector_t boundary;
	struct list_head *entry;

	if (q->last_merge == rq)
		q->last_merge = NULL;

	elv_rqhash_del(q, rq);

	q->nr_sorted--;

	boundary = q->end_sector;
	list_for_each_prev(entry, &q->queue_head) {
		struct request *pos = list_entry_rq(entry);

		if (req_op(rq) != req_op(pos))
			break;
		if (rq_data_dir(rq) != rq_data_dir(pos))
			break;
		if (pos->rq_flags & (RQF_STARTED | RQF_SOFTBARRIER))
			break;
		if (blk_rq_pos(rq) >= boundary) {
			if (blk_rq_pos(pos) < boundary)
				continue;
		} else {
			if (blk_rq_pos(pos) >= boundary)
				break;
		}
		if (blk_rq_pos(rq) >= blk_rq_pos(pos))
			break;
	}

	list_add(&rq->queuelist, entry);
}
EXPORT_SYMBOL(elv_dispatch_sort);

/*
 * Insert rq into dispatch queue of q.  Queue lock must be held on
 * entry.  rq is added to the back of the dispatch queue. To be used by
 * specific elevators.
 */
void elv_dispatch_add_tail(struct request_queue *q, struct request *rq)
{
	if (q->last_merge == rq)
		q->last_merge = NULL;

	elv_rqhash_del(q, rq);

	q->nr_sorted--;

	q->end_sector = rq_end_sector(rq);
	q->boundary_rq = rq;
	list_add_tail(&rq->queuelist, &q->queue_head);
}
EXPORT_SYMBOL(elv_dispatch_add_tail);

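/*
 * Decide whether and how @bio may be merged with a pending request: try the
 * one-hit last_merge cache, then the back-merge hash, then ask the elevator.
 * On success, *req points at the chosen request.
 */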
enum elv_merge elv_merge(struct request_queue *q, struct request **req,
		struct bio *bio)
{
	struct elevator_queue *e = q->elevator;
	struct request *__rq;

	/*
	 * Levels of merges:
	 * 	nomerges:  No merges at all attempted
	 * 	noxmerges: Only simple one-hit cache try
	 * 	merges:	   All merge tries attempted
	 */
	if (blk_queue_nomerges(q) || !bio_mergeable(bio))
		return ELEVATOR_NO_MERGE;

	/*
	 * First try one-hit cache.
	 */
	if (q->last_merge && elv_bio_merge_ok(q->last_merge, bio)) {
		enum elv_merge ret = blk_try_merge(q->last_merge, bio);

		if (ret != ELEVATOR_NO_MERGE) {
			*req = q->last_merge;
			return ret;
		}
	}

	if (blk_queue_noxmerges(q))
		return ELEVATOR_NO_MERGE;

	/*
	 * See if our hash lookup can find a potential backmerge.
	 */
	__rq = elv_rqhash_find(q, bio->bi_iter.bi_sector);
	if (__rq && elv_bio_merge_ok(__rq, bio)) {
		*req = __rq;
		return ELEVATOR_BACK_MERGE;
	}

	if (e->uses_mq && e->type->ops.mq.request_merge)
		return e->type->ops.mq.request_merge(q, req, bio);
	else if (!e->uses_mq && e->type->ops.sq.elevator_merge_fn)
		return e->type->ops.sq.elevator_merge_fn(q, req, bio);

	return ELEVATOR_NO_MERGE;
}

/*
 * Attempt to do an insertion back merge. Only check for the case where
 * we can append 'rq' to an existing request, so we can throw 'rq' away
 * afterwards.
 *
 * Returns true if we merged, false otherwise
 */
bool elv_attempt_insert_merge(struct request_queue *q, struct request *rq)
{
	struct request *__rq;
	bool ret;

	if (blk_queue_nomerges(q))
		return false;

	/*
	 * First try one-hit cache.
	 */
	if (q->last_merge && blk_attempt_req_merge(q, q->last_merge, rq))
		return true;

	if (blk_queue_noxmerges(q))
		return false;

	ret = false;
	/*
	 * See if our hash lookup can find a potential backmerge.
	 */
	while (1) {
		__rq = elv_rqhash_find(q, blk_rq_pos(rq));
		if (!__rq || !blk_attempt_req_merge(q, __rq, rq))
			break;

		/* The merged request could be merged with others, try again */
		ret = true;
		rq = __rq;
	}

	return ret;
}

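/*
 * @rq just grew by a merge of @type: notify the elevator, reposition the
 * request in the back-merge hash and remember it as the last merge target.
 */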
void elv_merged_request(struct request_queue *q, struct request *rq,
		enum elv_merge type)
{
	struct elevator_queue *e = q->elevator;

	if (e->uses_mq && e->type->ops.mq.request_merged)
		e->type->ops.mq.request_merged(q, rq, type);
	else if (!e->uses_mq && e->type->ops.sq.elevator_merged_fn)
		e->type->ops.sq.elevator_merged_fn(q, rq, type);

	if (type == ELEVATOR_BACK_MERGE)
		elv_rqhash_reposition(q, rq);

	q->last_merge = rq;
}

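/*
 * @next has been merged into @rq and is about to be freed: give the elevator
 * and the hash/sort accounting a chance to catch up.
 */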
void elv_merge_requests(struct request_queue *q, struct request *rq,
			     struct request *next)
{
	struct elevator_queue *e = q->elevator;
	bool next_sorted = false;

	if (e->uses_mq && e->type->ops.mq.requests_merged)
		e->type->ops.mq.requests_merged(q, rq, next);
	else if (e->type->ops.sq.elevator_merge_req_fn) {
		next_sorted = (__force bool)(next->rq_flags & RQF_SORTED);
		if (next_sorted)
			e->type->ops.sq.elevator_merge_req_fn(q, rq, next);
	}

	elv_rqhash_reposition(q, rq);

	if (next_sorted) {
		elv_rqhash_del(q, next);
		q->nr_sorted--;
	}

	q->last_merge = rq;
}

void elv_bio_merged(struct request_queue *q, struct request *rq,
			struct bio *bio)
{
	struct elevator_queue *e = q->elevator;

	if (WARN_ON_ONCE(e->uses_mq))
		return;

	if (e->type->ops.sq.elevator_bio_merged_fn)
		e->type->ops.sq.elevator_bio_merged_fn(q, rq, bio);
}

#ifdef CONFIG_PM
static void blk_pm_requeue_request(struct request *rq)
{
	if (rq->q->dev && !(rq->rq_flags & RQF_PM))
		rq->q->nr_pending--;
}

static void blk_pm_add_request(struct request_queue *q, struct request *rq)
{
	if (q->dev && !(rq->rq_flags & RQF_PM) && q->nr_pending++ == 0 &&
	    (q->rpm_status == RPM_SUSPENDED || q->rpm_status == RPM_SUSPENDING))
		pm_request_resume(q->dev);
}
#else
static inline void blk_pm_requeue_request(struct request *rq) {}
static inline void blk_pm_add_request(struct request_queue *q,
				      struct request *rq)
{
}
#endif

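/*
 * The driver has handed @rq back to us: undo the in-flight accounting and
 * reinsert the request with ELEVATOR_INSERT_REQUEUE.
 */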
void elv_requeue_request(struct request_queue *q, struct request *rq)
{
	/*
	 * it already went through dequeue, we need to decrement the
	 * in_flight count again
	 */
	if (blk_account_rq(rq)) {
		q->in_flight[rq_is_sync(rq)]--;
		if (rq->rq_flags & RQF_SORTED)
			elv_deactivate_rq(q, rq);
	}

	rq->rq_flags &= ~RQF_STARTED;

	blk_pm_requeue_request(rq);

	__elv_add_request(q, rq, ELEVATOR_INSERT_REQUEUE);
}

void elv_drain_elevator(struct request_queue *q)
{
	struct elevator_queue *e = q->elevator;
	static int printed;

	if (WARN_ON_ONCE(e->uses_mq))
		return;

	lockdep_assert_held(q->queue_lock);

	while (e->type->ops.sq.elevator_dispatch_fn(q, 1))
		;
	if (q->nr_sorted && printed++ < 10) {
		printk(KERN_ERR "%s: forced dispatching is broken "
		       "(nr_sorted=%u), please report this\n",
		       q->elevator->type->elevator_name, q->nr_sorted);
	}
}

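/*
 * Core insertion path for the legacy request queue: route @rq to the front,
 * the back, the elevator's sort list or the flush machinery depending on
 * @where.  Callers hold the queue lock, as the elv_add_request() wrapper does.
 */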
void __elv_add_request(struct request_queue *q, struct request *rq, int where)
{
	trace_block_rq_insert(q, rq);

	blk_pm_add_request(q, rq);

	rq->q = q;

	if (rq->rq_flags & RQF_SOFTBARRIER) {
		/* barriers are scheduling boundary, update end_sector */
		if (!blk_rq_is_passthrough(rq)) {
			q->end_sector = rq_end_sector(rq);
			q->boundary_rq = rq;
		}
	} else if (!(rq->rq_flags & RQF_ELVPRIV) &&
		    (where == ELEVATOR_INSERT_SORT ||
		     where == ELEVATOR_INSERT_SORT_MERGE))
		where = ELEVATOR_INSERT_BACK;

	switch (where) {
	case ELEVATOR_INSERT_REQUEUE:
	case ELEVATOR_INSERT_FRONT:
		rq->rq_flags |= RQF_SOFTBARRIER;
		list_add(&rq->queuelist, &q->queue_head);
		break;

	case ELEVATOR_INSERT_BACK:
		rq->rq_flags |= RQF_SOFTBARRIER;
		elv_drain_elevator(q);
		list_add_tail(&rq->queuelist, &q->queue_head);
		/*
		 * We kick the queue here for the following reasons.
		 * - The elevator might have returned NULL previously
		 *   to delay requests and returned them now.  As the
		 *   queue wasn't empty before this request, ll_rw_blk
		 *   won't run the queue on return, resulting in hang.
		 * - Usually, back inserted requests won't be merged
		 *   with anything.  There's no point in delaying queue
		 *   processing.
		 */
		__blk_run_queue(q);
		break;

	case ELEVATOR_INSERT_SORT_MERGE:
		/*
		 * If we succeed in merging this request with one in the
		 * queue already, we are done - rq has now been freed,
		 * so no need to do anything further.
		 */
		if (elv_attempt_insert_merge(q, rq))
			break;
	case ELEVATOR_INSERT_SORT:
		BUG_ON(blk_rq_is_passthrough(rq));
		rq->rq_flags |= RQF_SORTED;
		q->nr_sorted++;
		if (rq_mergeable(rq)) {
			elv_rqhash_add(q, rq);
			if (!q->last_merge)
				q->last_merge = rq;
		}

		/*
		 * Some ioscheds (cfq) run q->request_fn directly, so
		 * rq cannot be accessed after calling
		 * elevator_add_req_fn.
		 */
		q->elevator->type->ops.sq.elevator_add_req_fn(q, rq);
		break;

	case ELEVATOR_INSERT_FLUSH:
		rq->rq_flags |= RQF_SOFTBARRIER;
		blk_insert_flush(rq);
		break;
	default:
		printk(KERN_ERR "%s: bad insertion point %d\n",
		       __func__, where);
		BUG();
	}
}
EXPORT_SYMBOL(__elv_add_request);

void elv_add_request(struct request_queue *q, struct request *rq, int where)
{
	unsigned long flags;

	spin_lock_irqsave(q->queue_lock, flags);
	__elv_add_request(q, rq, where);
	spin_unlock_irqrestore(q->queue_lock, flags);
}
EXPORT_SYMBOL(elv_add_request);

struct request *elv_latter_request(struct request_queue *q, struct request *rq)
{
	struct elevator_queue *e = q->elevator;

	if (e->uses_mq && e->type->ops.mq.next_request)
		return e->type->ops.mq.next_request(q, rq);
	else if (!e->uses_mq && e->type->ops.sq.elevator_latter_req_fn)
		return e->type->ops.sq.elevator_latter_req_fn(q, rq);

	return NULL;
}

struct request *elv_former_request(struct request_queue *q, struct request *rq)
{
	struct elevator_queue *e = q->elevator;

	if (e->uses_mq && e->type->ops.mq.former_request)
		return e->type->ops.mq.former_request(q, rq);
	if (!e->uses_mq && e->type->ops.sq.elevator_former_req_fn)
		return e->type->ops.sq.elevator_former_req_fn(q, rq);
	return NULL;
}

int elv_set_request(struct request_queue *q, struct request *rq,
		    struct bio *bio, gfp_t gfp_mask)
{
	struct elevator_queue *e = q->elevator;

	if (WARN_ON_ONCE(e->uses_mq))
		return 0;

	if (e->type->ops.sq.elevator_set_req_fn)
		return e->type->ops.sq.elevator_set_req_fn(q, rq, bio, gfp_mask);
	return 0;
}

void elv_put_request(struct request_queue *q, struct request *rq)
{
	struct elevator_queue *e = q->elevator;

	if (WARN_ON_ONCE(e->uses_mq))
		return;

	if (e->type->ops.sq.elevator_put_req_fn)
		e->type->ops.sq.elevator_put_req_fn(rq);
}

int elv_may_queue(struct request_queue *q, unsigned int op)
{
	struct elevator_queue *e = q->elevator;

	if (WARN_ON_ONCE(e->uses_mq))
		return 0;

	if (e->type->ops.sq.elevator_may_queue_fn)
		return e->type->ops.sq.elevator_may_queue_fn(q, op);

	return ELV_MQUEUE_MAY;
}

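/*
 * A request has completed in the driver: update the in-flight count and, for
 * sorted requests, call the legacy elevator's completion hook.
 */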
void elv_completed_request(struct request_queue *q, struct request *rq)
{
	struct elevator_queue *e = q->elevator;

	if (WARN_ON_ONCE(e->uses_mq))
		return;

	/*
	 * request is released from the driver, io must be done
	 */
	if (blk_account_rq(rq)) {
		q->in_flight[rq_is_sync(rq)]--;
		if ((rq->rq_flags & RQF_SORTED) &&
		    e->type->ops.sq.elevator_completed_req_fn)
			e->type->ops.sq.elevator_completed_req_fn(q, rq);
	}
}

#define to_elv(atr) container_of((atr), struct elv_fs_entry, attr)

static ssize_t
elv_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
{
	struct elv_fs_entry *entry = to_elv(attr);
	struct elevator_queue *e;
	ssize_t error;

	if (!entry->show)
		return -EIO;

	e = container_of(kobj, struct elevator_queue, kobj);
	mutex_lock(&e->sysfs_lock);
	error = e->type ? entry->show(e, page) : -ENOENT;
	mutex_unlock(&e->sysfs_lock);
	return error;
}

static ssize_t
elv_attr_store(struct kobject *kobj, struct attribute *attr,
	       const char *page, size_t length)
{
	struct elv_fs_entry *entry = to_elv(attr);
	struct elevator_queue *e;
	ssize_t error;

	if (!entry->store)
		return -EIO;

	e = container_of(kobj, struct elevator_queue, kobj);
	mutex_lock(&e->sysfs_lock);
	error = e->type ? entry->store(e, page, length) : -ENOENT;
	mutex_unlock(&e->sysfs_lock);
	return error;
}

static const struct sysfs_ops elv_sysfs_ops = {
	.show	= elv_attr_show,
	.store	= elv_attr_store,
};

static struct kobj_type elv_ktype = {
	.sysfs_ops	= &elv_sysfs_ops,
	.release	= elevator_release,
};

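/*
 * Expose the active elevator under the queue's sysfs directory ("iosched")
 * and create its attribute files.
 */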
int elv_register_queue(struct request_queue *q)
{
	struct elevator_queue *e = q->elevator;
	int error;

	error = kobject_add(&e->kobj, &q->kobj, "%s", "iosched");
	if (!error) {
		struct elv_fs_entry *attr = e->type->elevator_attrs;
		if (attr) {
			while (attr->attr.name) {
				if (sysfs_create_file(&e->kobj, &attr->attr))
					break;
				attr++;
			}
		}
		kobject_uevent(&e->kobj, KOBJ_ADD);
		e->registered = 1;
		if (!e->uses_mq && e->type->ops.sq.elevator_registered_fn)
			e->type->ops.sq.elevator_registered_fn(q);
	}
	return error;
}
EXPORT_SYMBOL(elv_register_queue);

void elv_unregister_queue(struct request_queue *q)
{
	if (q) {
		struct elevator_queue *e = q->elevator;

		kobject_uevent(&e->kobj, KOBJ_REMOVE);
		kobject_del(&e->kobj);
		e->registered = 0;
		/* Re-enable throttling in case elevator disabled it */
		wbt_enable_default(q);
	}
}
EXPORT_SYMBOL(elv_unregister_queue);

int elv_register(struct elevator_type *e)
{
	char *def = "";

	/* create icq_cache if requested */
	if (e->icq_size) {
		if (WARN_ON(e->icq_size < sizeof(struct io_cq)) ||
		    WARN_ON(e->icq_align < __alignof__(struct io_cq)))
			return -EINVAL;

		snprintf(e->icq_cache_name, sizeof(e->icq_cache_name),
			 "%s_io_cq", e->elevator_name);
		e->icq_cache = kmem_cache_create(e->icq_cache_name, e->icq_size,
						 e->icq_align, 0, NULL);
		if (!e->icq_cache)
			return -ENOMEM;
	}

	/* register, don't allow duplicate names */
	spin_lock(&elv_list_lock);
	if (elevator_find(e->elevator_name)) {
		spin_unlock(&elv_list_lock);
		if (e->icq_cache)
			kmem_cache_destroy(e->icq_cache);
		return -EBUSY;
	}
	list_add_tail(&e->list, &elv_list);
	spin_unlock(&elv_list_lock);

	/* print pretty message */
	if (!strcmp(e->elevator_name, chosen_elevator) ||
			(!*chosen_elevator &&
			 !strcmp(e->elevator_name, CONFIG_DEFAULT_IOSCHED)))
				def = " (default)";

	printk(KERN_INFO "io scheduler %s registered%s\n", e->elevator_name,
								def);
	return 0;
}
EXPORT_SYMBOL_GPL(elv_register);

void elv_unregister(struct elevator_type *e)
{
	/* unregister */
	spin_lock(&elv_list_lock);
	list_del_init(&e->list);
	spin_unlock(&elv_list_lock);

	/*
	 * Destroy icq_cache if it exists.  icq's are RCU managed.  Make
	 * sure all RCU operations are complete before proceeding.
	 */
	if (e->icq_cache) {
		rcu_barrier();
		kmem_cache_destroy(e->icq_cache);
		e->icq_cache = NULL;
	}
}
EXPORT_SYMBOL_GPL(elv_unregister);

static int elevator_switch_mq(struct request_queue *q,
			      struct elevator_type *new_e)
{
	int ret;

	blk_mq_freeze_queue(q);

	if (q->elevator) {
		if (q->elevator->registered)
			elv_unregister_queue(q);
		ioc_clear_queue(q);
		elevator_exit(q, q->elevator);
	}

	ret = blk_mq_init_sched(q, new_e);
	if (ret)
		goto out;

	if (new_e) {
		ret = elv_register_queue(q);
		if (ret) {
			elevator_exit(q, q->elevator);
			goto out;
		}
	}

	if (new_e)
		blk_add_trace_msg(q, "elv switch: %s", new_e->elevator_name);
	else
		blk_add_trace_msg(q, "elv switch: none");

out:
	blk_mq_unfreeze_queue(q);
	return ret;
}

/*
 * switch to new_e io scheduler. be careful not to introduce deadlocks -
 * we don't free the old io scheduler, before we have allocated what we
 * need for the new one. this way we have a chance of going back to the old
 * one, if the new one fails init for some reason.
 */
static int elevator_switch(struct request_queue *q, struct elevator_type *new_e)
{
	struct elevator_queue *old = q->elevator;
	bool old_registered = false;
	int err;

	if (q->mq_ops)
		return elevator_switch_mq(q, new_e);

	/*
	 * Turn on BYPASS and drain all requests w/ elevator private data.
	 * Block layer doesn't call into a quiesced elevator - all requests
	 * are directly put on the dispatch list without elevator data
	 * using INSERT_BACK.  All requests have SOFTBARRIER set and no
	 * merge happens either.
	 */
	if (old) {
		old_registered = old->registered;

		blk_queue_bypass_start(q);

		/* unregister and clear all auxiliary data of the old elevator */
		if (old_registered)
			elv_unregister_queue(q);

		ioc_clear_queue(q);
	}

	/* allocate, init and register new elevator */
	err = new_e->ops.sq.elevator_init_fn(q, new_e);
	if (err)
		goto fail_init;

	err = elv_register_queue(q);
	if (err)
		goto fail_register;

	/* done, kill the old one and finish */
	if (old) {
		elevator_exit(q, old);
		blk_queue_bypass_end(q);
	}

	blk_add_trace_msg(q, "elv switch: %s", new_e->elevator_name);

	return 0;

fail_register:
	elevator_exit(q, q->elevator);
fail_init:
	/* switch failed, restore and re-register old elevator */
	if (old) {
		q->elevator = old;
		elv_register_queue(q);
		blk_queue_bypass_end(q);
	}

	return err;
}

/*
 * Switch this queue to the given IO scheduler.
 */
static int __elevator_change(struct request_queue *q, const char *name)
{
	char elevator_name[ELV_NAME_MAX];
	struct elevator_type *e;

	/*
	 * Special case for mq, turn off scheduling
	 */
	if (q->mq_ops && !strncmp(name, "none", 4))
		return elevator_switch(q, NULL);

	strlcpy(elevator_name, name, sizeof(elevator_name));
	e = elevator_get(strstrip(elevator_name), true);
	if (!e)
		return -EINVAL;

	if (q->elevator &&
	    !strcmp(elevator_name, q->elevator->type->elevator_name)) {
		elevator_put(e);
		return 0;
	}

	if (!e->uses_mq && q->mq_ops) {
		elevator_put(e);
		return -EINVAL;
	}
	if (e->uses_mq && !q->mq_ops) {
		elevator_put(e);
		return -EINVAL;
	}

	return elevator_switch(q, e);
}

static inline bool elv_support_iosched(struct request_queue *q)
{
	if (q->mq_ops && q->tag_set && (q->tag_set->flags &
				BLK_MQ_F_NO_SCHED))
		return false;
	return true;
}

ssize_t elv_iosched_store(struct request_queue *q, const char *name,
			  size_t count)
{
	int ret;

	if (!(q->mq_ops || q->request_fn) || !elv_support_iosched(q))
		return count;

	ret = __elevator_change(q, name);
	if (!ret)
		return count;

	return ret;
}

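/*
 * Format the list of available schedulers for sysfs, showing the active one
 * (or "none") in square brackets.
 */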
ssize_t elv_iosched_show(struct request_queue *q, char *name)
{
	struct elevator_queue *e = q->elevator;
	struct elevator_type *elv = NULL;
	struct elevator_type *__e;
	int len = 0;

	if (!blk_queue_stackable(q))
		return sprintf(name, "none\n");

	if (!q->elevator)
		len += sprintf(name+len, "[none] ");
	else
		elv = e->type;

	spin_lock(&elv_list_lock);
	list_for_each_entry(__e, &elv_list, list) {
		if (elv && !strcmp(elv->elevator_name, __e->elevator_name)) {
			len += sprintf(name+len, "[%s] ", elv->elevator_name);
			continue;
		}
		if (__e->uses_mq && q->mq_ops && elv_support_iosched(q))
			len += sprintf(name+len, "%s ", __e->elevator_name);
		else if (!__e->uses_mq && !q->mq_ops)
			len += sprintf(name+len, "%s ", __e->elevator_name);
	}
	spin_unlock(&elv_list_lock);

	if (q->mq_ops && q->elevator)
		len += sprintf(name+len, "none");

	len += sprintf(len+name, "\n");
	return len;
}

struct request *elv_rb_former_request(struct request_queue *q,
				      struct request *rq)
{
	struct rb_node *rbprev = rb_prev(&rq->rb_node);

	if (rbprev)
		return rb_entry_rq(rbprev);

	return NULL;
}
EXPORT_SYMBOL(elv_rb_former_request);

struct request *elv_rb_latter_request(struct request_queue *q,
				      struct request *rq)
{
	struct rb_node *rbnext = rb_next(&rq->rb_node);

	if (rbnext)
		return rb_entry_rq(rbnext);

	return NULL;
}
EXPORT_SYMBOL(elv_rb_latter_request);