/*
 *  Block device elevator/IO-scheduler.
 *
 *  Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE
 *
 * 30042000 Jens Axboe <axboe@kernel.dk> :
 *
 * Split the elevator a bit so that it is possible to choose a different
 * one or even write a new "plug in". There are three pieces:
 * - elevator_fn, inserts a new request in the queue list
 * - elevator_merge_fn, decides whether a new buffer can be merged with
 *   an existing request
 * - elevator_dequeue_fn, called when a request is taken off the active list
 *
 * 20082000 Dave Jones <davej@suse.de> :
 * Removed tests for max-bomb-segments, which was breaking elvtune
 *  when run without -bN
 *
 * Jens:
 * - Rework again to work with bio instead of buffer_heads
 * - lose bi_dev comparisons, partition handling is right now
 * - completely modularize elevator setup and teardown
 *
 */
#include <linux/kernel.h>
#include <linux/fs.h>
#include <linux/blkdev.h>
#include <linux/elevator.h>
#include <linux/bio.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/compiler.h>
#include <linux/blktrace_api.h>
#include <linux/hash.h>
#include <linux/uaccess.h>
#include <linux/pm_runtime.h>
#include <linux/blk-cgroup.h>

#include <trace/events/block.h>

#include "blk.h"
#include "blk-mq-sched.h"

static DEFINE_SPINLOCK(elv_list_lock);
static LIST_HEAD(elv_list);

/*
 * Merge hash stuff.
 */
#define rq_hash_key(rq)		(blk_rq_pos(rq) + blk_rq_sectors(rq))

/*
 * Query io scheduler to see if the current process issuing bio may be
 * merged with rq.
 */
static int elv_iosched_allow_bio_merge(struct request *rq, struct bio *bio)
{
	struct request_queue *q = rq->q;
	struct elevator_queue *e = q->elevator;

	if (e->uses_mq && e->type->ops.mq.allow_merge)
		return e->type->ops.mq.allow_merge(q, rq, bio);
	else if (!e->uses_mq && e->type->ops.sq.elevator_allow_bio_merge_fn)
		return e->type->ops.sq.elevator_allow_bio_merge_fn(q, rq, bio);

	return 1;
}

/*
 * can we safely merge with this request?
 */
bool elv_bio_merge_ok(struct request *rq, struct bio *bio)
{
	if (!blk_rq_merge_ok(rq, bio))
		return false;

	if (!elv_iosched_allow_bio_merge(rq, bio))
		return false;

	return true;
}
EXPORT_SYMBOL(elv_bio_merge_ok);

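/*
 * Find a registered scheduler type by name; the caller must hold
 * elv_list_lock.
 */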
static struct elevator_type *elevator_find(const char *name)
{
	struct elevator_type *e;

	list_for_each_entry(e, &elv_list, list) {
		if (!strcmp(e->elevator_name, name))
			return e;
	}

	return NULL;
}

static void elevator_put(struct elevator_type *e)
{
	module_put(e->elevator_owner);
}

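/*
 * Look up an elevator type and take a reference on its module.  When
 * @try_loading is set, try to load a matching "<name>-iosched" module
 * before giving up.
 */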
static struct elevator_type *elevator_get(const char *name, bool try_loading)
{
	struct elevator_type *e;

	spin_lock(&elv_list_lock);

	e = elevator_find(name);
	if (!e && try_loading) {
		spin_unlock(&elv_list_lock);
		request_module("%s-iosched", name);
		spin_lock(&elv_list_lock);
		e = elevator_find(name);
	}

	if (e && !try_module_get(e->elevator_owner))
		e = NULL;

	spin_unlock(&elv_list_lock);

	return e;
}

static char chosen_elevator[ELV_NAME_MAX];

static int __init elevator_setup(char *str)
{
	/*
	 * Be backwards-compatible with previous kernels, so users
	 * won't get the wrong elevator.
	 */
	strncpy(chosen_elevator, str, sizeof(chosen_elevator) - 1);
	return 1;
}

__setup("elevator=", elevator_setup);

/* called during boot to load the elevator chosen by the elevator param */
void __init load_default_elevator_module(void)
{
	struct elevator_type *e;

	if (!chosen_elevator[0])
		return;

	spin_lock(&elv_list_lock);
	e = elevator_find(chosen_elevator);
	spin_unlock(&elv_list_lock);

	if (!e)
		request_module("%s-iosched", chosen_elevator);
}

static struct kobj_type elv_ktype;

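/*
 * Allocate and minimally initialize an elevator_queue for @q; the
 * scheduler-specific init (or blk_mq_init_sched()) is still up to the
 * caller.
 */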
struct elevator_queue *elevator_alloc(struct request_queue *q,
				  struct elevator_type *e)
{
	struct elevator_queue *eq;

	eq = kzalloc_node(sizeof(*eq), GFP_KERNEL, q->node);
	if (unlikely(!eq))
		return NULL;

	eq->type = e;
	kobject_init(&eq->kobj, &elv_ktype);
	mutex_init(&eq->sysfs_lock);
	hash_init(eq->hash);
	eq->uses_mq = e->uses_mq;

	return eq;
}
EXPORT_SYMBOL(elevator_alloc);

static void elevator_release(struct kobject *kobj)
{
	struct elevator_queue *e;

	e = container_of(kobj, struct elevator_queue, kobj);
	elevator_put(e->type);
	kfree(e);
}

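/*
 * Pick and initialize an I/O scheduler for @q: an explicit @name wins,
 * then the "elevator=" boot parameter for legacy queues, then the
 * built-in default (mq-deadline for single-queue blk-mq devices,
 * CONFIG_DEFAULT_IOSCHED otherwise, with noop as the fallback).
 */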
int elevator_init(struct request_queue *q, char *name)
{
	struct elevator_type *e = NULL;
	int err;

	/*
	 * q->sysfs_lock must be held to provide mutual exclusion between
	 * elevator_switch() and here.
	 */
	lockdep_assert_held(&q->sysfs_lock);

	if (unlikely(q->elevator))
		return 0;

	INIT_LIST_HEAD(&q->queue_head);
	q->last_merge = NULL;
	q->end_sector = 0;
	q->boundary_rq = NULL;

	if (name) {
		e = elevator_get(name, true);
		if (!e)
			return -EINVAL;
	}

	/*
	 * Use the default elevator specified by config boot param for
	 * non-mq devices, or by config option. Don't try to load modules
	 * as we could be running off async and request_module() isn't
	 * allowed from async.
	 */
	if (!e && !q->mq_ops && *chosen_elevator) {
		e = elevator_get(chosen_elevator, false);
		if (!e)
			printk(KERN_ERR "I/O scheduler %s not found\n",
							chosen_elevator);
	}

	if (!e) {
		/*
		 * For blk-mq devices, we default to using mq-deadline,
		 * if available, for single queue devices. If deadline
		 * isn't available OR we have multiple queues, default
		 * to "none".
		 */
		if (q->mq_ops) {
			if (q->nr_hw_queues == 1)
				e = elevator_get("mq-deadline", false);
			if (!e)
				return 0;
		} else
			e = elevator_get(CONFIG_DEFAULT_IOSCHED, false);

		if (!e) {
			printk(KERN_ERR
				"Default I/O scheduler not found. " \
				"Using noop.\n");
			e = elevator_get("noop", false);
		}
	}

	if (e->uses_mq)
		err = blk_mq_init_sched(q, e);
	else
		err = e->ops.sq.elevator_init_fn(q, e);
	if (err)
		elevator_put(e);
	return err;
}
EXPORT_SYMBOL(elevator_init);

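/*
 * Tear down the scheduler attached to @q and drop the elevator_queue's
 * kobject reference; elevator_release() frees it once the last ref is gone.
 */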
void elevator_exit(struct request_queue *q, struct elevator_queue *e)
{
	mutex_lock(&e->sysfs_lock);
	if (e->uses_mq && e->type->ops.mq.exit_sched)
		blk_mq_exit_sched(q, e);
	else if (!e->uses_mq && e->type->ops.sq.elevator_exit_fn)
		e->type->ops.sq.elevator_exit_fn(e);
	mutex_unlock(&e->sysfs_lock);

	kobject_put(&e->kobj);
}
EXPORT_SYMBOL(elevator_exit);

static inline void __elv_rqhash_del(struct request *rq)
{
	hash_del(&rq->hash);
	rq->rq_flags &= ~RQF_HASHED;
}

void elv_rqhash_del(struct request_queue *q, struct request *rq)
{
	if (ELV_ON_HASH(rq))
		__elv_rqhash_del(rq);
}
EXPORT_SYMBOL_GPL(elv_rqhash_del);

void elv_rqhash_add(struct request_queue *q, struct request *rq)
{
	struct elevator_queue *e = q->elevator;

	BUG_ON(ELV_ON_HASH(rq));
	hash_add(e->hash, &rq->hash, rq_hash_key(rq));
	rq->rq_flags |= RQF_HASHED;
}
EXPORT_SYMBOL_GPL(elv_rqhash_add);

void elv_rqhash_reposition(struct request_queue *q, struct request *rq)
{
	__elv_rqhash_del(rq);
	elv_rqhash_add(q, rq);
}

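/*
 * Look up a request whose end sector matches @offset in the merge hash,
 * dropping entries that are no longer mergeable along the way.
 */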
struct request *elv_rqhash_find(struct request_queue *q, sector_t offset)
{
	struct elevator_queue *e = q->elevator;
	struct hlist_node *next;
	struct request *rq;

	hash_for_each_possible_safe(e->hash, rq, next, hash, offset) {
		BUG_ON(!ELV_ON_HASH(rq));

		if (unlikely(!rq_mergeable(rq))) {
			__elv_rqhash_del(rq);
			continue;
		}

		if (rq_hash_key(rq) == offset)
			return rq;
	}

	return NULL;
}

/*
 * RB-tree support functions for inserting/lookup/removal of requests
 * in a sorted RB tree.
 */
void elv_rb_add(struct rb_root *root, struct request *rq)
{
	struct rb_node **p = &root->rb_node;
	struct rb_node *parent = NULL;
	struct request *__rq;

	while (*p) {
		parent = *p;
		__rq = rb_entry(parent, struct request, rb_node);

		if (blk_rq_pos(rq) < blk_rq_pos(__rq))
			p = &(*p)->rb_left;
		else if (blk_rq_pos(rq) >= blk_rq_pos(__rq))
			p = &(*p)->rb_right;
	}

	rb_link_node(&rq->rb_node, parent, p);
	rb_insert_color(&rq->rb_node, root);
}
EXPORT_SYMBOL(elv_rb_add);

void elv_rb_del(struct rb_root *root, struct request *rq)
{
	BUG_ON(RB_EMPTY_NODE(&rq->rb_node));
	rb_erase(&rq->rb_node, root);
	RB_CLEAR_NODE(&rq->rb_node);
}
EXPORT_SYMBOL(elv_rb_del);

struct request *elv_rb_find(struct rb_root *root, sector_t sector)
{
	struct rb_node *n = root->rb_node;
	struct request *rq;

	while (n) {
		rq = rb_entry(n, struct request, rb_node);

		if (sector < blk_rq_pos(rq))
			n = n->rb_left;
		else if (sector > blk_rq_pos(rq))
			n = n->rb_right;
		else
			return rq;
	}

	return NULL;
}
EXPORT_SYMBOL(elv_rb_find);

/*
 * Insert rq into dispatch queue of q.  Queue lock must be held on
 * entry.  rq is sorted into the dispatch queue instead.  To be used by
 * specific elevators.
 */
void elv_dispatch_sort(struct request_queue *q, struct request *rq)
{
	sector_t boundary;
	struct list_head *entry;

	if (q->last_merge == rq)
		q->last_merge = NULL;

	elv_rqhash_del(q, rq);

	q->nr_sorted--;

	boundary = q->end_sector;
	list_for_each_prev(entry, &q->queue_head) {
		struct request *pos = list_entry_rq(entry);

		if (req_op(rq) != req_op(pos))
			break;
		if (rq_data_dir(rq) != rq_data_dir(pos))
			break;
		if (pos->rq_flags & (RQF_STARTED | RQF_SOFTBARRIER))
			break;
		if (blk_rq_pos(rq) >= boundary) {
			if (blk_rq_pos(pos) < boundary)
				continue;
		} else {
			if (blk_rq_pos(pos) >= boundary)
				break;
		}
		if (blk_rq_pos(rq) >= blk_rq_pos(pos))
			break;
	}

	list_add(&rq->queuelist, entry);
}
EXPORT_SYMBOL(elv_dispatch_sort);

/*
 * Insert rq into dispatch queue of q.  Queue lock must be held on
 * entry.  rq is added to the back of the dispatch queue. To be used by
 * specific elevators.
 */
 */
void elv_dispatch_add_tail(struct request_queue *q, struct request *rq)
{
	if (q->last_merge == rq)
		q->last_merge = NULL;

	elv_rqhash_del(q, rq);

	q->nr_sorted--;

	q->end_sector = rq_end_sector(rq);
	q->boundary_rq = rq;
	list_add_tail(&rq->queuelist, &q->queue_head);
}
EXPORT_SYMBOL(elv_dispatch_add_tail);

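/*
 * Decide whether and how @bio can be merged with an existing request:
 * try the one-hit q->last_merge cache, then a back-merge via the rqhash,
 * and finally ask the active scheduler.  The chosen request is returned
 * through @req.
 */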
enum elv_merge elv_merge(struct request_queue *q, struct request **req,
		struct bio *bio)
{
	struct elevator_queue *e = q->elevator;
	struct request *__rq;

	/*
	 * Levels of merges:
	 * 	nomerges:  No merges at all attempted
	 * 	noxmerges: Only simple one-hit cache try
	 * 	merges:	   All merge tries attempted
	 */
	if (blk_queue_nomerges(q) || !bio_mergeable(bio))
		return ELEVATOR_NO_MERGE;

	/*
	 * First try one-hit cache.
	 */
	if (q->last_merge && elv_bio_merge_ok(q->last_merge, bio)) {
		enum elv_merge ret = blk_try_merge(q->last_merge, bio);

		if (ret != ELEVATOR_NO_MERGE) {
			*req = q->last_merge;
			return ret;
		}
	}

	if (blk_queue_noxmerges(q))
		return ELEVATOR_NO_MERGE;

	/*
	 * See if our hash lookup can find a potential backmerge.
	 */
	__rq = elv_rqhash_find(q, bio->bi_iter.bi_sector);
	if (__rq && elv_bio_merge_ok(__rq, bio)) {
		*req = __rq;
		return ELEVATOR_BACK_MERGE;
	}

	if (e->uses_mq && e->type->ops.mq.request_merge)
		return e->type->ops.mq.request_merge(q, req, bio);
	else if (!e->uses_mq && e->type->ops.sq.elevator_merge_fn)
		return e->type->ops.sq.elevator_merge_fn(q, req, bio);

	return ELEVATOR_NO_MERGE;
}

/*
 * Attempt to do an insertion back merge. Only check for the case where
 * we can append 'rq' to an existing request, so we can throw 'rq' away
 * afterwards.
 *
 * Returns true if we merged, false otherwise
 */
bool elv_attempt_insert_merge(struct request_queue *q, struct request *rq)
{
	struct request *__rq;
	bool ret;

	if (blk_queue_nomerges(q))
		return false;

	/*
	 * First try one-hit cache.
	 */
	if (q->last_merge && blk_attempt_req_merge(q, q->last_merge, rq))
		return true;

	if (blk_queue_noxmerges(q))
		return false;

	ret = false;
	/*
	 * See if our hash lookup can find a potential backmerge.
	 */
	while (1) {
		__rq = elv_rqhash_find(q, blk_rq_pos(rq));
		if (!__rq || !blk_attempt_req_merge(q, __rq, rq))
			break;

		/* The merged request could be merged with others, try again */
		ret = true;
		rq = __rq;
	}

	return ret;
}

void elv_merged_request(struct request_queue *q, struct request *rq,
		enum elv_merge type)
{
	struct elevator_queue *e = q->elevator;

	if (e->uses_mq && e->type->ops.mq.request_merged)
		e->type->ops.mq.request_merged(q, rq, type);
	else if (!e->uses_mq && e->type->ops.sq.elevator_merged_fn)
		e->type->ops.sq.elevator_merged_fn(q, rq, type);

	if (type == ELEVATOR_BACK_MERGE)
		elv_rqhash_reposition(q, rq);

	q->last_merge = rq;
}

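/*
 * Called when @next has been merged into @rq: give the scheduler a chance
 * to clean up its per-request state and drop @next from the merge hash.
 */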
void elv_merge_requests(struct request_queue *q, struct request *rq,
			     struct request *next)
{
	struct elevator_queue *e = q->elevator;
	bool next_sorted = false;

	if (e->uses_mq && e->type->ops.mq.requests_merged)
		e->type->ops.mq.requests_merged(q, rq, next);
	else if (e->type->ops.sq.elevator_merge_req_fn) {
		next_sorted = (__force bool)(next->rq_flags & RQF_SORTED);
		if (next_sorted)
			e->type->ops.sq.elevator_merge_req_fn(q, rq, next);
	}

	elv_rqhash_reposition(q, rq);

	if (next_sorted) {
		elv_rqhash_del(q, next);
		q->nr_sorted--;
	}

	q->last_merge = rq;
}

void elv_bio_merged(struct request_queue *q, struct request *rq,
			struct bio *bio)
{
	struct elevator_queue *e = q->elevator;

	if (WARN_ON_ONCE(e->uses_mq))
		return;

	if (e->type->ops.sq.elevator_bio_merged_fn)
		e->type->ops.sq.elevator_bio_merged_fn(q, rq, bio);
}

#ifdef CONFIG_PM
static void blk_pm_requeue_request(struct request *rq)
{
	if (rq->q->dev && !(rq->rq_flags & RQF_PM))
		rq->q->nr_pending--;
}

static void blk_pm_add_request(struct request_queue *q, struct request *rq)
{
	if (q->dev && !(rq->rq_flags & RQF_PM) && q->nr_pending++ == 0 &&
	    (q->rpm_status == RPM_SUSPENDED || q->rpm_status == RPM_SUSPENDING))
		pm_request_resume(q->dev);
}
#else
static inline void blk_pm_requeue_request(struct request *rq) {}
static inline void blk_pm_add_request(struct request_queue *q,
				      struct request *rq)
{
}
#endif

void elv_requeue_request(struct request_queue *q, struct request *rq)
{
	/*
	 * it already went through dequeue, we need to decrement the
	 * in_flight count again
	 */
	if (blk_account_rq(rq)) {
		q->in_flight[rq_is_sync(rq)]--;
		if (rq->rq_flags & RQF_SORTED)
			elv_deactivate_rq(q, rq);
	}

	rq->rq_flags &= ~RQF_STARTED;

	blk_pm_requeue_request(rq);

	__elv_add_request(q, rq, ELEVATOR_INSERT_REQUEUE);
}

void elv_drain_elevator(struct request_queue *q)
{
	struct elevator_queue *e = q->elevator;
	static int printed;

	if (WARN_ON_ONCE(e->uses_mq))
		return;

	lockdep_assert_held(q->queue_lock);

	while (e->type->ops.sq.elevator_dispatch_fn(q, 1))
		;
	if (q->nr_sorted && printed++ < 10) {
		printk(KERN_ERR "%s: forced dispatching is broken "
		       "(nr_sorted=%u), please report this\n",
		       q->elevator->type->elevator_name, q->nr_sorted);
	}
}

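/*
 * Core insertion path for the legacy request code: @where selects the
 * insertion policy (front, back, sort, sort-merge, flush or requeue).
 * Called with the queue lock held.
 */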
void __elv_add_request(struct request_queue *q, struct request *rq, int where)
{
	trace_block_rq_insert(q, rq);

	blk_pm_add_request(q, rq);

	rq->q = q;

	if (rq->rq_flags & RQF_SOFTBARRIER) {
		/* barriers are scheduling boundary, update end_sector */
		if (!blk_rq_is_passthrough(rq)) {
			q->end_sector = rq_end_sector(rq);
			q->boundary_rq = rq;
		}
	} else if (!(rq->rq_flags & RQF_ELVPRIV) &&
		    (where == ELEVATOR_INSERT_SORT ||
		     where == ELEVATOR_INSERT_SORT_MERGE))
		where = ELEVATOR_INSERT_BACK;

	switch (where) {
	case ELEVATOR_INSERT_REQUEUE:
	case ELEVATOR_INSERT_FRONT:
		rq->rq_flags |= RQF_SOFTBARRIER;
		list_add(&rq->queuelist, &q->queue_head);
		break;

	case ELEVATOR_INSERT_BACK:
		rq->rq_flags |= RQF_SOFTBARRIER;
		elv_drain_elevator(q);
		list_add_tail(&rq->queuelist, &q->queue_head);
		/*
		 * We kick the queue here for the following reasons.
		 * - The elevator might have returned NULL previously
		 *   to delay requests and returned them now.  As the
		 *   queue wasn't empty before this request, ll_rw_blk
		 *   won't run the queue on return, resulting in hang.
		 * - Usually, back inserted requests won't be merged
		 *   with anything.  There's no point in delaying queue
		 *   processing.
		 */
		__blk_run_queue(q);
		break;

	case ELEVATOR_INSERT_SORT_MERGE:
		/*
		 * If we succeed in merging this request with one in the
		 * queue already, we are done - rq has now been freed,
		 * so no need to do anything further.
		 */
		if (elv_attempt_insert_merge(q, rq))
			break;
	case ELEVATOR_INSERT_SORT:
		BUG_ON(blk_rq_is_passthrough(rq));
		rq->rq_flags |= RQF_SORTED;
		q->nr_sorted++;
		if (rq_mergeable(rq)) {
			elv_rqhash_add(q, rq);
			if (!q->last_merge)
				q->last_merge = rq;
		}

		/*
		 * Some ioscheds (cfq) run q->request_fn directly, so
		 * rq cannot be accessed after calling
		 * elevator_add_req_fn.
		 */
		q->elevator->type->ops.sq.elevator_add_req_fn(q, rq);
		break;

	case ELEVATOR_INSERT_FLUSH:
		rq->rq_flags |= RQF_SOFTBARRIER;
		blk_insert_flush(rq);
		break;
	default:
		printk(KERN_ERR "%s: bad insertion point %d\n",
		       __func__, where);
		BUG();
	}
}
EXPORT_SYMBOL(__elv_add_request);

void elv_add_request(struct request_queue *q, struct request *rq, int where)
{
	unsigned long flags;

	spin_lock_irqsave(q->queue_lock, flags);
	__elv_add_request(q, rq, where);
	spin_unlock_irqrestore(q->queue_lock, flags);
}
EXPORT_SYMBOL(elv_add_request);

struct request *elv_latter_request(struct request_queue *q, struct request *rq)
{
	struct elevator_queue *e = q->elevator;

	if (e->uses_mq && e->type->ops.mq.next_request)
		return e->type->ops.mq.next_request(q, rq);
	else if (!e->uses_mq && e->type->ops.sq.elevator_latter_req_fn)
		return e->type->ops.sq.elevator_latter_req_fn(q, rq);

	return NULL;
}

struct request *elv_former_request(struct request_queue *q, struct request *rq)
{
	struct elevator_queue *e = q->elevator;

	if (e->uses_mq && e->type->ops.mq.former_request)
		return e->type->ops.mq.former_request(q, rq);
	if (!e->uses_mq && e->type->ops.sq.elevator_former_req_fn)
		return e->type->ops.sq.elevator_former_req_fn(q, rq);
	return NULL;
}

int elv_set_request(struct request_queue *q, struct request *rq,
		    struct bio *bio, gfp_t gfp_mask)
{
	struct elevator_queue *e = q->elevator;

	if (WARN_ON_ONCE(e->uses_mq))
		return 0;

	if (e->type->ops.sq.elevator_set_req_fn)
		return e->type->ops.sq.elevator_set_req_fn(q, rq, bio, gfp_mask);
	return 0;
}

void elv_put_request(struct request_queue *q, struct request *rq)
{
	struct elevator_queue *e = q->elevator;

	if (WARN_ON_ONCE(e->uses_mq))
		return;

	if (e->type->ops.sq.elevator_put_req_fn)
		e->type->ops.sq.elevator_put_req_fn(rq);
}

int elv_may_queue(struct request_queue *q, unsigned int op)
{
	struct elevator_queue *e = q->elevator;

	if (WARN_ON_ONCE(e->uses_mq))
		return 0;

	if (e->type->ops.sq.elevator_may_queue_fn)
		return e->type->ops.sq.elevator_may_queue_fn(q, op);

	return ELV_MQUEUE_MAY;
}

void elv_completed_request(struct request_queue *q, struct request *rq)
{
	struct elevator_queue *e = q->elevator;

	if (WARN_ON_ONCE(e->uses_mq))
		return;

	/*
	 * request is released from the driver, io must be done
	 */
	if (blk_account_rq(rq)) {
		q->in_flight[rq_is_sync(rq)]--;
		if ((rq->rq_flags & RQF_SORTED) &&
		    e->type->ops.sq.elevator_completed_req_fn)
			e->type->ops.sq.elevator_completed_req_fn(q, rq);
	}
}

#define to_elv(atr) container_of((atr), struct elv_fs_entry, attr)

static ssize_t
elv_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
{
	struct elv_fs_entry *entry = to_elv(attr);
	struct elevator_queue *e;
	ssize_t error;

	if (!entry->show)
		return -EIO;

	e = container_of(kobj, struct elevator_queue, kobj);
	mutex_lock(&e->sysfs_lock);
	error = e->type ? entry->show(e, page) : -ENOENT;
	mutex_unlock(&e->sysfs_lock);
	return error;
}

static ssize_t
elv_attr_store(struct kobject *kobj, struct attribute *attr,
	       const char *page, size_t length)
{
	struct elv_fs_entry *entry = to_elv(attr);
	struct elevator_queue *e;
	ssize_t error;

	if (!entry->store)
		return -EIO;

	e = container_of(kobj, struct elevator_queue, kobj);
	mutex_lock(&e->sysfs_lock);
	error = e->type ? entry->store(e, page, length) : -ENOENT;
	mutex_unlock(&e->sysfs_lock);
	return error;
}

static const struct sysfs_ops elv_sysfs_ops = {
	.show	= elv_attr_show,
	.store	= elv_attr_store,
};

static struct kobj_type elv_ktype = {
	.sysfs_ops	= &elv_sysfs_ops,
	.release	= elevator_release,
};

int elv_register_queue(struct request_queue *q)
{
	struct elevator_queue *e = q->elevator;
	int error;

	error = kobject_add(&e->kobj, &q->kobj, "%s", "iosched");
	if (!error) {
		struct elv_fs_entry *attr = e->type->elevator_attrs;
		if (attr) {
			while (attr->attr.name) {
				if (sysfs_create_file(&e->kobj, &attr->attr))
					break;
				attr++;
			}
		}
		kobject_uevent(&e->kobj, KOBJ_ADD);
		e->registered = 1;
		if (!e->uses_mq && e->type->ops.sq.elevator_registered_fn)
			e->type->ops.sq.elevator_registered_fn(q);
	}
	return error;
}
EXPORT_SYMBOL(elv_register_queue);

void elv_unregister_queue(struct request_queue *q)
{
	if (q) {
		struct elevator_queue *e = q->elevator;

		kobject_uevent(&e->kobj, KOBJ_REMOVE);
		kobject_del(&e->kobj);
		e->registered = 0;
	}
}
EXPORT_SYMBOL(elv_unregister_queue);

int elv_register(struct elevator_type *e)
{
	char *def = "";

	/* create icq_cache if requested */
	if (e->icq_size) {
		if (WARN_ON(e->icq_size < sizeof(struct io_cq)) ||
		    WARN_ON(e->icq_align < __alignof__(struct io_cq)))
			return -EINVAL;

		snprintf(e->icq_cache_name, sizeof(e->icq_cache_name),
			 "%s_io_cq", e->elevator_name);
		e->icq_cache = kmem_cache_create(e->icq_cache_name, e->icq_size,
						 e->icq_align, 0, NULL);
		if (!e->icq_cache)
			return -ENOMEM;
	}

	/* register, don't allow duplicate names */
	spin_lock(&elv_list_lock);
	if (elevator_find(e->elevator_name)) {
		spin_unlock(&elv_list_lock);
		if (e->icq_cache)
			kmem_cache_destroy(e->icq_cache);
		return -EBUSY;
	}
	list_add_tail(&e->list, &elv_list);
	spin_unlock(&elv_list_lock);

	/* print pretty message */
	if (!strcmp(e->elevator_name, chosen_elevator) ||
			(!*chosen_elevator &&
			 !strcmp(e->elevator_name, CONFIG_DEFAULT_IOSCHED)))
				def = " (default)";

	printk(KERN_INFO "io scheduler %s registered%s\n", e->elevator_name,
								def);
	return 0;
}
EXPORT_SYMBOL_GPL(elv_register);

void elv_unregister(struct elevator_type *e)
{
	/* unregister */
	spin_lock(&elv_list_lock);
	list_del_init(&e->list);
	spin_unlock(&elv_list_lock);

	/*
	 * Destroy icq_cache if it exists.  icq's are RCU managed.  Make
	 * sure all RCU operations are complete before proceeding.
	 */
	if (e->icq_cache) {
		rcu_barrier();
		kmem_cache_destroy(e->icq_cache);
		e->icq_cache = NULL;
	}
}
EXPORT_SYMBOL_GPL(elv_unregister);

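/*
 * Switch a blk-mq queue to @new_e (or to no scheduler when @new_e is NULL)
 * with the queue frozen and quiesced.
 */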
static int elevator_switch_mq(struct request_queue *q,
			      struct elevator_type *new_e)
{
	int ret;

	blk_mq_freeze_queue(q);
	blk_mq_quiesce_queue(q);

	if (q->elevator) {
		if (q->elevator->registered)
			elv_unregister_queue(q);
		ioc_clear_queue(q);
		elevator_exit(q, q->elevator);
	}

	ret = blk_mq_init_sched(q, new_e);
	if (ret)
		goto out;

	if (new_e) {
		ret = elv_register_queue(q);
		if (ret) {
			elevator_exit(q, q->elevator);
			goto out;
		}
	}

	if (new_e)
		blk_add_trace_msg(q, "elv switch: %s", new_e->elevator_name);
	else
		blk_add_trace_msg(q, "elv switch: none");

out:
	blk_mq_unfreeze_queue(q);
	blk_mq_start_stopped_hw_queues(q, true);
	return ret;

}

/*
 * switch to new_e io scheduler. be careful not to introduce deadlocks -
 * we don't free the old io scheduler, before we have allocated what we
 * need for the new one. this way we have a chance of going back to the old
 * one, if the new one fails init for some reason.
 */
static int elevator_switch(struct request_queue *q, struct elevator_type *new_e)
{
	struct elevator_queue *old = q->elevator;
	bool old_registered = false;
	int err;

	if (q->mq_ops)
		return elevator_switch_mq(q, new_e);

	/*
	 * Turn on BYPASS and drain all requests w/ elevator private data.
	 * Block layer doesn't call into a quiesced elevator - all requests
	 * are directly put on the dispatch list without elevator data
	 * using INSERT_BACK.  All requests have SOFTBARRIER set and no
	 * merge happens either.
	 */
	if (old) {
		old_registered = old->registered;

		blk_queue_bypass_start(q);

		/* unregister and clear all auxiliary data of the old elevator */
		if (old_registered)
			elv_unregister_queue(q);

		ioc_clear_queue(q);
	}

	/* allocate, init and register new elevator */
	err = new_e->ops.sq.elevator_init_fn(q, new_e);
	if (err)
		goto fail_init;

	err = elv_register_queue(q);
	if (err)
		goto fail_register;

	/* done, kill the old one and finish */
	if (old) {
		elevator_exit(q, old);
		blk_queue_bypass_end(q);
	}

	blk_add_trace_msg(q, "elv switch: %s", new_e->elevator_name);

	return 0;

fail_register:
	elevator_exit(q, q->elevator);
fail_init:
	/* switch failed, restore and re-register old elevator */
	if (old) {
		q->elevator = old;
		elv_register_queue(q);
		blk_queue_bypass_end(q);
	}

	return err;
}

/*
 * Switch this queue to the given IO scheduler.
 */
static int __elevator_change(struct request_queue *q, const char *name)
{
	char elevator_name[ELV_NAME_MAX];
	struct elevator_type *e;

	/*
	 * Special case for mq, turn off scheduling
	 */
	if (q->mq_ops && !strncmp(name, "none", 4))
		return elevator_switch(q, NULL);

	strlcpy(elevator_name, name, sizeof(elevator_name));
	e = elevator_get(strstrip(elevator_name), true);
	if (!e) {
		printk(KERN_ERR "elevator: type %s not found\n", elevator_name);
		return -EINVAL;
	}

	if (q->elevator &&
	    !strcmp(elevator_name, q->elevator->type->elevator_name)) {
		elevator_put(e);
		return 0;
	}

	if (!e->uses_mq && q->mq_ops) {
		elevator_put(e);
		return -EINVAL;
	}
	if (e->uses_mq && !q->mq_ops) {
		elevator_put(e);
		return -EINVAL;
	}

	return elevator_switch(q, e);
}

int elevator_change(struct request_queue *q, const char *name)
{
	int ret;

	/* Protect q->elevator from elevator_init() */
	mutex_lock(&q->sysfs_lock);
	ret = __elevator_change(q, name);
	mutex_unlock(&q->sysfs_lock);

	return ret;
}
EXPORT_SYMBOL(elevator_change);

ssize_t elv_iosched_store(struct request_queue *q, const char *name,
			  size_t count)
{
	int ret;

	if (!(q->mq_ops || q->request_fn))
		return count;

	ret = __elevator_change(q, name);
	if (!ret)
		return count;

	printk(KERN_ERR "elevator: switch to %s failed\n", name);
	return ret;
}

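/*
 * Show the schedulers that can be selected for this queue, with the
 * currently active one in square brackets.
 */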
ssize_t elv_iosched_show(struct request_queue *q, char *name)
{
	struct elevator_queue *e = q->elevator;
	struct elevator_type *elv = NULL;
	struct elevator_type *__e;
	int len = 0;

	if (!blk_queue_stackable(q))
		return sprintf(name, "none\n");

	if (!q->elevator)
		len += sprintf(name+len, "[none] ");
	else
		elv = e->type;

	spin_lock(&elv_list_lock);
	list_for_each_entry(__e, &elv_list, list) {
		if (elv && !strcmp(elv->elevator_name, __e->elevator_name)) {
			len += sprintf(name+len, "[%s] ", elv->elevator_name);
			continue;
		}
		if (__e->uses_mq && q->mq_ops)
			len += sprintf(name+len, "%s ", __e->elevator_name);
		else if (!__e->uses_mq && !q->mq_ops)
			len += sprintf(name+len, "%s ", __e->elevator_name);
	}
	spin_unlock(&elv_list_lock);

	if (q->mq_ops && q->elevator)
		len += sprintf(name+len, "none");

	len += sprintf(len+name, "\n");
	return len;
}

struct request *elv_rb_former_request(struct request_queue *q,
				      struct request *rq)
{
	struct rb_node *rbprev = rb_prev(&rq->rb_node);

	if (rbprev)
		return rb_entry_rq(rbprev);

	return NULL;
}
EXPORT_SYMBOL(elv_rb_former_request);

struct request *elv_rb_latter_request(struct request_queue *q,
				      struct request *rq)
{
	struct rb_node *rbnext = rb_next(&rq->rb_node);

	if (rbnext)
		return rb_entry_rq(rbnext);

	return NULL;
}
EXPORT_SYMBOL(elv_rb_latter_request);