/*
 *  Block device elevator/IO-scheduler.
 *
 *  Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE
 *
 * 30042000 Jens Axboe <axboe@kernel.dk> :
 *
 * Split the elevator a bit so that it is possible to choose a different
 * one or even write a new "plug in". There are three pieces:
 * - elevator_fn, inserts a new request in the queue list
 * - elevator_merge_fn, decides whether a new buffer can be merged with
 *   an existing request
 * - elevator_dequeue_fn, called when a request is taken off the active list
 *
 * 20082000 Dave Jones <davej@suse.de> :
 * Removed tests for max-bomb-segments, which was breaking elvtune
 *  when run without -bN
 *
 * Jens:
 * - Rework again to work with bio instead of buffer_heads
 * - lose bi_dev comparisons, partition handling is correct now
 * - completely modularize elevator setup and teardown
 *
 */
#include <linux/kernel.h>
#include <linux/fs.h>
#include <linux/blkdev.h>
#include <linux/elevator.h>
#include <linux/bio.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/compiler.h>
#include <linux/blktrace_api.h>
#include <linux/hash.h>
#include <linux/uaccess.h>
#include <linux/pm_runtime.h>
#include <linux/blk-cgroup.h>

#include <trace/events/block.h>

#include "blk.h"
#include "blk-mq-sched.h"
#include "blk-wbt.h"

static DEFINE_SPINLOCK(elv_list_lock);
static LIST_HEAD(elv_list);

/*
 * Merge hash stuff.
 */
#define rq_hash_key(rq)		(blk_rq_pos(rq) + blk_rq_sectors(rq))
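
/*
 * The hash is keyed by the sector that immediately follows a request, so
 * elv_rqhash_find(q, bio->bi_iter.bi_sector) returns a request that a bio
 * starting at that sector could be back-merged into.
 */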

/*
 * Query io scheduler to see if the current process issuing bio may be
 * merged with rq.
 */
static int elv_iosched_allow_bio_merge(struct request *rq, struct bio *bio)
{
	struct request_queue *q = rq->q;
	struct elevator_queue *e = q->elevator;

	if (e->uses_mq && e->type->ops.mq.allow_merge)
		return e->type->ops.mq.allow_merge(q, rq, bio);
	else if (!e->uses_mq && e->type->ops.sq.elevator_allow_bio_merge_fn)
		return e->type->ops.sq.elevator_allow_bio_merge_fn(q, rq, bio);

	return 1;
}

/*
 * can we safely merge with this request?
 */
bool elv_bio_merge_ok(struct request *rq, struct bio *bio)
{
	if (!blk_rq_merge_ok(rq, bio))
		return false;

	if (!elv_iosched_allow_bio_merge(rq, bio))
		return false;

	return true;
}
EXPORT_SYMBOL(elv_bio_merge_ok);

static struct elevator_type *elevator_find(const char *name)
{
	struct elevator_type *e;

	list_for_each_entry(e, &elv_list, list) {
		if (!strcmp(e->elevator_name, name))
			return e;
	}

	return NULL;
}

static void elevator_put(struct elevator_type *e)
{
	module_put(e->elevator_owner);
}
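
/*
 * Look up an elevator type by name and take a reference on its module.
 * With try_loading set, loading an "<name>-iosched" module is attempted
 * if the scheduler is not registered yet.
 */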

static struct elevator_type *elevator_get(const char *name, bool try_loading)
{
	struct elevator_type *e;

	spin_lock(&elv_list_lock);

	e = elevator_find(name);
	if (!e && try_loading) {
		spin_unlock(&elv_list_lock);
		request_module("%s-iosched", name);
		spin_lock(&elv_list_lock);
		e = elevator_find(name);
	}

	if (e && !try_module_get(e->elevator_owner))
		e = NULL;

	spin_unlock(&elv_list_lock);

	return e;
}

static char chosen_elevator[ELV_NAME_MAX];

static int __init elevator_setup(char *str)
{
	/*
	 * Be backwards-compatible with previous kernels, so users
	 * won't get the wrong elevator.
	 */
	strncpy(chosen_elevator, str, sizeof(chosen_elevator) - 1);
	return 1;
}

__setup("elevator=", elevator_setup);
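
/*
 * Example: booting with "elevator=deadline" on the kernel command line
 * selects the deadline scheduler as the default for legacy (non-mq) queues.
 */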

/* called during boot to load the elevator chosen by the elevator param */
void __init load_default_elevator_module(void)
{
	struct elevator_type *e;

	if (!chosen_elevator[0])
		return;

	spin_lock(&elv_list_lock);
	e = elevator_find(chosen_elevator);
	spin_unlock(&elv_list_lock);

	if (!e)
		request_module("%s-iosched", chosen_elevator);
}

static struct kobj_type elv_ktype;

struct elevator_queue *elevator_alloc(struct request_queue *q,
				  struct elevator_type *e)
{
	struct elevator_queue *eq;

	eq = kzalloc_node(sizeof(*eq), GFP_KERNEL, q->node);
	if (unlikely(!eq))
		return NULL;

	eq->type = e;
	kobject_init(&eq->kobj, &elv_ktype);
	mutex_init(&eq->sysfs_lock);
	hash_init(eq->hash);
	eq->uses_mq = e->uses_mq;

	return eq;
}
EXPORT_SYMBOL(elevator_alloc);

static void elevator_release(struct kobject *kobj)
{
	struct elevator_queue *e;

	e = container_of(kobj, struct elevator_queue, kobj);
	elevator_put(e->type);
	kfree(e);
}

int elevator_init(struct request_queue *q, char *name)
{
	struct elevator_type *e = NULL;
	int err;

	/*
	 * q->sysfs_lock must be held to provide mutual exclusion between
	 * elevator_switch() and here.
	 */
	lockdep_assert_held(&q->sysfs_lock);

	if (unlikely(q->elevator))
		return 0;

	INIT_LIST_HEAD(&q->queue_head);
	q->last_merge = NULL;
	q->end_sector = 0;
	q->boundary_rq = NULL;

	if (name) {
		e = elevator_get(name, true);
		if (!e)
			return -EINVAL;
	}

	/*
	 * Use the default elevator specified by config boot param for
	 * non-mq devices, or by config option. Don't try to load modules
	 * as we could be running off async and request_module() isn't
	 * allowed from async.
	 */
	if (!e && !q->mq_ops && *chosen_elevator) {
		e = elevator_get(chosen_elevator, false);
		if (!e)
			printk(KERN_ERR "I/O scheduler %s not found\n",
							chosen_elevator);
	}

	if (!e) {
		/*
		 * For blk-mq devices, we default to using mq-deadline,
		 * if available, for single queue devices. If deadline
		 * isn't available OR we have multiple queues, default
		 * to "none".
		 */
		if (q->mq_ops) {
			if (q->nr_hw_queues == 1)
				e = elevator_get("mq-deadline", false);
			if (!e)
				return 0;
		} else
			e = elevator_get(CONFIG_DEFAULT_IOSCHED, false);

		if (!e) {
			printk(KERN_ERR
				"Default I/O scheduler not found. " \
				"Using noop.\n");
			e = elevator_get("noop", false);
		}
	}

	if (e->uses_mq)
		err = blk_mq_init_sched(q, e);
	else
		err = e->ops.sq.elevator_init_fn(q, e);
	if (err)
		elevator_put(e);
	return err;
}
EXPORT_SYMBOL(elevator_init);
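
/*
 * Tear down a scheduler instance. The elevator_queue itself is freed from
 * elevator_release() once the final kobject reference is dropped.
 */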

void elevator_exit(struct request_queue *q, struct elevator_queue *e)
{
	mutex_lock(&e->sysfs_lock);
	if (e->uses_mq && e->type->ops.mq.exit_sched)
		blk_mq_exit_sched(q, e);
	else if (!e->uses_mq && e->type->ops.sq.elevator_exit_fn)
		e->type->ops.sq.elevator_exit_fn(e);
	mutex_unlock(&e->sysfs_lock);

	kobject_put(&e->kobj);
}
EXPORT_SYMBOL(elevator_exit);

static inline void __elv_rqhash_del(struct request *rq)
{
	hash_del(&rq->hash);
	rq->rq_flags &= ~RQF_HASHED;
}

void elv_rqhash_del(struct request_queue *q, struct request *rq)
{
	if (ELV_ON_HASH(rq))
		__elv_rqhash_del(rq);
}
EXPORT_SYMBOL_GPL(elv_rqhash_del);

void elv_rqhash_add(struct request_queue *q, struct request *rq)
{
	struct elevator_queue *e = q->elevator;

	BUG_ON(ELV_ON_HASH(rq));
	hash_add(e->hash, &rq->hash, rq_hash_key(rq));
	rq->rq_flags |= RQF_HASHED;
}
EXPORT_SYMBOL_GPL(elv_rqhash_add);

void elv_rqhash_reposition(struct request_queue *q, struct request *rq)
{
	__elv_rqhash_del(rq);
	elv_rqhash_add(q, rq);
}

struct request *elv_rqhash_find(struct request_queue *q, sector_t offset)
{
	struct elevator_queue *e = q->elevator;
	struct hlist_node *next;
	struct request *rq;

	hash_for_each_possible_safe(e->hash, rq, next, hash, offset) {
		BUG_ON(!ELV_ON_HASH(rq));

		if (unlikely(!rq_mergeable(rq))) {
			__elv_rqhash_del(rq);
			continue;
		}

		if (rq_hash_key(rq) == offset)
			return rq;
	}

	return NULL;
}

/*
 * RB-tree support functions for inserting/lookup/removal of requests
 * in a sorted RB tree.
 */
void elv_rb_add(struct rb_root *root, struct request *rq)
{
	struct rb_node **p = &root->rb_node;
	struct rb_node *parent = NULL;
	struct request *__rq;

	while (*p) {
		parent = *p;
		__rq = rb_entry(parent, struct request, rb_node);

		if (blk_rq_pos(rq) < blk_rq_pos(__rq))
			p = &(*p)->rb_left;
		else if (blk_rq_pos(rq) >= blk_rq_pos(__rq))
			p = &(*p)->rb_right;
	}

	rb_link_node(&rq->rb_node, parent, p);
	rb_insert_color(&rq->rb_node, root);
}
EXPORT_SYMBOL(elv_rb_add);

void elv_rb_del(struct rb_root *root, struct request *rq)
{
	BUG_ON(RB_EMPTY_NODE(&rq->rb_node));
	rb_erase(&rq->rb_node, root);
	RB_CLEAR_NODE(&rq->rb_node);
}
EXPORT_SYMBOL(elv_rb_del);

struct request *elv_rb_find(struct rb_root *root, sector_t sector)
{
	struct rb_node *n = root->rb_node;
	struct request *rq;

	while (n) {
		rq = rb_entry(n, struct request, rb_node);

		if (sector < blk_rq_pos(rq))
			n = n->rb_left;
		else if (sector > blk_rq_pos(rq))
			n = n->rb_right;
		else
			return rq;
	}

	return NULL;
}
EXPORT_SYMBOL(elv_rb_find);
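
/*
 * A scheduler that keeps its requests in a sector-sorted tree (deadline,
 * for example) calls elv_rb_add()/elv_rb_del() from its add/remove hooks
 * and uses elv_rb_find() to look up a front-merge candidate for a bio.
 */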

/*
 * Insert rq into dispatch queue of q.  Queue lock must be held on
 * entry.  rq is sorted into the dispatch queue. To be used by
 * specific elevators.
 */
void elv_dispatch_sort(struct request_queue *q, struct request *rq)
{
	sector_t boundary;
	struct list_head *entry;

	if (q->last_merge == rq)
		q->last_merge = NULL;

	elv_rqhash_del(q, rq);

	q->nr_sorted--;

	boundary = q->end_sector;
	list_for_each_prev(entry, &q->queue_head) {
		struct request *pos = list_entry_rq(entry);

		if (req_op(rq) != req_op(pos))
			break;
		if (rq_data_dir(rq) != rq_data_dir(pos))
			break;
		if (pos->rq_flags & (RQF_STARTED | RQF_SOFTBARRIER))
			break;
		if (blk_rq_pos(rq) >= boundary) {
			if (blk_rq_pos(pos) < boundary)
				continue;
		} else {
			if (blk_rq_pos(pos) >= boundary)
				break;
		}
		if (blk_rq_pos(rq) >= blk_rq_pos(pos))
			break;
	}

	list_add(&rq->queuelist, entry);
}
EXPORT_SYMBOL(elv_dispatch_sort);

/*
 * Insert rq into dispatch queue of q.  Queue lock must be held on
 * entry.  rq is added to the back of the dispatch queue. To be used by
 * specific elevators.
 */
 */
void elv_dispatch_add_tail(struct request_queue *q, struct request *rq)
{
	if (q->last_merge == rq)
		q->last_merge = NULL;

	elv_rqhash_del(q, rq);

	q->nr_sorted--;

	q->end_sector = rq_end_sector(rq);
	q->boundary_rq = rq;
	list_add_tail(&rq->queuelist, &q->queue_head);
}
EXPORT_SYMBOL(elv_dispatch_add_tail);
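
/*
 * Find a request that @bio may be merged into. On success *req is set to
 * the candidate request and the merge type is returned; otherwise the
 * result is ELEVATOR_NO_MERGE.
 */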

enum elv_merge elv_merge(struct request_queue *q, struct request **req,
		struct bio *bio)
{
	struct elevator_queue *e = q->elevator;
	struct request *__rq;

	/*
	 * Levels of merges:
	 * 	nomerges:  No merges at all attempted
	 * 	noxmerges: Only simple one-hit cache try
	 * 	merges:	   All merge tries attempted
	 */
	if (blk_queue_nomerges(q) || !bio_mergeable(bio))
		return ELEVATOR_NO_MERGE;

	/*
	 * First try one-hit cache.
	 */
	if (q->last_merge && elv_bio_merge_ok(q->last_merge, bio)) {
		enum elv_merge ret = blk_try_merge(q->last_merge, bio);

		if (ret != ELEVATOR_NO_MERGE) {
			*req = q->last_merge;
			return ret;
		}
	}

	if (blk_queue_noxmerges(q))
		return ELEVATOR_NO_MERGE;

	/*
	 * See if our hash lookup can find a potential backmerge.
	 */
	__rq = elv_rqhash_find(q, bio->bi_iter.bi_sector);
	if (__rq && elv_bio_merge_ok(__rq, bio)) {
		*req = __rq;
		return ELEVATOR_BACK_MERGE;
	}

	if (e->uses_mq && e->type->ops.mq.request_merge)
		return e->type->ops.mq.request_merge(q, req, bio);
	else if (!e->uses_mq && e->type->ops.sq.elevator_merge_fn)
		return e->type->ops.sq.elevator_merge_fn(q, req, bio);

	return ELEVATOR_NO_MERGE;
}

/*
 * Attempt to do an insertion back merge. Only check for the case where
 * we can append 'rq' to an existing request, so we can throw 'rq' away
 * afterwards.
 *
 * Returns true if we merged, false otherwise
 */
bool elv_attempt_insert_merge(struct request_queue *q, struct request *rq)
{
	struct request *__rq;
	bool ret;

	if (blk_queue_nomerges(q))
		return false;

	/*
	 * First try one-hit cache.
	 */
	if (q->last_merge && blk_attempt_req_merge(q, q->last_merge, rq))
		return true;

	if (blk_queue_noxmerges(q))
		return false;

	ret = false;
	/*
	 * See if our hash lookup can find a potential backmerge.
	 */
	while (1) {
		__rq = elv_rqhash_find(q, blk_rq_pos(rq));
		if (!__rq || !blk_attempt_req_merge(q, __rq, rq))
			break;

		/* The merged request could be merged with others, try again */
		ret = true;
		rq = __rq;
	}

	return ret;
}

void elv_merged_request(struct request_queue *q, struct request *rq,
		enum elv_merge type)
{
	struct elevator_queue *e = q->elevator;

	if (e->uses_mq && e->type->ops.mq.request_merged)
		e->type->ops.mq.request_merged(q, rq, type);
	else if (!e->uses_mq && e->type->ops.sq.elevator_merged_fn)
		e->type->ops.sq.elevator_merged_fn(q, rq, type);

	if (type == ELEVATOR_BACK_MERGE)
		elv_rqhash_reposition(q, rq);

	q->last_merge = rq;
}

void elv_merge_requests(struct request_queue *q, struct request *rq,
			     struct request *next)
{
	struct elevator_queue *e = q->elevator;
	bool next_sorted = false;

	if (e->uses_mq && e->type->ops.mq.requests_merged)
		e->type->ops.mq.requests_merged(q, rq, next);
	else if (e->type->ops.sq.elevator_merge_req_fn) {
		next_sorted = (__force bool)(next->rq_flags & RQF_SORTED);
		if (next_sorted)
			e->type->ops.sq.elevator_merge_req_fn(q, rq, next);
	}

	elv_rqhash_reposition(q, rq);

	if (next_sorted) {
		elv_rqhash_del(q, next);
		q->nr_sorted--;
	}

	q->last_merge = rq;
}

void elv_bio_merged(struct request_queue *q, struct request *rq,
			struct bio *bio)
{
	struct elevator_queue *e = q->elevator;

	if (WARN_ON_ONCE(e->uses_mq))
		return;

	if (e->type->ops.sq.elevator_bio_merged_fn)
		e->type->ops.sq.elevator_bio_merged_fn(q, rq, bio);
}

#ifdef CONFIG_PM
static void blk_pm_requeue_request(struct request *rq)
{
	if (rq->q->dev && !(rq->rq_flags & RQF_PM))
		rq->q->nr_pending--;
}

static void blk_pm_add_request(struct request_queue *q, struct request *rq)
{
	if (q->dev && !(rq->rq_flags & RQF_PM) && q->nr_pending++ == 0 &&
	    (q->rpm_status == RPM_SUSPENDED || q->rpm_status == RPM_SUSPENDING))
		pm_request_resume(q->dev);
}
#else
static inline void blk_pm_requeue_request(struct request *rq) {}
static inline void blk_pm_add_request(struct request_queue *q,
				      struct request *rq)
{
}
#endif
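
/*
 * Give a request back to the scheduler after the driver un-issued it; it
 * re-enters the dispatch queue at the front via ELEVATOR_INSERT_REQUEUE.
 */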

void elv_requeue_request(struct request_queue *q, struct request *rq)
{
	/*
	 * it already went through dequeue, we need to decrement the
	 * in_flight count again
	 */
	if (blk_account_rq(rq)) {
		q->in_flight[rq_is_sync(rq)]--;
		if (rq->rq_flags & RQF_SORTED)
			elv_deactivate_rq(q, rq);
	}

	rq->rq_flags &= ~RQF_STARTED;

	blk_pm_requeue_request(rq);

	__elv_add_request(q, rq, ELEVATOR_INSERT_REQUEUE);
}

void elv_drain_elevator(struct request_queue *q)
{
	struct elevator_queue *e = q->elevator;
	static int printed;

	if (WARN_ON_ONCE(e->uses_mq))
		return;

	lockdep_assert_held(q->queue_lock);

	while (e->type->ops.sq.elevator_dispatch_fn(q, 1))
		;
	if (q->nr_sorted && printed++ < 10) {
		printk(KERN_ERR "%s: forced dispatching is broken "
		       "(nr_sorted=%u), please report this\n",
		       q->elevator->type->elevator_name, q->nr_sorted);
	}
}
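
/*
 * Insert a request at one of the ELEVATOR_INSERT_* points. The caller
 * must hold q->queue_lock; elv_add_request() below is the locking wrapper.
 */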

void __elv_add_request(struct request_queue *q, struct request *rq, int where)
{
	trace_block_rq_insert(q, rq);

	blk_pm_add_request(q, rq);

	rq->q = q;

	if (rq->rq_flags & RQF_SOFTBARRIER) {
		/* barriers are scheduling boundary, update end_sector */
		if (!blk_rq_is_passthrough(rq)) {
			q->end_sector = rq_end_sector(rq);
			q->boundary_rq = rq;
		}
	} else if (!(rq->rq_flags & RQF_ELVPRIV) &&
		    (where == ELEVATOR_INSERT_SORT ||
		     where == ELEVATOR_INSERT_SORT_MERGE))
		where = ELEVATOR_INSERT_BACK;

	switch (where) {
	case ELEVATOR_INSERT_REQUEUE:
	case ELEVATOR_INSERT_FRONT:
		rq->rq_flags |= RQF_SOFTBARRIER;
		list_add(&rq->queuelist, &q->queue_head);
		break;

	case ELEVATOR_INSERT_BACK:
		rq->rq_flags |= RQF_SOFTBARRIER;
		elv_drain_elevator(q);
		list_add_tail(&rq->queuelist, &q->queue_head);
		/*
		 * We kick the queue here for the following reasons.
		 * - The elevator might have returned NULL previously
		 *   to delay requests and returned them now.  As the
		 *   queue wasn't empty before this request, ll_rw_blk
		 *   won't run the queue on return, resulting in hang.
		 * - Usually, back inserted requests won't be merged
		 *   with anything.  There's no point in delaying queue
		 *   processing.
		 */
		__blk_run_queue(q);
		break;

	case ELEVATOR_INSERT_SORT_MERGE:
		/*
		 * If we succeed in merging this request with one in the
		 * queue already, we are done - rq has now been freed,
		 * so no need to do anything further.
		 */
		if (elv_attempt_insert_merge(q, rq))
			break;
		/* fall through */
	case ELEVATOR_INSERT_SORT:
		BUG_ON(blk_rq_is_passthrough(rq));
		rq->rq_flags |= RQF_SORTED;
		q->nr_sorted++;
		if (rq_mergeable(rq)) {
			elv_rqhash_add(q, rq);
			if (!q->last_merge)
				q->last_merge = rq;
		}

		/*
		 * Some ioscheds (cfq) run q->request_fn directly, so
		 * rq cannot be accessed after calling
		 * elevator_add_req_fn.
		 */
		q->elevator->type->ops.sq.elevator_add_req_fn(q, rq);
		break;

	case ELEVATOR_INSERT_FLUSH:
		rq->rq_flags |= RQF_SOFTBARRIER;
		blk_insert_flush(rq);
		break;
	default:
		printk(KERN_ERR "%s: bad insertion point %d\n",
		       __func__, where);
		BUG();
	}
}
EXPORT_SYMBOL(__elv_add_request);

void elv_add_request(struct request_queue *q, struct request *rq, int where)
{
	unsigned long flags;

	spin_lock_irqsave(q->queue_lock, flags);
	__elv_add_request(q, rq, where);
	spin_unlock_irqrestore(q->queue_lock, flags);
}
EXPORT_SYMBOL(elv_add_request);

struct request *elv_latter_request(struct request_queue *q, struct request *rq)
{
	struct elevator_queue *e = q->elevator;

	if (e->uses_mq && e->type->ops.mq.next_request)
		return e->type->ops.mq.next_request(q, rq);
	else if (!e->uses_mq && e->type->ops.sq.elevator_latter_req_fn)
		return e->type->ops.sq.elevator_latter_req_fn(q, rq);

	return NULL;
}

struct request *elv_former_request(struct request_queue *q, struct request *rq)
{
	struct elevator_queue *e = q->elevator;

	if (e->uses_mq && e->type->ops.mq.former_request)
		return e->type->ops.mq.former_request(q, rq);
	if (!e->uses_mq && e->type->ops.sq.elevator_former_req_fn)
		return e->type->ops.sq.elevator_former_req_fn(q, rq);
	return NULL;
}

int elv_set_request(struct request_queue *q, struct request *rq,
		    struct bio *bio, gfp_t gfp_mask)
{
	struct elevator_queue *e = q->elevator;

	if (WARN_ON_ONCE(e->uses_mq))
		return 0;

	if (e->type->ops.sq.elevator_set_req_fn)
		return e->type->ops.sq.elevator_set_req_fn(q, rq, bio, gfp_mask);
	return 0;
}

void elv_put_request(struct request_queue *q, struct request *rq)
{
	struct elevator_queue *e = q->elevator;

	if (WARN_ON_ONCE(e->uses_mq))
		return;

	if (e->type->ops.sq.elevator_put_req_fn)
		e->type->ops.sq.elevator_put_req_fn(rq);
}

int elv_may_queue(struct request_queue *q, unsigned int op)
{
	struct elevator_queue *e = q->elevator;

	if (WARN_ON_ONCE(e->uses_mq))
		return 0;

	if (e->type->ops.sq.elevator_may_queue_fn)
		return e->type->ops.sq.elevator_may_queue_fn(q, op);

	return ELV_MQUEUE_MAY;
}

void elv_completed_request(struct request_queue *q, struct request *rq)
{
	struct elevator_queue *e = q->elevator;

	if (WARN_ON_ONCE(e->uses_mq))
		return;

	/*
	 * request is released from the driver, io must be done
	 */
	if (blk_account_rq(rq)) {
		q->in_flight[rq_is_sync(rq)]--;
		if ((rq->rq_flags & RQF_SORTED) &&
		    e->type->ops.sq.elevator_completed_req_fn)
			e->type->ops.sq.elevator_completed_req_fn(q, rq);
	}
}

#define to_elv(atr) container_of((atr), struct elv_fs_entry, attr)

static ssize_t
elv_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
{
	struct elv_fs_entry *entry = to_elv(attr);
	struct elevator_queue *e;
	ssize_t error;

	if (!entry->show)
		return -EIO;

	e = container_of(kobj, struct elevator_queue, kobj);
	mutex_lock(&e->sysfs_lock);
	error = e->type ? entry->show(e, page) : -ENOENT;
	mutex_unlock(&e->sysfs_lock);
	return error;
}

static ssize_t
elv_attr_store(struct kobject *kobj, struct attribute *attr,
	       const char *page, size_t length)
{
	struct elv_fs_entry *entry = to_elv(attr);
	struct elevator_queue *e;
	ssize_t error;

	if (!entry->store)
		return -EIO;

	e = container_of(kobj, struct elevator_queue, kobj);
	mutex_lock(&e->sysfs_lock);
	error = e->type ? entry->store(e, page, length) : -ENOENT;
	mutex_unlock(&e->sysfs_lock);
	return error;
}

static const struct sysfs_ops elv_sysfs_ops = {
	.show	= elv_attr_show,
	.store	= elv_attr_store,
};

static struct kobj_type elv_ktype = {
	.sysfs_ops	= &elv_sysfs_ops,
	.release	= elevator_release,
};

int elv_register_queue(struct request_queue *q)
{
	struct elevator_queue *e = q->elevator;
	int error;

	error = kobject_add(&e->kobj, &q->kobj, "%s", "iosched");
	if (!error) {
		struct elv_fs_entry *attr = e->type->elevator_attrs;
		if (attr) {
			while (attr->attr.name) {
				if (sysfs_create_file(&e->kobj, &attr->attr))
					break;
				attr++;
			}
		}
		kobject_uevent(&e->kobj, KOBJ_ADD);
		e->registered = 1;
		if (!e->uses_mq && e->type->ops.sq.elevator_registered_fn)
			e->type->ops.sq.elevator_registered_fn(q);
	}
	return error;
}
EXPORT_SYMBOL(elv_register_queue);

void elv_unregister_queue(struct request_queue *q)
{
	if (q) {
		struct elevator_queue *e = q->elevator;

		kobject_uevent(&e->kobj, KOBJ_REMOVE);
		kobject_del(&e->kobj);
		e->registered = 0;
		/* Re-enable throttling in case elevator disabled it */
		wbt_enable_default(q);
	}
}
EXPORT_SYMBOL(elv_unregister_queue);

int elv_register(struct elevator_type *e)
{
	char *def = "";

	/* create icq_cache if requested */
	if (e->icq_size) {
		if (WARN_ON(e->icq_size < sizeof(struct io_cq)) ||
		    WARN_ON(e->icq_align < __alignof__(struct io_cq)))
			return -EINVAL;

		snprintf(e->icq_cache_name, sizeof(e->icq_cache_name),
			 "%s_io_cq", e->elevator_name);
		e->icq_cache = kmem_cache_create(e->icq_cache_name, e->icq_size,
						 e->icq_align, 0, NULL);
		if (!e->icq_cache)
			return -ENOMEM;
	}

	/* register, don't allow duplicate names */
	spin_lock(&elv_list_lock);
	if (elevator_find(e->elevator_name)) {
		spin_unlock(&elv_list_lock);
		if (e->icq_cache)
			kmem_cache_destroy(e->icq_cache);
		return -EBUSY;
	}
	list_add_tail(&e->list, &elv_list);
	spin_unlock(&elv_list_lock);

	/* print pretty message */
	if (!strcmp(e->elevator_name, chosen_elevator) ||
			(!*chosen_elevator &&
			 !strcmp(e->elevator_name, CONFIG_DEFAULT_IOSCHED)))
				def = " (default)";

	printk(KERN_INFO "io scheduler %s registered%s\n", e->elevator_name,
								def);
	return 0;
}
EXPORT_SYMBOL_GPL(elv_register);

void elv_unregister(struct elevator_type *e)
{
	/* unregister */
	spin_lock(&elv_list_lock);
	list_del_init(&e->list);
	spin_unlock(&elv_list_lock);

	/*
	 * Destroy icq_cache if it exists.  icq's are RCU managed.  Make
	 * sure all RCU operations are complete before proceeding.
	 */
	if (e->icq_cache) {
		rcu_barrier();
		kmem_cache_destroy(e->icq_cache);
		e->icq_cache = NULL;
	}
}
EXPORT_SYMBOL_GPL(elv_unregister);
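
/*
 * Switch a blk-mq queue to a new scheduler: freeze the queue, tear down
 * any current scheduler, then init and register the new one. A NULL
 * new_e means "none" (no scheduling).
 */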

static int elevator_switch_mq(struct request_queue *q,
			      struct elevator_type *new_e)
{
	int ret;

	blk_mq_freeze_queue(q);

	if (q->elevator) {
		if (q->elevator->registered)
			elv_unregister_queue(q);
		ioc_clear_queue(q);
		elevator_exit(q, q->elevator);
	}

	ret = blk_mq_init_sched(q, new_e);
	if (ret)
		goto out;

	if (new_e) {
		ret = elv_register_queue(q);
		if (ret) {
			elevator_exit(q, q->elevator);
			goto out;
		}
	}

	if (new_e)
		blk_add_trace_msg(q, "elv switch: %s", new_e->elevator_name);
	else
		blk_add_trace_msg(q, "elv switch: none");

out:
	blk_mq_unfreeze_queue(q);
	return ret;
}

/*
 * switch to new_e io scheduler. be careful not to introduce deadlocks -
 * we don't free the old io scheduler, before we have allocated what we
 * need for the new one. this way we have a chance of going back to the old
 * one, if the new one fails init for some reason.
 */
static int elevator_switch(struct request_queue *q, struct elevator_type *new_e)
{
	struct elevator_queue *old = q->elevator;
	bool old_registered = false;
	int err;

	if (q->mq_ops)
		return elevator_switch_mq(q, new_e);

	/*
	 * Turn on BYPASS and drain all requests w/ elevator private data.
	 * Block layer doesn't call into a quiesced elevator - all requests
	 * are directly put on the dispatch list without elevator data
	 * using INSERT_BACK.  All requests have SOFTBARRIER set and no
	 * merge happens either.
	 */
	if (old) {
		old_registered = old->registered;

		blk_queue_bypass_start(q);

		/* unregister and clear all auxiliary data of the old elevator */
		if (old_registered)
			elv_unregister_queue(q);

		ioc_clear_queue(q);
	}

	/* allocate, init and register new elevator */
	err = new_e->ops.sq.elevator_init_fn(q, new_e);
	if (err)
		goto fail_init;

	err = elv_register_queue(q);
	if (err)
		goto fail_register;

	/* done, kill the old one and finish */
	if (old) {
		elevator_exit(q, old);
		blk_queue_bypass_end(q);
	}

	blk_add_trace_msg(q, "elv switch: %s", new_e->elevator_name);

	return 0;

fail_register:
	elevator_exit(q, q->elevator);
fail_init:
	/* switch failed, restore and re-register old elevator */
	if (old) {
		q->elevator = old;
		elv_register_queue(q);
		blk_queue_bypass_end(q);
	}

	return err;
}

/*
 * Switch this queue to the given IO scheduler.
 */
static int __elevator_change(struct request_queue *q, const char *name)
{
	char elevator_name[ELV_NAME_MAX];
	struct elevator_type *e;

	/* Make sure queue is not in the middle of being removed */
	if (!test_bit(QUEUE_FLAG_REGISTERED, &q->queue_flags))
		return -ENOENT;

	/*
	 * Special case for mq, turn off scheduling
	 */
	if (q->mq_ops && !strncmp(name, "none", 4))
		return elevator_switch(q, NULL);

	strlcpy(elevator_name, name, sizeof(elevator_name));
	e = elevator_get(strstrip(elevator_name), true);
	if (!e)
		return -EINVAL;

	if (q->elevator &&
	    !strcmp(elevator_name, q->elevator->type->elevator_name)) {
		elevator_put(e);
		return 0;
	}

	if (!e->uses_mq && q->mq_ops) {
		elevator_put(e);
		return -EINVAL;
	}
	if (e->uses_mq && !q->mq_ops) {
		elevator_put(e);
		return -EINVAL;
	}

	return elevator_switch(q, e);
}

static inline bool elv_support_iosched(struct request_queue *q)
{
	if (q->mq_ops && q->tag_set && (q->tag_set->flags &
				BLK_MQ_F_NO_SCHED))
		return false;
	return true;
}
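
/*
 * sysfs interface for picking a scheduler at runtime, e.g.:
 *   echo mq-deadline > /sys/block/<dev>/queue/scheduler
 */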

ssize_t elv_iosched_store(struct request_queue *q, const char *name,
			  size_t count)
{
	int ret;

	if (!(q->mq_ops || q->request_fn) || !elv_support_iosched(q))
		return count;

	ret = __elevator_change(q, name);
	if (!ret)
		return count;

	return ret;
}

ssize_t elv_iosched_show(struct request_queue *q, char *name)
{
	struct elevator_queue *e = q->elevator;
	struct elevator_type *elv = NULL;
	struct elevator_type *__e;
	int len = 0;

	if (!queue_is_rq_based(q))
		return sprintf(name, "none\n");

	if (!q->elevator)
		len += sprintf(name+len, "[none] ");
	else
		elv = e->type;

	spin_lock(&elv_list_lock);
	list_for_each_entry(__e, &elv_list, list) {
		if (elv && !strcmp(elv->elevator_name, __e->elevator_name)) {
			len += sprintf(name+len, "[%s] ", elv->elevator_name);
			continue;
		}
		if (__e->uses_mq && q->mq_ops && elv_support_iosched(q))
			len += sprintf(name+len, "%s ", __e->elevator_name);
		else if (!__e->uses_mq && !q->mq_ops)
			len += sprintf(name+len, "%s ", __e->elevator_name);
	}
	spin_unlock(&elv_list_lock);

	if (q->mq_ops && q->elevator)
		len += sprintf(name+len, "none");

	len += sprintf(len+name, "\n");
	return len;
}

struct request *elv_rb_former_request(struct request_queue *q,
				      struct request *rq)
{
	struct rb_node *rbprev = rb_prev(&rq->rb_node);

	if (rbprev)
		return rb_entry_rq(rbprev);

	return NULL;
}
EXPORT_SYMBOL(elv_rb_former_request);

struct request *elv_rb_latter_request(struct request_queue *q,
				      struct request *rq)
{
	struct rb_node *rbnext = rb_next(&rq->rb_node);

	if (rbnext)
		return rb_entry_rq(rbnext);

	return NULL;
}
EXPORT_SYMBOL(elv_rb_latter_request);