elevator.c

/*
 *  Block device elevator/IO-scheduler.
 *
 *  Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE
 *
 * 30042000 Jens Axboe <axboe@kernel.dk> :
 *
 * Split the elevator a bit so that it is possible to choose a different
 * one or even write a new "plug in". There are three pieces:
 * - elevator_fn, inserts a new request in the queue list
 * - elevator_merge_fn, decides whether a new buffer can be merged with
 *   an existing request
 * - elevator_dequeue_fn, called when a request is taken off the active list
 *
 * 20082000 Dave Jones <davej@suse.de> :
 * Removed tests for max-bomb-segments, which was breaking elvtune
 *  when run without -bN
 *
 * Jens:
 * - Rework again to work with bio instead of buffer_heads
 * - loose bi_dev comparisons, partition handling is right now
 * - completely modularize elevator setup and teardown
 *
 */
#include <linux/kernel.h>
#include <linux/fs.h>
#include <linux/blkdev.h>
#include <linux/elevator.h>
#include <linux/bio.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/compiler.h>
#include <linux/blktrace_api.h>
#include <linux/hash.h>
#include <linux/uaccess.h>
#include <linux/pm_runtime.h>
#include <linux/blk-cgroup.h>

#include <trace/events/block.h>

#include "blk.h"
#include "blk-mq-sched.h"
#include "blk-wbt.h"

static DEFINE_SPINLOCK(elv_list_lock);
static LIST_HEAD(elv_list);

/*
 * Merge hash stuff.
 */
#define rq_hash_key(rq)		(blk_rq_pos(rq) + blk_rq_sectors(rq))

/*
 * Query io scheduler to see if the current process issuing bio may be
 * merged with rq.
 */
static int elv_iosched_allow_bio_merge(struct request *rq, struct bio *bio)
{
	struct request_queue *q = rq->q;
	struct elevator_queue *e = q->elevator;

	if (e->uses_mq && e->type->ops.mq.allow_merge)
		return e->type->ops.mq.allow_merge(q, rq, bio);
	else if (!e->uses_mq && e->type->ops.sq.elevator_allow_bio_merge_fn)
		return e->type->ops.sq.elevator_allow_bio_merge_fn(q, rq, bio);

	return 1;
}

/*
 * can we safely merge with this request?
 */
bool elv_bio_merge_ok(struct request *rq, struct bio *bio)
{
	if (!blk_rq_merge_ok(rq, bio))
		return false;

	if (!elv_iosched_allow_bio_merge(rq, bio))
		return false;

	return true;
}
EXPORT_SYMBOL(elv_bio_merge_ok);

static bool elevator_match(const struct elevator_type *e, const char *name)
{
	if (!strcmp(e->elevator_name, name))
		return true;
	if (e->elevator_alias && !strcmp(e->elevator_alias, name))
		return true;

	return false;
}

/*
 * Return scheduler with name 'name' and with matching 'mq' capability
 */
static struct elevator_type *elevator_find(const char *name, bool mq)
{
	struct elevator_type *e;

	list_for_each_entry(e, &elv_list, list) {
		if (elevator_match(e, name) && (mq == e->uses_mq))
			return e;
	}

	return NULL;
}

static void elevator_put(struct elevator_type *e)
{
	module_put(e->elevator_owner);
}

static struct elevator_type *elevator_get(struct request_queue *q,
					  const char *name, bool try_loading)
{
	struct elevator_type *e;

	spin_lock(&elv_list_lock);

	e = elevator_find(name, q->mq_ops != NULL);
	if (!e && try_loading) {
		spin_unlock(&elv_list_lock);
		request_module("%s-iosched", name);
		spin_lock(&elv_list_lock);
		e = elevator_find(name, q->mq_ops != NULL);
	}

	if (e && !try_module_get(e->elevator_owner))
		e = NULL;

	spin_unlock(&elv_list_lock);
	return e;
}
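
/*
 * Note: a successful elevator_get() takes a reference on the elevator
 * module via try_module_get(); callers are expected to balance it with
 * elevator_put() once they are done with the type, as the error paths
 * further down in this file do.
 */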

static char chosen_elevator[ELV_NAME_MAX];

static int __init elevator_setup(char *str)
{
	/*
	 * Be backwards-compatible with previous kernels, so users
	 * won't get the wrong elevator.
	 */
	strncpy(chosen_elevator, str, sizeof(chosen_elevator) - 1);
	return 1;
}

__setup("elevator=", elevator_setup);
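
/*
 * Example (legacy, non-mq queues only): booting with "elevator=deadline"
 * stores "deadline" in chosen_elevator above, and the boot-time loader
 * below will then try to request the "deadline-iosched" module.
 */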

/* called during boot to load the elevator chosen by the elevator param */
void __init load_default_elevator_module(void)
{
	struct elevator_type *e;

	if (!chosen_elevator[0])
		return;

	/*
	 * Boot parameter is deprecated, we haven't supported that for MQ.
	 * Only look for non-mq schedulers from here.
	 */
	spin_lock(&elv_list_lock);
	e = elevator_find(chosen_elevator, false);
	spin_unlock(&elv_list_lock);

	if (!e)
		request_module("%s-iosched", chosen_elevator);
}

static struct kobj_type elv_ktype;

struct elevator_queue *elevator_alloc(struct request_queue *q,
				  struct elevator_type *e)
{
	struct elevator_queue *eq;

	eq = kzalloc_node(sizeof(*eq), GFP_KERNEL, q->node);
	if (unlikely(!eq))
		return NULL;

	eq->type = e;
	kobject_init(&eq->kobj, &elv_ktype);
	mutex_init(&eq->sysfs_lock);
	hash_init(eq->hash);
	eq->uses_mq = e->uses_mq;

	return eq;
}
EXPORT_SYMBOL(elevator_alloc);

static void elevator_release(struct kobject *kobj)
{
	struct elevator_queue *e;

	e = container_of(kobj, struct elevator_queue, kobj);
	elevator_put(e->type);
	kfree(e);
}

int elevator_init(struct request_queue *q, char *name)
{
	struct elevator_type *e = NULL;
	int err;

	/*
	 * q->sysfs_lock must be held to provide mutual exclusion between
	 * elevator_switch() and here.
	 */
	lockdep_assert_held(&q->sysfs_lock);

	if (unlikely(q->elevator))
		return 0;

	INIT_LIST_HEAD(&q->queue_head);
	q->last_merge = NULL;
	q->end_sector = 0;
	q->boundary_rq = NULL;

	if (name) {
		e = elevator_get(q, name, true);
		if (!e)
			return -EINVAL;
	}

	/*
	 * Use the default elevator specified by config boot param for
	 * non-mq devices, or by config option. Don't try to load modules
	 * as we could be running off async and request_module() isn't
	 * allowed from async.
	 */
	if (!e && !q->mq_ops && *chosen_elevator) {
		e = elevator_get(q, chosen_elevator, false);
		if (!e)
			printk(KERN_ERR "I/O scheduler %s not found\n",
							chosen_elevator);
	}

	if (!e) {
		/*
		 * For blk-mq devices, we default to using mq-deadline,
		 * if available, for single queue devices. If deadline
		 * isn't available OR we have multiple queues, default
		 * to "none".
		 */
		if (q->mq_ops) {
			if (q->nr_hw_queues == 1)
				e = elevator_get(q, "mq-deadline", false);
			if (!e)
				return 0;
		} else
			e = elevator_get(q, CONFIG_DEFAULT_IOSCHED, false);

		if (!e) {
			printk(KERN_ERR
				"Default I/O scheduler not found. " \
				"Using noop.\n");
			e = elevator_get(q, "noop", false);
		}
	}

	if (e->uses_mq)
		err = blk_mq_init_sched(q, e);
	else
		err = e->ops.sq.elevator_init_fn(q, e);
	if (err)
		elevator_put(e);
	return err;
}
EXPORT_SYMBOL(elevator_init);

void elevator_exit(struct request_queue *q, struct elevator_queue *e)
{
	mutex_lock(&e->sysfs_lock);
	if (e->uses_mq && e->type->ops.mq.exit_sched)
		blk_mq_exit_sched(q, e);
	else if (!e->uses_mq && e->type->ops.sq.elevator_exit_fn)
		e->type->ops.sq.elevator_exit_fn(e);
	mutex_unlock(&e->sysfs_lock);

	kobject_put(&e->kobj);
}
EXPORT_SYMBOL(elevator_exit);

static inline void __elv_rqhash_del(struct request *rq)
{
	hash_del(&rq->hash);
	rq->rq_flags &= ~RQF_HASHED;
}

void elv_rqhash_del(struct request_queue *q, struct request *rq)
{
	if (ELV_ON_HASH(rq))
		__elv_rqhash_del(rq);
}
EXPORT_SYMBOL_GPL(elv_rqhash_del);

void elv_rqhash_add(struct request_queue *q, struct request *rq)
{
	struct elevator_queue *e = q->elevator;

	BUG_ON(ELV_ON_HASH(rq));
	hash_add(e->hash, &rq->hash, rq_hash_key(rq));
	rq->rq_flags |= RQF_HASHED;
}
EXPORT_SYMBOL_GPL(elv_rqhash_add);

void elv_rqhash_reposition(struct request_queue *q, struct request *rq)
{
	__elv_rqhash_del(rq);
	elv_rqhash_add(q, rq);
}

struct request *elv_rqhash_find(struct request_queue *q, sector_t offset)
{
	struct elevator_queue *e = q->elevator;
	struct hlist_node *next;
	struct request *rq;

	hash_for_each_possible_safe(e->hash, rq, next, hash, offset) {
		BUG_ON(!ELV_ON_HASH(rq));

		if (unlikely(!rq_mergeable(rq))) {
			__elv_rqhash_del(rq);
			continue;
		}

		if (rq_hash_key(rq) == offset)
			return rq;
	}

	return NULL;
}
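
/*
 * The hash is keyed by rq_hash_key(), i.e. the end sector of a request,
 * so elv_rqhash_find(q, bio->bi_iter.bi_sector) returns a request that
 * ends exactly where the new bio begins - a back merge candidate.
 */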

/*
 * RB-tree support functions for inserting/lookup/removal of requests
 * in a sorted RB tree.
 */
void elv_rb_add(struct rb_root *root, struct request *rq)
{
	struct rb_node **p = &root->rb_node;
	struct rb_node *parent = NULL;
	struct request *__rq;

	while (*p) {
		parent = *p;
		__rq = rb_entry(parent, struct request, rb_node);

		if (blk_rq_pos(rq) < blk_rq_pos(__rq))
			p = &(*p)->rb_left;
		else if (blk_rq_pos(rq) >= blk_rq_pos(__rq))
			p = &(*p)->rb_right;
	}

	rb_link_node(&rq->rb_node, parent, p);
	rb_insert_color(&rq->rb_node, root);
}
EXPORT_SYMBOL(elv_rb_add);

void elv_rb_del(struct rb_root *root, struct request *rq)
{
	BUG_ON(RB_EMPTY_NODE(&rq->rb_node));
	rb_erase(&rq->rb_node, root);
	RB_CLEAR_NODE(&rq->rb_node);
}
EXPORT_SYMBOL(elv_rb_del);

struct request *elv_rb_find(struct rb_root *root, sector_t sector)
{
	struct rb_node *n = root->rb_node;
	struct request *rq;

	while (n) {
		rq = rb_entry(n, struct request, rb_node);

		if (sector < blk_rq_pos(rq))
			n = n->rb_left;
		else if (sector > blk_rq_pos(rq))
			n = n->rb_right;
		else
			return rq;
	}

	return NULL;
}
EXPORT_SYMBOL(elv_rb_find);
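
/*
 * Rough usage sketch (not part of this file): a sq I/O scheduler typically
 * keeps one struct rb_root per data direction, calls elv_rb_add()/
 * elv_rb_del() as requests enter and leave it, and uses elv_rb_find() to
 * look up a request starting at a given sector, e.g. for front merges.
 */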

/*
 * Insert rq into dispatch queue of q.  Queue lock must be held on
 * entry.  rq is sorted into the dispatch queue. To be used by
 * specific elevators.
 */
void elv_dispatch_sort(struct request_queue *q, struct request *rq)
{
	sector_t boundary;
	struct list_head *entry;

	if (q->last_merge == rq)
		q->last_merge = NULL;

	elv_rqhash_del(q, rq);

	q->nr_sorted--;

	boundary = q->end_sector;
	list_for_each_prev(entry, &q->queue_head) {
		struct request *pos = list_entry_rq(entry);

		if (req_op(rq) != req_op(pos))
			break;
		if (rq_data_dir(rq) != rq_data_dir(pos))
			break;
		if (pos->rq_flags & (RQF_STARTED | RQF_SOFTBARRIER))
			break;
		if (blk_rq_pos(rq) >= boundary) {
			if (blk_rq_pos(pos) < boundary)
				continue;
		} else {
			if (blk_rq_pos(pos) >= boundary)
				break;
		}
		if (blk_rq_pos(rq) >= blk_rq_pos(pos))
			break;
	}

	list_add(&rq->queuelist, entry);
}
EXPORT_SYMBOL(elv_dispatch_sort);

/*
 * Insert rq into dispatch queue of q.  Queue lock must be held on
 * entry.  rq is added to the back of the dispatch queue. To be used by
 * specific elevators.
 */
void elv_dispatch_add_tail(struct request_queue *q, struct request *rq)
{
	if (q->last_merge == rq)
		q->last_merge = NULL;

	elv_rqhash_del(q, rq);

	q->nr_sorted--;

	q->end_sector = rq_end_sector(rq);
	q->boundary_rq = rq;
	list_add_tail(&rq->queuelist, &q->queue_head);
}
EXPORT_SYMBOL(elv_dispatch_add_tail);

enum elv_merge elv_merge(struct request_queue *q, struct request **req,
		struct bio *bio)
{
	struct elevator_queue *e = q->elevator;
	struct request *__rq;

	/*
	 * Levels of merges:
	 * 	nomerges:  No merges at all attempted
	 * 	noxmerges: Only simple one-hit cache try
	 * 	merges:	   All merge tries attempted
	 */
	if (blk_queue_nomerges(q) || !bio_mergeable(bio))
		return ELEVATOR_NO_MERGE;

	/*
	 * First try one-hit cache.
	 */
	if (q->last_merge && elv_bio_merge_ok(q->last_merge, bio)) {
		enum elv_merge ret = blk_try_merge(q->last_merge, bio);

		if (ret != ELEVATOR_NO_MERGE) {
			*req = q->last_merge;
			return ret;
		}
	}

	if (blk_queue_noxmerges(q))
		return ELEVATOR_NO_MERGE;

	/*
	 * See if our hash lookup can find a potential backmerge.
	 */
	__rq = elv_rqhash_find(q, bio->bi_iter.bi_sector);
	if (__rq && elv_bio_merge_ok(__rq, bio)) {
		*req = __rq;
		return ELEVATOR_BACK_MERGE;
	}

	if (e->uses_mq && e->type->ops.mq.request_merge)
		return e->type->ops.mq.request_merge(q, req, bio);
	else if (!e->uses_mq && e->type->ops.sq.elevator_merge_fn)
		return e->type->ops.sq.elevator_merge_fn(q, req, bio);

	return ELEVATOR_NO_MERGE;
}
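
/*
 * The caller (the block core) interprets the result: ELEVATOR_BACK_MERGE
 * appends the bio to *req, ELEVATOR_FRONT_MERGE prepends it, and
 * ELEVATOR_NO_MERGE means a new request has to be allocated instead.
 */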

/*
 * Attempt to do an insertion back merge. Only check for the case where
 * we can append 'rq' to an existing request, so we can throw 'rq' away
 * afterwards.
 *
 * Returns true if we merged, false otherwise
 */
bool elv_attempt_insert_merge(struct request_queue *q, struct request *rq)
{
	struct request *__rq;
	bool ret;

	if (blk_queue_nomerges(q))
		return false;

	/*
	 * First try one-hit cache.
	 */
	if (q->last_merge && blk_attempt_req_merge(q, q->last_merge, rq))
		return true;

	if (blk_queue_noxmerges(q))
		return false;

	ret = false;
	/*
	 * See if our hash lookup can find a potential backmerge.
	 */
	while (1) {
		__rq = elv_rqhash_find(q, blk_rq_pos(rq));
		if (!__rq || !blk_attempt_req_merge(q, __rq, rq))
			break;

		/* The merged request could be merged with others, try again */
		ret = true;
		rq = __rq;
	}

	return ret;
}

void elv_merged_request(struct request_queue *q, struct request *rq,
		enum elv_merge type)
{
	struct elevator_queue *e = q->elevator;

	if (e->uses_mq && e->type->ops.mq.request_merged)
		e->type->ops.mq.request_merged(q, rq, type);
	else if (!e->uses_mq && e->type->ops.sq.elevator_merged_fn)
		e->type->ops.sq.elevator_merged_fn(q, rq, type);

	if (type == ELEVATOR_BACK_MERGE)
		elv_rqhash_reposition(q, rq);

	q->last_merge = rq;
}

void elv_merge_requests(struct request_queue *q, struct request *rq,
			     struct request *next)
{
	struct elevator_queue *e = q->elevator;
	bool next_sorted = false;

	if (e->uses_mq && e->type->ops.mq.requests_merged)
		e->type->ops.mq.requests_merged(q, rq, next);
	else if (e->type->ops.sq.elevator_merge_req_fn) {
		next_sorted = (__force bool)(next->rq_flags & RQF_SORTED);
		if (next_sorted)
			e->type->ops.sq.elevator_merge_req_fn(q, rq, next);
	}

	elv_rqhash_reposition(q, rq);

	if (next_sorted) {
		elv_rqhash_del(q, next);
		q->nr_sorted--;
	}

	q->last_merge = rq;
}

void elv_bio_merged(struct request_queue *q, struct request *rq,
			struct bio *bio)
{
	struct elevator_queue *e = q->elevator;

	if (WARN_ON_ONCE(e->uses_mq))
		return;

	if (e->type->ops.sq.elevator_bio_merged_fn)
		e->type->ops.sq.elevator_bio_merged_fn(q, rq, bio);
}

#ifdef CONFIG_PM
static void blk_pm_requeue_request(struct request *rq)
{
	if (rq->q->dev && !(rq->rq_flags & RQF_PM))
		rq->q->nr_pending--;
}

static void blk_pm_add_request(struct request_queue *q, struct request *rq)
{
	if (q->dev && !(rq->rq_flags & RQF_PM) && q->nr_pending++ == 0 &&
	    (q->rpm_status == RPM_SUSPENDED || q->rpm_status == RPM_SUSPENDING))
		pm_request_resume(q->dev);
}
#else
static inline void blk_pm_requeue_request(struct request *rq) {}
static inline void blk_pm_add_request(struct request_queue *q,
				      struct request *rq)
{
}
#endif

void elv_requeue_request(struct request_queue *q, struct request *rq)
{
	/*
	 * it already went through dequeue, we need to decrement the
	 * in_flight count again
	 */
	if (blk_account_rq(rq)) {
		q->in_flight[rq_is_sync(rq)]--;
		if (rq->rq_flags & RQF_SORTED)
			elv_deactivate_rq(q, rq);
	}

	rq->rq_flags &= ~RQF_STARTED;

	blk_pm_requeue_request(rq);

	__elv_add_request(q, rq, ELEVATOR_INSERT_REQUEUE);
}

void elv_drain_elevator(struct request_queue *q)
{
	struct elevator_queue *e = q->elevator;
	static int printed;

	if (WARN_ON_ONCE(e->uses_mq))
		return;

	lockdep_assert_held(q->queue_lock);

	while (e->type->ops.sq.elevator_dispatch_fn(q, 1))
		;
	if (q->nr_sorted && printed++ < 10) {
		printk(KERN_ERR "%s: forced dispatching is broken "
		       "(nr_sorted=%u), please report this\n",
		       q->elevator->type->elevator_name, q->nr_sorted);
	}
}

void __elv_add_request(struct request_queue *q, struct request *rq, int where)
{
	trace_block_rq_insert(q, rq);

	blk_pm_add_request(q, rq);

	rq->q = q;

	if (rq->rq_flags & RQF_SOFTBARRIER) {
		/* barriers are scheduling boundary, update end_sector */
		if (!blk_rq_is_passthrough(rq)) {
			q->end_sector = rq_end_sector(rq);
			q->boundary_rq = rq;
		}
	} else if (!(rq->rq_flags & RQF_ELVPRIV) &&
		    (where == ELEVATOR_INSERT_SORT ||
		     where == ELEVATOR_INSERT_SORT_MERGE))
		where = ELEVATOR_INSERT_BACK;

	switch (where) {
	case ELEVATOR_INSERT_REQUEUE:
	case ELEVATOR_INSERT_FRONT:
		rq->rq_flags |= RQF_SOFTBARRIER;
		list_add(&rq->queuelist, &q->queue_head);
		break;

	case ELEVATOR_INSERT_BACK:
		rq->rq_flags |= RQF_SOFTBARRIER;
		elv_drain_elevator(q);
		list_add_tail(&rq->queuelist, &q->queue_head);
		/*
		 * We kick the queue here for the following reasons.
		 * - The elevator might have returned NULL previously
		 *   to delay requests and returned them now.  As the
		 *   queue wasn't empty before this request, ll_rw_blk
		 *   won't run the queue on return, resulting in hang.
		 * - Usually, back inserted requests won't be merged
		 *   with anything.  There's no point in delaying queue
		 *   processing.
		 */
		__blk_run_queue(q);
		break;

	case ELEVATOR_INSERT_SORT_MERGE:
		/*
		 * If we succeed in merging this request with one in the
		 * queue already, we are done - rq has now been freed,
		 * so no need to do anything further.
		 */
		if (elv_attempt_insert_merge(q, rq))
			break;
		/* fall through */
	case ELEVATOR_INSERT_SORT:
		BUG_ON(blk_rq_is_passthrough(rq));
		rq->rq_flags |= RQF_SORTED;
		q->nr_sorted++;
		if (rq_mergeable(rq)) {
			elv_rqhash_add(q, rq);
			if (!q->last_merge)
				q->last_merge = rq;
		}

		/*
		 * Some ioscheds (cfq) run q->request_fn directly, so
		 * rq cannot be accessed after calling
		 * elevator_add_req_fn.
		 */
		q->elevator->type->ops.sq.elevator_add_req_fn(q, rq);
		break;

	case ELEVATOR_INSERT_FLUSH:
		rq->rq_flags |= RQF_SOFTBARRIER;
		blk_insert_flush(rq);
		break;
	default:
		printk(KERN_ERR "%s: bad insertion point %d\n",
		       __func__, where);
		BUG();
	}
}
EXPORT_SYMBOL(__elv_add_request);

void elv_add_request(struct request_queue *q, struct request *rq, int where)
{
	unsigned long flags;

	spin_lock_irqsave(q->queue_lock, flags);
	__elv_add_request(q, rq, where);
	spin_unlock_irqrestore(q->queue_lock, flags);
}
EXPORT_SYMBOL(elv_add_request);

struct request *elv_latter_request(struct request_queue *q, struct request *rq)
{
	struct elevator_queue *e = q->elevator;

	if (e->uses_mq && e->type->ops.mq.next_request)
		return e->type->ops.mq.next_request(q, rq);
	else if (!e->uses_mq && e->type->ops.sq.elevator_latter_req_fn)
		return e->type->ops.sq.elevator_latter_req_fn(q, rq);

	return NULL;
}

struct request *elv_former_request(struct request_queue *q, struct request *rq)
{
	struct elevator_queue *e = q->elevator;

	if (e->uses_mq && e->type->ops.mq.former_request)
		return e->type->ops.mq.former_request(q, rq);
	if (!e->uses_mq && e->type->ops.sq.elevator_former_req_fn)
		return e->type->ops.sq.elevator_former_req_fn(q, rq);
	return NULL;
}

int elv_set_request(struct request_queue *q, struct request *rq,
		    struct bio *bio, gfp_t gfp_mask)
{
	struct elevator_queue *e = q->elevator;

	if (WARN_ON_ONCE(e->uses_mq))
		return 0;

	if (e->type->ops.sq.elevator_set_req_fn)
		return e->type->ops.sq.elevator_set_req_fn(q, rq, bio, gfp_mask);
	return 0;
}

void elv_put_request(struct request_queue *q, struct request *rq)
{
	struct elevator_queue *e = q->elevator;

	if (WARN_ON_ONCE(e->uses_mq))
		return;

	if (e->type->ops.sq.elevator_put_req_fn)
		e->type->ops.sq.elevator_put_req_fn(rq);
}

int elv_may_queue(struct request_queue *q, unsigned int op)
{
	struct elevator_queue *e = q->elevator;

	if (WARN_ON_ONCE(e->uses_mq))
		return 0;

	if (e->type->ops.sq.elevator_may_queue_fn)
		return e->type->ops.sq.elevator_may_queue_fn(q, op);

	return ELV_MQUEUE_MAY;
}

void elv_completed_request(struct request_queue *q, struct request *rq)
{
	struct elevator_queue *e = q->elevator;

	if (WARN_ON_ONCE(e->uses_mq))
		return;

	/*
	 * request is released from the driver, io must be done
	 */
	if (blk_account_rq(rq)) {
		q->in_flight[rq_is_sync(rq)]--;
		if ((rq->rq_flags & RQF_SORTED) &&
		    e->type->ops.sq.elevator_completed_req_fn)
			e->type->ops.sq.elevator_completed_req_fn(q, rq);
	}
}

#define to_elv(atr) container_of((atr), struct elv_fs_entry, attr)

static ssize_t
elv_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
{
	struct elv_fs_entry *entry = to_elv(attr);
	struct elevator_queue *e;
	ssize_t error;

	if (!entry->show)
		return -EIO;

	e = container_of(kobj, struct elevator_queue, kobj);
	mutex_lock(&e->sysfs_lock);
	error = e->type ? entry->show(e, page) : -ENOENT;
	mutex_unlock(&e->sysfs_lock);
	return error;
}

static ssize_t
elv_attr_store(struct kobject *kobj, struct attribute *attr,
	       const char *page, size_t length)
{
	struct elv_fs_entry *entry = to_elv(attr);
	struct elevator_queue *e;
	ssize_t error;

	if (!entry->store)
		return -EIO;

	e = container_of(kobj, struct elevator_queue, kobj);
	mutex_lock(&e->sysfs_lock);
	error = e->type ? entry->store(e, page, length) : -ENOENT;
	mutex_unlock(&e->sysfs_lock);
	return error;
}

static const struct sysfs_ops elv_sysfs_ops = {
	.show	= elv_attr_show,
	.store	= elv_attr_store,
};

static struct kobj_type elv_ktype = {
	.sysfs_ops	= &elv_sysfs_ops,
	.release	= elevator_release,
};

int elv_register_queue(struct request_queue *q)
{
	struct elevator_queue *e = q->elevator;
	int error;

	error = kobject_add(&e->kobj, &q->kobj, "%s", "iosched");
	if (!error) {
		struct elv_fs_entry *attr = e->type->elevator_attrs;
		if (attr) {
			while (attr->attr.name) {
				if (sysfs_create_file(&e->kobj, &attr->attr))
					break;
				attr++;
			}
		}
		kobject_uevent(&e->kobj, KOBJ_ADD);
		e->registered = 1;
		if (!e->uses_mq && e->type->ops.sq.elevator_registered_fn)
			e->type->ops.sq.elevator_registered_fn(q);
	}
	return error;
}
EXPORT_SYMBOL(elv_register_queue);

void elv_unregister_queue(struct request_queue *q)
{
	if (q) {
		struct elevator_queue *e = q->elevator;

		kobject_uevent(&e->kobj, KOBJ_REMOVE);
		kobject_del(&e->kobj);
		e->registered = 0;
		/* Re-enable throttling in case elevator disabled it */
		wbt_enable_default(q);
	}
}
EXPORT_SYMBOL(elv_unregister_queue);

int elv_register(struct elevator_type *e)
{
	char *def = "";

	/* create icq_cache if requested */
	if (e->icq_size) {
		if (WARN_ON(e->icq_size < sizeof(struct io_cq)) ||
		    WARN_ON(e->icq_align < __alignof__(struct io_cq)))
			return -EINVAL;

		snprintf(e->icq_cache_name, sizeof(e->icq_cache_name),
			 "%s_io_cq", e->elevator_name);
		e->icq_cache = kmem_cache_create(e->icq_cache_name, e->icq_size,
						 e->icq_align, 0, NULL);
		if (!e->icq_cache)
			return -ENOMEM;
	}

	/* register, don't allow duplicate names */
	spin_lock(&elv_list_lock);
	if (elevator_find(e->elevator_name, e->uses_mq)) {
		spin_unlock(&elv_list_lock);
		if (e->icq_cache)
			kmem_cache_destroy(e->icq_cache);
		return -EBUSY;
	}
	list_add_tail(&e->list, &elv_list);
	spin_unlock(&elv_list_lock);

	/* print pretty message */
	if (elevator_match(e, chosen_elevator) ||
			(!*chosen_elevator &&
			 elevator_match(e, CONFIG_DEFAULT_IOSCHED)))
				def = " (default)";

	printk(KERN_INFO "io scheduler %s registered%s\n", e->elevator_name,
								def);
	return 0;
}
EXPORT_SYMBOL_GPL(elv_register);

void elv_unregister(struct elevator_type *e)
{
	/* unregister */
	spin_lock(&elv_list_lock);
	list_del_init(&e->list);
	spin_unlock(&elv_list_lock);

	/*
	 * Destroy icq_cache if it exists.  icq's are RCU managed.  Make
	 * sure all RCU operations are complete before proceeding.
	 */
	if (e->icq_cache) {
		rcu_barrier();
		kmem_cache_destroy(e->icq_cache);
		e->icq_cache = NULL;
	}
}
EXPORT_SYMBOL_GPL(elv_unregister);

static int elevator_switch_mq(struct request_queue *q,
			      struct elevator_type *new_e)
{
	int ret;

	blk_mq_freeze_queue(q);

	if (q->elevator) {
		if (q->elevator->registered)
			elv_unregister_queue(q);
		ioc_clear_queue(q);
		elevator_exit(q, q->elevator);
	}

	ret = blk_mq_init_sched(q, new_e);
	if (ret)
		goto out;

	if (new_e) {
		ret = elv_register_queue(q);
		if (ret) {
			elevator_exit(q, q->elevator);
			goto out;
		}
	}

	if (new_e)
		blk_add_trace_msg(q, "elv switch: %s", new_e->elevator_name);
	else
		blk_add_trace_msg(q, "elv switch: none");

out:
	blk_mq_unfreeze_queue(q);
	return ret;
}

/*
 * switch to new_e io scheduler. be careful not to introduce deadlocks -
 * we don't free the old io scheduler, before we have allocated what we
 * need for the new one. this way we have a chance of going back to the old
 * one, if the new one fails init for some reason.
 */
static int elevator_switch(struct request_queue *q, struct elevator_type *new_e)
{
	struct elevator_queue *old = q->elevator;
	bool old_registered = false;
	int err;

	if (q->mq_ops)
		return elevator_switch_mq(q, new_e);

	/*
	 * Turn on BYPASS and drain all requests w/ elevator private data.
	 * Block layer doesn't call into a quiesced elevator - all requests
	 * are directly put on the dispatch list without elevator data
	 * using INSERT_BACK.  All requests have SOFTBARRIER set and no
	 * merge happens either.
	 */
	if (old) {
		old_registered = old->registered;

		blk_queue_bypass_start(q);

		/* unregister and clear all auxiliary data of the old elevator */
		if (old_registered)
			elv_unregister_queue(q);

		ioc_clear_queue(q);
	}

	/* allocate, init and register new elevator */
	err = new_e->ops.sq.elevator_init_fn(q, new_e);
	if (err)
		goto fail_init;

	err = elv_register_queue(q);
	if (err)
		goto fail_register;

	/* done, kill the old one and finish */
	if (old) {
		elevator_exit(q, old);
		blk_queue_bypass_end(q);
	}

	blk_add_trace_msg(q, "elv switch: %s", new_e->elevator_name);

	return 0;

fail_register:
	elevator_exit(q, q->elevator);
fail_init:
	/* switch failed, restore and re-register old elevator */
	if (old) {
		q->elevator = old;
		elv_register_queue(q);
		blk_queue_bypass_end(q);
	}

	return err;
}

/*
 * Switch this queue to the given IO scheduler.
 */
static int __elevator_change(struct request_queue *q, const char *name)
{
	char elevator_name[ELV_NAME_MAX];
	struct elevator_type *e;

	/* Make sure queue is not in the middle of being removed */
	if (!test_bit(QUEUE_FLAG_REGISTERED, &q->queue_flags))
		return -ENOENT;

	/*
	 * Special case for mq, turn off scheduling
	 */
	if (q->mq_ops && !strncmp(name, "none", 4))
		return elevator_switch(q, NULL);

	strlcpy(elevator_name, name, sizeof(elevator_name));
	e = elevator_get(q, strstrip(elevator_name), true);
	if (!e)
		return -EINVAL;

	if (q->elevator && elevator_match(q->elevator->type, elevator_name)) {
		elevator_put(e);
		return 0;
	}

	return elevator_switch(q, e);
}

static inline bool elv_support_iosched(struct request_queue *q)
{
	if (q->mq_ops && q->tag_set && (q->tag_set->flags &
				BLK_MQ_F_NO_SCHED))
		return false;
	return true;
}

ssize_t elv_iosched_store(struct request_queue *q, const char *name,
			  size_t count)
{
	int ret;

	if (!(q->mq_ops || q->request_fn) || !elv_support_iosched(q))
		return count;

	ret = __elevator_change(q, name);
	if (!ret)
		return count;

	return ret;
}

ssize_t elv_iosched_show(struct request_queue *q, char *name)
{
	struct elevator_queue *e = q->elevator;
	struct elevator_type *elv = NULL;
	struct elevator_type *__e;
	bool uses_mq = q->mq_ops != NULL;
	int len = 0;

	if (!queue_is_rq_based(q))
		return sprintf(name, "none\n");

	if (!q->elevator)
		len += sprintf(name+len, "[none] ");
	else
		elv = e->type;

	spin_lock(&elv_list_lock);
	list_for_each_entry(__e, &elv_list, list) {
		if (elv && elevator_match(elv, __e->elevator_name) &&
		    (__e->uses_mq == uses_mq)) {
			len += sprintf(name+len, "[%s] ", elv->elevator_name);
			continue;
		}
		if (__e->uses_mq && q->mq_ops && elv_support_iosched(q))
			len += sprintf(name+len, "%s ", __e->elevator_name);
		else if (!__e->uses_mq && !q->mq_ops)
			len += sprintf(name+len, "%s ", __e->elevator_name);
	}
	spin_unlock(&elv_list_lock);

	if (q->mq_ops && q->elevator)
		len += sprintf(name+len, "none");

	len += sprintf(len+name, "\n");
	return len;
}
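
/*
 * elv_iosched_show()/elv_iosched_store() back the per-queue "scheduler"
 * sysfs attribute. For example (assuming a block device named sda):
 *
 *   # cat /sys/block/sda/queue/scheduler
 *   [mq-deadline] none
 *   # echo none > /sys/block/sda/queue/scheduler
 *
 * The currently active scheduler is shown in brackets, as built above.
 */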

struct request *elv_rb_former_request(struct request_queue *q,
				      struct request *rq)
{
	struct rb_node *rbprev = rb_prev(&rq->rb_node);

	if (rbprev)
		return rb_entry_rq(rbprev);

	return NULL;
}
EXPORT_SYMBOL(elv_rb_former_request);

struct request *elv_rb_latter_request(struct request_queue *q,
				      struct request *rq)
{
	struct rb_node *rbnext = rb_next(&rq->rb_node);

	if (rbnext)
		return rb_entry_rq(rbnext);

	return NULL;
}
EXPORT_SYMBOL(elv_rb_latter_request);