elevator.c 24.4 KB
Newer Older
L
Linus Torvalds 已提交
1 2 3 4 5
/*
 *  Block device elevator/IO-scheduler.
 *
 *  Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE
 *
6
 * 30042000 Jens Axboe <axboe@kernel.dk> :
L
Linus Torvalds 已提交
7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33
 *
 * Split the elevator a bit so that it is possible to choose a different
 * one or even write a new "plug in". There are three pieces:
 * - elevator_fn, inserts a new request in the queue list
 * - elevator_merge_fn, decides whether a new buffer can be merged with
 *   an existing request
 * - elevator_dequeue_fn, called when a request is taken off the active list
 *
 * 20082000 Dave Jones <davej@suse.de> :
 * Removed tests for max-bomb-segments, which was breaking elvtune
 *  when run without -bN
 *
 * Jens:
 * - Rework again to work with bio instead of buffer_heads
 * - loose bi_dev comparisons, partition handling is right now
 * - completely modularize elevator setup and teardown
 *
 */
#include <linux/kernel.h>
#include <linux/fs.h>
#include <linux/blkdev.h>
#include <linux/elevator.h>
#include <linux/bio.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/compiler.h>
T
Tejun Heo 已提交
34
#include <linux/delay.h>
35
#include <linux/blktrace_api.h>
36
#include <linux/hash.h>
37
#include <linux/uaccess.h>
L
Linus Torvalds 已提交
38

39 40
#include <trace/events/block.h>

J
Jens Axboe 已提交
41 42
#include "blk.h"

L
Linus Torvalds 已提交
43 44 45
static DEFINE_SPINLOCK(elv_list_lock);
static LIST_HEAD(elv_list);

46 47 48 49 50
/*
 * Merge hash stuff.
 */
static const int elv_hash_shift = 6;
#define ELV_HASH_BLOCK(sec)	((sec) >> 3)
51 52
#define ELV_HASH_FN(sec)	\
		(hash_long(ELV_HASH_BLOCK((sec)), elv_hash_shift))
53
#define ELV_HASH_ENTRIES	(1 << elv_hash_shift)
54
#define rq_hash_key(rq)		(blk_rq_pos(rq) + blk_rq_sectors(rq))
55

56 57 58 59 60 61
/*
 * Query io scheduler to see if the current process issuing bio may be
 * merged with rq.
 */
static int elv_iosched_allow_merge(struct request *rq, struct bio *bio)
{
62
	struct request_queue *q = rq->q;
J
Jens Axboe 已提交
63
	struct elevator_queue *e = q->elevator;
64 65 66 67 68 69 70

	if (e->ops->elevator_allow_merge_fn)
		return e->ops->elevator_allow_merge_fn(q, rq, bio);

	return 1;
}

L
Linus Torvalds 已提交
71 72 73
/*
 * can we safely merge with this request?
 */
74
int elv_rq_merge_ok(struct request *rq, struct bio *bio)
L
Linus Torvalds 已提交
75 76 77 78
{
	if (!rq_mergeable(rq))
		return 0;

79 80 81
	/*
	 * Don't merge file system requests and discard requests
	 */
82
	if ((bio->bi_rw & REQ_DISCARD) != (rq->bio->bi_rw & REQ_DISCARD))
83 84
		return 0;

A
Adrian Hunter 已提交
85 86 87 88 89 90
	/*
	 * Don't merge discard requests and secure discard requests
	 */
	if ((bio->bi_rw & REQ_SECURE) != (rq->bio->bi_rw & REQ_SECURE))
		return 0;

L
Linus Torvalds 已提交
91 92 93 94 95 96 97
	/*
	 * different data direction or already started, don't merge
	 */
	if (bio_data_dir(bio) != rq_data_dir(rq))
		return 0;

	/*
98
	 * must be same device and not a special request
L
Linus Torvalds 已提交
99
	 */
100
	if (rq->rq_disk != bio->bi_bdev->bd_disk || rq->special)
101 102
		return 0;

103 104 105 106 107 108
	/*
	 * only merge integrity protected bio into ditto rq
	 */
	if (bio_integrity(bio) != blk_integrity_rq(rq))
		return 0;

109 110
	if (!elv_iosched_allow_merge(rq, bio))
		return 0;
L
Linus Torvalds 已提交
111

112
	return 1;
L
Linus Torvalds 已提交
113 114 115
}
EXPORT_SYMBOL(elv_rq_merge_ok);

116
int elv_try_merge(struct request *__rq, struct bio *bio)
L
Linus Torvalds 已提交
117 118 119 120 121 122 123
{
	int ret = ELEVATOR_NO_MERGE;

	/*
	 * we can merge and sequence is ok, check if it's possible
	 */
	if (elv_rq_merge_ok(__rq, bio)) {
124
		if (blk_rq_pos(__rq) + blk_rq_sectors(__rq) == bio->bi_sector)
L
Linus Torvalds 已提交
125
			ret = ELEVATOR_BACK_MERGE;
126
		else if (blk_rq_pos(__rq) - bio_sectors(bio) == bio->bi_sector)
L
Linus Torvalds 已提交
127 128 129 130 131 132 133 134
			ret = ELEVATOR_FRONT_MERGE;
	}

	return ret;
}

static struct elevator_type *elevator_find(const char *name)
{
135
	struct elevator_type *e;
L
Linus Torvalds 已提交
136

137
	list_for_each_entry(e, &elv_list, list) {
138 139
		if (!strcmp(e->elevator_name, name))
			return e;
L
Linus Torvalds 已提交
140 141
	}

142
	return NULL;
L
Linus Torvalds 已提交
143 144 145 146 147 148 149 150 151
}

static void elevator_put(struct elevator_type *e)
{
	module_put(e->elevator_owner);
}

static struct elevator_type *elevator_get(const char *name)
{
152
	struct elevator_type *e;
L
Linus Torvalds 已提交
153

154
	spin_lock(&elv_list_lock);
155 156

	e = elevator_find(name);
157 158
	if (!e) {
		spin_unlock(&elv_list_lock);
K
Kees Cook 已提交
159
		request_module("%s-iosched", name);
160 161 162 163
		spin_lock(&elv_list_lock);
		e = elevator_find(name);
	}

164 165 166
	if (e && !try_module_get(e->elevator_owner))
		e = NULL;

167
	spin_unlock(&elv_list_lock);
L
Linus Torvalds 已提交
168 169 170 171

	return e;
}

172 173
static void *elevator_init_queue(struct request_queue *q,
				 struct elevator_queue *eq)
L
Linus Torvalds 已提交
174
{
175
	return eq->ops->elevator_init_fn(q);
J
Jens Axboe 已提交
176
}
L
Linus Torvalds 已提交
177

178
static void elevator_attach(struct request_queue *q, struct elevator_queue *eq,
J
Jens Axboe 已提交
179 180
			   void *data)
{
L
Linus Torvalds 已提交
181
	q->elevator = eq;
J
Jens Axboe 已提交
182
	eq->elevator_data = data;
L
Linus Torvalds 已提交
183 184
}

185
static char chosen_elevator[ELV_NAME_MAX];
L
Linus Torvalds 已提交
186

187
static int __init elevator_setup(char *str)
L
Linus Torvalds 已提交
188
{
189 190 191 192
	/*
	 * Be backwards-compatible with previous kernels, so users
	 * won't get the wrong elevator.
	 */
193
	strncpy(chosen_elevator, str, sizeof(chosen_elevator) - 1);
194
	return 1;
L
Linus Torvalds 已提交
195 196 197 198
}

__setup("elevator=", elevator_setup);

199 200
static struct kobj_type elv_ktype;

J
Jens Axboe 已提交
201
static struct elevator_queue *elevator_alloc(struct request_queue *q,
202
				  struct elevator_type *e)
203
{
J
Jens Axboe 已提交
204
	struct elevator_queue *eq;
205 206
	int i;

J
Jens Axboe 已提交
207
	eq = kmalloc_node(sizeof(*eq), GFP_KERNEL | __GFP_ZERO, q->node);
208 209 210 211 212
	if (unlikely(!eq))
		goto err;

	eq->ops = &e->ops;
	eq->elevator_type = e;
213
	kobject_init(&eq->kobj, &elv_ktype);
214 215
	mutex_init(&eq->sysfs_lock);

216 217
	eq->hash = kmalloc_node(sizeof(struct hlist_head) * ELV_HASH_ENTRIES,
					GFP_KERNEL, q->node);
218 219 220 221 222 223
	if (!eq->hash)
		goto err;

	for (i = 0; i < ELV_HASH_ENTRIES; i++)
		INIT_HLIST_HEAD(&eq->hash[i]);

224
	return eq;
225 226 227 228
err:
	kfree(eq);
	elevator_put(e);
	return NULL;
229 230 231 232
}

static void elevator_release(struct kobject *kobj)
{
J
Jens Axboe 已提交
233
	struct elevator_queue *e;
234

J
Jens Axboe 已提交
235
	e = container_of(kobj, struct elevator_queue, kobj);
236
	elevator_put(e->elevator_type);
237
	kfree(e->hash);
238 239 240
	kfree(e);
}

241
int elevator_init(struct request_queue *q, char *name)
L
Linus Torvalds 已提交
242 243 244
{
	struct elevator_type *e = NULL;
	struct elevator_queue *eq;
J
Jens Axboe 已提交
245
	void *data;
L
Linus Torvalds 已提交
246

247 248 249
	if (unlikely(q->elevator))
		return 0;

T
Tejun Heo 已提交
250 251 252 253 254
	INIT_LIST_HEAD(&q->queue_head);
	q->last_merge = NULL;
	q->end_sector = 0;
	q->boundary_rq = NULL;

255 256 257 258 259
	if (name) {
		e = elevator_get(name);
		if (!e)
			return -EINVAL;
	}
L
Linus Torvalds 已提交
260

261 262 263 264 265 266
	if (!e && *chosen_elevator) {
		e = elevator_get(chosen_elevator);
		if (!e)
			printk(KERN_ERR "I/O scheduler %s not found\n",
							chosen_elevator);
	}
267

268 269 270 271 272 273 274 275
	if (!e) {
		e = elevator_get(CONFIG_DEFAULT_IOSCHED);
		if (!e) {
			printk(KERN_ERR
				"Default I/O scheduler not found. " \
				"Using noop.\n");
			e = elevator_get("noop");
		}
276 277
	}

278
	eq = elevator_alloc(q, e);
279
	if (!eq)
L
Linus Torvalds 已提交
280 281
		return -ENOMEM;

J
Jens Axboe 已提交
282 283
	data = elevator_init_queue(q, eq);
	if (!data) {
284
		kobject_put(&eq->kobj);
J
Jens Axboe 已提交
285 286
		return -ENOMEM;
	}
L
Linus Torvalds 已提交
287

J
Jens Axboe 已提交
288
	elevator_attach(q, eq, data);
289
	return 0;
L
Linus Torvalds 已提交
290
}
291 292
EXPORT_SYMBOL(elevator_init);

J
Jens Axboe 已提交
293
void elevator_exit(struct elevator_queue *e)
L
Linus Torvalds 已提交
294
{
295
	mutex_lock(&e->sysfs_lock);
L
Linus Torvalds 已提交
296 297
	if (e->ops->elevator_exit_fn)
		e->ops->elevator_exit_fn(e);
298 299
	e->ops = NULL;
	mutex_unlock(&e->sysfs_lock);
L
Linus Torvalds 已提交
300

301
	kobject_put(&e->kobj);
L
Linus Torvalds 已提交
302
}
303 304
EXPORT_SYMBOL(elevator_exit);

305 306 307 308 309
static inline void __elv_rqhash_del(struct request *rq)
{
	hlist_del_init(&rq->hash);
}

310
static void elv_rqhash_del(struct request_queue *q, struct request *rq)
311 312 313 314 315
{
	if (ELV_ON_HASH(rq))
		__elv_rqhash_del(rq);
}

316
static void elv_rqhash_add(struct request_queue *q, struct request *rq)
317
{
J
Jens Axboe 已提交
318
	struct elevator_queue *e = q->elevator;
319 320 321 322 323

	BUG_ON(ELV_ON_HASH(rq));
	hlist_add_head(&rq->hash, &e->hash[ELV_HASH_FN(rq_hash_key(rq))]);
}

324
static void elv_rqhash_reposition(struct request_queue *q, struct request *rq)
325 326 327 328 329
{
	__elv_rqhash_del(rq);
	elv_rqhash_add(q, rq);
}

330
static struct request *elv_rqhash_find(struct request_queue *q, sector_t offset)
331
{
J
Jens Axboe 已提交
332
	struct elevator_queue *e = q->elevator;
333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351
	struct hlist_head *hash_list = &e->hash[ELV_HASH_FN(offset)];
	struct hlist_node *entry, *next;
	struct request *rq;

	hlist_for_each_entry_safe(rq, entry, next, hash_list, hash) {
		BUG_ON(!ELV_ON_HASH(rq));

		if (unlikely(!rq_mergeable(rq))) {
			__elv_rqhash_del(rq);
			continue;
		}

		if (rq_hash_key(rq) == offset)
			return rq;
	}

	return NULL;
}

352 353 354 355
/*
 * RB-tree support functions for inserting/lookup/removal of requests
 * in a sorted RB tree.
 */
356
void elv_rb_add(struct rb_root *root, struct request *rq)
357 358 359 360 361 362 363 364 365
{
	struct rb_node **p = &root->rb_node;
	struct rb_node *parent = NULL;
	struct request *__rq;

	while (*p) {
		parent = *p;
		__rq = rb_entry(parent, struct request, rb_node);

366
		if (blk_rq_pos(rq) < blk_rq_pos(__rq))
367
			p = &(*p)->rb_left;
368
		else if (blk_rq_pos(rq) >= blk_rq_pos(__rq))
369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392
			p = &(*p)->rb_right;
	}

	rb_link_node(&rq->rb_node, parent, p);
	rb_insert_color(&rq->rb_node, root);
}
EXPORT_SYMBOL(elv_rb_add);

void elv_rb_del(struct rb_root *root, struct request *rq)
{
	BUG_ON(RB_EMPTY_NODE(&rq->rb_node));
	rb_erase(&rq->rb_node, root);
	RB_CLEAR_NODE(&rq->rb_node);
}
EXPORT_SYMBOL(elv_rb_del);

struct request *elv_rb_find(struct rb_root *root, sector_t sector)
{
	struct rb_node *n = root->rb_node;
	struct request *rq;

	while (n) {
		rq = rb_entry(n, struct request, rb_node);

393
		if (sector < blk_rq_pos(rq))
394
			n = n->rb_left;
395
		else if (sector > blk_rq_pos(rq))
396 397 398 399 400 401 402 403 404
			n = n->rb_right;
		else
			return rq;
	}

	return NULL;
}
EXPORT_SYMBOL(elv_rb_find);

405 406
/*
 * Insert rq into dispatch queue of q.  Queue lock must be held on
U
Uwe Kleine-König 已提交
407
 * entry.  rq is sort instead into the dispatch queue. To be used by
408
 * specific elevators.
409
 */
410
void elv_dispatch_sort(struct request_queue *q, struct request *rq)
411 412 413
{
	sector_t boundary;
	struct list_head *entry;
414
	int stop_flags;
415

416 417
	if (q->last_merge == rq)
		q->last_merge = NULL;
418 419 420

	elv_rqhash_del(q, rq);

421
	q->nr_sorted--;
422

J
Jens Axboe 已提交
423
	boundary = q->end_sector;
C
Christoph Hellwig 已提交
424
	stop_flags = REQ_SOFTBARRIER | REQ_STARTED;
425 426 427
	list_for_each_prev(entry, &q->queue_head) {
		struct request *pos = list_entry_rq(entry);

428 429
		if ((rq->cmd_flags & REQ_DISCARD) !=
		    (pos->cmd_flags & REQ_DISCARD))
430
			break;
431 432
		if (rq_data_dir(rq) != rq_data_dir(pos))
			break;
433
		if (pos->cmd_flags & stop_flags)
434
			break;
435 436
		if (blk_rq_pos(rq) >= boundary) {
			if (blk_rq_pos(pos) < boundary)
437 438
				continue;
		} else {
439
			if (blk_rq_pos(pos) >= boundary)
440 441
				break;
		}
442
		if (blk_rq_pos(rq) >= blk_rq_pos(pos))
443 444 445 446 447
			break;
	}

	list_add(&rq->queuelist, entry);
}
448 449
EXPORT_SYMBOL(elv_dispatch_sort);

450
/*
451 452 453
 * Insert rq into dispatch queue of q.  Queue lock must be held on
 * entry.  rq is added to the back of the dispatch queue. To be used by
 * specific elevators.
454 455 456 457 458 459 460 461 462 463 464 465 466 467
 */
void elv_dispatch_add_tail(struct request_queue *q, struct request *rq)
{
	if (q->last_merge == rq)
		q->last_merge = NULL;

	elv_rqhash_del(q, rq);

	q->nr_sorted--;

	q->end_sector = rq_end_sector(rq);
	q->boundary_rq = rq;
	list_add_tail(&rq->queuelist, &q->queue_head);
}
468 469
EXPORT_SYMBOL(elv_dispatch_add_tail);

470
int elv_merge(struct request_queue *q, struct request **req, struct bio *bio)
L
Linus Torvalds 已提交
471
{
J
Jens Axboe 已提交
472
	struct elevator_queue *e = q->elevator;
473
	struct request *__rq;
474 475
	int ret;

476 477 478 479 480 481 482 483 484
	/*
	 * Levels of merges:
	 * 	nomerges:  No merges at all attempted
	 * 	noxmerges: Only simple one-hit cache try
	 * 	merges:	   All merge tries attempted
	 */
	if (blk_queue_nomerges(q))
		return ELEVATOR_NO_MERGE;

485 486 487
	/*
	 * First try one-hit cache.
	 */
488 489 490 491 492 493 494
	if (q->last_merge) {
		ret = elv_try_merge(q->last_merge, bio);
		if (ret != ELEVATOR_NO_MERGE) {
			*req = q->last_merge;
			return ret;
		}
	}
L
Linus Torvalds 已提交
495

496
	if (blk_queue_noxmerges(q))
497 498
		return ELEVATOR_NO_MERGE;

499 500 501 502 503 504 505 506 507
	/*
	 * See if our hash lookup can find a potential backmerge.
	 */
	__rq = elv_rqhash_find(q, bio->bi_sector);
	if (__rq && elv_rq_merge_ok(__rq, bio)) {
		*req = __rq;
		return ELEVATOR_BACK_MERGE;
	}

L
Linus Torvalds 已提交
508 509 510 511 512 513
	if (e->ops->elevator_merge_fn)
		return e->ops->elevator_merge_fn(q, req, bio);

	return ELEVATOR_NO_MERGE;
}

514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547
/*
 * Attempt to do an insertion back merge. Only check for the case where
 * we can append 'rq' to an existing request, so we can throw 'rq' away
 * afterwards.
 *
 * Returns true if we merged, false otherwise
 */
static bool elv_attempt_insert_merge(struct request_queue *q,
				     struct request *rq)
{
	struct request *__rq;

	if (blk_queue_nomerges(q))
		return false;

	/*
	 * First try one-hit cache.
	 */
	if (q->last_merge && blk_attempt_req_merge(q, q->last_merge, rq))
		return true;

	if (blk_queue_noxmerges(q))
		return false;

	/*
	 * See if our hash lookup can find a potential backmerge.
	 */
	__rq = elv_rqhash_find(q, blk_rq_pos(rq));
	if (__rq && blk_attempt_req_merge(q, __rq, rq))
		return true;

	return false;
}

548
void elv_merged_request(struct request_queue *q, struct request *rq, int type)
L
Linus Torvalds 已提交
549
{
J
Jens Axboe 已提交
550
	struct elevator_queue *e = q->elevator;
L
Linus Torvalds 已提交
551 552

	if (e->ops->elevator_merged_fn)
553
		e->ops->elevator_merged_fn(q, rq, type);
554

555 556
	if (type == ELEVATOR_BACK_MERGE)
		elv_rqhash_reposition(q, rq);
557

558
	q->last_merge = rq;
L
Linus Torvalds 已提交
559 560
}

561
void elv_merge_requests(struct request_queue *q, struct request *rq,
L
Linus Torvalds 已提交
562 563
			     struct request *next)
{
J
Jens Axboe 已提交
564
	struct elevator_queue *e = q->elevator;
565
	const int next_sorted = next->cmd_flags & REQ_SORTED;
L
Linus Torvalds 已提交
566

567
	if (next_sorted && e->ops->elevator_merge_req_fn)
L
Linus Torvalds 已提交
568
		e->ops->elevator_merge_req_fn(q, rq, next);
569

570 571
	elv_rqhash_reposition(q, rq);

572 573 574 575 576
	if (next_sorted) {
		elv_rqhash_del(q, next);
		q->nr_sorted--;
	}

577
	q->last_merge = rq;
L
Linus Torvalds 已提交
578 579
}

D
Divyesh Shah 已提交
580 581 582 583 584 585 586 587 588
void elv_bio_merged(struct request_queue *q, struct request *rq,
			struct bio *bio)
{
	struct elevator_queue *e = q->elevator;

	if (e->ops->elevator_bio_merged_fn)
		e->ops->elevator_bio_merged_fn(q, rq, bio);
}

589
void elv_requeue_request(struct request_queue *q, struct request *rq)
L
Linus Torvalds 已提交
590 591 592 593 594
{
	/*
	 * it already went through dequeue, we need to decrement the
	 * in_flight count again
	 */
595
	if (blk_account_rq(rq)) {
596
		q->in_flight[rq_is_sync(rq)]--;
597
		if (rq->cmd_flags & REQ_SORTED)
598
			elv_deactivate_rq(q, rq);
599
	}
L
Linus Torvalds 已提交
600

601
	rq->cmd_flags &= ~REQ_STARTED;
L
Linus Torvalds 已提交
602

603
	__elv_add_request(q, rq, ELEVATOR_INSERT_REQUEUE);
L
Linus Torvalds 已提交
604 605
}

606
void elv_drain_elevator(struct request_queue *q)
607 608 609 610 611 612 613 614 615 616 617 618 619
{
	static int printed;
	while (q->elevator->ops->elevator_dispatch_fn(q, 1))
		;
	if (q->nr_sorted == 0)
		return;
	if (printed++ < 10) {
		printk(KERN_ERR "%s: forced dispatching is broken "
		       "(nr_sorted=%u), please report this\n",
		       q->elevator->elevator_type->elevator_name, q->nr_sorted);
	}
}

J
Jens Axboe 已提交
620 621 622
/*
 * Call with queue lock held, interrupts disabled
 */
J
Jens Axboe 已提交
623
void elv_quiesce_start(struct request_queue *q)
J
Jens Axboe 已提交
624
{
625 626 627
	if (!q->elevator)
		return;

J
Jens Axboe 已提交
628 629 630 631 632 633 634
	queue_flag_set(QUEUE_FLAG_ELVSWITCH, q);

	/*
	 * make sure we don't have any requests in flight
	 */
	elv_drain_elevator(q);
	while (q->rq.elvpriv) {
635
		__blk_run_queue(q);
J
Jens Axboe 已提交
636 637 638 639 640 641 642
		spin_unlock_irq(q->queue_lock);
		msleep(10);
		spin_lock_irq(q->queue_lock);
		elv_drain_elevator(q);
	}
}

J
Jens Axboe 已提交
643
void elv_quiesce_end(struct request_queue *q)
J
Jens Axboe 已提交
644 645 646 647
{
	queue_flag_clear(QUEUE_FLAG_ELVSWITCH, q);
}

648
void __elv_add_request(struct request_queue *q, struct request *rq, int where)
L
Linus Torvalds 已提交
649
{
650
	trace_block_rq_insert(q, rq);
651

L
Linus Torvalds 已提交
652 653
	rq->q = q;

654 655 656 657 658 659 660 661
	if (rq->cmd_flags & REQ_SOFTBARRIER) {
		/* barriers are scheduling boundary, update end_sector */
		if (rq->cmd_type == REQ_TYPE_FS ||
		    (rq->cmd_flags & REQ_DISCARD)) {
			q->end_sector = rq_end_sector(rq);
			q->boundary_rq = rq;
		}
	} else if (!(rq->cmd_flags & REQ_ELVPRIV) &&
662 663
		    (where == ELEVATOR_INSERT_SORT ||
		     where == ELEVATOR_INSERT_SORT_MERGE))
664 665
		where = ELEVATOR_INSERT_BACK;

666
	switch (where) {
667
	case ELEVATOR_INSERT_REQUEUE:
668
	case ELEVATOR_INSERT_FRONT:
669
		rq->cmd_flags |= REQ_SOFTBARRIER;
670 671 672 673
		list_add(&rq->queuelist, &q->queue_head);
		break;

	case ELEVATOR_INSERT_BACK:
674
		rq->cmd_flags |= REQ_SOFTBARRIER;
675
		elv_drain_elevator(q);
676 677 678 679 680 681 682 683 684 685 686
		list_add_tail(&rq->queuelist, &q->queue_head);
		/*
		 * We kick the queue here for the following reasons.
		 * - The elevator might have returned NULL previously
		 *   to delay requests and returned them now.  As the
		 *   queue wasn't empty before this request, ll_rw_blk
		 *   won't run the queue on return, resulting in hang.
		 * - Usually, back inserted requests won't be merged
		 *   with anything.  There's no point in delaying queue
		 *   processing.
		 */
687
		__blk_run_queue(q);
688 689
		break;

690 691 692 693 694 695 696 697
	case ELEVATOR_INSERT_SORT_MERGE:
		/*
		 * If we succeed in merging this request with one in the
		 * queue already, we are done - rq has now been freed,
		 * so no need to do anything further.
		 */
		if (elv_attempt_insert_merge(q, rq))
			break;
698
	case ELEVATOR_INSERT_SORT:
699 700
		BUG_ON(rq->cmd_type != REQ_TYPE_FS &&
		       !(rq->cmd_flags & REQ_DISCARD));
701
		rq->cmd_flags |= REQ_SORTED;
702
		q->nr_sorted++;
703 704 705 706 707 708
		if (rq_mergeable(rq)) {
			elv_rqhash_add(q, rq);
			if (!q->last_merge)
				q->last_merge = rq;
		}

709 710 711 712 713 714
		/*
		 * Some ioscheds (cfq) run q->request_fn directly, so
		 * rq cannot be accessed after calling
		 * elevator_add_req_fn.
		 */
		q->elevator->ops->elevator_add_req_fn(q, rq);
715 716
		break;

717 718 719 720
	case ELEVATOR_INSERT_FLUSH:
		rq->cmd_flags |= REQ_SOFTBARRIER;
		blk_insert_flush(rq);
		break;
721 722
	default:
		printk(KERN_ERR "%s: bad insertion point %d\n",
723
		       __func__, where);
724 725
		BUG();
	}
L
Linus Torvalds 已提交
726
}
727 728
EXPORT_SYMBOL(__elv_add_request);

J
Jens Axboe 已提交
729
void elv_add_request(struct request_queue *q, struct request *rq, int where)
L
Linus Torvalds 已提交
730 731 732 733
{
	unsigned long flags;

	spin_lock_irqsave(q->queue_lock, flags);
J
Jens Axboe 已提交
734
	__elv_add_request(q, rq, where);
L
Linus Torvalds 已提交
735 736
	spin_unlock_irqrestore(q->queue_lock, flags);
}
737 738
EXPORT_SYMBOL(elv_add_request);

739
struct request *elv_latter_request(struct request_queue *q, struct request *rq)
L
Linus Torvalds 已提交
740
{
J
Jens Axboe 已提交
741
	struct elevator_queue *e = q->elevator;
L
Linus Torvalds 已提交
742 743 744 745 746 747

	if (e->ops->elevator_latter_req_fn)
		return e->ops->elevator_latter_req_fn(q, rq);
	return NULL;
}

748
struct request *elv_former_request(struct request_queue *q, struct request *rq)
L
Linus Torvalds 已提交
749
{
J
Jens Axboe 已提交
750
	struct elevator_queue *e = q->elevator;
L
Linus Torvalds 已提交
751 752 753 754 755 756

	if (e->ops->elevator_former_req_fn)
		return e->ops->elevator_former_req_fn(q, rq);
	return NULL;
}

757
int elv_set_request(struct request_queue *q, struct request *rq, gfp_t gfp_mask)
L
Linus Torvalds 已提交
758
{
J
Jens Axboe 已提交
759
	struct elevator_queue *e = q->elevator;
L
Linus Torvalds 已提交
760 761

	if (e->ops->elevator_set_req_fn)
762
		return e->ops->elevator_set_req_fn(q, rq, gfp_mask);
L
Linus Torvalds 已提交
763

764
	rq->elevator_private[0] = NULL;
L
Linus Torvalds 已提交
765 766 767
	return 0;
}

768
void elv_put_request(struct request_queue *q, struct request *rq)
L
Linus Torvalds 已提交
769
{
J
Jens Axboe 已提交
770
	struct elevator_queue *e = q->elevator;
L
Linus Torvalds 已提交
771 772

	if (e->ops->elevator_put_req_fn)
773
		e->ops->elevator_put_req_fn(rq);
L
Linus Torvalds 已提交
774 775
}

776
int elv_may_queue(struct request_queue *q, int rw)
L
Linus Torvalds 已提交
777
{
J
Jens Axboe 已提交
778
	struct elevator_queue *e = q->elevator;
L
Linus Torvalds 已提交
779 780

	if (e->ops->elevator_may_queue_fn)
781
		return e->ops->elevator_may_queue_fn(q, rw);
L
Linus Torvalds 已提交
782 783 784 785

	return ELV_MQUEUE_MAY;
}

786 787 788 789
void elv_abort_queue(struct request_queue *q)
{
	struct request *rq;

790 791
	blk_abort_flushes(q);

792 793 794
	while (!list_empty(&q->queue_head)) {
		rq = list_entry_rq(q->queue_head.next);
		rq->cmd_flags |= REQ_QUIET;
795
		trace_block_rq_abort(q, rq);
796 797 798 799 800
		/*
		 * Mark this request as started so we don't trigger
		 * any debug logic in the end I/O path.
		 */
		blk_start_request(rq);
801
		__blk_end_request_all(rq, -EIO);
802 803 804 805
	}
}
EXPORT_SYMBOL(elv_abort_queue);

806
void elv_completed_request(struct request_queue *q, struct request *rq)
L
Linus Torvalds 已提交
807
{
J
Jens Axboe 已提交
808
	struct elevator_queue *e = q->elevator;
L
Linus Torvalds 已提交
809 810 811 812

	/*
	 * request is released from the driver, io must be done
	 */
813
	if (blk_account_rq(rq)) {
814
		q->in_flight[rq_is_sync(rq)]--;
815 816
		if ((rq->cmd_flags & REQ_SORTED) &&
		    e->ops->elevator_completed_req_fn)
817 818
			e->ops->elevator_completed_req_fn(q, rq);
	}
L
Linus Torvalds 已提交
819 820
}

821 822 823 824
#define to_elv(atr) container_of((atr), struct elv_fs_entry, attr)

static ssize_t
elv_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
L
Linus Torvalds 已提交
825
{
826
	struct elv_fs_entry *entry = to_elv(attr);
J
Jens Axboe 已提交
827
	struct elevator_queue *e;
828 829 830 831 832
	ssize_t error;

	if (!entry->show)
		return -EIO;

J
Jens Axboe 已提交
833
	e = container_of(kobj, struct elevator_queue, kobj);
834 835 836 837 838
	mutex_lock(&e->sysfs_lock);
	error = e->ops ? entry->show(e, page) : -ENOENT;
	mutex_unlock(&e->sysfs_lock);
	return error;
}
L
Linus Torvalds 已提交
839

840 841 842 843 844
static ssize_t
elv_attr_store(struct kobject *kobj, struct attribute *attr,
	       const char *page, size_t length)
{
	struct elv_fs_entry *entry = to_elv(attr);
J
Jens Axboe 已提交
845
	struct elevator_queue *e;
846
	ssize_t error;
L
Linus Torvalds 已提交
847

848 849
	if (!entry->store)
		return -EIO;
L
Linus Torvalds 已提交
850

J
Jens Axboe 已提交
851
	e = container_of(kobj, struct elevator_queue, kobj);
852 853 854 855 856 857
	mutex_lock(&e->sysfs_lock);
	error = e->ops ? entry->store(e, page, length) : -ENOENT;
	mutex_unlock(&e->sysfs_lock);
	return error;
}

858
static const struct sysfs_ops elv_sysfs_ops = {
859 860 861 862 863 864 865 866 867 868 869
	.show	= elv_attr_show,
	.store	= elv_attr_store,
};

static struct kobj_type elv_ktype = {
	.sysfs_ops	= &elv_sysfs_ops,
	.release	= elevator_release,
};

int elv_register_queue(struct request_queue *q)
{
J
Jens Axboe 已提交
870
	struct elevator_queue *e = q->elevator;
871 872
	int error;

873
	error = kobject_add(&e->kobj, &q->kobj, "%s", "iosched");
874
	if (!error) {
875
		struct elv_fs_entry *attr = e->elevator_type->elevator_attrs;
876
		if (attr) {
877 878
			while (attr->attr.name) {
				if (sysfs_create_file(&e->kobj, &attr->attr))
879
					break;
880
				attr++;
881 882 883
			}
		}
		kobject_uevent(&e->kobj, KOBJ_ADD);
884
		e->registered = 1;
885 886
	}
	return error;
L
Linus Torvalds 已提交
887
}
888
EXPORT_SYMBOL(elv_register_queue);
L
Linus Torvalds 已提交
889

J
Jens Axboe 已提交
890
static void __elv_unregister_queue(struct elevator_queue *e)
J
Jens Axboe 已提交
891 892 893
{
	kobject_uevent(&e->kobj, KOBJ_REMOVE);
	kobject_del(&e->kobj);
894
	e->registered = 0;
J
Jens Axboe 已提交
895 896
}

L
Linus Torvalds 已提交
897 898
void elv_unregister_queue(struct request_queue *q)
{
J
Jens Axboe 已提交
899 900
	if (q)
		__elv_unregister_queue(q->elevator);
L
Linus Torvalds 已提交
901
}
902
EXPORT_SYMBOL(elv_unregister_queue);
L
Linus Torvalds 已提交
903

904
void elv_register(struct elevator_type *e)
L
Linus Torvalds 已提交
905
{
906
	char *def = "";
907 908

	spin_lock(&elv_list_lock);
909
	BUG_ON(elevator_find(e->elevator_name));
L
Linus Torvalds 已提交
910
	list_add_tail(&e->list, &elv_list);
911
	spin_unlock(&elv_list_lock);
L
Linus Torvalds 已提交
912

913 914 915
	if (!strcmp(e->elevator_name, chosen_elevator) ||
			(!*chosen_elevator &&
			 !strcmp(e->elevator_name, CONFIG_DEFAULT_IOSCHED)))
916 917
				def = " (default)";

918 919
	printk(KERN_INFO "io scheduler %s registered%s\n", e->elevator_name,
								def);
L
Linus Torvalds 已提交
920 921 922 923 924
}
EXPORT_SYMBOL_GPL(elv_register);

void elv_unregister(struct elevator_type *e)
{
925 926 927 928 929
	struct task_struct *g, *p;

	/*
	 * Iterate every thread in the process to remove the io contexts.
	 */
930 931 932 933
	if (e->ops.trim) {
		read_lock(&tasklist_lock);
		do_each_thread(g, p) {
			task_lock(p);
934 935
			if (p->io_context)
				e->ops.trim(p->io_context);
936 937 938 939
			task_unlock(p);
		} while_each_thread(g, p);
		read_unlock(&tasklist_lock);
	}
940

941
	spin_lock(&elv_list_lock);
L
Linus Torvalds 已提交
942
	list_del_init(&e->list);
943
	spin_unlock(&elv_list_lock);
L
Linus Torvalds 已提交
944 945 946 947 948 949 950
}
EXPORT_SYMBOL_GPL(elv_unregister);

/*
 * switch to new_e io scheduler. be careful not to introduce deadlocks -
 * we don't free the old io scheduler, before we have allocated what we
 * need for the new one. this way we have a chance of going back to the old
T
Tejun Heo 已提交
951
 * one, if the new one fails init for some reason.
L
Linus Torvalds 已提交
952
 */
953
static int elevator_switch(struct request_queue *q, struct elevator_type *new_e)
L
Linus Torvalds 已提交
954
{
J
Jens Axboe 已提交
955
	struct elevator_queue *old_elevator, *e;
J
Jens Axboe 已提交
956
	void *data;
957
	int err;
L
Linus Torvalds 已提交
958

T
Tejun Heo 已提交
959 960 961
	/*
	 * Allocate new elevator
	 */
962
	e = elevator_alloc(q, new_e);
L
Linus Torvalds 已提交
963
	if (!e)
964
		return -ENOMEM;
L
Linus Torvalds 已提交
965

J
Jens Axboe 已提交
966 967 968
	data = elevator_init_queue(q, e);
	if (!data) {
		kobject_put(&e->kobj);
969
		return -ENOMEM;
J
Jens Axboe 已提交
970 971
	}

L
Linus Torvalds 已提交
972
	/*
T
Tejun Heo 已提交
973
	 * Turn on BYPASS and drain all requests w/ elevator private data
L
Linus Torvalds 已提交
974
	 */
T
Tejun Heo 已提交
975
	spin_lock_irq(q->queue_lock);
J
Jens Axboe 已提交
976
	elv_quiesce_start(q);
T
Tejun Heo 已提交
977

L
Linus Torvalds 已提交
978
	/*
J
Jens Axboe 已提交
979
	 * Remember old elevator.
L
Linus Torvalds 已提交
980 981 982 983 984 985
	 */
	old_elevator = q->elevator;

	/*
	 * attach and start new elevator
	 */
J
Jens Axboe 已提交
986 987 988 989
	elevator_attach(q, e, data);

	spin_unlock_irq(q->queue_lock);

990 991
	if (old_elevator->registered) {
		__elv_unregister_queue(old_elevator);
L
Linus Torvalds 已提交
992

993 994 995 996
		err = elv_register_queue(q);
		if (err)
			goto fail_register;
	}
L
Linus Torvalds 已提交
997 998

	/*
T
Tejun Heo 已提交
999
	 * finally exit old elevator and turn off BYPASS.
L
Linus Torvalds 已提交
1000 1001
	 */
	elevator_exit(old_elevator);
N
Nick Piggin 已提交
1002
	spin_lock_irq(q->queue_lock);
J
Jens Axboe 已提交
1003
	elv_quiesce_end(q);
N
Nick Piggin 已提交
1004 1005
	spin_unlock_irq(q->queue_lock);

1006 1007
	blk_add_trace_msg(q, "elv switch: %s", e->elevator_type->elevator_name);

1008
	return 0;
L
Linus Torvalds 已提交
1009 1010 1011 1012 1013 1014 1015 1016 1017

fail_register:
	/*
	 * switch failed, exit the new io scheduler and reattach the old
	 * one again (along with re-adding the sysfs dir)
	 */
	elevator_exit(e);
	q->elevator = old_elevator;
	elv_register_queue(q);
N
Nick Piggin 已提交
1018 1019 1020 1021 1022

	spin_lock_irq(q->queue_lock);
	queue_flag_clear(QUEUE_FLAG_ELVSWITCH, q);
	spin_unlock_irq(q->queue_lock);

1023
	return err;
L
Linus Torvalds 已提交
1024 1025
}

1026 1027 1028 1029
/*
 * Switch this queue to the given IO scheduler.
 */
int elevator_change(struct request_queue *q, const char *name)
L
Linus Torvalds 已提交
1030 1031 1032 1033
{
	char elevator_name[ELV_NAME_MAX];
	struct elevator_type *e;

1034
	if (!q->elevator)
1035
		return -ENXIO;
1036

1037
	strlcpy(elevator_name, name, sizeof(elevator_name));
1038
	e = elevator_get(strstrip(elevator_name));
L
Linus Torvalds 已提交
1039 1040 1041 1042 1043
	if (!e) {
		printk(KERN_ERR "elevator: type %s not found\n", elevator_name);
		return -EINVAL;
	}

1044 1045
	if (!strcmp(elevator_name, q->elevator->elevator_type->elevator_name)) {
		elevator_put(e);
1046
		return 0;
1047
	}
L
Linus Torvalds 已提交
1048

1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066
	return elevator_switch(q, e);
}
EXPORT_SYMBOL(elevator_change);

ssize_t elv_iosched_store(struct request_queue *q, const char *name,
			  size_t count)
{
	int ret;

	if (!q->elevator)
		return count;

	ret = elevator_change(q, name);
	if (!ret)
		return count;

	printk(KERN_ERR "elevator: switch to %s failed\n", name);
	return ret;
L
Linus Torvalds 已提交
1067 1068
}

1069
ssize_t elv_iosched_show(struct request_queue *q, char *name)
L
Linus Torvalds 已提交
1070
{
J
Jens Axboe 已提交
1071
	struct elevator_queue *e = q->elevator;
1072
	struct elevator_type *elv;
1073
	struct elevator_type *__e;
L
Linus Torvalds 已提交
1074 1075
	int len = 0;

1076
	if (!q->elevator || !blk_queue_stackable(q))
1077 1078 1079 1080
		return sprintf(name, "none\n");

	elv = e->elevator_type;

1081
	spin_lock(&elv_list_lock);
1082
	list_for_each_entry(__e, &elv_list, list) {
L
Linus Torvalds 已提交
1083 1084 1085 1086 1087
		if (!strcmp(elv->elevator_name, __e->elevator_name))
			len += sprintf(name+len, "[%s] ", elv->elevator_name);
		else
			len += sprintf(name+len, "%s ", __e->elevator_name);
	}
1088
	spin_unlock(&elv_list_lock);
L
Linus Torvalds 已提交
1089 1090 1091 1092 1093

	len += sprintf(len+name, "\n");
	return len;
}

1094 1095
struct request *elv_rb_former_request(struct request_queue *q,
				      struct request *rq)
1096 1097 1098 1099 1100 1101 1102 1103 1104 1105
{
	struct rb_node *rbprev = rb_prev(&rq->rb_node);

	if (rbprev)
		return rb_entry_rq(rbprev);

	return NULL;
}
EXPORT_SYMBOL(elv_rb_former_request);

1106 1107
struct request *elv_rb_latter_request(struct request_queue *q,
				      struct request *rq)
1108 1109 1110 1111 1112 1113 1114 1115 1116
{
	struct rb_node *rbnext = rb_next(&rq->rb_node);

	if (rbnext)
		return rb_entry_rq(rbnext);

	return NULL;
}
EXPORT_SYMBOL(elv_rb_latter_request);