/*
 *  Block device elevator/IO-scheduler.
 *
 *  Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE
 *
 * 30042000 Jens Axboe <axboe@kernel.dk> :
 *
 * Split the elevator a bit so that it is possible to choose a different
 * one or even write a new "plug in". There are three pieces:
 * - elevator_fn, inserts a new request in the queue list
 * - elevator_merge_fn, decides whether a new buffer can be merged with
 *   an existing request
 * - elevator_dequeue_fn, called when a request is taken off the active list
 *
 * 20082000 Dave Jones <davej@suse.de> :
 * Removed tests for max-bomb-segments, which was breaking elvtune
 *  when run without -bN
 *
 * Jens:
 * - Rework again to work with bio instead of buffer_heads
 * - lose bi_dev comparisons, since partition handling is correct now
 * - completely modularize elevator setup and teardown
 *
 */
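/*
 * Illustrative sketch (not from this file): a minimal scheduler plugs in
 * by filling in an elevator_type and registering it; the "foo_*" names
 * below are hypothetical stand-ins for a real scheduler's callbacks.
 *
 *	static struct elevator_type elevator_foo = {
 *		.ops = {
 *			.elevator_dispatch_fn	= foo_dispatch,
 *			.elevator_add_req_fn	= foo_add_request,
 *		},
 *		.elevator_name	= "foo",
 *		.elevator_owner	= THIS_MODULE,
 *	};
 *
 *	static int __init foo_init(void)
 *	{
 *		return elv_register(&elevator_foo);
 *	}
 */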
#include <linux/kernel.h>
#include <linux/fs.h>
#include <linux/blkdev.h>
#include <linux/elevator.h>
#include <linux/bio.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/compiler.h>
#include <linux/blktrace_api.h>
#include <linux/hash.h>
#include <linux/uaccess.h>

#include <trace/events/block.h>

#include "blk.h"

static DEFINE_SPINLOCK(elv_list_lock);
static LIST_HEAD(elv_list);

/*
 * Merge hash stuff.
 */
static const int elv_hash_shift = 6;
#define ELV_HASH_BLOCK(sec)	((sec) >> 3)
#define ELV_HASH_FN(sec)	\
		(hash_long(ELV_HASH_BLOCK((sec)), elv_hash_shift))
#define ELV_HASH_ENTRIES	(1 << elv_hash_shift)
#define rq_hash_key(rq)		(blk_rq_pos(rq) + blk_rq_sectors(rq))
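/*
 * Worked example (illustrative numbers): a request starting at sector
 * 2048 and spanning 8 sectors has rq_hash_key() == 2056 (its end
 * sector), ELV_HASH_BLOCK(2056) == 257, and hash_long(257, 6) selects
 * one of the 64 hash buckets, so a bio beginning at sector 2056 can
 * find it for a back merge.
 */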

/*
 * Query io scheduler to see if the current process issuing bio may be
 * merged with rq.
 */
static int elv_iosched_allow_merge(struct request *rq, struct bio *bio)
{
	struct request_queue *q = rq->q;
	struct elevator_queue *e = q->elevator;

	if (e->type->ops.elevator_allow_merge_fn)
		return e->type->ops.elevator_allow_merge_fn(q, rq, bio);

	return 1;
}

/*
 * can we safely merge with this request?
 */
int elv_rq_merge_ok(struct request *rq, struct bio *bio)
{
	if (!rq_mergeable(rq))
		return 0;

	/*
	 * Don't merge file system requests and discard requests
	 */
	if ((bio->bi_rw & REQ_DISCARD) != (rq->bio->bi_rw & REQ_DISCARD))
		return 0;

	/*
	 * Don't merge discard requests and secure discard requests
	 */
	if ((bio->bi_rw & REQ_SECURE) != (rq->bio->bi_rw & REQ_SECURE))
		return 0;

	/*
	 * different data direction or already started, don't merge
	 */
	if (bio_data_dir(bio) != rq_data_dir(rq))
		return 0;

	/*
	 * must be same device and not a special request
	 */
	if (rq->rq_disk != bio->bi_bdev->bd_disk || rq->special)
		return 0;

	/*
	 * only merge integrity protected bio into ditto rq
	 */
	if (bio_integrity(bio) != blk_integrity_rq(rq))
		return 0;

	if (!elv_iosched_allow_merge(rq, bio))
		return 0;

	return 1;
}
EXPORT_SYMBOL(elv_rq_merge_ok);

int elv_try_merge(struct request *__rq, struct bio *bio)
{
	int ret = ELEVATOR_NO_MERGE;

	/*
	 * we can merge and sequence is ok, check if it's possible
	 */
	if (elv_rq_merge_ok(__rq, bio)) {
		if (blk_rq_pos(__rq) + blk_rq_sectors(__rq) == bio->bi_sector)
			ret = ELEVATOR_BACK_MERGE;
		else if (blk_rq_pos(__rq) - bio_sectors(bio) == bio->bi_sector)
			ret = ELEVATOR_FRONT_MERGE;
	}

	return ret;
}
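/*
 * Worked example (illustrative sectors): for __rq at sector 100 spanning
 * 8 sectors, a bio starting at sector 108 qualifies as a back merge
 * (100 + 8 == 108), while a 4-sector bio starting at sector 96 qualifies
 * as a front merge (100 - 4 == 96).
 */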

static struct elevator_type *elevator_find(const char *name)
{
	struct elevator_type *e;

	list_for_each_entry(e, &elv_list, list) {
		if (!strcmp(e->elevator_name, name))
			return e;
	}

	return NULL;
}

static void elevator_put(struct elevator_type *e)
{
	module_put(e->elevator_owner);
}

static struct elevator_type *elevator_get(const char *name)
{
	struct elevator_type *e;

	spin_lock(&elv_list_lock);

	e = elevator_find(name);
	if (!e) {
		spin_unlock(&elv_list_lock);
		request_module("%s-iosched", name);
		spin_lock(&elv_list_lock);
		e = elevator_find(name);
	}

	if (e && !try_module_get(e->elevator_owner))
		e = NULL;

	spin_unlock(&elv_list_lock);

	return e;
}
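/*
 * Illustrative flow: writing "cfq" to the queue's sysfs scheduler
 * attribute ends up in elevator_get("cfq"); if no such scheduler is
 * registered yet, request_module() tries to load a module named
 * "cfq-iosched" before the list is searched a second time.
 */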

static int elevator_init_queue(struct request_queue *q,
			       struct elevator_queue *eq)
{
	eq->elevator_data = eq->type->ops.elevator_init_fn(q);
	if (eq->elevator_data)
		return 0;
	return -ENOMEM;
}

static char chosen_elevator[ELV_NAME_MAX];

static int __init elevator_setup(char *str)
{
	/*
	 * Be backwards-compatible with previous kernels, so users
	 * won't get the wrong elevator.
	 */
	strncpy(chosen_elevator, str, sizeof(chosen_elevator) - 1);
	return 1;
}

__setup("elevator=", elevator_setup);
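/*
 * For example, booting with "elevator=deadline" on the kernel command
 * line stores "deadline" in chosen_elevator, which elevator_init()
 * below falls back to when no explicit scheduler name is passed in.
 */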

static struct kobj_type elv_ktype;

static struct elevator_queue *elevator_alloc(struct request_queue *q,
				  struct elevator_type *e)
{
	struct elevator_queue *eq;
	int i;

	eq = kmalloc_node(sizeof(*eq), GFP_KERNEL | __GFP_ZERO, q->node);
	if (unlikely(!eq))
		goto err;

	eq->type = e;
	kobject_init(&eq->kobj, &elv_ktype);
	mutex_init(&eq->sysfs_lock);

	eq->hash = kmalloc_node(sizeof(struct hlist_head) * ELV_HASH_ENTRIES,
					GFP_KERNEL, q->node);
	if (!eq->hash)
		goto err;

	for (i = 0; i < ELV_HASH_ENTRIES; i++)
		INIT_HLIST_HEAD(&eq->hash[i]);

	return eq;
err:
	kfree(eq);
	elevator_put(e);
	return NULL;
}

static void elevator_release(struct kobject *kobj)
{
	struct elevator_queue *e;

	e = container_of(kobj, struct elevator_queue, kobj);
	elevator_put(e->type);
	kfree(e->hash);
	kfree(e);
}

int elevator_init(struct request_queue *q, char *name)
{
	struct elevator_type *e = NULL;
	struct elevator_queue *eq;
	int err;

	if (unlikely(q->elevator))
		return 0;

	INIT_LIST_HEAD(&q->queue_head);
	q->last_merge = NULL;
	q->end_sector = 0;
	q->boundary_rq = NULL;

	if (name) {
		e = elevator_get(name);
		if (!e)
			return -EINVAL;
	}

	if (!e && *chosen_elevator) {
		e = elevator_get(chosen_elevator);
		if (!e)
			printk(KERN_ERR "I/O scheduler %s not found\n",
							chosen_elevator);
	}

	if (!e) {
		e = elevator_get(CONFIG_DEFAULT_IOSCHED);
		if (!e) {
			printk(KERN_ERR
				"Default I/O scheduler not found. " \
				"Using noop.\n");
			e = elevator_get("noop");
		}
	}

	eq = elevator_alloc(q, e);
	if (!eq)
		return -ENOMEM;

	err = elevator_init_queue(q, eq);
	if (err) {
		kobject_put(&eq->kobj);
		return err;
	}

	q->elevator = eq;
	return 0;
}
EXPORT_SYMBOL(elevator_init);

void elevator_exit(struct elevator_queue *e)
{
	mutex_lock(&e->sysfs_lock);
	if (e->type->ops.elevator_exit_fn)
		e->type->ops.elevator_exit_fn(e);
	mutex_unlock(&e->sysfs_lock);

	kobject_put(&e->kobj);
}
EXPORT_SYMBOL(elevator_exit);

static inline void __elv_rqhash_del(struct request *rq)
{
	hlist_del_init(&rq->hash);
}

static void elv_rqhash_del(struct request_queue *q, struct request *rq)
{
	if (ELV_ON_HASH(rq))
		__elv_rqhash_del(rq);
}

static void elv_rqhash_add(struct request_queue *q, struct request *rq)
{
	struct elevator_queue *e = q->elevator;

	BUG_ON(ELV_ON_HASH(rq));
	hlist_add_head(&rq->hash, &e->hash[ELV_HASH_FN(rq_hash_key(rq))]);
}

static void elv_rqhash_reposition(struct request_queue *q, struct request *rq)
{
	__elv_rqhash_del(rq);
	elv_rqhash_add(q, rq);
}

static struct request *elv_rqhash_find(struct request_queue *q, sector_t offset)
{
	struct elevator_queue *e = q->elevator;
	struct hlist_head *hash_list = &e->hash[ELV_HASH_FN(offset)];
	struct hlist_node *entry, *next;
	struct request *rq;

	hlist_for_each_entry_safe(rq, entry, next, hash_list, hash) {
		BUG_ON(!ELV_ON_HASH(rq));

		if (unlikely(!rq_mergeable(rq))) {
			__elv_rqhash_del(rq);
			continue;
		}

		if (rq_hash_key(rq) == offset)
			return rq;
	}

	return NULL;
}

/*
 * RB-tree support functions for inserting/lookup/removal of requests
 * in a sorted RB tree.
 */
void elv_rb_add(struct rb_root *root, struct request *rq)
{
	struct rb_node **p = &root->rb_node;
	struct rb_node *parent = NULL;
	struct request *__rq;

	while (*p) {
		parent = *p;
		__rq = rb_entry(parent, struct request, rb_node);

		if (blk_rq_pos(rq) < blk_rq_pos(__rq))
			p = &(*p)->rb_left;
		else if (blk_rq_pos(rq) >= blk_rq_pos(__rq))
			p = &(*p)->rb_right;
	}

	rb_link_node(&rq->rb_node, parent, p);
	rb_insert_color(&rq->rb_node, root);
}
EXPORT_SYMBOL(elv_rb_add);

void elv_rb_del(struct rb_root *root, struct request *rq)
{
	BUG_ON(RB_EMPTY_NODE(&rq->rb_node));
	rb_erase(&rq->rb_node, root);
	RB_CLEAR_NODE(&rq->rb_node);
}
EXPORT_SYMBOL(elv_rb_del);

struct request *elv_rb_find(struct rb_root *root, sector_t sector)
{
	struct rb_node *n = root->rb_node;
	struct request *rq;

	while (n) {
		rq = rb_entry(n, struct request, rb_node);

		if (sector < blk_rq_pos(rq))
			n = n->rb_left;
		else if (sector > blk_rq_pos(rq))
			n = n->rb_right;
		else
			return rq;
	}

	return NULL;
}
EXPORT_SYMBOL(elv_rb_find);
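/*
 * Usage sketch (hypothetical scheduler code): an elevator keeping a
 * per-queue rb_root of pending requests can index them by start sector
 * with these helpers:
 *
 *	struct rb_root sort_list = RB_ROOT;
 *
 *	elv_rb_add(&sort_list, rq);
 *	__rq = elv_rb_find(&sort_list, blk_rq_pos(rq));
 *	elv_rb_del(&sort_list, rq);
 */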

/*
 * Insert rq into dispatch queue of q.  Queue lock must be held on
 * entry.  rq is instead sorted into the dispatch queue. To be used by
 * specific elevators.
 */
void elv_dispatch_sort(struct request_queue *q, struct request *rq)
{
	sector_t boundary;
	struct list_head *entry;
	int stop_flags;

	if (q->last_merge == rq)
		q->last_merge = NULL;

	elv_rqhash_del(q, rq);

	q->nr_sorted--;

	boundary = q->end_sector;
	stop_flags = REQ_SOFTBARRIER | REQ_STARTED;
	list_for_each_prev(entry, &q->queue_head) {
		struct request *pos = list_entry_rq(entry);

		if ((rq->cmd_flags & REQ_DISCARD) !=
		    (pos->cmd_flags & REQ_DISCARD))
			break;
		if (rq_data_dir(rq) != rq_data_dir(pos))
			break;
		if (pos->cmd_flags & stop_flags)
			break;
		if (blk_rq_pos(rq) >= boundary) {
			if (blk_rq_pos(pos) < boundary)
				continue;
		} else {
			if (blk_rq_pos(pos) >= boundary)
				break;
		}
		if (blk_rq_pos(rq) >= blk_rq_pos(pos))
			break;
	}

	list_add(&rq->queuelist, entry);
}
EXPORT_SYMBOL(elv_dispatch_sort);
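/*
 * Worked example (illustrative): with q->end_sector == 1000, a request
 * at sector 1500 is placed among the requests at or above the boundary
 * in ascending order, while a request at sector 500 sorts among those
 * below it; the dispatch list thus stays one ascending run starting at
 * the boundary and wrapping around to the low sectors.
 */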

/*
 * Insert rq into dispatch queue of q.  Queue lock must be held on
 * entry.  rq is added to the back of the dispatch queue. To be used by
 * specific elevators.
 */
void elv_dispatch_add_tail(struct request_queue *q, struct request *rq)
{
	if (q->last_merge == rq)
		q->last_merge = NULL;

	elv_rqhash_del(q, rq);

	q->nr_sorted--;

	q->end_sector = rq_end_sector(rq);
	q->boundary_rq = rq;
	list_add_tail(&rq->queuelist, &q->queue_head);
}
EXPORT_SYMBOL(elv_dispatch_add_tail);

int elv_merge(struct request_queue *q, struct request **req, struct bio *bio)
{
	struct elevator_queue *e = q->elevator;
	struct request *__rq;
	int ret;

	/*
	 * Levels of merges:
	 * 	nomerges:  No merges at all attempted
	 * 	noxmerges: Only simple one-hit cache try
	 * 	merges:	   All merge tries attempted
	 */
	if (blk_queue_nomerges(q))
		return ELEVATOR_NO_MERGE;

	/*
	 * First try one-hit cache.
	 */
	if (q->last_merge) {
		ret = elv_try_merge(q->last_merge, bio);
		if (ret != ELEVATOR_NO_MERGE) {
			*req = q->last_merge;
			return ret;
		}
	}

	if (blk_queue_noxmerges(q))
		return ELEVATOR_NO_MERGE;

	/*
	 * See if our hash lookup can find a potential backmerge.
	 */
	__rq = elv_rqhash_find(q, bio->bi_sector);
	if (__rq && elv_rq_merge_ok(__rq, bio)) {
		*req = __rq;
		return ELEVATOR_BACK_MERGE;
	}

	if (e->type->ops.elevator_merge_fn)
		return e->type->ops.elevator_merge_fn(q, req, bio);

	return ELEVATOR_NO_MERGE;
}
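/*
 * The three levels above are user-visible (illustrative): writing 2 to
 * the queue's "nomerges" sysfs attribute makes this function bail out
 * immediately, 1 restricts it to the one-hit cache (noxmerges), and 0
 * re-enables the hash and elevator lookups as well.
 */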

/*
 * Attempt to do an insertion back merge. Only check for the case where
 * we can append 'rq' to an existing request, so we can throw 'rq' away
 * afterwards.
 *
 * Returns true if we merged, false otherwise
 */
static bool elv_attempt_insert_merge(struct request_queue *q,
				     struct request *rq)
{
	struct request *__rq;

	if (blk_queue_nomerges(q))
		return false;

	/*
	 * First try one-hit cache.
	 */
	if (q->last_merge && blk_attempt_req_merge(q, q->last_merge, rq))
		return true;

	if (blk_queue_noxmerges(q))
		return false;

	/*
	 * See if our hash lookup can find a potential backmerge.
	 */
	__rq = elv_rqhash_find(q, blk_rq_pos(rq));
	if (__rq && blk_attempt_req_merge(q, __rq, rq))
		return true;

	return false;
}

void elv_merged_request(struct request_queue *q, struct request *rq, int type)
{
	struct elevator_queue *e = q->elevator;

	if (e->type->ops.elevator_merged_fn)
		e->type->ops.elevator_merged_fn(q, rq, type);

	if (type == ELEVATOR_BACK_MERGE)
		elv_rqhash_reposition(q, rq);

	q->last_merge = rq;
}

void elv_merge_requests(struct request_queue *q, struct request *rq,
			     struct request *next)
{
	struct elevator_queue *e = q->elevator;
	const int next_sorted = next->cmd_flags & REQ_SORTED;

	if (next_sorted && e->type->ops.elevator_merge_req_fn)
		e->type->ops.elevator_merge_req_fn(q, rq, next);

	elv_rqhash_reposition(q, rq);

	if (next_sorted) {
		elv_rqhash_del(q, next);
		q->nr_sorted--;
	}

	q->last_merge = rq;
}

void elv_bio_merged(struct request_queue *q, struct request *rq,
			struct bio *bio)
{
	struct elevator_queue *e = q->elevator;

	if (e->type->ops.elevator_bio_merged_fn)
		e->type->ops.elevator_bio_merged_fn(q, rq, bio);
}

void elv_requeue_request(struct request_queue *q, struct request *rq)
{
	/*
	 * it already went through dequeue, we need to decrement the
	 * in_flight count again
	 */
	if (blk_account_rq(rq)) {
		q->in_flight[rq_is_sync(rq)]--;
		if (rq->cmd_flags & REQ_SORTED)
			elv_deactivate_rq(q, rq);
	}

	rq->cmd_flags &= ~REQ_STARTED;

	__elv_add_request(q, rq, ELEVATOR_INSERT_REQUEUE);
}

void elv_drain_elevator(struct request_queue *q)
{
	static int printed;

	lockdep_assert_held(q->queue_lock);

	while (q->elevator->type->ops.elevator_dispatch_fn(q, 1))
		;
	if (q->nr_sorted && printed++ < 10) {
		printk(KERN_ERR "%s: forced dispatching is broken "
		       "(nr_sorted=%u), please report this\n",
		       q->elevator->type->elevator_name, q->nr_sorted);
	}
}

void elv_quiesce_start(struct request_queue *q)
{
	if (!q->elevator)
		return;

	spin_lock_irq(q->queue_lock);
	queue_flag_set(QUEUE_FLAG_ELVSWITCH, q);
	spin_unlock_irq(q->queue_lock);

	blk_drain_queue(q, false);
}

void elv_quiesce_end(struct request_queue *q)
{
	spin_lock_irq(q->queue_lock);
	queue_flag_clear(QUEUE_FLAG_ELVSWITCH, q);
	spin_unlock_irq(q->queue_lock);
}

void __elv_add_request(struct request_queue *q, struct request *rq, int where)
{
	trace_block_rq_insert(q, rq);

	rq->q = q;

	if (rq->cmd_flags & REQ_SOFTBARRIER) {
		/* barriers are scheduling boundary, update end_sector */
		if (rq->cmd_type == REQ_TYPE_FS ||
		    (rq->cmd_flags & REQ_DISCARD)) {
			q->end_sector = rq_end_sector(rq);
			q->boundary_rq = rq;
		}
	} else if (!(rq->cmd_flags & REQ_ELVPRIV) &&
		    (where == ELEVATOR_INSERT_SORT ||
		     where == ELEVATOR_INSERT_SORT_MERGE))
		where = ELEVATOR_INSERT_BACK;

	switch (where) {
	case ELEVATOR_INSERT_REQUEUE:
	case ELEVATOR_INSERT_FRONT:
		rq->cmd_flags |= REQ_SOFTBARRIER;
		list_add(&rq->queuelist, &q->queue_head);
		break;

	case ELEVATOR_INSERT_BACK:
		rq->cmd_flags |= REQ_SOFTBARRIER;
		elv_drain_elevator(q);
		list_add_tail(&rq->queuelist, &q->queue_head);
		/*
		 * We kick the queue here for the following reasons.
		 * - The elevator might have returned NULL previously
		 *   to delay requests and returned them now.  As the
		 *   queue wasn't empty before this request, ll_rw_blk
		 *   won't run the queue on return, resulting in hang.
		 * - Usually, back inserted requests won't be merged
		 *   with anything.  There's no point in delaying queue
		 *   processing.
		 */
		__blk_run_queue(q);
		break;

	case ELEVATOR_INSERT_SORT_MERGE:
		/*
		 * If we succeed in merging this request with one in the
		 * queue already, we are done - rq has now been freed,
		 * so no need to do anything further.
		 */
		if (elv_attempt_insert_merge(q, rq))
			break;
	case ELEVATOR_INSERT_SORT:
		BUG_ON(rq->cmd_type != REQ_TYPE_FS &&
		       !(rq->cmd_flags & REQ_DISCARD));
		rq->cmd_flags |= REQ_SORTED;
		q->nr_sorted++;
		if (rq_mergeable(rq)) {
			elv_rqhash_add(q, rq);
			if (!q->last_merge)
				q->last_merge = rq;
		}

		/*
		 * Some ioscheds (cfq) run q->request_fn directly, so
		 * rq cannot be accessed after calling
		 * elevator_add_req_fn.
		 */
		q->elevator->type->ops.elevator_add_req_fn(q, rq);
		break;

	case ELEVATOR_INSERT_FLUSH:
		rq->cmd_flags |= REQ_SOFTBARRIER;
		blk_insert_flush(rq);
		break;
	default:
		printk(KERN_ERR "%s: bad insertion point %d\n",
		       __func__, where);
		BUG();
	}
}
EXPORT_SYMBOL(__elv_add_request);
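/*
 * Typical call (illustrative): a driver requeueing a request it could
 * not start does, with q->queue_lock held:
 *
 *	__elv_add_request(q, rq, ELEVATOR_INSERT_REQUEUE);
 *
 * whereas normal filesystem requests arrive as ELEVATOR_INSERT_SORT and
 * are handed to the scheduler's elevator_add_req_fn above.
 */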

void elv_add_request(struct request_queue *q, struct request *rq, int where)
{
	unsigned long flags;

	spin_lock_irqsave(q->queue_lock, flags);
	__elv_add_request(q, rq, where);
	spin_unlock_irqrestore(q->queue_lock, flags);
}
EXPORT_SYMBOL(elv_add_request);

struct request *elv_latter_request(struct request_queue *q, struct request *rq)
{
	struct elevator_queue *e = q->elevator;

	if (e->type->ops.elevator_latter_req_fn)
		return e->type->ops.elevator_latter_req_fn(q, rq);
	return NULL;
}

struct request *elv_former_request(struct request_queue *q, struct request *rq)
{
	struct elevator_queue *e = q->elevator;

	if (e->type->ops.elevator_former_req_fn)
		return e->type->ops.elevator_former_req_fn(q, rq);
	return NULL;
}

int elv_set_request(struct request_queue *q, struct request *rq, gfp_t gfp_mask)
{
	struct elevator_queue *e = q->elevator;

	if (e->type->ops.elevator_set_req_fn)
		return e->type->ops.elevator_set_req_fn(q, rq, gfp_mask);
	return 0;
}

void elv_put_request(struct request_queue *q, struct request *rq)
{
	struct elevator_queue *e = q->elevator;

	if (e->type->ops.elevator_put_req_fn)
		e->type->ops.elevator_put_req_fn(rq);
}

int elv_may_queue(struct request_queue *q, int rw)
{
	struct elevator_queue *e = q->elevator;

	if (e->type->ops.elevator_may_queue_fn)
		return e->type->ops.elevator_may_queue_fn(q, rw);

	return ELV_MQUEUE_MAY;
}

void elv_abort_queue(struct request_queue *q)
{
	struct request *rq;

	blk_abort_flushes(q);

	while (!list_empty(&q->queue_head)) {
		rq = list_entry_rq(q->queue_head.next);
		rq->cmd_flags |= REQ_QUIET;
		trace_block_rq_abort(q, rq);
		/*
		 * Mark this request as started so we don't trigger
		 * any debug logic in the end I/O path.
		 */
		blk_start_request(rq);
		__blk_end_request_all(rq, -EIO);
	}
}
EXPORT_SYMBOL(elv_abort_queue);

void elv_completed_request(struct request_queue *q, struct request *rq)
{
	struct elevator_queue *e = q->elevator;

	/*
	 * request is released from the driver, io must be done
	 */
	if (blk_account_rq(rq)) {
		q->in_flight[rq_is_sync(rq)]--;
		if ((rq->cmd_flags & REQ_SORTED) &&
		    e->type->ops.elevator_completed_req_fn)
			e->type->ops.elevator_completed_req_fn(q, rq);
	}
}

#define to_elv(atr) container_of((atr), struct elv_fs_entry, attr)

static ssize_t
elv_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
{
	struct elv_fs_entry *entry = to_elv(attr);
	struct elevator_queue *e;
	ssize_t error;

	if (!entry->show)
		return -EIO;

	e = container_of(kobj, struct elevator_queue, kobj);
	mutex_lock(&e->sysfs_lock);
	error = e->type ? entry->show(e, page) : -ENOENT;
	mutex_unlock(&e->sysfs_lock);
	return error;
}

static ssize_t
elv_attr_store(struct kobject *kobj, struct attribute *attr,
	       const char *page, size_t length)
{
	struct elv_fs_entry *entry = to_elv(attr);
	struct elevator_queue *e;
	ssize_t error;

	if (!entry->store)
		return -EIO;

	e = container_of(kobj, struct elevator_queue, kobj);
	mutex_lock(&e->sysfs_lock);
	error = e->type ? entry->store(e, page, length) : -ENOENT;
	mutex_unlock(&e->sysfs_lock);
	return error;
}
static const struct sysfs_ops elv_sysfs_ops = {
	.show	= elv_attr_show,
	.store	= elv_attr_store,
};

static struct kobj_type elv_ktype = {
	.sysfs_ops	= &elv_sysfs_ops,
	.release	= elevator_release,
};

int __elv_register_queue(struct request_queue *q, struct elevator_queue *e)
{
	int error;

	error = kobject_add(&e->kobj, &q->kobj, "%s", "iosched");
	if (!error) {
		struct elv_fs_entry *attr = e->type->elevator_attrs;
		if (attr) {
			while (attr->attr.name) {
				if (sysfs_create_file(&e->kobj, &attr->attr))
					break;
				attr++;
			}
		}
		kobject_uevent(&e->kobj, KOBJ_ADD);
		e->registered = 1;
	}
	return error;
}

int elv_register_queue(struct request_queue *q)
{
	return __elv_register_queue(q, q->elevator);
}
EXPORT_SYMBOL(elv_register_queue);

void elv_unregister_queue(struct request_queue *q)
{
	if (q) {
		struct elevator_queue *e = q->elevator;

		kobject_uevent(&e->kobj, KOBJ_REMOVE);
		kobject_del(&e->kobj);
		e->registered = 0;
	}
}
EXPORT_SYMBOL(elv_unregister_queue);

int elv_register(struct elevator_type *e)
{
	char *def = "";

	/* create icq_cache if requested */
	if (e->icq_size) {
		if (WARN_ON(e->icq_size < sizeof(struct io_cq)) ||
		    WARN_ON(e->icq_align < __alignof__(struct io_cq)))
			return -EINVAL;

		snprintf(e->icq_cache_name, sizeof(e->icq_cache_name),
			 "%s_io_cq", e->elevator_name);
		e->icq_cache = kmem_cache_create(e->icq_cache_name, e->icq_size,
						 e->icq_align, 0, NULL);
		if (!e->icq_cache)
			return -ENOMEM;
	}

	/* register, don't allow duplicate names */
	spin_lock(&elv_list_lock);
	if (elevator_find(e->elevator_name)) {
		spin_unlock(&elv_list_lock);
		if (e->icq_cache)
			kmem_cache_destroy(e->icq_cache);
		return -EBUSY;
	}
	list_add_tail(&e->list, &elv_list);
	spin_unlock(&elv_list_lock);

	/* print pretty message */
	if (!strcmp(e->elevator_name, chosen_elevator) ||
			(!*chosen_elevator &&
			 !strcmp(e->elevator_name, CONFIG_DEFAULT_IOSCHED)))
				def = " (default)";

	printk(KERN_INFO "io scheduler %s registered%s\n", e->elevator_name,
								def);
	return 0;
}
EXPORT_SYMBOL_GPL(elv_register);
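/*
 * Illustrative: a scheduler that keeps per-process, per-queue state the
 * way cfq does sets e->icq_size (and, if needed, e->icq_align) in its
 * elevator_type before calling elv_register(), and the icq kmem cache
 * above is created on its behalf.
 */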

void elv_unregister(struct elevator_type *e)
{
	/* unregister */
	spin_lock(&elv_list_lock);
	list_del_init(&e->list);
	spin_unlock(&elv_list_lock);

	/*
	 * Destroy icq_cache if it exists.  icq's are RCU managed.  Make
	 * sure all RCU operations are complete before proceeding.
	 */
	if (e->icq_cache) {
		rcu_barrier();
		kmem_cache_destroy(e->icq_cache);
		e->icq_cache = NULL;
	}
}
EXPORT_SYMBOL_GPL(elv_unregister);

/*
 * switch to new_e io scheduler. be careful not to introduce deadlocks -
 * we don't free the old io scheduler, before we have allocated what we
 * need for the new one. this way we have a chance of going back to the old
 * one, if the new one fails init for some reason.
 */
static int elevator_switch(struct request_queue *q, struct elevator_type *new_e)
{
	struct elevator_queue *old_elevator, *e;
	int err;

	/* allocate new elevator */
	e = elevator_alloc(q, new_e);
	if (!e)
		return -ENOMEM;

	err = elevator_init_queue(q, e);
	if (err) {
		kobject_put(&e->kobj);
		return err;
	}

	/* turn on BYPASS and drain all requests w/ elevator private data */
	elv_quiesce_start(q);

	/* unregister old queue, register new one and kill old elevator */
	if (q->elevator->registered) {
		elv_unregister_queue(q);
		err = __elv_register_queue(q, e);
		if (err)
			goto fail_register;
	}

	/* done, replace the old one with new one and turn off BYPASS */
	spin_lock_irq(q->queue_lock);
	old_elevator = q->elevator;
	q->elevator = e;
	spin_unlock_irq(q->queue_lock);

	elevator_exit(old_elevator);
	elv_quiesce_end(q);

	blk_add_trace_msg(q, "elv switch: %s", e->type->elevator_name);

	return 0;

fail_register:
	/*
	 * switch failed, exit the new io scheduler and reattach the old
	 * one again (along with re-adding the sysfs dir)
	 */
	elevator_exit(e);
	elv_register_queue(q);
	elv_quiesce_end(q);

	return err;
}

/*
 * Switch this queue to the given IO scheduler.
 */
int elevator_change(struct request_queue *q, const char *name)
{
	char elevator_name[ELV_NAME_MAX];
	struct elevator_type *e;

	if (!q->elevator)
		return -ENXIO;

	strlcpy(elevator_name, name, sizeof(elevator_name));
	e = elevator_get(strstrip(elevator_name));
	if (!e) {
		printk(KERN_ERR "elevator: type %s not found\n", elevator_name);
		return -EINVAL;
	}

	if (!strcmp(elevator_name, q->elevator->type->elevator_name)) {
		elevator_put(e);
		return 0;
	}

	return elevator_switch(q, e);
}
EXPORT_SYMBOL(elevator_change);
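/*
 * Usage sketch (hypothetical caller): a driver that wants a specific
 * scheduler for its queue can switch after the queue is set up:
 *
 *	if (elevator_change(q, "noop"))
 *		pr_warn("could not switch to noop\n");
 */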

ssize_t elv_iosched_store(struct request_queue *q, const char *name,
			  size_t count)
{
	int ret;

	if (!q->elevator)
		return count;

	ret = elevator_change(q, name);
	if (!ret)
		return count;

	printk(KERN_ERR "elevator: switch to %s failed\n", name);
	return ret;
}

ssize_t elv_iosched_show(struct request_queue *q, char *name)
{
	struct elevator_queue *e = q->elevator;
	struct elevator_type *elv;
	struct elevator_type *__e;
	int len = 0;

	if (!q->elevator || !blk_queue_stackable(q))
		return sprintf(name, "none\n");

	elv = e->type;

	spin_lock(&elv_list_lock);
	list_for_each_entry(__e, &elv_list, list) {
		if (!strcmp(elv->elevator_name, __e->elevator_name))
			len += sprintf(name+len, "[%s] ", elv->elevator_name);
		else
			len += sprintf(name+len, "%s ", __e->elevator_name);
	}
	spin_unlock(&elv_list_lock);

	len += sprintf(len+name, "\n");
	return len;
}
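/*
 * Example output (illustrative device): reading the sysfs scheduler
 * attribute lists every registered elevator with the active one in
 * brackets, e.g.
 *
 *	$ cat /sys/block/sda/queue/scheduler
 *	noop deadline [cfq]
 */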

struct request *elv_rb_former_request(struct request_queue *q,
				      struct request *rq)
{
	struct rb_node *rbprev = rb_prev(&rq->rb_node);

	if (rbprev)
		return rb_entry_rq(rbprev);

	return NULL;
}
EXPORT_SYMBOL(elv_rb_former_request);

struct request *elv_rb_latter_request(struct request_queue *q,
				      struct request *rq)
{
	struct rb_node *rbnext = rb_next(&rq->rb_node);

	if (rbnext)
		return rb_entry_rq(rbnext);

	return NULL;
}
EXPORT_SYMBOL(elv_rb_latter_request);